Files
yoink-go/comic/comic.go
Bryan Bailey d2c715e973 feat: add batcave.biz support, closes #6
## What changed

- `BatcaveBizMarkup` now accepts a `clientChan chan *http.Client` and
  sends the authenticated cookie jar client back to the caller after
  completing the Cloudflare challenge flow. All error paths send nil so
  the caller never blocks.

- `Comic` struct gains a `Client *http.Client` field. `NewComic` wires
  up the channel, receives the client, and stores it so downstream code
  can reuse the same authenticated session.

- `downloadFile` branches on `c.Client`: when it is set, the request is
  built manually and a `Referer: https://batcave.biz/` header is attached
  only when the image URL is actually on batcave.biz. Some issues host
  images on third-party CDNs (e.g. readcomicsonline.ru) that actively
  block requests carrying a batcave Referer and return 403; omitting the
  header for those hosts avoids the rejection.

- `ParseBatcaveBizTitle` extracts the chapter title from the
  `__DATA__.chapters` JSON array by matching the chapter ID in the URL's
  last path segment. The HTML `<title>` on batcave.biz is prefixed with
  "Read " and suffixed with "comics online for free", making it
  unsuitable as a filename. Using the chapter data gives clean titles
  like "Nightwing (1996) 153". "Issue #" and bare "#" are stripped since
  the hash character causes problems on some filesystems and tools.

- `ParseBatcaveBizImageLinks` now unescapes `\/` → `/` in extracted
  URLs. The `__DATA__` JSON often contains forward-slash-escaped URLs
  that would otherwise be stored verbatim.

- `archive.go`: `filepath.Walk` was called on `filepath.Dir(sourcePath)`
  (the library root) instead of `sourcePath` (the comic's own folder).
  This caused any leftover image files from previous downloads in sibling
  directories to be included in every new CBZ. Fixed by walking
  `sourcePath` directly.

- `BatcaveBizMarkup` client now has a 30s `Timeout`. Without it, a
  single stalled CDN connection would hang the worker goroutine
  indefinitely, causing `Download()` to block forever waiting for a
  result that never arrives.

- Fixed `for e := range err` in `cli/root.go` — ranging over `[]error`
  with one variable yields the index, not the error value.
2026-03-11 20:55:03 -04:00

140 lines
3.7 KiB
Go

package comic
import (
"net/http"
"path/filepath"
"regexp"
"strings"
"github.com/PuerkitoBio/goquery"
)
// var debugUrl = "https://readallcomics.com/ultraman-x-avengers-001-2024/"
// Comic holds the data NewComic gathers for a single comic page.
type Comic struct {
// URL is the page address the comic was created from.
URL string
// Title is the display title derived from the page markup or URL.
Title string
// Markup is the parsed page document received from the markup channel.
Markup *goquery.Document
// Filelist holds the image links received from the image channel.
Filelist []string
// NOTE(review): Next and Prev are not assigned in the code visible here;
// presumably they link adjacent issues — confirm against callers.
Next *Comic
Prev *Comic
// LibraryPath is the local library path passed to NewComic.
LibraryPath string
// Client is the HTTP client handed back by BatcaveBizMarkup for
// batcave.biz URLs; NewComic leaves it nil for every other source.
Client *http.Client
}
// titleYearPattern captures the text that precedes a parenthesised
// four-digit year, e.g. "Nightwing" in "Nightwing (1996)". It is compiled
// once at package scope so extractTitleFromMarkup does not rebuild it on
// every call.
var titleYearPattern = regexp.MustCompile(`^(.*?)\s+\(\d{4}(?:\s+.+)?\)`)

// extractTitleFromMarkup derives a comic title from the parsed page in
// c.Markup.
//
// The text of the first <title> element is matched against
// titleYearPattern and any colons are removed from the captured title.
// When that title starts with "#", the first <h1> is tried the same way,
// and after that a title built from the URL slug. If c.Markup is nil the
// slug is the only source consulted.
//
// Returns the extracted title, or "Untitled" when nothing usable is found.
func extractTitleFromMarkup(c Comic) string {
	if c.Markup == nil {
		// Calling Find on a nil document would panic; presumably this
		// happens when the upstream fetch failed. Use what the URL alone
		// can provide instead of crashing.
		if slug := titleFromSlug(c.URL); slug != "" {
			return slug
		}
		return "Untitled"
	}

	// extractFrom returns the portion of text before the "(YYYY ...)"
	// marker with colons stripped, or "" when the pattern does not match.
	extractFrom := func(text string) string {
		matches := titleYearPattern.FindStringSubmatch(text)
		if len(matches) != 2 {
			return ""
		}
		return strings.ReplaceAll(matches[1], ":", "")
	}

	title := extractFrom(c.Markup.Find("title").First().Text())
	if strings.HasPrefix(title, "#") {
		// A leading "#" means the <title> only carried an issue marker;
		// prefer the <h1>, then the URL slug.
		if h1 := extractFrom(c.Markup.Find("h1").First().Text()); h1 != "" && !strings.HasPrefix(h1, "#") {
			return h1
		}
		if slug := titleFromSlug(c.URL); slug != "" {
			return slug
		}
	}
	if title != "" {
		return title
	}
	return "Untitled"
}
// slugYearSuffix matches a trailing "-YYYY" year on a URL slug. It is
// compiled once at package scope rather than on every titleFromSlug call.
var slugYearSuffix = regexp.MustCompile(`-\d{4}$`)

// titleFromSlug derives a comic title from the last path segment of a URL.
//
// Trailing slashes are ignored, a trailing year (-YYYY) is stripped,
// hyphens become spaces, and the first letter of every word is upper-cased.
// Returns "" when no slug text remains after stripping.
func titleFromSlug(url string) string {
	slug := strings.TrimRight(url, "/")
	if i := strings.LastIndex(slug, "/"); i >= 0 {
		slug = slug[i+1:]
	}
	slug = slugYearSuffix.ReplaceAllString(slug, "")
	if slug == "" {
		return ""
	}
	words := strings.Split(slug, "-")
	for i, w := range words {
		if w == "" {
			continue
		}
		// Capitalise the first rune, not the first byte: slicing w[:1]
		// would cut a multi-byte character in half and corrupt the word.
		runes := []rune(w)
		words[i] = strings.ToUpper(string(runes[:1])) + string(runes[1:])
	}
	return strings.Join(words, " ")
}
// NewComic builds a Comic for url, blocking until both the page markup and
// the image links have arrived on the supplied channels.
//
// url is the page to parse; URLs containing "batcave.biz" take a dedicated
// path that also captures the HTTP client handed back by BatcaveBizMarkup.
// libraryPath is stored on the Comic as its LibraryPath.
// imageChannel is the channel the image-link parser delivers its result on.
// markupChannel is the channel the markup fetcher delivers the document on.
//
// Returns a pointer to the populated Comic.
func NewComic(
	url string, libraryPath string,
	imageChannel chan []string,
	markupChannel chan *goquery.Document,
) *Comic {
	comic := &Comic{URL: url, LibraryPath: libraryPath}

	// Generic sources: fetch, title from markup, then collect image links.
	if !strings.Contains(url, "batcave.biz") {
		go Markup(url, markupChannel)
		comic.Markup = <-markupChannel
		comic.Title = extractTitleFromMarkup(*comic)
		go ParseImageLinks(comic.Markup, imageChannel)
		comic.Filelist = <-imageChannel
		return comic
	}

	// batcave.biz: the fetcher also returns its client on a buffered
	// channel. Receive the markup first, then the client, in that order.
	sessions := make(chan *http.Client, 1)
	go BatcaveBizMarkup(url, markupChannel, sessions)
	comic.Markup = <-markupChannel
	comic.Client = <-sessions

	// Prefer the site-specific title; fall back to the generic extractor.
	comic.Title = ParseBatcaveBizTitle(comic.Markup, url)
	if comic.Title == "" {
		comic.Title = extractTitleFromMarkup(*comic)
	}

	go ParseBatcaveBizImageLinks(comic.Markup, imageChannel)
	comic.Filelist = <-imageChannel
	return comic
}
// Cover locates the comic's cover image.
//
// It scans Filelist in order and picks the first entry whose name ends in
// "000.jpg" or "001.jpg", returning that entry converted with filepath.Abs.
// If the conversion fails, or no entry matches, an ImageParseError with
// Code 1 is returned.
func (c *Comic) Cover() (imageFilepath string, err error) {
	// looksLikeCover reports whether name carries one of the cover suffixes.
	looksLikeCover := func(name string) bool {
		return strings.HasSuffix(name, "000.jpg") || strings.HasSuffix(name, "001.jpg")
	}

	for i := range c.Filelist {
		candidate := c.Filelist[i]
		if !looksLikeCover(candidate) {
			continue
		}
		abs, absErr := filepath.Abs(candidate)
		if absErr != nil {
			return abs, ImageParseError{Message: absErr.Error(), Code: 1}
		}
		return abs, nil
	}

	return "", ImageParseError{Message: "No cover found", Code: 1}
}