feat: add batcave.biz support, closes #6

## What changed

- `BatcaveBizMarkup` now accepts a `clientChan chan *http.Client` and
  sends the authenticated cookie jar client back to the caller after
  completing the Cloudflare challenge flow. All error paths send nil so
  the caller never blocks.

- `Comic` struct gains a `Client *http.Client` field. `NewComic` wires
  up the channel, receives the client, and stores it so downstream code
  can reuse the same authenticated session.

- `downloadFile` branches on `c.Client`: when set it builds the request
  manually and only attaches a `Referer: https://batcave.biz/` header
  when the image URL is actually on batcave.biz. Some issues host images
  on third-party CDNs (e.g. readcomicsonline.ru) that actively block
  requests with a batcave Referer, returning 403 — omitting the header
  fixes those.

- `ParseBatcaveBizTitle` extracts the chapter title from the
  `__DATA__.chapters` JSON array by matching the chapter ID in the URL's
  last path segment. The HTML `<title>` on batcave.biz is prefixed with
  "Read " and suffixed with "comics online for free", making it
  unsuitable as a filename. Using the chapter data gives clean titles
  like "Nightwing (1996) 153". "Issue #" and bare "#" are stripped since
  the hash character causes problems on some filesystems and tools.

- `ParseBatcaveBizImageLinks` now unescapes `\/` → `/` in extracted
  URLs. The `__DATA__` JSON often contains forward-slash-escaped URLs
  that would otherwise be stored verbatim.

- `archive.go`: `filepath.Walk` was called on `filepath.Dir(sourcePath)`
  (the library root) instead of `sourcePath` (the comic's own folder).
  This caused any leftover image files from previous downloads in sibling
  directories to be included in every new CBZ. Fixed by walking
  `sourcePath` directly.

- `BatcaveBizMarkup` client now has a 30s `Timeout`. Without it, a
  single stalled CDN connection would hang the worker goroutine
  indefinitely, causing `Download()` to block forever waiting for a
  result that never arrives.

- Fixed `for e := range err` in `cli/root.go` — ranging over `[]error`
  with one variable yields the index, not the error value.
This commit is contained in:
2026-03-11 20:55:03 -04:00
parent 9cb26f27ec
commit d2c715e973
6 changed files with 107 additions and 52 deletions

View File

@@ -40,7 +40,7 @@ var cli = &cobra.Command{
fmt.Println(comic.Title)
err := comic.Download(len(comic.Filelist))
for e := range err {
for _, e := range err {
fmt.Println(e)
}

View File

@@ -45,7 +45,7 @@ func (c *Comic) Archive() error {
sourcePath := filepath.Join(c.LibraryPath, c.Title)
err = filepath.Walk(
filepath.Dir(sourcePath),
sourcePath,
func(path string, info os.FileInfo, err error) error {
if err != nil {
return ArchiveError{

View File

@@ -1,6 +1,7 @@
package comic
import (
"net/http"
"path/filepath"
"regexp"
"strings"
@@ -18,6 +19,7 @@ type Comic struct {
Next *Comic
Prev *Comic
LibraryPath string
Client *http.Client
}
// extractTitleFromMarkup extracts the title from the comic's markup.
@@ -93,18 +95,22 @@ func NewComic(
}
if strings.Contains(url, "batcave.biz") {
go BatcaveBizMarkup(url, markupChannel)
clientChan := make(chan *http.Client, 1)
go BatcaveBizMarkup(url, markupChannel, clientChan)
markup := <-markupChannel
c.Markup = markup
c.Client = <-clientChan
if t := ParseBatcaveBizTitle(markup, url); t != "" {
c.Title = t
} else {
c.Title = extractTitleFromMarkup(*c)
}
go ParseBatcaveBizImageLinks(markup, imageChannel)
} else {
go Markup(url, markupChannel)
}
markup := <-markupChannel
c.Markup = markup
c.Title = extractTitleFromMarkup(*c)
if strings.Contains(url, "batcave.biz") {
go ParseBatcaveBizImageLinks(markup, imageChannel)
} else {
go ParseImageLinks(markup, imageChannel)
}
links := <-imageChannel

View File

@@ -6,6 +6,7 @@ import (
"net/http"
"os"
"path/filepath"
"strings"
"time"
cloudflarebp "github.com/DaRealFreak/cloudflare-bp-go"
@@ -39,13 +40,33 @@ func downloadFile(url string, page int, c *Comic) error {
}
}
res, err := handleRequest(url)
var res *http.Response
var err error
if c.Client != nil {
req, reqErr := http.NewRequest("GET", url, nil)
if reqErr != nil {
return ComicDownloadError{Message: "invalid request", Code: 1}
}
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
if strings.Contains(url, "batcave.biz") {
req.Header.Set("Referer", "https://batcave.biz/")
}
res, err = c.Client.Do(req)
} else {
res, err = handleRequest(url)
}
if err != nil {
return ComicDownloadError{
Message: "invalid request",
Code: 1,
}
}
if res.StatusCode != http.StatusOK {
return ComicDownloadError{
Message: "bad response",
Code: 1,
}
}
defer res.Body.Close()
imageFile, err := os.Create(imageFilepath)

View File

@@ -7,6 +7,7 @@ import (
"net/url"
"regexp"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
@@ -50,10 +51,21 @@ func Markup(url string, c chan *goquery.Document) *goquery.Document {
return markup
}
func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Document {
func BatcaveBizMarkup(referer string, c chan *goquery.Document, clientChan chan *http.Client) *goquery.Document {
sendErr := func() *goquery.Document {
if c != nil {
c <- &goquery.Document{}
}
if clientChan != nil {
clientChan <- nil
}
return &goquery.Document{}
}
jar, _ := cookiejar.New(nil)
client := &http.Client{
Jar: jar,
Timeout: time.Second * 30,
CheckRedirect: func(req *http.Request, via []*http.Request) error {
return nil
},
@@ -68,10 +80,7 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen
// GET the challenge page to obtain cookies and any necessary tokens
req, err := http.NewRequest("GET", referer, nil)
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
for k, v := range headers {
req.Header.Set(k, v)
@@ -79,19 +88,13 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen
res, err := client.Do(req)
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
defer res.Body.Close()
body, err := io.ReadAll(res.Body)
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
tokenRegex := regexp.MustCompile(`token:\s*"([^"]+)"`)
@@ -101,14 +104,14 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen
// no challenge, parse directly
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
if c != nil {
c <- doc
}
if clientChan != nil {
clientChan <- client
}
return doc
}
@@ -132,10 +135,7 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen
postReq, err := http.NewRequest("POST", "https://batcave.biz/_v", strings.NewReader(params.Encode()))
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
for k, v := range headers {
postReq.Header.Set(k, v)
@@ -145,10 +145,7 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen
postRes, err := client.Do(postReq)
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
defer postRes.Body.Close()
io.ReadAll(postRes.Body)
@@ -156,10 +153,7 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen
// GET the real page with the set cookie
realReq, err := http.NewRequest("GET", referer, nil)
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
for k, v := range headers {
realReq.Header.Set(k, v)
@@ -167,23 +161,20 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen
realRes, err := client.Do(realReq)
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
defer realRes.Body.Close()
doc, err := goquery.NewDocumentFromReader(realRes.Body)
if err != nil {
if c != nil {
c <- &goquery.Document{}
}
return &goquery.Document{}
return sendErr()
}
if c != nil {
c <- doc
}
if clientChan != nil {
clientChan <- client
}
return doc
}
@@ -228,6 +219,34 @@ func ParseReadAllComicsLinks(markup *goquery.Document, c chan []string) ([]strin
return links, ImageParseError{Message: "No images found", Code: 1}
}
// ParseBatcaveBizTitle extracts the chapter title from the __DATA__.chapters array
// by matching the chapter id to the last path segment of the provided URL.
//
// markup is the parsed chapter page and chapterURL is the URL it was fetched
// from. Any query string or fragment on chapterURL is ignored when deriving the
// chapter id. The returned title has JSON-escaped slashes (`\/`) unescaped,
// "Issue #" and bare "#" removed, and surrounding whitespace trimmed.
//
// It returns "" when markup is unusable, when no chapter id can be derived from
// chapterURL, or when no chapter entry with that exact id is found.
func ParseBatcaveBizTitle(markup *goquery.Document, chapterURL string) string {
	// BatcaveBizMarkup hands back an empty &goquery.Document{} on failure; its
	// embedded Selection is nil, so calling Find on it would dereference nil.
	if markup == nil || markup.Selection == nil {
		return ""
	}

	// Derive the chapter id: drop any query/fragment, then trailing slashes,
	// then keep only the last path segment.
	slug := chapterURL
	if i := strings.IndexAny(slug, "?#"); i >= 0 {
		slug = slug[:i]
	}
	slug = strings.TrimRight(slug, "/")
	if i := strings.LastIndex(slug, "/"); i >= 0 {
		slug = slug[i+1:]
	}
	// With an empty id the pattern below would match the first chapter of any
	// page, so bail out instead of returning an unrelated title.
	if slug == "" {
		return ""
	}

	// The pattern depends only on slug, so compile it once rather than once per
	// <script> element. The id value must be followed (after an optional closing
	// quote) by a comma; without that delimiter id 15 would also match 153.
	chapterRegex := regexp.MustCompile(
		`"id"\s*:\s*"?` + regexp.QuoteMeta(slug) + `"?\s*,[^}]*?"title"\s*:\s*"([^"]+)"`,
	)

	var title string
	markup.Find("script").Each(func(_ int, s *goquery.Selection) {
		// Each cannot stop early, so skip the remaining scripts once a title
		// has been found.
		if title != "" {
			return
		}
		text := s.Text()
		if !strings.Contains(text, "__DATA__") {
			return
		}
		m := chapterRegex.FindStringSubmatch(text)
		if len(m) < 2 {
			return
		}
		found := strings.ReplaceAll(m[1], `\/`, "/")
		found = strings.ReplaceAll(found, "Issue #", "")
		found = strings.ReplaceAll(found, "#", "")
		// Stripping a leading or trailing "#" can leave stray whitespace.
		title = strings.TrimSpace(found)
	})
	return title
}
// ParseBatcaveBizImageLinks extracts image URLs from the __DATA__.images JavaScript
// variable embedded in a batcave.biz page.
func ParseBatcaveBizImageLinks(markup *goquery.Document, c chan []string) ([]string, error) {
@@ -248,7 +267,7 @@ func ParseBatcaveBizImageLinks(markup *goquery.Document, c chan []string) ([]str
urlRegex := regexp.MustCompile(`"([^"]+)"`)
for _, m := range urlRegex.FindAllStringSubmatch(arrayMatch[1], -1) {
if len(m) >= 2 {
links = append(links, m[1])
links = append(links, strings.ReplaceAll(m[1], `\/`, "/"))
}
}
})

View File

@@ -24,6 +24,15 @@ func TestParseBatcaveBizImageLinks(t *testing.T) {
expectErr: false,
expectURLs: []string{"https://cdn.batcave.biz/img/001.jpg", "https://cdn.batcave.biz/img/002.jpg"},
},
{
name: "unescapes forward slashes in URLs",
html: `<html><body><script>
var __DATA__ = {"images":["https:\/\/cdn.batcave.biz\/img\/001.jpg"]};
</script></body></html>`,
expectCount: 1,
expectErr: false,
expectURLs: []string{"https://cdn.batcave.biz/img/001.jpg"},
},
{
name: "extracts images with spaces around colon and bracket",
html: `<html><body><script>