Files
yoink-go/comic/parser_test.go
Bryan Bailey d2c715e973 feat: add batcave.biz support, closes #6
## What changed

- `BatcaveBizMarkup` now accepts a `clientChan chan *http.Client` and
  sends the authenticated cookie jar client back to the caller after
  completing the Cloudflare challenge flow. All error paths send nil so
  the caller never blocks.

- `Comic` struct gains a `Client *http.Client` field. `NewComic` wires
  up the channel, receives the client, and stores it so downstream code
  can reuse the same authenticated session.

- `downloadFile` branches on `c.Client`: when set it builds the request
  manually and only attaches a `Referer: https://batcave.biz/` header
  when the image URL is actually on batcave.biz. Some issues host images
  on third-party CDNs (e.g. readcomicsonline.ru) that actively block
  requests with a batcave Referer, returning 403 — omitting the header
  fixes those.

- `ParseBatcaveBizTitle` extracts the chapter title from the
  `__DATA__.chapters` JSON array by matching the chapter ID in the URL's
  last path segment. The HTML `<title>` on batcave.biz is prefixed with
  "Read " and suffixed with "comics online for free", making it
  unsuitable as a filename. Using the chapter data gives clean titles
  like "Nightwing (1996) 153". "Issue #" and bare "#" are stripped since
  the hash character causes problems on some filesystems and tools.

- `ParseBatcaveBizImageLinks` now unescapes `\/` → `/` in extracted
  URLs. The `__DATA__` JSON often contains forward-slash-escaped URLs
  that would otherwise be stored verbatim.

- `archive.go`: `filepath.Walk` was called on `filepath.Dir(sourcePath)`
  (the library root) instead of `sourcePath` (the comic's own folder).
  This caused any leftover image files from previous downloads in sibling
  directories to be included in every new CBZ. Fixed by walking
  `sourcePath` directly.

- `BatcaveBizMarkup` client now has a 30s `Timeout`. Without it, a
  single stalled CDN connection would hang the worker goroutine
  indefinitely, causing `Download()` to block forever waiting for a
  result that never arrives.

- Fixed `for e := range err` in `cli/root.go` — ranging over `[]error`
  with one variable yields the index, not the error value.
2026-03-11 20:55:03 -04:00

193 lines
5.0 KiB
Go

package comic
import (
"strings"
"testing"
"github.com/PuerkitoBio/goquery"
)
// TestParseBatcaveBizImageLinks runs ParseBatcaveBizImageLinks against small
// HTML fixtures and checks three things per case: whether an error was
// returned, the links in the return value, and the number of links delivered
// on the channel passed to the parser.
func TestParseBatcaveBizImageLinks(t *testing.T) {
	tests := []struct {
		name        string
		html        string
		expectCount int
		expectErr   bool
		expectURLs  []string
	}{
		{
			name: "extracts images from __DATA__",
			html: `<html><body><script>
var __DATA__ = {"images":["https://cdn.batcave.biz/img/001.jpg","https://cdn.batcave.biz/img/002.jpg"]};
</script></body></html>`,
			expectCount: 2,
			expectErr:   false,
			expectURLs:  []string{"https://cdn.batcave.biz/img/001.jpg", "https://cdn.batcave.biz/img/002.jpg"},
		},
		{
			name: "unescapes forward slashes in URLs",
			html: `<html><body><script>
var __DATA__ = {"images":["https:\/\/cdn.batcave.biz\/img\/001.jpg"]};
</script></body></html>`,
			expectCount: 1,
			expectErr:   false,
			expectURLs:  []string{"https://cdn.batcave.biz/img/001.jpg"},
		},
		{
			name: "extracts images with spaces around colon and bracket",
			html: `<html><body><script>
var __DATA__ = {"images" : [ "https://cdn.batcave.biz/img/001.jpg" ]};
</script></body></html>`,
			expectCount: 1,
			expectErr:   false,
			expectURLs:  []string{"https://cdn.batcave.biz/img/001.jpg"},
		},
		{
			name: "no __DATA__ script",
			html: `<html><body><script>
var foo = "bar";
</script></body></html>`,
			expectCount: 0,
			expectErr:   true,
		},
		{
			name: "__DATA__ present but no images key",
			html: `<html><body><script>
var __DATA__ = {"title":"Nightwing"};
</script></body></html>`,
			expectCount: 0,
			expectErr:   true,
		},
		{
			name:        "no script tags",
			html:        `<html><body><p>nothing here</p></body></html>`,
			expectCount: 0,
			expectErr:   true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// A broken fixture must fail loudly here instead of handing a nil
			// document to the parser. A separate variable name keeps the
			// parser's err below at its own inferred type.
			doc, docErr := goquery.NewDocumentFromReader(strings.NewReader(tt.html))
			if docErr != nil {
				t.Fatalf("building document from fixture: %v", docErr)
			}

			// Capacity 1 lets the parser send without a concurrent receiver.
			ch := make(chan []string, 1)
			links, err := ParseBatcaveBizImageLinks(doc, ch)

			if tt.expectErr && err == nil {
				t.Error("ParseBatcaveBizImageLinks() expected error, got nil")
			}
			if !tt.expectErr && err != nil {
				t.Errorf("ParseBatcaveBizImageLinks() unexpected error: %v", err)
			}
			if len(links) != tt.expectCount {
				t.Errorf("ParseBatcaveBizImageLinks() returned %d links, want %d", len(links), tt.expectCount)
			}

			// Compare element by element so a mismatch reports the exact index.
			for i, expected := range tt.expectURLs {
				if i >= len(links) {
					t.Errorf("missing link at index %d: want %q", i, expected)
					continue
				}
				if links[i] != expected {
					t.Errorf("links[%d] = %q, want %q", i, links[i], expected)
				}
			}

			// The test expects a value on the channel in every case, including
			// the error cases; this receive blocks if the parser never sends.
			channelLinks := <-ch
			if len(channelLinks) != tt.expectCount {
				t.Errorf("channel received %d links, want %d", len(channelLinks), tt.expectCount)
			}
		})
	}
}
// TestImageParseError checks that calling Error() on an ImageParseError
// yields the text that was supplied as its Message.
func TestImageParseError(t *testing.T) {
	const want = "test error"

	parseErr := ImageParseError{Message: want, Code: 1}
	if got := parseErr.Error(); got != want {
		t.Errorf("Error() = %q, want %q", got, want)
	}
}
// TestParseImageLinks runs ParseImageLinks against small HTML fixtures made of
// <img> tags and checks, per case, whether an error was returned, how many
// links came back, and how many links were delivered on the channel passed to
// the parser.
func TestParseImageLinks(t *testing.T) {
	tests := []struct {
		name        string
		html        string
		expectCount int
		expectErr   bool
	}{
		{
			name: "extracts blogspot images",
			html: `<html><body>
<img src="https://bp.blogspot.com/page-001.jpg" />
<img src="https://bp.blogspot.com/page-002.jpg" />
</body></html>`,
			expectCount: 2,
			expectErr:   false,
		},
		{
			name: "extracts blogger googleusercontent images",
			html: `<html><body>
<img src="https://blogger.googleusercontent.com/page-001.jpg" />
</body></html>`,
			expectCount: 1,
			expectErr:   false,
		},
		{
			name: "extracts covers images",
			html: `<html><body>
<img src="https://example.com/covers/cover-001.jpg" />
</body></html>`,
			expectCount: 1,
			expectErr:   false,
		},
		{
			name: "excludes logo images",
			html: `<html><body>
<img src="https://bp.blogspot.com/logo-site.jpg" />
<img src="https://bp.blogspot.com/page-001.jpg" />
</body></html>`,
			expectCount: 1,
			expectErr:   false,
		},
		{
			name: "excludes non-matching images",
			html: `<html><body>
<img src="https://other-site.com/image.jpg" />
<img src="https://cdn.example.com/banner.png" />
</body></html>`,
			expectCount: 0,
			expectErr:   true,
		},
		{
			name:        "no images at all",
			html:        `<html><body><p>No images here</p></body></html>`,
			expectCount: 0,
			expectErr:   true,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			// A broken fixture must fail loudly here instead of handing a nil
			// document to the parser. A separate variable name keeps the
			// parser's err below at its own inferred type.
			doc, docErr := goquery.NewDocumentFromReader(strings.NewReader(tt.html))
			if docErr != nil {
				t.Fatalf("building document from fixture: %v", docErr)
			}

			// Capacity 1 lets the parser send without a concurrent receiver.
			ch := make(chan []string, 1)
			links, err := ParseImageLinks(doc, ch)

			if tt.expectErr && err == nil {
				t.Error("ParseImageLinks() expected error, got nil")
			}
			if !tt.expectErr && err != nil {
				t.Errorf("ParseImageLinks() unexpected error: %v", err)
			}
			if len(links) != tt.expectCount {
				t.Errorf("ParseImageLinks() returned %d links, want %d", len(links), tt.expectCount)
			}

			// Verify the channel also received the links. The test expects a
			// value in every case, including the error cases; this receive
			// blocks if the parser never sends.
			channelLinks := <-ch
			if len(channelLinks) != tt.expectCount {
				t.Errorf("channel received %d links, want %d", len(channelLinks), tt.expectCount)
			}
		})
	}
}