When readallcomics.com pages have a <title> containing only the issue number (e.g. '#018 (2026)'), fall back to the h1 element first, then derive the title from the URL slug by stripping the trailing year and title-casing the hyphen-separated segments. Closes #4
184 lines
4.7 KiB
Go
184 lines
4.7 KiB
Go
package comic
|
|
|
|
import (
|
|
"strings"
|
|
"testing"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
func TestParseBatcaveBizImageLinks(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
html string
|
|
expectCount int
|
|
expectErr bool
|
|
expectURLs []string
|
|
}{
|
|
{
|
|
name: "extracts images from __DATA__",
|
|
html: `<html><body><script>
|
|
var __DATA__ = {"images":["https://cdn.batcave.biz/img/001.jpg","https://cdn.batcave.biz/img/002.jpg"]};
|
|
</script></body></html>`,
|
|
expectCount: 2,
|
|
expectErr: false,
|
|
expectURLs: []string{"https://cdn.batcave.biz/img/001.jpg", "https://cdn.batcave.biz/img/002.jpg"},
|
|
},
|
|
{
|
|
name: "extracts images with spaces around colon and bracket",
|
|
html: `<html><body><script>
|
|
var __DATA__ = {"images" : [ "https://cdn.batcave.biz/img/001.jpg" ]};
|
|
</script></body></html>`,
|
|
expectCount: 1,
|
|
expectErr: false,
|
|
expectURLs: []string{"https://cdn.batcave.biz/img/001.jpg"},
|
|
},
|
|
{
|
|
name: "no __DATA__ script",
|
|
html: `<html><body><script>
|
|
var foo = "bar";
|
|
</script></body></html>`,
|
|
expectCount: 0,
|
|
expectErr: true,
|
|
},
|
|
{
|
|
name: "__DATA__ present but no images key",
|
|
html: `<html><body><script>
|
|
var __DATA__ = {"title":"Nightwing"};
|
|
</script></body></html>`,
|
|
expectCount: 0,
|
|
expectErr: true,
|
|
},
|
|
{
|
|
name: "no script tags",
|
|
html: `<html><body><p>nothing here</p></body></html>`,
|
|
expectCount: 0,
|
|
expectErr: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(tt.html))
|
|
ch := make(chan []string, 1)
|
|
|
|
links, err := ParseBatcaveBizImageLinks(doc, ch)
|
|
|
|
if tt.expectErr && err == nil {
|
|
t.Error("ParseBatcaveBizImageLinks() expected error, got nil")
|
|
}
|
|
if !tt.expectErr && err != nil {
|
|
t.Errorf("ParseBatcaveBizImageLinks() unexpected error: %v", err)
|
|
}
|
|
if len(links) != tt.expectCount {
|
|
t.Errorf("ParseBatcaveBizImageLinks() returned %d links, want %d", len(links), tt.expectCount)
|
|
}
|
|
for i, expected := range tt.expectURLs {
|
|
if i >= len(links) {
|
|
t.Errorf("missing link at index %d: want %q", i, expected)
|
|
continue
|
|
}
|
|
if links[i] != expected {
|
|
t.Errorf("links[%d] = %q, want %q", i, links[i], expected)
|
|
}
|
|
}
|
|
|
|
channelLinks := <-ch
|
|
if len(channelLinks) != tt.expectCount {
|
|
t.Errorf("channel received %d links, want %d", len(channelLinks), tt.expectCount)
|
|
}
|
|
})
|
|
}
|
|
}
|
|
|
|
func TestImageParseError(t *testing.T) {
|
|
err := ImageParseError{Message: "test error", Code: 1}
|
|
if err.Error() != "test error" {
|
|
t.Errorf("Error() = %q, want %q", err.Error(), "test error")
|
|
}
|
|
}
|
|
|
|
func TestParseImageLinks(t *testing.T) {
|
|
tests := []struct {
|
|
name string
|
|
html string
|
|
expectCount int
|
|
expectErr bool
|
|
}{
|
|
{
|
|
name: "extracts blogspot images",
|
|
html: `<html><body>
|
|
<img src="https://bp.blogspot.com/page-001.jpg" />
|
|
<img src="https://bp.blogspot.com/page-002.jpg" />
|
|
</body></html>`,
|
|
expectCount: 2,
|
|
expectErr: false,
|
|
},
|
|
{
|
|
name: "extracts blogger googleusercontent images",
|
|
html: `<html><body>
|
|
<img src="https://blogger.googleusercontent.com/page-001.jpg" />
|
|
</body></html>`,
|
|
expectCount: 1,
|
|
expectErr: false,
|
|
},
|
|
{
|
|
name: "extracts covers images",
|
|
html: `<html><body>
|
|
<img src="https://example.com/covers/cover-001.jpg" />
|
|
</body></html>`,
|
|
expectCount: 1,
|
|
expectErr: false,
|
|
},
|
|
{
|
|
name: "excludes logo images",
|
|
html: `<html><body>
|
|
<img src="https://bp.blogspot.com/logo-site.jpg" />
|
|
<img src="https://bp.blogspot.com/page-001.jpg" />
|
|
</body></html>`,
|
|
expectCount: 1,
|
|
expectErr: false,
|
|
},
|
|
{
|
|
name: "excludes non-matching images",
|
|
html: `<html><body>
|
|
<img src="https://other-site.com/image.jpg" />
|
|
<img src="https://cdn.example.com/banner.png" />
|
|
</body></html>`,
|
|
expectCount: 0,
|
|
expectErr: true,
|
|
},
|
|
{
|
|
name: "no images at all",
|
|
html: `<html><body><p>No images here</p></body></html>`,
|
|
expectCount: 0,
|
|
expectErr: true,
|
|
},
|
|
}
|
|
|
|
for _, tt := range tests {
|
|
t.Run(tt.name, func(t *testing.T) {
|
|
doc, _ := goquery.NewDocumentFromReader(strings.NewReader(tt.html))
|
|
ch := make(chan []string, 1)
|
|
|
|
links, err := ParseImageLinks(doc, ch)
|
|
|
|
if tt.expectErr && err == nil {
|
|
t.Error("ParseImageLinks() expected error, got nil")
|
|
}
|
|
if !tt.expectErr && err != nil {
|
|
t.Errorf("ParseImageLinks() unexpected error: %v", err)
|
|
}
|
|
if len(links) != tt.expectCount {
|
|
t.Errorf("ParseImageLinks() returned %d links, want %d", len(links), tt.expectCount)
|
|
}
|
|
|
|
// Verify the channel also received the links
|
|
channelLinks := <-ch
|
|
if len(channelLinks) != tt.expectCount {
|
|
t.Errorf("channel received %d links, want %d", len(channelLinks), tt.expectCount)
|
|
}
|
|
})
|
|
}
|
|
}
|