From dcb41deea9f971822dc551dbe17976b7a5987c3c Mon Sep 17 00:00:00 2001 From: Bryan Bailey Date: Wed, 11 Mar 2026 18:13:14 -0400 Subject: [PATCH] fix: extract title from h1 or URL slug when page title starts with # When readallcomics.com pages have a title element containing only the issue number (e.g. '#018 (2026)'), fall back to the h1 element first, then derive the title from the URL slug by stripping the trailing year and title-casing the hyphen-separated segments. Closes #4 --- comic/archive_test.go | 110 +++++++++++++++++++++++ comic/cleanup_test.go | 93 ++++++++++++++++++++ comic/comic.go | 65 +++++++++++--- comic/comic_test.go | 170 ++++++++++++++++++++++++++++++++++++ comic/download_test.go | 145 +++++++++++++++++++++++++++++++ comic/parser.go | 192 +++++++++++++++++++++++++++++++++++++++++ comic/parser_test.go | 183 +++++++++++++++++++++++++++++++++++++++ go.mod | 1 + go.sum | 4 + 9 files changed, 950 insertions(+), 13 deletions(-) create mode 100644 comic/archive_test.go create mode 100644 comic/cleanup_test.go create mode 100644 comic/comic_test.go create mode 100644 comic/download_test.go create mode 100644 comic/parser_test.go diff --git a/comic/archive_test.go b/comic/archive_test.go new file mode 100644 index 0000000..6f1645e --- /dev/null +++ b/comic/archive_test.go @@ -0,0 +1,110 @@ +package comic + +import ( + "archive/zip" + "os" + "path/filepath" + "testing" +) + +func TestArchiveError(t *testing.T) { + err := ArchiveError{Message: "archive failed", Code: 1} + if err.Error() != "archive failed" { + t.Errorf("Error() = %q, want %q", err.Error(), "archive failed") + } +} + +func TestArchive(t *testing.T) { + t.Run("creates cbz with image files", func(t *testing.T) { + tmpDir := t.TempDir() + title := "TestComic" + comicDir := filepath.Join(tmpDir, title) + os.MkdirAll(comicDir, os.ModePerm) + + // Create fake image files + for _, name := range []string{"TestComic 001.jpg", "TestComic 002.jpg", "TestComic 003.png"} { + os.WriteFile(filepath.Join(comicDir, name), 
[]byte("fake image"), 0644) + } + + c := &Comic{ + Title: title, + LibraryPath: tmpDir, + } + + err := c.Archive() + if err != nil { + t.Fatalf("Archive() unexpected error: %v", err) + } + + archivePath := filepath.Join(comicDir, title+".cbz") + if _, err := os.Stat(archivePath); os.IsNotExist(err) { + t.Fatalf("expected archive %s to exist", archivePath) + } + + // Verify the zip contains the image files + reader, err := zip.OpenReader(archivePath) + if err != nil { + t.Fatalf("failed to open archive: %v", err) + } + defer reader.Close() + + if len(reader.File) != 3 { + t.Errorf("archive contains %d files, want 3", len(reader.File)) + } + }) + + t.Run("excludes non-image files from archive", func(t *testing.T) { + tmpDir := t.TempDir() + title := "TestComic" + comicDir := filepath.Join(tmpDir, title) + os.MkdirAll(comicDir, os.ModePerm) + + // Create mixed files + os.WriteFile(filepath.Join(comicDir, "page-001.jpg"), []byte("image"), 0644) + os.WriteFile(filepath.Join(comicDir, "readme.txt"), []byte("text"), 0644) + os.WriteFile(filepath.Join(comicDir, "data.json"), []byte("json"), 0644) + + c := &Comic{ + Title: title, + LibraryPath: tmpDir, + } + + err := c.Archive() + if err != nil { + t.Fatalf("Archive() unexpected error: %v", err) + } + + archivePath := filepath.Join(comicDir, title+".cbz") + reader, err := zip.OpenReader(archivePath) + if err != nil { + t.Fatalf("failed to open archive: %v", err) + } + defer reader.Close() + + if len(reader.File) != 1 { + t.Errorf("archive contains %d files, want 1 (only .jpg)", len(reader.File)) + } + }) + + t.Run("handles empty directory", func(t *testing.T) { + tmpDir := t.TempDir() + title := "EmptyComic" + comicDir := filepath.Join(tmpDir, title) + os.MkdirAll(comicDir, os.ModePerm) + + c := &Comic{ + Title: title, + LibraryPath: tmpDir, + } + + err := c.Archive() + if err != nil { + t.Fatalf("Archive() unexpected error: %v", err) + } + + archivePath := filepath.Join(comicDir, title+".cbz") + if _, err := 
os.Stat(archivePath); os.IsNotExist(err) { + t.Fatalf("expected archive %s to exist even if empty", archivePath) + } + }) +} diff --git a/comic/cleanup_test.go b/comic/cleanup_test.go new file mode 100644 index 0000000..da90ce4 --- /dev/null +++ b/comic/cleanup_test.go @@ -0,0 +1,93 @@ +package comic + +import ( + "os" + "path/filepath" + "testing" +) + +func TestCleanup(t *testing.T) { + t.Run("keeps cover image 001 and removes others", func(t *testing.T) { + tmpDir := t.TempDir() + title := "TestComic" + comicDir := filepath.Join(tmpDir, title) + os.MkdirAll(comicDir, os.ModePerm) + + files := map[string]bool{ + "TestComic 001.jpg": true, // should be kept + "TestComic 002.jpg": false, // should be removed + "TestComic 003.jpg": false, // should be removed + } + + for name := range files { + os.WriteFile(filepath.Join(comicDir, name), []byte("fake"), 0644) + } + + c := &Comic{ + Title: title, + LibraryPath: tmpDir, + } + + err := c.Cleanup() + if err != nil { + t.Fatalf("Cleanup() unexpected error: %v", err) + } + + for name, shouldExist := range files { + path := filepath.Join(comicDir, name) + _, err := os.Stat(path) + exists := !os.IsNotExist(err) + + if shouldExist && !exists { + t.Errorf("expected %s to be kept, but it was removed", name) + } + if !shouldExist && exists { + t.Errorf("expected %s to be removed, but it still exists", name) + } + } + }) + + t.Run("keeps non-image files", func(t *testing.T) { + tmpDir := t.TempDir() + title := "TestComic" + comicDir := filepath.Join(tmpDir, title) + os.MkdirAll(comicDir, os.ModePerm) + + os.WriteFile(filepath.Join(comicDir, "TestComic.cbz"), []byte("archive"), 0644) + os.WriteFile(filepath.Join(comicDir, "metadata.json"), []byte("data"), 0644) + + c := &Comic{ + Title: title, + LibraryPath: tmpDir, + } + + err := c.Cleanup() + if err != nil { + t.Fatalf("Cleanup() unexpected error: %v", err) + } + + for _, name := range []string{"TestComic.cbz", "metadata.json"} { + path := filepath.Join(comicDir, name) + if _, 
err := os.Stat(path); os.IsNotExist(err) { + t.Errorf("expected non-image file %s to be kept", name) + } + } + }) + + t.Run("handles empty directory", func(t *testing.T) { + tmpDir := t.TempDir() + title := "EmptyComic" + comicDir := filepath.Join(tmpDir, title) + os.MkdirAll(comicDir, os.ModePerm) + + c := &Comic{ + Title: title, + LibraryPath: tmpDir, + } + + err := c.Cleanup() + if err != nil { + t.Fatalf("Cleanup() unexpected error for empty dir: %v", err) + } + }) +} diff --git a/comic/comic.go b/comic/comic.go index 4a48c73..16fa617 100644 --- a/comic/comic.go +++ b/comic/comic.go @@ -26,21 +26,52 @@ type Comic struct { // Returns the extracted title as a string. func extractTitleFromMarkup(c Comic) string { yearFormat := `^(.*?)\s+\(\d{4}(?:\s+.+)?\)` - selection := c.Markup.Find("title") - - if selection.Length() == 0 { - return "Untitled" - } - - content := selection.First().Text() regex := regexp.MustCompile(yearFormat) - matches := regex.FindStringSubmatch(content) - if len(matches) != 2 { - return "Untitled" + extractFrom := func(text string) string { + matches := regex.FindStringSubmatch(text) + if len(matches) != 2 { + return "" + } + return strings.ReplaceAll(matches[1], ":", "") } - return strings.ReplaceAll(matches[1], ":", "") + title := extractFrom(c.Markup.Find("title").First().Text()) + + if strings.HasPrefix(title, "#") { + if h1 := extractFrom(c.Markup.Find("h1").First().Text()); h1 != "" && !strings.HasPrefix(h1, "#") { + return h1 + } + if slug := titleFromSlug(c.URL); slug != "" { + return slug + } + } + + if title != "" { + return title + } + + return "Untitled" +} + +// titleFromSlug derives a comic title from the last path segment of a URL. +// It strips a trailing year (-YYYY), replaces hyphens with spaces, and title-cases the result. 
+func titleFromSlug(url string) string { + slug := strings.TrimRight(url, "/") + if i := strings.LastIndex(slug, "/"); i >= 0 { + slug = slug[i+1:] + } + slug = regexp.MustCompile(`-\d{4}$`).ReplaceAllString(slug, "") + if slug == "" { + return "" + } + words := strings.Split(slug, "-") + for i, w := range words { + if len(w) > 0 { + words[i] = strings.ToUpper(w[:1]) + w[1:] + } + } + return strings.Join(words, " ") } // NewComic creates a new Comic instance from the provided URL and library path. @@ -61,13 +92,21 @@ func NewComic( LibraryPath: libraryPath, } - go Markup(c.URL, markupChannel) + if strings.Contains(url, "batcave.biz") { + go BatcaveBizMarkup(url, markupChannel) + } else { + go Markup(url, markupChannel) + } markup := <-markupChannel c.Markup = markup c.Title = extractTitleFromMarkup(*c) - go ParseImageLinks(markup, imageChannel) + if strings.Contains(url, "batcave.biz") { + go ParseBatcaveBizImageLinks(markup, imageChannel) + } else { + go ParseImageLinks(markup, imageChannel) + } links := <-imageChannel c.Filelist = links diff --git a/comic/comic_test.go b/comic/comic_test.go new file mode 100644 index 0000000..56f2318 --- /dev/null +++ b/comic/comic_test.go @@ -0,0 +1,170 @@ +package comic + +import ( + "strings" + "testing" + + "github.com/PuerkitoBio/goquery" +) + +func newDocFromHTML(html string) *goquery.Document { + doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html)) + return doc +} + +func TestExtractTitleFromMarkup(t *testing.T) { + tests := []struct { + name string + html string + url string + expected string + }{ + { + name: "standard title with year", + html: `<html><head><title>Ultraman X Avengers 001 (2024)`, + expected: "Ultraman X Avengers 001", + }, + { + name: "title with year and extra text", + html: `Batman 042 (2023 Digital)`, + expected: "Batman 042", + }, + { + name: "title with colon removed", + html: `Spider-Man: No Way Home 001 (2022)`, + expected: "Spider-Man No Way Home 001", + }, + { + name: "no title tag", 
+ html: ``, + expected: "Untitled", + }, + { + name: "title without year pattern", + html: `Some Random Page`, + expected: "Untitled", + }, + { + name: "empty title", + html: ``, + expected: "Untitled", + }, + { + name: "title starts with # falls back to h1", + html: `#018 (2026)

Absolute Batman #018 (2026)

`, + expected: "Absolute Batman #018", + }, + { + name: "title starts with # but h1 also starts with #, falls back to slug", + html: `#018 (2026)

#018 (2026)

`, + url: "https://readallcomics.com/absolute-batman-018-2026/", + expected: "Absolute Batman 018", + }, + { + name: "title starts with # falls back to slug when no h1", + html: `#018 (2026)`, + url: "https://readallcomics.com/absolute-batman-018-2026/", + expected: "Absolute Batman 018", + }, + { + name: "title starts with # no h1 no url", + html: `#018 (2026)`, + expected: "#018", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + doc := newDocFromHTML(tt.html) + c := Comic{Markup: doc, URL: tt.url} + result := extractTitleFromMarkup(c) + if result != tt.expected { + t.Errorf("extractTitleFromMarkup() = %q, want %q", result, tt.expected) + } + }) + } +} + +func TestTitleFromSlug(t *testing.T) { + tests := []struct { + name string + url string + expected string + }{ + { + name: "standard comic URL", + url: "https://readallcomics.com/absolute-batman-018-2026/", + expected: "Absolute Batman 018", + }, + { + name: "no trailing slash", + url: "https://readallcomics.com/absolute-batman-018-2026", + expected: "Absolute Batman 018", + }, + { + name: "no year in slug", + url: "https://readallcomics.com/absolute-batman-018/", + expected: "Absolute Batman 018", + }, + { + name: "single word slug", + url: "https://readallcomics.com/batman/", + expected: "Batman", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := titleFromSlug(tt.url) + if result != tt.expected { + t.Errorf("titleFromSlug() = %q, want %q", result, tt.expected) + } + }) + } +} + +func TestCover(t *testing.T) { + tests := []struct { + name string + filelist []string + wantSuffix string + expectErr bool + }{ + { + name: "finds cover ending in 001.jpg", + filelist: []string{"https://example.com/image-002.jpg", "https://example.com/image-001.jpg", "https://example.com/image-003.jpg"}, + wantSuffix: "image-001.jpg", + }, + { + name: "finds cover ending in 000.jpg", + filelist: []string{"https://example.com/image-000.jpg", 
"https://example.com/image-001.jpg"}, + wantSuffix: "image-000.jpg", + }, + { + name: "returns error when no cover found", + filelist: []string{"https://example.com/image-002.jpg", "https://example.com/image-003.jpg"}, + expectErr: true, + }, + { + name: "returns error for empty filelist", + filelist: []string{}, + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := &Comic{Filelist: tt.filelist} + cover, err := c.Cover() + if tt.expectErr && err == nil { + t.Error("Cover() expected error, got nil") + } + if !tt.expectErr && err != nil { + t.Errorf("Cover() unexpected error: %v", err) + } + if tt.wantSuffix != "" && !strings.HasSuffix(cover, tt.wantSuffix) { + t.Errorf("Cover() = %q, want path ending in %q", cover, tt.wantSuffix) + } + }) + } +} diff --git a/comic/download_test.go b/comic/download_test.go new file mode 100644 index 0000000..690ad68 --- /dev/null +++ b/comic/download_test.go @@ -0,0 +1,145 @@ +package comic + +import ( + "net/http" + "net/http/httptest" + "os" + "path/filepath" + "testing" +) + +func TestComicDownloadError(t *testing.T) { + err := ComicDownloadError{Message: "download failed", Code: 1} + if err.Error() != "download failed" { + t.Errorf("Error() = %q, want %q", err.Error(), "download failed") + } +} + +func TestHandleRequest(t *testing.T) { + t.Run("successful request", func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if r.Header.Get("User-Agent") == "" { + t.Error("expected User-Agent header to be set") + } + w.WriteHeader(http.StatusOK) + w.Write([]byte("image data")) + })) + defer server.Close() + + resp, err := handleRequest(server.URL) + if err != nil { + t.Fatalf("handleRequest() unexpected error: %v", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK { + t.Errorf("handleRequest() status = %d, want %d", resp.StatusCode, http.StatusOK) + } + }) + + t.Run("non-200 response", func(t 
*testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + })) + defer server.Close() + + _, err := handleRequest(server.URL) + if err == nil { + t.Error("handleRequest() expected error for 404 response, got nil") + } + }) + + t.Run("invalid URL", func(t *testing.T) { + _, err := handleRequest("http://invalid.localhost:0/bad") + if err == nil { + t.Error("handleRequest() expected error for invalid URL, got nil") + } + }) +} + +func TestDownloadFile(t *testing.T) { + t.Run("successful download", func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("fake image content")) + })) + defer server.Close() + + tmpDir := t.TempDir() + c := &Comic{ + Title: "TestComic", + LibraryPath: tmpDir, + } + + err := downloadFile(server.URL+"/image.jpg", 1, c) + if err != nil { + t.Fatalf("downloadFile() unexpected error: %v", err) + } + + expectedPath := filepath.Join(tmpDir, "TestComic", "TestComic 001.jpg") + if _, err := os.Stat(expectedPath); os.IsNotExist(err) { + t.Errorf("expected file %s to exist", expectedPath) + } + }) + + t.Run("formats page number with leading zeros", func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + w.Write([]byte("fake image content")) + })) + defer server.Close() + + tmpDir := t.TempDir() + c := &Comic{ + Title: "TestComic", + LibraryPath: tmpDir, + } + + err := downloadFile(server.URL+"/image.jpg", 42, c) + if err != nil { + t.Fatalf("downloadFile() unexpected error: %v", err) + } + + expectedPath := filepath.Join(tmpDir, "TestComic", "TestComic 042.jpg") + if _, err := os.Stat(expectedPath); os.IsNotExist(err) { + t.Errorf("expected file %s to exist", expectedPath) + } + }) + + t.Run("server error returns error", func(t *testing.T) { + server := 
httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + })) + defer server.Close() + + tmpDir := t.TempDir() + c := &Comic{ + Title: "TestComic", + LibraryPath: tmpDir, + } + + err := downloadFile(server.URL+"/image.jpg", 1, c) + if err == nil { + t.Error("downloadFile() expected error for server error, got nil") + } + }) + + t.Run("empty response body returns error", func(t *testing.T) { + server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusOK) + // write nothing + })) + defer server.Close() + + tmpDir := t.TempDir() + c := &Comic{ + Title: "TestComic", + LibraryPath: tmpDir, + } + + err := downloadFile(server.URL+"/image.jpg", 1, c) + if err == nil { + t.Error("downloadFile() expected error for empty body, got nil") + } + }) +} diff --git a/comic/parser.go b/comic/parser.go index da8429b..32b1058 100644 --- a/comic/parser.go +++ b/comic/parser.go @@ -3,6 +3,9 @@ package comic import ( "io" "net/http" + "net/http/cookiejar" + "net/url" + "regexp" "strings" "github.com/PuerkitoBio/goquery" @@ -47,6 +50,143 @@ func Markup(url string, c chan *goquery.Document) *goquery.Document { return markup } +func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Document { + jar, _ := cookiejar.New(nil) + client := &http.Client{ + Jar: jar, + CheckRedirect: func(req *http.Request, via []*http.Request) error { + return nil + }, + } + + headers := map[string]string{ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", + "Accept-Language": "en-US,en;q=0.9", + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", + } + + // GET the challange page to obtain cookies and any necessary tokens + req, err := http.NewRequest("GET", referer, nil) + if err != nil { + if c != nil { + c <- &goquery.Document{} 
+ } + return &goquery.Document{} + } + for k, v := range headers { + req.Header.Set(k, v) + } + + res, err := client.Do(req) + if err != nil { + if c != nil { + c <- &goquery.Document{} + } + return &goquery.Document{} + } + defer res.Body.Close() + + body, err := io.ReadAll(res.Body) + if err != nil { + if c != nil { + c <- &goquery.Document{} + } + return &goquery.Document{} + } + + tokenRegex := regexp.MustCompile(`token:\s*"([^"]+)"`) + matches := tokenRegex.FindSubmatch(body) + + if matches == nil { + // no challenge, parse directly + doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body))) + if err != nil { + if c != nil { + c <- &goquery.Document{} + } + return &goquery.Document{} + } + if c != nil { + c <- doc + } + return doc + } + + encodedToken := string(matches[1]) + token, err := url.QueryUnescape(encodedToken) + if err != nil { + token = encodedToken + } + + // Step 3: POST to /_v with fake browser metrics + params := url.Values{} + params.Set("token", token) + params.Set("mode", "modern") + params.Set("workTime", "462") + params.Set("iterations", "183") + params.Set("webdriver", "0") + params.Set("touch", "0") + params.Set("screen_w", "1920") + params.Set("screen_h", "1080") + params.Set("screen_cd", "24") + + postReq, err := http.NewRequest("POST", "https://batcave.biz/_v", strings.NewReader(params.Encode())) + if err != nil { + if c != nil { + c <- &goquery.Document{} + } + return &goquery.Document{} + } + for k, v := range headers { + postReq.Header.Set(k, v) + } + postReq.Header.Set("Content-Type", "application/x-www-form-urlencoded") + postReq.Header.Set("Referer", referer) + + postRes, err := client.Do(postReq) + if err != nil { + if c != nil { + c <- &goquery.Document{} + } + return &goquery.Document{} + } + defer postRes.Body.Close() + io.ReadAll(postRes.Body) + + // GET the real page with the set cookie + realReq, err := http.NewRequest("GET", referer, nil) + if err != nil { + if c != nil { + c <- &goquery.Document{} + } + 
return &goquery.Document{} + } + for k, v := range headers { + realReq.Header.Set(k, v) + } + + realRes, err := client.Do(realReq) + if err != nil { + if c != nil { + c <- &goquery.Document{} + } + return &goquery.Document{} + } + defer realRes.Body.Close() + + doc, err := goquery.NewDocumentFromReader(realRes.Body) + if err != nil { + if c != nil { + c <- &goquery.Document{} + } + return &goquery.Document{} + } + if c != nil { + c <- doc + } + return doc +} + // ParseImageLinks parses a goquery document to extract image links. // // markup is the goquery document to parse for image links. @@ -69,3 +209,55 @@ func ParseImageLinks(markup *goquery.Document, c chan []string) ([]string, error return links, ImageParseError{Message: "No images found", Code: 1} } + +func ParseReadAllComicsLinks(markup *goquery.Document, c chan []string) ([]string, error) { + var links []string + markup.Find("img").Each(func(_ int, image *goquery.Selection) { + link, _ := image.Attr("src") + if !strings.Contains(link, "logo") && (strings.Contains(link, "bp.blogspot.com") || strings.Contains(link, "blogger.googleusercontent") || strings.Contains(link, "covers")) { + links = append(links, link) + } + }) + + c <- links + + if len(links) > 0 { + return links, nil + } + + return links, ImageParseError{Message: "No images found", Code: 1} +} + +// ParseBatcaveBizImageLinks extracts image URLs from the __DATA__.images JavaScript +// variable embedded in a batcave.biz page. 
+func ParseBatcaveBizImageLinks(markup *goquery.Document, c chan []string) ([]string, error) { + var links []string + + markup.Find("script").Each(func(_ int, s *goquery.Selection) { + text := s.Text() + if !strings.Contains(text, "__DATA__") { + return + } + + arrayRegex := regexp.MustCompile(`"images"\s*:\s*\[([^\]]+)\]`) + arrayMatch := arrayRegex.FindStringSubmatch(text) + if len(arrayMatch) < 2 { + return + } + + urlRegex := regexp.MustCompile(`"([^"]+)"`) + for _, m := range urlRegex.FindAllStringSubmatch(arrayMatch[1], -1) { + if len(m) >= 2 { + links = append(links, m[1]) + } + } + }) + + c <- links + + if len(links) > 0 { + return links, nil + } + + return links, ImageParseError{Message: "No images found", Code: 1} +} diff --git a/comic/parser_test.go b/comic/parser_test.go new file mode 100644 index 0000000..9502639 --- /dev/null +++ b/comic/parser_test.go @@ -0,0 +1,183 @@ +package comic + +import ( + "strings" + "testing" + + "github.com/PuerkitoBio/goquery" +) + +func TestParseBatcaveBizImageLinks(t *testing.T) { + tests := []struct { + name string + html string + expectCount int + expectErr bool + expectURLs []string + }{ + { + name: "extracts images from __DATA__", + html: ``, + expectCount: 2, + expectErr: false, + expectURLs: []string{"https://cdn.batcave.biz/img/001.jpg", "https://cdn.batcave.biz/img/002.jpg"}, + }, + { + name: "extracts images with spaces around colon and bracket", + html: ``, + expectCount: 1, + expectErr: false, + expectURLs: []string{"https://cdn.batcave.biz/img/001.jpg"}, + }, + { + name: "no __DATA__ script", + html: ``, + expectCount: 0, + expectErr: true, + }, + { + name: "__DATA__ present but no images key", + html: ``, + expectCount: 0, + expectErr: true, + }, + { + name: "no script tags", + html: `

nothing here

`, + expectCount: 0, + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + doc, _ := goquery.NewDocumentFromReader(strings.NewReader(tt.html)) + ch := make(chan []string, 1) + + links, err := ParseBatcaveBizImageLinks(doc, ch) + + if tt.expectErr && err == nil { + t.Error("ParseBatcaveBizImageLinks() expected error, got nil") + } + if !tt.expectErr && err != nil { + t.Errorf("ParseBatcaveBizImageLinks() unexpected error: %v", err) + } + if len(links) != tt.expectCount { + t.Errorf("ParseBatcaveBizImageLinks() returned %d links, want %d", len(links), tt.expectCount) + } + for i, expected := range tt.expectURLs { + if i >= len(links) { + t.Errorf("missing link at index %d: want %q", i, expected) + continue + } + if links[i] != expected { + t.Errorf("links[%d] = %q, want %q", i, links[i], expected) + } + } + + channelLinks := <-ch + if len(channelLinks) != tt.expectCount { + t.Errorf("channel received %d links, want %d", len(channelLinks), tt.expectCount) + } + }) + } +} + +func TestImageParseError(t *testing.T) { + err := ImageParseError{Message: "test error", Code: 1} + if err.Error() != "test error" { + t.Errorf("Error() = %q, want %q", err.Error(), "test error") + } +} + +func TestParseImageLinks(t *testing.T) { + tests := []struct { + name string + html string + expectCount int + expectErr bool + }{ + { + name: "extracts blogspot images", + html: ` + + + `, + expectCount: 2, + expectErr: false, + }, + { + name: "extracts blogger googleusercontent images", + html: ` + + `, + expectCount: 1, + expectErr: false, + }, + { + name: "extracts covers images", + html: ` + + `, + expectCount: 1, + expectErr: false, + }, + { + name: "excludes logo images", + html: ` + + + `, + expectCount: 1, + expectErr: false, + }, + { + name: "excludes non-matching images", + html: ` + + + `, + expectCount: 0, + expectErr: true, + }, + { + name: "no images at all", + html: `

No images here

`, + expectCount: 0, + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + doc, _ := goquery.NewDocumentFromReader(strings.NewReader(tt.html)) + ch := make(chan []string, 1) + + links, err := ParseImageLinks(doc, ch) + + if tt.expectErr && err == nil { + t.Error("ParseImageLinks() expected error, got nil") + } + if !tt.expectErr && err != nil { + t.Errorf("ParseImageLinks() unexpected error: %v", err) + } + if len(links) != tt.expectCount { + t.Errorf("ParseImageLinks() returned %d links, want %d", len(links), tt.expectCount) + } + + // Verify the channel also received the links + channelLinks := <-ch + if len(channelLinks) != tt.expectCount { + t.Errorf("channel received %d links, want %d", len(channelLinks), tt.expectCount) + } + }) + } +} diff --git a/go.mod b/go.mod index 0d3d77b..a078973 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.22.3 require ( github.com/DaRealFreak/cloudflare-bp-go v1.0.4 github.com/PuerkitoBio/goquery v1.9.2 + github.com/andybalholm/brotli v1.2.0 github.com/spf13/cobra v1.8.1 ) diff --git a/go.sum b/go.sum index 9bd373d..7c1d02f 100644 --- a/go.sum +++ b/go.sum @@ -4,6 +4,8 @@ github.com/EDDYCJY/fake-useragent v0.2.0 h1:Jcnkk2bgXmDpX0z+ELlUErTkoLb/mxFBNd2Y github.com/EDDYCJY/fake-useragent v0.2.0/go.mod h1:5wn3zzlDxhKW6NYknushqinPcAqZcAPHy8lLczCdJdc= github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE= github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk= +github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ= +github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY= github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= github.com/cpuguy83/go-md2man/v2 v2.0.4/go.mod h1:tgQtvFlXSQOSOSIRvRPT7W67SCa46tRHOmNcaadrF8o= @@ -20,6 
+22,8 @@ github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/testify v1.4.0 h1:2E4SXV/wtOkTonXsotYi4li6zVWxYlZuYNCXe9XRJyk= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= +github.com/xyproto/randomstring v1.0.5 h1:YtlWPoRdgMu3NZtP45drfy1GKoojuR7hmRcnhZqKjWU= +github.com/xyproto/randomstring v1.0.5/go.mod h1:rgmS5DeNXLivK7YprL0pY+lTuhNQW3iGxZ18UQApw/E= github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -- 2.49.1