When readallcomics.com pages have a <title> containing only the issue number (e.g. '#018 (2026)'), fall back to the h1 element first, then derive the title from the URL slug by stripping the trailing year and title-casing the hyphen-separated segments. Closes #4
134 lines
3.4 KiB
Go
134 lines
3.4 KiB
Go
package comic
|
|
|
|
import (
|
|
"path/filepath"
|
|
"regexp"
|
|
"strings"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// var debugUrl = "https://readallcomics.com/ultraman-x-avengers-001-2024/"
|
|
|
|
type Comic struct {
|
|
URL string
|
|
Title string
|
|
Markup *goquery.Document
|
|
Filelist []string
|
|
Next *Comic
|
|
Prev *Comic
|
|
LibraryPath string
|
|
}
|
|
|
|
// extractTitleFromMarkup extracts the title from the comic's markup.
|
|
//
|
|
// c is the Comic instance containing the markup to extract the title from.
|
|
// Returns the extracted title as a string.
|
|
func extractTitleFromMarkup(c Comic) string {
|
|
yearFormat := `^(.*?)\s+\(\d{4}(?:\s+.+)?\)`
|
|
regex := regexp.MustCompile(yearFormat)
|
|
|
|
extractFrom := func(text string) string {
|
|
matches := regex.FindStringSubmatch(text)
|
|
if len(matches) != 2 {
|
|
return ""
|
|
}
|
|
return strings.ReplaceAll(matches[1], ":", "")
|
|
}
|
|
|
|
title := extractFrom(c.Markup.Find("title").First().Text())
|
|
|
|
if strings.HasPrefix(title, "#") {
|
|
if h1 := extractFrom(c.Markup.Find("h1").First().Text()); h1 != "" && !strings.HasPrefix(h1, "#") {
|
|
return h1
|
|
}
|
|
if slug := titleFromSlug(c.URL); slug != "" {
|
|
return slug
|
|
}
|
|
}
|
|
|
|
if title != "" {
|
|
return title
|
|
}
|
|
|
|
return "Untitled"
|
|
}
|
|
|
|
// titleFromSlug derives a comic title from the last path segment of a URL.
|
|
// It strips a trailing year (-YYYY), replaces hyphens with spaces, and title-cases the result.
|
|
func titleFromSlug(url string) string {
|
|
slug := strings.TrimRight(url, "/")
|
|
if i := strings.LastIndex(slug, "/"); i >= 0 {
|
|
slug = slug[i+1:]
|
|
}
|
|
slug = regexp.MustCompile(`-\d{4}$`).ReplaceAllString(slug, "")
|
|
if slug == "" {
|
|
return ""
|
|
}
|
|
words := strings.Split(slug, "-")
|
|
for i, w := range words {
|
|
if len(w) > 0 {
|
|
words[i] = strings.ToUpper(w[:1]) + w[1:]
|
|
}
|
|
}
|
|
return strings.Join(words, " ")
|
|
}
|
|
|
|
// NewComic creates a new Comic instance from the provided URL and library path.
|
|
//
|
|
// url is the URL of the comic to be parsed.
|
|
// libraryPath is the path to the local library where the comic will be stored.
|
|
// imageChannel is a channel for receiving image links.
|
|
// markupChannel is a channel for receiving the comic's markup.
|
|
//
|
|
// Returns a pointer to the newly created Comic instance.
|
|
func NewComic(
|
|
url string, libraryPath string,
|
|
imageChannel chan []string,
|
|
markupChannel chan *goquery.Document,
|
|
) *Comic {
|
|
c := &Comic{
|
|
URL: url,
|
|
LibraryPath: libraryPath,
|
|
}
|
|
|
|
if strings.Contains(url, "batcave.biz") {
|
|
go BatcaveBizMarkup(url, markupChannel)
|
|
} else {
|
|
go Markup(url, markupChannel)
|
|
}
|
|
|
|
markup := <-markupChannel
|
|
c.Markup = markup
|
|
c.Title = extractTitleFromMarkup(*c)
|
|
|
|
if strings.Contains(url, "batcave.biz") {
|
|
go ParseBatcaveBizImageLinks(markup, imageChannel)
|
|
} else {
|
|
go ParseImageLinks(markup, imageChannel)
|
|
}
|
|
links := <-imageChannel
|
|
|
|
c.Filelist = links
|
|
|
|
return c
|
|
}
|
|
|
|
// Cover returns the absolute filepath of the cover image of the comic.
|
|
//
|
|
// It iterates through the list of images associated with the comic and returns the first image that ends with "000.jpg" or "001.jpg".
|
|
// If no such image is found, it returns an error.
|
|
// Returns the absolute filepath of the cover image and an error.
|
|
func (c *Comic) Cover() (imageFilepath string, err error) {
|
|
for _, image := range c.Filelist {
|
|
if strings.HasSuffix(image, "000.jpg") || strings.HasSuffix(image, "001.jpg") {
|
|
image, err := filepath.Abs(image)
|
|
if err != nil {
|
|
return image, ImageParseError{Message: err.Error(), Code: 1}
|
|
}
|
|
return image, nil
|
|
}
|
|
}
|
|
return "", ImageParseError{Message: "No cover found", Code: 1}
|
|
}
|