yoink-go/comic/comic.go

package comic

import (
	"path/filepath"
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/PuerkitoBio/goquery"
)

// var debugUrl = "https://readallcomics.com/ultraman-x-avengers-001-2024/"

// Comic holds everything known about a single comic: where it came from,
// its parsed page, the images that make it up, and where it is stored.
type Comic struct {
	// URL is the page address the comic was created from.
	URL string
	// Title is the display title derived from the markup (or the URL slug).
	Title string
	// Markup is the parsed HTML document received for URL.
	Markup *goquery.Document
	// Filelist is the list of image links parsed from Markup; Cover scans it
	// for the cover image.
	Filelist []string
	// Next and Prev are not set anywhere in this file.
	// NOTE(review): presumably links to adjacent issues — confirm against callers.
	Next *Comic
	Prev *Comic
	// LibraryPath is the path to the local library where the comic is stored.
	LibraryPath string
}

// titleYearPattern captures the text that precedes a parenthesised four-digit
// year, e.g. "Name 001 (2024)" or "Name (2024 Annual)". It is compiled once at
// package scope instead of on every call.
var titleYearPattern = regexp.MustCompile(`^(.*?)\s+\(\d{4}(?:\s+.+)?\)`)

// extractTitleFromMarkup derives a display title for the comic from its markup.
//
// The text of the first <title> element is matched against titleYearPattern and
// the part before the "(YYYY ...)" group, with colons removed, becomes the
// title. When that title starts with "#" (i.e. the series name is missing), the
// first <h1> is tried the same way, then the URL slug via titleFromSlug.
//
// c is the Comic whose Markup and URL are inspected. A nil Markup does not
// panic: the URL slug is used instead.
//
// Returns the extracted title, or "Untitled" when nothing usable is found.
func extractTitleFromMarkup(c Comic) string {
	// Without markup the only remaining source for a title is the URL itself.
	if c.Markup == nil {
		if slug := titleFromSlug(c.URL); slug != "" {
			return slug
		}
		return "Untitled"
	}

	extractFrom := func(text string) string {
		// Element text frequently carries surrounding newlines/indentation,
		// which would defeat the ^-anchored pattern ('.' does not match '\n').
		matches := titleYearPattern.FindStringSubmatch(strings.TrimSpace(text))
		if len(matches) != 2 {
			return ""
		}
		return strings.ReplaceAll(matches[1], ":", "")
	}

	title := extractFrom(c.Markup.Find("title").First().Text())

	// A leading "#" means only the issue number was captured; look for a
	// better candidate before settling for it.
	if strings.HasPrefix(title, "#") {
		if h1 := extractFrom(c.Markup.Find("h1").First().Text()); h1 != "" && !strings.HasPrefix(h1, "#") {
			return h1
		}
		if slug := titleFromSlug(c.URL); slug != "" {
			return slug
		}
	}

	if title != "" {
		return title
	}

	return "Untitled"
}

// slugYearSuffix matches a trailing "-YYYY" year on a URL slug. It is compiled
// once at package scope instead of on every call.
var slugYearSuffix = regexp.MustCompile(`-\d{4}$`)

// titleFromSlug derives a comic title from the last path segment of a URL.
//
// Any query string or fragment is dropped first, then trailing slashes. The
// last path segment has a trailing year (-YYYY) stripped, hyphens replaced by
// spaces, and the first letter of every word upper-cased.
//
// Returns "" when no slug remains after stripping.
func titleFromSlug(url string) string {
	// "?query" and "#fragment" are not part of the path; without this a URL
	// such as ".../name-2024/?page=2" would yield "?page=2" as the slug.
	if i := strings.IndexAny(url, "?#"); i >= 0 {
		url = url[:i]
	}

	slug := strings.TrimRight(url, "/")
	if i := strings.LastIndex(slug, "/"); i >= 0 {
		slug = slug[i+1:]
	}
	slug = slugYearSuffix.ReplaceAllString(slug, "")
	if slug == "" {
		return ""
	}

	words := strings.Split(slug, "-")
	for i, w := range words {
		// Decode a whole rune: slicing w[:1] would cut a multi-byte first
		// character in half and corrupt it when upper-casing.
		r, size := utf8.DecodeRuneInString(w)
		if size == 0 || (r == utf8.RuneError && size == 1) {
			// Empty word or invalid UTF-8: leave it untouched.
			continue
		}
		words[i] = string(unicode.ToUpper(r)) + w[size:]
	}
	return strings.Join(words, " ")
}

// NewComic builds a Comic for the given URL and library path.
//
// url is the address of the comic page; URLs containing "batcave.biz" are
// handled by the Batcave-specific fetcher and image parser, everything else by
// the generic ones.
// libraryPath is stored on the Comic as its LibraryPath.
// imageChannel is the channel the image-link parser delivers its result on.
// markupChannel is the channel the markup fetcher delivers the document on.
//
// The call blocks until one value has been received from each channel.
// Returns a pointer to the populated Comic.
func NewComic(
	url string, libraryPath string,
	imageChannel chan []string,
	markupChannel chan *goquery.Document,
) *Comic {
	isBatcave := strings.Contains(url, "batcave.biz")

	// Step 1: fetch the page markup in the background and wait for it.
	if isBatcave {
		go BatcaveBizMarkup(url, markupChannel)
	} else {
		go Markup(url, markupChannel)
	}
	doc := <-markupChannel

	comic := &Comic{
		URL:         url,
		LibraryPath: libraryPath,
		Markup:      doc,
	}
	comic.Title = extractTitleFromMarkup(*comic)

	// Step 2: extract the image links from the markup and wait for them.
	if isBatcave {
		go ParseBatcaveBizImageLinks(doc, imageChannel)
	} else {
		go ParseImageLinks(doc, imageChannel)
	}
	comic.Filelist = <-imageChannel

	return comic
}

// Cover returns the absolute filepath of the comic's cover image.
//
// The cover is the first entry in Filelist whose name ends in "000.jpg" or
// "001.jpg". That entry is converted to an absolute path with filepath.Abs.
//
// Returns an ImageParseError (Code 1) when the path cannot be made absolute or
// when no entry in Filelist matches.
func (c *Comic) Cover() (imageFilepath string, err error) {
	looksLikeCover := func(name string) bool {
		return strings.HasSuffix(name, "000.jpg") || strings.HasSuffix(name, "001.jpg")
	}

	for idx := range c.Filelist {
		candidate := c.Filelist[idx]
		if !looksLikeCover(candidate) {
			continue
		}

		// Only the first matching entry is considered, whether or not it resolves.
		absPath, absErr := filepath.Abs(candidate)
		if absErr != nil {
			return absPath, ImageParseError{Message: absErr.Error(), Code: 1}
		}
		return absPath, nil
	}

	return "", ImageParseError{Message: "No cover found", Code: 1}
}