## What changed - `BatcaveBizMarkup` now accepts a `clientChan chan *http.Client` and sends the authenticated cookie jar client back to the caller after completing the Cloudflare challenge flow. All error paths send nil so the caller never blocks. - `Comic` struct gains a `Client *http.Client` field. `NewComic` wires up the channel, receives the client, and stores it so downstream code can reuse the same authenticated session. - `downloadFile` branches on `c.Client`: when set it builds the request manually and only attaches a `Referer: https://batcave.biz/` header when the image URL is actually on batcave.biz. Some issues host images on third-party CDNs (e.g. readcomicsonline.ru) that actively block requests with a batcave Referer, returning 403 — omitting the header fixes those. - `ParseBatcaveBizTitle` extracts the chapter title from the `__DATA__.chapters` JSON array by matching the chapter ID in the URL's last path segment. The HTML `<title>` on batcave.biz is prefixed with "Read " and suffixed with "comics online for free", making it unsuitable as a filename. Using the chapter data gives clean titles like "Nightwing (1996) 153". "Issue #" and bare "#" are stripped since the hash character causes problems on some filesystems and tools. - `ParseBatcaveBizImageLinks` now unescapes `\/` → `/` in extracted URLs. The `__DATA__` JSON often contains forward-slash-escaped URLs that would otherwise be stored verbatim. - `archive.go`: `filepath.Walk` was called on `filepath.Dir(sourcePath)` (the library root) instead of `sourcePath` (the comic's own folder). This caused any leftover image files from previous downloads in sibling directories to be included in every new CBZ. Fixed by walking `sourcePath` directly. - `BatcaveBizMarkup` client now has a 30s `Timeout`. Without it, a single stalled CDN connection would hang the worker goroutine indefinitely, causing `Download()` to block forever waiting for a result that never arrives. 
- Fixed `for e := range err` in `cli/root.go` — ranging over `[]error` with one variable yields the index, not the error value.
283 lines
6.9 KiB
Go
283 lines
6.9 KiB
Go
package comic
|
|
|
|
import (
|
|
"io"
|
|
"net/http"
|
|
"net/http/cookiejar"
|
|
"net/url"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// ImageParseError reports a failure to extract image links from a page.
// It satisfies the error interface via the Error method.
type ImageParseError struct {
	// Message is the human-readable description returned by Error.
	Message string
	// Code is a numeric identifier for the failure kind (1 = no images found).
	Code int
}
|
|
|
|
// Error returns the error message, satisfying the error interface.
func (i ImageParseError) Error() string {
	return i.Message
}
|
|
|
|
// Markup retrieves the HTML content from a given URL and returns a goquery Document.
|
|
//
|
|
// url is the URL to retrieve the HTML content from.
|
|
// c is a channel for sending the retrieved goquery Document.
|
|
// Returns the retrieved goquery Document.
|
|
func Markup(url string, c chan *goquery.Document) *goquery.Document {
|
|
res, err := http.Get(url)
|
|
if err != nil {
|
|
return &goquery.Document{}
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.StatusCode != http.StatusOK {
|
|
return &goquery.Document{}
|
|
}
|
|
|
|
content, err := io.ReadAll(res.Body)
|
|
if err != nil {
|
|
return &goquery.Document{}
|
|
}
|
|
|
|
markup, err := goquery.NewDocumentFromReader(strings.NewReader(string(content)))
|
|
if err != nil {
|
|
return &goquery.Document{}
|
|
}
|
|
|
|
c <- markup
|
|
return markup
|
|
}
|
|
|
|
func BatcaveBizMarkup(referer string, c chan *goquery.Document, clientChan chan *http.Client) *goquery.Document {
|
|
sendErr := func() *goquery.Document {
|
|
if c != nil {
|
|
c <- &goquery.Document{}
|
|
}
|
|
if clientChan != nil {
|
|
clientChan <- nil
|
|
}
|
|
return &goquery.Document{}
|
|
}
|
|
|
|
jar, _ := cookiejar.New(nil)
|
|
client := &http.Client{
|
|
Jar: jar,
|
|
Timeout: time.Second * 30,
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
return nil
|
|
},
|
|
}
|
|
|
|
headers := map[string]string{
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
|
|
}
|
|
|
|
// GET the challange page to obtain cookies and any necessary tokens
|
|
req, err := http.NewRequest("GET", referer, nil)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
for k, v := range headers {
|
|
req.Header.Set(k, v)
|
|
}
|
|
|
|
res, err := client.Do(req)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
body, err := io.ReadAll(res.Body)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
|
|
tokenRegex := regexp.MustCompile(`token:\s*"([^"]+)"`)
|
|
matches := tokenRegex.FindSubmatch(body)
|
|
|
|
if matches == nil {
|
|
// no challenge, parse directly
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
if c != nil {
|
|
c <- doc
|
|
}
|
|
if clientChan != nil {
|
|
clientChan <- client
|
|
}
|
|
return doc
|
|
}
|
|
|
|
encodedToken := string(matches[1])
|
|
token, err := url.QueryUnescape(encodedToken)
|
|
if err != nil {
|
|
token = encodedToken
|
|
}
|
|
|
|
// Step 3: POST to /_v with fake browser metrics
|
|
params := url.Values{}
|
|
params.Set("token", token)
|
|
params.Set("mode", "modern")
|
|
params.Set("workTime", "462")
|
|
params.Set("iterations", "183")
|
|
params.Set("webdriver", "0")
|
|
params.Set("touch", "0")
|
|
params.Set("screen_w", "1920")
|
|
params.Set("screen_h", "1080")
|
|
params.Set("screen_cd", "24")
|
|
|
|
postReq, err := http.NewRequest("POST", "https://batcave.biz/_v", strings.NewReader(params.Encode()))
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
for k, v := range headers {
|
|
postReq.Header.Set(k, v)
|
|
}
|
|
postReq.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
|
postReq.Header.Set("Referer", referer)
|
|
|
|
postRes, err := client.Do(postReq)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
defer postRes.Body.Close()
|
|
io.ReadAll(postRes.Body)
|
|
|
|
// GET the real page with the set cookie
|
|
realReq, err := http.NewRequest("GET", referer, nil)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
for k, v := range headers {
|
|
realReq.Header.Set(k, v)
|
|
}
|
|
|
|
realRes, err := client.Do(realReq)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
defer realRes.Body.Close()
|
|
|
|
doc, err := goquery.NewDocumentFromReader(realRes.Body)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
if c != nil {
|
|
c <- doc
|
|
}
|
|
if clientChan != nil {
|
|
clientChan <- client
|
|
}
|
|
return doc
|
|
}
|
|
|
|
// ParseImageLinks parses a goquery document to extract image links.
|
|
//
|
|
// markup is the goquery document to parse for image links.
|
|
// c is a channel for sending the extracted image links.
|
|
// Returns a slice of image links and an error if no images are found.
|
|
func ParseImageLinks(markup *goquery.Document, c chan []string) ([]string, error) {
|
|
var links []string
|
|
markup.Find("img").Each(func(_ int, image *goquery.Selection) {
|
|
link, _ := image.Attr("src")
|
|
if !strings.Contains(link, "logo") && (strings.Contains(link, "bp.blogspot.com") || strings.Contains(link, "blogger.googleusercontent") || strings.Contains(link, "covers")) {
|
|
links = append(links, link)
|
|
}
|
|
})
|
|
|
|
c <- links
|
|
|
|
if len(links) > 0 {
|
|
return links, nil
|
|
}
|
|
|
|
return links, ImageParseError{Message: "No images found", Code: 1}
|
|
}
|
|
|
|
func ParseReadAllComicsLinks(markup *goquery.Document, c chan []string) ([]string, error) {
|
|
var links []string
|
|
markup.Find("img").Each(func(_ int, image *goquery.Selection) {
|
|
link, _ := image.Attr("src")
|
|
if !strings.Contains(link, "logo") && (strings.Contains(link, "bp.blogspot.com") || strings.Contains(link, "blogger.googleusercontent") || strings.Contains(link, "covers")) {
|
|
links = append(links, link)
|
|
}
|
|
})
|
|
|
|
c <- links
|
|
|
|
if len(links) > 0 {
|
|
return links, nil
|
|
}
|
|
|
|
return links, ImageParseError{Message: "No images found", Code: 1}
|
|
}
|
|
|
|
// ParseBatcaveBizTitle extracts the chapter title from the __DATA__.chapters array
|
|
// by matching the chapter id to the last path segment of the provided URL.
|
|
func ParseBatcaveBizTitle(markup *goquery.Document, chapterURL string) string {
|
|
slug := strings.TrimRight(chapterURL, "/")
|
|
if i := strings.LastIndex(slug, "/"); i >= 0 {
|
|
slug = slug[i+1:]
|
|
}
|
|
|
|
var title string
|
|
markup.Find("script").Each(func(_ int, s *goquery.Selection) {
|
|
if title != "" {
|
|
return
|
|
}
|
|
text := s.Text()
|
|
if !strings.Contains(text, "__DATA__") {
|
|
return
|
|
}
|
|
chapterRegex := regexp.MustCompile(`"id"\s*:\s*` + regexp.QuoteMeta(slug) + `[^}]*?"title"\s*:\s*"([^"]+)"`)
|
|
m := chapterRegex.FindStringSubmatch(text)
|
|
if len(m) >= 2 {
|
|
title = strings.ReplaceAll(m[1], `\/`, "/")
|
|
title = strings.ReplaceAll(title, "Issue #", "")
|
|
title = strings.ReplaceAll(title, "#", "")
|
|
}
|
|
})
|
|
return title
|
|
}
|
|
|
|
// ParseBatcaveBizImageLinks extracts image URLs from the __DATA__.images JavaScript
|
|
// variable embedded in a batcave.biz page.
|
|
func ParseBatcaveBizImageLinks(markup *goquery.Document, c chan []string) ([]string, error) {
|
|
var links []string
|
|
|
|
markup.Find("script").Each(func(_ int, s *goquery.Selection) {
|
|
text := s.Text()
|
|
if !strings.Contains(text, "__DATA__") {
|
|
return
|
|
}
|
|
|
|
arrayRegex := regexp.MustCompile(`"images"\s*:\s*\[([^\]]+)\]`)
|
|
arrayMatch := arrayRegex.FindStringSubmatch(text)
|
|
if len(arrayMatch) < 2 {
|
|
return
|
|
}
|
|
|
|
urlRegex := regexp.MustCompile(`"([^"]+)"`)
|
|
for _, m := range urlRegex.FindAllStringSubmatch(arrayMatch[1], -1) {
|
|
if len(m) >= 2 {
|
|
links = append(links, strings.ReplaceAll(m[1], `\/`, "/"))
|
|
}
|
|
}
|
|
})
|
|
|
|
c <- links
|
|
|
|
if len(links) > 0 {
|
|
return links, nil
|
|
}
|
|
|
|
return links, ImageParseError{Message: "No images found", Code: 1}
|
|
}
|