- Add delete button (SVG X, hover-reveal) and confirmation modal to comic cards
- Add DELETE /api/comics/delete endpoint with path traversal protection
- Fix container downloads: delegate Cloudflare-blocked requests to FlareSolverr (headless Chrome sidecar) instead of retrying with the Go HTTP client, whose Linux TCP fingerprint is flagged by Cloudflare even with network_mode: host
- Add FlareSolverr service to docker-compose; inject FLARESOLVERR_URL env var
- Add diagnostic logging to BatcaveBizMarkup request flow
- Trim URL whitespace before storing in download job
- Guard Archive() against empty filelist; fix runJob error-check ordering
371 lines
9.5 KiB
Go
371 lines
9.5 KiB
Go
package comic
|
|
|
|
import (
|
|
"bytes"
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"log"
|
|
"net/http"
|
|
"net/http/cookiejar"
|
|
"net/url"
|
|
"os"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// ImageParseError reports that image links could not be extracted from a
// parsed page.
type ImageParseError struct {
	// Message is the human-readable description returned by Error.
	Message string
	// Code is a numeric identifier for the failure. The parsers in this file
	// use 1 for "No images found".
	Code int
}

// Error implements the error interface by returning Message.
func (e ImageParseError) Error() string {
	return e.Message
}
|
|
|
|
// Markup retrieves the HTML content from a given URL and returns a goquery Document.
|
|
//
|
|
// url is the URL to retrieve the HTML content from.
|
|
// c is a channel for sending the retrieved goquery Document.
|
|
// Returns the retrieved goquery Document.
|
|
func Markup(url string, c chan *goquery.Document) *goquery.Document {
|
|
res, err := http.Get(url)
|
|
if err != nil {
|
|
return &goquery.Document{}
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
if res.StatusCode != http.StatusOK {
|
|
return &goquery.Document{}
|
|
}
|
|
|
|
content, err := io.ReadAll(res.Body)
|
|
if err != nil {
|
|
return &goquery.Document{}
|
|
}
|
|
|
|
markup, err := goquery.NewDocumentFromReader(strings.NewReader(string(content)))
|
|
if err != nil {
|
|
return &goquery.Document{}
|
|
}
|
|
|
|
c <- markup
|
|
return markup
|
|
}
|
|
|
|
// fetchViaFlareSolverr fetches a URL through FlareSolverr (headless Chrome),
|
|
// returning the final page HTML as a Document. Cookies from the browser session
|
|
// are written into jar for use in subsequent requests (e.g. image downloads).
|
|
func fetchViaFlareSolverr(targetURL string, jar *cookiejar.Jar) (*goquery.Document, error) {
|
|
fsURL := os.Getenv("FLARESOLVERR_URL")
|
|
if fsURL == "" {
|
|
return nil, fmt.Errorf("FLARESOLVERR_URL not set")
|
|
}
|
|
|
|
payload, _ := json.Marshal(map[string]interface{}{
|
|
"cmd": "request.get",
|
|
"url": targetURL,
|
|
"maxTimeout": 60000,
|
|
})
|
|
|
|
resp, err := http.Post(fsURL+"/v1", "application/json", bytes.NewReader(payload))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
var result struct {
|
|
Status string `json:"status"`
|
|
Solution struct {
|
|
Response string `json:"response"`
|
|
Cookies []struct {
|
|
Name string `json:"name"`
|
|
Value string `json:"value"`
|
|
Domain string `json:"domain"`
|
|
Path string `json:"path"`
|
|
Secure bool `json:"secure"`
|
|
} `json:"cookies"`
|
|
} `json:"solution"`
|
|
}
|
|
if err := json.NewDecoder(resp.Body).Decode(&result); err != nil {
|
|
return nil, err
|
|
}
|
|
if result.Status != "ok" {
|
|
return nil, fmt.Errorf("flaresolverr: %s", result.Status)
|
|
}
|
|
|
|
parsed, _ := url.Parse(targetURL)
|
|
var cookies []*http.Cookie
|
|
for _, c := range result.Solution.Cookies {
|
|
cookies = append(cookies, &http.Cookie{
|
|
Name: c.Name,
|
|
Value: c.Value,
|
|
Domain: c.Domain,
|
|
Path: c.Path,
|
|
Secure: c.Secure,
|
|
})
|
|
}
|
|
jar.SetCookies(parsed, cookies)
|
|
|
|
return goquery.NewDocumentFromReader(strings.NewReader(result.Solution.Response))
|
|
}
|
|
|
|
func BatcaveBizMarkup(referer string, c chan *goquery.Document, clientChan chan *http.Client) *goquery.Document {
|
|
sendErr := func() *goquery.Document {
|
|
if c != nil {
|
|
c <- &goquery.Document{}
|
|
}
|
|
if clientChan != nil {
|
|
clientChan <- nil
|
|
}
|
|
return &goquery.Document{}
|
|
}
|
|
|
|
jar, _ := cookiejar.New(nil)
|
|
client := &http.Client{
|
|
Jar: jar,
|
|
Timeout: time.Second * 30,
|
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
|
return nil
|
|
},
|
|
}
|
|
|
|
headers := map[string]string{
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
|
|
"Accept-Language": "en-US,en;q=0.9",
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
|
|
}
|
|
|
|
// GET the challenge page to obtain cookies and any necessary tokens
|
|
req, err := http.NewRequest("GET", referer, nil)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
for k, v := range headers {
|
|
req.Header.Set(k, v)
|
|
}
|
|
|
|
res, err := client.Do(req)
|
|
if err != nil {
|
|
log.Printf("[batcave] initial GET failed: %v", err)
|
|
return sendErr()
|
|
}
|
|
log.Printf("[batcave] initial GET status: %d", res.StatusCode)
|
|
|
|
// Cloudflare challenge — use FlareSolverr (headless Chrome) to fetch the
|
|
// full page and solve any JS challenges. cf_clearance is stored in jar for
|
|
// subsequent image downloads.
|
|
if res.StatusCode == 403 || res.StatusCode == 503 {
|
|
res.Body.Close()
|
|
log.Printf("[batcave] Cloudflare challenge detected, fetching via FlareSolverr")
|
|
doc, err := fetchViaFlareSolverr(referer, jar)
|
|
if err != nil {
|
|
log.Printf("[batcave] FlareSolverr failed: %v", err)
|
|
return sendErr()
|
|
}
|
|
if c != nil {
|
|
c <- doc
|
|
}
|
|
if clientChan != nil {
|
|
clientChan <- client
|
|
}
|
|
return doc
|
|
}
|
|
defer res.Body.Close()
|
|
|
|
body, err := io.ReadAll(res.Body)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
|
|
tokenRegex := regexp.MustCompile(`token:\s*"([^"]+)"`)
|
|
matches := tokenRegex.FindSubmatch(body)
|
|
|
|
if matches == nil {
|
|
// no challenge, parse directly
|
|
doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body)))
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
if c != nil {
|
|
c <- doc
|
|
}
|
|
if clientChan != nil {
|
|
clientChan <- client
|
|
}
|
|
return doc
|
|
}
|
|
|
|
encodedToken := string(matches[1])
|
|
token, err := url.QueryUnescape(encodedToken)
|
|
if err != nil {
|
|
token = encodedToken
|
|
}
|
|
|
|
// POST to /_v with fake browser metrics
|
|
params := url.Values{}
|
|
params.Set("token", token)
|
|
params.Set("mode", "modern")
|
|
params.Set("workTime", "462")
|
|
params.Set("iterations", "183")
|
|
params.Set("webdriver", "0")
|
|
params.Set("touch", "0")
|
|
params.Set("screen_w", "1920")
|
|
params.Set("screen_h", "1080")
|
|
params.Set("screen_cd", "24")
|
|
|
|
postReq, err := http.NewRequest("POST", "https://batcave.biz/_v", strings.NewReader(params.Encode()))
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
for k, v := range headers {
|
|
postReq.Header.Set(k, v)
|
|
}
|
|
postReq.Header.Set("Content-Type", "application/x-www-form-urlencoded")
|
|
postReq.Header.Set("Referer", referer)
|
|
|
|
postRes, err := client.Do(postReq)
|
|
if err != nil {
|
|
log.Printf("[batcave] POST to /_v failed: %v", err)
|
|
return sendErr()
|
|
}
|
|
defer postRes.Body.Close()
|
|
log.Printf("[batcave] POST to /_v status: %d", postRes.StatusCode)
|
|
io.ReadAll(postRes.Body)
|
|
|
|
// GET the real page with the set cookie
|
|
realReq, err := http.NewRequest("GET", referer, nil)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
for k, v := range headers {
|
|
realReq.Header.Set(k, v)
|
|
}
|
|
|
|
realRes, err := client.Do(realReq)
|
|
if err != nil {
|
|
log.Printf("[batcave] final GET failed: %v", err)
|
|
return sendErr()
|
|
}
|
|
log.Printf("[batcave] final GET status: %d", realRes.StatusCode)
|
|
defer realRes.Body.Close()
|
|
|
|
doc, err := goquery.NewDocumentFromReader(realRes.Body)
|
|
if err != nil {
|
|
return sendErr()
|
|
}
|
|
if c != nil {
|
|
c <- doc
|
|
}
|
|
if clientChan != nil {
|
|
clientChan <- client
|
|
}
|
|
return doc
|
|
}
|
|
|
|
// ParseImageLinks parses a goquery document to extract image links.
|
|
//
|
|
// markup is the goquery document to parse for image links.
|
|
// c is a channel for sending the extracted image links.
|
|
// Returns a slice of image links and an error if no images are found.
|
|
func ParseImageLinks(markup *goquery.Document, c chan []string) ([]string, error) {
|
|
var links []string
|
|
markup.Find("img").Each(func(_ int, image *goquery.Selection) {
|
|
link, _ := image.Attr("src")
|
|
if !strings.Contains(link, "logo") && (strings.Contains(link, "bp.blogspot.com") || strings.Contains(link, "blogger.googleusercontent") || strings.Contains(link, "covers")) {
|
|
links = append(links, link)
|
|
}
|
|
})
|
|
|
|
c <- links
|
|
|
|
if len(links) > 0 {
|
|
return links, nil
|
|
}
|
|
|
|
return links, ImageParseError{Message: "No images found", Code: 1}
|
|
}
|
|
|
|
func ParseReadAllComicsLinks(markup *goquery.Document, c chan []string) ([]string, error) {
|
|
var links []string
|
|
markup.Find("img").Each(func(_ int, image *goquery.Selection) {
|
|
link, _ := image.Attr("src")
|
|
if !strings.Contains(link, "logo") && (strings.Contains(link, "bp.blogspot.com") || strings.Contains(link, "blogger.googleusercontent") || strings.Contains(link, "covers")) {
|
|
links = append(links, link)
|
|
}
|
|
})
|
|
|
|
c <- links
|
|
|
|
if len(links) > 0 {
|
|
return links, nil
|
|
}
|
|
|
|
return links, ImageParseError{Message: "No images found", Code: 1}
|
|
}
|
|
|
|
// ParseBatcaveBizTitle extracts the chapter title from the __DATA__.chapters array
|
|
// by matching the chapter id to the last path segment of the provided URL.
|
|
func ParseBatcaveBizTitle(markup *goquery.Document, chapterURL string) string {
|
|
slug := strings.TrimRight(chapterURL, "/")
|
|
if i := strings.LastIndex(slug, "/"); i >= 0 {
|
|
slug = slug[i+1:]
|
|
}
|
|
|
|
var title string
|
|
markup.Find("script").Each(func(_ int, s *goquery.Selection) {
|
|
if title != "" {
|
|
return
|
|
}
|
|
text := s.Text()
|
|
if !strings.Contains(text, "__DATA__") {
|
|
return
|
|
}
|
|
chapterRegex := regexp.MustCompile(`"id"\s*:\s*` + regexp.QuoteMeta(slug) + `[^}]*?"title"\s*:\s*"([^"]+)"`)
|
|
m := chapterRegex.FindStringSubmatch(text)
|
|
if len(m) >= 2 {
|
|
title = strings.ReplaceAll(m[1], `\/`, "/")
|
|
title = strings.ReplaceAll(title, "Issue #", "")
|
|
title = strings.ReplaceAll(title, "#", "")
|
|
}
|
|
})
|
|
return title
|
|
}
|
|
|
|
// ParseBatcaveBizImageLinks extracts image URLs from the __DATA__.images JavaScript
|
|
// variable embedded in a batcave.biz page.
|
|
func ParseBatcaveBizImageLinks(markup *goquery.Document, c chan []string) ([]string, error) {
|
|
var links []string
|
|
|
|
markup.Find("script").Each(func(_ int, s *goquery.Selection) {
|
|
text := s.Text()
|
|
if !strings.Contains(text, "__DATA__") {
|
|
return
|
|
}
|
|
|
|
arrayRegex := regexp.MustCompile(`"images"\s*:\s*\[([^\]]+)\]`)
|
|
arrayMatch := arrayRegex.FindStringSubmatch(text)
|
|
if len(arrayMatch) < 2 {
|
|
return
|
|
}
|
|
|
|
urlRegex := regexp.MustCompile(`"([^"]+)"`)
|
|
for _, m := range urlRegex.FindAllStringSubmatch(arrayMatch[1], -1) {
|
|
if len(m) >= 2 {
|
|
links = append(links, strings.ReplaceAll(m[1], `\/`, "/"))
|
|
}
|
|
}
|
|
})
|
|
|
|
c <- links
|
|
|
|
if len(links) > 0 {
|
|
return links, nil
|
|
}
|
|
|
|
return links, ImageParseError{Message: "No images found", Code: 1}
|
|
}
|