diff --git a/cli/root.go b/cli/root.go index 55a27a8..9e07056 100644 --- a/cli/root.go +++ b/cli/root.go @@ -40,7 +40,7 @@ var cli = &cobra.Command{ fmt.Println(comic.Title) err := comic.Download(len(comic.Filelist)) - for e := range err { + for _, e := range err { fmt.Println(e) } diff --git a/comic/archive.go b/comic/archive.go index 98d6f69..0ce5f48 100644 --- a/comic/archive.go +++ b/comic/archive.go @@ -45,7 +45,7 @@ func (c *Comic) Archive() error { sourcePath := filepath.Join(c.LibraryPath, c.Title) err = filepath.Walk( - filepath.Dir(sourcePath), + sourcePath, func(path string, info os.FileInfo, err error) error { if err != nil { return ArchiveError{ diff --git a/comic/comic.go b/comic/comic.go index 16fa617..c45fa0a 100644 --- a/comic/comic.go +++ b/comic/comic.go @@ -1,6 +1,7 @@ package comic import ( + "net/http" "path/filepath" "regexp" "strings" @@ -18,6 +19,7 @@ type Comic struct { Next *Comic Prev *Comic LibraryPath string + Client *http.Client } // extractTitleFromMarkup extracts the title from the comic's markup. 
@@ -93,18 +95,22 @@ func NewComic( } if strings.Contains(url, "batcave.biz") { - go BatcaveBizMarkup(url, markupChannel) - } else { - go Markup(url, markupChannel) - } - - markup := <-markupChannel - c.Markup = markup - c.Title = extractTitleFromMarkup(*c) - - if strings.Contains(url, "batcave.biz") { + clientChan := make(chan *http.Client, 1) + go BatcaveBizMarkup(url, markupChannel, clientChan) + markup := <-markupChannel + c.Markup = markup + c.Client = <-clientChan + if t := ParseBatcaveBizTitle(markup, url); t != "" { + c.Title = t + } else { + c.Title = extractTitleFromMarkup(*c) + } go ParseBatcaveBizImageLinks(markup, imageChannel) } else { + go Markup(url, markupChannel) + markup := <-markupChannel + c.Markup = markup + c.Title = extractTitleFromMarkup(*c) go ParseImageLinks(markup, imageChannel) } links := <-imageChannel diff --git a/comic/download.go b/comic/download.go index 8fe8cb7..f0923f7 100644 --- a/comic/download.go +++ b/comic/download.go @@ -6,6 +6,7 @@ import ( "net/http" "os" "path/filepath" + "strings" "time" cloudflarebp "github.com/DaRealFreak/cloudflare-bp-go" @@ -39,13 +40,33 @@ func downloadFile(url string, page int, c *Comic) error { } } - res, err := handleRequest(url) + var res *http.Response + var err error + if c.Client != nil { + req, reqErr := http.NewRequest("GET", url, nil) + if reqErr != nil { + return ComicDownloadError{Message: "invalid request", Code: 1} + } + req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") + if strings.Contains(url, "batcave.biz") { + req.Header.Set("Referer", "https://batcave.biz/") + } + res, err = c.Client.Do(req) + } else { + res, err = handleRequest(url) + } if err != nil { return ComicDownloadError{ Message: "invalid request", Code: 1, } } defer res.Body.Close() + if res.StatusCode != http.StatusOK { + return ComicDownloadError{ + Message: "bad response", + Code: 1, + } + } imageFile, err :=
os.Create(imageFilepath) diff --git a/comic/parser.go b/comic/parser.go index 32b1058..d64a405 100644 --- a/comic/parser.go +++ b/comic/parser.go @@ -7,6 +7,7 @@ import ( "net/url" "regexp" "strings" + "time" "github.com/PuerkitoBio/goquery" ) @@ -50,10 +51,21 @@ func Markup(url string, c chan *goquery.Document) *goquery.Document { return markup } -func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Document { +func BatcaveBizMarkup(referer string, c chan *goquery.Document, clientChan chan *http.Client) *goquery.Document { + sendErr := func() *goquery.Document { + if c != nil { + c <- &goquery.Document{} + } + if clientChan != nil { + clientChan <- nil + } + return &goquery.Document{} + } + jar, _ := cookiejar.New(nil) client := &http.Client{ - Jar: jar, + Jar: jar, + Timeout: time.Second * 30, CheckRedirect: func(req *http.Request, via []*http.Request) error { return nil }, @@ -68,10 +80,7 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen // GET the challange page to obtain cookies and any necessary tokens req, err := http.NewRequest("GET", referer, nil) if err != nil { - if c != nil { - c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } for k, v := range headers { req.Header.Set(k, v) @@ -79,19 +88,13 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen res, err := client.Do(req) if err != nil { - if c != nil { - c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } defer res.Body.Close() body, err := io.ReadAll(res.Body) if err != nil { - if c != nil { - c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } tokenRegex := regexp.MustCompile(`token:\s*"([^"]+)"`) @@ -101,14 +104,14 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen // no challenge, parse directly doc, err := goquery.NewDocumentFromReader(strings.NewReader(string(body))) if err != nil { - if c != nil { - 
c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } if c != nil { c <- doc } + if clientChan != nil { + clientChan <- client + } return doc } @@ -132,10 +135,7 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen postReq, err := http.NewRequest("POST", "https://batcave.biz/_v", strings.NewReader(params.Encode())) if err != nil { - if c != nil { - c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } for k, v := range headers { postReq.Header.Set(k, v) @@ -145,10 +145,7 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen postRes, err := client.Do(postReq) if err != nil { - if c != nil { - c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } defer postRes.Body.Close() io.ReadAll(postRes.Body) @@ -156,10 +153,7 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen // GET the real page with the set cookie realReq, err := http.NewRequest("GET", referer, nil) if err != nil { - if c != nil { - c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } for k, v := range headers { realReq.Header.Set(k, v) @@ -167,23 +161,20 @@ func BatcaveBizMarkup(referer string, c chan *goquery.Document) *goquery.Documen realRes, err := client.Do(realReq) if err != nil { - if c != nil { - c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } defer realRes.Body.Close() doc, err := goquery.NewDocumentFromReader(realRes.Body) if err != nil { - if c != nil { - c <- &goquery.Document{} - } - return &goquery.Document{} + return sendErr() } if c != nil { c <- doc } + if clientChan != nil { + clientChan <- client + } return doc } @@ -228,6 +219,34 @@ func ParseReadAllComicsLinks(markup *goquery.Document, c chan []string) ([]strin return links, ImageParseError{Message: "No images found", Code: 1} } +// ParseBatcaveBizTitle extracts the chapter title from the __DATA__.chapters array +// 
by matching the chapter id to the last path segment of the provided URL. +func ParseBatcaveBizTitle(markup *goquery.Document, chapterURL string) string { + slug := strings.TrimRight(chapterURL, "/") + if i := strings.LastIndex(slug, "/"); i >= 0 { + slug = slug[i+1:] + } + + var title string + markup.Find("script").Each(func(_ int, s *goquery.Selection) { + if title != "" { + return + } + text := s.Text() + if !strings.Contains(text, "__DATA__") { + return + } + chapterRegex := regexp.MustCompile(`"id"\s*:\s*` + regexp.QuoteMeta(slug) + `[^}]*?"title"\s*:\s*"([^"]+)"`) + m := chapterRegex.FindStringSubmatch(text) + if len(m) >= 2 { + title = strings.ReplaceAll(m[1], `\/`, "/") + title = strings.ReplaceAll(title, "Issue #", "") + title = strings.ReplaceAll(title, "#", "") + } + }) + return title +} + // ParseBatcaveBizImageLinks extracts image URLs from the __DATA__.images JavaScript // variable embedded in a batcave.biz page. func ParseBatcaveBizImageLinks(markup *goquery.Document, c chan []string) ([]string, error) { @@ -248,7 +267,7 @@ func ParseBatcaveBizImageLinks(markup *goquery.Document, c chan []string) ([]str urlRegex := regexp.MustCompile(`"([^"]+)"`) for _, m := range urlRegex.FindAllStringSubmatch(arrayMatch[1], -1) { if len(m) >= 2 { - links = append(links, m[1]) + links = append(links, strings.ReplaceAll(m[1], `\/`, "/")) } } }) diff --git a/comic/parser_test.go b/comic/parser_test.go index 9502639..2e08c29 100644 --- a/comic/parser_test.go +++ b/comic/parser_test.go @@ -24,6 +24,15 @@ func TestParseBatcaveBizImageLinks(t *testing.T) { expectErr: false, expectURLs: []string{"https://cdn.batcave.biz/img/001.jpg", "https://cdn.batcave.biz/img/002.jpg"}, }, + { + name: "unescapes forward slashes in URLs", + html: `
<html><head><script>window.__DATA__ = {"images": ["https:\/\/cdn.batcave.biz\/img\/001.jpg"]};</script></head></html>`, + expectCount: 1, + expectErr: false, + expectURLs: []string{"https://cdn.batcave.biz/img/001.jpg"}, + }, { name: "extracts images with spaces around colon and bracket", html: `