From 91ff7e0b807519daf1f55ac874a70220f042f037 Mon Sep 17 00:00:00 2001 From: Bryan Bailey Date: Mon, 14 Mar 2022 23:07:50 -0400 Subject: [PATCH] Error handling for http errors; added gitlab-ci --- .gitlab-ci.yml | 8 ++++++++ README.md | 2 +- results.xml | 1 + yoink/cli.py | 6 +++++- yoink/comic.py | 34 +++++++++++++++++++++------------- yoink/common.py | 4 ++-- yoink/scraper.py | 19 ++++++++++--------- yoink/tests/test_basic.py | 14 +++++++++++++- 8 files changed, 61 insertions(+), 27 deletions(-) create mode 100644 .gitlab-ci.yml create mode 100644 results.xml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..69cdc8e --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,8 @@ +ruby: + stage: test + script: + - pytest --junitxml report.xml yoink/tests/test_basic.py + artifacts: + when: always + reports: + junit: report.xml \ No newline at end of file diff --git a/README.md b/README.md index 4428dfa..26292bf 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,4 @@ [![wakatime](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink.svg)](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink) -Yoink! is a multisite media download tool. It scrapes comics from readallcomics.com compressing them into a .cbr archive and grabs magnet links from tpb.party \ No newline at end of file +Yoink! is a multisite media download tool. It scrapes comics from online comic aggragate sites like readallcomics.com compressing them into a .cbr archive and grabs magnet links from tpb.party \ No newline at end of file diff --git a/results.xml b/results.xml new file mode 100644 index 0000000..5181bf7 --- /dev/null +++ b/results.xml @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/yoink/cli.py b/yoink/cli.py index f0a8275..085fbeb 100644 --- a/yoink/cli.py +++ b/yoink/cli.py @@ -40,16 +40,20 @@ def download(url, comic, torrent, path): except ValueError: click.echo(f'{url} is not supported or is not a valid URL') return 1 + click.echo(f'Downloading {comic.title}') comic.archiver.download() + click.echo('Building comic archive') comic.archiver.generate_archive() + click.echo('Cleaning up') comic.archiver.cleanup_worktree() + click.echo('Success') if torrent: - click.echo('Downloading a torrent') + click.echo('Opps! It looks like Torrents aren\'t yet fully supported.') diff --git a/yoink/comic.py b/yoink/comic.py index 0323e68..61ed807 100644 --- a/yoink/comic.py +++ b/yoink/comic.py @@ -1,5 +1,3 @@ -from click import format_filename -from soupsieve import select from yoink.common import required_comic_files, skippable_images, library_path from yoink.scraper import Scrapable @@ -14,10 +12,16 @@ class Comic(Scrapable): super().__init__(url) self.archiver = ComicArchiver(self, library=path) + def __is_supported_image(self, image): + return image.endswith('.jpg' or '.jpeg') + def __get_image_src(self, comic): if comic.attrs: - return comic.attrs['src'] + try: + return comic.attrs['src'] + except KeyError: + return comic['data-src'] for image in comic: return image.attrs['src'] @@ -27,7 +31,8 @@ class Comic(Scrapable): 'default': self.soup.find_all('div', class_='separator'), 'no-div': self.soup.find_all('img', attrs={'width': '1000px'}), 'excaliber': self.soup.find_all('img'), - 'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'}) + 'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'}), + 'mangadex': self.soup.find_all('img', attrs={'draggable': 'false'}) } for case in soup.keys(): @@ -39,11 +44,18 @@ class Comic(Scrapable): @property def filelist(self): comics = self.__parse_soup() + for comic in comics: print(comic) return [comic for comic in list(map(self.__get_image_src, comics)) if not comic.endswith(skippable_images)] @property - def title(self): return self.soup.title.string.replace(' | Read All Comics Online For Free', '').replace('…', '').replace('#', '').replace(':', '').strip() + def title(self): + if 'readallcomics' in self.url: + return self.soup.title.string.replace(' | Read All Comics Online For Free', '').replace('…', '').replace('#', '').replace(':', '').strip() + elif 'mangadex' in self.url: + return self.soup.find('meta', property='og:title').attrs['content'].replace(' - Mangadex', '').replace('Read ', '') + else: + return 'Uncategorized' @property def category(self): @@ -75,9 +87,11 @@ class ComicArchiver: print(formatted_file, end='\r') urllib.request.urlretrieve(url, filename=formatted_file) else: - page_number = url.split('/')[-1].split('.')[0].zfill(3) + page_number = str(index).zfill(3) file_extension = url.split('/')[-1].split('.')[1] - urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, f'{self.comic.title}{page_number}.{file_extension}')) + formatted_file = f'{self.comic.title} - {page_number}.{file_extension}' + print(formatted_file, end='\r') + urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, formatted_file)) print() def generate_archive(self, archive_format='.cbr'): @@ -95,10 +109,4 @@ class ComicArchiver: if __name__ == '__main__': comic = Comic('http://www.readallcomics.com/static-season-one-4-2021/') - # # print(comic.filelist) - # # print(len(comic.filelist)) - # archiver = ComicArchiver(comic) - # archiver.download() - # archiver.generate_archive() - # archiver.cleanup_worktree() print(comic.category) \ No newline at end of file diff --git a/yoink/common.py b/yoink/common.py index 508fefe..146b7eb 100644 --- a/yoink/common.py +++ b/yoink/common.py @@ -11,7 +11,7 @@ app_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) config_path = os.path.abspath(os.path.join(os.environ.get('HOME'), '.config/yoink')) library_path = os.path.abspath(os.path.join(os.environ.get('HOME'), 'yoink/library')) required_comic_files = ('.cbr', '.cbz', '000.jpg', '001.jpg') -skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png') +skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png', 'navbar.svg') torrent_concurrent_download_limit = 1 -supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net'] +supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net', 'mangadex.tv'] headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'} diff --git a/yoink/scraper.py b/yoink/scraper.py index 593b435..5d26c92 100644 --- a/yoink/scraper.py +++ b/yoink/scraper.py @@ -1,7 +1,6 @@ import requests from bs4 import BeautifulSoup - from yoink.common import supported_sites @@ -12,17 +11,19 @@ class Scrapable: self.__check_site_support() - # for link in supported_sites: - # if link in self.url: - # return - # else: - # raise ValueError('Unsupported site') - # if not any(url in link for link in supported_sites): - # raise ValueError('Unsupported site') @property - def markup(self) -> str: return requests.get(self.url).content + def markup(self) -> str: + try: + # raise_for_status alters the default response behavior allowing http errors to raise exception + req = requests.get(self.url) + req.raise_for_status() + return req.content + except requests.exceptions.HTTPError as e: + # returns {status_code} Client Error: Not found for url: {self.url} in the event of any http errors and exits + raise SystemExit(e) + @property def soup(self) -> BeautifulSoup: return BeautifulSoup(self.markup, 'html.parser') diff --git a/yoink/tests/test_basic.py b/yoink/tests/test_basic.py index fb41e5e..e236a01 100644 --- a/yoink/tests/test_basic.py +++ b/yoink/tests/test_basic.py @@ -2,6 +2,7 @@ from bs4 import BeautifulSoup import os import unittest +from shutil import rmtree from yoink.common import app_root, library_path, config_path, skippable_images, supported_sites, required_comic_files, torrent_concurrent_download_limit, headers from yoink.comic import Comic, ComicArchiver @@ -14,6 +15,14 @@ class BasicTestCase(unittest.TestCase): self.test_comic = 'http://readallcomics.com/static-season-one-4-2021/' self.comic = Comic(self.test_comic) self.archiver = ComicArchiver(self.comic) + self.remove_queue = [] + + + def tearDown(self) -> None: + for folder in self.remove_queue: + rmtree(folder) + + def test_000_comic_generates_valid_markup(self): self.assertTrue('!DOCTYPE html' in str(self.comic.markup)) @@ -51,4 +60,7 @@ class BasicTestCase(unittest.TestCase): with self.assertRaises(ValueError) as condition: comic = Comic('https://viz.com') - self.assertTrue('Unsupported' in str(condition.exception)) \ No newline at end of file + self.assertTrue('Unsupported' in str(condition.exception)) + + self.remove_queue.append(os.path.join(library_path, f'comics/{self.comic.title}')) +