Error handling for HTTP errors; added GitLab CI

2022-03-14 23:07:50 -04:00
parent a5c0f83cd4
commit 91ff7e0b80
8 changed files with 61 additions and 27 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -0,0 +1,8 @@
+ruby:
+  stage: test
+  script:
+    - pytest --junitxml report.xml yoink/tests/test_basic.py 
+  artifacts:
+    when: always
+    reports:
+      junit: report.xml
--- a/README.md
+++ b/README.md
@@ -2,4 +2,4 @@

 [![wakatime](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink.svg)](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink)

-Yoink! is a multisite media download tool. It scrapes comics from readallcomics.com compressing them into a .cbr archive and grabs magnet links from tpb.party
+Yoink! is a multisite media download tool. It scrapes comics from online comic aggregator sites like readallcomics.com, compressing them into a .cbr archive, and grabs magnet links from tpb.party
--- a/results.xml
+++ b/results.xml
@@ -0,0 +1 @@
+<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="34.081" timestamp="2022-03-14T23:05:57.364590" hostname="DESKTOP-SE506CG"><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_000_comic_generates_valid_markup" time="1.243" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_001_comic_has_valid_title" time="0.998" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_002_comic_has_valid_category" time="1.250" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_003_empty_comic_folder" time="0.591" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_004_comic_folder_created_and_populated" time="22.773" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_005_comic_archive_generated" time="3.357" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_006_folder_cleaned_after_archive_generation" time="1.079" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_007_comic_instance_has_archiver" time="0.587" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_008_comic_is_subclass_scrapable" time="0.959" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_009_invalid_comic_link" time="1.050" /></testsuite></testsuites>
--- a/yoink/cli.py
+++ b/yoink/cli.py
@@ -40,16 +40,20 @@ def download(url, comic, torrent, path):
        except ValueError:
            click.echo(f'{url} is not supported or is not a valid URL')
            return 1
+
        click.echo(f'Downloading {comic.title}')
        comic.archiver.download()
+
        click.echo('Building comic archive')
        comic.archiver.generate_archive()
+
        click.echo('Cleaning up')
        comic.archiver.cleanup_worktree()
+        
        click.echo('Success')
    
    if torrent:
-        click.echo('Downloading a torrent')
+        click.echo('Opps! It looks like Torrents aren\'t yet fully supported.')

    

--- a/yoink/comic.py
+++ b/yoink/comic.py
@@ -1,5 +1,3 @@
-from click import format_filename
-from soupsieve import select
 from yoink.common import required_comic_files, skippable_images, library_path
 from yoink.scraper import Scrapable

@@ -14,10 +12,16 @@ class Comic(Scrapable):
        super().__init__(url)
        self.archiver = ComicArchiver(self, library=path)

+    def __is_supported_image(self, image):
+        return image.endswith(('.jpg', '.jpeg'))  # endswith needs a tuple of suffixes; `'.jpg' or '.jpeg'` evaluates to just '.jpg'
+

    def __get_image_src(self, comic):
        if comic.attrs:
-            return comic.attrs['src']
+            try:
+                return comic.attrs['src']
+            except KeyError:
+                return comic['data-src']

        for image in comic:
            return image.attrs['src']
@@ -27,7 +31,8 @@ class Comic(Scrapable):
            'default': self.soup.find_all('div', class_='separator'),
            'no-div': self.soup.find_all('img', attrs={'width': '1000px'}),
            'excaliber': self.soup.find_all('img'),
-            'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'})
+            'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'}),
+            'mangadex': self.soup.find_all('img', attrs={'draggable': 'false'})
        }

        for case in soup.keys():
@@ -39,11 +44,18 @@ class Comic(Scrapable):
    @property
    def filelist(self):
        comics = self.__parse_soup()
+        for comic in comics: print(comic)
        return [comic for comic in list(map(self.__get_image_src, comics)) if not comic.endswith(skippable_images)]


    @property
-    def title(self): return self.soup.title.string.replace(' | Read All Comics Online For Free', '').replace('…', '').replace('#', '').replace(':', '').strip()
+    def title(self):
+        if 'readallcomics' in self.url:
+            return self.soup.title.string.replace(' | Read All Comics Online For Free', '').replace('…', '').replace('#', '').replace(':', '').strip()
+        elif 'mangadex' in self.url:
+            return self.soup.find('meta', property='og:title').attrs['content'].replace(' - Mangadex', '').replace('Read ', '')
+        else:
+            return 'Uncategorized'

    @property
    def category(self):
@@ -75,9 +87,11 @@ class ComicArchiver:
                print(formatted_file, end='\r')
                urllib.request.urlretrieve(url, filename=formatted_file)
            else:
-                page_number = url.split('/')[-1].split('.')[0].zfill(3)
+                page_number = str(index).zfill(3)
                file_extension = url.split('/')[-1].split('.')[1]
-                urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, f'{self.comic.title}{page_number}.{file_extension}'))
+                formatted_file = f'{self.comic.title} - {page_number}.{file_extension}'
+                print(formatted_file, end='\r')
+                urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, formatted_file))
        print()

    def generate_archive(self, archive_format='.cbr'):
@@ -95,10 +109,4 @@ class ComicArchiver:

 if __name__ == '__main__':
    comic = Comic('http://www.readallcomics.com/static-season-one-4-2021/')
-    # # print(comic.filelist)
-    # # print(len(comic.filelist))
-    # archiver = ComicArchiver(comic)
-    # archiver.download()
-    # archiver.generate_archive()
-    # archiver.cleanup_worktree()
    print(comic.category)
--- a/yoink/common.py
+++ b/yoink/common.py
@@ -11,7 +11,7 @@ app_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
 config_path = os.path.abspath(os.path.join(os.environ.get('HOME'), '.config/yoink'))
 library_path = os.path.abspath(os.path.join(os.environ.get('HOME'), 'yoink/library'))
 required_comic_files = ('.cbr', '.cbz', '000.jpg', '001.jpg')
-skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png')
+skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png', 'navbar.svg')
 torrent_concurrent_download_limit = 1
-supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net']
+supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net', 'mangadex.tv']
 headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
--- a/yoink/scraper.py
+++ b/yoink/scraper.py
@@ -1,7 +1,6 @@
 import requests
 from bs4 import BeautifulSoup

-
 from yoink.common import supported_sites


@@ -12,17 +11,19 @@ class Scrapable:

        
        self.__check_site_support()
-        # for link in supported_sites:
-        #     if link in self.url:
-        #         return
-        #     else:
-        #         raise ValueError('Unsupported site')
-        # if not any(url in link for link in supported_sites):
-        #     raise ValueError('Unsupported site')


    @property
-    def markup(self) -> str: return requests.get(self.url).content
+    def markup(self) -> str:
+        try:
+            # requests does not raise on 4xx/5xx responses by default; raise_for_status() turns them into HTTPError
+            req = requests.get(self.url)
+            req.raise_for_status()
+            return req.content  # NOTE(review): Response.content is bytes, so the `-> str` annotation looks wrong - confirm
+        except requests.exceptions.HTTPError as e:
+            # re-raise as SystemExit carrying the HTTPError message (e.g. "404 Client Error: Not Found for url: ..."); other request failures are not caught here
+            raise SystemExit(e)
+

    @property
    def soup(self) -> BeautifulSoup: return BeautifulSoup(self.markup, 'html.parser')
--- a/yoink/tests/test_basic.py
+++ b/yoink/tests/test_basic.py
@@ -2,6 +2,7 @@ from bs4 import BeautifulSoup

 import os
 import unittest
+from shutil import rmtree

 from yoink.common import app_root, library_path, config_path, skippable_images, supported_sites, required_comic_files, torrent_concurrent_download_limit, headers
 from yoink.comic import Comic, ComicArchiver
@@ -14,6 +15,14 @@ class BasicTestCase(unittest.TestCase):
        self.test_comic = 'http://readallcomics.com/static-season-one-4-2021/'
        self.comic = Comic(self.test_comic)
        self.archiver = ComicArchiver(self.comic)
+        self.remove_queue = []
+
+        
+    def tearDown(self) -> None:
+        for folder in self.remove_queue:
+            rmtree(folder, ignore_errors=True)  # queued folders may never have been created; cleanup is best-effort
+        
+

    def test_000_comic_generates_valid_markup(self):
        self.assertTrue('!DOCTYPE html' in str(self.comic.markup))
@@ -51,4 +60,7 @@ class BasicTestCase(unittest.TestCase):
        with self.assertRaises(ValueError) as condition:
            comic = Comic('https://viz.com')

-        self.assertTrue('Unsupported' in str(condition.exception))
+        self.assertTrue('Unsupported' in str(condition.exception))
+
+        self.remove_queue.append(os.path.join(library_path, f'comics/{self.comic.title}'))
+
				`@@ -0,0 +1 @@`
				<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="34.081" timestamp="2022-03-14T23:05:57.364590" hostname="DESKTOP-SE506CG"><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_000_comic_generates_valid_markup" time="1.243" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_001_comic_has_valid_title" time="0.998" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_002_comic_has_valid_category" time="1.250" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_003_empty_comic_folder" time="0.591" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_004_comic_folder_created_and_populated" time="22.773" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_005_comic_archive_generated" time="3.357" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_006_folder_cleaned_after_archive_generation" time="1.079" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_007_comic_instance_has_archiver" time="0.587" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_008_comic_is_subclass_scrapable" time="0.959" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_009_invalid_comic_link" time="1.050" /></testsuite></testsuites>