Error handling for http errors; added gitlab-ci

This commit is contained in:
Bryan Bailey
2022-03-14 23:07:50 -04:00
parent a5c0f83cd4
commit 91ff7e0b80
8 changed files with 61 additions and 27 deletions

8
.gitlab-ci.yml Normal file
View File

@@ -0,0 +1,8 @@
ruby:
stage: test
script:
- pytest --junitxml report.xml yoink/tests/test_basic.py
artifacts:
when: always
reports:
junit: report.xml

View File

@@ -2,4 +2,4 @@
[![wakatime](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink.svg)](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink)
Yoink! is a multisite media download tool. It scrapes comics from readallcomics.com compressing them into a .cbr archive and grabs magnet links from tpb.party
Yoink! is a multisite media download tool. It scrapes comics from online comic aggregator sites like readallcomics.com, compressing them into a .cbr archive, and grabs magnet links from tpb.party

1
results.xml Normal file
View File

@@ -0,0 +1 @@
<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="34.081" timestamp="2022-03-14T23:05:57.364590" hostname="DESKTOP-SE506CG"><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_000_comic_generates_valid_markup" time="1.243" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_001_comic_has_valid_title" time="0.998" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_002_comic_has_valid_category" time="1.250" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_003_empty_comic_folder" time="0.591" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_004_comic_folder_created_and_populated" time="22.773" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_005_comic_archive_generated" time="3.357" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_006_folder_cleaned_after_archive_generation" time="1.079" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_007_comic_instance_has_archiver" time="0.587" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_008_comic_is_subclass_scrapable" time="0.959" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_009_invalid_comic_link" time="1.050" /></testsuite></testsuites>

View File

@@ -40,16 +40,20 @@ def download(url, comic, torrent, path):
except ValueError:
click.echo(f'{url} is not supported or is not a valid URL')
return 1
click.echo(f'Downloading {comic.title}')
comic.archiver.download()
click.echo('Building comic archive')
comic.archiver.generate_archive()
click.echo('Cleaning up')
comic.archiver.cleanup_worktree()
click.echo('Success')
if torrent:
click.echo('Downloading a torrent')
click.echo('Opps! It looks like Torrents aren\'t yet fully supported.')

View File

@@ -1,5 +1,3 @@
from click import format_filename
from soupsieve import select
from yoink.common import required_comic_files, skippable_images, library_path
from yoink.scraper import Scrapable
@@ -14,10 +12,16 @@ class Comic(Scrapable):
super().__init__(url)
self.archiver = ComicArchiver(self, library=path)
def __is_supported_image(self, image):
    """Return True when the file name ``image`` ends in ``.jpg`` or ``.jpeg``.

    ``str.endswith`` takes a tuple of suffixes.  The previous
    ``'.jpg' or '.jpeg'`` expression evaluated to just ``'.jpg'``, so
    ``.jpeg`` names were never recognised.
    """
    return image.endswith(('.jpg', '.jpeg'))
def __get_image_src(self, comic):
    """Return the image URL carried by a parsed tag.

    If ``comic`` has attributes, its ``src`` is returned, falling back to
    ``data-src`` when ``src`` is missing (presumably lazy-loaded images;
    confirm against the target sites).  If it has no attributes, the
    ``src`` of its first child is returned.  ``None`` is returned
    implicitly when there are neither attributes nor children.

    Raises:
        KeyError: if an attributed tag has neither ``src`` nor ``data-src``.
    """
    if comic.attrs:
        # A single lookup: an unconditional return placed before this try
        # would make the data-src fallback unreachable.
        try:
            return comic.attrs['src']
        except KeyError:
            return comic['data-src']

    # Wrapper element without attributes: use the first child's source.
    for image in comic:
        return image.attrs['src']
@@ -27,7 +31,8 @@ class Comic(Scrapable):
'default': self.soup.find_all('div', class_='separator'),
'no-div': self.soup.find_all('img', attrs={'width': '1000px'}),
'excaliber': self.soup.find_all('img'),
'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'})
'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'}),
'mangadex': self.soup.find_all('img', attrs={'draggable': 'false'})
}
for case in soup.keys():
@@ -39,11 +44,18 @@ class Comic(Scrapable):
@property
def filelist(self):
    """List the image URLs that make up this comic.

    Every tag returned by ``__parse_soup`` is resolved to its source URL
    with ``__get_image_src``.  URLs ending in any of the
    ``skippable_images`` suffixes are dropped.

    The page is parsed again on every access, so callers that need the
    list more than once should keep a local copy.
    """
    # No per-tag debug printing here: this is a library property and the
    # download progress output shares stdout.
    sources = map(self.__get_image_src, self.__parse_soup())
    return [src for src in sources if not src.endswith(skippable_images)]
@property
def title(self):
    """Return a display title for the comic, derived from the page.

    * readallcomics URLs: the ``<title>`` text with the site suffix,
      ``#`` and ``:`` removed and surrounding whitespace stripped.
    * mangadex URLs: the ``og:title`` meta content without the
      ``' - Mangadex'`` suffix and ``'Read '`` prefix.
    * anything else: the literal ``'Uncategorized'``.

    Only one definition is kept; the earlier one-line variant was fully
    superseded by this one.
    """
    url = self.url

    if 'readallcomics' in url:
        page_title = self.soup.title.string
        # NOTE(review): .replace('', '') is a no-op as written; a
        # non-printing character may have been lost here. Confirm.
        return page_title.replace(' | Read All Comics Online For Free', '').replace('', '').replace('#', '').replace(':', '').strip()

    if 'mangadex' in url:
        meta = self.soup.find('meta', property='og:title')
        return meta.attrs['content'].replace(' - Mangadex', '').replace('Read ', '')

    return 'Uncategorized'
@property
def category(self):
@@ -75,9 +87,11 @@ class ComicArchiver:
print(formatted_file, end='\r')
urllib.request.urlretrieve(url, filename=formatted_file)
else:
page_number = url.split('/')[-1].split('.')[0].zfill(3)
page_number = str(index).zfill(3)
file_extension = url.split('/')[-1].split('.')[1]
urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, f'{self.comic.title}{page_number}.{file_extension}'))
formatted_file = f'{self.comic.title} - {page_number}.{file_extension}'
print(formatted_file, end='\r')
urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, formatted_file))
print()
def generate_archive(self, archive_format='.cbr'):
@@ -95,10 +109,4 @@ class ComicArchiver:
if __name__ == '__main__':
    # Manual smoke check: build a Comic for a known page and print the
    # category derived from it.  Constructing the Comic and reading
    # ``category`` needs network access.
    comic = Comic('http://www.readallcomics.com/static-season-one-4-2021/')
    print(comic.category)

View File

@@ -11,7 +11,7 @@ app_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
# Base directory for per-user paths.  HOME is preferred when set (unchanged
# behaviour); otherwise fall back to expanduser('~'), because
# os.path.join(None, ...) raises TypeError at import time when HOME is unset.
_home = os.environ.get('HOME') or os.path.expanduser('~')

# Per-user configuration directory and default comic library location.
config_path = os.path.abspath(os.path.join(_home, '.config/yoink'))
library_path = os.path.abspath(os.path.join(_home, 'yoink/library'))

# File-name suffixes used elsewhere to recognise an existing comic download.
required_comic_files = ('.cbr', '.cbz', '000.jpg', '001.jpg')

# Image file names that are never comic pages and are filtered out of the
# scraped file list.
skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png', 'navbar.svg')

torrent_concurrent_download_limit = 1

# Sites a URL must match to be accepted.
supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net', 'mangadex.tv']

# Browser-like request headers.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}

View File

@@ -1,7 +1,6 @@
import requests
from bs4 import BeautifulSoup
from yoink.common import supported_sites
@@ -12,17 +11,19 @@ class Scrapable:
self.__check_site_support()
# for link in supported_sites:
# if link in self.url:
# return
# else:
# raise ValueError('Unsupported site')
# if not any(url in link for link in supported_sites):
# raise ValueError('Unsupported site')
@property
def markup(self) -> bytes:
    """Download ``self.url`` and return the raw response body.

    A new HTTP request is made on every access.

    Raises:
        SystemExit: if the server answers with an HTTP error status.  The
            exit message is the text of the underlying ``HTTPError`` (for
            example ``404 Client Error: Not Found for url: ...``).
        requests.exceptions.RequestException: connection failures and
            timeouts are not handled here and propagate to the caller.
    """
    try:
        # Without a timeout requests can wait on a stalled server forever.
        response = requests.get(self.url, timeout=30)
        # requests does not raise on 4xx/5xx by itself; raise_for_status()
        # turns those responses into HTTPError.
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Exit with the HTTP error text, keeping the original exception
        # attached as the cause.
        raise SystemExit(e) from e
    return response.content
@property
def soup(self) -> BeautifulSoup:
    """Parse the current ``markup`` into a ``BeautifulSoup`` tree.

    The tree is built with the ``html.parser`` backend.  Each access reads
    ``self.markup`` again and builds a new tree.
    """
    parser_name = 'html.parser'
    document = BeautifulSoup(self.markup, parser_name)
    return document

View File

@@ -2,6 +2,7 @@ from bs4 import BeautifulSoup
import os
import unittest
from shutil import rmtree
from yoink.common import app_root, library_path, config_path, skippable_images, supported_sites, required_comic_files, torrent_concurrent_download_limit, headers
from yoink.comic import Comic, ComicArchiver
@@ -14,6 +15,14 @@ class BasicTestCase(unittest.TestCase):
self.test_comic = 'http://readallcomics.com/static-season-one-4-2021/'
self.comic = Comic(self.test_comic)
self.archiver = ComicArchiver(self.comic)
self.remove_queue = []
def tearDown(self) -> None:
    """Delete every folder that a test queued in ``self.remove_queue``.

    Cleanup is best effort: a queued folder that was never created, or was
    already removed, must not turn a passing test into an error.
    """
    for folder in self.remove_queue:
        rmtree(folder, ignore_errors=True)
def test_000_comic_generates_valid_markup(self):
self.assertTrue('!DOCTYPE html' in str(self.comic.markup))
@@ -51,4 +60,7 @@ class BasicTestCase(unittest.TestCase):
with self.assertRaises(ValueError) as condition:
comic = Comic('https://viz.com')
self.assertTrue('Unsupported' in str(condition.exception))
self.assertTrue('Unsupported' in str(condition.exception))
self.remove_queue.append(os.path.join(library_path, f'comics/{self.comic.title}'))