Error handling for http errors; added gitlab-ci
This commit is contained in:
8
.gitlab-ci.yml
Normal file
8
.gitlab-ci.yml
Normal file
@@ -0,0 +1,8 @@
|
||||
pytest:
|
||||
stage: test
|
||||
script:
|
||||
- pytest --junitxml report.xml yoink/tests/test_basic.py
|
||||
artifacts:
|
||||
when: always
|
||||
reports:
|
||||
junit: report.xml
|
||||
@@ -2,4 +2,4 @@
|
||||
|
||||
[](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink)
|
||||
|
||||
Yoink! is a multisite media download tool. It scrapes comics from readallcomics.com compressing them into a .cbr archive and grabs magnet links from tpb.party
|
||||
Yoink! is a multisite media download tool. It scrapes comics from online comic aggregate sites like readallcomics.com, compressing them into a .cbr archive, and grabs magnet links from tpb.party
|
||||
1
results.xml
Normal file
1
results.xml
Normal file
@@ -0,0 +1 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="34.081" timestamp="2022-03-14T23:05:57.364590" hostname="DESKTOP-SE506CG"><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_000_comic_generates_valid_markup" time="1.243" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_001_comic_has_valid_title" time="0.998" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_002_comic_has_valid_category" time="1.250" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_003_empty_comic_folder" time="0.591" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_004_comic_folder_created_and_populated" time="22.773" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_005_comic_archive_generated" time="3.357" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_006_folder_cleaned_after_archive_generation" time="1.079" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_007_comic_instance_has_archiver" time="0.587" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_008_comic_is_subclass_scrapable" time="0.959" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_009_invalid_comic_link" time="1.050" /></testsuite></testsuites>
|
||||
@@ -40,16 +40,20 @@ def download(url, comic, torrent, path):
|
||||
except ValueError:
|
||||
click.echo(f'{url} is not supported or is not a valid URL')
|
||||
return 1
|
||||
|
||||
click.echo(f'Downloading {comic.title}')
|
||||
comic.archiver.download()
|
||||
|
||||
click.echo('Building comic archive')
|
||||
comic.archiver.generate_archive()
|
||||
|
||||
click.echo('Cleaning up')
|
||||
comic.archiver.cleanup_worktree()
|
||||
|
||||
click.echo('Success')
|
||||
|
||||
if torrent:
|
||||
click.echo('Downloading a torrent')
|
||||
click.echo('Opps! It looks like Torrents aren\'t yet fully supported.')
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
from click import format_filename
|
||||
from soupsieve import select
|
||||
from yoink.common import required_comic_files, skippable_images, library_path
|
||||
from yoink.scraper import Scrapable
|
||||
|
||||
@@ -14,10 +12,16 @@ class Comic(Scrapable):
|
||||
super().__init__(url)
|
||||
self.archiver = ComicArchiver(self, library=path)
|
||||
|
||||
def __is_supported_image(self, image):
    """Return True when *image* names a JPEG file (``.jpg`` or ``.jpeg``).

    Bug fixed: the original tested ``image.endswith('.jpg' or '.jpeg')``.
    The expression ``'.jpg' or '.jpeg'`` short-circuits to ``'.jpg'``, so
    ``.jpeg`` files were never recognized.  ``str.endswith`` accepts a
    tuple of suffixes, which is the correct idiom here.
    """
    return image.endswith(('.jpg', '.jpeg'))
|
||||
|
||||
|
||||
def __get_image_src(self, comic):
    """Extract the image URL from a scraped tag.

    Tags normally carry the URL in ``src``; lazily-loaded images store it
    in ``data-src`` instead.  When the tag itself has no attributes, the
    URL is taken from the first child tag's ``src``.  Returns None when
    nothing matches (attribute-less tag with no children).
    """
    attributes = comic.attrs
    if attributes:
        if 'src' in attributes:
            return attributes['src']
        return comic['data-src']

    for child in comic:
        return child.attrs['src']
|
||||
@@ -27,7 +31,8 @@ class Comic(Scrapable):
|
||||
'default': self.soup.find_all('div', class_='separator'),
|
||||
'no-div': self.soup.find_all('img', attrs={'width': '1000px'}),
|
||||
'excaliber': self.soup.find_all('img'),
|
||||
'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'})
|
||||
'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'}),
|
||||
'mangadex': self.soup.find_all('img', attrs={'draggable': 'false'})
|
||||
}
|
||||
|
||||
for case in soup.keys():
|
||||
@@ -39,11 +44,18 @@ class Comic(Scrapable):
|
||||
@property
def filelist(self):
    """URLs of the comic's page images, with site chrome filtered out.

    Maps every tag returned by ``__parse_soup`` through
    ``__get_image_src`` and drops any URL ending in a known non-page
    image (logos, nav buttons, ...) listed in ``skippable_images``.

    Defect fixed: the original body printed every scraped tag on each
    access -- debug leftover removed so the property is side-effect free.
    """
    comics = self.__parse_soup()
    sources = map(self.__get_image_src, comics)
    return [src for src in sources if not src.endswith(skippable_images)]
|
||||
|
||||
|
||||
@property
|
||||
def title(self): return self.soup.title.string.replace(' | Read All Comics Online For Free', '').replace('…', '').replace('#', '').replace(':', '').strip()
|
||||
def title(self):
    """Best-effort comic title derived from the scraped page.

    readallcomics pages keep it in <title> (site suffix and stray
    punctuation are stripped); mangadex pages keep it in the og:title
    meta tag.  Any other site falls back to 'Uncategorized'.
    """
    page_url = self.url
    if 'readallcomics' in page_url:
        raw = self.soup.title.string
        for junk in (' | Read All Comics Online For Free', '…', '#', ':'):
            raw = raw.replace(junk, '')
        return raw.strip()
    if 'mangadex' in page_url:
        raw = self.soup.find('meta', property='og:title').attrs['content']
        return raw.replace(' - Mangadex', '').replace('Read ', '')
    return 'Uncategorized'
|
||||
|
||||
@property
|
||||
def category(self):
|
||||
@@ -75,9 +87,11 @@ class ComicArchiver:
|
||||
print(formatted_file, end='\r')
|
||||
urllib.request.urlretrieve(url, filename=formatted_file)
|
||||
else:
|
||||
page_number = url.split('/')[-1].split('.')[0].zfill(3)
|
||||
page_number = str(index).zfill(3)
|
||||
file_extension = url.split('/')[-1].split('.')[1]
|
||||
urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, f'{self.comic.title}{page_number}.{file_extension}'))
|
||||
formatted_file = f'{self.comic.title} - {page_number}.{file_extension}'
|
||||
print(formatted_file, end='\r')
|
||||
urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, formatted_file))
|
||||
print()
|
||||
|
||||
def generate_archive(self, archive_format='.cbr'):
|
||||
@@ -95,10 +109,4 @@ class ComicArchiver:
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc manual smoke test: scrape one known comic and show the
    # category it was filed under.  The commented-out archiver driver
    # code that used to live here was dead code and has been removed
    # (the same flow is exercised by yoink/tests/test_basic.py).
    comic = Comic('http://www.readallcomics.com/static-season-one-4-2021/')
    print(comic.category)
|
||||
@@ -11,7 +11,7 @@ app_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
||||
config_path = os.path.abspath(os.path.join(os.environ.get('HOME'), '.config/yoink'))
|
||||
library_path = os.path.abspath(os.path.join(os.environ.get('HOME'), 'yoink/library'))
|
||||
required_comic_files = ('.cbr', '.cbz', '000.jpg', '001.jpg')
|
||||
skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png')
|
||||
skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png', 'navbar.svg')
|
||||
torrent_concurrent_download_limit = 1
|
||||
supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net']
|
||||
supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net', 'mangadex.tv']
|
||||
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
from yoink.common import supported_sites
|
||||
|
||||
|
||||
@@ -12,17 +11,19 @@ class Scrapable:
|
||||
|
||||
|
||||
self.__check_site_support()
|
||||
# for link in supported_sites:
|
||||
# if link in self.url:
|
||||
# return
|
||||
# else:
|
||||
# raise ValueError('Unsupported site')
|
||||
# if not any(url in link for link in supported_sites):
|
||||
# raise ValueError('Unsupported site')
|
||||
|
||||
|
||||
@property
|
||||
def markup(self) -> str: return requests.get(self.url).content
|
||||
def markup(self) -> str:
    """Fetch the page at ``self.url`` and return the raw response body.

    ``raise_for_status`` converts HTTP error statuses into exceptions;
    on such an error the process exits via ``SystemExit`` carrying the
    '{status_code} Client Error ... for url: {self.url}' message.
    """
    try:
        response = requests.get(self.url)
        # Opt in to exception-raising behaviour for 4xx/5xx statuses.
        response.raise_for_status()
    except requests.exceptions.HTTPError as http_error:
        # NOTE(review): SystemExit from a property kills the whole
        # process on any HTTP error -- deliberate per the original.
        raise SystemExit(http_error)
    return response.content
|
||||
|
||||
|
||||
@property
def soup(self) -> BeautifulSoup:
    """Parsed document tree for the downloaded markup (html.parser backend)."""
    parsed = BeautifulSoup(self.markup, 'html.parser')
    return parsed
|
||||
|
||||
@@ -2,6 +2,7 @@ from bs4 import BeautifulSoup
|
||||
|
||||
import os
|
||||
import unittest
|
||||
from shutil import rmtree
|
||||
|
||||
from yoink.common import app_root, library_path, config_path, skippable_images, supported_sites, required_comic_files, torrent_concurrent_download_limit, headers
|
||||
from yoink.comic import Comic, ComicArchiver
|
||||
@@ -14,6 +15,14 @@ class BasicTestCase(unittest.TestCase):
|
||||
self.test_comic = 'http://readallcomics.com/static-season-one-4-2021/'
|
||||
self.comic = Comic(self.test_comic)
|
||||
self.archiver = ComicArchiver(self.comic)
|
||||
self.remove_queue = []
|
||||
|
||||
|
||||
def tearDown(self) -> None:
    """Recursively delete every directory a test queued for cleanup."""
    for queued_folder in list(self.remove_queue):
        rmtree(queued_folder)
|
||||
|
||||
|
||||
|
||||
def test_000_comic_generates_valid_markup(self):
|
||||
self.assertTrue('!DOCTYPE html' in str(self.comic.markup))
|
||||
@@ -52,3 +61,6 @@ class BasicTestCase(unittest.TestCase):
|
||||
comic = Comic('https://viz.com')
|
||||
|
||||
self.assertTrue('Unsupported' in str(condition.exception))
|
||||
|
||||
self.remove_queue.append(os.path.join(library_path, f'comics/{self.comic.title}'))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user