Error handling for http errors; added gitlab-ci
This commit is contained in:
8
.gitlab-ci.yml
Normal file
8
.gitlab-ci.yml
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
ruby:
|
||||||
|
stage: test
|
||||||
|
script:
|
||||||
|
- pytest --junitxml report.xml yoink/tests/test_basic.py
|
||||||
|
artifacts:
|
||||||
|
when: always
|
||||||
|
reports:
|
||||||
|
junit: report.xml
|
||||||
@@ -2,4 +2,4 @@
|
|||||||
|
|
||||||
[](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink)
|
[](https://wakatime.com/badge/gitlab/Rigil-Kent/yoink)
|
||||||
|
|
||||||
Yoink! is a multisite media download tool. It scrapes comics from readallcomics.com compressing them into a .cbr archive and grabs magnet links from tpb.party
|
Yoink! is a multisite media download tool. It scrapes comics from online comic aggregator sites like readallcomics.com, compresses them into a .cbr archive, and grabs magnet links from tpb.party
|
||||||
1
results.xml
Normal file
1
results.xml
Normal file
@@ -0,0 +1 @@
|
|||||||
|
<?xml version="1.0" encoding="utf-8"?><testsuites><testsuite name="pytest" errors="0" failures="0" skipped="0" tests="10" time="34.081" timestamp="2022-03-14T23:05:57.364590" hostname="DESKTOP-SE506CG"><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_000_comic_generates_valid_markup" time="1.243" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_001_comic_has_valid_title" time="0.998" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_002_comic_has_valid_category" time="1.250" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_003_empty_comic_folder" time="0.591" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_004_comic_folder_created_and_populated" time="22.773" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_005_comic_archive_generated" time="3.357" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_006_folder_cleaned_after_archive_generation" time="1.079" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_007_comic_instance_has_archiver" time="0.587" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_008_comic_is_subclass_scrapable" time="0.959" /><testcase classname="yoink.tests.test_basic.BasicTestCase" name="test_009_invalid_comic_link" time="1.050" /></testsuite></testsuites>
|
||||||
@@ -40,16 +40,20 @@ def download(url, comic, torrent, path):
|
|||||||
except ValueError:
|
except ValueError:
|
||||||
click.echo(f'{url} is not supported or is not a valid URL')
|
click.echo(f'{url} is not supported or is not a valid URL')
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
click.echo(f'Downloading {comic.title}')
|
click.echo(f'Downloading {comic.title}')
|
||||||
comic.archiver.download()
|
comic.archiver.download()
|
||||||
|
|
||||||
click.echo('Building comic archive')
|
click.echo('Building comic archive')
|
||||||
comic.archiver.generate_archive()
|
comic.archiver.generate_archive()
|
||||||
|
|
||||||
click.echo('Cleaning up')
|
click.echo('Cleaning up')
|
||||||
comic.archiver.cleanup_worktree()
|
comic.archiver.cleanup_worktree()
|
||||||
|
|
||||||
click.echo('Success')
|
click.echo('Success')
|
||||||
|
|
||||||
if torrent:
|
if torrent:
|
||||||
click.echo('Downloading a torrent')
|
click.echo('Opps! It looks like Torrents aren\'t yet fully supported.')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,3 @@
|
|||||||
from click import format_filename
|
|
||||||
from soupsieve import select
|
|
||||||
from yoink.common import required_comic_files, skippable_images, library_path
|
from yoink.common import required_comic_files, skippable_images, library_path
|
||||||
from yoink.scraper import Scrapable
|
from yoink.scraper import Scrapable
|
||||||
|
|
||||||
@@ -14,10 +12,16 @@ class Comic(Scrapable):
|
|||||||
super().__init__(url)
|
super().__init__(url)
|
||||||
self.archiver = ComicArchiver(self, library=path)
|
self.archiver = ComicArchiver(self, library=path)
|
||||||
|
|
||||||
|
def __is_supported_image(self, image):
    """Return True when *image* names a JPEG file.

    Args:
        image: File name or URL string to inspect.

    Returns:
        True if the string ends with ``.jpg`` or ``.jpeg``, else False.
    """
    # A tuple is required here: the expression `'.jpg' or '.jpeg'`
    # evaluates to '.jpg' alone, so '.jpeg' would never be matched.
    return image.endswith(('.jpg', '.jpeg'))
|
||||||
|
|
||||||
|
|
||||||
def __get_image_src(self, comic):
|
def __get_image_src(self, comic):
|
||||||
if comic.attrs:
|
if comic.attrs:
|
||||||
|
try:
|
||||||
return comic.attrs['src']
|
return comic.attrs['src']
|
||||||
|
except KeyError:
|
||||||
|
return comic['data-src']
|
||||||
|
|
||||||
for image in comic:
|
for image in comic:
|
||||||
return image.attrs['src']
|
return image.attrs['src']
|
||||||
@@ -27,7 +31,8 @@ class Comic(Scrapable):
|
|||||||
'default': self.soup.find_all('div', class_='separator'),
|
'default': self.soup.find_all('div', class_='separator'),
|
||||||
'no-div': self.soup.find_all('img', attrs={'width': '1000px'}),
|
'no-div': self.soup.find_all('img', attrs={'width': '1000px'}),
|
||||||
'excaliber': self.soup.find_all('img'),
|
'excaliber': self.soup.find_all('img'),
|
||||||
'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'})
|
'dbsuper': self.soup.findAll('meta', attrs={'property': 'twitter:image'}),
|
||||||
|
'mangadex': self.soup.find_all('img', attrs={'draggable': 'false'})
|
||||||
}
|
}
|
||||||
|
|
||||||
for case in soup.keys():
|
for case in soup.keys():
|
||||||
@@ -39,11 +44,18 @@ class Comic(Scrapable):
|
|||||||
@property
|
@property
|
||||||
def filelist(self):
|
def filelist(self):
|
||||||
comics = self.__parse_soup()
|
comics = self.__parse_soup()
|
||||||
|
for comic in comics: print(comic)
|
||||||
return [comic for comic in list(map(self.__get_image_src, comics)) if not comic.endswith(skippable_images)]
|
return [comic for comic in list(map(self.__get_image_src, comics)) if not comic.endswith(skippable_images)]
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def title(self): return self.soup.title.string.replace(' | Read All Comics Online For Free', '').replace('…', '').replace('#', '').replace(':', '').strip()
|
def title(self):
    """Return a display title for the comic with site boilerplate removed.

    The result depends on which site ``self.url`` points at:

    * readallcomics: the ``<title>`` text minus the site suffix, with
      ``…``, ``#`` and ``:`` removed and surrounding whitespace stripped.
    * mangadex: the ``og:title`` meta content minus `` - Mangadex`` and
      the site's ``Read `` label.
    * anything else: the literal ``'Uncategorized'``.
    """
    if 'readallcomics' in self.url:
        return self.soup.title.string.replace(' | Read All Comics Online For Free', '').replace('…', '').replace('#', '').replace(':', '').strip()
    elif 'mangadex' in self.url:
        og_title = self.soup.find('meta', property='og:title').attrs['content']
        # Only the first 'Read ' is dropped so that a title which itself
        # contains the word (e.g. "Read or Die") is left intact.
        # NOTE(review): assumes the site's label precedes the title - confirm.
        return og_title.replace(' - Mangadex', '').replace('Read ', '', 1)
    else:
        return 'Uncategorized'
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def category(self):
|
def category(self):
|
||||||
@@ -75,9 +87,11 @@ class ComicArchiver:
|
|||||||
print(formatted_file, end='\r')
|
print(formatted_file, end='\r')
|
||||||
urllib.request.urlretrieve(url, filename=formatted_file)
|
urllib.request.urlretrieve(url, filename=formatted_file)
|
||||||
else:
|
else:
|
||||||
page_number = url.split('/')[-1].split('.')[0].zfill(3)
|
page_number = str(index).zfill(3)
|
||||||
file_extension = url.split('/')[-1].split('.')[1]
|
file_extension = url.split('/')[-1].split('.')[1]
|
||||||
urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, f'{self.comic.title}{page_number}.{file_extension}'))
|
formatted_file = f'{self.comic.title} - {page_number}.{file_extension}'
|
||||||
|
print(formatted_file, end='\r')
|
||||||
|
urllib.request.urlretrieve(url, filename=os.path.join(self.worktree, formatted_file))
|
||||||
print()
|
print()
|
||||||
|
|
||||||
def generate_archive(self, archive_format='.cbr'):
|
def generate_archive(self, archive_format='.cbr'):
|
||||||
@@ -95,10 +109,4 @@ class ComicArchiver:
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
comic = Comic('http://www.readallcomics.com/static-season-one-4-2021/')
|
comic = Comic('http://www.readallcomics.com/static-season-one-4-2021/')
|
||||||
# # print(comic.filelist)
|
|
||||||
# # print(len(comic.filelist))
|
|
||||||
# archiver = ComicArchiver(comic)
|
|
||||||
# archiver.download()
|
|
||||||
# archiver.generate_archive()
|
|
||||||
# archiver.cleanup_worktree()
|
|
||||||
print(comic.category)
|
print(comic.category)
|
||||||
@@ -11,7 +11,7 @@ app_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
|
|||||||
config_path = os.path.abspath(os.path.join(os.environ.get('HOME'), '.config/yoink'))
|
config_path = os.path.abspath(os.path.join(os.environ.get('HOME'), '.config/yoink'))
|
||||||
library_path = os.path.abspath(os.path.join(os.environ.get('HOME'), 'yoink/library'))
|
library_path = os.path.abspath(os.path.join(os.environ.get('HOME'), 'yoink/library'))
|
||||||
required_comic_files = ('.cbr', '.cbz', '000.jpg', '001.jpg')
|
required_comic_files = ('.cbr', '.cbz', '000.jpg', '001.jpg')
|
||||||
skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png')
|
skippable_images = ('logo-1.png', 'logo.png', 'report.png', 'request.png', 'prev.png', 'Next.png', 'Donate.png', '11.png', 'navbar.svg')
|
||||||
torrent_concurrent_download_limit = 1
|
torrent_concurrent_download_limit = 1
|
||||||
supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net']
|
supported_sites = ['readallcomics.com', 'tpb.party', 'dragonballsupermanga.net', 'mangadex.tv']
|
||||||
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
|
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
|
||||||
from yoink.common import supported_sites
|
from yoink.common import supported_sites
|
||||||
|
|
||||||
|
|
||||||
@@ -12,17 +11,19 @@ class Scrapable:
|
|||||||
|
|
||||||
|
|
||||||
self.__check_site_support()
|
self.__check_site_support()
|
||||||
# for link in supported_sites:
|
|
||||||
# if link in self.url:
|
|
||||||
# return
|
|
||||||
# else:
|
|
||||||
# raise ValueError('Unsupported site')
|
|
||||||
# if not any(url in link for link in supported_sites):
|
|
||||||
# raise ValueError('Unsupported site')
|
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def markup(self) -> str: return requests.get(self.url).content
|
def markup(self) -> bytes:
    """Fetch ``self.url`` and return the raw response body.

    Returns:
        The undecoded body (``Response.content``) of the GET response.

    Raises:
        SystemExit: if the server answers with an HTTP error status; the
            exit payload is the underlying ``HTTPError``. Other request
            failures are not caught here and propagate unchanged.
    """
    try:
        # requests does not raise on 4xx/5xx by itself; raise_for_status()
        # converts such responses into requests.exceptions.HTTPError.
        response = requests.get(self.url)
        response.raise_for_status()
    except requests.exceptions.HTTPError as e:
        # Exit with the error's message (status code, reason and URL)
        # rather than letting a traceback reach the user.
        raise SystemExit(e)
    return response.content
|
||||||
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def soup(self) -> BeautifulSoup: return BeautifulSoup(self.markup, 'html.parser')
|
def soup(self) -> BeautifulSoup: return BeautifulSoup(self.markup, 'html.parser')
|
||||||
|
|||||||
@@ -2,6 +2,7 @@ from bs4 import BeautifulSoup
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
from shutil import rmtree
|
||||||
|
|
||||||
from yoink.common import app_root, library_path, config_path, skippable_images, supported_sites, required_comic_files, torrent_concurrent_download_limit, headers
|
from yoink.common import app_root, library_path, config_path, skippable_images, supported_sites, required_comic_files, torrent_concurrent_download_limit, headers
|
||||||
from yoink.comic import Comic, ComicArchiver
|
from yoink.comic import Comic, ComicArchiver
|
||||||
@@ -14,6 +15,14 @@ class BasicTestCase(unittest.TestCase):
|
|||||||
self.test_comic = 'http://readallcomics.com/static-season-one-4-2021/'
|
self.test_comic = 'http://readallcomics.com/static-season-one-4-2021/'
|
||||||
self.comic = Comic(self.test_comic)
|
self.comic = Comic(self.test_comic)
|
||||||
self.archiver = ComicArchiver(self.comic)
|
self.archiver = ComicArchiver(self.comic)
|
||||||
|
self.remove_queue = []
|
||||||
|
|
||||||
|
|
||||||
|
def tearDown(self) -> None:
    """Delete every folder a test queued in ``self.remove_queue``."""
    for folder in self.remove_queue:
        # A queued folder may never have been created (or may already be
        # gone); cleanup must not turn a passing test into an error.
        rmtree(folder, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_000_comic_generates_valid_markup(self):
|
def test_000_comic_generates_valid_markup(self):
|
||||||
self.assertTrue('!DOCTYPE html' in str(self.comic.markup))
|
self.assertTrue('!DOCTYPE html' in str(self.comic.markup))
|
||||||
@@ -52,3 +61,6 @@ class BasicTestCase(unittest.TestCase):
|
|||||||
comic = Comic('https://viz.com')
|
comic = Comic('https://viz.com')
|
||||||
|
|
||||||
self.assertTrue('Unsupported' in str(condition.exception))
|
self.assertTrue('Unsupported' in str(condition.exception))
|
||||||
|
|
||||||
|
self.remove_queue.append(os.path.join(library_path, f'comics/{self.comic.title}'))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user