169 lines
5.7 KiB
Python
169 lines
5.7 KiB
Python
import os
import re
import shutil
import tempfile
import urllib
import urllib.request

from yoink.common import required_comic_files, skippable_images, library_path
from yoink.scraper import Scrapable
|
|
|
|
|
|
|
|
class Comic(Scrapable):
    """A single comic issue scraped from a web page.

    The class reads ``self.url`` and ``self.soup``; both are expected to be
    set up by ``Scrapable.__init__`` (defined outside this file - confirm
    there). Every instance owns a ``ComicArchiver`` in ``self.archiver``.
    """

    def __init__(self, url, path=None) -> None:
        """Create the comic for *url*.

        Args:
            url: Address of the page describing the issue.
            path: Optional directory handed to the archiver as its library;
                when omitted the archiver picks its own default location.
        """
        super().__init__(url)
        self.archiver = ComicArchiver(self, library=path)

    def __is_supported_image(self, image):
        """Return True when the file name *image* ends in .jpg or .jpeg."""
        # str.endswith needs a tuple to test several suffixes; the expression
        # ``'.jpg' or '.jpeg'`` collapses to just '.jpg'.
        return image.endswith(('.jpg', '.jpeg'))

    def __get_image_src(self, comic):
        """Return the image URL carried by the tag *comic*, or None.

        The tag's own ``src``, ``data-src`` and ``content`` attributes are
        tried in that order (``content`` covers the ``meta`` tags selected by
        the 'dbsuper' query). Container tags such as ``div.separator`` carry
        none of these, so the first nested ``img`` with a source is used
        instead.
        """
        own_attrs = getattr(comic, 'attrs', None) or {}
        for key in ('src', 'data-src', 'content'):
            if key in own_attrs:
                return own_attrs[key]

        for image in comic.find_all('img'):
            for key in ('src', 'data-src'):
                if key in image.attrs:
                    return image.attrs[key]

        # Nothing usable on this tag; the caller filters out None.
        return None

    def __parse_soup(self):
        """Return the tags that hold the page images, or an empty list.

        Site-specific queries are tried in a fixed order and the first one
        that matches anything wins. Each query is only executed when all
        earlier ones came back empty.
        """
        # NOTE(review): the 'mangadex' query is a subset of the 'excaliber'
        # query (all <img> tags), so it can never be the first non-empty
        # result. The order is kept as it was; confirm the intended priority.
        queries = (
            # default
            lambda: self.soup.find_all('div', class_='separator'),
            # no-div
            lambda: self.soup.find_all('img', attrs={'width': '1000px'}),
            # excaliber
            lambda: self.soup.find_all('img'),
            # dbsuper
            lambda: self.soup.find_all('meta', attrs={'property': 'twitter:image'}),
            # mangadex
            lambda: self.soup.find_all('img', attrs={'draggable': 'false'}),
        )

        for run_query in queries:
            matches = run_query()
            if matches:
                return matches

        # No layout matched: give callers something iterable.
        return []

    @property
    def filelist(self):
        """List of image URLs for this issue, without skippable images.

        Tags without any recognisable image source are left out.
        """
        sources = (self.__get_image_src(tag) for tag in self.__parse_soup())
        return [
            source for source in sources
            if source and not source.endswith(skippable_images)
        ]

    @property
    def title(self):
        """Title of the issue derived from the page, or 'Uncategorized'.

        Only 'readallcomics' and 'mangadex' URLs are recognised; the site
        specific boilerplate is stripped from the page title.
        """
        if 'readallcomics' in self.url:
            cleaned = self.soup.title.string
            for unwanted in (' | Read All Comics Online For Free', '…', '#', ':'):
                cleaned = cleaned.replace(unwanted, '')
            return cleaned.strip()

        if 'mangadex' in self.url:
            og_title = self.soup.find('meta', property='og:title').attrs['content']
            return og_title.replace(' - Mangadex', '').replace('Read ', '')

        return 'Uncategorized'

    @property
    def category(self):
        """Text of the page's ``rel="category tag"`` link, or None if absent."""
        link = self.soup.find('a', attrs={'rel': 'category tag'})
        return link.text if link is not None else None

    @property
    def series_list(self) -> list:
        """Placeholder: always returns a new empty list."""
        return []

    @property
    def issue_number(self) -> int:
        """Issue number parsed from the title, defaulting to 1.

        The number is the run of digits that directly precedes a year in
        parentheses, e.g. ``'Static Season One 12 (2021)'`` gives 12. Titles
        without that pattern give 1.
        """
        # Raw string avoids invalid-escape warnings; capturing the whole digit
        # run keeps multi-digit issues intact and never feeds int() a letter.
        found = re.search(r'(\d+)\s*\([12]\d{3}\)', self.title)
        return int(found.group(1)) if found else 1

    @property
    def volume(self) -> int:
        """Placeholder: volume detection is not implemented, returns None."""
        return None

    def __adjacent_issue_url(self, label):
        """Return the href of the link wrapping the image titled *label*, or None."""
        image = self.soup.find('img', attrs={'title': label})
        link = image.parent if image is not None else None
        if link is None:
            return None
        # A missing or empty href is reported as None as well.
        return link.attrs.get('href') or None

    @property
    def next(self):
        """URL of the next comic in the series, or None if this is the latest."""
        return self.__adjacent_issue_url('Next Issue')

    @property
    def prev(self):
        """URL of the previous comic in the series, or None if this is the first."""
        return self.__adjacent_issue_url('Previous Issue')

    def can_remove(self, filename):
        """Return True unless *filename* ends with one of ``required_comic_files``."""
        return not filename.endswith(required_comic_files)
|
|
|
|
|
|
class ComicArchiver:
    """Downloads a comic's pages into a working directory and archives them."""

    def __init__(self, comic: "Comic", library=None) -> None:
        """Bind the archiver to *comic*.

        Args:
            comic: Object exposing ``title`` and ``filelist``.
            library: Directory to work in. When falsy, the directory
                ``comics/<title>`` below ``library_path`` is used instead.
        """
        self.comic = comic
        self.worktree = library if library else os.path.join(library_path, f'comics/{self.comic.title}')
        self.queue = []

    def add(self, link):
        """Append *link* to the archiver's queue."""
        self.queue.append(link)

    def download(self):
        """Fetch every URL in the comic's file list into the worktree.

        Pages are numbered by their zero-padded position in the list. Note
        that a ``Mozilla/5.0`` opener is installed process-wide through
        ``urllib.request.install_opener``.
        """
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(self.worktree, mode=0o777, exist_ok=True)

        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent', 'Mozilla/5.0')]
        urllib.request.install_opener(opener)

        # The title is a computed property; look it up once, not per page.
        title = self.comic.title

        for index, url in enumerate(self.comic.filelist):
            page_number = str(index).zfill(3)

            # Both historical naming schemes are preserved so existing
            # libraries keep their file names. URLs ending in '.jpg' always
            # have the extension 'jpg', so it no longer has to be parsed out
            # of the URL (which went wrong for names with several dots).
            if url.endswith('.jpg'):
                page_name = f'{title} - {page_number}.jpg'
            else:
                page_name = f'{title} {page_number}.jpg'

            destination = os.path.join(self.worktree, page_name)
            print(destination, end='\r')
            urllib.request.urlretrieve(url, filename=destination)
        print()

    def generate_archive(self, archive_format='.cbr'):
        """Zip the worktree's contents into ``<title><archive_format>``.

        The archive is written into the worktree itself; nothing happens if
        it already exists. Entries are stored relative to the worktree, so
        pages sit at the top level of the archive.

        Args:
            archive_format: File extension for the result. The payload is
                always a zip file regardless of the extension chosen.
        """
        title = self.comic.title
        destination = os.path.join(self.worktree, f'{title}{archive_format}')
        if os.path.exists(destination):
            return

        # Build outside the worktree so the archive cannot include itself,
        # and outside the current directory so nothing is left behind there.
        with tempfile.TemporaryDirectory() as staging:
            built = shutil.make_archive(
                os.path.join(staging, title), 'zip', root_dir=self.worktree
            )
            # shutil.move also works across file systems, unlike os.rename
            # (OSError: [Errno 18] Invalid cross-device link).
            shutil.move(built, destination)

    def cleanup_worktree(self):
        """Delete every worktree entry not ending with ``required_comic_files``."""
        for entry in os.listdir(self.worktree):
            if not entry.endswith(required_comic_files):
                os.remove(os.path.join(self.worktree, entry))
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: print the neighbouring issue links and the parsed
    # issue number of one page. Swap in one of the alternatives below to
    # exercise other cases; only one Comic is built per run.
    # comic = Comic('http://www.readallcomics.com/static-season-one-4-2021/')  # all links
    # comic = Comic('http://readallcomics.com/static-season-one-001-2021/')  # no prev link
    # comic = Comic('http://readallcomics.com/static-season-one-6-2022/')  # no next link
    # comic = Comic('http://readallcomics.com/captain-marvel-vs-rogue-2021-part-1/')
    comic = Comic('http://readallcomics.com/superman-vs-lobo-4-2022/')

    print(comic.next)
    print(comic.prev)
    print(comic.issue_number)