diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..c7950c5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,164 @@
+.vscode
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
\ No newline at end of file
diff --git a/bild/selenium_imp/.gitignore b/bild/selenium_imp/.gitignore
new file mode 100644
index 0000000..1bf153c
--- /dev/null
+++ b/bild/selenium_imp/.gitignore
@@ -0,0 +1,4 @@
+*.csv
+*.txt
+*.dump
+*.json
\ No newline at end of file
diff --git a/bild/selenium_imp/archive_downloader.py b/bild/selenium_imp/archive_downloader.py
new file mode 100644
index 0000000..2ae163c
--- /dev/null
+++ b/bild/selenium_imp/archive_downloader.py
@@ -0,0 +1,11 @@
+from bild_article_classes import ArticleCollection
+import datetime
+
+
+def main():
+    ac = ArticleCollection(min_date=datetime.datetime(year=2022, month=1, day=1))
+    # ac = ArticleCollection(min_date=datetime.datetime.now())
+    ac.collect()
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/bild/selenium_imp/bild_article_classes.py b/bild/selenium_imp/bild_article_classes.py
new file mode 100644
index 0000000..03bf5ce
--- /dev/null
+++ b/bild/selenium_imp/bild_article_classes.py
@@ -0,0 +1,620 @@
+import contextlib
+import copy
+import datetime
+import hashlib
+import os
+import pickle
+import random
+from itertools import islice
+from typing import Union
+from urllib.parse import urlparse
+
+import dateutil.parser
+import dateutil.tz
+import requests
+from requests.adapters import HTTPAdapter
+from urllib3.util.retry import Retry
+from urllib3.exceptions import MaxRetryError
+from bs4 import BeautifulSoup, Comment, NavigableString
+from bs4.element import PageElement
+
+from util import ANSICodes as AC, failhandler, link as lk
+
+
+class ArticleBaseClass:
+    _default_args = {
+        'cache': f'{os.getcwd()}/.cache',
+        'debug': False
+    }
+
+    def __init__(self, **kwargs):
+        if getattr(self, '_isinit', False):
+            return
+
+        kwargs = dict(list(ArticleBaseClass._default_args.items())+list(kwargs.items()))
+        if diff := set(kwargs.keys()).difference(ArticleBaseClass._default_args.keys()):
+            raise ValueError(f"keyword{'s' if len(diff) > 1 else ''} {', '.join(diff)} unknown. supported: {', '.join(self._default_args)}")
+        self.cache = kwargs.get('cache')
+        self._debug = kwargs.get('debug')
+
+        if self.cache:
+            if isinstance(self.cache, bool):
+                self.cache = ArticleBaseClass._default_args['cache']
+            os.makedirs(self.cache, exist_ok=True)
+
+        # self.get_page = file_cache(cache_dir=self.cache, verbose=self._debug)(self.get_page)
+        self._isinit = True
+
+    def update_target_from_source(self, target: dict, source: dict):
+        for k, v in target.items():
+            if isinstance(v, dict):
+                if isinstance(sk := source.get(k), dict):
+                    self.update_target_from_source(v, sk)
+            else:
+                # keep the existing value if the source has no entry for k
+                target[k] = source.get(k, v)
+
+    def add_debug(self, target):
+        if isinstance(target, dict):
+            target['debug'] = self._debug
+            for _, v in target.items():
+                if isinstance(v, dict):
+                    self.add_debug(v)
+
+    # @file_cache(cache_dir=self.cache)
+    def get_page(self, link):
+        def _get_page(link):
+            with self.get_session() as s:
+                page = s.get(link)
+            return page
+
+        if self.cache:
+            # use a fresh hash per call so the same link always maps to the
+            # same cache file
+            fname = hashlib.sha256(link.encode()).hexdigest()
+            try:
+                with open(f"{self.cache.rstrip('/')}/{fname}", 'rb') as f:
+                    # print(' -> cache hit!')
+                    page = pickle.load(f)
+            except FileNotFoundError:
+                # print(' -> not yet in cache')
+                page = _get_page(link)
+                with open(f"{self.cache.rstrip('/')}/{fname}", 'wb') as f:
+                    pickle.dump(page, f)
+        else:
+            page = _get_page(link)
+        return page
+
+    def get_session(self):
+        local_session = getattr(self, 'session', None) or requests.Session()
+        retry = Retry(connect=self._http_retries, backoff_factor=0.5)
+        adapter = HTTPAdapter(max_retries=retry)
+        local_session.mount('https://', adapter)
+        local_session.mount('http://', adapter)
+        return local_session
+
+    def close_session(self, session=None):
+        if session is None:
+            if self.session is not None:
+                self.session.close()
+        else:
+            session.close()
+
+    ...
+
+############
+class ArticleTitle:
+    _default_args = {
+        'debug': False}
+
+    def __init__(self, title:str='', suptitle:str='', **kwargs) -> None:
+        self._debug = kwargs.get('debug', self._default_args['debug'])
+
+        self.title = ' '.join(title.strip().splitlines())
+        self.suptitle = ' '.join(suptitle.strip().splitlines())
+
+    def __repr__(self) -> str:
+        return f'{self.title}'
+
+    def __str__(self) -> str:
+        return f'({self.suptitle}) {self.title}'
+
+    ...
+
+############
+class ArticleDepartment:
+    _default_args = {
+        'max_link_departments': 5,
+        'debug': False}
+
+    def __init__(self, department:str='', link:str='', **kwargs) -> None:
+        self._debug = kwargs.get('debug', self._default_args['debug'])
+        self._max_link_departments = kwargs.get('max_link_departments', self._default_args['max_link_departments'])
+
+        self.department = ' '.join(department.strip().splitlines())
+
+        # get departments from split url [example.com, ressort-1, ressort-2, ...]
+        self.departments_link = urlparse(link).path.split('/')[1:-1]
+
+        # generate link string
+        self._link_str = ' > '.join(self.departments_link)
+
+        # pad to max_link_departments
+        self.departments_link = (self.departments_link+self._max_link_departments*[''])[:self._max_link_departments]
+
+    def __repr__(self) -> str:
+        return f'{self.department}'
+
+    def __str__(self) -> str:
+        return f'{self.department} ({self._link_str})'
+
+    ...
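+
+# A minimal usage sketch for the value classes above (the link is a made-up
+# example): ArticleDepartment splits the URL path into section segments and
+# pads them to max_link_departments.
+#
+#   dep = ArticleDepartment('Politik', 'https://www.bild.de/politik/inland/beispiel-123.bild.html')
+#   dep.departments_link  # -> ['politik', 'inland', '', '', '']
+#   str(dep)              # -> 'Politik (politik > inland)'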
+
+############
+class ArticleMetadata:
+    _default_args = {
+        'department': ArticleDepartment._default_args,
+        'title': ArticleTitle._default_args,
+        'datetime_fmt': '%Y-%m-%d %H:%M:%S',
+        'debug': False}
+
+    def __init__(self, html:Union[PageElement, None]=None, base_url:str='example.com', date:Union[datetime.datetime, None]=None, **kwargs):
+        self._debug = kwargs.get('debug', self._default_args['debug'])
+        self._datetime_fmt = kwargs.get('datetime_fmt', self._default_args['datetime_fmt'])
+
+        # deep-copy the defaults so per-instance updates don't mutate the
+        # shared class-level dicts
+        self._title_kwargs = copy.deepcopy(self._default_args['title'])
+        if title_args := kwargs.get('title'):
+            self.update_target_from_source(self._title_kwargs, title_args)
+        self._add_debug(self._title_kwargs)
+
+        self._department_kwargs = copy.deepcopy(self._default_args['department'])
+        if department_args := kwargs.get('department'):
+            self.update_target_from_source(self._department_kwargs, department_args)
+        self._add_debug(self._department_kwargs)
+
+        self.page = None
+
+        self.base_url = base_url
+        if html is None:
+            self.create_empty()
+        else:
+            self.authors = None
+            self.parse_html(html, date)
+
+    def update_target_from_source(self, target: dict, source: dict):
+        for k, v in target.items():
+            if isinstance(v, dict):
+                if isinstance(sk := source.get(k), dict):
+                    self.update_target_from_source(v, sk)
+            else:
+                # keep the existing value if the source has no entry for k
+                target[k] = source.get(k, v)
+
+    def _add_debug(self, target):
+        if isinstance(target, dict):
+            target['debug'] = self._debug
+            for _, v in target.items():
+                if isinstance(v, dict):
+                    self._add_debug(v)
+
+    def create_empty(self):
+        self.link = ''
+        self.time = datetime.datetime.min
+        self.title = ArticleTitle()
+        self.department = ArticleDepartment()
+        self.authors = None
+
+    def parse_html(self, html:PageElement, date:Union[datetime.datetime, None]):
+        try:
+            href = html.find('a', {'class': 'stage-feed-item__link'}).attrs['href']
+            self.link = self.base_url+href
+        except (AttributeError, KeyError):
+            self.link = ''
+
+        try:
+            datestring = html.find('time').attrs['datetime']
+            self.time = dateutil.parser.parse(datestring).astimezone(dateutil.tz.tzlocal())
+        except (AttributeError, KeyError):
+            # no <time> tag found; fall back to the archive date, if given
+            self.time = date.astimezone(dateutil.tz.tzlocal()) if date else datetime.datetime.min
+
+        try:
+            title = html.find('span', {'class': 'stage-feed-item__headline'}).contents[0]
+        except (AttributeError, IndexError):
+            title = ''
+
+        try:
+            suptitle = html.find('span', {'class': 'stage-feed-item__kicker'}).contents[0]
+        except (AttributeError, IndexError):
+            suptitle = ''
+
+        self.title = ArticleTitle(title, suptitle, **self._title_kwargs)
+
+        try:
+            department = html.find('span', {'class': 'stage-feed-item__channel'}).contents[0]
+        except (AttributeError, IndexError):
+            department = ''
+
+        self.department = ArticleDepartment(department, self.link, **self._department_kwargs)
+
+    def csv_line(self, delimiter:str=',', quote_char:str='"', newline=True):
+        def _quote(s:str):
+            return f'{quote_char}{s}{quote_char}'
+
+        elements = [
+            self.time.strftime('%Y-%m-%d') if self.time else '0000-00-00',
+            self.time.strftime('%H:%M:%S') if self.time else '00:00:00',
+            # self.time.strftime('%Y') if self.time else '00',
+            # self.time.strftime('%m') if self.time else '00',
+            # self.time.strftime('%d') if self.time else '00',
+            # self.time.strftime('%H') if self.time else '00',
+            # self.time.strftime('%M') if self.time else '00',
+            # self.time.strftime('%S') if self.time else '00',
+            _quote(self.title.title if self.title else ''),
+            _quote(self.title.suptitle if self.title else ''),
+            _quote(self.department.department if self.department else ''),
+            *[_quote(str(dep)) for dep in (self.department.departments_link if self.department else ['']*self._department_kwargs['max_link_departments'])],
+            _quote(self.link or ''),
+            str(self.page.status_code) if self.page else '']
+
+        return delimiter.join(elements) + ('\n' if newline else '')
+
+    def __repr__(self):
+        return f'{self.title.title} ({self.time.strftime(self._datetime_fmt)})'
+
+    def __str__(self):
+        return (
+            f'{self.title.suptitle}\n'
+            f'{self.title.title}\n'
+            f'{self.department.department}\n'
+            f'{self.department._link_str}\n'
+            f'{self.time.strftime(self._datetime_fmt)}\n'
+            f'{self.link}'
+        )
+
+    ...
+
+############
+class Article(ArticleBaseClass):
+    _default_args = {
+        'http_retries': 3,
+        'meta': ArticleMetadata._default_args,
+        'debug': False,
+        'full_text_exclude': [
+            ('aside', {'class': 'related-topics'}),
+            ('figure', {}),
+            ('div', {'class': 'ad-info'}),
+            ('div', {'class': 'float-container'}),
+            ('a', {'class': ['text-link--external', 'text-link']}),
+        ]}
+
+    def __init__(self, *, link:Union[str, None]=None, metadata:Union[ArticleMetadata, None]=None, session=None, **kwargs):
+        super().__init__()
+        self._debug = kwargs.get('debug', self._default_args['debug'])
+        self._http_retries = kwargs.get('http_retries', self._default_args['http_retries'])
+        self._meta_kwargs = copy.deepcopy(self._default_args['meta'])
+        if meta_args := kwargs.get('meta'):
+            self.update_target_from_source(self._meta_kwargs, meta_args)
+        self.add_debug(self._meta_kwargs)
+        self.full_text_exclude = kwargs.get('full_text_exclude', self._default_args['full_text_exclude'])
+
+        self.session = session
+        self.meta = metadata or ArticleMetadata(**self._meta_kwargs)
+        self.meta.link = link or self.meta.link
+        self.full_text = None
+
+        self.parse_page(self.meta.link)
+
+    # parsers
+    def parse_page(self, link):
+        self.meta.page = self.get_page(link)
+
+        soupy_page = BeautifulSoup(self.meta.page.content, 'html.parser')
+
+        if article := soupy_page.find('article'):
+            self.parse_article(article)
+
+        if error_page := soupy_page.find('div', {'class': 'error-page'}):
+            self.parse_error_page(error_page)
+
+    def parse_error_page(self, error_page):
+        with contextlib.suppress(AttributeError):
+            wrapper = error_page.find('div', {'class': 'error-page__wrapper'})
+            self.full_text = self.get_fulltext(wrapper, exclude=('a',))
+
+    def parse_article(self, article):
+        with contextlib.suppress(AttributeError):
+            self.meta.title.title = self.get_fulltext(article.find('span', {'class': 'document-title__headline'}))
+        with contextlib.suppress(AttributeError):
+            self.meta.title.suptitle = self.get_fulltext(article.find('span', {'class': 'document-title__kicker'}))
+
+        with contextlib.suppress(AttributeError):
+            if article.find('div', {'class': 'author'}):
+                self.meta.authors = [self.get_fulltext(article.find('span', {'class': 'author__name'}))]
+            elif article.find('div', {'class': 'authors'}):
+                authors = article.find_all('div', {'class': 'article_author__details'})
+                self.meta.authors = [self.get_fulltext(details) for details in authors]
+
+        with contextlib.suppress(AttributeError, KeyError):
+            if date := article.find('time', {'class': ['datetime']}):
+                datestring = date.attrs['datetime']
+                self.meta.time = dateutil.parser.parse(datestring).astimezone(dateutil.tz.tzlocal())
+
+        with contextlib.suppress(AttributeError):
+            body = article.find_all('div', {'class': 'article-body'})
+            self.full_text = self.get_fulltext(body)
+
+    def _clean_exclude_list(self, excludes):
+        if excludes is None:
+            return excludes
+        excl_names = []
+        excl_attrs = []
+        for excl in excludes:
+            if isinstance(excl, (list, tuple)):
+                excl_names.append(excl[0])
+                try:
+                    local_attr = {
+                        k: v if isinstance(v, (list, tuple)) else [v]
+                        for k, v in excl[1].items()
+                    }
+                    excl_attrs.append(local_attr)
+                except (KeyError, IndexError):
+                    excl_attrs.append({})
+            else:
+                excl_names.append(excl)
+                excl_attrs.append({})
+
+        return list(zip(excl_names, excl_attrs))
+        # return excl_names,excl_attrs
+
+    def skip_element(self, elm, excludes):
+        if isinstance(elm, Comment):
+            return True
+        if excludes is None:
+            return False
+        for excl_name, excl_attr in excludes:
+            if elm.name == excl_name:
+                if not excl_attr:
+                    return True
+                for k, v in excl_attr.items():
+                    with contextlib.suppress(KeyError):
+                        if elm.attrs[k] == v:
+                            return True
+        return False
+
+    def get_fulltext(self, html:Union[PageElement, list], exclude:Union[list, None]=None, sep:str=' '):
+        if html is None:
+            return ''
+
+        if exclude is not None:
+            exclude = self._clean_exclude_list(tuple(exclude))
+        else:
+            # normalize the default list too, so attribute values are lists
+            # as skip_element expects (cleaning is idempotent on recursion)
+            exclude = self._clean_exclude_list(self.full_text_exclude)
+
+        local_elems = []
+        for elm in html:
+            if self.skip_element(elm, exclude):
+                continue
+            if isinstance(elm, NavigableString):
+                local_elems.append(elm)
+            elif isinstance(elm, PageElement):
+                local_elem = self.get_fulltext(elm, exclude=exclude, sep=sep)
+                local_elems.append(local_elem)
+
+        return sep.join(local_elems).strip()
+
+    # util
+    def to_csv_line(self, delimiter:str=',', quote_char:str='"', newline=True):
+        def _quote(s:str):
+            return f'{quote_char}{s}{quote_char}'
+
+        line = delimiter.join((
+            self.meta.csv_line(delimiter=delimiter, quote_char=quote_char, newline=False),
+            _quote(' '.join(self.full_text.splitlines())) if self.full_text else '')
+        ) + ('\n' if newline else '')
+
+        return line
+
+    def __repr__(self):
+        department = self.meta.department.department if self.meta.department else ''
+        title = self.meta.title.title if self.meta.title else ''
+        full_text = self.full_text or ''
+        datestr = self.meta.time.strftime('%d.%m.%Y %H:%M:%S') if self.meta.time else ''
+        return f'[{department}] {title} ({datestr}): {full_text[:100]}...'
+
+    def __str__(self) -> str:
+        return (
+            f"{self.meta.title.suptitle if self.meta.title else ''}\n"
+            f"{self.meta.title.title if self.meta.title else ''}\n"
+            f"{self.meta.department.department if self.meta.department else ''}\n"
+            f"{self.meta.department._link_str if self.meta.department else ''}\n"
+            f"{self.meta.time.strftime('%d.%m.%Y %H:%M:%S') if self.meta.time else ''}\n"
+            f"{self.meta.link or ''} {[self.meta.page.status_code] if self.meta.page else ''}\n"
+            f"{self.full_text or ''}\n"
+        )
+    ...
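+
+# Usage sketch for Article (the link is hypothetical; a prepared
+# ArticleMetadata can also be passed in, as ArticleCollection does below):
+#
+#   art = Article(link='https://www.bild.de/politik/inland/beispiel-123.bild.html')
+#   print(art.meta.title.title)
+#   print(art.to_csv_line(delimiter=';'))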
+
+############
+class ArticleCollection(ArticleBaseClass):
+    _default_args = {
+        'min_date': datetime.datetime(year=2006, month=1, day=6),
+        'max_date': datetime.datetime.now(),  # evaluated once, at import time
+        'random': True,
+        'out_file': 'out.csv',
+        'out_file_mode': 'new',
+        'out_file_header': 'date,time,title,suptitle,department,[link_departments],link,http status code,full text',
+        'failed_file': 'failed.txt',
+        'http_retries': 5,
+        'retries': 2,
+        'base_link': 'https://www.bild.de/themen/uebersicht/archiv/archiv-82532020.bild.html?archiveDate=',
+        'link_time_format': '%Y-%m-%d',
+        'article_args': Article._default_args,
+        'debug': False,
+    }
+
+    _file_modes_overwrite = ('new', 'overwrite', 'write', 'w')
+    _file_modes_append = ('append', 'a')
+    _file_modes = (*_file_modes_overwrite, *_file_modes_append)
+
+    def __init__(self, session:Union[requests.Session, None]=None, **kwargs):
+        self._debug = kwargs.get('debug', self._default_args['debug'])
+        super().__init__(debug=self._debug)
+
+        self._min_date = kwargs.get('min_date', self._default_args['min_date'])
+        self._max_date = kwargs.get('max_date', self._default_args['max_date'])
+        self._max_date = self._max_date.date()
+        self._min_date = self._min_date.date()
+
+        self._random = kwargs.get('random', self._default_args['random'])
+
+        self._article_args = copy.deepcopy(self._default_args['article_args'])
+        if article_args := kwargs.get('article_args'):
+            self.update_target_from_source(self._article_args, article_args)
+        self.add_debug(self._article_args)
+
+        self._out_file = kwargs.get('out_file', self._default_args['out_file'])
+        self._out_file_mode = kwargs.get('out_file_mode', self._default_args['out_file_mode'])
+        if self._out_file_mode not in self._file_modes:
+            raise ValueError(f"file mode {self._out_file_mode} unknown. supported: [{', '.join(self._file_modes)}]")
+
+        self._out_file_header = kwargs.get('out_file_header', self._default_args['out_file_header'])
+        max_link_departments = self._article_args.get('meta', {}).get('department', {}).get('max_link_departments', self._default_args['article_args']['meta']['department']['max_link_departments'])
+        link_dep_strings = [f'department from link {i}' for i in range(max_link_departments)]
+        self._out_file_header = self._out_file_header.replace('[link_departments]', ','.join(link_dep_strings))
+
+        self._failed_file = kwargs.get('failed_file', self._default_args['failed_file'])
+        self._http_retries = kwargs.get('http_retries', self._default_args['http_retries'])
+        self._retries = kwargs.get('retries', self._default_args['retries'])
+        self._base_link = kwargs.get('base_link', self._default_args['base_link'])
+        self._link_time_format = kwargs.get('link_time_format', self._default_args['link_time_format'])
+
+        self.prepare_dates()
+        self.prepare_files()
+
+        self.articles = []
+        self.article_metas = []
+        self.session = session
+
+        # route failed downloads to the failed-file instead of raising
+        self.get_page = failhandler(callback=self.write_failed_to_file)(super().get_page)
+
+    def prepare_dates(self):
+        self.dates = [self._max_date - datetime.timedelta(days=x) for x in range((self._max_date - self._min_date).days+1)]
+        if self._random:
+            random.shuffle(self.dates)
+
+    def collect(self):
+        self.session = self.get_session()
+
+        print(f'Collecting article metadata from archive pages for {len(self.dates)} days')
+        for i, date in enumerate(self.dates):
+            link = self.build_archive_link(date)
+            self.print_date(date, link, prefix=f'Date {i+1:>{len(str(len(self.dates)))}}/{len(self.dates)} ')
+            self.process_archive_page(link)
+
+        print()
+        print(f'Collecting fulltext for {len(self.article_metas)} articles')
+        self.get_fulltexts()
+
+        self.close_session()
+
+    def build_archive_link(self, date):
+        return f'{self._base_link}{date.strftime(self._link_time_format)}'
+
+    def print_date(self, date:datetime.datetime, link:Union[str, None]=None, fmt:Union[str, None]=None, prefix:Union[str, None]=None, suffix:Union[str, None]=None):
+        if fmt is None:
+            fmt = self._link_time_format
+        print(f"{prefix or ''}{AC.UNDERLINE}{lk(link, date.strftime(fmt)) if link else date.strftime(fmt)}{AC.DEFAULT}{suffix or ''}")
+
+    def prepare_files(self):
+        if self._out_file_mode in self._file_modes:
+            if self._out_file_mode in self._file_modes_overwrite and self._out_file:
+                with open(self._out_file, 'w') as f:
+                    f.write(self._out_file_header.strip()+'\n')
+            elif self._out_file_mode in self._file_modes_append and self._out_file:
+                ...
+        else:
+            raise ValueError(f"file mode '{self._out_file_mode}' not supported. supported: {self._file_modes}")
+        if self._failed_file:
+            with open(self._failed_file, 'w') as f:
+                f.write('')
+
+    def process_archive_page(self, link):
+        page = self.get_page(link)
+        if page is None:  # get_page is wrapped by failhandler and may return None
+            return
+        soupy_page = BeautifulSoup(page.content, 'html.parser')
+        articles_html = soupy_page.find_all("article", {"class": "stage-feed-item"})
+        slice_args = (None, 3, None) if self._debug else (None, None, 1)
+
+        for article_html in islice(articles_html, *slice_args):  # debugging
+            article_metadata = ArticleMetadata(article_html, 'https://www.bild.de', **self._article_args.get('meta', {}))
+            self.print_article_metadata(article_metadata)
+            # save metadata
+            self.article_metas.append(article_metadata)
+
+    def get_fulltexts(self):
+        if self._random:
+            random.shuffle(self.article_metas)
+        for i, article_metadata in enumerate(self.article_metas):
+            self.print_article_metadata(article_metadata, prefix=f'{i+1:>{len(str(len(self.article_metas)))}}/{len(self.article_metas)} ')
+            self.process_article_from_meta(article_metadata)
+
+    def process_article_from_meta(self, article_metadata):
+        try:
+            art = Article(metadata=article_metadata, session=self.session, **self._article_args)
+            self.articles.append(art)
+            if self._out_file:
+                with open(self._out_file, 'a') as f:
+                    f.write(art.to_csv_line())
+        except (MaxRetryError, requests.exceptions.ConnectionError) as e:
+            if self._debug:
+                print(e)
+            self.write_failed_to_file(e, article_metadata)
+
+    def print_article_metadata(self, metadata, *, date_fmt=None, time_fmt=None, prefix:Union[str, None]=None, suffix:Union[str, None]=None):
+        if date_fmt is None:
+            date_fmt = self._link_time_format
+        if time_fmt is None:
+            time_fmt = '%H:%M:%S'
+        datetime_fmt = f'{date_fmt} {time_fmt}'
+
+        link = metadata.link or ''
+
+        timestr = (
+            AC.FG_BRIGHT_GREY
+            + metadata.time.strftime(datetime_fmt)
+            + AC.DEFAULT
+        ) if metadata.time else ''
+
+        suptitle = (
+            AC.FG_BLUE
+            + metadata.title.suptitle
+            + AC.DEFAULT
+        ) if metadata.title.suptitle else ''
+
+        title = (
+            AC.STYLE_BOLD + AC.FG_BRIGHT_PURPLE
+            + metadata.title.title
+            + AC.DEFAULT
+        ) if metadata.title.title else ''
+
+        error_string = (
+            AC.STYLE_BOLD + AC.FG_BRIGHT_RED
+            + f'[{metadata.page.status_code}]'
+            + AC.DEFAULT
+        ) if (metadata.page and metadata.page.status_code != 200) else ''
+
+        print(f"{prefix or ''}{timestr} {error_string}({suptitle}) {lk(link, title) if link else title}{suffix or ''}")
+
+    def write_failed_to_file(self, exception, elem):
+        with open(self._failed_file, 'a') as f:
+            if isinstance(elem, ArticleMetadata):
+                f.write(f'{elem.link}, "{exception}"\n')
+            elif isinstance(elem, str):
+                f.write(f'{elem}, "{exception}"\n')
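+
+# Usage sketch for ArticleCollection (dates and file names are illustrative):
+# crawl the archive for June 2023 and append the results to an existing CSV.
+#
+#   ac = ArticleCollection(min_date=datetime.datetime(2023, 6, 1),
+#                          max_date=datetime.datetime(2023, 6, 30),
+#                          out_file='june.csv', out_file_mode='append')
+#   ac.collect()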
diff --git a/bild/selenium_imp/scratch.py b/bild/selenium_imp/scratch.py
new file mode 100644
index 0000000..f0374d3
--- /dev/null
+++ b/bild/selenium_imp/scratch.py
@@ -0,0 +1,88 @@
+def print_format_table():
+    """
+    prints table of formatted text format options
+    """
+    # SGR arguments:
+    # n        Name                                 Note
+    # 0        Reset or normal                      All attributes become turned off
+    # 1        Bold or increased intensity          As with faint, the color change is a PC (SCO / CGA) invention.
+    # 2        Faint, decreased intensity, or dim   May be implemented as a light font weight like bold.
+    # 3        Italic                               Not widely supported. Sometimes treated as inverse or blink.
+    # 4        Underline                            Style extensions exist for Kitty, VTE, mintty, iTerm2 and Konsole.
+    # 5        Slow blink                           Sets blinking to less than 150 times per minute
+    # 6        Rapid blink                          MS-DOS ANSI.SYS, 150+ per minute; not widely supported
+    # 7        Reverse video or invert              Swap foreground and background colors; inconsistent emulation
+    # 8        Conceal or hide                      Not widely supported.
+    # 9        Crossed-out, or strike               Characters legible but marked as if for deletion. Not supported in Terminal.app.
+    # 10       Primary (default) font
+    # 11–19    Alternative font                     Select alternative font n − 10
+    # 20       Fraktur (Gothic)                     Rarely supported
+    # 21       Doubly underlined; or: not bold      Double-underline per ECMA-48, but instead disables bold intensity on several terminals, including in the Linux kernel's console before version 4.17.
+    # 22       Normal intensity                     Neither bold nor faint; color changes where intensity is implemented as such.
+    # 23       Neither italic, nor blackletter
+    # 24       Not underlined                       Neither singly nor doubly underlined
+    # 25       Not blinking                         Turn blinking off
+    # 26       Proportional spacing                 ITU T.61 and T.416, not known to be used on terminals
+    # 27       Not reversed
+    # 28       Reveal                               Not concealed
+    # 29       Not crossed out
+    # 30–37    Set foreground color
+    # 38       Set foreground color                 Next arguments are 5;n or 2;r;g;b
+    # 39       Default foreground color             Implementation defined (according to standard)
+    # 40–47    Set background color
+    # 48       Set background color                 Next arguments are 5;n or 2;r;g;b
+    # 49       Default background color             Implementation defined (according to standard)
+    # 50       Disable proportional spacing         T.61 and T.416
+    # 51       Framed                               Implemented as "emoji variation selector" in mintty.
+    # 52       Encircled
+    # 53       Overlined                            Not supported in Terminal.app
+    # 54       Neither framed nor encircled
+    # 55       Not overlined
+    # 58       Set underline color                  Not in standard; implemented in Kitty, VTE, mintty, and iTerm2. Next arguments are 5;n or 2;r;g;b.
+    # 59       Default underline color              Not in standard; implemented in Kitty, VTE, mintty, and iTerm2.
+    # 60       Ideogram underline or right side line    Rarely supported
+    # 61       Ideogram double underline, or double line on the right side
+    # 62       Ideogram overline or left side line
+    # 63       Ideogram double overline, or double line on the left side
+    # 64       Ideogram stress marking
+    # 65       No ideogram attributes               Reset the effects of all of 60–64
+    # 73       Superscript                          Implemented only in mintty
+    # 74       Subscript
+    # 75       Neither superscript nor subscript
+    # 90–97    Set bright foreground color          Not in standard; originally implemented by aixterm
+    # 100–107  Set bright background color
+
+    print(''.join(
+        f"\x1b[0;{command}m\\x1b[{command}m\x1b[m"
+        + ' ' * (3 - len(str(command)))
+        + (' ' if (command + 1) % 18 else '\n')
+        for command in range(108)
+    ))
+
+    supported = (0,   # reset
+                 1,   # bold
+                 2,   # dim
+                 22,  # normal intensity
+                 3,   # italic
+                 23,  # ? neither italic nor blackletter
+                 53,  # overlined
+                 55,  # not overlined
+                 4,   # underline
+                 21,  # dunderline
+                 24,  # ? not underlined
+                 9,   # strike
+                 29,  # not strike
+                 7,   # invert
+                 27,  # not inverted
+                 8,   # hidden
+                 28,  # not hidden
+                 30, 31, 32, 33, 34, 35, 36, 37,  # fg color
+                 38,  # fg color 38;5;n or 38;2;r;g;b
+                 39,  # reset fg color
+                 40, 41, 42, 43, 44, 45, 46, 47,  # bg color
+                 48,  # bg color 48;5;n or 48;2;r;g;b
+                 49,  # reset bg color
+                 90, 91, 92, 93, 94, 95, 96, 97,
+                 100, 101, 102, 103, 104, 105, 106, 107
+                 )
+
+print_format_table()
diff --git a/bild/selenium_imp/util.py b/bild/selenium_imp/util.py
new file mode 100644
index 0000000..b27deae
--- /dev/null
+++ b/bild/selenium_imp/util.py
@@ -0,0 +1,120 @@
+from functools import wraps
+from typing import Union
+import random
+
+
+def link(uri, label=None):
+    if label is None:
+        label = uri
+    parameters = ''
+
+    # OSC 8 ; params ; URI ST <label> OSC 8 ;; ST
+    escape_mask = '\033]8;{};{}\033\\{}\033]8;;\033\\'
+
+    return escape_mask.format(parameters, uri, label)
+
+
+class ANSICodes:
+    DEFAULT = OFF = '\x1b[0m'
+    BOLD = STYLE_BOLD = '\x1b[1m'
+    DIM = STYLE_DIM = '\x1b[2m'
+    REGULAR = STYLE_REGULAR = '\x1b[22m'
+    ITALIC = STYLE_ITALIC = '\x1b[3m'
+    NITALIC = STYLE_NITALIC = '\x1b[23m'
+    UNDERLINE = DECO_UNDERLINE = '\x1b[4m'
+    DUNDERLINE = DECO_DUNDERLINE = '\x1b[21m'
+    NUNDERLINE = DECO_NUNDERLINE = '\x1b[24m'
+    OVERLINE = DECO_OVERLINE = '\x1b[53m'
+    NOVERLINE = DECO_NOVERLINE = '\x1b[55m'
+    INVERT = DECO_INVERT = '\x1b[7m'
+    NINVERT = DECO_NINVERT = '\x1b[27m'
+    HIDDEN = DECO_HIDDEN = '\x1b[8m'
+    NHIDDEN = DECO_NHIDDEN = '\x1b[28m'
+    STRIKE = DECO_STRIKE = '\x1b[9m'
+    NSTRIKE = DECO_NSTRIKE = '\x1b[29m'
+
+    GREY = FG_GREY = '\x1b[30m'
+    RED = FG_RED = '\x1b[31m'
+    GREEN = FG_GREEN = '\x1b[32m'
+    YELLOW = FG_YELLOW = '\x1b[33m'
+    PURPLE = FG_PURPLE = '\x1b[34m'
+    PINK = FG_PINK = '\x1b[35m'
+    BLUE = FG_BLUE = '\x1b[36m'
+    WHITE = FG_WHITE = '\x1b[37m'
+    BRIGHT_GREY = FG_BRIGHT_GREY = '\x1b[90m'
+    BRIGHT_RED = FG_BRIGHT_RED = '\x1b[91m'
+    BRIGHT_GREEN = FG_BRIGHT_GREEN = '\x1b[92m'
+    BRIGHT_YELLOW = FG_BRIGHT_YELLOW = '\x1b[93m'
+    BRIGHT_PURPLE = FG_BRIGHT_PURPLE = '\x1b[94m'
+    BRIGHT_PINK = FG_BRIGHT_PINK = '\x1b[95m'
+    BRIGHT_BLUE = FG_BRIGHT_BLUE = '\x1b[96m'
+    BRIGHT_WHITE = FG_BRIGHT_WHITE = '\x1b[97m'
+
+    BG_GREY = '\x1b[40m'
+    BG_RED = '\x1b[41m'
+    BG_GREEN = '\x1b[42m'
+    BG_YELLOW = '\x1b[43m'
+    BG_PURPLE = '\x1b[44m'
+    BG_PINK = '\x1b[45m'
+    BG_BLUE = '\x1b[46m'
+    BG_WHITE = '\x1b[47m'
+    BG_BRIGHT_GREY = '\x1b[100m'
+    BG_BRIGHT_RED = '\x1b[101m'
+    BG_BRIGHT_GREEN = '\x1b[102m'
+    BG_BRIGHT_YELLOW = '\x1b[103m'
+    BG_BRIGHT_PURPLE = '\x1b[104m'
+    BG_BRIGHT_PINK = '\x1b[105m'
+    BG_BRIGHT_BLUE = '\x1b[106m'
+    BG_BRIGHT_WHITE = '\x1b[107m'
+
+    @staticmethod
+    def FG_CUSTOM_N(n, /):
+        #   0-  7: standard colors (as in ESC [ 30–37 m)
+        #   8- 15: high intensity colors (as in ESC [ 90–97 m)
+        #  16-231: 6 × 6 × 6 cube (216 colors): 16 + 36 × r + 6 × g + b (0 ≤ r, g, b ≤ 5)
+        # 232-255: grayscale from dark to light in 24 steps
+        return f'\x1b[38;5;{n}m'
+
+    @staticmethod
+    def FG_CUSTOM_RGB(r, g, b, /):
+        # r, g, b: 0-255 (truecolor uses 38;2;r;g;b, not 38;5)
+        return f'\x1b[38;2;{r};{g};{b}m'
+
+    @staticmethod
+    def BG_CUSTOM_N(n, /):
+        # same palette layout as FG_CUSTOM_N
+        return f'\x1b[48;5;{n}m'
+
+    @staticmethod
+    def BG_CUSTOM_RGB(r, g, b, /):
+        # r, g, b: 0-255 (truecolor uses 48;2;r;g;b, not 48;5)
+        return f'\x1b[48;2;{r};{g};{b}m'
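+
+# Quick demo of the helpers above (assumes an ANSI-capable terminal; OSC 8
+# hyperlinks additionally need a terminal that supports them):
+#
+#   print(ANSICodes.STYLE_BOLD + ANSICodes.FG_BRIGHT_RED + 'error' + ANSICodes.DEFAULT)
+#   print(ANSICodes.FG_CUSTOM_N(208) + '256-color orange' + ANSICodes.DEFAULT)
+#   print(ANSICodes.FG_CUSTOM_RGB(255, 128, 0) + 'truecolor orange' + ANSICodes.DEFAULT)
+#   print(link('https://example.com', 'click me'))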
+
+
+def debugging_rand(chance):
+    """Randomly raise an exception with probability `chance` (values > 1 are
+    interpreted as percentages). Useful for testing failure handling."""
+    class RandomException(Exception):
+        pass
+
+    if chance > 1:
+        chance /= 100
+
+    if random.random() <= chance:
+        raise RandomException(f'RandomException {chance*100}%')
+
+
+def failhandler(callback, exceptions:Union[tuple, list, type, None]=None):
+    if exceptions is None:
+        exceptions = Exception
+    def fail_decorator(func):
+        @wraps(func)
+        def wrapped_function(*args, **kwargs):
+            try:
+                return func(*args, **kwargs)
+            except exceptions as e:
+                # swallow the exception and report it to the callback;
+                # the wrapped call then returns None
+                callback(e, *args, **kwargs)
+        return wrapped_function
+    return fail_decorator
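+
+# failhandler usage sketch (names are illustrative): route exceptions to a
+# callback instead of raising.
+#
+#   def log_failure(exc, *args, **kwargs):
+#       print(f'failed with {exc!r} for args={args}')
+#
+#   @failhandler(callback=log_failure, exceptions=(ValueError,))
+#   def parse_number(s):
+#       return int(s)
+#
+#   parse_number('42')   # -> 42
+#   parse_number('abc')  # -> None; log_failure is called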