initial commit

This commit is contained in:
2024-09-17 23:47:27 +02:00
parent d23c59437b
commit eb74bc0606
6 changed files with 1007 additions and 0 deletions

164
.gitignore vendored Normal file
View File

@@ -0,0 +1,164 @@
.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

4
bild/selenium_imp/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
*.csv
*.txt
*.dump
*.json

View File

@@ -0,0 +1,11 @@
from bild_article_classes import ArticleCollection
import datetime
def main():
    """Collect all BILD archive articles published since 2022-01-01."""
    start = datetime.datetime(year=2022, month=1, day=1)
    collection = ArticleCollection(min_date=start)
    collection.collect()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,620 @@
import contextlib
import datetime
import hashlib
import os
import pickle
import random
import time
from itertools import islice
from typing import Union
from urllib.parse import urlparse
import dateutil
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib3.exceptions import MaxRetryError
from bs4 import BeautifulSoup, Comment, NavigableString
from bs4.element import PageElement
from util import ANSICodes as AC, failhandler, link as lk
class ArticleBaseClass:
    """Shared plumbing for article scrapers: kwarg validation, an optional
    on-disk pickle cache for fetched pages, and requests-session management.

    Subclasses are expected to provide ``self.session`` and
    ``self._http_retries`` before :meth:`get_session` is called.
    """
    _default_args = {
        'cache': f'{os.getcwd()}/.cache',
        'debug': False
    }

    def __init__(self, **kwargs):
        # guard: cooperative subclasses may call __init__ more than once
        if getattr(self, '_isinit', False):
            return
        # overlay caller kwargs on the defaults, then reject unknown keys
        kwargs = dict(list(ArticleBaseClass._default_args.items()) + list(kwargs.items()))
        if diff := set(kwargs.keys()).difference(ArticleBaseClass._default_args.keys()):
            raise ValueError(f"keyword{'s' if len(diff) > 1 else ''} {', '.join(diff)} unknown. supported: {', '.join(self._default_args)}")
        self.cache = kwargs.get('cache')
        self._debug = kwargs.get('debug')
        if self.cache:
            # cache=True selects the default cache directory
            if isinstance(self.cache, bool):
                self.cache = ArticleBaseClass._default_args['cache']
            os.makedirs(self.cache, exist_ok=True)
        self._isinit = True

    def update_target_from_source(self, target: dict, source: dict):
        """Recursively overwrite values in *target* with same-keyed values
        from *source*, keeping the existing value when *source* lacks a key."""
        for k, v in target.items():
            if isinstance(v, dict):
                if isinstance(sk := source.get(k), dict):
                    self.update_target_from_source(v, sk)
            else:
                # default to the current value: a bare source.get(k) would
                # clobber defaults with None for keys absent from source
                target[k] = source.get(k, v)

    def add_debug(self, target):
        """Propagate this object's debug flag into *target* and all nested dicts."""
        if isinstance(target, dict):
            target['debug'] = self._debug
            for _, v in target.items():
                if isinstance(v, dict):
                    self.add_debug(v)

    def get_page(self, link):
        """Fetch *link* via HTTP, consulting/populating the pickle cache when
        caching is enabled. Returns the response object."""
        def _get_page(link):
            with self.get_session() as s:
                return s.get(link)
        if not self.cache:
            return _get_page(link)
        # a fresh digest per link: the original reused one sha256 object,
        # which accumulates state so the same URL hashed on a later call
        # produced a different cache filename (permanent cache misses)
        fname = hashlib.sha256(link.encode()).hexdigest()
        path = f"{self.cache.rstrip('/')}/{fname}"
        try:
            with open(path, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            page = _get_page(link)
            with open(path, 'wb') as f:
                pickle.dump(page, f)
            return page

    def get_session(self):
        """Return the shared session if set, else a new one, with retrying
        HTTP(S) adapters mounted."""
        local_session = self.session or requests.Session()
        retry = Retry(connect=self._http_retries, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        local_session.mount('https://', adapter)
        local_session.mount('http://', adapter)
        return local_session

    def close_session(self, session=None):
        """Close *session*, or the instance-wide session when none is given."""
        if session is not None:
            session.close()
        elif self.session is not None:
            self.session.close()
############
class ArticleTitle:
    """Headline plus kicker (suptitle) of an article, normalised to one line each."""
    _default_args = {
        'debug': False}

    def __init__(self, title: str = '', suptitle: str = '', **kwargs) -> None:
        self._debug = kwargs.get('debug', self._default_args['debug'])
        # collapse embedded newlines into single spaces
        self.title = ' '.join(title.strip().splitlines())
        self.suptitle = ' '.join(suptitle.strip().splitlines())

    def __repr__(self) -> str:
        return f'{self.title}'

    def __str__(self) -> str:
        return f'({self.suptitle}) {self.title}'
############
class ArticleDepartment:
    """Department (ressort) of an article, both as printed on the page and as
    derived from the path segments of the article URL."""
    _default_args = {
        'max_link_departments': 5,
        'debug': False}

    def __init__(self, department: str = '', link: str = '', **kwargs) -> None:
        defaults = self._default_args
        self._debug = kwargs.get('debug', defaults['debug'])
        self._max_link_departments = kwargs.get('max_link_departments', defaults['max_link_departments'])
        self.department = ' '.join(department.strip().splitlines())
        # path segments between host and file name,
        # e.g. /politik/inland/artikel.html -> ['politik', 'inland']
        segments = urlparse(link).path.split('/')[1:-1]
        self._link_str = ' > '.join(segments)
        # pad/truncate to a fixed width so CSV rows stay rectangular
        padding = [''] * self._max_link_departments
        self.departments_link = (segments + padding)[:self._max_link_departments]

    def __repr__(self) -> str:
        return f'{self.department}'

    def __str__(self) -> str:
        return f'{self.department} ({self._link_str})'
############
class ArticleMetadata:
    """Metadata for one article: link, timestamp, title, department, authors
    and (filled in later by ``Article``) the fetched page object.

    Built either empty (``html=None``) or by parsing a ``stage-feed-item``
    teaser element from an archive page.
    """
    _default_args = {
        'department': ArticleDepartment._default_args,
        'title': ArticleTitle._default_args,
        'datetime_fmt': '%Y-%m-%d %H:%M:%S',
        'debug': False}

    def __init__(self, html:Union[PageElement, None]=None, base_url:str='example.com', date:Union[datetime.datetime,None]=None, **kwargs):
        self._debug = kwargs.get('debug', self._default_args['debug'])
        self._datetime_fmt = kwargs.get('datetime_fmt', self._default_args['datetime_fmt'])
        # copy the nested default dicts: mutating the class-level defaults in
        # place would leak per-instance overrides into every other instance
        self._title_kwargs = dict(self._default_args['title'])
        if title_args := kwargs.get('title'):
            self.update_target_from_source(self._title_kwargs, title_args)
        self._add_debug(self._title_kwargs)
        self._department_kwargs = dict(self._default_args['department'])
        if department_args := kwargs.get('department'):
            self.update_target_from_source(self._department_kwargs, department_args)
        self._add_debug(self._department_kwargs)
        self.page = None  # set by Article once the page has been fetched
        self.base_url = base_url
        if html is None:
            self.create_empty()
        else:
            self.authors = None
            self.parse_html(html, date)

    def update_target_from_source(self, target: dict, source: dict):
        """Recursively overwrite *target* values with same-keyed *source*
        values, keeping the existing value when *source* lacks the key."""
        for k, v in target.items():
            if isinstance(v, dict):
                if isinstance(sk := source.get(k), dict):
                    self.update_target_from_source(v, sk)
            else:
                # default to the current value: a bare source.get(k) would
                # clobber defaults with None for keys absent from source
                target[k] = source.get(k, v)

    def _add_debug(self, target):
        """Propagate the debug flag into *target* and all nested dicts."""
        if isinstance(target, dict):
            target['debug'] = self._debug
            for _, v in target.items():
                if isinstance(v, dict):
                    self._add_debug(v)

    def create_empty(self):
        """Initialise all fields to empty placeholders."""
        self.link = ''
        self.time = datetime.time()
        self.title = ArticleTitle()
        self.department = ArticleDepartment()
        self.authors = None

    def parse_html(self, html:PageElement, date:Union[datetime.datetime,None]):
        """Populate link/time/title/department from a stage-feed-item teaser.

        *date* is the fallback timestamp when the teaser has no parsable
        <time> element.
        """
        try:
            href = html.find('a', {'class': 'stage-feed-item__link'}).attrs['href']
            self.link = self.base_url + href
        except (AttributeError, KeyError):
            self.link = ''
        try:
            datestring = html.find('time').attrs['datetime']
            self.time = dateutil.parser.parse(datestring).astimezone(dateutil.tz.tzlocal())
        except (AttributeError, KeyError, ValueError):
            # fall back to the supplied date; the original referenced the
            # possibly-unbound `datestring` here (NameError) and called
            # datetime.datetime() with no arguments (TypeError)
            self.time = date.astimezone(dateutil.tz.tzlocal()) if date else datetime.datetime.min
        try:
            title = html.find('span', {'class': 'stage-feed-item__headline'}).contents[0]
        except (AttributeError, IndexError):
            title = ''
        try:
            suptitle = html.find('span', {'class': 'stage-feed-item__kicker'}).contents[0]
        except (AttributeError, IndexError):
            suptitle = ''
        self.title = ArticleTitle(title, suptitle, **self._title_kwargs)
        try:
            department = html.find('span', {'class': 'stage-feed-item__channel'}).contents[0]
        except (AttributeError, IndexError):
            department = ''
        self.department = ArticleDepartment(department, self.link, **self._department_kwargs)

    def csv_line(self, delimiter:str=',', quote_char:str='"', newline=True):
        """Render the metadata as one CSV line (no fulltext column)."""
        def _quote(s:str):
            return f'{quote_char}{s}{quote_char}'
        elements = [
            self.time.strftime('%Y-%m-%d') if self.time else '0000-00-00',
            self.time.strftime('%H:%M:%S') if self.time else '00:00:00',
            _quote(self.title.title if self.title else ''),
            _quote(self.title.suptitle if self.title else ''),
            _quote(self.department.department if self.department else ''),
            # fixed number of link-department columns keeps rows rectangular
            *[_quote(str(dep)) for dep in (self.department.departments_link if self.department else [''] * self._department_kwargs['max_link_departments'])],
            _quote(self.link) or '',
            str(self.page.status_code) if self.page else '']
        return delimiter.join(elements) + ('\n' if newline else '')

    def __repr__(self):
        return f'{self.title.title} ({self.time.strftime(self._datetime_fmt)})'

    def __str__(self):
        return (
            f'{self.title.suptitle}\n'
            f'{self.title.title}\n'
            f'{self.department.department}\n'
            f'{self.department._link_str}\n'
            f'{self.time.strftime(self._datetime_fmt)}\n'
            f'{self.link}'
        )
############
class Article(ArticleBaseClass):
    """A single article: metadata plus the extracted full text.

    Fetches the page for ``metadata.link`` (or *link*) on construction via
    the inherited, optionally cached ``get_page`` and parses either the
    article body or — for dead links — the error page.
    """
    _default_args = {
        'http_retries': 3,
        'meta': ArticleMetadata._default_args,
        'debug': False,
        # (tag, attrs) pairs whose subtrees are dropped from the fulltext
        'full_text_exclude': [
            ('aside', {'class': 'related-topics'}),
            ('figure', {}),
            ('div', {'class': 'ad-info'}),
            ('div', {'class': 'float-container'}),
            ('a', {'class': ['text-link--external', 'text-link']}),
        ]}

    def __init__(self, *, link:str=None, metadata:Union[ArticleMetadata, None]=None, session=None, **kwargs):
        super().__init__()
        self._debug = kwargs.get('debug', self._default_args['debug'])
        self._http_retries = kwargs.get('http_retries', self._default_args['http_retries'])
        # copy the meta defaults (one level of nested dicts is enough here):
        # mutating the class-level dict would leak overrides across instances
        self._meta_kwargs = {k: (dict(v) if isinstance(v, dict) else v)
                             for k, v in self._default_args['meta'].items()}
        if meta_args := kwargs.get('meta'):
            self.update_target_from_source(self._meta_kwargs, meta_args)
        self.add_debug(self._meta_kwargs)
        self.full_text_exclude = kwargs.get('full_text_exclude', self._default_args['full_text_exclude'])
        self.session = session
        self.meta = metadata or ArticleMetadata(**self._meta_kwargs)
        self.meta.link = link or self.meta.link
        self.full_text = None
        self.parse_page(self.meta.link)

    # parsers
    def parse_page(self, link):
        """Download *link* and dispatch to the article or error-page parser."""
        self.meta.page = self.get_page(link)
        soupy_page = BeautifulSoup(self.meta.page.content, 'html.parser')
        if article := soupy_page.find('article'):
            self.parse_article(article)
        if error_page := soupy_page.find('div', {'class': 'error-page'}):
            self.parse_error_page(error_page)

    def parse_error_page(self, error_page):
        """Use the error-page message text (links stripped) as the fulltext."""
        with contextlib.suppress(AttributeError):
            wrapper = error_page.find('div', {'class': 'error-page__wrapper'})
            self.full_text = self.get_fulltext(wrapper, exclude=('a',))

    def parse_article(self, article):
        """Refresh title/suptitle/authors/time from the full article page and
        extract the body fulltext."""
        with contextlib.suppress(AttributeError):
            self.meta.title.title = self.get_fulltext(article.find('span', {'class': 'document-title__headline'}))
        with contextlib.suppress(AttributeError):
            self.meta.title.suptitle = self.get_fulltext(article.find('span', {'class': 'document-title__kicker'}))
        with contextlib.suppress(AttributeError):
            if article.find('div', {'class': 'author'}):
                # single-author layout
                self.meta.authors = [self.get_fulltext(article.find('span', {'class': 'author__name'}))]
            elif article.find('div', {'class': 'authors'}):
                # multi-author layout
                authors = article.find_all('div', {'class': 'article_author__details'})
                self.meta.authors = [self.get_fulltext(details) for details in authors]
        with contextlib.suppress(AttributeError, KeyError):
            if date := article.find('time', {'class': ['datetime']}):
                datestring = date.attrs['datetime']
                self.meta.time = dateutil.parser.parse(datestring).astimezone(dateutil.tz.tzlocal())
        with contextlib.suppress(AttributeError):
            body = article.find_all('div', {'class': 'article-body'})
            self.full_text = self.get_fulltext(body)

    def _clean_exclude_list(self, excludes):
        """Normalise an exclude spec to ``[(name, {attr: [values]}), ...]``."""
        if excludes is None:
            return excludes
        excl_names = []
        excl_attrs = []
        for excl in excludes:
            if isinstance(excl, (list, tuple)):
                excl_names.append(excl[0])
                try:
                    # force every attr value into a list so it compares
                    # equal to bs4's list-valued attributes (e.g. class)
                    local_attr = {
                        k: v if isinstance(v, (list, tuple)) else [v]
                        for k, v in excl[1].items()
                    }
                    excl_attrs.append(local_attr)
                except (KeyError, IndexError):
                    excl_attrs.append({})
            else:
                # bare tag name, no attribute constraint
                excl_names.append(excl)
                excl_attrs.append({})
        return list(zip(excl_names, excl_attrs))

    def skip_element(self, elm, excludes):
        """True if *elm* is an HTML comment or matches an exclude entry."""
        if isinstance(elm, Comment):
            return True
        if excludes is None:
            return False
        for excl_name, excl_attr in excludes:
            if elm.name == excl_name:
                if not excl_attr:
                    return True
                for k, v in excl_attr.items():
                    with contextlib.suppress(KeyError):
                        if elm.attrs[k] == v:
                            return True
        return False

    def get_fulltext(self, html:Union[PageElement, list], exclude:Union[list, None]=None, sep:str=' '):
        """Recursively collect the text of *html*, skipping excluded subtrees.

        Falls back to ``self.full_text_exclude`` when *exclude* is None.
        """
        if html is None:
            return ''
        if exclude is not None:
            exclude = self._clean_exclude_list(tuple(exclude))
        else:
            # the original used the raw default list here, so scalar attr
            # values (e.g. 'related-topics') never matched bs4's list-valued
            # attrs and those excludes were silently ignored — clean it too
            # (cleaning is idempotent, so recursive calls are unaffected)
            exclude = self._clean_exclude_list(self.full_text_exclude)
        local_elems = []
        for elm in html:
            if self.skip_element(elm, exclude):
                continue
            if isinstance(elm, NavigableString):
                local_elems.append(elm)
            elif isinstance(elm, PageElement):
                local_elems.append(self.get_fulltext(elm, exclude=exclude, sep=sep))
        return sep.join(local_elems).strip()

    # util
    def to_csv_line(self, delimiter:str=',', quote_char:str='"', newline=True):
        """Metadata CSV line plus a final quoted fulltext column."""
        def _quote(s:str):
            return f'{quote_char}{s}{quote_char}'
        line = delimiter.join((
            self.meta.csv_line(delimiter=delimiter, quote_char=quote_char, newline=False),
            _quote(' '.join(self.full_text.splitlines())) if self.full_text else '')
        ) + ('\n' if newline else '')
        return line

    def __repr__(self):
        department = self.meta.department.department if self.meta.department else ''
        title = self.meta.title.title if self.meta.title else ''
        full_text = self.full_text or ''
        datestr = self.meta.time.strftime('%d.%m.%Y %H:%M:%S') if self.meta.time else ''
        # the original interpolated islice(full_text, 100), which renders as
        # '<itertools.islice object ...>' — slice the string instead
        return f'[{department}] {title} ({datestr}): {full_text[:100]}...'

    def __str__(self) -> str:
        meta = self.meta
        suptitle = meta.title.suptitle if meta.title else ''
        title = meta.title.title if meta.title else ''
        department = meta.department.department if meta.department else ''
        link_str = meta.department._link_str if meta.department else ''
        datestr = meta.time.strftime('%d.%m.%Y %H:%M:%S') if meta.time else ''
        status = [meta.page.status_code]  # bracketed like the original output
        return (
            f'{suptitle}\n'
            f'{title}\n'
            f'{department}\n'
            f'{link_str}\n'
            f'{datestr}\n'
            f'{meta.link or ""} {status}\n'
            f'{self.full_text or ""}\n'
        )
############
class ArticleCollection(ArticleBaseClass):
    """Scrapes the BILD archive: one archive page per calendar day between
    min_date and max_date, first collecting per-article metadata, then the
    fulltexts, streaming each finished article to a CSV file.

    Failed fetches are appended to ``failed_file`` instead of aborting.
    NOTE(review): the nested same-quote f-strings below require Python >= 3.12.
    """
    _default_args = {
        'min_date': datetime.datetime(year=2006, month=1, day=6),
        # NOTE(review): evaluated once at import time, not per instantiation
        'max_date': datetime.datetime.now(),
        'random': True,
        'out_file': 'out.csv',
        'out_file_mode': 'new',
        'out_file_header': 'date,time,title,suptitle,department,[link_departments],link,http status code,full text',
        'failed_file': 'failed.txt',
        'http_retries': 5,
        'retries': 2,
        'base_link': 'https://www.bild.de/themen/uebersicht/archiv/archiv-82532020.bild.html?archiveDate=',
        'link_time_format': '%Y-%m-%d',
        'article_args': Article._default_args,
        'debug': False,
    }
    # accepted values for out_file_mode, mapped to overwrite/append semantics
    _file_modes_overwrite = ('new', 'overwrite', 'write', 'w')
    _file_modes_append = ('append', 'a')
    _file_modes = (*_file_modes_overwrite, *_file_modes_append)

    def __init__(self, session:Union[requests.Session,None]=None, **kwargs):
        """Configure the date range, output files and per-article options.

        session: optional shared requests.Session (otherwise created in collect()).
        All other options come from _default_args and may be overridden via kwargs.
        Raises AttributeError for an unknown out_file_mode.
        """
        self._debug = kwargs.get('debug', self._default_args['debug'])
        super().__init__(debug=self._debug)
        self._min_date = kwargs.get('min_date', self._default_args['min_date'])
        self._max_date = kwargs.get('max_date', self._default_args['max_date'])
        # work with plain dates; there is one archive page per calendar day
        self._max_date = self._max_date.date()
        self._min_date = self._min_date.date()
        self._random = kwargs.get('random', self._default_args['random'])
        # NOTE(review): this aliases (and below mutates) the class-level
        # default dict rather than a copy — shared across instances
        self._article_args = self._default_args['article_args']
        if article_args := kwargs.get('article_args'):
            self.update_target_from_source(self._article_args, article_args)
        self.add_debug(self._article_args)
        self._out_file = kwargs.get('out_file', self._default_args['out_file'])
        self._out_file_mode = kwargs.get('out_file_mode', self._default_args['out_file_mode'])
        if self._out_file_mode not in self._file_modes:
            raise AttributeError(f'file mode {self._out_file_mode} unknown. supported: [{','.join(self._file_modes)}]')
        self._out_file_header = kwargs.get('out_file_header', self._default_args['out_file_header'])
        # expand the [link_departments] placeholder into one named column per
        # padded link-department slot so the header width matches csv_line
        max_link_departments = self._article_args.get('meta', {}).get('department', {}).get('max_link_departments', self._default_args['article_args']['meta']['department']['max_link_departments'])
        link_dep_strings = [f'department from link {i}' for i in range(max_link_departments)]
        self._out_file_header = self._out_file_header.replace('[link_departments]', ','.join(link_dep_strings))
        self._failed_file = kwargs.get('failed_file', self._default_args['failed_file'])
        self._http_retries = kwargs.get('http_retries', self._default_args['http_retries'])
        self._retries = kwargs.get('retries', self._default_args['retries'])
        self._base_link = kwargs.get('base_link', self._default_args['base_link'])
        self._link_time_format = kwargs.get('link_time_format', self._default_args['link_time_format'])
        self.prepare_dates()
        self.prepare_files()
        self.articles = []
        self.article_metas = []
        self.session = session
        # wrap the inherited get_page so any exception is logged to the
        # failed-file via the callback instead of propagating; the wrapped
        # call then returns None (callers must tolerate that)
        self.get_page = failhandler(callback=self.write_failed_to_file)(lambda args: ArticleCollection.get_page(self, args))

    def prepare_dates(self):
        """Build the list of days to scrape (inclusive), optionally shuffled."""
        self.dates = [self._max_date - datetime.timedelta(days=x) for x in range((self._max_date - self._min_date).days+1)]
        if self._random:
            random.shuffle(self.dates)

    def collect(self):
        """Main entry point: gather metadata for every day, then fulltexts."""
        self.session = self.get_session()
        print(f'Collecting article metadata from archive pages for {len(self.dates)} days')
        for i, date in enumerate(self.dates):
            link = self.build_archive_link(date)
            self.print_date(date, link, prefix=f'Date {i+1:>{len(str(len(self.dates)))}}/{len(self.dates)} ')
            self.process_archive_page(link)
        print()
        print(f'Collecting fulltext for {len(self.article_metas)} articles')
        self.get_fulltexts()
        self.close_session()

    def build_archive_link(self, date):
        """Archive-page URL for *date* (base link + formatted date suffix)."""
        return f'{self._base_link}{date.strftime(self._link_time_format)}'

    def print_date(self, date:datetime.datetime, link:str=None, fmt:str=None, prefix:str=None, suffix:str=None):
        """Print *date* underlined, as a terminal hyperlink when *link* is given."""
        if fmt is None:
            fmt = self._link_time_format
        print(f'{prefix or ''}{AC.UNDERLINE}{lk(link,date.strftime(fmt)) if link else date.strftime(fmt)}{AC.DEFAULT}{suffix or ''}')

    def prepare_files(self):
        """Create/truncate the output CSV (writing the header) and the
        failed-links file according to the configured file mode."""
        if self._out_file_mode in self._file_modes:
            if self._out_file_mode in self._file_modes_overwrite and self._out_file:
                with open(self._out_file, 'w') as f:
                    f.write(self._out_file_header.strip()+'\n')
            elif self._out_file_mode in self._file_modes_append and self._out_file:
                # append mode: keep the existing file untouched
                ...
        else:
            # unreachable in practice: __init__ already validated the mode
            raise ValueError(f'file mode \'{self._out_file_mode}\' not supported. supported: {self._file_modes}')
        if self._failed_file:
            # always start with an empty failed-links file
            with open(self._failed_file, 'w') as f:
                f.write('')

    def process_archive_page(self, link):
        """Parse one archive page and store the metadata of every teaser on it."""
        page = self.get_page(link)
        soupy_page = BeautifulSoup(page.content, 'html.parser')
        articles_html = soupy_page.find_all("article", {"class": "stage-feed-item"})
        # debug mode only looks at the first 3 teasers per day
        slice_args = (None, 3, None) if self._debug else (None, None, 1)
        for article_html in islice(articles_html, *slice_args): # debugging
            article_metadata = ArticleMetadata(article_html, 'https://www.bild.de', **self._article_args.get('meta', {}))
            self.print_article_metadata(article_metadata)
            # save metadata
            self.article_metas.append(article_metadata)

    def get_fulltexts(self):
        """Fetch and parse the full article page for every collected metadata."""
        if self._random:
            random.shuffle(self.article_metas)
        for i, article_metadata in enumerate(self.article_metas):
            self.print_article_metadata(article_metadata, prefix=f'{i+1:>{len(str(len(self.article_metas)))}}/{len(self.article_metas)} ')
            self.process_article_from_meta(article_metadata)

    def process_article_from_meta(self, article_metadata):
        """Build an Article (fetches its page) and append its CSV line;
        connection failures are logged to the failed-file instead of raised."""
        try:
            art = Article(metadata=article_metadata, session=self.session, **self._article_args)
            self.articles.append(art)
            if self._out_file:
                with open(self._out_file, 'a') as f:
                    f.write(art.to_csv_line())
        # NOTE(review): ConnectionError here is the builtin; requests'
        # ConnectionError is a subclass of it, so both are caught
        except (MaxRetryError,ConnectionError) as e:
            if self._debug:
                print(e)
            self.write_failed_to_file(e, article_metadata)

    def print_article_metadata(self, metadata, *, date_fmt=None, time_fmt=None, prefix:str=None, suffix:str=None):
        """Pretty-print one article's metadata with ANSI colors; a non-200
        HTTP status (when known) is shown in bold red."""
        if date_fmt is None:
            date_fmt = self._link_time_format
        if time_fmt is None:
            time_fmt = '%H:%M:%S'
        datetime_fmt = f'{date_fmt} {time_fmt}'
        link = '' or metadata.link
        timestr = (
            AC.FG_BRIGHT_GREY +
            metadata.time.strftime(datetime_fmt)
            + AC.DEFAULT
        ) if metadata.time else ''
        suptitle = (
            AC.FG_BLUE +
            metadata.title.suptitle
            + AC.DEFAULT
        ) if metadata.title.suptitle else ''
        title = (
            AC.STYLE_BOLD + AC.FG_BRIGHT_PURPLE +
            metadata.title.title
            + AC.DEFAULT
        ) if metadata.title.title else ''
        error_string = (
            AC.STYLE_BOLD + AC.FG_BRIGHT_RED +
            f'[{metadata.page.status_code}]'
            + AC.DEFAULT
        ) if (metadata.page and metadata.page.status_code != 200) else ''
        print(f'{prefix or ''}{timestr} {error_string}({suptitle}) {lk(link, title) if link else title}{suffix or ''}')

    def write_failed_to_file(self, exception, elem):
        """Append a failed link (from a metadata object or a raw URL string)
        together with the exception text to the failed-file."""
        with open(self._failed_file, 'a') as f:
            if isinstance(elem, ArticleMetadata):
                f.write(f'{elem.link}, "{exception}"\n')
            elif isinstance(elem, str):
                f.write(f'{elem}, "{exception}"\n')

View File

@@ -0,0 +1,88 @@
import itertools
def print_format_table():
    """Print every SGR code 0-107 rendered in its own style, 18 cells per row.

    Each cell shows the escape sequence (e.g. ``\\x1b[1m``) formatted with the
    style that sequence selects, then resets with ``\\x1b[m``.
    """
    # SGR quick reference (ECMA-48): 0 reset, 1 bold, 2 dim, 3 italic,
    # 4 underline, 5/6 blink, 7 invert, 8 conceal, 9 strike,
    # 21 double underline, 22-29 attribute resets, 30-37 fg colors,
    # 38 extended fg (5;n or 2;r;g;b), 39 default fg, 40-47 bg colors,
    # 48 extended bg, 49 default bg, 53/55 overline on/off,
    # 73-75 super/subscript (mintty), 90-97 bright fg, 100-107 bright bg.
    cells = []
    for code in range(108):
        pad = ' ' * (3 - len(str(code)))
        # newline after every 18th cell, otherwise a space separator
        tail = ' ' if (code + 1) % 18 else '\n'
        cells.append(f'\x1b[0;{code}m\\x1b[{code}m\x1b[m{pad}{tail}')
    print(''.join(cells))
# SGR codes verified to render correctly in the author's terminal.
# NOTE(review): this tuple is not referenced anywhere in this file — kept as
# documentation of which codes from the table above are usable.
supported = (0, # reset
             1, # bold
             2, # dim
             22, # normal intensity
             3, # italic
             23, # ? neither italic nor blackletter
             53, # overlined
             55, # not overlined
             4, # underline
             21, # dunderline
             24, # ? not underlined
             9, # strike
             29, # not strike
             7, # invert
             27, # not inverted
             8, # hidden
             28, # not hidden
             30, 31, 32, 33, 34, 35, 36, 37, # fg color
             38, # fg color 38;5;n or 38;2;r;g;b
             39, # reset fg color
             40, 41, 42, 43, 44, 45, 46, 47, # bg color
             48, # bg color 48;5;n or 48;2;r;g;b
             49, # reset bg color
             90, 91, 92, 93, 94, 95, 96, 97, # bright fg color
             100, 101, 102, 103, 104, 105, 106, 107 # bright bg color
             )

# Render the demo table whenever this module is imported or executed.
print_format_table()

120
bild/selenium_imp/util.py Normal file
View File

@@ -0,0 +1,120 @@
from functools import wraps
import random
def link(uri, label=None):
    """Wrap *uri* in an OSC 8 terminal hyperlink escape, displaying *label*
    (the URI itself when no label is given)."""
    # OSC 8 ; params ; URI ST  <label>  OSC 8 ;; ST   (params unused here)
    text = uri if label is None else label
    return f'\033]8;;{uri}\033\\{text}\033]8;;\033\\'
class ANSICodes:
    """ANSI SGR escape sequences as class constants plus helpers for 256-color
    and truecolor selection.

    Color names follow this project's scheme (34=PURPLE, 35=PINK, 36=BLUE)
    rather than the ANSI standard names (34=blue, 35=magenta, 36=cyan); the
    bright variants mirror the same scheme.
    """
    DEFAULT = OFF = '\x1b[0m'
    BOLD = STYLE_BOLD = '\x1b[1m'
    DIM = STYLE_DIM = '\x1b[2m'
    REGULAR = STYLE_REGULAR = '\x1b[22m'
    ITALIC = STYLE_ITALIC = '\x1b[3m'
    NITALIC = STYLE_NITALIC = '\x1b[23m'
    UNDERLINE = DECO_UNDERLINE = '\x1b[4m'
    DUNDERLINE = DECO_DUNDERLINE = '\x1b[21m'
    NUNDERLINE = DECO_NUNDERLINE = '\x1b[24m'
    OVERLINE = DECO_OVERLINE = '\x1b[53m'
    NOVERLINE = DECO_NOVERLINE = '\x1b[55m'
    INVERT = DECO_INVERT = '\x1b[7m'
    NINVERT = DECO_NINVERT = '\x1b[27m'
    HIDDEN = DECO_HIDDEN = '\x1b[8m'
    NHIDDEN = DECO_NHIDDEN = '\x1b[28m'
    STRIKE = DECO_STRIKE = '\x1b[9m'
    NSTRIKE = DECO_NSTRIKE = '\x1b[29m'
    GREY = FG_GREY = '\x1b[30m'
    RED = FG_RED = '\x1b[31m'
    GREEN = FG_GREEN = '\x1b[32m'
    YELLOW = FG_YELLOW = '\x1b[33m'
    PURPLE = FG_PURPLE = '\x1b[34m'
    PINK = FG_PINK = '\x1b[35m'
    BLUE = FG_BLUE = '\x1b[36m'
    WHITE = FG_WHITE = '\x1b[37m'
    BRIGHT_GREY = FG_BRIGHT_GREY = '\x1b[90m'
    BRIGHT_RED = FG_BRIGHT_RED = '\x1b[91m'
    BRIGHT_GREEN = FG_BRIGHT_GREEN = '\x1b[92m'
    BRIGHT_YELLOW = FG_BRIGHT_YELLOW = '\x1b[93m'
    BRIGHT_PURPLE = FG_BRIGHT_PURPLE = '\x1b[94m'
    BRIGHT_PINK = FG_BRIGHT_PINK = '\x1b[95m'
    BRIGHT_BLUE = FG_BRIGHT_BLUE = '\x1b[96m'
    BRIGHT_WHITE = FG_BRIGHT_WHITE = '\x1b[97m'
    BG_GREY = '\x1b[40m'
    BG_RED = '\x1b[41m'
    BG_GREEN = '\x1b[42m'
    BG_YELLOW = '\x1b[43m'
    BG_PURPLE = '\x1b[44m'
    BG_PINK = '\x1b[45m'
    BG_BLUE = '\x1b[46m'
    BG_WHITE = '\x1b[47m'
    # code 100 was mis-named BG_BRIGHT_BLUE in the original (a duplicate of
    # the 106m entry below, which silently won); per the FG_BRIGHT_* scheme
    # above, 100 is the bright grey background
    BG_BRIGHT_GREY = '\x1b[100m'
    BG_BRIGHT_RED = '\x1b[101m'
    BG_BRIGHT_GREEN = '\x1b[102m'
    BG_BRIGHT_YELLOW = '\x1b[103m'
    BG_BRIGHT_PURPLE = '\x1b[104m'
    BG_BRIGHT_PINK = '\x1b[105m'
    BG_BRIGHT_BLUE = '\x1b[106m'
    BG_BRIGHT_WHITE = '\x1b[107m'

    @staticmethod
    def FG_CUSTOM_N(n, /):
        """256-color foreground selector.

        0-7: standard colors; 8-15: high-intensity; 16-231: 6x6x6 cube
        (16 + 36r + 6g + b, 0 <= r,g,b <= 5); 232-255: greyscale dark->light.
        """
        return f'\x1b[38;5;{n}m'

    @staticmethod
    def FG_CUSTOM_RGB(r, g, b, /):
        """Truecolor foreground; r, g, b in 0-255.

        Uses the 38;2;r;g;b form — the original emitted 38;5 (the 256-color
        selector), which misinterprets the arguments.
        """
        return f'\x1b[38;2;{r};{g};{b}m'

    @staticmethod
    def BG_CUSTOM_N(n, /):
        """256-color background selector; same palette layout as FG_CUSTOM_N."""
        return f'\x1b[48;5;{n}m'

    @staticmethod
    def BG_CUSTOM_RGB(r, g, b, /):
        """Truecolor background; r, g, b in 0-255 (48;2;r;g;b form)."""
        return f'\x1b[48;2;{r};{g};{b}m'
def debugging_rand(chance):
    """Randomly raise a RandomException with probability *chance*.

    *chance* may be a fraction (0-1) or a percentage (>1, divided by 100).
    Useful for injecting sporadic failures while testing error handling.
    """
    import random

    class RandomException(Exception):
        def __init__(self, *args):
            super().__init__(*args)

    if chance > 1:
        # values above 1 are treated as percentages
        chance /= 100
    if random.random() <= chance:
        raise RandomException(f'RandomException {chance*100}%')
def failhandler(callback, exceptions: 'Union[tuple, list, Exception, None]' = None):
    """Decorator factory: run the wrapped function and, on failure, hand the
    exception plus the original call arguments to *callback* instead of
    propagating it; the wrapped call then returns None.

    callback:   callable invoked as ``callback(exc, *args, **kwargs)``
    exceptions: exception type, or tuple/list of types, to catch
                (default: ``Exception``)

    The annotation is a string on purpose: this module never imports
    ``typing.Union``, so the original live annotation raised NameError the
    moment util.py was imported.
    """
    if exceptions is None:
        exceptions = Exception
    if isinstance(exceptions, list):
        # `except` requires a type or tuple; a list raises TypeError at
        # catch time, so normalise it here
        exceptions = tuple(exceptions)

    def fail_decorator(func):
        @wraps(func)
        def wrapped_function(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exceptions as e:
                # swallow the error and delegate; implicit return None
                callback(e, *args, **kwargs)
        return wrapped_function
    return fail_decorator