initial commit

This commit is contained in:
2024-09-17 23:47:27 +02:00
parent d23c59437b
commit eb74bc0606
6 changed files with 1007 additions and 0 deletions

164
.gitignore vendored Normal file
View File

@@ -0,0 +1,164 @@
.vscode
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

4
bild/selenium_imp/.gitignore vendored Normal file
View File

@@ -0,0 +1,4 @@
*.csv
*.txt
*.dump
*.json

View File

@@ -0,0 +1,11 @@
from bild_article_classes import ArticleCollection
import datetime
def main():
    """Collect all BILD archive articles published since 2022-01-01."""
    start = datetime.datetime(year=2022, month=1, day=1)
    collection = ArticleCollection(min_date=start)
    collection.collect()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,620 @@
import contextlib
import datetime
import hashlib
import os
import pickle
import random
import time
from itertools import islice
from typing import Union
from urllib.parse import urlparse
import dateutil
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from urllib3.exceptions import MaxRetryError
from bs4 import BeautifulSoup, Comment, NavigableString
from bs4.element import PageElement
from util import ANSICodes as AC, failhandler, link as lk
class ArticleBaseClass:
    """Shared plumbing for article scrapers: kwarg validation, an optional
    on-disk pickle cache for fetched pages, and requests-session management.

    Subclasses are expected to provide ``self.session`` and
    ``self._http_retries`` before :meth:`get_session` is called.
    """
    _default_args = {
        'cache': f'{os.getcwd()}/.cache',
        'debug': False
    }

    def __init__(self, **kwargs):
        # guard: cooperative subclasses may call __init__ more than once
        if getattr(self, '_isinit', False):
            return
        # overlay caller kwargs on the defaults, then reject unknown keys
        kwargs = dict(list(ArticleBaseClass._default_args.items()) + list(kwargs.items()))
        if diff := set(kwargs.keys()).difference(ArticleBaseClass._default_args.keys()):
            raise ValueError(f"keyword{'s' if len(diff) > 1 else ''} {', '.join(diff)} unknown. supported: {', '.join(self._default_args)}")
        self.cache = kwargs.get('cache')
        self._debug = kwargs.get('debug')
        if self.cache:
            # cache=True selects the default cache directory
            if isinstance(self.cache, bool):
                self.cache = ArticleBaseClass._default_args['cache']
            os.makedirs(self.cache, exist_ok=True)
        self._isinit = True

    def update_target_from_source(self, target: dict, source: dict):
        """Recursively overwrite values in *target* with same-keyed values
        from *source*, keeping the existing value when *source* lacks a key."""
        for k, v in target.items():
            if isinstance(v, dict):
                if isinstance(sk := source.get(k), dict):
                    self.update_target_from_source(v, sk)
            else:
                # default to the current value: a bare source.get(k) would
                # clobber defaults with None for keys absent from source
                target[k] = source.get(k, v)

    def add_debug(self, target):
        """Propagate this object's debug flag into *target* and all nested dicts."""
        if isinstance(target, dict):
            target['debug'] = self._debug
            for _, v in target.items():
                if isinstance(v, dict):
                    self.add_debug(v)

    def get_page(self, link):
        """Fetch *link* via HTTP, consulting/populating the pickle cache when
        caching is enabled. Returns the response object."""
        def _get_page(link):
            with self.get_session() as s:
                return s.get(link)
        if not self.cache:
            return _get_page(link)
        # a fresh digest per link: the original reused one sha256 object,
        # which accumulates state so the same URL hashed on a later call
        # produced a different cache filename (permanent cache misses)
        fname = hashlib.sha256(link.encode()).hexdigest()
        path = f"{self.cache.rstrip('/')}/{fname}"
        try:
            with open(path, 'rb') as f:
                return pickle.load(f)
        except FileNotFoundError:
            page = _get_page(link)
            with open(path, 'wb') as f:
                pickle.dump(page, f)
            return page

    def get_session(self):
        """Return the shared session if set, else a new one, with retrying
        HTTP(S) adapters mounted."""
        local_session = self.session or requests.Session()
        retry = Retry(connect=self._http_retries, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        local_session.mount('https://', adapter)
        local_session.mount('http://', adapter)
        return local_session

    def close_session(self, session=None):
        """Close *session*, or the instance-wide session when none is given."""
        if session is not None:
            session.close()
        elif self.session is not None:
            self.session.close()
############
class ArticleTitle:
    """Headline plus kicker (suptitle) of an article, normalised to one line each."""
    _default_args = {
        'debug': False}

    def __init__(self, title: str = '', suptitle: str = '', **kwargs) -> None:
        self._debug = kwargs.get('debug', self._default_args['debug'])
        # collapse embedded newlines into single spaces
        self.title = ' '.join(title.strip().splitlines())
        self.suptitle = ' '.join(suptitle.strip().splitlines())

    def __repr__(self) -> str:
        return f'{self.title}'

    def __str__(self) -> str:
        return f'({self.suptitle}) {self.title}'
############
class ArticleDepartment:
    """Department (ressort) of an article, both as printed on the page and as
    derived from the path segments of the article URL."""
    _default_args = {
        'max_link_departments': 5,
        'debug': False}

    def __init__(self, department: str = '', link: str = '', **kwargs) -> None:
        defaults = self._default_args
        self._debug = kwargs.get('debug', defaults['debug'])
        self._max_link_departments = kwargs.get('max_link_departments', defaults['max_link_departments'])
        self.department = ' '.join(department.strip().splitlines())
        # path segments between host and file name,
        # e.g. /politik/inland/artikel.html -> ['politik', 'inland']
        segments = urlparse(link).path.split('/')[1:-1]
        self._link_str = ' > '.join(segments)
        # pad/truncate to a fixed width so CSV rows stay rectangular
        padding = [''] * self._max_link_departments
        self.departments_link = (segments + padding)[:self._max_link_departments]

    def __repr__(self) -> str:
        return f'{self.department}'

    def __str__(self) -> str:
        return f'{self.department} ({self._link_str})'
############
class ArticleMetadata:
    """Metadata for one article: link, timestamp, title, department, authors
    and (filled in later by ``Article``) the fetched page object.

    Built either empty (``html=None``) or by parsing a ``stage-feed-item``
    teaser element from an archive page.
    """
    _default_args = {
        'department': ArticleDepartment._default_args,
        'title': ArticleTitle._default_args,
        'datetime_fmt': '%Y-%m-%d %H:%M:%S',
        'debug': False}

    def __init__(self, html:Union[PageElement, None]=None, base_url:str='example.com', date:Union[datetime.datetime,None]=None, **kwargs):
        self._debug = kwargs.get('debug', self._default_args['debug'])
        self._datetime_fmt = kwargs.get('datetime_fmt', self._default_args['datetime_fmt'])
        # copy the nested default dicts: mutating the class-level defaults in
        # place would leak per-instance overrides into every other instance
        self._title_kwargs = dict(self._default_args['title'])
        if title_args := kwargs.get('title'):
            self.update_target_from_source(self._title_kwargs, title_args)
        self._add_debug(self._title_kwargs)
        self._department_kwargs = dict(self._default_args['department'])
        if department_args := kwargs.get('department'):
            self.update_target_from_source(self._department_kwargs, department_args)
        self._add_debug(self._department_kwargs)
        self.page = None  # set by Article once the page has been fetched
        self.base_url = base_url
        if html is None:
            self.create_empty()
        else:
            self.authors = None
            self.parse_html(html, date)

    def update_target_from_source(self, target: dict, source: dict):
        """Recursively overwrite *target* values with same-keyed *source*
        values, keeping the existing value when *source* lacks the key."""
        for k, v in target.items():
            if isinstance(v, dict):
                if isinstance(sk := source.get(k), dict):
                    self.update_target_from_source(v, sk)
            else:
                # default to the current value: a bare source.get(k) would
                # clobber defaults with None for keys absent from source
                target[k] = source.get(k, v)

    def _add_debug(self, target):
        """Propagate the debug flag into *target* and all nested dicts."""
        if isinstance(target, dict):
            target['debug'] = self._debug
            for _, v in target.items():
                if isinstance(v, dict):
                    self._add_debug(v)

    def create_empty(self):
        """Initialise all fields to empty placeholders."""
        self.link = ''
        self.time = datetime.time()
        self.title = ArticleTitle()
        self.department = ArticleDepartment()
        self.authors = None

    def parse_html(self, html:PageElement, date:Union[datetime.datetime,None]):
        """Populate link/time/title/department from a stage-feed-item teaser.

        *date* is the fallback timestamp when the teaser has no parsable
        <time> element.
        """
        try:
            href = html.find('a', {'class': 'stage-feed-item__link'}).attrs['href']
            self.link = self.base_url + href
        except (AttributeError, KeyError):
            self.link = ''
        try:
            datestring = html.find('time').attrs['datetime']
            self.time = dateutil.parser.parse(datestring).astimezone(dateutil.tz.tzlocal())
        except (AttributeError, KeyError, ValueError):
            # fall back to the supplied date; the original referenced the
            # possibly-unbound `datestring` here (NameError) and called
            # datetime.datetime() with no arguments (TypeError)
            self.time = date.astimezone(dateutil.tz.tzlocal()) if date else datetime.datetime.min
        try:
            title = html.find('span', {'class': 'stage-feed-item__headline'}).contents[0]
        except (AttributeError, IndexError):
            title = ''
        try:
            suptitle = html.find('span', {'class': 'stage-feed-item__kicker'}).contents[0]
        except (AttributeError, IndexError):
            suptitle = ''
        self.title = ArticleTitle(title, suptitle, **self._title_kwargs)
        try:
            department = html.find('span', {'class': 'stage-feed-item__channel'}).contents[0]
        except (AttributeError, IndexError):
            department = ''
        self.department = ArticleDepartment(department, self.link, **self._department_kwargs)

    def csv_line(self, delimiter:str=',', quote_char:str='"', newline=True):
        """Render the metadata as one CSV line (no fulltext column)."""
        def _quote(s:str):
            return f'{quote_char}{s}{quote_char}'
        elements = [
            self.time.strftime('%Y-%m-%d') if self.time else '0000-00-00',
            self.time.strftime('%H:%M:%S') if self.time else '00:00:00',
            _quote(self.title.title if self.title else ''),
            _quote(self.title.suptitle if self.title else ''),
            _quote(self.department.department if self.department else ''),
            # fixed number of link-department columns keeps rows rectangular
            *[_quote(str(dep)) for dep in (self.department.departments_link if self.department else [''] * self._department_kwargs['max_link_departments'])],
            _quote(self.link) or '',
            str(self.page.status_code) if self.page else '']
        return delimiter.join(elements) + ('\n' if newline else '')

    def __repr__(self):
        return f'{self.title.title} ({self.time.strftime(self._datetime_fmt)})'

    def __str__(self):
        return (
            f'{self.title.suptitle}\n'
            f'{self.title.title}\n'
            f'{self.department.department}\n'
            f'{self.department._link_str}\n'
            f'{self.time.strftime(self._datetime_fmt)}\n'
            f'{self.link}'
        )
############
class Article(ArticleBaseClass):
    """A single article: metadata plus the extracted full text.

    Fetches the page for ``metadata.link`` (or *link*) on construction via
    the inherited, optionally cached ``get_page`` and parses either the
    article body or — for dead links — the error page.
    """
    _default_args = {
        'http_retries': 3,
        'meta': ArticleMetadata._default_args,
        'debug': False,
        # (tag, attrs) pairs whose subtrees are dropped from the fulltext
        'full_text_exclude': [
            ('aside', {'class': 'related-topics'}),
            ('figure', {}),
            ('div', {'class': 'ad-info'}),
            ('div', {'class': 'float-container'}),
            ('a', {'class': ['text-link--external', 'text-link']}),
        ]}

    def __init__(self, *, link:str=None, metadata:Union[ArticleMetadata, None]=None, session=None, **kwargs):
        super().__init__()
        self._debug = kwargs.get('debug', self._default_args['debug'])
        self._http_retries = kwargs.get('http_retries', self._default_args['http_retries'])
        # copy the meta defaults (one level of nested dicts is enough here):
        # mutating the class-level dict would leak overrides across instances
        self._meta_kwargs = {k: (dict(v) if isinstance(v, dict) else v)
                             for k, v in self._default_args['meta'].items()}
        if meta_args := kwargs.get('meta'):
            self.update_target_from_source(self._meta_kwargs, meta_args)
        self.add_debug(self._meta_kwargs)
        self.full_text_exclude = kwargs.get('full_text_exclude', self._default_args['full_text_exclude'])
        self.session = session
        self.meta = metadata or ArticleMetadata(**self._meta_kwargs)
        self.meta.link = link or self.meta.link
        self.full_text = None
        self.parse_page(self.meta.link)

    # parsers
    def parse_page(self, link):
        """Download *link* and dispatch to the article or error-page parser."""
        self.meta.page = self.get_page(link)
        soupy_page = BeautifulSoup(self.meta.page.content, 'html.parser')
        if article := soupy_page.find('article'):
            self.parse_article(article)
        if error_page := soupy_page.find('div', {'class': 'error-page'}):
            self.parse_error_page(error_page)

    def parse_error_page(self, error_page):
        """Use the error-page message text (links stripped) as the fulltext."""
        with contextlib.suppress(AttributeError):
            wrapper = error_page.find('div', {'class': 'error-page__wrapper'})
            self.full_text = self.get_fulltext(wrapper, exclude=('a',))

    def parse_article(self, article):
        """Refresh title/suptitle/authors/time from the full article page and
        extract the body fulltext."""
        with contextlib.suppress(AttributeError):
            self.meta.title.title = self.get_fulltext(article.find('span', {'class': 'document-title__headline'}))
        with contextlib.suppress(AttributeError):
            self.meta.title.suptitle = self.get_fulltext(article.find('span', {'class': 'document-title__kicker'}))
        with contextlib.suppress(AttributeError):
            if article.find('div', {'class': 'author'}):
                # single-author layout
                self.meta.authors = [self.get_fulltext(article.find('span', {'class': 'author__name'}))]
            elif article.find('div', {'class': 'authors'}):
                # multi-author layout
                authors = article.find_all('div', {'class': 'article_author__details'})
                self.meta.authors = [self.get_fulltext(details) for details in authors]
        with contextlib.suppress(AttributeError, KeyError):
            if date := article.find('time', {'class': ['datetime']}):
                datestring = date.attrs['datetime']
                self.meta.time = dateutil.parser.parse(datestring).astimezone(dateutil.tz.tzlocal())
        with contextlib.suppress(AttributeError):
            body = article.find_all('div', {'class': 'article-body'})
            self.full_text = self.get_fulltext(body)

    def _clean_exclude_list(self, excludes):
        """Normalise an exclude spec to ``[(name, {attr: [values]}), ...]``."""
        if excludes is None:
            return excludes
        excl_names = []
        excl_attrs = []
        for excl in excludes:
            if isinstance(excl, (list, tuple)):
                excl_names.append(excl[0])
                try:
                    # force every attr value into a list so it compares
                    # equal to bs4's list-valued attributes (e.g. class)
                    local_attr = {
                        k: v if isinstance(v, (list, tuple)) else [v]
                        for k, v in excl[1].items()
                    }
                    excl_attrs.append(local_attr)
                except (KeyError, IndexError):
                    excl_attrs.append({})
            else:
                # bare tag name, no attribute constraint
                excl_names.append(excl)
                excl_attrs.append({})
        return list(zip(excl_names, excl_attrs))

    def skip_element(self, elm, excludes):
        """True if *elm* is an HTML comment or matches an exclude entry."""
        if isinstance(elm, Comment):
            return True
        if excludes is None:
            return False
        for excl_name, excl_attr in excludes:
            if elm.name == excl_name:
                if not excl_attr:
                    return True
                for k, v in excl_attr.items():
                    with contextlib.suppress(KeyError):
                        if elm.attrs[k] == v:
                            return True
        return False

    def get_fulltext(self, html:Union[PageElement, list], exclude:Union[list, None]=None, sep:str=' '):
        """Recursively collect the text of *html*, skipping excluded subtrees.

        Falls back to ``self.full_text_exclude`` when *exclude* is None.
        """
        if html is None:
            return ''
        if exclude is not None:
            exclude = self._clean_exclude_list(tuple(exclude))
        else:
            # the original used the raw default list here, so scalar attr
            # values (e.g. 'related-topics') never matched bs4's list-valued
            # attrs and those excludes were silently ignored — clean it too
            # (cleaning is idempotent, so recursive calls are unaffected)
            exclude = self._clean_exclude_list(self.full_text_exclude)
        local_elems = []
        for elm in html:
            if self.skip_element(elm, exclude):
                continue
            if isinstance(elm, NavigableString):
                local_elems.append(elm)
            elif isinstance(elm, PageElement):
                local_elems.append(self.get_fulltext(elm, exclude=exclude, sep=sep))
        return sep.join(local_elems).strip()

    # util
    def to_csv_line(self, delimiter:str=',', quote_char:str='"', newline=True):
        """Metadata CSV line plus a final quoted fulltext column."""
        def _quote(s:str):
            return f'{quote_char}{s}{quote_char}'
        line = delimiter.join((
            self.meta.csv_line(delimiter=delimiter, quote_char=quote_char, newline=False),
            _quote(' '.join(self.full_text.splitlines())) if self.full_text else '')
        ) + ('\n' if newline else '')
        return line

    def __repr__(self):
        department = self.meta.department.department if self.meta.department else ''
        title = self.meta.title.title if self.meta.title else ''
        full_text = self.full_text or ''
        datestr = self.meta.time.strftime('%d.%m.%Y %H:%M:%S') if self.meta.time else ''
        # the original interpolated islice(full_text, 100), which renders as
        # '<itertools.islice object ...>' — slice the string instead
        return f'[{department}] {title} ({datestr}): {full_text[:100]}...'

    def __str__(self) -> str:
        meta = self.meta
        suptitle = meta.title.suptitle if meta.title else ''
        title = meta.title.title if meta.title else ''
        department = meta.department.department if meta.department else ''
        link_str = meta.department._link_str if meta.department else ''
        datestr = meta.time.strftime('%d.%m.%Y %H:%M:%S') if meta.time else ''
        status = [meta.page.status_code]  # bracketed like the original output
        return (
            f'{suptitle}\n'
            f'{title}\n'
            f'{department}\n'
            f'{link_str}\n'
            f'{datestr}\n'
            f'{meta.link or ""} {status}\n'
            f'{self.full_text or ""}\n'
        )
############
class ArticleCollection(ArticleBaseClass):
    """Scrapes the BILD archive: one archive page per calendar day between
    min_date and max_date, first collecting per-article metadata, then the
    fulltexts, streaming each finished article to a CSV file.

    Failed fetches are appended to ``failed_file`` instead of aborting.
    NOTE(review): the nested same-quote f-strings below require Python >= 3.12.
    """
    _default_args = {
        'min_date': datetime.datetime(year=2006, month=1, day=6),
        # NOTE(review): evaluated once at import time, not per instantiation
        'max_date': datetime.datetime.now(),
        'random': True,
        'out_file': 'out.csv',
        'out_file_mode': 'new',
        'out_file_header': 'date,time,title,suptitle,department,[link_departments],link,http status code,full text',
        'failed_file': 'failed.txt',
        'http_retries': 5,
        'retries': 2,
        'base_link': 'https://www.bild.de/themen/uebersicht/archiv/archiv-82532020.bild.html?archiveDate=',
        'link_time_format': '%Y-%m-%d',
        'article_args': Article._default_args,
        'debug': False,
    }
    # accepted values for out_file_mode, mapped to overwrite/append semantics
    _file_modes_overwrite = ('new', 'overwrite', 'write', 'w')
    _file_modes_append = ('append', 'a')
    _file_modes = (*_file_modes_overwrite, *_file_modes_append)

    def __init__(self, session:Union[requests.Session,None]=None, **kwargs):
        """Configure the date range, output files and per-article options.

        session: optional shared requests.Session (otherwise created in collect()).
        All other options come from _default_args and may be overridden via kwargs.
        Raises AttributeError for an unknown out_file_mode.
        """
        self._debug = kwargs.get('debug', self._default_args['debug'])
        super().__init__(debug=self._debug)
        self._min_date = kwargs.get('min_date', self._default_args['min_date'])
        self._max_date = kwargs.get('max_date', self._default_args['max_date'])
        # work with plain dates; there is one archive page per calendar day
        self._max_date = self._max_date.date()
        self._min_date = self._min_date.date()
        self._random = kwargs.get('random', self._default_args['random'])
        # NOTE(review): this aliases (and below mutates) the class-level
        # default dict rather than a copy — shared across instances
        self._article_args = self._default_args['article_args']
        if article_args := kwargs.get('article_args'):
            self.update_target_from_source(self._article_args, article_args)
        self.add_debug(self._article_args)
        self._out_file = kwargs.get('out_file', self._default_args['out_file'])
        self._out_file_mode = kwargs.get('out_file_mode', self._default_args['out_file_mode'])
        if self._out_file_mode not in self._file_modes:
            raise AttributeError(f'file mode {self._out_file_mode} unknown. supported: [{','.join(self._file_modes)}]')
        self._out_file_header = kwargs.get('out_file_header', self._default_args['out_file_header'])
        # expand the [link_departments] placeholder into one named column per
        # padded link-department slot so the header width matches csv_line
        max_link_departments = self._article_args.get('meta', {}).get('department', {}).get('max_link_departments', self._default_args['article_args']['meta']['department']['max_link_departments'])
        link_dep_strings = [f'department from link {i}' for i in range(max_link_departments)]
        self._out_file_header = self._out_file_header.replace('[link_departments]', ','.join(link_dep_strings))
        self._failed_file = kwargs.get('failed_file', self._default_args['failed_file'])
        self._http_retries = kwargs.get('http_retries', self._default_args['http_retries'])
        self._retries = kwargs.get('retries', self._default_args['retries'])
        self._base_link = kwargs.get('base_link', self._default_args['base_link'])
        self._link_time_format = kwargs.get('link_time_format', self._default_args['link_time_format'])
        self.prepare_dates()
        self.prepare_files()
        self.articles = []
        self.article_metas = []
        self.session = session
        # wrap the inherited get_page so any exception is logged to the
        # failed-file via the callback instead of propagating; the wrapped
        # call then returns None (callers must tolerate that)
        self.get_page = failhandler(callback=self.write_failed_to_file)(lambda args: ArticleCollection.get_page(self, args))

    def prepare_dates(self):
        """Build the list of days to scrape (inclusive), optionally shuffled."""
        self.dates = [self._max_date - datetime.timedelta(days=x) for x in range((self._max_date - self._min_date).days+1)]
        if self._random:
            random.shuffle(self.dates)

    def collect(self):
        """Main entry point: gather metadata for every day, then fulltexts."""
        self.session = self.get_session()
        print(f'Collecting article metadata from archive pages for {len(self.dates)} days')
        for i, date in enumerate(self.dates):
            link = self.build_archive_link(date)
            self.print_date(date, link, prefix=f'Date {i+1:>{len(str(len(self.dates)))}}/{len(self.dates)} ')
            self.process_archive_page(link)
        print()
        print(f'Collecting fulltext for {len(self.article_metas)} articles')
        self.get_fulltexts()
        self.close_session()

    def build_archive_link(self, date):
        """Archive-page URL for *date* (base link + formatted date suffix)."""
        return f'{self._base_link}{date.strftime(self._link_time_format)}'

    def print_date(self, date:datetime.datetime, link:str=None, fmt:str=None, prefix:str=None, suffix:str=None):
        """Print *date* underlined, as a terminal hyperlink when *link* is given."""
        if fmt is None:
            fmt = self._link_time_format
        print(f'{prefix or ''}{AC.UNDERLINE}{lk(link,date.strftime(fmt)) if link else date.strftime(fmt)}{AC.DEFAULT}{suffix or ''}')

    def prepare_files(self):
        """Create/truncate the output CSV (writing the header) and the
        failed-links file according to the configured file mode."""
        if self._out_file_mode in self._file_modes:
            if self._out_file_mode in self._file_modes_overwrite and self._out_file:
                with open(self._out_file, 'w') as f:
                    f.write(self._out_file_header.strip()+'\n')
            elif self._out_file_mode in self._file_modes_append and self._out_file:
                # append mode: keep the existing file untouched
                ...
        else:
            # unreachable in practice: __init__ already validated the mode
            raise ValueError(f'file mode \'{self._out_file_mode}\' not supported. supported: {self._file_modes}')
        if self._failed_file:
            # always start with an empty failed-links file
            with open(self._failed_file, 'w') as f:
                f.write('')

    def process_archive_page(self, link):
        """Parse one archive page and store the metadata of every teaser on it."""
        page = self.get_page(link)
        soupy_page = BeautifulSoup(page.content, 'html.parser')
        articles_html = soupy_page.find_all("article", {"class": "stage-feed-item"})
        # debug mode only looks at the first 3 teasers per day
        slice_args = (None, 3, None) if self._debug else (None, None, 1)
        for article_html in islice(articles_html, *slice_args): # debugging
            article_metadata = ArticleMetadata(article_html, 'https://www.bild.de', **self._article_args.get('meta', {}))
            self.print_article_metadata(article_metadata)
            # save metadata
            self.article_metas.append(article_metadata)

    def get_fulltexts(self):
        """Fetch and parse the full article page for every collected metadata."""
        if self._random:
            random.shuffle(self.article_metas)
        for i, article_metadata in enumerate(self.article_metas):
            self.print_article_metadata(article_metadata, prefix=f'{i+1:>{len(str(len(self.article_metas)))}}/{len(self.article_metas)} ')
            self.process_article_from_meta(article_metadata)

    def process_article_from_meta(self, article_metadata):
        """Build an Article (fetches its page) and append its CSV line;
        connection failures are logged to the failed-file instead of raised."""
        try:
            art = Article(metadata=article_metadata, session=self.session, **self._article_args)
            self.articles.append(art)
            if self._out_file:
                with open(self._out_file, 'a') as f:
                    f.write(art.to_csv_line())
        # NOTE(review): ConnectionError here is the builtin; requests'
        # ConnectionError is a subclass of it, so both are caught
        except (MaxRetryError,ConnectionError) as e:
            if self._debug:
                print(e)
            self.write_failed_to_file(e, article_metadata)

    def print_article_metadata(self, metadata, *, date_fmt=None, time_fmt=None, prefix:str=None, suffix:str=None):
        """Pretty-print one article's metadata with ANSI colors; a non-200
        HTTP status (when known) is shown in bold red."""
        if date_fmt is None:
            date_fmt = self._link_time_format
        if time_fmt is None:
            time_fmt = '%H:%M:%S'
        datetime_fmt = f'{date_fmt} {time_fmt}'
        link = '' or metadata.link
        timestr = (
            AC.FG_BRIGHT_GREY +
            metadata.time.strftime(datetime_fmt)
            + AC.DEFAULT
        ) if metadata.time else ''
        suptitle = (
            AC.FG_BLUE +
            metadata.title.suptitle
            + AC.DEFAULT
        ) if metadata.title.suptitle else ''
        title = (
            AC.STYLE_BOLD + AC.FG_BRIGHT_PURPLE +
            metadata.title.title
            + AC.DEFAULT
        ) if metadata.title.title else ''
        error_string = (
            AC.STYLE_BOLD + AC.FG_BRIGHT_RED +
            f'[{metadata.page.status_code}]'
            + AC.DEFAULT
        ) if (metadata.page and metadata.page.status_code != 200) else ''
        print(f'{prefix or ''}{timestr} {error_string}({suptitle}) {lk(link, title) if link else title}{suffix or ''}')

    def write_failed_to_file(self, exception, elem):
        """Append a failed link (from a metadata object or a raw URL string)
        together with the exception text to the failed-file."""
        with open(self._failed_file, 'a') as f:
            if isinstance(elem, ArticleMetadata):
                f.write(f'{elem.link}, "{exception}"\n')
            elif isinstance(elem, str):
                f.write(f'{elem}, "{exception}"\n')

View File

@@ -0,0 +1,88 @@
import itertools
def print_format_table():
    """Print every SGR code 0-107 rendered in its own style, 18 cells per row.

    Each cell shows the escape sequence (e.g. ``\\x1b[1m``) formatted with the
    style that sequence selects, then resets with ``\\x1b[m``.
    """
    # SGR quick reference (ECMA-48): 0 reset, 1 bold, 2 dim, 3 italic,
    # 4 underline, 5/6 blink, 7 invert, 8 conceal, 9 strike,
    # 21 double underline, 22-29 attribute resets, 30-37 fg colors,
    # 38 extended fg (5;n or 2;r;g;b), 39 default fg, 40-47 bg colors,
    # 48 extended bg, 49 default bg, 53/55 overline on/off,
    # 73-75 super/subscript (mintty), 90-97 bright fg, 100-107 bright bg.
    cells = []
    for code in range(108):
        pad = ' ' * (3 - len(str(code)))
        # newline after every 18th cell, otherwise a space separator
        tail = ' ' if (code + 1) % 18 else '\n'
        cells.append(f'\x1b[0;{code}m\\x1b[{code}m\x1b[m{pad}{tail}')
    print(''.join(cells))
# SGR codes verified to render correctly in the author's terminal.
# NOTE(review): this tuple is not referenced anywhere in this file — kept as
# documentation of which codes from the table above are usable.
supported = (0, # reset
             1, # bold
             2, # dim
             22, # normal intensity
             3, # italic
             23, # ? neither italic nor blackletter
             53, # overlined
             55, # not overlined
             4, # underline
             21, # dunderline
             24, # ? not underlined
             9, # strike
             29, # not strike
             7, # invert
             27, # not inverted
             8, # hidden
             28, # not hidden
             30, 31, 32, 33, 34, 35, 36, 37, # fg color
             38, # fg color 38;5;n or 38;2;r;g;b
             39, # reset fg color
             40, 41, 42, 43, 44, 45, 46, 47, # bg color
             48, # bg color 48;5;n or 48;2;r;g;b
             49, # reset bg color
             90, 91, 92, 93, 94, 95, 96, 97, # bright fg color
             100, 101, 102, 103, 104, 105, 106, 107 # bright bg color
             )

# Render the demo table whenever this module is imported or executed.
print_format_table()

120
bild/selenium_imp/util.py Normal file
View File

@@ -0,0 +1,120 @@
from functools import wraps
import random
def link(uri, label=None):
    """Wrap *uri* in an OSC 8 terminal hyperlink escape, displaying *label*
    (the URI itself when no label is given)."""
    # OSC 8 ; params ; URI ST  <label>  OSC 8 ;; ST   (params unused here)
    text = uri if label is None else label
    return f'\033]8;;{uri}\033\\{text}\033]8;;\033\\'
class ANSICodes:
    """ANSI SGR escape sequences as class constants plus helpers for 256-color
    and truecolor selection.

    Color names follow this project's scheme (34=PURPLE, 35=PINK, 36=BLUE)
    rather than the ANSI standard names (34=blue, 35=magenta, 36=cyan); the
    bright variants mirror the same scheme.
    """
    DEFAULT = OFF = '\x1b[0m'
    BOLD = STYLE_BOLD = '\x1b[1m'
    DIM = STYLE_DIM = '\x1b[2m'
    REGULAR = STYLE_REGULAR = '\x1b[22m'
    ITALIC = STYLE_ITALIC = '\x1b[3m'
    NITALIC = STYLE_NITALIC = '\x1b[23m'
    UNDERLINE = DECO_UNDERLINE = '\x1b[4m'
    DUNDERLINE = DECO_DUNDERLINE = '\x1b[21m'
    NUNDERLINE = DECO_NUNDERLINE = '\x1b[24m'
    OVERLINE = DECO_OVERLINE = '\x1b[53m'
    NOVERLINE = DECO_NOVERLINE = '\x1b[55m'
    INVERT = DECO_INVERT = '\x1b[7m'
    NINVERT = DECO_NINVERT = '\x1b[27m'
    HIDDEN = DECO_HIDDEN = '\x1b[8m'
    NHIDDEN = DECO_NHIDDEN = '\x1b[28m'
    STRIKE = DECO_STRIKE = '\x1b[9m'
    NSTRIKE = DECO_NSTRIKE = '\x1b[29m'
    GREY = FG_GREY = '\x1b[30m'
    RED = FG_RED = '\x1b[31m'
    GREEN = FG_GREEN = '\x1b[32m'
    YELLOW = FG_YELLOW = '\x1b[33m'
    PURPLE = FG_PURPLE = '\x1b[34m'
    PINK = FG_PINK = '\x1b[35m'
    BLUE = FG_BLUE = '\x1b[36m'
    WHITE = FG_WHITE = '\x1b[37m'
    BRIGHT_GREY = FG_BRIGHT_GREY = '\x1b[90m'
    BRIGHT_RED = FG_BRIGHT_RED = '\x1b[91m'
    BRIGHT_GREEN = FG_BRIGHT_GREEN = '\x1b[92m'
    BRIGHT_YELLOW = FG_BRIGHT_YELLOW = '\x1b[93m'
    BRIGHT_PURPLE = FG_BRIGHT_PURPLE = '\x1b[94m'
    BRIGHT_PINK = FG_BRIGHT_PINK = '\x1b[95m'
    BRIGHT_BLUE = FG_BRIGHT_BLUE = '\x1b[96m'
    BRIGHT_WHITE = FG_BRIGHT_WHITE = '\x1b[97m'
    BG_GREY = '\x1b[40m'
    BG_RED = '\x1b[41m'
    BG_GREEN = '\x1b[42m'
    BG_YELLOW = '\x1b[43m'
    BG_PURPLE = '\x1b[44m'
    BG_PINK = '\x1b[45m'
    BG_BLUE = '\x1b[46m'
    BG_WHITE = '\x1b[47m'
    # code 100 was mis-named BG_BRIGHT_BLUE in the original (a duplicate of
    # the 106m entry below, which silently won); per the FG_BRIGHT_* scheme
    # above, 100 is the bright grey background
    BG_BRIGHT_GREY = '\x1b[100m'
    BG_BRIGHT_RED = '\x1b[101m'
    BG_BRIGHT_GREEN = '\x1b[102m'
    BG_BRIGHT_YELLOW = '\x1b[103m'
    BG_BRIGHT_PURPLE = '\x1b[104m'
    BG_BRIGHT_PINK = '\x1b[105m'
    BG_BRIGHT_BLUE = '\x1b[106m'
    BG_BRIGHT_WHITE = '\x1b[107m'

    @staticmethod
    def FG_CUSTOM_N(n, /):
        """256-color foreground selector.

        0-7: standard colors; 8-15: high-intensity; 16-231: 6x6x6 cube
        (16 + 36r + 6g + b, 0 <= r,g,b <= 5); 232-255: greyscale dark->light.
        """
        return f'\x1b[38;5;{n}m'

    @staticmethod
    def FG_CUSTOM_RGB(r, g, b, /):
        """Truecolor foreground; r, g, b in 0-255.

        Uses the 38;2;r;g;b form — the original emitted 38;5 (the 256-color
        selector), which misinterprets the arguments.
        """
        return f'\x1b[38;2;{r};{g};{b}m'

    @staticmethod
    def BG_CUSTOM_N(n, /):
        """256-color background selector; same palette layout as FG_CUSTOM_N."""
        return f'\x1b[48;5;{n}m'

    @staticmethod
    def BG_CUSTOM_RGB(r, g, b, /):
        """Truecolor background; r, g, b in 0-255 (48;2;r;g;b form)."""
        return f'\x1b[48;2;{r};{g};{b}m'
def debugging_rand(chance):
    """Randomly raise a RandomException with probability *chance*.

    *chance* may be a fraction (0-1) or a percentage (>1, divided by 100).
    Useful for injecting sporadic failures while testing error handling.
    """
    import random

    class RandomException(Exception):
        def __init__(self, *args):
            super().__init__(*args)

    if chance > 1:
        # values above 1 are treated as percentages
        chance /= 100
    if random.random() <= chance:
        raise RandomException(f'RandomException {chance*100}%')
def failhandler(callback, exceptions: 'Union[tuple, list, Exception, None]' = None):
    """Decorator factory: run the wrapped function and, on failure, hand the
    exception plus the original call arguments to *callback* instead of
    propagating it; the wrapped call then returns None.

    callback:   callable invoked as ``callback(exc, *args, **kwargs)``
    exceptions: exception type, or tuple/list of types, to catch
                (default: ``Exception``)

    The annotation is a string on purpose: this module never imports
    ``typing.Union``, so the original live annotation raised NameError the
    moment util.py was imported.
    """
    if exceptions is None:
        exceptions = Exception
    if isinstance(exceptions, list):
        # `except` requires a type or tuple; a list raises TypeError at
        # catch time, so normalise it here
        exceptions = tuple(exceptions)

    def fail_decorator(func):
        @wraps(func)
        def wrapped_function(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except exceptions as e:
                # swallow the error and delegate; implicit return None
                callback(e, *args, **kwargs)
        return wrapped_function
    return fail_decorator