initial commit
This commit is contained in: commit 00eca9aa4e
28 changed files with 2926 additions and 0 deletions
27  .chglog/config.yml  Executable file
@@ -0,0 +1,27 @@
style: none
template: CHANGELOG.tpl.md
info:
  title: CHANGELOG
  repository_url: git.fqserv.eu:takaoni/tenkan.git
options:
  commits:
    # filters:
    #   Type:
    #     - feat
    #     - fix
    #     - perf
    #     - refactor
  commit_groups:
    # title_maps:
    #   feat: Features
    #   fix: Bug Fixes
    #   perf: Performance Improvements
    #   refactor: Code Refactoring
  header:
    pattern: "^(\\w*)\\:\\s(.*)$"
    pattern_maps:
      - Type
      - Subject
  notes:
    keywords:
      - BREAKING CHANGE
140  .gitignore  vendored  Normal file
@@ -0,0 +1,140 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

.chglog/
55  .pre-commit-config.yaml  Normal file
@@ -0,0 +1,55 @@
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-json
      - id: check-added-large-files
      - id: double-quote-string-fixer
      - id: fix-encoding-pragma
      - id: no-commit-to-branch
      - id: name-tests-test
  - repo: https://gitlab.com/pycqa/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
  - repo: https://github.com/psf/black
    rev: 21.12b0
    hooks:
      - id: black
        name: black (python)
        args: ['-S']
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.931
    hooks:
      - id: mypy
        additional_dependencies: [pydantic]  # add if use pydantic
  - repo: https://github.com/PyCQA/isort
    rev: 5.10.1
    hooks:
      - id: isort
        name: isort (python)
        args: ['--profile', 'black']
  - repo: https://github.com/PyCQA/bandit
    rev: 1.7.1
    hooks:
      - id: bandit
        exclude: ^tests/
  - repo: https://github.com/asottile/pyupgrade
    rev: v2.31.0
    hooks:
      - id: pyupgrade
  - repo: local
    hooks:
      - id: pylint
        name: pylint
        entry: pylint --disable=E1101,E0401,C0301 --ignore=__init__.py --ignore-patterns=(.)*_test\.py,test_(.)*\.py
        language: system
        types: [python]
      - id: pytest
        name: Check pytest unit tests pass
        entry: pytest
        pass_filenames: false
        language: system
        types: [python]
0  CHANGELOG.md  Normal file
13  LICENCE  Normal file
@@ -0,0 +1,13 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
80  README.md  Normal file
@@ -0,0 +1,80 @@
# tenkan

Command line tool to convert HTTP RSS/Atom feeds to gemini format.

## Installation

```shell script
pip install tenkan
```

## Usage

Add a feed:

```shell script
# Any valid RSS/Atom feed
tenkan add feedname url
```

Update the content of the feed list:

```shell script
tenkan update
```

Delete a feed:

```shell script
tenkan delete feedname
```

List subscribed feeds:

```shell script
tenkan list
```

## Options

A debug mode is available via the --debug option.

If you want to use a configuration or feeds file located somewhere other than the default, use the --config and --feedsfile options.
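For example, to update feeds with debug logging while pointing tenkan at files in a non-default location (the paths below are only illustrative):

```shell script
tenkan --config /path/to/tenkan.conf --feedsfile /path/to/feeds.json --debug update
```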

## Configuration

tenkan searches for a configuration file at the following location:

`$XDG_CONFIG_HOME/tenkan/tenkan.conf`

### Example config

This example can also be found in tenkan.conf.example.

```ini
[tenkan]
gemini_path = /usr/local/gemini/
gemini_url = gemini://foo.bar/feeds/
# will purge feed folders having more than the defined element count
# purge_feed_folder_after = 100

[filters]
# authors we don't want to read
# authors_blacklist = foo, bar
# blacklist of article titles; if a title matches, the article won't be processed
# titles_blacklist = foo, bar
# blacklist of article links; if a link matches, the article won't be processed
# links_blacklist = foo/bar.com, bar/foo, bla

[formatting]
# maximum article title size, 120 chars if not provided
# title_size = 120

# feeds with truncated content
# will be fetched and converted using readability
# truncated_feeds = foo, bar
```

## Todolist

- [ ] Add an edit command
- [ ] Add a --feedname option to the update command, to update a single feed
- [ ] Rewrite configuration checks
- [ ] Improve tests
- [ ] Refactor parts that need it, such as write_article
- [ ] (not sure if relevant) migrate images too, for gemini clients that can handle it

## Development

I recommend using pre-commit. The pre-commit configuration I use is located in the .pre-commit-config.yaml file.

Run the pre-commit command before every pull request and fix the warnings or errors it produces.
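A minimal setup sketch (assuming pre-commit itself is installed from PyPI; it is not part of this commit):

```shell script
pip install pre-commit
pre-commit install           # install the git hook scripts
pre-commit run --all-files   # run every configured hook against the whole repo
```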
1287  poetry.lock  generated  Normal file
File diff suppressed because it is too large
66  pyproject.toml  Normal file
@@ -0,0 +1,66 @@
[tool.poetry]
name = "tenkan"
version = "0.1.0"
description = "RSS/atom feed converter from html to gemini"
authors = ["Quentin Ferrand <quentin.ferrand@protonmail.com>"]

[tool.poetry.dependencies]
python = "^3.8"
DateTime = "^4.3"
feedparser = "^6.0.8"
feedgen = "^0.9.0"
requests = "^2.26.0"
markdownify = "^0.10.0"
md2gemini = "^1.8.1"
readability-lxml = "^0.8.1"
rich = "^10.16.2"
prettytable = "^3.0.0"

[tool.poetry.dev-dependencies]
pytest = "^5.2"
black = {version = "^21.11b1", allow-prereleases = true}
flake8 = "^4.0.1"
mypy = "^0.910"
isort = "^5.10.1"
pytest-cov = "^3.0.0"
pylint = "^2.12.2"
pyupgrade = "^2.31.0"
bandit = "^1.7.1"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[tool.black]
line-length = 79
target-version = ['py38']
include = '\.pyi?$'
exclude = '''

(
  /(
      \.eggs       # exclude a few common directories in the
    | \.git        # root of the project
    | \.hg
    | \.mypy_cache
    | \.tox
    | \.venv
    | _build
    | buck-out
    | build
    | dist
  )/
  | foo.py         # also separately exclude a file named foo.py in
                   # the root of the project
)
'''

[tool.isort]
multi_line_output = 3
include_trailing_comma = true
force_grid_wrap = 0
use_parentheses = true
line_length = 79

[tool.poetry.scripts]
tenkan = "tenkan.cli:main"
21  tenkan.conf.example  Normal file
@@ -0,0 +1,21 @@
[tenkan]
gemini_path = /usr/local/gemini/
gemini_url = gemini://foo.bar/feeds/
# will purge feed folders having more than the defined element count
# purge_feed_folder_after = 100

[filters]
# authors we don't want to read
# authors_blacklist = foo, bar
# blacklist of article titles; if a title matches, the article won't be processed
# titles_blacklist = foo, bar
# blacklist of article links; if a link matches, the article won't be processed
# links_blacklist = foo/bar.com, bar/foo, bla

[formatting]
# maximum article title size, 120 chars if not provided
# title_size = 120

# feeds with truncated content
# will be fetched and converted using readability
# truncated_feeds = foo, bar
2  tenkan/__init__.py  Normal file
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
__version__ = '0.1.0'
214  tenkan/cli.py  Normal file
@@ -0,0 +1,214 @@
# -*- coding: utf-8 -*-

"""
cli module

Parses the command line arguments and runs the other modules as needed,
depending on which command is given.
"""

import configparser
import logging
import sys
from argparse import ArgumentParser, RawTextHelpFormatter
from datetime import datetime
from pathlib import Path
from typing import NoReturn

from rich.traceback import install

from tenkan.config import load_config
from tenkan.feedsfile import (
    add_feed,
    create,
    del_feed,
    list_feeds,
    update_last_run,
)
from tenkan.files import delete_folder
from tenkan.processing import (
    fetch_feeds,
    prepare_fetched_content,
    process_fetched_feeds,
    write_processed_feeds,
)

# rich tracebacks
install(show_locals=True)


class MyParser(ArgumentParser):  # pylint: disable=too-few-public-methods
    """Child class to print help msg if no or bad args given"""

    def error(self, message: str) -> NoReturn:
        """exit"""
        sys.stderr.write(f'error: {message}')
        self.print_help()
        sys.exit(2)


def load_args(args: list):
    """args parsing function"""

    desc = 'tenkan : RSS/atom feed converter from html to gemini\n\nTo show the detailed help of a COMMAND run `tenkan COMMAND --help`.'
    parser = MyParser(
        description=desc, prog='tenkan', formatter_class=RawTextHelpFormatter
    )

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s 0.1.0',
        help='show %(prog)s version number and exit',
    )

    parser.add_argument(
        '--config',
        default=f'{str(Path.home())}/.config/tenkan/tenkan.conf',
        help='config file, $HOME/.config/tenkan/tenkan.conf by default',
        dest='config',
    )
    parser.add_argument(
        '--feedsfile',
        default=f'{str(Path.home())}/.config/tenkan/feeds.json',
        help='feeds file containing feed list, $HOME/.config/tenkan/feeds.json by default',
        dest='feedsfile',
    )

    parser.add_argument(
        '--debug', action='store_true', help='debug mode', dest='debug'
    )

    subparsers = parser.add_subparsers(
        title='command', required=True, dest='command'
    )

    parser_add = subparsers.add_parser(
        'add', help='add a feed to the feeds list'
    )
    parser_add.add_argument(
        'name', help='the name of the feed you want to add'
    )
    parser_add.add_argument('url', help='the HTTP url of the feed')

    parser_update = subparsers.add_parser(
        'update', help='update feeds folder from feed list'
    )
    parser_update.add_argument(
        '--force',
        action='store_true',
        default=False,
        help='update feed list even if there is no new content',
    )

    parser_list = subparsers.add_parser(
        'list', help='list all feeds in feeds list'
    )
    parser_list.add_argument(
        'list', help='list all feeds in feeds list', action='store_true'
    )

    parser_delete = subparsers.add_parser(
        'delete', help='remove a feed from the feeds list'
    )
    parser_delete.add_argument(
        'name', help='the name of the feed you want to delete'
    )
    parser_delete.add_argument(
        '--delete-gmi-folder',
        help='delete gmi folder, True by default',
        action='store_true',
        default=False,
        dest='delete_folder',
    )
    return parser.parse_args(args)


def set_logging(args, config: configparser.ConfigParser) -> None:
    """define logging settings"""
    log = logging.getLogger()
    log.setLevel(logging.INFO)
    if args.debug:
        log.setLevel(logging.DEBUG)

    console_formatter = logging.Formatter(fmt='%(message)s')
    file_formatter = logging.Formatter(
        fmt='%(asctime)s %(levelname)s: %(message)s'
    )

    stdout_handler = logging.StreamHandler(stream=sys.stdout)
    stdout_handler.setFormatter(console_formatter)
    log.addHandler(stdout_handler)

    if config['tenkan'].get('log_file'):
        file_handler = logging.FileHandler(
            filename=config['tenkan'].get('log_file'),
            encoding='utf-8',
        )
        file_handler.setFormatter(file_formatter)
        log.addHandler(file_handler)


def run(args, config: configparser.ConfigParser) -> None:
    """run stuff depending on the command used"""
    # exit with error if json file not found with actions other than add
    if not Path(args.feedsfile).exists() and 'add' not in args.command:
        logging.error('No json file %s, can\'t continue', args.feedsfile)
        sys.exit(1)

    # list feeds in a pretty format
    if args.command == 'list':
        list_feeds(file=args.feedsfile)

    # add a feed to feeds file
    if args.command == 'add':
        # if home directory, creates json with empty structure if no file yet
        if not Path(args.feedsfile).parents[0].exists():
            if str(Path(args.feedsfile).parents[0]) == str(Path.home()):
                Path(args.feedsfile).parents[0].mkdir(
                    parents=True, exist_ok=True
                )
            else:
                logging.error(
                    'Directory of feeds file %s not found, exiting',
                    args.feedsfile,
                )
                sys.exit(1)
        if not Path(args.feedsfile).is_file():
            create(args.feedsfile)
        add_feed(file=args.feedsfile, feed_name=args.name, feed_url=args.url)

    # delete a feed from feeds file
    if args.command == 'delete':
        del_feed(file=args.feedsfile, feed_name=args.name)
        if args.delete_folder:
            delete_folder(
                path=config['tenkan']['gemini_path'], feed_name=args.name
            )

    # update content
    if args.command == 'update':
        fetched_feeds = fetch_feeds(
            feeds_file=args.feedsfile,
            gmi_url=config['tenkan']['gemini_url'],
        )
        print('')
        fetched_feeds = prepare_fetched_content(fetched_feeds, args.force)
        feed_list = process_fetched_feeds(
            config=config,
            fetched_feeds=fetched_feeds,
            force=args.force,
        )
        if feed_list:
            write_processed_feeds(args, config, feed_list)
        else:
            logging.info('No new content to process, stopping')
        update_last_run(args.feedsfile, str(datetime.now()))


def main() -> None:
    """load conf, args, set logging and run main program"""

    args = load_args(args=sys.argv[1:])
    config = load_config(args.config)
    set_logging(args, config)
    run(args, config)
57  tenkan/config.py  Normal file
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-

"""config module : configuration file parsing"""

import configparser
import logging
import sys
from pathlib import Path


def load_config(config_file) -> configparser.ConfigParser:
    """config load"""

    # exit with error if config file not found
    if not Path(config_file).exists():
        logging.error('No config file found %s, exiting', config_file)
        sys.exit(1)

    parser = configparser.ConfigParser()
    parser.read(config_file)
    if 'tenkan' not in parser.sections():
        logging.critical(
            "Missing [tenkan] section in config file %s, can't go further",
            config_file,
        )
        sys.exit(1)

    # shitty checks of config content
    # to improve later...
    for opt in ['gemini_path', 'gemini_url']:
        if not parser.has_option('tenkan', opt):
            logging.error('Missing option %s', opt)
            sys.exit(1)

    if parser.has_option('tenkan', 'purge_feed_folder_after'):
        if not int(parser['tenkan']['purge_feed_folder_after']):
            logging.error(
                'Wrong type for purge_feed_folder_after option, should be a number'
            )
            sys.exit(1)

    if parser.has_section('filters'):
        for item in parser['filters']:
            parser['filters'][item] = parser['filters'][item].replace(' ', '')

    if parser.has_option('formatting', 'truncated_feeds'):
        parser['formatting']['truncated_feeds'] = parser['formatting'][
            'truncated_feeds'
        ].replace(' ', '')

    if parser.has_option('formatting', 'title_size') and not int(
        parser['formatting']['title_size']
    ):
        logging.error('Wrong type for title_size option, should be a number')
        sys.exit(1)

    return parser
196  tenkan/feed.py  Normal file
@@ -0,0 +1,196 @@
# -*- coding: utf-8 -*-

""" feed module : feed object """

import logging
import re
import sys
from datetime import datetime, timezone
from typing import List

import requests  # type: ignore
from markdownify import markdownify  # type: ignore
from md2gemini import md2gemini  # type: ignore
from readability import Document  # type: ignore
from requests.adapters import HTTPAdapter  # type: ignore
from urllib3.util.retry import Retry

from tenkan.utils import measure


class Feed:
    """
    receives various feed data and applies necessary changes to make it usable into files
    """

    def __init__(
        self,
        input_content: dict,
        filters=None,
        formatting=None,
    ) -> None:
        self.content = input_content
        self.filters = filters
        self.formatting = formatting
        self.new_entries: list = []

    def needs_update(self) -> bool:
        """Checks if updates are available"""
        if not self.content['json_hash_last_update']:
            return True
        if (
            self.content['json_hash_last_update']
            != self.content['fetched_hash_last_update']
        ):
            return True
        return False

    @measure
    def get_new_entries(self) -> None:
        """Selects new entries depending on filters defined on config file"""
        for entry in self.content['fetched_content']['entries']:
            if (
                any(
                    x in entry['title']
                    for x in self.filters.get('titles_blacklist', '').split(
                        ','
                    )
                )
                or any(
                    x in entry['link']
                    for x in self.filters.get('links_blacklist', '').split(',')
                )
                or any(
                    # feedparser object can be problematic sometimes
                    # we need to check if we have an authors item
                    # AND we check if we can get its name because it can be empty
                    # AND if we don't have any of these, we return a stupid string
                    # to match the str type which is expected
                    x
                    in (
                        entry.get('authors')
                        and entry.authors[0].get('name')
                        or 'random string'
                    )
                    for x in self.filters.get('authors_blacklist', '').split(
                        ','
                    )
                )
            ):
                self.content['fetched_content']['entries'].remove(entry)
                continue
            self.new_entries.append(entry)

    @measure
    def export_content(self) -> dict:
        """Exports properly formatted content"""
        # create feed item structure
        data_export: dict[str, List] = {
            'title': self.content['title'],
            'last_update': self.content['last_update'],
            'gmi_url': self.content['gmi_url'],
            'articles': [],
            'hash_last_update': self.content['fetched_hash_last_update'],
        }
        for article in self.new_entries:
            article_formatted_title = self._format_article_title(article)
            article_date = self._get_article_date(article)

            # 2 possibilities to get content : content['value'] or summary
            content = (
                article['content'][0]['value']
                if article.get('content')
                else article['summary']
            )

            article_content = self._format_article_content(
                content, link=article['link']
            )

            data_export['articles'].append(
                {
                    'article_title': article['title'],
                    'article_formatted_title': article_formatted_title,
                    'article_content': article_content,
                    'article_date': article_date,
                    'http_url': article['link'],
                    'updated': article_date,
                }
            )

        return data_export

    @classmethod
    def _get_article_date(cls, article: dict) -> datetime:
        """get date string and return datetime object"""
        try:
            return (
                datetime(
                    *article.get(
                        'published_parsed', article['updated_parsed']
                    )[:6]
                )
                .replace(tzinfo=timezone.utc)
                .astimezone(tz=None)
            )
        except KeyError:
            logging.error(
                "Can't find a proper date field in article data, this should not happen !"
            )
            sys.exit(1)

    @measure
    def _format_article_title(self, article: dict) -> str:
        """title formatting to make it usable as a file title"""
        # truncate title size depending on title size
        maxlen = int(self.formatting.get('title_size', 120))
        if len(self.content['title']) + len(article['title']) > maxlen:
            maxlen = maxlen - len(self.content['title'])

        # We don't want multiline titles (yes, it happens)
        article['title'] = article['title'].replace('\n', '')[:maxlen]

        # remove special characters
        # probably not the best way to do it, as it seems there are performance issues here
        # to improve later if possible
        formatted_str = (
            article['title']
            .encode('utf8', 'ignore')
            .decode('utf8', 'ignore')
            .replace(' ', '-')
        )
        return re.sub('[«»!@#$%^&*(){};:,./<>?/|`~=_+]', '', formatted_str)[
            :maxlen
        ]

    @measure
    def _format_article_content(self, content: str, link: str) -> str:
        """
        Formats article content from html to gmi
        Will use readability if the feed is truncated, so it should retrieve the full content
        """

        # conversion to readability format if asked
        if self.content['title'] in self.formatting.get(
            'truncated_feeds', 'アケオメ'
        ).split(','):
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'
            }

            req = requests.Session()
            retries = Retry(
                total=5,
                backoff_factor=0.1,
                status_forcelist=[500, 502, 503, 504],
            )
            req.mount('http://', HTTPAdapter(max_retries=retries))
            req.mount('https://', HTTPAdapter(max_retries=retries))
            res = req.get(url=link, headers=headers)

            content = Document(res.text).summary()

        # convert html -> md -> gemini
        article = md2gemini(markdownify(content))

        return article
85  tenkan/feedsfile.py  Normal file
@@ -0,0 +1,85 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-

""" feedsfile module : json feeds file manipulation """

import json
import logging
from typing import Dict  # , Optional

from prettytable import PrettyTable


def create(file: str) -> None:
    """file creation"""
    with open(file, 'x') as _file:
        data: dict = {'feeds': {}}
        json.dump(data, _file)


def read(file: str) -> Dict[str, Dict[str, Dict[str, str]]]:
    """read file and return json data"""
    with open(file, 'r') as _file:
        file_data = json.load(_file)
    return file_data


def _write(file: str, file_data: Dict[str, Dict[str, Dict[str, str]]]) -> None:
    """write new data into file"""
    with open(file, 'w') as file_updated:
        json.dump(file_data, file_updated, indent=4)


def add_feed(file: str, feed_name: str, feed_url: str) -> None:
    """add a new feed into existing file"""
    file_data: Dict[str, Dict[str, Dict[str, str]]] = read(file)
    file_data['feeds'][feed_name] = {
        'url': feed_url,
        'last_update': '',
        'hash_last_update': '',
    }
    _write(file, file_data)
    logging.info('feed %s added', feed_name)


def del_feed(file: str, feed_name: str) -> None:
    """remove feed from file"""
    file_data = read(file)
    # don't do anything if no feed found
    if file_data['feeds'].get(feed_name):
        del file_data['feeds'][feed_name]
        _write(file, file_data)
        logging.info('feed %s deleted', feed_name)
    else:
        logging.info('no feed %s found in feeds file', feed_name)


def get_feed_item(file: str, feed_name: str, item: str) -> str:
    """Return element of a defined feed"""
    file_data = read(file)
    item = file_data['feeds'][feed_name][item]
    return item


def update_last_run(file: str, date: str) -> None:
    """Update last_run key in json file"""
    file_data: dict = read(file)
    file_data['last_run'] = date
    _write(file, file_data)


def update_feed(file: str, feed_name: str, hash_last_update: str) -> None:
    """update last update date of a defined feed"""
    file_data = read(file)
    file_data['feeds'][feed_name]['hash_last_update'] = hash_last_update
    _write(file, file_data)


def list_feeds(file: str) -> None:
    """list feed file content"""
    file_data = read(file)
    table = PrettyTable()
    table.field_names = ['Title', 'URL']
    for item, value in file_data['feeds'].items():
        table.add_row([item, value['url']])
    logging.info(table)
133  tenkan/files.py  Normal file
@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-

""" files module : generated gemini feeds files management """

import logging
import pathlib
import shutil
from typing import Dict, Union

from feedgen.feed import FeedGenerator  # type: ignore


def path_exists(path: str) -> bool:
    """Check if feed path exists"""
    if pathlib.Path(path).is_dir():
        return True
    return False


def write_files(path: str, data: dict, max_num_entries: int) -> None:
    """
    Converts feed objects into files and writes them in the feed folder
    """
    tpath = path
    path = path + data['title']
    pathlib.Path(path).mkdir(exist_ok=True)
    num_entries = 0
    # count entries in index file
    if pathlib.Path(f'{path}/index.gmi').is_file():
        num_entries = sum(1 for line in open(f'{path}/index.gmi'))

    # if there are more articles than defined in max_num_entries, delete and rewrite
    if num_entries > max_num_entries:
        delete_folder(tpath, data['title'])

    index_file_write_header(path, data['title'])
    urls = []
    art_output = {}
    for article in data['articles']:
        art_output = write_article(article, data, path)
        urls.append(art_output['url'])
    index_file_write_footer(path)
    # no need to update atom file if no new articles (write_article func returns url list)
    if art_output.get('new_file'):
        _rebuild_atom_file(path=path, data=data, urls=urls)


# def purge_folder(path: str) -> None:
#     """Purge folder with too many entries"""
#     logging.info('Purging %s folder', path)
#     files = [x for x in pathlib.Path(f'{path}').iterdir() if x.is_file()]
#     for file in files:
#         pathlib.Path.unlink(file)


def delete_folder(path: str, feed_name: str) -> None:
    """delete a feed folder"""
    if pathlib.Path(f'{path}{feed_name}/').exists():
        shutil.rmtree(f'{path}{feed_name}')
        logging.info('%s/%s folder deleted', path, feed_name)
    else:
        logging.info(
            'folder %s%s not present, nothing to delete', path, feed_name
        )


def index_file_write_header(path: str, title: str) -> None:
    """Write index header"""
    with open(f'{path}/index.gmi', 'w') as index:
        index.write(f'# {title}\n\n')
        index.write('=> ../ ..\n')


def index_file_write_footer(path: str) -> None:
    """Write index footer"""
    with open(f'{path}/index.gmi', 'a') as index:
        index.write('\n=> atom.xml Atom feed\n')


def write_article(
    article: dict, data: dict, path: str
) -> Dict[str, Union[bool, str]]:
    """Write individual article"""
    # prepare data for file format
    date = article['article_date']
    file_date = date.strftime('%Y-%m-%d_%H-%M-%S')
    date = date.strftime('%Y-%m-%d %H:%M:%S')
    file_title = article['article_formatted_title']
    content = article['article_content']

    # we add the entry into index file
    with open(f'{path}/index.gmi', 'a') as index:
        index.write(
            f"=> {file_date}_{file_title}.gmi {date} - {article['article_title']}\n"
        )

    new_file = False
    # write the file if it doesn't exist, obviously
    if not pathlib.Path(f'{path}/{file_date}_{file_title}.gmi').is_file():
        new_file = True
        logging.info('%s : adding entry %s', data['title'], file_title)
        # we write the entry file
        author = article['author'] if 'author' in article else None

        pathlib.Path(f'{path}/{file_date}_{file_title}.gmi').write_text(
            f"# {article['article_title']}\n\n=> {article['http_url']}\n\n{date}, {author}\n\n{content}"
        )
    url = f"{data['gmi_url']}{data['title']}/{file_date}_{file_title}.gmi"

    return {'new_file': new_file, 'url': url}


def _rebuild_atom_file(path: str, data: dict, urls: list) -> None:
    """rebuilds the atom file into gmi folder"""

    atomfeed = FeedGenerator()
    atomfeed.id(data['gmi_url'])
    atomfeed.title(data['title'])
    atomfeed.updated = data['last_update']
    atomfeed.link(href=f"{data['gmi_url']}.atom.xml", rel='self')
    atomfeed.link(href=data['gmi_url'], rel='alternate')

    # rebuild all articles
    for art, article in enumerate(data['articles']):
        atomentry = atomfeed.add_entry()
        url = urls[art]
        atomentry.guid(url)
        atomentry.link(href=url, rel='alternate')
        atomentry.updated(article['updated'])
        atomentry.title(article['article_title'])

    atomfeed.atom_file(f'{path}/atom.xml', pretty=True)
    logging.info('Wrote Atom feed for %s', data['title'])
114  tenkan/processing.py  Normal file
@@ -0,0 +1,114 @@
# -*- coding: utf-8 -*-

"""processing module : feeds file processing"""

import configparser
import hashlib
import json
import logging
import os
from concurrent.futures import ThreadPoolExecutor

import feedparser  # type: ignore

from tenkan.feed import Feed
from tenkan.feedsfile import read, update_feed
from tenkan.files import path_exists, write_files
from tenkan.utils import display_feeds_fetch_progress, measure


@measure
def fetch_feeds(feeds_file: str, gmi_url: str) -> list:
    """Fetch all http feeds with threads"""
    workers = os.cpu_count() or 1
    try:
        fetched_feeds = []
        with ThreadPoolExecutor(max_workers=workers) as executor:
            for item, values in read(feeds_file)['feeds'].items():
                fetched_feeds.append(
                    {
                        'title': item,
                        'fetched_content': executor.submit(
                            feedparser.parse, values['url']
                        ),
                        'gmi_url': gmi_url,
                        'last_update': values['last_update'],
                        'fetched_hash_last_update': None,
                        'json_hash_last_update': values['hash_last_update'],
                    }
                )
            display_feeds_fetch_progress(fetched_feeds)
        return fetched_feeds
    except json.decoder.JSONDecodeError as bad_json:
        raise bad_json


@measure
def prepare_fetched_content(fetched_feeds: list, force: bool = False) -> list:
    """Prepare some necessary data to be sent to feed object"""
    list_to_export = []
    for ftfd in fetched_feeds:
        try:
            # store workers result into fetched_content
            ftfd['fetched_content'] = ftfd['fetched_content'].result()  # type: ignore
            # we store a sha256 footprint of fetched content,
            # to compare to last known footprint
            tmp_hash = hashlib.sha256(
                str(ftfd['fetched_content'].get('entries')[0]).encode()
            )
            if tmp_hash.hexdigest() != ftfd['json_hash_last_update'] or force:
                ftfd['fetched_hash_last_update'] = tmp_hash.hexdigest()
                list_to_export.append(ftfd)
        # sometimes we don't get anything in fetched_content, so just ignore it
        except IndexError:
            pass
    return list_to_export


@measure
def process_fetched_feeds(
    config: configparser.ConfigParser, fetched_feeds: list, force: bool = False
) -> list:
    """Process previously fetched feeds"""
    feed_list = []
    for ftfd in fetched_feeds:
        # initialize feed object
        feed = Feed(
            input_content=ftfd,
            filters=config['filters'],
            formatting=config['formatting'],
        )
        # process feeds if there are updates since last run
        # or if the feed had never been processed
        # or if --force option is used
        if (
            feed.needs_update()
            or not path_exists(
                path=config['tenkan']['gemini_path'] + ftfd['title']
            )
            or force
        ):
            logging.info('Processing %s', ftfd['title'])
            feed.get_new_entries()
            feed_list.append(feed.export_content())
    return feed_list


@measure
def write_processed_feeds(
    args, config: configparser.ConfigParser, feed_list: list
) -> None:
    """Write files from processed feeds into gemini folder"""
    for files_data in feed_list:
        write_files(
            path=config['tenkan']['gemini_path'],
            data=files_data,
            max_num_entries=int(
                config['tenkan'].get('purge_feed_folder_after', '9999')
            ),
        )
        update_feed(
            file=args.feedsfile,
            feed_name=files_data['title'],
            hash_last_update=files_data['hash_last_update'],
        )
35  tenkan/utils.py  Normal file
@@ -0,0 +1,35 @@
# -*- coding: utf-8 -*-

"""utils module : various utils"""

import logging
from time import sleep, time


def display_feeds_fetch_progress(fetched_feeds: list) -> None:
    """Display feeds being fetched"""
    qsize = len(fetched_feeds)
    while True:
        done = len([x for x in fetched_feeds if x['fetched_content'].done()])
        print(f'Fetching feeds [{done}/{qsize}]', end='\r', flush=True)
        sleep(0.3)
        if done == qsize:
            break


def measure(func):
    """
    Decorator to measure the time taken by a func
    Used only in debug mode
    """

    def wrap_func(*args, **kwargs):
        time1 = time()
        result = func(*args, **kwargs)
        time2 = time()
        logging.debug(
            'Function %s executed in %ss', func.__name__, time2 - time1
        )
        return result

    return wrap_func
0  tests/__init__.py  Normal file
81  tests/cli_test.py  Normal file
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
import configparser
from pathlib import Path

import pytest

from tenkan.cli import load_args, load_config, run
from tenkan.feedsfile import add_feed, read


def test_config_loaded():
    config_file = Path('./tests/data/tenkan.conf')
    res = load_config(config_file)
    assert isinstance(res, configparser.ConfigParser)


def test_config_tenkan_section_missing():

    config_file = Path('./tests/data/tenkan.conf_fail')

    with pytest.raises(SystemExit) as pytest_wrapped_e:
        load_config(config_file)
    assert pytest_wrapped_e.type == SystemExit
    assert pytest_wrapped_e.value.code == 1


def test_arg_feedsfile_missing():
    args = load_args(['--feedsfile', '/tmp/toto.json', 'list'])
    config = Path('./tests/data/tenkan.conf')
    with pytest.raises(SystemExit) as pytest_wrapped_e:
        run(args, config)
    assert pytest_wrapped_e.type == SystemExit
    assert pytest_wrapped_e.value.code == 1


# def test_stupid_command():
#     args = load_args(['bla'])
#     config = Path('./tests/data/tenkan.conf')
#     with pytest.raises(SystemExit) as pytest_wrapped_e:
#         load_args(args)
#     assert pytest_wrapped_e.type == SystemExit
#     assert pytest_wrapped_e.value.code == 2


def test_add_cmd_feedsfile_missing(tmp_path):
    feeds = tmp_path / 'toto.json'
    args = load_args(['--feedsfile', str(feeds), 'add', 'blabla', 'blibli'])
    config = Path('./tests/data/tenkan.conf')
    run(args, config)
    assert Path(f'{feeds}').is_file()


def test_add_bad_feedsfile_folder():
    args = load_args(
        ['--feedsfile', '/tmp/tmp/tmp/titi.json', 'add', 'blabla', 'blibli']
    )
    config = Path('./tests/data/tenkan.conf')
    with pytest.raises(SystemExit) as pytest_wrapped_e:
        run(args, config)
    assert pytest_wrapped_e.type == SystemExit
    assert pytest_wrapped_e.value.code == 1


def test_del_cmd():
    feeds = Path('./tests/data/feeds.json')
    args = load_args(['--feedsfile', str(feeds), 'delete', 'tutu'])
    config = Path('./tests/data/tenkan.conf')
    add_feed(file=feeds, feed_name='tutu', feed_url='tata')
    run(args, config)
    data = read(file=feeds)
    assert not data['feeds'].get('tutu')


def test_update_cmd():
    feeds = Path('./tests/data/feeds.json')
    args = load_args(['--feedsfile', str(feeds), 'update'])
    config = load_config(str(Path('./tests/data/tenkan.conf')))
    data1 = read(file=feeds)['last_run']
    run(args, config)
    data2 = read(file=feeds)['last_run']
    assert data1 != data2
14  tests/config_test.py  Normal file
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-
from pathlib import Path

import pytest

from tenkan.config import load_config


def test_configfile_missing():
    config = Path('/tmp/toto.conf')
    with pytest.raises(SystemExit) as pytest_wrapped_e:
        load_config(config)
    assert pytest_wrapped_e.type == SystemExit
    assert pytest_wrapped_e.value.code == 1
10  tests/data/feeds.json  Normal file
@@ -0,0 +1,10 @@
{
    "last_run": "2022-01-12 21:31:10.703787",
    "feeds": {
        "srad-science": {
            "url": "https://srad.jp/science.rss",
            "last_update": null,
            "hash_last_update": ""
        }
    }
}
7  tests/data/feeds.json_fail  Normal file
@@ -0,0 +1,7 @@
{
    "feeds": {
        "srad-science": {
            "url": "https://srad.jp/science.rss",
            "last_update": null
        }
    }
15  tests/data/tenkan.conf  Normal file
@@ -0,0 +1,15 @@
[tenkan]
gemini_path = /tmp/
gemini_url = gemini://space.fqserv.eu/feeds/

[filters]
# authors we don't want to read
authors_blacklist = Rabaudy, Élise Costa, Sagalovitch, Pessin, Gallerey
titles_blacklist = Pinned
links_blacklist = slate.fr/audio, slate.fr/grand-format, slate.fr/boire-manger/top-chef

[formatting]
title_size = 120
# feeds with a truncated content
# will be fetched and converted using readability-lxml
truncated_feeds = gurumed, slate, cnrs
15  tests/data/tenkan.conf_fail  Normal file
@@ -0,0 +1,15 @@
#[tenkan]
#gemini_path = /tmp/hu/
#gemini_url = gemini://space.fqserv.eu/feeds/

[filters]
# authors we don't want to read
authors_blacklist = Rabaudy, Élise Costa, Sagalovitch, Pessin, Gallerey
titles_blacklist = Pinned
links_blacklist = slate.fr/audio, slate.fr/grand-format, slate.fr/boire-manger/top-chef

[formatting]
title_size = 120
# feeds with a truncated content
# will be fetched and converted using readability-lxml
truncated_feeds = gurumed, slate, cnrs
101  tests/feed_test.py  Normal file
@@ -0,0 +1,101 @@
# -*- coding: utf-8 -*-
from datetime import datetime, timezone

import pytest

from tenkan.feed import Feed

data = {
    'title': 'bla',
    'url': 'bla',
    'fetched_content': 'bla',
    'last_update': None,
    'gmi_url': 'bla',
    'json_hash_last_update': 'bl',
    'fetched_hash_last_update': 'bla',
}

article_data1 = {
    'title': 'article_title',
    'article_formatted_title': 'article_formatted_title',
    'article_content': {'summary': 'article_content'},
    'article_date': datetime(2022, 1, 7, 15, 25, 0, tzinfo=timezone.utc),
    'http_url': 'article_link',
    'updated': 'Fri, 07 Jan 2022 15:25:00 +0000',
    'updated_parsed': datetime(
        2022, 1, 7, 15, 25, 0, tzinfo=timezone.utc
    ).timetuple(),
}

article_data2 = {
    'title': 'article_title',
    'article_formatted_title': 'article_formatted_title',
    'article_content': {'summary': 'article_content'},
    'article_date': 'bad_date',
    'http_url': 'article_link',
    'updated_': 'bad_date',
}


def test_needs_update_no_last_update():
    data['json_hash_last_update'] = None
    feed = Feed(input_content=data)
    assert feed.needs_update() is True


def test_needs_update_last_update_ne_updated_field():
    feed = Feed(input_content=data)
    assert feed.needs_update() is True


def test_no_need_update():
    data['json_hash_last_update'] = 'bla'
    feed = Feed(input_content=data)
    assert feed.needs_update() is False


def test_content_exported():
    # TODO : use article_data
    feed = Feed(input_content=data)

    expected_data = {
        'title': 'bla',
        'last_update': None,
        'gmi_url': 'bla',
        'articles': [],
        'hash_last_update': 'bla',
    }

    assert feed.export_content() == expected_data


def test_date_format_published():
    data['articles'] = article_data1
    feed = Feed(input_content=data)
    assert (
        feed._get_article_date(article_data1)
        == data['articles']['article_date']
    )


def test_bad_date_format():
    data['articles'] = article_data2
    feed = Feed(input_content=data)
    with pytest.raises(SystemExit) as pytest_wrapped_e:
        feed._get_article_date(article_data2)
    assert pytest_wrapped_e.type == SystemExit
    assert pytest_wrapped_e.value.code == 1


def test_article_content_formatted():
    feed = Feed(input_content=data, formatting={'truncated_feeds': 'rien'})
    res = feed._format_article_content(content='coucou', link='blbl')
    assert res == 'coucou'


def test_title_formatted():
    feed = Feed(input_content=data, formatting={'title_size': 10})
    art = article_data1
    art['title'] = 'blabla / bla ?'
    res = feed._format_article_title(article=article_data1)
    assert res == 'blabla-'
43  tests/feedsfile_test.py  Normal file
@@ -0,0 +1,43 @@
# -*- coding: utf-8 -*-

from pathlib import Path

from tenkan.feedsfile import (
    add_feed,
    del_feed,
    get_feed_item,
    read,
    update_feed,
)


def test_get_feed_item():
    feeds = Path('./tests/data/feeds.json')
    item = get_feed_item(file=feeds, feed_name='srad-science', item='url')
    assert item == 'https://srad.jp/science.rss'


def test_update_hash():
    feeds = Path('./tests/data/feeds.json')
    update_feed(file=feeds, feed_name='srad-science', hash_last_update='blbl')
    item = get_feed_item(
        file=feeds, feed_name='srad-science', item='hash_last_update'
    )
    assert item == 'blbl'
    update_feed(file=feeds, feed_name='srad-science', hash_last_update='')


def test_add_feed():
    feeds = Path('./tests/data/feeds.json')
    add_feed(file=feeds, feed_name='toto', feed_url='tata')
    data = read(file=feeds)
    assert data['feeds'].get('toto')
    del_feed(file=feeds, feed_name='toto')


def test_del_feed():
    feeds = Path('./tests/data/feeds.json')
    add_feed(file=feeds, feed_name='tutu', feed_url='tata')
    del_feed(file=feeds, feed_name='tutu')
    data = read(file=feeds)
    assert not data['feeds'].get('tutu')
70  tests/files_test.py  Normal file
@@ -0,0 +1,70 @@
# -*- coding: utf-8 -*-
from datetime import datetime, timezone
from pathlib import Path

from tenkan.files import (
    _rebuild_atom_file,
    delete_folder,
    path_exists,
    write_article,
)

data: dict = {
    'title': 'bla',
    'url': 'bla',
    'fetched_content': 'bla',
    'last_update': None,
    'gmi_url': 'bla',
    'articles': [],
}

article_data = {
    'article_title': 'article_title',
    'article_formatted_title': 'article_formatted_title',
    'article_content': {'summary': 'article_content'},
    'article_date': datetime(2022, 1, 7, 15, 25, 0, tzinfo=timezone.utc),
    'http_url': 'article_link',
    'updated': 'Fri, 07 Jan 2022 15:25:00 +0000',
    'updated_parsed': datetime(
        2022, 1, 7, 15, 25, 0, tzinfo=timezone.utc
    ).timetuple(),
}


def test_path_exists(tmp_path):
    d = tmp_path / 'sub'
    d.mkdir()

    assert path_exists(d) is True


def test_path_doesnt_exist(tmp_path):
    d = tmp_path / 'sub'

    assert path_exists(d) is False


def test_article_written(tmp_path):
    path = tmp_path / 'sub'
    path.mkdir()
    date = article_data['article_date']
    file_date = date.strftime('%Y-%m-%d_%H-%M-%S')
    file_title = article_data['article_formatted_title']
    res = write_article(article=article_data, data=data, path=path)
    assert res['new_file'] is True
    assert (
        res['url']
        == f"{data['gmi_url']}{data['title']}/{file_date}_{file_title}.gmi"
    )


def test_folder_deleted(tmp_path):
    subpath = tmp_path / 'sub2'
    delete_folder(path=tmp_path, feed_name='sub2')
    assert not subpath.exists()


def test_atomfile_built(tmp_path):
    data['articles'].append(article_data)
    _rebuild_atom_file(path=tmp_path, data=data, urls=['bla'])
    assert Path(f'{tmp_path}/atom.xml').is_file()
45  tests/processing_test.py  Normal file
@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
from json import JSONDecodeError
from pathlib import Path

import feedparser
import pytest

from tenkan.config import load_config
from tenkan.processing import fetch_feeds, process_fetched_feeds

data = [
    {
        'title': 'bla',
        'url': 'bla',
        'fetched_content': None,
        'last_update': None,
        'gmi_url': 'bla',
        'json_hash_last_update': 'bli',
        'fetched_hash_last_update': 'bli',
    }
]


def test_feed_fetched():
    feeds = Path('./tests/data/feeds.json')

    res = fetch_feeds(feeds_file=feeds, gmi_url='blbl')
    assert type(res) is list
    assert len(res) == 1


def test_feed_raise_when_shitty_feedfile():
    feeds = Path('./tests/data/feeds.json_fail')

    with pytest.raises(JSONDecodeError):
        fetch_feeds(feeds_file=feeds, gmi_url='blbl')


def test_feed_processed():
    config_file = Path('./tests/data/tenkan.conf')
    conf = load_config(config_file)
    data[0]['fetched_content'] = feedparser.parse(
        'https://srad.jp/science.rss'
    )
    process_fetched_feeds(config=conf, fetched_feeds=data)