initial commit
commit 00eca9aa4e
28 changed files with 2926 additions and 0 deletions
27
.chglog/config.yml
Executable file
@@ -0,0 +1,27 @@
style: none
template: CHANGELOG.tpl.md
info:
  title: CHANGELOG
  repository_url: git.fqserv.eu:takaoni/tenkan.git
options:
  commits:
    # filters:
    #   Type:
    #     - feat
    #     - fix
    #     - perf
    #     - refactor
  commit_groups:
    # title_maps:
    #   feat: Features
    #   fix: Bug Fixes
    #   perf: Performance Improvements
    #   refactor: Code Refactoring
  header:
    pattern: "^(\\w*)\\:\\s(.*)$"
    pattern_maps:
      - Type
      - Subject
  notes:
    keywords:
      - BREAKING CHANGE
140
.gitignore
vendored
Normal file
|
@@ -0,0 +1,140 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
.chglog/
|
55
.pre-commit-config.yaml
Normal file
|
@@ -0,0 +1,55 @@
|
|||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v4.1.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-json
|
||||
- id: check-added-large-files
|
||||
- id: double-quote-string-fixer
|
||||
- id: fix-encoding-pragma
|
||||
- id: no-commit-to-branch
|
||||
- id: name-tests-test
|
||||
- repo: https://gitlab.com/pycqa/flake8
|
||||
rev: 3.9.2
|
||||
hooks:
|
||||
- id: flake8
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 21.12b0
|
||||
hooks:
|
||||
- id: black
|
||||
name: black (python)
|
||||
args: ['-S']
|
||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||
rev: v0.931
|
||||
hooks:
|
||||
- id: mypy
|
||||
additional_dependencies: [pydantic]  # add if the project uses pydantic
|
||||
- repo: https://github.com/PyCQA/isort
|
||||
rev: 5.10.1
|
||||
hooks:
|
||||
- id: isort
|
||||
name: isort (python)
|
||||
args: ['--profile', 'black']
|
||||
- repo: https://github.com/PyCQA/bandit
|
||||
rev: 1.7.1
|
||||
hooks:
|
||||
- id: bandit
|
||||
exclude: ^tests/
|
||||
- repo: https://github.com/asottile/pyupgrade
|
||||
rev: v2.31.0
|
||||
hooks:
|
||||
- id: pyupgrade
|
||||
- repo: local
|
||||
hooks:
|
||||
- id: pylint
|
||||
name: pylint
|
||||
entry: pylint --disable=E1101,E0401,C0301 --ignore=__init__.py --ignore-patterns=(.)*_test\.py,test_(.)*\.py
|
||||
language: system
|
||||
types: [python]
|
||||
- id: pytest
|
||||
name: Check pytest unit tests pass
|
||||
entry: pytest
|
||||
pass_filenames: false
|
||||
language: system
|
||||
types: [python]
|
0
CHANGELOG.md
Normal file
13
LICENCE
Normal file
@@ -0,0 +1,13 @@
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
Version 2, December 2004

Copyright (C) 2004 Sam Hocevar <sam@hocevar.net>

Everyone is permitted to copy and distribute verbatim or modified
copies of this license document, and changing it is allowed as long
as the name is changed.

DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION

0. You just DO WHAT THE FUCK YOU WANT TO.
80
README.md
Normal file
@@ -0,0 +1,80 @@
# tenkan

Command line tool to convert HTTP RSS/Atom feeds to the gemini format.

## Installation
```shell script
pip install tenkan
```

## Usage

Add a feed
```shell script
# Any valid RSS/Atom feed
tenkan add feedname url
```

Update the content of the feed list
```shell script
tenkan update
```

Delete a feed
```shell script
tenkan delete feedname
```

List subscribed feeds
```shell script
tenkan list
```
## Options
A debug mode is available via the --debug option.
If you want to use a configuration or feeds file in another place than the default one, use the --config and --feedsfile options.
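For example, to update feeds in debug mode while keeping both files outside the default locations (the paths below are only illustrative):
```shell script
tenkan --debug --config ~/myconf/tenkan.conf --feedsfile ~/myconf/feeds.json update
```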
## Configuration
tenkan searches for a configuration file at the following location:

`$XDG_CONFIG_HOME/tenkan/tenkan.conf`

### Example config
This can also be found in tenkan.conf.example.

```ini
[tenkan]
gemini_path = /usr/local/gemini/
gemini_url = gemini://foo.bar/feeds/
# will purge feed folders having more than the defined element count
# purge_feed_folder_after = 100

[filters]
# authors we don't want to read
# authors_blacklist = foo, bar
# blacklist of article titles; if a title matches, the article won't be processed
# titles_blacklist = foo, bar
# blacklist of article links; if a link matches, the article won't be processed
# links_blacklist = foo/bar.com, bar/foo, bla

[formatting]
# maximum article title size, 120 chars if not provided
# title_size = 120

# feeds with truncated content
# will be fetched and converted using readability
# truncated_feeds = foo, bar
```
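To start from this example, you could copy the repository's tenkan.conf.example into the default location (a sketch; adjust the paths to your setup):
```shell script
mkdir -p ~/.config/tenkan
cp tenkan.conf.example ~/.config/tenkan/tenkan.conf
```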
## Todolist
- [ ] Add an edit command
- [ ] Add a --feedname option to the update command, to update a single feed
- [ ] Rewrite configuration checks
- [ ] Improve tests
- [ ] Refactor needed parts like write_article
- [ ] (not sure if relevant) migrate images too, for gemini clients that can handle them

## Development
I recommend using pre-commit. The pre-commit configuration I use is located in the .pre-commit-config.yaml file.

Run the pre-commit command before every pull request and fix the warnings or errors it produces.
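A minimal sketch of that workflow, assuming pre-commit is installed from PyPI:
```shell script
# install pre-commit and register the hooks from .pre-commit-config.yaml
pip install pre-commit
pre-commit install
# run every hook against the whole repository before opening a pull request
pre-commit run --all-files
```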
1287
poetry.lock
generated
Normal file
File diff suppressed because it is too large
66
pyproject.toml
Normal file
|
@@ -0,0 +1,66 @@
|
|||
[tool.poetry]
|
||||
name = "tenkan"
|
||||
version = "0.1.0"
|
||||
description = "RSS/atom feed converter from html to gemini"
|
||||
authors = ["Quentin Ferrand <quentin.ferrand@protonmail.com>"]
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.8"
|
||||
DateTime = "^4.3"
|
||||
feedparser = "^6.0.8"
|
||||
feedgen = "^0.9.0"
|
||||
requests = "^2.26.0"
|
||||
markdownify = "^0.10.0"
|
||||
md2gemini = "^1.8.1"
|
||||
readability-lxml = "^0.8.1"
|
||||
rich = "^10.16.2"
|
||||
prettytable = "^3.0.0"
|
||||
|
||||
[tool.poetry.dev-dependencies]
|
||||
pytest = "^5.2"
|
||||
black = {version = "^21.11b1", allow-prereleases = true}
|
||||
flake8 = "^4.0.1"
|
||||
mypy = "^0.910"
|
||||
isort = "^5.10.1"
|
||||
pytest-cov = "^3.0.0"
|
||||
pylint = "^2.12.2"
|
||||
pyupgrade = "^2.31.0"
|
||||
bandit = "^1.7.1"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.black]
|
||||
line-length = 79
|
||||
target-version = ['py38']
|
||||
include = '\.pyi?$'
|
||||
exclude = '''
|
||||
|
||||
(
|
||||
/(
|
||||
\.eggs # exclude a few common directories in the
|
||||
| \.git # root of the project
|
||||
| \.hg
|
||||
| \.mypy_cache
|
||||
| \.tox
|
||||
| \.venv
|
||||
| _build
|
||||
| buck-out
|
||||
| build
|
||||
| dist
|
||||
)/
|
||||
| foo.py # also separately exclude a file named foo.py in
|
||||
# the root of the project
|
||||
)
|
||||
'''
|
||||
|
||||
[tool.isort]
|
||||
multi_line_output = 3
|
||||
include_trailing_comma = true
|
||||
force_grid_wrap = 0
|
||||
use_parentheses = true
|
||||
line_length = 79
|
||||
|
||||
[tool.poetry.scripts]
|
||||
tenkan = "tenkan.cli:main"
|
21
tenkan.conf.example
Normal file
|
@@ -0,0 +1,21 @@
|
|||
[tenkan]
|
||||
gemini_path = /usr/local/gemini/
|
||||
gemini_url = gemini://foo.bar/feeds/
|
||||
# will purge feed folders having more than defined element count
|
||||
# purge_feed_folder_after = 100
|
||||
|
||||
[filters]
|
||||
# authors we don't want to read
|
||||
# authors_blacklist = foo, bar
|
||||
# blacklist of article titles, if provided, it won't be processed
|
||||
# titles_blacklist = foo, bar
|
||||
# blacklist of article links, if provided, it won't be processed
|
||||
# links_blacklist = foo/bar.com, bar/foo, bla
|
||||
|
||||
[formatting]
|
||||
# maximum article title size, 120 chars if not provided
|
||||
# title_size = 120
|
||||
|
||||
# feeds with a truncated content
|
||||
# will be fetched and converted using readability
|
||||
# truncated_feeds = foo, bar
|
2
tenkan/__init__.py
Normal file
@@ -0,0 +1,2 @@
# -*- coding: utf-8 -*-
__version__ = '0.1.0'
214
tenkan/cli.py
Normal file
|
@@ -0,0 +1,214 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""
|
||||
cli module
|
||||
It parses args and runs what's needed in the other modules,
depending on which command is given
|
||||
"""
|
||||
|
||||
import configparser
|
||||
import logging
|
||||
import sys
|
||||
from argparse import ArgumentParser, RawTextHelpFormatter
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import NoReturn
|
||||
|
||||
from rich.traceback import install
|
||||
|
||||
from tenkan.config import load_config
|
||||
from tenkan.feedsfile import (
|
||||
add_feed,
|
||||
create,
|
||||
del_feed,
|
||||
list_feeds,
|
||||
update_last_run,
|
||||
)
|
||||
from tenkan.files import delete_folder
|
||||
from tenkan.processing import (
|
||||
fetch_feeds,
|
||||
prepare_fetched_content,
|
||||
process_fetched_feeds,
|
||||
write_processed_feeds,
|
||||
)
|
||||
|
||||
# rich tracebacks
|
||||
install(show_locals=True)
|
||||
|
||||
|
||||
class MyParser(ArgumentParser): # pylint: disable=too-few-public-methods
|
||||
"""Child class to print help msg if no or bad args given"""
|
||||
|
||||
def error(self, message: str) -> NoReturn:
|
||||
"""exit"""
|
||||
sys.stderr.write(f'error: {message}')
|
||||
self.print_help()
|
||||
sys.exit(2)
|
||||
|
||||
|
||||
def load_args(args: list):
|
||||
"""args parsing function"""
|
||||
|
||||
desc = 'tenkan : RSS/atom feed converter from html to gemini\n\nTo show the detailed help of a COMMAND run `tenkan COMMAND --help`.'
|
||||
parser = MyParser(
|
||||
description=desc, prog='tenkan', formatter_class=RawTextHelpFormatter
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'-v',
|
||||
'--version',
|
||||
action='version',
|
||||
version='%(prog)s 0.1.0',
|
||||
help='show %(prog)s version number and exit',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--config',
|
||||
default=f'{str(Path.home())}/.config/tenkan/tenkan.conf',
|
||||
help='config file, $HOME/.config/tenkan/tenkan.conf by default',
|
||||
dest='config',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--feedsfile',
|
||||
default=f'{str(Path.home())}/.config/tenkan/feeds.json',
|
||||
help='feeds file containing feed list, $HOME/.config/tenkan/feeds.json by default',
|
||||
dest='feedsfile',
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--debug', action='store_true', help='debug mode', dest='debug'
|
||||
)
|
||||
|
||||
subparsers = parser.add_subparsers(
|
||||
title='command', required=True, dest='command'
|
||||
)
|
||||
|
||||
parser_add = subparsers.add_parser(
|
||||
'add', help='add a feed to the feeds list'
|
||||
)
|
||||
parser_add.add_argument(
|
||||
'name', help='the name of the feed you want to add'
|
||||
)
|
||||
parser_add.add_argument('url', help='the HTTP url of the feed')
|
||||
|
||||
parser_update = subparsers.add_parser(
|
||||
'update', help='update feeds folder from feed list'
|
||||
)
|
||||
parser_update.add_argument(
|
||||
'--force',
|
||||
action='store_true',
|
||||
default=False,
|
||||
help='update feed list even if there is no new content',
|
||||
)
|
||||
|
||||
parser_list = subparsers.add_parser(
|
||||
'list', help='list all feeds in feeds list'
|
||||
)
|
||||
parser_list.add_argument(
|
||||
'list', help='list all feeds in feeds list', action='store_true'
|
||||
)
|
||||
|
||||
parser_delete = subparsers.add_parser(
|
||||
'delete', help='remove a feed from the feeds list'
|
||||
)
|
||||
parser_delete.add_argument(
|
||||
'name', help='the name of the feed you want to delete'
|
||||
)
|
||||
parser_delete.add_argument(
|
||||
'--delete-gmi-folder',
|
||||
help='delete gmi folder, True by default',
|
||||
action='store_true',
|
||||
default=False,
|
||||
dest='delete_folder',
|
||||
)
|
||||
return parser.parse_args(args)
|
||||
|
||||
|
||||
def set_logging(args, config: configparser.ConfigParser) -> None:
|
||||
"""define logging settings"""
|
||||
log = logging.getLogger()
|
||||
log.setLevel(logging.INFO)
|
||||
if args.debug:
|
||||
log.setLevel(logging.DEBUG)
|
||||
|
||||
console_formatter = logging.Formatter(fmt='%(message)s')
|
||||
file_formatter = logging.Formatter(
|
||||
fmt='%(asctime)s %(levelname)s: %(message)s'
|
||||
)
|
||||
|
||||
stdout_handler = logging.StreamHandler(stream=sys.stdout)
|
||||
stdout_handler.setFormatter(console_formatter)
|
||||
log.addHandler(stdout_handler)
|
||||
|
||||
if config['tenkan'].get('log_file'):
|
||||
file_handler = logging.FileHandler(
|
||||
filename=config['tenkan'].get('log_file'),
|
||||
encoding='utf-8',
|
||||
)
|
||||
file_handler.setFormatter(file_formatter)
|
||||
log.addHandler(file_handler)
|
||||
|
||||
|
||||
def run(args, config: configparser.ConfigParser) -> None:
|
||||
"""run stuff depending of command used"""
|
||||
# exit with error if json file not found with actions other than add
|
||||
if not Path(args.feedsfile).exists() and 'add' not in args.command:
|
||||
logging.error('No json file %s, can\'t continue', args.feedsfile)
|
||||
sys.exit(1)
|
||||
|
||||
# list feeds in a pretty format
|
||||
if args.command == 'list':
|
||||
list_feeds(file=args.feedsfile)
|
||||
|
||||
# add a feed to feeds file
|
||||
if args.command == 'add':
|
||||
# if home directory, creates json with empty structure if no file yet
|
||||
if not Path(args.feedsfile).parents[0].exists():
|
||||
if str(Path(args.feedsfile).parents[0]) == str(Path.home()):
|
||||
Path(args.feedsfile).parents[0].mkdir(
|
||||
parents=True, exist_ok=True
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
'Directory of feeds file %s not found, exiting',
args.feedsfile,
)
|
||||
sys.exit(1)
|
||||
if not Path(args.feedsfile).is_file():
|
||||
create(args.feedsfile)
|
||||
add_feed(file=args.feedsfile, feed_name=args.name, feed_url=args.url)
|
||||
|
||||
# delete a feed from feeds file
|
||||
if args.command == 'delete':
|
||||
del_feed(file=args.feedsfile, feed_name=args.name)
|
||||
if args.delete_folder:
|
||||
delete_folder(
|
||||
path=config['tenkan']['gemini_path'], feed_name=args.name
|
||||
)
|
||||
|
||||
# update content
|
||||
if args.command == 'update':
|
||||
fetched_feeds = fetch_feeds(
|
||||
feeds_file=args.feedsfile,
|
||||
gmi_url=config['tenkan']['gemini_url'],
|
||||
)
|
||||
print('')
|
||||
fetched_feeds = prepare_fetched_content(fetched_feeds, args.force)
|
||||
feed_list = process_fetched_feeds(
|
||||
config=config,
|
||||
fetched_feeds=fetched_feeds,
|
||||
force=args.force,
|
||||
)
|
||||
if feed_list:
|
||||
write_processed_feeds(args, config, feed_list)
|
||||
else:
|
||||
logging.info('No new content to process, stopping')
|
||||
update_last_run(args.feedsfile, str(datetime.now()))
|
||||
|
||||
|
||||
def main() -> None:
|
||||
"""load conf, args, set logging and run main program"""
|
||||
|
||||
args = load_args(args=sys.argv[1:])
|
||||
config = load_config(args.config)
|
||||
set_logging(args, config)
|
||||
run(args, config)
|
57
tenkan/config.py
Normal file
|
@@ -0,0 +1,57 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""config module : configuration file parsing"""
|
||||
|
||||
import configparser
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def load_config(config_file) -> configparser.ConfigParser:
|
||||
"""config load"""
|
||||
|
||||
# exit with error if config file not found
|
||||
if not Path(config_file).exists():
|
||||
logging.error('No config file found %s, exiting', config_file)
|
||||
sys.exit(1)
|
||||
|
||||
parser = configparser.ConfigParser()
|
||||
parser.read(config_file)
|
||||
if 'tenkan' not in parser.sections():
|
||||
logging.critical(
|
||||
"Missing [tenkan] section in config file %s, can't go further",
|
||||
config_file,
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
# shitty checks of config content
|
||||
# to improve later...
|
||||
for opt in ['gemini_path', 'gemini_url']:
|
||||
if not parser.has_option('tenkan', opt):
|
||||
logging.error('Missing option %s', opt)
|
||||
sys.exit(1)
|
||||
|
||||
if parser.has_option('tenkan', 'purge_feed_folder_after'):
|
||||
if not int(parser['tenkan']['purge_feed_folder_after']):
|
||||
logging.error(
|
||||
'Wrong type for purge_feed_folder_after option, should be a number'
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
if parser.has_section('filters'):
|
||||
for item in parser['filters']:
|
||||
parser['filters'][item] = parser['filters'][item].replace(' ', '')
|
||||
|
||||
if parser.has_option('formatting', 'truncated_feeds'):
|
||||
parser['formatting']['truncated_feeds'] = parser['formatting'][
|
||||
'truncated_feeds'
|
||||
].replace(' ', '')
|
||||
|
||||
if parser.has_option('formatting', 'title_size') and not int(
|
||||
parser['formatting']['title_size']
|
||||
):
|
||||
logging.error('Wrong type for title_size option, should be a number')
|
||||
sys.exit(1)
|
||||
|
||||
return parser
|
196
tenkan/feed.py
Normal file
|
@@ -0,0 +1,196 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
""" feed module : feed object """
|
||||
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from typing import List
|
||||
|
||||
import requests # type: ignore
|
||||
from markdownify import markdownify # type: ignore
|
||||
from md2gemini import md2gemini # type: ignore
|
||||
from readability import Document # type: ignore
|
||||
from requests.adapters import HTTPAdapter # type: ignore
|
||||
from urllib3.util.retry import Retry
|
||||
|
||||
from tenkan.utils import measure
|
||||
|
||||
|
||||
class Feed:
|
||||
"""
|
||||
receives various feed data and applies necessary changes to make it usable into files
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
input_content: dict,
|
||||
filters=None,
|
||||
formatting=None,
|
||||
) -> None:
|
||||
self.content = input_content
|
||||
self.filters = filters
|
||||
self.formatting = formatting
|
||||
self.new_entries: list = []
|
||||
|
||||
def needs_update(self) -> bool:
|
||||
"""Checks if updates are available"""
|
||||
if not self.content['json_hash_last_update']:
|
||||
return True
|
||||
if (
|
||||
self.content['json_hash_last_update']
|
||||
!= self.content['fetched_hash_last_update']
|
||||
):
|
||||
return True
|
||||
return False
|
||||
|
||||
@measure
|
||||
def get_new_entries(self) -> None:
|
||||
"""Selects new entries depending on filters defined on config file"""
|
||||
for entry in self.content['fetched_content']['entries']:
|
||||
if (
|
||||
any(
|
||||
x in entry['title']
|
||||
for x in self.filters.get('titles_blacklist', '').split(
|
||||
','
|
||||
)
|
||||
)
|
||||
or any(
|
||||
x in entry['link']
|
||||
for x in self.filters.get('links_blacklist', '').split(',')
|
||||
)
|
||||
or any(
|
||||
# feedparser object can be problematic sometimes
|
||||
# we need to check if we have an authors item
|
||||
# AND we check if we can get its name because it can be empty
|
||||
# AND if we don't have any of these, we return a stupid string
|
||||
# to match the str type which is expected
|
||||
x
|
||||
in (
|
||||
entry.get('authors')
|
||||
and entry.authors[0].get('name')
|
||||
or 'random string'
|
||||
)
|
||||
for x in self.filters.get('authors_blacklist', '').split(
|
||||
','
|
||||
)
|
||||
)
|
||||
):
|
||||
self.content['fetched_content']['entries'].remove(entry)
|
||||
continue
|
||||
self.new_entries.append(entry)
|
||||
|
||||
@measure
|
||||
def export_content(self) -> dict:
|
||||
"""Exports properly formatted content"""
|
||||
# create feed item structure
|
||||
data_export: dict[str, List] = {
|
||||
'title': self.content['title'],
|
||||
'last_update': self.content['last_update'],
|
||||
'gmi_url': self.content['gmi_url'],
|
||||
'articles': [],
|
||||
'hash_last_update': self.content['fetched_hash_last_update'],
|
||||
}
|
||||
for article in self.new_entries:
|
||||
article_formatted_title = self._format_article_title(article)
|
||||
article_date = self._get_article_date(article)
|
||||
|
||||
# 2 possibilities to get content : content['value'] or summary
|
||||
content = (
|
||||
article['content'][0]['value']
|
||||
if article.get('content')
|
||||
else article['summary']
|
||||
)
|
||||
|
||||
article_content = self._format_article_content(
|
||||
content, link=article['link']
|
||||
)
|
||||
|
||||
data_export['articles'].append(
|
||||
{
|
||||
'article_title': article['title'],
|
||||
'article_formatted_title': article_formatted_title,
|
||||
'article_content': article_content,
|
||||
'article_date': article_date,
|
||||
'http_url': article['link'],
|
||||
'updated': article_date,
|
||||
}
|
||||
)
|
||||
|
||||
return data_export
|
||||
|
||||
@classmethod
|
||||
def _get_article_date(cls, article: dict) -> datetime:
|
||||
"""get date string and return datetime object"""
|
||||
try:
|
||||
return (
|
||||
datetime(
|
||||
*article.get(
|
||||
'published_parsed', article['updated_parsed']
|
||||
)[:6]
|
||||
)
|
||||
.replace(tzinfo=timezone.utc)
|
||||
.astimezone(tz=None)
|
||||
)
|
||||
except KeyError:
|
||||
logging.error(
|
||||
"Can't find a proper date field in article data, this should not happen !"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
@measure
|
||||
def _format_article_title(self, article: dict) -> str:
|
||||
"""title formatting to make it usable as a file title"""
|
||||
# truncate the title depending on the configured maximum size
|
||||
maxlen = int(self.formatting.get('title_size', 120))
|
||||
if len(self.content['title']) + len(article['title']) > maxlen:
|
||||
maxlen = maxlen - len(self.content['title'])
|
||||
|
||||
# We don't want multiline titles (yes, it happens)
|
||||
article['title'] = article['title'].replace('\n', '')[:maxlen]
|
||||
|
||||
# remove special characters
|
||||
# probably not the best way to do it, as it seems there are performance issues here
|
||||
# to improve later if possible
|
||||
formatted_str = (
|
||||
article['title']
|
||||
.encode('utf8', 'ignore')
|
||||
.decode('utf8', 'ignore')
|
||||
.replace(' ', '-')
|
||||
)
|
||||
return re.sub('[«»!@#$%^&*(){};:,./<>?/|`~=_+]', '', formatted_str)[
|
||||
:maxlen
|
||||
]
|
||||
|
||||
@measure
|
||||
def _format_article_content(self, content: str, link: str) -> str:
|
||||
"""
|
||||
Formats article content from html to gmi
|
||||
Will use readability if the feed is truncated, so it should retrieve the full content
|
||||
"""
|
||||
|
||||
# conversion to readability format if asked
|
||||
if self.content['title'] in self.formatting.get(
|
||||
'truncated_feeds', 'アケオメ'
|
||||
).split(','):
|
||||
headers = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'
|
||||
}
|
||||
|
||||
req = requests.Session()
|
||||
retries = Retry(
|
||||
total=5,
|
||||
backoff_factor=0.1,
|
||||
status_forcelist=[500, 502, 503, 504],
|
||||
)
|
||||
req.mount('http://', HTTPAdapter(max_retries=retries))
|
||||
req.mount('https://', HTTPAdapter(max_retries=retries))
|
||||
res = req.get(url=link, headers=headers)
|
||||
|
||||
content = Document(res.text).summary()
|
||||
|
||||
# convert html -> md -> gemini
|
||||
article = md2gemini(markdownify(content))
|
||||
|
||||
return article
|
85
tenkan/feedsfile.py
Normal file
|
@@ -0,0 +1,85 @@
|
|||
#!/usr/bin/python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
""" feedsfile mddule : json feeds file manipulation """
|
||||
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict # , Optional
|
||||
|
||||
from prettytable import PrettyTable
|
||||
|
||||
|
||||
def create(file: str) -> None:
|
||||
"""file creation"""
|
||||
with open(file, 'x') as _file:
|
||||
data: dict = {'feeds': {}}
|
||||
json.dump(data, _file)
|
||||
|
||||
|
||||
def read(file: str) -> Dict[str, Dict[str, Dict[str, str]]]:
|
||||
"""read file and return json data"""
|
||||
with open(file, 'r') as _file:
|
||||
file_data = json.load(_file)
|
||||
return file_data
|
||||
|
||||
|
||||
def _write(file: str, file_data: Dict[str, Dict[str, Dict[str, str]]]) -> None:
|
||||
"""write new data into file"""
|
||||
with open(file, 'w') as file_updated:
|
||||
json.dump(file_data, file_updated, indent=4)
|
||||
|
||||
|
||||
def add_feed(file: str, feed_name: str, feed_url: str) -> None:
|
||||
"""add a new feed into existing file"""
|
||||
file_data: Dict[str, Dict[str, Dict[str, str]]] = read(file)
|
||||
file_data['feeds'][feed_name] = {
|
||||
'url': feed_url,
|
||||
'last_update': '',
|
||||
'hash_last_update': '',
|
||||
}
|
||||
_write(file, file_data)
|
||||
logging.info('feed %s added', feed_name)
|
||||
|
||||
|
||||
def del_feed(file: str, feed_name: str) -> None:
|
||||
"""remove feed from file"""
|
||||
file_data = read(file)
|
||||
# don't do anything if no feed found
|
||||
if file_data['feeds'].get(feed_name):
|
||||
del file_data['feeds'][feed_name]
|
||||
_write(file, file_data)
|
||||
logging.info('feed %s deleted', feed_name)
|
||||
else:
|
||||
logging.info('no feed %s found into feeds file', feed_name)
|
||||
|
||||
|
||||
def get_feed_item(file: str, feed_name: str, item: str) -> str:
|
||||
"""Return element of a defined feed"""
|
||||
file_data = read(file)
|
||||
item = file_data['feeds'][feed_name][item]
|
||||
return item
|
||||
|
||||
|
||||
def update_last_run(file: str, date: str) -> None:
|
||||
"""Update last_run key in json file"""
|
||||
file_data: dict = read(file)
|
||||
file_data['last_run'] = date
|
||||
_write(file, file_data)
|
||||
|
||||
|
||||
def update_feed(file: str, feed_name: str, hash_last_update: str) -> None:
|
||||
"""update last update date of a defined feed"""
|
||||
file_data = read(file)
|
||||
file_data['feeds'][feed_name]['hash_last_update'] = hash_last_update
|
||||
_write(file, file_data)
|
||||
|
||||
|
||||
def list_feeds(file: str) -> None:
|
||||
"""list feed file content"""
|
||||
file_data = read(file)
|
||||
table = PrettyTable()
|
||||
table.field_names = ['Title', 'URL']
|
||||
for item, value in file_data['feeds'].items():
|
||||
table.add_row([item, value['url']])
|
||||
logging.info(table)
|
133
tenkan/files.py
Normal file
|
@@ -0,0 +1,133 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
""" files module : generated gemini feeds files management """
|
||||
|
||||
import logging
|
||||
import pathlib
|
||||
import shutil
|
||||
from typing import Dict, Union
|
||||
|
||||
from feedgen.feed import FeedGenerator # type: ignore
|
||||
|
||||
|
||||
def path_exists(path: str) -> bool:
|
||||
"""Check if feed path exists"""
|
||||
if pathlib.Path(path).is_dir():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def write_files(path: str, data: dict, max_num_entries: int) -> None:
|
||||
"""
|
||||
Converts feed objects into files and write them in the feed folder
|
||||
"""
|
||||
tpath = path
|
||||
path = path + data['title']
|
||||
pathlib.Path(path).mkdir(exist_ok=True)
|
||||
num_entries = 0
|
||||
# count entries in index file
|
||||
if pathlib.Path(f'{path}/index.gmi').is_file():
|
||||
num_entries = sum(1 for line in open(f'{path}/index.gmi'))
|
||||
|
||||
# if there are more articles than max_num_entries allows, delete and rewrite
|
||||
if num_entries > max_num_entries:
|
||||
delete_folder(tpath, data['title'])
|
||||
|
||||
index_file_write_header(path, data['title'])
|
||||
urls = []
|
||||
art_output = {}
|
||||
for article in data['articles']:
|
||||
art_output = write_article(article, data, path)
|
||||
urls.append(art_output['url'])
|
||||
index_file_write_footer(path)
|
||||
# no need to update the atom file if there are no new articles (write_article reports whether a new file was written)
|
||||
if art_output.get('new_file'):
|
||||
_rebuild_atom_file(path=path, data=data, urls=urls)
|
||||
|
||||
|
||||
# def purge_folder(path: str) -> None:
|
||||
# """Purge folder with too many entries"""
|
||||
# logging.info('Purging %s folder', path)
|
||||
# files = [x for x in pathlib.Path(f'{path}').iterdir() if x.is_file()]
|
||||
# for file in files:
|
||||
# pathlib.Path.unlink(file)
|
||||
|
||||
|
||||
def delete_folder(path: str, feed_name: str) -> None:
|
||||
"""delete a feed folder"""
|
||||
if pathlib.Path(f'{path}{feed_name}/').exists():
|
||||
shutil.rmtree(f'{path}{feed_name}')
|
||||
logging.info('%s/%s folder deleted', path, feed_name)
|
||||
else:
|
||||
logging.info(
|
||||
'folder %s%s not present, nothing to delete', path, feed_name
|
||||
)
|
||||
|
||||
|
||||
def index_file_write_header(path: str, title: str) -> None:
|
||||
"""Write index header"""
|
||||
with open(f'{path}/index.gmi', 'w') as index:
|
||||
index.write(f'# {title}\n\n')
|
||||
index.write('=> ../ ..\n')
|
||||
|
||||
|
||||
def index_file_write_footer(path: str) -> None:
|
||||
"""Write index footer"""
|
||||
with open(f'{path}/index.gmi', 'a') as index:
|
||||
index.write('\n=> atom.xml Atom feed\n')
|
||||
|
||||
|
||||
def write_article(
|
||||
article: dict, data: dict, path: str
|
||||
) -> Dict[str, Union[bool, str]]:
|
||||
"""Write individual article"""
|
||||
# prepare data for file format
|
||||
date = article['article_date']
|
||||
file_date = date.strftime('%Y-%m-%d_%H-%M-%S')
|
||||
date = date.strftime('%Y-%m-%d %H:%M:%S')
|
||||
file_title = article['article_formatted_title']
|
||||
content = article['article_content']
|
||||
|
||||
# we add the entry into index file
|
||||
with open(f'{path}/index.gmi', 'a') as index:
|
||||
index.write(
|
||||
f"=> {file_date}_{file_title}.gmi {date} - {article['article_title']}\n"
|
||||
)
|
||||
|
||||
new_file = False
|
||||
# write the file if it doesn't exist, obviously
|
||||
if not pathlib.Path(f'{path}/{file_date}_{file_title}.gmi').is_file():
|
||||
new_file = True
|
||||
logging.info('%s : adding entry %s', data['title'], file_title)
|
||||
# we write the entry file
|
||||
author = article['author'] if 'author' in article else None
|
||||
|
||||
pathlib.Path(f'{path}/{file_date}_{file_title}.gmi').write_text(
|
||||
f"# {article['article_title']}\n\n=> {article['http_url']}\n\n{date}, {author}\n\n{content}"
|
||||
)
|
||||
url = f"{data['gmi_url']}{data['title']}/{file_date}_{file_title}.gmi"
|
||||
|
||||
return {'new_file': new_file, 'url': url}
|
||||
|
||||
|
||||
def _rebuild_atom_file(path: str, data: dict, urls: list) -> None:
|
||||
"""rebuilds the atom file into gmi folder"""
|
||||
|
||||
atomfeed = FeedGenerator()
|
||||
atomfeed.id(data['gmi_url'])
|
||||
atomfeed.title(data['title'])
|
||||
atomfeed.updated = data['last_update']
|
||||
atomfeed.link(href=f"{data['gmi_url']}.atom.xml", rel='self')
|
||||
atomfeed.link(href=data['gmi_url'], rel='alternate')
|
||||
|
||||
# rebuild all articles
|
||||
for art, article in enumerate(data['articles']):
|
||||
atomentry = atomfeed.add_entry()
|
||||
url = urls[art]
|
||||
atomentry.guid(url)
|
||||
atomentry.link(href=url, rel='alternate')
|
||||
atomentry.updated(article['updated'])
|
||||
atomentry.title(article['article_title'])
|
||||
|
||||
atomfeed.atom_file(f'{path}/atom.xml', pretty=True)
|
||||
logging.info('Wrote Atom feed for %s', data['title'])
|
114
tenkan/processing.py
Normal file
|
@@ -0,0 +1,114 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""processing module : feeds file processing """
|
||||
|
||||
import configparser
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
|
||||
import feedparser # type: ignore
|
||||
|
||||
from tenkan.feed import Feed
|
||||
from tenkan.feedsfile import read, update_feed
|
||||
from tenkan.files import path_exists, write_files
|
||||
from tenkan.utils import display_feeds_fetch_progress, measure
|
||||
|
||||
|
||||
@measure
|
||||
def fetch_feeds(feeds_file: str, gmi_url: str) -> list:
|
||||
"""Fetch all http feeds with threads"""
|
||||
workers = os.cpu_count() or 1
|
||||
try:
|
||||
fetched_feeds = []
|
||||
with ThreadPoolExecutor(max_workers=workers) as executor:
|
||||
for item, values in read(feeds_file)['feeds'].items():
|
||||
fetched_feeds.append(
|
||||
{
|
||||
'title': item,
|
||||
'fetched_content': executor.submit(
|
||||
feedparser.parse, values['url']
|
||||
),
|
||||
'gmi_url': gmi_url,
|
||||
'last_update': values['last_update'],
|
||||
'fetched_hash_last_update': None,
|
||||
'json_hash_last_update': values['hash_last_update'],
|
||||
}
|
||||
)
|
||||
display_feeds_fetch_progress(fetched_feeds)
|
||||
return fetched_feeds
|
||||
except json.decoder.JSONDecodeError as bad_json:
|
||||
raise bad_json
|
||||
|
||||
|
||||
@measure
|
||||
def prepare_fetched_content(fetched_feeds: list, force: bool = False) -> list:
|
||||
"""Prepare some necessary data to be sent to feed object"""
|
||||
list_to_export = []
|
||||
for ftfd in fetched_feeds:
|
||||
try:
|
||||
# store workers result into fetched_content
|
||||
ftfd['fetched_content'] = ftfd['fetched_content'].result() # type: ignore
|
||||
# we store a sha256 footprint of fetched content,
|
||||
# to compare to last known footprint
|
||||
tmp_hash = hashlib.sha256(
|
||||
str(ftfd['fetched_content'].get('entries')[0]).encode()
|
||||
)
|
||||
if tmp_hash.hexdigest() != ftfd['json_hash_last_update'] or force:
|
||||
ftfd['fetched_hash_last_update'] = tmp_hash.hexdigest()
|
||||
list_to_export.append(ftfd)
|
||||
# sometimes we don't get anything in fetched_content, so just ignore it
|
||||
except IndexError:
|
||||
pass
|
||||
return list_to_export
|
||||
|
||||
|
||||
@measure
|
||||
def process_fetched_feeds(
|
||||
config: configparser.ConfigParser, fetched_feeds: list, force: bool = False
|
||||
) -> list:
|
||||
"""Process previously fetched feeds"""
|
||||
feed_list = []
|
||||
for ftfd in fetched_feeds:
|
||||
# initialize feed object
|
||||
feed = Feed(
|
||||
input_content=ftfd,
|
||||
filters=config['filters'],
|
||||
formatting=config['formatting'],
|
||||
)
|
||||
# process feeds if there are updates since last run
|
||||
# or if the feed had never been processed
|
||||
# or if --force option is used
|
||||
if (
|
||||
feed.needs_update()
|
||||
or not path_exists(
|
||||
path=config['tenkan']['gemini_path'] + ftfd['title']
|
||||
)
|
||||
or force
|
||||
):
|
||||
logging.info('Processing %s', ftfd['title'])
|
||||
feed.get_new_entries()
|
||||
feed_list.append(feed.export_content())
|
||||
return feed_list
|
||||
|
||||
|
||||
@measure
|
||||
def write_processed_feeds(
|
||||
args, config: configparser.ConfigParser, feed_list: list
|
||||
) -> None:
|
||||
"""Write files from processed feeds into gemini folder"""
|
||||
for files_data in feed_list:
|
||||
write_files(
|
||||
path=config['tenkan']['gemini_path'],
|
||||
data=files_data,
|
||||
max_num_entries=int(
|
||||
config['tenkan'].get('purge_feed_folder_after', '9999')
|
||||
),
|
||||
)
|
||||
update_feed(
|
||||
file=args.feedsfile,
|
||||
feed_name=files_data['title'],
|
||||
hash_last_update=files_data['hash_last_update'],
|
||||
)
|
35
tenkan/utils.py
Normal file
|
@@ -0,0 +1,35 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
"""utils module : various utils"""
|
||||
|
||||
import logging
|
||||
from time import sleep, time
|
||||
|
||||
|
||||
def display_feeds_fetch_progress(fetched_feeds: list) -> None:
|
||||
"""Display feeds being fetched"""
|
||||
qsize = len(fetched_feeds)
|
||||
while True:
|
||||
done = len([x for x in fetched_feeds if x['fetched_content'].done()])
|
||||
print(f'Fetching feeds [{done}/{qsize}]', end='\r', flush=True)
|
||||
sleep(0.3)
|
||||
if done == qsize:
|
||||
break
|
||||
|
||||
|
||||
def measure(func):
|
||||
"""
|
||||
Decorator to measure the time taken by a func
|
||||
Used only in debug mode
|
||||
"""
|
||||
|
||||
def wrap_func(*args, **kwargs):
|
||||
time1 = time()
|
||||
result = func(*args, **kwargs)
|
||||
time2 = time()
|
||||
logging.debug(
|
||||
'Function %s executed in %ss', func.__name__, time2 - time1
|
||||
)
|
||||
return result
|
||||
|
||||
return wrap_func
|
0
tests/__init__.py
Normal file
81
tests/cli_test.py
Normal file
|
@@ -0,0 +1,81 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
import configparser
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from tenkan.cli import load_args, load_config, run
|
||||
from tenkan.feedsfile import add_feed, read
|
||||
|
||||
|
||||
def test_config_loaded():
|
||||
config_file = Path('./tests/data/tenkan.conf')
|
||||
res = load_config(config_file)
|
||||
assert isinstance(res, configparser.ConfigParser)
|
||||
|
||||
|
||||
def test_config_tenkan_section_missing():
|
||||
|
||||
config_file = Path('./tests/data/tenkan.conf_fail')
|
||||
|
||||
with pytest.raises(SystemExit) as pytest_wrapped_e:
|
||||
load_config(config_file)
|
||||
assert pytest_wrapped_e.type == SystemExit
|
||||
assert pytest_wrapped_e.value.code == 1
|
||||
|
||||
|
||||
def test_arg_feedsfile_missing():
|
||||
args = load_args(['--feedsfile', '/tmp/toto.json', 'list'])
|
||||
config = Path('./tests/data/tenkan.conf')
|
||||
with pytest.raises(SystemExit) as pytest_wrapped_e:
|
||||
run(args, config)
|
||||
assert pytest_wrapped_e.type == SystemExit
|
||||
assert pytest_wrapped_e.value.code == 1
|
||||
|
||||
|
||||
# def test_stupid_command():
|
||||
# args = load_args(['bla'])
|
||||
# config = Path('./tests/data/tenkan.conf')
|
||||
# with pytest.raises(SystemExit) as pytest_wrapped_e:
|
||||
# load_args(args)
|
||||
# assert pytest_wrapped_e.type == SystemExit
|
||||
# assert pytest_wrapped_e.value.code == 2
|
||||
|
||||
|
||||
def test_add_cmd_feedsfile_missing(tmp_path):
|
||||
feeds = tmp_path / 'toto.json'
|
||||
args = load_args(['--feedsfile', str(feeds), 'add', 'blabla', 'blibli'])
|
||||
config = Path('./tests/data/tenkan.conf')
|
||||
run(args, config)
|
||||
assert Path(f'{feeds}').is_file()
|
||||
|
||||
|
||||
def test_add_bad_feedsfile_folder():
|
||||
args = load_args(
|
||||
['--feedsfile', '/tmp/tmp/tmp/titi.json', 'add', 'blabla', 'blibli']
|
||||
)
|
||||
config = Path('./tests/data/tenkan.conf')
|
||||
with pytest.raises(SystemExit) as pytest_wrapped_e:
|
||||
run(args, config)
|
||||
assert pytest_wrapped_e.type == SystemExit
|
||||
assert pytest_wrapped_e.value.code == 1
|
||||
|
||||
|
||||
def test_del_cmd():
|
||||
feeds = Path('./tests/data/feeds.json')
|
||||
args = load_args(['--feedsfile', str(feeds), 'delete', 'tutu'])
|
||||
config = Path('./tests/data/tenkan.conf')
|
||||
add_feed(file=feeds, feed_name='tutu', feed_url='tata')
|
||||
run(args, config)
|
||||
data = read(file=feeds)
|
||||
assert not data['feeds'].get('tutu')
|
||||
|
||||
|
||||
def test_update_cmd():
|
||||
feeds = Path('./tests/data/feeds.json')
|
||||
args = load_args(['--feedsfile', str(feeds), 'update'])
|
||||
config = load_config(str(Path('./tests/data/tenkan.conf')))
|
||||
data1 = read(file=feeds)['last_run']
|
||||
run(args, config)
|
||||
data2 = read(file=feeds)['last_run']
|
||||
assert data1 != data2
|
14
tests/config_test.py
Normal file
|
@@ -0,0 +1,14 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from tenkan.config import load_config
|
||||
|
||||
|
||||
def test_configfile_missing():
|
||||
config = Path('/tmp/toto.conf')
|
||||
with pytest.raises(SystemExit) as pytest_wrapped_e:
|
||||
load_config(config)
|
||||
assert pytest_wrapped_e.type == SystemExit
|
||||
assert pytest_wrapped_e.value.code == 1
|
10
tests/data/feeds.json
Normal file
@@ -0,0 +1,10 @@
{
    "last_run": "2022-01-12 21:31:10.703787",
    "feeds": {
        "srad-science": {
            "url": "https://srad.jp/science.rss",
            "last_update": null,
            "hash_last_update": ""
        }
    }
}
7
tests/data/feeds.json_fail
Normal file
|
@@ -0,0 +1,7 @@
|
|||
{
|
||||
"feeds": {
|
||||
"srad-science": {
|
||||
"url": "https://srad.jp/science.rss",
|
||||
"last_update": null
|
||||
}
|
||||
}
|
15
tests/data/tenkan.conf
Normal file
|
@@ -0,0 +1,15 @@
|
|||
[tenkan]
|
||||
gemini_path = /tmp/
|
||||
gemini_url = gemini://space.fqserv.eu/feeds/
|
||||
|
||||
[filters]
|
||||
# authors we don't want to read
|
||||
authors_blacklist = Rabaudy, Élise Costa, Sagalovitch, Pessin, Gallerey
|
||||
titles_blacklist = Pinned
|
||||
links_blacklist = slate.fr/audio, slate.fr/grand-format, slate.fr/boire-manger/top-chef
|
||||
|
||||
[formatting]
|
||||
title_size = 120
|
||||
# feeds with a truncated content
|
||||
# will be fetched and converted using readability-lxml
|
||||
truncated_feeds = gurumed, slate, cnrs
|
15
tests/data/tenkan.conf_fail
Normal file
|
@@ -0,0 +1,15 @@
|
|||
#[tenkan]
|
||||
#gemini_path = /tmp/hu/
|
||||
#gemini_url = gemini://space.fqserv.eu/feeds/
|
||||
|
||||
[filters]
|
||||
# authors we don't want to read
|
||||
authors_blacklist = Rabaudy, Élise Costa, Sagalovitch, Pessin, Gallerey
|
||||
titles_blacklist = Pinned
|
||||
links_blacklist = slate.fr/audio, slate.fr/grand-format, slate.fr/boire-manger/top-chef
|
||||
|
||||
[formatting]
|
||||
title_size = 120
|
||||
# feeds with a truncated content
|
||||
# will be fetched and converted using readability-lxml
|
||||
truncated_feeds = gurumed, slate, cnrs
|
101
tests/feed_test.py
Normal file
|
@@ -0,0 +1,101 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import pytest
|
||||
|
||||
from tenkan.feed import Feed
|
||||
|
||||
data = {
|
||||
'title': 'bla',
|
||||
'url': 'bla',
|
||||
'fetched_content': 'bla',
|
||||
'last_update': None,
|
||||
'gmi_url': 'bla',
|
||||
'json_hash_last_update': 'bl',
|
||||
'fetched_hash_last_update': 'bla',
|
||||
}
|
||||
|
||||
article_data1 = {
|
||||
'title': 'article_title',
|
||||
'article_formatted_title': 'article_formatted_title',
|
||||
'article_content': {'summary': 'article_content'},
|
||||
'article_date': datetime(2022, 1, 7, 15, 25, 0, tzinfo=timezone.utc),
|
||||
'http_url': 'article_link',
|
||||
'updated': 'Fri, 07 Jan 2022 15:25:00 +0000',
|
||||
'updated_parsed': datetime(
|
||||
2022, 1, 7, 15, 25, 0, tzinfo=timezone.utc
|
||||
).timetuple(),
|
||||
}
|
||||
|
||||
article_data2 = {
|
||||
'title': 'article_title',
|
||||
'article_formatted_title': 'article_formatted_title',
|
||||
'article_content': {'summary': 'article_content'},
|
||||
'article_date': 'bad_date',
|
||||
'http_url': 'article_link',
|
||||
'updated_': 'bad_date',
|
||||
}
|
||||
|
||||
|
||||
def test_needs_update_no_last_update():
|
||||
data['json_hash_last_update'] = None
|
||||
feed = Feed(input_content=data)
|
||||
assert feed.needs_update() is True
|
||||
|
||||
|
||||
def test_needs_update_last_update_ne_updated_field():
|
||||
feed = Feed(input_content=data)
|
||||
assert feed.needs_update() is True
|
||||
|
||||
|
||||
def test_no_need_update():
|
||||
data['json_hash_last_update'] = 'bla'
|
||||
feed = Feed(input_content=data)
|
||||
assert feed.needs_update() is False
|
||||
|
||||
|
||||
def test_content_exported():
|
||||
# TODO : use article_data
|
||||
feed = Feed(input_content=data)
|
||||
|
||||
expected_data = {
|
||||
'title': 'bla',
|
||||
'last_update': None,
|
||||
'gmi_url': 'bla',
|
||||
'articles': [],
|
||||
'hash_last_update': 'bla',
|
||||
}
|
||||
|
||||
assert feed.export_content() == expected_data
|
||||
|
||||
|
||||
def test_date_format_published():
|
||||
data['articles'] = article_data1
|
||||
feed = Feed(input_content=data)
|
||||
assert (
|
||||
feed._get_article_date(article_data1)
|
||||
== data['articles']['article_date']
|
||||
)
|
||||
|
||||
|
||||
def test_bad_date_format():
|
||||
data['articles'] = article_data2
|
||||
feed = Feed(input_content=data)
|
||||
with pytest.raises(SystemExit) as pytest_wrapped_e:
|
||||
feed._get_article_date(article_data2)
|
||||
assert pytest_wrapped_e.type == SystemExit
|
||||
assert pytest_wrapped_e.value.code == 1
|
||||
|
||||
|
||||
def test_article_content_formatted():
|
||||
feed = Feed(input_content=data, formatting={'truncated_feeds': 'rien'})
|
||||
res = feed._format_article_content(content='coucou', link='blbl')
|
||||
assert res == 'coucou'
|
||||
|
||||
|
||||
def test_title_formatted():
|
||||
feed = Feed(input_content=data, formatting={'title_size': 10})
|
||||
art = article_data1
|
||||
art['title'] = 'blabla / bla ?'
|
||||
res = feed._format_article_title(article=article_data1)
|
||||
assert res == 'blabla-'
|
43
tests/feedsfile_test.py
Normal file
|
@@ -0,0 +1,43 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from tenkan.feedsfile import (
|
||||
add_feed,
|
||||
del_feed,
|
||||
get_feed_item,
|
||||
read,
|
||||
update_feed,
|
||||
)
|
||||
|
||||
|
||||
def test_get_feed_item():
|
||||
feeds = Path('./tests/data/feeds.json')
|
||||
item = get_feed_item(file=feeds, feed_name='srad-science', item='url')
|
||||
assert item == 'https://srad.jp/science.rss'
|
||||
|
||||
|
||||
def test_update_hash():
|
||||
feeds = Path('./tests/data/feeds.json')
|
||||
update_feed(file=feeds, feed_name='srad-science', hash_last_update='blbl')
|
||||
item = get_feed_item(
|
||||
file=feeds, feed_name='srad-science', item='hash_last_update'
|
||||
)
|
||||
assert item == 'blbl'
|
||||
update_feed(file=feeds, feed_name='srad-science', hash_last_update='')
|
||||
|
||||
|
||||
def test_add_feed():
|
||||
feeds = Path('./tests/data/feeds.json')
|
||||
add_feed(file=feeds, feed_name='toto', feed_url='tata')
|
||||
data = read(file=feeds)
|
||||
assert data['feeds'].get('toto')
|
||||
del_feed(file=feeds, feed_name='toto')
|
||||
|
||||
|
||||
def test_del_feed():
|
||||
feeds = Path('./tests/data/feeds.json')
|
||||
add_feed(file=feeds, feed_name='tutu', feed_url='tata')
|
||||
del_feed(file=feeds, feed_name='tutu')
|
||||
data = read(file=feeds)
|
||||
assert not data['feeds'].get('tutu')
|
70
tests/files_test.py
Normal file
|
@@ -0,0 +1,70 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from tenkan.files import (
|
||||
_rebuild_atom_file,
|
||||
delete_folder,
|
||||
path_exists,
|
||||
write_article,
|
||||
)
|
||||
|
||||
data: dict = {
|
||||
'title': 'bla',
|
||||
'url': 'bla',
|
||||
'fetched_content': 'bla',
|
||||
'last_update': None,
|
||||
'gmi_url': 'bla',
|
||||
'articles': [],
|
||||
}
|
||||
|
||||
article_data = {
|
||||
'article_title': 'article_title',
|
||||
'article_formatted_title': 'article_formatted_title',
|
||||
'article_content': {'summary': 'article_content'},
|
||||
'article_date': datetime(2022, 1, 7, 15, 25, 0, tzinfo=timezone.utc),
|
||||
'http_url': 'article_link',
|
||||
'updated': 'Fri, 07 Jan 2022 15:25:00 +0000',
|
||||
'updated_parsed': datetime(
|
||||
2022, 1, 7, 15, 25, 0, tzinfo=timezone.utc
|
||||
).timetuple(),
|
||||
}
|
||||
|
||||
|
||||
def test_path_exists(tmp_path):
|
||||
d = tmp_path / 'sub'
|
||||
d.mkdir()
|
||||
|
||||
assert path_exists(d) is True
|
||||
|
||||
|
||||
def test_path_doesnt_exist(tmp_path):
|
||||
d = tmp_path / 'sub'
|
||||
|
||||
assert path_exists(d) is False
|
||||
|
||||
|
||||
def test_article_written(tmp_path):
|
||||
path = tmp_path / 'sub'
|
||||
path.mkdir()
|
||||
date = article_data['article_date']
|
||||
file_date = date.strftime('%Y-%m-%d_%H-%M-%S')
|
||||
file_title = article_data['article_formatted_title']
|
||||
res = write_article(article=article_data, data=data, path=path)
|
||||
assert res['new_file'] is True
|
||||
assert (
|
||||
res['url']
|
||||
== f"{data['gmi_url']}{data['title']}/{file_date}_{file_title}.gmi"
|
||||
)
|
||||
|
||||
|
||||
def test_folder_deleted(tmp_path):
|
||||
subpath = tmp_path / 'sub2'
|
||||
delete_folder(path=tmp_path, feed_name='sub2')
|
||||
assert not subpath.exists()
|
||||
|
||||
|
||||
def test_atomfile_built(tmp_path):
|
||||
data['articles'].append(article_data)
|
||||
_rebuild_atom_file(path=tmp_path, data=data, urls=['bla'])
|
||||
assert Path(f'{tmp_path}/atom.xml').is_file()
|
45
tests/processing_test.py
Normal file
|
@@ -0,0 +1,45 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from json import JSONDecodeError
|
||||
from pathlib import Path
|
||||
|
||||
import feedparser
|
||||
import pytest
|
||||
|
||||
from tenkan.config import load_config
|
||||
from tenkan.processing import fetch_feeds, process_fetched_feeds
|
||||
|
||||
data = [
|
||||
{
|
||||
'title': 'bla',
|
||||
'url': 'bla',
|
||||
'fetched_content': None,
|
||||
'last_update': None,
|
||||
'gmi_url': 'bla',
|
||||
'json_hash_last_update': 'bli',
|
||||
'fetched_hash_last_update': 'bli',
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def test_feed_fetched():
|
||||
feeds = Path('./tests/data/feeds.json')
|
||||
|
||||
res = fetch_feeds(feeds_file=feeds, gmi_url='blbl')
|
||||
assert type(res) is list
|
||||
assert len(res) == 1
|
||||
|
||||
|
||||
def test_feed_raise_when_shitty_feedfile():
|
||||
feeds = Path('./tests/data/feeds.json_fail')
|
||||
|
||||
with pytest.raises(JSONDecodeError):
|
||||
fetch_feeds(feeds_file=feeds, gmi_url='blbl')
|
||||
|
||||
|
||||
def test_feed_processed():
|
||||
config_file = Path('./tests/data/tenkan.conf')
|
||||
conf = load_config(config_file)
|
||||
data[0]['fetched_content'] = feedparser.parse(
|
||||
'https://srad.jp/science.rss'
|
||||
)
|
||||
process_fetched_feeds(config=conf, fetched_feeds=data)