summaryrefslogtreecommitdiff
path: root/ar/.config
diff options
context:
space:
mode:
author TheSiahxyz <164138827+TheSiahxyz@users.noreply.github.com> 2026-06-15 14:15:20 +0900
committer TheSiahxyz <164138827+TheSiahxyz@users.noreply.github.com> 2026-06-15 14:15:20 +0900
commit afea4e23cfef12e5edb2a94157596809d377501f (patch)
tree 3d235c06980fbb69d420e21ba10b14f2a49ce16f /ar/.config
parent 04db7330e879a75d002999f2042d633d06ff2b57 (diff)
created newsboat/fulltext.py
Diffstat (limited to 'ar/.config')
-rwxr-xr-x ar/.config/newsboat/fulltext.py 108
1 files changed, 108 insertions, 0 deletions
diff --git a/ar/.config/newsboat/fulltext.py b/ar/.config/newsboat/fulltext.py
new file mode 100755
index 0000000..73775b0
--- /dev/null
+++ b/ar/.config/newsboat/fulltext.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+"""newsboat full-text filter.
+
+Reads an RSS/Atom feed on stdin, and for each item replaces the body with the
+full article text extracted from the item's link via `rdrview`. Output is the
+same feed with <content:encoded> filled in, which newsboat shows in the article
+view.
+
+Used from ~/.config/newsboat/urls as:
+ filter:~/.config/newsboat/fulltext.py:https://example.com/feed.xml
+
+Design notes:
+- Per-URL disk cache so reloads don't re-fetch unchanged articles.
+- Concurrency-limited; per-article timeout.
+- Fail-safe: if extraction fails (rdrview missing, network error, paywall),
+ the item's original summary is left untouched so the feed never breaks.
+"""
+
+import sys
+import os
+import re
+import html
+import hashlib
+import subprocess
+import xml.etree.ElementTree as ET
+from concurrent.futures import ThreadPoolExecutor
+
# Directory holding one cached HTML file per article URL.
CACHE_DIR = os.path.expanduser("~/.cache/newsboat-fulltext")
TIMEOUT = 20  # seconds per article
MAX_WORKERS = 6  # parallel rdrview processes
# Namespace URI of the RSS "content" module (<content:encoded>).
CONTENT_NS = "http://purl.org/rss/1.0/modules/content/"

try:
    os.makedirs(CACHE_DIR, exist_ok=True)
except OSError:
    # Fail-safe: an unusable cache location (read-only or missing home
    # directory) must not abort the filter at import time, otherwise the
    # feed would never be written to stdout at all.
    pass

# Serialize the content module under its conventional "content:" prefix
# rather than an auto-generated one such as "ns0:".
ET.register_namespace("content", CONTENT_NS)
+
+
def cache_path(url):
    """Return the cache file path used for url.

    The file name is the hexadecimal SHA-256 digest of the URL (encoded with
    str.encode()'s default encoding) followed by ".html", placed directly
    inside CACHE_DIR.
    """
    digest = hashlib.sha256(url.encode()).hexdigest()
    file_name = digest + ".html"
    return os.path.join(CACHE_DIR, file_name)
+
+
def _write_cache(path, body):
    """Store body at path atomically; cache failures are deliberately ignored.

    The text is written to a temporary file in the same directory and then
    renamed over path, so an interrupted run or a second thread working on the
    same URL can never leave a truncated cache entry behind.
    """
    import tempfile  # local import: the module header does not import it

    tmp = None
    try:
        fd, tmp = tempfile.mkstemp(dir=os.path.dirname(path), suffix=".tmp")
        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(body)
        os.replace(tmp, path)
    except OSError:
        # The cache is only an optimization; never let it break the feed.
        if tmp is not None:
            try:
                os.unlink(tmp)
            except OSError:
                pass


def extract(url):
    """Return clean article HTML for url, or None on any failure.

    A non-empty cache entry for the URL is returned as-is. Otherwise
    `rdrview -H url` is run with a TIMEOUT-second limit; its output is
    accepted only when the exit status is 0 and the stripped output is at
    least 200 characters long, and is then cached for later runs.

    Args:
        url: Article link taken from the feed; may be None or empty.

    Returns:
        The article HTML as a string, or None when the URL is missing or not
        http(s), rdrview is unavailable, fails or times out, or the output is
        too short to be a real article.
    """
    # Feed links are untrusted input. Only plain web URLs are forwarded to
    # rdrview, so a link can neither be parsed as a command-line option
    # (leading "-") nor name a local file.
    if not url or not url.lower().startswith(("http://", "https://")):
        return None

    cp = cache_path(url)
    try:
        with open(cp, "r", encoding="utf-8", errors="replace") as f:
            cached = f.read()
    except OSError:
        # Missing or unreadable cache entry: fall through to a fresh fetch.
        cached = ""
    if cached:
        return cached

    try:
        out = subprocess.run(
            ["rdrview", "-H", url],
            capture_output=True,
            # Decode explicitly instead of relying on the locale, so output
            # with non-ASCII text cannot raise UnicodeDecodeError.
            encoding="utf-8",
            errors="replace",
            timeout=TIMEOUT,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers a missing (FileNotFoundError) or non-executable
        # rdrview binary.
        return None

    body = out.stdout.strip()
    if out.returncode != 0 or len(body) < 200:
        return None
    _write_cache(cp, body)
    return body
+
+
def find_link(item):
    """Return the article URL of an RSS <item> or Atom <entry>, or None.

    Lookup order:
      1. RSS ``<link>text</link>``.
      2. The first Atom ``<link href="...">`` whose ``rel`` is "alternate",
         empty or absent and whose ``href`` is non-blank.
      3. RSS ``<guid>`` when it is a permalink (``isPermaLink`` is "true" or
         absent, which RSS 2.0 defines as true) and looks like an http(s) URL.

    Args:
        item: An xml.etree.ElementTree element for one feed item/entry.

    Returns:
        The stripped URL string, or None when no usable link is present.
    """
    # RSS <link>text</link>
    el = item.find("link")
    if el is not None and el.text and el.text.strip():
        return el.text.strip()

    # Atom <link href="..."/> (prefer rel=alternate / no rel)
    for el in item.findall("{http://www.w3.org/2005/Atom}link"):
        href = (el.get("href") or "").strip()
        # Skip blank hrefs instead of returning "" so a later link can match.
        if href and el.get("rel", "alternate") in ("alternate", ""):
            return href

    # RSS <guid>: some feeds omit <link> and carry the URL only here.
    guid = item.find("guid")
    if guid is not None:
        is_permalink = guid.get("isPermaLink", "true").strip().lower() == "true"
        text = (guid.text or "").strip()
        # Many feeds leave isPermaLink at its default even for opaque ids,
        # so additionally require something that looks like a web URL.
        if is_permalink and text.lower().startswith(("http://", "https://")):
            return text
    return None
+
+
# Control characters that XML 1.0 forbids; ElementTree writes them out
# unescaped, which would make the whole output feed malformed.
_XML_ILLEGAL = re.compile("[\x00-\x08\x0b\x0c\x0e-\x1f]")

_ATOM_ENTRY = "{http://www.w3.org/2005/Atom}entry"
_ATOM_CONTENT = "{http://www.w3.org/2005/Atom}content"


def _safe_extract(url):
    """Call extract(url), turning any unexpected error into None."""
    try:
        return extract(url)
    except Exception:
        # Top-level boundary of a worker thread: one bad article must not
        # take the whole feed down.
        return None


def _set_body(item, body):
    """Store the extracted HTML body on item in its feed format's element."""
    body = _XML_ILLEGAL.sub("", body)
    if item.tag == _ATOM_ENTRY:
        # content:encoded belongs to an RSS module; Atom defines its own
        # <content> element (RFC 4287, section 4.1.3), so entries get that.
        el = item.find(_ATOM_CONTENT)
        if el is None:
            el = ET.SubElement(item, _ATOM_CONTENT)
        else:
            el.clear()  # drop old attributes (e.g. src) and xhtml children
        el.set("type", "html")
    else:
        tag = "{%s}encoded" % CONTENT_NS
        el = item.find(tag)
        if el is None:
            el = ET.SubElement(item, tag)
    el.text = body  # ElementTree escapes it on serialization


def main():
    """Read a feed on stdin and write it to stdout with full article bodies.

    Each RSS <item> (or Atom <entry> when the feed has no <item>) whose link
    can be extracted gets the article HTML stored in <content:encoded> (RSS)
    or <content type="html"> (Atom). Items whose extraction fails keep their
    original content. If the input is not well-formed XML, or no item could
    be filled in, the input bytes are written back unchanged.
    """
    raw = sys.stdin.buffer.read()
    try:
        root = ET.fromstring(raw)
    except ET.ParseError:
        # Not parseable as XML — pass through untouched so feed still works.
        sys.stdout.buffer.write(raw)
        return

    # RSS items live under channel/item; Atom uses <entry>.
    items = root.findall(".//item")
    if not items:
        items = root.findall(".//" + _ATOM_ENTRY)

    links = [find_link(it) for it in items]
    # Fetch each distinct URL once, even when several items share a link.
    unique = list(dict.fromkeys(link for link in links if link))
    bodies = {}
    if unique:
        with ThreadPoolExecutor(max_workers=MAX_WORKERS) as ex:
            bodies = dict(zip(unique, ex.map(_safe_extract, unique)))

    changed = False
    for item, link in zip(items, links):
        body = bodies.get(link)
        if not body:
            continue
        _set_body(item, body)
        changed = True

    if not changed:
        # Nothing to add: emit the original bytes rather than a re-serialized
        # copy, which would needlessly rewrite namespace prefixes.
        sys.stdout.buffer.write(raw)
        return
    sys.stdout.buffer.write(ET.tostring(root, encoding="utf-8"))


if __name__ == "__main__":
    main()