diff options
| author | TheSiahxyz <164138827+TheSiahxyz@users.noreply.github.com> | 2026-06-16 11:10:02 +0900 |
|---|---|---|
| committer | TheSiahxyz <164138827+TheSiahxyz@users.noreply.github.com> | 2026-06-16 11:10:02 +0900 |
| commit | fdc9dd9666dbf630b6d432dc20a56298779cde32 (patch) | |
| tree | bce693d4ce9d6a3ef6fac909ddcee16bf099a56d /ar/.config/newsboat | |
| parent | 5a43678930da79ac87fd9e6f66e96273a202f032 (diff) | |
modified newsboat/fulltext.py
Diffstat (limited to 'ar/.config/newsboat')
| -rwxr-xr-x | ar/.config/newsboat/fulltext.py | 47 |
1 files changed, 47 insertions, 0 deletions
diff --git a/ar/.config/newsboat/fulltext.py b/ar/.config/newsboat/fulltext.py
index 73775b0..4f567e5 100755
--- a/ar/.config/newsboat/fulltext.py
+++ b/ar/.config/newsboat/fulltext.py
@@ -14,6 +14,12 @@ Design notes:
 - Concurrency-limited; per-article timeout.
 - Fail-safe: if extraction fails (rdrview missing, network error, paywall),
   the item's original summary is left untouched so the feed never breaks.
+- Paywall honesty: some sites (e.g. Seeking Alpha, behind PerimeterX + a metered
+  wall) only serve a short teaser to non-browser clients like rdrview. rdrview
+  "succeeds" but the body is cut off mid-sentence with a regwall stub. We detect
+  that, strip the legal boilerplate, and prepend a visible banner so the preview
+  isn't mistaken for the full article. The rest is only reachable in a real
+  browser, so open the link ('o') to read it.
 """
 
 import sys
@@ -24,6 +30,7 @@ import hashlib
 import subprocess
 import xml.etree.ElementTree as ET
 from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlparse
 
 CACHE_DIR = os.path.expanduser("~/.cache/newsboat-fulltext")
 TIMEOUT = 20  # seconds per article
@@ -33,6 +40,42 @@ CONTENT_NS = "http://purl.org/rss/1.0/modules/content/"
 os.makedirs(CACHE_DIR, exist_ok=True)
 ET.register_namespace("content", CONTENT_NS)
 
+# Hosts that gate articles behind bot-detection (PerimeterX) + a metered wall:
+# a non-browser client like rdrview always gets a short teaser cut off
+# mid-sentence, never the full body. Anything fetched from these is a teaser.
+GATED_HOSTS = (
+    "seekingalpha.com",
+)
+
+# Content markers for the same situation on other sites. Kept conservative so
+# normal feeds (CNBC, etc.) are never flagged. (rdrview's readability strips
+# empty regwall stubs, so host-based detection above is the reliable path.)
+PAYWALL_MARKERS = (
+    "signup_widget_placeholder",
+    'data-test-id="paywall"',
+)
+
+# Legal boilerplate to drop from teasers so the short preview isn't buried.
+DISCLAIMER_RE = re.compile(
+    r"<p>\s*<strong>[^<]*Disclaimer:?\s*</strong>.*?</p>",
+    re.IGNORECASE | re.DOTALL,
+)
+
+# Prepended to detected teasers so the reader sees at a glance it's not the
+# full text. newsboat's HTML renderer turns ⚠ into the warning sign.
+TEASER_BANNER = (
+    "<p><strong>⚠ PAYWALLED TEASER</strong> — the site gated this "
+    "article; only the preview below is available. Open the link in a browser "
+    "('o') for the full text.</p>\n<hr/>\n"
+)
+
+
+def is_teaser(url, body):
+    host = (urlparse(url).hostname or "").lower()
+    if any(host == g or host.endswith("." + g) for g in GATED_HOSTS):
+        return True
+    return any(marker in body for marker in PAYWALL_MARKERS)
+
 
 def cache_path(url):
     return os.path.join(CACHE_DIR, hashlib.sha256(url.encode()).hexdigest() + ".html")
@@ -56,6 +99,10 @@ def extract(url):
         body = out.stdout.strip()
         if out.returncode != 0 or len(body) < 200:
             return None
+        if is_teaser(url, body):
+            # Not the full article — flag it honestly instead of passing the
+            # teaser off as extracted content.
+            body = TEASER_BANNER + DISCLAIMER_RE.sub("", body).strip()
         with open(cp, "w", encoding="utf-8") as f:
             f.write(body)
         return body
