From fdc9dd9666dbf630b6d432dc20a56298779cde32 Mon Sep 17 00:00:00 2001
From: TheSiahxyz <164138827+TheSiahxyz@users.noreply.github.com>
Date: Tue, 16 Jun 2026 11:10:02 +0900
Subject: modified newsboat/fulltext.py

---
 ar/.config/newsboat/fulltext.py | 47 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'ar/.config')

diff --git a/ar/.config/newsboat/fulltext.py b/ar/.config/newsboat/fulltext.py
index 73775b0..4f567e5 100755
--- a/ar/.config/newsboat/fulltext.py
+++ b/ar/.config/newsboat/fulltext.py
@@ -14,6 +14,12 @@ Design notes:
 - Concurrency-limited; per-article timeout.
 - Fail-safe: if extraction fails (rdrview missing, network error, paywall),
   the item's original summary is left untouched so the feed never breaks.
+- Paywall honesty: some sites (e.g. Seeking Alpha, behind PerimeterX + a metered
+  wall) only serve a short teaser to non-browser clients like rdrview. rdrview
+  "succeeds" but the body is cut off mid-sentence with a regwall stub. We detect
+  that, strip the legal boilerplate, and prepend a visible banner so the preview
+  isn't mistaken for the full article. The rest is only reachable in a real
+  browser, so open the link ('o') to read it.
 """
 
 import sys
@@ -24,6 +30,7 @@ import hashlib
 import subprocess
 import xml.etree.ElementTree as ET
 from concurrent.futures import ThreadPoolExecutor
+from urllib.parse import urlparse
 
 CACHE_DIR = os.path.expanduser("~/.cache/newsboat-fulltext")
 TIMEOUT = 20  # seconds per article
@@ -33,6 +40,42 @@ CONTENT_NS = "http://purl.org/rss/1.0/modules/content/"
 os.makedirs(CACHE_DIR, exist_ok=True)
 ET.register_namespace("content", CONTENT_NS)
 
+# Hosts that gate articles behind bot-detection (PerimeterX) + a metered wall:
+# a non-browser client like rdrview always gets a short teaser cut off
+# mid-sentence, never the full body. Anything fetched from these is a teaser.
+GATED_HOSTS = (
+    "seekingalpha.com",
+)
+
+# Content markers for the same situation on other sites. 
Kept conservative so
+# normal feeds (CNBC, etc.) are never flagged. (rdrview's readability strips
+# empty regwall stubs, so host-based detection above is the reliable path.)
+PAYWALL_MARKERS = (
+    "signup_widget_placeholder",
+    'data-test-id="paywall"',
+)
+
+# Legal boilerplate to drop from teasers so the short preview isn't buried.
+DISCLAIMER_RE = re.compile(
+    r"<p>\s*[^<]*Disclaimer:?\s*.*?</p>",
+    re.IGNORECASE | re.DOTALL,
+)
+
+# Prepended to detected teasers so the reader sees at a glance it's not the
+# full text. newsboat's HTML renderer turns ⚠ into the warning sign.
+TEASER_BANNER = (
+    "⚠ PAYWALLED TEASER — the site gated this "
+    "article; only the preview below is available. Open the link in a browser "
+    "('o') for the full text.
\n