summaryrefslogtreecommitdiff
path: root/content/rss-full-text.md
blob: 995797de937c2c4a834270e33b19eb06a3719cf9 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
---
title: "RSS Full Text"
date: 2025-07-01
tags: ["service"]
icon: "rss.svg"
short_desc: "Creates RSS full text feeds."
---

[fivefilters full-text-rss](https://www.fivefilters.org/full-text-rss/)
is a Full-Text RSS service. Most RSS feeds provide only a summary or the
first few lines of each article. You can read the entire web page in a
text-based web browser such as lynx or w3m.
However, I want to do everything in [newsboat](https://github.com/newsboat/newsboat).
With this service, you can retrieve the full text of individual articles or
generate complete full-text RSS feeds. It enriches third-party RSS feeds with
full-text articles.

## Installation

### Setting Up and Configuring

Check which PHP version your distribution installs (e.g., php7.4-fpm) and adjust the PHP-FPM socket path in the later steps accordingly.

```sh
sudo apt update
sudo apt install nginx php php-fpm php-tidy git unzip certbot python3-certbot-nginx
```

Download Full-Text RSS

```sh
cd /var/www
sudo git clone https://bitbucket.org/fivefilters/full-text-rss.git
cd full-text-rss
sudo git reset --hard 384d52fd83361ffd6e7f28bd39b322970a015a28
```

Download Full-Text-RSS site config

```sh
sudo git clone https://github.com/fivefilters/ftr-site-config site_config
```

Set permissions:

```sh
sudo mkdir -p cache/rss
sudo chown -R www-data:www-data cache site_config
# www-data owns both trees after the chown above, so only the owner needs
# write access; a world-writable 777 mode is unnecessary.
sudo chmod -R u=rwX,go=rX cache site_config
```

### Configuring nginx & certbot

Create Nginx Config with Clean /feed + Rate Limiting

Create a new site config:

```sh
sudo nano /etc/nginx/sites-available/fulltextrss
```

Paste this (adjust php7.4-fpm.sock if needed):

```nginx
# Rate limiting zone: 10 req/min per IP
limit_req_zone $binary_remote_addr zone=ratelimit:10m rate=10r/m;

server {
    listen 80;
    server_name rss.thesiah.xyz;  # Change to your domain

    root /var/www/full-text-rss;
    index index.php;

    # Rewrite clean URL /feed?url=... to makefulltextfeed.php.
    # "last" restarts location matching, so the request ends up in the PHP
    # location below; the original query string is kept by the rewrite.
    location /feed {
        rewrite ^/feed$ /makefulltextfeed.php last;
    }

    location ~ \.php$ {
        # The rate limit must live here: limit_req is taken from the location
        # that finally serves the request, and /feed always rewrites away, so
        # a limit_req placed inside /feed would never throttle feed requests.
        limit_req zone=ratelimit burst=5;

        include snippets/fastcgi-php.conf;
        fastcgi_pass unix:/var/run/php/php7.4-fpm.sock;
    }

    # The application is git-cloned into the web root, so block the .git
    # directory as well as .ht* files.
    location ~ /\.(ht|git) {
        deny all;
    }
}
```

Enable the site in nginx:

```sh
sudo ln -s /etc/nginx/sites-available/fulltextrss /etc/nginx/sites-enabled/
sudo nginx -t && sudo systemctl reload nginx
```

Secure with HTTPS (Certbot)

```sh
sudo certbot --nginx -d rss.thesiah.xyz
```

### Custom Config

Create `custom_config.php` and save it to
`/var/www/full-text-rss/custom_config.php`:

```php
<?php
/* Full-Text RSS config */
// Flat list of settings stored as properties on $options.
// NOTE(review): the meaning of each option is defined by the Full-Text RSS
// application, which is not visible here — confirm values against the
// config.php shipped with the application.

// Reuse $options if an including script already created it; otherwise start
// from an empty object.
if (!isset($options)) $options = new stdClass();
$options->enabled = true;
// NOTE(review): debug is switched on unconditionally — confirm this is
// intended for a publicly reachable server.
$options->debug = true;
$options->default_entries = 200;
$options->max_entries = 1000;
$options->content = 'user';
$options->summary = 'user';
$options->rewrite_relative_urls = true;
$options->exclude_items_on_fail = 'user';
$options->singlepage = true;
$options->multipage = true;
$options->caching = true;
$options->cache_time = 120;
// Cache directory is the "cache" folder next to this file.
$options->cache_dir = dirname(__FILE__).'/cache';
$options->message_to_prepend = '';
$options->message_to_append = '';
$options->error_message = '[unable to retrieve full-text content]';
$options->keep_enclosures = true;
$options->detect_language = 'user';
$options->user_submitted_config = false;
$options->remove_native_ads = false;
// The admin password comes from the FTR_ADMIN_PASSWORD environment variable;
// getenv() returns false when that variable is not set.
// NOTE(review): confirm how the application treats a false password.
$options->admin_credentials = array('username'=>'admin', 'password'=>getenv('FTR_ADMIN_PASSWORD'));
$options->allowed_urls = array();
$options->blocked_urls = array();
$options->key_required = false;
$options->api_keys = array();
$options->default_entries_with_key = 5;
$options->max_entries_with_key = 10;
$options->xss_filter = 'user';
$options->favour_feed_titles = 'user';
$options->allowed_parsers = array('libxml', 'html5php');
$options->allow_parser_override = true;
$options->cors = false;
$options->proxy_servers = array();
$options->proxy = true;
$options->allow_proxy_override = true;
$options->apc = true;
$options->smart_cache = true;
// Maps an HTML fragment to array('hostname'=>..., 'head'=>...).
// NOTE(review): presumably used to recognise the publishing platform from
// page markup — confirm.
$options->fingerprints = array(
	'<meta name="generator" content="Posterous"' => array('hostname'=>'fingerprint.posterous.com', 'head'=>true),
	'<meta content=\'blogger\' name=\'generator\'' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
	'<meta name="generator" content="Blogger"' => array('hostname'=>'fingerprint.blogspot.com', 'head'=>true),
	'<meta name="generator" content="WordPress' => array('hostname'=>'fingerprint.wordpress.com', 'head'=>true)
);
// Per host: search-string => replacement-string pairs.
// NOTE(review): presumably applied to article URLs before fetching — confirm.
$options->rewrite_url = array(
	'docs.google.com' => array('/Doc?' => '/View?'),
	'tnr.com' => array('tnr.com/article/' => 'tnr.com/print/article/'),
	'.m.wikipedia.org' => array('.m.wikipedia.org' => '.wikipedia.org'),
	'm.vanityfair.com' => array('m.vanityfair.com' => 'www.vanityfair.com')
);
// Per content type: an action and a display name.
// NOTE(review): presumably controls how non-HTML responses are handled — confirm.
$options->content_type_exc = array(
							   'application/pdf' => array('action'=>'link', 'name'=>'PDF'),
							   'image' => array('action'=>'link', 'name'=>'Image'),
							   'audio' => array('action'=>'link', 'name'=>'Audio'),
							   'video' => array('action'=>'link', 'name'=>'Video')
							  );
$options->cache_directory_level = 0;
$options->cache_cleanup = 100;
// Define the version constant only if nothing has defined it already.
if (!defined('_FF_FTR_VERSION')) define('_FF_FTR_VERSION', '3.5');
// Runs only when this code lives in a file named config.php; in any other
// file (for example custom_config.php) the whole block is skipped.
if (basename(__FILE__) == 'config.php') {
	// Let a custom_config.php in the same directory override the values set above.
	if (file_exists(dirname(__FILE__).'/custom_config.php')) {
		require_once dirname(__FILE__).'/custom_config.php';
	}

	// Allow every option to be overridden by an environment variable named
	// ftr_<option> (for example ftr_max_entries=50).
	foreach ($options as $_key=>&$_val) {
		$_key = "ftr_$_key";
		if (($_env = getenv($_key)) !== false) {
			if (is_array($_val)) {
				// Array options are left untouched, except the admin
				// credentials, which are given as "username:password".
				if ($_key === 'ftr_admin_credentials') {
					$_parts = array_map('trim', explode(':', $_env, 2));
					// Check the part count explicitly: array_combine() with
					// mismatched sizes throws on PHP 8 instead of returning false.
					if (count($_parts) === 2) {
						$_val = array('username'=>$_parts[0], 'password'=>$_parts[1]);
					} else {
						$_val = array('username'=>'admin', 'password'=>'');
					}
				}
			} elseif ($_env === 'true' || $_env === 'false') {
				$_val = ($_env === 'true');
			} elseif (is_numeric($_env)) {
				$_val = (int)$_env;
			} else {
				// Any other value is kept as the raw string.
				$_val = $_env;
			}
		}
	}
	// Break the by-reference binding and drop the temporaries.
	unset($_key, $_val, $_env, $_parts);
}
```

### Local host

Visit http://localhost:80 for the integrated web UI

{{<img src="/pix/rss-webui.png" alt="A screenshot of the Full Text RSS" >}}

- Article extraction: `http://localhost/extract.php?url=[url]`
- Feed conversion: `http://localhost/makefulltextfeed.php?url=[url]`