mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2025-01-10 18:02:40 +00:00
Basic support for PicoFeed content scraping
- At the moment this is a completely manual, per-feed setting: feed deduplication makes the setting very hard to handle for multiple users
- Improves #60
This commit is contained in:
parent
da092d5f8c
commit
aaa4d1e988
7 changed files with 77 additions and 15 deletions
|
@ -74,6 +74,8 @@ class Conf {
|
||||||
public $fetchTimeout = 10;
|
public $fetchTimeout = 10;
|
||||||
/** @var integer Maximum size, in bytes, of data when fetching feeds from foreign servers */
|
/** @var integer Maximum size, in bytes, of data when fetching feeds from foreign servers */
|
||||||
public $fetchSizeLimit = 2 * 1024 * 1024;
|
public $fetchSizeLimit = 2 * 1024 * 1024;
|
||||||
|
/** @var boolean Whether to allow the possibility of fetching full article contents using an item's URL. Whether fetching will actually happen is also governed by a per-feed setting */
|
||||||
|
public $fetchEnableScraping = true;
|
||||||
/** @var string User-Agent string to use when fetching feeds from foreign servers */
|
/** @var string User-Agent string to use when fetching feeds from foreign servers */
|
||||||
public $fetchUserAgentString;
|
public $fetchUserAgentString;
|
||||||
|
|
||||||
|
|
|
@ -433,13 +433,15 @@ class Database {
|
||||||
public function feedUpdate(int $feedID, bool $throwError = false): bool {
|
public function feedUpdate(int $feedID, bool $throwError = false): bool {
|
||||||
$tr = $this->db->begin();
|
$tr = $this->db->begin();
|
||||||
// check to make sure the feed exists
|
// check to make sure the feed exists
|
||||||
$f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count FROM arsse_feeds where id is ?", "int")->run($feedID)->getRow();
|
$f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count, scrape FROM arsse_feeds where id is ?", "int")->run($feedID)->getRow();
|
||||||
if(!$f) throw new Db\ExceptionInput("subjectMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
|
if(!$f) throw new Db\ExceptionInput("subjectMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
|
||||||
|
// determine whether the feed's items should be scraped for full content from the source Web site
|
||||||
|
$scrape = (Arsse::$conf->fetchEnableScraping && $f['scrape']);
|
||||||
// the Feed object throws an exception when there are problems, but that isn't ideal
|
// the Feed object throws an exception when there are problems, but that isn't ideal
|
||||||
// here. When an exception is thrown it should update the database with the
|
// here. When an exception is thrown it should update the database with the
|
||||||
// error instead of failing; if other exceptions are thrown, we should simply roll back
|
// error instead of failing; if other exceptions are thrown, we should simply roll back
|
||||||
try {
|
try {
|
||||||
$feed = new Feed($feedID, $f['url'], (string) Date::transform($f['modified'], "http", "sql"), $f['etag'], $f['username'], $f['password']);
|
$feed = new Feed($feedID, $f['url'], (string) Date::transform($f['modified'], "http", "sql"), $f['etag'], $f['username'], $f['password'], $scrape);
|
||||||
if(!$feed->modified) {
|
if(!$feed->modified) {
|
||||||
// if the feed hasn't changed, just compute the next fetch time and record it
|
// if the feed hasn't changed, just compute the next fetch time and record it
|
||||||
$this->db->prepare("UPDATE arsse_feeds SET updated = CURRENT_TIMESTAMP, next_fetch = ? WHERE id is ?", 'datetime', 'int')->run($feed->nextFetch, $feedID);
|
$this->db->prepare("UPDATE arsse_feeds SET updated = CURRENT_TIMESTAMP, next_fetch = ? WHERE id is ?", 'datetime', 'int')->run($feed->nextFetch, $feedID);
|
||||||
|
|
38
lib/Feed.php
38
lib/Feed.php
|
@ -2,10 +2,11 @@
|
||||||
declare(strict_types=1);
|
declare(strict_types=1);
|
||||||
namespace JKingWeb\Arsse;
|
namespace JKingWeb\Arsse;
|
||||||
use JKingWeb\Arsse\Misc\Date;
|
use JKingWeb\Arsse\Misc\Date;
|
||||||
use PicoFeed\Reader\Reader;
|
|
||||||
use PicoFeed\PicoFeedException;
|
use PicoFeed\PicoFeedException;
|
||||||
use PicoFeed\Reader\Favicon;
|
|
||||||
use PicoFeed\Config\Config;
|
use PicoFeed\Config\Config;
|
||||||
|
use PicoFeed\Reader\Reader;
|
||||||
|
use PicoFeed\Reader\Favicon;
|
||||||
|
use PicoFeed\Scraper\Scraper;
|
||||||
|
|
||||||
class Feed {
|
class Feed {
|
||||||
public $data = null;
|
public $data = null;
|
||||||
|
@ -19,7 +20,14 @@ class Feed {
|
||||||
public $newItems = [];
|
public $newItems = [];
|
||||||
public $changedItems = [];
|
public $changedItems = [];
|
||||||
|
|
||||||
public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '') {
|
public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) {
|
||||||
|
// set the configuration
|
||||||
|
$this->config = new Config;
|
||||||
|
$this->config->setMaxBodySize(Arsse::$conf->fetchSizeLimit);
|
||||||
|
$this->config->setClientTimeout(Arsse::$conf->fetchTimeout);
|
||||||
|
$this->config->setGrabberTimeout(Arsse::$conf->fetchTimeout);
|
||||||
|
$this->config->setClientUserAgent(Arsse::$conf->fetchUserAgentString);
|
||||||
|
$this->config->setGrabberUserAgent(Arsse::$conf->fetchUserAgentString);
|
||||||
// fetch the feed
|
// fetch the feed
|
||||||
$this->download($url, $lastModified, $etag, $username, $password);
|
$this->download($url, $lastModified, $etag, $username, $password);
|
||||||
// format the HTTP Last-Modified date returned
|
// format the HTTP Last-Modified date returned
|
||||||
|
@ -37,6 +45,8 @@ class Feed {
|
||||||
if(!$this->lastModified) $this->lastModified = $this->computeLastModified();
|
if(!$this->lastModified) $this->lastModified = $this->computeLastModified();
|
||||||
// we only really care if articles have been modified; if there are no new articles, act as if the feed is unchanged
|
// we only really care if articles have been modified; if there are no new articles, act as if the feed is unchanged
|
||||||
if(!sizeof($this->newItems) && !sizeof($this->changedItems)) $this->modified = false;
|
if(!sizeof($this->newItems) && !sizeof($this->changedItems)) $this->modified = false;
|
||||||
|
// if requested, scrape full content for any new and changed items
|
||||||
|
if($scrape) $this->scrape();
|
||||||
}
|
}
|
||||||
// compute the time at which the feed should next be fetched
|
// compute the time at which the feed should next be fetched
|
||||||
$this->nextFetch = $this->computeNextFetch();
|
$this->nextFetch = $this->computeNextFetch();
|
||||||
|
@ -44,14 +54,7 @@ class Feed {
|
||||||
|
|
||||||
public function download(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = ''): bool {
|
public function download(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = ''): bool {
|
||||||
try {
|
try {
|
||||||
$config = new Config;
|
$this->reader = new Reader($this->config);
|
||||||
$config->setMaxBodySize(Arsse::$conf->fetchSizeLimit);
|
|
||||||
$config->setClientTimeout(Arsse::$conf->fetchTimeout);
|
|
||||||
$config->setGrabberTimeout(Arsse::$conf->fetchTimeout);
|
|
||||||
$config->setClientUserAgent(Arsse::$conf->fetchUserAgentString);
|
|
||||||
$config->setGrabberUserAgent(Arsse::$conf->fetchUserAgentString);
|
|
||||||
|
|
||||||
$this->reader = new Reader($config);
|
|
||||||
$this->resource = $this->reader->download($url, $lastModified, $etag, $username, $password);
|
$this->resource = $this->reader->download($url, $lastModified, $etag, $username, $password);
|
||||||
} catch (PicoFeedException $e) {
|
} catch (PicoFeedException $e) {
|
||||||
throw new Feed\Exception($url, $e);
|
throw new Feed\Exception($url, $e);
|
||||||
|
@ -211,7 +214,6 @@ class Feed {
|
||||||
// merge the two change-lists, preserving keys
|
// merge the two change-lists, preserving keys
|
||||||
$this->changedItems = array_combine(array_merge(array_keys($this->changedItems), array_keys($changed)), array_merge($this->changedItems, $changed));
|
$this->changedItems = array_combine(array_merge(array_keys($this->changedItems), array_keys($changed)), array_merge($this->changedItems, $changed));
|
||||||
}
|
}
|
||||||
// TODO: fetch full content when appropriate
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -332,4 +334,16 @@ class Feed {
|
||||||
rsort($dates);
|
rsort($dates);
|
||||||
return $dates;
|
return $dates;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Replaces the content of each new and changed item with the full article
 * content scraped from the item's URL.
 *
 * The scraper is built from the feed's own PicoFeed configuration object
 * ($this->config). An item keeps the content supplied by the feed when it
 * has no URL, or when the scraper reports that it found no relevant content.
 *
 * @return bool Always true
 */
protected function scrape(): bool {
    $scraper = new Scraper($this->config);
    foreach(array_merge($this->newItems, $this->changedItems) as $item) {
        // an item without a URL has nothing to fetch; skipping it also guarantees
        // the shared scraper is never executed without a fresh URL for this item
        if((string) $item->url === '') continue;
        $scraper->setUrl($item->url);
        $scraper->execute();
        // only overwrite the feed-supplied content when scraping found something usable
        if($scraper->hasRelevantContent()) {
            $item->content = $scraper->getFilteredContent();
        }
    }
    return true;
}
|
||||||
}
|
}
|
|
@ -48,6 +48,8 @@ create table arsse_feeds(
|
||||||
err_msg text, -- last error message
|
err_msg text, -- last error message
|
||||||
username text not null default '', -- HTTP authentication username
|
username text not null default '', -- HTTP authentication username
|
||||||
password text not null default '', -- HTTP authentication password (this is stored in plain text)
|
password text not null default '', -- HTTP authentication password (this is stored in plain text)
|
||||||
|
size integer not null default 0, -- number of articles in the feed at last fetch
|
||||||
|
scrape boolean not null default 0, -- whether to use picoFeed's content scraper with this feed
|
||||||
unique(url,username,password) -- a URL with particular credentials should only appear once
|
unique(url,username,password) -- a URL with particular credentials should only appear once
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
|
@ -333,4 +333,15 @@ class TestFeed extends Test\AbstractTest {
|
||||||
$this->assertCount(0, $f->newItems);
|
$this->assertCount(0, $f->newItems);
|
||||||
$this->assertCount(0, $f->changedItems);
|
$this->assertCount(0, $f->changedItems);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Checks that an item's content is only replaced by scraped content when
 * scraping is explicitly requested through the Feed constructor.
 */
function testScrapeFullContent() {
    $feedUrl = $this->base."Scraping/Feed";
    // with scraping left at its default (off), the item carries the feed's own partial content
    $unscraped = new Feed(null, $feedUrl);
    $this->assertSame("<p>Partial content</p>", $unscraped->newItems[0]->content);
    // with scraping switched on, the same item carries the content of the linked document
    $scraped = new Feed(null, $feedUrl, "", "", "", "", true);
    $this->assertSame("<p>Partial content, followed by more content</p>", $scraped->newItems[0]->content);
}
|
||||||
}
|
}
|
13
tests/docroot/Feed/Scraping/Document.php
Normal file
13
tests/docroot/Feed/Scraping/Document.php
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
<?php return [ // test fixture: returns an array describing an HTML document (its 'mime' type and its 'content')
|
||||||
|
'mime' => "text/html", // media type of the document; NOTE(review): presumably consumed by the test web server - confirm
|
||||||
|
'content' => <<<MESSAGE_BODY
|
||||||
|
<html>
|
||||||
|
<title>Example article</title>
|
||||||
|
<body>
|
||||||
|
<article>
|
||||||
|
<p>Partial content, followed by more content</p>
|
||||||
|
</article>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
MESSAGE_BODY
|
||||||
|
];
|
18
tests/docroot/Feed/Scraping/Feed.php
Normal file
18
tests/docroot/Feed/Scraping/Feed.php
Normal file
|
@ -0,0 +1,18 @@
|
||||||
|
<?php return [ // test fixture: returns an array describing an RSS 2.0 feed (its 'mime' type and its 'content')
|
||||||
|
'mime' => "application/rss+xml", // media type of the document; NOTE(review): presumably consumed by the test web server - confirm
|
||||||
|
'content' => <<<MESSAGE_BODY
|
||||||
|
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
|
||||||
|
<channel>
|
||||||
|
<title>Test feed</title>
|
||||||
|
<link>http://example.com/</link>
|
||||||
|
<description>Example newsfeed title</description>
|
||||||
|
|
||||||
|
<item>
|
||||||
|
<guid>http://localhost:8000/Feed/Scraping/Document</guid>
|
||||||
|
<title>Example article</title>
|
||||||
|
<description>Partial content</description>
|
||||||
|
</item>
|
||||||
|
</channel>
|
||||||
|
</rss>
|
||||||
|
MESSAGE_BODY
|
||||||
|
];
|
Loading…
Reference in a new issue