From aaa4d1e988faa4d4f72c8ce5cad13260f9256b28 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Mon, 17 Jul 2017 14:56:50 -0400 Subject: [PATCH] Basic support for PicoFeed content scraping - At the moment this is a completely manual setting: feed deduplication makes the setting very hard to handle for multiple users - Improves #60 --- lib/Conf.php | 4 ++- lib/Database.php | 6 ++-- lib/Feed.php | 38 ++++++++++++++++-------- sql/SQLite3/0.sql | 2 ++ tests/Feed/TestFeed.php | 11 +++++++ tests/docroot/Feed/Scraping/Document.php | 13 ++++++++ tests/docroot/Feed/Scraping/Feed.php | 18 +++++++++++ 7 files changed, 77 insertions(+), 15 deletions(-) create mode 100644 tests/docroot/Feed/Scraping/Document.php create mode 100644 tests/docroot/Feed/Scraping/Feed.php diff --git a/lib/Conf.php b/lib/Conf.php index a6905985..d4166f3d 100644 --- a/lib/Conf.php +++ b/lib/Conf.php @@ -74,6 +74,8 @@ class Conf { public $fetchTimeout = 10; /** @var integer Maximum size, in bytes, of data when fetching feeds from foreign servers */ public $fetchSizeLimit = 2 * 1024 * 1024; + /** @var boolean Whether to allow the possibility of fetching full article contents using an item's URL. Whether fetching will actually happen is also governed by a per-feed setting */ + public $fetchEnableScraping = true; /** @var string User-Agent string to use when fetching feeds from foreign servers */ public $fetchUserAgentString; @@ -125,7 +127,7 @@ class Conf { /** Outputs non-default configuration settings as a string compatible with var_export() * - * If provided a file name, will produce the text of a PHP script suitable for laterimport + * If provided a file name, will produce the text of a PHP script suitable for later import * @param string $file Full path and file name for the file to export to */ public function export(string $file = ""): string { // TODO: write export method diff --git a/lib/Database.php b/lib/Database.php index 9fb5d81e..a75e89db 100644 --- a/lib/Database.php +++ b/lib/Database.php @@ -433,13 +433,15 @@ class Database { public function feedUpdate(int $feedID, bool $throwError = false): bool { $tr = $this->db->begin(); // check to make sure the feed exists - $f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count FROM arsse_feeds where id is ?", "int")->run($feedID)->getRow(); + $f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count, scrape FROM arsse_feeds where id is ?", "int")->run($feedID)->getRow(); if(!$f) throw new Db\ExceptionInput("subjectMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]); + // determine whether the feed's items should be scraped for full content from the source Web site + $scrape = (Arsse::$conf->fetchEnableScraping && $f['scrape']); // the Feed object throws an exception when there are problems, but that isn't ideal // here. When an exception is thrown it should update the database with the // error instead of failing; if other exceptions are thrown, we should simply roll back try { - $feed = new Feed($feedID, $f['url'], (string) Date::transform($f['modified'], "http", "sql"), $f['etag'], $f['username'], $f['password']); + $feed = new Feed($feedID, $f['url'], (string) Date::transform($f['modified'], "http", "sql"), $f['etag'], $f['username'], $f['password'], $scrape); if(!$feed->modified) { // if the feed hasn't changed, just compute the next fetch time and record it $this->db->prepare("UPDATE arsse_feeds SET updated = CURRENT_TIMESTAMP, next_fetch = ? WHERE id is ?", 'datetime', 'int')->run($feed->nextFetch, $feedID); diff --git a/lib/Feed.php b/lib/Feed.php index 46751165..4be63f65 100644 --- a/lib/Feed.php +++ b/lib/Feed.php @@ -2,10 +2,11 @@ declare(strict_types=1); namespace JKingWeb\Arsse; use JKingWeb\Arsse\Misc\Date; -use PicoFeed\Reader\Reader; use PicoFeed\PicoFeedException; -use PicoFeed\Reader\Favicon; use PicoFeed\Config\Config; +use PicoFeed\Reader\Reader; +use PicoFeed\Reader\Favicon; +use PicoFeed\Scraper\Scraper; class Feed { public $data = null; @@ -19,7 +20,14 @@ class Feed { public $newItems = []; public $changedItems = []; - public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '') { + public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) { + // set the configuration + $this->config = new Config; + $this->config->setMaxBodySize(Arsse::$conf->fetchSizeLimit); + $this->config->setClientTimeout(Arsse::$conf->fetchTimeout); + $this->config->setGrabberTimeout(Arsse::$conf->fetchTimeout); + $this->config->setClientUserAgent(Arsse::$conf->fetchUserAgentString); + $this->config->setGrabberUserAgent(Arsse::$conf->fetchUserAgentString); // fetch the feed $this->download($url, $lastModified, $etag, $username, $password); // format the HTTP Last-Modified date returned @@ -37,6 +45,8 @@ class Feed { if(!$this->lastModified) $this->lastModified = $this->computeLastModified(); // we only really care if articles have been modified; if there are no new articles, act as if the feed is unchanged if(!sizeof($this->newItems) && !sizeof($this->changedItems)) $this->modified = false; + // if requested, scrape full content for any new and changed items + if($scrape) $this->scrape(); } // compute the time at which the feed should next be fetched $this->nextFetch = $this->computeNextFetch(); @@ -44,14 +54,7 @@ class Feed { public function download(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = ''): bool { try { - $config = new Config; - $config->setMaxBodySize(Arsse::$conf->fetchSizeLimit); - $config->setClientTimeout(Arsse::$conf->fetchTimeout); - $config->setGrabberTimeout(Arsse::$conf->fetchTimeout); - $config->setClientUserAgent(Arsse::$conf->fetchUserAgentString); - $config->setGrabberUserAgent(Arsse::$conf->fetchUserAgentString); - - $this->reader = new Reader($config); + $this->reader = new Reader($this->config); $this->resource = $this->reader->download($url, $lastModified, $etag, $username, $password); } catch (PicoFeedException $e) { throw new Feed\Exception($url, $e); @@ -211,7 +214,6 @@ class Feed { // merge the two change-lists, preserving keys $this->changedItems = array_combine(array_merge(array_keys($this->changedItems), array_keys($changed)), array_merge($this->changedItems, $changed)); } - // TODO: fetch full content when appropriate return true; } @@ -332,4 +334,16 @@ class Feed { rsort($dates); return $dates; } + + protected function scrape(): bool { + $scraper = new Scraper($this->config); + foreach(array_merge($this->newItems, $this->changedItems) as $item) { + $scraper->setUrl($item->url); + $scraper->execute(); + if($scraper->hasRelevantContent()) { + $item->content = $scraper->getFilteredContent(); + } + } + return true; + } } \ No newline at end of file diff --git a/sql/SQLite3/0.sql b/sql/SQLite3/0.sql index f42f7c89..4c69b93d 100644 --- a/sql/SQLite3/0.sql +++ b/sql/SQLite3/0.sql @@ -48,6 +48,8 @@ create table arsse_feeds( err_msg text, -- last error message username text not null default '', -- HTTP authentication username password text not null default '', -- HTTP authentication password (this is stored in plain text) + size integer not null default 0, -- number of articles in the feed at last fetch + scrape boolean not null default 0, -- whether to use picoFeed's content scraper with this feed unique(url,username,password) -- a URL with particular credentials should only appear once ); diff --git a/tests/Feed/TestFeed.php b/tests/Feed/TestFeed.php index 558c8fdb..83cb5771 100644 --- a/tests/Feed/TestFeed.php +++ b/tests/Feed/TestFeed.php @@ -333,4 +333,15 @@ class TestFeed extends Test\AbstractTest { $this->assertCount(0, $f->newItems); $this->assertCount(0, $f->changedItems); } + + function testScrapeFullContent() { + // first make sure that the absence of scraping works as expected + $f = new Feed(null, $this->base."Scraping/Feed"); + $exp = "

Partial content

"; + $this->assertSame($exp, $f->newItems[0]->content); + // now try to scrape and get different content + $f = new Feed(null, $this->base."Scraping/Feed", "", "", "", "", true); + $exp = "

Partial content, followed by more content

"; + $this->assertSame($exp, $f->newItems[0]->content); + } } \ No newline at end of file diff --git a/tests/docroot/Feed/Scraping/Document.php b/tests/docroot/Feed/Scraping/Document.php new file mode 100644 index 00000000..09b389c2 --- /dev/null +++ b/tests/docroot/Feed/Scraping/Document.php @@ -0,0 +1,13 @@ + "text/html", + 'content' => << +Example article + +
+

Partial content, followed by more content

+
+ + +MESSAGE_BODY +]; \ No newline at end of file diff --git a/tests/docroot/Feed/Scraping/Feed.php b/tests/docroot/Feed/Scraping/Feed.php new file mode 100644 index 00000000..8018b52c --- /dev/null +++ b/tests/docroot/Feed/Scraping/Feed.php @@ -0,0 +1,18 @@ + "application/rss+xml", + 'content' => << + + Test feed + http://example.com/ + Example newsfeed title + + + http://localhost:8000/Feed/Scraping/Document + Example article + Partial content + + + +MESSAGE_BODY +]; \ No newline at end of file