From aaa4d1e988faa4d4f72c8ce5cad13260f9256b28 Mon Sep 17 00:00:00 2001
From: "J. King" <jking@jkingweb.ca>
Date: Mon, 17 Jul 2017 14:56:50 -0400
Subject: [PATCH] Basic support for PicoFeed content scraping

- At the moment this is a completely manual setting: feed deduplication makes the setting very hard to handle for multiple users
- Improves #60
---
 lib/Conf.php                             |  4 ++-
 lib/Database.php                         |  6 ++--
 lib/Feed.php                             | 38 ++++++++++++++++--------
 sql/SQLite3/0.sql                        |  2 ++
 tests/Feed/TestFeed.php                  | 11 +++++++
 tests/docroot/Feed/Scraping/Document.php | 13 ++++++++
 tests/docroot/Feed/Scraping/Feed.php     | 18 +++++++++++
 7 files changed, 77 insertions(+), 15 deletions(-)
 create mode 100644 tests/docroot/Feed/Scraping/Document.php
 create mode 100644 tests/docroot/Feed/Scraping/Feed.php

diff --git a/lib/Conf.php b/lib/Conf.php
index a6905985..d4166f3d 100644
--- a/lib/Conf.php
+++ b/lib/Conf.php
@@ -74,6 +74,8 @@ class Conf {
     public $fetchTimeout            = 10;
     /** @var integer Maximum size, in bytes, of data when fetching feeds from foreign servers */
     public $fetchSizeLimit          = 2 * 1024 * 1024;
+    /** @var boolean Whether to allow the possibility of fetching full article contents using an item's URL. Whether fetching will actually happen is also governed by a per-feed setting */
+    public $fetchEnableScraping     = true;
     /** @var string User-Agent string to use when fetching feeds from foreign servers */
     public $fetchUserAgentString;
 
@@ -125,7 +127,7 @@ class Conf {
 
     /** Outputs non-default configuration settings as a string compatible with var_export()
     *
-    * If provided a file name, will produce the text of a PHP script suitable for laterimport
+    * If provided a file name, will produce the text of a PHP script suitable for later import
     * @param string $file Full path and file name for the file to export to */
     public function export(string $file = ""): string {
         // TODO: write export method
diff --git a/lib/Database.php b/lib/Database.php
index 9fb5d81e..a75e89db 100644
--- a/lib/Database.php
+++ b/lib/Database.php
@@ -433,13 +433,15 @@ class Database {
     public function feedUpdate(int $feedID, bool $throwError = false): bool {
         $tr = $this->db->begin();
         // check to make sure the feed exists
-        $f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count FROM arsse_feeds where id is ?", "int")->run($feedID)->getRow();
+        $f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count, scrape FROM arsse_feeds where id is ?", "int")->run($feedID)->getRow();
         if(!$f) throw new Db\ExceptionInput("subjectMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
+        // determine whether the feed's items should be scraped for full content from the source Web site
+        $scrape = (Arsse::$conf->fetchEnableScraping && $f['scrape']);
         // the Feed object throws an exception when there are problems, but that isn't ideal
         // here. When an exception is thrown it should update the database with the
         // error instead of failing; if other exceptions are thrown, we should simply roll back
         try {
-            $feed = new Feed($feedID, $f['url'], (string) Date::transform($f['modified'], "http", "sql"), $f['etag'], $f['username'], $f['password']);
+            $feed = new Feed($feedID, $f['url'], (string) Date::transform($f['modified'], "http", "sql"), $f['etag'], $f['username'], $f['password'], $scrape);
             if(!$feed->modified) {
                 // if the feed hasn't changed, just compute the next fetch time and record it
                 $this->db->prepare("UPDATE arsse_feeds SET updated = CURRENT_TIMESTAMP, next_fetch = ? WHERE id is ?", 'datetime', 'int')->run($feed->nextFetch, $feedID);
diff --git a/lib/Feed.php b/lib/Feed.php
index 46751165..4be63f65 100644
--- a/lib/Feed.php
+++ b/lib/Feed.php
@@ -2,10 +2,11 @@
 declare(strict_types=1);
 namespace JKingWeb\Arsse;
 use JKingWeb\Arsse\Misc\Date;
-use PicoFeed\Reader\Reader;
 use PicoFeed\PicoFeedException;
-use PicoFeed\Reader\Favicon;
 use PicoFeed\Config\Config;
+use PicoFeed\Reader\Reader;
+use PicoFeed\Reader\Favicon;
+use PicoFeed\Scraper\Scraper;
 
 class Feed {    
     public $data = null;
@@ -19,7 +20,14 @@ class Feed {
     public $newItems = [];
     public $changedItems = [];
 
-    public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '') {
+    public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) {
+        // set the configuration
+        $this->config = new Config;
+        $this->config->setMaxBodySize(Arsse::$conf->fetchSizeLimit);
+        $this->config->setClientTimeout(Arsse::$conf->fetchTimeout);
+        $this->config->setGrabberTimeout(Arsse::$conf->fetchTimeout);
+        $this->config->setClientUserAgent(Arsse::$conf->fetchUserAgentString);
+        $this->config->setGrabberUserAgent(Arsse::$conf->fetchUserAgentString);
         // fetch the feed
         $this->download($url, $lastModified, $etag, $username, $password);
         // format the HTTP Last-Modified date returned
@@ -37,6 +45,8 @@ class Feed {
             if(!$this->lastModified) $this->lastModified = $this->computeLastModified();
             // we only really care if articles have been modified; if there are no new articles, act as if the feed is unchanged
             if(!sizeof($this->newItems) && !sizeof($this->changedItems)) $this->modified = false;
+            // if requested, scrape full content for any new and changed items
+            if($scrape) $this->scrape();
         }
         // compute the time at which the feed should next be fetched
         $this->nextFetch = $this->computeNextFetch();
@@ -44,14 +54,7 @@ class Feed {
 
     public function download(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = ''): bool {
         try {
-            $config = new Config;
-            $config->setMaxBodySize(Arsse::$conf->fetchSizeLimit);
-            $config->setClientTimeout(Arsse::$conf->fetchTimeout);
-            $config->setGrabberTimeout(Arsse::$conf->fetchTimeout);
-            $config->setClientUserAgent(Arsse::$conf->fetchUserAgentString);
-            $config->setGrabberUserAgent(Arsse::$conf->fetchUserAgentString);
-
-            $this->reader = new Reader($config);
+            $this->reader = new Reader($this->config);
             $this->resource = $this->reader->download($url, $lastModified, $etag, $username, $password);
         } catch (PicoFeedException $e) {
             throw new Feed\Exception($url, $e);
@@ -211,7 +214,6 @@ class Feed {
             // merge the two change-lists, preserving keys
             $this->changedItems = array_combine(array_merge(array_keys($this->changedItems), array_keys($changed)), array_merge($this->changedItems, $changed));
         }
-        // TODO: fetch full content when appropriate
         return true;
     }
 
@@ -332,4 +334,16 @@ class Feed {
         rsort($dates);
         return $dates;
     }
+
+    protected function scrape(): bool { // swaps each new or changed item's content for full content scraped from the item's URL; always returns true
+        $scraper = new Scraper($this->config); // one scraper, sharing the feed's fetch configuration, is reused for every item
+        foreach(array_merge($this->newItems, $this->changedItems) as $item) { // items are modified in place
+            $scraper->setUrl($item->url);
+            $scraper->execute();
+            if($scraper->hasRelevantContent()) { // otherwise the content supplied by the feed is left untouched
+                $item->content = $scraper->getFilteredContent();
+            }
+        }
+        return true;
+    }
 }
\ No newline at end of file
diff --git a/sql/SQLite3/0.sql b/sql/SQLite3/0.sql
index f42f7c89..4c69b93d 100644
--- a/sql/SQLite3/0.sql
+++ b/sql/SQLite3/0.sql
@@ -48,6 +48,8 @@ create table arsse_feeds(
     err_msg text,                                                                                           -- last error message
     username text not null default '',                                                                      -- HTTP authentication username
     password text not null default '',                                                                      -- HTTP authentication password (this is stored in plain text)
+    size integer not null default 0,                                                                        -- number of articles in the feed at last fetch
+    scrape boolean not null default 0,                                                                      -- whether to use PicoFeed's content scraper with this feed
     unique(url,username,password)                                                                           -- a URL with particular credentials should only appear once
 );
 
diff --git a/tests/Feed/TestFeed.php b/tests/Feed/TestFeed.php
index 558c8fdb..83cb5771 100644
--- a/tests/Feed/TestFeed.php
+++ b/tests/Feed/TestFeed.php
@@ -333,4 +333,15 @@ class TestFeed extends Test\AbstractTest {
         $this->assertCount(0, $f->newItems);
         $this->assertCount(0, $f->changedItems);   
     }
+
+    function testScrapeFullContent() { // scraping must be opt-in and, when enabled, must replace the feed's partial content
+        // first make sure that the absence of scraping works as expected
+        $f = new Feed(null, $this->base."Scraping/Feed"); // the scrape argument is omitted, so it defaults to false
+        $exp = "<p>Partial content</p>";
+        $this->assertSame($exp, $f->newItems[0]->content);
+        // now try to scrape and get different content
+        $f = new Feed(null, $this->base."Scraping/Feed", "", "", "", "", true); // the final argument enables scraping
+        $exp = "<p>Partial content, followed by more content</p>";
+        $this->assertSame($exp, $f->newItems[0]->content);
+    }
 }
\ No newline at end of file
diff --git a/tests/docroot/Feed/Scraping/Document.php b/tests/docroot/Feed/Scraping/Document.php
new file mode 100644
index 00000000..09b389c2
--- /dev/null
+++ b/tests/docroot/Feed/Scraping/Document.php
@@ -0,0 +1,13 @@
+<?php return [ // HTML fixture: the article page whose URL is the guid of the item in the Scraping/Feed fixture
+    'mime'    => "text/html",
+    'content' => <<<MESSAGE_BODY
+<html>
+<title>Example article</title>
+<body>
+    <article>
+        <p>Partial content, followed by more content</p>
+    </article>
+</body>
+</html>
+MESSAGE_BODY
+];
\ No newline at end of file
diff --git a/tests/docroot/Feed/Scraping/Feed.php b/tests/docroot/Feed/Scraping/Feed.php
new file mode 100644
index 00000000..8018b52c
--- /dev/null
+++ b/tests/docroot/Feed/Scraping/Feed.php
@@ -0,0 +1,18 @@
+<?php return [ // RSS fixture: a single item carrying only partial content, whose guid is the URL of the Scraping/Document fixture
+    'mime'    => "application/rss+xml", // NOTE(review): the guid below hard-codes localhost:8000 - confirm this matches the test server's base URL
+    'content' => <<<MESSAGE_BODY
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/">
+<channel>
+    <title>Test feed</title>
+    <link>http://example.com/</link>
+    <description>Example newsfeed title</description>
+
+    <item>
+        <guid>http://localhost:8000/Feed/Scraping/Document</guid>
+        <title>Example article</title>
+        <description>Partial content</description>
+    </item>
+</channel>
+</rss>
+MESSAGE_BODY
+];
\ No newline at end of file