From 3f61921b97d2cbf6bd411cb0a60e244bf1df4e4a Mon Sep 17 00:00:00 2001
From: Dustin Wilson
Date: Sat, 18 Mar 2017 11:01:23 -0500
Subject: [PATCH] Added picoFeed wrapper
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

• Implemented a simple wrapper for picoFeed which fixes the id problems and keeps error handling within its own class
• Updated Database.php to use the new class
• Replaced mentions of ownCloud with NextCloud in the schema
• Added hashes to schema for identification and change detection; removed NextCloud hash and fingerprint; removed enclosure and category hashes
• Fixed the feed id being undefined when subscribing to a feed which already exists in the database
---
 lib/Database.php  | 88 ++++++++++++++++++++++++++++-------------------
 lib/Feed.php      | 75 ++++++++++++++++++++++++++++++++++++++++++
 sql/SQLite3/0.sql | 15 ++++-----
 3 files changed, 134 insertions(+), 44 deletions(-)
 create mode 100644 lib/Feed.php

diff --git a/lib/Database.php b/lib/Database.php
index 6ff36c50..301d50a0 100644
--- a/lib/Database.php
+++ b/lib/Database.php
@@ -2,8 +2,6 @@
 declare(strict_types=1);
 namespace JKingWeb\NewsSync;
 use PasswordGenerator\Generator as PassGen;
-use PicoFeed\Reader\Reader;
-use PicoFeed\PicoFeedException;
 
 class Database {
 
@@ -262,47 +260,65 @@ class Database {
 
         $this->db->begin();
 
-        // If the feed doesn't already exist in the database then add it to the database after determining its validity with PicoFeed.
+        // If the feed doesn't already exist in the database then add it to the database
+        // after determining its validity with PicoFeed.
         $qFeed = $this->db->prepare("SELECT id from newssync_feeds where url is ? and username is ? and password is ?", "str", "str", "str");
-        $feed = $qFeed->run($url, $fetchUser, $fetchPassword)->getValue();
-        if ($feed === null) {
-            try {
-                $reader = new Reader;
-                $resource = $reader->download($url);
-
-                $parser = $reader->getParser(
-                    $resource->getUrl(),
-                    $resource->getContent(),
-                    $resource->getEncoding()
-                );
-
-                $feed = $parser->execute();
-            } catch (PicoFeedException $e) {
-                // If there's any error while trying to download or parse the feed then return an exception.
-                throw new Feed\Exception($url, $e);
-            }
+        // Keep the id of a feed which already exists; it is null only when the feed still has to be added.
+        $feedID = $qFeed->run($url, $fetchUser, $fetchPassword)->getValue();
+        if ($feedID === null) {
+            $feed = new Feed($url);
+            $feed->parse();
 
+            // Add the feed to the database and return its Id which will be used when adding
+            // its articles to the database.
             $feedID = $this->db->prepare(
-                "INSERT INTO newssync_feeds(url,title,favicon,source,updated,modified,etag,username,password) values(?,?,?,?,?,?,?,?,?)",
-                "str", "str", "str", "str", "datetime", "datetime", "str", "str", "str"
-            )->run(
-                $url,
-                $feed->title,
-                // Grab the favicon for the Goodfeed; returns an empty string if it cannot find one.
-                (new \PicoFeed\Reader\Favicon)->find($url),
-                $feed->siteUrl,
-                $feed->date,
-                $resource->getLastModified(),
-                $resource->getEtag(),
-                $fetchUser,
-                $fetchPassword
-            )->lastId();
+                'INSERT INTO newssync_feeds(url,title,favicon,source,updated,modified,etag,username,password)
+                values(?,?,?,?,?,?,?,?,?)',
+                'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str')->run(
+                    $url,
+                    $feed->data->title,
+                    // Grab the favicon for the feed; returns an empty string if it cannot find one.
+                    $feed->favicon,
+                    $feed->data->siteUrl,
+                    $feed->data->date,
+                    $feed->resource->getLastModified(),
+                    $feed->resource->getEtag(),
+                    $fetchUser,
+                    $fetchPassword
+                )->lastId();
 
-            // TODO: Populate newssync_articles with contents of what was obtained from PicoFeed.
+            // Add each of the articles to the database.
+            foreach ($feed->data->items as $i) {
+                $articleID = $this->db->prepare('INSERT INTO newssync_articles(feed,url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash)
+                    values(?,?,?,?,?,?,?,?,?,?,?)',
+                    'int', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str')->run(
+                        $feedID,
+                        $i->url,
+                        $i->title,
+                        $i->author,
+                        $i->publishedDate,
+                        $i->updatedDate,
+                        $i->id,
+                        $i->content,
+                        // Feeds cannot be trusted to have valid ids, so additional hashes are stored as identifiers.
+                        // The hashes are always computed so that they can also be checked against for changes.
+                        hash('sha256', $i->url.$i->title),
+                        hash('sha256', $i->url.$i->content.$i->enclosureUrl.$i->enclosureType),
+                        hash('sha256', $i->title.$i->content.$i->enclosureUrl.$i->enclosureType)
+                    )->lastId();
+
+                // If the article has categories add them to the tags table.
+                $categories = $i->getTag('category');
+                if (count($categories) > 0) {
+                    foreach ($categories as $c) {
+                        $this->db->prepare('INSERT INTO newssync_tags(article,name) values(?,?)', 'int', 'str')->run($articleID, $c);
+                    }
+                }
+            }
         }
 
         // Add the feed to the user's subscriptions.
-        $sub = $this->db->prepare("INSERT INTO newssync_subscriptions(owner,feed) values(?,?)", "str", "int")->run($user, $feedID)->lastId();
+        $sub = $this->db->prepare('INSERT INTO newssync_subscriptions(owner,feed) values(?,?)', 'str', 'int')->run($user, $feedID)->lastId();
         $this->db->commit();
         return $sub;
     }
diff --git a/lib/Feed.php b/lib/Feed.php
new file mode 100644
index 00000000..795cc92a
--- /dev/null
+++ b/lib/Feed.php
@@ -0,0 +1,75 @@
+            $this->reader = new Reader;
+            $this->resource = $this->reader->download($url, $lastModified, $etag);
+            // Grab the favicon for the feed; returns an empty string if it cannot find one.
+            $this->favicon = (new Favicon)->find($url);
+        } catch (PicoFeedException $e) {
+            throw new Feed\Exception($url, $e);
+        }
+    }
+
+    // Parses the downloaded resource, normalizes the item ids, and stores the result in $this->data; throws Feed\Exception on failure.
+    public function parse(): bool {
+        try {
+            $this->parser = $this->reader->getParser(
+                $this->resource->getUrl(),
+                $this->resource->getContent(),
+                $this->resource->getEncoding()
+            );
+
+            $feed = $this->parser->execute();
+        } catch (PicoFeedException $e) {
+            throw new Feed\Exception($this->resource->getUrl(), $e);
+        }
+
+        // PicoFeed does not provide valid ids when there is no id element; it instead hashes
+        // the url, title, and content together, which is unsuitable as an identifier. Many
+        // feeds are frankenstein mixtures of Atom and RSS, some are pure RSS with guid elements,
+        // and others use the Dublin Core spec for identification. These feeds shouldn't be
+        // duplicated when updated; that should only be reserved for severely broken feeds.
+
+        foreach ($feed->items as $f) {
+            // If there is an id element then continue. The id is used already.
+            $id = (string)$f->xml->id;
+            if ($id !== '') {
+                continue;
+            }
+
+            // If there is a guid element use it as the id.
+            $id = (string)$f->xml->guid;
+            if ($id !== '') {
+                $f->id = hash('sha256', $id);
+                continue;
+            }
+
+            // If there is a Dublin Core identifier use it.
+            $id = (string)$f->xml->children('http://purl.org/dc/elements/1.1/')->identifier;
+            if ($id !== '') {
+                $f->id = hash('sha256', $id);
+                continue;
+            }
+
+            // If there aren't any of those there is no id. Hashes are created when adding
+            // the feed to the database which will serve to identify the post in this
+            // situation.
+            $f->id = '';
+        }
+
+        $this->data = $feed;
+        return true;
+    }
+}
\ No newline at end of file
diff --git a/sql/SQLite3/0.sql b/sql/SQLite3/0.sql
index eefcb9a3..a181f131 100644
--- a/sql/SQLite3/0.sql
+++ b/sql/SQLite3/0.sql
@@ -43,18 +43,18 @@ create table newssync_subscriptions(
     added datetime not null default CURRENT_TIMESTAMP, -- time at which feed was added
     modified datetime not null default CURRENT_TIMESTAMP, -- date at which subscription properties were last modified
     title TEXT, -- user-supplied title
-    order_type int not null default 0, -- ownCloud sort order
+    order_type int not null default 0, -- NextCloud sort order
     pinned boolean not null default 0, -- whether feed is pinned (always sorts at top)
-    folder integer references newssync_folders(id) on delete set null, -- TT-RSS category (nestable); the first-level category (which acts as ownCloud folder) is joined in when needed
+    folder integer references newssync_folders(id) on delete set null, -- TT-RSS category (nestable); the first-level category (which acts as NextCloud folder) is joined in when needed
     unique(owner,feed) -- a given feed should only appear once for a given owner
 );
 
--- TT-RSS categories and ownCloud folders
+-- TT-RSS categories and NextCloud folders
 create table newssync_folders(
     id integer primary key not null, -- sequence number
     owner TEXT not null references newssync_users(id) on delete cascade on update cascade, -- owner of folder
     parent integer default null, -- parent folder id
-    root integer default null, -- first-level folder (ownCloud folder)
+    root integer default null, -- first-level folder (NextCloud folder)
     name TEXT not null, -- folder name
     modified datetime not null default CURRENT_TIMESTAMP, --
     unique(owner,name,parent) -- cannot have multiple folders with the same name under the same parent for the same owner
@@ -72,10 +72,9 @@ create table newssync_articles(
     guid TEXT, -- GUID
     content TEXT, -- content, as (X)HTML
     modified datetime not null default CURRENT_TIMESTAMP, -- date when article properties were last modified
-    hash varchar(64) not null, -- ownCloud hash
-    fingerprint varchar(64) not null, -- ownCloud fingerprint
-    enclosures_hash varchar(64), -- hash of enclosures, if any; since enclosures are not uniquely identified, we need to know when they change
-    tags_hash varchar(64) -- hash of RSS/Atom categories included in article; since these categories are not uniquely identified, we need to know when they change
+    url_title_hash varchar(64), -- hash of URL + title; used when checking for updates and for identification if there is no guid.
+    url_content_hash varchar(64), -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
+    title_content_hash varchar(64) -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
 );
 
 -- enclosures associated with articles