1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2025-01-08 17:02:41 +00:00

Added picoFeed wrapper

• Implemented a simple wrapper for picoFeed which fixes the id problems
and keeps error handling within its own class
• Updated Database.php to use the new class
• Replaced mentions of ownCloud with NextCloud in the schema
• Added hashes to schema for identification and change detection;
removed NextCloud hash and fingerprint; removed enclosure and category
hashes
This commit is contained in:
Dustin Wilson 2017-03-18 11:01:23 -05:00
parent b821d728e4
commit 3f61921b97
3 changed files with 131 additions and 42 deletions

View file

@ -2,8 +2,6 @@
declare(strict_types=1); declare(strict_types=1);
namespace JKingWeb\NewsSync; namespace JKingWeb\NewsSync;
use PasswordGenerator\Generator as PassGen; use PasswordGenerator\Generator as PassGen;
use PicoFeed\Reader\Reader;
use PicoFeed\PicoFeedException;
class Database { class Database {
@ -262,47 +260,64 @@ class Database {
$this->db->begin(); $this->db->begin();
// If the feed doesn't already exist in the database then add it to the database after determining its validity with PicoFeed. // If the feed doesn't already exist in the database then add it to the database
// after determining its validity with PicoFeed.
$qFeed = $this->db->prepare("SELECT id from newssync_feeds where url is ? and username is ? and password is ?", "str", "str", "str"); $qFeed = $this->db->prepare("SELECT id from newssync_feeds where url is ? and username is ? and password is ?", "str", "str", "str");
$feed = $qFeed->run($url, $fetchUser, $fetchPassword)->getValue(); $feed = $qFeed->run($url, $fetchUser, $fetchPassword)->getValue();
if ($feed === null) { if ($feed === null) {
try { $feed = new Feed($url);
$reader = new Reader; $feed->parse();
$resource = $reader->download($url);
$parser = $reader->getParser(
$resource->getUrl(),
$resource->getContent(),
$resource->getEncoding()
);
$feed = $parser->execute();
} catch (PicoFeedException $e) {
// If there's any error while trying to download or parse the feed then return an exception.
throw new Feed\Exception($url, $e);
}
// Add the feed to the database and return its Id which will be used when adding
// its articles to the database.
$feedID = $this->db->prepare( $feedID = $this->db->prepare(
"INSERT INTO newssync_feeds(url,title,favicon,source,updated,modified,etag,username,password) values(?,?,?,?,?,?,?,?,?)", 'INSERT INTO newssync_feeds(url,title,favicon,source,updated,modified,etag,username,password)
"str", "str", "str", "str", "datetime", "datetime", "str", "str", "str" values(?,?,?,?,?,?,?,?,?)',
)->run( 'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str')->run(
$url, $url,
$feed->title, $feed->data->title,
// Grab the favicon for the Goodfeed; returns an empty string if it cannot find one. // Grab the favicon for the feed; returns an empty string if it cannot find one.
(new \PicoFeed\Reader\Favicon)->find($url), $feed->favicon,
$feed->siteUrl, $feed->data->siteUrl,
$feed->date, $feed->data->date,
$resource->getLastModified(), $feed->resource->getLastModified(),
$resource->getEtag(), $feed->resource->getEtag(),
$fetchUser, $fetchUser,
$fetchPassword $fetchPassword
)->lastId(); )->lastId();
// TODO: Populate newssync_articles with contents of what was obtained from PicoFeed. // Add each of the articles to the database.
foreach ($feed->data->items as $i) {
$articleID = $this->db->prepare('INSERT INTO newssync_articles(feed,url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash)
values(?,?,?,?,?,?,?,?,?,?,?)',
'int', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str')->run(
$feedID,
$i->url,
$i->title,
$i->author,
$i->publishedDate,
$i->updatedDate,
$i->id,
$i->content,
// Since feeds cannot be trusted to have valid ids additional hashes are used for identifiers.
// These hashes are made regardless to check against for changes.
hash('sha256', $i->url.$i->title),
hash('sha256', $i->url.$i->content.$i->enclosureUrl.$i->enclosureType),
hash('sha256', $i->title.$i->content.$i->enclosureUrl.$i->enclosureType)
)->lastId();
// If the article has categories add them into the categories database.
$categories = $i->getTag('category');
if (count($categories) > 0) {
foreach ($categories as $c) {
$this->db->prepare('INSERT INTO newssync_tags(article,name) values(?,?)', 'int', 'str')->run($articleID, $c);
}
}
}
} }
// Add the feed to the user's subscriptions. // Add the feed to the user's subscriptions.
$sub = $this->db->prepare("INSERT INTO newssync_subscriptions(owner,feed) values(?,?)", "str", "int")->run($user, $feedID)->lastId(); $sub = $this->db->prepare('INSERT INTO newssync_subscriptions(owner,feed) values(?,?)', 'str', 'int')->run($user, $feedID)->lastId();
$this->db->commit(); $this->db->commit();
return $sub; return $sub;
} }

75
lib/Feed.php Normal file
View file

@ -0,0 +1,75 @@
<?php
namespace JKingWeb\NewsSync;
use PicoFeed\Reader\Reader;
use PicoFeed\PicoFeedException;
use PicoFeed\Reader\Favicon;
/**
 * Thin wrapper around PicoFeed which downloads a feed, parses it, and
 * normalizes the ids of its items, converting any PicoFeed failure into a
 * Feed\Exception so that callers only deal with one exception type.
 */
class Feed {
    /** @var string URL the feed was requested from; kept so parse() can report it on failure */
    public $url;
    /** @var Reader PicoFeed reader used both to download and to select a parser */
    public $reader;
    /** @var mixed Value returned by Reader::download(); exposes getUrl(), getContent(), getEncoding() */
    public $resource;
    /** @var mixed Parser returned by Reader::getParser(); only set once parse() has been called */
    public $parser;
    /** @var mixed Parsed feed with normalized item ids; only set once parse() has succeeded */
    public $data;
    /** @var string Favicon location for the feed; an empty string if none could be found */
    public $favicon;

    /**
     * Downloads the feed and looks up its favicon.
     *
     * @param string $url Address of the feed
     * @param string $lastModified Passed through to Reader::download() unchanged
     * @param string $etag Passed through to Reader::download() unchanged
     * @throws Feed\Exception if PicoFeed fails while downloading or locating the favicon
     */
    public function __construct(string $url, string $lastModified = '', string $etag = '') {
        // Remember the URL: parse() needs it to build a meaningful exception.
        $this->url = $url;

        try {
            $this->reader = new Reader;
            // The reader lives on the object; a bare $reader would be an undefined variable.
            $this->resource = $this->reader->download($url, $lastModified, $etag);
            // Grab the favicon for the feed; returns an empty string if it cannot find one.
            // The parentheses are required to call a method on a freshly created object.
            $this->favicon = (new Favicon)->find($url);
        } catch (PicoFeedException $e) {
            throw new Feed\Exception($url, $e);
        }
    }

    /**
     * Parses the downloaded resource and stores the result in $this->data.
     *
     * @return bool Always true; failures are reported by exception
     * @throws Feed\Exception if PicoFeed fails to select a parser or to parse the content
     */
    public function parse(): bool {
        try {
            $this->parser = $this->reader->getParser(
                $this->resource->getUrl(),
                $this->resource->getContent(),
                $this->resource->getEncoding()
            );
            $feed = $this->parser->execute();
        } catch (PicoFeedException $e) {
            throw new Feed\Exception($this->url, $e);
        }

        // PicoFeed does not provide valid ids when there is no id element: it falls
        // back to hashing the url, title, and content together, so any edit to an
        // article changes its id. Many feeds are mixtures of Atom and RSS, but some
        // are pure RSS with guid elements while others use the Dublin Core spec for
        // identification. These feeds shouldn't be duplicated when updated; that
        // should only be reserved for severely broken feeds.
        //
        // Items are objects, so iterating by value still modifies the originals and
        // avoids leaving a dangling reference behind after the loop.
        foreach ($feed->items as $item) {
            // If there is an id element then continue. The id is used already.
            if ((string)$item->xml->id !== '') {
                continue;
            }

            // If there is a guid element use it as the id.
            $id = (string)$item->xml->guid;
            if ($id !== '') {
                $item->id = hash('sha256', $id);
                continue;
            }

            // If there is a Dublin Core identifier use it.
            $id = (string)$item->xml->children('http://purl.org/dc/elements/1.1/')->identifier;
            if ($id !== '') {
                $item->id = hash('sha256', $id);
                continue;
            }

            // If there aren't any of those there is no id. Hashes are created when
            // adding the feed to the database which will serve to identify the post
            // in this situation.
            $item->id = '';
        }

        $this->data = $feed;
        return true;
    }
}

View file

@ -43,18 +43,18 @@ create table newssync_subscriptions(
added datetime not null default CURRENT_TIMESTAMP, -- time at which feed was added added datetime not null default CURRENT_TIMESTAMP, -- time at which feed was added
modified datetime not null default CURRENT_TIMESTAMP, -- date at which subscription properties were last modified modified datetime not null default CURRENT_TIMESTAMP, -- date at which subscription properties were last modified
title TEXT, -- user-supplied title title TEXT, -- user-supplied title
order_type int not null default 0, -- ownCloud sort order order_type int not null default 0, -- NextCloud sort order
pinned boolean not null default 0, -- whether feed is pinned (always sorts at top) pinned boolean not null default 0, -- whether feed is pinned (always sorts at top)
folder integer references newssync_folders(id) on delete set null, -- TT-RSS category (nestable); the first-level category (which acts as ownCloud folder) is joined in when needed folder integer references newssync_folders(id) on delete set null, -- TT-RSS category (nestable); the first-level category (which acts as NextCloud folder) is joined in when needed
unique(owner,feed) -- a given feed should only appear once for a given owner unique(owner,feed) -- a given feed should only appear once for a given owner
); );
-- TT-RSS categories and ownCloud folders -- TT-RSS categories and NextCloud folders
create table newssync_folders( create table newssync_folders(
id integer primary key not null, -- sequence number id integer primary key not null, -- sequence number
owner TEXT not null references newssync_users(id) on delete cascade on update cascade, -- owner of folder owner TEXT not null references newssync_users(id) on delete cascade on update cascade, -- owner of folder
parent integer default null, -- parent folder id parent integer default null, -- parent folder id
root integer default null, -- first-level folder (ownCloud folder) root integer default null, -- first-level folder (NextCloud folder)
name TEXT not null, -- folder name name TEXT not null, -- folder name
modified datetime not null default CURRENT_TIMESTAMP, -- modified datetime not null default CURRENT_TIMESTAMP, --
unique(owner,name,parent) -- cannot have multiple folders with the same name under the same parent for the same owner unique(owner,name,parent) -- cannot have multiple folders with the same name under the same parent for the same owner
@ -72,10 +72,9 @@ create table newssync_articles(
guid TEXT, -- GUID guid TEXT, -- GUID
content TEXT, -- content, as (X)HTML content TEXT, -- content, as (X)HTML
modified datetime not null default CURRENT_TIMESTAMP, -- date when article properties were last modified modified datetime not null default CURRENT_TIMESTAMP, -- date when article properties were last modified
hash varchar(64) not null, -- ownCloud hash url_title_hash varchar(64), -- hash of URL + title; used when checking for updates and for identification if there is no guid.
fingerprint varchar(64) not null, -- ownCloud fingerprint url_content_hash varchar(64), -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
enclosures_hash varchar(64), -- hash of enclosures, if any; since enclosures are not uniquely identified, we need to know when they change title_content_hash varchar(64), -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
tags_hash varchar(64) -- hash of RSS/Atom categories included in article; since these categories are not uniquely identified, we need to know when they change
); );
-- enclosures associated with articles -- enclosures associated with articles