mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2025-01-08 17:02:41 +00:00
Added picoFeed wrapper
• Implemented a simple wrapper for picoFeed which fixes the id problems and keeps error handling within its own class • Updated Database.php to use the new class • Replaced mentions of ownCloud with NextCloud in the schema • Added hashes to schema for identification and change detection; removed NextCloud hash and fingerprint; removed enclosure and category hashes
This commit is contained in:
parent
b821d728e4
commit
3f61921b97
3 changed files with 131 additions and 42 deletions
|
@ -2,8 +2,6 @@
|
||||||
declare(strict_types=1);
|
declare(strict_types=1);
|
||||||
namespace JKingWeb\NewsSync;
|
namespace JKingWeb\NewsSync;
|
||||||
use PasswordGenerator\Generator as PassGen;
|
use PasswordGenerator\Generator as PassGen;
|
||||||
use PicoFeed\Reader\Reader;
|
|
||||||
use PicoFeed\PicoFeedException;
|
|
||||||
|
|
||||||
class Database {
|
class Database {
|
||||||
|
|
||||||
|
@ -262,47 +260,64 @@ class Database {
|
||||||
|
|
||||||
$this->db->begin();
|
$this->db->begin();
|
||||||
|
|
||||||
// If the feed doesn't already exist in the database then add it to the database after determining its validity with PicoFeed.
|
// If the feed doesn't already exist in the database then add it to the database
|
||||||
|
// after determining its validity with PicoFeed.
|
||||||
$qFeed = $this->db->prepare("SELECT id from newssync_feeds where url is ? and username is ? and password is ?", "str", "str", "str");
|
$qFeed = $this->db->prepare("SELECT id from newssync_feeds where url is ? and username is ? and password is ?", "str", "str", "str");
|
||||||
$feed = $qFeed->run($url, $fetchUser, $fetchPassword)->getValue();
|
$feed = $qFeed->run($url, $fetchUser, $fetchPassword)->getValue();
|
||||||
if ($feed === null) {
|
if ($feed === null) {
|
||||||
try {
|
$feed = new Feed($url);
|
||||||
$reader = new Reader;
|
$feed->parse();
|
||||||
$resource = $reader->download($url);
|
|
||||||
|
|
||||||
$parser = $reader->getParser(
|
|
||||||
$resource->getUrl(),
|
|
||||||
$resource->getContent(),
|
|
||||||
$resource->getEncoding()
|
|
||||||
);
|
|
||||||
|
|
||||||
$feed = $parser->execute();
|
|
||||||
} catch (PicoFeedException $e) {
|
|
||||||
// If there's any error while trying to download or parse the feed then return an exception.
|
|
||||||
throw new Feed\Exception($url, $e);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
// Add the feed to the database and return its Id which will be used when adding
|
||||||
|
// its articles to the database.
|
||||||
$feedID = $this->db->prepare(
|
$feedID = $this->db->prepare(
|
||||||
"INSERT INTO newssync_feeds(url,title,favicon,source,updated,modified,etag,username,password) values(?,?,?,?,?,?,?,?,?)",
|
'INSERT INTO newssync_feeds(url,title,favicon,source,updated,modified,etag,username,password)
|
||||||
"str", "str", "str", "str", "datetime", "datetime", "str", "str", "str"
|
values(?,?,?,?,?,?,?,?,?)',
|
||||||
)->run(
|
'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str')->run(
|
||||||
$url,
|
$url,
|
||||||
$feed->title,
|
$feed->data->title,
|
||||||
// Grab the favicon for the Goodfeed; returns an empty string if it cannot find one.
|
// Grab the favicon for the feed; returns an empty string if it cannot find one.
|
||||||
(new \PicoFeed\Reader\Favicon)->find($url),
|
$feed->favicon,
|
||||||
$feed->siteUrl,
|
$feed->data->siteUrl,
|
||||||
$feed->date,
|
$feed->data->date,
|
||||||
$resource->getLastModified(),
|
$feed->resource->getLastModified(),
|
||||||
$resource->getEtag(),
|
$feed->resource->getEtag(),
|
||||||
$fetchUser,
|
$fetchUser,
|
||||||
$fetchPassword
|
$fetchPassword
|
||||||
)->lastId();
|
)->lastId();
|
||||||
|
|
||||||
// TODO: Populate newssync_articles with contents of what was obtained from PicoFeed.
|
// Add each of the articles to the database.
|
||||||
|
foreach ($feed->data->items as $i) {
|
||||||
|
$articleID = $this->db->prepare('INSERT INTO newssync_articles(feed,url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash)
|
||||||
|
values(?,?,?,?,?,?,?,?,?,?,?)',
|
||||||
|
'int', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str')->run(
|
||||||
|
$feedID,
|
||||||
|
$i->url,
|
||||||
|
$i->title,
|
||||||
|
$i->author,
|
||||||
|
$i->publishedDate,
|
||||||
|
$i->updatedDate,
|
||||||
|
$i->id,
|
||||||
|
$i->content,
|
||||||
|
// Since feeds cannot be trusted to have valid ids additional hashes are used for identifiers.
|
||||||
|
// These hashes are made regardless to check against for changes.
|
||||||
|
hash('sha256', $i->url.$i->title),
|
||||||
|
hash('sha256', $i->url.$i->content.$i->enclosureUrl.$i->enclosureType),
|
||||||
|
hash('sha256', $i->title.$i->content.$i->enclosureUrl.$i->enclosureType)
|
||||||
|
)->lastId();
|
||||||
|
|
||||||
|
// If the article has categories add them into the categories database.
|
||||||
|
$categories = $i->getTag('category');
|
||||||
|
if (count($categories) > 0) {
|
||||||
|
foreach ($categories as $c) {
|
||||||
|
$this->db->prepare('INSERT INTO newssync_tags(article,name) values(?,?)', 'int', 'str')->run($articleID, $c);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add the feed to the user's subscriptions.
|
// Add the feed to the user's subscriptions.
|
||||||
$sub = $this->db->prepare("INSERT INTO newssync_subscriptions(owner,feed) values(?,?)", "str", "int")->run($user, $feedID)->lastId();
|
$sub = $this->db->prepare('INSERT INTO newssync_subscriptions(owner,feed) values(?,?)', 'str', 'int')->run($user, $feedID)->lastId();
|
||||||
$this->db->commit();
|
$this->db->commit();
|
||||||
return $sub;
|
return $sub;
|
||||||
}
|
}
|
||||||
|
|
75
lib/Feed.php
Normal file
75
lib/Feed.php
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
<?php
|
||||||
|
namespace JKingWeb\NewsSync;
|
||||||
|
use PicoFeed\Reader\Reader;
|
||||||
|
use PicoFeed\PicoFeedException;
|
||||||
|
use PicoFeed\Reader\Favicon;
|
||||||
|
|
||||||
|
/**
 * Thin wrapper around PicoFeed.
 *
 * Downloads a feed on construction, parses it on demand, and normalises the
 * identifiers of the parsed items. Every PicoFeedException raised along the
 * way is converted into a Feed\Exception so that callers only have to deal
 * with this package's own exception type.
 */
class Feed {
    /** @var Reader PicoFeed reader used to download the feed and to build its parser. */
    public $reader;
    /** @var mixed Value returned by Reader::download(); parse() reads its URL, content and encoding. */
    public $resource;
    /** @var mixed Parser returned by Reader::getParser(); unset until parse() is called. */
    public $parser;
    /** @var mixed Parsed feed returned by the parser's execute(); unset until parse() succeeds. */
    public $data;
    /** @var mixed Result of Favicon::find() for the feed URL. */
    public $favicon;
    /** @var string URL the feed was requested with; kept so that parse() can report it on failure. */
    private $url;

    /**
     * Downloads the feed and looks up its favicon.
     *
     * @param string $url          URL of the feed to download
     * @param string $lastModified forwarded as-is to Reader::download()
     * @param string $etag         forwarded as-is to Reader::download()
     *
     * @throws Feed\Exception if PicoFeed fails while downloading the feed or finding the favicon
     */
    public function __construct(string $url, string $lastModified = '', string $etag = '') {
        $this->url = $url;

        try {
            $this->reader = new Reader;
            $this->resource = $this->reader->download($url, $lastModified, $etag);
            // Grab the favicon for the feed; returns an empty string if it cannot find one.
            // The parentheses are required: "new Favicon->find()" does not parse.
            $this->favicon = (new Favicon)->find($url);
        } catch (PicoFeedException $e) {
            throw new Feed\Exception($url, $e);
        }
    }

    /**
     * Parses the downloaded resource and stores the resulting feed in $this->data.
     *
     * After parsing, each item's id is normalised as follows:
     *  - an <id> element is present: the id set by PicoFeed is left untouched;
     *  - otherwise a <guid> element is present: the id is the SHA-256 of the guid;
     *  - otherwise a Dublin Core identifier is present: the id is the SHA-256 of it;
     *  - otherwise the id is set to the empty string.
     *
     * @return bool always true; failures are reported by exception
     *
     * @throws Feed\Exception if PicoFeed fails while building the parser or parsing the feed
     */
    public function parse(): bool {
        try {
            $this->parser = $this->reader->getParser(
                $this->resource->getUrl(),
                $this->resource->getContent(),
                $this->resource->getEncoding()
            );

            $feed = $this->parser->execute();
        } catch (PicoFeedException $e) {
            throw new Feed\Exception($this->url, $e);
        }

        // PicoFeed does not provide valid ids when there is no id element. Its solution
        // of hashing the url, title, and content together for the id if there is no id
        // element is unsuitable here. Many feeds are mixtures of Atom and RSS, but some
        // are pure RSS with guid elements while others use the Dublin Core spec for
        // identification. These feeds shouldn't be duplicated when updated. That should
        // only be reserved for severely broken feeds.
        //
        // Items are objects, so iterating by value still modifies the items held by
        // the feed and avoids leaving a dangling reference behind after the loop.
        foreach ($feed->items as $f) {
            // If there is an id element then continue. The id is used already.
            $id = (string)$f->xml->id;
            if ($id !== '') {
                continue;
            }

            // If there is a guid element use it as the id.
            $id = (string)$f->xml->guid;
            if ($id !== '') {
                $f->id = hash('sha256', $id);
                continue;
            }

            // If there is a Dublin Core identifier use it.
            $id = (string)$f->xml->children('http://purl.org/dc/elements/1.1/')->identifier;
            if ($id !== '') {
                $f->id = hash('sha256', $id);
                continue;
            }

            // If there aren't any of those there is no id. Hashes are created when adding
            // the feed to the database which will serve to identify the post in this
            // situation.
            $f->id = '';
        }

        $this->data = $feed;
        return true;
    }
}
|
|
@ -43,18 +43,18 @@ create table newssync_subscriptions(
|
||||||
added datetime not null default CURRENT_TIMESTAMP, -- time at which feed was added
|
added datetime not null default CURRENT_TIMESTAMP, -- time at which feed was added
|
||||||
modified datetime not null default CURRENT_TIMESTAMP, -- date at which subscription properties were last modified
|
modified datetime not null default CURRENT_TIMESTAMP, -- date at which subscription properties were last modified
|
||||||
title TEXT, -- user-supplied title
|
title TEXT, -- user-supplied title
|
||||||
order_type int not null default 0, -- ownCloud sort order
|
order_type int not null default 0, -- NextCloud sort order
|
||||||
pinned boolean not null default 0, -- whether feed is pinned (always sorts at top)
|
pinned boolean not null default 0, -- whether feed is pinned (always sorts at top)
|
||||||
folder integer references newssync_folders(id) on delete set null, -- TT-RSS category (nestable); the first-level category (which acts as ownCloud folder) is joined in when needed
|
folder integer references newssync_folders(id) on delete set null, -- TT-RSS category (nestable); the first-level category (which acts as NextCloud folder) is joined in when needed
|
||||||
unique(owner,feed) -- a given feed should only appear once for a given owner
|
unique(owner,feed) -- a given feed should only appear once for a given owner
|
||||||
);
|
);
|
||||||
|
|
||||||
-- TT-RSS categories and ownCloud folders
|
-- TT-RSS categories and NextCloud folders
|
||||||
create table newssync_folders(
|
create table newssync_folders(
|
||||||
id integer primary key not null, -- sequence number
|
id integer primary key not null, -- sequence number
|
||||||
owner TEXT not null references newssync_users(id) on delete cascade on update cascade, -- owner of folder
|
owner TEXT not null references newssync_users(id) on delete cascade on update cascade, -- owner of folder
|
||||||
parent integer default null, -- parent folder id
|
parent integer default null, -- parent folder id
|
||||||
root integer default null, -- first-level folder (ownCloud folder)
|
root integer default null, -- first-level folder (NextCloud folder)
|
||||||
name TEXT not null, -- folder name
|
name TEXT not null, -- folder name
|
||||||
modified datetime not null default CURRENT_TIMESTAMP, --
|
modified datetime not null default CURRENT_TIMESTAMP, --
|
||||||
unique(owner,name,parent) -- cannot have multiple folders with the same name under the same parent for the same owner
|
unique(owner,name,parent) -- cannot have multiple folders with the same name under the same parent for the same owner
|
||||||
|
@ -72,10 +72,9 @@ create table newssync_articles(
|
||||||
guid TEXT, -- GUID
|
guid TEXT, -- GUID
|
||||||
content TEXT, -- content, as (X)HTML
|
content TEXT, -- content, as (X)HTML
|
||||||
modified datetime not null default CURRENT_TIMESTAMP, -- date when article properties were last modified
|
modified datetime not null default CURRENT_TIMESTAMP, -- date when article properties were last modified
|
||||||
hash varchar(64) not null, -- ownCloud hash
|
url_title_hash varchar(64), -- hash of URL + title; used when checking for updates and for identification if there is no guid.
|
||||||
fingerprint varchar(64) not null, -- ownCloud fingerprint
|
url_content_hash varchar(64), -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
|
||||||
enclosures_hash varchar(64), -- hash of enclosures, if any; since enclosures are not uniquely identified, we need to know when they change
|
title_content_hash varchar(64), -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
|
||||||
tags_hash varchar(64) -- hash of RSS/Atom categories included in article; since these categories are not uniquely identified, we need to know when they change
|
|
||||||
);
|
);
|
||||||
|
|
||||||
-- enclosures associated with articles
|
-- enclosures associated with articles
|
||||||
|
|
Loading…
Reference in a new issue