mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2024-12-22 13:12:41 +00:00
Added picoFeed wrapper
• Implemented a simple wrapper for picoFeed which fixes the id problems and keeps error handling within its own class • Updated Database.php to use the new class • Replaced mentions of ownCloud with NextCloud in the schema • Added hashes to schema for identification and change detection; removed NextCloud hash and fingerprint; removed enclosure and category hashes
This commit is contained in:
parent
b821d728e4
commit
3f61921b97
3 changed files with 131 additions and 42 deletions
|
@ -2,8 +2,6 @@
|
|||
declare(strict_types=1);
|
||||
namespace JKingWeb\NewsSync;
|
||||
use PasswordGenerator\Generator as PassGen;
|
||||
use PicoFeed\Reader\Reader;
|
||||
use PicoFeed\PicoFeedException;
|
||||
|
||||
class Database {
|
||||
|
||||
|
@ -262,47 +260,64 @@ class Database {
|
|||
|
||||
$this->db->begin();
|
||||
|
||||
// If the feed doesn't already exist in the database then add it to the database after determining its validity with PicoFeed.
|
||||
// If the feed doesn't already exist in the database then add it to the database
|
||||
// after determining its validity with PicoFeed.
|
||||
$qFeed = $this->db->prepare("SELECT id from newssync_feeds where url is ? and username is ? and password is ?", "str", "str", "str");
|
||||
$feed = $qFeed->run($url, $fetchUser, $fetchPassword)->getValue();
|
||||
if ($feed === null) {
|
||||
try {
|
||||
$reader = new Reader;
|
||||
$resource = $reader->download($url);
|
||||
|
||||
$parser = $reader->getParser(
|
||||
$resource->getUrl(),
|
||||
$resource->getContent(),
|
||||
$resource->getEncoding()
|
||||
);
|
||||
|
||||
$feed = $parser->execute();
|
||||
} catch (PicoFeedException $e) {
|
||||
// If there's any error while trying to download or parse the feed then return an exception.
|
||||
throw new Feed\Exception($url, $e);
|
||||
}
|
||||
$feed = new Feed($url);
|
||||
$feed->parse();
|
||||
|
||||
// Add the feed to the database and return its Id which will be used when adding
|
||||
// its articles to the database.
|
||||
$feedID = $this->db->prepare(
|
||||
"INSERT INTO newssync_feeds(url,title,favicon,source,updated,modified,etag,username,password) values(?,?,?,?,?,?,?,?,?)",
|
||||
"str", "str", "str", "str", "datetime", "datetime", "str", "str", "str"
|
||||
)->run(
|
||||
$url,
|
||||
$feed->title,
|
||||
// Grab the favicon for the feed; returns an empty string if it cannot find one.
|
||||
(new \PicoFeed\Reader\Favicon)->find($url),
|
||||
$feed->siteUrl,
|
||||
$feed->date,
|
||||
$resource->getLastModified(),
|
||||
$resource->getEtag(),
|
||||
$fetchUser,
|
||||
$fetchPassword
|
||||
)->lastId();
|
||||
'INSERT INTO newssync_feeds(url,title,favicon,source,updated,modified,etag,username,password)
|
||||
values(?,?,?,?,?,?,?,?,?)',
|
||||
'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str')->run(
|
||||
$url,
|
||||
$feed->data->title,
|
||||
// Grab the favicon for the feed; returns an empty string if it cannot find one.
|
||||
$feed->favicon,
|
||||
$feed->data->siteUrl,
|
||||
$feed->data->date,
|
||||
$feed->resource->getLastModified(),
|
||||
$feed->resource->getEtag(),
|
||||
$fetchUser,
|
||||
$fetchPassword
|
||||
)->lastId();
|
||||
|
||||
// TODO: Populate newssync_articles with contents of what was obtained from PicoFeed.
|
||||
// Add each of the articles to the database.
|
||||
foreach ($feed->data->items as $i) {
|
||||
$articleID = $this->db->prepare('INSERT INTO newssync_articles(feed,url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash)
|
||||
values(?,?,?,?,?,?,?,?,?,?,?)',
|
||||
'int', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str')->run(
|
||||
$feedID,
|
||||
$i->url,
|
||||
$i->title,
|
||||
$i->author,
|
||||
$i->publishedDate,
|
||||
$i->updatedDate,
|
||||
$i->id,
|
||||
$i->content,
|
||||
// Since feeds cannot be trusted to have valid ids additional hashes are used for identifiers.
|
||||
// These hashes are made regardless to check against for changes.
|
||||
hash('sha256', $i->url.$i->title),
|
||||
hash('sha256', $i->url.$i->content.$i->enclosureUrl.$i->enclosureType),
|
||||
hash('sha256', $i->title.$i->content.$i->enclosureUrl.$i->enclosureType)
|
||||
)->lastId();
|
||||
|
||||
// If the article has categories add them into the categories database.
|
||||
$categories = $i->getTag('category');
|
||||
if (count($categories) > 0) {
|
||||
foreach ($categories as $c) {
|
||||
$this->db->prepare('INSERT INTO newssync_tags(article,name) values(?,?)', 'int', 'str')->run($articleID, $c);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Add the feed to the user's subscriptions.
|
||||
$sub = $this->db->prepare("INSERT INTO newssync_subscriptions(owner,feed) values(?,?)", "str", "int")->run($user, $feedID)->lastId();
|
||||
$sub = $this->db->prepare('INSERT INTO newssync_subscriptions(owner,feed) values(?,?)', 'str', 'int')->run($user, $feedID)->lastId();
|
||||
$this->db->commit();
|
||||
return $sub;
|
||||
}
|
||||
|
|
75
lib/Feed.php
Normal file
75
lib/Feed.php
Normal file
|
@ -0,0 +1,75 @@
|
|||
<?php
|
||||
namespace JKingWeb\NewsSync;
|
||||
use PicoFeed\Reader\Reader;
|
||||
use PicoFeed\PicoFeedException;
|
||||
use PicoFeed\Reader\Favicon;
|
||||
|
||||
/**
 * Thin wrapper around PicoFeed.
 *
 * Downloads a feed on construction, parses it on demand, and normalises the
 * item ids that PicoFeed produces. Every PicoFeedException raised along the
 * way is rethrown as a Feed\Exception so callers only deal with one error type.
 */
class Feed {
    /** PicoFeed reader used for both the download and for building the parser. */
    public $reader;
    /** Result of the download; exposes getUrl(), getContent() and getEncoding(). */
    public $resource;
    /** Parser obtained from the reader; only set once parse() has been called. */
    public $parser;
    /** Parsed feed with normalised item ids; only set once parse() has succeeded. */
    public $data;
    /** Value returned by PicoFeed's favicon lookup for the feed URL. */
    public $favicon;
    /** URL the feed was requested with; kept so parse() can report errors against it. */
    protected $url;

    /**
     * Downloads the feed and looks up its favicon.
     *
     * @param string $url          Address of the feed to fetch
     * @param string $lastModified Passed through to the reader's download() call
     * @param string $etag         Passed through to the reader's download() call
     *
     * @throws Feed\Exception when PicoFeed reports an error while downloading
     */
    public function __construct(string $url, string $lastModified = '', string $etag = '') {
        $this->url = $url;

        try {
            $this->reader = new Reader;
            $this->resource = $this->reader->download($url, $lastModified, $etag);
            // Grab the favicon for the feed; returns an empty string if it cannot find one.
            // The parentheses are required: `new Favicon->find()` does not parse.
            $this->favicon = (new Favicon)->find($url);
        } catch (PicoFeedException $e) {
            throw new Feed\Exception($url, $e);
        }
    }

    /**
     * Parses the downloaded resource and stores the result in $this->data.
     *
     * @return bool Always true; failures are signalled by exception
     *
     * @throws Feed\Exception when PicoFeed reports an error while parsing
     */
    public function parse(): bool {
        try {
            $this->parser = $this->reader->getParser(
                $this->resource->getUrl(),
                $this->resource->getContent(),
                $this->resource->getEncoding()
            );

            $feed = $this->parser->execute();
        } catch (PicoFeedException $e) {
            throw new Feed\Exception($this->url, $e);
        }

        // PicoFeed does not provide valid ids when there is no id element. Its solution
        // of hashing the url, title, and content together for the id if there is no id
        // element is stupid. Many feeds are frankenstein mixtures of Atom and RSS, but
        // some are pure RSS with guid elements while others use the Dublin Core spec for
        // identification. These feeds shouldn't be duplicated when updated. That should
        // only be reserved for severely broken feeds.

        // Items are objects, so assigning to $f->id updates the item in place; iterating
        // by reference is unnecessary and would leave a dangling reference after the loop.
        foreach ($feed->items as $f) {
            // If there is an id element then continue. The id is used already.
            $id = (string)$f->xml->id;
            if ($id !== '') {
                continue;
            }

            // If there is a guid element use it as the id.
            $id = (string)$f->xml->guid;
            if ($id !== '') {
                $f->id = hash('sha256', $id);
                continue;
            }

            // If there is a Dublin Core identifier use it.
            $id = (string)$f->xml->children('http://purl.org/dc/elements/1.1/')->identifier;
            if ($id !== '') {
                $f->id = hash('sha256', $id);
                continue;
            }

            // If there aren't any of those there is no id. Hashes are created when adding
            // the feed to the database which will serve to identify the post in this
            // situation.
            $f->id = '';
        }

        $this->data = $feed;
        return true;
    }
}
|
|
@ -43,18 +43,18 @@ create table newssync_subscriptions(
|
|||
added datetime not null default CURRENT_TIMESTAMP, -- time at which feed was added
|
||||
modified datetime not null default CURRENT_TIMESTAMP, -- date at which subscription properties were last modified
|
||||
title TEXT, -- user-supplied title
|
||||
order_type int not null default 0, -- ownCloud sort order
|
||||
order_type int not null default 0, -- NextCloud sort order
|
||||
pinned boolean not null default 0, -- whether feed is pinned (always sorts at top)
|
||||
folder integer references newssync_folders(id) on delete set null, -- TT-RSS category (nestable); the first-level category (which acts as ownCloud folder) is joined in when needed
|
||||
folder integer references newssync_folders(id) on delete set null, -- TT-RSS category (nestable); the first-level category (which acts as NextCloud folder) is joined in when needed
|
||||
unique(owner,feed) -- a given feed should only appear once for a given owner
|
||||
);
|
||||
|
||||
-- TT-RSS categories and ownCloud folders
|
||||
-- TT-RSS categories and NextCloud folders
|
||||
create table newssync_folders(
|
||||
id integer primary key not null, -- sequence number
|
||||
owner TEXT not null references newssync_users(id) on delete cascade on update cascade, -- owner of folder
|
||||
parent integer default null, -- parent folder id
|
||||
root integer default null, -- first-level folder (ownCloud folder)
|
||||
root integer default null, -- first-level folder (NextCloud folder)
|
||||
name TEXT not null, -- folder name
|
||||
modified datetime not null default CURRENT_TIMESTAMP, --
|
||||
unique(owner,name,parent) -- cannot have multiple folders with the same name under the same parent for the same owner
|
||||
|
@ -72,10 +72,9 @@ create table newssync_articles(
|
|||
guid TEXT, -- GUID
|
||||
content TEXT, -- content, as (X)HTML
|
||||
modified datetime not null default CURRENT_TIMESTAMP, -- date when article properties were last modified
|
||||
hash varchar(64) not null, -- ownCloud hash
|
||||
fingerprint varchar(64) not null, -- ownCloud fingerprint
|
||||
enclosures_hash varchar(64), -- hash of enclosures, if any; since enclosures are not uniquely identified, we need to know when they change
|
||||
tags_hash varchar(64) -- hash of RSS/Atom categories included in article; since these categories are not uniquely identified, we need to know when they change
|
||||
url_title_hash varchar(64), -- hash of URL + title; used when checking for updates and for identification if there is no guid.
|
||||
url_content_hash varchar(64), -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
|
||||
title_content_hash varchar(64), -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
|
||||
);
|
||||
|
||||
-- enclosures associated with articles
|
||||
|
|
Loading…
Reference in a new issue