1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2025-01-08 17:02:41 +00:00

Start on refactoring of feed processing

This commit is contained in:
J. King 2017-04-13 22:17:53 -04:00
parent 557d17ef5d
commit 4e57e56ca7
3 changed files with 149 additions and 161 deletions

View file

@ -387,52 +387,16 @@ class Database {
} }
public function subscriptionAdd(string $user, string $url, string $fetchUser = "", string $fetchPassword = ""): int { public function subscriptionAdd(string $user, string $url, string $fetchUser = "", string $fetchPassword = ""): int {
// If the user isn't authorized to perform this action then throw an exception. if(!Data::$user->authorize($user, __FUNCTION__)) throw new User\ExceptionAuthz("notAuthorized", ["action" => __FUNCTION__, "user" => $user]);
if(!Data::$user->authorize($user, __FUNCTION__)) { if(!$this->userExists($user)) throw new User\Exception("doesNotExist", ["user" => $user, "action" => __FUNCTION__]);
throw new User\ExceptionAuthz("notAuthorized", ["action" => __FUNCTION__, "user" => $user]);
}
// If the user doesn't exist throw an exception.
if(!$this->userExists($user)) {
throw new User\Exception("doesNotExist", ["user" => $user, "action" => __FUNCTION__]);
}
$this->db->begin();
try {
// If the feed doesn't already exist in the database then add it to the database // If the feed doesn't already exist in the database then add it to the database
// after determining its validity with PicoFeed. // after determining its validity with PicoFeed.
$feedID = $this->db->prepare("SELECT id from arsse_feeds where url is ? and username is ? and password is ?", "str", "str", "str")->run($url, $fetchUser, $fetchPassword)->getValue(); $feedID = $this->db->prepare("SELECT id from arsse_feeds where url is ? and username is ? and password is ?", "str", "str", "str")->run($url, $fetchUser, $fetchPassword)->getValue();
if($feedID === null) { if($feedID === null) {
$feed = new Feed($url); $feedID = $this->feedAdd($url, $fetchUser, $fetchPassword);
$feed->parse();
// Add the feed to the database and return its Id which will be used when adding
// its articles to the database.
$feedID = $this->db->prepare(
'INSERT INTO arsse_feeds(url,title,favicon,source,updated,modified,etag,username,password) values(?,?,?,?,?,?,?,?,?)',
'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str'
)->run(
$url,
$feed->data->title,
// Grab the favicon for the feed; returns an empty string if it cannot find one.
$feed->favicon,
$feed->data->siteUrl,
$feed->data->date,
\DateTime::createFromFormat("!D, d M Y H:i:s e", $feed->resource->getLastModified()),
$feed->resource->getEtag(),
$fetchUser,
$fetchPassword
)->lastId();
// Add each of the articles to the database.
foreach($feed->data->items as $i) {
$this->articleAdd($feedID, $i);
}
} }
// Add the feed to the user's subscriptions. // Add the feed to the user's subscriptions.
$sub = $this->db->prepare('INSERT INTO arsse_subscriptions(owner,feed) values(?,?)', 'str', 'int')->run($user, $feedID)->lastId(); return $this->db->prepare('INSERT INTO arsse_subscriptions(owner,feed) values(?,?)', 'str', 'int')->run($user, $feedID)->lastId();
} catch(\Throwable $e) {
$this->db->rollback();
throw $e;
}
$this->db->commit();
return $sub;
} }
public function subscriptionRemove(string $user, int $id): bool { public function subscriptionRemove(string $user, int $id): bool {
@ -440,6 +404,131 @@ class Database {
return (bool) $this->db->prepare("DELETE from arsse_subscriptions where owner is ? and id is ?", "str", "int")->run($user, $id)->changes(); return (bool) $this->db->prepare("DELETE from arsse_subscriptions where owner is ? and id is ?", "str", "int")->run($user, $id)->changes();
} }
// Registers a new feed: fetches and parses it, inserts a stub row for it into arsse_feeds,
// then hands the parsed feed to feedUpdate() to fill in the row and store its articles.
// Returns the ID of the new arsse_feeds row. If feedUpdate() throws, the stub row is
// deleted again and the same error is re-thrown to the caller.
public function feedAdd(string $url, string $fetchUser = "", string $fetchPassword = ""): int {
    // fetching and parsing come first, so any exception they raise propagates before anything is written
    $parsed = new Feed($url, "", "", $fetchUser, $fetchPassword);
    $parsed->parse();
    // store only the identifying fields for now; the resulting ID is what feedUpdate() operates on
    $insert = $this->db->prepare('INSERT INTO arsse_feeds(url,username,password) values(?,?,?)', 'str', 'str', 'str');
    $newID = $insert->run($url, $fetchUser, $fetchPassword)->lastId();
    try {
        $this->feedUpdate($newID, $parsed);
    } catch(\Throwable $failure) {
        // do not leave a half-initialized feed behind
        $cleanup = $this->db->prepare('DELETE from arsse_feeds where id is ?', 'int');
        $cleanup->run($newID);
        throw $failure;
    }
    return $newID;
}
// Refreshes one feed: reconciles the articles of a parsed feed against those stored for
// $feedID (adding new ones, rewriting changed ones) and then updates the feed's own row.
//
// $feedID is the arsse_feeds row to refresh. $feed is an already-parsed Feed (as supplied by
// feedAdd() on the very first update) or null, in which case the feed is fetched here using
// the URL, credentials, last-modified date and ETag stored in its row.
//
// Returns true when the feed was processed. Returns false when the feed was fetched here and
// either reported itself unmodified (nothing is written) or raised a Feed\Exception (the
// error counter and message on the feed's row are updated instead).
// Throws Db\ExceptionInput if no row exists for $feedID; any other error is re-thrown after
// the transaction has been rolled back.
public function feedUpdate(int $feedID, Feed $feed = null): bool {
    $this->db->begin();
    try {
        // upon the very first update of a feed the $feed object is already supplied and already parsed; for all other updates we must parse it ourselves here
        if(!$feed) {
            $f = $this->db->prepare('SELECT url, username, password, DATEFORMAT("http", modified) AS lastmodified, etag FROM arsse_feeds where id is ?', "int")->run($feedID)->getRow();
            if(!$f) throw new Db\ExceptionInput("idMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
            // The Feed object throws an exception when there are problems, but that isn't
            // ideal here. When an exception occurs the error should be recorded in the
            // database instead of failing.
            try {
                $feed = new Feed($f['url'], $f['lastmodified'], $f['etag'], $f['username'], $f['password']);
                if($feed->resource->isModified()) {
                    $feed->parse();
                } else {
                    // nothing has changed, so there is nothing to write
                    $this->db->rollback();
                    return false;
                }
            } catch (Feed\Exception $e) {
                $this->db->prepare('UPDATE arsse_feeds SET err_count = err_count + 1, err_msg = ? WHERE id is ?', 'str', 'int')->run($e->getMessage(),$feedID);
                $this->db->commit();
                return false;
            }
        }
        $articles = $this->db->prepare('SELECT id, url, title, author, DATEFORMAT("http", edited) AS edited_date, guid, content, url_title_hash, url_content_hash, title_content_hash FROM arsse_articles WHERE feed is ? ORDER BY id', 'int')->run($feedID)->getAll();
        foreach($feed->data->items as $i) {
            // Iterate through the articles in the database to determine a match for the one
            // in the just-parsed feed. The loop does not stop early, so when several stored
            // articles qualify the last one (highest id) is used.
            $match = null;
            foreach($articles as $a) {
                // If the id exists and is equal to one in the database then this is the post.
                if($i->id) {
                    if($i->id === $a['guid']) {
                        $match = $a;
                    }
                }
                // Otherwise if the id doesn't exist and any of the hashes match then this is
                // the post.
                elseif($i->urlTitleHash === $a['url_title_hash'] || $i->urlContentHash === $a['url_content_hash'] || $i->titleContentHash === $a['title_content_hash']) {
                    $match = $a;
                }
            }
            // If there is no match then this is a new post and must be added to the
            // database.
            if(!$match) {
                $this->articleAdd($feedID, $i);
                continue;
            }
            // With that out of the way determine if the post has been updated.
            // If there is an updated date, and it doesn't match the database's then update
            // the post.
            $update = false;
            if($i->updatedDate) {
                if($i->updatedDate instanceof \DateTimeInterface) {
                    // A date object can never be identical (===) to the string the query
                    // returns, so the two are compared as instants instead. The stored value
                    // is parsed with the same HTTP-date format used for Last-Modified below
                    // (assumes DATEFORMAT("http", ...) emits that format -- TODO confirm).
                    // If it cannot be parsed the article is updated, as before.
                    $stored = \DateTime::createFromFormat("!D, d M Y H:i:s e", (string) $match['edited_date']);
                    if(!$stored || $stored->getTimestamp() !== $i->updatedDate->getTimestamp()) {
                        $update = true;
                    }
                } elseif($i->updatedDate !== $match['edited_date']) {
                    $update = true;
                }
            }
            // Otherwise if there isn't an updated date and any of the hashes don't match
            // then update the post.
            elseif($i->urlTitleHash !== $match['url_title_hash'] || $i->urlContentHash !== $match['url_content_hash'] || $i->titleContentHash !== $match['title_content_hash']) {
                $update = true;
            }
            if($update) {
                $this->db->prepare(
                    'UPDATE arsse_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = CURRENT_TIMESTAMP, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ? WHERE id is ?',
                    'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int'
                )->run(
                    $i->url,
                    $i->title,
                    $i->author,
                    $i->publishedDate,
                    $i->updatedDate,
                    $i->id,
                    $i->content,
                    $i->urlTitleHash,
                    $i->urlContentHash,
                    $i->titleContentHash,
                    $match['id']
                );
                // Replace the article's categories wholesale with those of the new version.
                $this->db->prepare('DELETE FROM arsse_categories WHERE article is ?', 'int')->run($match['id']);
                $this->categoriesAdd($i, $match['id']);
            }
        }
        // Lastly update the feed's own row with the fresh metadata and clear any recorded error.
        $this->db->prepare('UPDATE arsse_feeds SET url = ?, title = ?, favicon = ?, source = ?, updated = ?, modified = ?, etag = ?, err_count = 0, err_msg = "" WHERE id is ?', 'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'int')->run(
            $feed->data->feedUrl,
            $feed->data->title,
            $feed->favicon,
            $feed->data->siteUrl,
            $feed->data->date,
            \DateTime::createFromFormat("!D, d M Y H:i:s e", $feed->resource->getLastModified()),
            $feed->resource->getEtag(),
            $feedID
        );
    } catch(\Throwable $e) {
        $this->db->rollback();
        throw $e;
    }
    $this->db->commit();
    return true;
}
public function articleAdd(int $feedID, \PicoFeed\Parser\Item $article): int { public function articleAdd(int $feedID, \PicoFeed\Parser\Item $article): int {
$this->db->begin(); $this->db->begin();
try { try {
@ -485,111 +574,4 @@ class Database {
$this->db->commit(); $this->db->commit();
return count($categories); return count($categories);
} }
// Refreshes every feed in the database by running feedUpdate() on each row of arsse_feeds.
//
// The per-feed work (fetching, error recording, article reconciliation and the transaction
// around it) is delegated to feedUpdate(). The previous inline copy of that logic called
// articleAdd() without its feed ID, read feed metadata from the wrong object, and committed
// once after the loop regardless of how many transactions had been begun.
//
// Always returns 1, as before, so existing callers see the same value. An error which
// feedUpdate() does not handle itself propagates and stops the run at that feed.
public function updateFeeds(): int {
    $feeds = $this->db->query('SELECT id FROM arsse_feeds')->getAll();
    foreach($feeds as $f) {
        // feedUpdate() returns false for unmodified or failing feeds; neither case should stop the run
        $this->feedUpdate((int) $f['id']);
    }
    return 1;
}
} }

View file

@ -79,7 +79,7 @@ return [
other {Authenticated user is not authorized to perform the action "{action}" on behalf of {user}} other {Authenticated user is not authorized to perform the action "{action}" on behalf of {user}}
}', }',
'Exception.JKingWeb/Arsse/Feed/Exception.invalidCertificate' => 'Could not download feed "{url}" because its server is serving an invalid SSL certificate', 'Exception.JKingWeb/Arsse/Feed/Exception.invalidCertificate' => 'Could not download feed "{url}" because its server is serving an invalid SSL certificate',
'Exception.JKingWeb/Arsse/Feed/Exception.invalidURL' => 'Feed URL "{url}" is invalid', 'Exception.JKingWeb/Arsse/Feed/Exception.invalidUrl' => 'Feed URL "{url}" is invalid',
'Exception.JKingWeb/Arsse/Feed/Exception.maxRedirect' => 'Could not download feed "{url}" because its server reached its maximum number of HTTP redirections', 'Exception.JKingWeb/Arsse/Feed/Exception.maxRedirect' => 'Could not download feed "{url}" because its server reached its maximum number of HTTP redirections',
'Exception.JKingWeb/Arsse/Feed/Exception.maxSize' => 'Could not download feed "{url}" because its size exceeds the maximum allowed on its server', 'Exception.JKingWeb/Arsse/Feed/Exception.maxSize' => 'Could not download feed "{url}" because its size exceeds the maximum allowed on its server',
'Exception.JKingWeb/Arsse/Feed/Exception.timeout' => 'Could not download feed "{url}" because its server timed out', 'Exception.JKingWeb/Arsse/Feed/Exception.timeout' => 'Could not download feed "{url}" because its server timed out',

View file

@ -20,14 +20,14 @@ create table arsse_users(
-- newsfeeds, deduplicated -- newsfeeds, deduplicated
create table arsse_feeds( create table arsse_feeds(
id integer primary key not null, -- sequence number id integer primary key, -- sequence number
url TEXT not null, -- URL of feed url TEXT not null, -- URL of feed
title TEXT, -- default title of feed title TEXT, -- default title of feed
favicon TEXT, -- URL of favicon favicon TEXT, -- URL of favicon
source TEXT, -- URL of site to which the feed belongs source TEXT, -- URL of site to which the feed belongs
updated datetime, -- time at which the feed was last fetched updated datetime, -- time at which the feed was last fetched
modified datetime, -- time at which the feed last actually changed modified datetime, -- time at which the feed last actually changed
etag TEXT, -- HTTP ETag hash used for cache validation, changes each time the content changes etag TEXT not null default '', -- HTTP ETag hash used for cache validation, changes each time the content changes
err_count integer not null default 0, -- count of successive times update resulted in error since last successful update err_count integer not null default 0, -- count of successive times update resulted in error since last successful update
err_msg TEXT, -- last error message err_msg TEXT, -- last error message
username TEXT not null default '', -- HTTP authentication username username TEXT not null default '', -- HTTP authentication username
@ -37,7 +37,7 @@ create table arsse_feeds(
-- users' subscriptions to newsfeeds, with settings -- users' subscriptions to newsfeeds, with settings
create table arsse_subscriptions( create table arsse_subscriptions(
id integer primary key not null, -- sequence number id integer primary key, -- sequence number
owner TEXT not null references arsse_users(id) on delete cascade on update cascade, -- owner of subscription owner TEXT not null references arsse_users(id) on delete cascade on update cascade, -- owner of subscription
feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription
added datetime not null default CURRENT_TIMESTAMP, -- time at which feed was added added datetime not null default CURRENT_TIMESTAMP, -- time at which feed was added
@ -51,7 +51,7 @@ create table arsse_subscriptions(
-- TT-RSS categories and NextCloud folders -- TT-RSS categories and NextCloud folders
create table arsse_folders( create table arsse_folders(
id integer primary key not null, -- sequence number id integer primary key, -- sequence number
owner TEXT not null references arsse_users(id) on delete cascade on update cascade, -- owner of folder owner TEXT not null references arsse_users(id) on delete cascade on update cascade, -- owner of folder
parent integer references arsse_folders(id) on delete cascade, -- parent folder id parent integer references arsse_folders(id) on delete cascade, -- parent folder id
name TEXT not null, -- folder name name TEXT not null, -- folder name
@ -61,7 +61,7 @@ create table arsse_folders(
-- entries in newsfeeds -- entries in newsfeeds
create table arsse_articles( create table arsse_articles(
id integer primary key not null, -- sequence number id integer primary key, -- sequence number
feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription
url TEXT not null, -- URL of article url TEXT not null, -- URL of article
title TEXT, -- article title title TEXT, -- article title
@ -85,17 +85,23 @@ create table arsse_enclosures(
-- users' actions on newsfeed entries -- users' actions on newsfeed entries
create table arsse_subscription_articles( create table arsse_subscription_articles(
id integer primary key not null, id integer primary key,
article integer not null references arsse_articles(id) on delete cascade, article integer not null references arsse_articles(id) on delete cascade,
owner TEXT not null references arsse_users(id) on delete cascade on update cascade,
read boolean not null default 0, read boolean not null default 0,
starred boolean not null default 0, starred boolean not null default 0,
modified datetime not null default CURRENT_TIMESTAMP modified datetime not null default CURRENT_TIMESTAMP
); );
-- IDs for specific editions of articles (required for at least NextCloud News)
create table arsse_editions(
id integer primary key,
article integer not null references arsse_articles(id) on delete cascade
);
-- user labels associated with newsfeed entries -- user labels associated with newsfeed entries
create table arsse_labels( create table arsse_labels(
sub_article integer not null references arsse_subscription_articles(id) on delete cascade, -- sub_article integer not null references arsse_subscription_articles(id) on delete cascade,
owner TEXT not null references arsse_users(id) on delete cascade on update cascade,
name TEXT name TEXT
); );
create index arsse_label_names on arsse_labels(name); create index arsse_label_names on arsse_labels(name);