mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2024-12-22 21:22:40 +00:00
More feed update refactoring
Still very much incomplete: in its present form it would yield many false duplicates
This commit is contained in:
parent
4e57e56ca7
commit
119d42907e
4 changed files with 61 additions and 42 deletions
|
@ -405,13 +405,11 @@ class Database {
|
||||||
}
|
}
|
||||||
|
|
||||||
public function feedAdd(string $url, string $fetchUser = "", string $fetchPassword = ""): int {
|
public function feedAdd(string $url, string $fetchUser = "", string $fetchPassword = ""): int {
|
||||||
$feed = new Feed($url, "", "", $fetchUser, $fetchPassword);
|
|
||||||
$feed->parse();
|
|
||||||
$feedID = $this->db->prepare('INSERT INTO arsse_feeds(url,username,password) values(?,?,?)', 'str', 'str', 'str')->run($url, $fetchUser, $fetchPassword)->lastId();
|
$feedID = $this->db->prepare('INSERT INTO arsse_feeds(url,username,password) values(?,?,?)', 'str', 'str', 'str')->run($url, $fetchUser, $fetchPassword)->lastId();
|
||||||
// Add the feed to the database and return its Id which will be used when adding
|
// Add the feed to the database and return its Id which will be used when adding
|
||||||
// its articles to the database.
|
// its articles to the database.
|
||||||
try {
|
try {
|
||||||
$this->feedUpdate($feedID, $feed);
|
$this->feedUpdate($feedID);
|
||||||
} catch(\Throwable $e) {
|
} catch(\Throwable $e) {
|
||||||
$this->db->prepare('DELETE from arsse_feeds where id is ?', 'int')->run($feedID);
|
$this->db->prepare('DELETE from arsse_feeds where id is ?', 'int')->run($feedID);
|
||||||
throw $e;
|
throw $e;
|
||||||
|
@ -419,54 +417,63 @@ class Database {
|
||||||
return $feedID;
|
return $feedID;
|
||||||
}
|
}
|
||||||
|
|
||||||
public function feedUpdate(int $feedID, Feed $feed = null): bool {
|
public function feedUpdate(int $feedID): bool {
|
||||||
$this->db->begin();
|
$this->db->begin();
|
||||||
try {
|
try {
|
||||||
// upon the very first update of a feed the $feed object is already supplied and already parsed; for all other updates we must parse it ourselves here
|
// check to make sure the feed exists
|
||||||
if(!$feed) {
|
$f = $this->db->prepare('SELECT url, username, password, DATEFORMAT("http", modified) AS lastmodified, etag FROM arsse_feeds where id is ?', "int")->run($feedID)->getRow();
|
||||||
$f = $this->db->prepare('SELECT url, username, password, DATEFORMAT("http", modified) AS lastmodified, etag FROM arsse_feeds where id is ?', "int")->run($feedID)->getRow();
|
if(!$f) throw new Db\ExceptionInput("idMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
|
||||||
if(!$f) throw new Db\ExceptionInput("idMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
|
// the Feed object throws an exception when there are problems, but that isn't ideal
|
||||||
// Feed object throws an exception when there are problems, but that isn't ideal
|
// here. When an exception is thrown it should update the database with the
|
||||||
// here. When an exception is occurred it should update the database with the
|
// error instead of failing; if other exceptions are thrown, we should simply roll back
|
||||||
// error instead of failing.
|
try {
|
||||||
try {
|
$feed = new Feed($f['url'], (string)$f['lastmodified'], $f['etag'], $f['username'], $f['password']);
|
||||||
$feed = new Feed($f['url'], $f['lastmodified'], $f['etag'], $f['username'], $f['password']);
|
if($feed->resource->isModified()) {
|
||||||
if($feed->resource->isModified()) {
|
$feed->parse();
|
||||||
$feed->parse();
|
} else {
|
||||||
} else {
|
$next = $this->feedNextFetch($feedID);
|
||||||
$this->db->rollback();
|
$this->db->prepare('UPDATE arsse_feeds SET updated = CURRENT_TIMESTAMP, next_fetch = ? WHERE id is ?', 'datetime', 'int')->run($next, $feedID);
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} catch (Feed\Exception $e) {
|
|
||||||
$this->db->prepare('UPDATE arsse_feeds SET err_count = err_count + 1, err_msg = ? WHERE id is ?', 'str', 'int')->run($e->getMessage(),$feedID);
|
|
||||||
$this->db->commit();
|
$this->db->commit();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
} catch (Feed\Exception $e) {
|
||||||
|
$next = $this->feedNextFetch($feedID);
|
||||||
|
$this->db->prepare('UPDATE arsse_feeds SET updated = CURRENT_TIMESTAMP, next_fetch = ?, err_count = err_count + 1, err_msg = ? WHERE id is ?', 'datetime', 'str', 'int')->run($next, $e->getMessage(),$feedID);
|
||||||
|
$this->db->commit();
|
||||||
|
return false;
|
||||||
|
} catch(\Throwable $e) {
|
||||||
|
$this->db->rollback();
|
||||||
|
throw $e;
|
||||||
}
|
}
|
||||||
$articles = $this->db->prepare('SELECT id, url, title, author, DATEFORMAT("http", edited) AS edited_date, guid, content, url_title_hash, url_content_hash, title_content_hash FROM arsse_articles WHERE feed is ? ORDER BY id', 'int')->run($feedID)->getAll();
|
// array of items in the fetched feed
|
||||||
|
$items = $feed->data->items;
|
||||||
foreach($feed->data->items as $i) {
|
// get as many of the latest articles in the database as there are in the feed
|
||||||
|
$articles = $this->db->prepare(
|
||||||
|
'SELECT id, DATEFORMAT("http", edited) AS edited_date, guid, url_title_hash, url_content_hash, title_content_hash FROM arsse_articles WHERE feed is ? ORDER BY edited desc limit ?',
|
||||||
|
'int', 'int'
|
||||||
|
)->run(
|
||||||
|
$feedID,
|
||||||
|
sizeof($items)
|
||||||
|
)->getAll();
|
||||||
|
foreach($items as $index => $i) {
|
||||||
// Iterate through the articles in the database to determine a match for the one
|
// Iterate through the articles in the database to determine a match for the one
|
||||||
// in the just-parsed feed.
|
// in the just-parsed feed.
|
||||||
$match = null;
|
$match = null;
|
||||||
foreach($articles as $a) {
|
foreach($articles as $a) {
|
||||||
// If the id exists and is equal to one in the database then this is the post.
|
// If the id exists and is equal to one in the database then this is the post.
|
||||||
if($i->id) {
|
if($i->id && $i->id === $a['guid']) {
|
||||||
if($i->id === $a['guid']) {
|
$match = $a;
|
||||||
$match = $a;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Otherwise if the id doesn't exist and any of the hashes match then this is
|
// Otherwise if the id doesn't exist and any of the hashes match then this is
|
||||||
// the post.
|
// the post.
|
||||||
elseif($i->urlTitleHash === $a['url_title_hash'] || $i->urlContentHash === $a['url_content_hash'] || $i->titleContentHash === $a['title_content_hash']) {
|
elseif($i->urlTitleHash === $a['url_title_hash'] || $i->urlContentHash === $a['url_content_hash'] || $i->titleContentHash === $a['title_content_hash']) {
|
||||||
$match = $a;
|
$match = $a;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If there is no match then this is a new post and must be added to the
|
// If there is no match then this is a new post and must be added to the
|
||||||
// database.
|
// database.
|
||||||
if(!$match) {
|
if(!$match) {
|
||||||
|
// FIXME: First perform a second pass
|
||||||
$this->articleAdd($feedID, $i);
|
$this->articleAdd($feedID, $i);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -511,14 +518,15 @@ class Database {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Lastly update the feed database itself with updated information.
|
// Lastly update the feed database itself with updated information.
|
||||||
$this->db->prepare('UPDATE arsse_feeds SET url = ?, title = ?, favicon = ?, source = ?, updated = ?, modified = ?, etag = ?, err_count = 0, err_msg = "" WHERE id is ?', 'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'int')->run(
|
$next = $this->feedNextFetch($feedID, $feed);
|
||||||
|
$this->db->prepare('UPDATE arsse_feeds SET url = ?, title = ?, favicon = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = "", next_fetch = ? WHERE id is ?', 'str', 'str', 'str', 'str', 'datetime', 'str', 'datetime', 'int')->run(
|
||||||
$feed->data->feedUrl,
|
$feed->data->feedUrl,
|
||||||
$feed->data->title,
|
$feed->data->title,
|
||||||
$feed->favicon,
|
$feed->favicon,
|
||||||
$feed->data->siteUrl,
|
$feed->data->siteUrl,
|
||||||
$feed->data->date,
|
|
||||||
\DateTime::createFromFormat("!D, d M Y H:i:s e", $feed->resource->getLastModified()),
|
\DateTime::createFromFormat("!D, d M Y H:i:s e", $feed->resource->getLastModified()),
|
||||||
$feed->resource->getEtag(),
|
$feed->resource->getEtag(),
|
||||||
|
$next,
|
||||||
$feedID
|
$feedID
|
||||||
);
|
);
|
||||||
} catch(\Throwable $e) {
|
} catch(\Throwable $e) {
|
||||||
|
@ -529,6 +537,11 @@ class Database {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
protected function feedNextFetch(int $feedID, Feed $feed = null): \DateTime {
|
||||||
|
// FIXME: stub
|
||||||
|
return new \DateTime("now + 3 hours", new \DateTimeZone("UTC"));
|
||||||
|
}
|
||||||
|
|
||||||
public function articleAdd(int $feedID, \PicoFeed\Parser\Item $article): int {
|
public function articleAdd(int $feedID, \PicoFeed\Parser\Item $article): int {
|
||||||
$this->db->begin();
|
$this->db->begin();
|
||||||
try {
|
try {
|
||||||
|
|
|
@ -3,19 +3,24 @@ declare(strict_types=1);
|
||||||
namespace JKingWeb\Arsse\Db\SQLite3;
|
namespace JKingWeb\Arsse\Db\SQLite3;
|
||||||
|
|
||||||
class CustomFunctions {
|
class CustomFunctions {
|
||||||
|
protected static $tz;
|
||||||
|
|
||||||
// Converts from SQLite3's date format to a specified standard date format.
|
// Converts from SQLite3's date format to a specified standard date format.
|
||||||
public static function dateFormat(string $format, string $date): string {
|
public static function dateFormat(string $format, $date) {
|
||||||
$date = \DateTime::createFromFormat('Y-m-d H:i:s', $date, 'UTC');
|
settype($date, "string");
|
||||||
|
if($date=="") return null;
|
||||||
|
if(is_null(self::$tz)) self::$tz = new \DateTimeZone("UTC");
|
||||||
|
$date = \DateTime::createFromFormat('Y-m-d H:i:s', $date, self::$tz);
|
||||||
$format = strtolower($format);
|
$format = strtolower($format);
|
||||||
switch ($format) {
|
switch ($format) {
|
||||||
case 'unix': return (string)$date->getTimestamp();
|
case 'unix':
|
||||||
break;
|
return $date->getTimestamp();
|
||||||
case 'rfc822':
|
case 'rfc822':
|
||||||
case 'http': return $date->format(\DateTime::RFC822);
|
case 'http':
|
||||||
break;
|
return $date->format(\DateTime::RFC822);
|
||||||
case 'iso8601':
|
case 'iso8601':
|
||||||
default: return $date->format(\DateTime::ISO8601);
|
default:
|
||||||
|
return $date->format(\DateTime::ISO8601);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
|
@ -40,9 +40,9 @@ class Statement extends \JKingWeb\Arsse\Db\AbstractStatement {
|
||||||
|
|
||||||
public static function dateFormat(int $part = self::TS_BOTH): string {
|
public static function dateFormat(int $part = self::TS_BOTH): string {
|
||||||
return ([
|
return ([
|
||||||
self::TS_TIME => 'h:i:sP',
|
self::TS_TIME => 'h:i:s',
|
||||||
self::TS_DATE => 'Y-m-d',
|
self::TS_DATE => 'Y-m-d',
|
||||||
self::TS_BOTH => 'Y-m-d h:i:sP',
|
self::TS_BOTH => 'Y-m-d h:i:s',
|
||||||
])[$part];
|
])[$part];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -27,6 +27,7 @@ create table arsse_feeds(
|
||||||
source TEXT, -- URL of site to which the feed belongs
|
source TEXT, -- URL of site to which the feed belongs
|
||||||
updated datetime, -- time at which the feed was last fetched
|
updated datetime, -- time at which the feed was last fetched
|
||||||
modified datetime, -- time at which the feed last actually changed
|
modified datetime, -- time at which the feed last actually changed
|
||||||
|
next_fetch datetime, -- time at which the feed should next be fetched
|
||||||
etag TEXT not null default '', -- HTTP ETag hash used for cache validation, changes each time the content changes
|
etag TEXT not null default '', -- HTTP ETag hash used for cache validation, changes each time the content changes
|
||||||
err_count integer not null default 0, -- count of successive times update resulted in error since last successful update
|
err_count integer not null default 0, -- count of successive times update resulted in error since last successful update
|
||||||
err_msg TEXT, -- last error message
|
err_msg TEXT, -- last error message
|
||||||
|
|
Loading…
Reference in a new issue