mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2024-12-22 21:22:40 +00:00
Move feed item change detection to Feed class
This commit is contained in:
parent
b1b96bb806
commit
93c010d3d5
2 changed files with 120 additions and 107 deletions
129
lib/Database.php
129
lib/Database.php
|
@ -441,7 +441,7 @@ class Database {
|
||||||
try {
|
try {
|
||||||
$feed = new Feed($f['url'], (string)$f['lastmodified'], $f['etag'], $f['username'], $f['password']);
|
$feed = new Feed($f['url'], (string)$f['lastmodified'], $f['etag'], $f['username'], $f['password']);
|
||||||
if($feed->resource->isModified()) {
|
if($feed->resource->isModified()) {
|
||||||
$feed->parse();
|
$feed->parse($feedID);
|
||||||
} else {
|
} else {
|
||||||
// if the feed hasn't changed, just compute the next fetch time and record it
|
// if the feed hasn't changed, just compute the next fetch time and record it
|
||||||
$next = $this->feedNextFetch($feedID);
|
$next = $this->feedNextFetch($feedID);
|
||||||
|
@ -459,110 +459,12 @@ class Database {
|
||||||
$this->db->rollback();
|
$this->db->rollback();
|
||||||
throw $e;
|
throw $e;
|
||||||
}
|
}
|
||||||
// FIXME: first perform deduplication on the feed itself
|
|
||||||
// array if items in the fetched feed
|
|
||||||
$items = $feed->data->items;
|
|
||||||
// get as many of the latest articles in the database as there are in the feed
|
|
||||||
$articles = $this->db->prepare(
|
|
||||||
'SELECT id, DATEFORMAT("unix", edited) AS edited_date, guid, url_title_hash, url_content_hash, title_content_hash FROM arsse_articles WHERE feed is ? ORDER BY edited desc limit ?',
|
|
||||||
'int', 'int'
|
|
||||||
)->run(
|
|
||||||
$feedID, sizeof($items)
|
|
||||||
)->getAll();
|
|
||||||
// arrays holding new, edited, and tentatively new items
|
|
||||||
// items may be tentatively new because we perform two passes
|
|
||||||
$new = $tentative = $edited = [];
|
|
||||||
// iterate through the articles and for each determine whether it is existing, edited, or entirely new
|
|
||||||
foreach($items as $index => $i) {
|
|
||||||
foreach($articles as $a) {
|
|
||||||
if(
|
|
||||||
// the item matches if the GUID matches...
|
|
||||||
($i->id && $i->id === $a['guid']) ||
|
|
||||||
// ... or if any one of the hashes match
|
|
||||||
$i->urlTitleHash === $a['url_title_hash'] ||
|
|
||||||
$i->urlContentHash === $a['url_content_hash'] ||
|
|
||||||
$i->titleContentHash === $a['title_content_hash']
|
|
||||||
) {
|
|
||||||
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
|
|
||||||
// if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited
|
|
||||||
// we store the item index and database record ID as a key/value pair
|
|
||||||
$edited[$index] = $a['id'];
|
|
||||||
break;
|
|
||||||
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) {
|
|
||||||
// if any of the hashes do not match, then the article has been edited
|
|
||||||
$edited[$index] = $a['id'];
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
// otherwise the item is unchanged and we can ignore it
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// if we don't have a match, add the item to the tentatively new list
|
|
||||||
$tentative[] = $index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if(sizeof($tentative)) {
|
|
||||||
// if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items
|
|
||||||
$vId = $vHashUT = $vHashUC = $vHashTC = [];
|
|
||||||
foreach($tentative as $index) {
|
|
||||||
$i = $items[$index];
|
|
||||||
if($i->id) $vId[] = $id->id;
|
|
||||||
$vHashUT[] = $i->urlTitleHash;
|
|
||||||
$vHashUC[] = $i->urlContentHash;
|
|
||||||
$vHashTC[] = $i->titleContentHash;
|
|
||||||
}
|
|
||||||
// compile SQL IN() clauses and necessary type bindings for the four identifier lists
|
|
||||||
list($cId, $tId) = $thiis->generateIn($vId, "str");
|
|
||||||
list($cHashUT, $tHashUT) = $thiis->generateIn($vHashUT, "str");
|
|
||||||
list($cHashUC, $tHashUC) = $thiis->generateIn($vHashUC, "str");
|
|
||||||
list($cHashTC, $tHashTC) = $thiis->generateIn($vHashTC, "str");
|
|
||||||
// perform the query
|
|
||||||
$articles = $this->db->prepare(
|
|
||||||
'SELECT id, DATEFORMAT("unix", edited) AS edited_date, guid, url_title_hash, url_content_hash, title_content_hash FROM arsse_articles '.
|
|
||||||
'WHERE feed is ? and (guid in($cId) or url_title_hash in($cHashUT) or url_content_hash in($cHashUC) or title_content_hash in($cHashTC)',
|
|
||||||
'int', $tId, $tHashUT, $tHashUC, $tHashTC
|
|
||||||
)->run(
|
|
||||||
$feedID, $vId, $vHashUT, $vHashUC, $vHashTC
|
|
||||||
)->getAll();
|
|
||||||
foreach($tentative as $index) {
|
|
||||||
$i = $items[$index];
|
|
||||||
foreach($articles as $a) {
|
|
||||||
if(
|
|
||||||
// the item matches if the GUID matches...
|
|
||||||
($i->id && $i->id === $a['guid']) ||
|
|
||||||
// ... or if any one of the hashes match
|
|
||||||
$i->urlTitleHash === $a['url_title_hash'] ||
|
|
||||||
$i->urlContentHash === $a['url_content_hash'] ||
|
|
||||||
$i->titleContentHash === $a['title_content_hash']
|
|
||||||
) {
|
|
||||||
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
|
|
||||||
// if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited
|
|
||||||
// we store the item index and database record ID as a key/value pair
|
|
||||||
$edited[$index] = $a['id'];
|
|
||||||
break;
|
|
||||||
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) {
|
|
||||||
// if any of the hashes do not match, then the article has been edited
|
|
||||||
$edited[$index] = $a['id'];
|
|
||||||
break;
|
|
||||||
} else {
|
|
||||||
// otherwise the item is unchanged and we can ignore it
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
// if we don't have a match, add the item to the definite new list
|
|
||||||
$new[] = $index;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// FIXME: fetch full content when appropriate
|
|
||||||
// finally actually perform updates
|
// finally actually perform updates
|
||||||
foreach($new as $index) {
|
foreach($feed->newItems as $item) {
|
||||||
$this->articleAdd($feedID, $items[$index]);
|
$this->articleAdd($feedID, $item);
|
||||||
}
|
}
|
||||||
foreach($edited as $index => $id) {
|
foreach($feed->changedItems as $id => $item) {
|
||||||
$this->articleAdd($feedID, $items[$index], $id);
|
$this->articleAdd($feedID, $item, $id);
|
||||||
}
|
}
|
||||||
// lastly update the feed database itself with updated information.
|
// lastly update the feed database itself with updated information.
|
||||||
$next = $this->feedNextFetch($feedID, $feed);
|
$next = $this->feedNextFetch($feedID, $feed);
|
||||||
|
@ -589,6 +491,27 @@ class Database {
|
||||||
return new \DateTime("now + 3 hours", new \DateTimeZone("UTC"));
|
return new \DateTime("now + 3 hours", new \DateTimeZone("UTC"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Fetches the most recently edited articles stored for a feed.
 *
 * Each row carries the article's id, its edit date (produced by the
 * DATEFORMAT("unix", ...) SQL call), its guid, and the three comparison
 * hashes (url_title_hash, url_content_hash, title_content_hash).
 *
 * @param int $feedID the feed whose articles are queried
 * @param int $count  the maximum number of rows to return (bound to LIMIT)
 * @return Db\Result the result of running the prepared query
 */
public function articleMatchLatest(int $feedID, int $count): Db\Result {
    // rows are ordered newest-edit first, so the limit keeps the latest $count articles
    $query = 'SELECT id, DATEFORMAT("unix", edited) AS edited_date, guid, url_title_hash, url_content_hash, title_content_hash FROM arsse_articles WHERE feed is ? ORDER BY edited desc limit ?';
    // both placeholders (feed ID and limit) are bound as integers
    $statement = $this->db->prepare($query, 'int', 'int');
    return $statement->run($feedID, $count);
}
|
||||||
|
|
||||||
|
/**
 * Fetches the articles of a feed which match any of the supplied identifiers.
 *
 * An article is returned when its guid appears in $ids, or when any one of
 * its three comparison hashes appears in the corresponding hash list.
 * Each row carries the article's id, its edit date (produced by the
 * DATEFORMAT("unix", ...) SQL call), its guid, and the three hashes.
 *
 * @param int   $feedID   the feed whose articles are queried
 * @param array $ids      guids to match against the "guid" column
 * @param array $hashesUT hashes to match against "url_title_hash"
 * @param array $hashesUC hashes to match against "url_content_hash"
 * @param array $hashesTC hashes to match against "title_content_hash"
 * @return Db\Result the result of running the prepared query
 */
public function articleMatchIds(int $feedID, array $ids = [], array $hashesUT = [], array $hashesUC = [], array $hashesTC = []): Db\Result {
    // compile SQL IN() clauses and necessary type bindings for the four identifier lists
    // NOTE(review): what generateIn() emits for an empty list is not visible here -- confirm
    // that the resulting "in()" is acceptable to the database driver
    list($cId, $tId) = $this->generateIn($ids, "str");
    list($cHashUT, $tHashUT) = $this->generateIn($hashesUT, "str");
    list($cHashUC, $tHashUC) = $this->generateIn($hashesUC, "str");
    list($cHashTC, $tHashTC) = $this->generateIn($hashesTC, "str");
    // The SELECT fragment stays single-quoted because it contains literal double quotes.
    // The WHERE fragment must be double-quoted: in a single-quoted string the generated
    // IN() lists would never be interpolated and "$cId" etc. would reach the database verbatim.
    // The parenthesis opened after "and" is closed at the end so the OR group is well-formed.
    $query =
        'SELECT id, DATEFORMAT("unix", edited) AS edited_date, guid, url_title_hash, url_content_hash, title_content_hash FROM arsse_articles '.
        "WHERE feed is ? and (guid in({$cId}) or url_title_hash in({$cHashUT}) or url_content_hash in({$cHashUC}) or title_content_hash in({$cHashTC}))";
    // perform the query; binding types and values are passed in the same order as the placeholders
    return $this->db->prepare(
        $query,
        'int', $tId, $tHashUT, $tHashUC, $tHashTC
    )->run($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
}
|
||||||
|
|
||||||
public function articleAdd(int $feedID, \PicoFeed\Parser\Item $article, int $articleID = null): int {
|
public function articleAdd(int $feedID, \PicoFeed\Parser\Item $article, int $articleID = null): int {
|
||||||
$this->db->begin();
|
$this->db->begin();
|
||||||
try {
|
try {
|
||||||
|
|
96
lib/Feed.php
96
lib/Feed.php
|
@ -12,6 +12,8 @@ class Feed {
|
||||||
public $parser;
|
public $parser;
|
||||||
public $reader;
|
public $reader;
|
||||||
public $resource;
|
public $resource;
|
||||||
|
public $newItems = [];
|
||||||
|
public $changedItems = [];
|
||||||
|
|
||||||
public function __construct(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '') {
|
public function __construct(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '') {
|
||||||
try {
|
try {
|
||||||
|
@ -26,7 +28,7 @@ class Feed {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public function parse(): bool {
|
public function parse(int $feedID = null): bool {
|
||||||
try {
|
try {
|
||||||
$this->parser = $this->reader->getParser(
|
$this->parser = $this->reader->getParser(
|
||||||
$this->resource->getUrl(),
|
$this->resource->getUrl(),
|
||||||
|
@ -50,7 +52,7 @@ class Feed {
|
||||||
// identification. These feeds shouldn't be duplicated when updated. That should
|
// identification. These feeds shouldn't be duplicated when updated. That should
|
||||||
// only be reserved for severely broken feeds.
|
// only be reserved for severely broken feeds.
|
||||||
|
|
||||||
foreach ($feed->items as &$f) {
|
foreach ($feed->items as $f) {
|
||||||
// Hashes used for comparison to check for updates and also to identify when an
|
// Hashes used for comparison to check for updates and also to identify when an
|
||||||
// id doesn't exist.
|
// id doesn't exist.
|
||||||
$f->urlTitleHash = hash('sha256', $f->url.$f->title);
|
$f->urlTitleHash = hash('sha256', $f->url.$f->title);
|
||||||
|
@ -80,7 +82,95 @@ class Feed {
|
||||||
// If there aren't any of those there is no id.
|
// If there aren't any of those there is no id.
|
||||||
$f->id = '';
|
$f->id = '';
|
||||||
}
|
}
|
||||||
|
// if a feedID is supplied, determine which items are already in the database, which are not, and which might have been edited
|
||||||
|
if(!is_null($feedID)) {
|
||||||
|
// FIXME: first perform deduplication on items
|
||||||
|
// array of items in the fetched feed
|
||||||
|
$items = $feed->items;
|
||||||
|
// get as many of the latest articles in the database as there are in the feed
|
||||||
|
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
|
||||||
|
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes
|
||||||
|
$new = $tentative = $edited = [];
|
||||||
|
// iterate through the articles and for each determine whether it is existing, edited, or entirely new
|
||||||
|
foreach($items as $index => $i) {
|
||||||
|
foreach($articles as $a) {
|
||||||
|
if(
|
||||||
|
// the item matches if the GUID matches...
|
||||||
|
($i->id && $i->id === $a['guid']) ||
|
||||||
|
// ... or if any one of the hashes match
|
||||||
|
$i->urlTitleHash === $a['url_title_hash'] ||
|
||||||
|
$i->urlContentHash === $a['url_content_hash'] ||
|
||||||
|
$i->titleContentHash === $a['title_content_hash']
|
||||||
|
) {
|
||||||
|
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
|
||||||
|
// if the item has an edit timestamp and it doesn't match that of the article in the database, then the article has been edited
|
||||||
|
// we store the item index and database record ID as a key/value pair
|
||||||
|
$edited[$index] = $a['id'];
|
||||||
|
break;
|
||||||
|
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) {
|
||||||
|
// if any of the hashes do not match, then the article has been edited
|
||||||
|
$edited[$index] = $a['id'];
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// otherwise the item is unchanged and we can ignore it
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// if we don't have a match, add the item to the tentatively new list
|
||||||
|
$tentative[] = $index;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if(sizeof($tentative)) {
|
||||||
|
// if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items
|
||||||
|
$ids = $hashesUT = $hashesUC = $hashesTC = [];
|
||||||
|
foreach($tentative as $index) {
|
||||||
|
$i = $items[$index];
|
||||||
|
if($i->id) $ids[] = $id->id;
|
||||||
|
$hashesUT[] = $i->urlTitleHash;
|
||||||
|
$hashesUC[] = $i->urlContentHash;
|
||||||
|
$hashesTC[] = $i->titleContentHash;
|
||||||
|
}
|
||||||
|
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
|
||||||
|
foreach($tentative as $index) {
|
||||||
|
$i = $items[$index];
|
||||||
|
foreach($articles as $a) {
|
||||||
|
if(
|
||||||
|
// the item matches if the GUID matches...
|
||||||
|
($i->id && $i->id === $a['guid']) ||
|
||||||
|
// ... or if any one of the hashes match
|
||||||
|
$i->urlTitleHash === $a['url_title_hash'] ||
|
||||||
|
$i->urlContentHash === $a['url_content_hash'] ||
|
||||||
|
$i->titleContentHash === $a['title_content_hash']
|
||||||
|
) {
|
||||||
|
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
|
||||||
|
// if the item has an edit timestamp and it doesn't match that of the article in the database, then the article has been edited
|
||||||
|
// we store the item index and database record ID as a key/value pair
|
||||||
|
$edited[$index] = $a['id'];
|
||||||
|
break;
|
||||||
|
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) {
|
||||||
|
// if any of the hashes do not match, then the article has been edited
|
||||||
|
$edited[$index] = $a['id'];
|
||||||
|
break;
|
||||||
|
} else {
|
||||||
|
// otherwise the item is unchanged and we can ignore it
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// if we don't have a match, add the item to the definite new list
|
||||||
|
$new[] = $index;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// FIXME: fetch full content when appropriate
|
||||||
|
foreach($new as $index) {
|
||||||
|
$this->newItems[] = $items[$index];
|
||||||
|
}
|
||||||
|
foreach($edited as $index => $id) {
|
||||||
|
$this->changedItems[$id] = $items[$index];
|
||||||
|
}
|
||||||
|
}
|
||||||
$this->data = $feed;
|
$this->data = $feed;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue