1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2024-12-22 13:12:41 +00:00

Deduplicate feed items within the feed itself

This commit is contained in:
J. King 2017-04-23 13:12:33 -04:00
parent 93c010d3d5
commit f842439b01

View file

@ -82,17 +82,104 @@ class Feed {
// If there aren't any of those there is no id.
$f->id = '';
}
$this->data = $feed;
// if a feedID is supplied, determine which items are already in the database, which are not, and which might have been edited
if(!is_null($feedID)) {
// FIXME: first perform deduplication on items
// array of items in the fetched feed
$items = $feed->items;
// get as many of the latest articles in the database as there are in the feed
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes
$new = $tentative = $edited = [];
// iterate through the articles and for each determine whether it is existing, edited, or entirely new
foreach($items as $index => $i) {
$this->matchToDatabase($feedID);
}
return true;
}
/**
 * Collapses multiple versions of the same item within a single fetched feed.
 *
 * Rationale: some newsfeeds (notably Planet) include multiple versions of an
 * item if it is updated. As we only care about the latest, we try to remove
 * any "old" versions of an item that might also be present within the feed.
 *
 * Two items are considered versions of the same item if both have a
 * non-empty ID and the IDs are identical, or if any one of their three
 * hashes is identical.
 *
 * @param array $items Feed items; each is read for its id, urlTitleHash, urlContentHash, titleContentHash, updatedDate and publishedDate properties
 * @return array The items with duplicates collapsed; a retained version occupies the position of the first version encountered
 */
protected function deduplicateItems(array $items): array {
    $out = [];
    foreach($items as $item) {
        foreach($out as $index => $check) {
            // Strict comparison is deliberate: with loose comparison PHP treats
            // numeric-looking strings as numbers, so distinct IDs or hashes such as
            // "1e3" and "1000" (or "0e12" and "0e34") would wrongly be seen as equal
            $sameItem =
                ($item->id && $check->id && $item->id === $check->id) ||
                $item->urlTitleHash === $check->urlTitleHash ||
                $item->urlContentHash === $check->urlContentHash ||
                $item->titleContentHash === $check->titleContentHash;
            if(!$sameItem) {
                continue;
            }
            // because newsfeeds are usually ordered newest-first, the later item should only be used if...
            $laterIsNewer =
                // the later item has an update date and the existing item does not
                ($item->updatedDate && !$check->updatedDate) ||
                // the later item has an update date newer than the existing item's
                ($item->updatedDate && $check->updatedDate && $item->updatedDate->getTimestamp() > $check->updatedDate->getTimestamp()) ||
                // neither item has update dates, both have publish dates, and the later item has a newer publish date
                (!$item->updatedDate && !$check->updatedDate && $item->publishedDate && $check->publishedDate && $item->publishedDate->getTimestamp() > $check->publishedDate->getTimestamp());
            if($laterIsNewer) {
                // replace the existing version in place so that its position in the list is preserved
                $out[$index] = $item;
            }
            // whether replaced or skipped, this item has been dealt with; move on to the next one
            continue 2;
        }
        // if there was no match, add the item
        $out[] = $item;
    }
    return $out;
}
protected function matchToDatabase(int $feedID): bool {
// first perform deduplication on items
$items = $this->deduplicateItems($this->data->items);
// get as many of the latest articles in the database as there are in the feed
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes
$new = $tentative = $edited = [];
// iterate through the articles and for each determine whether it is existing, edited, or entirely new
foreach($items as $index => $i) {
foreach($articles as $a) {
if(
// the item matches if the GUID matches...
($i->id && $i->id === $a['guid']) ||
// ... or if any one of the hashes match
$i->urlTitleHash === $a['url_title_hash'] ||
$i->urlContentHash === $a['url_content_hash'] ||
$i->titleContentHash === $a['title_content_hash']
) {
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
// if the item has an edit timestamp and it doesn't match that of the article in the database, then the article has been edited
// we store the item index and database record ID as a key/value pair
$edited[$index] = $a['id'];
break;
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) {
// if any of the hashes do not match, then the article has been edited
$edited[$index] = $a['id'];
break;
} else {
// otherwise the item is unchanged and we can ignore it
break;
}
} else {
// if we don't have a match, add the item to the tentatively new list
$tentative[] = $index;
}
}
}
if(sizeof($tentative)) {
// if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items
$ids = $hashesUT = $hashesUC = $hashesTC = [];
foreach($tentative as $index) {
$i = $items[$index];
if($i->id) $ids[] = $id->id;
$hashesUT[] = $i->urlTitleHash;
$hashesUC[] = $i->urlContentHash;
$hashesTC[] = $i->titleContentHash;
}
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
foreach($tentative as $index) {
$i = $items[$index];
foreach($articles as $a) {
if(
// the item matches if the GUID matches...
@ -116,62 +203,19 @@ class Feed {
break;
}
} else {
// if we don't have a match, add the item to the tentatively new list
$tentative[] = $index;
// if we don't have a match, add the item to the definite new list
$new[] = $index;
}
}
}
if(sizeof($tentative)) {
// if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items
$ids = $hashesUT = $hashesUC = $hashesTC = [];
foreach($tentative as $index) {
$i = $items[$index];
if($i->id) $ids[] = $id->id;
$hashesUT[] = $i->urlTitleHash;
$hashesUC[] = $i->urlContentHash;
$hashesTC[] = $i->titleContentHash;
}
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
foreach($tentative as $index) {
$i = $items[$index];
foreach($articles as $a) {
if(
// the item matches if the GUID matches...
($i->id && $i->id === $a['guid']) ||
// ... or if any one of the hashes match
$i->urlTitleHash === $a['url_title_hash'] ||
$i->urlContentHash === $a['url_content_hash'] ||
$i->titleContentHash === $a['title_content_hash']
) {
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
// if the item has an edit timestamp and it doesn't match that of the article in the database, then the article has been edited
// we store the item index and database record ID as a key/value pair
$edited[$index] = $a['id'];
break;
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) {
// if any of the hashes do not match, then the article has been edited
$edited[$index] = $a['id'];
break;
} else {
// otherwise the item is unchanged and we can ignore it
break;
}
} else {
// if we don't have a match, add the item to the definite new list
$new[] = $index;
}
}
}
}
// FIXME: fetch full content when appropriate
foreach($new as $index) {
$this->newItems[] = $items[$index];
}
foreach($edited as $index => $id) {
$this->changedItems[$id] = $items[$index];
}
}
$this->data = $feed;
// FIXME: fetch full content when appropriate
foreach($new as $index) {
$this->newItems[] = $items[$index];
}
foreach($edited as $index => $id) {
$this->changedItems[$id] = $items[$index];
}
return true;
}
}