1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2025-01-08 17:02:41 +00:00

Deduplicate feed items within the feed itself

This commit is contained in:
J. King 2017-04-23 13:12:33 -04:00
parent 93c010d3d5
commit f842439b01

View file

@ -82,17 +82,104 @@ class Feed {
// If there aren't any of those there is no id. // If there aren't any of those there is no id.
$f->id = ''; $f->id = '';
} }
$this->data = $feed;
// if a feedID is supplied, determine which items are already in the database, which are not, and which might have been edited // if a feedID is supplied, determine which items are already in the database, which are not, and which might have been edited
if(!is_null($feedID)) { if(!is_null($feedID)) {
// FIXME: first perform deduplication on items $this->matchToDatabase($feedID);
// array of items in the fetched feed }
$items = $feed->items; return true;
// get as many of the latest articles in the database as there are in the feed }
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes protected function deduplicateItems(array $items): array {
$new = $tentative = $edited = []; /* Rationale:
// iterate through the articles and for each determine whether it is existing, edited, or entirely new Some newsfeeds (notably Planet) include multiple versions of an
foreach($items as $index => $i) { item if it is updated. As we only care about the latest, we
try to remove any "old" versions of an item that might also be
present within the feed.
*/
$out = [];
foreach($items as $item) {
foreach($out as $index => $check) {
// if the two items have the same ID or any one hash matches, they are two versions of the same item
if(
($item->id && $check->id && $item->id == $check->id) ||
$item->urlTitleHash == $check->urlTitleHash ||
$item->urlContentHash == $check->urlContentHash ||
$item->titleContentHash == $check->titleContentHash
) {
if(// because newsfeeds are usually ordered newest-first, the later item should only be used if...
// the later item has an update date and the existing item does not
($item->updatedDate && !$check->updatedDate) ||
// the later item has an update date newer than the existing item's
($item->updatedDate && $check->updatedDate && $item->updatedDate->getTimestamp() > $check->updatedDate->getTimestamp()) ||
// neither item has update dates, both have publish dates, and the later item has a newer publish date
(!$item->updatedDate && !$check->updatedDate && $item->publishedDate && $check->publishedDate && $item->publishedDate->getTimestamp() > $check->publishedDate->getTimestamp())
) {
// if the later item should be used, replace the existing one
$out[$index] = $item;
continue 2;
} else {
// otherwise skip the item
continue 2;
}
}
}
// if there was no match, add the item
$out[] = $item;
}
return $out;
}
protected function matchToDatabase(int $feedID): bool {
// first perform deduplication on items
$items = $this->deduplicateItems($this->data->items);
// get as many of the latest articles in the database as there are in the feed
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes
$new = $tentative = $edited = [];
// iterate through the articles and for each determine whether it is existing, edited, or entirely new
foreach($items as $index => $i) {
foreach($articles as $a) {
if(
// the item matches if the GUID matches...
($i->id && $i->id === $a['guid']) ||
// ... or if any one of the hashes match
$i->urlTitleHash === $a['url_title_hash'] ||
$i->urlContentHash === $a['url_content_hash'] ||
$i->titleContentHash === $a['title_content_hash']
) {
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
// if the item has an edit timestamp and it doesn't match that of the article in the database, then the article has been edited
// we store the item index and database record ID as a key/value pair
$edited[$index] = $a['id'];
break;
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) {
// if any of the hashes do not match, then the article has been edited
$edited[$index] = $a['id'];
break;
} else {
// otherwise the item is unchanged and we can ignore it
break;
}
} else {
// if we don't have a match, add the item to the tentatively new list
$tentative[] = $index;
}
}
}
if(sizeof($tentative)) {
// if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items
$ids = $hashesUT = $hashesUC = $hashesTC = [];
foreach($tentative as $index) {
$i = $items[$index];
if($i->id) $ids[] = $id->id;
$hashesUT[] = $i->urlTitleHash;
$hashesUC[] = $i->urlContentHash;
$hashesTC[] = $i->titleContentHash;
}
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
foreach($tentative as $index) {
$i = $items[$index];
foreach($articles as $a) { foreach($articles as $a) {
if( if(
// the item matches if the GUID matches... // the item matches if the GUID matches...
@ -116,62 +203,19 @@ class Feed {
break; break;
} }
} else { } else {
// if we don't have a match, add the item to the tentatively new list // if we don't have a match, add the item to the definite new list
$tentative[] = $index; $new[] = $index;
} }
} }
} }
if(sizeof($tentative)) {
// if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items
$ids = $hashesUT = $hashesUC = $hashesTC = [];
foreach($tentative as $index) {
$i = $items[$index];
if($i->id) $ids[] = $id->id;
$hashesUT[] = $i->urlTitleHash;
$hashesUC[] = $i->urlContentHash;
$hashesTC[] = $i->titleContentHash;
}
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
foreach($tentative as $index) {
$i = $items[$index];
foreach($articles as $a) {
if(
// the item matches if the GUID matches...
($i->id && $i->id === $a['guid']) ||
// ... or if any one of the hashes match
$i->urlTitleHash === $a['url_title_hash'] ||
$i->urlContentHash === $a['url_content_hash'] ||
$i->titleContentHash === $a['title_content_hash']
) {
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
// if the item has an edit timestamp and it doesn't match that of the article in the database, then the article has been edited
// we store the item index and database record ID as a key/value pair
$edited[$index] = $a['id'];
break;
} else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) {
// if any of the hashes do not match, then the article has been edited
$edited[$index] = $a['id'];
break;
} else {
// otherwise the item is unchanged and we can ignore it
break;
}
} else {
// if we don't have a match, add the item to the definite new list
$new[] = $index;
}
}
}
}
// FIXME: fetch full content when appropriate
foreach($new as $index) {
$this->newItems[] = $items[$index];
}
foreach($edited as $index => $id) {
$this->changedItems[$id] = $items[$index];
}
} }
$this->data = $feed; // FIXME: fetch full content when appropriate
foreach($new as $index) {
$this->newItems[] = $items[$index];
}
foreach($edited as $index => $id) {
$this->changedItems[$id] = $items[$index];
}
return true; return true;
} }
} }