mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2024-12-23 09:02:41 +00:00
Deduplicate feed items within the feed itself
This commit is contained in:
parent
93c010d3d5
commit
f842439b01
1 changed files with 105 additions and 61 deletions
54
lib/Feed.php
54
lib/Feed.php
|
@ -82,11 +82,57 @@ class Feed {
|
||||||
// If there aren't any of those there is no id.
|
// If there aren't any of those there is no id.
|
||||||
$f->id = '';
|
$f->id = '';
|
||||||
}
|
}
|
||||||
|
$this->data = $feed;
|
||||||
// if a feedID is supplied, determine which items are already in the database, which are not, and which might have been edited
|
// if a feedID is supplied, determine which items are already in the database, which are not, and which might have been edited
|
||||||
if(!is_null($feedID)) {
|
if(!is_null($feedID)) {
|
||||||
// FIXME: first perform deduplication on items
|
$this->matchToDatabase($feedID);
|
||||||
// array if items in the fetched feed
|
}
|
||||||
$items = $feed->items;
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Collapses multiple versions of the same item within a single feed down to one.
 *
 * Rationale:
 * Some newsfeeds (notably Planet) include multiple versions of an
 * item if it is updated. As we only care about the latest, we
 * try to remove any "old" versions of an item that might also be
 * present within the feed.
 *
 * Two items are considered versions of one another when they share a
 * non-empty ID, or when any one of their three hashes is identical.
 *
 * @param array $items Item objects exposing the properties id, urlTitleHash,
 *                     urlContentHash, titleContentHash, updatedDate and
 *                     publishedDate; each date is either falsy or an object
 *                     providing getTimestamp()
 * @return array The surviving items, indexed sequentially from zero; a
 *               surviving item occupies the position at which the first
 *               version of it was encountered
 */
protected function deduplicateItems(array $items): array {
    $out = [];
    foreach($items as $item) {
        foreach($out as $index => $check) {
            // Identifiers and hashes are opaque text, so they are compared as
            // strings with strict equality: loose comparison would evaluate two
            // numeric-looking strings numerically and wrongly treat distinct
            // values such as "1e3" and "1000" as equal. The casts keep null and
            // the empty string equivalent, as they were under loose comparison.
            $sameItem =
                ($item->id && $check->id && (string) $item->id === (string) $check->id) ||
                (string) $item->urlTitleHash === (string) $check->urlTitleHash ||
                (string) $item->urlContentHash === (string) $check->urlContentHash ||
                (string) $item->titleContentHash === (string) $check->titleContentHash;
            if(!$sameItem) {
                continue;
            }
            // because newsfeeds are usually ordered newest-first, the later item should only be used if...
            if(
                // the later item has an update date and the existing item does not
                ($item->updatedDate && !$check->updatedDate) ||
                // the later item has an update date newer than the existing item's
                ($item->updatedDate && $check->updatedDate && $item->updatedDate->getTimestamp() > $check->updatedDate->getTimestamp()) ||
                // neither item has update dates, both have publish dates, and the later item has a newer publish date
                (!$item->updatedDate && !$check->updatedDate && $item->publishedDate && $check->publishedDate && $item->publishedDate->getTimestamp() > $check->publishedDate->getTimestamp())
            ) {
                // the later item wins: it takes over the existing item's slot
                $out[$index] = $item;
            }
            // whether it replaced the existing version or was discarded, this
            // item has been dealt with, so move on to the next feed item
            continue 2;
        }
        // if there was no match, add the item
        $out[] = $item;
    }
    return $out;
}
|
||||||
|
|
||||||
|
protected function matchToDatabase(int $feedID): bool {
|
||||||
|
// first perform deduplication on items
|
||||||
|
$items = $this->deduplicateItems($this->data->items);
|
||||||
// get as many of the latest articles in the database as there are in the feed
|
// get as many of the latest articles in the database as there are in the feed
|
||||||
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
|
$articles = Data::$db->articleMatchLatest($feedID, sizeof($items));
|
||||||
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes
|
// arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes
|
||||||
|
@ -170,8 +216,6 @@ class Feed {
|
||||||
foreach($edited as $index => $id) {
|
foreach($edited as $index => $id) {
|
||||||
$this->changedItems[$id] = $items[$index];
|
$this->changedItems[$id] = $items[$index];
|
||||||
}
|
}
|
||||||
}
|
|
||||||
$this->data = $feed;
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in a new issue