mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2024-12-22 13:12:41 +00:00
Fixes to Feed class for bugs uncovered by initial deduplication tests
Three bugs: - The parser wrapping was generating hashes for fallback values in absence of actual values for both URL and title (which is valid if obscure RSS), URL and content, or title and content; now fallback values are detected and empty strings used instead of hashes - The deduplicator was assuming all three hashes would always exist, which is no longer the case - The database matcher was making the same assumption as the deduplicator
This commit is contained in:
parent
148875c0dd
commit
fd7d1c3192
4 changed files with 116 additions and 17 deletions
48
lib/Feed.php
48
lib/Feed.php
|
@ -82,9 +82,25 @@ class Feed {
|
|||
foreach ($feed->items as $f) {
|
||||
// Hashes used for comparison to check for updates and also to identify when an
|
||||
// id doesn't exist.
|
||||
$f->urlTitleHash = hash('sha256', $f->url.$f->title);
|
||||
$f->urlContentHash = hash('sha256', $f->url.$f->content.$f->enclosureUrl.$f->enclosureType);
|
||||
$f->titleContentHash = hash('sha256', $f->title.$f->content.$f->enclosureUrl.$f->enclosureType);
|
||||
$content = $f->content.$f->enclosureUrl.$f->enclosureType;
|
||||
// if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title
|
||||
if($f->url==$feed->siteUrl && $f->title==$feed->siteUrl) {
|
||||
$f->urlTitleHash = "";
|
||||
} else {
|
||||
$f->urlTitleHash = hash('sha256', $f->url.$f->title);
|
||||
}
|
||||
// if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed
|
||||
if(!strlen($content) && $f->url==$feed->siteUrl) {
|
||||
$f->urlContentHash = "";
|
||||
} else {
|
||||
$f->urlContentHash = hash('sha256', $f->url.$content);
|
||||
}
|
||||
// if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed
|
||||
if(!strlen($content) && $f->title==$f->url) {
|
||||
$f->titleContentHash = "";
|
||||
} else {
|
||||
$f->titleContentHash = hash('sha256', $f->title.$content);
|
||||
}
|
||||
|
||||
// If there is an id element then continue. The id is used already.
|
||||
$id = (string)$f->xml->id;
|
||||
|
@ -126,9 +142,9 @@ class Feed {
|
|||
// if the two items have the same ID or any one hash matches, they are two versions of the same item
|
||||
if(
|
||||
($item->id && $check->id && $item->id == $check->id) ||
|
||||
$item->urlTitleHash == $check->urlTitleHash ||
|
||||
$item->urlContentHash == $check->urlContentHash ||
|
||||
$item->titleContentHash == $check->titleContentHash
|
||||
($item->urlTitleHash && $item->urlTitleHash == $check->urlTitleHash) ||
|
||||
($item->urlContentHash && $item->urlContentHash == $check->urlContentHash) ||
|
||||
($item->titleContentHash && $item->titleContentHash == $check->titleContentHash)
|
||||
) {
|
||||
if(// because newsfeeds are usually order newest-first, the later item should only be used if...
|
||||
// the later item has an update date and the existing item does not
|
||||
|
@ -173,9 +189,9 @@ class Feed {
|
|||
// the item matches if the GUID matches...
|
||||
($i->id && $i->id === $a['guid']) ||
|
||||
// ... or if any one of the hashes match
|
||||
$i->urlTitleHash === $a['url_title_hash'] ||
|
||||
$i->urlContentHash === $a['url_content_hash'] ||
|
||||
$i->titleContentHash === $a['title_content_hash']
|
||||
($i->urlTitleHash && $i->urlTitleHash === $a['url_title_hash']) ||
|
||||
($i->urlContentHash && $i->urlContentHash === $a['url_content_hash']) ||
|
||||
($i->titleContentHash && $i->titleContentHash === $a['title_content_hash'])
|
||||
) {
|
||||
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
|
||||
// if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited
|
||||
|
@ -202,10 +218,10 @@ class Feed {
|
|||
$ids = $hashesUT = $hashesUC = $hashesTC = [];
|
||||
foreach($tentative as $index) {
|
||||
$i = $items[$index];
|
||||
if($i->id) $ids[] = $i->id;
|
||||
$hashesUT[] = $i->urlTitleHash;
|
||||
$hashesUC[] = $i->urlContentHash;
|
||||
$hashesTC[] = $i->titleContentHash;
|
||||
if($i->id) $ids[] = $i->id;
|
||||
if($i->urlTitleHash) $hashesUT[] = $i->urlTitleHash;
|
||||
if($i->urlContentHash) $hashesUC[] = $i->urlContentHash;
|
||||
if($i->titleContentHash) $hashesTC[] = $i->titleContentHash;
|
||||
}
|
||||
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
|
||||
foreach($tentative as $index) {
|
||||
|
@ -216,9 +232,9 @@ class Feed {
|
|||
// the item matches if the GUID matches...
|
||||
($i->id && $i->id === $a['guid']) ||
|
||||
// ... or if any one of the hashes match
|
||||
$i->urlTitleHash === $a['url_title_hash'] ||
|
||||
$i->urlContentHash === $a['url_content_hash'] ||
|
||||
$i->titleContentHash === $a['title_content_hash']
|
||||
($i->urlTitleHash && $i->urlTitleHash === $a['url_title_hash']) ||
|
||||
($i->urlContentHash && $i->urlContentHash === $a['url_content_hash']) ||
|
||||
($i->titleContentHash && $i->titleContentHash === $a['title_content_hash'])
|
||||
) {
|
||||
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
|
||||
// if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited
|
||||
|
|
|
@ -19,6 +19,16 @@ class TestFeed extends \PHPUnit\Framework\TestCase {
|
|||
Data::$conf = new Conf();
|
||||
}
|
||||
|
||||
function testDeduplicateFeedItems() {
|
||||
$t = strtotime("2002-05-19T15:21:36Z");
|
||||
$f = new Feed(null, $this->base."Deduplication/Permalink-Dates");
|
||||
$this->assertCount(2, $f->newItems);
|
||||
$this->assertTime($t, $f->newItems[0]->updatedDate);
|
||||
$f = new Feed(null, $this->base."Deduplication/ID-Dates");
|
||||
$this->assertCount(2, $f->newItems);
|
||||
$this->assertTime($t, $f->newItems[0]->updatedDate);
|
||||
}
|
||||
|
||||
function testHandleCacheHeadersOn304() {
|
||||
// upon 304, the client should re-use the caching header values it supplied the server
|
||||
$t = time();
|
||||
|
@ -63,7 +73,6 @@ class TestFeed extends \PHPUnit\Framework\TestCase {
|
|||
$t = time();
|
||||
$f = new Feed(null, $this->base."Caching/200None");
|
||||
$this->assertTime($t, $f->lastModified);
|
||||
|
||||
}
|
||||
|
||||
function testComputeNextFetchOnError() {
|
||||
|
|
37
tests/docroot/Feed/Deduplication/ID-Dates.php
Normal file
37
tests/docroot/Feed/Deduplication/ID-Dates.php
Normal file
|
@ -0,0 +1,37 @@
|
|||
<?php return [
|
||||
'mime' => "application/rss+xml",
|
||||
'content' => <<<MESSAGE_BODY
|
||||
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
|
||||
<channel>
|
||||
<title>Test feed</title>
|
||||
<link>http://example.com/</link>
|
||||
<description>A basic feed for testing</description>
|
||||
|
||||
<item>
|
||||
<guid>1</guid>
|
||||
<description>Sample article 1</description>
|
||||
<pubDate>Sun, 18 May 1995 15:21:36 GMT</pubDate>
|
||||
<atom:updated>2002-02-19T15:21:36Z</atom:updated>
|
||||
</item>
|
||||
<item>
|
||||
<guid>1</guid>
|
||||
<description>Sample article 2</description>
|
||||
<pubDate>Sun, 19 May 2002 15:21:36 GMT</pubDate> <!-- This is the correct item and date -->
|
||||
<atom:updated>2002-04-19T15:21:36Z</atom:updated>
|
||||
</item>
|
||||
<item>
|
||||
<guid>1</guid>
|
||||
<description>Sample article 3</description>
|
||||
<pubDate>Sun, 18 May 2000 15:21:36 GMT</pubDate>
|
||||
<atom:updated>1999-05-19T15:21:36Z</atom:updated>
|
||||
</item>
|
||||
<item>
|
||||
<guid>2</guid>
|
||||
<description>Sample article 4</description>
|
||||
<pubDate>Sun, 18 May 2000 15:21:36 GMT</pubDate>
|
||||
<atom:updated>1999-05-19T15:21:36Z</atom:updated>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
MESSAGE_BODY
|
||||
];
|
37
tests/docroot/Feed/Deduplication/Permalink-Dates.php
Normal file
37
tests/docroot/Feed/Deduplication/Permalink-Dates.php
Normal file
|
@ -0,0 +1,37 @@
|
|||
<?php return [
|
||||
'mime' => "application/rss+xml",
|
||||
'content' => <<<MESSAGE_BODY
|
||||
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
|
||||
<channel>
|
||||
<title>Test feed</title>
|
||||
<link>http://example.com/</link>
|
||||
<description>A basic feed for testing</description>
|
||||
|
||||
<item>
|
||||
<guid>http://example.com/1</guid>
|
||||
<description>Sample article 1</description>
|
||||
<pubDate>Sun, 18 May 1995 15:21:36 GMT</pubDate>
|
||||
<atom:updated>2002-02-19T15:21:36Z</atom:updated>
|
||||
</item>
|
||||
<item>
|
||||
<guid>http://example.com/1</guid>
|
||||
<description>Sample article 2</description>
|
||||
<pubDate>Sun, 19 May 2002 15:21:36 GMT</pubDate> <!-- This is the correct item and date -->
|
||||
<atom:updated>2002-04-19T15:21:36Z</atom:updated>
|
||||
</item>
|
||||
<item>
|
||||
<guid>http://example.com/1</guid>
|
||||
<description>Sample article 3</description>
|
||||
<pubDate>Sun, 18 May 2000 15:21:36 GMT</pubDate>
|
||||
<atom:updated>1999-05-19T15:21:36Z</atom:updated>
|
||||
</item>
|
||||
<item>
|
||||
<guid>http://example.com/2</guid>
|
||||
<description>Sample article 4</description>
|
||||
<pubDate>Sun, 18 May 2000 15:21:36 GMT</pubDate>
|
||||
<atom:updated>1999-05-19T15:21:36Z</atom:updated>
|
||||
</item>
|
||||
</channel>
|
||||
</rss>
|
||||
MESSAGE_BODY
|
||||
];
|
Loading…
Reference in a new issue