From fd7d1c3192bb086fa1f041c2e11e727b47c9a582 Mon Sep 17 00:00:00 2001 From: "J. King" Date: Tue, 23 May 2017 20:39:29 -0400 Subject: [PATCH] Fixes to Feed class for bugs uncovered by initial deduplication tests Three bugs: - The parser wrapping was generating hashes for fallback values in absence of actual values for both URL and title (which is valid if obscure RSS), URL and content, or title and content; now fallback values are detected and empty strings used instead of hashes - The deduplicator was assuming all three hashes would always exist, which is no longer the case - The database matcher was making the same assumption as the deduplicator --- lib/Feed.php | 48 ++++++++++++------- tests/Feed/TestFeed.php | 11 ++++- tests/docroot/Feed/Deduplication/ID-Dates.php | 37 ++++++++++++++ .../Feed/Deduplication/Permalink-Dates.php | 37 ++++++++++++++ 4 files changed, 116 insertions(+), 17 deletions(-) create mode 100644 tests/docroot/Feed/Deduplication/ID-Dates.php create mode 100644 tests/docroot/Feed/Deduplication/Permalink-Dates.php diff --git a/lib/Feed.php b/lib/Feed.php index 014297bd..7c89dcfe 100644 --- a/lib/Feed.php +++ b/lib/Feed.php @@ -82,9 +82,25 @@ class Feed { foreach ($feed->items as $f) { // Hashes used for comparison to check for updates and also to identify when an // id doesn't exist. - $f->urlTitleHash = hash('sha256', $f->url.$f->title); - $f->urlContentHash = hash('sha256', $f->url.$f->content.$f->enclosureUrl.$f->enclosureType); - $f->titleContentHash = hash('sha256', $f->title.$f->content.$f->enclosureUrl.$f->enclosureType); + $content = $f->content.$f->enclosureUrl.$f->enclosureType; + // if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title + if($f->url==$feed->siteUrl && $f->title==$feed->siteUrl) { + $f->urlTitleHash = ""; + } else { + $f->urlTitleHash = hash('sha256', $f->url.$f->title); + } + // if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed + if(!strlen($content) && $f->url==$feed->siteUrl) { + $f->urlContentHash = ""; + } else { + $f->urlContentHash = hash('sha256', $f->url.$content); + } + // if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed + if(!strlen($content) && $f->title==$f->url) { + $f->titleContentHash = ""; + } else { + $f->titleContentHash = hash('sha256', $f->title.$content); + } // If there is an id element then continue. The id is used already. $id = (string)$f->xml->id; @@ -126,9 +142,9 @@ class Feed { // if the two items have the same ID or any one hash matches, they are two versions of the same item if( ($item->id && $check->id && $item->id == $check->id) || - $item->urlTitleHash == $check->urlTitleHash || - $item->urlContentHash == $check->urlContentHash || - $item->titleContentHash == $check->titleContentHash + ($item->urlTitleHash && $item->urlTitleHash == $check->urlTitleHash) || + ($item->urlContentHash && $item->urlContentHash == $check->urlContentHash) || + ($item->titleContentHash && $item->titleContentHash == $check->titleContentHash) ) { if(// because newsfeeds are usually order newest-first, the later item should only be used if... // the later item has an update date and the existing item does not @@ -173,9 +189,9 @@ class Feed { // the item matches if the GUID matches... ($i->id && $i->id === $a['guid']) || // ... or if any one of the hashes match - $i->urlTitleHash === $a['url_title_hash'] || - $i->urlContentHash === $a['url_content_hash'] || - $i->titleContentHash === $a['title_content_hash'] + ($i->urlTitleHash && $i->urlTitleHash === $a['url_title_hash']) || + ($i->urlContentHash && $i->urlContentHash === $a['url_content_hash']) || + ($i->titleContentHash && $i->titleContentHash === $a['title_content_hash']) ) { if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) { // if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited @@ -202,10 +218,10 @@ class Feed { $ids = $hashesUT = $hashesUC = $hashesTC = []; foreach($tentative as $index) { $i = $items[$index]; - if($i->id) $ids[] = $i->id; - $hashesUT[] = $i->urlTitleHash; - $hashesUC[] = $i->urlContentHash; - $hashesTC[] = $i->titleContentHash; + if($i->id) $ids[] = $i->id; + if($i->urlTitleHash) $hashesUT[] = $i->urlTitleHash; + if($i->urlContentHash) $hashesUC[] = $i->urlContentHash; + if($i->titleContentHash) $hashesTC[] = $i->titleContentHash; } $articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC); foreach($tentative as $index) { @@ -216,9 +232,9 @@ class Feed { // the item matches if the GUID matches... ($i->id && $i->id === $a['guid']) || // ... or if any one of the hashes match - $i->urlTitleHash === $a['url_title_hash'] || - $i->urlContentHash === $a['url_content_hash'] || - $i->titleContentHash === $a['title_content_hash'] + ($i->urlTitleHash && $i->urlTitleHash === $a['url_title_hash']) || + ($i->urlContentHash && $i->urlContentHash === $a['url_content_hash']) || + ($i->titleContentHash && $i->titleContentHash === $a['title_content_hash']) ) { if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) { // if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited diff --git a/tests/Feed/TestFeed.php b/tests/Feed/TestFeed.php index bbb301fe..dd294a27 100644 --- a/tests/Feed/TestFeed.php +++ b/tests/Feed/TestFeed.php @@ -19,6 +19,16 @@ class TestFeed extends \PHPUnit\Framework\TestCase { Data::$conf = new Conf(); } + function testDeduplicateFeedItems() { + $t = strtotime("2002-05-19T15:21:36Z"); + $f = new Feed(null, $this->base."Deduplication/Permalink-Dates"); + $this->assertCount(2, $f->newItems); + $this->assertTime($t, $f->newItems[0]->updatedDate); + $f = new Feed(null, $this->base."Deduplication/ID-Dates"); + $this->assertCount(2, $f->newItems); + $this->assertTime($t, $f->newItems[0]->updatedDate); + } + function testHandleCacheHeadersOn304() { // upon 304, the client should re-use the caching header values it supplied the server $t = time(); @@ -63,7 +73,6 @@ class TestFeed extends \PHPUnit\Framework\TestCase { $t = time(); $f = new Feed(null, $this->base."Caching/200None"); $this->assertTime($t, $f->lastModified); - } function testComputeNextFetchOnError() { diff --git a/tests/docroot/Feed/Deduplication/ID-Dates.php b/tests/docroot/Feed/Deduplication/ID-Dates.php new file mode 100644 index 00000000..98837ea3 --- /dev/null +++ b/tests/docroot/Feed/Deduplication/ID-Dates.php @@ -0,0 +1,37 @@ + "application/rss+xml", + 'content' => << + + Test feed + http://example.com/ + A basic feed for testing + + + 1 + Sample article 1 + Sun, 18 May 1995 15:21:36 GMT + 2002-02-19T15:21:36Z + + + 1 + Sample article 2 + Sun, 19 May 2002 15:21:36 GMT + 2002-04-19T15:21:36Z + + + 1 + Sample article 3 + Sun, 18 May 2000 15:21:36 GMT + 1999-05-19T15:21:36Z + + + 2 + Sample article 4 + Sun, 18 May 2000 15:21:36 GMT + 1999-05-19T15:21:36Z + + + +MESSAGE_BODY +]; \ No newline at end of file diff --git a/tests/docroot/Feed/Deduplication/Permalink-Dates.php b/tests/docroot/Feed/Deduplication/Permalink-Dates.php new file mode 100644 index 00000000..51cc69d7 --- /dev/null +++ b/tests/docroot/Feed/Deduplication/Permalink-Dates.php @@ -0,0 +1,37 @@ + "application/rss+xml", + 'content' => << + + Test feed + http://example.com/ + A basic feed for testing + + + http://example.com/1 + Sample article 1 + Sun, 18 May 1995 15:21:36 GMT + 2002-02-19T15:21:36Z + + + http://example.com/1 + Sample article 2 + Sun, 19 May 2002 15:21:36 GMT + 2002-04-19T15:21:36Z + + + http://example.com/1 + Sample article 3 + Sun, 18 May 2000 15:21:36 GMT + 1999-05-19T15:21:36Z + + + http://example.com/2 + Sample article 4 + Sun, 18 May 2000 15:21:36 GMT + 1999-05-19T15:21:36Z + + + +MESSAGE_BODY +]; \ No newline at end of file