1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2024-12-22 13:12:41 +00:00

Fixes to Feed class for bugs uncovered by initial deduplication tests

Three bugs:
- The parser wrapper was generating hashes from fallback values when an item lacked actual values for both URL and title (valid, if obscure, RSS), for URL and content, or for title and content; fallback values are now detected and empty strings are stored instead of hashes
- The deduplicator was assuming all three hashes would always exist, which is no longer the case
- The database matcher was making the same assumption as the deduplicator
This commit is contained in:
J. King 2017-05-23 20:39:29 -04:00
parent 148875c0dd
commit fd7d1c3192
4 changed files with 116 additions and 17 deletions

View file

@ -82,9 +82,25 @@ class Feed {
foreach ($feed->items as $f) {
// Hashes used for comparison to check for updates and also to identify when an
// id doesn't exist.
$f->urlTitleHash = hash('sha256', $f->url.$f->title);
$f->urlContentHash = hash('sha256', $f->url.$f->content.$f->enclosureUrl.$f->enclosureType);
$f->titleContentHash = hash('sha256', $f->title.$f->content.$f->enclosureUrl.$f->enclosureType);
$content = $f->content.$f->enclosureUrl.$f->enclosureType;
// if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title
if($f->url==$feed->siteUrl && $f->title==$feed->siteUrl) {
$f->urlTitleHash = "";
} else {
$f->urlTitleHash = hash('sha256', $f->url.$f->title);
}
// if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed
if(!strlen($content) && $f->url==$feed->siteUrl) {
$f->urlContentHash = "";
} else {
$f->urlContentHash = hash('sha256', $f->url.$content);
}
// if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed
if(!strlen($content) && $f->title==$f->url) {
$f->titleContentHash = "";
} else {
$f->titleContentHash = hash('sha256', $f->title.$content);
}
// If there is an id element then continue. The id is used already.
$id = (string)$f->xml->id;
@ -126,9 +142,9 @@ class Feed {
// if the two items have the same ID or any one hash matches, they are two versions of the same item
if(
($item->id && $check->id && $item->id == $check->id) ||
$item->urlTitleHash == $check->urlTitleHash ||
$item->urlContentHash == $check->urlContentHash ||
$item->titleContentHash == $check->titleContentHash
($item->urlTitleHash && $item->urlTitleHash == $check->urlTitleHash) ||
($item->urlContentHash && $item->urlContentHash == $check->urlContentHash) ||
($item->titleContentHash && $item->titleContentHash == $check->titleContentHash)
) {
if(// because newsfeeds are usually ordered newest-first, the later item should only be used if...
// the later item has an update date and the existing item does not
@ -173,9 +189,9 @@ class Feed {
// the item matches if the GUID matches...
($i->id && $i->id === $a['guid']) ||
// ... or if any one of the hashes match
$i->urlTitleHash === $a['url_title_hash'] ||
$i->urlContentHash === $a['url_content_hash'] ||
$i->titleContentHash === $a['title_content_hash']
($i->urlTitleHash && $i->urlTitleHash === $a['url_title_hash']) ||
($i->urlContentHash && $i->urlContentHash === $a['url_content_hash']) ||
($i->titleContentHash && $i->titleContentHash === $a['title_content_hash'])
) {
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
// if the item has an edit timestamp and it doesn't match that of the article in the database, then the article has been edited
@ -202,10 +218,10 @@ class Feed {
$ids = $hashesUT = $hashesUC = $hashesTC = [];
foreach($tentative as $index) {
$i = $items[$index];
if($i->id) $ids[] = $i->id;
$hashesUT[] = $i->urlTitleHash;
$hashesUC[] = $i->urlContentHash;
$hashesTC[] = $i->titleContentHash;
if($i->id) $ids[] = $i->id;
if($i->urlTitleHash) $hashesUT[] = $i->urlTitleHash;
if($i->urlContentHash) $hashesUC[] = $i->urlContentHash;
if($i->titleContentHash) $hashesTC[] = $i->titleContentHash;
}
$articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
foreach($tentative as $index) {
@ -216,9 +232,9 @@ class Feed {
// the item matches if the GUID matches...
($i->id && $i->id === $a['guid']) ||
// ... or if any one of the hashes match
$i->urlTitleHash === $a['url_title_hash'] ||
$i->urlContentHash === $a['url_content_hash'] ||
$i->titleContentHash === $a['title_content_hash']
($i->urlTitleHash && $i->urlTitleHash === $a['url_title_hash']) ||
($i->urlContentHash && $i->urlContentHash === $a['url_content_hash']) ||
($i->titleContentHash && $i->titleContentHash === $a['title_content_hash'])
) {
if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) {
// if the item has an edit timestamp and it doesn't match that of the article in the database, then the article has been edited

View file

@ -19,6 +19,16 @@ class TestFeed extends \PHPUnit\Framework\TestCase {
Data::$conf = new Conf();
}
function testDeduplicateFeedItems() {
    // Both fixtures describe the same scenario — three versions of one article
    // (identified via permalink or via plain ID respectively) plus one distinct
    // article — so deduplication should yield exactly two new items, with the
    // surviving version of the duplicate carrying the 2002-05-19 date.
    $expected = strtotime("2002-05-19T15:21:36Z");
    foreach(["Deduplication/Permalink-Dates", "Deduplication/ID-Dates"] as $fixture) {
        $feed = new Feed(null, $this->base.$fixture);
        $this->assertCount(2, $feed->newItems);
        $this->assertTime($expected, $feed->newItems[0]->updatedDate);
    }
}
function testHandleCacheHeadersOn304() {
// upon 304, the client should re-use the caching header values it supplied the server
$t = time();
@ -63,7 +73,6 @@ class TestFeed extends \PHPUnit\Framework\TestCase {
$t = time();
$f = new Feed(null, $this->base."Caching/200None");
$this->assertTime($t, $f->lastModified);
}
function testComputeNextFetchOnError() {

View file

@ -0,0 +1,37 @@
<?php
// Test fixture: an RSS 2.0 feed containing three versions of the same article
// (all sharing the plain <guid> "1") plus one distinct article (<guid> "2").
// NOTE(review): the file name is not visible in this diff view; the plain
// numeric GUIDs suggest this is the "Deduplication/ID-Dates" fixture loaded by
// TestFeed::testDeduplicateFeedItems(), which expects two items to survive
// deduplication, the duplicate resolved to the version dated 2002-05-19.
// The heredoc body is served verbatim to the test HTTP client — do not edit it.
return [
'mime' => "application/rss+xml",
'content' => <<<MESSAGE_BODY
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>Test feed</title>
<link>http://example.com/</link>
<description>A basic feed for testing</description>
<item>
<guid>1</guid>
<description>Sample article 1</description>
<pubDate>Sun, 18 May 1995 15:21:36 GMT</pubDate>
<atom:updated>2002-02-19T15:21:36Z</atom:updated>
</item>
<item>
<guid>1</guid>
<description>Sample article 2</description>
<pubDate>Sun, 19 May 2002 15:21:36 GMT</pubDate> <!-- This is the correct item and date -->
<atom:updated>2002-04-19T15:21:36Z</atom:updated>
</item>
<item>
<guid>1</guid>
<description>Sample article 3</description>
<pubDate>Sun, 18 May 2000 15:21:36 GMT</pubDate>
<atom:updated>1999-05-19T15:21:36Z</atom:updated>
</item>
<item>
<guid>2</guid>
<description>Sample article 4</description>
<pubDate>Sun, 18 May 2000 15:21:36 GMT</pubDate>
<atom:updated>1999-05-19T15:21:36Z</atom:updated>
</item>
</channel>
</rss>
MESSAGE_BODY
];

View file

@ -0,0 +1,37 @@
<?php
// Test fixture: an RSS 2.0 feed containing three versions of the same article
// (all sharing the permalink-style <guid> "http://example.com/1") plus one
// distinct article ("http://example.com/2").
// NOTE(review): the file name is not visible in this diff view; the URL GUIDs
// suggest this is the "Deduplication/Permalink-Dates" fixture loaded by
// TestFeed::testDeduplicateFeedItems(), which expects two items to survive
// deduplication, the duplicate resolved to the version dated 2002-05-19.
// The heredoc body is served verbatim to the test HTTP client — do not edit it.
return [
'mime' => "application/rss+xml",
'content' => <<<MESSAGE_BODY
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
<channel>
<title>Test feed</title>
<link>http://example.com/</link>
<description>A basic feed for testing</description>
<item>
<guid>http://example.com/1</guid>
<description>Sample article 1</description>
<pubDate>Sun, 18 May 1995 15:21:36 GMT</pubDate>
<atom:updated>2002-02-19T15:21:36Z</atom:updated>
</item>
<item>
<guid>http://example.com/1</guid>
<description>Sample article 2</description>
<pubDate>Sun, 19 May 2002 15:21:36 GMT</pubDate> <!-- This is the correct item and date -->
<atom:updated>2002-04-19T15:21:36Z</atom:updated>
</item>
<item>
<guid>http://example.com/1</guid>
<description>Sample article 3</description>
<pubDate>Sun, 18 May 2000 15:21:36 GMT</pubDate>
<atom:updated>1999-05-19T15:21:36Z</atom:updated>
</item>
<item>
<guid>http://example.com/2</guid>
<description>Sample article 4</description>
<pubDate>Sun, 18 May 2000 15:21:36 GMT</pubDate>
<atom:updated>1999-05-19T15:21:36Z</atom:updated>
</item>
</channel>
</rss>
MESSAGE_BODY
];