From 5b8f23441c7c070b6114c0bc5af2c1c696b6f52e Mon Sep 17 00:00:00 2001 From: "J. King" Date: Tue, 30 May 2017 20:18:04 -0400 Subject: [PATCH] Feed tests: article matching These should be the last required tests for the Feed class --- lib/Database.php | 4 +- lib/Feed.php | 95 ++++++++++--------------------- tests/Feed/TestFeed.php | 93 ++++++++++++++++++++++++++++++ tests/docroot/Feed/Matching/1.php | 51 +++++++++++++++++ tests/docroot/Feed/Matching/2.php | 59 +++++++++++++++++++ tests/docroot/Feed/Matching/3.php | 59 +++++++++++++++++++ tests/docroot/Feed/Matching/4.php | 58 +++++++++++++++++++ tests/docroot/Feed/Matching/5.php | 51 +++++++++++++++++ 8 files changed, 402 insertions(+), 68 deletions(-) create mode 100644 tests/docroot/Feed/Matching/1.php create mode 100644 tests/docroot/Feed/Matching/2.php create mode 100644 tests/docroot/Feed/Matching/3.php create mode 100644 tests/docroot/Feed/Matching/4.php create mode 100644 tests/docroot/Feed/Matching/5.php diff --git a/lib/Database.php b/lib/Database.php index aa6b8c70..0ffadec8 100644 --- a/lib/Database.php +++ b/lib/Database.php @@ -658,14 +658,14 @@ class Database { return true; } - public function articleMatchLatest(int $feedID, int $count): Db\Result { + public function feedMatchLatest(int $feedID, int $count): Db\Result { return $this->db->prepare( 'SELECT id, DATEFORMAT("unix", edited) AS edited_date, guid, url_title_hash, url_content_hash, title_content_hash FROM arsse_articles WHERE feed is ? ORDER BY edited desc limit ?', 'int', 'int' )->run($feedID, $count); } - public function articleMatchIds(int $feedID, array $ids = [], array $hashesUT = [], array $hashesUC = [], array $hashesTC = []): Db\Result { + public function feedMatchIds(int $feedID, array $ids = [], array $hashesUT = [], array $hashesUC = [], array $hashesTC = []): Db\Result { // compile SQL IN() clauses and necessary type bindings for the four identifier lists list($cId, $tId) = $this->generateIn($ids, "str"); list($cHashUT, $tHashUT) = $this->generateIn($hashesUT, "str"); diff --git a/lib/Feed.php b/lib/Feed.php index 33a3b34c..9b4ab2ef 100644 --- a/lib/Feed.php +++ b/lib/Feed.php @@ -127,7 +127,7 @@ class Feed { } // If there aren't any of those there is no id. - $f->id = ''; + $f->id = null; } $this->data = $feed; return true; @@ -184,11 +184,30 @@ class Feed { return true; } // get as many of the latest articles in the database as there are in the feed - $articles = Data::$db->articleMatchLatest($feedID, sizeof($items)); - // arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes - $new = $tentative = $edited = []; + $articles = Data::$db->feedMatchLatest($feedID, sizeof($items))->getAll(); + // perform a first pass matching the latest articles against items in the feed + list($this->newItems, $this->changedItems) = $this->matchItems($items, $articles); + if(sizeof($this->newItems) && sizeof($items) <= sizeof($articles)) { + // if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items + $ids = $hashesUT = $hashesUC = $hashesTC = []; + foreach($this->newItems as $i) { + if($i->id) $ids[] = $i->id; + if($i->urlTitleHash) $hashesUT[] = $i->urlTitleHash; + if($i->urlContentHash) $hashesUC[] = $i->urlContentHash; + if($i->titleContentHash) $hashesTC[] = $i->titleContentHash; + } + $articles = Data::$db->feedMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC)->getAll(); + list($this->newItems, $changed) = $this->matchItems($this->newItems, $articles); + $this->changedItems = array_merge($this->changedItems, $changed); + } + // TODO: fetch full content when appropriate + return true; + } + + public function matchItems(array $items, array $articles): array { + $new = $edited = []; // iterate through the articles and for each determine whether it is existing, edited, or entirely new - foreach($items as $index => $i) { + foreach($items as $i) { $found = false; foreach($articles as $a) { // if the item has an ID and it doesn't match the article ID, the two don't match, regardless of hashes @@ -201,16 +220,16 @@ class Feed { ($i->urlContentHash && $i->urlContentHash === $a['url_content_hash']) || ($i->titleContentHash && $i->titleContentHash === $a['title_content_hash']) ) { - if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) { + if($i->updatedDate && $i->updatedDate->getTimestamp() !== $a['edited_date']) { // if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited // we store the item index and database record ID as a key/value pair $found = true; - $edited[$index] = $a['id']; + $edited[$a['id']] = $i; break; } else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) { // if any of the hashes do not match, then the article has been edited $found = true; - $edited[$index] = $a['id']; + $edited[$a['id']] = $i; break; } else { // otherwise the item is unchanged and we can ignore it @@ -219,65 +238,9 @@ class Feed { } } } - if(!$found) $tentative[] = $index; + if(!$found) $new[] = $i; } - if(sizeof($tentative) && sizeof($items)==sizeof($articles)) { - // if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items - $ids = $hashesUT = $hashesUC = $hashesTC = []; - foreach($tentative as $index) { - $i = $items[$index]; - if($i->id) $ids[] = $i->id; - if($i->urlTitleHash) $hashesUT[] = $i->urlTitleHash; - if($i->urlContentHash) $hashesUC[] = $i->urlContentHash; - if($i->titleContentHash) $hashesTC[] = $i->titleContentHash; - } - $articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC); - foreach($tentative as $index) { - $i = $items[$index]; - $found = false; - foreach($articles as $a) { - // if the item has an ID and it doesn't match the article ID, the two don't match, regardless of hashes - if($i->id && $i->id !== $a['guid']) continue; - if( - // the item matches if the GUID matches... - ($i->id && $i->id === $a['guid']) || - // ... or if any one of the hashes match - ($i->urlTitleHash && $i->urlTitleHash === $a['url_title_hash']) || - ($i->urlContentHash && $i->urlContentHash === $a['url_content_hash']) || - ($i->titleContentHash && $i->titleContentHash === $a['title_content_hash']) - ) { - if($i->updatedDate && $i->updatedDate->getTimestamp() !== $match['edited_date']) { - // if the item has an edit timestamp and it doesn't match that of the article in the database, the the article has been edited - // we store the item index and database record ID as a key/value pair - $found = true; - $edited[$index] = $a['id']; - break; - } else if($i->urlTitleHash !== $a['url_title_hash'] || $i->urlContentHash !== $a['url_content_hash'] || $i->titleContentHash !== $a['title_content_hash']) { - // if any of the hashes do not match, then the article has been edited - $found = true; - $edited[$index] = $a['id']; - break; - } else { - // otherwise the item is unchanged and we can ignore it - $found = true; - break; - } - } - } - if(!$found) $new[] = $index; - } - } else { - // if there are no tentatively new articles and/or the number of stored articles is less than the size of the feed, don't do a second pass; assume any tentatively new items are in fact new - $new = $tentative; - } - // TODO: fetch full content when appropriate - foreach($new as $index) { - $this->newItems[] = $items[$index]; - } - foreach($edited as $index => $id) { - $this->changedItems[$id] = $items[$index]; - } - return true; + return [$new, $edited]; } public function computeNextFetch(): \DateTime { diff --git a/tests/Feed/TestFeed.php b/tests/Feed/TestFeed.php index 0fb1851d..65c81f01 100644 --- a/tests/Feed/TestFeed.php +++ b/tests/Feed/TestFeed.php @@ -9,6 +9,74 @@ class TestFeed extends \PHPUnit\Framework\TestCase { protected static $host = "http://localhost:8000/"; protected $base = ""; + protected $latest = [ + [ + 'id' => 1, + 'edited_date' => 946684800, + 'guid' => 'e433653cef2e572eee4215fa299a4a5af9137b2cefd6283c85bd69a32915beda', + 'url_title_hash' => 'f5cb8bfc1c7396dc9816af212a3e2ac5221585c2a00bf7ccb6aabd95dcfcd6a6', + 'url_content_hash' => 'fb0bc8f8cb08913dc5a497db700e327f1d34e4987402687d494a5891f24714d4', + 'title_content_hash' => '18fdd4fa93d693128c43b004399e5c9cea6c261ddfa002518d3669f55d8c2207', + ], + [ + 'id' => 2, + 'edited_date' => 946771200, + 'guid' => '5be8a5a46ecd52ed132191c8d27fb1af6b3d4edc00234c5d9f8f0e10562ed3b7', + 'url_title_hash' => '0e86d2de822a174fe3c44a466953e63ca1f1a58a19cbf475fce0855d4e3d5153', + 'url_content_hash' => '13075894189c47ffcfafd1dfe7fbb539f7c74a69d35a399b3abf8518952714f9', + 'title_content_hash' => '2abd0a8cba83b8214a66c8f0293ba63e467d720540e29ff8ddcdab069d4f1c9e', + ], + [ + 'id' => 3, + 'edited_date' => 946857600, + 'guid' => '31a6594500a48b59fcc8a075ce82b946c9c3c782460d088bd7b8ef3ede97ad92', + 'url_title_hash' => 'f74b06b240bd08abf4d3fdfc20dba6a6f6eb8b4f1a00e9a617efd63a87180a4b', + 'url_content_hash' => 'b278380e984cefe63f0e412b88ffc9cb0befdfa06fdc00bace1da99a8daff406', + 'title_content_hash' => 'ad622b31e739cd3a3f3c788991082cf4d2f7a8773773008e75f0572e58cd373b', + ], + [ + 'id' => 4, + 'edited_date' => 946944000, + 'guid' => '804e517d623390e71497982c77cf6823180342ebcd2e7d5e32da1e55b09dd180', + 'url_title_hash' => 'f3615c7f16336d3ea242d35cf3fc17dbc4ee3afb78376bf49da2dd7a5a25dec8', + 'url_content_hash' => 'f11c2b4046f207579aeb9c69a8c20ca5461cef49756ccfa5ba5e2344266da3b3', + 'title_content_hash' => 'ab2da63276acce431250b18d3d49b988b226a99c7faadf275c90b751aee05be9', + ], + [ + 'id' => 5, + 'edited_date' => 947030400, + 'guid' => 'db3e736c2c492f5def5c5da33ddcbea1824040e9ced2142069276b0a6e291a41', + 'url_title_hash' => 'd40da96e39eea6c55948ccbe9b3d275b5f931298288dbe953990c5f496097022', + 'url_content_hash' => '834240f84501b5341d375414718204ec421561f3825d34c22bf9182203e42900', + 'title_content_hash' => '43b970ac6ec5f8a9647b2c7e4eed8b1d7f62e154a95eed748b0294c1256764ba', + ], + ]; + protected $others = [ + [ + 'id' => 6, + 'edited_date' => 947116800, + 'guid' => 'b3461ab8e8759eeb1d65a818c65051ec00c1dfbbb32a3c8f6999434e3e3b76ab', + 'url_title_hash' => '91d051a8e6749d014506848acd45e959af50bf876427c4f0e3a1ec0f04777b51', + 'url_content_hash' => '211d78b1a040d40d17e747a363cc283f58767b2e502630d8de9b8f1d5e941d18', + 'title_content_hash' => '5ed68ccb64243b8c1931241d2c9276274c3b1d87f223634aa7a1ab0141292ca7', + ], + [ + 'id' => 7, + 'edited_date' => 947203200, + 'guid' => 'f4fae999d6531747523f4ff0c74f3f0c7c588b67e4f32d8f7dba5f6f36e8a45d', + 'url_title_hash' => 'b92f805f0d0643dad1d6c0bb5cbaec24729f5f71b37b831cf7ad31f6c9403ac8', + 'url_content_hash' => '4fc8789b787246e9be08ca1bac0d4a1ac4db1984f0db07f7142417598cf7211f', + 'title_content_hash' => '491df9338740b5297b3a3e8292be992ac112eb676c34595f7a38f3ee646ffe84', + ], + [ + 'id' => 8, + 'edited_date' => 947289600, + 'guid' => 'b9d2d58e3172096b1d23b42a59961fabc89962836c3cd5de54f3d3a98ff08e6c', + 'url_title_hash' => '53a6cbcfeb66b46d09cbb7b25035df0562da35786933319c83b04be29acfb6f4', + 'url_content_hash' => 'c6f3722b4445b49d19d39c3bf5b11a7cf23dd69873e2a0a458aab662f1cd9438', + 'title_content_hash' => '607d2da48807ca984ce2a9faa1d291bd9e3de9e912f83306167f4f5cd3c23bbd', + ], + ]; function setUp() { if(!@file_get_contents(self::$host."IsUp")) { @@ -19,6 +87,7 @@ class TestFeed extends \PHPUnit\Framework\TestCase { $this->base = self::$host."Feed/"; $this->clearData(); Data::$conf = new Conf(); + Data::$db = Phake::mock(Database::class); } function testHandle400() { @@ -278,4 +347,28 @@ class TestFeed extends \PHPUnit\Framework\TestCase { $exp = strtotime("now + 1 hour"); $this->assertTime($exp, $f->nextFetch); } + + function testMatchLatestArticles() { + Phake::when(Data::$db)->feedMatchLatest(1, $this->anything())->thenReturn(new Test\Result($this->latest)); + $f = new Feed(1, $this->base."Matching/1"); + $this->assertCount(0, $f->newItems); + $this->assertCount(0, $f->changedItems); + $f = new Feed(1, $this->base."Matching/2"); + $this->assertCount(1, $f->newItems); + $this->assertCount(0, $f->changedItems); + $f = new Feed(1, $this->base."Matching/3"); + $this->assertCount(1, $f->newItems); + $this->assertCount(2, $f->changedItems); + $f = new Feed(1, $this->base."Matching/4"); + $this->assertCount(1, $f->newItems); + $this->assertCount(2, $f->changedItems); + } + + function testMatchHistoricalArticles() { + Phake::when(Data::$db)->feedMatchLatest(1, $this->anything())->thenReturn(new Test\Result($this->latest)); + Phake::when(Data::$db)->feedMatchIds(1, $this->anything(), $this->anything(), $this->anything(), $this->anything())->thenReturn(new Test\Result($this->others)); + $f = new Feed(1, $this->base."Matching/5"); + $this->assertCount(0, $f->newItems); + $this->assertCount(0, $f->changedItems); + } } \ No newline at end of file diff --git a/tests/docroot/Feed/Matching/1.php b/tests/docroot/Feed/Matching/1.php new file mode 100644 index 00000000..eb04a3dc --- /dev/null +++ b/tests/docroot/Feed/Matching/1.php @@ -0,0 +1,51 @@ + "application/atom+xml", + 'content' => << + Example feed title + urn:uuid:0fd8f6d8-43df-11e7-8511-9b59a0324eb8 + + + + urn:uuid:df329114-43df-11e7-9f23-a938604d62f8 + + Article title 1 + Article content 1 + 2000-01-01T00:00:00Z + 2000-01-01T00:00:00Z + + + urn:uuid:24382fa8-43e0-11e7-bd9c-559df0ea4b9b + + Article title 2 + Article content 2 + 2000-01-02T00:00:00Z + 2000-01-02T00:00:00Z + + + urn:uuid:03b9f558-43e1-11e7-87c5-ebaab4fd4cd1 + + Article title 3 + Article content 3 + 2000-01-03T00:00:00Z + 2000-01-03T00:00:00Z + + + urn:uuid:3d5f5154-43e1-11e7-ba11-1dcae392a974 + + Article title 4 + Article content 4 + 2000-01-04T00:00:00Z + 2000-01-04T00:00:00Z + + + urn:uuid:6d4c7964-43e1-11e7-92bd-4fed65d89793 + + Article title 5 + Article content 5 + 2000-01-05T00:00:00Z + 2000-01-05T00:00:00Z + + +MESSAGE_BODY +]; \ No newline at end of file diff --git a/tests/docroot/Feed/Matching/2.php b/tests/docroot/Feed/Matching/2.php new file mode 100644 index 00000000..0b0102e0 --- /dev/null +++ b/tests/docroot/Feed/Matching/2.php @@ -0,0 +1,59 @@ + "application/atom+xml", + 'content' => << + Example feed title + urn:uuid:0fd8f6d8-43df-11e7-8511-9b59a0324eb8 + + + + urn:uuid:df329114-43df-11e7-9f23-a938604d62f8 + + Article title 1 + Article content 1 + 2000-01-01T00:00:00Z + 2000-01-01T00:00:00Z + + + urn:uuid:24382fa8-43e0-11e7-bd9c-559df0ea4b9b + + Article title 2 + Article content 2 + 2000-01-02T00:00:00Z + 2000-01-02T00:00:00Z + + + urn:uuid:03b9f558-43e1-11e7-87c5-ebaab4fd4cd1 + + Article title 3 + Article content 3 + 2000-01-03T00:00:00Z + 2000-01-03T00:00:00Z + + + urn:uuid:3d5f5154-43e1-11e7-ba11-1dcae392a974 + + Article title 4 + Article content 4 + 2000-01-04T00:00:00Z + 2000-01-04T00:00:00Z + + + urn:uuid:6d4c7964-43e1-11e7-92bd-4fed65d89793 + + Article title 5 + Article content 5 + 2000-01-05T00:00:00Z + 2000-01-05T00:00:00Z + + + urn:uuid:b0b9698c-43e6-11e7-85b4-53a6b351844b + + Article title 6 + Article content 6 + 2000-01-06T00:00:00Z + 2000-01-06T00:00:00Z + + +MESSAGE_BODY +]; \ No newline at end of file diff --git a/tests/docroot/Feed/Matching/3.php b/tests/docroot/Feed/Matching/3.php new file mode 100644 index 00000000..1daca60a --- /dev/null +++ b/tests/docroot/Feed/Matching/3.php @@ -0,0 +1,59 @@ + "application/atom+xml", + 'content' => << + Example feed title + urn:uuid:0fd8f6d8-43df-11e7-8511-9b59a0324eb8 + + + + urn:uuid:df329114-43df-11e7-9f23-a938604d62f8 + + Article title 1 + Article content 1 + 2000-01-01T00:00:00Z + 2000-01-01T00:00:00Z + + + urn:uuid:24382fa8-43e0-11e7-bd9c-559df0ea4b9b + + Article title 2 + Article content 2 + 2000-01-02T00:00:00Z + 2000-01-02T00:00:00Z + + + urn:uuid:03b9f558-43e1-11e7-87c5-ebaab4fd4cd1 + + Article title 3 (updated) + Article content 3 + 2000-01-03T00:00:00Z + 2000-01-03T00:00:00Z + + + urn:uuid:3d5f5154-43e1-11e7-ba11-1dcae392a974 + + Article title 4 + Article content 4 + 2000-01-04T00:00:00Z + 2000-01-04T00:00:01Z + + + urn:uuid:6d4c7964-43e1-11e7-92bd-4fed65d89793 + + Article title 5 + Article content 5 + 2000-01-05T00:00:00Z + 2000-01-05T00:00:00Z + + + urn:uuid:b0b9698c-43e6-11e7-85b4-53a6b351844b + + Article title 6 + Article content 6 + 2000-01-06T00:00:00Z + 2000-01-06T00:00:00Z + + +MESSAGE_BODY +]; \ No newline at end of file diff --git a/tests/docroot/Feed/Matching/4.php b/tests/docroot/Feed/Matching/4.php new file mode 100644 index 00000000..1815ea4c --- /dev/null +++ b/tests/docroot/Feed/Matching/4.php @@ -0,0 +1,58 @@ + "application/atom+xml", + 'content' => << + Example feed title + urn:uuid:0fd8f6d8-43df-11e7-8511-9b59a0324eb8 + + + + urn:uuid:df329114-43df-11e7-9f23-a938604d62f8 + + Article title 1 + Article content 1 + 2000-01-01T00:00:00Z + 2000-01-01T00:00:00Z + + + urn:uuid:24382fa8-43e0-11e7-bd9c-559df0ea4b9b + + Article title 2 + Article content 2 + 2000-01-02T00:00:00Z + 2000-01-02T00:00:00Z + + + urn:uuid:03b9f558-43e1-11e7-87c5-ebaab4fd4cd1 + + Article title 3 (updated) + Article content 3 + 2000-01-03T00:00:00Z + 2000-01-03T00:00:00Z + + + + Article title 4 + Article content 4 with updates + 2000-01-04T00:00:00Z + 2000-01-04T00:00:00Z + + + urn:uuid:6d4c7964-43e1-11e7-92bd-4fed65d89793 + + Article title 5 + Article content 5 + 2000-01-05T00:00:00Z + 2000-01-05T00:00:00Z + + + urn:uuid:b0b9698c-43e6-11e7-85b4-53a6b351844b + + Article title 6 + Article content 6 + 2000-01-06T00:00:00Z + 2000-01-06T00:00:00Z + + +MESSAGE_BODY +]; \ No newline at end of file diff --git a/tests/docroot/Feed/Matching/5.php b/tests/docroot/Feed/Matching/5.php new file mode 100644 index 00000000..5cdb16cd --- /dev/null +++ b/tests/docroot/Feed/Matching/5.php @@ -0,0 +1,51 @@ + "application/atom+xml", + 'content' => << + Example feed title + urn:uuid:0fd8f6d8-43df-11e7-8511-9b59a0324eb8 + + + + urn:uuid:3d5f5154-43e1-11e7-ba11-1dcae392a974 + + Article title 4 + Article content 4 + 2000-01-04T00:00:00Z + 2000-01-04T00:00:00Z + + + urn:uuid:6d4c7964-43e1-11e7-92bd-4fed65d89793 + + Article title 5 + Article content 5 + 2000-01-05T00:00:00Z + 2000-01-05T00:00:00Z + + + urn:uuid:b0b9698c-43e6-11e7-85b4-53a6b351844b + + Article title 6 + Article content 6 + 2000-01-06T00:00:00Z + 2000-01-06T00:00:00Z + + + urn:uuid:7017ed6a-43ee-11e7-b2db-09225eb114d1 + + Article title 7 + Article content 7 + 2000-01-07T00:00:00Z + 2000-01-07T00:00:00Z + + + urn:uuid:845a98fe-43ee-11e7-a252-cde4cbf755f3 + + Article title 8 + Article content 8 + 2000-01-08T00:00:00Z + 2000-01-08T00:00:00Z + + +MESSAGE_BODY +]; \ No newline at end of file