1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2024-12-22 13:12:41 +00:00

More work on scraping

This commit is contained in:
J. King 2021-01-16 16:48:35 -05:00
parent 4cb23dd198
commit 76f70119fd
2 changed files with 43 additions and 39 deletions

View file

@ -93,22 +93,23 @@ trait SeriesArticle {
'feed' => "int",
'folder' => "int",
'title' => "str",
'scrape' => "bool",
],
'rows' => [
[1, "john.doe@example.com",1, null,"Subscription 1"],
[2, "john.doe@example.com",2, null,null],
[3, "john.doe@example.com",3, 1,"Subscription 3"],
[4, "john.doe@example.com",4, 6,null],
[5, "john.doe@example.com",10, 5,"Subscription 5"],
[6, "jane.doe@example.com",1, null,null],
[7, "jane.doe@example.com",10,null,"Subscription 7"],
[8, "john.doe@example.org",11,null,null],
[9, "john.doe@example.org",12,null,"Subscription 9"],
[10,"john.doe@example.org",13,null,null],
[11,"john.doe@example.net",10,null,"Subscription 11"],
[12,"john.doe@example.net",2, 9,null],
[13,"john.doe@example.net",3, 8,"Subscription 13"],
[14,"john.doe@example.net",4, 7,null],
[1, "john.doe@example.com",1, null,"Subscription 1",0],
[2, "john.doe@example.com",2, null,null,0],
[3, "john.doe@example.com",3, 1,"Subscription 3",0],
[4, "john.doe@example.com",4, 6,null,0],
[5, "john.doe@example.com",10, 5,"Subscription 5",0],
[6, "jane.doe@example.com",1, null,null,0],
[7, "jane.doe@example.com",10,null,"Subscription 7",0],
[8, "john.doe@example.org",11,null,null,0],
[9, "john.doe@example.org",12,null,"Subscription 9",0],
[10,"john.doe@example.org",13,null,null,0],
[11,"john.doe@example.net",10,null,"Subscription 11",0],
[12,"john.doe@example.net",2, 9,null,0],
[13,"john.doe@example.net",3, 8,"Subscription 13",0],
[14,"john.doe@example.net",4, 7,null,0],
],
],
'arsse_tag_members' => [
@ -145,33 +146,34 @@ trait SeriesArticle {
'url_content_hash' => "str",
'title_content_hash' => "str",
'modified' => "datetime",
'content_scraped' => "str",
],
'rows' => [
[1,1,null,"Title one", null,null,null,"First article", null,"","","","2000-01-01T00:00:00Z"],
[2,1,null,"Title two", null,null,null,"Second article",null,"","","","2010-01-01T00:00:00Z"],
[3,2,null,"Title three",null,null,null,"Third article", null,"","","","2000-01-01T00:00:00Z"],
[4,2,null,null,"John Doe",null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[5,3,null,null,"John Doe",null,null,null,null,"","","","2000-01-01T00:00:00Z"],
[6,3,null,null,"Jane Doe",null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[7,4,null,null,"Jane Doe",null,null,null,null,"","","","2000-01-01T00:00:00Z"],
[8,4,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[9,5,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
[10,5,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[11,6,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
[12,6,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[13,7,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
[14,7,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[15,8,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
[16,8,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[17,9,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
[18,9,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[19,10,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
[20,10,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
[101,11,'http://example.com/1','Article title 1','','2000-01-01 00:00:00','2000-01-01 00:00:01','<p>Article content 1</p>','e433653cef2e572eee4215fa299a4a5af9137b2cefd6283c85bd69a32915beda','f5cb8bfc1c7396dc9816af212a3e2ac5221585c2a00bf7ccb6aabd95dcfcd6a6','fb0bc8f8cb08913dc5a497db700e327f1d34e4987402687d494a5891f24714d4','18fdd4fa93d693128c43b004399e5c9cea6c261ddfa002518d3669f55d8c2207','2000-01-01 01:00:00'],
[102,11,'http://example.com/2','Article title 2','','2000-01-02 00:00:00','2000-01-02 00:00:02','<p>Article content 2</p>','5be8a5a46ecd52ed132191c8d27fb1af6b3d4edc00234c5d9f8f0e10562ed3b7','0e86d2de822a174fe3c44a466953e63ca1f1a58a19cbf475fce0855d4e3d5153','13075894189c47ffcfafd1dfe7fbb539f7c74a69d35a399b3abf8518952714f9','2abd0a8cba83b8214a66c8f0293ba63e467d720540e29ff8ddcdab069d4f1c9e','2000-01-02 02:00:00'],
[103,12,'http://example.com/3','Article title 3','','2000-01-03 00:00:00','2000-01-03 00:00:03','<p>Article content 3</p>','31a6594500a48b59fcc8a075ce82b946c9c3c782460d088bd7b8ef3ede97ad92','f74b06b240bd08abf4d3fdfc20dba6a6f6eb8b4f1a00e9a617efd63a87180a4b','b278380e984cefe63f0e412b88ffc9cb0befdfa06fdc00bace1da99a8daff406','ad622b31e739cd3a3f3c788991082cf4d2f7a8773773008e75f0572e58cd373b','2000-01-03 03:00:00'],
[104,12,'http://example.com/4','Article title 4','','2000-01-04 00:00:00','2000-01-04 00:00:04','<p>Article content 4</p>','804e517d623390e71497982c77cf6823180342ebcd2e7d5e32da1e55b09dd180','f3615c7f16336d3ea242d35cf3fc17dbc4ee3afb78376bf49da2dd7a5a25dec8','f11c2b4046f207579aeb9c69a8c20ca5461cef49756ccfa5ba5e2344266da3b3','ab2da63276acce431250b18d3d49b988b226a99c7faadf275c90b751aee05be9','2000-01-04 04:00:00'],
[105,13,'http://example.com/5','Article title 5','','2000-01-05 00:00:00','2000-01-05 00:00:05','<p>Article content 5</p>','db3e736c2c492f5def5c5da33ddcbea1824040e9ced2142069276b0a6e291a41','d40da96e39eea6c55948ccbe9b3d275b5f931298288dbe953990c5f496097022','834240f84501b5341d375414718204ec421561f3825d34c22bf9182203e42900','43b970ac6ec5f8a9647b2c7e4eed8b1d7f62e154a95eed748b0294c1256764ba','2000-01-05 05:00:00'],
[1,1,null,"Title one", null,null,null,"First article", null,"","","","2000-01-01T00:00:00Z",null],
[2,1,null,"Title two", null,null,null,"Second article",null,"","","","2010-01-01T00:00:00Z",null],
[3,2,null,"Title three",null,null,null,"Third article", null,"","","","2000-01-01T00:00:00Z",null],
[4,2,null,null,"John Doe",null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[5,3,null,null,"John Doe",null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
[6,3,null,null,"Jane Doe",null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[7,4,null,null,"Jane Doe",null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
[8,4,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[9,5,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
[10,5,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[11,6,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
[12,6,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[13,7,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
[14,7,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[15,8,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
[16,8,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[17,9,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
[18,9,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[19,10,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
[20,10,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
[101,11,'http://example.com/1','Article title 1','','2000-01-01 00:00:00','2000-01-01 00:00:01','<p>Article content 1</p>','e433653cef2e572eee4215fa299a4a5af9137b2cefd6283c85bd69a32915beda','f5cb8bfc1c7396dc9816af212a3e2ac5221585c2a00bf7ccb6aabd95dcfcd6a6','fb0bc8f8cb08913dc5a497db700e327f1d34e4987402687d494a5891f24714d4','18fdd4fa93d693128c43b004399e5c9cea6c261ddfa002518d3669f55d8c2207','2000-01-01 01:00:00',"<p>Scraped content 1</p>"],
[102,11,'http://example.com/2','Article title 2','','2000-01-02 00:00:00','2000-01-02 00:00:02','<p>Article content 2</p>','5be8a5a46ecd52ed132191c8d27fb1af6b3d4edc00234c5d9f8f0e10562ed3b7','0e86d2de822a174fe3c44a466953e63ca1f1a58a19cbf475fce0855d4e3d5153','13075894189c47ffcfafd1dfe7fbb539f7c74a69d35a399b3abf8518952714f9','2abd0a8cba83b8214a66c8f0293ba63e467d720540e29ff8ddcdab069d4f1c9e','2000-01-02 02:00:00',null],
[103,12,'http://example.com/3','Article title 3','','2000-01-03 00:00:00','2000-01-03 00:00:03','<p>Article content 3</p>','31a6594500a48b59fcc8a075ce82b946c9c3c782460d088bd7b8ef3ede97ad92','f74b06b240bd08abf4d3fdfc20dba6a6f6eb8b4f1a00e9a617efd63a87180a4b','b278380e984cefe63f0e412b88ffc9cb0befdfa06fdc00bace1da99a8daff406','ad622b31e739cd3a3f3c788991082cf4d2f7a8773773008e75f0572e58cd373b','2000-01-03 03:00:00',null],
[104,12,'http://example.com/4','Article title 4','','2000-01-04 00:00:00','2000-01-04 00:00:04','<p>Article content 4</p>','804e517d623390e71497982c77cf6823180342ebcd2e7d5e32da1e55b09dd180','f3615c7f16336d3ea242d35cf3fc17dbc4ee3afb78376bf49da2dd7a5a25dec8','f11c2b4046f207579aeb9c69a8c20ca5461cef49756ccfa5ba5e2344266da3b3','ab2da63276acce431250b18d3d49b988b226a99c7faadf275c90b751aee05be9','2000-01-04 04:00:00',null],
[105,13,'http://example.com/5','Article title 5','','2000-01-05 00:00:00','2000-01-05 00:00:05','<p>Article content 5</p>','db3e736c2c492f5def5c5da33ddcbea1824040e9ced2142069276b0a6e291a41','d40da96e39eea6c55948ccbe9b3d275b5f931298288dbe953990c5f496097022','834240f84501b5341d375414718204ec421561f3825d34c22bf9182203e42900','43b970ac6ec5f8a9647b2c7e4eed8b1d7f62e154a95eed748b0294c1256764ba','2000-01-05 05:00:00',null],
],
],
'arsse_enclosures' => [

View file

@ -369,6 +369,8 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
// now try to scrape and get different content
$f = new Feed(null, $this->base."Scraping/Feed", "", "", "", "", true);
$exp = "<p>Partial content, followed by more content</p>";
$this->assertSame($exp, $f->newItems[0]->scrapedContent);
$exp = "<p>Partial content</p>";
$this->assertSame($exp, $f->newItems[0]->content);
}