mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2024-12-22 13:12:41 +00:00
More work on scraping
This commit is contained in:
parent
4cb23dd198
commit
76f70119fd
2 changed files with 43 additions and 39 deletions
|
@ -93,22 +93,23 @@ trait SeriesArticle {
|
|||
'feed' => "int",
|
||||
'folder' => "int",
|
||||
'title' => "str",
|
||||
'scrape' => "bool",
|
||||
],
|
||||
'rows' => [
|
||||
[1, "john.doe@example.com",1, null,"Subscription 1"],
|
||||
[2, "john.doe@example.com",2, null,null],
|
||||
[3, "john.doe@example.com",3, 1,"Subscription 3"],
|
||||
[4, "john.doe@example.com",4, 6,null],
|
||||
[5, "john.doe@example.com",10, 5,"Subscription 5"],
|
||||
[6, "jane.doe@example.com",1, null,null],
|
||||
[7, "jane.doe@example.com",10,null,"Subscription 7"],
|
||||
[8, "john.doe@example.org",11,null,null],
|
||||
[9, "john.doe@example.org",12,null,"Subscription 9"],
|
||||
[10,"john.doe@example.org",13,null,null],
|
||||
[11,"john.doe@example.net",10,null,"Subscription 11"],
|
||||
[12,"john.doe@example.net",2, 9,null],
|
||||
[13,"john.doe@example.net",3, 8,"Subscription 13"],
|
||||
[14,"john.doe@example.net",4, 7,null],
|
||||
[1, "john.doe@example.com",1, null,"Subscription 1",0],
|
||||
[2, "john.doe@example.com",2, null,null,0],
|
||||
[3, "john.doe@example.com",3, 1,"Subscription 3",0],
|
||||
[4, "john.doe@example.com",4, 6,null,0],
|
||||
[5, "john.doe@example.com",10, 5,"Subscription 5",0],
|
||||
[6, "jane.doe@example.com",1, null,null,0],
|
||||
[7, "jane.doe@example.com",10,null,"Subscription 7",0],
|
||||
[8, "john.doe@example.org",11,null,null,0],
|
||||
[9, "john.doe@example.org",12,null,"Subscription 9",0],
|
||||
[10,"john.doe@example.org",13,null,null,0],
|
||||
[11,"john.doe@example.net",10,null,"Subscription 11",0],
|
||||
[12,"john.doe@example.net",2, 9,null,0],
|
||||
[13,"john.doe@example.net",3, 8,"Subscription 13",0],
|
||||
[14,"john.doe@example.net",4, 7,null,0],
|
||||
],
|
||||
],
|
||||
'arsse_tag_members' => [
|
||||
|
@ -145,33 +146,34 @@ trait SeriesArticle {
|
|||
'url_content_hash' => "str",
|
||||
'title_content_hash' => "str",
|
||||
'modified' => "datetime",
|
||||
'content_scraped' => "str",
|
||||
],
|
||||
'rows' => [
|
||||
[1,1,null,"Title one", null,null,null,"First article", null,"","","","2000-01-01T00:00:00Z"],
|
||||
[2,1,null,"Title two", null,null,null,"Second article",null,"","","","2010-01-01T00:00:00Z"],
|
||||
[3,2,null,"Title three",null,null,null,"Third article", null,"","","","2000-01-01T00:00:00Z"],
|
||||
[4,2,null,null,"John Doe",null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[5,3,null,null,"John Doe",null,null,null,null,"","","","2000-01-01T00:00:00Z"],
|
||||
[6,3,null,null,"Jane Doe",null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[7,4,null,null,"Jane Doe",null,null,null,null,"","","","2000-01-01T00:00:00Z"],
|
||||
[8,4,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[9,5,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
|
||||
[10,5,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[11,6,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
|
||||
[12,6,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[13,7,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
|
||||
[14,7,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[15,8,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
|
||||
[16,8,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[17,9,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
|
||||
[18,9,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[19,10,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z"],
|
||||
[20,10,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z"],
|
||||
[101,11,'http://example.com/1','Article title 1','','2000-01-01 00:00:00','2000-01-01 00:00:01','<p>Article content 1</p>','e433653cef2e572eee4215fa299a4a5af9137b2cefd6283c85bd69a32915beda','f5cb8bfc1c7396dc9816af212a3e2ac5221585c2a00bf7ccb6aabd95dcfcd6a6','fb0bc8f8cb08913dc5a497db700e327f1d34e4987402687d494a5891f24714d4','18fdd4fa93d693128c43b004399e5c9cea6c261ddfa002518d3669f55d8c2207','2000-01-01 01:00:00'],
|
||||
[102,11,'http://example.com/2','Article title 2','','2000-01-02 00:00:00','2000-01-02 00:00:02','<p>Article content 2</p>','5be8a5a46ecd52ed132191c8d27fb1af6b3d4edc00234c5d9f8f0e10562ed3b7','0e86d2de822a174fe3c44a466953e63ca1f1a58a19cbf475fce0855d4e3d5153','13075894189c47ffcfafd1dfe7fbb539f7c74a69d35a399b3abf8518952714f9','2abd0a8cba83b8214a66c8f0293ba63e467d720540e29ff8ddcdab069d4f1c9e','2000-01-02 02:00:00'],
|
||||
[103,12,'http://example.com/3','Article title 3','','2000-01-03 00:00:00','2000-01-03 00:00:03','<p>Article content 3</p>','31a6594500a48b59fcc8a075ce82b946c9c3c782460d088bd7b8ef3ede97ad92','f74b06b240bd08abf4d3fdfc20dba6a6f6eb8b4f1a00e9a617efd63a87180a4b','b278380e984cefe63f0e412b88ffc9cb0befdfa06fdc00bace1da99a8daff406','ad622b31e739cd3a3f3c788991082cf4d2f7a8773773008e75f0572e58cd373b','2000-01-03 03:00:00'],
|
||||
[104,12,'http://example.com/4','Article title 4','','2000-01-04 00:00:00','2000-01-04 00:00:04','<p>Article content 4</p>','804e517d623390e71497982c77cf6823180342ebcd2e7d5e32da1e55b09dd180','f3615c7f16336d3ea242d35cf3fc17dbc4ee3afb78376bf49da2dd7a5a25dec8','f11c2b4046f207579aeb9c69a8c20ca5461cef49756ccfa5ba5e2344266da3b3','ab2da63276acce431250b18d3d49b988b226a99c7faadf275c90b751aee05be9','2000-01-04 04:00:00'],
|
||||
[105,13,'http://example.com/5','Article title 5','','2000-01-05 00:00:00','2000-01-05 00:00:05','<p>Article content 5</p>','db3e736c2c492f5def5c5da33ddcbea1824040e9ced2142069276b0a6e291a41','d40da96e39eea6c55948ccbe9b3d275b5f931298288dbe953990c5f496097022','834240f84501b5341d375414718204ec421561f3825d34c22bf9182203e42900','43b970ac6ec5f8a9647b2c7e4eed8b1d7f62e154a95eed748b0294c1256764ba','2000-01-05 05:00:00'],
|
||||
[1,1,null,"Title one", null,null,null,"First article", null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[2,1,null,"Title two", null,null,null,"Second article",null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[3,2,null,"Title three",null,null,null,"Third article", null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[4,2,null,null,"John Doe",null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[5,3,null,null,"John Doe",null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[6,3,null,null,"Jane Doe",null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[7,4,null,null,"Jane Doe",null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[8,4,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[9,5,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[10,5,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[11,6,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[12,6,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[13,7,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[14,7,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[15,8,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[16,8,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[17,9,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[18,9,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[19,10,null,null,null,null,null,null,null,"","","","2000-01-01T00:00:00Z",null],
|
||||
[20,10,null,null,null,null,null,null,null,"","","","2010-01-01T00:00:00Z",null],
|
||||
[101,11,'http://example.com/1','Article title 1','','2000-01-01 00:00:00','2000-01-01 00:00:01','<p>Article content 1</p>','e433653cef2e572eee4215fa299a4a5af9137b2cefd6283c85bd69a32915beda','f5cb8bfc1c7396dc9816af212a3e2ac5221585c2a00bf7ccb6aabd95dcfcd6a6','fb0bc8f8cb08913dc5a497db700e327f1d34e4987402687d494a5891f24714d4','18fdd4fa93d693128c43b004399e5c9cea6c261ddfa002518d3669f55d8c2207','2000-01-01 01:00:00',"<p>Scraped content 1</p>"],
|
||||
[102,11,'http://example.com/2','Article title 2','','2000-01-02 00:00:00','2000-01-02 00:00:02','<p>Article content 2</p>','5be8a5a46ecd52ed132191c8d27fb1af6b3d4edc00234c5d9f8f0e10562ed3b7','0e86d2de822a174fe3c44a466953e63ca1f1a58a19cbf475fce0855d4e3d5153','13075894189c47ffcfafd1dfe7fbb539f7c74a69d35a399b3abf8518952714f9','2abd0a8cba83b8214a66c8f0293ba63e467d720540e29ff8ddcdab069d4f1c9e','2000-01-02 02:00:00',null],
|
||||
[103,12,'http://example.com/3','Article title 3','','2000-01-03 00:00:00','2000-01-03 00:00:03','<p>Article content 3</p>','31a6594500a48b59fcc8a075ce82b946c9c3c782460d088bd7b8ef3ede97ad92','f74b06b240bd08abf4d3fdfc20dba6a6f6eb8b4f1a00e9a617efd63a87180a4b','b278380e984cefe63f0e412b88ffc9cb0befdfa06fdc00bace1da99a8daff406','ad622b31e739cd3a3f3c788991082cf4d2f7a8773773008e75f0572e58cd373b','2000-01-03 03:00:00',null],
|
||||
[104,12,'http://example.com/4','Article title 4','','2000-01-04 00:00:00','2000-01-04 00:00:04','<p>Article content 4</p>','804e517d623390e71497982c77cf6823180342ebcd2e7d5e32da1e55b09dd180','f3615c7f16336d3ea242d35cf3fc17dbc4ee3afb78376bf49da2dd7a5a25dec8','f11c2b4046f207579aeb9c69a8c20ca5461cef49756ccfa5ba5e2344266da3b3','ab2da63276acce431250b18d3d49b988b226a99c7faadf275c90b751aee05be9','2000-01-04 04:00:00',null],
|
||||
[105,13,'http://example.com/5','Article title 5','','2000-01-05 00:00:00','2000-01-05 00:00:05','<p>Article content 5</p>','db3e736c2c492f5def5c5da33ddcbea1824040e9ced2142069276b0a6e291a41','d40da96e39eea6c55948ccbe9b3d275b5f931298288dbe953990c5f496097022','834240f84501b5341d375414718204ec421561f3825d34c22bf9182203e42900','43b970ac6ec5f8a9647b2c7e4eed8b1d7f62e154a95eed748b0294c1256764ba','2000-01-05 05:00:00',null],
|
||||
],
|
||||
],
|
||||
'arsse_enclosures' => [
|
||||
|
|
|
@ -369,6 +369,8 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
|
|||
// now try to scrape and get different content
|
||||
$f = new Feed(null, $this->base."Scraping/Feed", "", "", "", "", true);
|
||||
$exp = "<p>Partial content, followed by more content</p>";
|
||||
$this->assertSame($exp, $f->newItems[0]->scrapedContent);
|
||||
$exp = "<p>Partial content</p>";
|
||||
$this->assertSame($exp, $f->newItems[0]->content);
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in a new issue