1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2024-12-22 05:02:40 +00:00

Add ability to enable scraper

Also transfer any existing scraper booleans on database upgrade. It was
previously possible to enable scraping manually by editing the database,
and these settings will be honoured.
This commit is contained in:
J. King 2021-01-16 19:06:20 -05:00
parent 7897585d98
commit 86897af0b3
6 changed files with 61 additions and 39 deletions

View file

@ -898,6 +898,7 @@ class Database {
* - "title": The title of the subscription
* - "folder": The numeric identifier (or null) of the subscription's folder
* - "pinned": Whether the subscription is pinned
* - "scrape": Whether to scrape full article contents from the HTML article
* - "order_type": Whether articles should be sorted in reverse chronological order (2), chronological order (1), or the default (0)
* - "keep_rule": The subscription's "keep" filter rule; articles which do not match this are hidden
* - "block_rule": The subscription's "block" filter rule; articles which match this are hidden
@ -948,6 +949,7 @@ class Database {
'pinned' => "strict bool",
'keep_rule' => "str",
'block_rule' => "str",
'scrape' => "bool",
];
[$setClause, $setTypes, $setValues] = $this->generateSet($data, $valid);
if (!$setClause) {

View file

@ -33,6 +33,7 @@ create table arsse_user_meta(
) character set utf8mb4 collate utf8mb4_unicode_ci;
-- Add a per-subscription "scrape" flag, off (0) by default
alter table arsse_subscriptions add column scrape boolean not null default 0;
-- Carry existing settings over: turn the flag on for every subscription whose feed had scrape = 1.
-- This must run before the feeds column is dropped below.
update arsse_subscriptions set scrape = 1 where feed in (select id from arsse_feeds where scrape = 1);
-- The flag now lives on subscriptions, so remove it from feeds
alter table arsse_feeds drop column scrape;
-- Add a nullable "content_scraped" column to articles; existing rows get null
alter table arsse_articles add column content_scraped longtext;

View file

@ -33,6 +33,7 @@ create table arsse_user_meta(
);
-- Add a per-subscription "scrape" flag (stored as a smallint), off (0) by default
alter table arsse_subscriptions add column scrape smallint not null default 0;
-- Carry existing settings over: turn the flag on for every subscription whose feed had scrape = 1.
-- This must run before the feeds column is dropped below.
update arsse_subscriptions set scrape = 1 where feed in (select id from arsse_feeds where scrape = 1);
-- The flag now lives on subscriptions, so remove it from feeds
alter table arsse_feeds drop column scrape;
-- Add a nullable "content_scraped" column to articles; existing rows get null
alter table arsse_articles add column content_scraped text;

View file

@ -44,8 +44,31 @@ create table arsse_user_meta(
primary key(owner,key)
) without rowid;
-- Add a "scrape" column for subscriptions and copy any existing scraping
-- settings over from the feeds table: a subscription gets scrape = 1 when
-- its feed currently has scrape = 1, and keeps the default of 0 otherwise
alter table arsse_subscriptions add column scrape boolean not null default 0;
update arsse_subscriptions set scrape = 1 where feed in (select id from arsse_feeds where scrape = 1);
-- Add a column for scraped article content, and re-order some columns
-- This is done by building a replacement table, copying every row into it,
-- dropping the old table, and renaming the replacement to take its place
create table arsse_articles_new(
-- entries in newsfeeds
id integer primary key, -- sequence number
feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription
url text, -- URL of article
title text collate nocase, -- article title
author text collate nocase, -- author's name
published text, -- time of original publication
edited text, -- time of last edit by author
modified text not null default CURRENT_TIMESTAMP, -- time when article was last modified in database
guid text, -- GUID
url_title_hash text not null, -- hash of URL + title; used when checking for updates and for identification if there is no guid.
url_content_hash text not null, -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
title_content_hash text not null, -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
content_scraped text, -- scraped content, as HTML
content text -- content, as HTML
);
-- The select list is positional and follows the column order of the new table;
-- the literal null fills content_scraped, so no existing article has scraped content yet
insert into arsse_articles_new select id, feed, url, title, author, published, edited, modified, guid, url_title_hash, url_content_hash, title_content_hash, null, content from arsse_articles;
-- Swap the rebuilt table in under the original name
drop table arsse_articles;
alter table arsse_articles_new rename to arsse_articles;
-- Add a separate table for feed icons and replace their URLs in the feeds table with their IDs
-- Also remove the "scrape" column of the feeds table, which was never an advertised feature
@ -88,28 +111,6 @@ insert into arsse_feeds_new
drop table arsse_feeds;
alter table arsse_feeds_new rename to arsse_feeds;
-- Add a column for scraped article content, and re-order some column
create table arsse_articles_new(
-- entries in newsfeeds
id integer primary key, -- sequence number
feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription
url text, -- URL of article
title text collate nocase, -- article title
author text collate nocase, -- author's name
published text, -- time of original publication
edited text, -- time of last edit by author
modified text not null default CURRENT_TIMESTAMP, -- time when article was last modified in database
guid text, -- GUID
url_title_hash text not null, -- hash of URL + title; used when checking for updates and for identification if there is no guid.
url_content_hash text not null, -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
title_content_hash text not null, -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
content_scraped text, -- scraped content, as HTML
content text -- content, as HTML
);
insert into arsse_articles_new select id, feed, url, title, author, published, edited, modified, guid, url_title_hash, url_content_hash, title_content_hash, null, content from arsse_articles;
drop table arsse_articles;
alter table arsse_articles_new rename to arsse_articles;
-- set version marker
-- The version is recorded in two places, which must agree: the database's
-- user_version pragma and the 'schema_version' row of arsse_meta (stored as a string)
pragma user_version = 7;
update arsse_meta set value = '7' where "key" = 'schema_version';

View file

@ -80,13 +80,14 @@ trait SeriesSubscription {
'order_type' => "int",
'keep_rule' => "str",
'block_rule' => "str",
'scrape' => "bool",
],
'rows' => [
[1,"john.doe@example.com",2,null,null,1,2,null,null],
[2,"jane.doe@example.com",2,null,null,0,0,null,null],
[3,"john.doe@example.com",3,"Ook",2,0,1,null,null],
[4,"jill.doe@example.com",2,null,null,0,0,null,null],
[5,"jack.doe@example.com",2,null,null,1,2,"","3|E"],
[1,"john.doe@example.com",2,null,null,1,2,null,null,0],
[2,"jane.doe@example.com",2,null,null,0,0,null,null,0],
[3,"john.doe@example.com",3,"Ook",2,0,1,null,null,0],
[4,"jill.doe@example.com",2,null,null,0,0,null,null,0],
[5,"jack.doe@example.com",2,null,null,1,2,"","3|E",0],
],
],
'arsse_tags' => [
@ -409,22 +410,23 @@ trait SeriesSubscription {
'title' => "Ook Ook",
'folder' => 3,
'pinned' => false,
'scrape' => true,
'order_type' => 0,
'keep_rule' => "ook",
'block_rule' => "eek",
]);
$state = $this->primeExpectations($this->data, [
'arsse_feeds' => ['id','url','username','password','title'],
'arsse_subscriptions' => ['id','owner','feed','title','folder','pinned','order_type','keep_rule','block_rule'],
'arsse_subscriptions' => ['id','owner','feed','title','folder','pinned','order_type','keep_rule','block_rule','scrape'],
]);
$state['arsse_subscriptions']['rows'][0] = [1,"john.doe@example.com",2,"Ook Ook",3,0,0,"ook","eek"];
$state['arsse_subscriptions']['rows'][0] = [1,"john.doe@example.com",2,"Ook Ook",3,0,0,"ook","eek",1];
$this->compareExpectations(static::$drv, $state);
Arsse::$db->subscriptionPropertiesSet($this->user, 1, [
'title' => null,
'keep_rule' => null,
'block_rule' => null,
]);
$state['arsse_subscriptions']['rows'][0] = [1,"john.doe@example.com",2,null,3,0,0,null,null];
$state['arsse_subscriptions']['rows'][0] = [1,"john.doe@example.com",2,null,3,0,0,null,null,1];
$this->compareExpectations(static::$drv, $state);
// making no changes is a valid result
Arsse::$db->subscriptionPropertiesSet($this->user, 1, ['unhinged' => true]);

View file

@ -139,14 +139,22 @@ class BaseUpdate extends \JKingWeb\Arsse\Test\AbstractTest {
$this->drv->schemaUpdate(6);
$this->drv->exec(
<<<QUERY_TEXT
INSERT INTO arsse_users values('a', 'xyz');
INSERT INTO arsse_users values('b', 'abc');
INSERT INTO arsse_folders(owner,name) values('a', '1');
INSERT INTO arsse_folders(owner,name) values('b', '2');
INSERT INTO arsse_feeds(url,favicon) values('http://example.com/', 'http://example.com/icon');
INSERT INTO arsse_feeds(url,favicon) values('http://example.org/', 'http://example.org/icon');
INSERT INTO arsse_feeds(url,favicon) values('https://example.com/', 'http://example.com/icon');
INSERT INTO arsse_feeds(url,favicon) values('http://example.net/', null);
INSERT INTO arsse_users values
('a', 'xyz'),
('b', 'abc');
INSERT INTO arsse_folders(owner,name) values
('a', '1'),
('b', '2');
INSERT INTO arsse_feeds(id,scrape,url,favicon) values
(1, 1, 'http://example.com/', 'http://example.com/icon'),
(2, 0, 'http://example.org/', 'http://example.org/icon'),
(3, 0, 'https://example.com/', 'http://example.com/icon'),
(4, 0, 'http://example.net/', null);
INSERT INTO arsse_subscriptions(id,owner,feed) values
(1, 'a', 1),
(2, 'b', 1),
(3, 'a', 2),
(4, 'b', 2);
QUERY_TEXT
);
$this->drv->schemaUpdate(7);
@ -168,9 +176,16 @@ QUERY_TEXT
['url' => 'https://example.com/', 'icon' => 1],
['url' => 'http://example.net/', 'icon' => null],
];
$subs = [
['id' => 1, 'scrape' => 1],
['id' => 2, 'scrape' => 1],
['id' => 3, 'scrape' => 0],
['id' => 4, 'scrape' => 0],
];
$this->assertEquals($users, $this->drv->query("SELECT id, password, num from arsse_users order by id")->getAll());
$this->assertEquals($folders, $this->drv->query("SELECT owner, name from arsse_folders order by owner")->getAll());
$this->assertEquals($icons, $this->drv->query("SELECT id, url from arsse_icons order by id")->getAll());
$this->assertEquals($feeds, $this->drv->query("SELECT url, icon from arsse_feeds order by id")->getAll());
$this->assertEquals($subs, $this->drv->query("SELECT id, scrape from arsse_subscriptions order by id")->getAll());
}
}