mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2024-12-22 21:22:40 +00:00
Partial implementation of proper content scraping
This commit is contained in:
parent
e74b44cc39
commit
4cb23dd198
5 changed files with 53 additions and 12 deletions
|
@ -1126,12 +1126,19 @@ class Database {
|
||||||
if (!V::id($feedID)) {
|
if (!V::id($feedID)) {
|
||||||
throw new Db\ExceptionInput("typeViolation", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID, 'type' => "int > 0"]);
|
throw new Db\ExceptionInput("typeViolation", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID, 'type' => "int > 0"]);
|
||||||
}
|
}
|
||||||
$f = $this->db->prepare("SELECT url, username, password, modified, etag, err_count, scrape FROM arsse_feeds where id = ?", "int")->run($feedID)->getRow();
|
$f = $this->db->prepareArray(
|
||||||
|
"SELECT
|
||||||
|
url, username, password, modified, etag, err_count, scrapers
|
||||||
|
FROM arsse_feeds as f
|
||||||
|
left join (select feed, count(*) as scrapers from arsse_subscriptions where scrape = 1 group by feed) as s on f.id = s.feed
|
||||||
|
where id = ?",
|
||||||
|
["int"]
|
||||||
|
)->run($feedID)->getRow();
|
||||||
if (!$f) {
|
if (!$f) {
|
||||||
throw new Db\ExceptionInput("subjectMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
|
throw new Db\ExceptionInput("subjectMissing", ["action" => __FUNCTION__, "field" => "feed", 'id' => $feedID]);
|
||||||
}
|
}
|
||||||
// determine whether the feed's items should be scraped for full content from the source Web site
|
// determine whether the feed's items should be scraped for full content from the source Web site
|
||||||
$scrape = (Arsse::$conf->fetchEnableScraping && $f['scrape']);
|
$scrape = (Arsse::$conf->fetchEnableScraping && $f['scrapers']);
|
||||||
// the Feed object throws an exception when there are problems, but that isn't ideal
|
// the Feed object throws an exception when there are problems, but that isn't ideal
|
||||||
// here. When an exception is thrown it should update the database with the
|
// here. When an exception is thrown it should update the database with the
|
||||||
// error instead of failing; if other exceptions are thrown, we should simply roll back
|
// error instead of failing; if other exceptions are thrown, we should simply roll back
|
||||||
|
@ -1161,8 +1168,8 @@ class Database {
|
||||||
}
|
}
|
||||||
if (sizeof($feed->newItems)) {
|
if (sizeof($feed->newItems)) {
|
||||||
$qInsertArticle = $this->db->prepareArray(
|
$qInsertArticle = $this->db->prepareArray(
|
||||||
"INSERT INTO arsse_articles(url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash,feed) values(?,?,?,?,?,?,?,?,?,?,?)",
|
"INSERT INTO arsse_articles(url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash,feed,content_scraped) values(?,?,?,?,?,?,?,?,?,?,?,?)",
|
||||||
['str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int']
|
["str", "str", "str", "datetime", "datetime", "str", "str", "str", "str", "str", "int", "str"]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (sizeof($feed->changedItems)) {
|
if (sizeof($feed->changedItems)) {
|
||||||
|
@ -1170,8 +1177,8 @@ class Database {
|
||||||
$qDeleteCategories = $this->db->prepare("DELETE FROM arsse_categories WHERE article = ?", 'int');
|
$qDeleteCategories = $this->db->prepare("DELETE FROM arsse_categories WHERE article = ?", 'int');
|
||||||
$qClearReadMarks = $this->db->prepare("UPDATE arsse_marks SET \"read\" = 0, modified = CURRENT_TIMESTAMP WHERE article = ? and \"read\" = 1", 'int');
|
$qClearReadMarks = $this->db->prepare("UPDATE arsse_marks SET \"read\" = 0, modified = CURRENT_TIMESTAMP WHERE article = ? and \"read\" = 1", 'int');
|
||||||
$qUpdateArticle = $this->db->prepareArray(
|
$qUpdateArticle = $this->db->prepareArray(
|
||||||
"UPDATE arsse_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = CURRENT_TIMESTAMP, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ? WHERE id = ?",
|
"UPDATE arsse_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = CURRENT_TIMESTAMP, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ?, content_scraped = ? WHERE id = ?",
|
||||||
['str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int']
|
["str", "str", "str", "datetime", "datetime", "str", "str", "str", "str", "str", "str", "int"]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
// determine if the feed icon needs to be updated, and update it if appropriate
|
// determine if the feed icon needs to be updated, and update it if appropriate
|
||||||
|
@ -1204,7 +1211,8 @@ class Database {
|
||||||
$article->urlTitleHash,
|
$article->urlTitleHash,
|
||||||
$article->urlContentHash,
|
$article->urlContentHash,
|
||||||
$article->titleContentHash,
|
$article->titleContentHash,
|
||||||
$feedID
|
$feedID,
|
||||||
|
$article->scrapedContent ?? null
|
||||||
)->lastId();
|
)->lastId();
|
||||||
// note the new ID for later use
|
// note the new ID for later use
|
||||||
$articleMap[$k] = $articleID;
|
$articleMap[$k] = $articleID;
|
||||||
|
@ -1232,6 +1240,7 @@ class Database {
|
||||||
$article->urlTitleHash,
|
$article->urlTitleHash,
|
||||||
$article->urlContentHash,
|
$article->urlContentHash,
|
||||||
$article->titleContentHash,
|
$article->titleContentHash,
|
||||||
|
$article->scrapedContent ?? null,
|
||||||
$articleID
|
$articleID
|
||||||
);
|
);
|
||||||
// delete all enclosures and categories and re-insert them
|
// delete all enclosures and categories and re-insert them
|
||||||
|
@ -1273,7 +1282,7 @@ class Database {
|
||||||
// lastly update the feed database itself with updated information.
|
// lastly update the feed database itself with updated information.
|
||||||
$this->db->prepareArray(
|
$this->db->prepareArray(
|
||||||
"UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?",
|
"UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?",
|
||||||
['str', 'str', 'datetime', 'strict str', 'datetime', 'int', 'int', 'int']
|
["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"]
|
||||||
)->run(
|
)->run(
|
||||||
$feed->data->title,
|
$feed->data->title,
|
||||||
$feed->data->siteUrl,
|
$feed->data->siteUrl,
|
||||||
|
@ -1429,7 +1438,7 @@ class Database {
|
||||||
'url' => "arsse_articles.url",
|
'url' => "arsse_articles.url",
|
||||||
'title' => "arsse_articles.title",
|
'title' => "arsse_articles.title",
|
||||||
'author' => "arsse_articles.author",
|
'author' => "arsse_articles.author",
|
||||||
'content' => "arsse_articles.content",
|
'content' => "coalesce(case when arsse_subscriptions.scrape = 1 then arsse_articles.content_scraped end, arsse_articles.content)",
|
||||||
'guid' => "arsse_articles.guid",
|
'guid' => "arsse_articles.guid",
|
||||||
'fingerprint' => "arsse_articles.url_title_hash || ':' || arsse_articles.url_content_hash || ':' || arsse_articles.title_content_hash",
|
'fingerprint' => "arsse_articles.url_title_hash || ':' || arsse_articles.url_content_hash || ':' || arsse_articles.title_content_hash",
|
||||||
'folder' => "coalesce(arsse_subscriptions.folder,0)",
|
'folder' => "coalesce(arsse_subscriptions.folder,0)",
|
||||||
|
|
|
@ -448,7 +448,7 @@ class Feed {
|
||||||
$scraper->setUrl($item->url);
|
$scraper->setUrl($item->url);
|
||||||
$scraper->execute();
|
$scraper->execute();
|
||||||
if ($scraper->hasRelevantContent()) {
|
if ($scraper->hasRelevantContent()) {
|
||||||
$item->content = $scraper->getFilteredContent();
|
$item->scrapedContent = $scraper->getFilteredContent();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -32,6 +32,10 @@ create table arsse_user_meta(
|
||||||
primary key(owner,"key")
|
primary key(owner,"key")
|
||||||
) character set utf8mb4 collate utf8mb4_unicode_ci;
|
) character set utf8mb4 collate utf8mb4_unicode_ci;
|
||||||
|
|
||||||
|
alter table arsse_subscriptions add column scrape boolean not null default 0;
|
||||||
|
alter table arsse_feeds drop column scrape;
|
||||||
|
alter table arsse_articles add column content_scraped longtext;
|
||||||
|
|
||||||
create table arsse_icons(
|
create table arsse_icons(
|
||||||
id serial primary key,
|
id serial primary key,
|
||||||
url varchar(767) unique not null,
|
url varchar(767) unique not null,
|
||||||
|
|
|
@ -32,6 +32,10 @@ create table arsse_user_meta(
|
||||||
primary key(owner,key)
|
primary key(owner,key)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
alter table arsse_subscriptions add column scrape smallint not null default 0;
|
||||||
|
alter table arsse_feeds drop column scrape;
|
||||||
|
alter table arsse_articles add column content_scraped text;
|
||||||
|
|
||||||
create table arsse_icons(
|
create table arsse_icons(
|
||||||
id bigserial primary key,
|
id bigserial primary key,
|
||||||
url text unique not null,
|
url text unique not null,
|
||||||
|
|
|
@ -44,8 +44,11 @@ create table arsse_user_meta(
|
||||||
primary key(owner,key)
|
primary key(owner,key)
|
||||||
) without rowid;
|
) without rowid;
|
||||||
|
|
||||||
|
-- Add a "scrape" column for subscriptions
|
||||||
|
alter table arsse_subscriptions add column scrape boolean not null default 0;
|
||||||
|
|
||||||
-- Add a separate table for feed icons and replace their URLs in the feeds table with their IDs
|
-- Add a separate table for feed icons and replace their URLs in the feeds table with their IDs
|
||||||
|
-- Also remove the "scrape" column of the feeds table, which was never an advertised feature
|
||||||
create table arsse_icons(
|
create table arsse_icons(
|
||||||
-- Icons associated with feeds
|
-- Icons associated with feeds
|
||||||
-- At a minimum the URL of the icon must be known, but its content may be missing
|
-- At a minimum the URL of the icon must be known, but its content may be missing
|
||||||
|
@ -76,16 +79,37 @@ create table arsse_feeds_new(
|
||||||
username text not null default '', -- HTTP authentication username
|
username text not null default '', -- HTTP authentication username
|
||||||
password text not null default '', -- HTTP authentication password (this is stored in plain text)
|
password text not null default '', -- HTTP authentication password (this is stored in plain text)
|
||||||
size integer not null default 0, -- number of articles in the feed at last fetch
|
size integer not null default 0, -- number of articles in the feed at last fetch
|
||||||
scrape boolean not null default 0, -- whether to use picoFeed's content scraper with this feed
|
|
||||||
icon integer references arsse_icons(id) on delete set null, -- numeric identifier of any associated icon
|
icon integer references arsse_icons(id) on delete set null, -- numeric identifier of any associated icon
|
||||||
unique(url,username,password) -- a URL with particular credentials should only appear once
|
unique(url,username,password) -- a URL with particular credentials should only appear once
|
||||||
);
|
);
|
||||||
insert into arsse_feeds_new
|
insert into arsse_feeds_new
|
||||||
select f.id, f.url, title, source, updated, f.modified, f.next_fetch, f.orphaned, f.etag, err_count, err_msg, username, password, size, scrape, i.id
|
select f.id, f.url, title, source, updated, f.modified, f.next_fetch, f.orphaned, f.etag, err_count, err_msg, username, password, size, i.id
|
||||||
from arsse_feeds as f left join arsse_icons as i on f.favicon = i.url;
|
from arsse_feeds as f left join arsse_icons as i on f.favicon = i.url;
|
||||||
drop table arsse_feeds;
|
drop table arsse_feeds;
|
||||||
alter table arsse_feeds_new rename to arsse_feeds;
|
alter table arsse_feeds_new rename to arsse_feeds;
|
||||||
|
|
||||||
|
-- Add a column for scraped article content, and re-order some columns
|
||||||
|
create table arsse_articles_new(
|
||||||
|
-- entries in newsfeeds
|
||||||
|
id integer primary key, -- sequence number
|
||||||
|
feed integer not null references arsse_feeds(id) on delete cascade, -- feed for the subscription
|
||||||
|
url text, -- URL of article
|
||||||
|
title text collate nocase, -- article title
|
||||||
|
author text collate nocase, -- author's name
|
||||||
|
published text, -- time of original publication
|
||||||
|
edited text, -- time of last edit by author
|
||||||
|
modified text not null default CURRENT_TIMESTAMP, -- time when article was last modified in database
|
||||||
|
guid text, -- GUID
|
||||||
|
url_title_hash text not null, -- hash of URL + title; used when checking for updates and for identification if there is no guid.
|
||||||
|
url_content_hash text not null, -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
|
||||||
|
title_content_hash text not null, -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
|
||||||
|
content_scraped text, -- scraped content, as HTML
|
||||||
|
content text -- content, as HTML
|
||||||
|
);
|
||||||
|
insert into arsse_articles_new select id, feed, url, title, author, published, edited, modified, guid, url_title_hash, url_content_hash, title_content_hash, null, content from arsse_articles;
|
||||||
|
drop table arsse_articles;
|
||||||
|
alter table arsse_articles_new rename to arsse_articles;
|
||||||
|
|
||||||
-- set version marker
|
-- set version marker
|
||||||
pragma user_version = 7;
|
pragma user_version = 7;
|
||||||
update arsse_meta set value = '7' where "key" = 'schema_version';
|
update arsse_meta set value = '7' where "key" = 'schema_version';
|
||||||
|
|
Loading…
Reference in a new issue