1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2024-12-22 21:22:40 +00:00

Added Feed Updating

• Started implementing feed updating (Database->updateFeeds())
• Moved hashing to the Feed object, now done when parsing
• Moved adding of articles to the database to its own method
(Database->articleAdd())
This commit is contained in:
Dustin Wilson 2017-03-26 15:16:15 -05:00
parent 8c76c22d74
commit ce0584e7f8
3 changed files with 139 additions and 36 deletions

View file

@ -288,31 +288,7 @@ class Database {
// Add each of the articles to the database.
foreach ($feed->data->items as $i) {
$articleID = $this->db->prepare('INSERT INTO newssync_articles(feed,url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash)
values(?,?,?,?,?,?,?,?,?,?,?)',
'int', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str')->run(
$feedID,
$i->url,
$i->title,
$i->author,
$i->publishedDate,
$i->updatedDate,
$i->id,
$i->content,
// Since feeds cannot be trusted to have valid ids additional hashes are used for identifiers.
// These hashes are made regardless to check against for changes.
hash('sha256', $i->url.$i->title),
hash('sha256', $i->url.$i->content.$i->enclosureUrl.$i->enclosureType),
hash('sha256', $i->title.$i->content.$i->enclosureUrl.$i->enclosureType)
)->lastId();
// If the article has categories add them into the categories database.
$categories = $i->getTag('category');
if (count($categories) > 0) {
foreach ($categories as $c) {
$this->db->prepare('INSERT INTO newssync_tags(article,name) values(?,?)', 'int', 'str')->run($articleID, $c);
}
}
$this->articleAdd($i);
}
}
@ -389,4 +365,127 @@ class Database {
"str", "int")->run($user, $parent);
}
}
/**
 * Inserts one parsed feed item, and any category tags it carries, into the database.
 *
 * The article row and its tag rows are written between a begin()/commit() pair on the
 * database driver. The three identification hashes are read from the item itself
 * (urlTitleHash, urlContentHash, titleContentHash) rather than being computed here.
 *
 * @param PicoFeed\Parser\Item $article The parsed item to store
 * @param int|null $feedID Identifier of the newssync_feeds row the article belongs to.
 *                         It is optional only so that existing single-argument calls
 *                         keep working; those calls store no feed reference, which is
 *                         what happened before this parameter existed.
 * @return int Always 1
 */
public function articleAdd(PicoFeed\Parser\Item $article, int $feedID = null): int {
    $this->db->begin();
    $articleId = $this->db->prepare('INSERT INTO newssync_articles(feed,url,title,author,published,edited,guid,content,url_title_hash,url_content_hash,title_content_hash)
values(?,?,?,?,?,?,?,?,?,?,?)',
        'int', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str')->run(
        $feedID,
        $article->url,
        $article->title,
        $article->author,
        $article->publishedDate,
        $article->updatedDate,
        $article->id,
        $article->content,
        $article->urlTitleHash,
        $article->urlContentHash,
        $article->titleContentHash
    )->lastId();
    // If the article has categories add them into the categories database.
    // Only iterate when the lookup actually produced a list; a failure value such as
    // false would otherwise be counted and looped over.
    $categories = $article->getTag('category');
    if (is_array($categories) || $categories instanceof \Traversable) {
        foreach ($categories as $c) {
            $this->db->prepare('INSERT INTO newssync_tags(article,name) values(?,?)', 'int', 'str')->run($articleId, $c);
        }
    }
    $this->db->commit();
    return 1;
}

/**
 * Re-fetches every feed listed in newssync_feeds and synchronises its articles.
 *
 * For each feed whose remote resource reports itself as modified, the feed is parsed and
 * every parsed item is matched against the articles already stored for that feed: by guid
 * when the item has an id, otherwise by any of the three hashes. Unmatched items are added
 * via articleAdd(); matched items are rewritten when they appear to have changed. Finally
 * the feed row itself is refreshed. Each feed is processed inside its own
 * begin()/commit() pair.
 *
 * @return int Always 1
 */
public function updateFeeds(): int {
    $feeds = $this->db->query('SELECT id, url, username, password, DATEFORMAT("http", modified) AS lastmodified, etag FROM newssync_feeds')->getAll();
    foreach ($feeds as $f) {
        $feed = new Feed($f['url'], $f['lastmodified'], $f['etag'], $f['username'], $f['password']);
        // FIXME: What to do if fails? It currently throws an exception which isn't ideal here.
        // Nothing to do for feeds which have not changed since the last fetch.
        if (!$feed->resource->isModified()) {
            continue;
        }
        $feed->parse();
        $this->db->begin();
        $articles = $this->db->prepare('SELECT id, url, title, author, DATEFORMAT("http", edited) AS edited_date, guid, content, url_title_hash, url_content_hash, title_content_hash FROM newssync_articles WHERE feed is ? ORDER BY id', 'int')->run($f['id'])->getAll();
        foreach ($feed->data->items as $i) {
            // Iterate through the articles in the database to determine a match for the
            // one in the just-parsed feed. The scan is not cut short, so when several rows
            // qualify the last one (highest id) is used.
            $match = null;
            foreach ($articles as $a) {
                if ($i->id) {
                    // If the id exists and is equal to one in the database then this is
                    // the post.
                    if ($i->id === $a['guid']) {
                        $match = $a;
                    }
                } elseif ($i->urlTitleHash === $a['url_title_hash'] || $i->urlContentHash === $a['url_content_hash'] || $i->titleContentHash === $a['title_content_hash']) {
                    // Otherwise if the id doesn't exist and any of the hashes match then
                    // this is the post.
                    $match = $a;
                }
            }
            // If there is no match then this is a new post and must be added to the
            // database, attached to the feed currently being processed.
            if (!$match) {
                $this->articleAdd($i, (int) $f['id']);
                continue;
            }
            // With that out of the way determine if the post has been updated.
            $update = false;
            if ($i->updatedDate) {
                if ($i->updatedDate instanceof \DateTimeInterface) {
                    // A date object can never be identical to the formatted string read
                    // back from the database, so compare the two as timestamps instead.
                    // An absent or unparseable stored date counts as a change.
                    $stored = is_string($match['edited_date']) ? strtotime($match['edited_date']) : false;
                    $update = ($stored === false || $stored !== $i->updatedDate->getTimestamp());
                } else {
                    $update = ($i->updatedDate !== $match['edited_date']);
                }
            } elseif ($i->urlTitleHash !== $match['url_title_hash'] || $i->urlContentHash !== $match['url_content_hash'] || $i->titleContentHash !== $match['title_content_hash']) {
                // Otherwise if there isn't an updated date and any of the hashes don't
                // match then update the post.
                $update = true;
            }
            if ($update) {
                $this->db->prepare('UPDATE newssync_articles SET url = ?, title = ?, author = ?, published = ?, edited = ?, modified = ?, guid = ?, content = ?, url_title_hash = ?, url_content_hash = ?, title_content_hash = ? WHERE id is ?', 'str', 'str', 'str', 'datetime', 'datetime', 'datetime', 'str', 'str', 'str', 'str', 'str', 'int')->run(
                    $i->url,
                    $i->title,
                    $i->author,
                    $i->publishedDate,
                    $i->updatedDate,
                    time(),
                    $i->id,
                    $i->content,
                    $i->urlTitleHash,
                    $i->urlContentHash,
                    $i->titleContentHash,
                    $match['id']
                );
                // TODO: Update categories
            }
        }
        // Lastly update the feed database itself with updated information.
        // NOTE(review): the feed-level metadata is read from ->data, the same parsed
        // document the items come from; confirm the Feed wrapper does not expose these
        // fields directly.
        $this->db->prepare('UPDATE newssync_feeds SET url = ?, title = ?, favicon = ?, source = ?, updated = ?, modified = ?, etag = ? WHERE id is ?', 'str', 'str', 'str', 'str', 'datetime', 'datetime', 'str', 'int')->run(
            $feed->data->feedUrl,
            $feed->data->title,
            $feed->favicon,
            $feed->data->siteUrl,
            $feed->data->date,
            $feed->resource->getLastModified(),
            $feed->resource->getEtag(),
            $f['id']
        );
        // Close the transaction opened for this feed so that every begin() has exactly
        // one commit(), whether zero, one or many feeds were modified.
        $this->db->commit();
    }
    return 1;
}
}

View file

@ -5,18 +5,18 @@ use PicoFeed\PicoFeedException;
use PicoFeed\Reader\Favicon;
class Feed {
public $data = null;
public $favicon;
public $parser;
public $reader;
public $resource;
public $parser;
public $data;
public $favicon;
public function __construct(string $url, string $lastModified = '', string $etag = '') {
public function __construct(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '') {
try {
$this->reader = new Reader;
$this->resource = $reader->download($url, $lastModified, $etag);
$this->resource = $reader->download($url, $lastModified, $etag, $username, $password);
// Grab the favicon for the feed; returns an empty string if it cannot find one.
$this->favicon = new Favicon->find($url);
$this->favicon = (new Favicon)->find($url);
} catch (PicoFeedException $e) {
throw new Feed\Exception($url, $e);
}
@ -43,6 +43,12 @@ class Feed {
// only be reserved for severely broken feeds.
foreach ($feed->items as &$f) {
// Hashes used for comparison to check for updates and also to identify when an
// id doesn't exist.
$f->urlTitleHash = hash('sha256', $i->url.$i->title);
$f->urlContentHash = hash('sha256', $i->url.$i->content.$i->enclosureUrl.$i->enclosureType);
$f->titleContentHash = hash('sha256', $i->title.$i->content.$i->enclosureUrl.$i->enclosureType);
// If there is an id element then continue. The id is used already.
$id = (string)$f->xml->id;
if ($id !== '') {
@ -63,9 +69,7 @@ class Feed {
continue;
}
// If there aren't any of those there is no id. Hashes are created when adding
// the feed to the database which will serve to identify the post in this
// situation.
// If there aren't any of those there is no id.
$f->id = '';
}

View file

@ -69,9 +69,9 @@ create table newssync_articles(
author TEXT, -- author's name
published datetime, -- time of original publication
edited datetime, -- time of last edit
modified datetime not null default CURRENT_TIMESTAMP, -- date when article properties were last modified
guid TEXT, -- GUID
content TEXT, -- content, as (X)HTML
modified datetime not null default CURRENT_TIMESTAMP, -- date when article properties were last modified
url_title_hash varchar(64), -- hash of URL + title; used when checking for updates and for identification if there is no guid.
url_content_hash varchar(64), -- hash of URL + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.
title_content_hash varchar(64) -- hash of title + content, enclosure URL, & content type; used when checking for updates and for identification if there is no guid.