1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2024-12-31 21:12:41 +00:00
Arsse/lib/Feed.php

471 lines
21 KiB
PHP
Raw Normal View History

<?php
/** @license MIT
* Copyright 2017 J. King, Dustin Wilson et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
2021-04-14 15:17:01 +00:00
2017-03-28 04:12:12 +00:00
namespace JKingWeb\Arsse;
2017-08-29 14:50:31 +00:00
use JKingWeb\Arsse\Misc\Date;
use JKingWeb\Arsse\Rule\Rule;
use PicoFeed\PicoFeedException;
use PicoFeed\Config\Config;
use PicoFeed\Client\Client;
use PicoFeed\Reader\Reader;
use PicoFeed\Reader\Favicon;
use PicoFeed\Scraper\Scraper;
2017-08-29 14:50:31 +00:00
class Feed {
// the parsed newsfeed; this stays null until parse() has stored its result
public $data = null;
// the URL, media type, and raw content of the feed's icon as found by parse(); the URL and content are null when no icon content was retrieved
public $iconUrl;
public $iconType;
public $iconData;
// the client object returned by download() for the feed's URL
public $resource;
// whether the feed is regarded as changed; reset to false when a fetched feed yields no new or changed items
public $modified = false;
// the feed's last-modified date, taken from the response or else computed from item dates
public $lastModified;
// the time at which the feed should next be fetched, as computed by computeNextFetch()
public $nextFetch;
// items which could not be matched to any article in the database
public $newItems = [];
// items which matched a database article but differ from it, keyed by the article's database ID
public $changedItems = [];
// the results of applying filter rules to new and changed items, keyed by the keys of the rule set returned by the database
public $filteredItems = [];
/**
 * Determines which newsfeed URL a given URL leads to
 *
 * If the resource at $url is itself detected as a feed, $url is returned
 * as-is; otherwise the first feed link found in the resource is returned.
 *
 * @param string $url The URL to examine
 * @param string $username User name passed on to download()
 * @param string $password Password passed on to download()
 * @throws Feed\Exception if the download fails, or if the resource is not a feed and contains no feed links
 */
public static function discover(string $url, string $username = '', string $password = ''): string {
    // fetch the candidate feed
    $resource = self::download($url, "", "", $username, $password);
    // assume the prospective URL is itself a feed until proven otherwise
    $out = $url;
    $missing = false;
    if (!$resource->reader->detectFormat($resource->getContent())) {
        // the resource is not a feed, so search it for links to feeds instead
        $links = $resource->reader->find($resource->getUrl(), $resource->getContent());
        if ($links) {
            $out = $links[0];
        } else {
            $missing = true;
        }
    }
    // work around a PicoFeed memory leak; this must happen before either exit
    libxml_use_internal_errors(false);
    if ($missing) {
        throw new Feed\Exception("", ['url' => $url], new \PicoFeed\Reader\SubscriptionNotFoundException('Unable to find a subscription'));
    }
    return $out;
}
2018-10-26 18:58:04 +00:00
2020-12-01 22:12:19 +00:00
/**
 * Lists every newsfeed URL a given URL leads to
 *
 * If the resource at $url is itself detected as a feed, the result is a
 * single-element array containing $url; otherwise it is whatever list of
 * feed links the reader finds in the resource (possibly empty).
 *
 * @param string $url The URL to examine
 * @param string $username User name passed on to download()
 * @param string $password Password passed on to download()
 * @throws Feed\Exception if the download fails
 */
public static function discoverAll(string $url, string $username = '', string $password = ''): array {
    // fetch the candidate feed
    $f = self::download($url, "", "", $username, $password);
    if ($f->reader->detectFormat($f->getContent())) {
        // if the prospective URL is a feed, use it
        $out = [$url];
    } else {
        $out = $f->reader->find($f->getUrl(), $f->getContent());
    }
    // work around a PicoFeed memory leak, exactly as discover() does after making the same calls
    libxml_use_internal_errors(false);
    return $out;
}
/**
 * Fetches a newsfeed and, if it has changed, parses it and matches its items against the database
 *
 * $feedID is declared as an explicitly nullable parameter without a default
 * value: because the required $url follows it, a default could never be
 * relied upon by positional callers, and the former "int $feedID = null"
 * form is deprecated in current versions of PHP.
 *
 * @param int|null $feedID The database ID of the feed to match items against, or null to treat all items as new
 * @param string $url The URL of the feed to fetch
 * @param string $lastModified Passed on to download() as the last-modified value
 * @param string $etag Passed on to download() as the entity tag
 * @param string $username User name passed on to download()
 * @param string $password Password passed on to download()
 * @param bool $scrape Whether to scrape full content for new and changed items
 * @throws Feed\Exception if the feed cannot be downloaded or parsed
 */
public function __construct(?int $feedID, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) {
    // fetch the feed
    $this->resource = self::download($url, $lastModified, $etag, $username, $password);
    // format the HTTP Last-Modified date returned
    $lastMod = $this->resource->getLastModified();
    if (strlen($lastMod ?? "")) {
        $this->lastModified = Date::normalize($lastMod, "http");
    }
    $this->modified = $this->resource->isModified();
    // parse the feed, if it has been modified
    if ($this->modified) {
        $this->parse();
        // ascertain whether there are any articles not in the database
        $this->matchToDatabase($feedID);
        // if caching header fields are not sent by the server, try to ascertain a last-modified date from the feed contents
        if (!$this->lastModified) {
            $this->lastModified = $this->computeLastModified();
        }
        // we only really care if articles have been modified; if there are no new articles, act as if the feed is unchanged
        if (!sizeof($this->newItems) && !sizeof($this->changedItems)) {
            $this->modified = false;
        } else {
            // filter rules can only be looked up for a feed which exists in the database
            if ($feedID) {
                $this->computeFilterRules($feedID);
            }
            // if requested, scrape full content for any new and changed items
            if ($scrape) {
                $this->scrape();
            }
        }
    }
    // compute the time at which the feed should next be fetched
    $this->nextFetch = $this->computeNextFetch();
}
/**
 * Builds the PicoFeed configuration used for all fetching
 *
 * The body size limit and timeouts come from the application configuration;
 * the user agent is the configured string if there is one, and otherwise a
 * string composed of the Arsse version and details of the host system.
 */
protected static function configure(): Config {
    $agent = Arsse::$conf->fetchUserAgentString ?? sprintf(
        'Arsse/%s (%s %s; %s; https://thearsse.com/)',
        Arsse::VERSION, // Arsse version
        php_uname('s'), // OS
        php_uname('r'), // OS version
        php_uname('m')  // platform architecture
    );
    $timeout = Arsse::$conf->fetchTimeout;
    $out = new Config;
    // the same user agent and timeout apply to both the feed client and the content grabber
    $out->setClientUserAgent($agent);
    $out->setGrabberUserAgent($agent);
    $out->setClientTimeout($timeout);
    $out->setGrabberTimeout($timeout);
    $out->setMaxBodySize(Arsse::$conf->fetchSizeLimit);
    return $out;
}
/**
 * Downloads a resource and returns the client which performed the request
 *
 * The reader used for the download is attached to the returned client as
 * its "reader" property so that callers can use it for further processing.
 *
 * @param string $url The URL to download
 * @param string $lastModified Last-modified value handed to the reader
 * @param string $etag Entity tag handed to the reader
 * @param string $username User name handed to the reader
 * @param string $password Password handed to the reader
 * @throws Feed\Exception wrapping any PicoFeed or Guzzle exception raised during the download
 */
protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): Client {
    try {
        $reader = new Reader(self::configure());
        $out = $reader->download($url, $lastModified, $etag, $username, $password);
    } catch (PicoFeedException $e) {
        throw new Feed\Exception("", ['url' => $url], $e); // @codeCoverageIgnore
    } catch (\GuzzleHttp\Exception\GuzzleException $e) {
        throw new Feed\Exception("", ['url' => $url], $e);
    }
    // keep the reader alongside the client for later format detection, link discovery, and parsing
    $out->reader = $reader;
    return $out;
}
/**
 * Parses the downloaded resource, retrieves the feed's icon, and prepares each item for matching
 *
 * Every item is given three comparison hashes, a hashed identifier (or
 * null if it has none), and a sorted list of category names. The parsed
 * feed is then stored in the "data" property.
 *
 * @throws Feed\Exception wrapping any PicoFeed or Guzzle exception raised while parsing
 */
protected function parse(): void {
    $resource = $this->resource;
    try {
        $parser = $resource->reader->getParser(
            $resource->getUrl(),
            $resource->getContent(),
            $resource->getEncoding()
        );
        $feed = $parser->execute();
    } catch (PicoFeedException $e) {
        throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e);
    } catch (\GuzzleHttp\Exception\GuzzleException $e) { // @codeCoverageIgnore
        throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e); // @codeCoverageIgnore
    }
    // Look for the feed's icon. Some feeds are served from a different domain
    // than their site (e.g. FeedBurner), so the site URL is used for the search
    // rather than the feed's own URL.
    $favicon = new Favicon;
    $this->iconUrl = $favicon->find($feed->siteUrl, $feed->getIcon());
    $this->iconData = $favicon->getContent();
    if (!strlen($this->iconData)) {
        // without any content the icon is not usable, so discard its URL as well
        $this->iconUrl = null;
        $this->iconData = null;
    } else {
        $this->iconType = $favicon->getType();
    }
    // PicoFeed does not provide usable IDs when an item has no ID element: it
    // hashes the URL, title, and content together instead, which changes
    // whenever the item is edited. Identification and categorization are
    // therefore performed here.
    foreach ($feed->items as $item) {
        self::computeItemHashes($item, $feed->siteUrl);
        $item->id = self::findItemId($item->xml);
        $item->categories = self::gatherItemCategories($item->xml);
    }
    $this->data = $feed;
}

/**
 * Sets the three hashes used to detect updates and to identify items which lack an ID
 *
 * A hash is set to the empty string when the data it would cover is judged to be absent.
 */
private static function computeItemHashes($item, $siteUrl): void {
    $content = $item->content.$item->enclosureUrl.$item->enclosureType;
    $noContent = ($content === "");
    // an item link URL equal to the feed link URL means the item has no link URL of its own
    $noUrl = ($item->url === $siteUrl);
    // if the title is also equal to the feed link URL, the item has neither a link URL nor a title
    $item->urlTitleHash = ($noUrl && $item->title === $siteUrl) ? "" : hash('sha256', $item->url.$item->title);
    // no link URL and no content: nothing worth hashing
    $item->urlContentHash = ($noContent && $noUrl) ? "" : hash('sha256', $item->url.$content);
    // a title which is the same as the link URL means there is no title; with no content either, nothing is hashed
    $item->titleContentHash = ($noContent && $item->title === $item->url) ? "" : hash('sha256', $item->title.$content);
}

/**
 * Returns the SHA-256 hash of an item's identifier, or null if the item has no identifier
 *
 * An Atom ID is preferred, then an RSS2 guid element, then a Dublin Core identifier element.
 */
private static function findItemId($xml): ?string {
    $raw = (string) $xml->children('http://www.w3.org/2005/Atom')->id;
    if ($raw === "") {
        $raw = (string) $xml->guid;
    }
    if ($raw === "") {
        $raw = (string) $xml->children('http://purl.org/dc/elements/1.1/')->identifier;
    }
    return ($raw === "") ? null : hash('sha256', $raw);
}

/**
 * Returns the sorted names of an item's Atom categories, RSS2 categories, and Dublin Core subjects
 */
private static function gatherItemCategories($xml): array {
    $out = [];
    // Atom categories carry their name in attributes: use the label if there is one, and the term otherwise
    foreach ($xml->children('http://www.w3.org/2005/Atom')->category as $category) {
        $attr = $category->attributes();
        $name = (string) $attr->label;
        if ($name === "") {
            $name = (string) $attr->term;
        }
        if ($name !== "") {
            $out[] = $name;
        }
    }
    // RSS2 categories and Dublin Core subjects carry their name as element content
    $textual = [
        $xml->children()->category,
        $xml->children('http://purl.org/dc/elements/1.1/')->subject,
    ];
    foreach ($textual as $elements) {
        foreach ($elements as $element) {
            $name = (string) $element;
            if ($name !== "") {
                $out[] = $name;
            }
        }
    }
    sort($out);
    return $out;
}
/**
 * Removes superseded versions of items from a list of feed items
 *
 * Some newsfeeds (notably Planet) include multiple versions of an item if
 * it is updated. Only the latest is of interest, so when two items are
 * found to be versions of one another only one of them is retained, in the
 * position of whichever was encountered first.
 *
 * @param array $items The items to examine, in feed order
 * @return array The retained items
 */
protected function deduplicateItems(array $items): array {
    $kept = [];
    foreach ($items as $candidate) {
        foreach ($kept as $pos => $existing) {
            $bothIdentified = $candidate->id && $existing->id;
            // two items which both have IDs can only match if those IDs are the same, regardless of hashes
            if ($bothIdentified && $candidate->id !== $existing->id) {
                continue;
            }
            // otherwise a shared ID or any one shared (non-empty) hash marks two versions of the same item
            $sameItem = $bothIdentified
                || ($candidate->urlTitleHash && $candidate->urlTitleHash === $existing->urlTitleHash)
                || ($candidate->urlContentHash && $candidate->urlContentHash === $existing->urlContentHash)
                || ($candidate->titleContentHash && $candidate->titleContentHash === $existing->titleContentHash);
            if (!$sameItem) {
                continue;
            }
            // because newsfeeds are usually ordered newest-first, the later item is only preferred if it is demonstrably newer
            $laterIsNewer =
                // the later item has an update date and the existing item does not
                ($candidate->updatedDate && !$existing->updatedDate)
                // the later item has an update date newer than the existing item's
                || ($candidate->updatedDate && $existing->updatedDate && $candidate->updatedDate->getTimestamp() > $existing->updatedDate->getTimestamp())
                // neither item has an update date, both have publish dates, and the later item's is newer
                || (!$candidate->updatedDate && !$existing->updatedDate && $candidate->publishedDate && $existing->publishedDate && $candidate->publishedDate->getTimestamp() > $existing->publishedDate->getTimestamp());
            if ($laterIsNewer) {
                $kept[$pos] = $candidate;
            }
            // whether it replaced the existing item or was dropped, this item has been dealt with
            continue 2;
        }
        // the item matched nothing retained so far, so retain it
        $kept[] = $candidate;
    }
    return $kept;
}
/**
 * Sorts the feed's items into new and changed items by comparing them with articles in the database
 *
 * Items are first deduplicated. They are then matched against the latest
 * articles of the feed; any items still unmatched are matched a second
 * time against articles looked up specifically by the items' IDs and
 * hashes. The results are stored in the "newItems" and "changedItems"
 * properties, the latter keyed by database article ID.
 *
 * @param int|null $feedID The database ID of the feed to match against; if null, all items are considered new
 */
protected function matchToDatabase(?int $feedID = null): void {
    // first perform deduplication on items
    $items = $this->deduplicateItems($this->data->items);
    // if we haven't been given a database feed ID to check against, all items are new
    if ($feedID === null) {
        $this->newItems = $items;
        return;
    }
    // get as many of the latest articles in the database as there are in the feed
    $articles = Arsse::$db->feedMatchLatest($feedID, sizeof($items))->getAll();
    // perform a first pass matching the latest articles against items in the feed
    [$this->newItems, $this->changedItems] = $this->matchItems($items, $articles);
    if (sizeof($this->newItems)) {
        // if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items
        $ids = $hashesUT = $hashesUC = $hashesTC = [];
        foreach ($this->newItems as $i) {
            // empty IDs and hashes signify missing data and so are never searched for
            if ($i->id) {
                $ids[] = $i->id;
            }
            if ($i->urlTitleHash) {
                $hashesUT[] = $i->urlTitleHash;
            }
            if ($i->urlContentHash) {
                $hashesUC[] = $i->urlContentHash;
            }
            if ($i->titleContentHash) {
                $hashesTC[] = $i->titleContentHash;
            }
        }
        $articles = Arsse::$db->feedMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC)->getAll();
        [$this->newItems, $changed] = $this->matchItems($this->newItems, $articles);
        // merge the two change-lists, preserving keys: unlike array_merge(), array_replace() does not
        // renumber integer keys, and a key present in both lists takes its value from the second pass
        $this->changedItems = array_replace($this->changedItems, $changed);
    }
}
/**
 * Compares feed items against database articles and classifies each item as unchanged, edited, or new
 *
 * @param array $items The feed items to classify
 * @param array $articles Database article records, each providing the keys "id", "guid", "edited", "url_title_hash", "url_content_hash", and "title_content_hash"
 * @return array A two-member list: first the list of new items, then the edited items keyed by the database ID of the article each one matched
 */
protected function matchItems(array $items, array $articles): array {
    $new = [];
    $edited = [];
    foreach ($items as $item) {
        $matched = false;
        foreach ($articles as $article) {
            // an item which has an ID can only match an article with that same ID, regardless of hashes
            if ($item->id && $item->id !== $article['guid']) {
                continue;
            }
            $sameArticle =
                // the item matches if the GUID matches...
                ($item->id && $item->id === $article['guid'])
                // ... or if any one of its (non-empty) hashes matches
                || ($item->urlTitleHash && $item->urlTitleHash === $article['url_title_hash'])
                || ($item->urlContentHash && $item->urlContentHash === $article['url_content_hash'])
                || ($item->titleContentHash && $item->titleContentHash === $article['title_content_hash']);
            if (!$sameArticle) {
                continue;
            }
            $matched = true;
            $isEdited =
                // the item has an edit timestamp which differs from that of the article in the database...
                ($item->updatedDate && Date::transform($item->updatedDate, "sql") !== $article['edited'])
                // ... or else at least one of the hashes differs
                || $item->urlTitleHash !== $article['url_title_hash']
                || $item->urlContentHash !== $article['url_content_hash']
                || $item->titleContentHash !== $article['title_content_hash'];
            if ($isEdited) {
                // record the item under the database ID of the article it supersedes
                $edited[$article['id']] = $item;
            }
            // an unchanged item is simply ignored; either way the search for this item is over
            break;
        }
        if (!$matched) {
            $new[] = $item;
        }
    }
    return [$new, $edited];
}
/**
 * Computes the time at which the feed should next be fetched
 *
 * For an unchanged feed the interval is derived from the time elapsed
 * since the feed was last modified, or is three hours if that is unknown.
 * For a changed feed the three intervals between the four most recent item
 * dates are each normalized; if any two normalize to the same interval,
 * that interval is used (e.g. intervals of 23m, 12m, and 4h yield the
 * interval for "less than 30m"). Failing that, or with fewer than four
 * dates available, the feed is checked again in one hour.
 */
protected function computeNextFetch(): \DateTimeImmutable {
    $now = Date::normalize(time());
    if (!$this->modified) {
        // if no timestamp is available, fall back to three hours
        $offset = "3 hours";
        if ($this->lastModified) {
            $offset = $this->normalizeDateDiff($now->getTimestamp() - $this->lastModified->getTimestamp());
        }
        return $now->modify("+".$offset);
    }
    $dates = $this->gatherDates();
    if (sizeof($dates) <= 3) {
        // three intervals cannot be computed from fewer than four dates
        return $now->modify("+ 1 hour");
    }
    // the dates are sorted newest-first, so each difference is a non-negative number of seconds
    $first = $this->normalizeDateDiff($dates[0] - $dates[1]);
    $second = $this->normalizeDateDiff($dates[1] - $dates[2]);
    $third = $this->normalizeDateDiff($dates[2] - $dates[3]);
    if ($first === $second || $first === $third) {
        return $now->modify("+".$first);
    }
    if ($second === $third) {
        return $now->modify("+".$second);
    }
    // there is no commonality between the intervals
    return $now->modify("+ 1 hour");
}
/**
 * Computes the time at which a feed should next be fetched after fetching has failed
 *
 * The delay is five minutes for an error count below 3, three hours for an
 * error count below 15, and one day otherwise.
 *
 * @param mixed $errCount The error count to base the delay upon
 */
public static function nextFetchOnError($errCount): \DateTimeImmutable {
    $offset = "1 day";
    if ($errCount < 3) {
        $offset = "5 minutes";
    } elseif ($errCount < 15) {
        $offset = "3 hours";
    }
    return Date::normalize("now + ".$offset);
}
/**
 * Maps a time difference onto one of a fixed set of fetch intervals
 *
 * @param int $diff The difference, in seconds
 * @return string A relative interval: "15 minutes" below 30 minutes, "30 minutes" below an hour, "1 hour" below three hours, "3 hours" below 36 hours, and "1 day" from 36 hours upward
 */
protected function normalizeDateDiff(int $diff): string {
    $minute = 60;
    $hour = 60 * $minute;
    if ($diff < 30 * $minute) {
        return "15 minutes";
    }
    if ($diff < $hour) {
        return "30 minutes";
    }
    if ($diff < 3 * $hour) {
        return "1 hour";
    }
    if ($diff < 36 * $hour) {
        return "3 hours";
    }
    return "1 day";
}
2020-01-20 18:52:48 +00:00
/**
 * Derives a last-modified date for the feed from the dates of its items
 *
 * If the feed is not marked as modified the existing last-modified date is
 * returned unchanged; otherwise the most recent item date is returned, or
 * null if no item has a date.
 */
protected function computeLastModified(): ?\DateTimeImmutable {
    if (!$this->modified) {
        return $this->lastModified; // @codeCoverageIgnore
    }
    $dates = $this->gatherDates();
    if (!sizeof($dates)) {
        return null; // @codeCoverageIgnore
    }
    // the dates are sorted newest-first
    return Date::normalize($dates[0]);
}
/**
 * Collects the distinct update and publication timestamps of all items in the feed
 *
 * @return array A list of timestamps, sorted from newest to oldest
 */
protected function gatherDates(): array {
    $stamps = [];
    foreach ($this->data->items as $item) {
        // either date may be missing from an item; both are collected when present
        foreach ([$item->updatedDate, $item->publishedDate] as $date) {
            if ($date) {
                $stamps[] = $date->getTimestamp();
            }
        }
    }
    $stamps = array_unique($stamps, \SORT_NUMERIC);
    // sorting in reverse also re-indexes the list from zero
    rsort($stamps);
    return $stamps;
}
/**
 * Scrapes full content for every new and changed item
 *
 * Each item's URL is fetched with a scraper; when the scraper reports
 * relevant content, its filtered content is stored in the item's
 * "scrapedContent" property.
 */
protected function scrape(): void {
    $scraper = new Scraper(self::configure());
    $targets = array_merge($this->newItems, $this->changedItems);
    foreach ($targets as $item) {
        $scraper->setUrl($item->url);
        $scraper->execute();
        if (!$scraper->hasRelevantContent()) {
            // nothing useful was retrieved; the item is left without scraped content
            continue;
        }
        $item->scrapedContent = $scraper->getFilteredContent();
    }
}
/**
 * Applies the feed's filter rules to all new and changed items
 *
 * For every rule set the database returns for the feed, the outcome of
 * applying its "keep" and "block" rules to each item's title and
 * categories is recorded in the "filteredItems" property under the rule
 * set's key (presumably a user identifier; confirm against feedRulesGet()).
 * Outcomes are grouped under "new" and "changed", with the same keys as
 * the "newItems" and "changedItems" properties respectively.
 *
 * @param int $feedID The database ID of the feed whose rules are to be applied
 */
protected function computeFilterRules(int $feedID): void {
    foreach (Arsse::$db->feedRulesGet($feedID) as $user => $rule) {
        $apply = static function($item) use ($rule) {
            return Rule::apply($rule['keep'], $rule['block'], $item->title, $item->categories);
        };
        // array_map() preserves the keys of its input when given a single array
        $this->filteredItems[$user] = [
            'new'     => array_map($apply, $this->newItems),
            'changed' => array_map($apply, $this->changedItems),
        ];
    }
}
2017-08-29 14:50:31 +00:00
}