mirror of
https://code.mensbeam.com/MensBeam/Arsse.git
synced 2025-01-11 02:12:40 +00:00
Avoid dynamic property creation with PicoFeed
This only leaves the Laminas XML deprecated behaviour to handle
This commit is contained in:
parent
0d6f8d2921
commit
fe06ffc176
4 changed files with 112 additions and 75 deletions
|
@ -1335,12 +1335,12 @@ class Database {
|
|||
"UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?",
|
||||
["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"]
|
||||
)->run(
|
||||
$feed->data->title,
|
||||
$feed->data->siteUrl,
|
||||
$feed->title,
|
||||
$feed->siteUrl,
|
||||
$feed->lastModified,
|
||||
$feed->resource->getEtag(),
|
||||
$feed->etag,
|
||||
$feed->nextFetch,
|
||||
sizeof($feed->data->items),
|
||||
sizeof($feed->items),
|
||||
$icon,
|
||||
$feedID
|
||||
);
|
||||
|
|
105
lib/Feed.php
105
lib/Feed.php
|
@ -6,6 +6,7 @@
|
|||
declare(strict_types=1);
|
||||
namespace JKingWeb\Arsse;
|
||||
|
||||
use JKingWeb\Arsse\Feed\Item;
|
||||
use JKingWeb\Arsse\Misc\Date;
|
||||
use JKingWeb\Arsse\Rule\Rule;
|
||||
use PicoFeed\PicoFeedException;
|
||||
|
@ -16,62 +17,62 @@ use PicoFeed\Reader\Favicon;
|
|||
use PicoFeed\Scraper\Scraper;
|
||||
|
||||
class Feed {
|
||||
public $data = null;
|
||||
public $title;
|
||||
public $siteUrl;
|
||||
public $iconUrl;
|
||||
public $iconType;
|
||||
public $iconData;
|
||||
public $resource;
|
||||
public $modified = false;
|
||||
public $lastModified;
|
||||
public $etag;
|
||||
public $nextFetch;
|
||||
public $items = [];
|
||||
public $newItems = [];
|
||||
public $changedItems = [];
|
||||
public $filteredItems = [];
|
||||
|
||||
/**
 * Determines the feed URL to use for a given candidate URL.
 *
 * The candidate is fetched with self::download(). If the reader detects a
 * feed format in the downloaded content, the candidate URL itself is
 * returned; otherwise the first link reported by the reader's find() for
 * the downloaded document is returned.
 *
 * libxml_use_internal_errors(false) is called before both the throw and the
 * normal return; the inline comments describe this as a workaround for a
 * PicoFeed memory leak.
 *
 * @param string $url The candidate URL to examine
 * @param string $username Passed through to self::download()
 * @param string $password Passed through to self::download()
 * @return string The candidate URL, or the first link found in its content
 * @throws Feed\Exception If find() yields no links; the exception wraps a PicoFeed SubscriptionNotFoundException
 */
public static function discover(string $url, string $username = '', string $password = ''): string {
|
||||
// fetch the candidate feed
|
||||
$f = self::download($url, "", "", $username, $password);
|
||||
if ($f->reader->detectFormat($f->getContent())) {
|
||||
[$client, $reader] = self::download($url, "", "", $username, $password);
|
||||
if ($reader->detectFormat($client->getContent())) {
|
||||
// if the prospective URL is a feed, use it
|
||||
$out = $url;
|
||||
} else {
|
||||
$links = $f->reader->find($f->getUrl(), $f->getContent());
|
||||
$links = $reader->find($client->getUrl(), $client->getContent());
|
||||
if (!$links) {
|
||||
// work around a PicoFeed memory leak
|
||||
libxml_use_internal_errors(false);
|
||||
throw new Feed\Exception("", ['url' => $url], new \PicoFeed\Reader\SubscriptionNotFoundException('Unable to find a subscription'));
|
||||
} else {
|
||||
$out = $links[0];
|
||||
}
|
||||
}
|
||||
// work around a PicoFeed memory leak
|
||||
libxml_use_internal_errors(false);
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
 * Lists every feed URL associated with a given candidate URL.
 *
 * The candidate is fetched with self::download(). If the reader detects a
 * feed format in the downloaded content, a single-element array holding the
 * candidate URL is returned; otherwise whatever the reader's find() reports
 * for the downloaded document is returned as-is.
 *
 * Unlike discover(), nothing is thrown here when find() reports no links,
 * and libxml_use_internal_errors() is not called.
 *
 * @param string $url The candidate URL to examine
 * @param string $username Passed through to self::download()
 * @param string $password Passed through to self::download()
 * @return array Either [$url] or the result of the reader's find()
 */
public static function discoverAll(string $url, string $username = '', string $password = ''): array {
|
||||
// fetch the candidate feed
|
||||
$f = self::download($url, "", "", $username, $password);
|
||||
if ($f->reader->detectFormat($f->getContent())) {
|
||||
[$client, $reader] = self::download($url, "", "", $username, $password);
|
||||
if ($reader->detectFormat($client->getContent())) {
|
||||
// if the prospective URL is a feed, use it
|
||||
return [$url];
|
||||
} else {
|
||||
return $f->reader->find($f->getUrl(), $f->getContent());
|
||||
return $reader->find($client->getUrl(), $client->getContent());
|
||||
}
|
||||
}
|
||||
|
||||
public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) {
|
||||
// fetch the feed
|
||||
$this->resource = self::download($url, $lastModified, $etag, $username, $password);
|
||||
[$client, $reader] = self::download($url, $lastModified, $etag, $username, $password);
|
||||
// format the HTTP Last-Modified date returned
|
||||
$lastMod = $this->resource->getLastModified();
|
||||
$lastMod = $client->getLastModified();
|
||||
if (strlen($lastMod ?? "")) {
|
||||
$this->lastModified = Date::normalize($lastMod, "http");
|
||||
}
|
||||
$this->modified = $this->resource->isModified();
|
||||
//parse the feed, if it has been modified
|
||||
$this->modified = $client->isModified();
|
||||
// get the ETag
|
||||
$this->etag = $client->getEtag();
|
||||
// parse the feed, if it has been modified
|
||||
if ($this->modified) {
|
||||
$this->parse();
|
||||
$this->parse($client, $reader);
|
||||
// ascertain whether there are any articles not in the database
|
||||
$this->matchToDatabase($feedID);
|
||||
// if caching header fields are not sent by the server, try to ascertain a last-modified date from the feed contents
|
||||
|
@ -112,12 +113,11 @@ class Feed {
|
|||
return $config;
|
||||
}
|
||||
|
||||
protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): Client {
|
||||
protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): array {
|
||||
try {
|
||||
$reader = new Reader(self::configure());
|
||||
$client = $reader->download($url, $lastModified, $etag, $username, $password);
|
||||
$client->reader = $reader;
|
||||
return $client;
|
||||
return [$client, $reader];
|
||||
} catch (PicoFeedException $e) {
|
||||
throw new Feed\Exception("", ['url' => $url], $e); // @codeCoverageIgnore
|
||||
} catch (\GuzzleHttp\Exception\GuzzleException $e) {
|
||||
|
@ -125,17 +125,17 @@ class Feed {
|
|||
}
|
||||
}
|
||||
|
||||
protected function parse(): void {
|
||||
protected function parse(Client $client, Reader $reader): void {
|
||||
try {
|
||||
$feed = $this->resource->reader->getParser(
|
||||
$this->resource->getUrl(),
|
||||
$this->resource->getContent(),
|
||||
$this->resource->getEncoding()
|
||||
$feed = $reader->getParser(
|
||||
$client->getUrl(),
|
||||
$client->getContent(),
|
||||
$client->getEncoding()
|
||||
)->execute();
|
||||
} catch (PicoFeedException $e) {
|
||||
throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e);
|
||||
throw new Feed\Exception("", ['url' => $client->getUrl()], $e);
|
||||
} catch (\GuzzleHttp\Exception\GuzzleException $e) { // @codeCoverageIgnore
|
||||
throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e); // @codeCoverageIgnore
|
||||
throw new Feed\Exception("", ['url' => $client->getUrl()], $e); // @codeCoverageIgnore
|
||||
}
|
||||
|
||||
// Grab the favicon for the feed, or null if no valid icon is found
|
||||
|
@ -150,6 +150,10 @@ class Feed {
|
|||
$this->iconUrl = $this->iconData = null;
|
||||
}
|
||||
|
||||
// Next gather all other feed-level information we want out of the feed
|
||||
$this->siteUrl = $feed->siteUrl;
|
||||
$this->title = $feed->title;
|
||||
|
||||
// PicoFeed does not provide valid ids when there is no id element. Its solution
|
||||
// of hashing the url, title, and content together for the id if there is no id
|
||||
// element is stupid. Many feeds are frankenstein mixtures of Atom and RSS, but
|
||||
|
@ -158,29 +162,38 @@ class Feed {
|
|||
// only be reserved for severely broken feeds.
|
||||
|
||||
foreach ($feed->items as $f) {
|
||||
// Hashes used for comparison to check for updates and also to identify when an
|
||||
// copy the basic information of an article
|
||||
$i = new Item;
|
||||
$i->url = $f->url;
|
||||
$i->title = $f->title;
|
||||
$i->content = $f->content;
|
||||
$i->author = $f->author;
|
||||
$i->publishedDate = $f->publishedDate;
|
||||
$i->updatedDate = $f->updatedDate;
|
||||
$i->enclosureType = $f->enclosureType;
|
||||
$i->enclosureUrl = $f->enclosureUrl;
|
||||
// add hashes used for comparison to check for updates and also to identify when an
|
||||
// id doesn't exist.
|
||||
$content = $f->content.$f->enclosureUrl.$f->enclosureType;
|
||||
// if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title
|
||||
if ($f->url === $feed->siteUrl && $f->title === $feed->siteUrl) {
|
||||
$f->urlTitleHash = "";
|
||||
$i->urlTitleHash = "";
|
||||
} else {
|
||||
$f->urlTitleHash = hash('sha256', $f->url.$f->title);
|
||||
$i->urlTitleHash = hash('sha256', $f->url.$f->title);
|
||||
}
|
||||
// if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed
|
||||
if (!strlen($content) && $f->url === $feed->siteUrl) {
|
||||
$f->urlContentHash = "";
|
||||
$i->urlContentHash = "";
|
||||
} else {
|
||||
$f->urlContentHash = hash('sha256', $f->url.$content);
|
||||
$i->urlContentHash = hash('sha256', $f->url.$content);
|
||||
}
|
||||
// if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed
|
||||
if (!strlen($content) && $f->title === $f->url) {
|
||||
$f->titleContentHash = "";
|
||||
$i->titleContentHash = "";
|
||||
} else {
|
||||
$f->titleContentHash = hash('sha256', $f->title.$content);
|
||||
$i->titleContentHash = hash('sha256', $f->title.$content);
|
||||
}
|
||||
$f->id = null;
|
||||
// prefer an Atom ID as the item's ID
|
||||
// next add an id; prefer an Atom ID as the item's ID
|
||||
$id = (string) $f->xml->children('http://www.w3.org/2005/Atom')->id;
|
||||
// otherwise use the RSS2 guid element
|
||||
if (!strlen($id)) {
|
||||
|
@ -192,11 +205,10 @@ class Feed {
|
|||
}
|
||||
// otherwise there is no ID; if there is one, hash it
|
||||
if (strlen($id)) {
|
||||
$f->id = hash('sha256', $id);
|
||||
$i->id = hash('sha256', $id);
|
||||
}
|
||||
|
||||
// PicoFeed also doesn't gather up categories, so we do this as well
|
||||
$f->categories = [];
|
||||
// first add Atom categories
|
||||
foreach ($f->xml->children('http://www.w3.org/2005/Atom')->category as $c) {
|
||||
// if the category has a label, use that
|
||||
|
@ -207,27 +219,28 @@ class Feed {
|
|||
}
|
||||
// ... assuming it has that much
|
||||
if (strlen($name)) {
|
||||
$f->categories[] = $name;
|
||||
$i->categories[] = $name;
|
||||
}
|
||||
}
|
||||
// next add RSS2 categories
|
||||
foreach ($f->xml->children()->category as $c) {
|
||||
$name = (string) $c;
|
||||
if (strlen($name)) {
|
||||
$f->categories[] = $name;
|
||||
$i->categories[] = $name;
|
||||
}
|
||||
}
|
||||
// and finally try Dublin Core subjects
|
||||
foreach ($f->xml->children('http://purl.org/dc/elements/1.1/')->subject as $c) {
|
||||
$name = (string) $c;
|
||||
if (strlen($name)) {
|
||||
$f->categories[] = $name;
|
||||
$i->categories[] = $name;
|
||||
}
|
||||
}
|
||||
//sort the results
|
||||
sort($f->categories);
|
||||
sort($i->categories);
|
||||
// add the item to the feed's list of items
|
||||
$this->items[] = $i;
|
||||
}
|
||||
$this->data = $feed;
|
||||
}
|
||||
|
||||
protected function deduplicateItems(array $items): array {
|
||||
|
@ -251,7 +264,7 @@ class Feed {
|
|||
($item->urlContentHash && $item->urlContentHash === $check->urlContentHash) ||
|
||||
($item->titleContentHash && $item->titleContentHash === $check->titleContentHash)
|
||||
) {
|
||||
if (// because newsfeeds are usually order newest-first, the later item should only be used if...
|
||||
if (// because newsfeeds are usually ordered newest-first, the later item should only be used if...
|
||||
// the later item has an update date and the existing item does not
|
||||
($item->updatedDate && !$check->updatedDate) ||
|
||||
// the later item has an update date newer than the existing item's
|
||||
|
@ -276,7 +289,7 @@ class Feed {
|
|||
|
||||
protected function matchToDatabase(int $feedID = null): void {
|
||||
// first perform deduplication on items
|
||||
$items = $this->deduplicateItems($this->data->items);
|
||||
$items = $this->deduplicateItems($this->items);
|
||||
// if we haven't been given a database feed ID to check against, all items are new
|
||||
if (is_null($feedID)) {
|
||||
$this->newItems = $items;
|
||||
|
@ -429,7 +442,7 @@ class Feed {
|
|||
|
||||
protected function gatherDates(): array {
|
||||
$dates = [];
|
||||
foreach ($this->data->items as $item) {
|
||||
foreach ($this->items as $item) {
|
||||
if ($item->updatedDate) {
|
||||
$dates[] = $item->updatedDate->getTimestamp();
|
||||
}
|
||||
|
|
24
lib/Feed/Item.php
Normal file
24
lib/Feed/Item.php
Normal file
|
@ -0,0 +1,24 @@
|
|||
<?php
|
||||
/** @license MIT
|
||||
* Copyright 2017 J. King, Dustin Wilson et al.
|
||||
* See LICENSE and AUTHORS files for details */
|
||||
|
||||
declare(strict_types=1);
|
||||
namespace JKingWeb\Arsse\Feed;
|
||||
|
||||
/**
 * Plain data holder for a single article of a parsed newsfeed.
 *
 * Every property is declared up front so that values can be assigned
 * without creating dynamic properties. As populated by Feed::parse():
 *
 * - $url, $title, $content, $author, $publishedDate, $updatedDate,
 *   $enclosureUrl and $enclosureType are copied from the parser's item;
 * - $urlTitleHash, $urlContentHash and $titleContentHash are each either a
 *   SHA-256 hex digest or an empty string;
 * - $id is the SHA-256 hex digest of the identifier found for the article,
 *   and is left at its default (null) when no identifier was found;
 * - $categories is a sorted list of category names.
 *
 * NOTE(review): $scrapedContent is not assigned in the code visible here;
 * presumably it is filled in when scraping is requested - confirm.
 */
class Item {
|
||||
public $id;
|
||||
public $url;
|
||||
public $title;
|
||||
public $author;
|
||||
public $publishedDate;
|
||||
public $updatedDate;
|
||||
public $urlContentHash;
|
||||
public $urlTitleHash;
|
||||
public $titleContentHash;
|
||||
public $content;
|
||||
public $scrapedContent;
|
||||
public $enclosureUrl;
|
||||
public $enclosureType;
|
||||
public $categories = [];
|
||||
}
|
|
@ -113,26 +113,26 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
|
|||
$h0 = "0a4f0e3768c8a5e9d8d9a16545ae4ff5b097f6dac3ad49555a94a7cace68ba73"; // hash of Atom ID
|
||||
$h1 = "a135beced0236b723d12f845ff20ec22d4fc3afe1130012618f027170d57cb4e"; // hash of RSS2 GUID
|
||||
$h2 = "205e986f4f8b3acfa281227beadb14f5e8c32c8dae4737f888c94c0df49c56f8"; // hash of Dublin Core identifier
|
||||
$this->assertSame($h0, $f->data->items[0]->id);
|
||||
$this->assertSame($h1, $f->data->items[1]->id);
|
||||
$this->assertSame($h2, $f->data->items[2]->id);
|
||||
$this->assertSame($h0, $f->items[0]->id);
|
||||
$this->assertSame($h1, $f->items[1]->id);
|
||||
$this->assertSame($h2, $f->items[2]->id);
|
||||
// check null hashes
|
||||
$h3 = "6287ba30f534e404e68356237e809683e311285d8b9f47d046ac58784eece052"; // URL hash
|
||||
$h4 = "6cbb5d2dcb11610a99eb3f633dc246690c0acf33327bf7534f95542caa8f27c4"; // title hash
|
||||
$h5 = "2b7c57ffa9adde92ccd1884fa1153a5bcd3211e48d99e27be5414cb078e6891c"; // content/enclosure hash
|
||||
$this->assertNotEquals("", $f->data->items[3]->urlTitleHash);
|
||||
$this->assertSame($h3, $f->data->items[3]->urlContentHash);
|
||||
$this->assertSame("", $f->data->items[3]->titleContentHash);
|
||||
$this->assertNotEquals("", $f->data->items[4]->urlTitleHash);
|
||||
$this->assertSame("", $f->data->items[4]->urlContentHash);
|
||||
$this->assertSame($h4, $f->data->items[4]->titleContentHash);
|
||||
$this->assertSame("", $f->data->items[5]->urlTitleHash);
|
||||
$this->assertNotEquals("", $f->data->items[5]->urlContentHash);
|
||||
$this->assertNotEquals("", $f->data->items[5]->titleContentHash);
|
||||
$this->assertNotEquals("", $f->items[3]->urlTitleHash);
|
||||
$this->assertSame($h3, $f->items[3]->urlContentHash);
|
||||
$this->assertSame("", $f->items[3]->titleContentHash);
|
||||
$this->assertNotEquals("", $f->items[4]->urlTitleHash);
|
||||
$this->assertSame("", $f->items[4]->urlContentHash);
|
||||
$this->assertSame($h4, $f->items[4]->titleContentHash);
|
||||
$this->assertSame("", $f->items[5]->urlTitleHash);
|
||||
$this->assertNotEquals("", $f->items[5]->urlContentHash);
|
||||
$this->assertNotEquals("", $f->items[5]->titleContentHash);
|
||||
// check null IDs
|
||||
$this->assertSame(null, $f->data->items[3]->id);
|
||||
$this->assertSame(null, $f->data->items[4]->id);
|
||||
$this->assertSame(null, $f->data->items[5]->id);
|
||||
$this->assertSame(null, $f->items[3]->id);
|
||||
$this->assertSame(null, $f->items[4]->id);
|
||||
$this->assertSame(null, $f->items[5]->id);
|
||||
// check categories
|
||||
$categories = [
|
||||
"Aniki!",
|
||||
|
@ -140,11 +140,11 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
|
|||
"Bodybuilders",
|
||||
"Men",
|
||||
];
|
||||
$this->assertSame([], $f->data->items[0]->categories);
|
||||
$this->assertSame([], $f->data->items[1]->categories);
|
||||
$this->assertSame([], $f->data->items[3]->categories);
|
||||
$this->assertSame([], $f->data->items[4]->categories);
|
||||
$this->assertSame($categories, $f->data->items[5]->categories);
|
||||
$this->assertSame([], $f->items[0]->categories);
|
||||
$this->assertSame([], $f->items[1]->categories);
|
||||
$this->assertSame([], $f->items[3]->categories);
|
||||
$this->assertSame([], $f->items[4]->categories);
|
||||
$this->assertSame($categories, $f->items[5]->categories);
|
||||
}
|
||||
|
||||
public function testDiscoverAFeedSuccessfully(): void {
|
||||
|
@ -232,7 +232,7 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
|
|||
$e = "78567a";
|
||||
$f = new Feed(null, $this->base.$url."?t=$t&e=$e", Date::transform($t, "http"), $e);
|
||||
$this->assertTime($t, $f->lastModified);
|
||||
$this->assertSame($e, $f->resource->getETag());
|
||||
$this->assertSame($e, $f->etag);
|
||||
}
|
||||
|
||||
public function provide304ResponseURLs() {
|
||||
|
@ -250,15 +250,15 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
|
|||
$t = time() - 2000;
|
||||
$f = new Feed(null, $this->base."Caching/200Past");
|
||||
$this->assertTime($t, $f->lastModified);
|
||||
$this->assertNotEmpty($f->resource->getETag());
|
||||
$this->assertNotEmpty($f->etag);
|
||||
$t = time() - 2000;
|
||||
$f = new Feed(null, $this->base."Caching/200Past", Date::transform(time(), "http"));
|
||||
$this->assertTime($t, $f->lastModified);
|
||||
$this->assertNotEmpty($f->resource->getETag());
|
||||
$this->assertNotEmpty($f->etag);
|
||||
$t = time() + 2000;
|
||||
$f = new Feed(null, $this->base."Caching/200Future");
|
||||
$this->assertTime($t, $f->lastModified);
|
||||
$this->assertNotEmpty($f->resource->getETag());
|
||||
$this->assertNotEmpty($f->etag);
|
||||
// these tests have no HTTP headers and rely on article dates
|
||||
$t = strtotime("2002-05-19T15:21:36Z");
|
||||
$f = new Feed(null, $this->base."Caching/200PubDateOnly");
|
||||
|
|
Loading…
Reference in a new issue