1
1
Fork 0
mirror of https://code.mensbeam.com/MensBeam/Arsse.git synced 2024-12-22 13:12:41 +00:00

Avoid dynamic property creation with PicoFeed

This only leaves the Laminas XML deprecated behaviour to handle
This commit is contained in:
J. King 2023-01-28 11:18:14 -05:00
parent 0d6f8d2921
commit fe06ffc176
4 changed files with 112 additions and 75 deletions

View file

@ -1335,12 +1335,12 @@ class Database {
"UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?", "UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?",
["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"] ["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"]
)->run( )->run(
$feed->data->title, $feed->title,
$feed->data->siteUrl, $feed->siteUrl,
$feed->lastModified, $feed->lastModified,
$feed->resource->getEtag(), $feed->etag,
$feed->nextFetch, $feed->nextFetch,
sizeof($feed->data->items), sizeof($feed->items),
$icon, $icon,
$feedID $feedID
); );

View file

@ -6,6 +6,7 @@
declare(strict_types=1); declare(strict_types=1);
namespace JKingWeb\Arsse; namespace JKingWeb\Arsse;
use JKingWeb\Arsse\Feed\Item;
use JKingWeb\Arsse\Misc\Date; use JKingWeb\Arsse\Misc\Date;
use JKingWeb\Arsse\Rule\Rule; use JKingWeb\Arsse\Rule\Rule;
use PicoFeed\PicoFeedException; use PicoFeed\PicoFeedException;
@ -16,62 +17,62 @@ use PicoFeed\Reader\Favicon;
use PicoFeed\Scraper\Scraper; use PicoFeed\Scraper\Scraper;
class Feed { class Feed {
public $data = null; public $title;
public $siteUrl;
public $iconUrl; public $iconUrl;
public $iconType; public $iconType;
public $iconData; public $iconData;
public $resource;
public $modified = false; public $modified = false;
public $lastModified; public $lastModified;
public $etag;
public $nextFetch; public $nextFetch;
public $items = [];
public $newItems = []; public $newItems = [];
public $changedItems = []; public $changedItems = [];
public $filteredItems = []; public $filteredItems = [];
public static function discover(string $url, string $username = '', string $password = ''): string { public static function discover(string $url, string $username = '', string $password = ''): string {
// fetch the candidate feed // fetch the candidate feed
$f = self::download($url, "", "", $username, $password); [$client, $reader] = self::download($url, "", "", $username, $password);
if ($f->reader->detectFormat($f->getContent())) { if ($reader->detectFormat($client->getContent())) {
// if the prospective URL is a feed, use it // if the prospective URL is a feed, use it
$out = $url; $out = $url;
} else { } else {
$links = $f->reader->find($f->getUrl(), $f->getContent()); $links = $reader->find($client->getUrl(), $client->getContent());
if (!$links) { if (!$links) {
// work around a PicoFeed memory leak
libxml_use_internal_errors(false);
throw new Feed\Exception("", ['url' => $url], new \PicoFeed\Reader\SubscriptionNotFoundException('Unable to find a subscription')); throw new Feed\Exception("", ['url' => $url], new \PicoFeed\Reader\SubscriptionNotFoundException('Unable to find a subscription'));
} else { } else {
$out = $links[0]; $out = $links[0];
} }
} }
// work around a PicoFeed memory leak
libxml_use_internal_errors(false);
return $out; return $out;
} }
public static function discoverAll(string $url, string $username = '', string $password = ''): array { public static function discoverAll(string $url, string $username = '', string $password = ''): array {
// fetch the candidate feed // fetch the candidate feed
$f = self::download($url, "", "", $username, $password); [$client, $reader] = self::download($url, "", "", $username, $password);
if ($f->reader->detectFormat($f->getContent())) { if ($reader->detectFormat($client->getContent())) {
// if the prospective URL is a feed, use it // if the prospective URL is a feed, use it
return [$url]; return [$url];
} else { } else {
return $f->reader->find($f->getUrl(), $f->getContent()); return $reader->find($client->getUrl(), $client->getContent());
} }
} }
public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) { public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) {
// fetch the feed // fetch the feed
$this->resource = self::download($url, $lastModified, $etag, $username, $password); [$client, $reader] = self::download($url, $lastModified, $etag, $username, $password);
// format the HTTP Last-Modified date returned // format the HTTP Last-Modified date returned
$lastMod = $this->resource->getLastModified(); $lastMod = $client->getLastModified();
if (strlen($lastMod ?? "")) { if (strlen($lastMod ?? "")) {
$this->lastModified = Date::normalize($lastMod, "http"); $this->lastModified = Date::normalize($lastMod, "http");
} }
$this->modified = $this->resource->isModified(); $this->modified = $client->isModified();
// get the ETag
$this->etag = $client->getEtag();
// parse the feed, if it has been modified // parse the feed, if it has been modified
if ($this->modified) { if ($this->modified) {
$this->parse(); $this->parse($client, $reader);
// ascertain whether there are any articles not in the database // ascertain whether there are any articles not in the database
$this->matchToDatabase($feedID); $this->matchToDatabase($feedID);
// if caching header fields are not sent by the server, try to ascertain a last-modified date from the feed contents // if caching header fields are not sent by the server, try to ascertain a last-modified date from the feed contents
@ -112,12 +113,11 @@ class Feed {
return $config; return $config;
} }
protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): Client { protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): array {
try { try {
$reader = new Reader(self::configure()); $reader = new Reader(self::configure());
$client = $reader->download($url, $lastModified, $etag, $username, $password); $client = $reader->download($url, $lastModified, $etag, $username, $password);
$client->reader = $reader; return [$client, $reader];
return $client;
} catch (PicoFeedException $e) { } catch (PicoFeedException $e) {
throw new Feed\Exception("", ['url' => $url], $e); // @codeCoverageIgnore throw new Feed\Exception("", ['url' => $url], $e); // @codeCoverageIgnore
} catch (\GuzzleHttp\Exception\GuzzleException $e) { } catch (\GuzzleHttp\Exception\GuzzleException $e) {
@ -125,17 +125,17 @@ class Feed {
} }
} }
protected function parse(): void { protected function parse(Client $client, Reader $reader): void {
try { try {
$feed = $this->resource->reader->getParser( $feed = $reader->getParser(
$this->resource->getUrl(), $client->getUrl(),
$this->resource->getContent(), $client->getContent(),
$this->resource->getEncoding() $client->getEncoding()
)->execute(); )->execute();
} catch (PicoFeedException $e) { } catch (PicoFeedException $e) {
throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e); throw new Feed\Exception("", ['url' => $client->getUrl()], $e);
} catch (\GuzzleHttp\Exception\GuzzleException $e) { // @codeCoverageIgnore } catch (\GuzzleHttp\Exception\GuzzleException $e) { // @codeCoverageIgnore
throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e); // @codeCoverageIgnore throw new Feed\Exception("", ['url' => $client->getUrl()], $e); // @codeCoverageIgnore
} }
// Grab the favicon for the feed, or null if no valid icon is found // Grab the favicon for the feed, or null if no valid icon is found
@ -150,6 +150,10 @@ class Feed {
$this->iconUrl = $this->iconData = null; $this->iconUrl = $this->iconData = null;
} }
// Next gather all other feed-level information we want out of the feed
$this->siteUrl = $feed->siteUrl;
$this->title = $feed->title;
// PicoFeed does not provide valid ids when there is no id element. Its solution // PicoFeed does not provide valid ids when there is no id element. Its solution
// of hashing the url, title, and content together for the id if there is no id // of hashing the url, title, and content together for the id if there is no id
// element is stupid. Many feeds are frankenstein mixtures of Atom and RSS, but // element is stupid. Many feeds are frankenstein mixtures of Atom and RSS, but
@ -158,29 +162,38 @@ class Feed {
// only be reserved for severely broken feeds. // only be reserved for severely broken feeds.
foreach ($feed->items as $f) { foreach ($feed->items as $f) {
// Hashes used for comparison to check for updates and also to identify when an // copy the basic information of an article
$i = new Item;
$i->url = $f->url;
$i->title = $f->title;
$i->content = $f->content;
$i->author = $f->author;
$i->publishedDate = $f->publishedDate;
$i->updatedDate = $f->updatedDate;
$i->enclosureType = $f->enclosureType;
$i->enclosureUrl = $f->enclosureUrl;
// add hashes used for comparison to check for updates and also to identify when an
// id doesn't exist. // id doesn't exist.
$content = $f->content.$f->enclosureUrl.$f->enclosureType; $content = $f->content.$f->enclosureUrl.$f->enclosureType;
// if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title // if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title
if ($f->url === $feed->siteUrl && $f->title === $feed->siteUrl) { if ($f->url === $feed->siteUrl && $f->title === $feed->siteUrl) {
$f->urlTitleHash = ""; $i->urlTitleHash = "";
} else { } else {
$f->urlTitleHash = hash('sha256', $f->url.$f->title); $i->urlTitleHash = hash('sha256', $f->url.$f->title);
} }
// if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed // if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed
if (!strlen($content) && $f->url === $feed->siteUrl) { if (!strlen($content) && $f->url === $feed->siteUrl) {
$f->urlContentHash = ""; $i->urlContentHash = "";
} else { } else {
$f->urlContentHash = hash('sha256', $f->url.$content); $i->urlContentHash = hash('sha256', $f->url.$content);
} }
// if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed // if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed
if (!strlen($content) && $f->title === $f->url) { if (!strlen($content) && $f->title === $f->url) {
$f->titleContentHash = ""; $i->titleContentHash = "";
} else { } else {
$f->titleContentHash = hash('sha256', $f->title.$content); $i->titleContentHash = hash('sha256', $f->title.$content);
} }
$f->id = null; // next add an id; prefer an Atom ID as the item's ID
// prefer an Atom ID as the item's ID
$id = (string) $f->xml->children('http://www.w3.org/2005/Atom')->id; $id = (string) $f->xml->children('http://www.w3.org/2005/Atom')->id;
// otherwise use the RSS2 guid element // otherwise use the RSS2 guid element
if (!strlen($id)) { if (!strlen($id)) {
@ -192,11 +205,10 @@ class Feed {
} }
// otherwise there is no ID; if there is one, hash it // otherwise there is no ID; if there is one, hash it
if (strlen($id)) { if (strlen($id)) {
$f->id = hash('sha256', $id); $i->id = hash('sha256', $id);
} }
// PicoFeed also doesn't gather up categories, so we do this as well // PicoFeed also doesn't gather up categories, so we do this as well
$f->categories = [];
// first add Atom categories // first add Atom categories
foreach ($f->xml->children('http://www.w3.org/2005/Atom')->category as $c) { foreach ($f->xml->children('http://www.w3.org/2005/Atom')->category as $c) {
// if the category has a label, use that // if the category has a label, use that
@ -207,27 +219,28 @@ class Feed {
} }
// ... assuming it has that much // ... assuming it has that much
if (strlen($name)) { if (strlen($name)) {
$f->categories[] = $name; $i->categories[] = $name;
} }
} }
// next add RSS2 categories // next add RSS2 categories
foreach ($f->xml->children()->category as $c) { foreach ($f->xml->children()->category as $c) {
$name = (string) $c; $name = (string) $c;
if (strlen($name)) { if (strlen($name)) {
$f->categories[] = $name; $i->categories[] = $name;
} }
} }
// and finally try Dublin Core subjects // and finally try Dublin Core subjects
foreach ($f->xml->children('http://purl.org/dc/elements/1.1/')->subject as $c) { foreach ($f->xml->children('http://purl.org/dc/elements/1.1/')->subject as $c) {
$name = (string) $c; $name = (string) $c;
if (strlen($name)) { if (strlen($name)) {
$f->categories[] = $name; $i->categories[] = $name;
} }
} }
//sort the results //sort the results
sort($f->categories); sort($i->categories);
// add the item to the feed's list of items
$this->items[] = $i;
} }
$this->data = $feed;
} }
protected function deduplicateItems(array $items): array { protected function deduplicateItems(array $items): array {
@ -251,7 +264,7 @@ class Feed {
($item->urlContentHash && $item->urlContentHash === $check->urlContentHash) || ($item->urlContentHash && $item->urlContentHash === $check->urlContentHash) ||
($item->titleContentHash && $item->titleContentHash === $check->titleContentHash) ($item->titleContentHash && $item->titleContentHash === $check->titleContentHash)
) { ) {
if (// because newsfeeds are usually order newest-first, the later item should only be used if... if (// because newsfeeds are usually ordered newest-first, the later item should only be used if...
// the later item has an update date and the existing item does not // the later item has an update date and the existing item does not
($item->updatedDate && !$check->updatedDate) || ($item->updatedDate && !$check->updatedDate) ||
// the later item has an update date newer than the existing item's // the later item has an update date newer than the existing item's
@ -276,7 +289,7 @@ class Feed {
protected function matchToDatabase(int $feedID = null): void { protected function matchToDatabase(int $feedID = null): void {
// first perform deduplication on items // first perform deduplication on items
$items = $this->deduplicateItems($this->data->items); $items = $this->deduplicateItems($this->items);
// if we haven't been given a database feed ID to check against, all items are new // if we haven't been given a database feed ID to check against, all items are new
if (is_null($feedID)) { if (is_null($feedID)) {
$this->newItems = $items; $this->newItems = $items;
@ -429,7 +442,7 @@ class Feed {
protected function gatherDates(): array { protected function gatherDates(): array {
$dates = []; $dates = [];
foreach ($this->data->items as $item) { foreach ($this->items as $item) {
if ($item->updatedDate) { if ($item->updatedDate) {
$dates[] = $item->updatedDate->getTimestamp(); $dates[] = $item->updatedDate->getTimestamp();
} }

24
lib/Feed/Item.php Normal file
View file

@ -0,0 +1,24 @@
<?php
/** @license MIT
* Copyright 2017 J. King, Dustin Wilson et al.
* See LICENSE and AUTHORS files for details */
declare(strict_types=1);
namespace JKingWeb\Arsse\Feed;
/**
 * Plain data container for a single article parsed from a newsfeed.
 *
 * Feed::parse() creates one instance per parsed entry: it copies the basic
 * fields from the feed parser's item and then adds the identifier, the
 * comparison hashes and the categories which it computes itself.
 *
 * All members are public and untyped; every member except $categories has
 * no explicit default and is therefore null until assigned.
 */
class Item {
/** @var string|null SHA-256 hash of the entry's identifier (Atom ID, RSS2 GUID or Dublin Core identifier); left null when the entry has no identifier */
public $id;
/** Link URL of the article as reported by the feed parser; Feed::parse() treats a value equal to the feed's site URL as "no link" */
public $url;
/** Title of the article as reported by the feed parser; Feed::parse() treats a title equal to the article's own URL as "no title" */
public $title;
/** Author of the article, copied unchanged from the feed parser's item */
public $author;
/** Publication date copied from the feed parser's item; presumably a date-time object like $updatedDate -- TODO confirm */
public $publishedDate;
/** Last-update date copied from the feed parser's item; when set, Feed::gatherDates() calls getTimestamp() on it, so it must be a date-time object */
public $updatedDate;
/** @var string SHA-256 hash of the URL followed by the content and enclosure URL/type; empty string when there is no content and the URL equals the feed's site URL */
public $urlContentHash;
/** @var string SHA-256 hash of the URL followed by the title; empty string when both the URL and the title equal the feed's site URL */
public $urlTitleHash;
/** @var string SHA-256 hash of the title followed by the content and enclosure URL/type; empty string when there is no content and the title equals the URL */
public $titleContentHash;
/** Content of the article, copied unchanged from the feed parser's item */
public $content;
/** Not assigned by Feed::parse(); presumably holds content obtained by scraping the article's page -- TODO confirm against the scraping code */
public $scrapedContent;
/** URL of the article's enclosure, copied from the feed parser's item; also folded into the content hashes */
public $enclosureUrl;
/** Media type of the article's enclosure, copied from the feed parser's item; also folded into the content hashes */
public $enclosureType;
/** @var string[] Category names gathered from Atom category, RSS2 category and Dublin Core subject elements, sorted by Feed::parse() */
public $categories = [];
}

View file

@ -113,26 +113,26 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
$h0 = "0a4f0e3768c8a5e9d8d9a16545ae4ff5b097f6dac3ad49555a94a7cace68ba73"; // hash of Atom ID $h0 = "0a4f0e3768c8a5e9d8d9a16545ae4ff5b097f6dac3ad49555a94a7cace68ba73"; // hash of Atom ID
$h1 = "a135beced0236b723d12f845ff20ec22d4fc3afe1130012618f027170d57cb4e"; // hash of RSS2 GUID $h1 = "a135beced0236b723d12f845ff20ec22d4fc3afe1130012618f027170d57cb4e"; // hash of RSS2 GUID
$h2 = "205e986f4f8b3acfa281227beadb14f5e8c32c8dae4737f888c94c0df49c56f8"; // hash of Dublin Core identifier $h2 = "205e986f4f8b3acfa281227beadb14f5e8c32c8dae4737f888c94c0df49c56f8"; // hash of Dublin Core identifier
$this->assertSame($h0, $f->data->items[0]->id); $this->assertSame($h0, $f->items[0]->id);
$this->assertSame($h1, $f->data->items[1]->id); $this->assertSame($h1, $f->items[1]->id);
$this->assertSame($h2, $f->data->items[2]->id); $this->assertSame($h2, $f->items[2]->id);
// check null hashes // check null hashes
$h3 = "6287ba30f534e404e68356237e809683e311285d8b9f47d046ac58784eece052"; // URL hash $h3 = "6287ba30f534e404e68356237e809683e311285d8b9f47d046ac58784eece052"; // URL hash
$h4 = "6cbb5d2dcb11610a99eb3f633dc246690c0acf33327bf7534f95542caa8f27c4"; // title hash $h4 = "6cbb5d2dcb11610a99eb3f633dc246690c0acf33327bf7534f95542caa8f27c4"; // title hash
$h5 = "2b7c57ffa9adde92ccd1884fa1153a5bcd3211e48d99e27be5414cb078e6891c"; // content/enclosure hash $h5 = "2b7c57ffa9adde92ccd1884fa1153a5bcd3211e48d99e27be5414cb078e6891c"; // content/enclosure hash
$this->assertNotEquals("", $f->data->items[3]->urlTitleHash); $this->assertNotEquals("", $f->items[3]->urlTitleHash);
$this->assertSame($h3, $f->data->items[3]->urlContentHash); $this->assertSame($h3, $f->items[3]->urlContentHash);
$this->assertSame("", $f->data->items[3]->titleContentHash); $this->assertSame("", $f->items[3]->titleContentHash);
$this->assertNotEquals("", $f->data->items[4]->urlTitleHash); $this->assertNotEquals("", $f->items[4]->urlTitleHash);
$this->assertSame("", $f->data->items[4]->urlContentHash); $this->assertSame("", $f->items[4]->urlContentHash);
$this->assertSame($h4, $f->data->items[4]->titleContentHash); $this->assertSame($h4, $f->items[4]->titleContentHash);
$this->assertSame("", $f->data->items[5]->urlTitleHash); $this->assertSame("", $f->items[5]->urlTitleHash);
$this->assertNotEquals("", $f->data->items[5]->urlContentHash); $this->assertNotEquals("", $f->items[5]->urlContentHash);
$this->assertNotEquals("", $f->data->items[5]->titleContentHash); $this->assertNotEquals("", $f->items[5]->titleContentHash);
// check null IDs // check null IDs
$this->assertSame(null, $f->data->items[3]->id); $this->assertSame(null, $f->items[3]->id);
$this->assertSame(null, $f->data->items[4]->id); $this->assertSame(null, $f->items[4]->id);
$this->assertSame(null, $f->data->items[5]->id); $this->assertSame(null, $f->items[5]->id);
// check categories // check categories
$categories = [ $categories = [
"Aniki!", "Aniki!",
@ -140,11 +140,11 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
"Bodybuilders", "Bodybuilders",
"Men", "Men",
]; ];
$this->assertSame([], $f->data->items[0]->categories); $this->assertSame([], $f->items[0]->categories);
$this->assertSame([], $f->data->items[1]->categories); $this->assertSame([], $f->items[1]->categories);
$this->assertSame([], $f->data->items[3]->categories); $this->assertSame([], $f->items[3]->categories);
$this->assertSame([], $f->data->items[4]->categories); $this->assertSame([], $f->items[4]->categories);
$this->assertSame($categories, $f->data->items[5]->categories); $this->assertSame($categories, $f->items[5]->categories);
} }
public function testDiscoverAFeedSuccessfully(): void { public function testDiscoverAFeedSuccessfully(): void {
@ -232,7 +232,7 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
$e = "78567a"; $e = "78567a";
$f = new Feed(null, $this->base.$url."?t=$t&e=$e", Date::transform($t, "http"), $e); $f = new Feed(null, $this->base.$url."?t=$t&e=$e", Date::transform($t, "http"), $e);
$this->assertTime($t, $f->lastModified); $this->assertTime($t, $f->lastModified);
$this->assertSame($e, $f->resource->getETag()); $this->assertSame($e, $f->etag);
} }
public function provide304ResponseURLs() { public function provide304ResponseURLs() {
@ -250,15 +250,15 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
$t = time() - 2000; $t = time() - 2000;
$f = new Feed(null, $this->base."Caching/200Past"); $f = new Feed(null, $this->base."Caching/200Past");
$this->assertTime($t, $f->lastModified); $this->assertTime($t, $f->lastModified);
$this->assertNotEmpty($f->resource->getETag()); $this->assertNotEmpty($f->etag);
$t = time() - 2000; $t = time() - 2000;
$f = new Feed(null, $this->base."Caching/200Past", Date::transform(time(), "http")); $f = new Feed(null, $this->base."Caching/200Past", Date::transform(time(), "http"));
$this->assertTime($t, $f->lastModified); $this->assertTime($t, $f->lastModified);
$this->assertNotEmpty($f->resource->getETag()); $this->assertNotEmpty($f->etag);
$t = time() + 2000; $t = time() + 2000;
$f = new Feed(null, $this->base."Caching/200Future"); $f = new Feed(null, $this->base."Caching/200Future");
$this->assertTime($t, $f->lastModified); $this->assertTime($t, $f->lastModified);
$this->assertNotEmpty($f->resource->getETag()); $this->assertNotEmpty($f->etag);
// these tests have no HTTP headers and rely on article dates // these tests have no HTTP headers and rely on article dates
$t = strtotime("2002-05-19T15:21:36Z"); $t = strtotime("2002-05-19T15:21:36Z");
$f = new Feed(null, $this->base."Caching/200PubDateOnly"); $f = new Feed(null, $this->base."Caching/200PubDateOnly");