Avoid dynamic property creation with PicoFeed
This leaves only the Laminas XML deprecated behaviour to handle
parent 0d6f8d2921
commit fe06ffc176
4 changed files with 112 additions and 75 deletions
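For context (an editorial aside, not part of the commit): PHP 8.2 deprecates creating a property on an object whose class does not declare it, which is what happens when extra fields are assigned to PicoFeed's client and item objects. A minimal sketch of the behaviour being avoided, using hypothetical names:

<?php
// Sketch only; the class and property names here are illustrative, not from the commit.
class Example {
    public $declared;
}

$e = new Example();
$e->declared = "fine";   // declared property: no warning
$e->undeclared = "oops"; // PHP 8.2+: "Creation of dynamic property Example::$undeclared is deprecated"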
@@ -1335,12 +1335,12 @@ class Database {
             "UPDATE arsse_feeds SET title = ?, source = ?, updated = CURRENT_TIMESTAMP, modified = ?, etag = ?, err_count = 0, err_msg = '', next_fetch = ?, size = ?, icon = ? WHERE id = ?",
             ["str", "str", "datetime", "strict str", "datetime", "int", "int", "int"]
         )->run(
-            $feed->data->title,
-            $feed->data->siteUrl,
+            $feed->title,
+            $feed->siteUrl,
             $feed->lastModified,
-            $feed->resource->getEtag(),
+            $feed->etag,
             $feed->nextFetch,
-            sizeof($feed->data->items),
+            sizeof($feed->items),
             $icon,
             $feedID
         );
lib/Feed.php (105 changed lines)
@@ -6,6 +6,7 @@
 declare(strict_types=1);
 namespace JKingWeb\Arsse;
 
+use JKingWeb\Arsse\Feed\Item;
 use JKingWeb\Arsse\Misc\Date;
 use JKingWeb\Arsse\Rule\Rule;
 use PicoFeed\PicoFeedException;
@@ -16,62 +17,62 @@ use PicoFeed\Reader\Favicon;
 use PicoFeed\Scraper\Scraper;
 
 class Feed {
-    public $data = null;
+    public $title;
+    public $siteUrl;
     public $iconUrl;
     public $iconType;
     public $iconData;
-    public $resource;
     public $modified = false;
     public $lastModified;
+    public $etag;
     public $nextFetch;
+    public $items = [];
     public $newItems = [];
     public $changedItems = [];
     public $filteredItems = [];
 
     public static function discover(string $url, string $username = '', string $password = ''): string {
         // fetch the candidate feed
-        $f = self::download($url, "", "", $username, $password);
-        if ($f->reader->detectFormat($f->getContent())) {
+        [$client, $reader] = self::download($url, "", "", $username, $password);
+        if ($reader->detectFormat($client->getContent())) {
             // if the prospective URL is a feed, use it
             $out = $url;
         } else {
-            $links = $f->reader->find($f->getUrl(), $f->getContent());
+            $links = $reader->find($client->getUrl(), $client->getContent());
             if (!$links) {
                 // work around a PicoFeed memory leak
                 libxml_use_internal_errors(false);
                 throw new Feed\Exception("", ['url' => $url], new \PicoFeed\Reader\SubscriptionNotFoundException('Unable to find a subscription'));
             } else {
                 $out = $links[0];
             }
         }
         // work around a PicoFeed memory leak
         libxml_use_internal_errors(false);
         return $out;
     }
 
     public static function discoverAll(string $url, string $username = '', string $password = ''): array {
         // fetch the candidate feed
-        $f = self::download($url, "", "", $username, $password);
-        if ($f->reader->detectFormat($f->getContent())) {
+        [$client, $reader] = self::download($url, "", "", $username, $password);
+        if ($reader->detectFormat($client->getContent())) {
             // if the prospective URL is a feed, use it
             return [$url];
         } else {
-            return $f->reader->find($f->getUrl(), $f->getContent());
+            return $reader->find($client->getUrl(), $client->getContent());
         }
     }
 
     public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '', bool $scrape = false) {
         // fetch the feed
-        $this->resource = self::download($url, $lastModified, $etag, $username, $password);
+        [$client, $reader] = self::download($url, $lastModified, $etag, $username, $password);
         // format the HTTP Last-Modified date returned
-        $lastMod = $this->resource->getLastModified();
+        $lastMod = $client->getLastModified();
         if (strlen($lastMod ?? "")) {
             $this->lastModified = Date::normalize($lastMod, "http");
         }
-        $this->modified = $this->resource->isModified();
-        //parse the feed, if it has been modified
+        $this->modified = $client->isModified();
+        // get the ETag
+        $this->etag = $client->getEtag();
+        // parse the feed, if it has been modified
         if ($this->modified) {
-            $this->parse();
+            $this->parse($client, $reader);
             // ascertain whether there are any articles not in the database
             $this->matchToDatabase($feedID);
             // if caching header fields are not sent by the server, try to ascertain a last-modified date from the feed contents
@@ -112,12 +113,11 @@ class Feed {
         return $config;
     }
 
-    protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): Client {
+    protected static function download(string $url, string $lastModified, string $etag, string $username, string $password): array {
        try {
            $reader = new Reader(self::configure());
            $client = $reader->download($url, $lastModified, $etag, $username, $password);
-            $client->reader = $reader;
-            return $client;
+            return [$client, $reader];
        } catch (PicoFeedException $e) {
            throw new Feed\Exception("", ['url' => $url], $e); // @codeCoverageIgnore
        } catch (\GuzzleHttp\Exception\GuzzleException $e) {
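An aside on the approach (my reading of the diff, not stated in the commit message): download() previously handed the Reader to callers by assigning it to an undeclared $reader property on PicoFeed's Client object; returning the two objects as an array keeps both available without creating a dynamic property, at the cost of destructuring at each call site.

// Sketch of the new calling convention, reusing identifiers from the diff above.
[$client, $reader] = self::download($url, $lastModified, $etag, $username, $password);
if ($reader->detectFormat($client->getContent())) {
    // the response is a parseable newsfeed
}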
@@ -125,17 +125,17 @@ class Feed {
         }
     }
 
-    protected function parse(): void {
+    protected function parse(Client $client, Reader $reader): void {
         try {
-            $feed = $this->resource->reader->getParser(
-                $this->resource->getUrl(),
-                $this->resource->getContent(),
-                $this->resource->getEncoding()
+            $feed = $reader->getParser(
+                $client->getUrl(),
+                $client->getContent(),
+                $client->getEncoding()
             )->execute();
         } catch (PicoFeedException $e) {
-            throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e);
+            throw new Feed\Exception("", ['url' => $client->getUrl()], $e);
         } catch (\GuzzleHttp\Exception\GuzzleException $e) { // @codeCoverageIgnore
-            throw new Feed\Exception("", ['url' => $this->resource->getUrl()], $e); // @codeCoverageIgnore
+            throw new Feed\Exception("", ['url' => $client->getUrl()], $e); // @codeCoverageIgnore
         }
 
         // Grab the favicon for the feed, or null if no valid icon is found
@@ -150,6 +150,10 @@ class Feed {
             $this->iconUrl = $this->iconData = null;
         }
 
+        // Next gather all other feed-level information we want out of the feed
+        $this->siteUrl = $feed->siteUrl;
+        $this->title = $feed->title;
+
         // PicoFeed does not provide valid ids when there is no id element. Its solution
         // of hashing the url, title, and content together for the id if there is no id
         // element is stupid. Many feeds are frankenstein mixtures of Atom and RSS, but
@@ -158,29 +162,38 @@ class Feed {
         // only be reserved for severely broken feeds.
 
         foreach ($feed->items as $f) {
-            // Hashes used for comparison to check for updates and also to identify when an
+            // copy the basic information of an article
+            $i = new Item;
+            $i->url = $f->url;
+            $i->title = $f->title;
+            $i->content = $f->content;
+            $i->author = $f->author;
+            $i->publishedDate = $f->publishedDate;
+            $i->updatedDate = $f->updatedDate;
+            $i->enclosureType = $f->enclosureType;
+            $i->enclosureUrl = $f->enclosureUrl;
+            // add hashes used for comparison to check for updates and also to identify when an
             // id doesn't exist.
             $content = $f->content.$f->enclosureUrl.$f->enclosureType;
             // if the item link URL and item title are both equal to the feed link URL, then the item has neither a link URL nor a title
             if ($f->url === $feed->siteUrl && $f->title === $feed->siteUrl) {
-                $f->urlTitleHash = "";
+                $i->urlTitleHash = "";
             } else {
-                $f->urlTitleHash = hash('sha256', $f->url.$f->title);
+                $i->urlTitleHash = hash('sha256', $f->url.$f->title);
             }
             // if the item link URL is equal to the feed link URL, it has no link URL; if there is additionally no content, these should not be hashed
             if (!strlen($content) && $f->url === $feed->siteUrl) {
-                $f->urlContentHash = "";
+                $i->urlContentHash = "";
             } else {
-                $f->urlContentHash = hash('sha256', $f->url.$content);
+                $i->urlContentHash = hash('sha256', $f->url.$content);
             }
             // if the item's title is the same as its link URL, it has no title; if there is additionally no content, these should not be hashed
             if (!strlen($content) && $f->title === $f->url) {
-                $f->titleContentHash = "";
+                $i->titleContentHash = "";
             } else {
-                $f->titleContentHash = hash('sha256', $f->title.$content);
+                $i->titleContentHash = hash('sha256', $f->title.$content);
             }
-            $f->id = null;
-            // prefer an Atom ID as the item's ID
+            // next add an id; prefer an Atom ID as the item's ID
             $id = (string) $f->xml->children('http://www.w3.org/2005/Atom')->id;
             // otherwise use the RSS2 guid element
             if (!strlen($id)) {
@@ -192,11 +205,10 @@ class Feed {
             }
             // otherwise there is no ID; if there is one, hash it
             if (strlen($id)) {
-                $f->id = hash('sha256', $id);
+                $i->id = hash('sha256', $id);
             }
 
             // PicoFeed also doesn't gather up categories, so we do this as well
-            $f->categories = [];
             // first add Atom categories
             foreach ($f->xml->children('http://www.w3.org/2005/Atom')->category as $c) {
                 // if the category has a label, use that
@@ -207,27 +219,28 @@ class Feed {
                 }
                 // ... assuming it has that much
                 if (strlen($name)) {
-                    $f->categories[] = $name;
+                    $i->categories[] = $name;
                 }
             }
             // next add RSS2 categories
             foreach ($f->xml->children()->category as $c) {
                 $name = (string) $c;
                 if (strlen($name)) {
-                    $f->categories[] = $name;
+                    $i->categories[] = $name;
                 }
             }
             // and finally try Dublin Core subjects
             foreach ($f->xml->children('http://purl.org/dc/elements/1.1/')->subject as $c) {
                 $name = (string) $c;
                 if (strlen($name)) {
-                    $f->categories[] = $name;
+                    $i->categories[] = $name;
                 }
             }
             //sort the results
-            sort($f->categories);
+            sort($i->categories);
+            // add the item to the feed's list of items
+            $this->items[] = $i;
         }
-        $this->data = $feed;
     }
 
     protected function deduplicateItems(array $items): array {
@@ -251,7 +264,7 @@ class Feed {
                 ($item->urlContentHash && $item->urlContentHash === $check->urlContentHash) ||
                 ($item->titleContentHash && $item->titleContentHash === $check->titleContentHash)
             ) {
-                if (// because newsfeeds are usually order newest-first, the later item should only be used if...
+                if (// because newsfeeds are usually ordered newest-first, the later item should only be used if...
                     // the later item has an update date and the existing item does not
                     ($item->updatedDate && !$check->updatedDate) ||
                     // the later item has an update date newer than the existing item's
@@ -276,7 +289,7 @@ class Feed {
 
     protected function matchToDatabase(int $feedID = null): void {
         // first perform deduplication on items
-        $items = $this->deduplicateItems($this->data->items);
+        $items = $this->deduplicateItems($this->items);
         // if we haven't been given a database feed ID to check against, all items are new
         if (is_null($feedID)) {
             $this->newItems = $items;
@@ -429,7 +442,7 @@ class Feed {
 
     protected function gatherDates(): array {
         $dates = [];
-        foreach ($this->data->items as $item) {
+        foreach ($this->items as $item) {
             if ($item->updatedDate) {
                 $dates[] = $item->updatedDate->getTimestamp();
             }
lib/Feed/Item.php (new file, 24 lines)
@@ -0,0 +1,24 @@
+<?php
+/** @license MIT
+ * Copyright 2017 J. King, Dustin Wilson et al.
+ * See LICENSE and AUTHORS files for details */
+
+declare(strict_types=1);
+namespace JKingWeb\Arsse\Feed;
+
+class Item {
+    public $id;
+    public $url;
+    public $title;
+    public $author;
+    public $publishedDate;
+    public $updatedDate;
+    public $urlContentHash;
+    public $urlTitleHash;
+    public $titleContentHash;
+    public $content;
+    public $scrapedContent;
+    public $enclosureUrl;
+    public $enclosureType;
+    public $categories = [];
+}
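A closing aside (not part of the diff): Feed\Item declares every field the parsing loop assigns, so populating it raises no dynamic-property deprecation, unlike setting the same fields on PicoFeed's own item objects. A minimal sketch of the pattern, with example values:

use JKingWeb\Arsse\Feed\Item;

// Sketch only: these assignments target declared public properties of Item.
$i = new Item;
$i->title = "Example article";
$i->urlTitleHash = hash('sha256', "http://example.com/item"."Example article");
$i->categories[] = "Example category";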
@@ -113,26 +113,26 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
         $h0 = "0a4f0e3768c8a5e9d8d9a16545ae4ff5b097f6dac3ad49555a94a7cace68ba73"; // hash of Atom ID
         $h1 = "a135beced0236b723d12f845ff20ec22d4fc3afe1130012618f027170d57cb4e"; // hash of RSS2 GUID
         $h2 = "205e986f4f8b3acfa281227beadb14f5e8c32c8dae4737f888c94c0df49c56f8"; // hash of Dublin Core identifier
-        $this->assertSame($h0, $f->data->items[0]->id);
-        $this->assertSame($h1, $f->data->items[1]->id);
-        $this->assertSame($h2, $f->data->items[2]->id);
+        $this->assertSame($h0, $f->items[0]->id);
+        $this->assertSame($h1, $f->items[1]->id);
+        $this->assertSame($h2, $f->items[2]->id);
         // check null hashes
         $h3 = "6287ba30f534e404e68356237e809683e311285d8b9f47d046ac58784eece052"; // URL hash
         $h4 = "6cbb5d2dcb11610a99eb3f633dc246690c0acf33327bf7534f95542caa8f27c4"; // title hash
         $h5 = "2b7c57ffa9adde92ccd1884fa1153a5bcd3211e48d99e27be5414cb078e6891c"; // content/enclosure hash
-        $this->assertNotEquals("", $f->data->items[3]->urlTitleHash);
-        $this->assertSame($h3, $f->data->items[3]->urlContentHash);
-        $this->assertSame("", $f->data->items[3]->titleContentHash);
-        $this->assertNotEquals("", $f->data->items[4]->urlTitleHash);
-        $this->assertSame("", $f->data->items[4]->urlContentHash);
-        $this->assertSame($h4, $f->data->items[4]->titleContentHash);
-        $this->assertSame("", $f->data->items[5]->urlTitleHash);
-        $this->assertNotEquals("", $f->data->items[5]->urlContentHash);
-        $this->assertNotEquals("", $f->data->items[5]->titleContentHash);
+        $this->assertNotEquals("", $f->items[3]->urlTitleHash);
+        $this->assertSame($h3, $f->items[3]->urlContentHash);
+        $this->assertSame("", $f->items[3]->titleContentHash);
+        $this->assertNotEquals("", $f->items[4]->urlTitleHash);
+        $this->assertSame("", $f->items[4]->urlContentHash);
+        $this->assertSame($h4, $f->items[4]->titleContentHash);
+        $this->assertSame("", $f->items[5]->urlTitleHash);
+        $this->assertNotEquals("", $f->items[5]->urlContentHash);
+        $this->assertNotEquals("", $f->items[5]->titleContentHash);
         // check null IDs
-        $this->assertSame(null, $f->data->items[3]->id);
-        $this->assertSame(null, $f->data->items[4]->id);
-        $this->assertSame(null, $f->data->items[5]->id);
+        $this->assertSame(null, $f->items[3]->id);
+        $this->assertSame(null, $f->items[4]->id);
+        $this->assertSame(null, $f->items[5]->id);
         // check categories
         $categories = [
             "Aniki!",
@@ -140,11 +140,11 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
             "Bodybuilders",
             "Men",
         ];
-        $this->assertSame([], $f->data->items[0]->categories);
-        $this->assertSame([], $f->data->items[1]->categories);
-        $this->assertSame([], $f->data->items[3]->categories);
-        $this->assertSame([], $f->data->items[4]->categories);
-        $this->assertSame($categories, $f->data->items[5]->categories);
+        $this->assertSame([], $f->items[0]->categories);
+        $this->assertSame([], $f->items[1]->categories);
+        $this->assertSame([], $f->items[3]->categories);
+        $this->assertSame([], $f->items[4]->categories);
+        $this->assertSame($categories, $f->items[5]->categories);
     }
 
     public function testDiscoverAFeedSuccessfully(): void {
@@ -232,7 +232,7 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
         $e = "78567a";
         $f = new Feed(null, $this->base.$url."?t=$t&e=$e", Date::transform($t, "http"), $e);
         $this->assertTime($t, $f->lastModified);
-        $this->assertSame($e, $f->resource->getETag());
+        $this->assertSame($e, $f->etag);
     }
 
     public function provide304ResponseURLs() {
@@ -250,15 +250,15 @@ class TestFeed extends \JKingWeb\Arsse\Test\AbstractTest {
         $t = time() - 2000;
         $f = new Feed(null, $this->base."Caching/200Past");
         $this->assertTime($t, $f->lastModified);
-        $this->assertNotEmpty($f->resource->getETag());
+        $this->assertNotEmpty($f->etag);
         $t = time() - 2000;
         $f = new Feed(null, $this->base."Caching/200Past", Date::transform(time(), "http"));
         $this->assertTime($t, $f->lastModified);
-        $this->assertNotEmpty($f->resource->getETag());
+        $this->assertNotEmpty($f->etag);
         $t = time() + 2000;
         $f = new Feed(null, $this->base."Caching/200Future");
         $this->assertTime($t, $f->lastModified);
-        $this->assertNotEmpty($f->resource->getETag());
+        $this->assertNotEmpty($f->etag);
         // these tests have no HTTP headers and rely on article dates
         $t = strtotime("2002-05-19T15:21:36Z");
         $f = new Feed(null, $this->base."Caching/200PubDateOnly");