2017-03-18 11:01:23 -05:00
< ? php
2017-03-28 18:19:12 -05:00
declare ( strict_types = 1 );
2017-03-27 23:12:12 -05:00
namespace JKingWeb\Arsse ;
2017-03-18 11:01:23 -05:00
use PicoFeed\Reader\Reader ;
use PicoFeed\PicoFeedException ;
use PicoFeed\Reader\Favicon ;
2017-03-28 18:19:12 -05:00
use PicoFeed\Config\Config ;
2017-03-18 11:01:23 -05:00
class Feed {
2017-03-26 15:16:15 -05:00
public $data = null ;
public $favicon ;
public $parser ;
2017-03-18 11:01:23 -05:00
public $reader ;
public $resource ;
2017-04-24 21:51:56 -04:00
public $modified = false ;
2017-04-30 17:54:29 -04:00
public $lastModified ;
public $nextFetch ;
2017-04-22 23:40:57 -04:00
public $newItems = [];
public $changedItems = [];
2017-03-18 11:01:23 -05:00
2017-04-30 17:54:29 -04:00
/**
 * Fetches a newsfeed and, if it was modified, parses it and matches its items against stored articles.
 *
 * Sets $resource, $lastModified, $modified and $nextFetch. When the fetched
 * resource reports itself as modified, the feed is also parsed and its items
 * are sorted into $newItems and $changedItems.
 *
 * @param int|null $feedID       Identifier handed to matchToDatabase()
 * @param string   $url          Address of the feed to download
 * @param string   $lastModified Forwarded unchanged to download()
 * @param string   $etag         Forwarded unchanged to download()
 * @param string   $username     Forwarded unchanged to download()
 * @param string   $password     Forwarded unchanged to download()
 * @throws Feed\Exception if the download or the parsing fails
 */
public function __construct(int $feedID = null, string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = '') {
    // fetch the feed
    $this->download($url, $lastModified, $etag, $username, $password);
    // format the HTTP Last-Modified date returned; the cast guards against a non-string value under strict typing
    $lastMod = trim((string) $this->resource->getLastModified());
    if ($lastMod !== '') {
        // the format carries no surrounding whitespace: a stray space in the format can cause
        // otherwise valid HTTP dates to be rejected, depending on the PHP version
        $parsed = \DateTime::createFromFormat("!D, d M Y H:i:s e", $lastMod);
        // createFromFormat() yields false for unparseable input; keep the property either a DateTime or null
        $this->lastModified = $parsed ?: null;
    }
    $this->modified = $this->resource->isModified();
    // parse the feed, if it has been modified
    if ($this->modified) {
        $this->parse();
        // ascertain whether there are any articles not in the database
        $this->matchToDatabase($feedID);
        // if caching header fields are not sent by the server, try to ascertain a last-modified date from the feed contents
        if (!$this->lastModified) {
            $this->lastModified = $this->computeLastModified();
        }
        // we only really care if articles have been modified; if there are no new articles, act as if the feed is unchanged
        if (!sizeof($this->newItems) && !sizeof($this->changedItems)) {
            $this->modified = false;
        }
    }
    // compute the time at which the feed should next be fetched
    $this->nextFetch = $this->computeNextFetch();
}
/**
 * Downloads the feed at the given URL using a PicoFeed reader.
 *
 * The reader is built with the application's configured user-agent string
 * and kept in $reader; the result of the download is kept in $resource.
 *
 * @param string $url          Address of the feed
 * @param string $lastModified Passed through to the reader's download() call
 * @param string $etag         Passed through to the reader's download() call
 * @param string $username     Passed through to the reader's download() call
 * @param string $password     Passed through to the reader's download() call
 * @return bool Always true; failure is signalled by an exception
 * @throws Feed\Exception wrapping any PicoFeedException raised while downloading
 */
public function download(string $url, string $lastModified = '', string $etag = '', string $username = '', string $password = ''): bool {
    try {
        $settings = new Config();
        // both the feed client and the content grabber identify themselves the same way
        $settings->setClientUserAgent(Data::$conf->userAgentString);
        $settings->setGrabberUserAgent(Data::$conf->userAgentString);
        $this->reader = new Reader($settings);
        $this->resource = $this->reader->download($url, $lastModified, $etag, $username, $password);
    } catch (PicoFeedException $failure) {
        // re-throw as the application's own exception type, keeping the original for context
        throw new Feed\Exception($url, $failure);
    }
    return true;
}
2017-04-30 17:54:29 -04:00
/**
 * Parses the previously downloaded resource and normalizes the identifiers of its items.
 *
 * Sets $parser, $favicon and $data. Every item additionally receives three
 * SHA-256 hashes (urlTitleHash, urlContentHash, titleContentHash) which are
 * used to detect updates and to identify items that lack an id.
 *
 * @return bool Always true; failure is signalled by an exception
 * @throws Feed\Exception wrapping any PicoFeedException raised while parsing
 */
public function parse(): bool {
    try {
        $this->parser = $this->reader->getParser(
            $this->resource->getUrl(),
            $this->resource->getContent(),
            $this->resource->getEncoding()
        );
        $feed = $this->parser->execute();
        // Grab the favicon for the feed; returns an empty string if it cannot find one.
        // Some feeds might use a different domain (eg: feedburner), so the site url is
        // used instead of the feed's url.
        $this->favicon = (new Favicon)->find($feed->siteUrl);
    } catch (PicoFeedException $e) {
        // this method has no $url argument; report the URL of the resource being parsed
        throw new Feed\Exception($this->resource->getUrl(), $e);
    }
    // PicoFeed does not provide valid ids when there is no id element. Its solution
    // of hashing the url, title, and content together for the id if there is no id
    // element is stupid. Many feeds are frankenstein mixtures of Atom and RSS, but
    // some are pure RSS with guid elements while others use the Dublin Core spec for
    // identification. These feeds shouldn't be duplicated when updated. That should
    // only be reserved for severely broken feeds.
    foreach ($feed->items as $f) {
        // Hashes used for comparison to check for updates and also to identify when an
        // id doesn't exist.
        $f->urlTitleHash = hash('sha256', $f->url.$f->title);
        $f->urlContentHash = hash('sha256', $f->url.$f->content.$f->enclosureUrl.$f->enclosureType);
        $f->titleContentHash = hash('sha256', $f->title.$f->content.$f->enclosureUrl.$f->enclosureType);

        // If there is an id element then continue. The id is used already.
        if ((string) $f->xml->id !== '') {
            continue;
        }
        // Otherwise prefer a guid element, then a Dublin Core identifier.
        $id = (string) $f->xml->guid;
        if ($id === '') {
            $id = (string) $f->xml->children('http://purl.org/dc/elements/1.1/')->identifier;
        }
        // If there aren't any of those there is no id.
        $f->id = ($id !== '') ? hash('sha256', $id) : '';
    }
    $this->data = $feed;
    return true;
}
/**
 * Collapses multiple versions of the same item within one feed into a single entry.
 *
 * Rationale: some newsfeeds (notably Planet) include multiple versions of an
 * item if it is updated. As we only care about the latest, we try to remove
 * any "old" versions of an item that might also be present within the feed.
 *
 * @param array $items Items exposing id, urlTitleHash, urlContentHash, titleContentHash, updatedDate and publishedDate
 * @return array The surviving items; a replaced item keeps the position of the version it supersedes
 */
protected function deduplicateItems(array $items): array {
    $out = [];
    foreach ($items as $item) {
        foreach ($out as $index => $check) {
            // if the two items have the same ID or any one hash matches, they are two versions of the same item;
            // comparison is strict because loose comparison treats distinct numeric-looking strings (e.g. "0e1" and "0e2") as equal
            $sameItem = ($item->id && $check->id && $item->id === $check->id)
                || $item->urlTitleHash === $check->urlTitleHash
                || $item->urlContentHash === $check->urlContentHash
                || $item->titleContentHash === $check->titleContentHash;
            if (!$sameItem) {
                continue;
            }
            // because newsfeeds are usually ordered newest-first, the later item should only be used if...
            $laterIsNewer =
                // the later item has an update date and the existing item does not
                ($item->updatedDate && !$check->updatedDate)
                // the later item has an update date newer than the existing item's
                || ($item->updatedDate && $check->updatedDate && $item->updatedDate->getTimestamp() > $check->updatedDate->getTimestamp())
                // neither item has update dates, both have publish dates, and the later item has a newer publish date
                || (!$item->updatedDate && !$check->updatedDate && $item->publishedDate && $check->publishedDate && $item->publishedDate->getTimestamp() > $check->publishedDate->getTimestamp());
            if ($laterIsNewer) {
                // if the later item should be used, replace the existing one
                $out[$index] = $item;
            }
            // whether it replaced the existing item or is skipped, this item has been dealt with
            continue 2;
        }
        // if there was no match, add the item
        $out[] = $item;
    }
    return $out;
}
2017-04-30 17:54:29 -04:00
/**
 * Sorts the items of the parsed feed into new and changed items by comparing them with stored articles.
 *
 * Items are first deduplicated, then compared against the latest stored
 * articles. Items without a match there are only tentatively new: when the
 * database returned as many articles as were asked for, a second, targeted
 * lookup by ID and hashes is made before an item is declared new.
 * New items are appended to $newItems; changed items are stored in
 * $changedItems keyed by the ID of the stored article.
 *
 * @param int|null $feedID Identifier of the feed in the database; with null no lookup is made and every item is new
 * @return bool Always true
 */
public function matchToDatabase(int $feedID = null): bool {
    // first perform deduplication on items
    $items = $this->deduplicateItems($this->data->items);
    // get as many of the latest articles in the database as there are in the feed; without a feed ID there is nothing to look up
    $articles = ($feedID !== null) ? Data::$db->articleMatchLatest($feedID, sizeof($items)) : [];
    // arrays holding new, edited, and tentatively new items; items may be tentatively new because we perform two passes
    $new = $tentative = $edited = [];
    // iterate through the items and for each determine whether it is existing, edited, or entirely new
    foreach ($items as $index => $i) {
        $match = $this->matchItem($i, $articles);
        if ($match === null) {
            $tentative[] = $index;
        } else if ($match['edited']) {
            // we store the item index and database record ID as a key/value pair
            $edited[$index] = $match['id'];
        }
    }
    if ($feedID !== null && sizeof($tentative) && sizeof($items) == sizeof($articles)) {
        // if we need to, perform a second pass on the database looking specifically for IDs and hashes of the new items
        $ids = $hashesUT = $hashesUC = $hashesTC = [];
        foreach ($tentative as $index) {
            $i = $items[$index];
            if ($i->id) {
                $ids[] = $i->id;
            }
            $hashesUT[] = $i->urlTitleHash;
            $hashesUC[] = $i->urlContentHash;
            $hashesTC[] = $i->titleContentHash;
        }
        $articles = Data::$db->articleMatchIds($feedID, $ids, $hashesUT, $hashesUC, $hashesTC);
        foreach ($tentative as $index) {
            $match = $this->matchItem($items[$index], $articles);
            if ($match === null) {
                // only an item matching none of the articles is new, and it is listed exactly once
                $new[] = $index;
            } else if ($match['edited']) {
                $edited[$index] = $match['id'];
            }
        }
    } else {
        // if there are no tentatively new articles and/or the number of stored articles is less than the size of the feed, don't do a second pass; assume any tentatively new items are in fact new
        $new = $tentative;
    }
    // FIXME: fetch full content when appropriate
    foreach ($new as $index) {
        $this->newItems[] = $items[$index];
    }
    foreach ($edited as $index => $id) {
        $this->changedItems[$id] = $items[$index];
    }
    return true;
}

/**
 * Looks for the stored article corresponding to a feed item and reports whether the item differs from it.
 *
 * @param object $item     Feed item exposing id, the three hashes and updatedDate
 * @param mixed  $articles Re-iterable collection of rows with the keys id, guid, edited_date, url_title_hash, url_content_hash and title_content_hash
 * @return array|null Null if no article matches; otherwise ['id' => article ID, 'edited' => bool] for the first match
 */
protected function matchItem($item, $articles) {
    foreach ($articles as $a) {
        // the item matches if the GUID matches, or if any one of the hashes match
        $matches = ($item->id && $item->id === $a['guid'])
            || $item->urlTitleHash === $a['url_title_hash']
            || $item->urlContentHash === $a['url_content_hash']
            || $item->titleContentHash === $a['title_content_hash'];
        if (!$matches) {
            continue;
        }
        // the article has been edited if the item has an edit timestamp which doesn't match that of the article...
        // NOTE(review): the strict comparison assumes edited_date is an integer Unix timestamp; confirm against the database layer
        $edited = ($item->updatedDate && $item->updatedDate->getTimestamp() !== $a['edited_date'])
            // ... or if any of the hashes do not match
            || $item->urlTitleHash !== $a['url_title_hash']
            || $item->urlContentHash !== $a['url_content_hash']
            || $item->titleContentHash !== $a['title_content_hash'];
        return ['id' => $a['id'], 'edited' => $edited];
    }
    return null;
}
2017-04-24 21:51:56 -04:00
2017-04-30 17:54:29 -04:00
/**
 * Estimates when the feed should next be fetched.
 *
 * For an unmodified feed the time elapsed since $lastModified determines the
 * delay. For a modified feed the intervals between the four most recent
 * distinct item dates are normalized, and a delay is chosen when at least two
 * of the three normalized intervals agree. In every other case the delay is one hour.
 *
 * @return \DateTime The current time advanced by the chosen delay
 */
public function computeNextFetch(): \DateTime {
    $now = new \DateTime();
    if (!$this->modified) {
        if ($this->lastModified instanceof \DateTimeInterface) {
            $diff = $now->getTimestamp() - $this->lastModified->getTimestamp();
            $offset = $this->normalizeDateDiff($diff);
            $now->modify(" + ".$offset);
        } else {
            // no modification date is known, so there is nothing to base an estimate on
            $now->modify(" + 1 hour ");
        }
    } else {
        $offsets = [];
        $dates = $this->gatherDates();
        if (sizeof($dates) > 3) {
            for ($a = 0; $a < 3; $a++) {
                // dates are sorted newest-first, so the newer date is the minuend
                $diff = $dates[$a] - $dates[$a + 1];
                $offsets[] = $this->normalizeDateDiff($diff);
            }
            if ($offsets[0] === $offsets[1] || $offsets[0] === $offsets[2]) {
                $now->modify(" + ".$offsets[0]);
            } else if ($offsets[1] === $offsets[2]) {
                $now->modify(" + ".$offsets[1]);
            } else {
                // no two intervals agree
                $now->modify(" + 1 hour ");
            }
        } else {
            // too few dates to compare three intervals
            $now->modify(" + 1 hour ");
        }
    }
    return $now;
}
2017-04-30 17:54:29 -04:00
/**
 * Determines when to retry a feed whose fetching has failed.
 *
 * Fewer than 3 errors yields a five-minute delay, fewer than 15 a three-hour
 * delay, and anything else a one-day delay.
 *
 * @param mixed $errCount Number of errors, compared numerically against the thresholds
 * @return \DateTime The current time advanced by the chosen delay
 */
public static function nextFetchOnError($errCount): \DateTime {
    // start from the longest delay and shorten it for low error counts
    $delay = " 1 day ";
    if ($errCount < 3) {
        $delay = " 5 minutes ";
    } else if ($errCount < 15) {
        $delay = " 3 hours ";
    }
    return new \DateTime(" now + ".$delay);
}
2017-04-27 09:47:40 -04:00
/**
 * Maps an interval in seconds onto one of a fixed set of fetch delays.
 *
 * @param int $diff Interval in seconds
 * @return string A relative time expression suitable for appending to "+"
 */
protected function normalizeDateDiff(int $diff): string {
    $minute = 60;
    $hour = 60 * $minute;
    if ($diff < 30 * $minute) {
        // less than 30 minutes
        return " 15 minutes ";
    }
    if ($diff < $hour) {
        // less than an hour
        return " 30 minutes ";
    }
    if ($diff < 3 * $hour) {
        // less than three hours
        return " 1 hour ";
    }
    if ($diff > 36 * $hour) {
        // more than 36 hours
        return " 1 day ";
    }
    // between three and 36 hours, inclusive
    return " 3 hours ";
}
2017-04-30 17:54:29 -04:00
/**
 * Derives a last-modified date for the feed.
 *
 * An unmodified feed simply reports the stored $lastModified value. For a
 * modified feed the most recent date found among its items is used.
 *
 * @return \DateTime|mixed|null The stored value, a new DateTime for the newest item date, or null if the items carry no dates
 */
public function computeLastModified() {
    if (!$this->modified) {
        return $this->lastModified;
    }
    $stamps = $this->gatherDates();
    if (!sizeof($stamps)) {
        return null;
    }
    // the dates are sorted newest-first, so the first entry is the most recent
    $latest = new \DateTime();
    $latest->setTimestamp($stamps[0]);
    return $latest;
}
/**
 * Collects the distinct update and publication timestamps of the parsed feed's items.
 *
 * @return array Unix timestamps without duplicates, sorted from newest to oldest and indexed from zero
 */
protected function gatherDates(): array {
    $stamps = [];
    foreach ($this->data->items as $entry) {
        // for each item the update date is considered before the publication date
        foreach ([$entry->updatedDate, $entry->publishedDate] as $date) {
            if ($date) {
                $stamps[] = $date->getTimestamp();
            }
        }
    }
    $distinct = array_unique($stamps, \SORT_NUMERIC);
    // sorting in descending order also renumbers the keys
    rsort($distinct);
    return $distinct;
}
2017-03-18 11:01:23 -05:00
}