2019-08-26 22:13:30 -04:00
|
|
|
<?php
|
|
|
|
/** @license MIT
|
|
|
|
* Copyright 2017 J. King, Dustin Wilson et al.
|
|
|
|
* See LICENSE and AUTHORS files for details */
|
|
|
|
|
|
|
|
declare(strict_types=1);
|
|
|
|
namespace JKingWeb\Arsse\Misc;
|
|
|
|
|
2019-08-27 11:08:13 -04:00
|
|
|
/**
|
|
|
|
* A collection of functions for manipulating URLs
|
|
|
|
*/
|
2019-08-26 22:13:30 -04:00
|
|
|
class URL {
    /** Returns whether a URL is absolute i.e. has a scheme
     *
     * @param string $url The URL to test
     */
    public static function absolute(string $url): bool {
        // parse_url() yields null when there is no scheme and false when the URL cannot be parsed; both cast to an empty string
        return (bool) strlen((string) parse_url($url, \PHP_URL_SCHEME));
    }

    /** Normalizes a URL
     *
     * Normalizations performed are:
     *
     * - Lowercasing scheme
     * - Lowercasing ASCII host names
     * - IDN normalization
     * - IPv6 address normalization
     * - Resolution of relative path segments
     * - Discarding empty path segments
     * - Discarding empty queries
     * - Generic percent-encoding normalization
     * - Fragment discarding
     *
     * It does NOT drop trailing slashes from paths, nor does it perform Unicode normalization or context-aware percent-encoding normalization
     *
     * A URL which is so malformed that parse_url() cannot split it into components is returned unchanged
     *
     * @param string $url The URL to normalize
     * @param string|null $u Username to add to the URL, replacing any existing credentials
     * @param string|null $p Password to add to the URL, if a username is specified
     */
    public static function normalize(string $url, ?string $u = null, ?string $p = null): string {
        $parts = parse_url($url);
        if (!is_array($parts)) {
            // nothing can be normalized reliably if the URL cannot be split into components
            return $url;
        }
        // read the components explicitly rather than injecting variables into the local scope
        $scheme = $parts['scheme'] ?? null;
        $host = $parts['host'] ?? null;
        $port = $parts['port'] ?? null;
        $user = $parts['user'] ?? "";
        $pass = $parts['pass'] ?? "";
        $path = $parts['path'] ?? "";
        $query = $parts['query'] ?? "";
        $out = "";
        if ($scheme !== null) {
            $out .= strtolower($scheme).":";
        }
        if ($host !== null) {
            $out .= "//";
            if (strlen($u ?? "")) {
                // credentials supplied by the caller are raw text and take precedence over any in the URL
                $out .= self::normalizeEncoding(rawurlencode($u));
                if (strlen($p ?? "")) {
                    $out .= ":".self::normalizeEncoding(rawurlencode($p));
                }
                $out .= "@";
            } elseif (strlen($user)) {
                // credentials from the URL itself are already encoded, so their encoding is only normalized
                $out .= self::normalizeEncoding($user);
                if (strlen($pass)) {
                    $out .= ":".self::normalizeEncoding($pass);
                }
                $out .= "@";
            }
            $out .= self::normalizeHost($host);
            $out .= $port !== null ? ":$port" : "";
        }
        $out .= self::normalizePath($path, $host !== null);
        if (strlen($query)) {
            // empty queries are dropped along with their delimiter
            $out .= "?".self::normalizeEncoding($query);
        }
        // the fragment, if any, is deliberately never appended
        return $out;
    }

    /** Perform percent-encoding normalization for a given URL component
     *
     * Bytes below 0x21 or above 0x7E, as well as stray "%" characters, are always
     * percent-encoded; ASCII letters, digits, ".", "_", "~" and "-" are always decoded;
     * every other character keeps the form (literal or encoded) it had in the input.
     * Escapes are always emitted as exactly two uppercase hexadecimal digits
     *
     * @param string $part The URL component to normalize
     */
    protected static function normalizeEncoding(string $part): string {
        $pos = 0;
        $end = strlen($part);
        $out = "";
        // process each character in sequence
        while ($pos < $end) {
            $c = $part[$pos];
            if ($c === "%") {
                // the % character signals an encoded character...
                $d = substr($part, $pos + 1, 2);
                if (!preg_match("/^[0-9a-fA-F]{2}$/D", $d)) {
                    // unless there are fewer than two characters left in the string or the two characters are not hex digits
                    $d = ord($c);
                } else {
                    $d = hexdec($d);
                    $pos += 2;
                }
            } else {
                $d = ord($c);
            }
            $dc = chr($d);
            if ($d < 0x21 || $d > 0x7E || $d == 0x25) {
                // these characters are always encoded; the escape must be zero-padded so that bytes below 0x10 remain valid
                $out .= sprintf("%%%02X", $d);
            } elseif (preg_match("/[a-zA-Z0-9\._~-]/", $dc)) {
                // these characters are never encoded
                $out .= $dc;
            } else {
                // these characters are passed through as-is
                if ($c === "%") {
                    $out .= sprintf("%%%02X", $d);
                } else {
                    $out .= $c;
                }
            }
            $pos++;
        }
        return $out;
    }

    /** Normalizes a hostname per IDNA:2008
     *
     * Bracketed IPv6 literals are instead rewritten to the form produced by inet_ntop().
     * A host which cannot be processed is returned unchanged
     *
     * @param string $host The host name or address literal to normalize
     */
    protected static function normalizeHost(string $host): string {
        if (substr($host, 0, 1) === "[" && substr($host, -1) === "]") {
            // normalize IPv6 addresses; inet_pton() warns on invalid input, which is expected here and handled below
            $addr = @inet_pton(substr($host, 1, strlen($host) - 2));
            if ($addr !== false) {
                return "[".inet_ntop($addr)."]";
            }
        }
        // round-trip through the ASCII form so that equivalent spellings of a name converge
        $idn = idn_to_ascii($host, \IDNA_NONTRANSITIONAL_TO_ASCII, \INTL_IDNA_VARIANT_UTS46);
        if ($idn === false) {
            return $host;
        }
        $utf = idn_to_utf8($idn, \IDNA_NONTRANSITIONAL_TO_UNICODE, \INTL_IDNA_VARIANT_UTS46);
        // the conversion back can fail as well; never return a non-string
        return $utf !== false ? $utf : $host;
    }

    /** Normalizes the whole path component to remove empty segments and relative segments
     *
     * @param string $path The path to normalize; this may be an empty string
     * @param bool $hasHost Whether the URL has a host, in which case the path is always treated as absolute
     */
    protected static function normalizePath(string $path, bool $hasHost): string {
        $parts = explode("/", self::normalizeEncoding($path));
        // substr() is used instead of an offset read so that an empty path is handled without error
        $absolute = ($hasHost || substr($path, 0, 1) === "/");
        // a trailing slash is significant and is restored after the segments are processed
        $index = (substr($path, -1) === "/");
        $out = [];
        foreach ($parts as $p) {
            switch ($p) {
                case "":
                case ".":
                    // empty and self-referencing segments are dropped
                    break;
                case "..":
                    // parent references consume the preceding segment, if there is one
                    array_pop($out);
                    break;
                default:
                    $out[] = $p;
            }
        }
        $out = implode("/", $out);
        $out = ($absolute ? "/" : "").$out.($index ? "/" : "");
        // an absolute path with no remaining segments and a trailing slash would otherwise yield "//"
        return str_replace("//", "/", $out);
    }

    /** Appends data to a URL's query component
     *
     * The data is inserted before the fragment, if the URL has one
     *
     * @param string $url The input URL
     * @param string $data The data to append. This should already be escaped where necessary and not start with any delimiter
     * @param string $glue The query subcomponent delimiter, usually "&". If the URL has no query, "?" will be prepended instead
     */
    public static function queryAppend(string $url, string $data, string $glue = "&"): string {
        if (!strlen($data)) {
            return $url;
        }
        // the query ends where the fragment starts, or at the end of the URL
        $insPos = strpos($url, "#");
        $insPos = $insPos === false ? strlen($url) : $insPos;
        // a question mark only delimits a query if it appears before the fragment
        $qPos = strpos($url, "?");
        $hasQuery = $qPos !== false && $qPos < $insPos;
        $glue = $hasQuery ? $glue : "?";
        if ($hasQuery && ($url[$insPos - 1] === $glue || ($insPos - 1) === $qPos)) {
            // if the URL already has excess glue (or an empty query), use it
            $glue = "";
        }
        return substr($url, 0, $insPos).$glue.$data.substr($url, $insPos);
    }
}
|