From bd71ddb929625c904c64972ac82fd63298209e4c Mon Sep 17 00:00:00 2001
From: "J. King"
Date: Tue, 3 Sep 2019 13:16:05 -0400
Subject: [PATCH] Percent-encoding and IPv6 normalization

Replace the normalizePart() stub and its P_* component constants with
normalizeEncoding(), which walks a URL component byte by byte: valid
percent-escapes are decoded, unreserved characters (letters, digits,
".", "_", "~", "-") are emitted literally, bytes outside 0x21-0x7E and
stray "%" signs are always escaped, and everything else keeps the form
(literal or escaped) it had in the input, with upper-case hex digits.

The path is now percent-normalized as a whole before it is split into
segments, and bracketed hosts that parse as IPv6 addresses are rewritten
through inet_pton()/inet_ntop().
---
Review note: escapes are emitted with sprintf("%%%02X") rather than
dechex() so that bytes below 0x10 (e.g. tab, or an existing "%0a") keep
two hex digits instead of collapsing to an invalid one-digit escape; the
IPv6 check uses substr() so an empty host does not read string offset 0.

 lib/Misc/URL.php             | 70 ++++++++++++++++++++++++++----------
 tests/cases/Misc/TestURL.php |  8 ++++-
 2 files changed, 59 insertions(+), 19 deletions(-)

diff --git a/lib/Misc/URL.php b/lib/Misc/URL.php
index 30ab6d56..bbb0b59c 100644
--- a/lib/Misc/URL.php
+++ b/lib/Misc/URL.php
@@ -10,14 +10,6 @@ namespace JKingWeb\Arsse\Misc;
  * A collection of functions for manipulating URLs
  */
 class URL {
-    /** User component */
-    const P_USER = 1;
-    /** Password component */
-    const P_PASS = 2;
-    /** Path segment component */
-    const P_PATH = 3;
-    /** Full query component */
-    const P_QUERY = 4;
 
     /** Normalizes an absolute URL
      *
@@ -45,15 +37,15 @@ class URL {
         }
         $out = strtolower($scheme)."://";
         if (strlen($u ?? "")) {
-            $out .= self::normalizePart(rawurlencode($u), self::P_USER);
+            $out .= self::normalizeEncoding(rawurlencode($u));
             if (strlen($p ?? "")) {
-                $out .= ":".self::normalizePart(rawurlencode($p), self::P_PASS);
+                $out .= ":".self::normalizeEncoding(rawurlencode($p));
             }
             $out .= "@";
         } elseif (strlen($user ?? "")) {
-            $out .= self::normalizePart($user, self::P_USER);
+            $out .= self::normalizeEncoding($user);
             if (strlen($pass ?? "")) {
-                $out .= ":".self::normalizePart($pass, self::P_PASS);
+                $out .= ":".self::normalizeEncoding($pass);
             }
             $out .= "@";
         }
@@ -61,26 +53,68 @@ class URL {
         $out .= isset($port) ? ":$port" : "";
         $out .= self::normalizePath($path ?? "");
         if (isset($query) && strlen($query)) {
-            $out .= "?".self::normalizePart($query, self::P_QUERY);
+            $out .= "?".self::normalizeEncoding($query);
         }
         return $out;
     }
 
     /** Perform percent-encoding normalization for a given URL component */
-    protected static function normalizePart(string $part, int $type): string {
-        // stub
-        return $part;
+    protected static function normalizeEncoding(string $part): string {
+        $pos = 0;
+        $end = strlen($part);
+        $out = "";
+        // process each character in sequence
+        while ($pos < $end) {
+            $c = $part[$pos];
+            if ($c === "%") {
+                // the % character signals an encoded character...
+                $d = substr($part, $pos+1, 2);
+                if (!preg_match("/^[0-9a-fA-F]{2}$/", $d)) {
+                    // unless there are fewer than two characters left in the string or the two characters are not hex digits
+                    $d = ord($c);
+                } else {
+                    $d = hexdec($d);
+                    $pos += 2;
+                }
+            } else {
+                $d = ord($c);
+            }
+            $dc = chr($d);
+            if ($d < 0x21 || $d > 0x7E || $d == 0x25) {
+                // these characters are always encoded; pad to two hex digits so bytes below 0x10 remain valid escapes
+                $out .= sprintf("%%%02X", $d);
+            } elseif (preg_match("/[a-zA-Z0-9\._~-]/", $dc)) {
+                // these characters are never encoded
+                $out .= $dc;
+            } else {
+                // these characters are passed through as-is
+                if ($c === "%") {
+                    $out .= sprintf("%%%02X", $d);
+                } else {
+                    $out .= $c;
+                }
+            }
+            $pos++;
+        }
+        return $out;
     }
 
     /** Normalizes a hostname per IDNA:2008 */
     protected static function normalizeHost(string $host): string {
+        if (substr($host, 0, 1) === "[" && substr($host, -1) === "]") {
+            // normalize IPv6 addresses
+            $addr = @inet_pton(substr($host, 1, strlen($host) - 2));
+            if ($addr !== false) {
+                return "[".inet_ntop($addr)."]";
+            }
+        }
         $idn = idn_to_ascii($host, \IDNA_NONTRANSITIONAL_TO_ASCII, \INTL_IDNA_VARIANT_UTS46);
         return $idn !== false ? idn_to_utf8($idn, \IDNA_NONTRANSITIONAL_TO_UNICODE, \INTL_IDNA_VARIANT_UTS46) : $host;
     }
 
     /** Normalizes the whole path segment to remove empty segments and relative segments */
     protected static function normalizePath(string $path): string {
-        $parts = explode("/", $path);
+        $parts = explode("/", self::normalizeEncoding($path));
         $out = [];
         foreach($parts as $p) {
             switch ($p) {
@@ -91,7 +125,7 @@ class URL {
                     array_pop($out);
                     break;
                 default:
-                    $out[] = self::normalizePart($p, self::P_PATH);
+                    $out[] = $p;
             }
         }
         return str_replace("//", "/", "/".implode("/", $out).(substr($path, -1) === "/" ? "/" : ""));
diff --git a/tests/cases/Misc/TestURL.php b/tests/cases/Misc/TestURL.php
index 2a8a7733..3c08a3d4 100644
--- a/tests/cases/Misc/TestURL.php
+++ b/tests/cases/Misc/TestURL.php
@@ -23,7 +23,6 @@ class TestURL extends \JKingWeb\Arsse\Test\AbstractTest {
         return [
             ["/", "/"],
             ["//example.com/", "//example.com/"],
-            ["http://[::1]/", "http://[::1]/"],
             ["http://example.com/", "http://example.com/"],
             ["HTTP://example.com/", "http://example.com/"],
             ["http://example.com", "http://example.com/"],
@@ -61,6 +60,13 @@ class TestURL extends \JKingWeb\Arsse\Test\AbstractTest {
             ["http://日本.example.com/", "http://日本.example.com/"],
             ["http://EXAMPLE.COM/", "http://example.com/"],
             ["http://É.example.com/", "http://é.example.com/"],
+            ["http://[::1]/", "http://[::1]/"],
+            ["http://[0::1]/", "http://[::1]/"],
+            ["http://[Z]/", "http://[z]/"],
+            ["http://example.com/ ?%61=%3d", "http://example.com/%20?a=%3D"],
+            ["http://example.com/%", "http://example.com/%25"],
+            ["http://example.com/%a", "http://example.com/%25a"],
+            ["http://example.com/%za", "http://example.com/%25za"],
         ];
     }
 }