From 80feeb36f92a923f57f740c7c28c12bb8b69ec16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 28 Jul 2017 11:04:19 +0200 Subject: stdlib: Add API and doc of uri_string module --- lib/stdlib/doc/src/Makefile | 1 + lib/stdlib/doc/src/ref_man.xml | 1 + lib/stdlib/doc/src/specs.xml | 1 + lib/stdlib/doc/src/uri_string.xml | 255 ++++++++++++++++++++++++++++++ lib/stdlib/src/Makefile | 1 + lib/stdlib/src/uri_string.erl | 325 ++++++++++++++++++++++++++++++++++++++ 6 files changed, 584 insertions(+) create mode 100644 lib/stdlib/doc/src/uri_string.xml create mode 100644 lib/stdlib/src/uri_string.erl (limited to 'lib') diff --git a/lib/stdlib/doc/src/Makefile b/lib/stdlib/doc/src/Makefile index 93eac8220d..aeed79408b 100644 --- a/lib/stdlib/doc/src/Makefile +++ b/lib/stdlib/doc/src/Makefile @@ -98,6 +98,7 @@ XML_REF3_FILES = \ sys.xml \ timer.xml \ unicode.xml \ + uri_string.xml \ win32reg.xml \ zip.xml diff --git a/lib/stdlib/doc/src/ref_man.xml b/lib/stdlib/doc/src/ref_man.xml index 878a3babc5..68bfddbc71 100644 --- a/lib/stdlib/doc/src/ref_man.xml +++ b/lib/stdlib/doc/src/ref_man.xml @@ -93,6 +93,7 @@ + diff --git a/lib/stdlib/doc/src/specs.xml b/lib/stdlib/doc/src/specs.xml index 45b207b13d..d559adf9b6 100644 --- a/lib/stdlib/doc/src/specs.xml +++ b/lib/stdlib/doc/src/specs.xml @@ -60,6 +60,7 @@ + diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml new file mode 100644 index 0000000000..e6b2bd5e80 --- /dev/null +++ b/lib/stdlib/doc/src/uri_string.xml @@ -0,0 +1,255 @@ + + + + +
+ + 20172017 + Ericsson AB. All Rights Reserved. + + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + + + uri_string + Péter Dimitrov + 1 + 2017-08-23 + A +
+ uri_string + RFC 3986 compliant URI processing functions. + +

This module contains functions for parsing and handling RFC 3986 compliant URIs.

+

A URI is an identifier consisting of a sequence of characters matching the syntax + rule named URI in RFC 3986.

+

The generic URI syntax consists of a hierarchical sequence of components referred + to as the scheme, authority, path, query, and fragment:

+    URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
+    hier-part   = "//" authority path-abempty
+                   / path-absolute
+                   / path-rootless
+                   / path-empty
+    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
+    authority   = [ userinfo "@" ] host [ ":" port ]
+    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
+
+    reserved    = gen-delims / sub-delims
+    gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
+    sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
+                / "*" / "+" / "," / ";" / "="
+
+    unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
+    


+

+

The interpretation of a URI depends only on the characters used and not on how those + characters are represented in a network protocol.

+

The functions implemented by this module cover the following use cases: + + Parsing URIs

+ parse/1
+ Recomposing URIs

+ recompose/2
+ Resolving URI references

+ resolve_uri_reference/3
+ Creating URI references

+ create_uri_reference/3
+ Normalizing URIs

+ normalize/1
+ Transcoding URIs

+ transcode/2
+ Working with urlencoded query strings

+ compose_query/1, dissect_query/1
+
+

+

There are four different encodings present during the handling of URIs: + + Inbound binary encoding in binaries + Inbound percent-encoding in lists and binaries + Outbound binary encoding in binaries + Outbound percent-encoding in lists and binaries + +

+

Unless otherwise specified the return value type and encoding are the same as the input + type and encoding. That is, binary input returns binary output, list input returns a list + output but mixed input returns list output. Input and output encodings are the same except + for transcode/2.

+

All of the functions except transcode/2 expect input as unicode codepoints in + lists, UTF-8 encoding in binaries and UTF-8 encoding in percent-encoded URI parts. + transcode/2 provides the means to convert between the supported URI encodings.

+
+ + + + + +

Maybe improper list of bytes (0..255).

+
+
+ + + +

URI map holding the main components of a URI.

+
+
+ + + +

List of unicode codepoints, UTF-8 encoded binary, or a mix of the two, + representing an RFC 3986 compliant URI (percent-encoded form). + A URI is a sequence of characters from a very limited set: the letters of + the basic Latin alphabet, digits, and a few special characters.

+
+
+
+ + + + + + Compose urlencoded query string. + +

Composes a URL-encoded QueryString based on a + QueryList, a list of unescaped key-value pairs. + Media type application/x-www-form-urlencoded is defined in section + 8.2.1 of RFC 1866 (HTML 2.0). +

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:compose_query(...).
+
+
+
+ + + + Create references. + +

Creates an RFC 3986 compliant RelativeDestURI, + based on AbsoluteSourceURI and AbsoluteBaseURI. +

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:create_uri_reference(...,...).
+
+
+
+ + + + Dissect query string. + +

Dissects a URL-encoded QueryString and returns a + QueryList, a list of unescaped key-value pairs. + Media type application/x-www-form-urlencoded is defined in section + 8.2.1 of RFC 1866 (HTML 2.0). +

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:dissect_query(...).
+
+
+
+ + + + Normalize URI. + +

Normalizes an RFC 3986 compliant URIString and returns + a NormalizedURI. The algorithm used to shorten the input + URI is called Syntax-Based Normalization and is described in + Section 6.2.2 of RFC 3986. +

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:normalize("http://example.org/one/two/../../one").
+"http://example.org/one"
+
+
+
+ + + + Parse URI into a map. + +

Returns a URIMap, that is a uri_map() with the parsed components + of the URIString.

+

If parsing fails, a parse_error exception is raised.

+

Example:

+
+1> uri_string:parse("foo://user@example.com:8042/over/there?name=ferret#nose").
+#{fragment => "nose",host => "example.com",
+  path => "/over/there",port => 8042,query => "name=ferret",
+  scheme => foo,userinfo => "user"}
+2> 
+
+
+ + + + Recompose URI. + +

Returns an RFC 3986 compliant URIString (percent-encoded).

+

If the URIMap is invalid, a badarg exception is raised.

+

Example:

+
+1> URIMap = #{fragment => "nose", host => "example.com", path => "/over/there",
+port => 8042, query => "name=ferret", scheme => foo, userinfo => "user"}.
+#{fragment => "nose",host => "example.com",
+  path => "/over/there",port => 8042,query => "name=ferret",
+  scheme => foo,userinfo => "user"}
+
+2> uri_string:recompose(URIMap, []).
+"foo://user@example.com:8042/over/there?name=ferret#nose"
+
+
+ + + + Resolve URI reference. + +

Resolves an RFC 3986 compliant RelativeURI, + based on AbsoluteBaseURI, and returns a new absolute URI + (AbsoluteDestURI).

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:resolve_uri_reference(...,...).
+
+
+
+ + + + Transcode URI. + +

Transcodes an RFC 3986 compliant URIString, + where Options is a list of tagged tuples, specifying the inbound + (in_encoding) and outbound (out_encoding) encodings.

+

If an argument is invalid, a badarg exception is raised.

+

Example:

+
+1> uri_string:transcode(<<"foo://f%20oo">>, [{in_encoding, utf8},
+{out_encoding, utf16}]).
+<<0,102,0,111,0,111,0,58,0,47,0,47,0,102,0,37,0,48,0,48,0,37,0,50,0,48,0,
+  111,0,111>>
+
+
+
+ +
+
diff --git a/lib/stdlib/src/Makefile b/lib/stdlib/src/Makefile index bf836203ec..8b156929d7 100644 --- a/lib/stdlib/src/Makefile +++ b/lib/stdlib/src/Makefile @@ -121,6 +121,7 @@ MODULES= \ timer \ unicode \ unicode_util \ + uri_string \ win32reg \ zip diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl new file mode 100644 index 0000000000..2c10c34f39 --- /dev/null +++ b/lib/stdlib/src/uri_string.erl @@ -0,0 +1,325 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2017. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +%% +%% [RFC 3986, Chapter 2.2. Reserved Characters] +%% +%% reserved = gen-delims / sub-delims +%% +%% gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +%% +%% sub-delims = "!" / "$" / "&" / "'" / "(" / ")" +%% / "*" / "+" / "," / ";" / "=" +%% +%% +%% [RFC 3986, Chapter 2.3. Unreserved Characters] +%% +%% unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +%% +%% +%% [RFC 3986, Chapter 3. Syntax Components] +%% +%% The generic URI syntax consists of a hierarchical sequence of +%% components referred to as the scheme, authority, path, query, and +%% fragment. +%% +%% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] +%% +%% hier-part = "//" authority path-abempty +%% / path-absolute +%% / path-rootless +%% / path-empty +%% +%% The scheme and path components are required, though the path may be +%% empty (no characters). 
When authority is present, the path must +%% either be empty or begin with a slash ("/") character. When +%% authority is not present, the path cannot begin with two slash +%% characters ("//"). These restrictions result in five different ABNF +%% rules for a path (Section 3.3), only one of which will match any +%% given URI reference. +%% +%% The following are two example URIs and their component parts: +%% +%% foo://example.com:8042/over/there?name=ferret#nose +%% \_/ \______________/\_________/ \_________/ \__/ +%% | | | | | +%% scheme authority path query fragment +%% | _____________________|__ +%% / \ / \ +%% urn:example:animal:ferret:nose +%% +%% +%% [RFC 3986, Chapter 3.1. Scheme] +%% +%% Each URI begins with a scheme name that refers to a specification for +%% assigning identifiers within that scheme. +%% +%% scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) +%% +%% +%% [RFC 3986, Chapter 3.2. Authority] +%% +%% Many URI schemes include a hierarchical element for a naming +%% authority so that governance of the name space defined by the +%% remainder of the URI is delegated to that authority (which may, in +%% turn, delegate it further). +%% +%% authority = [ userinfo "@" ] host [ ":" port ] +%% +%% +%% [RFC 3986, Chapter 3.2.1. User Information] +%% +%% The userinfo subcomponent may consist of a user name and, optionally, +%% scheme-specific information about how to gain authorization to access +%% the resource. The user information, if present, is followed by a +%% commercial at-sign ("@") that delimits it from the host. +%% +%% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +%% +%% +%% [RFC 3986, Chapter 3.2.2. Host] +%% +%% The host subcomponent of authority is identified by an IP literal +%% encapsulated within square brackets, an IPv4 address in dotted- +%% decimal form, or a registered name. +%% +%% host = IP-literal / IPv4address / reg-name +%% +%% IP-literal = "[" ( IPv6address / IPvFuture ) "]" +%% +%% IPvFuture = "v" 1*HEXDIG "." 
1*( unreserved / sub-delims / ":" ) +%% +%% IPv6address = 6( h16 ":" ) ls32 +%% / "::" 5( h16 ":" ) ls32 +%% / [ h16 ] "::" 4( h16 ":" ) ls32 +%% / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 +%% / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 +%% / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 +%% / [ *4( h16 ":" ) h16 ] "::" ls32 +%% / [ *5( h16 ":" ) h16 ] "::" h16 +%% / [ *6( h16 ":" ) h16 ] "::" +%% +%% ls32 = ( h16 ":" h16 ) / IPv4address +%% ; least-significant 32 bits of address +%% +%% h16 = 1*4HEXDIG +%% ; 16 bits of address represented in hexadecimal +%% +%% IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +%% +%% dec-octet = DIGIT ; 0-9 +%% / %x31-39 DIGIT ; 10-99 +%% / "1" 2DIGIT ; 100-199 +%% / "2" %x30-34 DIGIT ; 200-249 +%% / "25" %x30-35 ; 250-255 +%% +%% reg-name = *( unreserved / pct-encoded / sub-delims ) +%% +%% +%% [RFC 3986, Chapter 3.2.2. Port] +%% +%% The port subcomponent of authority is designated by an optional port +%% number in decimal following the host and delimited from it by a +%% single colon (":") character. +%% +%% port = *DIGIT +%% +%% +%% [RFC 3986, Chapter 3.3. Path] +%% +%% The path component contains data, usually organized in hierarchical +%% form, that, along with data in the non-hierarchical query component +%% (Section 3.4), serves to identify a resource within the scope of the +%% URI's scheme and naming authority (if any). The path is terminated +%% by the first question mark ("?") or number sign ("#") character, or +%% by the end of the URI. 
+%% +%% path = path-abempty ; begins with "/" or is empty +%% / path-absolute ; begins with "/" but not "//" +%% / path-noscheme ; begins with a non-colon segment +%% / path-rootless ; begins with a segment +%% / path-empty ; zero characters +%% +%% path-abempty = *( "/" segment ) +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] +%% path-noscheme = segment-nz-nc *( "/" segment ) +%% path-rootless = segment-nz *( "/" segment ) +%% path-empty = 0 +%% segment = *pchar +%% segment-nz = 1*pchar +%% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) +%% ; non-zero-length segment without any colon ":" +%% +%% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +%% +%% +%% [RFC 3986, Chapter 3.4. Query] +%% +%% The query component contains non-hierarchical data that, along with +%% data in the path component (Section 3.3), serves to identify a +%% resource within the scope of the URI's scheme and naming authority +%% (if any). The query component is indicated by the first question +%% mark ("?") character and terminated by a number sign ("#") character +%% or by the end of the URI. +%% +%% query = *( pchar / "/" / "?" ) +%% +%% +%% [RFC 3986, Chapter 3.5. Fragment] +%% +%% The fragment identifier component of a URI allows indirect +%% identification of a secondary resource by reference to a primary +%% resource and additional identifying information. +%% +%% fragment = *( pchar / "/" / "?" ) +%% +%% +%% [RFC 3986, Chapter 4.1. URI Reference] +%% +%% URI-reference is used to denote the most common usage of a resource +%% identifier. +%% +%% URI-reference = URI / relative-ref +%% +%% +%% [RFC 3986, Chapter 4.2. Relative Reference] +%% +%% A relative reference takes advantage of the hierarchical syntax +%% (Section 1.2.3) to express a URI reference relative to the name space +%% of another hierarchical URI. +%% +%% relative-ref = relative-part [ "?" 
query ] [ "#" fragment ] +%% +%% relative-part = "//" authority path-abempty +%% / path-absolute +%% / path-noscheme +%% / path-empty +%% +%% +%% [RFC 3986, Chapter 4.3. Absolute URI] +%% +%% Some protocol elements allow only the absolute form of a URI without +%% a fragment identifier. For example, defining a base URI for later +%% use by relative references calls for an absolute-URI syntax rule that +%% does not allow a fragment. +%% +%% absolute-URI = scheme ":" hier-part [ "?" query ] +%% + +-module(uri_string). + + +-export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, + parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). +-export_type([uri_map/0, uri_string/0, bytelist/0]). + + +%%%========================================================================= +%%% API +%%%========================================================================= + + +-type bytelist() :: maybe_improper_list( + 0..255 | + binary() | bytelist(), + binary() | []). + +%% URI compliant with RFC 3986 +%% ASCII %x21 - %x7A ("!" - "z") except +%% %x34 " double quote +%% %x60 < less than +%% %x62 > greater than +%% %x92 \ backslash +%% %x94 ^ caret / circumflex +%% %x96 ` grave / accent +-type uri_string() :: bytelist() | binary(). + + +%% RFC 3986, Chapter 3. Syntax Components +-type uri_map() :: + #{fragment := unicode:chardata(), + host := unicode:chardata(), + path := unicode:chardata(), + port := non_neg_integer(), + query := unicode:chardata(), + scheme := atom(), + userinfo := unicode:chardata()}. + +%% Parse URIs +-spec parse(URIString) -> URIMap when + URIString :: uri_string(), + URIMap :: uri_map(). +parse(_) -> + ok. + +%% Recompose URIs +-spec recompose(URIMap) -> URIString when + URIMap :: uri_map(), + URIString :: uri_string(). +recompose(_) -> + ok. 
+ +%% Resolve references +-spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when + RelativeURI :: uri_string(), + AbsoluteBaseURI :: uri_string(), + AbsoluteDestURI :: uri_string(). +resolve_uri_reference(_,_) -> + ok. + +%% Create references +-spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when + AbsoluteSourceURI :: uri_string(), + AbsoluteBaseURI :: uri_string(), + RelativeDestURI :: uri_string(). +create_uri_reference(_,_) -> + ok. + +%% Normalize URIs +-spec normalize(URIString) -> NormalizedURI when + URIString :: uri_string(), + NormalizedURI :: uri_string(). +normalize(_) -> + ok. + +%% Transcode URIs +-spec transcode(URIString, Options) -> URIString when + URIString :: uri_string(), + Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. +transcode(_, _) -> + ok. + + +%% Working with query strings +%% HTML 2.0 - application/x-www-form-urlencoded +%% RFC 1866 [8.2.1] + +%% Compose urlencoded query string from a list of unescaped key/value pairs. +-spec compose_query(QueryList) -> QueryString when + QueryList :: [{unicode:chardata(), unicode:chardata()}], + QueryString :: uri_string(). +compose_query(_) -> + ok. + +%% Dissect a query string into a list of unescaped key/value pairs. +-spec dissect_query(QueryString) -> QueryList when + QueryString :: uri_string(), + QueryList :: [{unicode:chardata(), unicode:chardata()}]. +dissect_query(_) -> + ok. 
-- cgit v1.2.3 From 29a9dd0e17a97a3e6e46f0d08c6ba8f31db33f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Thu, 31 Aug 2017 15:39:45 +0200 Subject: stdlib: Implement uri_string:parse --- lib/stdlib/doc/src/uri_string.xml | 6 - lib/stdlib/src/uri_string.erl | 838 ++++++++++++++++++++- lib/stdlib/test/Makefile | 1 + lib/stdlib/test/property_test/README | 12 + .../test/property_test/uri_string_decode.erl | 55 ++ lib/stdlib/test/uri_string_SUITE.erl | 326 ++++++++ lib/stdlib/test/uri_string_property_test_SUITE.erl | 42 ++ 7 files changed, 1249 insertions(+), 31 deletions(-) mode change 100644 => 100755 lib/stdlib/src/uri_string.erl create mode 100644 lib/stdlib/test/property_test/README create mode 100644 lib/stdlib/test/property_test/uri_string_decode.erl create mode 100644 lib/stdlib/test/uri_string_SUITE.erl create mode 100644 lib/stdlib/test/uri_string_property_test_SUITE.erl (limited to 'lib') diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml index e6b2bd5e80..8283b8ca0e 100644 --- a/lib/stdlib/doc/src/uri_string.xml +++ b/lib/stdlib/doc/src/uri_string.xml @@ -90,12 +90,6 @@ - - - -

Maybe improper list of bytes (0..255).

-
-
diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl old mode 100644 new mode 100755 index 2c10c34f39..619da24cbc --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -221,25 +221,24 @@ %% %% absolute-URI = scheme ":" hier-part [ "?" query ] %% - -module(uri_string). -export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). --export_type([uri_map/0, uri_string/0, bytelist/0]). +-export_type([uri_map/0, uri_string/0]). + +-define(CHAR(Char), <>). +-define(STRING_EMPTY, <<>>). +-define(STRING(MatchStr), <>). +-define(STRING_REST(MatchStr, Rest), <>). %%%========================================================================= %%% API %%%========================================================================= - --type bytelist() :: maybe_improper_list( - 0..255 | - binary() | bytelist(), - binary() | []). - +%%------------------------------------------------------------------------- %% URI compliant with RFC 3986 %% ASCII %x21 - %x7A ("!" - "z") except %% %x34 " double quote @@ -248,32 +247,37 @@ %% %x92 \ backslash %% %x94 ^ caret / circumflex %% %x96 ` grave / accent --type uri_string() :: bytelist() | binary(). +%%------------------------------------------------------------------------- +-type uri_string() :: iodata(). %% RFC 3986, Chapter 3. Syntax Components -type uri_map() :: - #{fragment := unicode:chardata(), - host := unicode:chardata(), - path := unicode:chardata(), - port := non_neg_integer(), - query := unicode:chardata(), - scheme := atom(), - userinfo := unicode:chardata()}. + #{fragment => unicode:chardata(), + host => unicode:chardata(), + path => unicode:chardata(), + port => non_neg_integer(), + query => unicode:chardata(), + scheme => unicode:chardata(), + userinfo => unicode:chardata()} | #{}. %% Parse URIs -spec parse(URIString) -> URIMap when URIString :: uri_string(), URIMap :: uri_map(). -parse(_) -> - ok. 
+parse(URIString) -> + if is_binary(URIString) -> + parse_uri_reference(URIString, #{}); + true -> + parse_uri_reference(URIString, [], #{}) + end. %% Recompose URIs -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string(). recompose(_) -> - ok. + "". %% Resolve references -spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when @@ -281,7 +285,7 @@ recompose(_) -> AbsoluteBaseURI :: uri_string(), AbsoluteDestURI :: uri_string(). resolve_uri_reference(_,_) -> - ok. + "". %% Create references -spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when @@ -289,21 +293,21 @@ resolve_uri_reference(_,_) -> AbsoluteBaseURI :: uri_string(), RelativeDestURI :: uri_string(). create_uri_reference(_,_) -> - ok. + "". %% Normalize URIs -spec normalize(URIString) -> NormalizedURI when URIString :: uri_string(), NormalizedURI :: uri_string(). normalize(_) -> - ok. + "". %% Transcode URIs -spec transcode(URIString, Options) -> URIString when URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. transcode(_, _) -> - ok. + "". %% Working with query strings @@ -315,11 +319,795 @@ transcode(_, _) -> QueryList :: [{unicode:chardata(), unicode:chardata()}], QueryString :: uri_string(). compose_query(_) -> - ok. + "". %% Dissect a query string into a list of unescaped key/value pairs. -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), QueryList :: [{unicode:chardata(), unicode:chardata()}]. dissect_query(_) -> - ok. + "". + + +%%%======================================================================== +%%% Internal functions +%%%======================================================================== + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 4.1. URI Reference] +%% +%% URI-reference is used to denote the most common usage of a resource +%% identifier. 
+%% +%% URI-reference = URI / relative-ref +%%------------------------------------------------------------------------- +-spec parse_uri_reference(iolist(), list(), uri_map()) -> uri_map(). +parse_uri_reference([], _, _) -> #{}; +parse_uri_reference(URIString, Acc, URI) -> + try parse_scheme_start(URIString, Acc, URI) of + Res -> Res + catch + throw:uri_parse_error -> + parse_relative_part(URIString, Acc, URI) + end. + +-spec parse_uri_reference(binary(), uri_map()) -> uri_map(). +parse_uri_reference(<<>>, _) -> #{}; +parse_uri_reference(URIString, URI) -> + try parse_scheme_start(URIString, URI) of + Res -> Res + catch + throw:uri_parse_error -> + parse_relative_part(URIString, URI) + end. + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 4.2. Relative Reference] +%% +%% A relative reference takes advantage of the hierarchical syntax +%% (Section 1.2.3) to express a URI reference relative to the name space +%% of another hierarchical URI. +%% +%% relative-ref = relative-part [ "?" query ] [ "#" fragment ] +%% +%% relative-part = "//" authority path-abempty +%% / path-absolute +%% / path-noscheme +%% / path-empty +%%------------------------------------------------------------------------- +-spec parse_relative_part(binary(), uri_map()) -> uri_map(). 
+parse_relative_part(?STRING_REST("//", Rest), URI) -> + %% Parse userinfo - "//" is NOT part of authority + try parse_userinfo(Rest, URI) of + {T, URI1} -> + {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + URI1#{userinfo => Userinfo} + catch + throw:uri_parse_error -> + {T, URI1} = parse_host(Rest, URI), + {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), + URI1#{host => Host} + end; +parse_relative_part(?STRING_REST($/, Rest), URI) -> + {T, URI1} = parse_segment(Rest, URI), % path-absolute + {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + URI1#{path => ?STRING_REST($/, Path)}; +parse_relative_part(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % path-empty ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + URI1#{query => ?STRING_REST($?, Query)}; +parse_relative_part(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), % path-empty + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + URI1#{fragment => Fragment}; +parse_relative_part(?STRING_REST(Char, Rest), URI) -> + case is_segment_nz_nc(Char) of + true -> + {T, URI1} = parse_segment_nz_nc(Rest, URI), % path-noscheme + {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + URI1#{path => ?STRING_REST(Char, Path)}; + false -> throw(uri_parse_error) + end. + +-spec parse_relative_part(iolist(), list(), uri_map()) -> uri_map(). 
+parse_relative_part([H|Rest], Acc, URI) when is_binary(H) -> + parse_relative_part(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_relative_part([H|Rest], Acc, URI) when is_list(H) -> + parse_relative_part(H ++ Rest, Acc, URI); +parse_relative_part("//" ++ Rest, Acc, URI) -> + % Parse userinfo + try parse_userinfo(Rest, Acc, URI) of + Res -> Res + catch + throw:uri_parse_error -> + parse_host(Rest, Acc, URI) + end; +parse_relative_part([$/|Rest], _Acc, URI) -> + parse_segment(Rest, [$/], URI); % path-absolute +parse_relative_part([$?|Rest], _Acc, URI) -> + parse_query(Rest, [$?], URI); % path-empty ?query +parse_relative_part([$#|Rest], _Acc, URI) -> + parse_fragment(Rest, [], URI); % path-empty +parse_relative_part([Char|Rest], _, URI) -> + case is_segment_nz_nc(Char) of + true -> parse_segment_nz_nc(Rest, [Char], URI); % path-noscheme + false -> throw(uri_parse_error) + end. + + +%% Returns size of 'Rest' for proper calculation of splitting position. +%% Solves the following special case: +%% +%% #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>). +%% +%% While keeping the following true: +%% +%% #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>). +%% #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>). +%% +-spec byte_size_exl_single_slash(uri_string()) -> number(). +byte_size_exl_single_slash(<<$/>>) -> 0; +byte_size_exl_single_slash(Rest) -> byte_size(Rest). + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 3.3. Path] +%% +%% The path component contains data, usually organized in hierarchical +%% form, that, along with data in the non-hierarchical query component +%% (Section 3.4), serves to identify a resource within the scope of the +%% URI's scheme and naming authority (if any). The path is terminated +%% by the first question mark ("?") or number sign ("#") character, or +%% by the end of the URI. 
+%% +%% path = path-abempty ; begins with "/" or is empty +%% / path-absolute ; begins with "/" but not "//" +%% / path-noscheme ; begins with a non-colon segment +%% / path-rootless ; begins with a segment +%% / path-empty ; zero characters +%% +%% path-abempty = *( "/" segment ) +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] +%% path-noscheme = segment-nz-nc *( "/" segment ) +%% path-rootless = segment-nz *( "/" segment ) +%% path-empty = 0 +%% segment = *pchar +%% segment-nz = 1*pchar +%% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) +%% ; non-zero-length segment without any colon ":" +%% +%% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +%%------------------------------------------------------------------------- + +%%------------------------------------------------------------------------- +%% path-abempty +%%------------------------------------------------------------------------- +-spec parse_segment(binary(), uri_map()) -> {binary(), uri_map()}. +parse_segment(?STRING_REST($/, Rest), URI) -> + parse_segment(Rest, URI); % segment +parse_segment(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_segment(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + {Rest, URI1#{fragment => Fragment}}; +parse_segment(?STRING_REST(Char, Rest), URI) -> + case is_pchar(Char) of + true -> parse_segment(Rest, URI); + false -> throw(uri_parse_error) + end; +parse_segment(?STRING_EMPTY, URI) -> + {?STRING_EMPTY, URI}. + +-spec parse_segment(iolist(), list(), uri_map()) -> uri_map(). 
+parse_segment(?STRING(Str), Acc, URI) when is_list(Acc) -> + parse_segment(unicode:characters_to_list(Str), Acc, URI); +parse_segment([H|Rest], Acc, URI) when is_binary(H) -> + parse_segment(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_segment([H|Rest], Acc, URI) when is_list(H) -> + parse_segment(H ++ Rest, Acc, URI); +parse_segment([$/|Rest], Acc, URI) -> + parse_segment(Rest, [$/|Acc], URI); % segment +parse_segment([$?|Rest], Acc, URI) -> + parse_query(Rest, [$?], URI#{path => lists:reverse(Acc)}); % ?query +parse_segment([$#|Rest], Acc, URI) -> + parse_fragment(Rest, [], URI#{path => lists:reverse(Acc)}); +parse_segment([Char|Rest], Acc, URI) -> + case is_pchar(Char) of + true -> parse_segment(Rest, [Char|Acc], URI); + false -> throw(uri_parse_error) + end; +parse_segment([], Acc, URI) -> + URI#{path => lists:reverse(Acc)}. + +%%------------------------------------------------------------------------- +%% path-noscheme +%%------------------------------------------------------------------------- +-spec parse_segment_nz_nc(binary(), uri_map()) -> {binary(), uri_map()}. +parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> + parse_segment(Rest, URI); % segment +parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + {Rest, URI1#{fragment => Fragment}}; +parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> + case is_segment_nz_nc(Char) of + true -> parse_segment_nz_nc(Rest, URI); + false -> throw(uri_parse_error) + end; +parse_segment_nz_nc(?STRING_EMPTY, URI) -> + {?STRING_EMPTY, URI}. + +-spec parse_segment_nz_nc(iolist(), list(), uri_map()) -> uri_map(). 
%% List variant of the path-noscheme parser. Acc holds the characters of
%% the path in reverse order. After the first "/" the remaining segments
%% are handled by parse_segment/3.
parse_segment_nz_nc(?STRING(Str), Acc, URI) when is_list(Acc) ->
    %% binary input: convert to a character list and continue
    parse_segment_nz_nc(unicode:characters_to_list(Str), Acc, URI);
parse_segment_nz_nc([Chunk|Rest], Acc, URI) when is_binary(Chunk) ->
    Chars = unicode:characters_to_list(Chunk, utf8),
    parse_segment_nz_nc(Chars ++ Rest, Acc, URI);
parse_segment_nz_nc([Chunk|Rest], Acc, URI) when is_list(Chunk) ->
    parse_segment_nz_nc(Chunk ++ Rest, Acc, URI);
parse_segment_nz_nc([$/|Rest], Acc, URI) ->
    parse_segment(Rest, [$/|Acc], URI);
parse_segment_nz_nc([$?|Rest], Acc, URI) ->
    Path = lists:reverse(Acc),
    parse_query(Rest, [$?], URI#{path => Path});
parse_segment_nz_nc([$#|Rest], Acc, URI) ->
    Path = lists:reverse(Acc),
    parse_fragment(Rest, [], URI#{path => Path});
parse_segment_nz_nc([Char|Rest], Acc, URI) ->
    is_segment_nz_nc(Char) orelse throw(uri_parse_error),
    parse_segment_nz_nc(Rest, [Char|Acc], URI);
parse_segment_nz_nc([], Acc, URI) ->
    URI#{path => lists:reverse(Acc)}.

%% True for the characters accepted as pchar: "%" (start of a pct-encoded
%% triplet), ":", "@", and anything accepted by is_unreserved/1 or
%% is_sub_delim/1.
-spec is_pchar(char()) -> boolean().
is_pchar(Char) when Char =:= $%; Char =:= $:; Char =:= $@ ->
    true;
is_pchar(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).

%% True for the characters accepted in segment-nz-nc: the same set as
%% is_pchar/1 except that ":" is not special-cased.
-spec is_segment_nz_nc(char()) -> boolean().
is_segment_nz_nc(Char) when Char =:= $%; Char =:= $@ ->
    true;
is_segment_nz_nc(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.1. Scheme]
%%
%% Each URI begins with a scheme name that refers to a specification for
%% assigning identifiers within that scheme.
%%
%%    scheme      = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
%%-------------------------------------------------------------------------
-spec parse_scheme_start(binary(), uri_map()) -> uri_map().
%% Binary variant. The first character must satisfy is_alpha/1, otherwise
%% uri_parse_error is thrown. The scheme is cut out of Rest using the tail
%% returned by parse_scheme/2 (the input after ":"); the extra 1 accounts
%% for the ":" itself.
parse_scheme_start(?STRING_REST(Char, Rest), URI) ->
    case is_alpha(Char) of
        true  ->
            {T, URI1} = parse_scheme(Rest, URI),
            {Scheme, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1),
            URI1#{scheme => ?STRING_REST(Char, Scheme)};
        false ->
            throw(uri_parse_error)
    end.

%% List variant. Acc collects the scheme characters in reverse order.
-spec parse_scheme_start(iolist(), list(), uri_map()) -> uri_map().
parse_scheme_start(?STRING(Str), Acc, URI) when is_list(Acc) ->
    %% Binary input, e.g. what is left of [<<>> | <<"foo:">>] once the empty
    %% head has been spliced away. All sibling list parsers have this
    %% clause; without it such input ends in function_clause.
    parse_scheme_start(unicode:characters_to_list(Str), Acc, URI);
parse_scheme_start([H|Rest], Acc, URI) when is_binary(H) ->
    parse_scheme_start(unicode:characters_to_list(H, utf8) ++ Rest,
                       Acc, URI);
parse_scheme_start([H|Rest], Acc, URI) when is_list(H) ->
    parse_scheme_start(H ++ Rest, Acc, URI);
parse_scheme_start([Char|Rest], Acc, URI) ->
    case is_alpha(Char) of
        true  -> parse_scheme(Rest, [Char|Acc], URI);
        false -> throw(uri_parse_error)
    end.


%% Binary variant. Consumes scheme characters up to ":" and then parses
%% the hier-part. Returns the input that followed ":" together with the
%% updated map. Input that ends before ":" throws uri_parse_error.
-spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}.
parse_scheme(?STRING_REST($:, Rest), URI) ->
    {_, URI1} = parse_hier(Rest, URI),
    {Rest, URI1};
parse_scheme(?STRING_REST(Char, Rest), URI) ->
    case is_scheme(Char) of
        true  -> parse_scheme(Rest, URI);
        false -> throw(uri_parse_error)
    end;
parse_scheme(?STRING_EMPTY, _URI) ->
    throw(uri_parse_error).

%% List variant. On ":" the reversed Acc is stored under the scheme key
%% and parsing continues with the hier-part.
-spec parse_scheme(iolist(), list(), uri_map()) -> uri_map().
parse_scheme(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_scheme(unicode:characters_to_list(Str), Acc, URI);
parse_scheme([H|Rest], Acc, URI) when is_binary(H) ->
    parse_scheme(unicode:characters_to_list(H, utf8) ++ Rest,
                 Acc, URI);
parse_scheme([H|Rest], Acc, URI) when is_list(H) ->
    parse_scheme(H ++ Rest, Acc, URI);
parse_scheme([$:|Rest], Acc, URI) ->
    parse_hier(Rest, [], URI#{scheme => lists:reverse(Acc)});
parse_scheme([Char|Rest], Acc, URI) ->
    case is_scheme(Char) of
        true  -> parse_scheme(Rest, [Char|Acc], URI);
        false -> throw(uri_parse_error)
    end;
parse_scheme([], _Acc, _URI) ->
    throw(uri_parse_error).

%% Check if char is allowed in scheme
-spec is_scheme(char()) -> boolean().
is_scheme($+) -> true;
is_scheme($-) -> true;
is_scheme($.) -> true;
is_scheme(Char) -> is_alpha(Char) orelse is_digit(Char).


%%-------------------------------------------------------------------------
%%    hier-part   = "//" authority path-abempty
%%                   / path-absolute
%%                   / path-rootless
%%                   / path-empty
%%
%% Binary variant. Every non-empty clause returns the input that followed
%% the first character(s) it matched, together with the updated map.
%%-------------------------------------------------------------------------
-spec parse_hier(binary(), uri_map()) -> {binary(), uri_map()}.
parse_hier(?STRING_REST("//", Rest), URI) ->
    %% Parse userinfo - "//" is NOT part of authority.
    %% If the userinfo attempt throws, the same input is parsed as a host.
    try parse_userinfo(Rest, URI) of
        {T, URI1} ->
            %% T is the input after "@"; the extra 1 drops the "@"
            {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1),
            {Rest, URI1#{userinfo => Userinfo}}
    catch
        throw:uri_parse_error ->
            {T2, URI2} = parse_host(Rest, URI),
            {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T2)),
            {Rest, URI2#{host => Host}}
    end;
parse_hier(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-absolute
    {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_hier(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
    {Rest, URI1#{query => ?STRING_REST($?, Query)}};
parse_hier(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_hier(?STRING_REST(Char, Rest), URI) ->  % path-rootless
    case is_pchar(Char) of
        true ->  % segment_nz
            {T, URI1} = parse_segment(Rest, URI),
            {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
            {Rest, URI1#{path => ?STRING_REST(Char, Path)}};
        false ->
            throw(uri_parse_error)
    end;
parse_hier(?STRING_EMPTY, URI) ->
    {<<>>, URI}.

-spec parse_hier(iolist(), list(), uri_map()) -> uri_map().
%% List variant of the hier-part parser. Only the "//" clause forwards
%% Acc (to the userinfo parser); every other clause starts a fresh
%% accumulator for the component it enters.
parse_hier(?STRING(Str), Acc, URI) when is_list(Acc) ->
    %% binary input: convert to a character list and continue
    parse_hier(unicode:characters_to_list(Str), Acc, URI);
parse_hier([Chunk|Rest], Acc, URI) when is_binary(Chunk) ->
    Chars = unicode:characters_to_list(Chunk, utf8),
    parse_hier(Chars ++ Rest, Acc, URI);
parse_hier([Chunk|Rest], Acc, URI) when is_list(Chunk) ->
    parse_hier(Chunk ++ Rest, Acc, URI);
parse_hier("//" ++ Rest, Acc, URI) ->
    %% Try the authority as "userinfo@host..." first; if that attempt
    %% throws uri_parse_error, parse the same input as a host.
    try
        parse_userinfo(Rest, Acc, URI)
    catch
        throw:uri_parse_error ->
            parse_host(Rest, [], URI)
    end;
parse_hier([$/|Rest], _Acc, URI) ->
    %% path-absolute
    parse_segment(Rest, [$/], URI);
parse_hier([$?|Rest], _Acc, URI) ->
    %% path-empty ?query
    parse_query(Rest, [$?], URI);
parse_hier([$#|Rest], _Acc, URI) ->
    %% path-empty
    parse_fragment(Rest, [], URI);
parse_hier([Char|Rest], _Acc, URI) ->
    %% path-rootless
    is_pchar(Char) orelse throw(uri_parse_error),
    parse_segment(Rest, [Char], URI);
parse_hier([], _Acc, URI) ->
    URI.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2. Authority]
%%
%% Many URI schemes include a hierarchical element for a naming
%% authority so that governance of the name space defined by the
%% remainder of the URI is delegated to that authority (which may, in
%% turn, delegate it further).
%%
%% The authority component is preceded by a double slash ("//") and is
%% terminated by the next slash ("/"), question mark ("?"), or number
%% sign ("#") character, or by the end of the URI.
%%
%%    authority   = [ userinfo "@" ] host [ ":" port ]
%%
%%
%% [RFC 3986, Chapter 3.2.1. User Information]
%%
%% The userinfo subcomponent may consist of a user name and, optionally,
%% scheme-specific information about how to gain authorization to access
%% the resource. The user information, if present, is followed by a
%% commercial at-sign ("@") that delimits it from the host.
%%
%%    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
%%-------------------------------------------------------------------------
%% Binary variant. Consumes userinfo characters up to "@" and then parses
%% the host. Returns the input that followed "@" with the updated map.
%% Throws uri_parse_error when "@" is the last character, when the input
%% ends without "@", or when a character is rejected by is_userinfo/1.
-spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}.
parse_userinfo(?CHAR($@), _URI) ->
    %% URI cannot end in userinfo state
    throw(uri_parse_error);
parse_userinfo(?STRING_REST($@, Rest), URI) ->
    {Tail, Parsed} = parse_host(Rest, URI),
    HostLen = byte_size(Rest) - byte_size_exl_head(Tail),
    {Host, _} = split_binary(Rest, HostLen),
    {Rest, Parsed#{host => Host}};
parse_userinfo(?STRING_REST(Char, Rest), URI) ->
    is_userinfo(Char) orelse throw(uri_parse_error),
    parse_userinfo(Rest, URI);
parse_userinfo(?STRING_EMPTY, _URI) ->
    %% URI cannot end in userinfo state
    throw(uri_parse_error).

%% List variant. Acc holds the userinfo characters in reverse order; on
%% "@" it is reversed, stored under the userinfo key, and host parsing
%% starts with an empty accumulator.
-spec parse_userinfo(iolist(), list(), uri_map()) -> uri_map().
parse_userinfo(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_userinfo(unicode:characters_to_list(Str), Acc, URI);
parse_userinfo([Chunk|Rest], Acc, URI) when is_binary(Chunk) ->
    Chars = unicode:characters_to_list(Chunk, utf8),
    parse_userinfo(Chars ++ Rest, Acc, URI);
parse_userinfo([Chunk|Rest], Acc, URI) when is_list(Chunk) ->
    parse_userinfo(Chunk ++ Rest, Acc, URI);
parse_userinfo([$@], _Acc, _URI) ->
    %% URI cannot end in userinfo state
    throw(uri_parse_error);
parse_userinfo([$@|Rest], Acc, URI) ->
    Userinfo = lists:reverse(Acc),
    parse_host(Rest, [], URI#{userinfo => Userinfo});
parse_userinfo([Char|Rest], Acc, URI) ->
    is_userinfo(Char) orelse throw(uri_parse_error),
    parse_userinfo(Rest, [Char|Acc], URI);
parse_userinfo([], _Acc, _URI) ->
    %% URI cannot end in userinfo state
    throw(uri_parse_error).

%% True for the characters accepted in userinfo: "%" (start of a
%% pct-encoded triplet), ":", and anything accepted by is_unreserved/1 or
%% is_sub_delim/1.
-spec is_userinfo(char()) -> boolean().
is_userinfo(Char) when Char =:= $%; Char =:= $: ->
    true;
is_userinfo(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).


+ + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 3.2.2. Host] +%% +%% The host subcomponent of authority is identified by an IP literal +%% encapsulated within square brackets, an IPv4 address in dotted- +%% decimal form, or a registered name. +%% +%% host = IP-literal / IPv4address / reg-name +%% +%% IP-literal = "[" ( IPv6address / IPvFuture ) "]" +%% +%% IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) +%% +%% IPv6address = 6( h16 ":" ) ls32 +%% / "::" 5( h16 ":" ) ls32 +%% / [ h16 ] "::" 4( h16 ":" ) ls32 +%% / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 +%% / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 +%% / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 +%% / [ *4( h16 ":" ) h16 ] "::" ls32 +%% / [ *5( h16 ":" ) h16 ] "::" h16 +%% / [ *6( h16 ":" ) h16 ] "::" +%% +%% ls32 = ( h16 ":" h16 ) / IPv4address +%% ; least-significant 32 bits of address +%% +%% h16 = 1*4HEXDIG +%% ; 16 bits of address represented in hexadecimal +%% +%% IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +%% +%% dec-octet = DIGIT ; 0-9 +%% / %x31-39 DIGIT ; 10-99 +%% / "1" 2DIGIT ; 100-199 +%% / "2" %x30-34 DIGIT ; 200-249 +%% / "25" %x30-35 ; 250-255 +%% +%% reg-name = *( unreserved / pct-encoded / sub-delims ) +%%------------------------------------------------------------------------- +%% TODO: implement parsing of IPv4/IPv6 addresses +-spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. 
%% Binary variant of the host parser. Consumes reg-name characters until a
%% delimiter (":", "/", "?", "#") or the end of the input. Each delimiter
%% clause parses the component that follows and returns the input after
%% the delimiter together with the updated map; a character rejected by
%% is_reg_name/1 throws uri_parse_error.
parse_host(?STRING_REST($:, Rest), URI) ->
    %% parse_port/2 has accepted Rest as *DIGIT followed by "/", "?", "#"
    %% or end of input, so the port is the leading digit run of Rest.
    %% Cutting it with the returned tail fails when the delimiter is the
    %% last character ("80/") and for an empty port, where
    %% binary_to_integer/1 raises badarg instead of uri_parse_error.
    {_T, URI1} = parse_port(Rest, URI),
    {Rest, parse_host_put_port(parse_host_port_digits(Rest, 0), URI1)};
parse_host(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
    {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_host(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
    {Rest, URI1#{query => ?STRING_REST($?, Query)}};
parse_host(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_host(?STRING_REST(Char, Rest), URI) ->
    case is_reg_name(Char) of
        true  -> parse_host(Rest, URI);
        false -> throw(uri_parse_error)
    end;
parse_host(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% Returns the leading run of ASCII digits of Bin (possibly empty).
%% N is the number of digits found so far.
-spec parse_host_port_digits(binary(), non_neg_integer()) -> binary().
parse_host_port_digits(Bin, N) ->
    case Bin of
        <<_:N/binary, D, _/binary>> when D >= $0, D =< $9 ->
            parse_host_port_digits(Bin, N + 1);
        _ ->
            {Digits, _} = split_binary(Bin, N),
            Digits
    end.

%% Stores the port as an integer. An empty port ("host:") is allowed by
%% the grammar (port = *DIGIT); in that case the port key is left unset.
-spec parse_host_put_port(binary(), uri_map()) -> uri_map().
parse_host_put_port(<<>>, URI) ->
    URI;
parse_host_put_port(Digits, URI) ->
    URI#{port => binary_to_integer(Digits)}.

-spec parse_host(iolist(), list(), uri_map()) -> uri_map().
%% List variant of the host parser. Acc holds the host characters in
%% reverse order; at a delimiter or the end of the input it is reversed
%% and stored under the host key.
parse_host(?STRING(Str), Acc, URI) when is_list(Acc) ->
    %% binary input: convert to a character list and continue
    parse_host(unicode:characters_to_list(Str), Acc, URI);
parse_host([Chunk|Rest], Acc, URI) when is_binary(Chunk) ->
    Chars = unicode:characters_to_list(Chunk, utf8),
    parse_host(Chars ++ Rest, Acc, URI);
parse_host([Chunk|Rest], Acc, URI) when is_list(Chunk) ->
    parse_host(Chunk ++ Rest, Acc, URI);
parse_host([$:|Rest], Acc, URI) ->
    Host = lists:reverse(Acc),
    parse_port(Rest, [], URI#{host => Host});
parse_host([$/|Rest], Acc, URI) ->
    %% path-abempty
    Host = lists:reverse(Acc),
    parse_segment(Rest, [$/], URI#{host => Host});
parse_host([$?|Rest], Acc, URI) ->
    %% path-empty ?query
    Host = lists:reverse(Acc),
    parse_query(Rest, [$?], URI#{host => Host});
parse_host([$#|Rest], Acc, URI) ->
    %% path-empty
    Host = lists:reverse(Acc),
    parse_fragment(Rest, [], URI#{host => Host});
parse_host([Char|Rest], Acc, URI) ->
    is_reg_name(Char) orelse throw(uri_parse_error),
    parse_host(Rest, [Char|Acc], URI);
parse_host([], Acc, URI) ->
    URI#{host => lists:reverse(Acc)}.

%% True for the characters accepted in reg-name: "%" (start of a
%% pct-encoded triplet) and anything accepted by is_unreserved/1 or
%% is_sub_delim/1.
-spec is_reg_name(char()) -> boolean().
is_reg_name(Char) when Char =:= $% ->
    true;
is_reg_name(Char) ->
    is_unreserved(Char) orelse is_sub_delim(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2.3. Port]
%%
%% The port subcomponent of authority is designated by an optional port
%% number in decimal following the host and delimited from it by a
%% single colon (":") character.
%%
%%    port        = *DIGIT
%%-------------------------------------------------------------------------
-spec parse_port(binary(), uri_map()) -> {binary(), uri_map()}.
%% Binary variant of the port parser. Only validates: digits are consumed
%% until "/", "?", "#" or the end of the input, and any other character
%% throws uri_parse_error. The port key itself is set by the caller. Each
%% delimiter clause returns the input after the delimiter together with
%% the updated map.
parse_port(?STRING_REST($/, Rest), URI) ->
    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
    {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_port(?STRING_REST($?, Rest), URI) ->
    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
    {Rest, URI1#{query => ?STRING_REST($?, Query)}};
parse_port(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_port(?STRING_REST(Char, Rest), URI) ->
    case is_digit(Char) of
        true  -> parse_port(Rest, URI);
        false -> throw(uri_parse_error)
    end;
parse_port(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% List variant of the port parser. Acc holds the port digits in reverse
%% order; at a delimiter or the end of the input the port is stored as an
%% integer by parse_port_put/2.
-spec parse_port(iolist(), list(), uri_map()) -> uri_map().
parse_port(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_port(unicode:characters_to_list(Str), Acc, URI);
parse_port([H|Rest], Acc, URI) when is_binary(H) ->
    parse_port(unicode:characters_to_list(H, utf8) ++ Rest,
               Acc, URI);
parse_port([H|Rest], Acc, URI) when is_list(H) ->
    parse_port(H ++ Rest, Acc, URI);
parse_port([$/|Rest], Acc, URI) ->
    parse_segment(Rest, [$/], parse_port_put(Acc, URI));  % path-abempty
parse_port([$?|Rest], Acc, URI) ->
    parse_query(Rest, [$?], parse_port_put(Acc, URI));  % path-empty ?query
parse_port([$#|Rest], Acc, URI) ->
    parse_fragment(Rest, [], parse_port_put(Acc, URI));  % path-empty
parse_port([Char|Rest], Acc, URI) ->
    case is_digit(Char) of
        true  -> parse_port(Rest, [Char|Acc], URI);
        false -> throw(uri_parse_error)
    end;
parse_port([], Acc, URI) ->
    parse_port_put(Acc, URI).

%% Stores the port collected in Acc (digits in reverse order). An empty
%% port ("host:") is allowed by the grammar (port = *DIGIT); the key is
%% then left unset. string:to_integer/1 on an empty string returns
%% {error, no_integer}, which would otherwise put the atom error in the map.
-spec parse_port_put(list(), uri_map()) -> uri_map().
parse_port_put([], URI) ->
    URI;
parse_port_put(Acc, URI) ->
    URI#{port => list_to_integer(lists:reverse(Acc))}.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.4. Query]
%%
%% The query component contains non-hierarchical data that, along with
%% data in the path component (Section 3.3), serves to identify a
%% resource within the scope of the URI's scheme and naming authority
%% (if any). The query component is indicated by the first question
%% mark ("?") character and terminated by a number sign ("#") character
%% or by the end of the URI.
%%
%%    query       = *( pchar / "/" / "?" )
%%-------------------------------------------------------------------------
%% Binary variant. Consumes query characters until "#" or the end of the
%% input. The "#" clause parses the fragment and returns the input after
%% "#" together with the updated map. A character rejected by is_query/1
%% throws uri_parse_error.
-spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}.
parse_query(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_query(?STRING_REST(Char, Rest), URI) ->
    case is_query(Char) of
        true  -> parse_query(Rest, URI);
        false -> throw(uri_parse_error)
    end;
parse_query(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% List variant. Acc holds the query characters in reverse order; it is
%% reversed and stored under the query key at "#" or the end of the input.
-spec parse_query(iolist(), list(), uri_map()) -> uri_map().
parse_query(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_query(unicode:characters_to_list(Str), Acc, URI);
parse_query([H|Rest], Acc, URI) when is_binary(H) ->
    parse_query(unicode:characters_to_list(H, utf8) ++ Rest,
                Acc, URI);
parse_query([H|Rest], Acc, URI) when is_list(H) ->
    parse_query(H ++ Rest, Acc, URI);
parse_query([$#|Rest], Acc, URI) ->
    parse_fragment(Rest, [], URI#{query => lists:reverse(Acc)});
parse_query([Char|Rest], Acc, URI) ->
    case is_query(Char) of
        true  -> parse_query(Rest, [Char|Acc], URI);
        false -> throw(uri_parse_error)
    end;
parse_query([], Acc, URI) ->
    URI#{query => lists:reverse(Acc)}.

%% Check if char is allowed in query.
%% "?" is part of the query grammar above (and is accepted by
%% is_fragment/1), so a query such as "?a=1?b=2" must not be rejected.
-spec is_query(char()) -> boolean().
is_query($/) -> true;
is_query($?) -> true;
is_query(Char) -> is_pchar(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.5. Fragment]
%%
%% The fragment identifier component of a URI allows indirect
%% identification of a secondary resource by reference to a primary
%% resource and additional identifying information.
%%
%%    fragment    = *( pchar / "/" / "?" )
%%-------------------------------------------------------------------------
%% Binary variant. Only validates: consumes characters accepted by
%% is_fragment/1 until the end of the input and returns the empty binary
%% with the map unchanged. Any other character throws uri_parse_error.
-spec parse_fragment(binary(), uri_map()) -> {binary(), uri_map()}.
parse_fragment(?STRING_REST(Char, Rest), URI) ->
    is_fragment(Char) orelse throw(uri_parse_error),
    parse_fragment(Rest, URI);
parse_fragment(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% List variant. Acc holds the fragment characters in reverse order; at
%% the end of the input it is reversed and stored under the fragment key.
-spec parse_fragment(iolist(), list(), uri_map()) -> uri_map().
parse_fragment(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_fragment(unicode:characters_to_list(Str), Acc, URI);
parse_fragment([Chunk|Rest], Acc, URI) when is_binary(Chunk) ->
    Chars = unicode:characters_to_list(Chunk, utf8),
    parse_fragment(Chars ++ Rest, Acc, URI);
parse_fragment([Chunk|Rest], Acc, URI) when is_list(Chunk) ->
    parse_fragment(Chunk ++ Rest, Acc, URI);
parse_fragment([Char|Rest], Acc, URI) ->
    is_fragment(Char) orelse throw(uri_parse_error),
    parse_fragment(Rest, [Char|Acc], URI);
parse_fragment([], Acc, URI) ->
    URI#{fragment => lists:reverse(Acc)}.

%% True for the characters accepted in a fragment: "/", "?" and anything
%% accepted by is_pchar/1.
-spec is_fragment(char()) -> boolean().
is_fragment(Char) when Char =:= $/; Char =:= $? ->
    true;
is_fragment(Char) ->
    is_pchar(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 2.2. Reserved Characters]
%%
%%   reserved    = gen-delims / sub-delims
%%
%%   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
%%
%%   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
%%               / "*" / "+" / "," / ";" / "="
%%
%%-------------------------------------------------------------------------
%% %% Return true if input char is reserved.
%% -spec is_reserved(char()) -> boolean().
%% is_reserved(Char) ->
%%     is_gen_delim(Char) orelse is_sub_delim(Char).

%% %% Check if char is reserved.
%% -spec is_gen_delim(char()) -> boolean().
%% is_gen_delim($:) -> true;
%% is_gen_delim($/) -> true;
%% is_gen_delim($?) -> true;
%% is_gen_delim($#) -> true;
%% is_gen_delim($[) -> true;
%% is_gen_delim($]) -> true;
%% is_gen_delim($@) -> true;
%% is_gen_delim(_) -> false.

%% True when Char is one of the sub-delims:
%% "!" "$" "&" "'" "(" ")" "*" "+" "," ";" "="
-spec is_sub_delim(char()) -> boolean().
is_sub_delim(Char) ->
    lists:member(Char, "!$&'()*+,;=").


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 2.3. Unreserved Characters]
%%
%%   unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
%%
%%-------------------------------------------------------------------------
-spec is_unreserved(char()) -> boolean().
is_unreserved(Char) ->
    lists:member(Char, "-._~")
        orelse is_alpha(Char)
        orelse is_digit(Char).

%% True for the ASCII letters A-Z and a-z.
-spec is_alpha(char()) -> boolean().
is_alpha(C) ->
    (C >= $A andalso C =< $Z) orelse (C >= $a andalso C =< $z).

%% True for the ASCII digits 0-9.
-spec is_digit(char()) -> boolean().
is_digit(C) ->
    C >= $0 andalso C =< $9.

%% Returns 0 for the empty binary and byte_size(Binary) + 1 otherwise.
%% Used in calls to split_binary(): the parsers subtract the result from
%% the size of the input that contains Binary as its tail, so the extra
%% byte excludes the delimiter that precedes that tail.
-spec byte_size_exl_head(binary()) -> number().
byte_size_exl_head(Binary) ->
    case byte_size(Binary) of
        0    -> 0;
        Size -> Size + 1
    end.
diff --git a/lib/stdlib/test/Makefile b/lib/stdlib/test/Makefile index 523cb95065..8490770f3d 100644 --- a/lib/stdlib/test/Makefile +++ b/lib/stdlib/test/Makefile @@ -87,6 +87,7 @@ MODULES= \ timer_simple_SUITE \ unicode_SUITE \ unicode_util_SUITE \ + uri_string_SUITE \ win32reg_SUITE \ y2k_SUITE \ select_SUITE \ diff --git a/lib/stdlib/test/property_test/README b/lib/stdlib/test/property_test/README new file mode 100644 index 0000000000..57602bf719 --- /dev/null +++ b/lib/stdlib/test/property_test/README @@ -0,0 +1,12 @@ + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% %%% +%%% WARNING %%% +%%% %%% +%%% This is experimental code which may be changed or removed %%% +%%% anytime without any warning. %%% +%%% %%% +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + +The tests in this directory are written assuming that the user has a QuickCheck license. They are to be run manually. Some of them can possibly be run with other tools, e.g. PropEr. + diff --git a/lib/stdlib/test/property_test/uri_string_decode.erl b/lib/stdlib/test/property_test/uri_string_decode.erl new file mode 100644 index 0000000000..137a649cf1 --- /dev/null +++ b/lib/stdlib/test/property_test/uri_string_decode.erl @@ -0,0 +1,55 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2017. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +-module(uri_string_decode). + +-compile(export_all). 
+ +-proptest(eqc). +-proptest([triq,proper]). + +-ifndef(EQC). +-ifndef(PROPER). +-ifndef(TRIQ). +-define(EQC,true). +-endif. +-endif. +-endif. + +-ifdef(EQC). +-include_lib("eqc/include/eqc.hrl"). +-define(MOD_eqc,eqc). + +-else. +-ifdef(PROPER). +-include_lib("proper/include/proper.hrl"). +-define(MOD_eqc,proper). + +-else. +-ifdef(TRIQ). +-define(MOD_eqc,triq). +-include_lib("triq/include/triq.hrl"). + +-endif. +-endif. +-endif. + + +prop_uri_string_decode() -> + ok. diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl new file mode 100644 index 0000000000..189941de03 --- /dev/null +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -0,0 +1,326 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2017. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +-module(uri_string_SUITE). + +-include_lib("common_test/include/ct.hrl"). + +-export([all/0, suite/0,groups/0, + parse_binary_fragment/1, parse_binary_host/1, parse_binary_path/1, parse_binary_port/1, + parse_binary_query/1, parse_binary_scheme/1, parse_binary_userinfo/1, + parse_fragment/1, parse_host/1, parse_path/1, parse_port/1, + parse_query/1, parse_scheme/1, parse_userinfo/1, + parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1 + ]). + +suite() -> + [{timetrap,{minutes,1}}]. 

%% Test cases run by this suite: the binary-input cases first, then the
%% list-input cases, then the combined ones.
all() ->
    [parse_binary_scheme,
     parse_binary_userinfo,
     parse_binary_host,
     parse_binary_port,
     parse_binary_path,
     parse_binary_query,
     parse_binary_fragment,
     parse_scheme,
     parse_userinfo,
     parse_host,
     parse_port,
     parse_path,
     parse_query,
     parse_fragment,
     parse_list,
     parse_binary,
     parse_mixed,
     parse_relative].

%% No test groups are defined.
groups() ->
    [].


%% TODO: Negative tests
%% uri_string:parse(<<"?name=ferret">>).
%% uri_string:parse("//user@")
%% uri_string:parse("foo://user@")
%% uri_string:parse(":600").
%%
%% uri_string:parse("//:8042x").
%%
%% io:format("# DEBUG T: >>~s<<~n", [T]),

%% Scheme handling for binary input. Each line asserts, by matching, the
%% keys that uri_string:parse/1 must return for the given input.
parse_binary_scheme(_Config) ->
    %% with or without a scheme
    #{} =
        uri_string:parse(<<>>),
    #{path := <<"foo">>} =
        uri_string:parse(<<"foo">>),
    #{scheme := <<"foo">>} =
        uri_string:parse(<<"foo:">>),
    #{scheme := <<"foo">>, path := <<"bar:nisse">>} =
        uri_string:parse(<<"foo:bar:nisse">>),
    #{scheme := <<"foo">>, host := <<"">>} =
        uri_string:parse(<<"foo://">>),
    #{scheme := <<"foo">>, host := <<"">>, path := <<"/">>} =
        uri_string:parse(<<"foo:///">>),
    #{scheme := <<"foo">>, host := <<"">>, path := <<"//">>} =
        uri_string:parse(<<"foo:////">>),

    %% slashes only
    #{path := <<"/">>} =
        uri_string:parse(<<"/">>),
    #{host := <<>>} =
        uri_string:parse(<<"//">>),
    #{host := <<>>, path := <<"/">>} =
        uri_string:parse(<<"///">>).

%% Userinfo handling for binary input: "@" only delimits userinfo inside
%% an authority (after "//"); elsewhere it is part of the path.
parse_binary_userinfo(_Config) ->
    #{scheme := <<"user">>, path := <<"password@localhost">>} =
        uri_string:parse(<<"user:password@localhost">>),
    #{path := <<"user@">>} =
        uri_string:parse(<<"user@">>),
    #{path := <<"/user@">>} =
        uri_string:parse(<<"/user@">>),
    #{path := <<"user@localhost">>} =
        uri_string:parse(<<"user@localhost">>),
    #{userinfo := <<"user">>, host := <<"localhost">>} =
        uri_string:parse(<<"//user@localhost">>),
    #{userinfo := <<"user:password">>, host := <<"localhost">>} =
        uri_string:parse(<<"//user:password@localhost">>),
    #{scheme := <<"foo">>, path := <<"/user@">>} =
        uri_string:parse(<<"foo:/user@">>),
    #{scheme := <<"foo">>, userinfo := <<"user">>, host := <<"localhost">>} =
        uri_string:parse(<<"foo://user@localhost">>),
    #{scheme := <<"foo">>, userinfo := <<"user:password">>, host := <<"localhost">>} =
        uri_string:parse(<<"foo://user:password@localhost">>).

%% Host handling for binary input.
parse_binary_host(_Config) ->
    #{host := <<"hostname">>} =
        uri_string:parse(<<"//hostname">>),
    #{host := <<"hostname">>, scheme := <<"foo">>} =
        uri_string:parse(<<"foo://hostname">>),
    #{host := <<"hostname">>, scheme := <<"foo">>, userinfo := <<"user">>} =
        uri_string:parse(<<"foo://user@hostname">>).

%% Port handling for binary input: ":8042" is a port only inside an
%% authority; after a single "/" it belongs to the path.
parse_binary_port(_Config) ->
    #{path := <<"/:8042">>} =
        uri_string:parse(<<"/:8042">>),
    #{host := <<>>, port := 8042} =
        uri_string:parse(<<"//:8042">>),
    #{host := <<"example.com">>, port := 8042} =
        uri_string:parse(<<"//example.com:8042">>),
    #{scheme := <<"foo">>, path := <<"/:8042">>} =
        uri_string:parse(<<"foo:/:8042">>),
    #{scheme := <<"foo">>, host := <<>>, port := 8042} =
        uri_string:parse(<<"foo://:8042">>),
    #{scheme := <<"foo">>, host := <<"example.com">>, port := 8042} =
        uri_string:parse(<<"foo://example.com:8042">>).

%% Path handling for binary input.
parse_binary_path(_Config) ->
    #{path := <<"over/there">>} =
        uri_string:parse(<<"over/there">>),
    #{path := <<"/over/there">>} =
        uri_string:parse(<<"/over/there">>),
    #{scheme := <<"foo">>, path := <<"/over/there">>} =
        uri_string:parse(<<"foo:/over/there">>),
    #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/over/there">>} =
        uri_string:parse(<<"foo://example.com/over/there">>),
    #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/over/there">>, port := 8042} =
        uri_string:parse(<<"foo://example.com:8042/over/there">>).

%% Query handling for binary input. The query value keeps its leading "?".
parse_binary_query(_Config) ->
    %% with a scheme
    #{scheme := <<"foo">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"foo:?name=ferret">>),
    #{scheme := <<"foo">>, path := <<"over/there">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"foo:over/there?name=ferret">>),
    #{scheme := <<"foo">>, path := <<"/over/there">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"foo:/over/there?name=ferret">>),
    #{scheme := <<"foo">>, host := <<"example.com">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"foo://example.com?name=ferret">>),
    #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"foo://example.com/?name=ferret">>),

    %% without a scheme
    #{query := <<"?name=ferret">>} =
        uri_string:parse(<<"?name=ferret">>),
    #{path := <<"over/there">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"over/there?name=ferret">>),
    #{path := <<"/">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"/?name=ferret">>),
    #{path := <<"/over/there">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"/over/there?name=ferret">>),
    #{host := <<"example.com">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"//example.com?name=ferret">>),
    #{host := <<"example.com">>, path := <<"/">>, query := <<"?name=ferret">>} =
        uri_string:parse(<<"//example.com/?name=ferret">>).


%% Fragment handling for binary input. The fragment value does not
%% include the leading "#".
parse_binary_fragment(_Config) ->
    %% with a scheme
    #{scheme := <<"foo">>, fragment := <<"nose">>} =
        uri_string:parse(<<"foo:#nose">>),
    #{scheme := <<"foo">>, path := <<"over/there">>, fragment := <<"nose">>} =
        uri_string:parse(<<"foo:over/there#nose">>),
    #{scheme := <<"foo">>, path := <<"/over/there">>, fragment := <<"nose">>} =
        uri_string:parse(<<"foo:/over/there#nose">>),
    #{scheme := <<"foo">>, host := <<"example.com">>, fragment := <<"nose">>} =
        uri_string:parse(<<"foo://example.com#nose">>),
    #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/">>, fragment := <<"nose">>} =
        uri_string:parse(<<"foo://example.com/#nose">>),
    #{scheme := <<"foo">>, host := <<"example.com">>, fragment := <<"nose">>} =
        uri_string:parse(<<"foo://example.com#nose">>),

    %% without a scheme
    #{fragment := <<"nose">>} =
        uri_string:parse(<<"#nose">>),
    #{path := <<"over/there">>, fragment := <<"nose">>} =
        uri_string:parse(<<"over/there#nose">>),
    #{path := <<"/">>, fragment := <<"nose">>} =
        uri_string:parse(<<"/#nose">>),
    #{path := <<"/over/there">>, fragment := <<"nose">>} =
        uri_string:parse(<<"/over/there#nose">>),
    #{host := <<"example.com">>, fragment := <<"nose">>} =
        uri_string:parse(<<"//example.com#nose">>),
    #{host := <<"example.com">>, path := <<"/">>, fragment := <<"nose">>} =
        uri_string:parse(<<"//example.com/#nose">>).

%% Scheme handling for list (string) input; mirrors parse_binary_scheme/1.
parse_scheme(_Config) ->
    %% with or without a scheme
    #{} =
        uri_string:parse(""),
    #{path := "foo"} =
        uri_string:parse("foo"),
    #{scheme := "foo"} =
        uri_string:parse("foo:"),
    #{scheme := "foo", path := "bar:nisse"} =
        uri_string:parse("foo:bar:nisse"),
    #{scheme := "foo", host := ""} =
        uri_string:parse("foo://"),
    #{scheme := "foo", host := "", path := "/"} =
        uri_string:parse("foo:///"),
    #{scheme := "foo", host := "", path := "//"} =
        uri_string:parse("foo:////"),

    %% slashes only
    #{path := "/"} =
        uri_string:parse("/"),
    #{host := ""} =
        uri_string:parse("//"),
    #{host := "", path := "/"} =
        uri_string:parse("///").
+
+%% List input: userinfo is only reported when an authority (`//`) is
+%% present; otherwise the `@` ends up in the path (or, for
+%% "user:password@localhost", "user" is taken as the scheme).
+parse_userinfo(_Config) ->
+    #{scheme := "user", path := "password@localhost"} =
+        uri_string:parse("user:password@localhost"),
+    #{path := "user@"} =
+        uri_string:parse("user@"),
+    #{path := "/user@"} =
+        uri_string:parse("/user@"),
+    #{path := "user@localhost"} =
+        uri_string:parse("user@localhost"),
+    #{userinfo := "user", host := "localhost"} =
+        uri_string:parse("//user@localhost"),
+    #{userinfo := "user:password", host := "localhost"} =
+        uri_string:parse("//user:password@localhost"),
+    #{scheme := "foo", path := "/user@"} =
+        uri_string:parse("foo:/user@"),
+    #{scheme := "foo", userinfo := "user", host := "localhost"} =
+        uri_string:parse("foo://user@localhost"),
+    #{scheme := "foo", userinfo := "user:password", host := "localhost"} =
+        uri_string:parse("foo://user:password@localhost").
+
+%% List input: registered-name hosts, with and without scheme/userinfo.
+parse_host(_Config) ->
+    #{host := "hostname"} =
+        uri_string:parse("//hostname"),
+    #{host := "hostname", scheme := "foo"} =
+        uri_string:parse("foo://hostname"),
+    #{host := "hostname", scheme := "foo", userinfo := "user"} =
+        uri_string:parse("foo://user@hostname").
+
+%% List input: the port is returned as an integer and is only recognised
+%% inside an authority; "/:8042" is an ordinary path.
+parse_port(_Config) ->
+    #{path := "/:8042"} =
+        uri_string:parse("/:8042"),
+    #{host := "", port := 8042} =
+        uri_string:parse("//:8042"),
+    #{host := "example.com", port := 8042} =
+        uri_string:parse("//example.com:8042"),
+    #{scheme := "foo", path := "/:8042"} =
+        uri_string:parse("foo:/:8042"),
+    #{scheme := "foo", host := "", port := 8042} =
+        uri_string:parse("foo://:8042"),
+    #{scheme := "foo", host := "example.com", port := 8042} =
+        uri_string:parse("foo://example.com:8042").
+
+%% List input: the path component is reported for relative references,
+%% absolute paths, and URIs that also carry a scheme, a host or a port.
+parse_path(_Config) ->
+    #{path := "over/there"} =
+        uri_string:parse("over/there"),
+    #{path := "/over/there"} =
+        uri_string:parse("/over/there"),
+    #{scheme := "foo", path := "/over/there"} =
+        uri_string:parse("foo:/over/there"),
+    #{scheme := "foo", host := "example.com", path := "/over/there"} =
+        uri_string:parse("foo://example.com/over/there"),
+    #{scheme := "foo", host := "example.com", path := "/over/there",
+      port := 8042} =
+        uri_string:parse("foo://example.com:8042/over/there").
+
+%% List input: the query component. The expected value keeps its leading
+%% `?`. The first group of cases carries a scheme, the second group
+%% consists of relative references.
+parse_query(_Config) ->
+    #{scheme := "foo", query := "?name=ferret"} =
+        uri_string:parse("foo:?name=ferret"),
+    #{scheme := "foo", path := "over/there", query := "?name=ferret"} =
+        uri_string:parse("foo:over/there?name=ferret"),
+    #{scheme := "foo", path := "/over/there", query := "?name=ferret"} =
+        uri_string:parse("foo:/over/there?name=ferret"),
+    #{scheme := "foo", host := "example.com", query := "?name=ferret"} =
+        uri_string:parse("foo://example.com?name=ferret"),
+    #{scheme := "foo", host := "example.com", path := "/",
+      query := "?name=ferret"} =
+        uri_string:parse("foo://example.com/?name=ferret"),
+
+    #{query := "?name=ferret"} =
+        uri_string:parse("?name=ferret"),
+    #{path := "over/there", query := "?name=ferret"} =
+        uri_string:parse("over/there?name=ferret"),
+    #{path := "/", query := "?name=ferret"} =
+        uri_string:parse("/?name=ferret"),
+    #{path := "/over/there", query := "?name=ferret"} =
+        uri_string:parse("/over/there?name=ferret"),
+    #{host := "example.com", query := "?name=ferret"} =
+        uri_string:parse("//example.com?name=ferret"),
+    #{host := "example.com", path := "/", query := "?name=ferret"} =
+        uri_string:parse("//example.com/?name=ferret").
+
+
+%% List input: the fragment component. The expected value does not
+%% include the `#` delimiter. The first group of cases carries a scheme,
+%% the second group consists of relative references.
+parse_fragment(_Config) ->
+    #{scheme := "foo", fragment := "nose"} =
+        uri_string:parse("foo:#nose"),
+    #{scheme := "foo", path := "over/there", fragment := "nose"} =
+        uri_string:parse("foo:over/there#nose"),
+    #{scheme := "foo", path := "/over/there", fragment := "nose"} =
+        uri_string:parse("foo:/over/there#nose"),
+    #{scheme := "foo", host := "example.com", fragment := "nose"} =
+        uri_string:parse("foo://example.com#nose"),
+    #{scheme := "foo", host := "example.com", path := "/",
+      fragment := "nose"} =
+        uri_string:parse("foo://example.com/#nose"),
+
+    #{fragment := "nose"} =
+        uri_string:parse("#nose"),
+    #{path := "over/there", fragment := "nose"} =
+        uri_string:parse("over/there#nose"),
+    #{path := "/", fragment := "nose"} =
+        uri_string:parse("/#nose"),
+    #{path := "/over/there", fragment := "nose"} =
+        uri_string:parse("/over/there#nose"),
+    #{host := "example.com", fragment := "nose"} =
+        uri_string:parse("//example.com#nose"),
+    #{host := "example.com", path := "/", fragment := "nose"} =
+        uri_string:parse("//example.com/#nose").
+
+
+%% List input: complete URIs, checking every component in one match.
+parse_list(_Config) ->
+    #{scheme := "foo", path := "bar:nisse"} =
+        uri_string:parse("foo:bar:nisse"),
+    #{scheme := "foo", host := "example.com", port := 8042,
+      path := "/over/there", query := "?name=ferret",
+      fragment := "nose"} =
+        uri_string:parse("foo://example.com:8042/over/there?name=ferret#nose"),
+    #{scheme := "foo", userinfo := "admin:admin", host := "example.com",
+      port := 8042, path := "/over/there", query := "?name=ferret",
+      fragment := "nose"} =
+        uri_string:parse("foo://admin:admin@example.com:8042/over/there?name=ferret#nose").
+
+%% Binary input: complete URIs, checking every component in one match.
+parse_binary(_Config) ->
+    #{scheme := <<"foo">>, path := <<"bar:nisse">>} =
+        uri_string:parse(<<"foo:bar:nisse">>),
+    #{scheme := <<"foo">>, host := <<"example.com">>, port := 8042,
+      path := <<"/over/there">>, query := <<"?name=ferret">>,
+      fragment := <<"nose">>} =
+        uri_string:parse(<<"foo://example.com:8042/over/there?name=ferret#nose">>),
+    #{scheme := <<"foo">>, userinfo := <<"admin:admin">>,
+      host := <<"example.com">>, port := 8042,
+      path := <<"/over/there">>, query := <<"?name=ferret">>,
+      fragment := <<"nose">>} =
+        uri_string:parse(<<"foo://admin:admin@example.com:8042/over/there?name=ferret#nose">>).
+
+
+%% Mixed input: lists that contain binaries. lists:append/2 with a binary
+%% second argument yields a list whose tail is that binary; the last case
+%% is a nested list mixing sub-lists, a binary and plain characters.
+%% The expected components are plain lists in every case.
+parse_mixed(_Config) ->
+    #{scheme := "foo", path := "bar"} =
+        uri_string:parse(lists:append("fo", <<"o:bar">>)),
+    #{scheme := "foo", path := "bar"} =
+        uri_string:parse(lists:append("foo:b", <<"ar">>)),
+    #{scheme := "foo", path := "bar:bar"} =
+        uri_string:parse([[102], [111,111], <<":bar">>, 58, 98, 97, 114]).
+
+%% Mixed input without a scheme: relative references whose tail is a binary.
+parse_relative(_Config) ->
+    #{path := "/path"} =
+        uri_string:parse(lists:append("/pa", <<"th">>)),
+    #{path := "foo"} =
+        uri_string:parse(lists:append("fo", <<"o">>)).
diff --git a/lib/stdlib/test/uri_string_property_test_SUITE.erl b/lib/stdlib/test/uri_string_property_test_SUITE.erl
new file mode 100644
index 0000000000..de5edf54aa
--- /dev/null
+++ b/lib/stdlib/test/uri_string_property_test_SUITE.erl
@@ -0,0 +1,42 @@
+%%
+%% %CopyrightBegin%
+%%
+%% Copyright Ericsson AB 2008-2017. All Rights Reserved.
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +-module(uri_string_property_test_SUITE). + +-include_lib("common_test/include/ct.hrl"). + +-compile(export_all). + +all() -> [decode]. + +init_per_suite(Config) -> + ct_property_test:init_per_suite(Config). + +end_per_suite(Config) -> + Config. + +%%%================================================================ +%%% Test suites +%%% + +decode(Config) -> + ct_property_test:quickcheck( + uri_string_decode:prop_uri_string_decode(), + Config + ). -- cgit v1.2.3 From ec3f0c7f96531b714082f5af694a7ed6a02769ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Thu, 14 Sep 2017 14:25:47 +0200 Subject: stdlib: Add support for parsing IPv4 and IPv6 --- lib/stdlib/src/uri_string.erl | 246 +++++++++++++++++++++++++++++++++-- lib/stdlib/test/uri_string_SUITE.erl | 74 ++++++++--- 2 files changed, 296 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 619da24cbc..3656d561be 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -388,7 +388,7 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), - URI1#{host => Host} + URI1#{host => remove_brackets(Host)} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute @@ -669,7 +669,7 @@ parse_hier(?STRING_REST("//", Rest), URI) -> throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{host => Host}} + {Rest, URI1#{host => remove_brackets(Host)}} end; parse_hier(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute @@ -756,7 +756,7 @@ 
parse_userinfo(?CHAR($@), _URI) -> parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{host => Host}}; + {Rest, URI1#{host => remove_brackets(Host)}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, URI); @@ -834,7 +834,6 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). %% %% reg-name = *( unreserved / pct-encoded / sub-delims ) %%------------------------------------------------------------------------- -%% TODO: implement parsing of IPv4/IPv6 addresses -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), @@ -849,14 +848,16 @@ parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_host(?STRING_REST($[, Rest), URI) -> + parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), {Rest, URI1#{fragment => Fragment}}; parse_host(?STRING_REST(Char, Rest), URI) -> - case is_reg_name(Char) of - true -> parse_host(Rest, URI); - false -> throw(uri_parse_error) + case is_digit(Char) of + true -> parse_ipv4_bin(Rest, [Char], URI); + false -> parse_reg_name(?STRING_REST(Char, Rest), URI) end; parse_host(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. 
@@ -877,12 +878,65 @@ parse_host([$?|Rest], Acc, URI) -> parse_query(Rest, [$?], URI#{host => lists:reverse(Acc)}); % path-empty ?query parse_host([$#|Rest], Acc, URI) -> parse_fragment(Rest, [], URI#{host => lists:reverse(Acc)}); % path-empty +parse_host([$[|Rest], _Acc, URI) -> + parse_ipv6(Rest, [], URI); parse_host([Char|Rest], Acc, URI) -> + case is_digit(Char) of + true -> parse_ipv4(Rest, [Char|Acc], URI); + false -> parse_reg_name([Char|Rest], Acc, URI) + end; +parse_host([], Acc, URI) -> + URI#{host => lists:reverse(Acc)}. + + +-spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. +parse_reg_name(?STRING_REST($:, Rest), URI) -> + {T, URI1} = parse_port(Rest, URI), + {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Port = binary_to_integer(H), + {Rest, URI1#{port => Port}}; +parse_reg_name(?STRING_REST($/, Rest), URI) -> + {T, URI1} = parse_segment(Rest, URI), % path-abempty + {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{path => ?STRING_REST($/, Path)}}; +parse_reg_name(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % path-empty ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_reg_name(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), % path-empty + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + {Rest, URI1#{fragment => Fragment}}; +parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of - true -> parse_host(Rest, [Char|Acc], URI); + true -> parse_reg_name(Rest, URI); false -> throw(uri_parse_error) end; -parse_host([], Acc, URI) -> +parse_reg_name(?STRING_EMPTY, URI) -> + {?STRING_EMPTY, URI}. + +-spec parse_reg_name(iolist(), list(), uri_map()) -> uri_map(). 
+parse_reg_name(?STRING(Str), Acc, URI) when is_list(Acc) -> + parse_reg_name(unicode:characters_to_list(Str), Acc, URI); +parse_reg_name([H|Rest], Acc, URI) when is_binary(H) -> + parse_reg_name(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_reg_name([H|Rest], Acc, URI) when is_list(H) -> + parse_reg_name(H ++ Rest, Acc, URI); +parse_reg_name([$:|Rest], Acc, URI) -> + parse_port(Rest, [], URI#{host => lists:reverse(Acc)}); +parse_reg_name([$/|Rest], Acc, URI) -> + parse_segment(Rest, [$/], URI#{host => lists:reverse(Acc)}); % path-abempty +parse_reg_name([$?|Rest], Acc, URI) -> + parse_query(Rest, [$?], URI#{host => lists:reverse(Acc)}); % path-empty ?query +parse_reg_name([$#|Rest], Acc, URI) -> + parse_fragment(Rest, [], URI#{host => lists:reverse(Acc)}); % path-empty +parse_reg_name([Char|Rest], Acc, URI) -> + case is_reg_name(Char) of + true -> parse_reg_name(Rest, [Char|Acc], URI); + false -> throw(uri_parse_error) + end; +parse_reg_name([], Acc, URI) -> URI#{host => lists:reverse(Acc)}. %% Check if char is allowed in reg-name @@ -891,6 +945,168 @@ is_reg_name($%) -> true; is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). +-spec parse_ipv4_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}. 
+%% Scans an IPv4 host in binary input. Acc collects the characters seen
+%% so far in reverse order. When a delimiter (`:`, `/`, `?`, `#`) or the
+%% end of input is reached, the collected address is validated (the
+%% validated value itself is discarded) and the following component is
+%% parsed. Any character other than a digit or `.` before that point
+%% throws uri_parse_error.
+parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) ->
+    _ = validate_ipv4_address(lists:reverse(Acc)),
+    {Tail, URI1} = parse_port(Rest, URI),
+    {PortDigits, _} =
+        split_binary(Rest, byte_size(Rest) - byte_size_exl_head(Tail)),
+    {Rest, URI1#{port => binary_to_integer(PortDigits)}};
+parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) ->
+    _ = validate_ipv4_address(lists:reverse(Acc)),
+    {Tail, URI1} = parse_segment(Rest, URI),  % path-abempty
+    {PathTail, _} =
+        split_binary(Rest, byte_size(Rest) - byte_size_exl_head(Tail)),
+    {Rest, URI1#{path => ?STRING_REST($/, PathTail)}};
+parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) ->
+    _ = validate_ipv4_address(lists:reverse(Acc)),
+    {Tail, URI1} = parse_query(Rest, URI),  % path-empty ?query
+    {QueryTail, _} =
+        split_binary(Rest, byte_size(Rest) - byte_size_exl_head(Tail)),
+    {Rest, URI1#{query => ?STRING_REST($?, QueryTail)}};
+parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) ->
+    _ = validate_ipv4_address(lists:reverse(Acc)),
+    {Tail, URI1} = parse_fragment(Rest, URI),  % path-empty
+    {Frag, _} = split_binary(Rest, byte_size(Rest) - byte_size(Tail)),
+    {Rest, URI1#{fragment => Frag}};
+parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) ->
+    case is_ipv4(Char) of
+        false -> throw(uri_parse_error);
+        true -> parse_ipv4_bin(Rest, [Char|Acc], URI)
+    end;
+parse_ipv4_bin(?STRING_EMPTY, Acc, URI) ->
+    _ = validate_ipv4_address(lists:reverse(Acc)),
+    {?STRING_EMPTY, URI}.
+
+-spec parse_ipv4(iolist(), list(), uri_map()) -> uri_map().
+%% Scans an IPv4 host in list input. Embedded binaries and nested lists
+%% are flattened on the fly. Acc collects the host characters in reverse
+%% order; at a delimiter or at the end of input the address is validated
+%% and stored under `host` before the next component is parsed.
+parse_ipv4(?STRING(Str), Acc, URI) when is_list(Acc) ->
+    parse_ipv4(unicode:characters_to_list(Str), Acc, URI);
+parse_ipv4([Bin|Rest], Acc, URI) when is_binary(Bin) ->
+    parse_ipv4(unicode:characters_to_list(Bin, utf8) ++ Rest, Acc, URI);
+parse_ipv4([Nested|Rest], Acc, URI) when is_list(Nested) ->
+    parse_ipv4(Nested ++ Rest, Acc, URI);
+parse_ipv4([$:|Rest], Acc, URI) ->
+    Host = validate_ipv4_address(lists:reverse(Acc)),
+    parse_port(Rest, [], URI#{host => Host});
+parse_ipv4([$/|Rest], Acc, URI) ->
+    Host = validate_ipv4_address(lists:reverse(Acc)),
+    parse_segment(Rest, [$/], URI#{host => Host});  % path-abempty
+parse_ipv4([$?|Rest], Acc, URI) ->
+    Host = validate_ipv4_address(lists:reverse(Acc)),
+    parse_query(Rest, [$?], URI#{host => Host});  % path-empty ?query
+parse_ipv4([$#|Rest], Acc, URI) ->
+    Host = validate_ipv4_address(lists:reverse(Acc)),
+    parse_fragment(Rest, [], URI#{host => Host});  % path-empty
+parse_ipv4([Char|Rest], Acc, URI) ->
+    case is_ipv4(Char) of
+        false -> throw(uri_parse_error);
+        true -> parse_ipv4(Rest, [Char|Acc], URI)
+    end;
+parse_ipv4([], Acc, URI) ->
+    URI#{host => validate_ipv4_address(lists:reverse(Acc))}.
+
+%% True for the characters an IPv4 address may contain: digits and `.`.
+-spec is_ipv4(char()) -> boolean().
+is_ipv4(Char) -> Char =:= $. orelse is_digit(Char).
+
+%% Returns Addr unchanged when inet accepts it as a strict IPv4 address,
+%% otherwise throws uri_parse_error.
+-spec validate_ipv4_address(list()) -> list().
+validate_ipv4_address(Addr) ->
+    case inet:parse_ipv4strict_address(Addr) of
+        {error, _} -> throw(uri_parse_error);
+        {ok, _} -> Addr
+    end.
+
+
+%% Scans the inside of a bracketed IPv6 literal in binary input (the
+%% opening `[` has already been consumed by the caller). At the closing
+%% `]` the collected address is validated and scanning continues with
+%% parse_ipv6_bin_end/2. Input that ends before `]` is an error.
+-spec parse_ipv6_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}.
+parse_ipv6_bin(?STRING_REST($], Rest), Acc, URI) ->
+    _ = validate_ipv6_address(lists:reverse(Acc)),
+    parse_ipv6_bin_end(Rest, URI);
+parse_ipv6_bin(?STRING_REST(Char, Rest), Acc, URI) ->
+    case is_ipv6(Char) of
+        false -> throw(uri_parse_error);
+        true -> parse_ipv6_bin(Rest, [Char|Acc], URI)
+    end;
+parse_ipv6_bin(?STRING_EMPTY, _Acc, _URI) ->
+    throw(uri_parse_error).
+
+-spec parse_ipv6(iolist(), list(), uri_map()) -> uri_map().
+%% Scans the inside of a bracketed IPv6 literal in list input (the
+%% opening `[` has already been consumed by the caller). Embedded binaries
+%% and nested lists are flattened on the fly. At the closing `]` the
+%% collected address is validated and stored under `host`; scanning then
+%% continues with parse_ipv6_end/3. A character that is_ipv6/1 rejects, or
+%% input that ends before `]`, throws uri_parse_error.
+parse_ipv6(?STRING(Str), Acc, URI) when is_list(Acc) ->
+    parse_ipv6(unicode:characters_to_list(Str), Acc, URI);
+parse_ipv6([H|Rest], Acc, URI) when is_binary(H) ->
+    parse_ipv6(unicode:characters_to_list(H, utf8) ++ Rest,
+               Acc, URI);
+parse_ipv6([H|Rest], Acc, URI) when is_list(H) ->
+    parse_ipv6(H ++ Rest, Acc, URI);
+parse_ipv6([$]|Rest], Acc, URI) ->
+    parse_ipv6_end(Rest, [], URI#{host => validate_ipv6_address(lists:reverse(Acc))});
+parse_ipv6([Char|Rest], Acc, URI) ->
+    case is_ipv6(Char) of
+        true -> parse_ipv6(Rest, [Char|Acc], URI);
+        false -> throw(uri_parse_error)
+    end;
+parse_ipv6([], _Acc, _URI) ->
+    throw(uri_parse_error).
+
+%% Check if char is allowed in IPv6 addresses
+%% (hex digits, `:`, and `.` for an embedded IPv4 part).
+-spec is_ipv6(char()) -> boolean().
+is_ipv6($:) -> true;
+is_ipv6($.) -> true;
+is_ipv6(Char) -> is_hex_digit(Char).
+
+
+%% Continues parsing binary input after the closing `]` of an IPv6
+%% literal: a port, path, query or fragment may follow.
+%% NOTE(review): characters accepted by is_ipv6/1 are skipped here, i.e.
+%% they are tolerated directly after `]` - confirm this is intended.
+-spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}.
+parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) ->
+    {T, URI1} = parse_port(Rest, URI),
+    {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
+    Port = binary_to_integer(H),
+    {Rest, URI1#{port => Port}};
+parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) ->
+    {T, URI1} = parse_segment(Rest, URI),  % path-abempty
+    {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
+    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
+parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) ->
+    {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
+    {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)),
+    {Rest, URI1#{query => ?STRING_REST($?, Query)}};
+parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) ->
+    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
+    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
+    {Rest, URI1#{fragment => Fragment}};
+parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) ->
+    case is_ipv6(Char) of
+        true -> parse_ipv6_bin_end(Rest, URI);
+        false -> throw(uri_parse_error)
+    end;
+parse_ipv6_bin_end(?STRING_EMPTY, URI) ->
+    {?STRING_EMPTY, URI}.
+
+%% Continues parsing list input after the closing `]` of an IPv6 literal:
+%% a port, path, query or fragment may follow, or the input may end.
+-spec parse_ipv6_end(iolist(), list(), uri_map()) -> uri_map().
+parse_ipv6_end(?STRING(Str), Acc, URI) when is_list(Acc) ->
+    parse_ipv6_end(unicode:characters_to_list(Str), Acc, URI);
+parse_ipv6_end([H|Rest], Acc, URI) when is_binary(H) ->
+    parse_ipv6_end(unicode:characters_to_list(H, utf8) ++ Rest,
+                   Acc, URI);
+parse_ipv6_end([H|Rest], Acc, URI) when is_list(H) ->
+    parse_ipv6_end(H ++ Rest, Acc, URI);
+parse_ipv6_end([$:|Rest], _Acc, URI) ->
+    parse_port(Rest, [], URI);
+parse_ipv6_end([$/|Rest], _Acc, URI) ->
+    parse_segment(Rest, [$/], URI);  % path-abempty
+parse_ipv6_end([$?|Rest], _Acc, URI) ->
+    parse_query(Rest, [$?], URI);  % path-empty ?query
+parse_ipv6_end([$#|Rest], _Acc, URI) ->
+    parse_fragment(Rest, [], URI);  % path-empty
+parse_ipv6_end([], _Acc, URI) ->
+    URI.
+
+
+%% Returns Addr unchanged when inet accepts it as a strict IPv6 address,
+%% otherwise throws uri_parse_error.
+-spec validate_ipv6_address(list()) -> list().
+validate_ipv6_address(Addr) ->
+    case inet:parse_ipv6strict_address(Addr) of
+        {ok, _} -> Addr;
+        {error, _} -> throw(uri_parse_error)
+    end.
+
+
 %%-------------------------------------------------------------------------
 %% [RFC 3986, Chapter 3.2.2. Port]
 %%
@@ -1106,8 +1322,20 @@ is_digit(C) when $0 =< C, C =< $9 -> true;
 is_digit(_) -> false.
 
 
+-spec is_hex_digit(char()) -> boolean().
+is_hex_digit(C)
+  when $0 =< C, C =< $9;$a =< C, C =< $f;$A =< C, C =< $F -> true;
+is_hex_digit(_) -> false.
+
 %% Returns the size of a binary exluding the first element.
 %% Used in calls to split_binary().
 -spec byte_size_exl_head(binary()) -> number().
 byte_size_exl_head(<<>>) -> 0;
 byte_size_exl_head(Binary) -> byte_size(Binary) + 1.
+
+% Remove brackets from binary
+% Only applies when the binary starts with `[`; each binary:replace/3
+% call below replaces the first occurrence of its pattern.
+-spec remove_brackets(binary()) -> binary().
+remove_brackets(?STRING_REST($[,Addr)) ->
+    A1 = binary:replace(Addr, <<$[>>, <<>>),
+    binary:replace(A1, <<$]>>, <<>>);
+remove_brackets(Addr) -> Addr.
diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 189941de03..9b8e52f0b2 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -22,9 +22,12 @@ -include_lib("common_test/include/ct.hrl"). -export([all/0, suite/0,groups/0, - parse_binary_fragment/1, parse_binary_host/1, parse_binary_path/1, parse_binary_port/1, + parse_binary_fragment/1, parse_binary_host/1, parse_binary_host_ipv4/1, + parse_binary_host_ipv6/1, + parse_binary_path/1, parse_binary_port/1, parse_binary_query/1, parse_binary_scheme/1, parse_binary_userinfo/1, - parse_fragment/1, parse_host/1, parse_path/1, parse_port/1, + parse_fragment/1, parse_host/1, parse_host_ipv4/1, parse_host_ipv6/1, + parse_path/1, parse_port/1, parse_query/1, parse_scheme/1, parse_userinfo/1, parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1 ]). @@ -37,6 +40,8 @@ all() -> parse_binary_scheme, parse_binary_userinfo, parse_binary_host, + parse_binary_host_ipv4, + parse_binary_host_ipv6, parse_binary_port, parse_binary_path, parse_binary_query, @@ -44,6 +49,8 @@ all() -> parse_scheme, parse_userinfo, parse_host, + parse_host_ipv4, + parse_host_ipv6, parse_port, parse_path, parse_query, @@ -57,17 +64,6 @@ all() -> groups() -> []. - -%% TODO: Negative tests -%% uri_string:parse(<<"?name=ferret">>). -%% uri_string:parse("//user@") -%% uri_string:parse("foo://user@") -%% uri_string:parse(":600"). -%% -%% uri_string:parse("//:8042x"). -%% -%% io:format("# DEBUG T: >>~s<<~n", [T]), - parse_binary_scheme(_Config) -> #{} = uri_string:parse(<<>>), #{path := <<"foo">>} = uri_string:parse(<<"foo">>), @@ -95,7 +91,9 @@ parse_binary_userinfo(_Config) -> #{scheme := <<"foo">>, userinfo := <<"user">>, host := <<"localhost">>} = uri_string:parse(<<"foo://user@localhost">>), #{scheme := <<"foo">>, userinfo := <<"user:password">>, host := <<"localhost">>} = - uri_string:parse(<<"foo://user:password@localhost">>). 
+        uri_string:parse(<<"foo://user:password@localhost">>),
+    %% Negative cases: a URI may not end directly after the userinfo `@`.
+    %% NOTE(review): these two inputs are list strings although this is
+    %% the binary test case - confirm whether binaries were intended.
+    uri_parse_error = (catch uri_string:parse("//user@")),
+    uri_parse_error = (catch uri_string:parse("foo://user@")).
 
 parse_binary_host(_Config) ->
     #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>),
@@ -103,6 +101,30 @@ parse_binary_host(_Config) ->
     #{host := <<"hostname">>,scheme := <<"foo">>, userinfo := <<"user">>} =
         uri_string:parse(<<"foo://user@hostname">>).
 
+%% Binary input: IPv4 literal hosts, alone and followed by a path, query
+%% or fragment. A non-digit octet and an out-of-range octet must both be
+%% rejected with uri_parse_error.
+parse_binary_host_ipv4(_Config) ->
+    #{host := <<"127.0.0.1">>} =
+        uri_string:parse(<<"//127.0.0.1">>),
+    #{host := <<"127.0.0.1">>, path := <<"/over/there">>} =
+        uri_string:parse(<<"//127.0.0.1/over/there">>),
+    #{host := <<"127.0.0.1">>, query := <<"?name=ferret">>} =
+        uri_string:parse(<<"//127.0.0.1?name=ferret">>),
+    #{host := <<"127.0.0.1">>, fragment := <<"nose">>} =
+        uri_string:parse(<<"//127.0.0.1#nose">>),
+    uri_parse_error = (catch uri_string:parse(<<"//127.0.0.x">>)),
+    uri_parse_error = (catch uri_string:parse(<<"//1227.0.0.1">>)).
+
+%% Binary input: bracketed IPv6 literal hosts. The expected host value
+%% has the brackets removed. Malformed literals must be rejected with
+%% uri_parse_error.
+parse_binary_host_ipv6(_Config) ->
+    #{host := <<"::127.0.0.1">>} =
+        uri_string:parse(<<"//[::127.0.0.1]">>),
+    #{host := <<"2001:0db8:0000:0000:0000:0000:1428:07ab">>} =
+        uri_string:parse(<<"//[2001:0db8:0000:0000:0000:0000:1428:07ab]">>),
+    #{host := <<"::127.0.0.1">>, path := <<"/over/there">>} =
+        uri_string:parse(<<"//[::127.0.0.1]/over/there">>),
+    #{host := <<"::127.0.0.1">>, query := <<"?name=ferret">>} =
+        uri_string:parse(<<"//[::127.0.0.1]?name=ferret">>),
+    #{host := <<"::127.0.0.1">>, fragment := <<"nose">>} =
+        uri_string:parse(<<"//[::127.0.0.1]#nose">>),
+    uri_parse_error = (catch uri_string:parse(<<"//[::127.0.0.x]">>)),
+    uri_parse_error = (catch uri_string:parse(<<"//[::1227.0.0.1]">>)),
+    uri_parse_error = (catch uri_string:parse(<<"//[2001:0db8:0000:0000:0000:0000:1428:G7ab]">>)).
+
 parse_binary_port(_Config) ->
     #{path:= <<"/:8042">>} =
         uri_string:parse(<<"/:8042">>),
@@ -115,7 +137,9 @@ parse_binary_port(_Config) ->
     #{scheme := <<"foo">>, host := <<>>, port := 8042} =
         uri_string:parse(<<"foo://:8042">>),
     #{scheme := <<"foo">>, host := <<"example.com">>, port := 8042} =
-        uri_string:parse(<<"foo://example.com:8042">>).
+        uri_string:parse(<<"foo://example.com:8042">>),
+    %% Negative cases for malformed ports.
+    %% NOTE(review): these two inputs are list strings although this is
+    %% the binary test case - confirm whether binaries were intended.
+    uri_parse_error = (catch uri_string:parse(":600")),
+    uri_parse_error = (catch uri_string:parse("//:8042x")).
 
 parse_binary_path(_Config) ->
     #{path := <<"over/there">>} = uri_string:parse(<<"over/there">>),
@@ -214,6 +238,26 @@ parse_host(_Config) ->
     #{host := "hostname",scheme := "foo", userinfo := "user"} =
         uri_string:parse("foo://user@hostname").
 
+%% List input: IPv4 literal hosts, alone and followed by a path, query or
+%% fragment. A non-digit octet and an out-of-range octet must both be
+%% rejected with uri_parse_error.
+parse_host_ipv4(_Config) ->
+    #{host := "127.0.0.1"} =
+        uri_string:parse("//127.0.0.1"),
+    #{host := "127.0.0.1", path := "/over/there"} =
+        uri_string:parse("//127.0.0.1/over/there"),
+    #{host := "127.0.0.1", query := "?name=ferret"} =
+        uri_string:parse("//127.0.0.1?name=ferret"),
+    #{host := "127.0.0.1", fragment := "nose"} =
+        uri_string:parse("//127.0.0.1#nose"),
+    uri_parse_error = (catch uri_string:parse("//127.0.0.x")),
+    uri_parse_error = (catch uri_string:parse("//1227.0.0.1")).
+
+%% List input: bracketed IPv6 literal hosts. The expected host value has
+%% the brackets removed. Malformed literals must be rejected with
+%% uri_parse_error.
+parse_host_ipv6(_Config) ->
+    #{host := "::127.0.0.1"} =
+        uri_string:parse("//[::127.0.0.1]"),
+    #{host := "2001:0db8:0000:0000:0000:0000:1428:07ab"} =
+        uri_string:parse("//[2001:0db8:0000:0000:0000:0000:1428:07ab]"),
+    #{host := "::127.0.0.1", path := "/over/there"} =
+        uri_string:parse("//[::127.0.0.1]/over/there"),
+    #{host := "::127.0.0.1", query := "?name=ferret"} =
+        uri_string:parse("//[::127.0.0.1]?name=ferret"),
+    #{host := "::127.0.0.1", fragment := "nose"} =
+        uri_string:parse("//[::127.0.0.1]#nose"),
+    uri_parse_error = (catch uri_string:parse("//[::127.0.0.x]")),
+    uri_parse_error = (catch uri_string:parse("//[::1227.0.0.1]")),
+    uri_parse_error = (catch uri_string:parse("//[2001:0db8:0000:0000:0000:0000:1428:G7ab]")).
+ parse_port(_Config) -> #{path:= "/:8042"} = uri_string:parse("/:8042"), -- cgit v1.2.3 From 6c0c11eeaf0649cfbca5e426263c7dc43b49feff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Tue, 19 Sep 2017 16:07:49 +0200 Subject: stdlib: Add support to parse percent-encoded URIs --- lib/stdlib/src/uri_string.erl | 198 +++++++++++++++++++++++++---------- lib/stdlib/test/uri_string_SUITE.erl | 78 +++++++++++++- 2 files changed, 217 insertions(+), 59 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 3656d561be..50e8a0bf5a 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -223,9 +223,9 @@ %% -module(uri_string). - -export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). +-export([is_host/1, is_path/1]). % suppress warnings -export_type([uri_map/0, uri_string/0]). -define(CHAR(Char), <>). @@ -383,31 +383,31 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> try parse_userinfo(Rest, URI) of {T, URI1} -> {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), - URI1#{userinfo => Userinfo} + URI1#{userinfo => decode_userinfo(Userinfo)} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), - URI1#{host => remove_brackets(Host)} + URI1#{host => decode_host(remove_brackets(Host))} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - URI1#{path => ?STRING_REST($/, Path)}; + URI1#{path => decode_path(?STRING_REST($/, Path))}; parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - URI1#{query => 
?STRING_REST($?, Query)}; + URI1#{query => decode_query(?STRING_REST($?, Query))}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - URI1#{fragment => Fragment}; + URI1#{fragment => decode_fragment(Fragment)}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> {T, URI1} = parse_segment_nz_nc(Rest, URI), % path-noscheme {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - URI1#{path => ?STRING_REST(Char, Path)}; + URI1#{path => decode_path(?STRING_REST(Char, Path))}; false -> throw(uri_parse_error) end. @@ -491,11 +491,11 @@ parse_segment(?STRING_REST($/, Rest), URI) -> parse_segment(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of true -> parse_segment(Rest, URI); @@ -515,16 +515,16 @@ parse_segment([H|Rest], Acc, URI) when is_list(H) -> parse_segment([$/|Rest], Acc, URI) -> parse_segment(Rest, [$/|Acc], URI); % segment parse_segment([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{path => lists:reverse(Acc)}); % ?query + parse_query(Rest, [$?], URI#{path => decode_path(lists:reverse(Acc))}); % ?query parse_segment([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{path => lists:reverse(Acc)}); + parse_fragment(Rest, [], URI#{path => decode_path(lists:reverse(Acc))}); parse_segment([Char|Rest], Acc, URI) -> case is_pchar(Char) of true -> 
parse_segment(Rest, [Char|Acc], URI); false -> throw(uri_parse_error) end; parse_segment([], Acc, URI) -> - URI#{path => lists:reverse(Acc)}. + URI#{path => decode_path(lists:reverse(Acc))}. %%------------------------------------------------------------------------- %% path-noscheme @@ -535,11 +535,11 @@ parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> parse_segment_nz_nc(Rest, URI); @@ -559,16 +559,16 @@ parse_segment_nz_nc([H|Rest], Acc, URI) when is_list(H) -> parse_segment_nz_nc([$/|Rest], Acc, URI) -> parse_segment(Rest, [$/|Acc], URI); % segment parse_segment_nz_nc([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{path => lists:reverse(Acc)}); % ?query + parse_query(Rest, [$?], URI#{path => decode_path(lists:reverse(Acc))}); % ?query parse_segment_nz_nc([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{path => lists:reverse(Acc)}); + parse_fragment(Rest, [], URI#{path => decode_path(lists:reverse(Acc))}); parse_segment_nz_nc([Char|Rest], Acc, URI) -> case is_segment_nz_nc(Char) of true -> parse_segment_nz_nc(Rest, [Char|Acc], URI); false -> throw(uri_parse_error) end; parse_segment_nz_nc([], Acc, URI) -> - URI#{path => lists:reverse(Acc)}. + URI#{path => decode_path(lists:reverse(Acc))}. %% Check if char is pchar. -spec is_pchar(char()) -> boolean(). 
@@ -664,31 +664,31 @@ parse_hier(?STRING_REST("//", Rest), URI) -> try parse_userinfo(Rest, URI) of {T, URI1} -> {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), - {Rest, URI1#{userinfo => Userinfo}} + {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{host => remove_brackets(Host)}} + {Rest, URI1#{host => decode_host(remove_brackets(Host))}} end; parse_hier(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_hier(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of true -> % segment_nz {T, URI1} = parse_segment(Rest, URI), {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST(Char, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST(Char, Path))}}; false -> throw(uri_parse_error) end; parse_hier(?STRING_EMPTY, URI) -> @@ -756,7 +756,7 @@ parse_userinfo(?CHAR($@), _URI) -> parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, 
URI1#{host => remove_brackets(Host)}}; + {Rest, URI1#{host => decode_host(remove_brackets(Host))}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, URI); @@ -778,11 +778,11 @@ parse_userinfo([$@], _Acc, _URI) -> %% URI cannot end in userinfo state throw(uri_parse_error); parse_userinfo([$@|Rest], Acc, URI) -> - parse_host(Rest, [], URI#{userinfo => lists:reverse(Acc)}); + parse_host(Rest, [], URI#{userinfo => decode_userinfo(lists:reverse(Acc))}); parse_userinfo([Char|Rest], Acc, URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) % URI#{userinfo => lists:reverse(Acc)} + false -> throw(uri_parse_error) end; parse_userinfo([], _Acc, _URI) -> %% URI cannot end in userinfo state @@ -843,17 +843,17 @@ parse_host(?STRING_REST($:, Rest), URI) -> parse_host(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of true -> parse_ipv4_bin(Rest, [Char], URI); @@ -871,13 +871,13 @@ parse_host([H|Rest], Acc, URI) when is_binary(H) -> parse_host([H|Rest], Acc, URI) when 
is_list(H) -> parse_host(H ++ Rest, Acc, URI); parse_host([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => lists:reverse(Acc)}); + parse_port(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); parse_host([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => lists:reverse(Acc)}); % path-abempty + parse_segment(Rest, [$/], URI#{host => decode_host(lists:reverse(Acc))}); % path-abempty parse_host([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => lists:reverse(Acc)}); % path-empty ?query + parse_query(Rest, [$?], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty ?query parse_host([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => lists:reverse(Acc)}); % path-empty + parse_fragment(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty parse_host([$[|Rest], _Acc, URI) -> parse_ipv6(Rest, [], URI); parse_host([Char|Rest], Acc, URI) -> @@ -886,7 +886,7 @@ parse_host([Char|Rest], Acc, URI) -> false -> parse_reg_name([Char|Rest], Acc, URI) end; parse_host([], Acc, URI) -> - URI#{host => lists:reverse(Acc)}. + URI#{host => decode_host(lists:reverse(Acc))}. -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. 
@@ -898,15 +898,15 @@ parse_reg_name(?STRING_REST($:, Rest), URI) -> parse_reg_name(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_reg_name(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of true -> parse_reg_name(Rest, URI); @@ -924,20 +924,20 @@ parse_reg_name([H|Rest], Acc, URI) when is_binary(H) -> parse_reg_name([H|Rest], Acc, URI) when is_list(H) -> parse_reg_name(H ++ Rest, Acc, URI); parse_reg_name([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => lists:reverse(Acc)}); + parse_port(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); parse_reg_name([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => lists:reverse(Acc)}); % path-abempty + parse_segment(Rest, [$/], URI#{host => decode_host(lists:reverse(Acc))}); % path-abempty parse_reg_name([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => lists:reverse(Acc)}); % path-empty ?query + parse_query(Rest, [$?], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty ?query parse_reg_name([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => lists:reverse(Acc)}); % path-empty + parse_fragment(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty 
parse_reg_name([Char|Rest], Acc, URI) -> case is_reg_name(Char) of true -> parse_reg_name(Rest, [Char|Acc], URI); false -> throw(uri_parse_error) end; parse_reg_name([], Acc, URI) -> - URI#{host => lists:reverse(Acc)}. + URI#{host => decode_host(lists:reverse(Acc))}. %% Check if char is allowed in reg-name -spec is_reg_name(char()) -> boolean(). @@ -956,17 +956,17 @@ parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of true -> parse_ipv4_bin(Rest, [Char|Acc], URI); @@ -1062,15 +1062,15 @@ parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, 
byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of true -> parse_ipv6_bin_end(Rest, URI); @@ -1120,15 +1120,15 @@ validate_ipv6_address(Addr) -> parse_port(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_port(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of true -> parse_port(Rest, URI); @@ -1180,7 +1180,7 @@ parse_port([], Acc, URI) -> parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of true -> parse_query(Rest, URI); @@ -1198,18 +1198,19 @@ parse_query([H|Rest], Acc, URI) when is_binary(H) -> 
parse_query([H|Rest], Acc, URI) when is_list(H) -> parse_query(H ++ Rest, Acc, URI); parse_query([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{query => lists:reverse(Acc)}); + parse_fragment(Rest, [], URI#{query => decode_query(lists:reverse(Acc))}); parse_query([Char|Rest], Acc, URI) -> case is_query(Char) of true -> parse_query(Rest, [Char|Acc], URI); false -> throw(uri_parse_error) end; parse_query([], Acc, URI) -> - URI#{query => lists:reverse(Acc)}. + URI#{query => decode_query(lists:reverse(Acc))}. %% Check if char is allowed in query -spec is_query(char()) -> boolean(). is_query($/) -> true; +is_query($?) -> true; is_query(Char) -> is_pchar(Char). @@ -1245,7 +1246,7 @@ parse_fragment([Char|Rest], Acc, URI) -> false -> throw(uri_parse_error) end; parse_fragment([], Acc, URI) -> - URI#{fragment => lists:reverse(Acc)}. + URI#{fragment => decode_fragment(lists:reverse(Acc))}. %% Check if char is allowed in fragment -spec is_fragment(char()) -> boolean(). @@ -1339,3 +1340,90 @@ remove_brackets(?STRING_REST($[,Addr)) -> A1 = binary:replace(Addr, <<$[>>, <<>>), binary:replace(A1, <<$]>>, <<>>); remove_brackets(Addr) -> Addr. + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 2.1. Percent-Encoding] +%% +%% A percent-encoding mechanism is used to represent a data octet in a +%% component when that octet's corresponding character is outside the +%% allowed set or is being used as a delimiter of, or within, the +%% component. A percent-encoded octet is encoded as a character +%% triplet, consisting of the percent character "%" followed by the two +%% hexadecimal digits representing that octet's numeric value. For +%% example, "%20" is the percent-encoding for the binary octet +%% "00100000" (ABNF: %x20), which in US-ASCII corresponds to the space +%% character (SP). Section 2.4 describes when percent-encoding and +%% decoding is applied. 
%%
%% pct-encoded = "%" HEXDIG HEXDIG
%%-------------------------------------------------------------------------

%% Percent-decode a userinfo component. Characters outside percent-encoded
%% triplets are validated with is_userinfo/1.
-spec decode_userinfo(list()|binary()) -> list() | binary().
decode_userinfo(Cs) ->
    decode(Cs, fun is_userinfo/1, <<>>).


%% Percent-decode a host component. Characters outside percent-encoded
%% triplets are validated with is_host/1.
-spec decode_host(list()|binary()) -> list() | binary().
decode_host(Cs) ->
    decode(Cs, fun is_host/1, <<>>).

%% Check if char is allowed in host
-spec is_host(char()) -> boolean().
is_host($:) -> true;
is_host(Char) -> is_unreserved(Char) orelse is_sub_delim(Char).


%% Percent-decode a path component. Characters outside percent-encoded
%% triplets are validated with is_path/1.
-spec decode_path(list()|binary()) -> list() | binary().
decode_path(Cs) ->
    decode(Cs, fun is_path/1, <<>>).

%% Check if char is allowed in path
-spec is_path(char()) -> boolean().
is_path($/) -> true;
is_path(Char) -> is_pchar(Char).


%% Percent-decode a query component. Characters outside percent-encoded
%% triplets are validated with is_query/1.
-spec decode_query(list()|binary()) -> list() | binary().
decode_query(Cs) ->
    decode(Cs, fun is_query/1, <<>>).

%% Percent-decode a fragment component. Characters outside percent-encoded
%% triplets are validated with is_fragment/1, the predicate this module
%% declares for the fragment component. Validating with is_host/1 applied
%% the host character set to fragments, although RFC 3986 defines the
%% fragment as *( pchar / "/" / "?" ).
-spec decode_fragment(list()|binary()) -> list() | binary().
decode_fragment(Cs) ->
    decode(Cs, fun is_fragment/1, <<>>).


%% Decode the percent-encoded triplets in the first argument and validate
%% every other character with Fun, appending the resulting octets to the
%% binary accumulator Acc.
%%
%% Binary input returns the accumulated binary unchanged. List input returns
%% the accumulated binary converted with unicode:characters_to_list/1.
%% Throws uri_parse_error when "%" is followed by two characters that are
%% not both hex digits, or when Fun returns false for a character.
%%
%% NOTE(review): unicode:characters_to_list/1 returns an {error, ...} or
%% {incomplete, ...} tuple when the decoded octets are not valid UTF-8,
%% which the spec below does not cover - confirm whether such input should
%% be reported as uri_parse_error instead.
-spec decode(list()|binary(), fun((char()) -> boolean()), binary()) ->
                    list() | binary().
decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) ->
    case is_hex_digit(C0) andalso is_hex_digit(C1) of
        true ->
            %% Two hex digits form one decoded octet.
            B = hex2dec(C0)*16+hex2dec(C1),
            decode(Cs, Fun, <<Acc/binary, B>>);
        false -> throw(uri_parse_error)
    end;
decode(<<C,Cs/binary>>, Fun, Acc) ->
    case Fun(C) of
        true -> decode(Cs, Fun, <<Acc/binary, C>>);
        false -> throw(uri_parse_error)
    end;
decode(<<>>, _Fun, Acc) ->
    Acc;
decode([$%,C0,C1|Cs], Fun, Acc) ->
    case is_hex_digit(C0) andalso is_hex_digit(C1) of
        true ->
            %% Two hex digits form one decoded octet.
            B = hex2dec(C0)*16+hex2dec(C1),
            decode(Cs, Fun, <<Acc/binary, B>>);
        false -> throw(uri_parse_error)
    end;
decode([C|Cs], Fun, Acc) ->
    case Fun(C) of
        true -> decode(Cs, Fun, <<Acc/binary, C>>);
        false -> throw(uri_parse_error)
    end;
decode([], _Fun, Acc) ->
    %% The accumulator holds raw octets; convert them back to a list of
    %% code points for list input.
    unicode:characters_to_list(Acc).
+ + +hex2dec(X) when (X >= $0) andalso (X =< $9) -> X - $0; +hex2dec(X) when (X >= $A) andalso (X =< $F) -> X - $A + 10; +hex2dec(X) when (X >= $a) andalso (X =< $f) -> X - $a + 10. diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 9b8e52f0b2..c379eeb15b 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -24,10 +24,12 @@ -export([all/0, suite/0,groups/0, parse_binary_fragment/1, parse_binary_host/1, parse_binary_host_ipv4/1, parse_binary_host_ipv6/1, - parse_binary_path/1, parse_binary_port/1, + parse_binary_path/1, parse_binary_pct_encoded_fragment/1, parse_binary_pct_encoded_query/1, + parse_binary_pct_encoded_userinfo/1, parse_binary_port/1, parse_binary_query/1, parse_binary_scheme/1, parse_binary_userinfo/1, parse_fragment/1, parse_host/1, parse_host_ipv4/1, parse_host_ipv6/1, - parse_path/1, parse_port/1, + parse_path/1, parse_pct_encoded_fragment/1, parse_pct_encoded_query/1, + parse_pct_encoded_userinfo/1, parse_port/1, parse_query/1, parse_scheme/1, parse_userinfo/1, parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1 ]). 
@@ -39,22 +41,28 @@ all() -> [ parse_binary_scheme, parse_binary_userinfo, + parse_binary_pct_encoded_userinfo, parse_binary_host, parse_binary_host_ipv4, parse_binary_host_ipv6, parse_binary_port, parse_binary_path, parse_binary_query, + parse_binary_pct_encoded_query, parse_binary_fragment, + parse_binary_pct_encoded_fragment, parse_scheme, parse_userinfo, + parse_pct_encoded_userinfo, parse_host, parse_host_ipv4, parse_host_ipv6, parse_port, parse_path, parse_query, + parse_pct_encoded_query, parse_fragment, + parse_pct_encoded_fragment, parse_list, parse_binary, parse_mixed, @@ -92,8 +100,27 @@ parse_binary_userinfo(_Config) -> uri_string:parse(<<"foo://user@localhost">>), #{scheme := <<"foo">>, userinfo := <<"user:password">>, host := <<"localhost">>} = uri_string:parse(<<"foo://user:password@localhost">>), - uri_parse_error =(catch uri_string:parse("//user@")), - uri_parse_error = (catch uri_string:parse("foo://user@")). + uri_parse_error =(catch uri_string:parse(<<"//user@">>)), + uri_parse_error = (catch uri_string:parse(<<"foo://user@">>)). 
+ +parse_binary_pct_encoded_userinfo(_Config) -> + #{scheme := <<"user">>, path := <<"合@気道"/utf8>>} = + uri_string:parse(<<"user:%E5%90%88@%E6%B0%97%E9%81%93">>), + #{path := <<"合気道@"/utf8>>} = uri_string:parse(<<"%E5%90%88%E6%B0%97%E9%81%93@">>), + #{path := <<"/合気道@"/utf8>>} = uri_string:parse(<<"/%E5%90%88%E6%B0%97%E9%81%93@">>), + #{path := <<"合@気道"/utf8>>} = uri_string:parse(<<"%E5%90%88@%E6%B0%97%E9%81%93">>), + #{userinfo := <<"合"/utf8>>, host := <<"気道"/utf8>>} = + uri_string:parse(<<"//%E5%90%88@%E6%B0%97%E9%81%93">>), + #{userinfo := <<"合:気"/utf8>>, host := <<"道"/utf8>>} = + uri_string:parse(<<"//%E5%90%88:%E6%B0%97@%E9%81%93">>), + #{scheme := <<"foo">>, path := <<"/合気道@"/utf8>>} = + uri_string:parse(<<"foo:/%E5%90%88%E6%B0%97%E9%81%93@">>), + #{scheme := <<"foo">>, userinfo := <<"合"/utf8>>, host := <<"気道"/utf8>>} = + uri_string:parse(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93">>), + #{scheme := <<"foo">>, userinfo := <<"合:気"/utf8>>, host := <<"道"/utf8>>} = + uri_string:parse(<<"foo://%E5%90%88:%E6%B0%97@%E9%81%93">>), + uri_parse_error =(catch uri_string:parse(<<"//%E5%90%88@%E6%B0%97%E9%81%93@">>)), + uri_parse_error = (catch uri_string:parse(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93@">>)). parse_binary_host(_Config) -> #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>), @@ -176,6 +203,12 @@ parse_binary_query(_Config) -> #{host := <<"example.com">>, path := <<"/">>, query := <<"?name=ferret">>} = uri_string:parse(<<"//example.com/?name=ferret">>). +parse_binary_pct_encoded_query(_Config) -> + #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/">>, + query := <<"?name=合気道"/utf8>>} = + uri_string:parse(<<"foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>), + #{host := <<"example.com">>, path := <<"/">>, query := <<"?name=合気道"/utf8>>} = + uri_string:parse(<<"//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>). 
parse_binary_fragment(_Config) -> #{scheme := <<"foo">>, fragment := <<"nose">>} = @@ -204,6 +237,12 @@ parse_binary_fragment(_Config) -> #{host := <<"example.com">>, path := <<"/">>, fragment := <<"nose">>} = uri_string:parse(<<"//example.com/#nose">>). +parse_binary_pct_encoded_fragment(_Config) -> + #{scheme := <<"foo">>, host := <<"example.com">>, fragment := <<"合気道"/utf8>>} = + uri_string:parse(<<"foo://example.com#%E5%90%88%E6%B0%97%E9%81%93">>), + #{host := <<"example.com">>, path := <<"/">>, fragment := <<"合気道"/utf8>>} = + uri_string:parse(<<"//example.com/#%E5%90%88%E6%B0%97%E9%81%93">>). + parse_scheme(_Config) -> #{} = uri_string:parse(""), #{path := "foo"} = uri_string:parse("foo"), @@ -232,6 +271,26 @@ parse_userinfo(_Config) -> #{scheme := "foo", userinfo := "user:password", host := "localhost"} = uri_string:parse("foo://user:password@localhost"). +parse_pct_encoded_userinfo(_Config) -> + #{scheme := "user", path := "合@気道"} = + uri_string:parse("user:%E5%90%88@%E6%B0%97%E9%81%93"), + #{path := "合気道@"} = uri_string:parse("%E5%90%88%E6%B0%97%E9%81%93@"), + #{path := "/合気道@"} = uri_string:parse("/%E5%90%88%E6%B0%97%E9%81%93@"), + #{path := "合@気道"} = uri_string:parse("%E5%90%88@%E6%B0%97%E9%81%93"), + #{userinfo := "合", host := "気道"} = + uri_string:parse("//%E5%90%88@%E6%B0%97%E9%81%93"), + #{userinfo := "合:気", host := "道"} = + uri_string:parse("//%E5%90%88:%E6%B0%97@%E9%81%93"), + #{scheme := "foo", path := "/合気道@"} = + uri_string:parse("foo:/%E5%90%88%E6%B0%97%E9%81%93@"), + #{scheme := "foo", userinfo := "合", host := "気道"} = + uri_string:parse("foo://%E5%90%88@%E6%B0%97%E9%81%93"), + #{scheme := "foo", userinfo := "合:気", host := "道"} = + uri_string:parse("foo://%E5%90%88:%E6%B0%97@%E9%81%93"), + uri_parse_error =(catch uri_string:parse("//%E5%90%88@%E6%B0%97%E9%81%93@")), + uri_parse_error = (catch uri_string:parse("foo://%E5%90%88@%E6%B0%97%E9%81%93@")). 
+ + parse_host(_Config) -> #{host := "hostname"} = uri_string:parse("//hostname"), #{host := "hostname",scheme := "foo"} = uri_string:parse("foo://hostname"), @@ -307,6 +366,12 @@ parse_query(_Config) -> #{host := "example.com", path := "/", query := "?name=ferret"} = uri_string:parse("//example.com/?name=ferret"). +parse_pct_encoded_query(_Config) -> + #{scheme := "foo", host := "example.com", path := "/", + query := "?name=合気道"} = + uri_string:parse("foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93"), + #{host := "example.com", path := "/", query := "?name=合気道"} = + uri_string:parse("//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93"). parse_fragment(_Config) -> #{scheme := "foo", fragment := "nose"} = @@ -335,6 +400,11 @@ parse_fragment(_Config) -> #{host := "example.com", path := "/", fragment := "nose"} = uri_string:parse("//example.com/#nose"). +parse_pct_encoded_fragment(_Config) -> + #{scheme := "foo", host := "example.com", fragment := "合気道"} = + uri_string:parse("foo://example.com#%E5%90%88%E6%B0%97%E9%81%93"), + #{host := "example.com", path := "/", fragment := "合気道"} = + uri_string:parse("//example.com/#%E5%90%88%E6%B0%97%E9%81%93"). parse_list(_Config) -> #{scheme := "foo", path := "bar:nisse"} = uri_string:parse("foo:bar:nisse"), -- cgit v1.2.3 From 892bf58ee115a7e56ff38083afd85702bb8e14d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 20 Sep 2017 17:17:34 +0200 Subject: stdlib: Implement recompose - Implemented recompose function with percent-encoding and validation of IPv4/IPv6 addresses. - Added test for recompose that uses a generated test vector (URI combinations based on a fix set of URI components). - Added test for parse-recompose using a generated test vector. - Removed parsing functions for lists. Lists are converted to binary before parsing. 
--- lib/stdlib/src/uri_string.erl | 783 ++++++++++++++++++----------------- lib/stdlib/test/uri_string_SUITE.erl | 300 +++++++++++++- 2 files changed, 701 insertions(+), 382 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 50e8a0bf5a..89a2c21518 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -223,16 +223,39 @@ %% -module(uri_string). +%%------------------------------------------------------------------------- +%% External API +%%------------------------------------------------------------------------- -export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). --export([is_host/1, is_path/1]). % suppress warnings -export_type([uri_map/0, uri_string/0]). + +%%------------------------------------------------------------------------- +%% Internal API +%%------------------------------------------------------------------------- +-export([is_host/1, is_path/1]). % suppress warnings + + +%%------------------------------------------------------------------------- +%% Macros +%%------------------------------------------------------------------------- -define(CHAR(Char), <>). -define(STRING_EMPTY, <<>>). -define(STRING(MatchStr), <>). -define(STRING_REST(MatchStr, Rest), <>). +-define(DEC2HEX(X), + if ((X) >= 0) andalso ((X) =< 9) -> (X) + $0; + ((X) >= 10) andalso ((X) =< 15) -> (X) + $A - 10 + end). + +-define(HEX2DEC(X), + if ((X) >= $0) andalso ((X) =< $9) -> (X) - $0; + ((X) >= $A) andalso ((X) =< $F) -> (X) - $A + 10; + ((X) >= $a) andalso ((X) =< $f) -> (X) - $a + 10 + end). + %%%========================================================================= %%% API @@ -250,8 +273,9 @@ %%------------------------------------------------------------------------- -type uri_string() :: iodata(). - +%%------------------------------------------------------------------------- %% RFC 3986, Chapter 3. 
Syntax Components +%%------------------------------------------------------------------------- -type uri_map() :: #{fragment => unicode:chardata(), host => unicode:chardata(), @@ -261,25 +285,44 @@ scheme => unicode:chardata(), userinfo => unicode:chardata()} | #{}. +%%------------------------------------------------------------------------- %% Parse URIs +%%------------------------------------------------------------------------- -spec parse(URIString) -> URIMap when URIString :: uri_string(), URIMap :: uri_map(). -parse(URIString) -> - if is_binary(URIString) -> - parse_uri_reference(URIString, #{}); - true -> - parse_uri_reference(URIString, [], #{}) - end. +parse(URIString) when is_binary(URIString) -> + parse_uri_reference(URIString, #{}); +parse(URIString) when is_list(URIString) -> + Binary = unicode:characters_to_binary(URIString), + Map = parse_uri_reference(Binary, #{}), + convert_mapfields_to_list(Map). +%%------------------------------------------------------------------------- %% Recompose URIs +%%------------------------------------------------------------------------- -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string(). -recompose(_) -> - "". +recompose(Map) when map_size(Map) =:= 0 -> + ""; +recompose(Map) -> + case is_valid_map(Map) of + false -> + error({badarg, invalid_map}); + true -> + T0 = update_scheme(Map, empty), + T1 = update_userinfo(Map, T0), + T2 = update_host(Map, T1), + T3 = update_port(Map, T2), + T4 = update_path(Map, T3), + T5 = update_query(Map, T4), + update_fragment(Map, T5) + end. +%%------------------------------------------------------------------------- %% Resolve references +%%------------------------------------------------------------------------- -spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when RelativeURI :: uri_string(), AbsoluteBaseURI :: uri_string(), @@ -287,7 +330,9 @@ recompose(_) -> resolve_uri_reference(_,_) -> "". 
+%%------------------------------------------------------------------------- %% Create references +%%------------------------------------------------------------------------- -spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when AbsoluteSourceURI :: uri_string(), AbsoluteBaseURI :: uri_string(), @@ -295,33 +340,42 @@ resolve_uri_reference(_,_) -> create_uri_reference(_,_) -> "". +%%------------------------------------------------------------------------- %% Normalize URIs +%%------------------------------------------------------------------------- -spec normalize(URIString) -> NormalizedURI when URIString :: uri_string(), NormalizedURI :: uri_string(). normalize(_) -> "". +%%------------------------------------------------------------------------- %% Transcode URIs +%%------------------------------------------------------------------------- -spec transcode(URIString, Options) -> URIString when URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. transcode(_, _) -> "". - +%%------------------------------------------------------------------------- %% Working with query strings %% HTML 2.0 - application/x-www-form-urlencoded %% RFC 1866 [8.2.1] +%%------------------------------------------------------------------------- +%%------------------------------------------------------------------------- %% Compose urlencoded query string from a list of unescaped key/value pairs. +%%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when QueryList :: [{unicode:chardata(), unicode:chardata()}], QueryString :: uri_string(). compose_query(_) -> "". +%%------------------------------------------------------------------------- %% Dissect a query string into a list of unescaped key/value pairs. 
+%%------------------------------------------------------------------------- -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), QueryList :: [{unicode:chardata(), unicode:chardata()}]. @@ -333,6 +387,14 @@ dissect_query(_) -> %%% Internal functions %%%======================================================================== +%%------------------------------------------------------------------------- +%% Converts Map fields to lists +%%------------------------------------------------------------------------- +convert_mapfields_to_list(Map) -> + Fun = fun (_, V) when is_binary(V) -> unicode:characters_to_list(V); + (_, V) -> V end, + maps:map(Fun, Map). + %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 4.1. URI Reference] @@ -342,16 +404,6 @@ dissect_query(_) -> %% %% URI-reference = URI / relative-ref %%------------------------------------------------------------------------- --spec parse_uri_reference(iolist(), list(), uri_map()) -> uri_map(). -parse_uri_reference([], _, _) -> #{}; -parse_uri_reference(URIString, Acc, URI) -> - try parse_scheme_start(URIString, Acc, URI) of - Res -> Res - catch - throw:uri_parse_error -> - parse_relative_part(URIString, Acc, URI) - end. - -spec parse_uri_reference(binary(), uri_map()) -> uri_map(). parse_uri_reference(<<>>, _) -> #{}; parse_uri_reference(URIString, URI) -> @@ -411,32 +463,6 @@ parse_relative_part(?STRING_REST(Char, Rest), URI) -> false -> throw(uri_parse_error) end. --spec parse_relative_part(iolist(), list(), uri_map()) -> uri_map(). 
-parse_relative_part([H|Rest], Acc, URI) when is_binary(H) -> - parse_relative_part(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_relative_part([H|Rest], Acc, URI) when is_list(H) -> - parse_relative_part(H ++ Rest, Acc, URI); -parse_relative_part("//" ++ Rest, Acc, URI) -> - % Parse userinfo - try parse_userinfo(Rest, Acc, URI) of - Res -> Res - catch - throw:uri_parse_error -> - parse_host(Rest, Acc, URI) - end; -parse_relative_part([$/|Rest], _Acc, URI) -> - parse_segment(Rest, [$/], URI); % path-absolute -parse_relative_part([$?|Rest], _Acc, URI) -> - parse_query(Rest, [$?], URI); % path-empty ?query -parse_relative_part([$#|Rest], _Acc, URI) -> - parse_fragment(Rest, [], URI); % path-empty -parse_relative_part([Char|Rest], _, URI) -> - case is_segment_nz_nc(Char) of - true -> parse_segment_nz_nc(Rest, [Char], URI); % path-noscheme - false -> throw(uri_parse_error) - end. - %% Returns size of 'Rest' for proper calculation of splitting position. %% Solves the following special case: @@ -504,27 +530,6 @@ parse_segment(?STRING_REST(Char, Rest), URI) -> parse_segment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_segment(iolist(), list(), uri_map()) -> uri_map(). 
-parse_segment(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_segment(unicode:characters_to_list(Str), Acc, URI); -parse_segment([H|Rest], Acc, URI) when is_binary(H) -> - parse_segment(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_segment([H|Rest], Acc, URI) when is_list(H) -> - parse_segment(H ++ Rest, Acc, URI); -parse_segment([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/|Acc], URI); % segment -parse_segment([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{path => decode_path(lists:reverse(Acc))}); % ?query -parse_segment([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{path => decode_path(lists:reverse(Acc))}); -parse_segment([Char|Rest], Acc, URI) -> - case is_pchar(Char) of - true -> parse_segment(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_segment([], Acc, URI) -> - URI#{path => decode_path(lists:reverse(Acc))}. %%------------------------------------------------------------------------- %% path-noscheme @@ -548,27 +553,6 @@ parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> parse_segment_nz_nc(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_segment_nz_nc(iolist(), list(), uri_map()) -> uri_map(). 
-parse_segment_nz_nc(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_segment_nz_nc(unicode:characters_to_list(Str), Acc, URI); -parse_segment_nz_nc([H|Rest], Acc, URI) when is_binary(H) -> - parse_segment_nz_nc(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_segment_nz_nc([H|Rest], Acc, URI) when is_list(H) -> - parse_segment_nz_nc(H ++ Rest, Acc, URI); -parse_segment_nz_nc([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/|Acc], URI); % segment -parse_segment_nz_nc([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{path => decode_path(lists:reverse(Acc))}); % ?query -parse_segment_nz_nc([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{path => decode_path(lists:reverse(Acc))}); -parse_segment_nz_nc([Char|Rest], Acc, URI) -> - case is_segment_nz_nc(Char) of - true -> parse_segment_nz_nc(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_segment_nz_nc([], Acc, URI) -> - URI#{path => decode_path(lists:reverse(Acc))}. %% Check if char is pchar. -spec is_pchar(char()) -> boolean(). @@ -601,18 +585,6 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> false -> throw(uri_parse_error) end. --spec parse_scheme_start(iolist(), list(), uri_map()) -> uri_map(). -parse_scheme_start([H|Rest], Acc, URI) when is_binary(H) -> - parse_scheme_start(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_scheme_start([H|Rest], Acc, URI) when is_list(H) -> - parse_scheme_start(H ++ Rest, Acc, URI); -parse_scheme_start([Char|Rest], Acc, URI) -> - case is_alpha(Char) of - true -> parse_scheme(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end. - -spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}. parse_scheme(?STRING_REST($:, Rest), URI) -> @@ -626,23 +598,6 @@ parse_scheme(?STRING_REST(Char, Rest), URI) -> parse_scheme(?STRING_EMPTY, _URI) -> throw(uri_parse_error). --spec parse_scheme(iolist(), list(), uri_map()) -> uri_map(). 
-parse_scheme(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_scheme(unicode:characters_to_list(Str), Acc, URI); -parse_scheme([H|Rest], Acc, URI) when is_binary(H) -> - parse_scheme(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_scheme([H|Rest], Acc, URI) when is_list(H) -> - parse_scheme(H ++ Rest, Acc, URI); -parse_scheme([$:|Rest], Acc, URI) -> - parse_hier(Rest, [], URI#{scheme => lists:reverse(Acc)}); -parse_scheme([Char|Rest], Acc, URI) -> - case is_scheme(Char) of - true -> parse_scheme(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_scheme([], _Acc, _URI) -> - throw(uri_parse_error). %% Check if char is allowed in scheme -spec is_scheme(char()) -> boolean(). @@ -694,36 +649,6 @@ parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless parse_hier(?STRING_EMPTY, URI) -> {<<>>, URI}. --spec parse_hier(iolist(), list(), uri_map()) -> uri_map(). -parse_hier(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_hier(unicode:characters_to_list(Str), Acc, URI); -parse_hier([H|Rest], Acc, URI) when is_binary(H) -> - parse_hier(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_hier([H|Rest], Acc, URI) when is_list(H) -> - parse_hier(H ++ Rest, Acc, URI); -parse_hier("//" ++ Rest, Acc, URI) -> - % Parse userinfo - try parse_userinfo(Rest, Acc, URI) of - Res -> Res - catch - throw:uri_parse_error -> - parse_host(Rest, [], URI) - end; -parse_hier([$/|Rest], _Acc, URI) -> - parse_segment(Rest, [$/], URI); % path-absolute -parse_hier([$?|Rest], _Acc, URI) -> - parse_query(Rest, [$?], URI); % path-empty ?query -parse_hier([$#|Rest], _Acc, URI) -> - parse_fragment(Rest, [], URI); % path-empty -parse_hier([Char|Rest], _, URI) -> % path-rootless - case is_pchar(Char) of - true -> parse_segment(Rest, [Char], URI); - false -> throw(uri_parse_error) - end; -parse_hier([], _, URI) -> - URI. - %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.2. 
Authority] @@ -766,27 +691,6 @@ parse_userinfo(?STRING_EMPTY, _URI) -> %% URI cannot end in userinfo state throw(uri_parse_error). --spec parse_userinfo(iolist(), list(), uri_map()) -> uri_map(). -parse_userinfo(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_userinfo(unicode:characters_to_list(Str), Acc, URI); -parse_userinfo([H|Rest], Acc, URI) when is_binary(H) -> - parse_userinfo(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_userinfo([H|Rest], Acc, URI) when is_list(H) -> - parse_userinfo(H ++ Rest, Acc, URI); -parse_userinfo([$@], _Acc, _URI) -> - %% URI cannot end in userinfo state - throw(uri_parse_error); -parse_userinfo([$@|Rest], Acc, URI) -> - parse_host(Rest, [], URI#{userinfo => decode_userinfo(lists:reverse(Acc))}); -parse_userinfo([Char|Rest], Acc, URI) -> - case is_userinfo(Char) of - true -> parse_userinfo(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_userinfo([], _Acc, _URI) -> - %% URI cannot end in userinfo state - throw(uri_parse_error). %% Check if char is allowed in userinfo -spec is_userinfo(char()) -> boolean(). @@ -862,32 +766,6 @@ parse_host(?STRING_REST(Char, Rest), URI) -> parse_host(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_host(iolist(), list(), uri_map()) -> uri_map(). 
-parse_host(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_host(unicode:characters_to_list(Str), Acc, URI); -parse_host([H|Rest], Acc, URI) when is_binary(H) -> - parse_host(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_host([H|Rest], Acc, URI) when is_list(H) -> - parse_host(H ++ Rest, Acc, URI); -parse_host([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); -parse_host([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => decode_host(lists:reverse(Acc))}); % path-abempty -parse_host([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty ?query -parse_host([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty -parse_host([$[|Rest], _Acc, URI) -> - parse_ipv6(Rest, [], URI); -parse_host([Char|Rest], Acc, URI) -> - case is_digit(Char) of - true -> parse_ipv4(Rest, [Char|Acc], URI); - false -> parse_reg_name([Char|Rest], Acc, URI) - end; -parse_host([], Acc, URI) -> - URI#{host => decode_host(lists:reverse(Acc))}. - -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. parse_reg_name(?STRING_REST($:, Rest), URI) -> @@ -915,30 +793,6 @@ parse_reg_name(?STRING_REST(Char, Rest), URI) -> parse_reg_name(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_reg_name(iolist(), list(), uri_map()) -> uri_map(). 
-parse_reg_name(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_reg_name(unicode:characters_to_list(Str), Acc, URI); -parse_reg_name([H|Rest], Acc, URI) when is_binary(H) -> - parse_reg_name(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_reg_name([H|Rest], Acc, URI) when is_list(H) -> - parse_reg_name(H ++ Rest, Acc, URI); -parse_reg_name([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); -parse_reg_name([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => decode_host(lists:reverse(Acc))}); % path-abempty -parse_reg_name([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty ?query -parse_reg_name([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty -parse_reg_name([Char|Rest], Acc, URI) -> - case is_reg_name(Char) of - true -> parse_reg_name(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_reg_name([], Acc, URI) -> - URI#{host => decode_host(lists:reverse(Acc))}. - %% Check if char is allowed in reg-name -spec is_reg_name(char()) -> boolean(). is_reg_name($%) -> true; @@ -976,29 +830,6 @@ parse_ipv4_bin(?STRING_EMPTY, Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {?STRING_EMPTY, URI}. --spec parse_ipv4(iolist(), list(), uri_map()) -> uri_map(). 
-parse_ipv4(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_ipv4(unicode:characters_to_list(Str), Acc, URI); -parse_ipv4([H|Rest], Acc, URI) when is_binary(H) -> - parse_ipv4(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_ipv4([H|Rest], Acc, URI) when is_list(H) -> - parse_ipv4(H ++ Rest, Acc, URI); -parse_ipv4([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => validate_ipv4_address(lists:reverse(Acc))}); -parse_ipv4([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => validate_ipv4_address(lists:reverse(Acc))}); % path-abempty -parse_ipv4([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => validate_ipv4_address(lists:reverse(Acc))}); % path-empty ?query -parse_ipv4([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => validate_ipv4_address(lists:reverse(Acc))}); % path-empty -parse_ipv4([Char|Rest], Acc, URI) -> - case is_ipv4(Char) of - true -> parse_ipv4(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_ipv4([], Acc, URI) -> - URI#{host => validate_ipv4_address(lists:reverse(Acc))}. %% Check if char is allowed in IPv4 addresses -spec is_ipv4(char()) -> boolean(). @@ -1025,27 +856,6 @@ parse_ipv6_bin(?STRING_REST(Char, Rest), Acc, URI) -> parse_ipv6_bin(?STRING_EMPTY, _Acc, _URI) -> throw(uri_parse_error). --spec parse_ipv6(iolist(), list(), uri_map()) -> uri_map(). 
-parse_ipv6(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_ipv6(unicode:characters_to_list(Str), Acc, URI); -parse_ipv6([H|Rest], Acc, URI) when is_binary(H) -> - parse_ipv6(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_ipv6([H|Rest], Acc, URI) when is_list(H) -> - parse_ipv6(H ++ Rest, Acc, URI); -parse_ipv6([$]|Rest], Acc, URI) -> - parse_ipv6_end(Rest, [], URI#{host => validate_ipv6_address(lists:reverse(Acc))}); -parse_ipv6([Char|Rest], Acc, URI) -> - case is_ipv6(Char) of - true -> parse_ipv6(Rest, [Char|Acc], URI); - false -> - io:format("# DEBUG Char: >>~c<<~n", [Char]), - io:format("# DEBUG Rest: >>~s<<~n", [Rest]), - throw(uri_parse_error) - end; -parse_ipv6([], _Acc, _URI) -> - throw(uri_parse_error). - %% Check if char is allowed in IPv6 addresses -spec is_ipv6(char()) -> boolean(). is_ipv6($:) -> true; @@ -1079,26 +889,6 @@ parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> parse_ipv6_bin_end(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_ipv6_end(iolist(), list(), uri_map()) -> uri_map(). -parse_ipv6_end(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_ipv6_end(unicode:characters_to_list(Str), Acc, URI); -parse_ipv6_end([H|Rest], Acc, URI) when is_binary(H) -> - parse_ipv6_end(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_ipv6_end([H|Rest], Acc, URI) when is_list(H) -> - parse_ipv6_end(H ++ Rest, Acc, URI); -parse_ipv6_end([$:|Rest], _Acc, URI) -> - parse_port(Rest, [], URI); -parse_ipv6_end([$/|Rest], _Acc, URI) -> - parse_segment(Rest, [$/], URI); % path-abempty -parse_ipv6_end([$?|Rest], _Acc, URI) -> - parse_query(Rest, [$?], URI); % path-empty ?query -parse_ipv6_end([$#|Rest], _Acc, URI) -> - parse_fragment(Rest, [], URI); % path-empty -parse_ipv6_end([], _Acc, URI) -> - URI. - - -spec validate_ipv6_address(list()) -> list(). 
validate_ipv6_address(Addr) -> case inet:parse_ipv6strict_address(Addr) of @@ -1137,32 +927,6 @@ parse_port(?STRING_REST(Char, Rest), URI) -> parse_port(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_port(iolist(), list(), uri_map()) -> uri_map(). -parse_port(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_port(unicode:characters_to_list(Str), Acc, URI); -parse_port([H|Rest], Acc, URI) when is_binary(H) -> - parse_port(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_port([H|Rest], Acc, URI) when is_list(H) -> - parse_port(H ++ Rest, Acc, URI); -parse_port([$/|Rest], Acc, URI) -> - {Port, _} = string:to_integer(lists:reverse(Acc)), - parse_segment(Rest, [$/], URI#{port => Port}); % path-abempty -parse_port([$?|Rest], Acc, URI) -> - {Port, _} = string:to_integer(lists:reverse(Acc)), - parse_query(Rest, [$?], URI#{port => Port}); % path-empty ?query -parse_port([$#|Rest], Acc, URI) -> - {Port, _} = string:to_integer(lists:reverse(Acc)), - parse_fragment(Rest, [], URI#{port => Port}); % path-empty -parse_port([Char|Rest], Acc, URI) -> - case is_digit(Char) of - true -> parse_port(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_port([], Acc, URI) -> - {Port, _} = string:to_integer(lists:reverse(Acc)), - URI#{port => Port}. - %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.4. Query] @@ -1189,23 +953,6 @@ parse_query(?STRING_REST(Char, Rest), URI) -> parse_query(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_query(iolist(), list(), uri_map()) -> uri_map(). 
-parse_query(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_query(unicode:characters_to_list(Str), Acc, URI); -parse_query([H|Rest], Acc, URI) when is_binary(H) -> - parse_query(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_query([H|Rest], Acc, URI) when is_list(H) -> - parse_query(H ++ Rest, Acc, URI); -parse_query([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{query => decode_query(lists:reverse(Acc))}); -parse_query([Char|Rest], Acc, URI) -> - case is_query(Char) of - true -> parse_query(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_query([], Acc, URI) -> - URI#{query => decode_query(lists:reverse(Acc))}. %% Check if char is allowed in query -spec is_query(char()) -> boolean(). @@ -1232,21 +979,6 @@ parse_fragment(?STRING_REST(Char, Rest), URI) -> parse_fragment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_fragment(iolist(), list(), uri_map()) -> uri_map(). -parse_fragment(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_fragment(unicode:characters_to_list(Str), Acc, URI); -parse_fragment([H|Rest], Acc, URI) when is_binary(H) -> - parse_fragment(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_fragment([H|Rest], Acc, URI) when is_list(H) -> - parse_fragment(H ++ Rest, Acc, URI); -parse_fragment([Char|Rest], Acc, URI) -> - case is_fragment(Char) of - true -> parse_fragment(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_fragment([], Acc, URI) -> - URI#{fragment => decode_fragment(lists:reverse(Acc))}. %% Check if char is allowed in fragment -spec is_fragment(char()) -> boolean(). @@ -1266,21 +998,6 @@ is_fragment(Char) -> is_pchar(Char). %% / "*" / "+" / "," / ";" / "=" %% %%------------------------------------------------------------------------- -%% %% Return true if input char is reserved. -%% -spec is_reserved(char()) -> boolean(). -%% is_reserved(Char) -> -%% is_gen_delim(Char) orelse is_sub_delim(Char). - -%% %% Check if char is reserved. 
-%% -spec is_gen_delim(char()) -> boolean(). -%% is_gen_delim($:) -> true; -%% is_gen_delim($/) -> true; -%% is_gen_delim($?) -> true; -%% is_gen_delim($#) -> true; -%% is_gen_delim($[) -> true; -%% is_gen_delim($]) -> true; -%% is_gen_delim($@) -> true; -%% is_gen_delim(_) -> false. %% Check if char is sub-delim. -spec is_sub_delim(char()) -> boolean(). @@ -1328,17 +1045,22 @@ is_hex_digit(C) when $0 =< C, C =< $9;$a =< C, C =< $f;$A =< C, C =< $F -> true; is_hex_digit(_) -> false. + %% Returns the size of a binary exluding the first element. %% Used in calls to split_binary(). -spec byte_size_exl_head(binary()) -> number(). byte_size_exl_head(<<>>) -> 0; byte_size_exl_head(Binary) -> byte_size(Binary) + 1. -% Remove brackets from binary + +%% Remove enclosing brackets from binary -spec remove_brackets(binary()) -> binary(). -remove_brackets(?STRING_REST($[,Addr)) -> - A1 = binary:replace(Addr, <<$[>>, <<>>), - binary:replace(A1, <<$]>>, <<>>); +remove_brackets(<<$[/utf8, Rest/binary>>) -> + {H,T} = split_binary(Rest, byte_size(Rest) - 1), + case T =:= <<$]/utf8>> of + true -> H; + false -> Rest + end; remove_brackets(Addr) -> Addr. @@ -1362,42 +1084,72 @@ remove_brackets(Addr) -> Addr. decode_userinfo(Cs) -> decode(Cs, fun is_userinfo/1, <<>>). - -spec decode_host(list()|binary()) -> list() | binary(). decode_host(Cs) -> decode(Cs, fun is_host/1, <<>>). -%% Check if char is allowed in host --spec is_host(char()) -> boolean(). -is_host($:) -> true; -is_host(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). - - -spec decode_path(list()|binary()) -> list() | binary(). decode_path(Cs) -> decode(Cs, fun is_path/1, <<>>). -%% Check if char is allowed in path --spec is_path(char()) -> boolean(). -is_path($/) -> true; - -is_path(Char) -> is_pchar(Char). - - -spec decode_query(list()|binary()) -> list() | binary(). decode_query(Cs) -> decode(Cs, fun is_query/1, <<>>). -spec decode_fragment(list()|binary()) -> list() | binary(). 
decode_fragment(Cs) -> - decode(Cs, fun is_host/1, <<>>). + decode(Cs, fun is_fragment/1, <<>>). + + +%%------------------------------------------------------------------------- +%% Percent-encode +%%------------------------------------------------------------------------- + +%% Only validates as scheme cannot have percent-encoded characters +-spec encode_scheme(list()|binary()) -> list() | binary(). +encode_scheme([]) -> + throw(uri_parse_error); +encode_scheme(<<>>) -> + throw(uri_parse_error); +encode_scheme(Scheme) -> + case validate_scheme(Scheme) of + true -> Scheme; + false -> throw(uri_parse_error) + end. + +-spec encode_userinfo(list()|binary()) -> list() | binary(). +encode_userinfo(Cs) -> + encode(Cs, fun is_userinfo/1). + +-spec encode_host(list()|binary()) -> list() | binary(). +encode_host(Cs) -> + case classify_host(Cs) of + regname -> Cs; + ipv4 -> Cs; + ipv6 -> bracket_ipv6(Cs); + other -> encode(Cs, fun is_reg_name/1) + end. +-spec encode_path(list()|binary()) -> list() | binary(). +encode_path(Cs) -> + encode(Cs, fun is_path/1). +-spec encode_query(list()|binary()) -> list() | binary(). +encode_query(Cs) -> + encode(Cs, fun is_query/1). + +-spec encode_fragment(list()|binary()) -> list() | binary(). +encode_fragment(Cs) -> + encode(Cs, fun is_fragment/1). + +%%------------------------------------------------------------------------- +%% Helper funtions for percent-decode +%%------------------------------------------------------------------------- -spec decode(list()|binary(), fun(), binary()) -> list() | binary(). 
decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> - B = hex2dec(C0)*16+hex2dec(C1), + B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), decode(Cs, Fun, <>); false -> throw(uri_parse_error) end; @@ -1411,7 +1163,7 @@ decode(<<>>, _Fun, Acc) -> decode([$%,C0,C1|Cs], Fun, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> - B = hex2dec(C0)*16+hex2dec(C1), + B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), decode(Cs, Fun, <>); false -> throw(uri_parse_error) end; @@ -1423,7 +1175,278 @@ decode([C|Cs], Fun, Acc) -> decode([], _Fun, Acc) -> unicode:characters_to_list(Acc). +%% Check if char is allowed in host +-spec is_host(char()) -> boolean(). +is_host($:) -> true; +is_host(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). + +%% Check if char is allowed in path +-spec is_path(char()) -> boolean(). +is_path($/) -> true; +is_path(Char) -> is_pchar(Char). + + + +%%------------------------------------------------------------------------- +%% Helper functions for percent-encode +%%------------------------------------------------------------------------- +-spec encode(list()|binary(), fun()) -> list() | binary(). +encode(Component, Fun) when is_list(Component) -> + B = unicode:characters_to_binary(Component), + unicode:characters_to_list(encode(B, Fun, <<>>)); +encode(Component, Fun) when is_binary(Component) -> + encode(Component, Fun, <<>>). +%% +encode(<>, Fun, Acc) -> + C = encode_codepoint_binary(Char, Fun), + encode(Rest, Fun, <>); +encode(<<_Char, _Rest/binary>>, _Fun, _Acc) -> + throw(uri_parse_error); +encode(<<>>, _Fun, Acc) -> + Acc. + + +-spec encode_codepoint_binary(integer(), fun()) -> list(). +encode_codepoint_binary(C, Fun) -> + case Fun(C) of + false -> percent_encode_binary(C); + true -> <> + end. + + +-spec percent_encode_binary(integer()) -> binary(). +percent_encode_binary(Code) -> + percent_encode_binary(<>, <<>>). 
+ + +percent_encode_binary(<>, Acc) -> + percent_encode_binary(Rest, <>); +percent_encode_binary(<<>>, Acc) -> + Acc. -hex2dec(X) when (X >= $0) andalso (X =< $9) -> X - $0; -hex2dec(X) when (X >= $A) andalso (X =< $F) -> X - $A + 10; -hex2dec(X) when (X >= $a) andalso (X =< $f) -> X - $a + 10. + +%%------------------------------------------------------------------------- +%%------------------------------------------------------------------------- +validate_scheme([]) -> true; +validate_scheme([H|T]) -> + case is_scheme(H) of + true -> validate_scheme(T); + false -> false + end; +validate_scheme(<<>>) -> true; +validate_scheme(<>) -> + case is_scheme(H) of + true -> validate_scheme(Rest); + false -> false + end. + +%%------------------------------------------------------------------------- +%% Classifies hostname into the following categories: +%% regname, ipv4 - address does not contain reserved characters to be +%% percent-encoded +%% ipv6 - address does not contain reserved characters but it shall be +%% encolsed in brackets +%% other - address shall be percent-encoded +%%------------------------------------------------------------------------- +classify_host([]) -> false; +classify_host(Addr) when is_binary(Addr) -> + A = unicode:characters_to_list(Addr), + classify_host_ipv6(A); +classify_host(Addr) -> + classify_host_ipv6(Addr). + +classify_host_ipv6(Addr) -> + case is_ipv6_address(Addr) of + true -> ipv6; + false -> classify_host_ipv4(Addr) + end. + +classify_host_ipv4(Addr) -> + case is_ipv4_address(Addr) of + true -> ipv4; + false -> classify_host_regname(Addr) + end. + +classify_host_regname([]) -> regname; +classify_host_regname([H|T]) -> + case is_reg_name(H) of + true -> classify_host_regname(T); + false -> other + end; +classify_host_regname(<<>>) -> regname; +classify_host_regname(<>) -> + case is_reg_name(H) of + true -> classify_host_regname(Rest); + false -> other + end. 
+ +is_ipv4_address(Addr) -> + case inet:parse_ipv4strict_address(Addr) of + {ok, _} -> true; + {error, _} -> false + end. + +is_ipv6_address(Addr) -> + case inet:parse_ipv6strict_address(Addr) of + {ok, _} -> true; + {error, _} -> false + end. + +bracket_ipv6(Addr) when is_binary(Addr) -> + concat(<<$[,Addr/binary>>,<<$]>>); +bracket_ipv6(Addr) when is_list(Addr) -> + [$[|Addr] ++ "]". + + +%%------------------------------------------------------------------------- +%% Helper funtions for recompose +%%------------------------------------------------------------------------- + +%%------------------------------------------------------------------------- +%% Checks if input Map has valid combination of fields that can be +%% recomposed into a URI. +%% It filters out the following combinations from the set of all possible +%% values: +%% - port +%% E.g. ":8080" - invalid URI +%% - userinfo +%% E.g. "//user@" - invalid URI +%% - userinfo port +%% E.g. "//user@:8080" => #{host => [],port => 8080,userinfo => "user"} +%% There is always at least an empty host when both userinfo and port +%% are present. +%%------------------------------------------------------------------------- +is_valid_map(Map) -> + case + (not maps:is_key(userinfo, Map) andalso + not maps:is_key(host, Map) andalso + maps:is_key(port, Map)) + orelse + (maps:is_key(userinfo, Map) andalso + not maps:is_key(host, Map) andalso + not maps:is_key(port, Map)) + orelse + (maps:is_key(userinfo, Map) andalso + not maps:is_key(host, Map) andalso + maps:is_key(port, Map)) + of + true -> + false; + false -> + true + end. + + +update_scheme(#{scheme := Scheme}, _) -> + add_colon_postfix(encode_scheme(Scheme)); +update_scheme(#{}, _) -> + empty. 
+ + +update_userinfo(#{userinfo := Userinfo}, empty) -> + add_auth_prefix(encode_userinfo(Userinfo)); +update_userinfo(#{userinfo := Userinfo}, URI) -> + concat(URI,add_auth_prefix(encode_userinfo(Userinfo))); +update_userinfo(#{}, empty) -> + empty; +update_userinfo(#{}, URI) -> + URI. + + +update_host(#{host := Host}, empty) -> + add_auth_prefix(encode_host(Host)); +update_host(#{host := Host} = Map, URI) -> + concat(URI,add_host_prefix(Map, encode_host(Host))); +update_host(#{}, empty) -> + empty; +update_host(#{}, URI) -> + URI. + + +%% URI cannot be empty for ports. E.g. ":8080" is not a valid URI +update_port(#{port := Port}, URI) -> + concat(URI,add_colon(encode_port(Port))); +update_port(#{}, URI) -> + URI. + + +update_path(#{path := Path}, empty) -> + encode_path(Path); +update_path(#{path := Path}, URI) -> + concat(URI,encode_path(Path)); +update_path(#{}, empty) -> + empty; +update_path(#{}, URI) -> + URI. + + +update_query(#{query := Query}, empty) -> + encode_query(Query); +update_query(#{query := Query}, URI) -> + concat(URI,encode_query(Query)); +update_query(#{}, empty) -> + empty; +update_query(#{}, URI) -> + URI. + + +update_fragment(#{fragment := Fragment}, empty) -> + add_hashmark(encode_query(Fragment)); +update_fragment(#{fragment := Fragment}, URI) -> + concat(URI,add_hashmark(encode_fragment(Fragment))); +update_fragment(#{}, empty) -> + ""; +update_fragment(#{}, URI) -> + URI. + +%%------------------------------------------------------------------------- +%% Concatenates its arguments that can be lists and binaries. +%% The result is a list if at least one of its argument is a list and +%% binary otherwise. +%%------------------------------------------------------------------------- +concat(A, B) when is_binary(A), is_binary(B) -> + <>; +concat(A, B) when is_binary(A), is_list(B) -> + unicode:characters_to_list(A) ++ B; +concat(A, B) when is_list(A) -> + A ++ maybe_to_list(B). 
+ +add_hashmark(empty) -> empty; +add_hashmark(Comp) when is_binary(Comp) -> + <<$#, Comp/binary>>; +add_hashmark(Comp) when is_list(Comp) -> + [$#|Comp]. + +add_colon(empty) -> empty; +add_colon(Comp) when is_binary(Comp) -> + <<$:, Comp/binary>>; +add_colon(Comp) when is_list(Comp) -> + [$:|Comp]. + +add_colon_postfix(empty) -> empty; +add_colon_postfix(Comp) when is_binary(Comp) -> + <>; +add_colon_postfix(Comp) when is_list(Comp) -> + Comp ++ ":". + +add_auth_prefix(empty) -> empty; +add_auth_prefix(Comp) when is_binary(Comp) -> + <<"//", Comp/binary>>; +add_auth_prefix(Comp) when is_list(Comp) -> + [$/,$/|Comp]. + +add_host_prefix(_, empty) -> empty; +add_host_prefix(#{userinfo := _}, Host) when is_binary(Host) -> + <<$@,Host/binary>>; +add_host_prefix(#{}, Host) when is_binary(Host) -> + <<"//",Host/binary>>; +add_host_prefix(#{userinfo := _}, Host) when is_list(Host) -> + [$@|Host]; +add_host_prefix(#{}, Host) when is_list(Host) -> + [$/,$/|Host]. + +maybe_to_list(Comp) when is_binary(Comp) -> unicode:characters_to_list(Comp); +maybe_to_list(Comp) -> Comp. + +encode_port(Port) -> + integer_to_binary(Port). diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index c379eeb15b..1859a25a18 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -31,9 +31,31 @@ parse_path/1, parse_pct_encoded_fragment/1, parse_pct_encoded_query/1, parse_pct_encoded_userinfo/1, parse_port/1, parse_query/1, parse_scheme/1, parse_userinfo/1, - parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1 + parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1, + recompose_fragment/1, recompose_parse_fragment/1, + recompose_query/1, recompose_parse_query/1, + recompose_path/1, recompose_parse_path/1, + recompose_autogen/1, parse_recompose_autogen/1 ]). + +-define(SCHEME, "foo"). +-define(USERINFO, "åsa"). +-define(USERINFO_ENC, "%C3%A5sa"). +-define(HOST, "älvsjö"). 
+-define(HOST_ENC, "%C3%A4lvsj%C3%B6"). +-define(IPV6, "::127.0.0.1"). +-define(IPV6_ENC, "[::127.0.0.1]"). +-define(PORT, 8042). +-define(PORT_ENC, ":8042"). +-define(PATH, "/där"). +-define(PATH_ENC, "/d%C3%A4r"). +-define(QUERY, "?name=örn"). +-define(QUERY_ENC, "?name=%C3%B6rn"). +-define(FRAGMENT, "näsa"). +-define(FRAGMENT_ENC, "#n%C3%A4sa"). + + suite() -> [{timetrap,{minutes,1}}]. @@ -66,12 +88,202 @@ all() -> parse_list, parse_binary, parse_mixed, - parse_relative + parse_relative, + recompose_fragment, + recompose_parse_fragment, + recompose_query, + recompose_parse_query, + recompose_path, + recompose_parse_path, + recompose_autogen, + parse_recompose_autogen ]. groups() -> []. + +%%------------------------------------------------------------------------- +%% Helper functions +%%------------------------------------------------------------------------- +uri_combinations() -> + [[Sch,Usr,Hst,Prt,Pat,Qry,Frg] || + Sch <- [fun update_scheme/1, fun update_scheme_binary/1, none], + Usr <- [fun update_userinfo/1, fun update_userinfo_binary/1, none], + Hst <- [fun update_host/1, fun update_host_binary/1, + fun update_ipv6/1, fun update_ipv6_binary/1, none], + Prt <- [fun update_port/1, none], + Pat <- [fun update_path/1, fun update_path_binary/1, none], + Qry <- [fun update_query/1,fun update_query_binary/1, none], + Frg <- [fun update_fragment/1, fun update_fragment_binary/1, none], + not (Usr =:= none andalso Hst =:= none andalso Prt =/= none), + not (Usr =/= none andalso Hst =:= none andalso Prt =:= none), + not (Usr =/= none andalso Hst =:= none andalso Prt =/= none)]. + + +generate_test_vector(Comb) -> + Fun = fun (F, {Map, URI}) when is_function(F) -> F({Map, URI}); + (_, Map) -> Map + end, + lists:foldl(Fun, {#{}, empty}, Comb). + +generate_test_vectors(L) -> + lists:map(fun generate_test_vector/1, L). 
+ +update_fragment({In, empty}) -> + {In#{fragment => ?FRAGMENT}, ?FRAGMENT_ENC}; +update_fragment({In, Out}) when is_list(Out) -> + {In#{fragment => ?FRAGMENT}, Out ++ ?FRAGMENT_ENC}; +update_fragment({In, Out}) when is_binary(Out) -> + {In#{fragment => ?FRAGMENT}, binary_to_list(Out) ++ ?FRAGMENT_ENC}. + +update_fragment_binary({In, empty}) -> + {In#{fragment => <>}, <>}; +update_fragment_binary({In, Out}) when is_list(Out) -> + {In#{fragment => <>}, Out ++ ?FRAGMENT_ENC}; +update_fragment_binary({In, Out}) when is_binary(Out) -> + {In#{fragment => <>}, <>}. + + +update_query({In, empty}) -> + {In#{query => ?QUERY}, ?QUERY_ENC}; +update_query({In, Out}) when is_list(Out) -> + {In#{query => ?QUERY}, Out ++ ?QUERY_ENC}; +update_query({In, Out}) when is_binary(Out) -> + {In#{query => ?QUERY}, binary_to_list(Out) ++ ?QUERY_ENC}. + +update_query_binary({In, empty}) -> + {In#{query => <>}, <>}; +update_query_binary({In, Out}) when is_list(Out) -> + {In#{query => <>}, Out ++ ?QUERY_ENC}; +update_query_binary({In, Out}) when is_binary(Out) -> + {In#{query => <>}, <>}. + +update_path({In, empty}) -> + {In#{path => ?PATH}, ?PATH_ENC}; +update_path({In, Out}) when is_list(Out) -> + {In#{path => ?PATH}, Out ++ ?PATH_ENC}; +update_path({In, Out}) when is_binary(Out) -> + {In#{path => ?PATH}, binary_to_list(Out) ++ ?PATH_ENC}. + +update_path_binary({In, empty}) -> + {In#{path => <>}, <>}; +update_path_binary({In, Out}) when is_list(Out) -> + {In#{path => <>}, Out ++ ?PATH_ENC}; +update_path_binary({In, Out}) when is_binary(Out) -> + {In#{path => <>}, <>}. + +update_port({In, Out}) when is_list(Out) -> + {In#{port => ?PORT}, Out ++ ?PORT_ENC}; +update_port({In, Out}) when is_binary(Out) -> + {In#{port => ?PORT}, <>}. 
+ +update_host({In, empty}) -> + {In#{host => ?HOST}, "//" ++ ?HOST_ENC}; +update_host({In, Out}) when is_list(Out) -> + case maps:is_key(userinfo, In) of + true -> {In#{host => ?HOST}, Out ++ [$@|?HOST_ENC]}; + false -> {In#{host => ?HOST}, Out ++ [$/,$/|?HOST_ENC]} + end; +update_host({In, Out}) when is_binary(Out) -> + case maps:is_key(userinfo, In) of + true -> {In#{host => ?HOST}, binary_to_list(Out) ++ [$@|?HOST_ENC]}; + false -> {In#{host => ?HOST}, binary_to_list(Out) ++ [$/,$/|?HOST_ENC]} + end. + +update_host_binary({In, empty}) -> + {In#{host => <>}, <<"//",?HOST_ENC>>}; +update_host_binary({In, Out}) when is_list(Out) -> + case maps:is_key(userinfo, In) of + true -> {In#{host => <>}, Out ++ [$@|?HOST_ENC]}; + false -> {In#{host => <>}, Out ++ [$/,$/|?HOST_ENC]} + end; +update_host_binary({In, Out}) when is_binary(Out) -> + case maps:is_key(userinfo, In) of + true -> {In#{host => <>}, <>}; + false-> {In#{host => <>}, <>} + end. + +update_ipv6({In, empty}) -> + {In#{host => ?IPV6}, "//" ++ ?IPV6_ENC}; +update_ipv6({In, Out}) when is_list(Out) -> + case maps:is_key(userinfo, In) of + true -> {In#{host => ?IPV6}, Out ++ [$@|?IPV6_ENC]}; + false -> {In#{host => ?IPV6}, Out ++ [$/,$/|?IPV6_ENC]} + end; +update_ipv6({In, Out}) when is_binary(Out) -> + case maps:is_key(userinfo, In) of + true -> {In#{host => ?IPV6}, binary_to_list(Out) ++ [$@|?IPV6_ENC]}; + false -> {In#{host => ?IPV6}, binary_to_list(Out) ++ [$/,$/|?IPV6_ENC]} + end. + +update_ipv6_binary({In, empty}) -> + {In#{host => <>}, <<"//",?IPV6_ENC>>}; +update_ipv6_binary({In, Out}) when is_list(Out) -> + case maps:is_key(userinfo, In) of + true -> {In#{host => <>}, Out ++ [$@|?IPV6_ENC]}; + false -> {In#{host => <>}, Out ++ [$/,$/|?IPV6_ENC]} + end; +update_ipv6_binary({In, Out}) when is_binary(Out) -> + case maps:is_key(userinfo, In) of + true -> {In#{host => <>}, <>}; + false-> {In#{host => <>}, <>} + end. 
+ +update_userinfo({In, empty}) -> + {In#{userinfo => ?USERINFO}, "//" ++ ?USERINFO_ENC}; +update_userinfo({In, Out}) when is_list(Out) -> + {In#{userinfo => ?USERINFO}, Out ++ "//" ++ ?USERINFO_ENC}; +update_userinfo({In, Out}) when is_binary(Out) -> + {In#{userinfo => ?USERINFO}, binary_to_list(Out) ++ "//" ++ ?USERINFO_ENC}. + +update_userinfo_binary({In, empty}) -> + {In#{userinfo => <>}, <<"//",?USERINFO_ENC>>}; +update_userinfo_binary({In, Out}) when is_list(Out) -> + {In#{userinfo => <>}, Out ++ "//" ++ ?USERINFO_ENC}; +update_userinfo_binary({In, Out}) when is_binary(Out) -> + {In#{userinfo => <>}, <>}. + +update_scheme({In, empty}) -> + {In#{scheme => ?SCHEME}, ?SCHEME ++ ":"}. + +update_scheme_binary({In, empty}) -> + {In#{scheme => <>}, <>}. + + +%% Test recompose on a generated test vector +run_test_recompose({#{}, empty}) -> + try "" = uri_string:recompose(#{}) of + _ -> ok + catch + _:_ -> error({test_failed, #{}, ""}) + end; +run_test_recompose({Map, URI}) -> + try URI = uri_string:recompose(Map) of + URI -> ok + catch + _:_ -> error({test_failed, Map, URI}) + end. + +%% Test parse - recompose on a generated test vector +run_test_parse_recompose({#{}, empty}) -> + try "" = uri_string:recompose(uri_string:parse("")) of + _ -> ok + catch + _:_ -> error({test_failed, #{}, ""}) + end; +run_test_parse_recompose({Map, URI}) -> + try URI = uri_string:recompose(uri_string:parse(URI)) of + URI -> ok + catch + _:_ -> error({test_failed, Map, URI}) + end. + + +%%------------------------------------------------------------------------- +%% Parse tests +%%------------------------------------------------------------------------- + parse_binary_scheme(_Config) -> #{} = uri_string:parse(<<>>), #{path := <<"foo">>} = uri_string:parse(<<"foo">>), @@ -438,3 +650,87 @@ parse_relative(_Config) -> uri_string:parse(lists:append("/pa",<<"th">>)), #{path := "foo"} = uri_string:parse(lists:append("fo",<<"o">>)). 
+ + +%%------------------------------------------------------------------------- +%% Recompose tests +%%------------------------------------------------------------------------- +recompose_fragment(_Config) -> + <> = uri_string:recompose(#{fragment => <>}), + ?FRAGMENT_ENC = uri_string:recompose(#{fragment => ?FRAGMENT}). + +recompose_parse_fragment(_Config) -> + <> = uri_string:recompose(uri_string:parse(<>)), + ?FRAGMENT_ENC = uri_string:recompose(uri_string:parse(?FRAGMENT_ENC)). + +recompose_query(_Config) -> + <> = + uri_string:recompose(#{query => <>}), + <> = + uri_string:recompose(#{query => <>, + fragment => <>}), + "?name=%C3%B6rn" = + uri_string:recompose(#{query => "?name=örn"}), + "?name=%C3%B6rn#n%C3%A4sa" = + uri_string:recompose(#{query => "?name=örn", + fragment => "näsa"}). + +recompose_parse_query(_Config) -> + <<"?name=%C3%B6rn">> = uri_string:recompose(uri_string:parse(<<"?name=%C3%B6rn">>)), + <<"?name=%C3%B6rn#n%C3%A4sa">> = + uri_string:recompose(uri_string:parse(<<"?name=%C3%B6rn#n%C3%A4sa">>)), + "?name=%C3%B6rn" = uri_string:recompose(uri_string:parse("?name=%C3%B6rn")), + "?name=%C3%B6rn#n%C3%A4sa" = uri_string:recompose(uri_string:parse("?name=%C3%B6rn#n%C3%A4sa")). 
+ +recompose_path(_Config) -> + <<"/d%C3%A4r">> = + uri_string:recompose(#{path => <<"/där"/utf8>>}), + <<"/d%C3%A4r#n%C3%A4sa">> = + uri_string:recompose(#{path => <<"/där"/utf8>>, + fragment => <<"näsa"/utf8>>}), + <<"/d%C3%A4r?name=%C3%B6rn">> = + uri_string:recompose(#{path => <<"/där"/utf8>>, + query => <<"?name=örn"/utf8>>}), + <<"/d%C3%A4r?name=%C3%B6rn#n%C3%A4sa">> = + uri_string:recompose(#{path => <<"/där"/utf8>>, + query => <<"?name=örn"/utf8>>, + fragment => <<"näsa"/utf8>>}), + + + "/d%C3%A4r" = + uri_string:recompose(#{path => "/där"}), + "/d%C3%A4r#n%C3%A4sa" = + uri_string:recompose(#{path => "/där", + fragment => "näsa"}), + "/d%C3%A4r?name=%C3%B6rn" = + uri_string:recompose(#{path => "/där", + query => "?name=örn"}), + "/d%C3%A4r?name=%C3%B6rn#n%C3%A4sa" = + uri_string:recompose(#{path => "/där", + query => "?name=örn", + fragment => "näsa"}). + + +recompose_parse_path(_Config) -> + <<"/d%C3%A4r">> = + uri_string:recompose(uri_string:parse(<<"/d%C3%A4r">>)), + <<"/d%C3%A4r#n%C3%A4sa">> = + uri_string:recompose(uri_string:parse(<<"/d%C3%A4r#n%C3%A4sa">>)), + <<"/d%C3%A4r?name=%C3%B6rn">> = + uri_string:recompose(uri_string:parse(<<"/d%C3%A4r?name=%C3%B6rn">>)), + + "/d%C3%A4r" = + uri_string:recompose(uri_string:parse("/d%C3%A4r")), + "/d%C3%A4r#n%C3%A4sa" = + uri_string:recompose(uri_string:parse("/d%C3%A4r#n%C3%A4sa")), + "/d%C3%A4r?name=%C3%B6rn" = + uri_string:recompose(uri_string:parse("/d%C3%A4r?name=%C3%B6rn")). + + +recompose_autogen(_Config) -> + Tests = generate_test_vectors(uri_combinations()), + lists:map(fun run_test_recompose/1, Tests). + +parse_recompose_autogen(_Config) -> + Tests = generate_test_vectors(uri_combinations()), + lists:map(fun run_test_parse_recompose/1, Tests). -- cgit v1.2.3 From 505579acda74b9281c965488f86cbd6c83254a57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 29 Sep 2017 16:54:50 +0200 Subject: stdlib: Improve calculation of parsed binary - Improved calculation of parsed binary. 
- Added tests for special corner cases. - Fixed dialyzer warnings. --- lib/stdlib/src/uri_string.erl | 246 +++++++++++++++++++++-------------- lib/stdlib/test/uri_string_SUITE.erl | 19 ++- 2 files changed, 164 insertions(+), 101 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 89a2c21518..bb7079c193 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -434,51 +434,36 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> %% Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + Userinfo = calculate_parsed_part(Rest, T), URI1#{userinfo => decode_userinfo(Userinfo)} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), - {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), + Host = calculate_parsed_part_sl(Rest, T), URI1#{host => decode_host(remove_brackets(Host))} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), URI1#{path => decode_path(?STRING_REST($/, Path))}; parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), URI1#{query => decode_query(?STRING_REST($?, Query))}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), URI1#{fragment => decode_fragment(Fragment)}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> {T, URI1} = parse_segment_nz_nc(Rest, URI), % 
path-noscheme - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), URI1#{path => decode_path(?STRING_REST(Char, Path))}; false -> throw(uri_parse_error) end. -%% Returns size of 'Rest' for proper calculation of splitting position. -%% Solves the following special case: -%% -%% #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>). -%% -%% While keeping the following true: -%% -%% #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>). -%% #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>). -%% --spec byte_size_exl_single_slash(uri_string()) -> number(). -byte_size_exl_single_slash(<<$/>>) -> 0; -byte_size_exl_single_slash(Rest) -> byte_size(Rest). - - %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.3. Path] %% @@ -516,11 +501,11 @@ parse_segment(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of @@ -539,11 +524,11 @@ parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; 
parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of @@ -580,7 +565,7 @@ is_segment_nz_nc(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). parse_scheme_start(?STRING_REST(Char, Rest), URI) -> case is_alpha(Char) of true -> {T, URI1} = parse_scheme(Rest, URI), - {Scheme, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + Scheme = calculate_parsed_scheme(Rest, T), URI1#{scheme => ?STRING_REST(Char, Scheme)}; false -> throw(uri_parse_error) end. @@ -618,31 +603,31 @@ parse_hier(?STRING_REST("//", Rest), URI) -> % Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + Userinfo = calculate_parsed_part(Rest, T), {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), - {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), + Host = calculate_parsed_part_sl(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}} end; parse_hier(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_hier(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % 
path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of true -> % segment_nz {T, URI1} = parse_segment(Rest, URI), - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST(Char, Path))}}; false -> throw(uri_parse_error) end; @@ -680,7 +665,7 @@ parse_userinfo(?CHAR($@), _URI) -> throw(uri_parse_error); parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), - {Host, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Host = calculate_parsed_part(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of @@ -741,22 +726,22 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + H = calculate_parsed_part(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -770,20 +755,20 @@ parse_host(?STRING_EMPTY, URI) -> -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + H = calculate_parsed_part(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_reg_name(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of @@ -803,23 +788,23 @@ is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). 
parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), - {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + H = calculate_parsed_part(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of @@ -866,20 +851,20 @@ is_ipv6(Char) -> is_hex_digit(Char). -spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + H = calculate_parsed_part(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of @@ -909,15 +894,15 @@ validate_ipv6_address(Addr) -> -spec parse_port(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_port(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_port(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -943,7 +928,7 @@ parse_port(?STRING_EMPTY, URI) -> -spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}. parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of @@ -1046,13 +1031,6 @@ is_hex_digit(C) is_hex_digit(_) -> false. -%% Returns the size of a binary exluding the first element. -%% Used in calls to split_binary(). --spec byte_size_exl_head(binary()) -> number(). -byte_size_exl_head(<<>>) -> 0; -byte_size_exl_head(Binary) -> byte_size(Binary) + 1. - - %% Remove enclosing brackets from binary -spec remove_brackets(binary()) -> binary(). remove_brackets(<<$[/utf8, Rest/binary>>) -> @@ -1064,6 +1042,95 @@ remove_brackets(<<$[/utf8, Rest/binary>>) -> remove_brackets(Addr) -> Addr. +%%------------------------------------------------------------------------- +%% Helper functions for calculating the parsed binary. 
+%%------------------------------------------------------------------------- + +%% Returns the parsed binary based on Input and the Unparsed part. +%% Handles the following special cases: +%% +%% #{host => [],path => "/",query => "?"} = uri_string:parse("///?") +%% #{fragment => [],host => [],path => "/"} = uri_string:parse("///#") +%% +-spec calculate_parsed_part(binary(), binary()) -> binary(). +calculate_parsed_part(<<$?>>, _) -> <<>>; +calculate_parsed_part(<<$#>>, _) -> <<>>; +calculate_parsed_part(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +%% Returns the parsed binary based on Input and the Unparsed part. +%% Used when parsing authority. +%% +%% Handles the following special cases: +%% +%% #{host => "foo",query => "?"} = uri_string:parse("//foo?") +%% #{fragment => [],host => "foo"} = uri_string:parse("//foo#") +%% #{host => "foo",path => "/"} = uri_string:parse("//foo/") +%% #{host => "foo",query => "?",scheme => "http"} = uri_string:parse("http://foo?") +%% #{fragment => [],host => "foo",scheme => "http"} = uri_string:parse("http://foo#") +%% #{host => "foo",path => "/",scheme => "http"} = uri_string:parse("http://foo/") +%% +-spec calculate_parsed_part_sl(binary(), binary()) -> binary(). +calculate_parsed_part_sl(<<$?>>, _) -> <<>>; +calculate_parsed_part_sl(<<$#>>, _) -> <<>>; +calculate_parsed_part_sl(<<>>, _) -> <<>>; +calculate_parsed_part_sl(Input, <<>>) -> + case binary:last(Input) of + $? -> + {First, _} = + split_binary(Input, byte_size(Input) - 1), + First; + + $# -> + {First, _} = + split_binary(Input, byte_size(Input) - 1), + First; + $/ -> + {First, _} = + split_binary(Input, byte_size(Input) - 1), + First; + _Else -> + {First, _} = + split_binary(Input, byte_size_exl_single_slash(Input)), + First + end; +calculate_parsed_part_sl(Input, Unparsed) -> + {First, _} = + split_binary(Input, byte_size_exl_single_slash(Input) - byte_size_exl_head(Unparsed)), + First. 
+ + +%% Returns the parsed binary based on Input and the Unparsed part. +%% Used when parsing scheme. +-spec calculate_parsed_scheme(binary(), binary()) -> binary(). +calculate_parsed_scheme(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size(Unparsed) - 1), + First. + +%% Returns the size of a binary exluding the first element. +%% Used in calls to split_binary(). +-spec byte_size_exl_head(binary()) -> number(). +byte_size_exl_head(<<>>) -> 0; +byte_size_exl_head(Binary) -> byte_size(Binary) + 1. + + +%% Returns size of 'Rest' for proper calculation of splitting position. +%% Solves the following special case: +%% +%% #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>). +%% +%% While keeping the following true: +%% +%% #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>). +%% #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>). +%% +-spec byte_size_exl_single_slash(uri_string()) -> number(). +byte_size_exl_single_slash(<<$/>>) -> 0; +byte_size_exl_single_slash(Rest) -> byte_size(Rest). + + %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 2.1. Percent-Encoding] %% @@ -1080,23 +1147,23 @@ remove_brackets(Addr) -> Addr. %% %% pct-encoded = "%" HEXDIG HEXDIG %%------------------------------------------------------------------------- --spec decode_userinfo(list()|binary()) -> list() | binary(). +-spec decode_userinfo(binary()) -> binary(). decode_userinfo(Cs) -> decode(Cs, fun is_userinfo/1, <<>>). --spec decode_host(list()|binary()) -> list() | binary(). +-spec decode_host(binary()) -> binary(). decode_host(Cs) -> decode(Cs, fun is_host/1, <<>>). --spec decode_path(list()|binary()) -> list() | binary(). +-spec decode_path(binary()) -> binary(). decode_path(Cs) -> decode(Cs, fun is_path/1, <<>>). --spec decode_query(list()|binary()) -> list() | binary(). +-spec decode_query(binary()) -> binary(). 
decode_query(Cs) -> decode(Cs, fun is_query/1, <<>>). --spec decode_fragment(list()|binary()) -> list() | binary(). +-spec decode_fragment(binary()) -> binary(). decode_fragment(Cs) -> decode(Cs, fun is_fragment/1, <<>>). @@ -1136,7 +1203,10 @@ encode_path(Cs) -> -spec encode_query(list()|binary()) -> list() | binary(). encode_query(Cs) -> - encode(Cs, fun is_query/1). + case validate_query(Cs) of + true -> encode(Cs, fun is_query/1); + false -> throw(uri_parse_error) + end. -spec encode_fragment(list()|binary()) -> list() | binary(). encode_fragment(Cs) -> @@ -1145,7 +1215,6 @@ encode_fragment(Cs) -> %%------------------------------------------------------------------------- %% Helper funtions for percent-decode %%------------------------------------------------------------------------- --spec decode(list()|binary(), fun(), binary()) -> list() | binary(). decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> @@ -1159,21 +1228,7 @@ decode(<>, Fun, Acc) -> false -> throw(uri_parse_error) end; decode(<<>>, _Fun, Acc) -> - Acc; -decode([$%,C0,C1|Cs], Fun, Acc) -> - case is_hex_digit(C0) andalso is_hex_digit(C1) of - true -> - B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), - decode(Cs, Fun, <>); - false -> throw(uri_parse_error) - end; -decode([C|Cs], Fun, Acc) -> - case Fun(C) of - true -> decode(Cs, Fun, <>); - false -> throw(uri_parse_error) - end; -decode([], _Fun, Acc) -> - unicode:characters_to_list(Acc). + Acc. %% Check if char is allowed in host -spec is_host(char()) -> boolean(). @@ -1186,7 +1241,6 @@ is_path($/) -> true; is_path(Char) -> is_pchar(Char). - %%------------------------------------------------------------------------- %% Helper functions for percent-encode %%------------------------------------------------------------------------- @@ -1206,7 +1260,7 @@ encode(<<>>, _Fun, Acc) -> Acc. --spec encode_codepoint_binary(integer(), fun()) -> list(). +-spec encode_codepoint_binary(integer(), fun()) -> binary(). 
encode_codepoint_binary(C, Fun) -> case Fun(C) of false -> percent_encode_binary(C); @@ -1240,6 +1294,11 @@ validate_scheme(<>) -> false -> false end. +validate_query([$?|_]) -> true; +validate_query(<<$?/utf8, _/binary>>) -> true; +validate_query(_) -> false. + + %%------------------------------------------------------------------------- %% Classifies hostname into the following categories: %% regname, ipv4 - address does not contain reserved characters to be @@ -1248,7 +1307,7 @@ validate_scheme(<>) -> %% encolsed in brackets %% other - address shall be percent-encoded %%------------------------------------------------------------------------- -classify_host([]) -> false; +classify_host([]) -> other; classify_host(Addr) when is_binary(Addr) -> A = unicode:characters_to_list(Addr), classify_host_ipv6(A); @@ -1272,12 +1331,6 @@ classify_host_regname([H|T]) -> case is_reg_name(H) of true -> classify_host_regname(T); false -> other - end; -classify_host_regname(<<>>) -> regname; -classify_host_regname(<>) -> - case is_reg_name(H) of - true -> classify_host_regname(Rest); - false -> other end. is_ipv4_address(Addr) -> @@ -1391,7 +1444,7 @@ update_query(#{}, URI) -> update_fragment(#{fragment := Fragment}, empty) -> - add_hashmark(encode_query(Fragment)); + add_hashmark(encode_fragment(Fragment)); update_fragment(#{fragment := Fragment}, URI) -> concat(URI,add_hashmark(encode_fragment(Fragment))); update_fragment(#{}, empty) -> @@ -1411,31 +1464,24 @@ concat(A, B) when is_binary(A), is_list(B) -> concat(A, B) when is_list(A) -> A ++ maybe_to_list(B). -add_hashmark(empty) -> empty; add_hashmark(Comp) when is_binary(Comp) -> <<$#, Comp/binary>>; add_hashmark(Comp) when is_list(Comp) -> [$#|Comp]. -add_colon(empty) -> empty; add_colon(Comp) when is_binary(Comp) -> - <<$:, Comp/binary>>; -add_colon(Comp) when is_list(Comp) -> - [$:|Comp]. + <<$:, Comp/binary>>. 
-add_colon_postfix(empty) -> empty; add_colon_postfix(Comp) when is_binary(Comp) -> <>; add_colon_postfix(Comp) when is_list(Comp) -> Comp ++ ":". -add_auth_prefix(empty) -> empty; add_auth_prefix(Comp) when is_binary(Comp) -> <<"//", Comp/binary>>; add_auth_prefix(Comp) when is_list(Comp) -> [$/,$/|Comp]. -add_host_prefix(_, empty) -> empty; add_host_prefix(#{userinfo := _}, Host) when is_binary(Host) -> <<$@,Host/binary>>; add_host_prefix(#{}, Host) when is_binary(Host) -> diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 1859a25a18..0eb5105c35 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -31,7 +31,7 @@ parse_path/1, parse_pct_encoded_fragment/1, parse_pct_encoded_query/1, parse_pct_encoded_userinfo/1, parse_port/1, parse_query/1, parse_scheme/1, parse_userinfo/1, - parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1, + parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1, parse_special/1, recompose_fragment/1, recompose_parse_fragment/1, recompose_query/1, recompose_parse_query/1, recompose_path/1, recompose_parse_path/1, @@ -89,6 +89,7 @@ all() -> parse_binary, parse_mixed, parse_relative, + parse_special, recompose_fragment, recompose_parse_fragment, recompose_query, @@ -651,6 +652,22 @@ parse_relative(_Config) -> #{path := "foo"} = uri_string:parse(lists:append("fo",<<"o">>)). 
+parse_special(_Config) -> + #{host := [],query := "?"} = uri_string:parse("//?"), + #{fragment := [],host := []} = uri_string:parse("//#"), + #{host := [],query := "?",scheme := "foo"} = uri_string:parse("foo://?"), + #{fragment := [],host := [],scheme := "foo"} = uri_string:parse("foo://#"), + #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>), + #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>), + #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>), + #{host := [],path := "/",query := "?"} = uri_string:parse("///?"), + #{fragment := [],host := [],path := "/"} = uri_string:parse("///#"), + #{host := "foo",query := "?"} = uri_string:parse("//foo?"), + #{fragment := [],host := "foo"} = uri_string:parse("//foo#"), + #{host := "foo",path := "/"} = uri_string:parse("//foo/"), + #{host := "foo",query := "?",scheme := "http"} = uri_string:parse("http://foo?"), + #{fragment := [],host := "foo",scheme := "http"} = uri_string:parse("http://foo#"), + #{host := "foo",path := "/",scheme := "http"} = uri_string:parse("http://foo/"). %%------------------------------------------------------------------------- %% Recompose tests -- cgit v1.2.3 From 1335e59a60d5e195baf519d2c52b0ca0aa96831f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 4 Oct 2017 16:45:51 +0200 Subject: stdlib: Add property tests, bugfixes - Add property tests using PropEr. - Add new testcases to uri_string_SUITE. - Improve calculation of parsed binary. - Verify if input to parse() is UTF8 encoded. - Update is_valid_map(): added check for path and host. 
--- lib/stdlib/src/uri_string.erl | 224 ++++++++++--- .../test/property_test/uri_string_decode.erl | 55 ---- .../test/property_test/uri_string_recompose.erl | 360 +++++++++++++++++++++ lib/stdlib/test/uri_string_SUITE.erl | 36 ++- lib/stdlib/test/uri_string_property_test_SUITE.erl | 15 +- 5 files changed, 566 insertions(+), 124 deletions(-) delete mode 100644 lib/stdlib/test/property_test/uri_string_decode.erl create mode 100644 lib/stdlib/test/property_test/uri_string_recompose.erl (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index bb7079c193..893ba4c6bf 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -304,8 +304,6 @@ parse(URIString) when is_list(URIString) -> -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string(). -recompose(Map) when map_size(Map) =:= 0 -> - ""; recompose(Map) -> case is_valid_map(Map) of false -> @@ -405,7 +403,7 @@ convert_mapfields_to_list(Map) -> %% URI-reference = URI / relative-ref %%------------------------------------------------------------------------- -spec parse_uri_reference(binary(), uri_map()) -> uri_map(). 
-parse_uri_reference(<<>>, _) -> #{}; +parse_uri_reference(<<>>, _) -> #{path => <<>>}; parse_uri_reference(URIString, URI) -> try parse_scheme_start(URIString, URI) of Res -> Res @@ -434,13 +432,15 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> %% Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - Userinfo = calculate_parsed_part(Rest, T), - URI1#{userinfo => decode_userinfo(Userinfo)} + Userinfo = calculate_parsed_userinfo(Rest, T), + URI2 = maybe_add_path(URI1), + URI2#{userinfo => decode_userinfo(Userinfo)} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_part_sl(Rest, T), - URI1#{host => decode_host(remove_brackets(Host))} + URI2 = maybe_add_path(URI1), + URI2#{host => decode_host(remove_brackets(Host))} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute @@ -449,11 +449,13 @@ parse_relative_part(?STRING_REST($/, Rest), URI) -> parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_part(Rest, T), - URI1#{query => decode_query(?STRING_REST($?, Query))}; + URI2 = maybe_add_path(URI1), + URI2#{query => decode_query(?STRING_REST($?, Query))}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), - URI1#{fragment => decode_fragment(Fragment)}; + Fragment = calculate_parsed_fragment(Rest, T), + URI2 = maybe_add_path(URI1), + URI2#{fragment => decode_fragment(Fragment)}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> @@ -505,7 +507,7 @@ parse_segment(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = 
calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of @@ -528,7 +530,7 @@ parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of @@ -566,10 +568,32 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> case is_alpha(Char) of true -> {T, URI1} = parse_scheme(Rest, URI), Scheme = calculate_parsed_scheme(Rest, T), - URI1#{scheme => ?STRING_REST(Char, Scheme)}; + URI2 = maybe_add_path(URI1), + URI2#{scheme => ?STRING_REST(Char, Scheme)}; false -> throw(uri_parse_error) end. +%% Add path component if it missing after parsing the URI. +%% According to the URI specification there is always a +%% path component in every URI-reference and it can be +%% empty. + +%% maybe_add_path(Map) -> +%% case length(maps:keys(Map)) of +%% 0 -> +%% Map#{path => <<>>}; +%% _Else -> +%% Map +%% end. +maybe_add_path(Map) -> + case maps:is_key(path, Map) of + false -> + Map#{path => <<>>}; + _Else -> + Map + end. + + -spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_scheme(?STRING_REST($:, Rest), URI) -> @@ -603,7 +627,7 @@ parse_hier(?STRING_REST("//", Rest), URI) -> % Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - Userinfo = calculate_parsed_part(Rest, T), + Userinfo = calculate_parsed_userinfo(Rest, T), {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch throw:uri_parse_error -> @@ -621,7 +645,7 @@ parse_hier(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of @@ -660,12 +684,11 @@ parse_hier(?STRING_EMPTY, URI) -> %% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) %%------------------------------------------------------------------------- -spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}. -parse_userinfo(?CHAR($@), _URI) -> - %% URI cannot end in userinfo state - throw(uri_parse_error); +parse_userinfo(?CHAR($@), URI) -> + {?STRING_EMPTY, URI#{host => <<>>}}; parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), - Host = calculate_parsed_part(Rest, T), + Host = calculate_parsed_host(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of @@ -726,7 +749,7 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> @@ -741,7 +764,7 @@ parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -755,7 +778,7 @@ parse_host(?STRING_EMPTY, URI) -> -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> @@ -768,7 +791,7 @@ parse_reg_name(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of @@ -788,7 +811,7 @@ is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). 
parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> @@ -804,7 +827,7 @@ parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of @@ -851,7 +874,7 @@ is_ipv6(Char) -> is_hex_digit(Char). -spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}. parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> @@ -864,7 +887,7 @@ parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of @@ -902,7 +925,7 @@ parse_port(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; 
parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -928,7 +951,7 @@ parse_port(?STRING_EMPTY, URI) -> -spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}. parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of @@ -1055,11 +1078,88 @@ remove_brackets(Addr) -> Addr. -spec calculate_parsed_part(binary(), binary()) -> binary(). calculate_parsed_part(<<$?>>, _) -> <<>>; calculate_parsed_part(<<$#>>, _) -> <<>>; +calculate_parsed_part(<<>>, _) -> <<>>; +calculate_parsed_part(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + _Else -> + Input + end; calculate_parsed_part(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), First. +-spec calculate_parsed_userinfo(binary(), binary()) -> binary(). +calculate_parsed_userinfo(<<$?>>, _) -> <<>>; +calculate_parsed_userinfo(<<$#>>, _) -> <<>>; +calculate_parsed_userinfo(<<>>, _) -> <<>>; +calculate_parsed_userinfo(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + $@ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_userinfo(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_host(binary(), binary()) -> binary(). +calculate_parsed_host(<<$?>>, _) -> <<>>; +calculate_parsed_host(<<$#>>, _) -> <<>>; +calculate_parsed_host(<<>>, _) -> <<>>; +calculate_parsed_host(Input, <<>>) -> + case binary:last(Input) of + $? 
-> + init_binary(Input); + $# -> + init_binary(Input); + $/ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_host(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_port(binary(), binary()) -> binary(). +calculate_parsed_port(<<$?>>, _) -> <<>>; +calculate_parsed_port(<<$#>>, _) -> <<>>; +calculate_parsed_port(<<>>, _) -> <<>>; +calculate_parsed_port(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + $/ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_port(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_fragment(binary(), binary()) -> binary(). +calculate_parsed_fragment(<<$#>>, _) -> <<>>; +calculate_parsed_fragment(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + %% Returns the parsed binary based on Input and the Unparsed part. %% Used when parsing authority. %% @@ -1079,28 +1179,25 @@ calculate_parsed_part_sl(<<>>, _) -> <<>>; calculate_parsed_part_sl(Input, <<>>) -> case binary:last(Input) of $? -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; - + init_binary(Input); $# -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; + init_binary(Input); $/ -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; + init_binary(Input); _Else -> - {First, _} = - split_binary(Input, byte_size_exl_single_slash(Input)), - First + Input end; calculate_parsed_part_sl(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size_exl_single_slash(Input) - byte_size_exl_head(Unparsed)), First. +%% Return all bytes of the binary except the last one. The binary must be non-empty. +init_binary(B) -> + {Init, _} = + split_binary(B, byte_size(B) - 1), + Init. 
+ %% Returns the parsed binary based on Input and the Unparsed part. %% Used when parsing scheme. @@ -1109,6 +1206,7 @@ calculate_parsed_scheme(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size(Input) - byte_size(Unparsed) - 1), First. + %% Returns the size of a binary exluding the first element. %% Used in calls to split_binary(). -spec byte_size_exl_head(binary()) -> number(). @@ -1149,25 +1247,35 @@ byte_size_exl_single_slash(Rest) -> byte_size(Rest). %%------------------------------------------------------------------------- -spec decode_userinfo(binary()) -> binary(). decode_userinfo(Cs) -> - decode(Cs, fun is_userinfo/1, <<>>). + check_utf8(decode(Cs, fun is_userinfo/1, <<>>)). -spec decode_host(binary()) -> binary(). decode_host(Cs) -> - decode(Cs, fun is_host/1, <<>>). + check_utf8(decode(Cs, fun is_host/1, <<>>)). -spec decode_path(binary()) -> binary(). decode_path(Cs) -> - decode(Cs, fun is_path/1, <<>>). + check_utf8(decode(Cs, fun is_path/1, <<>>)). -spec decode_query(binary()) -> binary(). decode_query(Cs) -> - decode(Cs, fun is_query/1, <<>>). + check_utf8(decode(Cs, fun is_query/1, <<>>)). -spec decode_fragment(binary()) -> binary(). decode_fragment(Cs) -> - decode(Cs, fun is_fragment/1, <<>>). + check_utf8(decode(Cs, fun is_fragment/1, <<>>)). +%% Returns Cs if it is utf8 encoded. +check_utf8(Cs) -> + case unicode:characters_to_list(Cs) of + {incomplete,_,_} -> + throw(uri_parse_error); + {error,_,_} -> + throw(uri_parse_error); + _ -> Cs + end. + %%------------------------------------------------------------------------- %% Percent-encode %%------------------------------------------------------------------------- @@ -1368,10 +1476,15 @@ bracket_ipv6(Addr) when is_list(Addr) -> %% E.g. "//user@:8080" => #{host => [],port => 8080,userinfo => "user"} %% There is always at least an empty host when both userinfo and port %% are present. 
+%% - #{path => "///"} otherwise the following would be true: +%% "/////" = uri_string:recompose(#{host => "", path => "///"}) +%% "/////" = uri_string:recompose(#{path => "/////"}) +%% AND +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] %%------------------------------------------------------------------------- is_valid_map(Map) -> case - (not maps:is_key(userinfo, Map) andalso + ((not maps:is_key(userinfo, Map) andalso not maps:is_key(host, Map) andalso maps:is_key(port, Map)) orelse @@ -1381,7 +1494,9 @@ is_valid_map(Map) -> orelse (maps:is_key(userinfo, Map) andalso not maps:is_key(host, Map) andalso - maps:is_key(port, Map)) + maps:is_key(port, Map))) orelse + not maps:is_key(path, Map) orelse + not is_host_and_path_valid(Map) of true -> false; @@ -1390,6 +1505,19 @@ is_valid_map(Map) -> end. +is_host_and_path_valid(Map) -> + Host = maps:get(host, Map, undefined), + Path = maps:get(path, Map, undefined), + not (Host =:= undefined andalso starts_with_two_slash(Path)). + + +starts_with_two_slash([$/,$/|_]) -> + true; +starts_with_two_slash(?STRING_REST("//", _)) -> + true; +starts_with_two_slash(_) -> false. + + update_scheme(#{scheme := Scheme}, _) -> add_colon_postfix(encode_scheme(Scheme)); update_scheme(#{}, _) -> diff --git a/lib/stdlib/test/property_test/uri_string_decode.erl b/lib/stdlib/test/property_test/uri_string_decode.erl deleted file mode 100644 index 137a649cf1..0000000000 --- a/lib/stdlib/test/property_test/uri_string_decode.erl +++ /dev/null @@ -1,55 +0,0 @@ -%% -%% %CopyrightBegin% -%% -%% Copyright Ericsson AB 2008-2017. All Rights Reserved. -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. 
-%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -%% See the License for the specific language governing permissions and -%% limitations under the License. -%% -%% %CopyrightEnd% -%% --module(uri_string_decode). - --compile(export_all). - --proptest(eqc). --proptest([triq,proper]). - --ifndef(EQC). --ifndef(PROPER). --ifndef(TRIQ). --define(EQC,true). --endif. --endif. --endif. - --ifdef(EQC). --include_lib("eqc/include/eqc.hrl"). --define(MOD_eqc,eqc). - --else. --ifdef(PROPER). --include_lib("proper/include/proper.hrl"). --define(MOD_eqc,proper). - --else. --ifdef(TRIQ). --define(MOD_eqc,triq). --include_lib("triq/include/triq.hrl"). - --endif. --endif. --endif. - - -prop_uri_string_decode() -> - ok. diff --git a/lib/stdlib/test/property_test/uri_string_recompose.erl b/lib/stdlib/test/property_test/uri_string_recompose.erl new file mode 100644 index 0000000000..dad67cd4c1 --- /dev/null +++ b/lib/stdlib/test/property_test/uri_string_recompose.erl @@ -0,0 +1,360 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2008-2017. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +-module(uri_string_recompose). + +-compile(export_all). 

-proptest(eqc).
-proptest([triq,proper]).

%% Fall back to EQC when no property-testing tool was selected at compile time.
-ifndef(EQC).
-ifndef(PROPER).
-ifndef(TRIQ).
-define(EQC,true).
-endif.
-endif.
-endif.

%% Include the header of the selected tool and record its module in MOD_eqc.
-ifdef(EQC).
-include_lib("eqc/include/eqc.hrl").
-define(MOD_eqc,eqc).

-else.
-ifdef(PROPER).
-include_lib("proper/include/proper.hrl").
-define(MOD_eqc,proper).

-else.
-ifdef(TRIQ).
-define(MOD_eqc,triq).
-include_lib("triq/include/triq.hrl").

-endif.
-endif.
-endif.


%% Binary pattern: MatchStr encoded as UTF-8 followed by the remaining bytes.
-define(STRING_REST(MatchStr, Rest), <<MatchStr/utf8, Rest/binary>>).

%% {Key, Generator} pairs; one pair per URI map component.
-define(SCHEME, {scheme, scheme()}).
-define(USER, {userinfo, unicode()}).
-define(HOST, {host, host_map()}).
-define(PORT, {port, port()}).
-define(PATH_ABE, {path, path_abempty_map()}).
-define(PATH_ABS, {path, path_absolute_map()}).
-define(PATH_NOS, {path, path_noscheme_map()}).
-define(PATH_ROO, {path, path_rootless_map()}).
-define(PATH_EMP, {path, path_empty_map()}).
-define(QUERY, {query, query_map()}).
-define(FRAGMENT, {fragment, fragment_map()}).


%%%========================================================================
%%% Properties
%%%========================================================================

%% Round trip: recomposing a generated URI map and parsing the result
%% must give back exactly the same map.
prop_recompose() ->
    ?FORALL(Map, map(),
            Map =:= uri_string:parse(uri_string:recompose(Map))
           ).

%% Stats
%% Distribution of the number of components in the generated maps.
prop_map_key_length_collect() ->
    ?FORALL(Map, map(),
            collect(maps:size(Map), true)).

%% Distribution of the component key sets in the generated maps.
prop_map_collect() ->
    ?FORALL(Map, map(),
            collect(lists:sort(maps:keys(Map)), true)).

%% Distribution of the generated scheme lengths.
prop_scheme_collect() ->
    ?FORALL(Scheme, scheme(),
            collect(length(Scheme), true)).


%%%========================================================================
%%% Generators
%%%========================================================================

%% A URI map built from one of the component combinations of comp_proplist/0.
map() ->
    ?LET(Gen, comp_proplist(), proplist_to_map(Gen)).

%% Equally weighted choice between the component combinations under test.
%% The 14 base shapes (7 with a scheme, 7 without) are each emitted with no
%% trailer, with a query, with a fragment, and with both, in that order.
comp_proplist() ->
    WithAuthority = [[?HOST,?PATH_ABE],
                     [?USER,?HOST,?PATH_ABE],
                     [?HOST,?PORT,?PATH_ABE],
                     [?USER,?HOST,?PORT,?PATH_ABE]],
    WithScheme = [[?SCHEME|Tail] ||
                     Tail <- [[?PATH_ABS],[?PATH_ROO],[?PATH_EMP]|WithAuthority]],
    NoScheme = [[?PATH_ABS],[?PATH_NOS],[?PATH_EMP]|WithAuthority],
    Trailers = [[],
                [?QUERY],
                [?FRAGMENT],
                [?QUERY,?FRAGMENT]],
    frequency([{2, Base ++ Trailer} ||
                  Trailer <- Trailers,
                  Base <- WithScheme ++ NoScheme]).


%%-------------------------------------------------------------------------
%% Path
%%-------------------------------------------------------------------------

%% path-abempty: mostly "/"-prefixed segments, occasionally the empty path.
path_abempty_map() ->
    frequency([{90, path_abe_map()},
               {10, path_empty_map()}]).

%% One "/" segment pair per unit of the generator size.
path_abe_map() ->
    ?SIZED(Size, path_abe_map(Size, [])).
%%
path_abe_map(0, Acc) ->
    ?LET(Parts, Acc, lists:append(Parts));
path_abe_map(Left, Acc) ->
    path_abe_map(Left-1, [slash(),segment()|Acc]).


%% "/" followed by a non-empty segment, then Size more "/" segment pairs.
path_absolute_map() ->
    ?SIZED(Size, path_absolute_map(Size, [])).
%%
path_absolute_map(0, Acc) ->
    ?LET(Parts, [slash(),segment_nz()|Acc], lists:append(Parts));
path_absolute_map(Left, Acc) ->
    path_absolute_map(Left-1, [slash(),segment()|Acc]).


%% A non-empty first segment without ":", then Size "/" segment pairs.
path_noscheme_map() ->
    ?SIZED(Size, path_noscheme_map(Size, [])).
%%
path_noscheme_map(0, Acc) ->
    ?LET(Parts, [segment_nz_nc()|Acc], lists:append(Parts));
path_noscheme_map(Left, Acc) ->
    path_noscheme_map(Left-1, [slash(),segment()|Acc]).

%% A non-empty first segment, then Size "/" segment pairs.
path_rootless_map() ->
    ?SIZED(Size, path_rootless_map(Size, [])).
%%
path_rootless_map(0, Acc) ->
    ?LET(Parts, [segment_nz()|Acc], lists:append(Parts));
path_rootless_map(Left, Acc) ->
    path_rootless_map(Left-1, [slash(),segment()|Acc]).


%% Non-empty segment.
segment_nz() ->
    non_empty(segment()).

%% Non-empty segment that never contains a colon.
segment_nz_nc() ->
    non_empty(list(frequency([{30, unreserved()},
                              {10, sub_delims()},
                              {10, unicode_char()},
                              {5, oneof([$@])}
                             ]))).


%% Possibly empty segment; ":" and "@" are allowed.
segment() ->
    list(frequency([{30, unreserved()},
                    {10, sub_delims()},
                    {10, unicode_char()},
                    {5, oneof([$:, $@])}
                   ])).

%% Path separator.
slash() ->
    "/".

%% The empty path.
path_empty_map() ->
    "".


%%-------------------------------------------------------------------------
%% Host
%%-------------------------------------------------------------------------

%% Host value for a URI map: a registered name or a literal IP address,
%% chosen with equal weight.
host_map() ->
    frequency([{30, reg_name()},
               {30, ip_address()}
              ]).


%% Possibly empty registered name made of letters, sub-delims and
%% non-ASCII code points.
reg_name() ->
    list(frequency([{30, alpha()},
                    {10, sub_delims()},
                    {10, unicode_char()}
                   ])).

%% A fixed set of IPv4 and IPv6 address literals (without brackets).
ip_address() ->
    oneof(["127.0.0.1", "::127.0.0.1",
           "2001:0db8:0000:0000:0000:0000:1428:07ab",
           "2001:0db8:0000:0000:0000::1428:07ab",
           "2001:0db8:0:0:0:0:1428:07ab",
           "2001:0db8:0::0:1428:07ab"]).

%% Generating only reg-names
host_uri() ->
    non_empty(list(frequency([{30, unreserved()},
                              {10, sub_delims()},
                              {10, pct_encoded()}
                             ]))).

%%-------------------------------------------------------------------------
%% Port, Query, Fragment
%%-------------------------------------------------------------------------

%% Port number in the range 1..65535.
port() ->
    range(1,65535).


%% Query value for a URI map; includes the leading "?".
query_map() ->
    [$?| unicode()].


%% Query as it appears in a URI string: "?" followed by pchar, "/" or "?".
query_uri() ->
    [$?| non_empty(list(frequency([{20, pchar()},
                                   {5, oneof([$/, $?])} % punctuation
                                  ])))].

%% Fragment value for a URI map; the "#" delimiter is not included.
fragment_map() ->
    unicode().

%% Fragment as it appears in a URI string: "#" followed by pchar, "/" or "?".
%% The delimiter is "#"; "?" is only legal inside the fragment body.
fragment_uri() ->
    [$#| non_empty(list(frequency([{20, pchar()},
                                   {5, oneof([$/, $?])} % punctuation
                                  ])))].


%%-------------------------------------------------------------------------
%% Scheme
%%-------------------------------------------------------------------------

%% Scheme whose length equals the generator size: a letter first, then
%% scheme characters. A size of 0 yields the empty list.
scheme() ->
    ?SIZED(Length, scheme_start(Length, [])).
%%
scheme_start(0, L) ->
    ?LET(Gen, L, lists:reverse(Gen));
scheme_start(N, L) ->
    %% First character must be a letter.
    scheme(N-1,[alpha()|L]).

%% Remaining N characters; L is accumulated in reverse order.
scheme(0, L) ->
    ?LET(Gen, L, lists:reverse(Gen));
scheme(N, L) ->
    scheme(N-1, [scheme_char()|L]).


%%-------------------------------------------------------------------------
%% Misc
%%-------------------------------------------------------------------------

%% Possibly empty string of letters, digits and non-ASCII code points.
unicode() ->
    list(frequency([{20, alpha()},                    % alpha
                    {10, digit()},                    % digit
                    {10, unicode_char()}              % unicode
                   ])).
+ +scheme_char() -> + frequency([{20, alpha()}, % alpha + {20, digit()}, % digit + {5, oneof([$+, $-, $.])} % punctuation + ]). + +sub_delims() -> + oneof([$!, $$, $&, $', $(, $), + $*, $+, $,,$;, $=]). + +pchar() -> + frequency([{20, unreserved()}, + {5, pct_encoded()}, + {5, sub_delims()}, + {1, oneof([$:, $@])} % punctuation + ]). + +unreserved() -> + frequency([{20, alpha()}, + {5, digit()}, + {1, oneof([$-, $., $_, $~])} % punctuation + ]). + +unicode_char() -> + range(913, 1023). + +alpha() -> + frequency([{20, range($a, $z)}, % letters + {20, range($A, $Z)}]). % letters + +digit() -> + range($0, $9). % numbers + +pct_encoded() -> + oneof(["%C3%A4", "%C3%A5", "%C3%B6"]). + + +%%%======================================================================== +%%% Helpers +%%%======================================================================== +proplist_to_map(L) -> + lists:foldl(fun({K,V},M) -> M#{K => V}; + (_,M) -> M + end, #{}, L). diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 0eb5105c35..cd2e003d02 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -31,7 +31,8 @@ parse_path/1, parse_pct_encoded_fragment/1, parse_pct_encoded_query/1, parse_pct_encoded_userinfo/1, parse_port/1, parse_query/1, parse_scheme/1, parse_userinfo/1, - parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1, parse_special/1, + parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1, + parse_special/1, parse_special2/1, recompose_fragment/1, recompose_parse_fragment/1, recompose_query/1, recompose_parse_query/1, recompose_path/1, recompose_parse_path/1, @@ -90,6 +91,7 @@ all() -> parse_mixed, parse_relative, parse_special, + parse_special2, recompose_fragment, recompose_parse_fragment, recompose_query, @@ -114,7 +116,7 @@ uri_combinations() -> Hst <- [fun update_host/1, fun update_host_binary/1, fun update_ipv6/1, fun update_ipv6_binary/1, none], Prt <- [fun update_port/1, none], - 
Pat <- [fun update_path/1, fun update_path_binary/1, none], + Pat <- [fun update_path/1, fun update_path_binary/1], Qry <- [fun update_query/1,fun update_query_binary/1, none], Frg <- [fun update_fragment/1, fun update_fragment_binary/1, none], not (Usr =:= none andalso Hst =:= none andalso Prt =/= none), @@ -312,9 +314,7 @@ parse_binary_userinfo(_Config) -> #{scheme := <<"foo">>, userinfo := <<"user">>, host := <<"localhost">>} = uri_string:parse(<<"foo://user@localhost">>), #{scheme := <<"foo">>, userinfo := <<"user:password">>, host := <<"localhost">>} = - uri_string:parse(<<"foo://user:password@localhost">>), - uri_parse_error =(catch uri_string:parse(<<"//user@">>)), - uri_parse_error = (catch uri_string:parse(<<"foo://user@">>)). + uri_string:parse(<<"foo://user:password@localhost">>). parse_binary_pct_encoded_userinfo(_Config) -> #{scheme := <<"user">>, path := <<"合@気道"/utf8>>} = @@ -667,14 +667,24 @@ parse_special(_Config) -> #{host := "foo",path := "/"} = uri_string:parse("//foo/"), #{host := "foo",query := "?",scheme := "http"} = uri_string:parse("http://foo?"), #{fragment := [],host := "foo",scheme := "http"} = uri_string:parse("http://foo#"), - #{host := "foo",path := "/",scheme := "http"} = uri_string:parse("http://foo/"). + #{host := "foo",path := "/",scheme := "http"} = uri_string:parse("http://foo/"), + #{fragment := [],host := "host",port := 80,scheme := "http"} = uri_string:parse("http://host:80#"), + #{host := "host",port := 80,query := "?",scheme := "http"} = uri_string:parse("http://host:80?"). 
+ +parse_special2(_Config) -> + #{host := [],path := "/",port := 1,scheme := "a"} = uri_string:parse("a://:1/"), + #{path := "/a/",scheme := "a"} = uri_string:parse("a:/a/"), + #{host := [],path := [],userinfo := []} = uri_string:parse("//@"), + #{host := [],path := [],scheme := "foo",userinfo := []} = uri_string:parse("foo://@"), + #{host := [],path := "/",userinfo := []} = uri_string:parse("//@/"), + #{host := [],path := "/",scheme := "foo",userinfo := []} = uri_string:parse("foo://@/"). %%------------------------------------------------------------------------- %% Recompose tests %%------------------------------------------------------------------------- recompose_fragment(_Config) -> - <> = uri_string:recompose(#{fragment => <>}), - ?FRAGMENT_ENC = uri_string:recompose(#{fragment => ?FRAGMENT}). + <> = uri_string:recompose(#{fragment => <>, path => <<>>}), + ?FRAGMENT_ENC = uri_string:recompose(#{fragment => ?FRAGMENT, path => ""}). recompose_parse_fragment(_Config) -> <> = uri_string:recompose(uri_string:parse(<>)), @@ -682,15 +692,17 @@ recompose_parse_fragment(_Config) -> recompose_query(_Config) -> <> = - uri_string:recompose(#{query => <>}), + uri_string:recompose(#{query => <>, path => <<>>}), <> = uri_string:recompose(#{query => <>, - fragment => <>}), + fragment => <>, + path => <<>>}), "?name=%C3%B6rn" = - uri_string:recompose(#{query => "?name=örn"}), + uri_string:recompose(#{query => "?name=örn", path => ""}), "?name=%C3%B6rn#n%C3%A4sa" = uri_string:recompose(#{query => "?name=örn", - fragment => "näsa"}). + fragment => "näsa", + path => ""}). 
recompose_parse_query(_Config) -> <<"?name=%C3%B6rn">> = uri_string:recompose(uri_string:parse(<<"?name=%C3%B6rn">>)), diff --git a/lib/stdlib/test/uri_string_property_test_SUITE.erl b/lib/stdlib/test/uri_string_property_test_SUITE.erl index de5edf54aa..ae2c61c7aa 100644 --- a/lib/stdlib/test/uri_string_property_test_SUITE.erl +++ b/lib/stdlib/test/uri_string_property_test_SUITE.erl @@ -20,10 +20,9 @@ -module(uri_string_property_test_SUITE). -include_lib("common_test/include/ct.hrl"). - -compile(export_all). -all() -> [decode]. +all() -> [recompose]. init_per_suite(Config) -> ct_property_test:init_per_suite(Config). @@ -31,12 +30,10 @@ init_per_suite(Config) -> end_per_suite(Config) -> Config. -%%%================================================================ +%%%======================================================================== %%% Test suites -%%% - -decode(Config) -> +%%%======================================================================== +recompose(Config) -> ct_property_test:quickcheck( - uri_string_decode:prop_uri_string_decode(), - Config - ). + uri_string_recompose:prop_recompose(), + Config). -- cgit v1.2.3 From 4a2358bbf4a4049a765aab435a31daeeffbbd677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 11 Oct 2017 16:36:14 +0200 Subject: stdlib: Implement transcode/2. 
--- lib/stdlib/src/uri_string.erl | 112 ++++++++++++++++++++++++++++++++++- lib/stdlib/test/uri_string_SUITE.erl | 39 +++++++++++- 2 files changed, 147 insertions(+), 4 deletions(-) mode change 100755 => 100644 lib/stdlib/src/uri_string.erl (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl old mode 100755 new mode 100644 index 893ba4c6bf..439ffa80da --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -353,8 +353,26 @@ normalize(_) -> -spec transcode(URIString, Options) -> URIString when URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. -transcode(_, _) -> - "". +transcode(URIString, Options) when is_binary(URIString) -> + try + InEnc = proplists:get_value(in_encoding, Options, utf8), + OutEnc = proplists:get_value(out_encoding, Options, utf8), + List = convert_list(URIString, InEnc), + Output = transcode(List, [], InEnc, OutEnc), + convert_binary(Output, utf8, OutEnc) + of + Result -> Result + catch + throw:{error, L, RestData} -> {invalid_input, L, RestData} + end; +transcode(URIString, Options) when is_list(URIString) -> + InEnc = proplists:get_value(in_encoding, Options, utf8), + OutEnc = proplists:get_value(out_encoding, Options, utf8), + try transcode(URIString, [], InEnc, OutEnc) of + Result -> Result + catch + throw:{error, List, RestData} -> {invalid_input, List, RestData} + end. %%------------------------------------------------------------------------- %% Working with query strings @@ -1624,3 +1642,93 @@ maybe_to_list(Comp) -> Comp. encode_port(Port) -> integer_to_binary(Port). + +%%------------------------------------------------------------------------- +%% Helper functions for transcode +%%------------------------------------------------------------------------- + +%%------------------------------------------------------------------------- +%% uri_string:transcode(<<"x%00%00%00%F6"/utf32>>). +%% 1. 
Convert (transcode/2) input to list form (list of unicode codepoints) +%% "x%00%00%00%F6" +%% 2. Accumulate characters until percent-encoded segment (transcode/4). +%% Acc = "x" +%% 3. Convert percent-encoded triplets to binary form (transcode_pct/4) +%% <<0,0,0,246>> +%% 4. Transcode in-encoded binary to out-encoding (utf32 -> utf8): +%% <<195,182>> +%% 5. Percent-encode out-encoded binary: +%% <<"%C3%B6"/utf8>> = <<37,67,51,37,66,54>> +%% 6. Convert binary to list form, reverse it and append the accumulator +%% "6B%3C%" + "x" +%% 7. Reverse Acc and return it +%%------------------------------------------------------------------------- +transcode([$%,_C0,_C1|_Rest] = L, Acc, InEnc, OutEnc) -> + transcode_pct(L, Acc, <<>>, InEnc, OutEnc); +transcode([_C|_Rest] = L, Acc, InEnc, OutEnc) -> + transcode(L, Acc, [], InEnc, OutEnc). +%% +transcode([H|T], Acc, List, InEnc, OutEnc) when is_binary(H) -> + L = convert_list(H, InEnc), + transcode(L ++ T, Acc, List, InEnc, OutEnc); +transcode([H|T], Acc, List, InEnc, OutEnc) when is_list(H) -> + transcode(H ++ T, Acc, List, InEnc, OutEnc); +transcode([$%,_C0,_C1|_Rest] = L, Acc, List, InEncoding, OutEncoding) -> + transcode_pct(L, List ++ Acc, <<>>, InEncoding, OutEncoding); +transcode([C|Rest], Acc, List, InEncoding, OutEncoding) -> + transcode(Rest, Acc, [C|List], InEncoding, OutEncoding); +transcode([], Acc, List, _InEncoding, _OutEncoding) -> + lists:reverse(List ++ Acc). 


%% Transcode percent-encoded segment
%%
%% Consumes a run of consecutive %XX triplets, collecting the decoded bytes
%% in B. When the run ends (at an ordinary character or at end of input), B
%% is converted from InEncoding to OutEncoding, percent-encoded again and
%% added to the output. Acc holds the output produced so far in reverse
%% order. Nested binaries and lists in the input are expanded in place.
%%
%% Throws {error, lists:reverse(Acc), [C0,C1]} when a "%" is not followed by
%% two hexadecimal digits; conversion failures are thrown by
%% convert_binary/3 and convert_list/2.
transcode_pct([H|T], Acc, B, InEnc, OutEnc) when is_binary(H) ->
    L = convert_list(H, InEnc),
    transcode_pct(L ++ T, Acc, B, InEnc, OutEnc);
transcode_pct([H|T], Acc, B, InEnc, OutEnc) when is_list(H) ->
    transcode_pct(H ++ T, Acc, B, InEnc, OutEnc);
transcode_pct([$%,C0,C1|Rest], Acc, B, InEncoding, OutEncoding) ->
    case is_hex_digit(C0) andalso is_hex_digit(C1) of
        true ->
            Int = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
            %% Append the decoded byte to the pending in-encoded binary.
            transcode_pct(Rest, Acc, <<B/binary, Int>>, InEncoding, OutEncoding);
        false -> throw({error, lists:reverse(Acc),[C0,C1]})
    end;
transcode_pct([_C|_Rest] = L, Acc, B, InEncoding, OutEncoding) ->
    %% End of the percent-encoded run: flush B and go back to plain characters.
    OutBinary = convert_binary(B, InEncoding, OutEncoding),
    PctEncUtf8 = percent_encode_segment(OutBinary),
    %% Acc is kept reversed, so the flushed characters are reversed as well.
    Out = lists:reverse(convert_list(PctEncUtf8, utf8)),
    transcode(L, Out ++ Acc, [], InEncoding, OutEncoding);
transcode_pct([], Acc, B, InEncoding, OutEncoding) ->
    %% Input ended inside a percent-encoded run: flush B and finish.
    OutBinary = convert_binary(B, InEncoding, OutEncoding),
    PctEncUtf8 = percent_encode_segment(OutBinary),
    Out = convert_list(PctEncUtf8, utf8),
    lists:reverse(Acc) ++ Out.


%% Convert binary
%% Converts Binary from InEncoding to OutEncoding. Both an {error, ...} and
%% an {incomplete, ...} result are thrown as {error, List, RestData}.
convert_binary(Binary, InEncoding, OutEncoding) ->
    case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of
        {error, List, RestData} ->
            throw({error, List, RestData});
        {incomplete, List, RestData} ->
            throw({error, List, RestData});
        Result ->
            Result
    end.


%% Convert to list
%% Decodes Binary (in InEncoding) to a list of Unicode code points. Both an
%% {error, ...} and an {incomplete, ...} result are thrown as
%% {error, List, RestData}.
convert_list(Binary, InEncoding) ->
    case unicode:characters_to_list(Binary, InEncoding) of
        {error, List, RestData} ->
            throw({error, List, RestData});
        {incomplete, List, RestData} ->
            throw({error, List, RestData});
        Result ->
            Result
    end.


%% Percent-encodes Segment via percent_encode_binary/2, starting from an
%% empty accumulator.
percent_encode_segment(Segment) ->
    percent_encode_binary(Segment, <<>>).
diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index cd2e003d02..83f702dd13 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -36,7 +36,8 @@ recompose_fragment/1, recompose_parse_fragment/1, recompose_query/1, recompose_parse_query/1, recompose_path/1, recompose_parse_path/1, - recompose_autogen/1, parse_recompose_autogen/1 + recompose_autogen/1, parse_recompose_autogen/1, + transcode_basic/1, transcode_options/1, transcode_mixed/1, transcode_negative/1 ]). @@ -99,7 +100,11 @@ all() -> recompose_path, recompose_parse_path, recompose_autogen, - parse_recompose_autogen + parse_recompose_autogen, + transcode_basic, + transcode_options, + transcode_mixed, + transcode_negative ]. groups() -> @@ -763,3 +768,33 @@ recompose_autogen(_Config) -> parse_recompose_autogen(_Config) -> Tests = generate_test_vectors(uri_combinations()), lists:map(fun run_test_parse_recompose/1, Tests). + +transcode_basic(_Config) -> + <<"foo%C3%B6bar"/utf8>> = + uri_string:transcode(<<"foo%00%00%00%F6bar"/utf32>>, [{in_encoding, utf32},{out_encoding, utf8}]), + "foo%C3%B6bar" = + uri_string:transcode("foo%00%00%00%F6bar", [{in_encoding, utf32},{out_encoding, utf8}]), + <<"foo%00%00%00%F6bar"/utf32>> = + uri_string:transcode(<<"foo%C3%B6bar"/utf8>>, [{in_encoding, utf8},{out_encoding, utf32}]), + "foo%00%00%00%F6bar" = + uri_string:transcode("foo%C3%B6bar", [{in_encoding, utf8},{out_encoding, utf32}]), + "foo%C3%B6bar" = + uri_string:transcode("foo%F6bar", [{in_encoding, latin1},{out_encoding, utf8}]). + +transcode_options(_Config) -> + <<"foo%C3%B6bar"/utf8>> = + uri_string:transcode(<<"foo%C3%B6bar"/utf8>>, []), + <<"foo%C3%B6bar"/utf8>> = + uri_string:transcode(<<"foo%00%00%00%F6bar"/utf32>>, [{in_encoding, utf32}]), + <<"foo%00%00%00%F6bar"/utf32>> = + uri_string:transcode(<<"foo%C3%B6bar"/utf8>>, [{out_encoding, utf32}]). 
+ +transcode_mixed(_Config) -> + "foo%00%00%00%F6bar" = + uri_string:transcode(["foo",<<"%C3%B6"/utf8>>,<<"ba"/utf8>>,"r"], [{out_encoding, utf32}]). + +transcode_negative(_Config) -> + {invalid_input,"foo","BX"} = + uri_string:transcode(<<"foo%C3%BXbar"/utf8>>, [{in_encoding, utf8},{out_encoding, utf32}]), + {invalid_input,<<>>,<<"ö">>} = + uri_string:transcode("foo%F6bar", [{in_encoding, utf8},{out_encoding, utf8}]). -- cgit v1.2.3 From 57f8021105f1c213be674681f48d0c8e92935ff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 16 Oct 2017 13:30:36 +0200 Subject: stdlib: Change handling of queries ["?" query] Previously when parsing queries the first "?" was part of the parsed query in the result Map. This behavior has been changed to follow the patterns used with other URI components and to not include the special character(s) that mark the start of a specific component. --- lib/stdlib/src/uri_string.erl | 80 +++++++++--------- .../test/property_test/uri_string_recompose.erl | 2 +- lib/stdlib/test/uri_string_SUITE.erl | 97 +++++++++++----------- 3 files changed, 93 insertions(+), 86 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 439ffa80da..f9e1e273bc 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -466,9 +466,9 @@ parse_relative_part(?STRING_REST($/, Rest), URI) -> URI1#{path => decode_path(?STRING_REST($/, Path))}; parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), + Query = calculate_parsed_query(Rest, T), URI2 = maybe_add_path(URI1), - URI2#{query => decode_query(?STRING_REST($?, Query))}; + URI2#{query => decode_query(Query)}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -521,8 +521,8 @@ parse_segment(?STRING_REST($/, Rest), URI) 
-> parse_segment(Rest, URI); % segment parse_segment(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), Fragment = calculate_parsed_fragment(Rest, T), @@ -544,8 +544,8 @@ parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), Fragment = calculate_parsed_fragment(Rest, T), @@ -595,14 +595,6 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> %% According to the URI specification there is always a %% path component in every URI-reference and it can be %% empty. - -%% maybe_add_path(Map) -> -%% case length(maps:keys(Map)) of -%% 0 -> -%% Map#{path => <<>>}; -%% _Else -> -%% Map -%% end. 
maybe_add_path(Map) -> case maps:is_key(path, Map) of false -> @@ -659,8 +651,8 @@ parse_hier(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_hier(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -776,8 +768,8 @@ parse_host(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> @@ -805,8 +797,8 @@ parse_reg_name(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_reg_name(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -840,8 +832,8 @@ parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, 
T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty @@ -901,8 +893,8 @@ parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -939,8 +931,8 @@ parse_port(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_port(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -1090,7 +1082,7 @@ remove_brackets(Addr) -> Addr. %% Returns the parsed binary based on Input and the Unparsed part. %% Handles the following special cases: %% -%% #{host => [],path => "/",query => "?"} = uri_string:parse("///?") +%% #{host => [],path => "/",query => []} = uri_string:parse("///?") %% #{fragment => [],host => [],path => "/"} = uri_string:parse("///#") %% -spec calculate_parsed_part(binary(), binary()) -> binary(). @@ -1171,6 +1163,20 @@ calculate_parsed_port(Input, Unparsed) -> First. 
+calculate_parsed_query(<<$#>>, _) -> <<>>; +calculate_parsed_query(<<>>, _) -> <<>>; +calculate_parsed_query(Input, <<>>) -> + case binary:last(Input) of + $# -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_query(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + -spec calculate_parsed_fragment(binary(), binary()) -> binary(). calculate_parsed_fragment(<<$#>>, _) -> <<>>; calculate_parsed_fragment(Input, Unparsed) -> @@ -1183,10 +1189,10 @@ calculate_parsed_fragment(Input, Unparsed) -> %% %% Handles the following special cases: %% -%% #{host => "foo",query => "?"} = uri_string:parse("//foo?") +%% #{host => "foo",query => []} = uri_string:parse("//foo?") %% #{fragment => [],host => "foo"} = uri_string:parse("//foo#") %% #{host => "foo",path => "/"} = uri_string:parse("//foo/") -%% #{host => "foo",query => "?",scheme => "http"} = uri_string:parse("http://foo?") +%% #{host => "foo",query => [],scheme => "http"} = uri_string:parse("http://foo?") %% #{fragment => [],host => "foo",scheme => "http"} = uri_string:parse("http://foo#") %% #{host => "foo",path => "/",scheme => "http"} = uri_string:parse("http://foo/") %% @@ -1329,10 +1335,7 @@ encode_path(Cs) -> -spec encode_query(list()|binary()) -> list() | binary(). encode_query(Cs) -> - case validate_query(Cs) of - true -> encode(Cs, fun is_query/1); - false -> throw(uri_parse_error) - end. + encode(Cs, fun is_query/1). -spec encode_fragment(list()|binary()) -> list() | binary(). encode_fragment(Cs) -> @@ -1420,10 +1423,6 @@ validate_scheme(<>) -> false -> false end. -validate_query([$?|_]) -> true; -validate_query(<<$?/utf8, _/binary>>) -> true; -validate_query(_) -> false. 
- %%------------------------------------------------------------------------- %% Classifies hostname into the following categories: @@ -1582,7 +1581,7 @@ update_path(#{}, URI) -> update_query(#{query := Query}, empty) -> encode_query(Query); update_query(#{query := Query}, URI) -> - concat(URI,encode_query(Query)); + concat(URI,add_question_mark(encode_query(Query))); update_query(#{}, empty) -> empty; update_query(#{}, URI) -> @@ -1615,6 +1614,11 @@ add_hashmark(Comp) when is_binary(Comp) -> add_hashmark(Comp) when is_list(Comp) -> [$#|Comp]. +add_question_mark(Comp) when is_binary(Comp) -> + <<$?, Comp/binary>>; +add_question_mark(Comp) when is_list(Comp) -> + [$?|Comp]. + add_colon(Comp) when is_binary(Comp) -> <<$:, Comp/binary>>. diff --git a/lib/stdlib/test/property_test/uri_string_recompose.erl b/lib/stdlib/test/property_test/uri_string_recompose.erl index dad67cd4c1..97f9d727a0 100644 --- a/lib/stdlib/test/property_test/uri_string_recompose.erl +++ b/lib/stdlib/test/property_test/uri_string_recompose.erl @@ -271,7 +271,7 @@ port() -> query_map() -> - [$?| unicode()]. + unicode(). query_uri() -> diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 83f702dd13..8a10948f32 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -52,7 +52,7 @@ -define(PORT_ENC, ":8042"). -define(PATH, "/där"). -define(PATH_ENC, "/d%C3%A4r"). --define(QUERY, "?name=örn"). +-define(QUERY, "name=örn"). -define(QUERY_ENC, "?name=%C3%B6rn"). -define(FRAGMENT, "näsa"). -define(FRAGMENT_ENC, "#n%C3%A4sa"). 
@@ -350,7 +350,7 @@ parse_binary_host_ipv4(_Config) -> #{host := <<"127.0.0.1">>} = uri_string:parse(<<"//127.0.0.1">>), #{host := <<"127.0.0.1">>, path := <<"/over/there">>} = uri_string:parse(<<"//127.0.0.1/over/there">>), - #{host := <<"127.0.0.1">>, query := <<"?name=ferret">>} = + #{host := <<"127.0.0.1">>, query := <<"name=ferret">>} = uri_string:parse(<<"//127.0.0.1?name=ferret">>), #{host := <<"127.0.0.1">>, fragment := <<"nose">>} = uri_string:parse(<<"//127.0.0.1#nose">>), uri_parse_error = (catch uri_string:parse(<<"//127.0.0.x">>)), @@ -362,7 +362,7 @@ parse_binary_host_ipv6(_Config) -> uri_string:parse(<<"//[2001:0db8:0000:0000:0000:0000:1428:07ab]">>), #{host := <<"::127.0.0.1">>, path := <<"/over/there">>} = uri_string:parse(<<"//[::127.0.0.1]/over/there">>), - #{host := <<"::127.0.0.1">>, query := <<"?name=ferret">>} = + #{host := <<"::127.0.0.1">>, query := <<"name=ferret">>} = uri_string:parse(<<"//[::127.0.0.1]?name=ferret">>), #{host := <<"::127.0.0.1">>, fragment := <<"nose">>} = uri_string:parse(<<"//[::127.0.0.1]#nose">>), @@ -397,35 +397,35 @@ parse_binary_path(_Config) -> uri_string:parse(<<"foo://example.com:8042/over/there">>). 
parse_binary_query(_Config) -> - #{scheme := <<"foo">>, query := <<"?name=ferret">>} = + #{scheme := <<"foo">>, query := <<"name=ferret">>} = uri_string:parse(<<"foo:?name=ferret">>), - #{scheme := <<"foo">>, path:= <<"over/there">>, query := <<"?name=ferret">>} = + #{scheme := <<"foo">>, path:= <<"over/there">>, query := <<"name=ferret">>} = uri_string:parse(<<"foo:over/there?name=ferret">>), - #{scheme := <<"foo">>, path:= <<"/over/there">>, query := <<"?name=ferret">>} = + #{scheme := <<"foo">>, path:= <<"/over/there">>, query := <<"name=ferret">>} = uri_string:parse(<<"foo:/over/there?name=ferret">>), - #{scheme := <<"foo">>, host := <<"example.com">>, query := <<"?name=ferret">>} = + #{scheme := <<"foo">>, host := <<"example.com">>, query := <<"name=ferret">>} = uri_string:parse(<<"foo://example.com?name=ferret">>), - #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/">>, query := <<"?name=ferret">>} = + #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/">>, query := <<"name=ferret">>} = uri_string:parse(<<"foo://example.com/?name=ferret">>), - #{query := <<"?name=ferret">>} = + #{path := <<>>, query := <<"name=ferret">>} = uri_string:parse(<<"?name=ferret">>), - #{path := <<"over/there">>, query := <<"?name=ferret">>} = + #{path := <<"over/there">>, query := <<"name=ferret">>} = uri_string:parse(<<"over/there?name=ferret">>), - #{path := <<"/">>, query := <<"?name=ferret">>} = + #{path := <<"/">>, query := <<"name=ferret">>} = uri_string:parse(<<"/?name=ferret">>), - #{path := <<"/over/there">>, query := <<"?name=ferret">>} = + #{path := <<"/over/there">>, query := <<"name=ferret">>} = uri_string:parse(<<"/over/there?name=ferret">>), - #{host := <<"example.com">>, query := <<"?name=ferret">>} = + #{host := <<"example.com">>, query := <<"name=ferret">>} = uri_string:parse(<<"//example.com?name=ferret">>), - #{host := <<"example.com">>, path := <<"/">>, query := <<"?name=ferret">>} = + #{host := <<"example.com">>, path := <<"/">>, query 
:= <<"name=ferret">>} = uri_string:parse(<<"//example.com/?name=ferret">>). parse_binary_pct_encoded_query(_Config) -> #{scheme := <<"foo">>, host := <<"example.com">>, path := <<"/">>, - query := <<"?name=合気道"/utf8>>} = + query := <<"name=合気道"/utf8>>} = uri_string:parse(<<"foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>), - #{host := <<"example.com">>, path := <<"/">>, query := <<"?name=合気道"/utf8>>} = + #{host := <<"example.com">>, path := <<"/">>, query := <<"name=合気道"/utf8>>} = uri_string:parse(<<"//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93">>). parse_binary_fragment(_Config) -> @@ -520,7 +520,7 @@ parse_host_ipv4(_Config) -> #{host := "2001:0db8:0000:0000:0000:0000:1428:07ab"} = uri_string:parse("//[2001:0db8:0000:0000:0000:0000:1428:07ab]"), #{host := "127.0.0.1", path := "/over/there"} = uri_string:parse("//127.0.0.1/over/there"), - #{host := "127.0.0.1", query := "?name=ferret"} = uri_string:parse("//127.0.0.1?name=ferret"), + #{host := "127.0.0.1", query := "name=ferret"} = uri_string:parse("//127.0.0.1?name=ferret"), #{host := "127.0.0.1", fragment := "nose"} = uri_string:parse("//127.0.0.1#nose"), uri_parse_error = (catch uri_string:parse("//127.0.0.x")), uri_parse_error = (catch uri_string:parse("//1227.0.0.1")). @@ -528,7 +528,7 @@ parse_host_ipv4(_Config) -> parse_host_ipv6(_Config) -> #{host := "::127.0.0.1"} = uri_string:parse("//[::127.0.0.1]"), #{host := "::127.0.0.1", path := "/over/there"} = uri_string:parse("//[::127.0.0.1]/over/there"), - #{host := "::127.0.0.1", query := "?name=ferret"} = + #{host := "::127.0.0.1", query := "name=ferret"} = uri_string:parse("//[::127.0.0.1]?name=ferret"), #{host := "::127.0.0.1", fragment := "nose"} = uri_string:parse("//[::127.0.0.1]#nose"), uri_parse_error = (catch uri_string:parse("//[::127.0.0.x]")), @@ -560,35 +560,35 @@ parse_path(_Config) -> uri_string:parse("foo://example.com:8042/over/there"). 
parse_query(_Config) -> - #{scheme := "foo", query := "?name=ferret"} = + #{scheme := "foo", query := "name=ferret"} = uri_string:parse("foo:?name=ferret"), - #{scheme := "foo", path:= "over/there", query := "?name=ferret"} = + #{scheme := "foo", path:= "over/there", query := "name=ferret"} = uri_string:parse("foo:over/there?name=ferret"), - #{scheme := "foo", path:= "/over/there", query := "?name=ferret"} = + #{scheme := "foo", path:= "/over/there", query := "name=ferret"} = uri_string:parse("foo:/over/there?name=ferret"), - #{scheme := "foo", host := "example.com", query := "?name=ferret"} = + #{scheme := "foo", host := "example.com", query := "name=ferret"} = uri_string:parse("foo://example.com?name=ferret"), - #{scheme := "foo", host := "example.com", path := "/", query := "?name=ferret"} = + #{scheme := "foo", host := "example.com", path := "/", query := "name=ferret"} = uri_string:parse("foo://example.com/?name=ferret"), - #{query := "?name=ferret"} = + #{path := "", query := "name=ferret"} = uri_string:parse("?name=ferret"), - #{path := "over/there", query := "?name=ferret"} = + #{path := "over/there", query := "name=ferret"} = uri_string:parse("over/there?name=ferret"), - #{path := "/", query := "?name=ferret"} = + #{path := "/", query := "name=ferret"} = uri_string:parse("/?name=ferret"), - #{path := "/over/there", query := "?name=ferret"} = + #{path := "/over/there", query := "name=ferret"} = uri_string:parse("/over/there?name=ferret"), - #{host := "example.com", query := "?name=ferret"} = + #{host := "example.com", query := "name=ferret"} = uri_string:parse("//example.com?name=ferret"), - #{host := "example.com", path := "/", query := "?name=ferret"} = + #{host := "example.com", path := "/", query := "name=ferret"} = uri_string:parse("//example.com/?name=ferret"). 
parse_pct_encoded_query(_Config) -> #{scheme := "foo", host := "example.com", path := "/", - query := "?name=合気道"} = + query := "name=合気道"} = uri_string:parse("foo://example.com/?name=%E5%90%88%E6%B0%97%E9%81%93"), - #{host := "example.com", path := "/", query := "?name=合気道"} = + #{host := "example.com", path := "/", query := "name=合気道"} = uri_string:parse("//example.com/?name=%E5%90%88%E6%B0%97%E9%81%93"). parse_fragment(_Config) -> @@ -627,19 +627,19 @@ parse_pct_encoded_fragment(_Config) -> parse_list(_Config) -> #{scheme := "foo", path := "bar:nisse"} = uri_string:parse("foo:bar:nisse"), #{scheme := "foo", host := "example.com", port := 8042, - path := "/over/there", query := "?name=ferret", fragment := "nose"} = + path := "/over/there", query := "name=ferret", fragment := "nose"} = uri_string:parse("foo://example.com:8042/over/there?name=ferret#nose"), #{scheme := "foo", userinfo := "admin:admin", host := "example.com", port := 8042, - path := "/over/there", query := "?name=ferret", fragment := "nose"} = + path := "/over/there", query := "name=ferret", fragment := "nose"} = uri_string:parse("foo://admin:admin@example.com:8042/over/there?name=ferret#nose"). parse_binary(_Config) -> #{scheme := <<"foo">>, path := <<"bar:nisse">>} = uri_string:parse(<<"foo:bar:nisse">>), #{scheme := <<"foo">>, host := <<"example.com">>, port := 8042, - path := <<"/over/there">>, query := <<"?name=ferret">>, fragment := <<"nose">>} = + path := <<"/over/there">>, query := <<"name=ferret">>, fragment := <<"nose">>} = uri_string:parse(<<"foo://example.com:8042/over/there?name=ferret#nose">>), #{scheme := <<"foo">>, userinfo := <<"admin:admin">>, host := <<"example.com">>, port := 8042, - path := <<"/over/there">>, query := <<"?name=ferret">>, fragment := <<"nose">>} = + path := <<"/over/there">>, query := <<"name=ferret">>, fragment := <<"nose">>} = uri_string:parse(<<"foo://admin:admin@example.com:8042/over/there?name=ferret#nose">>). 
@@ -658,23 +658,26 @@ parse_relative(_Config) -> uri_string:parse(lists:append("fo",<<"o">>)). parse_special(_Config) -> - #{host := [],query := "?"} = uri_string:parse("//?"), + #{host := [],query := []} = uri_string:parse("//?"), #{fragment := [],host := []} = uri_string:parse("//#"), - #{host := [],query := "?",scheme := "foo"} = uri_string:parse("foo://?"), + #{host := [],query := [],scheme := "foo"} = uri_string:parse("foo://?"), #{fragment := [],host := [],scheme := "foo"} = uri_string:parse("foo://#"), #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>), #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>), #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>), - #{host := [],path := "/",query := "?"} = uri_string:parse("///?"), + #{host := [],path := "/",query := []} = uri_string:parse("///?"), #{fragment := [],host := [],path := "/"} = uri_string:parse("///#"), - #{host := "foo",query := "?"} = uri_string:parse("//foo?"), + #{host := "foo",query := []} = uri_string:parse("//foo?"), #{fragment := [],host := "foo"} = uri_string:parse("//foo#"), #{host := "foo",path := "/"} = uri_string:parse("//foo/"), - #{host := "foo",query := "?",scheme := "http"} = uri_string:parse("http://foo?"), + #{host := "foo",query := [],scheme := "http"} = uri_string:parse("http://foo?"), #{fragment := [],host := "foo",scheme := "http"} = uri_string:parse("http://foo#"), #{host := "foo",path := "/",scheme := "http"} = uri_string:parse("http://foo/"), #{fragment := [],host := "host",port := 80,scheme := "http"} = uri_string:parse("http://host:80#"), - #{host := "host",port := 80,query := "?",scheme := "http"} = uri_string:parse("http://host:80?"). + #{host := "host",port := 80,query := [],scheme := "http"} = uri_string:parse("http://host:80?"), + #{path := [],query := []} = uri_string:parse("?"), + #{path := [],query := "?"} = uri_string:parse("??"), + #{path := [],query := "??"} = uri_string:parse("???"). 
parse_special2(_Config) -> #{host := [],path := "/",port := 1,scheme := "a"} = uri_string:parse("a://:1/"), @@ -703,9 +706,9 @@ recompose_query(_Config) -> fragment => <>, path => <<>>}), "?name=%C3%B6rn" = - uri_string:recompose(#{query => "?name=örn", path => ""}), + uri_string:recompose(#{query => "name=örn", path => ""}), "?name=%C3%B6rn#n%C3%A4sa" = - uri_string:recompose(#{query => "?name=örn", + uri_string:recompose(#{query => "name=örn", fragment => "näsa", path => ""}). @@ -724,10 +727,10 @@ recompose_path(_Config) -> fragment => <<"näsa"/utf8>>}), <<"/d%C3%A4r?name=%C3%B6rn">> = uri_string:recompose(#{path => <<"/där"/utf8>>, - query => <<"?name=örn"/utf8>>}), + query => <<"name=örn"/utf8>>}), <<"/d%C3%A4r?name=%C3%B6rn#n%C3%A4sa">> = uri_string:recompose(#{path => <<"/där"/utf8>>, - query => <<"?name=örn"/utf8>>, + query => <<"name=örn"/utf8>>, fragment => <<"näsa"/utf8>>}), @@ -738,10 +741,10 @@ recompose_path(_Config) -> fragment => "näsa"}), "/d%C3%A4r?name=%C3%B6rn" = uri_string:recompose(#{path => "/där", - query => "?name=örn"}), + query => "name=örn"}), "/d%C3%A4r?name=%C3%B6rn#n%C3%A4sa" = uri_string:recompose(#{path => "/där", - query => "?name=örn", + query => "name=örn", fragment => "näsa"}). -- cgit v1.2.3 From fd276f4a2a109d19d25cffee54a2c21ee4568085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 16 Oct 2017 16:12:18 +0200 Subject: stdlib: Improve support of mixed lists (transcode) - transcode/2 flattens input lists in order to be able to handle lists with percent-encoded parts that are split into multiple list and binary segments. - Add additional tests for transcoding mixed lists. 
--- lib/stdlib/src/uri_string.erl | 35 ++++++++++++++++++++++------------- lib/stdlib/test/uri_string_SUITE.erl | 6 +++++- 2 files changed, 27 insertions(+), 14 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index f9e1e273bc..7d180f73b8 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -368,12 +368,14 @@ transcode(URIString, Options) when is_binary(URIString) -> transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), OutEnc = proplists:get_value(out_encoding, Options, utf8), - try transcode(URIString, [], InEnc, OutEnc) of + Flattened = flatten_list(URIString, InEnc), + try transcode(Flattened, [], InEnc, OutEnc) of Result -> Result catch throw:{error, List, RestData} -> {invalid_input, List, RestData} end. + %%------------------------------------------------------------------------- %% Working with query strings %% HTML 2.0 - application/x-www-form-urlencoded @@ -1672,11 +1674,6 @@ transcode([$%,_C0,_C1|_Rest] = L, Acc, InEnc, OutEnc) -> transcode([_C|_Rest] = L, Acc, InEnc, OutEnc) -> transcode(L, Acc, [], InEnc, OutEnc). 
%% -transcode([H|T], Acc, List, InEnc, OutEnc) when is_binary(H) -> - L = convert_list(H, InEnc), - transcode(L ++ T, Acc, List, InEnc, OutEnc); -transcode([H|T], Acc, List, InEnc, OutEnc) when is_list(H) -> - transcode(H ++ T, Acc, List, InEnc, OutEnc); transcode([$%,_C0,_C1|_Rest] = L, Acc, List, InEncoding, OutEncoding) -> transcode_pct(L, List ++ Acc, <<>>, InEncoding, OutEncoding); transcode([C|Rest], Acc, List, InEncoding, OutEncoding) -> @@ -1686,11 +1683,6 @@ transcode([], Acc, List, _InEncoding, _OutEncoding) -> %% Transcode percent-encoded segment -transcode_pct([H|T], Acc, B, InEnc, OutEnc) when is_binary(H) -> - L = convert_list(H, InEnc), - transcode_pct(L ++ T, Acc, B, InEnc, OutEnc); -transcode_pct([H|T], Acc, B, InEnc, OutEnc) when is_list(H) -> - transcode_pct(H ++ T, Acc, B, InEnc, OutEnc); transcode_pct([$%,C0,C1|Rest], Acc, B, InEncoding, OutEncoding) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> @@ -1710,7 +1702,7 @@ transcode_pct([], Acc, B, InEncoding, OutEncoding) -> lists:reverse(Acc) ++ Out. -% Convert binary +%% Convert to binary convert_binary(Binary, InEncoding, OutEncoding) -> case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of {error, List, RestData} -> @@ -1722,7 +1714,7 @@ convert_binary(Binary, InEncoding, OutEncoding) -> end. -% Convert binary +%% Convert to list convert_list(Binary, InEncoding) -> case unicode:characters_to_list(Binary, InEncoding) of {error, List, RestData} -> @@ -1734,5 +1726,22 @@ convert_list(Binary, InEncoding) -> end. +%% Flatten input list +flatten_list([], _) -> + []; +flatten_list(L, InEnc) -> + flatten_list(L, InEnc, []). +%% +flatten_list([H|T], InEnc, Acc) when is_binary(H) -> + L = convert_list(H, InEnc), + flatten_list(T, InEnc, lists:reverse(L) ++ Acc); +flatten_list([H|T], InEnc, Acc) when is_list(H) -> + flatten_list(H ++ T, InEnc, Acc); +flatten_list([H|T], InEnc, Acc) -> + flatten_list(T, InEnc, [H|Acc]); +flatten_list([], _InEnc, Acc) -> + lists:reverse(Acc). 
+ + percent_encode_segment(Segment) -> percent_encode_binary(Segment, <<>>). diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 8a10948f32..901d38a4da 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -794,7 +794,11 @@ transcode_options(_Config) -> transcode_mixed(_Config) -> "foo%00%00%00%F6bar" = - uri_string:transcode(["foo",<<"%C3%B6"/utf8>>,<<"ba"/utf8>>,"r"], [{out_encoding, utf32}]). + uri_string:transcode(["foo",<<"%C3%B6"/utf8>>,<<"ba"/utf8>>,"r"], [{out_encoding, utf32}]), + "foo%00%00%00%F6bar" = + uri_string:transcode(["foo",<<"%C3%"/utf8>>,<<"B6ba"/utf8>>,"r"], [{out_encoding, utf32}]), + "foo%C3%B6bar" = + uri_string:transcode(["foo%00", <<"%00%0"/utf32>>,<<"0%F"/utf32>>,"6bar"], [{in_encoding, utf32},{out_encoding, utf8}]). transcode_negative(_Config) -> {invalid_input,"foo","BX"} = -- cgit v1.2.3 From 5fe4c673bb8ee10d0fccadb4da14d7a500c2b8ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 18 Oct 2017 15:48:04 +0200 Subject: stdlib: Implement compose_query and dissect_query --- lib/stdlib/src/uri_string.erl | 226 +++++++++++++++++++++++++++++++---- lib/stdlib/test/uri_string_SUITE.erl | 38 +++++- 2 files changed, 240 insertions(+), 24 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 7d180f73b8..1b8f8b828f 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -226,8 +226,9 @@ %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- --export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, - parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). +-export([compose_query/1, compose_query/2, create_uri_reference/2, + dissect_query/1, normalize/1, parse/1, + recompose/1, resolve_uri_reference/2, transcode/2]). 
-export_type([uri_map/0, uri_string/0]). @@ -377,28 +378,66 @@ transcode(URIString, Options) when is_list(URIString) -> %%------------------------------------------------------------------------- -%% Working with query strings -%% HTML 2.0 - application/x-www-form-urlencoded -%% RFC 1866 [8.2.1] +%% Functions for working with the query part of a URI as a list +%% of key/value pairs. +%% HTML 2.0 (RFC 1866) defines a media type application/x-www-form-urlencoded +%% in section [8.2.1] "The form-urlencoded Media Type". %%------------------------------------------------------------------------- %%------------------------------------------------------------------------- %% Compose urlencoded query string from a list of unescaped key/value pairs. %%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when - QueryList :: [{unicode:chardata(), unicode:chardata()}], - QueryString :: uri_string(). -compose_query(_) -> - "". + QueryList :: [{uri_string(), uri_string()}], + QueryString :: string(). +compose_query(List) -> + compose_query(List, []). + + +-spec compose_query(QueryList, Options) -> QueryString when + QueryList :: [{uri_string(), uri_string()}], + Options :: [{separator, atom()}], + QueryString :: string(). +compose_query([],_Options) -> + []; +compose_query(List, Options) -> + try compose_query(List, Options, []) of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. +%% +compose_query([{Key,Value}|Rest], Options, Acc) -> + Separator = get_separator(Options, Acc), + K = form_urlencode(Key), + V = form_urlencode(Value), + compose_query(Rest, Options, Acc ++ Separator ++ K ++ "=" ++ V); +compose_query([], _Options, Acc) -> + Acc. + %%------------------------------------------------------------------------- %% Dissect a query string into a list of unescaped key/value pairs. 
%%------------------------------------------------------------------------- -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), - QueryList :: [{unicode:chardata(), unicode:chardata()}]. -dissect_query(_) -> - "". + QueryList :: [{string(), string()}]. +dissect_query([]) -> + []; +dissect_query(QueryString) when is_binary(QueryString) -> + L = convert_list(QueryString, utf8), + try dissect_query_key(L, [], [], []) of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end; +dissect_query(QueryString) -> + L = flatten_list(QueryString, utf8), + try dissect_query_key(L, [], [], []) of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. %%%======================================================================== @@ -1705,10 +1744,10 @@ transcode_pct([], Acc, B, InEncoding, OutEncoding) -> %% Convert to binary convert_binary(Binary, InEncoding, OutEncoding) -> case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of - {error, List, RestData} -> - throw({error, List, RestData}); - {incomplete, List, RestData} -> - throw({error, List, RestData}); + {error, _List, RestData} -> + throw({error, unicode, RestData}); + {incomplete, _List, RestData} -> + throw({error, unicode, RestData}); Result -> Result end. @@ -1717,10 +1756,10 @@ convert_binary(Binary, InEncoding, OutEncoding) -> %% Convert to list convert_list(Binary, InEncoding) -> case unicode:characters_to_list(Binary, InEncoding) of - {error, List, RestData} -> - throw({error, List, RestData}); - {incomplete, List, RestData} -> - throw({error, List, RestData}); + {error, _List, RestData} -> + throw({error, unicode, RestData}); + {incomplete, _List, RestData} -> + throw({error, unicode, RestData}); Result -> Result end. 
@@ -1740,8 +1779,153 @@ flatten_list([H|T], InEnc, Acc) when is_list(H) -> flatten_list([H|T], InEnc, Acc) -> flatten_list(T, InEnc, [H|Acc]); flatten_list([], _InEnc, Acc) -> - lists:reverse(Acc). + lists:reverse(Acc); +flatten_list(Arg, _, _) -> + throw({error, badarg, Arg}). + percent_encode_segment(Segment) -> percent_encode_binary(Segment, <<>>). + + +%%------------------------------------------------------------------------- +%% Helper functions for compose_query +%%------------------------------------------------------------------------- + +%% Returns separator to be used between key-value pairs +get_separator(_, Acc) when length(Acc) =:= 0 -> + []; +get_separator([], _Acc) -> + "&"; +get_separator([{separator, amp}], _Acc) -> + "&"; +get_separator([{separator, semicolon}], _Acc) -> + ";". + + +%% Form-urlencode input based on RFC 1866 [8.2.1] +form_urlencode(Cs) when is_binary(Cs) -> + L = convert_list(Cs, utf8), + form_urlencode(L, []); +form_urlencode(Cs) -> + L = flatten_list(Cs, utf8), + form_urlencode(L, []). +%% +form_urlencode([], Acc) -> + lists:reverse(Acc); +form_urlencode([$ |T], Acc) -> + form_urlencode(T, [$+|Acc]); +form_urlencode([H|T], Acc) -> + case is_url_char(H) of + true -> + form_urlencode(T, [H|Acc]); + false -> + E = urlencode_char(H), + form_urlencode(T, lists:reverse(E) ++ Acc) + end. + + +urlencode_char(C) -> + B = percent_encode_binary(C), + unicode:characters_to_list(B). + + +%% Return true if input char can appear in URL according to +%% RFC 1738 "Uniform Resource Locators". +is_url_char(C) + when 0 =< C, C =< 31; + 128 =< C, C =< 255 -> false; +is_url_char(127) -> false; +is_url_char(C) -> + not (is_reserved(C) orelse is_unsafe(C)). + + +%% Reserved characters (RFC 1738) +is_reserved($;) -> true; +is_reserved($/) -> true; +is_reserved($?) -> true; +is_reserved($:) -> true; +is_reserved($@) -> true; +is_reserved($=) -> true; +is_reserved($&) -> true; +is_reserved(_) -> false. 
+ + +%% Unsafe characters (RFC 1738) +is_unsafe(${) -> true; +is_unsafe($}) -> true; +is_unsafe($|) -> true; +is_unsafe($\\) -> true; +is_unsafe($^) -> true; +is_unsafe($~) -> true; +is_unsafe($[) -> true; +is_unsafe($]) -> true; +is_unsafe($`) -> true; +is_unsafe(_) -> false. + + +%%------------------------------------------------------------------------- +%% Helper functions for dissect_query +%%------------------------------------------------------------------------- +dissect_query_key([$=|T], Acc, Key, Value) -> + dissect_query_value(T, Acc, Key, Value); +dissect_query_key([H|T], Acc, Key, Value) -> + dissect_query_key(T, Acc, [H|Key], Value); +dissect_query_key(L, _, _, _) -> + throw({error, missing_value, L}). + + +dissect_query_value([$&|_] = L, Acc, Key, Value) -> + K = form_urldecode(lists:reverse(Key)), + V = form_urldecode(lists:reverse(Value)), + dissect_query_separator_amp(L, [{K,V}|Acc], [], []); +dissect_query_value([$;|_] = L, Acc, Key, Value) -> + K = form_urldecode(lists:reverse(Key)), + V = form_urldecode(lists:reverse(Value)), + dissect_query_separator_semicolon(L, [{K,V}|Acc], [], []); +dissect_query_value([H|T], Acc, Key, Value) -> + dissect_query_value(T, Acc, Key, [H|Value]); +dissect_query_value([], Acc, Key, Value) -> + K = form_urldecode(lists:reverse(Key)), + V = form_urldecode(lists:reverse(Value)), + lists:reverse([{K,V}|Acc]). + + +dissect_query_separator_amp("&" ++ T, Acc, Key, Value) -> + dissect_query_key(T, Acc, Key, Value); +dissect_query_separator_amp(L, _, _, _) -> + throw({error, invalid_separator, L}). + + +dissect_query_separator_semicolon([$;|T], Acc, Key, Value) -> + dissect_query_key(T, Acc, Key, Value). + + +%% Form-urldecode input based on RFC 1866 [8.2.1] +form_urldecode(Cs) -> + B = convert_binary(Cs, utf8, utf8), + Result = form_urldecode(B, <<>>), + convert_list(Result, utf8). 
+%% +form_urldecode(<<>>, Acc) -> + convert_list(Acc, utf8); +form_urldecode(<<$+,T/binary>>, Acc) -> + form_urlencode(T, [$ |Acc]); +form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> + case is_hex_digit(C0) andalso is_hex_digit(C1) of + true -> + V = ?HEX2DEC(C0)*16+?HEX2DEC(C1), + form_urldecode(T, <>); + false -> + L = convert_list(<<$%,C0,C1,T/binary>>, utf8), + throw({error, urldecode, L}) + end; +form_urldecode(<>, Acc) -> + case is_url_char(H) of + true -> + form_urldecode(T, <>); + false -> + L = convert_list(<>, utf8), + throw({error, urldecode, L}) + end. diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 901d38a4da..beb534e023 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -37,7 +37,9 @@ recompose_query/1, recompose_parse_query/1, recompose_path/1, recompose_parse_path/1, recompose_autogen/1, parse_recompose_autogen/1, - transcode_basic/1, transcode_options/1, transcode_mixed/1, transcode_negative/1 + transcode_basic/1, transcode_options/1, transcode_mixed/1, transcode_negative/1, + compose_query/1, compose_query_negative/1, + dissect_query/1, dissect_query_negative/1 ]). @@ -104,7 +106,11 @@ all() -> transcode_basic, transcode_options, transcode_mixed, - transcode_negative + transcode_negative, + compose_query, + compose_query_negative, + dissect_query, + dissect_query_negative ]. groups() -> @@ -803,5 +809,31 @@ transcode_mixed(_Config) -> transcode_negative(_Config) -> {invalid_input,"foo","BX"} = uri_string:transcode(<<"foo%C3%BXbar"/utf8>>, [{in_encoding, utf8},{out_encoding, utf32}]), - {invalid_input,<<>>,<<"ö">>} = + {invalid_input,unicode,<<"ö">>} = uri_string:transcode("foo%F6bar", [{in_encoding, utf8},{out_encoding, utf8}]). 
+ +compose_query(_Config) -> + [] = uri_string:compose_query([]), + "foo=1&bar=2" = uri_string:compose_query([{<<"foo">>,"1"}, {"bar", "2"}]), + "foo=1&bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,amp}]), + "foo=1;bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,semicolon}]), + "foo+bar=1&%C3%B6=2" = uri_string:compose_query([{"foo bar","1"}, {"ö", "2"}]). + +compose_query_negative(_Config) -> + {error,badarg,4} = uri_string:compose_query([{"",4}]). + +dissect_query(_Config) -> + [] = uri_string:dissect_query(""), + [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1&bar=2"), + [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1;bar=2"), + [{"foo","1"}, {"bar", "222"}] = uri_string:dissect_query([<<"foo=1;bar=2">>,"22"]), + [{"foo","ö"}, {"bar", "2"}] = uri_string:dissect_query("foo=%C3%B6&bar=2"). + +dissect_query_negative(_Config) -> + {error,invalid_separator,"≈bar=2"} = + uri_string:dissect_query("foo=1≈bar=2"), + {error,urldecode,"&bar"} = + uri_string:dissect_query("foo1&bar=2"), + {error,urldecode,"%XX%B6"} = uri_string:dissect_query("foo=%XX%B6&bar=2"), + {error,unicode,<<153,182>>} = + uri_string:dissect_query("foo=%99%B6&bar=2"). -- cgit v1.2.3 From 75989c8024283155f6f8075ee9e81b50a65e9ecb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Thu, 19 Oct 2017 17:19:46 +0200 Subject: stdlib: Improve error handling --- lib/stdlib/src/uri_string.erl | 129 +++++++++++++++++++++-------------- lib/stdlib/test/uri_string_SUITE.erl | 52 ++++++++------ 2 files changed, 111 insertions(+), 70 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 1b8f8b828f..51f7564934 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -293,11 +293,22 @@ URIString :: uri_string(), URIMap :: uri_map(). 
parse(URIString) when is_binary(URIString) -> - parse_uri_reference(URIString, #{}); + try parse_uri_reference(URIString, #{}) of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end; parse(URIString) when is_list(URIString) -> - Binary = unicode:characters_to_binary(URIString), - Map = parse_uri_reference(Binary, #{}), - convert_mapfields_to_list(Map). + try + Binary = unicode:characters_to_binary(URIString), + Map = parse_uri_reference(Binary, #{}), + convert_mapfields_to_list(Map) + of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. + %%------------------------------------------------------------------------- %% Recompose URIs @@ -308,17 +319,24 @@ parse(URIString) when is_list(URIString) -> recompose(Map) -> case is_valid_map(Map) of false -> - error({badarg, invalid_map}); + {error, invalid_map, Map}; true -> - T0 = update_scheme(Map, empty), - T1 = update_userinfo(Map, T0), - T2 = update_host(Map, T1), - T3 = update_port(Map, T2), - T4 = update_path(Map, T3), - T5 = update_query(Map, T4), - update_fragment(Map, T5) + try + T0 = update_scheme(Map, empty), + T1 = update_userinfo(Map, T0), + T2 = update_host(Map, T1), + T3 = update_port(Map, T2), + T4 = update_path(Map, T3), + T5 = update_query(Map, T4), + update_fragment(Map, T5) + of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end end. 
+ %%------------------------------------------------------------------------- %% Resolve references %%------------------------------------------------------------------------- @@ -364,7 +382,7 @@ transcode(URIString, Options) when is_binary(URIString) -> of Result -> Result catch - throw:{error, L, RestData} -> {invalid_input, L, RestData} + throw:{error, _, RestData} -> {error, invalid_input, RestData} end; transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), @@ -373,7 +391,7 @@ transcode(URIString, Options) when is_list(URIString) -> try transcode(Flattened, [], InEnc, OutEnc) of Result -> Result catch - throw:{error, List, RestData} -> {invalid_input, List, RestData} + throw:{error, _, RestData} -> {error, invalid_input, RestData} end. @@ -467,7 +485,7 @@ parse_uri_reference(URIString, URI) -> try parse_scheme_start(URIString, URI) of Res -> Res catch - throw:uri_parse_error -> + throw:{_,_,_} -> parse_relative_part(URIString, URI) end. @@ -495,7 +513,7 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> URI2 = maybe_add_path(URI1), URI2#{userinfo => decode_userinfo(Userinfo)} catch - throw:uri_parse_error -> + throw:{_,_,_} -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_part_sl(Rest, T), URI2 = maybe_add_path(URI1), @@ -521,7 +539,7 @@ parse_relative_part(?STRING_REST(Char, Rest), URI) -> {T, URI1} = parse_segment_nz_nc(Rest, URI), % path-noscheme Path = calculate_parsed_part(Rest, T), URI1#{path => decode_path(?STRING_REST(Char, Path))}; - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end. @@ -571,7 +589,7 @@ parse_segment(?STRING_REST($#, Rest), URI) -> parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of true -> parse_segment(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_segment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. 
@@ -594,7 +612,7 @@ parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> parse_segment_nz_nc(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_segment_nz_nc(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -629,7 +647,7 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> Scheme = calculate_parsed_scheme(Rest, T), URI2 = maybe_add_path(URI1), URI2#{scheme => ?STRING_REST(Char, Scheme)}; - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end. %% Add path component if it missing after parsing the URI. @@ -653,10 +671,10 @@ parse_scheme(?STRING_REST($:, Rest), URI) -> parse_scheme(?STRING_REST(Char, Rest), URI) -> case is_scheme(Char) of true -> parse_scheme(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_scheme(?STRING_EMPTY, _URI) -> - throw(uri_parse_error). + throw({error,invalid_uri,<<>>}). %% Check if char is allowed in scheme @@ -681,7 +699,7 @@ parse_hier(?STRING_REST("//", Rest), URI) -> Userinfo = calculate_parsed_userinfo(Rest, T), {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch - throw:uri_parse_error -> + throw:{_,_,_} -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_part_sl(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}} @@ -704,7 +722,7 @@ parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless {T, URI1} = parse_segment(Rest, URI), Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST(Char, Path))}}; - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_hier(?STRING_EMPTY, URI) -> {<<>>, URI}. 
@@ -744,11 +762,11 @@ parse_userinfo(?STRING_REST($@, Rest), URI) -> parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_userinfo(?STRING_EMPTY, _URI) -> %% URI cannot end in userinfo state - throw(uri_parse_error). + throw({error,invalid_uri,<<>>}). %% Check if char is allowed in userinfo @@ -847,7 +865,7 @@ parse_reg_name(?STRING_REST($#, Rest), URI) -> parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of true -> parse_reg_name(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_reg_name(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -883,7 +901,7 @@ parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of true -> parse_ipv4_bin(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_ipv4_bin(?STRING_EMPTY, Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), @@ -899,7 +917,7 @@ is_ipv4(Char) -> is_digit(Char). validate_ipv4_address(Addr) -> case inet:parse_ipv4strict_address(Addr) of {ok, _} -> Addr; - {error, _} -> throw(uri_parse_error) + {error, _} -> throw({error,invalid_uri,Addr}) end. @@ -910,10 +928,10 @@ parse_ipv6_bin(?STRING_REST($], Rest), Acc, URI) -> parse_ipv6_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv6(Char) of true -> parse_ipv6_bin(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_ipv6_bin(?STRING_EMPTY, _Acc, _URI) -> - throw(uri_parse_error). + throw({error,invalid_uri,<<>>}). %% Check if char is allowed in IPv6 addresses -spec is_ipv6(char()) -> boolean(). 
@@ -943,7 +961,7 @@ parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of true -> parse_ipv6_bin_end(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_ipv6_bin_end(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -952,7 +970,7 @@ parse_ipv6_bin_end(?STRING_EMPTY, URI) -> validate_ipv6_address(Addr) -> case inet:parse_ipv6strict_address(Addr) of {ok, _} -> Addr; - {error, _} -> throw(uri_parse_error) + {error, _} -> throw({error,invalid_uri,Addr}) end. @@ -981,7 +999,7 @@ parse_port(?STRING_REST($#, Rest), URI) -> parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of true -> parse_port(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_port(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -1007,7 +1025,7 @@ parse_query(?STRING_REST($#, Rest), URI) -> parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of true -> parse_query(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_query(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -1033,7 +1051,7 @@ is_query(Char) -> is_pchar(Char). parse_fragment(?STRING_REST(Char, Rest), URI) -> case is_fragment(Char) of true -> parse_fragment(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_fragment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -1335,9 +1353,9 @@ decode_fragment(Cs) -> check_utf8(Cs) -> case unicode:characters_to_list(Cs) of {incomplete,_,_} -> - throw(uri_parse_error); + throw({error,non_utf8,Cs}); {error,_,_} -> - throw(uri_parse_error); + throw({error,non_utf8,Cs}); _ -> Cs end. @@ -1348,13 +1366,13 @@ check_utf8(Cs) -> %% Only validates as scheme cannot have percent-encoded characters -spec encode_scheme(list()|binary()) -> list() | binary(). 
encode_scheme([]) -> - throw(uri_parse_error); + throw({error,invalid_scheme,""}); encode_scheme(<<>>) -> - throw(uri_parse_error); + throw({error,invalid_scheme,<<>>}); encode_scheme(Scheme) -> case validate_scheme(Scheme) of true -> Scheme; - false -> throw(uri_parse_error) + false -> throw({error,invalid_scheme,Scheme}) end. -spec encode_userinfo(list()|binary()) -> list() | binary(). @@ -1390,12 +1408,12 @@ decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> true -> B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), decode(Cs, Fun, <>); - false -> throw(uri_parse_error) + false -> throw({error,percent_decode,<<$%,C0,C1>>}) end; decode(<>, Fun, Acc) -> case Fun(C) of true -> decode(Cs, Fun, <>); - false -> throw(uri_parse_error) + false -> throw({error,percent_decode,<>}) end; decode(<<>>, _Fun, Acc) -> Acc. @@ -1424,8 +1442,8 @@ encode(Component, Fun) when is_binary(Component) -> encode(<>, Fun, Acc) -> C = encode_codepoint_binary(Char, Fun), encode(Rest, Fun, <>); -encode(<<_Char, _Rest/binary>>, _Fun, _Acc) -> - throw(uri_parse_error); +encode(<>, _Fun, _Acc) -> + throw({error,percent_encode,<>}); encode(<<>>, _Fun, Acc) -> Acc. @@ -1554,7 +1572,8 @@ is_valid_map(Map) -> not maps:is_key(host, Map) andalso maps:is_key(port, Map))) orelse not maps:is_key(path, Map) orelse - not is_host_and_path_valid(Map) + not is_host_and_path_valid(Map) orelse + invalid_field_present(Map) of true -> false; @@ -1563,6 +1582,16 @@ is_valid_map(Map) -> end. +invalid_field_present(Map) -> + Fun = fun(K, _, AccIn) -> AccIn orelse + ((K =/= scheme) andalso (K =/= userinfo) + andalso (K =/= host) andalso (K =/= port) + andalso (K =/= path) andalso (K =/= query) + andalso (K =/= fragment)) + end, + maps:fold(Fun, false, Map). 
+ + is_host_and_path_valid(Map) -> Host = maps:get(host, Map, undefined), Path = maps:get(path, Map, undefined), @@ -1745,9 +1774,9 @@ transcode_pct([], Acc, B, InEncoding, OutEncoding) -> convert_binary(Binary, InEncoding, OutEncoding) -> case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of {error, _List, RestData} -> - throw({error, unicode, RestData}); + throw({error, invalid_input, RestData}); {incomplete, _List, RestData} -> - throw({error, unicode, RestData}); + throw({error, invalid_input, RestData}); Result -> Result end. @@ -1757,9 +1786,9 @@ convert_binary(Binary, InEncoding, OutEncoding) -> convert_list(Binary, InEncoding) -> case unicode:characters_to_list(Binary, InEncoding) of {error, _List, RestData} -> - throw({error, unicode, RestData}); + throw({error, invalid_input, RestData}); {incomplete, _List, RestData} -> - throw({error, unicode, RestData}); + throw({error, invalid_input, RestData}); Result -> Result end. diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index beb534e023..b70cb842de 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -32,7 +32,7 @@ parse_pct_encoded_userinfo/1, parse_port/1, parse_query/1, parse_scheme/1, parse_userinfo/1, parse_list/1, parse_binary/1, parse_mixed/1, parse_relative/1, - parse_special/1, parse_special2/1, + parse_special/1, parse_special2/1, parse_negative/1, recompose_fragment/1, recompose_parse_fragment/1, recompose_query/1, recompose_parse_query/1, recompose_path/1, recompose_parse_path/1, @@ -95,6 +95,7 @@ all() -> parse_relative, parse_special, parse_special2, + parse_negative, recompose_fragment, recompose_parse_fragment, recompose_query, @@ -343,8 +344,8 @@ parse_binary_pct_encoded_userinfo(_Config) -> uri_string:parse(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93">>), #{scheme := <<"foo">>, userinfo := <<"合:気"/utf8>>, host := <<"道"/utf8>>} = uri_string:parse(<<"foo://%E5%90%88:%E6%B0%97@%E9%81%93">>), - 
uri_parse_error =(catch uri_string:parse(<<"//%E5%90%88@%E6%B0%97%E9%81%93@">>)), - uri_parse_error = (catch uri_string:parse(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93@">>)). + {error,invalid_uri,"@"} = uri_string:parse(<<"//%E5%90%88@%E6%B0%97%E9%81%93@">>), + {error,invalid_uri,":"} = uri_string:parse(<<"foo://%E5%90%88@%E6%B0%97%E9%81%93@">>). parse_binary_host(_Config) -> #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>), @@ -359,8 +360,8 @@ parse_binary_host_ipv4(_Config) -> #{host := <<"127.0.0.1">>, query := <<"name=ferret">>} = uri_string:parse(<<"//127.0.0.1?name=ferret">>), #{host := <<"127.0.0.1">>, fragment := <<"nose">>} = uri_string:parse(<<"//127.0.0.1#nose">>), - uri_parse_error = (catch uri_string:parse(<<"//127.0.0.x">>)), - uri_parse_error = (catch uri_string:parse(<<"//1227.0.0.1">>)). + {error,invalid_uri,"x"} = uri_string:parse(<<"//127.0.0.x">>), + {error,invalid_uri,"1227.0.0.1"} = uri_string:parse(<<"//1227.0.0.1">>). parse_binary_host_ipv6(_Config) -> #{host := <<"::127.0.0.1">>} = uri_string:parse(<<"//[::127.0.0.1]">>), @@ -372,9 +373,9 @@ parse_binary_host_ipv6(_Config) -> uri_string:parse(<<"//[::127.0.0.1]?name=ferret">>), #{host := <<"::127.0.0.1">>, fragment := <<"nose">>} = uri_string:parse(<<"//[::127.0.0.1]#nose">>), - uri_parse_error = (catch uri_string:parse(<<"//[::127.0.0.x]">>)), - uri_parse_error = (catch uri_string:parse(<<"//[::1227.0.0.1]">>)), - uri_parse_error = (catch uri_string:parse(<<"//[2001:0db8:0000:0000:0000:0000:1428:G7ab]">>)). + {error,invalid_uri,"x"} = uri_string:parse(<<"//[::127.0.0.x]">>), + {error,invalid_uri,"::1227.0.0.1"} = uri_string:parse(<<"//[::1227.0.0.1]">>), + {error,invalid_uri,"G"} = uri_string:parse(<<"//[2001:0db8:0000:0000:0000:0000:1428:G7ab]">>). 
parse_binary_port(_Config) -> #{path:= <<"/:8042">>} = @@ -389,8 +390,8 @@ parse_binary_port(_Config) -> uri_string:parse(<<"foo://:8042">>), #{scheme := <<"foo">>, host := <<"example.com">>, port := 8042} = uri_string:parse(<<"foo://example.com:8042">>), - uri_parse_error = (catch uri_string:parse(":600")), - uri_parse_error = (catch uri_string:parse("//:8042x")). + {error,invalid_uri,":"} = uri_string:parse(":600"), + {error,invalid_uri,"x"} = uri_string:parse("//:8042x"). parse_binary_path(_Config) -> #{path := <<"over/there">>} = uri_string:parse(<<"over/there">>), @@ -511,8 +512,8 @@ parse_pct_encoded_userinfo(_Config) -> uri_string:parse("foo://%E5%90%88@%E6%B0%97%E9%81%93"), #{scheme := "foo", userinfo := "合:気", host := "道"} = uri_string:parse("foo://%E5%90%88:%E6%B0%97@%E9%81%93"), - uri_parse_error =(catch uri_string:parse("//%E5%90%88@%E6%B0%97%E9%81%93@")), - uri_parse_error = (catch uri_string:parse("foo://%E5%90%88@%E6%B0%97%E9%81%93@")). + {error,invalid_uri,"@"} = uri_string:parse("//%E5%90%88@%E6%B0%97%E9%81%93@"), + {error,invalid_uri,":"} = uri_string:parse("foo://%E5%90%88@%E6%B0%97%E9%81%93@"). parse_host(_Config) -> @@ -528,8 +529,8 @@ parse_host_ipv4(_Config) -> #{host := "127.0.0.1", path := "/over/there"} = uri_string:parse("//127.0.0.1/over/there"), #{host := "127.0.0.1", query := "name=ferret"} = uri_string:parse("//127.0.0.1?name=ferret"), #{host := "127.0.0.1", fragment := "nose"} = uri_string:parse("//127.0.0.1#nose"), - uri_parse_error = (catch uri_string:parse("//127.0.0.x")), - uri_parse_error = (catch uri_string:parse("//1227.0.0.1")). + {error,invalid_uri,"x"} = uri_string:parse("//127.0.0.x"), + {error,invalid_uri,"1227.0.0.1"} = uri_string:parse("//1227.0.0.1"). 
parse_host_ipv6(_Config) -> #{host := "::127.0.0.1"} = uri_string:parse("//[::127.0.0.1]"), @@ -537,9 +538,9 @@ parse_host_ipv6(_Config) -> #{host := "::127.0.0.1", query := "name=ferret"} = uri_string:parse("//[::127.0.0.1]?name=ferret"), #{host := "::127.0.0.1", fragment := "nose"} = uri_string:parse("//[::127.0.0.1]#nose"), - uri_parse_error = (catch uri_string:parse("//[::127.0.0.x]")), - uri_parse_error = (catch uri_string:parse("//[::1227.0.0.1]")), - uri_parse_error = (catch uri_string:parse("//[2001:0db8:0000:0000:0000:0000:1428:G7ab]")). + {error,invalid_uri,"x"} = uri_string:parse("//[::127.0.0.x]"), + {error,invalid_uri,"::1227.0.0.1"} = uri_string:parse("//[::1227.0.0.1]"), + {error,invalid_uri,"G"} = uri_string:parse("//[2001:0db8:0000:0000:0000:0000:1428:G7ab]"). parse_port(_Config) -> #{path:= "/:8042"} = @@ -693,6 +694,17 @@ parse_special2(_Config) -> #{host := [],path := "/",userinfo := []} = uri_string:parse("//@/"), #{host := [],path := "/",scheme := "foo",userinfo := []} = uri_string:parse("foo://@/"). +parse_negative(_Config) -> + {error,invalid_uri,"å"} = uri_string:parse("å"), + {error,invalid_uri,"å"} = uri_string:parse("aå:/foo"), + {error,invalid_uri,":"} = uri_string:parse("foo://usär@host"), + {error,invalid_uri,"ö"} = uri_string:parse("//host/path?foö=bar"), + {error,invalid_uri,"ö"} = uri_string:parse("//host/path#foö"), + {error,invalid_uri,"127.256.0.1"} = uri_string:parse("//127.256.0.1"), + {error,invalid_uri,":::127.0.0.1"} = uri_string:parse("//[:::127.0.0.1]"), + {error,non_utf8,<<0,0,0,246>>} = uri_string:parse("//%00%00%00%F6"). + + %%------------------------------------------------------------------------- %% Recompose tests %%------------------------------------------------------------------------- @@ -807,9 +819,9 @@ transcode_mixed(_Config) -> uri_string:transcode(["foo%00", <<"%00%0"/utf32>>,<<"0%F"/utf32>>,"6bar"], [{in_encoding, utf32},{out_encoding, utf8}]). 
transcode_negative(_Config) -> - {invalid_input,"foo","BX"} = + {error,invalid_input,"BX"} = uri_string:transcode(<<"foo%C3%BXbar"/utf8>>, [{in_encoding, utf8},{out_encoding, utf32}]), - {invalid_input,unicode,<<"ö">>} = + {error,invalid_input,<<"ö">>} = uri_string:transcode("foo%F6bar", [{in_encoding, utf8},{out_encoding, utf8}]). compose_query(_Config) -> @@ -835,5 +847,5 @@ dissect_query_negative(_Config) -> {error,urldecode,"&bar"} = uri_string:dissect_query("foo1&bar=2"), {error,urldecode,"%XX%B6"} = uri_string:dissect_query("foo=%XX%B6&bar=2"), - {error,unicode,<<153,182>>} = + {error,invalid_input,<<153,182>>} = uri_string:dissect_query("foo=%99%B6&bar=2"). -- cgit v1.2.3 From b439d19d38479d6264d906dd926a168c9c514da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 20 Oct 2017 16:32:42 +0200 Subject: stdlib: Update documentation (uri_string) --- lib/stdlib/doc/src/uri_string.xml | 114 +++++++++++++------------------------- lib/stdlib/src/uri_string.erl | 58 ++++++------------- 2 files changed, 56 insertions(+), 116 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml index 8283b8ca0e..496573ae2f 100644 --- a/lib/stdlib/doc/src/uri_string.xml +++ b/lib/stdlib/doc/src/uri_string.xml @@ -24,7 +24,7 @@ maps Péter Dimitrov 1 - 2017-08-23 + 2017-10-20 A uri_string @@ -34,7 +34,8 @@

A URI is an identifier consisting of a sequence of characters matching the syntax rule named URI in RFC 3986.

The generic URI syntax consists of a hierarchical sequence of components referred - to as the scheme, authority, path, query, and fragment:

+    to as the scheme, authority, path, query, and fragment:

+
     URI         = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
     hier-part   = "//" authority path-abempty
                    / path-absolute
@@ -51,35 +52,26 @@
 
     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
     


-

The interpretation of a URI depends only on the characters used and not on how those characters are represented in a network protocol.

-

The functions implemented by this module covers the following use cases: +

The functions implemented by this module cover the following use cases:

Parsing URIs

parse/1
Recomposing URIs

recompose/1
- Resolving URI references

- resolve_uri_reference/3
- Creating URI references

- create_uri_reference/3
- Normalizing URIs

- normalize/1
Transcoding URIs

transcode/2
- Working with urlencoded query strings

- compose_query/1, dissect_query/1
+ Working with form-urlencoded query strings

+ compose_query/[1,2], dissect_query/1
-

-

There are four different encodings present during the handling of URIs: +

There are four different encodings present during the handling of URIs:

Inbound binary encoding in binaries Inbound percent-encoding in lists and binaries Outbound binary encoding in binaries Outbound percent-encoding in lists and binaries -

Unless otherwise specified the return value type and encoding are the same as the input type and encoding. That is, binary input returns binary output, list input returns a list output but mixed input returns list output. Input and output encodings are the same except @@ -113,31 +105,34 @@ Compose urlencoded query string. -

Composes an urlencoded QueryString based on a +

Composes a form-urlencoded QueryString based on a QueryList, a list of unescaped key-value pairs. Media type application/x-www-form-urlencoded is defined in section - 8.2.1 of RFC 1866 (HTML 2.0). + 8.2.1 of RFC 1866 (HTML 2.0). Reserved and unsafe characters, as + defined by RFC 1738 (Uniform Resource Locators), are procent-encoded.

-

If an argument is invalid, a badarg exception is raised.

Example:

-1> uri_string:compose_query(...).
-
+1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}]). + +
- - Create references. + + Compose urlencoded query string. -

Creates an RFC 3986 compliant RelativeDestURI, - based AbsoluteSourceURI and AbsoluteSourceURI -

-

If an argument is invalid, a badarg exception is raised.

+

Same as compose_query/1 but with an additional + Options parameter, that controls the type of separator used + between key-value pairs. There are two supported separator types: amp () + and semicolon (;).

Example:

-1> uri_string:create_uri_reference(...,...).
-
+1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}], +2> [{separator, semicolon}]). +"foo+bar=1;city=%C3%B6rebro" +
@@ -148,31 +143,14 @@

Dissects a urlencoded QueryString and returns a QueryList, a list of unescaped key-value pairs. Media type application/x-www-form-urlencoded is defined in section - 8.2.1 of RFC 1866 (HTML 2.0). + 8.2.1 of RFC 1866 (HTML 2.0). Percent-encoded segments are decoded + as defined by RFC 1738 (Uniform Resource Locators).

-

If an argument is invalid, a badarg exception is raised.

Example:

-1> uri_string:dissect_query(...).
-
- - - - - - Normalize URI. - -

Normalizes an RFC 3986 compliant URIString and returns - a NormalizedURI. The algorithm used to shorten the input - URI is called Syntax-Based Normalization and described at - Section 6.2.2 of RFC 3986. -

-

If an argument is invalid, a badarg exception is raised.

-

Example:

-
-1> uri_string:normalize("http://example.org/one/two/../../one").
-"http://example.org/one"
-
+1> uri_string:dissect_query("foo+bar=1;city=%C3%B6rebro"). +[{"foo bar","1"},{"city","örebro"}] +
@@ -182,14 +160,14 @@

Returns a URIMap, that is a uri_map() with the parsed components of the URIString.

-

If parsing fails, a parse_error exception is raised.

+

If parsing fails, an error tuple is returned.

Example:

 1> uri_string:parse("foo://user@example.com:8042/over/there?name=ferret#nose").
 #{fragment => "nose",host => "example.com",
   path => "/over/there",port => 8042,query => "name=ferret",
   scheme => "foo",userinfo => "user"}
-2> 
+
@@ -198,35 +176,20 @@ Recompose URI.

Returns an RFC 3986 compliant URIString (percent-encoded).

-

If the URIMap is invalid, a badarg exception is raised.

+

If the URIMap is invalid, an error tuple is returned.

Example:

 1> URIMap = #{fragment => "nose", host => "example.com", path => "/over/there",
-port => 8042, query => "name=ferret", scheme => foo, userinfo => "user"}.
+port => 8042, query => "name=ferret", scheme => "foo", userinfo => "user"}.
 #{fragment => "nose",host => "example.com",
   path => "/over/there",port => 8042,query => "name=ferret",
   scheme => "foo",userinfo => "user"}
 
-2> uri_string:recompose(URIMap, []).
+2> uri_string:recompose(URIMap).
 "foo://user@example.com:8042/over/there?name=ferret#nose"
- - - Resolve URI reference. - -

Resolves an RFC 3986 compliant RelativeURI, - based AbsoluteBaseURI and returns a new absolute URI - (AbsoluteDestURI).

-

If an argument is invalid, a badarg exception is raised.

-

Example:

-
-1> uri_string:resolve_uri_reference(...,...).
-
-
-
- Transcode URI. @@ -234,14 +197,13 @@ port => 8042, query => "name=ferret", scheme => foo, userinfo => "user"}.

Transcodes an RFC 3986 compliant URIString, where Options is a list of tagged tuples, specifying the inbound (in_encoding) and outbound (out_encoding) encodings.

-

If an argument is invalid, a badarg exception is raised.

+

If an argument is invalid, an error tuple is returned.

Example:

-1> uri_string:transcode(<<"foo://f%20oo">>, [{in_encoding, utf8},
-{out_encoding, utf16}]).
-<<0,102,0,111,0,111,0,58,0,47,0,47,0,102,0,37,0,48,0,48,0,37,0,50,0,48,0,
-  111,0,111>>
-
+1> >,]]> +2> [{in_encoding, utf32},{out_encoding, utf8}]). +>]]> +
diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 51f7564934..8723d3f183 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -226,9 +226,9 @@ %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- --export([compose_query/1, compose_query/2, create_uri_reference/2, - dissect_query/1, normalize/1, parse/1, - recompose/1, resolve_uri_reference/2, transcode/2]). +-export([compose_query/1, compose_query/2, + dissect_query/1, parse/1, + recompose/1, transcode/2]). -export_type([uri_map/0, uri_string/0]). @@ -291,7 +291,8 @@ %%------------------------------------------------------------------------- -spec parse(URIString) -> URIMap when URIString :: uri_string(), - URIMap :: uri_map(). + URIMap :: uri_map() + | {error, atom(), list() | binary()}. parse(URIString) when is_binary(URIString) -> try parse_uri_reference(URIString, #{}) of Result -> Result @@ -315,7 +316,8 @@ parse(URIString) when is_list(URIString) -> %%------------------------------------------------------------------------- -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), - URIString :: uri_string(). + URIString :: uri_string() + | {error, atom(), list() | binary()}. recompose(Map) -> case is_valid_map(Map) of false -> @@ -337,41 +339,14 @@ recompose(Map) -> end. -%%------------------------------------------------------------------------- -%% Resolve references -%%------------------------------------------------------------------------- --spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when - RelativeURI :: uri_string(), - AbsoluteBaseURI :: uri_string(), - AbsoluteDestURI :: uri_string(). -resolve_uri_reference(_,_) -> - "". 
- -%%------------------------------------------------------------------------- -%% Create references -%%------------------------------------------------------------------------- --spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when - AbsoluteSourceURI :: uri_string(), - AbsoluteBaseURI :: uri_string(), - RelativeDestURI :: uri_string(). -create_uri_reference(_,_) -> - "". - -%%------------------------------------------------------------------------- -%% Normalize URIs -%%------------------------------------------------------------------------- --spec normalize(URIString) -> NormalizedURI when - URIString :: uri_string(), - NormalizedURI :: uri_string(). -normalize(_) -> - "". - %%------------------------------------------------------------------------- %% Transcode URIs %%------------------------------------------------------------------------- --spec transcode(URIString, Options) -> URIString when +-spec transcode(URIString, Options) -> Result when URIString :: uri_string(), - Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. + Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}], + Result :: uri_string() + | {error, atom(), list() | binary()}. transcode(URIString, Options) when is_binary(URIString) -> try InEnc = proplists:get_value(in_encoding, Options, utf8), @@ -407,7 +382,8 @@ transcode(URIString, Options) when is_list(URIString) -> %%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when QueryList :: [{uri_string(), uri_string()}], - QueryString :: string(). + QueryString :: string() + | {error, atom(), list() | binary()}. compose_query(List) -> compose_query(List, []). @@ -415,7 +391,8 @@ compose_query(List) -> -spec compose_query(QueryList, Options) -> QueryString when QueryList :: [{uri_string(), uri_string()}], Options :: [{separator, atom()}], - QueryString :: string(). 
+ QueryString :: string() + | {error, atom(), list() | binary()}. compose_query([],_Options) -> []; compose_query(List, Options) -> @@ -439,7 +416,8 @@ compose_query([], _Options, Acc) -> %%------------------------------------------------------------------------- -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), - QueryList :: [{string(), string()}]. + QueryList :: [{string(), string()}] + | {error, atom(), list() | binary()}. dissect_query([]) -> []; dissect_query(QueryString) when is_binary(QueryString) -> @@ -1940,7 +1918,7 @@ form_urldecode(Cs) -> form_urldecode(<<>>, Acc) -> convert_list(Acc, utf8); form_urldecode(<<$+,T/binary>>, Acc) -> - form_urlencode(T, [$ |Acc]); + form_urldecode(T, <>); form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> -- cgit v1.2.3 From da11b15aef87f392a807b4756bf285160e15a194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 23 Oct 2017 12:02:16 +0200 Subject: stdlib: Update supported separators (query string) Update list of supported separators: - escaped_amp (default): "&" - amp: "&" - semicolon: ";" --- lib/stdlib/doc/src/uri_string.xml | 10 +++++----- lib/stdlib/src/uri_string.erl | 4 ++++ lib/stdlib/test/uri_string_SUITE.erl | 6 ++++-- 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml index 496573ae2f..97b38ea93e 100644 --- a/lib/stdlib/doc/src/uri_string.xml +++ b/lib/stdlib/doc/src/uri_string.xml @@ -28,9 +28,10 @@ A uri_string - RFC 3986 compliant URI processing functions. + URI processing functions. -

This module contains functions for parsing and handling RFC 3986 compliant URIs.

+

This module contains functions for parsing and handling URIs (RFC 3986) and + form-urlencoded query strings (RFC 1866).

A URI is an identifier consisting of a sequence of characters matching the syntax rule named URI in RFC 3986.

The generic URI syntax consists of a hierarchical sequence of components referred @@ -109,7 +110,7 @@ QueryList, a list of unescaped key-value pairs. Media type application/x-www-form-urlencoded is defined in section 8.2.1 of RFC 1866 (HTML 2.0). Reserved and unsafe characters, as - defined by RFC 1738 (Uniform Resource Locators), are procent-encoded. + defined by RFC 1738 (Uniform Resource Locators), are percent-encoded.

Example:

@@ -125,8 +126,7 @@
       
         

Same as compose_query/1 but with an additional Options parameter, that controls the type of separator used - between key-value pairs. There are two supported separator types: amp () - and semicolon (;).

+ between key-value pairs. There are three supported separator types: amp (&), escaped_amp (&amp;) and semicolon (;). If the parameter Options is empty, separator takes the default value (escaped_amp).

Example:

 1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}],
diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl
index 8723d3f183..a4fd9c66f4 100644
--- a/lib/stdlib/src/uri_string.erl
+++ b/lib/stdlib/src/uri_string.erl
@@ -1806,6 +1806,8 @@ get_separator(_, Acc) when length(Acc) =:= 0 ->
 get_separator([], _Acc) ->
     "&";
 get_separator([{separator, amp}], _Acc) ->
+    "&";
+get_separator([{separator, escaped_amp}], _Acc) ->
     "&";
 get_separator([{separator, semicolon}], _Acc) ->
     ";".
@@ -1901,6 +1903,8 @@ dissect_query_value([], Acc, Key, Value) ->
 
 dissect_query_separator_amp("&" ++ T, Acc, Key, Value) ->
     dissect_query_key(T, Acc, Key, Value);
+dissect_query_separator_amp("&" ++ T, Acc, Key, Value) ->
+    dissect_query_key(T, Acc, Key, Value);
 dissect_query_separator_amp(L, _, _, _) ->
     throw({error, invalid_separator, L}).
 
diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl
index b70cb842de..fe832ac82c 100644
--- a/lib/stdlib/test/uri_string_SUITE.erl
+++ b/lib/stdlib/test/uri_string_SUITE.erl
@@ -827,7 +827,8 @@ transcode_negative(_Config) ->
 compose_query(_Config) ->
     [] = uri_string:compose_query([]),
     "foo=1&bar=2" = uri_string:compose_query([{<<"foo">>,"1"}, {"bar", "2"}]),
-    "foo=1&bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,amp}]),
+    "foo=1&bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,escaped_amp}]),
+    "foo=1&bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,amp}]),
     "foo=1;bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,semicolon}]),
     "foo+bar=1&%C3%B6=2" = uri_string:compose_query([{"foo bar","1"}, {"ö", "2"}]).
 
@@ -837,12 +838,13 @@ compose_query_negative(_Config) ->
 dissect_query(_Config) ->
     [] = uri_string:dissect_query(""),
     [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1&bar=2"),
+    [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1&bar=2"),
     [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1;bar=2"),
     [{"foo","1"}, {"bar", "222"}] = uri_string:dissect_query([<<"foo=1;bar=2">>,"22"]),
     [{"foo","ö"}, {"bar", "2"}] = uri_string:dissect_query("foo=%C3%B6&bar=2").
 
 dissect_query_negative(_Config) ->
-    {error,invalid_separator,"≈bar=2"} =
+    {error,urldecode,";bar"} =
         uri_string:dissect_query("foo=1≈bar=2"),
     {error,urldecode,"&bar"} =
         uri_string:dissect_query("foo1&bar=2"),
-- 
cgit v1.2.3


From 642bb27f8104991445a1f507f6b065d3cd7cd1ae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= 
Date: Tue, 24 Oct 2017 09:17:55 +0200
Subject: stdlib: Fix title in uri_string.xml

---
 lib/stdlib/doc/src/uri_string.xml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib')

diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml
index 97b38ea93e..d67c687fd1 100644
--- a/lib/stdlib/doc/src/uri_string.xml
+++ b/lib/stdlib/doc/src/uri_string.xml
@@ -21,10 +21,10 @@
       limitations under the License.
     
 
-    maps
+    uri_string
     Péter Dimitrov
     1
-    2017-10-20
+    2017-10-24
     A
   
   uri_string
-- 
cgit v1.2.3


From 3c80849dc9167018a66542b76b441e675d404a78 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= 
Date: Tue, 24 Oct 2017 13:19:37 +0200
Subject: stdlib: Refactor parsed binary calculation

---
 lib/stdlib/src/uri_string.erl | 220 +++++++++++++-----------------------------
 1 file changed, 65 insertions(+), 155 deletions(-)

(limited to 'lib')

diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl
index a4fd9c66f4..684087b870 100644
--- a/lib/stdlib/src/uri_string.erl
+++ b/lib/stdlib/src/uri_string.erl
@@ -493,7 +493,7 @@ parse_relative_part(?STRING_REST("//", Rest), URI) ->
     catch
         throw:{_,_,_} ->
             {T, URI1} = parse_host(Rest, URI),
-            Host = calculate_parsed_part_sl(Rest, T),
+            Host = calculate_parsed_host_port(Rest, T),
             URI2 = maybe_add_path(URI1),
             URI2#{host => decode_host(remove_brackets(Host))}
     end;
@@ -503,12 +503,12 @@ parse_relative_part(?STRING_REST($/, Rest), URI) ->
     URI1#{path => decode_path(?STRING_REST($/, Path))};
 parse_relative_part(?STRING_REST($?, Rest), URI) ->
     {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     URI2 = maybe_add_path(URI1),
     URI2#{query => decode_query(Query)};
 parse_relative_part(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),  % path-empty
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     URI2 = maybe_add_path(URI1),
     URI2#{fragment => decode_fragment(Fragment)};
 parse_relative_part(?STRING_REST(Char, Rest), URI) ->
@@ -558,11 +558,11 @@ parse_segment(?STRING_REST($/, Rest), URI) ->
     parse_segment(Rest, URI);  % segment
 parse_segment(?STRING_REST($?, Rest), URI) ->
     {T, URI1} = parse_query(Rest, URI),  % ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{query => decode_query(Query)}};
 parse_segment(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_segment(?STRING_REST(Char, Rest), URI) ->
     case is_pchar(Char) of
@@ -581,11 +581,11 @@ parse_segment_nz_nc(?STRING_REST($/, Rest), URI) ->
     parse_segment(Rest, URI);  % segment
 parse_segment_nz_nc(?STRING_REST($?, Rest), URI) ->
     {T, URI1} = parse_query(Rest, URI),  % ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{query => decode_query(Query)}};
 parse_segment_nz_nc(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) ->
     case is_segment_nz_nc(Char) of
@@ -679,7 +679,7 @@ parse_hier(?STRING_REST("//", Rest), URI) ->
     catch
         throw:{_,_,_} ->
             {T, URI1} = parse_host(Rest, URI),
-            Host = calculate_parsed_part_sl(Rest, T),
+            Host = calculate_parsed_host_port(Rest, T),
 	    {Rest, URI1#{host => decode_host(remove_brackets(Host))}}
     end;
 parse_hier(?STRING_REST($/, Rest), URI) ->
@@ -688,11 +688,11 @@ parse_hier(?STRING_REST($/, Rest), URI) ->
     {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}};
 parse_hier(?STRING_REST($?, Rest), URI) ->
     {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{query => decode_query(Query)}};
 parse_hier(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),  % path-empty
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_hier(?STRING_REST(Char, Rest), URI) ->  % path-rootless
     case is_pchar(Char) of
@@ -735,7 +735,7 @@ parse_userinfo(?CHAR($@), URI) ->
     {?STRING_EMPTY, URI#{host => <<>>}};
 parse_userinfo(?STRING_REST($@, Rest), URI) ->
     {T, URI1} = parse_host(Rest, URI),
-    Host = calculate_parsed_host(Rest, T),
+    Host = calculate_parsed_host_port(Rest, T),
     {Rest, URI1#{host => decode_host(remove_brackets(Host))}};
 parse_userinfo(?STRING_REST(Char, Rest), URI) ->
     case is_userinfo(Char) of
@@ -796,7 +796,7 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char).
 -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}.
 parse_host(?STRING_REST($:, Rest), URI) ->
     {T, URI1} = parse_port(Rest, URI),
-    H = calculate_parsed_port(Rest, T),
+    H = calculate_parsed_host_port(Rest, T),
     Port = binary_to_integer(H),
     {Rest, URI1#{port => Port}};
 parse_host(?STRING_REST($/, Rest), URI) ->
@@ -805,13 +805,13 @@ parse_host(?STRING_REST($/, Rest), URI) ->
     {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}};
 parse_host(?STRING_REST($?, Rest), URI) ->
     {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{query => decode_query(Query)}};
 parse_host(?STRING_REST($[, Rest), URI) ->
     parse_ipv6_bin(Rest, [], URI);
 parse_host(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),  % path-empty
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_host(?STRING_REST(Char, Rest), URI) ->
     case is_digit(Char) of
@@ -825,7 +825,7 @@ parse_host(?STRING_EMPTY, URI) ->
 -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}.
 parse_reg_name(?STRING_REST($:, Rest), URI) ->
     {T, URI1} = parse_port(Rest, URI),
-    H = calculate_parsed_port(Rest, T),
+    H = calculate_parsed_host_port(Rest, T),
     Port = binary_to_integer(H),
     {Rest, URI1#{port => Port}};
 parse_reg_name(?STRING_REST($/, Rest), URI) ->
@@ -834,11 +834,11 @@ parse_reg_name(?STRING_REST($/, Rest), URI) ->
     {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}};
 parse_reg_name(?STRING_REST($?, Rest), URI) ->
     {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{query => decode_query(Query)}};
 parse_reg_name(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),  % path-empty
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_reg_name(?STRING_REST(Char, Rest), URI) ->
     case is_reg_name(Char) of
@@ -858,7 +858,7 @@ is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char).
 parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) ->
     _ = validate_ipv4_address(lists:reverse(Acc)),
     {T, URI1} = parse_port(Rest, URI),
-    H = calculate_parsed_port(Rest, T),
+    H = calculate_parsed_host_port(Rest, T),
     Port = binary_to_integer(H),
     {Rest, URI1#{port => Port}};
 parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) ->
@@ -869,12 +869,12 @@ parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) ->
 parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) ->
     _ = validate_ipv4_address(lists:reverse(Acc)),
     {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{query => decode_query(Query)}};
 parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) ->
     _ = validate_ipv4_address(lists:reverse(Acc)),
     {T, URI1} = parse_fragment(Rest, URI),  % path-empty
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) ->
     case is_ipv4(Char) of
@@ -921,7 +921,7 @@ is_ipv6(Char) -> is_hex_digit(Char).
 -spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}.
 parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) ->
     {T, URI1} = parse_port(Rest, URI),
-    H = calculate_parsed_port(Rest, T),
+    H = calculate_parsed_host_port(Rest, T),
     Port = binary_to_integer(H),
     {Rest, URI1#{port => Port}};
 parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) ->
@@ -930,11 +930,11 @@ parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) ->
     {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}};
 parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) ->
     {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{query => decode_query(Query)}};
 parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),  % path-empty
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) ->
     case is_ipv6(Char) of
@@ -968,11 +968,11 @@ parse_port(?STRING_REST($/, Rest), URI) ->
     {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}};
 parse_port(?STRING_REST($?, Rest), URI) ->
     {T, URI1} = parse_query(Rest, URI),  % path-empty ?query
-    Query = calculate_parsed_query(Rest, T),
+    Query = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{query => decode_query(Query)}};
 parse_port(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),  % path-empty
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_port(?STRING_REST(Char, Rest), URI) ->
     case is_digit(Char) of
@@ -998,7 +998,7 @@ parse_port(?STRING_EMPTY, URI) ->
 -spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}.
 parse_query(?STRING_REST($#, Rest), URI) ->
     {T, URI1} = parse_fragment(Rest, URI),
-    Fragment = calculate_parsed_fragment(Rest, T),
+    Fragment = calculate_parsed_query_fragment(Rest, T),
     {Rest, URI1#{fragment => decode_fragment(Fragment)}};
 parse_query(?STRING_REST(Char, Rest), URI) ->
     case is_query(Char) of
@@ -1115,144 +1115,77 @@ remove_brackets(Addr) -> Addr.
 %%-------------------------------------------------------------------------
 %% Helper functions for calculating the parsed binary.
 %%-------------------------------------------------------------------------
+-spec calculate_parsed_scheme(binary(), binary()) -> binary().
+calculate_parsed_scheme(Input, <<>>) ->
+    strip_last_char(Input, [$:]);
+calculate_parsed_scheme(Input, Unparsed) ->
+    get_parsed_binary(Input, Unparsed).
+
 
-%% Returns the parsed binary based on Input and the Unparsed part.
-%% Handles the following special cases:
-%%
-%%    #{host => [],path => "/",query => []} = uri_string:parse("///?")
-%%    #{fragment => [],host => [],path => "/"} = uri_string:parse("///#")
-%%
 -spec calculate_parsed_part(binary(), binary()) -> binary().
-calculate_parsed_part(<<$?>>, _) -> <<>>;
-calculate_parsed_part(<<$#>>, _) -> <<>>;
-calculate_parsed_part(<<>>, _) -> <<>>;
 calculate_parsed_part(Input, <<>>) ->
-    case binary:last(Input) of
-        $? ->
-            init_binary(Input);
-        $# ->
-            init_binary(Input);
-        _Else ->
-            Input
-    end;
+    strip_last_char(Input, [$?,$#]);
 calculate_parsed_part(Input, Unparsed) ->
-    {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)),
-    First.
+    get_parsed_binary(Input, Unparsed).
 
 
 -spec calculate_parsed_userinfo(binary(), binary()) -> binary().
-calculate_parsed_userinfo(<<$?>>, _) -> <<>>;
-calculate_parsed_userinfo(<<$#>>, _) -> <<>>;
-calculate_parsed_userinfo(<<>>, _) -> <<>>;
 calculate_parsed_userinfo(Input, <<>>) ->
-    case binary:last(Input) of
-        $? ->
-            init_binary(Input);
-        $# ->
-            init_binary(Input);
-        $@ ->
-            init_binary(Input);
-        _Else ->
-            Input
-    end;
+    strip_last_char(Input, [$?,$#,$@]);
 calculate_parsed_userinfo(Input, Unparsed) ->
-    {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)),
-    First.
+    get_parsed_binary(Input, Unparsed).
+
+
+-spec calculate_parsed_host_port(binary(), binary()) -> binary().
+calculate_parsed_host_port(Input, <<>>) ->
+    strip_last_char(Input, [$?,$#,$/]);
+calculate_parsed_host_port(Input, Unparsed) ->
+    get_parsed_binary(Input, Unparsed).
+
 
+calculate_parsed_query_fragment(Input, <<>>) ->
+    strip_last_char(Input, [$#]);
+calculate_parsed_query_fragment(Input, Unparsed) ->
+    get_parsed_binary(Input, Unparsed).
 
--spec calculate_parsed_host(binary(), binary()) -> binary().
-calculate_parsed_host(<<$?>>, _) -> <<>>;
-calculate_parsed_host(<<$#>>, _) -> <<>>;
-calculate_parsed_host(<<>>, _) -> <<>>;
-calculate_parsed_host(Input, <<>>) ->
+
+%% Strip last char if it is in list
+strip_last_char(<<>>, _) -> <<>>;
+strip_last_char(Input, [C0]) ->
     case binary:last(Input) of
-        $? ->
-            init_binary(Input);
-        $# ->
-            init_binary(Input);
-        $/ ->
+        C0 ->
             init_binary(Input);
         _Else ->
             Input
     end;
-calculate_parsed_host(Input, Unparsed) ->
-    {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)),
-    First.
-
-
--spec calculate_parsed_port(binary(), binary()) -> binary().
-calculate_parsed_port(<<$?>>, _) -> <<>>;
-calculate_parsed_port(<<$#>>, _) -> <<>>;
-calculate_parsed_port(<<>>, _) -> <<>>;
-calculate_parsed_port(Input, <<>>) ->
+strip_last_char(Input, [C0,C1]) ->
     case binary:last(Input) of
-        $? ->
-            init_binary(Input);
-        $# ->
+        C0 ->
             init_binary(Input);
-        $/ ->
+        C1 ->
             init_binary(Input);
         _Else ->
             Input
     end;
-calculate_parsed_port(Input, Unparsed) ->
-    {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)),
-    First.
-
-
-calculate_parsed_query(<<$#>>, _) -> <<>>;
-calculate_parsed_query(<<>>, _) -> <<>>;
-calculate_parsed_query(Input, <<>>) ->
+strip_last_char(Input, [C0,C1,C2]) ->
     case binary:last(Input) of
-        $# ->
+        C0 ->
+            init_binary(Input);
+        C1 ->
+            init_binary(Input);
+        C2 ->
             init_binary(Input);
         _Else ->
             Input
-    end;
-calculate_parsed_query(Input, Unparsed) ->
-    {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)),
-    First.
+    end.
 
 
--spec calculate_parsed_fragment(binary(), binary()) -> binary().
-calculate_parsed_fragment(<<$#>>, _) -> <<>>;
-calculate_parsed_fragment(Input, Unparsed) ->
+%% Get parsed binary
+get_parsed_binary(Input, Unparsed) ->
     {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)),
     First.
 
 
-%% Returns the parsed binary based on Input and the Unparsed part.
-%% Used when parsing authority.
-%%
-%% Handles the following special cases:
-%%
-%%    #{host => "foo",query => []} = uri_string:parse("//foo?")
-%%    #{fragment => [],host => "foo"} = uri_string:parse("//foo#")
-%%    #{host => "foo",path => "/"} = uri_string:parse("//foo/")
-%%    #{host => "foo",query => [],scheme => "http"} = uri_string:parse("http://foo?")
-%%    #{fragment => [],host => "foo",scheme => "http"} = uri_string:parse("http://foo#")
-%%    #{host => "foo",path => "/",scheme => "http"} = uri_string:parse("http://foo/")
-%%
--spec calculate_parsed_part_sl(binary(), binary()) -> binary().
-calculate_parsed_part_sl(<<$?>>, _) -> <<>>;
-calculate_parsed_part_sl(<<$#>>, _) -> <<>>;
-calculate_parsed_part_sl(<<>>, _) -> <<>>;
-calculate_parsed_part_sl(Input, <<>>) ->
-    case binary:last(Input) of
-        $? ->
-            init_binary(Input);
-        $# ->
-            init_binary(Input);
-        $/ ->
-            init_binary(Input);
-        _Else ->
-            Input
-    end;
-calculate_parsed_part_sl(Input, Unparsed) ->
-    {First, _} =
-        split_binary(Input, byte_size_exl_single_slash(Input) - byte_size_exl_head(Unparsed)),
-    First.
-
 %% Return all bytes of the binary except the last one. The binary must be non-empty.
 init_binary(B) ->
     {Init, _} =
@@ -1260,14 +1193,6 @@ init_binary(B) ->
     Init.
 
 
-%% Returns the parsed binary based on Input and the Unparsed part.
-%% Used when parsing scheme.
--spec calculate_parsed_scheme(binary(), binary()) -> binary().
-calculate_parsed_scheme(Input, Unparsed) ->
-    {First, _} = split_binary(Input, byte_size(Input) - byte_size(Unparsed) - 1),
-    First.
-
-
 %% Returns the size of a binary exluding the first element.
 %% Used in calls to split_binary().
 -spec byte_size_exl_head(binary()) -> number().
@@ -1275,21 +1200,6 @@ byte_size_exl_head(<<>>) -> 0;
 byte_size_exl_head(Binary) -> byte_size(Binary) + 1.
 
 
-%% Returns size of 'Rest' for proper calculation of splitting position.
-%% Solves the following special case:
-%%
-%%    #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>).
-%%
-%% While keeping the following true:
-%%
-%%     #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>).
-%%     #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>).
-%%
--spec byte_size_exl_single_slash(uri_string()) -> number().
-byte_size_exl_single_slash(<<$/>>) -> 0;
-byte_size_exl_single_slash(Rest) -> byte_size(Rest).
-
-
 %%-------------------------------------------------------------------------
 %% [RFC 3986, Chapter 2.1.  Percent-Encoding]
 %%
-- 
cgit v1.2.3


From 992cda82f16ee23b0114563858d5a082711f659b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= 
Date: Wed, 25 Oct 2017 10:11:14 +0200
Subject: stdlib: Refactor compose_query

---
 lib/stdlib/src/uri_string.erl        | 73 +++++++++++++++++++-----------------
 lib/stdlib/test/uri_string_SUITE.erl |  9 ++++-
 2 files changed, 45 insertions(+), 37 deletions(-)

(limited to 'lib')

diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl
index 684087b870..2bf7ceaff1 100644
--- a/lib/stdlib/src/uri_string.erl
+++ b/lib/stdlib/src/uri_string.erl
@@ -396,19 +396,24 @@ compose_query(List) ->
 compose_query([],_Options) ->
     [];
 compose_query(List, Options) ->
-    try compose_query(List, Options, []) of
+    try compose_query(List, Options, false, <<>>) of
         Result -> Result
     catch
       throw:{error, Atom, RestData} -> {error, Atom, RestData}
     end.
 %%
-compose_query([{Key,Value}|Rest], Options, Acc) ->
-    Separator = get_separator(Options, Acc),
+compose_query([{Key,Value}|Rest], Options, IsList, Acc) ->
+    Separator = get_separator(Options, Rest),
     K = form_urlencode(Key),
     V = form_urlencode(Value),
-    compose_query(Rest, Options, Acc ++ Separator ++ K ++ "=" ++ V);
-compose_query([], _Options, Acc) ->
-    Acc.
+    Flag = is_list(Key) orelse is_list(Value),
+    IsListNew = IsList orelse Flag,
+    compose_query(Rest, Options, IsListNew, <
-    "&";
-get_separator([{separator, escaped_amp}], _Acc) ->
-    "&";
-get_separator([{separator, semicolon}], _Acc) ->
-    ";".
+get_separator(_, L) when length(L) =:= 0 ->
+    <<>>;
+get_separator([], _L) ->
+    <<"&">>;
+get_separator([{separator, amp}], _L) ->
+    <<"&">>;
+get_separator([{separator, escaped_amp}], _L) ->
+    <<"&">>;
+get_separator([{separator, semicolon}], _L) ->
+    <<";">>.
 
 
 %% Form-urlencode input based on RFC 1866 [8.2.1]
-form_urlencode(Cs) when is_binary(Cs) ->
-    L = convert_list(Cs, utf8),
-    form_urlencode(L, []);
+form_urlencode(Cs) when is_list(Cs) ->
+    B = convert_binary(Cs, utf8, utf8),
+    form_urlencode(B, <<>>);
 form_urlencode(Cs) ->
-    L = flatten_list(Cs, utf8),
-    form_urlencode(L, []).
+    form_urlencode(Cs, <<>>).
 %%
-form_urlencode([], Acc) ->
-    lists:reverse(Acc);
-form_urlencode([$ |T], Acc) ->
-    form_urlencode(T, [$+|Acc]);
-form_urlencode([H|T], Acc) ->
+form_urlencode(<<>>, Acc) ->
+    Acc;
+form_urlencode(<<$ ,T/binary>>, Acc) ->
+    form_urlencode(T, <>);
+form_urlencode(<>, Acc) ->
     case is_url_char(H) of
         true ->
-            form_urlencode(T, [H|Acc]);
+            form_urlencode(T, <>);
         false ->
-            E = urlencode_char(H),
-            form_urlencode(T, lists:reverse(E) ++ Acc)
-    end.
-
-
-urlencode_char(C) ->
-    B = percent_encode_binary(C),
-    unicode:characters_to_list(B).
+            E = percent_encode_binary(H),
+            form_urlencode(T, <>)
+    end;
+form_urlencode(<>, _Acc) ->
+    throw({error,invalid_utf8,<>});
+form_urlencode(H, _Acc) ->
+    throw({error,badarg, H}).
 
 
 %% Return true if input char can appear in URL according to
diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl
index fe832ac82c..c230f7c631 100644
--- a/lib/stdlib/test/uri_string_SUITE.erl
+++ b/lib/stdlib/test/uri_string_SUITE.erl
@@ -830,10 +830,15 @@ compose_query(_Config) ->
     "foo=1&bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,escaped_amp}]),
     "foo=1&bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,amp}]),
     "foo=1;bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,semicolon}]),
-    "foo+bar=1&%C3%B6=2" = uri_string:compose_query([{"foo bar","1"}, {"ö", "2"}]).
+    "foo+bar=1&%C3%B6=2" = uri_string:compose_query([{"foo bar","1"}, {"ö", "2"}]),
+    "foo+bar=1&%C3%B6=2" = uri_string:compose_query([{<<"foo bar">>,<<"1">>}, {"ö", <<"2">>}]),
+    <<"foo+bar=1&%C3%B6=2">> =
+        uri_string:compose_query([{<<"foo bar">>,<<"1">>}, {<<"ö"/utf8>>, <<"2">>}]).
 
 compose_query_negative(_Config) ->
-    {error,badarg,4} = uri_string:compose_query([{"",4}]).
+    {error,badarg,4} = uri_string:compose_query([{"",4}]),
+    {error,badarg,5} = uri_string:compose_query([{5,""}]),
+    {error,invalid_utf8,<<"ö">>} = uri_string:compose_query([{"foo bar","1"}, {<<"ö">>, "2"}]).
 
 dissect_query(_Config) ->
     [] = uri_string:dissect_query(""),
-- 
cgit v1.2.3


From eba3d3e5e9b08839dafcb2e8adc6620d9211d96c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= 
Date: Wed, 25 Oct 2017 14:43:45 +0200
Subject: stdlib: Refactor dissect_query

---
 lib/stdlib/src/uri_string.erl        | 91 ++++++++++++++++++------------------
 lib/stdlib/test/uri_string_SUITE.erl | 18 +++++--
 2 files changed, 59 insertions(+), 50 deletions(-)

(limited to 'lib')

diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl
index 2bf7ceaff1..09bf4aef1d 100644
--- a/lib/stdlib/src/uri_string.erl
+++ b/lib/stdlib/src/uri_string.erl
@@ -423,18 +423,21 @@ compose_query([], _Options, IsList, Acc) ->
       QueryString :: uri_string(),
       QueryList :: [{string(), string()}]
                  | {error, atom(), list() | binary()}.
+dissect_query(<<>>) ->
+    [];
 dissect_query([]) ->
     [];
-dissect_query(QueryString) when is_binary(QueryString) ->
-    L = convert_list(QueryString, utf8),
-    try dissect_query_key(L, [], [], []) of
+dissect_query(QueryString) when is_list(QueryString) ->
+    try
+        B = convert_binary(QueryString, utf8, utf8),
+        dissect_query_key(B, true, [], <<>>, <<>>)
+    of
         Result -> Result
     catch
         throw:{error, Atom, RestData} -> {error, Atom, RestData}
     end;
 dissect_query(QueryString) ->
-    L = flatten_list(QueryString, utf8),
-    try dissect_query_key(L, [], [], []) of
+    try dissect_query_key(QueryString, false, [], <<>>, <<>>) of
         Result -> Result
     catch
         throw:{error, Atom, RestData} -> {error, Atom, RestData}
@@ -1706,7 +1709,6 @@ flatten_list(Arg, _, _) ->
     throw({error, badarg, Arg}).
 
 
-
 percent_encode_segment(Segment) ->
     percent_encode_binary(Segment, <<>>).
 
@@ -1790,50 +1792,48 @@ is_unsafe(_) -> false.
 %%-------------------------------------------------------------------------
 %% Helper functions for dissect_query
 %%-------------------------------------------------------------------------
-dissect_query_key([$=|T], Acc, Key, Value) ->
-    dissect_query_value(T, Acc, Key, Value);
-dissect_query_key([H|T], Acc, Key, Value) ->
-    dissect_query_key(T, Acc, [H|Key], Value);
-dissect_query_key(L, _, _, _) ->
-    throw({error, missing_value, L}).
-
-
-dissect_query_value([$&|_] = L, Acc, Key, Value) ->
-    K = form_urldecode(lists:reverse(Key)),
-    V = form_urldecode(lists:reverse(Value)),
-    dissect_query_separator_amp(L, [{K,V}|Acc], [], []);
-dissect_query_value([$;|_] = L, Acc, Key, Value) ->
-    K = form_urldecode(lists:reverse(Key)),
-    V = form_urldecode(lists:reverse(Value)),
-    dissect_query_separator_semicolon(L, [{K,V}|Acc], [], []);
-dissect_query_value([H|T], Acc, Key, Value) ->
-    dissect_query_value(T, Acc, Key, [H|Value]);
-dissect_query_value([], Acc, Key, Value) ->
-    K = form_urldecode(lists:reverse(Key)),
-    V = form_urldecode(lists:reverse(Value)),
+dissect_query_key(<<$=,T/binary>>, IsList, Acc, Key, Value) ->
+    dissect_query_value(T, IsList, Acc, Key, Value);
+dissect_query_key(<>, IsList, Acc, Key, Value) ->
+    dissect_query_key(T, IsList, Acc, <>, Value);
+dissect_query_key(B, _, _, _, _) ->
+    throw({error, missing_value, B}).
+
+
+dissect_query_value(<<$&,_/binary>> = B, IsList, Acc, Key, Value) ->
+    K = form_urldecode(IsList, Key),
+    V = form_urldecode(IsList, Value),
+    dissect_query_separator_amp(B, IsList, [{K,V}|Acc], <<>>, <<>>);
+dissect_query_value(<<$;,_/binary>> = B, IsList, Acc, Key, Value) ->
+    K = form_urldecode(IsList, Key),
+    V = form_urldecode(IsList, Value),
+    dissect_query_separator_semicolon(B, IsList, [{K,V}|Acc], <<>>, <<>>);
+dissect_query_value(<>, IsList, Acc, Key, Value) ->
+    dissect_query_value(T, IsList, Acc, Key, <>);
+dissect_query_value(<<>>, IsList, Acc, Key, Value) ->
+    K = form_urldecode(IsList, Key),
+    V = form_urldecode(IsList, Value),
     lists:reverse([{K,V}|Acc]).
 
 
-dissect_query_separator_amp("&" ++ T, Acc, Key, Value) ->
-    dissect_query_key(T, Acc, Key, Value);
-dissect_query_separator_amp("&" ++ T, Acc, Key, Value) ->
-    dissect_query_key(T, Acc, Key, Value);
-dissect_query_separator_amp(L, _, _, _) ->
-    throw({error, invalid_separator, L}).
+dissect_query_separator_amp(<<"&",T/binary>>, IsList, Acc, Key, Value) ->
+    dissect_query_key(T, IsList, Acc, Key, Value);
+dissect_query_separator_amp(<<$&,T/binary>>, IsList, Acc, Key, Value) ->
+    dissect_query_key(T, IsList, Acc, Key, Value).
 
 
-dissect_query_separator_semicolon([$;|T], Acc, Key, Value) ->
-    dissect_query_key(T, Acc, Key, Value).
+dissect_query_separator_semicolon(<<$;,T/binary>>, IsList, Acc, Key, Value) ->
+    dissect_query_key(T, IsList, Acc, Key, Value).
 
 
 %% Form-urldecode input based on RFC 1866 [8.2.1]
-form_urldecode(Cs) ->
-    B = convert_binary(Cs, utf8, utf8),
+form_urldecode(true, B) ->
     Result = form_urldecode(B, <<>>),
-    convert_list(Result, utf8).
-%%
+    convert_list(Result, utf8);
+form_urldecode(false, B) ->
+    form_urldecode(B, <<>>);
 form_urldecode(<<>>, Acc) ->
-    convert_list(Acc, utf8);
+    Acc;
 form_urldecode(<<$+,T/binary>>, Acc) ->
     form_urldecode(T, <>);
 form_urldecode(<<$%,C0,C1,T/binary>>, Acc) ->
@@ -1843,13 +1843,14 @@ form_urldecode(<<$%,C0,C1,T/binary>>, Acc) ->
             form_urldecode(T, <>);
         false ->
             L = convert_list(<<$%,C0,C1,T/binary>>, utf8),
-            throw({error, urldecode, L})
+            throw({error, invalid_percent_encoding, L})
     end;
-form_urldecode(<>, Acc) ->
+form_urldecode(<>, Acc) ->
     case is_url_char(H) of
         true ->
             form_urldecode(T, <>);
         false ->
-            L = convert_list(<>, utf8),
-            throw({error, urldecode, L})
-    end.
+            throw({error, invalid_character, [H]})
+    end;
+form_urldecode(<>, _Acc) ->
+    throw({error, invalid_character, [H]}).
diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl
index c230f7c631..2fc4e1a092 100644
--- a/lib/stdlib/test/uri_string_SUITE.erl
+++ b/lib/stdlib/test/uri_string_SUITE.erl
@@ -846,13 +846,21 @@ dissect_query(_Config) ->
     [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1&bar=2"),
     [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1;bar=2"),
     [{"foo","1"}, {"bar", "222"}] = uri_string:dissect_query([<<"foo=1;bar=2">>,"22"]),
-    [{"foo","ö"}, {"bar", "2"}] = uri_string:dissect_query("foo=%C3%B6&bar=2").
+    [{"foo","ö"}, {"bar", "2"}] = uri_string:dissect_query("foo=%C3%B6&bar=2"),
+    [{<<"foo">>,<<"ö"/utf8>>}, {<<"bar">>, <<"2">>}] =
+        uri_string:dissect_query(<<"foo=%C3%B6&bar=2">>),
+    [{"foo bar","1"},{"ö","2"}] =
+        uri_string:dissect_query([<<"foo+bar=1&">>,<<"%C3%B6=2">>]).
 
 dissect_query_negative(_Config) ->
-    {error,urldecode,";bar"} =
+    {error,invalid_character,";"} =
         uri_string:dissect_query("foo=1≈bar=2"),
-    {error,urldecode,"&bar"} =
+    {error,invalid_character,"&"} =
         uri_string:dissect_query("foo1&bar=2"),
-    {error,urldecode,"%XX%B6"} = uri_string:dissect_query("foo=%XX%B6&bar=2"),
+    {error,invalid_percent_encoding,"%XX%B6"} = uri_string:dissect_query("foo=%XX%B6&bar=2"),
     {error,invalid_input,<<153,182>>} =
-        uri_string:dissect_query("foo=%99%B6&bar=2").
+        uri_string:dissect_query("foo=%99%B6&bar=2"),
+    {error,invalid_character,"ö"} = uri_string:dissect_query("föo+bar=1&%C3%B6=2"),
+    {error,invalid_character,"ö"} = uri_string:dissect_query(<<"föo+bar=1&%C3%B6=2">>),
+    {error,invalid_input,<<"ö">>} =
+        uri_string:dissect_query([<<"foo+bar=1&">>,<<"%C3%B6=2ö">>]).
-- 
cgit v1.2.3


From b0c682a8118c5775da784e9a0f569ee995319f80 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= 
Date: Thu, 26 Oct 2017 11:29:48 +0200
Subject: stdlib: Update documentation, error tuples

---
 lib/stdlib/doc/src/uri_string.xml    | 117 +++++++++++++++++++++++++----------
 lib/stdlib/src/uri_string.erl        |  44 ++++++-------
 lib/stdlib/test/uri_string_SUITE.erl |   2 +-
 3 files changed, 109 insertions(+), 54 deletions(-)

(limited to 'lib')

diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml
index d67c687fd1..8322eecb24 100644
--- a/lib/stdlib/doc/src/uri_string.xml
+++ b/lib/stdlib/doc/src/uri_string.xml
@@ -30,10 +30,13 @@
   uri_string
   URI processing functions.
   
-    

This module contains functions for parsing and handling URIs (RFC 3986) and - form-urlencoded query strings (RFC 1866).

+

This module contains functions for parsing and handling URIs + (RFC 3986) and + form-urlencoded query strings (RFC 1866). +

A URI is an identifier consisting of a sequence of characters matching the syntax - rule named URI in RFC 3986.

+ rule named URI in RFC 3986. +

The generic URI syntax consists of a hierarchical sequence of components referred to as the scheme, authority, path, query, and fragment:

@@ -55,16 +58,24 @@
     


The interpretation of a URI depends only on the characters used and not on how those characters are represented in a network protocol.

-

The functions implemented by this module covers the following use cases:

+

The functions implemented by this module cover the following use cases:

- Parsing URIs

- parse/1
- Recomposing URIs

- recompose/2
- Transcoding URIs

- transcode/2
- Working with form-urlencoded query strings

- compose_query/[1,2], dissect_query/1
+ Parsing URIs into their components and returning a map

+ parse/1 +
+ Recomposing a map of URI components into a URI string

+ recompose/1 +
+ Changing inbound binary and percent-encoding of URIs

+ transcode/2 +
+ Composing form-urlencoded query strings from a list of key-value pairs

+ compose_query/1

+ compose_query/2 +
+ Dissecting form-urlencoded query strings into a list of key-value pairs

+ dissect_query/1 +

There are four different encodings present during the handling of URIs:

@@ -75,14 +86,29 @@

Unless otherwise specified the return value type and encoding are the same as the input type and encoding. That is, binary input returns binary output, list input returns a list - output but mixed input returns list output. Input and output encodings are the same except - for transcode/2.

+ output but mixed input returns list output.

All of the functions but transcode/2 expects input as unicode codepoints in lists, UTF-8 encoding in binaries and UTF-8 encoding in percent-encoded URI parts. transcode/2 provides the means to convert between the supported URI encodings.

+ + + +

Error tuple indicating the type of error. Possible values of the second component:

+ + invalid_character + invalid_input + invalid_map + invalid_percent_encoding + invalid_scheme + invalid_uri + invalid_utf8 + missing_value + +
+
@@ -93,7 +119,8 @@

List of unicode codepoints, UTF-8 encoded binary, or a mix of the two, - representing an RFC 3986 compliant URI (percent-encoded form). + representing an RFC 3986 + compliant URI (percent-encoded form). A URI is a sequence of characters from a very limited set: the letters of the basic Latin alphabet, digits, and a few special characters.

@@ -109,13 +136,21 @@

Composes a form-urlencoded QueryString based on a QueryList, a list of unescaped key-value pairs. Media type application/x-www-form-urlencoded is defined in section - 8.2.1 of RFC 1866 (HTML 2.0). Reserved and unsafe characters, as - defined by RFC 1738 (Uniform Resource Locators), are percent-encoded. + 8.2.1 of RFC 1866 + (HTML 2.0). Reserved and unsafe characters, as + defined by RFC 1738 + (Uniform Resource Locators), are percent-encoded.

+

See also the opposite operation + dissect_query/1.

Example:

-1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}]).
-
+1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}],
+1> [{separator, semicolon}]).
+"foo+bar=1;city=%C3%B6rebro"
+2> >,<<"1">>},
+2> {<<"city">>,<<"örebro"/utf8>>}]).]]>
+>]]>
 	
@@ -127,11 +162,14 @@

Same as compose_query/1 but with an additional Options parameter, that controls the type of separator used between key-value pairs. There are three supported separator types: amp (&), escaped_amp (&amp;) and semicolon (;). If the parameter Options is empty, separator takes the default value (escaped_amp).

+

See also the opposite operation + dissect_query/1. +

Example:

 1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}],
-2> [{separator, semicolon}]).
-"foo+bar=1;city=%C3%B6rebro"
+1> [{separator, amp}]).
+
 	
@@ -143,13 +181,19 @@

Dissects an urlencoded QueryString and returns a QueryList, a list of unescaped key-value pairs. Media type application/x-www-form-urlencoded is defined in section - 8.2.1 of RFC 1866 (HTML 2.0). Percent-encoded segments are decoded - as defined by RFC 1738 (Uniform Resource Locators). + 8.2.1 of RFC 1866 + (HTML 2.0). Percent-encoded segments are decoded + as defined by RFC 1738 + (Uniform Resource Locators).

+

See also the opposite operation + compose_query/1.

Example:

 1> uri_string:dissect_query("foo+bar=1;city=%C3%B6rebro").
 [{"foo bar","1"},{"city","örebro"}]
+2> >).]]>
+>,<<"1">>},{<<"city">>,<<"örebro"/utf8>>}] ]]>
 	
@@ -159,14 +203,19 @@ Parse URI into a map.

Returns a URIMap, that is a uri_map() with the parsed components - of the URIString.

-

If parsing fails, an error tuple is returned.

+ of the URIString. If parsing fails, an error tuple is returned.

+

See also the opposite operation + recompose/1.

Example:

 1> uri_string:parse("foo://user@example.com:8042/over/there?name=ferret#nose").
 #{fragment => "nose",host => "example.com",
   path => "/over/there",port => 8042,query => "name=ferret",
   scheme => "foo",userinfo => "user"}
+2> >).]]>
+ <<"example.com">>,path => <<"/over/there">>,
+  port => 8042,query => <<"name=ferret">>,scheme => <<"foo">>,
+  userinfo => <<"user">>}]]>
 	
@@ -175,12 +224,15 @@ Recompose URI. -

Returns an RFC 3986 compliant URIString (percent-encoded).

-

If the URIMap is invalid, an error tuple is returned.

+

Returns an RFC 3986 compliant + URIString (percent-encoded). + If the URIMap is invalid, an error tuple is returned.

+

See also the opposite operation + parse/1.

Example:

 1> URIMap = #{fragment => "nose", host => "example.com", path => "/over/there",
-port => 8042, query => "name=ferret", scheme => "foo", userinfo => "user"}.
+1> port => 8042, query => "name=ferret", scheme => "foo", userinfo => "user"}.
 #{fragment => "nose",host => "example.com",
   path => "/over/there",port => 8042,query => "name=ferret",
   scheme => "foo",userinfo => "user"}
@@ -194,14 +246,15 @@ port => 8042, query => "name=ferret", scheme => "foo", userinfo => "user"}.
       
       Transcode URI.
       
-        

Transcodes an RFC 3986 compliant URIString, +

Transcodes an RFC 3986 + compliant URIString, where Options is a list of tagged tuples, specifying the inbound - (in_encoding) and outbound (out_encoding) encodings.

-

If an argument is invalid, an error tuple is returned.

+ (in_encoding) and outbound (out_encoding) encodings. + If an argument is invalid, an error tuple is returned.

Example:

 1> >,]]>
-2> [{in_encoding, utf32},{out_encoding, utf8}]).
+1> [{in_encoding, utf32},{out_encoding, utf8}]).
 >]]>
 	
diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 09bf4aef1d..ca212284d2 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -229,7 +229,7 @@ -export([compose_query/1, compose_query/2, dissect_query/1, parse/1, recompose/1, transcode/2]). --export_type([uri_map/0, uri_string/0]). +-export_type([error/0, uri_map/0, uri_string/0]). %%------------------------------------------------------------------------- @@ -273,6 +273,8 @@ %% %x96 ` grave / accent %%------------------------------------------------------------------------- -type uri_string() :: iodata(). +-type error() :: {error, atom(), list() | binary()}. + %%------------------------------------------------------------------------- %% RFC 3986, Chapter 3. Syntax Components @@ -292,7 +294,7 @@ -spec parse(URIString) -> URIMap when URIString :: uri_string(), URIMap :: uri_map() - | {error, atom(), list() | binary()}. + | error(). parse(URIString) when is_binary(URIString) -> try parse_uri_reference(URIString, #{}) of Result -> Result @@ -317,7 +319,7 @@ parse(URIString) when is_list(URIString) -> -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string() - | {error, atom(), list() | binary()}. + | error(). recompose(Map) -> case is_valid_map(Map) of false -> @@ -346,7 +348,7 @@ recompose(Map) -> URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}], Result :: uri_string() - | {error, atom(), list() | binary()}. + | error(). 
transcode(URIString, Options) when is_binary(URIString) -> try InEnc = proplists:get_value(in_encoding, Options, utf8), @@ -357,7 +359,7 @@ transcode(URIString, Options) when is_binary(URIString) -> of Result -> Result catch - throw:{error, _, RestData} -> {error, invalid_input, RestData} + throw:{error, Atom, RestData} -> {error, Atom, RestData} end; transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), @@ -366,7 +368,7 @@ transcode(URIString, Options) when is_list(URIString) -> try transcode(Flattened, [], InEnc, OutEnc) of Result -> Result catch - throw:{error, _, RestData} -> {error, invalid_input, RestData} + throw:{error, Atom, RestData} -> {error, Atom, RestData} end. @@ -382,8 +384,8 @@ transcode(URIString, Options) when is_list(URIString) -> %%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when QueryList :: [{uri_string(), uri_string()}], - QueryString :: string() - | {error, atom(), list() | binary()}. + QueryString :: uri_string() + | error(). compose_query(List) -> compose_query(List, []). @@ -391,8 +393,8 @@ compose_query(List) -> -spec compose_query(QueryList, Options) -> QueryString when QueryList :: [{uri_string(), uri_string()}], Options :: [{separator, atom()}], - QueryString :: string() - | {error, atom(), list() | binary()}. + QueryString :: uri_string() + | error(). compose_query([],_Options) -> []; compose_query(List, Options) -> @@ -421,8 +423,8 @@ compose_query([], _Options, IsList, Acc) -> %%------------------------------------------------------------------------- -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), - QueryList :: [{string(), string()}] - | {error, atom(), list() | binary()}. + QueryList :: [{uri_string(), uri_string()}] + | error(). 
dissect_query(<<>>) -> []; dissect_query([]) -> @@ -1249,9 +1251,9 @@ decode_fragment(Cs) -> check_utf8(Cs) -> case unicode:characters_to_list(Cs) of {incomplete,_,_} -> - throw({error,non_utf8,Cs}); + throw({error,invalid_utf8,Cs}); {error,_,_} -> - throw({error,non_utf8,Cs}); + throw({error,invalid_utf8,Cs}); _ -> Cs end. @@ -1304,12 +1306,12 @@ decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> true -> B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), decode(Cs, Fun, <>); - false -> throw({error,percent_decode,<<$%,C0,C1>>}) + false -> throw({error,invalid_percent_encoding,<<$%,C0,C1>>}) end; decode(<>, Fun, Acc) -> case Fun(C) of true -> decode(Cs, Fun, <>); - false -> throw({error,percent_decode,<>}) + false -> throw({error,invalid_percent_encoding,<>}) end; decode(<<>>, _Fun, Acc) -> Acc. @@ -1339,7 +1341,7 @@ encode(<>, Fun, Acc) -> C = encode_codepoint_binary(Char, Fun), encode(Rest, Fun, <>); encode(<>, _Fun, _Acc) -> - throw({error,percent_encode,<>}); + throw({error,invalid_input,<>}); encode(<<>>, _Fun, Acc) -> Acc. @@ -1647,12 +1649,12 @@ transcode([], Acc, List, _InEncoding, _OutEncoding) -> %% Transcode percent-encoded segment -transcode_pct([$%,C0,C1|Rest], Acc, B, InEncoding, OutEncoding) -> +transcode_pct([$%,C0,C1|Rest] = L, Acc, B, InEncoding, OutEncoding) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> Int = ?HEX2DEC(C0)*16+?HEX2DEC(C1), transcode_pct(Rest, Acc, <>, InEncoding, OutEncoding); - false -> throw({error, lists:reverse(Acc),[C0,C1]}) + false -> throw({error, invalid_percent_encoding,L}) end; transcode_pct([_C|_Rest] = L, Acc, B, InEncoding, OutEncoding) -> OutBinary = convert_binary(B, InEncoding, OutEncoding), @@ -1706,7 +1708,7 @@ flatten_list([H|T], InEnc, Acc) -> flatten_list([], _InEnc, Acc) -> lists:reverse(Acc); flatten_list(Arg, _, _) -> - throw({error, badarg, Arg}). + throw({error, invalid_input, Arg}). 
percent_encode_segment(Segment) -> @@ -1752,7 +1754,7 @@ form_urlencode(<>, Acc) -> form_urlencode(<>, _Acc) -> throw({error,invalid_utf8,<>}); form_urlencode(H, _Acc) -> - throw({error,badarg, H}). + throw({error,invalid_input, H}). %% Return true if input char can appear in URL according to diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 2fc4e1a092..95a49f5eb3 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -819,7 +819,7 @@ transcode_mixed(_Config) -> uri_string:transcode(["foo%00", <<"%00%0"/utf32>>,<<"0%F"/utf32>>,"6bar"], [{in_encoding, utf32},{out_encoding, utf8}]). transcode_negative(_Config) -> - {error,invalid_input,"BX"} = + {error,invalid_percent_encoding,"%BXbar"} = uri_string:transcode(<<"foo%C3%BXbar"/utf8>>, [{in_encoding, utf8},{out_encoding, utf32}]), {error,invalid_input,<<"ö">>} = uri_string:transcode("foo%F6bar", [{in_encoding, utf8},{out_encoding, utf8}]). -- cgit v1.2.3 From 3d12c8f164f79dd67967ba5c7df7d3c555dc0f29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 27 Oct 2017 14:14:22 +0200 Subject: stdlib: Allow undefined port in uri_map() uri_map() updated to allow 'undefined' ports in order to align the implementation with RFC 3986: port = *DIGIT An 'undefined' port is mapped to a ":" during recompose operation. 
--- lib/stdlib/src/uri_string.erl | 39 ++++++++++++++++++---- .../test/property_test/uri_string_recompose.erl | 5 +-- lib/stdlib/test/uri_string_SUITE.erl | 11 +++--- 3 files changed, 43 insertions(+), 12 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index ca212284d2..16650d5005 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -283,7 +283,7 @@ #{fragment => unicode:chardata(), host => unicode:chardata(), path => unicode:chardata(), - port => non_neg_integer(), + port => non_neg_integer() | undefined, query => unicode:chardata(), scheme => unicode:chardata(), userinfo => unicode:chardata()} | #{}. @@ -807,7 +807,7 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), - Port = binary_to_integer(H), + Port = get_port(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty @@ -836,7 +836,7 @@ parse_host(?STRING_EMPTY, URI) -> parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), - Port = binary_to_integer(H), + Port = get_port(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty @@ -869,7 +869,7 @@ parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), - Port = binary_to_integer(H), + Port = get_port(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), @@ -932,7 +932,7 @@ is_ipv6(Char) -> is_hex_digit(Char). 
parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), - Port = binary_to_integer(H), + Port = get_port(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty @@ -1148,7 +1148,7 @@ calculate_parsed_userinfo(Input, Unparsed) -> -spec calculate_parsed_host_port(binary(), binary()) -> binary(). calculate_parsed_host_port(Input, <<>>) -> - strip_last_char(Input, [$?,$#,$/]); + strip_last_char(Input, [$:,$?,$#,$/]); calculate_parsed_host_port(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). @@ -1159,6 +1159,18 @@ calculate_parsed_query_fragment(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). +get_port(<<>>) -> + undefined; +get_port(B) -> + try binary_to_integer(B) of + Port -> + Port + catch + error:badarg -> + throw({error, invalid_uri, B}) + end. + + %% Strip last char if it is in list strip_last_char(<<>>, _) -> <<>>; strip_last_char(Input, [C0]) -> @@ -1187,6 +1199,19 @@ strip_last_char(Input, [C0,C1,C2]) -> init_binary(Input); _Else -> Input + end; +strip_last_char(Input, [C0,C1,C2,C3]) -> + case binary:last(Input) of + C0 -> + init_binary(Input); + C1 -> + init_binary(Input); + C2 -> + init_binary(Input); + C3 -> + init_binary(Input); + _Else -> + Input end. @@ -1530,6 +1555,8 @@ update_host(#{}, URI) -> %% URI cannot be empty for ports. E.g. 
":8080" is not a valid URI +update_port(#{port := undefined}, URI) -> + concat(URI, <<":">>); update_port(#{port := Port}, URI) -> concat(URI,add_colon(encode_port(Port))); update_port(#{}, URI) -> diff --git a/lib/stdlib/test/property_test/uri_string_recompose.erl b/lib/stdlib/test/property_test/uri_string_recompose.erl index 97f9d727a0..e51a671172 100644 --- a/lib/stdlib/test/property_test/uri_string_recompose.erl +++ b/lib/stdlib/test/property_test/uri_string_recompose.erl @@ -267,8 +267,9 @@ host_uri() -> %% Port, Query, Fragment %%------------------------------------------------------------------------- port() -> - range(1,65535). - + frequency([{10, undefined}, + {10, range(1,65535)} + ]). query_map() -> unicode(). diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 95a49f5eb3..9ee321c509 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -692,7 +692,9 @@ parse_special2(_Config) -> #{host := [],path := [],userinfo := []} = uri_string:parse("//@"), #{host := [],path := [],scheme := "foo",userinfo := []} = uri_string:parse("foo://@"), #{host := [],path := "/",userinfo := []} = uri_string:parse("//@/"), - #{host := [],path := "/",scheme := "foo",userinfo := []} = uri_string:parse("foo://@/"). + #{host := [],path := "/",scheme := "foo",userinfo := []} = uri_string:parse("foo://@/"), + #{host := "localhost",path := "/",port := undefined} = uri_string:parse("//localhost:/"), + #{host := [],path := [],port := undefined} = uri_string:parse("//:"). parse_negative(_Config) -> {error,invalid_uri,"å"} = uri_string:parse("å"), @@ -702,7 +704,8 @@ parse_negative(_Config) -> {error,invalid_uri,"ö"} = uri_string:parse("//host/path#foö"), {error,invalid_uri,"127.256.0.1"} = uri_string:parse("//127.256.0.1"), {error,invalid_uri,":::127.0.0.1"} = uri_string:parse("//[:::127.0.0.1]"), - {error,non_utf8,<<0,0,0,246>>} = uri_string:parse("//%00%00%00%F6"). 
+ {error,invalid_utf8,<<0,0,0,246>>} = uri_string:parse("//%00%00%00%F6"), + {error,invalid_uri,"A"} = uri_string:parse("//localhost:A8"). %%------------------------------------------------------------------------- @@ -836,8 +839,8 @@ compose_query(_Config) -> uri_string:compose_query([{<<"foo bar">>,<<"1">>}, {<<"ö"/utf8>>, <<"2">>}]). compose_query_negative(_Config) -> - {error,badarg,4} = uri_string:compose_query([{"",4}]), - {error,badarg,5} = uri_string:compose_query([{5,""}]), + {error,invalid_input,4} = uri_string:compose_query([{"",4}]), + {error,invalid_input,5} = uri_string:compose_query([{5,""}]), {error,invalid_utf8,<<"ö">>} = uri_string:compose_query([{"foo bar","1"}, {<<"ö">>, "2"}]). dissect_query(_Config) -> -- cgit v1.2.3 From ce78af7e5a76dc4a27673ab5c80a315762b992b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 27 Oct 2017 16:54:27 +0200 Subject: stdlib: Implement normalize/1 Implements the following Syntax-Based Normalizations: - Case Normalization - Percent-Encoding Normalization - Path Segment Normalization - Scheme-Based Normalization - HTTP(S) - Basic support for FTP, SSH, SFTP, TFTP --- lib/stdlib/src/uri_string.erl | 167 ++++++++++++++++++++++++++++++++++- lib/stdlib/test/uri_string_SUITE.erl | 22 +++++ 2 files changed, 188 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 16650d5005..cf8c388f54 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -227,7 +227,7 @@ %% External API %%------------------------------------------------------------------------- -export([compose_query/1, compose_query/2, - dissect_query/1, parse/1, + dissect_query/1, normalize/1, parse/1, recompose/1, transcode/2]). -export_type([error/0, uri_map/0, uri_string/0]). @@ -288,6 +288,21 @@ scheme => unicode:chardata(), userinfo => unicode:chardata()} | #{}. 
+ +%%------------------------------------------------------------------------- +%% Normalize URIs +%%------------------------------------------------------------------------- +-spec normalize(URIString) -> NormalizedURI when + URIString :: uri_string(), + NormalizedURI :: uri_string(). +normalize(URIString) -> + %% Case normalization and percent-encoding normalization are achieved + %% by running parse and recompose on the input URI string. + M = parse(URIString), + M1 = normalize_scheme_based(M), + M2 = normalize_path_segment(M1), + recompose(M2). + %%------------------------------------------------------------------------- %% Parse URIs %%------------------------------------------------------------------------- @@ -1883,3 +1898,153 @@ form_urldecode(<>, Acc) -> end; form_urldecode(<>, _Acc) -> throw({error, invalid_character, [H]}). + + +%%------------------------------------------------------------------------- +%% Helper functions for normalize +%%------------------------------------------------------------------------- + +%% RFC 3986 +%% 6.2.2.3. Path Segment Normalization +%% 5.2.4. Remove Dot Segments +normalize_path_segment(Map) -> + Path = maps:get(path, Map, undefined), + Map#{path => remove_dot_segments(Path)}. + + +remove_dot_segments(Path) when is_binary(Path) -> + remove_dot_segments(Path, <<>>); +remove_dot_segments(Path) when is_list(Path) -> + B = convert_binary(Path, utf8, utf8), + B1 = remove_dot_segments(B, <<>>), + convert_list(B1, utf8). 
+%% +remove_dot_segments(<<>>, Output) -> + Output; +remove_dot_segments(<<"../",T/binary>>, Output) -> + remove_dot_segments(T, Output); +remove_dot_segments(<<"./",T/binary>>, Output) -> + remove_dot_segments(T, Output); +remove_dot_segments(<<"/./",T/binary>>, Output) -> + remove_dot_segments(<<$/,T/binary>>, Output); +remove_dot_segments(<<"/.">>, Output) -> + remove_dot_segments(<<$/>>, Output); +remove_dot_segments(<<"/../",T/binary>>, Output) -> + Out1 = remove_last_segment(Output), + remove_dot_segments(<<$/,T/binary>>, Out1); +remove_dot_segments(<<"/..">>, Output) -> + Out1 = remove_last_segment(Output), + remove_dot_segments(<<$/>>, Out1); +remove_dot_segments(<<$.>>, Output) -> + remove_dot_segments(<<>>, Output); +remove_dot_segments(<<"..">>, Output) -> + remove_dot_segments(<<>>, Output); +remove_dot_segments(Input, Output) -> + {First, Rest} = first_path_segment(Input), + remove_dot_segments(Rest, <>). + + +first_path_segment(Input) -> + F = first_path_segment(Input, <<>>), + split_binary(Input, byte_size(F)). +%% +first_path_segment(<<$/,T/binary>>, Acc) -> + first_path_segment_end(<>, <>); +first_path_segment(<>, Acc) -> + first_path_segment_end(<>, <>). + + +first_path_segment_end(<<>>, Acc) -> + Acc; +first_path_segment_end(<<$/,_/binary>>, Acc) -> + Acc; +first_path_segment_end(<>, Acc) -> + first_path_segment_end(<>, <>). + + +remove_last_segment(<<>>) -> + <<>>; +remove_last_segment(B) -> + {Init, Last} = split_binary(B, byte_size(B) - 1), + case Last of + <<$/>> -> + Init; + _Char -> + remove_last_segment(Init) + end. + + +%% RFC 3986, 6.2.3. 
Scheme-Based Normalization +normalize_scheme_based(Map) -> + Scheme = maps:get(scheme, Map, undefined), + Port = maps:get(port, Map, undefined), + Path= maps:get(path, Map, undefined), + case Scheme of + "http" -> + normalize_http(Map, Port, Path); + <<"http">> -> + normalize_http(Map, Port, Path); + "https" -> + normalize_https(Map, Port, Path); + <<"https">> -> + normalize_https(Map, Port, Path); + "ftp" -> + normalize_ftp(Map, Port); + <<"ftp">> -> + normalize_ftp(Map, Port); + "ssh" -> + normalize_ssh_sftp(Map, Port); + <<"ssh">> -> + normalize_ssh_sftp(Map, Port); + "sftp" -> + normalize_ssh_sftp(Map, Port); + <<"sftp">> -> + normalize_ssh_sftp(Map, Port); + "tftp" -> + normalize_tftp(Map, Port); + <<"tftp">> -> + normalize_tftp(Map, Port); + _Else -> Map + end. + + +normalize_http(Map, Port, Path) -> + M1 = normalize_port(Map, Port, 80), + normalize_http_path(M1, Path). + + +normalize_https(Map, Port, Path) -> + M1 = normalize_port(Map, Port, 443), + normalize_http_path(M1, Path). + + +normalize_ftp(Map, Port) -> + normalize_port(Map, Port, 21). + + +normalize_ssh_sftp(Map, Port) -> + normalize_port(Map, Port, 22). + + +normalize_tftp(Map, Port) -> + normalize_port(Map, Port, 69). + + +normalize_port(Map, Port, Default) -> + case Port of + Default -> + maps:remove(port, Map); + _Else -> + Map + end. + + +normalize_http_path(Map, Path) -> + case Path of + "" -> + Map#{path => "/"}; + <<>> -> + Map#{path => <<"/">>}; + _Else -> + Map + end. diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 9ee321c509..1567b9333a 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -22,6 +22,7 @@ -include_lib("common_test/include/ct.hrl"). 
-export([all/0, suite/0,groups/0, + normalize/1, parse_binary_fragment/1, parse_binary_host/1, parse_binary_host_ipv4/1, parse_binary_host_ipv6/1, parse_binary_path/1, parse_binary_pct_encoded_fragment/1, parse_binary_pct_encoded_query/1, @@ -65,6 +66,7 @@ suite() -> all() -> [ + normalize, parse_binary_scheme, parse_binary_userinfo, parse_binary_pct_encoded_userinfo, @@ -867,3 +869,23 @@ dissect_query_negative(_Config) -> {error,invalid_character,"ö"} = uri_string:dissect_query(<<"föo+bar=1&%C3%B6=2">>), {error,invalid_input,<<"ö">>} = uri_string:dissect_query([<<"foo+bar=1&">>,<<"%C3%B6=2ö">>]). + +normalize(_Config) -> + "/a/g" = uri_string:normalize("/a/b/c/./../../g"), + <<"mid/6">> = uri_string:normalize(<<"mid/content=5/../6">>), + "http://localhost-%C3%B6rebro/a/g" = + uri_string:normalize("http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g"), + <<"http://localhost-%C3%B6rebro/a/g">> = + uri_string:normalize(<<"http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g">>), + <<"https://localhost/">> = + uri_string:normalize(<<"https://localhost:443">>), + <<"https://localhost:445/">> = + uri_string:normalize(<<"https://localhost:445">>), + <<"ftp://localhost">> = + uri_string:normalize(<<"ftp://localhost:21">>), + <<"ssh://localhost">> = + uri_string:normalize(<<"ssh://localhost:22">>), + <<"sftp://localhost">> = + uri_string:normalize(<<"sftp://localhost:22">>), + <<"tftp://localhost">> = + uri_string:normalize(<<"tftp://localhost:69">>). 
-- cgit v1.2.3 From f7d3033dfeeb012841729bf8ed3889da8457b4f7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 30 Oct 2017 13:38:28 +0100 Subject: stdlib: Update documentation (normalize/1) --- lib/stdlib/doc/src/uri_string.xml | 115 +++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 32 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml index 8322eecb24..55d8690b98 100644 --- a/lib/stdlib/doc/src/uri_string.xml +++ b/lib/stdlib/doc/src/uri_string.xml @@ -69,6 +69,9 @@ Changing inbound binary and percent-encoding of URIs

transcode/2
+ Transforming URIs into a normalized form

+ normalize/1 +
Composing form-urlencoded query strings from a list of key-value pairs

compose_query/1

compose_query/2 @@ -84,12 +87,21 @@ Outbound binary encoding in binaries Outbound percent-encoding in lists and binaries +

Functions with a uri_string() argument accept lists, binaries and + mixed lists (lists with binary elements) as input type. All of the functions but + transcode/2 expect input as lists of unicode codepoints, UTF-8 encoded binaries + and UTF-8 percent-encoded URI parts ("%C3%B6" corresponds to the unicode character "ö").

Unless otherwise specified the return value type and encoding are the same as the input type and encoding. That is, binary input returns binary output, list input returns a list output but mixed input returns list output.

-

All of the functions but transcode/2 expects input as unicode codepoints in - lists, UTF-8 encoding in binaries and UTF-8 encoding in percent-encoded URI parts. - transcode/2 provides the means to convert between the supported URI encodings.

+

In case of lists there is only percent-encoding. In binaries, however, both binary encoding + and percent-encoding shall be considered. transcode/2 provides the means to convert + between the supported encodings; it takes a uri_string() and a list of options + specifying inbound and outbound encodings.

+

RFC 3986 does not mandate any specific + character encoding and it is usually defined by the protocol or surrounding text. This library + makes the same assumption: binary and percent-encoding are handled as one configuration unit; + they cannot be set to different values.

@@ -97,28 +109,30 @@

Error tuple indicating the type of error. Possible values of the second component:

- - invalid_character - invalid_input - invalid_map - invalid_percent_encoding - invalid_scheme - invalid_uri - invalid_utf8 - missing_value - + + invalid_character + invalid_input + invalid_map + invalid_percent_encoding + invalid_scheme + invalid_uri + invalid_utf8 + missing_value + +

The third component is a list or binary providing additional information about the + cause of the error.

-

URI map holding the main components of a URI.

+

Map holding the main components of a URI.

-

List of unicode codepoints, UTF-8 encoded binary, or a mix of the two, +

List of unicode codepoints, a UTF-8 encoded binary, or a mix of the two, representing an RFC 3986 compliant URI (percent-encoded form). A URI is a sequence of characters from a very limited set: the letters of @@ -134,10 +148,11 @@ Compose urlencoded query string.

Composes a form-urlencoded QueryString based on a - QueryList, a list of unescaped key-value pairs. - Media type application/x-www-form-urlencoded is defined in section + QueryList, a list of non-percent-encoded key-value pairs. + Form-urlencoding is defined in section 8.2.1 of RFC 1866 - (HTML 2.0). Reserved and unsafe characters, as + (HTML 2.0) for media type application/x-www-form-urlencoded. + Reserved and unsafe characters, as defined by RFC 1738 (Uniform Resource Locators), are percent-encoded.

See also the opposite operation @@ -145,9 +160,8 @@

Example:

-1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}],
-1> [{separator, semicolon}]).
-"foo+bar=1;city=%C3%B6rebro"
+1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}]).
+
 2> >,<<"1">>},
 2> {<<"city">>,<<"örebro"/utf8>>}]).]]>
 >]]>
@@ -169,7 +183,10 @@
         
 1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}],
 1> [{separator, amp}]).
-
+ uri_string:compose_query([{<<"foo bar">>,<<"1">>},
+2> {<<"city">>,<<"örebro"/utf8>>}], [{separator, escaped_amp}]).]]>
+>]]>
 	
@@ -179,12 +196,15 @@ Dissect query string.

Dissects an urlencoded QueryString and returns a - QueryList, a list of unescaped key-value pairs. - Media type application/x-www-form-urlencoded is defined in section + QueryList, a list of non-percent-encoded key-value pairs. + Form-urlencoding is defined in section 8.2.1 of RFC 1866 - (HTML 2.0). Percent-encoded segments are decoded - as defined by RFC 1738 + (HTML 2.0) for media type application/x-www-form-urlencoded. + Percent-encoded segments are decoded as defined by + RFC 1738 (Uniform Resource Locators).

+

Supported separator types: amp (&), escaped_amp + (&amp;) and semicolon (;).

See also the opposite operation compose_query/1.

@@ -192,18 +212,42 @@
 1> uri_string:dissect_query("foo+bar=1;city=%C3%B6rebro").
 [{"foo bar","1"},{"city","örebro"}]
-2> >).]]>
+2> >).]]>
 >,<<"1">>},{<<"city">>,<<"örebro"/utf8>>}] ]]>
 	
+ + + Syntax-based normalization. + +

Transforms URIString into a normalized form + using Syntax-Based Normalization as defined by + RFC 3986.

+

This function implements case normalization, percent-encoding + normalization, path segment normalization and scheme-based normalization + for HTTP(S) with basic support for FTP, SSH, SFTP and TFTP.

+

Example:

+
+1> uri_string:normalize("/a/b/c/./../../g").
+"/a/g"
+2> >).]]>
+>]]>
+3> uri_string:normalize("http://localhost:80").
+"http://localhost/"
+	
+
+
+ Parse URI into a map. -

Returns a URIMap, that is a uri_map() with the parsed components - of the URIString. If parsing fails, an error tuple is returned.

+

Parses an RFC 3986 + compliant uri_string() into a uri_map(), that holds the parsed + components of the URI. + If parsing fails, an error tuple is returned.

See also the opposite operation recompose/1.

Example:

@@ -224,8 +268,9 @@ Recompose URI. -

Returns an RFC 3986 compliant - URIString (percent-encoded). +

Creates an RFC 3986 compliant + URIString (percent-encoded), based on the components of + URIMap. If the URIMap is invalid, an error tuple is returned.

See also the opposite operation parse/1.

@@ -249,13 +294,19 @@

Transcodes an RFC 3986 compliant URIString, where Options is a list of tagged tuples, specifying the inbound - (in_encoding) and outbound (out_encoding) encodings. + (in_encoding) and outbound (out_encoding) encodings. in_encoding + and out_encoding specify both binary encoding and percent-encoding for the + input and output data. Mixed encoding, where binary encoding is not the same as + percent-encoding, is not supported. If an argument is invalid, an error tuple is returned.

Example:

 1> >,]]>
 1> [{in_encoding, utf32},{out_encoding, utf8}]).
 >]]>
+2> uri_string:transcode("foo%F6bar", [{in_encoding, latin1},
+2> {out_encoding, utf8}]).
+"foo%C3%B6bar"
 	
-- cgit v1.2.3 From 7a4d4e183ae5567d6242184b8268918904c872c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 30 Oct 2017 16:57:49 +0100 Subject: stdlib: Refactor helper functions in uri_string --- lib/stdlib/doc/src/uri_string.xml | 2 +- lib/stdlib/src/uri_string.erl | 142 ++++++++++++++++++-------------------- 2 files changed, 67 insertions(+), 77 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml index 55d8690b98..8fa2a92370 100644 --- a/lib/stdlib/doc/src/uri_string.xml +++ b/lib/stdlib/doc/src/uri_string.xml @@ -119,7 +119,7 @@ invalid_utf8 missing_value -

The third component is a list or binary providing additional information about the +

The third component is a term providing additional information about the cause of the error.

diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index cf8c388f54..2c73e38324 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -273,7 +273,7 @@ %% %x96 ` grave / accent %%------------------------------------------------------------------------- -type uri_string() :: iodata(). --type error() :: {error, atom(), list() | binary()}. +-type error() :: {error, atom(), term()}. %%------------------------------------------------------------------------- @@ -298,10 +298,11 @@ normalize(URIString) -> %% Case normalization and percent-encoding normalization are achieved %% by running parse and recompose on the input URI string. - M = parse(URIString), - M1 = normalize_scheme_based(M), - M2 = normalize_path_segment(M1), - recompose(M2). + recompose( + normalize_path_segment( + normalize_scheme_based( + parse(URIString)))). + %%------------------------------------------------------------------------- %% Parse URIs @@ -311,8 +312,7 @@ normalize(URIString) -> URIMap :: uri_map() | error(). parse(URIString) when is_binary(URIString) -> - try parse_uri_reference(URIString, #{}) of - Result -> Result + try parse_uri_reference(URIString, #{}) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; @@ -321,8 +321,6 @@ parse(URIString) when is_list(URIString) -> Binary = unicode:characters_to_binary(URIString), Map = parse_uri_reference(Binary, #{}), convert_mapfields_to_list(Map) - of - Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. 
@@ -348,8 +346,6 @@ recompose(Map) -> T4 = update_path(Map, T3), T5 = update_query(Map, T4), update_fragment(Map, T5) - of - Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end @@ -371,8 +367,6 @@ transcode(URIString, Options) when is_binary(URIString) -> List = convert_list(URIString, InEnc), Output = transcode(List, [], InEnc, OutEnc), convert_binary(Output, utf8, OutEnc) - of - Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; @@ -380,8 +374,7 @@ transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), OutEnc = proplists:get_value(out_encoding, Options, utf8), Flattened = flatten_list(URIString, InEnc), - try transcode(Flattened, [], InEnc, OutEnc) of - Result -> Result + try transcode(Flattened, [], InEnc, OutEnc) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. @@ -413,8 +406,7 @@ compose_query(List) -> compose_query([],_Options) -> []; compose_query(List, Options) -> - try compose_query(List, Options, false, <<>>) of - Result -> Result + try compose_query(List, Options, false, <<>>) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. @@ -423,8 +415,7 @@ compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> Separator = get_separator(Options, Rest), K = form_urlencode(Key), V = form_urlencode(Value), - Flag = is_list(Key) orelse is_list(Value), - IsListNew = IsList orelse Flag, + IsListNew = IsList orelse is_list(Key) orelse is_list(Value), compose_query(Rest, Options, IsListNew, < -%% E.g. "//user@" - invalid URI -%% - userinfo port -%% E.g. "//user@:8080" => #{host => [],port => 8080,userinfo => "user"} -%% There is always at least an empty host when both userinfo and port -%% are present. 
-%% - #{path => "///"} otherwise the following would be true: -%% "/////" = uri_string:recompose(#{host => "", path => "///"}) -%% "/////" = uri_string:recompose(#{path => "/////"}) -%% AND -%% path-absolute = "/" [ segment-nz *( "/" segment ) ] -%%------------------------------------------------------------------------- -is_valid_map(Map) -> - case - ((not maps:is_key(userinfo, Map) andalso - not maps:is_key(host, Map) andalso - maps:is_key(port, Map)) - orelse - (maps:is_key(userinfo, Map) andalso - not maps:is_key(host, Map) andalso - not maps:is_key(port, Map)) - orelse - (maps:is_key(userinfo, Map) andalso - not maps:is_key(host, Map) andalso - maps:is_key(port, Map))) orelse - not maps:is_key(path, Map) orelse - not is_host_and_path_valid(Map) orelse - invalid_field_present(Map) - of +%% +%% The implementation is based on a decision tree that fulfills the +%% following rules: +%% - 'path' shall always be present in the input map +%% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] +%% hier-part = "//" authority path-abempty +%% / path-absolute +%% / path-rootless +%% / path-empty +%% - 'host' shall be present in the input map when 'path' starts with +%% two slashes ("//") +%% path = path-abempty ; begins with "/" or is empty +%% / path-absolute ; begins with "/" but not "//" +%% / path-noscheme ; begins with a non-colon segment +%% / path-rootless ; begins with a segment +%% / path-empty ; zero characters +%% path-abempty = *( "/" segment ) +%% segment = *pchar +%% - 'host' shall be present if userinfo or port is present in input map +%% authority = [ userinfo "@" ] host [ ":" port ] +%% - All fields shall be valid (scheme, userinfo, host, port, path, query +%% or fragment). +%%------------------------------------------------------------------------- +is_valid_map(#{path := Path} = Map) -> + case starts_with_two_slash(Path) of true -> - false; + is_valid_map_host(Map); false -> - true - end. 
+ case maps:is_key(userinfo, Map) of + true -> + is_valid_map_host(Map); + false -> + case maps:is_key(port, Map) of + true -> + is_valid_map_host(Map); + false -> + all_fields_valid(Map) + end + end + end; +is_valid_map(#{}) -> + false. -invalid_field_present(Map) -> - Fun = fun(K, _, AccIn) -> AccIn orelse - ((K =/= scheme) andalso (K =/= userinfo) - andalso (K =/= host) andalso (K =/= port) - andalso (K =/= path) andalso (K =/= query) - andalso (K =/= fragment)) - end, - maps:fold(Fun, false, Map). +is_valid_map_host(Map) -> + maps:is_key(host, Map) andalso all_fields_valid(Map). -is_host_and_path_valid(Map) -> - Host = maps:get(host, Map, undefined), - Path = maps:get(path, Map, undefined), - not (Host =:= undefined andalso starts_with_two_slash(Path)). +all_fields_valid(Map) -> + Fun = fun(scheme, _, Acc) -> Acc; + (userinfo, _, Acc) -> Acc; + (host, _, Acc) -> Acc; + (port, _, Acc) -> Acc; + (path, _, Acc) -> Acc; + (query, _, Acc) -> Acc; + (fragment, _, Acc) -> Acc; + (_, _, _) -> false + end, + maps:fold(Fun, true, Map). starts_with_two_slash([$/,$/|_]) -> -- cgit v1.2.3 From a4c3f8d3b270b9c21caabcd084bf55049b5bc700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Tue, 31 Oct 2017 13:24:13 +0100 Subject: stdlib: Fix case normalization (normalize/1) --- lib/stdlib/src/uri_string.erl | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 2c73e38324..b8e0432fd6 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -296,12 +296,14 @@ URIString :: uri_string(), NormalizedURI :: uri_string(). normalize(URIString) -> - %% Case normalization and percent-encoding normalization are achieved - %% by running parse and recompose on the input URI string. 
+ %% Percent-encoding normalization and case normalization for + %% percent-encoded triplets are achieved by running parse and + %% recompose on the input URI string. recompose( normalize_path_segment( normalize_scheme_based( - parse(URIString)))). + normalize_case( + parse(URIString))))). %%------------------------------------------------------------------------- @@ -1894,7 +1896,32 @@ form_urldecode(<>, _Acc) -> %% Helper functions for normalize %%------------------------------------------------------------------------- -%% RFC 3986 +%% 6.2.2.1. Case Normalization +normalize_case(#{scheme := Scheme, host := Host} = Map) -> + Map#{scheme => to_lower(Scheme), + host => to_lower(Host)}; +normalize_case(#{host := Host} = Map) -> + Map#{host => to_lower(Host)}; +normalize_case(#{scheme := Scheme} = Map) -> + Map#{scheme => to_lower(Scheme)}; +normalize_case(#{} = Map) -> + Map. + + +to_lower(Cs) when is_list(Cs) -> + B = convert_binary(Cs, utf8, utf8), + convert_list(to_lower(B), utf8); +to_lower(Cs) when is_binary(Cs) -> + to_lower(Cs, <<>>). +%% +to_lower(<>, Acc) when $A =< C, C =< $Z -> + to_lower(Cs, <>); +to_lower(<>, Acc) -> + to_lower(Cs, <>); +to_lower(<<>>, Acc) -> + Acc. + + %% 6.2.2.3. Path Segment Normalization %% 5.2.4. 
Remove Dot Segments normalize_path_segment(Map) -> -- cgit v1.2.3 From fdfe083c65348095c4168581bdc53e7508be78c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 1 Nov 2017 13:18:34 +0100 Subject: stdlib: Add uri_string module to stdlib.app.src --- lib/stdlib/src/stdlib.app.src | 1 + 1 file changed, 1 insertion(+) (limited to 'lib') diff --git a/lib/stdlib/src/stdlib.app.src b/lib/stdlib/src/stdlib.app.src index ab0824ca17..5fb48acfab 100644 --- a/lib/stdlib/src/stdlib.app.src +++ b/lib/stdlib/src/stdlib.app.src @@ -101,6 +101,7 @@ timer, unicode, unicode_util, + uri_string, win32reg, zip]}, {registered,[timer_server,rsh_starter,take_over_monitor,pool_master, -- cgit v1.2.3 From 74c2a9db0caa376ea375614fcc67c3a9295737d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 3 Nov 2017 10:07:12 +0100 Subject: stdlib: Refactor functions in uri_string --- lib/stdlib/src/uri_string.erl | 111 +++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 61 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index b8e0432fd6..f4acf1885d 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -366,9 +366,9 @@ transcode(URIString, Options) when is_binary(URIString) -> try InEnc = proplists:get_value(in_encoding, Options, utf8), OutEnc = proplists:get_value(out_encoding, Options, utf8), - List = convert_list(URIString, InEnc), + List = convert_to_list(URIString, InEnc), Output = transcode(List, [], InEnc, OutEnc), - convert_binary(Output, utf8, OutEnc) + convert_to_binary(Output, utf8, OutEnc) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; @@ -421,7 +421,7 @@ compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> compose_query(Rest, Options, IsListNew, < - normalize_http(Map, Port, Path); - <<"http">> -> - normalize_http(Map, Port, Path); - "https" -> - normalize_https(Map, Port, Path); - <<"https">> -> - 
normalize_https(Map, Port, Path); - "ftp" -> - normalize_ftp(Map, Port); - <<"ftp">> -> - normalize_ftp(Map, Port); - "ssh" -> - normalize_ssh_sftp(Map, Port); - <<"ssh">> -> - normalize_ssh_sftp(Map, Port); - "sftp" -> - normalize_ssh_sftp(Map, Port); - <<"sftp">> -> - normalize_ssh_sftp(Map, Port); - "tftp" -> - normalize_tftp(Map, Port); - <<"tftp">> -> - normalize_tftp(Map, Port); - _Else -> Map - end. + normalize_scheme_based(Map, Scheme, Port, Path). +%% +normalize_scheme_based(Map, Scheme, Port, Path) + when Scheme =:= "http"; Scheme =:= <<"http">> -> + normalize_http(Map, Port, Path); +normalize_scheme_based(Map, Scheme, Port, Path) + when Scheme =:= "https"; Scheme =:= <<"https">> -> + normalize_https(Map, Port, Path); +normalize_scheme_based(Map, Scheme, Port, _Path) + when Scheme =:= "ftp"; Scheme =:= <<"ftp">> -> + normalize_ftp(Map, Port); +normalize_scheme_based(Map, Scheme, Port, _Path) + when Scheme =:= "ssh"; Scheme =:= <<"ssh">> -> + normalize_ssh_sftp(Map, Port); +normalize_scheme_based(Map, Scheme, Port, _Path) + when Scheme =:= "sftp"; Scheme =:= <<"sftp">> -> + normalize_ssh_sftp(Map, Port); +normalize_scheme_based(Map, Scheme, Port, _Path) + when Scheme =:= "tftp"; Scheme =:= <<"tftp">> -> + normalize_tftp(Map, Port); +normalize_scheme_based(Map, _, _, _) -> + Map. normalize_http(Map, Port, Path) -> -- cgit v1.2.3 From 7e5d062973e7cb4f9ee949529e9dcdb5785c1304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 6 Nov 2017 09:54:12 +0100 Subject: stdlib: Remove compose_query and dissect_query compose_query/{1,2} and dissect_query/1 removed as the implemented specification (HTML 2.0) is old. They will be re-implemented based on HTML5. 
--- lib/stdlib/doc/src/uri_string.xml | 87 +------------- lib/stdlib/src/uri_string.erl | 216 +---------------------------------- lib/stdlib/test/uri_string_SUITE.erl | 51 +-------- 3 files changed, 4 insertions(+), 350 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml index 8fa2a92370..9ace2b0a05 100644 --- a/lib/stdlib/doc/src/uri_string.xml +++ b/lib/stdlib/doc/src/uri_string.xml @@ -31,8 +31,7 @@ URI processing functions.

This module contains functions for parsing and handling URIs - (RFC 3986) and - form-urlencoded query strings (RFC 1866). + (RFC 3986).

A URI is an identifier consisting of a sequence of characters matching the syntax rule named URI in RFC 3986. @@ -72,13 +71,6 @@ Transforming URIs into a normalized form

normalize/1
- Composing form-urlencoded query strings from a list of key-value pairs

- compose_query/1

- compose_query/2 -
- Dissecting form-urlencoded query strings into a list of key-value pairs

- dissect_query/1 -

There are four different encodings present during the handling of URIs:

@@ -110,14 +102,12 @@

Error tuple indicating the type of error. Possible values of the second component:

- invalid_character invalid_input invalid_map invalid_percent_encoding invalid_scheme invalid_uri invalid_utf8 - missing_value

The third component is a term providing additional information about the cause of the error.

@@ -143,81 +133,6 @@ - - - Compose urlencoded query string. - -

Composes a form-urlencoded QueryString based on a - QueryList, a list of non-percent-encoded key-value pairs. - Form-urlencoding is defined in section - 8.2.1 of RFC 1866 - (HTML 2.0) for media type application/x-www-form-urlencoded. - Reserved and unsafe characters, as - defined by RFC 1738 - (Uniform Resource Locators), are percent-encoded.

-

See also the opposite operation - dissect_query/1. -

-

Example:

-
-1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}]).
-
-2> >,<<"1">>},
-2> {<<"city">>,<<"örebro"/utf8>>}]).]]>
->]]>
-	
-
-
- - - - Compose urlencoded query string. - -

Same as compose_query/1 but with an additional - Options parameter, that controls the type of separator used - between key-value pairs. There are three supported separator types: amp (&), escaped_amp (&amp;) and semicolon (;). If the parameter Options is empty, separator takes the default value (escaped_amp).

-

See also the opposite operation - dissect_query/1. -

-

Example:

-
-1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}],
-1> [{separator, amp}]).
- uri_string:compose_query([{<<"foo bar">>,<<"1">>},
-2> {<<"city">>,<<"örebro"/utf8>>}], [{separator, escaped_amp}]).]]>
->]]>
-	
-
-
- - - - Dissect query string. - -

Dissects an urlencoded QueryString and returns a - QueryList, a list of non-percent-encoded key-value pairs. - Form-urlencoding is defined in section - 8.2.1 of RFC 1866 - (HTML 2.0) for media type application/x-www-form-urlencoded. - Percent-encoded segments are decoded as defined by - RFC 1738 - (Uniform Resource Locators).

-

Supported separator types: amp (&), escaped_amp - (&amp;) and semicolon (;).

-

See also the opposite operation - compose_query/1. -

-

Example:

-
-1> uri_string:dissect_query("foo+bar=1;city=%C3%B6rebro").
-[{"foo bar","1"},{"city","örebro"}]
-2> >).]]>
->,<<"1">>},{<<"city">>,<<"örebro"/utf8>>}] ]]>
-	
-
-
- Syntax-based normalization. diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index f4acf1885d..22212da222 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -226,8 +226,7 @@ %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- --export([compose_query/1, compose_query/2, - dissect_query/1, normalize/1, parse/1, +-export([normalize/1, parse/1, recompose/1, transcode/2]). -export_type([error/0, uri_map/0, uri_string/0]). @@ -382,75 +381,6 @@ transcode(URIString, Options) when is_list(URIString) -> end. -%%------------------------------------------------------------------------- -%% Functions for working with the query part of a URI as a list -%% of key/value pairs. -%% HTML 2.0 (RFC 1866) defines a media type application/x-www-form-urlencoded -%% in section [8.2.1] "The form-urlencoded Media Type". -%%------------------------------------------------------------------------- - -%%------------------------------------------------------------------------- -%% Compose urlencoded query string from a list of unescaped key/value pairs. -%%------------------------------------------------------------------------- --spec compose_query(QueryList) -> QueryString when - QueryList :: [{uri_string(), uri_string()}], - QueryString :: uri_string() - | error(). -compose_query(List) -> - compose_query(List, []). - - --spec compose_query(QueryList, Options) -> QueryString when - QueryList :: [{uri_string(), uri_string()}], - Options :: [{separator, atom()}], - QueryString :: uri_string() - | error(). -compose_query([],_Options) -> - []; -compose_query(List, Options) -> - try compose_query(List, Options, false, <<>>) - catch - throw:{error, Atom, RestData} -> {error, Atom, RestData} - end. 
-%% -compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> - Separator = get_separator(Options, Rest), - K = form_urlencode(Key), - V = form_urlencode(Value), - IsListNew = IsList orelse is_list(Key) orelse is_list(Value), - compose_query(Rest, Options, IsListNew, <>; -get_separator([{separator, amp}], _L) -> - <<"&">>; -get_separator([{separator, escaped_amp}], _L) -> - <<"&">>; -get_separator([{separator, semicolon}], _L) -> - <<";">>. - - -%% Form-urlencode input based on RFC 1866 [8.2.1] -form_urlencode(Cs) when is_list(Cs) -> - B = convert_to_binary(Cs, utf8, utf8), - form_urlencode(B, <<>>); -form_urlencode(Cs) -> - form_urlencode(Cs, <<>>). -%% -form_urlencode(<<>>, Acc) -> - Acc; -form_urlencode(<<$ ,T/binary>>, Acc) -> - form_urlencode(T, <>); -form_urlencode(<>, Acc) -> - case is_url_char(H) of - true -> - form_urlencode(T, <>); - false -> - E = percent_encode_binary(H), - form_urlencode(T, <>) - end; -form_urlencode(<>, _Acc) -> - throw({error,invalid_utf8,<>}); -form_urlencode(H, _Acc) -> - throw({error,invalid_input, H}). - - -%% Return true if input char can appear in URL according to -%% RFC 1738 "Uniform Resource Locators". -is_url_char(C) - when 0 =< C, C =< 31; - 128 =< C, C =< 255 -> false; -is_url_char(127) -> false; -is_url_char(C) -> - not (is_reserved(C) orelse is_unsafe(C)). - - -%% Reserved characters (RFC 1738) -is_reserved($;) -> true; -is_reserved($/) -> true; -is_reserved($?) -> true; -is_reserved($:) -> true; -is_reserved($@) -> true; -is_reserved($=) -> true; -is_reserved($&) -> true; -is_reserved(_) -> false. - - -%% Unsafe characters (RFC 1738) -is_unsafe(${) -> true; -is_unsafe($}) -> true; -is_unsafe($|) -> true; -is_unsafe($\\) -> true; -is_unsafe($^) -> true; -is_unsafe($~) -> true; -is_unsafe($[) -> true; -is_unsafe($]) -> true; -is_unsafe($`) -> true; -is_unsafe(_) -> false. 
- - -%%------------------------------------------------------------------------- -%% Helper functions for dissect_query -%%------------------------------------------------------------------------- -dissect_query_key(<<$=,T/binary>>, IsList, Acc, Key, Value) -> - dissect_query_value(T, IsList, Acc, Key, Value); -dissect_query_key(<>, IsList, Acc, Key, Value) -> - dissect_query_key(T, IsList, Acc, <>, Value); -dissect_query_key(B, _, _, _, _) -> - throw({error, missing_value, B}). - - -dissect_query_value(<<$&,_/binary>> = B, IsList, Acc, Key, Value) -> - K = form_urldecode(IsList, Key), - V = form_urldecode(IsList, Value), - dissect_query_separator_amp(B, IsList, [{K,V}|Acc], <<>>, <<>>); -dissect_query_value(<<$;,_/binary>> = B, IsList, Acc, Key, Value) -> - K = form_urldecode(IsList, Key), - V = form_urldecode(IsList, Value), - dissect_query_separator_semicolon(B, IsList, [{K,V}|Acc], <<>>, <<>>); -dissect_query_value(<>, IsList, Acc, Key, Value) -> - dissect_query_value(T, IsList, Acc, Key, <>); -dissect_query_value(<<>>, IsList, Acc, Key, Value) -> - K = form_urldecode(IsList, Key), - V = form_urldecode(IsList, Value), - lists:reverse([{K,V}|Acc]). - - -dissect_query_separator_amp(<<"&",T/binary>>, IsList, Acc, Key, Value) -> - dissect_query_key(T, IsList, Acc, Key, Value); -dissect_query_separator_amp(<<$&,T/binary>>, IsList, Acc, Key, Value) -> - dissect_query_key(T, IsList, Acc, Key, Value). - - -dissect_query_separator_semicolon(<<$;,T/binary>>, IsList, Acc, Key, Value) -> - dissect_query_key(T, IsList, Acc, Key, Value). 
- - -%% Form-urldecode input based on RFC 1866 [8.2.1] -form_urldecode(true, B) -> - Result = form_urldecode(B, <<>>), - convert_to_list(Result, utf8); -form_urldecode(false, B) -> - form_urldecode(B, <<>>); -form_urldecode(<<>>, Acc) -> - Acc; -form_urldecode(<<$+,T/binary>>, Acc) -> - form_urldecode(T, <>); -form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> - case is_hex_digit(C0) andalso is_hex_digit(C1) of - true -> - V = ?HEX2DEC(C0)*16+?HEX2DEC(C1), - form_urldecode(T, <>); - false -> - L = convert_to_list(<<$%,C0,C1,T/binary>>, utf8), - throw({error, invalid_percent_encoding, L}) - end; -form_urldecode(<>, Acc) -> - case is_url_char(H) of - true -> - form_urldecode(T, <>); - false -> - throw({error, invalid_character, [H]}) - end; -form_urldecode(<>, _Acc) -> - throw({error, invalid_character, [H]}). - - %%------------------------------------------------------------------------- %% Helper functions for normalize %%------------------------------------------------------------------------- diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index 1567b9333a..c625da56c6 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -38,9 +38,7 @@ recompose_query/1, recompose_parse_query/1, recompose_path/1, recompose_parse_path/1, recompose_autogen/1, parse_recompose_autogen/1, - transcode_basic/1, transcode_options/1, transcode_mixed/1, transcode_negative/1, - compose_query/1, compose_query_negative/1, - dissect_query/1, dissect_query_negative/1 + transcode_basic/1, transcode_options/1, transcode_mixed/1, transcode_negative/1 ]). @@ -109,11 +107,7 @@ all() -> transcode_basic, transcode_options, transcode_mixed, - transcode_negative, - compose_query, - compose_query_negative, - dissect_query, - dissect_query_negative + transcode_negative ]. 
groups() -> @@ -829,47 +823,6 @@ transcode_negative(_Config) -> {error,invalid_input,<<"ö">>} = uri_string:transcode("foo%F6bar", [{in_encoding, utf8},{out_encoding, utf8}]). -compose_query(_Config) -> - [] = uri_string:compose_query([]), - "foo=1&bar=2" = uri_string:compose_query([{<<"foo">>,"1"}, {"bar", "2"}]), - "foo=1&bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,escaped_amp}]), - "foo=1&bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,amp}]), - "foo=1;bar=2" = uri_string:compose_query([{"foo","1"}, {"bar", "2"}],[{separator,semicolon}]), - "foo+bar=1&%C3%B6=2" = uri_string:compose_query([{"foo bar","1"}, {"ö", "2"}]), - "foo+bar=1&%C3%B6=2" = uri_string:compose_query([{<<"foo bar">>,<<"1">>}, {"ö", <<"2">>}]), - <<"foo+bar=1&%C3%B6=2">> = - uri_string:compose_query([{<<"foo bar">>,<<"1">>}, {<<"ö"/utf8>>, <<"2">>}]). - -compose_query_negative(_Config) -> - {error,invalid_input,4} = uri_string:compose_query([{"",4}]), - {error,invalid_input,5} = uri_string:compose_query([{5,""}]), - {error,invalid_utf8,<<"ö">>} = uri_string:compose_query([{"foo bar","1"}, {<<"ö">>, "2"}]). - -dissect_query(_Config) -> - [] = uri_string:dissect_query(""), - [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1&bar=2"), - [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1&bar=2"), - [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1;bar=2"), - [{"foo","1"}, {"bar", "222"}] = uri_string:dissect_query([<<"foo=1;bar=2">>,"22"]), - [{"foo","ö"}, {"bar", "2"}] = uri_string:dissect_query("foo=%C3%B6&bar=2"), - [{<<"foo">>,<<"ö"/utf8>>}, {<<"bar">>, <<"2">>}] = - uri_string:dissect_query(<<"foo=%C3%B6&bar=2">>), - [{"foo bar","1"},{"ö","2"}] = - uri_string:dissect_query([<<"foo+bar=1&">>,<<"%C3%B6=2">>]). 
- -dissect_query_negative(_Config) -> - {error,invalid_character,";"} = - uri_string:dissect_query("foo=1≈bar=2"), - {error,invalid_character,"&"} = - uri_string:dissect_query("foo1&bar=2"), - {error,invalid_percent_encoding,"%XX%B6"} = uri_string:dissect_query("foo=%XX%B6&bar=2"), - {error,invalid_input,<<153,182>>} = - uri_string:dissect_query("foo=%99%B6&bar=2"), - {error,invalid_character,"ö"} = uri_string:dissect_query("föo+bar=1&%C3%B6=2"), - {error,invalid_character,"ö"} = uri_string:dissect_query(<<"föo+bar=1&%C3%B6=2">>), - {error,invalid_input,<<"ö">>} = - uri_string:dissect_query([<<"foo+bar=1&">>,<<"%C3%B6=2ö">>]). - normalize(_Config) -> "/a/g" = uri_string:normalize("/a/b/c/./../../g"), <<"mid/6">> = uri_string:normalize(<<"mid/content=5/../6">>), -- cgit v1.2.3