From 80feeb36f92a923f57f740c7c28c12bb8b69ec16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 28 Jul 2017 11:04:19 +0200 Subject: stdlib: Add API and doc of uri_string module --- lib/stdlib/src/Makefile | 1 + lib/stdlib/src/uri_string.erl | 325 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 326 insertions(+) create mode 100644 lib/stdlib/src/uri_string.erl (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/Makefile b/lib/stdlib/src/Makefile index bf836203ec..8b156929d7 100644 --- a/lib/stdlib/src/Makefile +++ b/lib/stdlib/src/Makefile @@ -121,6 +121,7 @@ MODULES= \ timer \ unicode \ unicode_util \ + uri_string \ win32reg \ zip diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl new file mode 100644 index 0000000000..2c10c34f39 --- /dev/null +++ b/lib/stdlib/src/uri_string.erl @@ -0,0 +1,325 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2017. All Rights Reserved. +%% +%% Licensed under the Apache License, Version 2.0 (the "License"); +%% you may not use this file except in compliance with the License. +%% You may obtain a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, software +%% distributed under the License is distributed on an "AS IS" BASIS, +%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +%% See the License for the specific language governing permissions and +%% limitations under the License. +%% +%% %CopyrightEnd% +%% +%% +%% [RFC 3986, Chapter 2.2. Reserved Characters] +%% +%% reserved = gen-delims / sub-delims +%% +%% gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" +%% +%% sub-delims = "!" / "$" / "&" / "'" / "(" / ")" +%% / "*" / "+" / "," / ";" / "=" +%% +%% +%% [RFC 3986, Chapter 2.3. Unreserved Characters] +%% +%% unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" +%% +%% +%% [RFC 3986, Chapter 3. 
Syntax Components] +%% +%% The generic URI syntax consists of a hierarchical sequence of +%% components referred to as the scheme, authority, path, query, and +%% fragment. +%% +%% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] +%% +%% hier-part = "//" authority path-abempty +%% / path-absolute +%% / path-rootless +%% / path-empty +%% +%% The scheme and path components are required, though the path may be +%% empty (no characters). When authority is present, the path must +%% either be empty or begin with a slash ("/") character. When +%% authority is not present, the path cannot begin with two slash +%% characters ("//"). These restrictions result in five different ABNF +%% rules for a path (Section 3.3), only one of which will match any +%% given URI reference. +%% +%% The following are two example URIs and their component parts: +%% +%% foo://example.com:8042/over/there?name=ferret#nose +%% \_/ \______________/\_________/ \_________/ \__/ +%% | | | | | +%% scheme authority path query fragment +%% | _____________________|__ +%% / \ / \ +%% urn:example:animal:ferret:nose +%% +%% +%% [RFC 3986, Chapter 3.1. Scheme] +%% +%% Each URI begins with a scheme name that refers to a specification for +%% assigning identifiers within that scheme. +%% +%% scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) +%% +%% +%% [RFC 3986, Chapter 3.2. Authority] +%% +%% Many URI schemes include a hierarchical element for a naming +%% authority so that governance of the name space defined by the +%% remainder of the URI is delegated to that authority (which may, in +%% turn, delegate it further). +%% +%% authority = [ userinfo "@" ] host [ ":" port ] +%% +%% +%% [RFC 3986, Chapter 3.2.1. User Information] +%% +%% The userinfo subcomponent may consist of a user name and, optionally, +%% scheme-specific information about how to gain authorization to access +%% the resource. 
The user information, if present, is followed by a +%% commercial at-sign ("@") that delimits it from the host. +%% +%% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +%% +%% +%% [RFC 3986, Chapter 3.2.2. Host] +%% +%% The host subcomponent of authority is identified by an IP literal +%% encapsulated within square brackets, an IPv4 address in dotted- +%% decimal form, or a registered name. +%% +%% host = IP-literal / IPv4address / reg-name +%% +%% IP-literal = "[" ( IPv6address / IPvFuture ) "]" +%% +%% IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) +%% +%% IPv6address = 6( h16 ":" ) ls32 +%% / "::" 5( h16 ":" ) ls32 +%% / [ h16 ] "::" 4( h16 ":" ) ls32 +%% / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 +%% / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 +%% / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 +%% / [ *4( h16 ":" ) h16 ] "::" ls32 +%% / [ *5( h16 ":" ) h16 ] "::" h16 +%% / [ *6( h16 ":" ) h16 ] "::" +%% +%% ls32 = ( h16 ":" h16 ) / IPv4address +%% ; least-significant 32 bits of address +%% +%% h16 = 1*4HEXDIG +%% ; 16 bits of address represented in hexadecimal +%% +%% IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet +%% +%% dec-octet = DIGIT ; 0-9 +%% / %x31-39 DIGIT ; 10-99 +%% / "1" 2DIGIT ; 100-199 +%% / "2" %x30-34 DIGIT ; 200-249 +%% / "25" %x30-35 ; 250-255 +%% +%% reg-name = *( unreserved / pct-encoded / sub-delims ) +%% +%% +%% [RFC 3986, Chapter 3.2.3. Port] +%% +%% The port subcomponent of authority is designated by an optional port +%% number in decimal following the host and delimited from it by a +%% single colon (":") character. +%% +%% port = *DIGIT +%% +%% +%% [RFC 3986, Chapter 3.3. Path] +%% +%% The path component contains data, usually organized in hierarchical +%% form, that, along with data in the non-hierarchical query component +%% (Section 3.4), serves to identify a resource within the scope of the +%% URI's scheme and naming authority (if any).
The path is terminated +%% by the first question mark ("?") or number sign ("#") character, or +%% by the end of the URI. +%% +%% path = path-abempty ; begins with "/" or is empty +%% / path-absolute ; begins with "/" but not "//" +%% / path-noscheme ; begins with a non-colon segment +%% / path-rootless ; begins with a segment +%% / path-empty ; zero characters +%% +%% path-abempty = *( "/" segment ) +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] +%% path-noscheme = segment-nz-nc *( "/" segment ) +%% path-rootless = segment-nz *( "/" segment ) +%% path-empty = 0<pchar> +%% segment = *pchar +%% segment-nz = 1*pchar +%% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) +%% ; non-zero-length segment without any colon ":" +%% +%% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +%% +%% +%% [RFC 3986, Chapter 3.4. Query] +%% +%% The query component contains non-hierarchical data that, along with +%% data in the path component (Section 3.3), serves to identify a +%% resource within the scope of the URI's scheme and naming authority +%% (if any). The query component is indicated by the first question +%% mark ("?") character and terminated by a number sign ("#") character +%% or by the end of the URI. +%% +%% query = *( pchar / "/" / "?" ) +%% +%% +%% [RFC 3986, Chapter 3.5. Fragment] +%% +%% The fragment identifier component of a URI allows indirect +%% identification of a secondary resource by reference to a primary +%% resource and additional identifying information. +%% +%% fragment = *( pchar / "/" / "?" ) +%% +%% +%% [RFC 3986, Chapter 4.1. URI Reference] +%% +%% URI-reference is used to denote the most common usage of a resource +%% identifier. +%% +%% URI-reference = URI / relative-ref +%% +%% +%% [RFC 3986, Chapter 4.2. Relative Reference] +%% +%% A relative reference takes advantage of the hierarchical syntax +%% (Section 1.2.3) to express a URI reference relative to the name space +%% of another hierarchical URI.
+%% +%% relative-ref = relative-part [ "?" query ] [ "#" fragment ] +%% +%% relative-part = "//" authority path-abempty +%% / path-absolute +%% / path-noscheme +%% / path-empty +%% +%% +%% [RFC 3986, Chapter 4.3. Absolute URI] +%% +%% Some protocol elements allow only the absolute form of a URI without +%% a fragment identifier. For example, defining a base URI for later +%% use by relative references calls for an absolute-URI syntax rule that +%% does not allow a fragment. +%% +%% absolute-URI = scheme ":" hier-part [ "?" query ] +%% + +-module(uri_string). + + +-export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, + parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). +-export_type([uri_map/0, uri_string/0, bytelist/0]). + + +%%%========================================================================= +%%% API +%%%========================================================================= + + +-type bytelist() :: maybe_improper_list( + 0..255 | + binary() | bytelist(), + binary() | []). + +%% URI compliant with RFC 3986 +%% ASCII %x21 - %x7A ("!" - "z") except +%% %x34 " double quote +%% %x60 < less than +%% %x62 > greater than +%% %x92 \ backslash +%% %x94 ^ caret / circumflex +%% %x96 ` grave / accent +-type uri_string() :: bytelist() | binary(). + + +%% RFC 3986, Chapter 3. Syntax Components +-type uri_map() :: + #{fragment := unicode:chardata(), + host := unicode:chardata(), + path := unicode:chardata(), + port := non_neg_integer(), + query := unicode:chardata(), + scheme := atom(), + userinfo := unicode:chardata()}. + +%% Parse URIs +-spec parse(URIString) -> URIMap when + URIString :: uri_string(), + URIMap :: uri_map(). +parse(_) -> + ok. + +%% Recompose URIs +-spec recompose(URIMap) -> URIString when + URIMap :: uri_map(), + URIString :: uri_string(). +recompose(_) -> + ok. 
+ +%% Resolve references +-spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when + RelativeURI :: uri_string(), + AbsoluteBaseURI :: uri_string(), + AbsoluteDestURI :: uri_string(). +resolve_uri_reference(_,_) -> + ok. + +%% Create references +-spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when + AbsoluteSourceURI :: uri_string(), + AbsoluteBaseURI :: uri_string(), + RelativeDestURI :: uri_string(). +create_uri_reference(_,_) -> + ok. + +%% Normalize URIs +-spec normalize(URIString) -> NormalizedURI when + URIString :: uri_string(), + NormalizedURI :: uri_string(). +normalize(_) -> + ok. + +%% Transcode URIs +-spec transcode(URIString, Options) -> URIString when + URIString :: uri_string(), + Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. +transcode(_, _) -> + ok. + + +%% Working with query strings +%% HTML 2.0 - application/x-www-form-urlencoded +%% RFC 1866 [8.2.1] + +%% Compose urlencoded query string from a list of unescaped key/value pairs. +-spec compose_query(QueryList) -> QueryString when + QueryList :: [{unicode:chardata(), unicode:chardata()}], + QueryString :: uri_string(). +compose_query(_) -> + ok. + +%% Dissect a query string into a list of unescaped key/value pairs. +-spec dissect_query(QueryString) -> QueryList when + QueryString :: uri_string(), + QueryList :: [{unicode:chardata(), unicode:chardata()}]. +dissect_query(_) -> + ok. 
-- cgit v1.2.3 From 29a9dd0e17a97a3e6e46f0d08c6ba8f31db33f5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Thu, 31 Aug 2017 15:39:45 +0200 Subject: stdlib: Implement uri_string:parse --- lib/stdlib/src/uri_string.erl | 838 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 813 insertions(+), 25 deletions(-) mode change 100644 => 100755 lib/stdlib/src/uri_string.erl (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl old mode 100644 new mode 100755 index 2c10c34f39..619da24cbc --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -221,25 +221,24 @@ %% %% absolute-URI = scheme ":" hier-part [ "?" query ] %% - -module(uri_string). -export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). --export_type([uri_map/0, uri_string/0, bytelist/0]). +-export_type([uri_map/0, uri_string/0]). + +-define(CHAR(Char), <<Char/utf8>>). +-define(STRING_EMPTY, <<>>). +-define(STRING(MatchStr), <<MatchStr/binary>>). +-define(STRING_REST(MatchStr, Rest), <<MatchStr/utf8, Rest/binary>>). %%%========================================================================= %%% API %%%========================================================================= - --type bytelist() :: maybe_improper_list( - 0..255 | - binary() | bytelist(), - binary() | []). - +%%------------------------------------------------------------------------- %% URI compliant with RFC 3986 %% ASCII %x21 - %x7A ("!" - "z") except %% %x34 " double quote @@ -248,32 +247,37 @@ %% %x92 \ backslash %% %x94 ^ caret / circumflex %% %x96 ` grave / accent --type uri_string() :: bytelist() | binary(). +%%------------------------------------------------------------------------- +-type uri_string() :: iodata(). %% RFC 3986, Chapter 3.
Syntax Components -type uri_map() :: - #{fragment := unicode:chardata(), - host := unicode:chardata(), - path := unicode:chardata(), - port := non_neg_integer(), - query := unicode:chardata(), - scheme := atom(), - userinfo := unicode:chardata()}. + #{fragment => unicode:chardata(), + host => unicode:chardata(), + path => unicode:chardata(), + port => non_neg_integer(), + query => unicode:chardata(), + scheme => unicode:chardata(), + userinfo => unicode:chardata()} | #{}. %% Parse URIs -spec parse(URIString) -> URIMap when URIString :: uri_string(), URIMap :: uri_map(). -parse(_) -> - ok. +parse(URIString) -> + if is_binary(URIString) -> + parse_uri_reference(URIString, #{}); + true -> + parse_uri_reference(URIString, [], #{}) + end. %% Recompose URIs -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string(). recompose(_) -> - ok. + "". %% Resolve references -spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when @@ -281,7 +285,7 @@ recompose(_) -> AbsoluteBaseURI :: uri_string(), AbsoluteDestURI :: uri_string(). resolve_uri_reference(_,_) -> - ok. + "". %% Create references -spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when @@ -289,21 +293,21 @@ resolve_uri_reference(_,_) -> AbsoluteBaseURI :: uri_string(), RelativeDestURI :: uri_string(). create_uri_reference(_,_) -> - ok. + "". %% Normalize URIs -spec normalize(URIString) -> NormalizedURI when URIString :: uri_string(), NormalizedURI :: uri_string(). normalize(_) -> - ok. + "". %% Transcode URIs -spec transcode(URIString, Options) -> URIString when URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. transcode(_, _) -> - ok. + "". %% Working with query strings @@ -315,11 +319,795 @@ transcode(_, _) -> QueryList :: [{unicode:chardata(), unicode:chardata()}], QueryString :: uri_string(). compose_query(_) -> - ok. + "". 
%% Dissect a query string into a list of unescaped key/value pairs. -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), QueryList :: [{unicode:chardata(), unicode:chardata()}]. dissect_query(_) -> - ok. + "". + + +%%%======================================================================== +%%% Internal functions +%%%======================================================================== + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 4.1. URI Reference] +%% +%% URI-reference is used to denote the most common usage of a resource +%% identifier. +%% +%% URI-reference = URI / relative-ref +%%------------------------------------------------------------------------- +-spec parse_uri_reference(iolist(), list(), uri_map()) -> uri_map(). +parse_uri_reference([], _, _) -> #{}; +parse_uri_reference(URIString, Acc, URI) -> + try parse_scheme_start(URIString, Acc, URI) of + Res -> Res + catch + throw:uri_parse_error -> + parse_relative_part(URIString, Acc, URI) + end. + +-spec parse_uri_reference(binary(), uri_map()) -> uri_map(). +parse_uri_reference(<<>>, _) -> #{}; +parse_uri_reference(URIString, URI) -> + try parse_scheme_start(URIString, URI) of + Res -> Res + catch + throw:uri_parse_error -> + parse_relative_part(URIString, URI) + end. + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 4.2. Relative Reference] +%% +%% A relative reference takes advantage of the hierarchical syntax +%% (Section 1.2.3) to express a URI reference relative to the name space +%% of another hierarchical URI. +%% +%% relative-ref = relative-part [ "?" query ] [ "#" fragment ] +%% +%% relative-part = "//" authority path-abempty +%% / path-absolute +%% / path-noscheme +%% / path-empty +%%------------------------------------------------------------------------- +-spec parse_relative_part(binary(), uri_map()) -> uri_map(). 
+parse_relative_part(?STRING_REST("//", Rest), URI) -> + %% Parse userinfo - "//" is NOT part of authority + try parse_userinfo(Rest, URI) of + {T, URI1} -> + {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + URI1#{userinfo => Userinfo} + catch + throw:uri_parse_error -> + {T, URI1} = parse_host(Rest, URI), + {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), + URI1#{host => Host} + end; +parse_relative_part(?STRING_REST($/, Rest), URI) -> + {T, URI1} = parse_segment(Rest, URI), % path-absolute + {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + URI1#{path => ?STRING_REST($/, Path)}; +parse_relative_part(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % path-empty ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + URI1#{query => ?STRING_REST($?, Query)}; +parse_relative_part(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), % path-empty + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + URI1#{fragment => Fragment}; +parse_relative_part(?STRING_REST(Char, Rest), URI) -> + case is_segment_nz_nc(Char) of + true -> + {T, URI1} = parse_segment_nz_nc(Rest, URI), % path-noscheme + {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + URI1#{path => ?STRING_REST(Char, Path)}; + false -> throw(uri_parse_error) + end. + +-spec parse_relative_part(iolist(), list(), uri_map()) -> uri_map(). 
+parse_relative_part([H|Rest], Acc, URI) when is_binary(H) -> + parse_relative_part(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_relative_part([H|Rest], Acc, URI) when is_list(H) -> + parse_relative_part(H ++ Rest, Acc, URI); +parse_relative_part("//" ++ Rest, Acc, URI) -> + % Parse userinfo + try parse_userinfo(Rest, Acc, URI) of + Res -> Res + catch + throw:uri_parse_error -> + parse_host(Rest, Acc, URI) + end; +parse_relative_part([$/|Rest], _Acc, URI) -> + parse_segment(Rest, [$/], URI); % path-absolute +parse_relative_part([$?|Rest], _Acc, URI) -> + parse_query(Rest, [$?], URI); % path-empty ?query +parse_relative_part([$#|Rest], _Acc, URI) -> + parse_fragment(Rest, [], URI); % path-empty +parse_relative_part([Char|Rest], _, URI) -> + case is_segment_nz_nc(Char) of + true -> parse_segment_nz_nc(Rest, [Char], URI); % path-noscheme + false -> throw(uri_parse_error) + end. + + +%% Returns size of 'Rest' for proper calculation of splitting position. +%% Solves the following special case: +%% +%% #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>). +%% +%% While keeping the following true: +%% +%% #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>). +%% #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>). +%% +-spec byte_size_exl_single_slash(uri_string()) -> number(). +byte_size_exl_single_slash(<<$/>>) -> 0; +byte_size_exl_single_slash(Rest) -> byte_size(Rest). + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 3.3. Path] +%% +%% The path component contains data, usually organized in hierarchical +%% form, that, along with data in the non-hierarchical query component +%% (Section 3.4), serves to identify a resource within the scope of the +%% URI's scheme and naming authority (if any). The path is terminated +%% by the first question mark ("?") or number sign ("#") character, or +%% by the end of the URI. 
+%% +%% path = path-abempty ; begins with "/" or is empty +%% / path-absolute ; begins with "/" but not "//" +%% / path-noscheme ; begins with a non-colon segment +%% / path-rootless ; begins with a segment +%% / path-empty ; zero characters +%% +%% path-abempty = *( "/" segment ) +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] +%% path-noscheme = segment-nz-nc *( "/" segment ) +%% path-rootless = segment-nz *( "/" segment ) +%% path-empty = 0<pchar> +%% segment = *pchar +%% segment-nz = 1*pchar +%% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) +%% ; non-zero-length segment without any colon ":" +%% +%% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" +%%------------------------------------------------------------------------- + +%%------------------------------------------------------------------------- +%% path-abempty +%%------------------------------------------------------------------------- +-spec parse_segment(binary(), uri_map()) -> {binary(), uri_map()}. +parse_segment(?STRING_REST($/, Rest), URI) -> + parse_segment(Rest, URI); % segment +parse_segment(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_segment(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + {Rest, URI1#{fragment => Fragment}}; +parse_segment(?STRING_REST(Char, Rest), URI) -> + case is_pchar(Char) of + true -> parse_segment(Rest, URI); + false -> throw(uri_parse_error) + end; +parse_segment(?STRING_EMPTY, URI) -> + {?STRING_EMPTY, URI}. + +-spec parse_segment(iolist(), list(), uri_map()) -> uri_map().
+parse_segment(?STRING(Str), Acc, URI) when is_list(Acc) -> + parse_segment(unicode:characters_to_list(Str), Acc, URI); +parse_segment([H|Rest], Acc, URI) when is_binary(H) -> + parse_segment(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_segment([H|Rest], Acc, URI) when is_list(H) -> + parse_segment(H ++ Rest, Acc, URI); +parse_segment([$/|Rest], Acc, URI) -> + parse_segment(Rest, [$/|Acc], URI); % segment +parse_segment([$?|Rest], Acc, URI) -> + parse_query(Rest, [$?], URI#{path => lists:reverse(Acc)}); % ?query +parse_segment([$#|Rest], Acc, URI) -> + parse_fragment(Rest, [], URI#{path => lists:reverse(Acc)}); +parse_segment([Char|Rest], Acc, URI) -> + case is_pchar(Char) of + true -> parse_segment(Rest, [Char|Acc], URI); + false -> throw(uri_parse_error) + end; +parse_segment([], Acc, URI) -> + URI#{path => lists:reverse(Acc)}. + +%%------------------------------------------------------------------------- +%% path-noscheme +%%------------------------------------------------------------------------- +-spec parse_segment_nz_nc(binary(), uri_map()) -> {binary(), uri_map()}. +parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> + parse_segment(Rest, URI); % segment +parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + {Rest, URI1#{fragment => Fragment}}; +parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> + case is_segment_nz_nc(Char) of + true -> parse_segment_nz_nc(Rest, URI); + false -> throw(uri_parse_error) + end; +parse_segment_nz_nc(?STRING_EMPTY, URI) -> + {?STRING_EMPTY, URI}. + +-spec parse_segment_nz_nc(iolist(), list(), uri_map()) -> uri_map(). 
+parse_segment_nz_nc(?STRING(Str), Acc, URI) when is_list(Acc) -> + parse_segment_nz_nc(unicode:characters_to_list(Str), Acc, URI); +parse_segment_nz_nc([H|Rest], Acc, URI) when is_binary(H) -> + parse_segment_nz_nc(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_segment_nz_nc([H|Rest], Acc, URI) when is_list(H) -> + parse_segment_nz_nc(H ++ Rest, Acc, URI); +parse_segment_nz_nc([$/|Rest], Acc, URI) -> + parse_segment(Rest, [$/|Acc], URI); % segment +parse_segment_nz_nc([$?|Rest], Acc, URI) -> + parse_query(Rest, [$?], URI#{path => lists:reverse(Acc)}); % ?query +parse_segment_nz_nc([$#|Rest], Acc, URI) -> + parse_fragment(Rest, [], URI#{path => lists:reverse(Acc)}); +parse_segment_nz_nc([Char|Rest], Acc, URI) -> + case is_segment_nz_nc(Char) of + true -> parse_segment_nz_nc(Rest, [Char|Acc], URI); + false -> throw(uri_parse_error) + end; +parse_segment_nz_nc([], Acc, URI) -> + URI#{path => lists:reverse(Acc)}. + +%% Check if char is pchar. +-spec is_pchar(char()) -> boolean(). +is_pchar($%) -> true; % pct-encoded +is_pchar($:) -> true; +is_pchar($@) -> true; +is_pchar(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). + +%% Check if char is segment_nz_nc. +-spec is_segment_nz_nc(char()) -> boolean(). +is_segment_nz_nc($%) -> true; % pct-encoded +is_segment_nz_nc($@) -> true; +is_segment_nz_nc(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 3.1. Scheme] +%% +%% Each URI begins with a scheme name that refers to a specification for +%% assigning identifiers within that scheme. +%% +%% scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) +%%------------------------------------------------------------------------- +-spec parse_scheme_start(binary(), uri_map()) -> uri_map(). 
+parse_scheme_start(?STRING_REST(Char, Rest), URI) -> + case is_alpha(Char) of + true -> {T, URI1} = parse_scheme(Rest, URI), + {Scheme, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + URI1#{scheme => ?STRING_REST(Char, Scheme)}; + false -> throw(uri_parse_error) + end. + +-spec parse_scheme_start(iolist(), list(), uri_map()) -> uri_map(). +parse_scheme_start([H|Rest], Acc, URI) when is_binary(H) -> + parse_scheme_start(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_scheme_start([H|Rest], Acc, URI) when is_list(H) -> + parse_scheme_start(H ++ Rest, Acc, URI); +parse_scheme_start([Char|Rest], Acc, URI) -> + case is_alpha(Char) of + true -> parse_scheme(Rest, [Char|Acc], URI); + false -> throw(uri_parse_error) + end. + + +-spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}. +parse_scheme(?STRING_REST($:, Rest), URI) -> + {_, URI1} = parse_hier(Rest, URI), + {Rest, URI1}; +parse_scheme(?STRING_REST(Char, Rest), URI) -> + case is_scheme(Char) of + true -> parse_scheme(Rest, URI); + false -> throw(uri_parse_error) + end; +parse_scheme(?STRING_EMPTY, _URI) -> + throw(uri_parse_error). + +-spec parse_scheme(iolist(), list(), uri_map()) -> uri_map(). +parse_scheme(?STRING(Str), Acc, URI) when is_list(Acc) -> + parse_scheme(unicode:characters_to_list(Str), Acc, URI); +parse_scheme([H|Rest], Acc, URI) when is_binary(H) -> + parse_scheme(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_scheme([H|Rest], Acc, URI) when is_list(H) -> + parse_scheme(H ++ Rest, Acc, URI); +parse_scheme([$:|Rest], Acc, URI) -> + parse_hier(Rest, [], URI#{scheme => lists:reverse(Acc)}); +parse_scheme([Char|Rest], Acc, URI) -> + case is_scheme(Char) of + true -> parse_scheme(Rest, [Char|Acc], URI); + false -> throw(uri_parse_error) + end; +parse_scheme([], _Acc, _URI) -> + throw(uri_parse_error). + +%% Check if char is allowed in scheme +-spec is_scheme(char()) -> boolean(). +is_scheme($+) -> true; +is_scheme($-) -> true; +is_scheme($.) 
-> true; +is_scheme(Char) -> is_alpha(Char) orelse is_digit(Char). + + +%%------------------------------------------------------------------------- +%% hier-part = "//" authority path-abempty +%% / path-absolute +%% / path-rootless +%% / path-empty +%%------------------------------------------------------------------------- +-spec parse_hier(binary(), uri_map()) -> {binary(), uri_map()}. +parse_hier(?STRING_REST("//", Rest), URI) -> + % Parse userinfo - "//" is NOT part of authority + try parse_userinfo(Rest, URI) of + {T, URI1} -> + {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + {Rest, URI1#{userinfo => Userinfo}} + catch + throw:uri_parse_error -> + {T, URI1} = parse_host(Rest, URI), + {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{host => Host}} + end; +parse_hier(?STRING_REST($/, Rest), URI) -> + {T, URI1} = parse_segment(Rest, URI), % path-absolute + {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{path => ?STRING_REST($/, Path)}}; +parse_hier(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % path-empty ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_hier(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), % path-empty + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + {Rest, URI1#{fragment => Fragment}}; +parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless + case is_pchar(Char) of + true -> % segment_nz + {T, URI1} = parse_segment(Rest, URI), + {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{path => ?STRING_REST(Char, Path)}}; + false -> throw(uri_parse_error) + end; +parse_hier(?STRING_EMPTY, URI) -> + {<<>>, URI}. + +-spec parse_hier(iolist(), list(), uri_map()) -> uri_map(). 
+parse_hier(?STRING(Str), Acc, URI) when is_list(Acc) -> + parse_hier(unicode:characters_to_list(Str), Acc, URI); +parse_hier([H|Rest], Acc, URI) when is_binary(H) -> + parse_hier(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_hier([H|Rest], Acc, URI) when is_list(H) -> + parse_hier(H ++ Rest, Acc, URI); +parse_hier("//" ++ Rest, Acc, URI) -> + % Parse userinfo + try parse_userinfo(Rest, Acc, URI) of + Res -> Res + catch + throw:uri_parse_error -> + parse_host(Rest, [], URI) + end; +parse_hier([$/|Rest], _Acc, URI) -> + parse_segment(Rest, [$/], URI); % path-absolute +parse_hier([$?|Rest], _Acc, URI) -> + parse_query(Rest, [$?], URI); % path-empty ?query +parse_hier([$#|Rest], _Acc, URI) -> + parse_fragment(Rest, [], URI); % path-empty +parse_hier([Char|Rest], _, URI) -> % path-rootless + case is_pchar(Char) of + true -> parse_segment(Rest, [Char], URI); + false -> throw(uri_parse_error) + end; +parse_hier([], _, URI) -> + URI. + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 3.2. Authority] +%% +%% Many URI schemes include a hierarchical element for a naming +%% authority so that governance of the name space defined by the +%% remainder of the URI is delegated to that authority (which may, in +%% turn, delegate it further). +%% +%% The authority component is preceded by a double slash ("//") and is +%% terminated by the next slash ("/"), question mark ("?"), or number +%% sign ("#") character, or by the end of the URI. +%% +%% authority = [ userinfo "@" ] host [ ":" port ] +%% +%% +%% [RFC 3986, Chapter 3.2.1. User Information] +%% +%% The userinfo subcomponent may consist of a user name and, optionally, +%% scheme-specific information about how to gain authorization to access +%% the resource. The user information, if present, is followed by a +%% commercial at-sign ("@") that delimits it from the host. 
+%% +%% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) +%%------------------------------------------------------------------------- +-spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}. +parse_userinfo(?CHAR($@), _URI) -> + %% URI cannot end in userinfo state + throw(uri_parse_error); +parse_userinfo(?STRING_REST($@, Rest), URI) -> + {T, URI1} = parse_host(Rest, URI), + {Host, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{host => Host}}; +parse_userinfo(?STRING_REST(Char, Rest), URI) -> + case is_userinfo(Char) of + true -> parse_userinfo(Rest, URI); + false -> throw(uri_parse_error) + end; +parse_userinfo(?STRING_EMPTY, _URI) -> + %% URI cannot end in userinfo state + throw(uri_parse_error). + +-spec parse_userinfo(iolist(), list(), uri_map()) -> uri_map(). +parse_userinfo(?STRING(Str), Acc, URI) when is_list(Acc) -> + parse_userinfo(unicode:characters_to_list(Str), Acc, URI); +parse_userinfo([H|Rest], Acc, URI) when is_binary(H) -> + parse_userinfo(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_userinfo([H|Rest], Acc, URI) when is_list(H) -> + parse_userinfo(H ++ Rest, Acc, URI); +parse_userinfo([$@], _Acc, _URI) -> + %% URI cannot end in userinfo state + throw(uri_parse_error); +parse_userinfo([$@|Rest], Acc, URI) -> + parse_host(Rest, [], URI#{userinfo => lists:reverse(Acc)}); +parse_userinfo([Char|Rest], Acc, URI) -> + case is_userinfo(Char) of + true -> parse_userinfo(Rest, [Char|Acc], URI); + false -> throw(uri_parse_error) % URI#{userinfo => lists:reverse(Acc)} + end; +parse_userinfo([], _Acc, _URI) -> + %% URI cannot end in userinfo state + throw(uri_parse_error). + +%% Check if char is allowed in userinfo +-spec is_userinfo(char()) -> boolean(). +is_userinfo($%) -> true; % pct-encoded +is_userinfo($:) -> true; +is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). 


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2.2. Host]
%%
%% The host subcomponent of authority is identified by an IP literal
%% encapsulated within square brackets, an IPv4 address in dotted-
%% decimal form, or a registered name.
%%
%%    host        = IP-literal / IPv4address / reg-name
%%
%%    IP-literal = "[" ( IPv6address / IPvFuture  ) "]"
%%
%%    IPvFuture  = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" )
%%
%%    IPv6address =                            6( h16 ":" ) ls32
%%                /                       "::" 5( h16 ":" ) ls32
%%                / [               h16 ] "::" 4( h16 ":" ) ls32
%%                / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32
%%                / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32
%%                / [ *3( h16 ":" ) h16 ] "::"    h16 ":"   ls32
%%                / [ *4( h16 ":" ) h16 ] "::"              ls32
%%                / [ *5( h16 ":" ) h16 ] "::"              h16
%%                / [ *6( h16 ":" ) h16 ] "::"
%%
%%    ls32        = ( h16 ":" h16 ) / IPv4address
%%                ; least-significant 32 bits of address
%%
%%    h16         = 1*4HEXDIG
%%                ; 16 bits of address represented in hexadecimal
%%
%%    IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
%%
%%    dec-octet   = DIGIT                 ; 0-9
%%                / %x31-39 DIGIT         ; 10-99
%%                / "1" 2DIGIT            ; 100-199
%%                / "2" %x30-34 DIGIT     ; 200-249
%%                / "25" %x30-35          ; 250-255
%%
%%    reg-name    = *( unreserved / pct-encoded / sub-delims )
%%-------------------------------------------------------------------------
%% TODO: implement parsing of IPv4/IPv6 addresses
%%
%% Binary variant. Skips reg-name characters up to the first ":", "/", "?"
%% or "#", parses the component that follows, and returns the input after
%% that delimiter together with the updated map; the caller slices the
%% host out of its own input. Throws uri_parse_error on a character that
%% is not allowed in a reg-name.
%%
%% The components are cut at their terminating delimiter instead of being
%% computed from the size of the returned tail: an empty tail cannot tell
%% "no delimiter" from "delimiter was the last byte", which let a trailing
%% "/", "?" or "#" leak into the component (and crash binary_to_integer/1
%% for inputs such as "host:80/").
-spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}.
parse_host(?STRING_REST($:, Rest), URI) ->
    {_T, URI1} = parse_port(Rest, URI),
    %% parse_port/2 accepted Rest, so it is *DIGIT followed by "/", "?",
    %% "#" or the end of the input.
    case binary:split(Rest, [<<"/">>, <<"?">>, <<"#">>]) of
        [<<>> | _] ->
            %% Empty port ("host:") is allowed by port = *DIGIT; the
            %% port key is left unset in that case.
            {Rest, URI1};
        [Digits | _] ->
            {Rest, URI1#{port => binary_to_integer(Digits)}}
    end;
parse_host(?STRING_REST($/, Rest), URI) ->
    {_T, URI1} = parse_segment(Rest, URI),  % path-abempty
    [Path | _] = binary:split(Rest, [<<"?">>, <<"#">>]),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_host(?STRING_REST($?, Rest), URI) ->
    {_T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    [Query | _] = binary:split(Rest, <<"#">>),
    {Rest, URI1#{query => ?STRING_REST($?, Query)}};
parse_host(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_host(?STRING_REST(Char, Rest), URI) ->
    case is_reg_name(Char) of
        true -> parse_host(Rest, URI);
        false -> throw(uri_parse_error)
    end;
parse_host(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

-spec parse_host(iolist(), list(), uri_map()) -> uri_map().
%% List (iolist) variant: gathers the reg-name characters in Acc (reversed)
%% and stores the host as soon as a delimiter or the end of input is seen.
parse_host(?STRING(Bin), Acc, URI) when is_list(Acc) ->
    parse_host(unicode:characters_to_list(Bin), Acc, URI);
parse_host([Chunk|Tail], Acc, URI) when is_binary(Chunk) ->
    Chars = unicode:characters_to_list(Chunk, utf8),
    parse_host(Chars ++ Tail, Acc, URI);
parse_host([Chunk|Tail], Acc, URI) when is_list(Chunk) ->
    parse_host(Chunk ++ Tail, Acc, URI);
parse_host([$:|Tail], Acc, URI) ->
    WithHost = URI#{host => lists:reverse(Acc)},
    parse_port(Tail, [], WithHost);
parse_host([$/|Tail], Acc, URI) ->
    WithHost = URI#{host => lists:reverse(Acc)},
    parse_segment(Tail, [$/], WithHost);    % path-abempty
parse_host([$?|Tail], Acc, URI) ->
    WithHost = URI#{host => lists:reverse(Acc)},
    parse_query(Tail, [$?], WithHost);      % path-empty ?query
parse_host([$#|Tail], Acc, URI) ->
    WithHost = URI#{host => lists:reverse(Acc)},
    parse_fragment(Tail, [], WithHost);     % path-empty
parse_host([Char|Tail], Acc, URI) ->
    case is_reg_name(Char) of
        false -> throw(uri_parse_error);
        true -> parse_host(Tail, [Char|Acc], URI)
    end;
parse_host([], Acc, URI) ->
    URI#{host => lists:reverse(Acc)}.

%% True for characters that may appear in a reg-name:
%% unreserved / pct-encoded ("%") / sub-delims.
-spec is_reg_name(char()) -> boolean().
is_reg_name(Char) ->
    Char =:= $% orelse is_unreserved(Char) orelse is_sub_delim(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.2.3. Port]
%%
%% The port subcomponent of authority is designated by an optional port
%% number in decimal following the host and delimited from it by a
%% single colon (":") character.
%%
%%    port        = *DIGIT
%%-------------------------------------------------------------------------
-spec parse_port(binary(), uri_map()) -> {binary(), uri_map()}.
%% Binary variant. Skips the port digits up to "/", "?", "#" or the end of
%% the input, parses the component that follows, and returns the input
%% after that delimiter together with the updated map. The port value
%% itself is extracted by the caller. Throws uri_parse_error on a
%% non-digit character.
%%
%% Path and query are cut at their terminating delimiter rather than
%% computed from the size of the returned tail, so a delimiter that is the
%% last byte of the input ("/a?" or "?q#") is not included in them.
parse_port(?STRING_REST($/, Rest), URI) ->
    {_T, URI1} = parse_segment(Rest, URI),  % path-abempty
    [Path | _] = binary:split(Rest, [<<"?">>, <<"#">>]),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_port(?STRING_REST($?, Rest), URI) ->
    {_T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    [Query | _] = binary:split(Rest, <<"#">>),
    {Rest, URI1#{query => ?STRING_REST($?, Query)}};
parse_port(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_port(?STRING_REST(Char, Rest), URI) ->
    case is_digit(Char) of
        true -> parse_port(Rest, URI);
        false -> throw(uri_parse_error)
    end;
parse_port(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% List (iolist) variant. Collects the port digits in Acc (reversed) and
%% stores the port when a delimiter or the end of the input is reached.
-spec parse_port(iolist(), list(), uri_map()) -> uri_map().
parse_port(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_port(unicode:characters_to_list(Str), Acc, URI);
parse_port([H|Rest], Acc, URI) when is_binary(H) ->
    parse_port(unicode:characters_to_list(H, utf8) ++ Rest,
               Acc, URI);
parse_port([H|Rest], Acc, URI) when is_list(H) ->
    parse_port(H ++ Rest, Acc, URI);
parse_port([$/|Rest], Acc, URI) ->
    parse_segment(Rest, [$/], port_from_acc(Acc, URI));  % path-abempty
parse_port([$?|Rest], Acc, URI) ->
    parse_query(Rest, [$?], port_from_acc(Acc, URI));  % path-empty ?query
parse_port([$#|Rest], Acc, URI) ->
    parse_fragment(Rest, [], port_from_acc(Acc, URI));  % path-empty
parse_port([Char|Rest], Acc, URI) ->
    case is_digit(Char) of
        true -> parse_port(Rest, [Char|Acc], URI);
        false -> throw(uri_parse_error)
    end;
parse_port([], Acc, URI) ->
    port_from_acc(Acc, URI).

%% Stores the reversed digit accumulator as an integer port. An empty port
%% ("host:") is allowed by port = *DIGIT and leaves the port key unset;
%% previously string:to_integer/1 failed on it and the atom 'error' was
%% stored as the port.
-spec port_from_acc(list(), uri_map()) -> uri_map().
port_from_acc([], URI) ->
    URI;
port_from_acc(Acc, URI) ->
    URI#{port => list_to_integer(lists:reverse(Acc))}.


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.4. Query]
%%
%% The query component contains non-hierarchical data that, along with
%% data in the path component (Section 3.3), serves to identify a
%% resource within the scope of the URI's scheme and naming authority
%% (if any).  The query component is indicated by the first question
%% mark ("?") character and terminated by a number sign ("#") character
%% or by the end of the URI.
%%
%%    query       = *( pchar / "/" / "?" )
%%-------------------------------------------------------------------------
%% Binary variant. Skips query characters up to "#" or the end of the
%% input. On "#" the fragment is parsed and stored, and the input after
%% the "#" is returned; otherwise the empty binary is returned. The query
%% value itself is extracted by the caller. Throws uri_parse_error on a
%% character that is not allowed in a query.
-spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}.
parse_query(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_query(?STRING_REST(Char, Rest), URI) ->
    case is_query(Char) of
        true -> parse_query(Rest, URI);
        false -> throw(uri_parse_error)
    end;
parse_query(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% List (iolist) variant. Collects the query characters in Acc (reversed)
%% and stores the query when "#" or the end of the input is reached.
-spec parse_query(iolist(), list(), uri_map()) -> uri_map().
parse_query(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_query(unicode:characters_to_list(Str), Acc, URI);
parse_query([H|Rest], Acc, URI) when is_binary(H) ->
    parse_query(unicode:characters_to_list(H, utf8) ++ Rest,
                Acc, URI);
parse_query([H|Rest], Acc, URI) when is_list(H) ->
    parse_query(H ++ Rest, Acc, URI);
parse_query([$#|Rest], Acc, URI) ->
    parse_fragment(Rest, [], URI#{query => lists:reverse(Acc)});
parse_query([Char|Rest], Acc, URI) ->
    case is_query(Char) of
        true -> parse_query(Rest, [Char|Acc], URI);
        false -> throw(uri_parse_error)
    end;
parse_query([], Acc, URI) ->
    URI#{query => lists:reverse(Acc)}.

%% Check if char is allowed in query: pchar / "/" / "?".
%% "?" is listed explicitly; it was missing before, so a query containing
%% a second question mark was rejected.
-spec is_query(char()) -> boolean().
is_query($/) -> true;
is_query($?) -> true;
is_query(Char) -> is_pchar(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 3.5. Fragment]
%%
%% The fragment identifier component of a URI allows indirect
%% identification of a secondary resource by reference to a primary
%% resource and additional identifying information.
%%
%%    fragment    = *( pchar / "/" / "?" )
%%-------------------------------------------------------------------------
%% Binary variant: checks every remaining character and returns the empty
%% binary with the map unchanged; the fragment value is extracted by the
%% caller. Throws uri_parse_error on a character not allowed in a fragment.
-spec parse_fragment(binary(), uri_map()) -> {binary(), uri_map()}.
parse_fragment(?STRING_REST(C, Tail), URI) ->
    case is_fragment(C) of
        false -> throw(uri_parse_error);
        true -> parse_fragment(Tail, URI)
    end;
parse_fragment(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% List (iolist) variant: gathers the fragment characters in Acc (reversed)
%% and stores the fragment at the end of the input.
-spec parse_fragment(iolist(), list(), uri_map()) -> uri_map().
parse_fragment(?STRING(Bin), Acc, URI) when is_list(Acc) ->
    parse_fragment(unicode:characters_to_list(Bin), Acc, URI);
parse_fragment([Chunk|Tail], Acc, URI) when is_binary(Chunk) ->
    Chars = unicode:characters_to_list(Chunk, utf8),
    parse_fragment(Chars ++ Tail, Acc, URI);
parse_fragment([Chunk|Tail], Acc, URI) when is_list(Chunk) ->
    parse_fragment(Chunk ++ Tail, Acc, URI);
parse_fragment([C|Tail], Acc, URI) ->
    case is_fragment(C) of
        false -> throw(uri_parse_error);
        true -> parse_fragment(Tail, [C|Acc], URI)
    end;
parse_fragment([], Acc, URI) ->
    URI#{fragment => lists:reverse(Acc)}.

%% True for characters that may appear in a fragment: pchar / "/" / "?".
-spec is_fragment(char()) -> boolean().
is_fragment(Char) ->
    Char =:= $/ orelse Char =:= $? orelse is_pchar(Char).


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 2.2. Reserved Characters]
%%
%%   reserved    = gen-delims / sub-delims
%%
%%   gen-delims  = ":" / "/" / "?" / "#" / "[" / "]" / "@"
%%
%%   sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
%%               / "*" / "+" / "," / ";" / "="
%%
%%-------------------------------------------------------------------------
%% %% Return true if input char is reserved.
%% -spec is_reserved(char()) -> boolean().
%% is_reserved(Char) ->
%%     is_gen_delim(Char) orelse is_sub_delim(Char).

%% %% Check if char is reserved.
%% -spec is_gen_delim(char()) -> boolean().
%% is_gen_delim($:) -> true;
%% is_gen_delim($/) -> true;
%% is_gen_delim($?) -> true;
%% is_gen_delim($#) -> true;
%% is_gen_delim($[) -> true;
%% is_gen_delim($]) -> true;
%% is_gen_delim($@) -> true;
%% is_gen_delim(_) -> false.

%% True when the character is one of the RFC 3986 sub-delims:
%% "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
-spec is_sub_delim(char()) -> boolean().
is_sub_delim(Char) ->
    lists:member(Char, "!$&'()*+,;=").


%%-------------------------------------------------------------------------
%% [RFC 3986, Chapter 2.3. Unreserved Characters]
%%
%%   unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
%%
%%-------------------------------------------------------------------------
-spec is_unreserved(char()) -> boolean().
is_unreserved(Char)
  when Char =:= $-; Char =:= $.; Char =:= $_; Char =:= $~ -> true;
is_unreserved(Char) -> is_alpha(Char) orelse is_digit(Char).

%% True for the ASCII letters A-Z and a-z.
-spec is_alpha(char()) -> boolean().
is_alpha(C) when C >= $A, C =< $Z -> true;
is_alpha(C) when C >= $a, C =< $z -> true;
is_alpha(_) -> false.

%% True for the ASCII digits 0-9.
-spec is_digit(char()) -> boolean().
is_digit(C) ->
    C >= $0 andalso C =< $9.

%% Returns the size of a binary plus one byte for the delimiter that
%% preceded it, or 0 for the empty binary.
%% Used in calls to split_binary().
-spec byte_size_exl_head(binary()) -> number().
byte_size_exl_head(Binary) ->
    case byte_size(Binary) of
        0 -> 0;
        Size -> Size + 1
    end.
-- cgit v1.2.3 From ec3f0c7f96531b714082f5af694a7ed6a02769ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Thu, 14 Sep 2017 14:25:47 +0200 Subject: stdlib: Add support for parsing IPv4 and IPv6 --- lib/stdlib/src/uri_string.erl | 246 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 237 insertions(+), 9 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 619da24cbc..3656d561be 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -388,7 +388,7 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), - URI1#{host => Host} + URI1#{host => remove_brackets(Host)} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute @@ -669,7 +669,7 @@ parse_hier(?STRING_REST("//", Rest), URI) -> throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{host => Host}} + {Rest, URI1#{host => remove_brackets(Host)}} end; parse_hier(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute @@ -756,7 +756,7 @@ parse_userinfo(?CHAR($@), _URI) -> parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{host => Host}}; + {Rest, URI1#{host => remove_brackets(Host)}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, URI); @@ -834,7 +834,6 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). 
%% %% reg-name = *( unreserved / pct-encoded / sub-delims ) %%------------------------------------------------------------------------- -%% TODO: implement parsing of IPv4/IPv6 addresses -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), @@ -849,14 +848,16 @@ parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_host(?STRING_REST($[, Rest), URI) -> + parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), {Rest, URI1#{fragment => Fragment}}; parse_host(?STRING_REST(Char, Rest), URI) -> - case is_reg_name(Char) of - true -> parse_host(Rest, URI); - false -> throw(uri_parse_error) + case is_digit(Char) of + true -> parse_ipv4_bin(Rest, [Char], URI); + false -> parse_reg_name(?STRING_REST(Char, Rest), URI) end; parse_host(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -877,12 +878,65 @@ parse_host([$?|Rest], Acc, URI) -> parse_query(Rest, [$?], URI#{host => lists:reverse(Acc)}); % path-empty ?query parse_host([$#|Rest], Acc, URI) -> parse_fragment(Rest, [], URI#{host => lists:reverse(Acc)}); % path-empty +parse_host([$[|Rest], _Acc, URI) -> + parse_ipv6(Rest, [], URI); parse_host([Char|Rest], Acc, URI) -> + case is_digit(Char) of + true -> parse_ipv4(Rest, [Char|Acc], URI); + false -> parse_reg_name([Char|Rest], Acc, URI) + end; +parse_host([], Acc, URI) -> + URI#{host => lists:reverse(Acc)}. + + +-spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. 
+parse_reg_name(?STRING_REST($:, Rest), URI) -> + {T, URI1} = parse_port(Rest, URI), + {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Port = binary_to_integer(H), + {Rest, URI1#{port => Port}}; +parse_reg_name(?STRING_REST($/, Rest), URI) -> + {T, URI1} = parse_segment(Rest, URI), % path-abempty + {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{path => ?STRING_REST($/, Path)}}; +parse_reg_name(?STRING_REST($?, Rest), URI) -> + {T, URI1} = parse_query(Rest, URI), % path-empty ?query + {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + {Rest, URI1#{query => ?STRING_REST($?, Query)}}; +parse_reg_name(?STRING_REST($#, Rest), URI) -> + {T, URI1} = parse_fragment(Rest, URI), % path-empty + {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + {Rest, URI1#{fragment => Fragment}}; +parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of - true -> parse_host(Rest, [Char|Acc], URI); + true -> parse_reg_name(Rest, URI); false -> throw(uri_parse_error) end; -parse_host([], Acc, URI) -> +parse_reg_name(?STRING_EMPTY, URI) -> + {?STRING_EMPTY, URI}. + +-spec parse_reg_name(iolist(), list(), uri_map()) -> uri_map(). 
+parse_reg_name(?STRING(Str), Acc, URI) when is_list(Acc) -> + parse_reg_name(unicode:characters_to_list(Str), Acc, URI); +parse_reg_name([H|Rest], Acc, URI) when is_binary(H) -> + parse_reg_name(unicode:characters_to_list(H, utf8) ++ Rest, + Acc, URI); +parse_reg_name([H|Rest], Acc, URI) when is_list(H) -> + parse_reg_name(H ++ Rest, Acc, URI); +parse_reg_name([$:|Rest], Acc, URI) -> + parse_port(Rest, [], URI#{host => lists:reverse(Acc)}); +parse_reg_name([$/|Rest], Acc, URI) -> + parse_segment(Rest, [$/], URI#{host => lists:reverse(Acc)}); % path-abempty +parse_reg_name([$?|Rest], Acc, URI) -> + parse_query(Rest, [$?], URI#{host => lists:reverse(Acc)}); % path-empty ?query +parse_reg_name([$#|Rest], Acc, URI) -> + parse_fragment(Rest, [], URI#{host => lists:reverse(Acc)}); % path-empty +parse_reg_name([Char|Rest], Acc, URI) -> + case is_reg_name(Char) of + true -> parse_reg_name(Rest, [Char|Acc], URI); + false -> throw(uri_parse_error) + end; +parse_reg_name([], Acc, URI) -> URI#{host => lists:reverse(Acc)}. %% Check if char is allowed in reg-name @@ -891,6 +945,168 @@ is_reg_name($%) -> true; is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). +-spec parse_ipv4_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}. 
%% Binary variant of the IPv4 host parser. Acc holds the address
%% characters read so far (reversed). When a delimiter or the end of the
%% input is reached the collected address is validated, the component that
%% follows is parsed, and the input after the delimiter is returned with
%% the updated map; the host itself is extracted by the caller. Throws
%% uri_parse_error on an invalid address or character.
%%
%% Port, path and query are cut at their terminating delimiter rather than
%% computed from the size of the returned tail, so that a delimiter which
%% is the last byte of the input does not leak into the component (which
%% crashed binary_to_integer/1 for inputs such as "1.2.3.4:80/").
parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {_T, URI1} = parse_port(Rest, URI),
    case binary:split(Rest, [<<"/">>, <<"?">>, <<"#">>]) of
        [<<>> | _] ->
            %% Empty port is allowed by port = *DIGIT; leave it unset.
            {Rest, URI1};
        [Digits | _] ->
            {Rest, URI1#{port => binary_to_integer(Digits)}}
    end;
parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {_T, URI1} = parse_segment(Rest, URI),  % path-abempty
    [Path | _] = binary:split(Rest, [<<"?">>, <<"#">>]),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {_T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    [Query | _] = binary:split(Rest, <<"#">>),
    {Rest, URI1#{query => ?STRING_REST($?, Query)}};
parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) ->
    case is_ipv4(Char) of
        true -> parse_ipv4_bin(Rest, [Char|Acc], URI);
        false -> throw(uri_parse_error)
    end;
parse_ipv4_bin(?STRING_EMPTY, Acc, URI) ->
    _ = validate_ipv4_address(lists:reverse(Acc)),
    {?STRING_EMPTY, URI}.

-spec parse_ipv4(iolist(), list(), uri_map()) -> uri_map().
%% List (iolist) variant of the IPv4 host parser: gathers the address
%% characters in Acc (reversed), validates the address when a delimiter or
%% the end of the input is reached and stores it as the host.
parse_ipv4(?STRING(Bin), Acc, URI) when is_list(Acc) ->
    parse_ipv4(unicode:characters_to_list(Bin), Acc, URI);
parse_ipv4([Chunk|Tail], Acc, URI) when is_binary(Chunk) ->
    Chars = unicode:characters_to_list(Chunk, utf8),
    parse_ipv4(Chars ++ Tail, Acc, URI);
parse_ipv4([Chunk|Tail], Acc, URI) when is_list(Chunk) ->
    parse_ipv4(Chunk ++ Tail, Acc, URI);
parse_ipv4([$:|Tail], Acc, URI) ->
    Host = validate_ipv4_address(lists:reverse(Acc)),
    parse_port(Tail, [], URI#{host => Host});
parse_ipv4([$/|Tail], Acc, URI) ->
    Host = validate_ipv4_address(lists:reverse(Acc)),
    parse_segment(Tail, [$/], URI#{host => Host});  % path-abempty
parse_ipv4([$?|Tail], Acc, URI) ->
    Host = validate_ipv4_address(lists:reverse(Acc)),
    parse_query(Tail, [$?], URI#{host => Host});    % path-empty ?query
parse_ipv4([$#|Tail], Acc, URI) ->
    Host = validate_ipv4_address(lists:reverse(Acc)),
    parse_fragment(Tail, [], URI#{host => Host});   % path-empty
parse_ipv4([Char|Tail], Acc, URI) ->
    case is_ipv4(Char) of
        false -> throw(uri_parse_error);
        true -> parse_ipv4(Tail, [Char|Acc], URI)
    end;
parse_ipv4([], Acc, URI) ->
    URI#{host => validate_ipv4_address(lists:reverse(Acc))}.

%% True for characters that may appear in a dotted-decimal IPv4 address.
-spec is_ipv4(char()) -> boolean().
is_ipv4(Char) ->
    Char =:= $. orelse is_digit(Char).

%% Returns Addr unchanged when inet:parse_ipv4strict_address/1 accepts it,
%% otherwise throws uri_parse_error.
-spec validate_ipv4_address(list()) -> list().
validate_ipv4_address(Addr) ->
    case inet:parse_ipv4strict_address(Addr) of
        {error, _} -> throw(uri_parse_error);
        {ok, _} -> Addr
    end.


%% Binary variant of the IPv6 literal parser, entered after the opening
%% "[". Gathers the address characters in Acc (reversed) up to "]",
%% validates the address and continues with what follows the bracket.
%% Throws uri_parse_error on an invalid character or a missing "]".
-spec parse_ipv6_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}.
parse_ipv6_bin(?STRING_REST($], Tail), Acc, URI) ->
    _ = validate_ipv6_address(lists:reverse(Acc)),
    parse_ipv6_bin_end(Tail, URI);
parse_ipv6_bin(?STRING_REST(Char, Tail), Acc, URI) ->
    case is_ipv6(Char) of
        false -> throw(uri_parse_error);
        true -> parse_ipv6_bin(Tail, [Char|Acc], URI)
    end;
parse_ipv6_bin(?STRING_EMPTY, _Acc, _URI) ->
    throw(uri_parse_error).

-spec parse_ipv6(iolist(), list(), uri_map()) -> uri_map().
%% List (iolist) variant of the IPv6 literal parser, entered after the
%% opening "[". Collects the address characters in Acc (reversed) up to
%% "]", validates the address and stores it as the host. Throws
%% uri_parse_error on an invalid character or a missing "]".
parse_ipv6(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_ipv6(unicode:characters_to_list(Str), Acc, URI);
parse_ipv6([H|Rest], Acc, URI) when is_binary(H) ->
    parse_ipv6(unicode:characters_to_list(H, utf8) ++ Rest,
               Acc, URI);
parse_ipv6([H|Rest], Acc, URI) when is_list(H) ->
    parse_ipv6(H ++ Rest, Acc, URI);
parse_ipv6([$]|Rest], Acc, URI) ->
    parse_ipv6_end(Rest, [], URI#{host => validate_ipv6_address(lists:reverse(Acc))});
parse_ipv6([Char|Rest], Acc, URI) ->
    %% No diagnostic output here: a library parser reports failure only
    %% through the thrown uri_parse_error.
    case is_ipv6(Char) of
        true -> parse_ipv6(Rest, [Char|Acc], URI);
        false -> throw(uri_parse_error)
    end;
parse_ipv6([], _Acc, _URI) ->
    throw(uri_parse_error).

%% Check if char is allowed in IPv6 addresses
-spec is_ipv6(char()) -> boolean().
is_ipv6($:) -> true;
is_ipv6($.) -> true;
is_ipv6(Char) -> is_hex_digit(Char).


%% Binary variant: parses what follows the closing "]" of an IP literal.
%% Only ":" (port), "/", "?", "#" or the end of the input may follow; any
%% other character throws uri_parse_error (such characters used to be
%% skipped silently and ended up in the host). Returns the input after the
%% delimiter together with the updated map.
%%
%% Port, path and query are cut at their terminating delimiter rather than
%% computed from the size of the returned tail, so that a delimiter which
%% is the last byte of the input does not leak into the component.
-spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}.
parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) ->
    {_T, URI1} = parse_port(Rest, URI),
    case binary:split(Rest, [<<"/">>, <<"?">>, <<"#">>]) of
        [<<>> | _] ->
            %% Empty port is allowed by port = *DIGIT; leave it unset.
            {Rest, URI1};
        [Digits | _] ->
            {Rest, URI1#{port => binary_to_integer(Digits)}}
    end;
parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) ->
    {_T, URI1} = parse_segment(Rest, URI),  % path-abempty
    [Path | _] = binary:split(Rest, [<<"?">>, <<"#">>]),
    {Rest, URI1#{path => ?STRING_REST($/, Path)}};
parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) ->
    {_T, URI1} = parse_query(Rest, URI),  % path-empty ?query
    [Query | _] = binary:split(Rest, <<"#">>),
    {Rest, URI1#{query => ?STRING_REST($?, Query)}};
parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) ->
    {T, URI1} = parse_fragment(Rest, URI),  % path-empty
    {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)),
    {Rest, URI1#{fragment => Fragment}};
parse_ipv6_bin_end(?STRING_REST(_Char, _Rest), _URI) ->
    throw(uri_parse_error);
parse_ipv6_bin_end(?STRING_EMPTY, URI) ->
    {?STRING_EMPTY, URI}.

%% List (iolist) variant of parse_ipv6_bin_end/2. The host has already
%% been stored by parse_ipv6/3. Any character other than ":", "/", "?" or
%% "#" after the closing "]" throws uri_parse_error (it used to crash with
%% function_clause).
-spec parse_ipv6_end(iolist(), list(), uri_map()) -> uri_map().
parse_ipv6_end(?STRING(Str), Acc, URI) when is_list(Acc) ->
    parse_ipv6_end(unicode:characters_to_list(Str), Acc, URI);
parse_ipv6_end([H|Rest], Acc, URI) when is_binary(H) ->
    parse_ipv6_end(unicode:characters_to_list(H, utf8) ++ Rest,
                   Acc, URI);
parse_ipv6_end([H|Rest], Acc, URI) when is_list(H) ->
    parse_ipv6_end(H ++ Rest, Acc, URI);
parse_ipv6_end([$:|Rest], _Acc, URI) ->
    parse_port(Rest, [], URI);
parse_ipv6_end([$/|Rest], _Acc, URI) ->
    parse_segment(Rest, [$/], URI);  % path-abempty
parse_ipv6_end([$?|Rest], _Acc, URI) ->
    parse_query(Rest, [$?], URI);  % path-empty ?query
parse_ipv6_end([$#|Rest], _Acc, URI) ->
    parse_fragment(Rest, [], URI);  % path-empty
parse_ipv6_end([_Char|_Rest], _Acc, _URI) ->
    throw(uri_parse_error);
parse_ipv6_end([], _Acc, URI) ->
    URI.


%% Returns Addr unchanged when inet:parse_ipv6strict_address/1 accepts it,
%% otherwise throws uri_parse_error.
-spec validate_ipv6_address(list()) -> list().
validate_ipv6_address(Addr) ->
    case inet:parse_ipv6strict_address(Addr) of
        {ok, _} -> Addr;
        {error, _} -> throw(uri_parse_error)
    end.


is_digit(C)
  when $0 =< C, C =< $9 -> true;
is_digit(_) -> false.

%% True for the ASCII hexadecimal digits 0-9, a-f and A-F.
-spec is_hex_digit(char()) -> boolean().
is_hex_digit(C)
  when $0 =< C, C =< $9;$a =< C, C =< $f;$A =< C, C =< $F -> true;
is_hex_digit(_) -> false.

%% Returns the size of a binary excluding the first element.
%% Used in calls to split_binary().
-spec byte_size_exl_head(binary()) -> number().
byte_size_exl_head(<<>>) -> 0;
byte_size_exl_head(Binary) -> byte_size(Binary) + 1.

%% Remove brackets from binary: when the input starts with "[", that
%% bracket is dropped together with the first "[" and the first "]" found
%% in the remainder. Any other input is returned unchanged.
-spec remove_brackets(binary()) -> binary().
remove_brackets(?STRING_REST($[,Addr)) ->
    A1 = binary:replace(Addr, <<$[>>, <<>>),
    binary:replace(A1, <<$]>>, <<>>);
remove_brackets(Addr) -> Addr.
-- cgit v1.2.3 From 6c0c11eeaf0649cfbca5e426263c7dc43b49feff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Tue, 19 Sep 2017 16:07:49 +0200 Subject: stdlib: Add support to parse percent-encoded URIs --- lib/stdlib/src/uri_string.erl | 198 ++++++++++++++++++++++++++++++------------ 1 file changed, 143 insertions(+), 55 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 3656d561be..50e8a0bf5a 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -223,9 +223,9 @@ %% -module(uri_string). - -export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). +-export([is_host/1, is_path/1]). % suppress warnings -export_type([uri_map/0, uri_string/0]). -define(CHAR(Char), <>). @@ -383,31 +383,31 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> try parse_userinfo(Rest, URI) of {T, URI1} -> {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), - URI1#{userinfo => Userinfo} + URI1#{userinfo => decode_userinfo(Userinfo)} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), - URI1#{host => remove_brackets(Host)} + URI1#{host => decode_host(remove_brackets(Host))} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - URI1#{path => ?STRING_REST($/, Path)}; + URI1#{path => decode_path(?STRING_REST($/, Path))}; parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - URI1#{query => ?STRING_REST($?, Query)}; + URI1#{query => decode_query(?STRING_REST($?, Query))}; parse_relative_part(?STRING_REST($#, 
Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - URI1#{fragment => Fragment}; + URI1#{fragment => decode_fragment(Fragment)}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> {T, URI1} = parse_segment_nz_nc(Rest, URI), % path-noscheme {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - URI1#{path => ?STRING_REST(Char, Path)}; + URI1#{path => decode_path(?STRING_REST(Char, Path))}; false -> throw(uri_parse_error) end. @@ -491,11 +491,11 @@ parse_segment(?STRING_REST($/, Rest), URI) -> parse_segment(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of true -> parse_segment(Rest, URI); @@ -515,16 +515,16 @@ parse_segment([H|Rest], Acc, URI) when is_list(H) -> parse_segment([$/|Rest], Acc, URI) -> parse_segment(Rest, [$/|Acc], URI); % segment parse_segment([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{path => lists:reverse(Acc)}); % ?query + parse_query(Rest, [$?], URI#{path => decode_path(lists:reverse(Acc))}); % ?query parse_segment([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{path => lists:reverse(Acc)}); + parse_fragment(Rest, [], URI#{path => decode_path(lists:reverse(Acc))}); parse_segment([Char|Rest], Acc, URI) -> case is_pchar(Char) of true -> parse_segment(Rest, [Char|Acc], URI); false -> throw(uri_parse_error) end; parse_segment([], Acc, URI) -> - URI#{path => 
lists:reverse(Acc)}. + URI#{path => decode_path(lists:reverse(Acc))}. %%------------------------------------------------------------------------- %% path-noscheme @@ -535,11 +535,11 @@ parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> parse_segment_nz_nc(Rest, URI); @@ -559,16 +559,16 @@ parse_segment_nz_nc([H|Rest], Acc, URI) when is_list(H) -> parse_segment_nz_nc([$/|Rest], Acc, URI) -> parse_segment(Rest, [$/|Acc], URI); % segment parse_segment_nz_nc([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{path => lists:reverse(Acc)}); % ?query + parse_query(Rest, [$?], URI#{path => decode_path(lists:reverse(Acc))}); % ?query parse_segment_nz_nc([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{path => lists:reverse(Acc)}); + parse_fragment(Rest, [], URI#{path => decode_path(lists:reverse(Acc))}); parse_segment_nz_nc([Char|Rest], Acc, URI) -> case is_segment_nz_nc(Char) of true -> parse_segment_nz_nc(Rest, [Char|Acc], URI); false -> throw(uri_parse_error) end; parse_segment_nz_nc([], Acc, URI) -> - URI#{path => lists:reverse(Acc)}. + URI#{path => decode_path(lists:reverse(Acc))}. %% Check if char is pchar. -spec is_pchar(char()) -> boolean(). 
@@ -664,31 +664,31 @@ parse_hier(?STRING_REST("//", Rest), URI) -> try parse_userinfo(Rest, URI) of {T, URI1} -> {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), - {Rest, URI1#{userinfo => Userinfo}} + {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{host => remove_brackets(Host)}} + {Rest, URI1#{host => decode_host(remove_brackets(Host))}} end; parse_hier(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_hier(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of true -> % segment_nz {T, URI1} = parse_segment(Rest, URI), {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST(Char, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST(Char, Path))}}; false -> throw(uri_parse_error) end; parse_hier(?STRING_EMPTY, URI) -> @@ -756,7 +756,7 @@ parse_userinfo(?CHAR($@), _URI) -> parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), {Host, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, 
URI1#{host => remove_brackets(Host)}}; + {Rest, URI1#{host => decode_host(remove_brackets(Host))}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, URI); @@ -778,11 +778,11 @@ parse_userinfo([$@], _Acc, _URI) -> %% URI cannot end in userinfo state throw(uri_parse_error); parse_userinfo([$@|Rest], Acc, URI) -> - parse_host(Rest, [], URI#{userinfo => lists:reverse(Acc)}); + parse_host(Rest, [], URI#{userinfo => decode_userinfo(lists:reverse(Acc))}); parse_userinfo([Char|Rest], Acc, URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) % URI#{userinfo => lists:reverse(Acc)} + false -> throw(uri_parse_error) end; parse_userinfo([], _Acc, _URI) -> %% URI cannot end in userinfo state @@ -843,17 +843,17 @@ parse_host(?STRING_REST($:, Rest), URI) -> parse_host(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of true -> parse_ipv4_bin(Rest, [Char], URI); @@ -871,13 +871,13 @@ parse_host([H|Rest], Acc, URI) when is_binary(H) -> parse_host([H|Rest], Acc, URI) when 
is_list(H) -> parse_host(H ++ Rest, Acc, URI); parse_host([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => lists:reverse(Acc)}); + parse_port(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); parse_host([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => lists:reverse(Acc)}); % path-abempty + parse_segment(Rest, [$/], URI#{host => decode_host(lists:reverse(Acc))}); % path-abempty parse_host([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => lists:reverse(Acc)}); % path-empty ?query + parse_query(Rest, [$?], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty ?query parse_host([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => lists:reverse(Acc)}); % path-empty + parse_fragment(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty parse_host([$[|Rest], _Acc, URI) -> parse_ipv6(Rest, [], URI); parse_host([Char|Rest], Acc, URI) -> @@ -886,7 +886,7 @@ parse_host([Char|Rest], Acc, URI) -> false -> parse_reg_name([Char|Rest], Acc, URI) end; parse_host([], Acc, URI) -> - URI#{host => lists:reverse(Acc)}. + URI#{host => decode_host(lists:reverse(Acc))}. -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. 
@@ -898,15 +898,15 @@ parse_reg_name(?STRING_REST($:, Rest), URI) -> parse_reg_name(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_reg_name(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of true -> parse_reg_name(Rest, URI); @@ -924,20 +924,20 @@ parse_reg_name([H|Rest], Acc, URI) when is_binary(H) -> parse_reg_name([H|Rest], Acc, URI) when is_list(H) -> parse_reg_name(H ++ Rest, Acc, URI); parse_reg_name([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => lists:reverse(Acc)}); + parse_port(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); parse_reg_name([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => lists:reverse(Acc)}); % path-abempty + parse_segment(Rest, [$/], URI#{host => decode_host(lists:reverse(Acc))}); % path-abempty parse_reg_name([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => lists:reverse(Acc)}); % path-empty ?query + parse_query(Rest, [$?], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty ?query parse_reg_name([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => lists:reverse(Acc)}); % path-empty + parse_fragment(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty 
parse_reg_name([Char|Rest], Acc, URI) -> case is_reg_name(Char) of true -> parse_reg_name(Rest, [Char|Acc], URI); false -> throw(uri_parse_error) end; parse_reg_name([], Acc, URI) -> - URI#{host => lists:reverse(Acc)}. + URI#{host => decode_host(lists:reverse(Acc))}. %% Check if char is allowed in reg-name -spec is_reg_name(char()) -> boolean(). @@ -956,17 +956,17 @@ parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of true -> parse_ipv4_bin(Rest, [Char|Acc], URI); @@ -1062,15 +1062,15 @@ parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, 
byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of true -> parse_ipv6_bin_end(Rest, URI); @@ -1120,15 +1120,15 @@ validate_ipv6_address(Addr) -> parse_port(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{path => ?STRING_REST($/, Path)}}; + {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_port(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), - {Rest, URI1#{query => ?STRING_REST($?, Query)}}; + {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of true -> parse_port(Rest, URI); @@ -1180,7 +1180,7 @@ parse_port([], Acc, URI) -> parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), - {Rest, URI1#{fragment => Fragment}}; + {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of true -> parse_query(Rest, URI); @@ -1198,18 +1198,19 @@ parse_query([H|Rest], Acc, URI) when is_binary(H) -> 
parse_query([H|Rest], Acc, URI) when is_list(H) -> parse_query(H ++ Rest, Acc, URI); parse_query([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{query => lists:reverse(Acc)}); + parse_fragment(Rest, [], URI#{query => decode_query(lists:reverse(Acc))}); parse_query([Char|Rest], Acc, URI) -> case is_query(Char) of true -> parse_query(Rest, [Char|Acc], URI); false -> throw(uri_parse_error) end; parse_query([], Acc, URI) -> - URI#{query => lists:reverse(Acc)}. + URI#{query => decode_query(lists:reverse(Acc))}. %% Check if char is allowed in query -spec is_query(char()) -> boolean(). is_query($/) -> true; +is_query($?) -> true; is_query(Char) -> is_pchar(Char). @@ -1245,7 +1246,7 @@ parse_fragment([Char|Rest], Acc, URI) -> false -> throw(uri_parse_error) end; parse_fragment([], Acc, URI) -> - URI#{fragment => lists:reverse(Acc)}. + URI#{fragment => decode_fragment(lists:reverse(Acc))}. %% Check if char is allowed in fragment -spec is_fragment(char()) -> boolean(). @@ -1339,3 +1340,90 @@ remove_brackets(?STRING_REST($[,Addr)) -> A1 = binary:replace(Addr, <<$[>>, <<>>), binary:replace(A1, <<$]>>, <<>>); remove_brackets(Addr) -> Addr. + + +%%------------------------------------------------------------------------- +%% [RFC 3986, Chapter 2.1. Percent-Encoding] +%% +%% A percent-encoding mechanism is used to represent a data octet in a +%% component when that octet's corresponding character is outside the +%% allowed set or is being used as a delimiter of, or within, the +%% component. A percent-encoded octet is encoded as a character +%% triplet, consisting of the percent character "%" followed by the two +%% hexadecimal digits representing that octet's numeric value. For +%% example, "%20" is the percent-encoding for the binary octet +%% "00100000" (ABNF: %x20), which in US-ASCII corresponds to the space +%% character (SP). Section 2.4 describes when percent-encoding and +%% decoding is applied. 
+%% +%% pct-encoded = "%" HEXDIG HEXDIG +%%------------------------------------------------------------------------- +-spec decode_userinfo(list()|binary()) -> list() | binary(). +decode_userinfo(Cs) -> + decode(Cs, fun is_userinfo/1, <<>>). + + +-spec decode_host(list()|binary()) -> list() | binary(). +decode_host(Cs) -> + decode(Cs, fun is_host/1, <<>>). + +%% Check if char is allowed in host +-spec is_host(char()) -> boolean(). +is_host($:) -> true; +is_host(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). + + +-spec decode_path(list()|binary()) -> list() | binary(). +decode_path(Cs) -> + decode(Cs, fun is_path/1, <<>>). + +%% Check if char is allowed in path +-spec is_path(char()) -> boolean(). +is_path($/) -> true; + +is_path(Char) -> is_pchar(Char). + + +-spec decode_query(list()|binary()) -> list() | binary(). +decode_query(Cs) -> + decode(Cs, fun is_query/1, <<>>). + +-spec decode_fragment(list()|binary()) -> list() | binary(). +decode_fragment(Cs) -> + decode(Cs, fun is_host/1, <<>>). + + +-spec decode(list()|binary(), fun(), binary()) -> list() | binary(). +decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> + case is_hex_digit(C0) andalso is_hex_digit(C1) of + true -> + B = hex2dec(C0)*16+hex2dec(C1), + decode(Cs, Fun, <<Acc/binary, B>>); + false -> throw(uri_parse_error) + end; +decode(<<C,Cs/binary>>, Fun, Acc) -> + case Fun(C) of + true -> decode(Cs, Fun, <<Acc/binary, C>>); + false -> throw(uri_parse_error) + end; +decode(<<>>, _Fun, Acc) -> + Acc; +decode([$%,C0,C1|Cs], Fun, Acc) -> + case is_hex_digit(C0) andalso is_hex_digit(C1) of + true -> + B = hex2dec(C0)*16+hex2dec(C1), + decode(Cs, Fun, <<Acc/binary, B>>); + false -> throw(uri_parse_error) + end; +decode([C|Cs], Fun, Acc) -> + case Fun(C) of + true -> decode(Cs, Fun, <<Acc/binary, C>>); + false -> throw(uri_parse_error) + end; +decode([], _Fun, Acc) -> + unicode:characters_to_list(Acc).
+ + +hex2dec(X) when (X >= $0) andalso (X =< $9) -> X - $0; +hex2dec(X) when (X >= $A) andalso (X =< $F) -> X - $A + 10; +hex2dec(X) when (X >= $a) andalso (X =< $f) -> X - $a + 10. -- cgit v1.2.3 From 892bf58ee115a7e56ff38083afd85702bb8e14d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 20 Sep 2017 17:17:34 +0200 Subject: stdlib: Implement recompose - Implemented recompose function with percent-encoding and validation of IPv4/IPv6 addresses. - Added test for recompose that uses a generated test vector (URI combinations based on a fix set of URI components). - Added test for parse-recompose using a generated test vector. - Removed parsing functions for lists. Lists are converted to binary before parsing. --- lib/stdlib/src/uri_string.erl | 783 ++++++++++++++++++++++-------------------- 1 file changed, 403 insertions(+), 380 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 50e8a0bf5a..89a2c21518 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -223,16 +223,39 @@ %% -module(uri_string). +%%------------------------------------------------------------------------- +%% External API +%%------------------------------------------------------------------------- -export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). --export([is_host/1, is_path/1]). % suppress warnings -export_type([uri_map/0, uri_string/0]). + +%%------------------------------------------------------------------------- +%% Internal API +%%------------------------------------------------------------------------- +-export([is_host/1, is_path/1]). % suppress warnings + + +%%------------------------------------------------------------------------- +%% Macros +%%------------------------------------------------------------------------- -define(CHAR(Char), <<Char/utf8>>). -define(STRING_EMPTY, <<>>).
-define(STRING(MatchStr), <<MatchStr/utf8>>). -define(STRING_REST(MatchStr, Rest), <<MatchStr/utf8, Rest/binary>>). +-define(DEC2HEX(X), + if ((X) >= 0) andalso ((X) =< 9) -> (X) + $0; + ((X) >= 10) andalso ((X) =< 15) -> (X) + $A - 10 + end). + +-define(HEX2DEC(X), + if ((X) >= $0) andalso ((X) =< $9) -> (X) - $0; + ((X) >= $A) andalso ((X) =< $F) -> (X) - $A + 10; + ((X) >= $a) andalso ((X) =< $f) -> (X) - $a + 10 + end). + %%%========================================================================= %%% API @@ -250,8 +273,9 @@ %%------------------------------------------------------------------------- -type uri_string() :: iodata(). - +%%------------------------------------------------------------------------- %% RFC 3986, Chapter 3. Syntax Components +%%------------------------------------------------------------------------- -type uri_map() :: #{fragment => unicode:chardata(), host => unicode:chardata(), @@ -261,25 +285,44 @@ scheme => unicode:chardata(), userinfo => unicode:chardata()} | #{}. +%%------------------------------------------------------------------------- %% Parse URIs +%%------------------------------------------------------------------------- -spec parse(URIString) -> URIMap when URIString :: uri_string(), URIMap :: uri_map(). -parse(URIString) -> - if is_binary(URIString) -> - parse_uri_reference(URIString, #{}); - true -> - parse_uri_reference(URIString, [], #{}) - end. +parse(URIString) when is_binary(URIString) -> + parse_uri_reference(URIString, #{}); +parse(URIString) when is_list(URIString) -> + Binary = unicode:characters_to_binary(URIString), + Map = parse_uri_reference(Binary, #{}), + convert_mapfields_to_list(Map). +%%------------------------------------------------------------------------- %% Recompose URIs +%%------------------------------------------------------------------------- -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string(). -recompose(_) -> - "".
+recompose(Map) when map_size(Map) =:= 0 -> + ""; +recompose(Map) -> + case is_valid_map(Map) of + false -> + error({badarg, invalid_map}); + true -> + T0 = update_scheme(Map, empty), + T1 = update_userinfo(Map, T0), + T2 = update_host(Map, T1), + T3 = update_port(Map, T2), + T4 = update_path(Map, T3), + T5 = update_query(Map, T4), + update_fragment(Map, T5) + end. +%%------------------------------------------------------------------------- %% Resolve references +%%------------------------------------------------------------------------- -spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when RelativeURI :: uri_string(), AbsoluteBaseURI :: uri_string(), @@ -287,7 +330,9 @@ recompose(_) -> resolve_uri_reference(_,_) -> "". +%%------------------------------------------------------------------------- %% Create references +%%------------------------------------------------------------------------- -spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when AbsoluteSourceURI :: uri_string(), AbsoluteBaseURI :: uri_string(), @@ -295,33 +340,42 @@ resolve_uri_reference(_,_) -> create_uri_reference(_,_) -> "". +%%------------------------------------------------------------------------- %% Normalize URIs +%%------------------------------------------------------------------------- -spec normalize(URIString) -> NormalizedURI when URIString :: uri_string(), NormalizedURI :: uri_string(). normalize(_) -> "". +%%------------------------------------------------------------------------- %% Transcode URIs +%%------------------------------------------------------------------------- -spec transcode(URIString, Options) -> URIString when URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. transcode(_, _) -> "". 
- +%%------------------------------------------------------------------------- %% Working with query strings %% HTML 2.0 - application/x-www-form-urlencoded %% RFC 1866 [8.2.1] +%%------------------------------------------------------------------------- +%%------------------------------------------------------------------------- %% Compose urlencoded query string from a list of unescaped key/value pairs. +%%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when QueryList :: [{unicode:chardata(), unicode:chardata()}], QueryString :: uri_string(). compose_query(_) -> "". +%%------------------------------------------------------------------------- %% Dissect a query string into a list of unescaped key/value pairs. +%%------------------------------------------------------------------------- -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), QueryList :: [{unicode:chardata(), unicode:chardata()}]. @@ -333,6 +387,14 @@ dissect_query(_) -> %%% Internal functions %%%======================================================================== +%%------------------------------------------------------------------------- +%% Converts Map fields to lists +%%------------------------------------------------------------------------- +convert_mapfields_to_list(Map) -> + Fun = fun (_, V) when is_binary(V) -> unicode:characters_to_list(V); + (_, V) -> V end, + maps:map(Fun, Map). + %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 4.1. URI Reference] @@ -342,16 +404,6 @@ dissect_query(_) -> %% %% URI-reference = URI / relative-ref %%------------------------------------------------------------------------- --spec parse_uri_reference(iolist(), list(), uri_map()) -> uri_map(). 
-parse_uri_reference([], _, _) -> #{}; -parse_uri_reference(URIString, Acc, URI) -> - try parse_scheme_start(URIString, Acc, URI) of - Res -> Res - catch - throw:uri_parse_error -> - parse_relative_part(URIString, Acc, URI) - end. - -spec parse_uri_reference(binary(), uri_map()) -> uri_map(). parse_uri_reference(<<>>, _) -> #{}; parse_uri_reference(URIString, URI) -> @@ -411,32 +463,6 @@ parse_relative_part(?STRING_REST(Char, Rest), URI) -> false -> throw(uri_parse_error) end. --spec parse_relative_part(iolist(), list(), uri_map()) -> uri_map(). -parse_relative_part([H|Rest], Acc, URI) when is_binary(H) -> - parse_relative_part(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_relative_part([H|Rest], Acc, URI) when is_list(H) -> - parse_relative_part(H ++ Rest, Acc, URI); -parse_relative_part("//" ++ Rest, Acc, URI) -> - % Parse userinfo - try parse_userinfo(Rest, Acc, URI) of - Res -> Res - catch - throw:uri_parse_error -> - parse_host(Rest, Acc, URI) - end; -parse_relative_part([$/|Rest], _Acc, URI) -> - parse_segment(Rest, [$/], URI); % path-absolute -parse_relative_part([$?|Rest], _Acc, URI) -> - parse_query(Rest, [$?], URI); % path-empty ?query -parse_relative_part([$#|Rest], _Acc, URI) -> - parse_fragment(Rest, [], URI); % path-empty -parse_relative_part([Char|Rest], _, URI) -> - case is_segment_nz_nc(Char) of - true -> parse_segment_nz_nc(Rest, [Char], URI); % path-noscheme - false -> throw(uri_parse_error) - end. - %% Returns size of 'Rest' for proper calculation of splitting position. %% Solves the following special case: @@ -504,27 +530,6 @@ parse_segment(?STRING_REST(Char, Rest), URI) -> parse_segment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_segment(iolist(), list(), uri_map()) -> uri_map(). 
-parse_segment(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_segment(unicode:characters_to_list(Str), Acc, URI); -parse_segment([H|Rest], Acc, URI) when is_binary(H) -> - parse_segment(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_segment([H|Rest], Acc, URI) when is_list(H) -> - parse_segment(H ++ Rest, Acc, URI); -parse_segment([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/|Acc], URI); % segment -parse_segment([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{path => decode_path(lists:reverse(Acc))}); % ?query -parse_segment([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{path => decode_path(lists:reverse(Acc))}); -parse_segment([Char|Rest], Acc, URI) -> - case is_pchar(Char) of - true -> parse_segment(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_segment([], Acc, URI) -> - URI#{path => decode_path(lists:reverse(Acc))}. %%------------------------------------------------------------------------- %% path-noscheme @@ -548,27 +553,6 @@ parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> parse_segment_nz_nc(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_segment_nz_nc(iolist(), list(), uri_map()) -> uri_map(). 
-parse_segment_nz_nc(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_segment_nz_nc(unicode:characters_to_list(Str), Acc, URI); -parse_segment_nz_nc([H|Rest], Acc, URI) when is_binary(H) -> - parse_segment_nz_nc(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_segment_nz_nc([H|Rest], Acc, URI) when is_list(H) -> - parse_segment_nz_nc(H ++ Rest, Acc, URI); -parse_segment_nz_nc([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/|Acc], URI); % segment -parse_segment_nz_nc([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{path => decode_path(lists:reverse(Acc))}); % ?query -parse_segment_nz_nc([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{path => decode_path(lists:reverse(Acc))}); -parse_segment_nz_nc([Char|Rest], Acc, URI) -> - case is_segment_nz_nc(Char) of - true -> parse_segment_nz_nc(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_segment_nz_nc([], Acc, URI) -> - URI#{path => decode_path(lists:reverse(Acc))}. %% Check if char is pchar. -spec is_pchar(char()) -> boolean(). @@ -601,18 +585,6 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> false -> throw(uri_parse_error) end. --spec parse_scheme_start(iolist(), list(), uri_map()) -> uri_map(). -parse_scheme_start([H|Rest], Acc, URI) when is_binary(H) -> - parse_scheme_start(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_scheme_start([H|Rest], Acc, URI) when is_list(H) -> - parse_scheme_start(H ++ Rest, Acc, URI); -parse_scheme_start([Char|Rest], Acc, URI) -> - case is_alpha(Char) of - true -> parse_scheme(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end. - -spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}. parse_scheme(?STRING_REST($:, Rest), URI) -> @@ -626,23 +598,6 @@ parse_scheme(?STRING_REST(Char, Rest), URI) -> parse_scheme(?STRING_EMPTY, _URI) -> throw(uri_parse_error). --spec parse_scheme(iolist(), list(), uri_map()) -> uri_map(). 
-parse_scheme(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_scheme(unicode:characters_to_list(Str), Acc, URI); -parse_scheme([H|Rest], Acc, URI) when is_binary(H) -> - parse_scheme(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_scheme([H|Rest], Acc, URI) when is_list(H) -> - parse_scheme(H ++ Rest, Acc, URI); -parse_scheme([$:|Rest], Acc, URI) -> - parse_hier(Rest, [], URI#{scheme => lists:reverse(Acc)}); -parse_scheme([Char|Rest], Acc, URI) -> - case is_scheme(Char) of - true -> parse_scheme(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_scheme([], _Acc, _URI) -> - throw(uri_parse_error). %% Check if char is allowed in scheme -spec is_scheme(char()) -> boolean(). @@ -694,36 +649,6 @@ parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless parse_hier(?STRING_EMPTY, URI) -> {<<>>, URI}. --spec parse_hier(iolist(), list(), uri_map()) -> uri_map(). -parse_hier(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_hier(unicode:characters_to_list(Str), Acc, URI); -parse_hier([H|Rest], Acc, URI) when is_binary(H) -> - parse_hier(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_hier([H|Rest], Acc, URI) when is_list(H) -> - parse_hier(H ++ Rest, Acc, URI); -parse_hier("//" ++ Rest, Acc, URI) -> - % Parse userinfo - try parse_userinfo(Rest, Acc, URI) of - Res -> Res - catch - throw:uri_parse_error -> - parse_host(Rest, [], URI) - end; -parse_hier([$/|Rest], _Acc, URI) -> - parse_segment(Rest, [$/], URI); % path-absolute -parse_hier([$?|Rest], _Acc, URI) -> - parse_query(Rest, [$?], URI); % path-empty ?query -parse_hier([$#|Rest], _Acc, URI) -> - parse_fragment(Rest, [], URI); % path-empty -parse_hier([Char|Rest], _, URI) -> % path-rootless - case is_pchar(Char) of - true -> parse_segment(Rest, [Char], URI); - false -> throw(uri_parse_error) - end; -parse_hier([], _, URI) -> - URI. - %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.2. 
Authority] @@ -766,27 +691,6 @@ parse_userinfo(?STRING_EMPTY, _URI) -> %% URI cannot end in userinfo state throw(uri_parse_error). --spec parse_userinfo(iolist(), list(), uri_map()) -> uri_map(). -parse_userinfo(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_userinfo(unicode:characters_to_list(Str), Acc, URI); -parse_userinfo([H|Rest], Acc, URI) when is_binary(H) -> - parse_userinfo(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_userinfo([H|Rest], Acc, URI) when is_list(H) -> - parse_userinfo(H ++ Rest, Acc, URI); -parse_userinfo([$@], _Acc, _URI) -> - %% URI cannot end in userinfo state - throw(uri_parse_error); -parse_userinfo([$@|Rest], Acc, URI) -> - parse_host(Rest, [], URI#{userinfo => decode_userinfo(lists:reverse(Acc))}); -parse_userinfo([Char|Rest], Acc, URI) -> - case is_userinfo(Char) of - true -> parse_userinfo(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_userinfo([], _Acc, _URI) -> - %% URI cannot end in userinfo state - throw(uri_parse_error). %% Check if char is allowed in userinfo -spec is_userinfo(char()) -> boolean(). @@ -862,32 +766,6 @@ parse_host(?STRING_REST(Char, Rest), URI) -> parse_host(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_host(iolist(), list(), uri_map()) -> uri_map(). 
-parse_host(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_host(unicode:characters_to_list(Str), Acc, URI); -parse_host([H|Rest], Acc, URI) when is_binary(H) -> - parse_host(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_host([H|Rest], Acc, URI) when is_list(H) -> - parse_host(H ++ Rest, Acc, URI); -parse_host([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); -parse_host([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => decode_host(lists:reverse(Acc))}); % path-abempty -parse_host([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty ?query -parse_host([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty -parse_host([$[|Rest], _Acc, URI) -> - parse_ipv6(Rest, [], URI); -parse_host([Char|Rest], Acc, URI) -> - case is_digit(Char) of - true -> parse_ipv4(Rest, [Char|Acc], URI); - false -> parse_reg_name([Char|Rest], Acc, URI) - end; -parse_host([], Acc, URI) -> - URI#{host => decode_host(lists:reverse(Acc))}. - -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. parse_reg_name(?STRING_REST($:, Rest), URI) -> @@ -915,30 +793,6 @@ parse_reg_name(?STRING_REST(Char, Rest), URI) -> parse_reg_name(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_reg_name(iolist(), list(), uri_map()) -> uri_map(). 
-parse_reg_name(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_reg_name(unicode:characters_to_list(Str), Acc, URI); -parse_reg_name([H|Rest], Acc, URI) when is_binary(H) -> - parse_reg_name(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_reg_name([H|Rest], Acc, URI) when is_list(H) -> - parse_reg_name(H ++ Rest, Acc, URI); -parse_reg_name([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); -parse_reg_name([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => decode_host(lists:reverse(Acc))}); % path-abempty -parse_reg_name([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty ?query -parse_reg_name([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => decode_host(lists:reverse(Acc))}); % path-empty -parse_reg_name([Char|Rest], Acc, URI) -> - case is_reg_name(Char) of - true -> parse_reg_name(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_reg_name([], Acc, URI) -> - URI#{host => decode_host(lists:reverse(Acc))}. - %% Check if char is allowed in reg-name -spec is_reg_name(char()) -> boolean(). is_reg_name($%) -> true; @@ -976,29 +830,6 @@ parse_ipv4_bin(?STRING_EMPTY, Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {?STRING_EMPTY, URI}. --spec parse_ipv4(iolist(), list(), uri_map()) -> uri_map(). 
-parse_ipv4(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_ipv4(unicode:characters_to_list(Str), Acc, URI); -parse_ipv4([H|Rest], Acc, URI) when is_binary(H) -> - parse_ipv4(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_ipv4([H|Rest], Acc, URI) when is_list(H) -> - parse_ipv4(H ++ Rest, Acc, URI); -parse_ipv4([$:|Rest], Acc, URI) -> - parse_port(Rest, [], URI#{host => validate_ipv4_address(lists:reverse(Acc))}); -parse_ipv4([$/|Rest], Acc, URI) -> - parse_segment(Rest, [$/], URI#{host => validate_ipv4_address(lists:reverse(Acc))}); % path-abempty -parse_ipv4([$?|Rest], Acc, URI) -> - parse_query(Rest, [$?], URI#{host => validate_ipv4_address(lists:reverse(Acc))}); % path-empty ?query -parse_ipv4([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{host => validate_ipv4_address(lists:reverse(Acc))}); % path-empty -parse_ipv4([Char|Rest], Acc, URI) -> - case is_ipv4(Char) of - true -> parse_ipv4(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_ipv4([], Acc, URI) -> - URI#{host => validate_ipv4_address(lists:reverse(Acc))}. %% Check if char is allowed in IPv4 addresses -spec is_ipv4(char()) -> boolean(). @@ -1025,27 +856,6 @@ parse_ipv6_bin(?STRING_REST(Char, Rest), Acc, URI) -> parse_ipv6_bin(?STRING_EMPTY, _Acc, _URI) -> throw(uri_parse_error). --spec parse_ipv6(iolist(), list(), uri_map()) -> uri_map(). 
-parse_ipv6(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_ipv6(unicode:characters_to_list(Str), Acc, URI); -parse_ipv6([H|Rest], Acc, URI) when is_binary(H) -> - parse_ipv6(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_ipv6([H|Rest], Acc, URI) when is_list(H) -> - parse_ipv6(H ++ Rest, Acc, URI); -parse_ipv6([$]|Rest], Acc, URI) -> - parse_ipv6_end(Rest, [], URI#{host => validate_ipv6_address(lists:reverse(Acc))}); -parse_ipv6([Char|Rest], Acc, URI) -> - case is_ipv6(Char) of - true -> parse_ipv6(Rest, [Char|Acc], URI); - false -> - io:format("# DEBUG Char: >>~c<<~n", [Char]), - io:format("# DEBUG Rest: >>~s<<~n", [Rest]), - throw(uri_parse_error) - end; -parse_ipv6([], _Acc, _URI) -> - throw(uri_parse_error). - %% Check if char is allowed in IPv6 addresses -spec is_ipv6(char()) -> boolean(). is_ipv6($:) -> true; @@ -1079,26 +889,6 @@ parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> parse_ipv6_bin_end(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_ipv6_end(iolist(), list(), uri_map()) -> uri_map(). -parse_ipv6_end(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_ipv6_end(unicode:characters_to_list(Str), Acc, URI); -parse_ipv6_end([H|Rest], Acc, URI) when is_binary(H) -> - parse_ipv6_end(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_ipv6_end([H|Rest], Acc, URI) when is_list(H) -> - parse_ipv6_end(H ++ Rest, Acc, URI); -parse_ipv6_end([$:|Rest], _Acc, URI) -> - parse_port(Rest, [], URI); -parse_ipv6_end([$/|Rest], _Acc, URI) -> - parse_segment(Rest, [$/], URI); % path-abempty -parse_ipv6_end([$?|Rest], _Acc, URI) -> - parse_query(Rest, [$?], URI); % path-empty ?query -parse_ipv6_end([$#|Rest], _Acc, URI) -> - parse_fragment(Rest, [], URI); % path-empty -parse_ipv6_end([], _Acc, URI) -> - URI. - - -spec validate_ipv6_address(list()) -> list(). 
validate_ipv6_address(Addr) -> case inet:parse_ipv6strict_address(Addr) of @@ -1137,32 +927,6 @@ parse_port(?STRING_REST(Char, Rest), URI) -> parse_port(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_port(iolist(), list(), uri_map()) -> uri_map(). -parse_port(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_port(unicode:characters_to_list(Str), Acc, URI); -parse_port([H|Rest], Acc, URI) when is_binary(H) -> - parse_port(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_port([H|Rest], Acc, URI) when is_list(H) -> - parse_port(H ++ Rest, Acc, URI); -parse_port([$/|Rest], Acc, URI) -> - {Port, _} = string:to_integer(lists:reverse(Acc)), - parse_segment(Rest, [$/], URI#{port => Port}); % path-abempty -parse_port([$?|Rest], Acc, URI) -> - {Port, _} = string:to_integer(lists:reverse(Acc)), - parse_query(Rest, [$?], URI#{port => Port}); % path-empty ?query -parse_port([$#|Rest], Acc, URI) -> - {Port, _} = string:to_integer(lists:reverse(Acc)), - parse_fragment(Rest, [], URI#{port => Port}); % path-empty -parse_port([Char|Rest], Acc, URI) -> - case is_digit(Char) of - true -> parse_port(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_port([], Acc, URI) -> - {Port, _} = string:to_integer(lists:reverse(Acc)), - URI#{port => Port}. - %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.4. Query] @@ -1189,23 +953,6 @@ parse_query(?STRING_REST(Char, Rest), URI) -> parse_query(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_query(iolist(), list(), uri_map()) -> uri_map(). 
-parse_query(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_query(unicode:characters_to_list(Str), Acc, URI); -parse_query([H|Rest], Acc, URI) when is_binary(H) -> - parse_query(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_query([H|Rest], Acc, URI) when is_list(H) -> - parse_query(H ++ Rest, Acc, URI); -parse_query([$#|Rest], Acc, URI) -> - parse_fragment(Rest, [], URI#{query => decode_query(lists:reverse(Acc))}); -parse_query([Char|Rest], Acc, URI) -> - case is_query(Char) of - true -> parse_query(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_query([], Acc, URI) -> - URI#{query => decode_query(lists:reverse(Acc))}. %% Check if char is allowed in query -spec is_query(char()) -> boolean(). @@ -1232,21 +979,6 @@ parse_fragment(?STRING_REST(Char, Rest), URI) -> parse_fragment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. --spec parse_fragment(iolist(), list(), uri_map()) -> uri_map(). -parse_fragment(?STRING(Str), Acc, URI) when is_list(Acc) -> - parse_fragment(unicode:characters_to_list(Str), Acc, URI); -parse_fragment([H|Rest], Acc, URI) when is_binary(H) -> - parse_fragment(unicode:characters_to_list(H, utf8) ++ Rest, - Acc, URI); -parse_fragment([H|Rest], Acc, URI) when is_list(H) -> - parse_fragment(H ++ Rest, Acc, URI); -parse_fragment([Char|Rest], Acc, URI) -> - case is_fragment(Char) of - true -> parse_fragment(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) - end; -parse_fragment([], Acc, URI) -> - URI#{fragment => decode_fragment(lists:reverse(Acc))}. %% Check if char is allowed in fragment -spec is_fragment(char()) -> boolean(). @@ -1266,21 +998,6 @@ is_fragment(Char) -> is_pchar(Char). %% / "*" / "+" / "," / ";" / "=" %% %%------------------------------------------------------------------------- -%% %% Return true if input char is reserved. -%% -spec is_reserved(char()) -> boolean(). -%% is_reserved(Char) -> -%% is_gen_delim(Char) orelse is_sub_delim(Char). - -%% %% Check if char is reserved. 
-%% -spec is_gen_delim(char()) -> boolean(). -%% is_gen_delim($:) -> true; -%% is_gen_delim($/) -> true; -%% is_gen_delim($?) -> true; -%% is_gen_delim($#) -> true; -%% is_gen_delim($[) -> true; -%% is_gen_delim($]) -> true; -%% is_gen_delim($@) -> true; -%% is_gen_delim(_) -> false. %% Check if char is sub-delim. -spec is_sub_delim(char()) -> boolean(). @@ -1328,17 +1045,22 @@ is_hex_digit(C) when $0 =< C, C =< $9;$a =< C, C =< $f;$A =< C, C =< $F -> true; is_hex_digit(_) -> false. + %% Returns the size of a binary exluding the first element. %% Used in calls to split_binary(). -spec byte_size_exl_head(binary()) -> number(). byte_size_exl_head(<<>>) -> 0; byte_size_exl_head(Binary) -> byte_size(Binary) + 1. -% Remove brackets from binary + +%% Remove enclosing brackets from binary -spec remove_brackets(binary()) -> binary(). -remove_brackets(?STRING_REST($[,Addr)) -> - A1 = binary:replace(Addr, <<$[>>, <<>>), - binary:replace(A1, <<$]>>, <<>>); +remove_brackets(<<$[/utf8, Rest/binary>>) -> + {H,T} = split_binary(Rest, byte_size(Rest) - 1), + case T =:= <<$]/utf8>> of + true -> H; + false -> Rest + end; remove_brackets(Addr) -> Addr. @@ -1362,42 +1084,72 @@ remove_brackets(Addr) -> Addr. decode_userinfo(Cs) -> decode(Cs, fun is_userinfo/1, <<>>). - -spec decode_host(list()|binary()) -> list() | binary(). decode_host(Cs) -> decode(Cs, fun is_host/1, <<>>). -%% Check if char is allowed in host --spec is_host(char()) -> boolean(). -is_host($:) -> true; -is_host(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). - - -spec decode_path(list()|binary()) -> list() | binary(). decode_path(Cs) -> decode(Cs, fun is_path/1, <<>>). -%% Check if char is allowed in path --spec is_path(char()) -> boolean(). -is_path($/) -> true; - -is_path(Char) -> is_pchar(Char). - - -spec decode_query(list()|binary()) -> list() | binary(). decode_query(Cs) -> decode(Cs, fun is_query/1, <<>>). -spec decode_fragment(list()|binary()) -> list() | binary(). 
decode_fragment(Cs) -> - decode(Cs, fun is_host/1, <<>>). + decode(Cs, fun is_fragment/1, <<>>). + + +%%------------------------------------------------------------------------- +%% Percent-encode +%%------------------------------------------------------------------------- + +%% Only validates as scheme cannot have percent-encoded characters +-spec encode_scheme(list()|binary()) -> list() | binary(). +encode_scheme([]) -> + throw(uri_parse_error); +encode_scheme(<<>>) -> + throw(uri_parse_error); +encode_scheme(Scheme) -> + case validate_scheme(Scheme) of + true -> Scheme; + false -> throw(uri_parse_error) + end. + +-spec encode_userinfo(list()|binary()) -> list() | binary(). +encode_userinfo(Cs) -> + encode(Cs, fun is_userinfo/1). + +-spec encode_host(list()|binary()) -> list() | binary(). +encode_host(Cs) -> + case classify_host(Cs) of + regname -> Cs; + ipv4 -> Cs; + ipv6 -> bracket_ipv6(Cs); + other -> encode(Cs, fun is_reg_name/1) + end. +-spec encode_path(list()|binary()) -> list() | binary(). +encode_path(Cs) -> + encode(Cs, fun is_path/1). +-spec encode_query(list()|binary()) -> list() | binary(). +encode_query(Cs) -> + encode(Cs, fun is_query/1). + +-spec encode_fragment(list()|binary()) -> list() | binary(). +encode_fragment(Cs) -> + encode(Cs, fun is_fragment/1). + +%%------------------------------------------------------------------------- +%% Helper funtions for percent-decode +%%------------------------------------------------------------------------- -spec decode(list()|binary(), fun(), binary()) -> list() | binary(). 
decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> - B = hex2dec(C0)*16+hex2dec(C1), + B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), decode(Cs, Fun, <>); false -> throw(uri_parse_error) end; @@ -1411,7 +1163,7 @@ decode(<<>>, _Fun, Acc) -> decode([$%,C0,C1|Cs], Fun, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> - B = hex2dec(C0)*16+hex2dec(C1), + B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), decode(Cs, Fun, <>); false -> throw(uri_parse_error) end; @@ -1423,7 +1175,278 @@ decode([C|Cs], Fun, Acc) -> decode([], _Fun, Acc) -> unicode:characters_to_list(Acc). +%% Check if char is allowed in host +-spec is_host(char()) -> boolean(). +is_host($:) -> true; +is_host(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). + +%% Check if char is allowed in path +-spec is_path(char()) -> boolean(). +is_path($/) -> true; +is_path(Char) -> is_pchar(Char). + + + +%%------------------------------------------------------------------------- +%% Helper functions for percent-encode +%%------------------------------------------------------------------------- +-spec encode(list()|binary(), fun()) -> list() | binary(). +encode(Component, Fun) when is_list(Component) -> + B = unicode:characters_to_binary(Component), + unicode:characters_to_list(encode(B, Fun, <<>>)); +encode(Component, Fun) when is_binary(Component) -> + encode(Component, Fun, <<>>). +%% +encode(<>, Fun, Acc) -> + C = encode_codepoint_binary(Char, Fun), + encode(Rest, Fun, <>); +encode(<<_Char, _Rest/binary>>, _Fun, _Acc) -> + throw(uri_parse_error); +encode(<<>>, _Fun, Acc) -> + Acc. + + +-spec encode_codepoint_binary(integer(), fun()) -> list(). +encode_codepoint_binary(C, Fun) -> + case Fun(C) of + false -> percent_encode_binary(C); + true -> <> + end. + + +-spec percent_encode_binary(integer()) -> binary(). +percent_encode_binary(Code) -> + percent_encode_binary(<>, <<>>). 
+ + +percent_encode_binary(<>, Acc) -> + percent_encode_binary(Rest, <>); +percent_encode_binary(<<>>, Acc) -> + Acc. -hex2dec(X) when (X >= $0) andalso (X =< $9) -> X - $0; -hex2dec(X) when (X >= $A) andalso (X =< $F) -> X - $A + 10; -hex2dec(X) when (X >= $a) andalso (X =< $f) -> X - $a + 10. + +%%------------------------------------------------------------------------- +%%------------------------------------------------------------------------- +validate_scheme([]) -> true; +validate_scheme([H|T]) -> + case is_scheme(H) of + true -> validate_scheme(T); + false -> false + end; +validate_scheme(<<>>) -> true; +validate_scheme(<>) -> + case is_scheme(H) of + true -> validate_scheme(Rest); + false -> false + end. + +%%------------------------------------------------------------------------- +%% Classifies hostname into the following categories: +%% regname, ipv4 - address does not contain reserved characters to be +%% percent-encoded +%% ipv6 - address does not contain reserved characters but it shall be +%% encolsed in brackets +%% other - address shall be percent-encoded +%%------------------------------------------------------------------------- +classify_host([]) -> false; +classify_host(Addr) when is_binary(Addr) -> + A = unicode:characters_to_list(Addr), + classify_host_ipv6(A); +classify_host(Addr) -> + classify_host_ipv6(Addr). + +classify_host_ipv6(Addr) -> + case is_ipv6_address(Addr) of + true -> ipv6; + false -> classify_host_ipv4(Addr) + end. + +classify_host_ipv4(Addr) -> + case is_ipv4_address(Addr) of + true -> ipv4; + false -> classify_host_regname(Addr) + end. + +classify_host_regname([]) -> regname; +classify_host_regname([H|T]) -> + case is_reg_name(H) of + true -> classify_host_regname(T); + false -> other + end; +classify_host_regname(<<>>) -> regname; +classify_host_regname(<>) -> + case is_reg_name(H) of + true -> classify_host_regname(Rest); + false -> other + end. 
+ +is_ipv4_address(Addr) -> + case inet:parse_ipv4strict_address(Addr) of + {ok, _} -> true; + {error, _} -> false + end. + +is_ipv6_address(Addr) -> + case inet:parse_ipv6strict_address(Addr) of + {ok, _} -> true; + {error, _} -> false + end. + +bracket_ipv6(Addr) when is_binary(Addr) -> + concat(<<$[,Addr/binary>>,<<$]>>); +bracket_ipv6(Addr) when is_list(Addr) -> + [$[|Addr] ++ "]". + + +%%------------------------------------------------------------------------- +%% Helper funtions for recompose +%%------------------------------------------------------------------------- + +%%------------------------------------------------------------------------- +%% Checks if input Map has valid combination of fields that can be +%% recomposed into a URI. +%% It filters out the following combinations from the set of all possible +%% values: +%% - port +%% E.g. ":8080" - invalid URI +%% - userinfo +%% E.g. "//user@" - invalid URI +%% - userinfo port +%% E.g. "//user@:8080" => #{host => [],port => 8080,userinfo => "user"} +%% There is always at least an empty host when both userinfo and port +%% are present. +%%------------------------------------------------------------------------- +is_valid_map(Map) -> + case + (not maps:is_key(userinfo, Map) andalso + not maps:is_key(host, Map) andalso + maps:is_key(port, Map)) + orelse + (maps:is_key(userinfo, Map) andalso + not maps:is_key(host, Map) andalso + not maps:is_key(port, Map)) + orelse + (maps:is_key(userinfo, Map) andalso + not maps:is_key(host, Map) andalso + maps:is_key(port, Map)) + of + true -> + false; + false -> + true + end. + + +update_scheme(#{scheme := Scheme}, _) -> + add_colon_postfix(encode_scheme(Scheme)); +update_scheme(#{}, _) -> + empty. 
+ + +update_userinfo(#{userinfo := Userinfo}, empty) -> + add_auth_prefix(encode_userinfo(Userinfo)); +update_userinfo(#{userinfo := Userinfo}, URI) -> + concat(URI,add_auth_prefix(encode_userinfo(Userinfo))); +update_userinfo(#{}, empty) -> + empty; +update_userinfo(#{}, URI) -> + URI. + + +update_host(#{host := Host}, empty) -> + add_auth_prefix(encode_host(Host)); +update_host(#{host := Host} = Map, URI) -> + concat(URI,add_host_prefix(Map, encode_host(Host))); +update_host(#{}, empty) -> + empty; +update_host(#{}, URI) -> + URI. + + +%% URI cannot be empty for ports. E.g. ":8080" is not a valid URI +update_port(#{port := Port}, URI) -> + concat(URI,add_colon(encode_port(Port))); +update_port(#{}, URI) -> + URI. + + +update_path(#{path := Path}, empty) -> + encode_path(Path); +update_path(#{path := Path}, URI) -> + concat(URI,encode_path(Path)); +update_path(#{}, empty) -> + empty; +update_path(#{}, URI) -> + URI. + + +update_query(#{query := Query}, empty) -> + encode_query(Query); +update_query(#{query := Query}, URI) -> + concat(URI,encode_query(Query)); +update_query(#{}, empty) -> + empty; +update_query(#{}, URI) -> + URI. + + +update_fragment(#{fragment := Fragment}, empty) -> + add_hashmark(encode_query(Fragment)); +update_fragment(#{fragment := Fragment}, URI) -> + concat(URI,add_hashmark(encode_fragment(Fragment))); +update_fragment(#{}, empty) -> + ""; +update_fragment(#{}, URI) -> + URI. + +%%------------------------------------------------------------------------- +%% Concatenates its arguments that can be lists and binaries. +%% The result is a list if at least one of its argument is a list and +%% binary otherwise. +%%------------------------------------------------------------------------- +concat(A, B) when is_binary(A), is_binary(B) -> + <>; +concat(A, B) when is_binary(A), is_list(B) -> + unicode:characters_to_list(A) ++ B; +concat(A, B) when is_list(A) -> + A ++ maybe_to_list(B). 
+ +add_hashmark(empty) -> empty; +add_hashmark(Comp) when is_binary(Comp) -> + <<$#, Comp/binary>>; +add_hashmark(Comp) when is_list(Comp) -> + [$#|Comp]. + +add_colon(empty) -> empty; +add_colon(Comp) when is_binary(Comp) -> + <<$:, Comp/binary>>; +add_colon(Comp) when is_list(Comp) -> + [$:|Comp]. + +add_colon_postfix(empty) -> empty; +add_colon_postfix(Comp) when is_binary(Comp) -> + <>; +add_colon_postfix(Comp) when is_list(Comp) -> + Comp ++ ":". + +add_auth_prefix(empty) -> empty; +add_auth_prefix(Comp) when is_binary(Comp) -> + <<"//", Comp/binary>>; +add_auth_prefix(Comp) when is_list(Comp) -> + [$/,$/|Comp]. + +add_host_prefix(_, empty) -> empty; +add_host_prefix(#{userinfo := _}, Host) when is_binary(Host) -> + <<$@,Host/binary>>; +add_host_prefix(#{}, Host) when is_binary(Host) -> + <<"//",Host/binary>>; +add_host_prefix(#{userinfo := _}, Host) when is_list(Host) -> + [$@|Host]; +add_host_prefix(#{}, Host) when is_list(Host) -> + [$/,$/|Host]. + +maybe_to_list(Comp) when is_binary(Comp) -> unicode:characters_to_list(Comp); +maybe_to_list(Comp) -> Comp. + +encode_port(Port) -> + integer_to_binary(Port). -- cgit v1.2.3 From 505579acda74b9281c965488f86cbd6c83254a57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 29 Sep 2017 16:54:50 +0200 Subject: stdlib: Improve calculation of parsed binary - Improved calculation of parsed binary. - Added tests for special corner cases. - Fixed dialyzer warnings. 
--- lib/stdlib/src/uri_string.erl | 246 +++++++++++++++++++++++++----------------- 1 file changed, 146 insertions(+), 100 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 89a2c21518..bb7079c193 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -434,51 +434,36 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> %% Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + Userinfo = calculate_parsed_part(Rest, T), URI1#{userinfo => decode_userinfo(Userinfo)} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), - {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), + Host = calculate_parsed_part_sl(Rest, T), URI1#{host => decode_host(remove_brackets(Host))} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), URI1#{path => decode_path(?STRING_REST($/, Path))}; parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), URI1#{query => decode_query(?STRING_REST($?, Query))}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), URI1#{fragment => decode_fragment(Fragment)}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> {T, URI1} = parse_segment_nz_nc(Rest, URI), % path-noscheme - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = 
calculate_parsed_part(Rest, T), URI1#{path => decode_path(?STRING_REST(Char, Path))}; false -> throw(uri_parse_error) end. -%% Returns size of 'Rest' for proper calculation of splitting position. -%% Solves the following special case: -%% -%% #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>). -%% -%% While keeping the following true: -%% -%% #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>). -%% #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>). -%% --spec byte_size_exl_single_slash(uri_string()) -> number(). -byte_size_exl_single_slash(<<$/>>) -> 0; -byte_size_exl_single_slash(Rest) -> byte_size(Rest). - - %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.3. Path] %% @@ -516,11 +501,11 @@ parse_segment(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of @@ -539,11 +524,11 @@ parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - {Fragment, _} = split_binary(Rest, 
byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of @@ -580,7 +565,7 @@ is_segment_nz_nc(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). parse_scheme_start(?STRING_REST(Char, Rest), URI) -> case is_alpha(Char) of true -> {T, URI1} = parse_scheme(Rest, URI), - {Scheme, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + Scheme = calculate_parsed_scheme(Rest, T), URI1#{scheme => ?STRING_REST(Char, Scheme)}; false -> throw(uri_parse_error) end. @@ -618,31 +603,31 @@ parse_hier(?STRING_REST("//", Rest), URI) -> % Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - {Userinfo, _} = split_binary(Rest, byte_size(Rest) - byte_size(T) - 1), + Userinfo = calculate_parsed_part(Rest, T), {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), - {Host, _} = split_binary(Rest, byte_size_exl_single_slash(Rest) - byte_size_exl_head(T)), + Host = calculate_parsed_part_sl(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}} end; parse_hier(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_hier(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, 
URI1#{fragment => decode_fragment(Fragment)}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of true -> % segment_nz {T, URI1} = parse_segment(Rest, URI), - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST(Char, Path))}}; false -> throw(uri_parse_error) end; @@ -680,7 +665,7 @@ parse_userinfo(?CHAR($@), _URI) -> throw(uri_parse_error); parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), - {Host, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Host = calculate_parsed_part(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of @@ -741,22 +726,22 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + H = calculate_parsed_part(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - 
byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -770,20 +755,20 @@ parse_host(?STRING_EMPTY, URI) -> -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + H = calculate_parsed_part(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_reg_name(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of @@ -803,23 +788,23 @@ is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). 
parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), - {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + H = calculate_parsed_part(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of @@ -866,20 +851,20 @@ is_ipv6(Char) -> is_hex_digit(Char). -spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - {H, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + H = calculate_parsed_part(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of @@ -909,15 +894,15 @@ validate_ipv6_address(Addr) -> -spec parse_port(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_port(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty - {Path, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_port(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - {Query, _} = split_binary(Rest, byte_size(Rest) - byte_size_exl_head(T)), + Query = calculate_parsed_part(Rest, T), {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -943,7 +928,7 @@ parse_port(?STRING_EMPTY, URI) -> -spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}. parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - {Fragment, _} = split_binary(Rest, byte_size(Rest) - byte_size(T)), + Fragment = calculate_parsed_part(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of @@ -1046,13 +1031,6 @@ is_hex_digit(C) is_hex_digit(_) -> false. -%% Returns the size of a binary exluding the first element. -%% Used in calls to split_binary(). --spec byte_size_exl_head(binary()) -> number(). -byte_size_exl_head(<<>>) -> 0; -byte_size_exl_head(Binary) -> byte_size(Binary) + 1. - - %% Remove enclosing brackets from binary -spec remove_brackets(binary()) -> binary(). remove_brackets(<<$[/utf8, Rest/binary>>) -> @@ -1064,6 +1042,95 @@ remove_brackets(<<$[/utf8, Rest/binary>>) -> remove_brackets(Addr) -> Addr. +%%------------------------------------------------------------------------- +%% Helper functions for calculating the parsed binary. 
+%%------------------------------------------------------------------------- + +%% Returns the parsed binary based on Input and the Unparsed part. +%% Handles the following special cases: +%% +%% #{host => [],path => "/",query => "?"} = uri_string:parse("///?") +%% #{fragment => [],host => [],path => "/"} = uri_string:parse("///#") +%% +-spec calculate_parsed_part(binary(), binary()) -> binary(). +calculate_parsed_part(<<$?>>, _) -> <<>>; +calculate_parsed_part(<<$#>>, _) -> <<>>; +calculate_parsed_part(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +%% Returns the parsed binary based on Input and the Unparsed part. +%% Used when parsing authority. +%% +%% Handles the following special cases: +%% +%% #{host => "foo",query => "?"} = uri_string:parse("//foo?") +%% #{fragment => [],host => "foo"} = uri_string:parse("//foo#") +%% #{host => "foo",path => "/"} = uri_string:parse("//foo/") +%% #{host => "foo",query => "?",scheme => "http"} = uri_string:parse("http://foo?") +%% #{fragment => [],host => "foo",scheme => "http"} = uri_string:parse("http://foo#") +%% #{host => "foo",path => "/",scheme => "http"} = uri_string:parse("http://foo/") +%% +-spec calculate_parsed_part_sl(binary(), binary()) -> binary(). +calculate_parsed_part_sl(<<$?>>, _) -> <<>>; +calculate_parsed_part_sl(<<$#>>, _) -> <<>>; +calculate_parsed_part_sl(<<>>, _) -> <<>>; +calculate_parsed_part_sl(Input, <<>>) -> + case binary:last(Input) of + $? -> + {First, _} = + split_binary(Input, byte_size(Input) - 1), + First; + + $# -> + {First, _} = + split_binary(Input, byte_size(Input) - 1), + First; + $/ -> + {First, _} = + split_binary(Input, byte_size(Input) - 1), + First; + _Else -> + {First, _} = + split_binary(Input, byte_size_exl_single_slash(Input)), + First + end; +calculate_parsed_part_sl(Input, Unparsed) -> + {First, _} = + split_binary(Input, byte_size_exl_single_slash(Input) - byte_size_exl_head(Unparsed)), + First. 
+ + +%% Returns the parsed binary based on Input and the Unparsed part. +%% Used when parsing scheme. +-spec calculate_parsed_scheme(binary(), binary()) -> binary(). +calculate_parsed_scheme(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size(Unparsed) - 1), + First. + +%% Returns the size of a binary exluding the first element. +%% Used in calls to split_binary(). +-spec byte_size_exl_head(binary()) -> number(). +byte_size_exl_head(<<>>) -> 0; +byte_size_exl_head(Binary) -> byte_size(Binary) + 1. + + +%% Returns size of 'Rest' for proper calculation of splitting position. +%% Solves the following special case: +%% +%% #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>). +%% +%% While keeping the following true: +%% +%% #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>). +%% #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>). +%% +-spec byte_size_exl_single_slash(uri_string()) -> number(). +byte_size_exl_single_slash(<<$/>>) -> 0; +byte_size_exl_single_slash(Rest) -> byte_size(Rest). + + %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 2.1. Percent-Encoding] %% @@ -1080,23 +1147,23 @@ remove_brackets(Addr) -> Addr. %% %% pct-encoded = "%" HEXDIG HEXDIG %%------------------------------------------------------------------------- --spec decode_userinfo(list()|binary()) -> list() | binary(). +-spec decode_userinfo(binary()) -> binary(). decode_userinfo(Cs) -> decode(Cs, fun is_userinfo/1, <<>>). --spec decode_host(list()|binary()) -> list() | binary(). +-spec decode_host(binary()) -> binary(). decode_host(Cs) -> decode(Cs, fun is_host/1, <<>>). --spec decode_path(list()|binary()) -> list() | binary(). +-spec decode_path(binary()) -> binary(). decode_path(Cs) -> decode(Cs, fun is_path/1, <<>>). --spec decode_query(list()|binary()) -> list() | binary(). +-spec decode_query(binary()) -> binary(). 
decode_query(Cs) -> decode(Cs, fun is_query/1, <<>>). --spec decode_fragment(list()|binary()) -> list() | binary(). +-spec decode_fragment(binary()) -> binary(). decode_fragment(Cs) -> decode(Cs, fun is_fragment/1, <<>>). @@ -1136,7 +1203,10 @@ encode_path(Cs) -> -spec encode_query(list()|binary()) -> list() | binary(). encode_query(Cs) -> - encode(Cs, fun is_query/1). + case validate_query(Cs) of + true -> encode(Cs, fun is_query/1); + false -> throw(uri_parse_error) + end. -spec encode_fragment(list()|binary()) -> list() | binary(). encode_fragment(Cs) -> @@ -1145,7 +1215,6 @@ encode_fragment(Cs) -> %%------------------------------------------------------------------------- %% Helper funtions for percent-decode %%------------------------------------------------------------------------- --spec decode(list()|binary(), fun(), binary()) -> list() | binary(). decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> @@ -1159,21 +1228,7 @@ decode(<>, Fun, Acc) -> false -> throw(uri_parse_error) end; decode(<<>>, _Fun, Acc) -> - Acc; -decode([$%,C0,C1|Cs], Fun, Acc) -> - case is_hex_digit(C0) andalso is_hex_digit(C1) of - true -> - B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), - decode(Cs, Fun, <>); - false -> throw(uri_parse_error) - end; -decode([C|Cs], Fun, Acc) -> - case Fun(C) of - true -> decode(Cs, Fun, <>); - false -> throw(uri_parse_error) - end; -decode([], _Fun, Acc) -> - unicode:characters_to_list(Acc). + Acc. %% Check if char is allowed in host -spec is_host(char()) -> boolean(). @@ -1186,7 +1241,6 @@ is_path($/) -> true; is_path(Char) -> is_pchar(Char). - %%------------------------------------------------------------------------- %% Helper functions for percent-encode %%------------------------------------------------------------------------- @@ -1206,7 +1260,7 @@ encode(<<>>, _Fun, Acc) -> Acc. --spec encode_codepoint_binary(integer(), fun()) -> list(). +-spec encode_codepoint_binary(integer(), fun()) -> binary(). 
encode_codepoint_binary(C, Fun) -> case Fun(C) of false -> percent_encode_binary(C); @@ -1240,6 +1294,11 @@ validate_scheme(<>) -> false -> false end. +validate_query([$?|_]) -> true; +validate_query(<<$?/utf8, _/binary>>) -> true; +validate_query(_) -> false. + + %%------------------------------------------------------------------------- %% Classifies hostname into the following categories: %% regname, ipv4 - address does not contain reserved characters to be @@ -1248,7 +1307,7 @@ validate_scheme(<>) -> %% encolsed in brackets %% other - address shall be percent-encoded %%------------------------------------------------------------------------- -classify_host([]) -> false; +classify_host([]) -> other; classify_host(Addr) when is_binary(Addr) -> A = unicode:characters_to_list(Addr), classify_host_ipv6(A); @@ -1272,12 +1331,6 @@ classify_host_regname([H|T]) -> case is_reg_name(H) of true -> classify_host_regname(T); false -> other - end; -classify_host_regname(<<>>) -> regname; -classify_host_regname(<>) -> - case is_reg_name(H) of - true -> classify_host_regname(Rest); - false -> other end. is_ipv4_address(Addr) -> @@ -1391,7 +1444,7 @@ update_query(#{}, URI) -> update_fragment(#{fragment := Fragment}, empty) -> - add_hashmark(encode_query(Fragment)); + add_hashmark(encode_fragment(Fragment)); update_fragment(#{fragment := Fragment}, URI) -> concat(URI,add_hashmark(encode_fragment(Fragment))); update_fragment(#{}, empty) -> @@ -1411,31 +1464,24 @@ concat(A, B) when is_binary(A), is_list(B) -> concat(A, B) when is_list(A) -> A ++ maybe_to_list(B). -add_hashmark(empty) -> empty; add_hashmark(Comp) when is_binary(Comp) -> <<$#, Comp/binary>>; add_hashmark(Comp) when is_list(Comp) -> [$#|Comp]. -add_colon(empty) -> empty; add_colon(Comp) when is_binary(Comp) -> - <<$:, Comp/binary>>; -add_colon(Comp) when is_list(Comp) -> - [$:|Comp]. + <<$:, Comp/binary>>. 
-add_colon_postfix(empty) -> empty; add_colon_postfix(Comp) when is_binary(Comp) -> <>; add_colon_postfix(Comp) when is_list(Comp) -> Comp ++ ":". -add_auth_prefix(empty) -> empty; add_auth_prefix(Comp) when is_binary(Comp) -> <<"//", Comp/binary>>; add_auth_prefix(Comp) when is_list(Comp) -> [$/,$/|Comp]. -add_host_prefix(_, empty) -> empty; add_host_prefix(#{userinfo := _}, Host) when is_binary(Host) -> <<$@,Host/binary>>; add_host_prefix(#{}, Host) when is_binary(Host) -> -- cgit v1.2.3 From 1335e59a60d5e195baf519d2c52b0ca0aa96831f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 4 Oct 2017 16:45:51 +0200 Subject: stdlib: Add property tests, bugfixes - Add property tests using PropEr. - Add new testcases to uri_string_SUITE. - Improve calculation of parsed binary. - Verify if input to parse() is UTF8 encoded. - Update is_valid_map(): added check for path and host. --- lib/stdlib/src/uri_string.erl | 224 +++++++++++++++++++++++++++++++++--------- 1 file changed, 176 insertions(+), 48 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index bb7079c193..893ba4c6bf 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -304,8 +304,6 @@ parse(URIString) when is_list(URIString) -> -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string(). -recompose(Map) when map_size(Map) =:= 0 -> - ""; recompose(Map) -> case is_valid_map(Map) of false -> @@ -405,7 +403,7 @@ convert_mapfields_to_list(Map) -> %% URI-reference = URI / relative-ref %%------------------------------------------------------------------------- -spec parse_uri_reference(binary(), uri_map()) -> uri_map(). 
-parse_uri_reference(<<>>, _) -> #{}; +parse_uri_reference(<<>>, _) -> #{path => <<>>}; parse_uri_reference(URIString, URI) -> try parse_scheme_start(URIString, URI) of Res -> Res @@ -434,13 +432,15 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> %% Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - Userinfo = calculate_parsed_part(Rest, T), - URI1#{userinfo => decode_userinfo(Userinfo)} + Userinfo = calculate_parsed_userinfo(Rest, T), + URI2 = maybe_add_path(URI1), + URI2#{userinfo => decode_userinfo(Userinfo)} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_part_sl(Rest, T), - URI1#{host => decode_host(remove_brackets(Host))} + URI2 = maybe_add_path(URI1), + URI2#{host => decode_host(remove_brackets(Host))} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute @@ -449,11 +449,13 @@ parse_relative_part(?STRING_REST($/, Rest), URI) -> parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_part(Rest, T), - URI1#{query => decode_query(?STRING_REST($?, Query))}; + URI2 = maybe_add_path(URI1), + URI2#{query => decode_query(?STRING_REST($?, Query))}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), - URI1#{fragment => decode_fragment(Fragment)}; + Fragment = calculate_parsed_fragment(Rest, T), + URI2 = maybe_add_path(URI1), + URI2#{fragment => decode_fragment(Fragment)}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> @@ -505,7 +507,7 @@ parse_segment(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = 
calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of @@ -528,7 +530,7 @@ parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of @@ -566,10 +568,32 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> case is_alpha(Char) of true -> {T, URI1} = parse_scheme(Rest, URI), Scheme = calculate_parsed_scheme(Rest, T), - URI1#{scheme => ?STRING_REST(Char, Scheme)}; + URI2 = maybe_add_path(URI1), + URI2#{scheme => ?STRING_REST(Char, Scheme)}; false -> throw(uri_parse_error) end. +%% Add path component if it missing after parsing the URI. +%% According to the URI specification there is always a +%% path component in every URI-reference and it can be +%% empty. + +%% maybe_add_path(Map) -> +%% case length(maps:keys(Map)) of +%% 0 -> +%% Map#{path => <<>>}; +%% _Else -> +%% Map +%% end. +maybe_add_path(Map) -> + case maps:is_key(path, Map) of + false -> + Map#{path => <<>>}; + _Else -> + Map + end. + + -spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_scheme(?STRING_REST($:, Rest), URI) -> @@ -603,7 +627,7 @@ parse_hier(?STRING_REST("//", Rest), URI) -> % Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - Userinfo = calculate_parsed_part(Rest, T), + Userinfo = calculate_parsed_userinfo(Rest, T), {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch throw:uri_parse_error -> @@ -621,7 +645,7 @@ parse_hier(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of @@ -660,12 +684,11 @@ parse_hier(?STRING_EMPTY, URI) -> %% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) %%------------------------------------------------------------------------- -spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}. -parse_userinfo(?CHAR($@), _URI) -> - %% URI cannot end in userinfo state - throw(uri_parse_error); +parse_userinfo(?CHAR($@), URI) -> + {?STRING_EMPTY, URI#{host => <<>>}}; parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), - Host = calculate_parsed_part(Rest, T), + Host = calculate_parsed_host(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of @@ -726,7 +749,7 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> @@ -741,7 +764,7 @@ parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -755,7 +778,7 @@ parse_host(?STRING_EMPTY, URI) -> -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> @@ -768,7 +791,7 @@ parse_reg_name(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of @@ -788,7 +811,7 @@ is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). 
parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> @@ -804,7 +827,7 @@ parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of @@ -851,7 +874,7 @@ is_ipv6(Char) -> is_hex_digit(Char). -spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}. parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> @@ -864,7 +887,7 @@ parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of @@ -902,7 +925,7 @@ parse_port(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; 
parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -928,7 +951,7 @@ parse_port(?STRING_EMPTY, URI) -> -spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}. parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of @@ -1055,11 +1078,88 @@ remove_brackets(Addr) -> Addr. -spec calculate_parsed_part(binary(), binary()) -> binary(). calculate_parsed_part(<<$?>>, _) -> <<>>; calculate_parsed_part(<<$#>>, _) -> <<>>; +calculate_parsed_part(<<>>, _) -> <<>>; +calculate_parsed_part(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + _Else -> + Input + end; calculate_parsed_part(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), First. +-spec calculate_parsed_userinfo(binary(), binary()) -> binary(). +calculate_parsed_userinfo(<<$?>>, _) -> <<>>; +calculate_parsed_userinfo(<<$#>>, _) -> <<>>; +calculate_parsed_userinfo(<<>>, _) -> <<>>; +calculate_parsed_userinfo(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + $@ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_userinfo(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_host(binary(), binary()) -> binary(). +calculate_parsed_host(<<$?>>, _) -> <<>>; +calculate_parsed_host(<<$#>>, _) -> <<>>; +calculate_parsed_host(<<>>, _) -> <<>>; +calculate_parsed_host(Input, <<>>) -> + case binary:last(Input) of + $? 
-> + init_binary(Input); + $# -> + init_binary(Input); + $/ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_host(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_port(binary(), binary()) -> binary(). +calculate_parsed_port(<<$?>>, _) -> <<>>; +calculate_parsed_port(<<$#>>, _) -> <<>>; +calculate_parsed_port(<<>>, _) -> <<>>; +calculate_parsed_port(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + $/ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_port(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_fragment(binary(), binary()) -> binary(). +calculate_parsed_fragment(<<$#>>, _) -> <<>>; +calculate_parsed_fragment(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + %% Returns the parsed binary based on Input and the Unparsed part. %% Used when parsing authority. %% @@ -1079,28 +1179,25 @@ calculate_parsed_part_sl(<<>>, _) -> <<>>; calculate_parsed_part_sl(Input, <<>>) -> case binary:last(Input) of $? -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; - + init_binary(Input); $# -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; + init_binary(Input); $/ -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; + init_binary(Input); _Else -> - {First, _} = - split_binary(Input, byte_size_exl_single_slash(Input)), - First + Input end; calculate_parsed_part_sl(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size_exl_single_slash(Input) - byte_size_exl_head(Unparsed)), First. +%% Return all bytes of the binary except the last one. The binary must be non-empty. +init_binary(B) -> + {Init, _} = + split_binary(B, byte_size(B) - 1), + Init. 
+ %% Returns the parsed binary based on Input and the Unparsed part. %% Used when parsing scheme. @@ -1109,6 +1206,7 @@ calculate_parsed_scheme(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size(Input) - byte_size(Unparsed) - 1), First. + %% Returns the size of a binary exluding the first element. %% Used in calls to split_binary(). -spec byte_size_exl_head(binary()) -> number(). @@ -1149,25 +1247,35 @@ byte_size_exl_single_slash(Rest) -> byte_size(Rest). %%------------------------------------------------------------------------- -spec decode_userinfo(binary()) -> binary(). decode_userinfo(Cs) -> - decode(Cs, fun is_userinfo/1, <<>>). + check_utf8(decode(Cs, fun is_userinfo/1, <<>>)). -spec decode_host(binary()) -> binary(). decode_host(Cs) -> - decode(Cs, fun is_host/1, <<>>). + check_utf8(decode(Cs, fun is_host/1, <<>>)). -spec decode_path(binary()) -> binary(). decode_path(Cs) -> - decode(Cs, fun is_path/1, <<>>). + check_utf8(decode(Cs, fun is_path/1, <<>>)). -spec decode_query(binary()) -> binary(). decode_query(Cs) -> - decode(Cs, fun is_query/1, <<>>). + check_utf8(decode(Cs, fun is_query/1, <<>>)). -spec decode_fragment(binary()) -> binary(). decode_fragment(Cs) -> - decode(Cs, fun is_fragment/1, <<>>). + check_utf8(decode(Cs, fun is_fragment/1, <<>>)). +%% Returns Cs if it is utf8 encoded. +check_utf8(Cs) -> + case unicode:characters_to_list(Cs) of + {incomplete,_,_} -> + throw(uri_parse_error); + {error,_,_} -> + throw(uri_parse_error); + _ -> Cs + end. + %%------------------------------------------------------------------------- %% Percent-encode %%------------------------------------------------------------------------- @@ -1368,10 +1476,15 @@ bracket_ipv6(Addr) when is_list(Addr) -> %% E.g. "//user@:8080" => #{host => [],port => 8080,userinfo => "user"} %% There is always at least an empty host when both userinfo and port %% are present. 
+%% - #{path => "///"} otherwise the following would be true: +%% "/////" = uri_string:recompose(#{host => "", path => "///"}) +%% "/////" = uri_string:recompose(#{path => "/////"}) +%% AND +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] %%------------------------------------------------------------------------- is_valid_map(Map) -> case - (not maps:is_key(userinfo, Map) andalso + ((not maps:is_key(userinfo, Map) andalso not maps:is_key(host, Map) andalso maps:is_key(port, Map)) orelse @@ -1381,7 +1494,9 @@ is_valid_map(Map) -> orelse (maps:is_key(userinfo, Map) andalso not maps:is_key(host, Map) andalso - maps:is_key(port, Map)) + maps:is_key(port, Map))) orelse + not maps:is_key(path, Map) orelse + not is_host_and_path_valid(Map) of true -> false; @@ -1390,6 +1505,19 @@ is_valid_map(Map) -> end. +is_host_and_path_valid(Map) -> + Host = maps:get(host, Map, undefined), + Path = maps:get(path, Map, undefined), + not (Host =:= undefined andalso starts_with_two_slash(Path)). + + +starts_with_two_slash([$/,$/|_]) -> + true; +starts_with_two_slash(?STRING_REST("//", _)) -> + true; +starts_with_two_slash(_) -> false. + + update_scheme(#{scheme := Scheme}, _) -> add_colon_postfix(encode_scheme(Scheme)); update_scheme(#{}, _) -> -- cgit v1.2.3 From 4a2358bbf4a4049a765aab435a31daeeffbbd677 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 11 Oct 2017 16:36:14 +0200 Subject: stdlib: Implement transcode/2. 
--- lib/stdlib/src/uri_string.erl | 112 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 110 insertions(+), 2 deletions(-) mode change 100755 => 100644 lib/stdlib/src/uri_string.erl (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl old mode 100755 new mode 100644 index 893ba4c6bf..439ffa80da --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -353,8 +353,26 @@ normalize(_) -> -spec transcode(URIString, Options) -> URIString when URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. -transcode(_, _) -> - "". +transcode(URIString, Options) when is_binary(URIString) -> + try + InEnc = proplists:get_value(in_encoding, Options, utf8), + OutEnc = proplists:get_value(out_encoding, Options, utf8), + List = convert_list(URIString, InEnc), + Output = transcode(List, [], InEnc, OutEnc), + convert_binary(Output, utf8, OutEnc) + of + Result -> Result + catch + throw:{error, L, RestData} -> {invalid_input, L, RestData} + end; +transcode(URIString, Options) when is_list(URIString) -> + InEnc = proplists:get_value(in_encoding, Options, utf8), + OutEnc = proplists:get_value(out_encoding, Options, utf8), + try transcode(URIString, [], InEnc, OutEnc) of + Result -> Result + catch + throw:{error, List, RestData} -> {invalid_input, List, RestData} + end. %%------------------------------------------------------------------------- %% Working with query strings @@ -1624,3 +1642,93 @@ maybe_to_list(Comp) -> Comp. encode_port(Port) -> integer_to_binary(Port). + +%%------------------------------------------------------------------------- +%% Helper functions for transcode +%%------------------------------------------------------------------------- + +%%------------------------------------------------------------------------- +%% uri_string:transcode(<<"x%00%00%00%F6"/utf32>>). +%% 1. 
Convert (transcode/2) input to list form (list of unicode codepoints) +%% "x%00%00%00%F6" +%% 2. Accumulate characters until percent-encoded segment (transcode/4). +%% Acc = "x" +%% 3. Convert percent-encoded triplets to binary form (transcode_pct/4) +%% <<0,0,0,246>> +%% 4. Transcode in-encoded binary to out-encoding (utf32 -> utf8): +%% <<195,182>> +%% 5. Percent-encode out-encoded binary: +%% <<"%C3%B6"/utf8>> = <<37,67,51,37,66,54>> +%% 6. Convert binary to list form, reverse it and append the accumulator +%% "6B%3C%" + "x" +%% 7. Reverse Acc and return it +%%------------------------------------------------------------------------- +transcode([$%,_C0,_C1|_Rest] = L, Acc, InEnc, OutEnc) -> + transcode_pct(L, Acc, <<>>, InEnc, OutEnc); +transcode([_C|_Rest] = L, Acc, InEnc, OutEnc) -> + transcode(L, Acc, [], InEnc, OutEnc). +%% +transcode([H|T], Acc, List, InEnc, OutEnc) when is_binary(H) -> + L = convert_list(H, InEnc), + transcode(L ++ T, Acc, List, InEnc, OutEnc); +transcode([H|T], Acc, List, InEnc, OutEnc) when is_list(H) -> + transcode(H ++ T, Acc, List, InEnc, OutEnc); +transcode([$%,_C0,_C1|_Rest] = L, Acc, List, InEncoding, OutEncoding) -> + transcode_pct(L, List ++ Acc, <<>>, InEncoding, OutEncoding); +transcode([C|Rest], Acc, List, InEncoding, OutEncoding) -> + transcode(Rest, Acc, [C|List], InEncoding, OutEncoding); +transcode([], Acc, List, _InEncoding, _OutEncoding) -> + lists:reverse(List ++ Acc). 
+ + +%% Transcode percent-encoded segment +transcode_pct([H|T], Acc, B, InEnc, OutEnc) when is_binary(H) -> + L = convert_list(H, InEnc), + transcode_pct(L ++ T, Acc, B, InEnc, OutEnc); +transcode_pct([H|T], Acc, B, InEnc, OutEnc) when is_list(H) -> + transcode_pct(H ++ T, Acc, B, InEnc, OutEnc); +transcode_pct([$%,C0,C1|Rest], Acc, B, InEncoding, OutEncoding) -> + case is_hex_digit(C0) andalso is_hex_digit(C1) of + true -> + Int = ?HEX2DEC(C0)*16+?HEX2DEC(C1), + transcode_pct(Rest, Acc, <>, InEncoding, OutEncoding); + false -> throw({error, lists:reverse(Acc),[C0,C1]}) + end; +transcode_pct([_C|_Rest] = L, Acc, B, InEncoding, OutEncoding) -> + OutBinary = convert_binary(B, InEncoding, OutEncoding), + PctEncUtf8 = percent_encode_segment(OutBinary), + Out = lists:reverse(convert_list(PctEncUtf8, utf8)), + transcode(L, Out ++ Acc, [], InEncoding, OutEncoding); +transcode_pct([], Acc, B, InEncoding, OutEncoding) -> + OutBinary = convert_binary(B, InEncoding, OutEncoding), + PctEncUtf8 = percent_encode_segment(OutBinary), + Out = convert_list(PctEncUtf8, utf8), + lists:reverse(Acc) ++ Out. + + +% Convert binary +convert_binary(Binary, InEncoding, OutEncoding) -> + case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of + {error, List, RestData} -> + throw({error, List, RestData}); + {incomplete, List, RestData} -> + throw({error, List, RestData}); + Result -> + Result + end. + + +% Convert binary +convert_list(Binary, InEncoding) -> + case unicode:characters_to_list(Binary, InEncoding) of + {error, List, RestData} -> + throw({error, List, RestData}); + {incomplete, List, RestData} -> + throw({error, List, RestData}); + Result -> + Result + end. + + +percent_encode_segment(Segment) -> + percent_encode_binary(Segment, <<>>). -- cgit v1.2.3 From 57f8021105f1c213be674681f48d0c8e92935ff6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 16 Oct 2017 13:30:36 +0200 Subject: stdlib: Change handling of queries ["?" 
query] Previously when parsing queries the first "?" was part of the parsed query in the result Map. This behavior has been changed to follow the patterns used with other URI components and to not include the special character(s) that mark the start of a specific component. --- lib/stdlib/src/uri_string.erl | 80 +++++++++++++++++++++++-------------------- 1 file changed, 42 insertions(+), 38 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 439ffa80da..f9e1e273bc 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -466,9 +466,9 @@ parse_relative_part(?STRING_REST($/, Rest), URI) -> URI1#{path => decode_path(?STRING_REST($/, Path))}; parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), + Query = calculate_parsed_query(Rest, T), URI2 = maybe_add_path(URI1), - URI2#{query => decode_query(?STRING_REST($?, Query))}; + URI2#{query => decode_query(Query)}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -521,8 +521,8 @@ parse_segment(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), Fragment = calculate_parsed_fragment(Rest, T), @@ -544,8 +544,8 @@ parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - Query = calculate_parsed_part(Rest, T), - {Rest, 
URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), Fragment = calculate_parsed_fragment(Rest, T), @@ -595,14 +595,6 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> %% According to the URI specification there is always a %% path component in every URI-reference and it can be %% empty. - -%% maybe_add_path(Map) -> -%% case length(maps:keys(Map)) of -%% 0 -> -%% Map#{path => <<>>}; -%% _Else -> -%% Map -%% end. maybe_add_path(Map) -> case maps:is_key(path, Map) of false -> @@ -659,8 +651,8 @@ parse_hier(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_hier(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -776,8 +768,8 @@ parse_host(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> @@ -805,8 +797,8 @@ parse_reg_name(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_reg_name(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - 
Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -840,8 +832,8 @@ parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty @@ -901,8 +893,8 @@ parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -939,8 +931,8 @@ parse_port(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_port(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_part(Rest, T), - {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; + Query = calculate_parsed_query(Rest, T), + {Rest, URI1#{query => decode_query(Query)}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, 
URI), % path-empty Fragment = calculate_parsed_fragment(Rest, T), @@ -1090,7 +1082,7 @@ remove_brackets(Addr) -> Addr. %% Returns the parsed binary based on Input and the Unparsed part. %% Handles the following special cases: %% -%% #{host => [],path => "/",query => "?"} = uri_string:parse("///?") +%% #{host => [],path => "/",query => []} = uri_string:parse("///?") %% #{fragment => [],host => [],path => "/"} = uri_string:parse("///#") %% -spec calculate_parsed_part(binary(), binary()) -> binary(). @@ -1171,6 +1163,20 @@ calculate_parsed_port(Input, Unparsed) -> First. +calculate_parsed_query(<<$#>>, _) -> <<>>; +calculate_parsed_query(<<>>, _) -> <<>>; +calculate_parsed_query(Input, <<>>) -> + case binary:last(Input) of + $# -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_query(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + -spec calculate_parsed_fragment(binary(), binary()) -> binary(). calculate_parsed_fragment(<<$#>>, _) -> <<>>; calculate_parsed_fragment(Input, Unparsed) -> @@ -1183,10 +1189,10 @@ calculate_parsed_fragment(Input, Unparsed) -> %% %% Handles the following special cases: %% -%% #{host => "foo",query => "?"} = uri_string:parse("//foo?") +%% #{host => "foo",query => []} = uri_string:parse("//foo?") %% #{fragment => [],host => "foo"} = uri_string:parse("//foo#") %% #{host => "foo",path => "/"} = uri_string:parse("//foo/") -%% #{host => "foo",query => "?",scheme => "http"} = uri_string:parse("http://foo?") +%% #{host => "foo",query => [],scheme => "http"} = uri_string:parse("http://foo?") %% #{fragment => [],host => "foo",scheme => "http"} = uri_string:parse("http://foo#") %% #{host => "foo",path => "/",scheme => "http"} = uri_string:parse("http://foo/") %% @@ -1329,10 +1335,7 @@ encode_path(Cs) -> -spec encode_query(list()|binary()) -> list() | binary(). 
encode_query(Cs) -> - case validate_query(Cs) of - true -> encode(Cs, fun is_query/1); - false -> throw(uri_parse_error) - end. + encode(Cs, fun is_query/1). -spec encode_fragment(list()|binary()) -> list() | binary(). encode_fragment(Cs) -> @@ -1420,10 +1423,6 @@ validate_scheme(<>) -> false -> false end. -validate_query([$?|_]) -> true; -validate_query(<<$?/utf8, _/binary>>) -> true; -validate_query(_) -> false. - %%------------------------------------------------------------------------- %% Classifies hostname into the following categories: @@ -1582,7 +1581,7 @@ update_path(#{}, URI) -> update_query(#{query := Query}, empty) -> encode_query(Query); update_query(#{query := Query}, URI) -> - concat(URI,encode_query(Query)); + concat(URI,add_question_mark(encode_query(Query))); update_query(#{}, empty) -> empty; update_query(#{}, URI) -> @@ -1615,6 +1614,11 @@ add_hashmark(Comp) when is_binary(Comp) -> add_hashmark(Comp) when is_list(Comp) -> [$#|Comp]. +add_question_mark(Comp) when is_binary(Comp) -> + <<$?, Comp/binary>>; +add_question_mark(Comp) when is_list(Comp) -> + [$?|Comp]. + add_colon(Comp) when is_binary(Comp) -> <<$:, Comp/binary>>. -- cgit v1.2.3 From fd276f4a2a109d19d25cffee54a2c21ee4568085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 16 Oct 2017 16:12:18 +0200 Subject: stdlib: Improve support of mixed lists (transcode) - transcode/2 flattens input lists in order to be able to handle lists with percent-encoded parts that are split into muliple list and binary segments. - Add additional tests for transcoding mixed lists. 
--- lib/stdlib/src/uri_string.erl | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index f9e1e273bc..7d180f73b8 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -368,12 +368,14 @@ transcode(URIString, Options) when is_binary(URIString) -> transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), OutEnc = proplists:get_value(out_encoding, Options, utf8), - try transcode(URIString, [], InEnc, OutEnc) of + Flattened = flatten_list(URIString, InEnc), + try transcode(Flattened, [], InEnc, OutEnc) of Result -> Result catch throw:{error, List, RestData} -> {invalid_input, List, RestData} end. + %%------------------------------------------------------------------------- %% Working with query strings %% HTML 2.0 - application/x-www-form-urlencoded @@ -1672,11 +1674,6 @@ transcode([$%,_C0,_C1|_Rest] = L, Acc, InEnc, OutEnc) -> transcode([_C|_Rest] = L, Acc, InEnc, OutEnc) -> transcode(L, Acc, [], InEnc, OutEnc). 
%% -transcode([H|T], Acc, List, InEnc, OutEnc) when is_binary(H) -> - L = convert_list(H, InEnc), - transcode(L ++ T, Acc, List, InEnc, OutEnc); -transcode([H|T], Acc, List, InEnc, OutEnc) when is_list(H) -> - transcode(H ++ T, Acc, List, InEnc, OutEnc); transcode([$%,_C0,_C1|_Rest] = L, Acc, List, InEncoding, OutEncoding) -> transcode_pct(L, List ++ Acc, <<>>, InEncoding, OutEncoding); transcode([C|Rest], Acc, List, InEncoding, OutEncoding) -> @@ -1686,11 +1683,6 @@ transcode([], Acc, List, _InEncoding, _OutEncoding) -> %% Transcode percent-encoded segment -transcode_pct([H|T], Acc, B, InEnc, OutEnc) when is_binary(H) -> - L = convert_list(H, InEnc), - transcode_pct(L ++ T, Acc, B, InEnc, OutEnc); -transcode_pct([H|T], Acc, B, InEnc, OutEnc) when is_list(H) -> - transcode_pct(H ++ T, Acc, B, InEnc, OutEnc); transcode_pct([$%,C0,C1|Rest], Acc, B, InEncoding, OutEncoding) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> @@ -1710,7 +1702,7 @@ transcode_pct([], Acc, B, InEncoding, OutEncoding) -> lists:reverse(Acc) ++ Out. -% Convert binary +%% Convert to binary convert_binary(Binary, InEncoding, OutEncoding) -> case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of {error, List, RestData} -> @@ -1722,7 +1714,7 @@ convert_binary(Binary, InEncoding, OutEncoding) -> end. -% Convert binary +%% Convert to list convert_list(Binary, InEncoding) -> case unicode:characters_to_list(Binary, InEncoding) of {error, List, RestData} -> @@ -1734,5 +1726,22 @@ convert_list(Binary, InEncoding) -> end. +%% Flatten input list +flatten_list([], _) -> + []; +flatten_list(L, InEnc) -> + flatten_list(L, InEnc, []). +%% +flatten_list([H|T], InEnc, Acc) when is_binary(H) -> + L = convert_list(H, InEnc), + flatten_list(T, InEnc, lists:reverse(L) ++ Acc); +flatten_list([H|T], InEnc, Acc) when is_list(H) -> + flatten_list(H ++ T, InEnc, Acc); +flatten_list([H|T], InEnc, Acc) -> + flatten_list(T, InEnc, [H|Acc]); +flatten_list([], _InEnc, Acc) -> + lists:reverse(Acc). 
+ + percent_encode_segment(Segment) -> percent_encode_binary(Segment, <<>>). -- cgit v1.2.3 From 5fe4c673bb8ee10d0fccadb4da14d7a500c2b8ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 18 Oct 2017 15:48:04 +0200 Subject: stdlib: Implement compose_query and dissect_query --- lib/stdlib/src/uri_string.erl | 226 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 205 insertions(+), 21 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 7d180f73b8..1b8f8b828f 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -226,8 +226,9 @@ %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- --export([compose_query/1, create_uri_reference/2, dissect_query/1, normalize/1, - parse/1, recompose/1, resolve_uri_reference/2, transcode/2]). +-export([compose_query/1, compose_query/2, create_uri_reference/2, + dissect_query/1, normalize/1, parse/1, + recompose/1, resolve_uri_reference/2, transcode/2]). -export_type([uri_map/0, uri_string/0]). @@ -377,28 +378,66 @@ transcode(URIString, Options) when is_list(URIString) -> %%------------------------------------------------------------------------- -%% Working with query strings -%% HTML 2.0 - application/x-www-form-urlencoded -%% RFC 1866 [8.2.1] +%% Functions for working with the query part of a URI as a list +%% of key/value pairs. +%% HTML 2.0 (RFC 1866) defines a media type application/x-www-form-urlencoded +%% in section [8.2.1] "The form-urlencoded Media Type". %%------------------------------------------------------------------------- %%------------------------------------------------------------------------- %% Compose urlencoded query string from a list of unescaped key/value pairs. 
%%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when - QueryList :: [{unicode:chardata(), unicode:chardata()}], - QueryString :: uri_string(). -compose_query(_) -> - "". + QueryList :: [{uri_string(), uri_string()}], + QueryString :: string(). +compose_query(List) -> + compose_query(List, []). + + +-spec compose_query(QueryList, Options) -> QueryString when + QueryList :: [{uri_string(), uri_string()}], + Options :: [{separator, atom()}], + QueryString :: string(). +compose_query([],_Options) -> + []; +compose_query(List, Options) -> + try compose_query(List, Options, []) of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. +%% +compose_query([{Key,Value}|Rest], Options, Acc) -> + Separator = get_separator(Options, Acc), + K = form_urlencode(Key), + V = form_urlencode(Value), + compose_query(Rest, Options, Acc ++ Separator ++ K ++ "=" ++ V); +compose_query([], _Options, Acc) -> + Acc. + %%------------------------------------------------------------------------- %% Dissect a query string into a list of unescaped key/value pairs. %%------------------------------------------------------------------------- -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), - QueryList :: [{unicode:chardata(), unicode:chardata()}]. -dissect_query(_) -> - "". + QueryList :: [{string(), string()}]. +dissect_query([]) -> + []; +dissect_query(QueryString) when is_binary(QueryString) -> + L = convert_list(QueryString, utf8), + try dissect_query_key(L, [], [], []) of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end; +dissect_query(QueryString) -> + L = flatten_list(QueryString, utf8), + try dissect_query_key(L, [], [], []) of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. 
%%%======================================================================== @@ -1705,10 +1744,10 @@ transcode_pct([], Acc, B, InEncoding, OutEncoding) -> %% Convert to binary convert_binary(Binary, InEncoding, OutEncoding) -> case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of - {error, List, RestData} -> - throw({error, List, RestData}); - {incomplete, List, RestData} -> - throw({error, List, RestData}); + {error, _List, RestData} -> + throw({error, unicode, RestData}); + {incomplete, _List, RestData} -> + throw({error, unicode, RestData}); Result -> Result end. @@ -1717,10 +1756,10 @@ convert_binary(Binary, InEncoding, OutEncoding) -> %% Convert to list convert_list(Binary, InEncoding) -> case unicode:characters_to_list(Binary, InEncoding) of - {error, List, RestData} -> - throw({error, List, RestData}); - {incomplete, List, RestData} -> - throw({error, List, RestData}); + {error, _List, RestData} -> + throw({error, unicode, RestData}); + {incomplete, _List, RestData} -> + throw({error, unicode, RestData}); Result -> Result end. @@ -1740,8 +1779,153 @@ flatten_list([H|T], InEnc, Acc) when is_list(H) -> flatten_list([H|T], InEnc, Acc) -> flatten_list(T, InEnc, [H|Acc]); flatten_list([], _InEnc, Acc) -> - lists:reverse(Acc). + lists:reverse(Acc); +flatten_list(Arg, _, _) -> + throw({error, badarg, Arg}). + percent_encode_segment(Segment) -> percent_encode_binary(Segment, <<>>). + + +%%------------------------------------------------------------------------- +%% Helper functions for compose_query +%%------------------------------------------------------------------------- + +%% Returns separator to be used between key-value pairs +get_separator(_, Acc) when length(Acc) =:= 0 -> + []; +get_separator([], _Acc) -> + "&"; +get_separator([{separator, amp}], _Acc) -> + "&"; +get_separator([{separator, semicolon}], _Acc) -> + ";". 
%% Form-urlencode input based on RFC 1866 [8.2.1].
%%
%% Accepts a binary or a (possibly nested) list, turns it into a flat list
%% of characters and returns the encoded result as a flat list.
%% Errors raised by the conversion helpers propagate to the caller.
form_urlencode(Cs) when is_binary(Cs) ->
    L = convert_list(Cs, utf8),
    form_urlencode(L, []);
form_urlencode(Cs) ->
    L = flatten_list(Cs, utf8),
    form_urlencode(L, []).
%%
%% Acc holds the encoded output in reverse order.
form_urlencode([], Acc) ->
    lists:reverse(Acc);
form_urlencode([$\s|T], Acc) ->
    %% A space is written as '+'.
    form_urlencode(T, [$+|Acc]);
form_urlencode([H|T], Acc) when H =:= $+; H =:= $% ->
    %% '+' and '%' carry a special meaning in the encoded form ('+' stands
    %% for a space, '%' introduces an escape), so a literal occurrence must
    %% always be escaped; otherwise the encoded string would not decode
    %% back to the original input.
    form_urlencode(T, lists:reverse(urlencode_char(H)) ++ Acc);
form_urlencode([H|T], Acc) ->
    case is_url_char(H) of
        true ->
            form_urlencode(T, [H|Acc]);
        false ->
            E = urlencode_char(H),
            form_urlencode(T, lists:reverse(E) ++ Acc)
    end.


%% Percent-encode a single character and return the escape sequence as a list.
urlencode_char(C) ->
    B = percent_encode_binary(C),
    unicode:characters_to_list(B).


%% Return true if input char can appear in URL according to
%% RFC 1738 "Uniform Resource Locators".
%%
%% Values 0..31, 127 and 128..255 are always rejected. Any other value is
%% accepted unless it is listed by is_reserved/1 or is_unsafe/1.
is_url_char(C)
  when 0 =< C, C =< 31;
       128 =< C, C =< 255 -> false;
is_url_char(127) -> false;
is_url_char(C) ->
    not (is_reserved(C) orelse is_unsafe(C)).


%% Reserved characters (RFC 1738)
is_reserved($;) -> true;
is_reserved($/) -> true;
is_reserved($?) -> true;
is_reserved($:) -> true;
is_reserved($@) -> true;
is_reserved($=) -> true;
is_reserved($&) -> true;
is_reserved(_) -> false.


%% Unsafe characters (RFC 1738)
is_unsafe(${) -> true;
is_unsafe($}) -> true;
is_unsafe($|) -> true;
is_unsafe($\\) -> true;
is_unsafe($^) -> true;
is_unsafe($~) -> true;
is_unsafe($[) -> true;
is_unsafe($]) -> true;
is_unsafe($`) -> true;
is_unsafe(_) -> false.


%%-------------------------------------------------------------------------
%% Helper functions for dissect_query
%%-------------------------------------------------------------------------

%% Collect the characters of a key (in reverse order) until '=' is found,
%% then continue with the value. Input that ends before a '=' was seen is
%% rejected with {error, missing_value, Rest}.
dissect_query_key([$=|T], Acc, Key, Value) ->
    dissect_query_value(T, Acc, Key, Value);
dissect_query_key([H|T], Acc, Key, Value) ->
    dissect_query_key(T, Acc, [H|Key], Value);
dissect_query_key(L, _, _, _) ->
    throw({error, missing_value, L}).
%% Collect the characters of a value (in reverse order) until a separator
%% ('&' or ';') or the end of the input is reached. The finished pair is
%% decoded and pushed onto Acc; at the end of the input the accumulated
%% pairs are returned in their original order.
dissect_query_value([$&|_] = L, Acc, Key, Value) ->
    Pair = dissect_query_pair(Key, Value),
    dissect_query_separator_amp(L, [Pair|Acc], [], []);
dissect_query_value([$;|_] = L, Acc, Key, Value) ->
    Pair = dissect_query_pair(Key, Value),
    dissect_query_separator_semicolon(L, [Pair|Acc], [], []);
dissect_query_value([H|T], Acc, Key, Value) ->
    dissect_query_value(T, Acc, Key, [H|Value]);
dissect_query_value([], Acc, Key, Value) ->
    Pair = dissect_query_pair(Key, Value),
    lists:reverse([Pair|Acc]).


%% Build a decoded {Key, Value} pair from the two reversed accumulators.
%% The key is decoded before the value.
dissect_query_pair(RevKey, RevValue) ->
    K = form_urldecode(lists:reverse(RevKey)),
    V = form_urldecode(lists:reverse(RevValue)),
    {K, V}.


%% Consume a '&' separator and continue with the next key.
dissect_query_separator_amp("&" ++ T, Acc, Key, Value) ->
    dissect_query_key(T, Acc, Key, Value);
dissect_query_separator_amp(L, _, _, _) ->
    throw({error, invalid_separator, L}).


%% Consume a ';' separator and continue with the next key.
dissect_query_separator_semicolon([$;|T], Acc, Key, Value) ->
    dissect_query_key(T, Acc, Key, Value).


%% Form-urldecode input based on RFC 1866 [8.2.1].
%%
%% The input is converted to a UTF-8 binary, decoded byte by byte and
%% returned as a list of characters.
form_urldecode(Cs) ->
    B = convert_binary(Cs, utf8, utf8),
    form_urldecode(B, <<>>).
%%
%% Acc is a binary holding the bytes decoded so far.
form_urldecode(<<>>, Acc) ->
    convert_list(Acc, utf8);
form_urldecode(<<$+,T/binary>>, Acc) ->
    %% '+' stands for a space in form-urlencoded data.
    form_urldecode(T, <<Acc/binary,$\s>>);
form_urldecode(<<$%,C0,C1,T/binary>>, Acc) ->
    case is_hex_digit(C0) andalso is_hex_digit(C1) of
        true ->
            %% Combine the two hex digits into one byte.
            V = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
            form_urldecode(T, <<Acc/binary,V>>);
        false ->
            L = convert_list(<<$%,C0,C1,T/binary>>, utf8),
            throw({error, urldecode, L})
    end;
form_urldecode(<<H,T/binary>>, Acc) ->
    %% Any other byte is copied as is, provided is_url_char/1 accepts it.
    case is_url_char(H) of
        true ->
            form_urldecode(T, <<Acc/binary,H>>);
        false ->
            L = convert_list(<<H,T/binary>>, utf8),
            throw({error, urldecode, L})
    end.
-- cgit v1.2.3 From 75989c8024283155f6f8075ee9e81b50a65e9ecb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Thu, 19 Oct 2017 17:19:46 +0200 Subject: stdlib: Improve error handling --- lib/stdlib/src/uri_string.erl | 129 ++++++++++++++++++++++++++---------------- 1 file changed, 79 insertions(+), 50 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 1b8f8b828f..51f7564934 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -293,11 +293,22 @@ URIString :: uri_string(), URIMap :: uri_map(). parse(URIString) when is_binary(URIString) -> - parse_uri_reference(URIString, #{}); + try parse_uri_reference(URIString, #{}) of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end; parse(URIString) when is_list(URIString) -> - Binary = unicode:characters_to_binary(URIString), - Map = parse_uri_reference(Binary, #{}), - convert_mapfields_to_list(Map). + try + Binary = unicode:characters_to_binary(URIString), + Map = parse_uri_reference(Binary, #{}), + convert_mapfields_to_list(Map) + of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. 
+ %%------------------------------------------------------------------------- %% Recompose URIs @@ -308,17 +319,24 @@ parse(URIString) when is_list(URIString) -> recompose(Map) -> case is_valid_map(Map) of false -> - error({badarg, invalid_map}); + {error, invalid_map, Map}; true -> - T0 = update_scheme(Map, empty), - T1 = update_userinfo(Map, T0), - T2 = update_host(Map, T1), - T3 = update_port(Map, T2), - T4 = update_path(Map, T3), - T5 = update_query(Map, T4), - update_fragment(Map, T5) + try + T0 = update_scheme(Map, empty), + T1 = update_userinfo(Map, T0), + T2 = update_host(Map, T1), + T3 = update_port(Map, T2), + T4 = update_path(Map, T3), + T5 = update_query(Map, T4), + update_fragment(Map, T5) + of + Result -> Result + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end end. + %%------------------------------------------------------------------------- %% Resolve references %%------------------------------------------------------------------------- @@ -364,7 +382,7 @@ transcode(URIString, Options) when is_binary(URIString) -> of Result -> Result catch - throw:{error, L, RestData} -> {invalid_input, L, RestData} + throw:{error, _, RestData} -> {error, invalid_input, RestData} end; transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), @@ -373,7 +391,7 @@ transcode(URIString, Options) when is_list(URIString) -> try transcode(Flattened, [], InEnc, OutEnc) of Result -> Result catch - throw:{error, List, RestData} -> {invalid_input, List, RestData} + throw:{error, _, RestData} -> {error, invalid_input, RestData} end. @@ -467,7 +485,7 @@ parse_uri_reference(URIString, URI) -> try parse_scheme_start(URIString, URI) of Res -> Res catch - throw:uri_parse_error -> + throw:{_,_,_} -> parse_relative_part(URIString, URI) end. 
@@ -495,7 +513,7 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> URI2 = maybe_add_path(URI1), URI2#{userinfo => decode_userinfo(Userinfo)} catch - throw:uri_parse_error -> + throw:{_,_,_} -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_part_sl(Rest, T), URI2 = maybe_add_path(URI1), @@ -521,7 +539,7 @@ parse_relative_part(?STRING_REST(Char, Rest), URI) -> {T, URI1} = parse_segment_nz_nc(Rest, URI), % path-noscheme Path = calculate_parsed_part(Rest, T), URI1#{path => decode_path(?STRING_REST(Char, Path))}; - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end. @@ -571,7 +589,7 @@ parse_segment(?STRING_REST($#, Rest), URI) -> parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of true -> parse_segment(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_segment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -594,7 +612,7 @@ parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> parse_segment_nz_nc(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_segment_nz_nc(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -629,7 +647,7 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> Scheme = calculate_parsed_scheme(Rest, T), URI2 = maybe_add_path(URI1), URI2#{scheme => ?STRING_REST(Char, Scheme)}; - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end. %% Add path component if it missing after parsing the URI. @@ -653,10 +671,10 @@ parse_scheme(?STRING_REST($:, Rest), URI) -> parse_scheme(?STRING_REST(Char, Rest), URI) -> case is_scheme(Char) of true -> parse_scheme(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_scheme(?STRING_EMPTY, _URI) -> - throw(uri_parse_error). + throw({error,invalid_uri,<<>>}). 
%% Check if char is allowed in scheme @@ -681,7 +699,7 @@ parse_hier(?STRING_REST("//", Rest), URI) -> Userinfo = calculate_parsed_userinfo(Rest, T), {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch - throw:uri_parse_error -> + throw:{_,_,_} -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_part_sl(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}} @@ -704,7 +722,7 @@ parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless {T, URI1} = parse_segment(Rest, URI), Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => decode_path(?STRING_REST(Char, Path))}}; - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_hier(?STRING_EMPTY, URI) -> {<<>>, URI}. @@ -744,11 +762,11 @@ parse_userinfo(?STRING_REST($@, Rest), URI) -> parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_userinfo(?STRING_EMPTY, _URI) -> %% URI cannot end in userinfo state - throw(uri_parse_error). + throw({error,invalid_uri,<<>>}). %% Check if char is allowed in userinfo @@ -847,7 +865,7 @@ parse_reg_name(?STRING_REST($#, Rest), URI) -> parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of true -> parse_reg_name(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_reg_name(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -883,7 +901,7 @@ parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of true -> parse_ipv4_bin(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_ipv4_bin(?STRING_EMPTY, Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), @@ -899,7 +917,7 @@ is_ipv4(Char) -> is_digit(Char). 
validate_ipv4_address(Addr) -> case inet:parse_ipv4strict_address(Addr) of {ok, _} -> Addr; - {error, _} -> throw(uri_parse_error) + {error, _} -> throw({error,invalid_uri,Addr}) end. @@ -910,10 +928,10 @@ parse_ipv6_bin(?STRING_REST($], Rest), Acc, URI) -> parse_ipv6_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv6(Char) of true -> parse_ipv6_bin(Rest, [Char|Acc], URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_ipv6_bin(?STRING_EMPTY, _Acc, _URI) -> - throw(uri_parse_error). + throw({error,invalid_uri,<<>>}). %% Check if char is allowed in IPv6 addresses -spec is_ipv6(char()) -> boolean(). @@ -943,7 +961,7 @@ parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of true -> parse_ipv6_bin_end(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_ipv6_bin_end(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -952,7 +970,7 @@ parse_ipv6_bin_end(?STRING_EMPTY, URI) -> validate_ipv6_address(Addr) -> case inet:parse_ipv6strict_address(Addr) of {ok, _} -> Addr; - {error, _} -> throw(uri_parse_error) + {error, _} -> throw({error,invalid_uri,Addr}) end. @@ -981,7 +999,7 @@ parse_port(?STRING_REST($#, Rest), URI) -> parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of true -> parse_port(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_port(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -1007,7 +1025,7 @@ parse_query(?STRING_REST($#, Rest), URI) -> parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of true -> parse_query(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_query(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -1033,7 +1051,7 @@ is_query(Char) -> is_pchar(Char). 
parse_fragment(?STRING_REST(Char, Rest), URI) -> case is_fragment(Char) of true -> parse_fragment(Rest, URI); - false -> throw(uri_parse_error) + false -> throw({error,invalid_uri,[Char]}) end; parse_fragment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. @@ -1335,9 +1353,9 @@ decode_fragment(Cs) -> check_utf8(Cs) -> case unicode:characters_to_list(Cs) of {incomplete,_,_} -> - throw(uri_parse_error); + throw({error,non_utf8,Cs}); {error,_,_} -> - throw(uri_parse_error); + throw({error,non_utf8,Cs}); _ -> Cs end. @@ -1348,13 +1366,13 @@ check_utf8(Cs) -> %% Only validates as scheme cannot have percent-encoded characters -spec encode_scheme(list()|binary()) -> list() | binary(). encode_scheme([]) -> - throw(uri_parse_error); + throw({error,invalid_scheme,""}); encode_scheme(<<>>) -> - throw(uri_parse_error); + throw({error,invalid_scheme,<<>>}); encode_scheme(Scheme) -> case validate_scheme(Scheme) of true -> Scheme; - false -> throw(uri_parse_error) + false -> throw({error,invalid_scheme,Scheme}) end. -spec encode_userinfo(list()|binary()) -> list() | binary(). @@ -1390,12 +1408,12 @@ decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> true -> B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), decode(Cs, Fun, <>); - false -> throw(uri_parse_error) + false -> throw({error,percent_decode,<<$%,C0,C1>>}) end; decode(<>, Fun, Acc) -> case Fun(C) of true -> decode(Cs, Fun, <>); - false -> throw(uri_parse_error) + false -> throw({error,percent_decode,<>}) end; decode(<<>>, _Fun, Acc) -> Acc. @@ -1424,8 +1442,8 @@ encode(Component, Fun) when is_binary(Component) -> encode(<>, Fun, Acc) -> C = encode_codepoint_binary(Char, Fun), encode(Rest, Fun, <>); -encode(<<_Char, _Rest/binary>>, _Fun, _Acc) -> - throw(uri_parse_error); +encode(<>, _Fun, _Acc) -> + throw({error,percent_encode,<>}); encode(<<>>, _Fun, Acc) -> Acc. 
@@ -1554,7 +1572,8 @@ is_valid_map(Map) -> not maps:is_key(host, Map) andalso maps:is_key(port, Map))) orelse not maps:is_key(path, Map) orelse - not is_host_and_path_valid(Map) + not is_host_and_path_valid(Map) orelse + invalid_field_present(Map) of true -> false; @@ -1563,6 +1582,16 @@ is_valid_map(Map) -> end. +invalid_field_present(Map) -> + Fun = fun(K, _, AccIn) -> AccIn orelse + ((K =/= scheme) andalso (K =/= userinfo) + andalso (K =/= host) andalso (K =/= port) + andalso (K =/= path) andalso (K =/= query) + andalso (K =/= fragment)) + end, + maps:fold(Fun, false, Map). + + is_host_and_path_valid(Map) -> Host = maps:get(host, Map, undefined), Path = maps:get(path, Map, undefined), @@ -1745,9 +1774,9 @@ transcode_pct([], Acc, B, InEncoding, OutEncoding) -> convert_binary(Binary, InEncoding, OutEncoding) -> case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of {error, _List, RestData} -> - throw({error, unicode, RestData}); + throw({error, invalid_input, RestData}); {incomplete, _List, RestData} -> - throw({error, unicode, RestData}); + throw({error, invalid_input, RestData}); Result -> Result end. @@ -1757,9 +1786,9 @@ convert_binary(Binary, InEncoding, OutEncoding) -> convert_list(Binary, InEncoding) -> case unicode:characters_to_list(Binary, InEncoding) of {error, _List, RestData} -> - throw({error, unicode, RestData}); + throw({error, invalid_input, RestData}); {incomplete, _List, RestData} -> - throw({error, unicode, RestData}); + throw({error, invalid_input, RestData}); Result -> Result end. 
-- cgit v1.2.3 From b439d19d38479d6264d906dd926a168c9c514da3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 20 Oct 2017 16:32:42 +0200 Subject: stdlib: Update documentation (uri_string) --- lib/stdlib/src/uri_string.erl | 58 ++++++++++++++----------------------------- 1 file changed, 18 insertions(+), 40 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 51f7564934..8723d3f183 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -226,9 +226,9 @@ %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- --export([compose_query/1, compose_query/2, create_uri_reference/2, - dissect_query/1, normalize/1, parse/1, - recompose/1, resolve_uri_reference/2, transcode/2]). +-export([compose_query/1, compose_query/2, + dissect_query/1, parse/1, + recompose/1, transcode/2]). -export_type([uri_map/0, uri_string/0]). @@ -291,7 +291,8 @@ %%------------------------------------------------------------------------- -spec parse(URIString) -> URIMap when URIString :: uri_string(), - URIMap :: uri_map(). + URIMap :: uri_map() + | {error, atom(), list() | binary()}. parse(URIString) when is_binary(URIString) -> try parse_uri_reference(URIString, #{}) of Result -> Result @@ -315,7 +316,8 @@ parse(URIString) when is_list(URIString) -> %%------------------------------------------------------------------------- -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), - URIString :: uri_string(). + URIString :: uri_string() + | {error, atom(), list() | binary()}. recompose(Map) -> case is_valid_map(Map) of false -> @@ -337,41 +339,14 @@ recompose(Map) -> end. 
-%%------------------------------------------------------------------------- -%% Resolve references -%%------------------------------------------------------------------------- --spec resolve_uri_reference(RelativeURI, AbsoluteBaseURI) -> AbsoluteDestURI when - RelativeURI :: uri_string(), - AbsoluteBaseURI :: uri_string(), - AbsoluteDestURI :: uri_string(). -resolve_uri_reference(_,_) -> - "". - -%%------------------------------------------------------------------------- -%% Create references -%%------------------------------------------------------------------------- --spec create_uri_reference(AbsoluteSourceURI, AbsoluteBaseURI) -> RelativeDestURI when - AbsoluteSourceURI :: uri_string(), - AbsoluteBaseURI :: uri_string(), - RelativeDestURI :: uri_string(). -create_uri_reference(_,_) -> - "". - -%%------------------------------------------------------------------------- -%% Normalize URIs -%%------------------------------------------------------------------------- --spec normalize(URIString) -> NormalizedURI when - URIString :: uri_string(), - NormalizedURI :: uri_string(). -normalize(_) -> - "". - %%------------------------------------------------------------------------- %% Transcode URIs %%------------------------------------------------------------------------- --spec transcode(URIString, Options) -> URIString when +-spec transcode(URIString, Options) -> Result when URIString :: uri_string(), - Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}]. + Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}], + Result :: uri_string() + | {error, atom(), list() | binary()}. 
transcode(URIString, Options) when is_binary(URIString) -> try InEnc = proplists:get_value(in_encoding, Options, utf8), @@ -407,7 +382,8 @@ transcode(URIString, Options) when is_list(URIString) -> %%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when QueryList :: [{uri_string(), uri_string()}], - QueryString :: string(). + QueryString :: string() + | {error, atom(), list() | binary()}. compose_query(List) -> compose_query(List, []). @@ -415,7 +391,8 @@ compose_query(List) -> -spec compose_query(QueryList, Options) -> QueryString when QueryList :: [{uri_string(), uri_string()}], Options :: [{separator, atom()}], - QueryString :: string(). + QueryString :: string() + | {error, atom(), list() | binary()}. compose_query([],_Options) -> []; compose_query(List, Options) -> @@ -439,7 +416,8 @@ compose_query([], _Options, Acc) -> %%------------------------------------------------------------------------- -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), - QueryList :: [{string(), string()}]. + QueryList :: [{string(), string()}] + | {error, atom(), list() | binary()}. 
dissect_query([]) -> []; dissect_query(QueryString) when is_binary(QueryString) -> @@ -1940,7 +1918,7 @@ form_urldecode(Cs) -> form_urldecode(<<>>, Acc) -> convert_list(Acc, utf8); form_urldecode(<<$+,T/binary>>, Acc) -> - form_urlencode(T, [$ |Acc]); + form_urldecode(T, <>); form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> -- cgit v1.2.3 From da11b15aef87f392a807b4756bf285160e15a194 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 23 Oct 2017 12:02:16 +0200 Subject: stdlib: Update supported separators (query string) Update list of supported separators: - escaped_amp (default): "&" - amp: "&" - semicolon: ";" --- lib/stdlib/src/uri_string.erl | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 8723d3f183..a4fd9c66f4 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -1806,6 +1806,8 @@ get_separator(_, Acc) when length(Acc) =:= 0 -> get_separator([], _Acc) -> "&"; get_separator([{separator, amp}], _Acc) -> + "&"; +get_separator([{separator, escaped_amp}], _Acc) -> "&"; get_separator([{separator, semicolon}], _Acc) -> ";". @@ -1901,6 +1903,8 @@ dissect_query_value([], Acc, Key, Value) -> dissect_query_separator_amp("&" ++ T, Acc, Key, Value) -> dissect_query_key(T, Acc, Key, Value); +dissect_query_separator_amp("&" ++ T, Acc, Key, Value) -> + dissect_query_key(T, Acc, Key, Value); dissect_query_separator_amp(L, _, _, _) -> throw({error, invalid_separator, L}). 
-- cgit v1.2.3 From 3c80849dc9167018a66542b76b441e675d404a78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Tue, 24 Oct 2017 13:19:37 +0200 Subject: stdlib: Refactor parsed binary calculation --- lib/stdlib/src/uri_string.erl | 220 +++++++++++++----------------------------- 1 file changed, 65 insertions(+), 155 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index a4fd9c66f4..684087b870 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -493,7 +493,7 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> catch throw:{_,_,_} -> {T, URI1} = parse_host(Rest, URI), - Host = calculate_parsed_part_sl(Rest, T), + Host = calculate_parsed_host_port(Rest, T), URI2 = maybe_add_path(URI1), URI2#{host => decode_host(remove_brackets(Host))} end; @@ -503,12 +503,12 @@ parse_relative_part(?STRING_REST($/, Rest), URI) -> URI1#{path => decode_path(?STRING_REST($/, Path))}; parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), URI2 = maybe_add_path(URI1), URI2#{query => decode_query(Query)}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), URI2 = maybe_add_path(URI1), URI2#{fragment => decode_fragment(Fragment)}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> @@ -558,11 +558,11 @@ parse_segment(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => decode_query(Query)}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = 
parse_fragment(Rest, URI), - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of @@ -581,11 +581,11 @@ parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => decode_query(Query)}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of @@ -679,7 +679,7 @@ parse_hier(?STRING_REST("//", Rest), URI) -> catch throw:{_,_,_} -> {T, URI1} = parse_host(Rest, URI), - Host = calculate_parsed_part_sl(Rest, T), + Host = calculate_parsed_host_port(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}} end; parse_hier(?STRING_REST($/, Rest), URI) -> @@ -688,11 +688,11 @@ parse_hier(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_hier(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => decode_query(Query)}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of @@ -735,7 +735,7 @@ 
parse_userinfo(?CHAR($@), URI) -> {?STRING_EMPTY, URI#{host => <<>>}}; parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), - Host = calculate_parsed_host(Rest, T), + Host = calculate_parsed_host_port(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of @@ -796,7 +796,7 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_port(Rest, T), + H = calculate_parsed_host_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> @@ -805,13 +805,13 @@ parse_host(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => decode_query(Query)}}; parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -825,7 +825,7 @@ parse_host(?STRING_EMPTY, URI) -> -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_port(Rest, T), + H = calculate_parsed_host_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> @@ -834,11 +834,11 @@ parse_reg_name(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_reg_name(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => decode_query(Query)}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of @@ -858,7 +858,7 @@ is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). 
parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_port(Rest, T), + H = calculate_parsed_host_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> @@ -869,12 +869,12 @@ parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => decode_query(Query)}}; parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of @@ -921,7 +921,7 @@ is_ipv6(Char) -> is_hex_digit(Char). -spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_port(Rest, T), + H = calculate_parsed_host_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> @@ -930,11 +930,11 @@ parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => decode_query(Query)}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of @@ -968,11 +968,11 @@ parse_port(?STRING_REST($/, Rest), URI) -> {Rest, URI1#{path => decode_path(?STRING_REST($/, Path))}}; parse_port(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query - Query = calculate_parsed_query(Rest, T), + Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => decode_query(Query)}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -998,7 +998,7 @@ parse_port(?STRING_EMPTY, URI) -> -spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}. 
parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_fragment(Rest, T), + Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of @@ -1115,144 +1115,77 @@ remove_brackets(Addr) -> Addr. %%------------------------------------------------------------------------- %% Helper functions for calculating the parsed binary. %%------------------------------------------------------------------------- +-spec calculate_parsed_scheme(binary(), binary()) -> binary(). +calculate_parsed_scheme(Input, <<>>) -> + strip_last_char(Input, [$:]); +calculate_parsed_scheme(Input, Unparsed) -> + get_parsed_binary(Input, Unparsed). + -%% Returns the parsed binary based on Input and the Unparsed part. -%% Handles the following special cases: -%% -%% #{host => [],path => "/",query => []} = uri_string:parse("///?") -%% #{fragment => [],host => [],path => "/"} = uri_string:parse("///#") -%% -spec calculate_parsed_part(binary(), binary()) -> binary(). -calculate_parsed_part(<<$?>>, _) -> <<>>; -calculate_parsed_part(<<$#>>, _) -> <<>>; -calculate_parsed_part(<<>>, _) -> <<>>; calculate_parsed_part(Input, <<>>) -> - case binary:last(Input) of - $? -> - init_binary(Input); - $# -> - init_binary(Input); - _Else -> - Input - end; + strip_last_char(Input, [$?,$#]); calculate_parsed_part(Input, Unparsed) -> - {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), - First. + get_parsed_binary(Input, Unparsed). -spec calculate_parsed_userinfo(binary(), binary()) -> binary(). -calculate_parsed_userinfo(<<$?>>, _) -> <<>>; -calculate_parsed_userinfo(<<$#>>, _) -> <<>>; -calculate_parsed_userinfo(<<>>, _) -> <<>>; calculate_parsed_userinfo(Input, <<>>) -> - case binary:last(Input) of - $? 
-> - init_binary(Input); - $# -> - init_binary(Input); - $@ -> - init_binary(Input); - _Else -> - Input - end; + strip_last_char(Input, [$?,$#,$@]); calculate_parsed_userinfo(Input, Unparsed) -> - {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), - First. + get_parsed_binary(Input, Unparsed). + + +-spec calculate_parsed_host_port(binary(), binary()) -> binary(). +calculate_parsed_host_port(Input, <<>>) -> + strip_last_char(Input, [$?,$#,$/]); +calculate_parsed_host_port(Input, Unparsed) -> + get_parsed_binary(Input, Unparsed). + +calculate_parsed_query_fragment(Input, <<>>) -> + strip_last_char(Input, [$#]); +calculate_parsed_query_fragment(Input, Unparsed) -> + get_parsed_binary(Input, Unparsed). --spec calculate_parsed_host(binary(), binary()) -> binary(). -calculate_parsed_host(<<$?>>, _) -> <<>>; -calculate_parsed_host(<<$#>>, _) -> <<>>; -calculate_parsed_host(<<>>, _) -> <<>>; -calculate_parsed_host(Input, <<>>) -> + +%% Strip last char if it is in list +strip_last_char(<<>>, _) -> <<>>; +strip_last_char(Input, [C0]) -> case binary:last(Input) of - $? -> - init_binary(Input); - $# -> - init_binary(Input); - $/ -> + C0 -> init_binary(Input); _Else -> Input end; -calculate_parsed_host(Input, Unparsed) -> - {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), - First. - - --spec calculate_parsed_port(binary(), binary()) -> binary(). -calculate_parsed_port(<<$?>>, _) -> <<>>; -calculate_parsed_port(<<$#>>, _) -> <<>>; -calculate_parsed_port(<<>>, _) -> <<>>; -calculate_parsed_port(Input, <<>>) -> +strip_last_char(Input, [C0,C1]) -> case binary:last(Input) of - $? -> - init_binary(Input); - $# -> + C0 -> init_binary(Input); - $/ -> + C1 -> init_binary(Input); _Else -> Input end; -calculate_parsed_port(Input, Unparsed) -> - {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), - First. 
- - -calculate_parsed_query(<<$#>>, _) -> <<>>; -calculate_parsed_query(<<>>, _) -> <<>>; -calculate_parsed_query(Input, <<>>) -> +strip_last_char(Input, [C0,C1,C2]) -> case binary:last(Input) of - $# -> + C0 -> + init_binary(Input); + C1 -> + init_binary(Input); + C2 -> init_binary(Input); _Else -> Input - end; -calculate_parsed_query(Input, Unparsed) -> - {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), - First. + end. --spec calculate_parsed_fragment(binary(), binary()) -> binary(). -calculate_parsed_fragment(<<$#>>, _) -> <<>>; -calculate_parsed_fragment(Input, Unparsed) -> +%% Get parsed binary +get_parsed_binary(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), First. -%% Returns the parsed binary based on Input and the Unparsed part. -%% Used when parsing authority. -%% -%% Handles the following special cases: -%% -%% #{host => "foo",query => []} = uri_string:parse("//foo?") -%% #{fragment => [],host => "foo"} = uri_string:parse("//foo#") -%% #{host => "foo",path => "/"} = uri_string:parse("//foo/") -%% #{host => "foo",query => [],scheme => "http"} = uri_string:parse("http://foo?") -%% #{fragment => [],host => "foo",scheme => "http"} = uri_string:parse("http://foo#") -%% #{host => "foo",path => "/",scheme => "http"} = uri_string:parse("http://foo/") -%% --spec calculate_parsed_part_sl(binary(), binary()) -> binary(). -calculate_parsed_part_sl(<<$?>>, _) -> <<>>; -calculate_parsed_part_sl(<<$#>>, _) -> <<>>; -calculate_parsed_part_sl(<<>>, _) -> <<>>; -calculate_parsed_part_sl(Input, <<>>) -> - case binary:last(Input) of - $? -> - init_binary(Input); - $# -> - init_binary(Input); - $/ -> - init_binary(Input); - _Else -> - Input - end; -calculate_parsed_part_sl(Input, Unparsed) -> - {First, _} = - split_binary(Input, byte_size_exl_single_slash(Input) - byte_size_exl_head(Unparsed)), - First. - %% Return all bytes of the binary except the last one. 
The binary must be non-empty. init_binary(B) -> {Init, _} = @@ -1260,14 +1193,6 @@ init_binary(B) -> Init. -%% Returns the parsed binary based on Input and the Unparsed part. -%% Used when parsing scheme. --spec calculate_parsed_scheme(binary(), binary()) -> binary(). -calculate_parsed_scheme(Input, Unparsed) -> - {First, _} = split_binary(Input, byte_size(Input) - byte_size(Unparsed) - 1), - First. - - %% Returns the size of a binary exluding the first element. %% Used in calls to split_binary(). -spec byte_size_exl_head(binary()) -> number(). @@ -1275,21 +1200,6 @@ byte_size_exl_head(<<>>) -> 0; byte_size_exl_head(Binary) -> byte_size(Binary) + 1. -%% Returns size of 'Rest' for proper calculation of splitting position. -%% Solves the following special case: -%% -%% #{host := <<>>, path := <<"/">>} = uri_string:parse(<<"///">>). -%% -%% While keeping the following true: -%% -%% #{host := <<"hostname">>} = uri_string:parse(<<"//hostname">>). -%% #{host := <<>>, path := <<"/hostname">>} = uri_string:parse(<<"///hostname">>). -%% --spec byte_size_exl_single_slash(uri_string()) -> number(). -byte_size_exl_single_slash(<<$/>>) -> 0; -byte_size_exl_single_slash(Rest) -> byte_size(Rest). - - %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 2.1. 
Percent-Encoding] %% -- cgit v1.2.3 From 992cda82f16ee23b0114563858d5a082711f659b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 25 Oct 2017 10:11:14 +0200 Subject: stdlib: Refactor compose_query --- lib/stdlib/src/uri_string.erl | 73 ++++++++++++++++++++++--------------------- 1 file changed, 38 insertions(+), 35 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 684087b870..2bf7ceaff1 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -396,19 +396,24 @@ compose_query(List) -> compose_query([],_Options) -> []; compose_query(List, Options) -> - try compose_query(List, Options, []) of + try compose_query(List, Options, false, <<>>) of Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. %% -compose_query([{Key,Value}|Rest], Options, Acc) -> - Separator = get_separator(Options, Acc), +compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> + Separator = get_separator(Options, Rest), K = form_urlencode(Key), V = form_urlencode(Value), - compose_query(Rest, Options, Acc ++ Separator ++ K ++ "=" ++ V); -compose_query([], _Options, Acc) -> - Acc. + Flag = is_list(Key) orelse is_list(Value), + IsListNew = IsList orelse Flag, + compose_query(Rest, Options, IsListNew, < - "&"; -get_separator([{separator, escaped_amp}], _Acc) -> - "&"; -get_separator([{separator, semicolon}], _Acc) -> - ";". +get_separator(_, L) when length(L) =:= 0 -> + <<>>; +get_separator([], _L) -> + <<"&">>; +get_separator([{separator, amp}], _L) -> + <<"&">>; +get_separator([{separator, escaped_amp}], _L) -> + <<"&">>; +get_separator([{separator, semicolon}], _L) -> + <<";">>. 
%% Form-urlencode input based on RFC 1866 [8.2.1] -form_urlencode(Cs) when is_binary(Cs) -> - L = convert_list(Cs, utf8), - form_urlencode(L, []); +form_urlencode(Cs) when is_list(Cs) -> + B = convert_binary(Cs, utf8, utf8), + form_urlencode(B, <<>>); form_urlencode(Cs) -> - L = flatten_list(Cs, utf8), - form_urlencode(L, []). + form_urlencode(Cs, <<>>). %% -form_urlencode([], Acc) -> - lists:reverse(Acc); -form_urlencode([$ |T], Acc) -> - form_urlencode(T, [$+|Acc]); -form_urlencode([H|T], Acc) -> +form_urlencode(<<>>, Acc) -> + Acc; +form_urlencode(<<$ ,T/binary>>, Acc) -> + form_urlencode(T, <>); +form_urlencode(<>, Acc) -> case is_url_char(H) of true -> - form_urlencode(T, [H|Acc]); + form_urlencode(T, <>); false -> - E = urlencode_char(H), - form_urlencode(T, lists:reverse(E) ++ Acc) - end. - - -urlencode_char(C) -> - B = percent_encode_binary(C), - unicode:characters_to_list(B). + E = percent_encode_binary(H), + form_urlencode(T, <>) + end; +form_urlencode(<>, _Acc) -> + throw({error,invalid_utf8,<>}); +form_urlencode(H, _Acc) -> + throw({error,badarg, H}). %% Return true if input char can appear in URL according to -- cgit v1.2.3 From eba3d3e5e9b08839dafcb2e8adc6620d9211d96c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 25 Oct 2017 14:43:45 +0200 Subject: stdlib: Refactor dissect_query --- lib/stdlib/src/uri_string.erl | 91 ++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 45 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 2bf7ceaff1..09bf4aef1d 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -423,18 +423,21 @@ compose_query([], _Options, IsList, Acc) -> QueryString :: uri_string(), QueryList :: [{string(), string()}] | {error, atom(), list() | binary()}. 
+dissect_query(<<>>) -> + []; dissect_query([]) -> []; -dissect_query(QueryString) when is_binary(QueryString) -> - L = convert_list(QueryString, utf8), - try dissect_query_key(L, [], [], []) of +dissect_query(QueryString) when is_list(QueryString) -> + try + B = convert_binary(QueryString, utf8, utf8), + dissect_query_key(B, true, [], <<>>, <<>>) + of Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; dissect_query(QueryString) -> - L = flatten_list(QueryString, utf8), - try dissect_query_key(L, [], [], []) of + try dissect_query_key(QueryString, false, [], <<>>, <<>>) of Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} @@ -1706,7 +1709,6 @@ flatten_list(Arg, _, _) -> throw({error, badarg, Arg}). - percent_encode_segment(Segment) -> percent_encode_binary(Segment, <<>>). @@ -1790,50 +1792,48 @@ is_unsafe(_) -> false. %%------------------------------------------------------------------------- %% Helper functions for dissect_query %%------------------------------------------------------------------------- -dissect_query_key([$=|T], Acc, Key, Value) -> - dissect_query_value(T, Acc, Key, Value); -dissect_query_key([H|T], Acc, Key, Value) -> - dissect_query_key(T, Acc, [H|Key], Value); -dissect_query_key(L, _, _, _) -> - throw({error, missing_value, L}). 
- - -dissect_query_value([$&|_] = L, Acc, Key, Value) -> - K = form_urldecode(lists:reverse(Key)), - V = form_urldecode(lists:reverse(Value)), - dissect_query_separator_amp(L, [{K,V}|Acc], [], []); -dissect_query_value([$;|_] = L, Acc, Key, Value) -> - K = form_urldecode(lists:reverse(Key)), - V = form_urldecode(lists:reverse(Value)), - dissect_query_separator_semicolon(L, [{K,V}|Acc], [], []); -dissect_query_value([H|T], Acc, Key, Value) -> - dissect_query_value(T, Acc, Key, [H|Value]); -dissect_query_value([], Acc, Key, Value) -> - K = form_urldecode(lists:reverse(Key)), - V = form_urldecode(lists:reverse(Value)), +dissect_query_key(<<$=,T/binary>>, IsList, Acc, Key, Value) -> + dissect_query_value(T, IsList, Acc, Key, Value); +dissect_query_key(<>, IsList, Acc, Key, Value) -> + dissect_query_key(T, IsList, Acc, <>, Value); +dissect_query_key(B, _, _, _, _) -> + throw({error, missing_value, B}). + + +dissect_query_value(<<$&,_/binary>> = B, IsList, Acc, Key, Value) -> + K = form_urldecode(IsList, Key), + V = form_urldecode(IsList, Value), + dissect_query_separator_amp(B, IsList, [{K,V}|Acc], <<>>, <<>>); +dissect_query_value(<<$;,_/binary>> = B, IsList, Acc, Key, Value) -> + K = form_urldecode(IsList, Key), + V = form_urldecode(IsList, Value), + dissect_query_separator_semicolon(B, IsList, [{K,V}|Acc], <<>>, <<>>); +dissect_query_value(<>, IsList, Acc, Key, Value) -> + dissect_query_value(T, IsList, Acc, Key, <>); +dissect_query_value(<<>>, IsList, Acc, Key, Value) -> + K = form_urldecode(IsList, Key), + V = form_urldecode(IsList, Value), lists:reverse([{K,V}|Acc]). -dissect_query_separator_amp("&" ++ T, Acc, Key, Value) -> - dissect_query_key(T, Acc, Key, Value); -dissect_query_separator_amp("&" ++ T, Acc, Key, Value) -> - dissect_query_key(T, Acc, Key, Value); -dissect_query_separator_amp(L, _, _, _) -> - throw({error, invalid_separator, L}). 
+dissect_query_separator_amp(<<"&",T/binary>>, IsList, Acc, Key, Value) -> + dissect_query_key(T, IsList, Acc, Key, Value); +dissect_query_separator_amp(<<$&,T/binary>>, IsList, Acc, Key, Value) -> + dissect_query_key(T, IsList, Acc, Key, Value). -dissect_query_separator_semicolon([$;|T], Acc, Key, Value) -> - dissect_query_key(T, Acc, Key, Value). +dissect_query_separator_semicolon(<<$;,T/binary>>, IsList, Acc, Key, Value) -> + dissect_query_key(T, IsList, Acc, Key, Value). %% Form-urldecode input based on RFC 1866 [8.2.1] -form_urldecode(Cs) -> - B = convert_binary(Cs, utf8, utf8), +form_urldecode(true, B) -> Result = form_urldecode(B, <<>>), - convert_list(Result, utf8). -%% + convert_list(Result, utf8); +form_urldecode(false, B) -> + form_urldecode(B, <<>>); form_urldecode(<<>>, Acc) -> - convert_list(Acc, utf8); + Acc; form_urldecode(<<$+,T/binary>>, Acc) -> form_urldecode(T, <>); form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> @@ -1843,13 +1843,14 @@ form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> form_urldecode(T, <>); false -> L = convert_list(<<$%,C0,C1,T/binary>>, utf8), - throw({error, urldecode, L}) + throw({error, invalid_percent_encoding, L}) end; -form_urldecode(<>, Acc) -> +form_urldecode(<>, Acc) -> case is_url_char(H) of true -> form_urldecode(T, <>); false -> - L = convert_list(<>, utf8), - throw({error, urldecode, L}) - end. + throw({error, invalid_character, [H]}) + end; +form_urldecode(<>, _Acc) -> + throw({error, invalid_character, [H]}). 
-- cgit v1.2.3 From b0c682a8118c5775da784e9a0f569ee995319f80 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Thu, 26 Oct 2017 11:29:48 +0200 Subject: stdlib: Update documentation, error tuples --- lib/stdlib/src/uri_string.erl | 44 ++++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 21 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 09bf4aef1d..ca212284d2 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -229,7 +229,7 @@ -export([compose_query/1, compose_query/2, dissect_query/1, parse/1, recompose/1, transcode/2]). --export_type([uri_map/0, uri_string/0]). +-export_type([error/0, uri_map/0, uri_string/0]). %%------------------------------------------------------------------------- @@ -273,6 +273,8 @@ %% %x96 ` grave / accent %%------------------------------------------------------------------------- -type uri_string() :: iodata(). +-type error() :: {error, atom(), list() | binary()}. + %%------------------------------------------------------------------------- %% RFC 3986, Chapter 3. Syntax Components @@ -292,7 +294,7 @@ -spec parse(URIString) -> URIMap when URIString :: uri_string(), URIMap :: uri_map() - | {error, atom(), list() | binary()}. + | error(). parse(URIString) when is_binary(URIString) -> try parse_uri_reference(URIString, #{}) of Result -> Result @@ -317,7 +319,7 @@ parse(URIString) when is_list(URIString) -> -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string() - | {error, atom(), list() | binary()}. + | error(). recompose(Map) -> case is_valid_map(Map) of false -> @@ -346,7 +348,7 @@ recompose(Map) -> URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}], Result :: uri_string() - | {error, atom(), list() | binary()}. + | error(). 
transcode(URIString, Options) when is_binary(URIString) -> try InEnc = proplists:get_value(in_encoding, Options, utf8), @@ -357,7 +359,7 @@ transcode(URIString, Options) when is_binary(URIString) -> of Result -> Result catch - throw:{error, _, RestData} -> {error, invalid_input, RestData} + throw:{error, Atom, RestData} -> {error, Atom, RestData} end; transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), @@ -366,7 +368,7 @@ transcode(URIString, Options) when is_list(URIString) -> try transcode(Flattened, [], InEnc, OutEnc) of Result -> Result catch - throw:{error, _, RestData} -> {error, invalid_input, RestData} + throw:{error, Atom, RestData} -> {error, Atom, RestData} end. @@ -382,8 +384,8 @@ transcode(URIString, Options) when is_list(URIString) -> %%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when QueryList :: [{uri_string(), uri_string()}], - QueryString :: string() - | {error, atom(), list() | binary()}. + QueryString :: uri_string() + | error(). compose_query(List) -> compose_query(List, []). @@ -391,8 +393,8 @@ compose_query(List) -> -spec compose_query(QueryList, Options) -> QueryString when QueryList :: [{uri_string(), uri_string()}], Options :: [{separator, atom()}], - QueryString :: string() - | {error, atom(), list() | binary()}. + QueryString :: uri_string() + | error(). compose_query([],_Options) -> []; compose_query(List, Options) -> @@ -421,8 +423,8 @@ compose_query([], _Options, IsList, Acc) -> %%------------------------------------------------------------------------- -spec dissect_query(QueryString) -> QueryList when QueryString :: uri_string(), - QueryList :: [{string(), string()}] - | {error, atom(), list() | binary()}. + QueryList :: [{uri_string(), uri_string()}] + | error(). 
dissect_query(<<>>) -> []; dissect_query([]) -> @@ -1249,9 +1251,9 @@ decode_fragment(Cs) -> check_utf8(Cs) -> case unicode:characters_to_list(Cs) of {incomplete,_,_} -> - throw({error,non_utf8,Cs}); + throw({error,invalid_utf8,Cs}); {error,_,_} -> - throw({error,non_utf8,Cs}); + throw({error,invalid_utf8,Cs}); _ -> Cs end. @@ -1304,12 +1306,12 @@ decode(<<$%,C0,C1,Cs/binary>>, Fun, Acc) -> true -> B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), decode(Cs, Fun, <>); - false -> throw({error,percent_decode,<<$%,C0,C1>>}) + false -> throw({error,invalid_percent_encoding,<<$%,C0,C1>>}) end; decode(<>, Fun, Acc) -> case Fun(C) of true -> decode(Cs, Fun, <>); - false -> throw({error,percent_decode,<>}) + false -> throw({error,invalid_percent_encoding,<>}) end; decode(<<>>, _Fun, Acc) -> Acc. @@ -1339,7 +1341,7 @@ encode(<>, Fun, Acc) -> C = encode_codepoint_binary(Char, Fun), encode(Rest, Fun, <>); encode(<>, _Fun, _Acc) -> - throw({error,percent_encode,<>}); + throw({error,invalid_input,<>}); encode(<<>>, _Fun, Acc) -> Acc. @@ -1647,12 +1649,12 @@ transcode([], Acc, List, _InEncoding, _OutEncoding) -> %% Transcode percent-encoded segment -transcode_pct([$%,C0,C1|Rest], Acc, B, InEncoding, OutEncoding) -> +transcode_pct([$%,C0,C1|Rest] = L, Acc, B, InEncoding, OutEncoding) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> Int = ?HEX2DEC(C0)*16+?HEX2DEC(C1), transcode_pct(Rest, Acc, <>, InEncoding, OutEncoding); - false -> throw({error, lists:reverse(Acc),[C0,C1]}) + false -> throw({error, invalid_percent_encoding,L}) end; transcode_pct([_C|_Rest] = L, Acc, B, InEncoding, OutEncoding) -> OutBinary = convert_binary(B, InEncoding, OutEncoding), @@ -1706,7 +1708,7 @@ flatten_list([H|T], InEnc, Acc) -> flatten_list([], _InEnc, Acc) -> lists:reverse(Acc); flatten_list(Arg, _, _) -> - throw({error, badarg, Arg}). + throw({error, invalid_input, Arg}). 
percent_encode_segment(Segment) -> @@ -1752,7 +1754,7 @@ form_urlencode(<>, Acc) -> form_urlencode(<>, _Acc) -> throw({error,invalid_utf8,<>}); form_urlencode(H, _Acc) -> - throw({error,badarg, H}). + throw({error,invalid_input, H}). %% Return true if input char can appear in URL according to -- cgit v1.2.3 From 3d12c8f164f79dd67967ba5c7df7d3c555dc0f29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 27 Oct 2017 14:14:22 +0200 Subject: stdlib: Allow undefined port in uri_map() uri_map() updated to allow 'undefined' ports in order to align the implementation with RFC 3986: port = *DIGIT An 'undefined' port is mapped to a ":" during recompose operation. --- lib/stdlib/src/uri_string.erl | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index ca212284d2..16650d5005 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -283,7 +283,7 @@ #{fragment => unicode:chardata(), host => unicode:chardata(), path => unicode:chardata(), - port => non_neg_integer(), + port => non_neg_integer() | undefined, query => unicode:chardata(), scheme => unicode:chardata(), userinfo => unicode:chardata()} | #{}. @@ -807,7 +807,7 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). 
parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), - Port = binary_to_integer(H), + Port = get_port(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty @@ -836,7 +836,7 @@ parse_host(?STRING_EMPTY, URI) -> parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), - Port = binary_to_integer(H), + Port = get_port(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty @@ -869,7 +869,7 @@ parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), - Port = binary_to_integer(H), + Port = get_port(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), @@ -932,7 +932,7 @@ is_ipv6(Char) -> is_hex_digit(Char). parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), - Port = binary_to_integer(H), + Port = get_port(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty @@ -1148,7 +1148,7 @@ calculate_parsed_userinfo(Input, Unparsed) -> -spec calculate_parsed_host_port(binary(), binary()) -> binary(). calculate_parsed_host_port(Input, <<>>) -> - strip_last_char(Input, [$?,$#,$/]); + strip_last_char(Input, [$:,$?,$#,$/]); calculate_parsed_host_port(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). @@ -1159,6 +1159,18 @@ calculate_parsed_query_fragment(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). 
+get_port(<<>>) -> + undefined; +get_port(B) -> + try binary_to_integer(B) of + Port -> + Port + catch + error:badarg -> + throw({error, invalid_uri, B}) + end. + + %% Strip last char if it is in list strip_last_char(<<>>, _) -> <<>>; strip_last_char(Input, [C0]) -> @@ -1187,6 +1199,19 @@ strip_last_char(Input, [C0,C1,C2]) -> init_binary(Input); _Else -> Input + end; +strip_last_char(Input, [C0,C1,C2,C3]) -> + case binary:last(Input) of + C0 -> + init_binary(Input); + C1 -> + init_binary(Input); + C2 -> + init_binary(Input); + C3 -> + init_binary(Input); + _Else -> + Input end. @@ -1530,6 +1555,8 @@ update_host(#{}, URI) -> %% URI cannot be empty for ports. E.g. ":8080" is not a valid URI +update_port(#{port := undefined}, URI) -> + concat(URI, <<":">>); update_port(#{port := Port}, URI) -> concat(URI,add_colon(encode_port(Port))); update_port(#{}, URI) -> -- cgit v1.2.3 From ce78af7e5a76dc4a27673ab5c80a315762b992b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 27 Oct 2017 16:54:27 +0200 Subject: stdlib: Implement normalize/1 Implements the following Syntax-Based Normalizations: - Case Normalization - Percent-Encoding Normalization - Path Segment Normalization - Scheme-Based Normalization - HTTP(S) - Basic support for FTP, SSH, SFTP, TFTP --- lib/stdlib/src/uri_string.erl | 167 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 166 insertions(+), 1 deletion(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 16650d5005..cf8c388f54 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -227,7 +227,7 @@ %% External API %%------------------------------------------------------------------------- -export([compose_query/1, compose_query/2, - dissect_query/1, parse/1, + dissect_query/1, normalize/1, parse/1, recompose/1, transcode/2]). -export_type([error/0, uri_map/0, uri_string/0]). 
@@ -288,6 +288,21 @@ scheme => unicode:chardata(), userinfo => unicode:chardata()} | #{}. + +%%------------------------------------------------------------------------- +%% Normalize URIs +%%------------------------------------------------------------------------- +-spec normalize(URIString) -> NormalizedURI when + URIString :: uri_string(), + NormalizedURI :: uri_string(). +normalize(URIString) -> + %% Case normalization and percent-encoding normalization are achieved + %% by running parse and recompose on the input URI string. + M = parse(URIString), + M1 = normalize_scheme_based(M), + M2 = normalize_path_segment(M1), + recompose(M2). + %%------------------------------------------------------------------------- %% Parse URIs %%------------------------------------------------------------------------- @@ -1883,3 +1898,153 @@ form_urldecode(<>, Acc) -> end; form_urldecode(<>, _Acc) -> throw({error, invalid_character, [H]}). + + +%%------------------------------------------------------------------------- +%% Helper functions for normalize +%%------------------------------------------------------------------------- + +%% RFC 3986 +%% 6.2.2.3. Path Segment Normalization +%% 5.2.4. Remove Dot Segments +normalize_path_segment(Map) -> + Path = maps:get(path, Map, undefined), + Map#{path => remove_dot_segments(Path)}. + + +remove_dot_segments(Path) when is_binary(Path) -> + remove_dot_segments(Path, <<>>); +remove_dot_segments(Path) when is_list(Path) -> + B = convert_binary(Path, utf8, utf8), + B1 = remove_dot_segments(B, <<>>), + convert_list(B1, utf8). 
+%% +remove_dot_segments(<<>>, Output) -> + Output; +remove_dot_segments(<<"../",T/binary>>, Output) -> + remove_dot_segments(T, Output); +remove_dot_segments(<<"./",T/binary>>, Output) -> + remove_dot_segments(T, Output); +remove_dot_segments(<<"/./",T/binary>>, Output) -> + remove_dot_segments(<<$/,T/binary>>, Output); +remove_dot_segments(<<"/.">>, Output) -> + remove_dot_segments(<<$/>>, Output); +remove_dot_segments(<<"/../",T/binary>>, Output) -> + Out1 = remove_last_segment(Output), + remove_dot_segments(<<$/,T/binary>>, Out1); +remove_dot_segments(<<"/..">>, Output) -> + Out1 = remove_last_segment(Output), + remove_dot_segments(<<$/>>, Out1); +remove_dot_segments(<<$.>>, Output) -> + remove_dot_segments(<<>>, Output); +remove_dot_segments(<<"..">>, Output) -> + remove_dot_segments(<<>>, Output); +remove_dot_segments(Input, Output) -> + {First, Rest} = first_path_segment(Input), + remove_dot_segments(Rest, <>). + + +first_path_segment(Input) -> + F = first_path_segment(Input, <<>>), + split_binary(Input, byte_size(F)). +%% +first_path_segment(<<$/,T/binary>>, Acc) -> + first_path_segment_end(<>, <>); +first_path_segment(<>, Acc) -> + first_path_segment_end(<>, <>). + + +first_path_segment_end(<<>>, Acc) -> + Acc; +first_path_segment_end(<<$/,_/binary>>, Acc) -> + Acc; +first_path_segment_end(<>, Acc) -> + first_path_segment_end(<>, <>). + + +remove_last_segment(<<>>) -> + <<>>; +remove_last_segment(B) -> + {Init, Last} = split_binary(B, byte_size(B) - 1), + case Last of + <<$/>> -> + Init; + _Char -> + remove_last_segment(Init) + end. + + +%% RFC 3986, 6.2.3. 
Scheme-Based Normalization +normalize_scheme_based(Map) -> + Scheme = maps:get(scheme, Map, undefined), + Port = maps:get(port, Map, undefined), + Path= maps:get(path, Map, undefined), + case Scheme of + "http" -> + normalize_http(Map, Port, Path); + <<"http">> -> + normalize_http(Map, Port, Path); + "https" -> + normalize_https(Map, Port, Path); + <<"https">> -> + normalize_https(Map, Port, Path); + "ftp" -> + normalize_ftp(Map, Port); + <<"ftp">> -> + normalize_ftp(Map, Port); + "ssh" -> + normalize_ssh_sftp(Map, Port); + <<"ssh">> -> + normalize_ssh_sftp(Map, Port); + "sftp" -> + normalize_ssh_sftp(Map, Port); + <<"sftp">> -> + normalize_ssh_sftp(Map, Port); + "tftp" -> + normalize_tftp(Map, Port); + <<"tftp">> -> + normalize_tftp(Map, Port); + _Else -> Map + end. + + +normalize_http(Map, Port, Path) -> + M1 = normalize_port(Map, Port, 80), + normalize_http_path(M1, Path). + + +normalize_https(Map, Port, Path) -> + M1 = normalize_port(Map, Port, 443), + normalize_http_path(M1, Path). + + +normalize_ftp(Map, Port) -> + normalize_port(Map, Port, 21). + + +normalize_ssh_sftp(Map, Port) -> + normalize_port(Map, Port, 22). + + +normalize_tftp(Map, Port) -> + normalize_port(Map, Port, 69). + + +normalize_port(Map, Port, Default) -> + case Port of + Default -> + maps:remove(port, Map); + _Else -> + Map + end. + + +normalize_http_path(Map, Path) -> + case Path of + "" -> + Map#{path => "/"}; + <<>> -> + Map#{path => <<"/">>}; + _Else -> + Map + end. 
-- cgit v1.2.3 From 7a4d4e183ae5567d6242184b8268918904c872c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 30 Oct 2017 16:57:49 +0100 Subject: stdlib: Refactor helper functions in uri_string --- lib/stdlib/src/uri_string.erl | 142 ++++++++++++++++++++---------------------- 1 file changed, 66 insertions(+), 76 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index cf8c388f54..2c73e38324 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -273,7 +273,7 @@ %% %x96 ` grave / accent %%------------------------------------------------------------------------- -type uri_string() :: iodata(). --type error() :: {error, atom(), list() | binary()}. +-type error() :: {error, atom(), term()}. %%------------------------------------------------------------------------- @@ -298,10 +298,11 @@ normalize(URIString) -> %% Case normalization and percent-encoding normalization are achieved %% by running parse and recompose on the input URI string. - M = parse(URIString), - M1 = normalize_scheme_based(M), - M2 = normalize_path_segment(M1), - recompose(M2). + recompose( + normalize_path_segment( + normalize_scheme_based( + parse(URIString)))). + %%------------------------------------------------------------------------- %% Parse URIs @@ -311,8 +312,7 @@ normalize(URIString) -> URIMap :: uri_map() | error(). parse(URIString) when is_binary(URIString) -> - try parse_uri_reference(URIString, #{}) of - Result -> Result + try parse_uri_reference(URIString, #{}) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; @@ -321,8 +321,6 @@ parse(URIString) when is_list(URIString) -> Binary = unicode:characters_to_binary(URIString), Map = parse_uri_reference(Binary, #{}), convert_mapfields_to_list(Map) - of - Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. 
@@ -348,8 +346,6 @@ recompose(Map) -> T4 = update_path(Map, T3), T5 = update_query(Map, T4), update_fragment(Map, T5) - of - Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end @@ -371,8 +367,6 @@ transcode(URIString, Options) when is_binary(URIString) -> List = convert_list(URIString, InEnc), Output = transcode(List, [], InEnc, OutEnc), convert_binary(Output, utf8, OutEnc) - of - Result -> Result catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; @@ -380,8 +374,7 @@ transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), OutEnc = proplists:get_value(out_encoding, Options, utf8), Flattened = flatten_list(URIString, InEnc), - try transcode(Flattened, [], InEnc, OutEnc) of - Result -> Result + try transcode(Flattened, [], InEnc, OutEnc) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. @@ -413,8 +406,7 @@ compose_query(List) -> compose_query([],_Options) -> []; compose_query(List, Options) -> - try compose_query(List, Options, false, <<>>) of - Result -> Result + try compose_query(List, Options, false, <<>>) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. @@ -423,8 +415,7 @@ compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> Separator = get_separator(Options, Rest), K = form_urlencode(Key), V = form_urlencode(Value), - Flag = is_list(Key) orelse is_list(Value), - IsListNew = IsList orelse Flag, + IsListNew = IsList orelse is_list(Key) orelse is_list(Value), compose_query(Rest, Options, IsListNew, < -%% E.g. "//user@" - invalid URI -%% - userinfo port -%% E.g. "//user@:8080" => #{host => [],port => 8080,userinfo => "user"} -%% There is always at least an empty host when both userinfo and port -%% are present. 
-%% - #{path => "///"} otherwise the following would be true: -%% "/////" = uri_string:recompose(#{host => "", path => "///"}) -%% "/////" = uri_string:recompose(#{path => "/////"}) -%% AND -%% path-absolute = "/" [ segment-nz *( "/" segment ) ] -%%------------------------------------------------------------------------- -is_valid_map(Map) -> - case - ((not maps:is_key(userinfo, Map) andalso - not maps:is_key(host, Map) andalso - maps:is_key(port, Map)) - orelse - (maps:is_key(userinfo, Map) andalso - not maps:is_key(host, Map) andalso - not maps:is_key(port, Map)) - orelse - (maps:is_key(userinfo, Map) andalso - not maps:is_key(host, Map) andalso - maps:is_key(port, Map))) orelse - not maps:is_key(path, Map) orelse - not is_host_and_path_valid(Map) orelse - invalid_field_present(Map) - of +%% +%% The implementation is based on a decision tree that fulfills the +%% following rules: +%% - 'path' shall always be present in the input map +%% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] +%% hier-part = "//" authority path-abempty +%% / path-absolute +%% / path-rootless +%% / path-empty +%% - 'host' shall be present in the input map when 'path' starts with +%% two slashes ("//") +%% path = path-abempty ; begins with "/" or is empty +%% / path-absolute ; begins with "/" but not "//" +%% / path-noscheme ; begins with a non-colon segment +%% / path-rootless ; begins with a segment +%% / path-empty ; zero characters +%% path-abempty = *( "/" segment ) +%% segment = *pchar +%% - 'host' shall be present if userinfo or port is present in input map +%% authority = [ userinfo "@" ] host [ ":" port ] +%% - All fields shall be valid (scheme, userinfo, host, port, path, query +%% or fragment). +%%------------------------------------------------------------------------- +is_valid_map(#{path := Path} = Map) -> + case starts_with_two_slash(Path) of true -> - false; + is_valid_map_host(Map); false -> - true - end. 
+ case maps:is_key(userinfo, Map) of + true -> + is_valid_map_host(Map); + false -> + case maps:is_key(port, Map) of + true -> + is_valid_map_host(Map); + false -> + all_fields_valid(Map) + end + end + end; +is_valid_map(#{}) -> + false. -invalid_field_present(Map) -> - Fun = fun(K, _, AccIn) -> AccIn orelse - ((K =/= scheme) andalso (K =/= userinfo) - andalso (K =/= host) andalso (K =/= port) - andalso (K =/= path) andalso (K =/= query) - andalso (K =/= fragment)) - end, - maps:fold(Fun, false, Map). +is_valid_map_host(Map) -> + maps:is_key(host, Map) andalso all_fields_valid(Map). -is_host_and_path_valid(Map) -> - Host = maps:get(host, Map, undefined), - Path = maps:get(path, Map, undefined), - not (Host =:= undefined andalso starts_with_two_slash(Path)). +all_fields_valid(Map) -> + Fun = fun(scheme, _, Acc) -> Acc; + (userinfo, _, Acc) -> Acc; + (host, _, Acc) -> Acc; + (port, _, Acc) -> Acc; + (path, _, Acc) -> Acc; + (query, _, Acc) -> Acc; + (fragment, _, Acc) -> Acc; + (_, _, _) -> false + end, + maps:fold(Fun, true, Map). starts_with_two_slash([$/,$/|_]) -> -- cgit v1.2.3 From a4c3f8d3b270b9c21caabcd084bf55049b5bc700 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Tue, 31 Oct 2017 13:24:13 +0100 Subject: stdlib: Fix case normalization (normalize/1) --- lib/stdlib/src/uri_string.erl | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 2c73e38324..b8e0432fd6 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -296,12 +296,14 @@ URIString :: uri_string(), NormalizedURI :: uri_string(). normalize(URIString) -> - %% Case normalization and percent-encoding normalization are achieved - %% by running parse and recompose on the input URI string. 
+ %% Percent-encoding normalization and case normalization for + %% percent-encoded triplets are achieved by running parse and + %% recompose on the input URI string. recompose( normalize_path_segment( normalize_scheme_based( - parse(URIString)))). + normalize_case( + parse(URIString))))). %%------------------------------------------------------------------------- @@ -1894,7 +1896,32 @@ form_urldecode(<>, _Acc) -> %% Helper functions for normalize %%------------------------------------------------------------------------- -%% RFC 3986 +%% 6.2.2.1. Case Normalization +normalize_case(#{scheme := Scheme, host := Host} = Map) -> + Map#{scheme => to_lower(Scheme), + host => to_lower(Host)}; +normalize_case(#{host := Host} = Map) -> + Map#{host => to_lower(Host)}; +normalize_case(#{scheme := Scheme} = Map) -> + Map#{scheme => to_lower(Scheme)}; +normalize_case(#{} = Map) -> + Map. + + +to_lower(Cs) when is_list(Cs) -> + B = convert_binary(Cs, utf8, utf8), + convert_list(to_lower(B), utf8); +to_lower(Cs) when is_binary(Cs) -> + to_lower(Cs, <<>>). +%% +to_lower(<>, Acc) when $A =< C, C =< $Z -> + to_lower(Cs, <>); +to_lower(<>, Acc) -> + to_lower(Cs, <>); +to_lower(<<>>, Acc) -> + Acc. + + %% 6.2.2.3. Path Segment Normalization %% 5.2.4. 
Remove Dot Segments normalize_path_segment(Map) -> -- cgit v1.2.3 From fdfe083c65348095c4168581bdc53e7508be78c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Wed, 1 Nov 2017 13:18:34 +0100 Subject: stdlib: Add uri_string module to stdlib.app.src --- lib/stdlib/src/stdlib.app.src | 1 + 1 file changed, 1 insertion(+) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/stdlib.app.src b/lib/stdlib/src/stdlib.app.src index ab0824ca17..5fb48acfab 100644 --- a/lib/stdlib/src/stdlib.app.src +++ b/lib/stdlib/src/stdlib.app.src @@ -101,6 +101,7 @@ timer, unicode, unicode_util, + uri_string, win32reg, zip]}, {registered,[timer_server,rsh_starter,take_over_monitor,pool_master, -- cgit v1.2.3 From 74c2a9db0caa376ea375614fcc67c3a9295737d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Fri, 3 Nov 2017 10:07:12 +0100 Subject: stdlib: Refactor functions in uri_string --- lib/stdlib/src/uri_string.erl | 111 +++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 61 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index b8e0432fd6..f4acf1885d 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -366,9 +366,9 @@ transcode(URIString, Options) when is_binary(URIString) -> try InEnc = proplists:get_value(in_encoding, Options, utf8), OutEnc = proplists:get_value(out_encoding, Options, utf8), - List = convert_list(URIString, InEnc), + List = convert_to_list(URIString, InEnc), Output = transcode(List, [], InEnc, OutEnc), - convert_binary(Output, utf8, OutEnc) + convert_to_binary(Output, utf8, OutEnc) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; @@ -421,7 +421,7 @@ compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> compose_query(Rest, Options, IsListNew, < - normalize_http(Map, Port, Path); - <<"http">> -> - normalize_http(Map, Port, Path); - "https" -> - normalize_https(Map, Port, Path); - 
<<"https">> -> - normalize_https(Map, Port, Path); - "ftp" -> - normalize_ftp(Map, Port); - <<"ftp">> -> - normalize_ftp(Map, Port); - "ssh" -> - normalize_ssh_sftp(Map, Port); - <<"ssh">> -> - normalize_ssh_sftp(Map, Port); - "sftp" -> - normalize_ssh_sftp(Map, Port); - <<"sftp">> -> - normalize_ssh_sftp(Map, Port); - "tftp" -> - normalize_tftp(Map, Port); - <<"tftp">> -> - normalize_tftp(Map, Port); - _Else -> Map - end. + normalize_scheme_based(Map, Scheme, Port, Path). +%% +normalize_scheme_based(Map, Scheme, Port, Path) + when Scheme =:= "http"; Scheme =:= <<"http">> -> + normalize_http(Map, Port, Path); +normalize_scheme_based(Map, Scheme, Port, Path) + when Scheme =:= "https"; Scheme =:= <<"https">> -> + normalize_https(Map, Port, Path); +normalize_scheme_based(Map, Scheme, Port, _Path) + when Scheme =:= "ftp"; Scheme =:= <<"ftp">> -> + normalize_ftp(Map, Port); +normalize_scheme_based(Map, Scheme, Port, _Path) + when Scheme =:= "ssh"; Scheme =:= <<"ssh">> -> + normalize_ssh_sftp(Map, Port); +normalize_scheme_based(Map, Scheme, Port, _Path) + when Scheme =:= "sftp"; Scheme =:= <<"sftp">> -> + normalize_ssh_sftp(Map, Port); +normalize_scheme_based(Map, Scheme, Port, _Path) + when Scheme =:= "tftp"; Scheme =:= <<"tftp">> -> + normalize_tftp(Map, Port); +normalize_scheme_based(Map, _, _, _) -> + Map. normalize_http(Map, Port, Path) -> -- cgit v1.2.3 From 7e5d062973e7cb4f9ee949529e9dcdb5785c1304 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Mon, 6 Nov 2017 09:54:12 +0100 Subject: stdlib: Remove compose_query and dissect_query compose_query/{1,2} and dissect_query/1 removed as the implemented specification (HTML 2.0) is old. They will be re-implemented based on HTML5. 
--- lib/stdlib/src/uri_string.erl | 216 +----------------------------------------- 1 file changed, 1 insertion(+), 215 deletions(-) (limited to 'lib/stdlib/src') diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index f4acf1885d..22212da222 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -226,8 +226,7 @@ %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- --export([compose_query/1, compose_query/2, - dissect_query/1, normalize/1, parse/1, +-export([normalize/1, parse/1, recompose/1, transcode/2]). -export_type([error/0, uri_map/0, uri_string/0]). @@ -382,75 +381,6 @@ transcode(URIString, Options) when is_list(URIString) -> end. -%%------------------------------------------------------------------------- -%% Functions for working with the query part of a URI as a list -%% of key/value pairs. -%% HTML 2.0 (RFC 1866) defines a media type application/x-www-form-urlencoded -%% in section [8.2.1] "The form-urlencoded Media Type". -%%------------------------------------------------------------------------- - -%%------------------------------------------------------------------------- -%% Compose urlencoded query string from a list of unescaped key/value pairs. -%%------------------------------------------------------------------------- --spec compose_query(QueryList) -> QueryString when - QueryList :: [{uri_string(), uri_string()}], - QueryString :: uri_string() - | error(). -compose_query(List) -> - compose_query(List, []). - - --spec compose_query(QueryList, Options) -> QueryString when - QueryList :: [{uri_string(), uri_string()}], - Options :: [{separator, atom()}], - QueryString :: uri_string() - | error(). -compose_query([],_Options) -> - []; -compose_query(List, Options) -> - try compose_query(List, Options, false, <<>>) - catch - throw:{error, Atom, RestData} -> {error, Atom, RestData} - end. 
-%% -compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> - Separator = get_separator(Options, Rest), - K = form_urlencode(Key), - V = form_urlencode(Value), - IsListNew = IsList orelse is_list(Key) orelse is_list(Value), - compose_query(Rest, Options, IsListNew, <>; -get_separator([{separator, amp}], _L) -> - <<"&">>; -get_separator([{separator, escaped_amp}], _L) -> - <<"&">>; -get_separator([{separator, semicolon}], _L) -> - <<";">>. - - -%% Form-urlencode input based on RFC 1866 [8.2.1] -form_urlencode(Cs) when is_list(Cs) -> - B = convert_to_binary(Cs, utf8, utf8), - form_urlencode(B, <<>>); -form_urlencode(Cs) -> - form_urlencode(Cs, <<>>). -%% -form_urlencode(<<>>, Acc) -> - Acc; -form_urlencode(<<$ ,T/binary>>, Acc) -> - form_urlencode(T, <>); -form_urlencode(<>, Acc) -> - case is_url_char(H) of - true -> - form_urlencode(T, <>); - false -> - E = percent_encode_binary(H), - form_urlencode(T, <>) - end; -form_urlencode(<>, _Acc) -> - throw({error,invalid_utf8,<>}); -form_urlencode(H, _Acc) -> - throw({error,invalid_input, H}). - - -%% Return true if input char can appear in URL according to -%% RFC 1738 "Uniform Resource Locators". -is_url_char(C) - when 0 =< C, C =< 31; - 128 =< C, C =< 255 -> false; -is_url_char(127) -> false; -is_url_char(C) -> - not (is_reserved(C) orelse is_unsafe(C)). - - -%% Reserved characters (RFC 1738) -is_reserved($;) -> true; -is_reserved($/) -> true; -is_reserved($?) -> true; -is_reserved($:) -> true; -is_reserved($@) -> true; -is_reserved($=) -> true; -is_reserved($&) -> true; -is_reserved(_) -> false. - - -%% Unsafe characters (RFC 1738) -is_unsafe(${) -> true; -is_unsafe($}) -> true; -is_unsafe($|) -> true; -is_unsafe($\\) -> true; -is_unsafe($^) -> true; -is_unsafe($~) -> true; -is_unsafe($[) -> true; -is_unsafe($]) -> true; -is_unsafe($`) -> true; -is_unsafe(_) -> false. 
- - -%%------------------------------------------------------------------------- -%% Helper functions for dissect_query -%%------------------------------------------------------------------------- -dissect_query_key(<<$=,T/binary>>, IsList, Acc, Key, Value) -> - dissect_query_value(T, IsList, Acc, Key, Value); -dissect_query_key(<>, IsList, Acc, Key, Value) -> - dissect_query_key(T, IsList, Acc, <>, Value); -dissect_query_key(B, _, _, _, _) -> - throw({error, missing_value, B}). - - -dissect_query_value(<<$&,_/binary>> = B, IsList, Acc, Key, Value) -> - K = form_urldecode(IsList, Key), - V = form_urldecode(IsList, Value), - dissect_query_separator_amp(B, IsList, [{K,V}|Acc], <<>>, <<>>); -dissect_query_value(<<$;,_/binary>> = B, IsList, Acc, Key, Value) -> - K = form_urldecode(IsList, Key), - V = form_urldecode(IsList, Value), - dissect_query_separator_semicolon(B, IsList, [{K,V}|Acc], <<>>, <<>>); -dissect_query_value(<>, IsList, Acc, Key, Value) -> - dissect_query_value(T, IsList, Acc, Key, <>); -dissect_query_value(<<>>, IsList, Acc, Key, Value) -> - K = form_urldecode(IsList, Key), - V = form_urldecode(IsList, Value), - lists:reverse([{K,V}|Acc]). - - -dissect_query_separator_amp(<<"&",T/binary>>, IsList, Acc, Key, Value) -> - dissect_query_key(T, IsList, Acc, Key, Value); -dissect_query_separator_amp(<<$&,T/binary>>, IsList, Acc, Key, Value) -> - dissect_query_key(T, IsList, Acc, Key, Value). - - -dissect_query_separator_semicolon(<<$;,T/binary>>, IsList, Acc, Key, Value) -> - dissect_query_key(T, IsList, Acc, Key, Value). 
- - -%% Form-urldecode input based on RFC 1866 [8.2.1] -form_urldecode(true, B) -> - Result = form_urldecode(B, <<>>), - convert_to_list(Result, utf8); -form_urldecode(false, B) -> - form_urldecode(B, <<>>); -form_urldecode(<<>>, Acc) -> - Acc; -form_urldecode(<<$+,T/binary>>, Acc) -> - form_urldecode(T, <>); -form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> - case is_hex_digit(C0) andalso is_hex_digit(C1) of - true -> - V = ?HEX2DEC(C0)*16+?HEX2DEC(C1), - form_urldecode(T, <>); - false -> - L = convert_to_list(<<$%,C0,C1,T/binary>>, utf8), - throw({error, invalid_percent_encoding, L}) - end; -form_urldecode(<>, Acc) -> - case is_url_char(H) of - true -> - form_urldecode(T, <>); - false -> - throw({error, invalid_character, [H]}) - end; -form_urldecode(<>, _Acc) -> - throw({error, invalid_character, [H]}). - - %%------------------------------------------------------------------------- %% Helper functions for normalize %%------------------------------------------------------------------------- -- cgit v1.2.3