%% %% %CopyrightBegin% %% %% Copyright Ericsson AB 2017. All Rights Reserved. %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. %% You may obtain a copy of the License at %% %% http://www.apache.org/licenses/LICENSE-2.0 %% %% Unless required by applicable law or agreed to in writing, software %% distributed under the License is distributed on an "AS IS" BASIS, %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. %% See the License for the specific language governing permissions and %% limitations under the License. %% %% %CopyrightEnd% %% %% %% [RFC 3986, Chapter 2.2. Reserved Characters] %% %% reserved = gen-delims / sub-delims %% %% gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" %% %% sub-delims = "!" / "$" / "&" / "'" / "(" / ")" %% / "*" / "+" / "," / ";" / "=" %% %% %% [RFC 3986, Chapter 2.3. Unreserved Characters] %% %% unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" %% %% %% [RFC 3986, Chapter 3. Syntax Components] %% %% The generic URI syntax consists of a hierarchical sequence of %% components referred to as the scheme, authority, path, query, and %% fragment. %% %% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] %% %% hier-part = "//" authority path-abempty %% / path-absolute %% / path-rootless %% / path-empty %% %% The scheme and path components are required, though the path may be %% empty (no characters). When authority is present, the path must %% either be empty or begin with a slash ("/") character. When %% authority is not present, the path cannot begin with two slash %% characters ("//"). These restrictions result in five different ABNF %% rules for a path (Section 3.3), only one of which will match any %% given URI reference. %% %% The following are two example URIs and their component parts: %% %% foo://example.com:8042/over/there?name=ferret#nose %% \_/ \______________/\_________/ \_________/ \__/ %% | | | | | %% scheme authority path query fragment %% | _____________________|__ %% / \ / \ %% urn:example:animal:ferret:nose %% %% %% [RFC 3986, Chapter 3.1. Scheme] %% %% Each URI begins with a scheme name that refers to a specification for %% assigning identifiers within that scheme. %% %% scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) %% %% %% [RFC 3986, Chapter 3.2. Authority] %% %% Many URI schemes include a hierarchical element for a naming %% authority so that governance of the name space defined by the %% remainder of the URI is delegated to that authority (which may, in %% turn, delegate it further). %% %% authority = [ userinfo "@" ] host [ ":" port ] %% %% %% [RFC 3986, Chapter 3.2.1. User Information] %% %% The userinfo subcomponent may consist of a user name and, optionally, %% scheme-specific information about how to gain authorization to access %% the resource. The user information, if present, is followed by a %% commercial at-sign ("@") that delimits it from the host. %% %% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) %% %% %% [RFC 3986, Chapter 3.2.2. Host] %% %% The host subcomponent of authority is identified by an IP literal %% encapsulated within square brackets, an IPv4 address in dotted- %% decimal form, or a registered name. %% %% host = IP-literal / IPv4address / reg-name %% %% IP-literal = "[" ( IPv6address / IPvFuture ) "]" %% %% IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) %% %% IPv6address = 6( h16 ":" ) ls32 %% / "::" 5( h16 ":" ) ls32 %% / [ h16 ] "::" 4( h16 ":" ) ls32 %% / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 %% / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 %% / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 %% / [ *4( h16 ":" ) h16 ] "::" ls32 %% / [ *5( h16 ":" ) h16 ] "::" h16 %% / [ *6( h16 ":" ) h16 ] "::" %% %% ls32 = ( h16 ":" h16 ) / IPv4address %% ; least-significant 32 bits of address %% %% h16 = 1*4HEXDIG %% ; 16 bits of address represented in hexadecimal %% %% IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet %% %% dec-octet = DIGIT ; 0-9 %% / %x31-39 DIGIT ; 10-99 %% / "1" 2DIGIT ; 100-199 %% / "2" %x30-34 DIGIT ; 200-249 %% / "25" %x30-35 ; 250-255 %% %% reg-name = *( unreserved / pct-encoded / sub-delims ) %% %% %% [RFC 3986, Chapter 3.2.2. Port] %% %% The port subcomponent of authority is designated by an optional port %% number in decimal following the host and delimited from it by a %% single colon (":") character. %% %% port = *DIGIT %% %% %% [RFC 3986, Chapter 3.3. Path] %% %% The path component contains data, usually organized in hierarchical %% form, that, along with data in the non-hierarchical query component %% (Section 3.4), serves to identify a resource within the scope of the %% URI's scheme and naming authority (if any). The path is terminated %% by the first question mark ("?") or number sign ("#") character, or %% by the end of the URI. %% %% path = path-abempty ; begins with "/" or is empty %% / path-absolute ; begins with "/" but not "//" %% / path-noscheme ; begins with a non-colon segment %% / path-rootless ; begins with a segment %% / path-empty ; zero characters %% %% path-abempty = *( "/" segment ) %% path-absolute = "/" [ segment-nz *( "/" segment ) ] %% path-noscheme = segment-nz-nc *( "/" segment ) %% path-rootless = segment-nz *( "/" segment ) %% path-empty = 0 %% segment = *pchar %% segment-nz = 1*pchar %% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) %% ; non-zero-length segment without any colon ":" %% %% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" %% %% %% [RFC 3986, Chapter 3.4. Query] %% %% The query component contains non-hierarchical data that, along with %% data in the path component (Section 3.3), serves to identify a %% resource within the scope of the URI's scheme and naming authority %% (if any). The query component is indicated by the first question %% mark ("?") character and terminated by a number sign ("#") character %% or by the end of the URI. %% %% query = *( pchar / "/" / "?" ) %% %% %% [RFC 3986, Chapter 3.5. Fragment] %% %% The fragment identifier component of a URI allows indirect %% identification of a secondary resource by reference to a primary %% resource and additional identifying information. %% %% fragment = *( pchar / "/" / "?" ) %% %% %% [RFC 3986, Chapter 4.1. URI Reference] %% %% URI-reference is used to denote the most common usage of a resource %% identifier. %% %% URI-reference = URI / relative-ref %% %% %% [RFC 3986, Chapter 4.2. Relative Reference] %% %% A relative reference takes advantage of the hierarchical syntax %% (Section 1.2.3) to express a URI reference relative to the name space %% of another hierarchical URI. %% %% relative-ref = relative-part [ "?" query ] [ "#" fragment ] %% %% relative-part = "//" authority path-abempty %% / path-absolute %% / path-noscheme %% / path-empty %% %% %% [RFC 3986, Chapter 4.3. Absolute URI] %% %% Some protocol elements allow only the absolute form of a URI without %% a fragment identifier. For example, defining a base URI for later %% use by relative references calls for an absolute-URI syntax rule that %% does not allow a fragment. %% %% absolute-URI = scheme ":" hier-part [ "?" query ] %% -module(uri_string). %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- -export([compose_query/1, compose_query/2, dissect_query/1, normalize/1, normalize/2, parse/1, recompose/1, transcode/2]). -export_type([error/0, uri_map/0, uri_string/0]). %%------------------------------------------------------------------------- %% Internal API %%------------------------------------------------------------------------- -export([is_host/1, is_path/1]). % suppress warnings %%------------------------------------------------------------------------- %% Macros %%------------------------------------------------------------------------- -define(CHAR(Char), <>). -define(STRING_EMPTY, <<>>). -define(STRING(MatchStr), <>). -define(STRING_REST(MatchStr, Rest), <>). -define(DEC2HEX(X), if ((X) >= 0) andalso ((X) =< 9) -> (X) + $0; ((X) >= 10) andalso ((X) =< 15) -> (X) + $A - 10 end). -define(HEX2DEC(X), if ((X) >= $0) andalso ((X) =< $9) -> (X) - $0; ((X) >= $A) andalso ((X) =< $F) -> (X) - $A + 10; ((X) >= $a) andalso ((X) =< $f) -> (X) - $a + 10 end). %%%========================================================================= %%% API %%%========================================================================= %%------------------------------------------------------------------------- %% URI compliant with RFC 3986 %% ASCII %x21 - %x7A ("!" - "z") except %% %x34 " double quote %% %x60 < less than %% %x62 > greater than %% %x92 \ backslash %% %x94 ^ caret / circumflex %% %x96 ` grave / accent %%------------------------------------------------------------------------- -type uri_string() :: iodata(). -type error() :: {error, atom(), term()}. %%------------------------------------------------------------------------- %% RFC 3986, Chapter 3. Syntax Components %%------------------------------------------------------------------------- -type uri_map() :: #{fragment => unicode:chardata(), host => unicode:chardata(), path => unicode:chardata(), port => non_neg_integer() | undefined, query => unicode:chardata(), scheme => unicode:chardata(), userinfo => unicode:chardata()} | #{}. %%------------------------------------------------------------------------- %% Normalize URIs %%------------------------------------------------------------------------- -spec normalize(URI) -> NormalizedURI when URI :: uri_string() | uri_map(), NormalizedURI :: uri_string() | error(). normalize(URIMap) -> try normalize(URIMap, []) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. -spec normalize(URI, Options) -> NormalizedURI when URI :: uri_string() | uri_map(), Options :: [return_map], NormalizedURI :: uri_string() | uri_map(). normalize(URIMap, []) when is_map(URIMap) -> recompose(normalize_map(URIMap)); normalize(URIMap, [return_map]) when is_map(URIMap) -> normalize_map(URIMap); normalize(URIString, []) -> case parse(URIString) of Value when is_map(Value) -> recompose(normalize_map(Value)); Error -> Error end; normalize(URIString, [return_map]) -> case parse(URIString) of Value when is_map(Value) -> normalize_map(Value); Error -> Error end. %%------------------------------------------------------------------------- %% Parse URIs %%------------------------------------------------------------------------- -spec parse(URIString) -> URIMap when URIString :: uri_string(), URIMap :: uri_map() | error(). parse(URIString) when is_binary(URIString) -> try parse_uri_reference(URIString, #{}) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; parse(URIString) when is_list(URIString) -> try Binary = unicode:characters_to_binary(URIString), Map = parse_uri_reference(Binary, #{}), convert_mapfields_to_list(Map) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. %%------------------------------------------------------------------------- %% Recompose URIs %%------------------------------------------------------------------------- -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string() | error(). recompose(Map) -> case is_valid_map(Map) of false -> {error, invalid_map, Map}; true -> try T0 = update_scheme(Map, empty), T1 = update_userinfo(Map, T0), T2 = update_host(Map, T1), T3 = update_port(Map, T2), T4 = update_path(Map, T3), T5 = update_query(Map, T4), update_fragment(Map, T5) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end end. %%------------------------------------------------------------------------- %% Transcode URIs %%------------------------------------------------------------------------- -spec transcode(URIString, Options) -> Result when URIString :: uri_string(), Options :: [{in_encoding, unicode:encoding()}|{out_encoding, unicode:encoding()}], Result :: uri_string() | error(). transcode(URIString, Options) when is_binary(URIString) -> try InEnc = proplists:get_value(in_encoding, Options, utf8), OutEnc = proplists:get_value(out_encoding, Options, utf8), List = convert_to_list(URIString, InEnc), Output = transcode(List, [], InEnc, OutEnc), convert_to_binary(Output, utf8, OutEnc) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end; transcode(URIString, Options) when is_list(URIString) -> InEnc = proplists:get_value(in_encoding, Options, utf8), OutEnc = proplists:get_value(out_encoding, Options, utf8), Flattened = flatten_list(URIString, InEnc), try transcode(Flattened, [], InEnc, OutEnc) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. %%------------------------------------------------------------------------- %% Functions for working with the query part of a URI as a list %% of key/value pairs. %% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8 %% HTML 5.0 - 4.10.22.6 URL-encoded form data - non UTF-8 %%------------------------------------------------------------------------- %%------------------------------------------------------------------------- %% Compose urlencoded query string from a list of unescaped key/value pairs. %% (application/x-www-form-urlencoded encoding algorithm) %%------------------------------------------------------------------------- -spec compose_query(QueryList) -> QueryString when QueryList :: [{unicode:chardata(), unicode:chardata()}], QueryString :: uri_string() | error(). compose_query(List) -> compose_query(List, [{encoding, utf8}]). -spec compose_query(QueryList, Options) -> QueryString when QueryList :: [{unicode:chardata(), unicode:chardata()}], Options :: [{encoding, atom()}], QueryString :: uri_string() | error(). compose_query([],_Options) -> []; compose_query(List, Options) -> try compose_query(List, Options, false, <<>>) catch throw:{error, Atom, RestData} -> {error, Atom, RestData} end. %% compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> Separator = get_separator(Rest), K = form_urlencode(Key, Options), V = form_urlencode(Value, Options), IsListNew = IsList orelse is_list(Key) orelse is_list(Value), compose_query(Rest, Options, IsListNew, < uri_map(). parse_relative_part(?STRING_REST("//", Rest), URI) -> %% Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> Userinfo = calculate_parsed_userinfo(Rest, T), URI2 = maybe_add_path(URI1), URI2#{userinfo => Userinfo} catch throw:{_,_,_} -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_host_port(Rest, T), URI2 = maybe_add_path(URI1), URI2#{host => remove_brackets(Host)} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute Path = calculate_parsed_part(Rest, T), URI1#{path => ?STRING_REST($/, Path)}; parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_query_fragment(Rest, T), URI2 = maybe_add_path(URI1), URI2#{query => Query}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_query_fragment(Rest, T), URI2 = maybe_add_path(URI1), URI2#{fragment => Fragment}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> {T, URI1} = parse_segment_nz_nc(Rest, URI), % path-noscheme Path = calculate_parsed_part(Rest, T), URI1#{path => ?STRING_REST(Char, Path)}; false -> throw({error,invalid_uri,[Char]}) end. %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.3. Path] %% %% The path component contains data, usually organized in hierarchical %% form, that, along with data in the non-hierarchical query component %% (Section 3.4), serves to identify a resource within the scope of the %% URI's scheme and naming authority (if any). The path is terminated %% by the first question mark ("?") or number sign ("#") character, or %% by the end of the URI. %% %% path = path-abempty ; begins with "/" or is empty %% / path-absolute ; begins with "/" but not "//" %% / path-noscheme ; begins with a non-colon segment %% / path-rootless ; begins with a segment %% / path-empty ; zero characters %% %% path-abempty = *( "/" segment ) %% path-absolute = "/" [ segment-nz *( "/" segment ) ] %% path-noscheme = segment-nz-nc *( "/" segment ) %% path-rootless = segment-nz *( "/" segment ) %% path-empty = 0 %% segment = *pchar %% segment-nz = 1*pchar %% segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) %% ; non-zero-length segment without any colon ":" %% %% pchar = unreserved / pct-encoded / sub-delims / ":" / "@" %%------------------------------------------------------------------------- %%------------------------------------------------------------------------- %% path-abempty %%------------------------------------------------------------------------- -spec parse_segment(binary(), uri_map()) -> {binary(), uri_map()}. parse_segment(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => Query}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of true -> parse_segment(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_segment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. %%------------------------------------------------------------------------- %% path-noscheme %%------------------------------------------------------------------------- -spec parse_segment_nz_nc(binary(), uri_map()) -> {binary(), uri_map()}. parse_segment_nz_nc(?STRING_REST($/, Rest), URI) -> parse_segment(Rest, URI); % segment parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % ?query Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => Query}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> parse_segment_nz_nc(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_segment_nz_nc(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. %% Check if char is pchar. -spec is_pchar(char()) -> boolean(). is_pchar($%) -> true; % pct-encoded is_pchar($:) -> true; is_pchar($@) -> true; is_pchar(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). %% Check if char is segment_nz_nc. -spec is_segment_nz_nc(char()) -> boolean(). is_segment_nz_nc($%) -> true; % pct-encoded is_segment_nz_nc($@) -> true; is_segment_nz_nc(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.1. Scheme] %% %% Each URI begins with a scheme name that refers to a specification for %% assigning identifiers within that scheme. %% %% scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) %%------------------------------------------------------------------------- -spec parse_scheme_start(binary(), uri_map()) -> uri_map(). parse_scheme_start(?STRING_REST(Char, Rest), URI) -> case is_alpha(Char) of true -> {T, URI1} = parse_scheme(Rest, URI), Scheme = calculate_parsed_scheme(Rest, T), URI2 = maybe_add_path(URI1), URI2#{scheme => ?STRING_REST(Char, Scheme)}; false -> throw({error,invalid_uri,[Char]}) end. %% Add path component if it missing after parsing the URI. %% According to the URI specification there is always a %% path component in every URI-reference and it can be %% empty. maybe_add_path(Map) -> case maps:is_key(path, Map) of false -> Map#{path => <<>>}; _Else -> Map end. -spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}. parse_scheme(?STRING_REST($:, Rest), URI) -> {_, URI1} = parse_hier(Rest, URI), {Rest, URI1}; parse_scheme(?STRING_REST(Char, Rest), URI) -> case is_scheme(Char) of true -> parse_scheme(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_scheme(?STRING_EMPTY, _URI) -> throw({error,invalid_uri,<<>>}). %% Check if char is allowed in scheme -spec is_scheme(char()) -> boolean(). is_scheme($+) -> true; is_scheme($-) -> true; is_scheme($.) -> true; is_scheme(Char) -> is_alpha(Char) orelse is_digit(Char). %%------------------------------------------------------------------------- %% hier-part = "//" authority path-abempty %% / path-absolute %% / path-rootless %% / path-empty %%------------------------------------------------------------------------- -spec parse_hier(binary(), uri_map()) -> {binary(), uri_map()}. parse_hier(?STRING_REST("//", Rest), URI) -> % Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> Userinfo = calculate_parsed_userinfo(Rest, T), {Rest, URI1#{userinfo => Userinfo}} catch throw:{_,_,_} -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_host_port(Rest, T), {Rest, URI1#{host => remove_brackets(Host)}} end; parse_hier(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => ?STRING_REST($/, Path)}}; parse_hier(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => Query}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of true -> % segment_nz {T, URI1} = parse_segment(Rest, URI), Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => ?STRING_REST(Char, Path)}}; false -> throw({error,invalid_uri,[Char]}) end; parse_hier(?STRING_EMPTY, URI) -> {<<>>, URI}. %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.2. Authority] %% %% Many URI schemes include a hierarchical element for a naming %% authority so that governance of the name space defined by the %% remainder of the URI is delegated to that authority (which may, in %% turn, delegate it further). %% %% The authority component is preceded by a double slash ("//") and is %% terminated by the next slash ("/"), question mark ("?"), or number %% sign ("#") character, or by the end of the URI. %% %% authority = [ userinfo "@" ] host [ ":" port ] %% %% %% [RFC 3986, Chapter 3.2.1. User Information] %% %% The userinfo subcomponent may consist of a user name and, optionally, %% scheme-specific information about how to gain authorization to access %% the resource. The user information, if present, is followed by a %% commercial at-sign ("@") that delimits it from the host. %% %% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) %%------------------------------------------------------------------------- -spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}. parse_userinfo(?CHAR($@), URI) -> {?STRING_EMPTY, URI#{host => <<>>}}; parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_host_port(Rest, T), {Rest, URI1#{host => remove_brackets(Host)}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of true -> parse_userinfo(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_userinfo(?STRING_EMPTY, _URI) -> %% URI cannot end in userinfo state throw({error,invalid_uri,<<>>}). %% Check if char is allowed in userinfo -spec is_userinfo(char()) -> boolean(). is_userinfo($%) -> true; % pct-encoded is_userinfo($:) -> true; is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.2.2. Host] %% %% The host subcomponent of authority is identified by an IP literal %% encapsulated within square brackets, an IPv4 address in dotted- %% decimal form, or a registered name. %% %% host = IP-literal / IPv4address / reg-name %% %% IP-literal = "[" ( IPv6address / IPvFuture ) "]" %% %% IPvFuture = "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) %% %% IPv6address = 6( h16 ":" ) ls32 %% / "::" 5( h16 ":" ) ls32 %% / [ h16 ] "::" 4( h16 ":" ) ls32 %% / [ *1( h16 ":" ) h16 ] "::" 3( h16 ":" ) ls32 %% / [ *2( h16 ":" ) h16 ] "::" 2( h16 ":" ) ls32 %% / [ *3( h16 ":" ) h16 ] "::" h16 ":" ls32 %% / [ *4( h16 ":" ) h16 ] "::" ls32 %% / [ *5( h16 ":" ) h16 ] "::" h16 %% / [ *6( h16 ":" ) h16 ] "::" %% %% ls32 = ( h16 ":" h16 ) / IPv4address %% ; least-significant 32 bits of address %% %% h16 = 1*4HEXDIG %% ; 16 bits of address represented in hexadecimal %% %% IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet %% %% dec-octet = DIGIT ; 0-9 %% / %x31-39 DIGIT ; 10-99 %% / "1" 2DIGIT ; 100-199 %% / "2" %x30-34 DIGIT ; 200-249 %% / "25" %x30-35 ; 250-255 %% %% reg-name = *( unreserved / pct-encoded / sub-delims ) %%------------------------------------------------------------------------- -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), Port = get_port(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => ?STRING_REST($/, Path)}}; parse_host(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => Query}}; parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of true -> try parse_ipv4_bin(Rest, [Char], URI) catch throw:{_,_,_} -> parse_reg_name(?STRING_REST(Char, Rest), URI) end; false -> parse_reg_name(?STRING_REST(Char, Rest), URI) end; parse_host(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), Port = get_port(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => ?STRING_REST($/, Path)}}; parse_reg_name(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => Query}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of true -> parse_reg_name(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_reg_name(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. %% Check if char is allowed in reg-name -spec is_reg_name(char()) -> boolean(). is_reg_name($%) -> true; is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). -spec parse_ipv4_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}. parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), Port = get_port(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_segment(Rest, URI), % path-abempty Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => ?STRING_REST($/, Path)}}; parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => Query}}; parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of true -> parse_ipv4_bin(Rest, [Char|Acc], URI); false -> throw({error,invalid_uri,[Char]}) end; parse_ipv4_bin(?STRING_EMPTY, Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {?STRING_EMPTY, URI}. %% Check if char is allowed in IPv4 addresses -spec is_ipv4(char()) -> boolean(). is_ipv4($.) -> true; is_ipv4(Char) -> is_digit(Char). -spec validate_ipv4_address(list()) -> list(). validate_ipv4_address(Addr) -> case inet:parse_ipv4strict_address(Addr) of {ok, _} -> Addr; {error, _} -> throw({error,invalid_uri,Addr}) end. -spec parse_ipv6_bin(binary(), list(), uri_map()) -> {binary(), uri_map()}. parse_ipv6_bin(?STRING_REST($], Rest), Acc, URI) -> _ = validate_ipv6_address(lists:reverse(Acc)), parse_ipv6_bin_end(Rest, URI); parse_ipv6_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv6(Char) of true -> parse_ipv6_bin(Rest, [Char|Acc], URI); false -> throw({error,invalid_uri,[Char]}) end; parse_ipv6_bin(?STRING_EMPTY, _Acc, _URI) -> throw({error,invalid_uri,<<>>}). %% Check if char is allowed in IPv6 addresses -spec is_ipv6(char()) -> boolean(). is_ipv6($:) -> true; is_ipv6($.) -> true; is_ipv6(Char) -> is_hex_digit(Char). -spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}. parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), H = calculate_parsed_host_port(Rest, T), Port = get_port(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => ?STRING_REST($/, Path)}}; parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => Query}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of true -> parse_ipv6_bin_end(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_ipv6_bin_end(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. -spec validate_ipv6_address(list()) -> list(). validate_ipv6_address(Addr) -> case inet:parse_ipv6strict_address(Addr) of {ok, _} -> Addr; {error, _} -> throw({error,invalid_uri,Addr}) end. %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.2.2. Port] %% %% The port subcomponent of authority is designated by an optional port %% number in decimal following the host and delimited from it by a %% single colon (":") character. %% %% port = *DIGIT %%------------------------------------------------------------------------- -spec parse_port(binary(), uri_map()) -> {binary(), uri_map()}. parse_port(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-abempty Path = calculate_parsed_part(Rest, T), {Rest, URI1#{path => ?STRING_REST($/, Path)}}; parse_port(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{query => Query}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of true -> parse_port(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_port(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.4. Query] %% %% The query component contains non-hierarchical data that, along with %% data in the path component (Section 3.3), serves to identify a %% resource within the scope of the URI's scheme and naming authority %% (if any). The query component is indicated by the first question %% mark ("?") character and terminated by a number sign ("#") character %% or by the end of the URI. %% %% query = *( pchar / "/" / "?" ) %%------------------------------------------------------------------------- -spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}. parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), Fragment = calculate_parsed_query_fragment(Rest, T), {Rest, URI1#{fragment => Fragment}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of true -> parse_query(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_query(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. %% Check if char is allowed in query -spec is_query(char()) -> boolean(). is_query($/) -> true; is_query($?) -> true; is_query(Char) -> is_pchar(Char). %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 3.5. Fragment] %% %% The fragment identifier component of a URI allows indirect %% identification of a secondary resource by reference to a primary %% resource and additional identifying information. %% %% fragment = *( pchar / "/" / "?" ) %%------------------------------------------------------------------------- -spec parse_fragment(binary(), uri_map()) -> {binary(), uri_map()}. parse_fragment(?STRING_REST(Char, Rest), URI) -> case is_fragment(Char) of true -> parse_fragment(Rest, URI); false -> throw({error,invalid_uri,[Char]}) end; parse_fragment(?STRING_EMPTY, URI) -> {?STRING_EMPTY, URI}. %% Check if char is allowed in fragment -spec is_fragment(char()) -> boolean(). is_fragment($/) -> true; is_fragment($?) -> true; is_fragment(Char) -> is_pchar(Char). %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 2.2. Reserved Characters] %% %% reserved = gen-delims / sub-delims %% %% gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" %% %% sub-delims = "!" / "$" / "&" / "'" / "(" / ")" %% / "*" / "+" / "," / ";" / "=" %% %%------------------------------------------------------------------------- %% Return true if input char is reserved. -spec is_reserved(char()) -> boolean(). is_reserved($:) -> true; is_reserved($/) -> true; is_reserved($?) -> true; is_reserved($#) -> true; is_reserved($[) -> true; is_reserved($]) -> true; is_reserved($@) -> true; is_reserved($!) -> true; is_reserved($$) -> true; is_reserved($&) -> true; is_reserved($') -> true; is_reserved($() -> true; is_reserved($)) -> true; is_reserved($*) -> true; is_reserved($+) -> true; is_reserved($,) -> true; is_reserved($;) -> true; is_reserved($=) -> true; is_reserved(_) -> false. %% Check if char is sub-delim. -spec is_sub_delim(char()) -> boolean(). is_sub_delim($!) -> true; is_sub_delim($$) -> true; is_sub_delim($&) -> true; is_sub_delim($') -> true; is_sub_delim($() -> true; is_sub_delim($)) -> true; is_sub_delim($*) -> true; is_sub_delim($+) -> true; is_sub_delim($,) -> true; is_sub_delim($;) -> true; is_sub_delim($=) -> true; is_sub_delim(_) -> false. %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 2.3. Unreserved Characters] %% %% unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" %% %%------------------------------------------------------------------------- -spec is_unreserved(char()) -> boolean(). is_unreserved($-) -> true; is_unreserved($.) -> true; is_unreserved($_) -> true; is_unreserved($~) -> true; is_unreserved(Char) -> is_alpha(Char) orelse is_digit(Char). -spec is_alpha(char()) -> boolean(). is_alpha(C) when $A =< C, C =< $Z; $a =< C, C =< $z -> true; is_alpha(_) -> false. -spec is_digit(char()) -> boolean(). is_digit(C) when $0 =< C, C =< $9 -> true; is_digit(_) -> false. -spec is_hex_digit(char()) -> boolean(). is_hex_digit(C) when $0 =< C, C =< $9;$a =< C, C =< $f;$A =< C, C =< $F -> true; is_hex_digit(_) -> false. %% Remove enclosing brackets from binary -spec remove_brackets(binary()) -> binary(). remove_brackets(<<$[/utf8, Rest/binary>>) -> {H,T} = split_binary(Rest, byte_size(Rest) - 1), case T =:= <<$]/utf8>> of true -> H; false -> Rest end; remove_brackets(Addr) -> Addr. %%------------------------------------------------------------------------- %% Helper functions for calculating the parsed binary. %%------------------------------------------------------------------------- -spec calculate_parsed_scheme(binary(), binary()) -> binary(). calculate_parsed_scheme(Input, <<>>) -> strip_last_char(Input, [$:]); calculate_parsed_scheme(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). -spec calculate_parsed_part(binary(), binary()) -> binary(). calculate_parsed_part(Input, <<>>) -> strip_last_char(Input, [$?,$#]); calculate_parsed_part(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). -spec calculate_parsed_userinfo(binary(), binary()) -> binary(). calculate_parsed_userinfo(Input, <<>>) -> strip_last_char(Input, [$?,$#,$@]); calculate_parsed_userinfo(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). -spec calculate_parsed_host_port(binary(), binary()) -> binary(). calculate_parsed_host_port(Input, <<>>) -> strip_last_char(Input, [$:,$?,$#,$/]); calculate_parsed_host_port(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). calculate_parsed_query_fragment(Input, <<>>) -> strip_last_char(Input, [$#]); calculate_parsed_query_fragment(Input, Unparsed) -> get_parsed_binary(Input, Unparsed). get_port(<<>>) -> undefined; get_port(B) -> try binary_to_integer(B) catch error:badarg -> throw({error, invalid_uri, B}) end. %% Strip last char if it is in list %% %% This function is optimized for speed: parse/1 is about 10% faster than %% with an alternative implementation based on lists and sets. strip_last_char(<<>>, _) -> <<>>; strip_last_char(Input, [C0]) -> case binary:last(Input) of C0 -> init_binary(Input); _Else -> Input end; strip_last_char(Input, [C0,C1]) -> case binary:last(Input) of C0 -> init_binary(Input); C1 -> init_binary(Input); _Else -> Input end; strip_last_char(Input, [C0,C1,C2]) -> case binary:last(Input) of C0 -> init_binary(Input); C1 -> init_binary(Input); C2 -> init_binary(Input); _Else -> Input end; strip_last_char(Input, [C0,C1,C2,C3]) -> case binary:last(Input) of C0 -> init_binary(Input); C1 -> init_binary(Input); C2 -> init_binary(Input); C3 -> init_binary(Input); _Else -> Input end. %% Get parsed binary get_parsed_binary(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), First. %% Return all bytes of the binary except the last one. The binary must be non-empty. init_binary(B) -> {Init, _} = split_binary(B, byte_size(B) - 1), Init. %% Returns the size of a binary exluding the first element. %% Used in calls to split_binary(). -spec byte_size_exl_head(binary()) -> number(). byte_size_exl_head(<<>>) -> 0; byte_size_exl_head(Binary) -> byte_size(Binary) + 1. %%------------------------------------------------------------------------- %% [RFC 3986, Chapter 2.1. Percent-Encoding] %% %% A percent-encoding mechanism is used to represent a data octet in a %% component when that octet's corresponding character is outside the %% allowed set or is being used as a delimiter of, or within, the %% component. A percent-encoded octet is encoded as a character %% triplet, consisting of the percent character "%" followed by the two %% hexadecimal digits representing that octet's numeric value. For %% example, "%20" is the percent-encoding for the binary octet %% "00100000" (ABNF: %x20), which in US-ASCII corresponds to the space %% character (SP). Section 2.4 describes when percent-encoding and %% decoding is applied. %% %% pct-encoded = "%" HEXDIG HEXDIG %%------------------------------------------------------------------------- %%------------------------------------------------------------------------- %% Percent-encode %%------------------------------------------------------------------------- %% Only validates as scheme cannot have percent-encoded characters -spec encode_scheme(list()|binary()) -> list() | binary(). encode_scheme([]) -> throw({error,invalid_scheme,""}); encode_scheme(<<>>) -> throw({error,invalid_scheme,<<>>}); encode_scheme(Scheme) -> case validate_scheme(Scheme) of true -> Scheme; false -> throw({error,invalid_scheme,Scheme}) end. -spec encode_userinfo(list()|binary()) -> list() | binary(). encode_userinfo(Cs) -> encode(Cs, fun is_userinfo/1). -spec encode_host(list()|binary()) -> list() | binary(). encode_host(Cs) -> case classify_host(Cs) of regname -> Cs; ipv4 -> Cs; ipv6 -> bracket_ipv6(Cs); other -> encode(Cs, fun is_reg_name/1) end. -spec encode_path(list()|binary()) -> list() | binary(). encode_path(Cs) -> encode(Cs, fun is_path/1). -spec encode_query(list()|binary()) -> list() | binary(). encode_query(Cs) -> encode(Cs, fun is_query/1). -spec encode_fragment(list()|binary()) -> list() | binary(). encode_fragment(Cs) -> encode(Cs, fun is_fragment/1). %%------------------------------------------------------------------------- %% Helper funtions for percent-decode %%------------------------------------------------------------------------- -spec decode(list()|binary()) -> list() | binary(). decode(Cs) -> decode(Cs, <<>>). %% decode(L, Acc) when is_list(L) -> B0 = unicode:characters_to_binary(L), B1 = decode(B0, Acc), unicode:characters_to_list(B1); decode(<<$%,C0,C1,Cs/binary>>, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> B = ?HEX2DEC(C0)*16+?HEX2DEC(C1), case is_reserved(B) of true -> %% [2.2] Characters in the reserved set are protected from %% normalization. %% [2.1] For consistency, URI producers and normalizers should %% use uppercase hexadecimal digits for all percent- %% encodings. H0 = hex_to_upper(C0), H1 = hex_to_upper(C1), decode(Cs, <>); false -> decode(Cs, <>) end; false -> throw({error,invalid_percent_encoding,<<$%,C0,C1>>}) end; decode(<>, Acc) -> decode(Cs, <>); decode(<<>>, Acc) -> check_utf8(Acc). %% Returns Cs if it is utf8 encoded. check_utf8(Cs) -> case unicode:characters_to_list(Cs) of {incomplete,_,_} -> throw({error,invalid_utf8,Cs}); {error,_,_} -> throw({error,invalid_utf8,Cs}); _ -> Cs end. %% Convert hex digit to uppercase form hex_to_upper(H) when $a =< H, H =< $f -> H - 32; hex_to_upper(H) when $0 =< H, H =< $9;$A =< H, H =< $F-> H; hex_to_upper(H) -> throw({error,invalid_input, H}). %% Check if char is allowed in host -spec is_host(char()) -> boolean(). is_host($:) -> true; is_host(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). %% Check if char is allowed in path -spec is_path(char()) -> boolean(). is_path($/) -> true; is_path(Char) -> is_pchar(Char). %%------------------------------------------------------------------------- %% Helper functions for percent-encode %%------------------------------------------------------------------------- -spec encode(list()|binary(), fun()) -> list() | binary(). encode(Component, Fun) when is_list(Component) -> B = unicode:characters_to_binary(Component), unicode:characters_to_list(encode(B, Fun, <<>>)); encode(Component, Fun) when is_binary(Component) -> encode(Component, Fun, <<>>). %% encode(<>, Fun, Acc) -> C = encode_codepoint_binary(Char, Fun), encode(Rest, Fun, <>); encode(<>, _Fun, _Acc) -> throw({error,invalid_input,<>}); encode(<<>>, _Fun, Acc) -> Acc. -spec encode_codepoint_binary(integer(), fun()) -> binary(). encode_codepoint_binary(C, Fun) -> case Fun(C) of false -> percent_encode_binary(C); true -> <> end. -spec percent_encode_binary(integer()) -> binary(). percent_encode_binary(Code) -> percent_encode_binary(<>, <<>>). percent_encode_binary(<>, Acc) -> percent_encode_binary(Rest, <>); percent_encode_binary(<<>>, Acc) -> Acc. %%------------------------------------------------------------------------- %%------------------------------------------------------------------------- validate_scheme([]) -> true; validate_scheme([H|T]) -> case is_scheme(H) of true -> validate_scheme(T); false -> false end; validate_scheme(<<>>) -> true; validate_scheme(<>) -> case is_scheme(H) of true -> validate_scheme(Rest); false -> false end. %%------------------------------------------------------------------------- %% Classifies hostname into the following categories: %% regname, ipv4 - address does not contain reserved characters to be %% percent-encoded %% ipv6 - address does not contain reserved characters but it shall be %% encolsed in brackets %% other - address shall be percent-encoded %%------------------------------------------------------------------------- classify_host([]) -> other; classify_host(Addr) when is_binary(Addr) -> A = unicode:characters_to_list(Addr), classify_host_ipv6(A); classify_host(Addr) -> classify_host_ipv6(Addr). classify_host_ipv6(Addr) -> case is_ipv6_address(Addr) of true -> ipv6; false -> classify_host_ipv4(Addr) end. classify_host_ipv4(Addr) -> case is_ipv4_address(Addr) of true -> ipv4; false -> classify_host_regname(Addr) end. classify_host_regname([]) -> regname; classify_host_regname([H|T]) -> case is_reg_name(H) of true -> classify_host_regname(T); false -> other end. is_ipv4_address(Addr) -> case inet:parse_ipv4strict_address(Addr) of {ok, _} -> true; {error, _} -> false end. is_ipv6_address(Addr) -> case inet:parse_ipv6strict_address(Addr) of {ok, _} -> true; {error, _} -> false end. bracket_ipv6(Addr) when is_binary(Addr) -> concat(<<$[,Addr/binary>>,<<$]>>); bracket_ipv6(Addr) when is_list(Addr) -> [$[|Addr] ++ "]". %%------------------------------------------------------------------------- %% Helper funtions for recompose %%------------------------------------------------------------------------- %%------------------------------------------------------------------------- %% Checks if input Map has valid combination of fields that can be %% recomposed into a URI. %% %% The implementation is based on a decision tree that fulfills the %% following rules: %% - 'path' shall always be present in the input map %% URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] %% hier-part = "//" authority path-abempty %% / path-absolute %% / path-rootless %% / path-empty %% - 'host' shall be present in the input map when 'path' starts with %% two slashes ("//") %% path = path-abempty ; begins with "/" or is empty %% / path-absolute ; begins with "/" but not "//" %% / path-noscheme ; begins with a non-colon segment %% / path-rootless ; begins with a segment %% / path-empty ; zero characters %% path-abempty = *( "/" segment ) %% segment = *pchar %% - 'host' shall be present if userinfo or port is present in input map %% authority = [ userinfo "@" ] host [ ":" port ] %% - All fields shall be valid (scheme, userinfo, host, port, path, query %% or fragment). %%------------------------------------------------------------------------- is_valid_map(#{path := Path} = Map) -> ((starts_with_two_slash(Path) andalso is_valid_map_host(Map)) orelse (maps:is_key(userinfo, Map) andalso is_valid_map_host(Map)) orelse (maps:is_key(port, Map) andalso is_valid_map_host(Map)) orelse all_fields_valid(Map)); is_valid_map(#{}) -> false. is_valid_map_host(Map) -> maps:is_key(host, Map) andalso all_fields_valid(Map). all_fields_valid(Map) -> Fun = fun(scheme, _, Acc) -> Acc; (userinfo, _, Acc) -> Acc; (host, _, Acc) -> Acc; (port, _, Acc) -> Acc; (path, _, Acc) -> Acc; (query, _, Acc) -> Acc; (fragment, _, Acc) -> Acc; (_, _, _) -> false end, maps:fold(Fun, true, Map). starts_with_two_slash([$/,$/|_]) -> true; starts_with_two_slash(?STRING_REST("//", _)) -> true; starts_with_two_slash(_) -> false. update_scheme(#{scheme := Scheme}, _) -> add_colon_postfix(encode_scheme(Scheme)); update_scheme(#{}, _) -> empty. update_userinfo(#{userinfo := Userinfo}, empty) -> add_auth_prefix(encode_userinfo(Userinfo)); update_userinfo(#{userinfo := Userinfo}, URI) -> concat(URI,add_auth_prefix(encode_userinfo(Userinfo))); update_userinfo(#{}, empty) -> empty; update_userinfo(#{}, URI) -> URI. update_host(#{host := Host}, empty) -> add_auth_prefix(encode_host(Host)); update_host(#{host := Host} = Map, URI) -> concat(URI,add_host_prefix(Map, encode_host(Host))); update_host(#{}, empty) -> empty; update_host(#{}, URI) -> URI. %% URI cannot be empty for ports. E.g. ":8080" is not a valid URI update_port(#{port := undefined}, URI) -> concat(URI, <<":">>); update_port(#{port := Port}, URI) -> concat(URI,add_colon(encode_port(Port))); update_port(#{}, URI) -> URI. update_path(#{path := Path}, empty) -> encode_path(Path); update_path(#{path := Path}, URI) -> concat(URI,encode_path(Path)); update_path(#{}, empty) -> empty; update_path(#{}, URI) -> URI. update_query(#{query := Query}, empty) -> encode_query(Query); update_query(#{query := Query}, URI) -> concat(URI,add_question_mark(encode_query(Query))); update_query(#{}, empty) -> empty; update_query(#{}, URI) -> URI. update_fragment(#{fragment := Fragment}, empty) -> add_hashmark(encode_fragment(Fragment)); update_fragment(#{fragment := Fragment}, URI) -> concat(URI,add_hashmark(encode_fragment(Fragment))); update_fragment(#{}, empty) -> ""; update_fragment(#{}, URI) -> URI. %%------------------------------------------------------------------------- %% Concatenates its arguments that can be lists and binaries. %% The result is a list if at least one of its argument is a list and %% binary otherwise. %%------------------------------------------------------------------------- concat(A, B) when is_binary(A), is_binary(B) -> <>; concat(A, B) when is_binary(A), is_list(B) -> unicode:characters_to_list(A) ++ B; concat(A, B) when is_list(A) -> A ++ maybe_to_list(B). add_hashmark(Comp) when is_binary(Comp) -> <<$#, Comp/binary>>; add_hashmark(Comp) when is_list(Comp) -> [$#|Comp]. add_question_mark(Comp) when is_binary(Comp) -> <<$?, Comp/binary>>; add_question_mark(Comp) when is_list(Comp) -> [$?|Comp]. add_colon(Comp) when is_binary(Comp) -> <<$:, Comp/binary>>. add_colon_postfix(Comp) when is_binary(Comp) -> <>; add_colon_postfix(Comp) when is_list(Comp) -> Comp ++ ":". add_auth_prefix(Comp) when is_binary(Comp) -> <<"//", Comp/binary>>; add_auth_prefix(Comp) when is_list(Comp) -> [$/,$/|Comp]. add_host_prefix(#{userinfo := _}, Host) when is_binary(Host) -> <<$@,Host/binary>>; add_host_prefix(#{}, Host) when is_binary(Host) -> <<"//",Host/binary>>; add_host_prefix(#{userinfo := _}, Host) when is_list(Host) -> [$@|Host]; add_host_prefix(#{}, Host) when is_list(Host) -> [$/,$/|Host]. maybe_to_list(Comp) when is_binary(Comp) -> unicode:characters_to_list(Comp); maybe_to_list(Comp) -> Comp. encode_port(Port) -> integer_to_binary(Port). %%------------------------------------------------------------------------- %% Helper functions for transcode %%------------------------------------------------------------------------- %%------------------------------------------------------------------------- %% uri_string:transcode(<<"x%00%00%00%F6"/utf32>>). %% 1. Convert (transcode/2) input to list form (list of unicode codepoints) %% "x%00%00%00%F6" %% 2. Accumulate characters until percent-encoded segment (transcode/4). %% Acc = "x" %% 3. Convert percent-encoded triplets to binary form (transcode_pct/4) %% <<0,0,0,246>> %% 4. Transcode in-encoded binary to out-encoding (utf32 -> utf8): %% <<195,182>> %% 5. Percent-encode out-encoded binary: %% <<"%C3%B6"/utf8>> = <<37,67,51,37,66,54>> %% 6. Convert binary to list form, reverse it and append the accumulator %% "6B%3C%" + "x" %% 7. Reverse Acc and return it %%------------------------------------------------------------------------- transcode([$%,_C0,_C1|_Rest] = L, Acc, InEnc, OutEnc) -> transcode_pct(L, Acc, <<>>, InEnc, OutEnc); transcode([_C|_Rest] = L, Acc, InEnc, OutEnc) -> transcode(L, Acc, [], InEnc, OutEnc). %% transcode([$%,_C0,_C1|_Rest] = L, Acc, List, InEncoding, OutEncoding) -> transcode_pct(L, List ++ Acc, <<>>, InEncoding, OutEncoding); transcode([C|Rest], Acc, List, InEncoding, OutEncoding) -> transcode(Rest, Acc, [C|List], InEncoding, OutEncoding); transcode([], Acc, List, _InEncoding, _OutEncoding) -> lists:reverse(List ++ Acc). %% Transcode percent-encoded segment transcode_pct([$%,C0,C1|Rest] = L, Acc, B, InEncoding, OutEncoding) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> Int = ?HEX2DEC(C0)*16+?HEX2DEC(C1), transcode_pct(Rest, Acc, <>, InEncoding, OutEncoding); false -> throw({error, invalid_percent_encoding,L}) end; transcode_pct([_C|_Rest] = L, Acc, B, InEncoding, OutEncoding) -> OutBinary = convert_to_binary(B, InEncoding, OutEncoding), PctEncUtf8 = percent_encode_segment(OutBinary), Out = lists:reverse(convert_to_list(PctEncUtf8, utf8)), transcode(L, Out ++ Acc, [], InEncoding, OutEncoding); transcode_pct([], Acc, B, InEncoding, OutEncoding) -> OutBinary = convert_to_binary(B, InEncoding, OutEncoding), PctEncUtf8 = percent_encode_segment(OutBinary), Out = convert_to_list(PctEncUtf8, utf8), lists:reverse(Acc) ++ Out. %% Convert to binary convert_to_binary(Binary, InEncoding, OutEncoding) -> case unicode:characters_to_binary(Binary, InEncoding, OutEncoding) of {error, _List, RestData} -> throw({error, invalid_input, RestData}); {incomplete, _List, RestData} -> throw({error, invalid_input, RestData}); Result -> Result end. %% Convert to list convert_to_list(Binary, InEncoding) -> case unicode:characters_to_list(Binary, InEncoding) of {error, _List, RestData} -> throw({error, invalid_input, RestData}); {incomplete, _List, RestData} -> throw({error, invalid_input, RestData}); Result -> Result end. %% Flatten input list flatten_list([], _) -> []; flatten_list(L, InEnc) -> flatten_list(L, InEnc, []). %% flatten_list([H|T], InEnc, Acc) when is_binary(H) -> L = convert_to_list(H, InEnc), flatten_list(T, InEnc, lists:reverse(L) ++ Acc); flatten_list([H|T], InEnc, Acc) when is_list(H) -> flatten_list(H ++ T, InEnc, Acc); flatten_list([H|T], InEnc, Acc) -> flatten_list(T, InEnc, [H|Acc]); flatten_list([], _InEnc, Acc) -> lists:reverse(Acc); flatten_list(Arg, _, _) -> throw({error, invalid_input, Arg}). percent_encode_segment(Segment) -> percent_encode_binary(Segment, <<>>). %%------------------------------------------------------------------------- %% Helper functions for compose_query %%------------------------------------------------------------------------- %% Returns separator to be used between key-value pairs get_separator(L) when length(L) =:= 0 -> <<>>; get_separator(_L) -> <<"&">>. %% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8 %% HTML 5.0 - 4.10.22.6 URL-encoded form data - encoding (non UTF-8) form_urlencode(Cs, [{encoding, latin1}]) when is_list(Cs) -> B = convert_to_binary(Cs, utf8, utf8), html5_byte_encode(base10_encode(B)); form_urlencode(Cs, [{encoding, latin1}]) when is_binary(Cs) -> html5_byte_encode(base10_encode(Cs)); form_urlencode(Cs, [{encoding, Encoding}]) when is_list(Cs), Encoding =:= utf8; Encoding =:= unicode -> B = convert_to_binary(Cs, utf8, Encoding), html5_byte_encode(B); form_urlencode(Cs, [{encoding, Encoding}]) when is_binary(Cs), Encoding =:= utf8; Encoding =:= unicode -> html5_byte_encode(Cs); form_urlencode(Cs, [{encoding, Encoding}]) when is_list(Cs); is_binary(Cs) -> throw({error,invalid_encoding, Encoding}); form_urlencode(Cs, _) -> throw({error,invalid_input, Cs}). %% For each character in the entry's name and value that cannot be expressed using %% the selected character encoding, replace the character by a string consisting of %% a U+0026 AMPERSAND character (&), a "#" (U+0023) character, one or more ASCII %% digits representing the Unicode code point of the character in base ten, and %% finally a ";" (U+003B) character. base10_encode(Cs) -> base10_encode(Cs, <<>>). %% base10_encode(<<>>, Acc) -> Acc; base10_encode(<>, Acc) when H > 255 -> Base10 = convert_to_binary(integer_to_list(H,10), utf8, utf8), base10_encode(T, <>); base10_encode(<>, Acc) -> base10_encode(T, <>). html5_byte_encode(B) -> html5_byte_encode(B, <<>>). %% html5_byte_encode(<<>>, Acc) -> Acc; html5_byte_encode(<<$ ,T/binary>>, Acc) -> html5_byte_encode(T, <>); html5_byte_encode(<>, Acc) -> case is_url_char(H) of true -> html5_byte_encode(T, <>); false -> <> = <>, html5_byte_encode(T, <>) end; html5_byte_encode(H, _Acc) -> throw({error,invalid_input, H}). %% Return true if input char can appear in form-urlencoded string %% Allowed chararacters: %% 0x2A, 0x2D, 0x2E, 0x30 to 0x39, 0x41 to 0x5A, %% 0x5F, 0x61 to 0x7A is_url_char(C) when C =:= 16#2A; C =:= 16#2D; C =:= 16#2E; C =:= 16#5F; 16#30 =< C, C =< 16#39; 16#41 =< C, C =< 16#5A; 16#61 =< C, C =< 16#7A -> true; is_url_char(_) -> false. %%------------------------------------------------------------------------- %% Helper functions for dissect_query %%------------------------------------------------------------------------- dissect_query_key(<<$=,T/binary>>, IsList, Acc, Key, Value) -> dissect_query_value(T, IsList, Acc, Key, Value); dissect_query_key(<<"&#",T/binary>>, IsList, Acc, Key, Value) -> dissect_query_key(T, IsList, Acc, <>, Value); dissect_query_key(<<$&,_T/binary>>, _IsList, _Acc, _Key, _Value) -> throw({error, missing_value, "&"}); dissect_query_key(<>, IsList, Acc, Key, Value) -> dissect_query_key(T, IsList, Acc, <>, Value); dissect_query_key(B, _, _, _, _) -> throw({error, missing_value, B}). dissect_query_value(<<$&,T/binary>>, IsList, Acc, Key, Value) -> K = form_urldecode(IsList, Key), V = form_urldecode(IsList, Value), dissect_query_key(T, IsList, [{K,V}|Acc], <<>>, <<>>); dissect_query_value(<>, IsList, Acc, Key, Value) -> dissect_query_value(T, IsList, Acc, Key, <>); dissect_query_value(<<>>, IsList, Acc, Key, Value) -> K = form_urldecode(IsList, Key), V = form_urldecode(IsList, Value), lists:reverse([{K,V}|Acc]). %% HTML 5.2 - 4.10.21.6 URL-encoded form data - WHATWG URL (10 Jan 2018) - UTF-8 %% HTML 5.0 - 4.10.22.6 URL-encoded form data - decoding (non UTF-8) form_urldecode(true, B) -> Result = base10_decode(form_urldecode(B, <<>>)), convert_to_list(Result, utf8); form_urldecode(false, B) -> base10_decode(form_urldecode(B, <<>>)); form_urldecode(<<>>, Acc) -> Acc; form_urldecode(<<$+,T/binary>>, Acc) -> form_urldecode(T, <>); form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> case is_hex_digit(C0) andalso is_hex_digit(C1) of true -> V = ?HEX2DEC(C0)*16+?HEX2DEC(C1), form_urldecode(T, <>); false -> L = convert_to_list(<<$%,C0,C1,T/binary>>, utf8), throw({error, invalid_percent_encoding, L}) end; form_urldecode(<>, Acc) -> form_urldecode(T, <>); form_urldecode(<>, _Acc) -> throw({error, invalid_character, [H]}). base10_decode(Cs) -> base10_decode(Cs, <<>>). % base10_decode(<<>>, Acc) -> Acc; base10_decode(<<"&#",T/binary>>, Acc) -> base10_decode_unicode(T, Acc); base10_decode(<>, Acc) -> base10_decode(T,<>); base10_decode(<>, _) -> throw({error, invalid_input, [H]}). base10_decode_unicode(B, Acc) -> base10_decode_unicode(B, 0, Acc). %% base10_decode_unicode(<>, Codepoint, Acc) when $0 =< H, H =< $9 -> Res = Codepoint * 10 + (H - $0), base10_decode_unicode(T, Res, Acc); base10_decode_unicode(<<$;,T/binary>>, Codepoint, Acc) -> base10_decode(T, <>); base10_decode_unicode(<>, _, _) -> throw({error, invalid_input, [H]}). %%------------------------------------------------------------------------- %% Helper functions for normalize %%------------------------------------------------------------------------- normalize_map(URIMap) -> normalize_path_segment( normalize_scheme_based( normalize_percent_encoding( normalize_case(URIMap)))). %% 6.2.2.1. Case Normalization normalize_case(#{scheme := Scheme, host := Host} = Map) -> Map#{scheme => to_lower(Scheme), host => to_lower(Host)}; normalize_case(#{host := Host} = Map) -> Map#{host => to_lower(Host)}; normalize_case(#{scheme := Scheme} = Map) -> Map#{scheme => to_lower(Scheme)}; normalize_case(#{} = Map) -> Map. %% 6.2.2.2. Percent-Encoding Normalization normalize_percent_encoding(Map) -> Fun = fun (K,V) when K =:= userinfo; K =:= host; K =:= path; K =:= query; K =:= fragment -> decode(V); %% Handle port and scheme (_,V) -> V end, maps:map(Fun, Map). to_lower(Cs) when is_list(Cs) -> B = convert_to_binary(Cs, utf8, utf8), convert_to_list(to_lower(B), utf8); to_lower(Cs) when is_binary(Cs) -> to_lower(Cs, <<>>). %% to_lower(<>, Acc) when $A =< C, C =< $Z -> to_lower(Cs, <>); to_lower(<>, Acc) -> to_lower(Cs, <>); to_lower(<<>>, Acc) -> Acc. %% 6.2.2.3. Path Segment Normalization %% 5.2.4. Remove Dot Segments normalize_path_segment(Map) -> Path = maps:get(path, Map, undefined), Map#{path => remove_dot_segments(Path)}. remove_dot_segments(Path) when is_binary(Path) -> remove_dot_segments(Path, <<>>); remove_dot_segments(Path) when is_list(Path) -> B = convert_to_binary(Path, utf8, utf8), B1 = remove_dot_segments(B, <<>>), convert_to_list(B1, utf8). %% remove_dot_segments(<<>>, Output) -> Output; remove_dot_segments(<<"../",T/binary>>, Output) -> remove_dot_segments(T, Output); remove_dot_segments(<<"./",T/binary>>, Output) -> remove_dot_segments(T, Output); remove_dot_segments(<<"/./",T/binary>>, Output) -> remove_dot_segments(<<$/,T/binary>>, Output); remove_dot_segments(<<"/.">>, Output) -> remove_dot_segments(<<$/>>, Output); remove_dot_segments(<<"/../",T/binary>>, Output) -> Out1 = remove_last_segment(Output), remove_dot_segments(<<$/,T/binary>>, Out1); remove_dot_segments(<<"/..">>, Output) -> Out1 = remove_last_segment(Output), remove_dot_segments(<<$/>>, Out1); remove_dot_segments(<<$.>>, Output) -> remove_dot_segments(<<>>, Output); remove_dot_segments(<<"..">>, Output) -> remove_dot_segments(<<>>, Output); remove_dot_segments(Input, Output) -> {First, Rest} = first_path_segment(Input), remove_dot_segments(Rest, <>). first_path_segment(Input) -> F = first_path_segment(Input, <<>>), split_binary(Input, byte_size(F)). %% first_path_segment(<<$/,T/binary>>, Acc) -> first_path_segment_end(<>, <>); first_path_segment(<>, Acc) -> first_path_segment_end(<>, <>). first_path_segment_end(<<>>, Acc) -> Acc; first_path_segment_end(<<$/,_/binary>>, Acc) -> Acc; first_path_segment_end(<>, Acc) -> first_path_segment_end(<>, <>). remove_last_segment(<<>>) -> <<>>; remove_last_segment(B) -> {Init, Last} = split_binary(B, byte_size(B) - 1), case Last of <<$/>> -> Init; _Char -> remove_last_segment(Init) end. %% RFC 3986, 6.2.3. Scheme-Based Normalization normalize_scheme_based(Map) -> Scheme = maps:get(scheme, Map, undefined), Port = maps:get(port, Map, undefined), Path= maps:get(path, Map, undefined), normalize_scheme_based(Map, Scheme, Port, Path). %% normalize_scheme_based(Map, Scheme, Port, Path) when Scheme =:= "http"; Scheme =:= <<"http">> -> normalize_http(Map, Port, Path); normalize_scheme_based(Map, Scheme, Port, Path) when Scheme =:= "https"; Scheme =:= <<"https">> -> normalize_https(Map, Port, Path); normalize_scheme_based(Map, Scheme, Port, _Path) when Scheme =:= "ftp"; Scheme =:= <<"ftp">> -> normalize_ftp(Map, Port); normalize_scheme_based(Map, Scheme, Port, _Path) when Scheme =:= "ssh"; Scheme =:= <<"ssh">> -> normalize_ssh_sftp(Map, Port); normalize_scheme_based(Map, Scheme, Port, _Path) when Scheme =:= "sftp"; Scheme =:= <<"sftp">> -> normalize_ssh_sftp(Map, Port); normalize_scheme_based(Map, Scheme, Port, _Path) when Scheme =:= "tftp"; Scheme =:= <<"tftp">> -> normalize_tftp(Map, Port); normalize_scheme_based(Map, _, _, _) -> Map. normalize_http(Map, Port, Path) -> M1 = normalize_port(Map, Port, 80), normalize_http_path(M1, Path). normalize_https(Map, Port, Path) -> M1 = normalize_port(Map, Port, 443), normalize_http_path(M1, Path). normalize_ftp(Map, Port) -> normalize_port(Map, Port, 21). normalize_ssh_sftp(Map, Port) -> normalize_port(Map, Port, 22). normalize_tftp(Map, Port) -> normalize_port(Map, Port, 69). normalize_port(Map, Port, Default) -> case Port of Default -> maps:remove(port, Map); _Else -> Map end. normalize_http_path(Map, Path) -> case Path of "" -> Map#{path => "/"}; <<>> -> Map#{path => <<"/">>}; _Else -> Map end.