diff options
Diffstat (limited to 'lib/stdlib/src/uri_string.erl')
-rwxr-xr-x | lib/stdlib/src/uri_string.erl | 224 |
1 files changed, 176 insertions, 48 deletions
diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index bb7079c193..893ba4c6bf 100755 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -304,8 +304,6 @@ parse(URIString) when is_list(URIString) -> -spec recompose(URIMap) -> URIString when URIMap :: uri_map(), URIString :: uri_string(). -recompose(Map) when map_size(Map) =:= 0 -> - ""; recompose(Map) -> case is_valid_map(Map) of false -> @@ -405,7 +403,7 @@ convert_mapfields_to_list(Map) -> %% URI-reference = URI / relative-ref %%------------------------------------------------------------------------- -spec parse_uri_reference(binary(), uri_map()) -> uri_map(). -parse_uri_reference(<<>>, _) -> #{}; +parse_uri_reference(<<>>, _) -> #{path => <<>>}; parse_uri_reference(URIString, URI) -> try parse_scheme_start(URIString, URI) of Res -> Res @@ -434,13 +432,15 @@ parse_relative_part(?STRING_REST("//", Rest), URI) -> %% Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - Userinfo = calculate_parsed_part(Rest, T), - URI1#{userinfo => decode_userinfo(Userinfo)} + Userinfo = calculate_parsed_userinfo(Rest, T), + URI2 = maybe_add_path(URI1), + URI2#{userinfo => decode_userinfo(Userinfo)} catch throw:uri_parse_error -> {T, URI1} = parse_host(Rest, URI), Host = calculate_parsed_part_sl(Rest, T), - URI1#{host => decode_host(remove_brackets(Host))} + URI2 = maybe_add_path(URI1), + URI2#{host => decode_host(remove_brackets(Host))} end; parse_relative_part(?STRING_REST($/, Rest), URI) -> {T, URI1} = parse_segment(Rest, URI), % path-absolute @@ -449,11 +449,13 @@ parse_relative_part(?STRING_REST($/, Rest), URI) -> parse_relative_part(?STRING_REST($?, Rest), URI) -> {T, URI1} = parse_query(Rest, URI), % path-empty ?query Query = calculate_parsed_part(Rest, T), - URI1#{query => decode_query(?STRING_REST($?, Query))}; + URI2 = maybe_add_path(URI1), + URI2#{query => decode_query(?STRING_REST($?, Query))}; parse_relative_part(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), - URI1#{fragment => decode_fragment(Fragment)}; + Fragment = calculate_parsed_fragment(Rest, T), + URI2 = maybe_add_path(URI1), + URI2#{fragment => decode_fragment(Fragment)}; parse_relative_part(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of true -> @@ -505,7 +507,7 @@ parse_segment(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment(?STRING_REST(Char, Rest), URI) -> case is_pchar(Char) of @@ -528,7 +530,7 @@ parse_segment_nz_nc(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_segment_nz_nc(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_segment_nz_nc(?STRING_REST(Char, Rest), URI) -> case is_segment_nz_nc(Char) of @@ -566,10 +568,32 @@ parse_scheme_start(?STRING_REST(Char, Rest), URI) -> case is_alpha(Char) of true -> {T, URI1} = parse_scheme(Rest, URI), Scheme = calculate_parsed_scheme(Rest, T), - URI1#{scheme => ?STRING_REST(Char, Scheme)}; + URI2 = maybe_add_path(URI1), + URI2#{scheme => ?STRING_REST(Char, Scheme)}; false -> throw(uri_parse_error) end. +%% Add path component if it missing after parsing the URI. +%% According to the URI specification there is always a +%% path component in every URI-reference and it can be +%% empty. + +%% maybe_add_path(Map) -> +%% case length(maps:keys(Map)) of +%% 0 -> +%% Map#{path => <<>>}; +%% _Else -> +%% Map +%% end. +maybe_add_path(Map) -> + case maps:is_key(path, Map) of + false -> + Map#{path => <<>>}; + _Else -> + Map + end. + + -spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}. parse_scheme(?STRING_REST($:, Rest), URI) -> @@ -603,7 +627,7 @@ parse_hier(?STRING_REST("//", Rest), URI) -> % Parse userinfo - "//" is NOT part of authority try parse_userinfo(Rest, URI) of {T, URI1} -> - Userinfo = calculate_parsed_part(Rest, T), + Userinfo = calculate_parsed_userinfo(Rest, T), {Rest, URI1#{userinfo => decode_userinfo(Userinfo)}} catch throw:uri_parse_error -> @@ -621,7 +645,7 @@ parse_hier(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_hier(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_hier(?STRING_REST(Char, Rest), URI) -> % path-rootless case is_pchar(Char) of @@ -660,12 +684,11 @@ parse_hier(?STRING_EMPTY, URI) -> %% userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) %%------------------------------------------------------------------------- -spec parse_userinfo(binary(), uri_map()) -> {binary(), uri_map()}. -parse_userinfo(?CHAR($@), _URI) -> - %% URI cannot end in userinfo state - throw(uri_parse_error); +parse_userinfo(?CHAR($@), URI) -> + {?STRING_EMPTY, URI#{host => <<>>}}; parse_userinfo(?STRING_REST($@, Rest), URI) -> {T, URI1} = parse_host(Rest, URI), - Host = calculate_parsed_part(Rest, T), + Host = calculate_parsed_host(Rest, T), {Rest, URI1#{host => decode_host(remove_brackets(Host))}}; parse_userinfo(?STRING_REST(Char, Rest), URI) -> case is_userinfo(Char) of @@ -726,7 +749,7 @@ is_userinfo(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). -spec parse_host(binary(), uri_map()) -> {binary(), uri_map()}. parse_host(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_host(?STRING_REST($/, Rest), URI) -> @@ -741,7 +764,7 @@ parse_host(?STRING_REST($[, Rest), URI) -> parse_ipv6_bin(Rest, [], URI); parse_host(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_host(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -755,7 +778,7 @@ parse_host(?STRING_EMPTY, URI) -> -spec parse_reg_name(binary(), uri_map()) -> {binary(), uri_map()}. parse_reg_name(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_reg_name(?STRING_REST($/, Rest), URI) -> @@ -768,7 +791,7 @@ parse_reg_name(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_reg_name(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_reg_name(?STRING_REST(Char, Rest), URI) -> case is_reg_name(Char) of @@ -788,7 +811,7 @@ is_reg_name(Char) -> is_unreserved(Char) orelse is_sub_delim(Char). parse_ipv4_bin(?STRING_REST($:, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv4_bin(?STRING_REST($/, Rest), Acc, URI) -> @@ -804,7 +827,7 @@ parse_ipv4_bin(?STRING_REST($?, Rest), Acc, URI) -> parse_ipv4_bin(?STRING_REST($#, Rest), Acc, URI) -> _ = validate_ipv4_address(lists:reverse(Acc)), {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv4_bin(?STRING_REST(Char, Rest), Acc, URI) -> case is_ipv4(Char) of @@ -851,7 +874,7 @@ is_ipv6(Char) -> is_hex_digit(Char). -spec parse_ipv6_bin_end(binary(), uri_map()) -> {binary(), uri_map()}. parse_ipv6_bin_end(?STRING_REST($:, Rest), URI) -> {T, URI1} = parse_port(Rest, URI), - H = calculate_parsed_part(Rest, T), + H = calculate_parsed_port(Rest, T), Port = binary_to_integer(H), {Rest, URI1#{port => Port}}; parse_ipv6_bin_end(?STRING_REST($/, Rest), URI) -> @@ -864,7 +887,7 @@ parse_ipv6_bin_end(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_ipv6_bin_end(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_ipv6_bin_end(?STRING_REST(Char, Rest), URI) -> case is_ipv6(Char) of @@ -902,7 +925,7 @@ parse_port(?STRING_REST($?, Rest), URI) -> {Rest, URI1#{query => decode_query(?STRING_REST($?, Query))}}; parse_port(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), % path-empty - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_port(?STRING_REST(Char, Rest), URI) -> case is_digit(Char) of @@ -928,7 +951,7 @@ parse_port(?STRING_EMPTY, URI) -> -spec parse_query(binary(), uri_map()) -> {binary(), uri_map()}. parse_query(?STRING_REST($#, Rest), URI) -> {T, URI1} = parse_fragment(Rest, URI), - Fragment = calculate_parsed_part(Rest, T), + Fragment = calculate_parsed_fragment(Rest, T), {Rest, URI1#{fragment => decode_fragment(Fragment)}}; parse_query(?STRING_REST(Char, Rest), URI) -> case is_query(Char) of @@ -1055,11 +1078,88 @@ remove_brackets(Addr) -> Addr. -spec calculate_parsed_part(binary(), binary()) -> binary(). calculate_parsed_part(<<$?>>, _) -> <<>>; calculate_parsed_part(<<$#>>, _) -> <<>>; +calculate_parsed_part(<<>>, _) -> <<>>; +calculate_parsed_part(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + _Else -> + Input + end; calculate_parsed_part(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), First. +-spec calculate_parsed_userinfo(binary(), binary()) -> binary(). +calculate_parsed_userinfo(<<$?>>, _) -> <<>>; +calculate_parsed_userinfo(<<$#>>, _) -> <<>>; +calculate_parsed_userinfo(<<>>, _) -> <<>>; +calculate_parsed_userinfo(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + $@ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_userinfo(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_host(binary(), binary()) -> binary(). +calculate_parsed_host(<<$?>>, _) -> <<>>; +calculate_parsed_host(<<$#>>, _) -> <<>>; +calculate_parsed_host(<<>>, _) -> <<>>; +calculate_parsed_host(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + $/ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_host(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_port(binary(), binary()) -> binary(). +calculate_parsed_port(<<$?>>, _) -> <<>>; +calculate_parsed_port(<<$#>>, _) -> <<>>; +calculate_parsed_port(<<>>, _) -> <<>>; +calculate_parsed_port(Input, <<>>) -> + case binary:last(Input) of + $? -> + init_binary(Input); + $# -> + init_binary(Input); + $/ -> + init_binary(Input); + _Else -> + Input + end; +calculate_parsed_port(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + +-spec calculate_parsed_fragment(binary(), binary()) -> binary(). +calculate_parsed_fragment(<<$#>>, _) -> <<>>; +calculate_parsed_fragment(Input, Unparsed) -> + {First, _} = split_binary(Input, byte_size(Input) - byte_size_exl_head(Unparsed)), + First. + + %% Returns the parsed binary based on Input and the Unparsed part. %% Used when parsing authority. %% @@ -1079,28 +1179,25 @@ calculate_parsed_part_sl(<<>>, _) -> <<>>; calculate_parsed_part_sl(Input, <<>>) -> case binary:last(Input) of $? -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; - + init_binary(Input); $# -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; + init_binary(Input); $/ -> - {First, _} = - split_binary(Input, byte_size(Input) - 1), - First; + init_binary(Input); _Else -> - {First, _} = - split_binary(Input, byte_size_exl_single_slash(Input)), - First + Input end; calculate_parsed_part_sl(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size_exl_single_slash(Input) - byte_size_exl_head(Unparsed)), First. +%% Return all bytes of the binary except the last one. The binary must be non-empty. +init_binary(B) -> + {Init, _} = + split_binary(B, byte_size(B) - 1), + Init. + %% Returns the parsed binary based on Input and the Unparsed part. %% Used when parsing scheme. @@ -1109,6 +1206,7 @@ calculate_parsed_scheme(Input, Unparsed) -> {First, _} = split_binary(Input, byte_size(Input) - byte_size(Unparsed) - 1), First. + %% Returns the size of a binary exluding the first element. %% Used in calls to split_binary(). -spec byte_size_exl_head(binary()) -> number(). @@ -1149,25 +1247,35 @@ byte_size_exl_single_slash(Rest) -> byte_size(Rest). %%------------------------------------------------------------------------- -spec decode_userinfo(binary()) -> binary(). decode_userinfo(Cs) -> - decode(Cs, fun is_userinfo/1, <<>>). + check_utf8(decode(Cs, fun is_userinfo/1, <<>>)). -spec decode_host(binary()) -> binary(). decode_host(Cs) -> - decode(Cs, fun is_host/1, <<>>). + check_utf8(decode(Cs, fun is_host/1, <<>>)). -spec decode_path(binary()) -> binary(). decode_path(Cs) -> - decode(Cs, fun is_path/1, <<>>). + check_utf8(decode(Cs, fun is_path/1, <<>>)). -spec decode_query(binary()) -> binary(). decode_query(Cs) -> - decode(Cs, fun is_query/1, <<>>). + check_utf8(decode(Cs, fun is_query/1, <<>>)). -spec decode_fragment(binary()) -> binary(). decode_fragment(Cs) -> - decode(Cs, fun is_fragment/1, <<>>). + check_utf8(decode(Cs, fun is_fragment/1, <<>>)). +%% Returns Cs if it is utf8 encoded. +check_utf8(Cs) -> + case unicode:characters_to_list(Cs) of + {incomplete,_,_} -> + throw(uri_parse_error); + {error,_,_} -> + throw(uri_parse_error); + _ -> Cs + end. + %%------------------------------------------------------------------------- %% Percent-encode %%------------------------------------------------------------------------- @@ -1368,10 +1476,15 @@ bracket_ipv6(Addr) when is_list(Addr) -> %% E.g. "//user@:8080" => #{host => [],port => 8080,userinfo => "user"} %% There is always at least an empty host when both userinfo and port %% are present. +%% - #{path => "///"} otherwise the following would be true: +%% "/////" = uri_string:recompose(#{host => "", path => "///"}) +%% "/////" = uri_string:recompose(#{path => "/////"}) +%% AND +%% path-absolute = "/" [ segment-nz *( "/" segment ) ] %%------------------------------------------------------------------------- is_valid_map(Map) -> case - (not maps:is_key(userinfo, Map) andalso + ((not maps:is_key(userinfo, Map) andalso not maps:is_key(host, Map) andalso maps:is_key(port, Map)) orelse @@ -1381,7 +1494,9 @@ is_valid_map(Map) -> orelse (maps:is_key(userinfo, Map) andalso not maps:is_key(host, Map) andalso - maps:is_key(port, Map)) + maps:is_key(port, Map))) orelse + not maps:is_key(path, Map) orelse + not is_host_and_path_valid(Map) of true -> false; @@ -1390,6 +1505,19 @@ is_valid_map(Map) -> end. +is_host_and_path_valid(Map) -> + Host = maps:get(host, Map, undefined), + Path = maps:get(path, Map, undefined), + not (Host =:= undefined andalso starts_with_two_slash(Path)). + + +starts_with_two_slash([$/,$/|_]) -> + true; +starts_with_two_slash(?STRING_REST("//", _)) -> + true; +starts_with_two_slash(_) -> false. + + update_scheme(#{scheme := Scheme}, _) -> add_colon_postfix(encode_scheme(Scheme)); update_scheme(#{}, _) -> |