%% Recovered from patch ce78af7e5a76dc4a27673ab5c80a315762b992b1
%% From:    Péter Dimitrov
%% Date:    Fri, 27 Oct 2017 16:54:27 +0200
%% Subject: stdlib: Implement normalize/1
%%
%% Implements the following Syntax-Based Normalizations:
%%   - Case Normalization
%%   - Percent-Encoding Normalization
%%   - Path Segment Normalization
%%   - Scheme-Based Normalization
%%     - HTTP(S)
%%     - Basic support for FTP, SSH, SFTP, TFTP
%%
%%  lib/stdlib/src/uri_string.erl        | 167 ++++++++++++++++++++++++++++++++++-
%%  lib/stdlib/test/uri_string_SUITE.erl |  22 +++++
%%  2 files changed, 188 insertions(+), 1 deletion(-)
%%
%% diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl
%% index 16650d5005..cf8c388f54 100644
%%
%% Patch context, hunk @@ -227,7 +227,7 @@ (export list after the patch;
%% normalize/1 is the only addition):
%%
%%   %%-------------------------------------------------------------------------
%%   %% External API
%%   %%-------------------------------------------------------------------------
%%   -export([compose_query/1, compose_query/2,
%%            dissect_query/1, normalize/1, parse/1,
%%            recompose/1, transcode/2]).
%%   -export_type([error/0, uri_map/0, uri_string/0]).
%%
%% Patch context, hunk @@ -288,6 +288,21 @@ (tail of the uri_map() type,
%% truncated in this view):
%%
%%     scheme => unicode:chardata(),
%%     userinfo => unicode:chardata()} | #{}.


%%-------------------------------------------------------------------------
%% Normalize URIs
%%-------------------------------------------------------------------------
%% Returns URIString in normalized form: scheme-based normalization followed
%% by path segment normalization (RFC 3986, 6.2.2.3 and 6.2.3).
%%
%% Case normalization and percent-encoding normalization are achieved
%% by running parse and recompose on the input URI string.
%%
%% NOTE(review): parse/1 is not visible in this patch. The module exports an
%% error() type and the test suite shows other API functions returning
%% {error, Reason, Term} tuples, so parse/1 is assumed to do the same for
%% invalid input -- confirm. Such a tuple is returned unchanged instead of
%% being handed to the map helpers, where it would crash.
-spec normalize(URIString) -> NormalizedURI | Error when
      URIString :: uri_string(),
      NormalizedURI :: uri_string(),
      Error :: error().
normalize(URIString) ->
    case parse(URIString) of
        URIMap when is_map(URIMap) ->
            M1 = normalize_scheme_based(URIMap),
            M2 = normalize_path_segment(M1),
            recompose(M2);
        {error, _Reason, _Term} = Error ->
            Error
    end.

%%-------------------------------------------------------------------------
%% Parse URIs
%%-------------------------------------------------------------------------
%% Patch context, hunk @@ -1883,3 +1898,153 @@: only the tail of
%% form_urldecode/2 is visible, and its binary patterns were lost in
%% extraction, so it is kept verbatim as a comment rather than as code:
%%
%%   form_urldecode(<>, Acc) ->
%%       ...
%%       end;
%%   form_urldecode(<>, _Acc) ->
%%       throw({error, invalid_character, [H]}).


%%-------------------------------------------------------------------------
%% Helper functions for normalize
%%-------------------------------------------------------------------------

%% RFC 3986
%% 6.2.2.3. Path Segment Normalization
%% 5.2.4.   Remove Dot Segments
%%
%% Replaces the path of Map with the same path stripped of "." and ".."
%% segments. Every key of a URI map is optional, so a map without a path is
%% returned unchanged instead of failing on a missing value.
normalize_path_segment(#{path := Path} = Map) ->
    Map#{path := remove_dot_segments(Path)};
normalize_path_segment(Map) when is_map(Map) ->
    Map.


%% Removes dot segments from a path given as a binary or as a list.
%% The work is done on a binary; a list is converted with convert_binary/3
%% and the result converted back with convert_list/2 (both defined elsewhere
%% in this module), so the result has the same representation as the input.
remove_dot_segments(Path) when is_binary(Path) ->
    remove_dot_segments(Path, <<>>);
remove_dot_segments(Path) when is_list(Path) ->
    B = convert_binary(Path, utf8, utf8),
    B1 = remove_dot_segments(B, <<>>),
    convert_list(B1, utf8).
%%
%% remove_dot_segments(Input, Output): Input is the not yet processed rest
%% of the path, Output the normalized path built so far. The clauses follow
%% steps 2A-2E of RFC 3986, 5.2.4.
%%
%% Input exhausted: Output is the result.
remove_dot_segments(<<>>, Output) ->
    Output;
%% 2A: drop a leading "../" or "./".
remove_dot_segments(<<"../",T/binary>>, Output) ->
    remove_dot_segments(T, Output);
remove_dot_segments(<<"./",T/binary>>, Output) ->
    remove_dot_segments(T, Output);
%% 2B: replace a leading "/./", or a final "/.", with "/".
remove_dot_segments(<<"/./",T/binary>>, Output) ->
    remove_dot_segments(<<$/,T/binary>>, Output);
remove_dot_segments(<<"/.">>, Output) ->
    remove_dot_segments(<<$/>>, Output);
%% 2C: replace a leading "/../", or a final "/..", with "/" and drop the
%% last segment already written to Output.
remove_dot_segments(<<"/../",T/binary>>, Output) ->
    Out1 = remove_last_segment(Output),
    remove_dot_segments(<<$/,T/binary>>, Out1);
remove_dot_segments(<<"/..">>, Output) ->
    Out1 = remove_last_segment(Output),
    remove_dot_segments(<<$/>>, Out1);
%% 2D: an input consisting only of "." or ".." is discarded.
remove_dot_segments(<<$.>>, Output) ->
    remove_dot_segments(<<>>, Output);
remove_dot_segments(<<"..">>, Output) ->
    remove_dot_segments(<<>>, Output);
%% 2E: move the first path segment of Input to the end of Output.
remove_dot_segments(Input, Output) ->
    {First, Rest} = first_path_segment(Input),
    remove_dot_segments(Rest, <<Output/binary,First/binary>>).


%% Splits Input into {FirstSegment, Rest}. first_path_segment/2 returns a
%% copy of the first segment; only its size is used, as the split position.
first_path_segment(Input) ->
    F = first_path_segment(Input, <<>>),
    split_binary(Input, byte_size(F)).
%%
%% first_path_segment(Input, Acc): appends the first byte of Input (a
%% leading "/" included) to Acc and hands the rest over to
%% first_path_segment_end/2. Empty input is not handled.
first_path_segment(<<Byte,Tail/binary>>, Acc) ->
    first_path_segment_end(Tail, <<Acc/binary,Byte>>).


%% Keeps appending bytes to Acc until the input is exhausted or the next "/"
%% is reached; that "/" is not included in the result.
first_path_segment_end(<<Byte,Tail/binary>>, Acc) when Byte =/= $/ ->
    first_path_segment_end(Tail, <<Acc/binary,Byte>>);
first_path_segment_end(Rest, Acc) when is_binary(Rest) ->
    Acc.


%% Drops the last segment of the given path together with the "/" in front
%% of it. A path that contains no "/" is reduced to the empty binary.
remove_last_segment(<<>>) ->
    <<>>;
remove_last_segment(Path) ->
    InitSize = byte_size(Path) - 1,
    case Path of
        <<Init:InitSize/binary,$/>> ->
            Init;
        <<Init:InitSize/binary,_Byte>> ->
            remove_last_segment(Init)
    end.


%% RFC 3986, 6.2.3. Scheme-Based Normalization
%%
%% Looks up scheme, port and path of the URI map (undefined when absent) and
%% applies the normalization rules of the scheme. Schemes are recognized in
%% both their list and binary form; a map with any other scheme, or without
%% one, is returned unchanged.
normalize_scheme_based(Map) ->
    Scheme = maps:get(scheme, Map, undefined),
    Port = maps:get(port, Map, undefined),
    Path = maps:get(path, Map, undefined),
    normalize_scheme_based(Scheme, Map, Port, Path).
%%
normalize_scheme_based(Scheme, Map, Port, Path)
  when Scheme =:= "http"; Scheme =:= <<"http">> ->
    normalize_http(Map, Port, Path);
normalize_scheme_based(Scheme, Map, Port, Path)
  when Scheme =:= "https"; Scheme =:= <<"https">> ->
    normalize_https(Map, Port, Path);
normalize_scheme_based(Scheme, Map, Port, _Path)
  when Scheme =:= "ftp"; Scheme =:= <<"ftp">> ->
    normalize_ftp(Map, Port);
normalize_scheme_based(Scheme, Map, Port, _Path)
  when Scheme =:= "ssh"; Scheme =:= <<"ssh">>;
       Scheme =:= "sftp"; Scheme =:= <<"sftp">> ->
    normalize_ssh_sftp(Map, Port);
normalize_scheme_based(Scheme, Map, Port, _Path)
  when Scheme =:= "tftp"; Scheme =:= <<"tftp">> ->
    normalize_tftp(Map, Port);
normalize_scheme_based(_Scheme, Map, _Port, _Path) ->
    Map.


%% http: port 80 is the default; the path is normalized afterwards.
normalize_http(Map, Port, Path) ->
    normalize_http_path(normalize_port(Map, Port, 80), Path).


%% https: port 443 is the default; the path is normalized afterwards.
normalize_https(Map, Port, Path) ->
    normalize_http_path(normalize_port(Map, Port, 443), Path).


%% ftp: port 21 is the default.
normalize_ftp(Map, Port) ->
    normalize_port(Map, Port, 21).


%% ssh and sftp: port 22 is the default.
normalize_ssh_sftp(Map, Port) ->
    normalize_port(Map, Port, 22).


%% tftp: port 69 is the default.
normalize_tftp(Map, Port) ->
    normalize_port(Map, Port, 69).


%% Removes the port from the map when it is exactly the default port of the
%% scheme; any other port, or no port at all, leaves the map untouched.
normalize_port(Map, Default, Default) ->
    maps:remove(port, Map);
normalize_port(Map, _Port, _Default) ->
    Map.


%% Sets an empty path to "/", in the same representation (list or binary)
%% as the path handed in. Any other path leaves the map unchanged.
normalize_http_path(Map, "") ->
    Map#{path => "/"};
normalize_http_path(Map, <<>>) ->
    Map#{path => <<"/">>};
normalize_http_path(Map, _Path) ->
    Map.

%% diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl
%% index 9ee321c509..1567b9333a 100644
%%
%% Patch context, hunk @@ -22,6 +22,7 @@ (export list truncated in this
%% view; normalize/1 is the only addition):
%%
%%   -include_lib("common_test/include/ct.hrl").
%%
%%   -export([all/0, suite/0,groups/0,
%%            normalize/1,
%%            parse_binary_fragment/1, parse_binary_host/1, parse_binary_host_ipv4/1,
%%            parse_binary_host_ipv6/1,
%%            parse_binary_path/1, parse_binary_pct_encoded_fragment/1, parse_binary_pct_encoded_query/1,
%%
%% Patch context, hunk @@ -65,6 +66,7 @@ suite() -> (list truncated in this
%% view; normalize is the only addition):
%%
%%   all() ->
%%       [
%%        normalize,
%%        parse_binary_scheme,
%%        parse_binary_userinfo,
%%        parse_binary_pct_encoded_userinfo,
%%
%% Patch context, hunk @@ -867,3 +869,23 @@ dissect_query_negative(_Config) ->
%% (tail of the preceding test case):
%%
%%       {error,invalid_character,"ö"} =
%%           uri_string:dissect_query(<<"föo+bar=1&%C3%B6=2">>),
%%       {error,invalid_input,<<"ö">>} =
%%           uri_string:dissect_query([<<"foo+bar=1&">>,<<"%C3%B6=2ö">>]).

%% Test case for uri_string:normalize/1. The result is expected in the same
%% representation (list or binary) as the input.
normalize(_Config) ->
    %% Dot segments are removed from relative references.
    "/a/g" = uri_string:normalize("/a/b/c/./../../g"),
    <<"mid/6">> = uri_string:normalize(<<"mid/content=5/../6">>),
    %% Percent-encoding, default port and path are normalized together.
    "http://localhost-%C3%B6rebro/a/g" =
        uri_string:normalize("http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g"),
    <<"http://localhost-%C3%B6rebro/a/g">> =
        uri_string:normalize(<<"http://localhos%74-%c3%b6rebro:80/a/b/c/./../../g">>),
    %% https: an empty path becomes "/"; only the default port is removed.
    <<"https://localhost/">> =
        uri_string:normalize(<<"https://localhost:443">>),
    <<"https://localhost:445/">> =
        uri_string:normalize(<<"https://localhost:445">>),
    %% ftp, ssh, sftp and tftp: the default port is removed.
    <<"ftp://localhost">> =
        uri_string:normalize(<<"ftp://localhost:21">>),
    <<"ssh://localhost">> =
        uri_string:normalize(<<"ssh://localhost:22">>),
    <<"sftp://localhost">> =
        uri_string:normalize(<<"sftp://localhost:22">>),
    <<"tftp://localhost">> =
        uri_string:normalize(<<"tftp://localhost:69">>).

%% --
%% cgit v1.2.3