From 1bb2c76c09510bf761c4a6908ae78d1e2a87d574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A9ter=20Dimitrov?= Date: Tue, 7 Nov 2017 16:23:09 +0100 Subject: stdlib: Implement compose and dissect query (HTML5) Implement functions for handling form-urlencoded query strings based on the HTML5 specification. --- lib/stdlib/doc/src/uri_string.xml | 98 ++++++++++++++- lib/stdlib/src/uri_string.erl | 229 ++++++++++++++++++++++++++++++++++- lib/stdlib/test/uri_string_SUITE.erl | 86 ++++++++++++- 3 files changed, 409 insertions(+), 4 deletions(-) diff --git a/lib/stdlib/doc/src/uri_string.xml b/lib/stdlib/doc/src/uri_string.xml index 9ace2b0a05..21f470e763 100644 --- a/lib/stdlib/doc/src/uri_string.xml +++ b/lib/stdlib/doc/src/uri_string.xml @@ -31,7 +31,8 @@ URI processing functions.

This module contains functions for parsing and handling URIs - (RFC 3986). + (RFC 3986) and + form-urlencoded query strings (HTML5).

A URI is an identifier consisting of a sequence of characters matching the syntax rule named URI in RFC 3986. @@ -71,6 +72,13 @@ Transforming URIs into a normalized form

normalize/1
+ Composing form-urlencoded query strings from a list of key-value pairs

+ compose_query/1

+ compose_query/2 +
+ Dissecting form-urlencoded query strings into a list of key-value pairs

+ dissect_query/1 +

There are four different encodings present during the handling of URIs:

@@ -102,12 +110,15 @@

Error tuple indicating the type of error. Possible values of the second component:

+ invalid_character + invalid_encoding invalid_input invalid_map invalid_percent_encoding invalid_scheme invalid_uri invalid_utf8 + missing_value

The third component is a term providing additional information about the cause of the error.

@@ -133,6 +144,91 @@ + + + Compose urlencoded query string. + +

Composes a form-urlencoded QueryString based on a + QueryList, a list of non-percent-encoded key-value pairs. + Form-urlencoding is defined in section + 4.10.22.6 of the HTML5 + specification. +

+

See also the opposite operation + dissect_query/1. +

+

Example:

+
+1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}]).
+
+2> >,<<"1">>},
+2> {<<"city">>,<<"örebro"/utf8>>}]).]]>
+>]]>
+	
+
+
+ + + + Compose urlencoded query string. + +

Same as compose_query/1 but with an additional + Options parameter, that controls the encoding ("charset") + used by the encoding algorithm. There are two supported encodings: utf8 + (or unicode) and latin1. +

+

Each character in the entry's name and value that cannot be expressed using + the selected character encoding, is replaced by a string consisting of a U+0026 + AMPERSAND character (), a "#" (U+0023) character, one or more ASCII + digits representing the Unicode code point of the character in base ten, and + finally a ";" (U+003B) character. +

+

Bytes that are out of the range 0x2A, 0x2D, 0x2E, 0x30 to 0x39, 0x41 to 0x5A, 0x5F, + 0x61 to 0x7A, are percent-encoded (U+0025 PERCENT SIGN character (%) followed by + uppercase ASCII hex digits representing the hexadecimal value of the byte). +

+

See also the opposite operation + dissect_query/1. +

+

Example:

+
+1> uri_string:compose_query([{"foo bar","1"},{"city","örebro"}],
+1> [{encoding, latin1}]).
+ uri_string:compose_query([{<<"foo bar">>,<<"1">>},
+2> {<<"city">>,<<"東京"/utf8>>}], [{encoding, latin1}]).]]>
+>]]>
+	
+
+
+ + + + Dissect query string. + +

Dissects an urlencoded QueryString and returns a + QueryList, a list of non-percent-encoded key-value pairs. + Form-urlencoding is defined in section + 4.10.22.6 of the HTML5 + specification. +

+

It is not as strict for its input as the decoding algorithm defined by + HTML5 + and accepts all unicode characters.

+

See also the opposite operation + compose_query/1. +

+

Example:

+
+1> 
+[{"foo bar","1"},{"city","örebro"}]
+2> >).]]>
+>,<<"1">>},
+ {<<"city">>,<<230,157,177,228,186,172>>}] ]]>
+	
+
+
+ Syntax-based normalization. diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 22212da222..a84679c595 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -226,7 +226,8 @@ %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- --export([normalize/1, parse/1, +-export([compose_query/1, compose_query/2, + dissect_query/1, normalize/1, parse/1, recompose/1, transcode/2]). -export_type([error/0, uri_map/0, uri_string/0]). @@ -381,6 +382,76 @@ transcode(URIString, Options) when is_list(URIString) -> end. +%%------------------------------------------------------------------------- +%% Functions for working with the query part of a URI as a list +%% of key/value pairs. +%% HTML5 - 4.10.22.6 URL-encoded form data +%%------------------------------------------------------------------------- + +%%------------------------------------------------------------------------- +%% Compose urlencoded query string from a list of unescaped key/value pairs. +%% (application/x-www-form-urlencoded encoding algorithm) +%%------------------------------------------------------------------------- +-spec compose_query(QueryList) -> QueryString when + QueryList :: [{uri_string(), uri_string()}], + QueryString :: uri_string() + | error(). +compose_query(List) -> + compose_query(List, [{encoding, utf8}]). + + +-spec compose_query(QueryList, Options) -> QueryString when + QueryList :: [{uri_string(), uri_string()}], + Options :: [{encoding, atom()}], + QueryString :: uri_string() + | error(). +compose_query([],_Options) -> + []; +compose_query(List, Options) -> + try compose_query(List, Options, false, <<>>) + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. +%% +compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> + Separator = get_separator(Rest), + K = form_urlencode(Key, Options), + V = form_urlencode(Value, Options), + IsListNew = IsList orelse is_list(Key) orelse is_list(Value), + compose_query(Rest, Options, IsListNew, <>. + + +%% HTML5 - 4.10.22.6 URL-encoded form data - encoding +form_urlencode(Cs, [{encoding, latin1}]) when is_list(Cs) -> + B = convert_to_binary(Cs, utf8, utf8), + html5_byte_encode(base10_encode(B)); +form_urlencode(Cs, [{encoding, latin1}]) when is_binary(Cs) -> + html5_byte_encode(base10_encode(Cs)); +form_urlencode(Cs, [{encoding, Encoding}]) + when is_list(Cs), Encoding =:= utf8; Encoding =:= unicode -> + B = convert_to_binary(Cs, utf8, Encoding), + html5_byte_encode(B); +form_urlencode(Cs, [{encoding, Encoding}]) + when is_binary(Cs), Encoding =:= utf8; Encoding =:= unicode -> + html5_byte_encode(Cs); +form_urlencode(Cs, [{encoding, Encoding}]) when is_list(Cs); is_binary(Cs) -> + throw({error,invalid_encoding, Encoding}); +form_urlencode(Cs, _) -> + throw({error,invalid_input, Cs}). + + +%% For each character in the entry's name and value that cannot be expressed using +%% the selected character encoding, replace the character by a string consisting of +%% a U+0026 AMPERSAND character (&), a "#" (U+0023) character, one or more ASCII +%% digits representing the Unicode code point of the character in base ten, and +%% finally a ";" (U+003B) character. +base10_encode(Cs) -> + base10_encode(Cs, <<>>). +%% +base10_encode(<<>>, Acc) -> + Acc; +base10_encode(<>, Acc) when H > 255 -> + Base10 = convert_to_binary(integer_to_list(H,10), utf8, utf8), + base10_encode(T, <>); +base10_encode(<>, Acc) -> + base10_encode(T, <>). + + +html5_byte_encode(B) -> + html5_byte_encode(B, <<>>). +%% +html5_byte_encode(<<>>, Acc) -> + Acc; +html5_byte_encode(<<$ ,T/binary>>, Acc) -> + html5_byte_encode(T, <>); +html5_byte_encode(<>, Acc) -> + case is_url_char(H) of + true -> + html5_byte_encode(T, <>); + false -> + <> = <>, + html5_byte_encode(T, <>) + end; +html5_byte_encode(H, _Acc) -> + throw({error,invalid_input, H}). + + +%% Return true if input char can appear in form-urlencoded string +%% Allowed chararacters: +%% 0x2A, 0x2D, 0x2E, 0x30 to 0x39, 0x41 to 0x5A, +%% 0x5F, 0x61 to 0x7A +is_url_char(C) + when C =:= 16#2A; C =:= 16#2D; + C =:= 16#2E; C =:= 16#5F; + 16#30 =< C, C =< 16#39; + 16#41 =< C, C =< 16#5A; + 16#61 =< C, C =< 16#7A -> true; +is_url_char(_) -> false. + + +%%------------------------------------------------------------------------- +%% Helper functions for dissect_query +%%------------------------------------------------------------------------- +dissect_query_key(<<$=,T/binary>>, IsList, Acc, Key, Value) -> + dissect_query_value(T, IsList, Acc, Key, Value); +dissect_query_key(<<"&#",T/binary>>, IsList, Acc, Key, Value) -> + dissect_query_key(T, IsList, Acc, <>, Value); +dissect_query_key(<<$&,_T/binary>>, _IsList, _Acc, _Key, _Value) -> + throw({error, missing_value, "&"}); +dissect_query_key(<>, IsList, Acc, Key, Value) -> + dissect_query_key(T, IsList, Acc, <>, Value); +dissect_query_key(B, _, _, _, _) -> + throw({error, missing_value, B}). + + +dissect_query_value(<<$&,T/binary>>, IsList, Acc, Key, Value) -> + K = form_urldecode(IsList, Key), + V = form_urldecode(IsList, Value), + dissect_query_key(T, IsList, [{K,V}|Acc], <<>>, <<>>); +dissect_query_value(<>, IsList, Acc, Key, Value) -> + dissect_query_value(T, IsList, Acc, Key, <>); +dissect_query_value(<<>>, IsList, Acc, Key, Value) -> + K = form_urldecode(IsList, Key), + V = form_urldecode(IsList, Value), + lists:reverse([{K,V}|Acc]). + + +%% Form-urldecode input based on RFC 1866 [8.2.1] +form_urldecode(true, B) -> + Result = base10_decode(form_urldecode(B, <<>>)), + convert_to_list(Result, utf8); +form_urldecode(false, B) -> + base10_decode(form_urldecode(B, <<>>)); +form_urldecode(<<>>, Acc) -> + Acc; +form_urldecode(<<$+,T/binary>>, Acc) -> + form_urldecode(T, <>); +form_urldecode(<<$%,C0,C1,T/binary>>, Acc) -> + case is_hex_digit(C0) andalso is_hex_digit(C1) of + true -> + V = ?HEX2DEC(C0)*16+?HEX2DEC(C1), + form_urldecode(T, <>); + false -> + L = convert_to_list(<<$%,C0,C1,T/binary>>, utf8), + throw({error, invalid_percent_encoding, L}) + end; +form_urldecode(<>, Acc) -> + form_urldecode(T, <>); +form_urldecode(<>, _Acc) -> + throw({error, invalid_character, [H]}). + +base10_decode(Cs) -> + base10_decode(Cs, <<>>). +% +base10_decode(<<>>, Acc) -> + Acc; +base10_decode(<<"&#",T/binary>>, Acc) -> + base10_decode_unicode(T, Acc); +base10_decode(<>, Acc) -> + base10_decode(T,<>); +base10_decode(<>, _) -> + throw({error, invalid_input, [H]}). + + +base10_decode_unicode(B, Acc) -> + base10_decode_unicode(B, 0, Acc). +%% +base10_decode_unicode(<>, Codepoint, Acc) when $0 =< H, H =< $9 -> + Res = Codepoint * 10 + (H - $0), + base10_decode_unicode(T, Res, Acc); +base10_decode_unicode(<<$;,T/binary>>, Codepoint, Acc) -> + base10_decode(T, <>); +base10_decode_unicode(<>, _, _) -> + throw({error, invalid_input, [H]}). + + %%------------------------------------------------------------------------- %% Helper functions for normalize %%------------------------------------------------------------------------- diff --git a/lib/stdlib/test/uri_string_SUITE.erl b/lib/stdlib/test/uri_string_SUITE.erl index c625da56c6..fef356355c 100644 --- a/lib/stdlib/test/uri_string_SUITE.erl +++ b/lib/stdlib/test/uri_string_SUITE.erl @@ -38,7 +38,10 @@ recompose_query/1, recompose_parse_query/1, recompose_path/1, recompose_parse_path/1, recompose_autogen/1, parse_recompose_autogen/1, - transcode_basic/1, transcode_options/1, transcode_mixed/1, transcode_negative/1 + transcode_basic/1, transcode_options/1, transcode_mixed/1, transcode_negative/1, + compose_query/1, compose_query_latin1/1, compose_query_negative/1, + dissect_query/1, dissect_query_negative/1, + interop_query_latin1/1, interop_query_utf8/1 ]). @@ -107,7 +110,14 @@ all() -> transcode_basic, transcode_options, transcode_mixed, - transcode_negative + transcode_negative, + compose_query, + compose_query_latin1, + compose_query_negative, + dissect_query, + dissect_query_negative, + interop_query_latin1, + interop_query_utf8 ]. groups() -> @@ -823,6 +833,65 @@ transcode_negative(_Config) -> {error,invalid_input,<<"ö">>} = uri_string:transcode("foo%F6bar", [{in_encoding, utf8},{out_encoding, utf8}]). +compose_query(_Config) -> + [] = uri_string:compose_query([]), + "foo=1&bar=2" = uri_string:compose_query([{<<"foo">>,"1"}, {"bar", "2"}]), + "foo=1&b%C3%A4r=2" = uri_string:compose_query([{"foo","1"}, {"bär", "2"}],[{encoding,utf8}]), + "foo=1&b%C3%A4r=2" = uri_string:compose_query([{"foo","1"}, {"bär", "2"}],[{encoding,unicode}]), + "foo=1&b%E4r=2" = uri_string:compose_query([{"foo","1"}, {"bär", "2"}],[{encoding,latin1}]), + "foo+bar=1&%E5%90%88=2" = uri_string:compose_query([{"foo bar","1"}, {"合", "2"}]), + "foo+bar=1&%26%2321512%3B=2" = + uri_string:compose_query([{"foo bar","1"}, {"合", "2"}],[{encoding,latin1}]), + "foo+bar=1&%C3%B6=2" = uri_string:compose_query([{<<"foo bar">>,<<"1">>}, {"ö", <<"2">>}]), + <<"foo+bar=1&%C3%B6=2">> = + uri_string:compose_query([{<<"foo bar">>,<<"1">>}, {<<"ö"/utf8>>, <<"2">>}]). + +compose_query_latin1(_Config) -> + Q = uri_string:compose_query([{"合foö bar","1"}, {"合", "合"}],[{encoding,latin1}]), + Q1 = uri_string:transcode(Q, [{in_encoding, latin1}]), + [{"合foö bar","1"}, {"合", "合"}] = uri_string:dissect_query(Q1), + Q2 = uri_string:compose_query([{<<"合foö bar"/utf8>>,<<"1">>}, {<<"合"/utf8>>, <<"合"/utf8>>}], + [{encoding,latin1}]), + Q3 = uri_string:transcode(Q2, [{in_encoding, latin1}]), + [{<<"合foö bar"/utf8>>,<<"1">>}, {<<"合"/utf8>>, <<"合"/utf8>>}] = + uri_string:dissect_query(Q3). + +compose_query_negative(_Config) -> + {error,invalid_input,4} = uri_string:compose_query([{"",4}]), + {error,invalid_input,5} = uri_string:compose_query([{5,""}]), + {error,invalid_encoding,utf16} = + uri_string:compose_query([{"foo bar","1"}, {<<"ö">>, "2"}],[{encoding,utf16}]). + +dissect_query(_Config) -> + [] = uri_string:dissect_query(""), + [{"foo","1"}, {"amp;bar", "2"}] = uri_string:dissect_query("foo=1&bar=2"), + [{"foo","1"}, {"bar", "2"}] = uri_string:dissect_query("foo=1&bar=2"), + [{"foo","1;bar=2"}] = uri_string:dissect_query("foo=1;bar=2"), + [{"foo","1"}, {"bar", "222"}] = uri_string:dissect_query([<<"foo=1&bar=2">>,"22"]), + [{"foo","ö"}, {"bar", "2"}] = uri_string:dissect_query("foo=%C3%B6&bar=2"), + [{<<"foo">>,<<"ö"/utf8>>}, {<<"bar">>, <<"2">>}] = + uri_string:dissect_query(<<"foo=%C3%B6&bar=2">>), + [{"foo bar","1"},{"ö","2"}] = + uri_string:dissect_query([<<"foo+bar=1&">>,<<"%C3%B6=2">>]), + [{"foo bar","1"},{[21512],"2"}] = + uri_string:dissect_query("foo+bar=1&%26%2321512%3B=2"), + [{<<"foo bar">>,<<"1">>},{<<"合"/utf8>>,<<"2">>}] = + uri_string:dissect_query(<<"foo+bar=1&%26%2321512%3B=2">>), + [{"föo bar","1"},{"ö","2"}] = + uri_string:dissect_query("föo+bar=1&%C3%B6=2"), + [{<<"föo bar"/utf8>>,<<"1">>},{<<"ö"/utf8>>,<<"2">>}] = + uri_string:dissect_query(<<"föo+bar=1&%C3%B6=2"/utf8>>). + +dissect_query_negative(_Config) -> + {error,missing_value,"&"} = + uri_string:dissect_query("foo1&bar=2"), + {error,invalid_percent_encoding,"%XX%B6"} = uri_string:dissect_query("foo=%XX%B6&bar=2"), + {error,invalid_input,[153]} = + uri_string:dissect_query("foo=%99%B6&bar=2"), + {error,invalid_character,"ö"} = uri_string:dissect_query(<<"föo+bar=1&%C3%B6=2">>), + {error,invalid_input,<<"ö">>} = + uri_string:dissect_query([<<"foo+bar=1&">>,<<"%C3%B6=2ö">>]). + normalize(_Config) -> "/a/g" = uri_string:normalize("/a/b/c/./../../g"), <<"mid/6">> = uri_string:normalize(<<"mid/content=5/../6">>), @@ -842,3 +911,16 @@ normalize(_Config) -> uri_string:normalize(<<"sftp://localhost:22">>), <<"tftp://localhost">> = uri_string:normalize(<<"tftp://localhost:69">>). + +interop_query_utf8(_Config) -> + Q = uri_string:compose_query([{"foo bar","1"}, {"合", "2"}]), + Uri = uri_string:recompose(#{path => "/", query => Q}), + #{query := Q1} = uri_string:parse(Uri), + [{"foo bar","1"}, {"合", "2"}] = uri_string:dissect_query(Q1). + +interop_query_latin1(_Config) -> + Q = uri_string:compose_query([{"foo bar","1"}, {"合", "2"}], [{encoding,latin1}]), + Uri = uri_string:recompose(#{path => "/", query => Q}), + Uri1 = uri_string:transcode(Uri, [{in_encoding, latin1}]), + #{query := Q1} = uri_string:parse(Uri1), + [{"foo bar","1"}, {"合", "2"}] = uri_string:dissect_query(Q1). -- cgit v1.2.3