diff options
Diffstat (limited to 'lib/stdlib/src')
-rw-r--r-- | lib/stdlib/src/base64.erl | 597 | ||||
-rw-r--r-- | lib/stdlib/src/erl_compile.erl | 3 | ||||
-rw-r--r-- | lib/stdlib/src/erl_lint.erl | 44 | ||||
-rw-r--r-- | lib/stdlib/src/erl_parse.yrl | 82 | ||||
-rw-r--r-- | lib/stdlib/src/gen.erl | 1 | ||||
-rw-r--r-- | lib/stdlib/src/string.erl | 747 | ||||
-rw-r--r-- | lib/stdlib/src/uri_string.erl | 229 |
7 files changed, 1181 insertions, 522 deletions
diff --git a/lib/stdlib/src/base64.erl b/lib/stdlib/src/base64.erl index c8cf6fdffe..6ea4147abf 100644 --- a/lib/stdlib/src/base64.erl +++ b/lib/stdlib/src/base64.erl @@ -24,22 +24,11 @@ -export([encode/1, decode/1, mime_decode/1, encode_to_string/1, decode_to_string/1, mime_decode_to_string/1]). -%%------------------------------------------------------------------------- %% The following type is a subtype of string() for return values %% of (some) functions of this module. -%%------------------------------------------------------------------------- - -type ascii_string() :: [1..255]. -type ascii_binary() :: binary(). -%%------------------------------------------------------------------------- -%% encode_to_string(ASCII) -> Base64String -%% ASCII - string() | binary() -%% Base64String - string() -%% -%% Description: Encodes a plain ASCII string (or binary) into base64. -%%------------------------------------------------------------------------- - -spec encode_to_string(Data) -> Base64String when Data :: ascii_string() | ascii_binary(), Base64String :: ascii_string(). @@ -47,66 +36,67 @@ encode_to_string(Bin) when is_binary(Bin) -> encode_to_string(binary_to_list(Bin)); encode_to_string(List) when is_list(List) -> - encode_l(List). - -%%------------------------------------------------------------------------- -%% encode(ASCII) -> Base64 -%% ASCII - string() | binary() -%% Base64 - binary() -%% -%% Description: Encodes a plain ASCII string (or binary) into base64. -%%------------------------------------------------------------------------- + encode_list_to_string(List). -spec encode(Data) -> Base64 when Data :: ascii_string() | ascii_binary(), Base64 :: ascii_binary(). encode(Bin) when is_binary(Bin) -> - encode_binary(Bin); + encode_binary(Bin, <<>>); encode(List) when is_list(List) -> - list_to_binary(encode_l(List)). - --spec encode_l(ascii_string()) -> ascii_string(). + encode_list(List, <<>>). 
-encode_l([]) -> +encode_list_to_string([]) -> []; -encode_l([A]) -> - [b64e(A bsr 2), - b64e((A band 3) bsl 4), $=, $=]; -encode_l([A,B]) -> - [b64e(A bsr 2), - b64e(((A band 3) bsl 4) bor (B bsr 4)), - b64e((B band 15) bsl 2), $=]; -encode_l([A,B,C|Ls]) -> - BB = (A bsl 16) bor (B bsl 8) bor C, +encode_list_to_string([B1]) -> + [b64e(B1 bsr 2), + b64e((B1 band 3) bsl 4), $=, $=]; +encode_list_to_string([B1,B2]) -> + [b64e(B1 bsr 2), + b64e(((B1 band 3) bsl 4) bor (B2 bsr 4)), + b64e((B2 band 15) bsl 2), $=]; +encode_list_to_string([B1,B2,B3|Ls]) -> + BB = (B1 bsl 16) bor (B2 bsl 8) bor B3, [b64e(BB bsr 18), b64e((BB bsr 12) band 63), b64e((BB bsr 6) band 63), - b64e(BB band 63) | encode_l(Ls)]. - -encode_binary(Bin) -> - Split = 3*(byte_size(Bin) div 3), - <<Main0:Split/binary,Rest/binary>> = Bin, - Main = << <<(b64e(C)):8>> || <<C:6>> <= Main0 >>, - case Rest of - <<A:6,B:6,C:4>> -> - <<Main/binary,(b64e(A)):8,(b64e(B)):8,(b64e(C bsl 2)):8,$=:8>>; - <<A:6,B:2>> -> - <<Main/binary,(b64e(A)):8,(b64e(B bsl 4)):8,$=:8,$=:8>>; - <<>> -> - Main - end. + b64e(BB band 63) | encode_list_to_string(Ls)]. -%%------------------------------------------------------------------------- -%% mime_decode(Base64) -> ASCII -%% decode(Base64) -> ASCII -%% Base64 - string() | binary() -%% ASCII - binary() -%% -%% Description: Decodes an base64 encoded string to plain ASCII. 
-%% mime_decode strips away all characters not Base64 before converting, -%% whereas decode crashes if an illegal character is found -%%------------------------------------------------------------------------- +encode_binary(<<>>, A) -> + A; +encode_binary(<<B1:8>>, A) -> + <<A/bits,(b64e(B1 bsr 2)):8,(b64e((B1 band 3) bsl 4)):8,$=:8,$=:8>>; +encode_binary(<<B1:8, B2:8>>, A) -> + <<A/bits,(b64e(B1 bsr 2)):8, + (b64e(((B1 band 3) bsl 4) bor (B2 bsr 4))):8, + (b64e((B2 band 15) bsl 2)):8, $=:8>>; +encode_binary(<<B1:8, B2:8, B3:8, Ls/bits>>, A) -> + BB = (B1 bsl 16) bor (B2 bsl 8) bor B3, + encode_binary(Ls, + <<A/bits,(b64e(BB bsr 18)):8, + (b64e((BB bsr 12) band 63)):8, + (b64e((BB bsr 6) band 63)):8, + (b64e(BB band 63)):8>>). + +encode_list([], A) -> + A; +encode_list([B1], A) -> + <<A/bits,(b64e(B1 bsr 2)):8,(b64e((B1 band 3) bsl 4)):8,$=:8,$=:8>>; +encode_list([B1,B2], A) -> + <<A/bits,(b64e(B1 bsr 2)):8, + (b64e(((B1 band 3) bsl 4) bor (B2 bsr 4))):8, + (b64e((B2 band 15) bsl 2)):8, $=:8>>; +encode_list([B1,B2,B3|Ls], A) -> + BB = (B1 bsl 16) bor (B2 bsl 8) bor B3, + encode_list(Ls, + <<A/bits,(b64e(BB bsr 18)):8, + (b64e((BB bsr 12) band 63)):8, + (b64e((BB bsr 6) band 63)):8, + (b64e(BB band 63)):8>>). + +%% mime_decode strips away all characters not Base64 before +%% converting, whereas decode crashes if an illegal character is found -spec decode(Base64) -> Data when Base64 :: ascii_string() | ascii_binary(), @@ -122,32 +112,13 @@ decode(List) when is_list(List) -> Data :: ascii_binary(). mime_decode(Bin) when is_binary(Bin) -> - mime_decode_binary(<<>>, Bin); + mime_decode_binary(Bin, <<>>); mime_decode(List) when is_list(List) -> - mime_decode(list_to_binary(List)). + mime_decode_list(List, <<>>). --spec decode_l(ascii_string()) -> ascii_string(). - -decode_l(List) -> - L = strip_spaces(List, []), - decode(L, []). - --spec mime_decode_l(ascii_string()) -> ascii_string(). - -mime_decode_l(List) -> - L = strip_illegal(List, [], 0), - decode(L, []). 
- -%%------------------------------------------------------------------------- -%% mime_decode_to_string(Base64) -> ASCII -%% decode_to_string(Base64) -> ASCII -%% Base64 - string() | binary() -%% ASCII - binary() -%% -%% Description: Decodes an base64 encoded string to plain ASCII. -%% mime_decode strips away all characters not Base64 before converting, -%% whereas decode crashes if an illegal character is found -%%------------------------------------------------------------------------- +%% mime_decode_to_string strips away all characters not Base64 before +%% converting, whereas decode_to_string crashes if an illegal +%% character is found -spec decode_to_string(Base64) -> DataString when Base64 :: ascii_string() | ascii_binary(), @@ -156,7 +127,7 @@ mime_decode_l(List) -> decode_to_string(Bin) when is_binary(Bin) -> decode_to_string(binary_to_list(Bin)); decode_to_string(List) when is_list(List) -> - decode_l(List). + decode_list_to_string(List). -spec mime_decode_to_string(Base64) -> DataString when Base64 :: ascii_string() | ascii_binary(), @@ -165,115 +136,195 @@ decode_to_string(List) when is_list(List) -> mime_decode_to_string(Bin) when is_binary(Bin) -> mime_decode_to_string(binary_to_list(Bin)); mime_decode_to_string(List) when is_list(List) -> - mime_decode_l(List). - -%% One-based decode map. 
--define(DECODE_MAP, - {bad,bad,bad,bad,bad,bad,bad,bad,ws,ws,bad,bad,ws,bad,bad, %1-15 - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, %16-31 - ws,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,62,bad,bad,bad,63, %32-47 - 52,53,54,55,56,57,58,59,60,61,bad,bad,bad,eq,bad,bad, %48-63 - bad,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14, - 15,16,17,18,19,20,21,22,23,24,25,bad,bad,bad,bad,bad, - bad,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40, - 41,42,43,44,45,46,47,48,49,50,51,bad,bad,bad,bad,bad, - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, - bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad}). + mime_decode_list_to_string(List). -decode_binary(<<C1:8, Cs/bits>>, A) -> - case element(C1, ?DECODE_MAP) of - ws -> decode_binary(Cs, A); - B1 -> decode_binary(Cs, A, B1) +%% Skipping pad character if not at end of string. Also liberal about +%% excess padding and skipping of other illegal (non-base64 alphabet) +%% characters. See section 3.3 of RFC4648 +mime_decode_list([0 | Cs], A) -> + mime_decode_list(Cs, A); +mime_decode_list([C1 | Cs], A) -> + case b64d(C1) of + B1 when is_integer(B1) -> mime_decode_list(Cs, A, B1); + _ -> mime_decode_list(Cs, A) % eq is padding end; -decode_binary(<<>>, A) -> +mime_decode_list([], A) -> A. 
-decode_binary(<<C2:8, Cs/bits>>, A, B1) -> - case element(C2, ?DECODE_MAP) of - ws -> decode_binary(Cs, A, B1); - B2 -> decode_binary(Cs, A, B1, B2) +mime_decode_list([0 | Cs], A, B1) -> + mime_decode_list(Cs, A, B1); +mime_decode_list([C2 | Cs], A, B1) -> + case b64d(C2) of + B2 when is_integer(B2) -> + mime_decode_list(Cs, A, B1, B2); + _ -> mime_decode_list(Cs, A, B1) % eq is padding end. -decode_binary(<<C3:8, Cs/bits>>, A, B1, B2) -> - case element(C3, ?DECODE_MAP) of - ws -> decode_binary(Cs, A, B1, B2); - B3 -> decode_binary(Cs, A, B1, B2, B3) +mime_decode_list([0 | Cs], A, B1, B2) -> + mime_decode_list(Cs, A, B1, B2); +mime_decode_list([C3 | Cs], A, B1, B2) -> + case b64d(C3) of + B3 when is_integer(B3) -> + mime_decode_list(Cs, A, B1, B2, B3); + eq=B3 -> + mime_decode_list_after_eq(Cs, A, B1, B2, B3); + _ -> mime_decode_list(Cs, A, B1, B2) end. -decode_binary(<<C4:8, Cs/bits>>, A, B1, B2, B3) -> - case element(C4, ?DECODE_MAP) of - ws -> decode_binary(Cs, A, B1, B2, B3); - eq when B3 =:= eq -> only_ws_binary(Cs, <<A/binary,B1:6,(B2 bsr 4):2>>); - eq -> only_ws_binary(Cs, <<A/binary,B1:6,B2:6,(B3 bsr 2):4>>); - B4 -> decode_binary(Cs, <<A/binary,B1:6,B2:6,B3:6,B4:6>>) +mime_decode_list([0 | Cs], A, B1, B2, B3) -> + mime_decode_list(Cs, A, B1, B2, B3); +mime_decode_list([C4 | Cs], A, B1, B2, B3) -> + case b64d(C4) of + B4 when is_integer(B4) -> + mime_decode_list(Cs, <<A/bits,B1:6,B2:6,B3:6,B4:6>>); + eq -> + mime_decode_list_after_eq(Cs, A, B1, B2, B3); + _ -> mime_decode_list(Cs, A, B1, B2, B3) end. 
-only_ws_binary(<<>>, A) -> - A; -only_ws_binary(<<C:8, Cs/bits>>, A) -> - case element(C, ?DECODE_MAP) of - ws -> only_ws_binary(Cs, A); - _ -> erlang:error(function_clause) +mime_decode_list_after_eq([0 | Cs], A, B1, B2, B3) -> + mime_decode_list_after_eq(Cs, A, B1, B2, B3); +mime_decode_list_after_eq([C | Cs], A, B1, B2, B3) -> + case b64d(C) of + B when is_integer(B) -> + %% More valid data, skip the eq as invalid + case B3 of + eq -> mime_decode_list(Cs, A, B1, B2, B); + _ -> mime_decode_list(Cs, <<A/bits,B1:6,B2:6,B3:6,B:6>>) + end; + _ -> mime_decode_list_after_eq(Cs, A, B1, B2, B3) + end; +mime_decode_list_after_eq([], A, B1, B2, eq) -> + <<A/bits,B1:6,(B2 bsr 4):2>>; +mime_decode_list_after_eq([], A, B1, B2, B3) -> + <<A/bits,B1:6,B2:6,(B3 bsr 2):4>>. + +mime_decode_binary(<<0:8, Cs/bits>>, A) -> + mime_decode_binary(Cs, A); +mime_decode_binary(<<C1:8, Cs/bits>>, A) -> + case b64d(C1) of + B1 when is_integer(B1) -> mime_decode_binary(Cs, A, B1); + _ -> mime_decode_binary(Cs, A) % eq is padding + end; +mime_decode_binary(<<>>, A) -> + A. + +mime_decode_binary(<<0:8, Cs/bits>>, A, B1) -> + mime_decode_binary(Cs, A, B1); +mime_decode_binary(<<C2:8, Cs/bits>>, A, B1) -> + case b64d(C2) of + B2 when is_integer(B2) -> + mime_decode_binary(Cs, A, B1, B2); + _ -> mime_decode_binary(Cs, A, B1) % eq is padding end. -%% Skipping pad character if not at end of string. Also liberal about -%% excess padding and skipping of other illegal (non-base64 alphabet) -%% characters. 
See section 3.3 of RFC4648 -mime_decode_binary(Result, <<0:8,T/bits>>) -> - mime_decode_binary(Result, T); -mime_decode_binary(Result0, <<C:8,T/bits>>) -> - case element(C, ?DECODE_MAP) of - Bits when is_integer(Bits) -> - mime_decode_binary(<<Result0/bits,Bits:6>>, T); +mime_decode_binary(<<0:8, Cs/bits>>, A, B1, B2) -> + mime_decode_binary(Cs, A, B1, B2); +mime_decode_binary(<<C3:8, Cs/bits>>, A, B1, B2) -> + case b64d(C3) of + B3 when is_integer(B3) -> + mime_decode_binary(Cs, A, B1, B2, B3); + eq=B3 -> + mime_decode_binary_after_eq(Cs, A, B1, B2, B3); + _ -> mime_decode_binary(Cs, A, B1, B2) + end. + +mime_decode_binary(<<0:8, Cs/bits>>, A, B1, B2, B3) -> + mime_decode_binary(Cs, A, B1, B2, B3); +mime_decode_binary(<<C4:8, Cs/bits>>, A, B1, B2, B3) -> + case b64d(C4) of + B4 when is_integer(B4) -> + mime_decode_binary(Cs, <<A/bits,B1:6,B2:6,B3:6,B4:6>>); eq -> - mime_decode_binary_after_eq(Result0, T, false); - _ -> - mime_decode_binary(Result0, T) + mime_decode_binary_after_eq(Cs, A, B1, B2, B3); + _ -> mime_decode_binary(Cs, A, B1, B2, B3) + end. + +mime_decode_binary_after_eq(<<0:8, Cs/bits>>, A, B1, B2, B3) -> + mime_decode_binary_after_eq(Cs, A, B1, B2, B3); +mime_decode_binary_after_eq(<<C:8, Cs/bits>>, A, B1, B2, B3) -> + case b64d(C) of + B when is_integer(B) -> + %% More valid data, skip the eq as invalid + case B3 of + eq -> mime_decode_binary(Cs, A, B1, B2, B); + _ -> mime_decode_binary(Cs, <<A/bits,B1:6,B2:6,B3:6,B:6>>) + end; + _ -> mime_decode_binary_after_eq(Cs, A, B1, B2, B3) end; -mime_decode_binary(Result, _) -> - true = is_binary(Result), - Result. 
- -mime_decode_binary_after_eq(Result, <<0:8,T/bits>>, Eq) -> - mime_decode_binary_after_eq(Result, T, Eq); -mime_decode_binary_after_eq(Result0, <<C:8,T/bits>>, Eq) -> - case element(C, ?DECODE_MAP) of - bad -> - mime_decode_binary_after_eq(Result0, T, Eq); - ws -> - mime_decode_binary_after_eq(Result0, T, Eq); +mime_decode_binary_after_eq(<<>>, A, B1, B2, eq) -> + <<A/bits,B1:6,(B2 bsr 4):2>>; +mime_decode_binary_after_eq(<<>>, A, B1, B2, B3) -> + <<A/bits,B1:6,B2:6,(B3 bsr 2):4>>. + +mime_decode_list_to_string([0 | Cs]) -> + mime_decode_list_to_string(Cs); +mime_decode_list_to_string([C1 | Cs]) -> + case b64d(C1) of + B1 when is_integer(B1) -> mime_decode_list_to_string(Cs, B1); + _ -> mime_decode_list_to_string(Cs) % eq is padding + end; +mime_decode_list_to_string([]) -> + []. + +mime_decode_list_to_string([0 | Cs], B1) -> + mime_decode_list_to_string(Cs, B1); +mime_decode_list_to_string([C2 | Cs], B1) -> + case b64d(C2) of + B2 when is_integer(B2) -> + mime_decode_list_to_string(Cs, B1, B2); + _ -> mime_decode_list_to_string(Cs, B1) % eq is padding + end. + +mime_decode_list_to_string([0 | Cs], B1, B2) -> + mime_decode_list_to_string(Cs, B1, B2); +mime_decode_list_to_string([C3 | Cs], B1, B2) -> + case b64d(C3) of + B3 when is_integer(B3) -> + mime_decode_list_to_string(Cs, B1, B2, B3); + eq=B3 -> mime_decode_list_to_string_after_eq(Cs, B1, B2, B3); + _ -> mime_decode_list_to_string(Cs, B1, B2) + end. 
+ +mime_decode_list_to_string([0 | Cs], B1, B2, B3) -> + mime_decode_list_to_string(Cs, B1, B2, B3); +mime_decode_list_to_string([C4 | Cs], B1, B2, B3) -> + case b64d(C4) of + B4 when is_integer(B4) -> + Bits4x6 = (B1 bsl 18) bor (B2 bsl 12) bor (B3 bsl 6) bor B4, + Octet1 = Bits4x6 bsr 16, + Octet2 = (Bits4x6 bsr 8) band 16#ff, + Octet3 = Bits4x6 band 16#ff, + [Octet1, Octet2, Octet3 | mime_decode_list_to_string(Cs)]; eq -> - mime_decode_binary_after_eq(Result0, T, true); - Bits when is_integer(Bits) -> + mime_decode_list_to_string_after_eq(Cs, B1, B2, B3); + _ -> mime_decode_list_to_string(Cs, B1, B2, B3) + end. + +mime_decode_list_to_string_after_eq([0 | Cs], B1, B2, B3) -> + mime_decode_list_to_string_after_eq(Cs, B1, B2, B3); +mime_decode_list_to_string_after_eq([C | Cs], B1, B2, B3) -> + case b64d(C) of + B when is_integer(B) -> %% More valid data, skip the eq as invalid - mime_decode_binary(<<Result0/bits,Bits:6>>, T) + case B3 of + eq -> mime_decode_list_to_string(Cs, B1, B2, B); + _ -> + Bits4x6 = (B1 bsl 18) bor (B2 bsl 12) bor (B3 bsl 6) bor B, + Octet1 = Bits4x6 bsr 16, + Octet2 = (Bits4x6 bsr 8) band 16#ff, + Octet3 = Bits4x6 band 16#ff, + [Octet1, Octet2, Octet3 | mime_decode_list_to_string(Cs)] + end; + _ -> mime_decode_list_to_string_after_eq(Cs, B1, B2, B3) end; -mime_decode_binary_after_eq(Result0, <<>>, Eq) -> - %% No more valid data. - case bit_size(Result0) rem 8 of - 0 -> - %% '====' is not uncommon. - Result0; - 4 when Eq -> - %% enforce at least one more '=' only ignoring illegals and spacing - Split = byte_size(Result0) - 1, - <<Result:Split/bytes,_:4>> = Result0, - Result; - 2 -> - %% remove 2 bits - Split = byte_size(Result0) - 1, - <<Result:Split/bytes,_:2>> = Result0, - Result - end. +mime_decode_list_to_string_after_eq([], B1, B2, eq) -> + binary_to_list(<<B1:6,(B2 bsr 4):2>>); +mime_decode_list_to_string_after_eq([], B1, B2, B3) -> + binary_to_list(<<B1:6,B2:6,(B3 bsr 2):4>>). 
decode_list([C1 | Cs], A) -> - case element(C1, ?DECODE_MAP) of + case b64d(C1) of ws -> decode_list(Cs, A); B1 -> decode_list(Cs, A, B1) end; @@ -281,122 +332,130 @@ decode_list([], A) -> A. decode_list([C2 | Cs], A, B1) -> - case element(C2, ?DECODE_MAP) of + case b64d(C2) of ws -> decode_list(Cs, A, B1); B2 -> decode_list(Cs, A, B1, B2) end. decode_list([C3 | Cs], A, B1, B2) -> - case element(C3, ?DECODE_MAP) of + case b64d(C3) of ws -> decode_list(Cs, A, B1, B2); B3 -> decode_list(Cs, A, B1, B2, B3) end. decode_list([C4 | Cs], A, B1, B2, B3) -> - case element(C4, ?DECODE_MAP) of + case b64d(C4) of ws -> decode_list(Cs, A, B1, B2, B3); - eq when B3 =:= eq -> only_ws(Cs, <<A/binary,B1:6,(B2 bsr 4):2>>); - eq -> only_ws(Cs, <<A/binary,B1:6,B2:6,(B3 bsr 2):4>>); - B4 -> decode_list(Cs, <<A/binary,B1:6,B2:6,B3:6,B4:6>>) + eq when B3 =:= eq -> only_ws(Cs, <<A/bits,B1:6,(B2 bsr 4):2>>); + eq -> only_ws(Cs, <<A/bits,B1:6,B2:6,(B3 bsr 2):4>>); + B4 -> decode_list(Cs, <<A/bits,B1:6,B2:6,B3:6,B4:6>>) end. -only_ws([], A) -> - A; -only_ws([C | Cs], A) -> - case element(C, ?DECODE_MAP) of - ws -> only_ws(Cs, A); - _ -> erlang:error(function_clause) - end. +decode_binary(<<C1:8, Cs/bits>>, A) -> + case b64d(C1) of + ws -> decode_binary(Cs, A); + B1 -> decode_binary(Cs, A, B1) + end; +decode_binary(<<>>, A) -> + A. -decode([], A) -> A; -decode([$=,$=,C2,C1|Cs], A) -> - Bits2x6 = (b64d(C1) bsl 18) bor (b64d(C2) bsl 12), - Octet1 = Bits2x6 bsr 16, - decode(Cs, [Octet1|A]); -decode([$=,C3,C2,C1|Cs], A) -> - Bits3x6 = (b64d(C1) bsl 18) bor (b64d(C2) bsl 12) - bor (b64d(C3) bsl 6), - Octet1 = Bits3x6 bsr 16, - Octet2 = (Bits3x6 bsr 8) band 16#ff, - decode(Cs, [Octet1,Octet2|A]); -decode([C4,C3,C2,C1| Cs], A) -> - Bits4x6 = (b64d(C1) bsl 18) bor (b64d(C2) bsl 12) - bor (b64d(C3) bsl 6) bor b64d(C4), - Octet1 = Bits4x6 bsr 16, - Octet2 = (Bits4x6 bsr 8) band 16#ff, - Octet3 = Bits4x6 band 16#ff, - decode(Cs, [Octet1,Octet2,Octet3|A]). 
+decode_binary(<<C2:8, Cs/bits>>, A, B1) -> + case b64d(C2) of + ws -> decode_binary(Cs, A, B1); + B2 -> decode_binary(Cs, A, B1, B2) + end. -%%%======================================================================== -%%% Internal functions -%%%======================================================================== +decode_binary(<<C3:8, Cs/bits>>, A, B1, B2) -> + case b64d(C3) of + ws -> decode_binary(Cs, A, B1, B2); + B3 -> decode_binary(Cs, A, B1, B2, B3) + end. -strip_spaces([], A) -> A; -strip_spaces([$\s|Cs], A) -> strip_spaces(Cs, A); -strip_spaces([$\t|Cs], A) -> strip_spaces(Cs, A); -strip_spaces([$\r|Cs], A) -> strip_spaces(Cs, A); -strip_spaces([$\n|Cs], A) -> strip_spaces(Cs, A); -strip_spaces([C|Cs], A) -> strip_spaces(Cs, [C | A]). +decode_binary(<<C4:8, Cs/bits>>, A, B1, B2, B3) -> + case b64d(C4) of + ws -> decode_binary(Cs, A, B1, B2, B3); + eq when B3 =:= eq -> only_ws_binary(Cs, <<A/bits,B1:6,(B2 bsr 4):2>>); + eq -> only_ws_binary(Cs, <<A/bits,B1:6,B2:6,(B3 bsr 2):4>>); + B4 -> decode_binary(Cs, <<A/bits,B1:6,B2:6,B3:6,B4:6>>) + end. -%% Skipping pad character if not at end of string. Also liberal about -%% excess padding and skipping of other illegal (non-base64 alphabet) -%% characters. See section 3.3 of RFC4648 -strip_illegal([], A, _Cnt) -> +only_ws_binary(<<>>, A) -> A; -strip_illegal([0|Cs], A, Cnt) -> - strip_illegal(Cs, A, Cnt); -strip_illegal([C|Cs], A, Cnt) -> - case element(C, ?DECODE_MAP) of - bad -> - strip_illegal(Cs, A, Cnt); - ws -> - strip_illegal(Cs, A, Cnt); - eq -> - case {tail_contains_more(Cs, false), Cnt rem 4} of - {{[], _}, 0} -> - A; %% Ignore extra = - {{[], true}, 2} -> - [$=|[$=|A]]; %% 'XX==' - {{[], _}, 3} -> - [$=|A]; %% 'XXX=' - {{[H|T], _}, _} -> - %% more data, skip equals - strip_illegal(T, [H|A], Cnt+1) - end; - _ -> - strip_illegal(Cs, [C|A], Cnt+1) +only_ws_binary(<<C:8, Cs/bits>>, A) -> + case b64d(C) of + ws -> only_ws_binary(Cs, A) end. 
-%% Search the tail for more valid data and remember if we saw -%% another equals along the way. -tail_contains_more([], Eq) -> - {[], Eq}; -tail_contains_more(<<>>, Eq) -> - {<<>>, Eq}; -tail_contains_more([C|T]=More, Eq) -> - case element(C, ?DECODE_MAP) of - bad -> - tail_contains_more(T, Eq); - ws -> - tail_contains_more(T, Eq); - eq -> - tail_contains_more(T, true); - _ -> - {More, Eq} +decode_list_to_string([C1 | Cs]) -> + case b64d(C1) of + ws -> decode_list_to_string(Cs); + B1 -> decode_list_to_string(Cs, B1) end; -tail_contains_more(<<C:8,T/bits>> =More, Eq) -> - case element(C, ?DECODE_MAP) of - bad -> - tail_contains_more(T, Eq); - ws -> - tail_contains_more(T, Eq); - eq -> - tail_contains_more(T, true); - _ -> - {More, Eq} +decode_list_to_string([]) -> + []. + +decode_list_to_string([C2 | Cs], B1) -> + case b64d(C2) of + ws -> decode_list_to_string(Cs, B1); + B2 -> decode_list_to_string(Cs, B1, B2) + end. + +decode_list_to_string([C3 | Cs], B1, B2) -> + case b64d(C3) of + ws -> decode_list_to_string(Cs, B1, B2); + B3 -> decode_list_to_string(Cs, B1, B2, B3) + end. + +decode_list_to_string([C4 | Cs], B1, B2, B3) -> + case b64d(C4) of + ws -> + decode_list_to_string(Cs, B1, B2, B3); + eq when B3 =:= eq -> + only_ws(Cs, binary_to_list(<<B1:6,(B2 bsr 4):2>>)); + eq -> + only_ws(Cs, binary_to_list(<<B1:6,B2:6,(B3 bsr 2):4>>)); + B4 -> + Bits4x6 = (B1 bsl 18) bor (B2 bsl 12) bor (B3 bsl 6) bor B4, + Octet1 = Bits4x6 bsr 16, + Octet2 = (Bits4x6 bsr 8) band 16#ff, + Octet3 = Bits4x6 band 16#ff, + [Octet1, Octet2, Octet3 | decode_list_to_string(Cs)] + end. + +only_ws([], A) -> + A; +only_ws([C | Cs], A) -> + case b64d(C) of + ws -> only_ws(Cs, A) end. - + +%%%======================================================================== +%%% Internal functions +%%%======================================================================== + %% accessors +-compile({inline, [{b64d, 1}]}). +%% One-based decode map. 
+b64d(X) -> + element(X, + {bad,bad,bad,bad,bad,bad,bad,bad,ws,ws,bad,bad,ws,bad,bad, %1-15 + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, %16-31 + ws,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,62,bad,bad,bad,63, %32-47 + 52,53,54,55,56,57,58,59,60,61,bad,bad,bad,eq,bad,bad, %48-63 + bad,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14, + 15,16,17,18,19,20,21,22,23,24,25,bad,bad,bad,bad,bad, + bad,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40, + 41,42,43,44,45,46,47,48,49,50,51,bad,bad,bad,bad,bad, + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad, + bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad,bad}). + +-compile({inline, [{b64e, 1}]}). b64e(X) -> element(X+1, {$A, $B, $C, $D, $E, $F, $G, $H, $I, $J, $K, $L, $M, $N, @@ -404,9 +463,3 @@ b64e(X) -> $a, $b, $c, $d, $e, $f, $g, $h, $i, $j, $k, $l, $m, $n, $o, $p, $q, $r, $s, $t, $u, $v, $w, $x, $y, $z, $0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $+, $/}). - - -b64d(X) -> - b64d_ok(element(X, ?DECODE_MAP)). - -b64d_ok(I) when is_integer(I) -> I. 
diff --git a/lib/stdlib/src/erl_compile.erl b/lib/stdlib/src/erl_compile.erl index 18d7548fdc..f781312ca2 100644 --- a/lib/stdlib/src/erl_compile.erl +++ b/lib/stdlib/src/erl_compile.erl @@ -188,6 +188,8 @@ parse_dep_option("", T) -> {[makedep,{makedep_output,standard_io}],T}; parse_dep_option("D", T) -> {[makedep],T}; +parse_dep_option("MD", T) -> + {[makedep_side_effect],T}; parse_dep_option("F"++Opt, T0) -> {File,T} = get_option("MF", Opt, T0), {[makedep,{makedep_output,File}],T}; @@ -221,6 +223,7 @@ usage() -> "the dependencies"}, {"-MP","add a phony target for each dependency"}, {"-MD","same as -M -MT file (with default 'file')"}, + {"-MMD","generate dependencies as a side-effect"}, {"-o name","name output directory or file"}, {"-pa path","add path to the front of Erlang's code path"}, {"-pz path","add path to the end of Erlang's code path"}, diff --git a/lib/stdlib/src/erl_lint.erl b/lib/stdlib/src/erl_lint.erl index f58cb35cea..1930c462e8 100644 --- a/lib/stdlib/src/erl_lint.erl +++ b/lib/stdlib/src/erl_lint.erl @@ -144,6 +144,7 @@ value_option(Flag, Default, On, OnVal, Off, OffVal, Opts) -> :: dict:dict(ta(), #typeinfo{}), exp_types=gb_sets:empty() %Exported types :: gb_sets:set(ta()), + in_try_head=false :: boolean(), %In a try head. catch_scope = none %Inside/outside try or catch :: catch_scope() }). 
@@ -312,6 +313,10 @@ format_error({unused_var, V}) -> io_lib:format("variable ~w is unused", [V]); format_error({variable_in_record_def,V}) -> io_lib:format("variable ~w in record definition", [V]); +format_error({stacktrace_guard,V}) -> + io_lib:format("stacktrace variable ~w must not be used in a guard", [V]); +format_error({stacktrace_bound,V}) -> + io_lib:format("stacktrace variable ~w must not be previously bound", [V]); %% --- binaries --- format_error({undefined_bittype,Type}) -> io_lib:format("bit type ~tw undefined", [Type]); @@ -3218,11 +3223,11 @@ is_module_dialyzer_option(Option) -> try_clauses(Scs, Ccs, In, Vt, St0) -> {Csvt0,St1} = icrt_clauses(Scs, Vt, St0), - St2 = St1#lint{catch_scope=try_catch}, + St2 = St1#lint{catch_scope=try_catch,in_try_head=true}, {Csvt1,St3} = icrt_clauses(Ccs, Vt, St2), Csvt = Csvt0 ++ Csvt1, UpdVt = icrt_export(Csvt, Vt, In, St3), - {UpdVt,St3}. + {UpdVt,St3#lint{in_try_head=false}}. %% icrt_clauses(Clauses, In, ImportVarTable, State) -> %% {UpdVt,State}. @@ -3239,12 +3244,29 @@ icrt_clauses(Cs, Vt, St) -> mapfoldl(fun (C, St0) -> icrt_clause(C, Vt, St0) end, St, Cs). icrt_clause({clause,_Line,H,G,B}, Vt0, #lint{catch_scope=Scope}=St0) -> - {Hvt,Binvt,St1} = head(H, Vt0, St0), - Vt1 = vtupdate(Hvt, Binvt), - {Gvt,St2} = guard(G, vtupdate(Vt1, Vt0), St1), - Vt2 = vtupdate(Gvt, Vt1), - {Bvt,St3} = exprs(B, vtupdate(Vt2, Vt0), St2), - {vtupdate(Bvt, Vt2),St3#lint{catch_scope=Scope}}. + Vt1 = taint_stack_var(Vt0, H, St0), + {Hvt,Binvt,St1} = head(H, Vt1, St0), + Vt2 = vtupdate(Hvt, Binvt), + Vt3 = taint_stack_var(Vt2, H, St0), + {Gvt,St2} = guard(G, vtupdate(Vt3, Vt0), St1#lint{in_try_head=false}), + Vt4 = vtupdate(Gvt, Vt2), + {Bvt,St3} = exprs(B, vtupdate(Vt4, Vt0), St2), + {vtupdate(Bvt, Vt4),St3#lint{catch_scope=Scope}}. 
+ +taint_stack_var(Vt, Pat, #lint{in_try_head=true}) -> + [{tuple,_,[_,_,{var,_,Stk}]}] = Pat, + case Stk of + '_' -> + Vt; + _ -> + lists:map(fun({V,{bound,Used,Lines}}) when V =:= Stk -> + {V,{stacktrace,Used,Lines}}; + (B) -> + B + end, Vt) + end; +taint_stack_var(Vt, _Pat, #lint{in_try_head=false}) -> + Vt. icrt_export(Vts, Vt, {Tag,Attrs}, St) -> {_File,Loc} = loc(Attrs, St), @@ -3484,6 +3506,9 @@ pat_var(V, Line, Vt, Bvt, St) -> {[{V,{bound,used,Ls}}],[], %% As this is matching, exported vars are risky. add_warning(Line, {exported_var,V,From}, St)}; + {ok,{stacktrace,_Usage,Ls}} -> + {[{V,{bound,used,Ls}}],[], + add_error(Line, {stacktrace_bound,V}, St)}; error when St#lint.recdef_top -> {[],[{V,{bound,unused,[Line]}}], add_error(Line, {variable_in_record_def,V}, St)}; @@ -3541,6 +3566,9 @@ expr_var(V, Line, Vt, St) -> false -> {[{V,{{export,From},used,Ls}}],St} end; + {ok,{stacktrace,_Usage,Ls}} -> + {[{V,{bound,used,Ls}}], + add_error(Line, {stacktrace_guard,V}, St)}; error -> {[{V,{bound,used,[Line]}}], add_error(Line, {unbound_var,V}, St)} diff --git a/lib/stdlib/src/erl_parse.yrl b/lib/stdlib/src/erl_parse.yrl index 6e72d64acc..14ca24362e 100644 --- a/lib/stdlib/src/erl_parse.yrl +++ b/lib/stdlib/src/erl_parse.yrl @@ -29,6 +29,10 @@ clause_args clause_guard clause_body expr expr_100 expr_150 expr_160 expr_200 expr_300 expr_400 expr_500 expr_600 expr_700 expr_800 expr_max +pat_expr pat_expr_200 pat_expr_300 pat_expr_400 pat_expr_500 +pat_expr_600 pat_expr_700 pat_expr_800 +pat_expr_max map_pat_expr record_pat_expr +pat_argument_list pat_exprs list tail list_comprehension lc_expr lc_exprs binary_comprehension @@ -37,7 +41,7 @@ record_expr record_tuple record_field record_fields map_expr map_tuple map_field map_field_assoc map_field_exact map_fields map_key if_expr if_clause if_clauses case_expr cr_clause cr_clauses receive_expr fun_expr fun_clause fun_clauses atom_or_var integer_or_var -try_expr try_catch try_clause try_clauses +try_expr try_catch 
try_clause try_clauses try_opt_stacktrace function_call argument_list exprs guard atomic strings @@ -66,7 +70,7 @@ char integer float atom string var 'spec' 'callback' % helper dot. -Expect 2. +Expect 0. Rootsymbol form. @@ -210,7 +214,7 @@ function_clause -> atom clause_args clause_guard clause_body : {clause,?anno('$1'),element(3, '$1'),'$2','$3','$4'}. -clause_args -> argument_list : element(1, '$1'). +clause_args -> pat_argument_list : element(1, '$1'). clause_guard -> 'when' guard : '$2'. clause_guard -> '$empty' : []. @@ -275,6 +279,53 @@ expr_max -> receive_expr : '$1'. expr_max -> fun_expr : '$1'. expr_max -> try_expr : '$1'. +pat_expr -> pat_expr_200 '=' pat_expr : {match,?anno('$2'),'$1','$3'}. +pat_expr -> pat_expr_200 : '$1'. + +pat_expr_200 -> pat_expr_300 comp_op pat_expr_300 : + ?mkop2('$1', '$2', '$3'). +pat_expr_200 -> pat_expr_300 : '$1'. + +pat_expr_300 -> pat_expr_400 list_op pat_expr_300 : + ?mkop2('$1', '$2', '$3'). +pat_expr_300 -> pat_expr_400 : '$1'. + +pat_expr_400 -> pat_expr_400 add_op pat_expr_500 : + ?mkop2('$1', '$2', '$3'). +pat_expr_400 -> pat_expr_500 : '$1'. + +pat_expr_500 -> pat_expr_500 mult_op pat_expr_600 : + ?mkop2('$1', '$2', '$3'). +pat_expr_500 -> pat_expr_600 : '$1'. + +pat_expr_600 -> prefix_op pat_expr_700 : + ?mkop1('$1', '$2'). +pat_expr_600 -> map_pat_expr : '$1'. +pat_expr_600 -> pat_expr_700 : '$1'. + +pat_expr_700 -> record_pat_expr : '$1'. +pat_expr_700 -> pat_expr_800 : '$1'. + +pat_expr_800 -> pat_expr_max : '$1'. + +pat_expr_max -> var : '$1'. +pat_expr_max -> atomic : '$1'. +pat_expr_max -> list : '$1'. +pat_expr_max -> binary : '$1'. +pat_expr_max -> tuple : '$1'. +pat_expr_max -> '(' pat_expr ')' : '$2'. + +map_pat_expr -> '#' map_tuple : + {map, ?anno('$1'),'$2'}. +map_pat_expr -> pat_expr_max '#' map_tuple : + {map, ?anno('$2'),'$1','$3'}. +map_pat_expr -> map_pat_expr '#' map_tuple : + {map, ?anno('$2'),'$1','$3'}. + +record_pat_expr -> '#' atom '.' 
atom : + {record_index,?anno('$1'),element(3, '$2'),'$4'}. +record_pat_expr -> '#' atom record_tuple : + {record,?anno('$1'),element(3, '$2'),'$3'}. list -> '[' ']' : {nil,?anno('$1')}. list -> '[' expr tail : {cons,?anno('$1'),'$2','$3'}. @@ -397,6 +448,10 @@ case_expr -> 'case' expr 'of' cr_clauses 'end' : cr_clauses -> cr_clause : ['$1']. cr_clauses -> cr_clause ';' cr_clauses : ['$1' | '$3']. +%% FIXME: merl in syntax_tools depends on patterns in a 'case' being +%% full expressions. Therefore, we can't use pat_expr here. There +%% should be a better way. + cr_clause -> expr clause_guard clause_body : {clause,?anno('$1'),['$1'],'$2','$3'}. @@ -424,11 +479,11 @@ integer_or_var -> var : '$1'. fun_clauses -> fun_clause : ['$1']. fun_clauses -> fun_clause ';' fun_clauses : ['$1' | '$3']. -fun_clause -> argument_list clause_guard clause_body : +fun_clause -> pat_argument_list clause_guard clause_body : {Args,Anno} = '$1', {clause,Anno,'fun',Args,'$2','$3'}. -fun_clause -> var argument_list clause_guard clause_body : +fun_clause -> var pat_argument_list clause_guard clause_body : {clause,element(2, '$1'),element(3, '$1'),element(1, '$2'),'$3','$4'}. try_expr -> 'try' exprs 'of' cr_clauses try_catch : @@ -446,24 +501,31 @@ try_catch -> 'after' exprs 'end' : try_clauses -> try_clause : ['$1']. try_clauses -> try_clause ';' try_clauses : ['$1' | '$3']. -try_clause -> expr clause_guard clause_body : +try_clause -> pat_expr clause_guard clause_body : A = ?anno('$1'), {clause,A,[{tuple,A,[{atom,A,throw},'$1',{var,A,'_'}]}],'$2','$3'}. -try_clause -> atom ':' expr clause_guard clause_body : +try_clause -> atom ':' pat_expr try_opt_stacktrace clause_guard clause_body : A = ?anno('$1'), - {clause,A,[{tuple,A,['$1','$3',{var,A,'_'}]}],'$4','$5'}. -try_clause -> var ':' expr clause_guard clause_body : + {clause,A,[{tuple,A,['$1','$3',{var,A,'$4'}]}],'$5','$6'}. 
+try_clause -> var ':' pat_expr try_opt_stacktrace clause_guard clause_body : A = ?anno('$1'), - {clause,A,[{tuple,A,['$1','$3',{var,A,'_'}]}],'$4','$5'}. + {clause,A,[{tuple,A,['$1','$3',{var,A,'$4'}]}],'$5','$6'}. +try_opt_stacktrace -> ':' var : element(3, '$2'). +try_opt_stacktrace -> '$empty' : '_'. argument_list -> '(' ')' : {[],?anno('$1')}. argument_list -> '(' exprs ')' : {'$2',?anno('$1')}. +pat_argument_list -> '(' ')' : {[],?anno('$1')}. +pat_argument_list -> '(' pat_exprs ')' : {'$2',?anno('$1')}. exprs -> expr : ['$1']. exprs -> expr ',' exprs : ['$1' | '$3']. +pat_exprs -> pat_expr : ['$1']. +pat_exprs -> pat_expr ',' pat_exprs : ['$1' | '$3']. + guard -> exprs : ['$1']. guard -> exprs ';' guard : ['$1'|'$3']. diff --git a/lib/stdlib/src/gen.erl b/lib/stdlib/src/gen.erl index 33af0aed8f..4b1d448487 100644 --- a/lib/stdlib/src/gen.erl +++ b/lib/stdlib/src/gen.erl @@ -49,6 +49,7 @@ | {'logfile', string()}. -type option() :: {'timeout', timeout()} | {'debug', [debug_flag()]} + | {'hibernate_after', timeout()} | {'spawn_opt', [proc_lib:spawn_option()]}. -type options() :: [option()]. diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl index 5a4d2df2a6..e01bb7d85e 100644 --- a/lib/stdlib/src/string.erl +++ b/lib/stdlib/src/string.erl @@ -74,15 +74,16 @@ -export([to_upper/1, to_lower/1]). %% -import(lists,[member/2]). - -compile({no_auto_import,[length/1]}). +-compile({inline, [btoken/2, rev/1, append/2, stack/2, search_compile/1]}). +-define(ASCII_LIST(CP1,CP2), CP1 < 256, CP2 < 256, CP1 =/= $\r). -export_type([grapheme_cluster/0]). -type grapheme_cluster() :: char() | [char()]. -type direction() :: 'leading' | 'trailing'. --dialyzer({no_improper_lists, stack/2}). +-dialyzer({no_improper_lists, [stack/2, length_b/3]}). %%% BIFs internal (not documented) should not to be used outside of this module %%% May be removed -export([list_to_float/1, list_to_integer/1]). @@ -127,8 +128,10 @@ is_empty(_) -> false. 
%% Count the number of grapheme clusters in chardata -spec length(String::unicode:chardata()) -> non_neg_integer(). +length(<<CP1/utf8, Bin/binary>>) -> + length_b(Bin, CP1, 0); length(CD) -> - length_1(unicode_util:gc(CD), 0). + length_1(CD, 0). %% Convert a string to a list of grapheme clusters -spec to_graphemes(String::unicode:chardata()) -> [grapheme_cluster()]. @@ -176,6 +179,8 @@ equal(A, B, true, Norm) -> %% Reverse grapheme clusters -spec reverse(String::unicode:chardata()) -> [grapheme_cluster()]. +reverse(<<CP1/utf8, Rest/binary>>) -> + reverse_b(Rest, CP1, []); reverse(CD) -> reverse_1(CD, []). @@ -186,7 +191,10 @@ reverse(CD) -> Start :: non_neg_integer(), Slice :: unicode:chardata(). slice(CD, N) when is_integer(N), N >= 0 -> - slice_l(CD, N, is_binary(CD)). + case slice_l0(CD, N) of + [] when is_binary(CD) -> <<>>; + Res -> Res + end. -spec slice(String, Start, Length) -> Slice when String::unicode:chardata(), @@ -195,9 +203,15 @@ slice(CD, N) when is_integer(N), N >= 0 -> Slice :: unicode:chardata(). slice(CD, N, Length) when is_integer(N), N >= 0, is_integer(Length), Length > 0 -> - slice_trail(slice_l(CD, N, is_binary(CD)), Length); + case slice_l0(CD, N) of + [] when is_binary(CD) -> <<>>; + L -> slice_trail(L, Length) + end; slice(CD, N, infinity) -> - slice_l(CD, N, is_binary(CD)); + case slice_l0(CD, N) of + [] when is_binary(CD) -> <<>>; + Res -> Res + end; slice(CD, _, 0) -> case is_binary(CD) of true -> <<>>; @@ -256,18 +270,22 @@ trim(Str, Dir) -> Dir :: direction() | 'both', Characters :: [grapheme_cluster()]. trim(Str, _, []) -> Str; +trim(Str, leading, [Sep]) when is_list(Str), Sep < 256 -> + trim_ls(Str, Sep); trim(Str, leading, Sep) when is_list(Sep) -> - trim_l(Str, search_pattern(Sep)); -trim(Str, trailing, Sep) when is_list(Sep) -> - trim_t(Str, 0, search_pattern(Sep)); -trim(Str, both, Sep0) when is_list(Sep0) -> - Sep = search_pattern(Sep0), - trim_t(trim_l(Str,Sep), 0, Sep). 
+ trim_l(Str, Sep); +trim(Str, trailing, [Sep]) when is_list(Str), Sep < 256 -> + trim_ts(Str, Sep); +trim(Str, trailing, Seps0) when is_list(Seps0) -> + Seps = search_pattern(Seps0), + trim_t(Str, 0, Seps); +trim(Str, both, Sep) when is_list(Sep) -> + trim(trim(Str,leading,Sep), trailing, Sep). %% Delete trailing newlines or \r\n -spec chomp(String::unicode:chardata()) -> unicode:chardata(). chomp(Str) -> - trim_t(Str,0, {[[$\r,$\n],$\n], [$\r,$\n], [<<$\r>>,<<$\n>>]}). + trim(Str, trailing, [[$\r,$\n],$\n]). %% Split String into two parts where the leading part consists of Characters -spec take(String, Characters) -> {Leading, Trailing} when @@ -300,8 +318,7 @@ take(Str, [], Complement, Dir) -> {true, leading} -> {Str, Empty}; {true, trailing} -> {Empty, Str} end; -take(Str, Sep0, false, leading) -> - Sep = search_pattern(Sep0), +take(Str, Sep, false, leading) -> take_l(Str, Sep, []); take(Str, Sep0, true, leading) -> Sep = search_pattern(Sep0), @@ -461,6 +478,7 @@ replace(String, SearchPattern, Replacement, Where) -> SeparatorList::[grapheme_cluster()]) -> [unicode:chardata()]. lexemes([], _) -> []; +lexemes(Str, []) -> [Str]; lexemes(Str, Seps0) when is_list(Seps0) -> Seps = search_pattern(Seps0), lexemes_m(Str, Seps, []). @@ -494,13 +512,13 @@ find(String, SearchPattern, leading) -> find(String, SearchPattern, trailing) -> find_r(String, unicode:characters_to_list(SearchPattern), nomatch). -%% Fetch first codepoint and return rest in tail +%% Fetch first grapheme cluster and return rest in tail -spec next_grapheme(String::unicode:chardata()) -> maybe_improper_list(grapheme_cluster(),unicode:chardata()) | {error,unicode:chardata()}. next_grapheme(CD) -> unicode_util:gc(CD). -%% Fetch first grapheme cluster and return rest in tail +%% Fetch first codepoint and return rest in tail -spec next_codepoint(String::unicode:chardata()) -> maybe_improper_list(char(),unicode:chardata()) | {error,unicode:chardata()}. 
@@ -508,10 +526,23 @@ next_codepoint(CD) -> unicode_util:cp(CD). %% Internals -length_1([_|Rest], N) -> - length_1(unicode_util:gc(Rest), N+1); -length_1([], N) -> - N. +length_1([CP1|[CP2|_]=Cont], N) when ?ASCII_LIST(CP1,CP2) -> + length_1(Cont, N+1); +length_1(Str, N) -> + case unicode_util:gc(Str) of + [] -> N; + [_|Rest] -> length_1(Rest, N+1) + end. + +length_b(<<CP2/utf8, Rest/binary>>, CP1, N) + when ?ASCII_LIST(CP1,CP2) -> + length_b(Rest, CP2, N+1); +length_b(Bin0, CP1, N) -> + [_|Bin1] = unicode_util:gc([CP1|Bin0]), + case unicode_util:cp(Bin1) of + [] -> N+1; + [CP3|Bin] -> length_b(Bin, CP3, N+1) + end. equal_1([A|AR], [B|BR]) when is_integer(A), is_integer(B) -> A =:= B andalso equal_1(AR, BR); @@ -550,29 +581,66 @@ equal_norm_nocase(A0, B0, Norm) -> {L1,L2} when is_list(L1), is_list(L2) -> false end. +reverse_1([CP1|[CP2|_]=Cont], Acc) when ?ASCII_LIST(CP1,CP2) -> + reverse_1(Cont, [CP1|Acc]); reverse_1(CD, Acc) -> case unicode_util:gc(CD) of [GC|Rest] -> reverse_1(Rest, [GC|Acc]); [] -> Acc end. -slice_l(CD, N, Binary) when N > 0 -> +reverse_b(<<CP2/utf8, Rest/binary>>, CP1, Acc) + when ?ASCII_LIST(CP1,CP2) -> + reverse_b(Rest, CP2, [CP1|Acc]); +reverse_b(Bin0, CP1, Acc) -> + [GC|Bin1] = unicode_util:gc([CP1|Bin0]), + case unicode_util:cp(Bin1) of + [] -> [GC|Acc]; + [CP3|Bin] -> reverse_b(Bin, CP3, [GC|Acc]) + end. + +slice_l0(<<CP1/utf8, Bin/binary>>, N) when N > 0 -> + slice_lb(Bin, CP1, N); +slice_l0(L, N) -> + slice_l(L, N). + +slice_l([CP1|[CP2|_]=Cont], N) when ?ASCII_LIST(CP1,CP2),N > 0 -> + slice_l(Cont, N-1); +slice_l(CD, N) when N > 0 -> case unicode_util:gc(CD) of - [_|Cont] -> slice_l(Cont, N-1, Binary); - [] when Binary -> <<>>; + [_|Cont] -> slice_l(Cont, N-1); [] -> [] end; -slice_l(Cont, 0, Binary) -> - case is_empty(Cont) of - true when Binary -> <<>>; - _ -> Cont +slice_l(Cont, 0) -> + Cont. 
+ +slice_lb(<<CP2/utf8, Bin/binary>>, CP1, N) when ?ASCII_LIST(CP1,CP2), N > 1 -> + slice_lb(Bin, CP2, N-1); +slice_lb(Bin, CP1, N) -> + [_|Rest] = unicode_util:gc([CP1|Bin]), + if N > 1 -> + case unicode_util:cp(Rest) of + [CP2|Cont] -> slice_lb(Cont, CP2, N-1); + [] -> <<>> + end; + N =:= 1 -> + Rest end. +slice_trail(Orig, N) when is_binary(Orig) -> + case Orig of + <<CP1/utf8, Bin/binary>> when N > 0 -> + Length = slice_bin(Bin, CP1, N), + Sz = byte_size(Orig) - Length, + <<Keep:Sz/binary, _/binary>> = Orig, + Keep; + _ -> <<>> + end; slice_trail(CD, N) when is_list(CD) -> - slice_list(CD, N); -slice_trail(CD, N) when is_binary(CD) -> - slice_bin(CD, N, CD). + slice_list(CD, N). +slice_list([CP1|[CP2|_]=Cont], N) when ?ASCII_LIST(CP1,CP2),N > 0 -> + [CP1|slice_list(Cont, N-1)]; slice_list(CD, N) when N > 0 -> case unicode_util:gc(CD) of [GC|Cont] -> append(GC, slice_list(Cont, N-1)); @@ -581,17 +649,16 @@ slice_list(CD, N) when N > 0 -> slice_list(_, 0) -> []. -slice_bin(CD, N, Orig) when N > 0 -> - case unicode_util:gc(CD) of - [_|Cont] -> slice_bin(Cont, N-1, Orig); - [] -> Orig +slice_bin(<<CP2/utf8, Bin/binary>>, CP1, N) when ?ASCII_LIST(CP1,CP2), N > 0 -> + slice_bin(Bin, CP2, N-1); +slice_bin(CD, CP1, N) when N > 0 -> + [_|Bin] = unicode_util:gc([CP1|CD]), + case unicode_util:cp(Bin) of + [CP2|Cont] -> slice_bin(Cont, CP2, N-1); + [] -> 0 end; -slice_bin([], 0, Orig) -> - Orig; -slice_bin(CD, 0, Orig) -> - Sz = byte_size(Orig) - byte_size(CD), - <<Keep:Sz/binary, _/binary>> = Orig, - Keep. +slice_bin(CD, CP1, 0) -> + byte_size(CD)+byte_size(<<CP1/utf8>>). uppercase_list(CPs0) -> case unicode_util:uppercase(CPs0) of @@ -641,16 +708,31 @@ casefold_bin(CPs0, Acc) -> [] -> Acc end. - +%% Fast path for ascii searching for one character in lists +trim_ls([CP1|[CP2|_]=Cont]=Str, Sep) + when ?ASCII_LIST(CP1,CP2) -> + case Sep of + CP1 -> trim_ls(Cont, Sep); + _ -> Str + end; +trim_ls(Str, Sep) -> + trim_l(Str, [Sep]). 
+ +trim_l([CP1|[CP2|_]=Cont]=Str, Sep) + when ?ASCII_LIST(CP1,CP2) -> + case lists:member(CP1, Sep) of + true -> trim_l(Cont, Sep); + false -> Str + end; trim_l([Bin|Cont0], Sep) when is_binary(Bin) -> case bin_search_inv(Bin, Cont0, Sep) of {nomatch, Cont} -> trim_l(Cont, Sep); Keep -> Keep end; -trim_l(Str, {GCs, _, _}=Sep) when is_list(Str) -> +trim_l(Str, Sep) when is_list(Str) -> case unicode_util:gc(Str) of [C|Cs] -> - case lists:member(C, GCs) of + case lists:member(C, Sep) of true -> trim_l(Cs, Sep); false -> Str end; @@ -662,15 +744,51 @@ trim_l(Bin, Sep) when is_binary(Bin) -> [Keep] -> Keep end. -trim_t([Bin|Cont0], N, Sep) when is_binary(Bin) -> +%% Fast path for ascii searching for one character in lists +trim_ts([Sep|Cs1]=Str, Sep) -> + case Cs1 of + [] -> []; + [CP2|_] when ?ASCII_LIST(Sep,CP2) -> + Tail = trim_ts(Cs1, Sep), + case is_empty(Tail) of + true -> []; + false -> [Sep|Tail] + end; + _ -> + trim_t(Str, 0, search_pattern([Sep])) + end; +trim_ts([CP|Cont],Sep) when is_integer(CP) -> + [CP|trim_ts(Cont, Sep)]; +trim_ts(Str, Sep) -> + trim_t(Str, 0, search_pattern([Sep])). 
+ +trim_t([CP1|Cont]=Cs0, _, {GCs,CPs,_}=Seps) when is_integer(CP1) -> + case lists:member(CP1, CPs) of + true -> + [GC|Cs1] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> + Tail = trim_t(Cs1, 0, Seps), + case is_empty(Tail) of + true -> []; + false -> append(GC,Tail) + end; + false -> + append(GC,trim_t(Cs1, 0, Seps)) + end; + false -> + [CP1|trim_t(Cont, 0, Seps)] + end; +trim_t([Bin|Cont0], N, {GCs,_,_}=Seps0) when is_binary(Bin) -> <<_:N/binary, Rest/binary>> = Bin, - case bin_search(Rest, Cont0, Sep) of + Seps = search_compile(Seps0), + case bin_search(Rest, Cont0, Seps) of {nomatch,_} -> - stack(Bin, trim_t(Cont0, 0, Sep)); + stack(Bin, trim_t(Cont0, 0, Seps)); [SepStart|Cont1] -> - case bin_search_inv(SepStart, Cont1, Sep) of + case bin_search_inv(SepStart, Cont1, GCs) of {nomatch, Cont} -> - Tail = trim_t(Cont, 0, Sep), + Tail = trim_t(Cont, 0, Seps), case is_empty(Tail) of true -> KeepSz = byte_size(Bin) - byte_size(SepStart), @@ -682,67 +800,69 @@ trim_t([Bin|Cont0], N, Sep) when is_binary(Bin) -> end; [NonSep|Cont] when is_binary(NonSep) -> KeepSz = byte_size(Bin) - byte_size(NonSep), - trim_t([Bin|Cont], KeepSz, Sep) + trim_t([Bin|Cont], KeepSz, Seps) end end; -trim_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> - case unicode_util:cp(Str) of - [CP|Cs] -> - case lists:member(CP, CPs) of +trim_t(Str, 0, {GCs,_,_}=Seps) when is_list(Str) -> + case unicode_util:gc(Str) of + [GC|Cs1] -> + case lists:member(GC, GCs) of true -> - [GC|Cs1] = unicode_util:gc(Str), - case lists:member(GC, GCs) of - true -> - Tail = trim_t(Cs1, 0, Sep), - case is_empty(Tail) of - true -> []; - false -> append(GC,Tail) - end; - false -> - append(GC,trim_t(Cs1, 0, Sep)) + Tail = trim_t(Cs1, 0, Seps), + case is_empty(Tail) of + true -> []; + false -> append(GC,Tail) end; false -> - append(CP,trim_t(Cs, 0, Sep)) + append(GC,trim_t(Cs1, 0, Seps)) end; [] -> [] end; -trim_t(Bin, N, Sep) when is_binary(Bin) -> +trim_t(Bin, N, {GCs,_,_}=Seps0) when is_binary(Bin) -> 
<<_:N/binary, Rest/binary>> = Bin, - case bin_search(Rest, Sep) of + Seps = search_compile(Seps0), + case bin_search(Rest, [], Seps) of {nomatch,_} -> Bin; [SepStart] -> - case bin_search_inv(SepStart, [], Sep) of + case bin_search_inv(SepStart, [], GCs) of {nomatch,_} -> KeepSz = byte_size(Bin) - byte_size(SepStart), <<Keep:KeepSz/binary, _/binary>> = Bin, Keep; [NonSep] -> KeepSz = byte_size(Bin) - byte_size(NonSep), - trim_t(Bin, KeepSz, Sep) + trim_t(Bin, KeepSz, Seps) end end. -take_l([Bin|Cont0], Sep, Acc) when is_binary(Bin) -> - case bin_search_inv(Bin, Cont0, Sep) of + +take_l([CP1|[CP2|_]=Cont]=Str, Seps, Acc) + when ?ASCII_LIST(CP1,CP2) -> + case lists:member(CP1, Seps) of + true -> take_l(Cont, Seps, [CP1|Acc]); + false -> {rev(Acc), Str} + end; +take_l([Bin|Cont0], Seps, Acc) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, Seps) of {nomatch, Cont} -> Used = cp_prefix(Cont0, Cont), - take_l(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]); + take_l(Cont, Seps, [unicode:characters_to_binary([Bin|Used])|Acc]); [Bin1|_]=After when is_binary(Bin1) -> First = byte_size(Bin) - byte_size(Bin1), <<Keep:First/binary, _/binary>> = Bin, {btoken(Keep,Acc), After} end; -take_l(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) -> +take_l(Str, Seps, Acc) when is_list(Str) -> case unicode_util:gc(Str) of [C|Cs] -> - case lists:member(C, GCs) of - true -> take_l(Cs, Sep, append(rev(C),Acc)); + case lists:member(C, Seps) of + true -> take_l(Cs, Seps, append(rev(C),Acc)); false -> {rev(Acc), Str} end; [] -> {rev(Acc), []} end; -take_l(Bin, Sep, Acc) when is_binary(Bin) -> - case bin_search_inv(Bin, [], Sep) of +take_l(Bin, Seps, Acc) when is_binary(Bin) -> + case bin_search_inv(Bin, [], Seps) of {nomatch,_} -> {btoken(Bin, Acc), <<>>}; [After] -> @@ -751,27 +871,41 @@ take_l(Bin, Sep, Acc) when is_binary(Bin) -> {btoken(Keep, Acc), After} end. 
-take_lc([Bin|Cont0], Sep, Acc) when is_binary(Bin) -> - case bin_search(Bin, Cont0, Sep) of + +take_lc([CP1|Cont]=Str0, {GCs,CPs,_}=Seps, Acc) when is_integer(CP1) -> + case lists:member(CP1, CPs) of + true -> + [GC|Str] = unicode_util:gc(Str0), + case lists:member(GC, GCs) of + false -> take_lc(Str, Seps, append(rev(GC),Acc)); + true -> {rev(Acc), Str0} + end; + false -> + take_lc(Cont, Seps, append(CP1,Acc)) + end; +take_lc([Bin|Cont0], Seps0, Acc) when is_binary(Bin) -> + Seps = search_compile(Seps0), + case bin_search(Bin, Cont0, Seps) of {nomatch, Cont} -> Used = cp_prefix(Cont0, Cont), - take_lc(Cont, Sep, [unicode:characters_to_binary([Bin|Used])|Acc]); + take_lc(Cont, Seps, [unicode:characters_to_binary([Bin|Used])|Acc]); [Bin1|_]=After when is_binary(Bin1) -> First = byte_size(Bin) - byte_size(Bin1), <<Keep:First/binary, _/binary>> = Bin, {btoken(Keep,Acc), After} end; -take_lc(Str, {GCs, _, _}=Sep, Acc) when is_list(Str) -> +take_lc(Str, {GCs,_,_}=Seps, Acc) when is_list(Str) -> case unicode_util:gc(Str) of [C|Cs] -> case lists:member(C, GCs) of - false -> take_lc(Cs, Sep, append(rev(C),Acc)); + false -> take_lc(Cs, Seps, append(rev(C),Acc)); true -> {rev(Acc), Str} end; [] -> {rev(Acc), []} end; -take_lc(Bin, Sep, Acc) when is_binary(Bin) -> - case bin_search(Bin, [], Sep) of +take_lc(Bin, Seps0, Acc) when is_binary(Bin) -> + Seps = search_compile(Seps0), + case bin_search(Bin, [], Seps) of {nomatch,_} -> {btoken(Bin, Acc), <<>>}; [After] -> @@ -780,148 +914,192 @@ take_lc(Bin, Sep, Acc) when is_binary(Bin) -> {btoken(Keep, Acc), After} end. 
-take_t([Bin|Cont0], N, Sep) when is_binary(Bin) -> + +take_t([CP1|Cont]=Str0, _, {GCs,CPs,_}=Seps) when is_integer(CP1) -> + case lists:member(CP1, CPs) of + true -> + [GC|Str] = unicode_util:gc(Str0), + case lists:member(GC, GCs) of + true -> + {Head, Tail} = take_t(Str, 0, Seps), + case is_empty(Head) of + true -> {Head, append(GC,Tail)}; + false -> {append(GC,Head), Tail} + end; + false -> + {Head, Tail} = take_t(Str, 0, Seps), + {append(GC,Head), Tail} + end; + false -> + {Head, Tail} = take_t(Cont, 0, Seps), + {[CP1|Head], Tail} + end; +take_t([Bin|Cont0], N, {GCs,_,_}=Seps0) when is_binary(Bin) -> <<_:N/binary, Rest/binary>> = Bin, - case bin_search(Rest, Cont0, Sep) of + Seps = search_compile(Seps0), + case bin_search(Rest, Cont0, Seps) of {nomatch,Cont} -> Used = cp_prefix(Cont0, Cont), - {Head, Tail} = take_t(Cont, 0, Sep), + {Head, Tail} = take_t(Cont, 0, Seps), {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail}; [SepStart|Cont1] -> - case bin_search_inv(SepStart, Cont1, Sep) of + case bin_search_inv(SepStart, Cont1, GCs) of {nomatch, Cont} -> - {Head, Tail} = take_t(Cont, 0, Sep), + {Head, Tail} = take_t(Cont, 0, Seps), Used = cp_prefix(Cont0, Cont), - case equal(Tail, Cont) of + case is_empty(Head) of true -> KeepSz = byte_size(Bin) - byte_size(SepStart), <<Keep:KeepSz/binary, End/binary>> = Bin, - {stack(Keep,Head), stack(stack(End,Used),Tail)}; + {Keep, stack(stack(End,Used),Tail)}; false -> {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail} end; [NonSep|Cont] when is_binary(NonSep) -> KeepSz = byte_size(Bin) - byte_size(NonSep), - take_t([Bin|Cont], KeepSz, Sep) + take_t([Bin|Cont], KeepSz, Seps) end end; -take_t(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> - case unicode_util:cp(Str) of - [CP|Cs] -> - case lists:member(CP, CPs) of +take_t(Str, 0, {GCs,_,_}=Seps) when is_list(Str) -> + case unicode_util:gc(Str) of + [GC|Cs1] -> + case lists:member(GC, GCs) of true -> - [GC|Cs1] = unicode_util:gc(Str), - case lists:member(GC, 
GCs) of - true -> - {Head, Tail} = take_t(Cs1, 0, Sep), - case equal(Tail, Cs1) of - true -> {Head, append(GC,Tail)}; - false -> {append(GC,Head), Tail} - end; - false -> - {Head, Tail} = take_t(Cs, 0, Sep), - {append(CP,Head), Tail} + {Head, Tail} = take_t(Cs1, 0, Seps), + case is_empty(Head) of + true -> {Head, append(GC,Tail)}; + false -> {append(GC,Head), Tail} end; false -> - {Head, Tail} = take_t(Cs, 0, Sep), - {append(CP,Head), Tail} + {Head, Tail} = take_t(Cs1, 0, Seps), + {append(GC,Head), Tail} end; [] -> {[],[]} end; -take_t(Bin, N, Sep) when is_binary(Bin) -> +take_t(Bin, N, {GCs,_,_}=Seps0) when is_binary(Bin) -> <<_:N/binary, Rest/binary>> = Bin, - case bin_search(Rest, Sep) of + Seps = search_compile(Seps0), + case bin_search(Rest, [], Seps) of {nomatch,_} -> {Bin, <<>>}; [SepStart] -> - case bin_search_inv(SepStart, [], Sep) of + case bin_search_inv(SepStart, [], GCs) of {nomatch,_} -> KeepSz = byte_size(Bin) - byte_size(SepStart), <<Before:KeepSz/binary, End/binary>> = Bin, {Before, End}; [NonSep] -> KeepSz = byte_size(Bin) - byte_size(NonSep), - take_t(Bin, KeepSz, Sep) + take_t(Bin, KeepSz, Seps) end end. 
-take_tc([Bin|Cont0], N, Sep) when is_binary(Bin) -> +take_tc([CP1|[CP2|_]=Cont], _, {GCs,_,_}=Seps) when ?ASCII_LIST(CP1,CP2) -> + case lists:member(CP1, GCs) of + false -> + {Head, Tail} = take_tc(Cont, 0, Seps), + case is_empty(Head) of + true -> {Head, append(CP1,Tail)}; + false -> {append(CP1,Head), Tail} + end; + true -> + {Head, Tail} = take_tc(Cont, 0, Seps), + {append(CP1,Head), Tail} + end; +take_tc([Bin|Cont0], N, {GCs,_,_}=Seps0) when is_binary(Bin) -> <<_:N/binary, Rest/binary>> = Bin, - case bin_search_inv(Rest, Cont0, Sep) of + case bin_search_inv(Rest, Cont0, GCs) of {nomatch,Cont} -> Used = cp_prefix(Cont0, Cont), - {Head, Tail} = take_tc(Cont, 0, Sep), + {Head, Tail} = take_tc(Cont, 0, Seps0), {stack(unicode:characters_to_binary([Bin|Used]), Head), Tail}; [SepStart|Cont1] -> - case bin_search(SepStart, Cont1, Sep) of + Seps = search_compile(Seps0), + case bin_search(SepStart, Cont1, Seps) of {nomatch, Cont} -> - {Head, Tail} = take_tc(Cont, 0, Sep), + {Head, Tail} = take_tc(Cont, 0, Seps), Used = cp_prefix(Cont0, Cont), - case equal(Tail, Cont) of + case is_empty(Head) of true -> KeepSz = byte_size(Bin) - byte_size(SepStart), <<Keep:KeepSz/binary, End/binary>> = Bin, - {stack(Keep,Head), stack(stack(End,Used),Tail)}; + {Keep, stack(stack(End,Used),Tail)}; false -> {stack(unicode:characters_to_binary([Bin|Used]),Head), Tail} end; [NonSep|Cont] when is_binary(NonSep) -> KeepSz = byte_size(Bin) - byte_size(NonSep), - take_tc([Bin|Cont], KeepSz, Sep) + take_tc([Bin|Cont], KeepSz, Seps) end end; -take_tc(Str, 0, {GCs,CPs,_}=Sep) when is_list(Str) -> - case unicode_util:cp(Str) of - [CP|Cs] -> - case lists:member(CP, CPs) of - true -> - [GC|Cs1] = unicode_util:gc(Str), - case lists:member(GC, GCs) of - false -> - {Head, Tail} = take_tc(Cs1, 0, Sep), - case equal(Tail, Cs1) of - true -> {Head, append(GC,Tail)}; - false -> {append(GC,Head), Tail} - end; - true -> - {Head, Tail} = take_tc(Cs1, 0, Sep), - {append(GC,Head), Tail} - end; +take_tc(Str, 0, 
{GCs,_,_}=Seps) when is_list(Str) -> + case unicode_util:gc(Str) of + [GC|Cs1] -> + case lists:member(GC, GCs) of false -> - {Head, Tail} = take_tc(Cs, 0, Sep), - case equal(Tail, Cs) of - true -> {Head, append(CP,Tail)}; - false -> {append(CP,Head), Tail} - end + {Head, Tail} = take_tc(Cs1, 0, Seps), + case is_empty(Head) of + true -> {Head, append(GC,Tail)}; + false -> {append(GC,Head), Tail} + end; + true -> + {Head, Tail} = take_tc(Cs1, 0, Seps), + {append(GC,Head), Tail} end; [] -> {[],[]} end; -take_tc(Bin, N, Sep) when is_binary(Bin) -> +take_tc(Bin, N, {GCs,_,_}=Seps0) when is_binary(Bin) -> <<_:N/binary, Rest/binary>> = Bin, - case bin_search_inv(Rest, [], Sep) of + case bin_search_inv(Rest, [], GCs) of {nomatch,_} -> {Bin, <<>>}; [SepStart] -> - case bin_search(SepStart, [], Sep) of + Seps = search_compile(Seps0), + case bin_search(SepStart, [], Seps) of {nomatch,_} -> KeepSz = byte_size(Bin) - byte_size(SepStart), <<Before:KeepSz/binary, End/binary>> = Bin, {Before, End}; [NonSep] -> KeepSz = byte_size(Bin) - byte_size(NonSep), - take_tc(Bin, KeepSz, Sep) + take_tc(Bin, KeepSz, Seps) end end. -prefix_1(Cs, []) -> Cs; -prefix_1(Cs, [_]=Pre) -> - prefix_2(unicode_util:gc(Cs), Pre); -prefix_1(Cs, Pre) -> - prefix_2(unicode_util:cp(Cs), Pre). - -prefix_2([C|Cs], [C|Pre]) -> - prefix_1(Cs, Pre); -prefix_2(_, _) -> - nomatch. +prefix_1(Cs0, [GC]) -> + case unicode_util:gc(Cs0) of + [GC|Cs] -> Cs; + _ -> nomatch + end; +prefix_1([CP|Cs], [Pre|PreR]) when is_integer(CP) -> + case CP =:= Pre of + true -> prefix_1(Cs,PreR); + false -> nomatch + end; +prefix_1(<<CP/utf8, Cs/binary>>, [Pre|PreR]) -> + case CP =:= Pre of + true -> prefix_1(Cs,PreR); + false -> nomatch + end; +prefix_1(Cs0, [Pre|PreR]) -> + case unicode_util:cp(Cs0) of + [Pre|Cs] -> prefix_1(Cs,PreR); + _ -> nomatch + end. 
+split_1([CP1|Cs]=Cs0, [C|_]=Needle, _, Where, Curr, Acc) when is_integer(CP1) -> + case CP1=:=C of + true -> + case prefix_1(Cs0, Needle) of + nomatch -> split_1(Cs, Needle, 0, Where, append(C,Curr), Acc); + Rest when Where =:= leading -> + [rev(Curr), Rest]; + Rest when Where =:= trailing -> + split_1(Cs, Needle, 0, Where, [C|Curr], [rev(Curr), Rest]); + Rest when Where =:= all -> + split_1(Rest, Needle, 0, Where, [], [rev(Curr)|Acc]) + end; + false -> + split_1(Cs, Needle, 0, Where, append(CP1,Curr), Acc) + end; split_1([Bin|Cont0], Needle, Start, Where, Curr0, Acc) when is_binary(Bin) -> case bin_search_str(Bin, Start, Cont0, Needle) of @@ -981,32 +1159,50 @@ split_1(Bin, [_C|_]=Needle, Start, Where, Curr0, Acc) -> end end. -lexemes_m([Bin|Cont0], Seps, Ts) when is_binary(Bin) -> - case bin_search_inv(Bin, Cont0, Seps) of +lexemes_m([CP|_]=Cs0, {GCs,CPs,_}=Seps, Ts) when is_integer(CP) -> + case lists:member(CP, CPs) of + true -> + [GC|Cs2] = unicode_util:gc(Cs0), + case lists:member(GC, GCs) of + true -> + lexemes_m(Cs2, Seps, Ts); + false -> + {Lexeme,Rest} = lexeme_pick(Cs0, Seps, []), + lexemes_m(Rest, Seps, [Lexeme|Ts]) + end; + false -> + {Lexeme,Rest} = lexeme_pick(Cs0, Seps, []), + lexemes_m(Rest, Seps, [Lexeme|Ts]) + end; +lexemes_m([Bin|Cont0], {GCs,_,_}=Seps0, Ts) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, GCs) of {nomatch,Cont} -> - lexemes_m(Cont, Seps, Ts); + lexemes_m(Cont, Seps0, Ts); Cs -> + Seps = search_compile(Seps0), {Lexeme,Rest} = lexeme_pick(Cs, Seps, []), lexemes_m(Rest, Seps, [Lexeme|Ts]) end; -lexemes_m(Cs0, {GCs, _, _}=Seps, Ts) when is_list(Cs0) -> +lexemes_m(Cs0, {GCs, _, _}=Seps0, Ts) when is_list(Cs0) -> case unicode_util:gc(Cs0) of [C|Cs] -> case lists:member(C, GCs) of true -> - lexemes_m(Cs, Seps, Ts); + lexemes_m(Cs, Seps0, Ts); false -> + Seps = search_compile(Seps0), {Lexeme,Rest} = lexeme_pick(Cs0, Seps, []), lexemes_m(Rest, Seps, [Lexeme|Ts]) end; [] -> lists:reverse(Ts) end; -lexemes_m(Bin, Seps, Ts) when 
is_binary(Bin) -> - case bin_search_inv(Bin, [], Seps) of +lexemes_m(Bin, {GCs,_,_}=Seps0, Ts) when is_binary(Bin) -> + case bin_search_inv(Bin, [], GCs) of {nomatch,_} -> lists:reverse(Ts); [Cs] -> + Seps = search_compile(Seps0), {Lexeme,Rest} = lexeme_pick(Cs, Seps, []), lexemes_m(Rest, Seps, add_non_empty(Lexeme,Ts)) end. @@ -1037,7 +1233,7 @@ lexeme_pick(Cs0, {GCs, CPs, _} = Seps, Tkn) when is_list(Cs0) -> true -> [GC|Cs2] = unicode_util:gc(Cs0), case lists:member(GC, GCs) of - true -> {rev(Tkn), Cs0}; + true -> {rev(Tkn), Cs2}; false -> lexeme_pick(Cs2, Seps, append(rev(GC),Tkn)) end; false -> @@ -1047,7 +1243,7 @@ lexeme_pick(Cs0, {GCs, CPs, _} = Seps, Tkn) when is_list(Cs0) -> {rev(Tkn), []} end; lexeme_pick(Bin, Seps, Tkn) when is_binary(Bin) -> - case bin_search(Bin, Seps) of + case bin_search(Bin, [], Seps) of {nomatch,_} -> {btoken(Bin,Tkn), []}; [Left] -> @@ -1056,35 +1252,38 @@ lexeme_pick(Bin, Seps, Tkn) when is_binary(Bin) -> {btoken(Lexeme, Tkn), Left} end. -nth_lexeme_m([Bin|Cont0], Seps, N) when is_binary(Bin) -> - case bin_search_inv(Bin, Cont0, Seps) of +nth_lexeme_m([Bin|Cont0], {GCs,_,_}=Seps0, N) when is_binary(Bin) -> + case bin_search_inv(Bin, Cont0, GCs) of {nomatch,Cont} -> - nth_lexeme_m(Cont, Seps, N); + nth_lexeme_m(Cont, Seps0, N); Cs when N > 1 -> - Rest = lexeme_skip(Cs, Seps), - nth_lexeme_m(Rest, Seps, N-1); + Rest = lexeme_skip(Cs, Seps0), + nth_lexeme_m(Rest, Seps0, N-1); Cs -> + Seps = search_compile(Seps0), {Lexeme,_} = lexeme_pick(Cs, Seps, []), Lexeme end; -nth_lexeme_m(Cs0, {GCs, _, _}=Seps, N) when is_list(Cs0) -> +nth_lexeme_m(Cs0, {GCs, _, _}=Seps0, N) when is_list(Cs0) -> case unicode_util:gc(Cs0) of [C|Cs] -> case lists:member(C, GCs) of true -> - nth_lexeme_m(Cs, Seps, N); + nth_lexeme_m(Cs, Seps0, N); false when N > 1 -> - Cs1 = lexeme_skip(Cs, Seps), - nth_lexeme_m(Cs1, Seps, N-1); + Cs1 = lexeme_skip(Cs, Seps0), + nth_lexeme_m(Cs1, Seps0, N-1); false -> + Seps = search_compile(Seps0), {Lexeme,_} = lexeme_pick(Cs0, 
Seps, []), Lexeme end; [] -> [] end; -nth_lexeme_m(Bin, Seps, N) when is_binary(Bin) -> - case bin_search_inv(Bin, [], Seps) of +nth_lexeme_m(Bin, {GCs,_,_}=Seps0, N) when is_binary(Bin) -> + Seps = search_compile(Seps0), + case bin_search_inv(Bin, [], GCs) of [Cs] when N > 1 -> Cs1 = lexeme_skip(Cs, Seps), nth_lexeme_m(Cs1, Seps, N-1); @@ -1100,16 +1299,17 @@ lexeme_skip([CP|Cs1]=Cs0, {GCs,CPs,_}=Seps) when is_integer(CP) -> true -> [GC|Cs2] = unicode_util:gc(Cs0), case lists:member(GC, GCs) of - true -> Cs0; + true -> Cs2; false -> lexeme_skip(Cs2, Seps) end; false -> lexeme_skip(Cs1, Seps) end; -lexeme_skip([Bin|Cont0], Seps) when is_binary(Bin) -> +lexeme_skip([Bin|Cont0], Seps0) when is_binary(Bin) -> + Seps = search_compile(Seps0), case bin_search(Bin, Cont0, Seps) of {nomatch,_} -> lexeme_skip(Cont0, Seps); - Cs -> Cs + Cs -> tl(unicode_util:gc(Cs)) end; lexeme_skip(Cs0, {GCs, CPs, _} = Seps) when is_list(Cs0) -> case unicode_util:cp(Cs0) of @@ -1118,7 +1318,7 @@ lexeme_skip(Cs0, {GCs, CPs, _} = Seps) when is_list(Cs0) -> true -> [GC|Cs2] = unicode_util:gc(Cs0), case lists:member(GC, GCs) of - true -> Cs0; + true -> Cs2; false -> lexeme_skip(Cs2, Seps) end; false -> @@ -1127,12 +1327,23 @@ lexeme_skip(Cs0, {GCs, CPs, _} = Seps) when is_list(Cs0) -> [] -> [] end; -lexeme_skip(Bin, Seps) when is_binary(Bin) -> - case bin_search(Bin, Seps) of +lexeme_skip(Bin, Seps0) when is_binary(Bin) -> + Seps = search_compile(Seps0), + case bin_search(Bin, [], Seps) of {nomatch,_} -> <<>>; - [Left] -> Left + [Left] -> tl(unicode_util:gc(Left)) end. +find_l([C1|Cs]=Cs0, [C|_]=Needle) when is_integer(C1) -> + case C1 of + C -> + case prefix_1(Cs0, Needle) of + nomatch -> find_l(Cs, Needle); + _ -> Cs0 + end; + _ -> + find_l(Cs, Needle) + end; find_l([Bin|Cont0], Needle) when is_binary(Bin) -> case bin_search_str(Bin, 0, Cont0, Needle) of {nomatch, _, Cont} -> @@ -1157,6 +1368,16 @@ find_l(Bin, Needle) -> {_Before, [Cs], _After} -> Cs end. 
+find_r([Cp|Cs]=Cs0, [C|_]=Needle, Res) when is_integer(Cp) -> + case Cp of + C -> + case prefix_1(Cs0, Needle) of + nomatch -> find_r(Cs, Needle, Res); + _ -> find_r(Cs, Needle, Cs0) + end; + _ -> + find_r(Cs, Needle, Res) + end; find_r([Bin|Cont0], Needle, Res) when is_binary(Bin) -> case bin_search_str(Bin, 0, Cont0, Needle) of {nomatch,_,Cont} -> @@ -1227,11 +1448,6 @@ cp_prefix_1(Orig, Until, Cont) -> %% Binary special -bin_search(Bin, Seps) -> - bin_search(Bin, [], Seps). - -bin_search(_Bin, Cont, {[],_,_}) -> - {nomatch, Cont}; bin_search(Bin, Cont, {Seps,_,BP}) -> bin_search_loop(Bin, 0, BP, Cont, Seps). @@ -1239,10 +1455,14 @@ bin_search(Bin, Cont, {Seps,_,BP}) -> %% i.e. å in nfd form $a "COMBINING RING ABOVE" %% and PREPEND characters like "ARABIC NUMBER SIGN" 1536 <<216,128>> %% combined with other characters are currently ignored. +search_pattern({_,_,_}=P) -> P; search_pattern(Seps) -> CPs = search_cp(Seps), - Bin = bin_pattern(CPs), - {Seps, CPs, Bin}. + {Seps, CPs, undefined}. + +search_compile({Sep, CPs, undefined}) -> + {Sep, CPs, binary:compile_pattern(bin_pattern(CPs))}; +search_compile({_,_,_}=Compiled) -> Compiled. 
search_cp([CP|Seps]) when is_integer(CP) -> [CP|search_cp(Seps)]; @@ -1263,9 +1483,21 @@ bin_search_loop(Bin0, Start, BinSeps, Cont, Seps) -> case binary:match(Bin, BinSeps) of nomatch -> {nomatch,Cont}; + {Where, _CL} when Cont =:= [] -> + <<_:Where/binary, Cont1/binary>> = Bin, + [GC|Cont2] = unicode_util:gc(Cont1), + case lists:member(GC, Seps) of + false when Cont2 =:= [] -> + {nomatch, []}; + false -> + Next = byte_size(Bin0) - byte_size(Cont2), + bin_search_loop(Bin0, Next, BinSeps, Cont, Seps); + true -> + [Cont1] + end; {Where, _CL} -> <<_:Where/binary, Cont0/binary>> = Bin, - Cont1 = stack(Cont0, Cont), + Cont1 = [Cont0|Cont], [GC|Cont2] = unicode_util:gc(Cont1), case lists:member(GC, Seps) of false -> @@ -1273,55 +1505,108 @@ bin_search_loop(Bin0, Start, BinSeps, Cont, Seps) -> [BinR|Cont] when is_binary(BinR) -> Next = byte_size(Bin0) - byte_size(BinR), bin_search_loop(Bin0, Next, BinSeps, Cont, Seps); - BinR when is_binary(BinR), Cont =:= [] -> - Next = byte_size(Bin0) - byte_size(BinR), - bin_search_loop(Bin0, Next, BinSeps, Cont, Seps); _ -> {nomatch, Cont2} end; - true when is_list(Cont1) -> - Cont1; true -> - [Cont1] + Cont1 end end. -bin_search_inv(Bin, Cont, {[], _, _}) -> - [Bin|Cont]; -bin_search_inv(Bin, Cont, {[Sep], _, _}) -> - bin_search_inv_1([Bin|Cont], Sep); -bin_search_inv(Bin, Cont, {Seps, _, _}) -> - bin_search_inv_n([Bin|Cont], Seps). - -bin_search_inv_1([<<>>|CPs], _) -> - {nomatch, CPs}; -bin_search_inv_1(CPs = [Bin0|Cont], Sep) when is_binary(Bin0) -> - case unicode_util:gc(CPs) of - [Sep|Bin] when is_binary(Bin), Cont =:= [] -> - bin_search_inv_1([Bin], Sep); - [Sep|[Bin|Cont]=Cs] when is_binary(Bin) -> - bin_search_inv_1(Cs, Sep); - [Sep|Cs] -> - {nomatch, Cs}; - _ -> CPs - end. +bin_search_inv(<<>>, Cont, _) -> + {nomatch, Cont}; +bin_search_inv(Bin, Cont, [Sep]) -> + bin_search_inv_1(Bin, Cont, Sep); +bin_search_inv(Bin, Cont, Seps) -> + bin_search_inv_n(Bin, Cont, Seps). 
+ +bin_search_inv_1(<<CP1/utf8, BinRest/binary>>=Bin0, Cont, Sep) -> + case BinRest of + <<CP2/utf8, _/binary>> when ?ASCII_LIST(CP1, CP2) -> + case CP1 of + Sep -> bin_search_inv_1(BinRest, Cont, Sep); + _ -> [Bin0|Cont] + end; + _ when Cont =:= [] -> + case unicode_util:gc(Bin0) of + [Sep|Bin] -> bin_search_inv_1(Bin, Cont, Sep); + _ -> [Bin0|Cont] + end; + _ -> + case unicode_util:gc([Bin0|Cont]) of + [Sep|[Bin|Cont]] when is_binary(Bin) -> + bin_search_inv_1(Bin, Cont, Sep); + [Sep|Cs] -> + {nomatch, Cs}; + _ -> [Bin0|Cont] + end + end; +bin_search_inv_1(<<>>, Cont, _Sep) -> + {nomatch, Cont}; +bin_search_inv_1([], Cont, _Sep) -> + {nomatch, Cont}. -bin_search_inv_n([<<>>|CPs], _) -> - {nomatch, CPs}; -bin_search_inv_n([Bin0|Cont]=CPs, Seps) when is_binary(Bin0) -> - [C|Cs0] = unicode_util:gc(CPs), - case {lists:member(C, Seps), Cs0} of - {true, Cs} when is_binary(Cs), Cont =:= [] -> - bin_search_inv_n([Cs], Seps); - {true, [Bin|Cont]=Cs} when is_binary(Bin) -> - bin_search_inv_n(Cs, Seps); - {true, Cs} -> {nomatch, Cs}; - {false, _} -> CPs - end. +bin_search_inv_n(<<CP1/utf8, BinRest/binary>>=Bin0, Cont, Seps) -> + case BinRest of + <<CP2/utf8, _/binary>> when ?ASCII_LIST(CP1, CP2) -> + case lists:member(CP1,Seps) of + true -> bin_search_inv_n(BinRest, Cont, Seps); + false -> [Bin0|Cont] + end; + _ when Cont =:= [] -> + [GC|Bin] = unicode_util:gc(Bin0), + case lists:member(GC, Seps) of + true -> bin_search_inv_n(Bin, Cont, Seps); + false -> [Bin0|Cont] + end; + _ -> + [GC|Cs0] = unicode_util:gc([Bin0|Cont]), + case lists:member(GC, Seps) of + false -> [Bin0|Cont]; + true -> + case Cs0 of + [Bin|Cont] when is_binary(Bin) -> + bin_search_inv_n(Bin, Cont, Seps); + _ -> + {nomatch, Cs0} + end + end + end; +bin_search_inv_n(<<>>, Cont, _Sep) -> + {nomatch, Cont}; +bin_search_inv_n([], Cont, _Sep) -> + {nomatch, Cont}. 
+ +bin_search_str(Bin0, Start, [], SearchCPs) -> + Compiled = binary:compile_pattern(unicode:characters_to_binary(SearchCPs)), + bin_search_str_1(Bin0, Start, Compiled, SearchCPs); bin_search_str(Bin0, Start, Cont, [CP|_]=SearchCPs) -> + First = binary:compile_pattern(<<CP/utf8>>), + bin_search_str_2(Bin0, Start, Cont, First, SearchCPs). + +bin_search_str_1(Bin0, Start, First, SearchCPs) -> + <<_:Start/binary, Bin/binary>> = Bin0, + case binary:match(Bin, First) of + nomatch -> {nomatch, byte_size(Bin0), []}; + {Where0, _} -> + Where = Start+Where0, + <<Keep:Where/binary, Cs0/binary>> = Bin0, + case prefix_1(Cs0, SearchCPs) of + nomatch -> + <<_/utf8, Cs/binary>> = Cs0, + KeepSz = byte_size(Bin0) - byte_size(Cs), + bin_search_str_1(Bin0, KeepSz, First, SearchCPs); + [] -> + {Keep, [Cs0], <<>>}; + Rest -> + {Keep, [Cs0], Rest} + end + end. + +bin_search_str_2(Bin0, Start, Cont, First, SearchCPs) -> <<_:Start/binary, Bin/binary>> = Bin0, - case binary:match(Bin, <<CP/utf8>>) of + case binary:match(Bin, First) of nomatch -> {nomatch, byte_size(Bin0), Cont}; {Where0, _} -> Where = Start+Where0, @@ -1330,7 +1615,7 @@ bin_search_str(Bin0, Start, Cont, [CP|_]=SearchCPs) -> case prefix_1(stack(Cs0,Cont), SearchCPs) of nomatch when is_binary(Cs) -> KeepSz = byte_size(Bin0) - byte_size(Cs), - bin_search_str(Bin0, KeepSz, Cont, SearchCPs); + bin_search_str_2(Bin0, KeepSz, Cont, First, SearchCPs); nomatch -> {nomatch, Where, stack([GC|Cs],Cont)}; [] -> diff --git a/lib/stdlib/src/uri_string.erl b/lib/stdlib/src/uri_string.erl index 22212da222..a84679c595 100644 --- a/lib/stdlib/src/uri_string.erl +++ b/lib/stdlib/src/uri_string.erl @@ -226,7 +226,8 @@ %%------------------------------------------------------------------------- %% External API %%------------------------------------------------------------------------- --export([normalize/1, parse/1, +-export([compose_query/1, compose_query/2, + dissect_query/1, normalize/1, parse/1, recompose/1, transcode/2]). 
-export_type([error/0, uri_map/0, uri_string/0]). @@ -381,6 +382,76 @@ transcode(URIString, Options) when is_list(URIString) -> end. +%%------------------------------------------------------------------------- +%% Functions for working with the query part of a URI as a list +%% of key/value pairs. +%% HTML5 - 4.10.22.6 URL-encoded form data +%%------------------------------------------------------------------------- + +%%------------------------------------------------------------------------- +%% Compose urlencoded query string from a list of unescaped key/value pairs. +%% (application/x-www-form-urlencoded encoding algorithm) +%%------------------------------------------------------------------------- +-spec compose_query(QueryList) -> QueryString when + QueryList :: [{uri_string(), uri_string()}], + QueryString :: uri_string() + | error(). +compose_query(List) -> + compose_query(List, [{encoding, utf8}]). + + +-spec compose_query(QueryList, Options) -> QueryString when + QueryList :: [{uri_string(), uri_string()}], + Options :: [{encoding, atom()}], + QueryString :: uri_string() + | error(). +compose_query([],_Options) -> + []; +compose_query(List, Options) -> + try compose_query(List, Options, false, <<>>) + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. +%% +compose_query([{Key,Value}|Rest], Options, IsList, Acc) -> + Separator = get_separator(Rest), + K = form_urlencode(Key, Options), + V = form_urlencode(Value, Options), + IsListNew = IsList orelse is_list(Key) orelse is_list(Value), + compose_query(Rest, Options, IsListNew, <<Acc/binary,K/binary,"=",V/binary,Separator/binary>>); +compose_query([], _Options, IsList, Acc) -> + case IsList of + true -> convert_to_list(Acc, utf8); + false -> Acc + end. + + +%%------------------------------------------------------------------------- +%% Dissect a query string into a list of unescaped key/value pairs. 
+%% (application/x-www-form-urlencoded decoding algorithm) +%%------------------------------------------------------------------------- +-spec dissect_query(QueryString) -> QueryList when + QueryString :: uri_string(), + QueryList :: [{uri_string(), uri_string()}] + | error(). +dissect_query(<<>>) -> + []; +dissect_query([]) -> + []; +dissect_query(QueryString) when is_list(QueryString) -> + try + B = convert_to_binary(QueryString, utf8, utf8), + dissect_query_key(B, true, [], <<>>, <<>>) + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end; +dissect_query(QueryString) -> + try dissect_query_key(QueryString, false, [], <<>>, <<>>) + catch + throw:{error, Atom, RestData} -> {error, Atom, RestData} + end. + + %%%======================================================================== %%% Internal functions %%%======================================================================== @@ -585,6 +656,7 @@ maybe_add_path(Map) -> end. + -spec parse_scheme(binary(), uri_map()) -> {binary(), uri_map()}. parse_scheme(?STRING_REST($:, Rest), URI) -> {_, URI1} = parse_hier(Rest, URI), @@ -1673,6 +1745,161 @@ percent_encode_segment(Segment) -> %%------------------------------------------------------------------------- +%% Helper functions for compose_query +%%------------------------------------------------------------------------- + +%% Returns separator to be used between key-value pairs +get_separator(L) when length(L) =:= 0 -> + <<>>; +get_separator(_L) -> + <<"&">>. 


%% HTML5 - 4.10.22.6 URL-encoded form data - encoding
%%
%% form_urlencode(Cs, Options) -> binary()
%%   Cs      - list or binary holding one unescaped key or value
%%   Options - [{encoding, latin1 | utf8 | unicode}]
%%
%% Returns the form-urlencoded bytes of Cs.
%% Throws {error, invalid_encoding, Encoding} for any other encoding and
%% {error, invalid_input, Cs} when Cs is neither a list nor a binary or
%% Options does not have the shape above.
%%
%% Lists are first passed through convert_to_binary/3 (defined elsewhere in
%% this module; presumably a validating transcoder - confirm). Binaries are
%% encoded as they are, so for utf8/unicode no validation of the binary
%% contents takes place here.
form_urlencode(Cs, [{encoding, latin1}]) when is_list(Cs) ->
    B = convert_to_binary(Cs, utf8, utf8),
    html5_byte_encode(base10_encode(B));
form_urlencode(Cs, [{encoding, latin1}]) when is_binary(Cs) ->
    html5_byte_encode(base10_encode(Cs));
%% The encoding alternatives are grouped with orelse on purpose: in a guard
%% "A, B; C" means "(A and B) or C", which would let any term through
%% whenever Encoding is unicode.
form_urlencode(Cs, [{encoding, Encoding}])
  when is_list(Cs), (Encoding =:= utf8 orelse Encoding =:= unicode) ->
    B = convert_to_binary(Cs, utf8, Encoding),
    html5_byte_encode(B);
form_urlencode(Cs, [{encoding, Encoding}])
  when is_binary(Cs), (Encoding =:= utf8 orelse Encoding =:= unicode) ->
    html5_byte_encode(Cs);
form_urlencode(Cs, [{encoding, Encoding}]) when is_list(Cs); is_binary(Cs) ->
    throw({error,invalid_encoding, Encoding});
form_urlencode(Cs, _) ->
    throw({error,invalid_input, Cs}).


%% For each character in the entry's name and value that cannot be expressed using
%% the selected character encoding, replace the character by a string consisting of
%% a U+0026 AMPERSAND character (&), a "#" (U+0023) character, one or more ASCII
%% digits representing the Unicode code point of the character in base ten, and
%% finally a ";" (U+003B) character.
%%
%% The input is read as UTF-8. Code points up to 255 are written as a single
%% byte, larger ones as the "&#NNNN;" reference described above.
base10_encode(Cs) ->
    base10_encode(Cs, <<>>).
%%
base10_encode(<<>>, Acc) ->
    Acc;
base10_encode(<<H/utf8,T/binary>>, Acc) when H > 255 ->
    Base10 = integer_to_binary(H),
    base10_encode(T, <<Acc/binary,"&#",Base10/binary,$;>>);
base10_encode(<<H/utf8,T/binary>>, Acc) ->
    base10_encode(T, <<Acc/binary,H>>);
%% What is left does not start with a well-formed UTF-8 sequence. Report it
%% the same way html5_byte_encode/2 reports bad input instead of failing
%% with function_clause.
base10_encode(Rest, _Acc) ->
    throw({error,invalid_input, Rest}).


%% Form-urlencodes every byte of B; see html5_byte_encode/2.
html5_byte_encode(B) ->
    html5_byte_encode(B, <<>>).
%%
%% Walks the input one byte at a time and appends the encoded form of each
%% byte to Out: a space becomes "+", a byte accepted by is_url_char/1 is
%% copied unchanged, and any other byte becomes "%" followed by its high and
%% low nibble, each mapped through ?DEC2HEX. Anything that is not a binary
%% made of whole bytes is rejected with an invalid_input throw.
html5_byte_encode(<<>>, Out) ->
    Out;
html5_byte_encode(<<$\s,Rest/binary>>, Out) ->
    html5_byte_encode(Rest, <<Out/binary,$+>>);
html5_byte_encode(<<Byte,Rest/binary>>, Out) ->
    case is_url_char(Byte) of
        false ->
            Hi = Byte bsr 4,
            Lo = Byte band 16#0F,
            html5_byte_encode(Rest, <<Out/binary,$%,(?DEC2HEX(Hi)),(?DEC2HEX(Lo))>>);
        true ->
            html5_byte_encode(Rest, <<Out/binary,Byte>>)
    end;
html5_byte_encode(Other, _Out) ->
    throw({error,invalid_input, Other}).


%% Return true if the input char can appear unescaped in a form-urlencoded
%% string. Allowed characters:
%%   "*" (0x2A), "-" (0x2D), "." (0x2E), "_" (0x5F),
%%   "0".."9" (0x30 to 0x39), "A".."Z" (0x41 to 0x5A), "a".."z" (0x61 to 0x7A)
is_url_char($*) -> true;
is_url_char($-) -> true;
is_url_char($.) -> true;
is_url_char($_) -> true;
is_url_char(C) when $0 =< C, C =< $9 -> true;
is_url_char(C) when $A =< C, C =< $Z -> true;
is_url_char(C) when $a =< C, C =< $z -> true;
is_url_char(_) -> false.


%%-------------------------------------------------------------------------
%% Helper functions for dissect_query
%%-------------------------------------------------------------------------

%% Accumulates the raw bytes of a key until "=" is seen, then hands over to
%% dissect_query_value/5. The two bytes "&#" are kept as part of the key
%% rather than being read as a pair separator. A lone "&" inside a key, or
%% input that ends (or stops being whole bytes) before "=", is reported with
%% a missing_value throw.
dissect_query_key(<<$=,Rest/binary>>, IsList, Pairs, Key, Value) ->
    dissect_query_value(Rest, IsList, Pairs, Key, Value);
dissect_query_key(<<"&#",Rest/binary>>, IsList, Pairs, Key, Value) ->
    dissect_query_key(Rest, IsList, Pairs, <<Key/binary,"&#">>, Value);
dissect_query_key(<<$&,_/binary>>, _IsList, _Pairs, _Key, _Value) ->
    throw({error, missing_value, "&"});
dissect_query_key(<<Byte,Rest/binary>>, IsList, Pairs, Key, Value) ->
    dissect_query_key(Rest, IsList, Pairs, <<Key/binary,Byte>>, Value);
dissect_query_key(Other, _, _, _, _) ->
    throw({error, missing_value, Other}).


%% Accumulates the raw bytes of a value. An "&" closes the current pair and
%% starts a new key; the end of the input closes the last pair and returns
%% all pairs in the order they appeared.
dissect_query_value(<<$&,Rest/binary>>, IsList, Pairs, Key, Value) ->
    Pair = decode_query_pair(IsList, Key, Value),
    dissect_query_key(Rest, IsList, [Pair|Pairs], <<>>, <<>>);
dissect_query_value(<<Byte,Rest/binary>>, IsList, Pairs, Key, Value) ->
    dissect_query_value(Rest, IsList, Pairs, Key, <<Value/binary,Byte>>);
dissect_query_value(<<>>, IsList, Pairs, Key, Value) ->
    Pair = decode_query_pair(IsList, Key, Value),
    lists:reverse([Pair|Pairs]).

%% Decodes the key first and the value second, so a malformed key is the
%% one reported when both are malformed.
decode_query_pair(IsList, RawKey, RawValue) ->
    DecodedKey = form_urldecode(IsList, RawKey),
    DecodedValue = form_urldecode(IsList, RawValue),
    {DecodedKey, DecodedValue}.


%% Form-urldecode input based on RFC 1866 [8.2.1]
%%
%% The first two clauses are the entry points: the boolean says whether the
%% decoded binary is additionally passed to convert_to_list/2 (true) or
%% returned as a binary (false). The remaining clauses are the byte loop,
%% taking the remaining input and the accumulated output: "+" becomes a
%% space and "%XY" becomes the byte with hex value XY.
form_urldecode(true, Encoded) ->
    convert_to_list(form_urldecode(false, Encoded), utf8);
form_urldecode(false, Encoded) ->
    base10_decode(form_urldecode(Encoded, <<>>));
form_urldecode(<<>>, Out) ->
    Out;
form_urldecode(<<$+,Rest/binary>>, Out) ->
    form_urldecode(Rest, <<Out/binary,$\s>>);
form_urldecode(<<$%,C0,C1,Rest/binary>> = Input, Out) ->
    case is_hex_digit(C0) andalso is_hex_digit(C1) of
        false ->
            throw({error, invalid_percent_encoding, convert_to_list(Input, utf8)});
        true ->
            Byte = ?HEX2DEC(C0)*16+?HEX2DEC(C1),
            form_urldecode(Rest, <<Out/binary, Byte>>)
    end;
form_urldecode(<<Char/utf8,Rest/binary>>, Out) ->
    form_urldecode(Rest, <<Out/binary,Char/utf8>>);
form_urldecode(<<Byte,_/binary>>, _Out) ->
    throw({error, invalid_character, [Byte]}).

%% Replaces every "&#NNNN;" reference in a UTF-8 binary with the UTF-8
%% encoding of code point NNNN and copies all other characters unchanged.
base10_decode(Encoded) ->
    base10_decode(Encoded, <<>>).
%
base10_decode(<<>>, Out) ->
    Out;
base10_decode(<<"&#",Rest/binary>>, Out) ->
    base10_decode_unicode(Rest, Out);
base10_decode(<<Char/utf8,Rest/binary>>, Out) ->
    base10_decode(Rest, <<Out/binary,Char/utf8>>);
base10_decode(<<Byte,_/binary>>, _) ->
    throw({error, invalid_input, [Byte]}).


%% Reads the decimal digits following "&#", starting from code point 0.
base10_decode_unicode(Digits, Out) ->
    base10_decode_unicode(Digits, 0, Out).
%%
%% base10_decode_unicode(Input, Codepoint, Acc)
%%   Input     - what follows "&#" (and any digits already consumed)
%%   Codepoint - value of the decimal digits read so far
%%   Acc       - decoded output so far
%%
%% Consumes decimal digits up to the terminating ";", appends the UTF-8
%% encoding of the resulting code point to Acc and continues with
%% base10_decode/2. Every malformed reference is reported with an
%% {error, invalid_input, _} throw: a non-digit before ";", a number that is
%% not a Unicode scalar value, or input that ends before ";".
base10_decode_unicode(<<H/utf8,T/binary>>, Codepoint, Acc) when $0 =< H, H =< $9 ->
    Res = Codepoint * 10 + (H - $0),
    base10_decode_unicode(T, Res, Acc);
%% Only 0..16#D7FF and 16#E000..16#10FFFF can be written as a utf8 segment;
%% anything else would raise badarg when building the binary.
base10_decode_unicode(<<$;,T/binary>>, Codepoint, Acc)
  when Codepoint < 16#D800;
       Codepoint > 16#DFFF, Codepoint =< 16#10FFFF ->
    base10_decode(T, <<Acc/binary,Codepoint/utf8>>);
base10_decode_unicode(<<$;,_/binary>>, Codepoint, _) ->
    throw({error, invalid_input, integer_to_list(Codepoint)});
base10_decode_unicode(<<H,_/binary>>, _, _) ->
    throw({error, invalid_input, [H]});
%% Input ended before the closing ";".
base10_decode_unicode(<<>>, _, _) ->
    throw({error, invalid_input, <<>>}).


%%-------------------------------------------------------------------------
%% Helper functions for normalize
%%-------------------------------------------------------------------------