diff options
-rw-r--r-- | src/cow_uri.erl | 202 |
1 files changed, 78 insertions, 124 deletions
diff --git a/src/cow_uri.erl b/src/cow_uri.erl index 4a50240..9d46ae4 100644 --- a/src/cow_uri.erl +++ b/src/cow_uri.erl @@ -14,26 +14,10 @@ -module(cow_uri). --include("cow_inline.hrl"). - -export([urldecode/1]). -export([urlencode/1]). -%% Decode a percent encoded string. (RFC3986 2.1) -%% -%% Inspiration for some of the optimisations done here come -%% from the new `json` module as it was in mid-2024. -%% -%% Possible input includes: -%% -%% * nothing encoded (no % character): -%% We want to return the binary as-is to avoid an allocation. -%% -%% * small number of encoded characters: -%% We can "skip" words of text. -%% -%% * mostly encoded characters (non-ascii languages) -%% We can decode characters in bulk. +-include("cow_inline.hrl"). -define(IS_PLAIN(C), ( (C =:= $!) orelse (C =:= $$) orelse (C =:= $&) orelse (C =:= $') orelse @@ -58,6 +42,24 @@ (C =:= $y) orelse (C =:= $z) orelse (C =:= $~) )). +%% Decode a percent encoded string. (RFC3986 2.1) +%% +%% Inspiration for some of the optimisations done here come +%% from the new `json` module as it was in mid-2024. +%% +%% Possible input includes: +%% +%% * nothing encoded (no % character): +%% We want to return the binary as-is to avoid an allocation. +%% +%% * small number of encoded characters: +%% We can "skip" words of text. +%% +%% * mostly encoded characters (non-ascii languages) +%% We can decode characters in bulk. + +-spec urldecode(binary()) -> binary(). + urldecode(Binary) -> skip_dec(Binary, Binary, 0). @@ -97,8 +99,8 @@ dec(<<$%, H, L, Rest/bits>>, Acc, Orig, Skip, Len) -> end; %% This clause helps speed up decoding of barely encoded values. dec(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len) - when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2) - andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) -> + when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2) + andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) -> dec(Rest, Acc, Orig, Skip, Len + 4); dec(<<C, Rest/bits>>, Acc, Orig, Skip, Len) when ?IS_PLAIN(C) -> dec(Rest, Acc, Orig, Skip, Len + 1); @@ -175,116 +177,60 @@ horse_urldecode_worst_case_hex() -> ). -endif. -%% @doc Percent encode a string. (RFC3986 2.1) +%% Percent encode a string. (RFC3986 2.1) %% %% This function is meant to be used for path components. --spec urlencode(B) -> B when B::binary(). -urlencode(B) -> - urlencode(B, <<>>). +-spec urlencode(binary()) -> binary(). -urlencode(<< $!, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $! >>); -urlencode(<< $$, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $$ >>); -urlencode(<< $&, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $& >>); -urlencode(<< $', Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $' >>); -urlencode(<< $(, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $( >>); -urlencode(<< $), Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $) >>); -urlencode(<< $*, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $* >>); -urlencode(<< $+, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $+ >>); -urlencode(<< $,, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $, >>); -urlencode(<< $-, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $- >>); -urlencode(<< $., Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $. >>); -urlencode(<< $0, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $0 >>); -urlencode(<< $1, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $1 >>); -urlencode(<< $2, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $2 >>); -urlencode(<< $3, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $3 >>); -urlencode(<< $4, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $4 >>); -urlencode(<< $5, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $5 >>); -urlencode(<< $6, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $6 >>); -urlencode(<< $7, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $7 >>); -urlencode(<< $8, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $8 >>); -urlencode(<< $9, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $9 >>); -urlencode(<< $:, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $: >>); -urlencode(<< $;, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $; >>); -urlencode(<< $=, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $= >>); -urlencode(<< $@, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $@ >>); -urlencode(<< $A, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $A >>); -urlencode(<< $B, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $B >>); -urlencode(<< $C, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $C >>); -urlencode(<< $D, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $D >>); -urlencode(<< $E, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $E >>); -urlencode(<< $F, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $F >>); -urlencode(<< $G, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $G >>); -urlencode(<< $H, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $H >>); -urlencode(<< $I, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $I >>); -urlencode(<< $J, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $J >>); -urlencode(<< $K, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $K >>); -urlencode(<< $L, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $L >>); -urlencode(<< $M, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $M >>); -urlencode(<< $N, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $N >>); -urlencode(<< $O, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $O >>); -urlencode(<< $P, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $P >>); -urlencode(<< $Q, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $Q >>); -urlencode(<< $R, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $R >>); -urlencode(<< $S, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $S >>); -urlencode(<< $T, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $T >>); -urlencode(<< $U, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $U >>); -urlencode(<< $V, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $V >>); -urlencode(<< $W, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $W >>); -urlencode(<< $X, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $X >>); -urlencode(<< $Y, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $Y >>); -urlencode(<< $Z, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $Z >>); -urlencode(<< $_, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $_ >>); -urlencode(<< $a, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $a >>); -urlencode(<< $b, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $b >>); -urlencode(<< $c, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $c >>); -urlencode(<< $d, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $d >>); -urlencode(<< $e, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $e >>); -urlencode(<< $f, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $f >>); -urlencode(<< $g, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $g >>); -urlencode(<< $h, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $h >>); -urlencode(<< $i, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $i >>); -urlencode(<< $j, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $j >>); -urlencode(<< $k, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $k >>); -urlencode(<< $l, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $l >>); -urlencode(<< $m, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $m >>); -urlencode(<< $n, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $n >>); -urlencode(<< $o, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $o >>); -urlencode(<< $p, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $p >>); -urlencode(<< $q, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $q >>); -urlencode(<< $r, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $r >>); -urlencode(<< $s, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $s >>); -urlencode(<< $t, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $t >>); -urlencode(<< $u, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $u >>); -urlencode(<< $v, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $v >>); -urlencode(<< $w, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $w >>); -urlencode(<< $x, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $x >>); -urlencode(<< $y, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $y >>); -urlencode(<< $z, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $z >>); -urlencode(<< $~, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $~ >>); -urlencode(<< C, Rest/bits >>, Acc) -> - H = hex(C bsr 4), - L = hex(C band 16#0f), - urlencode(Rest, << Acc/bits, $%, H, L >>); -urlencode(<<>>, Acc) -> - Acc. +urlencode(Binary) -> + skip_enc(Binary, Binary, 0). + +skip_enc(Binary, Orig, Len) -> + case Binary of + <<C1, C2, C3, C4, Rest/bits>> + when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2) + andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) -> + skip_enc(Rest, Orig, Len + 4); + _ -> + enc(Binary, [], Orig, 0, Len) + end. -hex( 0) -> $0; -hex( 1) -> $1; -hex( 2) -> $2; -hex( 3) -> $3; -hex( 4) -> $4; -hex( 5) -> $5; -hex( 6) -> $6; -hex( 7) -> $7; -hex( 8) -> $8; -hex( 9) -> $9; -hex(10) -> $A; -hex(11) -> $B; -hex(12) -> $C; -hex(13) -> $D; -hex(14) -> $E; -hex(15) -> $F. +enc(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len) + when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2) + andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) -> + enc(Rest, Acc, Orig, Skip, Len + 4); +enc(<<C, Rest/bits>>, Acc, Orig, Skip, Len) when ?IS_PLAIN(C) -> + enc(Rest, Acc, Orig, Skip, Len + 1); +enc(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len) + when (not ?IS_PLAIN(C2)) andalso (not ?IS_PLAIN(C3)) + andalso (not ?IS_PLAIN(C4)) -> + Enc = <<$%, ?HEX(C1), $%, ?HEX(C2), $%, ?HEX(C3), $%, ?HEX(C4)>>, + case Len of + 0 -> + enc(Rest, [Acc|Enc], Orig, Skip + 4, 0); + _ -> + Part = binary_part(Orig, Skip, Len), + enc(Rest, [Acc, Part|Enc], Orig, Skip + Len + 4, 0) + end; +enc(<<C, Rest/bits>>, Acc, Orig, Skip, Len) -> + Enc = <<$%, ?HEX(C)>>, + case Len of + 0 -> + enc(Rest, [Acc|Enc], Orig, Skip + 1, 0); + _ -> + Part = binary_part(Orig, Skip, Len), + enc(Rest, [Acc, Part|Enc], Orig, Skip + Len + 1, 0) + end; +enc(<<>>, _, Orig, 0, _) -> + Orig; +enc(<<>>, Acc, _, _, 0) -> + iolist_to_binary(Acc); +enc(<<>>, Acc, Orig, Skip, Len) -> + Part = binary_part(Orig, Skip, Len), + iolist_to_binary([Acc|Part]); +enc(_, _, Orig, Skip, Len) -> + error({invalid_byte, binary:at(Orig, Skip + Len)}). -ifdef(TEST). urlencode_test_() -> @@ -326,6 +272,14 @@ horse_urlencode_jp() -> 129,153,227,130,139,230,151,139,229,190,139,227,128,156>>) ). +horse_urlencode_jp_mixed() -> + horse:repeat(100000, + urlencode(<<227,131,132,227,130,164,227,131,179,227,130,189,227, + $1, $2, $3, + 130,166,227,131,171,227,128,156,232,188,170,229,187,187,227, + 129,153,227,130,139,230,151,139,229,190,139,227,128,156>>) + ). + horse_urlencode_mix() -> horse:repeat(100000, urlencode(<<"Small, fast, modular HTTP server.">>) |