aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- src/cow_uri.erl 202
1 files changed, 78 insertions, 124 deletions
diff --git a/src/cow_uri.erl b/src/cow_uri.erl
index 4a50240..9d46ae4 100644
--- a/src/cow_uri.erl
+++ b/src/cow_uri.erl
@@ -14,26 +14,10 @@
-module(cow_uri).
--include("cow_inline.hrl").
-
-export([urldecode/1]).
-export([urlencode/1]).
-%% Decode a percent encoded string. (RFC3986 2.1)
-%%
-%% Inspiration for some of the optimisations done here come
-%% from the new `json` module as it was in mid-2024.
-%%
-%% Possible input includes:
-%%
-%% * nothing encoded (no % character):
-%% We want to return the binary as-is to avoid an allocation.
-%%
-%% * small number of encoded characters:
-%% We can "skip" words of text.
-%%
-%% * mostly encoded characters (non-ascii languages)
-%% We can decode characters in bulk.
+-include("cow_inline.hrl").
-define(IS_PLAIN(C), (
(C =:= $!) orelse (C =:= $$) orelse (C =:= $&) orelse (C =:= $') orelse
@@ -58,6 +42,24 @@
(C =:= $y) orelse (C =:= $z) orelse (C =:= $~)
)).
+%% Decode a percent encoded string. (RFC3986 2.1)
+%%
+%% Inspiration for some of the optimisations done here comes
+%% from the new `json` module as it was in mid-2024.
+%%
+%% Possible input includes:
+%%
+%% * nothing encoded (no % character):
+%% We want to return the binary as-is to avoid an allocation.
+%%
+%% * small number of encoded characters:
+%% We can "skip" words of text.
+%%
+%% * mostly encoded characters (non-ASCII languages):
+%% We can decode characters in bulk.
+
+-spec urldecode(binary()) -> binary().
+
urldecode(Binary) ->
skip_dec(Binary, Binary, 0).
@@ -97,8 +99,8 @@ dec(<<$%, H, L, Rest/bits>>, Acc, Orig, Skip, Len) ->
end;
%% This clause helps speed up decoding of barely encoded values.
dec(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len)
- when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
- andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
+ when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
+ andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
dec(Rest, Acc, Orig, Skip, Len + 4);
dec(<<C, Rest/bits>>, Acc, Orig, Skip, Len) when ?IS_PLAIN(C) ->
dec(Rest, Acc, Orig, Skip, Len + 1);
@@ -175,116 +177,60 @@ horse_urldecode_worst_case_hex() ->
).
-endif.
-%% @doc Percent encode a string. (RFC3986 2.1)
+%% Percent encode a string. (RFC3986 2.1)
%%
%% This function is meant to be used for path components.
--spec urlencode(B) -> B when B::binary().
-urlencode(B) ->
- urlencode(B, <<>>).
+-spec urlencode(binary()) -> binary().
-urlencode(<< $!, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $! >>);
-urlencode(<< $$, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $$ >>);
-urlencode(<< $&, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $& >>);
-urlencode(<< $', Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $' >>);
-urlencode(<< $(, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $( >>);
-urlencode(<< $), Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $) >>);
-urlencode(<< $*, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $* >>);
-urlencode(<< $+, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $+ >>);
-urlencode(<< $,, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $, >>);
-urlencode(<< $-, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $- >>);
-urlencode(<< $., Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $. >>);
-urlencode(<< $0, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $0 >>);
-urlencode(<< $1, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $1 >>);
-urlencode(<< $2, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $2 >>);
-urlencode(<< $3, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $3 >>);
-urlencode(<< $4, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $4 >>);
-urlencode(<< $5, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $5 >>);
-urlencode(<< $6, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $6 >>);
-urlencode(<< $7, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $7 >>);
-urlencode(<< $8, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $8 >>);
-urlencode(<< $9, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $9 >>);
-urlencode(<< $:, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $: >>);
-urlencode(<< $;, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $; >>);
-urlencode(<< $=, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $= >>);
-urlencode(<< $@, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $@ >>);
-urlencode(<< $A, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $A >>);
-urlencode(<< $B, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $B >>);
-urlencode(<< $C, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $C >>);
-urlencode(<< $D, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $D >>);
-urlencode(<< $E, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $E >>);
-urlencode(<< $F, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $F >>);
-urlencode(<< $G, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $G >>);
-urlencode(<< $H, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $H >>);
-urlencode(<< $I, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $I >>);
-urlencode(<< $J, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $J >>);
-urlencode(<< $K, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $K >>);
-urlencode(<< $L, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $L >>);
-urlencode(<< $M, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $M >>);
-urlencode(<< $N, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $N >>);
-urlencode(<< $O, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $O >>);
-urlencode(<< $P, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $P >>);
-urlencode(<< $Q, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $Q >>);
-urlencode(<< $R, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $R >>);
-urlencode(<< $S, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $S >>);
-urlencode(<< $T, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $T >>);
-urlencode(<< $U, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $U >>);
-urlencode(<< $V, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $V >>);
-urlencode(<< $W, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $W >>);
-urlencode(<< $X, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $X >>);
-urlencode(<< $Y, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $Y >>);
-urlencode(<< $Z, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $Z >>);
-urlencode(<< $_, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $_ >>);
-urlencode(<< $a, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $a >>);
-urlencode(<< $b, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $b >>);
-urlencode(<< $c, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $c >>);
-urlencode(<< $d, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $d >>);
-urlencode(<< $e, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $e >>);
-urlencode(<< $f, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $f >>);
-urlencode(<< $g, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $g >>);
-urlencode(<< $h, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $h >>);
-urlencode(<< $i, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $i >>);
-urlencode(<< $j, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $j >>);
-urlencode(<< $k, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $k >>);
-urlencode(<< $l, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $l >>);
-urlencode(<< $m, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $m >>);
-urlencode(<< $n, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $n >>);
-urlencode(<< $o, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $o >>);
-urlencode(<< $p, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $p >>);
-urlencode(<< $q, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $q >>);
-urlencode(<< $r, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $r >>);
-urlencode(<< $s, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $s >>);
-urlencode(<< $t, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $t >>);
-urlencode(<< $u, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $u >>);
-urlencode(<< $v, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $v >>);
-urlencode(<< $w, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $w >>);
-urlencode(<< $x, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $x >>);
-urlencode(<< $y, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $y >>);
-urlencode(<< $z, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $z >>);
-urlencode(<< $~, Rest/bits >>, Acc) -> urlencode(Rest, << Acc/bits, $~ >>);
-urlencode(<< C, Rest/bits >>, Acc) ->
- H = hex(C bsr 4),
- L = hex(C band 16#0f),
- urlencode(Rest, << Acc/bits, $%, H, L >>);
-urlencode(<<>>, Acc) ->
- Acc.
+urlencode(Binary) ->
+ skip_enc(Binary, Binary, 0).
+
+skip_enc(Binary, Orig, Len) ->
+ case Binary of
+ <<C1, C2, C3, C4, Rest/bits>>
+ when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
+ andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
+ skip_enc(Rest, Orig, Len + 4);
+ _ ->
+ enc(Binary, [], Orig, 0, Len)
+ end.
-hex( 0) -> $0;
-hex( 1) -> $1;
-hex( 2) -> $2;
-hex( 3) -> $3;
-hex( 4) -> $4;
-hex( 5) -> $5;
-hex( 6) -> $6;
-hex( 7) -> $7;
-hex( 8) -> $8;
-hex( 9) -> $9;
-hex(10) -> $A;
-hex(11) -> $B;
-hex(12) -> $C;
-hex(13) -> $D;
-hex(14) -> $E;
-hex(15) -> $F.
+enc(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len)
+ when ?IS_PLAIN(C1) andalso ?IS_PLAIN(C2)
+ andalso ?IS_PLAIN(C3) andalso ?IS_PLAIN(C4) ->
+ enc(Rest, Acc, Orig, Skip, Len + 4);
+enc(<<C, Rest/bits>>, Acc, Orig, Skip, Len) when ?IS_PLAIN(C) ->
+ enc(Rest, Acc, Orig, Skip, Len + 1);
+enc(<<C1, C2, C3, C4, Rest/bits>>, Acc, Orig, Skip, Len)
+ when (not ?IS_PLAIN(C2)) andalso (not ?IS_PLAIN(C3))
+ andalso (not ?IS_PLAIN(C4)) ->
+ Enc = <<$%, ?HEX(C1), $%, ?HEX(C2), $%, ?HEX(C3), $%, ?HEX(C4)>>,
+ case Len of
+ 0 ->
+ enc(Rest, [Acc|Enc], Orig, Skip + 4, 0);
+ _ ->
+ Part = binary_part(Orig, Skip, Len),
+ enc(Rest, [Acc, Part|Enc], Orig, Skip + Len + 4, 0)
+ end;
+enc(<<C, Rest/bits>>, Acc, Orig, Skip, Len) ->
+ Enc = <<$%, ?HEX(C)>>,
+ case Len of
+ 0 ->
+ enc(Rest, [Acc|Enc], Orig, Skip + 1, 0);
+ _ ->
+ Part = binary_part(Orig, Skip, Len),
+ enc(Rest, [Acc, Part|Enc], Orig, Skip + Len + 1, 0)
+ end;
+enc(<<>>, _, Orig, 0, _) ->
+ Orig;
+enc(<<>>, Acc, _, _, 0) ->
+ iolist_to_binary(Acc);
+enc(<<>>, Acc, Orig, Skip, Len) ->
+ Part = binary_part(Orig, Skip, Len),
+ iolist_to_binary([Acc|Part]);
+enc(_, _, Orig, Skip, Len) ->
+ error({invalid_byte, binary:at(Orig, Skip + Len)}).
-ifdef(TEST).
urlencode_test_() ->
@@ -326,6 +272,14 @@ horse_urlencode_jp() ->
129,153,227,130,139,230,151,139,229,190,139,227,128,156>>)
).
+horse_urlencode_jp_mixed() ->
+ horse:repeat(100000,
+ urlencode(<<227,131,132,227,130,164,227,131,179,227,130,189,227,
+ $1, $2, $3,
+ 130,166,227,131,171,227,128,156,232,188,170,229,187,187,227,
+ 129,153,227,130,139,230,151,139,229,190,139,227,128,156>>)
+ ).
+
horse_urlencode_mix() ->
horse:repeat(100000,
urlencode(<<"Small, fast, modular HTTP server.">>)