aboutsummaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorLoïc Hoguin <[email protected]>2014-12-13 13:19:45 +0200
committerLoïc Hoguin <[email protected]>2014-12-13 13:22:03 +0200
commita9a3bc7b66b72c088814c99f169a3ae67f37c901 (patch)
tree799c8bee2b2c54014c781a23e9dd116bbf233d34 /src
parent20ada930c3996945fe2771518da05b5e7ae9b904 (diff)
downloadcowlib-a9a3bc7b66b72c088814c99f169a3ae67f37c901.tar.gz
cowlib-a9a3bc7b66b72c088814c99f169a3ae67f37c901.tar.bz2
cowlib-a9a3bc7b66b72c088814c99f169a3ae67f37c901.zip
Add cow_http_hd:parse_accept_charset/1
From RFC7231. This code is more than twice as fast as the current Cowboy code, while filtering out more bad cases.
Diffstat (limited to 'src')
-rw-r--r--src/cow_http_hd.erl93
1 files changed, 93 insertions, 0 deletions
diff --git a/src/cow_http_hd.erl b/src/cow_http_hd.erl
index a6624a8..397a759 100644
--- a/src/cow_http_hd.erl
+++ b/src/cow_http_hd.erl
@@ -15,6 +15,7 @@
-module(cow_http_hd).
-export([parse_accept/1]).
+-export([parse_accept_charset/1]).
-export([parse_connection/1]).
-export([parse_content_length/1]).
-export([parse_expect/1]).
@@ -224,6 +225,98 @@ horse_parse_accept() ->
).
-endif.
+%% @doc Parse the Accept-Charset header into {Charset, Quality} pairs, kept in header order.
+%% Quality is an integer: 1000 when no weight is given, otherwise the q value scaled by 1000.
+-spec parse_accept_charset(binary()) -> [{binary(), qvalue()}].
+parse_accept_charset(Charset) ->
+ nonempty(conneg_list(Charset, [])). % nonempty/1 is defined elsewhere; presumably rejects [] (the error test expects <<>> to crash) -- confirm
+
+conneg_list(<<>>, Acc) -> lists:reverse(Acc); % end of input: put the entries back in header order
+conneg_list(<< $\s, R/bits >>, Acc) -> conneg_list(R, Acc); % skip a space before the next element
+conneg_list(<< $\t, R/bits >>, Acc) -> conneg_list(R, Acc); % skip a tab before the next element
+conneg_list(<< $\,, R/bits >>, Acc) -> conneg_list(R, Acc); % skip an empty list element ($\, is the comma character)
+conneg_list(<< C, R/bits >>, Acc) when ?IS_TOKEN(C) -> % C is the first character of a new token
+ case C of
+ ?INLINE_LOWERCASE(conneg, R, Acc, <<>>) % macro defined elsewhere; presumably adds lowercased C to the empty token and calls conneg/3 -- confirm
+ end.
+
+conneg(<<>>, Acc, T) -> lists:reverse([{T, 1000}|Acc]); % header ends inside token T: store it with the default weight 1000
+conneg(<< $,, R/bits >>, Acc, T) -> conneg_list(R, [{T, 1000}|Acc]); % no weight given: default 1000, then next element
+conneg(<< $;, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T); % a weight parameter is expected next
+conneg(<< $\s, R/bits >>, Acc, T) -> conneg_before_semicolon(R, Acc, T); % whitespace ends the token
+conneg(<< $\t, R/bits >>, Acc, T) -> conneg_before_semicolon(R, Acc, T); % same for a tab
+conneg(<< C, R/bits >>, Acc, T) when ?IS_TOKEN(C) -> % one more token character
+ case C of
+ ?INLINE_LOWERCASE(conneg, R, Acc, T) % macro defined elsewhere; presumably appends lowercased C to T and recurses into conneg/3 -- confirm
+ end.
+
+conneg_before_semicolon(<<>>, Acc, T) -> lists:reverse([{T, 1000}|Acc]); % only whitespace followed the token: default weight 1000
+conneg_before_semicolon(<< $,, R/bits >>, Acc, T) -> conneg_list(R, [{T, 1000}|Acc]); % element ends without a weight
+conneg_before_semicolon(<< $;, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T); % a weight parameter is expected next
+conneg_before_semicolon(<< $\s, R/bits >>, Acc, T) -> conneg_before_semicolon(R, Acc, T); % keep skipping whitespace
+conneg_before_semicolon(<< $\t, R/bits >>, Acc, T) -> conneg_before_semicolon(R, Acc, T). % any other byte matches no clause and crashes
+
+conneg_before_weight(<< $\s, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T); % skip whitespace after the semicolon
+conneg_before_weight(<< $\t, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T);
+conneg_before_weight(<< $q, $=, R/bits >>, Acc, T) -> conneg_weight(R, Acc, T); % only a lowercase "q=" introduces a weight
+%% Special clause for broken user agents that confuse ; and , separators: T is stored with the default weight 1000 and C starts a new token.
+conneg_before_weight(<< C, R/bits >>, Acc, T) when ?IS_TOKEN(C) ->
+ case C of
+ ?INLINE_LOWERCASE(conneg, R, [{T, 1000}|Acc], <<>>) % macro defined elsewhere; presumably adds lowercased C to the empty token and calls conneg/3 -- confirm
+ end.
+
+conneg_weight(<< "1.000", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); % longest literal first, so the longest prefix wins
+conneg_weight(<< "1.00", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]);
+conneg_weight(<< "1.0", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]);
+conneg_weight(<< "1.", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]);
+conneg_weight(<< "1", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); % every accepted form of 1 is stored as 1000
+conneg_weight(<< "0.", A, B, C, R/bits >>, Acc, T) % three decimals: 0.ABC is stored as the integer ABC
+ when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) ->
+ conneg_list_sep(R, [{T, (A - $0) * 100 + (B - $0) * 10 + (C - $0)}|Acc]);
+conneg_weight(<< "0.", A, B, R/bits >>, Acc, T) % two decimals: 0.AB is stored as AB0
+ when ?IS_DIGIT(A), ?IS_DIGIT(B) ->
+ conneg_list_sep(R, [{T, (A - $0) * 100 + (B - $0) * 10}|Acc]);
+conneg_weight(<< "0.", A, R/bits >>, Acc, T) % one decimal: 0.A is stored as A00
+ when ?IS_DIGIT(A) ->
+ conneg_list_sep(R, [{T, (A - $0) * 100}|Acc]);
+conneg_weight(<< "0.", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 0}|Acc]); % "0." with no digits is stored as 0
+conneg_weight(<< "0", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 0}|Acc]). % whatever follows the weight is checked by conneg_list_sep/2
+
+conneg_list_sep(<<>>, Acc) -> lists:reverse(Acc); % header ends after a weight: put the entries back in header order
+conneg_list_sep(<< $\s, R/bits >>, Acc) -> conneg_list_sep(R, Acc); % skip whitespace after the weight
+conneg_list_sep(<< $\t, R/bits >>, Acc) -> conneg_list_sep(R, Acc);
+conneg_list_sep(<< $,, R/bits >>, Acc) -> conneg_list(R, Acc). % comma: parse the next element; any other byte matches no clause and crashes
+
+-ifdef(TEST).
+parse_accept_charset_test_() -> % one test per {Header, Expected} pair
+ Tests = [
+ {<<"iso-8859-5, unicode-1-1;q=0.8">>, [
+ {<<"iso-8859-5">>, 1000},
+ {<<"unicode-1-1">>, 800}
+ ]},
+ %% Some user agents send this invalid value for the Accept-Charset header (a ; where a , belongs); it must still parse into three entries
+ {<<"ISO-8859-1;utf-8;q=0.7,*;q=0.7">>, [
+ {<<"iso-8859-1">>, 1000},
+ {<<"utf-8">>, 700},
+ {<<"*">>, 700}
+ ]}
+ ],
+ [{V, fun() -> R = parse_accept_charset(V) end} || {V, R} <- Tests]. % R is already bound by the generator, so "R = ..." is a match assertion
+
+parse_accept_charset_error_test_() -> % inputs for which parsing must crash
+ Tests = [
+ <<>>
+ ],
+ [{V, fun() -> {'EXIT', _} = (catch parse_accept_charset(V)) end} || V <- Tests]. % catch turns the crash into {'EXIT', _}; the match fails if parsing succeeds
+-endif.
+
+-ifdef(PERF).
+horse_parse_accept_charset() -> % benchmark entry point, compiled only when the PERF macro is defined
+ horse:repeat(20000, % horse is an external module; presumably evaluates the expression 20000 times -- confirm
+ parse_accept_charset(<<"iso-8859-5, unicode-1-1;q=0.8">>)
+ ).
+-endif.
+
%% @doc Parse the Connection header.
-spec parse_connection(binary()) -> [binary()].