diff options
author | Loïc Hoguin <[email protected]> | 2014-12-13 14:42:43 +0200 |
---|---|---|
committer | Loïc Hoguin <[email protected]> | 2014-12-13 14:46:05 +0200 |
commit | 86917a4c641f01c18f53b34bc4bc89c20549f04a (patch) | |
tree | 45e8fc0b1b4b49d6a135c81d404857648976cc85 | |
parent | 847b1bca21cc2ddec6b94f2a8f8924ad3f4be73a (diff) | |
download | cowlib-86917a4c641f01c18f53b34bc4bc89c20549f04a.tar.gz cowlib-86917a4c641f01c18f53b34bc4bc89c20549f04a.tar.bz2 cowlib-86917a4c641f01c18f53b34bc4bc89c20549f04a.zip |
Add cow_http_hd:parse_accept_language/1
From RFC7231.
This code is more than twice as fast as the current Cowboy code,
while filtering out more bad cases.
-rw-r--r-- | include/cow_inline.hrl | 25 | ||||
-rw-r--r-- | src/cow_http_hd.erl | 132 |
2 files changed, 138 insertions, 19 deletions
diff --git a/include/cow_inline.hrl b/include/cow_inline.hrl index 82cc465..a72f8c5 100644 --- a/include/cow_inline.hrl +++ b/include/cow_inline.hrl @@ -15,15 +15,9 @@ -ifndef(COW_INLINE_HRL). -define(COW_INLINE_HRL, 1). -%% IS_DIGIT(Character) +%% IS_ALPHA(Character) --define(IS_DIGIT(C), - C >= $0, C =< $9 -). - -%% IS_TOKEN(Character) - --define(IS_TOKEN(C), +-define(IS_ALPHA(C), C =:= $a; C =:= $b; C =:= $c; C =:= $d; C =:= $e; C =:= $f; C =:= $g; C =:= $h; C =:= $i; C =:= $j; C =:= $k; C =:= $l; C =:= $m; C =:= $n; C =:= $o; @@ -35,9 +29,20 @@ C =:= $K; C =:= $L; C =:= $M; C =:= $N; C =:= $O; C =:= $P; C =:= $Q; C =:= $R; C =:= $S; C =:= $T; C =:= $U; C =:= $V; C =:= $W; C =:= $X; C =:= $Y; - C =:= $Z; + C =:= $Z +). + +%% IS_DIGIT(Character) + +-define(IS_DIGIT(C), C =:= $0; C =:= $1; C =:= $2; C =:= $3; C =:= $4; - C =:= $5; C =:= $6; C =:= $7; C =:= $8; C =:= $9; + C =:= $5; C =:= $6; C =:= $7; C =:= $8; C =:= $9 +). + +%% IS_TOKEN(Character) + +-define(IS_TOKEN(C), + ?IS_ALPHA(C); ?IS_DIGIT(C); C =:= $!; C =:= $#; C =:= $$; C =:= $%; C =:= $&; C =:= $'; C =:= $*; C =:= $+; C =:= $-; C =:= $.; C =:= $^; C =:= $_; C =:= $`; C =:= $|; C =:= $~ diff --git a/src/cow_http_hd.erl b/src/cow_http_hd.erl index 0e3e940..e418754 100644 --- a/src/cow_http_hd.erl +++ b/src/cow_http_hd.erl @@ -17,6 +17,7 @@ -export([parse_accept/1]). -export([parse_accept_charset/1]). -export([parse_accept_encoding/1]). +-export([parse_accept_language/1]). -export([parse_connection/1]). -export([parse_content_length/1]). -export([parse_expect/1]). @@ -99,13 +100,13 @@ media_range_value(<< C, R/bits >>, Acc, T, S, P, K, V) when ?IS_TOKEN(C) -> medi %% Special function for badly behaving user agents that send .123 instead of 0.123. 
media_range_broken_weight(<< A, B, C, R/bits >>, Acc, T, S, P) - when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) -> + when A >= $0, A =< $9, B >= $0, B =< $9, C >= $0, C =< $9 -> accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100 + (B - $0) * 10 + (C - $0), []); media_range_broken_weight(<< A, B, R/bits >>, Acc, T, S, P) - when ?IS_DIGIT(A), ?IS_DIGIT(B) -> + when A >= $0, A =< $9, B >= $0, B =< $9 -> accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100 + (B - $0) * 10, []); media_range_broken_weight(<< A, R/bits >>, Acc, T, S, P) - when ?IS_DIGIT(A) -> + when A >= $0, A =< $9 -> accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100, []). media_range_weight(<< "1.000", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 1000, []); @@ -114,13 +115,13 @@ media_range_weight(<< "1.0", R/bits >>, Acc, T, S, P) -> accept_before_semicolon media_range_weight(<< "1.", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 1000, []); media_range_weight(<< "1", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 1000, []); media_range_weight(<< "0.", A, B, C, R/bits >>, Acc, T, S, P) - when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) -> + when A >= $0, A =< $9, B >= $0, B =< $9, C >= $0, C =< $9 -> accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100 + (B - $0) * 10 + (C - $0), []); media_range_weight(<< "0.", A, B, R/bits >>, Acc, T, S, P) - when ?IS_DIGIT(A), ?IS_DIGIT(B) -> + when A >= $0, A =< $9, B >= $0, B =< $9 -> accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100 + (B - $0) * 10, []); media_range_weight(<< "0.", A, R/bits >>, Acc, T, S, P) - when ?IS_DIGIT(A) -> + when A >= $0, A =< $9 -> accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100, []); media_range_weight(<< "0.", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 0, []); media_range_weight(<< "0", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 0, []). 
@@ -272,13 +273,13 @@ conneg_weight(<< "1.0", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc] conneg_weight(<< "1.", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); conneg_weight(<< "1", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); conneg_weight(<< "0.", A, B, C, R/bits >>, Acc, T) - when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) -> + when A >= $0, A =< $9, B >= $0, B =< $9, C >= $0, C =< $9 -> conneg_list_sep(R, [{T, (A - $0) * 100 + (B - $0) * 10 + (C - $0)}|Acc]); conneg_weight(<< "0.", A, B, R/bits >>, Acc, T) - when ?IS_DIGIT(A), ?IS_DIGIT(B) -> + when A >= $0, A =< $9, B >= $0, B =< $9 -> conneg_list_sep(R, [{T, (A - $0) * 100 + (B - $0) * 10}|Acc]); conneg_weight(<< "0.", A, R/bits >>, Acc, T) - when ?IS_DIGIT(A) -> + when A >= $0, A =< $9 -> conneg_list_sep(R, [{T, (A - $0) * 100}|Acc]); conneg_weight(<< "0.", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 0}|Acc]); conneg_weight(<< "0", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 0}|Acc]). @@ -353,6 +354,119 @@ horse_parse_accept_encoding() -> ). -endif. +%% @doc Parse the Accept-Language header. + +-spec parse_accept_language(binary()) -> [{binary(), qvalue()}]. +parse_accept_language(LanguageRange) -> + nonempty(language_range_list(LanguageRange, [])). + +language_range_list(<<>>, Acc) -> lists:reverse(Acc); +language_range_list(<< $\s, R/bits >>, Acc) -> language_range_list(R, Acc); +language_range_list(<< $\t, R/bits >>, Acc) -> language_range_list(R, Acc); +language_range_list(<< $\,, R/bits >>, Acc) -> language_range_list(R, Acc); +language_range_list(<< $*, R/bits >>, Acc) -> language_range_before_semicolon(R, Acc, <<"*">>); +language_range_list(<< C, R/bits >>, Acc) when ?IS_ALPHA(C) -> + case C of + ?INLINE_LOWERCASE(language_range, R, Acc, 1, <<>>) + end. 
+ +language_range(<<>>, Acc, _, T) -> lists:reverse([{T, 1000}|Acc]); +language_range(<< $,, R/bits >>, Acc, _, T) -> language_range_list(R, [{T, 1000}|Acc]); +language_range(<< $;, R/bits >>, Acc, _, T) -> language_range_before_weight(R, Acc, T); +language_range(<< $\s, R/bits >>, Acc, _, T) -> language_range_before_semicolon(R, Acc, T); +language_range(<< $\t, R/bits >>, Acc, _, T) -> language_range_before_semicolon(R, Acc, T); +language_range(<< $-, R/bits >>, Acc, _, T) -> language_range_sub(R, Acc, 0, << T/binary, $- >>); +language_range(<< _, _/bits >>, _, 8, _) -> error(badarg); +language_range(<< C, R/bits >>, Acc, N, T) when ?IS_ALPHA(C) -> + case C of + ?INLINE_LOWERCASE(language_range, R, Acc, N + 1, T) + end. + +language_range_sub(<<>>, Acc, N, T) when N > 0 -> lists:reverse([{T, 1000}|Acc]); +language_range_sub(<< $,, R/bits >>, Acc, N, T) when N > 0 -> language_range_list(R, [{T, 1000}|Acc]); +language_range_sub(<< $;, R/bits >>, Acc, N, T) when N > 0 -> language_range_before_weight(R, Acc, T); +language_range_sub(<< $\s, R/bits >>, Acc, N, T) when N > 0 -> language_range_before_semicolon(R, Acc, T); +language_range_sub(<< $\t, R/bits >>, Acc, N, T) when N > 0 -> language_range_before_semicolon(R, Acc, T); +language_range_sub(<< $-, R/bits >>, Acc, N, T) when N > 0 -> language_range_sub(R, Acc, 0, << T/binary, $- >>); +language_range_sub(<< _, _/bits >>, _, 8, _) -> error(badarg); +language_range_sub(<< C, R/bits >>, Acc, N, T) when ?IS_ALPHA(C); ?IS_DIGIT(C) -> + case C of + ?INLINE_LOWERCASE(language_range_sub, R, Acc, N + 1, T) + end. 
+ +language_range_before_semicolon(<<>>, Acc, T) -> lists:reverse([{T, 1000}|Acc]); +language_range_before_semicolon(<< $,, R/bits >>, Acc, T) -> language_range_list(R, [{T, 1000}|Acc]); +language_range_before_semicolon(<< $;, R/bits >>, Acc, T) -> language_range_before_weight(R, Acc, T); +language_range_before_semicolon(<< $\s, R/bits >>, Acc, T) -> language_range_before_semicolon(R, Acc, T); +language_range_before_semicolon(<< $\t, R/bits >>, Acc, T) -> language_range_before_semicolon(R, Acc, T). + +language_range_before_weight(<< $\s, R/bits >>, Acc, T) -> language_range_before_weight(R, Acc, T); +language_range_before_weight(<< $\t, R/bits >>, Acc, T) -> language_range_before_weight(R, Acc, T); +language_range_before_weight(<< $q, $=, R/bits >>, Acc, T) -> language_range_weight(R, Acc, T); +%% Special clause for broken user agents that confuse ; and , separators. +language_range_before_weight(<< C, R/bits >>, Acc, T) when ?IS_ALPHA(C) -> + case C of + ?INLINE_LOWERCASE(language_range, R, [{T, 1000}|Acc], 1, <<>>) + end. 
+ +language_range_weight(<< "1.000", R/bits >>, Acc, T) -> language_range_list_sep(R, [{T, 1000}|Acc]); +language_range_weight(<< "1.00", R/bits >>, Acc, T) -> language_range_list_sep(R, [{T, 1000}|Acc]); +language_range_weight(<< "1.0", R/bits >>, Acc, T) -> language_range_list_sep(R, [{T, 1000}|Acc]); +language_range_weight(<< "1.", R/bits >>, Acc, T) -> language_range_list_sep(R, [{T, 1000}|Acc]); +language_range_weight(<< "1", R/bits >>, Acc, T) -> language_range_list_sep(R, [{T, 1000}|Acc]); +language_range_weight(<< "0.", A, B, C, R/bits >>, Acc, T) + when A >= $0, A =< $9, B >= $0, B =< $9, C >= $0, C =< $9 -> + language_range_list_sep(R, [{T, (A - $0) * 100 + (B - $0) * 10 + (C - $0)}|Acc]); +language_range_weight(<< "0.", A, B, R/bits >>, Acc, T) + when A >= $0, A =< $9, B >= $0, B =< $9 -> + language_range_list_sep(R, [{T, (A - $0) * 100 + (B - $0) * 10}|Acc]); +language_range_weight(<< "0.", A, R/bits >>, Acc, T) + when A >= $0, A =< $9 -> + language_range_list_sep(R, [{T, (A - $0) * 100}|Acc]); +language_range_weight(<< "0.", R/bits >>, Acc, T) -> language_range_list_sep(R, [{T, 0}|Acc]); +language_range_weight(<< "0", R/bits >>, Acc, T) -> language_range_list_sep(R, [{T, 0}|Acc]). + +language_range_list_sep(<<>>, Acc) -> lists:reverse(Acc); +language_range_list_sep(<< $\s, R/bits >>, Acc) -> language_range_list_sep(R, Acc); +language_range_list_sep(<< $\t, R/bits >>, Acc) -> language_range_list_sep(R, Acc); +language_range_list_sep(<< $,, R/bits >>, Acc) -> language_range_list(R, Acc). + +-ifdef(TEST). +parse_accept_language_test_() -> + Tests = [ + {<<"da, en-gb;q=0.8, en;q=0.7">>, [ + {<<"da">>, 1000}, + {<<"en-gb">>, 800}, + {<<"en">>, 700} + ]}, + {<<"en, en-US, en-cockney, i-cherokee, x-pig-latin, es-419">>, [ + {<<"en">>, 1000}, + {<<"en-us">>, 1000}, + {<<"en-cockney">>, 1000}, + {<<"i-cherokee">>, 1000}, + {<<"x-pig-latin">>, 1000}, + {<<"es-419">>, 1000} + ]} + ], + [{V, fun() -> R = parse_accept_language(V) end} || {V, R} <- Tests]. 
+ +parse_accept_language_error_test_() -> + Tests = [ + <<>>, + <<"loooooong">>, + <<"en-us-loooooong">>, + <<"419-en-us">> + ], + [{V, fun() -> {'EXIT', _} = (catch parse_accept_language(V)) end} || V <- Tests]. +-endif. + +-ifdef(PERF). +horse_parse_accept_language() -> + horse:repeat(20000, + parse_accept_language(<<"da, en-gb;q=0.8, en;q=0.7">>) + ). +-endif. + %% @doc Parse the Connection header. -spec parse_connection(binary()) -> [binary()]. |