From a9a3bc7b66b72c088814c99f169a3ae67f37c901 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Sat, 13 Dec 2014 13:19:45 +0200 Subject: Add cow_http_hd:parse_charset/1 From RFC7231. This code is more than twice faster as the current Cowboy code, while filtering out more bad cases. --- src/cow_http_hd.erl | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) (limited to 'src') diff --git a/src/cow_http_hd.erl b/src/cow_http_hd.erl index a6624a8..397a759 100644 --- a/src/cow_http_hd.erl +++ b/src/cow_http_hd.erl @@ -15,6 +15,7 @@ -module(cow_http_hd). -export([parse_accept/1]). +-export([parse_accept_charset/1]). -export([parse_connection/1]). -export([parse_content_length/1]). -export([parse_expect/1]). @@ -224,6 +225,98 @@ horse_parse_accept() -> ). -endif. +%% @doc Parse the Accept-Charset header. + +-spec parse_accept_charset(binary()) -> [{binary(), qvalue()}]. +parse_accept_charset(Charset) -> + nonempty(conneg_list(Charset, [])). + +conneg_list(<<>>, Acc) -> lists:reverse(Acc); +conneg_list(<< $\s, R/bits >>, Acc) -> conneg_list(R, Acc); +conneg_list(<< $\t, R/bits >>, Acc) -> conneg_list(R, Acc); +conneg_list(<< $\,, R/bits >>, Acc) -> conneg_list(R, Acc); +conneg_list(<< C, R/bits >>, Acc) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(conneg, R, Acc, <<>>) + end. + +conneg(<<>>, Acc, T) -> lists:reverse([{T, 1000}|Acc]); +conneg(<< $,, R/bits >>, Acc, T) -> conneg_list(R, [{T, 1000}|Acc]); +conneg(<< $;, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T); +conneg(<< $\s, R/bits >>, Acc, T) -> conneg_before_semicolon(R, Acc, T); +conneg(<< $\t, R/bits >>, Acc, T) -> conneg_before_semicolon(R, Acc, T); +conneg(<< C, R/bits >>, Acc, T) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(conneg, R, Acc, T) + end. + +conneg_before_semicolon(<<>>, Acc, T) -> lists:reverse([{T, 1000}|Acc]); +conneg_before_semicolon(<< $,, R/bits >>, Acc, T) -> conneg_list(R, [{T, 1000}|Acc]); +conneg_before_semicolon(<< $;, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T); +conneg_before_semicolon(<< $\s, R/bits >>, Acc, T) -> conneg_before_semicolon(R, Acc, T); +conneg_before_semicolon(<< $\t, R/bits >>, Acc, T) -> conneg_before_semicolon(R, Acc, T). + +conneg_before_weight(<< $\s, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T); +conneg_before_weight(<< $\t, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T); +conneg_before_weight(<< $q, $=, R/bits >>, Acc, T) -> conneg_weight(R, Acc, T); +%% Special clause for broken user agents that confuse ; and , separators. +conneg_before_weight(<< C, R/bits >>, Acc, T) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(conneg, R, [{T, 1000}|Acc], <<>>) + end. + +conneg_weight(<< "1.000", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); +conneg_weight(<< "1.00", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); +conneg_weight(<< "1.0", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); +conneg_weight(<< "1.", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); +conneg_weight(<< "1", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 1000}|Acc]); +conneg_weight(<< "0.", A, B, C, R/bits >>, Acc, T) + when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) -> + conneg_list_sep(R, [{T, (A - $0) * 100 + (B - $0) * 10 + (C - $0)}|Acc]); +conneg_weight(<< "0.", A, B, R/bits >>, Acc, T) + when ?IS_DIGIT(A), ?IS_DIGIT(B) -> + conneg_list_sep(R, [{T, (A - $0) * 100 + (B - $0) * 10}|Acc]); +conneg_weight(<< "0.", A, R/bits >>, Acc, T) + when ?IS_DIGIT(A) -> + conneg_list_sep(R, [{T, (A - $0) * 100}|Acc]); +conneg_weight(<< "0.", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 0}|Acc]); +conneg_weight(<< "0", R/bits >>, Acc, T) -> conneg_list_sep(R, [{T, 0}|Acc]). + +conneg_list_sep(<<>>, Acc) -> lists:reverse(Acc); +conneg_list_sep(<< $\s, R/bits >>, Acc) -> conneg_list_sep(R, Acc); +conneg_list_sep(<< $\t, R/bits >>, Acc) -> conneg_list_sep(R, Acc); +conneg_list_sep(<< $,, R/bits >>, Acc) -> conneg_list(R, Acc). + +-ifdef(TEST). +parse_accept_charset_test_() -> + Tests = [ + {<<"iso-8859-5, unicode-1-1;q=0.8">>, [ + {<<"iso-8859-5">>, 1000}, + {<<"unicode-1-1">>, 800} + ]}, + %% Some user agents send this invalid value for the Accept-Charset header + {<<"ISO-8859-1;utf-8;q=0.7,*;q=0.7">>, [ + {<<"iso-8859-1">>, 1000}, + {<<"utf-8">>, 700}, + {<<"*">>, 700} + ]} + ], + [{V, fun() -> R = parse_accept_charset(V) end} || {V, R} <- Tests]. + +parse_accept_charset_error_test_() -> + Tests = [ + <<>> + ], + [{V, fun() -> {'EXIT', _} = (catch parse_accept_charset(V)) end} || V <- Tests]. +-endif. + +-ifdef(PERF). +horse_parse_accept_charset() -> + horse:repeat(20000, + parse_accept_charset(<<"iso-8859-5, unicode-1-1;q=0.8">>) + ). +-endif. + %% @doc Parse the Connection header. -spec parse_connection(binary()) -> [binary()]. -- cgit v1.2.3