From a9a3bc7b66b72c088814c99f169a3ae67f37c901 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= <essen@ninenines.eu>
Date: Sat, 13 Dec 2014 13:19:45 +0200
Subject: Add cow_http_hd:parse_accept_charset/1

From RFC7231.

This code is more than twice as fast as the current Cowboy code,
while filtering out more bad cases.
---
 src/cow_http_hd.erl | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)

(limited to 'src')

diff --git a/src/cow_http_hd.erl b/src/cow_http_hd.erl
index a6624a8..397a759 100644
--- a/src/cow_http_hd.erl
+++ b/src/cow_http_hd.erl
@@ -15,6 +15,7 @@
 -module(cow_http_hd).
 
 -export([parse_accept/1]).
+-export([parse_accept_charset/1]).
 -export([parse_connection/1]).
 -export([parse_content_length/1]).
 -export([parse_expect/1]).
@@ -224,6 +225,98 @@ horse_parse_accept() ->
 	).
 -endif.
 
+%% @doc Parse the Accept-Charset header into `{Charset, Weight}' pairs (weight 0..1000, default 1000).
+
+-spec parse_accept_charset(binary()) -> [{binary(), qvalue()}].
+parse_accept_charset(Header) ->
+	nonempty(conneg_list(Header, [])).
+
+%% Between list entries: skip spaces, tabs and commas until a charset token
+%% starts; at end of input, return the accumulated entries in header order.
+conneg_list(<<>>, Acc) -> lists:reverse(Acc);
+conneg_list(<< Sep, R/bits >>, Acc) when Sep =:= $\s; Sep =:= $\t; Sep =:= $, -> conneg_list(R, Acc);
+conneg_list(<< C, R/bits >>, Acc) when ?IS_TOKEN(C) ->
+	case C of
+		?INLINE_LOWERCASE(conneg, R, Acc, <<>>)
+	end.
+
+%% Inside a charset token T: keep consuming token characters until a delimiter; no weight means 1000.
+conneg(<<>>, Acc, T) -> lists:reverse([{T, 1000}|Acc]);
+conneg(<< $,, R/bits >>, Acc, T) -> conneg_list(R, [{T, 1000}|Acc]);
+conneg(<< $;, R/bits >>, Acc, T) -> conneg_before_weight(R, Acc, T);
+conneg(<< WS, R/bits >>, Acc, T) when WS =:= $\s; WS =:= $\t -> conneg_before_semicolon(R, Acc, T);
+conneg(<< C, R/bits >>, Acc, T) when ?IS_TOKEN(C) ->
+	case C of
+		?INLINE_LOWERCASE(conneg, R, Acc, T)
+	end.
+
+%% Whitespace followed the charset token: skip the rest of it, then expect end of input, `,' or `;'.
+conneg_before_semicolon(<<>>, Acc, Name) -> lists:reverse([{Name, 1000}|Acc]);
+conneg_before_semicolon(<< $,, Rest/bits >>, Acc, Name) -> conneg_list(Rest, [{Name, 1000}|Acc]);
+conneg_before_semicolon(<< $;, Rest/bits >>, Acc, Name) -> conneg_before_weight(Rest, Acc, Name);
+conneg_before_semicolon(<< WS, Rest/bits >>, Acc, Name) when WS =:= $\s; WS =:= $\t -> conneg_before_semicolon(Rest, Acc, Name).
+
+%% After `;': skip whitespace, then expect the weight ("q=" or "Q="; ABNF literals are case-insensitive).
+conneg_before_weight(<< WS, R/bits >>, Acc, T) when WS =:= $\s; WS =:= $\t -> conneg_before_weight(R, Acc, T);
+conneg_before_weight(<< Q, $=, R/bits >>, Acc, T) when Q =:= $q; Q =:= $Q -> conneg_weight(R, Acc, T);
+%% Special clause for broken user agents that confuse ; and , separators.
+conneg_before_weight(<< C, R/bits >>, Acc, T) when ?IS_TOKEN(C) ->
+	case C of
+		?INLINE_LOWERCASE(conneg, R, [{T, 1000}|Acc], <<>>)
+	end.
+
+%% Parse the qvalue that follows "q=" and store it as an integer scaled to 0..1000.
+%% Accepted forms: "1" with up to three "0" decimals, or "0" with up to three decimal digits.
+%% Longer forms are listed first so the longest match wins; leftovers are checked by conneg_list_sep/2.
+conneg_weight(<< "1.000", Rest/bits >>, Acc, Name) -> conneg_list_sep(Rest, [{Name, 1000}|Acc]);
+conneg_weight(<< "1.00", Rest/bits >>, Acc, Name) -> conneg_list_sep(Rest, [{Name, 1000}|Acc]);
+conneg_weight(<< "1.0", Rest/bits >>, Acc, Name) -> conneg_list_sep(Rest, [{Name, 1000}|Acc]);
+conneg_weight(<< "1.", Rest/bits >>, Acc, Name) -> conneg_list_sep(Rest, [{Name, 1000}|Acc]);
+conneg_weight(<< "1", Rest/bits >>, Acc, Name) -> conneg_list_sep(Rest, [{Name, 1000}|Acc]);
+conneg_weight(<< "0.", A, B, C, Rest/bits >>, Acc, Name) when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) ->
+	conneg_list_sep(Rest, [{Name, 100 * (A - $0) + 10 * (B - $0) + (C - $0)}|Acc]);
+conneg_weight(<< "0.", A, B, Rest/bits >>, Acc, Name) when ?IS_DIGIT(A), ?IS_DIGIT(B) ->
+	conneg_list_sep(Rest, [{Name, 100 * (A - $0) + 10 * (B - $0)}|Acc]);
+conneg_weight(<< "0.", A, Rest/bits >>, Acc, Name) when ?IS_DIGIT(A) ->
+	conneg_list_sep(Rest, [{Name, 100 * (A - $0)}|Acc]);
+conneg_weight(<< "0.", Rest/bits >>, Acc, Name) -> conneg_list_sep(Rest, [{Name, 0}|Acc]);
+conneg_weight(<< "0", Rest/bits >>, Acc, Name) -> conneg_list_sep(Rest, [{Name, 0}|Acc]).
+
+%% After a weight: skip whitespace until the `,' that precedes the next entry, or until end of input.
+conneg_list_sep(<<>>, Acc) -> lists:reverse(Acc);
+conneg_list_sep(<< $,, Rest/bits >>, Acc) -> conneg_list(Rest, Acc);
+conneg_list_sep(<< WS, Rest/bits >>, Acc) when WS =:= $\s; WS =:= $\t -> conneg_list_sep(Rest, Acc).
+
+-ifdef(TEST).
+%% Each case pairs a raw header value with the exact list the parser must return;
+%% the match inside the fun fails the test on any difference.
+parse_accept_charset_test_() ->
+	Cases = [
+		%% Well-formed list: default weight first, then an explicit q-value.
+		{<<"iso-8859-5, unicode-1-1;q=0.8">>,
+			[{<<"iso-8859-5">>, 1000}, {<<"unicode-1-1">>, 800}]},
+		%% Some user agents send this invalid value for the Accept-Charset header
+		%% (`;' where `,' belongs); the first charset still gets the default weight.
+		{<<"ISO-8859-1;utf-8;q=0.7,*;q=0.7">>,
+			[{<<"iso-8859-1">>, 1000}, {<<"utf-8">>, 700}, {<<"*">>, 700}]}
+	],
+	[{Header, fun() -> Expected = parse_accept_charset(Header) end}
+		|| {Header, Expected} <- Cases].
+
+%% Header values that must make the parser crash instead of returning a list.
+%% Only the empty header is covered here.
+parse_accept_charset_error_test_() ->
+	Invalid = [<<>>],
+	[{Header, fun() -> {'EXIT', _} = (catch parse_accept_charset(Header)) end} || Header <- Invalid].
+-endif.
+
+-ifdef(PERF).
+horse_parse_accept_charset() -> % Benchmark entry point; only compiled when PERF is defined.
+	horse:repeat(20000, % Presumably the repetition count for the expression below; confirm against horse.
+		parse_accept_charset(<<"iso-8859-5, unicode-1-1;q=0.8">>) % Two entries, one with an explicit weight.
+	).
+-endif.
+
 %% @doc Parse the Connection header.
 
 -spec parse_connection(binary()) -> [binary()].
-- 
cgit v1.2.3