From 223902002097370da74494e7d3d5463ed1707ea7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?=
Date: Sun, 14 Dec 2014 23:24:44 +0200
Subject: Add cow_http_hd:parse_content_type/1

From RFC7231.

This implementation is about 4 times faster than the one currently
found in Cowboy.
---
 src/cow_http_hd.erl | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 168 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/cow_http_hd.erl b/src/cow_http_hd.erl
index 668b80d..49db989 100644
--- a/src/cow_http_hd.erl
+++ b/src/cow_http_hd.erl
@@ -20,10 +20,14 @@
 -export([parse_accept_language/1]).
 -export([parse_connection/1]).
 -export([parse_content_length/1]).
+-export([parse_content_type/1]).
 -export([parse_expect/1]).
 -export([parse_max_forwards/1]).
 -export([parse_transfer_encoding/1]).
 
+-type media_type() :: {binary(), binary(), [{binary(), binary()}]}.
+-export_type([media_type/0]).
+
 -type qvalue() :: 0..1000.
 -export_type([qvalue/0]).
 
@@ -31,11 +35,34 @@
 
 -ifdef(TEST).
 -include_lib("triq/include/triq.hrl").
+%% Character lists feeding the generators below: ASCII letters (a-z then A-Z) and ASCII digits.
+alpha_chars() -> lists:seq($a, $z) ++ lists:seq($A, $Z).
+digit_chars() -> lists:seq($0, $9).
+%% tchar/0 draws from the listed punctuation, digits and letters; token/0 is a non-empty binary of them.
+tchar() -> oneof([$!, $#, $$, $%, $&, $', $*, $+, $-, $., $^, $_, $`, $|, $~] ++ digit_chars() ++ alpha_chars()).
+token() -> ?LET(T, non_empty(list(tchar())), list_to_binary(T)).
+%% Draws from tab, space, "!", 16#23-16#5b, 16#5d-16#7e and 16#80-16#ff: neither 16#22 nor 16#5c is included.
+qdtext() ->
+    oneof([$\t, $\s, $!] ++ lists:seq(16#23, 16#5b) ++ lists:seq(16#5d, 16#7e) ++ lists:seq(16#80, 16#ff)).
+%% Two-element list: a backslash, then a draw from tab, space, 16#21-16#7e and 16#80-16#ff.
+quoted_pair() ->
+    [$\\, oneof([$\t, $\s] ++ lists:seq(16#21, 16#7e) ++ lists:seq(16#80, 16#ff))].
+%% Three-element list: opening quote, a list mixing qdtext (weight 100) and quoted_pair (weight 1), closing quote.
+quoted_string() ->
+    [$", list(frequency([{100, qdtext()}, {1, quoted_pair()}])), $"].
+%% Helper function for ( token / quoted-string ) values. A three-element list shaped like
+%% quoted_string/0 is unescaped into a binary; any other term is returned unchanged.
+unquote([$", V, $"]) -> unquote(V, <<>>);
+unquote(V) -> V.
+%% Appends each element to Acc; for a [Backslash, C] pair only C is appended.
+unquote([], Acc) -> Acc;
+unquote([[$\\, C]|Tail], Acc) -> unquote(Tail, << Acc/binary, C >>);
+unquote([C|Tail], Acc) -> unquote(Tail, << Acc/binary, C >>).
 -endif.
 
 %% @doc Parse the Accept header.
 
--spec parse_accept(binary()) -> [{{binary(), binary(), [{binary(), binary()}]}, qvalue(), [binary() | {binary(), binary()}]}].
+-spec parse_accept(binary()) -> [{media_type(), qvalue(), [binary() | {binary(), binary()}]}].
 parse_accept(<<"*/*">>) ->
     [{{<<"*">>, <<"*">>, []}, 1000, []}];
 parse_accept(Accept) ->
@@ -566,6 +593,146 @@ horse_parse_content_length_giga() ->
     ).
 -endif.
 
+%% @doc Parse the Content-Type header.
+%%
+%% Returns `{Type, SubType, Params}'. The header is consumed one byte at a
+%% time by a chain of state functions; input that no clause accepts makes
+%% the call fail with a function_clause error.
+
+-spec parse_content_type(binary()) -> media_type().
+parse_content_type(<< C, Rest/bits >>) when ?IS_TOKEN(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_type, Rest, <<>>)
+    end.
+
+%% Reading the type: it ends at a "/" that is immediately followed by the
+%% first character of the subtype.
+media_type(<< $/, C, Rest/bits >>, Type) when ?IS_TOKEN(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_subtype, Rest, Type, <<>>)
+    end;
+media_type(<< C, Rest/bits >>, Type) when ?IS_TOKEN(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_type, Rest, Type)
+    end.
+
+%% Reading the subtype: it ends at the end of input, at ";" or at whitespace.
+media_subtype(<<>>, Type, SubType) ->
+    {Type, SubType, []};
+media_subtype(<< $;, Rest/bits >>, Type, SubType) ->
+    media_before_param(Rest, Type, SubType, []);
+media_subtype(<< C, Rest/bits >>, Type, SubType) when C =:= $\s; C =:= $\t ->
+    media_before_semicolon(Rest, Type, SubType, []);
+media_subtype(<< C, Rest/bits >>, Type, SubType) when ?IS_TOKEN(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_subtype, Rest, Type, SubType)
+    end.
+
+%% Between a completed value and the next ";": only spaces and tabs may
+%% appear. Params is accumulated in reverse, hence the final reversal.
+media_before_semicolon(<<>>, Type, SubType, Params) ->
+    {Type, SubType, lists:reverse(Params)};
+media_before_semicolon(<< $;, Rest/bits >>, Type, SubType, Params) ->
+    media_before_param(Rest, Type, SubType, Params);
+media_before_semicolon(<< C, Rest/bits >>, Type, SubType, Params) when C =:= $\s; C =:= $\t ->
+    media_before_semicolon(Rest, Type, SubType, Params).
+
+%% After ";": skip spaces and tabs, then start reading a parameter.
+%%
+%% The two literal "charset=" clauses are a shortcut for the all-lowercase
+%% spelling. Any other spelling of the name ("Charset=", "CHARSET=", ...)
+%% goes through media_param/5, which switches to the charset states once
+%% the lowercased name is known.
+media_before_param(<< $\s, R/bits >>, T, S, P) -> media_before_param(R, T, S, P);
+media_before_param(<< $\t, R/bits >>, T, S, P) -> media_before_param(R, T, S, P);
+media_before_param(<< "charset=", $", R/bits >>, T, S, P) -> media_charset_quoted(R, T, S, P, <<>>);
+media_before_param(<< "charset=", R/bits >>, T, S, P) -> media_charset(R, T, S, P, <<>>);
+media_before_param(<< C, R/bits >>, T, S, P) when ?IS_TOKEN(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_param, R, T, S, P, <<>>)
+    end.
+
+%% Quoted charset value: every character up to the closing quote is
+%% lowercased into V; a backslash makes the following character literal.
+media_charset_quoted(<< $", R/bits >>, T, S, P, V) ->
+    media_before_semicolon(R, T, S, [{<<"charset">>, V}|P]);
+media_charset_quoted(<< $\\, C, R/bits >>, T, S, P, V) when ?IS_VCHAR(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_charset_quoted, R, T, S, P, V)
+    end;
+media_charset_quoted(<< C, R/bits >>, T, S, P, V) when ?IS_VCHAR(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_charset_quoted, R, T, S, P, V)
+    end.
+
+%% Unquoted charset value: lowercased token characters, ended by ";",
+%% a space or tab, or the end of input.
+media_charset(<<>>, T, S, P, V) -> {T, S, lists:reverse([{<<"charset">>, V}|P])};
+media_charset(<< $;, R/bits >>, T, S, P, V) -> media_before_param(R, T, S, [{<<"charset">>, V}|P]);
+media_charset(<< $\s, R/bits >>, T, S, P, V) -> media_before_semicolon(R, T, S, [{<<"charset">>, V}|P]);
+media_charset(<< $\t, R/bits >>, T, S, P, V) -> media_before_semicolon(R, T, S, [{<<"charset">>, V}|P]);
+media_charset(<< C, R/bits >>, T, S, P, V) when ?IS_TOKEN(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_charset, R, T, S, P, V)
+    end.
+
+%% Parameter name: K is lowercased as it is read, up to "=".
+%%
+%% Parameter names are case-insensitive, so a name that lowercases to
+%% "charset" must get the same value normalization as the literal
+%% "charset=" shortcut in media_before_param/4. Without the first two
+%% clauses "Charset=UTF-8" would keep its value as <<"UTF-8">> while
+%% "charset=UTF-8" yields <<"utf-8">>.
+media_param(<< $=, $", R/bits >>, T, S, P, <<"charset">>) -> media_charset_quoted(R, T, S, P, <<>>);
+media_param(<< $=, R/bits >>, T, S, P, <<"charset">>) -> media_charset(R, T, S, P, <<>>);
+media_param(<< $=, $", R/bits >>, T, S, P, K) -> media_quoted(R, T, S, P, K, <<>>);
+media_param(<< $=, R/bits >>, T, S, P, K) -> media_value(R, T, S, P, K, <<>>);
+media_param(<< C, R/bits >>, T, S, P, K) when ?IS_TOKEN(C) ->
+    case C of
+        ?INLINE_LOWERCASE(media_param, R, T, S, P, K)
+    end.
+
+%% Quoted parameter value: characters are kept as written (no case
+%% folding) up to the closing quote; a backslash makes the following
+%% character literal.
+media_quoted(<< $", Rest/bits >>, Type, SubType, Params, Name, Value) ->
+    media_before_semicolon(Rest, Type, SubType, [{Name, Value}|Params]);
+media_quoted(<< $\\, C, Rest/bits >>, Type, SubType, Params, Name, Value) when ?IS_VCHAR(C) ->
+    media_quoted(Rest, Type, SubType, Params, Name, << Value/binary, C >>);
+media_quoted(<< C, Rest/bits >>, Type, SubType, Params, Name, Value) when ?IS_VCHAR(C) ->
+    media_quoted(Rest, Type, SubType, Params, Name, << Value/binary, C >>).
+
+%% Unquoted parameter value: token characters kept as written, ended by
+%% ";", a space or tab, or the end of input.
+media_value(<<>>, Type, SubType, Params, Name, Value) ->
+    {Type, SubType, lists:reverse([{Name, Value}|Params])};
+media_value(<< $;, Rest/bits >>, Type, SubType, Params, Name, Value) ->
+    media_before_param(Rest, Type, SubType, [{Name, Value}|Params]);
+media_value(<< C, Rest/bits >>, Type, SubType, Params, Name, Value) when C =:= $\s; C =:= $\t ->
+    media_before_semicolon(Rest, Type, SubType, [{Name, Value}|Params]);
+media_value(<< C, Rest/bits >>, Type, SubType, Params, Name, Value) when ?IS_TOKEN(C) ->
+    media_value(Rest, Type, SubType, Params, Name, << Value/binary, C >>).
+
+-ifdef(TEST).
+%% Generator for one {Name, Value} parameter: weight 90 for a random token
+%% name, weight 10 for the literal name <<"charset">>. The value is a token
+%% or a quoted-string in both cases.
+media_type_parameter() ->
+    Value = oneof([token(), quoted_string()]),
+    frequency([
+        {90, {token(), Value}},
+        {10, {<<"charset">>, Value}}
+    ]).
+
+%% Generator for {Type, SubType, Params, Header}, where Header is the
+%% binary "Type/SubType" followed by ";Name=Value" for each parameter.
+media_type() ->
+    ?LET({Type, SubType, Params},
+        {token(), token(), list(media_type_parameter())},
+        begin
+            Suffix = [[$;, Name, $=, Value] || {Name, Value} <- Params],
+            {Type, SubType, Params, iolist_to_binary([Type, $/, SubType, Suffix])}
+        end
+    ).
+
+%% Property: parsing a generated header gives back the lowercased type and
+%% subtype, and each parameter with a lowercased name and unquoted value;
+%% the value is lowercased too when the lowercased name is <<"charset">>.
+prop_parse_content_type() ->
+    ?FORALL({T, S, P, MediaType},
+        media_type(),
+        begin
+            {GotType, GotSubType, GotParams} = parse_content_type(MediaType),
+            Expect = fun({Name, Value}) ->
+                case ?INLINE_LOWERCASE_BC(Name) of
+                    <<"charset">> -> {<<"charset">>, ?INLINE_LOWERCASE_BC(unquote(Value))};
+                    LowName -> {LowName, unquote(Value)}
+                end
+            end,
+            WantParams = lists:map(Expect, P),
+            GotType =:= ?INLINE_LOWERCASE_BC(T)
+                andalso GotSubType =:= ?INLINE_LOWERCASE_BC(S)
+                andalso GotParams =:= WantParams
+        end
+    ).
+
+%% Test generator: one {Title, Fun} pair per case. The header binary is
+%% used as the title and the fun checks the result by matching it against
+%% the expected tuple.
+parse_content_type_test_() ->
+    Cases = [
+        {<<"text/html;charset=utf-8">>,
+            {<<"text">>, <<"html">>, [{<<"charset">>, <<"utf-8">>}]}},
+        {<<"text/html;charset=UTF-8">>,
+            {<<"text">>, <<"html">>, [{<<"charset">>, <<"utf-8">>}]}},
+        {<<"Text/HTML;Charset=\"utf-8\"">>,
+            {<<"text">>, <<"html">>, [{<<"charset">>, <<"utf-8">>}]}},
+        {<<"text/html; charset=\"utf-8\"">>,
+            {<<"text">>, <<"html">>, [{<<"charset">>, <<"utf-8">>}]}},
+        {<<"text/html; charset=ISO-8859-4">>,
+            {<<"text">>, <<"html">>, [{<<"charset">>, <<"iso-8859-4">>}]}},
+        {<<"text/plain; charset=iso-8859-4">>,
+            {<<"text">>, <<"plain">>, [{<<"charset">>, <<"iso-8859-4">>}]}},
+        {<<"multipart/form-data \t;Boundary=\"MultipartIsUgly\"">>,
+            {<<"multipart">>, <<"form-data">>, [
+                {<<"boundary">>, <<"MultipartIsUgly">>}
+            ]}},
+        {<<"foo/bar; one=FirstParam; two=SecondParam">>,
+            {<<"foo">>, <<"bar">>, [
+                {<<"one">>, <<"FirstParam">>},
+                {<<"two">>, <<"SecondParam">>}
+            ]}}
+    ],
+    ToTest = fun({Header, Expected}) ->
+        {Header, fun() -> Expected = parse_content_type(Header) end}
+    end,
+    lists:map(ToTest, Cases).
+-endif.
+
+-ifdef(PERF).
+%% Benchmark entry: parses a typical header 200000 times through horse:repeat.
+horse_parse_content_type() ->
+    horse:repeat(200000,
+        parse_content_type(<<"text/html;charset=utf-8">>)
+    ).
+-endif.
+
 %% @doc Parse the Expect header.
 
 -spec parse_expect(binary()) -> continue.
-- 
cgit v1.2.3