From 20ada930c3996945fe2771518da05b5e7ae9b904 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Sat, 13 Dec 2014 11:13:05 +0200 Subject: Add cow_http_hd:parse_accept/1 From RFC7231. This code is more than twice faster as the current Cowboy code, while filtering out more bad cases. --- include/cow_inline.hrl | 32 ++++++++ src/cow_http_hd.erl | 202 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 234 insertions(+) diff --git a/include/cow_inline.hrl b/include/cow_inline.hrl index 36a3558..82cc465 100644 --- a/include/cow_inline.hrl +++ b/include/cow_inline.hrl @@ -15,6 +15,38 @@ -ifndef(COW_INLINE_HRL). -define(COW_INLINE_HRL, 1). +%% IS_DIGIT(Character) + +-define(IS_DIGIT(C), + C >= $0, C =< $9 +). + +%% IS_TOKEN(Character) + +-define(IS_TOKEN(C), + C =:= $a; C =:= $b; C =:= $c; C =:= $d; C =:= $e; + C =:= $f; C =:= $g; C =:= $h; C =:= $i; C =:= $j; + C =:= $k; C =:= $l; C =:= $m; C =:= $n; C =:= $o; + C =:= $p; C =:= $q; C =:= $r; C =:= $s; C =:= $t; + C =:= $u; C =:= $v; C =:= $w; C =:= $x; C =:= $y; + C =:= $z; + C =:= $A; C =:= $B; C =:= $C; C =:= $D; C =:= $E; + C =:= $F; C =:= $G; C =:= $H; C =:= $I; C =:= $J; + C =:= $K; C =:= $L; C =:= $M; C =:= $N; C =:= $O; + C =:= $P; C =:= $Q; C =:= $R; C =:= $S; C =:= $T; + C =:= $U; C =:= $V; C =:= $W; C =:= $X; C =:= $Y; + C =:= $2; + C =:= $0; C =:= $1; C =:= $2; C =:= $3; C =:= $4; + C =:= $5; C =:= $6; C =:= $7; C =:= $8; C =:= $9; + C =:= $!; C =:= $#; C =:= $$; C =:= $%; C =:= $&; + C =:= $'; C =:= $*; C =:= $+; C =:= $-; C =:= $.; + C =:= $^; C =:= $_; C =:= $`; C =:= $|; C =:= $~ +). + +%% IS_VCHAR(Character) + +-define(IS_VCHAR(C), C =:= $\t; C > 31, C =/= 127). + %% INLINE_LOWERCASE(Function, Rest, Acc, ...) %% %% To be included at the end of a case block. diff --git a/src/cow_http_hd.erl b/src/cow_http_hd.erl index 9a920a9..a6624a8 100644 --- a/src/cow_http_hd.erl +++ b/src/cow_http_hd.erl @@ -14,14 +14,216 @@ -module(cow_http_hd). +-export([parse_accept/1]). -export([parse_connection/1]). -export([parse_content_length/1]). -export([parse_expect/1]). -export([parse_max_forwards/1]). -export([parse_transfer_encoding/1]). +-type qvalue() :: 0..1000. +-export_type([qvalue/0]). + -include("cow_inline.hrl"). +%% @doc Parse the Accept header. + +-spec parse_accept(binary()) -> [{{binary(), binary(), [{binary(), binary()}]}, qvalue(), [binary() | {binary(), binary()}]}]. +parse_accept(<<"*/*">>) -> + [{{<<"*">>, <<"*">>, []}, 1000, []}]; +parse_accept(Accept) -> + nonempty(media_range_list(Accept, [])). + +media_range_list(<<>>, Acc) -> lists:reverse(Acc); +media_range_list(<< $\s, R/bits >>, Acc) -> media_range_list(R, Acc); +media_range_list(<< $\t, R/bits >>, Acc) -> media_range_list(R, Acc); +media_range_list(<< $,, R/bits >>, Acc) -> media_range_list(R, Acc); +media_range_list(<< C, R/bits >>, Acc) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(media_range_type, R, Acc, <<>>) + end. + +media_range_type(<< $/, R/bits >>, Acc, T) -> media_range_subtype(R, Acc, T, <<>>); +%% Special clause for badly behaving user agents that send * instead of */*. +media_range_type(<< _, R/bits >>, Acc, <<"*">>) -> media_range_before_param(R, Acc, <<"*">>, <<"*">>, []); +media_range_type(<< C, R/bits >>, Acc, T) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(media_range_type, R, Acc, T) + end. + +media_range_subtype(<<>>, Acc, T, S) when S =/= <<>> -> lists:reverse([{{T, S, []}, 1000, []}|Acc]); +media_range_subtype(<< $,, R/bits >>, Acc, T, S) when S =/= <<>> -> media_range_list(R, [{{T, S, []}, 1000, []}|Acc]); +media_range_subtype(<< $;, R/bits >>, Acc, T, S) when S =/= <<>> -> media_range_before_param(R, Acc, T, S, []); +media_range_subtype(<< $\s, R/bits >>, Acc, T, S) when S =/= <<>> -> media_range_before_semicolon(R, Acc, T, S, []); +media_range_subtype(<< $\t, R/bits >>, Acc, T, S) when S =/= <<>> -> media_range_before_semicolon(R, Acc, T, S, []); +media_range_subtype(<< C, R/bits >>, Acc, T, S) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(media_range_subtype, R, Acc, T, S) + end. + +media_range_before_semicolon(<<>>, Acc, T, S, P) -> lists:reverse([{{T, S, lists:reverse(P)}, 1000, []}|Acc]); +media_range_before_semicolon(<< $,, R/bits >>, Acc, T, S, P) -> media_range_list(R, [{{T, S, lists:reverse(P)}, 1000, []}|Acc]); +media_range_before_semicolon(<< $;, R/bits >>, Acc, T, S, P) -> media_range_before_param(R, Acc, T, S, P); +media_range_before_semicolon(<< $\s, R/bits >>, Acc, T, S, P) -> media_range_before_semicolon(R, Acc, T, S, P); +media_range_before_semicolon(<< $\t, R/bits >>, Acc, T, S, P) -> media_range_before_semicolon(R, Acc, T, S, P). + +media_range_before_param(<< $\s, R/bits >>, Acc, T, S, P) -> media_range_before_param(R, Acc, T, S, P); +media_range_before_param(<< $\t, R/bits >>, Acc, T, S, P) -> media_range_before_param(R, Acc, T, S, P); +%% Special clause for badly behaving user agents that send .123 instead of 0.123. +media_range_before_param(<< $q, $=, $., R/bits >>, Acc, T, S, P) -> media_range_broken_weight(R, Acc, T, S, P); +media_range_before_param(<< $q, $=, R/bits >>, Acc, T, S, P) -> media_range_weight(R, Acc, T, S, P); +media_range_before_param(<< C, R/bits >>, Acc, T, S, P) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(media_range_param, R, Acc, T, S, P, <<>>) + end. + +media_range_param(<< $=, $", R/bits >>, Acc, T, S, P, K) -> media_range_quoted(R, Acc, T, S, P, K, <<>>); +media_range_param(<< $=, R/bits >>, Acc, T, S, P, K) -> media_range_value(R, Acc, T, S, P, K, <<>>); +media_range_param(<< C, R/bits >>, Acc, T, S, P, K) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(media_range_param, R, Acc, T, S, P, K) + end. + +media_range_quoted(<< $", R/bits >>, Acc, T, S, P, K, V) -> media_range_before_semicolon(R, Acc, T, S, [{K, V}|P]); +media_range_quoted(<< $\\, C, R/bits >>, Acc, T, S, P, K, V) when ?IS_VCHAR(C) -> media_range_quoted(R, Acc, T, S, P, K, << V/binary, C >>); +media_range_quoted(<< C, R/bits >>, Acc, T, S, P, K, V) when ?IS_VCHAR(C) -> media_range_quoted(R, Acc, T, S, P, K, << V/binary, C >>). + +media_range_value(<<>>, Acc, T, S, P, K, V) -> lists:reverse([{{T, S, lists:reverse([{K, V}|P])}, 1000, []}|Acc]); +media_range_value(<< $,, R/bits >>, Acc, T, S, P, K, V) -> media_range_list(R, [{{T, S, lists:reverse([{K, V}|P])}, 1000, []}|Acc]); +media_range_value(<< $;, R/bits >>, Acc, T, S, P, K, V) -> media_range_before_param(R, Acc, T, S, [{K, V}|P]); +media_range_value(<< $\s, R/bits >>, Acc, T, S, P, K, V) -> media_range_before_semicolon(R, Acc, T, S, [{K, V}|P]); +media_range_value(<< $\t, R/bits >>, Acc, T, S, P, K, V) -> media_range_before_semicolon(R, Acc, T, S, [{K, V}|P]); +media_range_value(<< C, R/bits >>, Acc, T, S, P, K, V) when ?IS_TOKEN(C) -> media_range_value(R, Acc, T, S, P, K, << V/binary, C >>). + +%% Special function for badly behaving user agents that send .123 instead of 0.123. +media_range_broken_weight(<< A, B, C, R/bits >>, Acc, T, S, P) + when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) -> + accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100 + (B - $0) * 10 + (C - $0), []); +media_range_broken_weight(<< A, B, R/bits >>, Acc, T, S, P) + when ?IS_DIGIT(A), ?IS_DIGIT(B) -> + accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100 + (B - $0) * 10, []); +media_range_broken_weight(<< A, R/bits >>, Acc, T, S, P) + when ?IS_DIGIT(A) -> + accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100, []). + +media_range_weight(<< "1.000", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 1000, []); +media_range_weight(<< "1.00", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 1000, []); +media_range_weight(<< "1.0", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 1000, []); +media_range_weight(<< "1.", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 1000, []); +media_range_weight(<< "1", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 1000, []); +media_range_weight(<< "0.", A, B, C, R/bits >>, Acc, T, S, P) + when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) -> + accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100 + (B - $0) * 10 + (C - $0), []); +media_range_weight(<< "0.", A, B, R/bits >>, Acc, T, S, P) + when ?IS_DIGIT(A), ?IS_DIGIT(B) -> + accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100 + (B - $0) * 10, []); +media_range_weight(<< "0.", A, R/bits >>, Acc, T, S, P) + when ?IS_DIGIT(A) -> + accept_before_semicolon(R, Acc, T, S, P, (A - $0) * 100, []); +media_range_weight(<< "0.", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 0, []); +media_range_weight(<< "0", R/bits >>, Acc, T, S, P) -> accept_before_semicolon(R, Acc, T, S, P, 0, []). + +accept_before_semicolon(<<>>, Acc, T, S, P, Q, E) -> lists:reverse([{{T, S, lists:reverse(P)}, Q, lists:reverse(E)}|Acc]); +accept_before_semicolon(<< $,, R/bits >>, Acc, T, S, P, Q, E) -> media_range_list(R, [{{T, S, lists:reverse(P)}, Q, lists:reverse(E)}|Acc]); +accept_before_semicolon(<< $;, R/bits >>, Acc, T, S, P, Q, E) -> accept_before_ext(R, Acc, T, S, P, Q, E); +accept_before_semicolon(<< $\s, R/bits >>, Acc, T, S, P, Q, E) -> accept_before_semicolon(R, Acc, T, S, P, Q, E); +accept_before_semicolon(<< $\t, R/bits >>, Acc, T, S, P, Q, E) -> accept_before_semicolon(R, Acc, T, S, P, Q, E). + +accept_before_ext(<< $\s, R/bits >>, Acc, T, S, P, Q, E) -> accept_before_ext(R, Acc, T, S, P, Q, E); +accept_before_ext(<< $\t, R/bits >>, Acc, T, S, P, Q, E) -> accept_before_ext(R, Acc, T, S, P, Q, E); +accept_before_ext(<< C, R/bits >>, Acc, T, S, P, Q, E) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(accept_ext, R, Acc, T, S, P, Q, E, <<>>) + end. + +accept_ext(<<>>, Acc, T, S, P, Q, E, K) -> lists:reverse([{{T, S, lists:reverse(P)}, Q, lists:reverse([K|E])}|Acc]); +accept_ext(<< $,, R/bits >>, Acc, T, S, P, Q, E, K) -> media_range_list(R, [{{T, S, lists:reverse(P)}, Q, lists:reverse([K|E])}|Acc]); +accept_ext(<< $;, R/bits >>, Acc, T, S, P, Q, E, K) -> accept_before_ext(R, Acc, T, S, P, Q, [K|E]); +accept_ext(<< $\s, R/bits >>, Acc, T, S, P, Q, E, K) -> accept_before_semicolon(R, Acc, T, S, P, Q, [K|E]); +accept_ext(<< $\t, R/bits >>, Acc, T, S, P, Q, E, K) -> accept_before_semicolon(R, Acc, T, S, P, Q, [K|E]); +accept_ext(<< $=, $", R/bits >>, Acc, T, S, P, Q, E, K) -> accept_quoted(R, Acc, T, S, P, Q, E, K, <<>>); +accept_ext(<< $=, R/bits >>, Acc, T, S, P, Q, E, K) -> accept_value(R, Acc, T, S, P, Q, E, K, <<>>); +accept_ext(<< C, R/bits >>, Acc, T, S, P, Q, E, K) when ?IS_TOKEN(C) -> + case C of + ?INLINE_LOWERCASE(accept_ext, R, Acc, T, S, P, Q, E, K) + end. + +accept_quoted(<< $", R/bits >>, Acc, T, S, P, Q, E, K, V) -> accept_before_semicolon(R, Acc, T, S, P, Q, [{K, V}|E]); +accept_quoted(<< $\\, C, R/bits >>, Acc, T, S, P, Q, E, K, V) when ?IS_VCHAR(C) -> accept_quoted(R, Acc, T, S, P, Q, E, K, << V/binary, C >>); +accept_quoted(<< C, R/bits >>, Acc, T, S, P, Q, E, K, V) when ?IS_VCHAR(C) -> accept_quoted(R, Acc, T, S, P, Q, E, K, << V/binary, C >>). + +accept_value(<<>>, Acc, T, S, P, Q, E, K, V) -> lists:reverse([{{T, S, lists:reverse(P)}, Q, lists:reverse([{K, V}|E])}|Acc]); +accept_value(<< $,, R/bits >>, Acc, T, S, P, Q, E, K, V) -> media_range_list(R, [{{T, S, lists:reverse(P)}, Q, lists:reverse([{K, V}|E])}|Acc]); +accept_value(<< $;, R/bits >>, Acc, T, S, P, Q, E, K, V) -> accept_before_semicolon(R, Acc, T, S, P, Q, [{K, V}|E]); +accept_value(<< $\s, R/bits >>, Acc, T, S, P, Q, E, K, V) -> accept_before_semicolon(R, Acc, T, S, P, Q, [{K, V}|E]); +accept_value(<< $\t, R/bits >>, Acc, T, S, P, Q, E, K, V) -> accept_before_semicolon(R, Acc, T, S, P, Q, [{K, V}|E]); +accept_value(<< C, R/bits >>, Acc, T, S, P, Q, E, K, V) when ?IS_TOKEN(C) -> accept_value(R, Acc, T, S, P, Q, E, K, << V/binary, C >>). + +-ifdef(TEST). +parse_accept_test_() -> + Tests = [ + {<<"audio/*; q=0.2, audio/basic">>, [ + {{<<"audio">>, <<"*">>, []}, 200, []}, + {{<<"audio">>, <<"basic">>, []}, 1000, []} + ]}, + {<<"text/plain; q=0.5, text/html, " + "text/x-dvi; q=0.8, text/x-c">>, [ + {{<<"text">>, <<"plain">>, []}, 500, []}, + {{<<"text">>, <<"html">>, []}, 1000, []}, + {{<<"text">>, <<"x-dvi">>, []}, 800, []}, + {{<<"text">>, <<"x-c">>, []}, 1000, []} + ]}, + {<<"text/*, text/html, text/html;level=1, */*">>, [ + {{<<"text">>, <<"*">>, []}, 1000, []}, + {{<<"text">>, <<"html">>, []}, 1000, []}, + {{<<"text">>, <<"html">>, [{<<"level">>, <<"1">>}]}, 1000, []}, + {{<<"*">>, <<"*">>, []}, 1000, []} + ]}, + {<<"text/*;q=0.3, text/html;q=0.7, text/html;level=1, " + "text/html;level=2;q=0.4, */*;q=0.5">>, [ + {{<<"text">>, <<"*">>, []}, 300, []}, + {{<<"text">>, <<"html">>, []}, 700, []}, + {{<<"text">>, <<"html">>, [{<<"level">>, <<"1">>}]}, 1000, []}, + {{<<"text">>, <<"html">>, [{<<"level">>, <<"2">>}]}, 400, []}, + {{<<"*">>, <<"*">>, []}, 500, []} + ]}, + {<<"text/html;level=1;quoted=\"hi hi hi\";" + "q=0.123;standalone;complex=gits, text/plain">>, [ + {{<<"text">>, <<"html">>, + [{<<"level">>, <<"1">>}, {<<"quoted">>, <<"hi hi hi">>}]}, 123, + [<<"standalone">>, {<<"complex">>, <<"gits">>}]}, + {{<<"text">>, <<"plain">>, []}, 1000, []} + ]}, + {<<"text/html, image/gif, image/jpeg, *; q=.2, */*; q=.2">>, [ + {{<<"text">>, <<"html">>, []}, 1000, []}, + {{<<"image">>, <<"gif">>, []}, 1000, []}, + {{<<"image">>, <<"jpeg">>, []}, 1000, []}, + {{<<"*">>, <<"*">>, []}, 200, []}, + {{<<"*">>, <<"*">>, []}, 200, []} + ]} + ], + [{V, fun() -> R = parse_accept(V) end} || {V, R} <- Tests]. + +parse_accept_error_test_() -> + Tests = [ + <<>>, + <<" ">>, + <<"audio/basic, */;q=0.5">>, + <<"audio/, audio/basic">>, + <<"aud\tio/basic">>, + <<"audio/basic;t=\"zero \\", 0, " woo\"">> + ], + [{V, fun() -> {'EXIT', _} = (catch parse_accept(V)) end} || V <- Tests]. +-endif. + +-ifdef(PERF). +horse_parse_accept() -> + horse:repeat(20000, + parse_accept(<<"text/*;q=0.3, text/html;q=0.7, text/html;level=1, " + "text/html;level=2;q=0.4, */*;q=0.5">>) + ). +-endif. + %% @doc Parse the Connection header. -spec parse_connection(binary()) -> [binary()]. -- cgit v1.2.3