From 1f8a70d0860862a8b5d5819f5d9e0240abdbe69e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Sat, 20 Dec 2014 02:06:07 +0200 Subject: Add cow_http_hd:parse_content_language/1 From RFC7231 and RFC5646. The ABNF for language tags is terrible. It makes parsing efficiently a big challenge and the result is this huge ugly set of functions. Thankfully triq allows us to make sure the implementation is correct. A large number of examples has also been extracted from both RFCs. The various ?IS_ALPHA(C), ?IS_TOKEN(C) and so on have received a change: they now use 'orelse' instead of ';'. This is because in this new code we need to check more than one character per clause. The compilation time for this module increased dramatically. Apparently happens because the guards are too big. Using ranges ($a =< C =< $z) instead of the current solution makes compilation much faster, but the function executes twice as slow which is not acceptable. --- include/cow_inline.hrl | 70 ++++++++--- src/cow_http_hd.erl | 332 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 382 insertions(+), 20 deletions(-) diff --git a/include/cow_inline.hrl b/include/cow_inline.hrl index 0fb8b3b..a33b0b9 100644 --- a/include/cow_inline.hrl +++ b/include/cow_inline.hrl @@ -18,27 +18,31 @@ %% IS_ALPHA(Character) -define(IS_ALPHA(C), - C =:= $a; C =:= $b; C =:= $c; C =:= $d; C =:= $e; - C =:= $f; C =:= $g; C =:= $h; C =:= $i; C =:= $j; - C =:= $k; C =:= $l; C =:= $m; C =:= $n; C =:= $o; - C =:= $p; C =:= $q; C =:= $r; C =:= $s; C =:= $t; - C =:= $u; C =:= $v; C =:= $w; C =:= $x; C =:= $y; - C =:= $z; - C =:= $A; C =:= $B; C =:= $C; C =:= $D; C =:= $E; - C =:= $F; C =:= $G; C =:= $H; C =:= $I; C =:= $J; - C =:= $K; C =:= $L; C =:= $M; C =:= $N; C =:= $O; - C =:= $P; C =:= $Q; C =:= $R; C =:= $S; C =:= $T; - C =:= $U; C =:= $V; C =:= $W; C =:= $X; C =:= $Y; + C =:= $a orelse C =:= $b orelse C =:= $c orelse C =:= $d orelse C =:= $e orelse + C =:= $f orelse C =:= $g orelse C =:= $h orelse C =:= $i orelse C =:= $j orelse + C =:= $k orelse C =:= $l orelse C =:= $m orelse C =:= $n orelse C =:= $o orelse + C =:= $p orelse C =:= $q orelse C =:= $r orelse C =:= $s orelse C =:= $t orelse + C =:= $u orelse C =:= $v orelse C =:= $w orelse C =:= $x orelse C =:= $y orelse + C =:= $z orelse + C =:= $A orelse C =:= $B orelse C =:= $C orelse C =:= $D orelse C =:= $E orelse + C =:= $F orelse C =:= $G orelse C =:= $H orelse C =:= $I orelse C =:= $J orelse + C =:= $K orelse C =:= $L orelse C =:= $M orelse C =:= $N orelse C =:= $O orelse + C =:= $P orelse C =:= $Q orelse C =:= $R orelse C =:= $S orelse C =:= $T orelse + C =:= $U orelse C =:= $V orelse C =:= $W orelse C =:= $X orelse C =:= $Y orelse C =:= $Z ). %% IS_DIGIT(Character) -define(IS_DIGIT(C), - C =:= $0; C =:= $1; C =:= $2; C =:= $3; C =:= $4; - C =:= $5; C =:= $6; C =:= $7; C =:= $8; C =:= $9 + C =:= $0 orelse C =:= $1 orelse C =:= $2 orelse C =:= $3 orelse C =:= $4 orelse + C =:= $5 orelse C =:= $6 orelse C =:= $7 orelse C =:= $8 orelse C =:= $9 ). +%% IS_ALPHANUM(Character) + +-define(IS_ALPHANUM(C), ?IS_ALPHA(C) orelse ?IS_DIGIT(C)). + %% IS_ETAGC(Character) -define(IS_ETAGC(C), C =:= 16#21; C >= 16#23, C =/= 16#7f). @@ -46,16 +50,48 @@ %% IS_TOKEN(Character) -define(IS_TOKEN(C), - ?IS_ALPHA(C); ?IS_DIGIT(C); - C =:= $!; C =:= $#; C =:= $$; C =:= $%; C =:= $&; - C =:= $'; C =:= $*; C =:= $+; C =:= $-; C =:= $.; - C =:= $^; C =:= $_; C =:= $`; C =:= $|; C =:= $~ + ?IS_ALPHA(C) orelse ?IS_DIGIT(C) + orelse C =:= $! orelse C =:= $# orelse C =:= $$ orelse C =:= $% orelse C =:= $& + orelse C =:= $' orelse C =:= $* orelse C =:= $+ orelse C =:= $- orelse C =:= $. + orelse C =:= $^ orelse C =:= $_ orelse C =:= $` orelse C =:= $| orelse C =:= $~ ). %% IS_VCHAR(Character) -define(IS_VCHAR(C), C =:= $\t; C > 31, C =/= 127). +%% LC(Character) + +-define(LC(C), case C of + $A -> $a; + $B -> $b; + $C -> $c; + $D -> $d; + $E -> $e; + $F -> $f; + $G -> $g; + $H -> $h; + $I -> $i; + $J -> $j; + $K -> $k; + $L -> $l; + $M -> $m; + $N -> $n; + $O -> $o; + $P -> $p; + $Q -> $q; + $R -> $r; + $S -> $s; + $T -> $t; + $U -> $u; + $V -> $v; + $W -> $w; + $X -> $x; + $Y -> $y; + $Z -> $z; + _ -> C +end). + %% INLINE_LOWERCASE(Function, Rest, Acc, ...) %% %% To be included at the end of a case block. diff --git a/src/cow_http_hd.erl b/src/cow_http_hd.erl index fd9f218..6146151 100644 --- a/src/cow_http_hd.erl +++ b/src/cow_http_hd.erl @@ -20,6 +20,7 @@ -export([parse_accept_language/1]). -export([parse_connection/1]). -export([parse_content_encoding/1]). +-export([parse_content_language/1]). -export([parse_content_length/1]). -export([parse_content_type/1]). -export([parse_date/1]). @@ -57,6 +58,7 @@ ows() -> alpha_chars() -> "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". alphanum_chars() -> "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". +digit_chars() -> "0123456789". alpha() -> oneof(alpha_chars()). @@ -64,6 +66,9 @@ alpha() -> alphanum() -> oneof(alphanum_chars()). +digit() -> + oneof(digit_chars()). + tchar() -> frequency([ {1, oneof([$!, $#, $$, $%, $&, $', $*, $+, $-, $., $^, $_, $`, $|, $~])}, @@ -621,7 +626,7 @@ language_range_list_sep(<< $\t, R/bits >>, Acc) -> language_range_list_sep(R, Ac language_range_list_sep(<< $,, R/bits >>, Acc) -> language_range_list(R, Acc). -ifdef(TEST). -language_tag() -> +language_range_tag() -> oneof([ [alpha()], [alpha(), alpha()], @@ -633,7 +638,7 @@ language_tag() -> [alpha(), alpha(), alpha(), alpha(), alpha(), alpha(), alpha(), alpha()] ]). -language_subtag() -> +language_range_subtag() -> [$-, oneof([ [alphanum()], [alphanum(), alphanum()], @@ -646,7 +651,7 @@ language_subtag() -> ])]. language_range() -> - [language_tag(), list(language_subtag())]. + [language_range_tag(), list(language_range_subtag())]. accept_language() -> ?LET({R, W}, @@ -786,6 +791,327 @@ horse_parse_content_encoding() -> ). -endif. +%% @doc Parse the Content-Language header. +%% +%% We do not support irregular deprecated tags that do not match the ABNF. + +-spec parse_content_language(binary()) -> [binary()]. +parse_content_language(ContentLanguage) -> + nonempty(langtag_list(ContentLanguage, [])). + +langtag_list(<<>>, Acc) -> lists:reverse(Acc); +langtag_list(<< $\s, R/bits >>, Acc) -> langtag_list(R, Acc); +langtag_list(<< $\t, R/bits >>, Acc) -> langtag_list(R, Acc); +langtag_list(<< $,, R/bits >>, Acc) -> langtag_list(R, Acc); +langtag_list(<< A, B, C, R/bits >>, Acc) when ?IS_ALPHA(A), ?IS_ALPHA(B), ?IS_ALPHA(C) -> + langtag_extlang(R, Acc, << ?LC(A), ?LC(B), ?LC(C) >>, 0); +langtag_list(<< A, B, R/bits >>, Acc) when ?IS_ALPHA(A), ?IS_ALPHA(B) -> + langtag_extlang(R, Acc, << ?LC(A), ?LC(B) >>, 0); +langtag_list(<< X, R/bits >>, Acc) when X =:= $x; X =:= $X -> langtag_privateuse_sub(R, Acc, << $x >>, 0). + +langtag_extlang(<<>>, Acc, T, _) -> lists:reverse([T|Acc]); +langtag_extlang(<< $,, R/bits >>, Acc, T, _) -> langtag_list(R, [T|Acc]); +langtag_extlang(<< $\s, R/bits >>, Acc, T, _) -> langtag_list_sep(R, [T|Acc]); +langtag_extlang(<< $\t, R/bits >>, Acc, T, _) -> langtag_list_sep(R, [T|Acc]); +langtag_extlang(<< $-, A, B, C, D, E, F, G, H, R/bits >>, Acc, T, _) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G), ?IS_ALPHANUM(H) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G), ?LC(H) >>); +langtag_extlang(<< $-, A, B, C, D, E, F, G, R/bits >>, Acc, T, _) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G) >>); +langtag_extlang(<< $-, A, B, C, D, E, F, R/bits >>, Acc, T, _) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F) >>); +langtag_extlang(<< $-, A, B, C, D, E, R/bits >>, Acc, T, _) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), ?IS_ALPHANUM(E) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E) >>); +langtag_extlang(<< $-, A, B, C, D, R/bits >>, Acc, T, _) + when ?IS_ALPHA(A), ?IS_ALPHA(B), ?IS_ALPHA(C), ?IS_ALPHA(D) -> + langtag_region(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D) >>); +langtag_extlang(<< $-, A, B, C, R/bits >>, Acc, T, N) + when ?IS_ALPHA(A), ?IS_ALPHA(B), ?IS_ALPHA(C) -> + case N of + 2 -> langtag_script(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C) >>); + _ -> langtag_extlang(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C) >>, N + 1) + end; +langtag_extlang(R, Acc, T, _) -> langtag_region(R, Acc, T). + +langtag_script(<<>>, Acc, T) -> lists:reverse([T|Acc]); +langtag_script(<< $,, R/bits >>, Acc, T) -> langtag_list(R, [T|Acc]); +langtag_script(<< $\s, R/bits >>, Acc, T) -> langtag_list_sep(R, [T|Acc]); +langtag_script(<< $\t, R/bits >>, Acc, T) -> langtag_list_sep(R, [T|Acc]); +langtag_script(<< $-, A, B, C, D, E, F, G, H, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G), ?IS_ALPHANUM(H) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G), ?LC(H) >>); +langtag_script(<< $-, A, B, C, D, E, F, G, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G) >>); +langtag_script(<< $-, A, B, C, D, E, F, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F) >>); +langtag_script(<< $-, A, B, C, D, E, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), ?IS_ALPHANUM(E) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E) >>); +langtag_script(<< $-, A, B, C, D, R/bits >>, Acc, T) + when ?IS_ALPHA(A), ?IS_ALPHA(B), ?IS_ALPHA(C), ?IS_ALPHA(D) -> + langtag_region(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D) >>); +langtag_script(R, Acc, T) -> + langtag_region(R, Acc, T). + +langtag_region(<<>>, Acc, T) -> lists:reverse([T|Acc]); +langtag_region(<< $,, R/bits >>, Acc, T) -> langtag_list(R, [T|Acc]); +langtag_region(<< $\s, R/bits >>, Acc, T) -> langtag_list_sep(R, [T|Acc]); +langtag_region(<< $\t, R/bits >>, Acc, T) -> langtag_list_sep(R, [T|Acc]); +langtag_region(<< $-, A, B, C, D, E, F, G, H, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G), ?IS_ALPHANUM(H) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G), ?LC(H) >>); +langtag_region(<< $-, A, B, C, D, E, F, G, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G) >>); +langtag_region(<< $-, A, B, C, D, E, F, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F) >>); +langtag_region(<< $-, A, B, C, D, E, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), ?IS_ALPHANUM(E) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E) >>); +langtag_region(<< $-, A, B, C, D, R/bits >>, Acc, T) + when ?IS_DIGIT(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D) -> + langtag_variant(R, Acc, << T/binary, $-, A, ?LC(B), ?LC(C), ?LC(D) >>); +langtag_region(<< $-, A, B, R/bits >>, Acc, T) when ?IS_ALPHA(A), ?IS_ALPHA(B) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B) >>); +langtag_region(<< $-, A, B, C, R/bits >>, Acc, T) when ?IS_DIGIT(A), ?IS_DIGIT(B), ?IS_DIGIT(C) -> + langtag_variant(R, Acc, << T/binary, $-, A, B, C >>); +langtag_region(R, Acc, T) -> + langtag_variant(R, Acc, T). + +langtag_variant(<<>>, Acc, T) -> lists:reverse([T|Acc]); +langtag_variant(<< $,, R/bits >>, Acc, T) -> langtag_list(R, [T|Acc]); +langtag_variant(<< $\s, R/bits >>, Acc, T) -> langtag_list_sep(R, [T|Acc]); +langtag_variant(<< $\t, R/bits >>, Acc, T) -> langtag_list_sep(R, [T|Acc]); +langtag_variant(<< $-, A, B, C, D, E, F, G, H, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G), ?IS_ALPHANUM(H) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G), ?LC(H) >>); +langtag_variant(<< $-, A, B, C, D, E, F, G, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G) >>); +langtag_variant(<< $-, A, B, C, D, E, F, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F) >>); +langtag_variant(<< $-, A, B, C, D, E, R/bits >>, Acc, T) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), ?IS_ALPHANUM(E) -> + langtag_variant(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E) >>); +langtag_variant(<< $-, A, B, C, D, R/bits >>, Acc, T) + when ?IS_DIGIT(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D) -> + langtag_variant(R, Acc, << T/binary, $-, A, ?LC(B), ?LC(C), ?LC(D) >>); +langtag_variant(R, Acc, T) -> + langtag_extension(R, Acc, T). + +langtag_extension(<<>>, Acc, T) -> lists:reverse([T|Acc]); +langtag_extension(<< $,, R/bits >>, Acc, T) -> langtag_list(R, [T|Acc]); +langtag_extension(<< $\s, R/bits >>, Acc, T) -> langtag_list_sep(R, [T|Acc]); +langtag_extension(<< $\t, R/bits >>, Acc, T) -> langtag_list_sep(R, [T|Acc]); +langtag_extension(<< $-, X, R/bits >>, Acc, T) when X =:= $x; X =:= $X -> langtag_privateuse_sub(R, Acc, << T/binary, $-, $x >>, 0); +langtag_extension(<< $-, S, R/bits >>, Acc, T) when ?IS_ALPHANUM(S) -> langtag_extension_sub(R, Acc, << T/binary, $-, ?LC(S) >>, 0). + +langtag_extension_sub(<<>>, Acc, T, N) when N > 0 -> lists:reverse([T|Acc]); +langtag_extension_sub(<< $,, R/bits >>, Acc, T, N) when N > 0 -> langtag_list(R, [T|Acc]); +langtag_extension_sub(<< $\s, R/bits >>, Acc, T, N) when N > 0 -> langtag_list_sep(R, [T|Acc]); +langtag_extension_sub(<< $\t, R/bits >>, Acc, T, N) when N > 0 -> langtag_list_sep(R, [T|Acc]); +langtag_extension_sub(<< $-, A, B, C, D, E, F, G, H, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G), ?IS_ALPHANUM(H) -> + langtag_extension_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G), ?LC(H) >>, N + 1); +langtag_extension_sub(<< $-, A, B, C, D, E, F, G, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G) -> + langtag_extension_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G) >>, N + 1); +langtag_extension_sub(<< $-, A, B, C, D, E, F, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F) -> + langtag_extension_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F) >>, N + 1); +langtag_extension_sub(<< $-, A, B, C, D, E, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), ?IS_ALPHANUM(E) -> + langtag_extension_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E) >>, N + 1); +langtag_extension_sub(<< $-, A, B, C, D, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D) -> + langtag_extension_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D) >>, N + 1); +langtag_extension_sub(<< $-, A, B, C, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C) -> + langtag_extension_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C) >>, N + 1); +langtag_extension_sub(<< $-, A, B, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B) -> + langtag_extension_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B) >>, N + 1); +langtag_extension_sub(R, Acc, T, N) when N > 0 -> + langtag_extension(R, Acc, T). + +langtag_privateuse_sub(<<>>, Acc, T, N) when N > 0 -> lists:reverse([T|Acc]); +langtag_privateuse_sub(<< $,, R/bits >>, Acc, T, N) when N > 0 -> langtag_list(R, [T|Acc]); +langtag_privateuse_sub(<< $\s, R/bits >>, Acc, T, N) when N > 0 -> langtag_list_sep(R, [T|Acc]); +langtag_privateuse_sub(<< $\t, R/bits >>, Acc, T, N) when N > 0 -> langtag_list_sep(R, [T|Acc]); +langtag_privateuse_sub(<< $-, A, B, C, D, E, F, G, H, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G), ?IS_ALPHANUM(H) -> + langtag_privateuse_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G), ?LC(H) >>, N + 1); +langtag_privateuse_sub(<< $-, A, B, C, D, E, F, G, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F), ?IS_ALPHANUM(G) -> + langtag_privateuse_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F), ?LC(G) >>, N + 1); +langtag_privateuse_sub(<< $-, A, B, C, D, E, F, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), + ?IS_ALPHANUM(E), ?IS_ALPHANUM(F) -> + langtag_privateuse_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E), ?LC(F) >>, N + 1); +langtag_privateuse_sub(<< $-, A, B, C, D, E, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D), ?IS_ALPHANUM(E) -> + langtag_privateuse_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D), ?LC(E) >>, N + 1); +langtag_privateuse_sub(<< $-, A, B, C, D, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C), ?IS_ALPHANUM(D) -> + langtag_privateuse_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C), ?LC(D) >>, N + 1); +langtag_privateuse_sub(<< $-, A, B, C, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B), ?IS_ALPHANUM(C) -> + langtag_privateuse_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B), ?LC(C) >>, N + 1); +langtag_privateuse_sub(<< $-, A, B, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A), ?IS_ALPHANUM(B) -> + langtag_privateuse_sub(R, Acc, << T/binary, $-, ?LC(A), ?LC(B) >>, N + 1); +langtag_privateuse_sub(<< $-, A, R/bits >>, Acc, T, N) + when ?IS_ALPHANUM(A) -> + langtag_privateuse_sub(R, Acc, << T/binary, $-, ?LC(A) >>, N + 1). + +langtag_list_sep(<<>>, Acc) -> lists:reverse(Acc); +langtag_list_sep(<< $,, R/bits >>, Acc) -> langtag_list(R, Acc); +langtag_list_sep(<< $\s, R/bits >>, Acc) -> langtag_list_sep(R, Acc); +langtag_list_sep(<< $\t, R/bits >>, Acc) -> langtag_list_sep(R, Acc). + +-ifdef(TEST). +vector(Min, Max, Dom) -> ?LET(N, choose(Min, Max), vector(N, Dom)). +small_list(Dom) -> vector(0, 10, Dom). +small_non_empty_list(Dom) -> vector(1, 10, Dom). + +langtag_language() -> vector(2, 3, alpha()). +langtag_extlang() -> vector(0, 3, [$-, alpha(), alpha(), alpha()]). +langtag_script() -> oneof([[], [$-, alpha(), alpha(), alpha(), alpha()]]). +langtag_region() -> oneof([[], [$-, alpha(), alpha()], [$-, digit(), digit(), digit()]]). + +langtag_variant() -> + small_list(frequency([ + {4, [$-, vector(5, 8, alphanum())]}, + {1, [$-, digit(), alphanum(), alphanum(), alphanum()]} + ])). + +langtag_extension() -> + small_list([$-, ?SUCHTHAT(S, alphanum(), S =/= $x andalso S =/= $X), + small_non_empty_list([$-, vector(2, 8, alphanum())]) + ]). + +langtag_privateuse() -> oneof([[], [$-, langtag_privateuse_nodash()]]). +langtag_privateuse_nodash() -> [oneof([$x, $X]), small_non_empty_list([$-, vector(1, 8, alphanum())])]. +private_language_tag() -> ?LET(T, langtag_privateuse_nodash(), iolist_to_binary(T)). + +language_tag() -> + ?LET(IoList, + [langtag_language(), langtag_extlang(), langtag_script(), langtag_region(), + langtag_variant(), langtag_extension(), langtag_privateuse()], + iolist_to_binary(IoList)). + +content_language() -> + ?LET(L, + non_empty(list(frequency([ + {90, language_tag()}, + {10, private_language_tag()} + ]))), + begin + << _, ContentLanguage/binary >> = iolist_to_binary([[$,, T] || T <- L]), + {L, ContentLanguage} + end). + +prop_parse_content_language() -> + ?FORALL({L, ContentLanguage}, + content_language(), + begin + ResL = parse_content_language(ContentLanguage), + CheckedL = [?INLINE_LOWERCASE_BC(T) =:= ResT || {T, ResT} <- lists:zip(L, ResL)], + [true] =:= lists:usort(CheckedL) + end). + +parse_content_language_test_() -> + Tests = [ + {<<"de">>, [<<"de">>]}, + {<<"fr">>, [<<"fr">>]}, + {<<"ja">>, [<<"ja">>]}, + {<<"zh-Hant">>, [<<"zh-hant">>]}, + {<<"zh-Hans">>, [<<"zh-hans">>]}, + {<<"sr-Cyrl">>, [<<"sr-cyrl">>]}, + {<<"sr-Latn">>, [<<"sr-latn">>]}, + {<<"zh-cmn-Hans-CN">>, [<<"zh-cmn-hans-cn">>]}, + {<<"cmn-Hans-CN">>, [<<"cmn-hans-cn">>]}, + {<<"zh-yue-HK">>, [<<"zh-yue-hk">>]}, + {<<"yue-HK">>, [<<"yue-hk">>]}, + {<<"zh-Hans-CN">>, [<<"zh-hans-cn">>]}, + {<<"sr-Latn-RS">>, [<<"sr-latn-rs">>]}, + {<<"sl-rozaj">>, [<<"sl-rozaj">>]}, + {<<"sl-rozaj-biske">>, [<<"sl-rozaj-biske">>]}, + {<<"sl-nedis">>, [<<"sl-nedis">>]}, + {<<"de-CH-1901">>, [<<"de-ch-1901">>]}, + {<<"sl-IT-nedis">>, [<<"sl-it-nedis">>]}, + {<<"hy-Latn-IT-arevela">>, [<<"hy-latn-it-arevela">>]}, + {<<"de-DE">>, [<<"de-de">>]}, + {<<"en-US">>, [<<"en-us">>]}, + {<<"es-419">>, [<<"es-419">>]}, + {<<"de-CH-x-phonebk">>, [<<"de-ch-x-phonebk">>]}, + {<<"az-Arab-x-AZE-derbend">>, [<<"az-arab-x-aze-derbend">>]}, + {<<"x-whatever">>, [<<"x-whatever">>]}, + {<<"qaa-Qaaa-QM-x-southern">>, [<<"qaa-qaaa-qm-x-southern">>]}, + {<<"de-Qaaa">>, [<<"de-qaaa">>]}, + {<<"sr-Latn-QM">>, [<<"sr-latn-qm">>]}, + {<<"sr-Qaaa-RS">>, [<<"sr-qaaa-rs">>]}, + {<<"en-US-u-islamcal">>, [<<"en-us-u-islamcal">>]}, + {<<"zh-CN-a-myext-x-private">>, [<<"zh-cn-a-myext-x-private">>]}, + {<<"en-a-myext-b-another">>, [<<"en-a-myext-b-another">>]}, + {<<"mn-Cyrl-MN">>, [<<"mn-cyrl-mn">>]}, + {<<"MN-cYRL-mn">>, [<<"mn-cyrl-mn">>]}, + {<<"mN-cYrL-Mn">>, [<<"mn-cyrl-mn">>]}, + {<<"az-Arab-IR">>, [<<"az-arab-ir">>]}, + {<<"zh-gan">>, [<<"zh-gan">>]}, + {<<"zh-yue">>, [<<"zh-yue">>]}, + {<<"zh-cmn">>, [<<"zh-cmn">>]}, + {<<"de-AT">>, [<<"de-at">>]}, + {<<"de-CH-1996">>, [<<"de-ch-1996">>]}, + {<<"en-Latn-GB-boont-r-extended-sequence-x-private">>, + [<<"en-latn-gb-boont-r-extended-sequence-x-private">>]}, + {<<"el-x-koine">>, [<<"el-x-koine">>]}, + {<<"el-x-attic">>, [<<"el-x-attic">>]}, + {<<"fr, en-US, es-419, az-Arab, x-pig-latin, man-Nkoo-GN">>, + [<<"fr">>, <<"en-us">>, <<"es-419">>, <<"az-arab">>, <<"x-pig-latin">>, <<"man-nkoo-gn">>]}, + {<<"da">>, [<<"da">>]}, + {<<"mi, en">>, [<<"mi">>, <<"en">>]} + ], + [{V, fun() -> R = parse_content_language(V) end} || {V, R} <- Tests]. + +parse_content_language_error_test_() -> + Tests = [ + <<>> + ], + [{V, fun() -> {'EXIT', _} = (catch parse_content_language(V)) end} || V <- Tests]. +-endif. + +-ifdef(PERF). +horse_parse_content_language() -> + horse:repeat(100000, + parse_content_language(<<"fr, en-US, es-419, az-Arab, x-pig-latin, man-Nkoo-GN">>) + ). +-endif. + %% @doc Parse the Content-Length header. %% %% The value has at least one digit, and may be followed by whitespace. -- cgit v1.2.3