diff options
-rw-r--r-- | src/cow_ws.erl | 184 |
1 files changed, 154 insertions, 30 deletions
diff --git a/src/cow_ws.erl b/src/cow_ws.erl index 27c7c87..d5c40d0 100644 --- a/src/cow_ws.erl +++ b/src/cow_ws.erl @@ -72,6 +72,8 @@ -type utf8_state() :: 0..8 | undefined. -export_type([utf8_state/0]). +-compile({inline, [utf8_class/0]}). + %% @doc Generate a key for the Websocket handshake request. -spec key() -> binary(). @@ -559,14 +561,14 @@ validate_payload(Payload, Rest, undefined, _, _, _, true) -> {ok, Payload, undefined, Rest}; %% Text frames and close control frames MUST have a payload that is valid UTF-8. validate_payload(Payload, Rest, Utf8State, _, Type, _, Eof) when Type =:= text; Type =:= close -> - case validate_utf8(Payload, Utf8State) of + case validate_text(Payload, Utf8State) of 1 -> {error, badencoding}; Utf8State2 when not Eof -> {more, Payload, Utf8State2}; 0 when Eof -> {ok, Payload, 0, Rest}; _ -> {error, badencoding} end; validate_payload(Payload, Rest, Utf8State, _, fragment, {Fin, text, _}, Eof) -> - case validate_utf8(Payload, Utf8State) of + case validate_text(Payload, Utf8State) of 1 -> {error, badencoding}; 0 when Eof -> {ok, Payload, 0, Rest}; Utf8State2 when Eof, Fin =:= nofin -> {ok, Payload, Utf8State2, Rest}; @@ -581,36 +583,158 @@ validate_payload(Payload, Rest, Utf8State, _, _, _, true) -> %% Based on the Flexible and Economical UTF-8 Decoder algorithm by %% Bjoern Hoehrmann <[email protected]> (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/). %% -%% The original algorithm has been unrolled into all combinations of values for C and State -%% each with a clause. The common clauses were then grouped together. +%% The original algorithm has been reworked to better adapt to +%% the current Erlang VM (at the time of writing). +%% +%% We keep the character class table to quickly find which class +%% a character is. The transition table was removed in favor of +%% a separate Erlang function per state as that proved more +%% efficient. +%% +%% We store the character class table in a tuple returned by +%% an inline function. 
+%% +%% We handle ASCII characters specially because when ASCII +%% characters are present we are highly likely to have mostly +%% or only ASCII characters. We process them 4 at a time when +%% possible. +%% +%% When a non-ASCII character is encountered, we switch to +%% the UTF-8 decoder. When in the UTF-8 decoder we have to +%% process characters one at a time. When we are in the UTF-8 +%% decoder we expect there to be additional UTF-8 characters +%% so we check for them instead of reverting back to ASCII +%% every time. This greatly speeds up decoding of Japanese +%% and other non-ASCII text. +%% +%% Our UTF-8 decoder functions consist of looking up the +%% character class of the current byte and then using a +%% case clause to determine which state we are switching to. +%% +%% We order clauses based on the likelihood of the character class. +%% Order is determined by the number of occurrences of the class in +%% the table. The order (and number of occurrences) is as follows: +%% 7 (32), 2 (30), 1 and 9 (16), 3 (14), 8 (13), 6 (3), 4, 5, 10 and 11. %% %% This function returns 0 on success, 1 on error, and 2..8 on incomplete data. 
%% Validate a chunk of text as UTF-8, resuming from a decoder state.
%%
%% The state is 0 at a character boundary, or 2..8 when the previous
%% chunk ended in the middle of a multi-byte sequence. The function
%% returns 0 when the chunk ends on a character boundary, 1 when the
%% data is not valid UTF-8, and 2..8 when more data is needed. To
%% stream parse a large text, pass the returned 2..8 state back in
%% along with the next chunk.

validate_text(Text, 0) -> validate_ascii(Text);
validate_text(Text, 2) -> validate_s2(Text);
validate_text(Text, 3) -> validate_s3(Text);
validate_text(Text, 4) -> validate_s4(Text);
validate_text(Text, 5) -> validate_s5(Text);
validate_text(Text, 6) -> validate_s6(Text);
validate_text(Text, 7) -> validate_s7(Text);
validate_text(Text, 8) -> validate_s8(Text).

%% ASCII fast path: skip 4 bytes at a time while they are all ASCII,
%% then one at a time, and hand over to the UTF-8 decoder as soon as
%% a byte >= 128 is found.
validate_ascii(<<>>) -> 0;
validate_ascii(<<C1,C2,C3,C4,R/bits>>) when C1 < 128, C2 < 128, C3 < 128, C4 < 128 -> validate_ascii(R);
validate_ascii(<<C1,R/bits>>) when C1 < 128 -> validate_ascii(R);
validate_ascii(Text) -> validate_s0(Text).

%% State 0: at a character boundary inside the UTF-8 decoder.
%%
%% Instead of switching back to ASCII right away we first try to
%% decode another multi-byte character, which greatly speeds up
%% decoding of Japanese and other non-ASCII text. The lead byte
%% selects the next state; classes that cannot start a character
%% (continuation bytes, 16#C0, 16#C1, 16#F5..16#FF) are errors.
validate_s0(<<C,R/bits>>) when C >= 128 ->
    case element(C - 127, utf8_class()) of
        2 -> validate_s2(R);
        3 -> validate_s3(R);
        6 -> validate_s7(R);
        4 -> validate_s5(R);
        5 -> validate_s8(R);
        10 -> validate_s4(R);
        11 -> validate_s6(R);
        _ -> 1
    end;
validate_s0(Text) ->
    validate_ascii(Text).

%% The continuation states below all share the same shape:
%%
%%  - The C >= 128 guard is required because utf8_class() only covers
%%    bytes 128..255. Without it an ASCII byte in continuation position
%%    would call element/2 with a non-positive index and crash with
%%    badarg instead of reporting invalid UTF-8.
%%  - An empty binary means the sequence is incomplete: return the
%%    state number so that the caller can resume with more data.
%%  - Anything else (an ASCII byte where a continuation byte is
%%    required) is invalid UTF-8.

%% State 2: one continuation byte (16#80..16#BF) left.
validate_s2(<<C,R/bits>>) when C >= 128 ->
    case element(C - 127, utf8_class()) of
        7 -> validate_s0(R);
        1 -> validate_s0(R);
        9 -> validate_s0(R);
        _ -> 1
    end;
validate_s2(<<>>) ->
    2;
validate_s2(_) ->
    1.

%% State 3: two continuation bytes (16#80..16#BF) left.
validate_s3(<<C,R/bits>>) when C >= 128 ->
    case element(C - 127, utf8_class()) of
        7 -> validate_s2(R);
        1 -> validate_s2(R);
        9 -> validate_s2(R);
        _ -> 1
    end;
validate_s3(<<>>) ->
    3;
validate_s3(_) ->
    1.

%% State 4: after 16#E0. Only 16#A0..16#BF may follow; lower
%% continuation bytes would be an overlong encoding.
validate_s4(<<C,R/bits>>) when C >= 128 ->
    case element(C - 127, utf8_class()) of
        7 -> validate_s2(R);
        _ -> 1
    end;
validate_s4(<<>>) ->
    4;
validate_s4(_) ->
    1.

%% State 5: after 16#ED. Only 16#80..16#9F may follow; higher
%% continuation bytes would encode a UTF-16 surrogate.
validate_s5(<<C,R/bits>>) when C >= 128 ->
    case element(C - 127, utf8_class()) of
        1 -> validate_s2(R);
        9 -> validate_s2(R);
        _ -> 1
    end;
validate_s5(<<>>) ->
    5;
validate_s5(_) ->
    1.

%% State 6: after 16#F0. Only 16#90..16#BF may follow; lower
%% continuation bytes would be an overlong encoding.
validate_s6(<<C,R/bits>>) when C >= 128 ->
    case element(C - 127, utf8_class()) of
        7 -> validate_s3(R);
        9 -> validate_s3(R);
        _ -> 1
    end;
validate_s6(<<>>) ->
    6;
validate_s6(_) ->
    1.

%% State 7: after 16#F1..16#F3. Any continuation byte may follow,
%% with two more expected after it.
validate_s7(<<C,R/bits>>) when C >= 128 ->
    case element(C - 127, utf8_class()) of
        7 -> validate_s3(R);
        1 -> validate_s3(R);
        9 -> validate_s3(R);
        _ -> 1
    end;
validate_s7(<<>>) ->
    7;
validate_s7(_) ->
    1.

%% State 8: after 16#F4. Only 16#80..16#8F may follow; anything
%% higher would be above U+10FFFF.
validate_s8(<<C,R/bits>>) when C >= 128 ->
    case element(C - 127, utf8_class()) of
        1 -> validate_s3(R);
        _ -> 1
    end;
validate_s8(<<>>) ->
    8;
validate_s8(_) ->
    1.

%% Character class of every byte from 128 to 255, one row per 16 bytes.
%% The class of byte C is element(C - 127, utf8_class()).
utf8_class() ->
    {
        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
        9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
        7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
        8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
        10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,
        11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
    }.

%% @doc Return a frame tuple from parsed state and data.