aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Loïc Hoguin <[email protected]> 2015-03-06 01:16:10 +0100
committer Loïc Hoguin <[email protected]> 2015-03-06 01:16:10 +0100
commit 7e4983b70ddf8cedb967e36fba6a600731bdad5d (patch)
tree 5ba776969f990c8e3a4d5219da7f1bf58b0148e6
parent d36e6538d207d4cf0e4affd755f250c75645e1a2 (diff)
downloadcowlib-7e4983b70ddf8cedb967e36fba6a600731bdad5d.tar.gz
cowlib-7e4983b70ddf8cedb967e36fba6a600731bdad5d.tar.bz2
cowlib-7e4983b70ddf8cedb967e36fba6a600731bdad5d.zip
Rewrite UTF-8 validation code
Use a version of the Flexible and Economical UTF-8 Decoder algorithm specifically optimized for Erlang (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/). The utf8_state() type is now exported, and it changed from a binary to an integer in the range 0..8.
-rw-r--r-- src/cow_ws.erl 81
1 files changed, 44 insertions, 37 deletions
diff --git a/src/cow_ws.erl b/src/cow_ws.erl
index 5607646..0858b68 100644
--- a/src/cow_ws.erl
+++ b/src/cow_ws.erl
@@ -34,11 +34,13 @@
| {fragment, fin | nofin, text | binary, iodata()}.
-export_type([frame/0]).
+-type utf8_state() :: 0..8.
+-export_type([utf8_state/0]).
+
-type extensions() :: map().
-type frame_type() :: fragment | text | binary | close | ping | pong.
-type mask_key() :: undefined | 0..16#ffffffff.
-type rsv() :: <<_:3>>.
--type utf8_state() :: <<>> | <<_:8>> | <<_:16>> | <<_:24>>.
%% @doc Negotiate the permessage-deflate extension.
@@ -260,8 +262,8 @@ parse_payload(Data, MaskKey, Utf8State, ParsedLen, Type, Len, FragState,
Payload = inflate_frame(unmask(Data2, MaskKey, ParsedLen), Inflate, TakeOver, FragState, Eof),
validate_payload(Payload, Rest, Utf8State, ParsedLen, Type, FragState, Eof);
%% Empty frame.
-parse_payload(Data, _, <<>>, 0, _, 0, _, _, _) ->
- {ok, <<>>, <<>>, Data};
+parse_payload(Data, _, Utf8State = 0, 0, _, 0, _, _, _) ->
+ {ok, <<>>, Utf8State, Data};
%% Start of close frame.
parse_payload(Data, MaskKey, Utf8State, 0, Type = close, Len, FragState, _, << 0:3 >>) ->
{<< MaskedCode:2/binary, Data2/bits >>, Rest, Eof} = split_payload(Data, Len),
@@ -346,16 +348,16 @@ inflate_frame(Data, Inflate, _T, _F, _E) ->
%% Text frames and close control frames MUST have a payload that is valid UTF-8.
validate_payload(Payload, Rest, Utf8State, _, Type, _, Eof) when Type =:= text; Type =:= close ->
- case validate_utf8(<< Utf8State/binary, Payload/binary >>) of
- false -> {error, badencoding};
+ case validate_utf8(Payload, Utf8State) of
+ 1 -> {error, badencoding};
Utf8State2 when not Eof -> {more, Payload, Utf8State2};
- <<>> when Eof -> {ok, Payload, <<>>, Rest};
+ 0 when Eof -> {ok, Payload, 0, Rest};
_ -> {error, badencoding}
end;
validate_payload(Payload, Rest, Utf8State, _, fragment, {Fin, text, _}, Eof) ->
- case validate_utf8(<< Utf8State/binary, Payload/binary >>) of
- false -> {error, badencoding};
- <<>> when Eof -> {ok, Payload, <<>>, Rest};
+ case validate_utf8(Payload, Utf8State) of
+ 1 -> {error, badencoding};
+ 0 when Eof -> {ok, Payload, 0, Rest};
Utf8State2 when Eof, Fin =:= nofin -> {ok, Payload, Utf8State2, Rest};
Utf8State2 when not Eof -> {more, Payload, Utf8State2};
_ -> {error, badencoding}
@@ -365,34 +367,39 @@ validate_payload(Payload, _, Utf8State, _, _, _, false) ->
validate_payload(Payload, Rest, Utf8State, _, _, _, true) ->
{ok, Payload, Utf8State, Rest}.
-%% Returns <<>> if the argument is valid UTF-8, false if not,
-%% or the incomplete part of the argument if we need more data.
-validate_utf8(<<>>) ->
- <<>>;
-validate_utf8(<< _/utf8, Rest/bits >>) ->
- validate_utf8(Rest);
-%% 2 bytes. Codepages C0 and C1 are invalid; fail early.
-validate_utf8(<< 2#1100000:7, _/bits >>) ->
- false;
-validate_utf8(Incomplete = << 2#110:3, _:5 >>) ->
- Incomplete;
-%% 3 bytes.
-validate_utf8(Incomplete = << 2#1110:4, _:4 >>) ->
- Incomplete;
-validate_utf8(Incomplete = << 2#1110:4, _:4, 2#10:2, _:6 >>) ->
- Incomplete;
-%% 4 bytes. Codepage F4 may have invalid values greater than 0x10FFFF.
-validate_utf8(<< 2#11110100:8, 2#10:2, High:6, _/bits >>) when High >= 2#10000 ->
- false;
-validate_utf8(Incomplete = << 2#11110:5, _:3 >>) ->
- Incomplete;
-validate_utf8(Incomplete = << 2#11110:5, _:3, 2#10:2, _:6 >>) ->
- Incomplete;
-validate_utf8(Incomplete = << 2#11110:5, _:3, 2#10:2, _:6, 2#10:2, _:6 >>) ->
- Incomplete;
-%% Invalid.
-validate_utf8(_) ->
- false.
+%% Byte-at-a-time UTF-8 validator based on the Flexible and Economical UTF-8 Decoder algorithm by
+%% Bjoern Hoehrmann <[email protected]> (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/).
+%%
+%% The original algorithm has been unrolled into one clause per combination of byte value C and
+%% State; clauses sharing the same outcome were then grouped together.
+%%
+%% Returns the state reached once the input is exhausted: 0 on success, 1 on error, and 2..8 on incomplete data. A 2..8 result is meant to be passed back as State with the next chunk.
+validate_utf8(<<>>, State) -> State; %% Input exhausted: report the state reached so far.
+validate_utf8(<< C, Rest/bits >>, 0) when C < 128 -> validate_utf8(Rest, 0); %% State 0 (no sequence open): ASCII byte.
+validate_utf8(<< C, Rest/bits >>, 2) when C >= 128, C < 144 -> validate_utf8(Rest, 0); %% Bytes 128..143 (16#80..16#8F): accepted in states 2, 3, 5, 7 and 8.
+validate_utf8(<< C, Rest/bits >>, 3) when C >= 128, C < 144 -> validate_utf8(Rest, 2);
+validate_utf8(<< C, Rest/bits >>, 5) when C >= 128, C < 144 -> validate_utf8(Rest, 2);
+validate_utf8(<< C, Rest/bits >>, 7) when C >= 128, C < 144 -> validate_utf8(Rest, 3);
+validate_utf8(<< C, Rest/bits >>, 8) when C >= 128, C < 144 -> validate_utf8(Rest, 3);
+validate_utf8(<< C, Rest/bits >>, 2) when C >= 144, C < 160 -> validate_utf8(Rest, 0); %% Bytes 144..159 (16#90..16#9F): accepted in states 2, 3, 5, 6 and 7.
+validate_utf8(<< C, Rest/bits >>, 3) when C >= 144, C < 160 -> validate_utf8(Rest, 2);
+validate_utf8(<< C, Rest/bits >>, 5) when C >= 144, C < 160 -> validate_utf8(Rest, 2);
+validate_utf8(<< C, Rest/bits >>, 6) when C >= 144, C < 160 -> validate_utf8(Rest, 3);
+validate_utf8(<< C, Rest/bits >>, 7) when C >= 144, C < 160 -> validate_utf8(Rest, 3);
+validate_utf8(<< C, Rest/bits >>, 2) when C >= 160, C < 192 -> validate_utf8(Rest, 0); %% Bytes 160..191 (16#A0..16#BF): accepted in states 2, 3, 4, 6 and 7.
+validate_utf8(<< C, Rest/bits >>, 3) when C >= 160, C < 192 -> validate_utf8(Rest, 2);
+validate_utf8(<< C, Rest/bits >>, 4) when C >= 160, C < 192 -> validate_utf8(Rest, 2);
+validate_utf8(<< C, Rest/bits >>, 6) when C >= 160, C < 192 -> validate_utf8(Rest, 3);
+validate_utf8(<< C, Rest/bits >>, 7) when C >= 160, C < 192 -> validate_utf8(Rest, 3);
+validate_utf8(<< C, Rest/bits >>, 0) when C >= 194, C < 224 -> validate_utf8(Rest, 2); %% 2-byte lead (194..223); 192 and 193 match no clause and are rejected.
+validate_utf8(<< 224, Rest/bits >>, 0) -> validate_utf8(Rest, 4); %% 224 (16#E0): state 4 only accepts 160..191 next.
+validate_utf8(<< C, Rest/bits >>, 0) when C >= 225, C < 237 -> validate_utf8(Rest, 3); %% 3-byte lead: state 3 expects two more bytes in 128..191.
+validate_utf8(<< 237, Rest/bits >>, 0) -> validate_utf8(Rest, 5); %% 237 (16#ED): state 5 only accepts 128..159 next.
+validate_utf8(<< C, Rest/bits >>, 0) when C =:= 238; C =:= 239 -> validate_utf8(Rest, 3);
+validate_utf8(<< 240, Rest/bits >>, 0) -> validate_utf8(Rest, 6); %% 240 (16#F0): state 6 only accepts 144..191 next.
+validate_utf8(<< C, Rest/bits >>, 0) when C =:= 241; C =:= 242; C =:= 243 -> validate_utf8(Rest, 7); %% 4-byte lead: state 7 accepts any byte in 128..191 next.
+validate_utf8(<< 244, Rest/bits >>, 0) -> validate_utf8(Rest, 8); %% 244 (16#F4): state 8 only accepts 128..143 next.
+validate_utf8(_, _) -> 1. %% No clause matched this input/state pair: invalid.
%% @doc Return a frame tuple from parsed state and data.