From 7e4983b70ddf8cedb967e36fba6a600731bdad5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Fri, 6 Mar 2015 01:16:10 +0100 Subject: Rewrite UTF-8 validation code Use a version of the Flexible and Economical UTF-8 Decoder algorithm specifically optimized for Erlang. http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ The utf8_state() type is now exported and changed from a binary to an integer from 0..8. --- src/cow_ws.erl | 81 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 37 deletions(-) (limited to 'src/cow_ws.erl') diff --git a/src/cow_ws.erl b/src/cow_ws.erl index 5607646..0858b68 100644 --- a/src/cow_ws.erl +++ b/src/cow_ws.erl @@ -34,11 +34,13 @@ | {fragment, fin | nofin, text | binary, iodata()}. -export_type([frame/0]). +-type utf8_state() :: 0..8. +-export_type([utf8_state/0]). + -type extensions() :: map(). -type frame_type() :: fragment | text | binary | close | ping | pong. -type mask_key() :: undefined | 0..16#ffffffff. -type rsv() :: <<_:3>>. --type utf8_state() :: <<>> | <<_:8>> | <<_:16>> | <<_:24>>. %% @doc Negotiate the permessage-deflate extension. @@ -260,8 +262,8 @@ parse_payload(Data, MaskKey, Utf8State, ParsedLen, Type, Len, FragState, Payload = inflate_frame(unmask(Data2, MaskKey, ParsedLen), Inflate, TakeOver, FragState, Eof), validate_payload(Payload, Rest, Utf8State, ParsedLen, Type, FragState, Eof); %% Empty frame. -parse_payload(Data, _, <<>>, 0, _, 0, _, _, _) -> - {ok, <<>>, <<>>, Data}; +parse_payload(Data, _, Utf8State = 0, 0, _, 0, _, _, _) -> + {ok, <<>>, Utf8State, Data}; %% Start of close frame. parse_payload(Data, MaskKey, Utf8State, 0, Type = close, Len, FragState, _, << 0:3 >>) -> {<< MaskedCode:2/binary, Data2/bits >>, Rest, Eof} = split_payload(Data, Len), @@ -346,16 +348,16 @@ inflate_frame(Data, Inflate, _T, _F, _E) -> %% Text frames and close control frames MUST have a payload that is valid UTF-8. validate_payload(Payload, Rest, Utf8State, _, Type, _, Eof) when Type =:= text; Type =:= close -> - case validate_utf8(<< Utf8State/binary, Payload/binary >>) of - false -> {error, badencoding}; + case validate_utf8(Payload, Utf8State) of + 1 -> {error, badencoding}; Utf8State2 when not Eof -> {more, Payload, Utf8State2}; - <<>> when Eof -> {ok, Payload, <<>>, Rest}; + 0 when Eof -> {ok, Payload, 0, Rest}; _ -> {error, badencoding} end; validate_payload(Payload, Rest, Utf8State, _, fragment, {Fin, text, _}, Eof) -> - case validate_utf8(<< Utf8State/binary, Payload/binary >>) of - false -> {error, badencoding}; - <<>> when Eof -> {ok, Payload, <<>>, Rest}; + case validate_utf8(Payload, Utf8State) of + 1 -> {error, badencoding}; + 0 when Eof -> {ok, Payload, 0, Rest}; Utf8State2 when Eof, Fin =:= nofin -> {ok, Payload, Utf8State2, Rest}; Utf8State2 when not Eof -> {more, Payload, Utf8State2}; _ -> {error, badencoding} @@ -365,34 +367,39 @@ validate_payload(Payload, _, Utf8State, _, _, _, false) -> validate_payload(Payload, Rest, Utf8State, _, _, _, true) -> {ok, Payload, Utf8State, Rest}. -%% Returns <<>> if the argument is valid UTF-8, false if not, -%% or the incomplete part of the argument if we need more data. -validate_utf8(<<>>) -> - <<>>; -validate_utf8(<< _/utf8, Rest/bits >>) -> - validate_utf8(Rest); -%% 2 bytes. Codepages C0 and C1 are invalid; fail early. -validate_utf8(<< 2#1100000:7, _/bits >>) -> - false; -validate_utf8(Incomplete = << 2#110:3, _:5 >>) -> - Incomplete; -%% 3 bytes. -validate_utf8(Incomplete = << 2#1110:4, _:4 >>) -> - Incomplete; -validate_utf8(Incomplete = << 2#1110:4, _:4, 2#10:2, _:6 >>) -> - Incomplete; -%% 4 bytes. Codepage F4 may have invalid values greater than 0x10FFFF. -validate_utf8(<< 2#11110100:8, 2#10:2, High:6, _/bits >>) when High >= 2#10000 -> - false; -validate_utf8(Incomplete = << 2#11110:5, _:3 >>) -> - Incomplete; -validate_utf8(Incomplete = << 2#11110:5, _:3, 2#10:2, _:6 >>) -> - Incomplete; -validate_utf8(Incomplete = << 2#11110:5, _:3, 2#10:2, _:6, 2#10:2, _:6 >>) -> - Incomplete; -%% Invalid. -validate_utf8(_) -> - false. +%% Based on the Flexible and Economical UTF-8 Decoder algorithm by +%% Bjoern Hoehrmann (http://bjoern.hoehrmann.de/utf-8/decoder/dfa/). +%% +%% The original algorithm has been unrolled into all combinations of values for C and State +%% each with a clause. The common clauses were then grouped together. +%% +%% This function returns 0 on success, 1 on error, and 2..8 on incomplete data. +validate_utf8(<<>>, State) -> State; +validate_utf8(<< C, Rest/bits >>, 0) when C < 128 -> validate_utf8(Rest, 0); +validate_utf8(<< C, Rest/bits >>, 2) when C >= 128, C < 144 -> validate_utf8(Rest, 0); +validate_utf8(<< C, Rest/bits >>, 3) when C >= 128, C < 144 -> validate_utf8(Rest, 2); +validate_utf8(<< C, Rest/bits >>, 5) when C >= 128, C < 144 -> validate_utf8(Rest, 2); +validate_utf8(<< C, Rest/bits >>, 7) when C >= 128, C < 144 -> validate_utf8(Rest, 3); +validate_utf8(<< C, Rest/bits >>, 8) when C >= 128, C < 144 -> validate_utf8(Rest, 3); +validate_utf8(<< C, Rest/bits >>, 2) when C >= 144, C < 160 -> validate_utf8(Rest, 0); +validate_utf8(<< C, Rest/bits >>, 3) when C >= 144, C < 160 -> validate_utf8(Rest, 2); +validate_utf8(<< C, Rest/bits >>, 5) when C >= 144, C < 160 -> validate_utf8(Rest, 2); +validate_utf8(<< C, Rest/bits >>, 6) when C >= 144, C < 160 -> validate_utf8(Rest, 3); +validate_utf8(<< C, Rest/bits >>, 7) when C >= 144, C < 160 -> validate_utf8(Rest, 3); +validate_utf8(<< C, Rest/bits >>, 2) when C >= 160, C < 192 -> validate_utf8(Rest, 0); +validate_utf8(<< C, Rest/bits >>, 3) when C >= 160, C < 192 -> validate_utf8(Rest, 2); +validate_utf8(<< C, Rest/bits >>, 4) when C >= 160, C < 192 -> validate_utf8(Rest, 2); +validate_utf8(<< C, Rest/bits >>, 6) when C >= 160, C < 192 -> validate_utf8(Rest, 3); +validate_utf8(<< C, Rest/bits >>, 7) when C >= 160, C < 192 -> validate_utf8(Rest, 3); +validate_utf8(<< C, Rest/bits >>, 0) when C >= 194, C < 224 -> validate_utf8(Rest, 2); +validate_utf8(<< 224, Rest/bits >>, 0) -> validate_utf8(Rest, 4); +validate_utf8(<< C, Rest/bits >>, 0) when C >= 225, C < 237 -> validate_utf8(Rest, 3); +validate_utf8(<< 237, Rest/bits >>, 0) -> validate_utf8(Rest, 5); +validate_utf8(<< C, Rest/bits >>, 0) when C =:= 238; C =:= 239 -> validate_utf8(Rest, 3); +validate_utf8(<< 240, Rest/bits >>, 0) -> validate_utf8(Rest, 6); +validate_utf8(<< C, Rest/bits >>, 0) when C =:= 241; C =:= 242; C =:= 243 -> validate_utf8(Rest, 7); +validate_utf8(<< 244, Rest/bits >>, 0) -> validate_utf8(Rest, 8); +validate_utf8(_, _) -> 1. %% @doc Return a frame tuple from parsed state and data. -- cgit v1.2.3