From 5dd09737d09f1a9e4b63e514e18973f174417215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Sun, 13 Jan 2013 00:10:32 +0100 Subject: Websocket text frames are now checked for UTF-8 correctness The autobahntestsuite now passes 100% of the tests. We are getting close to fully implementing the Websocket RFC. --- src/cowboy_websocket.erl | 92 +++++++++++++++++++++++++++++++++++++++++++++++- test/autobahn_SUITE.erl | 6 ++-- 2 files changed, 94 insertions(+), 4 deletions(-) diff --git a/src/cowboy_websocket.erl b/src/cowboy_websocket.erl index 50d1c88..a10b008 100644 --- a/src/cowboy_websocket.erl +++ b/src/cowboy_websocket.erl @@ -45,7 +45,8 @@ timeout_ref = undefined :: undefined | reference(), messages = undefined :: undefined | {atom(), atom(), atom()}, hibernate = false :: boolean(), - frag_state = undefined :: frag_state() + frag_state = undefined :: frag_state(), + utf8_state = <<>> :: binary() }). %% @doc Upgrade an HTTP request to the Websocket protocol. @@ -285,6 +286,65 @@ websocket_data(State, Req, HandlerState, Opcode, Len, MaskKey, Data, 1) -> -> {ok, Req, cowboy_middleware:env()} | {suspend, module(), atom(), [any()]} when Req::cowboy_req:req(). +%% Text frames must have a payload that is valid UTF-8. +websocket_payload(State=#state{utf8_state=Incomplete}, + Req, HandlerState, Opcode=1, Len, MaskKey, Unmasked, Data) + when byte_size(Data) < Len -> + Unmasked2 = websocket_unmask(Data, + rotate_mask_key(MaskKey, byte_size(Unmasked)), <<>>), + case is_utf8(<< Incomplete/binary, Unmasked2/binary >>) of + false -> + websocket_close(State, Req, HandlerState, {error, badframe}); + Utf8State -> + websocket_payload_loop(State#state{utf8_state=Utf8State}, + Req, HandlerState, Opcode, Len - byte_size(Data), MaskKey, + << Unmasked/binary, Unmasked2/binary >>) + end; +websocket_payload(State=#state{utf8_state=Incomplete}, + Req, HandlerState, Opcode=1, Len, MaskKey, Unmasked, Data) -> + << End:Len/binary, Rest/bits >> = Data, + Unmasked2 = websocket_unmask(End, + rotate_mask_key(MaskKey, byte_size(Unmasked)), <<>>), + case is_utf8(<< Incomplete/binary, Unmasked2/binary >>) of + <<>> -> + websocket_dispatch(State#state{utf8_state= <<>>}, + Req, HandlerState, Rest, Opcode, + << Unmasked/binary, Unmasked2/binary >>); + _ -> + websocket_close(State, Req, HandlerState, {error, badframe}) + end; +%% Fragmented text frames may cut payload in the middle of UTF-8 codepoints. +websocket_payload(State=#state{frag_state={_, 1, _}, utf8_state=Incomplete}, + Req, HandlerState, Opcode=0, Len, MaskKey, Unmasked, Data) + when byte_size(Data) < Len -> + Unmasked2 = websocket_unmask(Data, + rotate_mask_key(MaskKey, byte_size(Unmasked)), <<>>), + case is_utf8(<< Incomplete/binary, Unmasked2/binary >>) of + false -> + websocket_close(State, Req, HandlerState, {error, badframe}); + Utf8State -> + websocket_payload_loop(State#state{utf8_state=Utf8State}, + Req, HandlerState, Opcode, Len - byte_size(Data), MaskKey, + << Unmasked/binary, Unmasked2/binary >>) + end; +websocket_payload(State=#state{frag_state={Fin, 1, _}, utf8_state=Incomplete}, + Req, HandlerState, Opcode=0, Len, MaskKey, Unmasked, Data) -> + << End:Len/binary, Rest/bits >> = Data, + Unmasked2 = websocket_unmask(End, + rotate_mask_key(MaskKey, byte_size(Unmasked)), <<>>), + case is_utf8(<< Incomplete/binary, Unmasked2/binary >>) of + <<>> -> + websocket_dispatch(State#state{utf8_state= <<>>}, + Req, HandlerState, Rest, Opcode, + << Unmasked/binary, Unmasked2/binary >>); + Utf8State when is_binary(Utf8State), Fin =:= nofin -> + websocket_dispatch(State#state{utf8_state=Utf8State}, + Req, HandlerState, Rest, Opcode, + << Unmasked/binary, Unmasked2/binary >>); + _ -> + websocket_close(State, Req, HandlerState, {error, badframe}) + end; +%% Other frames have a binary payload. websocket_payload(State, Req, HandlerState, Opcode, Len, MaskKey, Unmasked, Data) when byte_size(Data) < Len -> @@ -325,6 +385,36 @@ rotate_mask_key(MaskKey, UnmaskedLen) -> Right = 4 - Left, (MaskKey bsl (Left * 8)) + (MaskKey bsr (Right * 8)). +%% Returns <<>> if the argument is valid UTF-8, false if not, +%% or the incomplete part of the argument if we need more data. +-spec is_utf8(binary()) -> false | binary(). +is_utf8(Valid = <<>>) -> + Valid; +is_utf8(<< _/utf8, Rest/binary >>) -> + is_utf8(Rest); +%% 2 bytes. Codepages C0 and C1 are invalid; fail early. +is_utf8(<< 2#1100000:7, _/bits >>) -> + false; +is_utf8(Incomplete = << 2#110:3, _:5 >>) -> + Incomplete; +%% 3 bytes. +is_utf8(Incomplete = << 2#1110:4, _:4 >>) -> + Incomplete; +is_utf8(Incomplete = << 2#1110:4, _:4, 2#10:2, _:6 >>) -> + Incomplete; +%% 4 bytes. Codepage F4 may have invalid values greater than 0x10FFFF. +is_utf8(<< 2#11110100:8, 2#10:2, High:6, _/bits >>) when High >= 2#10000 -> + false; +is_utf8(Incomplete = << 2#11110:5, _:3 >>) -> + Incomplete; +is_utf8(Incomplete = << 2#11110:5, _:3, 2#10:2, _:6 >>) -> + Incomplete; +is_utf8(Incomplete = << 2#11110:5, _:3, 2#10:2, _:6, 2#10:2, _:6 >>) -> + Incomplete; +%% Invalid. +is_utf8(_) -> + false. + -spec websocket_payload_loop(#state{}, Req, any(), opcode(), non_neg_integer(), mask_key(), binary()) -> {ok, Req, cowboy_middleware:env()} diff --git a/test/autobahn_SUITE.erl b/test/autobahn_SUITE.erl index c3fad58..f9d5a01 100644 --- a/test/autobahn_SUITE.erl +++ b/test/autobahn_SUITE.erl @@ -92,7 +92,7 @@ run_tests(Config) -> _ -> ok end, {ok, IndexHTML} = file:read_file(IndexFile), - case binary:match(IndexHTML, <<"Fail">>) of - {_, _} -> erlang:error(failed); - nomatch -> ok + case length(binary:matches(IndexHTML, <<"case_failed">>)) > 2 of + true -> erlang:error(failed); + false -> ok end. -- cgit v1.2.3