%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2008-2016. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
-module(bs_utf_SUITE).
-export([all/0, suite/0,
utf8_roundtrip/1,utf16_roundtrip/1,utf32_roundtrip/1,
utf8_illegal_sequences/1,utf16_illegal_sequences/1,
utf32_illegal_sequences/1,
bad_construction/1]).
-include_lib("common_test/include/ct.hrl").
-define(FAIL(Expr), fail_check(catch Expr, ??Expr, [])).
%% Suite-wide Common Test settings: install the ts_install_cth hook and
%% give every test case a six minute timetrap.
suite() ->
    Hooks = {ct_hooks, [ts_install_cth]},
    TimeLimit = {timetrap, {minutes, 6}},
    [Hooks, TimeLimit].
%% Lists the test cases of this suite in the order they are to be run.
all() ->
    Roundtrips = [utf8_roundtrip, utf16_roundtrip, utf32_roundtrip],
    Illegal = [utf8_illegal_sequences,
               utf16_illegal_sequences,
               utf32_illegal_sequences],
    Roundtrips ++ Illegal ++ [bad_construction].
%% Test case: round-trips every code point through UTF-8, skipping the
%% surrogate gap 16#D800..16#DFFF between the two ranges.
utf8_roundtrip(Config) when is_list(Config) ->
    Ranges = [{0, 16#D7FF}, {16#E000, 16#10FFFF}],
    lists:foreach(fun({First, Last}) ->
                          utf8_roundtrip(First, Last)
                  end, Ranges).
%% utf8_roundtrip(First, Last) -> ok
%%  Walks the integers First..Last (inclusive) one at a time. For each
%%  value, every way of constructing it with the utf8 segment type must
%%  produce the same bytes as int_to_utf8/1, and matching those bytes
%%  must give the value back.
utf8_roundtrip(First, Last) when First =< Last ->
%% Reference encoding from the hand-written helper.
Bin = int_to_utf8(First),
%% Plain construction must equal the reference.
Bin = id(<<First/utf8>>),
%% Construction as a segment following an (empty) binary segment.
Bin = id(<<(id(<<>>))/binary,First/utf8>>),
%% Construction at a bit offset of 2; after skipping the two leading
%% bits the payload must be the same bytes.
Unaligned = id(<<3:2,First/utf8>>),
<<_:2,Bin/binary>> = Unaligned,
%% First is already bound, so these matches succeed only if decoding
%% yields exactly First, for both the plain binary and the one
%% returned by make_unaligned/1.
<<First/utf8>> = Bin,
<<First/utf8>> = make_unaligned(Bin),
utf8_roundtrip(First+1, Last);
%% Range exhausted.
utf8_roundtrip(_, _) -> ok.
%% Test case: runs the big-endian and the little-endian UTF-16
%% round-trip in two monitored processes and requires both of them
%% to terminate with reason 'normal'.
utf16_roundtrip(Config) when is_list(Config) ->
    Roundtrips = [fun utf16_big_roundtrip/1, fun utf16_little_roundtrip/1],
    Workers = lists:map(
                fun(Roundtrip) ->
                        spawn_monitor(fun() ->
                                              do_utf16_roundtrip(Roundtrip)
                                      end)
                end, Roundtrips),
    %% Pid and MRef are bound by the fun head, so each receive waits
    %% for the 'DOWN' message of that particular worker.
    lists:foreach(
      fun({Pid, MRef}) ->
              receive
                  {'DOWN',MRef,process,Pid,Reason} -> normal = Reason
              end
      end, Workers).
%% do_utf16_roundtrip(Fun) -> ok
%%  Applies Fun to every code point outside the surrogate gap
%%  16#D800..16#DFFF.
do_utf16_roundtrip(Fun) ->
    Ranges = [{0, 16#D7FF}, {16#E000, 16#10FFFF}],
    lists:foreach(fun({First, Last}) ->
                          do_utf16_roundtrip(First, Last, Fun)
                  end, Ranges).

%% do_utf16_roundtrip(Char, Last, Fun) -> ok
%%  Applies Fun to Char, Char+1, ..., Last in ascending order. Does
%%  nothing when Char is already past Last.
do_utf16_roundtrip(Char, Last, _Fun) when Char > Last ->
    ok;
do_utf16_roundtrip(Char, Last, Fun) ->
    Fun(Char),
    do_utf16_roundtrip(Char + 1, Last, Fun).
%% utf16_big_roundtrip(Char) -> ok
%%  Encodes Char with the utf16 segment type (default endianness) in
%%  three different ways, requires them to agree, and then matches the
%%  result back to Char. Any disagreement crashes with badmatch.
utf16_big_roundtrip(Char) ->
%% Plain construction; this is the reference for the checks below.
Bin = id(<<Char/utf16>>),
%% Construction as a segment following an (empty) binary segment.
Bin = id(<<(id(<<>>))/binary,Char/utf16>>),
%% Construction at a bit offset of 2.
Unaligned = id(<<3:2,Char/utf16>>),
<<_:2,Bin/binary>> = Unaligned,
%% Char is already bound, so these succeed only if decoding gives
%% exactly Char.
<<Char/utf16>> = Bin,
<<Char/utf16>> = make_unaligned(Bin),
ok.
%% utf16_little_roundtrip(Char) -> ok
%%  Same checks as utf16_big_roundtrip/1, but every segment carries the
%%  'little' endianness flag.
utf16_little_roundtrip(Char) ->
%% Plain construction; this is the reference for the checks below.
Bin = id(<<Char/little-utf16>>),
%% Construction as a segment following an (empty) binary segment.
Bin = id(<<(id(<<>>))/binary,Char/little-utf16>>),
%% Construction at a bit offset of 2.
Unaligned = id(<<3:2,Char/little-utf16>>),
<<_:2,Bin/binary>> = Unaligned,
%% Char is already bound, so these succeed only if decoding gives
%% exactly Char.
<<Char/little-utf16>> = Bin,
<<Char/little-utf16>> = make_unaligned(Bin),
ok.
%% Test case: runs the big-endian and the little-endian UTF-32
%% round-trip in two monitored processes and requires both of them
%% to terminate with reason 'normal'.
utf32_roundtrip(Config) when is_list(Config) ->
    Roundtrips = [fun utf32_big_roundtrip/1, fun utf32_little_roundtrip/1],
    Workers = lists:map(
                fun(Roundtrip) ->
                        spawn_monitor(fun() ->
                                              do_utf32_roundtrip(Roundtrip)
                                      end)
                end, Roundtrips),
    %% Pid and MRef are bound by the fun head, so each receive waits
    %% for the 'DOWN' message of that particular worker.
    lists:foreach(
      fun({Pid, MRef}) ->
              receive
                  {'DOWN',MRef,process,Pid,Reason} -> normal = Reason
              end
      end, Workers).
%% do_utf32_roundtrip(Fun) -> ok
%%  Applies Fun to every code point outside the surrogate gap
%%  16#D800..16#DFFF.
do_utf32_roundtrip(Fun) ->
    Ranges = [{0, 16#D7FF}, {16#E000, 16#10FFFF}],
    lists:foreach(fun({First, Last}) ->
                          do_utf32_roundtrip(First, Last, Fun)
                  end, Ranges).

%% do_utf32_roundtrip(Char, Last, Fun) -> ok
%%  Applies Fun to Char, Char+1, ..., Last in ascending order. Does
%%  nothing when Char is already past Last.
do_utf32_roundtrip(Char, Last, _Fun) when Char > Last ->
    ok;
do_utf32_roundtrip(Char, Last, Fun) ->
    Fun(Char),
    do_utf32_roundtrip(Char + 1, Last, Fun).
%% utf32_big_roundtrip(Char) -> ok
%%  Encodes Char with the utf32 segment type (default endianness) in
%%  three different ways, requires them to agree, and then matches the
%%  result back to Char. Any disagreement crashes with badmatch.
utf32_big_roundtrip(Char) ->
%% Plain construction; this is the reference for the checks below.
Bin = id(<<Char/utf32>>),
%% Construction as a segment following an (empty) binary segment.
Bin = id(<<(id(<<>>))/binary,Char/utf32>>),
%% Construction at a bit offset of 2.
Unaligned = id(<<3:2,Char/utf32>>),
<<_:2,Bin/binary>> = Unaligned,
%% Char is already bound, so these succeed only if decoding gives
%% exactly Char.
<<Char/utf32>> = Bin,
<<Char/utf32>> = make_unaligned(Bin),
ok.
%% utf32_little_roundtrip(Char) -> ok
%%  Same checks as utf32_big_roundtrip/1, but every segment carries the
%%  'little' endianness flag.
utf32_little_roundtrip(Char) ->
%% Plain construction; this is the reference for the checks below.
Bin = id(<<Char/little-utf32>>),
%% Construction as a segment following an (empty) binary segment.
Bin = id(<<(id(<<>>))/binary,Char/little-utf32>>),
%% Construction at a bit offset of 2.
Unaligned = id(<<3:2,Char/little-utf32>>),
<<_:2,Bin/binary>> = Unaligned,
%% Char is already bound, so these succeed only if decoding gives
%% exactly Char.
<<Char/little-utf32>> = Bin,
<<Char/little-utf32>> = make_unaligned(Bin),
ok.
%% Test case: byte sequences that are not valid UTF-8 must be rejected
%% both when constructing and when matching.
utf8_illegal_sequences(Config) when is_list(Config) ->
    %% Values above the largest code point.
    fail_range(16#10FFFF+1, 16#10FFFF+512),
    %% The surrogate range, reserved for UTF-16.
    fail_range(16#D800, 16#DFFF),

    %% Bytes 16#80..16#BF are not allowed as the first byte.
    lists:foreach(fun(Lead) ->
                          fail(<<Lead,16#8F,16#8F,16#8F>>)
                  end, lists:seq(16#80, 16#BF)),

    %% Sequences that are cut short.
    short_sequences(16#80, 16#10FFFF),

    %% Overlong sequences: encodings that spend more bytes than the
    %% value needs are not allowed.
    Overlong = [{0, 127, 2},
                {128, 16#7FF, 3},
                {16#800, 16#FFFF, 4}],
    lists:foreach(fun({First, Last, NumBytes}) ->
                          overlong(First, Last, NumBytes)
                  end, Overlong),
    ok.
%% fail_range(Char, End) -> ok
%%  For every integer in Char..End: constructing it as utf8 must raise
%%  an exception, and the bytes that int_to_utf8/1 produces for it must
%%  be rejected by fail/1.
fail_range(Char, End) when Char > End ->
    ok;
fail_range(Char, End) ->
    {'EXIT',_} = (catch <<Char/utf8>>),
    fail(int_to_utf8(Char)),
    fail_range(Char + 1, End).
%% short_sequences(Char, End) -> ok
%%  Splits Char..End into chunks, one worker process per chunk (chunk
%%  size derived from the number of schedulers), and waits until every
%%  worker has exited with reason 'normal'.
short_sequences(Char, End) ->
    Schedulers = erlang:system_info(schedulers),
    Step = (End - Char) div Schedulers + 1,
    Workers = short_sequences_1(Char, Step, End),
    lists:foreach(
      fun({Pid, MRef}) ->
              receive
                  {'DOWN',MRef,process,Pid,Reason} -> normal = Reason
              end
      end, Workers).
%% short_sequences_1(Char, Step, End) -> [{Pid,MonitorRef}]
%%  Spawns one monitored worker per chunk of at most Step characters
%%  and returns the spawn_monitor/1 results in chunk order. Each worker
%%  prints its range and then checks it with do_short_sequences/2.
short_sequences_1(Char, _Step, End) when Char > End ->
    [];
short_sequences_1(Char, Step, End) ->
    %% The last chunk is clipped so that it does not run past End.
    CharEnd = min(Char + Step - 1, End),
    Worker = spawn_monitor(fun() ->
                                   io:format("~p - ~p\n", [Char,CharEnd]),
                                   do_short_sequences(Char, CharEnd)
                           end),
    [Worker | short_sequences_1(Char + Step, Step, End)].
%% do_short_sequences(Char, End) -> ok
%%  Runs short_sequence/1 on every integer in Char..End, ascending.
do_short_sequences(Char, End) when Char > End ->
    ok;
do_short_sequences(Char, End) ->
    short_sequence(Char),
    do_short_sequences(Char + 1, End).
%% short_sequence(I) -> ok
%%  Encodes I with int_to_utf8/1 and checks that every proper prefix of
%%  the encoding is rejected by fail/1, both on its own and when the
%%  byte 16#7F is put where the next byte of the sequence would be
%%  (optionally followed by copies of an original byte).
%%  There is no clause for a one-byte encoding; the visible caller
%%  starts at 16#80, so a multi-byte encoding is expected.
short_sequence(I) ->
case int_to_utf8(I) of
%% Four-byte encoding: S0/S1/S2 are the first three/two/one bytes
%% and R1 is the third byte.
<<S0:3/binary,_:8>> ->
<<S1:2/binary,R1:8>> = S0,
<<S2:1/binary,_:8>> = S1,
%% Truncated prefixes.
fail(S0),
fail(S1),
fail(S2),
%% Prefixes continued with 16#7F instead of the original byte,
%% padded with R1 up to four bytes in total.
fail(<<S2/binary,16#7F,R1,R1>>),
fail(<<S1/binary,16#7F,R1>>),
fail(<<S0/binary,16#7F>>);
%% Three-byte encoding: S0 is the first two bytes, S1 the first
%% byte and R1 the second byte.
<<S0:2/binary,_:8>> ->
<<S1:1/binary,R1:8>> = S0,
fail(S0),
fail(S1),
fail(<<S0/binary,16#7F>>),
fail(<<S1/binary,16#7F>>),
fail(<<S1/binary,16#7F,R1>>);
%% Two-byte encoding: S is the first byte.
<<S:1/binary,_:8>> ->
fail(S),
fail(<<S/binary,16#7F>>)
end.
%% overlong(Char, Last, NumBytes) -> ok
%%  Runs the overlong/2 check, starting at NumBytes bytes, for every
%%  integer in Char..Last.
overlong(Char, Last, _NumBytes) when Char > Last ->
    ok;
overlong(Char, Last, NumBytes) ->
    overlong(Char, NumBytes),
    overlong(Char + 1, Last, NumBytes).
%% overlong(Char, NumBytes) -> ok
%%  Builds the encoding of Char that uses NumBytes bytes and fails the
%%  test case if it can be matched as a utf8 character, then repeats
%%  with one more byte until NumBytes reaches 5.
overlong(_Char, NumBytes) when NumBytes >= 5 ->
    ok;
overlong(Char, NumBytes) ->
    Encoded = int_to_utf8(Char, NumBytes),
    case Encoded of
        <<Char/utf8>> ->
            %% Decoded to the very character that was encoded.
            ct:fail({illegal_encoding_accepted,Encoded,Char});
        <<OtherChar/utf8>> ->
            %% Decoded, but to some other character.
            ct:fail({illegal_encoding_accepted,Encoded,Char,OtherChar});
        _ ->
            ok
    end,
    overlong(Char, NumBytes + 1).
%% fail(Bin) -> ok
%%  Requires that Bin is rejected as a utf8 character, first as given
%%  and then in the form returned by make_unaligned/1.
fail(Encoded) ->
    fail_1(Encoded),
    Shifted = make_unaligned(Encoded),
    fail_1(Shifted).
%% fail_1(Bin) -> ok
%%  Fails the test case (through ct:fail/1) if the whole of Bin matches
%%  as exactly one utf8 character; returns ok for anything else.
fail_1(<<Char/utf8>>=Bin) ->
ct:fail({illegal_encoding_accepted,Bin,Char});
fail_1(_) -> ok.
%% Test case: values that have no UTF-16 encoding must be rejected on
%% construction, and surrogates in illegal positions must be rejected
%% on matching.
utf16_illegal_sequences(Config) when is_list(Config) ->
    TooLarge = {16#10FFFF+1, 16#10FFFF+512},
    Surrogates = {16#D800, 16#DFFF},         %Reserved for UTF-16.
    lists:foreach(fun({First, Last}) ->
                          utf16_fail_range(First, Last)
                  end, [TooLarge, Surrogates]),
    lonely_hi_surrogate(16#D800, 16#DFFF),
    leading_lo_surrogate(16#DC00, 16#DFFF),
    ok.
%% utf16_fail_range(Char, End) -> ok
%%  Requires that constructing each integer in Char..End as utf16
%%  raises an exception, for both byte orders.
utf16_fail_range(Char, End) when Char > End ->
    ok;
utf16_fail_range(Char, End) ->
    {'EXIT',_} = (catch <<Char/big-utf16>>),
    {'EXIT',_} = (catch <<Char/little-utf16>>),
    utf16_fail_range(Char + 1, End).
%% lonely_hi_surrogate(Char, End) -> ok
%%  For every integer in Char..End, writes it as a single 16-bit unit
%%  in both byte orders and fails the test case if that unit alone can
%%  be matched as a utf16 character.
lonely_hi_surrogate(Char, End) when Char > End ->
    ok;
lonely_hi_surrogate(Char, End) ->
    Encodings = {<<Char:16/big>>, <<Char:16/little>>},
    case Encodings of
        {<<Bad/big-utf16>>,_} ->
            ct:fail({lonely_hi_surrogate_accepted,Bad});
        {_,<<Bad/little-utf16>>} ->
            ct:fail({lonely_hi_surrogate_accepted,Bad});
        {_,_} ->
            ok
    end,
    lonely_hi_surrogate(Char + 1, End).
%% leading_lo_surrogate(Lead, LastLead) -> ok
%%  For every leading unit in Lead..LastLead, pairs it with every
%%  trailing unit in 16#D800..16#DFFF and requires that the pair is
%%  rejected (see leading_lo_surrogate/3).
leading_lo_surrogate(Lead, LastLead) when Lead > LastLead ->
    ok;
leading_lo_surrogate(Lead, LastLead) ->
    leading_lo_surrogate(Lead, 16#D800, 16#DFFF),
    leading_lo_surrogate(Lead + 1, LastLead).

%% leading_lo_surrogate(Lead, Trail, LastTrail) -> ok
%%  Writes Lead followed by each unit in Trail..LastTrail as two 16-bit
%%  units, in both byte orders, and fails the test case if a utf16
%%  character can be matched at the start of either binary.
leading_lo_surrogate(_Lead, Trail, LastTrail) when Trail > LastTrail ->
    ok;
leading_lo_surrogate(Lead, Trail, LastTrail) ->
    BigEndian = <<Lead:16/big,Trail:16/big>>,
    LittleEndian = <<Lead:16/little,Trail:16/little>>,
    case {BigEndian,LittleEndian} of
        {<<Bad/big-utf16,_/bits>>,_} ->
            ct:fail({leading_lo_surrogate_accepted,Bad});
        {_,<<Bad/little-utf16,_/bits>>} ->
            ct:fail({leading_lo_surrogate_accepted,Bad});
        {_,_} ->
            ok
    end,
    leading_lo_surrogate(Lead, Trail + 1, LastTrail).
%% Test case: values that are not code points must be rejected by the
%% utf32 segment type.
utf32_illegal_sequences(Config) when is_list(Config) ->
    TooLarge = {16#10FFFF+1, 16#10FFFF+512},
    Surrogates = {16#D800, 16#DFFF},         %Reserved for UTF-16.
    Negative = {-100, -1},
    lists:foreach(fun({First, Last}) ->
                          utf32_fail_range(First, Last)
                  end, [TooLarge, Surrogates, Negative]).
%% utf32_fail_range(Char, End) -> ok
%%  For every integer in Char..End: constructing it as utf32 must raise
%%  an exception in both byte orders, and the same value written as a
%%  plain 32-bit integer must not match as a utf32 character.
utf32_fail_range(Char, End) when Char > End ->
    ok;
utf32_fail_range(Char, End) ->
    {'EXIT',_} = (catch <<Char/big-utf32>>),
    {'EXIT',_} = (catch <<Char/little-utf32>>),
    Raw = {<<Char:32>>, <<Char:32/little>>},
    case Raw of
        {<<Unexpected/utf32>>,_} ->
            ct:fail(Unexpected);
        {_,<<Unexpected/little-utf32>>} ->
            ct:fail(Unexpected);
        {_,_} ->
            ok
    end,
    utf32_fail_range(Char + 1, End).
%% Test case: constructing a UTF segment from a value that is not a
%% valid code point must fail.
%% Each ?FAIL(Expr) hands the result of 'catch Expr' together with the
%% source text of Expr to fail_check/3, which requires a badarg both
%% from the compiled expression and from the same text evaluated with
%% erl_eval.
bad_construction(Config) when is_list(Config) ->
%% Floats.
?FAIL(<<3.14/utf8>>),
?FAIL(<<3.1415/utf16>>),
?FAIL(<<3.1415/utf32>>),
%% Negative integers, written as literals.
?FAIL(<<(-1)/utf8>>),
?FAIL(<<(-1)/utf16>>),
%% Negative integers passed through id/1 -- presumably so that the
%% compiler cannot see the constant; only an exception is required.
{'EXIT',_} = (catch <<(id(-1))/utf8>>),
{'EXIT',_} = (catch <<(id(-1))/utf16>>),
{'EXIT',_} = (catch <<(id(-1))/utf32>>),
%% A surrogate value.
?FAIL(<<16#D800/utf8>>),
?FAIL(<<16#D800/utf16>>),
?FAIL(<<16#D800/utf32>>),
ok.
%% int_to_utf8(I) -> Binary.
%%  Lays out the bits of I in the UTF-8 pattern, choosing a length of
%%  one to five bytes from the magnitude of I. No validity check is
%%  made: this function intentionally allows construction of UTF-8
%%  sequences in illegal ranges.
int_to_utf8(I) when I =< 16#7F ->
    <<I:8>>;
int_to_utf8(I) when I =< 16#7FF ->
    <<2#110:3, (I bsr 6):5,
      2#10:2, I:6>>;
int_to_utf8(I) when I =< 16#FFFF ->
    <<2#1110:4, (I bsr 12):4,
      2#10:2, (I bsr 6):6,
      2#10:2, I:6>>;
int_to_utf8(I) when I =< 16#3FFFFF ->
    <<2#11110:5, (I bsr 18):3,
      2#10:2, (I bsr 12):6,
      2#10:2, (I bsr 6):6,
      2#10:2, I:6>>;
int_to_utf8(I) when I =< 16#3FFFFFF ->
    <<2#111110:6, (I bsr 24):2,
      2#10:2, (I bsr 18):6,
      2#10:2, (I bsr 12):6,
      2#10:2, (I bsr 6):6,
      2#10:2, I:6>>.
%% int_to_utf8(I, NumberOfBytes) -> Binary.
%%  Lays out the bits of I in the UTF-8 pattern using exactly
%%  NumberOfBytes (1..4) bytes, whether or not I needs that many.
%%  This function can be used to construct overlong sequences.
int_to_utf8(I, 1) ->
    <<I:8>>;
int_to_utf8(I, 2) ->
    <<2#110:3, (I bsr 6):5,
      2#10:2, I:6>>;
int_to_utf8(I, 3) ->
    <<2#1110:4, (I bsr 12):4,
      2#10:2, (I bsr 6):6,
      2#10:2, I:6>>;
int_to_utf8(I, 4) ->
    <<2#11110:5, (I bsr 18):3,
      2#10:2, (I bsr 12):6,
      2#10:2, (I bsr 6):6,
      2#10:2, I:6>>.
%% make_unaligned(Binary) -> Binary'
%%  Returns a binary with the same bytes as the argument, obtained by
%%  embedding it at a bit offset of 3 inside a larger binary and
%%  matching it back out. Presumably the result is a sub binary that
%%  is not byte-aligned in the underlying data, which is what the
%%  callers want to exercise -- TODO confirm.
make_unaligned(Bin0) when is_binary(Bin0) ->
%% Three bits of padding in front and five behind keep the total size
%% a whole number of bytes.
Bin1 = <<0:3,Bin0/binary,31:5>>,
Sz = byte_size(Bin0),
%% The match also verifies the padding bits on both sides.
<<0:3,Bin:Sz/binary,31:5>> = id(Bin1),
Bin.
%% fail_check(CompiledResult, Str, Vars) -> ok
%%  Helper behind the ?FAIL macro.
%%    CompiledResult  what 'catch Expr' returned for the compiled
%%                    expression
%%    Str             the source text of the same expression
%%    Vars            bindings handed to the evaluator
%%  Passes only if the compiled expression raised badarg AND evaluating
%%  Str through evaluate/2 raises badarg as well. In every other case
%%  the offending result is printed and the test case is failed with
%%  ct:fail/1.
fail_check({'EXIT',{badarg,_}}, Str, Vars) ->
    %% Only evaluate/2 is protected by the try; ct:fail/1 in the 'of'
    %% branch is deliberately outside of it.
    try evaluate(Str, Vars) of
        Res ->
            io:format("Interpreted result: ~p~n", [Res]),
            ct:fail(did_not_fail_in_interpreted_code)
    catch
        error:badarg ->
            ok
    end;
fail_check(Res, _, _) ->
    io:format("Compiled result: ~p~n", [Res]),
    ct:fail(did_not_fail_in_compiled_code).
%% evaluate(Str, Vars) -> Result
%%  Scans and parses Str as one Erlang expression and evaluates it with
%%  erl_eval using the bindings Vars. Exceptions raised by the
%%  evaluated expression propagate to the caller.
evaluate(Str, Vars) ->
    %% Append an end-of-expression dot so the text parses.
    {ok, Tokens, _EndLocation} = erl_scan:string(Str ++ " . "),
    {ok, [Expr]} = erl_parse:parse_exprs(Tokens),
    {value, Result, _NewBindings} = erl_eval:expr(Expr, Vars),
    Result.
%% id(Term) -> Term
%%  Returns its argument unchanged.
id(Term) ->
    Term.