%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2017-2018. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%% http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
-module(unicode_util_SUITE).
-include_lib("common_test/include/ct.hrl").
-export([all/0, suite/0, extra/1,
uppercase/1, lowercase/1, titlecase/1, casefold/1,
cp/1, gc/1,
nfd/1, nfc/1, nfkd/1, nfkc/1,
whitespace/1,
get/1,
count/1]).
-export([debug/0, id/1, bin_split/1, uc_loaded_size/0,
time_count/4 %% Used by stdlib_bench_SUITE
]).
suite() ->
[{ct_hooks,[ts_install_cth]},
{timetrap,{minutes,20}}].
all() ->
[
extra,
uppercase, lowercase, titlecase, casefold,
cp, gc,
nfd, nfc, nfkd, nfkc,
whitespace,
get,
count
].
debug() ->
Config = [{data_dir, ?MODULE_STRING++"_data"}],
[io:format("~p:~p~n",[Test,?MODULE:Test(Config)]) || Test <- all()].
extra(_) ->
{_, _} = unicode_util:spec_version(),
#{ccc:=0, compat:=[], canon:=[_,_]} = unicode_util:lookup($å),
#{fold:=229,lower:=229,title:=197,upper:=197} = unicode_util:get_case($å),
#{fold:="ss",lower:=223,title:="Ss",upper:="SS"} = unicode_util:get_case($ß),
ok.
uppercase(_) ->
[$H] = unicode_util:uppercase([$H]),
[$H] = unicode_util:uppercase([$h]),
[$1] = unicode_util:uppercase([$1]),
ok.
titlecase(_) ->
[$H] = unicode_util:titlecase([$H]),
[$H] = unicode_util:titlecase([$h]),
[$1] = unicode_util:titlecase([$1]),
ok.
lowercase(_) ->
[$h] = unicode_util:lowercase([$H]),
[$h] = unicode_util:lowercase([$h]),
[$1] = unicode_util:lowercase([$1]),
[$i] = unicode_util:casefold([$I]), %% no Turkish
ok.
casefold(_) ->
[$h] = unicode_util:casefold([$H]),
[$h] = unicode_util:casefold([$h]),
[$1] = unicode_util:casefold([$1]),
[$i] = unicode_util:casefold([$I]),%% no Turkish
[[$s,$s]|"abC"] = unicode_util:casefold([$ß,$a,$b,$C]),
[[$s,$s]] = unicode_util:casefold([$ẞ]),
ok.
whitespace(_) ->
WS = unicode_util:whitespace(),
WS = lists:filter(fun unicode_util:is_whitespace/1, WS),
%% TODO add more tests
ok.
cp(_) ->
Get = fun unicode_util:cp/1,
"hejsan" = fetch("hejsan", Get),
"hejsan" = fetch(<<"hejsan">>, Get),
"hejsan" = fetch(["hej",<<"san">>], Get),
"hejsan" = fetch(["hej"|<<"san">>], Get),
{error, <<128>>} = Get(<<128>>),
{error, [<<128>>, 0]} = Get([<<128>>, 0]),
ok.
gc(Config) ->
DataDir = proplists:get_value(data_dir, Config),
Get = fun unicode_util:gc/1,
"hejsan" = fetch("hejsan", Get),
"hejsan" = fetch(<<"hejsan">>, Get),
"hejsan" = fetch(["hej",<<"san">>], Get),
"hejsan" = fetch(["hej"|<<"san">>], Get),
{error, <<128>>} = Get(<<128>>),
{error, [<<128>>, 0]} = Get([<<128>>, 0]),
0 = fold(fun verify_gc/3, 0, DataDir ++ "/GraphemeBreakTest.txt"),
ok.
verify_gc(Line0, N, Acc) ->
Line = unicode:characters_to_list(Line0),
Line = fetch(Line0,fun unicode_util:cp/1), %% Test cp
LineGC = fetch(Line0,fun unicode_util:gc/1), %% Test gc
LineGC = fetch(Line,fun unicode_util:gc/1), %% Test gc
LineGC = fetch(LineGC,fun unicode_util:gc/1), %% Test gc
LineGC = fetch(LineGC,fun unicode_util:cp/1), %% Test cp
%io:format("Line: ~s~n",[Line]),
[Data|_Comments] = string:tokens(Line, "#"),
%% io:format("Data: ~w~n",[string:tokens(Data, " \t")]),
{Str,Res} = gc_test_data(string:tokens(Data, " \t"), [], [[]]),
%% io:format("InputStr: ~w ~w~n",[Str,unicode:characters_to_binary(Str)]),
case verify_gc(Str, Res, N, Line) andalso
verify_gc(unicode:characters_to_binary(Str), Res, N, Line0) of
true -> Acc;
false -> Acc+1
end.
verify_gc({error,_,[CP|_]}=Err, _Res, N, Line) ->
IsSurrogate = 16#D800 =< CP andalso CP =< 16#DFFF,
%% Surrogat is not valid in utf8 encoding only utf16
IsSurrogate orelse
io:format("~w: ~ts~n Error in unicode:characters_to_binary ~w~n", [N, Line, Err]),
IsSurrogate;
verify_gc(Str, Res, N, Line) ->
try fetch(Str, fun unicode_util:gc/1) of
Res -> true;
Other ->
io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[N, Line, Str, Str]),
io:format("Expected: ~p~n", [Res]),
io:format("Got: ~w~n", [Other]),
false
catch Cl:R:Stacktrace ->
io:format("~p: ~ts => |~tp|~n",[N, Line, Str]),
io:format("Expected: ~p~n", [Res]),
erlang:raise(Cl,R,Stacktrace)
end.
gc_test_data([[247]|Rest], Str, [First|GCs]) ->
case First of
[] -> gc_test_data(Rest, Str, [[]|GCs]);
[CP] -> gc_test_data(Rest, Str, [[],CP|GCs]);
_ -> gc_test_data(Rest, Str, [[],lists:reverse(First)|GCs])
end;
gc_test_data([[215]|Rest], Str, GCs) ->
gc_test_data(Rest, Str, GCs);
gc_test_data([Hex|Rest], Str, [First|GCs]) ->
CP = hex_to_int(Hex),
gc_test_data(Rest, [CP|Str], [[CP|First]|GCs]);
gc_test_data([], Str, [[]|GCs]) ->
{lists:reverse(Str), lists:reverse(GCs)};
gc_test_data([], Str, GCs) ->
{lists:reverse(Str), lists:reverse(GCs)}.
nfd(Config) ->
DataDir = proplists:get_value(data_dir, Config),
ok = fold(fun verify_nfd/3, 0, DataDir ++ "/NormalizationTest.txt"),
ok.
verify_nfd(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfd(Data0, LineNo, _Acc) ->
Data1 = unicode:characters_to_list(Data0),
[Data2|_Comments] = string:tokens(Data1, "#"),
Columns = string:tokens(Data2, ";"),
[C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
C <- Columns],
C3GC = fetch(C3, fun unicode_util:gc/1),
try
C3GC = fetch(C1, fun unicode_util:nfd/1),
C3GC = fetch(C2, fun unicode_util:nfd/1),
C3GC = fetch(C3, fun unicode_util:nfd/1)
catch _Cl:{badmatch, Other} = _R: Stacktrace ->
io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
io:format("Expected: ~ts ~w~n", [C3GC, C3GC]),
io:format("Got: ~ts ~w~n", [Other, Other]),
erlang:raise(_Cl,_R,Stacktrace);
Cl:R:Stacktrace ->
io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
io:format("Expected: ~p~n", [C3]),
erlang:raise(Cl,R,Stacktrace)
end,
C5GC = fetch(C5, fun unicode_util:gc/1),
try
C5GC = fetch(C4, fun unicode_util:nfd/1),
C5GC = fetch(C5, fun unicode_util:nfd/1)
catch _Cl2:{badmatch, Other2} = _R2:Stacktrace2 ->
io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
io:format("Expected: ~ts ~w~n", [C5GC, C5GC]),
io:format("Got: ~ts ~w~n", [Other2, Other2]),
erlang:raise(_Cl2,_R2,Stacktrace2);
Cl2:R2:Stacktrace2 ->
io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
io:format("Expected: ~p~n", [C5]),
erlang:raise(Cl2,R2,Stacktrace2)
end,
ok.
nfc(Config) ->
DataDir = proplists:get_value(data_dir, Config),
ok = fold(fun verify_nfc/3, 0, DataDir ++ "/NormalizationTest.txt"),
ok.
verify_nfc(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfc(Data0, LineNo, _Acc) ->
Data1 = unicode:characters_to_list(Data0),
[Data2|_Comments] = string:tokens(Data1, "#"),
Columns = string:tokens(Data2, ";"),
[C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
C <- Columns],
C2GC = fetch(C2, fun unicode_util:gc/1),
try
C2GC = fetch(C1, fun unicode_util:nfc/1),
C2GC = fetch(C2, fun unicode_util:nfc/1),
C2GC = fetch(C3, fun unicode_util:nfc/1)
catch _Cl:{badmatch, Other} = _R:Stacktrace ->
io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
io:format("Expected: ~ts ~w~n", [C2GC, C2GC]),
io:format("Got: ~ts ~w~n", [Other, Other]),
erlang:raise(_Cl,_R,Stacktrace);
Cl:R:Stacktrace ->
io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
io:format("Expected: ~p~n", [C3]),
erlang:raise(Cl,R,Stacktrace)
end,
C4GC = fetch(C4, fun unicode_util:gc/1),
try
C4GC = fetch(C4, fun unicode_util:nfc/1),
C4GC = fetch(C5, fun unicode_util:nfc/1)
catch _Cl2:{badmatch, Other2} = _R2:Stacktrace2 ->
io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
io:format("Expected: ~ts ~w~n", [C4GC, C4GC]),
io:format("Got: ~ts ~w~n", [Other2, Other2]),
erlang:raise(_Cl2,_R2,Stacktrace2);
Cl2:R2:Stacktrace2 ->
io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
io:format("Expected: ~p~n", [C5]),
erlang:raise(Cl2,R2,Stacktrace2)
end,
ok.
nfkd(Config) ->
DataDir = proplists:get_value(data_dir, Config),
ok = fold(fun verify_nfkd/3, 0, DataDir ++ "/NormalizationTest.txt"),
ok.
verify_nfkd(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfkd(Data0, LineNo, _Acc) ->
Data1 = unicode:characters_to_list(Data0),
[Data2|_Comments] = string:tokens(Data1, "#"),
Columns = string:tokens(Data2, ";"),
[C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
C <- Columns],
C5GC = lists:flatten(fetch(C5, fun unicode_util:gc/1)),
try
C5GC = lists:flatten(fetch(C1, fun unicode_util:nfkd/1)),
C5GC = lists:flatten(fetch(C2, fun unicode_util:nfkd/1)),
C5GC = lists:flatten(fetch(C3, fun unicode_util:nfkd/1)),
C5GC = lists:flatten(fetch(C4, fun unicode_util:nfkd/1)),
C5GC = lists:flatten(fetch(C5, fun unicode_util:nfkd/1))
catch _Cl:{badmatch, Other} = _R:Stacktrace ->
io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C5, C5]),
io:format("Expected: ~ts ~w~n", [C5GC, C5GC]),
io:format("Got: ~ts ~w~n", [Other, Other]),
erlang:raise(_Cl,_R,Stacktrace);
Cl:R:Stacktrace ->
io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
io:format("Expected: ~p~n", [C3]),
erlang:raise(Cl,R,Stacktrace)
end,
ok.
nfkc(Config) ->
DataDir = proplists:get_value(data_dir, Config),
ok = fold(fun verify_nfkc/3, 0, DataDir ++ "/NormalizationTest.txt"),
ok.
verify_nfkc(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfkc(Data0, LineNo, _Acc) ->
Data1 = unicode:characters_to_list(Data0),
[Data2|_Comments] = string:tokens(Data1, "#"),
Columns = string:tokens(Data2, ";"),
[C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
C <- Columns],
C4GC = lists:flatten(fetch(C4, fun unicode_util:gc/1)),
try
C4GC = lists:flatten(fetch(C1, fun unicode_util:nfkc/1)),
C4GC = lists:flatten(fetch(C2, fun unicode_util:nfkc/1)),
C4GC = lists:flatten(fetch(C3, fun unicode_util:nfkc/1)),
C4GC = lists:flatten(fetch(C4, fun unicode_util:nfkc/1)),
C4GC = lists:flatten(fetch(C5, fun unicode_util:nfkc/1))
catch _Cl:{badmatch, Other} = _R:Stacktrace ->
io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C4, C4]),
io:format("Expected: ~ts ~w~n", [C4GC, C4GC]),
io:format("Got: ~ts ~w~n", [Other, Other]),
erlang:raise(_Cl,_R,Stacktrace);
Cl:R:Stacktrace ->
io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
io:format("Expected: ~p~n", [C3]),
erlang:raise(Cl,R,Stacktrace)
end,
ok.
get(_) ->
add_get_tests.
count(Config) ->
Parent = self(),
Exec = fun() ->
do_measure(Config),
Parent ! {test_done, self()}
end,
ct:timetrap({minutes,5}),
case ct:get_timetrap_info() of
{_,{_,Scale}} when Scale > 1 ->
{skip,{measurments_skipped_debug,Scale}};
_ -> % No scaling, run at most 2 min
Tester = spawn(Exec),
receive {test_done, Tester} -> ok
after 120000 ->
io:format("Timelimit reached stopping~n",[]),
exit(Tester, die)
end,
ok
end.
do_measure(Config) ->
DataDir = proplists:get_value(data_dir, Config),
File = DataDir ++ "/NormalizationTest.txt",
{ok, Bin} = file:read_file(File),
Do = fun(Func, Mode) ->
{N, Mean, Stddev, Res} = time_count(Func, Mode, Bin, 10),
io:format("~4w ~6w ~.10w ~.6wms ±~.2wms #~.2w~n",
[Func, Mode, Res, Mean div 1000, Stddev div 1000, N])
end,
Do(lref, list),
Do(bref, binary),
io:format("----------------------~n"),
[Do(What,Mode) || What <- [cp, gc, nfd, nfc, nfkd, nfkc], Mode <- [list, deep_l, binary, deep_b]],
io:format("Size of unicode_util: ~pkB~n",[uc_loaded_size() div 1024]),
ok.
uc_loaded_size() ->
uc_loaded_size(binary:split(erlang:system_info(loaded), <<$\n>>, [global])).
uc_loaded_size([<<"unicode_util ", Rest/binary>>|_]) ->
[StrSize|_] = binary:split(Rest, <<$\s>>),
binary_to_integer(StrSize);
uc_loaded_size([_|Rest]) ->
uc_loaded_size(Rest).
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
time_count(Fun, Mode, Bin, Repeat) ->
timer:sleep(100), %% Let emulator catch up and clean things before test runs
Self = self(),
Pid = spawn_link(fun() ->
Str = mode(Mode, Bin),
Self ! {self(),do_count(0,0,0, Fun, Str, undefined, Repeat)}
end),
receive {Pid,Msg} -> Msg end.
do_count(N,Sum,SumSq, Fun, Str, _, Repeat) when N < Repeat ->
{Time, Res} = do_count(Fun, Str),
do_count(N+1,Sum+Time,SumSq+Time*Time, Fun, Str, Res, Repeat);
do_count(N,Sum,SumSq, _, _, Res, _) ->
Mean = round(Sum / N),
Stdev = round(math:sqrt((SumSq - (Sum*Sum/N))/(N - 1))),
{N, Mean, Stdev, Res}.
do_count(Fun, Str) ->
Count = fun Count(Str0, N) ->
case unicode_util:Fun(Str0) of
[] -> N;
[_|Str1] -> Count(Str1,N+1)
end
end,
if Fun =/= lref, Fun =/= bref ->
timer:tc(fun() -> Count(Str, 0) end);
true ->
Pick = case Fun of
lref -> id(id);
bref -> id(bin_split)
end,
Ref = fun LR(Str0, N) ->
case ?MODULE:Pick(Str0) of
[] -> N;
[_|Str1] -> LR(Str1,N+1)
end
end,
timer:tc(fun() -> Ref(Str, 0) end)
end.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
id(Op) -> Op.
bin_split(<<>>) -> [];
bin_split(<<CP/utf8,R/binary>>) -> [CP|R].
mode(binary, Bin) -> Bin;
mode(list, Bin) -> unicode:characters_to_list(Bin);
mode(deep_b, Bin) -> [Bin];
mode(deep_l, Bin) -> [unicode:characters_to_list(Bin)].
fetch(Str, F) ->
case F(Str) of
[] -> [];
[CP|R] ->
%% If input is a binary R should be binary
if is_binary(Str) == false -> ok;
is_binary(R); R =:= [] -> ok;
true ->
io:format("Char: ~tc Tail:~tP~n", [CP,R,10]),
exit({bug, F})
end,
[CP|fetch(R,F)]
end.
%% *Test.txt file helpers
hex_to_int([]) -> [];
hex_to_int(HexStr) ->
list_to_integer(string:strip(HexStr, both), 16).
fold(Fun, Acc, File) ->
io:format("Processing ~s~n",[File]),
{ok, Fd} = file:open(File, [read, raw, binary, {read_ahead, 100000}]),
Get = fun() -> file:read_line(Fd) end,
try
fold_1(Fun, 1, Acc, Get)
after
ok = file:close(Fd)
end.
fold_1(Fun, Line, Acc, Get) ->
case Get() of
eof -> Acc;
{ok, <<"#",_/binary>>} -> %% Ignore comments
fold_1(Fun, Line+1, Acc, Get);
{ok, <<"\n">>} -> %% Ignore empty lines
fold_1(Fun, Line+1, Acc, Get);
{ok, Data} ->
fold_1(Fun, Line+1, Fun(Data, Line, Acc), Get)
end.