diff options
Diffstat (limited to 'lib/stdlib/test/unicode_util_SUITE.erl')
| -rw-r--r-- | lib/stdlib/test/unicode_util_SUITE.erl | 433 | 
1 files changed, 433 insertions, 0 deletions
%% diff --git a/lib/stdlib/test/unicode_util_SUITE.erl b/lib/stdlib/test/unicode_util_SUITE.erl
%% new file mode 100644
%% index 0000000000..03c24c7027
%% --- /dev/null
%% +++ b/lib/stdlib/test/unicode_util_SUITE.erl
%% @@ -0,0 +1,433 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2017. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
-module(unicode_util_SUITE).

-include_lib("common_test/include/ct.hrl").

%% Suite callbacks and test cases.
-export([all/0, suite/0, extra/1,
         uppercase/1, lowercase/1, titlecase/1, casefold/1,
         cp/1, gc/1,
         nfd/1, nfc/1, nfkd/1, nfkc/1,
         whitespace/1,
         get/1,
         count/1]).

%% Helpers that are reached through ?MODULE:Name(...) calls or run by hand.
-export([debug/0, id/1, bin_split/1, uc_loaded_size/0]).

%% Suite-wide settings: the install hook and a 20 minute timetrap.
suite() ->
    Hooks = {ct_hooks, [ts_install_cth]},
    TimeTrap = {timetrap, {minutes, 20}},
    [Hooks, TimeTrap].

%% Every test case of the suite, in execution order.
all() ->
    [extra,
     uppercase, lowercase, titlecase, casefold,
     cp, gc,
     nfd, nfc, nfkd, nfkc,
     whitespace,
     get,
     count].

%% Runs every test case with a data_dir derived from the module name and
%% prints each case together with its return value.
debug() ->
    DataDir = ?MODULE_STRING ++ "_data",
    Config = [{data_dir, DataDir}],
    [io:format("~p:~p~n", [Case, ?MODULE:Case(Config)]) || Case <- all()].

%% Checks the shape of the results of spec_version/0, lookup/1 and get_case/1.
extra(_Config) ->
    {_, _} = unicode_util:spec_version(),
    ARing = $å,
    SharpS = $ß,
    #{ccc := 0, compat := [], canon := [_, _]} = unicode_util:lookup(ARing),
    #{fold := 229, lower := 229, title := 197, upper := 197} =
        unicode_util:get_case(ARing),
    #{fold := "ss", lower := 223, title := "Ss", upper := "SS"} =
        unicode_util:get_case(SharpS),
    ok.

%% uppercase/1 on a one-character string: letters become capitals and a
%% digit is returned unchanged.
uppercase(_) ->
    [$H] = unicode_util:uppercase([$H]),
    [$H] = unicode_util:uppercase([$h]),
    [$1] = unicode_util:uppercase([$1]),
    ok.

%% titlecase/1 on a one-character string: same expectations as uppercase/1
%% for these inputs.
titlecase(_) ->
    [$H] = unicode_util:titlecase([$H]),
    [$H] = unicode_util:titlecase([$h]),
    [$1] = unicode_util:titlecase([$1]),
    ok.

%% lowercase/1 on a one-character string.
lowercase(_) ->
    [$h] = unicode_util:lowercase([$H]),
    [$h] = unicode_util:lowercase([$h]),
    [$1] = unicode_util:lowercase([$1]),
    %% No Turkish special casing: $I must map to a plain $i.  This used to
    %% call casefold/1, which left lowercase/1 untested for this input.
    [$i] = unicode_util:lowercase([$I]),
    ok.

%% casefold/1, including the characters that fold to the two-element "ss".
casefold(_) ->
    [$h] = unicode_util:casefold([$H]),
    [$h] = unicode_util:casefold([$h]),
    [$1] = unicode_util:casefold([$1]),
    [$i] = unicode_util:casefold([$I]), %% no Turkish
    [[$s,$s]|"abC"] = unicode_util:casefold([$ß,$a,$b,$C]),
    [[$s,$s]] = unicode_util:casefold([$ẞ]),
    ok.

%% Everything returned by whitespace/0 must satisfy is_whitespace/1.
whitespace(_) ->
    WS = unicode_util:whitespace(),
    WS = lists:filter(fun unicode_util:is_whitespace/1, WS),
    %% TODO add more tests
    ok.

%% cp/1 over lists, binaries and mixed/improper chardata, plus the error
%% tuple returned for an invalid UTF-8 byte.
cp(_) ->
    Get = fun unicode_util:cp/1,
    "hejsan" = fetch("hejsan", Get),
    "hejsan" = fetch(<<"hejsan">>, Get),
    "hejsan" = fetch(["hej",<<"san">>], Get),
    "hejsan" = fetch(["hej"|<<"san">>], Get),
    {error, <<128>>} = Get(<<128>>),
    {error, [<<128>>, 0]} = Get([<<128>>, 0]),
    ok.

%% gc/1 over the same inputs as cp/1, then over every line of
%% GraphemeBreakTest.txt in the data directory.  verify_gc/3 counts the
%% failing lines, so the fold must return 0.
gc(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    Get = fun unicode_util:gc/1,
    "hejsan" = fetch("hejsan", Get),
    "hejsan" = fetch(<<"hejsan">>, Get),
    "hejsan" = fetch(["hej",<<"san">>], Get),
    "hejsan" = fetch(["hej"|<<"san">>], Get),
    {error, <<128>>} = Get(<<128>>),
    {error, [<<128>>, 0]} = Get([<<128>>, 0]),

    0 = fold(fun verify_gc/3, 0, DataDir ++ "/GraphemeBreakTest.txt"),
    ok.

%% Fold callback for one line of GraphemeBreakTest.txt.
%%   Line0 - the raw line (binary) as read from the file
%%   N     - line number, used only in the failure report
%%   Acc   - number of failed lines so far
%% Returns Acc, or Acc+1 when gc/1 splits the line's code points differently
%% from what the line specifies.  Any other exception is reported and
%% re-raised with its original stacktrace.
verify_gc(Line0, N, Acc) ->
    %% Sanity checks: walking the raw line with cp/1 and gc/1 must be
    %% consistent whatever representation the walk starts from.
    Line = unicode:characters_to_list(Line0),
    Line = fetch(Line0,fun unicode_util:cp/1), %% Test cp
    LineGC = fetch(Line0,fun unicode_util:gc/1), %% Test gc
    LineGC = fetch(Line,fun unicode_util:gc/1), %% Test gc
    LineGC = fetch(LineGC,fun unicode_util:gc/1), %% Test gc
    LineGC = fetch(LineGC,fun unicode_util:cp/1), %% Test cp

    [Data|_Comments] = string:tokens(Line, "#"),
    {Str,Res} = gc_test_data(string:tokens(Data, " \t"), [], [[]]),
    try
        Res = fetch(Str, fun unicode_util:gc/1),
        Acc
    catch
        _:{badmatch, Other} ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[N, Line, Str, Str]),
            io:format("Expected: ~p~n", [Res]),
            io:format("Got: ~w~n", [Other]),
            Acc+1;
        Class:Reason:Stacktrace ->
            %% The stacktrace is captured in the clause head; the removed
            %% erlang:get_stacktrace/0 was called after io:format/2 and
            %% could therefore have returned a stale trace.
            io:format("~p: ~ts => |~tp|~n",[N, Line, Str]),
            io:format("Expected: ~p~n", [Res]),
            erlang:raise(Class, Reason, Stacktrace)
    end.

%% Converts the tokens of a test line into {CodePoints, ExpectedClusters}.
%% Token [247] closes the cluster being collected, token [215] is skipped,
%% and every other token is a hexadecimal code point.  A cluster of a single
%% code point is stored as the bare integer, a longer one as a list.
gc_test_data([[247]|Rest], Str, [First|GCs]) ->
    case First of
        [] -> gc_test_data(Rest, Str, [[]|GCs]);
        [CP] -> gc_test_data(Rest, Str, [[],CP|GCs]);
        _  -> gc_test_data(Rest, Str, [[],lists:reverse(First)|GCs])
    end;
gc_test_data([[215]|Rest], Str, GCs) ->
    gc_test_data(Rest, Str, GCs);
gc_test_data([Hex|Rest], Str, [First|GCs]) ->
    CP = hex_to_int(Hex),
    gc_test_data(Rest, [CP|Str], [[CP|First]|GCs]);
gc_test_data([], Str, [[]|GCs]) ->
    %% Drop the empty cluster opened by the final break token.
    {lists:reverse(Str), lists:reverse(GCs)};
gc_test_data([], Str, GCs) ->
    {lists:reverse(Str), lists:reverse(GCs)}.

%% Runs verify_nfd/3 over every data line of NormalizationTest.txt.
nfd(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    ok = fold(fun verify_nfd/3, 0, DataDir ++ "/NormalizationTest.txt"),
    ok.

%% Fold callback for one line of NormalizationTest.txt, checking nfd/1.
%% "@Part" header lines are skipped.  A data line holds at least five
%% ';'-separated columns of hexadecimal code points (C1..C5); the checks are
%%   nfd(C1) == nfd(C2) == nfd(C3) == gc(C3)   and
%%   nfd(C4) == nfd(C5) == gc(C5).
%% Returns ok; on a mismatch the details are printed and the exception is
%% re-raised with the stacktrace captured at the point of failure.
verify_nfd(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfd(Data0, LineNo, _Acc) ->
    Data1 = unicode:characters_to_list(Data0),
    [Data2|_Comments] = string:tokens(Data1, "#"),
    Columns = string:tokens(Data2, ";"),
    [C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
                             C <- Columns],
    C3GC = fetch(C3, fun unicode_util:gc/1),
    try
        C3GC = fetch(C1, fun unicode_util:nfd/1),
        C3GC = fetch(C2, fun unicode_util:nfd/1),
        C3GC = fetch(C3, fun unicode_util:nfd/1)
    catch
        BmClass:{badmatch, Other}:BmStack ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
            io:format("Expected: ~ts ~w~n", [C3GC, C3GC]),
            io:format("Got: ~ts ~w~n", [Other, Other]),
            erlang:raise(BmClass, {badmatch, Other}, BmStack);
        Class:Reason:Stack ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            io:format("Expected: ~p~n", [C3]),
            erlang:raise(Class, Reason, Stack)
    end,
    C5GC = fetch(C5, fun unicode_util:gc/1),
    %% Variables bound in the first try are unsafe after it, so the second
    %% try uses its own names.
    try
        C5GC = fetch(C4, fun unicode_util:nfd/1),
        C5GC = fetch(C5, fun unicode_util:nfd/1)
    catch
        BmClass2:{badmatch, Other2}:BmStack2 ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
            io:format("Expected: ~ts ~w~n", [C5GC, C5GC]),
            io:format("Got:      ~ts ~w~n", [Other2, Other2]),
            erlang:raise(BmClass2, {badmatch, Other2}, BmStack2);
        Class2:Reason2:Stack2 ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            io:format("Expected: ~p~n", [C5]),
            erlang:raise(Class2, Reason2, Stack2)
    end,
    ok.

%% Runs verify_nfc/3 over every data line of NormalizationTest.txt.
nfc(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    ok = fold(fun verify_nfc/3, 0, DataDir ++ "/NormalizationTest.txt"),
    ok.

%% Fold callback for one line of NormalizationTest.txt, checking nfc/1.
%% "@Part" header lines are skipped.  A data line holds at least five
%% ';'-separated columns of hexadecimal code points (C1..C5); the checks are
%%   nfc(C1) == nfc(C2) == nfc(C3) == gc(C2)   and
%%   nfc(C4) == nfc(C5) == gc(C4).
%% Returns ok; on a mismatch the details are printed and the exception is
%% re-raised with the stacktrace captured at the point of failure.
verify_nfc(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfc(Data0, LineNo, _Acc) ->
    Data1 = unicode:characters_to_list(Data0),
    [Data2|_Comments] = string:tokens(Data1, "#"),
    Columns = string:tokens(Data2, ";"),
    [C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
                             C <- Columns],
    C2GC = fetch(C2, fun unicode_util:gc/1),
    try
        C2GC = fetch(C1, fun unicode_util:nfc/1),
        C2GC = fetch(C2, fun unicode_util:nfc/1),
        C2GC = fetch(C3, fun unicode_util:nfc/1)
    catch
        BmClass:{badmatch, Other}:BmStack ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
            io:format("Expected: ~ts ~w~n", [C2GC, C2GC]),
            io:format("Got:      ~ts ~w~n", [Other, Other]),
            erlang:raise(BmClass, {badmatch, Other}, BmStack);
        Class:Reason:Stack ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            io:format("Expected: ~p~n", [C3]),
            erlang:raise(Class, Reason, Stack)
    end,
    C4GC = fetch(C4, fun unicode_util:gc/1),
    %% Variables bound in the first try are unsafe after it, so the second
    %% try uses its own names.
    try
        C4GC = fetch(C4, fun unicode_util:nfc/1),
        C4GC = fetch(C5, fun unicode_util:nfc/1)
    catch
        BmClass2:{badmatch, Other2}:BmStack2 ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
            io:format("Expected: ~ts ~w~n", [C4GC, C4GC]),
            io:format("Got: ~ts ~w~n", [Other2, Other2]),
            erlang:raise(BmClass2, {badmatch, Other2}, BmStack2);
        Class2:Reason2:Stack2 ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            io:format("Expected: ~p~n", [C5]),
            erlang:raise(Class2, Reason2, Stack2)
    end,
    ok.

%% Runs verify_nfkd/3 over every data line of NormalizationTest.txt.
nfkd(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    ok = fold(fun verify_nfkd/3, 0, DataDir ++ "/NormalizationTest.txt"),
    ok.

%% Fold callback for one line of NormalizationTest.txt, checking nfkd/1.
%% "@Part" header lines are skipped.  A data line holds at least five
%% ';'-separated columns of hexadecimal code points (C1..C5); nfkd/1 of every
%% column must equal C5 once both sides are flattened.
%% Returns ok; on a mismatch the details are printed and the exception is
%% re-raised with the stacktrace captured at the point of failure.
verify_nfkd(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfkd(Data0, LineNo, _Acc) ->
    Data1 = unicode:characters_to_list(Data0),
    [Data2|_Comments] = string:tokens(Data1, "#"),
    Columns = string:tokens(Data2, ";"),
    [C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
                             C <- Columns],
    C5GC = lists:flatten(fetch(C5, fun unicode_util:gc/1)),
    try
        C5GC = lists:flatten(fetch(C1, fun unicode_util:nfkd/1)),
        C5GC = lists:flatten(fetch(C2, fun unicode_util:nfkd/1)),
        C5GC = lists:flatten(fetch(C3, fun unicode_util:nfkd/1)),
        C5GC = lists:flatten(fetch(C4, fun unicode_util:nfkd/1)),
        C5GC = lists:flatten(fetch(C5, fun unicode_util:nfkd/1))
    catch
        BmClass:{badmatch, Other}:BmStack ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C5, C5]),
            io:format("Expected: ~ts ~w~n", [C5GC, C5GC]),
            io:format("Got: ~ts ~w~n", [Other, Other]),
            erlang:raise(BmClass, {badmatch, Other}, BmStack);
        Class:Reason:Stack ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            io:format("Expected: ~p~n", [C3]),
            erlang:raise(Class, Reason, Stack)
    end,
    ok.


%% Runs verify_nfkc/3 over every data line of NormalizationTest.txt.
nfkc(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    ok = fold(fun verify_nfkc/3, 0, DataDir ++ "/NormalizationTest.txt"),
    ok.

%% Fold callback for one line of NormalizationTest.txt, checking nfkc/1.
%% "@Part" header lines are skipped.  A data line holds at least five
%% ';'-separated columns of hexadecimal code points (C1..C5); nfkc/1 of every
%% column must equal C4 once both sides are flattened.
%% Returns ok; on a mismatch the details are printed and the exception is
%% re-raised with the stacktrace captured at the point of failure.
verify_nfkc(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfkc(Data0, LineNo, _Acc) ->
    Data1 = unicode:characters_to_list(Data0),
    [Data2|_Comments] = string:tokens(Data1, "#"),
    Columns = string:tokens(Data2, ";"),
    [C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
                             C <- Columns],
    C4GC = lists:flatten(fetch(C4, fun unicode_util:gc/1)),
    try
        C4GC = lists:flatten(fetch(C1, fun unicode_util:nfkc/1)),
        C4GC = lists:flatten(fetch(C2, fun unicode_util:nfkc/1)),
        C4GC = lists:flatten(fetch(C3, fun unicode_util:nfkc/1)),
        C4GC = lists:flatten(fetch(C4, fun unicode_util:nfkc/1)),
        C4GC = lists:flatten(fetch(C5, fun unicode_util:nfkc/1))
    catch
        BmClass:{badmatch, Other}:BmStack ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C4, C4]),
            io:format("Expected: ~ts ~w~n", [C4GC, C4GC]),
            io:format("Got:      ~ts ~w~n", [Other, Other]),
            erlang:raise(BmClass, {badmatch, Other}, BmStack);
        Class:Reason:Stack ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            io:format("Expected: ~p~n", [C3]),
            erlang:raise(Class, Reason, Stack)
    end,
    ok.

%% Placeholder test case; it only returns the atom add_get_tests.
get(_) ->
    add_get_tests.

%% Timing measurements.  Sets a 5 minute timetrap, then skips the case when
%% ct:get_timetrap_info/0 reports a scaling value and measures otherwise.
count(Config) ->
    ct:timetrap({minutes,5}),
    case ct:get_timetrap_info() of
        {_,{_,Scale}} ->
            {skip,{measurments_skipped_debug,Scale}};
        _ -> % No scaling
            do_measure(Config)
    end.

%% Times every traversal function over NormalizationTest.txt, in each input
%% representation, and prints one result row per combination followed by the
%% loaded size of unicode_util.
do_measure(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    {ok, Bin} = file:read_file(DataDir ++ "/NormalizationTest.txt"),
    Report =
        fun(Func, Mode) ->
                {Runs, Mean, Stddev, Count} = time_count(Func, Mode, Bin),
                Row = [Func, Mode, Count, Mean div 1000, Stddev div 1000, Runs],
                io:format("~4w ~6w ~.10w ~.6wms ±~.2wms #~.2w~n", Row)
        end,
    %% Reference loops first: plain list walk and plain UTF-8 binary walk.
    Report(lref, list),
    Report(bref, binary),
    io:format("----------------------~n"),
    Funcs = [cp, gc, nfd, nfc, nfkd, nfkc],
    Modes = [list, deep_l, binary, deep_b],
    lists:foreach(
      fun(Func) ->
              lists:foreach(fun(Mode) -> Report(Func, Mode) end, Modes)
      end, Funcs),
    SizeKB = uc_loaded_size() div 1024,
    io:format("Size of unicode_util: ~pkB~n", [SizeKB]),
    ok.

%% Size of the loaded unicode_util module, taken from the
%% erlang:system_info(loaded) listing.
uc_loaded_size() ->
    Listing = erlang:system_info(loaded),
    Rows = binary:split(Listing, <<"\n">>, [global]),
    uc_loaded_size(Rows).

%% Finds the row that starts with "unicode_util " and returns the integer in
%% the field that follows the name.
uc_loaded_size([Row | Rows]) ->
    case Row of
        <<"unicode_util ", Tail/binary>> ->
            [SizeField | _] = binary:split(Tail, <<" ">>),
            binary_to_integer(SizeField);
        _ ->
            uc_loaded_size(Rows)
    end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Runs the measurement in a fresh linked process and returns
%% {Runs, Mean, Stddev, Count} as computed by do_count/6.
time_count(Fun, Mode, Bin) ->
    timer:sleep(100), %% Let emulator catch up and clean things before test runs
    Parent = self(),
    Worker =
        spawn_link(
          fun() ->
                  Input = mode(Mode, Bin),
                  Parent ! {self(), do_count(0, 0, 0, Fun, Input, undefined)}
          end),
    receive
        {Worker, Result} -> Result
    end.

%% Repeats the timed traversal ten times, accumulating the sum and the sum of
%% squares of the run times, then returns {Runs, Mean, Stddev, LastCount}.
do_count(Runs, Total, TotalSq, Fun, Str, _Prev) when Runs < 10 ->
    {Elapsed, Count} = do_count(Fun, Str),
    do_count(Runs + 1, Total + Elapsed, TotalSq + Elapsed * Elapsed,
             Fun, Str, Count);
do_count(Runs, Total, TotalSq, _Fun, _Str, Count) ->
    Mean = round(Total / Runs),
    Variance = (TotalSq - (Total * Total / Runs)) / (Runs - 1),
    Stddev = round(math:sqrt(Variance)),
    {Runs, Mean, Stddev, Count}.

%% Times one full traversal of Str and returns {Microseconds, Steps}.
%% lref and bref select the reference steppers id/1 and bin_split/1 in this
%% module; any other Fun names the unicode_util function to step with.
do_count(Fun, Str) when Fun =:= lref; Fun =:= bref ->
    %% id/1 hides the function name so the call below stays a dynamic one.
    Pick = case Fun of
               lref -> id(id);
               bref -> id(bin_split)
           end,
    Walk = fun Walk(Rest0, Steps) ->
                   case ?MODULE:Pick(Rest0) of
                       [] -> Steps;
                       [_ | Rest1] -> Walk(Rest1, Steps + 1)
                   end
           end,
    timer:tc(fun() -> Walk(Str, 0) end);
do_count(Fun, Str) ->
    Walk = fun Walk(Rest0, Steps) ->
                   case unicode_util:Fun(Rest0) of
                       [] -> Steps;
                       [_ | Rest1] -> Walk(Rest1, Steps + 1)
                   end
           end,
    timer:tc(fun() -> Walk(Str, 0) end).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


%% Identity function.
id(Term) -> Term.

%% Reference stepper for binaries: [] for the empty binary, otherwise the
%% first UTF-8 code point consed onto the remaining binary.
bin_split(<<>>) -> [];
bin_split(<<First/utf8, Tail/binary>>) -> [First | Tail].

%% Converts the file contents into the representation named by the mode.
mode(list, Bin) ->
    unicode:characters_to_list(Bin);
mode(deep_l, Bin) ->
    Chars = unicode:characters_to_list(Bin),
    [Chars];
mode(binary, Bin) ->
    Bin;
mode(deep_b, Bin) ->
    [Bin].

%% Applies the stepper F repeatedly, collecting each head it returns, until
%% F returns [].
fetch(Str, F) ->
    fetch_1(F(Str), F, []).

fetch_1([], _F, Acc) ->
    lists:reverse(Acc);
fetch_1([Head | Rest], F, Acc) ->
    fetch_1(F(Rest), F, [Head | Acc]);
fetch_1(Other, _F, _Acc) ->
    %% Same failure as a case expression with no matching clause.
    erlang:error({case_clause, Other}).

%% *Test.txt file helpers

%% Parses a hexadecimal token, ignoring surrounding blanks; an empty token
%% is returned as [].
hex_to_int([]) ->
    [];
hex_to_int(HexStr) ->
    Digits = string:strip(HexStr),
    list_to_integer(Digits, 16).

%% Opens File and folds Fun(Line, LineNo, Acc) over its lines via fold_1/4,
%% starting at line 1.  The file is closed even when the fold fails.
fold(Fun, Acc, File) ->
    io:format("Processing ~s~n", [File]),
    OpenOpts = [read, raw, binary, {read_ahead, 100000}],
    {ok, Fd} = file:open(File, OpenOpts),
    NextLine = fun() -> file:read_line(Fd) end,
    try
        fold_1(Fun, 1, Acc, NextLine)
    after
        ok = file:close(Fd)
    end.

%% Line loop behind fold/3.  Get() yields {ok, Data} or eof.  Lines that
%% start with "#" and lines holding only a newline leave the accumulator
%% untouched but still advance the line number; every other line is passed
%% to Fun(Data, LineNo, Acc).
fold_1(Fun, LineNo, Acc0, Get) ->
    case Get() of
        eof ->
            Acc0;
        {ok, Data} ->
            Acc1 = case Data of
                       <<"#", _/binary>> -> Acc0;  %% Ignore comments
                       <<"\n">> -> Acc0;           %% Ignore empty lines
                       _ -> Fun(Data, LineNo, Acc0)
                   end,
            fold_1(Fun, LineNo + 1, Acc1, Get)
    end.
