diff options
author | Dan Gudmundsson <[email protected]> | 2017-01-09 13:54:43 +0100 |
---|---|---|
committer | Dan Gudmundsson <[email protected]> | 2017-04-24 12:16:50 +0200 |
commit | 6d5b392670a427914ad0413e4abbb89dac15ab0a (patch) | |
tree | 0279b61eb7b2022850c7ace9f4af0642e387388a /lib/stdlib/test/unicode_util_SUITE.erl | |
parent | 621cedccc78581330b9628c559b0d851c303564f (diff) | |
download | otp-6d5b392670a427914ad0413e4abbb89dac15ab0a.tar.gz otp-6d5b392670a427914ad0413e4abbb89dac15ab0a.tar.bz2 otp-6d5b392670a427914ad0413e4abbb89dac15ab0a.zip |
Add unicode_util
A base for unicode functions, not intended to be a user api.
whitespace/0 returns a reasonable subset of the whitespace characters,
excluding the no-break ones.
Implementation notes:
Use function clauses instead of arrays, and store tuples instead of
maps, to save space.
Diffstat (limited to 'lib/stdlib/test/unicode_util_SUITE.erl')
-rw-r--r-- | lib/stdlib/test/unicode_util_SUITE.erl | 429 |
1 files changed, 429 insertions, 0 deletions
%% diff --git a/lib/stdlib/test/unicode_util_SUITE.erl b/lib/stdlib/test/unicode_util_SUITE.erl new file mode 100644 index 0000000000..e9b3d7f98d --- /dev/null +++ b/lib/stdlib/test/unicode_util_SUITE.erl @@ -0,0 +1,429 @@
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 2017. All Rights Reserved.
%%
%% Licensed under the Apache License, Version 2.0 (the "License");
%% you may not use this file except in compliance with the License.
%% You may obtain a copy of the License at
%%
%%     http://www.apache.org/licenses/LICENSE-2.0
%%
%% Unless required by applicable law or agreed to in writing, software
%% distributed under the License is distributed on an "AS IS" BASIS,
%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
%% See the License for the specific language governing permissions and
%% limitations under the License.
%%
%% %CopyrightEnd%
%%
-module(unicode_util_SUITE).

-include_lib("common_test/include/ct.hrl").

-export([all/0, suite/0, extra/1,
         uppercase/1, lowercase/1, titlecase/1, casefold/1,
         cp/1, gc/1,
         nfd/1, nfc/1, nfkd/1, nfkc/1,
         whitespace/1,
         get/1,
         count/1]).

-export([debug/0, id/1, bin_split/1, uc_loaded_size/0]).

%% Suite-wide Common Test settings: install the ts_install_cth hook and
%% allow each test case 20 minutes.
suite() ->
    Hooks = {ct_hooks, [ts_install_cth]},
    TimeTrap = {timetrap, {minutes, 20}},
    [Hooks, TimeTrap].

%% The test cases, in execution order.
all() ->
    [extra,
     uppercase, lowercase, titlecase, casefold,
     cp, gc,
     nfd, nfc, nfkd, nfkc,
     whitespace,
     get,
     count].

%% Runs every test case from all/0 outside Common Test, using
%% "<module>_data" as the data directory, and prints each result.
debug() ->
    DataDir = ?MODULE_STRING ++ "_data",
    Config = [{data_dir, DataDir}],
    [io:format("~p:~p~n", [Case, ?MODULE:Case(Config)]) || Case <- all()].

%% Smoke test of the auxiliary API: spec_version/0 yields a 2-tuple, and
%% lookup/1 and get_case/1 return maps with the expected properties.
extra(_) ->
    {_, _} = unicode_util:spec_version(),
    Props = unicode_util:lookup($å),
    #{ccc := 0, compat := [], canon := [_, _]} = Props,
    CaseA = unicode_util:get_case($å),
    #{fold := 229, lower := 229, title := 197, upper := 197} = CaseA,
    CaseS = unicode_util:get_case($ß),
    #{fold := "ss", lower := 223, title := "Ss", upper := "SS"} = CaseS,
    ok.

%% Upper-casing one ASCII code point: letters become upper case, a digit
%% is returned unchanged.
uppercase(_) ->
    "H" = unicode_util:uppercase("H"),
    "H" = unicode_util:uppercase("h"),
    "1" = unicode_util:uppercase("1"),
    ok.
%% Title-casing one ASCII code point: letters become upper case, a digit
%% is returned unchanged.
titlecase(_) ->
    [$H] = unicode_util:titlecase([$H]),
    [$H] = unicode_util:titlecase([$h]),
    [$1] = unicode_util:titlecase([$1]),
    ok.

%% Lower-casing one ASCII code point: letters become lower case, a digit
%% is returned unchanged.
lowercase(_) ->
    [$h] = unicode_util:lowercase([$H]),
    [$h] = unicode_util:lowercase([$h]),
    [$1] = unicode_util:lowercase([$1]),
    %% Must exercise lowercase/1 (not casefold/1): "I" maps to a plain "i".
    [$i] = unicode_util:lowercase([$I]), %% no Turkish
    ok.

%% Case folding: ASCII letters fold to lower case, a digit is unchanged,
%% and both sharp-s variants fold to the two-code-point cluster "ss"
%% placed as a nested list at the head of the result.
casefold(_) ->
    [$h] = unicode_util:casefold([$H]),
    [$h] = unicode_util:casefold([$h]),
    [$1] = unicode_util:casefold([$1]),
    [$i] = unicode_util:casefold([$I]), %% no Turkish
    [[$s,$s]|"abC"] = unicode_util:casefold([$ß,$a,$b,$C]),
    [[$s,$s]] = unicode_util:casefold([$ẞ]),
    ok.

%% Every element returned by whitespace/0 must satisfy is_whitespace/1.
whitespace(_) ->
    WS = unicode_util:whitespace(),
    WS = lists:filter(fun unicode_util:is_whitespace/1, WS),
    %% TODO add more tests
    ok.

%% Stepping code point by code point must yield the same string for a
%% flat list, a binary, a deep list and an improper list with binary tail.
cp(_) ->
    Get = fun unicode_util:cp/1,
    "hejsan" = fetch("hejsan", Get),
    "hejsan" = fetch(<<"hejsan">>, Get),
    "hejsan" = fetch(["hej",<<"san">>], Get),
    "hejsan" = fetch(["hej"|<<"san">>], Get),
    ok.

%% Same input shapes as cp/1 but stepping by grapheme cluster, followed by
%% the GraphemeBreakTest.txt conformance run.  The fold accumulator counts
%% failing lines (verify_gc/3 returns Acc or Acc+1), so it must end at 0.
gc(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    Get = fun unicode_util:gc/1,
    "hejsan" = fetch("hejsan", Get),
    "hejsan" = fetch(<<"hejsan">>, Get),
    "hejsan" = fetch(["hej",<<"san">>], Get),
    "hejsan" = fetch(["hej"|<<"san">>], Get),

    0 = fold(fun verify_gc/3, 0, DataDir ++ "/GraphemeBreakTest.txt"),
    ok.
%% fold/3 callback for one line of GraphemeBreakTest.txt.
%%
%% First checks that stepping over the raw line itself is consistent
%% between cp/1 and gc/1 for binary, list and already-clustered input.
%% Then parses the part before '#' into a test string and its expected
%% clusters and compares them with what gc/1 produces.
%% Returns Acc when the line passes and Acc+1 (after printing a report)
%% when the clusters differ; any other exception is reported and re-raised.
verify_gc(Line0, N, Acc) ->
    StepCp = fun unicode_util:cp/1,
    StepGc = fun unicode_util:gc/1,
    Chars = unicode:characters_to_list(Line0),
    Chars = fetch(Line0, StepCp),        %% Test cp
    Clusters = fetch(Line0, StepGc),     %% Test gc
    Clusters = fetch(Chars, StepGc),     %% Test gc
    Clusters = fetch(Clusters, StepGc),  %% Test gc
    Clusters = fetch(Clusters, StepCp),  %% Test cp

    [Spec|_] = string:tokens(Chars, "#"),
    Tokens = string:tokens(Spec, " \t"),
    {Input, Expected} = gc_test_data(Tokens, [], [[]]),
    try
        Expected = fetch(Input, StepGc),
        Acc
    catch
        _:{badmatch, Got} ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",
                      [N, Chars, Input, Input]),
            io:format("Expected: ~p~n", [Expected]),
            io:format("Got: ~w~n", [Got]),
            Acc + 1;
        Class:Reason ->
            io:format("~p: ~ts => |~tp|~n", [N, Chars, Input]),
            io:format("Expected: ~p~n", [Expected]),
            erlang:raise(Class, Reason, erlang:get_stacktrace())
    end.

%% Turns the tokens of a GraphemeBreakTest.txt line into
%% {CodePoints, ExpectedClusters}.
%%
%% The token [247] (the division sign) closes the cluster being built, the
%% token [215] (the multiplication sign) is skipped, and any other token is
%% a hex code point that is added both to the string and to the open
%% cluster.  Both accumulators are built in reverse.
gc_test_data([[247]|Tokens], Str, [Open|Closed]) ->
    gc_test_data(Tokens, Str, gc_close_cluster(Open, Closed));
gc_test_data([[215]|Tokens], Str, Clusters) ->
    gc_test_data(Tokens, Str, Clusters);
gc_test_data([Hex|Tokens], Str, [Open|Closed]) ->
    CP = hex_to_int(Hex),
    gc_test_data(Tokens, [CP|Str], [[CP|Open]|Closed]);
gc_test_data([], Str, [[]|Closed]) ->
    %% Drop the empty cluster left open after the final break.
    {lists:reverse(Str), lists:reverse(Closed)};
gc_test_data([], Str, Clusters) ->
    {lists:reverse(Str), lists:reverse(Clusters)}.

%% Closes the open cluster and starts a new empty one.  An empty cluster
%% is simply replaced, a single code point is stored as a bare integer,
%% and longer clusters are stored as a list in reading order.
gc_close_cluster([], Closed) ->
    [[]|Closed];
gc_close_cluster([CP], Closed) ->
    [[], CP|Closed];
gc_close_cluster(Open, Closed) ->
    [[], lists:reverse(Open)|Closed].

%% Runs the NFD conformance check over NormalizationTest.txt.
nfd(Config) ->
    File = proplists:get_value(data_dir, Config) ++ "/NormalizationTest.txt",
    ok = fold(fun verify_nfd/3, 0, File),
    ok.
%% fold/3 callback checking NFD for one line of NormalizationTest.txt.
%%
%% "@Part" header lines pass without checks.  For any other line the text
%% before '#' is split on ';' into columns of hex code points (C1..C5) and
%% the following must hold, comparing fetch/2 results:
%%   gc(C3) == nfd(C1) == nfd(C2) == nfd(C3)
%%   gc(C5) == nfd(C4) == nfd(C5)
%% A failure is printed with io:format/2 and the exception is re-raised.
verify_nfd(<<"@Part", _/binary>>, _LineNo, _Acc) ->
    ok;
verify_nfd(Raw, LineNo, _Acc) ->
    Text = unicode:characters_to_list(Raw),
    [Fields|_] = string:tokens(Text, "#"),
    [C1,C2,C3,C4,C5|_] =
        [[hex_to_int(Hex) || Hex <- string:tokens(Col, " ")]
         || Col <- string:tokens(Fields, ";")],
    Gc = fun unicode_util:gc/1,
    Nfd = fun unicode_util:nfd/1,

    %% Canonical group: columns 1-3 all decompose to column 3.
    Want3 = fetch(C3, Gc),
    try
        Want3 = fetch(C1, Nfd),
        Want3 = fetch(C2, Nfd),
        Want3 = fetch(C3, Nfd)
    catch
        ClassA:{badmatch, GotA} = ReasonA ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",
                      [LineNo, Text, C1, C1]),
            io:format("Expected: ~ts ~w~n", [Want3, Want3]),
            io:format("Got: ~ts ~w~n", [GotA, GotA]),
            erlang:raise(ClassA, ReasonA, erlang:get_stacktrace());
        ClassB:ReasonB ->
            io:format("~p: ~ts => |~tp|~n", [LineNo, Text, C1]),
            io:format("Expected: ~p~n", [C3]),
            erlang:raise(ClassB, ReasonB, erlang:get_stacktrace())
    end,

    %% Compatibility group: columns 4-5 both decompose to column 5.
    Want5 = fetch(C5, Gc),
    try
        Want5 = fetch(C4, Nfd),
        Want5 = fetch(C5, Nfd)
    catch
        ClassC:{badmatch, GotC} = ReasonC ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",
                      [LineNo, Text, C1, C1]),
            io:format("Expected: ~ts ~w~n", [Want5, Want5]),
            io:format("Got: ~ts ~w~n", [GotC, GotC]),
            erlang:raise(ClassC, ReasonC, erlang:get_stacktrace());
        ClassD:ReasonD ->
            io:format("~p: ~ts => |~tp|~n", [LineNo, Text, C1]),
            io:format("Expected: ~p~n", [C5]),
            erlang:raise(ClassD, ReasonD, erlang:get_stacktrace())
    end,
    ok.

%% Runs the NFC conformance check over NormalizationTest.txt.
nfc(Config) ->
    File = proplists:get_value(data_dir, Config) ++ "/NormalizationTest.txt",
    ok = fold(fun verify_nfc/3, 0, File),
    ok.
%% fold/3 callback checking NFC for one line of NormalizationTest.txt.
%%
%% "@Part" header lines pass without checks.  For any other line the text
%% before '#' is split on ';' into columns of hex code points (C1..C5) and
%% the following must hold, comparing fetch/2 results:
%%   gc(C2) == nfc(C1) == nfc(C2) == nfc(C3)
%%   gc(C4) == nfc(C4) == nfc(C5)
%% A failure is printed with io:format/2 and the exception is re-raised.
%% Returns ok.
verify_nfc(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfc(Data0, LineNo, _Acc) ->
    Data1 = unicode:characters_to_list(Data0),
    [Data2|_Comments] = string:tokens(Data1, "#"),
    Columns = string:tokens(Data2, ";"),
    [C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
                             C <- Columns],
    C2GC = fetch(C2, fun unicode_util:gc/1),
    try
        C2GC = fetch(C1, fun unicode_util:nfc/1),
        C2GC = fetch(C2, fun unicode_util:nfc/1),
        C2GC = fetch(C3, fun unicode_util:nfc/1)
    catch _Cl:{badmatch, Other} = _R ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
            io:format("Expected: ~ts ~w~n", [C2GC, C2GC]),
            io:format("Got: ~ts ~w~n", [Other, Other]),
            erlang:raise(_Cl,_R,erlang:get_stacktrace());
          Cl:R ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            %% Report C2: it is the column these inputs are checked against.
            io:format("Expected: ~p~n", [C2]),
            erlang:raise(Cl,R,erlang:get_stacktrace())
    end,
    C4GC = fetch(C4, fun unicode_util:gc/1),
    try
        C4GC = fetch(C4, fun unicode_util:nfc/1),
        C4GC = fetch(C5, fun unicode_util:nfc/1)
    catch _Cl2:{badmatch, Other2} = _R2 ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C1, C1]),
            io:format("Expected: ~ts ~w~n", [C4GC, C4GC]),
            io:format("Got: ~ts ~w~n", [Other2, Other2]),
            erlang:raise(_Cl2,_R2,erlang:get_stacktrace());
          Cl2:R2 ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            %% Report C4: it is the column these inputs are checked against.
            io:format("Expected: ~p~n", [C4]),
            erlang:raise(Cl2,R2,erlang:get_stacktrace())
    end,
    ok.

%% Runs the NFKD conformance check over NormalizationTest.txt.
nfkd(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    ok = fold(fun verify_nfkd/3, 0, DataDir ++ "/NormalizationTest.txt"),
    ok.
%% fold/3 callback checking NFKD for one line of NormalizationTest.txt.
%%
%% "@Part" header lines pass without checks.  For any other line the text
%% before '#' is split on ';' into columns of hex code points (C1..C5) and
%% nfkd/1 applied to each of the five columns must give the same code
%% points as column 5.  Both sides are produced with fetch/2 and flattened
%% before the comparison, so only the code point sequence is compared.
%% A failure is printed with io:format/2 and the exception is re-raised.
%% Returns ok.
verify_nfkd(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfkd(Data0, LineNo, _Acc) ->
    Data1 = unicode:characters_to_list(Data0),
    [Data2|_Comments] = string:tokens(Data1, "#"),
    Columns = string:tokens(Data2, ";"),
    [C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
                             C <- Columns],
    C5GC = lists:flatten(fetch(C5, fun unicode_util:gc/1)),
    try
        C5GC = lists:flatten(fetch(C1, fun unicode_util:nfkd/1)),
        C5GC = lists:flatten(fetch(C2, fun unicode_util:nfkd/1)),
        C5GC = lists:flatten(fetch(C3, fun unicode_util:nfkd/1)),
        C5GC = lists:flatten(fetch(C4, fun unicode_util:nfkd/1)),
        C5GC = lists:flatten(fetch(C5, fun unicode_util:nfkd/1))
    catch _Cl:{badmatch, Other} = _R ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C5, C5]),
            io:format("Expected: ~ts ~w~n", [C5GC, C5GC]),
            io:format("Got: ~ts ~w~n", [Other, Other]),
            erlang:raise(_Cl,_R,erlang:get_stacktrace());
          Cl:R ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            %% Report C5: it is the column all inputs are checked against.
            io:format("Expected: ~p~n", [C5]),
            erlang:raise(Cl,R,erlang:get_stacktrace())
    end,
    ok.

%% Runs the NFKC conformance check over NormalizationTest.txt.
nfkc(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    ok = fold(fun verify_nfkc/3, 0, DataDir ++ "/NormalizationTest.txt"),
    ok.
%% fold/3 callback checking NFKC for one line of NormalizationTest.txt.
%%
%% "@Part" header lines pass without checks.  For any other line the text
%% before '#' is split on ';' into columns of hex code points (C1..C5) and
%% nfkc/1 applied to each of the five columns must give the same code
%% points as column 4.  Both sides are produced with fetch/2 and flattened
%% before the comparison, so only the code point sequence is compared.
%% A failure is printed with io:format/2 and the exception is re-raised.
%% Returns ok.
verify_nfkc(<<"@Part", _/binary>>, _LineNo, _Acc) -> ok;
verify_nfkc(Data0, LineNo, _Acc) ->
    Data1 = unicode:characters_to_list(Data0),
    [Data2|_Comments] = string:tokens(Data1, "#"),
    Columns = string:tokens(Data2, ";"),
    [C1,C2,C3,C4,C5|_] = [[hex_to_int(CP) || CP <- string:tokens(C, " ")] ||
                             C <- Columns],
    C4GC = lists:flatten(fetch(C4, fun unicode_util:gc/1)),
    try
        C4GC = lists:flatten(fetch(C1, fun unicode_util:nfkc/1)),
        C4GC = lists:flatten(fetch(C2, fun unicode_util:nfkc/1)),
        C4GC = lists:flatten(fetch(C3, fun unicode_util:nfkc/1)),
        C4GC = lists:flatten(fetch(C4, fun unicode_util:nfkc/1)),
        C4GC = lists:flatten(fetch(C5, fun unicode_util:nfkc/1))
    catch _Cl:{badmatch, Other} = _R ->
            io:format("Failed: ~p~nInput: ~ts~n\t=> ~w |~ts|~n",[LineNo, Data1, C4, C4]),
            io:format("Expected: ~ts ~w~n", [C4GC, C4GC]),
            io:format("Got: ~ts ~w~n", [Other, Other]),
            erlang:raise(_Cl,_R,erlang:get_stacktrace());
          Cl:R ->
            io:format("~p: ~ts => |~tp|~n",[LineNo, Data1, C1]),
            %% Report C4: it is the column all inputs are checked against.
            io:format("Expected: ~p~n", [C4]),
            erlang:raise(Cl,R,erlang:get_stacktrace())
    end,
    ok.

%% Placeholder test case; it only returns an atom noting that real tests
%% are still to be added.
get(_) ->
    add_get_tests.

%% Timing measurements.  Sets a five minute timetrap, then skips the case
%% when ct:get_timetrap_info/0 reports a {_, {_, Scale}} tuple and runs
%% do_measure/1 otherwise.
count(Config) ->
    ct:timetrap({minutes,5}),
    case ct:get_timetrap_info() of
        {_,{_,Scale}} ->
            {skip,{measurments_skipped_debug,Scale}};
        _ -> % No scaling
            do_measure(Config)
    end.

%% Reads NormalizationTest.txt into a binary and prints, per stepping
%% function and input mode, the element count, mean and standard deviation
%% (converted from microseconds to milliseconds) and the number of runs.
%% lref/bref are reference loops that do not call unicode_util.  Finally
%% prints the loaded size of unicode_util in kB.
do_measure(Config) ->
    DataDir = proplists:get_value(data_dir, Config),
    File = DataDir ++ "/NormalizationTest.txt",
    {ok, Bin} = file:read_file(File),
    Do = fun(Func, Mode) ->
                 {N, Mean, Stddev, Res} = time_count(Func, Mode, Bin),
                 io:format("~4w ~6w ~.10w ~.6wms ±~.2wms #~.2w~n",
                           [Func, Mode, Res, Mean div 1000, Stddev div 1000, N])
         end,
    Do(lref, list),
    Do(bref, binary),
    io:format("----------------------~n"),
    [Do(What,Mode) || What <- [cp, gc, nfd, nfc, nfkd, nfkc], Mode <- [list, deep_l, binary, deep_b]],
    io:format("Size of unicode_util: ~pkB~n",[uc_loaded_size() div 1024]),
    ok.
%% Size reported for unicode_util in erlang:system_info(loaded), which is
%% scanned line by line.
uc_loaded_size() ->
    Listing = erlang:system_info(loaded),
    uc_loaded_size(binary:split(Listing, <<$\n>>, [global])).

%% Finds the line starting with "unicode_util " and returns the integer
%% that follows the name (up to the next space).
uc_loaded_size([<<"unicode_util ", Tail/binary>>|_]) ->
    [Size|_] = binary:split(Tail, <<$\s>>),
    binary_to_integer(Size);
uc_loaded_size([_Other|Lines]) ->
    uc_loaded_size(Lines).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Runs the measurement for Fun on Bin converted to Mode in a linked
%% helper process and returns its {N, Mean, Stddev, Result} message.
time_count(Fun, Mode, Bin) ->
    timer:sleep(100), %% Let emulator catch up and clean things before test runs
    Parent = self(),
    Worker = spawn_link(fun() ->
                                Input = mode(Mode, Bin),
                                Stats = do_count(0, 0, 0, Fun, Input, undefined),
                                Parent ! {self(), Stats}
                        end),
    receive
        {Worker, Result} -> Result
    end.

%% Repeats the timed run until ten samples are collected, accumulating the
%% sum and the sum of squares of the times, then returns
%% {Samples, Mean, StandardDeviation, LastResult}.
do_count(N, Sum, SumSq, Fun, Str, _Prev) when N < 10 ->
    {Time, Res} = do_count(Fun, Str),
    do_count(N + 1, Sum + Time, SumSq + Time*Time, Fun, Str, Res);
do_count(N, Sum, SumSq, _Fun, _Str, Res) ->
    Variance = (SumSq - (Sum*Sum/N))/(N - 1),
    {N, round(Sum / N), round(math:sqrt(Variance)), Res}.

%% One timed run: steps through Str until the stepping function returns []
%% and counts the steps.  Returns timer:tc/1's {Time, Count}.
%% lref and bref step with this module's id/1 and bin_split/1, reached
%% through a dynamic ?MODULE call; any other Fun names a unicode_util
%% function.
do_count(Fun, Str) when Fun =:= lref; Fun =:= bref ->
    Pick = case Fun of
               lref -> id(id);
               bref -> id(bin_split)
           end,
    Walk = fun Walk(Rest0, Seen) ->
                   case ?MODULE:Pick(Rest0) of
                       [] -> Seen;
                       [_|Rest1] -> Walk(Rest1, Seen + 1)
                   end
           end,
    timer:tc(fun() -> Walk(Str, 0) end);
do_count(Fun, Str) ->
    Walk = fun Walk(Rest0, Seen) ->
                   case unicode_util:Fun(Rest0) of
                       [] -> Seen;
                       [_|Rest1] -> Walk(Rest1, Seen + 1)
                   end
           end,
    timer:tc(fun() -> Walk(Str, 0) end).

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Identity function.
id(Term) -> Term.

%% Splits the first UTF-8 code point off a binary, returning [CP|Rest]
%% with the remaining binary as an improper tail, or [] for an empty binary.
bin_split(<<CP/utf8, Rest/binary>>) -> [CP|Rest];
bin_split(<<>>) -> [].

%% Converts the test binary to the requested input shape.
mode(list, Bin) -> unicode:characters_to_list(Bin);
mode(binary, Bin) -> Bin;
mode(deep_l, Bin) -> [unicode:characters_to_list(Bin)];
mode(deep_b, Bin) -> [Bin].

%% Applies the stepping fun F repeatedly, collecting each head it returns
%% until it returns [].
fetch(Str, F) ->
    fetch_acc(Str, F, []).

fetch_acc(Str, F, Acc) ->
    case F(Str) of
        [] -> lists:reverse(Acc);
        [Head|Rest] -> fetch_acc(Rest, F, [Head|Acc])
    end.

%% *Test.txt file helpers

%% Parses a hexadecimal string, ignoring surrounding blanks.  An empty
%% string is returned as [].
hex_to_int([]) ->
    [];
hex_to_int(HexStr) ->
    Trimmed = string:strip(HexStr, both),
    list_to_integer(Trimmed, 16).
%% Folds Fun over the data lines of File.
%%
%% Fun is called as Fun(LineBinary, LineNumber, Acc) for every line that
%% is neither a comment (starting with "#") nor a lone newline; line
%% numbers start at 1 and also count the skipped lines.  Returns the final
%% accumulator.  The file is closed even if Fun raises.
fold(Fun, Acc, File) ->
    %% ~ts rather than ~s: ~s raises badarg when the path contains code
    %% points above 255.
    io:format("Processing ~ts~n",[File]),
    {ok, Fd} = file:open(File, [read, raw, binary, {read_ahead, 100000}]),
    Get = fun() -> file:read_line(Fd) end,
    try
        fold_1(Fun, 1, Acc, Get)
    after
        ok = file:close(Fd)
    end.

%% Reads lines through Get until eof, threading the accumulator.
fold_1(Fun, Line, Acc, Get) ->
    case Get() of
        eof -> Acc;
        {ok, <<"#",_/binary>>} -> %% Ignore comments
            fold_1(Fun, Line+1, Acc, Get);
        {ok, <<"\n">>} -> %% Ignore empty lines
            fold_1(Fun, Line+1, Acc, Get);
        {ok, Data} ->
            fold_1(Fun, Line+1, Fun(Data, Line, Acc), Get)
    end.