From cce2d5ab396ede0627a2a83f71fa6651de459faf Mon Sep 17 00:00:00 2001 From: Hans Bolinder Date: Mon, 4 Mar 2019 12:07:18 +0100 Subject: stdlib: Optimize handling of Unicode in the string module The unicode_util:cp() function handles deep lists faster by returning the rest of the input more balanced to the right than before. --- lib/stdlib/test/string_SUITE.erl | 32 ++++++++++++++++++++++-------- lib/stdlib/uc_spec/gen_unicode_mod.escript | 31 +++++++++++++++++++---------- 2 files changed, 45 insertions(+), 18 deletions(-) mode change 100755 => 100644 lib/stdlib/uc_spec/gen_unicode_mod.escript (limited to 'lib/stdlib') diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl index 251e09121c..fe01900c55 100644 --- a/lib/stdlib/test/string_SUITE.erl +++ b/lib/stdlib/test/string_SUITE.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2004-2018. All Rights Reserved. +%% Copyright Ericsson AB 2004-2019. All Rights Reserved. %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. @@ -754,19 +754,22 @@ do_measure(DataDir) -> io:format("~p~n",[byte_size(Bin)]), Do = fun(Name, Func, Mode) -> {N, Mean, Stddev, _} = time_func(Func, Mode, Bin, 20), - io:format("~15w ~6w ~6.2fms ±~5.2fms #~.2w gc included~n", + io:format("~15w ~11w ~8.2fms ±~6.2fms #~.2w gc included~n", [Name, Mode, Mean/1000, Stddev/1000, N]) end, Do2 = fun(Name, Func, Mode) -> {N, Mean, Stddev, _} = time_func(Func, binary, <<>>, 20), - io:format("~15w ~6w ~6.2fms ±~5.2fms #~.2w gc included~n", + io:format("~15w ~11w ~8.2fms ±~6.2fms #~.2w gc included~n", [Name, Mode, Mean/1000, Stddev/1000, N]) end, + %% lefty_list means a list balanced to the left, like + %% [[[30],31],32]. Only some functions check such lists. + Modes = [list, lefty_list, binary], io:format("----------------------~n"), Do(old_tokens, fun(Str) -> string:tokens(Str, [$\n,$\r]) end, list), Tokens = {lexemes, fun(Str) -> string:lexemes(Str, [$\n,$\r]) end}, - [Do(Name,Fun,Mode) || {Name,Fun} <- [Tokens], Mode <- [list, binary]], + [Do(Name,Fun,Mode) || {Name,Fun} <- [Tokens], Mode <- Modes], S0 = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy.....", S0B = <<"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxy.....">>, @@ -824,17 +827,17 @@ do_measure(DataDir) -> io:format("--~n",[]), NthTokens = {nth_lexemes, fun(Str) -> string:nth_lexeme(Str, 18000, [$\n,$\r]) end}, - [Do(Name,Fun,Mode) || {Name,Fun} <- [NthTokens], Mode <- [list, binary]], + [Do(Name,Fun,Mode) || {Name,Fun} <- [NthTokens], Mode <- Modes], Do2(take_t, repeat(fun() -> string:take(S0, [$.,$y], false, trailing) end), list), Do2(take_t, repeat(fun() -> string:take(S0B, [$.,$y], false, trailing) end), binary), Do2(take_tc, repeat(fun() -> string:take(S0, [$x], true, trailing) end), list), Do2(take_tc, repeat(fun() -> string:take(S0B, [$x], true, trailing) end), binary), Length = {length, fun(Str) -> string:length(Str) end}, - [Do(Name,Fun,Mode) || {Name,Fun} <- [Length], Mode <- [list, binary]], + [Do(Name,Fun,Mode) || {Name,Fun} <- [Length], Mode <- Modes], Reverse = {reverse, fun(Str) -> string:reverse(Str) end}, - [Do(Name,Fun,Mode) || {Name,Fun} <- [Reverse], Mode <- [list, binary]], + [Do(Name,Fun,Mode) || {Name,Fun} <- [Reverse], Mode <- Modes], ok. @@ -1064,7 +1067,20 @@ time_func(N,Sum,SumSq, _, _, Res, _) -> {N, Mean, Stdev, Res}. mode(binary, Bin) -> Bin; -mode(list, Bin) -> unicode:characters_to_list(Bin). +mode(list, Bin) -> unicode:characters_to_list(Bin); +mode(lefty_list, Bin) -> + L = unicode:characters_to_list(Bin), + to_left(L). + +to_left([]) -> + []; +to_left([H|L]) -> + to_left([H], L). + +to_left(V, []) -> + V; +to_left(V, [H|L]) -> + to_left([V,H], L). %% %% Old string lists Test cases starts here. diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript old mode 100755 new mode 100644 index 70eec1a6f2..780353af58 --- a/lib/stdlib/uc_spec/gen_unicode_mod.escript +++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript @@ -4,7 +4,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2017. All Rights Reserved. +%% Copyright Ericsson AB 2017-2019. All Rights Reserved. %% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. @@ -461,16 +461,27 @@ gen_cp(Fd) -> io:put_chars(Fd, "cp([C|_]=L) when is_integer(C) -> L;\n"), io:put_chars(Fd, "cp([List]) -> cp(List);\n"), io:put_chars(Fd, "cp([List|R]) ->\n"), - io:put_chars(Fd, " case cp(List) of\n"), - io:put_chars(Fd, " [] -> cp(R);\n"), - io:put_chars(Fd, " [CP] -> [CP|R];\n"), - io:put_chars(Fd, " [C|R0] -> [C|[R0|R]];\n"), - io:put_chars(Fd, " {error,Error} -> {error,[Error|R]}\n"), - io:put_chars(Fd, " end;\n"), + io:put_chars(Fd, " cpl(List, R);\n"), io:put_chars(Fd, "cp([]) -> [];\n"), io:put_chars(Fd, "cp(<>) -> [C|R];\n"), io:put_chars(Fd, "cp(<<>>) -> [];\n"), - io:put_chars(Fd, "cp(<>) -> {error,R}.\n\n"), + io:put_chars(Fd, "cp(<>) -> {error,R}.\n"), + io:put_chars(Fd, "\n"), + io:put_chars(Fd, "cpl([C], R) when is_integer(C) ->\n"), + io:put_chars(Fd, " [C|R];\n"), + io:put_chars(Fd, "cpl([C|T], R) when is_integer(C) ->\n"), + io:put_chars(Fd, " [C,T|R];\n"), + io:put_chars(Fd, "cpl([List], R) ->\n"), + io:put_chars(Fd, " cpl(List, R);\n"), + io:put_chars(Fd, "cpl([List|T], R) ->\n"), + io:put_chars(Fd, " cpl(List, [T|R]);\n"), + io:put_chars(Fd, "cpl([], R) ->\n"), + io:put_chars(Fd, " cp(R);\n"), + io:put_chars(Fd, "cpl(<>, R) ->\n"), + io:put_chars(Fd, " [C,T|R];\n"), + io:put_chars(Fd, "cpl(<<>>, R) ->\n"), + io:put_chars(Fd, " cp(R);\n"), + io:put_chars(Fd, "cpl(<>, R) -> {error,[B|R]}.\n\n"), ok. %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @@ -482,10 +493,10 @@ gen_gc(Fd, GBP) -> " maybe_improper_list() | {error, unicode:chardata()}.\n"), io:put_chars(Fd, "gc([CP1, CP2|_]=T)\n" - " when CP1 < 256, CP2 < 256, CP1 =/= $\r -> %% Ascii Fast path\n" + " when CP1 < 256, CP2 < 256, CP1 =/= $\\r -> %% Ascii Fast path\n" " T;\n" "gc(<>) ->\n" - " if CP1 < 256, CP1 =/= $\r ->\n" + " if CP1 < 256, CP1 =/= $\\r ->\n" " case Rest of\n" " <> when CP2 < 256 -> %% Ascii Fast path\n" " [CP1|Rest];\n" -- cgit v1.2.3