From 5c51e87bee9d8db0503f41eda4ab0589df1a2ec1 Mon Sep 17 00:00:00 2001 From: Dan Gudmundsson Date: Mon, 14 May 2018 15:06:29 +0200 Subject: Optimize string lowercase, uppercase and casefold for ASCII characters --- lib/stdlib/src/string.erl | 180 ++++++++++++++++++++++++++++++--------- lib/stdlib/test/string_SUITE.erl | 12 +++ 2 files changed, 150 insertions(+), 42 deletions(-) (limited to 'lib') diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl index 0736374f21..f5d271c06d 100644 --- a/lib/stdlib/src/string.erl +++ b/lib/stdlib/src/string.erl @@ -323,16 +323,30 @@ take(Str, Sep0, true, trailing) -> %% Uppercase all chars in Str -spec uppercase(String::unicode:chardata()) -> unicode:chardata(). uppercase(CD) when is_list(CD) -> - uppercase_list(CD); -uppercase(CD) when is_binary(CD) -> - uppercase_bin(CD,<<>>). + try uppercase_list(CD, false) + catch unchanged -> CD + end; +uppercase(<>=Orig) -> + try uppercase_bin(CP1, Rest, false) of + List -> unicode:characters_to_binary(List) + catch unchanged -> Orig + end; +uppercase(<<>>) -> + <<>>. %% Lowercase all chars in Str -spec lowercase(String::unicode:chardata()) -> unicode:chardata(). lowercase(CD) when is_list(CD) -> - lowercase_list(CD); -lowercase(CD) when is_binary(CD) -> - lowercase_bin(CD,<<>>). + try lowercase_list(CD, false) + catch unchanged -> CD + end; +lowercase(<>=Orig) -> + try lowercase_bin(CP1, Rest, false) of + List -> unicode:characters_to_binary(List) + catch unchanged -> Orig + end; +lowercase(<<>>) -> + <<>>. %% Make a titlecase of the first char in Str -spec titlecase(String::unicode:chardata()) -> unicode:chardata(). @@ -352,9 +366,16 @@ titlecase(CD) when is_binary(CD) -> %% Make a comparable string of the Str should be used for equality tests only -spec casefold(String::unicode:chardata()) -> unicode:chardata(). casefold(CD) when is_list(CD) -> - casefold_list(CD); -casefold(CD) when is_binary(CD) -> - casefold_bin(CD,<<>>). + try casefold_list(CD, false) + catch unchanged -> CD + end; +casefold(<>=Orig) -> + try casefold_bin(CP1, Rest, false) of + List -> unicode:characters_to_binary(List) + catch unchanged -> Orig + end; +casefold(<<>>) -> + <<>>. -spec to_integer(String) -> {Int, Rest} | {'error', Reason} when String :: unicode:chardata(), @@ -652,52 +673,127 @@ slice_bin(CD, CP1, N) when N > 0 -> slice_bin(CD, CP1, 0) -> byte_size(CD)+byte_size(<>). -uppercase_list(CPs0) -> +uppercase_list([CP1|[CP2|_]=Cont], _Changed) when $a =< CP1, CP1 =< $z, CP2 < 256 -> + [CP1-32|uppercase_list(Cont, true)]; +uppercase_list([CP1|[CP2|_]=Cont], Changed) when CP1 < 128, CP2 < 256 -> + [CP1|uppercase_list(Cont, Changed)]; +uppercase_list([], true) -> + []; +uppercase_list([], false) -> + throw(unchanged); +uppercase_list(CPs0, Changed) -> case unicode_util:uppercase(CPs0) of - [Char|CPs] -> append(Char,uppercase_list(CPs)); - [] -> [] + [Char|CPs] when Char =:= hd(CPs0) -> [Char|uppercase_list(CPs, Changed)]; + [Char|CPs] -> append(Char,uppercase_list(CPs, true)); + [] -> uppercase_list([], Changed) end. -uppercase_bin(CPs0, Acc) -> - case unicode_util:uppercase(CPs0) of - [Char|CPs] when is_integer(Char) -> - uppercase_bin(CPs, <>); - [Chars|CPs] -> - uppercase_bin(CPs, <> || CP <- Chars>>/binary >>); - [] -> Acc +uppercase_bin(CP1, <>, _Changed) + when $a =< CP1, CP1 =< $z, CP2 < 256 -> + [CP1-32|uppercase_bin(CP2, Bin, true)]; +uppercase_bin(CP1, <>, _Changed) + when CP1 < 128, CP2 < 256 -> + [CP1|uppercase_bin(CP2, Bin, false)]; +uppercase_bin(CP1, Bin, Changed) -> + case unicode_util:uppercase([CP1|Bin]) of + [CP1|CPs] -> + case unicode_util:cp(CPs) of + [Next|Rest] -> + [CP1|uppercase_bin(Next, Rest, Changed)]; + [] when Changed -> + [CP1]; + [] -> + throw(unchanged) + end; + [Char|CPs] -> + case unicode_util:cp(CPs) of + [Next|Rest] -> + [Char|uppercase_bin(Next, Rest, true)]; + [] -> + [Char] + end end. -lowercase_list(CPs0) -> +lowercase_list([CP1|[CP2|_]=Cont], _Changed) when $A =< CP1, CP1 =< $Z, CP2 < 256 -> + [CP1+32|lowercase_list(Cont, true)]; +lowercase_list([CP1|[CP2|_]=Cont], Changed) when CP1 < 128, CP2 < 256 -> + [CP1|lowercase_list(Cont, Changed)]; +lowercase_list([], true) -> + []; +lowercase_list([], false) -> + throw(unchanged); +lowercase_list(CPs0, Changed) -> case unicode_util:lowercase(CPs0) of - [Char|CPs] -> append(Char,lowercase_list(CPs)); - [] -> [] + [Char|CPs] when Char =:= hd(CPs0) -> [Char|lowercase_list(CPs, Changed)]; + [Char|CPs] -> append(Char,lowercase_list(CPs, true)); + [] -> lowercase_list([], Changed) end. -lowercase_bin(CPs0, Acc) -> - case unicode_util:lowercase(CPs0) of - [Char|CPs] when is_integer(Char) -> - lowercase_bin(CPs, <>); - [Chars|CPs] -> - lowercase_bin(CPs, <> || CP <- Chars>>/binary >>); - [] -> Acc +lowercase_bin(CP1, <>, _Changed) + when $A =< CP1, CP1 =< $Z, CP2 < 256 -> + [CP1+32|lowercase_bin(CP2, Bin, true)]; +lowercase_bin(CP1, <>, _Changed) + when CP1 < 128, CP2 < 256 -> + [CP1|lowercase_bin(CP2, Bin, false)]; +lowercase_bin(CP1, Bin, Changed) -> + case unicode_util:lowercase([CP1|Bin]) of + [CP1|CPs] -> + case unicode_util:cp(CPs) of + [Next|Rest] -> + [CP1|lowercase_bin(Next, Rest, Changed)]; + [] when Changed -> + [CP1]; + [] -> + throw(unchanged) + end; + [Char|CPs] -> + case unicode_util:cp(CPs) of + [Next|Rest] -> + [Char|lowercase_bin(Next, Rest, true)]; + [] -> + [Char] + end end. -casefold_list(CPs0) -> +casefold_list([CP1|[CP2|_]=Cont], _Changed) when $A =< CP1, CP1 =< $Z, CP2 < 256 -> + [CP1+32|casefold_list(Cont, true)]; +casefold_list([CP1|[CP2|_]=Cont], Changed) when CP1 < 128, CP2 < 256 -> + [CP1|casefold_list(Cont, Changed)]; +casefold_list([], true) -> + []; +casefold_list([], false) -> + throw(unchanged); +casefold_list(CPs0, Changed) -> case unicode_util:casefold(CPs0) of - [Char|CPs] -> append(Char, casefold_list(CPs)); - [] -> [] + [Char|CPs] when Char =:= hd(CPs0) -> [Char|casefold_list(CPs, Changed)]; + [Char|CPs] -> append(Char,casefold_list(CPs, true)); + [] -> casefold_list([], Changed) end. -casefold_bin(CPs0, Acc) -> - case unicode_util:casefold(CPs0) of - [Char|CPs] when is_integer(Char) -> - casefold_bin(CPs, <>); - [Chars|CPs] -> - casefold_bin(CPs, <> || CP <- Chars>>/binary >>); - [] -> Acc +casefold_bin(CP1, <>, _Changed) + when $A =< CP1, CP1 =< $Z, CP2 < 256 -> + [CP1+32|casefold_bin(CP2, Bin, true)]; +casefold_bin(CP1, <>, _Changed) + when CP1 < 128, CP2 < 256 -> + [CP1|casefold_bin(CP2, Bin, false)]; +casefold_bin(CP1, Bin, Changed) -> + case unicode_util:casefold([CP1|Bin]) of + [CP1|CPs] -> + case unicode_util:cp(CPs) of + [Next|Rest] -> + [CP1|casefold_bin(Next, Rest, Changed)]; + [] when Changed -> + [CP1]; + [] -> + throw(unchanged) + end; + [Char|CPs] -> + case unicode_util:cp(CPs) of + [Next|Rest] -> + [Char|casefold_bin(Next, Rest, true)]; + [] -> + [Char] + end end. %% Fast path for ascii searching for one character in lists diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl index fdff2d24b8..29fabb4583 100644 --- a/lib/stdlib/test/string_SUITE.erl +++ b/lib/stdlib/test/string_SUITE.erl @@ -810,6 +810,18 @@ do_measure(DataDir) -> Do2(slice, repeat(fun() -> string:slice(S0, 20, 15) end), list), Do2(slice, repeat(fun() -> string:slice(S0B, 20, 15) end), binary), + LCase = "areaa reare rerar earea reare reare", + LCaseB = unicode:characters_to_binary(LCase), + UCase = string:uppercase(LCase), + UCaseB = unicode:characters_to_binary(UCase), + + Do2(to_upper_0, repeat(fun() -> string:to_upper(UCase) end), list), + Do2(uppercase_0, repeat(fun() -> string:uppercase(UCase) end), list), + Do2(uppercase_0, repeat(fun() -> string:uppercase(UCaseB) end), binary), + Do2(to_upper_a, repeat(fun() -> string:to_upper(LCase) end), list), + Do2(uppercase_a, repeat(fun() -> string:uppercase(LCase) end), list), + Do2(uppercase_a, repeat(fun() -> string:uppercase(LCaseB) end), binary), + io:format("--~n",[]), NthTokens = {nth_lexemes, fun(Str) -> string:nth_lexeme(Str, 18000, [$\n,$\r]) end}, [Do(Name,Fun,Mode) || {Name,Fun} <- [NthTokens], Mode <- [list, binary]], -- cgit v1.2.3