aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDan Gudmundsson <[email protected]>2018-05-14 15:06:29 +0200
committerDan Gudmundsson <[email protected]>2018-05-14 15:09:05 +0200
commit5c51e87bee9d8db0503f41eda4ab0589df1a2ec1 (patch)
tree828f129be32eb359506a41b48ca34a6f8513e494
parentb873e5cd6d78191d303c7c828ab7df7cd7fdf4d2 (diff)
downloadotp-5c51e87bee9d8db0503f41eda4ab0589df1a2ec1.tar.gz
otp-5c51e87bee9d8db0503f41eda4ab0589df1a2ec1.tar.bz2
otp-5c51e87bee9d8db0503f41eda4ab0589df1a2ec1.zip
Optimize string lowercase, uppercase and casefold for ASCII characters
-rw-r--r--lib/stdlib/src/string.erl180
-rw-r--r--lib/stdlib/test/string_SUITE.erl12
2 files changed, 150 insertions, 42 deletions
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index 0736374f21..f5d271c06d 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -323,16 +323,30 @@ take(Str, Sep0, true, trailing) ->
%% Uppercase all chars in Str
-spec uppercase(String::unicode:chardata()) -> unicode:chardata().
uppercase(CD) when is_list(CD) ->
- uppercase_list(CD);
-uppercase(CD) when is_binary(CD) ->
- uppercase_bin(CD,<<>>).
+ try uppercase_list(CD, false)
+ catch unchanged -> CD
+ end;
+uppercase(<<CP1/utf8, Rest/binary>>=Orig) ->
+ try uppercase_bin(CP1, Rest, false) of
+ List -> unicode:characters_to_binary(List)
+ catch unchanged -> Orig
+ end;
+uppercase(<<>>) ->
+ <<>>.
%% Lowercase all chars in Str
-spec lowercase(String::unicode:chardata()) -> unicode:chardata().
lowercase(CD) when is_list(CD) ->
- lowercase_list(CD);
-lowercase(CD) when is_binary(CD) ->
- lowercase_bin(CD,<<>>).
+ try lowercase_list(CD, false)
+ catch unchanged -> CD
+ end;
+lowercase(<<CP1/utf8, Rest/binary>>=Orig) ->
+ try lowercase_bin(CP1, Rest, false) of
+ List -> unicode:characters_to_binary(List)
+ catch unchanged -> Orig
+ end;
+lowercase(<<>>) ->
+ <<>>.
%% Make a titlecase of the first char in Str
-spec titlecase(String::unicode:chardata()) -> unicode:chardata().
@@ -352,9 +366,16 @@ titlecase(CD) when is_binary(CD) ->
%% Make a comparable string of the Str should be used for equality tests only
-spec casefold(String::unicode:chardata()) -> unicode:chardata().
casefold(CD) when is_list(CD) ->
- casefold_list(CD);
-casefold(CD) when is_binary(CD) ->
- casefold_bin(CD,<<>>).
+ try casefold_list(CD, false)
+ catch unchanged -> CD
+ end;
+casefold(<<CP1/utf8, Rest/binary>>=Orig) ->
+ try casefold_bin(CP1, Rest, false) of
+ List -> unicode:characters_to_binary(List)
+ catch unchanged -> Orig
+ end;
+casefold(<<>>) ->
+ <<>>.
-spec to_integer(String) -> {Int, Rest} | {'error', Reason} when
String :: unicode:chardata(),
@@ -652,52 +673,127 @@ slice_bin(CD, CP1, N) when N > 0 ->
slice_bin(CD, CP1, 0) ->
byte_size(CD)+byte_size(<<CP1/utf8>>).
-uppercase_list(CPs0) ->
+uppercase_list([CP1|[CP2|_]=Cont], _Changed) when $a =< CP1, CP1 =< $z, CP2 < 256 ->
+ [CP1-32|uppercase_list(Cont, true)];
+uppercase_list([CP1|[CP2|_]=Cont], Changed) when CP1 < 128, CP2 < 256 ->
+ [CP1|uppercase_list(Cont, Changed)];
+uppercase_list([], true) ->
+ [];
+uppercase_list([], false) ->
+ throw(unchanged);
+uppercase_list(CPs0, Changed) ->
case unicode_util:uppercase(CPs0) of
- [Char|CPs] -> append(Char,uppercase_list(CPs));
- [] -> []
+ [Char|CPs] when Char =:= hd(CPs0) -> [Char|uppercase_list(CPs, Changed)];
+ [Char|CPs] -> append(Char,uppercase_list(CPs, true));
+ [] -> uppercase_list([], Changed)
end.
-uppercase_bin(CPs0, Acc) ->
- case unicode_util:uppercase(CPs0) of
- [Char|CPs] when is_integer(Char) ->
- uppercase_bin(CPs, <<Acc/binary, Char/utf8>>);
- [Chars|CPs] ->
- uppercase_bin(CPs, <<Acc/binary,
- << <<CP/utf8>> || CP <- Chars>>/binary >>);
- [] -> Acc
+uppercase_bin(CP1, <<CP2/utf8, Bin/binary>>, _Changed)
+ when $a =< CP1, CP1 =< $z, CP2 < 256 ->
+ [CP1-32|uppercase_bin(CP2, Bin, true)];
+uppercase_bin(CP1, <<CP2/utf8, Bin/binary>>, _Changed)
+ when CP1 < 128, CP2 < 256 ->
+ [CP1|uppercase_bin(CP2, Bin, false)];
+uppercase_bin(CP1, Bin, Changed) ->
+ case unicode_util:uppercase([CP1|Bin]) of
+ [CP1|CPs] ->
+ case unicode_util:cp(CPs) of
+ [Next|Rest] ->
+ [CP1|uppercase_bin(Next, Rest, Changed)];
+ [] when Changed ->
+ [CP1];
+ [] ->
+ throw(unchanged)
+ end;
+ [Char|CPs] ->
+ case unicode_util:cp(CPs) of
+ [Next|Rest] ->
+ [Char|uppercase_bin(Next, Rest, true)];
+ [] ->
+ [Char]
+ end
end.
-lowercase_list(CPs0) ->
+lowercase_list([CP1|[CP2|_]=Cont], _Changed) when $A =< CP1, CP1 =< $Z, CP2 < 256 ->
+ [CP1+32|lowercase_list(Cont, true)];
+lowercase_list([CP1|[CP2|_]=Cont], Changed) when CP1 < 128, CP2 < 256 ->
+ [CP1|lowercase_list(Cont, Changed)];
+lowercase_list([], true) ->
+ [];
+lowercase_list([], false) ->
+ throw(unchanged);
+lowercase_list(CPs0, Changed) ->
case unicode_util:lowercase(CPs0) of
- [Char|CPs] -> append(Char,lowercase_list(CPs));
- [] -> []
+ [Char|CPs] when Char =:= hd(CPs0) -> [Char|lowercase_list(CPs, Changed)];
+ [Char|CPs] -> append(Char,lowercase_list(CPs, true));
+ [] -> lowercase_list([], Changed)
end.
-lowercase_bin(CPs0, Acc) ->
- case unicode_util:lowercase(CPs0) of
- [Char|CPs] when is_integer(Char) ->
- lowercase_bin(CPs, <<Acc/binary, Char/utf8>>);
- [Chars|CPs] ->
- lowercase_bin(CPs, <<Acc/binary,
- << <<CP/utf8>> || CP <- Chars>>/binary >>);
- [] -> Acc
+lowercase_bin(CP1, <<CP2/utf8, Bin/binary>>, _Changed)
+ when $A =< CP1, CP1 =< $Z, CP2 < 256 ->
+ [CP1+32|lowercase_bin(CP2, Bin, true)];
+lowercase_bin(CP1, <<CP2/utf8, Bin/binary>>, _Changed)
+ when CP1 < 128, CP2 < 256 ->
+ [CP1|lowercase_bin(CP2, Bin, false)];
+lowercase_bin(CP1, Bin, Changed) ->
+ case unicode_util:lowercase([CP1|Bin]) of
+ [CP1|CPs] ->
+ case unicode_util:cp(CPs) of
+ [Next|Rest] ->
+ [CP1|lowercase_bin(Next, Rest, Changed)];
+ [] when Changed ->
+ [CP1];
+ [] ->
+ throw(unchanged)
+ end;
+ [Char|CPs] ->
+ case unicode_util:cp(CPs) of
+ [Next|Rest] ->
+ [Char|lowercase_bin(Next, Rest, true)];
+ [] ->
+ [Char]
+ end
end.
-casefold_list(CPs0) ->
+casefold_list([CP1|[CP2|_]=Cont], _Changed) when $A =< CP1, CP1 =< $Z, CP2 < 256 ->
+ [CP1+32|casefold_list(Cont, true)];
+casefold_list([CP1|[CP2|_]=Cont], Changed) when CP1 < 128, CP2 < 256 ->
+ [CP1|casefold_list(Cont, Changed)];
+casefold_list([], true) ->
+ [];
+casefold_list([], false) ->
+ throw(unchanged);
+casefold_list(CPs0, Changed) ->
case unicode_util:casefold(CPs0) of
- [Char|CPs] -> append(Char, casefold_list(CPs));
- [] -> []
+ [Char|CPs] when Char =:= hd(CPs0) -> [Char|casefold_list(CPs, Changed)];
+ [Char|CPs] -> append(Char,casefold_list(CPs, true));
+ [] -> casefold_list([], Changed)
end.
-casefold_bin(CPs0, Acc) ->
- case unicode_util:casefold(CPs0) of
- [Char|CPs] when is_integer(Char) ->
- casefold_bin(CPs, <<Acc/binary, Char/utf8>>);
- [Chars|CPs] ->
- casefold_bin(CPs, <<Acc/binary,
- << <<CP/utf8>> || CP <- Chars>>/binary >>);
- [] -> Acc
+casefold_bin(CP1, <<CP2/utf8, Bin/binary>>, _Changed)
+ when $A =< CP1, CP1 =< $Z, CP2 < 256 ->
+ [CP1+32|casefold_bin(CP2, Bin, true)];
+casefold_bin(CP1, <<CP2/utf8, Bin/binary>>, _Changed)
+ when CP1 < 128, CP2 < 256 ->
+ [CP1|casefold_bin(CP2, Bin, false)];
+casefold_bin(CP1, Bin, Changed) ->
+ case unicode_util:casefold([CP1|Bin]) of
+ [CP1|CPs] ->
+ case unicode_util:cp(CPs) of
+ [Next|Rest] ->
+ [CP1|casefold_bin(Next, Rest, Changed)];
+ [] when Changed ->
+ [CP1];
+ [] ->
+ throw(unchanged)
+ end;
+ [Char|CPs] ->
+ case unicode_util:cp(CPs) of
+ [Next|Rest] ->
+ [Char|casefold_bin(Next, Rest, true)];
+ [] ->
+ [Char]
+ end
end.
%% Fast path for ascii searching for one character in lists
diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl
index fdff2d24b8..29fabb4583 100644
--- a/lib/stdlib/test/string_SUITE.erl
+++ b/lib/stdlib/test/string_SUITE.erl
@@ -810,6 +810,18 @@ do_measure(DataDir) ->
Do2(slice, repeat(fun() -> string:slice(S0, 20, 15) end), list),
Do2(slice, repeat(fun() -> string:slice(S0B, 20, 15) end), binary),
+ LCase = "areaa reare rerar earea reare reare",
+ LCaseB = unicode:characters_to_binary(LCase),
+ UCase = string:uppercase(LCase),
+ UCaseB = unicode:characters_to_binary(UCase),
+
+ Do2(to_upper_0, repeat(fun() -> string:to_upper(UCase) end), list),
+ Do2(uppercase_0, repeat(fun() -> string:uppercase(UCase) end), list),
+ Do2(uppercase_0, repeat(fun() -> string:uppercase(UCaseB) end), binary),
+ Do2(to_upper_a, repeat(fun() -> string:to_upper(LCase) end), list),
+ Do2(uppercase_a, repeat(fun() -> string:uppercase(LCase) end), list),
+ Do2(uppercase_a, repeat(fun() -> string:uppercase(LCaseB) end), binary),
+
io:format("--~n",[]),
NthTokens = {nth_lexemes, fun(Str) -> string:nth_lexeme(Str, 18000, [$\n,$\r]) end},
[Do(Name,Fun,Mode) || {Name,Fun} <- [NthTokens], Mode <- [list, binary]],