1 files changed, 37 insertions, 20 deletions
diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript
index fefd7d3b70..fe5a860d45 100755
--- a/lib/stdlib/uc_spec/gen_unicode_mod.escript
+++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript
@@ -65,7 +65,7 @@ main(_) ->
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 
 parse_unicode_data(Line0, Acc) ->
-    Line = string:strip(Line0, right, $\n),
+    Line = string:chomp(Line0),
     [CodePoint,Name,_Cat,Class,_BiDi,Decomp,
      _N1,_N2,_N3,_BDMirror,_Uni1,_Iso|Case] = tokens(Line, ";"),
     {Dec,Comp} = case to_decomp(Decomp) of
@@ -78,14 +78,14 @@ parse_unicode_data(Line0, Acc) ->
      |Acc].
 
 to_class(String) ->
-    list_to_integer(string:strip(String, both)).
+    list_to_integer(string:trim(String, both)).
 
 to_decomp("") -> [];
 to_decomp("<" ++ Str) ->
-    [Tag,Rest]  = string:tokens(Str, ">"),
+    [Tag,Rest]  = string:lexemes(Str, ">"),
     {list_to_atom(Tag), to_decomp(Rest)};
 to_decomp(CodePoints) ->
-    CPL = string:tokens(CodePoints, " "),
+    CPL = string:lexemes(CodePoints, " "),
     [hex_to_int(CP) || CP <- CPL].
 
 to_case(["","",""]) -> [];
@@ -105,20 +105,20 @@ parse_special_casing(Line, Table) ->
     array:set(CP, Entry#cp{cs=Case}, Table).
 
 to_scase([Lower,Title,Upper|_]) ->
-    {unlist([hex_to_int(CP) || CP <- string:strip(string:tokens(Upper, " "), both)]),
-     unlist([hex_to_int(CP) || CP <- string:strip(string:tokens(Lower, " "), both)]),
-     unlist([hex_to_int(CP) || CP <- string:strip(string:tokens(Title, " "), both)]),
+    {unlist([hex_to_int(CP) || CP <- string:lexemes(Upper, " ")]),
+     unlist([hex_to_int(CP) || CP <- string:lexemes(Lower, " ")]),
+     unlist([hex_to_int(CP) || CP <- string:lexemes(Title, " ")]),
      []}.
 
 parse_case_folding(Line, Table) ->
     [CodePoint, Class0, CaseStr |_Comments] = tokens(Line, ";"),
-    Class = string:strip(Class0, both),
+    Class = string:trim(Class0, both),
     if Class =:= "T" -> Table; %% Do not support localization yet
        Class =:= "S" -> Table; %% Ignore simple
        true ->
             CP = hex_to_int(CodePoint),
             Case = unlist([hex_to_int(CPC) ||
-                              CPC <- string:strip(string:tokens(CaseStr, " "), both)]),
+                              CPC <- string:lexemes(CaseStr, " ")]),
             #cp{cs={U,L,T,_}} = Entry = array:get(CP, Table),
             array:set(CP, Entry#cp{cs={U,L,T,Case}}, Table)
     end.
@@ -170,7 +170,7 @@ gen_header(Fd) ->
     io:put_chars(Fd, "-export([spec_version/0, lookup/1, get_case/1]).\n"),
     io:put_chars(Fd, "-inline([class/1]).\n"),
     io:put_chars(Fd, "-compile(nowarn_unused_vars).\n"),
-    io:put_chars(Fd, "-dialyzer({no_improper_lists, [cp/1, gc_prepend/2, gc_e_cont/2]}).\n"),
+    io:put_chars(Fd, "-dialyzer({no_improper_lists, [cp/1, gc/1, gc_prepend/2, gc_e_cont/2]}).\n"),
     io:put_chars(Fd, "-type gc() :: char()|[char()].\n\n\n"),
     ok.
 
@@ -186,7 +186,7 @@ gen_static(Fd) ->
                  "        {U,L} -> #{upper=>U,lower=>L,title=>U,fold=>L};\n"
                  "        {U,L,T,F} -> #{upper=>U,lower=>L,title=>T,fold=>F}\n"
                  "    end.\n\n"),
-    io:put_chars(Fd, "spec_version() -> {9,0}.\n\n\n"),
+    io:put_chars(Fd, "spec_version() -> {10,0}.\n\n\n"),
     io:put_chars(Fd, "class(Codepoint) -> {CCC,_,_} = unicode_table(Codepoint),\n    CCC.\n\n"),
     io:put_chars(Fd, "-spec uppercase(unicode:chardata()) -> "
                  "maybe_improper_list(gc(),unicode:chardata()).\n"),
@@ -240,7 +240,7 @@ gen_norm(Fd) ->
                  "-spec nfd(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
                  "nfd(Str0) ->\n"
                  "    case gc(Str0) of\n"
-                 "        [GC|R] when GC < 127 -> [GC|R];\n"
+                 "        [GC|R] when GC < 128 -> [GC|R];\n"
                  "        [GC|Str] -> [decompose(GC)|Str];\n"
                  "        [] -> [];\n"
                  "        {error,_}=Error -> Error\n    end.\n\n"
@@ -250,7 +250,7 @@ gen_norm(Fd) ->
                  "-spec nfkd(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
                  "nfkd(Str0) ->\n"
                  "    case gc(Str0) of\n"
-                 "        [GC|R] when GC < 127 -> [GC|R];\n"
+                 "        [GC|R] when GC < 128 -> [GC|R];\n"
                  "        [GC|Str] -> [decompose_compat(GC)|Str];\n"
                  "        [] -> [];\n"
                  "        {error,_}=Error -> Error\n    end.\n\n"
@@ -260,7 +260,7 @@ gen_norm(Fd) ->
                  "-spec nfc(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
                  "nfc(Str0) ->\n"
                  "    case gc(Str0) of\n"
-                 "        [GC|R] when GC < 255 -> [GC|R];\n"
+                 "        [GC|R] when GC < 256 -> [GC|R];\n"
                  "        [GC|Str] -> [compose(decompose(GC))|Str];\n"
                  "        [] -> [];\n"
                  "        {error,_}=Error -> Error\n    end.\n\n"
@@ -270,7 +270,7 @@ gen_norm(Fd) ->
                  "-spec nfkc(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
                  "nfkc(Str0) ->\n"
                  "    case gc(Str0) of\n"
-                 "        [GC|R] when GC < 127 -> [GC|R];\n"
+                 "        [GC|R] when GC < 128 -> [GC|R];\n"
                  "        [GC|Str] -> [compose_compat_0(decompose_compat(GC))|Str];\n"
                  "        [] -> [];\n"
                  "        {error,_}=Error -> Error\n    end.\n\n"
@@ -476,13 +476,30 @@ gen_gc(Fd, GBP) ->
                  "-spec gc(String::unicode:chardata()) ->"
                  " maybe_improper_list() | {error, unicode:chardata()}.\n"),
     io:put_chars(Fd,
+                 "gc([CP1, CP2|_]=T)\n"
+                 "  when CP1 < 256, CP2 < 256, CP1 =/= $\\r -> %% Ascii Fast path\n"
+                 "       T;\n"
+                 "gc(<<CP1/utf8, Rest/binary>>) ->\n"
+                 "    if CP1 < 256, CP1 =/= $\\r ->\n"
+                 "           case Rest of\n"
+                 "               <<CP2/utf8, _/binary>> when CP2 < 256 -> %% Ascii Fast path\n"
+                 "                   [CP1|Rest];\n"
+                 "               _ -> gc_1([CP1|Rest])\n"
+                 "           end;\n"
+                 "      true -> gc_1([CP1|Rest])\n"
+                 "    end;\n"
                  "gc(Str) ->\n"
                  "    gc_1(cp(Str)).\n\n"
                  "gc_1([$\\r|R0] = R) ->\n"
                  "    case cp(R0) of % Don't break CRLF\n"
                  "        [$\\n|R1] -> [[$\\r,$\\n]|R1];\n"
                  "        _ -> R\n"
-                 "    end;\n"),
+                 "    end;\n"
+                 %% "gc_1([CP1, CP2|_]=T) when CP1 < 256, CP2 < 256 ->\n"
+                 %% "    T;  %% Fast path\n"
+                 %% "gc_1([CP1|<<CP2/utf8, _/binary>>]=T) when CP1 < 256, CP2 < 256 ->\n"
+                 %% "    T;  %% Fast path\n"
+                ),
 
     io:put_chars(Fd, "%% Handle control\n"),
     GenControl = fun(Range) -> io:format(Fd, "gc_1~s R0;\n", [gen_clause(Range)]) end,
@@ -490,7 +507,7 @@ gen_gc(Fd, GBP) ->
     [R1,R2,R3|Crs] = CRs0,
     [GenControl(CP) || CP <- merge_ranges([R1,R2,R3], split), CP =/= {$\r, undefined}],
     %%GenControl(R1),GenControl(R2),GenControl(R3),
-    io:format(Fd, "gc_1([CP|R]) when CP < 255 -> gc_extend(R,CP);\n", []),
+    io:format(Fd, "gc_1([CP|R]) when CP < 256 -> gc_extend(R,CP);\n", []),
     [GenControl(CP) || CP <- Crs],
     %% One clause per CP
     %% CRs0 = merge_ranges(maps:get(cr, GBP) ++ maps:get(lf, GBP) ++ maps:get(control, GBP)),
@@ -869,10 +886,10 @@ optimize_ranges_1(Rs) ->
 
 hex_to_int([]) -> [];
 hex_to_int(HexStr) ->
-    list_to_integer(string:strip(HexStr, both), 16).
+    list_to_integer(string:trim(HexStr, both), 16).
 
 to_atom(Str) ->
-    list_to_atom(string:to_lower(string:strip(Str, both))).
+    list_to_atom(string:lowercase(string:trim(Str, both))).
 
 foldl(Fun, Acc, Fd) ->
     Get = fun() -> file:read_line(Fd) end,
@@ -892,7 +909,7 @@ foldl_1(Fun, Acc, Get) ->
 
 
 
-%% Differs from string:tokens, it returns empty string as token between two delimiters
+%% Unlike string:lexemes/2, this returns an empty string as the token between two adjacent delimiters
 tokens(S, [C]) ->
     tokens(lists:reverse(S), C, []).