Return error tuple on unicode normalization functions

Prior to this patch, the normalization functions in the unicode module would raise a function clause error for non-utf8 binaries. This patch changes them to return {error, SoFar, Invalid} instead, as characters_to_binary and characters_to_list do in the unicode module. Note that string:next_codepoint/1 and string:next_grapheme/1 had to be changed accordingly and now also return an error tuple on invalid input.
author: José Valim <[email protected]> 2017-05-19 16:06:08 +0200
committer: José Valim <[email protected]> 2017-05-22 15:08:31 +0200
commit: e1370f924df65e72843b5f81400230e1c2591485 (patch)
tree: 73babac731e86c0903ef584d14749e1777d5f54b /lib
parent: 166d11bb8cbb386dfab4fef37f6f231ac2689b61 (diff)
download: otp-e1370f924df65e72843b5f81400230e1c2591485.tar.gz
otp-e1370f924df65e72843b5f81400230e1c2591485.tar.bz2
otp-e1370f924df65e72843b5f81400230e1c2591485.zip
7 files changed, 144 insertions, 68 deletions
diff --git a/lib/stdlib/doc/src/string.xml b/lib/stdlib/doc/src/string.xml
index 343904a49a..9d5edd9ecf 100644
--- a/lib/stdlib/doc/src/string.xml
+++ b/lib/stdlib/doc/src/string.xml
@@ -311,7 +311,9 @@ true</pre>
       <desc>
         <p>
 	  Returns the first codepoint in <c><anno>String</anno></c>
-	  and the rest of <c><anno>String</anno></c> in the tail.
+	  and the rest of <c><anno>String</anno></c> in the tail. Returns
+	  an empty list if <c><anno>String</anno></c> is empty or an
+	  <c>{error, String}</c> tuple if the next byte is invalid.
 	</p>
 	<p><em>Example:</em></p>
 	<pre>
@@ -326,7 +328,9 @@ true</pre>
       <desc>
         <p>
 	  Returns the first grapheme cluster in <c><anno>String</anno></c>
-	  and the rest of <c><anno>String</anno></c> in the tail.
+	  and the rest of <c><anno>String</anno></c> in the tail. Returns
+	  an empty list if <c><anno>String</anno></c> is empty or an
+	  <c>{error, String}</c> tuple if the next byte is invalid.
 	</p>
 	<p><em>Example:</em></p>
 	<pre>
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index 17135dd64a..6f7009b5d9 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -486,12 +486,14 @@ find(String, SearchPattern, trailing) ->
 
 %% Fetch first codepoint and return rest in tail
 -spec next_grapheme(String::unicode:chardata()) ->
-                           maybe_improper_list(grapheme_cluster(),unicode:chardata()).
+                           maybe_improper_list(grapheme_cluster(),unicode:chardata()) |
+                           {error,unicode:chardata()}.
 next_grapheme(CD) -> unicode_util:gc(CD).
 
 %% Fetch first grapheme cluster and return rest in tail
 -spec next_codepoint(String::unicode:chardata()) ->
-                            maybe_improper_list(char(),unicode:chardata()).
+                            maybe_improper_list(char(),unicode:chardata()) |
+                            {error,unicode:chardata()}.
 next_codepoint(CD) -> unicode_util:cp(CD).
 
 %% Internals
@@ -508,7 +510,7 @@ equal_1(A0,B0) ->
     case {unicode_util:cp(A0), unicode_util:cp(B0)} of
         {[CP|A],[CP|B]} -> equal_1(A,B);
         {[], []} -> true;
-        _ -> false
+        {L1,L2} when is_list(L1), is_list(L2) -> false
     end.
 
 equal_nocase(A, A) -> true;
@@ -517,7 +519,7 @@ equal_nocase(A0, B0) ->
           unicode_util:cp(unicode_util:casefold(B0))} of
         {[CP|A],[CP|B]} -> equal_nocase(A,B);
         {[], []} -> true;
-        _ -> false
+        {L1,L2} when is_list(L1), is_list(L2) -> false
     end.
 
 equal_norm(A, A, _Norm) -> true;
@@ -526,7 +528,7 @@ equal_norm(A0, B0, Norm) ->
           unicode_util:cp(unicode_util:Norm(B0))} of
         {[CP|A],[CP|B]} -> equal_norm(A,B, Norm);
         {[], []} -> true;
-        _ -> false
+        {L1,L2} when is_list(L1), is_list(L2) -> false
     end.
 
 equal_norm_nocase(A, A, _Norm) -> true;
@@ -535,7 +537,7 @@ equal_norm_nocase(A0, B0, Norm) ->
           unicode_util:cp(unicode_util:casefold(unicode_util:Norm(B0)))} of
         {[CP|A],[CP|B]} -> equal_norm_nocase(A,B, Norm);
         {[], []} -> true;
-        _ -> false
+        {L1,L2} when is_list(L1), is_list(L2) -> false
     end.
 
 reverse_1(CD, Acc) ->
diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl
index aa1da400ce..fbe8a94074 100644
--- a/lib/stdlib/src/unicode.erl
+++ b/lib/stdlib/src/unicode.erl
@@ -250,89 +250,110 @@ encoding_to_bom(latin1) ->
 -define(GC_N, 200). %% arbitrary number
 
 %% Canonical decompose string to list of chars
--spec characters_to_nfd_list(chardata()) -> [char()].
+-spec characters_to_nfd_list(chardata()) -> [char()] | {error, [char()], chardata()}.
 characters_to_nfd_list(CD) ->
+    characters_to_nfd_list(CD, []).
+characters_to_nfd_list(CD, Acc) ->
     case unicode_util:nfd(CD) of
-        [GC|Str] when is_list(GC) -> GC++characters_to_nfd_list(Str);
-        [CP|Str] -> [CP|characters_to_nfd_list(Str)];
-        [] -> []
+        [GC|Str] when is_list(GC) -> characters_to_nfd_list(Str, lists:reverse(GC, Acc));
+        [CP|Str] -> characters_to_nfd_list(Str, [CP | Acc]);
+        [] -> lists:reverse(Acc);
+        {error,Error} -> {error, lists:reverse(Acc), Error}
     end.
 
--spec characters_to_nfd_binary(chardata()) -> unicode_binary().
+-spec characters_to_nfd_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}.
 characters_to_nfd_binary(CD) ->
-    list_to_binary(characters_to_nfd_binary(CD, ?GC_N, [])).
+    characters_to_nfd_binary(CD, ?GC_N, [], []).
 
-characters_to_nfd_binary(CD, N, Row) when N > 0 ->
+characters_to_nfd_binary(CD, N, Row, Acc) when N > 0 ->
     case unicode_util:nfd(CD) of
-        [GC|Str] -> characters_to_nfd_binary(Str, N-1, [GC|Row]);
-        [] -> [characters_to_binary(lists:reverse(Row))]
+        [GC|Str] -> characters_to_nfd_binary(Str, N-1, [GC|Row], Acc);
+        [] -> acc_to_binary(prepend_row_to_acc(Row, Acc));
+        {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error}
     end;
-characters_to_nfd_binary(CD, _, Row) ->
-    [characters_to_binary(lists:reverse(Row))|characters_to_nfd_binary(CD,?GC_N,[])].
+characters_to_nfd_binary(CD, _, Row, Acc) ->
+    characters_to_nfd_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
 
 %% Compability Canonical decompose string to list of chars.
--spec characters_to_nfkd_list(chardata()) -> [char()].
+-spec characters_to_nfkd_list(chardata()) -> [char()] | {error, [char()], chardata()}.
 characters_to_nfkd_list(CD) ->
+    characters_to_nfkd_list(CD, []).
+characters_to_nfkd_list(CD, Acc) ->
     case unicode_util:nfkd(CD) of
-        [GC|Str] when is_list(GC) -> GC++characters_to_nfkd_list(Str);
-        [CP|Str] -> [CP|characters_to_nfkd_list(Str)];
-        [] -> []
+        [GC|Str] when is_list(GC) -> characters_to_nfkd_list(Str, lists:reverse(GC, Acc));
+        [CP|Str] -> characters_to_nfkd_list(Str, [CP | Acc]);
+        [] -> lists:reverse(Acc);
+        {error,Error} -> {error, lists:reverse(Acc), Error}
     end.
 
--spec characters_to_nfkd_binary(chardata()) -> unicode_binary().
+-spec characters_to_nfkd_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}.
 characters_to_nfkd_binary(CD) ->
-    list_to_binary(characters_to_nfkd_binary(CD, ?GC_N, [])).
+    characters_to_nfkd_binary(CD, ?GC_N, [], []).
 
-characters_to_nfkd_binary(CD, N, Row) when N > 0 ->
+characters_to_nfkd_binary(CD, N, Row, Acc) when N > 0 ->
     case unicode_util:nfkd(CD) of
-        [GC|Str] -> characters_to_nfkd_binary(Str, N-1, [GC|Row]);
-        [] -> [characters_to_binary(lists:reverse(Row))]
+        [GC|Str] -> characters_to_nfkd_binary(Str, N-1, [GC|Row], Acc);
+        [] -> acc_to_binary(prepend_row_to_acc(Row, Acc));
+        {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error}
     end;
-characters_to_nfkd_binary(CD, _, Row) ->
-    [characters_to_binary(lists:reverse(Row))|characters_to_nfkd_binary(CD,?GC_N,[])].
+characters_to_nfkd_binary(CD, _, Row, Acc) ->
+    characters_to_nfkd_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
 
 
 %% Canonical compose string to list of chars
--spec characters_to_nfc_list(chardata()) -> [char()].
+-spec characters_to_nfc_list(chardata()) -> [char()] | {error, [char()], chardata()}.
 characters_to_nfc_list(CD) ->
+    characters_to_nfc_list(CD, []).
+characters_to_nfc_list(CD, Acc) ->
     case unicode_util:nfc(CD) of
-        [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfc_list(Str);
-        [CP|Str] -> [CP|characters_to_nfc_list(Str)];
-        [] -> []
+        [GC|Str] when is_list(GC) -> characters_to_nfc_list(Str, lists:reverse(GC, Acc));
+        [CP|Str] -> characters_to_nfc_list(Str, [CP | Acc]);
+        [] -> lists:reverse(Acc);
+        {error,Error} -> {error, lists:reverse(Acc), Error}
     end.
 
--spec characters_to_nfc_binary(chardata()) -> unicode_binary().
+-spec characters_to_nfc_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}.
 characters_to_nfc_binary(CD) ->
-    list_to_binary(characters_to_nfc_binary(CD, ?GC_N, [])).
+    characters_to_nfc_binary(CD, ?GC_N, [], []).
 
-characters_to_nfc_binary(CD, N, Row) when N > 0 ->
+characters_to_nfc_binary(CD, N, Row, Acc) when N > 0 ->
     case unicode_util:nfc(CD) of
-        [GC|Str] -> characters_to_nfc_binary(Str, N-1, [GC|Row]);
-        [] -> [characters_to_binary(lists:reverse(Row))]
+        [GC|Str] -> characters_to_nfc_binary(Str, N-1, [GC|Row], Acc);
+        [] -> acc_to_binary(prepend_row_to_acc(Row, Acc));
+        {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error}
     end;
-characters_to_nfc_binary(CD, _, Row) ->
-    [characters_to_binary(lists:reverse(Row))|characters_to_nfc_binary(CD,?GC_N,[])].
+characters_to_nfc_binary(CD, _, Row, Acc) ->
+    characters_to_nfc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
 
 %% Compability Canonical compose string to list of chars
--spec characters_to_nfkc_list(chardata()) -> [char()].
+-spec characters_to_nfkc_list(chardata()) -> [char()] | {error, [char()], chardata()}.
 characters_to_nfkc_list(CD) ->
+    characters_to_nfkc_list(CD, []).
+characters_to_nfkc_list(CD, Acc) ->
     case unicode_util:nfkc(CD) of
-        [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfkc_list(Str);
-        [CP|Str] -> [CP|characters_to_nfkc_list(Str)];
-        [] -> []
+        [GC|Str] when is_list(GC) -> characters_to_nfkc_list(Str, lists:reverse(GC, Acc));
+        [CP|Str] -> characters_to_nfkc_list(Str, [CP | Acc]);
+        [] -> lists:reverse(Acc);
+        {error,Error} -> {error, lists:reverse(Acc), Error}
     end.
 
--spec characters_to_nfkc_binary(chardata()) -> unicode_binary().
+-spec characters_to_nfkc_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}.
 characters_to_nfkc_binary(CD) ->
-    list_to_binary(characters_to_nfkc_binary(CD, ?GC_N, [])).
+    characters_to_nfkc_binary(CD, ?GC_N, [], []).
 
-characters_to_nfkc_binary(CD, N, Row) when N > 0 ->
+characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
     case unicode_util:nfkc(CD) of
-        [GC|Str] -> characters_to_nfkc_binary(Str, N-1, [GC|Row]);
-        [] -> [characters_to_binary(lists:reverse(Row))]
+        [GC|Str] -> characters_to_nfkc_binary(Str, N-1, [GC|Row], Acc);
+        [] -> acc_to_binary(prepend_row_to_acc(Row, Acc));
+        {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error}
     end;
-characters_to_nfkc_binary(CD, _, Row) ->
-    [characters_to_binary(lists:reverse(Row))|characters_to_nfkc_binary(CD,?GC_N,[])].
+characters_to_nfkc_binary(CD, _, Row, Acc) ->
+    characters_to_nfkc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
+
+acc_to_binary(Acc) ->
+    list_to_binary(lists:reverse(Acc)).
+prepend_row_to_acc(Row, Acc) ->
+    [characters_to_binary(lists:reverse(Row))|Acc].
 
 %% internals
 
diff --git a/lib/stdlib/test/string_SUITE.erl b/lib/stdlib/test/string_SUITE.erl
index 4320b735ac..90f980c0e5 100644
--- a/lib/stdlib/test/string_SUITE.erl
+++ b/lib/stdlib/test/string_SUITE.erl
@@ -582,6 +582,8 @@ cd_gc(_) ->
     [$e,778] = string:next_codepoint([$e,778]),
     [$e|<<204,138>>] = string:next_codepoint(<<$e,778/utf8>>),
     [778|_] = string:next_codepoint(tl(string:next_codepoint(<<$e,778/utf8>>))),
+    [0|<<128,1>>] = string:next_codepoint(<<0,128,1>>),
+    {error,<<128,1>>} = string:next_codepoint(<<128,1>>),
 
     [] = string:next_grapheme(""),
     [] = string:next_grapheme(<<>>),
@@ -589,6 +591,8 @@ cd_gc(_) ->
     "abcd" = string:next_grapheme("abcd"),
     [[$e,778]] = string:next_grapheme([$e,778]),
     [[$e,778]] = string:next_grapheme(<<$e,778/utf8>>),
+    [0|<<128,1>>] = string:next_grapheme(<<0,128,1>>),
+    {error,<<128,1>>} = string:next_grapheme(<<128,1>>),
 
     ok.
 
diff --git a/lib/stdlib/test/unicode_SUITE.erl b/lib/stdlib/test/unicode_SUITE.erl
index 3d97ab93f1..e01ba3fbb0 100644
--- a/lib/stdlib/test/unicode_SUITE.erl
+++ b/lib/stdlib/test/unicode_SUITE.erl
@@ -998,6 +998,30 @@ normalize(_) ->
 
     true = unicode:characters_to_nfkc_list("ホンダ") =:= unicode:characters_to_nfkc_list("ﾎﾝﾀﾞ"),
     true = unicode:characters_to_nfkd_list("32") =:= unicode:characters_to_nfkd_list("３２"),
+
+    {error, [0], <<128>>} = unicode:characters_to_nfc_list(<<0, 128>>),
+    {error, [0], <<128>>} = unicode:characters_to_nfkc_list(<<0, 128>>),
+    {error, [0], <<128>>} = unicode:characters_to_nfd_list(<<0, 128>>),
+    {error, [0], <<128>>} = unicode:characters_to_nfkd_list(<<0, 128>>),
+
+    {error, <<0>>, <<128>>} = unicode:characters_to_nfc_binary(<<0, 128>>),
+    {error, <<0>>, <<128>>} = unicode:characters_to_nfkc_binary(<<0, 128>>),
+    {error, <<0>>, <<128>>} = unicode:characters_to_nfd_binary(<<0, 128>>),
+    {error, <<0>>, <<128>>} = unicode:characters_to_nfkd_binary(<<0, 128>>),
+
+    LargeBin = binary:copy(<<"abcde">>, 50),
+    LargeList = binary_to_list(LargeBin),
+
+    {error, LargeList, <<128>>} = unicode:characters_to_nfc_list(<<LargeBin/binary, 128>>),
+    {error, LargeList, <<128>>} = unicode:characters_to_nfkc_list(<<LargeBin/binary, 128>>),
+    {error, LargeList, <<128>>} = unicode:characters_to_nfd_list(<<LargeBin/binary, 128>>),
+    {error, LargeList, <<128>>} = unicode:characters_to_nfkd_list(<<LargeBin/binary, 128>>),
+
+    {error, LargeBin, <<128>>} = unicode:characters_to_nfc_binary(<<LargeBin/binary, 128>>),
+    {error, LargeBin, <<128>>} = unicode:characters_to_nfkc_binary(<<LargeBin/binary, 128>>),
+    {error, LargeBin, <<128>>} = unicode:characters_to_nfd_binary(<<LargeBin/binary, 128>>),
+    {error, LargeBin, <<128>>} = unicode:characters_to_nfkd_binary(<<LargeBin/binary, 128>>),
+
     ok.
 
 
diff --git a/lib/stdlib/test/unicode_util_SUITE.erl b/lib/stdlib/test/unicode_util_SUITE.erl
index e9b3d7f98d..03c24c7027 100644
--- a/lib/stdlib/test/unicode_util_SUITE.erl
+++ b/lib/stdlib/test/unicode_util_SUITE.erl
@@ -97,6 +97,8 @@ cp(_) ->
     "hejsan" = fetch(<<"hejsan">>, Get),
     "hejsan" = fetch(["hej",<<"san">>], Get),
     "hejsan" = fetch(["hej"|<<"san">>], Get),
+    {error, <<128>>} = Get(<<128>>),
+    {error, [<<128>>, 0]} = Get([<<128>>, 0]),
     ok.
 
 gc(Config) ->
@@ -106,6 +108,8 @@ gc(Config) ->
     "hejsan" = fetch(<<"hejsan">>, Get),
     "hejsan" = fetch(["hej",<<"san">>], Get),
     "hejsan" = fetch(["hej"|<<"san">>], Get),
+    {error, <<128>>} = Get(<<128>>),
+    {error, [<<128>>, 0]} = Get([<<128>>, 0]),
 
     0 = fold(fun verify_gc/3, 0, DataDir ++ "/GraphemeBreakTest.txt"),
     ok.
diff --git a/lib/stdlib/uc_spec/gen_unicode_mod.escript b/lib/stdlib/uc_spec/gen_unicode_mod.escript
index c8b815e435..fefd7d3b70 100755
--- a/lib/stdlib/uc_spec/gen_unicode_mod.escript
+++ b/lib/stdlib/uc_spec/gen_unicode_mod.escript
@@ -170,7 +170,7 @@ gen_header(Fd) ->
     io:put_chars(Fd, "-export([spec_version/0, lookup/1, get_case/1]).\n"),
     io:put_chars(Fd, "-inline([class/1]).\n"),
     io:put_chars(Fd, "-compile(nowarn_unused_vars).\n"),
-    io:put_chars(Fd, "-dialyzer({no_improper_lists, cp/1}).\n"),
+    io:put_chars(Fd, "-dialyzer({no_improper_lists, [cp/1, gc_prepend/2, gc_e_cont/2]}).\n"),
     io:put_chars(Fd, "-type gc() :: char()|[char()].\n\n\n"),
     ok.
 
@@ -237,39 +237,43 @@ gen_static(Fd) ->
 
 gen_norm(Fd) ->
     io:put_chars(Fd,
-                 "-spec nfd(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()).\n"
+                 "-spec nfd(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
                  "nfd(Str0) ->\n"
                  "    case gc(Str0) of\n"
                  "        [GC|R] when GC < 127 -> [GC|R];\n"
                  "        [GC|Str] -> [decompose(GC)|Str];\n"
-                 "        [] -> []\n    end.\n\n"
+                 "        [] -> [];\n"
+                 "        {error,_}=Error -> Error\n    end.\n\n"
                 ),
 
     io:put_chars(Fd,
-                 "-spec nfkd(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()).\n"
+                 "-spec nfkd(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
                  "nfkd(Str0) ->\n"
                  "    case gc(Str0) of\n"
                  "        [GC|R] when GC < 127 -> [GC|R];\n"
                  "        [GC|Str] -> [decompose_compat(GC)|Str];\n"
-                 "        [] -> []\n    end.\n\n"
+                 "        [] -> [];\n"
+                 "        {error,_}=Error -> Error\n    end.\n\n"
                 ),
 
     io:put_chars(Fd,
-                 "-spec nfc(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()).\n"
+                 "-spec nfc(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
                  "nfc(Str0) ->\n"
                  "    case gc(Str0) of\n"
                  "        [GC|R] when GC < 255 -> [GC|R];\n"
                  "        [GC|Str] -> [compose(decompose(GC))|Str];\n"
-                 "        [] -> []\n    end.\n\n"
+                 "        [] -> [];\n"
+                 "        {error,_}=Error -> Error\n    end.\n\n"
                 ),
 
     io:put_chars(Fd,
-                 "-spec nfkc(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()).\n"
+                 "-spec nfkc(unicode:chardata()) -> maybe_improper_list(gc(),unicode:chardata()) | {error, unicode:chardata()}.\n"
                  "nfkc(Str0) ->\n"
                  "    case gc(Str0) of\n"
                  "        [GC|R] when GC < 127 -> [GC|R];\n"
                  "        [GC|Str] -> [compose_compat_0(decompose_compat(GC))|Str];\n"
-                 "        [] -> []\n    end.\n\n"
+                 "        [] -> [];\n"
+                 "        {error,_}=Error -> Error\n    end.\n\n"
                 ),
 
     io:put_chars(Fd,
@@ -448,18 +452,20 @@ gen_ws(Fd, Props) ->
 
 gen_cp(Fd) ->
     io:put_chars(Fd, "-spec cp(String::unicode:chardata()) ->"
-                 " maybe_improper_list().\n"),
+                 " maybe_improper_list() | {error, unicode:chardata()}.\n"),
     io:put_chars(Fd, "cp([C|_]=L) when is_integer(C) -> L;\n"),
     io:put_chars(Fd, "cp([List]) -> cp(List);\n"),
     io:put_chars(Fd, "cp([List|R]) ->\n"),
     io:put_chars(Fd, "    case cp(List) of\n"),
     io:put_chars(Fd, "        [] -> cp(R);\n"),
     io:put_chars(Fd, "        [CP] -> [CP|R];\n"),
-    io:put_chars(Fd, "        [C|R0] -> [C|[R0|R]]\n"),
+    io:put_chars(Fd, "        [C|R0] -> [C|[R0|R]];\n"),
+    io:put_chars(Fd, "        {error,Error} -> {error,[Error|R]}\n"),
     io:put_chars(Fd, "    end;\n"),
     io:put_chars(Fd, "cp([]) -> [];\n"),
     io:put_chars(Fd, "cp(<<C/utf8, R/binary>>) -> [C|R];\n"),
-    io:put_chars(Fd, "cp(<<>>) -> [].\n\n"),
+    io:put_chars(Fd, "cp(<<>>) -> [];\n"),
+    io:put_chars(Fd, "cp(<<R/binary>>) -> {error,R}.\n\n"),
     ok.
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -468,7 +474,7 @@ gen_gc(Fd, GBP) ->
     %% see http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
     io:put_chars(Fd,
                  "-spec gc(String::unicode:chardata()) ->"
-                 " maybe_improper_list().\n"),
+                 " maybe_improper_list() | {error, unicode:chardata()}.\n"),
     io:put_chars(Fd,
                  "gc(Str) ->\n"
                  "    gc_1(cp(Str)).\n\n"
@@ -521,7 +527,8 @@ gen_gc(Fd, GBP) ->
     [GenEBG(CP) || CP <- merge_ranges(maps:get(e_base_gaz,GBP))],
 
     io:put_chars(Fd, "gc_1([CP|R]) -> gc_extend(R, CP);\n"),
-    io:put_chars(Fd, "gc_1([]) -> [].\n\n"),
+    io:put_chars(Fd, "gc_1([]) -> [];\n"),
+    io:put_chars(Fd, "gc_1({error,_}=Error) -> Error.\n\n"),
 
     io:put_chars(Fd, "%% Handle Prepend\n"),
     io:put_chars(Fd,
@@ -536,7 +543,8 @@ gen_gc(Fd, GBP) ->
                  "                    [GC|R1] -> [[CP0|GC]|R1]\n"
                  "                end\n"
                  "           end;\n"
-                 "      [] -> [CP0]\n"
+                 "      [] -> [CP0];\n"
+                 "      {error,R} -> [CP0|R]\n"
                  "    end.\n\n"),
 
     IsCtrl = fun(Range) -> io:format(Fd, "is_control~s true;\n", [gen_single_clause(Range)]) end,
@@ -574,7 +582,10 @@ gen_gc(Fd, GBP) ->
                  "        [_]=Acc -> Acc;\n"
                  "        [_|_]=Acc -> [lists:reverse(Acc)];\n"
                  "        Acc -> [Acc]\n"
-                 "    end.\n\n"),
+                 "    end;\n"
+                 "gc_extend({error,R}, T, Acc0) ->\n"
+                 "    gc_extend([], T, Acc0) ++ [R].\n\n"
+                 ),
     [ZWJ] = maps:get(zwj, GBP),
     GenExtend = fun(R) when R =:= ZWJ -> io:format(Fd, "is_extend~s zwj;\n", [gen_single_clause(ZWJ)]);
                    (Range) -> io:format(Fd, "is_extend~s true;\n", [gen_single_clause(Range)])
@@ -604,6 +615,11 @@ gen_gc(Fd, GBP) ->
                  "            case Acc of\n"
                  "                [A] -> [A];\n"
                  "                _ -> [lists:reverse(Acc)]\n"
+                 "            end;\n"
+                 "        {error,R} ->\n"
+                 "            case Acc of\n"
+                 "                [A] -> [A|R];\n"
+                 "                _ -> [lists:reverse(Acc)|R]\n"
                  "            end\n"
                  "    end.\n\n"),
 
@@ -660,6 +676,7 @@ gen_gc(Fd, GBP) ->
     [GenHangulT_1(CP) || CP <- merge_ranges(maps:get(t,GBP))],
     io:put_chars(Fd, "        R1 -> gc_extend(R1, R0, Acc)\n    end.\n\n"),
 
+    io:put_chars(Fd, "gc_h_lv_lvt({error,_}=Error, Acc) -> gc_extend(Error, [], Acc);\n"),
     io:put_chars(Fd, "%% Handle Hangul LV\n"),
     GenHangulLV = fun(Range) -> io:format(Fd, "gc_h_lv_lvt~s gc_h_V(R1,[CP|Acc]);\n",
                                           [gen_clause2(Range)]) end,
author	José Valim <[email protected]>	2017-05-19 16:06:08 +0200
committer	José Valim <[email protected]>	2017-05-22 15:08:31 +0200
commit	e1370f924df65e72843b5f81400230e1c2591485 (patch)
tree	73babac731e86c0903ef584d14749e1777d5f54b /lib
parent	166d11bb8cbb386dfab4fef37f6f231ac2689b61 (diff)
download	otp-e1370f924df65e72843b5f81400230e1c2591485.tar.gz otp-e1370f924df65e72843b5f81400230e1c2591485.tar.bz2 otp-e1370f924df65e72843b5f81400230e1c2591485.zip