Return error tuple on unicode normalization functions

Prior to this patch, the normalization functions in the unicode module would raise a function clause error for non-UTF-8 binaries. This patch changes them to return {error, SoFar, Invalid} instead, as characters_to_binary and characters_to_list in the unicode module do. Note that string:next_codepoint/1 and string:next_grapheme/1 had to be changed accordingly and now also return an error tuple.
author: José Valim <jose.valim@plataformatec.com.br> 2017-05-19 16:06:08 +0200
committer: José Valim <jose.valim@plataformatec.com.br> 2017-05-22 15:08:31 +0200
commit: e1370f924df65e72843b5f81400230e1c2591485 (patch)
tree: 73babac731e86c0903ef584d14749e1777d5f54b /lib/stdlib/src
parent: 166d11bb8cbb386dfab4fef37f6f231ac2689b61 (diff)
download: otp-e1370f924df65e72843b5f81400230e1c2591485.tar.gz
otp-e1370f924df65e72843b5f81400230e1c2591485.tar.bz2
otp-e1370f924df65e72843b5f81400230e1c2591485.zip
2 files changed, 73 insertions, 50 deletions
diff --git a/lib/stdlib/src/string.erl b/lib/stdlib/src/string.erl
index 17135dd64a..6f7009b5d9 100644
--- a/lib/stdlib/src/string.erl
+++ b/lib/stdlib/src/string.erl
@@ -486,12 +486,14 @@ find(String, SearchPattern, trailing) ->
 
 %% Fetch first codepoint and return rest in tail
 -spec next_grapheme(String::unicode:chardata()) ->
-                           maybe_improper_list(grapheme_cluster(),unicode:chardata()).
+                           maybe_improper_list(grapheme_cluster(),unicode:chardata()) |
+                           {error,unicode:chardata()}.
 next_grapheme(CD) -> unicode_util:gc(CD).
 
 %% Fetch first grapheme cluster and return rest in tail
 -spec next_codepoint(String::unicode:chardata()) ->
-                            maybe_improper_list(char(),unicode:chardata()).
+                            maybe_improper_list(char(),unicode:chardata()) |
+                            {error,unicode:chardata()}.
 next_codepoint(CD) -> unicode_util:cp(CD).
 
 %% Internals
@@ -508,7 +510,7 @@ equal_1(A0,B0) ->
     case {unicode_util:cp(A0), unicode_util:cp(B0)} of
         {[CP|A],[CP|B]} -> equal_1(A,B);
         {[], []} -> true;
-        _ -> false
+        {L1,L2} when is_list(L1), is_list(L2) -> false
     end.
 
 equal_nocase(A, A) -> true;
@@ -517,7 +519,7 @@ equal_nocase(A0, B0) ->
           unicode_util:cp(unicode_util:casefold(B0))} of
         {[CP|A],[CP|B]} -> equal_nocase(A,B);
         {[], []} -> true;
-        _ -> false
+        {L1,L2} when is_list(L1), is_list(L2) -> false
     end.
 
 equal_norm(A, A, _Norm) -> true;
@@ -526,7 +528,7 @@ equal_norm(A0, B0, Norm) ->
           unicode_util:cp(unicode_util:Norm(B0))} of
         {[CP|A],[CP|B]} -> equal_norm(A,B, Norm);
         {[], []} -> true;
-        _ -> false
+        {L1,L2} when is_list(L1), is_list(L2) -> false
     end.
 
 equal_norm_nocase(A, A, _Norm) -> true;
@@ -535,7 +537,7 @@ equal_norm_nocase(A0, B0, Norm) ->
           unicode_util:cp(unicode_util:casefold(unicode_util:Norm(B0)))} of
         {[CP|A],[CP|B]} -> equal_norm_nocase(A,B, Norm);
         {[], []} -> true;
-        _ -> false
+        {L1,L2} when is_list(L1), is_list(L2) -> false
     end.
 
 reverse_1(CD, Acc) ->
diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl
index aa1da400ce..fbe8a94074 100644
--- a/lib/stdlib/src/unicode.erl
+++ b/lib/stdlib/src/unicode.erl
@@ -250,89 +250,110 @@ encoding_to_bom(latin1) ->
 -define(GC_N, 200). %% arbitrary number
 
 %% Canonical decompose string to list of chars
--spec characters_to_nfd_list(chardata()) -> [char()].
+-spec characters_to_nfd_list(chardata()) -> [char()] | {error, [char()], chardata()}.
 characters_to_nfd_list(CD) ->
+    characters_to_nfd_list(CD, []).
+characters_to_nfd_list(CD, Acc) ->
     case unicode_util:nfd(CD) of
-        [GC|Str] when is_list(GC) -> GC++characters_to_nfd_list(Str);
-        [CP|Str] -> [CP|characters_to_nfd_list(Str)];
-        [] -> []
+        [GC|Str] when is_list(GC) -> characters_to_nfd_list(Str, lists:reverse(GC, Acc));
+        [CP|Str] -> characters_to_nfd_list(Str, [CP | Acc]);
+        [] -> lists:reverse(Acc);
+        {error,Error} -> {error, lists:reverse(Acc), Error}
     end.
 
--spec characters_to_nfd_binary(chardata()) -> unicode_binary().
+-spec characters_to_nfd_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}.
 characters_to_nfd_binary(CD) ->
-    list_to_binary(characters_to_nfd_binary(CD, ?GC_N, [])).
+    characters_to_nfd_binary(CD, ?GC_N, [], []).
 
-characters_to_nfd_binary(CD, N, Row) when N > 0 ->
+characters_to_nfd_binary(CD, N, Row, Acc) when N > 0 ->
     case unicode_util:nfd(CD) of
-        [GC|Str] -> characters_to_nfd_binary(Str, N-1, [GC|Row]);
-        [] -> [characters_to_binary(lists:reverse(Row))]
+        [GC|Str] -> characters_to_nfd_binary(Str, N-1, [GC|Row], Acc);
+        [] -> acc_to_binary(prepend_row_to_acc(Row, Acc));
+        {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error}
     end;
-characters_to_nfd_binary(CD, _, Row) ->
-    [characters_to_binary(lists:reverse(Row))|characters_to_nfd_binary(CD,?GC_N,[])].
+characters_to_nfd_binary(CD, _, Row, Acc) ->
+    characters_to_nfd_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
 
 %% Compability Canonical decompose string to list of chars.
--spec characters_to_nfkd_list(chardata()) -> [char()].
+-spec characters_to_nfkd_list(chardata()) -> [char()] | {error, [char()], chardata()}.
 characters_to_nfkd_list(CD) ->
+    characters_to_nfkd_list(CD, []).
+characters_to_nfkd_list(CD, Acc) ->
     case unicode_util:nfkd(CD) of
-        [GC|Str] when is_list(GC) -> GC++characters_to_nfkd_list(Str);
-        [CP|Str] -> [CP|characters_to_nfkd_list(Str)];
-        [] -> []
+        [GC|Str] when is_list(GC) -> characters_to_nfkd_list(Str, lists:reverse(GC, Acc));
+        [CP|Str] -> characters_to_nfkd_list(Str, [CP | Acc]);
+        [] -> lists:reverse(Acc);
+        {error,Error} -> {error, lists:reverse(Acc), Error}
     end.
 
--spec characters_to_nfkd_binary(chardata()) -> unicode_binary().
+-spec characters_to_nfkd_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}.
 characters_to_nfkd_binary(CD) ->
-    list_to_binary(characters_to_nfkd_binary(CD, ?GC_N, [])).
+    characters_to_nfkd_binary(CD, ?GC_N, [], []).
 
-characters_to_nfkd_binary(CD, N, Row) when N > 0 ->
+characters_to_nfkd_binary(CD, N, Row, Acc) when N > 0 ->
     case unicode_util:nfkd(CD) of
-        [GC|Str] -> characters_to_nfkd_binary(Str, N-1, [GC|Row]);
-        [] -> [characters_to_binary(lists:reverse(Row))]
+        [GC|Str] -> characters_to_nfkd_binary(Str, N-1, [GC|Row], Acc);
+        [] -> acc_to_binary(prepend_row_to_acc(Row, Acc));
+        {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error}
     end;
-characters_to_nfkd_binary(CD, _, Row) ->
-    [characters_to_binary(lists:reverse(Row))|characters_to_nfkd_binary(CD,?GC_N,[])].
+characters_to_nfkd_binary(CD, _, Row, Acc) ->
+    characters_to_nfkd_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
 
 
 %% Canonical compose string to list of chars
--spec characters_to_nfc_list(chardata()) -> [char()].
+-spec characters_to_nfc_list(chardata()) -> [char()] | {error, [char()], chardata()}.
 characters_to_nfc_list(CD) ->
+    characters_to_nfc_list(CD, []).
+characters_to_nfc_list(CD, Acc) ->
     case unicode_util:nfc(CD) of
-        [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfc_list(Str);
-        [CP|Str] -> [CP|characters_to_nfc_list(Str)];
-        [] -> []
+        [GC|Str] when is_list(GC) -> characters_to_nfc_list(Str, lists:reverse(GC, Acc));
+        [CP|Str] -> characters_to_nfc_list(Str, [CP | Acc]);
+        [] -> lists:reverse(Acc);
+        {error,Error} -> {error, lists:reverse(Acc), Error}
     end.
 
--spec characters_to_nfc_binary(chardata()) -> unicode_binary().
+-spec characters_to_nfc_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}.
 characters_to_nfc_binary(CD) ->
-    list_to_binary(characters_to_nfc_binary(CD, ?GC_N, [])).
+    characters_to_nfc_binary(CD, ?GC_N, [], []).
 
-characters_to_nfc_binary(CD, N, Row) when N > 0 ->
+characters_to_nfc_binary(CD, N, Row, Acc) when N > 0 ->
     case unicode_util:nfc(CD) of
-        [GC|Str] -> characters_to_nfc_binary(Str, N-1, [GC|Row]);
-        [] -> [characters_to_binary(lists:reverse(Row))]
+        [GC|Str] -> characters_to_nfc_binary(Str, N-1, [GC|Row], Acc);
+        [] -> acc_to_binary(prepend_row_to_acc(Row, Acc));
+        {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error}
     end;
-characters_to_nfc_binary(CD, _, Row) ->
-    [characters_to_binary(lists:reverse(Row))|characters_to_nfc_binary(CD,?GC_N,[])].
+characters_to_nfc_binary(CD, _, Row, Acc) ->
+    characters_to_nfc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
 
 %% Compability Canonical compose string to list of chars
--spec characters_to_nfkc_list(chardata()) -> [char()].
+-spec characters_to_nfkc_list(chardata()) -> [char()] | {error, [char()], chardata()}.
 characters_to_nfkc_list(CD) ->
+    characters_to_nfkc_list(CD, []).
+characters_to_nfkc_list(CD, Acc) ->
     case unicode_util:nfkc(CD) of
-        [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfkc_list(Str);
-        [CP|Str] -> [CP|characters_to_nfkc_list(Str)];
-        [] -> []
+        [GC|Str] when is_list(GC) -> characters_to_nfkc_list(Str, lists:reverse(GC, Acc));
+        [CP|Str] -> characters_to_nfkc_list(Str, [CP | Acc]);
+        [] -> lists:reverse(Acc);
+        {error,Error} -> {error, lists:reverse(Acc), Error}
     end.
 
--spec characters_to_nfkc_binary(chardata()) -> unicode_binary().
+-spec characters_to_nfkc_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}.
 characters_to_nfkc_binary(CD) ->
-    list_to_binary(characters_to_nfkc_binary(CD, ?GC_N, [])).
+    characters_to_nfkc_binary(CD, ?GC_N, [], []).
 
-characters_to_nfkc_binary(CD, N, Row) when N > 0 ->
+characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 ->
     case unicode_util:nfkc(CD) of
-        [GC|Str] -> characters_to_nfkc_binary(Str, N-1, [GC|Row]);
-        [] -> [characters_to_binary(lists:reverse(Row))]
+        [GC|Str] -> characters_to_nfkc_binary(Str, N-1, [GC|Row], Acc);
+        [] -> acc_to_binary(prepend_row_to_acc(Row, Acc));
+        {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error}
     end;
-characters_to_nfkc_binary(CD, _, Row) ->
-    [characters_to_binary(lists:reverse(Row))|characters_to_nfkc_binary(CD,?GC_N,[])].
+characters_to_nfkc_binary(CD, _, Row, Acc) ->
+    characters_to_nfkc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)).
+
+acc_to_binary(Acc) ->
+    list_to_binary(lists:reverse(Acc)).
+prepend_row_to_acc(Row, Acc) ->
+    [characters_to_binary(lists:reverse(Row))|Acc].
 
 %% internals
author	José Valim <jose.valim@plataformatec.com.br>	2017-05-19 16:06:08 +0200
committer	José Valim <jose.valim@plataformatec.com.br>	2017-05-22 15:08:31 +0200
commit	e1370f924df65e72843b5f81400230e1c2591485 (patch)
tree	73babac731e86c0903ef584d14749e1777d5f54b /lib/stdlib/src
parent	166d11bb8cbb386dfab4fef37f6f231ac2689b61 (diff)
download	otp-e1370f924df65e72843b5f81400230e1c2591485.tar.gz otp-e1370f924df65e72843b5f81400230e1c2591485.tar.bz2 otp-e1370f924df65e72843b5f81400230e1c2591485.zip