diff options
author | José Valim <[email protected]> | 2017-05-19 16:06:08 +0200 |
---|---|---|
committer | José Valim <[email protected]> | 2017-05-22 15:08:31 +0200 |
commit | e1370f924df65e72843b5f81400230e1c2591485 (patch) | |
tree | 73babac731e86c0903ef584d14749e1777d5f54b /lib/stdlib/src/unicode.erl | |
parent | 166d11bb8cbb386dfab4fef37f6f231ac2689b61 (diff) | |
download | otp-e1370f924df65e72843b5f81400230e1c2591485.tar.gz otp-e1370f924df65e72843b5f81400230e1c2591485.tar.bz2 otp-e1370f924df65e72843b5f81400230e1c2591485.zip |
Return error tuple on unicode normalization functions
Prior to this patch, the normalization functions in the
unicode module would raise a function clause error for
non-utf8 binaries.
This patch changes them to return {error, SoFar, Invalid},
as characters_to_binary and characters_to_list do in
the unicode module.
Note that string:next_codepoint/1 and string:next_grapheme/1 had
to be changed accordingly and now also return an error tuple.
Diffstat (limited to 'lib/stdlib/src/unicode.erl')
-rw-r--r-- | lib/stdlib/src/unicode.erl | 109 |
1 files changed, 65 insertions, 44 deletions
diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl index aa1da400ce..fbe8a94074 100644 --- a/lib/stdlib/src/unicode.erl +++ b/lib/stdlib/src/unicode.erl @@ -250,89 +250,110 @@ encoding_to_bom(latin1) -> -define(GC_N, 200). %% arbitrary number %% Canonical decompose string to list of chars --spec characters_to_nfd_list(chardata()) -> [char()]. +-spec characters_to_nfd_list(chardata()) -> [char()] | {error, [char()], chardata()}. characters_to_nfd_list(CD) -> + characters_to_nfd_list(CD, []). +characters_to_nfd_list(CD, Acc) -> case unicode_util:nfd(CD) of - [GC|Str] when is_list(GC) -> GC++characters_to_nfd_list(Str); - [CP|Str] -> [CP|characters_to_nfd_list(Str)]; - [] -> [] + [GC|Str] when is_list(GC) -> characters_to_nfd_list(Str, lists:reverse(GC, Acc)); + [CP|Str] -> characters_to_nfd_list(Str, [CP | Acc]); + [] -> lists:reverse(Acc); + {error,Error} -> {error, lists:reverse(Acc), Error} end. --spec characters_to_nfd_binary(chardata()) -> unicode_binary(). +-spec characters_to_nfd_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}. characters_to_nfd_binary(CD) -> - list_to_binary(characters_to_nfd_binary(CD, ?GC_N, [])). + characters_to_nfd_binary(CD, ?GC_N, [], []). -characters_to_nfd_binary(CD, N, Row) when N > 0 -> +characters_to_nfd_binary(CD, N, Row, Acc) when N > 0 -> case unicode_util:nfd(CD) of - [GC|Str] -> characters_to_nfd_binary(Str, N-1, [GC|Row]); - [] -> [characters_to_binary(lists:reverse(Row))] + [GC|Str] -> characters_to_nfd_binary(Str, N-1, [GC|Row], Acc); + [] -> acc_to_binary(prepend_row_to_acc(Row, Acc)); + {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error} end; -characters_to_nfd_binary(CD, _, Row) -> - [characters_to_binary(lists:reverse(Row))|characters_to_nfd_binary(CD,?GC_N,[])]. +characters_to_nfd_binary(CD, _, Row, Acc) -> + characters_to_nfd_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)). %% Compability Canonical decompose string to list of chars. 
--spec characters_to_nfkd_list(chardata()) -> [char()]. +-spec characters_to_nfkd_list(chardata()) -> [char()] | {error, [char()], chardata()}. characters_to_nfkd_list(CD) -> + characters_to_nfkd_list(CD, []). +characters_to_nfkd_list(CD, Acc) -> case unicode_util:nfkd(CD) of - [GC|Str] when is_list(GC) -> GC++characters_to_nfkd_list(Str); - [CP|Str] -> [CP|characters_to_nfkd_list(Str)]; - [] -> [] + [GC|Str] when is_list(GC) -> characters_to_nfkd_list(Str, lists:reverse(GC, Acc)); + [CP|Str] -> characters_to_nfkd_list(Str, [CP | Acc]); + [] -> lists:reverse(Acc); + {error,Error} -> {error, lists:reverse(Acc), Error} end. --spec characters_to_nfkd_binary(chardata()) -> unicode_binary(). +-spec characters_to_nfkd_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}. characters_to_nfkd_binary(CD) -> - list_to_binary(characters_to_nfkd_binary(CD, ?GC_N, [])). + characters_to_nfkd_binary(CD, ?GC_N, [], []). -characters_to_nfkd_binary(CD, N, Row) when N > 0 -> +characters_to_nfkd_binary(CD, N, Row, Acc) when N > 0 -> case unicode_util:nfkd(CD) of - [GC|Str] -> characters_to_nfkd_binary(Str, N-1, [GC|Row]); - [] -> [characters_to_binary(lists:reverse(Row))] + [GC|Str] -> characters_to_nfkd_binary(Str, N-1, [GC|Row], Acc); + [] -> acc_to_binary(prepend_row_to_acc(Row, Acc)); + {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error} end; -characters_to_nfkd_binary(CD, _, Row) -> - [characters_to_binary(lists:reverse(Row))|characters_to_nfkd_binary(CD,?GC_N,[])]. +characters_to_nfkd_binary(CD, _, Row, Acc) -> + characters_to_nfkd_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)). %% Canonical compose string to list of chars --spec characters_to_nfc_list(chardata()) -> [char()]. +-spec characters_to_nfc_list(chardata()) -> [char()] | {error, [char()], chardata()}. characters_to_nfc_list(CD) -> + characters_to_nfc_list(CD, []). 
+characters_to_nfc_list(CD, Acc) -> case unicode_util:nfc(CD) of - [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfc_list(Str); - [CP|Str] -> [CP|characters_to_nfc_list(Str)]; - [] -> [] + [GC|Str] when is_list(GC) -> characters_to_nfc_list(Str, lists:reverse(GC, Acc)); + [CP|Str] -> characters_to_nfc_list(Str, [CP | Acc]); + [] -> lists:reverse(Acc); + {error,Error} -> {error, lists:reverse(Acc), Error} end. --spec characters_to_nfc_binary(chardata()) -> unicode_binary(). +-spec characters_to_nfc_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}. characters_to_nfc_binary(CD) -> - list_to_binary(characters_to_nfc_binary(CD, ?GC_N, [])). + characters_to_nfc_binary(CD, ?GC_N, [], []). -characters_to_nfc_binary(CD, N, Row) when N > 0 -> +characters_to_nfc_binary(CD, N, Row, Acc) when N > 0 -> case unicode_util:nfc(CD) of - [GC|Str] -> characters_to_nfc_binary(Str, N-1, [GC|Row]); - [] -> [characters_to_binary(lists:reverse(Row))] + [GC|Str] -> characters_to_nfc_binary(Str, N-1, [GC|Row], Acc); + [] -> acc_to_binary(prepend_row_to_acc(Row, Acc)); + {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error} end; -characters_to_nfc_binary(CD, _, Row) -> - [characters_to_binary(lists:reverse(Row))|characters_to_nfc_binary(CD,?GC_N,[])]. +characters_to_nfc_binary(CD, _, Row, Acc) -> + characters_to_nfc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)). %% Compability Canonical compose string to list of chars --spec characters_to_nfkc_list(chardata()) -> [char()]. +-spec characters_to_nfkc_list(chardata()) -> [char()] | {error, [char()], chardata()}. characters_to_nfkc_list(CD) -> + characters_to_nfkc_list(CD, []). 
+characters_to_nfkc_list(CD, Acc) -> case unicode_util:nfkc(CD) of - [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfkc_list(Str); - [CP|Str] -> [CP|characters_to_nfkc_list(Str)]; - [] -> [] + [GC|Str] when is_list(GC) -> characters_to_nfkc_list(Str, lists:reverse(GC, Acc)); + [CP|Str] -> characters_to_nfkc_list(Str, [CP | Acc]); + [] -> lists:reverse(Acc); + {error,Error} -> {error, lists:reverse(Acc), Error} end. --spec characters_to_nfkc_binary(chardata()) -> unicode_binary(). +-spec characters_to_nfkc_binary(chardata()) -> unicode_binary() | {error, unicode_binary(), chardata()}. characters_to_nfkc_binary(CD) -> - list_to_binary(characters_to_nfkc_binary(CD, ?GC_N, [])). + characters_to_nfkc_binary(CD, ?GC_N, [], []). -characters_to_nfkc_binary(CD, N, Row) when N > 0 -> +characters_to_nfkc_binary(CD, N, Row, Acc) when N > 0 -> case unicode_util:nfkc(CD) of - [GC|Str] -> characters_to_nfkc_binary(Str, N-1, [GC|Row]); - [] -> [characters_to_binary(lists:reverse(Row))] + [GC|Str] -> characters_to_nfkc_binary(Str, N-1, [GC|Row], Acc); + [] -> acc_to_binary(prepend_row_to_acc(Row, Acc)); + {error, Error} -> {error, acc_to_binary(prepend_row_to_acc(Row, Acc)), Error} end; -characters_to_nfkc_binary(CD, _, Row) -> - [characters_to_binary(lists:reverse(Row))|characters_to_nfkc_binary(CD,?GC_N,[])]. +characters_to_nfkc_binary(CD, _, Row, Acc) -> + characters_to_nfkc_binary(CD, ?GC_N, [], prepend_row_to_acc(Row, Acc)). + +acc_to_binary(Acc) -> + list_to_binary(lists:reverse(Acc)). +prepend_row_to_acc(Row, Acc) -> + [characters_to_binary(lists:reverse(Row))|Acc]. %% internals |