aboutsummaryrefslogtreecommitdiffstats
path: root/lib/stdlib/src/unicode.erl
diff options
context:
space:
mode:
authorDan Gudmundsson <[email protected]>2017-01-27 15:27:37 +0100
committerDan Gudmundsson <[email protected]>2017-04-24 12:16:56 +0200
commit75fc94b8b462d7b7f6dd4b706bbe32cff77ee575 (patch)
treeda96f42d849544e674cb40c9376f2ccf57a14385 /lib/stdlib/src/unicode.erl
parente6a31e66961314a5b74cdb5ffcdab89092c31bc7 (diff)
downloadotp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.tar.gz
otp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.tar.bz2
otp-75fc94b8b462d7b7f6dd4b706bbe32cff77ee575.zip
Add nf(k)d, nf(k)c conversion functions to unicode module
Diffstat (limited to 'lib/stdlib/src/unicode.erl')
-rw-r--r--lib/stdlib/src/unicode.erl93
1 files changed, 92 insertions, 1 deletions
diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl
index cb2020c21b..59499021cb 100644
--- a/lib/stdlib/src/unicode.erl
+++ b/lib/stdlib/src/unicode.erl
@@ -22,7 +22,12 @@
-export([characters_to_list/1, characters_to_list_int/2,
characters_to_binary/1, characters_to_binary_int/2,
characters_to_binary/3,
- bom_to_encoding/1, encoding_to_bom/1]).
+ bom_to_encoding/1, encoding_to_bom/1,
+ characters_to_nfd_list/1, characters_to_nfd_binary/1,
+ characters_to_nfc_list/1, characters_to_nfc_binary/1,
+ characters_to_nfkd_list/1, characters_to_nfkd_binary/1,
+ characters_to_nfkc_list/1, characters_to_nfkc_binary/1
+ ]).
-export_type([chardata/0, charlist/0, encoding/0, external_chardata/0,
external_charlist/0, latin1_char/0, latin1_chardata/0,
@@ -242,6 +247,92 @@ encoding_to_bom({utf32,little}) ->
encoding_to_bom(latin1) ->
<<>>.
+-define(GC_N, 200). %% arbitrary number
+
+%% Canonical decompose string to list of chars
+-spec characters_to_nfd_list(chardata()) -> [char()].
+characters_to_nfd_list(CD) ->
+ case unicode_util:nfd(CD) of
+ [GC|Str] when is_list(GC) -> GC++characters_to_nfd_list(Str);
+ [CP|Str] -> [CP|characters_to_nfd_list(Str)];
+ [] -> []
+ end.
+
+-spec characters_to_nfd_binary(chardata()) -> unicode_binary().
+characters_to_nfd_binary(CD) ->
+ list_to_binary(characters_to_nfd_binary(CD, ?GC_N, [])).
+
+characters_to_nfd_binary(CD, N, Row) when N > 0 ->
+ case unicode_util:nfd(CD) of
+ [GC|Str] -> characters_to_nfd_binary(Str, N-1, [GC|Row]);
+ [] -> [characters_to_binary(lists:reverse(Row))]
+ end;
+characters_to_nfd_binary(CD, _, Row) ->
+ [characters_to_binary(lists:reverse(Row))|characters_to_nfd_binary(CD,?GC_N,[])].
+
+%% Compability Canonical decompose string to list of chars.
+-spec characters_to_nfkd_list(chardata()) -> [char()].
+characters_to_nfkd_list(CD) ->
+ case unicode_util:nfkd(CD) of
+ [GC|Str] when is_list(GC) -> GC++characters_to_nfkd_list(Str);
+ [CP|Str] -> [CP|characters_to_nfkd_list(Str)];
+ [] -> []
+ end.
+
+-spec characters_to_nfkd_binary(chardata()) -> unicode_binary().
+characters_to_nfkd_binary(CD) ->
+ list_to_binary(characters_to_nfkd_binary(CD, ?GC_N, [])).
+
+characters_to_nfkd_binary(CD, N, Row) when N > 0 ->
+ case unicode_util:nfkd(CD) of
+ [GC|Str] -> characters_to_nfkd_binary(Str, N-1, [GC|Row]);
+ [] -> [characters_to_binary(lists:reverse(Row))]
+ end;
+characters_to_nfkd_binary(CD, _, Row) ->
+ [characters_to_binary(lists:reverse(Row))|characters_to_nfkd_binary(CD,?GC_N,[])].
+
+
+%% Canonical compose string to list of chars
+-spec characters_to_nfc_list(chardata()) -> [char()].
+characters_to_nfc_list(CD) ->
+ case unicode_util:nfc(CD) of
+ [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfc_list(Str);
+ [CP|Str] -> [CP|characters_to_nfc_list(Str)];
+ [] -> []
+ end.
+
+-spec characters_to_nfc_binary(chardata()) -> unicode_binary().
+characters_to_nfc_binary(CD) ->
+ list_to_binary(characters_to_nfc_binary(CD, ?GC_N, [])).
+
+characters_to_nfc_binary(CD, N, Row) when N > 0 ->
+ case unicode_util:nfc(CD) of
+ [GC|Str] -> characters_to_nfc_binary(Str, N-1, [GC|Row]);
+ [] -> [characters_to_binary(lists:reverse(Row))]
+ end;
+characters_to_nfc_binary(CD, _, Row) ->
+ [characters_to_binary(lists:reverse(Row))|characters_to_nfc_binary(CD,?GC_N,[])].
+
+%% Compability Canonical compose string to list of chars
+-spec characters_to_nfkc_list(chardata()) -> [char()].
+characters_to_nfkc_list(CD) ->
+ case unicode_util:nfkc(CD) of
+ [CPs|Str] when is_list(CPs) -> CPs ++ characters_to_nfkc_list(Str);
+ [CP|Str] -> [CP|characters_to_nfkc_list(Str)];
+ [] -> []
+ end.
+
+-spec characters_to_nfkc_binary(chardata()) -> unicode_binary().
+characters_to_nfkc_binary(CD) ->
+ list_to_binary(characters_to_nfkc_binary(CD, ?GC_N, [])).
+
+characters_to_nfkc_binary(CD, N, Row) when N > 0 ->
+ case unicode_util:nfkc(CD) of
+ [GC|Str] -> characters_to_nfkc_binary(Str, N-1, [GC|Row]);
+ [] -> [characters_to_binary(lists:reverse(Row))]
+ end;
+characters_to_nfkc_binary(CD, _, Row) ->
+ [characters_to_binary(lists:reverse(Row))|characters_to_nfkc_binary(CD,?GC_N,[])].
%% internals