diff options
Diffstat (limited to 'lib/stdlib/src/unicode.erl')
-rw-r--r-- | lib/stdlib/src/unicode.erl | 225 |
1 files changed, 114 insertions, 111 deletions
diff --git a/lib/stdlib/src/unicode.erl b/lib/stdlib/src/unicode.erl index 617da11ba8..cb2020c21b 100644 --- a/lib/stdlib/src/unicode.erl +++ b/lib/stdlib/src/unicode.erl @@ -1,8 +1,8 @@ %% %% %CopyrightBegin% -%% +%% %% Copyright Ericsson AB 2008-2016. All Rights Reserved. -%% +%% %% Licensed under the Apache License, Version 2.0 (the "License"); %% you may not use this file except in compliance with the License. %% You may obtain a copy of the License at @@ -14,7 +14,7 @@ %% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. %% See the License for the specific language governing permissions and %% limitations under the License. -%% +%% %% %CopyrightEnd% %% -module(unicode). @@ -102,35 +102,6 @@ characters_to_list(_, _) -> characters_to_list(ML) -> unicode:characters_to_list(ML,unicode). -characters_to_list_int(ML, Encoding) -> - try - do_characters_to_list(ML,Encoding) - catch - error:AnyError -> - TheError = case AnyError of - system_limit -> - system_limit; - _ -> - badarg - end, - {'EXIT',{new_stacktrace,[{Mod,_,L,_}|Rest]}} = - (catch erlang:error(new_stacktrace, - [ML,Encoding])), - erlang:raise(error,TheError,[{Mod,characters_to_list,L}|Rest]) - end. - -% XXX: Optimize me! -do_characters_to_list(ML, Encoding) -> - case unicode:characters_to_binary(ML,Encoding) of - Bin when is_binary(Bin) -> - unicode:characters_to_list(Bin,utf8); - {error,Encoded,Rest} -> - {error,unicode:characters_to_list(Encoded,utf8),Rest}; - {incomplete, Encoded2, Rest2} -> - {incomplete,unicode:characters_to_list(Encoded2,utf8),Rest2} - end. - - -spec characters_to_binary(Data) -> Result when Data :: latin1_chardata() | chardata() | external_chardata(), Result :: binary() @@ -154,24 +125,6 @@ characters_to_binary(ML) -> [ML])), erlang:raise(error,TheError,[{Mod,characters_to_binary,L}|Rest]) end. - - -characters_to_binary_int(ML,InEncoding) -> - try - characters_to_binary_int(ML,InEncoding,unicode) - catch - error:AnyError -> - TheError = case AnyError of - system_limit -> - system_limit; - _ -> - badarg - end, - {'EXIT',{new_stacktrace,[{Mod,_,L,_}|Rest]}} = - (catch erlang:error(new_stacktrace, - [ML,InEncoding])), - erlang:raise(error,TheError,[{Mod,characters_to_binary,L}|Rest]) - end. -spec characters_to_binary(Data, InEncoding, OutEncoding) -> Result when Data :: latin1_chardata() | chardata() | external_chardata(), @@ -192,7 +145,7 @@ characters_to_binary(ML, latin1, Uni) when is_binary(ML) and ((Uni =:= utf8) or try characters_to_binary_int(ML,latin1,utf8) catch - error:AnyError -> + error:AnyError -> TheError = case AnyError of system_limit -> system_limit; @@ -228,7 +181,7 @@ characters_to_binary(ML,Uni,latin1) when is_binary(ML) and ((Uni =:= utf8) or [{Mod,characters_to_binary,L}|Rest]) end end; - + characters_to_binary(ML, InEncoding, OutEncoding) -> try characters_to_binary_int(ML,InEncoding,OutEncoding) @@ -246,53 +199,6 @@ characters_to_binary(ML, InEncoding, OutEncoding) -> erlang:raise(error,TheError,[{Mod,characters_to_binary,L}|Rest]) end. -characters_to_binary_int(ML, InEncoding, OutEncoding) when - InEncoding =:= latin1, OutEncoding =:= unicode; - InEncoding =:= latin1, OutEncoding =:= utf8; - InEncoding =:= unicode, OutEncoding =:= unicode; - InEncoding =:= unicode, OutEncoding =:= utf8; - InEncoding =:= utf8, OutEncoding =:= unicode; - InEncoding =:= utf8, OutEncoding =:= utf8 -> - unicode:characters_to_binary(ML,InEncoding); - -characters_to_binary_int(ML, InEncoding, OutEncoding) -> - {InTrans,Limit} = case OutEncoding of - latin1 -> {i_trans_chk(InEncoding),255}; - _ -> {i_trans(InEncoding),case InEncoding of latin1 -> 255; _ -> 16#10FFFF end} - end, - OutTrans = o_trans(OutEncoding), - Res = - ml_map(ML, - fun(Part,Accum) when is_binary(Part) -> - case InTrans(Part) of - List when is_list(List) -> - Tail = OutTrans(List), - <<Accum/binary, Tail/binary>>; - {error, Translated, Rest} -> - Tail = OutTrans(Translated), - {error, <<Accum/binary,Tail/binary>>, Rest}; - {incomplete, Translated, Rest, Missing} -> - Tail = OutTrans(Translated), - {incomplete, <<Accum/binary,Tail/binary>>, Rest, - Missing} - end; - (Part, Accum) when is_integer(Part), Part =< Limit -> - case OutTrans([Part]) of - Binary when is_binary(Binary) -> - <<Accum/binary, Binary/binary>>; - {error, _, [Part]} -> - {error,Accum,[Part]} - end; - (Part, Accum) -> - {error, Accum, [Part]} - end,<<>>), - case Res of - {incomplete,A,B,_} -> - {incomplete,A,B}; - _ -> - Res - end. - -spec bom_to_encoding(Bin) -> {Encoding, Length} when Bin :: binary(), Encoding :: 'latin1' | 'utf8' @@ -335,11 +241,108 @@ encoding_to_bom({utf32,little}) -> <<255,254,0,0>>; encoding_to_bom(latin1) -> <<>>. - -cbv(utf8,<<1:1,1:1,0:1,_:5>>) -> + +%% internals + +characters_to_list_int(ML, Encoding) -> + try + do_characters_to_list(ML,Encoding) + catch + error:AnyError -> + TheError = case AnyError of + system_limit -> + system_limit; + _ -> + badarg + end, + {'EXIT',{new_stacktrace,[{Mod,_,L,_}|Rest]}} = + (catch erlang:error(new_stacktrace, + [ML,Encoding])), + erlang:raise(error,TheError,[{Mod,characters_to_list,L}|Rest]) + end. + + % XXX: Optimize me! +do_characters_to_list(ML, Encoding) -> + case unicode:characters_to_binary(ML,Encoding) of + Bin when is_binary(Bin) -> + unicode:characters_to_list(Bin,utf8); + {error,Encoded,Rest} -> + {error,unicode:characters_to_list(Encoded,utf8),Rest}; + {incomplete, Encoded2, Rest2} -> + {incomplete,unicode:characters_to_list(Encoded2,utf8),Rest2} + end. + + +characters_to_binary_int(ML,InEncoding) -> + try + characters_to_binary_int(ML,InEncoding,unicode) + catch + error:AnyError -> + TheError = case AnyError of + system_limit -> + system_limit; + _ -> + badarg + end, + {'EXIT',{new_stacktrace,[{Mod,_,L,_}|Rest]}} = + (catch erlang:error(new_stacktrace, + [ML,InEncoding])), + erlang:raise(error,TheError,[{Mod,characters_to_binary,L}|Rest]) + end. + + +characters_to_binary_int(ML, InEncoding, OutEncoding) when + InEncoding =:= latin1, OutEncoding =:= unicode; + InEncoding =:= latin1, OutEncoding =:= utf8; + InEncoding =:= unicode, OutEncoding =:= unicode; + InEncoding =:= unicode, OutEncoding =:= utf8; + InEncoding =:= utf8, OutEncoding =:= unicode; + InEncoding =:= utf8, OutEncoding =:= utf8 -> + unicode:characters_to_binary(ML,InEncoding); + +characters_to_binary_int(ML, InEncoding, OutEncoding) -> + {InTrans,Limit} = case OutEncoding of + latin1 -> {i_trans_chk(InEncoding),255}; + _ -> {i_trans(InEncoding),case InEncoding of latin1 -> 255; _ -> 16#10FFFF end} + end, + OutTrans = o_trans(OutEncoding), + Res = + ml_map(ML, + fun(Part,Accum) when is_binary(Part) -> + case InTrans(Part) of + List when is_list(List) -> + Tail = OutTrans(List), + <<Accum/binary, Tail/binary>>; + {error, Translated, Rest} -> + Tail = OutTrans(Translated), + {error, <<Accum/binary,Tail/binary>>, Rest}; + {incomplete, Translated, Rest, Missing} -> + Tail = OutTrans(Translated), + {incomplete, <<Accum/binary,Tail/binary>>, Rest, + Missing} + end; + (Part, Accum) when is_integer(Part), Part =< Limit -> + case OutTrans([Part]) of + Binary when is_binary(Binary) -> + <<Accum/binary, Binary/binary>>; + {error, _, [Part]} -> + {error,Accum,[Part]} + end; + (Part, Accum) -> + {error, Accum, [Part]} + end,<<>>), + case Res of + {incomplete,A,B,_} -> + {incomplete,A,B}; + _ -> + Res + end. + + +cbv(utf8,<<1:1,1:1,0:1,_:5>>) -> 1; -cbv(utf8,<<1:1,1:1,1:1,0:1,_:4,R/binary>>) -> +cbv(utf8,<<1:1,1:1,1:1,0:1,_:4,R/binary>>) -> case R of <<>> -> 2; @@ -386,18 +389,18 @@ cbv({utf32,big}, <<0:8>>) -> 3; cbv({utf32,big}, <<0:8,X:8>>) when X =< 16 -> 2; -cbv({utf32,big}, <<0:8,X:8,Y:8>>) +cbv({utf32,big}, <<0:8,X:8,Y:8>>) when X =< 16, ((X > 0) or ((Y =< 215) or (Y >= 224))) -> 1; cbv({utf32,big},_) -> false; cbv({utf32,little},<<_:8>>) -> 3; -cbv({utf32,little},<<_:8,_:8>>) -> +cbv({utf32,little},<<_:8,_:8>>) -> 2; cbv({utf32,little},<<X:8,255:8,0:8>>) when X =:= 254; X =:= 255 -> false; -cbv({utf32,little},<<_:8,Y:8,X:8>>) +cbv({utf32,little},<<_:8,Y:8,X:8>>) when X =< 16, ((X > 0) or ((Y =< 215) or (Y >= 224))) -> 1; cbv({utf32,little},_) -> @@ -417,8 +420,8 @@ ml_map([Part|T],Fun,Accum) when is_integer(Part) -> Bin2 when is_binary(Bin2) -> Bin2; {error, Converted, Rest} -> - {error, Converted, Rest}; - {incomplete, Converted, Rest,X} -> + {error, Converted, Rest}; + {incomplete, Converted, Rest,X} -> {incomplete, Converted, Rest,X} end; % Can not be incomplete - it's an integer @@ -471,7 +474,7 @@ ml_map(Part,Fun,Accum) when is_binary(Part), byte_size(Part) > 8192 -> ml_map(Bin,Fun,Accum) when is_binary(Bin) -> Fun(Bin,Accum). - + @@ -523,7 +526,7 @@ o_trans(utf8) -> <<One/utf8>> end, L) end; - + o_trans(utf16) -> fun(L) -> do_o_binary(fun(One) -> @@ -577,9 +580,9 @@ do_o_binary2(F,[H|T]) -> [Bin|Bin3] end end. - + %% Specific functions only allowing codepoints in latin1 range - + do_i_utf8_chk(<<>>) -> []; do_i_utf8_chk(<<U/utf8,R/binary>>) when U =< 255 -> |