diff options
author | Erlang/OTP <[email protected]> | 2009-11-20 14:54:40 +0000 |
---|---|---|
committer | Erlang/OTP <[email protected]> | 2009-11-20 14:54:40 +0000 |
commit | 84adefa331c4159d432d22840663c38f155cd4c1 (patch) | |
tree | bff9a9c66adda4df2106dfd0e5c053ab182a12bd /lib/xmerl/src/xmerl_ucs.erl | |
download | otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.gz otp-84adefa331c4159d432d22840663c38f155cd4c1.tar.bz2 otp-84adefa331c4159d432d22840663c38f155cd4c1.zip |
The R13B03 release.OTP_R13B03
Diffstat (limited to 'lib/xmerl/src/xmerl_ucs.erl')
-rw-r--r-- | lib/xmerl/src/xmerl_ucs.erl | 556 |
1 files changed, 556 insertions, 0 deletions
diff --git a/lib/xmerl/src/xmerl_ucs.erl b/lib/xmerl/src/xmerl_ucs.erl new file mode 100644 index 0000000000..7c45c838ab --- /dev/null +++ b/lib/xmerl/src/xmerl_ucs.erl @@ -0,0 +1,556 @@ +%% +%% %CopyrightBegin% +%% +%% Copyright Ericsson AB 2005-2009. All Rights Reserved. +%% +%% The contents of this file are subject to the Erlang Public License, +%% Version 1.1, (the "License"); you may not use this file except in +%% compliance with the License. You should have received a copy of the +%% Erlang Public License along with this software. If not, it can be +%% retrieved online at http://www.erlang.org/. +%% +%% Software distributed under the License is distributed on an "AS IS" +%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See +%% the License for the specific language governing rights and limitations +%% under the License. +%% +%% %CopyrightEnd% +%% + +-module(xmerl_ucs). + +-compile([verbose,report_warnings,warn_unused_vars]). + + + +%%% Conversion to/from IANA recognised character sets +-export([to_unicode/2]). + +%%% Micellaneous predicates +-export([is_iso10646/1, is_unicode/1, is_bmpchar/1, is_latin1/1, is_ascii/1, + is_visible_latin1/1, is_visible_ascii/1, is_iso646_basic/1, + is_incharset/2]). + +%%% Conversion to/from RFC-1345 style mnemonic strings consisting +%%% of subsets of ISO-10646 with "escape" sequences. +%-export([from_mnemonic/1, from_mnemonic/2]). + +%%% UCS-2, UCS-4, UTF-16, and UTF-8 encoding and decoding +-export([to_ucs2be/1,from_ucs2be/1, from_ucs2be/2]). +-export([to_ucs2le/1,from_ucs2le/1, from_ucs2le/2]). +-export([to_ucs4be/1,from_ucs4be/1, from_ucs4be/2]). +-export([to_ucs4le/1,from_ucs4le/1, from_ucs4le/2]). +-export([to_utf16be/1, from_utf16be/1, from_utf16be/2]). +-export([to_utf16le/1, from_utf16le/1, from_utf16le/2]). +-export([to_utf8/1, from_utf8/1]). + +%%% NB: Non-canonical UTF-8 encodings and incorrectly used +%%% surrogate-pair codes are disallowed by this code. There are +%%% important security implications concerning them. DO NOT REMOVE +%%% THE VARIOUS GUARDS AND TESTS THAT ENFORCE THIS POLICY. + +%%% Test if Ch is a legitimate ISO-10646 character code +is_iso10646(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch < 16#D800 -> true; + Ch < 16#E000 -> false; % Surrogates + Ch < 16#FFFE -> true; + Ch =< 16#FFFF -> false; % FFFE and FFFF (not characters) + Ch =< 16#7FFFFFFF -> true; + true -> false + end; +is_iso10646(_) -> false. + +%%% Test if Ch is a legitimate ISO-10646 character code capable of +%%% being encoded in a UTF-16 string. +is_unicode(Ch) when Ch < 16#110000 -> is_iso10646(Ch); +is_unicode(_) -> false. + +%%% Test if Ch is a legitimate ISO-10646 character code belonging to +%%% the basic multi-lingual plane (BMP). +is_bmpchar(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch < 16#D800 -> true; + Ch < 16#E000 -> false; % Surrogates + Ch < 16#FFFE -> true; + true -> false + end; +is_bmpchar(_) -> false. + +%%% Test for legitimate Latin-1 code +is_latin1(Ch) when is_integer(Ch), Ch >= 0, Ch =< 255 -> true; +is_latin1(_) -> false. + +%%% Test for legitimate ASCII code +is_ascii(Ch) when is_integer(Ch), Ch >= 0, Ch =< 127 -> true; +is_ascii(_) -> false. + +%%% Test for char an element of ISO-646.basic set +is_iso646_basic(Ch) when is_integer(Ch), Ch >= $\s -> + if Ch =< $Z -> + %% Everything in this range except $# $$ and $@ + if Ch > $$ -> Ch =/= $@; + true -> Ch < $# + end; + %% Only $_ and $a .. $z in range above $Z + Ch > $z -> false; + Ch >= $a -> true; + true -> Ch =:= $_ + end; +is_iso646_basic(_) -> + false. + +%%% Test for char a visible Latin-1 char, i.e. a non-control Latin-1 char, +%%% excepting non-break space (but including space). +is_visible_latin1(Ch) when is_integer(Ch), Ch >= $\s -> + if Ch =< $~ -> true; + Ch >= 161 -> Ch =< 255 + end; +is_visible_latin1(_) -> + false. + +%%% Test for char a visible ASCII char, i.e. a non-control ASCII char +%%% (including space). +is_visible_ascii(Ch) when is_integer(Ch), Ch >= $\s -> Ch =< $~; +is_visible_ascii(_) -> false. + + +%%% UCS-4, big and little endian versions, encoding and decoding +to_ucs4be(List) when is_list(List) -> lists:flatmap(fun to_ucs4be/1, List); +to_ucs4be(Ch) -> char_to_ucs4be(Ch). + +from_ucs4be(Bin) when is_binary(Bin) -> from_ucs4be(Bin,[],[]); +from_ucs4be(List) -> from_ucs4be(list_to_binary(List),[],[]). + +from_ucs4be(Bin,Tail) when is_binary(Bin) -> from_ucs4be(Bin,[],Tail); +from_ucs4be(List,Tail) -> from_ucs4be(list_to_binary(List),[],Tail). + +to_ucs4le(List) when is_list(List) -> lists:flatmap(fun to_ucs4le/1, List); +to_ucs4le(Ch) -> char_to_ucs4le(Ch). + +from_ucs4le(Bin) when is_binary(Bin) -> from_ucs4le(Bin,[],[]); +from_ucs4le(List) -> from_ucs4le(list_to_binary(List),[],[]). + +from_ucs4le(Bin,Tail) when is_binary(Bin) -> from_ucs4le(Bin,[],Tail); +from_ucs4le(List,Tail) -> from_ucs4le(list_to_binary(List),[],Tail). + +%%% UCS-2, big and little endian versions, encoding and decoding +to_ucs2be(List) when is_list(List) -> lists:flatmap(fun to_ucs2be/1, List); +to_ucs2be(Ch) -> char_to_ucs2be(Ch). + +from_ucs2be(Bin) when is_binary(Bin) -> from_ucs2be(Bin,[],[]); +from_ucs2be(List) -> from_ucs2be(list_to_binary(List),[],[]). + +from_ucs2be(Bin,Tail) when is_binary(Bin) -> from_ucs2be(Bin,[],Tail); +from_ucs2be(List,Tail) -> from_ucs2be(list_to_binary(List),[],Tail). + +to_ucs2le(List) when is_list(List) -> lists:flatmap(fun to_ucs2le/1, List); +to_ucs2le(Ch) -> char_to_ucs2le(Ch). + +from_ucs2le(Bin) when is_binary(Bin) -> from_ucs2le(Bin,[],[]); +from_ucs2le(List) -> from_ucs2le(list_to_binary(List),[],[]). + +from_ucs2le(Bin,Tail) when is_binary(Bin) -> from_ucs2le(Bin,[],Tail); +from_ucs2le(List,Tail) -> from_ucs2le(list_to_binary(List),[],Tail). + + +%%% UTF-16, big and little endian versions, encoding and decoding +to_utf16be(List) when is_list(List) -> lists:flatmap(fun to_utf16be/1, List); +to_utf16be(Ch) -> char_to_utf16be(Ch). + +from_utf16be(Bin) when is_binary(Bin) -> from_utf16be(Bin,[],[]); +from_utf16be(List) -> from_utf16be(list_to_binary(List),[],[]). + +from_utf16be(Bin,Tail) when is_binary(Bin) -> from_utf16be(Bin,[],Tail); +from_utf16be(List,Tail) -> from_utf16be(list_to_binary(List),[],Tail). + +to_utf16le(List) when is_list(List) -> lists:flatmap(fun to_utf16le/1, List); +to_utf16le(Ch) -> char_to_utf16le(Ch). + +from_utf16le(Bin) when is_binary(Bin) -> from_utf16le(Bin,[],[]); +from_utf16le(List) -> from_utf16le(list_to_binary(List),[],[]). + +from_utf16le(Bin,Tail) when is_binary(Bin) -> from_utf16le(Bin,[],Tail); +from_utf16le(List,Tail) -> from_utf16le(list_to_binary(List),[],Tail). + + +%%% UTF-8 encoding and decoding +to_utf8(List) when is_list(List) -> lists:flatmap(fun to_utf8/1, List); +to_utf8(Ch) -> char_to_utf8(Ch). + +from_utf8(Bin) when is_binary(Bin) -> from_utf8(binary_to_list(Bin)); +from_utf8(List) -> + case expand_utf8(List) of + {Result,0} -> Result; + {_Res,_NumBadChar} -> + exit({ucs,{bad_utf8_character_code}}) + end. + + + + +%%% UCS-4 support +%%% Possible errors encoding UCS-4: +%%% - Non-character values (something other than 0 .. 2^31-1) +%%% - Surrogate-pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +%%% Possible errors decoding UCS-4: +%%% - Element out of range (i.e. the "sign" bit is set). +%%% - Surrogate-pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +char_to_ucs4be(Ch) -> + true = is_iso10646(Ch), + [(Ch bsr 24), + (Ch bsr 16) band 16#FF, + (Ch bsr 8) band 16#FF, + Ch band 16#FF]. + +from_ucs4be(<<Ch:32/big-signed-integer, Rest/binary>>,Acc,Tail) -> + if Ch < 0; Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF -> + exit({bad_character_code,Ch}); + true -> + from_ucs4be(Rest,[Ch|Acc],Tail) + end; +from_ucs4be(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_ucs4be(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_ucs4be}. + +char_to_ucs4le(Ch) -> + true = is_iso10646(Ch), + [Ch band 16#FF, + (Ch bsr 8) band 16#FF, + (Ch bsr 16) band 16#FF, + (Ch bsr 24)]. + + +from_ucs4le(<<Ch:32/little-signed-integer, Rest/binary>>,Acc,Tail) -> + if Ch < 0; Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF -> + exit({bad_character_code,Ch}); + true -> + from_ucs4le(Rest,[Ch|Acc],Tail) + end; +from_ucs4le(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_ucs4le(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_ucs4le}. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% UCS-2 support +%%% FIXME! Don't know how to encode UCS-2!! +%%% Currently I just encode as UCS-4, but strips the 16 higher bits. +char_to_ucs2be(Ch) -> + true = is_iso10646(Ch), + [(Ch bsr 8) band 16#FF, + Ch band 16#FF]. + +from_ucs2be(<<Ch:16/big-signed-integer, Rest/binary>>,Acc,Tail) -> + if Ch < 0; Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF -> + exit({bad_character_code,Ch}); + true -> + from_ucs2be(Rest,[Ch|Acc],Tail) + end; +from_ucs2be(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_ucs2be(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_ucs2be}. + +char_to_ucs2le(Ch) -> + true = is_iso10646(Ch), + [(Ch bsr 16) band 16#FF, + (Ch bsr 24)]. + + +from_ucs2le(<<Ch:16/little-signed-integer, Rest/binary>>,Acc,Tail) -> + if Ch < 0; Ch >= 16#D800, Ch < 16#E000; Ch =:= 16#FFFE; Ch =:= 16#FFFF -> + exit({bad_character_code,Ch}); + true -> + from_ucs4le(Rest,[Ch|Acc],Tail) + end; +from_ucs2le(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_ucs2le(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_ucs2le}. + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% UTF-16 support +%%% Possible errors encoding UTF-16 +%%% - Non-character values (something other than 0 .. 2^31-1) +%%% - Surrogate-pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +%%% NB: the UCS replacement char (U+FFFD) will be quietly substituted +%%% for unrepresentable chars (i.e. those geq to 2^20+2^16). +%%% Possible errors decoding UTF-16: +%%% - Unmatched surrogate-pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +char_to_utf16be(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch =< 16#FFFF -> + if Ch < 16#D800; Ch >= 16#E000, Ch < 16#FFFE -> + [Ch bsr 8, Ch band 16#FF] + end; + Ch < 16#110000 -> + %% Encode with surrogate pair + X = Ch - 16#10000, + [16#D8 + (X bsr 18), + (X bsr 10) band 16#FF, + 16#DC + ((X bsr 8) band 3), + X band 16#FF]; + Ch =< 16#7FFFFFFF -> + %% Unrepresentable char: use REPLACEMENT CHARACTER (U+FFFD) + [16#FF, 16#FD] + end. + +from_utf16be(<<Ch:16/big-unsigned-integer, Rest/binary>>, Acc, Tail) + when Ch < 16#D800; Ch > 16#DFFF -> + if Ch < 16#FFFE -> from_utf16be(Rest,[Ch|Acc],Tail) end; +from_utf16be(<<Hi:16/big-unsigned-integer, Lo:16/big-unsigned-integer, + Rest/binary>>, Acc, Tail) + when Hi >= 16#D800, Hi < 16#DC00, Lo >= 16#DC00, Lo =< 16#DFFF -> + %% Surrogate pair + Ch = ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF) + 16#10000, + from_utf16be(Rest, [Ch|Acc], Tail); +from_utf16be(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_utf16be(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_utf16be}. + +char_to_utf16le(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch =< 16#FFFF -> + if Ch < 16#D800; Ch >= 16#E000, Ch < 16#FFFE -> + [Ch band 16#FF, Ch bsr 8] + end; + Ch < 16#110000 -> + %% Encode with surrogate pair + X = Ch - 16#10000, + [(X bsr 10) band 16#FF, + 16#D8 + (X bsr 18), + X band 16#FF, + 16#DC + ((X bsr 8) band 3)]; + Ch =< 16#7FFFFFFF -> + %% Unrepresentable char: use REPLACEMENT CHARACTER (U+FFFD) + [16#FD, 16#FF] + end. + +from_utf16le(<<Ch:16/little-unsigned-integer, Rest/binary>>, Acc, Tail) + when Ch < 16#D800; Ch > 16#DFFF -> + if Ch < 16#FFFE -> from_utf16le(Rest, [Ch|Acc], Tail) end; +from_utf16le(<<Hi:16/little-unsigned-integer, Lo:16/little-unsigned-integer, + Rest/binary>>, Acc, Tail) + when Hi >= 16#D800, Hi < 16#DC00, Lo >= 16#DC00, Lo =< 16#DFFF -> + %% Surrogate pair + Ch = ((Hi band 16#3FF) bsl 10) + (Lo band 16#3FF) + 16#10000, + from_utf16le(Rest, [Ch|Acc], Tail); +from_utf16le(<<>>,Acc,Tail) -> + lists:reverse(Acc,Tail); +from_utf16le(Bin,Acc,Tail) -> + io:format("ucs Error: Bin=~p~n Acc=~p~n Tail=~p~n",[Bin,Acc,Tail]), + {error,not_utf16le}. + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%%% UTF-8 support +%%% Possible errors encoding UTF-8: +%%% - Non-character values (something other than 0 .. 2^31-1). +%%% - Surrogate pair code in string. +%%% - 16#FFFE or 16#FFFF character in string. +%%% Possible errors decoding UTF-8: +%%% - 10xxxxxx or 1111111x as initial byte. +%%% - Insufficient number of 10xxxxxx octets following an initial octet of +%%% multi-octet sequence. +%%% - Non-canonical encoding used. +%%% - Surrogate-pair code encoded as UTF-8. +%%% - 16#FFFE or 16#FFFF character in string. +char_to_utf8(Ch) when is_integer(Ch), Ch >= 0 -> + if Ch < 128 -> + %% 0yyyyyyy + [Ch]; + Ch < 16#800 -> + %% 110xxxxy 10yyyyyy + [16#C0 + (Ch bsr 6), + 128+(Ch band 16#3F)]; + Ch < 16#10000 -> + %% 1110xxxx 10xyyyyy 10yyyyyy + if Ch < 16#D800; Ch > 16#DFFF, Ch < 16#FFFE -> + [16#E0 + (Ch bsr 12), + 128+((Ch bsr 6) band 16#3F), + 128+(Ch band 16#3F)] + end; + Ch < 16#200000 -> + %% 11110xxx 10xxyyyy 10yyyyyy 10yyyyyy + [16#F0+(Ch bsr 18), + 128+((Ch bsr 12) band 16#3F), + 128+((Ch bsr 6) band 16#3F), + 128+(Ch band 16#3F)]; + Ch < 16#4000000 -> + %% 111110xx 10xxxyyy 10yyyyyy 10yyyyyy 10yyyyyy + [16#F8+(Ch bsr 24), + 128+((Ch bsr 18) band 16#3F), + 128+((Ch bsr 12) band 16#3F), + 128+((Ch bsr 6) band 16#3F), + 128+(Ch band 16#3F)]; + Ch < 16#80000000 -> + %% 1111110x 10xxxxyy 10yyyyyy 10yyyyyy 10yyyyyy 10yyyyyy + [16#FC+(Ch bsr 30), + 128+((Ch bsr 24) band 16#3F), + 128+((Ch bsr 18) band 16#3F), + 128+((Ch bsr 12) band 16#3F), + 128+((Ch bsr 6) band 16#3F), + 128+(Ch band 16#3F)] + end. + + + + +%% expand_utf8([Byte]) -> {[UnicodeChar],NumberOfBadBytes} +%% Expand UTF8 byte sequences to ISO 10646/Unicode +%% charactes. Any illegal bytes are removed and the number of +%% bad bytes are returned. +%% +%% Reference: +%% RFC 3629: "UTF-8, a transformation format of ISO 10646". + +expand_utf8(Str) -> + expand_utf8_1(Str, [], 0). + +expand_utf8_1([C|Cs], Acc, Bad) when C < 16#80 -> + %% Plain Ascii character. + expand_utf8_1(Cs, [C|Acc], Bad); +expand_utf8_1([C1,C2|Cs], Acc, Bad) when C1 band 16#E0 =:= 16#C0, + C2 band 16#C0 =:= 16#80 -> + case ((C1 band 16#1F) bsl 6) bor (C2 band 16#3F) of + C when 16#80 =< C -> + expand_utf8_1(Cs, [C|Acc], Bad); + _ -> + %% Bad range. + expand_utf8_1(Cs, Acc, Bad+1) + end; +expand_utf8_1([C1,C2,C3|Cs], Acc, Bad) when C1 band 16#F0 =:= 16#E0, + C2 band 16#C0 =:= 16#80, + C3 band 16#C0 =:= 16#80 -> + case ((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor + (C3 band 16#3F) of + C when 16#800 =< C -> + expand_utf8_1(Cs, [C|Acc], Bad); + _ -> + %% Bad range. + expand_utf8_1(Cs, Acc, Bad+1) + end; +expand_utf8_1([C1,C2,C3,C4|Cs], Acc, Bad) when C1 band 16#F8 =:= 16#F0, + C2 band 16#C0 =:= 16#80, + C3 band 16#C0 =:= 16#80, + C4 band 16#C0 =:= 16#80 -> + case ((((((C1 band 16#0F) bsl 6) bor (C2 band 16#3F)) bsl 6) bor + (C3 band 16#3F)) bsl 6) bor (C4 band 16#3F) of + C when 16#10000 =< C -> + expand_utf8_1(Cs, [C|Acc], Bad); + _ -> + %% Bad range. + expand_utf8_1(Cs, Acc, Bad+1) + end; +expand_utf8_1([_|Cs], Acc, Bad) -> + %% Ignore bad character. + expand_utf8_1(Cs, Acc, Bad+1); +expand_utf8_1([], Acc, Bad) -> {lists:reverse(Acc),Bad}. + + + +%%% ---------------------------------------------------------------------------- +%%% Translation to/from any IANA defined character set, given that a mapping +%%% exists. Don't care about validating valid subsets of Unicode +to_unicode(Input,Cs) when Cs=='ansi_x3.4-1968';Cs=='iso-ir-6'; + Cs=='ansi_x3.4-1986';Cs=='iso_646.irv:1991'; + Cs=='ascii';Cs=='iso646-us';Cs=='us-ascii';Cs=='us'; + Cs=='ibm367';Cs=='cp367';Cs=='csascii' -> % US-ASCII + Input; +to_unicode(Input,Cs) when Cs=='iso-10646-utf-1';Cs=='csiso10646utf1' -> + Input; +to_unicode(Input,Cs) when Cs=='iso_646.basic:1983';Cs=='ref'; + Cs=='csiso646basic1983' -> + Input; +to_unicode(Input,Cs) when Cs=='iso_8859-1:1987';Cs=='iso-ir-100'; + Cs=='iso_8859-1';Cs=='iso-8859-1';Cs=='latin1'; + Cs=='l1';Cs=='ibm819'; + Cs=='cp819';Cs=='csisolatin1' -> + Input; +% to_unicode(Input,Cs) when Cs=='mnemonic';Cs=='"mnemonic+ascii+38'; +% Cs=='mnem';Cs=='"mnemonic+ascii+8200' -> +% from_mnemonic(Input); +to_unicode(Input,Cs) when Cs=='iso-10646-ucs-2';Cs=='csunicode' -> + from_ucs2be(Input); % Guess byteorder +to_unicode(Input,Cs) when Cs=='iso-10646-ucs-4';Cs=='csucs4' -> + from_ucs4be(Input); % Guess byteorder +to_unicode(Input,Cs) when Cs=='utf-16be';Cs=='utf-16' -> + from_utf16be(Input); +to_unicode(Input,'utf-16le') -> + from_utf16le(Input); +to_unicode(Input,'utf-8') -> + from_utf8(Input); +to_unicode(Input,Charset) -> + exit({bad_character_code,Input,Charset}). + %ucs_data:to_unicode(Input,Charset). + + + + +%%% Tests if Char is in Charset. +%%% Do this by trying to convert it into unicode, if possible a mapping was +%%% found and we are ok. +is_incharset(In,Cs) when Cs=='ansi_x3.4-1968';Cs=='iso-ir-6'; + Cs=='ansi_x3.4-1986';Cs=='iso_646.irv:1991'; + Cs=='ascii';Cs=='iso646-us';Cs=='us-ascii';Cs=='us'; + Cs=='ibm367';Cs=='cp367';Cs=='csascii' -> % US-ASCII + if + is_integer(In) -> is_ascii(In); + is_list(In) -> test_charset(fun is_ascii/1,In) + end; +is_incharset(In,Cs) when Cs=='iso-10646-utf-1';Cs=='csiso10646utf1' -> + if + is_integer(In) -> is_unicode(In); + is_list(In) -> test_charset(fun is_unicode/1, In) + end; +is_incharset(In,Cs) when Cs=='iso_646.basic:1983';Cs=='ref'; + Cs=='csiso646basic1983' -> + if + is_integer(In) -> is_iso646_basic(In); + is_list(In) -> test_charset(fun is_iso646_basic/1, In) + end; +is_incharset(In,Cs) when Cs=='iso_8859-1:1987';Cs=='iso-ir-100'; + Cs=='iso_8859-1';Cs=='iso-8859-1'; + Cs=='latin1';Cs=='l1';Cs=='ibm819'; + Cs=='cp819';Cs=='csisolatin1' -> + if + is_integer(In) -> is_latin1(In); + is_list(In) -> test_charset(fun is_latin1/1, In) + end; +is_incharset(In,Charset) when is_integer(In) -> + case to_unicode([In],Charset) of + {error,unsupported_charset} -> + {error,unsupported_charset}; + {error,_} -> + false; + [Int] when is_integer(Int) -> + true + end; +is_incharset(In,Charset) when is_list(In) -> + case to_unicode(In,Charset) of + {error,unsupported_charset} -> + {error,unsupported_charset}; + {error,_} -> + false; + [Int] when is_integer(Int) -> + true + end. + + +test_charset(Fun,Input) -> + case lists:all(Fun, Input) of + true -> + true; + _ -> + false + end. + |