From 34db76765561487e526fe66d3d19ecf3b3fb9dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Tue, 30 Aug 2011 11:51:11 +0200 Subject: Allow noncharacter code points in unicode encoding and decoding The two noncharacter code points 16#FFFE and 16#FFFF were not allowed to be encoded or decoded using the unicode module or bit syntax. That causes an inconsistency, since the noncharacters 16#FDD0 to 16#FDEF could be encoded/decoded. There is two ways to fix that inconsistency. We have chosen to allow 16#FFFE and 16#FFFF to be encoded and decoded, because the noncharacters could be useful internally within an application and it will make encoding and decoding slightly faster. Reported-by: Alisdair Sullivan --- lib/compiler/test/bs_utf_SUITE.erl | 21 --------------------- .../com/ericsson/otp/erlang/OtpErlangString.java | 5 ++--- lib/stdlib/doc/src/unicode.xml | 3 +-- lib/stdlib/test/unicode_SUITE.erl | 4 +--- 4 files changed, 4 insertions(+), 29 deletions(-) (limited to 'lib') diff --git a/lib/compiler/test/bs_utf_SUITE.erl b/lib/compiler/test/bs_utf_SUITE.erl index f30a4d3fef..94549ad0d3 100644 --- a/lib/compiler/test/bs_utf_SUITE.erl +++ b/lib/compiler/test/bs_utf_SUITE.erl @@ -264,18 +264,10 @@ literals(Config) when is_list(Config) -> ?line {'EXIT',{badarg,_}} = (catch <<(-1)/utf32,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<(-1)/little-utf32,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf8,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf8,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/utf8,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf16,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf16,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf16,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/little-utf16,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/utf16,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/little-utf16,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf32,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf32,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf32,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/little-utf32,I/utf8>>), B = 16#10FFFF+1, ?line {'EXIT',{badarg,_}} = (catch <>), @@ -286,20 +278,11 @@ literals(Config) when is_list(Config) -> %% Matching of bad literals. ?line error = bad_literal_match(<<237,160,128>>), %16#D800 in UTF-8 - ?line error = bad_literal_match(<<239,191,190>>), %16#FFFE in UTF-8 - ?line error = bad_literal_match(<<239,191,191>>), %16#FFFF in UTF-8 ?line error = bad_literal_match(<<244,144,128,128>>), %16#110000 in UTF-8 - ?line error = bad_literal_match(<<255,254>>), %16#FFFE in UTF-16 - ?line error = bad_literal_match(<<255,255>>), %16#FFFF in UTF-16 - ?line error = bad_literal_match(<<16#D800:32>>), - ?line error = bad_literal_match(<<16#FFFE:32>>), - ?line error = bad_literal_match(<<16#FFFF:32>>), ?line error = bad_literal_match(<<16#110000:32>>), ?line error = bad_literal_match(<<16#D800:32/little>>), - ?line error = bad_literal_match(<<16#FFFE:32/little>>), - ?line error = bad_literal_match(<<16#FFFF:32/little>>), ?line error = bad_literal_match(<<16#110000:32/little>>), ok. @@ -314,11 +297,7 @@ match_literal(<<"bj\366rn"/big-utf16>>) -> bjorn_utf16be; match_literal(<<"bj\366rn"/little-utf16>>) -> bjorn_utf16le. bad_literal_match(<<16#D800/utf8>>) -> ok; -bad_literal_match(<<16#FFFE/utf8>>) -> ok; -bad_literal_match(<<16#FFFF/utf8>>) -> ok; bad_literal_match(<<16#110000/utf8>>) -> ok; -bad_literal_match(<<16#FFFE/utf16>>) -> ok; -bad_literal_match(<<16#FFFF/utf16>>) -> ok; bad_literal_match(<<16#D800/utf32>>) -> ok; bad_literal_match(<<16#110000/utf32>>) -> ok; bad_literal_match(<<16#D800/little-utf32>>) -> ok; diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java index 19ee92e0d0..23734bf83b 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java @@ -166,7 +166,7 @@ public class OtpErlangString extends OtpErlangObject implements Serializable, /** * Validate a code point according to Erlang definition; Unicode 3.0. * That is; valid in the range U+0..U+10FFFF, but not in the range - * U+D800..U+DFFF (surrogat pairs), nor U+FFFE..U+FFFF (non-characters). + * U+D800..U+DFFF (surrogat pairs). * * @param cp * the code point value to validate @@ -179,8 +179,7 @@ public class OtpErlangString extends OtpErlangObject implements Serializable, // Erlang definition of valid Unicode code points; // Unicode 3.0, XML, et.al. return (cp>>>16) <= 0x10 // in 0..10FFFF; Unicode range - && (cp & ~0x7FF) != 0xD800 // not in D800..DFFF; surrogate range - && (cp & ~1) != 0xFFFE; // not in FFFE..FFFF; non-characters + && (cp & ~0x7FF) != 0xD800; // not in D800..DFFF; surrogate range } /** diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml index d02763f75c..1001ebbae4 100644 --- a/lib/stdlib/doc/src/unicode.xml +++ b/lib/stdlib/doc/src/unicode.xml @@ -203,8 +203,7 @@ greater than 16#10FFFF (the maximum unicode character), in the range 16#D800 to 16#DFFF - (invalid unicode range) - or equal to 16#FFFE or 16#FFFF (non characters) + (invalid range reserved for UTF-16 surrogate pairs) is found. diff --git a/lib/stdlib/test/unicode_SUITE.erl b/lib/stdlib/test/unicode_SUITE.erl index 9aa800209d..4055af2741 100644 --- a/lib/stdlib/test/unicode_SUITE.erl +++ b/lib/stdlib/test/unicode_SUITE.erl @@ -322,7 +322,7 @@ roundtrips(Config) when is_list(Config) -> ex_roundtrips(Config) when is_list(Config) -> ?line L1 = ranges(0, 16#D800 - 1, erlang:system_info(context_reductions) * 11), - ?line L2 = ranges(16#DFFF + 1, 16#FFFE - 1, + ?line L2 = ranges(16#DFFF + 1, 16#10000 - 1, erlang:system_info(context_reductions) * 11), %?line L3 = ranges(16#FFFF + 1, 16#10FFFF, % erlang:system_info(context_reductions) * 11), @@ -569,7 +569,6 @@ utf16_illegal_sequences_bif(Config) when is_list(Config) -> ex_utf16_illegal_sequences_bif(Config) when is_list(Config) -> ?line utf16_fail_range_bif_simple(16#10FFFF+1, 16#10FFFF+512), %Too large. ?line utf16_fail_range_bif(16#D800, 16#DFFF), %Reserved for UTF-16. - ?line utf16_fail_range_bif(16#FFFE, 16#FFFF), %Non-characters. ?line lonely_hi_surrogate_bif(16#D800, 16#DBFF,incomplete), ?line lonely_hi_surrogate_bif(16#DC00, 16#DFFF,error), @@ -644,7 +643,6 @@ utf8_illegal_sequences_bif(Config) when is_list(Config) -> ex_utf8_illegal_sequences_bif(Config) when is_list(Config) -> ?line fail_range_bif(16#10FFFF+1, 16#10FFFF+512), %Too large. ?line fail_range_bif(16#D800, 16#DFFF), %Reserved for UTF-16. - ?line fail_range_bif(16#FFFE, 16#FFFF), %Reserved (BOM). %% Illegal first character. ?line [fail_bif(<>,unicode) || I <- lists:seq(16#80, 16#BF)], -- cgit v1.2.3