From 34db76765561487e526fe66d3d19ecf3b3fb9dc8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Tue, 30 Aug 2011 11:51:11 +0200 Subject: Allow noncharacter code points in unicode encoding and decoding The two noncharacter code points 16#FFFE and 16#FFFF were not allowed to be encoded or decoded using the unicode module or bit syntax. That causes an inconsistency, since the noncharacters 16#FDD0 to 16#FDEF could be encoded/decoded. There is two ways to fix that inconsistency. We have chosen to allow 16#FFFE and 16#FFFF to be encoded and decoded, because the noncharacters could be useful internally within an application and it will make encoding and decoding slightly faster. Reported-by: Alisdair Sullivan --- erts/emulator/beam/beam_emu.c | 7 +++---- erts/emulator/beam/erl_bits.c | 12 +++-------- erts/emulator/beam/erl_unicode.c | 24 +++++----------------- erts/emulator/test/bs_utf_SUITE.erl | 12 +++-------- lib/compiler/test/bs_utf_SUITE.erl | 21 ------------------- .../com/ericsson/otp/erlang/OtpErlangString.java | 5 ++--- lib/stdlib/doc/src/unicode.xml | 3 +-- lib/stdlib/test/unicode_SUITE.erl | 4 +--- system/doc/reference_manual/expressions.xml | 12 +++++------ 9 files changed, 23 insertions(+), 77 deletions(-) diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c index 4b5b5cbdaa..62a14dbacf 100644 --- a/erts/emulator/beam/beam_emu.c +++ b/erts/emulator/beam/beam_emu.c @@ -3967,8 +3967,7 @@ void process_main(void) * too big numbers). */ if (is_not_small(val) || val > make_small(0x10FFFFUL) || - (make_small(0xD800UL) <= val && val <= make_small(0xDFFFUL)) || - val == make_small(0xFFFEUL) || val == make_small(0xFFFFUL)) { + (make_small(0xD800UL) <= val && val <= make_small(0xDFFFUL))) { goto badarg; } Next(2); @@ -3987,8 +3986,8 @@ void process_main(void) * the valid range). */ if (is_not_small(tmp_arg1) || tmp_arg1 > make_small(0x10FFFFUL) || - (make_small(0xD800UL) <= tmp_arg1 && tmp_arg1 <= make_small(0xDFFFUL)) || - tmp_arg1 == make_small(0xFFFEUL) || tmp_arg1 == make_small(0xFFFFUL)) { + (make_small(0xD800UL) <= tmp_arg1 && + tmp_arg1 <= make_small(0xDFFFUL))) { ErlBinMatchBuffer *mb = ms_matchbuffer(tmp_arg2); mb->offset -= 32; diff --git a/erts/emulator/beam/erl_bits.c b/erts/emulator/beam/erl_bits.c index 326a5c136b..6f7309f493 100644 --- a/erts/emulator/beam/erl_bits.c +++ b/erts/emulator/beam/erl_bits.c @@ -845,8 +845,7 @@ erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg)) dst[1] = 0x80 | (val & 0x3F); num_bits = 16; } else if (val < 0x10000UL) { - if ((0xD800 <= val && val <= 0xDFFF) || - val == 0xFFFE || val == 0xFFFF) { + if (0xD800 <= val && val <= 0xDFFF) { return 0; } dst[0] = 0xE0 | (val >> 12); @@ -886,8 +885,7 @@ erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags)) return 0; } val = unsigned_val(arg); - if (val > 0x10FFFF || (0xD800 <= val && val <= 0xDFFF) || - val == 0xFFFE || val == 0xFFFF) { + if (val > 0x10FFFF || (0xD800 <= val && val <= 0xDFFF)) { return 0; } @@ -1652,8 +1650,7 @@ erts_bs_get_utf8(ErlBinMatchBuffer* mb) return THE_NON_VALUE; } result = (((result << 6) + a) << 6) + b - (Eterm) 0x000E2080UL; - if ((0xD800 <= result && result <= 0xDFFF) || - result == 0xFFFE || result == 0xFFFF) { + if (0xD800 <= result && result <= 0xDFFF) { return THE_NON_VALUE; } mb->offset += 24; @@ -1723,9 +1720,6 @@ erts_bs_get_utf16(ErlBinMatchBuffer* mb, Uint flags) w1 = (src[0] << 8) | src[1]; } if (w1 < 0xD800 || w1 > 0xDFFF) { - if (w1 == 0xFFFE || w1 == 0xFFFF) { - return THE_NON_VALUE; - } mb->offset += 16; return make_small(w1); } else if (w1 > 0xDBFF) { diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 158eb361a4..bd5f3cc4c1 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -348,12 +348,6 @@ static int copy_utf8_bin(byte *target, byte *source, Uint size, return copied; } - if (((*source) == 0xEF) && (source[1] == 0xBF) && - ((source[2] == 0xBE) || (source[2] == 0xBF))) { - *err_pos = source; - return copied; - } - *(target++) = *(source++); *(target++) = *(source++); *(target++) = *(source++); @@ -714,9 +708,8 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */ target[(*pos)++] = (((byte) (x & 0x3F)) | ((byte) 0x80)); } else if (x < 0x10000) { - if ((x >= 0xD800 && x <= 0xDFFF) || - (x == 0xFFFE) || - (x == 0xFFFF)) { /* Invalid unicode range */ + if (x >= 0xD800 && x <= 0xDFFF) { + /* Invalid unicode range */ *err = 1; goto done; } @@ -1230,10 +1223,6 @@ int erts_analyze_utf8(byte *source, Uint size, ((source[1] & 0x20) != 0)) { return ERTS_UTF8_ERROR; } - if (((*source) == 0xEF) && (source[1] == 0xBF) && - ((source[2] == 0xBE) || (source[2] == 0xBF))) { - return ERTS_UTF8_ERROR; - } source += 3; size -= 3; } else if (((*source) & ((byte) 0xF8)) == 0xF0) { @@ -2166,9 +2155,8 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */ } else if (x < 0x800) { need += 2; } else if (x < 0x10000) { - if ((x >= 0xD800 && x <= 0xDFFF) || - (x == 0xFFFE) || - (x == 0xFFFF)) { /* Invalid unicode range */ + if (x >= 0xD800 && x <= 0xDFFF) { + /* Invalid unicode range */ DESTROY_ESTACK(stack); return ((Sint) -1); } @@ -2314,9 +2302,7 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */ *p++ = (((byte) (x & 0x3F)) | ((byte) 0x80)); } else if (x < 0x10000) { - ASSERT(!((x >= 0xD800 && x <= 0xDFFF) || - (x == 0xFFFE) || - (x == 0xFFFF))); + ASSERT(!(x >= 0xD800 && x <= 0xDFFF)); *p++ = (((byte) (x >> 12)) | ((byte) 0xE0)); *p++ = ((((byte) (x >> 6)) & 0x3F) | diff --git a/erts/emulator/test/bs_utf_SUITE.erl b/erts/emulator/test/bs_utf_SUITE.erl index 72c656c400..4ab7d674a6 100644 --- a/erts/emulator/test/bs_utf_SUITE.erl +++ b/erts/emulator/test/bs_utf_SUITE.erl @@ -64,8 +64,7 @@ end_per_group(_GroupName, Config) -> utf8_roundtrip(Config) when is_list(Config) -> ?line utf8_roundtrip(0, 16#D7FF), - ?line utf8_roundtrip(16#E000, 16#FFFD), - ?line utf8_roundtrip(16#10000, 16#10FFFF), + ?line utf8_roundtrip(16#E000, 16#10FFFF), ok. utf8_roundtrip(First, Last) when First =< Last -> @@ -91,8 +90,7 @@ utf16_roundtrip(Config) when is_list(Config) -> do_utf16_roundtrip(Fun) -> do_utf16_roundtrip(0, 16#D7FF, Fun), - do_utf16_roundtrip(16#E000, 16#FFFD, Fun), - do_utf16_roundtrip(16#10000, 16#10FFFF, Fun). + do_utf16_roundtrip(16#E000, 16#10FFFF, Fun). do_utf16_roundtrip(First, Last, Fun) when First =< Last -> Fun(First), @@ -129,8 +127,7 @@ utf32_roundtrip(Config) when is_list(Config) -> do_utf32_roundtrip(Fun) -> do_utf32_roundtrip(0, 16#D7FF, Fun), - do_utf32_roundtrip(16#E000, 16#FFFD, Fun), - do_utf32_roundtrip(16#10000, 16#10FFFF, Fun). + do_utf32_roundtrip(16#E000, 16#10FFFF, Fun). do_utf32_roundtrip(First, Last, Fun) when First =< Last -> Fun(First), @@ -158,7 +155,6 @@ utf32_little_roundtrip(Char) -> utf8_illegal_sequences(Config) when is_list(Config) -> ?line fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large. ?line fail_range(16#D800, 16#DFFF), %Reserved for UTF-16. - ?line fail_range(16#FFFE, 16#FFFF), %Non-characters. %% Illegal first character. ?line [fail(<>) || I <- lists:seq(16#80, 16#BF)], @@ -251,7 +247,6 @@ fail_1(_) -> ok. utf16_illegal_sequences(Config) when is_list(Config) -> ?line utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large. ?line utf16_fail_range(16#D800, 16#DFFF), %Reserved for UTF-16. - ?line utf16_fail_range(16#FFFE, 16#FFFF), %Non-characters. ?line lonely_hi_surrogate(16#D800, 16#DFFF), ?line leading_lo_surrogate(16#DC00, 16#DFFF), @@ -300,7 +295,6 @@ leading_lo_surrogate(_, _, _) -> ok. utf32_illegal_sequences(Config) when is_list(Config) -> ?line utf32_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large. ?line utf32_fail_range(16#D800, 16#DFFF), %Reserved for UTF-16. - ?line utf32_fail_range(16#FFFE, 16#FFFF), %Non-characters. ?line utf32_fail_range(-100, -1), ok. diff --git a/lib/compiler/test/bs_utf_SUITE.erl b/lib/compiler/test/bs_utf_SUITE.erl index f30a4d3fef..94549ad0d3 100644 --- a/lib/compiler/test/bs_utf_SUITE.erl +++ b/lib/compiler/test/bs_utf_SUITE.erl @@ -264,18 +264,10 @@ literals(Config) when is_list(Config) -> ?line {'EXIT',{badarg,_}} = (catch <<(-1)/utf32,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<(-1)/little-utf32,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf8,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf8,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/utf8,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf16,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf16,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf16,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/little-utf16,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/utf16,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/little-utf16,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf32,I/utf8>>), ?line {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf32,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf32,I/utf8>>), - ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/little-utf32,I/utf8>>), B = 16#10FFFF+1, ?line {'EXIT',{badarg,_}} = (catch <>), @@ -286,20 +278,11 @@ literals(Config) when is_list(Config) -> %% Matching of bad literals. ?line error = bad_literal_match(<<237,160,128>>), %16#D800 in UTF-8 - ?line error = bad_literal_match(<<239,191,190>>), %16#FFFE in UTF-8 - ?line error = bad_literal_match(<<239,191,191>>), %16#FFFF in UTF-8 ?line error = bad_literal_match(<<244,144,128,128>>), %16#110000 in UTF-8 - ?line error = bad_literal_match(<<255,254>>), %16#FFFE in UTF-16 - ?line error = bad_literal_match(<<255,255>>), %16#FFFF in UTF-16 - ?line error = bad_literal_match(<<16#D800:32>>), - ?line error = bad_literal_match(<<16#FFFE:32>>), - ?line error = bad_literal_match(<<16#FFFF:32>>), ?line error = bad_literal_match(<<16#110000:32>>), ?line error = bad_literal_match(<<16#D800:32/little>>), - ?line error = bad_literal_match(<<16#FFFE:32/little>>), - ?line error = bad_literal_match(<<16#FFFF:32/little>>), ?line error = bad_literal_match(<<16#110000:32/little>>), ok. @@ -314,11 +297,7 @@ match_literal(<<"bj\366rn"/big-utf16>>) -> bjorn_utf16be; match_literal(<<"bj\366rn"/little-utf16>>) -> bjorn_utf16le. bad_literal_match(<<16#D800/utf8>>) -> ok; -bad_literal_match(<<16#FFFE/utf8>>) -> ok; -bad_literal_match(<<16#FFFF/utf8>>) -> ok; bad_literal_match(<<16#110000/utf8>>) -> ok; -bad_literal_match(<<16#FFFE/utf16>>) -> ok; -bad_literal_match(<<16#FFFF/utf16>>) -> ok; bad_literal_match(<<16#D800/utf32>>) -> ok; bad_literal_match(<<16#110000/utf32>>) -> ok; bad_literal_match(<<16#D800/little-utf32>>) -> ok; diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java index 19ee92e0d0..23734bf83b 100644 --- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java +++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java @@ -166,7 +166,7 @@ public class OtpErlangString extends OtpErlangObject implements Serializable, /** * Validate a code point according to Erlang definition; Unicode 3.0. * That is; valid in the range U+0..U+10FFFF, but not in the range - * U+D800..U+DFFF (surrogat pairs), nor U+FFFE..U+FFFF (non-characters). + * U+D800..U+DFFF (surrogat pairs). * * @param cp * the code point value to validate @@ -179,8 +179,7 @@ public class OtpErlangString extends OtpErlangObject implements Serializable, // Erlang definition of valid Unicode code points; // Unicode 3.0, XML, et.al. return (cp>>>16) <= 0x10 // in 0..10FFFF; Unicode range - && (cp & ~0x7FF) != 0xD800 // not in D800..DFFF; surrogate range - && (cp & ~1) != 0xFFFE; // not in FFFE..FFFF; non-characters + && (cp & ~0x7FF) != 0xD800; // not in D800..DFFF; surrogate range } /** diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml index d02763f75c..1001ebbae4 100644 --- a/lib/stdlib/doc/src/unicode.xml +++ b/lib/stdlib/doc/src/unicode.xml @@ -203,8 +203,7 @@ greater than 16#10FFFF (the maximum unicode character), in the range 16#D800 to 16#DFFF - (invalid unicode range) - or equal to 16#FFFE or 16#FFFF (non characters) + (invalid range reserved for UTF-16 surrogate pairs) is found. diff --git a/lib/stdlib/test/unicode_SUITE.erl b/lib/stdlib/test/unicode_SUITE.erl index 9aa800209d..4055af2741 100644 --- a/lib/stdlib/test/unicode_SUITE.erl +++ b/lib/stdlib/test/unicode_SUITE.erl @@ -322,7 +322,7 @@ roundtrips(Config) when is_list(Config) -> ex_roundtrips(Config) when is_list(Config) -> ?line L1 = ranges(0, 16#D800 - 1, erlang:system_info(context_reductions) * 11), - ?line L2 = ranges(16#DFFF + 1, 16#FFFE - 1, + ?line L2 = ranges(16#DFFF + 1, 16#10000 - 1, erlang:system_info(context_reductions) * 11), %?line L3 = ranges(16#FFFF + 1, 16#10FFFF, % erlang:system_info(context_reductions) * 11), @@ -569,7 +569,6 @@ utf16_illegal_sequences_bif(Config) when is_list(Config) -> ex_utf16_illegal_sequences_bif(Config) when is_list(Config) -> ?line utf16_fail_range_bif_simple(16#10FFFF+1, 16#10FFFF+512), %Too large. ?line utf16_fail_range_bif(16#D800, 16#DFFF), %Reserved for UTF-16. - ?line utf16_fail_range_bif(16#FFFE, 16#FFFF), %Non-characters. ?line lonely_hi_surrogate_bif(16#D800, 16#DBFF,incomplete), ?line lonely_hi_surrogate_bif(16#DC00, 16#DFFF,error), @@ -644,7 +643,6 @@ utf8_illegal_sequences_bif(Config) when is_list(Config) -> ex_utf8_illegal_sequences_bif(Config) when is_list(Config) -> ?line fail_range_bif(16#10FFFF+1, 16#10FFFF+512), %Too large. ?line fail_range_bif(16#D800, 16#DFFF), %Reserved for UTF-16. - ?line fail_range_bif(16#FFFE, 16#FFFF), %Reserved (BOM). %% Illegal first character. ?line [fail_bif(<>,unicode) || I <- lists:seq(16#80, 16#BF)], diff --git a/system/doc/reference_manual/expressions.xml b/system/doc/reference_manual/expressions.xml index 497d7eb464..644896cd7f 100644 --- a/system/doc/reference_manual/expressions.xml +++ b/system/doc/reference_manual/expressions.xml @@ -879,9 +879,8 @@ Ei = Value | and UTF-32, respectively.

When constructing a segment of a utf type, Value - must be an integer in one of the ranges 0..16#D7FF, - 16#E000..16#FFFD, or 16#10000..16#10FFFF - (i.e. a valid Unicode code point). Construction + must be an integer in the range 0..16#D7FF or + 16#E000....16#10FFFF. Construction will fail with a badarg exception if Value is outside the allowed ranges. The size of the resulting binary segment depends on the type and/or Value. For utf8, @@ -896,14 +895,13 @@ Ei = Value | >]]>.

A successful match of a segment of a utf type results - in an integer in one of the ranges 0..16#D7FF, 16#E000..16#FFFD, - or 16#10000..16#10FFFF - (i.e. a valid Unicode code point). The match will fail if returned value + in an integer in the range 0..16#D7FF or 16#E000..16#10FFFF. + The match will fail if returned value would fall outside those ranges.

A segment of type utf8 will match 1 to 4 bytes in the binary, if the binary at the match position contains a valid UTF-8 sequence. - (See RFC-2279 or the Unicode standard.)

+ (See RFC-3629 or the Unicode standard.)

A segment of type utf16 may match 2 or 4 bytes in the binary. The match will fail if the binary at the match position does not contain -- cgit v1.2.3