From 34db76765561487e526fe66d3d19ecf3b3fb9dc8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Tue, 30 Aug 2011 11:51:11 +0200
Subject: Allow noncharacter code points in unicode encoding and decoding

The two noncharacter code points 16#FFFE and 16#FFFF were not
allowed to be encoded or decoded using the unicode module or
bit syntax. That causes an inconsistency, since the noncharacters
16#FDD0 to 16#FDEF could be encoded/decoded.

There are two ways to fix that inconsistency.

We have chosen to allow 16#FFFE and 16#FFFF to be encoded and
decoded, because the noncharacters could be useful internally
within an application and it will make encoding and decoding
slightly faster.

Reported-by: Alisdair Sullivan
---
 erts/emulator/beam/beam_emu.c                      |  7 +++----
 erts/emulator/beam/erl_bits.c                      | 12 +++--------
 erts/emulator/beam/erl_unicode.c                   | 24 +++++-----------------
 erts/emulator/test/bs_utf_SUITE.erl                | 12 +++--------
 lib/compiler/test/bs_utf_SUITE.erl                 | 21 -------------------
 .../com/ericsson/otp/erlang/OtpErlangString.java   |  5 ++---
 lib/stdlib/doc/src/unicode.xml                     |  3 +--
 lib/stdlib/test/unicode_SUITE.erl                  |  4 +---
 system/doc/reference_manual/expressions.xml        | 12 +++++------
 9 files changed, 23 insertions(+), 77 deletions(-)

diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c
index 4b5b5cbdaa..62a14dbacf 100644
--- a/erts/emulator/beam/beam_emu.c
+++ b/erts/emulator/beam/beam_emu.c
@@ -3967,8 +3967,7 @@ void process_main(void)
       * too big numbers).
       */
      if (is_not_small(val) || val > make_small(0x10FFFFUL) ||
-	 (make_small(0xD800UL) <= val && val <= make_small(0xDFFFUL)) ||
-	 val == make_small(0xFFFEUL) || val == make_small(0xFFFFUL)) {
+	 (make_small(0xD800UL) <= val && val <= make_small(0xDFFFUL))) {
 	 goto badarg;
      }
      Next(2);
@@ -3987,8 +3986,8 @@ void process_main(void)
       * the valid range).
       */
      if (is_not_small(tmp_arg1) || tmp_arg1 > make_small(0x10FFFFUL) ||
-	 (make_small(0xD800UL) <= tmp_arg1 && tmp_arg1 <= make_small(0xDFFFUL)) ||
-	 tmp_arg1 == make_small(0xFFFEUL) || tmp_arg1 == make_small(0xFFFFUL)) {
+	 (make_small(0xD800UL) <= tmp_arg1 &&
+	  tmp_arg1 <= make_small(0xDFFFUL))) {
 	 ErlBinMatchBuffer *mb = ms_matchbuffer(tmp_arg2);
 
 	 mb->offset -= 32;
diff --git a/erts/emulator/beam/erl_bits.c b/erts/emulator/beam/erl_bits.c
index 326a5c136b..6f7309f493 100644
--- a/erts/emulator/beam/erl_bits.c
+++ b/erts/emulator/beam/erl_bits.c
@@ -845,8 +845,7 @@ erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
 	dst[1] = 0x80 | (val & 0x3F);
 	num_bits = 16;
     } else if (val < 0x10000UL) {
-	if ((0xD800 <= val && val <= 0xDFFF) ||
-	    val == 0xFFFE || val == 0xFFFF) {
+	if (0xD800 <= val && val <= 0xDFFF) {
 	    return 0;
 	}
 	dst[0] = 0xE0 | (val >> 12);
@@ -886,8 +885,7 @@ erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
 	return 0;
     }
     val = unsigned_val(arg);
-    if (val > 0x10FFFF || (0xD800 <= val && val <= 0xDFFF) ||
-	val == 0xFFFE || val == 0xFFFF) {
+    if (val > 0x10FFFF || (0xD800 <= val && val <= 0xDFFF)) {
 	return 0;
     }
 
@@ -1652,8 +1650,7 @@ erts_bs_get_utf8(ErlBinMatchBuffer* mb)
 	    return THE_NON_VALUE;
 	}
 	result = (((result << 6) + a) << 6) + b - (Eterm) 0x000E2080UL;
-	if ((0xD800 <= result && result <= 0xDFFF) ||
-	    result == 0xFFFE || result == 0xFFFF) {
+	if (0xD800 <= result && result <= 0xDFFF) {
 	    return THE_NON_VALUE;
 	}
 	mb->offset += 24;
@@ -1723,9 +1720,6 @@ erts_bs_get_utf16(ErlBinMatchBuffer* mb, Uint flags)
 	w1 = (src[0] << 8) | src[1];
     }
     if (w1 < 0xD800 || w1 > 0xDFFF) {
-	if (w1 == 0xFFFE || w1 == 0xFFFF) {
-	    return THE_NON_VALUE;
-	}
 	mb->offset += 16;
 	return make_small(w1);
     } else if (w1 > 0xDBFF) {
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 158eb361a4..bd5f3cc4c1 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -348,12 +348,6 @@ static int copy_utf8_bin(byte *target, byte *source, Uint size,
 		return copied;
 	    }
 
-	    if (((*source) == 0xEF) && (source[1] == 0xBF) &&
-		((source[2] == 0xBE) || (source[2] == 0xBF))) {
-		*err_pos = source;
-		return copied;
-	    }
-		
 	    *(target++) = *(source++);
 	    *(target++) = *(source++);
 	    *(target++) = *(source++);
@@ -714,9 +708,8 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 			    target[(*pos)++] = (((byte) (x & 0x3F)) | 
 						((byte) 0x80));
 			} else if (x < 0x10000) {
-			    if ((x >= 0xD800 && x <= 0xDFFF) ||
-				(x == 0xFFFE) ||
-				(x == 0xFFFF)) { /* Invalid unicode range */
+			    if (x >= 0xD800 && x <= 0xDFFF) {
+				/* Invalid unicode range */
 				*err = 1;
 				goto done;
 			    }
@@ -1230,10 +1223,6 @@ int erts_analyze_utf8(byte *source, Uint size,
 		((source[1] & 0x20) != 0)) {
 		return ERTS_UTF8_ERROR;
 	    }
-	    if (((*source) == 0xEF) && (source[1] == 0xBF) &&
-		((source[2] == 0xBE) || (source[2] == 0xBF))) {
-		return ERTS_UTF8_ERROR;
-	    }
 	    source += 3;
 	    size -= 3;
 	} else if (((*source) & ((byte) 0xF8)) == 0xF0) {
@@ -2166,9 +2155,8 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 			    } else if (x < 0x800) {
 				need += 2;
 			    } else if (x < 0x10000) {
-				if ((x >= 0xD800 && x <= 0xDFFF) ||
-				    (x == 0xFFFE) ||
-				    (x == 0xFFFF)) { /* Invalid unicode range */
+				if (x >= 0xD800 && x <= 0xDFFF) {
+				    /* Invalid unicode range */
 				    DESTROY_ESTACK(stack);
 				    return ((Sint) -1);
 				}
@@ -2314,9 +2302,7 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 				*p++ = (((byte) (x & 0x3F)) | 
 					((byte) 0x80));
 			    } else if (x < 0x10000) {
-				ASSERT(!((x >= 0xD800 && x <= 0xDFFF) ||
-					 (x == 0xFFFE) ||
-					 (x == 0xFFFF)));
+				ASSERT(!(x >= 0xD800 && x <= 0xDFFF));
 				*p++ = (((byte) (x >> 12)) | 
 					((byte) 0xE0));
 				*p++ = ((((byte) (x >> 6)) & 0x3F)  | 
diff --git a/erts/emulator/test/bs_utf_SUITE.erl b/erts/emulator/test/bs_utf_SUITE.erl
index 72c656c400..4ab7d674a6 100644
--- a/erts/emulator/test/bs_utf_SUITE.erl
+++ b/erts/emulator/test/bs_utf_SUITE.erl
@@ -64,8 +64,7 @@ end_per_group(_GroupName, Config) ->
 
 utf8_roundtrip(Config) when is_list(Config) ->
     ?line utf8_roundtrip(0, 16#D7FF),
-    ?line utf8_roundtrip(16#E000, 16#FFFD),
-    ?line utf8_roundtrip(16#10000, 16#10FFFF),
+    ?line utf8_roundtrip(16#E000, 16#10FFFF),
     ok.
 
 utf8_roundtrip(First, Last) when First =< Last ->
@@ -91,8 +90,7 @@ utf16_roundtrip(Config) when is_list(Config) ->
 
 do_utf16_roundtrip(Fun) ->
     do_utf16_roundtrip(0, 16#D7FF, Fun),
-    do_utf16_roundtrip(16#E000, 16#FFFD, Fun),
-    do_utf16_roundtrip(16#10000, 16#10FFFF, Fun).
+    do_utf16_roundtrip(16#E000, 16#10FFFF, Fun).
 
 do_utf16_roundtrip(First, Last, Fun) when First =< Last ->
     Fun(First),
@@ -129,8 +127,7 @@ utf32_roundtrip(Config) when is_list(Config) ->
 
 do_utf32_roundtrip(Fun) ->
     do_utf32_roundtrip(0, 16#D7FF, Fun),
-    do_utf32_roundtrip(16#E000, 16#FFFD, Fun),
-    do_utf32_roundtrip(16#10000, 16#10FFFF, Fun).
+    do_utf32_roundtrip(16#E000, 16#10FFFF, Fun).
 
 do_utf32_roundtrip(First, Last, Fun) when First =< Last ->
     Fun(First),
@@ -158,7 +155,6 @@ utf32_little_roundtrip(Char) ->
 utf8_illegal_sequences(Config) when is_list(Config) ->
     ?line fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
     ?line fail_range(16#D800, 16#DFFF),		%Reserved for UTF-16.
-    ?line fail_range(16#FFFE, 16#FFFF),		%Non-characters.
 
     %% Illegal first character.
     ?line [fail(<<I,16#8F,16#8F,16#8F>>) || I <- lists:seq(16#80, 16#BF)],
@@ -251,7 +247,6 @@ fail_1(_) -> ok.
 utf16_illegal_sequences(Config) when is_list(Config) ->
     ?line utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
     ?line utf16_fail_range(16#D800, 16#DFFF),		%Reserved for UTF-16.
-    ?line utf16_fail_range(16#FFFE, 16#FFFF),		%Non-characters.
 
     ?line lonely_hi_surrogate(16#D800, 16#DFFF),
     ?line leading_lo_surrogate(16#DC00, 16#DFFF),
@@ -300,7 +295,6 @@ leading_lo_surrogate(_, _, _) -> ok.
 utf32_illegal_sequences(Config) when is_list(Config) ->
     ?line utf32_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
     ?line utf32_fail_range(16#D800, 16#DFFF),		%Reserved for UTF-16.
-    ?line utf32_fail_range(16#FFFE, 16#FFFF),		%Non-characters.
     ?line utf32_fail_range(-100, -1),
     ok.
 
diff --git a/lib/compiler/test/bs_utf_SUITE.erl b/lib/compiler/test/bs_utf_SUITE.erl
index f30a4d3fef..94549ad0d3 100644
--- a/lib/compiler/test/bs_utf_SUITE.erl
+++ b/lib/compiler/test/bs_utf_SUITE.erl
@@ -264,18 +264,10 @@ literals(Config) when is_list(Config) ->
     ?line {'EXIT',{badarg,_}} = (catch <<(-1)/utf32,I/utf8>>),
     ?line {'EXIT',{badarg,_}} = (catch <<(-1)/little-utf32,I/utf8>>),
     ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf8,I/utf8>>),
-    ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf8,I/utf8>>),
-    ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/utf8,I/utf8>>),
     ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf16,I/utf8>>),
     ?line {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf16,I/utf8>>),
-    ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf16,I/utf8>>),
-    ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/little-utf16,I/utf8>>),
-    ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/utf16,I/utf8>>),
-    ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/little-utf16,I/utf8>>),
     ?line {'EXIT',{badarg,_}} = (catch <<16#D800/utf32,I/utf8>>),
     ?line {'EXIT',{badarg,_}} = (catch <<16#D800/little-utf32,I/utf8>>),
-    ?line {'EXIT',{badarg,_}} = (catch <<16#FFFE/utf32,I/utf8>>),
-    ?line {'EXIT',{badarg,_}} = (catch <<16#FFFF/little-utf32,I/utf8>>),
 
     B = 16#10FFFF+1,
     ?line {'EXIT',{badarg,_}} = (catch <<B/utf8>>),
@@ -286,20 +278,11 @@ literals(Config) when is_list(Config) ->
 
     %% Matching of bad literals.
     ?line error = bad_literal_match(<<237,160,128>>), %16#D800 in UTF-8
-    ?line error = bad_literal_match(<<239,191,190>>), %16#FFFE in UTF-8
-    ?line error = bad_literal_match(<<239,191,191>>), %16#FFFF in UTF-8
     ?line error = bad_literal_match(<<244,144,128,128>>), %16#110000 in UTF-8
 
-    ?line error = bad_literal_match(<<255,254>>), %16#FFFE in UTF-16
-    ?line error = bad_literal_match(<<255,255>>), %16#FFFF in UTF-16
-
     ?line error = bad_literal_match(<<16#D800:32>>),
-    ?line error = bad_literal_match(<<16#FFFE:32>>),
-    ?line error = bad_literal_match(<<16#FFFF:32>>),
     ?line error = bad_literal_match(<<16#110000:32>>),
     ?line error = bad_literal_match(<<16#D800:32/little>>),
-    ?line error = bad_literal_match(<<16#FFFE:32/little>>),
-    ?line error = bad_literal_match(<<16#FFFF:32/little>>),
     ?line error = bad_literal_match(<<16#110000:32/little>>),
 
     ok.
@@ -314,11 +297,7 @@ match_literal(<<"bj\366rn"/big-utf16>>) -> bjorn_utf16be;
 match_literal(<<"bj\366rn"/little-utf16>>) -> bjorn_utf16le.
 
 bad_literal_match(<<16#D800/utf8>>) -> ok;
-bad_literal_match(<<16#FFFE/utf8>>) -> ok;
-bad_literal_match(<<16#FFFF/utf8>>) -> ok;
 bad_literal_match(<<16#110000/utf8>>) -> ok;
-bad_literal_match(<<16#FFFE/utf16>>) -> ok;
-bad_literal_match(<<16#FFFF/utf16>>) -> ok;
 bad_literal_match(<<16#D800/utf32>>) -> ok;
 bad_literal_match(<<16#110000/utf32>>) -> ok;
 bad_literal_match(<<16#D800/little-utf32>>) -> ok;
diff --git a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java
index 19ee92e0d0..23734bf83b 100644
--- a/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java
+++ b/lib/jinterface/java_src/com/ericsson/otp/erlang/OtpErlangString.java
@@ -166,7 +166,7 @@ public class OtpErlangString extends OtpErlangObject implements Serializable,
     /**
      * Validate a code point according to Erlang definition; Unicode 3.0.
      * That is; valid in the range U+0..U+10FFFF, but not in the range
-     * U+D800..U+DFFF (surrogat pairs), nor U+FFFE..U+FFFF (non-characters).
+     * U+D800..U+DFFF (surrogate pairs).
      *
      * @param  cp
      *             the code point value to validate
@@ -179,8 +179,7 @@ public class OtpErlangString extends OtpErlangObject implements Serializable,
 	// Erlang definition of valid Unicode code points; 
 	// Unicode 3.0, XML, et.al.
 	return (cp>>>16) <= 0x10 // in 0..10FFFF; Unicode range
-	    && (cp & ~0x7FF) != 0xD800 // not in D800..DFFF; surrogate range
-	    && (cp & ~1) != 0xFFFE; // not in FFFE..FFFF; non-characters
+	    && (cp & ~0x7FF) != 0xD800; // not in D800..DFFF; surrogate range
     }
 
     /**
diff --git a/lib/stdlib/doc/src/unicode.xml b/lib/stdlib/doc/src/unicode.xml
index d02763f75c..1001ebbae4 100644
--- a/lib/stdlib/doc/src/unicode.xml
+++ b/lib/stdlib/doc/src/unicode.xml
@@ -203,8 +203,7 @@
 	     <item>greater than <c>16#10FFFF</c>
 	     (the maximum unicode character),</item>
 	     <item>in the range <c>16#D800</c> to <c>16#DFFF</c>
-	     (invalid unicode range)</item>
-	     <item>or equal to 16#FFFE or 16#FFFF (non characters)</item>
+	       (invalid range reserved for UTF-16 surrogate pairs)</item>
 	   </list>
 	   is found.
 	   </item>
diff --git a/lib/stdlib/test/unicode_SUITE.erl b/lib/stdlib/test/unicode_SUITE.erl
index 9aa800209d..4055af2741 100644
--- a/lib/stdlib/test/unicode_SUITE.erl
+++ b/lib/stdlib/test/unicode_SUITE.erl
@@ -322,7 +322,7 @@ roundtrips(Config) when is_list(Config) ->
 ex_roundtrips(Config) when is_list(Config) ->
     ?line L1 = ranges(0, 16#D800 - 1, 
 		      erlang:system_info(context_reductions) * 11),
-    ?line L2 = ranges(16#DFFF + 1, 16#FFFE - 1, 
+    ?line L2 = ranges(16#DFFF + 1, 16#10000 - 1,
 		      erlang:system_info(context_reductions) * 11),
     %?line L3 = ranges(16#FFFF + 1, 16#10FFFF, 
     %		      erlang:system_info(context_reductions) * 11),
@@ -569,7 +569,6 @@ utf16_illegal_sequences_bif(Config) when is_list(Config) ->
 ex_utf16_illegal_sequences_bif(Config) when is_list(Config) ->
     ?line utf16_fail_range_bif_simple(16#10FFFF+1, 16#10FFFF+512), %Too large.
     ?line utf16_fail_range_bif(16#D800, 16#DFFF),		%Reserved for UTF-16.
-    ?line utf16_fail_range_bif(16#FFFE, 16#FFFF),		%Non-characters.
 
     ?line lonely_hi_surrogate_bif(16#D800, 16#DBFF,incomplete),
     ?line lonely_hi_surrogate_bif(16#DC00, 16#DFFF,error),
@@ -644,7 +643,6 @@ utf8_illegal_sequences_bif(Config) when is_list(Config) ->
 ex_utf8_illegal_sequences_bif(Config) when is_list(Config) ->
     ?line fail_range_bif(16#10FFFF+1, 16#10FFFF+512), %Too large.
     ?line fail_range_bif(16#D800, 16#DFFF),		%Reserved for UTF-16.
-    ?line fail_range_bif(16#FFFE, 16#FFFF),		%Reserved (BOM).
 
     %% Illegal first character.
     ?line [fail_bif(<<I,16#8F,16#8F,16#8F>>,unicode) || I <- lists:seq(16#80, 16#BF)],
diff --git a/system/doc/reference_manual/expressions.xml b/system/doc/reference_manual/expressions.xml
index 497d7eb464..644896cd7f 100644
--- a/system/doc/reference_manual/expressions.xml
+++ b/system/doc/reference_manual/expressions.xml
@@ -879,9 +879,8 @@ Ei = Value |
     and UTF-32, respectively.</p>
 
     <p>When constructing a segment of a <c>utf</c> type, <c>Value</c>
-    must be an integer in one of the ranges 0..16#D7FF,
-    16#E000..16#FFFD, or 16#10000..16#10FFFF
-    (i.e. a valid Unicode code point). Construction
+    must be an integer in the range 0..16#D7FF or
+    16#E000..16#10FFFF. Construction
     will fail with a <c>badarg</c> exception if <c>Value</c> is
     outside the allowed ranges. The size of the resulting binary
     segment depends on the type and/or <c>Value</c>. For <c>utf8</c>,
@@ -896,14 +895,13 @@ Ei = Value |
     <c><![CDATA[<<$a/utf8,$b/utf8,$c/utf8>>]]></c>.</p>
 
     <p>A successful match of a segment of a <c>utf</c> type results
-    in an integer in one of the ranges 0..16#D7FF, 16#E000..16#FFFD,
-    or 16#10000..16#10FFFF
-    (i.e. a valid Unicode code point). The match will fail if returned value
+    in an integer in the range 0..16#D7FF or 16#E000..16#10FFFF.
+    The match will fail if the returned value
     would fall outside those ranges.</p>
 
     <p>A segment of type <c>utf8</c> will match 1 to 4 bytes in the binary,
     if the binary at the match position contains a valid UTF-8 sequence.
-    (See RFC-2279 or the Unicode standard.)</p>
+    (See RFC-3629 or the Unicode standard.)</p>
 
     <p>A segment of type <c>utf16</c> may match 2 or 4 bytes in the binary.
     The match will fail if the binary at the match position does not contain
-- 
cgit v1.2.3