Allow noncharacter code points in unicode encoding and decoding

The two noncharacter code points 16#FFFE and 16#FFFF were not allowed to be encoded or decoded using the unicode module or bit syntax. That causes an inconsistency, since the noncharacters 16#FDD0 to 16#FDEF could be encoded/decoded. There are two ways to fix that inconsistency. We have chosen to allow 16#FFFE and 16#FFFF to be encoded and decoded, because the noncharacters could be useful internally within an application and it will make encoding and decoding slightly faster. Reported-by: Alisdair Sullivan
author: Björn Gustavsson <[email protected]> 2011-08-30 11:51:11 +0200
committer: Björn Gustavsson <[email protected]> 2011-10-13 14:16:00 +0200
commit: 34db76765561487e526fe66d3d19ecf3b3fb9dc8 (patch)
tree: 9141e3c5729e46d03c8b27b14da3b29b1e54abca /erts/emulator
parent: 6ca6dd3c670fb8185ebb9a20c2a731a7375c1cac (diff)
download: otp-34db76765561487e526fe66d3d19ecf3b3fb9dc8.tar.gz
otp-34db76765561487e526fe66d3d19ecf3b3fb9dc8.tar.bz2
otp-34db76765561487e526fe66d3d19ecf3b3fb9dc8.zip
4 files changed, 14 insertions, 41 deletions
diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c
index 4b5b5cbdaa..62a14dbacf 100644
--- a/erts/emulator/beam/beam_emu.c
+++ b/erts/emulator/beam/beam_emu.c
@@ -3967,8 +3967,7 @@ void process_main(void)
       * too big numbers).
       */
      if (is_not_small(val) || val > make_small(0x10FFFFUL) ||
-	 (make_small(0xD800UL) <= val && val <= make_small(0xDFFFUL)) ||
-	 val == make_small(0xFFFEUL) || val == make_small(0xFFFFUL)) {
+	 (make_small(0xD800UL) <= val && val <= make_small(0xDFFFUL))) {
 	 goto badarg;
      }
      Next(2);
@@ -3987,8 +3986,8 @@ void process_main(void)
       * the valid range).
       */
      if (is_not_small(tmp_arg1) || tmp_arg1 > make_small(0x10FFFFUL) ||
-	 (make_small(0xD800UL) <= tmp_arg1 && tmp_arg1 <= make_small(0xDFFFUL)) ||
-	 tmp_arg1 == make_small(0xFFFEUL) || tmp_arg1 == make_small(0xFFFFUL)) {
+	 (make_small(0xD800UL) <= tmp_arg1 &&
+	  tmp_arg1 <= make_small(0xDFFFUL))) {
 	 ErlBinMatchBuffer *mb = ms_matchbuffer(tmp_arg2);
 
 	 mb->offset -= 32;
diff --git a/erts/emulator/beam/erl_bits.c b/erts/emulator/beam/erl_bits.c
index 326a5c136b..6f7309f493 100644
--- a/erts/emulator/beam/erl_bits.c
+++ b/erts/emulator/beam/erl_bits.c
@@ -845,8 +845,7 @@ erts_bs_put_utf8(ERL_BITS_PROTO_1(Eterm arg))
 	dst[1] = 0x80 | (val & 0x3F);
 	num_bits = 16;
     } else if (val < 0x10000UL) {
-	if ((0xD800 <= val && val <= 0xDFFF) ||
-	    val == 0xFFFE || val == 0xFFFF) {
+	if (0xD800 <= val && val <= 0xDFFF) {
 	    return 0;
 	}
 	dst[0] = 0xE0 | (val >> 12);
@@ -886,8 +885,7 @@ erts_bs_put_utf16(ERL_BITS_PROTO_2(Eterm arg, Uint flags))
 	return 0;
     }
     val = unsigned_val(arg);
-    if (val > 0x10FFFF || (0xD800 <= val && val <= 0xDFFF) ||
-	val == 0xFFFE || val == 0xFFFF) {
+    if (val > 0x10FFFF || (0xD800 <= val && val <= 0xDFFF)) {
 	return 0;
     }
 
@@ -1652,8 +1650,7 @@ erts_bs_get_utf8(ErlBinMatchBuffer* mb)
 	    return THE_NON_VALUE;
 	}
 	result = (((result << 6) + a) << 6) + b - (Eterm) 0x000E2080UL;
-	if ((0xD800 <= result && result <= 0xDFFF) ||
-	    result == 0xFFFE || result == 0xFFFF) {
+	if (0xD800 <= result && result <= 0xDFFF) {
 	    return THE_NON_VALUE;
 	}
 	mb->offset += 24;
@@ -1723,9 +1720,6 @@ erts_bs_get_utf16(ErlBinMatchBuffer* mb, Uint flags)
 	w1 = (src[0] << 8) | src[1];
     }
     if (w1 < 0xD800 || w1 > 0xDFFF) {
-	if (w1 == 0xFFFE || w1 == 0xFFFF) {
-	    return THE_NON_VALUE;
-	}
 	mb->offset += 16;
 	return make_small(w1);
     } else if (w1 > 0xDBFF) {
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 158eb361a4..bd5f3cc4c1 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -348,12 +348,6 @@ static int copy_utf8_bin(byte *target, byte *source, Uint size,
 		return copied;
 	    }
 
-	    if (((*source) == 0xEF) && (source[1] == 0xBF) &&
-		((source[2] == 0xBE) || (source[2] == 0xBF))) {
-		*err_pos = source;
-		return copied;
-	    }
-		
 	    *(target++) = *(source++);
 	    *(target++) = *(source++);
 	    *(target++) = *(source++);
@@ -714,9 +708,8 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 			    target[(*pos)++] = (((byte) (x & 0x3F)) | 
 						((byte) 0x80));
 			} else if (x < 0x10000) {
-			    if ((x >= 0xD800 && x <= 0xDFFF) ||
-				(x == 0xFFFE) ||
-				(x == 0xFFFF)) { /* Invalid unicode range */
+			    if (x >= 0xD800 && x <= 0xDFFF) {
+				/* Invalid unicode range */
 				*err = 1;
 				goto done;
 			    }
@@ -1230,10 +1223,6 @@ int erts_analyze_utf8(byte *source, Uint size,
 		((source[1] & 0x20) != 0)) {
 		return ERTS_UTF8_ERROR;
 	    }
-	    if (((*source) == 0xEF) && (source[1] == 0xBF) &&
-		((source[2] == 0xBE) || (source[2] == 0xBF))) {
-		return ERTS_UTF8_ERROR;
-	    }
 	    source += 3;
 	    size -= 3;
 	} else if (((*source) & ((byte) 0xF8)) == 0xF0) {
@@ -2166,9 +2155,8 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 			    } else if (x < 0x800) {
 				need += 2;
 			    } else if (x < 0x10000) {
-				if ((x >= 0xD800 && x <= 0xDFFF) ||
-				    (x == 0xFFFE) ||
-				    (x == 0xFFFF)) { /* Invalid unicode range */
+				if (x >= 0xD800 && x <= 0xDFFF) {
+				    /* Invalid unicode range */
 				    DESTROY_ESTACK(stack);
 				    return ((Sint) -1);
 				}
@@ -2314,9 +2302,7 @@ L_Again:   /* Restart with sublist, old listend was pushed on stack */
 				*p++ = (((byte) (x & 0x3F)) | 
 					((byte) 0x80));
 			    } else if (x < 0x10000) {
-				ASSERT(!((x >= 0xD800 && x <= 0xDFFF) ||
-					 (x == 0xFFFE) ||
-					 (x == 0xFFFF)));
+				ASSERT(!(x >= 0xD800 && x <= 0xDFFF));
 				*p++ = (((byte) (x >> 12)) | 
 					((byte) 0xE0));
 				*p++ = ((((byte) (x >> 6)) & 0x3F)  | 
diff --git a/erts/emulator/test/bs_utf_SUITE.erl b/erts/emulator/test/bs_utf_SUITE.erl
index 72c656c400..4ab7d674a6 100644
--- a/erts/emulator/test/bs_utf_SUITE.erl
+++ b/erts/emulator/test/bs_utf_SUITE.erl
@@ -64,8 +64,7 @@ end_per_group(_GroupName, Config) ->
 
 utf8_roundtrip(Config) when is_list(Config) ->
     ?line utf8_roundtrip(0, 16#D7FF),
-    ?line utf8_roundtrip(16#E000, 16#FFFD),
-    ?line utf8_roundtrip(16#10000, 16#10FFFF),
+    ?line utf8_roundtrip(16#E000, 16#10FFFF),
     ok.
 
 utf8_roundtrip(First, Last) when First =< Last ->
@@ -91,8 +90,7 @@ utf16_roundtrip(Config) when is_list(Config) ->
 
 do_utf16_roundtrip(Fun) ->
     do_utf16_roundtrip(0, 16#D7FF, Fun),
-    do_utf16_roundtrip(16#E000, 16#FFFD, Fun),
-    do_utf16_roundtrip(16#10000, 16#10FFFF, Fun).
+    do_utf16_roundtrip(16#E000, 16#10FFFF, Fun).
 
 do_utf16_roundtrip(First, Last, Fun) when First =< Last ->
     Fun(First),
@@ -129,8 +127,7 @@ utf32_roundtrip(Config) when is_list(Config) ->
 
 do_utf32_roundtrip(Fun) ->
     do_utf32_roundtrip(0, 16#D7FF, Fun),
-    do_utf32_roundtrip(16#E000, 16#FFFD, Fun),
-    do_utf32_roundtrip(16#10000, 16#10FFFF, Fun).
+    do_utf32_roundtrip(16#E000, 16#10FFFF, Fun).
 
 do_utf32_roundtrip(First, Last, Fun) when First =< Last ->
     Fun(First),
@@ -158,7 +155,6 @@ utf32_little_roundtrip(Char) ->
 utf8_illegal_sequences(Config) when is_list(Config) ->
     ?line fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
     ?line fail_range(16#D800, 16#DFFF),		%Reserved for UTF-16.
-    ?line fail_range(16#FFFE, 16#FFFF),		%Non-characters.
 
     %% Illegal first character.
     ?line [fail(<<I,16#8F,16#8F,16#8F>>) || I <- lists:seq(16#80, 16#BF)],
@@ -251,7 +247,6 @@ fail_1(_) -> ok.
 utf16_illegal_sequences(Config) when is_list(Config) ->
     ?line utf16_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
     ?line utf16_fail_range(16#D800, 16#DFFF),		%Reserved for UTF-16.
-    ?line utf16_fail_range(16#FFFE, 16#FFFF),		%Non-characters.
 
     ?line lonely_hi_surrogate(16#D800, 16#DFFF),
     ?line leading_lo_surrogate(16#DC00, 16#DFFF),
@@ -300,7 +295,6 @@ leading_lo_surrogate(_, _, _) -> ok.
 utf32_illegal_sequences(Config) when is_list(Config) ->
     ?line utf32_fail_range(16#10FFFF+1, 16#10FFFF+512), %Too large.
     ?line utf32_fail_range(16#D800, 16#DFFF),		%Reserved for UTF-16.
-    ?line utf32_fail_range(16#FFFE, 16#FFFF),		%Non-characters.
     ?line utf32_fail_range(-100, -1),
     ok.
author	Björn Gustavsson <[email protected]>	2011-08-30 11:51:11 +0200
committer	Björn Gustavsson <[email protected]>	2011-10-13 14:16:00 +0200
commit	34db76765561487e526fe66d3d19ecf3b3fb9dc8 (patch)
tree	9141e3c5729e46d03c8b27b14da3b29b1e54abca /erts/emulator
parent	6ca6dd3c670fb8185ebb9a20c2a731a7375c1cac (diff)
download	otp-34db76765561487e526fe66d3d19ecf3b3fb9dc8.tar.gz otp-34db76765561487e526fe66d3d19ecf3b3fb9dc8.tar.bz2 otp-34db76765561487e526fe66d3d19ecf3b3fb9dc8.zip