aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/beam/erl_unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'erts/emulator/beam/erl_unicode.c')
-rw-r--r--erts/emulator/beam/erl_unicode.c278
1 files changed, 175 insertions, 103 deletions
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 3a968594f3..b7a5c45fea 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1,18 +1,19 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 2008-2013. All Rights Reserved.
+ * Copyright Ericsson AB 2008-2017. All Rights Reserved.
*
- * The contents of this file are subject to the Erlang Public License,
- * Version 1.1, (the "License"); you may not use this file except in
- * compliance with the License. You should have received a copy of the
- * Erlang Public License along with this software. If not, it can be
- * retrieved online at http://www.erlang.org/.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Software distributed under the License is distributed on an "AS IS"
- * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
- * the License for the specific language governing rights and limitations
- * under the License.
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*
* %CopyrightEnd%
*/
@@ -54,7 +55,7 @@ static BIF_RETTYPE finalize_list_to_list(Process *p,
Uint num_processed_bytes,
Uint num_bytes_to_process,
Uint num_resulting_chars,
- int state, int left,
+ int state, Sint left,
Eterm tail);
static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_3);
static BIF_RETTYPE characters_to_list_trap_1(BIF_ALIST_3);
@@ -122,19 +123,16 @@ static void cleanup_restart_context(RestartContext *rc)
}
}
-static void cleanup_restart_context_bin(Binary *bp)
+static int cleanup_restart_context_bin(Binary *bp)
{
RestartContext *rc = ERTS_MAGIC_BIN_DATA(bp);
cleanup_restart_context(rc);
+ return 1;
}
-static RestartContext *get_rc_from_bin(Eterm bin)
+static RestartContext *get_rc_from_bin(Eterm mref)
{
- Binary *mbp;
- ASSERT(ERTS_TERM_IS_MAGIC_BINARY(bin));
-
- mbp = ((ProcBin *) binary_val(bin))->val;
-
+ Binary *mbp = erts_magic_ref2bin(mref);
ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(mbp)
== cleanup_restart_context_bin);
return (RestartContext *) ERTS_MAGIC_BIN_DATA(mbp);
@@ -147,8 +145,8 @@ static Eterm make_magic_bin_for_restart(Process *p, RestartContext *rc)
RestartContext *restartp = ERTS_MAGIC_BIN_DATA(mbp);
Eterm *hp;
memcpy(restartp,rc,sizeof(RestartContext));
- hp = HAlloc(p, PROC_BIN_SIZE);
- return erts_mk_magic_binary_term(&hp, &MSO(p), mbp);
+ hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE);
+ return erts_mk_magic_ref(&hp, &MSO(p), mbp);
}
@@ -172,12 +170,13 @@ static ERTS_INLINE int allowed_iterations(Process *p)
else
return tmp;
}
-static ERTS_INLINE int cost_to_proc(Process *p, int cost)
+
+static ERTS_INLINE void cost_to_proc(Process *p, Sint cost)
{
- int x = (cost / LOOP_FACTOR);
+ Sint x = (cost / LOOP_FACTOR);
BUMP_REDS(p,x);
- return x;
}
+
static ERTS_INLINE int simple_loops_to_common(int cost)
{
int factor = (LOOP_FACTOR_SIMPLE / LOOP_FACTOR);
@@ -242,14 +241,15 @@ static int utf8_len(byte first)
return -1;
}
-static int copy_utf8_bin(byte *target, byte *source, Uint size,
- byte *leftover, int *num_leftovers,
- byte **err_pos, Uint *characters) {
- int copied = 0;
+static Uint copy_utf8_bin(byte *target, byte *source, Uint size,
+ byte *leftover, int *num_leftovers,
+ byte **err_pos, Uint *characters)
+{
+ Uint copied = 0;
if (leftover != NULL && *num_leftovers) {
int need = utf8_len(leftover[0]);
int from_source = need - (*num_leftovers);
- int c;
+ Uint c;
byte *tmp_err_pos = NULL;
ASSERT(need > 0);
ASSERT(from_source > 0);
@@ -501,8 +501,8 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */
}
-static Eterm do_build_utf8(Process *p, Eterm ioterm, int *left, int latin1,
- byte *target, int *pos, Uint *characters, int *err,
+static Eterm do_build_utf8(Process *p, Eterm ioterm, Sint *left, int latin1,
+ byte *target, Uint *pos, Uint *characters, int *err,
byte *leftover, int *num_leftovers)
{
int c;
@@ -572,7 +572,7 @@ static Eterm do_build_utf8(Process *p, Eterm ioterm, int *left, int latin1,
}
if (!latin1) {
- int num;
+ Uint num;
byte *err_pos = NULL;
num = copy_utf8_bin(target + (*pos), bytes,
size, leftover, num_leftovers,&err_pos,characters);
@@ -803,7 +803,7 @@ static int check_leftovers(byte *source, int size)
-static BIF_RETTYPE build_utf8_return(Process *p,Eterm bin,int pos,
+static BIF_RETTYPE build_utf8_return(Process *p,Eterm bin,Uint pos,
Eterm rest_term,int err,
byte *leftover,int num_leftovers,Eterm latin1)
{
@@ -858,8 +858,8 @@ static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_3)
#endif
byte* bytes;
Eterm rest_term;
- int left, sleft;
- int pos;
+ Sint left, sleft;
+ Uint pos;
int err;
byte leftover[4]; /* used for temp buffer too,
otherwise 3 bytes would have been enough */
@@ -873,7 +873,7 @@ static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_3)
real_bin = binary_val(BIF_ARG_1);
ASSERT(*real_bin == HEADER_PROC_BIN);
#endif
- pos = (int) binary_size(BIF_ARG_1);
+ pos = binary_size(BIF_ARG_1);
bytes = binary_bytes(BIF_ARG_1);
sleft = left = allowed_iterations(BIF_P);
err = 0;
@@ -933,9 +933,9 @@ BIF_RETTYPE unicode_characters_to_binary_2(BIF_ALIST_2)
int latin1;
Eterm bin;
byte *bytes;
- int pos;
+ Uint pos;
int err;
- int left, sleft;
+ Sint left, sleft;
Eterm rest_term, subject;
byte leftover[4]; /* used for temp buffer too, o
therwise 3 bytes would have been enough */
@@ -998,7 +998,7 @@ BIF_RETTYPE unicode_characters_to_binary_2(BIF_ALIST_2)
byte *t = NULL;
Uint sz = binary_size(bin);
byte *by = erts_get_aligned_binary_bytes(bin,&t);
- int i;
+ Uint i;
erts_printf("<<");
for (i = 0;i < sz; ++i) {
erts_printf((i == sz -1) ? "0x%X" : "0x%X, ", (unsigned) by[i]);
@@ -1006,7 +1006,7 @@ BIF_RETTYPE unicode_characters_to_binary_2(BIF_ALIST_2)
erts_printf(">>: ");
erts_free_aligned_binary_bytes(t);
}
- erts_printf("%d - %d = %d\n",sleft,left,sleft - left);
+ erts_printf("%ld - %ld = %ld\n", sleft, left, sleft - left);
}
#endif
cost_to_proc(BIF_P, sleft - left);
@@ -1014,10 +1014,10 @@ BIF_RETTYPE unicode_characters_to_binary_2(BIF_ALIST_2)
leftover,num_leftovers,BIF_ARG_2);
}
-static BIF_RETTYPE build_list_return(Process *p, byte *bytes, int pos, Uint characters,
+static BIF_RETTYPE build_list_return(Process *p, byte *bytes, Uint pos, Uint characters,
Eterm rest_term, int err,
byte *leftover, int num_leftovers,
- Eterm latin1, int left)
+ Eterm latin1, Sint left)
{
Eterm *hp;
@@ -1069,11 +1069,11 @@ static BIF_RETTYPE characters_to_list_trap_1(BIF_ALIST_3)
{
RestartContext *rc;
byte* bytes;
- int pos;
+ Uint pos;
Uint characters;
int err;
Eterm rest_term;
- int left, sleft;
+ Sint left, sleft;
int latin1 = 0;
byte leftover[4]; /* used for temp buffer too,
@@ -1106,9 +1106,9 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2)
int latin1;
Uint characters = 0;
byte *bytes;
- int pos;
+ Uint pos;
int err;
- int left, sleft;
+ Sint left, sleft;
Eterm rest_term;
byte leftover[4]; /* used for temp buffer too, o
therwise 3 bytes would have been enough */
@@ -1540,7 +1540,7 @@ static BIF_RETTYPE finalize_list_to_list(Process *p,
Uint num_processed_bytes,
Uint num_bytes_to_process,
Uint num_resulting_chars,
- int state, int left,
+ int state, Sint left,
Eterm tail)
{
Uint num_built; /* characters */
@@ -1887,74 +1887,57 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist)
byte* bytes;
byte *temp_alloc = NULL;
Uint bin_size;
+ Eterm a;
if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) {
BIF_ERROR(proc, BADARG);
}
bin_size = binary_size(bin);
+
if (enc == am_latin1) {
- Eterm a;
- if (bin_size > MAX_ATOM_CHARACTERS) {
- system_limit:
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(proc, SYSTEM_LIMIT);
- }
if (!must_exist) {
- a = erts_atom_put((byte *) bytes,
- bin_size,
- ERTS_ATOM_ENC_LATIN1,
- 0);
- erts_free_aligned_binary_bytes(temp_alloc);
- if (is_non_value(a))
- goto badarg;
- BIF_RET(a);
- } else if (erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) {
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_RET(a);
- } else {
+ int lix = erts_atom_put_index((byte *) bytes,
+ bin_size,
+ ERTS_ATOM_ENC_LATIN1,
+ 0);
+ if (lix == ATOM_BAD_ENCODING_ERROR) {
+ badarg:
+ erts_free_aligned_binary_bytes(temp_alloc);
+ BIF_ERROR(proc, BADARG);
+ } else if (lix == ATOM_MAX_CHARS_ERROR) {
+ system_limit:
+ erts_free_aligned_binary_bytes(temp_alloc);
+ BIF_ERROR(proc, SYSTEM_LIMIT);
+ }
+
+ a = make_atom(lix);
+ } else if (!erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) {
goto badarg;
}
- } else if (enc == am_utf8 || enc == am_unicode) {
- Eterm res;
- Uint num_chars = 0;
- const byte* p = bytes;
- Uint left = bin_size;
- while (left) {
- if (++num_chars > MAX_ATOM_CHARACTERS) {
+ } else if (enc == am_utf8 || enc == am_unicode) {
+ if (!must_exist) {
+ int uix = erts_atom_put_index((byte *) bytes,
+ bin_size,
+ ERTS_ATOM_ENC_UTF8,
+ 0);
+ if (uix == ATOM_BAD_ENCODING_ERROR) {
+ goto badarg;
+ } else if (uix == ATOM_MAX_CHARS_ERROR) {
goto system_limit;
}
- if ((p[0] & 0x80) == 0) {
- ++p;
- --left;
- }
- else if (left >= 2
- && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */
- && (p[1] & 0xC0) == 0x80) {
- p += 2;
- left -= 2;
- }
- else goto badarg;
- }
- if (!must_exist) {
- res = erts_atom_put((byte *) bytes,
- bin_size,
- ERTS_ATOM_ENC_UTF8,
- 0);
+ a = make_atom(uix);
}
- else if (!erts_atom_get((char*)bytes, bin_size, &res, ERTS_ATOM_ENC_UTF8)) {
+ else if (!erts_atom_get((char*)bytes, bin_size, &a, ERTS_ATOM_ENC_UTF8)) {
goto badarg;
}
- erts_free_aligned_binary_bytes(temp_alloc);
- if (is_non_value(res))
- goto badarg;
- BIF_RET(res);
} else {
- badarg:
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(proc, BADARG);
+ goto badarg;
}
+
+ erts_free_aligned_binary_bytes(temp_alloc);
+ BIF_RET(a);
}
BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2)
@@ -2005,7 +1988,7 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu
is_list(name) ||
(allow_empty && is_nil(name))) {
Sint need;
- if ((need = erts_native_filename_need(name,encoding)) < 0) {
+ if ((need = erts_native_filename_need(name, encoding)) < 0) {
return NULL;
}
if (encoding == ERL_FILENAME_WIN_WCHAR) {
@@ -2015,7 +1998,7 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu
++need;
}
if (used)
- *used = (Sint) need;
+ *used = need;
if (need+extra > statbuf_size) {
name_buf = (char *) erts_alloc(alloc_type, need+extra);
} else {
@@ -2126,6 +2109,8 @@ Eterm erts_convert_native_to_filename(Process *p, byte *bytes)
mac = 1;
case ERL_FILENAME_UTF8:
size = strlen((char *) bytes);
+ if (size == 0)
+ return NIL;
if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK) {
goto noconvert;
}
@@ -2167,12 +2152,13 @@ Eterm erts_convert_native_to_filename(Process *p, byte *bytes)
}
-Sint erts_native_filename_need(Eterm ioterm, int encoding)
+Sint erts_native_filename_need(Eterm ioterm, int encoding)
{
Eterm *objp;
Eterm obj;
DECLARE_ESTACK(stack);
Sint need = 0;
+ int seen_null = 0;
if (is_atom(ioterm)) {
Atom* ap;
@@ -2209,6 +2195,22 @@ Sint erts_native_filename_need(Eterm ioterm, int encoding)
default:
need = -1;
}
+ /*
+ * Do not allow null in
+ * the middle of filenames
+ */
+ if (need > 0) {
+ byte *name = ap->name;
+ int len = ap->len;
+ for (i = 0; i < len; i++) {
+ if (name[i] == 0)
+ seen_null = 1;
+ else if (seen_null) {
+ need = -1;
+ break;
+ }
+ }
+ }
DESTROY_ESTACK(stack);
return need;
}
@@ -2239,6 +2241,16 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */
if (is_small(obj)) { /* Always small */
for(;;) {
Uint x = unsigned_val(obj);
+ /*
+ * Do not allow null in
+ * the middle of filenames
+ */
+ if (x == 0)
+ seen_null = 1;
+ else if (seen_null) {
+ DESTROY_ESTACK(stack);
+ return ((Sint) -1);
+ }
switch (encoding) {
case ERL_FILENAME_LATIN1:
if (x > 255) {
@@ -2504,7 +2516,7 @@ void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars)
((Uint) (bytes[3] & ((byte) 0x3F)));
bytes += 4;
} else {
- erl_exit(1,"Internal unicode error in prim_file:internal_name2native/1");
+ erts_exit(ERTS_ERROR_EXIT,"Internal unicode error in prim_file:internal_name2native/1");
}
*target++ = (byte) (unipoint & 0xFF);
*target++ = (byte) ((unipoint >> 8) & 0xFF);
@@ -2512,6 +2524,38 @@ void erts_copy_utf8_to_utf16_little(byte *target, byte *bytes, int num_chars)
}
/*
+ * *** Requirements on Raw Filename Format ***
+ *
+ * These requirements are due to the 'filename' module
+ * in stdlib. This since it is documented that it
+ * should be able to operate on raw filenames as well
+ * as ordinary filenames.
+ *
+ * A raw filename *must* be a byte sequence where:
+ * 1. Codepoints 0-127 (7-bit ascii) *must* be encoded
+ * as a byte with the corresponding value. That is,
+ * the most significant bit in the byte encoding the
+ * codepoint is never set.
+ * 2. Codepoints greater than 127 *must* be encoded
+ * with the most significant bit set in *every* byte
+ * encoding it.
+ *
+ * Latin1 and UTF-8 meet these requirements while
+ * UTF-16 and UTF-32 don't.
+ *
+ * On Windows filenames are natively stored as malformed
+ * UTF-16LE (lonely surrogates may appear). A more correct
+ * description than UTF-16 would be an array of 16-bit
+ * words... In order to meet the requirements of the
+ * raw file format we convert the malformed UTF-16LE to
+ * malformed UTF-8 which meet the requirements.
+ *
+ * Note that these requirements are today only OTP
+ * internal (erts-stdlib internal) requirements that
+ * could be changed.
+ */
+
+/*
* This internal bif converts a filename to whatever format is suitable for the file driver
* It also adds zero termination so that prim_file needn't bother with the character encoding
* of the file driver
@@ -2522,6 +2566,12 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
Sint need;
Eterm bin_term;
byte* bin_p;
+
+ /*
+ * See comment on "Requirements on Raw Filename Format"
+ * above.
+ */
+
/* Prim file explicitly does not allow atoms, although we could
very well cope with it. Instead of letting 'file' handle them,
it would probably be more efficient to handle them here. Subject to
@@ -2530,6 +2580,7 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
BIF_ERROR(BIF_P,BADARG);
}
if (is_binary(BIF_ARG_1)) {
+ int seen_null = 0;
byte *temp_alloc = NULL;
byte *bytes;
byte *err_pos;
@@ -2539,10 +2590,18 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
size = binary_size(BIF_ARG_1);
bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
if (encoding != ERL_FILENAME_WIN_WCHAR) {
+ Uint i;
/*Add 0 termination only*/
bin_term = new_binary(BIF_P, NULL, size+1);
bin_p = binary_bytes(bin_term);
- memcpy(bin_p,bytes,size);
+ for (i = 0; i < size; i++) {
+ /* Don't allow null in the middle of filenames... */
+ if (bytes[i] == 0)
+ seen_null = 1;
+ else if (seen_null)
+ goto bin_name_error;
+ bin_p[i] = bytes[i];
+ }
bin_p[size]=0;
erts_free_aligned_binary_bytes(temp_alloc);
BIF_RET(bin_term);
@@ -2556,6 +2615,11 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
bin_term = new_binary(BIF_P, 0, (size+1)*2);
bin_p = binary_bytes(bin_term);
while (size--) {
+ /* Don't allow null in the middle of filenames... */
+ if (*bytes == 0)
+ seen_null = 1;
+ else if (seen_null)
+ goto bin_name_error;
*bin_p++ = *bytes++;
*bin_p++ = 0;
}
@@ -2573,11 +2637,14 @@ BIF_RETTYPE prim_file_internal_name2native_1(BIF_ALIST_1)
bin_p[num_chars*2+1] = 0;
erts_free_aligned_binary_bytes(temp_alloc);
BIF_RET(bin_term);
+ bin_name_error:
+ erts_free_aligned_binary_bytes(temp_alloc);
+ BIF_ERROR(BIF_P,BADARG);
} /* binary */
- if ((need = erts_native_filename_need(BIF_ARG_1,encoding)) < 0) {
- BIF_ERROR(BIF_P,BADARG);
+ if ((need = erts_native_filename_need(BIF_ARG_1, encoding)) < 0) {
+ BIF_ERROR(BIF_P,BADARG);
}
if (encoding == ERL_FILENAME_WIN_WCHAR) {
need += 2;
@@ -2611,6 +2678,11 @@ BIF_RETTYPE prim_file_internal_native2name_1(BIF_ALIST_1)
Eterm ret;
int mac = 0;
+ /*
+ * See comment on "Requirements on Raw Filename Format"
+ * above.
+ */
+
if (is_not_binary(BIF_ARG_1)) {
BIF_ERROR(BIF_P,BADARG);
}