diff options
Diffstat (limited to 'erts/emulator/beam/erl_unicode.c')
-rw-r--r-- | erts/emulator/beam/erl_unicode.c | 386 |
1 files changed, 215 insertions, 171 deletions
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 26ac231ddb..e00440b905 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2008-2012. All Rights Reserved. + * Copyright Ericsson AB 2008-2013. All Rights Reserved. * * The contents of this file are subject to the Erlang Public License, * Version 1.1, (the "License"); you may not use this file except in @@ -77,66 +77,25 @@ void erts_init_unicode(void) { max_loop_limit = CONTEXT_REDS * LOOP_FACTOR; /* Non visual BIFs to trap to. */ - memset(&characters_to_utf8_trap_exp, 0, sizeof(Export)); - characters_to_utf8_trap_exp.address = - &characters_to_utf8_trap_exp.code[3]; - characters_to_utf8_trap_exp.code[0] = am_erlang; - characters_to_utf8_trap_exp.code[1] = - am_atom_put("characters_to_utf8_trap",23); - characters_to_utf8_trap_exp.code[2] = 3; - characters_to_utf8_trap_exp.code[3] = - (BeamInstr) em_apply_bif; - characters_to_utf8_trap_exp.code[4] = - (BeamInstr) &characters_to_utf8_trap; - - memset(&characters_to_list_trap_1_exp, 0, sizeof(Export)); - characters_to_list_trap_1_exp.address = - &characters_to_list_trap_1_exp.code[3]; - characters_to_list_trap_1_exp.code[0] = am_erlang; - characters_to_list_trap_1_exp.code[1] = - am_atom_put("characters_to_list_trap_1",25); - characters_to_list_trap_1_exp.code[2] = 3; - characters_to_list_trap_1_exp.code[3] = - (BeamInstr) em_apply_bif; - characters_to_list_trap_1_exp.code[4] = - (BeamInstr) &characters_to_list_trap_1; - - memset(&characters_to_list_trap_2_exp, 0, sizeof(Export)); - characters_to_list_trap_2_exp.address = - &characters_to_list_trap_2_exp.code[3]; - characters_to_list_trap_2_exp.code[0] = am_erlang; - characters_to_list_trap_2_exp.code[1] = - am_atom_put("characters_to_list_trap_2",25); - characters_to_list_trap_2_exp.code[2] = 3; - characters_to_list_trap_2_exp.code[3] = - (BeamInstr) em_apply_bif; - characters_to_list_trap_2_exp.code[4] = - (BeamInstr) &characters_to_list_trap_2; - - - memset(&characters_to_list_trap_3_exp, 0, sizeof(Export)); - characters_to_list_trap_3_exp.address = - &characters_to_list_trap_3_exp.code[3]; - characters_to_list_trap_3_exp.code[0] = am_erlang; - characters_to_list_trap_3_exp.code[1] = - am_atom_put("characters_to_list_trap_3",25); - characters_to_list_trap_3_exp.code[2] = 3; - characters_to_list_trap_3_exp.code[3] = - (BeamInstr) em_apply_bif; - characters_to_list_trap_3_exp.code[4] = - (BeamInstr) &characters_to_list_trap_3; - - memset(&characters_to_list_trap_4_exp, 0, sizeof(Export)); - characters_to_list_trap_4_exp.address = - &characters_to_list_trap_4_exp.code[3]; - characters_to_list_trap_4_exp.code[0] = am_erlang; - characters_to_list_trap_4_exp.code[1] = - am_atom_put("characters_to_list_trap_4",25); - characters_to_list_trap_4_exp.code[2] = 1; - characters_to_list_trap_4_exp.code[3] = - (BeamInstr) em_apply_bif; - characters_to_list_trap_4_exp.code[4] = - (BeamInstr) &characters_to_list_trap_4; + erts_init_trap_export(&characters_to_utf8_trap_exp, + am_erlang, am_atom_put("characters_to_utf8_trap",23), 3, + &characters_to_utf8_trap); + + erts_init_trap_export(&characters_to_list_trap_1_exp, + am_erlang, am_atom_put("characters_to_list_trap_1",25), 3, + &characters_to_list_trap_1); + + erts_init_trap_export(&characters_to_list_trap_2_exp, + am_erlang, am_atom_put("characters_to_list_trap_2",25), 3, + &characters_to_list_trap_2); + + erts_init_trap_export(&characters_to_list_trap_3_exp, + am_erlang, am_atom_put("characters_to_list_trap_3",25), 3, + &characters_to_list_trap_3); + + erts_init_trap_export(&characters_to_list_trap_4_exp, + am_erlang, am_atom_put("characters_to_list_trap_4",25), 1, + &characters_to_list_trap_4); c_to_b_int_trap_exportp = erts_export_put(am_unicode,am_characters_to_binary_int,2); c_to_l_int_trap_exportp = erts_export_put(am_unicode,am_characters_to_list_int,2); @@ -765,7 +724,7 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */ hp = HAlloc(p, 2); obj = CDR(objp); ioterm = CONS(hp, rest_term, obj); - //(*left) = 0; + /* (*left) = 0; */ goto done; } if (rest_term != NIL) { @@ -1195,15 +1154,24 @@ BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2) * When input to characters_to_list is a plain binary and the format is 'unicode', we do * a faster analyze and size count with this function. */ -int erts_analyze_utf8(byte *source, Uint size, - byte **err_pos, Uint *num_chars, int *left) +static ERTS_INLINE int +analyze_utf8(byte *source, Uint size, byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) { + Uint latin1_count; + int is_latin1; *err_pos = source; + if (num_latin1_chars) { + is_latin1 = 1; + latin1_count = 0; + } *num_chars = 0; while (size) { if (((*source) & ((byte) 0x80)) == 0) { source++; - --size; + --size; + if (num_latin1_chars) + latin1_count++; } else if (((*source) & ((byte) 0xE0)) == 0xC0) { if (size < 2) { return ERTS_UTF8_INCOMPLETE; @@ -1212,6 +1180,11 @@ int erts_analyze_utf8(byte *source, Uint size, ((*source) < 0xC2) /* overlong */) { return ERTS_UTF8_ERROR; } + if (num_latin1_chars) { + latin1_count++; + if ((source[0] & ((byte) 0xFC)) != ((byte) 0xC0)) + is_latin1 = 0; + } source += 2; size -= 2; } else if (((*source) & ((byte) 0xF0)) == 0xE0) { @@ -1229,6 +1202,8 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 3; size -= 3; + if (num_latin1_chars) + is_latin1 = 0; } else if (((*source) & ((byte) 0xF8)) == 0xF0) { if (size < 4) { return ERTS_UTF8_INCOMPLETE; @@ -1246,21 +1221,40 @@ int erts_analyze_utf8(byte *source, Uint size, } source += 4; size -= 4; + if (num_latin1_chars) + is_latin1 = 0; } else { return ERTS_UTF8_ERROR; } ++(*num_chars); *err_pos = source; - if (left && --(*left) <= 0) { + if (max_chars && size > 0 && *num_chars == max_chars) + return ERTS_UTF8_OK_MAX_CHARS; + if (left && --(*left) <= 0 && size) { return ERTS_UTF8_ANALYZE_MORE; } } + if (num_latin1_chars) + *num_latin1_chars = is_latin1 ? latin1_count : -1; return ERTS_UTF8_OK; } +int erts_analyze_utf8(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, NULL, 0); +} + +int erts_analyze_utf8_x(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left, + Sint *num_latin1_chars, Uint max_chars) +{ + return analyze_utf8(source, size, err_pos, num_chars, left, num_latin1_chars, max_chars); +} + /* * No errors should be able to occur - no overlongs, no malformed, no nothing - */ + */ static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, Uint *num_built, Uint *num_eaten, Eterm tail) @@ -1316,6 +1310,12 @@ static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, return ret; } +Eterm erts_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail) +{ + return do_utf8_to_list(p, num, bytes, sz, left, num_built, num_eaten, tail); +} + static int is_candidate(Uint cp) { int index,pos; @@ -1723,14 +1723,14 @@ static BIF_RETTYPE do_bif_utf8_to_list(Process *p, if (b_sz) { ErlSubBin *sb; Eterm orig; - ERTS_DECLARE_DUMMY(Uint offset); + Uint offset; ASSERT(state != ERTS_UTF8_OK); hp = HAlloc(p, ERL_SUB_BIN_SIZE); sb = (ErlSubBin *) hp; ERTS_GET_REAL_BIN(orig_bin, orig, offset, bitoffs, bitsize); sb->thing_word = HEADER_SUB_BIN; sb->size = b_sz; - sb->offs = num_bytes_to_process + num_processed_bytes; + sb->offs = offset + num_bytes_to_process + num_processed_bytes; sb->orig = orig; sb->bitoffs = bitoffs; sb->bitsize = bitsize; @@ -1853,31 +1853,25 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) ap = atom_tab(atom_val(BIF_ARG_1)); if (BIF_ARG_2 == am_latin1) { - BIF_RET(new_binary(BIF_P, ap->name, ap->len)); - } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { - int bin_size = 0; - int i; Eterm bin_term; - byte* bin_p; - for (i = 0; i < ap->len; i++) { - bin_size += (ap->name[i] >= 0x80) ? 2 : 1; + if (ap->latin1_chars < 0) { + goto error; } - if (bin_size == ap->len) { - BIF_RET(new_binary(BIF_P, ap->name, ap->len)); + if (ap->latin1_chars == ap->len) { + bin_term = new_binary(BIF_P, ap->name, ap->len); } - bin_term = new_binary(BIF_P, 0, bin_size); - bin_p = binary_bytes(bin_term); - for (i = 0; i < ap->len; i++) { - byte b = ap->name[i]; - if (b < 0x80) { - *bin_p++ = b; - } else { - *bin_p++ = 0xC0 | (b >> 6); - *bin_p++ = 0x80 | (b & 0x3F); - } + else { + byte* bin_p; + int dbg_sz; + bin_term = new_binary(BIF_P, 0, ap->latin1_chars); + bin_p = binary_bytes(bin_term); + dbg_sz = erts_utf8_to_latin1(bin_p, ap->name, ap->len); + ASSERT(dbg_sz == ap->latin1_chars); (void)dbg_sz; } BIF_RET(bin_term); + } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { + BIF_RET(new_binary(BIF_P, ap->name, ap->len)); } else { error: BIF_ERROR(BIF_P, BADARG); @@ -1885,118 +1879,78 @@ BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) } static BIF_RETTYPE -binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) +binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist) { byte* bytes; byte *temp_alloc = NULL; Uint bin_size; if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) { - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } bin_size = binary_size(bin); if (enc == am_latin1) { Eterm a; - if (bin_size > MAX_ATOM_LENGTH) { + if (bin_size > MAX_ATOM_CHARACTERS) { system_limit: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, SYSTEM_LIMIT); + BIF_ERROR(proc, SYSTEM_LIMIT); } if (!must_exist) { - a = am_atom_put((char *)bytes, bin_size); - erts_free_aligned_binary_bytes(temp_alloc); + a = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_LATIN1, + 0); + erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(a)) + goto badarg; BIF_RET(a); - } else if (erts_atom_get((char *)bytes, bin_size, &a)) { + } else if (erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) { erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(a); } else { goto badarg; } } else if (enc == am_utf8 || enc == am_unicode) { - char *buf; - char *dst; - int i; - int num_chars; Eterm res; + Uint num_chars = 0; + const byte* p = bytes; + Uint left = bin_size; - if (bin_size > 2*MAX_ATOM_LENGTH) { - byte* err_pos; - Uint n; - int reds_left = bin_size+1; /* Number of reductions left. */ - - if (erts_analyze_utf8(bytes, bin_size, &err_pos, - &n, &reds_left) == ERTS_UTF8_OK) { - /* - * Correct UTF-8 encoding, but too many characters to - * fit in an atom. - */ + while (left) { + if (++num_chars > MAX_ATOM_CHARACTERS) { goto system_limit; - } else { - /* - * Something wrong in the UTF-8 encoding or Unicode code - * points > 255. - */ - goto badarg; } - } - - /* - * Allocate a temporary buffer the same size as the binary, - * so that we don't need an extra overflow test. - */ - buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size); - dst = buf; - for (i = 0; i < bin_size; i++) { - int c = bytes[i]; - if (c < 0x80) { - *dst++ = c; - } else if (i < bin_size-1) { - int c2; - if ((c & 0xE0) != 0xC0) { - goto free_badarg; - } - i++; - c = (c & 0x3F) << 6; - c2 = bytes[i]; - if ((c2 & 0xC0) != 0x80) { - goto free_badarg; - } - c = c | (c2 & 0x3F); - if (0x80 <= c && c < 256) { - *dst++ = c; - } else { - goto free_badarg; - } - } else { - free_badarg: - erts_free(ERTS_ALC_T_TMP, (void *) buf); - goto badarg; + if ((p[0] & 0x80) == 0) { + ++p; + --left; } + else if (left >= 2 + && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */ + && (p[1] & 0xC0) == 0x80) { + p += 2; + left -= 2; + } + else goto badarg; } - num_chars = dst - buf; - if (num_chars > MAX_ATOM_LENGTH) { - erts_free(ERTS_ALC_T_TMP, (void *) buf); - goto system_limit; - } + if (!must_exist) { - res = am_atom_put(buf, num_chars); - erts_free(ERTS_ALC_T_TMP, (void *) buf); - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(res); - } else { - int exists = erts_atom_get(buf, num_chars, &res); - erts_free(ERTS_ALC_T_TMP, (void *) buf); - if (exists) { - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(res); - } else { - goto badarg; - } + res = erts_atom_put((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_UTF8, + 0); + } + else if (!erts_atom_get((char*)bytes, bin_size, &res, ERTS_ATOM_ENC_UTF8)) { + goto badarg; } + erts_free_aligned_binary_bytes(temp_alloc); + if (is_non_value(res)) + goto badarg; + BIF_RET(res); } else { badarg: erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(p, BADARG); + BIF_ERROR(proc, BADARG); } } @@ -2619,8 +2573,20 @@ BIF_RETTYPE prim_file_internal_native2name_1(BIF_ALIST_1) case ERL_FILENAME_UTF8: bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); if (erts_analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != ERTS_UTF8_OK) { + Eterm *hp = HAlloc(BIF_P,3); + Eterm warn_type = NIL; erts_free_aligned_binary_bytes(temp_alloc); - goto noconvert; + switch (erts_get_filename_warning_type()) { + case ERL_FILENAME_WARNING_IGNORE: + warn_type = am_ignore; + break; + case ERL_FILENAME_WARNING_ERROR: + warn_type = am_error; + break; + default: + warn_type = am_warning; + } + BIF_RET(TUPLE2(hp,am_error,warn_type)); } num_built = 0; num_eaten = 0; @@ -2653,9 +2619,8 @@ BIF_RETTYPE prim_file_internal_native2name_1(BIF_ALIST_1) erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(ret); default: - goto noconvert; + break; } - noconvert: BIF_RET(BIF_ARG_1); } @@ -2692,6 +2657,52 @@ BIF_RETTYPE prim_file_internal_normalize_utf8_1(BIF_ALIST_1) BIF_RET(ret); } +BIF_RETTYPE prim_file_is_translatable_1(BIF_ALIST_1) +{ + ERTS_DECLARE_DUMMY(Eterm real_bin); + ERTS_DECLARE_DUMMY(Uint offset); + Uint size; + Uint num_chars; + Uint bitsize; + ERTS_DECLARE_DUMMY(Uint bitoffs); + byte *temp_alloc = NULL; + byte *bytes; + byte *err_pos; + int status; + + if (is_not_binary(BIF_ARG_1)) { + BIF_ERROR(BIF_P,BADARG); + } + size = binary_size(BIF_ARG_1); + ERTS_GET_REAL_BIN(BIF_ARG_1, real_bin, offset, bitoffs, bitsize); + if (bitsize != 0) { + BIF_ERROR(BIF_P,BADARG); + } + if (size == 0) { + BIF_RET(am_true); + } + + /* + * If the encoding is latin1, the pathname is always translatable. + */ + switch (erts_get_native_filename_encoding()) { + case ERL_FILENAME_LATIN1: + BIF_RET(am_true); + case ERL_FILENAME_WIN_WCHAR: + if (erts_get_user_requested_filename_encoding() == ERL_FILENAME_LATIN1) { + BIF_RET(am_true); + } + } + + /* + * Check whether the binary contains legal UTF-8 sequences. + */ + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + status = erts_analyze_utf8(bytes, size, &err_pos, &num_chars, NULL); + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(status == ERTS_UTF8_OK ? am_true : am_false); +} + BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) { switch (erts_get_native_filename_encoding()) { @@ -2711,3 +2722,36 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) } } +int erts_utf8_to_latin1(byte* dest, const byte* source, int slen) +{ + /* + * Assumes source contains valid utf8 that can be encoded as latin1, + * and that dest has enough room. + */ + byte* dp = dest; + + while (slen > 0) { + if ((source[0] & 0x80) == 0) { + *dp++ = *source++; + --slen; + } + else { + ASSERT(slen > 1); + ASSERT((source[0] & 0xFE) == 0xC2); + ASSERT((source[1] & 0xC0) == 0x80); + *dp++ = (char) ((source[0] << 6) | (source[1] & 0x3F)); + source += 2; + slen -= 2; + } + } + return dp - dest; +} + +BIF_RETTYPE io_printable_range_0(BIF_ALIST_0) +{ + if (erts_get_printable_characters() == ERL_PRINTABLE_CHARACTERS_UNICODE) { + BIF_RET(am_unicode); + } else { + BIF_RET(am_latin1); + } +} |