From 26b59dfe67ef551cd94765557cdd8c79794bcc38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Tue, 31 May 2016 14:28:54 +0200 Subject: Add new AtU8 beam chunk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new chunk stores atoms encoded in UTF-8. beam_lib has also been modified to handle the new 'utf8_atoms' attribute while the 'atoms' attribute may be a missing chunk from now on. The binary_to_atom/2 BIF can now encode any utf8 binary with up to 255 characters. The list_to_atom/1 BIF can now accept codepoints higher than 255 with up to 255 characters (thanks to Björn Gustavsson). --- erts/emulator/beam/atom.c | 35 +++++++++++------ erts/emulator/beam/atom.h | 3 ++ erts/emulator/beam/beam_load.c | 66 +++++++++++++++++++++++--------- erts/emulator/beam/bif.c | 14 +++---- erts/emulator/beam/erl_unicode.c | 83 ++++++++++++++++------------------------ erts/emulator/beam/global.h | 1 + erts/emulator/beam/utils.c | 62 ++++++++++++++++++++++++++++++ 7 files changed, 178 insertions(+), 86 deletions(-) (limited to 'erts/emulator/beam') diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c index 2b5ad097a0..2055c29190 100644 --- a/erts/emulator/beam/atom.c +++ b/erts/emulator/beam/atom.c @@ -233,10 +233,10 @@ need_convertion: } /* - * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + * erts_atom_put_index() may fail. Returns negative indexes for errors. */ -Eterm -erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) +int +erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc) { byte utf8_copy[MAX_ATOM_SZ_FROM_LATIN1]; const byte *text = name; @@ -253,7 +253,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = 0; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } switch (enc) { @@ -262,7 +262,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = MAX_ATOM_CHARACTERS; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } #ifdef DEBUG for (aix = 0; aix < len; aix++) { @@ -276,7 +276,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) if (trunc) tlen = MAX_ATOM_CHARACTERS; else - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; } no_latin1_chars = tlen; latin1_to_utf8(utf8_copy, &text, &tlen); @@ -284,7 +284,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) case ERTS_ATOM_ENC_UTF8: /* First sanity check; need to verify later */ if (tlen > MAX_ATOM_SZ_LIMIT && !trunc) - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; break; } @@ -295,7 +295,7 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) atom_read_unlock(); if (aix >= 0) { /* Already in table no need to verify it */ - return make_atom(aix); + return aix; } if (enc == ERTS_ATOM_ENC_UTF8) { @@ -314,13 +314,13 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) case ERTS_UTF8_OK_MAX_CHARS: /* Truncated... */ if (!trunc) - return THE_NON_VALUE; + return ATOM_MAX_CHARS_ERROR; ASSERT(no_chars == MAX_ATOM_CHARACTERS); tlen = err_pos - text; break; default: /* Bad utf8... */ - return THE_NON_VALUE; + return ATOM_BAD_ENCODING_ERROR; } } @@ -333,7 +333,20 @@ erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) atom_write_lock(); aix = index_put(&erts_atom_table, (void*) &a); atom_write_unlock(); - return make_atom(aix); + return aix; +} + +/* + * erts_atom_put() may fail. If it fails THE_NON_VALUE is returned! + */ +Eterm +erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc) +{ + int aix = erts_atom_put_index(name, len, enc, trunc); + if (aix >= 0) + return make_atom(aix); + else + return THE_NON_VALUE; } Eterm diff --git a/erts/emulator/beam/atom.h b/erts/emulator/beam/atom.h index abd3b44993..be998a46bd 100644 --- a/erts/emulator/beam/atom.h +++ b/erts/emulator/beam/atom.h @@ -29,6 +29,8 @@ #define MAX_ATOM_SZ_LIMIT (4*MAX_ATOM_CHARACTERS) /* theoretical byte limit */ #define ATOM_LIMIT (1024*1024) #define MIN_ATOM_TABLE_SIZE 8192 +#define ATOM_BAD_ENCODING_ERROR -1 +#define ATOM_MAX_CHARS_ERROR -2 #ifndef ARCH_32 /* Internal atom cache needs MAX_ATOM_TABLE_SIZE to be less than an @@ -133,6 +135,7 @@ int atom_table_sz(void); /* table size in bytes, excluding stored objects */ Eterm am_atom_put(const char*, int); /* ONLY 7-bit ascii! */ Eterm erts_atom_put(const byte *name, int len, ErtsAtomEncoding enc, int trunc); +int erts_atom_put_index(const byte *name, int len, ErtsAtomEncoding enc, int trunc); void init_atom_table(void); void atom_info(fmtfn_t, void *); void dump_atoms(fmtfn_t, void *); diff --git a/erts/emulator/beam/beam_load.c b/erts/emulator/beam/beam_load.c index 8f1faa6719..1899ffb079 100644 --- a/erts/emulator/beam/beam_load.c +++ b/erts/emulator/beam/beam_load.c @@ -157,13 +157,15 @@ typedef struct { #define STR_CHUNK 2 #define IMP_CHUNK 3 #define EXP_CHUNK 4 -#define NUM_MANDATORY 5 +#define MIN_MANDATORY 1 +#define MAX_MANDATORY 5 #define LAMBDA_CHUNK 5 #define LITERAL_CHUNK 6 #define ATTR_CHUNK 7 #define COMPILE_CHUNK 8 #define LINE_CHUNK 9 +#define UTF8_ATOM_CHUNK 10 #define NUM_CHUNK_TYPES (sizeof(chunk_types)/sizeof(chunk_types[0])) @@ -173,9 +175,13 @@ typedef struct { static Uint chunk_types[] = { /* - * Mandatory chunk types -- these MUST be present. + * Atom chunk types -- Atom or AtU8 MUST be present. */ MakeIffId('A', 't', 'o', 'm'), /* 0 */ + + /* + * Mandatory chunk types -- these MUST be present. + */ MakeIffId('C', 'o', 'd', 'e'), /* 1 */ MakeIffId('S', 't', 'r', 'T'), /* 2 */ MakeIffId('I', 'm', 'p', 'T'), /* 3 */ @@ -189,6 +195,7 @@ static Uint chunk_types[] = { MakeIffId('A', 't', 't', 'r'), /* 7 */ MakeIffId('C', 'I', 'n', 'f'), /* 8 */ MakeIffId('L', 'i', 'n', 'e'), /* 9 */ + MakeIffId('A', 't', 'U', '8'), /* 10 */ }; /* @@ -490,9 +497,9 @@ static Eterm stub_insert_new_code(Process *c_p, ErtsProcLocks c_p_locks, #endif static int init_iff_file(LoaderState* stp, byte* code, Uint size); static int scan_iff_file(LoaderState* stp, Uint* chunk_types, - Uint num_types, Uint num_mandatory); + Uint num_types); static int verify_chunks(LoaderState* stp); -static int load_atom_table(LoaderState* stp); +static int load_atom_table(LoaderState* stp, ErtsAtomEncoding enc); static int load_import_table(LoaderState* stp); static int read_export_table(LoaderState* stp); static int is_bif(Eterm mod, Eterm func, unsigned arity); @@ -629,7 +636,7 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader, CHKALLOC(); CHKBLK(ERTS_ALC_T_CODE,stp->code); if (!init_iff_file(stp, code, unloaded_size) || - !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { goto load_error; } @@ -674,9 +681,16 @@ erts_prepare_loading(Binary* magic, Process *c_p, Eterm group_leader, */ CHKBLK(ERTS_ALC_T_CODE,stp->code); - define_file(stp, "atom table", ATOM_CHUNK); - if (!load_atom_table(stp)) { - goto load_error; + if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) { + define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) { + goto load_error; + } + } else { + define_file(stp, "atom table", ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) { + goto load_error; + } } /* @@ -1212,7 +1226,7 @@ init_iff_file(LoaderState* stp, byte* code, Uint size) * Scan the IFF file. The header should have been verified by init_iff_file(). */ static int -scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types, Uint num_mandatory) +scan_iff_file(LoaderState* stp, Uint* chunk_types, Uint num_types) { Uint count; Uint id; @@ -1291,7 +1305,16 @@ verify_chunks(LoaderState* stp) MD5_CTX context; MD5Init(&context); - for (i = 0; i < NUM_MANDATORY; i++) { + + if (stp->chunks[UTF8_ATOM_CHUNK].start != NULL) { + MD5Update(&context, stp->chunks[UTF8_ATOM_CHUNK].start, stp->chunks[UTF8_ATOM_CHUNK].size); + } else if (stp->chunks[ATOM_CHUNK].start != NULL) { + MD5Update(&context, stp->chunks[ATOM_CHUNK].start, stp->chunks[ATOM_CHUNK].size); + } else { + LoadError0(stp, "mandatory chunk of type 'Atom' or 'AtU8' not found\n"); + } + + for (i = MIN_MANDATORY; i < MAX_MANDATORY; i++) { if (stp->chunks[i].start != NULL) { MD5Update(&context, stp->chunks[i].start, stp->chunks[i].size); } else { @@ -1352,7 +1375,7 @@ verify_chunks(LoaderState* stp) } static int -load_atom_table(LoaderState* stp) +load_atom_table(LoaderState* stp, ErtsAtomEncoding enc) { unsigned int i; @@ -1371,7 +1394,7 @@ load_atom_table(LoaderState* stp) GetByte(stp, n); GetString(stp, atom, n); - stp->atom[i] = erts_atom_put(atom, n, ERTS_ATOM_ENC_LATIN1, 1); + stp->atom[i] = erts_atom_put(atom, n, enc, 1); } /* @@ -5937,7 +5960,7 @@ code_get_chunk_2(BIF_ALIST_2) goto error; } if (!init_iff_file(stp, start, binary_size(Bin)) || - !scan_iff_file(stp, &chunk, 1, 1) || + !scan_iff_file(stp, &chunk, 1) || stp->chunks[0].start == NULL) { res = am_undefined; goto done; @@ -5986,7 +6009,7 @@ code_module_md5_1(BIF_ALIST_1) } stp->module = THE_NON_VALUE; /* Suppress diagnostiscs */ if (!init_iff_file(stp, bytes, binary_size(Bin)) || - !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + !scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { res = am_undefined; goto done; @@ -6335,7 +6358,7 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info) if (!init_iff_file(stp, bytes, size)) { goto error; } - if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES, NUM_MANDATORY) || + if (!scan_iff_file(stp, chunk_types, NUM_CHUNK_TYPES) || !verify_chunks(stp)) { goto error; } @@ -6343,9 +6366,16 @@ erts_make_stub_module(Process* p, Eterm hipe_magic_bin, Eterm Beam, Eterm Info) if (!read_code_header(stp)) { goto error; } - define_file(stp, "atom table", ATOM_CHUNK); - if (!load_atom_table(stp)) { - goto error; + if (stp->chunks[UTF8_ATOM_CHUNK].size > 0) { + define_file(stp, "utf8 atom table", UTF8_ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_UTF8)) { + goto error; + } + } else { + define_file(stp, "atom table", ATOM_CHUNK); + if (!load_atom_table(stp, ERTS_ATOM_ENC_LATIN1)) { + goto error; + } } define_file(stp, "export table", EXP_CHUNK); if (!stub_read_export_table(stp)) { diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c index d886c2985e..95bf13c07c 100644 --- a/erts/emulator/beam/bif.c +++ b/erts/emulator/beam/bif.c @@ -3022,8 +3022,8 @@ BIF_RETTYPE atom_to_list_1(BIF_ALIST_1) BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) { Eterm res; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); - Sint i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); + byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT); + Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); if (i < 0) { erts_free(ERTS_ALC_T_TMP, (void *) buf); @@ -3033,7 +3033,7 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) } BIF_ERROR(BIF_P, BADARG); } - res = erts_atom_put((byte *) buf, i, ERTS_ATOM_ENC_LATIN1, 1); + res = erts_atom_put(buf, i, ERTS_ATOM_ENC_UTF8, 1); ASSERT(is_atom(res)); erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(res); @@ -3043,17 +3043,17 @@ BIF_RETTYPE list_to_atom_1(BIF_ALIST_1) BIF_RETTYPE list_to_existing_atom_1(BIF_ALIST_1) { - Sint i; - char *buf = (char *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_CHARACTERS); + byte *buf = (byte *) erts_alloc(ERTS_ALC_T_TMP, MAX_ATOM_SZ_LIMIT); + Sint i = erts_unicode_list_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS); - if ((i = intlist_to_buf(BIF_ARG_1, buf, MAX_ATOM_CHARACTERS)) < 0) { + if (i < 0) { error: erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_ERROR(BIF_P, BADARG); } else { Eterm a; - if (erts_atom_get(buf, i, &a, ERTS_ATOM_ENC_LATIN1)) { + if (erts_atom_get((char *) buf, i, &a, ERTS_ATOM_ENC_UTF8)) { erts_free(ERTS_ALC_T_TMP, (void *) buf); BIF_RET(a); } else { diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index bd5e1482fb..8919898181 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1890,74 +1890,57 @@ binary_to_atom(Process* proc, Eterm bin, Eterm enc, int must_exist) byte* bytes; byte *temp_alloc = NULL; Uint bin_size; + Eterm a; if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) { BIF_ERROR(proc, BADARG); } bin_size = binary_size(bin); + if (enc == am_latin1) { - Eterm a; - if (bin_size > MAX_ATOM_CHARACTERS) { - system_limit: - erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(proc, SYSTEM_LIMIT); - } if (!must_exist) { - a = erts_atom_put((byte *) bytes, - bin_size, - ERTS_ATOM_ENC_LATIN1, - 0); - erts_free_aligned_binary_bytes(temp_alloc); - if (is_non_value(a)) - goto badarg; - BIF_RET(a); - } else if (erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) { - erts_free_aligned_binary_bytes(temp_alloc); - BIF_RET(a); - } else { + int lix = erts_atom_put_index((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_LATIN1, + 0); + if (lix == ATOM_BAD_ENCODING_ERROR) { + badarg: + erts_free_aligned_binary_bytes(temp_alloc); + BIF_ERROR(proc, BADARG); + } else if (lix == ATOM_MAX_CHARS_ERROR) { + system_limit: + erts_free_aligned_binary_bytes(temp_alloc); + BIF_ERROR(proc, SYSTEM_LIMIT); + } + + a = make_atom(lix); + } else if (!erts_atom_get((char *)bytes, bin_size, &a, ERTS_ATOM_ENC_LATIN1)) { goto badarg; } - } else if (enc == am_utf8 || enc == am_unicode) { - Eterm res; - Uint num_chars = 0; - const byte* p = bytes; - Uint left = bin_size; - while (left) { - if (++num_chars > MAX_ATOM_CHARACTERS) { + } else if (enc == am_utf8 || enc == am_unicode) { + if (!must_exist) { + int uix = erts_atom_put_index((byte *) bytes, + bin_size, + ERTS_ATOM_ENC_UTF8, + 0); + if (uix == ATOM_BAD_ENCODING_ERROR) { + goto badarg; + } else if (uix == ATOM_MAX_CHARS_ERROR) { goto system_limit; } - if ((p[0] & 0x80) == 0) { - ++p; - --left; - } - else if (left >= 2 - && (p[0] & 0xFE) == 0xC2 /* only allow latin1 subset */ - && (p[1] & 0xC0) == 0x80) { - p += 2; - left -= 2; - } - else goto badarg; - } - if (!must_exist) { - res = erts_atom_put((byte *) bytes, - bin_size, - ERTS_ATOM_ENC_UTF8, - 0); + a = make_atom(uix); } - else if (!erts_atom_get((char*)bytes, bin_size, &res, ERTS_ATOM_ENC_UTF8)) { + else if (!erts_atom_get((char*)bytes, bin_size, &a, ERTS_ATOM_ENC_UTF8)) { goto badarg; } - erts_free_aligned_binary_bytes(temp_alloc); - if (is_non_value(res)) - goto badarg; - BIF_RET(res); } else { - badarg: - erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(proc, BADARG); + goto badarg; } + + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(a); } BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2) diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 2b2f3c5cdc..9f2b43d216 100644 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1373,6 +1373,7 @@ int erts_utf8_to_latin1(byte* dest, const byte* source, int slen); void bin_write(fmtfn_t, void*, byte*, size_t); Sint intlist_to_buf(Eterm, char*, Sint); /* most callers pass plain char*'s */ +Sint erts_unicode_list_to_buf(Eterm list, byte *buf, Sint len); struct Sint_buf { #if defined(ARCH_64) diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c index ec502d5a78..36b818505c 100644 --- a/erts/emulator/beam/utils.c +++ b/erts/emulator/beam/utils.c @@ -3923,6 +3923,68 @@ intlist_to_buf(Eterm list, char *buf, Sint len) return -2; /* not enough space */ } +/* Fill buf with the contents of the unicode list. + * Return the number of bytes in the buffer, + * or -1 for type error, + * or -2 for not enough buffer space (buffer contains truncated result). + */ +Sint +erts_unicode_list_to_buf(Eterm list, byte *buf, Sint len) +{ + Eterm* listptr; + Sint sz = 0; + + if (is_nil(list)) { + return 0; + } + if (is_not_list(list)) { + return -1; + } + listptr = list_val(list); + + while (len-- > 0) { + Sint val; + + if (is_not_small(CAR(listptr))) { + return -1; + } + val = signed_val(CAR(listptr)); + if (0 <= val && val < 0x80) { + buf[sz] = val; + sz++; + } else if (val < 0x800) { + buf[sz+0] = 0xC0 | (val >> 6); + buf[sz+1] = 0x80 | (val & 0x3F); + sz += 2; + } else if (val < 0x10000UL) { + if (0xD800 <= val && val <= 0xDFFF) { + return -1; + } + buf[sz+0] = 0xE0 | (val >> 12); + buf[sz+1] = 0x80 | ((val >> 6) & 0x3F); + buf[sz+2] = 0x80 | (val & 0x3F); + sz += 3; + } else if (val < 0x110000) { + buf[sz+0] = 0xF0 | (val >> 18); + buf[sz+1] = 0x80 | ((val >> 12) & 0x3F); + buf[sz+2] = 0x80 | ((val >> 6) & 0x3F); + buf[sz+3] = 0x80 | (val & 0x3F); + sz += 4; + } else { + return -1; + } + list = CDR(listptr); + if (is_nil(list)) { + return sz; + } + if (is_not_list(list)) { + return -1; + } + listptr = list_val(list); + } + return -2; /* not enough space */ +} + /* ** Convert an integer to a byte list ** return pointer to converted stuff (need not to be at start of buf!) -- cgit v1.2.3