diff options
Diffstat (limited to 'erts/emulator/beam/erl_unicode.c')
-rw-r--r-- | erts/emulator/beam/erl_unicode.c | 1815 |
1 files changed, 1815 insertions, 0 deletions
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c new file mode 100644 index 0000000000..ab5811c70f --- /dev/null +++ b/erts/emulator/beam/erl_unicode.c @@ -0,0 +1,1815 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2008-2009. All Rights Reserved. + * + * The contents of this file are subject to the Erlang Public License, + * Version 1.1, (the "License"); you may not use this file except in + * compliance with the License. You should have received a copy of the + * Erlang Public License along with this software. If not, it can be + * retrieved online at http://www.erlang.org/. + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * %CopyrightEnd% + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include "sys.h" +#include "erl_vm.h" +#include "global.h" +#include "erl_process.h" +#include "error.h" +#include "bif.h" +#include "erl_binary.h" +#include "big.h" + +#include "erl_unicode.h" +/* + * Conversion state that has to survive a trap while a list result is + * being produced. It is copied into a magic binary by + * make_magic_bin_for_restart(). The field notes describe the uses + * visible in build_list_return() and finalize_list_to_list(). + */ +typedef struct _restart_context { + byte *bytes; /* temporary UTF-8 buffer; freed by cleanup_restart_context() */ + Uint num_processed_bytes; /* bytes of the buffer already turned into list cells */ + Uint num_bytes_to_process; /* bytes of the buffer not yet turned into list cells */ + Uint num_resulting_chars; /* characters not yet turned into list cells */ + int state; /* one of the UTF8_* codes defined below */ +} RestartContext; + + +#define LOOP_FACTOR 10 /* loop iterations per reduction, see allowed_iterations() */ +#define LOOP_FACTOR_SIMPLE 50 /* When just counting */ +/* Cap on loop iterations per call; see erts_unicode_set_loop_limit() */ +static Uint max_loop_limit; + +static BIF_RETTYPE utf8_to_list(BIF_ALIST_1); +static BIF_RETTYPE finalize_list_to_list(Process *p, + byte *bytes, + Eterm rest, + Uint num_processed_bytes, + Uint num_bytes_to_process, + Uint num_resulting_chars, + int state, int left, + Eterm tail); +static int analyze_utf8(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left); +#define UTF8_OK 0 /* UTF8_*: return codes of analyze_utf8(), also stored in RestartContext.state */ +#define UTF8_INCOMPLETE 1 +#define UTF8_ERROR 2 +#define UTF8_ANALYZE_MORE 3 + +static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_3); +static BIF_RETTYPE 
characters_to_list_trap_1(BIF_ALIST_3); +static BIF_RETTYPE characters_to_list_trap_2(BIF_ALIST_3); + +static BIF_RETTYPE characters_to_list_trap_3(BIF_ALIST_3); +static BIF_RETTYPE characters_to_list_trap_4(BIF_ALIST_1); +/* Export entries for the internal trap functions; filled in by erts_init_unicode() */ +static Export characters_to_utf8_trap_exp; +static Export characters_to_list_trap_1_exp; +static Export characters_to_list_trap_2_exp; + +static Export characters_to_list_trap_3_exp; +static Export characters_to_list_trap_4_exp; +/* Exports in module 'unicode' obtained with erts_export_put() in erts_init_unicode() */ +static Export *c_to_b_int_trap_exportp = NULL; +static Export *c_to_l_int_trap_exportp = NULL; +/* + * Initialize this module: set max_loop_limit to its default and fill in + * the five static Export entries used for trapping. Each entry gets + * code[0] = am_erlang, code[1] = an atom naming the trap function, + * code[2] = 3 or 1 (NOTE(review): presumably the module/function/arity + * triple), code[3] = em_apply_bif and code[4] = the address of the C + * function, with .address pointing at code[3]. Finally the exports for + * unicode:characters_to_binary_int/2 and unicode:characters_to_list_int/2 + * are fetched. + */ +void erts_init_unicode(void) +{ + max_loop_limit = CONTEXT_REDS * LOOP_FACTOR; + /* Non visual BIFs to trap to. */ + memset(&characters_to_utf8_trap_exp, 0, sizeof(Export)); + characters_to_utf8_trap_exp.address = + &characters_to_utf8_trap_exp.code[3]; + characters_to_utf8_trap_exp.code[0] = am_erlang; + characters_to_utf8_trap_exp.code[1] = + am_atom_put("characters_to_utf8_trap",23); + characters_to_utf8_trap_exp.code[2] = 3; + characters_to_utf8_trap_exp.code[3] = + (Eterm) em_apply_bif; + characters_to_utf8_trap_exp.code[4] = + (Eterm) &characters_to_utf8_trap; + + memset(&characters_to_list_trap_1_exp, 0, sizeof(Export)); + characters_to_list_trap_1_exp.address = + &characters_to_list_trap_1_exp.code[3]; + characters_to_list_trap_1_exp.code[0] = am_erlang; + characters_to_list_trap_1_exp.code[1] = + am_atom_put("characters_to_list_trap_1",25); + characters_to_list_trap_1_exp.code[2] = 3; + characters_to_list_trap_1_exp.code[3] = + (Eterm) em_apply_bif; + characters_to_list_trap_1_exp.code[4] = + (Eterm) &characters_to_list_trap_1; + + memset(&characters_to_list_trap_2_exp, 0, sizeof(Export)); + characters_to_list_trap_2_exp.address = + &characters_to_list_trap_2_exp.code[3]; + characters_to_list_trap_2_exp.code[0] = am_erlang; + characters_to_list_trap_2_exp.code[1] = + am_atom_put("characters_to_list_trap_2",25); + characters_to_list_trap_2_exp.code[2] = 3; + characters_to_list_trap_2_exp.code[3] = + (Eterm) 
em_apply_bif; + characters_to_list_trap_2_exp.code[4] = + (Eterm) &characters_to_list_trap_2; + + + memset(&characters_to_list_trap_3_exp, 0, sizeof(Export)); + characters_to_list_trap_3_exp.address = + &characters_to_list_trap_3_exp.code[3]; + characters_to_list_trap_3_exp.code[0] = am_erlang; + characters_to_list_trap_3_exp.code[1] = + am_atom_put("characters_to_list_trap_3",25); + characters_to_list_trap_3_exp.code[2] = 3; + characters_to_list_trap_3_exp.code[3] = + (Eterm) em_apply_bif; + characters_to_list_trap_3_exp.code[4] = + (Eterm) &characters_to_list_trap_3; + + memset(&characters_to_list_trap_4_exp, 0, sizeof(Export)); + characters_to_list_trap_4_exp.address = + &characters_to_list_trap_4_exp.code[3]; + characters_to_list_trap_4_exp.code[0] = am_erlang; + characters_to_list_trap_4_exp.code[1] = + am_atom_put("characters_to_list_trap_4",25); + characters_to_list_trap_4_exp.code[2] = 1; /* trap_4 is declared with BIF_ALIST_1, the others with BIF_ALIST_3 */ + characters_to_list_trap_4_exp.code[3] = + (Eterm) em_apply_bif; + characters_to_list_trap_4_exp.code[4] = + (Eterm) &characters_to_list_trap_4; + + c_to_b_int_trap_exportp = erts_export_put(am_unicode,am_characters_to_binary_int,2); + c_to_l_int_trap_exportp = erts_export_put(am_unicode,am_characters_to_list_int,2); + + +} + +/* Allocate 'size' bytes for a temporary conversion buffer (allocator type ERTS_ALC_T_UNICODE_BUFFER). */ +static ERTS_INLINE void *alloc_restart(size_t size) +{ + return erts_alloc(ERTS_ALC_T_UNICODE_BUFFER,size); +} +/* Release a buffer obtained from alloc_restart(). */ +static ERTS_INLINE void free_restart(void *ptr) +{ + erts_free(ERTS_ALC_T_UNICODE_BUFFER, ptr); +} +/* Free rc->bytes if it is still set and clear the pointer, so that a repeated call does nothing. */ +static void cleanup_restart_context(RestartContext *rc) +{ + if (rc->bytes != NULL) { + free_restart(rc->bytes); + rc->bytes = NULL; + } +} +/* Destructor registered for the magic binaries created by make_magic_bin_for_restart(). */ +static void cleanup_restart_context_bin(Binary *bp) +{ + RestartContext *rc = ERTS_MAGIC_BIN_DATA(bp); + cleanup_restart_context(rc); +} +/* + * Return the RestartContext stored inside the magic binary term 'bin'. + * The ASSERTs check that the term is a magic binary whose destructor is + * cleanup_restart_context_bin. + */ +static RestartContext *get_rc_from_bin(Eterm bin) +{ + Binary *mbp; + ASSERT(ERTS_TERM_IS_MAGIC_BINARY(bin)); + + mbp = ((ProcBin *) binary_val(bin))->val; + + ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(mbp) + == cleanup_restart_context_bin); + return (RestartContext 
*) ERTS_MAGIC_BIN_DATA(mbp); +} +/* + * Copy *rc into a freshly created magic binary (destructor + * cleanup_restart_context_bin) and return a term referring to it, + * built on the heap of process p. + */ +static Eterm make_magic_bin_for_restart(Process *p, RestartContext *rc) +{ + Binary *mbp = erts_create_magic_binary(sizeof(RestartContext), + cleanup_restart_context_bin); + RestartContext *restartp = ERTS_MAGIC_BIN_DATA(mbp); + Eterm *hp; + memcpy(restartp,rc,sizeof(RestartContext)); + hp = HAlloc(p, PROC_BIN_SIZE); + return erts_mk_magic_binary_term(&hp, &MSO(p), mbp); +} + +/* + * Set the per-call iteration cap to 'limit'; a limit <= 0 restores the + * default CONTEXT_REDS * LOOP_FACTOR. Returns the previous cap. + */ +Sint erts_unicode_set_loop_limit(Sint limit) +{ + Sint save = (Sint) max_loop_limit; + if (limit <= 0) { + max_loop_limit = CONTEXT_REDS * LOOP_FACTOR; + } else { + max_loop_limit = (Uint) limit; + } + return save; +} +/* Iteration budget for one call: reductions left for p times LOOP_FACTOR, capped at max_loop_limit. */ +static ERTS_INLINE int allowed_iterations(Process *p) +{ + int tmp = ERTS_BIF_REDS_LEFT(p) * LOOP_FACTOR; + int tmp2 = max_loop_limit; + if (tmp2 < tmp) + return tmp2; + else + return tmp; +} +static ERTS_INLINE int cost_to_proc(Process *p, int cost) +{ /* Charge 'cost' loop iterations to p as cost / LOOP_FACTOR reductions; returns the reductions charged. */ + int x = (cost / LOOP_FACTOR); + BUMP_REDS(p,x); + return x; +} +static ERTS_INLINE int simple_loops_to_common(int cost) +{ /* Scale a cost counted in LOOP_FACTOR_SIMPLE units down to LOOP_FACTOR units. */ + int factor = (LOOP_FACTOR_SIMPLE / LOOP_FACTOR); + return (cost / factor); +} +/* Size in bytes of 'binary', or -1 if it has trailing bits (bitsize != 0). */ +static Sint aligned_binary_size(Eterm binary) +{ + unsigned char *bytes; + Uint bitoffs; + Uint bitsize; + + ERTS_GET_BINARY_BYTES(binary, bytes, bitoffs, bitsize); + if (bitsize != 0) { + return (Sint) -1; + } + return binary_size(binary); +} +/* + * Number of bytes needed to hold the bytes of 'binary' re-encoded from + * Latin-1 to UTF-8: two for every byte with the high bit set, one + * otherwise. Returns -1 if the binary has trailing bits. + */ +static Sint latin1_binary_need(Eterm binary) +{ + unsigned char *bytes; + byte *temp_alloc = NULL; + Uint bitoffs; + Uint bitsize; + Uint size; + Sint need = 0; + Sint i; + + ERTS_GET_BINARY_BYTES(binary, bytes, bitoffs, bitsize); + if (bitsize != 0) { + return (Sint) -1; + } + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(binary, &temp_alloc); + /* The call to erts_get_aligned_binary_bytes cannot fail as + we've already checked bitsize and that this is a binary */ + } + size = binary_size(binary); + for(i = 0; i < size; ++i) { + if (bytes[i] & ((byte) 0x80)) { + need += 2; + } else { + need += 1; + } + } + 
erts_free_aligned_binary_bytes(temp_alloc); + return need; +} +/* + * Length (1..4) of the UTF-8 sequence introduced by the lead byte + * 'first', judged from its high bits only; -1 if the byte matches none + * of the four lead-byte patterns. + */ +static int utf8_len(byte first) +{ + if ((first & ((byte) 0x80)) == 0) { + return 1; + } else if ((first & ((byte) 0xE0)) == 0xC0) { + return 2; + } else if ((first & ((byte) 0xF0)) == 0xE0) { + return 3; + } else if ((first & ((byte) 0xF8)) == 0xF0) { + return 4; + } + return -1; +} +/* + * Copy 'size' bytes of UTF-8 from source to target, validating each + * sequence on the way. Returns the number of bytes written to target + * and adds one to *characters for every sequence copied. + * + * On an invalid sequence *err_pos is set to its first byte and copying + * stops. If 'leftover' is non-NULL, a sequence that is cut short by the + * end of the input is stored there (length in *num_leftovers) instead + * of being reported as an error. Bytes already present in 'leftover' + * on entry are completed from the start of 'source' first. + */ +static int copy_utf8_bin(byte *target, byte *source, Uint size, + byte *leftover, int *num_leftovers, + byte **err_pos, Uint *characters) { + int copied = 0; + if (leftover != NULL && *num_leftovers) { + int need = utf8_len(leftover[0]); + int from_source = need - (*num_leftovers); + int c; + byte *tmp_err_pos = NULL; + ASSERT(need > 0); + ASSERT(from_source > 0); + if (size < from_source) { /* still not a whole sequence: keep collecting */ + memcpy(leftover + (*num_leftovers), source, size); + *num_leftovers += size; + return 0; + } + /* leftover has room for four bytes (see bif) */ + memcpy(leftover + (*num_leftovers),source,from_source); + c = copy_utf8_bin(target, leftover, need, NULL, NULL, &tmp_err_pos, characters); /* validate and copy the completed sequence */ + if (tmp_err_pos != 0) { + *err_pos = source; + return 0; + } + copied += c; + *num_leftovers = 0; + size -= from_source; + target += c; + source += from_source; + } + while (size) { + if (((*source) & ((byte) 0x80)) == 0) { + *(target++) = *(source++); + --size; ++copied; + } else if (((*source) & ((byte) 0xE0)) == 0xC0) { + if (leftover && size < 2) { + *leftover = *source; + *num_leftovers = 1; + break; + } + if (size < 2 || ((source[1] & ((byte) 0xC0)) != 0x80) || + ((*source) < 0xC2) /* overlong */) { + *err_pos = source; + return copied; + } + *(target++) = *(source++); + *(target++) = *(source++); + size -= 2; copied += 2; + } else if (((*source) & ((byte) 0xF0)) == 0xE0) { + if (leftover && size < 3) { + memcpy(leftover, source, (int) size); + *num_leftovers = (int) size; + break; + } + if (size < 3 || ((source[1] & ((byte) 0xC0)) != 0x80) || + ((source[2] & ((byte) 0xC0)) != 0x80) || + (((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong 
*/ ) { + *err_pos = source; + return copied; + } + if ((((*source) & ((byte) 0xF)) == 0xD) && + ((source[1] & 0x20) != 0)) { /* 0xED 0xA0..0xBF: code points 0xD800..0xDFFF, also rejected for integers in do_build_utf8() */ + *err_pos = source; + return copied; + } + + if (((*source) == 0xEF) && (source[1] == 0xBF) && + ((source[2] == 0xBE) || (source[2] == 0xBF))) { /* code points 0xFFFE and 0xFFFF */ + *err_pos = source; + return copied; + } + + *(target++) = *(source++); + *(target++) = *(source++); + *(target++) = *(source++); + size -= 3; copied += 3; + } else if (((*source) & ((byte) 0xF8)) == 0xF0) { + if (leftover && size < 4) { + memcpy(leftover, source, (int) size); + *num_leftovers = (int) size; + break; + } + if (size < 4 || ((source[1] & ((byte) 0xC0)) != 0x80) || + ((source[2] & ((byte) 0xC0)) != 0x80) || + ((source[3] & ((byte) 0xC0)) != 0x80) || + (((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) { + *err_pos = source; + return copied; + } + if ((((*source) & ((byte)0x7)) > 0x4U) || + ((((*source) & ((byte)0x7)) == 0x4U) && + ((source[1] & ((byte)0x3F)) > 0xFU))) { /* would decode to a value above 0x10FFFF */ + *err_pos = source; + return copied; + } + *(target++) = *(source++); + *(target++) = *(source++); + *(target++) = *(source++); + *(target++) = *(source++); + size -= 4; copied +=4; + } else { + *err_pos = source; + return copied; + } + ++(*characters); + } + return copied; +} + + +/* + * Sizing pass over the input. Returns the number of bytes the UTF-8 + * encoding of 'ioterm' will need, or -1 if the term cannot be converted + * (a binary with trailing bits, or a term that is neither integer, + * binary, list nor []). 'ioterm' may be a binary or a list, possibly + * nested and possibly with a binary as improper tail. With 'latin1' + * set, binaries are sized as Latin-1 that is to be re-encoded; + * otherwise their byte size is used as is. + * + * *costp receives an estimate of the work done, which the callers feed + * to simple_loops_to_common(). + */ +static Sint utf8_need(Eterm ioterm, int latin1, Uint *costp) +{ + Eterm *objp; + Eterm obj; + DECLARE_ESTACK(stack); + Sint need = 0; + Uint cost = 0; + + if (is_nil(ioterm)) { + DESTROY_ESTACK(stack); + *costp = 0; + return need; + } + if(is_binary(ioterm)) { + DESTROY_ESTACK(stack); + if (latin1) { + Sint x = latin1_binary_need(ioterm); + *costp = x; + return x; + } else { + *costp = 1; + return aligned_binary_size(ioterm); + } + } + + if (!is_list(ioterm)) { + DESTROY_ESTACK(stack); + *costp = 0; + return (Sint) -1; + } + /* OK a list, needs to be processed in order, handling each flat list-level + as they occur, just like io_list_to_binary would */ + ESTACK_PUSH(stack,ioterm); + while (!ESTACK_ISEMPTY(stack)) { + ioterm = 
ESTACK_POP(stack); + if (is_nil(ioterm)) { + /* ignore empty lists */ + continue; + } + if(is_list(ioterm)) { +L_Again: /* Restart with sublist, old listend was pushed on stack */ + objp = list_val(ioterm); + obj = CAR(objp); + for(;;) { /* loop over one flat list of bytes and binaries + until sublist or list end is encountered */ + if (is_small(obj)) { /* Always small */ + for(;;) { + Uint x = unsigned_val(obj); + if (x < 0x80) /* encoded length by code point range: 1, 2, 3 or 4 bytes */ + need +=1; + else if (x < 0x800) + need += 2; + else if (x < 0x10000) + need += 3; + else + need += 4; + /* everything else will give badarg later + in the process, so we don't check */ + ++cost; + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + if (!is_byte(obj)) /* anything else is dispatched again by the enclosing loop */ + break; + } + } else if (is_nil(obj)) { + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + } else if (is_list(obj)) { + /* push rest of list for later processing, start + again with sublist */ + ESTACK_PUSH(stack,CDR(objp)); + ioterm = obj; + goto L_Again; + } else if (is_binary(obj)) { + Sint x; + + if (latin1) { + x = latin1_binary_need(obj); + if (x < 0) { + DESTROY_ESTACK(stack); + *costp = cost; + return x; + } + cost += x; /* every byte was inspected */ + } else { + x = aligned_binary_size(obj); + if (x < 0) { + DESTROY_ESTACK(stack); + *costp = cost; + return x; + } + ++cost; /* only the size was read */ + } + need += x; + ioterm = CDR(objp); + if (is_list(ioterm)) { + /* objp and obj need to be updated if + loop is to continue */ + objp = list_val(ioterm); + obj = CAR(objp); + } + } else { + DESTROY_ESTACK(stack); + *costp = cost; + return ((Sint) -1); + } + if (is_nil(ioterm) || !is_list(ioterm)) { + break; + } + } /* for(;;) */ + } /* is_list(ioterm) */ + + if (!is_list(ioterm) && !is_nil(ioterm)) { + /* improper list end */ + if (is_binary(ioterm)) { + Sint x; + if (latin1) { + x = latin1_binary_need(ioterm); + if (x < 0) { + DESTROY_ESTACK(stack); + *costp = cost; + return x; + } + cost += x; + } else { + x = 
aligned_binary_size(ioterm); + if (x < 0) { + DESTROY_ESTACK(stack); + *costp = cost; + return x; + } + ++cost; + } + need += x; + } else { + DESTROY_ESTACK(stack); + *costp = cost; + return ((Sint) -1); + } + } + } /* while not estack empty */ + DESTROY_ESTACK(stack); + *costp = cost; + return need; +} + +/* + * Encode as much of 'ioterm' as the budget *left allows into 'target', + * writing from index *pos. 'ioterm' is a binary or a list (possibly + * nested, possibly with a binary as improper tail) of integers and + * binaries. With 'latin1' set, integers above 255 are errors and binary + * bytes are re-encoded from Latin-1; otherwise binaries are validated + * and copied by copy_utf8_bin(). + * + * Updated through pointers: *pos (bytes written), *characters + * (characters produced), *left (remaining budget), *err (set to 1 on + * invalid input, 0 otherwise) and leftover / *num_leftovers (a UTF-8 + * sequence cut short at the end of a binary). + * + * Returns the part of the input that has not been processed; heap for + * that term is allocated on process p. + */ +static Eterm do_build_utf8(Process *p, Eterm ioterm, int *left, int latin1, + byte *target, int *pos, Uint *characters, int *err, + byte *leftover, int *num_leftovers) +{ + int c; + Eterm *objp; + Eterm obj; + DECLARE_ESTACK(stack); + + *err = 0; + if ((*left) <= 0 || is_nil(ioterm)) { + DESTROY_ESTACK(stack); + return ioterm; + } + if(is_binary(ioterm)) { + Uint bitoffs; + Uint bitsize; + Uint size; + Uint i; + Eterm res_term = NIL; + unsigned char *bytes; + byte *temp_alloc = NULL; + Uint orig_size; + + ERTS_GET_BINARY_BYTES(ioterm, bytes, bitoffs, bitsize); + if (bitsize != 0) { + *err = 1; + DESTROY_ESTACK(stack); + return ioterm; + } + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(ioterm, &temp_alloc); + /* The call to erts_get_aligned_binary_bytes cannot fail as + we've already checked bitsize and that this is a binary */ + } + + orig_size = size = binary_size(ioterm); + + /* This is done to avoid splitting binaries in two + and then create an unnecessary rest that eventually gives an error. 
+ For cases where errors are not returned this is unnecessary */ + if (!latin1) { + /* Find a valid character boundary */ + while (size > (*left) && + (((byte) bytes[(*left)]) & ((byte) 0xC0)) == ((byte) 0x80)) { /* 10xxxxxx: the byte at the split point continues a sequence */ + ++(*left); + } + } + + if (size > (*left)) { + Eterm *hp; + ErlSubBin *sb; + Eterm orig; + Uint offset; + /* Split the binary in two parts, of which we + only process the first */ + hp = HAlloc(p, ERL_SUB_BIN_SIZE); + sb = (ErlSubBin *) hp; + ERTS_GET_REAL_BIN(ioterm, orig, offset, bitoffs, bitsize); + sb->thing_word = HEADER_SUB_BIN; + sb->size = size - (*left); /* the sub binary covers the bytes from index *left onwards */ + sb->offs = offset + (*left); + sb->orig = orig; + sb->bitoffs = bitoffs; + sb->bitsize = bitsize; + sb->is_writable = 0; + res_term = make_binary(sb); /* returned to the caller as the unprocessed rest */ + size = (*left); + } + + if (!latin1) { + int num; + byte *err_pos = NULL; + num = copy_utf8_bin(target + (*pos), bytes, + size, leftover, num_leftovers,&err_pos,characters); + *pos += num; + if (err_pos != NULL) { + int rest_bin_offset; + int rest_bin_size; + Eterm *hp; + ErlSubBin *sb; + Eterm orig; + Uint offset; + + *err = 1; + /* we have no real stack, just build a list of the binaries + we have not decoded... 
*/ + DESTROY_ESTACK(stack); + + rest_bin_offset = (err_pos - bytes); /* the rest starts at the position copy_utf8_bin() reported */ + rest_bin_size = orig_size - rest_bin_offset; /* and runs to the end of the original binary */ + + hp = HAlloc(p, ERL_SUB_BIN_SIZE); + sb = (ErlSubBin *) hp; + ERTS_GET_REAL_BIN(ioterm, orig, offset, bitoffs, bitsize); + sb->thing_word = HEADER_SUB_BIN; + sb->size = rest_bin_size; + sb->offs = offset + rest_bin_offset; + sb->orig = orig; + sb->bitoffs = bitoffs; + sb->bitsize = bitsize; + sb->is_writable = 0; + res_term = make_binary(sb); + erts_free_aligned_binary_bytes(temp_alloc); + return res_term; + } + } else { + i = 0; + while(i < size) { /* Latin-1 bytes: below 0x80 copied as is, others become two UTF-8 bytes */ + if (bytes[i] < 0x80) { + target[(*pos)++] = bytes[i++]; + } else { + target[(*pos)++] = ((bytes[i] >> 6) | ((byte) 0xC0)); + target[(*pos)++] = ((bytes[i] & 0x3F) | ((byte) 0x80)); + ++i; + } + ++(*characters); + } + } + *left -= size; /* binaries are charged one budget unit per byte */ + DESTROY_ESTACK(stack); + erts_free_aligned_binary_bytes(temp_alloc); + return res_term; + } + + if (!is_list(ioterm)) { + *err = 1; + goto done; + } + + /* OK a list, needs to be processed in order, handling each flat list-level + as they occur, just like io_list_to_binary would */ + ESTACK_PUSH(stack,ioterm); + while (!ESTACK_ISEMPTY(stack) && (*left)) { + ioterm = ESTACK_POP(stack); + if (is_nil(ioterm)) { + /* ignore empty lists */ + continue; + } + if(is_list(ioterm)) { +L_Again: /* Restart with sublist, old listend was pushed on stack */ + objp = list_val(ioterm); + obj = CAR(objp); + for(;;) { /* loop over one flat list of bytes and binaries + until sublist or list end is encountered */ + if (is_small(obj)) { /* Always small in unicode*/ + if (*num_leftovers) { + /* Have rest from previous bin and this is an integer, not allowed */ + *err = 1; + goto done; + } + for(;;) { + Uint x = unsigned_val(obj); + if (latin1 && x > 255) { + *err = 1; + goto done; + } + if (x < 0x80) { + target[(*pos)++] = (byte) x; + } + else if (x < 0x800) { + target[(*pos)++] = (((byte) (x >> 6)) | + ((byte) 0xC0)); + target[(*pos)++] = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } else 
if (x < 0x10000) { + if ((x >= 0xD800 && x <= 0xDFFF) || + (x == 0xFFFE) || + (x == 0xFFFF)) { /* Invalid unicode range */ + *err = 1; + goto done; + } + target[(*pos)++] = (((byte) (x >> 12)) | + ((byte) 0xE0)); + target[(*pos)++] = ((((byte) (x >> 6)) & 0x3F) | + ((byte) 0x80)); + target[(*pos)++] = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } else if (x < 0x110000) { /* Standard imposed max */ + target[(*pos)++] = (((byte) (x >> 18)) | + ((byte) 0xF0)); + target[(*pos)++] = ((((byte) (x >> 12)) & 0x3F) | + ((byte) 0x80)); + target[(*pos)++] = ((((byte) (x >> 6)) & 0x3F) | + ((byte) 0x80)); + target[(*pos)++] = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } else { + *err = 1; + goto done; + } + ++(*characters); + --(*left); /* integers are charged one budget unit each */ + ioterm = CDR(objp); + if (!is_list(ioterm) || !(*left)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + if (!is_small(obj)) + break; + } + } else if (is_nil(obj)) { + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + } else if (is_list(obj)) { + /* push rest of list for later processing, start + again with sublist */ + ESTACK_PUSH(stack,CDR(objp)); + ioterm = obj; + goto L_Again; + } else if (is_binary(obj)) { + Eterm rest_term; + rest_term = do_build_utf8(p,obj,left,latin1,target,pos, characters, err, + leftover, num_leftovers); + if ((*err) != 0) { /* error: put what the recursive call returned back in front of the list tail */ + Eterm *hp; + hp = HAlloc(p, 2); + obj = CDR(objp); + ioterm = CONS(hp, rest_term, obj); + //(*left) = 0; + goto done; + } + if (rest_term != NIL) { + Eterm *hp; + hp = HAlloc(p, 2); + obj = CDR(objp); + ioterm = CONS(hp, rest_term, obj); + (*left) = 0; + break; + } + ioterm = CDR(objp); + if (is_list(ioterm)) { + /* objp and obj need to be updated if + loop is to continue */ + objp = list_val(ioterm); + obj = CAR(objp); + } + } else { + *err = 1; + goto done; + } + if (!(*left) || is_nil(ioterm) || !is_list(ioterm)) { + break; + } + } /* for(;;) */ + } /* is_list(ioterm) */ + + if ((*left) && !is_list(ioterm) && 
!is_nil(ioterm)) { + /* improper list end */ + if (is_binary(ioterm)) { + ioterm = do_build_utf8(p,ioterm,left,latin1,target,pos,characters,err,leftover,num_leftovers); + if ((*err) != 0) { + goto done; + } + } else { + *err = 1; + goto done; + } + } + } /* while left and not estack empty */ + done: + c = ESTACK_COUNT(stack); /* list tails still waiting on the stack */ + if (c > 0) { + Eterm *hp = HAlloc(p,2*c); /* one cons cell per stacked tail */ + while(!ESTACK_ISEMPTY(stack)) { + Eterm st = ESTACK_POP(stack); + ioterm = CONS(hp, ioterm, st); + hp += 2; + } + } + DESTROY_ESTACK(stack); + return ioterm; + +} +/* + * Classify 'size' bytes at 'source' that were left over at the end of + * the input. Returns 0 if they can be the beginning of a UTF-8 sequence + * that is merely cut short (lead byte followed only by continuation + * bytes, fewer than the lead byte announces), and -1 otherwise. + */ +static int check_leftovers(byte *source, int size) +{ + if (((*source) & ((byte) 0xE0)) == 0xC0) { + return 0; + } else if (((*source) & ((byte) 0xF0)) == 0xE0) { + if (size < 2 || + (size < 3 && ((source[1] & ((byte) 0xC0)) == 0x80))) { + return 0; + } + } else if (((*source) & ((byte) 0xF8)) == 0xF0) { + if (size < 2 || + (size < 3 && ((source[1] & ((byte) 0xC0)) == 0x80)) || + (size < 4 && + ((source[1] & ((byte) 0xC0)) == 0x80) && + ((source[2] & ((byte) 0xC0)) == 0x80))) { + return 0; + } + } + return -1; +} + + +/* + * Finish one step of a conversion to binary. The size of 'bin' is set + * to 'pos' and the result is one of: + * - {error, bin, Rest} when 'err' is set, or when the input ended + * with leftover bytes that check_leftovers() rejects; + * - {incomplete, bin, Rest} when the input ended inside a sequence; + * - a trap to characters_to_utf8_trap when input remains; + * - bin itself when everything was converted. + * 'latin1' is the encoding atom, passed on unchanged when trapping. + */ +static BIF_RETTYPE build_utf8_return(Process *p,Eterm bin,int pos, + Eterm rest_term,int err, + byte *leftover,int num_leftovers,Eterm latin1) +{ + Eterm *hp; + Eterm ret; + + binary_size(bin) = pos; + if (err) { + if (num_leftovers > 0) { + Eterm leftover_bin = new_binary(p, leftover, num_leftovers); + hp = HAlloc(p,8); /* two cons cells (2 words each) plus the 3-tuple (4 words) */ + rest_term = CONS(hp,rest_term,NIL); + hp += 2; + rest_term = CONS(hp,leftover_bin,rest_term); + hp += 2; + } else { + hp = HAlloc(p,4); + } + ret = TUPLE3(hp,am_error,bin,rest_term); + } else if (rest_term == NIL && num_leftovers != 0) { + Eterm leftover_bin = new_binary(p, leftover, num_leftovers); + if (check_leftovers(leftover,num_leftovers) != 0) { + hp = HAlloc(p,4); + ret = TUPLE3(hp,am_error,bin,leftover_bin); + } else { + hp = HAlloc(p,4); + ret = TUPLE3(hp,am_incomplete,bin,leftover_bin); + } + } else { /* All OK */ + if (rest_term != NIL) { /* Trap */ + if (num_leftovers > 0) { + Eterm 
rest_bin = new_binary(p, leftover, num_leftovers); + hp = HAlloc(p,2); + rest_term = CONS(hp,rest_bin,rest_term); /* the cut-off bytes go first in the remaining input */ + } + BUMP_ALL_REDS(p); + BIF_TRAP3(&characters_to_utf8_trap_exp, p, bin, rest_term, latin1); + } else { /* Success */ + /*hp = HAlloc(p,5); + ret = TUPLE4(hp,bin,rest_term,make_small(pos),make_small(err));*/ + ret = bin; + } + } + BIF_RET(ret); +} + +/* + * Continuation that build_utf8_return() traps to. + * BIF_ARG_1: the result binary filled so far; its current size is the + * position where writing continues. + * BIF_ARG_2: the input that remains. + * BIF_ARG_3: the encoding atom (am_latin1 selects Latin-1 handling). + */ +static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_3) +{ + Eterm *real_bin; + Sint need; + byte* bytes; + Eterm rest_term; + int left, sleft; + int pos; + int err; + byte leftover[4]; /* used for temp buffer too, + otherwise 3 bytes would have been enough */ + int num_leftovers = 0; + int latin1 = 0; + Uint characters = 0; + + /*erts_printf("Trap %T!\r\n",BIF_ARG_2);*/ + ASSERT(is_binary(BIF_ARG_1)); + real_bin = binary_val(BIF_ARG_1); + ASSERT(*real_bin == HEADER_PROC_BIN); + need = ((ProcBin *) real_bin)->val->orig_size; /* NOTE(review): 'need' is not read again in this function */ + pos = (int) binary_size(BIF_ARG_1); + bytes = binary_bytes(BIF_ARG_1); + sleft = left = allowed_iterations(BIF_P); + err = 0; + if (BIF_ARG_3 == am_latin1) { + latin1 = 1; + } + rest_term = do_build_utf8(BIF_P, BIF_ARG_2, &left, latin1, + bytes, &pos, &characters, &err, leftover, &num_leftovers); + cost_to_proc(BIF_P, sleft - left); + return build_utf8_return(BIF_P,BIF_ARG_1,pos,rest_term,err, + leftover,num_leftovers,BIF_ARG_3); +} +/* + * BIF returning am_true if the argument is a binary without trailing + * bits in which no byte has the high bit set, am_false otherwise. + */ +BIF_RETTYPE unicode_bin_is_7bit_1(BIF_ALIST_1) +{ + Sint need; + if(!is_binary(BIF_ARG_1)) { + BIF_RET(am_false); + } + need = latin1_binary_need(BIF_ARG_1); + if(need >= 0 && aligned_binary_size(BIF_ARG_1) == need) { /* equal only when every byte needs exactly one byte */ + BIF_RET(am_true); + } + BIF_RET(am_false); +} +/* + * Returns 1 if 'orig_bin' has no trailing bits and analyze_utf8() + * accepts all of it (UTF8_OK), 0 otherwise. + */ +static int is_valid_utf8(Eterm orig_bin) +{ + Uint bitoffs; + Uint bitsize; + Uint size; + byte *temp_alloc = NULL; + byte *endpos; + Uint numchar; + byte *bytes; + int ret; + + ERTS_GET_BINARY_BYTES(orig_bin, bytes, bitoffs, bitsize); + if (bitsize != 0) { + return 0; + } + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(orig_bin, &temp_alloc); + } + size = 
binary_size(orig_bin); + ret = analyze_utf8(bytes, + size, + &endpos,&numchar,NULL); + erts_free_aligned_binary_bytes(temp_alloc); + return (ret == UTF8_OK); +} + +BIF_RETTYPE unicode_characters_to_binary_2(BIF_ALIST_2) +{ + Sint need; + Uint characters; + int latin1; + Eterm bin; + byte *bytes; + int pos; + int err; + int left, sleft; + Eterm rest_term, subject; + byte leftover[4]; /* used for temp buffer too, o + therwise 3 bytes would have been enough */ + int num_leftovers = 0; + Uint cost_of_utf8_need; + + + if (BIF_ARG_2 == am_latin1) { + latin1 = 1; + } else if (BIF_ARG_2 == am_unicode || BIF_ARG_2 == am_utf8) { + latin1 = 0; + } else { + BIF_TRAP2(c_to_b_int_trap_exportp, BIF_P, BIF_ARG_1, BIF_ARG_2); + } + if (is_list(BIF_ARG_1) && is_binary(CAR(list_val(BIF_ARG_1))) && + is_nil(CDR(list_val(BIF_ARG_1)))) { + subject = CAR(list_val(BIF_ARG_1)); + } else { + subject = BIF_ARG_1; + } + + need = utf8_need(subject,latin1,&cost_of_utf8_need); + if (need < 0) { + BIF_ERROR(BIF_P,BADARG); + } + if (is_binary(subject) && need >= 0 && aligned_binary_size(subject) == need + && (latin1 || is_valid_utf8(subject))) { + cost_to_proc(BIF_P, simple_loops_to_common(cost_of_utf8_need)); + BIF_RET(subject); + } + + + bin = erts_new_mso_binary(BIF_P, (byte *)NULL, need); + bytes = binary_bytes(bin); + cost_to_proc(BIF_P, simple_loops_to_common(cost_of_utf8_need)); + left = allowed_iterations(BIF_P) - + simple_loops_to_common(cost_of_utf8_need); + if (left <= 0) { + /* simplified - let everything be setup by setting left to 1 */ + left = 1; + } + sleft = left; + pos = 0; + err = 0; + + + rest_term = do_build_utf8(BIF_P, subject, &left, latin1, + bytes, &pos, &characters, &err, leftover, &num_leftovers); +#ifdef HARDDEBUG + if (left == 0) { + Eterm bin; + if (is_binary(subject)) { + bin = subject; + } else if(is_list(subject) && is_binary(CAR(list_val(subject)))) { + bin = CAR(list_val(subject)); + } else { + bin = NIL; + } + if (is_binary(bin)) { + byte *t = NULL; + Uint sz = 
binary_size(bin); + byte *by = erts_get_aligned_binary_bytes(bin,&t); + int i; + erts_printf("<<"); + for (i = 0;i < sz; ++i) { + erts_printf((i == sz -1) ? "0x%X" : "0x%X, ", (unsigned) by[i]); + } + erts_printf(">>: "); + erts_free_aligned_binary_bytes(t); + } + erts_printf("%d - %d = %d\n",sleft,left,sleft - left); + } +#endif + cost_to_proc(BIF_P, sleft - left); + return build_utf8_return(BIF_P,bin,pos,rest_term,err, + leftover,num_leftovers,BIF_ARG_2); +} +/* + * Finish one step of a conversion to list. 'bytes' is the temporary + * UTF-8 buffer holding 'pos' bytes / 'characters' characters so far. + * - On error, or when the input ended with leftover bytes, the buffer + * is handed to finalize_list_to_list() with state UTF8_ERROR or + * UTF8_INCOMPLETE and the unconverted rest. + * - When input remains, the state is saved in a magic binary and the + * BIF traps to characters_to_list_trap_1. + * - Otherwise finalize_list_to_list() is called with state UTF8_OK. + * 'latin1' is the encoding atom, passed on unchanged when trapping. + * 'left' is the remaining iteration budget; values <= 0 are raised to 1. + */ +static BIF_RETTYPE build_list_return(Process *p, byte *bytes, int pos, Uint characters, + Eterm rest_term, int err, + byte *leftover, int num_leftovers, + Eterm latin1, int left) +{ + Eterm *hp; + + if (left <= 0) { + left = 1; + } + + if (err) { + if (num_leftovers > 0) { + Eterm leftover_bin = new_binary(p, leftover, num_leftovers); + hp = HAlloc(p,4); /* two cons cells */ + rest_term = CONS(hp,rest_term,NIL); + hp += 2; + rest_term = CONS(hp,leftover_bin,rest_term); + } + BIF_RET(finalize_list_to_list(p, bytes, rest_term, 0U, pos, characters, UTF8_ERROR, left, NIL)); + } else if (rest_term == NIL && num_leftovers != 0) { + Eterm leftover_bin = new_binary(p, leftover, num_leftovers); + if (check_leftovers(leftover,num_leftovers) != 0) { + BIF_RET(finalize_list_to_list(p, bytes, leftover_bin, 0U, pos, characters, UTF8_ERROR, + left, NIL)); + } else { + BIF_RET(finalize_list_to_list(p, bytes, leftover_bin, 0U, pos, characters, UTF8_INCOMPLETE, + left, NIL)); + } + } else { /* All OK */ + if (rest_term != NIL) { /* Trap */ + RestartContext rc; + if (num_leftovers > 0) { + Eterm rest_bin = new_binary(p, leftover, num_leftovers); + hp = HAlloc(p,2); + rest_term = CONS(hp,rest_bin,rest_term); + } + BUMP_ALL_REDS(p); + rc.bytes = bytes; + rc.num_processed_bytes = 0; /* not used */ + rc.num_bytes_to_process = pos; + rc.num_resulting_chars = characters; + rc.state = UTF8_OK; /* not used */ + BIF_TRAP3(&characters_to_list_trap_1_exp, p, make_magic_bin_for_restart(p,&rc), + rest_term, latin1); + } else { /* Success */ + 
BIF_RET(finalize_list_to_list(p, bytes, NIL, 0U, pos, characters, UTF8_OK, left, NIL)); + } + } +} +/* + * Continuation that build_list_return() traps to. + * BIF_ARG_1: magic binary holding the RestartContext (buffer, bytes + * written so far, characters produced so far). + * BIF_ARG_2: the input that remains. + * BIF_ARG_3: the encoding atom (am_latin1 selects Latin-1 handling). + */ +static BIF_RETTYPE characters_to_list_trap_1(BIF_ALIST_3) +{ + RestartContext *rc; + byte* bytes; + int pos; + Uint characters; + int err; + Eterm rest_term; + int left, sleft; + + int latin1 = 0; + byte leftover[4]; /* used for temp buffer too, + otherwise 3 bytes would have been enough */ + int num_leftovers = 0; + + + rc = get_rc_from_bin(BIF_ARG_1); + + bytes = rc->bytes; + rc->bytes = NULL; /* to avoid free due to later GC */ + pos = rc->num_bytes_to_process; + characters = rc->num_resulting_chars; + + sleft = left = allowed_iterations(BIF_P); + err = 0; + if (BIF_ARG_3 == am_latin1) { + latin1 = 1; + } + rest_term = do_build_utf8(BIF_P, BIF_ARG_2, &left, latin1, + bytes, &pos, &characters, &err, leftover, &num_leftovers); + cost_to_proc(BIF_P, sleft - left); + return build_list_return(BIF_P,bytes,pos,characters,rest_term,err, + leftover,num_leftovers,BIF_ARG_3,left); +} +/* + * BIF converting BIF_ARG_1 (a binary or a list of integers and + * binaries) to a list of code points. BIF_ARG_2 selects how the input + * is interpreted: am_latin1, or am_unicode / am_utf8. Any other value + * is handed over to c_to_l_int_trap_exportp. + * + * A plain binary that is not Latin-1 goes straight to utf8_to_list(). + * Otherwise the input is first encoded into a temporary buffer of the + * size computed by utf8_need(), and build_list_return() takes over from + * there. Fails with BADARG when utf8_need() rejects the input. + */ +BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2) +{ + Sint need; + int latin1; + Uint characters = 0; + byte *bytes; + int pos; + int err; + int left, sleft; + Eterm rest_term; + byte leftover[4]; /* used for temp buffer too, + otherwise 3 bytes would have been enough */ + int num_leftovers = 0; + Uint cost_of_utf8_need; + + if (BIF_ARG_2 == am_latin1) { + latin1 = 1; + } else if (BIF_ARG_2 == am_unicode || BIF_ARG_2 == am_utf8) { + latin1 = 0; + } else { + BIF_TRAP2(c_to_l_int_trap_exportp, BIF_P, BIF_ARG_1, BIF_ARG_2); + } + if (is_binary(BIF_ARG_1) && !latin1) { /* Optimized behaviour for this case */ + return utf8_to_list(BIF_P,BIF_ARG_1); + } + need = utf8_need(BIF_ARG_1,latin1,&cost_of_utf8_need); + if (need < 0) { + BIF_ERROR(BIF_P,BADARG); + } + bytes = alloc_restart(need); /* released by finalize_list_to_list() or by the magic binary destructor */ + cost_to_proc(BIF_P, simple_loops_to_common(cost_of_utf8_need)); + left = allowed_iterations(BIF_P) - + simple_loops_to_common(cost_of_utf8_need); + if (left <= 0) { + /* simplified - let everything be 
setup by setting left to 1 */ + left = 1; + } + sleft = left; + pos = 0; + err = 0; + + + rest_term = do_build_utf8(BIF_P, BIF_ARG_1, &left, latin1, + bytes, &pos, &characters, &err, leftover, &num_leftovers); + cost_to_proc(BIF_P, sleft - left); + return build_list_return(BIF_P,bytes,pos,characters,rest_term,err, + leftover,num_leftovers,BIF_ARG_2,left); +} + + +/* + * When input to characters_to_list is a plain binary and the format is 'unicode', we do + * a faster analyze and size count with this function. + * + * Scans 'size' bytes at 'source' without copying anything. On return + * *num_chars is the number of complete, valid characters seen and + * *err_pos points just past the last of them. + * + * Returns UTF8_OK when all input was valid, UTF8_INCOMPLETE when the + * input ends inside a sequence, UTF8_ERROR on an invalid sequence, and + * UTF8_ANALYZE_MORE when 'left' is non-NULL and the budget *left (one + * unit per character) ran out. + */ +static int analyze_utf8(byte *source, Uint size, + byte **err_pos, Uint *num_chars, int *left) +{ + *err_pos = source; + *num_chars = 0; + while (size) { + if (((*source) & ((byte) 0x80)) == 0) { + source++; + --size; + } else if (((*source) & ((byte) 0xE0)) == 0xC0) { + if (size < 2) { + return UTF8_INCOMPLETE; + } + if (((source[1] & ((byte) 0xC0)) != 0x80) || + ((*source) < 0xC2) /* overlong */) { + return UTF8_ERROR; + } + source += 2; + size -= 2; + } else if (((*source) & ((byte) 0xF0)) == 0xE0) { + if (size < 3) { + return UTF8_INCOMPLETE; + } + if (((source[1] & ((byte) 0xC0)) != 0x80) || + ((source[2] & ((byte) 0xC0)) != 0x80) || + (((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong */ ) { + return UTF8_ERROR; + } + if ((((*source) & ((byte) 0xF)) == 0xD) && + ((source[1] & 0x20) != 0)) { /* 0xED 0xA0..0xBF: code points 0xD800..0xDFFF */ + return UTF8_ERROR; + } + if (((*source) == 0xEF) && (source[1] == 0xBF) && + ((source[2] == 0xBE) || (source[2] == 0xBF))) { /* code points 0xFFFE and 0xFFFF */ + return UTF8_ERROR; + } + source += 3; + size -= 3; + } else if (((*source) & ((byte) 0xF8)) == 0xF0) { + if (size < 4) { + return UTF8_INCOMPLETE; + } + if (((source[1] & ((byte) 0xC0)) != 0x80) || + ((source[2] & ((byte) 0xC0)) != 0x80) || + ((source[3] & ((byte) 0xC0)) != 0x80) || + (((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) { + return UTF8_ERROR; + } + if ((((*source) & ((byte)0x7)) > 0x4U) || + ((((*source) & ((byte)0x7)) == 0x4U) && + ((source[1] & ((byte)0x3F)) > 0xFU))) { /* would decode to a value above 0x10FFFF */ + return UTF8_ERROR; + } + source 
+= 4; + size -= 4; + } else { + return UTF8_ERROR; + } + ++(*num_chars); + *err_pos = source; + if (left && --(*left) <= 0) { + return UTF8_ANALYZE_MORE; + } + } + return UTF8_OK; +} + +/* + * No errors should be able to occur - no overlongs, no malformed, no nothing + */ +static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz, + Uint left, + Uint *num_built, Uint *num_eaten, Eterm tail) +{ + Eterm *hp; + Eterm ret; + byte *source, *ssource; + Uint unipoint; + + ASSERT(num > 0); + if (left < num) { + if (left > 0) + num = left; + else + num = 1; + } + + *num_built = num; /* Always */ + + hp = HAlloc(p,num * 2); + ret = tail; + source = bytes + sz; + ssource = source; + while(--source >= bytes) { + if (((*source) & ((byte) 0x80)) == 0) { + unipoint = (Uint) *source; + } else if (((*source) & ((byte) 0xE0)) == 0xC0) { + unipoint = + (((Uint) ((*source) & ((byte) 0x1F))) << 6) | + ((Uint) (source[1] & ((byte) 0x3F))); + } else if (((*source) & ((byte) 0xF0)) == 0xE0) { + unipoint = + (((Uint) ((*source) & ((byte) 0xF))) << 12) | + (((Uint) (source[1] & ((byte) 0x3F))) << 6) | + ((Uint) (source[2] & ((byte) 0x3F))); + } else if (((*source) & ((byte) 0xF8)) == 0xF0) { + unipoint = + (((Uint) ((*source) & ((byte) 0x7))) << 18) | + (((Uint) (source[1] & ((byte) 0x3F))) << 12) | + (((Uint) (source[2] & ((byte) 0x3F))) << 6) | + ((Uint) (source[3] & ((byte) 0x3F))); + } else { + /* ignore 2#10XXXXXX */ + continue; + } + ret = CONS(hp,make_small(unipoint),ret); + hp += 2; + if (--num <= 0) { + break; + } + } + *num_eaten = (ssource - source); + return ret; +} + +/* + * The last step of characters_to_list, build a list from the buffer 'bytes' (created in the same way + * as for characters_to_utf8). All sizes are known in advance and most data will be held in a + * "magic binary" during trapping. 
+ */ +static BIF_RETTYPE finalize_list_to_list(Process *p, + byte *bytes, + Eterm rest, + Uint num_processed_bytes, + Uint num_bytes_to_process, + Uint num_resulting_chars, + int state, int left, + Eterm tail) +{ + Uint num_built; /* characters */ + Uint num_eaten; /* bytes */ + Eterm *hp; + Eterm converted,ret; + + if (!num_bytes_to_process) { + converted = tail; + } else { + num_built = 0; + num_eaten = 0; + converted = do_utf8_to_list(p, num_resulting_chars, + bytes, num_bytes_to_process, + left, &num_built, &num_eaten, tail); + cost_to_proc(p,num_built); + + if (num_built != num_resulting_chars) { /* work left to do */ + RestartContext rc; + + rc.num_resulting_chars = num_resulting_chars - num_built; + rc.num_bytes_to_process = num_bytes_to_process - num_eaten; + rc.num_processed_bytes = num_processed_bytes + num_eaten; + rc.state = state; + rc.bytes = bytes; + BUMP_ALL_REDS(p); + BIF_TRAP3(&characters_to_list_trap_2_exp, p, + make_magic_bin_for_restart(p, &rc), rest, converted); + } + } + + /* + * OK, no more trapping, let's get rid of the temporary array... + */ + + free_restart(bytes); + if (state == UTF8_INCOMPLETE) { + hp = HAlloc(p,4); + ret = TUPLE3(hp,am_incomplete,converted,rest); + } else if (state == UTF8_ERROR) { + hp = HAlloc(p,4); + ret = TUPLE3(hp,am_error,converted,rest); + } else { + ret = converted; + } + + BIF_RET(ret); +} + +static BIF_RETTYPE characters_to_list_trap_2(BIF_ALIST_3) +{ + RestartContext *rc; + byte *bytes; + + rc = get_rc_from_bin(BIF_ARG_1); + + bytes = rc->bytes; + rc->bytes = NULL; /* Don't want this freed just yet... */ + return finalize_list_to_list(BIF_P, bytes, BIF_ARG_2, rc->num_processed_bytes, + rc->num_bytes_to_process, rc->num_resulting_chars, + rc->state, allowed_iterations(BIF_P), BIF_ARG_3); +} + + +/* + * Hooks into the process of decoding a binary depending on state. + * If last_state is UTF8_ANALYZE_MORE, num_bytes_to_process + * and num_resulting_chars will grow + * until we're done analyzing the binary. 
Then we'll eat + * the bytes to process, lowering num_bytes_to_process and num_resulting_chars, + * while increasing num_processed_bytes until we're done. the state + * indicates how to return (error, incomplete or ok) in this stage. + * note that num_processed_bytes and num_bytes_to_process will make up the + * length of the binary part to process, not necessarily the length of the + * whole binary (if there are errors or an incomplete tail). + * + * Analyzing happens from the beginning of the binary towards the end, + * while result is built from the end of the analyzed/accepted part + * towards the beginning. + * + * Note that this routine is *only* called when original input was a plain utf8 binary, + * otherwise the rest and the sizes are known in advance, so finalize_list_to_list is + * used to build the resulting list (no analyzing needed). + */ +static BIF_RETTYPE do_bif_utf8_to_list(Process *p, + Eterm orig_bin, + Uint num_processed_bytes, + Uint num_bytes_to_process, + Uint num_resulting_chars, + int state, + Eterm tail) +{ + int left; + Uint bitoffs; + Uint bitsize; + Uint size; + byte *bytes; + Eterm converted = NIL; + Eterm rest = NIL; + Eterm *hp; + Eterm ret; + byte *temp_alloc = NULL; + byte *endpos; + Uint numchar; + + Uint b_sz; /* size of the non analyzed tail */ + Uint num_built; /* characters */ + Uint num_eaten; /* bytes */ + + ERTS_GET_BINARY_BYTES(orig_bin, bytes, bitoffs, bitsize); + if (bitsize != 0) { + converted = NIL; + rest = orig_bin; + goto error_return; + } + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(orig_bin, &temp_alloc); + } + + size = binary_size(orig_bin); + + left = allowed_iterations(p); + + if (state == UTF8_ANALYZE_MORE) { + state = analyze_utf8(bytes + num_bytes_to_process, + size - num_bytes_to_process, + &endpos,&numchar,&left); + cost_to_proc(p,numchar); + num_resulting_chars += numchar; + num_bytes_to_process = endpos - bytes; + if (state == UTF8_ANALYZE_MORE) { + Eterm epos = 
erts_make_integer(num_bytes_to_process,p); + Eterm enumchar = erts_make_integer(num_resulting_chars,p); + erts_free_aligned_binary_bytes(temp_alloc); + BUMP_ALL_REDS(p); + BIF_TRAP3(&characters_to_list_trap_3_exp, p, orig_bin, epos, + enumchar); + } + } + + /* + * If we're here, we have everything analyzed and are instead building + */ + + + if (!num_bytes_to_process) { + converted = tail; + } else { + num_built = 0; + num_eaten = 0; + converted = do_utf8_to_list(p, num_resulting_chars, + bytes, num_bytes_to_process, + left, &num_built, &num_eaten, tail); + cost_to_proc(p,num_built); + + if (num_built != num_resulting_chars) { /* work left to do */ + Eterm newnum_resulting_chars = + erts_make_integer(num_resulting_chars - num_built,p); + Eterm newnum_bytes_to_process = + erts_make_integer(num_bytes_to_process - num_eaten,p); + Eterm newnum_processed_bytes = + erts_make_integer(num_processed_bytes + num_eaten,p); + Eterm traptuple; + hp = HAlloc(p,7); + traptuple = TUPLE6(hp,orig_bin,newnum_processed_bytes, + newnum_bytes_to_process, + newnum_resulting_chars, + make_small(state), + converted); + BUMP_ALL_REDS(p); + erts_free_aligned_binary_bytes(temp_alloc); + BIF_TRAP1(&characters_to_list_trap_4_exp,p,traptuple); + } + } + + /* + * OK, no more trapping, let's build rest binary if there should + * be one. 
+ */ + + b_sz = size - (num_bytes_to_process + num_processed_bytes); + + if (b_sz) { + ErlSubBin *sb; + Eterm orig; + Uint offset; + ASSERT(state != UTF8_OK); + hp = HAlloc(p, ERL_SUB_BIN_SIZE); + sb = (ErlSubBin *) hp; + ERTS_GET_REAL_BIN(orig_bin, orig, offset, bitoffs, bitsize); + sb->thing_word = HEADER_SUB_BIN; + sb->size = b_sz; + sb->offs = num_bytes_to_process + num_processed_bytes; + sb->orig = orig; + sb->bitoffs = bitoffs; + sb->bitsize = bitsize; + sb->is_writable = 0; + rest = make_binary(sb); + } + + /* Done */ + + if (state == UTF8_INCOMPLETE) { + if (check_leftovers(bytes + num_bytes_to_process + num_processed_bytes, + b_sz) != 0) { + goto error_return; + } + hp = HAlloc(p,4); + ret = TUPLE3(hp,am_incomplete,converted,rest); + } else if (state == UTF8_ERROR) { + error_return: + hp = HAlloc(p,4); + ret = TUPLE3(hp,am_error,converted,rest); + } else { + ret = converted; + } + + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(ret); +} + + +/* + * This is called when there's still analyzing left to do, + * we only reach this if original input was a binary. + */ + +static BIF_RETTYPE characters_to_list_trap_3(BIF_ALIST_3) +{ + Uint num_bytes_to_process; + Uint num_resulting_chars; + + term_to_Uint(BIF_ARG_2, &num_bytes_to_process); /* The number of already + analyzed and accepted + bytes */ + term_to_Uint(BIF_ARG_3, &num_resulting_chars); /* The number of chars + procuced by the + already analyzed + part of the binary */ + + /*erts_printf("Trap: %T, %T, %T\n",BIF_ARG_1, BIF_ARG_2, BIF_ARG_3);*/ + + return do_bif_utf8_to_list(BIF_P, + BIF_ARG_1, /* the binary */ + 0U, /* nothing processed yet */ + num_bytes_to_process, + num_resulting_chars, + UTF8_ANALYZE_MORE, /* always this state here */ + NIL); /* Nothing built -> no tail yet */ + +} + +/* + * This is called when analyzing is done and we are trapped during building, + * we only reach this if original input was a binary. 
+ */ +static BIF_RETTYPE characters_to_list_trap_4(BIF_ALIST_1) +{ + Uint num_processed_bytes; + Uint num_bytes_to_process; + Uint num_resulting_chars; + Eterm orig_bin, tail; + int last_state; + Eterm *tplp = tuple_val(BIF_ARG_1); + + orig_bin = tplp[1]; + term_to_Uint(tplp[2], &num_processed_bytes); + term_to_Uint(tplp[3], &num_bytes_to_process); + term_to_Uint(tplp[4], &num_resulting_chars); + last_state = (int) signed_val(tplp[5]); + tail = tplp[6]; + + /*erts_printf("Trap: {%T, %lu, %lu, %lu, %d, %T}\n", + orig_bin, num_processed_bytes, num_bytes_to_process, + num_resulting_chars, last_state, tail);*/ + + return do_bif_utf8_to_list(BIF_P, + orig_bin, /* The whole binary */ + num_processed_bytes, /* Number of bytes + already processed */ + num_bytes_to_process, /* Bytes left to proc. */ + num_resulting_chars, /* Num chars left to + build */ + last_state, /* The current state + (never ANALYZE_MORE)*/ + tail); /* The already built + tail */ + +} +/* + * This is only used when characters are a plain unicode (utf8) binary. + * Instead of building an utf8 buffer, we analyze the binary given and use that. + */ + +static BIF_RETTYPE utf8_to_list(BIF_ALIST_1) +{ + if (!is_binary(BIF_ARG_1) || aligned_binary_size(BIF_ARG_1) < 0) { + BIF_ERROR(BIF_P,BADARG); + } + return do_bif_utf8_to_list(BIF_P, BIF_ARG_1, 0U, 0U, 0U, + UTF8_ANALYZE_MORE,NIL); +} + + +BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2) +{ + Atom* ap; + + if (is_not_atom(BIF_ARG_1)) { + goto error; + } + + ap = atom_tab(atom_val(BIF_ARG_1)); + + if (BIF_ARG_2 == am_latin1) { + BIF_RET(new_binary(BIF_P, ap->name, ap->len)); + } else if (BIF_ARG_2 == am_utf8 || BIF_ARG_2 == am_unicode) { + int bin_size = 0; + int i; + Eterm bin_term; + byte* bin_p; + + for (i = 0; i < ap->len; i++) { + bin_size += (ap->name[i] >= 0x80) ? 
2 : 1; + } + if (bin_size == ap->len) { + BIF_RET(new_binary(BIF_P, ap->name, ap->len)); + } + bin_term = new_binary(BIF_P, 0, bin_size); + bin_p = binary_bytes(bin_term); + for (i = 0; i < ap->len; i++) { + byte b = ap->name[i]; + if (b < 0x80) { + *bin_p++ = b; + } else { + *bin_p++ = 0xC0 | (b >> 6); + *bin_p++ = 0x80 | (b & 0x3F); + } + } + BIF_RET(bin_term); + } else { + error: + BIF_ERROR(BIF_P, BADARG); + } +} + +static BIF_RETTYPE +binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist) +{ + byte* bytes; + byte *temp_alloc = NULL; + Uint bin_size; + + if ((bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc)) == 0) { + BIF_ERROR(p, BADARG); + } + bin_size = binary_size(bin); + if (enc == am_latin1) { + Eterm a; + if (bin_size > MAX_ATOM_LENGTH) { + system_limit: + erts_free_aligned_binary_bytes(temp_alloc); + BIF_ERROR(p, SYSTEM_LIMIT); + } + if (!must_exist) { + a = am_atom_put((char *)bytes, bin_size); + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(a); + } else if (erts_atom_get((char *)bytes, bin_size, &a)) { + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(a); + } else { + goto badarg; + } + } else if (enc == am_utf8 || enc == am_unicode) { + char *buf; + char *dst; + int i; + int num_chars; + Eterm res; + + if (bin_size > 2*MAX_ATOM_LENGTH) { + byte* err_pos; + Uint n; + int reds_left = bin_size+1; /* Number of reductions left. */ + + if (analyze_utf8(bytes, bin_size, &err_pos, + &n, &reds_left) == UTF8_OK) { + /* + * Correct UTF-8 encoding, but too many characters to + * fit in an atom. + */ + goto system_limit; + } else { + /* + * Something wrong in the UTF-8 encoding or Unicode code + * points > 255. + */ + goto badarg; + } + } + + /* + * Allocate a temporary buffer the same size as the binary, + * so that we don't need an extra overflow test. 
+ */ + buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size); + dst = buf; + for (i = 0; i < bin_size; i++) { + int c = bytes[i]; + if (c < 0x80) { + *dst++ = c; + } else if (i < bin_size-1) { + int c2; + if ((c & 0xE0) != 0xC0) { + goto free_badarg; + } + i++; + c = (c & 0x3F) << 6; + c2 = bytes[i]; + if ((c2 & 0xC0) != 0x80) { + goto free_badarg; + } + c = c | (c2 & 0x3F); + if (0x80 <= c && c < 256) { + *dst++ = c; + } else { + goto free_badarg; + } + } else { + free_badarg: + erts_free(ERTS_ALC_T_TMP, (void *) buf); + goto badarg; + } + } + num_chars = dst - buf; + if (num_chars > MAX_ATOM_LENGTH) { + erts_free(ERTS_ALC_T_TMP, (void *) buf); + goto system_limit; + } + if (!must_exist) { + res = am_atom_put(buf, num_chars); + erts_free(ERTS_ALC_T_TMP, (void *) buf); + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(res); + } else { + int exists = erts_atom_get(buf, num_chars, &res); + erts_free(ERTS_ALC_T_TMP, (void *) buf); + if (exists) { + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(res); + } else { + goto badarg; + } + } + } else { + badarg: + erts_free_aligned_binary_bytes(temp_alloc); + BIF_ERROR(p, BADARG); + } +} + +BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2) +{ + return binary_to_atom(BIF_P, BIF_ARG_1, BIF_ARG_2, 0); +} + +BIF_RETTYPE binary_to_existing_atom_2(BIF_ALIST_2) +{ + return binary_to_atom(BIF_P, BIF_ARG_1, BIF_ARG_2, 1); +} |