aboutsummaryrefslogtreecommitdiffstats
path: root/erts/emulator/beam/erl_unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'erts/emulator/beam/erl_unicode.c')
-rw-r--r--erts/emulator/beam/erl_unicode.c1815
1 files changed, 1815 insertions, 0 deletions
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
new file mode 100644
index 0000000000..ab5811c70f
--- /dev/null
+++ b/erts/emulator/beam/erl_unicode.c
@@ -0,0 +1,1815 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2008-2009. All Rights Reserved.
+ *
+ * The contents of this file are subject to the Erlang Public License,
+ * Version 1.1, (the "License"); you may not use this file except in
+ * compliance with the License. You should have received a copy of the
+ * Erlang Public License along with this software. If not, it can be
+ * retrieved online at http://www.erlang.org/.
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+#include "sys.h"
+#include "erl_vm.h"
+#include "global.h"
+#include "erl_process.h"
+#include "error.h"
+#include "bif.h"
+#include "erl_binary.h"
+#include "big.h"
+
+#include "erl_unicode.h"
+
+/*
+ * State carried across traps by the characters_to_list conversion.
+ * make_magic_bin_for_restart() copies an instance into the data area of a
+ * magic binary and get_rc_from_bin() retrieves it again.
+ */
+typedef struct _restart_context {
+ byte *bytes; /* buffer; freed with free_restart() by cleanup_restart_context() when non-NULL */
+ Uint num_processed_bytes;
+ Uint num_bytes_to_process;
+ Uint num_resulting_chars;
+ int state; /* presumably one of the UTF8_* result codes -- TODO confirm */
+} RestartContext;
+
+
+/* Scale factors between loop iterations and reductions; see
+   allowed_iterations(), cost_to_proc() and simple_loops_to_common(). */
+#define LOOP_FACTOR 10
+#define LOOP_FACTOR_SIMPLE 50 /* When just counting */
+
+/* Upper bound on iterations per call; initialised in erts_init_unicode()
+   and changeable through erts_unicode_set_loop_limit(). */
+static Uint max_loop_limit;
+
+static BIF_RETTYPE utf8_to_list(BIF_ALIST_1);
+static BIF_RETTYPE finalize_list_to_list(Process *p,
+ byte *bytes,
+ Eterm rest,
+ Uint num_processed_bytes,
+ Uint num_bytes_to_process,
+ Uint num_resulting_chars,
+ int state, int left,
+ Eterm tail);
+static int analyze_utf8(byte *source, Uint size,
+ byte **err_pos, Uint *num_chars, int *left);
+/* Result codes returned by analyze_utf8() */
+#define UTF8_OK 0
+#define UTF8_INCOMPLETE 1
+#define UTF8_ERROR 2
+#define UTF8_ANALYZE_MORE 3
+
+/* Internal trap targets (C functions applied through the Export entries below) */
+static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_3);
+static BIF_RETTYPE characters_to_list_trap_1(BIF_ALIST_3);
+static BIF_RETTYPE characters_to_list_trap_2(BIF_ALIST_3);
+
+static BIF_RETTYPE characters_to_list_trap_3(BIF_ALIST_3);
+static BIF_RETTYPE characters_to_list_trap_4(BIF_ALIST_1);
+
+/* Statically allocated Export entries, filled in by erts_init_unicode() */
+static Export characters_to_utf8_trap_exp;
+static Export characters_to_list_trap_1_exp;
+static Export characters_to_list_trap_2_exp;
+
+static Export characters_to_list_trap_3_exp;
+static Export characters_to_list_trap_4_exp;
+
+/* Exports for unicode:characters_to_binary_int/2 and
+   unicode:characters_to_list_int/2, looked up in erts_init_unicode() */
+static Export *c_to_b_int_trap_exportp = NULL;
+static Export *c_to_l_int_trap_exportp = NULL;
+
+/*
+ * Fill in one statically allocated Export entry so that it can serve as a
+ * trap target named erlang:<name>/<arity>. The entry's code applies the C
+ * function whose address is passed, already cast to Eterm, in bif.
+ */
+static void init_unicode_trap_export(Export *ep, char *name, int name_len,
+                                     int arity, Eterm bif)
+{
+    memset(ep, 0, sizeof(Export));
+    ep->address = &ep->code[3];
+    ep->code[0] = am_erlang;
+    ep->code[1] = am_atom_put(name, name_len);
+    ep->code[2] = arity;
+    ep->code[3] = (Eterm) em_apply_bif;
+    ep->code[4] = bif;
+}
+
+/*
+ * Module initialisation: sets the default loop limit, prepares the
+ * internal trap exports and looks up the exports of the Erlang-side
+ * fallbacks in the unicode module.
+ */
+void erts_init_unicode(void)
+{
+    max_loop_limit = CONTEXT_REDS * LOOP_FACTOR;
+
+    /* Non visual BIFs to trap to. */
+    init_unicode_trap_export(&characters_to_utf8_trap_exp,
+                             "characters_to_utf8_trap", 23, 3,
+                             (Eterm) &characters_to_utf8_trap);
+    init_unicode_trap_export(&characters_to_list_trap_1_exp,
+                             "characters_to_list_trap_1", 25, 3,
+                             (Eterm) &characters_to_list_trap_1);
+    init_unicode_trap_export(&characters_to_list_trap_2_exp,
+                             "characters_to_list_trap_2", 25, 3,
+                             (Eterm) &characters_to_list_trap_2);
+    init_unicode_trap_export(&characters_to_list_trap_3_exp,
+                             "characters_to_list_trap_3", 25, 3,
+                             (Eterm) &characters_to_list_trap_3);
+    init_unicode_trap_export(&characters_to_list_trap_4_exp,
+                             "characters_to_list_trap_4", 25, 1,
+                             (Eterm) &characters_to_list_trap_4);
+
+    c_to_b_int_trap_exportp =
+        erts_export_put(am_unicode, am_characters_to_binary_int, 2);
+    c_to_l_int_trap_exportp =
+        erts_export_put(am_unicode, am_characters_to_list_int, 2);
+}
+
+
+/* Allocate size bytes with the ERTS_ALC_T_UNICODE_BUFFER allocator type. */
+static ERTS_INLINE void *alloc_restart(size_t size)
+{
+    void *buffer = erts_alloc(ERTS_ALC_T_UNICODE_BUFFER, size);
+
+    return buffer;
+}
+
+/* Release a buffer obtained from alloc_restart(). */
+static ERTS_INLINE void free_restart(void *ptr)
+{
+    erts_free(ERTS_ALC_T_UNICODE_BUFFER,
+              ptr);
+}
+
+/* Free the byte buffer of a restart context, if it still has one. */
+static void cleanup_restart_context(RestartContext *rc)
+{
+    byte *buffer = rc->bytes;
+
+    if (buffer == NULL) {
+        return;
+    }
+    free_restart(buffer);
+    rc->bytes = NULL;
+}
+
+/*
+ * Magic binary destructor (registered in make_magic_bin_for_restart):
+ * cleans up the RestartContext stored in the binary's data area.
+ */
+static void cleanup_restart_context_bin(Binary *bp)
+{
+    cleanup_restart_context((RestartContext *) ERTS_MAGIC_BIN_DATA(bp));
+}
+
+/*
+ * Return the RestartContext stored in the magic binary term bin.
+ * The term must be a magic binary whose destructor is
+ * cleanup_restart_context_bin (checked by assertions only).
+ */
+static RestartContext *get_rc_from_bin(Eterm bin)
+{
+    ProcBin *pb;
+    Binary *magic;
+
+    ASSERT(ERTS_TERM_IS_MAGIC_BINARY(bin));
+
+    pb = (ProcBin *) binary_val(bin);
+    magic = pb->val;
+
+    ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(magic) == cleanup_restart_context_bin);
+
+    return (RestartContext *) ERTS_MAGIC_BIN_DATA(magic);
+}
+
+/*
+ * Create a magic binary holding a copy of *rc and return a term referring
+ * to it, allocated on the heap of process p. The binary's destructor is
+ * cleanup_restart_context_bin.
+ */
+static Eterm make_magic_bin_for_restart(Process *p, RestartContext *rc)
+{
+    Binary *magic;
+    RestartContext *stored;
+    Eterm *heap;
+
+    magic = erts_create_magic_binary(sizeof(RestartContext),
+                                     cleanup_restart_context_bin);
+    stored = ERTS_MAGIC_BIN_DATA(magic);
+    memcpy(stored, rc, sizeof(RestartContext));
+
+    heap = HAlloc(p, PROC_BIN_SIZE);
+    return erts_mk_magic_binary_term(&heap, &MSO(p), magic);
+}
+
+
+/*
+ * Set the maximum number of loop iterations per call and return the
+ * previous value. A limit <= 0 restores the default,
+ * CONTEXT_REDS * LOOP_FACTOR.
+ */
+Sint erts_unicode_set_loop_limit(Sint limit)
+{
+    Sint previous = (Sint) max_loop_limit;
+
+    max_loop_limit = (limit > 0) ? (Uint) limit
+                                 : (Uint) (CONTEXT_REDS * LOOP_FACTOR);
+    return previous;
+}
+
+/*
+ * Number of loop iterations the calling BIF may perform now: the
+ * reductions left for p scaled by LOOP_FACTOR, capped by max_loop_limit.
+ *
+ * The cap is compared in the unsigned domain. max_loop_limit is a Uint
+ * and can be set to any positive Sint through
+ * erts_unicode_set_loop_limit(); narrowing it to int before comparing
+ * could yield a negative "budget" for large limits.
+ */
+static ERTS_INLINE int allowed_iterations(Process *p)
+{
+    int by_reds = ERTS_BIF_REDS_LEFT(p) * LOOP_FACTOR;
+
+    if (by_reds > 0 && max_loop_limit < (Uint) by_reds) {
+        /* Safe cast: the limit is below by_reds, which fits in an int */
+        return (int) max_loop_limit;
+    }
+    return by_reds;
+}
+/*
+ * Charge process p for cost loop iterations: bumps its reductions by
+ * cost / LOOP_FACTOR and returns the number of reductions charged.
+ */
+static ERTS_INLINE int cost_to_proc(Process *p, int cost)
+{
+    int reds = cost / LOOP_FACTOR;
+
+    BUMP_REDS(p, reds);
+    return reds;
+}
+/*
+ * Convert a count of "simple" (counting-only) iterations into the common
+ * iteration unit by dividing with LOOP_FACTOR_SIMPLE / LOOP_FACTOR.
+ */
+static ERTS_INLINE int simple_loops_to_common(int cost)
+{
+    return cost / (LOOP_FACTOR_SIMPLE / LOOP_FACTOR);
+}
+
+/*
+ * Size in bytes of binary, or -1 if it is a bitstring whose length is not
+ * a whole number of bytes.
+ */
+static Sint aligned_binary_size(Eterm binary)
+{
+    unsigned char *data;
+    Uint bit_offset;
+    Uint trailing_bits;
+
+    ERTS_GET_BINARY_BYTES(binary, data, bit_offset, trailing_bits);
+    if (trailing_bits == 0) {
+        return binary_size(binary);
+    }
+    return (Sint) -1;
+}
+
+/*
+ * Number of bytes needed to store the bytes of binary as UTF-8 when each
+ * byte is taken as one character: one byte for values below 0x80, two
+ * otherwise. Returns -1 if the binary does not consist of whole bytes.
+ */
+static Sint latin1_binary_need(Eterm binary)
+{
+    unsigned char *cursor;
+    unsigned char *end;
+    byte *aligned_copy = NULL;
+    Uint bit_offset;
+    Uint trailing_bits;
+    Sint total = 0;
+
+    ERTS_GET_BINARY_BYTES(binary, cursor, bit_offset, trailing_bits);
+    if (trailing_bits != 0) {
+        return (Sint) -1;
+    }
+    if (bit_offset != 0) {
+        /* Cannot fail: we already know this is a binary made up of
+           whole bytes */
+        cursor = erts_get_aligned_binary_bytes(binary, &aligned_copy);
+    }
+
+    end = cursor + binary_size(binary);
+    for (; cursor != end; ++cursor) {
+        total += ((*cursor) & ((byte) 0x80)) ? 2 : 1;
+    }
+
+    erts_free_aligned_binary_bytes(aligned_copy);
+    return total;
+}
+
+/*
+ * Length in bytes of the UTF-8 sequence introduced by the lead byte
+ * first, or -1 if first cannot start a sequence of 1 to 4 bytes.
+ */
+static int utf8_len(byte first)
+{
+    if (first < 0x80) {         /* 0xxxxxxx */
+        return 1;
+    }
+    if (first < 0xC0) {         /* 10xxxxxx: continuation byte */
+        return -1;
+    }
+    if (first < 0xE0) {         /* 110xxxxx */
+        return 2;
+    }
+    if (first < 0xF0) {         /* 1110xxxx */
+        return 3;
+    }
+    if (first < 0xF8) {         /* 11110xxx */
+        return 4;
+    }
+    return -1;
+}
+
+/*
+ * Copy UTF-8 encoded characters from source (size bytes) to target,
+ * validating each sequence on the way.
+ *
+ * Returns the number of bytes written to target. On a malformed, overlong
+ * or otherwise rejected sequence *err_pos is set to the offending position
+ * in source and copying stops; *err_pos is not written when no error is
+ * found, so the caller must initialise it.
+ *
+ * If leftover is non-NULL, a sequence that is cut off at the end of source
+ * is stored in leftover (its length in *num_leftovers) instead of being
+ * reported as an error, and bytes stored there by a previous call are
+ * completed from the start of source first. *characters is incremented
+ * once per character copied.
+ */
+static int copy_utf8_bin(byte *target, byte *source, Uint size,
+ byte *leftover, int *num_leftovers,
+ byte **err_pos, Uint *characters) {
+ int copied = 0;
+ /* First complete a character whose initial bytes were left over from
+ an earlier call */
+ if (leftover != NULL && *num_leftovers) {
+ int need = utf8_len(leftover[0]);
+ int from_source = need - (*num_leftovers);
+ int c;
+ byte *tmp_err_pos = NULL;
+ ASSERT(need > 0);
+ ASSERT(from_source > 0);
+ if (size < from_source) {
+ /* Still not enough bytes; keep collecting */
+ memcpy(leftover + (*num_leftovers), source, size);
+ *num_leftovers += size;
+ return 0;
+ }
+ /* leftover has room for four bytes (see bif) */
+ memcpy(leftover + (*num_leftovers),source,from_source);
+ /* Validate and copy the completed character; no leftover handling here */
+ c = copy_utf8_bin(target, leftover, need, NULL, NULL, &tmp_err_pos, characters);
+ if (tmp_err_pos != 0) {
+ *err_pos = source;
+ return 0;
+ }
+ copied += c;
+ *num_leftovers = 0;
+ size -= from_source;
+ target += c;
+ source += from_source;
+ }
+ while (size) {
+ if (((*source) & ((byte) 0x80)) == 0) {
+ /* One byte, 0xxxxxxx */
+ *(target++) = *(source++);
+ --size; ++copied;
+ } else if (((*source) & ((byte) 0xE0)) == 0xC0) {
+ /* Two bytes, 110xxxxx 10xxxxxx */
+ if (leftover && size < 2) {
+ *leftover = *source;
+ *num_leftovers = 1;
+ break;
+ }
+ if (size < 2 || ((source[1] & ((byte) 0xC0)) != 0x80) ||
+ ((*source) < 0xC2) /* overlong */) {
+ *err_pos = source;
+ return copied;
+ }
+ *(target++) = *(source++);
+ *(target++) = *(source++);
+ size -= 2; copied += 2;
+ } else if (((*source) & ((byte) 0xF0)) == 0xE0) {
+ /* Three bytes, 1110xxxx 10xxxxxx 10xxxxxx */
+ if (leftover && size < 3) {
+ memcpy(leftover, source, (int) size);
+ *num_leftovers = (int) size;
+ break;
+ }
+ if (size < 3 || ((source[1] & ((byte) 0xC0)) != 0x80) ||
+ ((source[2] & ((byte) 0xC0)) != 0x80) ||
+ (((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong */ ) {
+ *err_pos = source;
+ return copied;
+ }
+ /* 0xED followed by 0xA0..0xBF encodes 0xD800..0xDFFF, rejected */
+ if ((((*source) & ((byte) 0xF)) == 0xD) &&
+ ((source[1] & 0x20) != 0)) {
+ *err_pos = source;
+ return copied;
+ }
+
+ /* 0xEF 0xBF 0xBE/0xBF encode 0xFFFE and 0xFFFF, rejected */
+ if (((*source) == 0xEF) && (source[1] == 0xBF) &&
+ ((source[2] == 0xBE) || (source[2] == 0xBF))) {
+ *err_pos = source;
+ return copied;
+ }
+
+ *(target++) = *(source++);
+ *(target++) = *(source++);
+ *(target++) = *(source++);
+ size -= 3; copied += 3;
+ } else if (((*source) & ((byte) 0xF8)) == 0xF0) {
+ /* Four bytes, 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+ if (leftover && size < 4) {
+ memcpy(leftover, source, (int) size);
+ *num_leftovers = (int) size;
+ break;
+ }
+ if (size < 4 || ((source[1] & ((byte) 0xC0)) != 0x80) ||
+ ((source[2] & ((byte) 0xC0)) != 0x80) ||
+ ((source[3] & ((byte) 0xC0)) != 0x80) ||
+ (((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) {
+ *err_pos = source;
+ return copied;
+ }
+ /* Lead byte above 0xF4, or 0xF4 with a second byte above 0x8F:
+ value would be 0x110000 or larger */
+ if ((((*source) & ((byte)0x7)) > 0x4U) ||
+ ((((*source) & ((byte)0x7)) == 0x4U) &&
+ ((source[1] & ((byte)0x3F)) > 0xFU))) {
+ *err_pos = source;
+ return copied;
+ }
+ *(target++) = *(source++);
+ *(target++) = *(source++);
+ *(target++) = *(source++);
+ *(target++) = *(source++);
+ size -= 4; copied +=4;
+ } else {
+ /* A continuation byte or 11111xxx cannot start a character */
+ *err_pos = source;
+ return copied;
+ }
+ ++(*characters);
+ }
+ return copied;
+}
+
+
+
+/*
+ * Compute how many bytes are needed to hold ioterm as UTF-8.
+ *
+ * ioterm is NIL, a binary, or a (possibly nested, possibly improper) list
+ * of small integers, binaries and such lists. If latin1 is non-zero,
+ * binaries are sized with latin1_binary_need(); otherwise their byte size
+ * is used as is.
+ *
+ * Returns the number of bytes, or -1 if a term of another type or a
+ * binary that is not a whole number of bytes is found. *costp is always
+ * set to a measure of the work done ("simple" iterations).
+ */
+static Sint utf8_need(Eterm ioterm, int latin1, Uint *costp)
+{
+ Eterm *objp;
+ Eterm obj;
+ DECLARE_ESTACK(stack);
+ Sint need = 0;
+ Uint cost = 0;
+
+ if (is_nil(ioterm)) {
+ DESTROY_ESTACK(stack);
+ *costp = 0;
+ return need;
+ }
+ if(is_binary(ioterm)) {
+ DESTROY_ESTACK(stack);
+ if (latin1) {
+ Sint x = latin1_binary_need(ioterm);
+ *costp = x;
+ return x;
+ } else {
+ *costp = 1;
+ return aligned_binary_size(ioterm);
+ }
+ }
+
+ if (!is_list(ioterm)) {
+ DESTROY_ESTACK(stack);
+ *costp = 0;
+ return (Sint) -1;
+ }
+ /* OK a list, needs to be processed in order, handling each flat list-level
+ as they occur, just like io_list_to_binary would */
+ ESTACK_PUSH(stack,ioterm);
+ while (!ESTACK_ISEMPTY(stack)) {
+ ioterm = ESTACK_POP(stack);
+ if (is_nil(ioterm)) {
+ /* ignore empty lists */
+ continue;
+ }
+ if(is_list(ioterm)) {
+L_Again: /* Restart with sublist, old listend was pushed on stack */
+ objp = list_val(ioterm);
+ obj = CAR(objp);
+ for(;;) { /* loop over one flat list of bytes and binaries
+ until sublist or list end is encountered */
+ if (is_small(obj)) { /* Always small */
+ for(;;) {
+ Uint x = unsigned_val(obj);
+ if (x < 0x80)
+ need +=1;
+ else if (x < 0x800)
+ need += 2;
+ else if (x < 0x10000)
+ need += 3;
+ else
+ need += 4;
+ /* everything else will give badarg later
+ in the process, so we dont check */
+ ++cost;
+ ioterm = CDR(objp);
+ if (!is_list(ioterm)) {
+ break;
+ }
+ objp = list_val(ioterm);
+ obj = CAR(objp);
+ /* A non-byte element leaves this tight loop; a small
+ integer is picked up again by the enclosing loop */
+ if (!is_byte(obj))
+ break;
+ }
+ } else if (is_nil(obj)) {
+ ioterm = CDR(objp);
+ if (!is_list(ioterm)) {
+ break;
+ }
+ objp = list_val(ioterm);
+ obj = CAR(objp);
+ } else if (is_list(obj)) {
+ /* push rest of list for later processing, start
+ again with sublist */
+ ESTACK_PUSH(stack,CDR(objp));
+ ioterm = obj;
+ goto L_Again;
+ } else if (is_binary(obj)) {
+ Sint x;
+
+ if (latin1) {
+ x = latin1_binary_need(obj);
+ if (x < 0) {
+ DESTROY_ESTACK(stack);
+ *costp = cost;
+ return x;
+ }
+ cost += x;
+ } else {
+ x = aligned_binary_size(obj);
+ if (x < 0) {
+ DESTROY_ESTACK(stack);
+ *costp = cost;
+ return x;
+ }
+ ++cost;
+ }
+ need += x;
+ ioterm = CDR(objp);
+ if (is_list(ioterm)) {
+ /* objp and obj need to be updated if
+ loop is to continue */
+ objp = list_val(ioterm);
+ obj = CAR(objp);
+ }
+ } else {
+ DESTROY_ESTACK(stack);
+ *costp = cost;
+ return ((Sint) -1);
+ }
+ if (is_nil(ioterm) || !is_list(ioterm)) {
+ break;
+ }
+ } /* for(;;) */
+ } /* is_list(ioterm) */
+
+ if (!is_list(ioterm) && !is_nil(ioterm)) {
+ /* improper list end: only a binary is accepted as tail */
+ if (is_binary(ioterm)) {
+ Sint x;
+ if (latin1) {
+ x = latin1_binary_need(ioterm);
+ if (x < 0) {
+ DESTROY_ESTACK(stack);
+ *costp = cost;
+ return x;
+ }
+ cost += x;
+ } else {
+ x = aligned_binary_size(ioterm);
+ if (x < 0) {
+ DESTROY_ESTACK(stack);
+ *costp = cost;
+ return x;
+ }
+ ++cost;
+ }
+ need += x;
+ } else {
+ DESTROY_ESTACK(stack);
+ *costp = cost;
+ return ((Sint) -1);
+ }
+ }
+ } /* while not estack empty */
+ DESTROY_ESTACK(stack);
+ *costp = cost;
+ return need;
+}
+
+
+/*
+ * Encode as much of ioterm as the iteration budget *left allows into
+ * target as UTF-8, writing from index *pos and advancing *pos.
+ *
+ * ioterm is a binary or a (possibly nested, possibly improper) list of
+ * integers and binaries. With latin1 non-zero, each binary byte is one
+ * character and integers above 255 are errors; otherwise binaries must
+ * hold valid UTF-8 and are validated by copy_utf8_bin().
+ *
+ * *left is decreased by the work done, *characters is incremented per
+ * character produced, and *err is set to 1 when invalid input is found
+ * (it is reset to 0 on entry). A UTF-8 sequence cut off at the end of a
+ * binary is kept in leftover / *num_leftovers.
+ *
+ * Returns the part of the input that was not processed (NIL when
+ * everything was consumed); terms needed to represent it are allocated
+ * on the heap of p.
+ */
+static Eterm do_build_utf8(Process *p, Eterm ioterm, int *left, int latin1,
+ byte *target, int *pos, Uint *characters, int *err,
+ byte *leftover, int *num_leftovers)
+{
+ int c;
+ Eterm *objp;
+ Eterm obj;
+ DECLARE_ESTACK(stack);
+
+ *err = 0;
+ if ((*left) <= 0 || is_nil(ioterm)) {
+ DESTROY_ESTACK(stack);
+ return ioterm;
+ }
+ if(is_binary(ioterm)) {
+ Uint bitoffs;
+ Uint bitsize;
+ Uint size;
+ Uint i;
+ Eterm res_term = NIL;
+ unsigned char *bytes;
+ byte *temp_alloc = NULL;
+ Uint orig_size;
+
+ ERTS_GET_BINARY_BYTES(ioterm, bytes, bitoffs, bitsize);
+ if (bitsize != 0) {
+ *err = 1;
+ DESTROY_ESTACK(stack);
+ return ioterm;
+ }
+ if (bitoffs != 0) {
+ bytes = erts_get_aligned_binary_bytes(ioterm, &temp_alloc);
+ /* The call to erts_get_aligned_binary_bytes cannot fail as
+ we've already checked bitsize and that this is a binary */
+ }
+
+ orig_size = size = binary_size(ioterm);
+
+ /* This is done to avoid splitting binaries in two
+ and then create an unnecessary rest that eventually gives an error.
+ For cases where errors are not returned this is unnecessary */
+ if (!latin1) {
+ /* Find a valid character boundary */
+ while (size > (*left) &&
+ (((byte) bytes[(*left)]) & ((byte) 0xC0)) == ((byte) 0x80)) {
+ ++(*left);
+ }
+ }
+
+ if (size > (*left)) {
+ Eterm *hp;
+ ErlSubBin *sb;
+ Eterm orig;
+ Uint offset;
+ /* Split the binary in two parts, of which we
+ only process the first */
+ hp = HAlloc(p, ERL_SUB_BIN_SIZE);
+ sb = (ErlSubBin *) hp;
+ ERTS_GET_REAL_BIN(ioterm, orig, offset, bitoffs, bitsize);
+ sb->thing_word = HEADER_SUB_BIN;
+ sb->size = size - (*left);
+ sb->offs = offset + (*left);
+ sb->orig = orig;
+ sb->bitoffs = bitoffs;
+ sb->bitsize = bitsize;
+ sb->is_writable = 0;
+ res_term = make_binary(sb);
+ size = (*left);
+ }
+
+ if (!latin1) {
+ int num;
+ byte *err_pos = NULL;
+ num = copy_utf8_bin(target + (*pos), bytes,
+ size, leftover, num_leftovers,&err_pos,characters);
+ *pos += num;
+ if (err_pos != NULL) {
+ int rest_bin_offset;
+ int rest_bin_size;
+ Eterm *hp;
+ ErlSubBin *sb;
+ Eterm orig;
+ Uint offset;
+
+ *err = 1;
+ /* we have no real stack, just build a list of the binaries
+ we have not decoded... */
+ DESTROY_ESTACK(stack);
+
+ /* The rest is a sub binary of the input starting at the
+ position of the error */
+ rest_bin_offset = (err_pos - bytes);
+ rest_bin_size = orig_size - rest_bin_offset;
+
+ hp = HAlloc(p, ERL_SUB_BIN_SIZE);
+ sb = (ErlSubBin *) hp;
+ ERTS_GET_REAL_BIN(ioterm, orig, offset, bitoffs, bitsize);
+ sb->thing_word = HEADER_SUB_BIN;
+ sb->size = rest_bin_size;
+ sb->offs = offset + rest_bin_offset;
+ sb->orig = orig;
+ sb->bitoffs = bitoffs;
+ sb->bitsize = bitsize;
+ sb->is_writable = 0;
+ res_term = make_binary(sb);
+ erts_free_aligned_binary_bytes(temp_alloc);
+ return res_term;
+ }
+ } else {
+ /* latin1: bytes below 0x80 are copied, all others become two
+ UTF-8 bytes */
+ i = 0;
+ while(i < size) {
+ if (bytes[i] < 0x80) {
+ target[(*pos)++] = bytes[i++];
+ } else {
+ target[(*pos)++] = ((bytes[i] >> 6) | ((byte) 0xC0));
+ target[(*pos)++] = ((bytes[i] & 0x3F) | ((byte) 0x80));
+ ++i;
+ }
+ ++(*characters);
+ }
+ }
+ *left -= size;
+ DESTROY_ESTACK(stack);
+ erts_free_aligned_binary_bytes(temp_alloc);
+ return res_term;
+ }
+
+ if (!is_list(ioterm)) {
+ *err = 1;
+ goto done;
+ }
+
+ /* OK a list, needs to be processed in order, handling each flat list-level
+ as they occur, just like io_list_to_binary would */
+ ESTACK_PUSH(stack,ioterm);
+ while (!ESTACK_ISEMPTY(stack) && (*left)) {
+ ioterm = ESTACK_POP(stack);
+ if (is_nil(ioterm)) {
+ /* ignore empty lists */
+ continue;
+ }
+ if(is_list(ioterm)) {
+L_Again: /* Restart with sublist, old listend was pushed on stack */
+ objp = list_val(ioterm);
+ obj = CAR(objp);
+ for(;;) { /* loop over one flat list of bytes and binaries
+ until sublist or list end is encountered */
+ if (is_small(obj)) { /* Always small in unicode*/
+ if (*num_leftovers) {
+ /* Have rest from previous bin and this is an integer, not allowed */
+ *err = 1;
+ goto done;
+ }
+ for(;;) {
+ Uint x = unsigned_val(obj);
+ if (latin1 && x > 255) {
+ *err = 1;
+ goto done;
+ }
+ if (x < 0x80) {
+ target[(*pos)++] = (byte) x;
+ }
+ else if (x < 0x800) {
+ target[(*pos)++] = (((byte) (x >> 6)) |
+ ((byte) 0xC0));
+ target[(*pos)++] = (((byte) (x & 0x3F)) |
+ ((byte) 0x80));
+ } else if (x < 0x10000) {
+ if ((x >= 0xD800 && x <= 0xDFFF) ||
+ (x == 0xFFFE) ||
+ (x == 0xFFFF)) { /* Invalid unicode range */
+ *err = 1;
+ goto done;
+ }
+ target[(*pos)++] = (((byte) (x >> 12)) |
+ ((byte) 0xE0));
+ target[(*pos)++] = ((((byte) (x >> 6)) & 0x3F) |
+ ((byte) 0x80));
+ target[(*pos)++] = (((byte) (x & 0x3F)) |
+ ((byte) 0x80));
+ } else if (x < 0x110000) { /* Standard imposed max */
+ target[(*pos)++] = (((byte) (x >> 18)) |
+ ((byte) 0xF0));
+ target[(*pos)++] = ((((byte) (x >> 12)) & 0x3F) |
+ ((byte) 0x80));
+ target[(*pos)++] = ((((byte) (x >> 6)) & 0x3F) |
+ ((byte) 0x80));
+ target[(*pos)++] = (((byte) (x & 0x3F)) |
+ ((byte) 0x80));
+ } else {
+ *err = 1;
+ goto done;
+ }
+ ++(*characters);
+ --(*left);
+ ioterm = CDR(objp);
+ if (!is_list(ioterm) || !(*left)) {
+ break;
+ }
+ objp = list_val(ioterm);
+ obj = CAR(objp);
+ if (!is_small(obj))
+ break;
+ }
+ } else if (is_nil(obj)) {
+ ioterm = CDR(objp);
+ if (!is_list(ioterm)) {
+ break;
+ }
+ objp = list_val(ioterm);
+ obj = CAR(objp);
+ } else if (is_list(obj)) {
+ /* push rest of list for later processing, start
+ again with sublist */
+ ESTACK_PUSH(stack,CDR(objp));
+ ioterm = obj;
+ goto L_Again;
+ } else if (is_binary(obj)) {
+ Eterm rest_term;
+ rest_term = do_build_utf8(p,obj,left,latin1,target,pos, characters, err,
+ leftover, num_leftovers);
+ if ((*err) != 0) {
+ /* Put the undecoded part of the binary in front of the
+ rest of this list */
+ Eterm *hp;
+ hp = HAlloc(p, 2);
+ obj = CDR(objp);
+ ioterm = CONS(hp, rest_term, obj);
+ //(*left) = 0;
+ goto done;
+ }
+ if (rest_term != NIL) {
+ /* The binary was split: stop here, with its unprocessed
+ part in front of the rest of this list */
+ Eterm *hp;
+ hp = HAlloc(p, 2);
+ obj = CDR(objp);
+ ioterm = CONS(hp, rest_term, obj);
+ (*left) = 0;
+ break;
+ }
+ ioterm = CDR(objp);
+ if (is_list(ioterm)) {
+ /* objp and obj need to be updated if
+ loop is to continue */
+ objp = list_val(ioterm);
+ obj = CAR(objp);
+ }
+ } else {
+ *err = 1;
+ goto done;
+ }
+ if (!(*left) || is_nil(ioterm) || !is_list(ioterm)) {
+ break;
+ }
+ } /* for(;;) */
+ } /* is_list(ioterm) */
+
+ if ((*left) && !is_list(ioterm) && !is_nil(ioterm)) {
+ /* improper list end: only a binary is accepted as tail */
+ if (is_binary(ioterm)) {
+ ioterm = do_build_utf8(p,ioterm,left,latin1,target,pos,characters,err,leftover,num_leftovers);
+ if ((*err) != 0) {
+ goto done;
+ }
+ } else {
+ *err = 1;
+ goto done;
+ }
+ }
+ } /* while left and not estack empty */
+ done:
+ /* Whatever is still on the stack has not been processed; cons the
+ current rest onto each stacked term to form the returned rest */
+ c = ESTACK_COUNT(stack);
+ if (c > 0) {
+ Eterm *hp = HAlloc(p,2*c);
+ while(!ESTACK_ISEMPTY(stack)) {
+ Eterm st = ESTACK_POP(stack);
+ ioterm = CONS(hp, ioterm, st);
+ hp += 2;
+ }
+ }
+ DESTROY_ESTACK(stack);
+ return ioterm;
+
+}
+
+/*
+ * Decide whether the size bytes at source, which start with a multi-byte
+ * lead byte, can be the beginning of a longer sequence.
+ *
+ * Returns 0 ("incomplete") for any two-byte lead, and for a three- or
+ * four-byte lead when fewer bytes than the sequence needs are present and
+ * every byte after the lead is a continuation byte (10xxxxxx).
+ * Returns -1 otherwise.
+ */
+static int check_leftovers(byte *source, int size)
+{
+    byte lead = source[0];
+    int expected;
+    int i;
+
+    if ((lead & ((byte) 0xE0)) == 0xC0) {
+        return 0;
+    }
+    if ((lead & ((byte) 0xF0)) == 0xE0) {
+        expected = 3;
+    } else if ((lead & ((byte) 0xF8)) == 0xF0) {
+        expected = 4;
+    } else {
+        return -1;
+    }
+
+    if (size >= expected) {
+        /* Nothing is missing, so this cannot be "incomplete" */
+        return -1;
+    }
+    for (i = 1; i < size; ++i) {
+        if ((source[i] & ((byte) 0xC0)) != 0x80) {
+            return -1;
+        }
+    }
+    return 0;
+}
+
+
+
+/*
+ * Produce the result of a characters_to_binary step.
+ *
+ * The size of bin is set to pos, the number of bytes written so far. Then:
+ *  - err set: {error, Bin, Rest}, with any leftover bytes put first in
+ *    Rest as a binary;
+ *  - no rest but leftover bytes: {error, Bin, Leftover} or
+ *    {incomplete, Bin, Leftover}, as decided by check_leftovers();
+ *  - a rest remains: trap to characters_to_utf8_trap with leftover bytes
+ *    (if any) prepended to the rest;
+ *  - otherwise: Bin itself.
+ */
+static BIF_RETTYPE build_utf8_return(Process *p,Eterm bin,int pos,
+                                     Eterm rest_term,int err,
+                                     byte *leftover,int num_leftovers,Eterm latin1)
+{
+    Eterm *hp;
+    Eterm ret;
+
+    binary_size(bin) = pos;
+
+    if (err) {
+        if (num_leftovers <= 0) {
+            hp = HAlloc(p, 4);
+        } else {
+            Eterm pending = new_binary(p, leftover, num_leftovers);
+            hp = HAlloc(p, 8);
+            rest_term = CONS(hp, rest_term, NIL);
+            hp += 2;
+            rest_term = CONS(hp, pending, rest_term);
+            hp += 2;
+        }
+        ret = TUPLE3(hp, am_error, bin, rest_term);
+        BIF_RET(ret);
+    }
+
+    if (rest_term == NIL && num_leftovers != 0) {
+        Eterm pending = new_binary(p, leftover, num_leftovers);
+        Eterm tag = (check_leftovers(leftover, num_leftovers) != 0)
+            ? am_error
+            : am_incomplete;
+        hp = HAlloc(p, 4);
+        ret = TUPLE3(hp, tag, bin, pending);
+        BIF_RET(ret);
+    }
+
+    /* All OK */
+    if (rest_term != NIL) { /* Trap */
+        if (num_leftovers > 0) {
+            Eterm pending = new_binary(p, leftover, num_leftovers);
+            hp = HAlloc(p, 2);
+            rest_term = CONS(hp, pending, rest_term);
+        }
+        BUMP_ALL_REDS(p);
+        BIF_TRAP3(&characters_to_utf8_trap_exp, p, bin, rest_term, latin1);
+    }
+
+    /* Success */
+    ret = bin;
+    BIF_RET(ret);
+}
+
+
+/*
+ * Trap target continuing a characters_to_binary conversion.
+ *
+ * BIF_ARG_1 is the result binary being filled; its size field holds the
+ * number of bytes written so far (set by build_utf8_return).
+ * BIF_ARG_2 is the remaining input and BIF_ARG_3 the encoding atom.
+ */
+static BIF_RETTYPE characters_to_utf8_trap(BIF_ALIST_3)
+{
+    byte *bytes;
+    Eterm rest_term;
+    int left, sleft;
+    int pos;
+    int err = 0;
+    byte leftover[4]; /* used for temp buffer too,
+                         otherwise 3 bytes would have been enough */
+    int num_leftovers = 0;
+    int latin1 = (BIF_ARG_3 == am_latin1);
+    Uint characters = 0;
+
+    /*erts_printf("Trap %T!\r\n",BIF_ARG_2);*/
+    ASSERT(is_binary(BIF_ARG_1));
+    ASSERT(*binary_val(BIF_ARG_1) == HEADER_PROC_BIN);
+
+    pos = (int) binary_size(BIF_ARG_1);
+    bytes = binary_bytes(BIF_ARG_1);
+    sleft = left = allowed_iterations(BIF_P);
+
+    rest_term = do_build_utf8(BIF_P, BIF_ARG_2, &left, latin1,
+                              bytes, &pos, &characters, &err,
+                              leftover, &num_leftovers);
+    /* Charge the process for the iterations actually used */
+    cost_to_proc(BIF_P, sleft - left);
+    return build_utf8_return(BIF_P, BIF_ARG_1, pos, rest_term, err,
+                             leftover, num_leftovers, BIF_ARG_3);
+}
+
+/*
+ * Returns true if BIF_ARG_1 is a binary of whole bytes in which no byte
+ * has its high bit set (i.e. its UTF-8 size as computed by
+ * latin1_binary_need() equals its byte size); false otherwise.
+ */
+BIF_RETTYPE unicode_bin_is_7bit_1(BIF_ALIST_1)
+{
+    if (is_binary(BIF_ARG_1)) {
+        Sint utf8_size = latin1_binary_need(BIF_ARG_1);
+
+        if (utf8_size >= 0 && aligned_binary_size(BIF_ARG_1) == utf8_size) {
+            BIF_RET(am_true);
+        }
+    }
+    BIF_RET(am_false);
+}
+
+/*
+ * Non-zero if orig_bin consists of whole bytes and analyze_utf8() accepts
+ * all of it (result UTF8_OK); zero otherwise.
+ */
+static int is_valid_utf8(Eterm orig_bin)
+{
+    Uint bit_offset;
+    Uint trailing_bits;
+    byte *aligned_copy = NULL;
+    byte *stop_pos;
+    Uint char_count;
+    byte *data;
+    int status;
+
+    ERTS_GET_BINARY_BYTES(orig_bin, data, bit_offset, trailing_bits);
+    if (trailing_bits != 0) {
+        return 0;
+    }
+    if (bit_offset != 0) {
+        data = erts_get_aligned_binary_bytes(orig_bin, &aligned_copy);
+    }
+
+    /* No iteration limit: pass NULL for left */
+    status = analyze_utf8(data, binary_size(orig_bin),
+                          &stop_pos, &char_count, NULL);
+    erts_free_aligned_binary_bytes(aligned_copy);
+
+    return (status == UTF8_OK);
+}
+
+/*
+ * BIF converting character data (BIF_ARG_1) to a UTF-8 binary.
+ *
+ * BIF_ARG_2 is the input encoding: latin1, unicode or utf8. Any other
+ * value traps to unicode:characters_to_binary_int/2. Input rejected by
+ * utf8_need() raises badarg. A single binary (possibly wrapped in a
+ * one-element list) that needs no conversion is returned as is.
+ * Otherwise a binary of the needed size is allocated and filled by
+ * do_build_utf8(); build_utf8_return() then returns the result, an
+ * error/incomplete tuple, or traps to continue later.
+ */
+BIF_RETTYPE unicode_characters_to_binary_2(BIF_ALIST_2)
+{
+    Sint need;
+    Uint characters = 0; /* incremented by do_build_utf8(); must start defined */
+    int latin1;
+    Eterm bin;
+    byte *bytes;
+    int pos;
+    int err;
+    int left, sleft;
+    Eterm rest_term, subject;
+    byte leftover[4]; /* used for temp buffer too,
+                         otherwise 3 bytes would have been enough */
+    int num_leftovers = 0;
+    Uint cost_of_utf8_need;
+
+
+    if (BIF_ARG_2 == am_latin1) {
+        latin1 = 1;
+    } else if (BIF_ARG_2 == am_unicode || BIF_ARG_2 == am_utf8) {
+        latin1 = 0;
+    } else {
+        BIF_TRAP2(c_to_b_int_trap_exportp, BIF_P, BIF_ARG_1, BIF_ARG_2);
+    }
+    /* Treat [Binary] like Binary so that it can take the fast path below */
+    if (is_list(BIF_ARG_1) && is_binary(CAR(list_val(BIF_ARG_1))) &&
+        is_nil(CDR(list_val(BIF_ARG_1)))) {
+        subject = CAR(list_val(BIF_ARG_1));
+    } else {
+        subject = BIF_ARG_1;
+    }
+
+    need = utf8_need(subject,latin1,&cost_of_utf8_need);
+    if (need < 0) {
+        BIF_ERROR(BIF_P,BADARG);
+    }
+    /* Fast path: the binary already is the result */
+    if (is_binary(subject) && need >= 0 && aligned_binary_size(subject) == need
+        && (latin1 || is_valid_utf8(subject))) {
+        cost_to_proc(BIF_P, simple_loops_to_common(cost_of_utf8_need));
+        BIF_RET(subject);
+    }
+
+
+    bin = erts_new_mso_binary(BIF_P, (byte *)NULL, need);
+    bytes = binary_bytes(bin);
+    /* Charge for the size calculation before asking for the remaining
+       iteration budget */
+    cost_to_proc(BIF_P, simple_loops_to_common(cost_of_utf8_need));
+    left = allowed_iterations(BIF_P) -
+        simple_loops_to_common(cost_of_utf8_need);
+    if (left <= 0) {
+        /* simplified - let everything be setup by setting left to 1 */
+        left = 1;
+    }
+    sleft = left;
+    pos = 0;
+    err = 0;
+
+
+    rest_term = do_build_utf8(BIF_P, subject, &left, latin1,
+                              bytes, &pos, &characters, &err, leftover, &num_leftovers);
+#ifdef HARDDEBUG
+    if (left == 0) {
+        Eterm bin;
+        if (is_binary(subject)) {
+            bin = subject;
+        } else if(is_list(subject) && is_binary(CAR(list_val(subject)))) {
+            bin = CAR(list_val(subject));
+        } else {
+            bin = NIL;
+        }
+        if (is_binary(bin)) {
+            byte *t = NULL;
+            Uint sz = binary_size(bin);
+            byte *by = erts_get_aligned_binary_bytes(bin,&t);
+            int i;
+            erts_printf("<<");
+            for (i = 0;i < sz; ++i) {
+                erts_printf((i == sz -1) ? "0x%X" : "0x%X, ", (unsigned) by[i]);
+            }
+            erts_printf(">>: ");
+            erts_free_aligned_binary_bytes(t);
+        }
+        erts_printf("%d - %d = %d\n",sleft,left,sleft - left);
+    }
+#endif
+    cost_to_proc(BIF_P, sleft - left);
+    return build_utf8_return(BIF_P,bin,pos,rest_term,err,
+                             leftover,num_leftovers,BIF_ARG_2);
+}
+
+/*
+ * Produce the result of a characters_to_list step once do_build_utf8()
+ * has filled bytes[0..pos) with UTF-8 holding the given number of
+ * characters.
+ *
+ *  - err set: finish via finalize_list_to_list() in state UTF8_ERROR,
+ *    with any leftover bytes put first in the rest as a binary;
+ *  - no rest but leftover bytes: finish in state UTF8_ERROR or
+ *    UTF8_INCOMPLETE, as decided by check_leftovers();
+ *  - a rest remains: save the buffer in a RestartContext inside a magic
+ *    binary and trap to characters_to_list_trap_1;
+ *  - otherwise: finish in state UTF8_OK.
+ *
+ * left is the remaining iteration budget; values below 1 are treated as 1.
+ */
+static BIF_RETTYPE build_list_return(Process *p, byte *bytes, int pos, Uint characters,
+                                     Eterm rest_term, int err,
+                                     byte *leftover, int num_leftovers,
+                                     Eterm latin1, int left)
+{
+    Eterm *hp;
+
+    if (left < 1) {
+        left = 1;
+    }
+
+    if (err) {
+        if (num_leftovers > 0) {
+            Eterm pending = new_binary(p, leftover, num_leftovers);
+            hp = HAlloc(p, 4);
+            rest_term = CONS(hp, rest_term, NIL);
+            hp += 2;
+            rest_term = CONS(hp, pending, rest_term);
+        }
+        BIF_RET(finalize_list_to_list(p, bytes, rest_term, 0U, pos, characters,
+                                      UTF8_ERROR, left, NIL));
+    }
+
+    if (rest_term == NIL && num_leftovers != 0) {
+        Eterm pending = new_binary(p, leftover, num_leftovers);
+        int state = (check_leftovers(leftover, num_leftovers) != 0)
+            ? UTF8_ERROR
+            : UTF8_INCOMPLETE;
+        BIF_RET(finalize_list_to_list(p, bytes, pending, 0U, pos, characters,
+                                      state, left, NIL));
+    }
+
+    /* All OK */
+    if (rest_term == NIL) { /* Success */
+        BIF_RET(finalize_list_to_list(p, bytes, NIL, 0U, pos, characters,
+                                      UTF8_OK, left, NIL));
+    }
+
+    { /* Trap */
+        RestartContext rc;
+
+        if (num_leftovers > 0) {
+            Eterm pending = new_binary(p, leftover, num_leftovers);
+            hp = HAlloc(p, 2);
+            rest_term = CONS(hp, pending, rest_term);
+        }
+        BUMP_ALL_REDS(p);
+        rc.bytes = bytes;
+        rc.num_processed_bytes = 0; /* not used */
+        rc.num_bytes_to_process = pos;
+        rc.num_resulting_chars = characters;
+        rc.state = UTF8_OK; /* not used */
+        BIF_TRAP3(&characters_to_list_trap_1_exp, p, make_magic_bin_for_restart(p,&rc),
+                  rest_term, latin1);
+    }
+}
+
+/*
+ * Trap target continuing the encoding phase of characters_to_list.
+ *
+ * BIF_ARG_1 is the magic binary holding the RestartContext,
+ * BIF_ARG_2 the remaining input and BIF_ARG_3 the encoding atom.
+ */
+static BIF_RETTYPE characters_to_list_trap_1(BIF_ALIST_3)
+{
+    RestartContext *rc = get_rc_from_bin(BIF_ARG_1);
+    byte *bytes = rc->bytes;
+    int pos = rc->num_bytes_to_process;
+    Uint characters = rc->num_resulting_chars;
+    int err = 0;
+    int latin1 = (BIF_ARG_3 == am_latin1);
+    byte leftover[4]; /* used for temp buffer too,
+                         otherwise 3 bytes would have been enough */
+    int num_leftovers = 0;
+    Eterm rest_term;
+    int left, sleft;
+
+    /* We take over the buffer; clear the pointer in the context
+       to avoid free due to later GC */
+    rc->bytes = NULL;
+
+    sleft = left = allowed_iterations(BIF_P);
+    rest_term = do_build_utf8(BIF_P, BIF_ARG_2, &left, latin1,
+                              bytes, &pos, &characters, &err,
+                              leftover, &num_leftovers);
+    cost_to_proc(BIF_P, sleft - left);
+    return build_list_return(BIF_P, bytes, pos, characters, rest_term, err,
+                             leftover, num_leftovers, BIF_ARG_3, left);
+}
+
+/*
+ * BIF converting character data (BIF_ARG_1) to a list of characters.
+ *
+ * BIF_ARG_2 is the input encoding: latin1, unicode or utf8. Any other
+ * value traps to unicode:characters_to_list_int/2. A plain binary with a
+ * non-latin1 encoding is handed to utf8_to_list(). Otherwise the input is
+ * first encoded as UTF-8 into a temporary buffer by do_build_utf8(), and
+ * build_list_return() takes it from there. Input rejected by utf8_need()
+ * raises badarg.
+ */
+BIF_RETTYPE unicode_characters_to_list_2(BIF_ALIST_2)
+{
+    Sint need;
+    int latin1;
+    Uint characters = 0;
+    byte *bytes;
+    int pos = 0;
+    int err = 0;
+    int left, sleft;
+    int counting_cost;
+    Eterm rest_term;
+    byte leftover[4]; /* used for temp buffer too,
+                         otherwise 3 bytes would have been enough */
+    int num_leftovers = 0;
+    Uint cost_of_utf8_need;
+
+    if (BIF_ARG_2 != am_latin1 && BIF_ARG_2 != am_unicode
+        && BIF_ARG_2 != am_utf8) {
+        BIF_TRAP2(c_to_l_int_trap_exportp, BIF_P, BIF_ARG_1, BIF_ARG_2);
+    }
+    latin1 = (BIF_ARG_2 == am_latin1);
+
+    if (!latin1 && is_binary(BIF_ARG_1)) { /* Optimized behaviour for this case */
+        return utf8_to_list(BIF_P,BIF_ARG_1);
+    }
+
+    need = utf8_need(BIF_ARG_1,latin1,&cost_of_utf8_need);
+    if (need < 0) {
+        BIF_ERROR(BIF_P,BADARG);
+    }
+    bytes = alloc_restart(need);
+
+    /* Charge for the size calculation before asking for the remaining
+       iteration budget */
+    counting_cost = simple_loops_to_common(cost_of_utf8_need);
+    cost_to_proc(BIF_P, counting_cost);
+    left = allowed_iterations(BIF_P) - counting_cost;
+    if (left < 1) {
+        /* simplified - let everything be setup by setting left to 1 */
+        left = 1;
+    }
+    sleft = left;
+
+    rest_term = do_build_utf8(BIF_P, BIF_ARG_1, &left, latin1,
+                              bytes, &pos, &characters, &err,
+                              leftover, &num_leftovers);
+    cost_to_proc(BIF_P, sleft - left);
+    return build_list_return(BIF_P,bytes,pos,characters,rest_term,err,
+                             leftover,num_leftovers,BIF_ARG_2,left);
+}
+
+
+/*
+ * When input to characters_to_list is a plain binary and the format is 'unicode', we do
+ * a faster analysis and size count with this function.
+ *
+ * Scans size bytes at source, checking that they form valid UTF-8 under
+ * the same rules as copy_utf8_bin(), without copying anything.
+ *
+ * *num_chars receives the number of complete, valid characters found and
+ * *err_pos the position just after the last of them (i.e. the start of
+ * the offending or incomplete sequence when the result is not UTF8_OK).
+ * If left is non-NULL it is an iteration budget, decremented once per
+ * character.
+ *
+ * Returns UTF8_OK when all bytes were accepted, UTF8_INCOMPLETE when the
+ * data ends in the middle of a sequence, UTF8_ERROR on invalid data and
+ * UTF8_ANALYZE_MORE when the budget in *left ran out.
+ */
+static int analyze_utf8(byte *source, Uint size,
+ byte **err_pos, Uint *num_chars, int *left)
+{
+ *err_pos = source;
+ *num_chars = 0;
+ while (size) {
+ if (((*source) & ((byte) 0x80)) == 0) {
+ source++;
+ --size;
+ } else if (((*source) & ((byte) 0xE0)) == 0xC0) {
+ if (size < 2) {
+ return UTF8_INCOMPLETE;
+ }
+ if (((source[1] & ((byte) 0xC0)) != 0x80) ||
+ ((*source) < 0xC2) /* overlong */) {
+ return UTF8_ERROR;
+ }
+ source += 2;
+ size -= 2;
+ } else if (((*source) & ((byte) 0xF0)) == 0xE0) {
+ if (size < 3) {
+ return UTF8_INCOMPLETE;
+ }
+ if (((source[1] & ((byte) 0xC0)) != 0x80) ||
+ ((source[2] & ((byte) 0xC0)) != 0x80) ||
+ (((*source) == 0xE0) && (source[1] < 0xA0)) /* overlong */ ) {
+ return UTF8_ERROR;
+ }
+ /* 0xED followed by 0xA0..0xBF encodes 0xD800..0xDFFF, rejected */
+ if ((((*source) & ((byte) 0xF)) == 0xD) &&
+ ((source[1] & 0x20) != 0)) {
+ return UTF8_ERROR;
+ }
+ /* 0xEF 0xBF 0xBE/0xBF encode 0xFFFE and 0xFFFF, rejected */
+ if (((*source) == 0xEF) && (source[1] == 0xBF) &&
+ ((source[2] == 0xBE) || (source[2] == 0xBF))) {
+ return UTF8_ERROR;
+ }
+ source += 3;
+ size -= 3;
+ } else if (((*source) & ((byte) 0xF8)) == 0xF0) {
+ if (size < 4) {
+ return UTF8_INCOMPLETE;
+ }
+ if (((source[1] & ((byte) 0xC0)) != 0x80) ||
+ ((source[2] & ((byte) 0xC0)) != 0x80) ||
+ ((source[3] & ((byte) 0xC0)) != 0x80) ||
+ (((*source) == 0xF0) && (source[1] < 0x90)) /* overlong */) {
+ return UTF8_ERROR;
+ }
+ /* Lead byte above 0xF4, or 0xF4 with a second byte above 0x8F:
+ value would be 0x110000 or larger */
+ if ((((*source) & ((byte)0x7)) > 0x4U) ||
+ ((((*source) & ((byte)0x7)) == 0x4U) &&
+ ((source[1] & ((byte)0x3F)) > 0xFU))) {
+ return UTF8_ERROR;
+ }
+ source += 4;
+ size -= 4;
+ } else {
+ return UTF8_ERROR;
+ }
+ ++(*num_chars);
+ *err_pos = source;
+ if (left && --(*left) <= 0) {
+ return UTF8_ANALYZE_MORE;
+ }
+ }
+ return UTF8_OK;
+}
+
+/*
+ * Build list cells from an already validated UTF-8 buffer. No errors
+ * should be able to occur here - no overlongs, no malformed sequences.
+ *
+ * The buffer bytes[0..sz-1] is walked backwards from its end, and each
+ * decoded code point is consed onto 'tail', so the list comes out in
+ * buffer order. At most 'num' characters are built; if 'left' is smaller
+ * than 'num', the count is reduced to 'left' (but never below one).
+ *
+ * *num_built receives the number of characters this call set out to
+ * build, *num_eaten the number of bytes consumed from the end of the
+ * buffer. Returns the list built so far.
+ */
+static Eterm do_utf8_to_list(Process *p, Uint num, byte *bytes, Uint sz,
+                             Uint left,
+                             Uint *num_built, Uint *num_eaten, Eterm tail)
+{
+    Eterm *heap;
+    Eterm list;
+    byte *buf_end;
+    byte *cur;
+    Uint code;
+
+    ASSERT(num > 0);
+    if (left < num) {
+        /* Limited by the iteration budget, but always make progress */
+        num = (left > 0) ? left : 1;
+    }
+
+    *num_built = num; /* Always */
+
+    heap = HAlloc(p,num * 2);
+    list = tail;
+    buf_end = bytes + sz;
+
+    for (cur = buf_end - 1; cur >= bytes; --cur) {
+        byte b = *cur;
+
+        if ((b & ((byte) 0x80)) == 0) {
+            code = (Uint) b;
+        } else if ((b & ((byte) 0xE0)) == 0xC0) {
+            code = ((Uint) (b & ((byte) 0x1F))) << 6;
+            code |= (Uint) (cur[1] & ((byte) 0x3F));
+        } else if ((b & ((byte) 0xF0)) == 0xE0) {
+            code = ((Uint) (b & ((byte) 0xF))) << 12;
+            code |= ((Uint) (cur[1] & ((byte) 0x3F))) << 6;
+            code |= (Uint) (cur[2] & ((byte) 0x3F));
+        } else if ((b & ((byte) 0xF8)) == 0xF0) {
+            code = ((Uint) (b & ((byte) 0x7))) << 18;
+            code |= ((Uint) (cur[1] & ((byte) 0x3F))) << 12;
+            code |= ((Uint) (cur[2] & ((byte) 0x3F))) << 6;
+            code |= (Uint) (cur[3] & ((byte) 0x3F));
+        } else {
+            /* Not a lead byte (2#10XXXXXX): keep backing up */
+            continue;
+        }
+
+        list = CONS(heap,make_small(code),list);
+        heap += 2;
+
+        num--;
+        if (num == 0) {
+            break; /* cur stays on the lead byte of the last char built */
+        }
+    }
+    *num_eaten = (buf_end - cur);
+    return list;
+}
+
+/*
+ * Last step of characters_to_list: turn the UTF-8 buffer 'bytes' (created
+ * in the same way as for characters_to_utf8) into a list. All sizes are
+ * known in advance, and most data is kept in a "magic binary" while
+ * trapping.
+ *
+ * If not all of the remaining characters could be built within 'left'
+ * iterations, the progress is saved in a RestartContext and the BIF
+ * traps to characters_to_list_trap_2 with the list built so far as the
+ * new tail. Otherwise 'bytes' is released and the result is returned:
+ * the plain list, or {incomplete, List, Rest} / {error, List, Rest}
+ * depending on 'state'.
+ */
+static BIF_RETTYPE finalize_list_to_list(Process *p,
+                                         byte *bytes,
+                                         Eterm rest,
+                                         Uint num_processed_bytes,
+                                         Uint num_bytes_to_process,
+                                         Uint num_resulting_chars,
+                                         int state, int left,
+                                         Eterm tail)
+{
+    Eterm list = tail;
+    Eterm result;
+    Eterm *hp;
+
+    if (num_bytes_to_process != 0) {
+        Uint built = 0; /* characters */
+        Uint eaten = 0; /* bytes */
+
+        list = do_utf8_to_list(p, num_resulting_chars,
+                               bytes, num_bytes_to_process,
+                               left, &built, &eaten, tail);
+        cost_to_proc(p,built);
+
+        if (built != num_resulting_chars) {
+            /* Work left to do: save the progress and trap */
+            RestartContext rc;
+
+            rc.bytes = bytes;
+            rc.state = state;
+            rc.num_processed_bytes = num_processed_bytes + eaten;
+            rc.num_bytes_to_process = num_bytes_to_process - eaten;
+            rc.num_resulting_chars = num_resulting_chars - built;
+            BUMP_ALL_REDS(p);
+            BIF_TRAP3(&characters_to_list_trap_2_exp, p,
+                      make_magic_bin_for_restart(p, &rc), rest, list);
+        }
+    }
+
+    /*
+     * No more trapping, so the temporary array can go.
+     */
+    free_restart(bytes);
+
+    if (state == UTF8_INCOMPLETE || state == UTF8_ERROR) {
+        Eterm tag = (state == UTF8_INCOMPLETE) ? am_incomplete : am_error;
+
+        hp = HAlloc(p,4);
+        result = TUPLE3(hp,tag,list,rest);
+    } else {
+        result = list;
+    }
+
+    BIF_RET(result);
+}
+
+/*
+ * Trap target used while building the list from a restart buffer.
+ * BIF_ARG_1 carries the restart context, BIF_ARG_2 the rest term and
+ * BIF_ARG_3 the list built so far.
+ */
+static BIF_RETTYPE characters_to_list_trap_2(BIF_ALIST_3)
+{
+    RestartContext *ctx = get_rc_from_bin(BIF_ARG_1);
+    byte *buffer = ctx->bytes;
+
+    /*
+     * Take the buffer out of the context: we don't want it freed just
+     * yet. finalize_list_to_list either frees it itself or hands it
+     * over to a new restart context.
+     */
+    ctx->bytes = NULL;
+
+    return finalize_list_to_list(BIF_P, buffer, BIF_ARG_2,
+                                 ctx->num_processed_bytes,
+                                 ctx->num_bytes_to_process,
+                                 ctx->num_resulting_chars,
+                                 ctx->state,
+                                 allowed_iterations(BIF_P),
+                                 BIF_ARG_3);
+}
+
+
+/*
+ * Hooks into the process of decoding a binary depending on state.
+ *
+ * If state is UTF8_ANALYZE_MORE, num_bytes_to_process and
+ * num_resulting_chars grow until the binary is fully analyzed (trapping
+ * to characters_to_list_trap_3 in between when the iteration budget runs
+ * out). After that the bytes to process are eaten, lowering
+ * num_bytes_to_process and num_resulting_chars while increasing
+ * num_processed_bytes until done (trapping to characters_to_list_trap_4
+ * in between). In that stage 'state' tells how to return: error,
+ * incomplete or ok.
+ *
+ * Note that num_processed_bytes and num_bytes_to_process together make up
+ * the length of the binary part to process, not necessarily the length
+ * of the whole binary (if there are errors or an incomplete tail).
+ *
+ * Analyzing happens from the beginning of the binary towards the end,
+ * while the result is built from the end of the analyzed/accepted part
+ * towards the beginning.
+ *
+ * Note that this routine is *only* called when the original input was a
+ * plain utf8 binary, otherwise the rest and the sizes are known in
+ * advance, so finalize_list_to_list is used to build the resulting list
+ * (no analyzing needed).
+ *
+ * Parameters:
+ *   p                    - calling process
+ *   orig_bin             - the whole input binary
+ *   num_processed_bytes  - bytes already turned into list cells
+ *   num_bytes_to_process - analyzed bytes still to be turned into cells
+ *   num_resulting_chars  - characters still to be built
+ *   state                - UTF8_ANALYZE_MORE, or the result of analysis
+ *   tail                 - list built so far (NIL if nothing built)
+ *
+ * Returns the list, {incomplete, List, RestBin} or {error, List, RestBin},
+ * or traps.
+ */
+static BIF_RETTYPE do_bif_utf8_to_list(Process *p,
+                                       Eterm orig_bin,
+                                       Uint num_processed_bytes,
+                                       Uint num_bytes_to_process,
+                                       Uint num_resulting_chars,
+                                       int state,
+                                       Eterm tail)
+{
+    int left;
+    Uint bitoffs;
+    Uint bitsize;
+    Uint size;
+    byte *bytes;
+    Eterm converted = NIL;
+    Eterm rest = NIL;
+    Eterm *hp;
+    Eterm ret;
+    byte *temp_alloc = NULL;
+    byte *endpos;
+    Uint numchar;
+
+    Uint b_sz; /* size of the non analyzed tail */
+    Uint num_built; /* characters */
+    Uint num_eaten; /* bytes */
+
+    ERTS_GET_BINARY_BYTES(orig_bin, bytes, bitoffs, bitsize);
+    if (bitsize != 0) {
+        /* Not a whole number of bytes: the entire input is the error rest */
+        converted = NIL;
+        rest = orig_bin;
+        goto error_return;
+    }
+    if (bitoffs != 0) {
+        /* Unaligned data: work on an aligned temporary copy */
+        bytes = erts_get_aligned_binary_bytes(orig_bin, &temp_alloc);
+    }
+
+    size = binary_size(orig_bin);
+
+    left = allowed_iterations(p);
+
+    if (state == UTF8_ANALYZE_MORE) {
+        /* Continue analyzing where the previous round stopped */
+        state = analyze_utf8(bytes + num_bytes_to_process,
+                             size - num_bytes_to_process,
+                             &endpos,&numchar,&left);
+        cost_to_proc(p,numchar);
+        num_resulting_chars += numchar;
+        num_bytes_to_process = endpos - bytes;
+        if (state == UTF8_ANALYZE_MORE) {
+            Eterm epos = erts_make_integer(num_bytes_to_process,p);
+            Eterm enumchar = erts_make_integer(num_resulting_chars,p);
+            erts_free_aligned_binary_bytes(temp_alloc);
+            BUMP_ALL_REDS(p);
+            BIF_TRAP3(&characters_to_list_trap_3_exp, p, orig_bin, epos,
+                      enumchar);
+        }
+    }
+
+    /*
+     * If we're here, we have everything analyzed and are instead building
+     */
+
+    if (!num_bytes_to_process) {
+        converted = tail;
+    } else {
+        num_built = 0;
+        num_eaten = 0;
+        converted = do_utf8_to_list(p, num_resulting_chars,
+                                    bytes, num_bytes_to_process,
+                                    left, &num_built, &num_eaten, tail);
+        cost_to_proc(p,num_built);
+
+        if (num_built != num_resulting_chars) { /* work left to do */
+            Eterm newnum_resulting_chars =
+                erts_make_integer(num_resulting_chars - num_built,p);
+            Eterm newnum_bytes_to_process =
+                erts_make_integer(num_bytes_to_process - num_eaten,p);
+            Eterm newnum_processed_bytes =
+                erts_make_integer(num_processed_bytes + num_eaten,p);
+            Eterm traptuple;
+            hp = HAlloc(p,7);
+            traptuple = TUPLE6(hp,orig_bin,newnum_processed_bytes,
+                               newnum_bytes_to_process,
+                               newnum_resulting_chars,
+                               make_small(state),
+                               converted);
+            BUMP_ALL_REDS(p);
+            erts_free_aligned_binary_bytes(temp_alloc);
+            BIF_TRAP1(&characters_to_list_trap_4_exp,p,traptuple);
+        }
+    }
+
+    /*
+     * OK, no more trapping, let's build rest binary if there should
+     * be one.
+     */
+
+    b_sz = size - (num_bytes_to_process + num_processed_bytes);
+
+    if (b_sz) {
+        ErlSubBin *sb;
+        Eterm orig;
+        Uint offset;
+        ASSERT(state != UTF8_OK);
+        hp = HAlloc(p, ERL_SUB_BIN_SIZE);
+        sb = (ErlSubBin *) hp;
+        ERTS_GET_REAL_BIN(orig_bin, orig, offset, bitoffs, bitsize);
+        sb->thing_word = HEADER_SUB_BIN;
+        sb->size = b_sz;
+        /*
+         * The new sub binary refers to the underlying binary 'orig', not
+         * to orig_bin, so orig_bin's own byte offset into 'orig' has to
+         * be included. Without it the rest would point at the wrong
+         * bytes whenever the input is itself a sub binary that does not
+         * start at the beginning of its underlying binary.
+         */
+        sb->offs = offset + num_bytes_to_process + num_processed_bytes;
+        sb->orig = orig;
+        sb->bitoffs = bitoffs;
+        sb->bitsize = bitsize;
+        sb->is_writable = 0;
+        rest = make_binary(sb);
+    }
+
+    /* Done */
+
+    if (state == UTF8_INCOMPLETE) {
+        /* Leftover bytes that check_leftovers rejects are an error instead */
+        if (check_leftovers(bytes + num_bytes_to_process + num_processed_bytes,
+                            b_sz) != 0) {
+            goto error_return;
+        }
+        hp = HAlloc(p,4);
+        ret = TUPLE3(hp,am_incomplete,converted,rest);
+    } else if (state == UTF8_ERROR) {
+    error_return:
+        hp = HAlloc(p,4);
+        ret = TUPLE3(hp,am_error,converted,rest);
+    } else {
+        ret = converted;
+    }
+
+    erts_free_aligned_binary_bytes(temp_alloc);
+    BIF_RET(ret);
+}
+
+
+/*
+ * Trap target used while there is still analyzing left to do. Only
+ * reached when the original input was a binary.
+ *
+ * BIF_ARG_1 is the binary, BIF_ARG_2 the number of already analyzed and
+ * accepted bytes, BIF_ARG_3 the number of characters produced by the
+ * already analyzed part of the binary.
+ */
+
+static BIF_RETTYPE characters_to_list_trap_3(BIF_ALIST_3)
+{
+    Uint analyzed_bytes;
+    Uint analyzed_chars;
+
+    term_to_Uint(BIF_ARG_2, &analyzed_bytes);
+    term_to_Uint(BIF_ARG_3, &analyzed_chars);
+
+    /*
+     * Nothing has been processed or built yet, so the processed byte
+     * count is zero and the tail is empty; the state is always
+     * UTF8_ANALYZE_MORE here.
+     */
+    return do_bif_utf8_to_list(BIF_P, BIF_ARG_1, 0U,
+                               analyzed_bytes, analyzed_chars,
+                               UTF8_ANALYZE_MORE, NIL);
+}
+
+/*
+ * Trap target used when analyzing is done and we were trapped during
+ * building. Only reached when the original input was a binary.
+ *
+ * BIF_ARG_1 is the 6-tuple
+ *   {OrigBin, NumProcessedBytes, NumBytesToProcess,
+ *    NumResultingChars, State, Tail}
+ * created by do_bif_utf8_to_list.
+ */
+static BIF_RETTYPE characters_to_list_trap_4(BIF_ALIST_1)
+{
+    Eterm *elems = tuple_val(BIF_ARG_1);
+    Eterm whole_bin = elems[1];        /* The whole binary */
+    Eterm built_tail = elems[6];       /* The already built tail */
+    int saved_state = (int) signed_val(elems[5]); /* never ANALYZE_MORE */
+    Uint bytes_done;                   /* Bytes already processed */
+    Uint bytes_todo;                   /* Bytes left to process */
+    Uint chars_todo;                   /* Chars left to build */
+
+    term_to_Uint(elems[2], &bytes_done);
+    term_to_Uint(elems[3], &bytes_todo);
+    term_to_Uint(elems[4], &chars_todo);
+
+    return do_bif_utf8_to_list(BIF_P, whole_bin,
+                               bytes_done, bytes_todo, chars_todo,
+                               saved_state, built_tail);
+}
+/*
+ * Entry point used only when the characters are a plain unicode (utf8)
+ * binary. Instead of building an utf8 buffer, the given binary is
+ * analyzed and used directly.
+ *
+ * Fails with badarg unless the argument is a binary for which
+ * aligned_binary_size() is non-negative.
+ */
+
+static BIF_RETTYPE utf8_to_list(BIF_ALIST_1)
+{
+    if (is_binary(BIF_ARG_1) && aligned_binary_size(BIF_ARG_1) >= 0) {
+        /* Start from scratch: nothing analyzed, processed or built */
+        return do_bif_utf8_to_list(BIF_P, BIF_ARG_1, 0U, 0U, 0U,
+                                   UTF8_ANALYZE_MORE,NIL);
+    }
+    BIF_ERROR(BIF_P,BADARG);
+}
+
+
+/*
+ * atom_to_binary(Atom, Encoding)
+ *
+ * With 'latin1' the atom's name bytes are returned as they are. With
+ * 'utf8' or 'unicode' every name byte >= 16#80 is expanded into a
+ * two-byte UTF-8 sequence; if there are no such bytes the name is
+ * returned unchanged. Anything else (non-atom first argument, unknown
+ * encoding) fails with badarg.
+ */
+BIF_RETTYPE atom_to_binary_2(BIF_ALIST_2)
+{
+    Atom *atom;
+    int utf8_len;
+    int idx;
+    Eterm result;
+    byte *out;
+
+    if (is_not_atom(BIF_ARG_1)) {
+        BIF_ERROR(BIF_P, BADARG);
+    }
+
+    atom = atom_tab(atom_val(BIF_ARG_1));
+
+    if (BIF_ARG_2 == am_latin1) {
+        BIF_RET(new_binary(BIF_P, atom->name, atom->len));
+    }
+    if (BIF_ARG_2 != am_utf8 && BIF_ARG_2 != am_unicode) {
+        BIF_ERROR(BIF_P, BADARG);
+    }
+
+    /* First pass: how many bytes will the UTF-8 form need? */
+    utf8_len = 0;
+    for (idx = 0; idx < atom->len; idx++) {
+        if (atom->name[idx] >= 0x80) {
+            utf8_len += 2;
+        } else {
+            utf8_len += 1;
+        }
+    }
+    if (utf8_len == atom->len) {
+        /* Nothing to expand, the name can be used as it is */
+        BIF_RET(new_binary(BIF_P, atom->name, atom->len));
+    }
+
+    /* Second pass: encode into a freshly allocated binary */
+    result = new_binary(BIF_P, 0, utf8_len);
+    out = binary_bytes(result);
+    for (idx = 0; idx < atom->len; idx++) {
+        byte ch = atom->name[idx];
+
+        if (ch >= 0x80) {
+            *out++ = 0xC0 | (ch >> 6);
+            *out++ = 0x80 | (ch & 0x3F);
+        } else {
+            *out++ = ch;
+        }
+    }
+    BIF_RET(result);
+}
+
+/*
+ * Common implementation of binary_to_atom/2 and
+ * binary_to_existing_atom/2.
+ *
+ *   bin        - the binary holding the atom name
+ *   enc        - am_latin1, am_utf8 or am_unicode
+ *   must_exist - if non-zero, only look up an existing atom (badarg if
+ *                there is none); otherwise create the atom if needed
+ *
+ * For utf8/unicode only one- and two-byte sequences that decode to code
+ * points below 256 are accepted; the decoded bytes are used as the atom
+ * name. Fails with badarg on an unusable binary, an unknown encoding or
+ * bad data, and with system_limit when the name would be longer than
+ * MAX_ATOM_LENGTH characters.
+ */
+static BIF_RETTYPE
+binary_to_atom(Process* p, Eterm bin, Eterm enc, int must_exist)
+{
+    byte *temp_alloc = NULL;
+    byte *bytes;
+    Uint bin_size;
+    Eterm atom;
+
+    bytes = erts_get_aligned_binary_bytes(bin, &temp_alloc);
+    if (bytes == 0) {
+        BIF_ERROR(p, BADARG);
+    }
+    bin_size = binary_size(bin);
+
+    if (enc == am_latin1) {
+        if (bin_size > MAX_ATOM_LENGTH) {
+            goto system_limit;
+        }
+        if (!must_exist) {
+            atom = am_atom_put((char *)bytes, bin_size);
+        } else if (!erts_atom_get((char *)bytes, bin_size, &atom)) {
+            goto badarg;
+        }
+        goto done;
+    }
+
+    if (enc == am_utf8 || enc == am_unicode) {
+        char *buf;
+        char *dst;
+        int i;
+        int num_chars;
+        int found;
+
+        if (bin_size > 2*MAX_ATOM_LENGTH) {
+            /*
+             * Too long to fit in an atom whatever it contains; the
+             * analysis only decides which error to report.
+             */
+            byte* err_pos;
+            Uint n;
+            int reds_left = bin_size+1; /* Number of reductions left. */
+
+            if (analyze_utf8(bytes, bin_size, &err_pos,
+                             &n, &reds_left) == UTF8_OK) {
+                /* Correct UTF-8 encoding, but too many characters */
+                goto system_limit;
+            }
+            /* Something is wrong in the UTF-8 encoding */
+            goto badarg;
+        }
+
+        /*
+         * Allocate a temporary buffer the same size as the binary,
+         * so that we don't need an extra overflow test.
+         */
+        buf = (char *) erts_alloc(ERTS_ALC_T_TMP, bin_size);
+        dst = buf;
+        for (i = 0; i < bin_size; i++) {
+            int c = bytes[i];
+            int c2;
+
+            if (c < 0x80) {
+                *dst++ = c;
+                continue;
+            }
+            if (!(i < bin_size-1)) {
+                goto free_badarg; /* no byte left for the continuation */
+            }
+            if ((c & 0xE0) != 0xC0) {
+                goto free_badarg; /* not the start of a 2-byte sequence */
+            }
+            i++;
+            c2 = bytes[i];
+            if ((c2 & 0xC0) != 0x80) {
+                goto free_badarg; /* not a continuation byte */
+            }
+            c = ((c & 0x3F) << 6) | (c2 & 0x3F);
+            if (c < 0x80 || c >= 256) {
+                goto free_badarg; /* overlong, or above 255 */
+            }
+            *dst++ = c;
+        }
+
+        num_chars = dst - buf;
+        if (num_chars > MAX_ATOM_LENGTH) {
+            erts_free(ERTS_ALC_T_TMP, (void *) buf);
+            goto system_limit;
+        }
+        if (!must_exist) {
+            atom = am_atom_put(buf, num_chars);
+            found = 1;
+        } else {
+            found = erts_atom_get(buf, num_chars, &atom);
+        }
+        erts_free(ERTS_ALC_T_TMP, (void *) buf);
+        if (!found) {
+            goto badarg;
+        }
+        goto done;
+
+    free_badarg:
+        erts_free(ERTS_ALC_T_TMP, (void *) buf);
+        goto badarg;
+    }
+
+    /* Unknown encoding: fall through to badarg */
+
+ badarg:
+    erts_free_aligned_binary_bytes(temp_alloc);
+    BIF_ERROR(p, BADARG);
+
+ system_limit:
+    erts_free_aligned_binary_bytes(temp_alloc);
+    BIF_ERROR(p, SYSTEM_LIMIT);
+
+ done:
+    erts_free_aligned_binary_bytes(temp_alloc);
+    BIF_RET(atom);
+}
+
+/* binary_to_atom(Binary, Encoding): the atom is created if needed. */
+BIF_RETTYPE binary_to_atom_2(BIF_ALIST_2)
+{
+    const int must_exist = 0;
+
+    return binary_to_atom(BIF_P, BIF_ARG_1, BIF_ARG_2, must_exist);
+}
+
+/* binary_to_existing_atom(Binary, Encoding): the atom must already exist. */
+BIF_RETTYPE binary_to_existing_atom_2(BIF_ALIST_2)
+{
+    const int must_exist = 1;
+
+    return binary_to_atom(BIF_P, BIF_ARG_1, BIF_ARG_2, must_exist);
+}