From 25e22145d129a80dcfc02c64dfe0b0d890a5e26d Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Tue, 12 Oct 2010 17:22:42 +0200 Subject: Add bifs to translate between erlang filenames and native encoding --- erts/emulator/beam/erl_unicode.c | 384 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 383 insertions(+), 1 deletion(-) (limited to 'erts/emulator/beam/erl_unicode.c') diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index d01a3661f9..671c3c0cdf 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -463,7 +463,7 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */ } objp = list_val(ioterm); obj = CAR(objp); - if (!is_byte(obj)) + if (!is_small(obj)) break; } } else if (is_nil(obj)) { @@ -1813,3 +1813,385 @@ BIF_RETTYPE binary_to_existing_atom_2(BIF_ALIST_2) { return binary_to_atom(BIF_P, BIF_ARG_1, BIF_ARG_2, 1); } + +/********************************************************** + * Simpler non-interruptable routines for UTF-8 and + * Windowish UTF-16 (restricted) + **********************************************************/ +static Sint simple_char_need(Eterm ioterm, int encoding) +{ + Eterm *objp; + Eterm obj; + DECLARE_ESTACK(stack); + Sint need = 0; + + if (is_atom(ioterm)) { + Atom* ap; + int i; + ap = atom_tab(atom_val(ioterm)); + switch (encoding) { + case ERL_FILENAME_LATIN1: + need = ap->len; + break; + case ERL_FILENAME_UTF8: + for (i = 0; i < ap->len; i++) { + need += (ap->name[i] >= 0x80) ? 2 : 1; + } + break; + case ERL_FILENAME_WIN_WCHAR: + need = 2*(ap->len); + break; + default: + need = -1; + } + DESTROY_ESTACK(stack); + return need; + } + + if (is_nil(ioterm)) { + DESTROY_ESTACK(stack); + return need; + } + if (!is_list(ioterm)) { + DESTROY_ESTACK(stack); + return (Sint) -1; + } + /* OK a list, needs to be processed in order, handling each flat list-level + as they occur, just like io_list_to_binary would */ + ESTACK_PUSH(stack,ioterm); + while (!ESTACK_ISEMPTY(stack)) { + ioterm = ESTACK_POP(stack); + if (is_nil(ioterm)) { + /* ignore empty lists */ + continue; + } + if(is_list(ioterm)) { +L_Again: /* Restart with sublist, old listend was pushed on stack */ + objp = list_val(ioterm); + obj = CAR(objp); + for(;;) { /* loop over one flat list of bytes and binaries + until sublist or list end is encountered */ + if (is_small(obj)) { /* Always small */ + for(;;) { + Uint x = unsigned_val(obj); + switch (encoding) { + case ERL_FILENAME_LATIN1: + need += 1; + break; + case ERL_FILENAME_UTF8: + if (x < 0x80) { + need +=1; + } else if (x < 0x800) { + need += 2; + } else if (x < 0x10000) { + if ((x >= 0xD800 && x <= 0xDFFF) || + (x == 0xFFFE) || + (x == 0xFFFF)) { /* Invalid unicode range */ + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + need += 3; + } else if (x < 0x110000) { + need += 4; + } else { + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + break; + case ERL_FILENAME_WIN_WCHAR: + if (x <= 0xffff) { + need += 2; + break; + } /* else fall throug to error */ + default: + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + + /* everything else will give badarg later + in the process, so we dont check */ + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + if (!is_small(obj)) + break; + } + } else if (is_nil(obj)) { + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + } else if (is_list(obj)) { + /* push rest of list for later processing, start + again with sublist */ + ESTACK_PUSH(stack,CDR(objp)); + ioterm = obj; + goto L_Again; + } else { + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + if (is_nil(ioterm) || !is_list(ioterm)) { + break; + } + } /* for(;;) */ + } /* is_list(ioterm) */ + + if (!is_list(ioterm) && !is_nil(ioterm)) { + /* inproper list end */ + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + } /* while not estack empty */ + DESTROY_ESTACK(stack); + return need; +} + +static void simple_put_chars(Eterm ioterm, int encoding, byte *p) +{ + Eterm *objp; + Eterm obj; + DECLARE_ESTACK(stack); + + if (is_atom(ioterm)) { + Atom* ap; + int i; + ap = atom_tab(atom_val(ioterm)); + switch (encoding) { + case ERL_FILENAME_LATIN1: + for (i = 0; i < ap->len; i++) { + *p++ = ap->name[i]; + } + break; + case ERL_FILENAME_UTF8: + for (i = 0; i < ap->len; i++) { + if(ap->name[i] < 0x80) { + *p++ = ap->name[i]; + } else { + *p++ = (((ap->name[i]) >> 6) | ((byte) 0xC0)); + *p++ = (((ap->name[i]) & 0x3F) | ((byte) 0x80)); + } + } + break; + case ERL_FILENAME_WIN_WCHAR: + for (i = 0; i < ap->len; i++) { + /* Little endian */ + *p++ = ap->name[i]; + *p++ = 0; + } + break; + default: + ASSERT(0); + } + DESTROY_ESTACK(stack); + return; + } + + if (is_nil(ioterm)) { + DESTROY_ESTACK(stack); + return; + } + ASSERT(is_list(ioterm)); + /* OK a list, needs to be processed in order, handling each flat list-level + as they occur, just like io_list_to_binary would */ + ESTACK_PUSH(stack,ioterm); + while (!ESTACK_ISEMPTY(stack)) { + ioterm = ESTACK_POP(stack); + if (is_nil(ioterm)) { + /* ignore empty lists */ + continue; + } + if(is_list(ioterm)) { +L_Again: /* Restart with sublist, old listend was pushed on stack */ + objp = list_val(ioterm); + obj = CAR(objp); + for(;;) { /* loop over one flat list of bytes and binaries + until sublist or list end is encountered */ + if (is_small(obj)) { /* Always small */ + for(;;) { + Uint x = unsigned_val(obj); + switch (encoding) { + case ERL_FILENAME_LATIN1: + ASSERT( x < 256); + *p++ = (byte) x; + break; + case ERL_FILENAME_UTF8: + if (x < 0x80) { + *p++ = (byte) x; + } + else if (x < 0x800) { + *p++ = (((byte) (x >> 6)) | + ((byte) 0xC0)); + *p++ = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } else if (x < 0x10000) { + ASSERT(!((x >= 0xD800 && x <= 0xDFFF) || + (x == 0xFFFE) || + (x == 0xFFFF))); + *p++ = (((byte) (x >> 12)) | + ((byte) 0xE0)); + *p++ = ((((byte) (x >> 6)) & 0x3F) | + ((byte) 0x80)); + *p++ = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } else { + ASSERT(x < 0x110000); + *p++ = (((byte) (x >> 18)) | + ((byte) 0xF0)); + *p++ = ((((byte) (x >> 12)) & 0x3F) | + ((byte) 0x80)); + *p++ = ((((byte) (x >> 6)) & 0x3F) | + ((byte) 0x80)); + *p++ = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } + break; + case ERL_FILENAME_WIN_WCHAR: + ASSERT(x <= 0xFFFF); + *p++ = (byte) (x & 0xFFU); + *p++ = (byte) ((x >> 8) & 0xFFU); + break; + default: + ASSERT(0); + } + + /* everything else will give badarg later + in the process, so we dont check */ + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + if (!is_small(obj)) + break; + } + } else if (is_nil(obj)) { + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + } else if (is_list(obj)) { + /* push rest of list for later processing, start + again with sublist */ + ESTACK_PUSH(stack,CDR(objp)); + ioterm = obj; + goto L_Again; + } else { + ASSERT(0); + } + if (is_nil(ioterm) || !is_list(ioterm)) { + break; + } + } /* for(;;) */ + } /* is_list(ioterm) */ + + ASSERT(is_list(ioterm) || is_nil(ioterm)); + } /* while not estack empty */ + DESTROY_ESTACK(stack); + return; +} + + + +BIF_RETTYPE file_name2native_1(BIF_ALIST_1) +{ + int encoding = erts_get_native_filename_encoding(); + Sint need; + Eterm bin_term; + byte* bin_p; + if ((need = simple_char_need(BIF_ARG_1,encoding)) < 0) { + BIF_ERROR(BIF_P,BADARG); + } + bin_term = new_binary(BIF_P, 0, need); + bin_p = binary_bytes(bin_term); + simple_put_chars(BIF_ARG_1,encoding,bin_p); + BIF_RET(bin_term); +} + +BIF_RETTYPE file_native2name_1(BIF_ALIST_1) +{ + Eterm real_bin; + Uint offset; + Uint size,num_chars; + Uint bitsize; + Uint bitoffs; + Eterm *hp; + byte *temp_alloc = NULL; + byte *bytes; + byte *err_pos; + Uint num_built; /* characters */ + Uint num_eaten; /* bytes */ + Eterm ret; + + if (is_not_binary(BIF_ARG_1)) { + BIF_ERROR(BIF_P,BADARG); + } + size = binary_size(BIF_ARG_1); + ERTS_GET_REAL_BIN(BIF_ARG_1, real_bin, offset, bitoffs, bitsize); + if (bitsize != 0) { + BIF_ERROR(BIF_P,BADARG); + } + if (size == 0) { + BIF_RET(NIL); + } + switch (erts_get_native_filename_encoding()) { + case ERL_FILENAME_LATIN1: + goto simple; + case ERL_FILENAME_UTF8: + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + if (analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != UTF8_OK) { + erts_free_aligned_binary_bytes(temp_alloc); + goto simple; + } + num_built = 0; + num_eaten = 0; + ret = do_utf8_to_list(BIF_P, num_chars, bytes, size, num_chars, &num_built, &num_eaten, NIL); + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(ret); + case ERL_FILENAME_WIN_WCHAR: + if ((size % 2) != 0) { + goto simple; + } + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + hp = HAlloc(BIF_P, size); + ret = NIL; + bytes += size-1; + while (size > 0) { + Uint x = ((Uint) *bytes--) << 8; + x |= ((Uint) *bytes--); + ret = CONS(hp,make_small(x),ret); + size -= 2; + } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(ret); + default: + goto simple; + } + simple: + hp = HAlloc(BIF_P, 2 * size); + bytes = binary_bytes(real_bin)+offset; + + BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs)); +} + +BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) +{ + switch (erts_get_native_filename_encoding()) { + case ERL_FILENAME_LATIN1: + BIF_RET(am_latin1); + case ERL_FILENAME_UTF8: + BIF_RET(am_utf8); + case ERL_FILENAME_WIN_WCHAR: + BIF_RET(am_win_wchar); + default: + BIF_RET(am_undefined); + } +} -- cgit v1.2.3