From 25e22145d129a80dcfc02c64dfe0b0d890a5e26d Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Tue, 12 Oct 2010 17:22:42 +0200 Subject: Add bifs to translate between erlang filenames and native encoding --- erts/emulator/Makefile.in | 3 +- erts/emulator/beam/atom.names | 1 + erts/emulator/beam/bif.tab | 6 + erts/emulator/beam/binary.c | 10 +- erts/emulator/beam/erl_binary.h | 2 + erts/emulator/beam/erl_init.c | 1 + erts/emulator/beam/erl_unicode.c | 384 ++++++++++++++++++++++++- erts/emulator/beam/sys.h | 12 + erts/emulator/sys/common/erl_sys_common_misc.c | 78 +++++ 9 files changed, 490 insertions(+), 7 deletions(-) create mode 100644 erts/emulator/sys/common/erl_sys_common_misc.c (limited to 'erts') diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in index 4ed0ccabc6..6c33e2ca16 100644 --- a/erts/emulator/Makefile.in +++ b/erts/emulator/Makefile.in @@ -796,7 +796,8 @@ endif OS_OBJS += $(OBJDIR)/erl_mseg.o \ $(OBJDIR)/erl_$(ERLANG_OSTYPE)_sys_ddll.o \ - $(OBJDIR)/erl_mtrace_sys_wrap.o + $(OBJDIR)/erl_mtrace_sys_wrap.o \ + $(OBJDIR)/erl_sys_common_misc.o HIPE_x86_OS_OBJS=$(HIPE_x86_$(OPSYS)_OBJS) HIPE_x86_OBJS=$(OBJDIR)/hipe_x86.o $(OBJDIR)/hipe_x86_glue.o $(OBJDIR)/hipe_x86_bifs.o $(OBJDIR)/hipe_x86_signal.o $(OBJDIR)/hipe_x86_stack.o $(HIPE_x86_OS_OBJS) diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index 327620772f..93b8e3ec28 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -549,6 +549,7 @@ atom waiting atom wall_clock atom warning atom warning_msg +atom win_wchar atom wordsize atom write_concurrency atom xor diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab index 0674aae77f..55166417e5 100644 --- a/erts/emulator/beam/bif.tab +++ b/erts/emulator/beam/bif.tab @@ -794,6 +794,12 @@ bif binary:decode_unsigned/2 bif erlang:nif_error/1 bif erlang:nif_error/2 +# +# Helpers for unicode filenames +# +bif file:name2native/1 +bif file:native2name/1 +bif file:native_name_encoding/0 # # Obsolete # diff --git a/erts/emulator/beam/binary.c b/erts/emulator/beam/binary.c index 8ee8fbcb29..4be869f269 100644 --- a/erts/emulator/beam/binary.c +++ b/erts/emulator/beam/binary.c @@ -217,8 +217,8 @@ erts_get_aligned_binary_bytes_extra(Eterm bin, byte** base_ptr, ErtsAlcType_t al return bytes; } -static Eterm -bin_bytes_to_list(Eterm previous, Eterm* hp, byte* bytes, Uint size, Uint bitoffs) +Eterm +erts_bin_bytes_to_list(Eterm previous, Eterm* hp, byte* bytes, Uint size, Uint bitoffs) { if (bitoffs == 0) { while (size) { @@ -263,7 +263,7 @@ BIF_RETTYPE binary_to_list_1(BIF_ALIST_1) Eterm* hp = HAlloc(BIF_P, 2 * size); byte* bytes = binary_bytes(real_bin)+offset; - BIF_RET(bin_bytes_to_list(NIL, hp, bytes, size, bitoffs)); + BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs)); } error: @@ -295,7 +295,7 @@ BIF_RETTYPE binary_to_list_3(BIF_ALIST_3) } i = stop-start+1; hp = HAlloc(BIF_P, 2*i); - BIF_RET(bin_bytes_to_list(NIL, hp, bytes+start-1, i, bitoffs)); + BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes+start-1, i, bitoffs)); error: BIF_ERROR(BIF_P, BADARG); @@ -339,7 +339,7 @@ BIF_RETTYPE bitstring_to_list_1(BIF_ALIST_1) previous = CONS(hp, make_binary(last), previous); hp += 2; } - BIF_RET(bin_bytes_to_list(previous, hp, bytes, size, bitoffs)); + BIF_RET(erts_bin_bytes_to_list(previous, hp, bytes, size, bitoffs)); } diff --git a/erts/emulator/beam/erl_binary.h b/erts/emulator/beam/erl_binary.h index a569fe2e85..bdf0fe23fc 100644 --- a/erts/emulator/beam/erl_binary.h +++ b/erts/emulator/beam/erl_binary.h @@ -152,6 +152,8 @@ do { \ void erts_init_binary(void); byte* erts_get_aligned_binary_bytes_extra(Eterm, byte**, ErtsAlcType_t, unsigned extra); +/* Used by unicode module */ +Eterm erts_bin_bytes_to_list(Eterm previous, Eterm* hp, byte* bytes, Uint size, Uint bitoffs); /* * Common implementation for erlang:list_to_binary/1 and binary:list_to_bin/1 diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index a9f4f041ac..a7892e143b 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -251,6 +251,7 @@ erl_init(int ncpu) erts_init_monitors(); erts_init_gc(); init_time(); + erts_init_sys_common_misc(); erts_init_process(ncpu); erts_init_scheduling(use_multi_run_queue, no_schedulers, diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index d01a3661f9..671c3c0cdf 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -463,7 +463,7 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */ } objp = list_val(ioterm); obj = CAR(objp); - if (!is_byte(obj)) + if (!is_small(obj)) break; } } else if (is_nil(obj)) { @@ -1813,3 +1813,385 @@ BIF_RETTYPE binary_to_existing_atom_2(BIF_ALIST_2) { return binary_to_atom(BIF_P, BIF_ARG_1, BIF_ARG_2, 1); } + +/********************************************************** + * Simpler non-interruptable routines for UTF-8 and + * Windowish UTF-16 (restricted) + **********************************************************/ +static Sint simple_char_need(Eterm ioterm, int encoding) +{ + Eterm *objp; + Eterm obj; + DECLARE_ESTACK(stack); + Sint need = 0; + + if (is_atom(ioterm)) { + Atom* ap; + int i; + ap = atom_tab(atom_val(ioterm)); + switch (encoding) { + case ERL_FILENAME_LATIN1: + need = ap->len; + break; + case ERL_FILENAME_UTF8: + for (i = 0; i < ap->len; i++) { + need += (ap->name[i] >= 0x80) ? 2 : 1; + } + break; + case ERL_FILENAME_WIN_WCHAR: + need = 2*(ap->len); + break; + default: + need = -1; + } + DESTROY_ESTACK(stack); + return need; + } + + if (is_nil(ioterm)) { + DESTROY_ESTACK(stack); + return need; + } + if (!is_list(ioterm)) { + DESTROY_ESTACK(stack); + return (Sint) -1; + } + /* OK a list, needs to be processed in order, handling each flat list-level + as they occur, just like io_list_to_binary would */ + ESTACK_PUSH(stack,ioterm); + while (!ESTACK_ISEMPTY(stack)) { + ioterm = ESTACK_POP(stack); + if (is_nil(ioterm)) { + /* ignore empty lists */ + continue; + } + if(is_list(ioterm)) { +L_Again: /* Restart with sublist, old listend was pushed on stack */ + objp = list_val(ioterm); + obj = CAR(objp); + for(;;) { /* loop over one flat list of bytes and binaries + until sublist or list end is encountered */ + if (is_small(obj)) { /* Always small */ + for(;;) { + Uint x = unsigned_val(obj); + switch (encoding) { + case ERL_FILENAME_LATIN1: + need += 1; + break; + case ERL_FILENAME_UTF8: + if (x < 0x80) { + need +=1; + } else if (x < 0x800) { + need += 2; + } else if (x < 0x10000) { + if ((x >= 0xD800 && x <= 0xDFFF) || + (x == 0xFFFE) || + (x == 0xFFFF)) { /* Invalid unicode range */ + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + need += 3; + } else if (x < 0x110000) { + need += 4; + } else { + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + break; + case ERL_FILENAME_WIN_WCHAR: + if (x <= 0xffff) { + need += 2; + break; + } /* else fall throug to error */ + default: + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + + /* everything else will give badarg later + in the process, so we dont check */ + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + if (!is_small(obj)) + break; + } + } else if (is_nil(obj)) { + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + } else if (is_list(obj)) { + /* push rest of list for later processing, start + again with sublist */ + ESTACK_PUSH(stack,CDR(objp)); + ioterm = obj; + goto L_Again; + } else { + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + if (is_nil(ioterm) || !is_list(ioterm)) { + break; + } + } /* for(;;) */ + } /* is_list(ioterm) */ + + if (!is_list(ioterm) && !is_nil(ioterm)) { + /* inproper list end */ + DESTROY_ESTACK(stack); + return ((Sint) -1); + } + } /* while not estack empty */ + DESTROY_ESTACK(stack); + return need; +} + +static void simple_put_chars(Eterm ioterm, int encoding, byte *p) +{ + Eterm *objp; + Eterm obj; + DECLARE_ESTACK(stack); + + if (is_atom(ioterm)) { + Atom* ap; + int i; + ap = atom_tab(atom_val(ioterm)); + switch (encoding) { + case ERL_FILENAME_LATIN1: + for (i = 0; i < ap->len; i++) { + *p++ = ap->name[i]; + } + break; + case ERL_FILENAME_UTF8: + for (i = 0; i < ap->len; i++) { + if(ap->name[i] < 0x80) { + *p++ = ap->name[i]; + } else { + *p++ = (((ap->name[i]) >> 6) | ((byte) 0xC0)); + *p++ = (((ap->name[i]) & 0x3F) | ((byte) 0x80)); + } + } + break; + case ERL_FILENAME_WIN_WCHAR: + for (i = 0; i < ap->len; i++) { + /* Little endian */ + *p++ = ap->name[i]; + *p++ = 0; + } + break; + default: + ASSERT(0); + } + DESTROY_ESTACK(stack); + return; + } + + if (is_nil(ioterm)) { + DESTROY_ESTACK(stack); + return; + } + ASSERT(is_list(ioterm)); + /* OK a list, needs to be processed in order, handling each flat list-level + as they occur, just like io_list_to_binary would */ + ESTACK_PUSH(stack,ioterm); + while (!ESTACK_ISEMPTY(stack)) { + ioterm = ESTACK_POP(stack); + if (is_nil(ioterm)) { + /* ignore empty lists */ + continue; + } + if(is_list(ioterm)) { +L_Again: /* Restart with sublist, old listend was pushed on stack */ + objp = list_val(ioterm); + obj = CAR(objp); + for(;;) { /* loop over one flat list of bytes and binaries + until sublist or list end is encountered */ + if (is_small(obj)) { /* Always small */ + for(;;) { + Uint x = unsigned_val(obj); + switch (encoding) { + case ERL_FILENAME_LATIN1: + ASSERT( x < 256); + *p++ = (byte) x; + break; + case ERL_FILENAME_UTF8: + if (x < 0x80) { + *p++ = (byte) x; + } + else if (x < 0x800) { + *p++ = (((byte) (x >> 6)) | + ((byte) 0xC0)); + *p++ = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } else if (x < 0x10000) { + ASSERT(!((x >= 0xD800 && x <= 0xDFFF) || + (x == 0xFFFE) || + (x == 0xFFFF))); + *p++ = (((byte) (x >> 12)) | + ((byte) 0xE0)); + *p++ = ((((byte) (x >> 6)) & 0x3F) | + ((byte) 0x80)); + *p++ = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } else { + ASSERT(x < 0x110000); + *p++ = (((byte) (x >> 18)) | + ((byte) 0xF0)); + *p++ = ((((byte) (x >> 12)) & 0x3F) | + ((byte) 0x80)); + *p++ = ((((byte) (x >> 6)) & 0x3F) | + ((byte) 0x80)); + *p++ = (((byte) (x & 0x3F)) | + ((byte) 0x80)); + } + break; + case ERL_FILENAME_WIN_WCHAR: + ASSERT(x <= 0xFFFF); + *p++ = (byte) (x & 0xFFU); + *p++ = (byte) ((x >> 8) & 0xFFU); + break; + default: + ASSERT(0); + } + + /* everything else will give badarg later + in the process, so we dont check */ + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + if (!is_small(obj)) + break; + } + } else if (is_nil(obj)) { + ioterm = CDR(objp); + if (!is_list(ioterm)) { + break; + } + objp = list_val(ioterm); + obj = CAR(objp); + } else if (is_list(obj)) { + /* push rest of list for later processing, start + again with sublist */ + ESTACK_PUSH(stack,CDR(objp)); + ioterm = obj; + goto L_Again; + } else { + ASSERT(0); + } + if (is_nil(ioterm) || !is_list(ioterm)) { + break; + } + } /* for(;;) */ + } /* is_list(ioterm) */ + + ASSERT(is_list(ioterm) || is_nil(ioterm)); + } /* while not estack empty */ + DESTROY_ESTACK(stack); + return; +} + + + +BIF_RETTYPE file_name2native_1(BIF_ALIST_1) +{ + int encoding = erts_get_native_filename_encoding(); + Sint need; + Eterm bin_term; + byte* bin_p; + if ((need = simple_char_need(BIF_ARG_1,encoding)) < 0) { + BIF_ERROR(BIF_P,BADARG); + } + bin_term = new_binary(BIF_P, 0, need); + bin_p = binary_bytes(bin_term); + simple_put_chars(BIF_ARG_1,encoding,bin_p); + BIF_RET(bin_term); +} + +BIF_RETTYPE file_native2name_1(BIF_ALIST_1) +{ + Eterm real_bin; + Uint offset; + Uint size,num_chars; + Uint bitsize; + Uint bitoffs; + Eterm *hp; + byte *temp_alloc = NULL; + byte *bytes; + byte *err_pos; + Uint num_built; /* characters */ + Uint num_eaten; /* bytes */ + Eterm ret; + + if (is_not_binary(BIF_ARG_1)) { + BIF_ERROR(BIF_P,BADARG); + } + size = binary_size(BIF_ARG_1); + ERTS_GET_REAL_BIN(BIF_ARG_1, real_bin, offset, bitoffs, bitsize); + if (bitsize != 0) { + BIF_ERROR(BIF_P,BADARG); + } + if (size == 0) { + BIF_RET(NIL); + } + switch (erts_get_native_filename_encoding()) { + case ERL_FILENAME_LATIN1: + goto simple; + case ERL_FILENAME_UTF8: + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + if (analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != UTF8_OK) { + erts_free_aligned_binary_bytes(temp_alloc); + goto simple; + } + num_built = 0; + num_eaten = 0; + ret = do_utf8_to_list(BIF_P, num_chars, bytes, size, num_chars, &num_built, &num_eaten, NIL); + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(ret); + case ERL_FILENAME_WIN_WCHAR: + if ((size % 2) != 0) { + goto simple; + } + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + hp = HAlloc(BIF_P, size); + ret = NIL; + bytes += size-1; + while (size > 0) { + Uint x = ((Uint) *bytes--) << 8; + x |= ((Uint) *bytes--); + ret = CONS(hp,make_small(x),ret); + size -= 2; + } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(ret); + default: + goto simple; + } + simple: + hp = HAlloc(BIF_P, 2 * size); + bytes = binary_bytes(real_bin)+offset; + + BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs)); +} + +BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) +{ + switch (erts_get_native_filename_encoding()) { + case ERL_FILENAME_LATIN1: + BIF_RET(am_latin1); + case ERL_FILENAME_UTF8: + BIF_RET(am_utf8); + case ERL_FILENAME_WIN_WCHAR: + BIF_RET(am_win_wchar); + default: + BIF_RET(am_undefined); + } +} diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h index 0d15272aa8..d14e0ac105 100644 --- a/erts/emulator/beam/sys.h +++ b/erts/emulator/beam/sys.h @@ -1253,6 +1253,18 @@ char* win32_errorstr(int); #endif +/************************************************************************ + * Find out the native filename encoding of the process (look at locale of + * Unix processes and just do UTF16 on windows + ************************************************************************/ +#define ERL_FILENAME_UNKNOWN 0 +#define ERL_FILENAME_LATIN1 1 +#define ERL_FILENAME_UTF8 2 +#define ERL_FILENAME_WIN_WCHAR 3 + +int erts_get_native_filename_encoding(void); + +void erts_init_sys_common_misc(void); #endif diff --git a/erts/emulator/sys/common/erl_sys_common_misc.c b/erts/emulator/sys/common/erl_sys_common_misc.c new file mode 100644 index 0000000000..dbb59676c8 --- /dev/null +++ b/erts/emulator/sys/common/erl_sys_common_misc.c @@ -0,0 +1,78 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2006-2010. All Rights Reserved. + * + * The contents of this file are subject to the Erlang Public License, + * Version 1.1, (the "License"); you may not use this file except in + * compliance with the License. You should have received a copy of the + * Erlang Public License along with this software. If not, it can be + * retrieved online at http://www.erlang.org/. + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * %CopyrightEnd% + */ + +/* + * Description: Check I/O + * + * Author: Rickard Green + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "sys.h" +#include "global.h" + +#if !defined(__WIN32__) +#include +#if !defined(HAVE_SETLOCALE) || !defined(HAVE_NL_LANGINFO) || !defined(HAVE_LANGINFO_H) +#define PRIMITIVE_UTF8_CHECK 1 +#else +#include +#endif +#endif + +/* Written once and only once */ + +static int filename_encoding = ERL_FILENAME_UNKNOWN; + +void erts_init_sys_common_misc(void) +{ +#if defined(__WIN32__) + filename_encoding = ERL_FILENAME_WIN_WCHAR; +#else + char *l; + filename_encoding = ERL_FILENAME_LATIN1; +# ifdef PRIMITIVE_UTF8_CHECK + setlocale(LC_CTYPE, ""); /* Set international environment, + ignore result */ + if (((l = getenv("LC_ALL")) && *l) || + ((l = getenv("LC_CTYPE")) && *l) || + ((l = getenv("LANG")) && *l)) { + if (strstr(l, "UTF-8")) { + filename_encoding = ERL_FILENAME_UTF8; + } + } + +# else + l = setlocale(LC_CTYPE, ""); /* Set international environment */ + if (l != NULL) { + if (strcmp(nl_langinfo(CODESET), "UTF-8") == 0) { + filename_encoding = ERL_FILENAME_UTF8; + } + } +# endif +#endif +} + +int erts_get_native_filename_encoding(void) +{ + return filename_encoding; +} -- cgit v1.2.3