From 4cf08709189ea8b7e2ae20f85c390abd04ae48ae Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Wed, 13 Oct 2010 17:08:32 +0200 Subject: Teach filename to accept raw data and add filename enc option to emu --- erts/emulator/beam/atom.names | 1 - erts/emulator/beam/bif.tab | 4 +- erts/emulator/beam/erl_init.c | 22 ++++- erts/emulator/beam/erl_unicode.c | 108 +++++++++++++++++++++---- erts/emulator/beam/sys.h | 3 + erts/emulator/sys/common/erl_sys_common_misc.c | 52 ++++++++---- 6 files changed, 153 insertions(+), 37 deletions(-) (limited to 'erts') diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index 93b8e3ec28..327620772f 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -549,7 +549,6 @@ atom waiting atom wall_clock atom warning atom warning_msg -atom win_wchar atom wordsize atom write_concurrency atom xor diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab index 55166417e5..cf251d9016 100644 --- a/erts/emulator/beam/bif.tab +++ b/erts/emulator/beam/bif.tab @@ -797,8 +797,8 @@ bif erlang:nif_error/2 # # Helpers for unicode filenames # -bif file:name2native/1 -bif file:native2name/1 +bif file:internal_name2native/1 +bif file:internal_native2name/1 bif file:native_name_encoding/0 # # Obsolete diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index a7892e143b..464ee750f7 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -908,7 +908,27 @@ erl_start(int argc, char **argv) VERBOSE(DEBUG_SYSTEM, ("using display items %d\n",display_items)); break; - + case 'f': + if (!strncmp(argv[i],"-fn",3)) { + arg = get_arg(argv[i]+3, argv[i+1], &i); + switch (*arg) { + case 'u': + erts_set_user_requested_filename_encoding(ERL_FILENAME_UTF8); + break; + case 'l': + erts_set_user_requested_filename_encoding(ERL_FILENAME_LATIN1); + break; + case 'a': + erts_set_user_requested_filename_encoding(ERL_FILENAME_UNKNOWN); + default: + erts_fprintf(stderr, "bad filename encoding %s, can be (l,u or a)\n", arg); + erts_usage(); + } + break; + } else { + erts_fprintf(stderr, "%s unknown flag %s\n", argv[0], argv[i]); + erts_usage(); + } case 'l': display_loads++; break; diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 671c3c0cdf..3e7a935cef 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1876,6 +1876,10 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */ Uint x = unsigned_val(obj); switch (encoding) { case ERL_FILENAME_LATIN1: + if (x > 255) { + DESTROY_ESTACK(stack); + return ((Sint) -1); + } need += 1; break; case ERL_FILENAME_UTF8: @@ -2101,12 +2105,76 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */ -BIF_RETTYPE file_name2native_1(BIF_ALIST_1) +BIF_RETTYPE file_internal_name2native_1(BIF_ALIST_1) { int encoding = erts_get_native_filename_encoding(); Sint need; Eterm bin_term; byte* bin_p; + if (is_binary(BIF_ARG_1)) { + byte *temp_alloc = NULL; + byte *bytes; + byte *err_pos; + Uint size,num_chars; + Uint unipoint; + /* Uninterpreted encoding except if windows widechar, in case we convert from + utf8 to win_wchar */ + if (encoding != ERL_FILENAME_WIN_WCHAR) { + BIF_RET(BIF_ARG_1); + } + /* In a wchar world, the emulator flags only affect how + binaries are interpreted when sent from the user. */ + /* Determine real length and create a new binary */ + size = binary_size(BIF_ARG_1); + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + if (analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != UTF8_OK || + erts_get_user_requested_filename_encoding() == ERL_FILENAME_LATIN1) { + /* What to do now? Maybe latin1, so just take byte for byte instead */ + bin_term = new_binary(BIF_P, 0, size*2); + bin_p = binary_bytes(bin_term); + while (size--) { + *bin_p++ = *bytes++; + *bin_p++ = 0; + } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(bin_term); + } + /* OK, UTF8 ok, number of characters is in num_chars */ + bin_term = new_binary(BIF_P, 0, num_chars*2); + bin_p = binary_bytes(bin_term); + while (num_chars--) { + if (((*bytes) & ((byte) 0x80)) == 0) { + unipoint = (Uint) *bytes; + ++bytes; + } else if (((*bytes) & ((byte) 0xE0)) == 0xC0) { + unipoint = + (((Uint) ((*bytes) & ((byte) 0x1F))) << 6) | + ((Uint) (bytes[1] & ((byte) 0x3F))); + bytes += 2; + } else if (((*bytes) & ((byte) 0xF0)) == 0xE0) { + unipoint = + (((Uint) ((*bytes) & ((byte) 0xF))) << 12) | + (((Uint) (bytes[1] & ((byte) 0x3F))) << 6) | + ((Uint) (bytes[2] & ((byte) 0x3F))); + bytes +=3; + } else if (((*bytes) & ((byte) 0xF8)) == 0xF0) { + unipoint = + (((Uint) ((*bytes) & ((byte) 0x7))) << 18) | + (((Uint) (bytes[1] & ((byte) 0x3F))) << 12) | + (((Uint) (bytes[2] & ((byte) 0x3F))) << 6) | + ((Uint) (bytes[3] & ((byte) 0x3F))); + bytes += 4; + } else { + erl_exit(1,"Internal unicode error in file:name2native/1"); + } + *bin_p++ = (byte) (unipoint & 0xFF); + *bin_p++ = (byte) ((unipoint >> 8) & 0xFF); + } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(bin_term); + } /* binary */ + + if ((need = simple_char_need(BIF_ARG_1,encoding)) < 0) { BIF_ERROR(BIF_P,BADARG); } @@ -2116,7 +2184,7 @@ BIF_RETTYPE file_name2native_1(BIF_ALIST_1) BIF_RET(bin_term); } -BIF_RETTYPE file_native2name_1(BIF_ALIST_1) +BIF_RETTYPE file_internal_native2name_1(BIF_ALIST_1) { Eterm real_bin; Uint offset; @@ -2144,12 +2212,15 @@ BIF_RETTYPE file_native2name_1(BIF_ALIST_1) } switch (erts_get_native_filename_encoding()) { case ERL_FILENAME_LATIN1: - goto simple; + hp = HAlloc(BIF_P, 2 * size); + bytes = binary_bytes(real_bin)+offset; + + BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs)); case ERL_FILENAME_UTF8: bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); if (analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != UTF8_OK) { erts_free_aligned_binary_bytes(temp_alloc); - goto simple; + goto noconvert; } num_built = 0; num_eaten = 0; @@ -2157,12 +2228,16 @@ BIF_RETTYPE file_native2name_1(BIF_ALIST_1) erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(ret); case ERL_FILENAME_WIN_WCHAR: - if ((size % 2) != 0) { - goto simple; - } bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); - hp = HAlloc(BIF_P, size); - ret = NIL; + if ((size % 2) != 0) { /* Panic fixup to avoid crashing the emulator */ + size--; + hp = HAlloc(BIF_P, size+2); + ret = CONS(hp,make_small((Uint) bytes[size]),NIL); + hp += 2; + } else { + hp = HAlloc(BIF_P, size); + ret = NIL; + } bytes += size-1; while (size > 0) { Uint x = ((Uint) *bytes--) << 8; @@ -2173,13 +2248,10 @@ BIF_RETTYPE file_native2name_1(BIF_ALIST_1) erts_free_aligned_binary_bytes(temp_alloc); BIF_RET(ret); default: - goto simple; + goto noconvert; } - simple: - hp = HAlloc(BIF_P, 2 * size); - bytes = binary_bytes(real_bin)+offset; - - BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs)); + noconvert: + BIF_RET(BIF_ARG_1); } BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) @@ -2190,7 +2262,11 @@ BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0) case ERL_FILENAME_UTF8: BIF_RET(am_utf8); case ERL_FILENAME_WIN_WCHAR: - BIF_RET(am_win_wchar); + if (erts_get_user_requested_filename_encoding() == ERL_FILENAME_LATIN1) { + BIF_RET(am_latin1); + } else { + BIF_RET(am_utf8); + } default: BIF_RET(am_undefined); } diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h index d14e0ac105..57f2b2f16c 100644 --- a/erts/emulator/beam/sys.h +++ b/erts/emulator/beam/sys.h @@ -1263,6 +1263,9 @@ char* win32_errorstr(int); #define ERL_FILENAME_WIN_WCHAR 3 int erts_get_native_filename_encoding(void); +/* The set function is only to be used by erl_init! */ +void erts_set_user_requested_filename_encoding(int encoding); +int erts_get_user_requested_filename_encoding(void); void erts_init_sys_common_misc(void); diff --git a/erts/emulator/sys/common/erl_sys_common_misc.c b/erts/emulator/sys/common/erl_sys_common_misc.c index dbb59676c8..581c14b6c6 100644 --- a/erts/emulator/sys/common/erl_sys_common_misc.c +++ b/erts/emulator/sys/common/erl_sys_common_misc.c @@ -42,33 +42,51 @@ /* Written once and only once */ static int filename_encoding = ERL_FILENAME_UNKNOWN; +#if defined(__WIN32__) +static int user_filename_encoding = ERL_FILENAME_UTF8; /* Default unicode on windows */ +#else +static int user_filename_encoding = ERL_FILENAME_LATIN1; +#endif +void erts_set_user_requested_filename_encoding(int encoding) +{ + user_filename_encoding = encoding; +} + +int erts_get_user_requested_filename_encoding(void) +{ + return user_filename_encoding; +} void erts_init_sys_common_misc(void) { #if defined(__WIN32__) filename_encoding = ERL_FILENAME_WIN_WCHAR; #else - char *l; - filename_encoding = ERL_FILENAME_LATIN1; + if (user_filename_encoding != ERL_FILENAME_UNKNOWN) { + filename_encoding = user_filename_encoding; + } else { + char *l; + filename_encoding = ERL_FILENAME_LATIN1; # ifdef PRIMITIVE_UTF8_CHECK - setlocale(LC_CTYPE, ""); /* Set international environment, - ignore result */ - if (((l = getenv("LC_ALL")) && *l) || - ((l = getenv("LC_CTYPE")) && *l) || - ((l = getenv("LANG")) && *l)) { - if (strstr(l, "UTF-8")) { - filename_encoding = ERL_FILENAME_UTF8; - } - } - + setlocale(LC_CTYPE, ""); /* Set international environment, + ignore result */ + if (((l = getenv("LC_ALL")) && *l) || + ((l = getenv("LC_CTYPE")) && *l) || + ((l = getenv("LANG")) && *l)) { + if (strstr(l, "UTF-8")) { + filename_encoding = ERL_FILENAME_UTF8; + } + } + # else - l = setlocale(LC_CTYPE, ""); /* Set international environment */ - if (l != NULL) { - if (strcmp(nl_langinfo(CODESET), "UTF-8") == 0) { - filename_encoding = ERL_FILENAME_UTF8; + l = setlocale(LC_CTYPE, ""); /* Set international environment */ + if (l != NULL) { + if (strcmp(nl_langinfo(CODESET), "UTF-8") == 0) { + filename_encoding = ERL_FILENAME_UTF8; + } } - } # endif + } #endif } -- cgit v1.2.3