aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPatrik Nyblom <[email protected]>2010-10-12 17:22:42 +0200
committerPatrik Nyblom <[email protected]>2010-11-29 13:59:12 +0100
commit25e22145d129a80dcfc02c64dfe0b0d890a5e26d (patch)
treea662b428c06089a502b8ff0cd0e214bb8f956bc2
parent1cf74ffecc28bc620062cbff69620671f0927e9b (diff)
downloadotp-25e22145d129a80dcfc02c64dfe0b0d890a5e26d.tar.gz
otp-25e22145d129a80dcfc02c64dfe0b0d890a5e26d.tar.bz2
otp-25e22145d129a80dcfc02c64dfe0b0d890a5e26d.zip
Add bifs to translate between erlang filenames and native encoding
-rw-r--r--erts/emulator/Makefile.in3
-rw-r--r--erts/emulator/beam/atom.names1
-rw-r--r--erts/emulator/beam/bif.tab6
-rw-r--r--erts/emulator/beam/binary.c10
-rw-r--r--erts/emulator/beam/erl_binary.h2
-rw-r--r--erts/emulator/beam/erl_init.c1
-rw-r--r--erts/emulator/beam/erl_unicode.c384
-rw-r--r--erts/emulator/beam/sys.h12
-rw-r--r--erts/emulator/sys/common/erl_sys_common_misc.c78
9 files changed, 490 insertions, 7 deletions
diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in
index 4ed0ccabc6..6c33e2ca16 100644
--- a/erts/emulator/Makefile.in
+++ b/erts/emulator/Makefile.in
@@ -796,7 +796,8 @@ endif
OS_OBJS += $(OBJDIR)/erl_mseg.o \
$(OBJDIR)/erl_$(ERLANG_OSTYPE)_sys_ddll.o \
- $(OBJDIR)/erl_mtrace_sys_wrap.o
+ $(OBJDIR)/erl_mtrace_sys_wrap.o \
+ $(OBJDIR)/erl_sys_common_misc.o
HIPE_x86_OS_OBJS=$(HIPE_x86_$(OPSYS)_OBJS)
HIPE_x86_OBJS=$(OBJDIR)/hipe_x86.o $(OBJDIR)/hipe_x86_glue.o $(OBJDIR)/hipe_x86_bifs.o $(OBJDIR)/hipe_x86_signal.o $(OBJDIR)/hipe_x86_stack.o $(HIPE_x86_OS_OBJS)
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index 327620772f..93b8e3ec28 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -549,6 +549,7 @@ atom waiting
atom wall_clock
atom warning
atom warning_msg
+atom win_wchar
atom wordsize
atom write_concurrency
atom xor
diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab
index 0674aae77f..55166417e5 100644
--- a/erts/emulator/beam/bif.tab
+++ b/erts/emulator/beam/bif.tab
@@ -795,6 +795,12 @@ bif erlang:nif_error/1
bif erlang:nif_error/2
#
+# Helpers for unicode filenames
+#
+bif file:name2native/1
+bif file:native2name/1
+bif file:native_name_encoding/0
+#
# Obsolete
#
diff --git a/erts/emulator/beam/binary.c b/erts/emulator/beam/binary.c
index 8ee8fbcb29..4be869f269 100644
--- a/erts/emulator/beam/binary.c
+++ b/erts/emulator/beam/binary.c
@@ -217,8 +217,8 @@ erts_get_aligned_binary_bytes_extra(Eterm bin, byte** base_ptr, ErtsAlcType_t al
return bytes;
}
-static Eterm
-bin_bytes_to_list(Eterm previous, Eterm* hp, byte* bytes, Uint size, Uint bitoffs)
+Eterm
+erts_bin_bytes_to_list(Eterm previous, Eterm* hp, byte* bytes, Uint size, Uint bitoffs)
{
if (bitoffs == 0) {
while (size) {
@@ -263,7 +263,7 @@ BIF_RETTYPE binary_to_list_1(BIF_ALIST_1)
Eterm* hp = HAlloc(BIF_P, 2 * size);
byte* bytes = binary_bytes(real_bin)+offset;
- BIF_RET(bin_bytes_to_list(NIL, hp, bytes, size, bitoffs));
+ BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs));
}
error:
@@ -295,7 +295,7 @@ BIF_RETTYPE binary_to_list_3(BIF_ALIST_3)
}
i = stop-start+1;
hp = HAlloc(BIF_P, 2*i);
- BIF_RET(bin_bytes_to_list(NIL, hp, bytes+start-1, i, bitoffs));
+ BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes+start-1, i, bitoffs));
error:
BIF_ERROR(BIF_P, BADARG);
@@ -339,7 +339,7 @@ BIF_RETTYPE bitstring_to_list_1(BIF_ALIST_1)
previous = CONS(hp, make_binary(last), previous);
hp += 2;
}
- BIF_RET(bin_bytes_to_list(previous, hp, bytes, size, bitoffs));
+ BIF_RET(erts_bin_bytes_to_list(previous, hp, bytes, size, bitoffs));
}
diff --git a/erts/emulator/beam/erl_binary.h b/erts/emulator/beam/erl_binary.h
index a569fe2e85..bdf0fe23fc 100644
--- a/erts/emulator/beam/erl_binary.h
+++ b/erts/emulator/beam/erl_binary.h
@@ -152,6 +152,8 @@ do { \
void erts_init_binary(void);
byte* erts_get_aligned_binary_bytes_extra(Eterm, byte**, ErtsAlcType_t, unsigned extra);
+/* Used by unicode module */
+Eterm erts_bin_bytes_to_list(Eterm previous, Eterm* hp, byte* bytes, Uint size, Uint bitoffs);
/*
* Common implementation for erlang:list_to_binary/1 and binary:list_to_bin/1
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index a9f4f041ac..a7892e143b 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -251,6 +251,7 @@ erl_init(int ncpu)
erts_init_monitors();
erts_init_gc();
init_time();
+ erts_init_sys_common_misc();
erts_init_process(ncpu);
erts_init_scheduling(use_multi_run_queue,
no_schedulers,
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index d01a3661f9..671c3c0cdf 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -463,7 +463,7 @@ L_Again: /* Restart with sublist, old listend was pushed on stack */
}
objp = list_val(ioterm);
obj = CAR(objp);
- if (!is_byte(obj))
+ if (!is_small(obj))
break;
}
} else if (is_nil(obj)) {
@@ -1813,3 +1813,385 @@ BIF_RETTYPE binary_to_existing_atom_2(BIF_ALIST_2)
{
return binary_to_atom(BIF_P, BIF_ARG_1, BIF_ARG_2, 1);
}
+
+/**********************************************************
+ * Simpler non-interruptible routines for UTF-8 and
+ * Windows-style UTF-16 (restricted to code points <= 0xFFFF)
+ **********************************************************/
+/*
+ * Calculate how many bytes are needed to store ioterm in the given
+ * native filename encoding (one of the ERL_FILENAME_* constants).
+ *
+ * ioterm may be:
+ *   - an atom: its name bytes are treated as Latin-1 characters,
+ *   - the empty list: needs 0 bytes,
+ *   - a (possibly deep) proper list of small integers (characters).
+ *
+ * Returns the number of bytes needed, or -1 if the term cannot be
+ * represented: not an atom/list, an improper list, a list element that
+ * is neither a small integer, [] nor a list, a character outside the
+ * range of the encoding, or an unknown encoding.
+ *
+ * simple_put_chars() relies on this function having accepted the term,
+ * so every value that cannot be encoded must be rejected here.
+ */
+static Sint simple_char_need(Eterm ioterm, int encoding)
+{
+    Eterm *objp;
+    Eterm obj;
+    DECLARE_ESTACK(stack);
+    Sint need = 0;
+
+    if (is_atom(ioterm)) {
+        Atom* ap;
+        int i;
+        ap = atom_tab(atom_val(ioterm));
+        switch (encoding) {
+        case ERL_FILENAME_LATIN1:
+            need = ap->len;
+            break;
+        case ERL_FILENAME_UTF8:
+            /* Characters >= 0x80 take two bytes in UTF-8 */
+            for (i = 0; i < ap->len; i++) {
+                need += (ap->name[i] >= 0x80) ? 2 : 1;
+            }
+            break;
+        case ERL_FILENAME_WIN_WCHAR:
+            /* Two bytes per character */
+            need = 2*(ap->len);
+            break;
+        default:
+            need = -1;
+        }
+        DESTROY_ESTACK(stack);
+        return need;
+    }
+
+    if (is_nil(ioterm)) {
+        DESTROY_ESTACK(stack);
+        return need;
+    }
+    if (!is_list(ioterm)) {
+        DESTROY_ESTACK(stack);
+        return (Sint) -1;
+    }
+    /* OK a list, needs to be processed in order, handling each flat list-level
+       as they occur, just like io_list_to_binary would */
+    ESTACK_PUSH(stack,ioterm);
+    while (!ESTACK_ISEMPTY(stack)) {
+        ioterm = ESTACK_POP(stack);
+        if (is_nil(ioterm)) {
+            /* ignore empty lists */
+            continue;
+        }
+        if(is_list(ioterm)) {
+L_Again: /* Restart with sublist, old listend was pushed on stack */
+            objp = list_val(ioterm);
+            obj = CAR(objp);
+            for(;;) { /* loop over one flat list of characters
+                         until sublist or list end is encountered */
+                if (is_small(obj)) { /* Always small */
+                    for(;;) {
+                        /* A negative small shows up here as a very large
+                           unsigned value and is rejected by the range
+                           checks below */
+                        Uint x = unsigned_val(obj);
+                        switch (encoding) {
+                        case ERL_FILENAME_LATIN1:
+                            if (x > 255) {
+                                /* Not representable in one byte; would be
+                                   silently truncated by simple_put_chars */
+                                DESTROY_ESTACK(stack);
+                                return ((Sint) -1);
+                            }
+                            need += 1;
+                            break;
+                        case ERL_FILENAME_UTF8:
+                            if (x < 0x80) {
+                                need +=1;
+                            } else if (x < 0x800) {
+                                need += 2;
+                            } else if (x < 0x10000) {
+                                if ((x >= 0xD800 && x <= 0xDFFF) ||
+                                    (x == 0xFFFE) ||
+                                    (x == 0xFFFF)) { /* Invalid unicode range */
+                                    DESTROY_ESTACK(stack);
+                                    return ((Sint) -1);
+                                }
+                                need += 3;
+                            } else if (x < 0x110000) {
+                                need += 4;
+                            } else {
+                                DESTROY_ESTACK(stack);
+                                return ((Sint) -1);
+                            }
+                            break;
+                        case ERL_FILENAME_WIN_WCHAR:
+                            if (x <= 0xffff) {
+                                need += 2;
+                                break;
+                            } /* else fall through to error */
+                        default:
+                            DESTROY_ESTACK(stack);
+                            return ((Sint) -1);
+                        }
+
+                        /* everything else will give badarg later
+                           in the process, so we don't check */
+                        ioterm = CDR(objp);
+                        if (!is_list(ioterm)) {
+                            break;
+                        }
+                        objp = list_val(ioterm);
+                        obj = CAR(objp);
+                        if (!is_small(obj))
+                            break;
+                    }
+                } else if (is_nil(obj)) {
+                    /* An empty list as element contributes nothing */
+                    ioterm = CDR(objp);
+                    if (!is_list(ioterm)) {
+                        break;
+                    }
+                    objp = list_val(ioterm);
+                    obj = CAR(objp);
+                } else if (is_list(obj)) {
+                    /* push rest of list for later processing, start
+                       again with sublist */
+                    ESTACK_PUSH(stack,CDR(objp));
+                    ioterm = obj;
+                    goto L_Again;
+                } else {
+                    /* Neither character, [] nor list (e.g. a binary) */
+                    DESTROY_ESTACK(stack);
+                    return ((Sint) -1);
+                }
+                if (is_nil(ioterm) || !is_list(ioterm)) {
+                    break;
+                }
+            } /* for(;;) */
+        } /* is_list(ioterm) */
+
+        if (!is_list(ioterm) && !is_nil(ioterm)) {
+            /* improper list end */
+            DESTROY_ESTACK(stack);
+            return ((Sint) -1);
+        }
+    } /* while not estack empty */
+    DESTROY_ESTACK(stack);
+    return need;
+}
+
+/*
+ * Write ioterm, converted to the given native filename encoding (one of
+ * the ERL_FILENAME_* constants), into the buffer p.
+ *
+ * ioterm is an atom, the empty list or a (possibly deep) list of small
+ * integers. No bounds or validity checks are done here apart from
+ * ASSERTs; the caller is expected to have validated the term and sized
+ * the buffer with simple_char_need() (as file_name2native_1 does).
+ *
+ * Encodings:
+ *   ERL_FILENAME_LATIN1    - one byte per character
+ *   ERL_FILENAME_UTF8      - 1 to 4 bytes per character
+ *   ERL_FILENAME_WIN_WCHAR - two bytes per character, low byte first
+ */
+static void simple_put_chars(Eterm ioterm, int encoding, byte *p)
+{
+    Eterm *objp;
+    Eterm obj;
+    DECLARE_ESTACK(stack);
+
+    if (is_atom(ioterm)) {
+        /* Atom name bytes are handled as characters in the range 0..255 */
+        Atom* ap;
+        int i;
+        ap = atom_tab(atom_val(ioterm));
+        switch (encoding) {
+        case ERL_FILENAME_LATIN1:
+            for (i = 0; i < ap->len; i++) {
+                *p++ = ap->name[i];
+            }
+            break;
+        case ERL_FILENAME_UTF8:
+            for (i = 0; i < ap->len; i++) {
+                if(ap->name[i] < 0x80) {
+                    *p++ = ap->name[i];
+                } else {
+                    /* Two byte UTF-8 sequence: 110xxxxx 10xxxxxx */
+                    *p++ = (((ap->name[i]) >> 6) | ((byte) 0xC0));
+                    *p++ = (((ap->name[i]) & 0x3F) | ((byte) 0x80));
+                }
+            }
+            break;
+        case ERL_FILENAME_WIN_WCHAR:
+            for (i = 0; i < ap->len; i++) {
+                /* Little endian */
+                *p++ = ap->name[i];
+                *p++ = 0;
+            }
+            break;
+        default:
+            ASSERT(0);
+        }
+        DESTROY_ESTACK(stack);
+        return;
+    }
+
+    if (is_nil(ioterm)) {
+        /* Empty list, nothing to write */
+        DESTROY_ESTACK(stack);
+        return;
+    }
+    ASSERT(is_list(ioterm));
+    /* OK a list, needs to be processed in order, handling each flat list-level
+       as they occur, just like io_list_to_binary would */
+    ESTACK_PUSH(stack,ioterm);
+    while (!ESTACK_ISEMPTY(stack)) {
+        ioterm = ESTACK_POP(stack);
+        if (is_nil(ioterm)) {
+            /* ignore empty lists */
+            continue;
+        }
+        if(is_list(ioterm)) {
+L_Again: /* Restart with sublist, old listend was pushed on stack */
+            objp = list_val(ioterm);
+            obj = CAR(objp);
+            for(;;) { /* loop over one flat list of characters
+                         until sublist or list end is encountered */
+                if (is_small(obj)) { /* Always small */
+                    for(;;) {
+                        Uint x = unsigned_val(obj);
+                        switch (encoding) {
+                        case ERL_FILENAME_LATIN1:
+                            ASSERT( x < 256);
+                            *p++ = (byte) x;
+                            break;
+                        case ERL_FILENAME_UTF8:
+                            if (x < 0x80) {
+                                /* One byte: 0xxxxxxx */
+                                *p++ = (byte) x;
+                            }
+                            else if (x < 0x800) {
+                                /* Two bytes: 110xxxxx 10xxxxxx */
+                                *p++ = (((byte) (x >> 6)) |
+                                        ((byte) 0xC0));
+                                *p++ = (((byte) (x & 0x3F)) |
+                                        ((byte) 0x80));
+                            } else if (x < 0x10000) {
+                                /* Three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
+                                ASSERT(!((x >= 0xD800 && x <= 0xDFFF) ||
+                                         (x == 0xFFFE) ||
+                                         (x == 0xFFFF)));
+                                *p++ = (((byte) (x >> 12)) |
+                                        ((byte) 0xE0));
+                                *p++ = ((((byte) (x >> 6)) & 0x3F) |
+                                        ((byte) 0x80));
+                                *p++ = (((byte) (x & 0x3F)) |
+                                        ((byte) 0x80));
+                            } else {
+                                /* Four bytes:
+                                   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+                                ASSERT(x < 0x110000);
+                                *p++ = (((byte) (x >> 18)) |
+                                        ((byte) 0xF0));
+                                *p++ = ((((byte) (x >> 12)) & 0x3F) |
+                                        ((byte) 0x80));
+                                *p++ = ((((byte) (x >> 6)) & 0x3F) |
+                                        ((byte) 0x80));
+                                *p++ = (((byte) (x & 0x3F)) |
+                                        ((byte) 0x80));
+                            }
+                            break;
+                        case ERL_FILENAME_WIN_WCHAR:
+                            /* Low byte first, then high byte */
+                            ASSERT(x <= 0xFFFF);
+                            *p++ = (byte) (x & 0xFFU);
+                            *p++ = (byte) ((x >> 8) & 0xFFU);
+                            break;
+                        default:
+                            ASSERT(0);
+                        }
+
+                        /* everything else will give badarg later
+                           in the process, so we don't check */
+                        ioterm = CDR(objp);
+                        if (!is_list(ioterm)) {
+                            break;
+                        }
+                        objp = list_val(ioterm);
+                        obj = CAR(objp);
+                        if (!is_small(obj))
+                            break;
+                    }
+                } else if (is_nil(obj)) {
+                    /* An empty list as element produces no output */
+                    ioterm = CDR(objp);
+                    if (!is_list(ioterm)) {
+                        break;
+                    }
+                    objp = list_val(ioterm);
+                    obj = CAR(objp);
+                } else if (is_list(obj)) {
+                    /* push rest of list for later processing, start
+                       again with sublist */
+                    ESTACK_PUSH(stack,CDR(objp));
+                    ioterm = obj;
+                    goto L_Again;
+                } else {
+                    /* Any other element type should have been rejected
+                       before this function was called */
+                    ASSERT(0);
+                }
+                if (is_nil(ioterm) || !is_list(ioterm)) {
+                    break;
+                }
+            } /* for(;;) */
+        } /* is_list(ioterm) */
+
+        ASSERT(is_list(ioterm) || is_nil(ioterm));
+    } /* while not estack empty */
+    DESTROY_ESTACK(stack);
+    return;
+}
+
+
+
+/*
+ * file:name2native/1
+ *
+ * Converts a filename (atom or possibly deep list of characters) to a
+ * binary in the native filename encoding. Fails with badarg if the
+ * argument cannot be represented in that encoding.
+ */
+BIF_RETTYPE file_name2native_1(BIF_ALIST_1)
+{
+    const int native_enc = erts_get_native_filename_encoding();
+    /* First pass: validate the term and find out the binary size */
+    Sint nbytes = simple_char_need(BIF_ARG_1, native_enc);
+    Eterm result;
+
+    if (nbytes < 0) {
+        BIF_ERROR(BIF_P,BADARG);
+    }
+    /* Second pass: fill the freshly allocated binary */
+    result = new_binary(BIF_P, 0, nbytes);
+    simple_put_chars(BIF_ARG_1, native_enc, binary_bytes(result));
+    BIF_RET(result);
+}
+
+/*
+ * file:native2name/1
+ *
+ * Converts a binary holding a filename in the native filename encoding
+ * to a list of characters.
+ *
+ * - Fails with badarg if the argument is not a binary or has a bit size
+ *   that is not a multiple of 8.
+ * - An empty binary gives [].
+ * - ERL_FILENAME_UTF8: the binary is decoded as UTF-8; if it is not
+ *   valid UTF-8 it is returned as a plain list of bytes instead.
+ * - ERL_FILENAME_WIN_WCHAR: each pair of bytes (low byte first) becomes
+ *   one character; a binary of odd size is returned as a list of bytes.
+ * - ERL_FILENAME_LATIN1 and unknown encodings: list of bytes.
+ */
+BIF_RETTYPE file_native2name_1(BIF_ALIST_1)
+{
+    Eterm real_bin;
+    Uint offset;
+    Uint size,num_chars;
+    Uint bitsize;
+    Uint bitoffs;
+    Eterm *hp;
+    byte *temp_alloc = NULL;
+    byte *bytes;
+    byte *err_pos;
+    Uint num_built; /* characters */
+    Uint num_eaten; /* bytes */
+    Eterm ret;
+
+    if (is_not_binary(BIF_ARG_1)) {
+        BIF_ERROR(BIF_P,BADARG);
+    }
+    size = binary_size(BIF_ARG_1);
+    ERTS_GET_REAL_BIN(BIF_ARG_1, real_bin, offset, bitoffs, bitsize);
+    if (bitsize != 0) {
+        BIF_ERROR(BIF_P,BADARG);
+    }
+    if (size == 0) {
+        BIF_RET(NIL);
+    }
+    switch (erts_get_native_filename_encoding()) {
+    case ERL_FILENAME_LATIN1:
+        goto simple;
+    case ERL_FILENAME_UTF8:
+        bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
+        if (analyze_utf8(bytes,size,&err_pos,&num_chars,NULL) != UTF8_OK) {
+            /* Not valid UTF-8, hand the raw bytes back instead */
+            erts_free_aligned_binary_bytes(temp_alloc);
+            goto simple;
+        }
+        num_built = 0;
+        num_eaten = 0;
+        ret = do_utf8_to_list(BIF_P, num_chars, bytes, size, num_chars, &num_built, &num_eaten, NIL);
+        erts_free_aligned_binary_bytes(temp_alloc);
+        BIF_RET(ret);
+    case ERL_FILENAME_WIN_WCHAR:
+        if ((size % 2) != 0) {
+            /* Not a whole number of two byte characters */
+            goto simple;
+        }
+        bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc);
+        /* size/2 characters, two heap words per cons cell */
+        hp = HAlloc(BIF_P, size);
+        ret = NIL;
+        /* Build the list from the end of the binary towards the start */
+        bytes += size-1;
+        while (size > 0) {
+            Uint x = ((Uint) *bytes--) << 8; /* high byte is stored last */
+            x |= ((Uint) *bytes--);
+            ret = CONS(hp,make_small(x),ret);
+            hp += 2; /* step past the cons cell just written */
+            size -= 2;
+        }
+        erts_free_aligned_binary_bytes(temp_alloc);
+        BIF_RET(ret);
+    default:
+        goto simple;
+    }
+ simple:
+    /* One list element (two heap words) per byte */
+    hp = HAlloc(BIF_P, 2 * size);
+    bytes = binary_bytes(real_bin)+offset;
+
+    BIF_RET(erts_bin_bytes_to_list(NIL, hp, bytes, size, bitoffs));
+}
+
+/*
+ * file:native_name_encoding/0
+ *
+ * Returns the native filename encoding as an atom: latin1, utf8 or
+ * win_wchar, or undefined if the encoding is none of those.
+ */
+BIF_RETTYPE file_native_name_encoding_0(BIF_ALIST_0)
+{
+    const int enc = erts_get_native_filename_encoding();
+    Eterm enc_atom;
+
+    if (enc == ERL_FILENAME_LATIN1) {
+        enc_atom = am_latin1;
+    } else if (enc == ERL_FILENAME_UTF8) {
+        enc_atom = am_utf8;
+    } else if (enc == ERL_FILENAME_WIN_WCHAR) {
+        enc_atom = am_win_wchar;
+    } else {
+        enc_atom = am_undefined;
+    }
+    BIF_RET(enc_atom);
+}
diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h
index 0d15272aa8..d14e0ac105 100644
--- a/erts/emulator/beam/sys.h
+++ b/erts/emulator/beam/sys.h
@@ -1253,6 +1253,18 @@ char* win32_errorstr(int);
#endif
+/************************************************************************
+ * Find out the native filename encoding of the process (look at the
+ * locale of Unix processes and just do UTF-16 on Windows)
+ ************************************************************************/
+#define ERL_FILENAME_UNKNOWN 0
+#define ERL_FILENAME_LATIN1 1
+#define ERL_FILENAME_UTF8 2
+#define ERL_FILENAME_WIN_WCHAR 3
+
+int erts_get_native_filename_encoding(void);
+
+void erts_init_sys_common_misc(void);
#endif
diff --git a/erts/emulator/sys/common/erl_sys_common_misc.c b/erts/emulator/sys/common/erl_sys_common_misc.c
new file mode 100644
index 0000000000..dbb59676c8
--- /dev/null
+++ b/erts/emulator/sys/common/erl_sys_common_misc.c
@@ -0,0 +1,78 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2006-2010. All Rights Reserved.
+ *
+ * The contents of this file are subject to the Erlang Public License,
+ * Version 1.1, (the "License"); you may not use this file except in
+ * compliance with the License. You should have received a copy of the
+ * Erlang Public License along with this software. If not, it can be
+ * retrieved online at http://www.erlang.org/.
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+/*
+ * Description: Detection of the native filename encoding
+ *
+ * Author: Rickard Green
+ */
+
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include "sys.h"
+#include "global.h"
+
+#if !defined(__WIN32__)
+#include <locale.h>
+#if !defined(HAVE_SETLOCALE) || !defined(HAVE_NL_LANGINFO) || !defined(HAVE_LANGINFO_H)
+#define PRIMITIVE_UTF8_CHECK 1
+#else
+#include <langinfo.h>
+#endif
+#endif
+
+/* Native filename encoding; assigned only by erts_init_sys_common_misc() */
+
+static int filename_encoding = ERL_FILENAME_UNKNOWN;
+
+/*
+ * Determine the native filename encoding and store it in
+ * filename_encoding.
+ *
+ * On Windows the encoding is always ERL_FILENAME_WIN_WCHAR. Elsewhere it
+ * defaults to ERL_FILENAME_LATIN1 and is changed to ERL_FILENAME_UTF8 if
+ * the locale indicates UTF-8: either nl_langinfo(CODESET) is "UTF-8",
+ * or (when PRIMITIVE_UTF8_CHECK is defined) the first non-empty of the
+ * environment variables LC_ALL, LC_CTYPE and LANG contains "UTF-8".
+ */
+void erts_init_sys_common_misc(void)
+{
+#if defined(__WIN32__)
+    filename_encoding = ERL_FILENAME_WIN_WCHAR;
+#else
+    char *l;
+    filename_encoding = ERL_FILENAME_LATIN1;
+# ifdef PRIMITIVE_UTF8_CHECK
+    /* Set international environment, ignore result */
+    setlocale(LC_CTYPE, "");
+    /* Pick the first locale variable that is set and non-empty */
+    l = getenv("LC_ALL");
+    if (l == NULL || *l == '\0') {
+        l = getenv("LC_CTYPE");
+    }
+    if (l == NULL || *l == '\0') {
+        l = getenv("LANG");
+    }
+    if (l != NULL && *l != '\0' && strstr(l, "UTF-8") != NULL) {
+        filename_encoding = ERL_FILENAME_UTF8;
+    }
+
+# else
+    /* Set international environment */
+    l = setlocale(LC_CTYPE, "");
+    if (l != NULL && strcmp(nl_langinfo(CODESET), "UTF-8") == 0) {
+        filename_encoding = ERL_FILENAME_UTF8;
+    }
+# endif
+#endif
+}
+
+/*
+ * Return the native filename encoding (one of the ERL_FILENAME_*
+ * constants) as stored in filename_encoding. The value is
+ * ERL_FILENAME_UNKNOWN until erts_init_sys_common_misc() has run.
+ */
+int erts_get_native_filename_encoding(void)
+{
+    return filename_encoding;
+}