Make get/putenv and erlexec understand Unicode

Putenv and getenv needs to convert to the proper environment strings in Unicode depending on platform and user settings for filename encoding. Also erlexec needs to pass environment strings in an appropriate way for kernel to pick up. All environment strings on the command line, as well as home directory, is now passed in UTF8 on windows and in whatever encoding you have on Unix, kernel tries to convert all parameters and environments from UTF8 before making strings.
author: Patrik Nyblom <[email protected]> 2012-07-26 18:35:46 +0200
committer: Patrik Nyblom <[email protected]> 2012-08-14 15:04:06 +0200
commit: e11bbb37273ebd2408d0c83c62770f9ef023879d (patch)
tree: 58ca0f2f6381e171c7c99ba3022a25c137f7b689 /erts/etc/common/erlexec.c
parent: 0c9d90f314f364e5b1301ec89d762baabc57c7aa (diff)
download: otp-e11bbb37273ebd2408d0c83c62770f9ef023879d.tar.gz
otp-e11bbb37273ebd2408d0c83c62770f9ef023879d.tar.bz2
otp-e11bbb37273ebd2408d0c83c62770f9ef023879d.zip
1 files changed, 164 insertions, 7 deletions
diff --git a/erts/etc/common/erlexec.c b/erts/etc/common/erlexec.c
index 790b0ed400..cba7429fab 100644
--- a/erts/etc/common/erlexec.c
+++ b/erts/etc/common/erlexec.c
@@ -245,7 +245,9 @@ static char* config_script = NULL; /* used by option -start_erl and -config */
 
 static HANDLE this_module_handle;
 static int run_werl;
-
+static WCHAR *utf8_to_utf16(unsigned char *bytes);
+static char *utf16_to_utf8(WCHAR *wstr);
+static WCHAR *latin1_to_utf16(char *str);
 #endif
 
 /*
@@ -263,8 +265,12 @@ static void
 set_env(char *key, char *value)
 {
 #ifdef __WIN32__
-    if (!SetEnvironmentVariable((LPCTSTR) key, (LPCTSTR) value))
+    WCHAR *wkey = latin1_to_utf16(key);
+    WCHAR *wvalue = utf8_to_utf16(value);
+    if (!SetEnvironmentVariableW(wkey, wvalue))
 	error("SetEnvironmentVariable(\"%s\", \"%s\") failed!", key, value);
+    efree(wkey);
+    efree(wvalue);
 #else
     size_t size = strlen(key) + 1 + strlen(value) + 1;
     char *str = emalloc(size);
@@ -277,25 +283,33 @@ set_env(char *key, char *value)
 #endif
 }
 
+
 static char *
 get_env(char *key)
 {
 #ifdef __WIN32__
     DWORD size = 32;
-    char *value = NULL;
+    WCHAR *value = NULL;
+    WCHAR *wkey = latin1_to_utf16(key);
+    char *res;
     while (1) {
 	DWORD nsz;
 	if (value)
 	    efree(value);
-	value = emalloc(size);
+	value = emalloc(size*sizeof(WCHAR));
 	SetLastError(0);
-	nsz = GetEnvironmentVariable((LPCTSTR) key, (LPTSTR) value, size);
+	nsz = GetEnvironmentVariableW(wkey, value, size);
 	if (nsz == 0 && GetLastError() == ERROR_ENVVAR_NOT_FOUND) {
 	    efree(value);
+	    efree(wkey);
 	    return NULL;
 	}
-	if (nsz <= size)
-	    return value;
+	if (nsz <= size) {
+	    efree(wkey);
+	    res = utf16_to_utf8(value);
+	    efree(value);
+	    return res;
+	}
 	size = nsz;
     }
 #else
@@ -2080,4 +2094,147 @@ possibly_quote(char* arg)
     return narg;
 }
 
+/*
+ * Unicode helpers to handle environment and command line parameters on
+ * Windows. We internally handle all environment variables in UTF8,
+ * but put and get the environment using the WCHAR (limited UTF16) interface
+ * 
+ * These are simplified to only handle Unicode characters that can fit in 
+ * Windows simplified UTF16, i.e. characters that fit in 16 bits.
+ */
+
+static int utf8_len(unsigned char first) 
+{
+    if ((first & ((unsigned char) 0x80)) == 0) {
+	return 1;
+    } else if ((first & ((unsigned char) 0xE0)) == 0xC0) {
+	return 2;
+    } else if ((first & ((unsigned char) 0xF0)) == 0xE0) {
+	return 3;
+    } else if ((first & ((unsigned char) 0xF8)) == 0xF0) {
+	return 4;
+    } 
+    return 1; /* will be a '?' */
+}
+
+static WCHAR *utf8_to_utf16(unsigned char *bytes)
+{
+    unsigned int unipoint;
+    unsigned char *tmp = bytes;
+    WCHAR *target, *res;
+    int num = 0;
+    
+    while (*tmp) {
+	num++;
+	tmp += utf8_len(*tmp);
+    }
+    res = target = emalloc((num + 1) * sizeof(WCHAR));
+    while (*bytes) {
+	if (((*bytes) & ((unsigned char) 0x80)) == 0) {
+	    unipoint = (Uint) *bytes;
+	    ++bytes;
+	} else if (((*bytes) & ((unsigned char) 0xE0)) == 0xC0) {
+	    unipoint = 
+		(((Uint) ((*bytes) & ((unsigned char) 0x1F))) << 6) |
+		((Uint) (bytes[1] & ((unsigned char) 0x3F))); 	
+	    bytes += 2;
+	} else if (((*bytes) & ((unsigned char) 0xF0)) == 0xE0) {
+	    unipoint = 
+		(((Uint) ((*bytes) & ((unsigned char) 0xF))) << 12) |
+		(((Uint) (bytes[1] & ((unsigned char) 0x3F))) << 6) |
+		((Uint) (bytes[2] & ((unsigned char) 0x3F)));
+	    if (unipoint > 0xFFFF) {
+		 unipoint = (unsigned int) '?';
+	    }
+	    bytes +=3;
+	} else if (((*bytes) & ((unsigned char) 0xF8)) == 0xF0) {
+	    unipoint = (unsigned int) '?'; /* Cannot put in a wchar */
+	    bytes += 4;
+	} else {
+	    unipoint = (unsigned int) '?';
+	}
+	*target++ = (WCHAR) unipoint;
+    }
+    *target = L'\0';
+    return res;
+}
+
+static int put_utf8(WCHAR ch, unsigned char *target, int sz, int *pos)
+{
+    Uint x = (Uint) ch;
+    if (x < 0x80) {
+    if (*pos >= sz) {
+	return -1;
+    }
+	target[(*pos)++] = (unsigned char) x;
+    }
+    else if (x < 0x800) {
+	if (((*pos) + 1) >= sz) {
+	    return -1;
+	}
+	target[(*pos)++] = (((unsigned char) (x >> 6)) | 
+			    ((unsigned char) 0xC0));
+	target[(*pos)++] = (((unsigned char) (x & 0x3F)) | 
+			    ((unsigned char) 0x80));
+    } else {
+	if ((x >= 0xD800 && x <= 0xDFFF) ||
+	    (x == 0xFFFE) ||
+	    (x == 0xFFFF)) { /* Invalid unicode range */
+	    return -1;
+	}
+	if (((*pos) + 2) >= sz) {
+	    return -1;
+	}
+
+	target[(*pos)++] = (((unsigned char) (x >> 12)) | 
+			    ((unsigned char) 0xE0));
+	target[(*pos)++] = ((((unsigned char) (x >> 6)) & 0x3F)  | 
+			    ((unsigned char) 0x80));
+	target[(*pos)++] = (((unsigned char) (x & 0x3F)) | 
+			    ((unsigned char) 0x80));
+    }
+    return 0;
+}
+
+static int need_bytes_for_utf8(WCHAR x)
+{
+    if (x < 0x80)
+	return 1;
+    else if (x < 0x800)
+	return 2;
+    else 
+	return 3;
+}
+
+static WCHAR *latin1_to_utf16(char *str)
+{
+    int len = strlen(str);
+    int i;
+    WCHAR *wstr = emalloc((len+1) * sizeof(WCHAR));
+    for(i=0;i<len;++i)
+	wstr[i] = (WCHAR) str[i];
+    wstr[len] = L'\0';
+    return wstr;
+}
+
+static char *utf16_to_utf8(WCHAR *wstr) 
+{
+    int len = wcslen(wstr);
+    char *result;
+    int i,pos;
+    int reslen = 0;
+    for(i=0;i<len;++i) {
+	reslen += need_bytes_for_utf8(wstr[i]);
+    }
+    result = emalloc(reslen+1);
+    pos = 0;
+    for(i=0;i<len;++i) {
+	if (put_utf8((int) wstr[i], result, reslen, &pos) < 0) {
+	    break;
+	}
+    }
+    result[pos] = '\0';
+    return result;
+}
+    
 #endif
author	Patrik Nyblom <[email protected]>	2012-07-26 18:35:46 +0200
committer	Patrik Nyblom <[email protected]>	2012-08-14 15:04:06 +0200
commit	e11bbb37273ebd2408d0c83c62770f9ef023879d (patch)
tree	58ca0f2f6381e171c7c99ba3022a25c137f7b689 /erts/etc/common/erlexec.c
parent	0c9d90f314f364e5b1301ec89d762baabc57c7aa (diff)
download	otp-e11bbb37273ebd2408d0c83c62770f9ef023879d.tar.gz otp-e11bbb37273ebd2408d0c83c62770f9ef023879d.tar.bz2 otp-e11bbb37273ebd2408d0c83c62770f9ef023879d.zip