From e11bbb37273ebd2408d0c83c62770f9ef023879d Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Thu, 26 Jul 2012 18:35:46 +0200 Subject: Make get/putenv and erlexec understand Unicode Putenv and getenv needs to convert to the proper environment strings in Unicode depending on platform and user settings for filename encoding. Also erlexec needs to pass environment strings in an appropriate way for kernel to pick up. All environment strings on the command line, as well as home directory, is now passed in UTF8 on windows and in whatever encoding you have on Unix, kernel tries to convert all parameters and environments from UTF8 before making strings. --- lib/kernel/test/file_name_SUITE.erl | 52 +++++++++++++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 2 deletions(-) (limited to 'lib') diff --git a/lib/kernel/test/file_name_SUITE.erl b/lib/kernel/test/file_name_SUITE.erl index 53bcb1162d..be33ec2c06 100644 --- a/lib/kernel/test/file_name_SUITE.erl +++ b/lib/kernel/test/file_name_SUITE.erl @@ -74,7 +74,7 @@ init_per_suite/1,end_per_suite/1, init_per_group/2,end_per_group/2, init_per_testcase/2, end_per_testcase/2]). --export([normal/1,icky/1,very_icky/1,normalize/1]). +-export([normal/1,icky/1,very_icky/1,normalize/1,home_dir/1]). init_per_testcase(_Func, Config) -> @@ -88,7 +88,7 @@ end_per_testcase(_Func, Config) -> suite() -> [{ct_hooks,[ts_install_cth]}]. all() -> - [normal, icky, very_icky, normalize]. + [normal, icky, very_icky, normalize, home_dir]. groups() -> []. @@ -105,6 +105,54 @@ init_per_group(_GroupName, Config) -> end_per_group(_GroupName, Config) -> Config. +home_dir(suite) -> + []; +home_dir(doc) -> + ["Check that Erlang can be started with unicode named home directory"]; +home_dir(Config) when is_list(Config) -> + try + Name=[960,945,964,961,953,954], + Priv = ?config(priv_dir, Config), + UniMode = file:native_name_encoding() =/= latin1, + if + not UniMode -> + throw(need_unicode_mode); + true -> + ok + end, + NewHome=filename:join(Priv,Name), + file:make_dir(NewHome), + {SaveOldName,SaveOldValue} = case os:type() of + {win32,nt} -> + HomePath=re:replace(filename:nativename(NewHome),"^[a-zA-Z]:","",[{return,list},unicode]), + Save = os:getenv("HOMEPATH"), + os:putenv("HOMEPATH",HomePath), + {"HOMEPATH",Save}; + {unix,_} -> + Save = os:getenv("HOME"), + os:putenv("HOME",NewHome), + {"HOME",Save}; + _ -> + rm_rf(prim_file,NewHome), + throw(unsupported_os) + end, + try + {ok,Node} = test_server:start_node(test_unicode_homedir,slave,[{args,"-setcookie "++atom_to_list(erlang:get_cookie())}]), + test_server:stop_node(Node), + ok + after + os:putenv(SaveOldName,SaveOldValue), + rm_rf(prim_file,NewHome) + end + catch + throw:need_unicode_mode -> + io:format("Sorry, can only run in unicode mode.~n"), + {skipped,"VM needs to be started in Unicode filename mode"}; + throw:unsupported_os -> + io:format("Sorry, can only run on Unix/Windows.~n"), + {skipped,"Runs only on Unix/Windows"} + end. + normalize(suite) -> []; normalize(doc) -> -- cgit v1.2.3 From b6196f5d47c06e6991cfc7e76db3d588a7797302 Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Tue, 14 Aug 2012 15:02:43 +0200 Subject: Add documetation about Unicode in environment --- lib/kernel/doc/src/os.xml | 15 +++++++++++++++ lib/stdlib/doc/src/unicode_usage.xml | 8 +++++++- 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/kernel/doc/src/os.xml b/lib/kernel/doc/src/os.xml index e94119845a..09c525b376 100644 --- a/lib/kernel/doc/src/os.xml +++ b/lib/kernel/doc/src/os.xml @@ -80,6 +80,10 @@ DirOut = os:cmd("dir"), % on Win32 platform Each environment variable is given as a single string on the format "VarName=Value", where VarName is the name of the variable and Value its value.

+

If Unicode file name encoding is in effect (see the erl manual + page), the strings may contain characters with + codepoints > 255.

@@ -93,6 +97,10 @@ DirOut = os:cmd("dir"), % on Win32 platform

Returns the Value of the environment variable VarName, or false if the environment variable is undefined.

+

If Unicode file name encoding is in effect (see the erl manual + page), the strings (both VarName and + Value) may contain characters with codepoints > 255.

@@ -123,6 +131,13 @@ DirOut = os:cmd("dir"), % on Win32 platform

Sets a new Value for the environment variable VarName.

+

If Unicode filename encoding is in effect (see the erl manual + page), the strings (both VarName and + Value) may contain characters with codepoints > 255.

+

On Unix platforms, the environment will be set using UTF-8 encoding + if Unicode file name translation is in effect. On Windows the + environment is set using wide character interfaces.

diff --git a/lib/stdlib/doc/src/unicode_usage.xml b/lib/stdlib/doc/src/unicode_usage.xml index a7e010a05f..b7b5d497d0 100644 --- a/lib/stdlib/doc/src/unicode_usage.xml +++ b/lib/stdlib/doc/src/unicode_usage.xml @@ -53,7 +53,7 @@ Basically the same as UTF-32, but without some Unicode semantics, defined by IEEE and has little use as a separate encoding standard. For all normal (and possibly abnormal) usages, UTF-32 and UCS-4 are interchangeable.

Certain ranges of characters are left unused and certain ranges are even deemed invalid. The most notable invalid range is 16#D800 - 16#DFFF, as the UTF-16 encoding does not allow for encoding of these numbers. It can be speculated that the UTF-16 encoding standard was, from the beginning, expected to be able to hold all Unicode characters in one 16-bit entity, but then had to be extended, leaving a hole in the Unicode range to cope with backward compatibility.

-

Additionally, the codepoint 16#FEFF is used for byte order marks (BOM's) and use of that character is not encouraged in other contexts than that. It actually is valid though, as the character "ZWNBS" (Zero Width Non Breaking Space). BOM's are used to identify encodings and byte order for programs where such parameters are not known in advance. Byte order marks are more seldom used than one could expect, put their use is becoming more widely spread as they provide the means for programs to make educated guesses about the Unicode format of a certain file.

+

Additionally, the codepoint 16#FEFF is used for byte order marks (BOM's) and use of that character is not encouraged in other contexts than that. It actually is valid though, as the character "ZWNBS" (Zero Width Non Breaking Space). BOM's are used to identify encodings and byte order for programs where such parameters are not known in advance. Byte order marks are more seldom used than one could expect, but their use is becoming more widely spread as they provide the means for programs to make educated guesses about the Unicode format of a certain file.

Standard Unicode representation in Erlang @@ -210,6 +210,12 @@ Eshell V5.7 (abort with ^G)
+Unicode in environment variables and parameters +

Environment variables and their interpretation is handled much in the same way as file names. If Unicode file names are enabled, environment variables as well as parameters to the Erlang VM are expected to be in Unicode.

+

If Unicode file names are enabled, the calls to os:getenv/0, os:getenv/1 and os:putenv/2 will handle Unicode strings. On Unix-like platforms, the built-in functions will translate environment variables in UTF-8 to/from Unicode strings, possibly with codepoints > 255. On Windows the Unicode versions of the environment system API will be used, also allowing for codepoints > 255.

+

On Unix-like operating systems, parameters are expected to be UTF-8 without translation if Unicode file names are enabled.

+
+
Unicode-aware modules

Most of the modules in Erlang/OTP are of course Unicode-unaware in the sense that they have no notion of Unicode and really shouldn't have. Typically they handle non-textual or byte-oriented data (like gen_tcp etc).

Modules that actually handle textual data (like io_lib, string etc) are sometimes subject to conversion or extension to be able to handle Unicode characters.

-- cgit v1.2.3 From 0c387e88d5a6dc97466ec33088aa9798eeb1d16d Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Wed, 15 Aug 2012 17:25:26 +0200 Subject: Teach release_handler_SUITE about file:native_name_encoding/0 Release_handler_suite correctly checks that no calls to the file module is done during a diskless upgrade, but file:native_name_encoding is a BIF that does no real i/o and can therefore be called during command line argunemt parsing. The testcase is updated to ignore such calls. --- lib/sasl/test/installer.erl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'lib') diff --git a/lib/sasl/test/installer.erl b/lib/sasl/test/installer.erl index 6942ec21ea..634218e3fb 100644 --- a/lib/sasl/test/installer.erl +++ b/lib/sasl/test/installer.erl @@ -876,7 +876,9 @@ trace_disallowed_calls(Node) -> MasterProc = self(), rpc:call(Node,dbg,tracer,[process,{fun(T,_) -> MasterProc ! T end,[]}]), rpc:call(Node,dbg,p,[all,call]), - rpc:call(Node,dbg,tp,[file,[{'_',[],[{message,{caller}}]}]]). + rpc:call(Node,dbg,tp,[file,[{'_',[],[{message,{caller}}]}]]), + %% File:native_name_encoding/0 is a BIF and OK to use + rpc:call(Node,dbg,ctp,[file,native_name_encoding,0]). check_disallowed_calls(TestNode,Line) -> receive -- cgit v1.2.3