From 61138cb31e1f6ad44d3ca54e668de9d2d4adb2ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Wed, 9 Apr 2014 12:42:32 +0200 Subject: Support path names with characters outside the US ASCII range --- lib/stdlib/doc/src/erl_tar.xml | 14 +++++++++ lib/stdlib/src/erl_tar.erl | 24 ++++++++++++++-- lib/stdlib/test/tar_SUITE.erl | 65 ++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 99 insertions(+), 4 deletions(-) diff --git a/lib/stdlib/doc/src/erl_tar.xml b/lib/stdlib/doc/src/erl_tar.xml index f81e36f810..afa4196be1 100644 --- a/lib/stdlib/doc/src/erl_tar.xml +++ b/lib/stdlib/doc/src/erl_tar.xml @@ -64,6 +64,20 @@ format_error/1 function.

+
+ UNICODE SUPPORT +

If file:native_name_encoding/0 + returns utf8, path names will be encoded in UTF-8 when + creating tar files and path names will be assumed to be encoded in + UTF-8 when extracting tar files.

+ +

If file:native_name_encoding/0 + returns latin1, no translation of path names will be + done.

+
+
LIMITATIONS

For maximum compatibility, it is safe to archive files with names diff --git a/lib/stdlib/src/erl_tar.erl b/lib/stdlib/src/erl_tar.erl index 40b48d7999..89b4ea6c04 100644 --- a/lib/stdlib/src/erl_tar.erl +++ b/lib/stdlib/src/erl_tar.erl @@ -381,7 +381,12 @@ to_octal(Int, Count, Result) -> to_octal(Int div 8, Count-1, [Int rem 8 + $0|Result]). to_string(Str0, Count) -> - Str = list_to_binary(Str0), + Str = case file:native_name_encoding() of + utf8 -> + unicode:characters_to_binary(Str0); + latin1 -> + list_to_binary(Str0) + end, case byte_size(Str) of Size when Size < Count -> [Str|zeroes(Count-Size)]; @@ -608,7 +613,22 @@ typeflag(Bin) -> %% Get the name of the file from the prefix and name fields of the %% tar header. -get_name(Bin) -> +get_name(Bin0) -> + List0 = get_name_raw(Bin0), + case file:native_name_encoding() of + utf8 -> + Bin = list_to_binary(List0), + case unicode:characters_to_list(Bin) of + {error,_,_} -> + List0; + List when is_list(List) -> + List + end; + latin1 -> + List0 + end. + +get_name_raw(Bin) -> Name = from_string(Bin, ?th_name, ?th_name_len), case binary_to_list(Bin, ?th_prefix+1, ?th_prefix+1) of [0] -> diff --git a/lib/stdlib/test/tar_SUITE.erl b/lib/stdlib/test/tar_SUITE.erl index 5bc34e35af..297f5b161a 100644 --- a/lib/stdlib/test/tar_SUITE.erl +++ b/lib/stdlib/test/tar_SUITE.erl @@ -23,7 +23,7 @@ create_long_names/1, bad_tar/1, errors/1, extract_from_binary/1, extract_from_binary_compressed/1, extract_from_open_file/1, symlinks/1, open_add_close/1, cooked_compressed/1, - memory/1]). + memory/1,unicode/1]). -include_lib("test_server/include/test_server.hrl"). -include_lib("kernel/include/file.hrl"). @@ -34,7 +34,7 @@ all() -> [borderline, atomic, long_names, create_long_names, bad_tar, errors, extract_from_binary, extract_from_binary_compressed, extract_from_open_file, - symlinks, open_add_close, cooked_compressed, memory]. + symlinks, open_add_close, cooked_compressed, memory, unicode]. groups() -> []. 
@@ -726,6 +726,56 @@ memory(Config) when is_list(Config) -> ?line ok = delete_files([Name1,Name2]), ok. +%% Test filenames with characters outside the US ASCII range. +unicode(Config) when is_list(Config) -> + PrivDir = ?config(priv_dir, Config), + do_unicode(PrivDir), + case has_transparent_naming() of + true -> + Pa = filename:dirname(code:which(?MODULE)), + Node = start_node(unicode, "+fnl -pa "++Pa), + ok = rpc:call(Node, erlang, apply, + [fun() -> do_unicode(PrivDir) end,[]]), + true = test_server:stop_node(Node), + ok; + false -> + ok + end. + +has_transparent_naming() -> + case os:type() of + {unix,darwin} -> false; + {unix,_} -> true; + _ -> false + end. + +do_unicode(PrivDir) -> + ok = file:set_cwd(PrivDir), + ok = file:make_dir("unicöde"), + + Names = unicode_create_files(), + Tar = "unicöde.tar", + ok = erl_tar:create(Tar, ["unicöde"], []), + {ok,Names} = erl_tar:table(Tar, []), + _ = [ok = file:delete(Name) || Name <- Names], + ok = erl_tar:extract(Tar), + _ = [{ok,_} = file:read_file(Name) || Name <- Names], + _ = [ok = file:delete(Name) || Name <- Names], + ok = file:del_dir("unicöde"), + ok. + +unicode_create_files() -> + FileA = "unicöde/smörgåsbord", + ok = file:write_file(FileA, "yum!\n"), + [FileA|case file:native_name_encoding() of + utf8 -> + FileB = "unicöde/Хороший файл!", + ok = file:write_file(FileB, "But almost empty.\n"), + [FileB]; + latin1 -> + [] + end]. + %% Delete the given list of files. delete_files([]) -> ok; delete_files([Item|Rest]) -> @@ -791,3 +841,14 @@ make_temp_dir(Base, I) -> ok -> Name; {error,eexist} -> make_temp_dir(Base, I+1) end. + +start_node(Name, Args) -> + [_,Host] = string:tokens(atom_to_list(node()), "@"), + ct:log("Trying to start ~w@~s~n", [Name,Host]), + case test_server:start_node(Name, peer, [{args,Args}]) of + {error,Reason} -> + test_server:fail(Reason); + {ok,Node} -> + ct:log("Node ~p started~n", [Node]), + Node + end. 
-- cgit v1.2.3 From bb2b84c35e75ae80174ef54dad7babf6c6fa9075 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Wed, 9 Apr 2014 16:30:49 +0200 Subject: Correct end of tape marker The POSIX standard for tar says that there must be at least two 512-bytes zero blocks at the end of the tar archive file. Our implementation would only emit a single 512-byte zero block if the size of the last file was in the range 18*512 through 19*512-1 (modulo 20*512). GNU tar would correctly unpack such tar archive file, but would emit a warning: tar: A lone zero block at 20 --- lib/stdlib/src/erl_tar.erl | 14 +++++++++++--- lib/stdlib/test/tar_SUITE.erl | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/lib/stdlib/src/erl_tar.erl b/lib/stdlib/src/erl_tar.erl index 89b4ea6c04..acf7a5cd40 100644 --- a/lib/stdlib/src/erl_tar.erl +++ b/lib/stdlib/src/erl_tar.erl @@ -397,9 +397,17 @@ to_string(Str0, Count) -> pad_file(File) -> {ok,Position} = file:position(File, {cur,0}), - %% There must be at least one empty record at the end of the file. - Zeros = zeroes(?block_size - (Position rem ?block_size)), - file:write(File, Zeros). + %% There must be at least two zero records at the end. + Fill = case ?block_size - (Position rem ?block_size) of + Fill0 when Fill0 < 2*?record_size -> + %% We need to another block here to ensure that there + %% are at least two zero records at the end. + Fill0 + ?block_size; + Fill0 -> + %% Large enough. + Fill0 + end, + file:write(File, zeroes(Fill)). 
split_filename(Name) when length(Name) =< ?th_name_len -> {"", Name}; diff --git a/lib/stdlib/test/tar_SUITE.erl b/lib/stdlib/test/tar_SUITE.erl index 297f5b161a..6349139925 100644 --- a/lib/stdlib/test/tar_SUITE.erl +++ b/lib/stdlib/test/tar_SUITE.erl @@ -73,6 +73,7 @@ borderline(Config) when is_list(Config) -> ?line lists:foreach(fun(Size) -> borderline_test(Size, TempDir) end, [0, 1, 10, 13, 127, 333, Record-1, Record, Record+1, + Block-2*Record-1, Block-2*Record, Block-2*Record+1, Block-Record-1, Block-Record, Block-Record+1, Block-1, Block, Block+1, Block+Record-1, Block+Record, Block+Record+1]), -- cgit v1.2.3 From fa67c39896487a3f6f382d8a38c3663ca25646af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Thu, 10 Apr 2014 13:55:01 +0200 Subject: Update information about compatibility GNU tar now supports the 'ustar' format. --- lib/stdlib/doc/src/erl_tar.xml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/stdlib/doc/src/erl_tar.xml b/lib/stdlib/doc/src/erl_tar.xml index afa4196be1..ccf0f3901f 100644 --- a/lib/stdlib/doc/src/erl_tar.xml +++ b/lib/stdlib/doc/src/erl_tar.xml @@ -35,10 +35,11 @@ Unix 'tar' utility for reading and writing tar archives

The erl_tar module archives and extracts files to and from - a tar file. The tar file format is the POSIX extended tar file format - specified in IEEE Std 1003.1 and ISO/IEC 9945-1. That is the same - format as used by tar program on Solaris, but is not the same - as used by the GNU tar program.

+ a tar file. erl_tar supports the ustar format + (IEEE Std 1003.1 and ISO/IEC 9945-1). All modern tar + programs (including GNU tar) can read this format. To ensure that + GNU tar produces a tar file that erl_tar can read, + give the --format=ustar option to GNU tar.

By convention, the name of a tar file should end in ".tar". To abide by the convention, you'll need to add ".tar" yourself to the name.

-- cgit v1.2.3 From 92a5b09ed50c905a6480b74f10af770d98eea5bd Mon Sep 17 00:00:00 2001 From: Yuki Ito Date: Sat, 12 Apr 2014 20:46:26 +0900 Subject: Fix typo in erl_tar docs --- lib/stdlib/doc/src/erl_tar.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stdlib/doc/src/erl_tar.xml b/lib/stdlib/doc/src/erl_tar.xml index ccf0f3901f..6d5b07059a 100644 --- a/lib/stdlib/doc/src/erl_tar.xml +++ b/lib/stdlib/doc/src/erl_tar.xml @@ -128,7 +128,7 @@ TarDescriptor = term() FilenameOrBin = Filename()|binary() - Filename = filename()() + Filename = filename() NameInArchive = filename() Options = [Option] Option = dereference|verbose -- cgit v1.2.3 From d02469d12f103276345dc0a3b024f76f6d6763fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Thu, 17 Apr 2014 12:09:20 +0200 Subject: Correct typo in type specification Types start with a lower-case letter. --- lib/stdlib/doc/src/erl_tar.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/stdlib/doc/src/erl_tar.xml b/lib/stdlib/doc/src/erl_tar.xml index 6d5b07059a..7f25f5b7bc 100644 --- a/lib/stdlib/doc/src/erl_tar.xml +++ b/lib/stdlib/doc/src/erl_tar.xml @@ -127,7 +127,7 @@ Add a file to an open tar file TarDescriptor = term() - FilenameOrBin = Filename()|binary() + FilenameOrBin = filename()|binary() Filename = filename() NameInArchive = filename() Options = [Option] -- cgit v1.2.3