Add new AtU8 beam chunk

The new chunk stores atoms encoded in UTF-8. beam_lib has also been modified to handle the new 'utf8_atoms' attribute, while the chunk behind the 'atoms' attribute may be missing from now on. The binary_to_atom/2 BIF can now encode any UTF-8 binary of up to 255 characters. The list_to_atom/1 BIF now accepts code points higher than 255, in lists of up to 255 characters (thanks to Björn Gustavsson).
author: José Valim <[email protected]> 2016-05-31 14:28:54 +0200
committer: José Valim <[email protected]> 2017-01-30 15:24:05 +0100
commit: 26b59dfe67ef551cd94765557cdd8c79794bcc38 (patch)
tree: 696adc07b3e7a4a3f1ed6c52311ff6e163b218b4 /lib/stdlib
parent: 6c7539b0e39996f870385e5276e08c0dd98b6eb8 (diff)
download: otp-26b59dfe67ef551cd94765557cdd8c79794bcc38.tar.gz
otp-26b59dfe67ef551cd94765557cdd8c79794bcc38.tar.bz2
otp-26b59dfe67ef551cd94765557cdd8c79794bcc38.zip
3 files changed, 73 insertions, 41 deletions
diff --git a/lib/stdlib/src/beam_lib.erl b/lib/stdlib/src/beam_lib.erl
index d7ee5c1f5d..461acf03be 100644
--- a/lib/stdlib/src/beam_lib.erl
+++ b/lib/stdlib/src/beam_lib.erl
@@ -63,7 +63,7 @@
 -type label()     :: integer().
 
 -type chunkid()   :: nonempty_string(). % approximation of the strings below
-%% "Abst" | "Attr" | "CInf" | "ExpT" | "ImpT" | "LocT" | "Atom".
+%% "Abst" | "Attr" | "CInf" | "ExpT" | "ImpT" | "LocT" | "Atom" | "AtU8".
 -type chunkname() :: 'abstract_code' | 'attributes' | 'compile_info'
                    | 'exports' | 'labeled_exports'
                    | 'imports' | 'indexed_imports'
@@ -520,6 +520,8 @@ read_chunk_data(File0, ChunkNames0, Options)
     end.
 
 %% -> {ok, list()} | throw(Error)
+check_chunks([atoms | Ids], File, IL, L) ->
+    check_chunks(Ids, File, ["Atom", "AtU8" | IL], [{atom_chunk, atoms} | L]);
 check_chunks([ChunkName | Ids], File, IL, L) when is_atom(ChunkName) ->
     ChunkId = chunk_name_to_id(ChunkName, File),
     check_chunks(Ids, File, [ChunkId | IL], [{ChunkId, ChunkName} | L]);
@@ -537,6 +539,10 @@ scan_beam(File, What0, AllowMissingChunks) ->
     case scan_beam1(File, What0) of
 	{missing, _FD, Mod, Data, What} when AllowMissingChunks ->
 	    {ok, Mod, [{Id, missing_chunk} || Id <- What] ++ Data};
+	{missing, _FD, Mod, Data, ["Atom"]} ->
+	    {ok, Mod, Data};
+	{missing, _FD, Mod, Data, ["AtU8"]} ->
+	    {ok, Mod, Data};
 	{missing, FD, _Mod, _Data, What} ->
 	    error({missing_chunk, filename(FD), hd(What)});
 	R ->
@@ -581,18 +587,23 @@ scan_beam(FD, Pos, What, Mod, Data) ->
 	    error({invalid_beam_file, filename(FD), Pos})
     end.
 
-get_data(Cs, "Atom"=Id, FD, Size, Pos, Pos2, _Mod, Data) ->
+get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, Encoding) ->
     NewCs = del_chunk(Id, Cs),
     {NFD, Chunk} = get_chunk(Id, Pos, Size, FD),
     <<_Num:32, Chunk2/binary>> = Chunk,
-    {Module, _} = extract_atom(Chunk2),
+    {Module, _} = extract_atom(Chunk2, Encoding),
     C = case Cs of
 	    info -> 
 		{Id, Pos, Size};
 	    _ -> 
 		{Id, Chunk}
 	end,
-    scan_beam(NFD, Pos2, NewCs, Module, [C | Data]);
+    scan_beam(NFD, Pos2, NewCs, Module, [C | Data]).
+
+get_data(Cs, "Atom" = Id, FD, Size, Pos, Pos2, _Mod, Data) ->
+    get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, latin1);
+get_data(Cs, "AtU8" = Id, FD, Size, Pos, Pos2, _Mod, Data) ->
+    get_atom_data(Cs, Id, FD, Size, Pos, Pos2, Data, utf8);
 get_data(info, Id, FD, Size, Pos, Pos2, Mod, Data) ->
     scan_beam(FD, Pos2, info, Mod, [{Id, Pos, Size} | Data]);
 get_data(Chunks, Id, FD, Size, Pos, Pos2, Mod, Data) ->
@@ -624,6 +635,9 @@ get_chunk(Id, Pos, Size, FD) ->
 	    {NFD, Chunk}
     end.
 
+chunks_to_data([{atom_chunk, Name} | CNs], Chunks, File, Cs, Module, Atoms, L) ->
+    {NewAtoms, Ret} = chunk_to_data(Name, <<"">>, File, Cs, Atoms, Module),
+    chunks_to_data(CNs, Chunks, File, Cs, Module, NewAtoms, [Ret | L]);
 chunks_to_data([{Id, Name} | CNs], Chunks, File, Cs, Module, Atoms, L) ->
     {_Id, Chunk} = lists:keyfind(Id, 1, Chunks),
     {NewAtoms, Ret} = chunk_to_data(Name, Chunk, File, Cs, Atoms, Module),
@@ -651,7 +665,7 @@ chunk_to_data(abstract_code=Id, Chunk, File, _Cs, AtomTable, Mod) ->
 	<<>> ->
 	    {AtomTable, {Id, no_abstract_code}};
 	<<0:8,N:8,Mode0:N/binary,Rest/binary>> ->
-	    Mode = list_to_atom(binary_to_list(Mode0)),
+	    Mode = binary_to_atom(Mode0, utf8),
 	    decrypt_abst(Mode, Mod, File, Id, AtomTable, Rest);
 	_ ->
 	    case catch binary_to_term(Chunk) of
@@ -683,7 +697,6 @@ chunk_to_data(ChunkId, Chunk, _File,
 	      _Cs, AtomTable, _Module) when is_list(ChunkId) ->
     {AtomTable, {ChunkId, Chunk}}. % Chunk is a binary
 
-chunk_name_to_id(atoms, _)           -> "Atom";
 chunk_name_to_id(indexed_imports, _) -> "ImpT";
 chunk_name_to_id(imports, _)         -> "ImpT";
 chunk_name_to_id(exports, _)         -> "ExpT";
@@ -738,25 +751,30 @@ atm(AT, N) ->
 
 %% AT is updated.
 ensure_atoms({empty, AT}, Cs) ->
-    {_Id, AtomChunk} = lists:keyfind("Atom", 1, Cs),
-    extract_atoms(AtomChunk, AT),
+    case lists:keyfind("AtU8", 1, Cs) of
+	{_Id, AtomChunk} when is_binary(AtomChunk) ->
+	    extract_atoms(AtomChunk, AT, utf8);
+	_ ->
+	    {_Id, AtomChunk} = lists:keyfind("Atom", 1, Cs),
+	    extract_atoms(AtomChunk, AT, latin1)
+    end,
     AT;
 ensure_atoms(AT, _Cs) ->
     AT.
 
-extract_atoms(<<_Num:32, B/binary>>, AT) ->
-    extract_atoms(B, 1, AT).
+extract_atoms(<<_Num:32, B/binary>>, AT, Encoding) ->
+    extract_atoms(B, 1, AT, Encoding).
 
-extract_atoms(<<>>, _I, _AT) ->
+extract_atoms(<<>>, _I, _AT, _Encoding) ->
     true;
-extract_atoms(B, I, AT) ->
-    {Atom, B1} = extract_atom(B),
+extract_atoms(B, I, AT, Encoding) ->
+    {Atom, B1} = extract_atom(B, Encoding),
     true = ets:insert(AT, {I, Atom}),
-    extract_atoms(B1, I+1, AT).
+    extract_atoms(B1, I+1, AT, Encoding).
 
-extract_atom(<<Len, B/binary>>) ->
+extract_atom(<<Len, B/binary>>, Encoding) ->
     <<SB:Len/binary, Tail/binary>> = B,
-    {list_to_atom(binary_to_list(SB)), Tail}.
+    {binary_to_atom(SB, Encoding), Tail}.
 
 %%% Utils.
 
@@ -856,12 +874,12 @@ significant_chunks() ->
 %% for a module. They are listed in the order that they should be MD5:ed.
 
 md5_chunks() ->
-    ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"].
+    ["Atom", "AtU8", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"].
 
 %% The following chunks are mandatory in every Beam file.
 
 mandatory_chunks() ->
-    ["Code", "ExpT", "ImpT", "StrT", "Atom"].
+    ["Code", "ExpT", "ImpT", "StrT"].
 
 %%% ====================================================================
 %%% The rest of the file handles encrypted debug info.
diff --git a/lib/stdlib/test/beam_lib_SUITE.erl b/lib/stdlib/test/beam_lib_SUITE.erl
index 4521ecc0ef..279e15f703 100644
--- a/lib/stdlib/test/beam_lib_SUITE.erl
+++ b/lib/stdlib/test/beam_lib_SUITE.erl
@@ -81,12 +81,8 @@ normal(Conf) when is_list(Conf) ->
     NoOfTables = length(ets:all()),
     P0 = pps(),
 
-    CompileFlags = [{outdir,PrivDir}, debug_info],
-    {ok,_} = compile:file(Source, CompileFlags),
-    {ok, Binary} = file:read_file(BeamFile),
-
-    do_normal(BeamFile),
-    do_normal(Binary),
+    do_normal(Source, PrivDir, BeamFile, []),
+    do_normal(Source, PrivDir, BeamFile, [no_utf8_atoms]),
 
     {ok,_} = compile:file(Source, [{outdir,PrivDir}, no_debug_info]),
     {ok, {simple, [{abstract_code, no_abstract_code}]}} =
@@ -101,7 +97,15 @@ normal(Conf) when is_list(Conf) ->
     true = (P0 == pps()),
     ok.
 
-do_normal(BeamFile) ->
+do_normal(Source, PrivDir, BeamFile, Opts) ->
+    CompileFlags = [{outdir,PrivDir}, debug_info | Opts],
+    {ok,_} = compile:file(Source, CompileFlags),
+    {ok, Binary} = file:read_file(BeamFile),
+
+    do_normal(BeamFile, Opts),
+    do_normal(Binary, Opts).
+
+do_normal(BeamFile, Opts) ->
     Imports = {imports, [{erlang, get_module_info, 1},
 			 {erlang, get_module_info, 2},
 			 {lists, member, 2}]},
@@ -130,20 +134,31 @@ do_normal(BeamFile) ->
 	beam_lib:chunks(BeamFile, [abstract_code]),
 
     %% Test reading optional chunks.
-    All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT"],
+    All = ["Atom", "Code", "StrT", "ImpT", "ExpT", "FunT", "LitT", "AtU8"],
     {ok,{simple,Chunks}} = beam_lib:chunks(BeamFile, All, [allow_missing_chunks]),
-    verify_simple(Chunks).
+    case {verify_simple(Chunks),Opts} of
+	{{missing_chunk, AtomBin}, []} when is_binary(AtomBin) -> ok;
+	{{AtomBin, missing_chunk}, [no_utf8_atoms]} when is_binary(AtomBin) -> ok
+    end,
 
-verify_simple([{"Atom", AtomBin},
+    %% Make sure that reading the atom chunk works when the 'allow_missing_chunks'
+    %% option is used.
+    Some = ["Code",atoms,"ExpT","LitT"],
+    {ok,{simple,SomeChunks}} = beam_lib:chunks(BeamFile, Some, [allow_missing_chunks]),
+    [{"Code",<<_/binary>>},{atoms,[_|_]},{"ExpT",<<_/binary>>},{"LitT",missing_chunk}] =
+	SomeChunks.
+
+verify_simple([{"Atom", PlainAtomChunk},
 	       {"Code", CodeBin},
 	       {"StrT", StrBin},
 	       {"ImpT", ImpBin},
 	       {"ExpT", ExpBin},
 	       {"FunT", missing_chunk},
-	       {"LitT", missing_chunk}])
-  when is_binary(AtomBin), is_binary(CodeBin), is_binary(StrBin),
+	       {"LitT", missing_chunk},
+	       {"AtU8", AtU8Chunk}])
+  when is_binary(CodeBin), is_binary(StrBin),
        is_binary(ImpBin), is_binary(ExpBin) ->
-    ok.
+    {PlainAtomChunk, AtU8Chunk}.
 
 %% Read invalid beam files.
 error(Conf) when is_list(Conf) ->
@@ -211,7 +226,7 @@ last_chunk(Bin) ->
 do_error(BeamFile, ACopy) ->
     %% evil tests
     Chunks = chunk_info(BeamFile),
-    {value, {_, AtomStart, _}} = lists:keysearch("Atom", 1, Chunks),
+    {value, {_, AtomStart, _}} = lists:keysearch("AtU8", 1, Chunks),
     {value, {_, ImportStart, _}} = lists:keysearch("ImpT", 1, Chunks),
     {value, {_, AbstractStart, _}} = lists:keysearch("Abst", 1, Chunks),
     {value, {_, AttributesStart, _}} =
@@ -234,7 +249,7 @@ do_error(BeamFile, ACopy) ->
     verify(not_a_beam_file, beam_lib:info(BF7)),
 
     BF8 = set_byte(ACopy, BeamFile, 13, 17),
-    verify(missing_chunk, beam_lib:chunks(BF8, ["Atom"])),
+    verify(missing_chunk, beam_lib:chunks(BF8, ["AtU8"])),
 
     BF9 = set_byte(ACopy, BeamFile, CompileInfoStart+10, 17),
     verify(invalid_chunk, beam_lib:chunks(BF9, [compile_info])).
diff --git a/lib/stdlib/test/erl_scan_SUITE.erl b/lib/stdlib/test/erl_scan_SUITE.erl
index 4ae734eb65..7d0ba967f9 100644
--- a/lib/stdlib/test/erl_scan_SUITE.erl
+++ b/lib/stdlib/test/erl_scan_SUITE.erl
@@ -772,10 +772,9 @@ unicode() ->
         erl_scan:string([1089]),
     {error,{{1,1},erl_scan,{illegal,character}},{1,2}} =
         erl_scan:string([1089], {1,1}),
-    {error,{1,erl_scan,{illegal,atom}},1} =
-        erl_scan:string("'a"++[1089]++"b'", 1),
-    {error,{{1,1},erl_scan,{illegal,atom}},{1,6}} =
-        erl_scan:string("'a"++[1089]++"b'", {1,1}),
+    {error,{{1,3},erl_scan,{illegal,character}},{1,4}} =
+        erl_scan:string("'a" ++ [999999999] ++ "c'", {1,1}),
+
     test("\"a"++[1089]++"b\""),
     {ok,[{char,1,1}],1} =
         erl_scan_string([$$,$\\,$^,1089], 1),
@@ -786,8 +785,8 @@ unicode() ->
         erl_scan:format_error(Error),
     {error,{{1,1},erl_scan,_},{1,11}} =
         erl_scan:string("\"qa\\x{aaa}",{1,1}),
-    {error,{{1,1},erl_scan,{illegal,atom}},{1,12}} =
-        erl_scan:string("'qa\\x{aaa}'",{1,1}),
+    {error,{{1,1},erl_scan,_},{1,11}} =
+        erl_scan:string("'qa\\x{aaa}",{1,1}),
 
     {ok,[{char,1,1089}],1} =
         erl_scan_string([$$,1089], 1),
@@ -904,9 +903,9 @@ more_chars() ->
 %% OTP-10302. Unicode characters scanner/parser.
 otp_10302(Config) when is_list(Config) ->
     %% From unicode():
-    {error,{1,erl_scan,{illegal,atom}},1} =
+    {ok,[{atom,1,'aсb'}],1} =
         erl_scan:string("'a"++[1089]++"b'", 1),
-    {error,{{1,1},erl_scan,{illegal,atom}},{1,12}} =
+    {ok,[{atom,{1,1},'qaપ'}],{1,12}} =
         erl_scan:string("'qa\\x{aaa}'",{1,1}),
 
     {ok,[{char,1,1089}],1} = erl_scan_string([$$,1089], 1),
author	José Valim <[email protected]>	2016-05-31 14:28:54 +0200
committer	José Valim <[email protected]>	2017-01-30 15:24:05 +0100
commit	26b59dfe67ef551cd94765557cdd8c79794bcc38 (patch)
tree	696adc07b3e7a4a3f1ed6c52311ff6e163b218b4 /lib/stdlib
parent	6c7539b0e39996f870385e5276e08c0dd98b6eb8 (diff)
download	otp-26b59dfe67ef551cd94765557cdd8c79794bcc38.tar.gz otp-26b59dfe67ef551cd94765557cdd8c79794bcc38.tar.bz2 otp-26b59dfe67ef551cd94765557cdd8c79794bcc38.zip