From 26b59dfe67ef551cd94765557cdd8c79794bcc38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Tue, 31 May 2016 14:28:54 +0200 Subject: Add new AtU8 beam chunk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new chunk stores atoms encoded in UTF-8. beam_lib has also been modified to handle the new 'utf8_atoms' attribute while the 'atoms' attribute may be a missing chunk from now on. The binary_to_atom/2 BIF can now encode any utf8 binary with up to 255 characters. The list_to_atom/1 BIF can now accept codepoints higher than 255 with up to 255 characters (thanks to Björn Gustavsson). --- lib/compiler/src/beam_asm.erl | 29 +++++++++++++++-------- lib/compiler/src/beam_dict.erl | 12 +++++----- lib/compiler/src/compile.erl | 21 +++++++++++++---- lib/compiler/test/compile_SUITE.erl | 31 ++++++++++++++++++++----- lib/compiler/test/compile_SUITE_data/simple.erl | 5 +++- lib/compiler/test/lc_SUITE.erl | 2 +- 6 files changed, 71 insertions(+), 29 deletions(-) (limited to 'lib/compiler') diff --git a/lib/compiler/src/beam_asm.erl b/lib/compiler/src/beam_asm.erl index a2f5dc674c..2fc2850591 100644 --- a/lib/compiler/src/beam_asm.erl +++ b/lib/compiler/src/beam_asm.erl @@ -21,7 +21,7 @@ -module(beam_asm). --export([module/4]). +-export([module/5]). -export([encode/2]). -export_type([fail/0,label/0,reg/0,src/0,module_code/0,function_name/0]). @@ -57,20 +57,20 @@ -type module_code() :: {module(),[_],[_],[asm_function()],pos_integer()}. --spec module(module_code(), exports(), [_], [compile:option()]) -> +-spec module(module_code(), exports(), [_], [compile:option()], [compile:option()]) -> {'ok',binary()}. -module(Code, Abst, SourceFile, Opts) -> - {ok,assemble(Code, Abst, SourceFile, Opts)}. +module(Code, Abst, SourceFile, Opts, CompilerOpts) -> + {ok,assemble(Code, Abst, SourceFile, Opts, CompilerOpts)}. -assemble({Mod,Exp0,Attr0,Asm0,NumLabels}, Abst, SourceFile, Opts) -> +assemble({Mod,Exp0,Attr0,Asm0,NumLabels}, Abst, SourceFile, Opts, CompilerOpts) -> {1,Dict0} = beam_dict:atom(Mod, beam_dict:new()), {0,Dict1} = beam_dict:fname(atom_to_list(Mod) ++ ".erl", Dict0), NumFuncs = length(Asm0), {Asm,Attr} = on_load(Asm0, Attr0), Exp = cerl_sets:from_list(Exp0), {Code,Dict2} = assemble_1(Asm, Exp, Dict1, []), - build_file(Code, Attr, Dict2, NumLabels, NumFuncs, Abst, SourceFile, Opts). + build_file(Code, Attr, Dict2, NumLabels, NumFuncs, Abst, SourceFile, Opts, CompilerOpts). on_load(Fs0, Attr0) -> case proplists:get_value(on_load, Attr0) of @@ -113,7 +113,7 @@ assemble_function([H|T], Acc, Dict0) -> assemble_function([], Code, Dict) -> {Code, Dict}. -build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> +build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts, CompilerOpts) -> %% Create the code chunk. CodeChunk = chunk(<<"Code">>, @@ -125,9 +125,9 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> Code), %% Create the atom table chunk. - - {NumAtoms, AtomTab} = beam_dict:atom_table(Dict), - AtomChunk = chunk(<<"Atom">>, <>, AtomTab), + AtomEncoding = atom_encoding(CompilerOpts), + {NumAtoms, AtomTab} = beam_dict:atom_table(Dict, AtomEncoding), + AtomChunk = chunk(atom_chunk_name(AtomEncoding), <>, AtomTab), %% Create the import table chunk. @@ -203,6 +203,15 @@ build_file(Code, Attr, Dict, NumLabels, NumFuncs, Abst, SourceFile, Opts) -> end, build_form(<<"BEAM">>, Chunks). +atom_encoding(Opts) -> + case proplists:get_bool(no_utf8_atoms, Opts) of + false -> utf8; + true -> latin1 + end. + +atom_chunk_name(utf8) -> <<"AtU8">>; +atom_chunk_name(latin1) -> <<"Atom">>. + %% finalize_fun_table(Essentials, MD5) -> FinalizedEssentials %% Update the 'old_uniq' field in the entry for each fun in the %% 'FunT' chunk. We'll use part of the MD5 for the module as a diff --git a/lib/compiler/src/beam_dict.erl b/lib/compiler/src/beam_dict.erl index 719d799fd7..990e86062a 100644 --- a/lib/compiler/src/beam_dict.erl +++ b/lib/compiler/src/beam_dict.erl @@ -24,7 +24,7 @@ -export([new/0,opcode/2,highest_opcode/1, atom/2,local/4,export/4,import/4, string/2,lambda/3,literal/2,line/2,fname/2, - atom_table/1,local_table/1,export_table/1,import_table/1, + atom_table/2,local_table/1,export_table/1,import_table/1, string_table/1,lambda_table/1,literal_table/1, line_table/1]). @@ -197,15 +197,15 @@ fname(Name, #asm{fnames=Fnames}=Dict) -> end. %% Returns the atom table. -%% atom_table(Dict) -> {LastIndex,[Length,AtomString...]} --spec atom_table(bdict()) -> {non_neg_integer(), [[non_neg_integer(),...]]}. +%% atom_table(Dict, Encoding) -> {LastIndex,[Length,AtomString...]} +-spec atom_table(bdict(), latin1 | utf8) -> {non_neg_integer(), [[non_neg_integer(),...]]}. -atom_table(#asm{atoms=Atoms}) -> +atom_table(#asm{atoms=Atoms}, Encoding) -> NumAtoms = maps:size(Atoms), Sorted = lists:keysort(2, maps:to_list(Atoms)), {NumAtoms,[begin - L = atom_to_list(A), - [length(L)|L] + L = atom_to_binary(A, Encoding), + [byte_size(L),L] end || {A,_} <- Sorted]}. %% Returns the table of local functions. diff --git a/lib/compiler/src/compile.erl b/lib/compiler/src/compile.erl index 069add7890..dcd962df66 100644 --- a/lib/compiler/src/compile.erl +++ b/lib/compiler/src/compile.erl @@ -214,11 +214,21 @@ expand_opt(report, Os) -> expand_opt(return, Os) -> [return_errors,return_warnings|Os]; expand_opt(r12, Os) -> - [no_recv_opt,no_line_info|Os]; + [no_recv_opt,no_line_info,no_utf8_atoms|Os]; expand_opt(r13, Os) -> - [no_recv_opt,no_line_info|Os]; + [no_recv_opt,no_line_info,no_utf8_atoms|Os]; expand_opt(r14, Os) -> - [no_line_info|Os]; + [no_line_info,no_utf8_atoms|Os]; +expand_opt(r15, Os) -> + [no_utf8_atoms|Os]; +expand_opt(r16, Os) -> + [no_utf8_atoms|Os]; +expand_opt(r17, Os) -> + [no_utf8_atoms|Os]; +expand_opt(r18, Os) -> + [no_utf8_atoms|Os]; +expand_opt(r19, Os) -> + [no_utf8_atoms|Os]; expand_opt({debug_info_key,_}=O, Os) -> [encrypt_debug_info,O|Os]; expand_opt(no_float_opt, Os) -> @@ -1376,13 +1386,14 @@ encrypt({des3_cbc=Type,Key,IVec,BlockSize}, Bin0) -> save_core_code(Code, St) -> {ok,Code,St#compile{core_code=cerl:from_records(Code)}}. -beam_asm(Code0, #compile{ifile=File,abstract_code=Abst,mod_options=Opts0}=St) -> +beam_asm(Code0, #compile{ifile=File,abstract_code=Abst, + options=CompilerOpts,mod_options=Opts0}=St) -> Source = paranoid_absname(File), Opts1 = lists:map(fun({debug_info_key,_}) -> {debug_info_key,'********'}; (Other) -> Other end, Opts0), Opts2 = [O || O <- Opts1, effects_code_generation(O)], - case beam_asm:module(Code0, Abst, Source, Opts2) of + case beam_asm:module(Code0, Abst, Source, Opts2, CompilerOpts) of {ok,Code} -> {ok,Code,St#compile{abstract_code=[]}} end. diff --git a/lib/compiler/test/compile_SUITE.erl b/lib/compiler/test/compile_SUITE.erl index 8c09414a52..8d7facd727 100644 --- a/lib/compiler/test/compile_SUITE.erl +++ b/lib/compiler/test/compile_SUITE.erl @@ -30,7 +30,7 @@ file_1/1, forms_2/1, module_mismatch/1, big_file/1, outdir/1, binary/1, makedep/1, cond_and_ifdef/1, listings/1, listings_big/1, other_output/1, kernel_listing/1, encrypted_abstr/1, - strict_record/1, + strict_record/1, utf8_atoms/1, cover/1, env/1, core/1, core_roundtrip/1, asm/1, optimized_guards/1, sys_pre_attributes/1, dialyzer/1, @@ -48,7 +48,7 @@ all() -> [app_test, appup_test, file_1, forms_2, module_mismatch, big_file, outdir, binary, makedep, cond_and_ifdef, listings, listings_big, other_output, kernel_listing, encrypted_abstr, - strict_record, + strict_record, utf8_atoms, cover, env, core, core_roundtrip, asm, optimized_guards, sys_pre_attributes, dialyzer, warnings, pre_load_check, env_compiler_options]. @@ -450,8 +450,10 @@ do_kernel_listing({M,A}) -> try {ok,M,Kern} = compile:forms(A, [to_kernel]), IoList = v3_kernel_pp:format(Kern), - _ = iolist_size(IoList), - ok + case unicode:characters_to_binary(IoList) of + Bin when is_binary(Bin) -> + ok + end catch throw:{error,Error} -> io:format("*** compilation failure '~p' for module ~s\n", @@ -680,6 +682,23 @@ test_sloppy() -> {1,2} = record_access:test(Turtle), Turtle. +utf8_atoms(Config) when is_list(Config) -> + Anno = erl_anno:new(1), + Atom = binary_to_atom(<<"こんにちは"/utf8>>, utf8), + Forms = [{attribute,Anno,compile,[export_all]}, + {function,Anno,atom,0,[{clause,Anno,[],[],[{atom,Anno,Atom}]}]}], + + Utf8AtomForms = [{attribute,Anno,module,utf8_atom}|Forms], + {ok,utf8_atom,Utf8AtomBin} = + compile:forms(Utf8AtomForms, [binary]), + {ok,{utf8_atom,[{atoms,_}]}} = + beam_lib:chunks(Utf8AtomBin, [atoms]), + code:load_binary(utf8_atom, "compile_SUITE", Utf8AtomBin), + Atom = utf8_atom:atom(), + + NoUtf8AtomForms = [{attribute,Anno,module,no_utf8_atom}|Forms], + error = compile:forms(NoUtf8AtomForms, [binary, r19]). + env(Config) when is_list(Config) -> {Simple,Target} = get_files(Config, simple, env), {ok,Cwd} = file:get_cwd(), @@ -751,7 +770,7 @@ do_core_1(M, A, Outdir) -> {ok,M,Core0} = compile:forms(A, [to_core]), CoreFile = filename:join(Outdir, atom_to_list(M)++".core"), CorePP = core_pp:format(Core0), - ok = file:write_file(CoreFile, CorePP), + ok = file:write_file(CoreFile, unicode:characters_to_binary(CorePP)), %% Parse the .core file and return the result as Core Erlang Terms. Core = case compile:file(CoreFile, [report_errors,from_core,no_copt,to_core,binary]) of @@ -823,7 +842,7 @@ do_core_roundtrip_1(Mod, Abstr, Outdir) -> do_core_roundtrip_2(M, Core0, Outdir) -> CoreFile = filename:join(Outdir, atom_to_list(M)++".core"), CorePP = core_pp:format_all(Core0), - ok = file:write_file(CoreFile, CorePP), + ok = file:write_file(CoreFile, unicode:characters_to_binary(CorePP)), %% Parse the .core file and return the result as Core Erlang Terms. Core2 = case compile:file(CoreFile, [report_errors,from_core, diff --git a/lib/compiler/test/compile_SUITE_data/simple.erl b/lib/compiler/test/compile_SUITE_data/simple.erl index d8324dafaf..9385d101e0 100644 --- a/lib/compiler/test/compile_SUITE_data/simple.erl +++ b/lib/compiler/test/compile_SUITE_data/simple.erl @@ -19,7 +19,7 @@ %% -module(simple). --export([test/0]). +-export([test/0,unicode/0]). -ifdef(need_foo). -export([foo/0]). @@ -28,6 +28,9 @@ test() -> passed. +unicode() -> + {"это",'спутник'}. + %% Conditional inclusion. %% Compile with [{d, need_foo}, {d, foo_value, 42}]. diff --git a/lib/compiler/test/lc_SUITE.erl b/lib/compiler/test/lc_SUITE.erl index 3cb49433ce..6904ee7f52 100644 --- a/lib/compiler/test/lc_SUITE.erl +++ b/lib/compiler/test/lc_SUITE.erl @@ -226,7 +226,7 @@ effect(Config) when is_list(Config) -> lc_SUITE -> _ = [{'EXIT',{badarg,_}} = (catch binary_to_atom(<>, utf8)) || - C <- lists:seq(16#10000, 16#FFFFF)]; + C <- lists:seq(16#FF10000, 16#FFFFFFF)]; _ -> ok end, -- cgit v1.2.3