1 files changed, 127 insertions, 67 deletions
diff --git a/lib/compiler/src/beam_block.erl b/lib/compiler/src/beam_block.erl
index 6543e05e20..39ae8d5347 100644
--- a/lib/compiler/src/beam_block.erl
+++ b/lib/compiler/src/beam_block.erl
@@ -23,30 +23,37 @@
 -module(beam_block).
 
 -export([module/2]).
--import(lists, [reverse/1,reverse/2,foldl/3,member/2]).
+-import(lists, [reverse/1,reverse/2,member/2]).
 
 -spec module(beam_utils:module_code(), [compile:option()]) ->
                     {'ok',beam_utils:module_code()}.
 
-module({Mod,Exp,Attr,Fs0,Lc}, _Opt) ->
-    Fs = [function(F) || F <- Fs0],
+module({Mod,Exp,Attr,Fs0,Lc}, Opts) ->
+    Blockify = not member(no_blockify, Opts),
+    Fs = [function(F, Blockify) || F <- Fs0],
     {ok,{Mod,Exp,Attr,Fs,Lc}}.
 
-function({function,Name,Arity,CLabel,Is0}) ->
+function({function,Name,Arity,CLabel,Is0}, Blockify) ->
     try
 	%% Collect basic blocks and optimize them.
-	Is1 = blockify(Is0),
-	Is2 = embed_lines(Is1),
-	Is3 = move_allocates(Is2),
-	Is4 = beam_utils:live_opt(Is3),
-	Is5 = opt_blocks(Is4),
-	Is6 = beam_utils:delete_live_annos(Is5),
-
-	%% Done.
-	{function,Name,Arity,CLabel,Is6}
+        Is2 = case Blockify of
+                  true ->
+                      Is1 = blockify(Is0),
+                      embed_lines(Is1);
+                  false ->
+                      Is0
+              end,
+        Is3 = beam_utils:anno_defs(Is2),
+        Is4 = move_allocates(Is3),
+        Is5 = beam_utils:live_opt(Is4),
+        Is6 = opt_blocks(Is5),
+        Is7 = beam_utils:delete_annos(Is6),
+        Is = opt_allocs(Is7),
+
+        %% Done.
+        {function,Name,Arity,CLabel,Is}
     catch
-	Class:Error ->
-	    Stack = erlang:get_stacktrace(),
+        Class:Error:Stack ->
 	    io:fwrite("Function: ~w/~w\n", [Name,Arity]),
 	    erlang:raise(Class, Error, Stack)
     end.
@@ -136,17 +143,16 @@ embed_lines([], Acc) -> Acc.
 
 opt_blocks([{block,Bl0}|Is]) ->
     %% The live annotation at the beginning is not useful.
-    [{'%live',_,_}|Bl] = Bl0,
+    [{'%anno',_}|Bl] = Bl0,
     [{block,opt_block(Bl)}|opt_blocks(Is)];
 opt_blocks([I|Is]) ->
     [I|opt_blocks(Is)];
 opt_blocks([]) -> [].
 
 opt_block(Is0) ->
-    Is = find_fixpoint(fun(Is) ->
-			       opt_tuple_element(opt(Is))
-		       end, Is0),
-    opt_alloc(Is).
+    find_fixpoint(fun(Is) ->
+                          opt_tuple_element(opt(Is))
+                  end, Is0).
 
 find_fixpoint(OptFun, Is0) ->
     case OptFun(Is0) of
@@ -173,7 +179,7 @@ find_fixpoint(OptFun, Is0) ->
 %%  safe to assume that if x(N) is initialized, then all lower-numbered
 %%  x registers are also initialized.
 %%
-%%  For example, in general it is not safe to transform the following
+%%  For example, we must be careful when transforming the following
 %%  instructions:
 %%
 %%     get_tuple_element x(0) Element => x(1)
@@ -185,13 +191,9 @@ find_fixpoint(OptFun, Is0) ->
 %%     get_tuple_element x(0) Element => x(1)
 %%
 %%  The transformation is safe if and only if x(1) has been
-%%  initialized previously. Unfortunately, beam_reorder may have moved
-%%  a get_tuple_element instruction so that x(1) is not always
-%%  initialized when this code is reached. To find whether or not x(1)
-%%  is initialized, we would need to analyze all code preceding these
-%%  two instructions (across branches). Since we currently don't have
-%%  any practical mechanism for doing that, we will have to
-%%  conservatively assume that the transformation is unsafe.
+%%  initialized previously.  We will use the annotations added by
+%%  beam_utils:anno_defs/1 to determine whether x(1) has been
+%%  initialized.
 
 move_allocates([{block,Bl0}|Is]) ->
     Bl = move_allocates_1(reverse(Bl0), []),
@@ -200,15 +202,20 @@ move_allocates([I|Is]) ->
     [I|move_allocates(Is)];
 move_allocates([]) -> [].
 
+move_allocates_1([{'%anno',_}|Is], Acc) ->
+    move_allocates_1(Is, Acc);
 move_allocates_1([I|Is], [{set,[],[],{alloc,Live0,Info}}|Acc]=Acc0) ->
-    case {alloc_may_pass(I),alloc_live_regs(I, Live0)} of
-	{false,_} ->
-	    move_allocates_1(Is, [I|Acc0]);
-	{true,not_possible} ->
-	    move_allocates_1(Is, [I|Acc0]);
-	{true,Live} when is_integer(Live) ->
-	    A = {set,[],[],{alloc,Live,Info}},
-	    move_allocates_1(Is, [A,I|Acc])
+    case alloc_may_pass(I) of
+        false ->
+            move_allocates_1(Is, [I|Acc0]);
+        true ->
+            case alloc_live_regs(I, Is, Live0) of
+                not_possible ->
+                    move_allocates_1(Is, [I|Acc0]);
+                Live when is_integer(Live) ->
+                    A = {set,[],[],{alloc,Live,Info}},
+                    move_allocates_1(Is, [A,I|Acc])
+            end
     end;
 move_allocates_1([I|Is], Acc) ->
     move_allocates_1(Is, [I|Acc]);
@@ -219,21 +226,34 @@ alloc_may_pass({set,_,_,{set_tuple_element,_}}) -> false;
 alloc_may_pass({set,_,_,put_list}) -> false;
 alloc_may_pass({set,_,_,put}) -> false;
 alloc_may_pass({set,_,_,_}) -> true.
-    
+
 %% opt([Instruction]) -> [Instruction]
 %%  Optimize the instruction stream inside a basic block.
 
 opt([{set,[X],[X],move}|Is]) -> opt(Is);
+opt([{set,[X],_,move},{set,[X],_,move}=I|Is]) ->
+    opt([I|Is]);
+opt([{set,[{x,0}],[S1],move}=I1,{set,[D2],[{x,0}],move}|Is]) ->
+    opt([I1,{set,[D2],[S1],move}|Is]);
+opt([{set,[{x,0}],[S1],move}=I1,{set,[D2],[S2],move}|Is0]) when S1 =/= D2 ->
+    %% Place `move S x0` at the end of the move sequence so that the
+    %% loader can merge it with the following instruction.
+    {Ds,Is} = opt_moves([D2], Is0),
+    [{set,Ds,[S2],move}|opt([I1|Is])];
 opt([{set,_,_,{line,_}}=Line1,
      {set,[D1],[{integer,Idx1},Reg],{bif,element,{f,0}}}=I1,
      {set,_,_,{line,_}}=Line2,
      {set,[D2],[{integer,Idx2},Reg],{bif,element,{f,0}}}=I2|Is])
   when Idx1 < Idx2, D1 =/= D2, D1 =/= Reg, D2 =/= Reg ->
     opt([Line2,I2,Line1,I1|Is]);
+opt([{set,[D1],[{integer,Idx1},Reg],{bif,element,{f,L}}}=I1,
+     {set,[D2],[{integer,Idx2},Reg],{bif,element,{f,L}}}=I2|Is])
+  when Idx1 < Idx2, D1 =/= D2, D1 =/= Reg, D2 =/= Reg ->
+    opt([I2,I1|Is]);
 opt([{set,Ds0,Ss,Op}|Is0]) ->
     {Ds,Is} = opt_moves(Ds0, Is0),
     [{set,Ds,Ss,Op}|opt(Is)];
-opt([{'%live',_,_}=I|Is]) ->
+opt([{'%anno',_}=I|Is]) ->
     [I|opt(Is)];
 opt([]) -> [].
 
@@ -401,31 +421,47 @@ eliminate_use_of_from_reg([I]=Is, From, _To, Acc) ->
 	    no
     end.
 
+%% opt_allocs(Instructions) -> Instructions.  Optimize allocate
+%%  instructions inside blocks. If safe, replace an allocate_zero
+%%  instruction with the slightly cheaper allocate instruction.
+
+opt_allocs(Is) ->
+    D = beam_utils:index_labels(Is),
+    opt_allocs_1(Is, D).
+
+opt_allocs_1([{block,Bl0}|Is], D) ->
+    Bl = opt_alloc(Bl0, {D,Is}),
+    [{block,Bl}|opt_allocs_1(Is, D)];
+opt_allocs_1([I|Is], D) ->
+    [I|opt_allocs_1(Is, D)];
+opt_allocs_1([], _) -> [].
+
 %% opt_alloc(Instructions) -> Instructions'
 %%  Optimises all allocate instructions.
 
 opt_alloc([{set,[],[],{alloc,Live0,Info0}},
-	   {set,[],[],{alloc,Live,Info}}|Is]) ->
+           {set,[],[],{alloc,Live,Info}}|Is], D) ->
     Live = Live0,				%Assertion.
     Alloc = combine_alloc(Info0, Info),
     I = {set,[],[],{alloc,Live,Alloc}},
-    opt_alloc([I|Is]);
-opt_alloc([{set,[],[],{alloc,R,{_,Ns,Nh,[]}}}|Is]) ->
-    [{set,[],[],opt_alloc(Is, Ns, Nh, R)}|Is];
-opt_alloc([I|Is]) -> [I|opt_alloc(Is)];
-opt_alloc([]) -> [].
+    opt_alloc([I|Is], D);
+opt_alloc([{set,[],[],{alloc,R,{_,Ns,Nh,[]}}}|Is], D) ->
+    [{set,[],[],opt_alloc(Is, D, Ns, Nh, R)}|Is];
+opt_alloc([I|Is], D) -> [I|opt_alloc(Is, D)];
+opt_alloc([], _) -> [].
 
 combine_alloc({_,Ns,Nh1,Init}, {_,nostack,Nh2,[]})  ->
     {zero,Ns,beam_utils:combine_heap_needs(Nh1, Nh2),Init}.
-	
+
 %% opt_alloc(Instructions, FrameSize, HeapNeed, LivingRegs) -> [Instr]
 %%  Generates the optimal sequence of instructions for
 %%  allocating and initalizing the stack frame and needed heap.
 
-opt_alloc(_Is, nostack, Nh, LivingRegs) ->
+opt_alloc(_Is, _D, nostack, Nh, LivingRegs) ->
     {alloc,LivingRegs,{nozero,nostack,Nh,[]}};
-opt_alloc(Is, Ns, Nh, LivingRegs) ->
-    InitRegs = init_yreg(Is, 0),
+opt_alloc(Bl, {D,OuterIs}, Ns, Nh, LivingRegs) ->
+    Is = [{block,Bl}|OuterIs],
+    InitRegs = init_yregs(Ns, Is, D),
     case count_ones(InitRegs) of
 	N when N*2 > Ns ->
 	    {alloc,LivingRegs,{nozero,Ns,Nh,gen_init(Ns, InitRegs)}};
@@ -441,19 +477,14 @@ gen_init(Fs, Regs, Y, Acc) when Regs band 1 =:= 0 ->
 gen_init(Fs, Regs, Y, Acc) ->
     gen_init(Fs, Regs bsr 1, Y+1, Acc).
 
-%% init_yreg(Instructions, RegSet) -> RegSetInitialized
-%%  Calculate the set of initialized y registers.
-
-init_yreg([{set,_,_,{bif,_,_}}|_], Reg) -> Reg;
-init_yreg([{set,_,_,{alloc,_,{gc_bif,_,_}}}|_], Reg) -> Reg;
-init_yreg([{set,_,_,{alloc,_,{put_map,_,_}}}|_], Reg) -> Reg;
-init_yreg([{set,Ds,_,_}|Is], Reg) -> init_yreg(Is, add_yregs(Ds, Reg));
-init_yreg(_Is, Reg) -> Reg.
-
-add_yregs(Ys, Reg) -> foldl(fun(Y, R0) -> add_yreg(Y, R0) end, Reg, Ys).
-    
-add_yreg({y,Y}, Reg) -> Reg bor (1 bsl Y);
-add_yreg(_, Reg)     -> Reg.
+init_yregs(Y, Is, D) when Y >= 0 ->
+    case beam_utils:is_killed({y,Y}, Is, D) of
+        true ->
+            (1 bsl Y) bor init_yregs(Y-1, Is, D);
+        false ->
+            init_yregs(Y-1, Is, D)
+    end;
+init_yregs(_, _, _) -> 0.
 
 count_ones(Bits) -> count_ones(Bits, 0).
 count_ones(0, Acc) -> Acc;
@@ -463,16 +494,34 @@ count_ones(Bits, Acc) ->
 %% Calculate the new number of live registers when we move an allocate
 %% instruction upwards, passing a 'set' instruction.
 
-alloc_live_regs({set,Ds,Ss,_}, Regs0) ->
+alloc_live_regs({set,Ds,Ss,_}, Is, Regs0) ->
     Rset = x_live(Ss, x_dead(Ds, (1 bsl Regs0)-1)),
-    live_regs(0, Rset).
+    Live = live_regs(0, Rset),
+    case ensure_contiguous(Rset, Live) of
+        not_possible ->
+            %% Liveness information (looking forward in the
+            %% instruction stream) can't prove that moving this
+            %% allocation instruction is safe. Now use the annotation
+            %% of defined registers at the beginning of the current
+            %% block to see whether moving would be safe.
+            Def0 = defined_regs(Is, 0),
+            Def = Def0 band ((1 bsl Live) - 1),
+            ensure_contiguous(Rset bor Def, Live);
+        Live ->
+            %% Safe based on liveness information.
+            Live
+    end.
 
 live_regs(N, 0) ->
     N;
-live_regs(N, Regs) when Regs band 1 =:= 1 ->
-    live_regs(N+1, Regs bsr 1);
-live_regs(_, _) ->
-    not_possible.
+live_regs(N, Regs) ->
+    live_regs(N+1, Regs bsr 1).
+
+ensure_contiguous(Regs, Live) ->
+    case (1 bsl Live) - 1 of
+        Regs -> Live;
+        _ -> not_possible
+    end.
 
 x_dead([{x,N}|Rs], Regs) -> x_dead(Rs, Regs band (bnot (1 bsl N)));
 x_dead([_|Rs], Regs) -> x_dead(Rs, Regs);
@@ -481,3 +530,14 @@ x_dead([], Regs) -> Regs.
 x_live([{x,N}|Rs], Regs) -> x_live(Rs, Regs bor (1 bsl N));
 x_live([_|Rs], Regs) -> x_live(Rs, Regs);
 x_live([], Regs) -> Regs.
+
+%% defined_regs(ReversedInstructions, RegBitmap0) -> RegBitmap.
+%%  Given a reversed instruction stream, determine the
+%%  registers that are defined.
+
+defined_regs([{'%anno',{def,Def}}|_], Regs) ->
+    Def bor Regs;
+defined_regs([{set,Ds,_,{alloc,Live,_}}|_], Regs) ->
+    x_live(Ds, Regs bor ((1 bsl Live) - 1));
+defined_regs([{set,Ds,_,_}|Is], Regs) ->
+    defined_regs(Is, x_live(Ds, Regs)).