4 files changed, 334 insertions, 44 deletions
diff --git a/lib/compiler/src/beam_ssa_opt.erl b/lib/compiler/src/beam_ssa_opt.erl
index 3685c09a2b..2c898ba6f8 100644
--- a/lib/compiler/src/beam_ssa_opt.erl
+++ b/lib/compiler/src/beam_ssa_opt.erl
@@ -39,7 +39,7 @@
 
 -include("beam_ssa_opt.hrl").
 
--import(lists, [append/1,duplicate/2,foldl/3,keyfind/3,member/2,
+-import(lists, [all/2,append/1,duplicate/2,foldl/3,keyfind/3,member/2,
                 reverse/1,reverse/2,
                 splitwith/2,sort/1,takewhile/2,unzip/1]).
 
@@ -142,6 +142,7 @@ finish_1([], _StMap) ->
 prologue_passes(Opts) ->
     Ps = [?PASS(ssa_opt_split_blocks),
           ?PASS(ssa_opt_coalesce_phis),
+          ?PASS(ssa_opt_tail_phis),
           ?PASS(ssa_opt_element),
           ?PASS(ssa_opt_linearize),
           ?PASS(ssa_opt_tuple_size),
@@ -157,6 +158,7 @@ repeated_passes(Opts) ->
           ?PASS(ssa_opt_bs_puts),
           ?PASS(ssa_opt_dead),
           ?PASS(ssa_opt_cse),
+          ?PASS(ssa_opt_tail_phis),
           ?PASS(ssa_opt_type_continue)],        %Must run after ssa_opt_dead to
                                                 %clean up phi nodes.
     passes_1(Ps, Opts).
@@ -431,6 +433,160 @@ c_fix_branches([{_,Pred}|As], L, Blocks0) ->
 c_fix_branches([], _, Blocks) -> Blocks.
 
 %%%
+%%% Eliminate phi nodes in the tail of a function.
+%%%
+%%% Try to eliminate short blocks that starts with a phi node
+%%% and end in a return. For example:
+%%%
+%%%    Result = phi { Res1, 4 }, { literal true, 5 }
+%%%    Ret = put_tuple literal ok, Result
+%%%    ret Ret
+%%%
+%%% The code in this block can be inserted at the end blocks 4 and
+%%% 5. Thus, the following code can be inserted into block 4:
+%%%
+%%%    Ret:1 = put_tuple literal ok, Res1
+%%%    ret Ret:1
+%%%
+%%% And the following code into block 5:
+%%%
+%%%    Ret:2 = put_tuple literal ok, literal true
+%%%    ret Ret:2
+%%%
+%%% Which can be further simplified to:
+%%%
+%%%    ret literal {ok, true}
+%%%
+%%% This transformation may lead to more code improvements:
+%%%
+%%%   - Stack trimming
+%%%   - Fewer test_heap instructions
+%%%   - Smaller stack frames
+%%%
+
+ssa_opt_tail_phis({#st{ssa=SSA0,cnt=Count0}=St, FuncDb}) ->
+    {SSA,Count} = opt_tail_phis(SSA0, Count0),
+    {St#st{ssa=SSA,cnt=Count}, FuncDb}.
+
+opt_tail_phis(Blocks, Count) when is_map(Blocks) ->
+    opt_tail_phis(maps:values(Blocks), Blocks, Count);
+opt_tail_phis(Linear0, Count0) when is_list(Linear0) ->
+    Blocks0 = maps:from_list(Linear0),
+    {Blocks,Count} = opt_tail_phis(Blocks0, Count0),
+    {beam_ssa:linearize(Blocks),Count}.
+
+opt_tail_phis([#b_blk{is=Is0,last=Last}|Bs], Blocks0, Count0) ->
+    case {Is0,Last} of
+        {[#b_set{op=phi,args=[_,_|_]}|_],#b_ret{arg=#b_var{}}=Ret} ->
+            {Phis,Is} = splitwith(fun(#b_set{op=Op}) -> Op =:= phi end, Is0),
+            case suitable_tail_ops(Is) of
+                true ->
+                    {Blocks,Count} = opt_tail_phi(Phis, Is, Ret,
+                                                  Blocks0, Count0),
+                    opt_tail_phis(Bs, Blocks, Count);
+                false ->
+                    opt_tail_phis(Bs, Blocks0, Count0)
+            end;
+        {_,_} ->
+            opt_tail_phis(Bs, Blocks0, Count0)
+    end;
+opt_tail_phis([], Blocks, Count) ->
+    {Blocks,Count}.
+
+opt_tail_phi(Phis0, Is, Ret, Blocks0, Count0) ->
+    Phis = rel2fam(reduce_phis(Phis0)),
+    {Blocks,Count,Cost} =
+        foldl(fun(PhiArg, Acc) ->
+                      opt_tail_phi_arg(PhiArg, Is, Ret, Acc)
+              end, {Blocks0,Count0,0}, Phis),
+    MaxCost = length(Phis) * 3 + 2,
+    if
+        Cost =< MaxCost ->
+            %% The transformation would cause at most a slight
+            %% increase in code size if no more optimizations
+            %% can be applied.
+            {Blocks,Count};
+        true ->
+            %% The code size would be increased too much.
+            {Blocks0,Count0}
+    end.
+
+reduce_phis([#b_set{dst=PhiDst,args=PhiArgs}|Is]) ->
+    [{L,{PhiDst,Val}} || {Val,L} <- PhiArgs] ++ reduce_phis(Is);
+reduce_phis([]) -> [].
+
+opt_tail_phi_arg({PredL,Sub0}, Is0, Ret0, {Blocks0,Count0,Cost0}) ->
+    Blk0 = map_get(PredL, Blocks0),
+    #b_blk{is=IsPrefix,last=#b_br{succ=Next,fail=Next}} = Blk0,
+    case is_exit_bif(IsPrefix) of
+        false ->
+            Sub1 = maps:from_list(Sub0),
+            {Is1,Count,Sub} = new_names(Is0, Sub1, Count0, []),
+            Is2 = [sub(I, Sub) || I <- Is1],
+            Cost = build_cost(Is2, Cost0),
+            Is = IsPrefix ++ Is2,
+            Ret = sub(Ret0, Sub),
+            Blk = Blk0#b_blk{is=Is,last=Ret},
+            Blocks = Blocks0#{PredL:=Blk},
+            {Blocks,Count,Cost};
+        true ->
+            %% The block ends in a call to a function that
+            %% will cause an exception.
+            {Blocks0,Count0,Cost0+3}
+    end.
+
+is_exit_bif([#b_set{op=call,
+                    args=[#b_remote{mod=#b_literal{val=Mod},
+                                    name=#b_literal{val=Name}}|Args]}]) ->
+    erl_bifs:is_exit_bif(Mod, Name, length(Args));
+is_exit_bif(_) -> false.
+
+new_names([#b_set{dst=Dst}=I|Is], Sub0, Count0, Acc) ->
+    {NewDst,Count} = new_var(Dst, Count0),
+    Sub = Sub0#{Dst=>NewDst},
+    new_names(Is, Sub, Count, [I#b_set{dst=NewDst}|Acc]);
+new_names([], Sub, Count, Acc) ->
+    {reverse(Acc),Count,Sub}.
+
+suitable_tail_ops(Is) ->
+    all(fun(#b_set{op=Op}) ->
+                is_suitable_tail_op(Op)
+        end, Is).
+
+is_suitable_tail_op({bif,_}) -> true;
+is_suitable_tail_op(put_list) -> true;
+is_suitable_tail_op(put_tuple) -> true;
+is_suitable_tail_op(_) -> false.
+
+build_cost([#b_set{op=put_list,args=Args}|Is], Cost) ->
+    case are_all_literals(Args) of
+        true ->
+            build_cost(Is, Cost);
+        false ->
+            build_cost(Is, Cost + 1)
+    end;
+build_cost([#b_set{op=put_tuple,args=Args}|Is], Cost) ->
+    case are_all_literals(Args) of
+        true ->
+            build_cost(Is, Cost);
+        false ->
+            build_cost(Is, Cost + length(Args) + 1)
+    end;
+build_cost([#b_set{op={bif,_},args=Args}|Is], Cost) ->
+    case are_all_literals(Args) of
+        true ->
+            build_cost(Is, Cost);
+        false ->
+            build_cost(Is, Cost + 1)
+    end;
+build_cost([], Cost) -> Cost.
+
+are_all_literals(Args) ->
+    all(fun(#b_literal{}) -> true;
+                (_) -> false
+             end, Args).
+
+%%%
 %%% Order element/2 calls.
 %%%
 %%% Order an unbroken chain of element/2 calls for the same tuple
@@ -2116,3 +2272,9 @@ sub_arg(Old, Sub) ->
         #{Old:=New} -> New;
         #{} -> Old
     end.
+
+new_var(#b_var{name={Base,N}}, Count) ->
+    true = is_integer(N),                       %Assertion.
+    {#b_var{name={Base,Count}},Count+1};
+new_var(#b_var{name=Base}, Count) ->
+    {#b_var{name={Base,Count}},Count+1}.
diff --git a/lib/compiler/src/beam_ssa_pre_codegen.erl b/lib/compiler/src/beam_ssa_pre_codegen.erl
index 9b2bb65911..fde1118c29 100644
--- a/lib/compiler/src/beam_ssa_pre_codegen.erl
+++ b/lib/compiler/src/beam_ssa_pre_codegen.erl
@@ -2086,23 +2086,95 @@ reserve_freg([], Res) -> Res.
 %%  will allocate the lowest free X register for the variable.
 
 reserve_xregs(Blocks, Res) ->
-    F = fun(L, #b_blk{is=Is,last=Last}, R) ->
-                {Xs0,Used0} = reserve_terminator(L, Last, Blocks, R),
-                reserve_xregs_is(reverse(Is), R, Xs0, Used0)
-        end,
-    beam_ssa:fold_po(F, Res, Blocks).
-
+    Ls = reverse(beam_ssa:rpo(Blocks)),
+    reserve_xregs(Ls, Blocks, #{}, Res).
+
+reserve_xregs([L|Ls], Blocks, XsMap0, Res0) ->
+    #b_blk{anno=Anno,is=Is0,last=Last} = map_get(L, Blocks),
+
+    %% Calculate mapping from variable name to the preferred
+    %% register.
+    Xs0 = reserve_terminator(L, Is0, Last, Blocks, XsMap0, Res0),
+
+    %% We need to figure out where the code generator will
+    %% place instructions that will do a garbage collection.
+    %% Insert 'gc' markers as pseudo-instructions in the
+    %% instruction sequence.
+    Is1 = reverse(Is0),
+    Is2 = res_place_gc_instrs(Is1, []),
+    Is = res_place_allocate(Anno, Is2),
+
+    %% Add register hints for variables that are defined
+    %% in the (reversed) instruction sequence.
+    {Res,Xs} = reserve_xregs_is(Is, Res0, Xs0, []),
+
+    XsMap = XsMap0#{L=>Xs},
+    reserve_xregs(Ls, Blocks, XsMap, Res);
+reserve_xregs([], _, _, Res) -> Res.
+
+%% Insert explicit 'gc' markers points where there will
+%% be a garbage collection. (Note that the instruction
+%% sequence passed to this function is reversed.)
+
+res_place_gc_instrs([#b_set{op=phi}=I|Is], Acc) ->
+    res_place_gc_instrs(Is, [I|Acc]);
+res_place_gc_instrs([#b_set{op=Op}=I|Is], Acc)
+  when Op =:= call; Op =:= make_fun ->
+    case Acc of
+        [] ->
+            res_place_gc_instrs(Is, [I|Acc]);
+        [GC|_] when GC =:= gc; GC =:= test_heap ->
+            res_place_gc_instrs(Is, [I,gc|Acc]);
+        [_|_] ->
+            res_place_gc_instrs(Is, [I,gc|Acc])
+    end;
+res_place_gc_instrs([#b_set{op=Op,args=Args}=I|Is], Acc0) ->
+    case beam_ssa_codegen:classify_heap_need(Op, Args) of
+        neutral ->
+            case Acc0 of
+                [test_heap|Acc] ->
+                    res_place_gc_instrs(Is, [test_heap,I|Acc]);
+                Acc ->
+                    res_place_gc_instrs(Is, [I|Acc])
+            end;
+        {put,_} ->
+            case Acc0 of
+                [test_heap|Acc] ->
+                    res_place_gc_instrs(Is, [test_heap,I|Acc]);
+                Acc ->
+                    res_place_gc_instrs(Is, [test_heap,I|Acc])
+            end;
+        _ ->
+            res_place_gc_instrs(Is, [gc,I|Acc0])
+    end;
+res_place_gc_instrs([], Acc) ->
+    %% Reverse and replace 'test_heap' markers with 'gc'.
+    %% (The distinction is no longer useful.)
+    res_place_gc_instrs_rev(Acc, []).
+
+res_place_gc_instrs_rev([test_heap|Is], [gc|_]=Acc) ->
+    res_place_gc_instrs_rev(Is, Acc);
+res_place_gc_instrs_rev([test_heap|Is], Acc) ->
+    res_place_gc_instrs_rev(Is, [gc|Acc]);
+res_place_gc_instrs_rev([gc|Is], [gc|_]=Acc) ->
+    res_place_gc_instrs_rev(Is, Acc);
+res_place_gc_instrs_rev([I|Is], Acc) ->
+    res_place_gc_instrs_rev(Is, [I|Acc]);
+res_place_gc_instrs_rev([], Acc) -> Acc.
+
+res_place_allocate(#{yregs:=_}, Is) ->
+    %% There will be an 'allocate' instruction inserted here.
+    Is ++ [gc];
+res_place_allocate(#{}, Is) -> Is.
+
+reserve_xregs_is([gc|Is], Res, Xs0, Used) ->
+    %% At this point, the code generator will place an instruction
+    %% that does a garbage collection. We must prune the remembered
+    %% registers.
+    Xs = res_xregs_prune(Xs0, Used, Res),
+    reserve_xregs_is(Is, Res, Xs, Used);
 reserve_xregs_is([#b_set{op=Op,dst=Dst,args=Args}=I|Is], Res0, Xs0, Used0) ->
-    Xs1 = case is_gc_safe(I) of
-              true ->
-                  Xs0;
-              false ->
-                  %% There may be a garbage collection after executing this
-                  %% instruction. We will need prune the list of preferred
-                  %% X registers.
-                  res_xregs_prune(Xs0, Used0, Res0)
-          end,
-    Res = reserve_xreg(Dst, Xs1, Res0),
+    Res = reserve_xreg(Dst, Xs0, Res0),
     Used1 = ordsets:union(Used0, beam_ssa:used(I)),
     Used = ordsets:del_element(Dst, Used1),
     case Op of
@@ -2113,28 +2185,74 @@ reserve_xregs_is([#b_set{op=Op,dst=Dst,args=Args}=I|Is], Res0, Xs0, Used0) ->
             Xs = reserve_call_args(tl(Args)),
             reserve_xregs_is(Is, Res, Xs, Used);
         _ ->
-            reserve_xregs_is(Is, Res, Xs1, Used)
+            reserve_xregs_is(Is, Res, Xs0, Used)
     end;
-reserve_xregs_is([], Res, _Xs, _Used) -> Res.
-
-reserve_terminator(L, #b_br{bool=#b_literal{val=true},succ=Succ}, Blocks, Res) ->
-    case maps:get(Succ, Blocks) of
+reserve_xregs_is([], Res, Xs, _Used) ->
+    {Res,Xs}.
+
+%% Pick up register hints from the successors of this blocks.
+reserve_terminator(_L, _Is, #b_br{bool=#b_var{},succ=Succ,fail=?BADARG_BLOCK},
+                   _Blocks, XsMap, _Res) ->
+    %% We know that no variables are used at ?BADARG_BLOCK, so
+    %% any register hints from the success blocks are safe to use.
+    map_get(Succ, XsMap);
+reserve_terminator(L, Is, #b_br{bool=#b_var{},succ=Succ,fail=Fail},
+                   Blocks, XsMap, Res) when Succ =/= Fail ->
+    #{Succ:=SuccBlk,Fail:=FailBlk} = Blocks,
+    case {SuccBlk,FailBlk} of
+        {#b_blk{is=[],last=#b_br{succ=PhiL,fail=PhiL}},
+         #b_blk{is=[],last=#b_br{succ=PhiL,fail=PhiL}}} ->
+            %% Both branches ultimately transfer to the same
+            %% block (via two blocks with no instructions).
+            %% Pick up register hints from the phi nodes
+            %% in the common block.
+            #{PhiL:=#b_blk{is=PhiIs}} = Blocks,
+            Xs = res_xregs_from_phi(PhiIs, Succ, Res, #{}),
+            res_xregs_from_phi(PhiIs, Fail, Res, Xs);
+        {_,_} when Is =/= [] ->
+            case last(Is) of
+                #b_set{op=succeeded,args=[Arg]} ->
+                    %% We know that Arg will not be used at the failure
+                    %% label, so we can pick up register hints from the
+                    %% success label.
+                    Br = #b_br{bool=#b_literal{val=true},succ=Succ,fail=Succ},
+                    case reserve_terminator(L, [], Br, Blocks, XsMap, Res) of
+                        #{Arg:=Reg} -> #{Arg=>Reg};
+                        #{} -> #{}
+                    end;
+                _ ->
+                    %% Register hints from the success block may not
+                    %% be safe at the failure block, and vice versa.
+                    #{}
+            end;
+        {_,_} ->
+            %% Register hints from the success block may not
+            %% be safe at the failure block, and vice versa.
+            #{}
+    end;
+reserve_terminator(L, Is, #b_br{bool=#b_literal{val=true},succ=Succ},
+                   Blocks, XsMap, Res) ->
+    case map_get(Succ, Blocks) of
         #b_blk{is=[],last=Last} ->
-            reserve_terminator(Succ, Last, Blocks, Res);
-        #b_blk{is=[_|_]=Is} ->
-            {res_xregs_from_phi(Is, L, Res, #{}),[]}
+            reserve_terminator(Succ, Is, Last, Blocks, XsMap, Res);
+        #b_blk{is=[_|_]=PhiIs} ->
+            res_xregs_from_phi(PhiIs, L, Res, #{})
     end;
-reserve_terminator(_, Last, _, _) ->
-    {#{},beam_ssa:used(Last)}.
+reserve_terminator(_, _, _, _, _, _) -> #{}.
 
+%% Pick up a reservation from a phi node.
 res_xregs_from_phi([#b_set{op=phi,dst=Dst,args=Args}|Is],
                    Pred, Res, Acc) ->
     case [V || {#b_var{}=V,L} <- Args, L =:= Pred] of
         [] ->
+            %% The value of the phi node for this predecessor
+            %% is a literal. Nothing to do here.
             res_xregs_from_phi(Is, Pred, Res, Acc);
         [V] ->
             case Res of
                 #{Dst:={prefer,Reg}} ->
+                    %% Try placing V in the same register as for
+                    %% the phi node.
                     res_xregs_from_phi(Is, Pred, Res, Acc#{V=>Reg});
                 #{Dst:=_} ->
                     res_xregs_from_phi(Is, Pred, Res, Acc)
@@ -2154,12 +2272,12 @@ reserve_call_args([], _, Xs) -> Xs.
 reserve_xreg(V, Xs, Res) ->
     case Res of
         #{V:=_} ->
-            %% Already reserved.
+            %% Already reserved (but not as an X register).
             Res;
         #{} ->
             case Xs of
                 #{V:=X} ->
-                    %% Add a hint that a specific X register is
+                    %% Add a hint that this specific X register is
                     %% preferred, unless it is already in use.
                     Res#{V=>{prefer,X}};
                 #{} ->
@@ -2168,23 +2286,15 @@ reserve_xreg(V, Xs, Res) ->
             end
     end.
 
-is_gc_safe(#b_set{op=phi}) ->
-    false;
-is_gc_safe(#b_set{op=Op,args=Args}) ->
-    case beam_ssa_codegen:classify_heap_need(Op, Args) of
-        neutral -> true;
-        {put,_} -> true;
-        _ -> false
-    end.
-
 %% res_xregs_prune(PreferredRegs, Used, Res) -> PreferredRegs.
-%%  Prune the list of preferred to only include X registers that
-%%  are guaranteed to survice a garbage collection.
+%%  Prune the list of preferred registers, to make sure that
+%%  there are no "holes" (uninitialized X registers) when
+%%  invoking the garbage collector.
 
-res_xregs_prune(Xs, Used, Res) ->
+res_xregs_prune(Xs, Used, Res) when map_size(Xs) =/= 0 ->
     %% The number of safe registers is the number of the X registers
     %% used after this point. The actual number of safe registers may
-    %% be highter than this number, but this is a conservative safe
+    %% be higher than this number, but this is a conservative safe
     %% estimate.
     NumSafe = foldl(fun(V, N) ->
                             case Res of
@@ -2196,7 +2306,8 @@ res_xregs_prune(Xs, Used, Res) ->
 
     %% Remove unsafe registers from the list of potential
     %% preferred registers.
-    maps:filter(fun(_, {x,X}) -> X < NumSafe end, Xs).
+    maps:filter(fun(_, {x,X}) -> X < NumSafe end, Xs);
+res_xregs_prune(Xs, _Used, _Res) -> Xs.
 
 %%%
 %%% Register allocation using linear scan.
diff --git a/lib/compiler/src/beam_validator.erl b/lib/compiler/src/beam_validator.erl
index 15ed267c54..4081e366a5 100644
--- a/lib/compiler/src/beam_validator.erl
+++ b/lib/compiler/src/beam_validator.erl
@@ -871,7 +871,7 @@ valfun_4({test,is_map,{f,Lbl},[Src]}, Vst0) ->
 valfun_4({test,is_nil,{f,Lbl},[Src]}, Vst0) ->
     Vst = case get_term_type(Src, Vst0) of
               list ->
-                  branch_state(Lbl, set_type_reg(cons, Src, Vst0));
+                  branch_state(Lbl, set_aliased_type(cons, Src, Vst0));
               _ ->
                   branch_state(Lbl, Vst0)
           end,
diff --git a/lib/compiler/test/beam_validator_SUITE.erl b/lib/compiler/test/beam_validator_SUITE.erl
index c9df066958..585d0e7191 100644
--- a/lib/compiler/test/beam_validator_SUITE.erl
+++ b/lib/compiler/test/beam_validator_SUITE.erl
@@ -586,6 +586,9 @@ aliased_types(Config) ->
     {1,1} = aliased_types_2(Seq),
     {42,none} = aliased_types_2([]),
 
+    gurka = aliased_types_3([gurka]),
+    gaffel = aliased_types_3([gaffel]),
+
     ok.
 
 %% ERL-735: validator failed to track types on aliased registers, rejecting
@@ -614,6 +617,20 @@ aliased_types_2(Bug) ->
              _ -> hd(Bug)
          end}.
 
+%% ERL-832 part deux; validator failed to realize that an aliased register was
+%% a cons.
+aliased_types_3(Bug) ->
+    List = [Y || Y <- Bug],
+    case List of
+        [] -> Bug;
+        _ ->
+            if
+                hd(List) -> a:a();
+                true -> ok
+            end,
+            hd(List)
+    end.
+
 %%%-------------------------------------------------------------------------
 
 transform_remove(Remove, Module) ->