From 693cda7f01137eda93362a74abd6028c41d2f5c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Wed, 14 Nov 2018 06:48:54 +0100
Subject: beam_ssa_type: Speed up type analysis for huge functions

The type analysis pass (`beam_ssa_type`) keeps the type information
for all variables that are in scope. For huge functions, the
`join_types/2` function could get really slow when joining two
maps with thousands of variables in each.

Use a conservative approach to discard type information for
variables only used once by a `br` or `switch` in the same
block as the variable is defined in.

This approach reduces the runtime for the `beam_ssa_type` pass
from a few minutes down to few seconds for this module:

https://github.com/aggelgian/cuter/blob/master/src/cuter_binlib.erl

Rejected approach: Calculate liveness information for all variables
and discard type information for any variable that would not be
used again. That turned out to not work because some optimizations
would invalidate the liveness (in particular, substitutions could
lengthen the lifetime for a variable). Trying to update the
liveness information when doing the optimizations would be tricky.

Noticed-by: Kostis Sagonas
---
 lib/compiler/src/beam_ssa_type.erl | 103 ++++++++++++++++++++++++++++++++-----
 1 file changed, 90 insertions(+), 13 deletions(-)

diff --git a/lib/compiler/src/beam_ssa_type.erl b/lib/compiler/src/beam_ssa_type.erl
index 18e6e73a46..95fc3bb0e9 100644
--- a/lib/compiler/src/beam_ssa_type.erl
+++ b/lib/compiler/src/beam_ssa_type.erl
@@ -27,9 +27,10 @@
 
 -define(UNICODE_INT, #t_integer{elements={0,16#10FFFF}}).
 
--record(d, {ds :: #{beam_ssa:var_name():=beam_ssa:b_set()},
+-record(d, {ds :: #{beam_ssa:b_var():=beam_ssa:b_set()},
             ls :: #{beam_ssa:label():=type_db()},
-            sub :: #{beam_ssa:var_name():=beam_ssa:value()}
+            once :: cerl_sets:set(beam_ssa:b_var()),
+            sub :: #{beam_ssa:b_var():=beam_ssa:value()}
            }).
 
 -define(ATOM_SET_SIZE, 5).
@@ -56,13 +57,15 @@
       Block :: beam_ssa:b_blk().
 
 opt(Linear, Args) ->
+    UsedOnce = used_once(Linear, Args),
     Ts = maps:from_list([{V,any} || #b_var{}=V <- Args]),
     FakeCall = #b_set{op=call,args=[#b_remote{mod=#b_literal{val=unknown},
                                               name=#b_literal{val=unknown},
                                               arity=0}]},
     Defs = maps:from_list([{Var,FakeCall#b_set{dst=Var}} ||
                               #b_var{}=Var <- Args]),
-    D = #d{ds=Defs,ls=#{0=>Ts},sub=#{}},
+    D = #d{ds=Defs,ls=#{0=>Ts,?BADARG_BLOCK=>#{}},
+           once=UsedOnce,sub=#{}},
     opt_1(Linear, D).
 
 opt_1([{L,Blk}|Bs], #d{ls=Ls}=D) ->
@@ -425,16 +428,43 @@ opt_terminator(#b_ret{}=Ret, _Ts, _Ds) -> Ret.
 
 update_successors(#b_br{bool=#b_literal{val=true},succ=S}, Ts, D) ->
     update_successor(S, Ts, D);
-update_successors(#b_br{bool=#b_var{}=Bool,succ=Succ,fail=Fail}, Ts, D0) ->
-    D = update_successor_bool(Bool, false, Fail, Ts, D0),
-    SuccTs = infer_types(Bool, Ts, D0),
-    update_successor_bool(Bool, true, Succ, SuccTs, D);
-update_successors(#b_switch{arg=#b_var{}=V,fail=Fail,list=List}, Ts, D0) ->
-    D = update_successor(Fail, Ts, D0),
-    foldl(fun({Val,S}, A) ->
-                  T = get_type(Val, Ts),
-                  update_successor(S, Ts#{V=>T}, A)
-          end, D, List);
+update_successors(#b_br{bool=#b_var{}=Bool,succ=Succ,fail=Fail}, Ts0, D0) ->
+    case cerl_sets:is_element(Bool, D0#d.once) of
+        true ->
+            %% This variable is defined in this block and is only
+            %% referenced by this br terminator. Therefore, there is
+            %% no need to include the type database passed on to the
+            %% successors of this block.
+            Ts = maps:remove(Bool, Ts0),
+            D = update_successor(Fail, Ts, D0),
+            SuccTs = infer_types(Bool, Ts, D0),
+            update_successor(Succ, SuccTs, D);
+        false ->
+            D = update_successor_bool(Bool, false, Fail, Ts0, D0),
+            SuccTs = infer_types(Bool, Ts0, D0),
+            update_successor_bool(Bool, true, Succ, SuccTs, D)
+    end;
+update_successors(#b_switch{arg=#b_var{}=V,fail=Fail,list=List}, Ts0, D0) ->
+    case cerl_sets:is_element(V, D0#d.once) of
+        true ->
+            %% This variable is defined in this block and is only
+            %% referenced by this switch terminator. Therefore, there is
+            %% no need to include the type database passed on to the
+            %% successors of this block.
+            Ts = maps:remove(V, Ts0),
+            D = update_successor(Fail, Ts, D0),
+            F = fun({_Val,S}, A) ->
+                        update_successor(S, Ts, A)
+                end,
+            foldl(F, D, List);
+        false ->
+            D = update_successor(Fail, Ts0, D0),
+            F = fun({Val,S}, A) ->
+                        T = get_type(Val, Ts0),
+                        update_successor(S, Ts0#{V=>T}, A)
+                end,
+            foldl(F, D, List)
+        end;
 update_successors(#b_ret{}, _Ts, D) -> D.
 
 update_successor_bool(#b_var{}=Var, BoolValue, S, Ts, D) ->
@@ -447,6 +477,11 @@ update_successor_bool(#b_var{}=Var, BoolValue, S, Ts, D) ->
             update_successor(S, Ts, D)
     end.
 
+update_successor(?BADARG_BLOCK, _Ts, #d{}=D) ->
+    %% We KNOW that no variables are used in the ?BADARG_BLOCK,
+    %% so there is no need to update the type information. That
+    %% can be a huge timesaver for huge functions.
+    D;
 update_successor(S, Ts0, #d{ls=Ls}=D) ->
     case Ls of
         #{S:=Ts1} ->
@@ -766,6 +801,48 @@ simplify_not(#b_br{bool=#b_var{}=V,succ=Succ,fail=Fail}=Br0, Ts, Ds) ->
             Br0
     end.
 
+%%%
+%%% Calculate the set of variables that are only used once in the
+%%% block that they are defined in. That will allow us to discard type
+%%% information for variables that will never be referenced by the
+%%% successor blocks, potentially improving compilation times.
+%%%
+
+used_once(Linear, Args) ->
+    Map0 = used_once_1(reverse(Linear), #{}),
+    Map = maps:without(Args, Map0),
+    cerl_sets:from_list(maps:keys(Map)).
+
+used_once_1([{L,#b_blk{is=Is,last=Last}}|Bs], Uses0) ->
+    Uses = used_once_2([Last|reverse(Is)], L, Uses0),
+    used_once_1(Bs, Uses);
+used_once_1([], Uses) -> Uses.
+
+used_once_2([I|Is], L, Uses0) ->
+    Uses = used_once_uses(beam_ssa:used(I), L, Uses0),
+    case I of
+        #b_set{dst=Dst} ->
+            case Uses of
+                #{Dst:=[L]} ->
+                    used_once_2(Is, L, Uses);
+                #{} ->
+                    used_once_2(Is, L, maps:remove(Dst, Uses))
+            end;
+        _ ->
+            used_once_2(Is, L, Uses)
+    end;
+used_once_2([], _, Uses) -> Uses.
+
+used_once_uses([V|Vs], L, Uses) ->
+    case Uses of
+        #{V:=Us} ->
+            used_once_uses(Vs, L, Uses#{V:=[L|Us]});
+        #{} ->
+            used_once_uses(Vs, L, Uses#{V=>[L]})
+    end;
+used_once_uses([], _, Uses) -> Uses.
+
+
 get_types(Values, Ts) ->
     [get_type(Val, Ts) || Val <- Values].
 -spec get_type(beam_ssa:value(), type_db()) -> type().
-- 
cgit v1.2.3


From 64a15f2979c4644298cdad1319919fe84103484e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Thu, 15 Nov 2018 03:22:53 +0100
Subject: beam_ssa_pre_codegen: Eliminate a bottleneck in linear scan

The linear scan algorithm keeps unhandled intervals in two separate
lists: one for intervals with reserved registers and one for intervals
without reserved registers. When collecting the available free registers
all unhandled intervals with reserved registers must be checked for
overlap.

Unhandled intervals that had a preferred register were put into the
list of intervals with reserved registers, leading to a lot of
unecessary overlap checking if there were many intervals with
preferred registers. Changing the partition code to put intervals with
preferred registers into the general list of unhandled intervals will
reduce the compilation time if there are many preferred registers.

On my computer, this change reduced the time of the linear scan pass
from about 20 seconds down to about 0.5 seconds for this module:

https://github.com/aggelgian/cuter/blob/master/src/cuter_binlib.erl

Noticed-by: Kostis Sagonas
---
 lib/compiler/src/beam_ssa_pre_codegen.erl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/lib/compiler/src/beam_ssa_pre_codegen.erl b/lib/compiler/src/beam_ssa_pre_codegen.erl
index 9175931375..c60c6da9ea 100644
--- a/lib/compiler/src/beam_ssa_pre_codegen.erl
+++ b/lib/compiler/src/beam_ssa_pre_codegen.erl
@@ -2211,7 +2211,13 @@ linear_scan(#st{intervals=Intervals0,res=Res}=St0) ->
     Free = init_free(maps:to_list(Res)),
     Intervals1 = [init_interval(Int, Res) || Int <- Intervals0],
     Intervals = sort(Intervals1),
-    IsReserved = fun (#i{reg=Reg}) -> Reg =/= none end,
+    IsReserved = fun(#i{reg=Reg}) ->
+                         case Reg of
+                             none -> false;
+                             {prefer,{_,_}} -> false;
+                             {_,_} -> true
+                         end
+                 end,
     {UnhandledRes,Unhandled} = partition(IsReserved, Intervals),
     L = #l{unhandled_res=UnhandledRes,
            unhandled_any=Unhandled,free=Free},
-- 
cgit v1.2.3


From 4534b73552bdb405808154f01f9b051cacb4ff3e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Thu, 15 Nov 2018 10:21:11 +0100
Subject: beam_ssa_dead: Speed up optimization of switch instructions

`beam_ssa_dead` can waste a lot of time trying to optimize
an unoptimizable `switch` instruction.

By being a little bit smarter when optimizing `switch` instructions,
the runtime for the beam_ssa_dead pass was reduced approximately by
half on my computer for this module:

https://github.com/aggelgian/cuter/blob/master/src/cuter_binlib.erl

Noticed-by: Kostis Sagonas
---
 lib/compiler/src/beam_ssa_dead.erl | 59 ++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 28 deletions(-)

diff --git a/lib/compiler/src/beam_ssa_dead.erl b/lib/compiler/src/beam_ssa_dead.erl
index c20652580d..067d9a6741 100644
--- a/lib/compiler/src/beam_ssa_dead.erl
+++ b/lib/compiler/src/beam_ssa_dead.erl
@@ -135,7 +135,8 @@ shortcut_terminator(Last, _Is, _Bs, _St) ->
     Last.
 
 shortcut_switch([{Lit,L0}|T], Bool, Bs, St0) ->
-    St = St0#st{rel_op=normalize_op({bif,'=:='}, [Bool,Lit])},
+    RelOp = {'=:=',Bool,Lit},
+    St = St0#st{rel_op=RelOp},
     #b_br{bool=#b_literal{val=true},succ=L} =
         shortcut(L0, bind_var(Bool, Lit, Bs), St#st{target=one_way}),
     [{Lit,L}|shortcut_switch(T, Bool, Bs, St0)];
@@ -388,41 +389,43 @@ eval_terminator(#b_switch{arg=Arg,fail=Fail,list=List}=Sw, Bs, St) ->
             %% Literal argument. Simplify to a `br`.
             beam_ssa:normalize(Sw#b_switch{arg=Val});
         #b_var{} ->
-            case St of
-                #st{rel_op=none} ->
-                    %% No previous relational operator is stored.
-                    %% Give up.
+            %% Try optimizing the switch.
+            case eval_switch(List, Arg, St, Fail) of
+                none ->
                     none;
-                #st{} ->
-                    %% There is a previous relational operator stored.
-                    %% Try optimizing the switch.
-                    case eval_switch(List, Arg, St, Fail) of
-                        none ->
-                            none;
-                        To when is_integer(To) ->
-                            %% Either one of the values in the switch
-                            %% matched a previous value in a '=:=' test, or
-                            %% none of the values matched a previous test.
-                            #b_br{bool=#b_literal{val=true},succ=To,fail=To}
-                    end
+                To when is_integer(To) ->
+                    %% Either one of the values in the switch
+                    %% matched a previous value in a '=:=' test, or
+                    %% none of the values matched a previous test.
+                    #b_br{bool=#b_literal{val=true},succ=To,fail=To}
             end
     end;
 eval_terminator(#b_ret{}, _Bs, _St) ->
     none.
 
-eval_switch([{Lit,Lbl}|T], Arg, St, Fail) ->
-    case eval_rel_op({bif,'=:='}, [Arg,Lit], St) of
-        none ->
-            %% This label could be reached.
-            eval_switch(T, Arg, St, none);
-        #b_literal{val=false} ->
-            %% This branch will never be taken.
-            eval_switch(T, Arg, St, Fail);
-        #b_literal{val=true} ->
+eval_switch(List, Arg, #st{rel_op={_,Arg,_}=PrevOp}, Fail) ->
+    %% There is a previous relational operator testing the same variable.
+    %% Optimization may be possible.
+    eval_switch_1(List, Arg, PrevOp, Fail);
+eval_switch(_, _, _, _) ->
+    %% There is either no previous relational operator, or it tests
+    %% a different variable. Nothing to optimize.
+    none.
+
+eval_switch_1([{Lit,Lbl}|T], Arg, PrevOp, Fail) ->
+    RelOp = {'=:=',Arg,Lit},
+    case will_succeed(PrevOp, RelOp) of
+        yes ->
             %% Success. This branch will always be taken.
-            Lbl
+            Lbl;
+        no ->
+            %% This branch will never be taken.
+            eval_switch_1(T, Arg, PrevOp, Fail);
+        maybe ->
+            %% This label could be reached.
+            eval_switch_1(T, Arg, PrevOp, none)
     end;
-eval_switch([], _Arg, _St, Fail) ->
+eval_switch_1([], _Arg, _PrevOp, Fail) ->
     %% Fail is now either the failure label or 'none'.
     Fail.
 
-- 
cgit v1.2.3