Optimize ssa_opt_sink for huge functions

The ssa_opt_sink optimization of beam_ssa_opt could get very slow for certain huge functions. 9a190cae9bd7 partly addressed this issue by terminating the optimization early if there happened to be no get_tuple_element instructions at all in the function. This commit addresses the issue more directly by making the dominator calculation in beam_ssa:dominators/1 more efficient. The same algorithm as before is used, but it is implemented in a more efficient way based on the ideas in "A Simple, Fast Dominance Algorithm" (http://www.hipersoft.rice.edu/grads/publications/dom14.pdf). As well as being more efficient, the new implementation also gives an explicit representation of the dominator tree, which makes it possible to simplify and optimize the ssa_opt_sink optimization.
author: Björn Gustavsson <[email protected]> 2019-02-05 02:46:45 +0100
committer: Björn Gustavsson <[email protected]> 2019-02-06 10:34:23 +0100
commit: 077cfc59f1dca89aa1231ce291100aa4b33c50e0 (patch)
tree: 3f0eb95adda9f0432b18a7021a3d97599a34fb00 /lib/compiler/src
parent: e82637d6f078409db27449383df5342294df0b63 (diff)
download: otp-077cfc59f1dca89aa1231ce291100aa4b33c50e0.tar.gz
otp-077cfc59f1dca89aa1231ce291100aa4b33c50e0.tar.bz2
otp-077cfc59f1dca89aa1231ce291100aa4b33c50e0.zip
4 files changed, 100 insertions, 69 deletions
diff --git a/lib/compiler/src/beam_ssa.erl b/lib/compiler/src/beam_ssa.erl
index 9c29c98064..0f662d851d 100644
--- a/lib/compiler/src/beam_ssa.erl
+++ b/lib/compiler/src/beam_ssa.erl
@@ -23,7 +23,7 @@
 -export([add_anno/3,get_anno/2,get_anno/3,
          clobbers_xregs/1,def/2,def_used/2,
          definitions/1,
-         dominators/1,
+         dominators/1,common_dominators/3,
          flatmapfold_instrs_rpo/4,
          fold_po/3,fold_po/4,fold_rpo/3,fold_rpo/4,
          fold_instrs_rpo/4,
@@ -85,7 +85,8 @@
 -type anno() :: #{atom() := any()}.
 
 -type block_map() :: #{label():=b_blk()}.
--type dominator_map() :: #{label():=ordsets:ordset(label())}.
+-type dominator_map() :: #{label():=[label()]}.
+-type numbering_map() :: #{label():=non_neg_integer()}.
 -type usage_map() :: #{b_var():=[{label(),b_set() | terminator()}]}.
 -type definition_map() :: #{b_var():=b_set()}.
 -type rename_map() :: #{b_var():=value()}.
@@ -327,18 +328,41 @@ def_used(Ls, Blocks) ->
     Preds = cerl_sets:from_list(Top),
     def_used_1(Blks, Preds, [], []).
 
+%% dominators(BlockMap) -> {Dominators,Numbering}.
+%%  Calculate the dominator tree, returning a map where each entry
+%%  in the map is a list that gives the path from that block to
+%%  the top of the dominator tree. (Note that the suffixes of the
+%%  paths are shared with each other, which makes the representation
+%%  of the dominator tree highly memory-efficient.)
+%%
+%%  The implementation is based on:
+%%
+%%     http://www.hipersoft.rice.edu/grads/publications/dom14.pdf
+%%     Cooper, Keith D.; Harvey, Timothy J; Kennedy, Ken (2001).
+%%        A Simple, Fast Dominance Algorithm.
+
 -spec dominators(Blocks) -> Result when
       Blocks :: block_map(),
-      Result :: dominator_map().
-
+      Result :: {dominator_map(), numbering_map()}.
 dominators(Blocks) ->
     Preds = predecessors(Blocks),
     Top0 = rpo(Blocks),
-    Top = [{L,map_get(L, Preds)} || L <- Top0],
+    Df = maps:from_list(number(Top0, 0)),
+    [{0,[]}|Top] = [{L,map_get(L, Preds)} || L <- Top0],
 
     %% The flow graph for an Erlang function is reducible, and
     %% therefore one traversal in reverse postorder is sufficient.
-    iter_dominators(Top, #{}).
+    Acc = #{0=>[0]},
+    {dominators_1(Top, Df, Acc),Df}.
+
+%% common_dominators([Label], Dominators, Numbering) -> [Label].
+%%  Calculate the common dominators for the given list of blocks,
+%%  using Dominators and Numbering as returned from dominators/1.
+
+-spec common_dominators([label()], dominator_map(), numbering_map()) -> [label()].
+common_dominators(Ls, Dom, Numbering) ->
+    Doms = [map_get(L, Dom) || L <- Ls],
+    dom_intersection(Doms, Numbering).
 
 -spec fold_instrs_rpo(Fun, From, Acc0, Blocks) -> any() when
       Fun :: fun((b_blk()|terminator(), any()) -> any()),
@@ -657,14 +681,37 @@ def_is([#b_set{dst=Dst}|Is], Def) ->
     def_is(Is, [Dst|Def]);
 def_is([], Def) -> Def.
 
-iter_dominators([{0,[]}|Ls], _Doms) ->
-    Dom = [0],
-    iter_dominators(Ls, #{0=>Dom});
-iter_dominators([{L,Preds}|Ls], Doms) ->
+dominators_1([{L,Preds}|Ls], Df, Doms) ->
     DomPreds = [map_get(P, Doms) || P <- Preds, is_map_key(P, Doms)],
-    Dom = ordsets:add_element(L, ordsets:intersection(DomPreds)),
-    iter_dominators(Ls, Doms#{L=>Dom});
-iter_dominators([], Doms) -> Doms.
+    Dom = [L|dom_intersection(DomPreds, Df)],
+    dominators_1(Ls, Df, Doms#{L=>Dom});
+dominators_1([], _Df, Doms) -> Doms.
+
+dom_intersection([S], _Df) ->
+    S;
+dom_intersection([S|Ss], Df) ->
+    dom_intersection(S, Ss, Df).
+
+dom_intersection(S1, [S2|Ss], Df) ->
+    dom_intersection(dom_intersection_1(S1, S2, Df), Ss, Df);
+dom_intersection(S, [], _Df) -> S.
+
+dom_intersection_1([E1|Es1]=Set1, [E2|Es2]=Set2, Df) ->
+    %% Blocks are numbered in the order they are found in
+    %% reverse postorder.
+    #{E1:=Df1,E2:=Df2} = Df,
+    if Df1 > Df2 ->
+            dom_intersection_1(Es1, Set2, Df);
+       Df2 > Df1 ->
+            dom_intersection_1(Es2, Set1, Df);  %switch arguments!
+       true ->                                  %Set1 == Set2
+            %% The common suffix of the sets is the intersection.
+            Set1
+    end.
+
+number([L|Ls], N) ->
+    [{L,N}|number(Ls, N+1)];
+number([], _) -> [].
 
 fold_rpo_1([L|Ls], Fun, Blocks, Acc0) ->
     Block = map_get(L, Blocks),
diff --git a/lib/compiler/src/beam_ssa_bsm.erl b/lib/compiler/src/beam_ssa_bsm.erl
index 466337db0e..382e6f635e 100644
--- a/lib/compiler/src/beam_ssa_bsm.erl
+++ b/lib/compiler/src/beam_ssa_bsm.erl
@@ -300,7 +300,8 @@ get_fa(#b_function{ anno = Anno }) ->
                promotions = #{} :: promotion_map() }).
 
 alias_matched_binaries(Blocks0, Counter, AliasMap) when AliasMap =/= #{} ->
-    State0 = #amb{ dominators = beam_ssa:dominators(Blocks0),
+    {Dominators, _} = beam_ssa:dominators(Blocks0),
+    State0 = #amb{ dominators = Dominators,
                    match_aliases = AliasMap,
                    cnt = Counter },
     {Blocks, State} = beam_ssa:mapfold_blocks_rpo(fun amb_1/3, [0], State0,
@@ -347,7 +348,7 @@ amb_get_alias(#b_var{}=Arg, Lbl, State) ->
             %% Our context may not have been created yet, so we skip assigning
             %% an alias unless the given block is among our dominators.
             Dominators = maps:get(Lbl, State#amb.dominators),
-            case ordsets:is_element(AliasAfter, Dominators) of
+            case member(AliasAfter, Dominators) of
                 true -> amb_create_alias(Arg, Context, Lbl, State);
                 false -> {Arg, State}
             end;
@@ -444,6 +445,7 @@ combine_matches({Fs0, ModInfo}) ->
 combine_matches(#b_function{bs=Blocks0,cnt=Counter0}=F, ModInfo) ->
     case funcinfo_get(F, has_bsm_ops, ModInfo) of
         true ->
+            {Dominators, _} = beam_ssa:dominators(Blocks0),
             {Blocks1, State} =
                 beam_ssa:mapfold_blocks_rpo(
                   fun(Lbl, #b_blk{is=Is0}=Block0, State0) ->
@@ -451,7 +453,7 @@ combine_matches(#b_function{bs=Blocks0,cnt=Counter0}=F, ModInfo) ->
                           {Block0#b_blk{is=Is}, State}
                   end, [0],
                   #cm{ definitions = beam_ssa:definitions(Blocks0),
-                       dominators = beam_ssa:dominators(Blocks0),
+                       dominators = Dominators,
                        blocks = Blocks0 },
                   Blocks0),
 
@@ -491,7 +493,7 @@ cm_handle_priors(Src, DstCtx, Bool, Acc, MatchSeq, Lbl, State0) ->
                         %% dominate us.
                         Dominators = maps:get(Lbl, State0#cm.dominators, []),
                         [Ctx || {ValidAfter, Ctx} <- Priors,
-                                ordsets:is_element(ValidAfter, Dominators)];
+                                member(ValidAfter, Dominators)];
                     error ->
                         []
                 end,
diff --git a/lib/compiler/src/beam_ssa_opt.erl b/lib/compiler/src/beam_ssa_opt.erl
index ca5eefe4fc..533cc04aac 100644
--- a/lib/compiler/src/beam_ssa_opt.erl
+++ b/lib/compiler/src/beam_ssa_opt.erl
@@ -2021,7 +2021,7 @@ do_ssa_opt_sink(Linear, Defs, #st{ssa=Blocks0}=St) ->
     Used = used_blocks(Linear, Defs, []),
 
     %% Calculate dominators.
-    Dom0 = beam_ssa:dominators(Blocks0),
+    {Dom,Numbering} = beam_ssa:dominators(Blocks0),
 
     %% It is not safe to move get_tuple_element instructions to blocks
     %% that begin with certain instructions. It is also unsafe to move
@@ -2029,20 +2029,10 @@ do_ssa_opt_sink(Linear, Defs, #st{ssa=Blocks0}=St) ->
     %% unsafe moves, pretend that the unsuitable blocks are not
     %% dominators.
     Unsuitable = unsuitable(Linear, Blocks0),
-    Dom = case gb_sets:is_empty(Unsuitable) of
-              true ->
-                  Dom0;
-              false ->
-                  F = fun(_, DomBy) ->
-                              [L || L <- DomBy,
-                                    not gb_sets:is_element(L, Unsuitable)]
-                      end,
-                  maps:map(F, Dom0)
-          end,
 
     %% Calculate new positions for get_tuple_element instructions. The new
     %% position is a block that dominates all uses of the variable.
-    DefLoc = new_def_locations(Used, Defs, Dom),
+    DefLoc = new_def_locations(Used, Defs, Dom, Numbering, Unsuitable),
 
     %% Now move all suitable get_tuple_element instructions to their
     %% new blocks.
@@ -2136,50 +2126,42 @@ unsuitable_loop_1([P|Ps], Blocks, Predecessors, Acc0) ->
     end;
 unsuitable_loop_1([], _, _, Acc) -> Acc.
 
-%% new_def_locations([{Variable,[UsedInBlock]}|Vs], Defs, Dominators) ->
-%%          [{Variable,NewDefinitionBlock}]
-%%  Calculate new locations for get_tuple_element instructions. For each
-%%  variable, the new location is a block that dominates all uses of
-%%  variable and as near to the uses of as possible. If no such block
-%%  distinct from the block where the instruction currently is, the
-%%  variable will not be included in the result list.
+%% new_def_locations([{Variable,[UsedInBlock]}|Vs], Defs,
+%%                   Dominators, Numbering, Unsuitable) ->
+%%  [{Variable,NewDefinitionBlock}]
+%%
+%%  Calculate new locations for get_tuple_element instructions. For
+%%  each variable, the new location is a block that dominates all uses
+%%  of the variable and as near to the uses as possible.
 
-new_def_locations([{V,UsedIn}|Vs], Defs, Dom) ->
+new_def_locations([{V,UsedIn}|Vs], Defs, Dom, Numbering, Unsuitable) ->
     DefIn = map_get(V, Defs),
-    case common_dom(UsedIn, DefIn, Dom) of
-        [] ->
-            new_def_locations(Vs, Defs, Dom);
-        [_|_]=BetterDef ->
-            L = most_dominated(BetterDef, Dom),
-            [{V,L}|new_def_locations(Vs, Defs, Dom)]
-    end;
-new_def_locations([], _, _) -> [].
-
-common_dom([L|Ls], DefIn, Dom) ->
-    DomBy0 = map_get(L, Dom),
-    DomBy = ordsets:subtract(DomBy0, map_get(DefIn, Dom)),
-    common_dom_1(Ls, Dom, DomBy).
-
-common_dom_1(_, _, []) ->
-    [];
-common_dom_1([L|Ls], Dom, [_|_]=DomBy0) ->
-    DomBy1 = map_get(L, Dom),
-    DomBy = ordsets:intersection(DomBy0, DomBy1),
-    common_dom_1(Ls, Dom, DomBy);
-common_dom_1([], _, DomBy) -> DomBy.
-
-most_dominated([L|Ls], Dom) ->
-    most_dominated(Ls, L, map_get(L, Dom), Dom).
-
-most_dominated([L|Ls], L0, DomBy, Dom) ->
-    case member(L, DomBy) of
+    Common = common_dominator(UsedIn, Dom, Numbering, Unsuitable),
+    case member(Common, map_get(DefIn, Dom)) of
         true ->
-            most_dominated(Ls, L0, DomBy, Dom);
+            %% The common dominator is either DefIn or an
+            %% ancestor of DefIn.
+            new_def_locations(Vs, Defs, Dom, Numbering, Unsuitable);
         false ->
-            most_dominated(Ls, L, map_get(L, Dom), Dom)
+            %% We have found a suitable descendant of DefIn,
+            %% to which the get_tuple_element instruction can
+            %% be sunk.
+            [{V,Common}|new_def_locations(Vs, Defs, Dom, Numbering, Unsuitable)]
     end;
-most_dominated([], L, _, _) -> L.
+new_def_locations([], _, _, _, _) -> [].
 
+common_dominator(Ls0, Dom, Numbering, Unsuitable) ->
+    [Common|_] = beam_ssa:common_dominators(Ls0, Dom, Numbering),
+    case gb_sets:is_member(Common, Unsuitable) of
+        true ->
+            %% It is not allowed to place the instruction here. Try
+            %% to find another suitable dominating block by going up
+            %% one step in the dominator tree.
+            [Common,OneUp|_] = map_get(Common, Dom),
+            common_dominator([OneUp], Dom, Numbering, Unsuitable);
+        false ->
+            Common
+    end.
 
 %% Move get_tuple_element instructions to their new locations.
 
diff --git a/lib/compiler/src/beam_ssa_pre_codegen.erl b/lib/compiler/src/beam_ssa_pre_codegen.erl
index 274f78052d..b4a7517841 100644
--- a/lib/compiler/src/beam_ssa_pre_codegen.erl
+++ b/lib/compiler/src/beam_ssa_pre_codegen.erl
@@ -874,7 +874,7 @@ fix_tuples(#st{ssa=Blocks0,cnt=Count0}=St) ->
 %%   a stack frame or set up a stack frame with a different size.
 
 place_frames(#st{ssa=Blocks}=St) ->
-    Doms = beam_ssa:dominators(Blocks),
+    {Doms,_} = beam_ssa:dominators(Blocks),
     Ls = beam_ssa:rpo(Blocks),
     Tried = gb_sets:empty(),
     Frames0 = [],
@@ -1001,7 +1001,7 @@ phi_predecessors(L, Blocks) ->
 
 is_dominated_by(L, DomBy, Doms) ->
     DominatedBy = map_get(L, Doms),
-    ordsets:is_element(DomBy, DominatedBy).
+    member(DomBy, DominatedBy).
 
 %% need_frame(#b_blk{}) -> true|false.
 %%  Test whether any of the instructions in the block requires a stack frame.
author	Björn Gustavsson <[email protected]>	2019-02-05 02:46:45 +0100
committer	Björn Gustavsson <[email protected]>	2019-02-06 10:34:23 +0100
commit	077cfc59f1dca89aa1231ce291100aa4b33c50e0 (patch)
tree	3f0eb95adda9f0432b18a7021a3d97599a34fb00 /lib/compiler/src
parent	e82637d6f078409db27449383df5342294df0b63 (diff)
download	otp-077cfc59f1dca89aa1231ce291100aa4b33c50e0.tar.gz otp-077cfc59f1dca89aa1231ce291100aa4b33c50e0.tar.bz2 otp-077cfc59f1dca89aa1231ce291100aa4b33c50e0.zip