From 0009fc378109f870472ded41d29c5d81febd5990 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sat, 3 Nov 2018 07:12:10 +0100
Subject: Add get_map_element to beam_ssa:no_side_effect/1

The `get_map_element` instruction has no side effects, and should be
removed if its value is not used.
---
 lib/compiler/src/beam_ssa.erl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lib/compiler/src/beam_ssa.erl b/lib/compiler/src/beam_ssa.erl
index c5e23d2ae0..b491e340b7 100644
--- a/lib/compiler/src/beam_ssa.erl
+++ b/lib/compiler/src/beam_ssa.erl
@@ -194,6 +194,7 @@ no_side_effect(#b_set{op=Op}) ->
         extract -> true;
         get_hd -> true;
         get_tl -> true;
+        get_map_element -> true;
         get_tuple_element -> true;
         has_map_field -> true;
         is_nonempty_list -> true;
-- 
cgit v1.2.3


From 667e0e1213f69344a57c2eb0427d36cd240e16a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Mon, 5 Nov 2018 07:35:02 +0100
Subject: beam_ssa_codegen: Improve optimization of allocate instructions

There could be `allocate_zero` instructions where `allocate` would
suffice or superfluous `init` instructions because all possible
initializations of Y registers were not taken into account.

While at it, also add some more comments.
---
 lib/compiler/src/beam_ssa_codegen.erl | 39 +++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/lib/compiler/src/beam_ssa_codegen.erl b/lib/compiler/src/beam_ssa_codegen.erl
index 3c14062d0b..d3facc5911 100644
--- a/lib/compiler/src/beam_ssa_codegen.erl
+++ b/lib/compiler/src/beam_ssa_codegen.erl
@@ -747,7 +747,16 @@ need_live_anno(Op) ->
     end.
 
 %%%
-%%% Add annotations for defined Y registers.
+%%% Add the following annotations for Y registers:
+%%%
+%%%   def_yregs   An ordset with variables that refer to live Y registers.
+%%%               That is, Y registers that that have been killed
+%%%               are not included. This annotation is added to all
+%%%               instructions that require Y registers to be initialized.
+%%%
+%%%   kill_yregs  This annotation is added to call instructions. It is
+%%%               an ordset containing variables referring to Y registers
+%%%               that will no longer be used after the call instruction.
 %%%
 
 defined(Linear, #cg{regs=Regs}) ->
@@ -863,13 +872,35 @@ opt_allocate(Linear, #cg{regs=Regs}) ->
 
 opt_allocate_1([{L,#cg_blk{is=[#cg_alloc{stack=Stk}=I0|Is]}=Blk0}|Bs]=Bs0, Regs)
   when is_integer(Stk) ->
-    Yregs = opt_alloc_def(Bs0, gb_sets:singleton(L), []),
-    I = I0#cg_alloc{def_yregs=Yregs},
-    [{L,Blk0#cg_blk{is=[I|Is]}}|opt_allocate_1(Bs, Regs)];
+    %% Collect the variables that are initialized by copy
+    %% instruction in this block.
+    case ordsets:from_list(opt_allocate_defs(Is, Regs)) of
+        Yregs when length(Yregs) =:= Stk ->
+            %% Those copy instructions are sufficient to fully
+            %% initialize the stack frame.
+            I = I0#cg_alloc{def_yregs=Yregs},
+            [{L,Blk0#cg_blk{is=[I|Is]}}|opt_allocate_1(Bs, Regs)];
+        Yregs0 ->
+            %% Determine a conservative approximation of the Y
+            %% registers that are guaranteed to be initialized by all
+            %% successors of this block, and to it add the variables
+            %% initialized by copy instructions in this block.
+            Yregs1 = opt_alloc_def(Bs0, gb_sets:singleton(L), []),
+            Yregs = ordsets:union(Yregs0, Yregs1),
+            I = I0#cg_alloc{def_yregs=Yregs},
+            [{L,Blk0#cg_blk{is=[I|Is]}}|opt_allocate_1(Bs, Regs)]
+    end;
 opt_allocate_1([B|Bs], Regs) ->
     [B|opt_allocate_1(Bs, Regs)];
 opt_allocate_1([], _) -> [].
 
+opt_allocate_defs([#cg_set{op=copy,dst=Dst}|Is], Regs) ->
+    case is_yreg(Dst, Regs) of
+        true -> [Dst|opt_allocate_defs(Is, Regs)];
+        false -> []
+    end;
+opt_allocate_defs(_, _Regs) -> [].
+
 opt_alloc_def([{L,#cg_blk{is=Is,last=Last}}|Bs], Ws0, Def0) ->
     case gb_sets:is_member(L, Ws0) of
         false ->
-- 
cgit v1.2.3


From b4595a670b159c802489d94d8537a48567967927 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Sun, 18 Nov 2018 06:21:18 +0100
Subject: beam_ssa_pre_codegen: Improve reuse of Y registers

Enhance the copy_retval/1 optimization to allow Y registers to be
reused in more circumstances. Reusing Y register can often reduce
the size of the stack frame.
---
 lib/compiler/src/beam_ssa_pre_codegen.erl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/lib/compiler/src/beam_ssa_pre_codegen.erl b/lib/compiler/src/beam_ssa_pre_codegen.erl
index c60c6da9ea..32232e9b9f 100644
--- a/lib/compiler/src/beam_ssa_pre_codegen.erl
+++ b/lib/compiler/src/beam_ssa_pre_codegen.erl
@@ -1478,6 +1478,10 @@ copy_retval_is([#b_set{op=put_tuple_elements,args=Args0}=I0], false, _Yregs,
            Copy, Count, Acc) ->
     I = I0#b_set{args=copy_sub_args(Args0, Copy)},
     {reverse(Acc, [I|acc_copy([], Copy)]),Count};
+copy_retval_is([#b_set{op=Op}=I0], false, Yregs, Copy, Count0, Acc0)
+  when Op =:= call; Op =:= make_fun ->
+    {I,Count,Acc} = place_retval_copy(I0, Yregs, Copy, Count0, Acc0),
+    {reverse(Acc, [I]),Count};
 copy_retval_is([#b_set{}]=Is, false, _Yregs, Copy, Count, Acc) ->
     {reverse(Acc, acc_copy(Is, Copy)),Count};
 copy_retval_is([#b_set{},#b_set{op=succeeded}]=Is, false, _Yregs, Copy, Count, Acc) ->
-- 
cgit v1.2.3


From e79e5f194577d6c5b289415ec84338f70d2486db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= <bjorn@erlang.org>
Date: Mon, 29 Oct 2018 16:31:00 +0100
Subject: Sort move instructions on the Y register

Sort sequences of `move` instructions on the Y register.

When moving from X registers to Y registers, having the instructions
sorted on Y registers give the loader more opportunities to use
`move_window{3,4,5}` instructions. For examples, the following five
instructions:

    move_xy x(2) y(0)
    move_xy x(1) y(1)
    move_xy x(0) y(2)
    move_xy x(5) y(3)
    move_xy x(4) y(4)

can be replaced with:

    move_window5_xxxxxy x(2) x(1) x(0) x(5) x(4) y(0)

When the Y registers are not ordered so that `move_window5` can be
used, the loader would typically combine the first three moves to a
`move3_xyxyxy` instruction and the last two moves to a
`move2_par_xyxy` instruction.

When moving from Y registers to X registers, sorting on the Y
registers could potentially be more cache-friendly. It could also
be worthwhile investigating a new `move_window` instruction in
the BEAM interpreter that could move values from contiguous Y registers
to X registers.

Note that `scripts/diffable` can generate diffable dissambly files for
the loaded BEAM code:

    $ scripts/diffable --dis 0
    $ scripts/diffable --dis 1
    $ diff -u 0 1
---
 lib/compiler/src/beam_block.erl | 38 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/lib/compiler/src/beam_block.erl b/lib/compiler/src/beam_block.erl
index 9d8d5b2b0c..707974b2c1 100644
--- a/lib/compiler/src/beam_block.erl
+++ b/lib/compiler/src/beam_block.erl
@@ -22,7 +22,7 @@
 -module(beam_block).
 
 -export([module/2]).
--import(lists, [reverse/1,splitwith/2]).
+-import(lists, [keysort/2,reverse/1,splitwith/2]).
 
 -spec module(beam_utils:module_code(), [compile:option()]) ->
                     {'ok',beam_utils:module_code()}.
@@ -53,7 +53,8 @@ blockify([I|Is0]=IsAll, Acc) ->
     case collect(I) of
 	error -> blockify(Is0, [I|Acc]);
 	Instr when is_tuple(Instr) ->
-	    {Block,Is} = collect_block(IsAll),
+            {Block0,Is} = collect_block(IsAll),
+            Block = sort_moves(Block0),
 	    blockify(Is, [{block,Block}|Acc])
     end;
 blockify([], Acc) -> reverse(Acc).
@@ -117,3 +118,36 @@ embed_lines([{block,B1},{line,_}=Line|T], Acc) ->
 embed_lines([I|Is], Acc) ->
     embed_lines(Is, [I|Acc]);
 embed_lines([], Acc) -> Acc.
+
+%% sort_moves([Instruction]) -> [Instruction].
+%%  Sort move instructions on the Y register to give the loader
+%%  more opportunities for combining instructions.
+
+sort_moves([{set,[{x,_}],[{y,_}],move}=I|Is0]) ->
+    {Moves,Is} = sort_moves_1(Is0, x, y, [I]),
+    Moves ++ sort_moves(Is);
+sort_moves([{set,[{y,_}],[{x,_}],move}=I|Is0]) ->
+    {Moves,Is} = sort_moves_1(Is0, y, x, [I]),
+    Moves ++ sort_moves(Is);
+sort_moves([I|Is]) ->
+    [I|sort_moves(Is)];
+sort_moves([]) -> [].
+
+sort_moves_1([{set,[{x,0}],[_],move}=I|Is], _DTag, _STag, Acc) ->
+    %% The loader sometimes combines a move to x0 with the
+    %% instruction that follows, producing, for example, a move_call
+    %% instruction. Therefore, we don't want include this move
+    %% instruction in the sorting.
+    {sort_on_yreg(Acc)++[I],Is};
+sort_moves_1([{set,[{DTag,_}],[{STag,_}],move}=I|Is], DTag, STag, Acc) ->
+    sort_moves_1(Is, DTag, STag, [I|Acc]);
+sort_moves_1(Is, _DTag, _STag, Acc) ->
+    {sort_on_yreg(Acc),Is}.
+
+sort_on_yreg([{set,[Dst],[Src],move}|_]=Moves) ->
+    case {Dst,Src} of
+        {{y,_},{x,_}} ->
+            keysort(2, Moves);
+        {{x,_},{y,_}} ->
+            keysort(3, Moves)
+    end.
-- 
cgit v1.2.3