From e547ffd572e41178e6b513a4a9b84fdf5c557b98 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Thu, 2 Mar 2017 17:10:25 +0100
Subject: hipe_x86_subst: Fix bug with zero-arity #fp_unop{}

---
 lib/hipe/x86/hipe_x86_subst.erl | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/hipe/x86/hipe_x86_subst.erl b/lib/hipe/x86/hipe_x86_subst.erl
index 7b5fb1352b..102207ad7e 100644
--- a/lib/hipe/x86/hipe_x86_subst.erl
+++ b/lib/hipe/x86/hipe_x86_subst.erl
@@ -49,10 +49,11 @@ insn_temps(SubstTemp, I) ->
     #movzx   {src=S, dst=D}  -> I#movzx   {src=O(S), dst=O(D)};
     #shift   {src=S, dst=D}  -> I#shift   {src=O(S), dst=O(D)};
     #test    {src=S, dst=D}  -> I#test    {src=O(S), dst=O(D)};
-    #fp_unop{arg=A} -> I#fp_unop{arg=O(A)};
-    #move64 {dst=D} -> I#move64 {dst=O(D)};
-    #push   {src=S} -> I#push   {src=O(S)};
-    #pop    {dst=D} -> I#pop    {dst=O(D)};
+    #fp_unop{arg=[]} -> I;
+    #fp_unop{arg=A}  -> I#fp_unop{arg=O(A)};
+    #move64 {dst=D}  -> I#move64 {dst=O(D)};
+    #push   {src=S}  -> I#push   {src=O(S)};
+    #pop    {dst=D}  -> I#pop    {dst=O(D)};
     #jmp_switch{temp=T, jtab=J} ->
       I#jmp_switch{temp=O(T), jtab=jtab_temps(SubstTemp, J)};
     #pseudo_call{'fun'=F} ->
-- 
cgit v1.2.3


From f9263b9173905d4e7a53350d4f374c5020c52738 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Thu, 16 Mar 2017 15:50:09 +0100
Subject: hipe: Extract disjoint sets to its own module

---
 lib/hipe/main/hipe.app.src                  |  1 +
 lib/hipe/regalloc/hipe_regalloc_prepass.erl | 71 +++++-------------------
 lib/hipe/util/Makefile                      |  2 +-
 lib/hipe/util/hipe_dsets.erl                | 84 +++++++++++++++++++++++++++++
 4 files changed, 98 insertions(+), 60 deletions(-)
 create mode 100644 lib/hipe/util/hipe_dsets.erl

diff --git a/lib/hipe/main/hipe.app.src b/lib/hipe/main/hipe.app.src
index af2c02006d..4c3f147d40 100644
--- a/lib/hipe/main/hipe.app.src
+++ b/lib/hipe/main/hipe.app.src
@@ -83,6 +83,7 @@
 	     hipe_digraph,
 	     hipe_dominators,
 	     hipe_dot,
+	     hipe_dsets,
 	     hipe_gen_cfg,
 	     hipe_gensym,
 	     hipe_graph_coloring_regalloc,
diff --git a/lib/hipe/regalloc/hipe_regalloc_prepass.erl b/lib/hipe/regalloc/hipe_regalloc_prepass.erl
index e212420ad2..5024840237 100644
--- a/lib/hipe/regalloc/hipe_regalloc_prepass.erl
+++ b/lib/hipe/regalloc/hipe_regalloc_prepass.erl
@@ -483,8 +483,8 @@ merge_pointless_splits_1([], _ScanBBs, DSets, Acc) -> {Acc, DSets};
 merge_pointless_splits_1([P={_,{single,_}}|Ps], ScanBBs, DSets, Acc) ->
   merge_pointless_splits_1(Ps, ScanBBs, DSets, [P|Acc]);
 merge_pointless_splits_1([P0={L,{split,_,_}}|Ps], ScanBBs, DSets0, Acc) ->
-  {EntryRoot, DSets1} = dsets_find({entry,L}, DSets0),
-  {ExitRoot,  DSets}  = dsets_find({exit,L},  DSets1),
+  {EntryRoot, DSets1} = hipe_dsets:find({entry,L}, DSets0),
+  {ExitRoot,  DSets}  = hipe_dsets:find({exit,L},  DSets1),
   case EntryRoot =:= ExitRoot of
     false -> merge_pointless_splits_1(Ps, ScanBBs, DSets, [P0|Acc]);
     true ->
@@ -501,7 +501,7 @@ merge_pointless_splits_1([P0={L,{split,_,_}}|Ps], ScanBBs, DSets0, Acc) ->
 
 -spec merge_small_parts(bb_dsets()) -> {bb_dsets_rllist(), bb_dsets()}.
 merge_small_parts(DSets0) ->
-  {RLList, DSets1} = dsets_to_rllist(DSets0),
+  {RLList, DSets1} = hipe_dsets:to_rllist(DSets0),
   RLLList = [{R, length(Elems), Elems} || {R, Elems} <- RLList],
   merge_small_parts_1(RLLList, DSets1, []).
 
@@ -518,8 +518,8 @@ merge_small_parts_1([Fst,{R, L, Es}|Ps], DSets, Acc)
   merge_small_parts_1([Fst|Ps], DSets, [{R,Es}|Acc]);
 merge_small_parts_1([{R1,L1,Es1},{R2,L2,Es2}|Ps], DSets0, Acc) ->
   ?ASSERT(L1 < ?TUNE_TOO_FEW_BBS andalso L2 < ?TUNE_TOO_FEW_BBS),
-  DSets1 = dsets_union(R1, R2, DSets0),
-  {R, DSets} = dsets_find(R1, DSets1),
+  DSets1 = hipe_dsets:union(R1, R2, DSets0),
+  {R, DSets} = hipe_dsets:find(R1, DSets1),
   merge_small_parts_1([{R,L2+L1,Es2++Es1}|Ps], DSets, Acc).
 
 %% @doc Partition an ordering over BBs into subsequences for the dsets that
@@ -531,8 +531,8 @@ part_order(Lbs, DSets) -> part_order(Lbs, DSets, #{}).
 
 part_order([], DSets, Acc) -> {Acc, DSets};
 part_order([L|Ls], DSets0, Acc0) ->
-  {EntryRoot, DSets1} = dsets_find({entry,L}, DSets0),
-  {ExitRoot,  DSets2} = dsets_find({exit,L},  DSets1),
+  {EntryRoot, DSets1} = hipe_dsets:find({entry,L}, DSets0),
+  {ExitRoot,  DSets2} = hipe_dsets:find({exit,L},  DSets1),
   Acc1 = map_append(EntryRoot, L, Acc0),
   %% Only include the label once if both entry and exit is in same partition
   Acc2 = case EntryRoot =:= ExitRoot of
@@ -558,72 +558,25 @@ map_append(Key, Elem, Map) ->
 %% split point, and one from the end to the last split point.
 
 -type bb_dset_key() :: {entry | exit, label()}.
--type bb_dsets() :: dsets(bb_dset_key()).
+-type bb_dsets() :: hipe_dsets:dsets(bb_dset_key()).
 -type bb_dsets_rllist() :: [{bb_dset_key(), [bb_dset_key()]}].
 
 -spec initial_dsets(target_cfg(), module(), target_context()) -> bb_dsets().
 initial_dsets(CFG, TgtMod, TgtCtx) ->
   Labels = TgtMod:labels(CFG, TgtCtx),
-  DSets0 = dsets_new(lists:append([[{entry,L},{exit,L}] || L <- Labels])),
+  DSets0 = hipe_dsets:new(lists:append([[{entry,L},{exit,L}] || L <- Labels])),
   Edges = lists:append([[{L, S} || S <- hipe_gen_cfg:succ(CFG, L)]
 			|| L <- Labels]),
-  lists:foldl(fun({X, Y}, DS) -> dsets_union({exit,X}, {entry,Y}, DS) end,
+  lists:foldl(fun({X, Y}, DS) -> hipe_dsets:union({exit,X}, {entry,Y}, DS) end,
 	      DSets0, Edges).
 
 -spec join_whole_blocks(part_bb_list(), bb_dsets()) -> bb_dsets().
 join_whole_blocks(PartBBList, DSets0) ->
-  lists:foldl(fun({L, {single, _}}, DS) -> dsets_union({entry,L}, {exit,L}, DS);
+  lists:foldl(fun({L, {single, _}}, DS) ->
+		  hipe_dsets:union({entry,L}, {exit,L}, DS);
 		 ({_, {split, _, _}}, DS) -> DS
 	      end, DSets0, PartBBList).
 
-%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-%% The disjoint set forests data structure, for elements of arbitrary types.
-%% Note that the find operation mutates the set.
-%%
-%% We could do this more efficiently if we restricted the elements to integers,
-%% and used the (mutable) hipe arrays. For arbitrary terms ETS could be used,
-%% for a persistent interface (which isn't that nice when even accessors return
-%% modified copies), the array module could be used.
--type dsets(X) :: #{X => {node, X} | {root, non_neg_integer()}}.
-
--spec dsets_new([E]) -> dsets(E).
-dsets_new(Elems) -> maps:from_list([{E,{root,0}} || E <- Elems]).
-
--spec dsets_find(E, dsets(E)) -> {E, dsets(E)}.
-dsets_find(E, DS0) ->
-  case DS0 of
-    #{E := {root,_}} -> {E, DS0};
-    #{E := {node,N}} ->
-      case dsets_find(N, DS0) of
-	{N, _}=T -> T;
-	{R, DS1} -> {R, DS1#{E := {node,R}}}
-      end
-   ;_ -> error(badarg, [E, DS0])
-  end.
-
--spec dsets_union(E, E, dsets(E)) -> dsets(E).
-dsets_union(X, Y, DS0) ->
-  {XRoot, DS1} = dsets_find(X, DS0),
-  case dsets_find(Y, DS1) of
-    {XRoot, DS2} -> DS2;
-    {YRoot, DS2} ->
-      #{XRoot := {root,XRR}, YRoot := {root,YRR}} = DS2,
-      if XRR < YRR -> DS2#{XRoot := {node,YRoot}};
-	 XRR > YRR -> DS2#{YRoot := {node,XRoot}};
-	 true -> DS2#{YRoot := {node,XRoot}, XRoot := {root,XRR+1}}
-      end
-  end.
-
--spec dsets_to_rllist(dsets(E)) -> {[{Root::E, Elems::[E]}], dsets(E)}.
-dsets_to_rllist(DS0) ->
-  {Lists, DS} = dsets_to_rllist(maps:keys(DS0), #{}, DS0),
-  {maps:to_list(Lists), DS}.
-
-dsets_to_rllist([], Acc, DS) -> {Acc, DS};
-dsets_to_rllist([E|Es], Acc, DS0) ->
-  {ERoot, DS} = dsets_find(E, DS0),
-  dsets_to_rllist(Es, map_append(ERoot, E, Acc), DS).
-
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %% Third pass
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
diff --git a/lib/hipe/util/Makefile b/lib/hipe/util/Makefile
index 04de7f7823..eeb81ac482 100644
--- a/lib/hipe/util/Makefile
+++ b/lib/hipe/util/Makefile
@@ -48,7 +48,7 @@ HIPE_MODULES = hipe_vectors
 else
 HIPE_MODULES =
 endif
-MODULES = hipe_timing hipe_dot hipe_digraph $(HIPE_MODULES)
+MODULES = hipe_timing hipe_dot hipe_digraph hipe_dsets $(HIPE_MODULES)
 
 HRL_FILES=
 ERL_FILES= $(MODULES:%=%.erl)
diff --git a/lib/hipe/util/hipe_dsets.erl b/lib/hipe/util/hipe_dsets.erl
new file mode 100644
index 0000000000..9492cab0ff
--- /dev/null
+++ b/lib/hipe/util/hipe_dsets.erl
@@ -0,0 +1,84 @@
+%% -*- erlang-indent-level: 2 -*-
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%@doc
+%%	       IMMUTABLE DISJOINT SETS OF ARBITRARY TERMS
+%%
+%% The disjoint set forests data structure, for elements of arbitrary types.
+%% Note that the find operation returns an updated copy of the set.
+%%
+%% We could do this more efficiently if we restricted the elements to integers,
+%% and used the (mutable) hipe arrays. For arbitrary terms ETS could be used,
+%% for a persistent interface (which isn't that nice when even accessors return
+%% modified copies), the array module could be used.
+-module(hipe_dsets).
+
+-export([new/1, find/2, union/3, to_map/1, to_rllist/1]).
+-export_type([dsets/1]).
+
+-opaque dsets(X) :: #{X => {node, X} | {root, non_neg_integer()}}.
+
+-spec new([E]) -> dsets(E).
+new(Elems) -> maps:from_list([{E,{root,0}} || E <- Elems]).
+
+-spec find(E, dsets(E)) -> {E, dsets(E)}.
+find(E, DS0) ->
+  case DS0 of
+    #{E := {root,_}} -> {E, DS0};
+    #{E := {node,N}} ->
+      case find(N, DS0) of
+	{N, _}=T -> T;
+	{R, DS1} -> {R, DS1#{E := {node,R}}}
+      end;
+    _ -> error(badarg, [E, DS0])
+  end.
+
+-spec union(E, E, dsets(E)) -> dsets(E).
+union(X, Y, DS0) ->
+  {XRoot, DS1} = find(X, DS0),
+  case find(Y, DS1) of
+    {XRoot, DS2} -> DS2;
+    {YRoot, DS2} ->
+      #{XRoot := {root,XRR}, YRoot := {root,YRR}} = DS2,
+      if XRR < YRR -> DS2#{XRoot := {node,YRoot}};
+	 XRR > YRR -> DS2#{YRoot := {node,XRoot}};
+	 true -> DS2#{YRoot := {node,XRoot}, XRoot := {root,XRR+1}}
+      end
+  end.
+
+-spec to_map(dsets(E)) -> {#{Elem::E => Root::E}, dsets(E)}.
+to_map(DS) ->
+  to_map(maps:keys(DS), DS, #{}).
+
+to_map([], DS, Acc) -> {Acc, DS};
+to_map([K|Ks], DS0, Acc) ->
+  {KR, DS} = find(K, DS0),
+  to_map(Ks, DS, Acc#{K => KR}).
+
+-spec to_rllist(dsets(E)) -> {[{Root::E, Elems::[E]}], dsets(E)}.
+to_rllist(DS0) ->
+  {Lists, DS} = to_rllist(maps:keys(DS0), #{}, DS0),
+  {maps:to_list(Lists), DS}.
+
+to_rllist([], Acc, DS) -> {Acc, DS};
+to_rllist([E|Es], Acc, DS0) ->
+  {ERoot, DS} = find(E, DS0),
+  to_rllist(Es, map_append(ERoot, E, Acc), DS).
+
+map_append(Key, Elem, Map) ->
+  case Map of
+    #{Key := List} -> Map#{Key := [Elem|List]};
+    #{} -> Map#{Key => [Elem]}
+  end.
-- 
cgit v1.2.3


From 040f6e240a80cb8576ddb3e7b2b49fd7f98aa3dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Thu, 9 Mar 2017 16:59:55 +0100
Subject: hipe: Add basic_edge_cases test case

Two tests are added, primarily aimed at the range splitters.

 * test_float_spills, which exercises the rare case of high floating
   point register pressure, including spill slot move coalescing.
 * test_infinite_loops, which tests that various infinite loops are
   properly compiled and do contain reduction tests (otherwise they
   would permanently hog their scheduler and not notice being sent an
   exit signal).
---
 .../test/basic_SUITE_data/basic_edge_cases.erl     | 142 +++++++++++++++++++++
 1 file changed, 142 insertions(+)
 create mode 100644 lib/hipe/test/basic_SUITE_data/basic_edge_cases.erl

diff --git a/lib/hipe/test/basic_SUITE_data/basic_edge_cases.erl b/lib/hipe/test/basic_SUITE_data/basic_edge_cases.erl
new file mode 100644
index 0000000000..9bf5cf52cd
--- /dev/null
+++ b/lib/hipe/test/basic_SUITE_data/basic_edge_cases.erl
@@ -0,0 +1,142 @@
+%%% -*- erlang-indent-level: 2 -*-
+%%%----------------------------------------------------------------------
+%%% Contains edge-case tests: float register spills and infinite loops
+%%%----------------------------------------------------------------------
+-module(basic_edge_cases).
+
+-export([test/0]).
+
+test() ->
+  ok = test_float_spills(),
+  ok = test_infinite_loops(),
+  ok.
+
+%% Contains more float temps live at a single point than there are float
+%% registers in any backend
+
+test_float_spills() ->
+    {{{2942.0,4670.0,3198.0,4926.0,2206.0,4734.0},
+      {3118.0,2062.0,5174.0,3038.0,3618.0,3014.0},
+      {2542.0,2062.0,4934.0,2590.0,3098.0,3062.0},
+      {2950.0,3666.0,2574.0,5038.0,1866.0,2946.0},
+      {3126.0,3050.0,3054.0,5070.0,2258.0,2714.0},
+      {4734.0,2206.0,4926.0,3198.0,4670.0,2942.0}},
+     58937.0} =
+	mat66_flip_sum(35.0,86.0,32.0,88.0,33.0,57.0,
+		       22.0,77.0,91.0,80.0,14.0,33.0,
+		       51.0,28.0,87.0,20.0,91.0,11.0,
+		       68.0,83.0,64.0,82.0,10.0,86.0,
+		       74.0,18.0,08.0,52.0,10.0,14.0,
+		       89.0,34.0,64.0,66.0,58.0,55.0,
+		       0.0, 5),
+    ok.
+
+mat66_flip_sum(M11, M12, M13, M14, M15, M16,
+	       M21, M22, M23, M24, M25, M26,
+	       M31, M32, M33, M34, M35, M36,
+	       M41, M42, M43, M44, M45, M46,
+	       M51, M52, M53, M54, M55, M56,
+	       M61, M62, M63, M64, M65, M66,
+	       Acc, Ctr)
+  when is_float(M11), is_float(M12), is_float(M13),
+       is_float(M14), is_float(M15), is_float(M16),
+       is_float(M21), is_float(M22), is_float(M23),
+       is_float(M24), is_float(M25), is_float(M26),
+       is_float(M31), is_float(M32), is_float(M33),
+       is_float(M34), is_float(M35), is_float(M36),
+       is_float(M41), is_float(M42), is_float(M43),
+       is_float(M44), is_float(M45), is_float(M46),
+       is_float(M51), is_float(M52), is_float(M53),
+       is_float(M54), is_float(M55), is_float(M56),
+       is_float(M61), is_float(M62), is_float(M63),
+       is_float(M64), is_float(M65), is_float(M66),
+       is_float(Acc) ->
+    R11 = M66+M11, R12 = M65+M12, R13 = M64+M13,
+    R14 = M63+M14, R15 = M62+M15, R16 = M61+M16,
+    R21 = M56+M21, R22 = M55+M22, R23 = M54+M23,
+    R24 = M53+M24, R25 = M52+M25, R26 = M51+M26,
+    R31 = M46+M31, R32 = M45+M32, R33 = M44+M33,
+    R34 = M43+M34, R35 = M42+M35, R36 = M41+M36,
+    R41 = M26+M41, R42 = M25+M42, R43 = M24+M43,
+    R44 = M23+M44, R45 = M22+M45, R46 = M21+M46,
+    R51 = M36+M51, R52 = M35+M52, R53 = M34+M53,
+    R54 = M33+M54, R55 = M32+M55, R56 = M31+M56,
+    R61 = M16+M61, R62 = M15+M62, R63 = M14+M63,
+    R64 = M13+M64, R65 = M12+M65, R66 = M11+M66,
+    case Ctr of
+	0 ->
+	    {{{R11, R12, R13, R14, R15, R16},
+	      {R21, R22, R23, R24, R25, R26},
+	      {R31, R32, R33, R34, R35, R36},
+	      {R41, R42, R43, R44, R45, R46},
+	      {R51, R52, R53, R54, R55, R56},
+	      {R61, R62, R63, R64, R65, R66}},
+	     Acc};
+	_ ->
+	    NewAcc = 0.0 + M11 + M12 + M13 + M14 + M15 + M16
+		+ M21 + M22 + M23 + M24 + M25 + M26
+		+ M31 + M32 + M33 + M34 + M35 + M36
+		+ M41 + M42 + M43 + M44 + M45 + M46
+		+ M51 + M52 + M53 + M54 + M55 + M56
+		+ M61 + M62 + M63 + M64 + M65 + M66
+		+ Acc,
+	    mat66_flip_sum(R11+1.0, R12+1.0, R13+1.0, R14+1.0, R15+1.0, R16+1.0,
+			   R21+1.0, R22+1.0, R23+1.0, R24+1.0, R25+1.0, R26+1.0,
+			   R31+1.0, R32+1.0, R33+1.0, R34+1.0, R35+1.0, R36+1.0,
+			   R41+1.0, R42+1.0, R43+1.0, R44+1.0, R45+1.0, R46+1.0,
+			   R51+1.0, R52+1.0, R53+1.0, R54+1.0, R55+1.0, R56+1.0,
+			   R61+1.0, R62+1.0, R63+1.0, R64+1.0, R65+1.0, R66+1.0,
+			   NewAcc, Ctr-1)
+    end.
+
+%% Infinite loops must receive reduction tests, and might trip up basic block
+%% weighting, leading to infinite weights and/or divisions by zero.
+
+test_infinite_loops() ->
+  OldTrapExit = process_flag(trap_exit, true),
+  ok = test_infinite_loop(fun infinite_recursion/0),
+  ok = test_infinite_loop(fun infinite_corecursion/0),
+  RecursiveFun = fun RecursiveFun() -> RecursiveFun() end,
+  ok = test_infinite_loop(RecursiveFun),
+  CorecursiveFunA = fun CorecursiveFunA() ->
+		       CorecursiveFunA1 = fun () -> CorecursiveFunA() end,
+		       CorecursiveFunA1()
+		   end,
+  ok = test_infinite_loop(CorecursiveFunA),
+  CorecursiveFunB1 = fun(CorecursiveFunB) -> CorecursiveFunB() end,
+  CorecursiveFunB = fun CorecursiveFunB() ->
+		       CorecursiveFunB1(CorecursiveFunB)
+		   end,
+  ok = test_infinite_loop(CorecursiveFunB),
+  CorecursiveFunC1 = fun CorecursiveFunC1(Other) ->
+			 Other(CorecursiveFunC1)
+		     end,
+  CorecursiveFunC = fun CorecursiveFunC(Other) ->
+		       Other(CorecursiveFunC)
+		   end,
+  ok = test_infinite_loop(fun() -> CorecursiveFunC(CorecursiveFunC1) end),
+  ok = test_infinite_loop(fun() -> CorecursiveFunC(CorecursiveFunC) end),
+  true = process_flag(trap_exit, OldTrapExit),
+  ok.
+
+-define(INFINITE_LOOP_TIMEOUT, 100).
+test_infinite_loop(Fun) ->
+  Tester = spawn_link(Fun),
+  kill_soon(Tester),
+  receive {'EXIT', Tester, awake} ->
+      undefined = process_info(Tester),
+      ok
+  after ?INFINITE_LOOP_TIMEOUT -> error(timeout)
+  end.
+
+infinite_recursion() -> infinite_recursion().
+
+infinite_corecursion() -> infinite_corecursion_1().
+infinite_corecursion_1() -> infinite_corecursion().
+
+kill_soon(Pid) ->
+  _ = spawn_link(fun() ->
+		     timer:sleep(1),
+		     erlang:exit(Pid, awake)
+		 end),
+  ok.
-- 
cgit v1.2.3


From dbe626aa7beb0f04403f6782443f3a78d0f1fdb0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Thu, 16 Mar 2017 14:55:23 +0100
Subject: hipe: Add basic range splitting ra callbacks

In addition to the temporary name rewriting that hipe_regalloc_prepass
does, range splitters also need to be able to insert move instructions,
as well as inserting new basic blocks in the control flow graph. The
following four callbacks are added for that purpose:

 * Target:mk_move(Src, Dst, Context)
   Returns a move instruction from the temporary (not just register
   number) Src to Dst.
 * Target:mk_goto(Label, Context)
   Returns an unconditional control flow instruction that branches to the
   label with name Label.
 * Target:redirect_jmp(Instr, ToOld, ToNew, Context)
   Modifies the control flow instruction Instr so that any control flow
   that would go to a label with name ToOld instead goes to the label
   with name ToNew.
 * Target:new_label(Context)
   Returns a fresh label name that does not belong to any existing block
   in the current function, and is to be used to create a new basic
   block in the control flow graph by calling Target:update_bb/4 with
   this new name.
---
 lib/hipe/arm/hipe_arm_subst.erl                | 22 ++++++++++++++++++-
 lib/hipe/ppc/hipe_ppc_cfg.erl                  | 16 ++++++++------
 lib/hipe/regalloc/hipe_amd64_specific_sse2.erl | 29 ++++++++++++++++++++++++--
 lib/hipe/regalloc/hipe_arm_specific.erl        | 29 ++++++++++++++++++++++++--
 lib/hipe/regalloc/hipe_ppc_specific.erl        | 20 ++++++++++++++++--
 lib/hipe/regalloc/hipe_ppc_specific_fp.erl     | 20 ++++++++++++++++--
 lib/hipe/regalloc/hipe_sparc_specific.erl      | 20 ++++++++++++++++--
 lib/hipe/regalloc/hipe_sparc_specific_fp.erl   | 20 ++++++++++++++++--
 lib/hipe/regalloc/hipe_x86_specific.erl        | 29 ++++++++++++++++++++++++--
 lib/hipe/sparc/hipe_sparc_cfg.erl              | 24 +++++++++++++--------
 lib/hipe/x86/hipe_x86_subst.erl                | 22 ++++++++++++++++++-
 11 files changed, 220 insertions(+), 31 deletions(-)

diff --git a/lib/hipe/arm/hipe_arm_subst.erl b/lib/hipe/arm/hipe_arm_subst.erl
index 7510c197bd..a41a907a4c 100644
--- a/lib/hipe/arm/hipe_arm_subst.erl
+++ b/lib/hipe/arm/hipe_arm_subst.erl
@@ -13,7 +13,7 @@
 %% limitations under the License.
 
 -module(hipe_arm_subst).
--export([insn_temps/2]).
+-export([insn_temps/2, insn_lbls/2]).
 -include("hipe_arm.hrl").
 
 %% These should be moved to hipe_arm and exported
@@ -31,6 +31,7 @@
 -type am3()     :: #am3{}.
 -type arg()     :: temp() | integer().
 -type funv()    :: #arm_mfa{} | #arm_prim{} | temp().
+-type label()   :: non_neg_integer().
 -type insn()    :: tuple(). % for now
 
 -type subst_fun() :: fun((temp()) -> temp()).
@@ -103,3 +104,22 @@ funv_temps(SubstTemp,  T=#arm_temp{}) -> SubstTemp(T).
 -spec arg_temps(subst_fun(), arg()) -> arg().
 arg_temps(_SubstTemp, Imm) when is_integer(Imm) -> Imm;
 arg_temps(SubstTemp,  T=#arm_temp{}) -> SubstTemp(T).
+
+-type lbl_subst_fun() :: fun((label()) -> label()).
+
+%% @doc Maps over the branch targets in an instruction
+-spec insn_lbls(lbl_subst_fun(), insn()) -> insn().
+insn_lbls(SubstLbl, I) ->
+  case I of
+    #b_label{label=Label} ->
+      I#b_label{label=SubstLbl(Label)};
+    #pseudo_bc{true_label=T, false_label=F} ->
+      I#pseudo_bc{true_label=SubstLbl(T), false_label=SubstLbl(F)};
+    #pseudo_call{sdesc=Sdesc, contlab=Contlab} ->
+      I#pseudo_call{sdesc=sdesc_lbls(SubstLbl, Sdesc),
+		    contlab=SubstLbl(Contlab)}
+  end.
+
+sdesc_lbls(_SubstLbl, Sdesc=#arm_sdesc{exnlab=[]}) -> Sdesc;
+sdesc_lbls(SubstLbl, Sdesc=#arm_sdesc{exnlab=Exnlab}) ->
+  Sdesc#arm_sdesc{exnlab=SubstLbl(Exnlab)}.
diff --git a/lib/hipe/ppc/hipe_ppc_cfg.erl b/lib/hipe/ppc/hipe_ppc_cfg.erl
index f17c0ac503..58a4d5c8c3 100644
--- a/lib/hipe/ppc/hipe_ppc_cfg.erl
+++ b/lib/hipe/ppc/hipe_ppc_cfg.erl
@@ -21,8 +21,7 @@
          bb/2, bb_add/3]).
 -export([postorder/1]).
 -export([linearise/1, params/1, reverse_postorder/1]).
--export([arity/1]).
-%%%-export([redirect_jmp/3, arity/1]).
+-export([redirect_jmp/3, arity/1]).
 
 %%% these tell cfg.inc what to define (ugly as hell)
 -define(BREADTH_ORDER,true).
@@ -79,7 +78,6 @@ branch_successors(Branch) ->
 fails_to(_Instr) -> [].
 -endif.
 
--ifdef(notdef).
 redirect_jmp(I, Old, New) ->
   case I of
     #b_label{label=Label} ->
@@ -93,10 +91,16 @@ redirect_jmp(I, Old, New) ->
       if Old =:= FalseLab -> I1#pseudo_bc{false_label=New};
 	 true -> I1
       end;
-    %% handle pseudo_call too?
-    _ -> I
+    #pseudo_call{sdesc=SDesc0, contlab=ContLab0} ->
+      SDesc = case SDesc0 of
+		#ppc_sdesc{exnlab=Old} -> SDesc0#ppc_sdesc{exnlab=New};
+		#ppc_sdesc{exnlab=_}   -> SDesc0
+	      end,
+      ContLab = if Old =:= ContLab0 -> New;
+		   true -> ContLab0
+		end,
+      I#pseudo_call{sdesc=SDesc, contlab=ContLab}
   end.
--endif.
 
 mk_goto(Label) ->
   hipe_ppc:mk_b_label(Label).
diff --git a/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl b/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
index 9682d37520..341233e851 100644
--- a/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
+++ b/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
@@ -50,8 +50,12 @@
 -export([check_and_rewrite/3,
 	 check_and_rewrite/4]).
 
-%% callbacks for hipe_regalloc_prepass
--export([new_reg_nr/1,
+%% callbacks for hipe_regalloc_prepass, hipe_range_split
+-export([mk_move/3,
+	 mk_goto/2,
+	 redirect_jmp/4,
+	 new_label/1,
+	 new_reg_nr/1,
 	 update_reg_nr/3,
 	 update_bb/4,
 	 subst_temps/3]).
@@ -188,6 +192,27 @@ is_move(Instruction, _) ->
 reg_nr(Reg, _) ->
   hipe_x86:temp_reg(Reg).
 
+mk_move(Src, Dst, _) ->
+  hipe_x86:mk_fmove(Src, Dst).
+
+mk_goto(Label, _) ->
+  hipe_x86:mk_jmp_label(Label).
+
+redirect_jmp(Jmp, ToOld, ToNew, _) when is_integer(ToOld), is_integer(ToNew) ->
+  Ref = make_ref(),
+  put(Ref, false),
+  I = hipe_x86_subst:insn_lbls(
+	fun(Tgt) ->
+	    if Tgt =:= ToOld -> put(Ref, true), ToNew;
+	       is_integer(Tgt) -> Tgt
+	    end
+	end, Jmp),
+  true = erase(Ref), % Assert that something was rewritten
+  I.
+
+new_label(_) ->
+  hipe_gensym:get_next_label(x86).
+
 new_reg_nr(_) ->
   hipe_gensym:get_next_var(x86).
 
diff --git a/lib/hipe/regalloc/hipe_arm_specific.erl b/lib/hipe/regalloc/hipe_arm_specific.erl
index cef22e5af9..224837a34a 100644
--- a/lib/hipe/regalloc/hipe_arm_specific.erl
+++ b/lib/hipe/regalloc/hipe_arm_specific.erl
@@ -46,8 +46,12 @@
 %% callbacks for hipe_regalloc_loop
 -export([check_and_rewrite/3]).
 
-%% callbacks for hipe_regalloc_prepass
--export([new_reg_nr/1,
+%% callbacks for hipe_regalloc_prepass, hipe_range_split
+-export([mk_move/3,
+	 mk_goto/2,
+	 redirect_jmp/4,
+	 new_label/1,
+	 new_reg_nr/1,
 	 update_reg_nr/3,
 	 update_bb/4,
 	 subst_temps/3]).
@@ -147,6 +151,27 @@ is_move(Instruction, _) ->
 reg_nr(Reg, _) ->
   hipe_arm:temp_reg(Reg).
 
+mk_move(Src, Dst, _) ->
+  hipe_arm:mk_pseudo_move(Dst, Src).
+
+mk_goto(Label, _) ->
+  hipe_arm:mk_b_label(Label).
+
+redirect_jmp(Jmp, ToOld, ToNew, _) when is_integer(ToOld), is_integer(ToNew) ->
+  Ref = make_ref(),
+  put(Ref, false),
+  I = hipe_arm_subst:insn_lbls(
+	fun(Tgt) ->
+	    if Tgt =:= ToOld -> put(Ref, true), ToNew;
+	       is_integer(Tgt) -> Tgt
+	    end
+	end, Jmp),
+  true = erase(Ref), % Assert that something was rewritten
+  I.
+
+new_label(_) ->
+  hipe_gensym:get_next_label(arm).
+
 new_reg_nr(_) ->
   hipe_gensym:get_next_var(arm).
 
diff --git a/lib/hipe/regalloc/hipe_ppc_specific.erl b/lib/hipe/regalloc/hipe_ppc_specific.erl
index a6450b4d96..bc4bd1972e 100644
--- a/lib/hipe/regalloc/hipe_ppc_specific.erl
+++ b/lib/hipe/regalloc/hipe_ppc_specific.erl
@@ -46,8 +46,12 @@
 %% callbacks for hipe_regalloc_loop
 -export([check_and_rewrite/3]).
 
-%% callbacks for hipe_regalloc_prepass
--export([new_reg_nr/1,
+%% callbacks for hipe_regalloc_prepass, hipe_range_split
+-export([mk_move/3,
+	 mk_goto/2,
+	 redirect_jmp/4,
+	 new_label/1,
+	 new_reg_nr/1,
 	 update_reg_nr/3,
 	 update_bb/4,
 	 subst_temps/3]).
@@ -147,6 +151,18 @@ is_move(Instruction, _) ->
 reg_nr(Reg, _) ->
   hipe_ppc:temp_reg(Reg).
 
+mk_move(Src, Dst, _) ->
+  hipe_ppc:mk_pseudo_move(Dst, Src).
+
+mk_goto(Label, _) ->
+  hipe_ppc:mk_b_label(Label).
+
+redirect_jmp(Jmp, ToOld, ToNew, _) when is_integer(ToOld), is_integer(ToNew) ->
+  hipe_ppc_cfg:redirect_jmp(Jmp, ToOld, ToNew).
+
+new_label(_) ->
+  hipe_gensym:get_next_label(ppc).
+
 new_reg_nr(_) ->
   hipe_gensym:get_next_var(ppc).
 
diff --git a/lib/hipe/regalloc/hipe_ppc_specific_fp.erl b/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
index 23cb6c0318..3aab14f061 100644
--- a/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
+++ b/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
@@ -46,8 +46,12 @@
 %% callbacks for hipe_regalloc_loop
 -export([check_and_rewrite/3]).
 
-%% callbacks for hipe_regalloc_prepass
--export([new_reg_nr/1,
+%% callbacks for hipe_regalloc_prepass, hipe_range_split
+-export([mk_move/3,
+	 mk_goto/2,
+	 redirect_jmp/4,
+	 new_label/1,
+	 new_reg_nr/1,
 	 update_reg_nr/3,
 	 update_bb/4,
 	 subst_temps/3]).
@@ -128,6 +132,18 @@ is_move(I, _) ->
 reg_nr(Reg, _) ->
   hipe_ppc:temp_reg(Reg).
 
+mk_move(Src, Dst, _) ->
+  hipe_ppc:mk_pseudo_fmove(Dst, Src).
+
+mk_goto(Label, _) ->
+  hipe_ppc:mk_b_label(Label).
+
+redirect_jmp(Jmp, ToOld, ToNew, _) when is_integer(ToOld), is_integer(ToNew) ->
+  hipe_ppc_cfg:redirect_jmp(Jmp, ToOld, ToNew).
+
+new_label(_) ->
+  hipe_gensym:get_next_label(ppc).
+
 new_reg_nr(_) ->
   hipe_gensym:get_next_var(ppc).
 
diff --git a/lib/hipe/regalloc/hipe_sparc_specific.erl b/lib/hipe/regalloc/hipe_sparc_specific.erl
index 31fca81316..12642709b9 100644
--- a/lib/hipe/regalloc/hipe_sparc_specific.erl
+++ b/lib/hipe/regalloc/hipe_sparc_specific.erl
@@ -46,8 +46,12 @@
 %% callbacks for hipe_regalloc_loop
 -export([check_and_rewrite/3]).
 
-%% callbacks for hipe_regalloc_prepass
--export([new_reg_nr/1,
+%% callbacks for hipe_regalloc_prepass, hipe_range_split
+-export([mk_move/3,
+	 mk_goto/2,
+	 redirect_jmp/4,
+	 new_label/1,
+	 new_reg_nr/1,
 	 update_reg_nr/3,
 	 update_bb/4,
 	 subst_temps/3]).
@@ -147,6 +151,18 @@ is_move(Instruction, _) ->
 reg_nr(Reg, _) ->
   hipe_sparc:temp_reg(Reg).
 
+mk_move(Src, Dst, _) ->
+  hipe_sparc:mk_pseudo_move(Src, Dst).
+
+mk_goto(Label, _) ->
+  hipe_sparc:mk_b_label(Label).
+
+redirect_jmp(Jmp, ToOld, ToNew, _) when is_integer(ToOld), is_integer(ToNew) ->
+  hipe_sparc_cfg:redirect_jmp(Jmp, ToOld, ToNew).
+
+new_label(_) ->
+  hipe_gensym:get_next_label(sparc).
+
 new_reg_nr(_) ->
   hipe_gensym:get_next_var(sparc).
 
diff --git a/lib/hipe/regalloc/hipe_sparc_specific_fp.erl b/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
index 050d65e1a9..ba4849d2d5 100644
--- a/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
+++ b/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
@@ -46,8 +46,12 @@
 %% callbacks for hipe_regalloc_loop
 -export([check_and_rewrite/3]).
 
-%% callbacks for hipe_regalloc_prepass
--export([new_reg_nr/1,
+%% callbacks for hipe_regalloc_prepass, hipe_range_split
+-export([mk_move/3,
+	 mk_goto/2,
+	 redirect_jmp/4,
+	 new_label/1,
+	 new_reg_nr/1,
 	 update_reg_nr/3,
 	 update_bb/4,
 	 subst_temps/3]).
@@ -128,6 +132,18 @@ is_move(I, _) ->
 reg_nr(Reg, _) ->
   hipe_sparc:temp_reg(Reg).
 
+mk_move(Src, Dst, _) ->
+  hipe_sparc:mk_pseudo_fmove(Src, Dst).
+
+mk_goto(Label, _) ->
+  hipe_sparc:mk_b_label(Label).
+
+redirect_jmp(Jmp, ToOld, ToNew, _) when is_integer(ToOld), is_integer(ToNew) ->
+  hipe_sparc_cfg:redirect_jmp(Jmp, ToOld, ToNew).
+
+new_label(_) ->
+  hipe_gensym:get_next_label(sparc).
+
 new_reg_nr(_) ->
   hipe_gensym:get_next_var(sparc).
 
diff --git a/lib/hipe/regalloc/hipe_x86_specific.erl b/lib/hipe/regalloc/hipe_x86_specific.erl
index c1c8dbbcd6..0d9d7b751e 100644
--- a/lib/hipe/regalloc/hipe_x86_specific.erl
+++ b/lib/hipe/regalloc/hipe_x86_specific.erl
@@ -63,8 +63,12 @@
 %% callbacks for hipe_regalloc_loop
 -export([check_and_rewrite/3]).
 
-%% callbacks for hipe_regalloc_prepass
--export([new_reg_nr/1,
+%% callbacks for hipe_regalloc_prepass, hipe_range_split
+-export([mk_move/3,
+	 mk_goto/2,
+	 redirect_jmp/4,
+	 new_label/1,
+	 new_reg_nr/1,
 	 update_reg_nr/3,
 	 update_bb/4,
 	 subst_temps/3]).
@@ -203,6 +207,27 @@ is_move(Instruction,_) ->
 reg_nr(Reg,_) ->
   hipe_x86:temp_reg(Reg).
 
+mk_move(Src, Dst, _) ->
+  hipe_x86:mk_move(Src, Dst).
+
+mk_goto(Label, _) ->
+  hipe_x86:mk_jmp_label(Label).
+
+redirect_jmp(Jmp, ToOld, ToNew, _) when is_integer(ToOld), is_integer(ToNew) ->
+  Ref = make_ref(),
+  put(Ref, false),
+  I = hipe_x86_subst:insn_lbls(
+	fun(Tgt) ->
+	    if Tgt =:= ToOld -> put(Ref, true), ToNew;
+	       is_integer(Tgt) -> Tgt
+	    end
+	end, Jmp),
+  true = erase(Ref), % Assert that something was rewritten
+  I.
+
+new_label(_) ->
+  hipe_gensym:get_next_label(x86).
+
 new_reg_nr(_) ->
   hipe_gensym:get_next_var(x86).
 
diff --git a/lib/hipe/sparc/hipe_sparc_cfg.erl b/lib/hipe/sparc/hipe_sparc_cfg.erl
index 27374d187b..b31c90a03a 100644
--- a/lib/hipe/sparc/hipe_sparc_cfg.erl
+++ b/lib/hipe/sparc/hipe_sparc_cfg.erl
@@ -23,6 +23,7 @@
 -export([linearise/1]).
 -export([params/1]).
 -export([arity/1]). % for linear scan
+-export([redirect_jmp/3]).
 
 -define(SPARC_CFG, true).     % needed for cfg.inc
 
@@ -81,24 +82,29 @@ branch_successors(Branch) ->
 fails_to(_Instr) -> [].
 -endif.
 
--ifdef(notdef).
 redirect_jmp(I, Old, New) ->
   case I of
-    #b_label{label=Label} ->
-      if Old =:= Label -> I#b_label{label=New};
+    #bp{'cond'='a',label=Label} ->
+      if Old =:= Label -> I#bp{label=New};
 	 true -> I
       end;
-    #pseudo_bc{true_label=TrueLab, false_label=FalseLab} ->
-      I1 = if Old =:= TrueLab -> I#pseudo_bc{true_label=New};
+    #pseudo_bp{true_label=TrueLab, false_label=FalseLab} ->
+      I1 = if Old =:= TrueLab -> I#pseudo_bp{true_label=New};
 	      true -> I
 	   end,
-      if Old =:= FalseLab -> I1#pseudo_bc{false_label=New};
+      if Old =:= FalseLab -> I1#pseudo_bp{false_label=New};
 	 true -> I1
       end;
-    %% handle pseudo_call too?
-    _ -> I
+    #pseudo_call{contlab=ContLab0, sdesc=SDesc0} ->
+      SDesc = case SDesc0 of
+		#sparc_sdesc{exnlab=Old} -> SDesc0#sparc_sdesc{exnlab=New};
+		#sparc_sdesc{exnlab=_}   -> SDesc0
+	      end,
+      ContLab = if Old =:= ContLab0 -> New;
+		   true -> ContLab0
+		end,
+      I#pseudo_call{sdesc=SDesc, contlab=ContLab}
   end.
--endif.
 
 mk_goto(Label) ->
   hipe_sparc:mk_b_label(Label).
diff --git a/lib/hipe/x86/hipe_x86_subst.erl b/lib/hipe/x86/hipe_x86_subst.erl
index 102207ad7e..adc2e9dc65 100644
--- a/lib/hipe/x86/hipe_x86_subst.erl
+++ b/lib/hipe/x86/hipe_x86_subst.erl
@@ -19,7 +19,7 @@
 -endif.
 
 -module(?HIPE_X86_SUBST).
--export([insn_temps/2]).
+-export([insn_temps/2, insn_lbls/2]).
 -include("../x86/hipe_x86.hrl").
 
 %% These should be moved to hipe_x86 and exported
@@ -28,6 +28,7 @@
 -type mfarec() :: #x86_mfa{}.
 -type prim()   :: #x86_prim{}.
 -type funv()   :: mfarec() | prim() | temp().
+-type label()  :: non_neg_integer().
 -type insn()   :: tuple(). % for now
 
 -type subst_fun() :: fun((temp()) -> temp()).
@@ -86,3 +87,22 @@ jtab_temps(SubstTemp, T=#x86_temp{}) -> SubstTemp(T).
 -else.
 jtab_temps(_SubstTemp, DataLbl) when is_integer(DataLbl) -> DataLbl.
 -endif.
+
+-type lbl_subst_fun() :: fun((label()) -> label()).
+
+%% @doc Applies SubstLbl to every branch target label in an instruction.
+-spec insn_lbls(lbl_subst_fun(), insn()) -> insn().
+insn_lbls(SubstLbl, I) ->
+  case I of % no catch-all: any other instruction fails with case_clause
+    #jmp_label{label=Label} ->
+      I#jmp_label{label=SubstLbl(Label)};
+    #pseudo_call{sdesc=Sdesc, contlab=Contlab} ->
+      I#pseudo_call{sdesc=sdesc_lbls(SubstLbl, Sdesc), % rewrites exnlab unless it is []
+		    contlab=SubstLbl(Contlab)};
+    #pseudo_jcc{true_label=T, false_label=F} ->
+      I#pseudo_jcc{true_label=SubstLbl(T), false_label=SubstLbl(F)}
+  end.
+
+sdesc_lbls(_SubstLbl, Sdesc=#x86_sdesc{exnlab=[]}) -> Sdesc; % [] is not a label (presumably "no handler"): left as is
+sdesc_lbls(SubstLbl, Sdesc=#x86_sdesc{exnlab=Exnlab}) ->
+  Sdesc#x86_sdesc{exnlab=SubstLbl(Exnlab)}.
-- 
cgit v1.2.3


From e99f1d41bc8a7e035e35fd5aef6f3ea023d7f12e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Thu, 16 Mar 2017 16:38:22 +0100
Subject: hipe: Add range splitter restore_reuse

hipe_restore_reuse is a simplistic range splitter that splits temps that
are forced onto the stack by being live over call instructions. In
particular, it attempts to avoid cases where there are several accesses
to such stack allocated temps in straight-line code, uninterrupted by
any calls. In order to achieve this it splits temps between just before
the first access(es) and just after the last access(es) in such
straight-line code groups.

The hipe_restore_reuse pass is controlled by a new option
ra_restore_reuse.
ra_restore_reuse is added to o1.
---
 lib/hipe/main/hipe.app.src               |   1 +
 lib/hipe/main/hipe.erl                   |  10 +-
 lib/hipe/regalloc/Makefile               |   1 +
 lib/hipe/regalloc/hipe_regalloc_loop.erl |  15 +-
 lib/hipe/regalloc/hipe_restore_reuse.erl | 503 +++++++++++++++++++++++++++++++
 5 files changed, 527 insertions(+), 3 deletions(-)
 create mode 100644 lib/hipe/regalloc/hipe_restore_reuse.erl

diff --git a/lib/hipe/main/hipe.app.src b/lib/hipe/main/hipe.app.src
index 4c3f147d40..dad095da63 100644
--- a/lib/hipe/main/hipe.app.src
+++ b/lib/hipe/main/hipe.app.src
@@ -150,6 +150,7 @@
 	     hipe_reg_worklists,
 	     hipe_regalloc_loop,
 	     hipe_regalloc_prepass,
+	     hipe_restore_reuse,
 	     hipe_rtl,
 	     hipe_rtl_arch,
 	     hipe_rtl_arith_32,
diff --git a/lib/hipe/main/hipe.erl b/lib/hipe/main/hipe.erl
index fff397b060..f3e7c0879e 100644
--- a/lib/hipe/main/hipe.erl
+++ b/lib/hipe/main/hipe.erl
@@ -1230,6 +1230,11 @@ option_text(regalloc) ->
   "    optimistic - another variant of a coalescing allocator";
 option_text(remove_comments) ->
   "Strip comments from intermediate code";
+option_text(ra_restore_reuse) ->
+  "Split live ranges of temporaries such that straight-line\n"
+  "code will not need to contain multiple restores from the same stack\n"
+  "location.\n"
+  "Should only be used with move coalescing register allocators.";
 option_text(rtl_ssa) ->
   "Perform SSA conversion on the RTL level -- default starting at O2";
 option_text(rtl_ssa_const_prop) ->
@@ -1371,6 +1376,7 @@ opt_keys() ->
      pp_rtl_linear,
      ra_partitioned,
      ra_prespill,
+     ra_restore_reuse,
      regalloc,
      remove_comments,
      rtl_ssa,
@@ -1409,7 +1415,8 @@ o1_opts(TargetArch) ->
 	    icode_ssa_const_prop, icode_ssa_copy_prop, icode_inline_bifs,
 	    rtl_ssa, rtl_ssa_const_prop, rtl_ssapre,
 	    spillmin_color, use_indexing, remove_comments,
-	    binary_opt, {regalloc,coalescing} | o0_opts(TargetArch)],
+	    binary_opt, {regalloc,coalescing}, ra_restore_reuse
+	    | o0_opts(TargetArch)],
   case TargetArch of
     ultrasparc ->
       Common;
@@ -1477,6 +1484,7 @@ opt_negations() ->
    {no_pp_rtl_ssapre, pp_rtl_ssapre},
    {no_ra_partitioned, ra_partitioned},
    {no_ra_prespill, ra_prespill},
+   {no_ra_restore_reuse, ra_restore_reuse},
    {no_remove_comments, remove_comments},
    {no_rtl_ssa, rtl_ssa},
    {no_rtl_ssa_const_prop, rtl_ssa_const_prop},
diff --git a/lib/hipe/regalloc/Makefile b/lib/hipe/regalloc/Makefile
index 209f230a9b..73ac232f51 100644
--- a/lib/hipe/regalloc/Makefile
+++ b/lib/hipe/regalloc/Makefile
@@ -52,6 +52,7 @@ MODULES = hipe_ig hipe_ig_moves hipe_moves \
 	  hipe_graph_coloring_regalloc \
 	  hipe_regalloc_loop \
 	  hipe_regalloc_prepass \
+	  hipe_restore_reuse \
 	  hipe_ls_regalloc \
 	  hipe_ppc_specific hipe_ppc_specific_fp \
 	  hipe_sparc_specific hipe_sparc_specific_fp \
diff --git a/lib/hipe/regalloc/hipe_regalloc_loop.erl b/lib/hipe/regalloc/hipe_regalloc_loop.erl
index 5bbb0ba7c1..5d32073437 100644
--- a/lib/hipe/regalloc/hipe_regalloc_loop.erl
+++ b/lib/hipe/regalloc/hipe_regalloc_loop.erl
@@ -32,9 +32,11 @@ ra_fp(CFG, Liveness, Options, RegAllocMod, TargetMod, TargetCtx) ->
 ra_common(CFG0, Liveness0, SpillIndex, Options, RegAllocMod, TargetMod,
 	  TargetCtx) ->
   ?inc_counter(ra_calls_counter, 1),
-  SpillLimit0 = TargetMod:number_of_temporaries(CFG0, TargetCtx),
+  {CFG1, Liveness1} =
+    do_range_split(CFG0, Liveness0, TargetMod, TargetCtx, Options),
+  SpillLimit0 = TargetMod:number_of_temporaries(CFG1, TargetCtx),
   {Coloring, _, CFG, Liveness} =
-    call_allocator_initial(CFG0, Liveness0, SpillLimit0, SpillIndex, Options,
+    call_allocator_initial(CFG1, Liveness1, SpillLimit0, SpillIndex, Options,
 			   RegAllocMod, TargetMod, TargetCtx),
   %% The first iteration, the hipe_regalloc_prepass may create new temps, these
   %% should not end up above SpillLimit.
@@ -96,3 +98,12 @@ call_allocator(CFG, Liveness, SpillLimit, SpillIndex, Options, RegAllocMod,
       RegAllocMod:regalloc(CFG, Liveness, SpillIndex, SpillLimit, TargetMod,
 			   TargetCtx, Options)
   end.
+
+do_range_split(CFG0, Liveness, TgtMod, TgtCtx, Options) ->
+  case proplists:get_bool(ra_restore_reuse, Options) of % splitting only runs when the option is set
+    true ->
+      CFG1 = hipe_restore_reuse:split(CFG0, Liveness, TgtMod, TgtCtx),
+      {CFG1, TgtMod:analyze(CFG1, TgtCtx)}; % the analysis is redone on the rewritten CFG
+    false ->
+      {CFG0, Liveness} % CFG and liveness are passed through unchanged
+  end.
diff --git a/lib/hipe/regalloc/hipe_restore_reuse.erl b/lib/hipe/regalloc/hipe_restore_reuse.erl
new file mode 100644
index 0000000000..2d8dea2a2c
--- /dev/null
+++ b/lib/hipe/regalloc/hipe_restore_reuse.erl
@@ -0,0 +1,503 @@
+%% -*- erlang-indent-level: 2 -*-
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%@doc
+%%	       RESTORE REUSE LIVE RANGE SPLITTING PASS
+%%
+%% This is a simple live range splitter that tries to avoid sequences where a
+%% temporary is accessed on the stack multiple times, by keeping a copy of that
+%% temp around in a register.
+%%
+%% At any point where a temporary that is expected to be spilled (see uses of
+%% spills_add_list/2) is defined or used, this pass considers that temporary
+%% "available".
+%%
+%% Limitations:
+%%   * If a live range part starts with several different restores, this module
+%%     will introduce a new temp number for each of them, and later be forced to
+%%     generate phi blocks. It would be more efficient to introduce just a
+%%     single temp number. That would also remove the need for the phi blocks.
+%%   * If a live range part ends in a definition, that definition should just
+%%     define the base temp rather than the substitution, since some CISC
+%%     targets might be able to inline the memory access in the instruction.
+-module(hipe_restore_reuse).
+
+-export([split/4]).
+
+-compile(inline).
+
+%% -define(DO_ASSERT, 1).
+-include("../main/hipe.hrl").
+
+-type target_cfg()       :: any().
+-type liveness()         :: any().
+-type target_module()    :: module().
+-type target_context()   :: any().
+-type target()           :: {target_module(), target_context()}.
+-type label()            :: non_neg_integer().
+-type reg()              :: non_neg_integer().
+-type instr()            :: any().
+-type temp()             :: any().
+
+-spec split(target_cfg(), liveness(), target_module(), target_context())
+	   -> target_cfg().
+split(CFG, Liveness, TargetMod, TargetContext) ->
+  Target = {TargetMod, TargetContext}, % the pair the target interface wrappers dispatch on
+  Avail = analyse(CFG, Liveness, Target), % per-block dataflow results, keyed by label
+  rewrite(CFG, Target, Avail). % returns the CFG with the split temps substituted
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+-type avail() :: #{label() => avail_bb()}. % one avail_bb per basic block label
+
+-record(avail_bb, {
+	  %% Blocks where HasCall is true are considered to have too high
+	  %% register pressure to support a register copy of a temp
+	  has_call :: boolean(),
+	  %% AvailOut: Temps that can be split (are available)
+	  out      :: availset(),
+	  %% Gen: AvailOut generated locally
+	  gen      :: availset(),
+	  %% WantIn: Temps that are split
+	  want     :: regset(),
+	  %% Self: Temps with avail-want pairs locally
+	  self     :: regset(),
+	  %% DefIn: Temps shadowed by later def in same live range part
+	  defin    :: regset(),
+	  pred     :: [label()], % predecessor labels, from hipe_gen_cfg:pred/2
+	  succ     :: [label()]  % successor labels, from hipe_gen_cfg:succ/2
+	 }).
+-type avail_bb() :: #avail_bb{}.
+
+avail_get(L, Avail) -> maps:get(L, Avail). % maps:get/2 raises {badkey,L} for an unknown label
+avail_set(L, Val, Avail) -> maps:put(L, Val, Avail).
+avail_has_call(L, Avail) -> (avail_get(L, Avail))#avail_bb.has_call. % field accessors for block L follow
+avail_out(L, Avail) -> (avail_get(L, Avail))#avail_bb.out.
+avail_self(L, Avail) -> (avail_get(L, Avail))#avail_bb.self.
+avail_pred(L, Avail) -> (avail_get(L, Avail))#avail_bb.pred.
+avail_succ(L, Avail) -> (avail_get(L, Avail))#avail_bb.succ.
+
+avail_in(L, Avail) -> % AvailIn(L): intersection of AvailOut over all predecessors of L
+  case avail_pred(L, Avail) of
+    [] -> availset_empty(); % no predecessors (entry): nothing is available
+    Pred ->
+      lists:foldl(fun(P, ASet) ->
+		      availset_intersect(avail_out(P, Avail), ASet)
+		  end, availset_top(), Pred) % top is the identity element of availset_intersect/2
+  end.
+
+want_in(L, Avail) -> (avail_get(L, Avail))#avail_bb.want. % WantIn(L) is the stored 'want' field
+want_out(L, Avail) -> % WantOut(L): union of WantIn over all successors of L
+  lists:foldl(fun(S, Set) ->
+		  ordsets:union(want_in(S, Avail), Set)
+	      end, ordsets:new(), avail_succ(L, Avail)).
+
+def_in(L, Avail) -> (avail_get(L, Avail))#avail_bb.defin. % DefIn(L) is the stored 'defin' field
+def_out(L, Avail) -> % DefOut(L): intersection of DefIn over all successors of L
+  case avail_succ(L, Avail) of
+    [] -> ordsets:new(); % no successors (exit block): nothing is defined later
+    Succ ->
+      ordsets:intersection([def_in(S, Avail) || S <- Succ])
+  end.
+
+-type regset()  :: ordsets:ordset(reg()).
+-type availset() :: top | regset(). % 'top' behaves as the set of all temps in the functions below
+availset_empty() -> [].
+availset_top() -> top.
+availset_intersect(top, B) -> B; % top is neutral for intersection
+availset_intersect(A, top) -> A;
+availset_intersect(A, B) -> ordsets:intersection(A, B).
+availset_union(top, _) -> top; % top absorbs everything under union
+availset_union(_, top) -> top;
+availset_union(A, B) -> ordsets:union(A, B).
+ordset_intersect_availset(OS, top) -> OS; % first argument is always a plain ordset, so is the result
+ordset_intersect_availset(OS, AS) -> ordsets:intersection(OS, AS).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Analysis pass
+%%
+%% The analysis pass collects the set of temps we're interested in splitting
+%% (Spills), and computes three dataflow analyses for this subset of temps.
+%%
+%% Avail, which is the set of temps which are available in register from a
+%%   previous (potential) spill or restore without going through a HasCall
+%%   block.
+%% Want, which is a liveness analysis for the subset of temps used by an
+%%   instruction that are also in Avail at that point. In other words, Want is
+%%   the set of temps that are split (have a register copy) at a particular
+%%   point.
+%% Def, which are the temps that are already going to be spilled later, and so
+%%   need not be spilled when they're defined.
+%%
+%% Lastly, it computes the set Self for each block, which contains the temps
+%% that have avail-want pairs in the same block, and so should be split in that
+%% block even if they're not in WantIn for the block.
+
+-spec analyse(target_cfg(), liveness(), target()) -> avail().
+analyse(CFG, Liveness, Target) ->
+  Avail0 = analyse_init(CFG, Liveness, Target), % local (per-block) sets and initial out values
+  RPO = reverse_postorder(CFG, Target),
+  AvailLs = [L || L <- RPO, not avail_has_call(L, Avail0)], % has_call blocks keep the out set from analyse_init
+  Avail1 = avail_dataf(AvailLs, Avail0), % forward problem (reads predecessors), iterated in RPO
+  Avail2 = analyse_filter_want(maps:keys(Avail1), Avail1), % restrict want/defin to what is available on entry
+  PO = lists:reverse(RPO),
+  want_dataf(PO, Avail2). % backward problem (reads successors), iterated in the reverse order
+
+-spec analyse_init(target_cfg(), liveness(), target()) -> avail().
+analyse_init(CFG, Liveness, Target) ->
+  analyse_init(labels(CFG, Target), CFG, Liveness, Target, #{}, []). % start with no spill candidates, no blocks
+
+-spec analyse_init([label()], target_cfg(), liveness(), target(), spillset(),
+		   [{label(), avail_bb()}])
+		  -> avail().
+analyse_init([], _CFG, _Liveness, Target, Spills0, Acc) ->
+  %% Precoloured temps can't be spilled
+  Spills = spills_filter(fun(R) -> not is_precoloured(R, Target) end, Spills0),
+  analyse_init_1(Acc, Spills, []); % second pass: restrict the collected sets to Spills
+analyse_init([L|Ls], CFG, Liveness, Target, Spills0, Acc) ->
+  {DefIn, Gen, Self, Want, HasCall0} =
+    analyse_scan(hipe_bb:code(bb(CFG, L, Target)), Target,
+		 ordsets:new(), ordsets:new(), ordsets:new(),
+		 ordsets:new()),
+  {Spills, Out, HasCall} =
+    case HasCall0 of
+      false -> {Spills0, availset_top(), false}; % out starts at top and is lowered by avail_dataf
+      {true, CallDefs} ->
+	Spill = ordsets:subtract(liveout(Liveness, L, Target), CallDefs), % live out of the block, not defined by its last instruction
+	{spills_add_list(Spill, Spills0), Gen, true} % out is fixed to Gen for has_call blocks
+    end,
+  Pred = hipe_gen_cfg:pred(CFG, L),
+  Succ = hipe_gen_cfg:succ(CFG, L),
+  Val = #avail_bb{gen=Gen, want=Want, self=Self, out=Out, has_call=HasCall,
+		  pred=Pred, succ=Succ, defin=DefIn},
+  analyse_init(Ls, CFG, Liveness, Target, Spills, [{L, Val} | Acc]).
+
+-spec analyse_init_1([{label(), avail_bb()}], spillset(),
+		     [{label(), avail_bb()}])
+		    -> avail().
+analyse_init_1([], _Spills, Acc) -> maps:from_list(Acc); % the label-keyed map used by the rest of the pass
+analyse_init_1([{L, Val0}|Vs], Spills, Acc) ->
+  #avail_bb{out=Out,gen=Gen,want=Want,self=Self} = Val0,
+  Val = Val0#avail_bb{ % keep only spill candidates; defin is not filtered here
+	  out  = spills_filter_availset(Out,  Spills),
+	  gen  = spills_filter_availset(Gen,  Spills),
+	  want = spills_filter_availset(Want, Spills),
+	  self = spills_filter_availset(Self, Spills)},
+  analyse_init_1(Vs, Spills, [{L, Val} | Acc]).
+
+-type spillset() :: #{reg() => []}. % a map used as a set: only the keys matter, values are always []
+-spec spills_add_list([reg()], spillset()) -> spillset().
+spills_add_list([], Spills) -> Spills;
+spills_add_list([R|Rs], Spills) -> spills_add_list(Rs, Spills#{R => []}).
+
+-spec spills_filter_availset(availset(), spillset()) -> availset().
+spills_filter_availset([E|Es], Spills) -> % keeps the elements that are keys of Spills, in their original order
+  case Spills of
+    #{E := _} -> [E|spills_filter_availset(Es, Spills)];
+    #{} ->          spills_filter_availset(Es, Spills)
+  end;
+spills_filter_availset([], _) -> [];
+spills_filter_availset(top, _) -> top. % top is passed through unchanged
+
+spills_filter(Fun, Spills) -> maps:filter(fun(K, _) -> Fun(K) end, Spills). % Fun is applied to the keys only
+
+-spec analyse_scan([instr()], target(), Defset, Gen, Self, Want)
+		  -> {Defset, Gen, Self, Want, HasCall} when
+    HasCall :: false | {true, regset()},
+    Defset  :: regset(),
+    Gen     :: availset(),
+    Self    :: regset(),
+    Want    :: regset().
+analyse_scan([], _Target, Defs, Gen, Self, Want) ->
+  {Defs, Gen, Self, Want, false}; % reached the end without a defines_all_alloc instruction
+analyse_scan([I|Is], Target, Defs0, Gen0, Self0, Want0) ->
+  {DefL, UseL} = reg_def_use(I, Target),
+  Use = ordsets:from_list(UseL),
+  Def = ordsets:from_list(DefL),
+  Self = ordsets:union(ordsets:intersection(Use, Gen0), Self0), % used here and defined or used earlier in this block
+  Want = ordsets:union(ordsets:subtract(Use, Defs0), Want0), % used here without an earlier def in this block
+  Defs = ordsets:union(Def, Defs0),
+  case defines_all_alloc(I, Target) of
+    true ->
+      [] = Is, % assertion: such an instruction must be the last one in its block
+      {Defs, ordsets:new(), Self, Want, {true, Def}}; % Gen is reset to empty; Def is reported to the caller
+    false ->
+      Gen = ordsets:union(ordsets:union(Def, Use), Gen0), % every temp touched by I becomes available
+      analyse_scan(Is, Target, Defs, Gen, Self, Want)
+  end.
+
+-spec avail_dataf([label()], avail()) -> avail().
+avail_dataf(RPO, Avail0) -> % repeats full passes over RPO until nothing changes
+  case avail_dataf_once(RPO, Avail0, 0) of
+    {Avail, 0} -> Avail; % fixpoint: no block changed during this pass
+    {Avail, _Changed} ->
+      avail_dataf(RPO, Avail)
+  end.
+
+-spec avail_dataf_once([label()], avail(), non_neg_integer())
+		      -> {avail(), non_neg_integer()}.
+avail_dataf_once([], Avail, Changed) -> {Avail, Changed}; % Changed counts the blocks updated in this pass
+avail_dataf_once([L|Ls], Avail0, Changed0) ->
+  ABB = #avail_bb{out=OldOut, gen=Gen} = avail_get(L, Avail0),
+  In = avail_in(L, Avail0),
+  {Changed, Avail} =
+    case availset_union(In, Gen) of % transfer function: Out = In union Gen
+      OldOut -> {Changed0, Avail0}; % OldOut is already bound, so this matches only an unchanged result
+      Out -> {Changed0+1, avail_set(L, ABB#avail_bb{out=Out}, Avail0)}
+    end,
+  avail_dataf_once(Ls, Avail, Changed).
+
+-spec analyse_filter_want([label()], avail()) -> avail().
+analyse_filter_want([], Avail) -> Avail;
+analyse_filter_want([L|Ls], Avail0) ->
+  ABB = #avail_bb{want=Want0, defin=DefIn0} = avail_get(L, Avail0),
+  In = avail_in(L, Avail0), % only reads 'out' fields, which this function never modifies
+  Want = ordset_intersect_availset(Want0, In), % drop wanted temps that are not available on entry to L
+  DefIn = ordset_intersect_availset(DefIn0, In), % likewise for the locally defined temps
+  Avail = avail_set(L, ABB#avail_bb{want=Want, defin=DefIn}, Avail0),
+  analyse_filter_want(Ls, Avail).
+
+-spec want_dataf([label()], avail()) -> avail().
+want_dataf(PO, Avail0) -> % repeats full passes over PO until nothing changes
+  case want_dataf_once(PO, Avail0, 0) of
+    {Avail, 0} -> Avail; % fixpoint: no block changed during this pass
+    {Avail, _Changed} ->
+      want_dataf(PO, Avail)
+  end.
+
+-spec want_dataf_once([label()], avail(), non_neg_integer())
+		     -> {avail(), non_neg_integer()}.
+want_dataf_once([], Avail, Changed) -> {Avail, Changed}; % Changed counts the blocks updated in this pass
+want_dataf_once([L|Ls], Avail0, Changed0) ->
+  ABB0 = #avail_bb{want=OldIn,defin=OldDef} = avail_get(L, Avail0),
+  AvailIn = avail_in(L, Avail0),
+  Out = want_out(L, Avail0),
+  DefOut = def_out(L, Avail0),
+  {Changed, Avail} = % both sets only ever grow: the old value is always unioned back in
+    case {ordsets:union(ordset_intersect_availset(Out,    AvailIn), OldIn),
+	  ordsets:union(ordset_intersect_availset(DefOut, AvailIn), OldDef)}
+    of
+      {OldIn, OldDef} -> {Changed0, Avail0}; % both already bound: matches only when nothing changed
+      {In, DefIn} ->
+	ABB = ABB0#avail_bb{want=In,defin=DefIn},
+	{Changed0+1, avail_set(L, ABB, Avail0)}
+    end,
+  want_dataf_once(Ls, Avail, Changed).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Rewrite pass
+-type subst_dict() :: orddict:orddict(reg(), reg()). % NOTE(review): make_splits/6 stores update_reg_nr/3 results (temps) as values - confirm the value type
+-type input()      :: #{label() => subst_dict()}. % substitutions in effect on entry to each block
+
+-spec rewrite(target_cfg(), target(), avail()) -> target_cfg().
+rewrite(CFG, Target, Avail) ->
+  RPO = reverse_postorder(CFG, Target),
+  rewrite(RPO, Target, Avail, #{}, CFG). % #{}: no block has an input substitution yet
+
+-spec rewrite([label()], target(), avail(), input(), target_cfg())
+	     -> target_cfg().
+rewrite([], _Target, _Avail, _Input, CFG) -> CFG;
+rewrite([L|Ls], Target, Avail, Input0, CFG0) ->
+  SplitHere = split_in_block(L, Avail), % temps that get a new substitute inside this block
+  {Input1, LInput} =
+    case Input0 of
+      #{L := LInput0} -> {Input0, LInput0}; % recorded earlier by rewrite_succs/7
+      #{} -> {Input0#{L => []}, []} % entry block
+    end,
+  ?ASSERT([] =:= [X || X <- SplitHere, orddict:is_key(X, LInput)]),
+  ?ASSERT(want_in(L, Avail) =:= orddict:fetch_keys(LInput)),
+  {CFG1, LOutput} =
+    case {SplitHere, LInput} of
+      {[], []} -> % optimisation (rewrite will do nothing, so skip it)
+	{CFG0, LInput};
+      _ ->
+	Code0 = hipe_bb:code(BB=bb(CFG0, L, Target)),
+	DefOut = def_out(L, Avail),
+	{Code, LOutput0, _DefIn} =
+	  rewrite_instrs(Code0, Target, LInput, DefOut, SplitHere),
+	{update_bb(CFG0, L, hipe_bb:code_update(BB, Code), Target), LOutput0}
+    end,
+  {Input, CFG} = rewrite_succs(avail_succ(L, Avail), Target, L, LOutput, Avail,
+			       Input1, CFG1), % hands LOutput to the successors, adding phi blocks where needed
+  rewrite(Ls, Target, Avail, Input, CFG).
+
+-spec split_in_block(label(), avail()) -> ordsets:ordset(reg()).
+split_in_block(L, Avail) -> % (Self union WantOut) minus WantIn: split in or after L, but not yet on entry
+  ordsets:subtract(ordsets:union(avail_self(L, Avail), want_out(L, Avail)),
+		   want_in(L, Avail)).
+
+-spec rewrite_instrs([instr()], target(), subst_dict(), regset(), [reg()])
+		    ->  {[instr()], subst_dict(), regset()}.
+rewrite_instrs([], _Target, Output, DefOut, []) -> % the [] asserts that every temp to split was picked
+  {[], Output, DefOut};
+rewrite_instrs([I|Is], Target, Input0, BBDefOut, SplitHere0) ->
+  {TDef, TUse} = def_use(I, Target),
+  {Def, Use} = {reg_names(TDef, Target), reg_names(TUse, Target)},
+  %% Restores are generated in forward order by picking temps from SplitHere as
+  %% they're used or defined. After the last instruction, all temps have been
+  %% picked.
+  {ISplits, SplitHere} =
+    lists:partition(fun(R) ->
+			lists:member(R, Def) orelse lists:member(R, Use)
+		    end, SplitHere0),
+  {Input, Restores} =
+    case ISplits of
+      [] -> {Input0, []};
+      _ ->
+	make_splits(ISplits, Target, TDef, TUse, Input0, [])
+    end,
+  %% Here's the recursive call
+  {Acc0, Output, DefOut} =
+    rewrite_instrs(Is, Target, Input, BBDefOut, SplitHere),
+  %% From here we're processing instructions in reverse order, because to avoid
+  %% redundant spills we need to walk the 'def' dataflow, which is in reverse.
+  SubstFun = fun(Temp) -> % replaces a temp by its substitute, if it has one at this point
+		 case orddict:find(reg_nr(Temp, Target), Input) of
+		   {ok, NewTemp} -> NewTemp;
+		   error -> Temp
+		 end
+	     end,
+  Acc1 = insert_spills(TDef, Target, Input, DefOut, Acc0), % spill moves end up after I
+  Acc = Restores ++ [subst_temps(SubstFun, I, Target) | Acc1], % restore moves end up before I
+  DefIn = ordsets:union(DefOut, ordsets:from_list(Def)), % I's own defs shadow spills of earlier instructions
+  {Acc, Output, DefIn}.
+
+-spec make_splits([reg()], target(), [temp()], [temp()], subst_dict(),
+		  [instr()])
+		 -> {subst_dict(), [instr()]}.
+make_splits([], _Target, _TDef, _TUse, Input, Acc) ->
+  {Input, Acc};
+make_splits([S|Ss], Target, TDef, TUse, Input0, Acc0) ->
+  SubstReg = new_reg_nr(Target), % one fresh register number per split temp
+  {Acc, Subst} =
+    case find_reg_temp(S, TUse, Target) of
+      error -> % S is only defined by the instruction: no restore move is emitted
+	{ok, Temp} = find_reg_temp(S, TDef, Target),
+	{Acc0, update_reg_nr(SubstReg, Temp, Target)};
+      {ok, Temp} -> % S is used by the instruction: emit a move from S to its substitute
+	Subst0 = update_reg_nr(SubstReg, Temp, Target),
+	Acc1 = [mk_move(Temp, Subst0, Target) | Acc0],
+	{Acc1, Subst0}
+    end,
+  Input = orddict:store(S, Subst, Input0), % the value stored is the substitute temp itself
+  make_splits(Ss, Target, TDef, TUse, Input, Acc).
+
+-spec find_reg_temp(reg(), [temp()], target()) -> error | {ok, temp()}.
+find_reg_temp(_Reg, [], _Target) -> error; % no temp in the list has register number Reg
+find_reg_temp(Reg, [T|Ts], Target) ->
+  case reg_nr(T, Target) of
+    Reg -> {ok, T}; % Reg is already bound: the first temp with that number wins
+    _ -> find_reg_temp(Reg, Ts, Target)
+  end.
+
+-spec insert_spills([temp()], target(), subst_dict(), regset(), [instr()])
+		   -> [instr()].
+insert_spills([], _Target, _Input, _DefOut, Acc) -> Acc;
+insert_spills([T|Ts], Target, Input, DefOut, Acc0) ->
+  R = reg_nr(T, Target),
+  Acc =
+    case orddict:find(R, Input) of
+      error -> Acc0; % T has no substitute here, so there is nothing to copy back
+      {ok, Subst} ->
+	case lists:member(R, DefOut) of
+	  true -> Acc0; % R is in DefOut: a later def makes this spill redundant
+	  false -> [mk_move(Subst, T, Target) | Acc0] % copy the substitute back into T
+	end
+    end,
+  insert_spills(Ts, Target, Input, DefOut, Acc).
+
+-spec rewrite_succs([label()], target(), label(), subst_dict(), avail(),
+		    input(), target_cfg()) -> {input(), target_cfg()}.
+rewrite_succs([], _Target, _P, _POutput, _Avail, Input, CFG) -> {Input, CFG};
+rewrite_succs([L|Ls], Target, P, POutput, Avail, Input0, CFG0) ->
+  NewLInput = orddict_with_ordset(want_in(L, Avail), POutput), % the part of P's output that L wants
+  {Input, CFG} =
+    case Input0 of
+      #{L := LInput} -> % L's input is already fixed, so P has to adapt to it
+	CFG2 =
+	  case required_phi_moves(LInput, NewLInput) of
+	    [] -> CFG0;
+	    ReqMovs ->
+	      PhiLb = new_label(Target),
+	      Code = [mk_move(S,D,Target) || {S,D} <- ReqMovs]
+		++ [mk_goto(L, Target)],
+	      PhiBB = hipe_bb:mk_bb(Code),
+	      CFG1 = update_bb(CFG0, PhiLb, PhiBB, Target),
+	      bb_redirect_jmp(L, PhiLb, P, CFG1, Target) % P now branches to the phi block instead of L
+	  end,
+	{Input0, CFG2};
+      #{} -> % L has no input yet: P's output becomes L's input as is
+	{Input0#{L => NewLInput}, CFG0}
+    end,
+  rewrite_succs(Ls, Target, P, POutput, Avail, Input, CFG).
+
+-spec bb_redirect_jmp(label(), label(), label(), target_cfg(), target())
+		     -> target_cfg().
+bb_redirect_jmp(From, To, Lb, CFG, Target) -> % in block Lb, retarget the final instruction from From to To
+  BB0 = bb(CFG, Lb, Target),
+  Last = redirect_jmp(hipe_bb:last(BB0), From, To, Target),
+  BB = hipe_bb:code_update(BB0, hipe_bb:butlast(BB0) ++ [Last]), % only the last instruction is replaced
+  update_bb(CFG, Lb, BB, Target).
+
+-spec required_phi_moves(subst_dict(), subst_dict()) -> [{reg(), reg()}].
+required_phi_moves([], []) -> []; % both dicts must hold the same keys, otherwise function_clause
+required_phi_moves([P|Is], [P|Os]) -> required_phi_moves(Is, Os); % identical entry: no move needed
+required_phi_moves([{K, In}|Is], [{K, Out}|Os]) ->
+  [{Out, In}|required_phi_moves(Is, Os)]. % pair is {second dict's value, first dict's value}
+
+%% @doc Returns a new orddict with the keys in Set and their associated values.
+-spec orddict_with_ordset(ordsets:ordset(K), orddict:orddict(K, V))
+			 -> orddict:orddict(K, V).
+orddict_with_ordset([S|Ss], [{K, _}|_]=Dict) when S < K -> % S is not a key of Dict: it is skipped
+  orddict_with_ordset(Ss, Dict);
+orddict_with_ordset([S|_]=Set, [{K, _}|Ds]) when S > K -> % K is not in Set: the pair is dropped
+  orddict_with_ordset(Set, Ds);
+orddict_with_ordset([_S|Ss], [{_K, _}=P|Ds]) -> % _S == _K
+  [P|orddict_with_ordset(Ss, Ds)];
+orddict_with_ordset([], _) -> [];
+orddict_with_ordset(_, []) -> [].
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Target module interface functions
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+-define(TGT_IFACE_0(N), N(         {M,C}) -> M:N(         C)). % each macro expands to one forwarding clause
+-define(TGT_IFACE_1(N), N(A1,      {M,C}) -> M:N(A1,      C)). % the digit is the number of arguments before the target
+-define(TGT_IFACE_2(N), N(A1,A2,   {M,C}) -> M:N(A1,A2,   C)).
+-define(TGT_IFACE_3(N), N(A1,A2,A3,{M,C}) -> M:N(A1,A2,A3,C)).
+
+?TGT_IFACE_2(bb). % e.g. bb(A1, A2, {M,C}) -> M:bb(A1, A2, C).
+?TGT_IFACE_1(def_use).
+?TGT_IFACE_1(defines_all_alloc).
+?TGT_IFACE_1(is_precoloured).
+?TGT_IFACE_1(labels).
+?TGT_IFACE_1(mk_goto).
+?TGT_IFACE_2(mk_move).
+?TGT_IFACE_0(new_label).
+?TGT_IFACE_0(new_reg_nr).
+?TGT_IFACE_3(redirect_jmp).
+?TGT_IFACE_1(reg_nr).
+?TGT_IFACE_1(reverse_postorder).
+?TGT_IFACE_2(subst_temps).
+?TGT_IFACE_3(update_bb).
+?TGT_IFACE_2(update_reg_nr).
+
+liveout(Liveness, L, Target={TgtMod,TgtCtx}) -> % live-out of block L as an ordset of register numbers
+  ordsets:from_list(reg_names(TgtMod:liveout(Liveness, L, TgtCtx), Target)).
+
+reg_names(Regs, {TgtMod,TgtCtx}) -> % maps temps to register numbers; order and duplicates are kept
+  [TgtMod:reg_nr(X,TgtCtx) || X <- Regs].
+
+reg_def_use(I, Target) -> % like def_use/2, but returns register numbers instead of temps
+  {TDef, TUse} = def_use(I, Target),
+  {reg_names(TDef, Target), reg_names(TUse, Target)}.
-- 
cgit v1.2.3


From cc115ebc67a465233c7740efb42e0bc9584ad352 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Sat, 24 Sep 2016 09:37:46 +0200
Subject: hipe: Add branch prediction accessor ra callbacks

Adds a new register allocator callback
Target:branch_preds(Instr, Context) which, for a control flow
instruction Instr, returns a list of tuples {Target, Probability} for
each label name Target that Instr may branch to. Probability is a float
between 0.0 and 1.0 and corresponds to the predicted probability that
control flow branches to the corresponding target. The probabilities may
sum to at most 1.0 (rounding errors aside). Note that a sum less than
1.0 is valid.
---
 lib/hipe/arm/hipe_arm_cfg.erl                  | 21 +++++++++++++++++++++
 lib/hipe/ppc/hipe_ppc_cfg.erl                  | 21 +++++++++++++++++++++
 lib/hipe/regalloc/hipe_amd64_specific_sse2.erl |  6 ++++++
 lib/hipe/regalloc/hipe_arm_specific.erl        |  6 ++++++
 lib/hipe/regalloc/hipe_ppc_specific.erl        |  6 ++++++
 lib/hipe/regalloc/hipe_ppc_specific_fp.erl     |  6 ++++++
 lib/hipe/regalloc/hipe_sparc_specific.erl      |  6 ++++++
 lib/hipe/regalloc/hipe_sparc_specific_fp.erl   |  6 ++++++
 lib/hipe/regalloc/hipe_x86_specific.erl        |  6 ++++++
 lib/hipe/sparc/hipe_sparc_cfg.erl              | 22 +++++++++++++++++++++-
 lib/hipe/x86/hipe_x86_cfg.erl                  | 22 +++++++++++++++++++++-
 11 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/lib/hipe/arm/hipe_arm_cfg.erl b/lib/hipe/arm/hipe_arm_cfg.erl
index ea6da67317..0bc3df30b9 100644
--- a/lib/hipe/arm/hipe_arm_cfg.erl
+++ b/lib/hipe/arm/hipe_arm_cfg.erl
@@ -24,6 +24,7 @@
 -export([params/1, reverse_postorder/1]).
 -export([arity/1]). % for linear scan
 %%-export([redirect_jmp/3]).
+-export([branch_preds/1]).
 
 %%% these tell cfg.inc what to define (ugly as hell)
 -define(BREADTH_ORDER,true).  % for linear scan
@@ -75,6 +76,26 @@ branch_successors(Branch) ->
     #pseudo_tailcall{} -> []
   end.
 
+branch_preds(Branch) -> % returns [{Label, Probability}] for the labels Branch may transfer control to
+  case Branch of
+    #pseudo_bc{true_label=TrueLab,false_label=FalseLab,pred=Pred} ->
+      [{FalseLab, 1.0-Pred}, {TrueLab, Pred}]; % the pred field is used as the probability of TrueLab
+    #pseudo_call{contlab=ContLab, sdesc=#arm_sdesc{exnlab=[]}} ->
+      %% A function can still cause an exception, even if we won't catch it
+      [{ContLab, 1.0-hipe_bb_weights:call_exn_pred()}]; % so this list sums to less than 1.0
+    #pseudo_call{contlab=ContLab, sdesc=#arm_sdesc{exnlab=ExnLab}} ->
+      CallExnPred = hipe_bb_weights:call_exn_pred(),
+      [{ContLab, 1.0-CallExnPred}, {ExnLab, CallExnPred}];
+    #pseudo_switch{labels=Labels} ->
+      Prob = 1.0/length(Labels), % every entry in labels gets the same share
+      [{L, Prob} || L <- Labels];
+    _ ->
+      case branch_successors(Branch) of
+	[] -> [];
+	[Single] -> [{Single, 1.0}] % two or more successors here would raise case_clause
+      end
+  end.
+
 -ifdef(REMOVE_TRIVIAL_BBS_NEEDED).
 fails_to(_Instr) -> [].
 -endif.
diff --git a/lib/hipe/ppc/hipe_ppc_cfg.erl b/lib/hipe/ppc/hipe_ppc_cfg.erl
index 58a4d5c8c3..d44d38f38d 100644
--- a/lib/hipe/ppc/hipe_ppc_cfg.erl
+++ b/lib/hipe/ppc/hipe_ppc_cfg.erl
@@ -22,6 +22,7 @@
 -export([postorder/1]).
 -export([linearise/1, params/1, reverse_postorder/1]).
 -export([redirect_jmp/3, arity/1]).
+-export([branch_preds/1]).
 
 %%% these tell cfg.inc what to define (ugly as hell)
 -define(BREADTH_ORDER,true).
@@ -74,6 +75,26 @@ branch_successors(Branch) ->
     #pseudo_tailcall{} -> []
   end.
 
+branch_preds(Branch) -> % returns [{Label, Probability}] for the labels Branch may transfer control to
+  case Branch of
+    #bctr{labels=Labels} ->
+      Prob = 1.0/length(Labels), % every entry in labels gets the same share
+      [{L, Prob} || L <- Labels];
+    #pseudo_bc{true_label=TrueLab,false_label=FalseLab,pred=Pred} ->
+      [{FalseLab, 1.0-Pred}, {TrueLab, Pred}]; % the pred field is used as the probability of TrueLab
+    #pseudo_call{contlab=ContLab, sdesc=#ppc_sdesc{exnlab=[]}} ->
+      %% A function can still cause an exception, even if we won't catch it
+      [{ContLab, 1.0-hipe_bb_weights:call_exn_pred()}]; % so this list sums to less than 1.0
+    #pseudo_call{contlab=ContLab, sdesc=#ppc_sdesc{exnlab=ExnLab}} ->
+      CallExnPred = hipe_bb_weights:call_exn_pred(),
+      [{ContLab, 1.0-CallExnPred}, {ExnLab, CallExnPred}];
+    _ ->
+      case branch_successors(Branch) of
+	[] -> [];
+	[Single] -> [{Single, 1.0}] % two or more successors here would raise case_clause
+      end
+  end.
+
 -ifdef(REMOVE_TRIVIAL_BBS_NEEDED).
 fails_to(_Instr) -> [].
 -endif.
diff --git a/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl b/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
index 341233e851..8914e7cac3 100644
--- a/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
+++ b/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
@@ -60,6 +60,9 @@
 	 update_bb/4,
 	 subst_temps/3]).
 
+%% callbacks for hipe_bb_weights
+-export([branch_preds/2]).
+
 %%----------------------------------------------------------------------------
 
 -include("../flow/cfg.hrl").
@@ -156,6 +159,9 @@ bb(CFG, L, _) ->
 update_bb(CFG,L,BB,_) ->
   hipe_x86_cfg:bb_add(CFG,L,BB).
 
+branch_preds(Instr,_) -> % the context argument is unused
+  hipe_x86_cfg:branch_preds(Instr).
+
 %% AMD64 stuff
 
 def_use(Instruction, _) ->
diff --git a/lib/hipe/regalloc/hipe_arm_specific.erl b/lib/hipe/regalloc/hipe_arm_specific.erl
index 224837a34a..517ad83982 100644
--- a/lib/hipe/regalloc/hipe_arm_specific.erl
+++ b/lib/hipe/regalloc/hipe_arm_specific.erl
@@ -56,6 +56,9 @@
 	 update_bb/4,
 	 subst_temps/3]).
 
+%% callbacks for hipe_bb_weights, hipe_range_split
+-export([branch_preds/2]).
+
 check_and_rewrite(CFG, Coloring, no_context) ->
   hipe_arm_ra_postconditions:check_and_rewrite(CFG, Coloring, 'normal').
 
@@ -119,6 +122,9 @@ bb(CFG,L,_) ->
 update_bb(CFG,L,BB,_) ->
   hipe_arm_cfg:bb_add(CFG,L,BB).
 
+branch_preds(Branch,_) -> % the context argument is unused
+  hipe_arm_cfg:branch_preds(Branch).
+
 %% ARM stuff
 
 def_use(Instruction, Ctx) ->
diff --git a/lib/hipe/regalloc/hipe_ppc_specific.erl b/lib/hipe/regalloc/hipe_ppc_specific.erl
index bc4bd1972e..50ab0f04cf 100644
--- a/lib/hipe/regalloc/hipe_ppc_specific.erl
+++ b/lib/hipe/regalloc/hipe_ppc_specific.erl
@@ -56,6 +56,9 @@
 	 update_bb/4,
 	 subst_temps/3]).
 
+%% callbacks for hipe_bb_weights
+-export([branch_preds/2]).
+
 check_and_rewrite(CFG, Coloring, _) ->
   hipe_ppc_ra_postconditions:check_and_rewrite(CFG, Coloring, 'normal').
 
@@ -119,6 +122,9 @@ bb(CFG,L,_) ->
 update_bb(CFG,L,BB,_) ->
   hipe_ppc_cfg:bb_add(CFG,L,BB).
 
+branch_preds(Instr,_) ->
+  hipe_ppc_cfg:branch_preds(Instr).
+
 %% PowerPC stuff
 
 def_use(Instruction, Ctx) ->
diff --git a/lib/hipe/regalloc/hipe_ppc_specific_fp.erl b/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
index 3aab14f061..5a4f63fc78 100644
--- a/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
+++ b/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
@@ -56,6 +56,9 @@
 	 update_bb/4,
 	 subst_temps/3]).
 
+%% callbacks for hipe_bb_weights
+-export([branch_preds/2]).
+
 check_and_rewrite(CFG, Coloring, _) ->
   hipe_ppc_ra_postconditions_fp:check_and_rewrite(CFG, Coloring).
 
@@ -112,6 +115,9 @@ bb(CFG, L, _) ->
 update_bb(CFG,L,BB,_) ->
   hipe_ppc_cfg:bb_add(CFG,L,BB).
 
+branch_preds(Instr,_) ->
+  hipe_ppc_cfg:branch_preds(Instr).
+
 %% PowerPC stuff
 
 def_use(I, Ctx) ->
diff --git a/lib/hipe/regalloc/hipe_sparc_specific.erl b/lib/hipe/regalloc/hipe_sparc_specific.erl
index 12642709b9..9eb57dafd2 100644
--- a/lib/hipe/regalloc/hipe_sparc_specific.erl
+++ b/lib/hipe/regalloc/hipe_sparc_specific.erl
@@ -56,6 +56,9 @@
 	 update_bb/4,
 	 subst_temps/3]).
 
+%% callbacks for hipe_bb_weights, hipe_range_split
+-export([branch_preds/2]).
+
 check_and_rewrite(CFG, Coloring, no_context) ->
   hipe_sparc_ra_postconditions:check_and_rewrite(CFG, Coloring, 'normal').
 
@@ -119,6 +122,9 @@ bb(CFG,L,_) ->
 update_bb(CFG,L,BB,_) ->
   hipe_sparc_cfg:bb_add(CFG,L,BB).
 
+branch_preds(Branch,_) ->
+  hipe_sparc_cfg:branch_preds(Branch).
+
 %% SPARC stuff
 
 def_use(Instruction, Ctx) ->
diff --git a/lib/hipe/regalloc/hipe_sparc_specific_fp.erl b/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
index ba4849d2d5..9993af81c2 100644
--- a/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
+++ b/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
@@ -56,6 +56,9 @@
 	 update_bb/4,
 	 subst_temps/3]).
 
+%% callbacks for hipe_bb_weights, hipe_range_split
+-export([branch_preds/2]).
+
 check_and_rewrite(CFG, Coloring, no_context) ->
   hipe_sparc_ra_postconditions_fp:check_and_rewrite(CFG, Coloring).
 
@@ -112,6 +115,9 @@ bb(CFG, L, _) ->
 update_bb(CFG,L,BB,_) ->
   hipe_sparc_cfg:bb_add(CFG,L,BB).
 
+branch_preds(Branch,_) ->
+  hipe_sparc_cfg:branch_preds(Branch).
+
 %% SPARC stuff
 
 def_use(I, Ctx) ->
diff --git a/lib/hipe/regalloc/hipe_x86_specific.erl b/lib/hipe/regalloc/hipe_x86_specific.erl
index 0d9d7b751e..f21384d377 100644
--- a/lib/hipe/regalloc/hipe_x86_specific.erl
+++ b/lib/hipe/regalloc/hipe_x86_specific.erl
@@ -73,6 +73,9 @@
 	 update_bb/4,
 	 subst_temps/3]).
 
+%% callbacks for hipe_bb_weights
+-export([branch_preds/2]).
+
 check_and_rewrite(CFG, Coloring, _) ->
   ?HIPE_X86_RA_POSTCONDITIONS:check_and_rewrite(CFG, Coloring, 'normal').
 
@@ -160,6 +163,9 @@ bb(CFG,L,_) ->
 update_bb(CFG,L,BB,_) ->
   hipe_x86_cfg:bb_add(CFG,L,BB).
 
+branch_preds(Instr,_) ->
+  hipe_x86_cfg:branch_preds(Instr).
+
 %% X86 stuff
 
 def_use(Instruction,_) ->
diff --git a/lib/hipe/sparc/hipe_sparc_cfg.erl b/lib/hipe/sparc/hipe_sparc_cfg.erl
index b31c90a03a..45c8e887b5 100644
--- a/lib/hipe/sparc/hipe_sparc_cfg.erl
+++ b/lib/hipe/sparc/hipe_sparc_cfg.erl
@@ -23,7 +23,7 @@
 -export([linearise/1]).
 -export([params/1]).
 -export([arity/1]). % for linear scan
--export([redirect_jmp/3]).
+-export([redirect_jmp/3, branch_preds/1]).
 
 -define(SPARC_CFG, true).     % needed for cfg.inc
 
@@ -78,6 +78,26 @@ branch_successors(Branch) ->
     #pseudo_tailcall{} -> []
   end.
 
+branch_preds(Branch) ->
+  case Branch of
+    #jmp{labels=Labels} ->
+      Prob = 1.0/length(Labels),
+      [{L, Prob} || L <- Labels];
+    #pseudo_bp{true_label=TrueLab,false_label=FalseLab,pred=Pred} ->
+      [{FalseLab, 1.0-Pred}, {TrueLab, Pred}];
+    #pseudo_call{contlab=ContLab, sdesc=#sparc_sdesc{exnlab=[]}} ->
+      %% A function can still cause an exception, even if we won't catch it
+      [{ContLab, 1.0-hipe_bb_weights:call_exn_pred()}];
+    #pseudo_call{contlab=ContLab, sdesc=#sparc_sdesc{exnlab=ExnLab}} ->
+      CallExnPred = hipe_bb_weights:call_exn_pred(),
+      [{ContLab, 1.0-CallExnPred}, {ExnLab, CallExnPred}];
+    _ ->
+      case branch_successors(Branch) of
+	[] -> [];
+	[Single] -> [{Single, 1.0}]
+      end
+  end.
+
 -ifdef(REMOVE_TRIVIAL_BBS_NEEDED).
 fails_to(_Instr) -> [].
 -endif.
diff --git a/lib/hipe/x86/hipe_x86_cfg.erl b/lib/hipe/x86/hipe_x86_cfg.erl
index a4544e1086..0a3c0fc9d6 100644
--- a/lib/hipe/x86/hipe_x86_cfg.erl
+++ b/lib/hipe/x86/hipe_x86_cfg.erl
@@ -19,7 +19,7 @@
          succ/2, pred/2,
          bb/2, bb_add/3, map_bbs/2, fold_bbs/3]).
 -export([postorder/1, reverse_postorder/1]).
--export([linearise/1, params/1, arity/1, redirect_jmp/3]).
+-export([linearise/1, params/1, arity/1, redirect_jmp/3, branch_preds/1]).
 
 %%% these tell cfg.inc what to define (ugly as hell)
 -define(PRED_NEEDED,true).
@@ -72,6 +72,26 @@ branch_successors(Branch) ->
 	#ret{} -> []
     end.
 
+branch_preds(Branch) ->
+  case Branch of
+    #jmp_switch{labels=Labels} ->
+      Prob = 1.0/length(Labels),
+      [{L, Prob} || L <- Labels];
+    #pseudo_call{contlab=ContLab, sdesc=#x86_sdesc{exnlab=[]}} ->
+      %% A function can still cause an exception, even if we won't catch it
+      [{ContLab, 1.0-hipe_bb_weights:call_exn_pred()}];
+    #pseudo_call{contlab=ContLab, sdesc=#x86_sdesc{exnlab=ExnLab}} ->
+      CallExnPred = hipe_bb_weights:call_exn_pred(),
+      [{ContLab, 1.0-CallExnPred}, {ExnLab, CallExnPred}];
+    #pseudo_jcc{true_label=TrueLab,false_label=FalseLab,pred=Pred} ->
+      [{FalseLab, 1.0-Pred}, {TrueLab, Pred}];
+    _ ->
+      case branch_successors(Branch) of
+	[] -> [];
+	[Single] -> [{Single, 1.0}]
+      end
+    end.
+
 -ifdef(REMOVE_TRIVIAL_BBS_NEEDED).
 fails_to(_Instr) -> [].
 -endif.
-- 
cgit v1.2.3


From cf047293ecf6ea108a1e5a412743bfb5fe66e26f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Fri, 16 Sep 2016 13:34:24 +0200
Subject: hipe: Add module computing basic block weights

hipe_bb_weights computes basic block weights by using the branch
probability predictions as the coefficients in a linear equation system.
This linear equation system is then solved using Gauss-Jordan
Elimination.

The equation system representation is picked to be efficient with highly
sparse data. During triangelisation, the remaining equations are
dynamically reordered in order to prevent the equations from growing in
the common case, preserving the benefit of the sparse equation
representation.

In the case that the input is very big, hipe_bb_weights automatically
falls back to a rough approximation in order to keep compile times under
control.
---
 lib/hipe/main/hipe.app.src       |   1 +
 lib/hipe/opt/Makefile            |   3 +-
 lib/hipe/opt/hipe_bb_weights.erl | 449 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 452 insertions(+), 1 deletion(-)
 create mode 100644 lib/hipe/opt/hipe_bb_weights.erl

diff --git a/lib/hipe/main/hipe.app.src b/lib/hipe/main/hipe.app.src
index dad095da63..01499472f4 100644
--- a/lib/hipe/main/hipe.app.src
+++ b/lib/hipe/main/hipe.app.src
@@ -76,6 +76,7 @@
 	     hipe_arm_specific,
 	     hipe_arm_subst,
 	     hipe_bb,
+	     hipe_bb_weights,
 	     hipe_beam_to_icode,
 	     hipe_coalescing_regalloc,
 	     hipe_consttab,
diff --git a/lib/hipe/opt/Makefile b/lib/hipe/opt/Makefile
index 684d6f45b4..5a729d04ae 100644
--- a/lib/hipe/opt/Makefile
+++ b/lib/hipe/opt/Makefile
@@ -43,7 +43,8 @@ RELSYSDIR = $(RELEASE_PATH)/lib/hipe-$(VSN)
 # ----------------------------------------------------
 # Target Specs
 # ----------------------------------------------------
-MODULES = hipe_spillmin hipe_spillmin_color hipe_spillmin_scan
+MODULES = hipe_spillmin hipe_spillmin_color hipe_spillmin_scan \
+	hipe_bb_weights
 
 HRL_FILES=
 ERL_FILES= $(MODULES:%=%.erl)
diff --git a/lib/hipe/opt/hipe_bb_weights.erl b/lib/hipe/opt/hipe_bb_weights.erl
new file mode 100644
index 0000000000..8ef113b94c
--- /dev/null
+++ b/lib/hipe/opt/hipe_bb_weights.erl
@@ -0,0 +1,449 @@
+%% -*- erlang-indent-level: 2 -*-
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%@doc
+%%	                BASIC BLOCK WEIGHTING
+%%
+%% Computes basic block weights by using branch probabilities as weights in a
+%% linear equation system, that is then solved using Gauss-Jordan Elimination.
+%%
+%% The equation system representation is intentionally sparse, since most blocks
+%% have at most two successors.
+-module(hipe_bb_weights).
+-export([compute/3, compute_fast/3, weight/2, call_exn_pred/0]).
+-export_type([bb_weights/0]).
+
+-compile(inline).
+
+%%-define(DO_ASSERT,1).
+%%-define(DEBUG,1).
+-include("../main/hipe.hrl").
+
+%% If the equation system is large, it might take too long to solve it exactly.
+%% Thus, if there are more than ?HEUR_MAX_SOLVE labels, we use the iterative
+%% approximation.
+-define(HEUR_MAX_SOLVE, 10000).
+
+-opaque bb_weights() :: #{label() => float()}.
+
+-type cfg() :: any().
+-type target_module() :: module().
+-type target_context() :: any().
+-type target() :: {target_module(), target_context()}.
+
+-type label()            :: integer().
+-type var()              :: label().
+-type assignment()       :: {var(), float()}.
+-type eq_assoc()         :: [{var(), key()}].
+-type solution()         :: [assignment()].
+
+%% Constant. Predicted probability of a call resulting in an exception.
+-spec call_exn_pred() -> float().
+call_exn_pred() -> 0.01.
+
+-spec compute(cfg(), target_module(), target_context()) -> bb_weights().
+compute(CFG, TgtMod, TgtCtx) ->
+  Target = {TgtMod, TgtCtx},
+  Labels = labels(CFG, Target),
+  if length(Labels) > ?HEUR_MAX_SOLVE ->
+      ?debug_msg("~w: Too many labels (~w), approximating.~n",
+		 [?MODULE, length(Labels)]),
+      compute_fast(CFG, TgtMod, TgtCtx);
+     true ->
+      {EqSys, EqAssoc} = build_eq_system(CFG, Labels, Target),
+      case solve(EqSys, EqAssoc) of
+	{ok, Solution} ->
+	  maps:from_list(Solution)
+      end
+  end.
+
+-spec build_eq_system(cfg(), [label()], target()) -> {eq_system(), eq_assoc()}.
+build_eq_system(CFG, Labels, Target) ->
+  StartLb = hipe_gen_cfg:start_label(CFG),
+  EQS0 = eqs_new(),
+  {EQS1, Assoc} = build_eq_system(Labels, CFG, Target, [], EQS0),
+  {StartLb, StartKey} = lists:keyfind(StartLb, 1, Assoc),
+  StartRow0 = eqs_get(StartKey, EQS1),
+  StartRow = row_set_const(-1.0, StartRow0), % -1.0 since StartLb coef is -1.0
+  EQS = eqs_put(StartKey, StartRow, EQS1),
+  {EQS, Assoc}.
+
+build_eq_system([], _CFG, _Target, Map, EQS) -> {EQS, lists:reverse(Map)};
+build_eq_system([L|Ls], CFG, Target, Map, EQS0) ->
+  PredProb = pred_prob(L, CFG, Target),
+  {Key, EQS} = eqs_insert(row_new([{L, -1.0}|PredProb], 0.0), EQS0),
+  build_eq_system(Ls, CFG, Target, [{L, Key}|Map], EQS).
+
+pred_prob(L, CFG, Target) ->
+  [begin
+     BB = bb(CFG, Pred, Target),
+     Ps = branch_preds(hipe_bb:last(BB), Target),
+     ?ASSERT(length(lists:ukeysort(1, Ps))
+	     =:= length(hipe_gen_cfg:succ(CFG, Pred))),
+     case lists:keyfind(L, 1, Ps) of
+       {L, Prob} when is_float(Prob) -> {Pred, Prob}
+     end
+   end || Pred <- hipe_gen_cfg:pred(CFG, L)].
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+-spec triangelise(eq_system(), eq_assoc()) -> {eq_system(), eq_assoc()}.
+triangelise(EQS, VKs) ->
+  triangelise_1(mk_triix(EQS, VKs), []).
+
+triangelise_1(TIX0, Acc) ->
+  case triix_is_empty(TIX0) of
+    true -> {triix_eqs(TIX0), lists:reverse(Acc)};
+    false ->
+      {V,Key,TIX1} = triix_pop_smallest(TIX0),
+      Row0 = triix_get(Key, TIX1),
+      case row_get(V, Row0) of
+	Coef when Coef > -0.0001, Coef < 0.0001 ->
+	  throw(error);
+	_ ->
+	  Row = row_normalise(V, Row0),
+	  TIX2 = triix_put(Key, Row, TIX1),
+	  TIX = eliminate_triix(V, Key, Row, TIX2),
+	  triangelise_1(TIX, [{V,Key}|Acc])
+      end
+  end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Triangelisation maintains its own index, outside of eqs. This index is
+%% essentially a BST (used as a heap) of all equations by size, with {Key,Var}
+%% as the values and only containing a subset of all the keys in the whole
+%% equation system. The key operation is triix_pop_smallest/1, which pops a
+%% {Key,Var} from the heap corresponding to one of the smallest equations. This
+%% is critical in order to prevent the equations from growing during
+%% triangelisation, which would make the algorithm O(n^2) in the common case.
+-type tri_eq_system() :: {eq_system(),
+			  gb_trees:tree(non_neg_integer(),
+					gb_trees:tree(key(), var()))}.
+
+triix_eqs({EQS, _}) -> EQS.
+triix_get(Key, {EQS, _}) -> eqs_get(Key, EQS).
+triix_is_empty({_, Tree}) -> gb_trees:is_empty(Tree).
+triix_lookup(V, {EQS, _}) -> eqs_lookup(V, EQS).
+
+mk_triix(EQS, VKs) ->
+  {EQS,
+   lists:foldl(fun({V,Key}, Tree) ->
+		   Size = row_size(eqs_get(Key, EQS)),
+		   sitree_insert(Size, Key, V, Tree)
+	       end, gb_trees:empty(), VKs)}.
+
+sitree_insert(Size, Key, V, SiTree) ->
+  SubTree1 =
+    case gb_trees:lookup(Size, SiTree) of
+      none -> gb_trees:empty();
+      {value, SubTree0} -> SubTree0
+    end,
+  SubTree = gb_trees:insert(Key, V, SubTree1),
+  gb_trees:enter(Size, SubTree, SiTree).
+
+sitree_update_subtree(Size, SubTree, SiTree) ->
+  case gb_trees:is_empty(SubTree) of
+    true -> gb_trees:delete(Size, SiTree);
+    false -> gb_trees:update(Size, SubTree, SiTree)
+  end.
+
+triix_put(Key, Row, {EQS, Tree0}) ->
+  OldSize = row_size(eqs_get(Key, EQS)),
+  case row_size(Row) of
+    OldSize -> {eqs_put(Key, Row, EQS), Tree0};
+    Size ->
+      Tree =
+	case gb_trees:lookup(OldSize, Tree0) of
+	  none -> Tree0;
+	  {value, SubTree0} ->
+	    case gb_trees:lookup(Key, SubTree0) of
+	      none -> Tree0;
+	      {value, V} ->
+		SubTree = gb_trees:delete(Key, SubTree0),
+		Tree1 = sitree_update_subtree(OldSize, SubTree, Tree0),
+		sitree_insert(Size, Key, V, Tree1)
+	    end
+	end,
+      {eqs_put(Key, Row, EQS), Tree}
+  end.
+
+triix_pop_smallest({EQS, Tree}) ->
+  {Size, SubTree0} = gb_trees:smallest(Tree),
+  {Key, V, SubTree} = gb_trees:take_smallest(SubTree0),
+  {V, Key, {EQS, sitree_update_subtree(Size, SubTree, Tree)}}.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+
+row_normalise(Var, Row) ->
+  %% Normalise v's coef to 1.0
+  %% row_set_coef ensures the coef is exactly 1.0 (no rounding errors)
+  row_set_coef(Var, 1.0, row_scale(Row, 1.0/row_get(Var, Row))).
+
+%% Precondition: Row must be normalised; i.e. Var's coef must be 1.0 (mod
+%% rounding errors)
+-spec eliminate(var(), key(), row(), eq_system()) -> eq_system().
+eliminate(Var, Key, Row, TIX0) ->
+  eliminate_abstr(Var, Key, Row, TIX0,
+		  fun eqs_get/2, fun eqs_lookup/2, fun eqs_put/3).
+
+-spec eliminate_triix(var(), key(), row(), tri_eq_system()) -> tri_eq_system().
+eliminate_triix(Var, Key, Row, TIX0) ->
+  eliminate_abstr(Var, Key, Row, TIX0,
+		  fun triix_get/2, fun triix_lookup/2, fun triix_put/3).
+
+%% The same function implemented for two data types, eqs and triix.
+-compile({inline, eliminate_abstr/7}).
+-spec eliminate_abstr(var(), key(), row(), ADT, fun((key(), ADT) -> row()),
+		      fun((var(), ADT) -> [key()]),
+		      fun((key(), row(), ADT) -> ADT)) -> ADT.
+eliminate_abstr(Var, Key, Row, ADT0, GetFun, LookupFun, PutFun) ->
+  ?ASSERT(1.0 =:= row_get(Var, Row)),
+  ADT =
+    lists:foldl(fun(RK, ADT1) when RK =:= Key -> ADT1;
+		   (RK, ADT1) ->
+		    R = GetFun(RK, ADT1),
+		    PutFun(RK, row_addmul(R, Row, -row_get(Var, R)), ADT1)
+		end, ADT0, LookupFun(Var, ADT0)),
+  [Key] = LookupFun(Var, ADT),
+  ADT.
+
+-spec solve(eq_system(), eq_assoc()) -> error | {ok, solution()}.
+solve(EQS0, EqAssoc0) ->
+  try triangelise(EQS0, EqAssoc0)
+  of {EQS1, EqAssoc} ->
+      {ok, solve_1(EqAssoc, maps:from_list(EqAssoc), EQS1, [])}
+  catch error -> error
+  end.
+
+solve_1([], _VarEqs, _EQS, Acc) -> Acc;
+solve_1([{V,K}|Ps], VarEqs, EQS0, Acc0) ->
+  Row0 = eqs_get(K, EQS0),
+  VarsToKill = [Var || {Var, _} <- row_coefs(Row0), Var =/= V],
+  Row1 = kill_vars(VarsToKill, VarEqs, EQS0, Row0),
+  [{V,_}] = row_coefs(Row1), % assertion
+  Row = row_normalise(V, Row1),
+  [{V,1.0}] = row_coefs(Row), % assertion
+  EQS = eliminate(V, K, Row, EQS0),
+  [K] = eqs_lookup(V, EQS),
+  solve_1(Ps, VarEqs, eqs_remove(K, EQS), [{V, row_const(Row)}|Acc0]).
+
+kill_vars([], _VarEqs, _EQS, Row) -> Row;
+kill_vars([V|Vs], VarEqs, EQS, Row0) ->
+  VRow0 = eqs_get(maps:get(V, VarEqs), EQS),
+  VRow = row_normalise(V, VRow0),
+  ?ASSERT(1.0 =:= row_get(V, VRow)),
+  Row = row_addmul(Row0, VRow, -row_get(V, Row0)),
+  ?ASSERT(0.0 =:= row_get(V, Row)), % V has been killed
+  kill_vars(Vs, VarEqs, EQS, Row).
+
+-spec weight(label(), bb_weights()) -> float().
+weight(Lbl, Weights) ->
+  maps:get(Lbl, Weights).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Row datatype
+%% Invariant: No 0.0 coefficients!
+-spec row_empty() -> row().
+row_empty() -> {orddict:new(), 0.0}.
+
+-spec row_new([{var(), float()}], float()) -> row().
+row_new(Coefs, Const) when is_float(Const) ->
+  row_ensure_invar({row_squash_multiples(lists:keysort(1, Coefs)), Const}).
+
+row_squash_multiples([{K, C1},{K, C2}|Ps]) ->
+  row_squash_multiples([{K,C1+C2}|Ps]);
+row_squash_multiples([P|Ps]) -> [P|row_squash_multiples(Ps)];
+row_squash_multiples([]) -> [].
+
+row_ensure_invar({Coef, Const}) ->
+  {orddict:filter(fun(_, 0.0) -> false; (_, F) when is_float(F) -> true end,
+		  Coef), Const}.
+
+row_const({_, Const}) -> Const.
+row_coefs({Coefs, _}) -> orddict:to_list(Coefs).
+row_size({Coefs, _}) -> orddict:size(Coefs).
+
+row_get(Var, {Coefs, _}) ->
+  case lists:keyfind(Var, 1, Coefs) of
+    false -> 0.0;
+    {_, Coef} -> Coef
+  end.
+
+row_set_coef(Var, 0.0, {Coefs, Const}) ->
+  {orddict:erase(Var, Coefs), Const};
+row_set_coef(Var, Coef, {Coefs, Const}) ->
+  {orddict:store(Var, Coef, Coefs), Const}.
+
+row_set_const(Const, {Coefs, _}) -> {Coefs, Const}.
+
+%% Lhs + Rhs*Factor
+-spec row_addmul(row(), row(), float()) -> row().
+row_addmul({LhsCoefs, LhsConst}, {RhsCoefs, RhsConst}, Factor)
+  when is_float(Factor) ->
+  Coefs = row_addmul_coefs(LhsCoefs, RhsCoefs, Factor),
+  Const = LhsConst + RhsConst * Factor,
+  {Coefs, Const}.
+
+row_addmul_coefs(Ls, [], Factor) when is_float(Factor) -> Ls;
+row_addmul_coefs([], Rs, Factor) when is_float(Factor) ->
+  row_scale_coefs(Rs, Factor);
+row_addmul_coefs([L={LV, _}|Ls], Rs=[{RV,_}|_], Factor)
+  when LV < RV, is_float(Factor) ->
+  [L|row_addmul_coefs(Ls, Rs, Factor)];
+row_addmul_coefs(Ls=[{LV, _}|_], [{RV, RC}|Rs], Factor)
+  when LV > RV, is_float(RC), is_float(Factor) ->
+  [{RV, RC*Factor}|row_addmul_coefs(Ls, Rs, Factor)];
+row_addmul_coefs([{V, LC}|Ls], [{V, RC}|Rs], Factor)
+  when is_float(LC), is_float(RC), is_float(Factor) ->
+  case LC + RC * Factor of
+    0.0 ->      row_addmul_coefs(Ls, Rs, Factor);
+    C -> [{V,C}|row_addmul_coefs(Ls, Rs, Factor)]
+  end.
+
+row_scale(_, 0.0) -> row_empty();
+row_scale({RowCoefs, RowConst}, Factor) when is_float(Factor) ->
+  {row_scale_coefs(RowCoefs, Factor), RowConst * Factor}.
+
+row_scale_coefs([{V,C}|Cs], Factor) when is_float(Factor), is_float(C) ->
+  [{V,C*Factor}|row_scale_coefs(Cs, Factor)];
+row_scale_coefs([], Factor) when is_float(Factor) ->
+  [].
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Equation system ADT
+%%
+%% Stores a linear equation system, allowing for efficient updates and efficient
+%% queries for all equations mentioning a variable.
+%%
+%% It is sort of like a "database" table of {Primary, Terms, Const} indexed both
+%% on Primary as well as the vars (map keys) in Terms.
+-type row()       :: {Terms :: orddict:orddict(var(), float()),
+		      Const :: float()}.
+-type key()       :: non_neg_integer().
+-type rev_index() :: #{var() => ordsets:ordset(key())}.
+-record(eq_system, {
+	  rows = #{}              :: #{key() => row()},
+	  revidx = revidx_empty() :: rev_index(),
+	  next_key = 0            :: key()
+	 }).
+-type eq_system() :: #eq_system{}.
+
+eqs_new() -> #eq_system{}.
+
+-spec eqs_insert(row(), eq_system()) -> {key(), eq_system()}.
+eqs_insert(Row, EQS=#eq_system{next_key=NextKey0}) ->
+  Key = NextKey0,
+  NextKey = NextKey0 + 1,
+  {Key, eqs_insert(Key, Row, EQS#eq_system{next_key=NextKey})}.
+
+eqs_insert(Key, Row, EQS=#eq_system{rows=Rows, revidx=RevIdx0}) ->
+  RevIdx = revidx_add(Key, Row, RevIdx0),
+  EQS#eq_system{rows=Rows#{Key => Row}, revidx=RevIdx}.
+
+eqs_put(Key, Row, EQS0) ->
+  eqs_insert(Key, Row, eqs_remove(Key, EQS0)).
+
+eqs_remove(Key, EQS=#eq_system{rows=Rows, revidx=RevIdx0}) ->
+  OldRow = maps:get(Key, Rows),
+  RevIdx = revidx_remove(Key, OldRow, RevIdx0),
+  EQS#eq_system{rows = maps:remove(Key, Rows), revidx=RevIdx}.
+
+-spec eqs_get(key(), eq_system()) -> row().
+eqs_get(Key, #eq_system{rows=Rows}) -> maps:get(Key, Rows).
+
+%% Keys of all equations containing a nonzero coefficient for Var
+-spec eqs_lookup(var(), eq_system()) -> ordsets:ordset(key()).
+eqs_lookup(Var, #eq_system{revidx=RevIdx}) -> maps:get(Var, RevIdx).
+
+%% eqs_rows(#eq_system{rows=Rows}) -> maps:to_list(Rows).
+
+%% eqs_print(EQS) ->
+%%   lists:foreach(fun({_, Row}) ->
+%% 		    row_print(Row)
+%% 		end, lists:sort(eqs_rows(EQS))).
+
+%% row_print(Row) ->
+%%   CoefStrs = [io_lib:format("~wl~w", [Coef, Var])
+%% 	      || {Var, Coef} <- row_coefs(Row)],
+%%   CoefStr = lists:join(" + ", CoefStrs),
+%%   io:format("~w = ~s~n", [row_const(Row), CoefStr]).
+
+revidx_empty() -> #{}.
+
+-spec revidx_add(key(), row(), rev_index()) -> rev_index().
+revidx_add(Key, Row, RevIdx0) ->
+  orddict:fold(fun(Var, _Coef, RevIdx1) ->
+		?ASSERT(_Coef /= 0.0),
+		RevIdx1#{Var => ordsets:add_element(
+				  Key, maps:get(Var, RevIdx1, ordsets:new()))}
+	    end, RevIdx0, row_coefs(Row)).
+
+-spec revidx_remove(key(), row(), rev_index()) -> rev_index().
+revidx_remove(Key, {Coefs, _}, RevIdx0) ->
+  orddict:fold(fun(Var, _Coef, RevIdx1) ->
+		case RevIdx1 of
+		  #{Var := Keys0} ->
+		    case ordsets:del_element(Key, Keys0) of
+		      [] -> maps:remove(Var, RevIdx1);
+		      Keys -> RevIdx1#{Var := Keys}
+		    end
+		end
+	    end, RevIdx0, Coefs).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+-define(FAST_ITERATIONS, 5).
+
+%% @doc Computes a rough approximation of BB weights. The approximation is
+%% particularly poor (converges slowly) for recursive functions and loops.
+-spec compute_fast(cfg(), target_module(), target_context()) -> bb_weights().
+compute_fast(CFG, TgtMod, TgtCtx) ->
+  Target = {TgtMod, TgtCtx},
+  StartLb = hipe_gen_cfg:start_label(CFG),
+  RPO = reverse_postorder(CFG, Target),
+  PredProbs = [{L, pred_prob(L, CFG, Target)} || L <- RPO, L =/= StartLb],
+  Probs0 = (maps:from_list([{L, 0.0} || L <- RPO]))#{StartLb := 1.0},
+  fast_iterate(?FAST_ITERATIONS, PredProbs, Probs0).
+
+fast_iterate(0, _Pred, Probs) -> Probs;
+fast_iterate(Iters, Pred, Probs0) ->
+  fast_iterate(Iters-1, Pred,
+	       fast_one(Pred, Probs0)).
+
+fast_one([{L, Pred}|Ls], Probs0) ->
+  Weight = fast_sum(Pred, Probs0, 0.0),
+  Probs = Probs0#{L => Weight},
+  fast_one(Ls, Probs);
+fast_one([], Probs) ->
+  Probs.
+
+fast_sum([{P,EWt}|Pred], Probs, Acc) when is_float(EWt), is_float(Acc) ->
+  case Probs of
+    #{P := PWt} when is_float(PWt) ->
+      fast_sum(Pred, Probs, Acc + PWt * EWt)
+  end;
+fast_sum([], _Probs, Acc) when is_float(Acc) ->
+  Acc.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Target module interface functions
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+-define(TGT_IFACE_0(N), N(         {M,C}) -> M:N(         C)).
+-define(TGT_IFACE_1(N), N(A1,      {M,C}) -> M:N(A1,      C)).
+-define(TGT_IFACE_2(N), N(A1,A2,   {M,C}) -> M:N(A1,A2,   C)).
+-define(TGT_IFACE_3(N), N(A1,A2,A3,{M,C}) -> M:N(A1,A2,A3,C)).
+
+?TGT_IFACE_2(bb).
+?TGT_IFACE_1(branch_preds).
+?TGT_IFACE_1(labels).
+?TGT_IFACE_1(reverse_postorder).
-- 
cgit v1.2.3


From d1d26f4bf9da3cc5eab4e918df771d67fe9e6bb5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Thu, 16 Mar 2017 16:39:26 +0100
Subject: hipe: Add range splitter range_split
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

hipe_range_split is a complex live range splitter, more sophisticated
than hipe_restore_reuse, but still targeted specifically at temporaries
forced onto stack by being live over call instructions.

hipe_range_split partitions the control flow graph at call instructions,
like hipe_regalloc_prepass. Splitting decisions are made on a per
partition and per temporary basis.

There are three different ways in which hipe_range_split may choose to
split a temporary in a program partition:

 * Mode1: Spill the temp before calls, and restore it after them
 * Mode2: Spill the temp after definitions, restore it after calls
 * Mode3: Spill the temp after definitions, restore it before uses

To pick which of these should be used for each temp×partition pair,
hipe_range_split uses a cost function. The cost is simply the sum of the
cost of all expected stack accesses, and the cost for an individual
stack access is based on the probability weight of the basic block that
it resides in. This biases the range splitter so that it attempts moving
stack accesses from a function's hot path to the cold path.
hipe_bb_weights is used to compute the probability weights.

mode3 is effectively the same as what hipe_restore_reuse does. Because
of this, hipe_range_split reuses the analysis pass of
hipe_restore_reuse in order to compute the minimal needed set of spills
and restores. The reason mode3 was introduced to hipe_range_split rather
than simply composing it with hipe_restore_reuse (by running both) is
that such a composition resulted in poor register allocation results due
to insufficiently strong move coalescing in the register allocator.

The cost function heuristic has a couple of tuning knobs:

 * {range_split_min_gain, Gain} (default: 1.1, range: [0.0, inf))
   The minimum proportional improvement that the cost of all stack
   accesses to a temp must display in order for that temp to be split.
 * {range_split_mode1_fudge, Factor} (default: 1.1, range: [0.0, inf))
   Costs for mode1 are multiplied by this factor in order to discourage
   it when it provides marginal benefits. The justification is that
   mode1 causes temps to be live for longest, thus leading to higher
   register pressure.
 * {range_split_weight_power, Factor} (default: 2, range: (0.0, inf))
   Adjusts how much effect the basic block weights have on the cost of a
   stack access. A stack access in a block with weight 1.0 has cost 1.0,
   a stack access in a block with weight 0.01 has cost 1/Factor.

Additionally, the option range_split_weights chooses whether the basic
block weights are used at all.

In the case that the input is very big, hipe_range_split automatically
falls back to hipe_restore_reuse only in order to keep compile times
under control. Note that this is not only because of hipe_range_split
being slow, but also due to the resulting program being slow to register
allocate, and not being as partitionable by hipe_regalloc_prepass.
hipe_restore_reuse, on the other hand, does not affect the program's
partitionability.

The hipe_range_split pass is controlled by a new option ra_range_split.
ra_range_split is added to o2, and ra_restore_reuse is disabled in o2.
---
 lib/hipe/main/hipe.app.src               |    1 +
 lib/hipe/main/hipe.erl                   |   17 +-
 lib/hipe/regalloc/Makefile               |    1 +
 lib/hipe/regalloc/hipe_range_split.erl   | 1187 ++++++++++++++++++++++++++++++
 lib/hipe/regalloc/hipe_regalloc_loop.erl |   18 +-
 lib/hipe/regalloc/hipe_restore_reuse.erl |   15 +-
 6 files changed, 1232 insertions(+), 7 deletions(-)
 create mode 100644 lib/hipe/regalloc/hipe_range_split.erl

diff --git a/lib/hipe/main/hipe.app.src b/lib/hipe/main/hipe.app.src
index 01499472f4..de0b255c01 100644
--- a/lib/hipe/main/hipe.app.src
+++ b/lib/hipe/main/hipe.app.src
@@ -148,6 +148,7 @@
 	     hipe_ppc_specific_fp,
 	     hipe_ppc_subst,
 	     hipe_profile,
+	     hipe_range_split,
 	     hipe_reg_worklists,
 	     hipe_regalloc_loop,
 	     hipe_regalloc_prepass,
diff --git a/lib/hipe/main/hipe.erl b/lib/hipe/main/hipe.erl
index f3e7c0879e..19b4e8bfe2 100644
--- a/lib/hipe/main/hipe.erl
+++ b/lib/hipe/main/hipe.erl
@@ -1230,6 +1230,13 @@ option_text(regalloc) ->
   "    optimistic - another variant of a coalescing allocator";
 option_text(remove_comments) ->
   "Strip comments from intermediate code";
+option_text(ra_range_split) ->
+  "Split live ranges of temporaries live over call instructions\n"
+  "before performing register allocation.\n"
+  "Heuristically tries to move stack accesses to the cold path of function.\n"
+  "This range splitter is more sophisticated than 'ra_restore_reuse', but has\n"
+  "a significantly larger impact on compile time.\n"
+  "Should only be used with move coalescing register allocators.";
 option_text(ra_restore_reuse) ->
   "Split live ranges of temporaries such that straight-line\n"
   "code will not need to contain multiple restores from the same stack\n"
@@ -1376,7 +1383,12 @@ opt_keys() ->
      pp_rtl_linear,
      ra_partitioned,
      ra_prespill,
+     ra_range_split,
      ra_restore_reuse,
+     range_split_min_gain,
+     range_split_mode1_fudge,
+     range_split_weight_power,
+     range_split_weights,
      regalloc,
      remove_comments,
      rtl_ssa,
@@ -1436,7 +1448,8 @@ o1_opts(TargetArch) ->
 
 o2_opts(TargetArch) ->
   Common = [icode_type, icode_call_elim, % icode_ssa_struct_reuse,
-	    rtl_lcm | (o1_opts(TargetArch) -- [rtl_ssapre])],
+	    ra_range_split, range_split_weights, % XXX: Having defaults here is ugly
+	    rtl_lcm | (o1_opts(TargetArch) -- [rtl_ssapre, ra_restore_reuse])],
   case TargetArch of
     T when T =:= amd64 orelse T =:= ppc64 -> % 64-bit targets
       [icode_range | Common];
@@ -1484,7 +1497,9 @@ opt_negations() ->
    {no_pp_rtl_ssapre, pp_rtl_ssapre},
    {no_ra_partitioned, ra_partitioned},
    {no_ra_prespill, ra_prespill},
+   {no_ra_range_split, ra_range_split},
    {no_ra_restore_reuse, ra_restore_reuse},
+   {no_range_split_weights, range_split_weights},
    {no_remove_comments, remove_comments},
    {no_rtl_ssa, rtl_ssa},
    {no_rtl_ssa_const_prop, rtl_ssa_const_prop},
diff --git a/lib/hipe/regalloc/Makefile b/lib/hipe/regalloc/Makefile
index 73ac232f51..81a92e5d35 100644
--- a/lib/hipe/regalloc/Makefile
+++ b/lib/hipe/regalloc/Makefile
@@ -50,6 +50,7 @@ MODULES = hipe_ig hipe_ig_moves hipe_moves \
 	  hipe_optimistic_regalloc \
 	  hipe_coalescing_regalloc \
 	  hipe_graph_coloring_regalloc \
+	  hipe_range_split \
 	  hipe_regalloc_loop \
 	  hipe_regalloc_prepass \
 	  hipe_restore_reuse \
diff --git a/lib/hipe/regalloc/hipe_range_split.erl b/lib/hipe/regalloc/hipe_range_split.erl
new file mode 100644
index 0000000000..39b086d9f7
--- /dev/null
+++ b/lib/hipe/regalloc/hipe_range_split.erl
@@ -0,0 +1,1187 @@
+%% -*- erlang-indent-level: 2 -*-
+%%
+%% Licensed under the Apache License, Version 2.0 (the "License");
+%% you may not use this file except in compliance with the License.
+%% You may obtain a copy of the License at
+%%
+%%     http://www.apache.org/licenses/LICENSE-2.0
+%%
+%% Unless required by applicable law or agreed to in writing, software
+%% distributed under the License is distributed on an "AS IS" BASIS,
+%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+%% See the License for the specific language governing permissions and
+%% limitations under the License.
+%%
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%%@doc
+%%	       TEMPORARY LIVE RANGE SPLITTING PASS
+%%
+%% Live range splitting is useful to allow a register allocator to allocate a
+%% temporary to a register for a part of its lifetime, even if it cannot be for
+%% the entirety. This improves register allocation quality, at the cost of
+%% making the allocation problem more time and memory intensive to solve.
+%%
+%% Optimal allocation can be achieved if all temporaries are split at every
+%% program point (between all instructions), but this makes register allocation
+%% infeasibly slow in practice. Instead, this module uses heuristics to choose
+%% which temporaries should have their live ranges split, and at which points.
+%%
+%% The range splitter only considers temps which are live during a call
+%% instruction, since they're known to be spilled. The control-flow graph is
+%% partitioned at call instructions and splitting decisions are made separately
+%% for each partition. The register copy of a temp (if any) gets a separate name
+%% in each partition.
+%%
+%% There are three different ways the range splitter may choose to split a
+%% temporary in a program partition:
+%%
+%%  * Mode1: Spill the temp before calls, and restore it after them
+%%  * Mode2: Spill the temp after definitions, restore it after calls
+%%  * Mode3: Spill the temp after definitions, restore it before uses
+%%
+%% To pick which of these should be used for each temp×partition pair, the range
+%% splitter uses a cost function. The cost is simply the sum of the cost of all
+%% expected stack accesses, and the cost for an individual stack access is based
+%% on the probability weight of the basic block that it resides in. This biases
+%% the range splitter so that it attempts to move stack accesses from a function's
+%% hot path to the cold path.
+%%
+%% The heuristic has a couple of tuning knobs, adjusting its preference for
+%% different spilling modes, aggressiveness, and how much influence the basic
+%% block probability weights have.
+%%
+%% Edge case not handled: Call instructions directly defining a pseudo. In that
+%% case, if that pseudo has been selected for mode2 spills, no spill is inserted
+%% after the call.
+-module(hipe_range_split).
+
+-export([split/5]).
+
+-compile(inline).
+
+%% -define(DO_ASSERT, 1).
+%% -define(DEBUG, 1).
+-include("../main/hipe.hrl").
+
+%% Heuristic tuning constants
+-define(DEFAULT_MIN_GAIN, 1.1).    % option: range_split_min_gain
+-define(DEFAULT_MODE1_FUDGE, 1.1). % option: range_split_mode1_fudge
+-define(DEFAULT_WEIGHT_POWER, 2).  % option: range_split_weight_power
+-define(WEIGHT_CONST_FUN(Power), math:log(Power)/math:log(100)).
+-define(WEIGHT_FUN(Wt, Const), math:pow(Wt, Const)).
+-define(HEUR_MAX_TEMPS, 20000).
+
+-type target_cfg()       :: any().
+-type target_instr()     :: any().
+-type target_temp()      :: any().
+-type liveness()         :: any().
+-type target_module()    :: module().
+-type target_context()   :: any().
+-type target()           :: {target_module(), target_context()}.
+-type liveset()          :: ordsets:ordset(temp()).
+-type temp()             :: non_neg_integer().
+-type label()            :: non_neg_integer().
+
+-spec split(target_cfg(), liveness(), target_module(), target_context(),
+	    comp_options())
+	   -> target_cfg().
+split(TCFG0, Liveness, TargetMod, TargetContext, Options) ->
+  Target = {TargetMod, TargetContext},
+  NoTemps = number_of_temporaries(TCFG0, Target),
+  if NoTemps > ?HEUR_MAX_TEMPS ->
+      ?debug_msg("~w: Too many temps (~w), falling back on restore_reuse.~n",
+		 [?MODULE, NoTemps]),
+      hipe_restore_reuse:split(TCFG0, Liveness, TargetMod, TargetContext);
+     true ->
+      Wts = compute_weights(TCFG0, TargetMod, TargetContext, Options),
+      {CFG0, Temps} = convert(TCFG0, Target),
+      Avail = avail_analyse(TCFG0, Liveness, Target),
+      Defs = def_analyse(CFG0, TCFG0),
+      RDefs = rdef_analyse(CFG0),
+      PLive = plive_analyse(CFG0),
+      {CFG, DUCounts, Costs, DSets0} =
+	scan(CFG0, Liveness, PLive, Wts, Defs, RDefs, Avail, Target),
+      {DSets, _} = hipe_dsets:to_map(DSets0),
+      Renames = decide(DUCounts, Costs, Target, Options),
+      rewrite(CFG, TCFG0, Target, Liveness, PLive, Defs, Avail, DSets, Renames,
+	      Temps)
+  end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Internal program representation
+%%
+%% Second pass: Convert cfg to internal representation
+
+-record(cfg, {
+	  rpo_labels :: [label()],
+	  bbs        :: #{label() => bb()}
+	 }).
+-type cfg() :: #cfg{}.
+
+cfg_bb(L, #cfg{bbs=BBS}) -> maps:get(L, BBS).
+
+cfg_postorder(#cfg{rpo_labels=RPO}) -> lists:reverse(RPO).
+
+-record(bb, {
+	  code     :: [code_elem()],
+	  %% If the last instruction of code defines all allocatable registers
+	  has_call :: boolean(),
+	  succ     :: [label()]
+	 }).
+-type bb() :: #bb{}.
+-type code_elem() :: instr() | mode2_spills() | mode3_restores().
+
+bb_code(#bb{code=Code}) -> Code.
+bb_has_call(#bb{has_call=HasCall}) -> HasCall.
+bb_succ(#bb{succ=Succ}) -> Succ.
+
+bb_butlast(#bb{code=Code}) ->
+  bb_butlast_1(Code).
+
+bb_butlast_1([_Last]) -> [];
+bb_butlast_1([I|Is]) -> [I|bb_butlast_1(Is)].
+
+bb_last(#bb{code=Code}) -> lists:last(Code).
+
+-record(instr, {
+	  i   :: target_instr(),
+	  def :: ordsets:ordset(temp()),
+	  use :: ordsets:ordset(temp())
+	 }).
+-type instr() :: #instr{}.
+
+-record(mode2_spills, {
+	  temps :: ordsets:ordset(temp())
+	}).
+-type mode2_spills() :: #mode2_spills{}.
+
+-record(mode3_restores, {
+	  temps :: ordsets:ordset(temp())
+	}).
+-type mode3_restores() :: #mode3_restores{}.
+
+-spec convert(target_cfg(), target()) -> {cfg(), temps()}.
+convert(CFG, Target) ->
+  RPO = reverse_postorder(CFG, Target),
+  {BBsList, Temps} = convert_bbs(RPO, CFG, Target, #{}, []),
+  {#cfg{rpo_labels = RPO,
+	bbs = maps:from_list(BBsList)},
+   Temps}.
+
+convert_bbs([], _CFG, _Target, Temps, Acc) -> {Acc, Temps};
+convert_bbs([L|Ls], CFG, Target, Temps0, Acc) ->
+  Succs = hipe_gen_cfg:succ(CFG, L),
+  TBB = bb(CFG, L, Target),
+  TCode = hipe_bb:code(TBB),
+  {Code, Last, Temps} = convert_code(TCode, Target, Temps0, []),
+  HasCall = defines_all_alloc(Last#instr.i, Target),
+  BB = #bb{code = Code,
+	   has_call = HasCall,
+	   succ = Succs},
+  convert_bbs(Ls, CFG, Target, Temps, [{L,BB}|Acc]).
+
+convert_code([], _Target, Temps, [Last|_]=Acc) ->
+  {lists:reverse(Acc), Last, Temps};
+convert_code([TI|TIs], Target, Temps0, Acc) ->
+  {TDef, TUse} = def_use(TI, Target),
+  I = #instr{i = TI,
+	     def = ordsets:from_list(reg_names(TDef, Target)),
+	     use = ordsets:from_list(reg_names(TUse, Target))},
+  Temps = add_temps(TUse, Target, add_temps(TDef, Target, Temps0)),
+  convert_code(TIs, Target, Temps, [I|Acc]).
+
+-type temps() :: #{temp() => target_temp()}.
+add_temps([], _Target, Temps) -> Temps;
+add_temps([T|Ts], Target, Temps) ->
+  add_temps(Ts, Target, Temps#{reg_nr(T, Target) => T}).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Fourth pass: P({DEF}) lattice fwd dataflow (for eliding stores at SPILL
+%% splits)
+-type defsi() :: #{label() => defseti() | {call, defseti(), defseti()}}.
+-type defs()  :: #{label() => defsetf()}.
+
+-spec def_analyse(cfg(), target_cfg()) -> defs().
+def_analyse(CFG = #cfg{rpo_labels = RPO}, TCFG) ->
+  Defs0 = def_init(CFG),
+  def_dataf(RPO, TCFG, Defs0).
+
+-spec def_init(cfg()) -> defsi().
+def_init(#cfg{bbs = BBs}) ->
+  maps:from_list(
+    [begin
+       {L, case HasCall of
+	     false -> def_init_scan(bb_code(BB), defseti_new());
+	     true ->
+	       {call, def_init_scan(bb_butlast(BB), defseti_new()),
+		defseti_from_ordset((bb_last(BB))#instr.def)}
+	   end}
+     end || {L, BB = #bb{has_call=HasCall}} <- maps:to_list(BBs)]).
+
+def_init_scan([], Defset) -> Defset;
+def_init_scan([#instr{def=Def}|Is], Defset0) ->
+  Defset = defseti_add_ordset(Def, Defset0),
+  def_init_scan(Is, Defset).
+
+-spec def_dataf([label()], target_cfg(), defsi()) -> defs().
+def_dataf(Labels, TCFG, Defs0) ->
+  case def_dataf_once(Labels, TCFG, Defs0, 0) of
+    {Defs, 0} ->
+      def_finalise(Defs);
+    {Defs, _Changed} ->
+      def_dataf(Labels, TCFG, Defs)
+  end.
+
+-spec def_finalise(defsi()) -> defs().
+def_finalise(Defs) ->
+  maps:from_list([{K, defseti_finalise(BL)}
+		  || {K, {call, BL, _}} <- maps:to_list(Defs)]).
+
+-spec def_dataf_once([label()], target_cfg(), defsi(), non_neg_integer())
+		    -> {defsi(), non_neg_integer()}.
+def_dataf_once([], _TCFG, Defs, Changed) -> {Defs, Changed};
+def_dataf_once([L|Ls], TCFG, Defs0, Changed0) ->
+  AddPreds =
+    fun(Defset1) ->
+	lists:foldl(fun(P, Defset2) ->
+			defseti_union(defout(P, Defs0), Defset2)
+		    end, Defset1, hipe_gen_cfg:pred(TCFG, L))
+    end,
+  Defset =
+    case Defset0 = maps:get(L, Defs0) of
+      {call, Butlast, Defout} -> {call, AddPreds(Butlast), Defout};
+      _ -> AddPreds(Defset0)
+    end,
+  Changed = case Defset =:= Defset0 of
+	      true  -> Changed0;
+	      false -> Changed0+1
+	    end,
+  def_dataf_once(Ls, TCFG, Defs0#{L := Defset}, Changed).
+
+-spec defout(label(), defsi()) -> defseti().
+defout(L, Defs) ->
+  case maps:get(L, Defs) of
+    {call, _DefButLast, Defout} -> Defout;
+    Defout -> Defout
+  end.
+
+-spec defbutlast(label(), defs()) -> defsetf().
+defbutlast(L, Defs) -> maps:get(L, Defs).
+
+-spec defseti_new() -> defseti().
+-spec defseti_union(defseti(), defseti()) -> defseti().
+-spec defseti_add_ordset(ordsets:ordset(temp()), defseti()) -> defseti().
+-spec defseti_from_ordset(ordsets:ordset(temp())) -> defseti().
+-spec defseti_finalise(defseti()) -> defsetf().
+-spec defsetf_member(temp(), defsetf()) -> boolean().
+-spec defsetf_intersect_ordset(ordsets:ordset(temp()), defsetf())
+			      -> ordsets:ordset(temp()).
+
+-type defseti() :: bitord().
+defseti_new() -> bitord_new().
+defseti_union(A, B) -> bitord_union(A, B).
+defseti_add_ordset(OS, D) -> defseti_union(defseti_from_ordset(OS), D).
+defseti_from_ordset(OS) -> bitord_from_ordset(OS).
+defseti_finalise(D) -> bitarr_from_bitord(D).
+
+-type defsetf() :: bitarr().
+defsetf_member(E, D) -> bitarr_get(E, D).
+
+defsetf_intersect_ordset([], _D) -> [];
+defsetf_intersect_ordset([E|Es], D) ->
+  case bitarr_get(E, D) of
+    true  -> [E|defsetf_intersect_ordset(Es,D)];
+    false ->    defsetf_intersect_ordset(Es,D)
+  end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Fifth pass: P({DEF}) lattice reverse dataflow (for eliding stores at defines
+%% in mode2)
+-type rdefsi() :: #{label() =>
+		     {call, rdefseti(), [label()]}
+		   | {nocall, rdefseti(), rdefseti(), [label()]}}.
+-type rdefs() :: #{label() => {final, rdefsetf(), [label()]}}.
+
+-spec rdef_analyse(cfg()) -> rdefs().
+rdef_analyse(CFG = #cfg{rpo_labels=RPO}) ->
+  Defs0 = rdef_init(CFG),
+  PO = rdef_postorder(RPO, CFG, []),
+  rdef_dataf(PO, Defs0).
+
+%% Filter out 'call' labels, since they don't change
+-spec rdef_postorder([label()], cfg(), [label()]) -> [label()].
+rdef_postorder([], _CFG, Acc) -> Acc;
+rdef_postorder([L|Ls], CFG, Acc) ->
+  case bb_has_call(cfg_bb(L, CFG)) of
+    true  -> rdef_postorder(Ls, CFG, Acc);
+    false -> rdef_postorder(Ls, CFG, [L|Acc])
+  end.
+
+-spec rdef_init(cfg()) -> rdefsi().
+rdef_init(#cfg{bbs = BBs}) ->
+  maps:from_list(
+    [{L, case HasCall of
+	   true ->
+	     Defin = rdef_init_scan(bb_butlast(BB), rdefseti_empty()),
+	     {call, Defin, Succs};
+	   false ->
+	     Gen = rdef_init_scan(bb_code(BB), rdefseti_empty()),
+	     {nocall, Gen, rdefseti_top(), Succs}
+	 end}
+     || {L, BB = #bb{has_call=HasCall, succ=Succs}} <- maps:to_list(BBs)]).
+
+-spec rdef_init_scan([instr()], rdefseti()) -> rdefseti().
+rdef_init_scan([], Defset) -> Defset;
+rdef_init_scan([#instr{def=Def}|Is], Defset0) ->
+  Defset = rdefseti_add_ordset(Def, Defset0),
+  rdef_init_scan(Is, Defset).
+
+-spec rdef_dataf([label()], rdefsi()) -> rdefs().
+rdef_dataf(Labels, Defs0) ->
+  case rdef_dataf_once(Labels, Defs0, 0) of
+    {Defs, 0} ->
+      rdef_finalise(Defs);
+    {Defs, _Changed} ->
+      rdef_dataf(Labels, Defs)
+  end.
+
+-spec rdef_finalise(rdefsi()) -> rdefs().
+rdef_finalise(Defs) ->
+  maps:map(fun(L, V) ->
+	       Succs = rsuccs_val(V),
+	       Defout0 = rdefout_intersect(L, Defs, rdefseti_top()),
+	       {final, rdefset_finalise(Defout0), Succs}
+	   end, Defs).
+
+-spec rdef_dataf_once([label()], rdefsi(), non_neg_integer())
+		     -> {rdefsi(), non_neg_integer()}.
+rdef_dataf_once([], Defs, Changed) -> {Defs, Changed};
+rdef_dataf_once([L|Ls], Defs0, Changed0) ->
+  #{L := {nocall, Gen, Defin0, Succs}} = Defs0,
+  Defin = rdefseti_union(Gen, rdefout_intersect(L, Defs0, Defin0)),
+  Defset = {nocall, Gen, Defin, Succs},
+  Changed = case Defin =:= Defin0 of
+	      true  -> Changed0;
+	      false -> Changed0+1
+	    end,
+  rdef_dataf_once(Ls, Defs0#{L := Defset}, Changed).
+
+-spec rdefin(label(), rdefsi()) -> rdefseti().
+rdefin(L, Defs) -> rdefin_val(maps:get(L, Defs)).
+rdefin_val({nocall, _Gen, Defin, _Succs}) -> Defin;
+rdefin_val({call, Defin, _Succs}) -> Defin.
+
+-spec rsuccs(label(), rdefsi()) -> [label()].
+rsuccs(L, Defs) -> rsuccs_val(maps:get(L, Defs)).
+rsuccs_val({nocall, _Gen, _Defin, Succs}) -> Succs;
+rsuccs_val({call, _Defin, Succs}) -> Succs.
+
+-spec rdefout(label(), rdefs()) -> rdefsetf().
+rdefout(L, Defs) ->
+  #{L := {final, Defout, _Succs}} = Defs,
+  Defout.
+
+-spec rdefout_intersect(label(), rdefsi(), rdefseti()) -> rdefseti().
+rdefout_intersect(L, Defs, Init) ->
+  lists:foldl(fun(S, Acc) ->
+		  rdefseti_intersect(rdefin(S, Defs), Acc)
+	      end, Init, rsuccs(L, Defs)).
+
+-type rdefseti() :: bitord() | top.
+rdefseti_top() -> top.
+rdefseti_empty() -> bitord_new().
+-spec rdefseti_from_ordset(ordsets:ordset(temp())) -> rdefseti().
+rdefseti_from_ordset(OS) -> bitord_from_ordset(OS).
+
+-spec rdefseti_add_ordset(ordsets:ordset(temp()), rdefseti()) -> rdefseti().
+rdefseti_add_ordset(_, top) -> top; % Should never happen in rdef_dataf
+rdefseti_add_ordset(OS, D) -> rdefseti_union(rdefseti_from_ordset(OS), D).
+
+-spec rdefseti_union(rdefseti(), rdefseti()) -> rdefseti().
+rdefseti_union(top, _) -> top;
+rdefseti_union(_, top) -> top;
+rdefseti_union(A, B) -> bitord_union(A, B).
+
+-spec rdefseti_intersect(rdefseti(), rdefseti()) -> rdefseti().
+rdefseti_intersect(top, D) -> D;
+rdefseti_intersect(D, top) -> D;
+rdefseti_intersect(A, B) -> bitord_intersect(A, B).
+
+-type rdefsetf() :: {arr, bitarr()} | top.
+-spec rdefset_finalise(rdefseti()) -> rdefsetf().
+rdefset_finalise(top) -> top;
+rdefset_finalise(Ord) -> {arr, bitarr_from_bitord(Ord)}.
+
+%% rdefsetf_top() -> top.
+rdefsetf_empty() -> {arr, bitarr_new()}.
+
+-spec rdefsetf_add_ordset(ordsets:ordset(temp()), rdefsetf()) -> rdefsetf().
+rdefsetf_add_ordset(_, top) -> top; % adding to top leaves it top
+rdefsetf_add_ordset(OS, {arr, Arr}) ->
+  {arr, lists:foldl(fun bitarr_set/2, Arr, OS)}. % set the bit of each temp in OS
+
+-spec rdef_step(instr(), rdefsetf()) -> rdefsetf().
+rdef_step(#instr{def=Def}, Defset) ->
+  %% ?ASSERT(not defines_all_alloc(I, Target)),
+  rdefsetf_add_ordset(Def, Defset).
+
+-spec ordset_subtract_rdefsetf(ordsets:ordset(temp()), rdefsetf())
+			      -> ordsets:ordset(temp()).
+ordset_subtract_rdefsetf(_, top) -> [];
+ordset_subtract_rdefsetf(OS, {arr, Arr}) ->
+  %% Lazy implementation; could do better if OS can grow
+  lists:filter(fun(E) -> not bitarr_get(E, Arr) end, OS).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Integer sets represented as bit sets
+%%
+%% Two representations; bitord() and bitarr()
+-define(LIMB_IX_BITS,    11).
+-define(LIMB_BITS,       (1 bsl ?LIMB_IX_BITS)).
+-define(LIMB_IX(Index),  (Index bsr ?LIMB_IX_BITS)).
+-define(BIT_IX(Index),   (Index band (?LIMB_BITS - 1))).
+-define(BIT_MASK(Index), (1 bsl ?BIT_IX(Index))).
+
+%% bitord(): fast at union/2 and can be compared for equality with '=:='
+-type bitord() :: orddict:orddict(non_neg_integer(), 0..((1 bsl ?LIMB_BITS)-1)).
+
+-spec bitord_new() -> bitord().
+bitord_new() -> [].
+
+-spec bitord_union(bitord(), bitord()) -> bitord().
+bitord_union(Lhs, Rhs) ->
+  orddict:merge(fun(_, L, R) -> L bor R end, Lhs, Rhs).
+
+-spec bitord_intersect(bitord(), bitord()) -> bitord().
+bitord_intersect([], _) -> [];
+bitord_intersect(_, []) -> [];
+bitord_intersect([{K, L}|Ls], [{K, R}|Rs]) ->
+  [{K, L band R} | bitord_intersect(Ls, Rs)];
+bitord_intersect([{LK, _}|Ls], [{RK, _}|_]=Rs) when LK < RK ->
+  bitord_intersect(Ls, Rs);
+bitord_intersect([{LK, _}|_]=Ls, [{RK, _}|Rs]) when LK > RK ->
+  bitord_intersect(Ls, Rs).
+
+-spec bitord_from_ordset(ordsets:ordset(non_neg_integer())) -> bitord().
+bitord_from_ordset([]) -> [];
+bitord_from_ordset([B|Bs]) ->
+  bitord_from_ordset_1(Bs, ?LIMB_IX(B), ?BIT_MASK(B)).
+
+bitord_from_ordset_1([B|Bs], Key, Val) when Key =:= ?LIMB_IX(B) ->
+  bitord_from_ordset_1(Bs, Key, Val bor ?BIT_MASK(B));
+bitord_from_ordset_1([B|Bs], Key, Val) ->
+  [{Key,Val} | bitord_from_ordset_1(Bs, ?LIMB_IX(B), ?BIT_MASK(B))];
+bitord_from_ordset_1([], Key, Val) -> [{Key, Val}].
+
+%% bitarr(): fast (enough) at get/2
+-type bitarr() :: array:array(0..((1 bsl ?LIMB_BITS)-1)).
+
+-spec bitarr_new() -> bitarr().
+bitarr_new() -> array:new({default, 0}).
+
+-spec bitarr_get(non_neg_integer(), bitarr()) -> boolean().
+bitarr_get(Index, Array) ->
+  Limb = array:get(?LIMB_IX(Index), Array),
+  0 =/= (Limb band ?BIT_MASK(Index)).
+
+-spec bitarr_set(non_neg_integer(), bitarr()) -> bitarr().
+bitarr_set(Index, Array) ->
+  Limb0 = array:get(?LIMB_IX(Index), Array),
+  Limb = Limb0 bor ?BIT_MASK(Index),
+  array:set(?LIMB_IX(Index), Limb, Array).
+
+-spec bitarr_from_bitord(bitord()) -> bitarr().
+bitarr_from_bitord(Ord) ->
+  array:from_orddict(Ord, 0).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Sixth pass: Partition-local liveness analysis
+%%
+%% As temps are not spilled when exiting a partition in mode2, only
+%% partition-local uses need to be considered when deciding which temps need
+%% restoring at partition entry.
+
+-type plive() :: #{label() =>
+		     {call, liveset(), [label()]}
+		   | {nocall, {liveset(), liveset()}, liveset(), [label()]}}.
+
+-spec plive_analyse(cfg()) -> plive().
+plive_analyse(CFG) ->
+  Defs0 = plive_init(CFG),
+  PO = cfg_postorder(CFG),
+  plive_dataf(PO, Defs0).
+
+-spec plive_init(cfg()) -> plive().
+plive_init(#cfg{bbs = BBs}) ->
+  maps:from_list(
+    [begin
+       {L, case HasCall of
+	     true ->
+	       {Gen, _} = plive_init_scan(bb_code(BB)),
+	       {call, Gen, Succs};
+	     false ->
+	       GenKill = plive_init_scan(bb_code(BB)),
+	       {nocall, GenKill, liveset_empty(), Succs}
+	   end}
+     end || {L, BB = #bb{has_call=HasCall, succ=Succs}} <- maps:to_list(BBs)]).
+
+-spec plive_init_scan([instr()]) -> {liveset(), liveset()}.
+plive_init_scan([]) -> {liveset_empty(), liveset_empty()};
+plive_init_scan([#instr{def=InstrKill, use=InstrGen}|Is]) ->
+  {Gen0, Kill0} = plive_init_scan(Is),
+  Gen1 = liveset_subtract(Gen0, InstrKill),
+  Gen = liveset_union(Gen1, InstrGen),
+  Kill1 = liveset_union(Kill0, InstrKill),
+  Kill = liveset_subtract(Kill1, InstrGen),
+  {Gen, Kill}.
+
+-spec plive_dataf([label()], plive()) -> plive().
+plive_dataf(Labels, PLive0) ->
+  case plive_dataf_once(Labels, PLive0, 0) of
+    {PLive, 0} -> PLive;
+    {PLive, _Changed} ->
+      plive_dataf(Labels, PLive)
+  end.
+
+-spec plive_dataf_once([label()], plive(), non_neg_integer()) ->
+			  {plive(), non_neg_integer()}.
+plive_dataf_once([], PLive, Changed) -> {PLive, Changed};
+plive_dataf_once([L|Ls], PLive0, Changed0) ->
+  Liveset =
+    case Liveset0 = maps:get(L, PLive0) of
+      {call, Livein, Succs} ->
+	{call, Livein, Succs};
+      {nocall, {Gen, Kill} = GenKill, _OldLivein, Succs} ->
+	Liveout = pliveout(L, PLive0),
+	Livein = liveset_union(Gen, liveset_subtract(Liveout, Kill)),
+	{nocall, GenKill, Livein, Succs}
+    end,
+  Changed = case Liveset =:= Liveset0 of
+	      true  -> Changed0;
+	      false -> Changed0+1
+	    end,
+  plive_dataf_once(Ls, PLive0#{L := Liveset}, Changed).
+
+-spec pliveout(label(), plive()) -> liveset().
+pliveout(L, PLive) ->
+  liveset_union([plivein(S, PLive) || S <- psuccs(L, PLive)]).
+
+-spec psuccs(label(), plive()) -> [label()].
+psuccs(L, PLive) -> psuccs_val(maps:get(L, PLive)).
+psuccs_val({call, _Livein, Succs}) -> Succs;
+psuccs_val({nocall, _GenKill, _Livein, Succs}) -> Succs.
+
+-spec plivein(label(), plive()) -> liveset().
+plivein(L, PLive) -> plivein_val(maps:get(L, PLive)).
+plivein_val({call, Livein, _Succs}) -> Livein;
+plivein_val({nocall, _GenKill, Livein, _Succs}) ->  Livein.
+
+liveset_empty() -> ordsets:new().
+liveset_subtract(A, B) -> ordsets:subtract(A, B).
+liveset_union(A, B) -> ordsets:union(A, B).
+liveset_union(LivesetList) -> ordsets:union(LivesetList).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Third pass: Compute dataflow analyses required for placing mode3
+%% spills/restores.
+%% Reuse analysis implementation in hipe_restore_reuse.
+%% XXX: hipe_restore_reuse has its own "rdef"; we would like to reuse that one
+%% too.
+-type avail() :: hipe_restore_reuse:avail().
+
+-spec avail_analyse(target_cfg(), liveness(), target()) -> avail().
+avail_analyse(CFG, Liveness, Target) ->
+  hipe_restore_reuse:analyse(CFG, Liveness, Target).
+
+-spec mode3_split_in_block(label(), avail()) -> ordsets:ordset(temp()).
+mode3_split_in_block(L, Avail) ->
+  hipe_restore_reuse:split_in_block(L, Avail).
+
+-spec mode3_block_renameset(label(), avail()) -> ordsets:ordset(temp()).
+mode3_block_renameset(L, Avail) ->
+  hipe_restore_reuse:renamed_in_block(L, Avail).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Seventh pass
+%%
+%% Compute program space partitioning, collect information required by the
+%% heuristic.
+-type part_key() :: label().
+-type part_dsets() :: hipe_dsets:dsets(part_key()).
+-type part_dsets_map() :: #{part_key() => part_key()}.
+-type ducounts() :: #{part_key() => ducount()}.
+
+-spec scan(cfg(), liveness(), plive(), weights(), defs(), rdefs(), avail(),
+	   target()) -> {cfg(), ducounts(), costs(), part_dsets()}.
+scan(CFG0, Liveness, PLive, Weights, Defs, RDefs, Avail, Target) ->
+  #cfg{rpo_labels = Labels, bbs = BBs0} = CFG0,
+  CFG = CFG0#cfg{bbs=#{}}, % kill reference
+  DSets0 = hipe_dsets:new(Labels),
+  Costs0 = costs_new(),
+  {BBs, DUCounts0, Costs1, DSets1} =
+    scan_bbs(maps:to_list(BBs0), Liveness, PLive, Weights, Defs, RDefs, Avail,
+	     Target, #{}, Costs0, DSets0, []),
+  {RLList, DSets2} = hipe_dsets:to_rllist(DSets1),
+  {Costs, DSets} = costs_map_roots(DSets2, Costs1),
+  DUCounts = collect_ducounts(RLList, DUCounts0, #{}),
+  {CFG#cfg{bbs=maps:from_list(BBs)}, DUCounts, Costs, DSets}.
+
+-spec collect_ducounts([{label(), [label()]}], ducounts(), ducounts())
+		      -> ducounts().
+collect_ducounts([], _, Acc) -> Acc;
+collect_ducounts([{R,Ls}|RLs], DUCounts, Acc) ->
+  DUCount = lists:foldl(
+	      fun(Key, FAcc) ->
+		  ducount_merge(maps:get(Key, DUCounts, ducount_new()), FAcc)
+	      end, ducount_new(), Ls),
+  collect_ducounts(RLs, DUCounts, Acc#{R => DUCount}).
+
+-spec scan_bbs([{label(), bb()}], liveness(), plive(), weights(), defs(),
+	       rdefs(), avail(), target(), ducounts(), costs(), part_dsets(),
+	       [{label(), bb()}])
+	      -> {[{label(), bb()}], ducounts(), costs(), part_dsets()}.
+scan_bbs([], _Liveness, _PLive, _Weights, _Defs, _RDefs, _Avail, _Target,
+	 DUCounts, Costs, DSets, Acc) ->
+  {Acc, DUCounts, Costs, DSets};
+scan_bbs([{L,BB}|BBs], Liveness, PLive, Weights, Defs, RDefs, Avail, Target,
+	 DUCounts0, Costs0, DSets0, Acc) ->
+  Wt = weight(L, Weights),
+  {DSets, Costs5, EntryCode, ExitCode, RDefout, Liveout} =
+    case bb_has_call(BB) of
+      false ->
+	DSets1 = lists:foldl(fun(S, DS) -> hipe_dsets:union(L, S, DS) end,
+			     DSets0, bb_succ(BB)),
+	{DSets1, Costs0, bb_code(BB), [], rdefout(L, RDefs),
+	 liveout(Liveness, L, Target)};
+      true ->
+	LastI = #instr{def=LastDef} = bb_last(BB),
+	LiveBefore = ordsets:subtract(liveout(Liveness, L, Target), LastDef),
+	%% We can omit the spill of a temp that has not been defined since the
+	%% last time it was spilled
+	SpillSet = defsetf_intersect_ordset(LiveBefore, defbutlast(L, Defs)),
+	Costs1 = costs_insert(exit, L, Wt, SpillSet, Costs0),
+	Costs4 = lists:foldl(fun({S, BranchWt}, Costs2) ->
+				 SLivein = livein(Liveness, S, Target),
+				 SPLivein = plivein(S, PLive),
+				 SWt = weight_scaled(L, BranchWt, Weights),
+				 Costs3 = costs_insert(entry1, S, SWt, SLivein, Costs2),
+				 costs_insert(entry2, S, SWt, SPLivein, Costs3)
+			     end, Costs1, branch_preds(LastI#instr.i, Target)),
+	{DSets0, Costs4, bb_butlast(BB), [LastI], rdefsetf_empty(), LiveBefore}
+    end,
+  Mode3Splits = mode3_split_in_block(L, Avail),
+  {RevEntryCode, Restored} = scan_bb_fwd(EntryCode, Mode3Splits, [], []),
+  {Code, DUCount, Mode2Spills} =
+    scan_bb(RevEntryCode, Wt, RDefout, Liveout, ducount_new(), [], ExitCode),
+  DUCounts = DUCounts0#{L => DUCount},
+  M2SpillSet = ordsets:from_list(Mode2Spills),
+  Costs6 = costs_insert(spill, L, Wt, M2SpillSet, Costs5),
+  Mode3Renames = mode3_block_renameset(L, Avail),
+  Costs7 = costs_insert(restore, L, Wt, ordsets:intersection(M2SpillSet, Mode3Renames), Costs6),
+  Costs8 = costs_insert(restore, L, Wt, ordsets:from_list(Restored), Costs7),
+  Costs = add_unsplit_mode3_costs(DUCount, Mode3Renames, L, Costs8),
+  scan_bbs(BBs, Liveness, PLive, Weights, Defs, RDefs, Avail, Target, DUCounts,
+	   Costs, DSets, [{L,BB#bb{code=Code}}|Acc]).
+
+-spec add_unsplit_mode3_costs(ducount(), ordsets:ordset(temp()), label(), costs())
+			     -> costs().
+add_unsplit_mode3_costs(DUCount, Mode3Renames, L, Costs) ->
+  Unsplit = orddict_without_ordset(Mode3Renames,
+				   orddict:from_list(ducount_to_list(DUCount))),
+  add_unsplit_mode3_costs_1(Unsplit, L, Costs).
+
+-spec add_unsplit_mode3_costs_1([{temp(),float()}], label(), costs())
+			       -> costs().
+add_unsplit_mode3_costs_1([], _L, Costs) -> Costs;
+add_unsplit_mode3_costs_1([{T,C}|Cs], L, Costs) ->
+  add_unsplit_mode3_costs_1(Cs, L, costs_insert(restore, L, C, [T], Costs)).
+
+%% @doc Returns a new orddict without keys in Set and their associated values.
+-spec orddict_without_ordset(ordsets:ordset(K), orddict:orddict(K, V))
+			    -> orddict:orddict(K, V).
+orddict_without_ordset([S|Ss], [{K,_}|_]=Dict) when S < K ->
+  orddict_without_ordset(Ss, Dict);
+orddict_without_ordset([S|_]=Set, [D={K,_}|Ds]) when S > K ->
+  [D|orddict_without_ordset(Set, Ds)];
+orddict_without_ordset([_S|Ss], [{_K,_}|Ds]) -> % _S == _K
+  orddict_without_ordset(Ss, Ds);
+orddict_without_ordset(_, []) -> [];
+orddict_without_ordset([], Dict) -> Dict.
+
+%% Scans the code forward, collecting and inserting mode3 restores
+-spec scan_bb_fwd([instr()], ordsets:ordset(temp()), ordsets:ordset(temp()),
+		  [code_elem()])
+		 -> {[code_elem()], ordsets:ordset(temp())}.
+scan_bb_fwd([], [], Restored, Acc) -> {Acc, Restored};
+scan_bb_fwd([I|Is], SplitHere0, Restored0, Acc0) ->
+  #instr{def=Def, use=Use} = I,
+  {ToRestore, SplitHere1} =
+    lists:partition(fun(R) -> lists:member(R, Use) end, SplitHere0),
+  SplitHere = lists:filter(fun(R) -> not lists:member(R, Def) end, SplitHere1),
+  Acc =
+    case ToRestore of
+      [] -> [I | Acc0];
+      _  -> [I, #mode3_restores{temps=ToRestore} | Acc0]
+    end,
+  scan_bb_fwd(Is, SplitHere, ToRestore ++ Restored0, Acc).
+
+%% Scans the code backwards, collecting def/use counts and mode2 spills
+-spec scan_bb([code_elem()], float(), rdefsetf(), liveset(), ducount(),
+	      [temp()], [code_elem()])
+	     -> {[code_elem()], ducount(), [temp()]}.
+scan_bb([], _Wt, _RDefout, _Liveout, DUCount, Spills, Acc) ->
+  {Acc, DUCount, Spills};
+scan_bb([I=#mode3_restores{}|Is], Wt, RDefout, Liveout, DUCount, Spills, Acc) ->
+  scan_bb(Is, Wt, RDefout, Liveout, DUCount, Spills, [I|Acc]);
+scan_bb([I|Is], Wt, RDefout, Liveout, DUCount0, Spills0, Acc0) ->
+  #instr{def=Def,use=Use} = I,
+  DUCount = ducount_add(Use, Wt, ducount_add(Def, Wt, DUCount0)),
+  Livein = liveness_step(I, Liveout),
+  RDefin = rdef_step(I, RDefout),
+  %% The temps that would be spilled after I in mode 2
+  NewSpills = ordset_subtract_rdefsetf(
+		ordsets:intersection(Def, Liveout),
+		RDefout),
+  ?ASSERT(NewSpills =:= (NewSpills -- Spills0)),
+  Spills = NewSpills ++ Spills0,
+  Acc1 = case NewSpills of
+	   [] -> Acc0;
+	   _ -> [#mode2_spills{temps=NewSpills}|Acc0]
+	 end,
+  scan_bb(Is, Wt, RDefin, Livein, DUCount, Spills, [I|Acc1]).
+
+-spec liveness_step(instr(), liveset()) -> liveset().
+liveness_step(#instr{def=Def, use=Use}, Liveout) ->
+  ordsets:union(Use, ordsets:subtract(Liveout, Def)).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% First pass: compute basic-block weighting
+
+-type weights() :: no_bb_weights
+		 | {hipe_bb_weights:bb_weights(), float()}.
+
+-spec weight(label(), weights()) -> float().
+weight(L, Weights) -> weight_scaled(L, 1.0, Weights).
+
+-spec compute_weights(target_cfg(), target_module(), target_context(),
+		      comp_options()) -> weights().
+compute_weights(CFG, TargetMod, TargetContext, Options) ->
+  case proplists:get_bool(range_split_weights, Options) of
+    false -> no_bb_weights;
+    true ->
+      {hipe_bb_weights:compute(CFG, TargetMod, TargetContext),
+       ?WEIGHT_CONST_FUN(proplists:get_value(range_split_weight_power,
+					     Options, ?DEFAULT_WEIGHT_POWER))}
+  end.
+
+-spec weight_scaled(label(), float(), weights()) -> float().
+weight_scaled(_L, _Scale, no_bb_weights) -> 1.0;
+weight_scaled(L, Scale, {Weights, Const}) ->
+  Wt0 = hipe_bb_weights:weight(L, Weights) * Scale,
+  Wt = erlang:min(erlang:max(Wt0, 0.0000000000000000001), 10000.0),
+  ?WEIGHT_FUN(Wt, Const).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Heuristic splitting decision.
+%%
+%% Decide which temps to split, in which parts, and pick new names for them.
+-type spill_mode() :: mode1 % Spill temps at partition exits
+		    | mode2 % Spill temps at definitions
+		    | mode3.% Spill temps at definitions, restore temps at uses
+-type ren() :: #{temp() => {spill_mode(), temp()}}.
+-type renames() :: #{label() => ren()}.
+
+-record(heur_par, {
+	  mode1_fudge :: float(),
+	  min_gain    :: float()
+	  }).
+-type heur_par() :: #heur_par{}.
+
+-spec decide(ducounts(), costs(), target(), comp_options()) -> renames().
+decide(DUCounts, Costs, Target, Options) ->
+  Par = #heur_par{
+	   mode1_fudge = proplists:get_value(range_split_mode1_fudge, Options,
+					     ?DEFAULT_MODE1_FUDGE),
+	   min_gain    = proplists:get_value(range_split_min_gain, Options,
+					     ?DEFAULT_MIN_GAIN)},
+  decide_parts(maps:to_list(DUCounts), Costs, Target, Par, #{}).
+
+-spec decide_parts([{part_key(), ducount()}], costs(), target(),
+		   heur_par(), renames())
+		  -> renames().
+decide_parts([], _Costs, _Target, _Par, Acc) -> Acc;
+decide_parts([{Part,DUCount}|Ps], Costs, Target, Par, Acc) ->
+  Spills = decide_temps(ducount_to_list(DUCount), Part, Costs, Target, Par,
+			#{}),
+  decide_parts(Ps, Costs, Target, Par, Acc#{Part => Spills}).
+
+-spec decide_temps([{temp(), float()}], part_key(), costs(), target(),
+		   heur_par(), ren())
+		  -> ren().
+decide_temps([], _Part, _Costs, _Target, _Par, Acc) -> Acc;
+decide_temps([{Temp, SpillGain}|Ts], Part, Costs, Target, Par, Acc0) ->
+  SpillCost1 = costs_query(Temp, entry1, Part, Costs) % mode1: entry1 + exit
+    + costs_query(Temp, exit, Part, Costs),
+  SpillCost2 = costs_query(Temp, entry2, Part, Costs) % mode2: entry2 + spill
+    + costs_query(Temp, spill, Part, Costs),
+  SpillCost3 = costs_query(Temp, restore, Part, Costs), % mode3: restore only
+  Acc =
+    %% SpillCost1 =:= 0.0 usually means the temp is local to the partition;
+    %% hence no need to split it
+    case (SpillCost1 =/= 0.0) %% maps:is_key(Temp, S)
+      andalso (not is_precoloured(Temp, Target))
+      andalso ((Par#heur_par.min_gain*SpillCost1 < SpillGain) % any mode pays off
+	       orelse (Par#heur_par.min_gain*SpillCost2 < SpillGain)
+	       orelse (Par#heur_par.min_gain*SpillCost3 < SpillGain))
+    of
+      false -> Acc0; % leave the temp alone in this partition
+      true ->
+	Mode =
+	  if Par#heur_par.mode1_fudge*SpillCost1 < SpillCost2, % fudged mode1 wins
+	     Par#heur_par.mode1_fudge*SpillCost1 < SpillCost3 ->
+	      mode1;
+	     SpillCost2 < SpillCost3 ->
+	      mode2;
+	     true -> % ties between mode2 and mode3 go to mode3
+	      mode3
+	  end,
+	Acc0#{Temp => {Mode, new_reg_nr(Target)}} % fresh name for the split temp
+  end,
+  decide_temps(Ts, Part, Costs, Target, Par, Acc).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Eighth pass: Rewrite program performing range splitting.
+
+-spec rewrite(cfg(), target_cfg(), target(), liveness(), plive(), defs(),
+	      avail(), part_dsets_map(), renames(), temps())
+	     -> target_cfg().
+rewrite(#cfg{bbs=BBs}, TCFG, Target, Liveness, PLive, Defs, Avail, DSets,
+	Renames, Temps) ->
+  rewrite_bbs(maps:to_list(BBs), Target, Liveness, PLive, Defs, Avail, DSets,
+	      Renames, Temps, TCFG). % TCFG is threaded through as accumulator
+
+-spec rewrite_bbs([{label(), bb()}], target(), liveness(), plive(), defs(),
+		  avail(), part_dsets_map(), renames(), temps(), target_cfg())
+		 -> target_cfg().
+rewrite_bbs([], _Target, _Liveness, _PLive, _Defs, _Avail, _DSets, _Renames,
+	    _Temps, TCFG) ->
+  TCFG;
+rewrite_bbs([{L,BB}|BBs], Target, Liveness, PLive, Defs, Avail, DSets, Renames,
+	    Temps, TCFG0) ->
+  Code0Rev = lists:reverse(bb_code(BB)), % last instruction first
+  EntryRen = maps:get(maps:get(L,DSets), Renames), % renames of L's partition
+  M3Ren = mode3_block_renameset(L, Avail),
+  SubstFun = rewrite_subst_fun(Target, EntryRen, M3Ren),
+  Fun = fun(I) -> subst_temps(SubstFun, I, Target) end,
+  {Code, TCFG} =
+    case bb_has_call(BB) of
+      false -> % plain block: only rename and add mode2/mode3 moves
+	Code1 = rewrite_instrs(Code0Rev, Fun, EntryRen, M3Ren, Temps, Target,
+			       []),
+	{Code1, TCFG0};
+      true ->
+	CallI0 = hd(Code0Rev), % the last instruction; it is not renamed below
+	Succ = bb_succ(BB),
+	{CallTI, TCFG1} = inject_restores(Succ, Target, Liveness, PLive, DSets,
+					 Renames, Temps, CallI0#instr.i, TCFG0),
+	Liveout1 = liveness_step(CallI0, liveout(Liveness, L, Target)),
+	Defout = defbutlast(L, Defs),
+	SpillMap = mk_spillmap(EntryRen, Liveout1, Defout, Temps, Target),
+	Code1 = rewrite_instrs(tl(Code0Rev), Fun, EntryRen, M3Ren, Temps,
+			       Target, []),
+	Code2 = lift_spills(lists:reverse(Code1), Target, SpillMap, [CallTI]), % walks backwards
+	{Code2, TCFG1}
+    end,
+  TBB = hipe_bb:code_update(bb(TCFG, L, Target), Code),
+  rewrite_bbs(BBs, Target, Liveness, PLive, Defs, Avail, DSets, Renames, Temps,
+	      update_bb(TCFG, L, TBB, Target)).
+
+-spec rewrite_instrs([code_elem()], rewrite_fun(), ren(),
+		     ordsets:ordset(temp()), temps(), target(),
+		     [target_instr()])
+		    -> [target_instr()].
+rewrite_instrs([], _Fun, _Ren, _M3Ren, _Temps, _Target, Acc) -> Acc;
+rewrite_instrs([I|Is], Fun, Ren, M3Ren, Temps, Target, Acc0) ->
+  Acc =
+    case I of
+      #instr{i=TI} -> [Fun(TI)|Acc0]; % real instruction: rewrite it with Fun
+      #mode2_spills{temps=Mode2Spills} -> % marker: expands to zero or more moves
+	add_mode2_spills(Mode2Spills, Target, Ren, M3Ren, Temps, Acc0);
+      #mode3_restores{temps=Mode3Restores} -> % marker: expands to zero or more moves
+	add_mode3_restores(Mode3Restores, Target, Ren, Temps, Acc0)
+    end,
+  rewrite_instrs(Is, Fun, Ren, M3Ren, Temps, Target, Acc). % consing reverses the input order
+
+-spec add_mode2_spills(ordsets:ordset(temp()), target(), ren(),
+		       ordsets:ordset(temp()), temps(), [target_instr()])
+		      -> [target_instr()].
+add_mode2_spills([], _Target, _Ren, _M3Ren, _Temps, Acc) -> Acc;
+add_mode2_spills([R|Rs], Target, Ren, M3Ren, Temps, Acc0) ->
+  Acc =
+    case Ren of
+      #{R := {Mode, NewName}} when Mode =:= mode2; Mode =:= mode3 ->
+	case Mode =/= mode3 orelse lists:member(R, M3Ren) of % mode3 only if R is in M3Ren
+	  false -> Acc0;
+	  true ->
+	    #{R := T} = Temps, % T is the target temp registered for R
+	    SpillInstr = mk_move(update_reg_nr(NewName, T, Target), T, Target),
+	    [SpillInstr|Acc0]
+	end;
+    #{} -> % not renamed here, or renamed in mode1: no move
+	Acc0
+  end,
+  add_mode2_spills(Rs, Target, Ren, M3Ren, Temps, Acc).
+
+-spec add_mode3_restores(ordsets:ordset(temp()), target(), ren(), temps(),
+			 [target_instr()])
+			-> [target_instr()].
+add_mode3_restores([], _Target, _Ren, _Temps, Acc) -> Acc;
+add_mode3_restores([Reg|Regs], Target, Ren, Temps, Acc0) ->
+  Acc =
+    case Ren of % only temps renamed in mode3 get a restore move
+      #{Reg := {mode3, NewName}} ->
+	#{Reg := Temp} = Temps,
+	[mk_move(Temp, update_reg_nr(NewName, Temp, Target), Target)|Acc0];
+      #{} -> Acc0
+    end,
+  add_mode3_restores(Regs, Target, Ren, Temps, Acc).
+
+-type rewrite_fun() :: fun((target_instr()) -> target_instr()).
+-type subst_fun() :: fun((target_temp()) -> target_temp()).
+-spec rewrite_subst_fun(target(), ren(), ordsets:ordset(temp())) -> subst_fun().
+rewrite_subst_fun(Target, Ren, M3Ren) -> % builds the per-block temp substitution
+  fun(Temp) ->
+      Reg = reg_nr(Temp, Target),
+      case Ren of
+	#{Reg := {Mode, NewName}} ->
+	  case Mode =/= mode3 orelse lists:member(Reg, M3Ren) of % mode3 only if Reg is in M3Ren
+	    false -> Temp;
+	    true -> update_reg_nr(NewName, Temp, Target)
+	  end;
+	#{} -> Temp % no rename entry: keep the temp as it is
+      end
+  end.
+
+-type spillmap() :: [{temp(), target_instr()}].
+-spec mk_spillmap(ren(), liveset(), defsetf(), temps(), target())
+		 -> spillmap().
+mk_spillmap(Ren, Livein, Defout, Temps, Target) ->
+  [begin
+     Temp = maps:get(Reg, Temps),
+     {NewName, mk_move(update_reg_nr(NewName, Temp, Target), Temp, Target)} % keyed by the new name
+   end || {Reg, {mode1, NewName}} <- maps:to_list(Ren), % mode1 renames only
+	  lists:member(Reg, Livein), defsetf_member(Reg, Defout)].
+
+-spec mk_restores(ren(), liveset(), liveset(), temps(), target())
+		 -> [target_instr()].
+mk_restores(Ren, Livein, PLivein, Temps, Target) ->
+  [begin
+     Temp = maps:get(Reg, Temps),
+     mk_move(Temp, update_reg_nr(NewName, Temp, Target), Target)
+   end || {Reg, {Mode, NewName}} <- maps:to_list(Ren),
+	  (       (Mode =:= mode1 andalso lists:member(Reg, Livein )) % mode1: test Livein
+	   orelse (Mode =:= mode2 andalso lists:member(Reg, PLivein)))]. % mode2: test PLivein; mode3: none
+
+-spec inject_restores([label()], target(), liveness(), plive(),
+		      part_dsets_map(), renames(), temps(), target_instr(),
+		      target_cfg())
+		     -> {target_instr(), target_cfg()}.
+inject_restores([], _Target, _Liveness, _PLive, _DSets, _Renames, _Temps, CFTI,
+		TCFG) ->
+  {CFTI, TCFG};
+inject_restores([L|Ls], Target, Liveness, PLive, DSets, Renames, Temps, CFTI0,
+		TCFG0) ->
+  Ren = maps:get(maps:get(L,DSets), Renames), % renames of successor L's partition
+  Livein = livein(Liveness, L, Target),
+  PLivein = plivein(L, PLive),
+  {CFTI, TCFG} =
+    case mk_restores(Ren, Livein, PLivein, Temps, Target) of
+      [] -> {CFTI0, TCFG0}; % optimisation
+      Restores ->
+	RestBBLbl = new_label(Target),
+	Code = Restores ++ [mk_goto(L, Target)], % new block: restores, then jump on to L
+	CFTI1 = redirect_jmp(CFTI0, L, RestBBLbl, Target), % retarget CFTI from L to the new block
+	TCFG1 = update_bb(TCFG0, RestBBLbl, hipe_bb:mk_bb(Code), Target),
+	{CFTI1, TCFG1}
+    end,
+  inject_restores(Ls, Target, Liveness, PLive, DSets, Renames, Temps, CFTI,
+		  TCFG).
+
+%% Heuristic. Move spills up until we meet the edge of the BB or a definition of
+%% that temp. The code list is walked from its head and consed onto Acc.
+-spec lift_spills([target_instr()], target(), spillmap(), [target_instr()])
+		 -> [target_instr()].
+lift_spills([], _Target, SpillMap, Acc) ->
+  [SpillI || {_, SpillI} <- SpillMap] ++ Acc; % ran out of code: emit the remaining spills first
+lift_spills([I|Is], Target, SpillMap0, Acc) ->
+  Def = reg_defines(I, Target),
+  {Spills0, SpillMap} = % Spills0: entries whose key is defined by I
+    lists:partition(fun({Reg,_}) -> lists:member(Reg, Def) end, SpillMap0),
+  Spills = [SpillI || {_, SpillI} <- Spills0],
+  lift_spills(Is, Target, SpillMap, [I|Spills ++ Acc]). % those spills go right after I
+
+reg_defines(I, Target) -> % register numbers of the temps that I defines
+  reg_names(defines(I,Target), Target).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Costs ADT
+%%
+%% Keeps track of cumulative cost of spilling temps in particular partitions
+%% using particular spill modes.
+-type cost_map() :: #{[part_key()|temp()] => float()}.
+-type cost_key() :: entry1 | entry2 | exit | spill | restore.
+-record(costs, {entry1  = #{} :: cost_map()
+	       ,entry2  = #{} :: cost_map()
+	       ,exit    = #{} :: cost_map()
+	       ,spill   = #{} :: cost_map()
+	       ,restore = #{} :: cost_map()
+	       }).
+-type costs() :: #costs{}.
+
+-spec costs_new() -> costs().
+costs_new() -> #costs{}. % every cost map defaults to #{}
+
+-spec costs_insert(cost_key(), part_key(), float(), liveset(), costs())
+		  -> costs().
+costs_insert(entry1, A, Weight, Liveset, Costs=#costs{entry1=Entry1}) -> % one clause per cost_key()
+  Costs#costs{entry1=costs_insert_1(A, Weight, Liveset, Entry1)};
+costs_insert(entry2, A, Weight, Liveset, Costs=#costs{entry2=Entry2}) ->
+  Costs#costs{entry2=costs_insert_1(A, Weight, Liveset, Entry2)};
+costs_insert(exit, A, Weight, Liveset, Costs=#costs{exit=Exit}) ->
+  Costs#costs{exit=costs_insert_1(A, Weight, Liveset, Exit)};
+costs_insert(spill, A, Weight, Liveset, Costs=#costs{spill=Spill}) ->
+  Costs#costs{spill=costs_insert_1(A, Weight, Liveset, Spill)};
+costs_insert(restore, A, Weight, Liveset, Costs=#costs{restore=Restore}) ->
+  Costs#costs{restore=costs_insert_1(A, Weight, Liveset, Restore)}.
+
+costs_insert_1(A, Weight, Liveset, CostMap0) when is_float(Weight) ->
+  lists:foldl(fun(Live, CostMap1) -> % add Weight once for every element of Liveset
+		  map_update_counter([A|Live], Weight, CostMap1) % key is the improper list [A|Live]
+	      end, CostMap0, Liveset).
+
+-spec costs_map_roots(part_dsets(), costs()) -> {costs(), part_dsets()}.
+costs_map_roots(DSets0, Costs) -> % rekey all five cost maps, threading DSets through
+  {Entry1,  DSets1} = costs_map_roots_1(DSets0, Costs#costs.entry1),
+  {Entry2,  DSets2} = costs_map_roots_1(DSets1, Costs#costs.entry2),
+  {Exit,    DSets3} = costs_map_roots_1(DSets2, Costs#costs.exit),
+  {Spill,   DSets4} = costs_map_roots_1(DSets3, Costs#costs.spill),
+  {Restore, DSets}  = costs_map_roots_1(DSets4, Costs#costs.restore),
+  {#costs{entry1=Entry1,entry2=Entry2,exit=Exit,spill=Spill,restore=Restore},
+   DSets}.
+
+costs_map_roots_1(DSets0, CostMap) ->
+  {NewEs, DSets} = lists:mapfoldl(fun({[A|T], Wt}, DSets1) ->
+				      {AR, DSets2} = hipe_dsets:find(A, DSets1), % AR: what find/2 returns for A
+				      {{[AR|T], Wt}, DSets2}
+				  end, DSets0, maps:to_list(CostMap)),
+  {maps_from_list_merge(NewEs, fun erlang:'+'/2, #{}), DSets}. % keys that now collide are summed
+
+%% Inserts the pairs into Acc0 from left to right; when a key is already
+%% present, the stored value becomes MF(NewValue, OldValue).
+maps_from_list_merge(Pairs, MF, Acc0) ->
+  lists:foldl(fun({K, V}, Acc) ->
+		  maps:update_with(K, fun(OV) -> MF(V, OV) end, V, Acc)
+	      end, Acc0, Pairs).
+
+-spec costs_query(temp(), cost_key(), part_key(), costs()) -> float().
+costs_query(Temp, entry1, Part, #costs{entry1=Entry1}) -> % pick the map named by the cost_key()
+  costs_query_1(Temp, Part, Entry1);
+costs_query(Temp, entry2, Part, #costs{entry2=Entry2}) ->
+  costs_query_1(Temp, Part, Entry2);
+costs_query(Temp, exit, Part, #costs{exit=Exit}) ->
+  costs_query_1(Temp, Part, Exit);
+costs_query(Temp, spill, Part, #costs{spill=Spill}) ->
+  costs_query_1(Temp, Part, Spill);
+costs_query(Temp, restore, Part, #costs{restore=Restore}) ->
+  costs_query_1(Temp, Part, Restore).
+
+costs_query_1(Temp, Part, CostMap) ->
+  %% Cost maps are keyed by the improper list [Part|Temp], the same shape
+  %% costs_insert_1/4 builds ([A|Live]). A key that is absent has accumulated
+  %% no weight, so its cost is 0.0.
+  Default = 0.0,
+  maps:get([Part|Temp], CostMap, Default).
+
+-spec map_update_counter(Key, number(), #{Key => number(), OK => OV})
+			-> #{Key := number(), OK => OV}.
+map_update_counter(Key, Incr, Map) ->
+  case Map of
+    #{Key := Orig} -> Map#{Key := Orig + Incr}; % existing counter: add Incr
+    #{}            -> Map#{Key => Incr} % new counter: starts at Incr
+  end.
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Def and use counting ADT
+-type ducount() :: #{temp() => float()}.
+
+-spec ducount_new() -> ducount().
+ducount_new() -> #{}.
+
+-spec ducount_add([temp()], float(), ducount()) -> ducount().
+%% Adds Weight to the count of every temp in Temps. A temp without a count
+%% starts at Weight; a temp listed twice is counted twice. The per-key
+%% update is the same "add or insert" step as map_update_counter/3, so it
+%% is delegated there.
+ducount_add(Temps, Weight, DUCount0) ->
+  lists:foldl(fun(T, DUCount) ->
+		  map_update_counter(T, Weight, DUCount)
+	      end, DUCount0, Temps).
+
+ducount_to_list(DUCount) -> maps:to_list(DUCount). % [{Temp, Count}] pairs
+
+-spec ducount_merge(ducount(), ducount()) -> ducount().
+ducount_merge(DCA, DCB) when map_size(DCA) < map_size(DCB) -> % always iterate over the smaller map
+  ducount_merge_1(ducount_to_list(DCA), DCB);
+ducount_merge(DCA, DCB) when map_size(DCA) >= map_size(DCB) ->
+  ducount_merge_1(ducount_to_list(DCB), DCA).
+
+%% Fold the {Temp, Count} pairs into DUCount0, summing counts of shared temps.
+ducount_merge_1(Pairs, DUCount0) ->
+  lists:foldl(fun({T, AC}, DUCount) ->
+		  case DUCount of
+		    #{T := BC} -> DUCount#{T := AC + BC};
+		    #{}        -> DUCount#{T => AC}
+		  end
+	      end, DUCount0, Pairs).
+
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+%% Target module interface functions
+%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
+-define(TGT_IFACE_0(N), N(         {M,C}) -> M:N(         C)). % N({M,C}) forwards to M:N(C)
+-define(TGT_IFACE_1(N), N(A1,      {M,C}) -> M:N(A1,      C)). % the {Module,Context} pair is always last
+-define(TGT_IFACE_2(N), N(A1,A2,   {M,C}) -> M:N(A1,A2,   C)).
+-define(TGT_IFACE_3(N), N(A1,A2,A3,{M,C}) -> M:N(A1,A2,A3,C)).
+
+?TGT_IFACE_2(bb). % each line below defines one such forwarding function
+?TGT_IFACE_1(def_use).
+?TGT_IFACE_1(defines).
+?TGT_IFACE_1(defines_all_alloc).
+?TGT_IFACE_1(is_precoloured).
+?TGT_IFACE_1(mk_goto).
+?TGT_IFACE_2(mk_move).
+?TGT_IFACE_0(new_label).
+?TGT_IFACE_0(new_reg_nr).
+?TGT_IFACE_1(number_of_temporaries).
+?TGT_IFACE_3(redirect_jmp).
+?TGT_IFACE_1(reg_nr).
+?TGT_IFACE_1(reverse_postorder).
+?TGT_IFACE_2(subst_temps).
+?TGT_IFACE_3(update_bb).
+?TGT_IFACE_2(update_reg_nr).
+
+branch_preds(Instr, {TgtMod,TgtCtx}) -> % sorted by label, duplicate labels merged
+  merge_sorted_preds(lists:keysort(1, TgtMod:branch_preds(Instr, TgtCtx))).
+
+livein(Liveness, L, Target={TgtMod,TgtCtx}) -> % live-in of L as an ordset of register numbers
+  ordsets:from_list(reg_names(TgtMod:livein(Liveness, L, TgtCtx), Target)).
+
+liveout(Liveness, L, Target={TgtMod,TgtCtx}) -> % live-out of L as an ordset of register numbers
+  ordsets:from_list(reg_names(TgtMod:liveout(Liveness, L, TgtCtx), Target)).
+
+merge_sorted_preds([]) -> [];
+merge_sorted_preds([{L, P1}, {L, P2}|LPs]) -> % adjacent entries with the same label
+  merge_sorted_preds([{L, P1+P2}|LPs]); % are summed; the sum is re-examined next
+merge_sorted_preds([LP|LPs]) -> [LP|merge_sorted_preds(LPs)].
+
+reg_names(Regs, {TgtMod,TgtCtx}) -> % map each temp to its register number
+  lists:map(fun(X) -> TgtMod:reg_nr(X, TgtCtx) end, Regs).
diff --git a/lib/hipe/regalloc/hipe_regalloc_loop.erl b/lib/hipe/regalloc/hipe_regalloc_loop.erl
index 5d32073437..29ef3adcc2 100644
--- a/lib/hipe/regalloc/hipe_regalloc_loop.erl
+++ b/lib/hipe/regalloc/hipe_regalloc_loop.erl
@@ -99,11 +99,19 @@ call_allocator(CFG, Liveness, SpillLimit, SpillIndex, Options, RegAllocMod,
 			   TargetCtx, Options)
   end.
 
-do_range_split(CFG0, Liveness, TgtMod, TgtCtx, Options) ->
-  case proplists:get_bool(ra_restore_reuse, Options) of
+do_range_split(CFG0, Liveness0, TgtMod, TgtCtx, Options) ->
+  {CFG2, Liveness1} = % step 1: optional restore-reuse pass
+    case proplists:get_bool(ra_restore_reuse, Options) of
+      true ->
+	CFG1 = hipe_restore_reuse:split(CFG0, Liveness0, TgtMod, TgtCtx),
+	{CFG1, TgtMod:analyze(CFG1, TgtCtx)}; % liveness is redone for the changed CFG
+      false ->
+	{CFG0, Liveness0}
+    end,
+  case proplists:get_bool(ra_range_split, Options) of % step 2: optional range splitting
     true ->
-      CFG1 = hipe_restore_reuse:split(CFG0, Liveness, TgtMod, TgtCtx),
-      {CFG1, TgtMod:analyze(CFG1, TgtCtx)};
+      CFG3 = hipe_range_split:split(CFG2, Liveness1, TgtMod, TgtCtx, Options),
+      {CFG3, TgtMod:analyze(CFG3, TgtCtx)};
     false ->
-      {CFG0, Liveness}
+      {CFG2, Liveness1}
   end.
diff --git a/lib/hipe/regalloc/hipe_restore_reuse.erl b/lib/hipe/regalloc/hipe_restore_reuse.erl
index 2d8dea2a2c..2158bd185e 100644
--- a/lib/hipe/regalloc/hipe_restore_reuse.erl
+++ b/lib/hipe/regalloc/hipe_restore_reuse.erl
@@ -36,6 +36,14 @@
 
 -export([split/4]).
 
+%% Exports for hipe_range_split, which uses restore_reuse as one possible spill
+%% "mode"
+-export([analyse/3
+	,renamed_in_block/2
+	,split_in_block/2
+	]).
+-export_type([avail/0]).
+
 -compile(inline).
 
 %% -define(DO_ASSERT, 1).
@@ -59,7 +67,7 @@ split(CFG, Liveness, TargetMod, TargetContext) ->
   rewrite(CFG, Target, Avail).
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
--type avail() :: #{label() => avail_bb()}.
+-opaque avail() :: #{label() => avail_bb()}.
 
 -record(avail_bb, {
 	  %% Blocks where HasCall is true are considered to have too high
@@ -332,6 +340,11 @@ rewrite([L|Ls], Target, Avail, Input0, CFG0) ->
 			       Input1, CFG1),
   rewrite(Ls, Target, Avail, Input, CFG).
 
+-spec renamed_in_block(label(), avail()) -> ordsets:ordset(reg()).
+renamed_in_block(L, Avail) -> % union of avail_self, want_in and want_out for block L
+  ordsets:union([avail_self(L, Avail), want_in(L, Avail),
+		 want_out(L, Avail)]).
+
 -spec split_in_block(label(), avail()) -> ordsets:ordset(reg()).
 split_in_block(L, Avail) ->
   ordsets:subtract(ordsets:union(avail_self(L, Avail), want_out(L, Avail)),
-- 
cgit v1.2.3


From c52b2cf226cb3f1bb1b16bee28d47785506adff3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Thu, 16 Mar 2017 15:30:00 +0100
Subject: hipe: Add pseudo_spill_f?move instructions

These pseudo instructions are added to all backends and allow spill slot
to spill slot move coalescing in a clean way.

They have regular move semantics, but contain an additional scratch
register to be used if both source and destination are spilled, and can
not be move coalesced.

Additionally, a register allocator callback
Target:is_spill_move(Instr, Context) is added which allows the spill
slot allocators to check for these instructions and try to coalesce the
spill slots the two temporaries are allocated to.
---
 .../amd64/hipe_amd64_ra_sse2_postconditions.erl    | 17 +++++++-
 lib/hipe/arm/hipe_arm.erl                          |  7 ++++
 lib/hipe/arm/hipe_arm.hrl                          |  1 +
 lib/hipe/arm/hipe_arm_defuse.erl                   |  2 +
 lib/hipe/arm/hipe_arm_frame.erl                    | 22 +++++++++++
 lib/hipe/arm/hipe_arm_ra_finalise.erl              | 25 +++++++++---
 lib/hipe/arm/hipe_arm_ra_postconditions.erl        | 24 +++++++----
 lib/hipe/arm/hipe_arm_subst.erl                    |  2 +
 lib/hipe/ppc/hipe_ppc.erl                          | 14 +++++++
 lib/hipe/ppc/hipe_ppc.hrl                          |  2 +
 lib/hipe/ppc/hipe_ppc_defuse.erl                   |  4 ++
 lib/hipe/ppc/hipe_ppc_frame.erl                    | 36 +++++++++++++++++
 lib/hipe/ppc/hipe_ppc_ra_finalise.erl              | 15 +++++++
 lib/hipe/ppc/hipe_ppc_ra_postconditions.erl        | 24 +++++++----
 lib/hipe/ppc/hipe_ppc_ra_postconditions_fp.erl     | 18 ++++++---
 lib/hipe/ppc/hipe_ppc_subst.erl                    |  6 ++-
 lib/hipe/regalloc/hipe_amd64_specific_sse2.erl     |  4 ++
 lib/hipe/regalloc/hipe_arm_specific.erl            |  4 ++
 lib/hipe/regalloc/hipe_ppc_specific.erl            |  4 ++
 lib/hipe/regalloc/hipe_ppc_specific_fp.erl         |  4 ++
 lib/hipe/regalloc/hipe_sparc_specific.erl          |  4 ++
 lib/hipe/regalloc/hipe_sparc_specific_fp.erl       |  4 ++
 lib/hipe/regalloc/hipe_x86_specific.erl            |  4 ++
 lib/hipe/regalloc/hipe_x86_specific_x87.erl        |  4 ++
 lib/hipe/sparc/hipe_sparc.erl                      | 14 +++++++
 lib/hipe/sparc/hipe_sparc.hrl                      |  2 +
 lib/hipe/sparc/hipe_sparc_defuse.erl               |  4 ++
 lib/hipe/sparc/hipe_sparc_frame.erl                | 36 +++++++++++++++++
 lib/hipe/sparc/hipe_sparc_ra_finalise.erl          | 15 +++++++
 lib/hipe/sparc/hipe_sparc_ra_postconditions.erl    | 24 +++++++----
 lib/hipe/sparc/hipe_sparc_ra_postconditions_fp.erl | 18 ++++++---
 lib/hipe/sparc/hipe_sparc_subst.erl                |  6 ++-
 lib/hipe/x86/hipe_x86.erl                          | 14 +++++++
 lib/hipe/x86/hipe_x86.hrl                          |  2 +
 lib/hipe/x86/hipe_x86_defuse.erl                   |  4 ++
 lib/hipe/x86/hipe_x86_frame.erl                    | 46 ++++++++++++++++++----
 lib/hipe/x86/hipe_x86_ra_finalise.erl              | 10 +++++
 lib/hipe/x86/hipe_x86_ra_postconditions.erl        | 26 ++++++++++--
 lib/hipe/x86/hipe_x86_subst.erl                    |  4 ++
 39 files changed, 422 insertions(+), 54 deletions(-)

diff --git a/lib/hipe/amd64/hipe_amd64_ra_sse2_postconditions.erl b/lib/hipe/amd64/hipe_amd64_ra_sse2_postconditions.erl
index 8a3ea92156..891c874a15 100644
--- a/lib/hipe/amd64/hipe_amd64_ra_sse2_postconditions.erl
+++ b/lib/hipe/amd64/hipe_amd64_ra_sse2_postconditions.erl
@@ -53,6 +53,8 @@ do_insn(I, TempMap, Strategy) ->	% Insn -> {Insn list, DidSpill}
       do_fp_unop(I, TempMap, Strategy);
     #fp_binop{} ->
       do_fp_binop(I, TempMap, Strategy);
+    #pseudo_spill_fmove{} ->
+      do_pseudo_spill_fmove(I, TempMap, Strategy);
     _ ->
       %% All non sse2 ops
       {[I], false}
@@ -95,8 +97,13 @@ do_fmove(I, TempMap, Strategy) ->
   of
     true ->
       Tmp = spill_temp(double, Strategy),
-      {[#fmove{src=Src, dst=Tmp},I#fmove{src=Tmp,dst=Dst}],
-       true};
+      %% pseudo_spill_fmove allows spill slot move coalescing, but must not
+      %% contain memory operands (except for spilled temps)
+      Is = case is_float_temp(Src) andalso is_float_temp(Dst) of
+	     true -> [#pseudo_spill_fmove{src=Src, temp=Tmp, dst=Dst}];
+	     false -> [#fmove{src=Src, dst=Tmp},I#fmove{src=Tmp,dst=Dst}]
+	   end,
+      {Is, true};
     false ->
       {[I], false}
   end.
@@ -104,6 +111,12 @@ do_fmove(I, TempMap, Strategy) ->
 is_float_temp(#x86_temp{type=Type}) -> Type =:= double;
 is_float_temp(#x86_mem{}) -> false.
 
+%%% Fix a pseudo_spill_fmove op: nothing is rewritten, the scratch temp is only checked.
+do_pseudo_spill_fmove(I = #pseudo_spill_fmove{temp=Temp}, TempMap, _Strategy) ->
+  %% Temp is above the low water mark and must not have been spilled
+  false = is_mem_opnd(Temp, TempMap), % assertion: badmatch if it is a memory operand
+  {[I], false}. % nothing to do
+
 %%% Check if an operand denotes a memory cell (mem or pseudo).
 
 is_mem_opnd(Opnd, TempMap) ->
diff --git a/lib/hipe/arm/hipe_arm.erl b/lib/hipe/arm/hipe_arm.erl
index e34a00f561..3b090b501a 100644
--- a/lib/hipe/arm/hipe_arm.erl
+++ b/lib/hipe/arm/hipe_arm.erl
@@ -79,6 +79,9 @@
 	 pseudo_move_dst/1,
 	 pseudo_move_src/1,
 
+	 mk_pseudo_spill_move/3,
+	 is_pseudo_spill_move/1,
+
 	 mk_pseudo_switch/3,
 
 	 mk_pseudo_tailcall/4,
@@ -250,6 +253,10 @@ is_pseudo_move(I) -> case I of #pseudo_move{} -> true; _ -> false end.
 pseudo_move_dst(#pseudo_move{dst=Dst}) -> Dst.
 pseudo_move_src(#pseudo_move{src=Src}) -> Src.
 
+mk_pseudo_spill_move(Dst, Temp, Src) -> % Temp: scratch for slot-to-slot moves
+  #pseudo_spill_move{dst=Dst, temp=Temp, src=Src}.
+is_pseudo_spill_move(I) -> case I of #pseudo_spill_move{} -> true; _ -> false end.
+
 mk_pseudo_switch(JTab, Index, Labels) ->
   #pseudo_switch{jtab=JTab, index=Index, labels=Labels}.
 
diff --git a/lib/hipe/arm/hipe_arm.hrl b/lib/hipe/arm/hipe_arm.hrl
index 67bc07634e..be06b1ebd7 100644
--- a/lib/hipe/arm/hipe_arm.hrl
+++ b/lib/hipe/arm/hipe_arm.hrl
@@ -101,6 +101,7 @@
 -record(pseudo_call_prepare, {nrstkargs}).
 -record(pseudo_li, {dst, imm, label}).	% pre-generated label for use by the assembler
 -record(pseudo_move, {dst, src}).
+-record(pseudo_spill_move, {dst, temp, src}).
 -record(pseudo_switch, {jtab, index, labels}).
 -record(pseudo_tailcall, {funv, arity, stkargs, linkage}).
 -record(pseudo_tailcall_prepare, {}).
diff --git a/lib/hipe/arm/hipe_arm_defuse.erl b/lib/hipe/arm/hipe_arm_defuse.erl
index 0e62070c6c..652299a514 100644
--- a/lib/hipe/arm/hipe_arm_defuse.erl
+++ b/lib/hipe/arm/hipe_arm_defuse.erl
@@ -40,6 +40,7 @@ insn_def_gpr(I) ->
     #pseudo_call{} -> call_clobbered_gpr();
     #pseudo_li{dst=Dst} -> [Dst];
     #pseudo_move{dst=Dst} -> [Dst];
+    #pseudo_spill_move{dst=Dst, temp=Temp} -> [Dst, Temp];
     #pseudo_tailcall_prepare{} -> tailcall_clobbered_gpr();
     #smull{dstlo=DstLo,dsthi=DstHi,src1=Src1} ->
       %% ARM requires DstLo, DstHi, and Src1 to be distinct.
@@ -83,6 +84,7 @@ insn_use_gpr(I) ->
     #pseudo_call{funv=FunV,sdesc=#arm_sdesc{arity=Arity}} ->
       funv_use(FunV, arity_use_gpr(Arity));
     #pseudo_move{src=Src} -> [Src];
+    #pseudo_spill_move{src=Src} -> [Src];
     #pseudo_switch{jtab=JTabR,index=IndexR} -> addtemp(JTabR, [IndexR]);
     #pseudo_tailcall{funv=FunV,arity=Arity,stkargs=StkArgs} ->
       addargs(StkArgs, addtemps(tailcall_clobbered_gpr(), funv_use(FunV, arity_use_gpr(Arity))));
diff --git a/lib/hipe/arm/hipe_arm_frame.erl b/lib/hipe/arm/hipe_arm_frame.erl
index e323907e31..a1004fb609 100644
--- a/lib/hipe/arm/hipe_arm_frame.erl
+++ b/lib/hipe/arm/hipe_arm_frame.erl
@@ -69,6 +69,8 @@ do_insn(I, LiveOut, Context, FPoff) ->
       do_pseudo_call_prepare(I, FPoff);
     #pseudo_move{} ->
       {do_pseudo_move(I, Context, FPoff), FPoff};
+    #pseudo_spill_move{} ->
+      {do_pseudo_spill_move(I, Context, FPoff), FPoff};
     #pseudo_tailcall{} ->
       {do_pseudo_tailcall(I, Context), context_framesize(Context)};
     _ ->
@@ -99,6 +101,26 @@ do_pseudo_move(I, Context, FPoff) ->
 pseudo_offset(Temp, FPoff, Context) ->
   FPoff + context_offset(Context, Temp).
 
+%%%
+%%% Moves from one spill slot to another
+%%%
+
+do_pseudo_spill_move(I, Context, FPoff) ->
+  #pseudo_spill_move{dst=Dst, temp=Temp, src=Src} = I,
+  case temp_is_pseudo(Src) andalso temp_is_pseudo(Dst) of
+    false -> % Register allocator changed its mind, turn back to move
+      do_pseudo_move(hipe_arm:mk_pseudo_move(Dst, Src), Context, FPoff);
+    true -> % both operands are pseudos: compare their frame offsets
+      SrcOffset = pseudo_offset(Src, FPoff, Context),
+      DstOffset = pseudo_offset(Dst, FPoff, Context),
+      case SrcOffset =:= DstOffset of
+	true -> []; % omit move-to-self
+	false -> % 'ldr' Temp at SrcOffset, then 'str' Temp at DstOffset
+	  mk_load('ldr', Temp, SrcOffset, mk_sp(),
+		  mk_store('str', Temp, DstOffset, mk_sp(), []))
+      end
+  end.
+
 %%%
 %%% Return - deallocate frame and emit 'ret $N' insn.
 %%%
diff --git a/lib/hipe/arm/hipe_arm_ra_finalise.erl b/lib/hipe/arm/hipe_arm_ra_finalise.erl
index 9bfe0a9a83..80cd470708 100644
--- a/lib/hipe/arm/hipe_arm_ra_finalise.erl
+++ b/lib/hipe/arm/hipe_arm_ra_finalise.erl
@@ -25,11 +25,17 @@ ra_bb(BB, Map) ->
   hipe_bb:code_update(BB, ra_code(hipe_bb:code(BB), Map, [])).
 
 ra_code([I|Insns], Map, Accum) ->
-  ra_code(Insns, Map, [ra_insn(I, Map) | Accum]);
+  ra_code(Insns, Map, ra_insn(I, Map, Accum));
 ra_code([], _Map, Accum) ->
   lists:reverse(Accum).
 
-ra_insn(I, Map) ->
+ra_insn(I, Map, Accum) -> % returns the extended accumulator, not a single insn
+  case I of
+    #pseudo_move{} -> ra_pseudo_move(I, Map, Accum); % may drop the move entirely
+    _ -> [ra_insn_1(I, Map) | Accum]
+  end.
+
+ra_insn_1(I, Map) ->
   case I of
     #alu{} -> ra_alu(I, Map);
     #cmp{} -> ra_cmp(I, Map);
@@ -38,7 +44,7 @@ ra_insn(I, Map) ->
     #move{} -> ra_move(I, Map);
     #pseudo_call{} -> ra_pseudo_call(I, Map);
     #pseudo_li{} -> ra_pseudo_li(I, Map);
-    #pseudo_move{} -> ra_pseudo_move(I, Map);
+    #pseudo_spill_move{} -> ra_pseudo_spill_move(I, Map);
     #pseudo_switch{} -> ra_pseudo_switch(I, Map);
     #pseudo_tailcall{} -> ra_pseudo_tailcall(I, Map);
     #smull{} -> ra_smull(I, Map);
@@ -80,10 +86,19 @@ ra_pseudo_li(I=#pseudo_li{dst=Dst}, Map) ->
   NewDst = ra_temp(Dst, Map),
   I#pseudo_li{dst=NewDst}.
 
-ra_pseudo_move(I=#pseudo_move{dst=Dst,src=Src}, Map) ->
+ra_pseudo_move(I=#pseudo_move{dst=Dst,src=Src}, Map, Accum) ->
+  NewDst = ra_temp(Dst, Map),
+  NewSrc = ra_temp(Src, Map),
+  case NewSrc#arm_temp.reg =:= NewDst#arm_temp.reg of
+    true -> Accum; % both map to the same reg: drop the move
+    false -> [I#pseudo_move{dst=NewDst,src=NewSrc} | Accum]
+  end.
+
+ra_pseudo_spill_move(I=#pseudo_spill_move{dst=Dst,temp=Temp,src=Src}, Map) ->
   NewDst = ra_temp(Dst, Map),
+  NewTemp = ra_temp(Temp, Map), % the scratch temp is mapped like dst and src
   NewSrc = ra_temp(Src, Map),
-  I#pseudo_move{dst=NewDst,src=NewSrc}.
+  I#pseudo_spill_move{dst=NewDst, temp=NewTemp, src=NewSrc}.
 
 ra_pseudo_switch(I=#pseudo_switch{jtab=JTab,index=Index}, Map) ->
   NewJTab = ra_temp(JTab, Map),
diff --git a/lib/hipe/arm/hipe_arm_ra_postconditions.erl b/lib/hipe/arm/hipe_arm_ra_postconditions.erl
index 8d1ee1cb94..23c305511f 100644
--- a/lib/hipe/arm/hipe_arm_ra_postconditions.erl
+++ b/lib/hipe/arm/hipe_arm_ra_postconditions.erl
@@ -56,6 +56,7 @@ do_insn(I, TempMap, Strategy) ->
     #pseudo_call{} -> do_pseudo_call(I, TempMap, Strategy);
     #pseudo_li{} -> do_pseudo_li(I, TempMap, Strategy);
     #pseudo_move{} -> do_pseudo_move(I, TempMap, Strategy);
+    #pseudo_spill_move{} -> do_pseudo_spill_move(I, TempMap, Strategy);
     #pseudo_switch{} -> do_pseudo_switch(I, TempMap, Strategy);
     #pseudo_tailcall{} -> do_pseudo_tailcall(I, TempMap, Strategy);
     #smull{} -> do_smull(I, TempMap, Strategy);
@@ -108,18 +109,25 @@ do_pseudo_li(I=#pseudo_li{dst=Dst}, TempMap, Strategy) ->
 
 do_pseudo_move(I=#pseudo_move{dst=Dst,src=Src}, TempMap, Strategy) ->
   %% Either Dst or Src (but not both) may be a pseudo temp.
-  %% pseudo_move and pseudo_tailcall are special cases: in
-  %% all other instructions, all temps must be non-pseudos
-  %% after register allocation.
-  case temp_is_spilled(Dst, TempMap) of
-    true -> % Src must not be a pseudo
-      {FixSrc,NewSrc,DidSpill} = fix_src1(Src, TempMap, Strategy),
-      NewI = I#pseudo_move{src=NewSrc},
-      {FixSrc ++ [NewI], DidSpill};
+  %% pseudo_move, pseudo_spill_move, and pseudo_tailcall
+  %% are special cases: in all other instructions, all
+  %% temps must be non-pseudos after register allocation.
+  case temp_is_spilled(Src, TempMap) % only a move with BOTH operands spilled
+    andalso temp_is_spilled(Dst, TempMap) % needs a scratch register
+  of
+    true -> % Turn into pseudo_spill_move
+      Temp = clone(Src, temp1(Strategy)), % new scratch temp cloned from Src
+      NewI = #pseudo_spill_move{dst=Dst, temp=Temp, src=Src},
+      {[NewI], true}; % true: a new temp was introduced
     _ ->
       {[I], false}
   end.
 
+do_pseudo_spill_move(I = #pseudo_spill_move{temp=Temp}, TempMap, _Strategy) ->
+  %% Temp is above the low water mark and must not have been spilled
+  false = temp_is_spilled(Temp, TempMap), % assertion: badmatch if it was spilled
+  {[I], false}. % nothing to do
+
 do_pseudo_switch(I=#pseudo_switch{jtab=JTab,index=Index}, TempMap, Strategy) ->
   {FixJTab,NewJTab,DidSpill1} = fix_src1(JTab, TempMap, Strategy),
   {FixIndex,NewIndex,DidSpill2} = fix_src2(Index, TempMap, Strategy),
diff --git a/lib/hipe/arm/hipe_arm_subst.erl b/lib/hipe/arm/hipe_arm_subst.erl
index a41a907a4c..4ff245f414 100644
--- a/lib/hipe/arm/hipe_arm_subst.erl
+++ b/lib/hipe/arm/hipe_arm_subst.erl
@@ -59,6 +59,8 @@ insn_temps(T, I) ->
       #pseudo_call{funv=F} -> I#pseudo_call{funv=funv_temps(T, F)};
       #pseudo_call_prepare{} -> I;
       #pseudo_li{dst=D} -> I#pseudo_li{dst=T(D)};
+      #pseudo_spill_move{dst=D,temp=U,src=S} ->
+	  I#pseudo_spill_move{dst=T(D),temp=T(U),src=T(S)};
       #pseudo_switch{jtab=J=#arm_temp{},index=Ix=#arm_temp{}} ->
 	  I#pseudo_switch{jtab=T(J),index=T(Ix)};
       #pseudo_tailcall{funv=F,stkargs=Stk} ->
diff --git a/lib/hipe/ppc/hipe_ppc.erl b/lib/hipe/ppc/hipe_ppc.erl
index df9f193fa3..63ecd0a0b8 100644
--- a/lib/hipe/ppc/hipe_ppc.erl
+++ b/lib/hipe/ppc/hipe_ppc.erl
@@ -98,6 +98,9 @@
 	 pseudo_move_dst/1,
 	 pseudo_move_src/1,
 
+	 mk_pseudo_spill_move/3,
+	 is_pseudo_spill_move/1,
+
 	 mk_pseudo_tailcall/4,
 	 pseudo_tailcall_func/1,
 	 pseudo_tailcall_stkargs/1,
@@ -131,6 +134,9 @@
 	 pseudo_fmove_dst/1,
 	 pseudo_fmove_src/1,
 
+	 mk_pseudo_spill_fmove/3,
+	 is_pseudo_spill_fmove/1,
+
 	 mk_defun/8,
 	 defun_mfa/1,
 	 defun_formals/1,
@@ -412,6 +418,10 @@ is_pseudo_move(I) -> case I of #pseudo_move{} -> true; _ -> false end.
 pseudo_move_dst(#pseudo_move{dst=Dst}) -> Dst.
 pseudo_move_src(#pseudo_move{src=Src}) -> Src.
 
+mk_pseudo_spill_move(Dst, Temp, Src) -> % Temp: scratch for slot-to-slot moves
+  #pseudo_spill_move{dst=Dst, temp=Temp, src=Src}.
+is_pseudo_spill_move(I) -> case I of #pseudo_spill_move{} -> true; _ -> false end.
+
 mk_pseudo_tailcall(FunC, Arity, StkArgs, Linkage) ->
   #pseudo_tailcall{func=FunC, arity=Arity, stkargs=StkArgs, linkage=Linkage}.
 pseudo_tailcall_func(#pseudo_tailcall{func=FunC}) -> FunC.
@@ -495,6 +505,10 @@ is_pseudo_fmove(I) -> case I of #pseudo_fmove{} -> true; _ -> false end.
 pseudo_fmove_dst(#pseudo_fmove{dst=Dst}) -> Dst.
 pseudo_fmove_src(#pseudo_fmove{src=Src}) -> Src.
 
+mk_pseudo_spill_fmove(Dst, Temp, Src) -> % Temp: scratch for the copy
+  #pseudo_spill_fmove{src=Src, temp=Temp, dst=Dst}.
+is_pseudo_spill_fmove(I) -> case I of #pseudo_spill_fmove{} -> true; _ -> false end.
+
 mk_defun(MFA, Formals, IsClosure, IsLeaf, Code, Data, VarRange, LabelRange) ->
   #defun{mfa=MFA, formals=Formals, code=Code, data=Data,
 	 isclosure=IsClosure, isleaf=IsLeaf,
diff --git a/lib/hipe/ppc/hipe_ppc.hrl b/lib/hipe/ppc/hipe_ppc.hrl
index a96692c52e..3eef8be487 100644
--- a/lib/hipe/ppc/hipe_ppc.hrl
+++ b/lib/hipe/ppc/hipe_ppc.hrl
@@ -87,6 +87,7 @@
 -record(pseudo_call_prepare, {nrstkargs}).
 -record(pseudo_li, {dst, imm}).
 -record(pseudo_move, {dst, src}).
+-record(pseudo_spill_move, {dst, temp, src}).
 -record(pseudo_tailcall, {func, arity, stkargs, linkage}).
 -record(pseudo_tailcall_prepare, {}).
 -record(store, {stop, src, disp, base}).	% non-indexed, non-update form
@@ -99,6 +100,7 @@
 -record(fp_binary, {fp_binop, dst, src1, src2}).
 -record(fp_unary, {fp_unop, dst, src}).
 -record(pseudo_fmove, {dst, src}).
+-record(pseudo_spill_fmove, {dst, temp, src}).
 
 %%% Function definitions.
 
diff --git a/lib/hipe/ppc/hipe_ppc_defuse.erl b/lib/hipe/ppc/hipe_ppc_defuse.erl
index 9a99611493..d8a864f7d5 100644
--- a/lib/hipe/ppc/hipe_ppc_defuse.erl
+++ b/lib/hipe/ppc/hipe_ppc_defuse.erl
@@ -41,6 +41,7 @@ insn_def_gpr(I) ->
     #pseudo_call{} -> call_clobbered_gpr();
     #pseudo_li{dst=Dst} -> [Dst];
     #pseudo_move{dst=Dst} -> [Dst];
+    #pseudo_spill_move{dst=Dst,temp=Temp} -> [Dst, Temp];
     #pseudo_tailcall_prepare{} -> tailcall_clobbered_gpr();
     #unary{dst=Dst} -> [Dst];
     _ -> []
@@ -71,6 +72,7 @@ insn_use_gpr(I) ->
     #mtspr{src=Src} -> [Src];
     #pseudo_call{sdesc=#ppc_sdesc{arity=Arity}} -> arity_use_gpr(Arity);
     #pseudo_move{src=Src} -> [Src];
+    #pseudo_spill_move{src=Src} -> [Src];
     #pseudo_tailcall{arity=Arity,stkargs=StkArgs} ->
       addsrcs(StkArgs, addtemps(tailcall_clobbered_gpr(), arity_use_gpr(Arity)));
     #store{src=Src,base=Base} -> addtemp(Src, [Base]);
@@ -110,6 +112,7 @@ insn_def_fpr(I) ->
     #fp_binary{dst=Dst} -> [Dst];
     #fp_unary{dst=Dst} -> [Dst];
     #pseudo_fmove{dst=Dst} -> [Dst];
+    #pseudo_spill_fmove{dst=Dst,temp=Temp} -> [Dst, Temp];
     _ -> []
   end.
 
@@ -126,6 +129,7 @@ insn_use_fpr(I) ->
     #fp_binary{src1=Src1,src2=Src2} -> addtemp(Src1, [Src2]);
     #fp_unary{src=Src} -> [Src];
     #pseudo_fmove{src=Src} -> [Src];
+    #pseudo_spill_fmove{src=Src} -> [Src];
     _ -> []
   end.
 
diff --git a/lib/hipe/ppc/hipe_ppc_frame.erl b/lib/hipe/ppc/hipe_ppc_frame.erl
index a91cb18cc2..b88b75a5bd 100644
--- a/lib/hipe/ppc/hipe_ppc_frame.erl
+++ b/lib/hipe/ppc/hipe_ppc_frame.erl
@@ -66,10 +66,14 @@ do_insn(I, LiveOut, Context, FPoff) ->
       do_pseudo_call_prepare(I, FPoff);
     #pseudo_move{} ->
       {do_pseudo_move(I, Context, FPoff), FPoff};
+    #pseudo_spill_move{} ->
+      {do_pseudo_spill_move(I, Context, FPoff), FPoff};
     #pseudo_tailcall{} ->
       {do_pseudo_tailcall(I, Context), context_framesize(Context)};
     #pseudo_fmove{} ->
       {do_pseudo_fmove(I, Context, FPoff), FPoff};
+    #pseudo_spill_fmove{} ->
+      {do_pseudo_spill_fmove(I, Context, FPoff), FPoff};
     _ ->
       {[I], FPoff}
   end.
@@ -98,6 +102,22 @@ do_pseudo_move(I, Context, FPoff) ->
       end
   end.
 
+do_pseudo_spill_move(I, Context, FPoff) -> % Src slot -> Temp -> Dst slot
+  #pseudo_spill_move{src=From,temp=Tmp,dst=To} = I,
+  BothPseudo = temp_is_pseudo(From) andalso temp_is_pseudo(To),
+  case BothPseudo of
+    false -> % allocator no longer spills both operands: plain move again
+      do_pseudo_move(hipe_ppc:mk_pseudo_move(To, From), Context, FPoff);
+    true ->
+      case {pseudo_offset(From, FPoff, Context),
+	    pseudo_offset(To, FPoff, Context)} of
+	{Same, Same} -> []; % both operands share one offset: nothing to copy
+	{FromOff, ToOff} ->
+	  Store = mk_store(hipe_ppc:stop_word(), Tmp, ToOff, mk_sp(), []),
+	  mk_load(hipe_ppc:ldop_word(), Tmp, FromOff, mk_sp(), Store)
+      end
+  end.
+
 do_pseudo_fmove(I, Context, FPoff) ->
   Dst = hipe_ppc:pseudo_fmove_dst(I),
   Src = hipe_ppc:pseudo_fmove_src(I),
@@ -115,6 +135,22 @@ do_pseudo_fmove(I, Context, FPoff) ->
       end
   end.
 
+do_pseudo_spill_fmove(I, Context, FPoff) -> % Src slot -> Temp -> Dst slot
+  #pseudo_spill_fmove{src=From,temp=Tmp,dst=To} = I,
+  BothPseudo = temp_is_pseudo(From) andalso temp_is_pseudo(To),
+  case BothPseudo of
+    false -> % allocator no longer spills both operands: plain fmove again
+      do_pseudo_fmove(hipe_ppc:mk_pseudo_fmove(To, From), Context, FPoff);
+    true ->
+      case {pseudo_offset(From, FPoff, Context),
+	    pseudo_offset(To, FPoff, Context)} of
+	{Same, Same} -> []; % both operands share one offset: nothing to copy
+	{FromOff, ToOff} ->
+	  Load = hipe_ppc:mk_fload(Tmp, FromOff, mk_sp(), 0),
+	  Load ++ hipe_ppc:mk_fstore(Tmp, ToOff, mk_sp(), 0)
+      end
+  end.
+
 pseudo_offset(Temp, FPoff, Context) ->
   FPoff + context_offset(Context, Temp).
 
diff --git a/lib/hipe/ppc/hipe_ppc_ra_finalise.erl b/lib/hipe/ppc/hipe_ppc_ra_finalise.erl
index 74ef7475eb..bca504d754 100644
--- a/lib/hipe/ppc/hipe_ppc_ra_finalise.erl
+++ b/lib/hipe/ppc/hipe_ppc_ra_finalise.erl
@@ -41,6 +41,7 @@ ra_insn(I, Map, FPMap) ->
     #mtspr{} -> ra_mtspr(I, Map);
     #pseudo_li{} -> ra_pseudo_li(I, Map);
     #pseudo_move{} -> ra_pseudo_move(I, Map);
+    #pseudo_spill_move{} -> ra_pseudo_spill_move(I, Map);
     #pseudo_tailcall{} -> ra_pseudo_tailcall(I, Map);
     #store{} -> ra_store(I, Map);
     #storex{} -> ra_storex(I, Map);
@@ -52,6 +53,7 @@ ra_insn(I, Map, FPMap) ->
     #fp_binary{} -> ra_fp_binary(I, FPMap);
     #fp_unary{} -> ra_fp_unary(I, FPMap);
     #pseudo_fmove{} -> ra_pseudo_fmove(I, FPMap);
+    #pseudo_spill_fmove{} -> ra_pseudo_spill_fmove(I, FPMap);
     _ -> I
   end.
 
@@ -98,6 +100,12 @@ ra_pseudo_move(I=#pseudo_move{dst=Dst,src=Src}, Map) ->
   NewSrc = ra_temp(Src, Map),
   I#pseudo_move{dst=NewDst,src=NewSrc}.
 
+%% Pass every operand of a pseudo_spill_move through ra_temp/2 with Map;
+%% dst, temp and src are looked up in that order.
+ra_pseudo_spill_move(I=#pseudo_spill_move{dst=D,temp=T,src=S}, Map) ->
+  [D1, T1, S1] = [ra_temp(X, Map) || X <- [D, T, S]],
+  I#pseudo_spill_move{dst=D1,temp=T1,src=S1}.
+
 ra_pseudo_tailcall(I=#pseudo_tailcall{stkargs=StkArgs}, Map) ->
   NewStkArgs = ra_args(StkArgs, Map),
   I#pseudo_tailcall{stkargs=NewStkArgs}.
@@ -156,6 +164,13 @@ ra_pseudo_fmove(I=#pseudo_fmove{dst=Dst,src=Src}, FPMap) ->
   NewSrc = ra_temp_fp(Src, FPMap),
   I#pseudo_fmove{dst=NewDst,src=NewSrc}.
 
+%% Pass every operand of a pseudo_spill_fmove through ra_temp_fp/2 with
+%% FPMap; dst, temp and src are looked up in that order.
+ra_pseudo_spill_fmove(I=#pseudo_spill_fmove{dst=D,temp=T,src=S},
+		      FPMap) ->
+  [D1, T1, S1] = [ra_temp_fp(X, FPMap) || X <- [D, T, S]],
+  I#pseudo_spill_fmove{dst=D1,temp=T1,src=S1}.
+
 ra_args([Arg|Args], Map) ->
   [ra_temp_or_imm(Arg, Map) | ra_args(Args, Map)];
 ra_args([], _) ->
diff --git a/lib/hipe/ppc/hipe_ppc_ra_postconditions.erl b/lib/hipe/ppc/hipe_ppc_ra_postconditions.erl
index 95aa294fe5..0a97129666 100644
--- a/lib/hipe/ppc/hipe_ppc_ra_postconditions.erl
+++ b/lib/hipe/ppc/hipe_ppc_ra_postconditions.erl
@@ -57,6 +57,7 @@ do_insn(I, TempMap, Strategy) ->
     #mtspr{} -> do_mtspr(I, TempMap, Strategy);
     #pseudo_li{} -> do_pseudo_li(I, TempMap, Strategy);
     #pseudo_move{} -> do_pseudo_move(I, TempMap, Strategy);
+    #pseudo_spill_move{} -> do_pseudo_spill_move(I, TempMap, Strategy);
     #store{} -> do_store(I, TempMap, Strategy);
     #storex{} -> do_storex(I, TempMap, Strategy);
     #unary{} -> do_unary(I, TempMap, Strategy);
@@ -117,18 +118,25 @@ do_pseudo_li(I=#pseudo_li{dst=Dst}, TempMap, Strategy) ->
 
 do_pseudo_move(I=#pseudo_move{dst=Dst,src=Src}, TempMap, Strategy) ->
   %% Either Dst or Src (but not both) may be a pseudo temp.
-  %% pseudo_move and pseudo_tailcall are special cases: in
-  %% all other instructions, all temps must be non-pseudos
-  %% after register allocation.
-  case temp_is_spilled(Dst, TempMap) of
-    true -> % Src must not be a pseudo
-      {FixSrc,NewSrc,DidSpill} = fix_src1(Src, TempMap, Strategy),
-      NewI = I#pseudo_move{src=NewSrc},
-      {FixSrc ++ [NewI], DidSpill};
+  %% pseudo_move, pseudo_spill_move, and pseudo_tailcall are
+  %% special cases: in all other instructions, all temps
+  %% must be non-pseudos after register allocation.
+  case temp_is_spilled(Src, TempMap)
+    andalso temp_is_spilled(Dst, TempMap)
+  of
+    true -> % Turn into pseudo_spill_move
+      Temp = clone(Src, temp1(Strategy)),
+      NewI = #pseudo_spill_move{dst=Dst,temp=Temp,src=Src},
+      {[NewI], true};
     _ ->
       {[I], false}
   end.
 
+%% Temp is above the low water mark; assert it was not spilled, change nothing.
+do_pseudo_spill_move(I=#pseudo_spill_move{temp=Tmp}, TempMap, _Strategy) ->
+  DidSpill = (false = temp_is_spilled(Tmp, TempMap)),
+  {[I], DidSpill}.
+
 do_store(I=#store{src=Src,base=Base}, TempMap, Strategy) ->
   {FixSrc,NewSrc,DidSpill1} = fix_src1(Src, TempMap, Strategy),
   {FixBase,NewBase,DidSpill2} = fix_src2(Base, TempMap, Strategy),
diff --git a/lib/hipe/ppc/hipe_ppc_ra_postconditions_fp.erl b/lib/hipe/ppc/hipe_ppc_ra_postconditions_fp.erl
index 5ec5f29577..7342053620 100644
--- a/lib/hipe/ppc/hipe_ppc_ra_postconditions_fp.erl
+++ b/lib/hipe/ppc/hipe_ppc_ra_postconditions_fp.erl
@@ -42,6 +42,7 @@ do_insn(I, TempMap) ->
     #fp_binary{} -> do_fp_binary(I, TempMap);
     #fp_unary{} -> do_fp_unary(I, TempMap);
     #pseudo_fmove{} -> do_pseudo_fmove(I, TempMap);
+    #pseudo_spill_fmove{} -> do_pseudo_spill_fmove(I, TempMap);
     _ -> {[I], false}
   end.
 
@@ -81,15 +82,22 @@ do_fp_unary(I=#fp_unary{dst=Dst,src=Src}, TempMap) ->
   {FixSrc ++ [NewI | FixDst], DidSpill1 or DidSpill2}.
 
 do_pseudo_fmove(I=#pseudo_fmove{dst=Dst,src=Src}, TempMap) ->
-  case temp_is_spilled(Dst, TempMap) of
-    true ->
-      {FixSrc,NewSrc,DidSpill} = fix_src(Src, TempMap),
-      NewI = I#pseudo_fmove{src=NewSrc},
-      {FixSrc ++ [NewI], DidSpill};
+  case temp_is_spilled(Src, TempMap)
+    andalso temp_is_spilled(Dst, TempMap)
+  of
+    true -> % Turn into pseudo_spill_fmove
+      Temp = clone(Src),
+      NewI = #pseudo_spill_fmove{dst=Dst,temp=Temp,src=Src},
+      {[NewI], true};
     _ ->
       {[I], false}
   end.
 
+%% Temp is above the low water mark; assert it was not spilled, change nothing.
+do_pseudo_spill_fmove(I=#pseudo_spill_fmove{temp=Tmp}, TempMap) ->
+  DidSpill = (false = temp_is_spilled(Tmp, TempMap)),
+  {[I], DidSpill}.
+
 %%% Fix Dst and Src operands.
 
 fix_src(Src, TempMap) ->
diff --git a/lib/hipe/ppc/hipe_ppc_subst.erl b/lib/hipe/ppc/hipe_ppc_subst.erl
index 1cd18b5c01..e282b22774 100644
--- a/lib/hipe/ppc/hipe_ppc_subst.erl
+++ b/lib/hipe/ppc/hipe_ppc_subst.erl
@@ -48,6 +48,8 @@ insn_temps(T, I) ->
       #pseudo_call_prepare{} -> I;
       #pseudo_li{dst=D} -> I#pseudo_li{dst=T(D)};
       #pseudo_move{dst=D,src=S} -> I#pseudo_move{dst=T(D),src=T(S)};
+      #pseudo_spill_move{dst=D,temp=U,src=S} ->
+	  I#pseudo_spill_move{dst=T(D),temp=T(U),src=T(S)};
       #pseudo_tailcall{func=F,stkargs=Stk} when not is_record(F, ppc_temp) ->
 	  I#pseudo_tailcall{stkargs=lists:map(A,Stk)};
       #pseudo_tailcall_prepare{} -> I;
@@ -62,7 +64,9 @@ insn_temps(T, I) ->
       #fp_binary{dst=D,src1=L,src2=R} ->
 	  I#fp_binary{dst=T(D),src1=T(L),src2=T(R)};
       #fp_unary{dst=D,src=S} -> I#fp_unary{dst=T(D),src=T(S)};
-      #pseudo_fmove{dst=D,src=S} -> I#pseudo_fmove{dst=T(D),src=T(S)}
+      #pseudo_fmove{dst=D,src=S} -> I#pseudo_fmove{dst=T(D),src=T(S)};
+      #pseudo_spill_fmove{dst=D,temp=U,src=S} ->
+	  I#pseudo_spill_fmove{dst=T(D),temp=T(U),src=T(S)}
   end.
 
 -spec oper_temps(subst_fun(), oper()) -> oper().
diff --git a/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl b/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
index 8914e7cac3..d592ba391c 100644
--- a/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
+++ b/lib/hipe/regalloc/hipe_amd64_specific_sse2.erl
@@ -30,6 +30,7 @@
 	 def_use/2,
 	 is_arg/2,	%% used by hipe_ls_regalloc
 	 is_move/2,
+	 is_spill_move/2,
 	 is_fixed/2,	%% used by hipe_graph_coloring_regalloc
 	 is_global/2,
 	 is_precoloured/2,
@@ -194,6 +195,9 @@ is_move(Instruction, _) ->
 	andalso hipe_x86:is_temp(Dst) andalso hipe_x86:temp_is_allocatable(Dst);
     false -> false
   end.
+
+is_spill_move(Insn, _Ctx) -> % recognises #pseudo_spill_fmove{}
+  hipe_x86:is_pseudo_spill_fmove(Insn).
  
 reg_nr(Reg, _) ->
   hipe_x86:temp_reg(Reg).
diff --git a/lib/hipe/regalloc/hipe_arm_specific.erl b/lib/hipe/regalloc/hipe_arm_specific.erl
index 517ad83982..7ebc6aa336 100644
--- a/lib/hipe/regalloc/hipe_arm_specific.erl
+++ b/lib/hipe/regalloc/hipe_arm_specific.erl
@@ -24,6 +24,7 @@
 	 ,reg_nr/2
 	 ,def_use/2
 	 ,is_move/2
+	 ,is_spill_move/2
 	 ,is_precoloured/2
 	 ,var_range/2
 	 ,allocatable/1
@@ -154,6 +155,9 @@ is_move(Instruction, _) ->
     false -> false
   end.
 
+is_spill_move(Insn, _Ctx) -> % delegates to the hipe_arm predicate
+  hipe_arm:is_pseudo_spill_move(Insn).
+
 reg_nr(Reg, _) ->
   hipe_arm:temp_reg(Reg).
 
diff --git a/lib/hipe/regalloc/hipe_ppc_specific.erl b/lib/hipe/regalloc/hipe_ppc_specific.erl
index 50ab0f04cf..81bb551bd2 100644
--- a/lib/hipe/regalloc/hipe_ppc_specific.erl
+++ b/lib/hipe/regalloc/hipe_ppc_specific.erl
@@ -24,6 +24,7 @@
 	 ,reg_nr/2
 	 ,def_use/2
 	 ,is_move/2
+	 ,is_spill_move/2
 	 ,is_precoloured/2
 	 ,var_range/2
 	 ,allocatable/1
@@ -154,6 +155,9 @@ is_move(Instruction, _) ->
     false -> false
   end.
 
+is_spill_move(Insn, _Ctx) -> % recognises #pseudo_spill_move{}
+  hipe_ppc:is_pseudo_spill_move(Insn).
+
 reg_nr(Reg, _) ->
   hipe_ppc:temp_reg(Reg).
 
diff --git a/lib/hipe/regalloc/hipe_ppc_specific_fp.erl b/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
index 5a4f63fc78..dcfdf6592c 100644
--- a/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
+++ b/lib/hipe/regalloc/hipe_ppc_specific_fp.erl
@@ -24,6 +24,7 @@
 	 ,reg_nr/2
 	 ,def_use/2
 	 ,is_move/2
+	 ,is_spill_move/2
 	 ,is_precoloured/2
 	 ,var_range/2
 	 ,allocatable/1
@@ -135,6 +136,9 @@ defines_all_alloc(I, _) ->
 is_move(I, _) ->
   hipe_ppc:is_pseudo_fmove(I).
  
+is_spill_move(Insn, _Ctx) -> % recognises #pseudo_spill_fmove{}
+  hipe_ppc:is_pseudo_spill_fmove(Insn).
+
 reg_nr(Reg, _) ->
   hipe_ppc:temp_reg(Reg).
 
diff --git a/lib/hipe/regalloc/hipe_sparc_specific.erl b/lib/hipe/regalloc/hipe_sparc_specific.erl
index 9eb57dafd2..78b6379eba 100644
--- a/lib/hipe/regalloc/hipe_sparc_specific.erl
+++ b/lib/hipe/regalloc/hipe_sparc_specific.erl
@@ -24,6 +24,7 @@
 	 ,reg_nr/2
 	 ,def_use/2
 	 ,is_move/2
+	 ,is_spill_move/2
 	 ,is_precoloured/2
 	 ,var_range/2
 	 ,allocatable/1
@@ -154,6 +155,9 @@ is_move(Instruction, _) ->
     false -> false
   end.
 
+is_spill_move(Insn, _Ctx) -> % recognises #pseudo_spill_move{}
+  hipe_sparc:is_pseudo_spill_move(Insn).
+
 reg_nr(Reg, _) ->
   hipe_sparc:temp_reg(Reg).
 
diff --git a/lib/hipe/regalloc/hipe_sparc_specific_fp.erl b/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
index 9993af81c2..485fdc212a 100644
--- a/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
+++ b/lib/hipe/regalloc/hipe_sparc_specific_fp.erl
@@ -24,6 +24,7 @@
 	 ,reg_nr/2
 	 ,def_use/2
 	 ,is_move/2
+	 ,is_spill_move/2
 	 ,is_precoloured/2
 	 ,var_range/2
 	 ,allocatable/1
@@ -135,6 +136,9 @@ defines_all_alloc(I, _) ->
 is_move(I, _) ->
   hipe_sparc:is_pseudo_fmove(I).
  
+is_spill_move(Insn, _Ctx) -> % recognises #pseudo_spill_fmove{}
+  hipe_sparc:is_pseudo_spill_fmove(Insn).
+
 reg_nr(Reg, _) ->
   hipe_sparc:temp_reg(Reg).
 
diff --git a/lib/hipe/regalloc/hipe_x86_specific.erl b/lib/hipe/regalloc/hipe_x86_specific.erl
index f21384d377..dacfb71b00 100644
--- a/lib/hipe/regalloc/hipe_x86_specific.erl
+++ b/lib/hipe/regalloc/hipe_x86_specific.erl
@@ -46,6 +46,7 @@
 	 def_use/2,
 	 is_arg/2,	% used by hipe_ls_regalloc
 	 is_move/2,
+	 is_spill_move/2,
 	 is_fixed/2,	% used by hipe_graph_coloring_regalloc
 	 is_global/2,
 	 is_precoloured/2,
@@ -210,6 +211,9 @@ is_move(Instruction,_) ->
     false -> false
   end.
 
+is_spill_move(Insn, _Ctx) -> % recognises #pseudo_spill_move{}
+  hipe_x86:is_pseudo_spill_move(Insn).
+
 reg_nr(Reg,_) ->
   hipe_x86:temp_reg(Reg).
 
diff --git a/lib/hipe/regalloc/hipe_x86_specific_x87.erl b/lib/hipe/regalloc/hipe_x86_specific_x87.erl
index 4b4c83f76d..3fe49e1f00 100644
--- a/lib/hipe/regalloc/hipe_x86_specific_x87.erl
+++ b/lib/hipe/regalloc/hipe_x86_specific_x87.erl
@@ -47,6 +47,7 @@
 	 uses/2,
 	 defines/2,
 	 defines_all_alloc/2,
+	 is_spill_move/2,
 	 is_global/2,
 	 reg_nr/2,
 	 physical_name/2,
@@ -158,6 +159,9 @@ defines(I, _) ->
 
 defines_all_alloc(I, _) -> hipe_amd64_defuse:insn_defs_all(I).
 
+is_spill_move(Insn, _Ctx) -> % recognises #pseudo_spill_fmove{}
+  hipe_x86:is_pseudo_spill_fmove(Insn).
+
 temp_is_double(Temp) ->
   hipe_x86:temp_type(Temp) =:= 'double'.
 
diff --git a/lib/hipe/sparc/hipe_sparc.erl b/lib/hipe/sparc/hipe_sparc.erl
index 916857b224..22e0761b69 100644
--- a/lib/hipe/sparc/hipe_sparc.erl
+++ b/lib/hipe/sparc/hipe_sparc.erl
@@ -87,6 +87,9 @@
 
 	 mk_pseudo_set/2,
 
+	 mk_pseudo_spill_move/3,
+	 is_pseudo_spill_move/1,
+
 	 mk_pseudo_tailcall/4,
 	 pseudo_tailcall_funv/1,
 	 pseudo_tailcall_linkage/1,
@@ -117,6 +120,9 @@
 	 pseudo_fmove_src/1,
 	 pseudo_fmove_dst/1,
 
+	 mk_pseudo_spill_fmove/3,
+	 is_pseudo_spill_fmove/1,
+
 	 mk_pseudo_fstore/3,
 	 mk_fstore/4,
 
@@ -269,6 +275,10 @@ mk_pseudo_ret() -> #pseudo_ret{}.
 
 mk_pseudo_set(Imm, Dst) -> #pseudo_set{imm=Imm, dst=Dst}.
 
+mk_pseudo_spill_move(Src, Temp, Dst) -> % Temp: scratch for the copy
+  #pseudo_spill_move{dst=Dst, temp=Temp, src=Src}.
+is_pseudo_spill_move(I) -> case I of #pseudo_spill_move{} -> true; _ -> false end.
+
 mk_pseudo_tailcall(FunV, Arity, StkArgs, Linkage) ->
   #pseudo_tailcall{funv=FunV, arity=Arity, stkargs=StkArgs, linkage=Linkage}.
 pseudo_tailcall_funv(#pseudo_tailcall{funv=FunV}) -> FunV.
@@ -375,6 +385,10 @@ is_pseudo_fmove(I) -> case I of #pseudo_fmove{} -> true; _ -> false end.
 pseudo_fmove_src(#pseudo_fmove{src=Src}) -> Src.
 pseudo_fmove_dst(#pseudo_fmove{dst=Dst}) -> Dst.
 
+mk_pseudo_spill_fmove(Src, Temp, Dst) -> % Temp: scratch for the copy
+  #pseudo_spill_fmove{dst=Dst, temp=Temp, src=Src}.
+is_pseudo_spill_fmove(I) -> case I of #pseudo_spill_fmove{} -> true; _ -> false end.
+
 mk_pseudo_fstore(Src, Base, Disp) ->
   #pseudo_fstore{src=Src, base=Base, disp=Disp}.
 
diff --git a/lib/hipe/sparc/hipe_sparc.hrl b/lib/hipe/sparc/hipe_sparc.hrl
index 4eae6777a9..f60e516e59 100644
--- a/lib/hipe/sparc/hipe_sparc.hrl
+++ b/lib/hipe/sparc/hipe_sparc.hrl
@@ -88,6 +88,8 @@
 -record(pseudo_move, {src, dst}).
 -record(pseudo_ret, {}).
 -record(pseudo_set, {imm, dst}).
+-record(pseudo_spill_fmove, {src, temp, dst}).
+-record(pseudo_spill_move, {src, temp, dst}).
 -record(pseudo_tailcall, {funv, arity, stkargs, linkage}).
 -record(pseudo_tailcall_prepare, {}).
 -record(rdy, {dst}).
diff --git a/lib/hipe/sparc/hipe_sparc_defuse.erl b/lib/hipe/sparc/hipe_sparc_defuse.erl
index cb75f82e2b..4d4b11e301 100644
--- a/lib/hipe/sparc/hipe_sparc_defuse.erl
+++ b/lib/hipe/sparc/hipe_sparc_defuse.erl
@@ -39,6 +39,7 @@ insn_def_gpr(I) ->
     #pseudo_call{} -> call_clobbered_gpr();
     #pseudo_move{dst=Dst} -> [Dst];
     #pseudo_set{dst=Dst} -> [Dst];
+    #pseudo_spill_move{temp=Temp, dst=Dst} -> [Temp, Dst];
     #pseudo_tailcall_prepare{} -> tailcall_clobbered_gpr();
     #rdy{dst=Dst} -> [Dst];
     #sethi{dst=Dst} -> [Dst];
@@ -72,6 +73,7 @@ insn_use_gpr(I) ->
       funv_use(FunV, arity_use_gpr(Arity));
     #pseudo_move{src=Src} -> [Src];
     #pseudo_ret{} -> [hipe_sparc:mk_rv()];
+    #pseudo_spill_move{src=Src} -> [Src];
     #pseudo_tailcall{funv=FunV,arity=Arity,stkargs=StkArgs} ->
       addsrcs(StkArgs, addtemps(tailcall_clobbered_gpr(), funv_use(FunV, arity_use_gpr(Arity))));
     #store{src=Src,base=Base,disp=Disp} ->
@@ -112,6 +114,7 @@ insn_def_fpr(I) ->
     #fp_unary{dst=Dst} -> [Dst];
     #pseudo_fload{dst=Dst} -> [Dst];
     #pseudo_fmove{dst=Dst} -> [Dst];
+    #pseudo_spill_fmove{temp=Temp, dst=Dst} -> [Temp, Dst];
     _ -> []
   end.
 
@@ -130,6 +133,7 @@ insn_use_fpr(I) ->
     #fp_unary{src=Src} -> [Src];
     #pseudo_fmove{src=Src} -> [Src];
     #pseudo_fstore{src=Src} -> [Src];
+    #pseudo_spill_fmove{src=Src} -> [Src];
     _ -> []
   end.
 
diff --git a/lib/hipe/sparc/hipe_sparc_frame.erl b/lib/hipe/sparc/hipe_sparc_frame.erl
index 6f29c3c905..1f2a259ca1 100644
--- a/lib/hipe/sparc/hipe_sparc_frame.erl
+++ b/lib/hipe/sparc/hipe_sparc_frame.erl
@@ -82,6 +82,10 @@ do_insn(I, LiveOut, Context, FPoff) ->
       {do_pseudo_tailcall(I, Context), context_framesize(Context)};
     #pseudo_fmove{} ->
       {do_pseudo_fmove(I, Context, FPoff), FPoff};
+    #pseudo_spill_move{} ->
+      {do_pseudo_spill_move(I, Context, FPoff), FPoff};
+    #pseudo_spill_fmove{} ->
+      {do_pseudo_spill_fmove(I, Context, FPoff), FPoff};
     _ ->
       {[I], FPoff}
   end.
@@ -110,6 +114,22 @@ do_pseudo_move(I, Context, FPoff) ->
       end
   end.
 
+do_pseudo_spill_move(I, Context, FPoff) -> % Src slot -> Temp -> Dst slot
+  #pseudo_spill_move{src=From,temp=Tmp,dst=To} = I,
+  BothPseudo = temp_is_pseudo(From) andalso temp_is_pseudo(To),
+  case BothPseudo of
+    false -> % allocator no longer spills both operands: plain move again
+      do_pseudo_move(hipe_sparc:mk_pseudo_move(From, To), Context, FPoff);
+    true ->
+      case {pseudo_offset(From, FPoff, Context),
+	    pseudo_offset(To, FPoff, Context)} of
+	{Same, Same} -> []; % both operands share one offset: nothing to copy
+	{FromOff, ToOff} ->
+	  Store = mk_store(Tmp, hipe_sparc:mk_sp(), ToOff, []),
+	  mk_load(hipe_sparc:mk_sp(), FromOff, Tmp, Store)
+      end
+  end.
+
 do_pseudo_fmove(I, Context, FPoff) ->
   Dst = hipe_sparc:pseudo_fmove_dst(I),
   Src = hipe_sparc:pseudo_fmove_src(I),
@@ -127,6 +147,22 @@ do_pseudo_fmove(I, Context, FPoff) ->
       end
   end.
 
+do_pseudo_spill_fmove(I, Context, FPoff) -> % Src slot -> Temp -> Dst slot
+  #pseudo_spill_fmove{src=From,temp=Tmp,dst=To} = I,
+  BothPseudo = temp_is_pseudo(From) andalso temp_is_pseudo(To),
+  case BothPseudo of
+    false -> % allocator no longer spills both operands: plain fmove again
+      do_pseudo_fmove(hipe_sparc:mk_pseudo_fmove(From, To), Context, FPoff);
+    true ->
+      case {pseudo_offset(From, FPoff, Context),
+	    pseudo_offset(To, FPoff, Context)} of
+	{Same, Same} -> []; % both operands share one offset: nothing to copy
+	{FromOff, ToOff} ->
+	  Load = mk_fload(hipe_sparc:mk_sp(), FromOff, Tmp),
+	  Load ++ mk_fstore(Tmp, hipe_sparc:mk_sp(), ToOff)
+      end
+  end.
+
 pseudo_offset(Temp, FPoff, Context) ->
   FPoff + context_offset(Context, Temp).
 
diff --git a/lib/hipe/sparc/hipe_sparc_ra_finalise.erl b/lib/hipe/sparc/hipe_sparc_ra_finalise.erl
index 5fdb73e197..a724821992 100644
--- a/lib/hipe/sparc/hipe_sparc_ra_finalise.erl
+++ b/lib/hipe/sparc/hipe_sparc_ra_finalise.erl
@@ -38,6 +38,7 @@ ra_insn(I, Map, FPMap) ->
     #pseudo_call{} -> ra_pseudo_call(I, Map);
     #pseudo_move{} -> ra_pseudo_move(I, Map);
     #pseudo_set{} -> ra_pseudo_set(I, Map);
+    #pseudo_spill_move{} -> ra_pseudo_spill_move(I, Map);
     #pseudo_tailcall{} -> ra_pseudo_tailcall(I, Map);
     #rdy{} -> ra_rdy(I, Map);
     #sethi{} -> ra_sethi(I, Map);
@@ -47,6 +48,7 @@ ra_insn(I, Map, FPMap) ->
     #pseudo_fload{} -> ra_pseudo_fload(I, Map, FPMap);
     #pseudo_fmove{} -> ra_pseudo_fmove(I, FPMap);
     #pseudo_fstore{} -> ra_pseudo_fstore(I, Map, FPMap);
+    #pseudo_spill_fmove{} -> ra_pseudo_spill_fmove(I, FPMap);
     _ -> I
   end.
 
@@ -80,6 +82,12 @@ ra_pseudo_set(I=#pseudo_set{dst=Dst}, Map) ->
   NewDst = ra_temp(Dst, Map),
   I#pseudo_set{dst=NewDst}.
 
+%% Pass every operand of a pseudo_spill_move through ra_temp/2 with Map;
+%% src, temp and dst are looked up in that order.
+ra_pseudo_spill_move(I=#pseudo_spill_move{src=S,temp=T,dst=D}, Map) ->
+  [S1, T1, D1] = [ra_temp(X, Map) || X <- [S, T, D]],
+  I#pseudo_spill_move{src=S1,temp=T1,dst=D1}.
+
 ra_pseudo_tailcall(I=#pseudo_tailcall{funv=FunV,stkargs=StkArgs}, Map) ->
   NewFunV = ra_funv(FunV, Map),
   NewStkArgs = ra_args(StkArgs, Map),
@@ -120,6 +128,13 @@ ra_pseudo_fmove(I=#pseudo_fmove{src=Src,dst=Dst}, FPMap) ->
   NewDst = ra_temp_fp(Dst, FPMap),
   I#pseudo_fmove{src=NewSrc,dst=NewDst}.
 
+%% Pass every operand of a pseudo_spill_fmove through ra_temp_fp/2 with
+%% FPMap; src, temp and dst are looked up in that order.
+ra_pseudo_spill_fmove(I=#pseudo_spill_fmove{src=S,temp=T,dst=D},
+		      FPMap) ->
+  [S1, T1, D1] = [ra_temp_fp(X, FPMap) || X <- [S, T, D]],
+  I#pseudo_spill_fmove{src=S1,temp=T1,dst=D1}.
+
 ra_pseudo_fstore(I=#pseudo_fstore{src=Src,base=Base}, Map, FPMap) ->
   NewSrc = ra_temp_fp(Src, FPMap),
   NewBase = ra_temp(Base, Map),
diff --git a/lib/hipe/sparc/hipe_sparc_ra_postconditions.erl b/lib/hipe/sparc/hipe_sparc_ra_postconditions.erl
index 984c97fbd4..d3ecb43ec6 100644
--- a/lib/hipe/sparc/hipe_sparc_ra_postconditions.erl
+++ b/lib/hipe/sparc/hipe_sparc_ra_postconditions.erl
@@ -54,6 +54,7 @@ do_insn(I, TempMap, Strategy) ->
     #pseudo_call{} -> do_pseudo_call(I, TempMap, Strategy);
     #pseudo_move{} -> do_pseudo_move(I, TempMap, Strategy);
     #pseudo_set{} -> do_pseudo_set(I, TempMap, Strategy);
+    #pseudo_spill_move{} -> do_pseudo_spill_move(I, TempMap, Strategy);
     #pseudo_tailcall{} -> do_pseudo_tailcall(I, TempMap, Strategy);
     #rdy{} -> do_rdy(I, TempMap, Strategy);
     #sethi{} -> do_sethi(I, TempMap, Strategy);
@@ -92,14 +93,16 @@ do_pseudo_call(I=#pseudo_call{funv=FunV}, TempMap, Strategy) ->
 
 do_pseudo_move(I=#pseudo_move{src=Src,dst=Dst}, TempMap, Strategy) ->
   %% Either Dst or Src (but not both) may be a pseudo temp.
-  %% pseudo_move is a special case: in [XXX: not pseudo_tailcall]
-  %% all other instructions, all temps must be non-pseudos
-  %% after register allocation.
-  case temp_is_spilled(Dst, TempMap) of
-    true -> % Src must not be a pseudo
-      {FixSrc,NewSrc,DidSpill} = fix_src1(Src, TempMap, Strategy),
-      NewI = I#pseudo_move{src=NewSrc},
-      {FixSrc ++ [NewI], DidSpill};
+  %% pseudo_move and pseudo_spill_move [XXX: not pseudo_tailcall]
+  %% are special cases: in all other instructions, all temps must
+  %% be non-pseudos after register allocation.
+  case temp_is_spilled(Src, TempMap)
+    andalso temp_is_spilled(Dst, TempMap)
+  of
+    true -> % Turn into pseudo_spill_move
+      Temp = clone(Src, temp1(Strategy)),
+      NewI = #pseudo_spill_move{src=Src,temp=Temp,dst=Dst},
+      {[NewI], true};
     _ ->
       {[I], false}
   end.
@@ -109,6 +112,11 @@ do_pseudo_set(I=#pseudo_set{dst=Dst}, TempMap, Strategy) ->
   NewI = I#pseudo_set{dst=NewDst},
   {[NewI | FixDst], DidSpill}.
 
+%% Temp is above the low water mark; assert it was not spilled, change nothing.
+do_pseudo_spill_move(I=#pseudo_spill_move{temp=Tmp}, TempMap, _Strategy) ->
+  DidSpill = (false = temp_is_spilled(Tmp, TempMap)),
+  {[I], DidSpill}.
+
 do_pseudo_tailcall(I=#pseudo_tailcall{funv=FunV}, TempMap, Strategy) ->
   {FixFunV,NewFunV,DidSpill} = fix_funv(FunV, TempMap, Strategy),
   NewI = I#pseudo_tailcall{funv=NewFunV},
diff --git a/lib/hipe/sparc/hipe_sparc_ra_postconditions_fp.erl b/lib/hipe/sparc/hipe_sparc_ra_postconditions_fp.erl
index 751e91425c..5fa3a5fc59 100644
--- a/lib/hipe/sparc/hipe_sparc_ra_postconditions_fp.erl
+++ b/lib/hipe/sparc/hipe_sparc_ra_postconditions_fp.erl
@@ -43,6 +43,7 @@ do_insn(I, TempMap) ->
     #pseudo_fload{} -> do_pseudo_fload(I, TempMap);
     #pseudo_fmove{} -> do_pseudo_fmove(I, TempMap);
     #pseudo_fstore{} -> do_pseudo_fstore(I, TempMap);
+    #pseudo_spill_fmove{} -> do_pseudo_spill_fmove(I, TempMap);
     _ -> {[I], false}
   end.
 
@@ -67,11 +68,13 @@ do_pseudo_fload(I=#pseudo_fload{dst=Dst}, TempMap) ->
   {[NewI | FixDst], DidSpill}.
 
 do_pseudo_fmove(I=#pseudo_fmove{src=Src,dst=Dst}, TempMap) ->
-  case temp_is_spilled(Dst, TempMap) of
-    true ->
-      {FixSrc,NewSrc,DidSpill} = fix_src(Src, TempMap),
-      NewI = I#pseudo_fmove{src=NewSrc},
-      {FixSrc ++ [NewI], DidSpill};
+  case temp_is_spilled(Src, TempMap)
+    andalso temp_is_spilled(Dst, TempMap)
+  of
+    true -> % Turn into pseudo_spill_fmove
+      Temp = clone(Src),
+      NewI = #pseudo_spill_fmove{src=Src,temp=Temp,dst=Dst},
+      {[NewI], true};
     _ ->
       {[I], false}
   end.
@@ -81,6 +84,11 @@ do_pseudo_fstore(I=#pseudo_fstore{src=Src}, TempMap) ->
   NewI = I#pseudo_fstore{src=NewSrc},
   {FixSrc ++ [NewI], DidSpill}.
 
+%% Temp is above the low water mark; assert it was not spilled, change nothing.
+do_pseudo_spill_fmove(I=#pseudo_spill_fmove{temp=Tmp}, TempMap) ->
+  DidSpill = (false = temp_is_spilled(Tmp, TempMap)),
+  {[I], DidSpill}.
+
 %%% Fix Dst and Src operands.
 
 fix_src(Src, TempMap) ->
diff --git a/lib/hipe/sparc/hipe_sparc_subst.erl b/lib/hipe/sparc/hipe_sparc_subst.erl
index 1d0671464e..ce3bbb813a 100644
--- a/lib/hipe/sparc/hipe_sparc_subst.erl
+++ b/lib/hipe/sparc/hipe_sparc_subst.erl
@@ -44,6 +44,8 @@ insn_temps(T, I) ->
       #pseudo_move{src=S,dst=D} -> I#pseudo_move{src=T(S),dst=T(D)};
       #pseudo_ret{} -> I;
       #pseudo_set{dst=D}-> I#pseudo_set{dst=T(D)};
+      #pseudo_spill_move{src=S,temp=U,dst=D} ->
+	  I#pseudo_spill_move{src=T(S),temp=T(U),dst=T(D)};
       #pseudo_tailcall{funv=F,stkargs=Stk} ->
 	  I#pseudo_tailcall{funv=funv_temps(T,F),stkargs=lists:map(Arg,Stk)};
       #pseudo_tailcall_prepare{} -> I;
@@ -57,7 +59,9 @@ insn_temps(T, I) ->
 	  I#pseudo_fload{base=T(B),disp=S2(Di),dst=T(Ds)};
       #pseudo_fmove{src=S,dst=D} -> I#pseudo_fmove{src=T(S),dst=T(D)};
       #pseudo_fstore{src=S,base=B,disp=D} ->
-	  I#pseudo_fstore{src=T(S),base=T(B),disp=S2(D)}
+	  I#pseudo_fstore{src=T(S),base=T(B),disp=S2(D)};
+      #pseudo_spill_fmove{src=S,temp=U,dst=D} ->
+	  I#pseudo_spill_fmove{src=T(S),temp=T(U),dst=T(D)}
   end.
 
 -spec src2_temps(subst_fun(), src2()) -> src2().
diff --git a/lib/hipe/x86/hipe_x86.erl b/lib/hipe/x86/hipe_x86.erl
index cc1c75b04d..f514dd1ded 100644
--- a/lib/hipe/x86/hipe_x86.erl
+++ b/lib/hipe/x86/hipe_x86.erl
@@ -167,6 +167,12 @@
 
      mk_pseudo_spill/1,
 
+	 mk_pseudo_spill_fmove/3,
+	 is_pseudo_spill_fmove/1,
+
+	 mk_pseudo_spill_move/3,
+	 is_pseudo_spill_move/1,
+
 	 mk_pseudo_tailcall/4,
 	 %% is_pseudo_tailcall/1,
 	 pseudo_tailcall_fun/1,
@@ -425,6 +431,14 @@ mk_pseudo_jcc_simple(Cc, TrueLabel, FalseLabel, Pred) ->
 mk_pseudo_spill(List) ->
     #pseudo_spill{args=List}.
 
+mk_pseudo_spill_fmove(Src, Temp, Dst) -> % Temp: scratch for the copy
+    #pseudo_spill_fmove{dst=Dst, temp=Temp, src=Src}.
+is_pseudo_spill_fmove(I) -> case I of #pseudo_spill_fmove{} -> true; _ -> false end.
+
+mk_pseudo_spill_move(Src, Temp, Dst) -> % Src to Dst with an auxiliary Temp
+    #pseudo_spill_move{dst=Dst, temp=Temp, src=Src}.
+is_pseudo_spill_move(I) -> case I of #pseudo_spill_move{} -> true; _ -> false end.
+
 mk_pseudo_tailcall(Fun, Arity, StkArgs, Linkage) ->
     check_linkage(Linkage),
     #pseudo_tailcall{'fun'=Fun, arity=Arity, stkargs=StkArgs, linkage=Linkage}.
diff --git a/lib/hipe/x86/hipe_x86.hrl b/lib/hipe/x86/hipe_x86.hrl
index 567848bae5..6cd69905b2 100644
--- a/lib/hipe/x86/hipe_x86.hrl
+++ b/lib/hipe/x86/hipe_x86.hrl
@@ -91,6 +91,8 @@
 -record(pseudo_call, {'fun', sdesc, contlab, linkage}).
 -record(pseudo_jcc, {cc, true_label, false_label, pred}).
 -record(pseudo_spill, {args=[]}).
+-record(pseudo_spill_move, {src, temp, dst}).
+-record(pseudo_spill_fmove, {src, temp, dst}).
 -record(pseudo_tailcall, {'fun', arity, stkargs, linkage}).
 -record(pseudo_tailcall_prepare, {}).
 -record(push, {src}).
diff --git a/lib/hipe/x86/hipe_x86_defuse.erl b/lib/hipe/x86/hipe_x86_defuse.erl
index 5d7fadf8e5..2731836dc1 100644
--- a/lib/hipe/x86/hipe_x86_defuse.erl
+++ b/lib/hipe/x86/hipe_x86_defuse.erl
@@ -51,6 +51,8 @@ insn_def(I) ->
     #movzx{dst=Dst} -> dst_def(Dst);
     #pseudo_call{} -> call_clobbered();
     #pseudo_spill{} -> [];
+    #pseudo_spill_fmove{temp=Temp, dst=Dst} -> [Temp, Dst];
+    #pseudo_spill_move{temp=Temp, dst=Dst} -> [Temp, Dst];
     #pseudo_tailcall_prepare{} -> tailcall_clobbered();
     #shift{dst=Dst} -> dst_def(Dst);
     %% call, cmp, comment, jcc, jmp_fun, jmp_label, jmp_switch, label
@@ -108,6 +110,8 @@ insn_use(I) ->
     #pseudo_call{'fun'=Fun,sdesc=#x86_sdesc{arity=Arity}} ->
       addtemp(Fun, arity_use(Arity));
     #pseudo_spill{args=Args} -> Args;
+    #pseudo_spill_fmove{src=Src} -> [Src];
+    #pseudo_spill_move{src=Src} -> [Src];
     #pseudo_tailcall{'fun'=Fun,arity=Arity,stkargs=StkArgs} ->
       addtemp(Fun, addtemps(StkArgs, addtemps(tailcall_clobbered(),
 					      arity_use(Arity))));
diff --git a/lib/hipe/x86/hipe_x86_frame.erl b/lib/hipe/x86/hipe_x86_frame.erl
index 3c2b67967a..558321d0c3 100644
--- a/lib/hipe/x86/hipe_x86_frame.erl
+++ b/lib/hipe/x86/hipe_x86_frame.erl
@@ -95,13 +95,17 @@ do_insn(I, LiveOut, Context, FPoff) ->
     #imul{} ->
       {[do_imul(I, Context, FPoff)], FPoff};
     #move{} ->
-      {[do_move(I, Context, FPoff)], FPoff};
+      {do_move(I, Context, FPoff), FPoff};
     #movsx{} ->
       {[do_movsx(I, Context, FPoff)], FPoff};
     #movzx{} ->
       {[do_movzx(I, Context, FPoff)], FPoff};
     #pseudo_call{} ->
       do_pseudo_call(I, LiveOut, Context, FPoff);
+    #pseudo_spill_fmove{} ->
+      {do_pseudo_spill_fmove(I, Context, FPoff), FPoff};
+    #pseudo_spill_move{} ->
+      {do_pseudo_spill_move(I, Context, FPoff), FPoff};
     #pseudo_tailcall{} ->
       {do_pseudo_tailcall(I, Context), context_framesize(Context)};
     #push{} ->
@@ -144,22 +148,50 @@ do_fp_binop(I, Context, FPoff) ->
   Dst = conv_opnd(Dst0, FPoff, Context),
   [I#fp_binop{src=Src,dst=Dst}].
 
-do_fmove(I, Context, FPoff) ->
-  #fmove{src=Src0,dst=Dst0} = I,
+do_fmove(I0, Context, FPoff) ->
+  #fmove{src=Src0,dst=Dst0} = I0,
   Src = conv_opnd(Src0, FPoff, Context),
   Dst = conv_opnd(Dst0, FPoff, Context),
-  I#fmove{src=Src,dst=Dst}.
+  I = I0#fmove{src=Src,dst=Dst},
+  case Src =:= Dst of
+    true -> []; % omit move-to-self
+    false -> [I]
+  end.
+
+do_pseudo_spill_fmove(I0, Context, FPoff) ->
+  #pseudo_spill_fmove{src=Src0,temp=Temp0,dst=Dst0} = I0,
+  Src = conv_opnd(Src0, FPoff, Context),
+  Temp = conv_opnd(Temp0, FPoff, Context),
+  Dst = conv_opnd(Dst0, FPoff, Context),
+  case Src =:= Dst of
+    true -> []; % omit move-to-self
+    false -> [#fmove{src=Src, dst=Temp}, #fmove{src=Temp, dst=Dst}]
+  end.
 
 do_imul(I, Context, FPoff) ->
   #imul{src=Src0} = I,
   Src = conv_opnd(Src0, FPoff, Context),
   I#imul{src=Src}.
 
-do_move(I, Context, FPoff) ->
-  #move{src=Src0,dst=Dst0} = I,
+do_move(I0, Context, FPoff) ->
+  #move{src=Src0,dst=Dst0} = I0,
   Src = conv_opnd(Src0, FPoff, Context),
   Dst = conv_opnd(Dst0, FPoff, Context),
-  I#move{src=Src,dst=Dst}.
+  I = I0#move{src=Src,dst=Dst},
+  case Src =:= Dst of
+    true -> []; % omit move-to-self
+    false -> [I]
+  end.
+
+do_pseudo_spill_move(I0, Context, FPoff) ->
+  #pseudo_spill_move{src=Src0,temp=Temp0,dst=Dst0} = I0,
+  Src = conv_opnd(Src0, FPoff, Context),
+  Temp = conv_opnd(Temp0, FPoff, Context),
+  Dst = conv_opnd(Dst0, FPoff, Context),
+  case Src =:= Dst of
+    true -> []; % omit move-to-self
+    false -> [#move{src=Src, dst=Temp}, #move{src=Temp, dst=Dst}]
+  end.
 
 do_movsx(I, Context, FPoff) ->
   #movsx{src=Src0,dst=Dst0} = I,
diff --git a/lib/hipe/x86/hipe_x86_ra_finalise.erl b/lib/hipe/x86/hipe_x86_ra_finalise.erl
index 4273e3cee8..e8abe78e00 100644
--- a/lib/hipe/x86/hipe_x86_ra_finalise.erl
+++ b/lib/hipe/x86/hipe_x86_ra_finalise.erl
@@ -140,6 +140,16 @@ ra_insn(I, Map, FpMap) ->
       I#pseudo_call{'fun'=Fun};
     #pseudo_jcc{} ->
       I;
+    #pseudo_spill_fmove{src=Src0, temp=Temp0, dst=Dst0} ->
+      Src = ra_opnd(Src0, Map, FpMap),
+      Temp = ra_opnd(Temp0, Map, FpMap),
+      Dst = ra_opnd(Dst0, Map, FpMap),
+      I#pseudo_spill_fmove{src=Src, temp=Temp, dst=Dst};
+    #pseudo_spill_move{src=Src0, temp=Temp0, dst=Dst0} ->
+      Src = ra_opnd(Src0, Map),
+      Temp = ra_opnd(Temp0, Map),
+      Dst = ra_opnd(Dst0, Map),
+      I#pseudo_spill_move{src=Src, temp=Temp, dst=Dst};
     #pseudo_tailcall{'fun'=Fun0,stkargs=StkArgs0} ->
       Fun = ra_opnd(Fun0, Map),
       StkArgs = ra_args(StkArgs0, Map),
diff --git a/lib/hipe/x86/hipe_x86_ra_postconditions.erl b/lib/hipe/x86/hipe_x86_ra_postconditions.erl
index 28ec9c4277..db6391d5c1 100644
--- a/lib/hipe/x86/hipe_x86_ra_postconditions.erl
+++ b/lib/hipe/x86/hipe_x86_ra_postconditions.erl
@@ -74,6 +74,8 @@ do_insn(I, TempMap, Strategy) ->	% Insn -> {Insn list, DidSpill}
       do_movx(I, TempMap, Strategy);
     #fmove{} ->
       do_fmove(I, TempMap, Strategy);
+    #pseudo_spill_move{} ->
+      do_pseudo_spill_move(I, TempMap, Strategy);
     #shift{} ->
       do_shift(I, TempMap, Strategy);
     #test{} ->
@@ -190,10 +192,19 @@ do_lea(I, TempMap, Strategy) ->
 
 do_move(I, TempMap, Strategy) ->
   #move{src=Src0,dst=Dst0} = I,
-  {FixSrc, Src, FixDst, Dst, DidSpill} =
-    do_check_byte_move(Src0, Dst0, TempMap, Strategy),
-  {FixSrc ++ FixDst ++ [I#move{src=Src,dst=Dst}],
-   DidSpill}.
+  case
+    is_record(Src0, x86_temp) andalso is_record(Dst0, x86_temp)
+    andalso is_spilled(Src0, TempMap) andalso is_spilled(Dst0, TempMap)
+  of
+    true ->
+      Tmp = clone(Src0, Strategy),
+      {[hipe_x86:mk_pseudo_spill_move(Src0, Tmp, Dst0)], true};
+    false ->
+      {FixSrc, Src, FixDst, Dst, DidSpill} =
+	do_check_byte_move(Src0, Dst0, TempMap, Strategy),
+      {FixSrc ++ FixDst ++ [I#move{src=Src,dst=Dst}],
+       DidSpill}
+  end.
 
 -ifdef(HIPE_AMD64).
 
@@ -287,6 +298,13 @@ do_fmove(I, TempMap, Strategy) ->
   {FixSrc ++ FixDst ++ [I#fmove{src=Src,dst=Dst}],
    DidSpill1 or DidSpill2}.
 
+%%% Fix a pseudo_spill_move op.
+
+do_pseudo_spill_move(I = #pseudo_spill_move{temp=Temp}, TempMap, _Strategy) ->
+  %% Temp is above the low water mark and must not have been spilled
+  false = is_spilled(Temp, TempMap),
+  {[I], false}. % nothing to do
+
 %%% Fix a shift operation.
 %%% 1. remove pseudos from any explicit memory operands
 %%% 2. if the source is a register or memory position
diff --git a/lib/hipe/x86/hipe_x86_subst.erl b/lib/hipe/x86/hipe_x86_subst.erl
index adc2e9dc65..7db3b23d92 100644
--- a/lib/hipe/x86/hipe_x86_subst.erl
+++ b/lib/hipe/x86/hipe_x86_subst.erl
@@ -59,6 +59,10 @@ insn_temps(SubstTemp, I) ->
       I#jmp_switch{temp=O(T), jtab=jtab_temps(SubstTemp, J)};
     #pseudo_call{'fun'=F} ->
       I#pseudo_call{'fun'=funv_temps(SubstTemp, F)};
+    #pseudo_spill_fmove{src=S, temp=T, dst=D} ->
+      I#pseudo_spill_fmove{src=O(S), temp=O(T), dst=O(D)};
+    #pseudo_spill_move{src=S, temp=T, dst=D} ->
+      I#pseudo_spill_move{src=O(S), temp=O(T), dst=O(D)};
     #pseudo_tailcall{'fun'=F, stkargs=Stk} ->
       I#pseudo_tailcall{'fun'=funv_temps(SubstTemp, F),
 			stkargs=lists:map(O, Stk)};
-- 
cgit v1.2.3


From e7b26d9d6a36b315518cc2743eb179ccec3b7a36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Mon, 6 Mar 2017 18:31:27 +0100
Subject: hipe_spillmin_color: Move coalescing

---
 lib/hipe/opt/hipe_spillmin_color.erl | 62 +++++++++++++++++++++++++++++-------
 1 file changed, 51 insertions(+), 11 deletions(-)

diff --git a/lib/hipe/opt/hipe_spillmin_color.erl b/lib/hipe/opt/hipe_spillmin_color.erl
index 41f1972df7..28a4f4bca5 100644
--- a/lib/hipe/opt/hipe_spillmin_color.erl
+++ b/lib/hipe/opt/hipe_spillmin_color.erl
@@ -212,16 +212,26 @@ build_ig_bb([X|Xs], LiveOut, IG, Target, TempMap, TempMapping) ->
     build_ig_bb(Xs, LiveOut, IG, Target, TempMap, TempMapping),
   build_ig_instr(X, Live, NewIG, Target, TempMap, TempMapping).
 
-build_ig_instr(X, Live, IG, Target, TempMap, TempMapping) ->
+build_ig_instr(X, Live0, IG0, Target, TempMap, TempMapping) ->
   {Def, Use} = def_use(X, Target, TempMap),
-  ?report3("Live ~w\n~w : Def: ~w Use ~w\n",[Live, X, Def,Use]),
+  ?report3("Live ~w\n~w : Def: ~w Use ~w\n",[Live0, X, Def,Use]),
   DefListMapped = list_map(Def, TempMapping, []),
   UseListMapped = list_map(Use, TempMapping, []),
   DefSetMapped = ordsets:from_list(DefListMapped),
   UseSetMapped = ordsets:from_list(UseListMapped),
-  NewIG = interference_arcs(DefListMapped, ordsets:to_list(Live), IG),
-  NewLive = ordsets:union(UseSetMapped, ordsets:subtract(Live, DefSetMapped)),
-  {NewLive, NewIG}.
+  {Live1, IG1} =
+    analyze_move(X, Live0, IG0, Target, DefSetMapped, UseSetMapped),
+  IG = interference_arcs(DefListMapped, ordsets:to_list(Live1), IG1),
+  Live = ordsets:union(UseSetMapped, ordsets:subtract(Live1, DefSetMapped)),
+  {Live, IG}.
+
+analyze_move(X, Live0, IG0, Target, DefSetMapped, UseSetMapped) ->
+  case {is_spill_move(X, Target), DefSetMapped, UseSetMapped} of
+    {true, [Dst], [Src]} ->
+      {ordsets:del_element(Src, Live0), add_move(Src, Dst, IG0)};
+    {_, _, _} ->
+      {Live0, IG0}
+  end.
 
 %% Given a list of Keys and an ets-table returns a list of the elements 
 %% in Mapping corresponding to the Keys and appends Acc to this list.
@@ -382,7 +392,8 @@ select_colors([{X,colorable}|Xs], IG, Cols, PhysRegs) ->
 
 select_color(X, IG, Cols, PhysRegs) ->
   UsedColors = get_colors(neighbors(X, IG), Cols),
-  Reg = select_unused_color(UsedColors, PhysRegs),
+  Preferences = get_colors(move_connected(X, IG), Cols),
+  Reg = select_unused_color(UsedColors, Preferences, PhysRegs),
   {Reg, set_color(X, Reg, Cols)}.
 
 %%%%%%%%%%%%%%%%%%%%
@@ -396,10 +407,14 @@ get_colors([X|Xs], Cols) ->
       [R|get_colors(Xs, Cols)]
   end.
 
-select_unused_color(UsedColors, PhysRegs) ->
+select_unused_color(UsedColors, Preferences, PhysRegs) ->
   Summary = ordsets:from_list(UsedColors),
-  AvailRegs = ordsets:to_list(ordsets:subtract(PhysRegs, Summary)),
-  hd(AvailRegs).
+  case ordsets:subtract(ordsets:from_list(Preferences), Summary) of
+    [PreferredColor|_] -> PreferredColor;
+    _ ->
+      AvailRegs = ordsets:to_list(ordsets:subtract(PhysRegs, Summary)),
+      hd(AvailRegs)
+  end.
 
 push_colored(X, Stk) ->
   [{X, colorable} | Stk].
@@ -456,7 +471,11 @@ init_stackslots(NumSlots, Acc) ->
 %%
 %% Note: later on, we may wish to add 'move-related' support.
 
--record(ig_info, {neighbors = [] :: [_], degree = 0 :: non_neg_integer()}).
+-record(ig_info, {
+	  neighbors = []      :: [_],
+	  degree = 0          :: non_neg_integer(),
+	  move_connected = [] :: [_]
+	 }).
 
 empty_ig(NumNodes) ->
   hipe_vectors:new(NumNodes, #ig_info{}).
@@ -467,16 +486,29 @@ degree(Info) ->
 neighbors(Info) ->
   Info#ig_info.neighbors.
 
+move_connected(Info) ->
+  Info#ig_info.move_connected.
+
 add_edge(X, X, IG) -> IG;
 add_edge(X, Y, IG) ->
   add_arc(X, Y, add_arc(Y, X, IG)).
 
+add_move(X, X, IG) -> IG;
+add_move(X, Y, IG) ->
+  add_move_arc(X, Y, add_move_arc(Y, X, IG)).
+
 add_arc(X, Y, IG) ->
   Info = hipe_vectors:get(IG, X),
   Old = neighbors(Info),
   New = Info#ig_info{neighbors = [Y|Old]},
   hipe_vectors:set(IG,X,New).
 
+add_move_arc(X, Y, IG) ->
+  Info = hipe_vectors:get(IG, X),
+  Old = move_connected(Info),
+  New = Info#ig_info{move_connected = [Y|Old]},
+  hipe_vectors:set(IG,X,New).
+
 normalize_ig(IG) ->
   Size = hipe_vectors:size(IG),
   normalize_ig(Size-1, IG).
@@ -486,7 +518,8 @@ normalize_ig(-1, IG) ->
 normalize_ig(I, IG) ->
   Info = hipe_vectors:get(IG, I),
   N = ordsets:from_list(neighbors(Info)),
-  NewInfo = Info#ig_info{neighbors = N, degree = length(N)},
+  M = ordsets:subtract(ordsets:from_list(move_connected(Info)), N),
+  NewInfo = Info#ig_info{neighbors = N, degree = length(N), move_connected = M},
   NewIG = hipe_vectors:set(IG, I, NewInfo),
   normalize_ig(I-1, NewIG).
 
@@ -494,6 +527,10 @@ neighbors(X, IG) ->
   Info = hipe_vectors:get(IG, X),
   Info#ig_info.neighbors.
 
+move_connected(X, IG) ->
+  Info = hipe_vectors:get(IG, X),
+  Info#ig_info.move_connected.
+
 decrement_degree(X, IG) ->
   Info = hipe_vectors:get(IG, X),
   Degree = degree(Info),
@@ -555,3 +592,6 @@ def_use(X, Target={TgtMod,TgtCtx}, TempMap) ->
 
 reg_names(Regs, {TgtMod,TgtCtx}) ->
   [TgtMod:reg_nr(X,TgtCtx) || X <- Regs].
+
+is_spill_move(Instr, {TgtMod,TgtCtx}) ->
+  TgtMod:is_spill_move(Instr, TgtCtx).
-- 
cgit v1.2.3


From 9e618caac607379e1154e24bc9bd09709cce5d41 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Magnus=20L=C3=A5ng?= <margnus1@telia.com>
Date: Thu, 16 Mar 2017 16:08:17 +0100
Subject: hipe_spillmin_color: cleanup exit calls

---
 lib/hipe/opt/hipe_spillmin_color.erl | 28 +++++++---------------------
 1 file changed, 7 insertions(+), 21 deletions(-)

diff --git a/lib/hipe/opt/hipe_spillmin_color.erl b/lib/hipe/opt/hipe_spillmin_color.erl
index 28a4f4bca5..f87d9a5b61 100644
--- a/lib/hipe/opt/hipe_spillmin_color.erl
+++ b/lib/hipe/opt/hipe_spillmin_color.erl
@@ -166,9 +166,13 @@ remap_temp_map0(Cols, [_Y|Ys], SpillIndex) ->
 %%
 
 build_ig(CFG, Live, Target, TempMap) ->
-  try build_ig0(CFG, Live, Target, TempMap)
-  catch error:Rsn -> exit({regalloc, build_ig, Rsn})
-  end.
+  TempMapping = map_spilled_temporaries(TempMap),
+  TempMappingTable = setup_ets(TempMapping),
+  NumSpilled = length(TempMapping),
+  IG = build_ig_bbs(labels(CFG, Target), CFG, Live, empty_ig(NumSpilled),
+		    Target, TempMap, TempMappingTable),
+  ets:delete(TempMappingTable),
+  {normalize_ig(IG), NumSpilled}.
 
 %% Creates an ETS table consisting of the keys given in List, with the values
 %% being an integer which is the position of the key in List.
@@ -183,15 +187,6 @@ setup_ets0([X|Xs], Table, N) ->
   ets:insert(Table, {X, N}),
   setup_ets0(Xs, Table, N+1).
 
-build_ig0(CFG, Live, Target, TempMap) ->
-  TempMapping = map_spilled_temporaries(TempMap),
-  TempMappingTable = setup_ets(TempMapping),
-  NumSpilled = length(TempMapping),
-  IG = build_ig_bbs(labels(CFG, Target), CFG, Live, empty_ig(NumSpilled),
-		    Target, TempMap, TempMappingTable),
-  ets:delete(TempMappingTable),
-  {normalize_ig(IG), NumSpilled}.
-
 build_ig_bbs([], _CFG, _Live, IG, _Target, _TempMap, _TempMapping) ->
   IG;
 build_ig_bbs([L|Ls], CFG, Live, IG, Target, TempMap, TempMapping) ->
@@ -281,15 +276,6 @@ i_arcs(X, [Y|Ys], IG) ->
 %%     throw an exception (the caller should retry with more stack slots)
 
 color(IG, StackSlots, NumNodes, Target) ->
-  try
-    color_0(IG, StackSlots, NumNodes, Target)
-  catch
-    error:Rsn ->
-      ?error_msg("Coloring failed with ~p~n", [Rsn]),
-      ?EXIT(Rsn)
-  end.
-
-color_0(IG, StackSlots, NumNodes, Target) -> 
   ?report("simplification of IG~n", []),
   K = ordsets:size(StackSlots),
   Nodes = list_ig(IG),
-- 
cgit v1.2.3