Introduce specialized versions of move2

Currently, move2/4 does the two moves sequentially to ensure that the instruction will always work correctly. We can do better than that. If the two move instructions have any registers in common, we can introduce simpler and slightly more efficient instructions to handle those cases: move_shift/3 and move_dup/3. For the remaining cases, when the move instructions have no common registers, the move2/4 instruction can perform the moves in parallel, which is probably slightly more efficient. For clarity's sake, we will rename the instruction to move2_par/4.
author: Björn Gustavsson <[email protected]> 2015-06-23 17:31:49 +0200
committer: Björn Gustavsson <[email protected]> 2015-07-03 14:34:58 +0200
commit: 3dc2bee53b4d36f41821a6ab512cf01c958c11f9 (patch)
tree: ad25673c3d7ab6532805713a1a2ac946d9d7f8ce
parent: 1f73d45327bb13a615f2f0a8d9d4888ddacb95a5 (diff)
download: otp-3dc2bee53b4d36f41821a6ab512cf01c958c11f9.tar.gz
otp-3dc2bee53b4d36f41821a6ab512cf01c958c11f9.tar.bz2
otp-3dc2bee53b4d36f41821a6ab512cf01c958c11f9.zip
2 files changed, 68 insertions, 21 deletions
diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c
index bac7057abe..2c10e7ae7c 100644
--- a/erts/emulator/beam/beam_emu.c
+++ b/erts/emulator/beam/beam_emu.c
@@ -551,7 +551,23 @@ void** beam_ops;
        Store(term, Dst);           \
    } while (0)
 
-#define Move2(S1, D1, S2, D2) D1 = (S1); D2 = (S2)
+#define Move2Par(S1, D1, S2, D2)		\
+  do {						\
+      Eterm V1, V2;				\
+      V1 = (S1); V2 = (S2); D1 = V1; D2 = V2;	\
+  } while (0)
+
+#define MoveShift(Src, SD, D)			\
+  do {						\
+    Eterm V;					\
+    V = Src; D = SD; SD = V;			\
+  } while (0)
+
+#define MoveDup(Src, D1, D2)			\
+  do {						\
+    D1 = D2 = (Src);				\
+  } while (0)
+
 #define Move3(S1, D1, S2, D2, S3, D3) D1 = (S1); D2 = (S2); D3 = (S3)
 
 #define MoveReturn(Src)				\
diff --git a/erts/emulator/beam/ops.tab b/erts/emulator/beam/ops.tab
index 4ab08faae3..97525c3b72 100644
--- a/erts/emulator/beam/ops.tab
+++ b/erts/emulator/beam/ops.tab
@@ -303,21 +303,40 @@ move_window3 x x x y
 move_window4 x x x x y
 move_window5 x x x x x y
 
-move X1=x Y1=y | move X2=x Y2=y => move2 X1 Y1 X2 Y2
-move Y1=y X1=x | move Y2=y X2=x => move2 Y1 X1 Y2 X2
-move X1=x X2=x | move X3=x X4=x => move2 X1 X2 X3 X4
+move Src=x D1=x | move Src=x D2=x => move_dup Src D1 D2
+move Src=x SD=x | move SD=x D=x   => move_dup Src SD D
+move Src=x D1=x | move Src=x D2=y => move_dup Src D1 D2
+move Src=y SD=x | move SD=x D=y   => move_dup Src SD D
+move Src=x SD=x | move SD=x D=y   => move_dup Src SD D
+move Src=y SD=x | move SD=x D=x   => move_dup Src SD D
 
-move X1=x X2=x | move X3=x Y1=y => move2 X1 X2 X3 Y1
+move SD=x D=x | move Src=xy SD=x => move_shift Src SD D
+move SD=y D=x | move Src=x  SD=y => move_shift Src SD D
+move SD=x D=y | move Src=x  SD=x => move_shift Src SD D
 
-move S1=x S2=x | move X1=x Y1=y => move2 S1 S2 X1 Y1
-move S1=y S2=x | move X1=x Y1=y => move2 S1 S2 X1 Y1
+# The transformations above guarantee that the source for
+# the second move is not the same as the destination for
+# the first move. That means that we can do the moves in
+# parallel (fetch both values, then store them) which could
+# be faster.
 
-move Y1=y X1=x | move S1=x D1=x => move2 Y1 X1 S1 D1
-move S1=x D1=x | move Y1=y X1=x => move2 S1 D1 Y1 X1
+move X1=x Y1=y | move X2=x Y2=y => move2_par X1 Y1 X2 Y2
+move Y1=y X1=x | move Y2=y X2=x => move2_par Y1 X1 Y2 X2
 
-move2 X1=x Y1=y X2=x Y2=y | move X3=x Y3=y => move3 X1 Y1 X2 Y2 X3 Y3
-move2 Y1=y X1=x Y2=y X2=x | move Y3=y X3=x => move3 Y1 X1 Y2 X2 Y3 X3
-move2 X1=x X2=x X3=x X4=x | move X5=x X6=x => move3 X1 X2 X3 X4 X5 X6
+move X1=x X2=x | move X3=x X4=x => move2_par X1 X2 X3 X4
+
+move X1=x X2=x | move X3=x Y1=y => move2_par X1 X2 X3 Y1
+
+move S1=x S2=x | move X1=x Y1=y => move2_par S1 S2 X1 Y1
+
+move S1=y S2=x | move X1=x Y1=y => move2_par S1 S2 X1 Y1
+
+move Y1=y X1=x | move S1=x D1=x => move2_par Y1 X1 S1 D1
+move S1=x D1=x | move Y1=y X1=x => move2_par S1 D1 Y1 X1
+
+move2_par X1=x Y1=y X2=x Y2=y | move X3=x Y3=y => move3 X1 Y1 X2 Y2 X3 Y3
+move2_par Y1=y X1=x Y2=y X2=x | move Y3=y X3=x => move3 Y1 X1 Y2 X2 Y3 X3
+move2_par X1=x X2=x X3=x X4=x | move X5=x X6=x => move3 X1 X2 X3 X4 X5 X6
 
 move C=aiq X=x==1 => move_x1 C
 move C=aiq X=x==2 => move_x2 C
@@ -325,18 +344,30 @@ move C=aiq X=x==2 => move_x2 C
 move_x1 c
 move_x2 c
 
-%macro: move2 Move2 -pack
-move2 x y x y
-move2 y x y x
-move2 x x x x
+%macro: move_shift MoveShift -pack
+move_shift x x x
+move_shift y x x
+move_shift x y x
+move_shift x x y
+
+%macro: move_dup MoveDup -pack
+move_dup x x x
+move_dup x x y
+move_dup y x x
+move_dup y x y
+
+%macro: move2_par Move2Par -pack
+
+move2_par x y x y
+move2_par y x y x
+move2_par x x x x
 
-move2 x x x y
+move2_par x x x y
 
-move2 x y x y
-move2 y x x y
+move2_par y x x y
 
-move2 x x y x
-move2 y x x x
+move2_par x x y x
+move2_par y x x x
 
 %macro: move3 Move3
 move3 x y x y x y
author	Björn Gustavsson <[email protected]>	2015-06-23 17:31:49 +0200
committer	Björn Gustavsson <[email protected]>	2015-07-03 14:34:58 +0200
commit	3dc2bee53b4d36f41821a6ab512cf01c958c11f9 (patch)
tree	ad25673c3d7ab6532805713a1a2ac946d9d7f8ce
parent	1f73d45327bb13a615f2f0a8d9d4888ddacb95a5 (diff)
download	otp-3dc2bee53b4d36f41821a6ab512cf01c958c11f9.tar.gz otp-3dc2bee53b4d36f41821a6ab512cf01c958c11f9.tar.bz2 otp-3dc2bee53b4d36f41821a6ab512cf01c958c11f9.zip