aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Björn Gustavsson <[email protected]> 2019-08-16 16:04:02 +0200
committer Björn Gustavsson <[email protected]> 2019-08-22 13:37:41 +0200
commit 8041849a8e55281b4d954e63e9415995607e1870 (patch)
tree 834f0ef2393ca6e8a39222906dd7f3795d049740
parent 25fe3fb23c594d735cb6ebae120910e44f0cdae4 (diff)
download otp-8041849a8e55281b4d954e63e9415995607e1870.tar.gz
otp-8041849a8e55281b4d954e63e9415995607e1870.tar.bz2
otp-8041849a8e55281b4d954e63e9415995607e1870.zip
Optimize deallocate_return instructions
Eliminating the CP register and putting continuation pointers directly on the stack made the deallocate_return instruction slower. Try to mitigate this slowdown by specializing deallocate_return for small stack frames. For the move_deallocate_return instruction, reorder the instructions so that the reads can be executed in parallel.
-rw-r--r-- erts/emulator/beam/instrs.tab 71
-rw-r--r-- erts/emulator/beam/ops.tab 12
2 files changed, 79 insertions, 4 deletions
diff --git a/erts/emulator/beam/instrs.tab b/erts/emulator/beam/instrs.tab
index efdba73057..807f4512d1 100644
--- a/erts/emulator/beam/instrs.tab
+++ b/erts/emulator/beam/instrs.tab
@@ -66,18 +66,81 @@ deallocate(Deallocate) {
E = ADD_BYTE_OFFSET(E, $Deallocate);
}
+deallocate_return0 := dealloc_ret.n0.execute;
+deallocate_return1 := dealloc_ret.n1.execute;
+deallocate_return2 := dealloc_ret.n2.execute;
+deallocate_return3 := dealloc_ret.n3.execute;
+deallocate_return4 := dealloc_ret.n4.execute;
+
+dealloc_ret.head() {
+ Uint num_bytes;
+}
+
+dealloc_ret.n0() {
+ num_bytes = (0+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n1() {
+ num_bytes = (1+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n2() {
+ num_bytes = (2+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n3() {
+ num_bytes = (3+1) * sizeof(Eterm);
+}
+
+dealloc_ret.n4() {
+ num_bytes = (4+1) * sizeof(Eterm);
+}
+
+dealloc_ret.execute() {
+ //| -no_next
+
+ /*
+ * Micro-benchmarks showed that the deallocate_return instruction
+ * became slower when the continuation pointer was moved from
+ * the process struct to the stack. The reason seems to be read
+ * dependencies, i.e. that the CPU cannot figure out beforehand
+ * from which position on the stack the continuation pointer
+ * should be fetched.
+ *
+ * Making sure that num_bytes is always initialized with a
+ * constant value seems to restore the lost speed.
+ */
+
+ E = ADD_BYTE_OFFSET(E, num_bytes);
+ $RETURN();
+ CHECK_TERM(x(0));
+ DispatchReturn;
+}
+
deallocate_return(Deallocate) {
//| -no_next
- int words_to_pop = $Deallocate;
- E = ADD_BYTE_OFFSET(E, words_to_pop);
+ Uint bytes_to_pop = $Deallocate;
+ E = ADD_BYTE_OFFSET(E, bytes_to_pop);
$RETURN();
CHECK_TERM(x(0));
DispatchReturn;
}
move_deallocate_return(Src, Deallocate) {
- x(0) = $Src;
- $deallocate_return($Deallocate);
+ //| -no_next
+
+ /*
+ * Explicitly do reads first to mitigate the impact of read
+ * dependencies.
+ */
+
+ Uint bytes_to_pop = $Deallocate;
+ Eterm src = $Src;
+ E = ADD_BYTE_OFFSET(E, bytes_to_pop);
+ x(0) = src;
+ $RETURN();
+ CHECK_TERM(x(0));
+ DispatchReturn;
}
// Call instructions
diff --git a/erts/emulator/beam/ops.tab b/erts/emulator/beam/ops.tab
index f525d126e7..c0ca9260a0 100644
--- a/erts/emulator/beam/ops.tab
+++ b/erts/emulator/beam/ops.tab
@@ -596,8 +596,20 @@ move S x==0 | deallocate D | return => move_deallocate_return S D
move_deallocate_return xycn Q
+deallocate u==0 | return => deallocate_return0
+deallocate u==1 | return => deallocate_return1
+deallocate u==2 | return => deallocate_return2
+deallocate u==3 | return => deallocate_return3
+deallocate u==4 | return => deallocate_return4
+
deallocate D | return => deallocate_return D
+deallocate_return0
+deallocate_return1
+deallocate_return2
+deallocate_return3
+deallocate_return4
+
deallocate_return Q
test_heap Need u==1 | put_list Y=y x==0 x==0 => test_heap_1_put_list Need Y