diff options
author | Björn Gustavsson <[email protected]> | 2019-08-16 16:04:02 +0200 |
---|---|---|
committer | Björn Gustavsson <[email protected]> | 2019-08-22 13:37:41 +0200 |
commit | 8041849a8e55281b4d954e63e9415995607e1870 (patch) | |
tree | 834f0ef2393ca6e8a39222906dd7f3795d049740 /erts/emulator | |
parent | 25fe3fb23c594d735cb6ebae120910e44f0cdae4 (diff) | |
download | otp-8041849a8e55281b4d954e63e9415995607e1870.tar.gz otp-8041849a8e55281b4d954e63e9415995607e1870.tar.bz2 otp-8041849a8e55281b4d954e63e9415995607e1870.zip |
Optimize deallocate_return instructions
Eliminating the CP register and putting continuation
pointers directly on the stack made the deallocate_return
instruction slower.
Try to mitigate this slowdown by specializing deallocate_return
for small stack frames. For the move_deallocate_return instruction,
reorder instructions to make it possible to execute the read
instructions in parallel.
Diffstat (limited to 'erts/emulator')
-rw-r--r-- | erts/emulator/beam/instrs.tab | 71 | ||||
-rw-r--r-- | erts/emulator/beam/ops.tab | 12 |
2 files changed, 79 insertions, 4 deletions
diff --git a/erts/emulator/beam/instrs.tab b/erts/emulator/beam/instrs.tab index efdba73057..807f4512d1 100644 --- a/erts/emulator/beam/instrs.tab +++ b/erts/emulator/beam/instrs.tab @@ -66,18 +66,81 @@ deallocate(Deallocate) { E = ADD_BYTE_OFFSET(E, $Deallocate); } +deallocate_return0 := dealloc_ret.n0.execute; +deallocate_return1 := dealloc_ret.n1.execute; +deallocate_return2 := dealloc_ret.n2.execute; +deallocate_return3 := dealloc_ret.n3.execute; +deallocate_return4 := dealloc_ret.n4.execute; + +dealloc_ret.head() { + Uint num_bytes; +} + +dealloc_ret.n0() { + num_bytes = (0+1) * sizeof(Eterm); +} + +dealloc_ret.n1() { + num_bytes = (1+1) * sizeof(Eterm); +} + +dealloc_ret.n2() { + num_bytes = (2+1) * sizeof(Eterm); +} + +dealloc_ret.n3() { + num_bytes = (3+1) * sizeof(Eterm); +} + +dealloc_ret.n4() { + num_bytes = (4+1) * sizeof(Eterm); +} + +dealloc_ret.execute() { + //| -no_next + + /* + * Micro-benchmarks showed that the deallocate_return instruction + * became slower when the continuation pointer was moved from + * the process struct to the stack. The reason seems to be read + * dependencies, i.e. that the CPU cannot figure out beforehand + * from which position on the stack the continuation pointer + * should be fetched. + * + * Making sure that num_bytes is always initialized with a + * constant value seems to restore the lost speed. + */ + + E = ADD_BYTE_OFFSET(E, num_bytes); + $RETURN(); + CHECK_TERM(x(0)); + DispatchReturn; +} + deallocate_return(Deallocate) { //| -no_next - int words_to_pop = $Deallocate; - E = ADD_BYTE_OFFSET(E, words_to_pop); + Uint bytes_to_pop = $Deallocate; + E = ADD_BYTE_OFFSET(E, bytes_to_pop); $RETURN(); CHECK_TERM(x(0)); DispatchReturn; } move_deallocate_return(Src, Deallocate) { - x(0) = $Src; - $deallocate_return($Deallocate); + //| -no_next + + /* + * Explicitly do reads first to mitigate the impact of read + * dependencies. 
+ */ + + Uint bytes_to_pop = $Deallocate; + Eterm src = $Src; + E = ADD_BYTE_OFFSET(E, bytes_to_pop); + x(0) = src; + $RETURN(); + CHECK_TERM(x(0)); + DispatchReturn; } // Call instructions diff --git a/erts/emulator/beam/ops.tab b/erts/emulator/beam/ops.tab index f525d126e7..c0ca9260a0 100644 --- a/erts/emulator/beam/ops.tab +++ b/erts/emulator/beam/ops.tab @@ -596,8 +596,20 @@ move S x==0 | deallocate D | return => move_deallocate_return S D move_deallocate_return xycn Q +deallocate u==0 | return => deallocate_return0 +deallocate u==1 | return => deallocate_return1 +deallocate u==2 | return => deallocate_return2 +deallocate u==3 | return => deallocate_return3 +deallocate u==4 | return => deallocate_return4 + deallocate D | return => deallocate_return D +deallocate_return0 +deallocate_return1 +deallocate_return2 +deallocate_return3 +deallocate_return4 + deallocate_return Q test_heap Need u==1 | put_list Y=y x==0 x==0 => test_heap_1_put_list Need Y |