From 8041849a8e55281b4d954e63e9415995607e1870 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Gustavsson?= Date: Fri, 16 Aug 2019 16:04:02 +0200 Subject: Optimize deallocate_return instructions Eliminating the CP register and putting continuation pointers directly on the stack made the deallocate_return instruction slower. Try to mitigate this slowdown by specializing deallocate_return for small stack frames. For the move_deallocate_return instruction, reorder instructions to make it possible to execute the read instructions in parallel. --- erts/emulator/beam/instrs.tab | 71 ++++++++++++++++++++++++++++++++++++++++--- erts/emulator/beam/ops.tab | 12 ++++++++ 2 files changed, 79 insertions(+), 4 deletions(-) diff --git a/erts/emulator/beam/instrs.tab b/erts/emulator/beam/instrs.tab index efdba73057..807f4512d1 100644 --- a/erts/emulator/beam/instrs.tab +++ b/erts/emulator/beam/instrs.tab @@ -66,18 +66,81 @@ deallocate(Deallocate) { E = ADD_BYTE_OFFSET(E, $Deallocate); } +deallocate_return0 := dealloc_ret.n0.execute; +deallocate_return1 := dealloc_ret.n1.execute; +deallocate_return2 := dealloc_ret.n2.execute; +deallocate_return3 := dealloc_ret.n3.execute; +deallocate_return4 := dealloc_ret.n4.execute; + +dealloc_ret.head() { + Uint num_bytes; +} + +dealloc_ret.n0() { + num_bytes = (0+1) * sizeof(Eterm); +} + +dealloc_ret.n1() { + num_bytes = (1+1) * sizeof(Eterm); +} + +dealloc_ret.n2() { + num_bytes = (2+1) * sizeof(Eterm); +} + +dealloc_ret.n3() { + num_bytes = (3+1) * sizeof(Eterm); +} + +dealloc_ret.n4() { + num_bytes = (4+1) * sizeof(Eterm); +} + +dealloc_ret.execute() { + //| -no_next + + /* + * Micro-benchmarks showed that the deallocate_return instruction + * became slower when the continuation pointer was moved from + * the process struct to the stack. The reason seems to be read + * dependencies, i.e. that the CPU cannot figure out beforehand + * from which position on the stack the continuation pointer + * should be fetched. 
+ * + * Making sure that num_bytes is always initialized with a + * constant value seems to restore the lost speed. + */ + + E = ADD_BYTE_OFFSET(E, num_bytes); + $RETURN(); + CHECK_TERM(x(0)); + DispatchReturn; +} + deallocate_return(Deallocate) { //| -no_next - int words_to_pop = $Deallocate; - E = ADD_BYTE_OFFSET(E, words_to_pop); + Uint bytes_to_pop = $Deallocate; + E = ADD_BYTE_OFFSET(E, bytes_to_pop); $RETURN(); CHECK_TERM(x(0)); DispatchReturn; } move_deallocate_return(Src, Deallocate) { - x(0) = $Src; - $deallocate_return($Deallocate); + //| -no_next + + /* + * Explicitly do reads first to mitigate the impact of read + * dependencies. + */ + + Uint bytes_to_pop = $Deallocate; + Eterm src = $Src; + E = ADD_BYTE_OFFSET(E, bytes_to_pop); + x(0) = src; + $RETURN(); + CHECK_TERM(x(0)); + DispatchReturn; } // Call instructions diff --git a/erts/emulator/beam/ops.tab b/erts/emulator/beam/ops.tab index f525d126e7..c0ca9260a0 100644 --- a/erts/emulator/beam/ops.tab +++ b/erts/emulator/beam/ops.tab @@ -596,8 +596,20 @@ move S x==0 | deallocate D | return => move_deallocate_return S D move_deallocate_return xycn Q +deallocate u==0 | return => deallocate_return0 +deallocate u==1 | return => deallocate_return1 +deallocate u==2 | return => deallocate_return2 +deallocate u==3 | return => deallocate_return3 +deallocate u==4 | return => deallocate_return4 + deallocate D | return => deallocate_return D +deallocate_return0 +deallocate_return1 +deallocate_return2 +deallocate_return3 +deallocate_return4 + deallocate_return Q test_heap Need u==1 | put_list Y=y x==0 x==0 => test_heap_1_put_list Need Y -- cgit v1.2.3