1 files changed, 453 insertions, 250 deletions
diff --git a/erts/emulator/beam/ops.tab b/erts/emulator/beam/ops.tab
index e76d896ffc..b9d4f6afcc 100644
--- a/erts/emulator/beam/ops.tab
+++ b/erts/emulator/beam/ops.tab
@@ -1,7 +1,7 @@
 #
 # %CopyrightBegin%
 #
-# Copyright Ericsson AB 1997-2018. All Rights Reserved.
+# Copyright Ericsson AB 1997-2019. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -74,23 +74,19 @@ trace_jump W
 
 return
 
+# To ensure that a "move Src x(0)" instruction can be combined with
+# the following call instruction, we need to make sure that there is
+# no line/1 instruction between the move and the call.
 #
-# To ensure that a "move Src x(0)" instruction can be combined
-# with the following call instruction, we need to make sure that
-# there is no line/1 instruction between the move and the call.
-#
-# A tail-recursive call to an external function (non-BIF) will
-# never be saved on the stack, so there is no reason to keep
-# the line instruction. (The compiler did not remove the line
-# instruction because it cannot tell the difference between
-# BIFs and ordinary Erlang functions.)
-#
+# A tail-recursive call to an external function (BIF or non-BIF) will
+# never be saved on the stack, so there is no reason to keep the line
+# instruction.
 
 move S X0=x==0 | line Loc | call_ext Ar Func => \
      line Loc | move S X0 | call_ext Ar Func
-move S X0=x==0 | line Loc | call_ext_last Ar Func=u$is_not_bif D => \
+move S X0=x==0 | line Loc | call_ext_last Ar Func D => \
      move S X0 | call_ext_last Ar Func D
-move S X0=x==0 | line Loc | call_ext_only Ar Func=u$is_not_bif => \
+move S X0=x==0 | line Loc | call_ext_only Ar Func => \
      move S X0 | call_ext_only Ar Func
 move S X0=x==0 | line Loc | call Ar Func => \
      line Loc | move S X0 | call Ar Func
@@ -102,15 +98,18 @@ line I
 allocate t t?
 allocate_heap t I t?
 
-%cold
+# This instruction is used when a BIF is called tail-recursively and
+# there is a stack frame.
 deallocate Q
-%hot
 
 init y
 allocate_zero t t?
 allocate_heap_zero t I t?
 
+move Src=y Dst=x | trim N Remaining => move_trim Src Dst N
 trim N Remaining => i_trim N
+
+move_trim y x t
 i_trim t
 
 test_heap I t?
@@ -118,11 +117,21 @@ test_heap I t?
 allocate_heap S u==0 R => allocate S R
 allocate_heap_zero S u==0 R => allocate_zero S R
 
-init2 y y
-init3 y y y
+init Y1 | init Y2 | init Y3 | succ(Y1,Y2) | succ(Y2,Y3) => init_seq3 Y1
+init_seq3 Y1 | init Y4 | succ3(Y1,Y4) => init_seq4 Y1
+init_seq4 Y1 | init Y5 | succ4(Y1,Y5) => init_seq5 Y1
+
+init_seq3 y
+init_seq4 y
+init_seq5 y
+
 init Y1 | init Y2 | init Y3 => init3 Y1 Y2 Y3
 init Y1 | init Y2 => init2 Y1 Y2
 
+init2 y y
+init3 y y y
+
+
 # Selecting values
 
 select_val S=aiq Fail=f Size=u Rest=* => const_select_val(S, Fail, Size, Rest)
@@ -205,14 +214,11 @@ set_tuple_element s S P
 
 # Get tuple element
 
-i_get_tuple_element xy P x
-
-%cold
-i_get_tuple_element xy P y
-%hot
+i_get_tuple_element xy P xy
 
 i_get_tuple_element2 x P x
-i_get_tuple_element2y x P y y
+i_get_tuple_element2_dst x P x x
+i_get_tuple_element2_dst x P y y
 
 i_get_tuple_element3 x P x
 
@@ -258,12 +264,14 @@ system_limit j
 # Move instructions.
 #
 
-move C=cxy x==0 | jump Lbl => move_jump Lbl C
+move Src=cxy Dst=xy | jump Lbl => move_jump Lbl Src Dst
 
-move_jump f ncxy
+move_jump f cxy xy
+move_jump f c r
 
-# Movement to and from the stack is common
-# Try to pack as much as we can into one instruction
+
+# Movement to and from the stack is common.
+# Try to pack as much as we can into one instruction.
 
 # Window move
 move_window/5
@@ -274,6 +282,9 @@ move_window/6
 move X1=x Y1=y | move X2=x Y2=y | move X3=x Y3=y | succ(Y1,Y2) | succ(Y2,Y3) => \
     move_window X1 X2 X3 Y1 Y3
 
+move X1=x Y1=y | move X2=x Y2=y | succ(Y1,Y2) => \
+    move_window2 X1 X2 Y1
+
 move_window X1=x X2=x X3=x Y1=y Y3=y | move X4=x Y4=y | succ(Y3,Y4) => \
     move_window X1 X2 X3 X4 Y1 Y4
 
@@ -283,15 +294,54 @@ move_window X1=x X2=x X3=x X4=x Y1=y Y4=y | move X5=x Y5=y | succ(Y4,Y5) => \
 move_window X1=x X2=x X3=x Y1=y Y3=y => move_window3 X1 X2 X3 Y1
 move_window X1=x X2=x X3=x X4=x Y1=y Y4=y => move_window4 X1 X2 X3 X4 Y1
 
+move_window2 x x y
 move_window3 x x x y
 move_window4 x x x x y
 move_window5 x x x x x y
 
+# y -> x
+
+move_src_window/4
+move_src_window/5
+
+move Y1=y X1=x | move Y2=y X2=x | succ(Y1, Y2) => \
+    move_src_window Y1 Y2 X1 X2
+
+move_src_window Y1 Y2 X1 X2 | move Y3=y X3=x | succ(Y2, Y3) => \
+    move_src_window Y1 Y3 X1 X2 X3
+move_src_window Y1 Y2 X1 X2 | move Y3=y X3=x | move Y4=y X4=x | succ(Y3, Y4) => \
+    move_src_window2 Y1 X1 X2 | move_src_window Y3 Y4 X3 X4
+move_src_window Y1 Y2 X1 X2 | move Y3=y X3=x => \
+    move3 Y1 X1 Y2 X2 Y3 X3
+
+move_src_window Y1 Y3 X1 X2 X3 | move Y4=y X4=x | succ(Y3, Y4) => \
+    move_src_window4 Y1 X1 X2 X3 X4
+
+move_src_window Y1 y X1 X2    => move_src_window2 Y1 X1 X2
+move_src_window Y1 y X1 X2 X3 => move_src_window3 Y1 X1 X2 X3
+
+move_src_window2 y x x
+move_src_window3 y x x x
+move_src_window4 y x x x x
+
 # Swap registers.
-move R1=x Tmp=x | move R2=xy R1 | move Tmp R2 => swap_temp R1 R2 Tmp
+move R1=xy Tmp=x | move R2=xy R1 | move Tmp R2 => swap_temp R1 R2 Tmp
+
+# The compiler uses x(1022) when swapping registers. It will definitely
+# not be used again.
+swap_temp R1 R2 Tmp=x==1022 => swap R1 R2
+
+swap_temp R1 R2 Tmp | move Src Tmp => swap R1 R2 | move Src Tmp
 
 swap_temp R1 R2 Tmp | line Loc | apply Live | is_killed_apply(Tmp, Live) => \
   swap R1 R2 | line Loc | apply Live
+swap_temp R1 R2 Tmp | line Loc | apply_last Live D | is_killed_apply(Tmp, Live) => \
+  swap R1 R2 | line Loc | apply_last Live D
+
+swap_temp R1 R2 Tmp | line Loc | call_fun Live | is_killed_by_call_fun(Tmp, Live) => \
+  swap R1 R2 | line Loc | call_fun Live
+swap_temp R1 R2 Tmp | make_fun2 OldIndex=u | is_killed_by_make_fun(Tmp, OldIndex) => \
+  swap R1 R2 | make_fun2 OldIndex
 
 swap_temp R1 R2 Tmp | line Loc | call Live Addr | is_killed(Tmp, Live) => \
   swap R1 R2 | line Loc | call Live Addr
@@ -307,84 +357,112 @@ swap_temp R1 R2 Tmp | line Loc | call_ext_only Live Addr | \
 swap_temp R1 R2 Tmp | line Loc | call_ext_last Live Addr D | \
   is_killed(Tmp, Live) => swap R1 R2 | line Loc | call_ext_last Live Addr D
 
-swap_temp x xy x
+swap_temp R1 R2 Tmp | call_ext Live Addr | is_killed(Tmp, Live) => \
+  swap R1 R2 | call_ext Live Addr
+swap_temp R1 R2 Tmp | call_ext_only Live Addr | is_killed(Tmp, Live) => \
+  swap R1 R2 | call_ext_only Live Addr
+swap_temp R1 R2 Tmp | call_ext_last Live Addr D | is_killed(Tmp, Live) => \
+  swap R1 R2 | call_ext_last Live Addr D
+
+swap_temp R1 R2 Tmp | move Src Any | line Loc | call Live Addr | \
+  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
+     swap R1 R2 | move Src Any | line Loc | call Live Addr
+swap_temp R1 R2 Tmp | move Src Any | line Loc | call_ext Live Addr | \
+  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
+     swap R1 R2 | move Src Any | line Loc | call_ext Live Addr
+swap_temp R1 R2 Tmp | move Src Any | call_only Live Addr | \
+  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
+    swap R1 R2 | move Src Any | call_only Live Addr
+swap_temp R1 R2 Tmp | move Src Any | line Loc | call_ext_only Live Addr | \
+  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
+    swap R1 R2 | move Src Any | line Loc | call_ext_only Live Addr
+swap_temp R1 R2 Tmp | move Src Any | line Loc | call_fun Live | \
+  is_killed(Tmp, Live) | distinct(Tmp, Src) => \
+    swap R1 R2 | move Src Any | line Loc | call_fun Live
+
+swap_temp R1 R2 Tmp | line Loc | send | is_killed_by_send(Tmp) => \
+  swap R1 R2 | line Loc | send
+
+# swap_temp/3 instructions with Y register operands are rare.
+swap_temp R1 R2=y Tmp => swap R1 R2 | move R2 Tmp
+swap_temp R1=y R2 Tmp => swap R1 R2 | move R2 Tmp
+
+swap R1=x R2=y => swap R2 R1
+
+swap_temp x x x
+
+swap xy x
+swap y y
+
+# move_shift
+
+move SD=x    D=x | move Src=cxy SD=x  | distinct(D, Src) => move_shift Src SD D
+move SD=y    D=x | move Src=x  SD=y   | distinct(D, Src) => move_shift Src SD D
+move SD=y    D=x | init SD            |                  => move_shift n   SD D
+move SD=x    D=y | move Src=x  SD=x   | distinct(D, Src) => move_shift Src SD D
+move SD=x==0 D=y | move Src=y SD=x==0 | distinct(D, Src) => move_shift Src SD D
+
+move_shift cxy x x
+move_shift nx y x
+move_shift x x y
+move_shift y r y
+
+# move2_par x x x x
 
-swap x xy
+move X1=x X2=x | move X3=x X4=x | independent_moves(X1, X2, X3, X4) => \
+     move2_par X1 X2 X3 X4
+move2_par x x x x
 
-move Src=x D1=x | move Src=x D2=x => move_dup Src D1 D2
-move Src=x SD=x | move SD=x D=x   => move_dup Src SD D
-move Src=x D1=x | move Src=x D2=y => move_dup Src D1 D2
-move Src=y SD=x | move SD=x D=y   => move_dup Src SD D
-move Src=x SD=x | move SD=x D=y   => move_dup Src SD D
-move Src=y SD=x | move SD=x D=x   => move_dup Src SD D
+# move2_par x x x y
 
-move SD=x D=x | move Src=xy SD=x => move_shift Src SD D
-move SD=y D=x | move Src=x  SD=y => move_shift Src SD D
-move SD=x D=y | move Src=x  SD=x => move_shift Src SD D
+move X1=x X2=x | move X3=x Y1=y | independent_moves(X1, X2, X3, Y1) => \
+     move2_par X1 X2 X3 Y1
+move X3=x Y1=y | move X1=x X2=x | independent_moves(X3, Y1, X1, X2) => \
+     move2_par X1 X2 X3 Y1
+move2_par x x x y
 
-# The transformations above guarantee that the source for
-# the second move is not the same as the destination for
-# the first move. That means that we can do the moves in
-# parallel (fetch both values, then store them) which could
-# be faster.
+# move2_par y x y x
 
-move X1=x Y1=y | move X2=x Y2=y => move2_par X1 Y1 X2 Y2
 move Y1=y X1=x | move Y2=y X2=x => move2_par Y1 X1 Y2 X2
+move2_par y x y x
 
-move X1=x X2=x | move X3=x X4=x => move2_par X1 X2 X3 X4
+# move2_par y x x y
 
-move X1=x X2=x | move X3=x Y1=y => move2_par X1 X2 X3 Y1
+move S1=y S2=x | move X1=x Y1=y | independent_moves(S1, S2, X1, Y1) => \
+     move2_par S1 S2 X1 Y1
+move X1=x Y1=y | move S1=y S2=x | independent_moves(S1, S2, X1, Y1) => \
+     move2_par S1 S2 X1 Y1
+move2_par y x x y
 
-move S1=x S2=x | move X1=x Y1=y => move2_par S1 S2 X1 Y1
+# move2_par y x x x
 
-move S1=y S2=x | move X1=x Y1=y => move2_par S1 S2 X1 Y1
+move Y1=y X1=x | move S1=x D1=x | independent_moves(Y1, X1, S1, D1) => \
+     move2_par Y1 X1 S1 D1
+move S1=x D1=x | move Y1=y X1=x | independent_moves(Y1, X1, S1, D1) => \
+     move2_par Y1 X1 S1 D1
+move2_par y x x x
 
-move Y1=y X1=x | move S1=x D1=x => move2_par Y1 X1 S1 D1
-move S1=x D1=x | move Y1=y X1=x => move2_par S1 D1 Y1 X1
+# move3
 
-move2_par X1=x Y1=y X2=x Y2=y | move X3=x Y3=y => move3 X1 Y1 X2 Y2 X3 Y3
 move2_par Y1=y X1=x Y2=y X2=x | move Y3=y X3=x => move3 Y1 X1 Y2 X2 Y3 X3
 move2_par X1=x X2=x X3=x X4=x | move X5=x X6=x => move3 X1 X2 X3 X4 X5 X6
 
+move3 y x y x y x
+move3 x x x x x x
+
+# move_x1, move_x2
+
 move C=aiq X=x==1 => move_x1 C
 move C=aiq X=x==2 => move_x2 C
 
+move n D=y => init D
+
 move_x1 c
 move_x2 c
 
-move_shift x x x
-move_shift y x x
-move_shift x y x
-move_shift x x y
-
-move_dup xy x xy
-
-move2_par x y x y
-move2_par y x y x
-move2_par x x x x
-
-move2_par x x x y
-
-move2_par y x x y
-
-move2_par x x y x
-move2_par y x x x
-
-move3 x y x y x y
-move3 y x y x y x
-move3 x x x x x x
-
-# The compiler almost never generates a "move Literal y(Y)" instruction,
-# so let's cheat if we encounter one.
-move S=n D=y => init D
-move S=c D=y => move S x | move x D
-
-move x x
-move x y
-move y x
-move c x
+move xy xy
+move c xy
 move n x
-move y y
 
 # The following move instructions using x(0) are frequently used.
 
@@ -478,14 +556,25 @@ is_ge f? c x
 is_ge f? s s
 %hot
 
-is_eq f? s s
+is_eq Fail=f Const=c Reg=xy => is_eq Fail Reg Const
+is_eq Fail=f C1=c C2=c => move C1 x | is_eq Fail x C2
+is_eq f? S s
 
-is_ne f? s s
+is_ne Fail=f Const=c Reg=xy => is_ne Fail Reg Const
+is_ne Fail=f C1=c C2=c => move C1 x | is_ne Fail x C2
+is_ne f? S s
 
 #
-# Putting things.
+# Putting tuples.
+#
+# Code compiled with OTP 22 and later uses put_tuple2 to
+# construct a tuple.
+#
+# Code compiled before OTP 22 uses put_tuple + one put instruction
+# per element. Translate to put_tuple2.
 #
 
+i_put_tuple/2
 put_tuple Arity Dst => i_put_tuple Dst u
 
 i_put_tuple Dst Arity Puts=* | put S1 | put S2 | \
@@ -495,11 +584,13 @@ i_put_tuple Dst Arity Puts=* | put S1 | put S2 | \
 i_put_tuple Dst Arity Puts=* | put S => \
 	    tuple_append_put(Arity, Dst, Puts, S)
 
-i_put_tuple/2
+i_put_tuple Dst Arity Puts=* => put_tuple2 Dst Arity Puts
 
-i_put_tuple xy I
+put_tuple2 xy I
 
 #
+# Putting lists.
+#
 # The instruction "put_list Const [] Dst" were generated in rare
 # circumstances up to and including OTP 18. Starting with OTP 19,
 # AFAIK, it should never be generated.
@@ -510,32 +601,26 @@ put_list Src Dst=x Dst => update_list Src Dst
 
 update_list xyc x
 
-put_list x n x
-put_list y n x
-put_list x x x
-put_list y x x
+# put_list SrcReg1 SrcReg2 => Dst
+
+put_list xy xy x
 
-put_list y y x
-put_list x y x
+# put_list SrcReg [] => Dst
 
-# put_list SrcReg Constant Dst
+put_list xy n xy
 
-put_list x c x
-put_list x c y
+# put_list SrcReg Constant => x
 
-put_list y c x
+put_list xy c x
 
-# put_list Constant SrcReg Dst
+# put_list Constant SrcReg => Dst
 
-put_list c x x
-put_list c y x
+put_list c xy x
 
 # The following put_list instructions using x(0) are frequently used.
 
-put_list r n r
-put_list r n x
-put_list r x x
-put_list r x r
+put_list r n rx
+put_list r x rx
 put_list x x r
 
 %cold
@@ -602,19 +687,36 @@ is_tuple f? rxy
 
 test_arity Fail Literal=q Arity => move Literal x | test_arity Fail x Arity
 test_arity Fail=f c Arity => jump Fail
+test_arity Fail Tuple=x Arity | get_tuple_element Tuple Pos Dst=x => \
+   test_arity_get_tuple_element Fail Tuple Arity Pos Dst
 
 test_arity f? xy A
 
-get_tuple_element Reg=x P1 D1=x | get_tuple_element Reg=x P2 D2=x | \
+test_arity_get_tuple_element f? x A P x
+
+is_tuple NotTupleFail Tuple=x | is_tagged_tuple WrongRecordFail Tuple Arity Atom => \
+   is_tagged_tuple_ff NotTupleFail WrongRecordFail Tuple Arity Atom
+
+is_tagged_tuple_ff f? f? rx A a
+
+get_tuple_element Reg=x P1 D1=x | \
+   get_tuple_element Reg=x P2 D2=x | \
    get_tuple_element Reg=x P3 D3=x | \
-   succ(P1, P2) | succ(P2, P3) | \
-   succ(D1, D2) | succ(D2, D3) => i_get_tuple_element3 Reg P1 D1
+   succ(P1, P2) | succ(P2, P3) | succ(D1, D2) | succ(D2, D3) | \
+   distinct(D1, Reg) | distinct(D2, Reg) => \
+      i_get_tuple_element3 Reg P1 D1
+
+get_tuple_element Reg=x P1 D1=x | \
+   get_tuple_element Reg=x P2 D2=x | \
+   succ(P1, P2) | succ(D1, D2) | \
+   distinct(D1, Reg) => \
+      i_get_tuple_element2 Reg P1 D1
 
 get_tuple_element Reg=x P1 D1=x | get_tuple_element Reg=x P2 D2=x | \
-   succ(P1, P2) | succ(D1, D2) => i_get_tuple_element2 Reg P1 D1
+   succ(P1, P2) | distinct(D1, Reg) => i_get_tuple_element2_dst Reg P1 D1 D2
 
 get_tuple_element Reg=x P1 D1=y | get_tuple_element Reg=x P2 D2=y | \
-   succ(P1, P2) => i_get_tuple_element2y Reg P1 D1 D2
+   succ(P1, P2) => i_get_tuple_element2_dst Reg P1 D1 D2
 
 get_tuple_element Reg P Dst => i_get_tuple_element Reg P Dst
 
@@ -638,14 +740,21 @@ is_list f? y
 
 is_nonempty_list Fail=f S=x | allocate Need Rs => is_nonempty_list_allocate Fail S Need Rs
 
-is_nonempty_list F=f x==0 | test_heap I1 I2 => is_nonempty_list_test_heap F I1 I2
-
 is_nonempty_list Fail=f S=x | get_list S D1=x D2=x => \
   is_nonempty_list_get_list Fail S D1 D2
 
+is_nonempty_list Fail=f S=x | get_hd S Dst=x => \
+  is_nonempty_list_get_hd Fail S Dst
+
+is_nonempty_list Fail=f S=x | get_tl S Dst=x => \
+  is_nonempty_list_get_tl Fail S Dst
+
 is_nonempty_list_allocate f? rx t t
-is_nonempty_list_test_heap f? I t
+
 is_nonempty_list_get_list f? rx x x
+is_nonempty_list_get_hd f? x x
+is_nonempty_list_get_tl f? x x
+
 is_nonempty_list f? xy
 
 is_atom f? x
@@ -710,11 +819,12 @@ is_boolean Fail=f ac => jump Fail
 is_boolean f? xy
 %hot
 
-is_function2 Fail=f Literal=q Arity | literal_is_export(Literal) =>
-is_function2 Fail=f c Arity => jump Fail
-is_function2 Fail=f Fun a => jump Fail
+is_function2 Fail=f Fun Arity => gen_is_function2(Fail, Fun, Arity)
 
-is_function2 f? S s
+%cold
+cold_is_function2 f? x x
+%hot
+hot_is_function2 f? S t
 
 # Allocating & initializing.
 allocate Need Regs | init Y => allocate_init Need Regs Y
@@ -946,10 +1056,9 @@ call_ext_only u==0 u$func:os:perf_counter/0 => \
 
 call_ext u Bif=u$is_bif => call_bif Bif
 
-call_ext_last u Bif=u$is_bif D => call_bif Bif | deallocate_return D
+call_ext_last u Bif=u$is_bif D => deallocate D | call_bif_only Bif
 
-call_ext_only Ar=u Bif=u$is_bif => \
-  allocate u Ar | call_bif Bif | deallocate_return u
+call_ext_only Ar=u Bif=u$is_bif => call_bif_only Bif
 
 #
 # Any remaining calls are calls to Erlang functions, not BIFs.
@@ -981,6 +1090,7 @@ i_perf_counter
 %hot
 
 call_bif e
+call_bif_only e
 
 #
 # Calls to non-building and guard BIFs.
@@ -989,14 +1099,18 @@ call_bif e
 bif0 u$bif:erlang:self/0 Dst=d => self Dst
 bif0 u$bif:erlang:node/0 Dst=d => node Dst
 
+bif1 Fail=f Bif=u$bif:erlang:hd/1 Src=x Dst=x => is_nonempty_list_get_hd Fail Src Dst
+bif1 Fail=f Bif=u$bif:erlang:tl/1 Src=x Dst=x => is_nonempty_list_get_tl Fail Src Dst
+
 bif1 Fail Bif=u$bif:erlang:get/1 Src=s Dst=d => gen_get(Src, Dst)
 
 bif2 Jump=j u$bif:erlang:element/2 S1=s S2=xy Dst=d => gen_element(Jump, S1, S2, Dst)
 
-bif1 p Bif S1 Dst => bif1_body Bif S1 Dst
+bif1 p Bif S1 Dst         => i_bif1_body S1 Bif Dst
+bif1 Fail=f Bif S1 Dst    => i_bif1 S1 Fail Bif Dst
 
-bif2 p Bif S1 S2 Dst => i_bif2_body Bif S1 S2 Dst
-bif2 Fail Bif S1 S2 Dst => i_bif2 Fail Bif S1 S2 Dst
+bif2 p Bif S1 S2 Dst      => i_bif2_body S2 S1 Bif Dst
+bif2 Fail=f Bif S1 S2 Dst => i_bif2 S2 S1 Fail Bif Dst
 
 i_get_hash c I d
 i_get s d
@@ -1014,10 +1128,12 @@ i_fast_element xy j? I d
 
 i_element xy j? s d
 
-bif1 f? b s d
-bif1_body b s d
-i_bif2 f? b s s d
-i_bif2_body b s s d
+i_bif1 s f? b d
+i_bif1_body s b d
+i_bif2 s s f? b d
+i_bif2_body s s b d
+i_bif3 s s s f? b d
+i_bif3_body s s s b d
 
 #
 # Internal calls.
@@ -1062,8 +1178,25 @@ call_fun Arity => i_call_fun Arity
 i_call_fun t
 i_call_fun_last t Q
 
+
+#
+# A fun with an empty environment can be converted to a literal.
+# As a further optimization, we try to move the fun to its
+# final destination directly.
+
 make_fun2 OldIndex=u => gen_make_fun2(OldIndex)
 
+move_fun/2
+move_fun Fun X0 | move X0 Dst | move Src X0 => move Fun Dst | move Src X0
+move_fun Fun X0 | move A B | move X0 Dst | move Src X0 | \
+  independent_moves(Fun, X0, A, B) | distinct(Dst, A) => \
+    move Fun Dst | move A B | move Src X0
+move_fun Fun X0 | move X0 Dst | make_fun2 OldIndex | \
+  is_killed_by_make_fun(X0, OldIndex)=> \
+    move Fun Dst | make_fun2 OldIndex
+
+move_fun Fun Dst => move Fun Dst
+
 %cold
 i_make_fun W t
 %hot
@@ -1074,101 +1207,141 @@ is_function Fail=f c => jump Fail
 func_info M F A => i_func_info u M F A
 
 # ================================================================
-# New bit syntax matching (R11B).
+# Bit syntax matching obsoleted in OTP 22.
 # ================================================================
 
-%warm
+%cold
 bs_start_match2 Fail=f ica X Y D => jump Fail
 bs_start_match2 Fail Bin X Y D => i_bs_start_match2 Bin Fail X Y D
-i_bs_start_match2 xy f t t x
+i_bs_start_match2 xy f t t d
 
+bs_save2 Y=y Index => move Y x | bs_save2 x Index
 bs_save2 Reg Index => gen_bs_save(Reg, Index)
 i_bs_save2 x t
 
+bs_restore2 Y=y Index => move Y x | bs_restore2 x Index
 bs_restore2 Reg Index => gen_bs_restore(Reg, Index)
 i_bs_restore2 x t
 
+bs_context_to_binary Y=y | line L | badmatch Y => \
+    move Y x | bs_context_to_binary x | line L | badmatch x
+bs_context_to_binary Y=y => move Y x | bs_context_to_binary x
+bs_context_to_binary x
+%warm
+
+# ================================================================
+# New bit syntax matching (R11B).
+# ================================================================
+
+%warm
+
 # Matching integers
 bs_match_string Fail Ms Bits Val => i_bs_match_string Ms Fail Bits Val
 
-i_bs_match_string x f W W
+i_bs_match_string xy f W W
 
 # Fetching integers from binaries.
-bs_get_integer2 Fail=f Ms=x Live=u Sz=sq Unit=u Flags=u Dst=d => \
+bs_get_integer2 Fail=f Ms=xy Live=u Sz=sq Unit=u Flags=u Dst=d => \
 			gen_get_integer2(Fail, Ms, Live, Sz, Unit, Flags, Dst)
 
-i_bs_get_integer_small_imm x W f? t x
-i_bs_get_integer_imm x W t f? t x
-i_bs_get_integer f? t t x s x
-i_bs_get_integer_8 x f? x
-i_bs_get_integer_16 x f? x
+i_bs_get_integer_small_imm Ms Bits Fail Flags Y=y => \
+   i_bs_get_integer_small_imm Ms Bits Fail Flags x | move x Y
+
+i_bs_get_integer_imm Ms Bits Live Fail Flags Y=y => \
+   i_bs_get_integer_imm Ms Bits Live Fail Flags x | move x Y
+
+i_bs_get_integer_small_imm xy W f? t x
+i_bs_get_integer_imm xy W t f? t x
+i_bs_get_integer xy f? t t s d
+i_bs_get_integer_8 xy f? d
+i_bs_get_integer_16 xy f? d
 
 %if ARCH_64
-i_bs_get_integer_32 x f? x
+i_bs_get_integer_32 xy f? d
 %endif
 
 # Fetching binaries from binaries.
-bs_get_binary2 Fail=f Ms=x Live=u Sz=sq Unit=u Flags=u Dst=d => \
+bs_get_binary2 Fail=f Ms=xy Live=u Sz=sq Unit=u Flags=u Dst=d => \
 			gen_get_binary2(Fail, Ms, Live, Sz, Unit, Flags, Dst)
 
-i_bs_get_binary_imm2 f? x t W t x
-i_bs_get_binary2 f x t? s t x
-i_bs_get_binary_all2 f? x t t x
-i_bs_get_binary_all_reuse x f? t
+i_bs_get_binary_imm2 xy f? t W t d
+i_bs_get_binary2 xy f t? s t d
+i_bs_get_binary_all2 xy f? t t d
 
 # Fetching float from binaries.
-bs_get_float2 Fail=f Ms=x Live=u Sz=s Unit=u Flags=u Dst=d => \
+bs_get_float2 Fail=f Ms=xy Live=u Sz=s Unit=u Flags=u Dst=d => \
 		gen_get_float2(Fail, Ms, Live, Sz, Unit, Flags, Dst)
 
 bs_get_float2 Fail=f Ms=x Live=u Sz=q Unit=u Flags=u Dst=d => jump Fail
 
-i_bs_get_float2 f? x t s t x
+i_bs_get_float2 xy f? t s t d
 
 # Miscellanous
 
-bs_skip_bits2 Fail=f Ms=x Sz=sq Unit=u Flags=u => \
+bs_skip_bits2 Fail=f Ms=xy Sz=sq Unit=u Flags=u => \
 			gen_skip_bits2(Fail, Ms, Sz, Unit, Flags)
 
-i_bs_skip_bits_imm2 f? x W
-i_bs_skip_bits2 f? x xy t
-i_bs_skip_bits_all2 f? x t
+i_bs_skip_bits_imm2 f? xy W
+i_bs_skip_bits2 xy xy f? t
 
-bs_test_tail2 Fail=f Ms=x Bits=u==0 => bs_test_zero_tail2 Fail Ms
-bs_test_tail2 Fail=f Ms=x Bits=u => bs_test_tail_imm2 Fail Ms Bits
-bs_test_zero_tail2 f? x
-bs_test_tail_imm2 f? x W
+bs_test_tail2 Fail=f Ms=xy Bits=u==0 => bs_test_zero_tail2 Fail Ms
+bs_test_tail2 Fail=f Ms=xy Bits=u => bs_test_tail_imm2 Fail Ms Bits
+bs_test_zero_tail2 f? xy
+bs_test_tail_imm2 f? xy W
 
 bs_test_unit F Ms Unit=u==8 => bs_test_unit8 F Ms
-bs_test_unit f? x t
-bs_test_unit8 f? x
+bs_test_unit f? xy t
+bs_test_unit8 f? xy
 
-# An y register operand for bs_context_to_binary is rare,
-# but can happen because of inlining.
+# Gets a bitstring from the tail of a context.
+bs_get_tail xy d t
 
-bs_context_to_binary Y=y | line L | badmatch Y => \
-    move Y x | bs_context_to_binary x | line L | badmatch x
+# New bs_start_match variant for contexts with external position storage.
+#
+# bs_get/set_position is used to save positions into registers instead of
+# "slots" in the context itself, which lets us continue matching even after
+# we've passed it off to another function.
 
-bs_context_to_binary Y=y => move Y x | bs_context_to_binary x
+%if ARCH_64
+bs_start_match3 Fail Bin Live Ctx | bs_get_position Ctx Pos=x Ignored => \
+    i_bs_start_match3_gp Bin Live Fail Ctx Pos
+i_bs_start_match3_gp xy t f d x
+%endif
 
-bs_context_to_binary x
+bs_start_match3 Fail=f ica Live Dst => jump Fail
+bs_start_match3 Fail Bin Live Dst => i_bs_start_match3 Bin Live Fail Dst
+
+i_bs_start_match3 xy t f d
+
+# Match context position instructions. 64-bit assumes that all positions can
+# fit into an unsigned small.
+
+%if ARCH_64
+    bs_get_position Src Dst Live => i_bs_get_position Src Dst
+    i_bs_get_position xy xy
+    bs_set_position xy xy
+%else
+    bs_get_position xy d t?
+    bs_set_position xy xy
+%endif
 
 #
 # Utf8/utf16/utf32 support. (R12B-5)
 #
-bs_get_utf8 Fail=f Ms=x u u Dst=d => i_bs_get_utf8 Ms Fail Dst
-i_bs_get_utf8 x f? x
+bs_get_utf8 Fail=f Ms=xy u u Dst=d => i_bs_get_utf8 Ms Fail Dst
+i_bs_get_utf8 xy f? d
 
-bs_skip_utf8 Fail=f Ms=x u u => i_bs_get_utf8 Ms Fail x
+bs_skip_utf8 Fail=f Ms=xy u u => i_bs_get_utf8 Ms Fail x
 
-bs_get_utf16 Fail=f Ms=x u Flags=u Dst=d => i_bs_get_utf16 Ms Fail Flags Dst
-bs_skip_utf16 Fail=f Ms=x u Flags=u => i_bs_get_utf16 Ms Fail Flags x
+bs_get_utf16 Fail=f Ms=xy u Flags=u Dst=d => i_bs_get_utf16 Ms Fail Flags Dst
+bs_skip_utf16 Fail=f Ms=xy u Flags=u => i_bs_get_utf16 Ms Fail Flags x
 
-i_bs_get_utf16 x f? t x
+i_bs_get_utf16 xy f? t d
 
-bs_get_utf32 Fail=f Ms=x Live=u Flags=u Dst=d => \
+bs_get_utf32 Fail=f Ms=xy Live=u Flags=u Dst=d => \
 	bs_get_integer2 Fail Ms Live i=32 u=1 Flags Dst | \
 	i_bs_validate_unicode_retract Fail Dst Ms
-bs_skip_utf32 Fail=f Ms=x Live=u Flags=u => \
+bs_skip_utf32 Fail=f Ms=xy Live=u Flags=u => \
 	bs_get_integer2 Fail Ms Live i=32 u=1 Flags x | \
 	i_bs_validate_unicode_retract Fail x Ms
 
@@ -1182,6 +1355,9 @@ i_bs_validate_unicode_retract j s S
 
 bs_init2 Fail Sz Words Regs Flags Dst | binary_too_big(Sz) => system_limit Fail
 
+bs_init2 Fail Sz Words Regs Flags Dst=y => \
+   bs_init2 Fail Sz Words Regs Flags x | move x Dst
+
 bs_init2 Fail Sz=u Words=u==0 Regs Flags Dst => i_bs_init Sz Regs Dst
 
 bs_init2 Fail Sz=u Words Regs Flags Dst => \
@@ -1202,6 +1378,8 @@ i_bs_init_heap W I t? x
 
 
 bs_init_bits Fail Sz=o Words Regs Flags Dst => system_limit Fail
+bs_init_bits Fail Sz Words Regs Flags Dst=y => \
+   bs_init_bits Fail Sz Words Regs Flags x | move x Dst
 
 bs_init_bits Fail Sz=u Words=u==0 Regs Flags Dst => i_bs_init_bits Sz Regs Dst
 bs_init_bits Fail Sz=u Words Regs Flags Dst =>  i_bs_init_bits_heap Sz Words Regs Dst
@@ -1230,7 +1408,7 @@ bs_private_append Fail Size Unit Bin Flags Dst => \
 
 bs_init_writable
 
-i_bs_append j? I t? t s x
+i_bs_append j? I t? t s xy
 i_bs_private_append j? t s S x
 
 #
@@ -1240,31 +1418,35 @@ i_bs_private_append j? t s S x
 bs_put_integer Fail=j Sz=sq Unit=u Flags=u Src=s => \
 			gen_put_integer(Fail, Sz, Unit, Flags, Src)
 
-i_new_bs_put_integer j? s t s
-i_new_bs_put_integer_imm j? W t s
+i_new_bs_put_integer j? S t s
+i_new_bs_put_integer_imm xyc j? W t
 
 #
 # Utf8/utf16/utf32 support. (R12B-5)
 #
 
-bs_utf8_size j Src=s Dst=d => i_bs_utf8_size Src Dst
+bs_utf8_size j Src Dst=d => i_bs_utf8_size Src Dst
+bs_utf16_size j Src Dst=d => i_bs_utf16_size Src Dst
 
-i_bs_utf8_size s x
+bs_put_utf8 Fail u Src => i_bs_put_utf8 Fail Src
 
-bs_utf16_size j Src=s Dst=d => i_bs_utf16_size Src Dst
-
-i_bs_utf16_size s x
-
-bs_put_utf8 Fail u Src=s => i_bs_put_utf8 Fail Src
+bs_put_utf32 Fail=j Flags=u Src=s => \
+   i_bs_validate_unicode Fail Src | bs_put_integer Fail i=32 u=1 Flags Src
 
-i_bs_put_utf8 j? s
+i_bs_utf8_size S x
+i_bs_utf16_size S x
 
-bs_put_utf16 j? t s
+i_bs_put_utf8 j? S
+bs_put_utf16 j? t S
 
-bs_put_utf32 Fail=j Flags=u Src=s => \
-   i_bs_validate_unicode Fail Src | bs_put_integer Fail i=32 u=1 Flags Src
+i_bs_validate_unicode j? S
 
-i_bs_validate_unicode j? s
+# Handle unoptimized code.
+i_bs_utf8_size Src=c Dst => move Src x | i_bs_utf8_size x Dst
+i_bs_utf16_size Src=c Dst => move Src x | i_bs_utf16_size x Dst
+i_bs_put_utf8 Fail Src=c => move Src x | i_bs_put_utf8 Fail x
+bs_put_utf16 Fail Flags Src=c => move Src x | bs_put_utf16 Fail Flags x
+i_bs_validate_unicode Fail Src=c => move Src x | i_bs_validate_unicode Fail x
 
 #
 # Storing floats into binaries.
@@ -1274,7 +1456,7 @@ bs_put_float Fail Sz=q Unit Flags Val => badarg Fail
 bs_put_float Fail=j Sz=s Unit=u Flags=u Src=s => \
 			gen_put_float(Fail, Sz, Unit, Flags, Src)
 
-i_new_bs_put_float j? s t s
+i_new_bs_put_float j? S t s
 i_new_bs_put_float_imm j? W t s
 
 #
@@ -1284,9 +1466,18 @@ i_new_bs_put_float_imm j? W t s
 bs_put_binary Fail=j Sz=s Unit=u Flags=u Src=s => \
 			gen_put_binary(Fail, Sz, Unit, Flags, Src)
 
-i_new_bs_put_binary j? s t s
-i_new_bs_put_binary_imm j? W s
-i_new_bs_put_binary_all j? s t
+# In unoptimized code, the binary argument could be a literal. (In optimized code,
+# there would be a bs_put_string instruction.)
+i_new_bs_put_binary Fail Size Unit Lit=c => \
+   move Lit x | i_new_bs_put_binary Fail Size Unit x
+i_new_bs_put_binary_imm Fail Size Lit=c => \
+   move Lit x | i_new_bs_put_binary_imm Fail Size x
+i_new_bs_put_binary_all Lit=c Fail Unit => \
+   move Lit x | i_new_bs_put_binary_all x Fail Unit
+
+i_new_bs_put_binary j? S t S
+i_new_bs_put_binary_imm j? W S
+i_new_bs_put_binary_all xy j? t
 
 #
 # Warning: The i_bs_put_string and i_new_bs_put_string instructions
@@ -1384,23 +1575,22 @@ put_map_exact F Map Dst Live Size Rest=* | map_key_sort(Size, Rest) => \
 
 sorted_put_map_assoc Map Dst Live Size Rest=* | is_empty_map(Map) => \
    new_map Dst Live Size Rest
-sorted_put_map_assoc Src=s Dst Live Size Rest=* => \
-	       update_map_assoc Src Dst Live Size Rest
-sorted_put_map_assoc Src Dst Live Size Rest=* => \
-	       move Src x | update_map_assoc x Dst Live Size Rest
+sorted_put_map_assoc Src=xyc Dst Live Size Rest=* => \
+   update_map_assoc Src Dst Live Size Rest
 
-sorted_put_map_exact F Src=s Dst Live Size Rest=* => \
-	      update_map_exact F Src Dst Live Size Rest
-sorted_put_map_exact F Src Dst Live Size Rest=* => \
-	      move Src x | update_map_exact F x Dst Live Size Rest
+sorted_put_map_exact Fail Src=xy Dst Live Size Rest=* => \
+   update_map_exact Src Fail Dst Live Size Rest
+# Literal map arguments for an exact update operation are extremely rare.
+sorted_put_map_exact Fail Src Dst Live Size Rest=* => \
+   move Src x | update_map_exact x Fail Dst Live Size Rest
 
 new_map Dst Live Size Rest=* | is_small_map_literal_keys(Size, Rest) => \
    gen_new_small_map_lit(Dst, Live, Size, Rest)
 
 new_map d t I
 i_new_small_map_lit d t q
-update_map_assoc s d t I
-update_map_exact j? s d t I
+update_map_assoc xyc d t I
+update_map_exact xy j? d t I
 
 is_map Fail Lit=q | literal_is_map(Lit) =>
 is_map Fail cq => jump Fail
@@ -1447,80 +1637,94 @@ gc_bif2 Fail Live u$bif:erlang:sminus/2 S1 S2 Dst => \
 
 #
 # Optimize addition and subtraction of small literals using
-# the i_increment/4 instruction (in bodies, not in guards).
+# the i_increment/3 instruction (in bodies, not in guards).
 #
 
 gen_plus p Live Int=i Reg=d Dst => \
-	gen_increment(Reg, Int, Live, Dst)
+	gen_increment(Reg, Int, Dst)
 gen_plus p Live Reg=d Int=i Dst => \
-	gen_increment(Reg, Int, Live, Dst)
+	gen_increment(Reg, Int, Dst)
 
 gen_minus p Live Reg=d Int=i Dst | negation_is_small(Int) => \
-	gen_increment_from_minus(Reg, Int, Live, Dst)
+	gen_increment_from_minus(Reg, Int, Dst)
 
 #
-# GCing arithmetic instructions.
+# Arithmetic instructions.
 #
 
-gen_plus Fail Live S1 S2 Dst => i_plus S1 S2 Fail Live Dst
+# It is OK to swap arguments for '+' in a guard. It is also
+# OK to turn minus into plus in a guard.
+gen_plus Fail=f Live S1=c S2 Dst => i_plus S2 S1 Fail Dst
+gen_minus Fail=f Live S1 S2=i Dst => gen_plus_from_minus(Fail, Live, S1, S2, Dst)
+
+gen_plus Fail Live S1 S2 Dst => i_plus S1 S2 Fail Dst
 
-gen_minus Fail Live S1 S2 Dst => i_minus S1 S2 Fail Live Dst
+gen_minus Fail Live S1 S2 Dst => i_minus S1 S2 Fail Dst
 
 gc_bif2 Fail Live u$bif:erlang:stimes/2 S1 S2 Dst => \
-  i_times Fail Live S1 S2 Dst
+  i_times Fail S1 S2 Dst
 
 gc_bif2 Fail Live u$bif:erlang:div/2 S1 S2 Dst => \
-  i_m_div Fail Live S1 S2 Dst
+  i_m_div Fail S1 S2 Dst
 gc_bif2 Fail Live u$bif:erlang:intdiv/2 S1 S2 Dst => \
-  i_int_div Fail Live S1 S2 Dst
+  i_int_div Fail S1 S2 Dst
 
 gc_bif2 Fail Live u$bif:erlang:rem/2 S1 S2 Dst => \
-  i_rem S1 S2 Fail Live Dst
+  i_rem S1 S2 Fail Dst
 
 gc_bif2 Fail Live u$bif:erlang:bsl/2 S1 S2 Dst => \
-  i_bsl S1 S2 Fail Live Dst
+  i_bsl S1 S2 Fail Dst
 gc_bif2 Fail Live u$bif:erlang:bsr/2 S1 S2 Dst => \
-  i_bsr S1 S2 Fail Live Dst
+  i_bsr S1 S2 Fail Dst
 
 gc_bif2 Fail Live u$bif:erlang:band/2 S1 S2 Dst => \
-  i_band S1 S2 Fail Live Dst
+  i_band S1 S2 Fail Dst
 
 gc_bif2 Fail Live u$bif:erlang:bor/2 S1 S2 Dst => \
-  i_bor Fail Live S1 S2 Dst
+  i_bor Fail S1 S2 Dst
 
 gc_bif2 Fail Live u$bif:erlang:bxor/2 S1 S2 Dst => \
-  i_bxor Fail Live S1 S2 Dst
+  i_bxor Fail S1 S2 Dst
 
-gc_bif1 Fail I u$bif:erlang:bnot/1 Src Dst=d => i_int_bnot Fail Src I Dst
+gc_bif1 Fail Live u$bif:erlang:bnot/1 Src Dst=d => i_int_bnot Fail Src Dst
 
-i_increment rxy W t d
+i_increment rxy W d
 
-i_plus x xy j? t d
-i_plus s s  j? t d
+# Handle unoptimized code.
+i_plus S1=c S2=c Fail Dst => move S1 x | i_plus x S2 Fail Dst
+i_plus S1=c S2=xy Fail Dst => i_plus S2 S1 Fail Dst
 
-i_minus x x j? t d
-i_minus s s j? t d
+i_plus xy xyc j? d
 
-i_times j? t s s d
+# A minus instruction with a constant right operand will be
+# converted to an i_increment instruction, except in guards or
+# when the negated value of the constant won't fit in a small.
+# Therefore, it is very rare.
+i_minus S1 S2=c Fail Dst => move S2 x | i_minus S1 x Fail Dst
 
-i_m_div j? t s s d
-i_int_div j? t s s d
+i_minus xy xy j? d
+i_minus c xy j? d
 
-i_rem x x j? t d
-i_rem s s j? t d
+i_times j? s s d
 
-i_bsl s s j? t d
-i_bsr s s j? t d
+i_m_div j? s s d
+i_int_div j? s s d
 
-i_band x c j? t d
-i_band s s j? t d
+i_rem x x j? d
+i_rem s s j? d
 
-i_bor j? I s s d
-i_bxor j? I s s d
+i_bsl s s j? d
+i_bsr s s j? d
 
-i_int_bnot Fail Src=c Live Dst => move Src x | i_int_bnot Fail x Live Dst
+i_band x c j? d
+i_band s s j? d
 
-i_int_bnot j? S t d
+i_bor j? s s d
+i_bxor j? s s d
+
+i_int_bnot Fail Src=c Dst => move Src x | i_int_bnot Fail x Dst
+
+i_int_bnot j? S d
 
 #
 # Old guard BIFs that creates heap fragments are no longer allowed.
@@ -1533,29 +1737,28 @@ bif1 Fail u$bif:erlang:round/1 s d => too_old_compiler
 bif1 Fail u$bif:erlang:trunc/1 s d => too_old_compiler
 
 #
-# Guard BIFs.
+# Handle the length/1 guard BIF specially to make it trappable.
 #
-gc_bif1 Fail I Bif Src Dst => \
-	gen_guard_bif1(Fail, I, Bif, Src, Dst)
-
-gc_bif2 Fail I Bif S1 S2 Dst => \
-	gen_guard_bif2(Fail, I, Bif, S1, S2, Dst)
 
-gc_bif3 Fail I Bif S1 S2 S3 Dst => \
-	gen_guard_bif3(Fail, I, Bif, S1, S2, S3, Dst)
+gc_bif1 Fail=j Live u$bif:erlang:length/1 Src Dst => \
+   i_length_setup Live Src | i_length Fail Live Dst
 
-i_gc_bif1 j? W s t? d
+i_length_setup Live Src=c => move Src x | i_length_setup Live x
 
-i_gc_bif2 j? W t? s s d
+i_length_setup t xy
+i_length j? t d
 
-ii_gc_bif3/7
+#
+# Guard BIFs.
+#
+gc_bif1 p Live Bif Src Dst           => i_bif1_body Src Bif Dst
+gc_bif1 Fail=f Live Bif Src Dst      => i_bif1 Src Fail Bif Dst
 
-# A specific instruction can only have 6 operands, so we must
-# pass one of the arguments in an x register.
-ii_gc_bif3 Fail Bif Live S1 S2 S3 Dst => \
-  move S1 x | i_gc_bif3 Fail Bif Live S2 S3 Dst
+gc_bif2 p Live Bif S1 S2 Dst         => i_bif2_body S2 S1 Bif Dst
+gc_bif2 Fail=f Live Bif S1 S2 Dst    => i_bif2 S2 S1 Fail Bif Dst
 
-i_gc_bif3 j? W t? s s d
+gc_bif3 p Live Bif S1 S2 S3 Dst      => i_bif3_body S3 S2 S1 Bif Dst
+gc_bif3 Fail=f Live Bif S1 S2 S3 Dst => i_bif3 S3 S2 S1 Fail Bif Dst
 
 #
 # The following instruction is specially handled in beam_load.c