20 files changed, 667 insertions, 630 deletions
diff --git a/lib/compiler/src/Makefile b/lib/compiler/src/Makefile
index 958d3501c7..cbcbf79839 100644
--- a/lib/compiler/src/Makefile
+++ b/lib/compiler/src/Makefile
@@ -45,6 +45,7 @@ RELSYSDIR = $(RELEASE_PATH)/lib/compiler-$(VSN)
 # Target Specs
 # ----------------------------------------------------
 MODULES =  \
+	beam_a \
 	beam_asm \
 	beam_block \
 	beam_bool \
@@ -65,6 +66,7 @@ MODULES =  \
 	beam_type \
 	beam_utils \
 	beam_validator \
+        beam_z \
 	cerl \
 	cerl_clauses \
 	cerl_inline \
diff --git a/lib/compiler/src/beam_a.erl b/lib/compiler/src/beam_a.erl
new file mode 100644
index 0000000000..1c51226314
--- /dev/null
+++ b/lib/compiler/src/beam_a.erl
@@ -0,0 +1,97 @@
+%%
+%% %CopyrightBegin%
+%%
+%% Copyright Ericsson AB 2012. All Rights Reserved.
+%%
+%% The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved online at http://www.erlang.org/.
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% %CopyrightEnd%
+%%
+%% Purpose: Run directly after code generation to do any normalization
+%%          or preparation to simplify the optimization passes.
+%%          (Mandatory.)
+
+-module(beam_a).
+
+-export([module/2]).
+
+module({Mod,Exp,Attr,Fs0,Lc}, _Opt) ->
+    Fs = [function(F) || F <- Fs0],
+    {ok,{Mod,Exp,Attr,Fs,Lc}}.
+
+function({function,Name,Arity,CLabel,Is0}) ->
+    try
+	%% Rename certain operations to simplify the optimization passes.
+	Is1 = rename_instrs(Is0),
+
+	%% Remove unusued labels for cleanliness and to help
+	%% optimization passes and HiPE.
+	Is = beam_jump:remove_unused_labels(Is1),
+	{function,Name,Arity,CLabel,Is}
+    catch
+	Class:Error ->
+	    Stack = erlang:get_stacktrace(),
+	    io:fwrite("Function: ~w/~w\n", [Name,Arity]),
+	    erlang:raise(Class, Error, Stack)
+    end.
+
+rename_instrs([{apply_last,A,N}|Is]) ->
+    [{apply,A},{deallocate,N},return|rename_instrs(Is)];
+rename_instrs([{call_last,A,F,N}|Is]) ->
+    [{call,A,F},{deallocate,N},return|rename_instrs(Is)];
+rename_instrs([{call_ext_last,A,F,N}|Is]) ->
+    [{call_ext,A,F},{deallocate,N},return|rename_instrs(Is)];
+rename_instrs([{call_only,A,F}|Is]) ->
+    [{call,A,F},return|rename_instrs(Is)];
+rename_instrs([{call_ext_only,A,F}|Is]) ->
+    [{call_ext,A,F},return|rename_instrs(Is)];
+rename_instrs([I|Is]) ->
+    [rename_instr(I)|rename_instrs(Is)];
+rename_instrs([]) -> [].
+
+rename_instr({bs_put_binary=I,F,Sz,U,Fl,Src}) ->
+    {bs_put,F,{I,U,Fl},[Sz,Src]};
+rename_instr({bs_put_float=I,F,Sz,U,Fl,Src}) ->
+    {bs_put,F,{I,U,Fl},[Sz,Src]};
+rename_instr({bs_put_integer=I,F,Sz,U,Fl,Src}) ->
+    {bs_put,F,{I,U,Fl},[Sz,Src]};
+rename_instr({bs_put_utf8=I,F,Fl,Src}) ->
+    {bs_put,F,{I,Fl},[Src]};
+rename_instr({bs_put_utf16=I,F,Fl,Src}) ->
+    {bs_put,F,{I,Fl},[Src]};
+rename_instr({bs_put_utf32=I,F,Fl,Src}) ->
+    {bs_put,F,{I,Fl},[Src]};
+%% rename_instr({bs_put_string,_,_}=I) ->
+%%     {bs_put,{f,0},I,[]};
+rename_instr({bs_add=I,F,[Src1,Src2,U],Dst}) when is_integer(U) ->
+    {bif,I,F,[Src1,Src2,{integer,U}],Dst};
+rename_instr({bs_utf8_size=I,F,Src,Dst}) ->
+    {bif,I,F,[Src],Dst};
+rename_instr({bs_utf16_size=I,F,Src,Dst}) ->
+    {bif,I,F,[Src],Dst};
+rename_instr({bs_init2=I,F,Sz,Extra,Live,Flags,Dst}) ->
+    {bs_init,F,{I,Extra,Flags},Live,[Sz],Dst};
+rename_instr({bs_init_bits=I,F,Sz,Extra,Live,Flags,Dst}) ->
+    {bs_init,F,{I,Extra,Flags},Live,[Sz],Dst};
+rename_instr({bs_append=I,F,Sz,Extra,Live,U,Src,Flags,Dst}) ->
+    {bs_init,F,{I,Extra,U,Flags},Live,[Sz,Src],Dst};
+rename_instr({bs_private_append=I,F,Sz,U,Src,Flags,Dst}) ->
+    {bs_init,F,{I,U,Flags},none,[Sz,Src],Dst};
+rename_instr(bs_init_writable=I) ->
+    {bs_init,{f,0},I,1,[{x,0}],{x,0}};
+rename_instr({select_val=I,Reg,Fail,{list,List}}) ->
+    {select,I,Reg,Fail,List};
+rename_instr({select_tuple_arity=I,Reg,Fail,{list,List}}) ->
+    {select,I,Reg,Fail,List};
+rename_instr(send) ->
+    {call_ext,2,send};
+rename_instr(I) -> I.
diff --git a/lib/compiler/src/beam_block.erl b/lib/compiler/src/beam_block.erl
index cd568097fa..3e0050382c 100644
--- a/lib/compiler/src/beam_block.erl
+++ b/lib/compiler/src/beam_block.erl
@@ -31,19 +31,16 @@ module({Mod,Exp,Attr,Fs0,Lc0}, _Opt) ->
 
 function({function,Name,Arity,CLabel,Is0}, Lc0) ->
     try
-	%% Extra labels may thwart optimizations.
-	Is1 = beam_jump:remove_unused_labels(Is0),
-
 	%% Collect basic blocks and optimize them.
-	Is2 = blockify(Is1),
-	Is3 = embed_lines(Is2),
-	Is4 = move_allocates(Is3),
-	Is5 = beam_utils:live_opt(Is4),
-	Is6 = opt_blocks(Is5),
-	Is7 = beam_utils:delete_live_annos(Is6),
+	Is1 = blockify(Is0),
+	Is2 = embed_lines(Is1),
+	Is3 = move_allocates(Is2),
+	Is4 = beam_utils:live_opt(Is3),
+	Is5 = opt_blocks(Is4),
+	Is6 = beam_utils:delete_live_annos(Is5),
 
 	%% Optimize bit syntax.
-	{Is,Lc} = bsm_opt(Is7, Lc0),
+	{Is,Lc} = bsm_opt(Is6, Lc0),
 
 	%% Done.
 	{{function,Name,Arity,CLabel,Is},Lc}
@@ -74,9 +71,9 @@ blockify([{bs_save2,R,Point}=I,{test,is_eq_exact,_,_}=Test,
 
 %% Do other peep-hole optimizations.
 blockify([{test,is_atom,{f,Fail},[Reg]}=I|
-	  [{select_val,Reg,{f,Fail},
-	    {list,[{atom,false},{f,_}=BrFalse,
-		   {atom,true}=AtomTrue,{f,_}=BrTrue]}}|Is]=Is0],
+	  [{select,select_val,Reg,{f,Fail},
+	    [{atom,false},{f,_}=BrFalse,
+	     {atom,true}=AtomTrue,{f,_}=BrTrue]}|Is]=Is0],
 	 [{block,Bl}|_]=Acc) ->
     case is_last_bool(Bl, Reg) of
 	false ->
@@ -89,9 +86,9 @@ blockify([{test,is_atom,{f,Fail},[Reg]}=I|
 			  {test,is_eq_exact,BrFalse,[Reg,AtomTrue]}|Acc])
     end;
 blockify([{test,is_atom,{f,Fail},[Reg]}=I|
-	  [{select_val,Reg,{f,Fail},
-	    {list,[{atom,true}=AtomTrue,{f,_}=BrTrue,
-		   {atom,false},{f,_}=BrFalse]}}|Is]=Is0],
+	  [{select,select_val,Reg,{f,Fail},
+	    [{atom,true}=AtomTrue,{f,_}=BrTrue,
+	     {atom,false},{f,_}=BrFalse]}|Is]=Is0],
 	 [{block,Bl}|_]=Acc) ->
     case is_last_bool(Bl, Reg) of
 	false ->
@@ -423,8 +420,8 @@ inverse_comp_op(_) -> none.
 %%% Evaluation of constant bit fields.
 %%%
 
-is_bs_put({bs_put_integer,_,_,_,_,_}) -> true;
-is_bs_put({bs_put_float,_,_,_,_,_}) -> true;
+is_bs_put({bs_put,_,{bs_put_integer,_,_},_}) -> true;
+is_bs_put({bs_put,_,{bs_put_float,_,_},_}) -> true;
 is_bs_put(_) -> false.
 
 collect_bs_puts(Is) ->
@@ -439,20 +436,24 @@ collect_bs_puts_1([I|Is]=Is0, Acc) ->
 opt_bs_puts(Is) ->
     opt_bs_1(Is, []).
 
-opt_bs_1([{bs_put_float,Fail,{integer,Sz},1,Flags0,Src}=I0|Is], Acc) ->
+opt_bs_1([{bs_put,Fail,
+	   {bs_put_float,1,Flags0},[{integer,Sz},Src]}=I0|Is], Acc) ->
     try eval_put_float(Src, Sz, Flags0) of
 	<<Int:Sz>> ->
 	    Flags = force_big(Flags0),
-	    I = {bs_put_integer,Fail,{integer,Sz},1,Flags,{integer,Int}},
+	    I = {bs_put,Fail,{bs_put_integer,1,Flags},
+		 [{integer,Sz},{integer,Int}]},
 	    opt_bs_1([I|Is], Acc)
     catch
 	error:_ ->
 	    opt_bs_1(Is, [I0|Acc])
     end;
-opt_bs_1([{bs_put_integer,_,{integer,8},1,_,{integer,_}}|_]=IsAll, Acc0) ->
+opt_bs_1([{bs_put,_,{bs_put_integer,1,_},[{integer,8},{integer,_}]}|_]=IsAll,
+	 Acc0) ->
     {Is,Acc} = bs_collect_string(IsAll, Acc0),
     opt_bs_1(Is, Acc);
-opt_bs_1([{bs_put_integer,Fail,{integer,Sz},1,F,{integer,N}}=I|Is0], Acc) when Sz > 8 ->
+opt_bs_1([{bs_put,Fail,{bs_put_integer,1,F},[{integer,Sz},{integer,N}]}=I|Is0],
+	 Acc) when Sz > 8 ->
     case field_endian(F) of
 	big ->
 	    %% We can do this optimization for any field size without risk
@@ -466,14 +467,14 @@ opt_bs_1([{bs_put_integer,Fail,{integer,Sz},1,F,{integer,N}}=I|Is0], Acc) when S
 	    %% an explosion in code size.
 	    <<Int:Sz>> = <<N:Sz/little>>,
 	    Flags = force_big(F),
-	    Is = [{bs_put_integer,Fail,{integer,Sz},1,
-		   Flags,{integer,Int}}|Is0],
+	    Is = [{bs_put,Fail,{bs_put_integer,1,Flags},
+		   [{integer,Sz},{integer,Int}]}|Is0],
 	    opt_bs_1(Is, Acc);
 	_ -> 					%native or too wide little field
 	    opt_bs_1(Is0, [I|Acc])
     end;
-opt_bs_1([{Op,Fail,{integer,Sz},U,F,Src}|Is], Acc) when U > 1 ->
-    opt_bs_1([{Op,Fail,{integer,U*Sz},1,F,Src}|Is], Acc);
+opt_bs_1([{bs_put,Fail,{Op,U,F},[{integer,Sz},Src]}|Is], Acc) when U > 1 ->
+    opt_bs_1([{bs_put,Fail,{Op,1,F},[{integer,U*Sz},Src]}|Is], Acc);
 opt_bs_1([I|Is], Acc) ->
     opt_bs_1(Is, [I|Acc]);
 opt_bs_1([], Acc) -> reverse(Acc).
@@ -489,17 +490,17 @@ eval_put_float(Src, Sz, Flags) when Sz =< 256 -> %Only evaluate if Sz is reasona
 value({integer,I}) -> I;
 value({float,F}) -> F.
 
-bs_collect_string(Is, [{bs_put_string,Len,{string,Str}}|Acc]) ->
+bs_collect_string(Is, [{bs_put,_,{bs_put_string,Len,{string,Str}},[]}|Acc]) ->
     bs_coll_str_1(Is, Len, reverse(Str), Acc);
 bs_collect_string(Is, Acc) ->
     bs_coll_str_1(Is, 0, [], Acc).
     
-bs_coll_str_1([{bs_put_integer,_,{integer,Sz},U,_,{integer,V}}|Is],
+bs_coll_str_1([{bs_put,_,{bs_put_integer,U,_},[{integer,Sz},{integer,V}]}|Is],
 	      Len, StrAcc, IsAcc) when U*Sz =:= 8 ->
     Byte = V band 16#FF,
     bs_coll_str_1(Is, Len+1, [Byte|StrAcc], IsAcc);
 bs_coll_str_1(Is, Len, StrAcc, IsAcc) ->
-    {Is,[{bs_put_string,Len,{string,reverse(StrAcc)}}|IsAcc]}.
+    {Is,[{bs_put,{f,0},{bs_put_string,Len,{string,reverse(StrAcc)}},[]}|IsAcc]}.
 
 field_endian({field_flags,F}) -> field_endian_1(F).
 
@@ -531,15 +532,17 @@ bs_split_int(N, Sz, Fail, Acc) ->
     bs_split_int_1(N, FirstByteSz, Sz, Fail, Acc).
 
 bs_split_int_1(-1, _, Sz, Fail, Acc) when Sz > 64 ->
-    I = {bs_put_integer,Fail,{integer,Sz},1,{field_flags,[big]},{integer,-1}},
+    I = {bs_put,Fail,{bs_put_integer,1,{field_flags,[big]}},
+	 [{integer,Sz},{integer,-1}]},
     [I|Acc];
 bs_split_int_1(0, _, Sz, Fail, Acc) when Sz > 64 ->
-    I = {bs_put_integer,Fail,{integer,Sz},1,{field_flags,[big]},{integer,0}},
+    I = {bs_put,Fail,{bs_put_integer,1,{field_flags,[big]}},
+	 [{integer,Sz},{integer,0}]},
     [I|Acc];
 bs_split_int_1(N, ByteSz, Sz, Fail, Acc) when Sz > 0 ->
     Mask = (1 bsl ByteSz) - 1,
-    I = {bs_put_integer,Fail,{integer,ByteSz},1,
-	 {field_flags,[big]},{integer,N band Mask}},
+    I = {bs_put,Fail,{bs_put_integer,1,{field_flags,[big]}},
+	 [{integer,ByteSz},{integer,N band Mask}]},
     bs_split_int_1(N bsr ByteSz, 8, Sz-ByteSz, Fail, [I|Acc]);
 bs_split_int_1(_, _, _, _, Acc) -> Acc.
 
@@ -577,9 +580,9 @@ bsm_reroute([{bs_restore2,Reg,Save}=I|Is], D, _, Acc) ->
     bsm_reroute(Is, D, {Reg,Save}, [I|Acc]);
 bsm_reroute([{label,_}=I|Is], D, S, Acc) ->
     bsm_reroute(Is, D, S, [I|Acc]);
-bsm_reroute([{select_val,Reg,F0,{list,Lbls0}}|Is], D, {_,Save}=S, Acc0) ->
+bsm_reroute([{select,select_val,Reg,F0,Lbls0}|Is], D, {_,Save}=S, Acc0) ->
     [F|Lbls] = bsm_subst_labels([F0|Lbls0], Save, D),
-    Acc = [{select_val,Reg,F,{list,Lbls}}|Acc0],
+    Acc = [{select,select_val,Reg,F,Lbls}|Acc0],
     bsm_reroute(Is, D, S, Acc);
 bsm_reroute([{test,TestOp,F0,TestArgs}=I|Is], D, {_,Save}=S, Acc0) ->
     F = bsm_subst_label(F0, Save, D),
@@ -615,10 +618,6 @@ bsm_opt_2([{test,bs_skip_bits2,F,[Ctx,{integer,I1},Unit1,_]}|Is],
 	  [{test,bs_skip_bits2,F,[Ctx,{integer,I2},Unit2,Flags]}|Acc]) ->
     bsm_opt_2(Is, [{test,bs_skip_bits2,F,
 		    [Ctx,{integer,I1*Unit1+I2*Unit2},1,Flags]}|Acc]);
-bsm_opt_2([{test,bs_match_string,F,[Ctx,Bin1]},
-	   {test,bs_match_string,F,[Ctx,Bin2]}|Is], Acc) ->
-    I = {test,bs_match_string,F,[Ctx,<<Bin1/bitstring,Bin2/bitstring>>]},
-    bsm_opt_2([I|Is], Acc);
 bsm_opt_2([I|Is], Acc) ->
     bsm_opt_2(Is, [I|Acc]);
 bsm_opt_2([], Acc) -> reverse(Acc).
diff --git a/lib/compiler/src/beam_bool.erl b/lib/compiler/src/beam_bool.erl
index d9ea6f5a70..81be262d6d 100644
--- a/lib/compiler/src/beam_bool.erl
+++ b/lib/compiler/src/beam_bool.erl
@@ -168,18 +168,18 @@ bopt_block(Reg, Fail, OldIs, [{block,Bl0}|Acc0], St0) ->
     end.
 
 %% ensure_opt_safe(OriginalCode, OptCode, FollowingCode, Fail,
-%%             ReversedPreceedingCode, State) -> ok
+%%             ReversedPrecedingCode, State) -> ok
 %%  Comparing the original code to the optimized code, determine
 %%  whether the optimized code is guaranteed to work in the same
 %%  way as the original code.
 %%
 %%  Throw an exception if the optimization is not safe.
 %%
-ensure_opt_safe(Bl, NewCode, OldIs, Fail, PreceedingCode, St) ->
+ensure_opt_safe(Bl, NewCode, OldIs, Fail, PrecedingCode, St) ->
     %% Here are the conditions that must be true for the
     %% optimization to be safe.
     %%
-    %% 1. If a register is INITIALIZED by PreceedingCode,
+    %% 1. If a register is INITIALIZED by PrecedingCode,
     %%    then if that register assigned a value in the original
     %%    code, but not in the optimized code, it must be UNUSED or KILLED
     %%    in the code that follows.
@@ -190,29 +190,50 @@ ensure_opt_safe(Bl, NewCode, OldIs, Fail, PreceedingCode, St) ->
     %%    by the code that follows.
     %%
     %% 3. Any register that is assigned a value in the optimized
-    %%    code must be UNUSED or KILLED in the following code
-    %%    (because the register might be assigned the wrong value,
-    %%    and even if the value is right it might no longer be
-    %%    assigned on *all* paths leading to its use).
+    %%    code must be UNUSED or KILLED in the following code,
+    %%    unless we can be sure that it is always assigned the same
+    %%    value.
 
-    InitInPreceeding = initialized_regs(PreceedingCode),
+    InitInPreceding = initialized_regs(PrecedingCode),
 
     PrevDst = dst_regs(Bl),
     NewDst = dst_regs(NewCode),
     NotSet = ordsets:subtract(PrevDst, NewDst),
-    MustBeKilled = ordsets:subtract(NotSet, InitInPreceeding),
-    MustBeUnused = ordsets:subtract(ordsets:union(NotSet, NewDst), MustBeKilled),
+    MustBeKilled = ordsets:subtract(NotSet, InitInPreceding),
 
     case all_killed(MustBeKilled, OldIs, Fail, St) of
 	false -> throw(all_registers_not_killed);
 	true -> ok
     end,
+    Same = assigned_same_value(Bl, NewCode),
+    MustBeUnused = ordsets:subtract(ordsets:union(NotSet, NewDst),
+				    ordsets:union(MustBeKilled, Same)),
     case none_used(MustBeUnused, OldIs, Fail, St) of
 	false -> throw(registers_used);
 	true -> ok
     end,
     ok.
 
+%% assigned_same_value(OldCode, NewCodeReversed) -> [DestinationRegs]
+%%  Return an ordset with a list of all y registers that are always
+%%  assigned the same value in the old and new code. Currently, we
+%%  are very conservative in that we only consider identical move
+%%  instructions in the same order.
+%%
+assigned_same_value(Old, New) ->
+    case reverse(New) of
+	[{block,Bl}|_] ->
+	    assigned_same_value(Old, Bl, []);
+	_ ->
+	    ordsets:new()
+    end.
+
+assigned_same_value([{set,[{y,_}=D],[S],move}|T1],
+		    [{set,[{y,_}=D],[S],move}|T2], Acc) ->
+    assigned_same_value(T1, T2, [D|Acc]);
+assigned_same_value(_, _, Acc) ->
+    ordsets:from_list(Acc).
+
 update_fail_label([{set,_,_,move}=I|Is], Fail, Acc) ->
     update_fail_label(Is, Fail, [I|Acc]);
 update_fail_label([{set,Ds,As,{bif,N,{f,_}}}|Is], Fail, Acc) ->
diff --git a/lib/compiler/src/beam_bsm.erl b/lib/compiler/src/beam_bsm.erl
index 1217f7f777..02794a8e18 100644
--- a/lib/compiler/src/beam_bsm.erl
+++ b/lib/compiler/src/beam_bsm.erl
@@ -204,16 +204,6 @@ btb_reaches_match_1(Is, Regs, D) ->
 btb_reaches_match_2([{block,Bl}|Is], Regs0, D) ->
     Regs = btb_reaches_match_block(Bl, Regs0),
     btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{call_only,Arity,{f,Lbl}}|_], Regs0, D) ->
-    Regs = btb_kill_not_live(Arity, Regs0),
-    btb_tail_call(Lbl, Regs, D);
-btb_reaches_match_2([{call_ext_only,Arity,Func}|_], Regs0, D) ->
-    Regs = btb_kill_not_live(Arity, Regs0),
-    btb_tail_call(Func, Regs, D);
-btb_reaches_match_2([{call_last,Arity,{f,Lbl},_}|_], Regs0, D) ->
-    Regs1 = btb_kill_not_live(Arity, Regs0),
-    Regs = btb_kill_yregs(Regs1),
-    btb_tail_call(Lbl, Regs, D);
 btb_reaches_match_2([{call,Arity,{f,Lbl}}|Is], Regs, D) ->
     btb_call(Arity, Lbl, Regs, Is, D);
 btb_reaches_match_2([{apply,Arity}|Is], Regs, D) ->
@@ -222,19 +212,16 @@ btb_reaches_match_2([{call_fun,Live}=I|Is], Regs, D) ->
     btb_call(Live, I, Regs, Is, D);
 btb_reaches_match_2([{make_fun2,_,_,_,Live}|Is], Regs, D) ->
     btb_call(Live, make_fun2, Regs, Is, D);
-btb_reaches_match_2([{call_ext,Arity,{extfunc,Mod,Name,Arity}=Func}|Is], Regs0, D) ->
+btb_reaches_match_2([{call_ext,Arity,Func}=I|Is], Regs0, D) ->
     %% Allow us scanning beyond the call in case the match
     %% context is saved on the stack.
-    case erl_bifs:is_exit_bif(Mod, Name, Arity) of
+    case beam_jump:is_exit_instruction(I) of
 	false ->
 	    btb_call(Arity, Func, Regs0, Is, D);
 	true ->
 	    Regs = btb_kill_not_live(Arity, Regs0),
 	    btb_tail_call(Func, Regs, D)
     end;
-btb_reaches_match_2([{call_ext_last,Arity,_,_}=I|_], Regs, D) ->
-    btb_ensure_not_used(btb_regs_from_arity(Arity), I, Regs),
-    D;
 btb_reaches_match_2([{kill,Y}|Is], Regs, D) ->
     btb_reaches_match_1(Is, btb_kill([Y], Regs), D);
 btb_reaches_match_2([{deallocate,_}|Is], Regs0, D) ->
@@ -254,21 +241,45 @@ btb_reaches_match_2([{bif,_,{f,F},Ss,Dst}=I|Is], Regs0, D0) ->
     Regs = btb_kill([Dst], Regs0),
     D = btb_follow_branch(F, Regs, D0),
     btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{test,bs_start_match2,_,_,[Ctx,_],Ctx}|Is], Regs, D) ->
-    case btb_context_regs(Regs) of
-	[Ctx] ->
-	    D;
-	CtxRegs ->
-	    case member(Ctx, CtxRegs) of
-		false -> btb_reaches_match_2(Is, Regs, D);
-		true -> btb_error(unsuitable_bs_start_match)
+btb_reaches_match_2([{test,bs_start_match2,{f,F},Live,[Ctx,_],Ctx}=I|Is],
+		    Regs0, D0) ->
+    CtxRegs = btb_context_regs(Regs0),
+    case member(Ctx, CtxRegs) of
+	false ->
+	    %% This bs_start_match2 instruction does not use "our"
+	    %% match state. Therefore we can continue the search
+	    %% for another bs_start_match2 instruction.
+	    D = btb_follow_branch(F, Regs0, D0),
+	    Regs = btb_kill_not_live(Live, Regs0),
+	    btb_reaches_match_2(Is, Regs, D);
+	true ->
+	    %% OK. This instruction will use "our" match state,
+	    %% but we must make sure that all other copies of the
+	    %% match state are killed in the code that follows
+	    %% the instruction. (We know that the fail branch cannot
+	    %% be taken in this case.)
+	    OtherCtxRegs = CtxRegs -- [Ctx],
+	    case btb_are_all_unused(OtherCtxRegs, Is, D0) of
+		false -> btb_error({OtherCtxRegs,not_all_unused_after,I});
+		true -> D0
 	    end
     end;
-btb_reaches_match_2([{test,bs_start_match2,_,_,[Bin,_],Ctx}|Is], Regs, D) ->
-    CtxRegs = btb_context_regs(Regs),
+btb_reaches_match_2([{test,bs_start_match2,{f,F},Live,[Bin,_],Ctx}|Is],
+		    Regs0, D0) ->
+    CtxRegs = btb_context_regs(Regs0),
     case member(Bin, CtxRegs) orelse member(Ctx, CtxRegs) of
-	false -> btb_reaches_match_2(Is, Regs, D);
-	true -> btb_error(unsuitable_bs_start_match)
+	false ->
+	    %% This bs_start_match2 does not reference any copy of the
+	    %% match state. Therefore it can safely be passed on the
+	    %% way to another (perhaps more suitable) bs_start_match2
+	    %% instruction.
+	    D = btb_follow_branch(F, Regs0, D0),
+	    Regs = btb_kill_not_live(Live, Regs0),
+	    btb_reaches_match_2(Is, Regs, D);
+	true ->
+	    %% This variant of the bs_start_match2 instruction does
+	    %% not accept a match state as source.
+	    btb_error(unsuitable_bs_start_match)
     end;
 btb_reaches_match_2([{test,_,{f,F},Ss}=I|Is], Regs, D0) ->
     btb_ensure_not_used(Ss, I, Regs),
@@ -278,12 +289,7 @@ btb_reaches_match_2([{test,_,{f,F},_,Ss,_}=I|Is], Regs, D0) ->
     btb_ensure_not_used(Ss, I, Regs),
     D = btb_follow_branch(F, Regs, D0),
     btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{select_val,Src,{f,F},{list,Conds}}=I|Is], Regs, D0) ->
-    btb_ensure_not_used([Src], I, Regs),
-    D1 = btb_follow_branch(F, Regs, D0),
-    D = btb_follow_branches(Conds, Regs, D1),
-    btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{select_tuple_arity,Src,{f,F},{list,Conds}}=I|Is], Regs, D0) ->
+btb_reaches_match_2([{select,_,Src,{f,F},Conds}=I|Is], Regs, D0) ->
     btb_ensure_not_used([Src], I, Regs),
     D1 = btb_follow_branch(F, Regs, D0),
     D = btb_follow_branches(Conds, Regs, D1),
@@ -293,46 +299,11 @@ btb_reaches_match_2([{jump,{f,Lbl}}|_], Regs, #btb{index=Li}=D) ->
     btb_reaches_match_2(Is, Regs, D);
 btb_reaches_match_2([{label,_}|Is], Regs, D) ->
     btb_reaches_match_2(Is, Regs, D);
-btb_reaches_match_2([{bs_add,{f,0},_,Dst}|Is], Regs, D) ->
-    btb_reaches_match_1(Is, btb_kill([Dst], Regs), D);
-btb_reaches_match_2([bs_init_writable|Is], Regs0, D) ->
-    Regs = btb_kill_not_live(0, Regs0),
-    btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{bs_init2,{f,0},_,_,_,_,Dst}|Is], Regs, D) ->
-    btb_reaches_match_1(Is, btb_kill([Dst], Regs), D);
-btb_reaches_match_2([{bs_init_bits,{f,0},_,_,_,_,Dst}|Is], Regs, D) ->
-    btb_reaches_match_1(Is, btb_kill([Dst], Regs), D);
-btb_reaches_match_2([{bs_append,{f,0},_,_,_,_,Src,_,Dst}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
-    btb_reaches_match_1(Is, btb_kill([Dst], Regs), D);
-btb_reaches_match_2([{bs_private_append,{f,0},_,_,Src,_,Dst}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
-    btb_reaches_match_1(Is, btb_kill([Dst], Regs), D);
-btb_reaches_match_2([{bs_put_integer,{f,0},_,_,_,Src}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
-    btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{bs_put_float,{f,0},_,_,_,Src}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
-    btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{bs_put_binary,{f,0},_,_,_,Src}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
-    btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{bs_put_string,_,_}|Is], Regs, D) ->
-    btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{bs_utf8_size,_,Src,Dst}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
-    btb_reaches_match_1(Is, btb_kill([Dst], Regs), D);
-btb_reaches_match_2([{bs_utf16_size,_,Src,Dst}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
+btb_reaches_match_2([{bs_init,{f,0},_,_,Ss,Dst}=I|Is], Regs, D) ->
+    btb_ensure_not_used(Ss, I, Regs),
     btb_reaches_match_1(Is, btb_kill([Dst], Regs), D);
-btb_reaches_match_2([{bs_put_utf8,_,_,Src}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
-    btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{bs_put_utf16,_,_,Src}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
-    btb_reaches_match_1(Is, Regs, D);
-btb_reaches_match_2([{bs_put_utf32,_,_,Src}=I|Is], Regs, D) ->
-    btb_ensure_not_used([Src], I, Regs),
+btb_reaches_match_2([{bs_put,{f,0},_,Ss}=I|Is], Regs, D) ->
+    btb_ensure_not_used(Ss, I, Regs),
     btb_reaches_match_1(Is, Regs, D);
 btb_reaches_match_2([{bs_restore2,Src,_}=I|Is], Regs0, D) ->
     case btb_contains_context(Src, Regs0) of
@@ -340,11 +311,11 @@ btb_reaches_match_2([{bs_restore2,Src,_}=I|Is], Regs0, D) ->
 	    btb_reaches_match_1(Is, Regs0, D);
 	true ->
 	    %% Check that all other copies of the context registers
-	    %% are killed by the following instructions.
+	    %% are unused by the following instructions.
 	    Regs = btb_kill([Src], Regs0),
 	    CtxRegs = btb_context_regs(Regs),
-	    case btb_are_all_killed(CtxRegs, Is, D) of
-		false -> btb_error({CtxRegs,not_all_killed_after,I});
+	    case btb_are_all_unused(CtxRegs, Is, D) of
+		false -> btb_error({CtxRegs,not_all_unused_after,I});
 		true -> D#btb{must_not_save=true}
 	    end
     end;
@@ -354,11 +325,11 @@ btb_reaches_match_2([{bs_context_to_binary,Src}=I|Is], Regs0, D) ->
 	    btb_reaches_match_1(Is, Regs0, D);
 	true ->
 	    %% Check that all other copies of the context registers
-	    %% are killed by the following instructions.
+	    %% are unused by the following instructions.
 	    Regs = btb_kill([Src], Regs0),
 	    CtxRegs = btb_context_regs(Regs),
-	    case btb_are_all_killed(CtxRegs, Is, D) of
-		false -> btb_error({CtxRegs,not_all_killed_after,I});
+	    case btb_are_all_unused(CtxRegs, Is, D) of
+		false -> btb_error({CtxRegs,not_all_unused_after,I});
 		true -> D#btb{must_not_save=true}
 	    end
     end;
@@ -389,13 +360,16 @@ btb_call(Arity, Lbl, Regs0, Is, D0) ->
 	    %% First handle the call as if it were a tail call.
 	    D = btb_tail_call(Lbl, Regs, D0),
 
-	    %% No problem so far, but now we must make sure that
-	    %% we don't have any copies of the match context
-	    %% tucked away in an y register.
+	    %% No problem so far (the called function can handle a
+	    %% match context). Now we must make sure that the rest
+	    %% of this function following the call does not attempt
+	    %% to use the match context in case there is a copy
+	    %% tucked away in a y register.
 	    RegList = btb_context_regs(Regs),
-	    case [R || {y,_}=R <- RegList] of
-		[] -> D;
-		[_|_] -> btb_error({multiple_uses,RegList})
+	    YRegs = [R || {y,_}=R <- RegList],
+	    case btb_are_all_unused(YRegs, Is, D) of
+		true -> D;
+		false -> btb_error({multiple_uses,RegList})
 	    end;
 	true ->
 	    %% No match context in any x register. It could have been
@@ -475,21 +449,12 @@ btb_reaches_match_block([{set,Ds,Ss,_}=I|Is], Regs0) ->
 btb_reaches_match_block([], Regs) ->
     Regs.
 
-%% btb_regs_from_arity(Arity) -> [Register])
-%%  Create a list of x registers from a function arity.
-
-btb_regs_from_arity(Arity) ->
-    btb_regs_from_arity_1(Arity, []).
-
-btb_regs_from_arity_1(0, Acc) -> Acc;
-btb_regs_from_arity_1(N, Acc) -> btb_regs_from_arity_1(N-1, [{x,N-1}|Acc]).
-
 %% btb_are_all_killed([Register], [Instruction], D) -> true|false
-%%  Test whether all of the register are killed in the instruction stream.
+%%  Test whether all of the register are unused in the instruction stream.
 
-btb_are_all_killed(RegList, Is, #btb{index=Li}) ->
+btb_are_all_unused(RegList, Is, #btb{index=Li}) ->
     all(fun(R) ->
-		beam_utils:is_killed(R, Is, Li)
+		beam_utils:is_not_used(R, Is, Li)
 	end, RegList).
 
 %% btp_regs_from_list([Register]) -> RegisterSet.
diff --git a/lib/compiler/src/beam_clean.erl b/lib/compiler/src/beam_clean.erl
index a7994ab3b3..26ba93b91c 100644
--- a/lib/compiler/src/beam_clean.erl
+++ b/lib/compiler/src/beam_clean.erl
@@ -74,10 +74,6 @@ find_all_used([], _All, Used) -> Used.
 
 update_work_list([{call,_,{f,L}}|Is], Sets) ->
     update_work_list(Is, add_to_work_list(L, Sets));
-update_work_list([{call_last,_,{f,L},_}|Is], Sets) ->
-    update_work_list(Is, add_to_work_list(L, Sets));
-update_work_list([{call_only,_,{f,L}}|Is], Sets) ->
-    update_work_list(Is, add_to_work_list(L, Sets));
 update_work_list([{make_fun2,{f,L},_,_,_}|Is], Sets) ->
     update_work_list(Is, add_to_work_list(L, Sets));
 update_work_list([_|Is], Sets) ->
@@ -200,7 +196,7 @@ replace([{test,Test,{f,Lbl},Ops}|Is], Acc, D) ->
     replace(Is, [{test,Test,{f,label(Lbl, D)},Ops}|Acc], D);
 replace([{test,Test,{f,Lbl},Live,Ops,Dst}|Is], Acc, D) ->
     replace(Is, [{test,Test,{f,label(Lbl, D)},Live,Ops,Dst}|Acc], D);
-replace([{select_val,R,{f,Fail0},{list,Vls0}}|Is], Acc, D) ->
+replace([{select,I,R,{f,Fail0},Vls0}|Is], Acc, D) ->
     Vls1 = map(fun ({f,L}) -> {f,label(L, D)};
 		   (Other) -> Other end, Vls0),
     Fail = label(Fail0, D),
@@ -210,12 +206,8 @@ replace([{select_val,R,{f,Fail0},{list,Vls0}}|Is], Acc, D) ->
 	    %% Convert to a plain jump.
 	    replace(Is, [{jump,{f,Fail}}|Acc], D);
 	Vls ->
-	    replace(Is, [{select_val,R,{f,Fail},{list,Vls}}|Acc], D)
+	    replace(Is, [{select,I,R,{f,Fail},Vls}|Acc], D)
     end;
-replace([{select_tuple_arity,R,{f,Fail},{list,Vls0}}|Is], Acc, D) ->
-    Vls = map(fun ({f,L}) -> {f,label(L, D)};
-		  (Other) -> Other end, Vls0),
-    replace(Is, [{select_tuple_arity,R,{f,label(Fail, D)},{list,Vls}}|Acc], D);
 replace([{'try',R,{f,Lbl}}|Is], Acc, D) ->
     replace(Is, [{'try',R,{f,label(Lbl, D)}}|Acc], D);
 replace([{'catch',R,{f,Lbl}}|Is], Acc, D) ->
@@ -236,37 +228,12 @@ replace([{gc_bif,Name,{f,Lbl},Live,As,R}|Is], Acc, D) when Lbl =/= 0 ->
     replace(Is, [{gc_bif,Name,{f,label(Lbl, D)},Live,As,R}|Acc], D);
 replace([{call,Ar,{f,Lbl}}|Is], Acc, D) ->
     replace(Is, [{call,Ar,{f,label(Lbl,D)}}|Acc], D);
-replace([{call_last,Ar,{f,Lbl},N}|Is], Acc, D) ->
-    replace(Is, [{call_last,Ar,{f,label(Lbl,D)},N}|Acc], D);
-replace([{call_only,Ar,{f,Lbl}}|Is], Acc, D) ->
-    replace(Is, [{call_only,Ar,{f,label(Lbl, D)}}|Acc], D);
 replace([{make_fun2,{f,Lbl},U1,U2,U3}|Is], Acc, D) ->
     replace(Is, [{make_fun2,{f,label(Lbl, D)},U1,U2,U3}|Acc], D);
-replace([{bs_init2,{f,Lbl},Sz,Words,R,F,Dst}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{bs_init2,{f,label(Lbl, D)},Sz,Words,R,F,Dst}|Acc], D);
-replace([{bs_init_bits,{f,Lbl},Sz,Words,R,F,Dst}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{bs_init_bits,{f,label(Lbl, D)},Sz,Words,R,F,Dst}|Acc], D);
-replace([{bs_put_integer,{f,Lbl},Bits,Unit,Fl,Val}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{bs_put_integer,{f,label(Lbl, D)},Bits,Unit,Fl,Val}|Acc], D);
-replace([{bs_put_utf8=I,{f,Lbl},Fl,Val}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{I,{f,label(Lbl, D)},Fl,Val}|Acc], D);
-replace([{bs_put_utf16=I,{f,Lbl},Fl,Val}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{I,{f,label(Lbl, D)},Fl,Val}|Acc], D);
-replace([{bs_put_utf32=I,{f,Lbl},Fl,Val}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{I,{f,label(Lbl, D)},Fl,Val}|Acc], D);
-replace([{bs_put_binary,{f,Lbl},Bits,Unit,Fl,Val}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{bs_put_binary,{f,label(Lbl, D)},Bits,Unit,Fl,Val}|Acc], D);
-replace([{bs_put_float,{f,Lbl},Bits,Unit,Fl,Val}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{bs_put_float,{f,label(Lbl, D)},Bits,Unit,Fl,Val}|Acc], D);
-replace([{bs_add,{f,Lbl},Src,Dst}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{bs_add,{f,label(Lbl, D)},Src,Dst}|Acc], D);
-replace([{bs_append,{f,Lbl},_,_,_,_,_,_,_}=I0|Is], Acc, D) when Lbl =/= 0 ->
-    I = setelement(2, I0, {f,label(Lbl, D)}),
-    replace(Is, [I|Acc], D);
-replace([{bs_utf8_size=I,{f,Lbl},Src,Dst}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{I,{f,label(Lbl, D)},Src,Dst}|Acc], D);
-replace([{bs_utf16_size=I,{f,Lbl},Src,Dst}|Is], Acc, D) when Lbl =/= 0 ->
-    replace(Is, [{I,{f,label(Lbl, D)},Src,Dst}|Acc], D);
+replace([{bs_init,{f,Lbl},Info,Live,Ss,Dst}|Is], Acc, D) when Lbl =/= 0 ->
+    replace(Is, [{bs_init,{f,label(Lbl, D)},Info,Live,Ss,Dst}|Acc], D);
+replace([{bs_put,{f,Lbl},Info,Ss}|Is], Acc, D) when Lbl =/= 0 ->
+    replace(Is, [{bs_put,{f,label(Lbl, D)},Info,Ss}|Acc], D);
 replace([I|Is], Acc, D) ->
     replace(Is, [I|Acc], D);
 replace([], Acc, _) -> Acc.
diff --git a/lib/compiler/src/beam_dead.erl b/lib/compiler/src/beam_dead.erl
index 5f12a98f09..92d8e5acb3 100644
--- a/lib/compiler/src/beam_dead.erl
+++ b/lib/compiler/src/beam_dead.erl
@@ -182,7 +182,7 @@ forward(Is, Lc) ->
 forward([{block,[]}|Is], D, Lc, Acc) ->
     %% Empty blocks can prevent optimizations.
     forward(Is, D, Lc, Acc);
-forward([{select_val,Reg,_,{list,List}}=I|Is], D0, Lc, Acc) ->
+forward([{select,select_val,Reg,_,List}=I|Is], D0, Lc, Acc) ->
     D = update_value_dict(List, Reg, D0),
     forward(Is, D, Lc, [I|Acc]);
 forward([{label,Lbl}=LblI,{block,[{set,[Dst],[Lit],move}|BlkIs]}=Blk|Is], D, Lc, Acc) ->
@@ -271,11 +271,11 @@ backward([{test,is_eq_exact,Fail,[Dst,{integer,Arity}]}=I|
     end;
 backward([{label,Lbl}=L|Is], D, Acc) ->
     backward(Is, beam_utils:index_label(Lbl, Acc, D), [L|Acc]);
-backward([{select_val,Reg,{f,Fail0},{list,List0}}|Is], D, Acc) ->
+backward([{select,select_val,Reg,{f,Fail0},List0}|Is], D, Acc) ->
     List = shortcut_select_list(List0, Reg, D, []),
     Fail1 = shortcut_label(Fail0, D),
     Fail = shortcut_bs_test(Fail1, Is, D),
-    Sel = {select_val,Reg,{f,Fail},{list,List}},
+    Sel = {select,select_val,Reg,{f,Fail},List},
     backward(Is, D, [Sel|Acc]);
 backward([{jump,{f,To0}},{move,Src,Reg}=Move0|Is], D, Acc) ->
     {To,Move} = case Src of
@@ -382,7 +382,7 @@ shortcut_select_label(To0, Reg, Val, D) ->
     case beam_utils:code_at(To0, D) of
  	[{jump,{f,To}}|_] ->
  	    shortcut_select_label(To, Reg, Val, D);
-	[{test,is_atom,_,[Reg]},{select_val,Reg,{f,Fail},{list,Map}}|_] ->
+	[{test,is_atom,_,[Reg]},{select,select_val,Reg,{f,Fail},Map}|_] ->
 	    To = find_select_val(Map, Val, Fail),
 	    shortcut_select_label(To, Reg, Val, D);
   	[{test,is_eq_exact,{f,_},[Reg,{atom,Val}]},{label,To}|_] when is_atom(Val) ->
@@ -472,10 +472,10 @@ combine_eqs(To, [Reg,{Type,_}=Lit1]=Ops, D, [{label,L1}|_])
     case beam_utils:code_at(To, D) of
 	[{test,is_eq_exact,{f,F2},[Reg,{Type,_}=Lit2]},
 	 {label,L2}|_] when Lit1 =/= Lit2 ->
-	    {select_val,Reg,{f,F2},{list,[Lit1,{f,L1},Lit2,{f,L2}]}};
-	[{select_val,Reg,{f,F2},{list,[{Type,_}|_]=List0}}|_] ->
+	    {select,select_val,Reg,{f,F2},[Lit1,{f,L1},Lit2,{f,L2}]};
+	[{select,select_val,Reg,{f,F2},[{Type,_}|_]=List0}|_] ->
 	    List = remove_from_list(Lit1, List0),
-	    {select_val,Reg,{f,F2},{list,[Lit1,{f,L1}|List]}};
+	    {select,select_val,Reg,{f,F2},[Lit1,{f,L1}|List]};
 	_Is ->
 	    {test,is_eq_exact,{f,To},Ops}
 	end;
@@ -527,6 +527,8 @@ count_bits_matched([{test,_,_,_,[_,Sz,U,{field_flags,_}],_}|Is], SavePoint, Bits
 	{integer,N} -> count_bits_matched(Is, SavePoint, Bits+N*U);
 	_ -> count_bits_matched(Is, SavePoint, Bits)
     end;
+count_bits_matched([{test,bs_match_string,_,[_,Bits,_]}|Is], SavePoint, Bits0) ->
+    count_bits_matched(Is, SavePoint, Bits0+Bits);
 count_bits_matched([{test,_,_,_}|Is], SavePoint, Bits) ->
     count_bits_matched(Is, SavePoint, Bits);
 count_bits_matched([{bs_save2,Reg,SavePoint}|_], {Reg,SavePoint}, Bits) ->
diff --git a/lib/compiler/src/beam_except.erl b/lib/compiler/src/beam_except.erl
index fb1a43cd9e..14e9943f88 100644
--- a/lib/compiler/src/beam_except.erl
+++ b/lib/compiler/src/beam_except.erl
@@ -65,10 +65,6 @@ function_1(Is0) ->
 
 translate([{call_ext,Ar,{extfunc,erlang,error,Ar}}=I|Is], St, Acc) ->
     translate_1(Ar, I, Is, St, Acc);
-translate([{call_ext_only,Ar,{extfunc,erlang,error,Ar}}=I|Is], St, Acc) ->
-    translate_1(Ar, I, Is, St, Acc);
-translate([{call_ext_last,Ar,{extfunc,erlang,error,Ar},_}=I|Is], St, Acc) ->
-    translate_1(Ar, I, Is, St, Acc);
 translate([I|Is], St, Acc) ->
     translate(Is, St, [I|Acc]);
 translate([], _, Acc) ->
diff --git a/lib/compiler/src/beam_flatten.erl b/lib/compiler/src/beam_flatten.erl
index 6c7cb849aa..04232d8fd2 100644
--- a/lib/compiler/src/beam_flatten.erl
+++ b/lib/compiler/src/beam_flatten.erl
@@ -79,49 +79,28 @@ norm_allocate({nozero,Ns,Nh,Inits}, Regs) ->
 
 %% insert_alloc_in_bs_init(ReverseInstructionStream, AllocationInfo) ->
 %%                                  impossible | ReverseInstructionStream'
-%%   A bs_init2/6 instruction should not be followed by a test heap instruction.
+%%   A bs_init/6 instruction should not be followed by a test heap instruction.
 %%   Given the AllocationInfo from a test heap instruction, merge the
-%%   allocation amounts into the previous bs_init2/6 instruction (if any).
+%%   allocation amounts into the previous bs_init/6 instruction (if any).
 %%
-insert_alloc_in_bs_init([I|_]=Is, Alloc) ->
-    case is_bs_constructor(I) of
-	false -> impossible;
-	true -> insert_alloc_1(Is, Alloc, [])
-    end.
-
-insert_alloc_1([{bs_init2=Op,Fail,Bs,Ws1,Regs,F,Dst}|Is], {_,nostack,Ws2,[]}, Acc) ->
-    Al = beam_utils:combine_heap_needs(Ws1, Ws2),
-    I = {Op,Fail,Bs,Al,Regs,F,Dst},
-    reverse(Acc, [I|Is]);
-insert_alloc_1([{bs_init_bits=Op,Fail,Bs,Ws1,Regs,F,Dst}|Is], {_,nostack,Ws2,[]}, Acc) ->
-    Al = beam_utils:combine_heap_needs(Ws1, Ws2),
-    I = {Op,Fail,Bs,Al,Regs,F,Dst},
-    reverse(Acc, [I|Is]);
-insert_alloc_1([{bs_append,Fail,Sz,Ws1,Regs,U,Bin,Fl,Dst}|Is],
-	       {_,nostack,Ws2,[]}, Acc) ->
+insert_alloc_in_bs_init([{bs_put,_,_,_}=I|Is], Alloc) ->
+    %% The instruction sequence ends with an bs_put/4 instruction.
+    %% We'll need to search backwards for the bs_init/6 instruction.
+    insert_alloc_1(Is, Alloc, [I]);
+insert_alloc_in_bs_init(_, _) -> impossible.
+
+insert_alloc_1([{bs_init=Op,Fail,Info0,Live,Ss,Dst}|Is],
+	       {_,nostack,Ws2,[]}, Acc) when is_integer(Live) ->
+    %% The number of extra heap words is always in the second position
+    %% in the Info tuple.
+    Ws1 = element(2, Info0),
     Al = beam_utils:combine_heap_needs(Ws1, Ws2),
-    I = {bs_append,Fail,Sz,Al,Regs,U,Bin,Fl,Dst},
+    Info = setelement(2, Info0, Al),
+    I = {Op,Fail,Info,Live,Ss,Dst},
     reverse(Acc, [I|Is]);
-insert_alloc_1([I|Is], Alloc, Acc) ->
+insert_alloc_1([{bs_put,_,_,_}=I|Is], Alloc, Acc) ->
     insert_alloc_1(Is, Alloc, [I|Acc]).
 
-
-%% is_bs_constructor(Instruction) -> true|false.
-%%  Test whether the instruction is a bit syntax construction
-%%  instruction that can occur at the end of a bit syntax
-%%  construction. (Since an empty binary would be expressed
-%%  as a literal, the bs_init2/6 instruction will not occur
-%%  at the end and therefore it is no need to test for it here.)
-%%
-is_bs_constructor({bs_put_integer,_,_,_,_,_}) -> true;
-is_bs_constructor({bs_put_utf8,_,_,_}) -> true;
-is_bs_constructor({bs_put_utf16,_,_,_}) -> true;
-is_bs_constructor({bs_put_utf32,_,_,_}) -> true;
-is_bs_constructor({bs_put_float,_,_,_,_,_}) -> true;
-is_bs_constructor({bs_put_binary,_,_,_,_,_}) -> true;
-is_bs_constructor({bs_put_string,_,_}) -> true;
-is_bs_constructor(_) -> false.
-
 %% opt(Is0) -> Is
 %%  Simple peep-hole optimization to move a {move,Any,{x,0}} past
 %%  any kill up to the next call instruction. (To give the loader
diff --git a/lib/compiler/src/beam_jump.erl b/lib/compiler/src/beam_jump.erl
index db67d24514..636c299e47 100644
--- a/lib/compiler/src/beam_jump.erl
+++ b/lib/compiler/src/beam_jump.erl
@@ -20,7 +20,7 @@
 
 -module(beam_jump).
 
--export([module/2,module_labels/1,
+-export([module/2,
 	 is_unreachable_after/1,is_exit_instruction/1,
 	 remove_unused_labels/1,is_label_used_in/2]).
 
@@ -46,10 +46,13 @@
 %%%     such as a jump that never transfers control to the instruction
 %%%     following it.
 %%%
-%%% (2) case_end, if_end, and badmatch, and function calls that cause an
-%%%     exit (such as calls to exit/1) are moved to the end of the function.
-%%%     The purpose is to allow further optimizations at the place from
-%%%     which the code was moved.
+%%% (2) Short sequences starting with a label and ending in case_end, if_end,
+%%%     and badmatch, and function calls that cause an exit (such as calls
+%%%     to exit/1) are moved to the end of the function, but only if the
+%%%     the block is not entered via a fallthrough. The purpose of this move
+%%%     is to allow further optimizations at the place from which the
+%%%     code was moved (a jump around the block could be replaced with a
+%%%     fallthrough).
 %%%
 %%% (3) Any unreachable code is removed.  Unreachable code is code
 %%%     after jump, call_last and other instructions which never
@@ -130,13 +133,6 @@ module({Mod,Exp,Attr,Fs0,Lc}, _Opt) ->
     Fs = [function(F) || F <- Fs0],
     {ok,{Mod,Exp,Attr,Fs,Lc}}.
 
-module_labels({Mod,Exp,Attr,Fs,Lc}) ->
-    {Mod,Exp,Attr,[function_labels(F) || F <- Fs],Lc}.
-
-function_labels({function,Name,Arity,CLabel,Asm0}) ->
-    Asm = remove_unused_labels(Asm0),
-    {function,Name,Arity,CLabel,Asm}.    
-
 %% function(Function) -> Function'
 %%  Optimize jumps and branches.
 %%
@@ -224,6 +220,8 @@ extract_seq([{line,_}=Line|Is], Acc) ->
     extract_seq(Is, [Line|Acc]);
 extract_seq([{block,_}=Bl|Is], Acc) ->
     extract_seq_1(Is, [Bl|Acc]);
+extract_seq([{bs_context_to_binary,_}=I|Is], Acc) ->
+    extract_seq_1(Is, [I|Acc]);
 extract_seq([{label,_}|_]=Is, Acc) ->
     extract_seq_1(Is, Acc);
 extract_seq(_, _) -> no.
@@ -232,6 +230,9 @@ extract_seq_1([{line,_}=Line|Is], Acc) ->
     extract_seq_1(Is, [Line|Acc]);
 extract_seq_1([{label,_},{func_info,_,_,_}|_], _) ->
     no;
+extract_seq_1([{label,Lbl},{jump,{f,Lbl}}|_], _) ->
+    %% Don't move a sequence which have a fallthrough entering it.
+    no;
 extract_seq_1([{label,_}=Lbl|Is], Acc) ->
     {yes,[Lbl|Acc],Is};
 extract_seq_1(_, _) -> no.
@@ -260,43 +261,39 @@ find_fixpoint(OptFun, Is0) ->
 	Is -> find_fixpoint(OptFun, Is)
     end.
 
-opt([{test,Test0,{f,Lnum}=Lbl,Ops}=I|Is0], Acc, St) ->
-    case Is0 of
-	[{jump,{f,Lnum}}|Is] ->
-	    %% We have
-	    %%    Test Label Ops
-	    %%    jump Label
-	    %% The test instruction is definitely not needed.
-	    %% The jump instruction is not needed if there is
-	    %% a definition of Label following the jump instruction.
-	    case is_label_defined(Is, Lnum) of
-		false ->
-		    %% The jump instruction is still needed.
-		    opt(Is0, [I|Acc], label_used(Lbl, St));
-		true ->
-		    %% Neither the test nor the jump are needed.
-		    opt(Is, Acc, St)
-	    end;
-	[{jump,To}|Is] ->
-	    case is_label_defined(Is, Lnum) of
-		false ->
+opt([{test,_,{f,L}=Lbl,_}=I|[{jump,{f,L}}|_]=Is], Acc, St) ->
+    %% We have
+    %%    Test Label Ops
+    %%    jump Label
+    %% The test instruction is not needed if the test is pure
+    %% (it modifies neither registers nor bit syntax state).
+    case beam_utils:is_pure_test(I) of
+	false ->
+	    %% Test is not pure; we must keep it.
+	    opt(Is, [I|Acc], label_used(Lbl, St));
+	true ->
+	    %% The test is pure and its failure label is the same
+	    %% as in the jump that follows -- thus it is not needed.
+	    opt(Is, Acc, St)
+    end;
+opt([{test,Test0,{f,L}=Lbl,Ops}=I|[{jump,To}|Is]=Is0], Acc, St) ->
+    case is_label_defined(Is, L) of
+	false ->
+	    opt(Is0, [I|Acc], label_used(Lbl, St));
+	true ->
+	    case invert_test(Test0) of
+		not_possible ->
 		    opt(Is0, [I|Acc], label_used(Lbl, St));
-		true ->
-		    case invert_test(Test0) of
-			not_possible ->
-			    opt(Is0, [I|Acc], label_used(Lbl, St));
-			Test ->
-			    opt([{test,Test,To,Ops}|Is], Acc, St)
-		    end
-	    end;
-	_Other ->
-	    opt(Is0, [I|Acc], label_used(Lbl, St))
+		Test ->
+		    %% Invert the test and remove the jump.
+		    opt([{test,Test,To,Ops}|Is], Acc, St)
+	    end
     end;
+opt([{test,_,{f,_}=Lbl,_}=I|Is], Acc, St) ->
+    opt(Is, [I|Acc], label_used(Lbl, St));
 opt([{test,_,{f,_}=Lbl,_,_,_}=I|Is], Acc, St) ->
     opt(Is, [I|Acc], label_used(Lbl, St));
-opt([{select_val,_R,Fail,{list,Vls}}=I|Is], Acc, St) ->
-    skip_unreachable(Is, [I|Acc], label_used([Fail|Vls], St));
-opt([{select_tuple_arity,_R,Fail,{list,Vls}}=I|Is], Acc, St) ->
+opt([{select,_,_R,Fail,Vls}=I|Is], Acc, St) ->
     skip_unreachable(Is, [I|Acc], label_used([Fail|Vls], St));
 opt([{label,L}=I|Is], Acc, #st{entry=L}=St) ->
     %% NEVER move the entry label.
@@ -412,14 +409,8 @@ is_label_used(L, St) ->
 
 is_unreachable_after({func_info,_M,_F,_A}) -> true;
 is_unreachable_after(return) -> true;
-is_unreachable_after({call_ext_last,_Ar,_ExtFunc,_D}) -> true;
-is_unreachable_after({call_ext_only,_Ar,_ExtFunc}) -> true;
-is_unreachable_after({call_last,_Ar,_Lbl,_D}) -> true;
-is_unreachable_after({call_only,_Ar,_Lbl}) -> true;
-is_unreachable_after({apply_last,_Ar,_N}) -> true;
 is_unreachable_after({jump,_Lbl}) -> true;
-is_unreachable_after({select_val,_R,_Lbl,_Cases}) -> true;
-is_unreachable_after({select_tuple_arity,_R,_Lbl,_Cases}) -> true;
+is_unreachable_after({select,_What,_R,_Lbl,_Cases}) -> true;
 is_unreachable_after({loop_rec_end,_}) -> true;
 is_unreachable_after({wait,_}) -> true;
 is_unreachable_after(I) -> is_exit_instruction(I).
@@ -430,10 +421,6 @@ is_unreachable_after(I) -> is_exit_instruction(I).
 
 is_exit_instruction({call_ext,_,{extfunc,M,F,A}}) ->
     erl_bifs:is_exit_bif(M, F, A);
-is_exit_instruction({call_ext_last,_,{extfunc,M,F,A},_}) ->
-    erl_bifs:is_exit_bif(M, F, A);
-is_exit_instruction({call_ext_only,_,{extfunc,M,F,A}}) ->
-    erl_bifs:is_exit_bif(M, F, A);
 is_exit_instruction(if_end) -> true;
 is_exit_instruction({case_end,_}) -> true;
 is_exit_instruction({try_case_end,_}) -> true;
@@ -516,9 +503,7 @@ ulbl({test,_,Fail,_}, Used) ->
     mark_used(Fail, Used);
 ulbl({test,_,Fail,_,_,_}, Used) ->
     mark_used(Fail, Used);
-ulbl({select_val,_,Fail,{list,Vls}}, Used) ->
-    mark_used_list(Vls, mark_used(Fail, Used));
-ulbl({select_tuple_arity,_,Fail,{list,Vls}}, Used) ->
+ulbl({select,_,_,Fail,Vls}, Used) ->
     mark_used_list(Vls, mark_used(Fail, Used));
 ulbl({'try',_,Lbl}, Used) ->
     mark_used(Lbl, Used);
@@ -538,29 +523,9 @@ ulbl({bif,_Name,Lbl,_As,_R}, Used) ->
     mark_used(Lbl, Used);
 ulbl({gc_bif,_Name,Lbl,_Live,_As,_R}, Used) ->
     mark_used(Lbl, Used);
-ulbl({bs_init2,Lbl,_,_,_,_,_}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_init_bits,Lbl,_,_,_,_,_}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_put_integer,Lbl,_Bits,_Unit,_Fl,_Val}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_put_float,Lbl,_Bits,_Unit,_Fl,_Val}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_put_binary,Lbl,_Bits,_Unit,_Fl,_Val}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_put_utf8,Lbl,_Fl,_Val}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_put_utf16,Lbl,_Fl,_Val}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_put_utf32,Lbl,_Fl,_Val}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_add,Lbl,_,_}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_append,Lbl,_,_,_,_,_,_,_}, Used) ->
-    mark_used(Lbl, Used);
-ulbl({bs_utf8_size,Lbl,_,_}, Used) ->
+ulbl({bs_init,Lbl,_,_,_,_}, Used) ->
     mark_used(Lbl, Used);
-ulbl({bs_utf16_size,Lbl,_,_}, Used) ->
+ulbl({bs_put,Lbl,_,_}, Used) ->
     mark_used(Lbl, Used);
 ulbl(_, Used) -> Used.
 
diff --git a/lib/compiler/src/beam_peep.erl b/lib/compiler/src/beam_peep.erl
index f39fc50b95..a199aa50ed 100644
--- a/lib/compiler/src/beam_peep.erl
+++ b/lib/compiler/src/beam_peep.erl
@@ -120,13 +120,13 @@ peep([{test,Op,_,Ops}=I|Is], SeenTests0, Acc) ->
 		    peep(Is, SeenTests, [I|Acc])
 	    end
     end;
-peep([{select_val,Src,Fail,
-	{list,[{atom,false},{f,L},{atom,true},{f,L}]}}|
+peep([{select,select_val,Src,Fail,
+       [{atom,false},{f,L},{atom,true},{f,L}]}|
       [{label,L}|_]=Is], SeenTests, Acc) ->
     I = {test,is_boolean,Fail,[Src]},
     peep([I|Is], SeenTests, Acc);
-peep([{select_val,Src,Fail,
-       {list,[{atom,true},{f,L},{atom,false},{f,L}]}}|
+peep([{select,select_val,Src,Fail,
+       [{atom,true},{f,L},{atom,false},{f,L}]}|
       [{label,L}|_]=Is], SeenTests, Acc) ->
     I = {test,is_boolean,Fail,[Src]},
     peep([I|Is], SeenTests, Acc);
diff --git a/lib/compiler/src/beam_receive.erl b/lib/compiler/src/beam_receive.erl
index bd1f44f66b..fe95a7e35b 100644
--- a/lib/compiler/src/beam_receive.erl
+++ b/lib/compiler/src/beam_receive.erl
@@ -84,13 +84,29 @@ function({function,Name,Arity,Entry,Is}) ->
 	    erlang:raise(Class, Error, Stack)
     end.
 
+opt([{call_ext,A,{extfunc,erlang,spawn_monitor,A}}=I0|Is0], D, Acc)
+  when A =:= 1; A =:= 3 ->
+    case ref_in_tuple(Is0) of
+	no ->
+	    opt(Is0, D, [I0|Acc]);
+	{yes,Regs,Is1,MatchReversed} ->
+	    %% The call creates a brand new reference. Now
+	    %% search for a receive statement in the same
+	    %% function that will match against the reference.
+	    case opt_recv(Is1, Regs, D) of
+		no ->
+		    opt(Is0, D, [I0|Acc]);
+		{yes,Is,Lbl} ->
+		    opt(Is, D, MatchReversed++[I0,{recv_mark,{f,Lbl}}|Acc])
+	    end
+    end;
 opt([{call_ext,Arity,{extfunc,erlang,Name,Arity}}=I|Is0], D, Acc) ->
     case creates_new_ref(Name, Arity) of
 	true ->
 	    %% The call creates a brand new reference. Now
 	    %% search for a receive statement in the same
 	    %% function that will match against the reference.
-	    case opt_recv(Is0, D) of
+	    case opt_recv(Is0, regs_init_x0(), D) of
 		no ->
 		    opt(Is0, D, [I|Acc]);
 		{yes,Is,Lbl} ->
@@ -104,19 +120,34 @@ opt([I|Is], D, Acc) ->
 opt([], _, Acc) ->
     reverse(Acc).
 
+ref_in_tuple([{test,is_tuple,_,[{x,0}]}=I1,
+	      {test,test_arity,_,[{x,0},2]}=I2,
+	      {block,[{set,[_],[{x,0}],{get_tuple_element,0}},
+		      {set,[Dst],[{x,0}],{get_tuple_element,1}}|Bl]}=I3|Is]) ->
+    ref_in_tuple_1(Bl, Dst, Is, [I3,I2,I1]);
+ref_in_tuple([{test,is_tuple,_,[{x,0}]}=I1,
+	      {test,test_arity,_,[{x,0},2]}=I2,
+	      {block,[{set,[Dst],[{x,0}],{get_tuple_element,1}}|Bl]}=I3|Is]) ->
+    ref_in_tuple_1(Bl, Dst, Is, [I3,I2,I1]);
+ref_in_tuple(_) -> no.
+
+ref_in_tuple_1(Bl, Dst, Is, MatchReversed) ->
+    Regs0 = regs_init_singleton(Dst),
+    Regs = opt_update_regs_bl(Bl, Regs0),
+    {yes,Regs,Is,MatchReversed}.
+
 %% creates_new_ref(Name, Arity) -> true|false.
 %%  Return 'true' if the BIF Name/Arity will create a new reference.
 creates_new_ref(monitor, 2) -> true;
 creates_new_ref(make_ref, 0) -> true;
 creates_new_ref(_, _) -> false.
 
-%% opt_recv([Instruction], LabelIndex) -> no|{yes,[Instruction]}
+%% opt_recv([Instruction], Regs, LabelIndex) -> no|{yes,[Instruction]}
 %%  Search for a receive statement that will only retrieve messages
 %%  that contain the newly created reference (which is currently in {x,0}).
-opt_recv(Is, D) ->
-    R = regs_init_x0(),
+opt_recv(Is, Regs, D) ->
     L = gb_sets:empty(),
-    opt_recv(Is, D, R, L, []).
+    opt_recv(Is, D, Regs, L, []).
 
 opt_recv([{label,L}=Lbl,{loop_rec,{f,Fail},_}=Loop|Is], D, R0, _, Acc) ->
     R = regs_kill_not_live(0, R0),
@@ -157,8 +188,6 @@ opt_update_regs({call_fun,_}, R, L) ->
     {regs_kill_not_live(0, R),L};
 opt_update_regs({kill,Y}, R, L) ->
     {regs_kill([Y], R),L};
-opt_update_regs(send, R, L) ->
-    {regs_kill_not_live(0, R),L};
 opt_update_regs({'catch',_,{f,Lbl}}, R, L) ->
     {R,gb_sets:add(Lbl, L)};
 opt_update_regs({catch_end,_}, R, L) ->
@@ -253,10 +282,7 @@ opt_ref_used_1([{test,is_ne_exact,{f,Fail},Args}|Is], RefReg, D, Done0, Regs) ->
 opt_ref_used_1([{test,_,{f,Fail},_}|Is], RefReg, D, Done0, Regs) ->
     Done = opt_ref_used_at(Fail, RefReg, D, Done0, Regs),
     opt_ref_used_1(Is, RefReg, D, Done, Regs);
-opt_ref_used_1([{select_tuple_arity,_,{f,Fail},{list,List}}|_], RefReg, D, Done, Regs) ->
-    Lbls = [F || {f,F} <- List] ++ [Fail],
-    opt_ref_used_in_all(Lbls, RefReg, D, Done, Regs);
-opt_ref_used_1([{select_val,_,{f,Fail},{list,List}}|_], RefReg, D, Done, Regs) ->
+opt_ref_used_1([{select,_,_,{f,Fail},List}|_], RefReg, D, Done, Regs) ->
     Lbls = [F || {f,F} <- List] ++ [Fail],
     opt_ref_used_in_all(Lbls, RefReg, D, Done, Regs);
 opt_ref_used_1([{label,Lbl}|Is], RefReg, D, Done, Regs) ->
@@ -323,6 +349,12 @@ opt_ref_used_bl([], Regs) -> Regs.
 regs_init() ->
     {0,0}.
 
+%% regs_init_singleton(Register) -> RegisterSet
+%%  Return a set that only contains one register.
+
+regs_init_singleton(Reg) ->
+    regs_add(Reg, regs_init()).
+
 %% regs_init_x0() -> RegisterSet
 %%  Return a set that only contains the {x,0} register.
 
diff --git a/lib/compiler/src/beam_trim.erl b/lib/compiler/src/beam_trim.erl
index 5f4fa3b1f8..d95db1f681 100644
--- a/lib/compiler/src/beam_trim.erl
+++ b/lib/compiler/src/beam_trim.erl
@@ -172,38 +172,16 @@ remap([{bif,Name,Fail,Ss,D}|Is], Map, Acc) ->
 remap([{gc_bif,Name,Fail,Live,Ss,D}|Is], Map, Acc) ->
     I = {gc_bif,Name,Fail,Live,[Map(S) || S <- Ss],Map(D)},
     remap(Is, Map, [I|Acc]);
-remap([{bs_add,Fail,[SrcA,SrcB,U],D}|Is], Map, Acc) ->
-    I = {bs_add,Fail,[Map(SrcA),Map(SrcB),U],Map(D)},
+remap([{bs_init,Fail,Info,Live,Ss0,Dst0}|Is], Map, Acc) ->
+    Ss = [Map(Src) || Src <- Ss0],
+    Dst = Map(Dst0),
+    I = {bs_init,Fail,Info,Live,Ss,Dst},
     remap(Is, Map, [I|Acc]);
-remap([{bs_append=Op,Fail,Bits,Heap,Live,Unit,Bin,Flags,D}|Is], Map, Acc) ->
-    I = {Op,Fail,Map(Bits),Heap,Live,Unit,Map(Bin),Flags,Map(D)},
-    remap(Is, Map, [I|Acc]);
-remap([{bs_private_append=Op,Fail,Bits,Unit,Bin,Flags,D}|Is], Map, Acc) ->
-    I = {Op,Fail,Map(Bits),Unit,Map(Bin),Flags,Map(D)},
-    remap(Is, Map, [I|Acc]);
-remap([bs_init_writable=I|Is], Map, Acc) ->
-    remap(Is, Map, [I|Acc]);
-remap([{bs_init2,Fail,Src,Live,U,Flags,D}|Is], Map, Acc) ->
-    I = {bs_init2,Fail,Map(Src),Live,U,Flags,Map(D)},
-    remap(Is, Map, [I|Acc]);
-remap([{bs_init_bits,Fail,Src,Live,U,Flags,D}|Is], Map, Acc) ->
-    I = {bs_init_bits,Fail,Map(Src),Live,U,Flags,Map(D)},
-    remap(Is, Map, [I|Acc]);
-remap([{bs_put_binary=Op,Fail,Src,U,Flags,D}|Is], Map, Acc) ->
-    I = {Op,Fail,Map(Src),U,Flags,Map(D)},
-    remap(Is, Map, [I|Acc]);
-remap([{bs_put_integer=Op,Fail,Src,U,Flags,D}|Is], Map, Acc) ->
-    I = {Op,Fail,Map(Src),U,Flags,Map(D)},
-    remap(Is, Map, [I|Acc]);
-remap([{bs_put_float=Op,Fail,Src,U,Flags,D}|Is], Map, Acc) ->
-    I = {Op,Fail,Map(Src),U,Flags,Map(D)},
-    remap(Is, Map, [I|Acc]);
-remap([{bs_put_string,_,_}=I|Is], Map, Acc) ->
+remap([{bs_put=Op,Fail,Info,Ss}|Is], Map, Acc) ->
+    I = {Op,Fail,Info,[Map(S) || S <- Ss]},
     remap(Is, Map, [I|Acc]);
 remap([{kill,Y}|T], Map, Acc) ->
     remap(T, Map, [{kill,Map(Y)}|Acc]);
-remap([send=I|T], Map, Acc) ->
-    remap(T, Map, [I|Acc]);
 remap([{make_fun2,_,_,_,_}=I|T], Map, Acc) ->
     remap(T, Map, [I|Acc]);
 remap([{deallocate,N}|Is], Map, Acc) ->
@@ -217,12 +195,6 @@ remap([{test,Name,Fail,Live,Ss,Dst}|Is], Map, Acc) ->
     remap(Is, Map, [I|Acc]);
 remap([return|_]=Is, _, Acc) ->
     reverse(Acc, Is);
-remap([{call_last,Ar,Name,N}|Is], Map, Acc) ->
-    I = {call_last,Ar,Name,Map({frame_size,N})},
-    reverse(Acc, [I|Is]);
-remap([{call_ext_last,Ar,Name,N}|Is], Map, Acc) ->
-    I = {call_ext_last,Ar,Name,Map({frame_size,N})},
-    reverse(Acc, [I|Is]);
 remap([{line,_}=I|Is], Map, Acc) ->
     remap(Is, Map, [I|Acc]).
     
@@ -280,8 +252,8 @@ frame_size([{call_fun,_}|Is], Safe) ->
     frame_size(Is, Safe);
 frame_size([{call,_,_}|Is], Safe) ->
     frame_size(Is, Safe);
-frame_size([{call_ext,A,{extfunc,M,F,A}}|Is], Safe) ->
-    case erl_bifs:is_exit_bif(M, F, A) of
+frame_size([{call_ext,_,_}=I|Is], Safe) ->
+    case beam_jump:is_exit_instruction(I) of
 	true -> throw(not_possible);
 	false -> frame_size(Is, Safe)
     end;
@@ -295,35 +267,15 @@ frame_size([{test,_,{f,L},_}|Is], Safe) ->
     frame_size_branch(L, Is, Safe);
 frame_size([{test,_,{f,L},_,_,_}|Is], Safe) ->
     frame_size_branch(L, Is, Safe);
-frame_size([{bs_add,{f,L},_,_}|Is], Safe) ->
+frame_size([{bs_init,{f,L},_,_,_,_}|Is], Safe) ->
     frame_size_branch(L, Is, Safe);
-frame_size([{bs_append,{f,L},_,_,_,_,_,_,_}|Is], Safe) ->
+frame_size([{bs_put,{f,L},_,_}|Is], Safe) ->
     frame_size_branch(L, Is, Safe);
-frame_size([{bs_private_append,{f,L},_,_,_,_,_}|Is], Safe) ->
-    frame_size_branch(L, Is, Safe);
-frame_size([bs_init_writable|Is], Safe) ->
-    frame_size(Is, Safe);
-frame_size([{bs_init2,{f,L},_,_,_,_,_}|Is], Safe) ->
-    frame_size_branch(L, Is, Safe);
-frame_size([{bs_init_bits,{f,L},_,_,_,_,_}|Is], Safe) ->
-    frame_size_branch(L, Is, Safe);
-frame_size([{bs_put_binary,{f,L},_,_,_,_}|Is], Safe) ->
-    frame_size_branch(L, Is, Safe);
-frame_size([{bs_put_integer,{f,L},_,_,_,_}|Is], Safe) ->
-    frame_size_branch(L, Is, Safe);
-frame_size([{bs_put_float,{f,L},_,_,_,_}|Is], Safe) ->
-    frame_size_branch(L, Is, Safe);
-frame_size([{bs_put_string,_,_}|Is], Safe) ->
-    frame_size(Is, Safe);
 frame_size([{kill,_}|Is], Safe) ->
     frame_size(Is, Safe);
-frame_size([send|Is], Safe) ->
-    frame_size(Is, Safe);
 frame_size([{make_fun2,_,_,_,_}|Is], Safe) ->
     frame_size(Is, Safe);
 frame_size([{deallocate,N}|_], _) -> N;
-frame_size([{call_last,_,_,N}|_], _) -> N;
-frame_size([{call_ext_last,_,_,N}|_], _) -> N;
 frame_size([{line,_}|Is], Safe) ->
     frame_size(Is, Safe);
 frame_size([_|_], _) -> throw(not_possible).
diff --git a/lib/compiler/src/beam_utils.erl b/lib/compiler/src/beam_utils.erl
index 194f089ba1..8af0447f63 100644
--- a/lib/compiler/src/beam_utils.erl
+++ b/lib/compiler/src/beam_utils.erl
@@ -87,7 +87,7 @@ is_killed_at(R, Lbl, D) when is_integer(Lbl) ->
 %%  across branches.
 
 is_not_used(R, Is, D) ->
-    St = #live{bl=fun check_used_block/2,lbl=D,res=gb_trees:empty()},
+    St = #live{bl=check_used_block_fun(D),lbl=D,res=gb_trees:empty()},
     case check_liveness(R, Is, St) of
 	{killed,_} -> true;
 	{used,_} -> false;
@@ -102,7 +102,7 @@ is_not_used(R, Is, D) ->
 %%  across branches.
 
 is_not_used_at(R, Lbl, D) ->
-    St = #live{bl=fun check_used_block/2,lbl=D,res=gb_trees:empty()},
+    St = #live{bl=check_used_block_fun(D),lbl=D,res=gb_trees:empty()},
     case check_liveness_at(R, Lbl, St) of
 	{killed,_} -> true;
 	{used,_} -> false;
@@ -263,26 +263,14 @@ check_liveness(R, [{test,_,{f,Fail},As}|Is], St0) ->
 		{_,_}=Other -> Other
 	    end
     end;
-check_liveness(R, [{test,_,{f,Fail},Live,Ss,_}|Is], St0) ->
-    case R of
-	{x,X} ->
-	    case X < Live orelse member(R, Ss) of
-		true -> {used,St0};
-		false -> check_liveness_at(R, Fail, St0)
-	    end;
-	{y,_} ->
-	    case check_liveness_at(R, Fail, St0) of
-		{killed,St} -> check_liveness(R, Is, St);
-		{_,_}=Other -> Other
-	    end
-    end;
-check_liveness(R, [{select_val,R,_,_}|_], St) ->
-    {used,St};
-check_liveness(R, [{select_val,_,Fail,{list,Branches}}|_], St) ->
-    check_liveness_everywhere(R, [Fail|Branches], St);
-check_liveness(R, [{select_tuple_arity,R,_,_}|_], St) ->
+check_liveness(R, [{test,Op,Fail,Live,Ss,Dst}|Is], St) ->
+    %% Check this instruction as a block to get a less conservative
+    %% result if the caller is is_not_used/3.
+    Block = [{set,[Dst],Ss,{alloc,Live,{bif,Op,Fail}}}],
+    check_liveness(R, [{block,Block}|Is], St);
+check_liveness(R, [{select,_,R,_,_}|_], St) ->
     {used,St};
-check_liveness(R, [{select_tuple_arity,_,Fail,{list,Branches}}|_], St) ->
+check_liveness(R, [{select,_,_,Fail,Branches}|_], St) ->
     check_liveness_everywhere(R, [Fail|Branches], St);
 check_liveness(R, [{jump,{f,F}}|_], St) ->
     check_liveness_at(R, F, St);
@@ -301,37 +289,33 @@ check_liveness(R, [{kill,R}|_], St) ->
     {killed,St};
 check_liveness(R, [{kill,_}|Is], St) ->
     check_liveness(R, Is, St);
-check_liveness(R, [bs_init_writable|Is], St) ->
-    if
-	R =:= {x,0} -> {used,St};
-	true -> check_liveness(R, Is, St)
-    end;
-check_liveness(R, [{bs_private_append,_,Bits,_,Bin,_,Dst}|Is], St) ->
-    case R of
-	Bits -> {used,St};
-	Bin -> {used,St};
-	Dst -> {killed,St};
-	_ -> check_liveness(R, Is, St)
+check_liveness(R, [{bs_init,_,_,none,Ss,Dst}|Is], St) ->
+    case member(R, Ss) of
+	true ->
+	    {used,St};
+	false ->
+	    if
+		R =:= Dst -> {killed,St};
+		true -> check_liveness(R, Is, St)
+	    end
     end;
-check_liveness(R, [{bs_append,_,Bits,_,_,_,Bin,_,Dst}|Is], St) ->
+check_liveness(R, [{bs_init,_,_,Live,Ss,Dst}|Is], St) ->
     case R of
-	Bits -> {used,St};
-	Bin -> {used,St};
-	Dst -> {killed,St};
-	_ -> check_liveness(R, Is, St)
-    end;
-check_liveness(R, [{bs_init2,_,_,_,_,_,Dst}|Is], St) ->
-    if
-	R =:= Dst -> {killed,St};
-	true -> check_liveness(R, Is, St)
-    end;
-check_liveness(R, [{bs_init_bits,_,_,_,_,_,Dst}|Is], St) ->
-    if
-	R =:= Dst -> {killed,St};
-	true -> check_liveness(R, Is, St)
+	{x,X} ->
+	    case X < Live orelse member(R, Ss) of
+		true -> {used,St};
+		false -> {killed,St}
+	    end;
+	{y,_} ->
+	    case member(R, Ss) of
+		true -> {used,St};
+		false ->
+		    if
+			R =:= Dst -> {killed,St};
+			true -> check_liveness(R, Is, St)
+		    end
+	    end
     end;
-check_liveness(R, [{bs_put_string,_,_}|Is], St) ->
-    check_liveness(R, Is, St);
 check_liveness(R, [{deallocate,_}|Is], St) ->
     case R of
 	{y,_} -> {killed,St};
@@ -339,29 +323,20 @@ check_liveness(R, [{deallocate,_}|Is], St) ->
     end;
 check_liveness(R, [return|_], St) ->
     check_liveness_live_ret(R, 1, St);
-check_liveness(R, [{call_last,Live,_,_}|_], St) ->
-    check_liveness_live_ret(R, Live, St);
-check_liveness(R, [{call_only,Live,_}|_], St) ->
-    check_liveness_live_ret(R, Live, St);
-check_liveness(R, [{call_ext_last,Live,_,_}|_], St) ->
-    check_liveness_live_ret(R, Live, St);
-check_liveness(R, [{call_ext_only,Live,_}|_], St) ->
-    check_liveness_live_ret(R, Live, St);
 check_liveness(R, [{call,Live,_}|Is], St) ->
     case R of
 	{x,X} when X < Live -> {used,St};
 	{x,_} -> {killed,St};
 	{y,_} -> check_liveness(R, Is, St)
     end;
-check_liveness(R, [{call_ext,Live,Func}|Is], St) ->
+check_liveness(R, [{call_ext,Live,_}=I|Is], St) ->
     case R of
 	{x,X} when X < Live ->
 	    {used,St};
 	{x,_} ->
 	    {killed,St};
 	{y,_} ->
-	    {extfunc,Mod,Name,Arity} = Func,
-	    case erl_bifs:is_exit_bif(Mod, Name, Arity) of
+	    case beam_jump:is_exit_instruction(I) of
 		false ->
 		    check_liveness(R, Is, St);
 		true ->
@@ -387,14 +362,6 @@ check_liveness(R, [{apply,Args}|Is], St) ->
 	{x,_} -> {killed,St};
 	{y,_} -> check_liveness(R, Is, St)
     end;
-check_liveness(R, [{apply_last,Args,_}|_], St) ->
-    check_liveness_live_ret(R, Args+2, St);
-check_liveness(R, [send|Is], St) ->
-    case R of
-	{x,X} when X < 2 -> {used,St};
-	{x,_} -> {killed,St};
-	{y,_} -> check_liveness(R, Is, St)
-    end;
 check_liveness({x,R}, [{'%live',Live}|Is], St) ->
     if
 	R < Live -> check_liveness(R, Is, St);
@@ -429,25 +396,9 @@ check_liveness(R, [{gc_bif,Op,{f,Fail},Live,Ss,D}|Is], St0) ->
 		    Other
 	    end
     end;
-check_liveness(R, [{bs_add,{f,0},Ss,D}|Is], St) ->
+check_liveness(R, [{bs_put,{f,0},_,Ss}|Is], St) ->
     case member(R, Ss) of
 	true -> {used,St};
-	false when R =:= D -> {killed,St};
-	false -> check_liveness(R, Is, St)
-    end;
-check_liveness(R, [{bs_put_binary,{f,0},Sz,_,_,Src}|Is], St) ->
-    case member(R, [Sz,Src]) of
-	true -> {used,St};
-	false -> check_liveness(R, Is, St)
-    end;
-check_liveness(R, [{bs_put_integer,{f,0},Sz,_,_,Src}|Is], St) ->
-    case member(R, [Sz,Src]) of
-	true -> {used,St};
-	false -> check_liveness(R, Is, St)
-    end;
-check_liveness(R, [{bs_put_float,{f,0},Sz,_,_,Src}|Is], St) ->
-    case member(R, [Sz,Src]) of
-	true -> {used,St};
 	false -> check_liveness(R, Is, St)
     end;
 check_liveness(R, [{bs_restore2,S,_}|Is], St) ->
@@ -472,6 +423,16 @@ check_liveness(R, [{make_fun2,_,_,_,NumFree}|Is], St) ->
 	{x,_} -> {killed,St};
 	_ -> check_liveness(R, Is, St)
     end;
+check_liveness({x,_}=R, [{'catch',_,_}|Is], St) ->
+    %% All x registers will be killed if an exception occurs.
+    %% Therefore we only need to check the liveness for the
+    %% instructions following the catch instruction.
+    check_liveness(R, Is, St);
+check_liveness({x,_}=R, [{'try',_,_}|Is], St) ->
+    %% All x registers will be killed if an exception occurs.
+    %% Therefore we only need to check the liveness for the
+    %% instructions inside the 'try' block.
+    check_liveness(R, Is, St);
 check_liveness(R, [{try_end,Y}|Is], St) ->
     case R of
 	Y ->
@@ -602,26 +563,50 @@ check_killed_block(_, []) -> transparent.
 %%  
 %%    (Unknown instructions will cause an exception.)
 
-check_used_block({x,X}=R, [{set,_,_,{alloc,Live,_}}|Is]) ->
+check_used_block_fun(D) ->
+    fun(R, Is) -> check_used_block(R, Is, D) end.
+
+check_used_block({x,X}=R, [{set,Ds,Ss,{alloc,Live,Op}}|Is], D) ->
     if 
 	X >= Live -> killed;
-	true -> check_used_block(R, Is)
+	true ->
+	    case member(R, Ss) orelse
+		is_reg_used_at(R, Op, D) of
+		true -> used;
+		false ->
+		    case member(R, Ds) of
+			true -> killed;
+			false -> check_used_block(R, Is, D)
+		    end
+	    end
     end;
-check_used_block(R, [{set,Ds,Ss,_Op}|Is]) ->
-    case member(R, Ss) of
+check_used_block(R, [{set,Ds,Ss,Op}|Is], D) ->
+    case member(R, Ss) orelse
+	is_reg_used_at(R, Op, D) of
 	true -> used;
 	false ->
 	    case member(R, Ds) of
 		true -> killed;
-		false -> check_used_block(R, Is)
+		false -> check_used_block(R, Is, D)
 	    end
     end;
-check_used_block(R, [{'%live',Live}|Is]) ->
+check_used_block(R, [{'%live',Live}|Is], D) ->
     case R of
 	{x,X} when X >= Live -> killed;
-	_ -> check_used_block(R, Is)
+	_ -> check_used_block(R, Is, D)
     end;
-check_used_block(_, []) -> transparent.
+check_used_block(_, [], _) -> transparent.
+
+is_reg_used_at(R, {gc_bif,_,{f,Lbl}}, D) ->
+    is_reg_used_at_1(R, Lbl, D);
+is_reg_used_at(R, {bif,_,{f,Lbl}}, D) ->
+    is_reg_used_at_1(R, Lbl, D);
+is_reg_used_at(_, _, _) -> false.
+
+is_reg_used_at_1(_, 0, _) ->
+    false;
+is_reg_used_at_1(R, Lbl, D) ->
+    not is_not_used_at(R, Lbl, D).
 
 index_labels_1([{label,Lbl}|Is0], Acc) ->
     Is = lists:dropwhile(fun({label,_}) -> true;
@@ -654,49 +639,21 @@ combine_alloc_lists_1([]) -> [].
 live_opt([{bs_context_to_binary,Src}=I|Is], Regs0, D, Acc) ->
     Regs = x_live([Src], Regs0),
     live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_add,Fail,[Src1,Src2,_],Dst}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src1,Src2], x_dead([Dst], Regs0)),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_init2,Fail,_,_,Live,_,_}=I|Is], _, D, Acc) ->
-    Regs1 = live_call(Live),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_init_bits,Fail,Src1,_,Live,_,Src2}=I|Is], _, D, Acc) ->
-    Regs1 = live_call(Live),
-    Regs2 = x_live([Src1,Src2], Regs1),
-    Regs = live_join_label(Fail, D, Regs2),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_append,Fail,Src1,_,Live,_,Src2,_,Dst}=I|Is], _Regs0, D, Acc) ->
-    Regs1 = x_dead([Dst], x_live([Src1,Src2], live_call(Live))),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_private_append,Fail,Src1,_,Src2,_,Dst}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src1,Src2], x_dead([Dst], Regs0)),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_put_binary,Fail,Src1,_,_,Src2}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src1,Src2], Regs0),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_put_float,Fail,Src1,_,_,Src2}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src1,Src2], Regs0),
+live_opt([{bs_init,Fail,_,none,Ss,Dst}=I|Is], Regs0, D, Acc) ->
+    Regs1 = x_live(Ss, x_dead([Dst], Regs0)),
     Regs = live_join_label(Fail, D, Regs1),
     live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_put_integer,Fail,Src1,_,_,Src2}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src1,Src2], Regs0),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_put_utf8,Fail,_,Src}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src], Regs0),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_put_utf16,Fail,_,Src}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src], Regs0),
-    Regs = live_join_label(Fail, D, Regs1),
+live_opt([{bs_init,Fail,Info,Live0,Ss,Dst}|Is], Regs0, D, Acc) ->
+    Regs1 = x_dead([Dst], Regs0),
+    Live = live_regs(Regs1),
+    true = Live =< Live0,	%Assertion.
+    Regs2 = live_call(Live),
+    Regs3 = x_live(Ss, Regs2),
+    Regs = live_join_label(Fail, D, Regs3),
+    I = {bs_init,Fail,Info,Live,Ss,Dst},
     live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_put_utf32,Fail,_,Src}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src], Regs0),
+live_opt([{bs_put,Fail,_,Ss}=I|Is], Regs0, D, Acc) ->
+    Regs1 = x_live(Ss, Regs0),
     Regs = live_join_label(Fail, D, Regs1),
     live_opt(Is, Regs, D, [I|Acc]);
 live_opt([{bs_restore2,Src,_}=I|Is], Regs0, D, Acc) ->
@@ -705,14 +662,6 @@ live_opt([{bs_restore2,Src,_}=I|Is], Regs0, D, Acc) ->
 live_opt([{bs_save2,Src,_}=I|Is], Regs0, D, Acc) ->
     Regs = x_live([Src], Regs0),
     live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_utf8_size,Fail,Src,Dst}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src], x_dead([Dst], Regs0)),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{bs_utf16_size,Fail,Src,Dst}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src], x_dead([Dst], Regs0)),
-    Regs = live_join_label(Fail, D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
 live_opt([{test,bs_start_match2,Fail,Live,[Src,_],_}=I|Is], _, D, Acc) ->
     Regs0 = live_call(Live),
     Regs1 = x_live([Src], Regs0),
@@ -747,30 +696,16 @@ live_opt([{try_case_end,Src}=I|Is], _, D, Acc) ->
 live_opt([if_end=I|Is], _, D, Acc) ->
     Regs = 0,
     live_opt(Is, Regs, D, [I|Acc]);
-live_opt([bs_init_writable=I|Is], _, D, Acc) ->
-    live_opt(Is, live_call(1), D, [I|Acc]);
 live_opt([{call,Arity,_}=I|Is], _, D, Acc) ->
     live_opt(Is, live_call(Arity), D, [I|Acc]);
 live_opt([{call_ext,Arity,_}=I|Is], _, D, Acc) ->
     live_opt(Is, live_call(Arity), D, [I|Acc]);
 live_opt([{call_fun,Arity}=I|Is], _, D, Acc) ->
     live_opt(Is, live_call(Arity+1), D, [I|Acc]);
-live_opt([{call_last,Arity,_,_}=I|Is], _, D, Acc) ->
-    live_opt(Is, live_call(Arity), D, [I|Acc]);
-live_opt([{call_ext_last,Arity,_,_}=I|Is], _, D, Acc) ->
-    live_opt(Is, live_call(Arity), D, [I|Acc]);
 live_opt([{apply,Arity}=I|Is], _, D, Acc) ->
     live_opt(Is, live_call(Arity+2), D, [I|Acc]);
-live_opt([{apply_last,Arity,_}=I|Is], _, D, Acc) ->
-    live_opt(Is, live_call(Arity+2), D, [I|Acc]);
-live_opt([{call_only,Arity,_}=I|Is], _, D, Acc) ->
-    live_opt(Is, live_call(Arity), D, [I|Acc]);
-live_opt([{call_ext_only,Arity,_}=I|Is], _, D, Acc) ->
-    live_opt(Is, live_call(Arity), D, [I|Acc]);
 live_opt([{make_fun2,_,_,_,Arity}=I|Is], _, D, Acc) ->
     live_opt(Is, live_call(Arity), D, [I|Acc]);
-live_opt([send=I|Is], _, D, Acc) ->
-    live_opt(Is, live_call(2), D, [I|Acc]);
 live_opt([{test,_,Fail,Ss}=I|Is], Regs0, D, Acc) ->
     Regs1 = x_live(Ss, Regs0),
     Regs = live_join_label(Fail, D, Regs1),
@@ -780,16 +715,14 @@ live_opt([{test,_,Fail,Live,Ss,_}=I|Is], _, D, Acc) ->
     Regs1 = x_live(Ss, Regs0),
     Regs = live_join_label(Fail, D, Regs1),
     live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{select_val,Src,Fail,{list,List}}=I|Is], Regs0, D, Acc) ->
+live_opt([{select,_,Src,Fail,List}=I|Is], Regs0, D, Acc) ->
     Regs1 = x_live([Src], Regs0),
     Regs = live_join_labels([Fail|List], D, Regs1),
     live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{select_tuple_arity,Src,Fail,{list,List}}=I|Is], Regs0, D, Acc) ->
-    Regs1 = x_live([Src], Regs0),
-    Regs = live_join_labels([Fail|List], D, Regs1),
-    live_opt(Is, Regs, D, [I|Acc]);
-live_opt([{'try',_,Fail}=I|Is], Regs0, D, Acc) ->
-    Regs = live_join_label(Fail, D, Regs0),
+live_opt([{'try',_,_}=I|Is], Regs, D, Acc) ->
+    %% If an exeption happens, all x registers will be killed.
+    %% Therefore, we should only base liveness of the code inside
+    %% the try.
     live_opt(Is, Regs, D, [I|Acc]);
 live_opt([{try_case,_}=I|Is], _, D, Acc) ->
     live_opt(Is, live_call(1), D, [I|Acc]);
@@ -799,8 +732,6 @@ live_opt([timeout=I|Is], _, D, Acc) ->
     live_opt(Is, 0, D, [I|Acc]);
 
 %% Transparent instructions - they neither use nor modify x registers.
-live_opt([{bs_put_string,_,_}=I|Is], Regs, D, Acc) ->
-    live_opt(Is, Regs, D, [I|Acc]);
 live_opt([{deallocate,_}=I|Is], Regs, D, Acc) ->
     live_opt(Is, Regs, D, [I|Acc]);
 live_opt([{kill,_}=I|Is], Regs, D, Acc) ->
@@ -827,13 +758,24 @@ live_opt([{allocate_heap,_,_,Live}=I|Is], _, D, Acc) ->
 
 live_opt([], _, _, Acc) -> Acc.
 
-live_opt_block([{set,[],[],{alloc,Live,_}}=I|Is], _, D, Acc) ->
-    live_opt_block(Is, live_call(Live), D, [I|Acc]);
-live_opt_block([{set,Ds,Ss,Op}=I|Is], Regs0, D, Acc) ->
-    Regs = case Op of
-	       {alloc,Live,_} -> live_call(Live);
-	       _ -> x_live(Ss, x_dead(Ds, Regs0))
-	   end,
+live_opt_block([{set,Ds,Ss,Op}=I0|Is], Regs0, D, Acc) ->
+    Regs1 = x_live(Ss, x_dead(Ds, Regs0)),
+    {I,Regs} = case Op of
+		   {alloc,Live0,Alloc} ->
+		       %% The life-time analysis used by the code generator
+		       %% is sometimes too conservative, so it may be
+		       %% possible to lower the number of live registers
+		       %% based on the exact liveness information.
+		       %% The main benefit is that more optimizations that
+		       %% depend on liveness information (such as the
+		       %% beam_bool and beam_dead passes) may be applied.
+		       Live = live_regs(Regs1),
+		       true = Live =< Live0,	%Assertion.
+		       I1 = {set,Ds,Ss,{alloc,Live,Alloc}},
+		       {I1,live_call(Live)};
+		   _ ->
+		       {I0,Regs1}
+	       end,
     case Ds of
 	[{x,X}] ->
 	    case (not is_live(X, Regs0)) andalso Op =:= move of
diff --git a/lib/compiler/src/beam_z.erl b/lib/compiler/src/beam_z.erl
new file mode 100644
index 0000000000..8c6b0c916d
--- /dev/null
+++ b/lib/compiler/src/beam_z.erl
@@ -0,0 +1,79 @@
+%%
+%% %CopyrightBegin%
+%%
+%% Copyright Ericsson AB 2012. All Rights Reserved.
+%%
+%% The contents of this file are subject to the Erlang Public License,
+%% Version 1.1, (the "License"); you may not use this file except in
+%% compliance with the License. You should have received a copy of the
+%% Erlang Public License along with this software. If not, it can be
+%% retrieved online at http://www.erlang.org/.
+%%
+%% Software distributed under the License is distributed on an "AS IS"
+%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+%% the License for the specific language governing rights and limitations
+%% under the License.
+%%
+%% %CopyrightEnd%
+%%
+%% Purpose: Run right before beam_asm to do any final fix-ups or clean-ups.
+%%          (Mandatory.)
+
+-module(beam_z).
+
+-export([module/2]).
+
+module({Mod,Exp,Attr,Fs0,Lc}, _Opt) ->
+    Fs = [function(F) || F <- Fs0],
+    {ok,{Mod,Exp,Attr,Fs,Lc}}.
+
+function({function,Name,Arity,CLabel,Is0}) ->
+    try
+	Is = undo_renames(Is0),
+	{function,Name,Arity,CLabel,Is}
+    catch
+	Class:Error ->
+	    Stack = erlang:get_stacktrace(),
+	    io:fwrite("Function: ~w/~w\n", [Name,Arity]),
+	    erlang:raise(Class, Error, Stack)
+    end.
+
+undo_renames([{call_ext,2,send}|Is]) ->
+    [send|undo_renames(Is)];
+undo_renames([{apply,A},{deallocate,N},return|Is]) ->
+    [{apply_last,A,N}|undo_renames(Is)];
+undo_renames([{call,A,F},{deallocate,N},return|Is]) ->
+    [{call_last,A,F,N}|undo_renames(Is)];
+undo_renames([{call_ext,A,F},{deallocate,N},return|Is]) ->
+    [{call_ext_last,A,F,N}|undo_renames(Is)];
+undo_renames([{call,A,F},return|Is]) ->
+    [{call_only,A,F}|undo_renames(Is)];
+undo_renames([{call_ext,A,F},return|Is]) ->
+    [{call_ext_only,A,F}|undo_renames(Is)];
+undo_renames([I|Is]) ->
+    [undo_rename(I)|undo_renames(Is)];
+undo_renames([]) -> [].
+
+undo_rename({bs_put,F,{I,U,Fl},[Sz,Src]}) ->
+    {I,F,Sz,U,Fl,Src};
+undo_rename({bs_put,F,{I,Fl},[Src]}) ->
+    {I,F,Fl,Src};
+undo_rename({bs_put,{f,0},{bs_put_string,_,_}=I,[]}) ->
+    I;
+undo_rename({bif,bs_add=I,F,[Src1,Src2,{integer,U}],Dst}) ->
+    {I,F,[Src1,Src2,U],Dst};
+undo_rename({bif,bs_utf8_size=I,F,[Src],Dst}) ->
+    {I,F,Src,Dst};
+undo_rename({bif,bs_utf16_size=I,F,[Src],Dst}) ->
+    {I,F,Src,Dst};
+undo_rename({bs_init,F,{I,U,Flags},none,[Sz,Src],Dst}) ->
+    {I,F,Sz,U,Src,Flags,Dst};
+undo_rename({bs_init,F,{I,Extra,Flags},Live,[Sz],Dst}) ->
+    {I,F,Sz,Extra,Live,Flags,Dst};
+undo_rename({bs_init,F,{I,Extra,U,Flags},Live,[Sz,Src],Dst}) ->
+    {I,F,Sz,Extra,Live,U,Src,Flags,Dst};
+undo_rename({bs_init,_,bs_init_writable=I,_,_,_}) ->
+    I;
+undo_rename({select,I,Reg,Fail,List}) ->
+    {I,Reg,Fail,{list,List}};
+undo_rename(I) -> I.
diff --git a/lib/compiler/src/compile.erl b/lib/compiler/src/compile.erl
index 0a368df5d6..df1af36eeb 100644
--- a/lib/compiler/src/compile.erl
+++ b/lib/compiler/src/compile.erl
@@ -224,6 +224,8 @@ format_error({delete_temp,File,Error}) ->
 		  [File,file:format_error(Error)]);
 format_error({parse_transform,M,R}) ->
     io_lib:format("error in parse transform '~s': ~p", [M, R]);
+format_error({undef_parse_transform,M}) ->
+    io_lib:format("undefined parse transform '~s'", [M]);
 format_error({core_transform,M,R}) ->
     io_lib:format("error in core transform '~s': ~p", [M, R]);
 format_error({crash,Pass,Reason}) ->
@@ -551,12 +553,12 @@ select_list_passes_1([{iff,Flag,{done,Ext}}|Ps], Opts, Acc) ->
     end;
 select_list_passes_1([{iff=Op,Flag,List0}|Ps], Opts, Acc) when is_list(List0) ->
     case select_list_passes(List0, Opts) of
-	{done,_}=Done -> Done;
+	{done,List} -> {done,reverse(Acc) ++ List};
 	{not_done,List} -> select_list_passes_1(Ps, Opts, [{Op,Flag,List}|Acc])
     end;
 select_list_passes_1([{unless=Op,Flag,List0}|Ps], Opts, Acc) when is_list(List0) ->
     case select_list_passes(List0, Opts) of
-	{done,_}=Done -> Done;
+	{done,List} -> {done,reverse(Acc) ++ List};
 	{not_done,List} -> select_list_passes_1(Ps, Opts, [{Op,Flag,List}|Acc])
     end;
 select_list_passes_1([P|Ps], Opts, Acc) ->
@@ -630,7 +632,8 @@ kernel_passes() ->
 asm_passes() ->
     %% Assembly level optimisations.
     [{delay,
-      [{unless,no_postopt,
+      [{pass,beam_a},
+       {unless,no_postopt,
 	[{pass,beam_block},
 	 {iff,dblk,{listing,"block"}},
 	 {unless,no_except,{pass,beam_except}},
@@ -657,13 +660,11 @@ asm_passes() ->
 	 {iff,dtrim,{listing,"trim"}},
 	 {pass,beam_flatten}]},
 
-       %% If post optimizations are turned off, we still coalesce
-       %% adjacent labels and remove unused labels to keep the
-       %% HiPE compiler happy.
-       {iff,no_postopt,
-	[?pass(beam_unused_labels),
-	 {pass,beam_clean}]},
+       %% If post optimizations are turned off, we still
+       %% need to do a few clean-ups to code.
+       {iff,no_postopt,[{pass,beam_clean}]},
 
+       {pass,beam_z},
        {iff,dopt,{listing,"optimize"}},
        {iff,'S',{listing,"S"}},
        {iff,'to_asm',{done,"S"}}]},
@@ -850,6 +851,10 @@ foldl_transform(St, [T|Ts]) ->
 	{error,Es,Ws} ->
 	    {error,St#compile{warnings=St#compile.warnings ++ Ws,
 			      errors=St#compile.errors ++ Es}};
+	{'EXIT',{undef,_}} ->
+	    Es = [{St#compile.ifile,[{none,compile,
+				      {undef_parse_transform,T}}]}],
+	    {error,St#compile{errors=St#compile.errors ++ Es}};
 	{'EXIT',R} ->
 	    Es = [{St#compile.ifile,[{none,compile,{parse_transform,T,R}}]}],
 	    {error,St#compile{errors=St#compile.errors ++ Es}};
@@ -1236,10 +1241,6 @@ random_bytes_1(N, Acc) -> random_bytes_1(N-1, [random:uniform(255)|Acc]).
 save_core_code(St) ->
     {ok,St#compile{core_code=cerl:from_records(St#compile.code)}}.
 
-beam_unused_labels(#compile{code=Code0}=St) ->
-    Code = beam_jump:module_labels(Code0),
-    {ok,St#compile{code=Code}}.
-
 beam_asm(#compile{ifile=File,code=Code0,
 		  abstract_code=Abst,mod_options=Opts0}=St) ->
     Source = filename:absname(File),
diff --git a/lib/compiler/src/compiler.app.src b/lib/compiler/src/compiler.app.src
index 1133882728..94c78e68f9 100644
--- a/lib/compiler/src/compiler.app.src
+++ b/lib/compiler/src/compiler.app.src
@@ -20,6 +20,7 @@
  [{description, "ERTS  CXC 138 10"},
   {vsn, "%VSN%"},
   {modules, [
+	     beam_a,
 	     beam_asm,
 	     beam_block,
 	     beam_bool,
@@ -40,6 +41,7 @@
 	     beam_type,
 	     beam_utils,
 	     beam_validator,
+	     beam_z,
 	     cerl,
 	     cerl_clauses,
 	     cerl_inline,
diff --git a/lib/compiler/src/sys_pre_expand.erl b/lib/compiler/src/sys_pre_expand.erl
index 97d3ff626c..e55fb2a037 100644
--- a/lib/compiler/src/sys_pre_expand.erl
+++ b/lib/compiler/src/sys_pre_expand.erl
@@ -454,9 +454,6 @@ expr({call,Line,{remote,Lr,M,F},As0}, St0) ->
     M1 = expand_package(M, St0),
     {[M2,F1|As1],St1} = expr_list([M1,F|As0], St0),
     {{call,Line,{remote,Lr,M2,F1},As1},St1};
-expr({call,Line,{tuple,_,[{atom,_,_}=M,{atom,_,_}=F]},As}, St) ->
-    %% Rewrite {Mod,Function}(Args...) to Mod:Function(Args...).
-    expr({call,Line,{remote,Line,M,F},As}, St);
 expr({call,Line,F,As0}, St0) ->
     {[Fun1|As1],St1} = expr_list([F|As0], St0),
     {{call,Line,Fun1,As1},St1};
diff --git a/lib/compiler/src/v3_codegen.erl b/lib/compiler/src/v3_codegen.erl
index 812e85553f..6a13495523 100644
--- a/lib/compiler/src/v3_codegen.erl
+++ b/lib/compiler/src/v3_codegen.erl
@@ -123,15 +123,24 @@ cg_fun(Les, Hvs, Vdb, AtomMod, NameArity, Anno, St0) ->
 					   put_reg(V, Reg)
 				   end, [], Hvs),
 			 stk=[]}, 0, Vdb),
-    {B,_Aft,St} = cg_list(Les, 0, Vdb, Bef,
+    {B0,_Aft,St} = cg_list(Les, 0, Vdb, Bef,
 			  St3#cg{bfail=0,
 				 ultimate_failure=UltimateMatchFail,
 				 is_top_block=true}),
+    B = fix_bs_match_strings(B0),
     {Name,Arity} = NameArity,
     Asm = [{label,Fi},line(Anno),{func_info,AtomMod,{atom,Name},Arity},
 	   {label,Fl}|B++[{label,UltimateMatchFail},if_end]],
     {Asm,Fl,St}.
 
+fix_bs_match_strings([{test,bs_match_string,F,[Ctx,BinList]}|Is])
+  when is_list(BinList) ->
+    I = {test,bs_match_string,F,[Ctx,list_to_bitstring(BinList)]},
+    [I|fix_bs_match_strings(Is)];
+fix_bs_match_strings([I|Is]) ->
+    [I|fix_bs_match_strings(Is)];
+fix_bs_match_strings([]) -> [].
+
 %% cg(Lkexpr, Vdb, StackReg, State) -> {[Ainstr],StackReg,State}.
 %%  Generate code for a kexpr.
 %%  Split function into two steps for clarity, not efficiency.
@@ -714,7 +723,22 @@ select_bin_seg(#l{ke={val_clause,{bin_int,Ctx,Sz,U,Fs,Val,Es},B},i=I,vdb=Vdb},
 				       I, Vdb, Bef, Ctx, St0),
     {Bis,Aft,St2} = match_cg(B, Fail, Int, St1),
     CtxReg = fetch_var(Ctx, Bef),
-    {[{bs_restore2,CtxReg,{Ctx,Ivar}}|Mis] ++ Bis,Aft,St2}.
+    Is = case Mis ++ Bis of
+	     [{test,bs_match_string,F,[OtherCtx,Bin1]},
+	      {bs_save2,OtherCtx,_},
+	      {bs_restore2,OtherCtx,_},
+	      {test,bs_match_string,F,[OtherCtx,Bin2]}|Is0] ->
+		 %% We used to do this optimization later, but it
+		 %% turns out that in huge functions with many
+		 %% bs_match_string instructions, it's a big win
+		 %% to do the combination now. To avoid copying the
+		 %% binary data again and again, we'll combine bitstrings
+		 %% in a list and convert all of it to a bitstring later.
+		 [{test,bs_match_string,F,[OtherCtx,[Bin1,Bin2]]}|Is0];
+	     Is0 ->
+		 Is0
+	 end,
+    {[{bs_restore2,CtxReg,{Ctx,Ivar}}|Is],Aft,St2}.
 
 select_extract_int([{var,Tl}], Val, {integer,Sz}, U, Fs, Vf,
 		   I, Vdb, Bef, Ctx, St) ->
@@ -1386,22 +1410,32 @@ catch_cg(C, {var,R}, Le, Vdb, Bef, St0) ->
 %%
 %%  put_list for constructing a cons is an atomic instruction
 %%  which can safely resuse one of the source registers as target.
-%%  Also binaries can reuse a source register as target.
 
 set_cg([{var,R}], {cons,Es}, Le, Vdb, Bef, St) ->
-    [S1,S2] = map(fun ({var,V}) -> fetch_var(V, Bef);
-		      (Other) -> Other
-		  end, Es),
+    [S1,S2] = cg_reg_args(Es, Bef),
     Int0 = clear_dead(Bef, Le#l.i, Vdb),
     Int1 = Int0#sr{reg=put_reg(R, Int0#sr.reg)},
     Ret = fetch_reg(R, Int1#sr.reg),
     {[{put_list,S1,S2,Ret}], Int1, St};
 set_cg([{var,R}], {binary,Segs}, Le, Vdb, Bef,
        #cg{in_catch=InCatch, bfail=Bfail}=St) ->
+    %% At run-time, binaries are constructed in three stages:
+    %% 1) First the size of the binary is calculated.
+    %% 2) Then the binary is allocated.
+    %% 3) Then each field in the binary is constructed.
+    %% For simplicity, we use the target register to also hold the
+    %% size of the binary. Therefore the target register must *not*
+    %% be one of the source registers.
+
+    %% First allocate the target register.
     Int0 = Bef#sr{reg=put_reg(R, Bef#sr.reg)},
     Target = fetch_reg(R, Int0#sr.reg),
-    Fail = {f,Bfail},
+
+    %% Also allocate a scratch register for size calculations.
     Temp = find_scratch_reg(Int0#sr.reg),
+
+    %% First generate the code that constructs each field.
+    Fail = {f,Bfail},
     PutCode = cg_bin_put(Segs, Fail, Bef),
     {Sis,Int1} =
 	case InCatch of
@@ -1410,6 +1444,8 @@ set_cg([{var,R}], {binary,Segs}, Le, Vdb, Bef,
 	end,
     MaxRegs = max_reg(Bef#sr.reg),
     Aft = clear_dead(Int1, Le#l.i, Vdb),
+
+    %% Now generate the complete code for constructing the binary.
     Code = cg_binary(PutCode, Target, Temp, Fail, MaxRegs, Le#l.a),
     {Sis++Code,Aft,St};
 set_cg([{var,R}], Con, Le, Vdb, Bef, St) ->
@@ -1419,10 +1455,8 @@ set_cg([{var,R}], Con, Le, Vdb, Bef, St) ->
     Ais = case Con of
 	      {tuple,Es} ->
 		  [{put_tuple,length(Es),Ret}] ++ cg_build_args(Es, Bef);
-	      {var,V} ->	  % Normally removed by kernel optimizer.
-		  [{move,fetch_var(V, Int),Ret}];
 	      Other ->
-		  [{move,Other,Ret}]
+		  [{move,cg_reg_arg(Other, Int),Ret}]
 	  end,
     {Ais,clear_dead(Int, Le#l.i, Vdb),St}.
 
@@ -1576,8 +1610,7 @@ cg_gen_binsize([], _, _, _, _, Acc) -> Acc.
 %% cg_bin_opt(Code0) -> Code
 %%  Optimize the size calculations for binary construction.
 
-cg_bin_opt([{move,Size,D},{bs_append,Fail,D,Extra,Regs0,U,Bin,Flags,D}|Is]) ->
-    Regs = cg_bo_newregs(Regs0, D),
+cg_bin_opt([{move,Size,D},{bs_append,Fail,D,Extra,Regs,U,Bin,Flags,D}|Is]) ->
     cg_bin_opt([{bs_append,Fail,Size,Extra,Regs,U,Bin,Flags,D}|Is]);
 cg_bin_opt([{move,Size,D},{bs_private_append,Fail,D,U,Bin,Flags,D}|Is]) ->
     cg_bin_opt([{bs_private_append,Fail,Size,U,Bin,Flags,D}|Is]);
@@ -1585,9 +1618,8 @@ cg_bin_opt([{move,{integer,0},D},{bs_add,_,[D,{integer,_}=S,1],Dst}|Is]) ->
     cg_bin_opt([{move,S,Dst}|Is]);
 cg_bin_opt([{move,{integer,0},D},{bs_add,Fail,[D,S,U],Dst}|Is]) ->
     cg_bin_opt([{bs_add,Fail,[{integer,0},S,U],Dst}|Is]);
-cg_bin_opt([{move,{integer,Bytes},D},{Op,Fail,D,Extra,Regs0,Flags,D}|Is])
+cg_bin_opt([{move,{integer,Bytes},D},{Op,Fail,D,Extra,Regs,Flags,D}|Is])
   when Op =:= bs_init2; Op =:= bs_init_bits ->
-    Regs = cg_bo_newregs(Regs0, D),
     cg_bin_opt([{Op,Fail,Bytes,Extra,Regs,Flags,D}|Is]);
 cg_bin_opt([{move,Src1,Dst},{bs_add,Fail,[Dst,Src2,U],Dst}|Is]) ->
     cg_bin_opt([{bs_add,Fail,[Src1,Src2,U],Dst}|Is]);
@@ -1595,20 +1627,9 @@ cg_bin_opt([I|Is]) ->
     [I|cg_bin_opt(Is)];
 cg_bin_opt([]) -> [].
 
-cg_bo_newregs(R, {x,X}) when R-1 =:= X -> R-1;
-cg_bo_newregs(R, _) -> R.
-
-%% Common for new and old binary code generation.
-
 cg_bin_put({bin_seg,[],S0,U,T,Fs,[E0,Next]}, Fail, Bef) ->
-    S1 = case S0 of
-	     {var,Sv} -> fetch_var(Sv, Bef);
-	     _ -> S0
-	 end,
-    E1 = case E0 of
-	     {var,V} -> fetch_var(V, Bef);
-	     Other ->   Other
-	 end,
+    S1 = cg_reg_arg(S0, Bef),
+    E1 = cg_reg_arg(E0, Bef),
     {Format,Op} = case T of
 		      integer -> {plain,bs_put_integer};
 		      utf8 ->    {utf,bs_put_utf8};
@@ -1626,9 +1647,7 @@ cg_bin_put({bin_seg,[],S0,U,T,Fs,[E0,Next]}, Fail, Bef) ->
 cg_bin_put({bin_end,[]}, _, _) -> [].
 
 cg_build_args(As, Bef) ->
-    map(fun ({var,V}) -> {put,fetch_var(V, Bef)};
-	    (Other) -> {put,Other}
-	end, As).
+    [{put,cg_reg_arg(A, Bef)} || A <- As].
 
 %% return_cg([Val], Le, Vdb, Bef, St) -> {[Ainstr],Aft,St}.
 %% break_cg([Val], Le, Vdb, Bef, St) -> {[Ainstr],Aft,St}.
@@ -1907,27 +1926,13 @@ fetch_var(V, Sr) ->
 	error -> fetch_stack(V, Sr#sr.stk)
     end.
 
-% find_var(V, Sr) ->
-%     case find_reg(V, Sr#sr.reg) of
-% 	{ok,R} -> {ok,R};
-% 	error ->
-% 	    case find_stack(V, Sr#sr.stk) of
-% 		{ok,S} -> {ok,S};
-% 		error -> error
-% 	    end
-%     end.
-
 load_vars(Vs, Regs) ->
     foldl(fun ({var,V}, Rs) -> put_reg(V, Rs) end, Regs, Vs).
 
 %% put_reg(Val, Regs) -> Regs.
-%% free_reg(Val, Regs) -> Regs.
 %% find_reg(Val, Regs) -> ok{r{R}} | error.
 %% fetch_reg(Val, Regs) -> r{R}.
 %%  Functions to interface the registers.
-%%  put_reg puts a value into a free register,
-%%  load_reg loads a value into a fixed register
-%%  free_reg frees a register containing a specific value.
 
 % put_regs(Vs, Rs) -> foldl(fun put_reg/2, Rs, Vs).
 
@@ -1938,10 +1943,6 @@ put_reg_1(V, [{reserved,I,V}|Rs], I) -> [{I,V}|Rs];
 put_reg_1(V, [R|Rs], I) -> [R|put_reg_1(V, Rs, I+1)];
 put_reg_1(V, [], I) -> [{I,V}].
 
-% free_reg(V, [{I,V}|Rs]) -> [free|Rs];
-% free_reg(V, [R|Rs]) -> [R|free_reg(V, Rs)];
-% free_reg(V, []) -> [].
-
 fetch_reg(V, [{I,V}|_]) -> {x,I};
 fetch_reg(V, [_|SRs]) -> fetch_reg(V, SRs).
 
@@ -1958,9 +1959,6 @@ find_scratch_reg([free|_], I) -> {x,I};
 find_scratch_reg([_|Rs], I) -> find_scratch_reg(Rs, I+1);
 find_scratch_reg([], I) -> {x,I}.
 
-%%copy_reg(Val, R, Regs) -> load_reg(Val, R, Regs).
-%%move_reg(Val, R, Regs) -> load_reg(Val, R, free_reg(Val, Regs)).
-
 replace_reg_contents(Old, New, [{I,Old}|Rs]) -> [{I,New}|Rs];
 replace_reg_contents(Old, New, [R|Rs]) -> [R|replace_reg_contents(Old, New, Rs)].
 
diff --git a/lib/compiler/src/v3_kernel.erl b/lib/compiler/src/v3_kernel.erl
index b184987625..8ef71e1346 100644
--- a/lib/compiler/src/v3_kernel.erl
+++ b/lib/compiler/src/v3_kernel.erl
@@ -81,7 +81,7 @@
 -export([module/2,format_error/1]).
 
 -import(lists, [map/2,foldl/3,foldr/3,mapfoldl/3,splitwith/2,member/2,
-		keymember/3,keyfind/3]).
+		keymember/3,keyfind/3,partition/2]).
 -import(ordsets, [add_element/2,del_element/2,union/2,union/1,subtract/2]).
 -import(cerl, [c_tuple/1]).
 
@@ -1081,9 +1081,44 @@ select_bin_con(Cs0) ->
 		       end, Cs0),
     select_bin_con_1(Cs1).
 
+
 select_bin_con_1(Cs) ->
     try
-	select_bin_int(Cs)
+	%% The usual way to match literals is to first extract the
+	%% value to a register, and then compare the register to the
+	%% literal value. Extracting the value is good if we need
+	%% compare it more than once.
+	%%
+	%% But we would like to combine the extracting and the
+	%% comparing into a single instruction if we know that
+	%% a binary segment must contain specific integer value
+	%% or the matching will fail, like in this example:
+	%%
+	%% <<42:8,...>> ->
+	%% <<42:8,...>> ->
+	%% .
+	%% .
+	%% .
+	%% <<42:8,...>> ->
+	%% <<>> ->
+	%%
+	%% The first segment must either contain the integer 42
+	%% or the binary must end for the match to succeed.
+	%%
+	%% The way we do is to replace the generic #k_bin_seg{}
+	%% record with a #k_bin_int{} record if all clauses will
+	%% select the same literal integer (except for one or more
+	%% clauses that will end the binary).
+
+	{BinSegs0,BinEnd} =
+	    partition(fun (C) ->
+			      clause_con(C) =:= k_bin_seg
+		      end, Cs),
+	BinSegs = select_bin_int(BinSegs0),
+	case BinEnd of
+	    [] -> BinSegs;
+	    [_|_] -> BinSegs ++ [{k_bin_end,BinEnd}]
+	end
     catch
 	throw:not_possible ->
 	    select_bin_con_2(Cs)
@@ -1097,7 +1132,7 @@ select_bin_con_2([]) -> [].
 
 %% select_bin_int([Clause]) -> {k_bin_int,[Clause]}
 %%  If the first pattern in each clause selects the same integer,
-%%  rewrite all clauses to use #k_bin_int{} (which will later to
+%%  rewrite all clauses to use #k_bin_int{} (which will later be
 %%  translated to a bs_match_string/4 instruction).
 %%
 %%  If it is not possible to do this rewrite, a 'not_possible'
@@ -1346,7 +1381,7 @@ clause_arg(#iclause{pats=[Arg|_]}) -> Arg.
 
 clause_con(C) -> arg_con(clause_arg(C)).
 
-clause_val(C) -> arg_val(clause_arg(C)).
+clause_val(C) -> arg_val(clause_arg(C), C).
 
 is_var_clause(C) -> clause_con(C) =:= k_var.
 
@@ -1377,7 +1412,7 @@ arg_con(Arg) ->
 	#k_var{} -> k_var
     end.
 
-arg_val(Arg) ->
+arg_val(Arg, C) ->
     case arg_arg(Arg) of
 	#k_literal{val=Lit} -> Lit;
 	#k_int{val=I} -> I;
@@ -1385,7 +1420,13 @@ arg_val(Arg) ->
 	#k_atom{val=A} -> A;
 	#k_tuple{es=Es} -> length(Es);
 	#k_bin_seg{size=S,unit=U,type=T,flags=Fs} ->
-	    {set_kanno(S, []),U,T,Fs}
+	    case S of
+		#k_var{name=V} ->
+		    #iclause{isub=Isub} = C,
+		    {#k_var{name=get_vsub(V, Isub)},U,T,Fs};
+		_ ->
+		    {set_kanno(S, []),U,T,Fs}
+	    end
     end.
 
 %% ubody_used_vars(Expr, State) -> [UsedVar]