path: root/lib/compiler/src/beam_block.erl


                   
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1999-2010. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%
%% Purpose : Partitions assembly instructions into basic blocks and
%% optimizes them.

-module(beam_block).

-export([module/2]).
-import(lists, [mapfoldl/3,reverse/1,reverse/2,foldl/3,member/2]).
-define(MAXREG, 1024).

module({Mod,Exp,Attr,Fs0,Lc0}, _Opt) ->
    {Fs,Lc} = mapfoldl(fun function/2, Lc0, Fs0),
    {ok,{Mod,Exp,Attr,Fs,Lc}}.

function({function,Name,Arity,CLabel,Is0}, Lc0) ->
    try
	%% Extra labels may thwart optimizations.
	Is1 = beam_jump:remove_unused_labels(Is0),

	%% Collect basic blocks and optimize them.
	Is2 = blockify(Is1),
	Is3 = beam_utils:live_opt(Is2),
	Is4 = opt_blocks(Is3),
	Is5 = beam_utils:delete_live_annos(Is4),

	%% Optimize bit syntax.
	{Is,Lc} = bsm_opt(Is5, Lc0),

	%% Done.
	{{function,Name,Arity,CLabel,Is},Lc}
    catch
	Class:Error ->
	    Stack = erlang:get_stacktrace(),
	    io:fwrite("Function: ~w/~w\n", [Name,Arity]),
	    erlang:raise(Class, Error, Stack)
    end.

%% blockify(Instructions0) -> Instructions
%%  Collect sequences of instructions to basic blocks.
%%  Also do some simple optimations on instructions outside the blocks.

blockify(Is) ->
    blockify(Is, []).

blockify([{loop_rec,{f,Fail},{x,0}},{loop_rec_end,_Lbl},{label,Fail}|Is], Acc) ->
    %% Useless instruction sequence.
    blockify(Is, Acc);

%% New bit syntax matching.
blockify([{bs_save2,R,Point}=I,{bs_restore2,R,Point}|Is], Acc) ->
    blockify([I|Is], Acc);
blockify([{bs_save2,R,Point}=I,{test,is_eq_exact,_,_}=Test,
	  {bs_restore2,R,Point}|Is], Acc) ->
    blockify([I,Test|Is], Acc);

%% Do other peep-hole optimizations.
blockify([{test,is_atom,{f,Fail},[Reg]}=I|
	  [{select_val,Reg,{f,Fail},
	    {list,[{atom,false},{f,_}=BrFalse,
		   {atom,true}=AtomTrue,{f,_}=BrTrue]}}|Is]=Is0],
	 [{block,Bl}|_]=Acc) ->
    case is_last_bool(Bl, Reg) of
	false ->
	    blockify(Is0, [I|Acc]);
	true ->
	    %% The last instruction is a boolean operator/guard BIF that can't fail.
	    %% We can convert the three-way branch to a two-way branch (eliminating
	    %% the reference to the failure label).
	    blockify(Is, [{jump,BrTrue},
			  {test,is_eq_exact,BrFalse,[Reg,AtomTrue]}|Acc])
    end;
blockify([{test,is_atom,{f,Fail},[Reg]}=I|
	  [{select_val,Reg,{f,Fail},
	    {list,[{atom,true}=AtomTrue,{f,_}=BrTrue,
		   {atom,false},{f,_}=BrFalse]}}|Is]=Is0],
	 [{block,Bl}|_]=Acc) ->
    case is_last_bool(Bl, Reg) of
	false ->
	    blockify(Is0, [I|Acc]);
	true ->
	    blockify(Is, [{jump,BrTrue},
			  {test,is_eq_exact,BrFalse,[Reg,AtomTrue]}|Acc])
    end;
blockify([I|Is0]=IsAll, Acc) ->
    case is_bs_put(I) of
	true ->
	    {BsPuts0,Is} = collect_bs_puts(IsAll),
	    BsPuts = opt_bs_puts(BsPuts0),
	    blockify(Is, reverse(BsPuts, Acc));
	false ->
	    case collect(I) of
		error -> blockify(Is0, [I|Acc]);
		Instr when is_tuple(Instr) ->
		    {Block,Is} = collect_block(IsAll),
		    blockify(Is, [{block,Block}|Acc])
	    end
    end;
blockify([], Acc) -> reverse(Acc).

is_last_bool([{set,[Reg],As,{bif,N,_}}], Reg) ->
    Ar = length(As),
    erl_internal:new_type_test(N, Ar) orelse erl_internal:comp_op(N, Ar)
	orelse erl_internal:bool_op(N, Ar);
is_last_bool([_|Is], Reg) -> is_last_bool(Is, Reg);
is_last_bool([], _) -> false.

collect_block(Is) ->
    collect_block(Is, []).

collect_block([{allocate_zero,Ns,R},{test_heap,Nh,R}|Is], Acc) ->
    collect_block(Is, [{set,[],[],{alloc,R,{no_opt,Ns,Nh,[]}}}|Acc]);
collect_block([I|Is]=Is0, Acc) ->
    case collect(I) of
	error -> {reverse(Acc),Is0};
	Instr -> collect_block(Is, [Instr|Acc])
    end.

collect({allocate_zero,N,R}) -> {set,[],[],{alloc,R,{zero,N,0,[]}}};
collect({test_heap,N,R})     -> {set,[],[],{alloc,R,{nozero,nostack,N,[]}}};
collect({bif,N,F,As,D})      -> {set,[D],As,{bif,N,F}};
collect({gc_bif,N,F,R,As,D}) -> {set,[D],As,{alloc,R,{gc_bif,N,F}}};
collect({move,S,D})          -> {set,[D],[S],move};
collect({put_list,S1,S2,D})  -> {set,[D],[S1,S2],put_list};
collect({put_tuple,A,D})     -> {set,[D],[],{put_tuple,A}};
collect({put,S})             -> {set,[],[S],put};
collect({get_tuple_element,S,I,D}) -> {set,[D],[S],{get_tuple_element,I}};
collect({set_tuple_element,S,D,I}) -> {set,[],[S,D],{set_tuple_element,I}};
collect({get_list,S,D1,D2})  -> {set,[D1,D2],[S],get_list};
collect(remove_message)      -> {set,[],[],remove_message};
collect({'catch',R,L})       -> {set,[R],[],{'catch',L}};
collect(_)                   -> error.

opt_blocks([{block,Bl0}|Is]) ->
    %% The live annotation at the beginning is not useful.
    [{'%live',_}|Bl] = Bl0,
    [{block,opt_block(Bl)}|opt_blocks(Is)];
opt_blocks([I|Is]) ->
    [I|opt_blocks(Is)];
opt_blocks([]) -> [].

opt_block(Is0) ->
    %% We explicitly move any allocate instruction upwards before optimising
    %% moves, to avoid any potential problems with the calculation of live
    %% registers.
    Is1 = move_allocates(Is0),
    Is = find_fixpoint(fun opt/1, Is1),
    opt_alloc(Is).

find_fixpoint(OptFun, Is0) ->
    case OptFun(Is0) of
	Is0 -> Is0;
	Is1 -> find_fixpoint(OptFun, Is1)
    end.

%% move_allocates(Is0) -> Is
%%  Move allocates upwards in the instruction stream, in the hope of
%%  getting more possibilities for optimizing away moves later.

move_allocates(Is) ->
    move_allocates_1(reverse(Is), []).

move_allocates_1([{set,[],[],{alloc,_,_}=Alloc}|Is0], Acc0) ->
    {Is,Acc} = move_allocates_2(Alloc, Is0, Acc0),
    move_allocates_1(Is, Acc);
move_allocates_1([I|Is], Acc) ->
    move_allocates_1(Is, [I|Acc]);
move_allocates_1([], Is) -> Is.

move_allocates_2({alloc,Live,Info}, [{set,[],[],{alloc,Live0,Info0}}|Is], Acc) ->
    Live = Live0,				% Assertion.
    Alloc = {alloc,Live,combine_alloc(Info0, Info)},
    move_allocates_2(Alloc, Is, Acc);
move_allocates_2({alloc,Live,Info}=Alloc0, [I|Is]=Is0, Acc) ->
    case alloc_may_pass(I) of
	false ->
	    {Is0,[{set,[],[],Alloc0}|Acc]};
	true ->
	    Alloc = {alloc,alloc_live_regs(I, Live),Info},
	    move_allocates_2(Alloc, Is, [I|Acc])
    end;
move_allocates_2(Alloc, [], Acc) ->
    {[],[{set,[],[],Alloc}|Acc]}.

alloc_may_pass({set,_,_,{alloc,_,_}}) -> false;
alloc_may_pass({set,_,_,{set_tuple_element,_}}) -> false;
alloc_may_pass({set,_,_,put_list}) -> false;
alloc_may_pass({set,_,_,{put_tuple,_}}) -> false;
alloc_may_pass({set,_,_,put}) -> false;
alloc_may_pass({set,_,_,_}) -> true.
    
combine_alloc({_,Ns,Nh1,Init}, {_,nostack,Nh2,[]})  ->
    {zero,Ns,beam_utils:combine_heap_needs(Nh1, Nh2),Init}.

%% opt([Instruction]) -> [Instruction]
%%  Optimize the instruction stream inside a basic block.

opt([{set,[Dst],As,{bif,Bif,Fail}}=I1,
     {set,[Dst],[Dst],{bif,'not',Fail}}=I2|Is]) ->
    %% Get rid of the 'not' if the operation can be inverted.
    case inverse_comp_op(Bif) of
 	none -> [I1,I2|opt(Is)];
 	RevBif -> [{set,[Dst],As,{bif,RevBif,Fail}}|opt(Is)]
    end;
opt([{set,[X],[X],move}|Is]) -> opt(Is);
opt([{set,[D1],[{integer,Idx1},Reg],{bif,element,{f,0}}}=I1,
     {set,[D2],[{integer,Idx2},Reg],{bif,element,{f,0}}}=I2|Is])
  when Idx1 < Idx2, D1 =/= D2, D1 =/= Reg, D2 =/= Reg ->
    opt([I2,I1|Is]);
opt([{set,Ds0,Ss,Op}|Is0]) ->	
    {Ds,Is} = opt_moves(Ds0, Is0),
    [{set,Ds,Ss,Op}|opt(Is)];
opt([{'%live',_}=I|Is]) ->
    [I|opt(Is)];
opt([]) -> [].

%% opt_moves([Dest], [Instruction]) -> {[Dest],[Instruction]}
%%  For each Dest, does the optimization described in opt_move/2.

opt_moves([], Is0) -> {[],Is0};
opt_moves([D0]=Ds, Is0) ->
    case opt_move(D0, Is0) of
	not_possible -> {Ds,Is0};
	{D1,Is} -> {[D1],Is}
    end;
opt_moves([X0,Y0], Is0) ->
    {X,Is2} = case opt_move(X0, Is0) of
		  not_possible -> {X0,Is0};
		  {Y0,_} -> {X0,Is0};
		  {_X1,_Is1} = XIs1 -> XIs1
	      end,
    case opt_move(Y0, Is2) of
	not_possible -> {[X,Y0],Is2};
	{X,_} -> {[X,Y0],Is2};
	{Y,Is} -> {[X,Y],Is}
    end.

%% opt_move(Dest, [Instruction]) -> {UpdatedDest,[Instruction]} | not_possible
%%  If there is a {move,Dest,FinalDest} instruction
%%  in the instruction stream, remove the move instruction
%%  and let FinalDest be the destination.
%%
%%  For this optimization to be safe, we must be sure that
%%  Dest will not be referenced in any other by other instructions
%%  in the rest of the instruction stream. Not even the indirect
%%  reference by an instruction that may allocate (such as
%%  test_heap/2 or a GC Bif) is allowed.

opt_move(Dest, Is) ->
    opt_move_1(Dest, Is, ?MAXREG, []).

opt_move_1(R, [{set,_,_,{alloc,Live,_}}|_]=Is, SafeRegs, Acc) when Live < SafeRegs ->
    %% Downgrade number of safe regs and rescan the instruction, as it most probably
    %% is a gc_bif instruction.
    opt_move_1(R, Is, Live, Acc);
opt_move_1(R, [{set,[{x,X}=D],[R],move}|Is], SafeRegs, Acc) ->
    case X < SafeRegs andalso beam_utils:is_killed_block(R, Is) of
	true -> opt_move_2(D, Acc, Is);
	false -> not_possible
    end;
opt_move_1(R, [{set,[D],[R],move}|Is], _SafeRegs, Acc) ->
    case beam_utils:is_killed_block(R, Is) of
	true -> opt_move_2(D, Acc, Is);
	false -> not_possible
    end;
opt_move_1(R, [I|Is], SafeRegs, Acc) ->
    case is_transparent(R, I) of
	false -> not_possible;
	true -> opt_move_1(R, Is, SafeRegs, [I|Acc])
    end.

%% Reverse the instructions, while checking that there are no instructions that
%% would interfere with using the new destination register chosen.

opt_move_2(D, [I|Is], Acc) ->
    case is_transparent(D, I) of
	false -> not_possible;
	true -> opt_move_2(D, Is, [I|Acc])
    end;
opt_move_2(D, [], Acc) -> {D,Acc}.

%% is_transparent(Register, Instruction) -> true | false
%%  Returns true if Instruction does not in any way references Register
%%  (even indirectly by an allocation instruction).
%%  Returns false if Instruction does reference Register, or we are
%%  not sure.

is_transparent({x,X}, {set,_,_,{alloc,Live,_}}) when X < Live ->
    false;
is_transparent(R, {set,Ds,Ss,_Op}) ->
    case member(R, Ds) of
	true -> false;
	false -> not member(R, Ss)
    end;
is_transparent(_, _) -> false.

%% opt_alloc(Instructions) -> Instructions'
%%  Optimises all allocate instructions.

opt_alloc([{set,[],[],{alloc,R,{_,Ns,Nh,[]}}}|Is]) ->
    [{set,[],[],opt_alloc(Is, Ns, Nh, R)}|opt(Is)];
opt_alloc([I|Is]) -> [I|opt_alloc(Is)];
opt_alloc([]) -> [].
	
%% opt_alloc(Instructions, FrameSize, HeapNeed, LivingRegs) -> [Instr]
%%  Generates the optimal sequence of instructions for
%%  allocating and initalizing the stack frame and needed heap.

opt_alloc(_Is, nostack, Nh, LivingRegs) ->
    {alloc,LivingRegs,{nozero,nostack,Nh,[]}};
opt_alloc(Is, Ns, Nh, LivingRegs) ->
    InitRegs = init_yreg(Is, 0),
    case count_ones(InitRegs) of
	N when N*2 > Ns ->
	    {alloc,LivingRegs,{nozero,Ns,Nh,gen_init(Ns, InitRegs)}};
	_ ->
	    {alloc,LivingRegs,{zero,Ns,Nh,[]}}
    end.

gen_init(Fs, Regs) -> gen_init(Fs, Regs, 0, []).

gen_init(SameFs, _Regs, SameFs, Acc) -> reverse(Acc);
gen_init(Fs, Regs, Y, Acc) when Regs band 1 =:= 0 ->
    gen_init(Fs, Regs bsr 1, Y+1, [{init,{y,Y}}|Acc]);
gen_init(Fs, Regs, Y, Acc) ->
    gen_init(Fs, Regs bsr 1, Y+1, Acc).

%% init_yreg(Instructions, RegSet) -> RegSetInitialized
%%  Calculate the set of initialized y registers.

init_yreg([{set,_,_,{bif,_,_}}|_], Reg) -> Reg;
init_yreg([{set,_,_,{alloc,_,{gc_bif,_,_}}}|_], Reg) -> Reg;
init_yreg([{set,Ds,_,_}|Is], Reg) -> init_yreg(Is, add_yregs(Ds, Reg));
init_yreg(_Is, Reg) -> Reg.

add_yregs(Ys, Reg) -> foldl(fun(Y, R0) -> add_yreg(Y, R0) end, Reg, Ys).
    
add_yreg({y,Y}, Reg) -> Reg bor (1 bsl Y);
add_yreg(_, Reg)     -> Reg.

count_ones(Bits) -> count_ones(Bits, 0).
count_ones(0, Acc) -> Acc;
count_ones(Bits, Acc) ->
    count_ones(Bits bsr 1, Acc + (Bits band 1)).

%% Calculate the new number of live registers when we move an allocate
%% instruction upwards, passing a 'set' instruction.

alloc_live_regs({set,Ds,Ss,_}, Regs0) ->
    Rset = x_live(Ss, x_dead(Ds, (1 bsl Regs0)-1)),
    live_regs(Rset).

live_regs(Regs) ->
    live_regs_1(0, Regs).

live_regs_1(N, 0) -> N;
live_regs_1(N, Regs) -> live_regs_1(N+1, Regs bsr 1).

x_dead([{x,N}|Rs], Regs) -> x_dead(Rs, Regs band (bnot (1 bsl N)));
x_dead([_|Rs], Regs) -> x_dead(Rs, Regs);
x_dead([], Regs) -> Regs.

x_live([{x,N}|Rs], Regs) -> x_live(Rs, Regs bor (1 bsl N));
x_live([_|Rs], Regs) -> x_live(Rs, Regs);
x_live([], Regs) -> Regs.

%% inverse_comp_op(Op) -> none|RevOp

inverse_comp_op('=:=') -> '=/=';
inverse_comp_op('=/=') -> '=:=';
inverse_comp_op('==') -> '/=';
inverse_comp_op('/=') -> '==';
inverse_comp_op('>') -> '=<';
inverse_comp_op('<') -> '>=';
inverse_comp_op('>=') -> '<';
inverse_comp_op('=<') -> '>';
inverse_comp_op(_) -> none.

%%%
%%% Evaluation of constant bit fields.
%%%

is_bs_put({bs_put_integer,_,_,_,_,_}) -> true;
is_bs_put({bs_put_float,_,_,_,_,_}) -> true;
is_bs_put(_) -> false.

collect_bs_puts(Is) ->
    collect_bs_puts_1(Is, []).
    
collect_bs_puts_1([I|Is]=Is0, Acc) ->
    case is_bs_put(I) of
	false -> {reverse(Acc),Is0};
	true -> collect_bs_puts_1(Is, [I|Acc])
    end.
    
opt_bs_puts(Is) ->
    opt_bs_1(Is, []).

opt_bs_1([{bs_put_float,Fail,{integer,Sz},1,Flags0,Src}=I0|Is], Acc) ->
    try eval_put_float(Src, Sz, Flags0) of
	<<Int:Sz>> ->
	    Flags = force_big(Flags0),
	    I = {bs_put_integer,Fail,{integer,Sz},1,Flags,{integer,Int}},
	    opt_bs_1([I|Is], Acc)
    catch
	error:_ ->
	    opt_bs_1(Is, [I0|Acc])
    end;
opt_bs_1([{bs_put_integer,_,{integer,8},1,_,{integer,_}}|_]=IsAll, Acc0) ->
    {Is,Acc} = bs_collect_string(IsAll, Acc0),
    opt_bs_1(Is, Acc);
opt_bs_1([{bs_put_integer,Fail,{integer,Sz},1,F,{integer,N}}=I|Is0], Acc) when Sz > 8 ->
    case field_endian(F) of
	big ->
	    %% We can do this optimization for any field size without risk
	    %% for code explosion.
	    case bs_split_int(N, Sz, Fail, Is0) of
		no_split -> opt_bs_1(Is0, [I|Acc]);
		Is -> opt_bs_1(Is, Acc)
	    end;
	little when Sz < 128 ->
	    %% We only try to optimize relatively small fields, to avoid
	    %% an explosion in code size.
	    <<Int:Sz>> = <<N:Sz/little>>,
	    Flags = force_big(F),
	    Is = [{bs_put_integer,Fail,{integer,Sz},1,
		   Flags,{integer,Int}}|Is0],
	    opt_bs_1(Is, Acc);
	_ -> 					%native or too wide little field
	    opt_bs_1(Is0, [I|Acc])
    end;
opt_bs_1([{Op,Fail,{integer,Sz},U,F,Src}|Is], Acc) when U > 1 ->
    opt_bs_1([{Op,Fail,{integer,U*Sz},1,F,Src}|Is], Acc);
opt_bs_1([I|Is], Acc) ->
    opt_bs_1(Is, [I|Acc]);
opt_bs_1([], Acc) -> reverse(Acc).

eval_put_float(Src, Sz, Flags) when Sz =< 256 -> %Only evaluate if Sz is reasonable.
    Val = value(Src),
    case field_endian(Flags) of
	little -> <<Val:Sz/little-float-unit:1>>;
	big -> <<Val:Sz/big-float-unit:1>>
        %% native intentionally not handled here - we can't optimize it.
    end.

value({integer,I}) -> I;
value({float,F}) -> F.

bs_collect_string(Is, [{bs_put_string,Len,{string,Str}}|Acc]) ->
    bs_coll_str_1(Is, Len, reverse(Str), Acc);
bs_collect_string(Is, Acc) ->
    bs_coll_str_1(Is, 0, [], Acc).
    
bs_coll_str_1([{bs_put_integer,_,{integer,Sz},U,_,{integer,V}}|Is],
	      Len, StrAcc, IsAcc) when U*Sz =:= 8 ->
    Byte = V band 16#FF,
    bs_coll_str_1(Is, Len+1, [Byte|StrAcc], IsAcc);
bs_coll_str_1(Is, Len, StrAcc, IsAcc) ->
    {Is,[{bs_put_string,Len,{string,reverse(StrAcc)}}|IsAcc]}.

field_endian({field_flags,F}) -> field_endian_1(F).

field_endian_1([big=E|_]) -> E;
field_endian_1([little=E|_]) -> E;
field_endian_1([native=E|_]) -> E;
field_endian_1([_|Fs]) -> field_endian_1(Fs).

force_big({field_flags,F}) ->
    {field_flags,force_big_1(F)}.

force_big_1([big|_]=Fs) -> Fs;
force_big_1([little|Fs]) -> [big|Fs];
force_big_1([F|Fs]) -> [F|force_big_1(Fs)].

bs_split_int(0, Sz, _, _) when Sz > 64 ->
    %% We don't want to split in this case because the
    %% string will consist of only zeroes.
    no_split;
bs_split_int(-1, Sz, _, _) when Sz > 64 ->
    %% We don't want to split in this case because the
    %% string will consist of only 255 bytes.
    no_split;
bs_split_int(N, Sz, Fail, Acc) ->
    FirstByteSz = case Sz rem 8 of
		      0 -> 8;
		      Rem -> Rem
		  end,
    bs_split_int_1(N, FirstByteSz, Sz, Fail, Acc).

bs_split_int_1(-1, _, Sz, Fail, Acc) when Sz > 64 ->
    I = {bs_put_integer,Fail,{integer,Sz},1,{field_flags,[big]},{integer,-1}},
    [I|Acc];
bs_split_int_1(0, _, Sz, Fail, Acc) when Sz > 64 ->
    I = {bs_put_integer,Fail,{integer,Sz},1,{field_flags,[big]},{integer,0}},
    [I|Acc];
bs_split_int_1(N, ByteSz, Sz, Fail, Acc) when Sz > 0 ->
    Mask = (1 bsl ByteSz) - 1,
    I = {bs_put_integer,Fail,{integer,ByteSz},1,
	 {field_flags,[big]},{integer,N band Mask}},
    bs_split_int_1(N bsr ByteSz, 8, Sz-ByteSz, Fail, [I|Acc]);
bs_split_int_1(_, _, _, _, Acc) -> Acc.


%%%
%%% Optimization of new bit syntax matching: get rid
%%% of redundant bs_restore2/2 instructions across select_val
%%% instructions, as well as a few other simple peep-hole optimizations.
%%%

bsm_opt(Is0, Lc0) ->
    {Is1,D0,Lc} = bsm_scan(Is0, [], Lc0, []),
    Is2 = case D0 of
	      [] ->
		  Is1;
	     _ ->
		  D = gb_trees:from_orddict(orddict:from_list(D0)),
		  bsm_reroute(Is1, D, none, [])
	 end,
    Is = beam_clean:bs_clean_saves(Is2),
    {bsm_opt_2(Is, []),Lc}.

bsm_scan([{label,L}=Lbl,{bs_restore2,_,Save}=R|Is], D0, Lc, Acc0) ->
    D = [{{L,Save},Lc}|D0],
    Acc = [{label,Lc},R,Lbl|Acc0],
    bsm_scan(Is, D, Lc+1, Acc);
bsm_scan([I|Is], D, Lc, Acc) ->
    bsm_scan(Is, D, Lc, [I|Acc]);
bsm_scan([], D, Lc, Acc) ->
    {reverse(Acc),D,Lc}.

bsm_reroute([{bs_save2,Reg,Save}=I|Is], D, _, Acc) ->
    bsm_reroute(Is, D, {Reg,Save}, [I|Acc]);
bsm_reroute([{bs_restore2,Reg,Save}=I|Is], D, _, Acc) ->
    bsm_reroute(Is, D, {Reg,Save}, [I|Acc]);
bsm_reroute([{label,_}=I|Is], D, S, Acc) ->
    bsm_reroute(Is, D, S, [I|Acc]);
bsm_reroute([{select_val,Reg,F0,{list,Lbls0}}|Is], D, {_,Save}=S, Acc0) ->
    [F|Lbls] = bsm_subst_labels([F0|Lbls0], Save, D),
    Acc = [{select_val,Reg,F,{list,Lbls}}|Acc0],
    bsm_reroute(Is, D, S, Acc);
bsm_reroute([{test,TestOp,F0,TestArgs}=I|Is], D, {_,Save}=S, Acc0) ->
    F = bsm_subst_label(F0, Save, D),
    Acc = [{test,TestOp,F,TestArgs}|Acc0],
    case bsm_not_bs_test(I) of
	true ->
	    %% The test instruction will not update the bit offset for the
	    %% binary being matched. Therefore the save position can be kept.
	    bsm_reroute(Is, D, S, Acc);
	false ->
	    %% The test instruction might update the bit offset. Kill our
	    %% remembered Save position.
	    bsm_reroute(Is, D, none, Acc)
    end;
bsm_reroute([{test,TestOp,F0,Live,TestArgs,Dst}|Is], D, {_,Save}, Acc0) ->
    F = bsm_subst_label(F0, Save, D),
    Acc = [{test,TestOp,F,Live,TestArgs,Dst}|Acc0],
    %% The test instruction will update the bit offset. Kill our
    %% remembered Save position.
    bsm_reroute(Is, D, none, Acc);
bsm_reroute([{block,[{set,[],[],{alloc,_,_}}]}=Bl,
	     {bs_context_to_binary,_}=I|Is], D, S, Acc) ->
    %% To help further bit syntax optimizations.
    bsm_reroute([I,Bl|Is], D, S, Acc);
bsm_reroute([I|Is], D, _, Acc) ->
    bsm_reroute(Is, D, none, [I|Acc]);
bsm_reroute([], _, _, Acc) -> reverse(Acc).

bsm_opt_2([{test,bs_test_tail2,F,[Ctx,Bits]}|Is],
	  [{test,bs_skip_bits2,F,[Ctx,{integer,I},Unit,_Flags]}|Acc]) ->
    bsm_opt_2(Is, [{test,bs_test_tail2,F,[Ctx,Bits+I*Unit]}|Acc]);
bsm_opt_2([{test,bs_skip_bits2,F,[Ctx,{integer,I1},Unit1,_]}|Is],
	  [{test,bs_skip_bits2,F,[Ctx,{integer,I2},Unit2,Flags]}|Acc]) ->
    bsm_opt_2(Is, [{test,bs_skip_bits2,F,
		    [Ctx,{integer,I1*Unit1+I2*Unit2},1,Flags]}|Acc]);
bsm_opt_2([{test,bs_match_string,F,[Ctx,Bin1]},
	   {test,bs_match_string,F,[Ctx,Bin2]}|Is], Acc) ->
    I = {test,bs_match_string,F,[Ctx,<<Bin1/bitstring,Bin2/bitstring>>]},
    bsm_opt_2([I|Is], Acc);
bsm_opt_2([I|Is], Acc) ->
    bsm_opt_2(Is, [I|Acc]);
bsm_opt_2([], Acc) -> reverse(Acc).

%% bsm_not_bs_test({test,Name,_,Operands}) -> true|false.
%%  Test whether is the test is a "safe", i.e. does not move the
%%  bit offset for a binary.
%%
%%  'true' means that the test is safe, 'false' that we don't know or
%%  that the test moves the offset (e.g. bs_get_integer2).

bsm_not_bs_test({test,bs_test_tail2,_,[_,_]}) -> true;
bsm_not_bs_test(Test) -> beam_utils:is_pure_test(Test).

bsm_subst_labels(Fs, Save, D) ->
    bsm_subst_labels_1(Fs, Save, D, []).

bsm_subst_labels_1([F|Fs], Save, D, Acc) ->
    bsm_subst_labels_1(Fs, Save, D, [bsm_subst_label(F, Save, D)|Acc]);
bsm_subst_labels_1([], _, _, Acc) ->
    reverse(Acc).

bsm_subst_label({f,Lbl0}=F, Save, D) ->
    case gb_trees:lookup({Lbl0,Save}, D) of
	{value,Lbl} -> {f,Lbl};
	none -> F
    end;
bsm_subst_label(Other, _, _) -> Other.
%%
%% %CopyrightBegin%
%%
%% Copyright Ericsson AB 1999-2010. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
%% compliance with the License. You should have received a copy of the
%% Erlang Public License along with this software. If not, it can be
%% retrieved online at http://www.erlang.org/.
%%
%% Software distributed under the License is distributed on an "AS IS"
%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%% the License for the specific language governing rights and limitations
%% under the License.
%%
%% %CopyrightEnd%
%%
%% Purpose : Partitions assembly instructions into basic blocks and
%% optimizes them.

-module(beam_block).

-export([module/2]).
-import(lists, [mapfoldl/3,reverse/1,reverse/2,foldl/3,member/2]).
-define(MAXREG, 1024).

module({Mod,Exp,Attr,Fs0,Lc0}, _Opt) ->
    {Fs,Lc} = mapfoldl(fun function/2, Lc0, Fs0),
    {ok,{Mod,Exp,Attr,Fs,Lc}}.

function({function,Name,Arity,CLabel,Is0}, Lc0) ->
    try
	%% Extra labels may thwart optimizations.
	Is1 = beam_jump:remove_unused_labels(Is0),

	%% Collect basic blocks and optimize them.
	Is2 = blockify(Is1),
	Is3 = beam_utils:live_opt(Is2),
	Is4 = opt_blocks(Is3),
	Is5 = beam_utils:delete_live_annos(Is4),

	%% Optimize bit syntax.
	{Is,Lc} = bsm_opt(Is5, Lc0),

	%% Done.
	{{function,Name,Arity,CLabel,Is},Lc}
    catch
	Class:Error ->
	    Stack = erlang:get_stacktrace(),
	    io:fwrite("Function: ~w/~w\n", [Name,Arity]),
	    erlang:raise(Class, Error, Stack)
    end.

%% blockify(Instructions0) -> Instructions
%%  Collect sequences of instructions to basic blocks.
%%  Also do some simple optimations on instructions outside the blocks.

blockify(Is) ->
    blockify(Is, []).

blockify([{loop_rec,{f,Fail},{x,0}},{loop_rec_end,_Lbl},{label,Fail}|Is], Acc) ->
    %% Useless instruction sequence.
    blockify(Is, Acc);

%% New bit syntax matching.
blockify([{bs_save2,R,Point}=I,{bs_restore2,R,Point}|Is], Acc) ->
    blockify([I|Is], Acc);
blockify([{bs_save2,R,Point}=I,{test,is_eq_exact,_,_}=Test,
	  {bs_restore2,R,Point}|Is], Acc) ->
    blockify([I,Test|Is], Acc);

%% Do other peep-hole optimizations.
blockify([{test,is_atom,{f,Fail},[Reg]}=I|
	  [{select_val,Reg,{f,Fail},
	    {list,[{atom,false},{f,_}=BrFalse,
		   {atom,true}=AtomTrue,{f,_}=BrTrue]}}|Is]=Is0],
	 [{block,Bl}|_]=Acc) ->
    case is_last_bool(Bl, Reg) of
	false ->
	    blockify(Is0, [I|Acc]);
	true ->
	    %% The last instruction is a boolean operator/guard BIF that can't fail.
	    %% We can convert the three-way branch to a two-way branch (eliminating
	    %% the reference to the failure label).
	    blockify(Is, [{jump,BrTrue},
			  {test,is_eq_exact,BrFalse,[Reg,AtomTrue]}|Acc])
    end;
blockify([{test,is_atom,{f,Fail},[Reg]}=I|
	  [{select_val,Reg,{f,Fail},
	    {list,[{atom,true}=AtomTrue,{f,_}=BrTrue,
		   {atom,false},{f,_}=BrFalse]}}|Is]=Is0],
	 [{block,Bl}|_]=Acc) ->
    case is_last_bool(Bl, Reg) of
	false ->
	    blockify(Is0, [I|Acc]);
	true ->
	    blockify(Is, [{jump,BrTrue},
			  {test,is_eq_exact,BrFalse,[Reg,AtomTrue]}|Acc])
    end;
blockify([I|Is0]=IsAll, Acc) ->
    case is_bs_put(I) of
	true ->
	    {BsPuts0,Is} = collect_bs_puts(IsAll),
	    BsPuts = opt_bs_puts(BsPuts0),
	    blockify(Is, reverse(BsPuts, Acc));
	false ->
	    case collect(I) of
		error -> blockify(Is0, [I|Acc]);
		Instr when is_tuple(Instr) ->
		    {Block,Is} = collect_block(IsAll),
		    blockify(Is, [{block,Block}|Acc])
	    end
    end;
blockify([], Acc) -> reverse(Acc).

is_last_bool([{set,[Reg],As,{bif,N,_}}], Reg) ->
    Ar = length(As),
    erl_internal:new_type_test(N, Ar) orelse erl_internal:comp_op(N, Ar)
	orelse erl_internal:bool_op(N, Ar);
is_last_bool([_|Is], Reg) -> is_last_bool(Is, Reg);
is_last_bool([], _) -> false.

collect_block(Is) ->
    collect_block(Is, []).

collect_block([{allocate_zero,Ns,R},{test_heap,Nh,R}|Is], Acc) ->
    collect_block(Is, [{set,[],[],{alloc,R,{no_opt,Ns,Nh,[]}}}|Acc]);
collect_block([I|Is]=Is0, Acc) ->
    case collect(I) of
	error -> {reverse(Acc),Is0};
	Instr -> collect_block(Is, [Instr|Acc])
    end.

collect({allocate_zero,N,R}) -> {set,[],[],{alloc,R,{zero,N,0,[]}}};
collect({test_heap,N,R})     -> {set,[],[],{alloc,R,{nozero,nostack,N,[]}}};
collect({bif,N,F,As,D})      -> {set,[D],As,{bif,N,F}};
collect({gc_bif,N,F,R,As,D}) -> {set,[D],As,{alloc,R,{gc_bif,N,F}}};
collect({move,S,D})          -> {set,[D],[S],move};
collect({put_list,S1,S2,D})  -> {set,[D],[S1,S2],put_list};
collect({put_tuple,A,D})     -> {set,[D],[],{put_tuple,A}};
collect({put,S})             -> {set,[],[S],put};
collect({get_tuple_element,S,I,D}) -> {set,[D],[S],{get_tuple_element,I}};
collect({set_tuple_element,S,D,I}) -> {set,[],[S,D],{set_tuple_element,I}};
collect({get_list,S,D1,D2})  -> {set,[D1,D2],[S],get_list};
collect(remove_message)      -> {set,[],[],remove_message};
collect({'catch',R,L})       -> {set,[R],[],{'catch',L}};
collect(_)                   -> error.

opt_blocks([{block,Bl0}|Is]) ->
    %% The live annotation at the beginning is not useful.
    [{'%live',_}|Bl] = Bl0,
    [{block,opt_block(Bl)}|opt_blocks(Is)];
opt_blocks([I|Is]) ->
    [I|opt_blocks(Is)];
opt_blocks([]) -> [].

opt_block(Is0) ->
    %% We explicitly move any allocate instruction upwards before optimising
    %% moves, to avoid any potential problems with the calculation of live
    %% registers.
    Is1 = move_allocates(Is0),
    Is = find_fixpoint(fun opt/1, Is1),
    opt_alloc(Is).

find_fixpoint(OptFun, Is0) ->
    case OptFun(Is0) of
	Is0 -> Is0;
	Is1 -> find_fixpoint(OptFun, Is1)
    end.

%% move_allocates(Is0) -> Is
%%  Move allocates upwards in the instruction stream, in the hope of
%%  getting more possibilities for optimizing away moves later.

move_allocates(Is) ->
    move_allocates_1(reverse(Is), []).

move_allocates_1([{set,[],[],{alloc,_,_}=Alloc}|Is0], Acc0) ->
    {Is,Acc} = move_allocates_2(Alloc, Is0, Acc0),
    move_allocates_1(Is, Acc);
move_allocates_1([I|Is], Acc) ->
    move_allocates_1(Is, [I|Acc]);
move_allocates_1([], Is) -> Is.

move_allocates_2({alloc,Live,Info}, [{set,[],[],{alloc,Live0,Info0}}|Is], Acc) ->
    Live = Live0,				% Assertion.
    Alloc = {alloc,Live,combine_alloc(Info0, Info)},
    move_allocates_2(Alloc, Is, Acc);
move_allocates_2({alloc,Live,Info}=Alloc0, [I|Is]=Is0, Acc) ->
    case alloc_may_pass(I) of
	false ->
	    {Is0,[{set,[],[],Alloc0}|Acc]};
	true ->
	    Alloc = {alloc,alloc_live_regs(I, Live),Info},
	    move_allocates_2(Alloc, Is, [I|Acc])
    end;
move_allocates_2(Alloc, [], Acc) ->
    {[],[{set,[],[],Alloc}|Acc]}.

alloc_may_pass({set,_,_,{alloc,_,_}}) -> false;
alloc_may_pass({set,_,_,{set_tuple_element,_}}) -> false;
alloc_may_pass({set,_,_,put_list}) -> false;
alloc_may_pass({set,_,_,{put_tuple,_}}) -> false;
alloc_may_pass({set,_,_,put}) -> false;
alloc_may_pass({set,_,_,_}) -> true.
    
combine_alloc({_,Ns,Nh1,Init}, {_,nostack,Nh2,[]})  ->
    {zero,Ns,beam_utils:combine_heap_needs(Nh1, Nh2),Init}.

%% opt([Instruction]) -> [Instruction]
%%  Optimize the instruction stream inside a basic block.

opt([{set,[Dst],As,{bif,Bif,Fail}}=I1,
     {set,[Dst],[Dst],{bif,'not',Fail}}=I2|Is]) ->
    %% Get rid of the 'not' if the operation can be inverted.
    case inverse_comp_op(Bif) of
 	none -> [I1,I2|opt(Is)];
 	RevBif -> [{set,[Dst],As,{bif,RevBif,Fail}}|opt(Is)]
    end;
opt([{set,[X],[X],move}|Is]) -> opt(Is);
opt([{set,[D1],[{integer,Idx1},Reg],{bif,element,{f,0}}}=I1,
     {set,[D2],[{integer,Idx2},Reg],{bif,element,{f,0}}}=I2|Is])
  when Idx1 < Idx2, D1 =/= D2, D1 =/= Reg, D2 =/= Reg ->
    opt([I2,I1|Is]);
opt([{set,Ds0,Ss,Op}|Is0]) ->	
    {Ds,Is} = opt_moves(Ds0, Is0),
    [{set,Ds,Ss,Op}|opt(Is)];
opt([{'%live',_}=I|Is]) ->
    [I|opt(Is)];
opt([]) -> [].

%% opt_moves([Dest], [Instruction]) -> {[Dest],[Instruction]}
%%  For each Dest, does the optimization described in opt_move/2.

opt_moves([], Is0) -> {[],Is0};
opt_moves([D0]=Ds, Is0) ->
    case opt_move(D0, Is0) of
	not_possible -> {Ds,Is0};
	{D1,Is} -> {[D1],Is}
    end;
opt_moves([X0,Y0], Is0) ->
    {X,Is2} = case opt_move(X0, Is0) of
		  not_possible -> {X0,Is0};
		  {Y0,_} -> {X0,Is0};
		  {_X1,_Is1} = XIs1 -> XIs1
	      end,
    case opt_move(Y0, Is2) of
	not_possible -> {[X,Y0],Is2};
	{X,_} -> {[X,Y0],Is2};
	{Y,Is} -> {[X,Y],Is}
    end.

%% opt_move(Dest, [Instruction]) -> {UpdatedDest,[Instruction]} | not_possible
%%  If there is a {move,Dest,FinalDest} instruction
%%  in the instruction stream, remove the move instruction
%%  and let FinalDest be the destination.
%%
%%  For this optimization to be safe, we must be sure that
%%  Dest will not be referenced in any other by other instructions
%%  in the rest of the instruction stream. Not even the indirect
%%  reference by an instruction that may allocate (such as
%%  test_heap/2 or a GC Bif) is allowed.

opt_move(Dest, Is) ->
    opt_move_1(Dest, Is, ?MAXREG, []).

opt_move_1(R, [{set,_,_,{alloc,Live,_}}|_]=Is, SafeRegs, Acc) when Live < SafeRegs ->
    %% Downgrade number of safe regs and rescan the instruction, as it most probably
    %% is a gc_bif instruction.
    opt_move_1(R, Is, Live, Acc);
opt_move_1(R, [{set,[{x,X}=D],[R],move}|Is], SafeRegs, Acc) ->
    case X < SafeRegs andalso beam_utils:is_killed_block(R, Is) of
	true -> opt_move_2(D, Acc, Is);
	false -> not_possible
    end;
opt_move_1(R, [{set,[D],[R],move}|Is], _SafeRegs, Acc) ->
    case beam_utils:is_killed_block(R, Is) of
	true -> opt_move_2(D, Acc, Is);
	false -> not_possible
    end;
opt_move_1(R, [I|Is], SafeRegs, Acc) ->
    case is_transparent(R, I) of
	false -> not_possible;
	true -> opt_move_1(R, Is, SafeRegs, [I|Acc])
    end.

%% Reverse the instructions, while checking that there are no instructions that
%% would interfere with using the new destination register chosen.

opt_move_2(D, [I|Is], Acc) ->
    case is_transparent(D, I) of
	false -> not_possible;
	true -> opt_move_2(D, Is, [I|Acc])
    end;
opt_move_2(D, [], Acc) -> {D,Acc}.

%% is_transparent(Register, Instruction) -> true | false
%%  Returns true if Instruction does not in any way references Register
%%  (even indirectly by an allocation instruction).
%%  Returns false if Instruction does reference Register, or we are
%%  not sure.

is_transparent({x,X}, {set,_,_,{alloc,Live,_}}) when X < Live ->
    false;
is_transparent(R, {set,Ds,Ss,_Op}) ->
    case member(R, Ds) of
	true -> false;
	false -> not member(R, Ss)
    end;
is_transparent(_, _) -> false.

%% opt_alloc(Instructions) -> Instructions'
%%  Optimises all allocate instructions.

opt_alloc([{set,[],[],{alloc,R,{_,Ns,Nh,[]}}}|Is]) ->
    [{set,[],[],opt_alloc(Is, Ns, Nh, R)}|opt(Is)];
opt_alloc([I|Is]) -> [I|opt_alloc(Is)];
opt_alloc([]) -> [].
	
%% opt_alloc(Instructions, FrameSize, HeapNeed, LivingRegs) -> [Instr]
%%  Generates the optimal sequence of instructions for
%%  allocating and initalizing the stack frame and needed heap.

opt_alloc(_Is, nostack, Nh, LivingRegs) ->
    {alloc,LivingRegs,{nozero,nostack,Nh,[]}};
opt_alloc(Is, Ns, Nh, LivingRegs) ->
    InitRegs = init_yreg(Is, 0),
    case count_ones(InitRegs) of
	N when N*2 > Ns ->
	    {alloc,LivingRegs,{nozero,Ns,Nh,gen_init(Ns, InitRegs)}};
	_ ->
	    {alloc,LivingRegs,{zero,Ns,Nh,[]}}
    end.

gen_init(Fs, Regs) -> gen_init(Fs, Regs, 0, []).

gen_init(SameFs, _Regs, SameFs, Acc) -> reverse(Acc);
gen_init(Fs, Regs, Y, Acc) when Regs band 1 =:= 0 ->
    gen_init(Fs, Regs bsr 1, Y+1, [{init,{y,Y}}|Acc]);
gen_init(Fs, Regs, Y, Acc) ->
    gen_init(Fs, Regs bsr 1, Y+1, Acc).

%% init_yreg(Instructions, RegSet) -> RegSetInitialized
%%  Calculate the set of initialized y registers.

init_yreg([{set,_,_,{bif,_,_}}|_], Reg) -> Reg;
init_yreg([{set,_,_,{alloc,_,{gc_bif,_,_}}}|_], Reg) -> Reg;
init_yreg([{set,Ds,_,_}|Is], Reg) -> init_yreg(Is, add_yregs(Ds, Reg));
init_yreg(_Is, Reg) -> Reg.

add_yregs(Ys, Reg) -> foldl(fun(Y, R0) -> add_yreg(Y, R0) end, Reg, Ys).
    
add_yreg({y,Y}, Reg) -> Reg bor (1 bsl Y);
add_yreg(_, Reg)     -> Reg.

count_ones(Bits) -> count_ones(Bits, 0).
count_ones(0, Acc) -> Acc;
count_ones(Bits, Acc) ->
    count_ones(Bits bsr 1, Acc + (Bits band 1)).

%% Calculate the new number of live registers when we move an allocate
%% instruction upwards, passing a 'set' instruction.

alloc_live_regs({set,Ds,Ss,_}, Regs0) ->
    Rset = x_live(Ss, x_dead(Ds, (1 bsl Regs0)-1)),
    live_regs(Rset).

live_regs(Regs) ->
    live_regs_1(0, Regs).

live_regs_1(N, 0) -> N;
live_regs_1(N, Regs) -> live_regs_1(N+1, Regs bsr 1).

x_dead([{x,N}|Rs], Regs) -> x_dead(Rs, Regs band (bnot (1 bsl N)));
x_dead([_|Rs], Regs) -> x_dead(Rs, Regs);
x_dead([], Regs) -> Regs.

x_live([{x,N}|Rs], Regs) -> x_live(Rs, Regs bor (1 bsl N));
x_live([_|Rs], Regs) -> x_live(Rs, Regs);
x_live([], Regs) -> Regs.

%% inverse_comp_op(Op) -> none|RevOp

inverse_comp_op('=:=') -> '=/=';
inverse_comp_op('=/=') -> '=:=';
inverse_comp_op('==') -> '/=';
inverse_comp_op('/=') -> '==';
inverse_comp_op('>') -> '=<';
inverse_comp_op('<') -> '>=';
inverse_comp_op('>=') -> '<';
inverse_comp_op('=<') -> '>';
inverse_comp_op(_) -> none.

%%%
%%% Evaluation of constant bit fields.
%%%

is_bs_put({bs_put_integer,_,_,_,_,_}) -> true;
is_bs_put({bs_put_float,_,_,_,_,_}) -> true;
is_bs_put(_) -> false.

collect_bs_puts(Is) ->
    collect_bs_puts_1(Is, []).
    
collect_bs_puts_1([I|Is]=Is0, Acc) ->
    case is_bs_put(I) of
	false -> {reverse(Acc),Is0};
	true -> collect_bs_puts_1(Is, [I|Acc])
    end.
    
opt_bs_puts(Is) ->
    opt_bs_1(Is, []).

opt_bs_1([{bs_put_float,Fail,{integer,Sz},1,Flags0,Src}=I0|Is], Acc) ->
    try eval_put_float(Src, Sz, Flags0) of
	<<Int:Sz>> ->
	    Flags = force_big(Flags0),
	    I = {bs_put_integer,Fail,{integer,Sz},1,Flags,{integer,Int}},
	    opt_bs_1([I|Is], Acc)
    catch
	error:_ ->
	    opt_bs_1(Is, [I0|Acc])
    end;
opt_bs_1([{bs_put_integer,_,{integer,8},1,_,{integer,_}}|_]=IsAll, Acc0) ->
    {Is,Acc} = bs_collect_string(IsAll, Acc0),
    opt_bs_1(Is, Acc);
opt_bs_1([{bs_put_integer,Fail,{integer,Sz},1,F,{integer,N}}=I|Is0], Acc) when Sz > 8 ->
    case field_endian(F) of
	big ->
	    %% We can do this optimization for any field size without risk
	    %% for code explosion.
	    case bs_split_int(N, Sz, Fail, Is0) of
		no_split -> opt_bs_1(Is0, [I|Acc]);
		Is -> opt_bs_1(Is, Acc)
	    end;
	little when Sz < 128 ->
	    %% We only try to optimize relatively small fields, to avoid
	    %% an explosion in code size.
	    <<Int:Sz>> = <<N:Sz/little>>,
	    Flags = force_big(F),
	    Is = [{bs_put_integer,Fail,{integer,Sz},1,
		   Flags,{integer,Int}}|Is0],
	    opt_bs_1(Is, Acc);
	_ -> 					%native or too wide little field
	    opt_bs_1(Is0, [I|Acc])
    end;
opt_bs_1([{Op,Fail,{integer,Sz},U,F,Src}|Is], Acc) when U > 1 ->
    opt_bs_1([{Op,Fail,{integer,U*Sz},1,F,Src}|Is], Acc);
opt_bs_1([I|Is], Acc) ->
    opt_bs_1(Is, [I|Acc]);
opt_bs_1([], Acc) -> reverse(Acc).

eval_put_float(Src, Sz, Flags) when Sz =< 256 -> %Only evaluate if Sz is reasonable.
    Val = value(Src),
    case field_endian(Flags) of
	little -> <<Val:Sz/little-float-unit:1>>;
	big -> <<Val:Sz/big-float-unit:1>>
        %% native intentionally not handled here - we can't optimize it.
    end.

value({integer,I}) -> I;
value({float,F}) -> F.

bs_collect_string(Is, [{bs_put_string,Len,{string,Str}}|Acc]) ->
    bs_coll_str_1(Is, Len, reverse(Str), Acc);
bs_collect_string(Is, Acc) ->
    bs_coll_str_1(Is, 0, [], Acc).
    
bs_coll_str_1([{bs_put_integer,_,{integer,Sz},U,_,{integer,V}}|Is],
	      Len, StrAcc, IsAcc) when U*Sz =:= 8 ->
    Byte = V band 16#FF,
    bs_coll_str_1(Is, Len+1, [Byte|StrAcc], IsAcc);
bs_coll_str_1(Is, Len, StrAcc, IsAcc) ->
    {Is,[{bs_put_string,Len,{string,reverse(StrAcc)}}|IsAcc]}.

field_endian({field_flags,F}) -> field_endian_1(F).

field_endian_1([big=E|_]) -> E;
field_endian_1([little=E|_]) -> E;
field_endian_1([native=E|_]) -> E;
field_endian_1([_|Fs]) -> field_endian_1(Fs).

force_big({field_flags,F}) ->
    {field_flags,force_big_1(F)}.

force_big_1([big|_]=Fs) -> Fs;
force_big_1([little|Fs]) -> [big|Fs];
force_big_1([F|Fs]) -> [F|force_big_1(Fs)].

bs_split_int(0, Sz, _, _) when Sz > 64 ->
    %% We don't want to split in this case because the
    %% string will consist of only zeroes.
    no_split;
bs_split_int(-1, Sz, _, _) when Sz > 64 ->
    %% We don't want to split in this case because the
    %% string will consist of only 255 bytes.
    no_split;
bs_split_int(N, Sz, Fail, Acc) ->
    FirstByteSz = case Sz rem 8 of
		      0 -> 8;
		      Rem -> Rem
		  end,
    bs_split_int_1(N, FirstByteSz, Sz, Fail, Acc).

bs_split_int_1(-1, _, Sz, Fail, Acc) when Sz > 64 ->
    I = {bs_put_integer,Fail,{integer,Sz},1,{field_flags,[big]},{integer,-1}},
    [I|Acc];
bs_split_int_1(0, _, Sz, Fail, Acc) when Sz > 64 ->
    I = {bs_put_integer,Fail,{integer,Sz},1,{field_flags,[big]},{integer,0}},
    [I|Acc];
bs_split_int_1(N, ByteSz, Sz, Fail, Acc) when Sz > 0 ->
    Mask = (1 bsl ByteSz) - 1,
    I = {bs_put_integer,Fail,{integer,ByteSz},1,
	 {field_flags,[big]},{integer,N band Mask}},
    bs_split_int_1(N bsr ByteSz, 8, Sz-ByteSz, Fail, [I|Acc]);
bs_split_int_1(_, _, _, _, Acc) -> Acc.


%%%
%%% Optimization of new bit syntax matching: get rid
%%% of redundant bs_restore2/2 instructions across select_val
%%% instructions, as well as a few other simple peep-hole optimizations.
%%%

bsm_opt(Is0, Lc0) ->
    {Is1,D0,Lc} = bsm_scan(Is0, [], Lc0, []),
    Is2 = case D0 of
	      [] ->
		  Is1;
	     _ ->
		  D = gb_trees:from_orddict(orddict:from_list(D0)),
		  bsm_reroute(Is1, D, none, [])
	 end,
    Is = beam_clean:bs_clean_saves(Is2),
    {bsm_opt_2(Is, []),Lc}.

bsm_scan([{label,L}=Lbl,{bs_restore2,_,Save}=R|Is], D0, Lc, Acc0) ->
    D = [{{L,Save},Lc}|D0],
    Acc = [{label,Lc},R,Lbl|Acc0],
    bsm_scan(Is, D, Lc+1, Acc);
bsm_scan([I|Is], D, Lc, Acc) ->
    bsm_scan(Is, D, Lc, [I|Acc]);
bsm_scan([], D, Lc, Acc) ->
    {reverse(Acc),D,Lc}.

bsm_reroute([{bs_save2,Reg,Save}=I|Is], D, _, Acc) ->
    bsm_reroute(Is, D, {Reg,Save}, [I|Acc]);
bsm_reroute([{bs_restore2,Reg,Save}=I|Is], D, _, Acc) ->
    bsm_reroute(Is, D, {Reg,Save}, [I|Acc]);
bsm_reroute([{label,_}=I|Is], D, S, Acc) ->
    bsm_reroute(Is, D, S, [I|Acc]);
bsm_reroute([{select_val,Reg,F0,{list,Lbls0}}|Is], D, {_,Save}=S, Acc0) ->
    [F|Lbls] = bsm_subst_labels([F0|Lbls0], Save, D),
    Acc = [{select_val,Reg,F,{list,Lbls}}|Acc0],
    bsm_reroute(Is, D, S, Acc);
bsm_reroute([{test,TestOp,F0,TestArgs}=I|Is], D, {_,Save}=S, Acc0) ->
    F = bsm_subst_label(F0, Save, D),
    Acc = [{test,TestOp,F,TestArgs}|Acc0],
    case bsm_not_bs_test(I) of
	true ->
	    %% The test instruction will not update the bit offset for the
	    %% binary being matched. Therefore the save position can be kept.
	    bsm_reroute(Is, D, S, Acc);
	false ->
	    %% The test instruction might update the bit offset. Kill our
	    %% remembered Save position.
	    bsm_reroute(Is, D, none, Acc)
    end;
bsm_reroute([{test,TestOp,F0,Live,TestArgs,Dst}|Is], D, {_,Save}, Acc0) ->
    F = bsm_subst_label(F0, Save, D),
    Acc = [{test,TestOp,F,Live,TestArgs,Dst}|Acc0],
    %% The test instruction will update the bit offset. Kill our
    %% remembered Save position.
    bsm_reroute(Is, D, none, Acc);
bsm_reroute([{block,[{set,[],[],{alloc,_,_}}]}=Bl,
	     {bs_context_to_binary,_}=I|Is], D, S, Acc) ->
    %% To help further bit syntax optimizations.
    bsm_reroute([I,Bl|Is], D, S, Acc);
bsm_reroute([I|Is], D, _, Acc) ->
    bsm_reroute(Is, D, none, [I|Acc]);
bsm_reroute([], _, _, Acc) -> reverse(Acc).

bsm_opt_2([{test,bs_test_tail2,F,[Ctx,Bits]}|Is],
	  [{test,bs_skip_bits2,F,[Ctx,{integer,I},Unit,_Flags]}|Acc]) ->
    bsm_opt_2(Is, [{test,bs_test_tail2,F,[Ctx,Bits+I*Unit]}|Acc]);
bsm_opt_2([{test,bs_skip_bits2,F,[Ctx,{integer,I1},Unit1,_]}|Is],
	  [{test,bs_skip_bits2,F,[Ctx,{integer,I2},Unit2,Flags]}|Acc]) ->
    bsm_opt_2(Is, [{test,bs_skip_bits2,F,
		    [Ctx,{integer,I1*Unit1+I2*Unit2},1,Flags]}|Acc]);
bsm_opt_2([{test,bs_match_string,F,[Ctx,Bin1]},
	   {test,bs_match_string,F,[Ctx,Bin2]}|Is], Acc) ->
    I = {test,bs_match_string,F,[Ctx,<<Bin1/bitstring,Bin2/bitstring>>]},
    bsm_opt_2([I|Is], Acc);
bsm_opt_2([I|Is], Acc) ->
    bsm_opt_2(Is, [I|Acc]);
bsm_opt_2([], Acc) -> reverse(Acc).

%% bsm_not_bs_test({test,Name,_,Operands}) -> true|false.
%%  Test whether is the test is a "safe", i.e. does not move the
%%  bit offset for a binary.
%%
%%  'true' means that the test is safe, 'false' that we don't know or
%%  that the test moves the offset (e.g. bs_get_integer2).

bsm_not_bs_test({test,bs_test_tail2,_,[_,_]}) -> true;
bsm_not_bs_test(Test) -> beam_utils:is_pure_test(Test).

bsm_subst_labels(Fs, Save, D) ->
    bsm_subst_labels_1(Fs, Save, D, []).

bsm_subst_labels_1([F|Fs], Save, D, Acc) ->
    bsm_subst_labels_1(Fs, Save, D, [bsm_subst_label(F, Save, D)|Acc]);
bsm_subst_labels_1([], _, _, Acc) ->
    reverse(Acc).

bsm_subst_label({f,Lbl0}=F, Save, D) ->
    case gb_trees:lookup({Lbl0,Save}, D) of
	{value,Lbl} -> {f,Lbl};
	none -> F
    end;
bsm_subst_label(Other, _, _) -> Other.