%%% -*- erlang-indent-level: 2 -*-
%%%
%%% %CopyrightBegin%
%%% 
%%% Copyright Ericsson AB 2001-2012. All Rights Reserved.
%%% 
%%% The contents of this file are subject to the Erlang Public License,
%%% Version 1.1, (the "License"); you may not use this file except in
%%% compliance with the License. You should have received a copy of the
%%% Erlang Public License along with this software. If not, it can be
%%% retrieved online at http://www.erlang.org/.
%%% 
%%% Software distributed under the License is distributed on an "AS IS"
%%% basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
%%% the License for the specific language governing rights and limitations
%%% under the License.
%%% 
%%% %CopyrightEnd%
%%%
%%% HiPE/x86 assembler
%%%
%%% TODO:
%%% - Simplify combine_label_maps and mk_data_relocs.
%%% - Move find_const to hipe_pack_constants?

-ifdef(HIPE_AMD64).
-define(HIPE_X86_ASSEMBLE,  hipe_amd64_assemble).
-define(HIPE_X86_ENCODE,    hipe_amd64_encode).
-define(HIPE_X86_REGISTERS, hipe_amd64_registers).
-define(HIPE_X86_PP,        hipe_amd64_pp).
-ifdef(AMD64_SIMULATE_NSP).
-define(X86_SIMULATE_NSP, ?AMD64_SIMULATE_NSP).
-endif.
-define(EAX, rax).
-define(REGArch, reg64).
-define(RMArch, rm64).
-define(EA_DISP32_ABSOLUTE, ea_disp32_sindex).
-else.
-define(HIPE_X86_ASSEMBLE,  hipe_x86_assemble).
-define(HIPE_X86_ENCODE,    hipe_x86_encode).
-define(HIPE_X86_REGISTERS, hipe_x86_registers).
-define(HIPE_X86_PP,        hipe_x86_pp).
-define(EAX, eax).
-define(REGArch, reg32).
-define(RMArch, rm32).
-define(EA_DISP32_ABSOLUTE, ea_disp32).
-endif.

-module(?HIPE_X86_ASSEMBLE).
-export([assemble/4]).

-define(DEBUG,true).

-include("../main/hipe.hrl").
-include("../x86/hipe_x86.hrl").
-include("../../kernel/src/hipe_ext_format.hrl").
-include("../rtl/hipe_literals.hrl").
-include("../misc/hipe_sdi.hrl").
-undef(ASSERT).
-define(ASSERT(G), if G -> [] ; true -> exit({assertion_failed,?MODULE,?LINE,??G}) end).

assemble(CompiledCode, Closures, Exports, Options) ->
  ?when_option(time, Options, ?start_timer("x86 assembler")),
  print("****************** Assembling *******************\n", [], Options),
  %%
  Code = [{MFA,
	   hipe_x86:defun_code(Defun),
	   hipe_x86:defun_data(Defun)}
	  || {MFA, Defun} <- CompiledCode],
  %%
  {ConstAlign,ConstSize,ConstMap,RefsFromConsts} =
    hipe_pack_constants:pack_constants(Code, ?HIPE_X86_REGISTERS:alignment()),
  %%
  {CodeSize,CodeBinary,AccRefs,LabelMap,ExportMap} =
    encode(translate(Code, ConstMap, Options), Options),
  print("Total num bytes=~w\n", [CodeSize], Options),
  %% put(code_size, CodeSize),
  %% put(const_size, ConstSize),
  %% ?when_option(verbose, Options,
  %%	       ?debug_msg("Constants are ~w bytes\n",[ConstSize])),
  %%
  SC = hipe_pack_constants:slim_constmap(ConstMap),
  DataRelocs = mk_data_relocs(RefsFromConsts, LabelMap),
  SSE = slim_sorted_exportmap(ExportMap,Closures,Exports),
  SlimRefs = hipe_pack_constants:slim_refs(AccRefs),
  Bin = term_to_binary([{?VERSION_STRING(),?HIPE_SYSTEM_CRC},
			ConstAlign, ConstSize,
			SC,
			DataRelocs, % nee LM, LabelMap
			SSE,
			CodeSize,CodeBinary,SlimRefs,
			0,[] % ColdCodeSize, SlimColdRefs
		       ]),
  %%
  %% ?when_option(time, Options, ?stop_timer("x86 assembler")),
  Bin.

%%%
%%% Assembly Pass 1.
%%% Process initial {MFA,Code,Data} list.
%%% Translate each MFA's body, choosing operand & instruction kinds.
%%%
%%% Assembly Pass 2.
%%% Perform short/long form optimisation for jumps.
%%% Build LabelMap for each MFA.
%%%
%%% Result is {MFA,NewCode,CodeSize,LabelMap} list.
%%%

translate(Code, ConstMap, Options) ->
  translate_mfas(Code, ConstMap, [], Options).

translate_mfas([{MFA,Insns,_Data}|Code], ConstMap, NewCode, Options) ->
  {NewInsns,CodeSize,LabelMap} =
    translate_insns(Insns, {MFA,ConstMap}, hipe_sdi:pass1_init(), 0, [], Options),
  translate_mfas(Code, ConstMap, [{MFA,NewInsns,CodeSize,LabelMap}|NewCode], Options);
translate_mfas([], _ConstMap, NewCode, _Options) ->
  lists:reverse(NewCode).

translate_insns([I|Insns], Context, SdiPass1, Address, NewInsns, Options) ->
  NewIs = translate_insn(I, Context, Options),
  add_insns(NewIs, Insns, Context, SdiPass1, Address, NewInsns, Options);
translate_insns([], _Context, SdiPass1, Address, NewInsns, _Options) ->
  {LabelMap,CodeSizeIncr} = hipe_sdi:pass2(SdiPass1),
  {lists:reverse(NewInsns), Address+CodeSizeIncr, LabelMap}.

add_insns([I|Is], Insns, Context, SdiPass1, Address, NewInsns, Options) ->
  NewSdiPass1 =
    case I of
      {'.label',L,_} ->
	hipe_sdi:pass1_add_label(SdiPass1, Address, L);
      {jcc_sdi,{_,{label,L}},_} ->
	SdiInfo = #sdi_info{incr=(6-2),lb=(-128)+2,ub=127+2},
	hipe_sdi:pass1_add_sdi(SdiPass1, Address, L, SdiInfo);
      {jmp_sdi,{{label,L}},_} ->
	SdiInfo = #sdi_info{incr=(5-2),lb=(-128)+2,ub=127+2},
	hipe_sdi:pass1_add_sdi(SdiPass1, Address, L, SdiInfo);
      _ ->
	SdiPass1
    end,
  Address1 = Address + insn_size(I),
  add_insns(Is, Insns, Context, NewSdiPass1, Address1, [I|NewInsns], Options);
add_insns([], Insns, Context, SdiPass1, Address, NewInsns, Options) ->
  translate_insns(Insns, Context, SdiPass1, Address, NewInsns, Options).

insn_size(I) ->
  case I of
    {'.label',_,_} -> 0;
    {'.sdesc',_,_} -> 0;
    {jcc_sdi,_,_} -> 2;
    {jmp_sdi,_,_} -> 2;
    {Op,Arg,_Orig} -> ?HIPE_X86_ENCODE:insn_sizeof(Op, Arg)
  end.

translate_insn(I, Context, Options) ->
  case I of
    #alu{} ->
      Arg = resolve_alu_args(hipe_x86:alu_src(I), hipe_x86:alu_dst(I), Context),
      [{hipe_x86:alu_op(I), Arg, I}];
    #call{} ->
      translate_call(I);
    #cmovcc{} ->
      {Dst,Src} = resolve_move_args(
		    hipe_x86:cmovcc_src(I), hipe_x86:cmovcc_dst(I),
		    Context),
      CC = {cc,?HIPE_X86_ENCODE:cc(hipe_x86:cmovcc_cc(I))},
      Arg = {CC,Dst,Src},
      [{cmovcc, Arg, I}];
    #cmp{} ->
      Arg = resolve_alu_args(hipe_x86:cmp_src(I), hipe_x86:cmp_dst(I), Context),
      [{cmp, Arg, I}];
    #comment{} ->
      [];
    #fmove{} ->
      {Op,Arg} = resolve_sse2_fmove_args(hipe_x86:fmove_src(I),
					 hipe_x86:fmove_dst(I)),
      [{Op, Arg, I}];
    #fp_binop{} ->
      case proplists:get_bool(x87, Options) of
	true ->  % x87
	  Arg = resolve_x87_binop_args(hipe_x86:fp_binop_src(I),
				       hipe_x86:fp_binop_dst(I)),
	  [{hipe_x86:fp_binop_op(I), Arg, I}];
	false -> % sse2
	  Arg = resolve_sse2_binop_args(hipe_x86:fp_binop_src(I),
					hipe_x86:fp_binop_dst(I)),
	  [{resolve_sse2_op(hipe_x86:fp_binop_op(I)), Arg, I}]
      end;
    #fp_unop{} ->
      case proplists:get_bool(x87, Options) of
	true ->  % x87
	  Arg = resolve_x87_unop_arg(hipe_x86:fp_unop_arg(I)),
	  [{hipe_x86:fp_unop_op(I), Arg, I}];
	false -> % sse2
	  case hipe_x86:fp_unop_op(I) of
	    'fchs' ->
	      Arg = resolve_sse2_fchs_arg(hipe_x86:fp_unop_arg(I)),
	      [{'xorpd', Arg, I}];
	    'fwait' -> % no op on sse2, magic on x87
	      []
	  end
      end;
    #imul{} ->
      translate_imul(I, Context);
    #jcc{} ->
      Cc = {cc,?HIPE_X86_ENCODE:cc(hipe_x86:jcc_cc(I))},
      Label = translate_label(hipe_x86:jcc_label(I)),
      [{jcc_sdi, {Cc,Label}, I}];
    #jmp_fun{} ->
      %% call and jmp are patched the same, so no need to distinguish
      %% call from tailcall
      PatchTypeExt =
	case hipe_x86:jmp_fun_linkage(I) of
	  remote -> ?CALL_REMOTE;
	  not_remote -> ?CALL_LOCAL
	end,
      Arg = translate_fun(hipe_x86:jmp_fun_fun(I), PatchTypeExt),
      [{jmp, {Arg}, I}];
    #jmp_label{} ->
      Arg = translate_label(hipe_x86:jmp_label_label(I)),
      [{jmp_sdi, {Arg}, I}];
    #jmp_switch{} ->
      RM32 = resolve_jmp_switch_arg(I, Context),
      [{jmp, {RM32}, I}];
    #label{} ->
      [{'.label', hipe_x86:label_label(I), I}];
    #lea{} ->
      Arg = resolve_lea_args(hipe_x86:lea_mem(I), hipe_x86:lea_temp(I)),
      [{lea, Arg, I}];
    #move{} ->
      Arg = resolve_move_args(hipe_x86:move_src(I), hipe_x86:move_dst(I),
			      Context),
      [{mov, Arg, I}];
    #move64{} ->
      translate_move64(I, Context);
    #movsx{} ->
      Arg = resolve_movx_args(hipe_x86:movsx_src(I), hipe_x86:movsx_dst(I)),
      [{movsx, Arg, I}];
    #movzx{} ->
      Arg = resolve_movx_args(hipe_x86:movzx_src(I), hipe_x86:movzx_dst(I)),
      [{movzx, Arg, I}];
    %% pseudo_call: eliminated before assembly
    %% pseudo_jcc: eliminated before assembly
    %% pseudo_tailcall: eliminated before assembly
    %% pseudo_tailcall_prepare: eliminated before assembly
    #pop{} ->
      Arg = translate_dst(hipe_x86:pop_dst(I)),
      [{pop, {Arg}, I}];
    #push{} ->
      Arg = translate_src(hipe_x86:push_src(I), Context),
      [{push, {Arg}, I}];
    #ret{} ->
      translate_ret(I);
    #shift{} ->
      Arg = resolve_shift_args(hipe_x86:shift_src(I), hipe_x86:shift_dst(I), Context),
      [{hipe_x86:shift_op(I), Arg, I}];
    #test{} ->
      Arg = resolve_test_args(hipe_x86:test_src(I), hipe_x86:test_dst(I), Context),
      [{test, Arg, I}]
  end.

-ifdef(X86_SIMULATE_NSP).
-ifdef(HIPE_AMD64).
translate_call(I) ->
  WordSize = hipe_amd64_registers:wordsize(),
  RegSP = 2#100, % esp/rsp
  TempSP = hipe_x86:mk_temp(RegSP, untagged),
  FunOrig = hipe_x86:call_fun(I),
  Fun =
    case FunOrig of
      #x86_mem{base=#x86_temp{reg=4}, off=#x86_imm{value=Off}} ->
	FunOrig#x86_mem{off=#x86_imm{value=Off+WordSize}};
      _ -> FunOrig
    end,
  RegRA =
    begin
      RegTemp0 = hipe_amd64_registers:temp0(),
      RegTemp1 = hipe_amd64_registers:temp1(),
      case Fun of
	#x86_temp{reg=RegTemp0} -> RegTemp1;
	#x86_mem{base=#x86_temp{reg=RegTemp0}} -> RegTemp1;
	_ -> RegTemp0
      end
    end,
  TempRA = hipe_x86:mk_temp(RegRA, untagged),
  PatchTypeExt =
    case hipe_x86:call_linkage(I) of
      remote -> ?CALL_REMOTE;
      not_remote -> ?CALL_LOCAL
    end,
  JmpArg = translate_fun(Fun, PatchTypeExt),
  I4 = {'.sdesc', hipe_x86:call_sdesc(I), #comment{term=sdesc}},
  I3 = {jmp, {JmpArg}, #comment{term=call}},
  Size3 = hipe_amd64_encode:insn_sizeof(jmp, {JmpArg}),
  MovArgs = {mem_to_rmArch(hipe_x86:mk_mem(TempSP,
					     hipe_x86:mk_imm(0),
					     untagged)),
	     temp_to_regArch(TempRA)},
  I2 = {mov, MovArgs, #comment{term=call}},
  Size2 = hipe_amd64_encode:insn_sizeof(mov, MovArgs),
  I1 = {lea, {temp_to_regArch(TempRA),
	      {ea, hipe_amd64_encode:ea_disp32_rip(Size2+Size3)}},
	#comment{term=call}},
  I0 = {sub, {temp_to_rmArch(TempSP), {imm8,WordSize}}, I},
  [I0,I1,I2,I3,I4].
-else.
translate_call(I) ->
  WordSize = ?HIPE_X86_REGISTERS:wordsize(),
  RegSP = 2#100, % esp/rsp
  TempSP = hipe_x86:mk_temp(RegSP, untagged),
  FunOrig = hipe_x86:call_fun(I),
  Fun =
    case FunOrig of
      #x86_mem{base=#x86_temp{reg=4}, off=#x86_imm{value=Off}} ->
	FunOrig#x86_mem{off=#x86_imm{value=Off+WordSize}};
      _ -> FunOrig
    end,
  PatchTypeExt =
    case hipe_x86:call_linkage(I) of
      remote -> ?CALL_REMOTE;
      not_remote -> ?CALL_LOCAL
    end,
  JmpArg = translate_fun(Fun, PatchTypeExt),
  I3 = {'.sdesc', hipe_x86:call_sdesc(I), #comment{term=sdesc}},
  I2 = {jmp, {JmpArg}, #comment{term=call}},
  Size2 = ?HIPE_X86_ENCODE:insn_sizeof(jmp, {JmpArg}),
  I1 = {mov, {mem_to_rmArch(hipe_x86:mk_mem(TempSP,
					  hipe_x86:mk_imm(0),
					  untagged)),
	      {imm32,{?X86ABSPCREL,4+Size2}}},
	#comment{term=call}},
  I0 = {sub, {temp_to_rmArch(TempSP), {imm8,WordSize}}, I},
  [I0,I1,I2,I3].
-endif.

translate_ret(I) ->
  NPOP = hipe_x86:ret_npop(I) + ?HIPE_X86_REGISTERS:wordsize(),
  RegSP = 2#100, % esp/rsp
  TempSP = hipe_x86:mk_temp(RegSP, untagged),
  RegRA = 2#011, % ebx/rbx
  TempRA = hipe_x86:mk_temp(RegRA, untagged),
  [{mov,
    {temp_to_regArch(TempRA),
     mem_to_rmArch(hipe_x86:mk_mem(TempSP,
				   hipe_x86:mk_imm(0),
				   untagged))},
    I},
   {add,
    {temp_to_rmArch(TempSP),
     case NPOP < 128 of
       true -> {imm8,NPOP};
       false -> {imm32,NPOP}
     end},
    #comment{term=ret}},
   {jmp,
    {temp_to_rmArch(TempRA)},
    #comment{term=ret}}].

-else. % not X86_SIMULATE_NSP

translate_call(I) ->
  %% call and jmp are patched the same, so no need to distinguish
  %% call from tailcall
  PatchTypeExt =
    case hipe_x86:call_linkage(I) of
      remote -> ?CALL_REMOTE;
      not_remote -> ?CALL_LOCAL
    end,
  Arg = translate_fun(hipe_x86:call_fun(I), PatchTypeExt),
  SDesc = hipe_x86:call_sdesc(I),
  [{call, {Arg}, I}, {'.sdesc', SDesc, #comment{term=sdesc}}].

translate_ret(I) ->
  Arg =
    case hipe_x86:ret_npop(I) of
      0 -> {};
      N -> {{imm16,N}}
    end,
  [{ret, Arg, I}].

-endif. % X86_SIMULATE_NSP

translate_imul(I, Context) ->
  Temp = temp_to_regArch(hipe_x86:imul_temp(I)),
  Src = temp_or_mem_to_rmArch(hipe_x86:imul_src(I)),
  Args =
    case hipe_x86:imul_imm_opt(I) of
      [] -> {Temp,Src};
      Imm -> {Temp,Src,translate_imm(Imm, Context, true)}
    end,
  [{'imul', Args, I}].

temp_or_mem_to_rmArch(Src) ->
  case Src of
    #x86_temp{} -> temp_to_rmArch(Src);
    #x86_mem{} -> mem_to_rmArch(Src)
  end.

translate_label(Label) when is_integer(Label) ->
  {label,Label}.	% symbolic, since offset is not yet computable

translate_fun(Arg, PatchTypeExt) ->
  case Arg of
    #x86_temp{} ->
      temp_to_rmArch(Arg);
    #x86_mem{} ->
      mem_to_rmArch(Arg);
    #x86_mfa{m=M,f=F,a=A} ->
      {rel32,{PatchTypeExt,{M,F,A}}};
    #x86_prim{prim=Prim} ->
      {rel32,{PatchTypeExt,Prim}}
  end.

translate_src(Src, Context) ->
  case Src of
    #x86_imm{} ->
      translate_imm(Src, Context, true);
    _ ->
      translate_dst(Src)
  end.

%%% MayTrunc8 controls whether negative Imm8s should be truncated
%%% to 8 bits or not. Truncation should always be done, except when
%%% the caller will widen the Imm8 to an Imm32 or Imm64.
translate_imm(#x86_imm{value=Imm}, Context, MayTrunc8) ->
  if is_atom(Imm) ->
      {imm32,{?LOAD_ATOM,Imm}};
     is_integer(Imm) ->
      case (Imm =< 127) and (Imm >= -128) of
	true ->
	  Imm8 =
	    case MayTrunc8 of
	      true -> Imm band 16#FF;
	      false -> Imm
	    end,
	  {imm8,Imm8};
	false ->
	  {imm32,Imm}
      end;
     true ->
      Val =
	case Imm of
	  {Label,constant} ->
	    {MFA,ConstMap} = Context,
	    ConstNo = find_const({MFA,Label}, ConstMap),
	    {constant,ConstNo};
	  {Label,closure} ->
	    {closure,Label};
	  {Label,c_const} ->
	    {c_const,Label}
	end,
      {imm32,{?LOAD_ADDRESS,Val}}
  end.

translate_dst(Dst) ->
  case Dst of
    #x86_temp{} ->
      temp_to_regArch(Dst);
    #x86_mem{type='double'} ->
      mem_to_rm64fp(Dst);
    #x86_mem{} ->
      mem_to_rmArch(Dst);
    #x86_fpreg{} ->
      fpreg_to_stack(Dst)
  end.

%%%
%%% Assembly Pass 3.
%%% Process final {MFA,Code,CodeSize,LabelMap} list from pass 2.
%%% Translate to a single binary code segment.
%%% Collect relocation patches.
%%% Build ExportMap (MFA-to-address mapping).
%%% Combine LabelMaps to a single one (for mk_data_relocs/2 compatibility).
%%% Return {CombinedCodeSize,BinaryCode,Relocs,CombinedLabelMap,ExportMap}.
%%%

encode(Code, Options) ->
  CodeSize = compute_code_size(Code, 0),
  ExportMap = build_export_map(Code, 0, []),
  {AccCode,Relocs} = encode_mfas(Code, 0, [], [], Options),
  CodeBinary = list_to_binary(lists:reverse(AccCode)),
  ?ASSERT(CodeSize =:= byte_size(CodeBinary)),
  CombinedLabelMap = combine_label_maps(Code, 0, gb_trees:empty()),
  {CodeSize,CodeBinary,Relocs,CombinedLabelMap,ExportMap}.

nr_pad_bytes(Address) -> (4 - (Address rem 4)) rem 4. % XXX: 16 or 32 instead?

align_entry(Address) -> Address + nr_pad_bytes(Address).

compute_code_size([{_MFA,_Insns,CodeSize,_LabelMap}|Code], Size) ->
  compute_code_size(Code, align_entry(Size+CodeSize));
compute_code_size([], Size) -> Size.

build_export_map([{{M,F,A},_Insns,CodeSize,_LabelMap}|Code], Address, ExportMap) ->
  build_export_map(Code, align_entry(Address+CodeSize), [{Address,M,F,A}|ExportMap]);
build_export_map([], _Address, ExportMap) -> ExportMap.

combine_label_maps([{MFA,_Insns,CodeSize,LabelMap}|Code], Address, CLM) ->
  NewCLM = merge_label_map(gb_trees:to_list(LabelMap), MFA, Address, CLM),
  combine_label_maps(Code, align_entry(Address+CodeSize), NewCLM);
combine_label_maps([], _Address, CLM) -> CLM.

merge_label_map([{Label,Offset}|Rest], MFA, Address, CLM) ->
  NewCLM = gb_trees:insert({MFA,Label}, Address+Offset, CLM),
  merge_label_map(Rest, MFA, Address, NewCLM);
merge_label_map([], _MFA, _Address, CLM) -> CLM.

encode_mfas([{MFA,Insns,CodeSize,LabelMap}|Code], Address, AccCode, Relocs, Options) ->
  print("Generating code for:~w\n", [MFA], Options),
  print("Offset   | Opcode                   | Instruction\n", [], Options),
  {Address1,Relocs1,AccCode1} =
    encode_insns(Insns, Address, Address, LabelMap, Relocs, AccCode, Options),
  ExpectedAddress = align_entry(Address + CodeSize),
  ?ASSERT(Address1 =:= ExpectedAddress),
  print("Finished.\n\n", [], Options),
  encode_mfas(Code, Address1, AccCode1, Relocs1, Options);
encode_mfas([], _Address, AccCode, Relocs, _Options) ->
  {AccCode, Relocs}.

encode_insns([I|Insns], Address, FunAddress, LabelMap, Relocs, AccCode, Options) ->
  case I of
    {'.label',L,_} ->
      LabelAddress = gb_trees:get(L, LabelMap) + FunAddress,
      ?ASSERT(Address =:= LabelAddress),	% sanity check
      print_insn(Address, [], I, Options),
      encode_insns(Insns, Address, FunAddress, LabelMap, Relocs, AccCode, Options);
    {'.sdesc',SDesc,_} ->
      #x86_sdesc{exnlab=ExnLab,fsize=FSize,arity=Arity,live=Live} = SDesc,
      ExnRA =
	case ExnLab of
	  [] -> [];	% don't cons up a new one
	  ExnLab -> gb_trees:get(ExnLab, LabelMap) + FunAddress
	end,
      Reloc = {?SDESC, Address,
	       ?STACK_DESC(ExnRA, FSize, Arity, Live)},
      encode_insns(Insns, Address, FunAddress, LabelMap, [Reloc|Relocs], AccCode, Options);
    _ ->
      {Op,Arg,_} = fix_jumps(I, Address, FunAddress, LabelMap),
      {Bytes, NewRelocs} = ?HIPE_X86_ENCODE:insn_encode(Op, Arg, Address),
      print_insn(Address, Bytes, I, Options),
      Segment = list_to_binary(Bytes),
      Size = byte_size(Segment),
      NewAccCode = [Segment|AccCode],
      encode_insns(Insns, Address+Size, FunAddress, LabelMap, NewRelocs++Relocs, NewAccCode, Options)
  end;
encode_insns([], Address, FunAddress, LabelMap, Relocs, AccCode, Options) ->
  case nr_pad_bytes(Address) of
    0 ->
      {Address,Relocs,AccCode};
    NrPadBytes ->	% triggers at most once per function body
      Padding = lists:duplicate(NrPadBytes, {nop,{},#comment{term=padding}}),
      encode_insns(Padding, Address, FunAddress, LabelMap, Relocs, AccCode, Options)
  end.

fix_jumps(I, InsnAddress, FunAddress, LabelMap) ->
  case I of
    {jcc_sdi,{CC,{label,L}},OrigI} ->
      LabelAddress = gb_trees:get(L, LabelMap) + FunAddress,
      ShortOffset = LabelAddress - (InsnAddress + 2),
      if is_integer(ShortOffset), ShortOffset >= -128, ShortOffset =< 127 ->
	  {jcc,{CC,{rel8,ShortOffset band 16#FF}},OrigI};
	 true ->
	  LongOffset = LabelAddress - (InsnAddress + 6),
	  {jcc,{CC,{rel32,LongOffset}},OrigI}
      end;
    {jmp_sdi,{{label,L}},OrigI} ->
      LabelAddress = gb_trees:get(L, LabelMap) + FunAddress,
      ShortOffset = LabelAddress - (InsnAddress + 2),
      if is_integer(ShortOffset), ShortOffset >= -128, ShortOffset =< 127 ->
	  {jmp,{{rel8,ShortOffset band 16#FF}},OrigI};
	 true ->
	  LongOffset = LabelAddress - (InsnAddress + 5),
	  {jmp,{{rel32,LongOffset}},OrigI}
      end;
    _ -> I
  end.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

fpreg_to_stack(#x86_fpreg{reg=Reg}) ->
  {fpst, Reg}.

temp_to_regArch(#x86_temp{reg=Reg}) ->
  {?REGArch, Reg}.

-ifdef(HIPE_AMD64).
temp_to_reg64(#x86_temp{reg=Reg}) ->
  {reg64, Reg}.
-endif.

temp_to_reg32(#x86_temp{reg=Reg}) ->
  {reg32, Reg}.
temp_to_reg16(#x86_temp{reg=Reg}) ->
  {reg16, Reg}.
temp_to_reg8(#x86_temp{reg=Reg}) ->
  {reg8, Reg}.

temp_to_xmm(#x86_temp{reg=Reg}) ->
  {xmm, Reg}. 

-ifdef(HIPE_AMD64).
temp_to_rm64(#x86_temp{reg=Reg}) ->
  {rm64, hipe_amd64_encode:rm_reg(Reg)}.
-endif.

temp_to_rmArch(#x86_temp{reg=Reg}) ->
  {?RMArch, ?HIPE_X86_ENCODE:rm_reg(Reg)}.
temp_to_rm64fp(#x86_temp{reg=Reg}) ->
  {rm64fp, ?HIPE_X86_ENCODE:rm_reg(Reg)}.

mem_to_ea(Mem) ->
  EA = mem_to_ea_common(Mem),
  {ea, EA}.

mem_to_rm32(Mem) ->
  EA = mem_to_ea_common(Mem),
  {rm32, ?HIPE_X86_ENCODE:rm_mem(EA)}.

mem_to_rmArch(Mem) ->
  EA = mem_to_ea_common(Mem),
  {?RMArch, ?HIPE_X86_ENCODE:rm_mem(EA)}.

mem_to_rm64fp(Mem) ->
  EA = mem_to_ea_common(Mem),
  {rm64fp, ?HIPE_X86_ENCODE:rm_mem(EA)}.

%%%%%%%%%%%%%%%%%
mem_to_rm8(Mem) ->
  EA = mem_to_ea_common(Mem),
  {rm8, ?HIPE_X86_ENCODE:rm_mem(EA)}.

mem_to_rm16(Mem) ->
  EA = mem_to_ea_common(Mem),
  {rm16, ?HIPE_X86_ENCODE:rm_mem(EA)}.
%%%%%%%%%%%%%%%%%

mem_to_ea_common(#x86_mem{base=[], off=#x86_imm{value=Off}}) ->
  ?HIPE_X86_ENCODE:?EA_DISP32_ABSOLUTE(Off);
mem_to_ea_common(#x86_mem{base=#x86_temp{reg=Base}, off=#x86_temp{reg=Index}}) ->
  case Base band 2#111 of
    5 -> % ebp/rbp or r13
      case Index band 2#111 of
	5 -> % ebp/rbp or r13
	  SINDEX = ?HIPE_X86_ENCODE:sindex(0, Index),
	  SIB = ?HIPE_X86_ENCODE:sib(Base, SINDEX),
	  ?HIPE_X86_ENCODE:ea_disp8_sib(0, SIB);
	_ ->
	  SINDEX = ?HIPE_X86_ENCODE:sindex(0, Base),
	  SIB = ?HIPE_X86_ENCODE:sib(Index, SINDEX),
	  ?HIPE_X86_ENCODE:ea_sib(SIB)
      end;
    _ ->
      SINDEX = ?HIPE_X86_ENCODE:sindex(0, Index),
      SIB = ?HIPE_X86_ENCODE:sib(Base, SINDEX),
      ?HIPE_X86_ENCODE:ea_sib(SIB)
  end;
mem_to_ea_common(#x86_mem{base=#x86_temp{reg=Base}, off=#x86_imm{value=Off}}) ->
  if
    Off =:= 0 ->
      case Base of
	4 -> %esp, use SIB w/o disp8
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_sib(SIB);
	5 -> %ebp, use disp8 w/o SIB
	  ?HIPE_X86_ENCODE:ea_disp8_base(Off, Base);
        12 -> %r12, use SIB w/o disp8
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_sib(SIB);
        13 -> %r13, use disp8 w/o SIB
 	  ?HIPE_X86_ENCODE:ea_disp8_base(Off, Base);         
	_ -> %neither SIB nor disp8 needed
	  ?HIPE_X86_ENCODE:ea_base(Base)
      end;
    Off >= -128, Off =< 127 ->
      Disp8 = Off band 16#FF,
      case Base of
	4 -> %esp, must use SIB
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_disp8_sib(Disp8, SIB);
        12 -> %r12, must use SIB
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_disp8_sib(Disp8, SIB);
	_ -> %use disp8 w/o SIB
	  ?HIPE_X86_ENCODE:ea_disp8_base(Disp8, Base)
      end;
    true ->
      case Base of
	4 -> %esp, must use SIB
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_disp32_sib(Off, SIB);
	12 -> %r12, must use SIB
	  SIB = ?HIPE_X86_ENCODE:sib(Base),
	  ?HIPE_X86_ENCODE:ea_disp32_sib(Off, SIB);
	_ ->
	  ?HIPE_X86_ENCODE:ea_disp32_base(Off, Base)
      end
  end.

%% jmp_switch
-ifdef(HIPE_AMD64).
resolve_jmp_switch_arg(I, _Context) ->
  Base = hipe_x86:temp_reg(hipe_x86:jmp_switch_jtab(I)),
  Index = hipe_x86:temp_reg(hipe_x86:jmp_switch_temp(I)),
  SINDEX = hipe_amd64_encode:sindex(3, Index),
  SIB = hipe_amd64_encode:sib(Base, SINDEX),
  EA =
    if (Base =:= 5) or (Base =:= 13) ->
	hipe_amd64_encode:ea_disp8_sib(0, SIB);
       true ->
	hipe_amd64_encode:ea_sib(SIB)
    end,
  {rm64,hipe_amd64_encode:rm_mem(EA)}.
-else.
resolve_jmp_switch_arg(I, {MFA,ConstMap}) ->
  ConstNo = find_const({MFA,hipe_x86:jmp_switch_jtab(I)}, ConstMap),
  Disp32 = {?LOAD_ADDRESS,{constant,ConstNo}},
  SINDEX = ?HIPE_X86_ENCODE:sindex(2, hipe_x86:temp_reg(hipe_x86:jmp_switch_temp(I))),
  EA = ?HIPE_X86_ENCODE:ea_disp32_sindex(Disp32, SINDEX), % this creates a SIB implicitly
  {rm32,?HIPE_X86_ENCODE:rm_mem(EA)}.
-endif.

%% lea reg, mem
resolve_lea_args(Src=#x86_mem{}, Dst=#x86_temp{}) ->
  {temp_to_regArch(Dst),mem_to_ea(Src)}.

resolve_sse2_op(Op) ->
  case Op of
    fadd -> addsd;
    fdiv -> divsd;
    fmul -> mulsd;
    fsub -> subsd;
    _ -> exit({?MODULE, unknown_sse2_operator, Op})
  end.

%% OP xmm, mem
resolve_sse2_binop_args(Src=#x86_mem{type=double},
			Dst=#x86_temp{type=double}) ->
  {temp_to_xmm(Dst),mem_to_rm64fp(Src)};
%% movsd mem, xmm
resolve_sse2_binop_args(Src=#x86_temp{type=double},
			Dst=#x86_mem{type=double}) ->
  {mem_to_rm64fp(Dst),temp_to_xmm(Src)};
%% OP xmm, xmm
resolve_sse2_binop_args(Src=#x86_temp{type=double},
			Dst=#x86_temp{type=double}) ->
  {temp_to_xmm(Dst),temp_to_rm64fp(Src)}.

%%% fmove -> cvtsi2sd or movsd
resolve_sse2_fmove_args(Src, Dst) ->
  case {Src,Dst} of
    {#x86_temp{type=untagged}, #x86_temp{type=double}} -> % cvtsi2sd xmm, reg
      {cvtsi2sd, {temp_to_xmm(Dst),temp_to_rmArch(Src)}};
    {#x86_mem{type=untagged}, #x86_temp{type=double}} -> % cvtsi2sd xmm, mem
      {cvtsi2sd, {temp_to_xmm(Dst),mem_to_rmArch(Src)}};
    _ -> % movsd
      {movsd, resolve_sse2_binop_args(Src, Dst)}
  end.

%%% xorpd xmm, mem
resolve_sse2_fchs_arg(Dst=#x86_temp{type=double}) ->
  {temp_to_xmm(Dst),
   {rm64fp, {rm_mem, ?HIPE_X86_ENCODE:?EA_DISP32_ABSOLUTE(
		       {?LOAD_ADDRESS,
			{c_const, sse2_fnegate_mask}})}}}.

%% mov mem, imm
resolve_move_args(#x86_imm{value=ImmSrc}, Dst=#x86_mem{type=Type}, Context) ->
  case Type of   % to support byte, int16 and int32 stores
    byte ->
      ByteImm = ImmSrc band 255, %to ensure that it is a bytesized imm
      {mem_to_rm8(Dst),{imm8,ByteImm}};
    int16 ->
      {mem_to_rm16(Dst),{imm16,ImmSrc band 16#FFFF}};
    int32 ->
      {_,Imm} = translate_imm(#x86_imm{value=ImmSrc}, Context, false),
      {mem_to_rm32(Dst),{imm32,Imm}};
    _ ->
      RMArch = mem_to_rmArch(Dst),
      {_,Imm} = translate_imm(#x86_imm{value=ImmSrc}, Context, false),
      {RMArch,{imm32,Imm}}
  end;

%% mov reg,mem
resolve_move_args(Src=#x86_mem{type=Type}, Dst=#x86_temp{}, _Context) ->
  case Type of
    int32 -> % must be unsigned
      {temp_to_reg32(Dst),mem_to_rm32(Src)};
    _ ->
      {temp_to_regArch(Dst),mem_to_rmArch(Src)}
  end;

%% mov mem,reg
resolve_move_args(Src=#x86_temp{}, Dst=#x86_mem{type=Type}, _Context) ->
  case Type of   % to support byte, int16 and int32 stores
    byte ->
      {mem_to_rm8(Dst),temp_to_reg8(Src)};
    int16 ->
      {mem_to_rm16(Dst),temp_to_reg16(Src)};
    int32 ->
      {mem_to_rm32(Dst),temp_to_reg32(Src)};
    tagged -> % tagged, untagged
      {mem_to_rmArch(Dst),temp_to_regArch(Src)};
    untagged -> % tagged, untagged
      {mem_to_rmArch(Dst),temp_to_regArch(Src)}
  end;

%% mov reg,reg
resolve_move_args(Src=#x86_temp{}, Dst=#x86_temp{}, _Context) ->
  {temp_to_regArch(Dst),temp_to_rmArch(Src)};

%% mov reg,imm
resolve_move_args(Src=#x86_imm{value=_ImmSrc}, Dst=#x86_temp{}, Context) ->
  {_,Imm} = translate_imm(Src, Context, false),
  imm_move_args(Dst, Imm).

-ifdef(HIPE_AMD64).
imm_move_args(Dst, Imm) ->
  if is_number(Imm), Imm >= 0 ->
      {temp_to_reg32(Dst),{imm32,Imm}};
     true ->
      {temp_to_rm64(Dst),{imm32,Imm}}
  end.
-else.
imm_move_args(Dst, Imm) ->
  {temp_to_reg32(Dst),{imm32,Imm}}.
-endif.

-ifdef(HIPE_AMD64).
translate_move64(I, Context) ->
  Arg = resolve_move64_args(hipe_x86:move64_src(I),
			    hipe_x86:move64_dst(I),
			    Context),
  [{mov, Arg, I}].

%% mov reg,imm64
resolve_move64_args(Src=#x86_imm{}, Dst=#x86_temp{}, Context) ->
  {_,Imm} = translate_imm(Src, Context, false),
  {temp_to_reg64(Dst),{imm64,Imm}}.
-else.
translate_move64(I, _Context) -> exit({?MODULE, I}).
-endif.

%%% mov{s,z}x
resolve_movx_args(Src=#x86_mem{type=Type}, Dst=#x86_temp{}) ->
  {temp_to_regArch(Dst),
   case Type of
     byte ->
       mem_to_rm8(Src);
     int16 ->
       mem_to_rm16(Src);
     int32 ->
       mem_to_rm32(Src)
   end}.

%%% alu/cmp (_not_ test)
resolve_alu_args(Src, Dst, Context) ->
  case {Src,Dst} of
    {#x86_imm{}, #x86_mem{}} ->
      {mem_to_rmArch(Dst), translate_imm(Src, Context, true)};
    {#x86_mem{}, #x86_temp{}} ->
      {temp_to_regArch(Dst), mem_to_rmArch(Src)};
    {#x86_temp{}, #x86_mem{}} ->
      {mem_to_rmArch(Dst), temp_to_regArch(Src)};
    {#x86_temp{}, #x86_temp{}} ->
      {temp_to_regArch(Dst), temp_to_rmArch(Src)};
    {#x86_imm{}, #x86_temp{reg=0}} -> % eax,imm
      NewSrc = translate_imm(Src, Context, true),
      NewDst =
	case NewSrc of
	  {imm8,_} -> temp_to_rmArch(Dst);
	  {imm32,_} -> ?EAX
	end,
      {NewDst, NewSrc};
    {#x86_imm{}, #x86_temp{}} ->
      {temp_to_rmArch(Dst), translate_imm(Src, Context, true)}
  end.

%%% test
resolve_test_args(Src, Dst, Context) ->
  case Src of
    #x86_imm{} -> % imm8 not allowed
      {_ImmSize,ImmValue} = translate_imm(Src, Context, false),
      NewDst =
	case Dst of
	  #x86_temp{reg=0} -> ?EAX;
	  #x86_temp{} -> temp_to_rmArch(Dst);
	  #x86_mem{} -> mem_to_rmArch(Dst)
	end,
      {NewDst, {imm32,ImmValue}};
    #x86_temp{} ->
      NewDst =
	case Dst of
	  #x86_temp{} -> temp_to_rmArch(Dst);
	  #x86_mem{} -> mem_to_rmArch(Dst)
	end,
      {NewDst, temp_to_regArch(Src)}
  end.

%%% shifts
resolve_shift_args(Src, Dst, Context) ->
  RM32 =
    case Dst of
      #x86_temp{} -> temp_to_rmArch(Dst);
      #x86_mem{} -> mem_to_rmArch(Dst)
    end,
  Count =
    case Src of
      #x86_imm{value=1} -> 1;
      #x86_imm{} -> translate_imm(Src, Context, true); % must be imm8
      #x86_temp{reg=1} -> cl	% temp must be ecx
    end,
  {RM32, Count}.

%% x87_binop mem
resolve_x87_unop_arg(Arg=#x86_mem{type=Type})->
  case Type of
    'double' -> {mem_to_rm64fp(Arg)};
    'untagged' -> {mem_to_rmArch(Arg)};
    _ -> ?EXIT({fmovArgNotSupported,{Arg}})
  end;
resolve_x87_unop_arg(Arg=#x86_fpreg{}) ->
  {fpreg_to_stack(Arg)};
resolve_x87_unop_arg([]) ->
  [].

%% x87_binop mem, st(i)
resolve_x87_binop_args(Src=#x86_fpreg{}, Dst=#x86_mem{})->
  {mem_to_rm64fp(Dst),fpreg_to_stack(Src)};
%% x87_binop st(0), st(i)
resolve_x87_binop_args(Src=#x86_fpreg{}, Dst=#x86_fpreg{})->
  {fpreg_to_stack(Dst),fpreg_to_stack(Src)}.

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

mk_data_relocs(RefsFromConsts, LabelMap) ->
  lists:flatten(mk_data_relocs(RefsFromConsts, LabelMap, [])).

mk_data_relocs([{MFA,Labels} | Rest], LabelMap, Acc) ->
  Map = [case Label of
	   {L,Pos} ->
	     Offset = find({MFA,L}, LabelMap),
	     {Pos,Offset};
	   {sorted,Base,OrderedLabels} ->
	     {sorted, Base, [begin
			       Offset = find({MFA,L}, LabelMap),
			       {Order, Offset}
			     end
			     || {L,Order} <- OrderedLabels]}
	 end
	 || Label <- Labels],
  %% msg("Map: ~w Map\n",[Map]),
  mk_data_relocs(Rest, LabelMap, [Map,Acc]);
mk_data_relocs([],_,Acc) -> Acc.

find({MFA,L},LabelMap) ->
  gb_trees:get({MFA,L}, LabelMap).

slim_sorted_exportmap([{Addr,M,F,A}|Rest], Closures, Exports) ->
  IsClosure = lists:member({M,F,A}, Closures),
  IsExported = is_exported(F, A, Exports),
  [Addr,M,F,A,IsClosure,IsExported | slim_sorted_exportmap(Rest, Closures, Exports)];
slim_sorted_exportmap([],_,_) -> [].

is_exported(F, A, Exports) -> lists:member({F,A}, Exports).

%%%
%%% Assembly listing support (pp_asm option).
%%%

print(String, Arglist, Options) ->
  ?when_option(pp_asm, Options, io:format(String, Arglist)).

print_insn(Address, Bytes, I, Options) ->
  ?when_option(pp_asm, Options, print_insn_2(Address, Bytes, I)),
  ?when_option(pp_cxmon, Options, print_code_list_2(Bytes)).

print_code_list_2([H | Tail]) ->
  print_byte(H),
  io:format(","),
  print_code_list_2(Tail);
print_code_list_2([]) ->
  io:format("").

print_insn_2(Address, Bytes, {_,_,OrigI}) ->
  io:format("~8.16b | ", [Address]),
  print_code_list(Bytes, 0),
  ?HIPE_X86_PP:pp_insn(OrigI).

print_code_list([Byte|Rest], Len) ->
  print_byte(Byte),
  print_code_list(Rest, Len+1);
print_code_list([], Len) ->
  fill_spaces(24-(Len*2)),
  io:format(" | ").

print_byte(Byte) ->
  io:format("~2.16.0b", [Byte band 16#FF]).

fill_spaces(N) when N > 0 ->
  io:format(" "),
  fill_spaces(N-1);
fill_spaces(0) ->
  [].

%%%
%%% Lookup a constant in a ConstMap.
%%%

find_const({MFA,Label},[{pcm_entry,MFA,Label,ConstNo,_,_,_}|_]) ->
  ConstNo;
find_const(N,[_|R]) ->
  find_const(N,R);
find_const(C,[]) ->
  ?EXIT({constant_not_found,C}).