/*
 * %CopyrightBegin%
 * 
 * Copyright Ericsson AB 2005-2009. All Rights Reserved.
 * 
 * The contents of this file are subject to the Erlang Public License,
 * Version 1.1, (the "License"); you may not use this file except in
 * compliance with the License. You should have received a copy of the
 * Erlang Public License along with this software. If not, it can be
 * retrieved online at http://www.erlang.org/.
 * 
 * Software distributed under the License is distributed on an "AS IS"
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
 * the License for the specific language governing rights and limitations
 * under the License.
 * 
 * %CopyrightEnd%
 */
/* $Id$
 */
#include <stddef.h>	/* offsetof() */
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "global.h"
#include "erl_binary.h"
#include <sys/mman.h>

#include "hipe_arch.h"
#include "hipe_native_bif.h"	/* nbif_callemu() */
#include "hipe_bif0.h"

/* Flush dcache and invalidate icache for a range of addresses. */
void hipe_flush_icache_range(void *address, unsigned int nbytes)
{
#if defined(__ARM_EABI__)
    register unsigned long beg __asm__("r0") = (unsigned long)address;
    register unsigned long end __asm__("r1") = (unsigned long)address + nbytes;
    register unsigned long flg __asm__("r2") = 0;
    register unsigned long scno __asm__("r7") = 0xf0002;
    __asm__ __volatile__("swi 0"	/* sys_cacheflush() */
			 : "=r"(beg)
			 : "0"(beg), "r"(end), "r"(flg), "r"(scno));
#else
    register unsigned long beg __asm__("r0") = (unsigned long)address;
    register unsigned long end __asm__("r1") = (unsigned long)address + nbytes;
    register unsigned long flg __asm__("r2") = 0;
    __asm__ __volatile__("swi 0x9f0002"	/* sys_cacheflush() */
			 : "=r"(beg)
			 : "0"(beg), "r"(end), "r"(flg));
#endif
}

void hipe_flush_icache_word(void *address)
{
    hipe_flush_icache_range(address, 4);
}

/*
 * Management of 32MB code segments for regular code and trampolines.
 */

#define SEGMENT_NRBYTES	(32*1024*1024)	/* named constant, _not_ a tunable */

static struct segment {
    unsigned int *base;		/* [base,base+32MB[ */
    unsigned int *code_pos;	/* INV: base <= code_pos <= tramp_pos  */
    unsigned int *tramp_pos;	/* INV: tramp_pos <= base+32MB */
    /* On ARM we always allocate a trampoline at base+32MB-8 for
       nbif_callemu, so tramp_pos <= base+32MB-8. */
} curseg;

#define in_area(ptr,start,nbytes)	\
	((unsigned long)((char*)(ptr) - (char*)(start)) < (nbytes))

static void *new_code_mapping(void)
{
    return mmap(0, SEGMENT_NRBYTES,
		PROT_EXEC|PROT_READ|PROT_WRITE,
		MAP_PRIVATE|MAP_ANONYMOUS,
		-1, 0);
}

static int check_callees(Eterm callees)
{
    Eterm *tuple;
    Uint arity;
    Uint i;

    if (is_not_tuple(callees))
	return -1;
    tuple = tuple_val(callees);
    arity = arityval(tuple[0]);
    for (i = 1; i <= arity; ++i) {
	Eterm mfa = tuple[i];
	if (is_atom(mfa))
	    continue;
	if (is_not_tuple(mfa) ||
	    tuple_val(mfa)[0] != make_arityval(3) ||
	    is_not_atom(tuple_val(mfa)[1]) ||
	    is_not_atom(tuple_val(mfa)[2]) ||
	    is_not_small(tuple_val(mfa)[3]) ||
	    unsigned_val(tuple_val(mfa)[3]) > 255)
	    return -1;
    }
    return arity;
}

static unsigned int *try_alloc(Uint nrwords, int nrcallees, Eterm callees, unsigned int **trampvec)
{
    unsigned int *base, *address, *tramp_pos, nrfreewords;
    int trampnr;
    Eterm mfa, m, f;
    unsigned int a, *trampoline;

    m = NIL; f = NIL; a = 0; /* silence stupid compiler warning */
    tramp_pos = curseg.tramp_pos;
    address = curseg.code_pos;
    nrfreewords = tramp_pos - address;
    if (nrwords > nrfreewords)
	return NULL;
    curseg.code_pos = address + nrwords;
    nrfreewords -= nrwords;

    base = curseg.base;
    for (trampnr = 1; trampnr <= nrcallees; ++trampnr) {
	mfa = tuple_val(callees)[trampnr];
	if (is_atom(mfa))
	    trampoline = hipe_primop_get_trampoline(mfa);
	else {
	    m = tuple_val(mfa)[1];
	    f = tuple_val(mfa)[2];
	    a = unsigned_val(tuple_val(mfa)[3]);
	    trampoline = hipe_mfa_get_trampoline(m, f, a);
	}
	if (!in_area(trampoline, base, SEGMENT_NRBYTES)) {
	    if (nrfreewords < 2)
		return NULL;
	    nrfreewords -= 2;
	    tramp_pos = trampoline = tramp_pos - 2;
	    trampoline[0] = 0xE51FF004; /* ldr pc, [pc,#-4] */
	    trampoline[1] = 0;		/* callee's address */
	    hipe_flush_icache_range(trampoline, 2*sizeof(int));
	    if (is_atom(mfa))
		hipe_primop_set_trampoline(mfa, trampoline);
	    else
		hipe_mfa_set_trampoline(m, f, a, trampoline);
	}
	trampvec[trampnr-1] = trampoline;
    }
    curseg.tramp_pos = tramp_pos;
    return address;
}

void *hipe_alloc_code(Uint nrbytes, Eterm callees, Eterm *trampolines, Process *p)
{
    Uint nrwords;
    int nrcallees;
    Eterm trampvecbin;
    unsigned int **trampvec;
    unsigned int *address;
    unsigned int *base;
    struct segment oldseg;

    if (nrbytes & 0x3)
	return NULL;
    nrwords = nrbytes >> 2;

    nrcallees = check_callees(callees);
    if (nrcallees < 0)
	return NULL;
    trampvecbin = new_binary(p, NULL, nrcallees*sizeof(unsigned int*));
    trampvec = (unsigned int**)binary_bytes(trampvecbin);

    address = try_alloc(nrwords, nrcallees, callees, trampvec);
    if (!address) {
	base = new_code_mapping();
	if (base == MAP_FAILED)
	    return NULL;
	oldseg = curseg;
	curseg.base = base;
	curseg.code_pos = base;
	curseg.tramp_pos = (unsigned int*)((char*)base + SEGMENT_NRBYTES);
#if defined(__arm__)
	curseg.tramp_pos -= 2;
	curseg.tramp_pos[0] = 0xE51FF004;	/* ldr pc, [pc,#-4] */
	curseg.tramp_pos[1] = (unsigned int)&nbif_callemu;
#endif

	address = try_alloc(nrwords, nrcallees, callees, trampvec);
	if (!address) {
	    munmap(base, SEGMENT_NRBYTES);
	    curseg = oldseg;
	    return NULL;
	}
	/* commit to new segment, ignore leftover space in old segment */
    }
    *trampolines = trampvecbin;
    return address;
}

static unsigned int *alloc_stub(Uint nrwords, unsigned int **tramp_callemu)
{
    unsigned int *address;
    unsigned int *base;
    struct segment oldseg;

    address = try_alloc(nrwords, 0, NIL, NULL);
    if (!address) {
	base = new_code_mapping();
	if (base == MAP_FAILED)
	    return NULL;
	oldseg = curseg;
	curseg.base = base;
	curseg.code_pos = base;
	curseg.tramp_pos = (unsigned int*)((char*)base + SEGMENT_NRBYTES);
#if defined(__arm__)
	curseg.tramp_pos -= 2;
	curseg.tramp_pos[0] = 0xE51FF004;	/* ldr pc, [pc,#-4] */
	curseg.tramp_pos[1] = (unsigned int)&nbif_callemu;
#endif

	address = try_alloc(nrwords, 0, NIL, NULL);
	if (!address) {
	    munmap(base, SEGMENT_NRBYTES);
	    curseg = oldseg;
	    return NULL;
	}
	/* commit to new segment, ignore leftover space in old segment */
    }
    *tramp_callemu = (unsigned int*)((char*)curseg.base + SEGMENT_NRBYTES) - 2;
    return address;
}

/*
 * ARMv5's support for 32-bit immediates is effectively non-existent.
 * Hence, every 32-bit immediate is stored in memory and loaded via
 * a PC-relative addressing mode. Relocation entries refer to those
 * data words, NOT the load instructions, so patching is trivial.
 */
static void patch_imm32(Uint32 *address, unsigned int imm32)
{
    *address = imm32;
    hipe_flush_icache_word(address);
}

void hipe_patch_load_fe(Uint32 *address, Uint value)
{
    patch_imm32(address, value);
}

int hipe_patch_insn(void *address, Uint32 value, Eterm type)
{
    switch (type) {
      case am_closure:
      case am_constant:
      case am_atom:
      case am_c_const:
	break;
      default:
	return -1;
    }
    patch_imm32((Uint32*)address, value);
    return 0;
}

/* called from hipe_bif0.c:hipe_bifs_make_native_stub_2()
   and hipe_bif0.c:hipe_make_stub() */
void *hipe_make_native_stub(void *beamAddress, unsigned int beamArity)
{
    unsigned int *code;
#if defined(__arm__)
    unsigned int *tramp_callemu;
    int callemu_offset;
#endif

    /*
     * Native code calls BEAM via a stub looking as follows:
     *
     * mov r0, #beamArity
     * ldr r8, [pc,#0] // beamAddress
     * b nbif_callemu
     * .long beamAddress
     *
     * I'm using r0 and r8 since they aren't used for
     * parameter passing in native code. The branch to
     * nbif_callemu may need to go via a trampoline.
     * (Trampolines are allowed to modify r12, but they don't.)
     */

#if !defined(__arm__)
    /* verify that 'ba' can reach nbif_callemu */
    if ((unsigned long)&nbif_callemu & ~0x01FFFFFCUL)
	abort();
#endif

#if defined(__arm__)
    code = alloc_stub(4, &tramp_callemu);
    callemu_offset = ((int)&nbif_callemu - ((int)&code[2] + 8)) >> 2;
    if (!(callemu_offset >= -0x00800000 && callemu_offset <= 0x007FFFFF)) {
	callemu_offset = ((int)tramp_callemu - ((int)&code[2] + 8)) >> 2;
	if (!(callemu_offset >= -0x00800000 && callemu_offset <= 0x007FFFFF))
	    abort();
    }
#else
    code = alloc_stub(4, &trampoline);
#endif

#if defined(__arm__)
    /* mov r0, #beamArity */
    code[0] = 0xE3A00000 | (beamArity & 0xFF);
    /* ldr r8, [pc,#0] // beamAddress */
    code[1] = 0xE59F8000;
    /* b nbif_callemu */
    code[2] = 0xEA000000 | (callemu_offset & 0x00FFFFFF);
    /* .long beamAddress */
    code[3] = (unsigned int)beamAddress;
#else
    /* addi r12,0,beamAddress@l */
    code[0] = 0x39800000 | ((unsigned long)beamAddress & 0xFFFF);
    /* addi r0,0,beamArity */
    code[1] = 0x38000000 | (beamArity & 0x7FFF);
    /* addis r12,r12,beamAddress@ha */
    code[2] = 0x3D8C0000 | at_ha((unsigned long)beamAddress);
    /* ba nbif_callemu */
    code[3] = 0x48000002 | (unsigned long)&nbif_callemu;
#endif

    hipe_flush_icache_range(code, 4*sizeof(int));

    return code;
}

static void patch_b(Uint32 *address, Sint32 offset, Uint32 AA)
{
    Uint32 oldI = *address;
#if defined(__arm__)
    Uint32 newI = (oldI & 0xFF000000) | (offset & 0x00FFFFFF);
#else
    Uint32 newI = (oldI & 0xFC000001) | ((offset & 0x00FFFFFF) << 2) | (AA & 2);
#endif
    *address = newI;
    hipe_flush_icache_word(address);
}

int hipe_patch_call(void *callAddress, void *destAddress, void *trampoline)
{
#if !defined(__arm__)
    if ((Uint32)destAddress == ((Uint32)destAddress & 0x01FFFFFC)) {
	/* The destination is in the [0,32MB[ range.
	   We can reach it with a ba/bla instruction.
	   This is the typical case for BIFs and primops.
	   It's also common for trap-to-BEAM stubs (on ppc32). */
	patch_b((Uint32*)callAddress, (Uint32)destAddress >> 2, 2);
    } else {
#endif
#if defined(__arm__)
	Sint32 destOffset = ((Sint32)destAddress - ((Sint32)callAddress+8)) >> 2;
#else
	Sint32 destOffset = ((Sint32)destAddress - (Sint32)callAddress) >> 2;
#endif
	if (destOffset >= -0x800000 && destOffset <= 0x7FFFFF) {
	    /* The destination is within a [-32MB,+32MB[ range from us.
	       We can reach it with a b/bl instruction.
	       This is typical for nearby Erlang code. */
	    patch_b((Uint32*)callAddress, destOffset, 0);
	} else {
	    /* The destination is too distant for b/bl/ba/bla.
	       Must do a b/bl to the trampoline. */
#if defined(__arm__)
	    Sint32 trampOffset = ((Sint32)trampoline - ((Sint32)callAddress+8)) >> 2;
#else
	    Sint32 trampOffset = ((Sint32)trampoline - (Sint32)callAddress) >> 2;
#endif
	    if (trampOffset >= -0x800000 && trampOffset <= 0x7FFFFF) {
		/* Update the trampoline's address computation.
		   (May be redundant, but we can't tell.) */
#if defined(__arm__)
		patch_imm32((Uint32*)trampoline+1, (Uint32)destAddress);
#else
		patch_li((Uint32*)trampoline, (Uint32)destAddress);
#endif
		/* Update this call site. */
		patch_b((Uint32*)callAddress, trampOffset, 0);
	    } else
		return -1;
	}
#if !defined(__arm__)
    }
#endif
    return 0;
}

void hipe_arch_print_pcb(struct hipe_process_state *p)
{
#define U(n,x) \
    printf(" % 4d | %s | 0x%0*lx | %*s |\r\n", (int)offsetof(struct hipe_process_state,x), n, 2*(int)sizeof(long), (unsigned long)p->x, 2+2*(int)sizeof(long), "")
    U("nra        ", nra);
    U("narity     ", narity);
#undef U
}