diff options
author | Sverker Eriksson <[email protected]> | 2017-08-30 20:55:08 +0200 |
---|---|---|
committer | Sverker Eriksson <[email protected]> | 2017-08-30 20:55:08 +0200 |
commit | 7c67bbddb53c364086f66260701bc54a61c9659c (patch) | |
tree | 92ab0d4b91d5e2f6e7a3f9d61ea25089e8a71fe0 /erts/emulator/sys | |
parent | 97dc5e7f396129222419811c173edc7fa767b0f8 (diff) | |
parent | 3b7a6ffddc819bf305353a593904cea9e932e7dc (diff) | |
download | otp-7c67bbddb53c364086f66260701bc54a61c9659c.tar.gz otp-7c67bbddb53c364086f66260701bc54a61c9659c.tar.bz2 otp-7c67bbddb53c364086f66260701bc54a61c9659c.zip |
Merge tag 'OTP-19.0' into sverker/19/binary_to_atom-utf8-crash/ERL-474/OTP-14590
Diffstat (limited to 'erts/emulator/sys')
37 files changed, 9967 insertions, 4087 deletions
diff --git a/erts/emulator/sys/common/erl_check_io.c b/erts/emulator/sys/common/erl_check_io.c index 7035dc77df..44a77f3ea5 100644 --- a/erts/emulator/sys/common/erl_check_io.c +++ b/erts/emulator/sys/common/erl_check_io.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2006-2013. All Rights Reserved. + * Copyright Ericsson AB 2006-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -34,12 +35,15 @@ #endif #include "sys.h" #include "global.h" +#include "erl_port.h" #include "erl_check_io.h" #include "erl_thr_progress.h" #include "dtrace-wrapper.h" +#include "lttng-wrapper.h" +#define ERTS_WANT_TIMER_WHEEL_API +#include "erl_time.h" -#ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS -#else +#ifndef ERTS_SYS_CONTINOUS_FD_NUMBERS # include "safe_hash.h" # define DRV_EV_STATE_HTAB_SIZE 1024 #endif @@ -51,8 +55,17 @@ typedef char EventStateType; #define ERTS_EV_TYPE_STOP_USE ((EventStateType) 3) /* pending stop_select */ typedef char EventStateFlags; -#define ERTS_EV_FLAG_USED ((EventStateFlags) 1) /* ERL_DRV_USE has been turned on */ +#define ERTS_EV_FLAG_USED ((EventStateFlags) 1) /* ERL_DRV_USE has been turned on */ +#define ERTS_EV_FLAG_DEFER_IN_EV ((EventStateFlags) 2) +#define ERTS_EV_FLAG_DEFER_OUT_EV ((EventStateFlags) 4) + +#ifdef DEBUG +# define ERTS_ACTIVE_FD_INC 2 +#else +# define ERTS_ACTIVE_FD_INC 128 +#endif +#define ERTS_CHECK_IO_POLL_RES_LEN 512 #if defined(ERTS_KERNEL_POLL_VERSION) # define ERTS_CIO_EXPORT(FUNC) FUNC ## _kp @@ -66,6 +79,7 @@ typedef char EventStateFlags; (ERTS_POLL_USE_POLL && !ERTS_POLL_USE_KERNEL_POLL) #define ERTS_CIO_POLL_CTL ERTS_POLL_EXPORT(erts_poll_control) +#define ERTS_CIO_POLL_CTLV ERTS_POLL_EXPORT(erts_poll_controlv) #define ERTS_CIO_POLL_WAIT ERTS_POLL_EXPORT(erts_poll_wait) #ifdef ERTS_POLL_NEED_ASYNC_INTERRUPT_SUPPORT #define ERTS_CIO_POLL_AS_INTR ERTS_POLL_EXPORT(erts_poll_async_sig_interrupt) @@ -78,10 +92,19 @@ typedef char EventStateFlags; #define ERTS_CIO_POLL_INIT ERTS_POLL_EXPORT(erts_poll_init) #define ERTS_CIO_POLL_INFO ERTS_POLL_EXPORT(erts_poll_info) +#define GET_FD(fd) fd + static struct pollset_info { ErtsPollSet ps; erts_smp_atomic_t in_poll_wait; /* set while doing poll */ + struct { + int six; /* start index */ + int eix; /* end index */ + erts_smp_atomic32_t no; + int size; + ErtsSysFdType *array; + } active_fd; #ifdef ERTS_SMP struct removed_fd* removed_list; /* list of deselected fd's*/ erts_smp_spinlock_t removed_list_lock; @@ -94,9 +117,11 @@ typedef struct { SafeHashBucket hb; #endif ErtsSysFdType fd; - union { - ErtsDrvEventDataState *event; /* ERTS_EV_TYPE_DRV_EV */ + struct { ErtsDrvSelectDataState *select; /* ERTS_EV_TYPE_DRV_SEL */ +#if ERTS_CIO_HAVE_DRV_EVENT + ErtsDrvEventDataState *event; /* ERTS_EV_TYPE_DRV_EV */ +#endif erts_driver_t* drv_ptr; /* ERTS_EV_TYPE_STOP_USE */ } driver; ErtsPollEvents events; @@ -166,6 +191,10 @@ static ERTS_INLINE ErtsDrvEventState* hash_new_drv_ev_state(ErtsSysFdType fd) ErtsDrvEventState tmpl; tmpl.fd = fd; tmpl.driver.select = NULL; +#if ERTS_CIO_HAVE_DRV_EVENT + tmpl.driver.event = NULL; +#endif + tmpl.driver.drv_ptr = NULL; tmpl.events = 0; tmpl.remove_cnt = 0; tmpl.type = ERTS_EV_TYPE_NONE; @@ -206,6 +235,69 @@ ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(removed_fd, struct removed_fd, 64, ERTS_ALC_T_F #endif static ERTS_INLINE void +init_iotask(ErtsIoTask *io_task) +{ + erts_port_task_handle_init(&io_task->task); + erts_smp_atomic_init_nob(&io_task->executed_time, ~((erts_aint_t) 0)); +} + +static ERTS_INLINE int +is_iotask_active(ErtsIoTask *io_task, erts_aint_t current_cio_time) +{ + if (erts_port_task_is_scheduled(&io_task->task)) + return 1; + if (erts_smp_atomic_read_nob(&io_task->executed_time) == current_cio_time) + return 1; + return 0; +} + +static ERTS_INLINE ErtsDrvSelectDataState * +alloc_drv_select_data(void) +{ + ErtsDrvSelectDataState *dsp = erts_alloc(ERTS_ALC_T_DRV_SEL_D_STATE, + sizeof(ErtsDrvSelectDataState)); + dsp->inport = NIL; + dsp->outport = NIL; + init_iotask(&dsp->iniotask); + init_iotask(&dsp->outiotask); + return dsp; +} + +static ERTS_INLINE void +free_drv_select_data(ErtsDrvSelectDataState *dsp) +{ + ASSERT(!erts_port_task_is_scheduled(&dsp->iniotask.task)); + ASSERT(!erts_port_task_is_scheduled(&dsp->outiotask.task)); + erts_free(ERTS_ALC_T_DRV_SEL_D_STATE, dsp); +} + +#if ERTS_CIO_HAVE_DRV_EVENT + +static ERTS_INLINE ErtsDrvEventDataState * +alloc_drv_event_data(void) +{ + ErtsDrvEventDataState *dep = erts_alloc(ERTS_ALC_T_DRV_EV_D_STATE, + sizeof(ErtsDrvEventDataState)); + dep->port = NIL; + dep->data = NULL; + dep->removed_events = 0; +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + dep->deferred_events = 0; +#endif + init_iotask(&dep->iotask); + return dep; +} + +static ERTS_INLINE void +free_drv_event_data(ErtsDrvEventDataState *dep) +{ + ASSERT(!erts_port_task_is_scheduled(&dep->iotask.task)); + erts_free(ERTS_ALC_T_DRV_EV_D_STATE, dep); +} + +#endif /* ERTS_CIO_HAVE_DRV_EVENT */ + +static ERTS_INLINE void remember_removed(ErtsDrvEventState *state, struct pollset_info* psi) { #ifdef ERTS_SMP @@ -285,7 +377,7 @@ forget_removed(struct pollset_info* psi) drv_ptr = state->driver.drv_ptr; ASSERT(drv_ptr); state->type = ERTS_EV_TYPE_NONE; - state->flags = 0; + state->flags &= ~ERTS_EV_FLAG_USED; state->driver.drv_ptr = NULL; /* Fall through */ case ERTS_EV_TYPE_NONE: @@ -304,6 +396,7 @@ forget_removed(struct pollset_info* psi) if (drv_ptr) { int was_unmasked = erts_block_fpe(); DTRACE1(driver_stop_select, drv_ptr->name); + LTTNG1(driver_stop_select, drv_ptr->name); (*drv_ptr->stop_select) ((ErlDrvEvent) fd, NULL); erts_unblock_fpe(was_unmasked); if (drv_ptr->handle) { @@ -322,14 +415,16 @@ static void grow_drv_ev_state(int min_ix) { int i; + int old_len; int new_len; - new_len = ERTS_POLL_EXPORT(erts_poll_get_table_len)(min_ix + 1); - if (new_len > max_fds) - new_len = max_fds; - erts_smp_mtx_lock(&drv_ev_state_grow_lock); - if (erts_smp_atomic_read_nob(&drv_ev_state_len) <= min_ix) { + old_len = erts_smp_atomic_read_nob(&drv_ev_state_len); + if (min_ix >= old_len) { + new_len = erts_poll_new_table_len(old_len, min_ix + 1); + if (new_len > max_fds) + new_len = max_fds; + for (i=0; i<DRV_EV_STATE_LOCK_CNT; i++) { /* lock all fd's */ erts_smp_mtx_lock(&drv_ev_state_locks[i].lck); } @@ -339,9 +434,13 @@ grow_drv_ev_state(int min_ix) sizeof(ErtsDrvEventState)*new_len) : erts_alloc(ERTS_ALC_T_DRV_EV_STATE, sizeof(ErtsDrvEventState)*new_len)); - for (i = erts_smp_atomic_read_nob(&drv_ev_state_len); i < new_len; i++) { + for (i = old_len; i < new_len; i++) { drv_ev_state[i].fd = (ErtsSysFdType) i; drv_ev_state[i].driver.select = NULL; +#if ERTS_CIO_HAVE_DRV_EVENT + drv_ev_state[i].driver.event = NULL; +#endif + drv_ev_state[i].driver.drv_ptr = NULL; drv_ev_state[i].events = 0; drv_ev_state[i].remove_cnt = 0; drv_ev_state[i].type = ERTS_EV_TYPE_NONE; @@ -362,11 +461,7 @@ grow_drv_ev_state(int min_ix) static ERTS_INLINE void abort_task(Eterm id, ErtsPortTaskHandle *pthp, EventStateType type) { - if (is_nil(id)) { - ASSERT(type == ERTS_EV_TYPE_NONE - || !erts_port_task_is_scheduled(pthp)); - } - else if (erts_port_task_is_scheduled(pthp)) { + if (is_not_nil(id) && erts_port_task_is_scheduled(pthp)) { erts_port_task_abort(pthp); ASSERT(erts_is_port_alive(id)); } @@ -381,7 +476,7 @@ abort_tasks(ErtsDrvEventState *state, int mode) #if ERTS_CIO_HAVE_DRV_EVENT case ERTS_EV_TYPE_DRV_EV: abort_task(state->driver.event->port, - &state->driver.event->task, + &state->driver.event->iotask.task, ERTS_EV_TYPE_DRV_EV); return; #endif @@ -395,14 +490,14 @@ abort_tasks(ErtsDrvEventState *state, int mode) case ERL_DRV_WRITE: ASSERT(state->type == ERTS_EV_TYPE_DRV_SEL); abort_task(state->driver.select->outport, - &state->driver.select->outtask, + &state->driver.select->outiotask.task, state->type); if (mode == ERL_DRV_WRITE) break; case ERL_DRV_READ: ASSERT(state->type == ERTS_EV_TYPE_DRV_SEL); abort_task(state->driver.select->inport, - &state->driver.select->intask, + &state->driver.select->iniotask.task, state->type); break; default: @@ -440,16 +535,14 @@ deselect(ErtsDrvEventState *state, int mode) if (!(state->events)) { switch (state->type) { case ERTS_EV_TYPE_DRV_SEL: - ASSERT(!erts_port_task_is_scheduled(&state->driver.select->intask)); - ASSERT(!erts_port_task_is_scheduled(&state->driver.select->outtask)); - erts_free(ERTS_ALC_T_DRV_SEL_D_STATE, - state->driver.select); + state->driver.select->inport = NIL; + state->driver.select->outport = NIL; break; #if ERTS_CIO_HAVE_DRV_EVENT case ERTS_EV_TYPE_DRV_EV: - ASSERT(!erts_port_task_is_scheduled(&state->driver.event->task)); - erts_free(ERTS_ALC_T_DRV_EV_D_STATE, - state->driver.event); + state->driver.event->port = NIL; + state->driver.event->data = NULL; + state->driver.event->removed_events = (ErtsPollEvents) 0; break; #endif case ERTS_EV_TYPE_NONE: @@ -459,20 +552,297 @@ deselect(ErtsDrvEventState *state, int mode) break; } - state->driver.select = NULL; state->type = ERTS_EV_TYPE_NONE; - state->flags = 0; + state->flags &= ~ERTS_EV_FLAG_USED; remember_removed(state, &pollset); } } - #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS # define IS_FD_UNKNOWN(state) ((state)->type == ERTS_EV_TYPE_NONE && (state)->remove_cnt == 0) #else # define IS_FD_UNKNOWN(state) ((state) == NULL) #endif +static ERTS_INLINE void +check_fd_cleanup(ErtsDrvEventState *state, +#if ERTS_CIO_HAVE_DRV_EVENT + ErtsDrvEventDataState **free_event, +#endif + ErtsDrvSelectDataState **free_select) +{ + erts_aint_t current_cio_time; + + ERTS_SMP_LC_ASSERT(erts_smp_lc_mtx_is_locked(fd_mtx(state->fd))); + + current_cio_time = erts_smp_atomic_read_acqb(&erts_check_io_time); + *free_select = NULL; + if (state->driver.select + && (state->type != ERTS_EV_TYPE_DRV_SEL) + && !is_iotask_active(&state->driver.select->iniotask, current_cio_time) + && !is_iotask_active(&state->driver.select->outiotask, current_cio_time)) { + + *free_select = state->driver.select; + state->driver.select = NULL; + } + +#if ERTS_CIO_HAVE_DRV_EVENT + *free_event = NULL; + if (state->driver.event + && (state->type != ERTS_EV_TYPE_DRV_EV) + && !is_iotask_active(&state->driver.event->iotask, current_cio_time)) { + + *free_event = state->driver.event; + state->driver.event = NULL; + } +#endif + +#ifndef ERTS_SYS_CONTINOUS_FD_NUMBERS + if (((state->type != ERTS_EV_TYPE_NONE) + | state->remove_cnt +#if ERTS_CIO_HAVE_DRV_EVENT + | (state->driver.event != NULL) +#endif + | (state->driver.select != NULL)) == 0) { + + hash_erase_drv_ev_state(state); + + } +#endif +} + +static ERTS_INLINE int +check_cleanup_active_fd(ErtsSysFdType fd, +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + ErtsPollControlEntry *pce, + int *pce_ix, +#endif + erts_aint_t current_cio_time) +{ + ErtsDrvEventState *state; + int active = 0; + erts_smp_mtx_t *mtx = fd_mtx(fd); + void *free_select = NULL; +#if ERTS_CIO_HAVE_DRV_EVENT + void *free_event = NULL; +#endif +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + ErtsPollEvents evon = 0, evoff = 0; +#endif + + erts_smp_mtx_lock(mtx); + +#ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS + state = &drv_ev_state[(int) fd]; +#else + state = hash_get_drv_ev_state(fd); /* may be NULL! */ + if (state) +#endif + { + if (state->driver.select) { +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + if (is_iotask_active(&state->driver.select->iniotask, current_cio_time)) { + active = 1; + if ((state->events & ERTS_POLL_EV_IN) + && !(state->flags & ERTS_EV_FLAG_DEFER_IN_EV)) { + evoff |= ERTS_POLL_EV_IN; + state->flags |= ERTS_EV_FLAG_DEFER_IN_EV; + } + } + else if (state->flags & ERTS_EV_FLAG_DEFER_IN_EV) { + if (state->events & ERTS_POLL_EV_IN) + evon |= ERTS_POLL_EV_IN; + state->flags &= ~ERTS_EV_FLAG_DEFER_IN_EV; + } + if (is_iotask_active(&state->driver.select->outiotask, current_cio_time)) { + active = 1; + if ((state->events & ERTS_POLL_EV_OUT) + && !(state->flags & ERTS_EV_FLAG_DEFER_OUT_EV)) { + evoff |= ERTS_POLL_EV_OUT; + state->flags |= ERTS_EV_FLAG_DEFER_OUT_EV; + } + } + else if (state->flags & ERTS_EV_FLAG_DEFER_OUT_EV) { + if (state->events & ERTS_POLL_EV_OUT) + evon |= ERTS_POLL_EV_OUT; + state->flags &= ~ERTS_EV_FLAG_DEFER_OUT_EV; + } + if (active) + (void) 0; + else +#else + if (is_iotask_active(&state->driver.select->iniotask, current_cio_time) + || is_iotask_active(&state->driver.select->outiotask, current_cio_time)) + active = 1; + else +#endif + if (state->type != ERTS_EV_TYPE_DRV_SEL) { + free_select = state->driver.select; + state->driver.select = NULL; + } + } + +#if ERTS_CIO_HAVE_DRV_EVENT + if (state->driver.event) { + if (is_iotask_active(&state->driver.event->iotask, current_cio_time)) { +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + ErtsPollEvents evs = state->events & ~state->driver.event->deferred_events; + if (evs) { + evoff |= evs; + state->driver.event->deferred_events |= evs; + } +#endif + active = 1; + } + else if (state->type != ERTS_EV_TYPE_DRV_EV) { + free_event = state->driver.event; + state->driver.event = NULL; + } +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + else { + ErtsPollEvents evs = state->events & state->driver.event->deferred_events; + if (evs) { + evon |= evs; + state->driver.event->deferred_events = 0; + } + } +#endif + + } +#endif + +#ifndef ERTS_SYS_CONTINOUS_FD_NUMBERS + if (((state->type != ERTS_EV_TYPE_NONE) | state->remove_cnt | active) == 0) + hash_erase_drv_ev_state(state); +#endif + + } + + erts_smp_mtx_unlock(mtx); + + if (free_select) + free_drv_select_data(free_select); +#if ERTS_CIO_HAVE_DRV_EVENT + if (free_event) + free_drv_event_data(free_event); +#endif + +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + if (evoff) { + ErtsPollControlEntry *pcep = &pce[(*pce_ix)++]; + pcep->fd = fd; + pcep->events = evoff; + pcep->on = 0; + } + if (evon) { + ErtsPollControlEntry *pcep = &pce[(*pce_ix)++]; + pcep->fd = fd; + pcep->events = evon; + pcep->on = 1; + } +#endif + + return active; +} + +static void +check_cleanup_active_fds(erts_aint_t current_cio_time) +{ + int six = pollset.active_fd.six; + int eix = pollset.active_fd.eix; + erts_aint32_t no = erts_smp_atomic32_read_dirty(&pollset.active_fd.no); + int size = pollset.active_fd.size; + int ix = six; +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + /* every fd might add two entries */ + Uint pce_sz = 2*sizeof(ErtsPollControlEntry)*no; + ErtsPollControlEntry *pctrl_entries = (pce_sz + ? erts_alloc(ERTS_ALC_T_TMP, pce_sz) + : NULL); + int pctrl_ix = 0; +#endif + + while (ix != eix) { + ErtsSysFdType fd = pollset.active_fd.array[ix]; + int nix = ix + 1; + if (nix >= size) + nix = 0; + ASSERT(fd != ERTS_SYS_FD_INVALID); + if (!check_cleanup_active_fd(fd, +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + pctrl_entries, + &pctrl_ix, +#endif + current_cio_time)) { + no--; + if (ix == six) { +#ifdef DEBUG + pollset.active_fd.array[ix] = ERTS_SYS_FD_INVALID; +#endif + six = nix; + } + else { + pollset.active_fd.array[ix] = pollset.active_fd.array[six]; +#ifdef DEBUG + pollset.active_fd.array[six] = ERTS_SYS_FD_INVALID; +#endif + six++; + if (six >= size) + six = 0; + } + } + ix = nix; + } + +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + ASSERT(pctrl_ix <= pce_sz/sizeof(ErtsPollControlEntry)); + if (pctrl_ix) + ERTS_CIO_POLL_CTLV(pollset.ps, pctrl_entries, pctrl_ix); + if (pctrl_entries) + erts_free(ERTS_ALC_T_TMP, pctrl_entries); +#endif + + pollset.active_fd.six = six; + pollset.active_fd.eix = eix; + erts_smp_atomic32_set_relb(&pollset.active_fd.no, no); +} + +static ERTS_INLINE void +add_active_fd(ErtsSysFdType fd) +{ + int eix = pollset.active_fd.eix; + int size = pollset.active_fd.size; + + + pollset.active_fd.array[eix] = fd; + + erts_smp_atomic32_set_relb(&pollset.active_fd.no, + (erts_smp_atomic32_read_dirty(&pollset.active_fd.no) + + 1)); + + eix++; + if (eix >= size) + eix = 0; + if (pollset.active_fd.six == eix) { + pollset.active_fd.six = 0; + eix = size; + size += ERTS_ACTIVE_FD_INC; + pollset.active_fd.array = erts_realloc(ERTS_ALC_T_ACTIVE_FD_ARR, + pollset.active_fd.array, + sizeof(ErtsSysFdType)*size); + pollset.active_fd.size = size; +#ifdef DEBUG + { + int i; + for (i = eix + 1; i < size; i++) + pollset.active_fd.array[i] = ERTS_SYS_FD_INVALID; + } +#endif + + } + + pollset.active_fd.eix = eix; +} int ERTS_CIO_EXPORT(driver_select)(ErlDrvPort ix, @@ -489,6 +859,10 @@ ERTS_CIO_EXPORT(driver_select)(ErlDrvPort ix, ErtsDrvEventState *state; int wake_poller; int ret; +#if ERTS_CIO_HAVE_DRV_EVENT + ErtsDrvEventDataState *free_event = NULL; +#endif + ErtsDrvSelectDataState *free_select = NULL; #ifdef USE_VM_PROBES DTRACE_CHARBUF(name, 64); #endif @@ -524,7 +898,8 @@ ERTS_CIO_EXPORT(driver_select)(ErlDrvPort ix, /* fast track to stop_select callback */ stop_select_fn = prt->drv_ptr->stop_select; #ifdef USE_VM_PROBES - strncpy(name, prt->drv_ptr->name, sizeof(name)-1); + strncpy(name, prt->drv_ptr->name, + sizeof(DTRACE_CHARBUF_NAME(name))-1); name[sizeof(name)-1] = '\0'; #endif ret = 0; @@ -589,9 +964,9 @@ ERTS_CIO_EXPORT(driver_select)(ErlDrvPort ix, if (new_events & (ERTS_POLL_EV_ERR|ERTS_POLL_EV_NVAL)) { if (state->type == ERTS_EV_TYPE_DRV_SEL && !state->events) { state->type = ERTS_EV_TYPE_NONE; - state->flags = 0; - erts_free(ERTS_ALC_T_DRV_SEL_D_STATE, state->driver.select); - state->driver.select = NULL; + state->flags &= ~ERTS_EV_FLAG_USED; + state->driver.select->inport = NIL; + state->driver.select->outport = NIL; } ret = -1; goto done; @@ -609,18 +984,10 @@ ERTS_CIO_EXPORT(driver_select)(ErlDrvPort ix, state->events = new_events; if (ctl_events) { if (on) { - if (state->type == ERTS_EV_TYPE_NONE) { - ErtsDrvSelectDataState *dsdsp - = erts_alloc(ERTS_ALC_T_DRV_SEL_D_STATE, - sizeof(ErtsDrvSelectDataState)); - dsdsp->inport = NIL; - dsdsp->outport = NIL; - erts_port_task_handle_init(&dsdsp->intask); - erts_port_task_handle_init(&dsdsp->outtask); - ASSERT(state->driver.select == NULL); - state->driver.select = dsdsp; + if (!state->driver.select) + state->driver.select = alloc_drv_select_data(); + if (state->type == ERTS_EV_TYPE_NONE) state->type = ERTS_EV_TYPE_DRV_SEL; - } ASSERT(state->type == ERTS_EV_TYPE_DRV_SEL); if (ctl_events & ERTS_POLL_EV_IN) state->driver.select->inport = id; @@ -641,17 +1008,12 @@ ERTS_CIO_EXPORT(driver_select)(ErlDrvPort ix, state->driver.select->outport = NIL; } if (new_events == 0) { - ASSERT(!erts_port_task_is_scheduled(&state->driver.select->intask)); - ASSERT(!erts_port_task_is_scheduled(&state->driver.select->outtask)); if (old_events != 0) { remember_removed(state, &pollset); } if ((mode & ERL_DRV_USE) || !(state->flags & ERTS_EV_FLAG_USED)) { state->type = ERTS_EV_TYPE_NONE; - state->flags = 0; - erts_free(ERTS_ALC_T_DRV_SEL_D_STATE, - state->driver.select); - state->driver.select = NULL; + state->flags &= ~ERTS_EV_FLAG_USED; } /*else keep it, as fd will probably be selected upon again */ } @@ -682,20 +1044,29 @@ ERTS_CIO_EXPORT(driver_select)(ErlDrvPort ix, ret = 0; -done:; -#ifndef ERTS_SYS_CONTINOUS_FD_NUMBERS - if (state->type == ERTS_EV_TYPE_NONE && state->remove_cnt == 0) { - hash_erase_drv_ev_state(state); - } +done: + + check_fd_cleanup(state, +#if ERTS_CIO_HAVE_DRV_EVENT + &free_event, #endif -done_unknown: + &free_select); + +done_unknown: erts_smp_mtx_unlock(fd_mtx(fd)); if (stop_select_fn) { int was_unmasked = erts_block_fpe(); DTRACE1(driver_stop_select, name); + LTTNG1(driver_stop_select, "unknown"); (*stop_select_fn)(e, NULL); erts_unblock_fpe(was_unmasked); } + if (free_select) + free_drv_select_data(free_select); +#if ERTS_CIO_HAVE_DRV_EVENT + if (free_event) + free_drv_event_data(free_event); +#endif return ret; } @@ -715,6 +1086,10 @@ ERTS_CIO_EXPORT(driver_event)(ErlDrvPort ix, ErtsDrvEventState *state; int do_wake = 0; int ret; +#if ERTS_CIO_HAVE_DRV_EVENT + ErtsDrvEventDataState *free_event; +#endif + ErtsDrvSelectDataState *free_select; Port *prt = erts_drvport2port(ix); if (prt == ERTS_INVALID_ERL_DRV_PORT) @@ -795,10 +1170,8 @@ ERTS_CIO_EXPORT(driver_event)(ErlDrvPort ix, state->driver.event->removed_events |= remove_events; } else { - state->driver.event - = erts_alloc(ERTS_ALC_T_DRV_EV_D_STATE, - sizeof(ErtsDrvEventDataState)); - erts_port_task_handle_init(&state->driver.event->task); + if (!state->driver.event) + state->driver.event = alloc_drv_event_data(); state->driver.event->port = id; state->driver.event->removed_events = (ErtsPollEvents) 0; state->type = ERTS_EV_TYPE_DRV_EV; @@ -808,10 +1181,10 @@ ERTS_CIO_EXPORT(driver_event)(ErlDrvPort ix, else { if (state->type == ERTS_EV_TYPE_DRV_EV) { abort_tasks(state, 0); - erts_free(ERTS_ALC_T_DRV_EV_D_STATE, - state->driver.event); + state->driver.event->port = NIL; + state->driver.event->data = NULL; + state->driver.event->removed_events = (ErtsPollEvents) 0; } - state->driver.select = NULL; state->type = ERTS_EV_TYPE_NONE; remember_removed(state, &pollset); } @@ -821,12 +1194,22 @@ ERTS_CIO_EXPORT(driver_event)(ErlDrvPort ix, ret = 0; done: -#ifndef ERTS_SYS_CONTINOUS_FD_NUMBERS - if (state->type == ERTS_EV_TYPE_NONE && state->remove_cnt == 0) { - hash_erase_drv_ev_state(state); - } + + check_fd_cleanup(state, +#if ERTS_CIO_HAVE_DRV_EVENT + &free_event, #endif + &free_select); + erts_smp_mtx_unlock(fd_mtx(fd)); + + if (free_select) + free_drv_select_data(free_select); +#if ERTS_CIO_HAVE_DRV_EVENT + if (free_event) + free_drv_event_data(free_event); +#endif + return ret; #endif } @@ -894,7 +1277,7 @@ print_driver_name(erts_dsprintf_buf_t *dsbufp, Eterm id) static void steal(erts_dsprintf_buf_t *dsbufp, ErtsDrvEventState *state, int mode) { - erts_dsprintf(dsbufp, "stealing control of fd=%d from ", (int) state->fd); + erts_dsprintf(dsbufp, "stealing control of fd=%d from ", (int) GET_FD(state->fd)); switch (state->type) { case ERTS_EV_TYPE_DRV_SEL: { int deselect_mode = 0; @@ -918,7 +1301,7 @@ steal(erts_dsprintf_buf_t *dsbufp, ErtsDrvEventState *state, int mode) if (deselect_mode) deselect(state, deselect_mode); else { - erts_dsprintf(dsbufp, "no one", (int) state->fd); + erts_dsprintf(dsbufp, "no one", (int) GET_FD(state->fd)); ASSERT(0); } erts_dsprintf(dsbufp, "\n"); @@ -946,7 +1329,7 @@ steal(erts_dsprintf_buf_t *dsbufp, ErtsDrvEventState *state, int mode) break; } default: - erts_dsprintf(dsbufp, "no one\n", (int) state->fd); + erts_dsprintf(dsbufp, "no one\n", (int) GET_FD(state->fd)); ASSERT(0); } } @@ -960,7 +1343,7 @@ print_select_op(erts_dsprintf_buf_t *dsbufp, "driver_select(%p, %d,%s%s%s%s, %d) " "by ", ix, - (int) fd, + (int) GET_FD(fd), mode & ERL_DRV_READ ? " ERL_DRV_READ" : "", mode & ERL_DRV_WRITE ? " ERL_DRV_WRITE" : "", mode & ERL_DRV_USE ? " ERL_DRV_USE" : "", @@ -1010,7 +1393,7 @@ steal_pending_stop_select(erts_dsprintf_buf_t *dsbufp, ErlDrvPort ix, ASSERT(state->type == ERTS_EV_TYPE_STOP_USE); erts_dsprintf(dsbufp, "failed: fd=%d (re)selected before stop_select " "was called for driver %s\n", - (int) state->fd, state->driver.drv_ptr->name); + (int) GET_FD(state->fd), state->driver.drv_ptr->name); erts_send_error_to_logger_nogl(dsbufp); if (on) { @@ -1019,7 +1402,7 @@ steal_pending_stop_select(erts_dsprintf_buf_t *dsbufp, ErlDrvPort ix, * In either case stop_select should not be called. */ state->type = ERTS_EV_TYPE_NONE; - state->flags = 0; + state->flags &= ~ERTS_EV_FLAG_USED; if (state->driver.drv_ptr->handle) { erts_ddll_dereference_driver(state->driver.drv_ptr->handle); } @@ -1091,38 +1474,103 @@ event_large_fd_error(ErlDrvPort ix, ErtsSysFdType fd, ErlDrvEventData event_data #endif #endif +static ERTS_INLINE int +io_task_schedule_allowed(ErtsDrvEventState *state, + ErtsPortTaskType type, + erts_aint_t current_cio_time) +{ + ErtsIoTask *io_task; + + switch (type) { + case ERTS_PORT_TASK_INPUT: + if (!state->driver.select) + return 0; +#if ERTS_CIO_HAVE_DRV_EVENT + if (state->driver.event) + return 0; +#endif + io_task = &state->driver.select->iniotask; + break; + case ERTS_PORT_TASK_OUTPUT: + if (!state->driver.select) + return 0; +#if ERTS_CIO_HAVE_DRV_EVENT + if (state->driver.event) + return 0; +#endif + io_task = &state->driver.select->outiotask; + break; +#if ERTS_CIO_HAVE_DRV_EVENT + case ERTS_PORT_TASK_EVENT: + if (!state->driver.event) + return 0; + if (state->driver.select) + return 0; + io_task = &state->driver.event->iotask; + break; +#endif + default: + ERTS_INTERNAL_ERROR("Invalid I/O-task type"); + return 0; + } + + return !is_iotask_active(io_task, current_cio_time); +} + static ERTS_INLINE void -iready(Eterm id, ErtsDrvEventState *state) +iready(Eterm id, ErtsDrvEventState *state, erts_aint_t current_cio_time) { - if (erts_port_task_schedule(id, - &state->driver.select->intask, - ERTS_PORT_TASK_INPUT, - (ErlDrvEvent) state->fd) != 0) { - stale_drv_select(id, state, ERL_DRV_READ); + if (io_task_schedule_allowed(state, + ERTS_PORT_TASK_INPUT, + current_cio_time)) { + ErtsIoTask *iotask = &state->driver.select->iniotask; + erts_smp_atomic_set_nob(&iotask->executed_time, current_cio_time); + if (erts_port_task_schedule(id, + &iotask->task, + ERTS_PORT_TASK_INPUT, + (ErlDrvEvent) state->fd) != 0) { + stale_drv_select(id, state, ERL_DRV_READ); + } + add_active_fd(state->fd); } } static ERTS_INLINE void -oready(Eterm id, ErtsDrvEventState *state) +oready(Eterm id, ErtsDrvEventState *state, erts_aint_t current_cio_time) { - if (erts_port_task_schedule(id, - &state->driver.select->outtask, - ERTS_PORT_TASK_OUTPUT, - (ErlDrvEvent) state->fd) != 0) { - stale_drv_select(id, state, ERL_DRV_WRITE); + if (io_task_schedule_allowed(state, + ERTS_PORT_TASK_OUTPUT, + current_cio_time)) { + ErtsIoTask *iotask = &state->driver.select->outiotask; + erts_smp_atomic_set_nob(&iotask->executed_time, current_cio_time); + if (erts_port_task_schedule(id, + &iotask->task, + ERTS_PORT_TASK_OUTPUT, + (ErlDrvEvent) state->fd) != 0) { + stale_drv_select(id, state, ERL_DRV_WRITE); + } + add_active_fd(state->fd); } } #if ERTS_CIO_HAVE_DRV_EVENT static ERTS_INLINE void -eready(Eterm id, ErtsDrvEventState *state, ErlDrvEventData event_data) +eready(Eterm id, ErtsDrvEventState *state, ErlDrvEventData event_data, + erts_aint_t current_cio_time) { - if (erts_port_task_schedule(id, - &state->driver.event->task, - ERTS_PORT_TASK_EVENT, - (ErlDrvEvent) state->fd, - event_data) != 0) { - stale_drv_select(id, state, 0); + if (io_task_schedule_allowed(state, + ERTS_PORT_TASK_EVENT, + current_cio_time)) { + ErtsIoTask *iotask = &state->driver.event->iotask; + erts_smp_atomic_set_nob(&iotask->executed_time, current_cio_time); + if (erts_port_task_schedule(id, + &iotask->task, + ERTS_PORT_TASK_EVENT, + (ErlDrvEvent) state->fd, + event_data) != 0) { + stale_drv_select(id, state, 0); + } + add_active_fd(state->fd); } } #endif @@ -1145,18 +1593,22 @@ ERTS_CIO_EXPORT(erts_check_io_interrupt)(int set) void ERTS_CIO_EXPORT(erts_check_io_interrupt_timed)(int set, - erts_short_time_t msec) + ErtsMonotonicTime timeout_time) { - ERTS_CIO_POLL_INTR_TMD(pollset.ps, set, msec); + ERTS_CIO_POLL_INTR_TMD(pollset.ps, set, timeout_time); } void ERTS_CIO_EXPORT(erts_check_io)(int do_wait) { - ErtsPollResFd pollres[256]; + ErtsPollResFd *pollres; int pollres_len; - SysTimeval wait_time; + ErtsMonotonicTime timeout_time; int poll_ret, i; + erts_aint_t current_cio_time; + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + + ASSERT(esdp); restart: @@ -1166,28 +1618,37 @@ ERTS_CIO_EXPORT(erts_check_io)(int do_wait) #endif /* Figure out timeout value */ - if (do_wait) { - erts_time_remaining(&wait_time); - } else { /* poll only */ - wait_time.tv_sec = 0; - wait_time.tv_usec = 0; - } + timeout_time = (do_wait + ? erts_check_next_timeout_time(esdp) + : ERTS_POLL_NO_TIMEOUT /* poll only */); + + /* + * No need for an atomic inc op when incrementing + * erts_check_io_time, since only one thread can + * check io at a time. + */ + current_cio_time = erts_smp_atomic_read_dirty(&erts_check_io_time); + current_cio_time++; + erts_smp_atomic_set_relb(&erts_check_io_time, current_cio_time); + + check_cleanup_active_fds(current_cio_time); #ifdef ERTS_ENABLE_LOCK_CHECK erts_lc_check_exact(NULL, 0); /* No locks should be locked */ #endif - pollres_len = sizeof(pollres)/sizeof(ErtsPollResFd); + + pollres_len = erts_smp_atomic32_read_dirty(&pollset.active_fd.no) + ERTS_CHECK_IO_POLL_RES_LEN; + + pollres = erts_alloc(ERTS_ALC_T_TMP, sizeof(ErtsPollResFd)*pollres_len); erts_smp_atomic_set_nob(&pollset.in_poll_wait, 1); - poll_ret = ERTS_CIO_POLL_WAIT(pollset.ps, pollres, &pollres_len, &wait_time); + poll_ret = ERTS_CIO_POLL_WAIT(pollset.ps, pollres, &pollres_len, timeout_time); #ifdef ERTS_ENABLE_LOCK_CHECK erts_lc_check_exact(NULL, 0); /* No locks should be locked */ #endif - erts_deliver_time(); /* sync the machine's idea of time */ - #ifdef ERTS_BREAK_REQUESTED if (ERTS_BREAK_REQUESTED) erts_do_break_handling(); @@ -1196,6 +1657,7 @@ ERTS_CIO_EXPORT(erts_check_io)(int do_wait) if (poll_ret != 0) { erts_smp_atomic_set_nob(&pollset.in_poll_wait, 0); forget_removed(&pollset); + erts_free(ERTS_ALC_T_TMP, pollres); if (poll_ret == EAGAIN) { goto restart; } @@ -1255,15 +1717,15 @@ ERTS_CIO_EXPORT(erts_check_io)(int do_wait) if ((revents & ERTS_POLL_EV_IN) || (!(revents & ERTS_POLL_EV_OUT) && state->events & ERTS_POLL_EV_IN)) { - iready(state->driver.select->inport, state); + iready(state->driver.select->inport, state, current_cio_time); } else if (state->events & ERTS_POLL_EV_OUT) { - oready(state->driver.select->outport, state); + oready(state->driver.select->outport, state, current_cio_time); } } else if (revents & (ERTS_POLL_EV_IN|ERTS_POLL_EV_OUT)) { if (revents & ERTS_POLL_EV_OUT) { - oready(state->driver.select->outport, state); + oready(state->driver.select->outport, state, current_cio_time); } /* Someone might have deselected input since revents was read (true also on the non-smp emulator since @@ -1271,7 +1733,7 @@ ERTS_CIO_EXPORT(erts_check_io)(int do_wait) revents... */ revents &= ~(~state->events & ERTS_POLL_EV_IN); if (revents & ERTS_POLL_EV_IN) { - iready(state->driver.select->inport, state); + iready(state->driver.select->inport, state, current_cio_time); } } else if (revents & ERTS_POLL_EV_NVAL) { @@ -1279,6 +1741,7 @@ ERTS_CIO_EXPORT(erts_check_io)(int do_wait) state->driver.select->inport, state->driver.select->outport, state->events); + add_active_fd(state->fd); } break; } @@ -1296,8 +1759,7 @@ ERTS_CIO_EXPORT(erts_check_io)(int do_wait) if (revents) { event_data->events = state->events; event_data->revents = revents; - - eready(state->driver.event->port, state, event_data); + eready(state->driver.event->port, state, event_data, current_cio_time); } break; } @@ -1315,6 +1777,7 @@ ERTS_CIO_EXPORT(erts_check_io)(int do_wait) (int) state->type); ASSERT(0); deselect(state, 0); + add_active_fd(state->fd); break; } } @@ -1326,6 +1789,7 @@ ERTS_CIO_EXPORT(erts_check_io)(int do_wait) } erts_smp_atomic_set_nob(&pollset.in_poll_wait, 0); + erts_free(ERTS_ALC_T_TMP, pollres); forget_removed(&pollset); } @@ -1395,6 +1859,7 @@ stale_drv_select(Eterm id, ErtsDrvEventState *state, int mode) } #ifndef ERTS_SYS_CONTINOUS_FD_NUMBERS + static SafeHashValue drv_ev_state_hash(void *des) { SafeHashValue val = (SafeHashValue) ((ErtsDrvEventState *) des)->fd; @@ -1440,10 +1905,27 @@ static void drv_ev_state_free(void *des) void ERTS_CIO_EXPORT(erts_init_check_io)(void) { + erts_smp_atomic_init_nob(&erts_check_io_time, 0); erts_smp_atomic_init_nob(&pollset.in_poll_wait, 0); + ERTS_CIO_POLL_INIT(); pollset.ps = ERTS_CIO_NEW_POLLSET(); + pollset.active_fd.six = 0; + pollset.active_fd.eix = 0; + erts_smp_atomic32_init_nob(&pollset.active_fd.no, 0); + pollset.active_fd.size = ERTS_ACTIVE_FD_INC; + pollset.active_fd.array = erts_alloc(ERTS_ALC_T_ACTIVE_FD_ARR, + sizeof(ErtsSysFdType)*ERTS_ACTIVE_FD_INC); +#ifdef DEBUG + { + int i; + for (i = 0; i < ERTS_ACTIVE_FD_INC; i++) + pollset.active_fd.array[i] = ERTS_SYS_FD_INVALID; + } +#endif + + #ifdef ERTS_SMP init_removed_fd_alloc(); pollset.removed_list = NULL; @@ -1519,12 +2001,27 @@ Eterm ERTS_CIO_EXPORT(erts_check_io_info)(void *proc) { Process *p = (Process *) proc; - Eterm tags[15], values[15], res; + Eterm tags[16], values[16], res; Uint sz, *szp, *hp, **hpp, memory_size; Sint i; ErtsPollInfo pi; - - ERTS_CIO_POLL_INFO(pollset.ps, &pi); + erts_aint_t cio_time = erts_smp_atomic_read_acqb(&erts_check_io_time); + int active_fds = (int) erts_smp_atomic32_read_acqb(&pollset.active_fd.no); + + while (1) { + erts_aint_t post_cio_time; + int post_active_fds; + + ERTS_CIO_POLL_INFO(pollset.ps, &pi); + + post_cio_time = erts_smp_atomic_read_mb(&erts_check_io_time); + post_active_fds = (int) erts_smp_atomic32_read_acqb(&pollset.active_fd.no); + if (cio_time == post_cio_time && active_fds == post_active_fds) + break; + cio_time = post_cio_time; + active_fds = post_active_fds; + } + memory_size = pi.memory_size; #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS memory_size += sizeof(ErtsDrvEventState) * erts_smp_atomic_read_nob(&drv_ev_state_len); @@ -1588,6 +2085,9 @@ ERTS_CIO_EXPORT(erts_check_io_info)(void *proc) tags[i] = erts_bld_atom(hpp, szp, "max_fds"); values[i++] = erts_bld_uint(hpp, szp, (Uint) pi.max_fds); + tags[i] = erts_bld_atom(hpp, szp, "active_fds"); + values[i++] = erts_bld_uint(hpp, szp, (Uint) active_fds); + #ifdef ERTS_POLL_COUNT_AVOIDED_WAKEUPS tags[i] = erts_bld_atom(hpp, szp, "no_avoided_wakeups"); values[i++] = erts_bld_uint(hpp, szp, (Uint) pi.no_avoided_wakeups); @@ -1642,6 +2142,8 @@ print_events(ErtsPollEvents ev) typedef struct { int used_fds; int num_errors; + int no_driver_select_structs; + int no_driver_event_structs; #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS int internal_fds; ErtsPollEvents *epep; @@ -1664,6 +2166,13 @@ static void doit_erts_check_io_debug(void *vstate, void *vcounters) struct stat stat_buf; #endif + if (state->driver.select) + counters->no_driver_select_structs++; +#if ERTS_CIO_HAVE_DRV_EVENT + if (state->driver.event) + counters->no_driver_event_structs++; +#endif + #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS if (state->events || ep_events) { if (ep_events & ERTS_POLL_EV_NVAL) { @@ -1802,6 +2311,7 @@ static void doit_erts_check_io_debug(void *vstate, void *vcounters) } } } +#if ERTS_CIO_HAVE_DRV_EVENT else if (state->type == ERTS_EV_TYPE_DRV_EV) { Eterm id; erts_printf("driver_event "); @@ -1837,6 +2347,7 @@ static void doit_erts_check_io_debug(void *vstate, void *vcounters) erts_free_port_names(pnp); } } +#endif #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS else if (internal) { erts_printf("internal "); @@ -1876,18 +2387,24 @@ static void doit_erts_check_io_debug(void *vstate, void *vcounters) } int -ERTS_CIO_EXPORT(erts_check_io_debug)(void) +ERTS_CIO_EXPORT(erts_check_io_debug)(ErtsCheckIoDebugInfo *ciodip) { #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS int fd, len; #endif IterDebugCounters counters; +#ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS ErtsDrvEventState null_des; null_des.driver.select = NULL; +#if ERTS_CIO_HAVE_DRV_EVENT + null_des.driver.event = NULL; +#endif + null_des.driver.drv_ptr = NULL; null_des.events = 0; null_des.remove_cnt = 0; null_des.type = ERTS_EV_TYPE_NONE; +#endif erts_printf("--- fds in pollset --------------------------------------\n"); @@ -1904,6 +2421,8 @@ ERTS_CIO_EXPORT(erts_check_io_debug)(void) #endif counters.used_fds = 0; counters.num_errors = 0; + counters.no_driver_select_structs = 0; + counters.no_driver_event_structs = 0; #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS len = erts_smp_atomic_read_nob(&drv_ev_state_len); @@ -1920,8 +2439,16 @@ ERTS_CIO_EXPORT(erts_check_io_debug)(void) erts_smp_thr_progress_unblock(); + ciodip->no_used_fds = counters.used_fds; + ciodip->no_driver_select_structs = counters.no_driver_select_structs; + ciodip->no_driver_event_structs = counters.no_driver_event_structs; + erts_printf("\n"); erts_printf("used fds=%d\n", counters.used_fds); + erts_printf("Number of driver_select() structures=%d\n", counters.no_driver_select_structs); +#if ERTS_CIO_HAVE_DRV_EVENT + erts_printf("Number of driver_event() structures=%d\n", counters.no_driver_event_structs); +#endif #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS erts_printf("internal fds=%d\n", counters.internal_fds); #endif @@ -1930,6 +2457,7 @@ ERTS_CIO_EXPORT(erts_check_io_debug)(void) #ifdef ERTS_SYS_CONTINOUS_FD_NUMBERS erts_free(ERTS_ALC_T_TMP, (void *) counters.epep); #endif + return counters.num_errors; } diff --git a/erts/emulator/sys/common/erl_check_io.h b/erts/emulator/sys/common/erl_check_io.h index edab7947ba..14f1ea3f43 100644 --- a/erts/emulator/sys/common/erl_check_io.h +++ b/erts/emulator/sys/common/erl_check_io.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2006-2011. All Rights Reserved. + * Copyright Ericsson AB 2006-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -26,6 +27,7 @@ #ifndef ERL_CHECK_IO_H__ #define ERL_CHECK_IO_H__ +#include "sys.h" #include "erl_sys_driver.h" #ifdef ERTS_ENABLE_KERNEL_POLL @@ -46,14 +48,14 @@ void erts_check_io_async_sig_interrupt_nkp(void); #endif void erts_check_io_interrupt_kp(int); void erts_check_io_interrupt_nkp(int); -void erts_check_io_interrupt_timed_kp(int, erts_short_time_t); -void erts_check_io_interrupt_timed_nkp(int, erts_short_time_t); +void erts_check_io_interrupt_timed_kp(int, ErtsMonotonicTime); +void erts_check_io_interrupt_timed_nkp(int, ErtsMonotonicTime); void erts_check_io_kp(int); void erts_check_io_nkp(int); void erts_init_check_io_kp(void); void erts_init_check_io_nkp(void); -int erts_check_io_debug_kp(void); -int erts_check_io_debug_nkp(void); +int erts_check_io_debug_kp(ErtsCheckIoDebugInfo *); +int erts_check_io_debug_nkp(ErtsCheckIoDebugInfo *); #else /* !ERTS_ENABLE_KERNEL_POLL */ @@ -64,12 +66,33 @@ int erts_check_io_max_files(void); void erts_check_io_async_sig_interrupt(void); #endif void erts_check_io_interrupt(int); -void erts_check_io_interrupt_timed(int, erts_short_time_t); +void erts_check_io_interrupt_timed(int, ErtsMonotonicTime); void erts_check_io(int); void erts_init_check_io(void); #endif +extern erts_smp_atomic_t erts_check_io_time; + +typedef struct { + ErtsPortTaskHandle task; + erts_smp_atomic_t executed_time; +} ErtsIoTask; + +ERTS_GLB_INLINE void erts_io_notify_port_task_executed(ErtsPortTaskHandle *pthp); + +#if ERTS_GLB_INLINE_INCL_FUNC_DEF + +ERTS_GLB_INLINE void +erts_io_notify_port_task_executed(ErtsPortTaskHandle *pthp) +{ + ErtsIoTask *itp = (ErtsIoTask *) (((char *) pthp) - offsetof(ErtsIoTask, task)); + erts_aint_t ci_time = erts_smp_atomic_read_acqb(&erts_check_io_time); + erts_smp_atomic_set_relb(&itp->executed_time, ci_time); +} + +#endif + #endif /* ERL_CHECK_IO_H__ */ #if !defined(ERL_CHECK_IO_C__) && !defined(ERTS_ALLOC_C__) @@ -81,6 +104,16 @@ void erts_init_check_io(void); #include "erl_poll.h" #include "erl_port_task.h" +#ifdef __WIN32__ +/* + * Current erts_poll implementation for Windows cannot handle + * active events in the set of events polled. + */ +# define ERTS_CIO_DEFER_ACTIVE_EVENTS 1 +#else +# define ERTS_CIO_DEFER_ACTIVE_EVENTS 0 +#endif + /* * ErtsDrvEventDataState is used by driver_event() which is almost never * used. We allocate ErtsDrvEventDataState separate since we dont wan't @@ -91,13 +124,16 @@ typedef struct { Eterm port; ErlDrvEventData data; ErtsPollEvents removed_events; - ErtsPortTaskHandle task; +#if ERTS_CIO_DEFER_ACTIVE_EVENTS + ErtsPollEvents deferred_events; +#endif + ErtsIoTask iotask; } ErtsDrvEventDataState; typedef struct { Eterm inport; Eterm outport; - ErtsPortTaskHandle intask; - ErtsPortTaskHandle outtask; + ErtsIoTask iniotask; + ErtsIoTask outiotask; } ErtsDrvSelectDataState; #endif /* #ifndef ERL_CHECK_IO_INTERNAL__ */ diff --git a/erts/emulator/sys/common/erl_mmap.c b/erts/emulator/sys/common/erl_mmap.c new file mode 100644 index 0000000000..7bbb406f29 --- /dev/null +++ b/erts/emulator/sys/common/erl_mmap.c @@ -0,0 +1,2918 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2002-2016. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "sys.h" +#include "erl_process.h" +#include "erl_smp.h" +#include "atom.h" +#include "erl_mmap.h" +#include <stddef.h> + +#if HAVE_ERTS_MMAP + +/* #define ERTS_MMAP_OP_RINGBUF_SZ 100 */ + +#if defined(DEBUG) || 0 +# undef ERTS_MMAP_DEBUG +# define ERTS_MMAP_DEBUG +# ifndef ERTS_MMAP_OP_RINGBUF_SZ +# define ERTS_MMAP_OP_RINGBUF_SZ 100 +# endif +#endif + +#ifndef ERTS_MMAP_OP_RINGBUF_SZ +# define ERTS_MMAP_OP_RINGBUF_SZ 0 +#endif + +/* #define ERTS_MMAP_DEBUG_FILL_AREAS */ + +#ifdef ERTS_MMAP_DEBUG +# define ERTS_MMAP_ASSERT ERTS_ASSERT +#else +# define ERTS_MMAP_ASSERT(A) ((void) 1) +#endif + +/* + * `mm->sa.bot` and `mm->sua.top` are read only after + * initialization, but the other pointers are not; i.e., only + * ERTS_MMAP_IN_SUPERCARRIER() is allowed without the mutex held. + */ +#define ERTS_MMAP_IN_SUPERCARRIER(PTR) \ + (((UWord) (PTR)) - ((UWord) mm->sa.bot) \ + < ((UWord) mm->sua.top) - ((UWord) mm->sa.bot)) +#define ERTS_MMAP_IN_SUPERALIGNED_AREA(PTR) \ + (ERTS_SMP_LC_ASSERT(erts_lc_mtx_is_locked(&mm->mtx)), \ + (((UWord) (PTR)) - ((UWord) mm->sa.bot) \ + < ((UWord) mm->sa.top) - ((UWord) mm->sa.bot))) +#define ERTS_MMAP_IN_SUPERUNALIGNED_AREA(PTR) \ + (ERTS_SMP_LC_ASSERT(erts_lc_mtx_is_locked(&mm->mtx)), \ + (((UWord) (PTR)) - ((UWord) mm->sua.bot) \ + < ((UWord) mm->sua.top) - ((UWord) mm->sua.bot))) + +UWord erts_page_inv_mask; + +#if defined(DEBUG) || defined(ERTS_MMAP_DEBUG) +# undef RBT_DEBUG +# define RBT_DEBUG +#endif +#ifdef RBT_DEBUG +# define RBT_ASSERT ERTS_ASSERT +# define IF_RBT_DEBUG(C) C +#else +# define RBT_ASSERT(x) +# define IF_RBT_DEBUG(C) +#endif + +typedef struct RBTNode_ RBTNode; +struct RBTNode_ { + UWord parent_and_color; /* color in bit 0 of parent ptr */ + RBTNode *left; + RBTNode *right; +}; + +#define RED_FLG (1) +#define IS_RED(N) ((N) && ((N)->parent_and_color & RED_FLG)) +#define IS_BLACK(N) (!IS_RED(N)) +#define SET_RED(N) ((N)->parent_and_color |= RED_FLG) +#define SET_BLACK(N) ((N)->parent_and_color &= ~RED_FLG) + +static ERTS_INLINE RBTNode* parent(RBTNode* node) +{ + return (RBTNode*) (node->parent_and_color & ~RED_FLG); +} + +static ERTS_INLINE void set_parent(RBTNode* node, RBTNode* parent) +{ + RBT_ASSERT(!((UWord)parent & RED_FLG)); + node->parent_and_color = ((UWord)parent) | (node->parent_and_color & RED_FLG); +} + +static ERTS_INLINE UWord parent_and_color(RBTNode* parent, int color) +{ + RBT_ASSERT(!((UWord)parent & RED_FLG)); + RBT_ASSERT(!(color & ~RED_FLG)); + return ((UWord)parent) | color; +} + + +enum SortOrder { + ADDR_ORDER, /* only address order */ + SA_SZ_ADDR_ORDER, /* first super-aligned size then address order */ + SZ_REVERSE_ADDR_ORDER /* first size then reverse address order */ +}; +#ifdef HARD_DEBUG +static const char* sort_order_names[] = {"Address","SuperAlignedSize-Address","Size-RevAddress"}; +#endif + +typedef struct { + RBTNode* root; + enum SortOrder order; +}RBTree; + +#ifdef HARD_DEBUG +# define HARD_CHECK_IS_MEMBER(ROOT,NODE) rbt_assert_is_member(ROOT,NODE) +# define HARD_CHECK_TREE(TREE,SZ) check_tree(TREE, SZ) +static int rbt_assert_is_member(RBTNode* root, RBTNode* node); +static RBTNode* check_tree(RBTree* tree, Uint); +#else +# define HARD_CHECK_IS_MEMBER(ROOT,NODE) +# define HARD_CHECK_TREE(TREE,SZ) +#endif + +#if ERTS_MMAP_OP_RINGBUF_SZ + +static int mmap_op_ix; + +typedef enum { + ERTS_OP_TYPE_NONE, + ERTS_OP_TYPE_MMAP, + ERTS_OP_TYPE_MUNMAP, + ERTS_OP_TYPE_MREMAP +} ErtsMMapOpType; + +typedef struct { + ErtsMMapOpType type; + void *result; + UWord in_size; + UWord out_size; + void *old_ptr; + UWord old_size; +} ErtsMMapOp; + +static ErtsMMapOp mmap_ops[ERTS_MMAP_OP_RINGBUF_SZ]; + +#define ERTS_MMAP_OP_RINGBUF_INIT() \ + do { \ + int ix__; \ + for (ix__ = 0; ix__ < ERTS_MMAP_OP_RINGBUF_SZ; ix__++) {\ + mmap_ops[ix__].type = ERTS_OP_TYPE_NONE; \ + mmap_ops[ix__].result = NULL; \ + mmap_ops[ix__].in_size = 0; \ + mmap_ops[ix__].out_size = 0; \ + mmap_ops[ix__].old_ptr = NULL; \ + mmap_ops[ix__].old_size = 0; \ + } \ + mmap_op_ix = ERTS_MMAP_OP_RINGBUF_SZ-1; \ + } while (0) + +#define ERTS_MMAP_OP_START(SZ) \ + do { \ + int ix__; \ + if (++mmap_op_ix >= ERTS_MMAP_OP_RINGBUF_SZ) \ + mmap_op_ix = 0; \ + ix__ = mmap_op_ix; \ + mmap_ops[ix__].type = ERTS_OP_TYPE_MMAP; \ + mmap_ops[ix__].result = NULL; \ + mmap_ops[ix__].in_size = (SZ); \ + mmap_ops[ix__].out_size = 0; \ + mmap_ops[ix__].old_ptr = NULL; \ + mmap_ops[ix__].old_size = 0; \ + } while (0) + +#define ERTS_MMAP_OP_END(PTR, SZ) \ + do { \ + int ix__ = mmap_op_ix; \ + mmap_ops[ix__].result = (PTR); \ + mmap_ops[ix__].out_size = (SZ); \ + } while (0) + +#define ERTS_MMAP_OP_LCK(RES, IN_SZ, OUT_SZ) \ + do { \ + erts_smp_mtx_lock(&mm->mtx); \ + ERTS_MMAP_OP_START((IN_SZ)); \ + ERTS_MMAP_OP_END((RES), (OUT_SZ)); \ + erts_smp_mtx_unlock(&mm->mtx); \ + } while (0) + +#define ERTS_MUNMAP_OP(PTR, SZ) \ + do { \ + int ix__; \ + if (++mmap_op_ix >= ERTS_MMAP_OP_RINGBUF_SZ) \ + mmap_op_ix = 0; \ + ix__ = mmap_op_ix; \ + mmap_ops[ix__].type = ERTS_OP_TYPE_MUNMAP; \ + mmap_ops[ix__].result = NULL; \ + mmap_ops[ix__].in_size = 0; \ + mmap_ops[ix__].out_size = 0; \ + mmap_ops[ix__].old_ptr = (PTR); \ + mmap_ops[ix__].old_size = (SZ); \ + } while (0) + +#define ERTS_MUNMAP_OP_LCK(PTR, SZ) \ + do { \ + erts_smp_mtx_lock(&mm->mtx); \ + ERTS_MUNMAP_OP((PTR), (SZ)); \ + erts_smp_mtx_unlock(&mm->mtx); \ + } while (0) + +#define ERTS_MREMAP_OP_START(OLD_PTR, OLD_SZ, IN_SZ) \ + do { \ + int ix__; \ + if (++mmap_op_ix >= ERTS_MMAP_OP_RINGBUF_SZ) \ + mmap_op_ix = 0; \ + ix__ = mmap_op_ix; \ + mmap_ops[ix__].type = ERTS_OP_TYPE_MREMAP; \ + mmap_ops[ix__].result = NULL; \ + mmap_ops[ix__].in_size = (IN_SZ); \ + mmap_ops[ix__].out_size = (OLD_SZ); \ + mmap_ops[ix__].old_ptr = (OLD_PTR); \ + mmap_ops[ix__].old_size = (OLD_SZ); \ + } while (0) + +#define ERTS_MREMAP_OP_END(PTR, SZ) \ + do { \ + int ix__ = mmap_op_ix; \ + mmap_ops[ix__].result = (PTR); \ + mmap_ops[mmap_op_ix].out_size = (SZ); \ + } while (0) + +#define ERTS_MREMAP_OP_LCK(RES, OLD_PTR, OLD_SZ, IN_SZ, OUT_SZ) \ + do { \ + erts_smp_mtx_lock(&mm->mtx); \ + ERTS_MREMAP_OP_START((OLD_PTR), (OLD_SZ), (IN_SZ)); \ + ERTS_MREMAP_OP_END((RES), (OUT_SZ)); \ + erts_smp_mtx_unlock(&mm->mtx); \ + } while (0) + +#define ERTS_MMAP_OP_ABORT() \ + do { \ + int ix__ = mmap_op_ix; \ + mmap_ops[ix__].type = ERTS_OP_TYPE_NONE; \ + mmap_ops[ix__].result = NULL; \ + mmap_ops[ix__].in_size = 0; \ + mmap_ops[ix__].out_size = 0; \ + mmap_ops[ix__].old_ptr = NULL; \ + mmap_ops[ix__].old_size = 0; \ + if (--mmap_op_ix < 0) \ + mmap_op_ix = ERTS_MMAP_OP_RINGBUF_SZ-1; \ + } while (0) + +#else + +#define ERTS_MMAP_OP_RINGBUF_INIT() +#define ERTS_MMAP_OP_START(SZ) +#define ERTS_MMAP_OP_END(PTR, SZ) +#define ERTS_MMAP_OP_LCK(RES, IN_SZ, OUT_SZ) +#define ERTS_MUNMAP_OP(PTR, SZ) +#define ERTS_MUNMAP_OP_LCK(PTR, SZ) +#define ERTS_MREMAP_OP_START(OLD_PTR, OLD_SZ, IN_SZ) +#define ERTS_MREMAP_OP_END(PTR, SZ) +#define ERTS_MREMAP_OP_LCK(RES, OLD_PTR, OLD_SZ, IN_SZ, OUT_SZ) +#define ERTS_MMAP_OP_ABORT() + +#endif + +typedef struct { + RBTNode snode; /* node in 'stree' */ + RBTNode anode; /* node in 'atree' */ + char* start; + char* end; +}ErtsFreeSegDesc; + +typedef struct { + RBTree stree; /* size ordered tree */ + RBTree atree; /* address ordered tree */ + Uint nseg; +}ErtsFreeSegMap; + +struct ErtsMemMapper_ { + int (*reserve_physical)(char *, UWord, int exec); + void (*unreserve_physical)(char *, UWord); + int supercarrier; + int no_os_mmap; + int executable; /* is client a native code allocator? */ + /* + * Super unaligned area is located above super aligned + * area. That is, `sa.bot` is beginning of the super + * carrier, `sua.top` is the end of the super carrier, + * and sa.top and sua.bot moves towards eachother. + */ + struct { + char *top; + char *bot; + ErtsFreeSegMap map; + } sua; + struct { + char *top; + char *bot; + ErtsFreeSegMap map; + } sa; +#if HAVE_MMAP && (!defined(MAP_ANON) && !defined(MAP_ANONYMOUS)) + int mmap_fd; +#endif + erts_smp_mtx_t mtx; + struct { + char *free_list; + char *unused_start; + char *unused_end; + char *new_area_hint; + Uint reserved; + } desc; + struct { + UWord free_seg_descs; + struct { + UWord curr; + UWord max; + } free_segs; + } no; + struct { + struct { + UWord total; + struct { + UWord total; + UWord sa; + UWord sua; + } used; + } supercarrier; + struct { + UWord used; + } os; + } size; +}; + +ErtsMemMapper erts_dflt_mmapper; + +#if defined(ARCH_64) && defined(ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION) +ErtsMemMapper erts_literal_mmapper; +char* erts_literals_start; +UWord erts_literals_size; +#endif + +#ifdef ERTS_ALC_A_EXEC +ErtsMemMapper erts_exec_mmapper; +#endif + + + +#define ERTS_MMAP_SIZE_SC_SA_INC(SZ) \ + do { \ + mm->size.supercarrier.used.total += (SZ); \ + mm->size.supercarrier.used.sa += (SZ); \ + ERTS_MMAP_ASSERT(mm->size.supercarrier.used.total \ + <= mm->size.supercarrier.total); \ + ERTS_MMAP_ASSERT(mm->size.supercarrier.used.sa \ + <= mm->size.supercarrier.used.total); \ + } while (0) +#define ERTS_MMAP_SIZE_SC_SA_DEC(SZ) \ + do { \ + ERTS_MMAP_ASSERT(mm->size.supercarrier.used.total >= (SZ)); \ + mm->size.supercarrier.used.total -= (SZ); \ + ERTS_MMAP_ASSERT(mm->size.supercarrier.used.sa >= (SZ)); \ + mm->size.supercarrier.used.sa -= (SZ); \ + } while (0) +#define ERTS_MMAP_SIZE_SC_SUA_INC(SZ) \ + do { \ + mm->size.supercarrier.used.total += (SZ); \ + mm->size.supercarrier.used.sua += (SZ); \ + ERTS_MMAP_ASSERT(mm->size.supercarrier.used.total \ + <= mm->size.supercarrier.total); \ + ERTS_MMAP_ASSERT(mm->size.supercarrier.used.sua \ + <= mm->size.supercarrier.used.total); \ + } while (0) +#define ERTS_MMAP_SIZE_SC_SUA_DEC(SZ) \ + do { \ + ERTS_MMAP_ASSERT(mm->size.supercarrier.used.total >= (SZ)); \ + mm->size.supercarrier.used.total -= (SZ); \ + ERTS_MMAP_ASSERT(mm->size.supercarrier.used.sua >= (SZ)); \ + mm->size.supercarrier.used.sua -= (SZ); \ + } while (0) +#define ERTS_MMAP_SIZE_OS_INC(SZ) \ + do { \ + ERTS_MMAP_ASSERT(mm->size.os.used + (SZ) >= (SZ)); \ + mm->size.os.used += (SZ); \ + } while (0) +#define ERTS_MMAP_SIZE_OS_DEC(SZ) \ + do { \ + ERTS_MMAP_ASSERT(mm->size.os.used >= (SZ)); \ + mm->size.os.used -= (SZ); \ + } while (0) + + +static void +add_free_desc_area(ErtsMemMapper* mm, char *start, char *end) +{ + ERTS_MMAP_ASSERT(end == (void *) 0 || end > start); + if (sizeof(ErtsFreeSegDesc) <= ((UWord) end) - ((UWord) start)) { + UWord no; + ErtsFreeSegDesc *prev_desc, *desc; + char *desc_end; + + no = 1; + prev_desc = (ErtsFreeSegDesc *) start; + prev_desc->start = mm->desc.free_list; + desc = (ErtsFreeSegDesc *) (start + sizeof(ErtsFreeSegDesc)); + desc_end = start + 2*sizeof(ErtsFreeSegDesc); + + while (desc_end <= end) { + desc->start = (char *) prev_desc; + prev_desc = desc; + desc = (ErtsFreeSegDesc *) desc_end; + desc_end += sizeof(ErtsFreeSegDesc); + no++; + } + mm->desc.free_list = (char *) prev_desc; + mm->no.free_seg_descs += no; + } +} + +static ErtsFreeSegDesc * +add_unused_free_desc_area(ErtsMemMapper* mm) +{ + char *ptr; + if (!mm->desc.unused_start) + return NULL; + + ERTS_MMAP_ASSERT(mm->desc.unused_end); + ERTS_MMAP_ASSERT(ERTS_PAGEALIGNED_SIZE + <= mm->desc.unused_end - mm->desc.unused_start); + + ptr = mm->desc.unused_start + ERTS_PAGEALIGNED_SIZE; + add_free_desc_area(mm, mm->desc.unused_start, ptr); + + if ((mm->desc.unused_end - ptr) >= ERTS_PAGEALIGNED_SIZE) + mm->desc.unused_start = ptr; + else + mm->desc.unused_end = mm->desc.unused_start = NULL; + + ERTS_MMAP_ASSERT(mm->desc.free_list); + return (ErtsFreeSegDesc *) mm->desc.free_list; +} + +static ERTS_INLINE ErtsFreeSegDesc * +alloc_desc(ErtsMemMapper* mm) +{ + ErtsFreeSegDesc *res; + res = (ErtsFreeSegDesc *) mm->desc.free_list; + if (!res) { + res = add_unused_free_desc_area(mm); + if (!res) + return NULL; + } + mm->desc.free_list = res->start; + ASSERT(mm->no.free_segs.curr < mm->no.free_seg_descs); + mm->no.free_segs.curr++; + if (mm->no.free_segs.max < mm->no.free_segs.curr) + mm->no.free_segs.max = mm->no.free_segs.curr; + return res; +} + +static ERTS_INLINE void +free_desc(ErtsMemMapper* mm, ErtsFreeSegDesc *desc) +{ + desc->start = mm->desc.free_list; + mm->desc.free_list = (char *) desc; + ERTS_MMAP_ASSERT(mm->no.free_segs.curr > 0); + mm->no.free_segs.curr--; +} + +static ERTS_INLINE ErtsFreeSegDesc* anode_to_desc(RBTNode* anode) +{ + return (ErtsFreeSegDesc*) ((char*)anode - offsetof(ErtsFreeSegDesc, anode)); +} + +static ERTS_INLINE ErtsFreeSegDesc* snode_to_desc(RBTNode* snode) +{ + return (ErtsFreeSegDesc*) ((char*)snode - offsetof(ErtsFreeSegDesc, snode)); +} + +static ERTS_INLINE ErtsFreeSegDesc* node_to_desc(enum SortOrder order, RBTNode* node) +{ + return order==ADDR_ORDER ? anode_to_desc(node) : snode_to_desc(node); +} + +static ERTS_INLINE SWord usable_size(enum SortOrder order, + ErtsFreeSegDesc* desc) +{ + return ((order == SA_SZ_ADDR_ORDER) ? + ERTS_SUPERALIGNED_FLOOR(desc->end) - ERTS_SUPERALIGNED_CEILING(desc->start) + : desc->end - desc->start); +} + +#ifdef HARD_DEBUG +static ERTS_INLINE SWord cmp_nodes(enum SortOrder order, + RBTNode* lhs, RBTNode* rhs) +{ + ErtsFreeSegDesc* ldesc = node_to_desc(order, lhs); + ErtsFreeSegDesc* rdesc = node_to_desc(order, rhs); + RBT_ASSERT(lhs != rhs); + if (order != ADDR_ORDER) { + SWord diff = usable_size(order, ldesc) - usable_size(order, rdesc); + if (diff) return diff; + } + if (order != SZ_REVERSE_ADDR_ORDER) { + return (char*)ldesc->start - (char*)rdesc->start; + } + else { + return (char*)rdesc->start - (char*)ldesc->start; + } +} +#endif /* HARD_DEBUG */ + +static ERTS_INLINE SWord cmp_with_node(enum SortOrder order, + SWord sz, char* addr, RBTNode* rhs) +{ + ErtsFreeSegDesc* rdesc; + if (order != ADDR_ORDER) { + SWord diff; + rdesc = snode_to_desc(rhs); + diff = sz - usable_size(order, rdesc); + if (diff) return diff; + } + else + rdesc = anode_to_desc(rhs); + + if (order != SZ_REVERSE_ADDR_ORDER) + return addr - (char*)rdesc->start; + else + return (char*)rdesc->start - addr; +} + + +static ERTS_INLINE void +left_rotate(RBTNode **root, RBTNode *x) +{ + RBTNode *y = x->right; + x->right = y->left; + if (y->left) + set_parent(y->left, x); + set_parent(y, parent(x)); + if (!parent(y)) { + RBT_ASSERT(*root == x); + *root = y; + } + else if (x == parent(x)->left) + parent(x)->left = y; + else { + RBT_ASSERT(x == parent(x)->right); + parent(x)->right = y; + } + y->left = x; + set_parent(x, y); +} + +static ERTS_INLINE void +right_rotate(RBTNode **root, RBTNode *x) +{ + RBTNode *y = x->left; + x->left = y->right; + if (y->right) + set_parent(y->right, x); + set_parent(y, parent(x)); + if (!parent(y)) { + RBT_ASSERT(*root == x); + *root = y; + } + else if (x == parent(x)->right) + parent(x)->right = y; + else { + RBT_ASSERT(x == parent(x)->left); + parent(x)->left = y; + } + y->right = x; + set_parent(x, y); +} + +/* + * Replace node x with node y + * NOTE: segment descriptor of y is not changed + */ +static ERTS_INLINE void +replace(RBTNode **root, RBTNode *x, RBTNode *y) +{ + + if (!parent(x)) { + RBT_ASSERT(*root == x); + *root = y; + } + else if (x == parent(x)->left) + parent(x)->left = y; + else { + RBT_ASSERT(x == parent(x)->right); + parent(x)->right = y; + } + if (x->left) { + RBT_ASSERT(parent(x->left) == x); + set_parent(x->left, y); + } + if (x->right) { + RBT_ASSERT(parent(x->right) == x); + set_parent(x->right, y); + } + + y->parent_and_color = x->parent_and_color; + y->right = x->right; + y->left = x->left; +} + +static void +tree_insert_fixup(RBTNode** root, RBTNode *node) +{ + RBTNode *x = node, *y, *papa_x, *granpa_x; + + /* + * Rearrange the tree so that it satisfies the Red-Black Tree properties + */ + + papa_x = parent(x); + RBT_ASSERT(x != *root && IS_RED(papa_x)); + do { + + /* + * x and its parent are both red. Move the red pair up the tree + * until we get to the root or until we can separate them. + */ + + granpa_x = parent(papa_x); + RBT_ASSERT(IS_RED(x)); + RBT_ASSERT(IS_BLACK(granpa_x)); + RBT_ASSERT(granpa_x); + + if (papa_x == granpa_x->left) { + y = granpa_x->right; + if (IS_RED(y)) { + SET_BLACK(y); + SET_BLACK(papa_x); + SET_RED(granpa_x); + x = granpa_x; + } + else { + + if (x == papa_x->right) { + left_rotate(root, papa_x); + papa_x = x; + x = papa_x->left; + } + + RBT_ASSERT(x == granpa_x->left->left); + RBT_ASSERT(IS_RED(x)); + RBT_ASSERT(IS_RED(papa_x)); + RBT_ASSERT(IS_BLACK(granpa_x)); + RBT_ASSERT(IS_BLACK(y)); + + SET_BLACK(papa_x); + SET_RED(granpa_x); + right_rotate(root, granpa_x); + + RBT_ASSERT(x == parent(x)->left); + RBT_ASSERT(IS_RED(x)); + RBT_ASSERT(IS_RED(parent(x)->right)); + RBT_ASSERT(IS_BLACK(parent(x))); + break; + } + } + else { + RBT_ASSERT(papa_x == granpa_x->right); + y = granpa_x->left; + if (IS_RED(y)) { + SET_BLACK(y); + SET_BLACK(papa_x); + SET_RED(granpa_x); + x = granpa_x; + } + else { + + if (x == papa_x->left) { + right_rotate(root, papa_x); + papa_x = x; + x = papa_x->right; + } + + RBT_ASSERT(x == granpa_x->right->right); + RBT_ASSERT(IS_RED(x)); + RBT_ASSERT(IS_RED(papa_x)); + RBT_ASSERT(IS_BLACK(granpa_x)); + RBT_ASSERT(IS_BLACK(y)); + + SET_BLACK(papa_x); + SET_RED(granpa_x); + left_rotate(root, granpa_x); + + RBT_ASSERT(x == parent(x)->right); + RBT_ASSERT(IS_RED(x)); + RBT_ASSERT(IS_RED(parent(x)->left)); + RBT_ASSERT(IS_BLACK(parent(x))); + break; + } + } + } while (x != *root && (papa_x=parent(x), IS_RED(papa_x))); + + SET_BLACK(*root); +} + +static void +rbt_delete(RBTree* tree, RBTNode* del) +{ + Uint spliced_is_black; + RBTNode *x, *y, *z = del, *papa_y; + RBTNode null_x; /* null_x is used to get the fixup started when we + splice out a node without children. */ + + HARD_CHECK_IS_MEMBER(tree->root, del); + HARD_CHECK_TREE(tree, 0); + + null_x.parent_and_color = parent_and_color(NULL, !RED_FLG); + + /* Remove node from tree... */ + + /* Find node to splice out */ + if (!z->left || !z->right) + y = z; + else + /* Set y to z:s successor */ + for(y = z->right; y->left; y = y->left); + /* splice out y */ + x = y->left ? y->left : y->right; + spliced_is_black = IS_BLACK(y); + papa_y = parent(y); + if (x) { + set_parent(x, papa_y); + } + else if (spliced_is_black) { + x = &null_x; + x->right = x->left = NULL; + x->parent_and_color = parent_and_color(papa_y, !RED_FLG); + y->left = x; + } + + if (!papa_y) { + RBT_ASSERT(tree->root == y); + tree->root = x; + } + else { + if (y == papa_y->left) { + papa_y->left = x; + } + else { + RBT_ASSERT(y == papa_y->right); + papa_y->right = x; + } + } + if (y != z) { + /* We spliced out the successor of z; replace z by the successor */ + RBT_ASSERT(z != &null_x); + replace(&tree->root, z, y); + } + + if (spliced_is_black) { + RBTNode* papa_x; + /* We removed a black node which makes the resulting tree + violate the Red-Black Tree properties. Fixup tree... */ + + papa_x = parent(x); + while (IS_BLACK(x) && papa_x) { + + /* + * x has an "extra black" which we move up the tree + * until we reach the root or until we can get rid of it. + * + * y is the sibbling of x + */ + + if (x == papa_x->left) { + y = papa_x->right; + RBT_ASSERT(y); + if (IS_RED(y)) { + RBT_ASSERT(y->right); + RBT_ASSERT(y->left); + SET_BLACK(y); + RBT_ASSERT(IS_BLACK(papa_x)); + SET_RED(papa_x); + left_rotate(&tree->root, papa_x); + RBT_ASSERT(papa_x == parent(x)); + y = papa_x->right; + } + RBT_ASSERT(y); + RBT_ASSERT(IS_BLACK(y)); + if (IS_BLACK(y->left) && IS_BLACK(y->right)) { + SET_RED(y); + } + else { + if (IS_BLACK(y->right)) { + SET_BLACK(y->left); + SET_RED(y); + right_rotate(&tree->root, y); + RBT_ASSERT(papa_x == parent(x)); + y = papa_x->right; + } + RBT_ASSERT(y); + if (IS_RED(papa_x)) { + + SET_BLACK(papa_x); + SET_RED(y); + } + RBT_ASSERT(y->right); + SET_BLACK(y->right); + left_rotate(&tree->root, papa_x); + x = tree->root; + break; + } + } + else { + RBT_ASSERT(x == papa_x->right); + y = papa_x->left; + RBT_ASSERT(y); + if (IS_RED(y)) { + RBT_ASSERT(y->right); + RBT_ASSERT(y->left); + SET_BLACK(y); + RBT_ASSERT(IS_BLACK(papa_x)); + SET_RED(papa_x); + right_rotate(&tree->root, papa_x); + RBT_ASSERT(papa_x == parent(x)); + y = papa_x->left; + } + RBT_ASSERT(y); + RBT_ASSERT(IS_BLACK(y)); + if (IS_BLACK(y->right) && IS_BLACK(y->left)) { + SET_RED(y); + } + else { + if (IS_BLACK(y->left)) { + SET_BLACK(y->right); + SET_RED(y); + left_rotate(&tree->root, y); + RBT_ASSERT(papa_x == parent(x)); + y = papa_x->left; + } + RBT_ASSERT(y); + if (IS_RED(papa_x)) { + SET_BLACK(papa_x); + SET_RED(y); + } + RBT_ASSERT(y->left); + SET_BLACK(y->left); + right_rotate(&tree->root, papa_x); + x = tree->root; + break; + } + } + x = papa_x; + papa_x = parent(x); + } + SET_BLACK(x); + + papa_x = parent(&null_x); + if (papa_x) { + if (papa_x->left == &null_x) + papa_x->left = NULL; + else { + RBT_ASSERT(papa_x->right == &null_x); + papa_x->right = NULL; + } + RBT_ASSERT(!null_x.left); + RBT_ASSERT(!null_x.right); + } + else if (tree->root == &null_x) { + tree->root = NULL; + RBT_ASSERT(!null_x.left); + RBT_ASSERT(!null_x.right); + } + } + HARD_CHECK_TREE(tree, 0); +} + + +static void +rbt_insert(RBTree* tree, RBTNode* node) +{ +#ifdef RBT_DEBUG + ErtsFreeSegDesc *dbg_under=NULL, *dbg_over=NULL; +#endif + ErtsFreeSegDesc* desc = node_to_desc(tree->order, node); + char* seg_addr = desc->start; + SWord seg_sz = desc->end - desc->start; + + HARD_CHECK_TREE(tree, 0); + + node->left = NULL; + node->right = NULL; + + if (!tree->root) { + node->parent_and_color = parent_and_color(NULL, !RED_FLG); + tree->root = node; + } + else { + RBTNode *x = tree->root; + while (1) { + SWord diff = cmp_with_node(tree->order, seg_sz, seg_addr, x); + if (diff < 0) { + IF_RBT_DEBUG(dbg_over = node_to_desc(tree->order, x)); + if (!x->left) { + node->parent_and_color = parent_and_color(x, RED_FLG); + x->left = node; + break; + } + x = x->left; + } + else { + RBT_ASSERT(diff > 0); + IF_RBT_DEBUG(dbg_under = node_to_desc(tree->order, x)); + if (!x->right) { + node->parent_and_color = parent_and_color(x, RED_FLG); + x->right = node; + break; + } + x = x->right; + } + } + + RBT_ASSERT(parent(node)); +#ifdef RBT_DEBUG + if (tree->order == ADDR_ORDER) { + RBT_ASSERT(!dbg_under || dbg_under->end < desc->start); + RBT_ASSERT(!dbg_over || dbg_over->start > desc->end); + } +#endif + RBT_ASSERT(IS_RED(node)); + if (IS_RED(parent(node))) + tree_insert_fixup(&tree->root, node); + } + HARD_CHECK_TREE(tree, 0); +} + +/* + * Traverse tree in (reverse) sorting order + */ +static void +rbt_foreach_node(RBTree* tree, + void (*fn)(RBTNode*,void*), + void* arg, int reverse) +{ +#ifdef HARD_DEBUG + Sint blacks = -1; + Sint curr_blacks = 1; + Uint depth = 1; + Uint max_depth = 0; + Uint node_cnt = 0; +#endif + enum { RECURSE_LEFT, DO_NODE, RECURSE_RIGHT, RETURN_TO_PARENT }state; + RBTNode *x = tree->root; + + RBT_ASSERT(!x || !parent(x)); + + state = reverse ? RECURSE_RIGHT : RECURSE_LEFT; + while (x) { + switch (state) { + case RECURSE_LEFT: + if (x->left) { + RBT_ASSERT(parent(x->left) == x); + #ifdef HARD_DEBUG + ++depth; + if (IS_BLACK(x->left)) + curr_blacks++; + #endif + x = x->left; + state = reverse ? RECURSE_RIGHT : RECURSE_LEFT; + } + else { + #ifdef HARD_DEBUG + if (blacks < 0) + blacks = curr_blacks; + RBT_ASSERT(blacks == curr_blacks); + #endif + state = reverse ? RETURN_TO_PARENT : DO_NODE; + } + break; + + case DO_NODE: + #ifdef HARD_DEBUG + ++node_cnt; + if (depth > max_depth) + max_depth = depth; + #endif + (*fn) (x, arg); /* Do it! */ + state = reverse ? RECURSE_LEFT : RECURSE_RIGHT; + break; + + case RECURSE_RIGHT: + if (x->right) { + RBT_ASSERT(parent(x->right) == x); + #ifdef HARD_DEBUG + ++depth; + if (IS_BLACK(x->right)) + curr_blacks++; + #endif + x = x->right; + state = reverse ? RECURSE_RIGHT : RECURSE_LEFT; + } + else { + #ifdef HARD_DEBUG + if (blacks < 0) + blacks = curr_blacks; + RBT_ASSERT(blacks == curr_blacks); + #endif + state = reverse ? DO_NODE : RETURN_TO_PARENT; + } + break; + + case RETURN_TO_PARENT: + #ifdef HARD_DEBUG + if (IS_BLACK(x)) + curr_blacks--; + --depth; + #endif + if (parent(x)) { + if (x == parent(x)->left) { + state = reverse ? RETURN_TO_PARENT : DO_NODE; + } + else { + RBT_ASSERT(x == parent(x)->right); + state = reverse ? DO_NODE : RETURN_TO_PARENT; + } + } + x = parent(x); + break; + } + } +#ifdef HARD_DEBUG + RBT_ASSERT(depth == 0 || (!tree->root && depth==1)); + RBT_ASSERT(curr_blacks == 0); + RBT_ASSERT((1 << (max_depth/2)) <= node_cnt); +#endif +} + +#if defined(RBT_DEBUG) || defined(HARD_DEBUG_MSEG) +static RBTNode* rbt_prev_node(RBTNode* node) +{ + RBTNode* x; + if (node->left) { + for (x=node->left; x->right; x=x->right) + ; + return x; + } + for (x=node; parent(x); x=parent(x)) { + if (parent(x)->right == x) + return parent(x); + } + return NULL; +} +static RBTNode* rbt_next_node(RBTNode* node) +{ + RBTNode* x; + if (node->right) { + for (x=node->right; x->left; x=x->left) + ; + return x; + } + for (x=node; parent(x); x=parent(x)) { + if (parent(x)->left == x) + return parent(x); + } + return NULL; +} +#endif /* RBT_DEBUG || HARD_DEBUG_MSEG */ + + +/* The API to keep track of a bunch of separated (free) segments + (non-overlapping and non-adjacent). + */ +static void init_free_seg_map(ErtsFreeSegMap*, enum SortOrder); +static void adjacent_free_seg(ErtsFreeSegMap*, char* start, char* end, + ErtsFreeSegDesc** under, ErtsFreeSegDesc** over); +static void insert_free_seg(ErtsFreeSegMap*, ErtsFreeSegDesc*, char* start, char* end); +static void resize_free_seg(ErtsFreeSegMap*, ErtsFreeSegDesc*, char* start, char* end); +static void delete_free_seg(ErtsFreeSegMap*, ErtsFreeSegDesc*); +static ErtsFreeSegDesc* lookup_free_seg(ErtsFreeSegMap*, SWord sz); + + +static void init_free_seg_map(ErtsFreeSegMap* map, enum SortOrder order) +{ + map->atree.root = NULL; + map->atree.order = ADDR_ORDER; + map->stree.root = NULL; + map->stree.order = order; + map->nseg = 0; +} + +/* Lookup directly adjacent free segments to the given area [start->end]. + * The given area must not contain any free segments. + */ +static void adjacent_free_seg(ErtsFreeSegMap* map, char* start, char* end, + ErtsFreeSegDesc** under, ErtsFreeSegDesc** over) +{ + RBTNode* x = map->atree.root; + + *under = NULL; + *over = NULL; + while (x) { + if (start < anode_to_desc(x)->start) { + RBT_ASSERT(end <= anode_to_desc(x)->start); + if (end == anode_to_desc(x)->start) { + RBT_ASSERT(!*over); + *over = anode_to_desc(x); + } + x = x->left; + } + else { + RBT_ASSERT(start >= anode_to_desc(x)->end); + if (start == anode_to_desc(x)->end) { + RBT_ASSERT(!*under); + *under = anode_to_desc(x); + } + x = x->right; + } + } +} + +/* Initialize 'desc' and insert as new free segment [start->end]. + * The new segment must not contain or be adjacent to any free segment in 'map'. + */ +static void insert_free_seg(ErtsFreeSegMap* map, ErtsFreeSegDesc* desc, + char* start, char* end) +{ + desc->start = start; + desc->end = end; + rbt_insert(&map->atree, &desc->anode); + rbt_insert(&map->stree, &desc->snode); + map->nseg++; +} + +/* Resize existing free segment 'desc' to [start->end]. + * The new segment location must overlap the old location and + * it must not contain or be adjacent to any other free segment in 'map'. + */ +static void resize_free_seg(ErtsFreeSegMap* map, ErtsFreeSegDesc* desc, + char* start, char* end) +{ +#ifdef RBT_DEBUG + RBTNode* prev = rbt_prev_node(&desc->anode); + RBTNode* next = rbt_next_node(&desc->anode); + RBT_ASSERT(!prev || anode_to_desc(prev)->end < start); + RBT_ASSERT(!next || anode_to_desc(next)->start > end); +#endif + rbt_delete(&map->stree, &desc->snode); + desc->start = start; + desc->end = end; + rbt_insert(&map->stree, &desc->snode); +} + +/* Delete existing free segment 'desc' from 'map'. + */ +static void delete_free_seg(ErtsFreeSegMap* map, ErtsFreeSegDesc* desc) +{ + rbt_delete(&map->atree, &desc->anode); + rbt_delete(&map->stree, &desc->snode); + map->nseg--; +} + +/* Lookup a free segment in 'map' with a size of at least 'need_sz' usable bytes. + */ +static ErtsFreeSegDesc* lookup_free_seg(ErtsFreeSegMap* map, SWord need_sz) +{ + RBTNode* x = map->stree.root; + ErtsFreeSegDesc* best_desc = NULL; + const enum SortOrder order = map->stree.order; + + while (x) { + ErtsFreeSegDesc* desc = snode_to_desc(x); + SWord seg_sz = usable_size(order, desc); + + if (seg_sz < need_sz) { + x = x->right; + } + else { + best_desc = desc; + x = x->left; + } + } + return best_desc; +} + +struct build_arg_t +{ + Process* p; + Eterm* hp; + Eterm acc; +}; + +static void build_free_seg_tuple(RBTNode* node, void* arg) +{ + struct build_arg_t* a = (struct build_arg_t*)arg; + ErtsFreeSegDesc* desc = anode_to_desc(node); + Eterm start= erts_bld_uword(&a->hp, NULL, (UWord)desc->start); + Eterm end = erts_bld_uword(&a->hp, NULL, (UWord)desc->end); + Eterm tpl = TUPLE2(a->hp, start, end); + + a->hp += 3; + a->acc = CONS(a->hp, tpl, a->acc); + a->hp += 2; +} + +static +Eterm build_free_seg_list(Process* p, ErtsFreeSegMap* map) +{ + struct build_arg_t barg; + Eterm* hp_end; + const Uint may_need = map->nseg * (2 + 3 + 2*2); /* cons + tuple + bigs */ + + barg.p = p; + barg.hp = HAlloc(p, may_need); + hp_end = barg.hp + may_need; + barg.acc = NIL; + rbt_foreach_node(&map->atree, build_free_seg_tuple, &barg, 1); + + RBT_ASSERT(barg.hp <= hp_end); + HRelease(p, hp_end, barg.hp); + return barg.acc; +} + +#if ERTS_HAVE_OS_MMAP +/* Implementation of os_mmap()/os_munmap()/os_mremap()... */ + +#if HAVE_MMAP +# define ERTS_MMAP_PROT (PROT_READ|PROT_WRITE) +# define ERTS_MMAP_PROT_EXEC (PROT_READ|PROT_WRITE|PROT_EXEC) +# if defined(MAP_ANONYMOUS) +# define ERTS_MMAP_FLAGS (MAP_ANON|MAP_PRIVATE) +# define ERTS_MMAP_FD (-1) +# elif defined(MAP_ANON) +# define ERTS_MMAP_FLAGS (MAP_ANON|MAP_PRIVATE) +# define ERTS_MMAP_FD (-1) +# else +# define ERTS_MMAP_FLAGS (MAP_PRIVATE) +# define ERTS_MMAP_FD mm->mmap_fd +# endif +#endif + +static ERTS_INLINE void * +os_mmap(void *hint_ptr, UWord size, int try_superalign, int executable) +{ +#if HAVE_MMAP + const int prot = executable ? ERTS_MMAP_PROT_EXEC : ERTS_MMAP_PROT; + void *res; +#ifdef MAP_ALIGN + if (try_superalign) + res = mmap((void *) ERTS_SUPERALIGNED_SIZE, size, prot, + ERTS_MMAP_FLAGS|MAP_ALIGN, ERTS_MMAP_FD, 0); + else +#endif + res = mmap((void *) hint_ptr, size, prot, + ERTS_MMAP_FLAGS, ERTS_MMAP_FD, 0); + if (res == MAP_FAILED) + return NULL; + return res; +#elif HAVE_VIRTUALALLOC + const DWORD prot = executable ? PAGE_EXECUTE_READWRITE : PAGE_READWRITE; + return (void *) VirtualAlloc(NULL, (SIZE_T) size, + MEM_COMMIT|MEM_RESERVE, prot); +#else +# error "missing mmap() or similar" +#endif +} + +static ERTS_INLINE void +os_munmap(void *ptr, UWord size) +{ +#if HAVE_MMAP +#ifdef ERTS_MMAP_DEBUG + int res = +#endif + munmap(ptr, size); + ERTS_MMAP_ASSERT(res == 0); +#elif HAVE_VIRTUALALLOC +#ifdef DEBUG + BOOL res = +#endif + VirtualFree((LPVOID) ptr, (SIZE_T) 0, MEM_RELEASE); + ERTS_MMAP_ASSERT(res != 0); +#else +# error "missing munmap() or similar" +#endif +} + +#ifdef ERTS_HAVE_OS_MREMAP +# if HAVE_MREMAP +# if defined(__NetBSD__) +# define ERTS_MREMAP_FLAGS (0) +# else +# define ERTS_MREMAP_FLAGS (MREMAP_MAYMOVE) +# endif +# endif +static ERTS_INLINE void * +os_mremap(void *ptr, UWord old_size, UWord new_size, int try_superalign) +{ + void *new_seg; +#if HAVE_MREMAP + new_seg = mremap(ptr, (size_t) old_size, +# if defined(__NetBSD__) + NULL, +# endif + (size_t) new_size, ERTS_MREMAP_FLAGS); + if (new_seg == (void *) MAP_FAILED) + return NULL; + return new_seg; +#else +# error "missing mremap() or similar" +#endif +} +#endif + +#ifdef ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION +#if HAVE_MMAP + +#define ERTS_MMAP_RESERVE_PROT (ERTS_MMAP_PROT) +#define ERTS_MMAP_RESERVE_PROT_EXEC (ERTS_MMAP_PROT_EXEC) +#define ERTS_MMAP_RESERVE_FLAGS (ERTS_MMAP_FLAGS|MAP_FIXED) +#define ERTS_MMAP_UNRESERVE_PROT (PROT_NONE) +#if defined(__FreeBSD__) +#define ERTS_MMAP_UNRESERVE_FLAGS (ERTS_MMAP_FLAGS|MAP_FIXED) +#else +#define ERTS_MMAP_UNRESERVE_FLAGS (ERTS_MMAP_FLAGS|MAP_NORESERVE|MAP_FIXED) +#endif /* __FreeBSD__ */ +#define ERTS_MMAP_VIRTUAL_PROT (PROT_NONE) +#if defined(__FreeBSD__) +#define ERTS_MMAP_VIRTUAL_FLAGS (ERTS_MMAP_FLAGS) +#else +#define ERTS_MMAP_VIRTUAL_FLAGS (ERTS_MMAP_FLAGS|MAP_NORESERVE) +#endif /* __FreeBSD__ */ + +static int +os_reserve_physical(char *ptr, UWord size, int exec) +{ + const int prot = exec ? ERTS_MMAP_RESERVE_PROT_EXEC : ERTS_MMAP_RESERVE_PROT; + void *res = mmap((void *) ptr, (size_t) size, prot, + ERTS_MMAP_RESERVE_FLAGS, ERTS_MMAP_FD, 0); + if (res == (void *) MAP_FAILED) + return 0; + return 1; +} + +static void +os_unreserve_physical(char *ptr, UWord size) +{ + void *res = mmap((void *) ptr, (size_t) size, ERTS_MMAP_UNRESERVE_PROT, + ERTS_MMAP_UNRESERVE_FLAGS, ERTS_MMAP_FD, 0); + if (res == (void *) MAP_FAILED) + erts_exit(ERTS_ABORT_EXIT, "Failed to unreserve memory"); +} + +static void * +os_mmap_virtual(char *ptr, UWord size, int exec) +{ + int flags = ERTS_MMAP_VIRTUAL_FLAGS; + void* res; + +#ifdef ERTS_ALC_A_EXEC + if (exec) { + ASSERT(!ptr); + /* OTP-19.0: Nice hack below cut-and-pasted from hipe_amd64.c */ + +# ifdef MAP_32BIT + /* If we got MAP_32BIT (Linux), then use that to ask for low memory */ + flags |= MAP_32BIT; +# else + /* FreeBSD doesn't have MAP_32BIT, and it doesn't respect + a plain map_hint (returns high mappings even though the + hint refers to a free area), so we have to use both map_hint + and MAP_FIXED to get addresses below the 2GB boundary. + This is even worse than the Linux/ppc64 case. + Similarly, Solaris 10 doesn't have MAP_32BIT, + and it doesn't respect a plain map_hint. */ + ptr = (char*)(512*1024*1024); /* 0.5GB */ + +# if defined(__FreeBSD__) || defined(__sun__) + flags |= MAP_FIXED; +# endif +# endif /* !MAP_32BIT */ + } +#else /* !ERTS_ALC_A_EXEC */ + ASSERT(!exec); +#endif + res = mmap((void *) ptr, (size_t) size, ERTS_MMAP_VIRTUAL_PROT, + flags, ERTS_MMAP_FD, 0); + if (res == (void *) MAP_FAILED) + return NULL; + return res; +} + +#else +#error "Missing reserve/unreserve physical memory implementation" +#endif +#endif /* ERTS_HAVE_OS_RESERVE_PHYSICAL_MEMORY */ + +#endif /* ERTS_HAVE_OS_MMAP */ + +static int reserve_noop(char *ptr, UWord size, int exec) +{ +#ifdef ERTS_MMAP_DEBUG_FILL_AREAS + Uint32 *uip, *end = (Uint32 *) (ptr + size); + + for (uip = (Uint32 *) ptr; uip < end; uip++) + ERTS_MMAP_ASSERT(*uip == (Uint32) 0xdeadbeef); + for (uip = (Uint32 *) ptr; uip < end; uip++) + *uip = (Uint32) 0xfeedfeed; +#endif + return 1; +} + +static void unreserve_noop(char *ptr, UWord size) +{ +#ifdef ERTS_MMAP_DEBUG_FILL_AREAS + Uint32 *uip, *end = (Uint32 *) (ptr + size); + + for (uip = (Uint32 *) ptr; uip < end; uip++) + *uip = (Uint32) 0xdeadbeef; +#endif +} + +static UWord +alloc_desc_insert_free_seg(ErtsMemMapper* mm, + ErtsFreeSegMap *map, char* start, char* end) +{ + char *ptr; + ErtsFreeSegMap *da_map; + ErtsFreeSegDesc *desc = alloc_desc(mm); + if (desc) { + insert_free_seg(map, desc, start, end); + return 0; + } + + /* + * Ahh; ran out of free segment descriptors. + * + * First try to map a new page... + */ + +#if ERTS_HAVE_OS_MMAP + if (!mm->no_os_mmap) { + ptr = os_mmap(mm->desc.new_area_hint, ERTS_PAGEALIGNED_SIZE, 0, 0); + if (ptr) { + mm->desc.new_area_hint = ptr+ERTS_PAGEALIGNED_SIZE; + ERTS_MMAP_SIZE_OS_INC(ERTS_PAGEALIGNED_SIZE); + add_free_desc_area(mm, ptr, ptr+ERTS_PAGEALIGNED_SIZE); + desc = alloc_desc(mm); + ERTS_MMAP_ASSERT(desc); + insert_free_seg(map, desc, start, end); + return 0; + } + } +#endif + + /* + * ...then try to find a good place in the supercarrier... + */ + da_map = &mm->sua.map; + desc = lookup_free_seg(da_map, ERTS_PAGEALIGNED_SIZE); + if (desc) { + if (mm->reserve_physical(desc->start, ERTS_PAGEALIGNED_SIZE, 0)) + ERTS_MMAP_SIZE_SC_SUA_INC(ERTS_PAGEALIGNED_SIZE); + else + desc = NULL; + + } + else { + da_map = &mm->sa.map; + desc = lookup_free_seg(da_map, ERTS_PAGEALIGNED_SIZE); + if (desc) { + if (mm->reserve_physical(desc->start, ERTS_PAGEALIGNED_SIZE, 0)) + ERTS_MMAP_SIZE_SC_SA_INC(ERTS_PAGEALIGNED_SIZE); + else + desc = NULL; + } + } + if (desc) { + char *da_end = desc->start + ERTS_PAGEALIGNED_SIZE; + add_free_desc_area(mm, desc->start, da_end); + if (da_end != desc->end) + resize_free_seg(da_map, desc, da_end, desc->end); + else { + delete_free_seg(da_map, desc); + free_desc(mm, desc); + } + + desc = alloc_desc(mm); + ERTS_MMAP_ASSERT(desc); + insert_free_seg(map, desc, start, end); + return 0; + } + + /* + * ... and then as last resort use the first page of the + * free segment we are trying to insert for free descriptors. + */ + ptr = start + ERTS_PAGEALIGNED_SIZE; + ERTS_MMAP_ASSERT(ptr <= end); + + add_free_desc_area(mm, start, ptr); + + if (ptr != end) { + desc = alloc_desc(mm); + ERTS_MMAP_ASSERT(desc); + insert_free_seg(map, desc, ptr, end); + } + + return ERTS_PAGEALIGNED_SIZE; +} + +void * +erts_mmap(ErtsMemMapper* mm, Uint32 flags, UWord *sizep) +{ + char *seg; + UWord asize = ERTS_PAGEALIGNED_CEILING(*sizep); + + /* Map in premapped supercarrier */ + if (mm->supercarrier && !(ERTS_MMAPFLG_OS_ONLY & flags)) { + char *end; + ErtsFreeSegDesc *desc; + Uint32 superaligned = (ERTS_MMAPFLG_SUPERALIGNED & flags); + + erts_smp_mtx_lock(&mm->mtx); + + ERTS_MMAP_OP_START(*sizep); + + if (!superaligned) { + desc = lookup_free_seg(&mm->sua.map, asize); + if (desc) { + seg = desc->start; + end = seg+asize; + if (!mm->reserve_physical(seg, asize, mm->executable)) + goto supercarrier_reserve_failure; + if (desc->end == end) { + delete_free_seg(&mm->sua.map, desc); + free_desc(mm, desc); + } + else { + ERTS_MMAP_ASSERT(end < desc->end); + resize_free_seg(&mm->sua.map, desc, end, desc->end); + } + ERTS_MMAP_SIZE_SC_SUA_INC(asize); + goto supercarrier_success; + } + + if (asize <= mm->sua.bot - mm->sa.top) { + if (!mm->reserve_physical(mm->sua.bot - asize, asize, + mm->executable)) + goto supercarrier_reserve_failure; + mm->sua.bot -= asize; + seg = mm->sua.bot; + ERTS_MMAP_SIZE_SC_SUA_INC(asize); + goto supercarrier_success; + } + } + + asize = ERTS_SUPERALIGNED_CEILING(asize); + + desc = lookup_free_seg(&mm->sa.map, asize); + if (desc) { + char *start = seg = desc->start; + seg = (char *) ERTS_SUPERALIGNED_CEILING(seg); + end = seg+asize; + if (!mm->reserve_physical(start, (UWord) (end - start), + mm->executable)) + goto supercarrier_reserve_failure; + ERTS_MMAP_SIZE_SC_SA_INC(asize); + if (desc->end == end) { + if (start != seg) + resize_free_seg(&mm->sa.map, desc, start, seg); + else { + delete_free_seg(&mm->sa.map, desc); + free_desc(mm, desc); + } + } + else { + ERTS_MMAP_ASSERT(end < desc->end); + resize_free_seg(&mm->sa.map, desc, end, desc->end); + if (start != seg) { + UWord ad_sz; + ad_sz = alloc_desc_insert_free_seg(mm, &mm->sua.map, + start, seg); + start += ad_sz; + if (start != seg) + mm->unreserve_physical(start, (UWord) (seg - start)); + } + } + goto supercarrier_success; + } + + if (superaligned) { + char *start = mm->sa.top; + seg = (char *) ERTS_SUPERALIGNED_CEILING(start); + + if (asize + (seg - start) <= mm->sua.bot - start) { + end = seg + asize; + if (!mm->reserve_physical(start, (UWord) (end - start), + mm->executable)) + goto supercarrier_reserve_failure; + mm->sa.top = end; + ERTS_MMAP_SIZE_SC_SA_INC(asize); + if (start != seg) { + UWord ad_sz; + ad_sz = alloc_desc_insert_free_seg(mm, &mm->sua.map, + start, seg); + start += ad_sz; + if (start != seg) + mm->unreserve_physical(start, (UWord) (seg - start)); + } + goto supercarrier_success; + } + + desc = lookup_free_seg(&mm->sua.map, asize + ERTS_SUPERALIGNED_SIZE); + if (desc) { + char *org_start = desc->start; + char *org_end = desc->end; + + seg = (char *) ERTS_SUPERALIGNED_CEILING(org_start); + end = seg + asize; + if (!mm->reserve_physical(seg, (UWord) (org_end - seg), + mm->executable)) + goto supercarrier_reserve_failure; + ERTS_MMAP_SIZE_SC_SUA_INC(asize); + if (org_start != seg) { + ERTS_MMAP_ASSERT(org_start < seg); + resize_free_seg(&mm->sua.map, desc, org_start, seg); + desc = NULL; + } + if (end != org_end) { + UWord ad_sz = 0; + ERTS_MMAP_ASSERT(end < org_end); + if (desc) + resize_free_seg(&mm->sua.map, desc, end, org_end); + else + ad_sz = alloc_desc_insert_free_seg(mm, &mm->sua.map, + end, org_end); + end += ad_sz; + if (end != org_end) + mm->unreserve_physical(end, + (UWord) (org_end - end)); + } + goto supercarrier_success; + } + } + + ERTS_MMAP_OP_ABORT(); + erts_smp_mtx_unlock(&mm->mtx); + } + +#if ERTS_HAVE_OS_MMAP + /* Map using OS primitives */ + if (!(ERTS_MMAPFLG_SUPERCARRIER_ONLY & flags) && !mm->no_os_mmap) { + if (!(ERTS_MMAPFLG_SUPERALIGNED & flags)) { + seg = os_mmap(NULL, asize, 0, mm->executable); + if (!seg) + goto failure; + } + else { + asize = ERTS_SUPERALIGNED_CEILING(*sizep); + seg = os_mmap(NULL, asize, 1, mm->executable); + if (!seg) + goto failure; + + if (!ERTS_IS_SUPERALIGNED(seg)) { + char *ptr; + UWord sz; + + os_munmap(seg, asize); + + ptr = os_mmap(NULL, asize + ERTS_SUPERALIGNED_SIZE, 1, + mm->executable); + if (!ptr) + goto failure; + + seg = (char *) ERTS_SUPERALIGNED_CEILING(ptr); + sz = (UWord) (seg - ptr); + ERTS_MMAP_ASSERT(sz <= ERTS_SUPERALIGNED_SIZE); + if (sz) + os_munmap(ptr, sz); + sz = ERTS_SUPERALIGNED_SIZE - sz; + if (sz) + os_munmap(seg+asize, sz); + } + } + + ERTS_MMAP_OP_LCK(seg, *sizep, asize); + ERTS_MMAP_SIZE_OS_INC(asize); + *sizep = asize; + return (void *) seg; + } +failure: +#endif + ERTS_MMAP_OP_LCK(NULL, *sizep, 0); + *sizep = 0; + return NULL; + +supercarrier_success: + +#ifdef ERTS_MMAP_DEBUG + if (ERTS_MMAPFLG_SUPERALIGNED & flags) { + ERTS_MMAP_ASSERT(ERTS_IS_SUPERALIGNED(seg)); + ERTS_MMAP_ASSERT(ERTS_IS_SUPERALIGNED(asize)); + } + else { + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(seg)); + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(asize)); + } +#endif + + ERTS_MMAP_OP_END(seg, asize); + erts_smp_mtx_unlock(&mm->mtx); + + *sizep = asize; + return (void *) seg; + +supercarrier_reserve_failure: + erts_smp_mtx_unlock(&mm->mtx); + *sizep = 0; + return NULL; +} + +void +erts_munmap(ErtsMemMapper* mm, Uint32 flags, void *ptr, UWord size) +{ + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(ptr)); + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(size)); + + if (!ERTS_MMAP_IN_SUPERCARRIER(ptr)) { + ERTS_MMAP_ASSERT(!mm->no_os_mmap); +#if ERTS_HAVE_OS_MMAP + ERTS_MUNMAP_OP_LCK(ptr, size); + ERTS_MMAP_SIZE_OS_DEC(size); + os_munmap(ptr, size); +#endif + } + else { + char *start, *end; + ErtsFreeSegMap *map; + ErtsFreeSegDesc *prev, *next, *desc; + UWord ad_sz = 0; + + ERTS_MMAP_ASSERT(mm->supercarrier); + + start = (char *) ptr; + end = start + size; + + erts_smp_mtx_lock(&mm->mtx); + + ERTS_MUNMAP_OP(ptr, size); + + if (ERTS_MMAP_IN_SUPERALIGNED_AREA(ptr)) { + + map = &mm->sa.map; + adjacent_free_seg(map, start, end, &prev, &next); + + ERTS_MMAP_SIZE_SC_SA_DEC(size); + if (end == mm->sa.top) { + ERTS_MMAP_ASSERT(!next); + if (prev) { + start = prev->start; + delete_free_seg(map, prev); + free_desc(mm, prev); + } + mm->sa.top = start; + goto supercarrier_success; + } + } + else { + map = &mm->sua.map; + adjacent_free_seg(map, start, end, &prev, &next); + + ERTS_MMAP_SIZE_SC_SUA_DEC(size); + if (start == mm->sua.bot) { + ERTS_MMAP_ASSERT(!prev); + if (next) { + end = next->end; + delete_free_seg(map, next); + free_desc(mm, next); + } + mm->sua.bot = end; + goto supercarrier_success; + } + } + + desc = NULL; + + if (next) { + ERTS_MMAP_ASSERT(end < next->end); + end = next->end; + if (prev) { + delete_free_seg(map, next); + free_desc(mm, next); + goto save_prev; + } + desc = next; + } else if (prev) { + save_prev: + ERTS_MMAP_ASSERT(prev->start < start); + start = prev->start; + desc = prev; + } + + if (desc) + resize_free_seg(map, desc, start, end); + else + ad_sz = alloc_desc_insert_free_seg(mm, map, start, end); + + supercarrier_success: { + UWord unres_sz; + + ERTS_MMAP_ASSERT(size >= ad_sz); + unres_sz = size - ad_sz; + if (unres_sz) + mm->unreserve_physical(((char *) ptr) + ad_sz, unres_sz); + + erts_smp_mtx_unlock(&mm->mtx); + } + } +} + +static void * +remap_move(ErtsMemMapper* mm, + Uint32 flags, void *ptr, UWord old_size, UWord *sizep) +{ + UWord size = *sizep; + void *new_ptr = erts_mmap(mm, flags, &size); + if (!new_ptr) + return NULL; + *sizep = size; + if (old_size < size) + size = old_size; + sys_memcpy(new_ptr, ptr, (size_t) size); + erts_munmap(mm, flags, ptr, old_size); + return new_ptr; +} + +void * +erts_mremap(ErtsMemMapper* mm, + Uint32 flags, void *ptr, UWord old_size, UWord *sizep) +{ + void *new_ptr; + Uint32 superaligned; + UWord asize; + + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(ptr)); + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(old_size)); + ERTS_MMAP_ASSERT(sizep && ERTS_IS_PAGEALIGNED(*sizep)); + + if (!ERTS_MMAP_IN_SUPERCARRIER(ptr)) { + + ERTS_MMAP_ASSERT(!mm->no_os_mmap); + + if (!(ERTS_MMAPFLG_OS_ONLY & flags) && mm->supercarrier) { + new_ptr = remap_move(mm, ERTS_MMAPFLG_SUPERCARRIER_ONLY|flags, + ptr, old_size, sizep); + if (new_ptr) + return new_ptr; + } + + if (ERTS_MMAPFLG_SUPERCARRIER_ONLY & flags) { + ERTS_MREMAP_OP_LCK(NULL, ptr, old_size, *sizep, old_size); + return NULL; + } + +#if ERTS_HAVE_OS_MREMAP || ERTS_HAVE_GENUINE_OS_MMAP + superaligned = (ERTS_MMAPFLG_SUPERALIGNED & flags); + + if (superaligned) { + asize = ERTS_SUPERALIGNED_CEILING(*sizep); + if (asize == old_size && ERTS_IS_SUPERALIGNED(ptr)) { + ERTS_MREMAP_OP_LCK(ptr, ptr, old_size, *sizep, asize); + *sizep = asize; + return ptr; + } + } + else { + asize = ERTS_PAGEALIGNED_CEILING(*sizep); + if (asize == old_size) { + ERTS_MREMAP_OP_LCK(ptr, ptr, old_size, *sizep, asize); + *sizep = asize; + return ptr; + } + } + +#if ERTS_HAVE_GENUINE_OS_MMAP + if (asize < old_size + && (!superaligned + || ERTS_IS_SUPERALIGNED(ptr))) { + UWord um_sz; + new_ptr = ((char *) ptr) + asize; + ERTS_MMAP_ASSERT((((char *)ptr) + old_size) > (char *) new_ptr); + um_sz = (UWord) ((((char *) ptr) + old_size) - (char *) new_ptr); + ERTS_MMAP_SIZE_OS_DEC(um_sz); + os_munmap(new_ptr, um_sz); + ERTS_MREMAP_OP_LCK(ptr, ptr, old_size, *sizep, asize); + *sizep = asize; + return ptr; + } +#endif +#if ERTS_HAVE_OS_MREMAP + if (superaligned) + return remap_move(mm, flags, new_ptr, old_size, sizep); + else { + new_ptr = os_mremap(ptr, old_size, asize, 0); + if (!new_ptr) + return NULL; + if (asize > old_size) + ERTS_MMAP_SIZE_OS_INC(asize - old_size); + else + ERTS_MMAP_SIZE_OS_DEC(old_size - asize); + ERTS_MREMAP_OP_LCK(new_ptr, ptr, old_size, *sizep, asize); + *sizep = asize; + return new_ptr; + } +#endif +#endif + } + else { /* In super carrier */ + char *start, *end, *new_end; + ErtsFreeSegMap *map; + ErtsFreeSegDesc *prev, *next; + UWord ad_sz = 0; + + ERTS_MMAP_ASSERT(mm->supercarrier); + + if (ERTS_MMAPFLG_OS_ONLY & flags) + return remap_move(mm, flags, ptr, old_size, sizep); + + superaligned = (ERTS_MMAPFLG_SUPERALIGNED & flags); + + asize = (superaligned + ? ERTS_SUPERALIGNED_CEILING(*sizep) + : ERTS_PAGEALIGNED_CEILING(*sizep)); + + erts_smp_mtx_lock(&mm->mtx); + + if (ERTS_MMAP_IN_SUPERALIGNED_AREA(ptr) + ? (!superaligned && lookup_free_seg(&mm->sua.map, asize)) + : (superaligned && lookup_free_seg(&mm->sa.map, asize))) { + erts_smp_mtx_unlock(&mm->mtx); + /* + * Segment currently in wrong area (due to a previous memory + * shortage), move it to the right area. + * (remap_move() will succeed) + */ + return remap_move(mm, ERTS_MMAPFLG_SUPERCARRIER_ONLY|flags, + ptr, old_size, sizep); + } + + ERTS_MREMAP_OP_START(ptr, old_size, *sizep); + + if (asize == old_size) { + new_ptr = ptr; + goto supercarrier_resize_success; + } + + start = (char *) ptr; + end = start + old_size; + new_end = start+asize; + + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(ptr)); + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(old_size)); + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(asize)); + + if (asize < old_size) { + UWord unres_sz; + new_ptr = ptr; + if (!ERTS_MMAP_IN_SUPERALIGNED_AREA(ptr)) { + map = &mm->sua.map; + ERTS_MMAP_SIZE_SC_SUA_DEC(old_size - asize); + } + else { + if (end == mm->sa.top) { + mm->sa.top = new_end; + mm->unreserve_physical(((char *) ptr) + asize, + old_size - asize); + goto supercarrier_resize_success; + } + ERTS_MMAP_SIZE_SC_SA_DEC(old_size - asize); + map = &mm->sa.map; + } + + adjacent_free_seg(map, start, end, &prev, &next); + + if (next) + resize_free_seg(map, next, new_end, next->end); + else + ad_sz = alloc_desc_insert_free_seg(mm, map, new_end, end); + ERTS_MMAP_ASSERT(old_size - asize >= ad_sz); + unres_sz = old_size - asize - ad_sz; + if (unres_sz) + mm->unreserve_physical(((char *) ptr) + asize + ad_sz, + unres_sz); + goto supercarrier_resize_success; + } + + if (!ERTS_MMAP_IN_SUPERALIGNED_AREA(ptr)) { + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(ptr)); + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(old_size)); + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(asize)); + + adjacent_free_seg(&mm->sua.map, start, end, &prev, &next); + + if (next && new_end <= next->end) { + if (!mm->reserve_physical(((char *) ptr) + old_size, + asize - old_size, + mm->executable)) + goto supercarrier_reserve_failure; + if (new_end < next->end) + resize_free_seg(&mm->sua.map, next, new_end, next->end); + else { + delete_free_seg(&mm->sua.map, next); + free_desc(mm, next); + } + new_ptr = ptr; + ERTS_MMAP_SIZE_SC_SUA_INC(asize - old_size); + goto supercarrier_resize_success; + } + } + else { /* Superaligned area */ + + if (end == mm->sa.top) { + if (new_end <= mm->sua.bot) { + if (!mm->reserve_physical(((char *) ptr) + old_size, + asize - old_size, + mm->executable)) + goto supercarrier_reserve_failure; + mm->sa.top = new_end; + new_ptr = ptr; + ERTS_MMAP_SIZE_SC_SA_INC(asize - old_size); + goto supercarrier_resize_success; + } + } + else { + adjacent_free_seg(&mm->sa.map, start, end, &prev, &next); + if (next && new_end <= next->end) { + if (!mm->reserve_physical(((char *) ptr) + old_size, + asize - old_size, + mm->executable)) + goto supercarrier_reserve_failure; + if (new_end < next->end) + resize_free_seg(&mm->sa.map, next, new_end, next->end); + else { + delete_free_seg(&mm->sa.map, next); + free_desc(mm, next); + } + new_ptr = ptr; + ERTS_MMAP_SIZE_SC_SA_INC(asize - old_size); + goto supercarrier_resize_success; + } + } + } + + ERTS_MMAP_OP_ABORT(); + erts_smp_mtx_unlock(&mm->mtx); + + /* Failed to resize... */ + } + + return remap_move(mm, flags, ptr, old_size, sizep); + +supercarrier_resize_success: + +#ifdef ERTS_MMAP_DEBUG + if ((ERTS_MMAPFLG_SUPERALIGNED & flags) + || ERTS_MMAP_IN_SUPERALIGNED_AREA(new_ptr)) { + ERTS_MMAP_ASSERT(ERTS_IS_SUPERALIGNED(new_ptr)); + ERTS_MMAP_ASSERT(ERTS_IS_SUPERALIGNED(asize)); + } + else { + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(new_ptr)); + ERTS_MMAP_ASSERT(ERTS_IS_PAGEALIGNED(asize)); + } +#endif + + ERTS_MREMAP_OP_END(new_ptr, asize); + erts_smp_mtx_unlock(&mm->mtx); + + *sizep = asize; + return new_ptr; + +supercarrier_reserve_failure: + ERTS_MREMAP_OP_END(NULL, old_size); + erts_smp_mtx_unlock(&mm->mtx); + *sizep = old_size; + return NULL; + +} + +int erts_mmap_in_supercarrier(ErtsMemMapper* mm, void *ptr) +{ + return ERTS_MMAP_IN_SUPERCARRIER(ptr); +} + +static struct { + Eterm options; + Eterm total; + Eterm total_sa; + Eterm total_sua; + Eterm used; + Eterm used_sa; + Eterm used_sua; + Eterm max; + Eterm allocated; + Eterm reserved; + Eterm sizes; + Eterm free_segs; + Eterm supercarrier; + Eterm os; + Eterm scs; + Eterm sco; + Eterm scrpm; + Eterm scrfsd; + + int is_initialized; + erts_mtx_t init_mutex; +}am; + +static void ERTS_INLINE atom_init(Eterm *atom, char *name) +{ + *atom = am_atom_put(name, strlen(name)); +} +#define AM_INIT(AM) atom_init(&am.AM, #AM) + +static void init_atoms(void) +{ + erts_mtx_lock(&am.init_mutex); + + if (!am.is_initialized) { + AM_INIT(options); + AM_INIT(total); + AM_INIT(total_sa); + AM_INIT(total_sua); + AM_INIT(used); + AM_INIT(used_sa); + AM_INIT(used_sua); + AM_INIT(max); + AM_INIT(allocated); + AM_INIT(reserved); + AM_INIT(sizes); + AM_INIT(free_segs); + AM_INIT(supercarrier); + AM_INIT(os); + AM_INIT(scs); + AM_INIT(sco); + AM_INIT(scrpm); + AM_INIT(scrfsd); + am.is_initialized = 1; + } + erts_mtx_unlock(&am.init_mutex); +}; + + +#ifdef HARD_DEBUG_MSEG +static void hard_dbg_mseg_init(void); +#endif + +void +erts_mmap_init(ErtsMemMapper* mm, ErtsMMapInit *init, int executable) +{ + static int is_first_call = 1; + int virtual_map = 0; + char *start = NULL, *end = NULL; + UWord pagesize; +#if defined(__WIN32__) + SYSTEM_INFO sysinfo; + GetSystemInfo(&sysinfo); + pagesize = (UWord) sysinfo.dwPageSize; +#elif defined(_SC_PAGESIZE) + pagesize = (UWord) sysconf(_SC_PAGESIZE); +#elif defined(HAVE_GETPAGESIZE) + pagesize = (UWord) getpagesize(); +#else +# error "Do not know how to get page size" +#endif +#if defined(HARD_DEBUG) || 0 + erts_fprintf(stderr, "erts_mmap: scs = %bpu\n", init->scs); + erts_fprintf(stderr, "erts_mmap: sco = %i\n", init->sco); + erts_fprintf(stderr, "erts_mmap: scrfsd = %i\n", init->scrfsd); +#endif + erts_page_inv_mask = pagesize - 1; + if (pagesize & erts_page_inv_mask) + erts_exit(1, "erts_mmap: Invalid pagesize: %bpu\n", + pagesize); + + ERTS_MMAP_OP_RINGBUF_INIT(); + + mm->supercarrier = 0; + mm->reserve_physical = reserve_noop; + mm->unreserve_physical = unreserve_noop; + mm->executable = executable; + +#if HAVE_MMAP && !defined(MAP_ANON) + mm->mmap_fd = open("/dev/zero", O_RDWR); + if (mm->mmap_fd < 0) + erts_exit(1, "erts_mmap: Failed to open /dev/zero\n"); +#endif + + erts_smp_mtx_init(&mm->mtx, "erts_mmap"); + if (is_first_call) { + erts_mtx_init(&am.init_mutex, "mmap_init_atoms"); + } + +#ifdef ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION + if (init->virtual_range.start) { + char *ptr; + UWord sz; + ptr = (char *) ERTS_PAGEALIGNED_CEILING(init->virtual_range.start); + end = (char *) ERTS_PAGEALIGNED_FLOOR(init->virtual_range.end); + sz = end - ptr; + start = os_mmap_virtual(ptr, sz, executable); + if (!start || start > ptr || start >= end) + erts_exit(1, + "erts_mmap: Failed to create virtual range for super carrier\n"); + sz = start - ptr; + if (sz) + os_munmap(end, sz); + mm->reserve_physical = os_reserve_physical; + mm->unreserve_physical = os_unreserve_physical; + virtual_map = 1; + } + else +#endif + if (init->predefined_area.start) { + start = init->predefined_area.start; + end = init->predefined_area.end; + if (end != (void *) 0 && end < start) + end = start; + } +#if ERTS_HAVE_OS_MMAP + else if (init->scs) { + UWord sz; + sz = ERTS_PAGEALIGNED_CEILING(init->scs); +#ifdef ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION + if (!init->scrpm) { + start = os_mmap_virtual(NULL, sz, executable); + mm->reserve_physical = os_reserve_physical; + mm->unreserve_physical = os_unreserve_physical; + virtual_map = 1; + } + else +#endif + { + /* + * The whole supercarrier will by physically + * reserved all the time. + */ + start = os_mmap(NULL, sz, 1, executable); + } + if (!start) + erts_exit(1, + "erts_mmap: Failed to create super carrier of size %bpu MB\n", + init->scs/1024/1024); + end = start + sz; +#ifdef ERTS_MMAP_DEBUG_FILL_AREAS + if (!virtual_map) { + Uint32 *uip; + + for (uip = (Uint32 *) start; uip < (Uint32 *) end; uip++) + *uip = (Uint32) 0xdeadbeef; + } +#endif + } +#endif + + mm->no.free_seg_descs = 0; + mm->no.free_segs.curr = 0; + mm->no.free_segs.max = 0; + + mm->size.supercarrier.total = 0; + mm->size.supercarrier.used.total = 0; + mm->size.supercarrier.used.sa = 0; + mm->size.supercarrier.used.sua = 0; + mm->size.os.used = 0; + + mm->desc.new_area_hint = NULL; + + if (!start) { + mm->sa.bot = NULL; + mm->sua.top = NULL; + mm->sa.bot = NULL; + mm->sua.top = NULL; + mm->no_os_mmap = 0; + mm->supercarrier = 0; + } + else { + size_t desc_size; + + mm->no_os_mmap = init->sco; + + desc_size = init->scrfsd; + if (desc_size < 100) + desc_size = 100; + desc_size *= sizeof(ErtsFreeSegDesc); + if ((desc_size + + ERTS_SUPERALIGNED_SIZE + + ERTS_PAGEALIGNED_SIZE) > end - start) + erts_exit(1, "erts_mmap: No space for segments in super carrier\n"); + + mm->sa.bot = start; + mm->sa.bot += desc_size; + mm->sa.bot = (char *) ERTS_SUPERALIGNED_CEILING(mm->sa.bot); + mm->sa.top = mm->sa.bot; + mm->sua.top = end; + mm->sua.bot = mm->sua.top; + + mm->size.supercarrier.used.total += (UWord) (mm->sa.bot - start); + + mm->desc.free_list = NULL; + mm->desc.reserved = 0; + + if (end == (void *) 0) { + /* + * Very unlikely, but we need a guarantee + * that `mm->sua.top` always will + * compare as larger than all segment pointers + * into the super carrier... + */ + mm->sua.top -= ERTS_PAGEALIGNED_SIZE; + mm->size.supercarrier.used.total += ERTS_PAGEALIGNED_SIZE; +#ifdef ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION + if (!virtual_map || os_reserve_physical(mm->sua.top, ERTS_PAGEALIGNED_SIZE, 0)) +#endif + add_free_desc_area(mm, mm->sua.top, end); + mm->desc.reserved += (end - mm->sua.top) / sizeof(ErtsFreeSegDesc); + } + + mm->size.supercarrier.total = (UWord) (mm->sua.top - start); + + /* + * Area before (and after) super carrier + * will be used for free segment descritors. + */ +#ifdef ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION + if (virtual_map && !os_reserve_physical(start, mm->sa.bot - start, 0)) + erts_exit(1, "erts_mmap: Failed to reserve physical memory for descriptors\n"); +#endif + mm->desc.unused_start = start; + mm->desc.unused_end = mm->sa.bot; + mm->desc.reserved += ((mm->desc.unused_end - start) + / sizeof(ErtsFreeSegDesc)); + + init_free_seg_map(&mm->sa.map, SA_SZ_ADDR_ORDER); + init_free_seg_map(&mm->sua.map, SZ_REVERSE_ADDR_ORDER); + + mm->supercarrier = 1; + + mm->desc.new_area_hint = end; + + } + +#if !ERTS_HAVE_OS_MMAP + mm->no_os_mmap = 1; +#endif + +#ifdef HARD_DEBUG_MSEG + hard_dbg_mseg_init(); +#endif + +#if defined(ARCH_64) && defined(ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION) + if (mm == &erts_literal_mmapper) { + erts_literals_start = erts_literal_mmapper.sa.bot; + erts_literals_size = erts_literal_mmapper.sua.top - erts_literals_start; + } +#endif + is_first_call = 0; +} + + +static ERTS_INLINE void +add_2tup(Uint **hpp, Uint *szp, Eterm *lp, Eterm el1, Eterm el2) +{ + *lp = erts_bld_cons(hpp, szp, erts_bld_tuple(hpp, szp, 2, el1, el2), *lp); +} + +Eterm erts_mmap_info(ErtsMemMapper* mm, + int *print_to_p, + void *print_to_arg, + Eterm** hpp, Uint* szp, + struct erts_mmap_info_struct* emis) +{ + Eterm size_tags[] = { am.total, am.total_sa, am.total_sua, am.used, am.used_sa, am.used_sua }; + Eterm seg_tags[] = { am.used, am.max, am.allocated, am.reserved, am.used_sa, am.used_sua }; + Eterm group[2]; + Eterm group_tags[] = { am.sizes, am.free_segs }; + Eterm list[3]; + Eterm list_tags[3]; /* { am.options, am.supercarrier, am.os } */ + int lix = 0; + Eterm res = THE_NON_VALUE; + + if (!hpp) { + erts_smp_mtx_lock(&mm->mtx); + emis->sizes[0] = mm->size.supercarrier.total; + emis->sizes[1] = mm->sa.top - mm->sa.bot; + emis->sizes[2] = mm->sua.top - mm->sua.bot; + emis->sizes[3] = mm->size.supercarrier.used.total; + emis->sizes[4] = mm->size.supercarrier.used.sa; + emis->sizes[5] = mm->size.supercarrier.used.sua; + + emis->segs[0] = mm->no.free_segs.curr; + emis->segs[1] = mm->no.free_segs.max; + emis->segs[2] = mm->no.free_seg_descs; + emis->segs[3] = mm->desc.reserved; + emis->segs[4] = mm->sa.map.nseg; + emis->segs[5] = mm->sua.map.nseg; + + emis->os_used = mm->size.os.used; + erts_smp_mtx_unlock(&mm->mtx); + } + + list[lix] = erts_mmap_info_options(mm, "option ", print_to_p, print_to_arg, + hpp, szp); + list_tags[lix] = am.options; + lix++; + + + if (print_to_p) { + int to = *print_to_p; + void *arg = print_to_arg; + if (mm->supercarrier) { + const char* prefix = "supercarrier "; + erts_print(to, arg, "%stotal size: %bpu\n", prefix, emis->sizes[0]); + erts_print(to, arg, "%stotal sa size: %bpu\n", prefix, emis->sizes[1]); + erts_print(to, arg, "%stotal sua size: %bpu\n", prefix, emis->sizes[2]); + erts_print(to, arg, "%sused size: %bpu\n", prefix, emis->sizes[3]); + erts_print(to, arg, "%sused sa size: %bpu\n", prefix, emis->sizes[4]); + erts_print(to, arg, "%sused sua size: %bpu\n", prefix, emis->sizes[5]); + erts_print(to, arg, "%sused free segs: %bpu\n", prefix, emis->segs[0]); + erts_print(to, arg, "%smax free segs: %bpu\n", prefix, emis->segs[1]); + erts_print(to, arg, "%sallocated free segs: %bpu\n", prefix, emis->segs[2]); + erts_print(to, arg, "%sreserved free segs: %bpu\n", prefix, emis->segs[3]); + erts_print(to, arg, "%ssa free segs: %bpu\n", prefix, emis->segs[4]); + erts_print(to, arg, "%ssua free segs: %bpu\n", prefix, emis->segs[5]); + } + if (!mm->no_os_mmap) { + erts_print(to, arg, "os mmap size used: %bpu\n", emis->os_used); + } + } + + + if (hpp || szp) { + if (!am.is_initialized) { + init_atoms(); + } + + if (mm->supercarrier) { + group[0] = erts_bld_atom_uword_2tup_list(hpp, szp, + sizeof(size_tags)/sizeof(Eterm), + size_tags, emis->sizes); + group[1] = erts_bld_atom_uword_2tup_list(hpp, szp, + sizeof(seg_tags)/sizeof(Eterm), + seg_tags, emis->segs); + list[lix] = erts_bld_2tup_list(hpp, szp, 2, group_tags, group); + list_tags[lix] = am.supercarrier; + lix++; + } + + if (!mm->no_os_mmap) { + group[0] = erts_bld_atom_uword_2tup_list(hpp, szp, + 1, &am.used, &emis->os_used); + list[lix] = erts_bld_2tup_list(hpp, szp, 1, group_tags, group); + list_tags[lix] = am.os; + lix++; + } + res = erts_bld_2tup_list(hpp, szp, lix, list_tags, list); + } + return res; +} + +Eterm erts_mmap_info_options(ErtsMemMapper* mm, + char *prefix, + int *print_to_p, + void *print_to_arg, + Uint **hpp, + Uint *szp) +{ + const UWord scs = mm->sua.top - mm->sa.bot; + const Eterm sco = mm->no_os_mmap ? am_true : am_false; + const Eterm scrpm = (mm->reserve_physical == reserve_noop) ? am_true : am_false; + Eterm res = THE_NON_VALUE; + + if (print_to_p) { + int to = *print_to_p; + void *arg = print_to_arg; + erts_print(to, arg, "%sscs: %bpu\n", prefix, scs); + if (mm->supercarrier) { + erts_print(to, arg, "%ssco: %T\n", prefix, sco); + erts_print(to, arg, "%sscrpm: %T\n", prefix, scrpm); + erts_print(to, arg, "%sscrfsd: %beu\n", prefix, mm->desc.reserved); + } + } + + if (hpp || szp) { + if (!am.is_initialized) { + init_atoms(); + } + + res = NIL; + if (mm->supercarrier) { + add_2tup(hpp, szp, &res, am.scrfsd, + erts_bld_uint(hpp,szp, mm->desc.reserved)); + add_2tup(hpp, szp, &res, am.scrpm, scrpm); + add_2tup(hpp, szp, &res, am.sco, sco); + } + add_2tup(hpp, szp, &res, am.scs, erts_bld_uword(hpp, szp, scs)); + } + return res; +} + +#endif /* HAVE_ERTS_MMAP */ + +Eterm erts_mmap_debug_info(Process* p) +{ +#if HAVE_ERTS_MMAP + ErtsMemMapper* mm = &erts_dflt_mmapper; + + if (mm->supercarrier) { + ERTS_DECL_AM(sabot); + ERTS_DECL_AM(satop); + ERTS_DECL_AM(suabot); + ERTS_DECL_AM(suatop); + Eterm sa_list, sua_list, list; + Eterm tags[] = { AM_sabot, AM_satop, AM_suabot, AM_suatop }; + UWord values[4]; + Eterm *hp, *hp_end; + Uint may_need; + + erts_smp_mtx_lock(&mm->mtx); + values[0] = (UWord)mm->sa.bot; + values[1] = (UWord)mm->sa.top; + values[2] = (UWord)mm->sua.bot; + values[3] = (UWord)mm->sua.top; + sa_list = build_free_seg_list(p, &mm->sa.map); + sua_list = build_free_seg_list(p, &mm->sua.map); + erts_smp_mtx_unlock(&mm->mtx); + + may_need = 4*(2+3+2) + 2*(2+3); + hp = HAlloc(p, may_need); + hp_end = hp + may_need; + + list = erts_bld_atom_uword_2tup_list(&hp, NULL, + sizeof(values)/sizeof(*values), + tags, values); + + sa_list = TUPLE2(hp, am_atom_put("sa_free_segs",12), sa_list); hp+=3; + sua_list = TUPLE2(hp, am_atom_put("sua_free_segs",13), sua_list); hp+=3; + list = CONS(hp, sua_list, list); hp+=2; + list = CONS(hp, sa_list, list); hp+=2; + + ASSERT(hp <= hp_end); + HRelease(p, hp_end, hp); + return list; + } +#endif + return am_undefined; +} + + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\ + * Debug functions * +\* */ + + +#ifdef HARD_DEBUG + +static int rbt_assert_is_member(RBTNode* root, RBTNode* node) +{ + while (node != root) { + RBT_ASSERT(parent(node)); + RBT_ASSERT(parent(node)->left == node || parent(node)->right == node); + node = parent(node); + } + return 1; +} + + +#if 0 +# define PRINT_TREE +#else +# undef PRINT_TREE +#endif + +#ifdef PRINT_TREE +static void print_tree(enum SortOrder order, RBTNode*); +#endif + +/* + * Checks that the order between parent and children are correct, + * and that the Red-Black Tree properies are satisfied. if size > 0, + * check_tree() returns the node that satisfies "address order first fit" + * + * The Red-Black Tree properies are: + * 1. Every node is either red or black. + * 2. Every leaf (NIL) is black. + * 3. If a node is red, then both its children are black. + * 4. Every simple path from a node to a descendant leaf + * contains the same number of black nodes. + * + */ + +struct check_arg_t { + RBTree* tree; + ErtsFreeSegDesc* prev_seg; + Uint size; + RBTNode *res; +}; +static void check_node_callback(RBTNode* x, void* arg); + + +static RBTNode * +check_tree(RBTree* tree, Uint size) +{ + struct check_arg_t carg; + carg.tree = tree; + carg.prev_seg = NULL; + carg.size = size; + carg.res = NULL; + +#ifdef PRINT_TREE + print_tree(tree->order, tree->root); +#endif + + if (!tree->root) + return NULL; + + RBT_ASSERT(IS_BLACK(tree->root)); + RBT_ASSERT(!parent(tree->root)); + + rbt_foreach_node(tree, check_node_callback, &carg, 0); + + return carg.res; +} + +static void check_node_callback(RBTNode* x, void* arg) +{ + struct check_arg_t* a = (struct check_arg_t*) arg; + ErtsFreeSegDesc* seg; + + if (IS_RED(x)) { + RBT_ASSERT(IS_BLACK(x->right)); + RBT_ASSERT(IS_BLACK(x->left)); + } + + RBT_ASSERT(parent(x) || x == a->tree->root); + + if (x->left) { + RBT_ASSERT(cmp_nodes(a->tree->order, x->left, x) < 0); + } + if (x->right) { + RBT_ASSERT(cmp_nodes(a->tree->order, x->right, x) > 0); + } + + seg = node_to_desc(a->tree->order, x); + RBT_ASSERT(seg->start < seg->end); + if (a->size && (seg->end - seg->start) >= a->size) { + if (!a->res || cmp_nodes(a->tree->order, x, a->res) < 0) { + a->res = x; + } + } + if (a->tree->order == ADDR_ORDER) { + RBT_ASSERT(!a->prev_seg || a->prev_seg->end < seg->start); + a->prev_seg = seg; + } +} + +#endif /* HARD_DEBUG */ + + +#ifdef PRINT_TREE +#define INDENT_STEP 2 + +#include <stdio.h> + +static void +print_tree_aux(enum SortOrder order, RBTNode *x, int indent) +{ + int i; + + if (x) { + ErtsFreeSegDesc* desc = node_to_desc(order, x); + print_tree_aux(order, x->right, indent + INDENT_STEP); + for (i = 0; i < indent; i++) { + putc(' ', stderr); + } + fprintf(stderr, "%s: sz=%lx [%p - %p] desc=%p\r\n", + IS_BLACK(x) ? "BLACK" : "RED", + desc->end - desc->start, desc->start, desc->end, desc); + print_tree_aux(order, x->left, indent + INDENT_STEP); + } +} + + +static void +print_tree(enum SortOrder order, RBTNode* root) +{ + fprintf(stderr, " --- %s ordered tree begin ---\r\n", sort_order_names[order]); + print_tree_aux(order, root, 0); + fprintf(stderr, " --- %s ordered tree end ---\r\n", sort_order_names[order]); +} + +#endif /* PRINT_TREE */ + + +#ifdef FREE_SEG_API_SMOKE_TEST + +void test_it(void) +{ + ErtsFreeSegMap map; + ErtsFreeSegDesc *desc, *under, *over, *d1, *d2; + const int i = 1; /* reverse addr order */ + + { + init_free_seg_map(&map, SZ_REVERSE_ADDR_ORDER); + + insert_free_seg(&map, alloc_desc(), (char*)0x11000, (char*)0x12000); + HARD_CHECK_TREE(&map.atree, 0); HARD_CHECK_TREE(&map.stree, 0); + insert_free_seg(&map, alloc_desc(), (char*)0x13000, (char*)0x14000); + HARD_CHECK_TREE(&map.atree, 0); HARD_CHECK_TREE(&map.stree, 0); + insert_free_seg(&map, alloc_desc(), (char*)0x15000, (char*)0x17000); + HARD_CHECK_TREE(&map.atree, 0); HARD_CHECK_TREE(&map.stree, 0); + insert_free_seg(&map, alloc_desc(), (char*)0x8000, (char*)0x10000); + HARD_CHECK_TREE(&map.atree, 0); HARD_CHECK_TREE(&map.stree, 0); + + desc = lookup_free_seg(&map, 0x500); + ERTS_ASSERT(desc->start == (char*)(i?0x13000L:0x11000L)); + + desc = lookup_free_seg(&map, 0x1500); + ERTS_ASSERT(desc->start == (char*)0x15000); + + adjacent_free_seg(&map, (char*)0x6666, (char*)0x7777, &under, &over); + ERTS_ASSERT(!under && !over); + + adjacent_free_seg(&map, (char*)0x6666, (char*)0x8000, &under, &over); + ERTS_ASSERT(!under && over->start == (char*)0x8000); + + adjacent_free_seg(&map, (char*)0x10000, (char*)0x10500, &under, &over); + ERTS_ASSERT(under->end == (char*)0x10000 && !over); + + adjacent_free_seg(&map, (char*)0x10100, (char*)0x10500, &under, &over); + ERTS_ASSERT(!under && !over); + + adjacent_free_seg(&map, (char*)0x10100, (char*)0x11000, &under, &over); + ERTS_ASSERT(!under && over && over->start == (char*)0x11000); + + adjacent_free_seg(&map, (char*)0x12000, (char*)0x12500, &under, &over); + ERTS_ASSERT(under && under->end == (char*)0x12000 && !over); + + adjacent_free_seg(&map, (char*)0x12000, (char*)0x13000, &under, &over); + ERTS_ASSERT(under && under->end == (char*)0x12000 && + over && over->start == (char*)0x13000); + + adjacent_free_seg(&map, (char*)0x12500, (char*)0x13000, &under, &over); + ERTS_ASSERT(!under && over && over->start == (char*)0x13000); + + d1 = lookup_free_seg(&map, 0x500); + ERTS_ASSERT(d1->start == (char*)(i?0x13000L:0x11000L)); + + resize_free_seg(&map, d1, d1->start - 0x800, (char*)d1->end); + HARD_CHECK_TREE(&map.atree, 0); HARD_CHECK_TREE(&map.stree, 0); + + d2 = lookup_free_seg(&map, 0x1200); + ERTS_ASSERT(d2 == d1); + + delete_free_seg(&map, d1); + HARD_CHECK_TREE(&map.atree, 0); HARD_CHECK_TREE(&map.stree, 0); + + d1 = lookup_free_seg(&map, 0x1200); + ERTS_ASSERT(d1->start == (char*)0x15000); + } +} + +#endif /* FREE_SEG_API_SMOKE_TEST */ + + +#ifdef HARD_DEBUG_MSEG + +/* + * Debug stuff used by erl_mseg to check that it does the right thing. + * The reason for keeping it here is that we (ab)use the rb-tree code + * for keeping track of *allocated* segments. + */ + +typedef struct ErtsFreeSegDesc_fake_ { + /*RBTNode snode; Save memory by skipping unused size tree node */ + RBTNode anode; /* node in 'atree' */ + union { + char* start; + struct ErtsFreeSegDesc_fake_* next_free; + }u; + char* end; +}ErtsFreeSegDesc_fake; + +static ErtsFreeSegDesc_fake hard_dbg_mseg_desc_pool[10000]; +static ErtsFreeSegDesc_fake* hard_dbg_mseg_desc_first; +RBTree hard_dbg_mseg_tree; + +static erts_mtx_t hard_dbg_mseg_mtx; + +static void hard_dbg_mseg_init(void) +{ + ErtsFreeSegDesc_fake* p; + + erts_mtx_init(&hard_dbg_mseg_mtx, "hard_dbg_mseg"); + hard_dbg_mseg_tree.root = NULL; + hard_dbg_mseg_tree.order = ADDR_ORDER; + + p = &hard_dbg_mseg_desc_pool[(sizeof(hard_dbg_mseg_desc_pool) / + sizeof(*hard_dbg_mseg_desc_pool)) - 1]; + p->u.next_free = NULL; + while (--p >= hard_dbg_mseg_desc_pool) { + p->u.next_free = (p+1); + } + hard_dbg_mseg_desc_first = &hard_dbg_mseg_desc_pool[0]; +} + +static ErtsFreeSegDesc* hard_dbg_alloc_desc(void) +{ + ErtsFreeSegDesc_fake* p = hard_dbg_mseg_desc_first; + ERTS_ASSERT(p || !"HARD_DEBUG_MSEG: Out of mseg descriptors"); + hard_dbg_mseg_desc_first = p->u.next_free; + + /* Creative pointer arithmetic to return something that looks like + * a ErtsFreeSegDesc as long as we don't use the absent 'snode'. + */ + return (ErtsFreeSegDesc*) ((char*)p - offsetof(ErtsFreeSegDesc,anode)); +} + +static void hard_dbg_free_desc(ErtsFreeSegDesc* desc) +{ + ErtsFreeSegDesc_fake* p = (ErtsFreeSegDesc_fake*) &desc->anode; + memset(p, 0xfe, sizeof(*p)); + p->u.next_free = hard_dbg_mseg_desc_first; + hard_dbg_mseg_desc_first = p; +} + +static void check_seg_writable(void* seg, UWord sz) +{ + UWord* seg_end = (UWord*)((char*)seg + sz); + volatile UWord* p; + ERTS_ASSERT(ERTS_IS_PAGEALIGNED(seg)); + ERTS_ASSERT(ERTS_IS_PAGEALIGNED(sz)); + for (p=(UWord*)seg; p<seg_end; p += (ERTS_INV_PAGEALIGNED_MASK+1)/sizeof(UWord)) { + UWord write_back = *p; + *p = 0xfade2b1acc; + *p = write_back; + } +} + +void hard_dbg_insert_mseg(void* seg, UWord sz) +{ + check_seg_writable(seg, sz); + erts_mtx_lock(&hard_dbg_mseg_mtx); + { + ErtsFreeSegDesc *desc = hard_dbg_alloc_desc(); + RBTNode *prev, *next; + desc->start = (char*)seg; + desc->end = desc->start + sz - 1; /* -1 to allow adjacent segments in tree */ + rbt_insert(&hard_dbg_mseg_tree, &desc->anode); + prev = rbt_prev_node(&desc->anode); + next = rbt_next_node(&desc->anode); + ERTS_ASSERT(!prev || anode_to_desc(prev)->end < desc->start); + ERTS_ASSERT(!next || anode_to_desc(next)->start > desc->end); + } + erts_mtx_unlock(&hard_dbg_mseg_mtx); +} + +static ErtsFreeSegDesc* hard_dbg_lookup_seg_at(RBTree* tree, char* start) +{ + RBTNode* x = tree->root; + + while (x) { + ErtsFreeSegDesc* desc = anode_to_desc(x); + if (start < desc->start) { + x = x->left; + } + else if (start > desc->start) { + ERTS_ASSERT(start > desc->end); + x = x->right; + } + else + return desc; + } + return NULL; +} + +void hard_dbg_remove_mseg(void* seg, UWord sz) +{ + check_seg_writable(seg, sz); + erts_mtx_lock(&hard_dbg_mseg_mtx); + { + ErtsFreeSegDesc* desc = hard_dbg_lookup_seg_at(&hard_dbg_mseg_tree, (char*)seg); + ERTS_ASSERT(desc); + ERTS_ASSERT(desc->start == (char*)seg); + ERTS_ASSERT(desc->end == (char*)seg + sz - 1); + + rbt_delete(&hard_dbg_mseg_tree, &desc->anode); + hard_dbg_free_desc(desc); + } + erts_mtx_unlock(&hard_dbg_mseg_mtx); +} + +#endif /* HARD_DEBUG_MSEG */ diff --git a/erts/emulator/sys/common/erl_mmap.h b/erts/emulator/sys/common/erl_mmap.h new file mode 100644 index 0000000000..fa51b663fa --- /dev/null +++ b/erts/emulator/sys/common/erl_mmap.h @@ -0,0 +1,181 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2013-2016. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +#ifndef ERL_MMAP_H__ +#define ERL_MMAP_H__ + +#include "sys.h" + +#define ERTS_MMAP_SUPERALIGNED_BITS (18) +/* Affects hard limits for sbct and lmbcs documented in erts_alloc.xml */ + +#ifndef HAVE_MMAP +# define HAVE_MMAP 0 +#endif +#ifndef HAVE_MREMAP +# define HAVE_MREMAP 0 +#endif +#if HAVE_MMAP +# define ERTS_HAVE_OS_MMAP 1 +# define ERTS_HAVE_GENUINE_OS_MMAP 1 +# if HAVE_MREMAP +# define ERTS_HAVE_OS_MREMAP 1 +# endif +/* + * MAP_NORESERVE is undefined in FreeBSD 10.x and later. + * This is to enable 64bit HiPE experimentally on FreeBSD. + * Note that on FreeBSD MAP_NORESERVE was "never implemented" + * even before 11.x (and the flag does not exist in /usr/src/sys/vm/mmap.c + * of 10.3-STABLE r301478 either), and HiPE was working on OTP 18.3.3, + * so mandating MAP_NORESERVE on FreeBSD might not be needed. + * See the following message on how MAP_NORESERVE was treated on FreeBSD: + * <http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20150202/122958.html> + */ +# if defined(MAP_FIXED) && (defined(MAP_NORESERVE) || defined(__FreeBSD__)) +# define ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION 1 +# endif +#endif + +#ifndef HAVE_VIRTUALALLOC +# define HAVE_VIRTUALALLOC 0 +#endif +#if HAVE_VIRTUALALLOC +# define ERTS_HAVE_OS_MMAP 1 +#endif + +#ifdef ERTS_HAVE_GENUINE_OS_MMAP +# define HAVE_ERTS_MMAP 1 +#else +# define HAVE_ERTS_MMAP 0 +#endif + + +extern UWord erts_page_inv_mask; + +typedef struct { + struct { + char *start; + char *end; + } virtual_range; + struct { + char *start; + char *end; + } predefined_area; + UWord scs; /* super carrier size */ + int sco; /* super carrier only? */ + UWord scrfsd; /* super carrier reserved free segment descriptors */ + int scrpm; /* super carrier reserve physical memory */ +}ErtsMMapInit; + +#define ERTS_MMAP_INIT_DEFAULT_INITER \ + {{NULL, NULL}, {NULL, NULL}, 0, 1, (1 << 16), 1} + +#define ERTS_LITERAL_VIRTUAL_AREA_SIZE (UWORD_CONSTANT(1)*1024*1024*1024) + +#define ERTS_MMAP_INIT_LITERAL_INITER \ + {{NULL, NULL}, {NULL, NULL}, ERTS_LITERAL_VIRTUAL_AREA_SIZE, 1, (1 << 10), 0} + +#define ERTS_HIPE_EXEC_VIRTUAL_AREA_SIZE (UWORD_CONSTANT(512)*1024*1024) + +#define ERTS_MMAP_INIT_HIPE_EXEC_INITER \ + {{NULL, NULL}, {NULL, NULL}, ERTS_HIPE_EXEC_VIRTUAL_AREA_SIZE, 1, (1 << 10), 0} + + +#define ERTS_SUPERALIGNED_SIZE \ + (1 << ERTS_MMAP_SUPERALIGNED_BITS) +#define ERTS_INV_SUPERALIGNED_MASK \ + ((UWord) (ERTS_SUPERALIGNED_SIZE - 1)) +#define ERTS_SUPERALIGNED_MASK \ + (~ERTS_INV_SUPERALIGNED_MASK) +#define ERTS_SUPERALIGNED_FLOOR(X) \ + (((UWord) (X)) & ERTS_SUPERALIGNED_MASK) +#define ERTS_SUPERALIGNED_CEILING(X) \ + ERTS_SUPERALIGNED_FLOOR((X) + ERTS_INV_SUPERALIGNED_MASK) +#define ERTS_IS_SUPERALIGNED(X) \ + (((UWord) (X) & ERTS_INV_SUPERALIGNED_MASK) == 0) + +#define ERTS_INV_PAGEALIGNED_MASK \ + (erts_page_inv_mask) +#define ERTS_PAGEALIGNED_MASK \ + (~ERTS_INV_PAGEALIGNED_MASK) +#define ERTS_PAGEALIGNED_FLOOR(X) \ + (((UWord) (X)) & ERTS_PAGEALIGNED_MASK) +#define ERTS_PAGEALIGNED_CEILING(X) \ + ERTS_PAGEALIGNED_FLOOR((X) + ERTS_INV_PAGEALIGNED_MASK) +#define ERTS_IS_PAGEALIGNED(X) \ + (((UWord) (X) & ERTS_INV_PAGEALIGNED_MASK) == 0) +#define ERTS_PAGEALIGNED_SIZE \ + (ERTS_INV_PAGEALIGNED_MASK + 1) + +struct process; +Eterm erts_mmap_debug_info(struct process*); + +#if HAVE_ERTS_MMAP + +typedef struct ErtsMemMapper_ ErtsMemMapper; + +#define ERTS_MMAPFLG_OS_ONLY (((Uint32) 1) << 0) +#define ERTS_MMAPFLG_SUPERCARRIER_ONLY (((Uint32) 1) << 1) +#define ERTS_MMAPFLG_SUPERALIGNED (((Uint32) 1) << 2) + +void *erts_mmap(ErtsMemMapper*, Uint32 flags, UWord *sizep); +void erts_munmap(ErtsMemMapper*, Uint32 flags, void *ptr, UWord size); +void *erts_mremap(ErtsMemMapper*, Uint32 flags, void *ptr, UWord old_size, UWord *sizep); +int erts_mmap_in_supercarrier(ErtsMemMapper*, void *ptr); +void erts_mmap_init(ErtsMemMapper*, ErtsMMapInit*, int executable); +struct erts_mmap_info_struct +{ + UWord sizes[6]; + UWord segs[6]; + UWord os_used; +}; +Eterm erts_mmap_info(ErtsMemMapper*, int *print_to_p, void *print_to_arg, + Eterm** hpp, Uint* szp, struct erts_mmap_info_struct*); +Eterm erts_mmap_info_options(ErtsMemMapper*, + char *prefix, int *print_to_p, void *print_to_arg, + Uint **hpp, Uint *szp); + + +#ifdef ERTS_WANT_MEM_MAPPERS +# include "erl_alloc_types.h" + +extern ErtsMemMapper erts_dflt_mmapper; +# if defined(ARCH_64) && defined(ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION) +extern ErtsMemMapper erts_literal_mmapper; +# endif +# ifdef ERTS_ALC_A_EXEC +extern ErtsMemMapper erts_exec_mmapper; +# endif +#endif /* ERTS_WANT_MEM_MAPPERS */ + +/*#define HARD_DEBUG_MSEG*/ +#ifdef HARD_DEBUG_MSEG +# define HARD_DBG_INSERT_MSEG hard_dbg_insert_mseg +# define HARD_DBG_REMOVE_MSEG hard_dbg_remove_mseg +void hard_dbg_insert_mseg(void* seg, UWord sz); +void hard_dbg_remove_mseg(void* seg, UWord sz); +#else +# define HARD_DBG_INSERT_MSEG(SEG,SZ) +# define HARD_DBG_REMOVE_MSEG(SEG,SZ) +#endif + +#endif /* HAVE_ERTS_MMAP */ + +#endif /* ERL_MMAP_H__ */ diff --git a/erts/emulator/sys/common/erl_mseg.c b/erts/emulator/sys/common/erl_mseg.c index 2748edba02..f3306a888c 100644 --- a/erts/emulator/sys/common/erl_mseg.c +++ b/erts/emulator/sys/common/erl_mseg.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2002-2013. All Rights Reserved. + * Copyright Ericsson AB 2002-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -30,6 +31,7 @@ # include "config.h" #endif +#define ERTS_WANT_MEM_MAPPERS #include "sys.h" #include "erl_mseg.h" #include "global.h" @@ -98,56 +100,12 @@ static const int debruijn[32] = { static int atoms_initialized; -typedef struct mem_kind_t MemKind; - -#if HALFWORD_HEAP -static int initialize_pmmap(void); -static void *pmmap(size_t size); -static int pmunmap(void *p, size_t size); -static void *pmremap(void *old_address, size_t old_size, - size_t new_size); -#endif - -#if HAVE_MMAP -/* Mmap ... */ - -#define MMAP_PROT (PROT_READ|PROT_WRITE) - - -#ifdef MAP_ANON -# define MMAP_FLAGS (MAP_ANON|MAP_PRIVATE) -# define MMAP_FD (-1) -#else -# define MMAP_FLAGS (MAP_PRIVATE) -# define MMAP_FD mmap_fd -static int mmap_fd; -#endif - -#if HAVE_MREMAP -# define HAVE_MSEG_RECREATE 1 -#else -# define HAVE_MSEG_RECREATE 0 -#endif - -#if HALFWORD_HEAP -#define CAN_PARTLY_DESTROY 0 -#else -#define CAN_PARTLY_DESTROY 1 -#endif -#else /* #if HAVE_MMAP */ -#define CAN_PARTLY_DESTROY 0 -#error "Not supported" -#endif /* #if HAVE_MMAP */ - const ErtsMsegOpt_t erts_mseg_default_opt = { 1, /* Use cache */ 1, /* Preserv data */ 0, /* Absolute shrink threshold */ 0, /* Relative shrink threshold */ 0 /* Scheduler specific */ -#if HALFWORD_HEAP - ,0 /* need low memory */ -#endif }; @@ -163,9 +121,7 @@ typedef struct { CallCounter create; CallCounter create_resize; CallCounter destroy; -#if HAVE_MSEG_RECREATE CallCounter recreate; -#endif CallCounter clear_cache; CallCounter check_cache; } ErtsMsegCalls; @@ -173,7 +129,7 @@ typedef struct { typedef struct cache_t_ cache_t; struct cache_t_ { - Uint size; + UWord size; void *seg; cache_t *next; cache_t *prev; @@ -182,7 +138,14 @@ struct cache_t_ { typedef struct ErtsMsegAllctr_t_ ErtsMsegAllctr_t; -struct mem_kind_t { +struct ErtsMsegAllctr_t_ { + int ix; + + int is_init_done; + int is_thread_safe; + erts_mtx_t mtx; + + int is_cache_check_scheduled; cache_t cache[MAX_CACHE_SIZE]; cache_t cache_unpowered_node; @@ -208,39 +171,11 @@ struct mem_kind_t { } max_ever; } segments; - ErtsMsegAllctr_t *ma; - const char* name; - MemKind* next; -};/*MemKind*/ - -struct ErtsMsegAllctr_t_ { - int ix; - - int is_init_done; - int is_thread_safe; - erts_mtx_t mtx; - - int is_cache_check_scheduled; - - MemKind* mk_list; - -#if HALFWORD_HEAP - MemKind low_mem; - MemKind hi_mem; -#else - MemKind the_mem; -#endif - Uint max_cache_size; Uint abs_max_cache_bad_fit; Uint rel_max_cache_bad_fit; ErtsMsegCalls calls; - -#if CAN_PARTLY_DESTROY - Uint min_seg_size; -#endif - }; typedef union { @@ -344,69 +279,26 @@ schedule_cache_check(ErtsMsegAllctr_t *ma) { } } -/* remove ErtsMsegAllctr_t from arguments? - * only used for statistics - */ -static ERTS_INLINE void * -mmap_align(ErtsMsegAllctr_t *ma, void *addr, size_t length, int prot, int flags, int fd, off_t offset) { - - char *p, *q; - UWord d; - - p = mmap(addr, length, prot, flags, fd, offset); - - if (MAP_IS_ALIGNED(p) || p == MAP_FAILED) - return p; - - if (ma) - INC_CC(ma, create_resize); - - munmap(p, length); - - if ((p = mmap(addr, length + MSEG_ALIGNED_SIZE, prot, flags, fd, offset)) == MAP_FAILED) - return MAP_FAILED; - - q = (void *)ALIGNED_CEILING((char *)p); - d = (UWord)(q - p); - - if (d > 0) - munmap(p, d); - - if (MSEG_ALIGNED_SIZE - d > 0) - munmap((void *)(q + length), MSEG_ALIGNED_SIZE - d); - - return q; -} +/* #define ERTS_PRINT_ERTS_MMAP */ static ERTS_INLINE void * -mseg_create(ErtsMsegAllctr_t *ma, MemKind* mk, Uint size) +mseg_create(ErtsMsegAllctr_t *ma, Uint flags, UWord *sizep) { - void *seg; - ASSERT(size % MSEG_ALIGNED_SIZE == 0); - -#if HALFWORD_HEAP - if (mk == &ma->low_mem) { - seg = pmmap(size); - if ((unsigned long) seg & CHECK_POINTER_MASK) { - erts_fprintf(stderr,"Pointer mask failure (0x%08lx)\n",(unsigned long) seg); - return NULL; - } - } else +#ifdef ERTS_PRINT_ERTS_MMAP + UWord req_size = *sizep; #endif - { -#if HAVE_MMAP - { - seg = (void *) mmap_align(ma, (void *) 0, (size_t) size, - MMAP_PROT, MMAP_FLAGS, MMAP_FD, 0); - if (seg == (void *) MAP_FAILED) - seg = NULL; - - ASSERT(MAP_IS_ALIGNED(seg) || !seg); - } -#else -# error "Missing mseg_create() implementation" + void *seg; + Uint32 mmap_flags = 0; + if (MSEG_FLG_IS_2POW(flags)) + mmap_flags |= ERTS_MMAPFLG_SUPERALIGNED; + + seg = erts_mmap(&erts_dflt_mmapper, mmap_flags, sizep); + +#ifdef ERTS_PRINT_ERTS_MMAP + erts_fprintf(stderr, "%p = erts_mmap(%s, {%bpu, %bpu});\n", seg, + (mmap_flags & ERTS_MMAPFLG_SUPERALIGNED) ? "sa" : "sua", + req_size, *sizep); #endif - } INC_CC(ma, create); @@ -414,91 +306,45 @@ mseg_create(ErtsMsegAllctr_t *ma, MemKind* mk, Uint size) } static ERTS_INLINE void -mseg_destroy(ErtsMsegAllctr_t *ma, MemKind* mk, void *seg, Uint size) { - ERTS_DECLARE_DUMMY(int res); - -#if HALFWORD_HEAP - if (mk == &ma->low_mem) { - res = pmunmap((void *) seg, size); - } - else -#endif - { -#ifdef HAVE_MMAP - res = munmap((void *) seg, size); -#else -# error "Missing mseg_destroy() implementation" +mseg_destroy(ErtsMsegAllctr_t *ma, Uint flags, void *seg_p, UWord size) { + + Uint32 mmap_flags = 0; + if (MSEG_FLG_IS_2POW(flags)) + mmap_flags |= ERTS_MMAPFLG_SUPERALIGNED; + + erts_munmap(&erts_dflt_mmapper, mmap_flags, seg_p, size); +#ifdef ERTS_PRINT_ERTS_MMAP + erts_fprintf(stderr, "erts_munmap(%s, %p, %bpu);\n", + (mmap_flags & ERTS_MMAPFLG_SUPERALIGNED) ? "sa" : "sua", + seg_p, *size); #endif - } - - ASSERT(size % MSEG_ALIGNED_SIZE == 0); - ASSERT(res == 0); - INC_CC(ma, destroy); } -#if HAVE_MSEG_RECREATE -#if defined(__NetBSD__) -#define MREMAP_FLAGS (0) -#else -#define MREMAP_FLAGS (MREMAP_MAYMOVE) -#endif - - -/* mseg_recreate - * May return *unaligned* segments as in address not aligned to MSEG_ALIGNMENT - * it is still page aligned - * - * This is fine for single block carriers as long as we don't cache misaligned - * segments (since multiblock carriers may use them) - * - * For multiblock carriers we *need* MSEG_ALIGNMENT but mbc's will never be - * reallocated. - * - * This should probably be fixed the following way: - * 1) Use an option to segment allocation - NEED_ALIGNMENT - * 2) Add mremap_align which takes care of aligning a new a mremaped area - * 3) Fix the cache to handle of aligned and unaligned segments - */ - static ERTS_INLINE void * -mseg_recreate(ErtsMsegAllctr_t *ma, MemKind* mk, void *old_seg, Uint old_size, Uint new_size) +mseg_recreate(ErtsMsegAllctr_t *ma, Uint flags, void *old_seg, UWord old_size, UWord *sizep) { +#ifdef ERTS_PRINT_ERTS_MMAP + UWord req_size = *sizep; +#endif void *new_seg; + Uint32 mmap_flags = 0; + if (MSEG_FLG_IS_2POW(flags)) + mmap_flags |= ERTS_MMAPFLG_SUPERALIGNED; - ASSERT(old_size % MSEG_ALIGNED_SIZE == 0); - ASSERT(new_size % MSEG_ALIGNED_SIZE == 0); + new_seg = erts_mremap(&erts_dflt_mmapper, mmap_flags, old_seg, old_size, sizep); -#if HALFWORD_HEAP - if (mk == &ma->low_mem) { - new_seg = (void *) pmremap((void *) old_seg, - (size_t) old_size, - (size_t) new_size); - } - else -#endif - { -#if HAVE_MREMAP -#if defined(__NetBSD__) - new_seg = mremap(old_seg, (size_t)old_size, NULL, new_size, MREMAP_FLAGS); -#else - new_seg = mremap(old_seg, (size_t)old_size, (size_t)new_size, MREMAP_FLAGS); +#ifdef ERTS_PRINT_ERTS_MMAP + erts_fprintf(stderr, "%p = erts_mremap(%s, %p, %bpu, {%bpu, %bpu});\n", + new_seg, (mmap_flags & ERTS_MMAPFLG_SUPERALIGNED) ? "sa" : "sua", + old_seg, old_size, req_size, *sizep); #endif - if (new_seg == (void *) MAP_FAILED) - new_seg = NULL; -#else -#error "Missing mseg_recreate() implementation" -#endif - } - INC_CC(ma, recreate); return new_seg; } -#endif /* #if HAVE_MSEG_RECREATE */ - #ifdef DEBUG #define ERTS_DBG_MA_CHK_THR_ACCESS(MA) \ do { \ @@ -511,11 +357,8 @@ do { \ || erts_smp_thr_progress_is_blocking() \ || ERTS_IS_CRASH_DUMPING); \ } while (0) -#define ERTS_DBG_MK_CHK_THR_ACCESS(MK) \ - ERTS_DBG_MA_CHK_THR_ACCESS((MK)->ma) #else #define ERTS_DBG_MA_CHK_THR_ACCESS(MA) -#define ERTS_DBG_MK_CHK_THR_ACCESS(MK) #endif /* Cache interface */ @@ -528,10 +371,10 @@ static ERTS_INLINE void mseg_cache_clear_node(cache_t *c) { c->prev = c; } -static ERTS_INLINE int cache_bless_segment(MemKind *mk, void *seg, Uint size, Uint flags) { +static ERTS_INLINE int cache_bless_segment(ErtsMsegAllctr_t *ma, void *seg, UWord size, Uint flags) { cache_t *c; - ERTS_DBG_MK_CHK_THR_ACCESS(mk); + ERTS_DBG_MA_CHK_THR_ACCESS(ma); ASSERT(!MSEG_FLG_IS_2POW(flags) || (MSEG_FLG_IS_2POW(flags) && MAP_IS_ALIGNED(seg) && IS_2POW(size))); @@ -540,11 +383,11 @@ static ERTS_INLINE int cache_bless_segment(MemKind *mk, void *seg, Uint size, Ui * Large blocks has no such cache and it is up to mseg to cache them to speed things up. */ - if (!erts_circleq_is_empty(&(mk->cache_free))) { + if (!erts_circleq_is_empty(&(ma->cache_free))) { /* We have free slots, use one to cache the segment */ - c = erts_circleq_head(&(mk->cache_free)); + c = erts_circleq_head(&(ma->cache_free)); erts_circleq_remove(c); c->seg = seg; @@ -556,30 +399,28 @@ static ERTS_INLINE int cache_bless_segment(MemKind *mk, void *seg, Uint size, Ui ASSERT(ix < CACHE_AREAS); ASSERT((1 << (ix + MSEG_ALIGN_BITS)) == size); - erts_circleq_push_head(&(mk->cache_powered_node[ix]), c); + erts_circleq_push_head(&(ma->cache_powered_node[ix]), c); } else - erts_circleq_push_head(&(mk->cache_unpowered_node), c); + erts_circleq_push_head(&(ma->cache_unpowered_node), c); - mk->cache_size++; - ASSERT(mk->cache_size <= mk->ma->max_cache_size); + ma->cache_size++; return 1; - } else if (!MSEG_FLG_IS_2POW(flags) && !erts_circleq_is_empty(&(mk->cache_unpowered_node))) { - + } else if (!MSEG_FLG_IS_2POW(flags) && !erts_circleq_is_empty(&(ma->cache_unpowered_node))) { /* No free slots. * Evict oldest slot from unpowered cache so we can cache an unpowered (sbc) segment */ - c = erts_circleq_tail(&(mk->cache_unpowered_node)); + c = erts_circleq_tail(&(ma->cache_unpowered_node)); erts_circleq_remove(c); - mseg_destroy(mk->ma, mk, c->seg, c->size); + mseg_destroy(ma, ERTS_MSEG_FLG_NONE, c->seg, c->size); mseg_cache_clear_node(c); c->seg = seg; c->size = size; - erts_circleq_push_head(&(mk->cache_unpowered_node), c); + erts_circleq_push_head(&(ma->cache_unpowered_node), c); return 1; } else if (!MSEG_FLG_IS_2POW(flags)) { @@ -593,19 +434,20 @@ static ERTS_INLINE int cache_bless_segment(MemKind *mk, void *seg, Uint size, Ui int i; for( i = 0; i < CACHE_AREAS; i++) { - if (erts_circleq_is_empty(&(mk->cache_powered_node[i]))) + if (erts_circleq_is_empty(&(ma->cache_powered_node[i]))) continue; - c = erts_circleq_tail(&(mk->cache_powered_node[i])); + c = erts_circleq_tail(&(ma->cache_powered_node[i])); erts_circleq_remove(c); - mseg_destroy(mk->ma, mk, c->seg, c->size); + mseg_destroy(ma, ERTS_MSEG_FLG_2POW, c->seg, c->size); + mseg_cache_clear_node(c); c->seg = seg; c->size = size; - erts_circleq_push_head(&(mk->cache_unpowered_node), c); + erts_circleq_push_head(&(ma->cache_unpowered_node), c); return 1; } @@ -614,61 +456,60 @@ static ERTS_INLINE int cache_bless_segment(MemKind *mk, void *seg, Uint size, Ui return 0; } -static ERTS_INLINE void *cache_get_segment(MemKind *mk, Uint *size_p, Uint flags) { +static ERTS_INLINE void *cache_get_segment(ErtsMsegAllctr_t *ma, UWord *size_p, Uint flags) { - Uint size = *size_p; + UWord size = *size_p; - ERTS_DBG_MK_CHK_THR_ACCESS(mk); + ERTS_DBG_MA_CHK_THR_ACCESS(ma); if (MSEG_FLG_IS_2POW(flags)) { int i, ix = SIZE_TO_CACHE_AREA_IDX(size); - void *seg; + char *seg; cache_t *c; - Uint csize; + UWord csize; ASSERT(IS_2POW(size)); for( i = ix; i < CACHE_AREAS; i++) { - if (erts_circleq_is_empty(&(mk->cache_powered_node[i]))) + if (erts_circleq_is_empty(&(ma->cache_powered_node[i]))) continue; - c = erts_circleq_head(&(mk->cache_powered_node[i])); + c = erts_circleq_head(&(ma->cache_powered_node[i])); erts_circleq_remove(c); ASSERT(IS_2POW(c->size)); ASSERT(MAP_IS_ALIGNED(c->seg)); csize = c->size; - seg = c->seg; + seg = (char*) c->seg; - mk->cache_size--; - mk->cache_hits++; + ma->cache_size--; + ma->cache_hits++; /* link to free cache list */ mseg_cache_clear_node(c); - erts_circleq_push_head(&(mk->cache_free), c); + erts_circleq_push_head(&(ma->cache_free), c); - ASSERT(!(mk->cache_size < 0)); + ASSERT(!(ma->cache_size < 0)); - if (csize != size) { - mseg_destroy(mk->ma, mk, (char *)seg + size, csize - size); - } + if (csize != size) + mseg_destroy(ma, ERTS_MSEG_FLG_2POW, seg + size, csize - size); return seg; } } - else if (!erts_circleq_is_empty(&(mk->cache_unpowered_node))) { + else if (!erts_circleq_is_empty(&(ma->cache_unpowered_node))) { void *seg; cache_t *c; cache_t *best = NULL; - Uint bdiff = 0; - Uint csize; - Uint bad_max_abs = mk->ma->abs_max_cache_bad_fit; - Uint bad_max_rel = mk->ma->rel_max_cache_bad_fit; + UWord bdiff = 0; + UWord csize; + UWord bad_max_abs = ma->abs_max_cache_bad_fit; + UWord bad_max_rel = ma->rel_max_cache_bad_fit; - erts_circleq_foreach(c, &(mk->cache_unpowered_node)) { + erts_circleq_foreach(c, &(ma->cache_unpowered_node)) { csize = c->size; if (csize >= size) { if (((csize - size)*100 < bad_max_rel*size) && (csize - size) < bad_max_abs ) { @@ -677,11 +518,11 @@ static ERTS_INLINE void *cache_get_segment(MemKind *mk, Uint *size_p, Uint flags erts_circleq_remove(c); - mk->cache_size--; - mk->cache_hits++; + ma->cache_size--; + ma->cache_hits++; mseg_cache_clear_node(c); - erts_circleq_push_head(&(mk->cache_free), c); + erts_circleq_push_head(&(ma->cache_free), c); *size_p = csize; @@ -704,7 +545,7 @@ static ERTS_INLINE void *cache_get_segment(MemKind *mk, Uint *size_p, Uint flags ASSERT(best->seg); ASSERT(best->size > 0); - mk->cache_hits++; + ma->cache_hits++; /* Use current cache placement for remaining segment space */ @@ -728,7 +569,7 @@ static ERTS_INLINE void *cache_get_segment(MemKind *mk, Uint *size_p, Uint flags * using callbacks from aux-work in the scheduler. */ -static ERTS_INLINE Uint mseg_drop_one_memkind_cache_size(MemKind *mk, cache_t *head) { +static ERTS_INLINE Uint mseg_drop_one_cache_size(ErtsMsegAllctr_t *ma, Uint flags, cache_t *head) { cache_t *c = NULL; c = erts_circleq_tail(head); @@ -737,19 +578,19 @@ static ERTS_INLINE Uint mseg_drop_one_memkind_cache_size(MemKind *mk, cache_t *h if (erts_mtrace_enabled) erts_mtrace_crr_free(SEGTYPE, SEGTYPE, c->seg); - mseg_destroy(mk->ma, mk, c->seg, c->size); + mseg_destroy(ma, flags, c->seg, c->size); mseg_cache_clear_node(c); - erts_circleq_push_head(&(mk->cache_free), c); + erts_circleq_push_head(&(ma->cache_free), c); - mk->segments.current.watermark--; - mk->cache_size--; + ma->segments.current.watermark--; + ma->cache_size--; - ASSERT( mk->cache_size >= 0 ); + ASSERT(ma->cache_size >= 0); - return mk->cache_size; + return ma->cache_size; } -static ERTS_INLINE Uint mseg_drop_memkind_cache_size(MemKind *mk, cache_t *head) { +static ERTS_INLINE Uint mseg_drop_cache_size(ErtsMsegAllctr_t *ma, Uint flags, cache_t *head) { cache_t *c = NULL; while (!erts_circleq_is_empty(head)) { @@ -760,58 +601,52 @@ static ERTS_INLINE Uint mseg_drop_memkind_cache_size(MemKind *mk, cache_t *head) if (erts_mtrace_enabled) erts_mtrace_crr_free(SEGTYPE, SEGTYPE, c->seg); - mseg_destroy(mk->ma, mk, c->seg, c->size); + mseg_destroy(ma, flags, c->seg, c->size); mseg_cache_clear_node(c); - erts_circleq_push_head(&(mk->cache_free), c); - - mk->segments.current.watermark--; - mk->cache_size--; + erts_circleq_push_head(&(ma->cache_free), c); + ma->segments.current.watermark--; + ma->cache_size--; } - ASSERT( mk->cache_size >= 0 ); + ASSERT(ma->cache_size >= 0); - return mk->cache_size; + return ma->cache_size; } -/* mseg_check_memkind_cache - * - Check if we can empty some cached segments in this - * MemKind. +/* mseg_check_cache + * - Check if we can empty some cached segments in this allocator */ -static Uint mseg_check_memkind_cache(MemKind *mk) { +static Uint mseg_check_cache(ErtsMsegAllctr_t *ma) { int i; - ERTS_DBG_MK_CHK_THR_ACCESS(mk); + ERTS_DBG_MA_CHK_THR_ACCESS(ma); for (i = 0; i < CACHE_AREAS; i++) { - if (!erts_circleq_is_empty(&(mk->cache_powered_node[i]))) - return mseg_drop_one_memkind_cache_size(mk, &(mk->cache_powered_node[i])); + if (!erts_circleq_is_empty(&(ma->cache_powered_node[i]))) + return mseg_drop_one_cache_size(ma, ERTS_MSEG_FLG_2POW, &(ma->cache_powered_node[i])); } - if (!erts_circleq_is_empty(&(mk->cache_unpowered_node))) - return mseg_drop_one_memkind_cache_size(mk, &(mk->cache_unpowered_node)); + if (!erts_circleq_is_empty(&(ma->cache_unpowered_node))) + return mseg_drop_one_cache_size(ma, ERTS_MSEG_FLG_NONE, &(ma->cache_unpowered_node)); return 0; } /* mseg_cache_check * - Check if we have some cache we can purge - * in any of the memkinds. */ static void mseg_cache_check(ErtsMsegAllctr_t *ma) { - MemKind* mk; Uint empty_cache = 1; ERTS_MSEG_LOCK(ma); - for (mk = ma->mk_list; mk; mk = mk->next) { - if (mseg_check_memkind_cache(mk)) - empty_cache = 0; - } + if (mseg_check_cache(ma)) + empty_cache = 0; /* If all MemKinds caches are empty, * remove aux-work callback @@ -829,7 +664,7 @@ static void mseg_cache_check(ErtsMsegAllctr_t *ma) { /* erts_mseg_cache_check * - This is a callback that is scheduled as aux-work from * schedulers and is called at some interval if we have a cache - * on this mseg-allocator and memkind. + * on this mseg-allocator. * - Purpose: Empty cache slowly so we don't collect mapped areas * and bloat memory. */ @@ -839,42 +674,32 @@ void erts_mseg_cache_check(void) { } -/* *_mseg_clear_*_cache +/* mseg_clear_cache * Remove cached segments from the allocator completely */ -static void mseg_clear_memkind_cache(MemKind *mk) { + +static void mseg_clear_cache(ErtsMsegAllctr_t *ma) { int i; + ERTS_MSEG_LOCK(ma); + ERTS_DBG_MA_CHK_THR_ACCESS(ma); /* drop pow2 caches */ for (i = 0; i < CACHE_AREAS; i++) { - if (erts_circleq_is_empty(&(mk->cache_powered_node[i]))) + if (erts_circleq_is_empty(&(ma->cache_powered_node[i]))) continue; - mseg_drop_memkind_cache_size(mk, &(mk->cache_powered_node[i])); - ASSERT(erts_circleq_is_empty(&(mk->cache_powered_node[i]))); + mseg_drop_cache_size(ma, ERTS_MSEG_FLG_2POW, &(ma->cache_powered_node[i])); + ASSERT(erts_circleq_is_empty(&(ma->cache_powered_node[i]))); } /* drop varied caches */ - if (!erts_circleq_is_empty(&(mk->cache_unpowered_node))) - mseg_drop_memkind_cache_size(mk, &(mk->cache_unpowered_node)); + if (!erts_circleq_is_empty(&(ma->cache_unpowered_node))) + mseg_drop_cache_size(ma, ERTS_MSEG_FLG_NONE, &(ma->cache_unpowered_node)); - ASSERT(erts_circleq_is_empty(&(mk->cache_unpowered_node))); - ASSERT(mk->cache_size == 0); -} - -static void mseg_clear_cache(ErtsMsegAllctr_t *ma) { - MemKind* mk; - - ERTS_MSEG_LOCK(ma); - ERTS_DBG_MA_CHK_THR_ACCESS(ma); - - - for (mk = ma->mk_list; mk; mk = mk->next) { - mseg_clear_memkind_cache(mk); - } + ASSERT(erts_circleq_is_empty(&(ma->cache_unpowered_node))); + ASSERT(ma->cache_size == 0); INC_CC(ma, clear_cache); - ERTS_MSEG_UNLOCK(ma); } @@ -883,53 +708,39 @@ void erts_mseg_clear_cache(void) { mseg_clear_cache(ERTS_MSEG_ALLCTR_IX(0)); } - - -static ERTS_INLINE MemKind* memkind(ErtsMsegAllctr_t *ma, - const ErtsMsegOpt_t *opt) -{ -#if HALFWORD_HEAP - return opt->low_mem ? &ma->low_mem : &ma->hi_mem; -#else - return &ma->the_mem; -#endif -} - static void * -mseg_alloc(ErtsMsegAllctr_t *ma, ErtsAlcType_t atype, Uint *size_p, +mseg_alloc(ErtsMsegAllctr_t *ma, ErtsAlcType_t atype, UWord *size_p, Uint flags, const ErtsMsegOpt_t *opt) { - Uint size; + UWord size; void *seg; - MemKind* mk = memkind(ma, opt); INC_CC(ma, alloc); - /* Carrier align */ - size = ALIGNED_CEILING(*size_p); - - /* Cache optim (if applicable) */ - if (MSEG_FLG_IS_2POW(flags) && !IS_2POW(size)) - size = ceil_2pow(size); + if (!MSEG_FLG_IS_2POW(flags)) + size = ERTS_PAGEALIGNED_CEILING(*size_p); + else { + size = ALIGNED_CEILING(*size_p); + if (!IS_2POW(size)) { + /* Cache optim (if applicable) */ + size = ceil_2pow(size); + } + } -#if CAN_PARTLY_DESTROY - if (size < ma->min_seg_size) - ma->min_seg_size = size; -#endif - - if (opt->cache && mk->cache_size > 0 && (seg = cache_get_segment(mk, &size, flags)) != NULL) + if (opt->cache && ma->cache_size > 0 && (seg = cache_get_segment(ma, &size, flags)) != NULL) goto done; - if ((seg = mseg_create(ma, mk, size)) == NULL) - size = 0; + seg = mseg_create(ma, flags, &size); + if (!seg) + *size_p = 0; + else { done: - *size_p = size; - if (seg) { + *size_p = size; if (erts_mtrace_enabled) erts_mtrace_crr_alloc(seg, atype, ERTS_MTRACE_SEGMENT_ID, size); - ERTS_MSEG_ALLOC_STAT(mk,size); + ERTS_MSEG_ALLOC_STAT(ma,size); } return seg; @@ -937,14 +748,12 @@ done: static void -mseg_dealloc(ErtsMsegAllctr_t *ma, ErtsAlcType_t atype, void *seg, Uint size, +mseg_dealloc(ErtsMsegAllctr_t *ma, ErtsAlcType_t atype, void *seg, UWord size, Uint flags, const ErtsMsegOpt_t *opt) { - MemKind* mk = memkind(ma, opt); - - ERTS_MSEG_DEALLOC_STAT(mk,size); + ERTS_MSEG_DEALLOC_STAT(ma,size); - if (opt->cache && cache_bless_segment(mk, seg, size, flags)) { + if (opt->cache && cache_bless_segment(ma, seg, size, flags)) { schedule_cache_check(ma); goto done; } @@ -952,7 +761,7 @@ mseg_dealloc(ErtsMsegAllctr_t *ma, ErtsAlcType_t atype, void *seg, Uint size, if (erts_mtrace_enabled) erts_mtrace_crr_free(atype, SEGTYPE, seg); - mseg_destroy(ma, mk, seg, size); + mseg_destroy(ma, flags, seg, size); done: @@ -961,11 +770,10 @@ done: static void * mseg_realloc(ErtsMsegAllctr_t *ma, ErtsAlcType_t atype, void *seg, - Uint old_size, Uint *new_size_p, Uint flags, const ErtsMsegOpt_t *opt) + UWord old_size, UWord *new_size_p, Uint flags, const ErtsMsegOpt_t *opt) { - MemKind* mk; void *new_seg; - Uint new_size; + UWord new_size; /* Just allocate a new segment if we didn't have one before */ if (!seg || !old_size) { @@ -982,99 +790,55 @@ mseg_realloc(ErtsMsegAllctr_t *ma, ErtsAlcType_t atype, void *seg, return NULL; } - mk = memkind(ma, opt); new_seg = seg; - /* Carrier align */ - new_size = ALIGNED_CEILING(*new_size_p); - - /* Cache optim (if applicable) */ - if (MSEG_FLG_IS_2POW(flags) && !IS_2POW(new_size)) - new_size = ceil_2pow(new_size); - - if (new_size == old_size) - ; - else if (new_size < old_size) { - Uint shrink_sz = old_size - new_size; + if (!MSEG_FLG_IS_2POW(flags)) + new_size = ERTS_PAGEALIGNED_CEILING(*new_size_p); + else { + new_size = ALIGNED_CEILING(*new_size_p); + if (!IS_2POW(new_size)) { + /* Cache optim (if applicable) */ + new_size = ceil_2pow(new_size); + } + } -#if CAN_PARTLY_DESTROY - if (new_size < ma->min_seg_size) - ma->min_seg_size = new_size; -#endif - /* +M<S>rsbcst <ratio> */ - if (shrink_sz < opt->abs_shrink_th - && 100*shrink_sz < opt->rel_shrink_th*old_size) { - new_size = old_size; + if (new_size > old_size) { + if (opt->preserv) { + new_seg = mseg_recreate(ma, flags, (void *) seg, old_size, &new_size); + if (!new_seg) + new_size = old_size; } else { - -#if CAN_PARTLY_DESTROY - - if (erts_mtrace_enabled) - erts_mtrace_crr_realloc(new_seg, atype, SEGTYPE, seg, new_size); - - mseg_destroy(ma, mk, ((char *) seg) + new_size, shrink_sz); - -#elif HAVE_MSEG_RECREATE - goto do_recreate; -#else + mseg_dealloc(ma, atype, seg, old_size, flags, opt); new_seg = mseg_alloc(ma, atype, &new_size, flags, opt); - - ASSERT(MAP_IS_ALIGNED(new_seg) || !new_seg); - if (!new_seg) - new_size = old_size; - else { - sys_memcpy(((char *) new_seg), - ((char *) seg), - MIN(new_size, old_size)); - mseg_dealloc(ma, atype, seg, old_size, flags, opt); - } -#endif + new_size = 0; } } - else { + else if (new_size < old_size) { + UWord shrink_sz = old_size - new_size; - if (!opt->preserv) { - mseg_dealloc(ma, atype, seg, old_size, flags, opt); - new_seg = mseg_alloc(ma, atype, &new_size, flags, opt); - ASSERT(MAP_IS_ALIGNED(new_seg) || !new_seg); + /* +M<S>rsbcst <ratio> */ + if (shrink_sz < opt->abs_shrink_th + && 100*shrink_sz < opt->rel_shrink_th*old_size) { + new_size = old_size; } else { -#if HAVE_MSEG_RECREATE -#if !CAN_PARTLY_DESTROY - do_recreate: -#endif - new_seg = mseg_recreate(ma, mk, (void *) seg, old_size, new_size); - /* ASSERT(MAP_IS_ALIGNED(new_seg) || !new_seg); - * will not always be aligned and it ok for now - */ - - if (erts_mtrace_enabled) - erts_mtrace_crr_realloc(new_seg, atype, SEGTYPE, seg, new_size); - if (!new_seg) - new_size = old_size; -#else - new_seg = mseg_alloc(ma, atype, &new_size, flags, opt); - - ASSERT(MAP_IS_ALIGNED(new_seg) || !new_seg); - + new_seg = mseg_recreate(ma, flags, (void *) seg, old_size, &new_size); if (!new_seg) new_size = old_size; - else { - sys_memcpy(((char *) new_seg), ((char *) seg), MIN(new_size, old_size)); - mseg_dealloc(ma, atype, seg, old_size, flags, opt); - } -#endif } } + if (erts_mtrace_enabled) + erts_mtrace_crr_realloc(new_seg, atype, SEGTYPE, seg, new_size); + INC_CC(ma, realloc); ASSERT(!MSEG_FLG_IS_2POW(flags) || IS_2POW(new_size)); *new_size_p = new_size; - ERTS_MSEG_REALLOC_STAT(mk, old_size, new_size); + ERTS_MSEG_REALLOC_STAT(ma, old_size, new_size); return new_seg; } @@ -1106,9 +870,7 @@ static struct { Eterm mseg_create; Eterm mseg_create_resize; Eterm mseg_destroy; -#if HAVE_MSEG_RECREATE Eterm mseg_recreate; -#endif Eterm mseg_clear_cache; Eterm mseg_check_cache; @@ -1129,8 +891,6 @@ init_atoms(ErtsMsegAllctr_t *ma) #ifdef DEBUG Eterm *atom; #endif - - ERTS_MSEG_UNLOCK(ma); erts_mtx_lock(&init_atoms_mutex); if (!atoms_initialized) { @@ -1163,9 +923,7 @@ init_atoms(ErtsMsegAllctr_t *ma) AM_INIT(mseg_create); AM_INIT(mseg_create_resize); AM_INIT(mseg_destroy); -#if HAVE_MSEG_RECREATE AM_INIT(mseg_recreate); -#endif AM_INIT(mseg_clear_cache); AM_INIT(mseg_check_cache); @@ -1176,7 +934,6 @@ init_atoms(ErtsMsegAllctr_t *ma) #endif } - ERTS_MSEG_LOCK(ma); atoms_initialized = 1; erts_mtx_unlock(&init_atoms_mutex); } @@ -1239,7 +996,7 @@ info_options(ErtsMsegAllctr_t *ma, Uint **hpp, Uint *szp) { - Eterm res = THE_NON_VALUE; + Eterm res = NIL; if (print_to_p) { int to = *print_to_p; @@ -1254,7 +1011,6 @@ info_options(ErtsMsegAllctr_t *ma, if (!atoms_initialized) init_atoms(ma); - res = NIL; add_2tup(hpp, szp, &res, am.mcs, bld_uint(hpp, szp, ma->max_cache_size)); @@ -1293,9 +1049,7 @@ info_calls(ErtsMsegAllctr_t *ma, int *print_to_p, void *print_to_arg, Uint **hpp PRINT_CC(to, arg, create); PRINT_CC(to, arg, create_resize); PRINT_CC(to, arg, destroy); -#if HAVE_MSEG_RECREATE PRINT_CC(to, arg, recreate); -#endif PRINT_CC(to, arg, clear_cache); PRINT_CC(to, arg, check_cache); @@ -1316,12 +1070,10 @@ info_calls(ErtsMsegAllctr_t *ma, int *print_to_p, void *print_to_arg, Uint **hpp bld_unstable_uint(hpp, szp, ma->calls.clear_cache.giga_no), bld_unstable_uint(hpp, szp, ma->calls.clear_cache.no)); -#if HAVE_MSEG_RECREATE add_3tup(hpp, szp, &res, am.mseg_recreate, bld_unstable_uint(hpp, szp, ma->calls.recreate.giga_no), bld_unstable_uint(hpp, szp, ma->calls.recreate.no)); -#endif add_3tup(hpp, szp, &res, am.mseg_destroy, bld_unstable_uint(hpp, szp, ma->calls.destroy.giga_no), @@ -1354,88 +1106,95 @@ info_calls(ErtsMsegAllctr_t *ma, int *print_to_p, void *print_to_arg, Uint **hpp } static Eterm -info_status(ErtsMsegAllctr_t *ma, MemKind* mk, int *print_to_p, void *print_to_arg, - int begin_new_max_period, Uint **hpp, Uint *szp) +info_status(ErtsMsegAllctr_t *ma, int *print_to_p, void *print_to_arg, + int begin_new_max_period, int only_sz, Uint **hpp, Uint *szp) { Eterm res = THE_NON_VALUE; - if (mk->segments.max_ever.no < mk->segments.max.no) - mk->segments.max_ever.no = mk->segments.max.no; - if (mk->segments.max_ever.sz < mk->segments.max.sz) - mk->segments.max_ever.sz = mk->segments.max.sz; + if (ma->segments.max_ever.no < ma->segments.max.no) + ma->segments.max_ever.no = ma->segments.max.no; + if (ma->segments.max_ever.sz < ma->segments.max.sz) + ma->segments.max_ever.sz = ma->segments.max.sz; if (print_to_p) { int to = *print_to_p; void *arg = print_to_arg; - erts_print(to, arg, "cached_segments: %beu\n", mk->cache_size); - erts_print(to, arg, "cache_hits: %beu\n", mk->cache_hits); - erts_print(to, arg, "segments: %beu %beu %beu\n", - mk->segments.current.no, mk->segments.max.no, mk->segments.max_ever.no); - erts_print(to, arg, "segments_size: %beu %beu %beu\n", - mk->segments.current.sz, mk->segments.max.sz, mk->segments.max_ever.sz); - erts_print(to, arg, "segments_watermark: %beu\n", - mk->segments.current.watermark); + if (!only_sz) { + erts_print(to, arg, "cached_segments: %beu\n", ma->cache_size); + erts_print(to, arg, "cache_hits: %beu\n", ma->cache_hits); + erts_print(to, arg, "segments: %beu %beu %beu\n", + ma->segments.current.no, ma->segments.max.no, ma->segments.max_ever.no); + erts_print(to, arg, "segments_watermark: %beu\n", + ma->segments.current.watermark); + } + erts_print(to, arg, "segments_size: %beu %beu %beu\n", + ma->segments.current.sz, ma->segments.max.sz, ma->segments.max_ever.sz); } if (hpp || szp) { res = NIL; - add_2tup(hpp, szp, &res, - am.segments_watermark, - bld_unstable_uint(hpp, szp, mk->segments.current.watermark)); - add_4tup(hpp, szp, &res, - am.segments_size, - bld_unstable_uint(hpp, szp, mk->segments.current.sz), - bld_unstable_uint(hpp, szp, mk->segments.max.sz), - bld_unstable_uint(hpp, szp, mk->segments.max_ever.sz)); - add_4tup(hpp, szp, &res, - am.segments, - bld_unstable_uint(hpp, szp, mk->segments.current.no), - bld_unstable_uint(hpp, szp, mk->segments.max.no), - bld_unstable_uint(hpp, szp, mk->segments.max_ever.no)); - add_2tup(hpp, szp, &res, - am.cache_hits, - bld_unstable_uint(hpp, szp, mk->cache_hits)); - add_2tup(hpp, szp, &res, - am.cached_segments, - bld_unstable_uint(hpp, szp, mk->cache_size)); - + add_4tup(hpp, szp, &res, + am.segments_size, + bld_unstable_uint(hpp, szp, ma->segments.current.sz), + bld_unstable_uint(hpp, szp, ma->segments.max.sz), + bld_unstable_uint(hpp, szp, ma->segments.max_ever.sz)); + if (!only_sz) { + add_2tup(hpp, szp, &res, + am.segments_watermark, + bld_unstable_uint(hpp, szp, ma->segments.current.watermark)); + add_4tup(hpp, szp, &res, + am.segments, + bld_unstable_uint(hpp, szp, ma->segments.current.no), + bld_unstable_uint(hpp, szp, ma->segments.max.no), + bld_unstable_uint(hpp, szp, ma->segments.max_ever.no)); + add_2tup(hpp, szp, &res, + am.cache_hits, + bld_unstable_uint(hpp, szp, ma->cache_hits)); + add_2tup(hpp, szp, &res, + am.cached_segments, + bld_unstable_uint(hpp, szp, ma->cache_size)); + } } if (begin_new_max_period) { - mk->segments.max.no = mk->segments.current.no; - mk->segments.max.sz = mk->segments.current.sz; + ma->segments.max.no = ma->segments.current.no; + ma->segments.max.sz = ma->segments.current.sz; } return res; } -static Eterm info_memkind(ErtsMsegAllctr_t *ma, MemKind* mk, int *print_to_p, void *print_to_arg, - int begin_max_per, Uint **hpp, Uint *szp) +static Eterm info_memkind(ErtsMsegAllctr_t *ma, int *print_to_p, void *print_to_arg, + int begin_max_per, int only_sz, Uint **hpp, Uint *szp) { Eterm res = THE_NON_VALUE; Eterm atoms[3]; Eterm values[3]; - if (print_to_p) { - erts_print(*print_to_p, print_to_arg, "memory kind: %s\n", mk->name); + if (!only_sz) { + if (print_to_p) { + erts_print(*print_to_p, print_to_arg, "memory kind: %s\n", "all memory"); + } + if (hpp || szp) { + atoms[0] = am.name; + atoms[1] = am.status; + atoms[2] = am.calls; + values[0] = erts_bld_string(hpp, szp, "all memory"); + } } - if (hpp || szp) { - atoms[0] = am.name; - atoms[1] = am.status; - atoms[2] = am.calls; - values[0] = erts_bld_string(hpp, szp, mk->name); - } - values[1] = info_status(ma, mk, print_to_p, print_to_arg, begin_max_per, hpp, szp); - values[2] = info_calls(ma, print_to_p, print_to_arg, hpp, szp); + res = info_status(ma, print_to_p, print_to_arg, begin_max_per, only_sz, hpp, szp); + if (!only_sz) { + values[1] = res; + values[2] = info_calls(ma, print_to_p, print_to_arg, hpp, szp); - if (hpp || szp) - res = bld_2tup_list(hpp, szp, 3, atoms, values); + if (hpp || szp) + res = bld_2tup_list(hpp, szp, 3, atoms, values); + } return res; } - static Eterm info_version(ErtsMsegAllctr_t *ma, int *print_to_p, void *print_to_arg, Uint **hpp, Uint *szp) { @@ -1465,14 +1224,8 @@ erts_mseg_info_options(int ix, ErtsMsegAllctr_t *ma = ERTS_MSEG_ALLCTR_IX(ix); Eterm res; - ERTS_MSEG_LOCK(ma); - - ERTS_DBG_MA_CHK_THR_ACCESS(ma); - res = info_options(ma, "option ", print_to_p, print_to_arg, hpp, szp); - ERTS_MSEG_UNLOCK(ma); - return res; } @@ -1481,6 +1234,7 @@ erts_mseg_info(int ix, int *print_to_p, void *print_to_arg, int begin_max_per, + int only_sz, Uint **hpp, Uint *szp) { @@ -1490,30 +1244,30 @@ erts_mseg_info(int ix, Eterm values[4]; Uint n = 0; - ERTS_MSEG_LOCK(ma); + if (hpp || szp) { + if (!atoms_initialized) + init_atoms(ma); + } + if (!only_sz) { + if (hpp || szp) { + atoms[0] = am.version; + atoms[1] = am.options; + atoms[2] = am.memkind; + } + values[n++] = info_version(ma, print_to_p, print_to_arg, hpp, szp); + values[n++] = info_options(ma, "option ", print_to_p, print_to_arg, hpp, szp); + } + ERTS_MSEG_LOCK(ma); ERTS_DBG_MA_CHK_THR_ACCESS(ma); - if (hpp || szp) { - - if (!atoms_initialized) - init_atoms(ma); + res = info_memkind(ma, print_to_p, print_to_arg, begin_max_per, only_sz, hpp, szp); - atoms[0] = am.version; - atoms[1] = am.options; - atoms[2] = am.memkind; - atoms[3] = am.memkind; + if (!only_sz) { + values[n++] = res; + if (hpp || szp) + res = bld_2tup_list(hpp, szp, n, atoms, values); } - values[n++] = info_version(ma, print_to_p, print_to_arg, hpp, szp); - values[n++] = info_options(ma, "option ", print_to_p, print_to_arg, hpp, szp); -#if HALFWORD_HEAP - values[n++] = info_memkind(ma, &ma->low_mem, print_to_p, print_to_arg, begin_max_per, hpp, szp); - values[n++] = info_memkind(ma, &ma->hi_mem, print_to_p, print_to_arg, begin_max_per, hpp, szp); -#else - values[n++] = info_memkind(ma, &ma->the_mem, print_to_p, print_to_arg, begin_max_per, hpp, szp); -#endif - if (hpp || szp) - res = bld_2tup_list(hpp, szp, n, atoms, values); ERTS_MSEG_UNLOCK(ma); @@ -1521,7 +1275,7 @@ erts_mseg_info(int ix, } void * -erts_mseg_alloc_opt(ErtsAlcType_t atype, Uint *size_p, Uint flags, const ErtsMsegOpt_t *opt) +erts_mseg_alloc_opt(ErtsAlcType_t atype, UWord *size_p, Uint flags, const ErtsMsegOpt_t *opt) { ErtsMsegAllctr_t *ma = ERTS_MSEG_ALLCTR_OPT(opt); void *seg; @@ -1529,20 +1283,23 @@ erts_mseg_alloc_opt(ErtsAlcType_t atype, Uint *size_p, Uint flags, const ErtsMse ERTS_DBG_MA_CHK_THR_ACCESS(ma); seg = mseg_alloc(ma, atype, size_p, flags, opt); ERTS_MSEG_UNLOCK(ma); + HARD_DBG_INSERT_MSEG(seg, *size_p); return seg; } void * -erts_mseg_alloc(ErtsAlcType_t atype, Uint *size_p, Uint flags) +erts_mseg_alloc(ErtsAlcType_t atype, UWord *size_p, Uint flags) { return erts_mseg_alloc_opt(atype, size_p, flags, &erts_mseg_default_opt); } void erts_mseg_dealloc_opt(ErtsAlcType_t atype, void *seg, - Uint size, Uint flags, const ErtsMsegOpt_t *opt) + UWord size, Uint flags, const ErtsMsegOpt_t *opt) { ErtsMsegAllctr_t *ma = ERTS_MSEG_ALLCTR_OPT(opt); + + HARD_DBG_REMOVE_MSEG(seg, size); ERTS_MSEG_LOCK(ma); ERTS_DBG_MA_CHK_THR_ACCESS(ma); mseg_dealloc(ma, atype, seg, size, flags, opt); @@ -1550,29 +1307,32 @@ erts_mseg_dealloc_opt(ErtsAlcType_t atype, void *seg, } void -erts_mseg_dealloc(ErtsAlcType_t atype, void *seg, Uint size, Uint flags) +erts_mseg_dealloc(ErtsAlcType_t atype, void *seg, UWord size, Uint flags) { erts_mseg_dealloc_opt(atype, seg, size, flags, &erts_mseg_default_opt); } void * erts_mseg_realloc_opt(ErtsAlcType_t atype, void *seg, - Uint old_size, Uint *new_size_p, + UWord old_size, UWord *new_size_p, Uint flags, const ErtsMsegOpt_t *opt) { ErtsMsegAllctr_t *ma = ERTS_MSEG_ALLCTR_OPT(opt); void *new_seg; + + HARD_DBG_REMOVE_MSEG(seg, old_size); ERTS_MSEG_LOCK(ma); ERTS_DBG_MA_CHK_THR_ACCESS(ma); new_seg = mseg_realloc(ma, atype, seg, old_size, new_size_p, flags, opt); ERTS_MSEG_UNLOCK(ma); + HARD_DBG_INSERT_MSEG(new_seg, *new_size_p); return new_seg; } void * erts_mseg_realloc(ErtsAlcType_t atype, void *seg, - Uint old_size, Uint *new_size_p, Uint flags) + UWord old_size, UWord *new_size_p, Uint flags) { return erts_mseg_realloc_opt(atype, seg, old_size, new_size_p, flags, &erts_mseg_default_opt); @@ -1582,13 +1342,10 @@ Uint erts_mseg_no(const ErtsMsegOpt_t *opt) { ErtsMsegAllctr_t *ma = ERTS_MSEG_ALLCTR_OPT(opt); - MemKind* mk; - Uint n = 0; + Uint n; ERTS_MSEG_LOCK(ma); ERTS_DBG_MA_CHK_THR_ACCESS(ma); - for (mk=ma->mk_list; mk; mk=mk->next) { - n += mk->segments.current.no; - } + n = ma->segments.current.no; ERTS_MSEG_UNLOCK(ma); return n; } @@ -1600,16 +1357,16 @@ erts_mseg_unit_size(void) } -static void mem_kind_init(ErtsMsegAllctr_t *ma, MemKind* mk, const char* name) +static void mem_cache_init(ErtsMsegAllctr_t *ma) { int i; /* Clear all cache headers */ - mseg_cache_clear_node(&(mk->cache_free)); - mseg_cache_clear_node(&(mk->cache_unpowered_node)); + mseg_cache_clear_node(&(ma->cache_free)); + mseg_cache_clear_node(&(ma->cache_unpowered_node)); for (i = 0; i < CACHE_AREAS; i++) { - mseg_cache_clear_node(&(mk->cache_powered_node[i])); + mseg_cache_clear_node(&(ma->cache_powered_node[i])); } /* Populate cache free list */ @@ -1617,25 +1374,20 @@ static void mem_kind_init(ErtsMsegAllctr_t *ma, MemKind* mk, const char* name) ASSERT(ma->max_cache_size <= MAX_CACHE_SIZE); for (i = 0; i < ma->max_cache_size; i++) { - mseg_cache_clear_node(&(mk->cache[i])); - erts_circleq_push_head(&(mk->cache_free), &(mk->cache[i])); + mseg_cache_clear_node(&(ma->cache[i])); + erts_circleq_push_head(&(ma->cache_free), &(ma->cache[i])); } - mk->cache_size = 0; - mk->cache_hits = 0; - - mk->segments.current.watermark = 0; - mk->segments.current.no = 0; - mk->segments.current.sz = 0; - mk->segments.max.no = 0; - mk->segments.max.sz = 0; - mk->segments.max_ever.no = 0; - mk->segments.max_ever.sz = 0; - - mk->ma = ma; - mk->name = name; - mk->next = ma->mk_list; - ma->mk_list = mk; + ma->cache_size = 0; + ma->cache_hits = 0; + + ma->segments.current.watermark = 0; + ma->segments.current.no = 0; + ma->segments.current.sz = 0; + ma->segments.max.no = 0; + ma->segments.max.sz = 0; + ma->segments.max_ever.no = 0; + ma->segments.max_ever.sz = 0; } void @@ -1662,18 +1414,19 @@ erts_mseg_init(ErtsMsegInit_t *init) erts_mtx_init(&init_atoms_mutex, "mseg_init_atoms"); -#if HAVE_MMAP && !defined(MAP_ANON) - mmap_fd = open("/dev/zero", O_RDWR); - if (mmap_fd < 0) - erl_exit(ERTS_ABORT_EXIT, "erts_mseg: unable to open /dev/zero\n"); +#ifdef ERTS_ALC_A_EXEC + /* Initialize erts_exec_mapper *FIRST*, to increase probability + * of getting low memory for HiPE AMD64's small code model. + */ + erts_mmap_init(&erts_exec_mmapper, &init->exec_mmap, 1); #endif - -#if HAVE_MMAP && HALFWORD_HEAP - initialize_pmmap(); + erts_mmap_init(&erts_dflt_mmapper, &init->dflt_mmap, 0); +#if defined(ARCH_64) && defined(ERTS_HAVE_OS_PHYSICAL_MEMORY_RESERVATION) + erts_mmap_init(&erts_literal_mmapper, &init->literal_mmap, 0); #endif if (!IS_2POW(GET_PAGE_SIZE)) - erl_exit(ERTS_ABORT_EXIT, "erts_mseg: Unexpected page_size %beu\n", GET_PAGE_SIZE); + erts_exit(ERTS_ABORT_EXIT, "erts_mseg: Unexpected page_size %beu\n", GET_PAGE_SIZE); ASSERT((MSEG_ALIGNED_SIZE % GET_PAGE_SIZE) == 0); @@ -1702,33 +1455,17 @@ erts_mseg_init(ErtsMsegInit_t *init) if (ma->max_cache_size > MAX_CACHE_SIZE) ma->max_cache_size = MAX_CACHE_SIZE; - ma->mk_list = NULL; - -#if HALFWORD_HEAP - mem_kind_init(ma, &ma->low_mem, "low memory"); - mem_kind_init(ma, &ma->hi_mem, "high memory"); -#else - mem_kind_init(ma, &ma->the_mem, "all memory"); -#endif + mem_cache_init(ma); sys_memzero((void *) &ma->calls, sizeof(ErtsMsegCalls)); - -#if CAN_PARTLY_DESTROY - ma->min_seg_size = ~((Uint) 0); -#endif } } static ERTS_INLINE Uint tot_cache_size(ErtsMsegAllctr_t *ma) { - MemKind* mk; - Uint sz = 0; ERTS_DBG_MA_CHK_THR_ACCESS(ma); - for (mk=ma->mk_list; mk; mk=mk->next) { - sz += mk->cache_size; - } - return sz; + return ma->cache_size; } /* @@ -1757,7 +1494,7 @@ erts_mseg_test(UWord op, UWord a1, UWord a2, UWord a3) case 0x400: /* Have erts_mseg */ return (UWord) 1; case 0x401: - return (UWord) erts_mseg_alloc(ERTS_ALC_A_INVALID, (Uint *) a1, (Uint) 0); + return (UWord) erts_mseg_alloc(ERTS_ALC_A_INVALID, (UWord *) a1, (Uint) 0); case 0x402: erts_mseg_dealloc(ERTS_ALC_A_INVALID, (void *) a1, (Uint) a2, (Uint) 0); return (UWord) 0; @@ -1765,7 +1502,7 @@ erts_mseg_test(UWord op, UWord a1, UWord a2, UWord a3) return (UWord) erts_mseg_realloc(ERTS_ALC_A_INVALID, (void *) a1, (Uint) a2, - (Uint *) a3, + (UWord *) a3, (Uint) 0); case 0x404: erts_mseg_clear_cache(); @@ -1788,405 +1525,3 @@ erts_mseg_test(UWord op, UWord a1, UWord a2, UWord a3) } } - - -#if HALFWORD_HEAP -/* - * Very simple page oriented mmap replacer. Works in the lower - * 32 bit address range of a 64bit program. - * Implements anonymous mmap mremap and munmap with address order first fit. - * The free list is expected to be very short... - * To be used for compressed pointers in Erlang halfword emulator - * implementation. The MacOS X version is more of a toy, it's not really - * for production as the halfword erlang VM relies on Linux specific memory - * mapping tricks. - */ - -/* #define HARDDEBUG 1 */ - -#ifdef HARDDEBUG -static void dump_freelist(void) -{ - FreeBlock *p = first; - - while (p) { - fprintf(stderr, "p = %p\r\np->num = %ld\r\np->next = %p\r\n\r\n", - (void *) p, (unsigned long) p->num, (void *) p->next); - p = p->next; - } -} - -#define HARDDEBUG_HW_INCOMPLETE_ALIGNMENT(PTR, SZ) \ - fprintf(stderr,"Mapping of address %p with size %ld " \ - "does not map complete pages (%s:%d)\r\n", \ - (void *) (PTR), (unsigned long) (SZ),__FILE__, __LINE__) - -#define HARDDEBUG_HW_UNALIGNED_ALIGNMENT(PTR, SZ) \ - fprintf(stderr,"Mapping of address %p with size %ld " \ - "is not page aligned (%s:%d)\r\n", \ - (void *) (PTR), (unsigned long) (SZ),__FILE__, __LINE__) - -#define HARDDEBUG_MAP_FAILED(PTR, SZ) \ - fprintf(stderr, "Could not actually map memory " \ - "at address %p with size %ld (%s:%d) ..\r\n", \ - (void *) (PTR), (unsigned long) (SZ),__FILE__, __LINE__) -#else -#define HARDDEBUG_HW_INCOMPLETE_ALIGNMENT(PTR, SZ) do{}while(0) -#define HARDDEBUG_HW_UNALIGNED_ALIGNMENT(PTR, SZ) do{}while(0) -#define HARDDEBUG_MAP_FAILED(PTR, SZ) do{}while(0) -#endif - - -#ifdef __APPLE__ -#define MAP_ANONYMOUS MAP_ANON -#endif - -#define INIT_LOCK() do {erts_mtx_init(&pmmap_mutex, "pmmap");} while(0) - -#define TAKE_LOCK() do {erts_mtx_lock(&pmmap_mutex);} while(0) - -#define RELEASE_LOCK() do {erts_mtx_unlock(&pmmap_mutex);} while(0) - -static erts_mtx_t pmmap_mutex; /* Also needed when !USE_THREADS */ - -typedef struct _free_block { - unsigned long num; /*pages*/ - struct _free_block *next; -} FreeBlock; - -/* Protect with lock */ -static FreeBlock *first; - -static void *do_map(void *ptr, size_t sz) -{ - void *res; - - if (ALIGNED_CEILING(sz) != sz) { - HARDDEBUG_HW_INCOMPLETE_ALIGNMENT(ptr, sz); - return NULL; - } - - if (((unsigned long) ptr) % MSEG_ALIGNED_SIZE) { - HARDDEBUG_HW_UNALIGNED_ALIGNMENT(ptr, sz); - return NULL; - } - -#if HAVE_MMAP - res = mmap(ptr, sz, - PROT_READ | PROT_WRITE, MAP_PRIVATE | - MAP_ANONYMOUS | MAP_FIXED, - -1 , 0); -#else -# error "Missing mmap support" -#endif - - if (res == MAP_FAILED) { - HARDDEBUG_MAP_FAILED(ptr, sz); - return NULL; - } - - return res; -} - -static int do_unmap(void *ptr, size_t sz) -{ - void *res; - - if (ALIGNED_CEILING(sz) != sz) { - HARDDEBUG_HW_INCOMPLETE_ALIGNMENT(ptr, sz); - return 1; - } - - if (((unsigned long) ptr) % MSEG_ALIGNED_SIZE) { - HARDDEBUG_HW_UNALIGNED_ALIGNMENT(ptr, sz); - return 1; - } - - res = mmap(ptr, sz, - PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE | MAP_FIXED, - -1 , 0); - - if (res == MAP_FAILED) { - HARDDEBUG_MAP_FAILED(ptr, sz); - return 1; - } - - return 0; -} - -#ifdef __APPLE__ -/* - * The first 4 gig's are protected on Macos X for 64bit processes :( - * The range 0x1000000000 - 0x10FFFFFFFF is selected as an arbitrary - * value of a normally unused range... Real MMAP's will avoid - * it and all 32bit compressed pointers can be in that range... - * More expensive than on Linux where expansion of compressed - * poiters involves no masking (as they are in the first 4 gig's). - * It's also very uncertain if the MAP_NORESERVE flag really has - * any effect in MacOS X. Swap space may always be allocated... - */ -#define SET_RANGE_MIN() /* nothing */ -#define RANGE_MIN 0x1000000000UL -#define RANGE_MAX 0x1100000000UL -#define RANGE_MASK (RANGE_MIN) -#define EXTRA_MAP_FLAGS (MAP_FIXED) -#else -static size_t range_min; -#define SET_RANGE_MIN() do { range_min = (size_t) sbrk(0); } while (0) -#define RANGE_MIN range_min -#define RANGE_MAX 0x100000000UL -#define RANGE_MASK 0UL -#define EXTRA_MAP_FLAGS (0) -#endif - -static int initialize_pmmap(void) -{ - char *p,*q,*rptr; - size_t rsz; - FreeBlock *initial; - - SET_RANGE_MIN(); - if (sizeof(void *) != 8) { - erl_exit(1,"Halfword emulator cannot be run in 32bit mode"); - } - - p = (char *) RANGE_MIN; - q = (char *) RANGE_MAX; - - rsz = ALIGNED_FLOOR(q - p); - - rptr = mmap_align(NULL, (void *) p, rsz, - PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS | - MAP_NORESERVE | EXTRA_MAP_FLAGS, - -1 , 0); -#ifdef HARDDEBUG - printf("p=%p, rsz = %ld, pages = %ld, got range = %p -> %p\r\n", - p, (unsigned long) rsz, (unsigned long) (rsz / MSEG_ALIGNED_SIZE), - (void *) rptr, (void*)(rptr + rsz)); -#endif - if ((UWord)(rptr + rsz) > RANGE_MAX) { - size_t rsz_trunc = RANGE_MAX - (UWord)rptr; -#ifdef HARDDEBUG - printf("Reducing mmap'ed memory from %lu to %lu Mb, reduced range = %p -> %p\r\n", - rsz/(1024*1024), rsz_trunc/(1024*1024), rptr, rptr+rsz_trunc); -#endif - munmap((void*)RANGE_MAX, rsz - rsz_trunc); - rsz = rsz_trunc; - } - if (!do_map(rptr, MSEG_ALIGNED_SIZE)) { - erl_exit(1,"Could not actually mmap first page for halfword emulator...\n"); - } - initial = (FreeBlock *) rptr; - initial->num = (rsz / MSEG_ALIGNED_SIZE); - initial->next = NULL; - first = initial; - INIT_LOCK(); - return 0; -} - -static void *pmmap(size_t size) -{ - size_t real_size = ALIGNED_CEILING(size); - size_t num_pages = real_size / MSEG_ALIGNED_SIZE; - FreeBlock **block; - FreeBlock *tail; - FreeBlock *res; - - TAKE_LOCK(); - - for (block = &first; - *block != NULL && (*block)->num < num_pages; - block = &((*block)->next)) - ; - if (!(*block)) { - RELEASE_LOCK(); - return NULL; - } - if ((*block)->num == num_pages) { - /* nice, perfect fit */ - res = *block; - *block = (*block)->next; - } else { - tail = (FreeBlock *) (((char *) ((void *) (*block))) + real_size); - if (!do_map(tail, MSEG_ALIGNED_SIZE)) { - HARDDEBUG_MAP_FAILED(tail, MSEG_ALIGNED_SIZE); - RELEASE_LOCK(); - return NULL; - } - tail->num = (*block)->num - num_pages; - tail->next = (*block)->next; - res = *block; - *block = tail; - } - - RELEASE_LOCK(); - - if (!do_map(res, real_size)) { - HARDDEBUG_MAP_FAILED(res, real_size); - return NULL; - } - - return (void *) res; -} - -static int pmunmap(void *p, size_t size) -{ - size_t real_size = ALIGNED_CEILING(size); - size_t num_pages = real_size / MSEG_ALIGNED_SIZE; - - FreeBlock *block; - FreeBlock *last; - FreeBlock *nb = (FreeBlock *) p; - - ASSERT(((unsigned long)p & CHECK_POINTER_MASK)==0); - - if (real_size > MSEG_ALIGNED_SIZE) { - if (do_unmap(((char *) p) + MSEG_ALIGNED_SIZE, real_size - MSEG_ALIGNED_SIZE)) { - return 1; - } - } - - TAKE_LOCK(); - - last = NULL; - block = first; - while(block != NULL && ((void *) block) < p) { - last = block; - block = block->next; - } - - if (block != NULL && - ((void *) block) == ((void *) (((char *) p) + real_size))) { - /* Merge new free block with following */ - nb->num = block->num + num_pages; - nb->next = block->next; - if (do_unmap(block, MSEG_ALIGNED_SIZE)) { - RELEASE_LOCK(); - return 1; - } - } else { - /* just link in */ - nb->num = num_pages; - nb->next = block; - } - if (last != NULL) { - if (p == ((void *) (((char *) last) + (last->num * MSEG_ALIGNED_SIZE)))) { - /* Merge with previous */ - last->num += nb->num; - last->next = nb->next; - if (do_unmap(nb, MSEG_ALIGNED_SIZE)) { - RELEASE_LOCK(); - return 1; - } - } else { - last->next = nb; - } - } else { - first = nb; - } - RELEASE_LOCK(); - return 0; -} - -static void *pmremap(void *old_address, size_t old_size, - size_t new_size) -{ - size_t new_real_size = ALIGNED_CEILING(new_size); - size_t new_num_pages = new_real_size / MSEG_ALIGNED_SIZE; - size_t old_real_size = ALIGNED_CEILING(old_size); - size_t old_num_pages = old_real_size / MSEG_ALIGNED_SIZE; - if (new_num_pages == old_num_pages) { - return old_address; - } else if (new_num_pages < old_num_pages) { /* Shrink */ - size_t nfb_pages = old_num_pages - new_num_pages; - size_t nfb_real_size = old_real_size - new_real_size; - void *vnfb = (void *) (((char *)old_address) + new_real_size); - FreeBlock *nfb = (FreeBlock *) vnfb; - FreeBlock **block; - TAKE_LOCK(); - for (block = &first; - *block != NULL && (*block) < nfb; - block = &((*block)->next)) - ; - if (!(*block) || - (*block) > ((FreeBlock *)(((char *) vnfb) + nfb_real_size))) { - /* Normal link in */ - if (nfb_pages > 1) { - if (do_unmap((void *)(((char *) vnfb) + MSEG_ALIGNED_SIZE), - (nfb_pages - 1)*MSEG_ALIGNED_SIZE)) { - return NULL; - } - } - nfb->next = (*block); - nfb->num = nfb_pages; - (*block) = nfb; - } else { /* block merge */ - nfb->next = (*block)->next; - nfb->num = nfb_pages + (*block)->num; - /* unmap also the first page of the next freeblock */ - (*block) = nfb; - if (do_unmap((void *)(((char *) vnfb) + MSEG_ALIGNED_SIZE), - nfb_pages*MSEG_ALIGNED_SIZE)) { - return NULL; - } - } - RELEASE_LOCK(); - return old_address; - } else { /* Enlarge */ - FreeBlock **block; - void *old_end = (void *) (((char *)old_address) + old_real_size); - TAKE_LOCK(); - for (block = &first; - *block != NULL && (*block) < (FreeBlock *) old_address; - block = &((*block)->next)) - ; - if ((*block) == NULL || old_end > ((void *) RANGE_MAX) || - (*block) != old_end || - (*block)->num < (new_num_pages - old_num_pages)) { - /* cannot extend */ - void *result; - RELEASE_LOCK(); - result = pmmap(new_size); - if (result == NULL) { - return NULL; - } - memcpy(result,old_address,old_size); - if (pmunmap(old_address,old_size)) { - /* Oups... */ - pmunmap(result,new_size); - return NULL; - } - return result; - } else { /* extend */ - size_t remaining_pages = (*block)->num - - (new_num_pages - old_num_pages); - if (!remaining_pages) { - void *p = (void *) (((char *) (*block)) + MSEG_ALIGNED_SIZE); - void *n = (*block)->next; - size_t x = ((*block)->num - 1) * MSEG_ALIGNED_SIZE; - if (x > 0) { - if (do_map(p,x) == NULL) { - RELEASE_LOCK(); - return NULL; - } - } - (*block) = n; - } else { - FreeBlock *nfb = (FreeBlock *) ((void *) - (((char *) old_address) + - new_real_size)); - void *p = (void *) (((char *) (*block)) + MSEG_ALIGNED_SIZE); - if (do_map(p,new_real_size - old_real_size) == NULL) { - RELEASE_LOCK(); - return NULL; - } - nfb->num = remaining_pages; - nfb->next = (*block)->next; - (*block) = nfb; - } - RELEASE_LOCK(); - return old_address; - } - } -} -#endif /* HALFWORD_HEAP */ diff --git a/erts/emulator/sys/common/erl_mseg.h b/erts/emulator/sys/common/erl_mseg.h index a4f250ceab..a43b409e94 100644 --- a/erts/emulator/sys/common/erl_mseg.h +++ b/erts/emulator/sys/common/erl_mseg.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2002-2013. All Rights Reserved. + * Copyright Ericsson AB 2002-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -22,35 +23,25 @@ #include "sys.h" #include "erl_alloc_types.h" +#include "erl_mmap.h" -#ifndef HAVE_MMAP -# define HAVE_MMAP 0 -#endif -#ifndef HAVE_MREMAP -# define HAVE_MREMAP 0 -#endif - -#if HAVE_MMAP +/* + * We currently only enable mseg_alloc if we got + * a genuine mmap()/munmap() primitive. It is possible + * to utilize erts_mmap() withiout a mmap support but + * alloc_util needs to be prepared before we can do + * that. + */ +#ifdef ERTS_HAVE_GENUINE_OS_MMAP # define HAVE_ERTS_MSEG 1 -# define HAVE_SUPER_ALIGNED_MB_CARRIERS 1 +# define ERTS_HAVE_MSEG_SUPER_ALIGNED 1 #else # define HAVE_ERTS_MSEG 0 -# define HAVE_SUPER_ALIGNED_MB_CARRIERS 0 +# define ERTS_HAVE_MSEG_SUPER_ALIGNED 0 #endif -#if HAVE_SUPER_ALIGNED_MB_CARRIERS -# define MSEG_ALIGN_BITS (18) - /* Affects hard limits for sbct and lmbcs documented in erts_alloc.xml */ -#else -/* If we don't use super aligned multiblock carriers - * we will mmap with page size alignment (and thus use corresponding - * align bits). - * - * Current implementation needs this to be a constant and - * only uses this for user dev testing so setting page size - * to 4096 (12 bits) is fine. - */ -# define MSEG_ALIGN_BITS (12) +#if ERTS_HAVE_MSEG_SUPER_ALIGNED +# define MSEG_ALIGN_BITS ERTS_MMAP_SUPERALIGNED_BITS #endif #if HAVE_ERTS_MSEG @@ -68,6 +59,9 @@ typedef struct { Uint rmcbf; Uint mcs; Uint nos; + ErtsMMapInit dflt_mmap; + ErtsMMapInit literal_mmap; + ErtsMMapInit exec_mmap; } ErtsMsegInit_t; #define ERTS_MSEG_INIT_DEFAULT_INITIALIZER \ @@ -75,7 +69,10 @@ typedef struct { 4*1024*1024, /* amcbf: Absolute max cache bad fit */ \ 20, /* rmcbf: Relative max cache bad fit */ \ 10, /* mcs: Max cache size */ \ - 1000 /* cci: Cache check interval */ \ + 1000, /* cci: Cache check interval */ \ + ERTS_MMAP_INIT_DEFAULT_INITER, \ + ERTS_MMAP_INIT_LITERAL_INITER, \ + ERTS_MMAP_INIT_HIPE_EXEC_INITER \ } typedef struct { @@ -84,19 +81,16 @@ typedef struct { UWord abs_shrink_th; UWord rel_shrink_th; int sched_spec; -#if HALFWORD_HEAP - int low_mem; -#endif } ErtsMsegOpt_t; extern const ErtsMsegOpt_t erts_mseg_default_opt; -void *erts_mseg_alloc(ErtsAlcType_t, Uint *, Uint); -void *erts_mseg_alloc_opt(ErtsAlcType_t, Uint *, Uint, const ErtsMsegOpt_t *); -void erts_mseg_dealloc(ErtsAlcType_t, void *, Uint, Uint); -void erts_mseg_dealloc_opt(ErtsAlcType_t, void *, Uint, Uint, const ErtsMsegOpt_t *); -void *erts_mseg_realloc(ErtsAlcType_t, void *, Uint, Uint *, Uint); -void *erts_mseg_realloc_opt(ErtsAlcType_t, void *, Uint, Uint *, Uint, const ErtsMsegOpt_t *); +void *erts_mseg_alloc(ErtsAlcType_t, UWord *, Uint); +void *erts_mseg_alloc_opt(ErtsAlcType_t, UWord *, Uint, const ErtsMsegOpt_t *); +void erts_mseg_dealloc(ErtsAlcType_t, void *, UWord, Uint); +void erts_mseg_dealloc_opt(ErtsAlcType_t, void *, UWord, Uint, const ErtsMsegOpt_t *); +void *erts_mseg_realloc(ErtsAlcType_t, void *, UWord, UWord *, Uint); +void *erts_mseg_realloc_opt(ErtsAlcType_t, void *, UWord, UWord *, Uint, const ErtsMsegOpt_t *); void erts_mseg_clear_cache(void); void erts_mseg_cache_check(void); Uint erts_mseg_no( const ErtsMsegOpt_t *); @@ -105,7 +99,7 @@ void erts_mseg_init(ErtsMsegInit_t *init); void erts_mseg_late_init(void); /* Have to be called after all allocators, threads and timers have been initialized. */ Eterm erts_mseg_info_options(int, int *, void*, Uint **, Uint *); -Eterm erts_mseg_info(int, int *, void*, int, Uint **, Uint *); +Eterm erts_mseg_info(int, int *, void*, int, int, Uint **, Uint *); #endif /* #if HAVE_ERTS_MSEG */ diff --git a/erts/emulator/sys/common/erl_mtrace_sys_wrap.c b/erts/emulator/sys/common/erl_mtrace_sys_wrap.c index 408aa7e016..fc871f94f1 100644 --- a/erts/emulator/sys/common/erl_mtrace_sys_wrap.c +++ b/erts/emulator/sys/common/erl_mtrace_sys_wrap.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2004-2009. All Rights Reserved. + * Copyright Ericsson AB 2004-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ diff --git a/erts/emulator/sys/common/erl_os_monotonic_time_extender.c b/erts/emulator/sys/common/erl_os_monotonic_time_extender.c new file mode 100644 index 0000000000..d53190fdd5 --- /dev/null +++ b/erts/emulator/sys/common/erl_os_monotonic_time_extender.c @@ -0,0 +1,89 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2015-2016. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include "erl_os_monotonic_time_extender.h" + +#ifdef USE_THREADS + +static void *os_monotonic_time_extender(void *vstatep) +{ + ErtsOsMonotonicTimeExtendState *state = (ErtsOsMonotonicTimeExtendState *) vstatep; + long sleep_time = state->check_interval*1000; + Uint32 (*raw_os_mtime)(void) = state->raw_os_monotonic_time; + Uint32 last_msb = 0; + + while (1) { + Uint32 msb = (*raw_os_mtime)() & (((Uint32) 1) << 31); + + if (msb != last_msb) { + int ix = ((int) (last_msb >> 31)) & 1; + Uint32 xtnd = (Uint32) erts_atomic32_read_nob(&state->extend[ix]); + erts_atomic32_set_nob(&state->extend[ix], (erts_aint32_t) (xtnd + 1)); + last_msb = msb; + } + erts_milli_sleep(sleep_time); + } + + erts_exit(ERTS_ABORT_EXIT, "os_monotonic_time_extender thread terminating"); + return NULL; +} + +static erts_tid_t os_monotonic_extender_tid; +#endif + +void +erts_init_os_monotonic_time_extender(ErtsOsMonotonicTimeExtendState *statep, + Uint32 (*raw_os_monotonic_time)(void), + int check_seconds) +{ +#ifdef USE_THREADS + statep->raw_os_monotonic_time = raw_os_monotonic_time; + erts_atomic32_init_nob(&statep->extend[0], (erts_aint32_t) 0); + erts_atomic32_init_nob(&statep->extend[1], (erts_aint32_t) 0); + statep->check_interval = check_seconds; + +#else + statep->extend[0] = (Uint32) 0; + statep->extend[1] = (Uint32) 0; + statep->last_msb = (ErtsMonotonicTime) 0; +#endif +} + +void +erts_late_init_os_monotonic_time_extender(ErtsOsMonotonicTimeExtendState *statep) +{ +#ifdef USE_THREADS + erts_thr_opts_t thr_opts = ERTS_THR_OPTS_DEFAULT_INITER; + thr_opts.detached = 1; + thr_opts.suggested_stack_size = 4; + +#if 0 + thr_opts.name = "os_monotonic_time_extender"; +#endif + + erts_thr_create(&os_monotonic_extender_tid, + os_monotonic_time_extender, + (void*) statep, + &thr_opts); +#endif +} diff --git a/erts/emulator/sys/common/erl_os_monotonic_time_extender.h b/erts/emulator/sys/common/erl_os_monotonic_time_extender.h new file mode 100644 index 0000000000..8089c9aed9 --- /dev/null +++ b/erts/emulator/sys/common/erl_os_monotonic_time_extender.h @@ -0,0 +1,66 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2015. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +#ifndef ERL_OS_MONOTONIC_TIME_EXTENDER_H__ +#define ERL_OS_MONOTONIC_TIME_EXTENDER_H__ + +#include "sys.h" +#include "erl_threads.h" + +typedef struct { +#ifdef USE_THREADS + Uint32 (*raw_os_monotonic_time)(void); + erts_atomic32_t extend[2]; + int check_interval; +#else + Uint32 extend[2]; + ErtsMonotonicTime last_msb; +#endif +} ErtsOsMonotonicTimeExtendState; + +#ifdef USE_THREADS +# define ERTS_CHK_EXTEND_OS_MONOTONIC_TIME(S, RT) ((void) 1) +# define ERTS_EXTEND_OS_MONOTONIC_TIME(S, RT) \ + ((((ErtsMonotonicTime) \ + erts_atomic32_read_nob(&((S)->extend[((int) ((RT) >> 31)) & 1]))) \ + << 32) \ + + (RT)) +#else +# define ERTS_CHK_EXTEND_OS_MONOTONIC_TIME(S, RT) \ + do { \ + Uint32 msb__ = (RT) & (((Uint32) 1) << 31); \ + if (msb__ != (S)->last_msb) { \ + int ix__ = ((int) ((S)->last_msb >> 31)) & 1; \ + (S)->extend[ix__]++; \ + (S)->last_msb = msb; \ + } \ + } while (0) +# define ERTS_EXTEND_OS_MONOTONIC_TIME(S, RT) \ + ((((ErtsMonotonicTime) (S)->extend[((int) ((RT) >> 31)) & 1]) << 32) + (RT)) +#endif + +void +erts_init_os_monotonic_time_extender(ErtsOsMonotonicTimeExtendState *statep, + Uint32 (*raw_os_monotonic_time)(void), + int check_seconds); +void +erts_late_init_os_monotonic_time_extender(ErtsOsMonotonicTimeExtendState *statep); + +#endif diff --git a/erts/emulator/sys/common/erl_poll.c b/erts/emulator/sys/common/erl_poll.c index 5861b30315..e394d84f73 100644 --- a/erts/emulator/sys/common/erl_poll.c +++ b/erts/emulator/sys/common/erl_poll.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2006-2013. All Rights Reserved. + * Copyright Ericsson AB 2006-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -73,6 +74,7 @@ #include "erl_thr_progress.h" #include "erl_driver.h" #include "erl_alloc.h" +#include "erl_msacc.h" #if !defined(ERTS_POLL_USE_EPOLL) \ && !defined(ERTS_POLL_USE_DEVPOLL) \ @@ -123,8 +125,8 @@ static ERTS_INLINE int ERTS_SELECT(int nfds, ERTS_fd_set *readfds, ERTS_fd_set *writefds, ERTS_fd_set *exceptfds, struct timeval *timeout) { - ASSERT(!readfds || readfds->sz >= nfds); - ASSERT(!writefds || writefds->sz >= nfds); + ASSERT(!readfds || readfds->sz >= ERTS_FD_SIZE(nfds)); + ASSERT(!writefds || writefds->sz >= ERTS_FD_SIZE(nfds)); ASSERT(!exceptfds); return select(nfds, (readfds ? readfds->ptr : NULL ), @@ -153,9 +155,6 @@ int ERTS_SELECT(int nfds, ERTS_fd_set *readfds, ERTS_fd_set *writefds, #define ERTS_POLL_COALESCE_KP_RES (ERTS_POLL_USE_KQUEUE || ERTS_POLL_USE_EPOLL) -#define ERTS_EV_TABLE_MIN_LENGTH 1024 -#define ERTS_EV_TABLE_EXP_THRESHOLD (2048*1024) - #ifdef ERTS_POLL_NEED_ASYNC_INTERRUPT_SUPPORT # define ERTS_POLL_ASYNC_INTERRUPT_SUPPORT 1 #else @@ -314,13 +313,16 @@ struct ErtsPollSet_ { #if ERTS_POLL_USE_WAKEUP_PIPE int wake_fds[2]; #endif +#if ERTS_POLL_USE_TIMERFD + int timer_fd; +#endif #if ERTS_POLL_USE_FALLBACK int fallback_used; #endif #if defined(USE_THREADS) || ERTS_POLL_ASYNC_INTERRUPT_SUPPORT erts_atomic32_t wakeup_state; #endif - erts_smp_atomic32_t timeout; + erts_atomic64_t timeout_time; #ifdef ERTS_POLL_COUNT_AVOIDED_WAKEUPS erts_smp_atomic_t no_avoided_wakeups; erts_smp_atomic_t no_avoided_interrupts; @@ -384,6 +386,26 @@ static void check_poll_status(ErtsPollSet ps); static void print_misc_debug_info(void); #endif +static ERTS_INLINE void +init_timeout_time(ErtsPollSet ps) +{ + erts_atomic64_init_nob(&ps->timeout_time, + (erts_aint64_t) ERTS_MONOTONIC_TIME_MAX); +} + +static ERTS_INLINE void +set_timeout_time(ErtsPollSet ps, ErtsMonotonicTime time) +{ + erts_atomic64_set_relb(&ps->timeout_time, + (erts_aint64_t) time); +} + +static ERTS_INLINE ErtsMonotonicTime +get_timeout_time(ErtsPollSet ps) +{ + return (ErtsMonotonicTime) erts_atomic64_read_acqb(&ps->timeout_time); +} + #define ERTS_POLL_NOT_WOKEN 0 #define ERTS_POLL_WOKEN -1 #define ERTS_POLL_WOKEN_INTR 1 @@ -410,7 +432,7 @@ static ERTS_INLINE int is_interrupted_reset(ErtsPollSet ps) { #if defined(USE_THREADS) || ERTS_POLL_ASYNC_INTERRUPT_SUPPORT - return (erts_atomic32_xchg_nob(&ps->wakeup_state, ERTS_POLL_NOT_WOKEN) + return (erts_atomic32_xchg_acqb(&ps->wakeup_state, ERTS_POLL_NOT_WOKEN) == ERTS_POLL_WOKEN_INTR); #else return 0; @@ -421,7 +443,7 @@ static ERTS_INLINE void woke_up(ErtsPollSet ps) { #if defined(USE_THREADS) || ERTS_POLL_ASYNC_INTERRUPT_SUPPORT - erts_aint32_t wakeup_state = erts_atomic32_read_nob(&ps->wakeup_state); + erts_aint32_t wakeup_state = erts_atomic32_read_acqb(&ps->wakeup_state); if (wakeup_state == ERTS_POLL_NOT_WOKEN) (void) erts_atomic32_cmpxchg_nob(&ps->wakeup_state, ERTS_POLL_WOKEN, @@ -448,14 +470,9 @@ wake_poller(ErtsPollSet ps, int interrupted, int async_signal_safe) wakeup_state = erts_atomic32_cmpxchg_relb(&ps->wakeup_state, ERTS_POLL_WOKEN, ERTS_POLL_NOT_WOKEN); - else { - /* - * We might unnecessarily write to the pipe, however, - * that isn't problematic. - */ - wakeup_state = erts_atomic32_read_nob(&ps->wakeup_state); - erts_atomic32_set_relb(&ps->wakeup_state, ERTS_POLL_WOKEN_INTR); - } + else + wakeup_state = erts_atomic32_xchg_relb(&ps->wakeup_state, + ERTS_POLL_WOKEN_INTR); wake = wakeup_state == ERTS_POLL_NOT_WOKEN; } /* @@ -560,6 +577,75 @@ create_wakeup_pipe(ErtsPollSet ps) #endif /* ERTS_POLL_USE_WAKEUP_PIPE */ /* + * --- timer fd ----------------------------------------------------------- + */ + +#if ERTS_POLL_USE_TIMERFD + +/* We use the timerfd when using epoll_wait to get high accuracy + timeouts, i.e. we want to sleep with < ms accuracy. */ + +static void +create_timerfd(ErtsPollSet ps) +{ + int do_wake = 0; + int timer_fd; + timer_fd = timerfd_create(CLOCK_MONOTONIC,0); + ERTS_POLL_EXPORT(erts_poll_control)(ps, + timer_fd, + ERTS_POLL_EV_IN, + 1, &do_wake); +#if ERTS_POLL_USE_FALLBACK + /* We depend on the wakeup pipe being handled by kernel poll */ + if (ps->fds_status[timer_fd].flags & ERTS_POLL_FD_FLG_INFLBCK) + fatal_error("%s:%d:create_wakeup_pipe(): Internal error\n", + __FILE__, __LINE__); +#endif + if (ps->internal_fd_limit <= timer_fd) + ps->internal_fd_limit = timer_fd + 1; + ps->timer_fd = timer_fd; +} + +static ERTS_INLINE void +timerfd_set(ErtsPollSet ps, struct itimerspec *its) +{ +#ifdef DEBUG + struct itimerspec old_its; + int res; + res = timerfd_settime(ps->timer_fd, 0, its, &old_its); + ASSERT(res == 0); + ASSERT(old_its.it_interval.tv_sec == 0 && + old_its.it_interval.tv_nsec == 0 && + old_its.it_value.tv_sec == 0 && + old_its.it_value.tv_nsec == 0); + +#else + timerfd_settime(ps->timer_fd, 0, its, NULL); +#endif +} + +static ERTS_INLINE int +timerfd_clear(ErtsPollSet ps, int res, int max_res) { + + struct itimerspec its; + /* we always have to clear the timer */ + its.it_interval.tv_sec = 0; + its.it_interval.tv_nsec = 0; + its.it_value.tv_sec = 0; + its.it_value.tv_nsec = 0; + timerfd_settime(ps->timer_fd, 0, &its, NULL); + + /* only timeout fd triggered */ + if (res == 1 && ps->res_events[0].data.fd == ps->timer_fd) + return 0; + + return res; +} + +#endif /* ERTS_POLL_USE_TIMERFD */ + + +/* * --- Poll set update requests ---------------------------------------------- */ #if ERTS_POLL_USE_UPDATE_REQUESTS_QUEUE @@ -611,27 +697,33 @@ free_update_requests_block(ErtsPollSet ps, * --- Growing poll set structures ------------------------------------------- */ -int -ERTS_POLL_EXPORT(erts_poll_get_table_len) (int new_len) +#ifndef ERTS_KERNEL_POLL_VERSION /* only one shared implementation */ + +#define ERTS_FD_TABLE_MIN_LENGTH 1024 +#define ERTS_FD_TABLE_EXP_THRESHOLD (2048*1024) + +int erts_poll_new_table_len (int old_len, int need_len) { - if (new_len < ERTS_EV_TABLE_MIN_LENGTH) { - new_len = ERTS_EV_TABLE_MIN_LENGTH; - } else if (new_len < ERTS_EV_TABLE_EXP_THRESHOLD) { - /* find next power of 2 */ - --new_len; - new_len |= new_len >> 1; - new_len |= new_len >> 2; - new_len |= new_len >> 4; - new_len |= new_len >> 8; - new_len |= new_len >> 16; - ++new_len; - } else { - /* grow incrementally */ - new_len += ERTS_EV_TABLE_EXP_THRESHOLD; + int new_len; + + ASSERT(need_len > old_len); + if (need_len < ERTS_FD_TABLE_MIN_LENGTH) { + new_len = ERTS_FD_TABLE_MIN_LENGTH; + } + else { + new_len = old_len; + do { + if (new_len < ERTS_FD_TABLE_EXP_THRESHOLD) + new_len *= 2; + else + new_len += ERTS_FD_TABLE_EXP_THRESHOLD; + + } while (new_len < need_len); } + ASSERT(new_len >= need_len); return new_len; } - +#endif #if ERTS_POLL_USE_KERNEL_POLL static void @@ -645,7 +737,7 @@ grow_res_events(ErtsPollSet ps, int new_len) #elif ERTS_POLL_USE_KQUEUE struct kevent #endif - ) * ERTS_POLL_EXPORT(erts_poll_get_table_len)(new_len); + ) * erts_poll_new_table_len(ps->res_events_len, new_len); /* We do not need to save previously stored data */ if (ps->res_events) erts_free(ERTS_ALC_T_POLL_RES_EVS, ps->res_events); @@ -659,7 +751,7 @@ static void grow_poll_fds(ErtsPollSet ps, int min_ix) { int i; - int new_len = ERTS_POLL_EXPORT(erts_poll_get_table_len)(min_ix + 1); + int new_len = erts_poll_new_table_len(ps->poll_fds_len, min_ix + 1); if (new_len > max_fds) new_len = max_fds; ps->poll_fds = (ps->poll_fds_len @@ -681,7 +773,7 @@ grow_poll_fds(ErtsPollSet ps, int min_ix) static void grow_select_fds(int fd, ERTS_fd_set* fds) { - int new_len = ERTS_POLL_EXPORT(erts_poll_get_table_len)(fd + 1); + int new_len = erts_poll_new_table_len(fds->sz, fd + 1); if (new_len > max_fds) new_len = max_fds; new_len = ERTS_FD_SIZE(new_len); @@ -708,7 +800,7 @@ static void grow_fds_status(ErtsPollSet ps, int min_fd) { int i; - int new_len = ERTS_POLL_EXPORT(erts_poll_get_table_len)(min_fd + 1); + int new_len = erts_poll_new_table_len(ps->fds_status_len, min_fd + 1); ASSERT(min_fd < max_fds); if (new_len > max_fds) new_len = max_fds; @@ -1497,6 +1589,12 @@ poll_control(ErtsPollSet ps, int fd, ErtsPollEvents events, int on, int *do_wake goto done; } #endif +#if ERTS_POLL_USE_TIMERFD + if (fd == ps->timer_fd) { + new_events = ERTS_POLL_EV_NVAL; + goto done; + } +#endif } if (fd >= ps->fds_status_len) @@ -1644,6 +1742,9 @@ save_kp_result(ErtsPollSet ps, ErtsPollResFd pr[], int max_res, int chk_fds_res) #if ERTS_POLL_USE_WAKEUP_PIPE int wake_fd = ps->wake_fds[0]; #endif +#if ERTS_POLL_USE_TIMERFD + int timer_fd = ps->timer_fd; +#endif for (i = 0; i < n; i++) { @@ -1659,6 +1760,11 @@ save_kp_result(ErtsPollSet ps, ErtsPollResFd pr[], int max_res, int chk_fds_res) continue; } #endif +#if ERTS_POLL_USE_TIMERFD + if (fd == timer_fd) { + continue; + } +#endif ASSERT(!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK)); /* epoll_wait() can repeat the same fd in result array... */ ix = (int) ps->fds_status[fd].res_ev_ix; @@ -1733,6 +1839,11 @@ save_kp_result(ErtsPollSet ps, ErtsPollResFd pr[], int max_res, int chk_fds_res) continue; } #endif +#if ERTS_POLL_USE_TIMERFD + if (fd == timer_fd) { + continue; + } +#endif revents = ERTS_POLL_EV_N2E(ps->res_events[i].events); pr[res].fd = fd; pr[res].events = revents; @@ -1993,44 +2104,192 @@ save_poll_result(ErtsPollSet ps, ErtsPollResFd pr[], int max_res, } } +static ERTS_INLINE ErtsMonotonicTime +get_timeout(ErtsPollSet ps, + int resolution, + ErtsMonotonicTime timeout_time) +{ + ErtsMonotonicTime timeout, save_timeout_time; + + if (timeout_time == ERTS_POLL_NO_TIMEOUT) { + save_timeout_time = ERTS_MONOTONIC_TIME_MIN; + timeout = 0; + } + else { + ErtsMonotonicTime diff_time, current_time; + current_time = erts_get_monotonic_time(NULL); + diff_time = timeout_time - current_time; + if (diff_time <= 0) { + save_timeout_time = ERTS_MONOTONIC_TIME_MIN; + timeout = 0; + } + else { + save_timeout_time = current_time; + switch (resolution) { + case 1000: + /* Round up to nearest even milli second */ + timeout = ERTS_MONOTONIC_TO_MSEC(diff_time - 1) + 1; + if (timeout > (ErtsMonotonicTime) INT_MAX) + timeout = (ErtsMonotonicTime) INT_MAX; + save_timeout_time += ERTS_MSEC_TO_MONOTONIC(timeout); + break; + case 1000000: + /* Round up to nearest even micro second */ + timeout = ERTS_MONOTONIC_TO_USEC(diff_time - 1) + 1; + save_timeout_time += ERTS_USEC_TO_MONOTONIC(timeout); + break; + case 1000000000: + /* Round up to nearest even nano second */ + timeout = ERTS_MONOTONIC_TO_NSEC(diff_time - 1) + 1; + save_timeout_time += ERTS_NSEC_TO_MONOTONIC(timeout); + break; + default: + ERTS_INTERNAL_ERROR("Invalid resolution"); + timeout = 0; + save_timeout_time = 0; + break; + } + } + } + set_timeout_time(ps, save_timeout_time); + return timeout; +} + +#if ERTS_POLL_USE_SELECT + +static ERTS_INLINE int +get_timeout_timeval(ErtsPollSet ps, + SysTimeval *tvp, + ErtsMonotonicTime timeout_time) +{ + ErtsMonotonicTime timeout = get_timeout(ps, + 1000*1000, + timeout_time); + + if (!timeout) { + tvp->tv_sec = 0; + tvp->tv_usec = 0; + + return 0; + } + else { + ErtsMonotonicTime sec = timeout/(1000*1000); + tvp->tv_sec = sec; + tvp->tv_usec = timeout - sec*(1000*1000); + + ASSERT(tvp->tv_sec >= 0); + ASSERT(tvp->tv_usec >= 0); + ASSERT(tvp->tv_usec < 1000*1000); + + return !0; + } + +} + +#endif + +#if ERTS_POLL_USE_KQUEUE || (ERTS_POLL_USE_POLL && defined(HAVE_PPOLL)) || ERTS_POLL_USE_TIMERFD + +static ERTS_INLINE int +get_timeout_timespec(ErtsPollSet ps, + struct timespec *tsp, + ErtsMonotonicTime timeout_time) +{ + ErtsMonotonicTime timeout = get_timeout(ps, + 1000*1000*1000, + timeout_time); + + if (!timeout) { + tsp->tv_sec = 0; + tsp->tv_nsec = 0; + return 0; + } + else { + ErtsMonotonicTime sec = timeout/(1000*1000*1000); + tsp->tv_sec = sec; + tsp->tv_nsec = timeout - sec*(1000*1000*1000); + + ASSERT(tsp->tv_sec >= 0); + ASSERT(tsp->tv_nsec >= 0); + ASSERT(tsp->tv_nsec < 1000*1000*1000); + + return !0; + } +} + +#endif + +#if ERTS_POLL_USE_TIMERFD + static ERTS_INLINE int -check_fd_events(ErtsPollSet ps, SysTimeval *tv, int max_res) +get_timeout_itimerspec(ErtsPollSet ps, + struct itimerspec *itsp, + ErtsMonotonicTime timeout_time) +{ + + itsp->it_interval.tv_sec = 0; + itsp->it_interval.tv_nsec = 0; + + return get_timeout_timespec(ps, &itsp->it_value, timeout_time); +} + +#endif + +static ERTS_INLINE int +check_fd_events(ErtsPollSet ps, ErtsMonotonicTime timeout_time, int max_res) { int res; + ERTS_MSACC_PUSH_STATE_M(); if (erts_smp_atomic_read_nob(&ps->no_of_user_fds) == 0 - && tv->tv_usec == 0 && tv->tv_sec == 0) { + && timeout_time == ERTS_POLL_NO_TIMEOUT) { /* Nothing to poll and zero timeout; done... */ return 0; } else { - long timeout = tv->tv_sec*1000 + tv->tv_usec/1000; - if (timeout > ERTS_AINT32_T_MAX) - timeout = ERTS_AINT32_T_MAX; - ASSERT(timeout >= 0); - erts_smp_atomic32_set_relb(&ps->timeout, (erts_aint32_t) timeout); + int timeout; #if ERTS_POLL_USE_FALLBACK if (!(ps->fallback_used = ERTS_POLL_NEED_FALLBACK(ps))) { #if ERTS_POLL_USE_EPOLL /* --- epoll ------------------------------- */ - if (timeout > INT_MAX) - timeout = INT_MAX; if (max_res > ps->res_events_len) grow_res_events(ps, max_res); +#if ERTS_POLL_USE_TIMERFD + { + struct itimerspec its; + timeout = get_timeout_itimerspec(ps, &its, timeout_time); + if (timeout) { +#ifdef ERTS_SMP + erts_thr_progress_prepare_wait(NULL); +#endif + ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP); + timerfd_set(ps, &its); + res = epoll_wait(ps->kp_fd, ps->res_events, max_res, -1); + res = timerfd_clear(ps, res, max_res); + } else { + res = epoll_wait(ps->kp_fd, ps->res_events, max_res, 0); + } + } +#else /* !ERTS_POLL_USE_TIMERFD */ + timeout = (int) get_timeout(ps, 1000, timeout_time); + if (timeout) { #ifdef ERTS_SMP - if (timeout) erts_thr_progress_prepare_wait(NULL); #endif - res = epoll_wait(ps->kp_fd, ps->res_events, max_res, (int)timeout); + ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP); + } + res = epoll_wait(ps->kp_fd, ps->res_events, max_res, timeout); +#endif /* !ERTS_POLL_USE_TIMERFD */ #elif ERTS_POLL_USE_KQUEUE /* --- kqueue ------------------------------ */ struct timespec ts; if (max_res > ps->res_events_len) grow_res_events(ps, max_res); + timeout = get_timeout_timespec(ps, &ts, timeout_time); + if (timeout) { #ifdef ERTS_SMP - if (timeout) erts_thr_progress_prepare_wait(NULL); #endif - ts.tv_sec = tv->tv_sec; - ts.tv_nsec = tv->tv_usec*1000; + ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP); + } res = kevent(ps->kp_fd, NULL, 0, ps->res_events, max_res, &ts); #endif /* ----------------------------------------- */ } @@ -2049,44 +2308,62 @@ check_fd_events(ErtsPollSet ps, SysTimeval *tv, int max_res) #if ERTS_POLL_USE_WAKEUP_PIPE nfds++; /* Wakeup pipe */ #endif - if (timeout > INT_MAX) - timeout = INT_MAX; + timeout = (int) get_timeout(ps, 1000, timeout_time); poll_res.dp_nfds = nfds < max_res ? nfds : max_res; if (poll_res.dp_nfds > ps->res_events_len) grow_res_events(ps, poll_res.dp_nfds); poll_res.dp_fds = ps->res_events; + if (timeout) { #ifdef ERTS_SMP - if (timeout) erts_thr_progress_prepare_wait(NULL); #endif - poll_res.dp_timeout = (int) timeout; + ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP); + } + poll_res.dp_timeout = timeout; res = ioctl(ps->kp_fd, DP_POLL, &poll_res); -#elif ERTS_POLL_USE_POLL /* --- poll -------------------------------- */ - if (timeout > INT_MAX) - timeout = INT_MAX; +#elif ERTS_POLL_USE_POLL && defined(HAVE_PPOLL) /* --- ppoll ---------------- */ + struct timespec ts; + timeout = get_timeout_timespec(ps, &ts, timeout_time); + if (timeout) { #ifdef ERTS_SMP - if (timeout) erts_thr_progress_prepare_wait(NULL); #endif - res = poll(ps->poll_fds, ps->no_poll_fds, (int) timeout); + ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP); + } + res = ppoll(ps->poll_fds, ps->no_poll_fds, &ts, NULL); +#elif ERTS_POLL_USE_POLL /* --- poll --------------------------------- */ + timeout = (int) get_timeout(ps, 1000, timeout_time); + + if (timeout) { +#ifdef ERTS_SMP + erts_thr_progress_prepare_wait(NULL); +#endif + ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP); + } + res = poll(ps->poll_fds, ps->no_poll_fds, timeout); #elif ERTS_POLL_USE_SELECT /* --- select ------------------------------ */ - SysTimeval to = *tv; + SysTimeval to; + timeout = get_timeout_timeval(ps, &to, timeout_time); ERTS_FD_COPY(&ps->input_fds, &ps->res_input_fds); ERTS_FD_COPY(&ps->output_fds, &ps->res_output_fds); - + + if (timeout) { #ifdef ERTS_SMP - if (to.tv_sec || to.tv_usec) erts_thr_progress_prepare_wait(NULL); #endif + ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP); + } res = ERTS_SELECT(ps->max_fd + 1, - &ps->res_input_fds, - &ps->res_output_fds, - NULL, - &to); + &ps->res_input_fds, + &ps->res_output_fds, + NULL, + &to); #ifdef ERTS_SMP - if (to.tv_sec || to.tv_usec) + if (timeout) { erts_thr_progress_finalize_wait(NULL); + ERTS_MSACC_POP_STATE_M(); + } if (res < 0 && errno == EBADF && ERTS_POLLSET_HAVE_UPDATE_REQUESTS(ps)) { @@ -2108,10 +2385,10 @@ check_fd_events(ErtsPollSet ps, SysTimeval *tv, int max_res) handle_update_requests(ps); ERTS_POLLSET_UNLOCK(ps); res = ERTS_SELECT(ps->max_fd + 1, - &ps->res_input_fds, - &ps->res_output_fds, - NULL, - &to); + &ps->res_input_fds, + &ps->res_output_fds, + NULL, + &to); if (res == 0) { errno = EAGAIN; res = -1; @@ -2121,10 +2398,12 @@ check_fd_events(ErtsPollSet ps, SysTimeval *tv, int max_res) return res; #endif /* ----------------------------------------- */ } + if (timeout) { #ifdef ERTS_SMP - if (timeout) erts_thr_progress_finalize_wait(NULL); #endif + ERTS_MSACC_POP_STATE_M(); + } return res; } } @@ -2133,15 +2412,14 @@ int ERTS_POLL_EXPORT(erts_poll_wait)(ErtsPollSet ps, ErtsPollResFd pr[], int *len, - SysTimeval *utvp) + ErtsMonotonicTime timeout_time) { + ErtsMonotonicTime to; int res, no_fds; int ebadf = 0; #ifdef ERTS_SMP int ps_locked = 0; #endif - SysTimeval *tvp; - SysTimeval itv; no_fds = *len; #ifdef ERTS_POLL_MAX_RES @@ -2151,13 +2429,9 @@ ERTS_POLL_EXPORT(erts_poll_wait)(ErtsPollSet ps, *len = 0; - ASSERT(utvp); - - tvp = utvp; - #ifdef ERTS_POLL_DEBUG_PRINT - erts_printf("Entering erts_poll_wait(), timeout=%d\n", - (int) tv->tv_sec*1000 + tv->tv_usec/1000); + erts_printf("Entering erts_poll_wait(), timeout_time=%bps\n", + timeout_time); #endif if (ERTS_POLLSET_SET_POLLED_CHK(ps)) { @@ -2166,12 +2440,9 @@ ERTS_POLL_EXPORT(erts_poll_wait)(ErtsPollSet ps, goto done; } - if (is_woken(ps)) { - /* Use zero timeout */ - itv.tv_sec = 0; - itv.tv_usec = 0; - tvp = &itv; - } + to = (is_woken(ps) + ? ERTS_POLL_NO_TIMEOUT /* Use zero timeout */ + : timeout_time); #if ERTS_POLL_USE_UPDATE_REQUESTS_QUEUE if (ERTS_POLLSET_HAVE_UPDATE_REQUESTS(ps)) { @@ -2181,7 +2452,7 @@ ERTS_POLL_EXPORT(erts_poll_wait)(ErtsPollSet ps, } #endif - res = check_fd_events(ps, tvp, no_fds); + res = check_fd_events(ps, to, no_fds); woke_up(ps); @@ -2224,7 +2495,7 @@ ERTS_POLL_EXPORT(erts_poll_wait)(ErtsPollSet ps, #endif done: - erts_smp_atomic32_set_relb(&ps->timeout, ERTS_AINT32_T_MAX); + set_timeout_time(ps, ERTS_MONOTONIC_TIME_MAX); #ifdef ERTS_POLL_DEBUG_PRINT erts_printf("Leaving %s = erts_poll_wait()\n", res == 0 ? "0" : erl_errno_id(res)); @@ -2268,13 +2539,14 @@ ERTS_POLL_EXPORT(erts_poll_async_sig_interrupt)(ErtsPollSet ps) void ERTS_POLL_EXPORT(erts_poll_interrupt_timed)(ErtsPollSet ps, int set, - erts_short_time_t msec) + ErtsMonotonicTime timeout_time) { #if ERTS_POLL_ASYNC_INTERRUPT_SUPPORT || defined(ERTS_SMP) if (!set) reset_wakeup_state(ps); else { - if (erts_smp_atomic32_read_acqb(&ps->timeout) > (erts_aint32_t) msec) + ErtsMonotonicTime max_wait_time = get_timeout_time(ps); + if (max_wait_time > timeout_time) wake_poller(ps, 1, 0); #ifdef ERTS_POLL_COUNT_AVOIDED_WAKEUPS else { @@ -2414,6 +2686,9 @@ ERTS_POLL_EXPORT(erts_poll_create_pollset)(void) #if ERTS_POLL_USE_WAKEUP_PIPE create_wakeup_pipe(ps); #endif +#if ERTS_POLL_USE_TIMERFD + create_timerfd(ps); +#endif #if ERTS_POLL_USE_FALLBACK if (kp_fd >= ps->fds_status_len) grow_fds_status(ps, kp_fd); @@ -2431,7 +2706,7 @@ ERTS_POLL_EXPORT(erts_poll_create_pollset)(void) ps->internal_fd_limit = kp_fd + 1; ps->kp_fd = kp_fd; #endif - erts_smp_atomic32_init_nob(&ps->timeout, ERTS_AINT32_T_MAX); + init_timeout_time(ps); #ifdef ERTS_POLL_COUNT_AVOIDED_WAKEUPS erts_smp_atomic_init_nob(&ps->no_avoided_wakeups, 0); erts_smp_atomic_init_nob(&ps->no_avoided_interrupts, 0); @@ -2504,13 +2779,18 @@ ERTS_POLL_EXPORT(erts_poll_destroy_pollset)(ErtsPollSet ps) if (ps->wake_fds[1] >= 0) close(ps->wake_fds[1]); #endif +#if ERTS_POLL_USE_TIMERFD + if (ps->timer_fd >= 0) + close(ps->timer_fd); +#endif erts_smp_spin_lock(&pollsets_lock); if (ps == pollsets) pollsets = pollsets->next; else { ErtsPollSet prev_ps; - for (prev_ps = pollsets; ps != prev_ps->next; prev_ps = prev_ps->next); + for (prev_ps = pollsets; ps != prev_ps->next; prev_ps = prev_ps->next) + ; ASSERT(ps == prev_ps->next); prev_ps->next = ps->next; } @@ -2607,6 +2887,9 @@ ERTS_POLL_EXPORT(erts_poll_info)(ErtsPollSet ps, ErtsPollInfo *pip) #if ERTS_POLL_USE_WAKEUP_PIPE pip->poll_set_size++; /* Wakeup pipe */ #endif +#if ERTS_POLL_USE_TIMERFD + pip->poll_set_size++; /* timerfd */ +#endif pip->fallback_poll_set_size = #if !ERTS_POLL_USE_FALLBACK @@ -2735,14 +3018,18 @@ ERTS_POLL_EXPORT(erts_poll_get_selected_events)(ErtsPollSet ps, ev[fd] = 0; else { ev[fd] = ps->fds_status[fd].events; + if ( #if ERTS_POLL_USE_WAKEUP_PIPE - if (fd == ps->wake_fds[0] || fd == ps->wake_fds[1]) - ev[fd] |= ERTS_POLL_EV_NVAL; + fd == ps->wake_fds[0] || fd == ps->wake_fds[1] || +#endif +#if ERTS_POLL_USE_TIMERFD + fd == ps->timer_fd || #endif #if ERTS_POLL_USE_KERNEL_POLL - if (fd == ps->kp_fd) - ev[fd] |= ERTS_POLL_EV_NVAL; + fd == ps->kp_fd || #endif + 0) + ev[fd] |= ERTS_POLL_EV_NVAL; } } ERTS_POLLSET_UNLOCK(ps); diff --git a/erts/emulator/sys/common/erl_poll.h b/erts/emulator/sys/common/erl_poll.h index 09ed9f41af..c16122610d 100644 --- a/erts/emulator/sys/common/erl_poll.h +++ b/erts/emulator/sys/common/erl_poll.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2006-2013. All Rights Reserved. + * Copyright Ericsson AB 2006-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -29,6 +30,8 @@ #include "sys.h" +#define ERTS_POLL_NO_TIMEOUT ERTS_MONOTONIC_TIME_MIN + #if 0 #define ERTS_POLL_COUNT_AVOIDED_WAKEUPS #endif @@ -96,10 +99,12 @@ # endif #endif +#define ERTS_POLL_USE_TIMERFD 0 + typedef Uint32 ErtsPollEvents; #undef ERTS_POLL_EV_E2N -#if defined(__WIN32__) /* --- win32 ------------------------------- */ +#if defined(__WIN32__) /* --- win32 --------------------------------------- */ #define ERTS_POLL_EV_IN 1 #define ERTS_POLL_EV_OUT 2 @@ -110,8 +115,14 @@ typedef Uint32 ErtsPollEvents; #include <sys/epoll.h> +#ifdef HAVE_SYS_TIMERFD_H +#include <sys/timerfd.h> +#undef ERTS_POLL_USE_TIMERFD +#define ERTS_POLL_USE_TIMERFD 1 +#endif + #define ERTS_POLL_EV_E2N(EV) \ - ((__uint32_t) (EV)) + ((uint32_t) (EV)) #define ERTS_POLL_EV_N2E(EV) \ ((ErtsPollEvents) (EV)) @@ -223,19 +234,20 @@ void ERTS_POLL_EXPORT(erts_poll_interrupt)(ErtsPollSet, int); void ERTS_POLL_EXPORT(erts_poll_interrupt_timed)(ErtsPollSet, int, - erts_short_time_t); + ErtsMonotonicTime); ErtsPollEvents ERTS_POLL_EXPORT(erts_poll_control)(ErtsPollSet, ErtsSysFdType, ErtsPollEvents, int on, - int* wake_poller); + int* wake_poller + ); void ERTS_POLL_EXPORT(erts_poll_controlv)(ErtsPollSet, ErtsPollControlEntry [], int on); int ERTS_POLL_EXPORT(erts_poll_wait)(ErtsPollSet, ErtsPollResFd [], int *, - SysTimeval *); + ErtsMonotonicTime); int ERTS_POLL_EXPORT(erts_poll_max_fds)(void); void ERTS_POLL_EXPORT(erts_poll_info)(ErtsPollSet, ErtsPollInfo *); @@ -246,6 +258,6 @@ void ERTS_POLL_EXPORT(erts_poll_get_selected_events)(ErtsPollSet, ErtsPollEvents [], int); -int ERTS_POLL_EXPORT(erts_poll_get_table_len)(int); +int erts_poll_new_table_len(int old_len, int need_len); #endif /* #ifndef ERL_POLL_H__ */ diff --git a/erts/emulator/sys/common/erl_sys_common_misc.c b/erts/emulator/sys/common/erl_sys_common_misc.c index 31ad3b82d5..79f87eb3a9 100644 --- a/erts/emulator/sys/common/erl_sys_common_misc.c +++ b/erts/emulator/sys/common/erl_sys_common_misc.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2006-2013. All Rights Reserved. + * Copyright Ericsson AB 2006-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -44,6 +45,14 @@ #endif #endif +/* + * erts_check_io_time is used by the erl_check_io implementation. The + * global erts_check_io_time variable is declared here since there + * (often) exist two versions of erl_check_io (kernel-poll and + * non-kernel-poll), and we dont want two versions of this variable. + */ +erts_smp_atomic_t erts_check_io_time; + /* Written once and only once */ static int filename_encoding = ERL_FILENAME_UNKNOWN; @@ -52,7 +61,7 @@ static int filename_warning = ERL_FILENAME_WARNING_WARNING; /* Default unicode on windows and MacOS X */ static int user_filename_encoding = ERL_FILENAME_UTF8; #else -static int user_filename_encoding = ERL_FILENAME_LATIN1; +static int user_filename_encoding = ERL_FILENAME_UNKNOWN; #endif /* This controls the heuristic in printing characters in shell and w/ io:format("~tp", ...) etc. */ diff --git a/erts/emulator/sys/common/erl_util_queue.h b/erts/emulator/sys/common/erl_util_queue.h index 47925e2264..73293e0225 100644 --- a/erts/emulator/sys/common/erl_util_queue.h +++ b/erts/emulator/sys/common/erl_util_queue.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2013. All Rights Reserved. + * Copyright Ericsson AB 2013-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ diff --git a/erts/emulator/sys/unix/driver_int.h b/erts/emulator/sys/unix/driver_int.h index a7ee8087ab..840b832878 100644 --- a/erts/emulator/sys/unix/driver_int.h +++ b/erts/emulator/sys/unix/driver_int.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1997-2009. All Rights Reserved. + * Copyright Ericsson AB 1997-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ diff --git a/erts/emulator/sys/unix/erl_child_setup.c b/erts/emulator/sys/unix/erl_child_setup.c index 7c6e4a2f37..6beb316350 100644 --- a/erts/emulator/sys/unix/erl_child_setup.c +++ b/erts/emulator/sys/unix/erl_child_setup.c @@ -1,100 +1,252 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2002-2009. All Rights Reserved. + * Copyright Ericsson AB 2002-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ /* - * After a vfork() (or fork()) the child exec()s to this program which - * sets up the child and exec()s to the user program (see spawn_start() - * in sys.c and ticket OTP-4389). + * This program is started at erts startup and all fork's that + * have to be done are done in here. This is done for a couple + * of reasons: + * - Allow usage of fork without a memory explosion. + * -- we do not want to use vfork, as it blocks the VM + * until the execv is done, and if the program that + * is to be executed is on an NFS that is unavailable, + * the execv can block for a very long time. + * -- we cannot do fork inside the VM as that would temporarily + * duplicate the memory usage of the VM per parallel exec. + * + * Some implementation notes: + * - A single Unix Domain Socket is setup in between the VM and + * this program. Over that UDS the file descriptors that should + * be used to talk to the child program are sent. + * The actual command to execute, together with options and the + * environment, is sent over the pipe represented by the + * file descriptors mentioned above. We don't send the + * command over the UDS as that would increase the likely hood + * that it's buffer would be full. + * + * - Since it is this program that execv's, it has to take care of + * all the SIGCHLD signals that the child programs generate. The + * signals are received and the pid+exit reason is sent as data + * on the UDS to the VM. The VM is then able to map the pid to the + * port of the child program that just exited and deliver the status + * code if requested. */ #ifdef HAVE_CONFIG_H # include "config.h" #endif -#define NEED_CHILD_SETUP_DEFINES -#include "sys.h" -#include "erl_misc_utils.h" +#include <stdlib.h> +#include <stdio.h> +#include <sys/wait.h> -#ifdef SIG_SIGSET /* Old SysV */ -void sys_sigrelease(int sig) +#define WANT_NONBLOCKING + +#include "erl_driver.h" +#include "sys_uds.h" +#include "hash.h" +#include "erl_term.h" +#include "erl_child_setup.h" + +#define SET_CLOEXEC(fd) fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC) + +#if defined(__ANDROID__) +#define SHELL "/system/bin/sh" +#else +#define SHELL "/bin/sh" +#endif /* __ANDROID__ */ + +//#define HARD_DEBUG +#ifdef HARD_DEBUG +#define DEBUG_PRINT(fmt, ...) fprintf(stderr, fmt "\r\n", ##__VA_ARGS__) +#else +#define DEBUG_PRINT(fmt, ...) +#endif + +#define ABORT(fmt, ...) do { \ + fprintf(stderr, "erl_child_setup: " fmt "\r\n", ##__VA_ARGS__); \ + abort(); \ + } while(0) + +#ifdef DEBUG +void +erl_assert_error(const char* expr, const char* func, const char* file, int line) { - sigrelse(sig); + fflush(stdout); + fprintf(stderr, "%s:%d:%s() Assertion failed: %s\n", + file, line, func, expr); + fflush(stderr); + abort(); } -#else /* !SIG_SIGSET */ -#ifdef SIG_SIGNAL /* Old BSD */ -sys_sigrelease(int sig) +#endif + +void sys_sigblock(int sig) { - sigsetmask(sigblock(0) & ~sigmask(sig)); + sigset_t mask; + + sigemptyset(&mask); + sigaddset(&mask, sig); + sigprocmask(SIG_BLOCK, &mask, (sigset_t *)NULL); } -#else /* !SIG_SIGNAL */ /* The True Way - POSIX!:-) */ + void sys_sigrelease(int sig) { sigset_t mask; - sigemptyset(&mask); sigaddset(&mask, sig); sigprocmask(SIG_UNBLOCK, &mask, (sigset_t *)NULL); } -#endif /* !SIG_SIGNAL */ -#endif /* !SIG_SIGSET */ -int -main(int argc, char *argv[]) +static void add_os_pid_to_port_id_mapping(Eterm, pid_t); +static Eterm get_port_id(pid_t); +static int forker_hash_init(void); + +static int max_files = -1; +static int sigchld_pipe[2]; + +static int +start_new_child(int pipes[]) { - int i, from, to; - int erts_spawn_executable = 0; + int size, res, i, pos = 0; + char *buff, *o_buff; + + char *cmd, *wd, **new_environ, **args = NULL; + + Sint cnt, flags; + + /* only child executes here */ + + do { + res = read(pipes[0], (char*)&size, sizeof(size)); + } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK)); + + if (res <= 0) { + goto child_error; + } + + buff = malloc(size); + + DEBUG_PRINT("size = %d", size); + + do { + if ((res = read(pipes[0], buff + pos, size - pos)) < 0) { + if (errno == ERRNO_BLOCK || errno == EINTR) + continue; + goto child_error; + } + if (res == 0) { + errno = EPIPE; + goto child_error; + } + pos += res; + } while(size - pos != 0); + + o_buff = buff; - /* OBSERVE! - * Keep child setup after fork() (implemented in sys.c) up to date - * if changes are made here. - */ + flags = get_int32(buff); + buff += sizeof(Sint32); - if (argc != CS_ARGV_NO_OF_ARGS) { - if (argc < CS_ARGV_NO_OF_ARGS) { - return 1; - } else { - erts_spawn_executable = 1; - } + DEBUG_PRINT("flags = %d", flags); + + cmd = buff; + buff += strlen(buff) + 1; + if (*buff == '\0') { + wd = NULL; + } else { + wd = buff; + buff += strlen(buff) + 1; + } + buff++; + + DEBUG_PRINT("wd = %s", wd); + + cnt = get_int32(buff); + buff += sizeof(Sint32); + new_environ = malloc(sizeof(char*)*(cnt + 1)); + + DEBUG_PRINT("env_len = %ld", cnt); + for (i = 0; i < cnt; i++, buff++) { + new_environ[i] = buff; + while(*buff != '\0') buff++; + } + new_environ[cnt] = NULL; + + if (o_buff + size != buff) { + /* This is a spawn executable call */ + cnt = get_int32(buff); + buff += sizeof(Sint32); + args = malloc(sizeof(char*)*(cnt + 1)); + for (i = 0; i < cnt; i++, buff++) { + args[i] = buff; + while(*buff != '\0') buff++; + } + args[cnt] = NULL; } - if (strcmp("false", argv[CS_ARGV_UNBIND_IX]) != 0) - if (erts_unbind_from_cpu_str(argv[CS_ARGV_UNBIND_IX]) != 0) - return 1; + if (o_buff + size != buff) { + errno = EINVAL; + goto child_error; + } + + DEBUG_PRINT("read ack"); + do { + ErtsSysForkerProto proto; + res = read(pipes[0], &proto, sizeof(proto)); + if (res > 0) { + ASSERT(proto.action == ErtsSysForkerProtoAction_Ack); + ASSERT(res == sizeof(proto)); + } + } while(res < 0 && (errno == EINTR || errno == ERRNO_BLOCK)); + if (res < 1) { + errno = EPIPE; + goto child_error; + } + + DEBUG_PRINT("Do that forking business: '%s'\n",cmd); - for (i = 0; i < CS_ARGV_NO_OF_DUP2_OPS; i++) { - if (argv[CS_ARGV_DUP2_OP_IX(i)][0] == '-' - && argv[CS_ARGV_DUP2_OP_IX(i)][1] == '\0') - break; - if (sscanf(argv[CS_ARGV_DUP2_OP_IX(i)], "%d:%d", &from, &to) != 2) - return 1; - if (dup2(from, to) < 0) - return 1; + /* When the dup2'ing below is done, only + fd's 0, 1, 2 and maybe 3, 4 should survive the + exec. All other fds (i.e. the unix domain sockets + and stray pipe ends) should have CLOEXEC set on them + so they will be closed when the exec happens */ + if (flags & FORKER_FLAG_USE_STDIO) { + /* stdin for process */ + if (flags & FORKER_FLAG_DO_WRITE && + dup2(pipes[0], 0) < 0) + goto child_error; + /* stdout for process */ + if (flags & FORKER_FLAG_DO_READ && + dup2(pipes[1], 1) < 0) + goto child_error; + } + else { /* XXX will fail if pipes[0] == 4 (unlikely..) */ + if (flags & FORKER_FLAG_DO_READ && dup2(pipes[1], 4) < 0) + goto child_error; + if (flags & FORKER_FLAG_DO_WRITE && dup2(pipes[0], 3) < 0) + goto child_error; } - if (sscanf(argv[CS_ARGV_FD_CR_IX], "%d:%d", &from, &to) != 2) - return 1; - for (i = from; i <= to; i++) - (void) close(i); + if (dup2(pipes[2], 2) < 0) + goto child_error; - if (!(argv[CS_ARGV_WD_IX][0] == '.' && argv[CS_ARGV_WD_IX][1] == '\0') - && chdir(argv[CS_ARGV_WD_IX]) < 0) - return 1; + if (wd && chdir(wd) < 0) + goto child_error; #if defined(USE_SETPGRP_NOARGS) /* SysV */ (void) setpgrp(); @@ -104,19 +256,301 @@ main(int argc, char *argv[]) (void) setsid(); #endif + close(pipes[0]); + close(pipes[1]); + close(pipes[2]); + sys_sigrelease(SIGCHLD); - sys_sigrelease(SIGINT); - sys_sigrelease(SIGUSR1); - - if (erts_spawn_executable) { - if (argv[CS_ARGV_NO_OF_ARGS + 1] == NULL) { - execl(argv[CS_ARGV_NO_OF_ARGS],argv[CS_ARGV_NO_OF_ARGS], - (char *) NULL); - } else { - execv(argv[CS_ARGV_NO_OF_ARGS],&(argv[CS_ARGV_NO_OF_ARGS + 1])); - } + + if (args) { + /* spawn_executable */ + execve(cmd, args, new_environ); } else { - execl("/bin/sh", "sh", "-c", argv[CS_ARGV_CMD_IX], (char *) NULL); + execle(SHELL, "sh", "-c", cmd, (char *) NULL, new_environ); } +child_error: + DEBUG_PRINT("exec error: %d\r\n",errno); + _exit(128 + errno); +} + + +/* + * [OTP-3906] + * Solaris signal management gets confused when threads are used and a + * lot of child processes dies. The confusion results in that SIGCHLD + * signals aren't delivered to the emulator which in turn results in + * a lot of defunct processes in the system. + * + * The problem seems to appear when a signal is frequently + * blocked/unblocked at the same time as the signal is frequently + * propagated. The child waiter thread is a workaround for this problem. + * The SIGCHLD signal is always blocked (in all threads), and the child + * waiter thread fetches the signal by a call to sigwait(). See + * child_waiter(). + * + * This should be a non-issue since the fork:ing was moved outside of + * the emulator into erl_child_setup. I'm leaving the comment here + * for posterity. */ + +static void handle_sigchld(int sig) { + int buff[2], res; + + sys_sigblock(SIGCHLD); + + while ((buff[0] = waitpid((pid_t)(-1), buff+1, WNOHANG)) > 0) { + do { + res = write(sigchld_pipe[1], buff, sizeof(buff)); + } while (res < 0 && errno == EINTR); + if (res <= 0) + ABORT("Failed to write to sigchld_pipe (%d): %d (%d)", sigchld_pipe[1], res, errno); + DEBUG_PRINT("Reap child %d (%d)", buff[0], buff[1]); + } + + sys_sigrelease(SIGCHLD); +} + +#if defined(__ANDROID__) +static int system_properties_fd(void) +{ + static int fd = -2; + char *env; + + if (fd != -2) return fd; + env = getenv("ANDROID_PROPERTY_WORKSPACE"); + if (!env) { + fd = -1; + return -1; + } + fd = atoi(env); + return fd; +} +#endif /* __ANDROID__ */ + +int +main(int argc, char *argv[]) +{ + /* This fd should be open from beam */ + int uds_fd = 3, max_fd = 3; +#ifndef HAVE_CLOSEFROM + int i; +#endif + struct sigaction sa; + + if (argc < 1 || sscanf(argv[1],"%d",&max_files) != 1) { + ABORT("Invalid arguments to child_setup"); + } + +/* We close all fds except the uds from beam. + All other fds from now on will have the + CLOEXEC flags set on them. This means that we + only have to close a very limited number of fds + after we fork before the exec. */ +#if defined(HAVE_CLOSEFROM) + closefrom(4); +#else + for (i = 4; i < max_files; i++) +#if defined(__ANDROID__) + if (i != system_properties_fd()) +#endif + (void) close(i); +#endif + + if (pipe(sigchld_pipe) < 0) { + ABORT("Failed to setup sigchld pipe (%d)", errno); + } + + SET_CLOEXEC(sigchld_pipe[0]); + SET_CLOEXEC(sigchld_pipe[1]); + + max_fd = max_fd < sigchld_pipe[0] ? sigchld_pipe[0] : max_fd; + + sa.sa_handler = &handle_sigchld; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_RESTART | SA_NOCLDSTOP; + if (sigaction(SIGCHLD, &sa, 0) == -1) { + perror(0); + exit(1); + } + + forker_hash_init(); + + SET_CLOEXEC(uds_fd); + + DEBUG_PRINT("Starting forker %d", max_files); + + while (1) { + fd_set read_fds; + int res; + FD_ZERO(&read_fds); + FD_SET(uds_fd, &read_fds); + FD_SET(sigchld_pipe[0], &read_fds); + DEBUG_PRINT("child_setup selecting on %d, %d (%d)", + uds_fd, sigchld_pipe[0], max_fd); + res = select(max_fd+1, &read_fds, NULL, NULL, NULL); + + if (res < 0) { + if (errno == EINTR) continue; + ABORT("Select failed: %d (%d)",res, errno); + } + + if (FD_ISSET(uds_fd, &read_fds)) { + int pipes[3], res, os_pid; + ErtsSysForkerProto proto; + errno = 0; + if ((res = sys_uds_read(uds_fd, (char*)&proto, sizeof(proto), + pipes, 3, MSG_DONTWAIT)) < 0) { + if (errno == EINTR) + continue; + DEBUG_PRINT("erl_child_setup failed to read from uds: %d, %d", res, errno); + _exit(0); + } + + if (res == 0) { + DEBUG_PRINT("uds was closed!"); + _exit(0); + } + /* Since we use unix domain sockets and send the entire data in + one go we *should* get the entire payload at once. */ + ASSERT(res == sizeof(proto)); + ASSERT(proto.action == ErtsSysForkerProtoAction_Start); + + sys_sigblock(SIGCHLD); + + errno = 0; + + os_pid = fork(); + if (os_pid == 0) + start_new_child(pipes); + + add_os_pid_to_port_id_mapping(proto.u.start.port_id, os_pid); + + /* We write an ack here, but expect the reply on + the pipes[0] inside the fork */ + proto.action = ErtsSysForkerProtoAction_Go; + proto.u.go.os_pid = os_pid; + proto.u.go.error_number = errno; + while (write(pipes[1], &proto, sizeof(proto)) < 0 && errno == EINTR) + ; /* remove gcc warning */ + +#ifdef FORKER_PROTO_START_ACK + proto.action = ErtsSysForkerProtoAction_StartAck; + while (write(uds_fd, &proto, sizeof(proto)) < 0 && errno == EINTR) + ; /* remove gcc warning */ +#endif + + sys_sigrelease(SIGCHLD); + close(pipes[0]); + close(pipes[1]); + close(pipes[2]); + } + + if (FD_ISSET(sigchld_pipe[0], &read_fds)) { + int ibuff[2]; + ErtsSysForkerProto proto; + res = read(sigchld_pipe[0], ibuff, sizeof(ibuff)); + if (res <= 0) { + if (errno == EINTR) + continue; + ABORT("Failed to read from sigchld pipe: %d (%d)", res, errno); + } + + proto.u.sigchld.port_id = get_port_id((pid_t)(ibuff[0])); + + if (proto.u.sigchld.port_id == THE_NON_VALUE) + continue; /* exit status report not requested */ + + proto.action = ErtsSysForkerProtoAction_SigChld; + proto.u.sigchld.error_number = ibuff[1]; + DEBUG_PRINT("send %s to %d", buff, uds_fd); + if (write(uds_fd, &proto, sizeof(proto)) < 0) { + if (errno == EINTR) + continue; + /* The uds was close, which most likely means that the VM + has exited. This will be detected when we try to read + from the uds_fd. */ + DEBUG_PRINT("Failed to write to uds: %d (%d)", uds_fd, errno); + } + } + } + return 1; +} + +typedef struct exit_status { + HashBucket hb; + pid_t os_pid; + Eterm port_id; +} ErtsSysExitStatus; + +static Hash *forker_hash; + +static void add_os_pid_to_port_id_mapping(Eterm port_id, pid_t os_pid) +{ + if (port_id != THE_NON_VALUE) { + /* exit status report requested */ + ErtsSysExitStatus es; + es.os_pid = os_pid; + es.port_id = port_id; + hash_put(forker_hash, &es); + } +} + +static Eterm get_port_id(pid_t os_pid) +{ + ErtsSysExitStatus est, *es; + Eterm port_id; + est.os_pid = os_pid; + es = hash_remove(forker_hash, &est); + if (!es) return THE_NON_VALUE; + port_id = es->port_id; + free(es); + return port_id; +} + +static int fcmp(void *a, void *b) +{ + ErtsSysExitStatus *sa = a; + ErtsSysExitStatus *sb = b; + return !(sa->os_pid == sb->os_pid); +} + +static HashValue fhash(void *e) +{ + ErtsSysExitStatus *se = e; + Uint32 val = se->os_pid; + val = (val+0x7ed55d16) + (val<<12); + val = (val^0xc761c23c) ^ (val>>19); + val = (val+0x165667b1) + (val<<5); + val = (val+0xd3a2646c) ^ (val<<9); + val = (val+0xfd7046c5) + (val<<3); + val = (val^0xb55a4f09) ^ (val>>16); + return val; +} + +static void *falloc(void *e) +{ + ErtsSysExitStatus *se = e; + ErtsSysExitStatus *ne = malloc(sizeof(ErtsSysExitStatus)); + ne->os_pid = se->os_pid; + ne->port_id = se->port_id; + return ne; +} + +static void *meta_alloc(int type, size_t size) { return malloc(size); } +static void meta_free(int type, void *p) { free(p); } + +static int forker_hash_init(void) +{ + HashFunctions forker_hash_functions; + forker_hash_functions.hash = fhash; + forker_hash_functions.cmp = fcmp; + forker_hash_functions.alloc = falloc; + forker_hash_functions.free = free; + forker_hash_functions.meta_alloc = meta_alloc; + forker_hash_functions.meta_free = meta_free; + forker_hash_functions.meta_print = NULL; + + forker_hash = hash_new(0, "forker_hash", + 16, forker_hash_functions); + return 1; } diff --git a/erts/emulator/sys/unix/erl_child_setup.h b/erts/emulator/sys/unix/erl_child_setup.h new file mode 100644 index 0000000000..a28b136bfc --- /dev/null +++ b/erts/emulator/sys/unix/erl_child_setup.h @@ -0,0 +1,77 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2015-2015. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + * + * This file defines the interface inbetween erts and child_setup. + */ + +#ifndef _ERL_UNIX_FORKER_H +#define _ERL_UNIX_FORKER_H + +#include "sys.h" + +#ifdef __FreeBSD__ +/* The freebsd sendmsg man page explicitly states that + you should not close fds before they are known + to have reached the other side, so this Ack protects + against that. */ +#define FORKER_PROTO_START_ACK 1 +#endif + +#define FORKER_ARGV_NO_OF_ARGS 3 +#define FORKER_ARGV_PROGNAME_IX 0 /* Program name */ +#define FORKER_ARGV_MAX_FILES 1 /* max_files */ + +#define FORKER_FLAG_USE_STDIO (1 << 0) /* dup the pipe to stdin/stderr */ +#define FORKER_FLAG_EXIT_STATUS (1 << 1) /* send the exit status to parent */ +#define FORKER_FLAG_DO_READ (1 << 2) /* dup write fd */ +#define FORKER_FLAG_DO_WRITE (1 << 3) /* dup read fd */ + +#if SIZEOF_VOID_P == SIZEOF_LONG +typedef unsigned long ErtsSysPortId; +#elif SIZEOF_VOID_P == SIZEOF_INT +typedef unsigned int ErtsSysPortId; +#elif SIZEOF_VOID_P == SIZEOF_LONG_LONG +typedef unsigned long long ErtsSysPortId; +#endif + +typedef struct ErtsSysForkerProto_ { + enum { + ErtsSysForkerProtoAction_Start, + ErtsSysForkerProtoAction_StartAck, + ErtsSysForkerProtoAction_Go, + ErtsSysForkerProtoAction_SigChld, + ErtsSysForkerProtoAction_Ack + } action; + union { + struct { + ErtsSysPortId port_id; + int fds[3]; + } start; + struct { + pid_t os_pid; + int error_number; + } go; + struct { + ErtsSysPortId port_id; + int error_number; + } sigchld; + } u; +} ErtsSysForkerProto; + +#endif /* #ifndef _ERL_UNIX_FORKER_H */ diff --git a/erts/emulator/sys/unix/erl_main.c b/erts/emulator/sys/unix/erl_main.c index b26f93f77e..972b93a505 100644 --- a/erts/emulator/sys/unix/erl_main.c +++ b/erts/emulator/sys/unix/erl_main.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2000-2009. All Rights Reserved. + * Copyright Ericsson AB 2000-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ diff --git a/erts/emulator/sys/unix/erl_unix_sys.h b/erts/emulator/sys/unix/erl_unix_sys.h index c8fcec8547..3a0d23cd36 100644 --- a/erts/emulator/sys/unix/erl_unix_sys.h +++ b/erts/emulator/sys/unix/erl_unix_sys.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1997-2011. All Rights Reserved. + * Copyright Ericsson AB 1997-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% * @@ -29,9 +30,7 @@ #include <limits.h> #include <stdlib.h> #include <string.h> -#ifndef QNX #include <memory.h> -#endif #if defined(__sun__) && defined(__SVR4) && !defined(__EXTENSIONS__) # define __EXTENSIONS__ @@ -45,7 +44,7 @@ #include <fcntl.h> #include "erl_errno.h" #include <signal.h> - +#include <setjmp.h> #if HAVE_SYS_SOCKETIO_H # include <sys/socketio.h> @@ -91,11 +90,6 @@ #include <ieeefp.h> #endif -#ifdef QNX -#include <process.h> -#include <sys/qnx_glob.h> -#endif - #include <pwd.h> #ifndef HZ @@ -107,14 +101,17 @@ #endif #include <netdb.h> +#ifdef HAVE_MACH_ABSOLUTE_TIME +#include <mach/mach_time.h> +#endif + +#ifdef HAVE_POSIX_MEMALIGN +# define ERTS_HAVE_ERTS_SYS_ALIGNED_ALLOC 1 +#endif + /* * Make sure that MAXPATHLEN is defined. */ -#ifdef GETHRTIME_WITH_CLOCK_GETTIME -#undef HAVE_GETHRTIME -#define HAVE_GETHRTIME 1 -#endif - #ifndef MAXPATHLEN # ifdef PATH_MAX # define MAXPATHLEN PATH_MAX @@ -123,24 +120,19 @@ # endif #endif +/* + * Min number of async threads + */ +#define ERTS_MIN_NO_OF_ASYNC_THREADS 0 + /* File descriptors are numbers anc consecutively allocated on Unix */ #define ERTS_SYS_CONTINOUS_FD_NUMBERS -#define HAVE_ERTS_CHECK_IO_DEBUG -int erts_check_io_debug(void); - #ifndef ERTS_SMP # undef ERTS_POLL_NEED_ASYNC_INTERRUPT_SUPPORT # define ERTS_POLL_NEED_ASYNC_INTERRUPT_SUPPORT #endif -#ifndef ENABLE_CHILD_WAITER_THREAD -# ifdef ERTS_SMP -# define ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN -void erts_check_children(void); -# endif -#endif - typedef void *GETENV_STATE; /* @@ -154,35 +146,155 @@ typedef struct timeval SysTimeval; typedef struct tms SysTimes; -extern int erts_ticks_per_sec; - -#define SYS_CLK_TCK (erts_ticks_per_sec) +#define SYS_CLK_TCK (erts_sys_time_data__.r.o.ticks_per_sec) #define sys_times(Arg) times(Arg) -#define ERTS_WRAP_SYS_TIMES 1 -extern int erts_ticks_per_sec_wrap; -#define SYS_CLK_TCK_WRAP (erts_ticks_per_sec_wrap) -extern clock_t sys_times_wrap(void); +#if SIZEOF_LONG == 8 +typedef long ErtsMonotonicTime; +typedef long ErtsSysHrTime; +#elif SIZEOF_LONG_LONG == 8 +typedef long long ErtsMonotonicTime; +typedef long long ErtsSysHrTime; +#else +#error No signed 64-bit type found... +#endif -#ifdef HAVE_GETHRTIME -#ifdef GETHRTIME_WITH_CLOCK_GETTIME -typedef long long SysHrTime; +typedef ErtsMonotonicTime ErtsSystemTime; +typedef ErtsSysHrTime ErtsSysPerfCounter; -extern SysHrTime sys_gethrtime(void); -#define sys_init_hrtime() /* Nothing */ +#define ERTS_MONOTONIC_TIME_MIN ((ErtsMonotonicTime) (1ULL << 63)) +#define ERTS_MONOTONIC_TIME_MAX (~ERTS_MONOTONIC_TIME_MIN) -#else /* Real gethrtime (Solaris) */ +/* + * OS monotonic time and OS system time + */ +#undef ERTS_OS_TIMES_INLINE_FUNC_PTR_CALL__ -typedef hrtime_t SysHrTime; +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) \ + && defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) +# if defined(__linux__) +# define ERTS_OS_TIMES_INLINE_FUNC_PTR_CALL__ 1 +# endif +#endif -#define sys_gethrtime() gethrtime() -#define sys_init_hrtime() /* Nothing */ +ErtsSystemTime erts_os_system_time(void); -#endif /* GETHRTIME_WITH_CLOCK_GETTIME */ -#endif /* HAVE_GETHRTIME */ +#undef ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT +#undef ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT +#undef ERTS_OS_MONOTONIC_INLINE_FUNC_PTR_CALL__ -#if (defined(HAVE_GETHRVTIME) || defined(HAVE_CLOCK_GETTIME)) +#if defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) +# define ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT 1 +# define ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT (1000*1000*1000) +# if defined(__linux__) +# define ERTS_OS_MONOTONIC_INLINE_FUNC_PTR_CALL__ 1 +# endif +#elif defined(OS_MONOTONIC_TIME_USING_MACH_CLOCK_GET_TIME) +# define ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT 1 +# define ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT (1000*1000*1000) +#elif defined(OS_MONOTONIC_TIME_USING_GETHRTIME) +# define ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT 1 +# define ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT (1000*1000*1000) +#elif defined(OS_MONOTONIC_TIME_USING_TIMES) +# define ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT 1 +/* Time unit determined at runtime... */ +# define ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT 0 +#else /* No OS monotonic available... */ +# define ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT (1000*1000) +#endif + +/* + * erts_sys_hrtime() is the highest resolution + * time function found. Time unit is nano-seconds. + * It may or may not be monotonic. + */ +ErtsSysHrTime erts_sys_hrtime(void); +#define ERTS_HRTIME_UNIT (1000*1000*1000) + +struct erts_sys_time_read_only_data__ { +#ifdef ERTS_OS_MONOTONIC_INLINE_FUNC_PTR_CALL__ + ErtsMonotonicTime (*os_monotonic_time)(void); +#endif +#ifdef ERTS_OS_TIMES_INLINE_FUNC_PTR_CALL__ + void (*os_times)(ErtsMonotonicTime *, ErtsSystemTime *); +#endif + ErtsSysPerfCounter (*perf_counter)(void); + ErtsSysPerfCounter perf_counter_unit; + int ticks_per_sec; +}; + +typedef struct { + union { + struct erts_sys_time_read_only_data__ o; + char align__[(((sizeof(struct erts_sys_time_read_only_data__) - 1) + / ASSUMED_CACHE_LINE_SIZE) + 1) + * ASSUMED_CACHE_LINE_SIZE]; + } r; +} ErtsSysTimeData__; + +extern ErtsSysTimeData__ erts_sys_time_data__; + +#ifdef ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT + +#ifdef ERTS_OS_MONOTONIC_INLINE_FUNC_PTR_CALL__ +ERTS_GLB_INLINE +#endif +ErtsMonotonicTime erts_os_monotonic_time(void); + +#ifdef ERTS_OS_TIMES_INLINE_FUNC_PTR_CALL__ +ERTS_GLB_INLINE +#endif +void erts_os_times(ErtsMonotonicTime *, ErtsSystemTime *); + +#if ERTS_GLB_INLINE_INCL_FUNC_DEF + +#ifdef ERTS_OS_MONOTONIC_INLINE_FUNC_PTR_CALL__ + +ERTS_GLB_INLINE ErtsMonotonicTime +erts_os_monotonic_time(void) +{ + return (*erts_sys_time_data__.r.o.os_monotonic_time)(); +} + +#endif /* ERTS_OS_MONOTONIC_INLINE_FUNC_PTR_CALL__ */ + +#ifdef ERTS_OS_TIMES_INLINE_FUNC_PTR_CALL__ + +ERTS_GLB_INLINE void +erts_os_times(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep) +{ + return (*erts_sys_time_data__.r.o.os_times)(mtimep, stimep); +} + +#endif /* ERTS_OS_TIMES_INLINE_FUNC_PTR_CALL__ */ + +#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */ + +#endif /* ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT */ + +/* + * Functions for getting the performance counter + */ + +ERTS_GLB_INLINE ErtsSysPerfCounter erts_sys_perf_counter(void); +#define erts_sys_perf_counter_unit() erts_sys_time_data__.r.o.perf_counter_unit + +#if ERTS_GLB_INLINE_INCL_FUNC_DEF + +ERTS_GLB_INLINE ErtsSysPerfCounter +erts_sys_perf_counter() +{ + return (*erts_sys_time_data__.r.o.perf_counter)(); +} + +#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */ + +/* + * Functions for measuring CPU time + */ + +#if (defined(HAVE_GETHRVTIME) || defined(HAVE_CLOCK_GETTIME_CPU_TIME)) typedef long long SysCpuTime; typedef struct timespec SysTimespec; @@ -194,7 +306,7 @@ typedef struct timespec SysTimespec; int sys_start_hrvtime(void); int sys_stop_hrvtime(void); -#elif defined(HAVE_CLOCK_GETTIME) +#elif defined(HAVE_CLOCK_GETTIME_CPU_TIME) #define sys_clock_gettime(cid,tp) clock_gettime((cid),&(tp)) #define sys_get_proc_cputime(t,tp) sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID,(tp)) @@ -205,26 +317,39 @@ int sys_stop_hrvtime(void); #define SYS_CLOCK_RESOLUTION 1 /* These are defined in sys.c */ -#if defined(SIG_SIGSET) /* Old SysV */ -RETSIGTYPE (*sys_sigset())(); -#elif defined(SIG_SIGNAL) /* Old BSD */ -RETSIGTYPE (*sys_sigset())(); -#else -RETSIGTYPE (*sys_sigset(int, RETSIGTYPE (*func)(int)))(int); -#endif +typedef void (*SIGFUNC)(int); +extern SIGFUNC sys_signal(int, SIGFUNC); extern void sys_sigrelease(int); extern void sys_sigblock(int); -extern void sys_stop_cat(void); +extern void sys_init_suspend_handler(void); /* * Handling of floating point exceptions. */ #ifdef USE_ISINF_ISNAN /* simulate finite() */ -# define finite(f) (!isinf(f) && !isnan(f)) -# define HAVE_FINITE +# define isfinite(f) (!isinf(f) && !isnan(f)) +# define HAVE_ISFINITE +#elif (defined(__GNUC__) && !defined(__llvm__)) && defined(HAVE_FINITE) +/* We use finite in gcc as it emits assembler instead of + the function call that isfinite emits. The assembler is + significantly faster. */ +# ifdef isfinite +# undef isfinite +# endif +# define isfinite finite +# ifndef HAVE_ISFINITE +# define HAVE_ISFINITE +# endif +#elif defined(isfinite) && !defined(HAVE_ISFINITE) +# define HAVE_ISFINITE +#elif !defined(HAVE_ISFINITE) && defined(HAVE_FINITE) +# define isfinite finite +# define HAVE_ISFINITE #endif +#define erts_isfinite isfinite + #ifdef NO_FPE_SIGNALS #define erts_get_current_fp_exception() NULL @@ -232,7 +357,7 @@ extern void sys_stop_cat(void); #define erts_thread_init_fp_exception() do{}while(0) #endif # define __ERTS_FP_CHECK_INIT(fpexnp) do {} while (0) -# define __ERTS_FP_ERROR(fpexnp, f, Action) if (!finite(f)) { Action; } else {} +# define __ERTS_FP_ERROR(fpexnp, f, Action) if (!isfinite(f)) { Action; } else {} # define __ERTS_FP_ERROR_THOROUGH(fpexnp, f, Action) __ERTS_FP_ERROR(fpexnp, f, Action) # define __ERTS_SAVE_FP_EXCEPTION(fpexnp) # define __ERTS_RESTORE_FP_EXCEPTION(fpexnp) @@ -296,7 +421,7 @@ static __inline__ void __ERTS_FP_CHECK_INIT(volatile unsigned long *fp_exception code to always throw floating-point exceptions on errors. */ static __inline__ int erts_check_fpe_thorough(volatile unsigned long *fp_exception, double f) { - return erts_check_fpe(fp_exception, f) || !finite(f); + return erts_check_fpe(fp_exception, f) || !isfinite(f); } # define __ERTS_FP_ERROR_THOROUGH(fpexnp, f, Action) \ do { if (erts_check_fpe_thorough((fpexnp),(f))) { Action; } } while (0) @@ -311,19 +436,6 @@ void erts_sys_unblock_fpe(int); #define ERTS_FP_ERROR_THOROUGH(p, f, A) __ERTS_FP_ERROR_THOROUGH(&(p)->fp_exception, f, A) -#ifdef NEED_CHILD_SETUP_DEFINES -/* The child setup argv[] */ -#define CS_ARGV_PROGNAME_IX 0 /* Program name */ -#define CS_ARGV_UNBIND_IX 1 /* Unbind from cpu */ -#define CS_ARGV_WD_IX 2 /* Working directory */ -#define CS_ARGV_CMD_IX 3 /* Command */ -#define CS_ARGV_FD_CR_IX 4 /* Fd close range */ -#define CS_ARGV_DUP2_OP_IX(N) ((N) + 5) /* dup2 operations */ - -#define CS_ARGV_NO_OF_DUP2_OPS 3 /* Number of dup2 ops */ -#define CS_ARGV_NO_OF_ARGS 8 /* Number of arguments */ -#endif /* #ifdef NEED_CHILD_SETUP_DEFINES */ - /* Threads */ #ifdef USE_THREADS extern int init_async(int); @@ -332,4 +444,28 @@ extern int exit_async(void); #define ERTS_EXIT_AFTER_DUMP _exit +#if !defined(__APPLE__) && !defined(__MACH__) +/* Some OS X versions do not allow (ab)using signal handlers like this */ +#define ERTS_HAVE_TRY_CATCH 1 + +/* We try to simulate a try catch in C with the help of signal handlers. + * Only use this as a very last resort, as it is not very portable and + * quite unstable. It is also not thread safe, so make sure that only + * one thread can call this at a time! + */ +extern void erts_sys_sigsegv_handler(int); +extern jmp_buf erts_sys_sigsegv_jmp; +#define ERTS_SYS_TRY_CATCH(EXPR,CATCH) \ + do { \ + SIGFUNC prev_handler = sys_signal(SIGSEGV, \ + erts_sys_sigsegv_handler); \ + if (!setjmp(erts_sys_sigsegv_jmp)) { \ + EXPR; \ + } else { \ + CATCH; \ + } \ + sys_signal(SIGSEGV,prev_handler); \ + } while(0) +#endif + #endif /* #ifndef _ERL_UNIX_SYS_H */ diff --git a/erts/emulator/sys/unix/erl_unix_sys_ddll.c b/erts/emulator/sys/unix/erl_unix_sys_ddll.c index 12c47d0088..8f1ceac883 100644 --- a/erts/emulator/sys/unix/erl_unix_sys_ddll.c +++ b/erts/emulator/sys/unix/erl_unix_sys_ddll.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2006-2013. All Rights Reserved. + * Copyright Ericsson AB 2006-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -101,7 +102,7 @@ void erl_sys_ddll_init(void) { /* * Open a shared object */ -int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err) +int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err) { #if defined(HAVE_DLOPEN) char* dlname; @@ -123,6 +124,7 @@ int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* int erts_sys_ddll_open_noext(char *dlname, void **handle, ErtsSysDdllError* err) { +#if defined(HAVE_DLOPEN) int ret = ERL_DE_NO_ERROR; char *str; dlerror(); @@ -148,6 +150,9 @@ int erts_sys_ddll_open_noext(char *dlname, void **handle, ErtsSysDdllError* err) ret = ERL_DE_DYNAMIC_ERROR_OFFSET - find_errcode(str, err); } return ret; +#else + return ERL_DE_ERROR_NO_DDLL_FUNCTIONALITY; +#endif } /* diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c index a7ea4b2490..6fb86f6dda 100644 --- a/erts/emulator/sys/unix/sys.c +++ b/erts/emulator/sys/unix/sys.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1996-2013. All Rights Reserved. + * Copyright Ericsson AB 1996-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -34,6 +35,7 @@ #include <termios.h> #include <ctype.h> #include <sys/utsname.h> +#include <sys/select.h> #ifdef ISC32 #include <sys/bsdtypes.h> @@ -47,7 +49,6 @@ #include <sys/ioctl.h> #endif -#define NEED_CHILD_SETUP_DEFINES #define ERTS_WANT_BREAK_HANDLING #define ERTS_WANT_GOT_SIGUSR1 #define WANT_NONBLOCKING /* must define this to pull in defs from sys.h */ @@ -65,7 +66,7 @@ #include "erl_mseg.h" extern char **environ; -static erts_smp_rwmtx_t environ_rwmtx; +erts_smp_rwmtx_t environ_rwmtx; #define MAX_VSIZE 16 /* Max number of entries allowed in an I/O * vector sock_sendv(). @@ -74,68 +75,12 @@ static erts_smp_rwmtx_t environ_rwmtx; * Don't need global.h, but bif_table.h (included by bif.h), * won't compile otherwise */ -#include "global.h" +#include "global.h" #include "bif.h" -#include "erl_sys_driver.h" #include "erl_check_io.h" #include "erl_cpu_topology.h" -#ifndef DISABLE_VFORK -#define DISABLE_VFORK 0 -#endif - -#ifdef USE_THREADS -# ifdef ENABLE_CHILD_WAITER_THREAD -# define CHLDWTHR ENABLE_CHILD_WAITER_THREAD -# else -# define CHLDWTHR 0 -# endif -#else -# define CHLDWTHR 0 -#endif -/* - * [OTP-3906] - * Solaris signal management gets confused when threads are used and a - * lot of child processes dies. The confusion results in that SIGCHLD - * signals aren't delivered to the emulator which in turn results in - * a lot of defunct processes in the system. - * - * The problem seems to appear when a signal is frequently - * blocked/unblocked at the same time as the signal is frequently - * propagated. The child waiter thread is a workaround for this problem. - * The SIGCHLD signal is always blocked (in all threads), and the child - * waiter thread fetches the signal by a call to sigwait(). See - * child_waiter(). - */ - -typedef struct ErtsSysReportExit_ ErtsSysReportExit; -struct ErtsSysReportExit_ { - ErtsSysReportExit *next; - Eterm port; - int pid; - int ifd; - int ofd; -#if CHLDWTHR && !defined(ERTS_SMP) - int status; -#endif -}; - -/* This data is shared by these drivers - initialized by spawn_init() */ -static struct driver_data { - ErlDrvPort port_num; - int ofd, packet_bytes; - ErtsSysReportExit *report_exit; - int pid; - int alive; - int status; -} *driver_data; /* indexed by fd */ - -static ErtsSysReportExit *report_exit_list; -#if CHLDWTHR && !defined(ERTS_SMP) -static ErtsSysReportExit *report_exit_transit_list; -#endif - extern int driver_interrupt(int, int); extern void do_break(void); @@ -147,26 +92,6 @@ extern void erts_sys_init_float(void); extern void erl_crash_dump(char* file, int line, char* fmt, ...); -#define DIR_SEPARATOR_CHAR '/' - -#if defined(DEBUG) -#define ERL_BUILD_TYPE_MARKER ".debug" -#elif defined(PURIFY) -#define ERL_BUILD_TYPE_MARKER ".purify" -#elif defined(QUANTIFY) -#define ERL_BUILD_TYPE_MARKER ".quantify" -#elif defined(PURECOV) -#define ERL_BUILD_TYPE_MARKER ".purecov" -#elif defined(VALGRIND) -#define ERL_BUILD_TYPE_MARKER ".valgrind" -#else /* opt */ -#define ERL_BUILD_TYPE_MARKER -#endif - -#define CHILD_SETUP_PROG_NAME "child_setup" ERL_BUILD_TYPE_MARKER -#if !DISABLE_VFORK -static char *child_setup_prog; -#endif #ifdef DEBUG static int debug_log = 0; @@ -190,54 +115,22 @@ static volatile int have_prepared_crash_dump; (have_prepared_crash_dump++) #endif -static erts_smp_atomic_t sys_misc_mem_sz; +erts_smp_atomic_t sys_misc_mem_sz; #if defined(ERTS_SMP) static void smp_sig_notify(char c); static int sig_notify_fds[2] = {-1, -1}; -#elif defined(USE_THREADS) -static int async_fd[2]; -#endif -#if CHLDWTHR || defined(ERTS_SMP) -erts_mtx_t chld_stat_mtx; -#endif -#if CHLDWTHR -static erts_tid_t child_waiter_tid; -/* chld_stat_mtx is used to protect against concurrent accesses - of the driver_data fields pid, alive, and status. */ -erts_cnd_t chld_stat_cnd; -static long children_alive; -#define CHLD_STAT_LOCK erts_mtx_lock(&chld_stat_mtx) -#define CHLD_STAT_UNLOCK erts_mtx_unlock(&chld_stat_mtx) -#define CHLD_STAT_WAIT erts_cnd_wait(&chld_stat_cnd, &chld_stat_mtx) -#define CHLD_STAT_SIGNAL erts_cnd_signal(&chld_stat_cnd) -#elif defined(ERTS_SMP) /* ------------------------------------------------- */ -#define CHLD_STAT_LOCK erts_mtx_lock(&chld_stat_mtx) -#define CHLD_STAT_UNLOCK erts_mtx_unlock(&chld_stat_mtx) - -#else /* ------------------------------------------------------------------- */ -#define CHLD_STAT_LOCK -#define CHLD_STAT_UNLOCK -static volatile int children_died; +#if !defined(ETHR_UNUSABLE_SIGUSRX) && defined(ERTS_THR_HAVE_SIG_FUNCS) +static int sig_suspend_fds[2] = {-1, -1}; +#define ERTS_SYS_SUSPEND_SIGNAL SIGUSR2 #endif +#endif -static struct fd_data { - char pbuf[4]; /* hold partial packet bytes */ - int psz; /* size of pbuf */ - char *buf; - char *cpos; - int sz; - int remain; /* for input on fd */ -} *fd_data; /* indexed by fd */ - -/* static FUNCTION(int, write_fill, (int, char*, int)); unused? */ -static void note_child_death(int, int); +jmp_buf erts_sys_sigsegv_jmp; -#if CHLDWTHR -static void* child_waiter(void *); -#endif +static int crashdump_companion_cube_fd = -1; /********************* General functions ****************************/ @@ -273,11 +166,11 @@ struct { int (*event)(ErlDrvPort, ErlDrvEvent, ErlDrvEventData); void (*check_io_as_interrupt)(void); void (*check_io_interrupt)(int); - void (*check_io_interrupt_tmd)(int, erts_short_time_t); + void (*check_io_interrupt_tmd)(int, ErtsMonotonicTime); void (*check_io)(int); Uint (*size)(void); Eterm (*info)(void *); - int (*check_io_debug)(void); + int (*check_io_debug)(ErtsCheckIoDebugInfo *); } io_func = {0}; @@ -299,9 +192,9 @@ Eterm erts_check_io_info(void *p) } int -erts_check_io_debug(void) +erts_check_io_debug(ErtsCheckIoDebugInfo *ip) { - return (*io_func.check_io_debug)(); + return (*io_func.check_io_debug)(ip); } @@ -379,9 +272,9 @@ erts_sys_schedule_interrupt(int set) #ifdef ERTS_SMP void -erts_sys_schedule_interrupt_timed(int set, erts_short_time_t msec) +erts_sys_schedule_interrupt_timed(int set, ErtsMonotonicTime timeout_time) { - ERTS_CHK_IO_INTR_TMD(set, msec); + ERTS_CHK_IO_INTR_TMD(set, timeout_time); } #endif @@ -409,15 +302,18 @@ void sys_tty_reset(int exit_code) #ifdef __tile__ /* Direct malloc to spread memory around the caches of multiple tiles. */ #include <malloc.h> +#if defined(MALLOC_USE_HASH) MALLOC_USE_HASH(1); #endif +#endif #ifdef USE_THREADS #ifdef ERTS_THR_HAVE_SIG_FUNCS + /* * Child thread inherits parents signal mask at creation. In order to - * guarantee that the main thread will receive all SIGINT, SIGCHLD, and + * guarantee that the main thread will receive all SIGINT, and * SIGUSR1 signals sent to the process, we block these signals in the * parent thread when creating a new thread. */ @@ -495,11 +391,14 @@ thr_create_prepare_child(void *vtcdp) void erts_sys_pre_init(void) { +#ifdef USE_THREADS + erts_thr_init_data_t eid = ERTS_THR_INIT_DATA_DEF_INITER; +#endif + erts_printf_add_cr_to_stdout = 1; erts_printf_add_cr_to_stderr = 1; + #ifdef USE_THREADS - { - erts_thr_init_data_t eid = ERTS_THR_INIT_DATA_DEF_INITER; eid.thread_create_child_func = thr_create_prepare_child; /* Before creation in parent */ @@ -510,29 +409,21 @@ erts_sys_pre_init(void) #ifdef ERTS_THR_HAVE_SIG_FUNCS sigemptyset(&thr_create_sigmask); sigaddset(&thr_create_sigmask, SIGINT); /* block interrupt */ - sigaddset(&thr_create_sigmask, SIGCHLD); /* block child signals */ sigaddset(&thr_create_sigmask, SIGUSR1); /* block user defined signal */ #endif erts_thr_init(&eid); - report_exit_list = NULL; - #ifdef ERTS_ENABLE_LOCK_COUNT erts_lcnt_init(); #endif -#if CHLDWTHR || defined(ERTS_SMP) - erts_mtx_init(&chld_stat_mtx, "child_status"); -#endif -#if CHLDWTHR -#ifndef ERTS_SMP - report_exit_transit_list = NULL; -#endif - erts_cnd_init(&chld_stat_cnd); - children_alive = 0; -#endif - } +#endif /* USE_THREADS */ + + erts_init_sys_time_sup(); + +#ifdef USE_THREADS + #ifdef ERTS_SMP erts_smp_atomic32_init_nob(&erts_break_requested, 0); erts_smp_atomic32_init_nob(&erts_got_sigusr1, 0); @@ -542,49 +433,42 @@ erts_sys_pre_init(void) erts_got_sigusr1 = 0; have_prepared_crash_dump = 0; #endif -#if !CHLDWTHR && !defined(ERTS_SMP) - children_died = 0; -#endif + #endif /* USE_THREADS */ + erts_smp_atomic_init_nob(&sys_misc_mem_sz, 0); + + { + /* + * Unfortunately we depend on fd 0,1,2 in the old shell code. + * So if for some reason we do not have those open when we start + * we have to open them here. Not doing this can cause the emulator + * to deadlock when reaping the fd_driver ports :( + */ + int fd; + /* Make sure fd 0 is open */ + if ((fd = open("/dev/null", O_RDONLY)) != 0) + close(fd); + /* Make sure fds 1 and 2 are open */ + while (fd < 3) { + fd = open("/dev/null", O_WRONLY); + } + close(fd); + } + + /* We need a file descriptor to close in the crashdump creation. + * We close this one to be sure we can get a fd for our real file ... + * so, we create one here ... a stone to carry all the way home. + */ + + crashdump_companion_cube_fd = open("/dev/null", O_RDONLY); + + /* don't lose it, there will be cake */ } void erl_sys_init(void) { -#if !DISABLE_VFORK - { - int res; - char bindir[MAXPATHLEN]; - size_t bindirsz = sizeof(bindir); - Uint csp_path_sz; - - res = erts_sys_getenv_raw("BINDIR", bindir, &bindirsz); - if (res != 0) { - if (res < 0) - erl_exit(-1, - "Environment variable BINDIR is not set\n"); - if (res > 0) - erl_exit(-1, - "Value of environment variable BINDIR is too large\n"); - } - if (bindir[0] != DIR_SEPARATOR_CHAR) - erl_exit(-1, - "Environment variable BINDIR does not contain an" - " absolute path\n"); - csp_path_sz = (strlen(bindir) - + 1 /* DIR_SEPARATOR_CHAR */ - + sizeof(CHILD_SETUP_PROG_NAME) - + 1); - child_setup_prog = erts_alloc(ERTS_ALC_T_CS_PROG_PATH, csp_path_sz); - erts_smp_atomic_add_nob(&sys_misc_mem_sz, csp_path_sz); - erts_snprintf(child_setup_prog, csp_path_sz, - "%s%c%s", - bindir, - DIR_SEPARATOR_CHAR, - CHILD_SETUP_PROG_NAME); - } -#endif #ifdef USE_SETLINEBUF setlinebuf(stdout); @@ -604,39 +488,7 @@ erl_sys_init(void) /* signal handling */ -#ifdef SIG_SIGSET /* Old SysV */ -RETSIGTYPE (*sys_sigset(sig, func))() -int sig; -RETSIGTYPE (*func)(); -{ - return(sigset(sig, func)); -} -void sys_sigblock(int sig) -{ - sighold(sig); -} -void sys_sigrelease(int sig) -{ - sigrelse(sig); -} -#else /* !SIG_SIGSET */ -#ifdef SIG_SIGNAL /* Old BSD */ -RETSIGTYPE (*sys_sigset(sig, func))(int, int) -int sig; -RETSIGTYPE (*func)(); -{ - return(signal(sig, func)); -} -sys_sigblock(int sig) -{ - sigblock(sig); -} -sys_sigrelease(int sig) -{ - sigsetmask(sigblock(0) & ~sigmask(sig)); -} -#else /* !SIG_SIGNAL */ /* The True Way - POSIX!:-) */ -RETSIGTYPE (*sys_sigset(int sig, RETSIGTYPE (*func)(int)))(int) +SIGFUNC sys_signal(int sig, SIGFUNC func) { struct sigaction act, oact; @@ -669,36 +521,47 @@ void sys_sigrelease(int sig) sigaddset(&mask, sig); sigprocmask(SIG_UNBLOCK, &mask, (sigset_t *)NULL); } -#endif /* !SIG_SIGNAL */ -#endif /* !SIG_SIGSET */ -#if (0) /* not used? -- gordon */ -static void (*break_func)(); -static RETSIGTYPE break_handler(int sig) -{ -#ifdef QNX - /* Turn off SIGCHLD during break processing */ - sys_sigblock(SIGCHLD); -#endif - (*break_func)(); -#ifdef QNX - sys_sigrelease(SIGCHLD); -#endif +void erts_sys_sigsegv_handler(int signo) { + if (signo == SIGSEGV) { + longjmp(erts_sys_sigsegv_jmp, 1); + } +} + +/* + * Function returns 1 if we can read from all values in between + * start and stop. + */ +int +erts_sys_is_area_readable(char *start, char *stop) { + int fds[2]; + if (!pipe(fds)) { + /* We let write try to figure out if the pointers are readable */ + int res = write(fds[1], start, (char*)stop - (char*)start); + if (res == -1) { + close(fds[0]); + close(fds[1]); + return 0; + } + close(fds[0]); + close(fds[1]); + return 1; + } + return 0; + } -#endif /* 0 */ static ERTS_INLINE int prepare_crash_dump(int secs) { #define NUFBUF (3) - int i, max; + int i; char env[21]; /* enough to hold any 64-bit integer */ size_t envsz; DeclareTmpHeapNoproc(heap,NUFBUF); Port *heart_port; Eterm *hp = heap; Eterm list = NIL; - int heart_fd[2] = {-1,-1}; int has_heart = 0; UseTmpHeapNoproc(NUFBUF); @@ -721,43 +584,22 @@ prepare_crash_dump(int secs) alarm((unsigned int)secs); } - if (heart_port) { - /* hearts input fd - * We "know" drv_data is the in_fd since the port is started with read|write - */ - heart_fd[0] = (int)heart_port->drv_data; - heart_fd[1] = (int)driver_data[heart_fd[0]].ofd; - has_heart = 1; + /* close all viable sockets via emergency close callbacks. + * Specifically we want to close epmd sockets. + */ - list = CONS(hp, make_small(8), list); hp += 2; + erts_emergency_close_ports(); + if (heart_port) { + has_heart = 1; + list = CONS(hp, make_small(8), list); hp += 2; /* send to heart port, CMD = 8, i.e. prepare crash dump =o */ erts_port_output(NULL, ERTS_PORT_SIG_FLG_FORCE_IMM_CALL, heart_port, heart_port->common.id, list, NULL); } - /* Make sure we unregister at epmd (unknown fd) and get at least - one free filedescriptor (for erl_crash.dump) */ - - max = max_files; - if (max < 1024) - max = 1024; - for (i = 3; i < max; i++) { -#if defined(ERTS_SMP) - /* We don't want to close the signal notification pipe... */ - if (i == sig_notify_fds[0] || i == sig_notify_fds[1]) - continue; -#elif defined(USE_THREADS) - /* We don't want to close the async notification pipe... */ - if (i == async_fd[0] || i == async_fd[1]) - continue; -#endif - /* We don't want to close our heart yet ... */ - if (i == heart_fd[0] || i == heart_fd[1]) - continue; - - close(i); - } + /* Make sure we have a fd for our crashdump file. */ + close(crashdump_companion_cube_fd); envsz = sizeof(env); i = erts_sys_getenv__("ERL_CRASH_DUMP_NICE", env, &envsz); @@ -791,7 +633,7 @@ break_requested(void) fprintf(stderr,"break!\n"); #endif if (ERTS_BREAK_REQUESTED) - erl_exit(ERTS_INTR_EXIT, ""); + erts_exit(ERTS_INTR_EXIT, ""); ERTS_SET_BREAK_REQUESTED; ERTS_CHK_IO_AS_INTR(); /* Make sure we don't sleep in poll */ @@ -830,14 +672,28 @@ sigusr1_exit(void) } prepare_crash_dump(secs); - erl_exit(1, "Received SIGUSR1\n"); + erts_exit(ERTS_ERROR_EXIT, "Received SIGUSR1\n"); } #ifdef ETHR_UNUSABLE_SIGUSRX #warning "Unusable SIGUSR1 & SIGUSR2. Disabling use of these signals" -#endif -#ifndef ETHR_UNUSABLE_SIGUSRX +#else + +#ifdef ERTS_SYS_SUSPEND_SIGNAL +void +sys_thr_suspend(erts_tid_t tid) { + erts_thr_kill(tid, ERTS_SYS_SUSPEND_SIGNAL); +} + +void +sys_thr_resume(erts_tid_t tid) { + int i = 0, res; + do { + res = write(sig_suspend_fds[1],&i,sizeof(i)); + } while (res < 0 && errno == EAGAIN); +} +#endif #if (defined(SIG_SIGSET) || defined(SIG_SIGNAL)) static RETSIGTYPE user_signal1(void) @@ -852,27 +708,27 @@ static RETSIGTYPE user_signal1(int signum) #endif } -#ifdef QUANTIFY +#ifdef ERTS_SYS_SUSPEND_SIGNAL #if (defined(SIG_SIGSET) || defined(SIG_SIGNAL)) -static RETSIGTYPE user_signal2(void) +static RETSIGTYPE suspend_signal(void) #else -static RETSIGTYPE user_signal2(int signum) +static RETSIGTYPE suspend_signal(int signum) #endif { -#ifdef ERTS_SMP - smp_sig_notify('2'); -#else - quantify_save_data(); -#endif + int res; + int buf[1]; + do { + res = read(sig_suspend_fds[0], buf, sizeof(int)); + } while (res < 0 && errno == EINTR); } -#endif +#endif /* #ifdef ERTS_SYS_SUSPEND_SIGNAL */ #endif /* #ifndef ETHR_UNUSABLE_SIGUSRX */ static void quit_requested(void) { - erl_exit(ERTS_INTR_EXIT, ""); + erts_exit(ERTS_INTR_EXIT, ""); } #if (defined(SIG_SIGSET) || defined(SIG_SIGNAL)) @@ -890,9 +746,9 @@ static RETSIGTYPE do_quit(int signum) /* Disable break */ void erts_set_ignore_break(void) { - sys_sigset(SIGINT, SIG_IGN); - sys_sigset(SIGQUIT, SIG_IGN); - sys_sigset(SIGTSTP, SIG_IGN); + sys_signal(SIGINT, SIG_IGN); + sys_signal(SIGQUIT, SIG_IGN); + sys_signal(SIGTSTP, SIG_IGN); } /* Don't use ctrl-c for break handler but let it be @@ -915,65 +771,24 @@ void erts_replace_intr(void) { void init_break_handler(void) { - sys_sigset(SIGINT, request_break); + sys_signal(SIGINT, request_break); #ifndef ETHR_UNUSABLE_SIGUSRX - sys_sigset(SIGUSR1, user_signal1); -#ifdef QUANTIFY - sys_sigset(SIGUSR2, user_signal2); -#endif + sys_signal(SIGUSR1, user_signal1); #endif /* #ifndef ETHR_UNUSABLE_SIGUSRX */ - sys_sigset(SIGQUIT, do_quit); -} - -int sys_max_files(void) -{ - return(max_files); + sys_signal(SIGQUIT, do_quit); } -static void block_signals(void) +void sys_init_suspend_handler(void) { -#if !CHLDWTHR - sys_sigblock(SIGCHLD); -#endif -#ifndef ERTS_SMP - sys_sigblock(SIGINT); -#ifndef ETHR_UNUSABLE_SIGUSRX - sys_sigblock(SIGUSR1); -#endif +#ifdef ERTS_SYS_SUSPEND_SIGNAL + sys_signal(ERTS_SYS_SUSPEND_SIGNAL, suspend_signal); #endif } -static void unblock_signals(void) -{ - /* Update erl_child_setup.c if changed */ -#if !CHLDWTHR - sys_sigrelease(SIGCHLD); -#endif -#ifndef ERTS_SMP - sys_sigrelease(SIGINT); -#ifndef ETHR_UNUSABLE_SIGUSRX - sys_sigrelease(SIGUSR1); -#endif /* #ifndef ETHR_UNUSABLE_SIGUSRX */ -#endif -} -/************************** Time stuff **************************/ -#ifdef HAVE_GETHRTIME -#ifdef GETHRTIME_WITH_CLOCK_GETTIME - -SysHrTime sys_gethrtime(void) +int sys_max_files(void) { - struct timespec ts; - long long result; - if (clock_gettime(CLOCK_MONOTONIC,&ts) != 0) { - erl_exit(1,"Fatal, could not get clock_monotonic value!, " - "errno = %d\n", errno); - } - result = ((long long) ts.tv_sec) * 1000000000LL + - ((long long) ts.tv_nsec); - return (SysHrTime) result; + return(max_files); } -#endif -#endif /************************** OS info *******************************/ @@ -1062,1306 +877,6 @@ void fini_getenv_state(GETENV_STATE *state) erts_smp_rwmtx_runlock(&environ_rwmtx); } - -/************************** Port I/O *******************************/ - - - -/* I. Common stuff */ - -/* - * Decreasing the size of it below 16384 is not allowed. - */ - -/* II. The spawn/fd/vanilla drivers */ - -#define ERTS_SYS_READ_BUF_SZ (64*1024) - -/* Driver interfaces */ -static ErlDrvData spawn_start(ErlDrvPort, char*, SysDriverOpts*); -static ErlDrvData fd_start(ErlDrvPort, char*, SysDriverOpts*); -static ErlDrvSSizeT fd_control(ErlDrvData, unsigned int, char *, ErlDrvSizeT, - char **, ErlDrvSizeT); -static ErlDrvData vanilla_start(ErlDrvPort, char*, SysDriverOpts*); -static int spawn_init(void); -static void fd_stop(ErlDrvData); -static void stop(ErlDrvData); -static void ready_input(ErlDrvData, ErlDrvEvent); -static void ready_output(ErlDrvData, ErlDrvEvent); -static void output(ErlDrvData, char*, ErlDrvSizeT); -static void outputv(ErlDrvData, ErlIOVec*); -static void stop_select(ErlDrvEvent, void*); - -struct erl_drv_entry spawn_driver_entry = { - spawn_init, - spawn_start, - stop, - output, - ready_input, - ready_output, - "spawn", - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - ERL_DRV_EXTENDED_MARKER, - ERL_DRV_EXTENDED_MAJOR_VERSION, - ERL_DRV_EXTENDED_MINOR_VERSION, - ERL_DRV_FLAG_USE_PORT_LOCKING, - NULL, NULL, - stop_select -}; -struct erl_drv_entry fd_driver_entry = { - NULL, - fd_start, - fd_stop, - output, - ready_input, - ready_output, - "fd", - NULL, - NULL, - fd_control, - NULL, - outputv, - NULL, /* ready_async */ - NULL, /* flush */ - NULL, /* call */ - NULL, /* event */ - ERL_DRV_EXTENDED_MARKER, - ERL_DRV_EXTENDED_MAJOR_VERSION, - ERL_DRV_EXTENDED_MINOR_VERSION, - 0, /* ERL_DRV_FLAGs */ - NULL, /* handle2 */ - NULL, /* process_exit */ - stop_select -}; -struct erl_drv_entry vanilla_driver_entry = { - NULL, - vanilla_start, - stop, - output, - ready_input, - ready_output, - "vanilla", - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, - NULL, /* flush */ - NULL, /* call */ - NULL, /* event */ - ERL_DRV_EXTENDED_MARKER, - ERL_DRV_EXTENDED_MAJOR_VERSION, - ERL_DRV_EXTENDED_MINOR_VERSION, - 0, /* ERL_DRV_FLAGs */ - NULL, /* handle2 */ - NULL, /* process_exit */ - stop_select -}; - -/* Handle SIGCHLD signals. */ -#if (defined(SIG_SIGSET) || defined(SIG_SIGNAL)) -static RETSIGTYPE onchld(void) -#else -static RETSIGTYPE onchld(int signum) -#endif -{ -#if CHLDWTHR - ASSERT(0); /* We should *never* catch a SIGCHLD signal */ -#elif defined(ERTS_SMP) - smp_sig_notify('C'); -#else - children_died = 1; - ERTS_CHK_IO_AS_INTR(); /* Make sure we don't sleep in poll */ -#endif -} - -static int set_driver_data(ErlDrvPort port_num, - int ifd, - int ofd, - int packet_bytes, - int read_write, - int exit_status, - int pid) -{ - Port *prt; - ErtsSysReportExit *report_exit; - - if (!exit_status) - report_exit = NULL; - else { - report_exit = erts_alloc(ERTS_ALC_T_PRT_REP_EXIT, - sizeof(ErtsSysReportExit)); - report_exit->next = report_exit_list; - report_exit->port = erts_drvport2id(port_num); - report_exit->pid = pid; - report_exit->ifd = read_write & DO_READ ? ifd : -1; - report_exit->ofd = read_write & DO_WRITE ? ofd : -1; -#if CHLDWTHR && !defined(ERTS_SMP) - report_exit->status = 0; -#endif - report_exit_list = report_exit; - } - - prt = erts_drvport2port(port_num); - if (prt != ERTS_INVALID_ERL_DRV_PORT) - prt->os_pid = pid; - - if (read_write & DO_READ) { - driver_data[ifd].packet_bytes = packet_bytes; - driver_data[ifd].port_num = port_num; - driver_data[ifd].report_exit = report_exit; - driver_data[ifd].pid = pid; - driver_data[ifd].alive = 1; - driver_data[ifd].status = 0; - if (read_write & DO_WRITE) { - driver_data[ifd].ofd = ofd; - if (ifd != ofd) - driver_data[ofd] = driver_data[ifd]; /* structure copy */ - } else { /* DO_READ only */ - driver_data[ifd].ofd = -1; - } - (void) driver_select(port_num, ifd, (ERL_DRV_READ|ERL_DRV_USE), 1); - return(ifd); - } else { /* DO_WRITE only */ - driver_data[ofd].packet_bytes = packet_bytes; - driver_data[ofd].port_num = port_num; - driver_data[ofd].report_exit = report_exit; - driver_data[ofd].ofd = ofd; - driver_data[ofd].pid = pid; - driver_data[ofd].alive = 1; - driver_data[ofd].status = 0; - return(ofd); - } -} - -static int spawn_init() -{ - int i; -#if CHLDWTHR - erts_thr_opts_t thr_opts = ERTS_THR_OPTS_DEFAULT_INITER; - thr_opts.detached = 0; - thr_opts.suggested_stack_size = 0; /* Smallest possible */ -#endif - - sys_sigset(SIGPIPE, SIG_IGN); /* Ignore - we'll handle the write failure */ - driver_data = (struct driver_data *) - erts_alloc(ERTS_ALC_T_DRV_TAB, max_files * sizeof(struct driver_data)); - erts_smp_atomic_add_nob(&sys_misc_mem_sz, - max_files * sizeof(struct driver_data)); - - for (i = 0; i < max_files; i++) - driver_data[i].pid = -1; - -#if CHLDWTHR - sys_sigblock(SIGCHLD); -#endif - - sys_sigset(SIGCHLD, onchld); /* Reap children */ - -#if CHLDWTHR - erts_thr_create(&child_waiter_tid, child_waiter, NULL, &thr_opts); -#endif - - return 1; -} - -static void close_pipes(int ifd[2], int ofd[2], int read_write) -{ - if (read_write & DO_READ) { - (void) close(ifd[0]); - (void) close(ifd[1]); - } - if (read_write & DO_WRITE) { - (void) close(ofd[0]); - (void) close(ofd[1]); - } -} - -static void init_fd_data(int fd, ErlDrvPort port_num) -{ - fd_data[fd].buf = NULL; - fd_data[fd].cpos = NULL; - fd_data[fd].remain = 0; - fd_data[fd].sz = 0; - fd_data[fd].psz = 0; -} - -static char **build_unix_environment(char *block) -{ - int i; - int j; - int len; - char *cp; - char **cpp; - char** old_env; - - ERTS_SMP_LC_ASSERT(erts_smp_lc_rwmtx_is_rlocked(&environ_rwmtx)); - - cp = block; - len = 0; - while (*cp != '\0') { - cp += strlen(cp) + 1; - len++; - } - old_env = environ; - while (*old_env++ != NULL) { - len++; - } - - cpp = (char **) erts_alloc_fnf(ERTS_ALC_T_ENVIRONMENT, - sizeof(char *) * (len+1)); - if (cpp == NULL) { - return NULL; - } - - cp = block; - len = 0; - while (*cp != '\0') { - cpp[len] = cp; - cp += strlen(cp) + 1; - len++; - } - - i = len; - for (old_env = environ; *old_env; old_env++) { - char* old = *old_env; - - for (j = 0; j < len; j++) { - char *s, *t; - - s = cpp[j]; - t = old; - while (*s == *t && *s != '=') { - s++, t++; - } - if (*s == '=' && *t == '=') { - break; - } - } - - if (j == len) { /* New version not found */ - cpp[len++] = old; - } - } - - for (j = 0; j < i; ) { - size_t last = strlen(cpp[j])-1; - if (cpp[j][last] == '=' && strchr(cpp[j], '=') == cpp[j]+last) { - cpp[j] = cpp[--len]; - if (len < i) { - i--; - } else { - j++; - } - } - else { - j++; - } - } - - cpp[len] = NULL; - return cpp; -} - -/* - [arndt] In most Unix systems, including Solaris 2.5, 'fork' allocates memory - in swap space for the child of a 'fork', whereas 'vfork' does not do this. - The natural call to use here is therefore 'vfork'. Due to a bug in - 'vfork' in Solaris 2.5 (apparently fixed in 2.6), using 'vfork' - can be dangerous in what seems to be these circumstances: - If the child code under a vfork sets the signal action to SIG_DFL - (or SIG_IGN) - for any signal which was previously set to a signal handler, the - state of the parent is clobbered, so that the later arrival of - such a signal yields a sigsegv in the parent. If the signal was - not set to a signal handler, but ignored, all seems to work. - If you change the forking code below, beware of this. - */ - -static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts) -{ -#define CMD_LINE_PREFIX_STR "exec " -#define CMD_LINE_PREFIX_STR_SZ (sizeof(CMD_LINE_PREFIX_STR) - 1) - - int ifd[2], ofd[2], len, pid, i; - char **volatile new_environ; /* volatile since a vfork() then cannot - cause 'new_environ' to be clobbered - in the parent process. */ - int saved_errno; - long res; - char *cmd_line; -#ifndef QNX - int unbind; -#endif -#if !DISABLE_VFORK - int no_vfork; - size_t no_vfork_sz = sizeof(no_vfork); - - no_vfork = (erts_sys_getenv_raw("ERL_NO_VFORK", - (char *) &no_vfork, - &no_vfork_sz) >= 0); -#endif - - switch (opts->read_write) { - case DO_READ: - if (pipe(ifd) < 0) - return ERL_DRV_ERROR_ERRNO; - if (ifd[0] >= max_files) { - close_pipes(ifd, ofd, opts->read_write); - errno = EMFILE; - return ERL_DRV_ERROR_ERRNO; - } - ofd[1] = -1; /* keep purify happy */ - break; - case DO_WRITE: - if (pipe(ofd) < 0) return ERL_DRV_ERROR_ERRNO; - if (ofd[1] >= max_files) { - close_pipes(ifd, ofd, opts->read_write); - errno = EMFILE; - return ERL_DRV_ERROR_ERRNO; - } - ifd[0] = -1; /* keep purify happy */ - break; - case DO_READ|DO_WRITE: - if (pipe(ifd) < 0) return ERL_DRV_ERROR_ERRNO; - errno = EMFILE; /* default for next two conditions */ - if (ifd[0] >= max_files || pipe(ofd) < 0) { - close_pipes(ifd, ofd, DO_READ); - return ERL_DRV_ERROR_ERRNO; - } - if (ofd[1] >= max_files) { - close_pipes(ifd, ofd, opts->read_write); - errno = EMFILE; - return ERL_DRV_ERROR_ERRNO; - } - break; - default: - ASSERT(0); - return ERL_DRV_ERROR_GENERAL; - } - - if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { - /* started with spawn_executable, not with spawn */ - len = strlen(name); - cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP, len + 1); - if (!cmd_line) { - close_pipes(ifd, ofd, opts->read_write); - errno = ENOMEM; - return ERL_DRV_ERROR_ERRNO; - } - memcpy((void *) cmd_line,(void *) name, len); - cmd_line[len] = '\0'; - if (access(cmd_line,X_OK) != 0) { - int save_errno = errno; - erts_free(ERTS_ALC_T_TMP, cmd_line); - errno = save_errno; - return ERL_DRV_ERROR_ERRNO; - } - } else { - /* make the string suitable for giving to "sh" */ - len = strlen(name); - cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP, - CMD_LINE_PREFIX_STR_SZ + len + 1); - if (!cmd_line) { - close_pipes(ifd, ofd, opts->read_write); - errno = ENOMEM; - return ERL_DRV_ERROR_ERRNO; - } - memcpy((void *) cmd_line, - (void *) CMD_LINE_PREFIX_STR, - CMD_LINE_PREFIX_STR_SZ); - memcpy((void *) (cmd_line + CMD_LINE_PREFIX_STR_SZ), (void *) name, len); - cmd_line[CMD_LINE_PREFIX_STR_SZ + len] = '\0'; - } - - erts_smp_rwmtx_rlock(&environ_rwmtx); - - if (opts->envir == NULL) { - new_environ = environ; - } else if ((new_environ = build_unix_environment(opts->envir)) == NULL) { - erts_smp_rwmtx_runlock(&environ_rwmtx); - erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); - errno = ENOMEM; - return ERL_DRV_ERROR_ERRNO; - } - -#ifndef QNX - /* Block child from SIGINT and SIGUSR1. Must be before fork() - to be safe. */ - block_signals(); - - CHLD_STAT_LOCK; - - unbind = erts_sched_bind_atfork_prepare(); - -#if !DISABLE_VFORK - /* See fork/vfork discussion before this function. */ - if (no_vfork) { -#endif - - DEBUGF(("Using fork\n")); - pid = fork(); - - if (pid == 0) { - /* The child! Setup child... */ - - if (erts_sched_bind_atfork_child(unbind) != 0) - goto child_error; - - /* OBSERVE! - * Keep child setup after vfork() (implemented below and in - * erl_child_setup.c) up to date if changes are made here. - */ - - if (opts->use_stdio) { - if (opts->read_write & DO_READ) { - /* stdout for process */ - if (dup2(ifd[1], 1) < 0) - goto child_error; - if(opts->redir_stderr) - /* stderr for process */ - if (dup2(ifd[1], 2) < 0) - goto child_error; - } - if (opts->read_write & DO_WRITE) - /* stdin for process */ - if (dup2(ofd[0], 0) < 0) - goto child_error; - } - else { /* XXX will fail if ofd[0] == 4 (unlikely..) */ - if (opts->read_write & DO_READ) - if (dup2(ifd[1], 4) < 0) - goto child_error; - if (opts->read_write & DO_WRITE) - if (dup2(ofd[0], 3) < 0) - goto child_error; - } - - for (i = opts->use_stdio ? 3 : 5; i < max_files; i++) - (void) close(i); - - if (opts->wd && chdir(opts->wd) < 0) - goto child_error; - -#if defined(USE_SETPGRP_NOARGS) /* SysV */ - (void) setpgrp(); -#elif defined(USE_SETPGRP) /* BSD */ - (void) setpgrp(0, getpid()); -#else /* POSIX */ - (void) setsid(); -#endif - - unblock_signals(); - - if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { - if (opts->argv == NULL) { - execle(cmd_line,cmd_line,(char *) NULL, new_environ); - } else { - if (opts->argv[0] == erts_default_arg0) { - opts->argv[0] = cmd_line; - } - execve(cmd_line, opts->argv, new_environ); - if (opts->argv[0] == cmd_line) { - opts->argv[0] = erts_default_arg0; - } - } - } else { - execle("/bin/sh", "sh", "-c", cmd_line, (char *) NULL, new_environ); - } - child_error: - _exit(1); - } -#if !DISABLE_VFORK - } -#define ENOUGH_BYTES (44) - else { /* Use vfork() */ - char **cs_argv= erts_alloc(ERTS_ALC_T_TMP,(CS_ARGV_NO_OF_ARGS + 1)* - sizeof(char *)); - char fd_close_range[ENOUGH_BYTES]; /* 44 bytes are enough to */ - char dup2_op[CS_ARGV_NO_OF_DUP2_OPS][ENOUGH_BYTES]; /* hold any "%d:%d" string */ - /* on a 64-bit machine. */ - - /* Setup argv[] for the child setup program (implemented in - erl_child_setup.c) */ - i = 0; - if (opts->use_stdio) { - if (opts->read_write & DO_READ){ - /* stdout for process */ - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 1); - if(opts->redir_stderr) - /* stderr for process */ - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 2); - } - if (opts->read_write & DO_WRITE) - /* stdin for process */ - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ofd[0], 0); - } else { /* XXX will fail if ofd[0] == 4 (unlikely..) */ - if (opts->read_write & DO_READ) - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ifd[1], 4); - if (opts->read_write & DO_WRITE) - erts_snprintf(&dup2_op[i++][0], ENOUGH_BYTES, "%d:%d", ofd[0], 3); - } - for (; i < CS_ARGV_NO_OF_DUP2_OPS; i++) - strcpy(&dup2_op[i][0], "-"); - erts_snprintf(fd_close_range, ENOUGH_BYTES, "%d:%d", opts->use_stdio ? 3 : 5, max_files-1); - - cs_argv[CS_ARGV_PROGNAME_IX] = child_setup_prog; - cs_argv[CS_ARGV_WD_IX] = opts->wd ? opts->wd : "."; - cs_argv[CS_ARGV_UNBIND_IX] = erts_sched_bind_atvfork_child(unbind); - cs_argv[CS_ARGV_FD_CR_IX] = fd_close_range; - for (i = 0; i < CS_ARGV_NO_OF_DUP2_OPS; i++) - cs_argv[CS_ARGV_DUP2_OP_IX(i)] = &dup2_op[i][0]; - - if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { - int num = 0; - int j = 0; - if (opts->argv != NULL) { - for(; opts->argv[num] != NULL; ++num) - ; - } - cs_argv = erts_realloc(ERTS_ALC_T_TMP,cs_argv, (CS_ARGV_NO_OF_ARGS + 1 + num + 1) * sizeof(char *)); - cs_argv[CS_ARGV_CMD_IX] = "-"; - cs_argv[CS_ARGV_NO_OF_ARGS] = cmd_line; - if (opts->argv != NULL) { - for (;opts->argv[j] != NULL; ++j) { - if (opts->argv[j] == erts_default_arg0) { - cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = cmd_line; - } else { - cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = opts->argv[j]; - } - } - } - cs_argv[CS_ARGV_NO_OF_ARGS + 1 + j] = NULL; - } else { - cs_argv[CS_ARGV_CMD_IX] = cmd_line; /* Command */ - cs_argv[CS_ARGV_NO_OF_ARGS] = NULL; - } - DEBUGF(("Using vfork\n")); - pid = vfork(); - - if (pid == 0) { - /* The child! */ - - /* Observe! - * OTP-4389: The child setup program (implemented in - * erl_child_setup.c) will perform the necessary setup of the - * child before it execs to the user program. This because - * vfork() only allow an *immediate* execve() or _exit() in the - * child. - */ - execve(child_setup_prog, cs_argv, new_environ); - _exit(1); - } - erts_free(ERTS_ALC_T_TMP,cs_argv); - } -#undef ENOUGH_BYTES -#endif - - erts_sched_bind_atfork_parent(unbind); - - if (pid == -1) { - saved_errno = errno; - CHLD_STAT_UNLOCK; - erts_smp_rwmtx_runlock(&environ_rwmtx); - erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); - unblock_signals(); - close_pipes(ifd, ofd, opts->read_write); - errno = saved_errno; - return ERL_DRV_ERROR_ERRNO; - } -#else /* QNX */ - if (opts->use_stdio) { - if (opts->read_write & DO_READ) - qnx_spawn_options.iov[1] = ifd[1]; /* stdout for process */ - if (opts->read_write & DO_WRITE) - qnx_spawn_options.iov[0] = ofd[0]; /* stdin for process */ - } - else { - if (opts->read_write & DO_READ) - qnx_spawn_options.iov[4] = ifd[1]; - if (opts->read_write & DO_WRITE) - qnx_spawn_options.iov[3] = ofd[0]; - } - /* Close fds on exec */ - for (i = 3; i < max_files; i++) - fcntl(i, F_SETFD, 1); - - qnx_spawn_options.flags = _SPAWN_SETSID; - if ((pid = spawnl(P_NOWAIT, "/bin/sh", "/bin/sh", "-c", cmd_line, - (char *) 0)) < 0) { - erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); - reset_qnx_spawn(); - erts_smp_rwmtx_runlock(&environ_rwmtx); - close_pipes(ifd, ofd, opts->read_write); - return ERL_DRV_ERROR_GENERAL; - } - reset_qnx_spawn(); -#endif /* QNX */ - - erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); - - if (new_environ != environ) - erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); - - if (opts->read_write & DO_READ) - (void) close(ifd[1]); - if (opts->read_write & DO_WRITE) - (void) close(ofd[0]); - - if (opts->read_write & DO_READ) { - SET_NONBLOCKING(ifd[0]); - init_fd_data(ifd[0], port_num); - } - if (opts->read_write & DO_WRITE) { - SET_NONBLOCKING(ofd[1]); - init_fd_data(ofd[1], port_num); - } - - res = set_driver_data(port_num, ifd[0], ofd[1], opts->packet_bytes, - opts->read_write, opts->exit_status, pid); - /* Don't unblock SIGCHLD until now, since the call above must - first complete putting away the info about our new subprocess. */ - unblock_signals(); - -#if CHLDWTHR - ASSERT(children_alive >= 0); - - if (!(children_alive++)) - CHLD_STAT_SIGNAL; /* Wake up child waiter thread if no children - was alive before we fork()ed ... */ -#endif - /* Don't unlock chld_stat_mtx until now of the same reason as above */ - CHLD_STAT_UNLOCK; - - erts_smp_rwmtx_runlock(&environ_rwmtx); - - return (ErlDrvData)res; -#undef CMD_LINE_PREFIX_STR -#undef CMD_LINE_PREFIX_STR_SZ -} - -#ifdef QNX -static reset_qnx_spawn() -{ - int i; - - /* Reset qnx_spawn_options */ - qnx_spawn_options.flags = 0; - qnx_spawn_options.iov[0] = 0xff; - qnx_spawn_options.iov[1] = 0xff; - qnx_spawn_options.iov[2] = 0xff; - qnx_spawn_options.iov[3] = 0xff; -} -#endif - -#define FD_DEF_HEIGHT 24 -#define FD_DEF_WIDTH 80 -/* Control op */ -#define FD_CTRL_OP_GET_WINSIZE 100 - -static int fd_get_window_size(int fd, Uint32 *width, Uint32 *height) -{ -#ifdef TIOCGWINSZ - struct winsize ws; - if (ioctl(fd,TIOCGWINSZ,&ws) == 0) { - *width = (Uint32) ws.ws_col; - *height = (Uint32) ws.ws_row; - return 0; - } -#endif - return -1; -} - -static ErlDrvSSizeT fd_control(ErlDrvData drv_data, - unsigned int command, - char *buf, ErlDrvSizeT len, - char **rbuf, ErlDrvSizeT rlen) -{ - int fd = (int)(long)drv_data; - char resbuff[2*sizeof(Uint32)]; - switch (command) { - case FD_CTRL_OP_GET_WINSIZE: - { - Uint32 w,h; - if (fd_get_window_size(fd,&w,&h)) - return 0; - memcpy(resbuff,&w,sizeof(Uint32)); - memcpy(resbuff+sizeof(Uint32),&h,sizeof(Uint32)); - } - break; - default: - return 0; - } - if (rlen < 2*sizeof(Uint32)) { - *rbuf = driver_alloc(2*sizeof(Uint32)); - } - memcpy(*rbuf,resbuff,2*sizeof(Uint32)); - return 2*sizeof(Uint32); -} - -static ErlDrvData fd_start(ErlDrvPort port_num, char* name, - SysDriverOpts* opts) -{ - ErlDrvData res; - - if (((opts->read_write & DO_READ) && opts->ifd >= max_files) || - ((opts->read_write & DO_WRITE) && opts->ofd >= max_files)) - return ERL_DRV_ERROR_GENERAL; - - /* - * Historical: - * - * "Note about nonblocking I/O. - * - * At least on Solaris, setting the write end of a TTY to nonblocking, - * will set the input end to nonblocking as well (and vice-versa). - * If erl is run in a pipeline like this: cat | erl - * the input end of the TTY will be the standard input of cat. - * And cat is not prepared to handle nonblocking I/O." - * - * Actually, the reason for this is not that the tty itself gets set - * in non-blocking mode, but that the "input end" (cat's stdin) and - * the "output end" (erlang's stdout) are typically the "same" file - * descriptor, dup()'ed from a single fd by one of this process' - * ancestors. - * - * The workaround for this problem used to be a rather bad kludge, - * interposing an extra process ("internal cat") between erlang's - * stdout and the original stdout, allowing erlang to set its stdout - * in non-blocking mode without affecting the stdin of the preceding - * process in the pipeline - and being a kludge, it caused all kinds - * of weird problems. - * - * So, this is the current logic: - * - * The only reason to set non-blocking mode on the output fd at all is - * if it's something that can cause a write() to block, of course, - * i.e. primarily if it points to a tty, socket, pipe, or fifo. - * - * If we don't set non-blocking mode when we "should" have, and output - * becomes blocked, the entire runtime system will be suspended - this - * is normally bad of course, and can happen fairly "easily" - e.g. user - * hits ^S on tty - but doesn't necessarily happen. - * - * If we do set non-blocking mode when we "shouldn't" have, the runtime - * system will end up seeing EOF on the input fd (due to the preceding - * process dying), which typically will cause the entire runtime system - * to terminate immediately (due to whatever erlang process is seeing - * the EOF taking it as a signal to halt the system). This is *very* bad. - * - * I.e. we should take a conservative approach, and only set non- - * blocking mode when we a) need to, and b) are reasonably certain - * that it won't be a problem. And as in the example above, the problem - * occurs when input fd and output fd point to different "things". - * - * However, determining that they are not just the same "type" of - * "thing", but actually the same instance of that type of thing, is - * unreasonably complex in many/most cases. - * - * Also, with pipes, sockets, and fifos it's far from obvious that the - * user *wants* non-blocking output: If you're running erlang inside - * some complex pipeline, you're probably not running a real-time system - * that must never stop, but rather *want* it to suspend if the output - * channel is "full". - * - * So, the bottom line: We will only set the output fd non-blocking if - * it points to a tty, and either a) the input fd also points to a tty, - * or b) we can make sure that setting the output fd non-blocking - * doesn't interfere with someone else's input, via a somewhat milder - * kludge than the above. - * - * Also keep in mind that while this code is almost exclusively run as - * a result of an erlang open_port({fd,0,1}, ...), that isn't the only - * case - it can be called with any old pre-existing file descriptors, - * the relations between which (if they're even two) we can only guess - * at - still, we try our best... - */ - - if (opts->read_write & DO_READ) { - init_fd_data(opts->ifd, port_num); - } - if (opts->read_write & DO_WRITE) { - init_fd_data(opts->ofd, port_num); - - /* If we don't have a read end, all bets are off - no non-blocking. */ - if (opts->read_write & DO_READ) { - - if (isatty(opts->ofd)) { /* output fd is a tty:-) */ - - if (isatty(opts->ifd)) { /* input fd is also a tty */ - - /* To really do this "right", we should also check that - input and output fd point to the *same* tty - but - this seems like overkill; ttyname() isn't for free, - and this is a very common case - and it's hard to - imagine a scenario where setting non-blocking mode - here would cause problems - go ahead and do it. */ - - SET_NONBLOCKING(opts->ofd); - - } else { /* output fd is a tty, input fd isn't */ - - /* This is a "problem case", but also common (see the - example above) - i.e. it makes sense to try a bit - harder before giving up on non-blocking mode: Try to - re-open the tty that the output fd points to, and if - successful replace the original one with the "new" fd - obtained this way, and set *that* one in non-blocking - mode. (Yes, this is a kludge.) - - However, re-opening the tty may fail in a couple of - (unusual) cases: - - 1) The name of the tty (or an equivalent one, i.e. - same major/minor number) can't be found, because - it actually lives somewhere other than /dev (or - wherever ttyname() looks for it), and isn't - equivalent to any of those that do live in the - "standard" place - this should be *very* unusual. - - 2) Permissions on the tty don't allow us to open it - - it's perfectly possible to have an fd open to an - object whose permissions wouldn't allow us to open - it. This is not as unusual as it sounds, one case - is if the user has su'ed to someone else (not - root) - we have a read/write fd open to the tty - (because it has been inherited all the way down - here), but we have neither read nor write - permission for the tty. - - In these cases, we finally give up, and don't set the - output fd in non-blocking mode. */ - - char *tty; - int nfd; - - if ((tty = ttyname(opts->ofd)) != NULL && - (nfd = open(tty, O_WRONLY)) != -1) { - dup2(nfd, opts->ofd); - close(nfd); - SET_NONBLOCKING(opts->ofd); - } - } - } - } - } - CHLD_STAT_LOCK; - res = (ErlDrvData)(long)set_driver_data(port_num, opts->ifd, opts->ofd, - opts->packet_bytes, - opts->read_write, 0, -1); - CHLD_STAT_UNLOCK; - return res; -} - -static void clear_fd_data(int fd) -{ - if (fd_data[fd].sz > 0) { - erts_free(ERTS_ALC_T_FD_ENTRY_BUF, (void *) fd_data[fd].buf); - ASSERT(erts_smp_atomic_read_nob(&sys_misc_mem_sz) >= fd_data[fd].sz); - erts_smp_atomic_add_nob(&sys_misc_mem_sz, -1*fd_data[fd].sz); - } - fd_data[fd].buf = NULL; - fd_data[fd].sz = 0; - fd_data[fd].remain = 0; - fd_data[fd].cpos = NULL; - fd_data[fd].psz = 0; -} - -static void nbio_stop_fd(ErlDrvPort prt, int fd) -{ - driver_select(prt,fd,DO_READ|DO_WRITE,0); - clear_fd_data(fd); - SET_BLOCKING(fd); -} - -static void fd_stop(ErlDrvData fd) /* Does not close the fds */ -{ - int ofd; - - nbio_stop_fd(driver_data[(int)(long)fd].port_num, (int)(long)fd); - ofd = driver_data[(int)(long)fd].ofd; - if (ofd != (int)(long)fd && ofd != -1) - nbio_stop_fd(driver_data[(int)(long)fd].port_num, (int)(long)ofd); -} - -static ErlDrvData vanilla_start(ErlDrvPort port_num, char* name, - SysDriverOpts* opts) -{ - int flags, fd; - ErlDrvData res; - - flags = (opts->read_write == DO_READ ? O_RDONLY : - opts->read_write == DO_WRITE ? O_WRONLY|O_CREAT|O_TRUNC : - O_RDWR|O_CREAT); - if ((fd = open(name, flags, 0666)) < 0) - return ERL_DRV_ERROR_GENERAL; - if (fd >= max_files) { - close(fd); - return ERL_DRV_ERROR_GENERAL; - } - SET_NONBLOCKING(fd); - init_fd_data(fd, port_num); - - CHLD_STAT_LOCK; - res = (ErlDrvData)(long)set_driver_data(port_num, fd, fd, - opts->packet_bytes, - opts->read_write, 0, -1); - CHLD_STAT_UNLOCK; - return res; -} - -/* Note that driver_data[fd].ifd == fd if the port was opened for reading, */ -/* otherwise (i.e. write only) driver_data[fd].ofd = fd. */ - -static void stop(ErlDrvData fd) -{ - ErlDrvPort prt; - int ofd; - - prt = driver_data[(int)(long)fd].port_num; - nbio_stop_fd(prt, (int)(long)fd); - - ofd = driver_data[(int)(long)fd].ofd; - if (ofd != (int)(long)fd && (int)(long)ofd != -1) - nbio_stop_fd(prt, ofd); - else - ofd = -1; - - CHLD_STAT_LOCK; - - /* Mark as unused. */ - driver_data[(int)(long)fd].pid = -1; - - CHLD_STAT_UNLOCK; - - /* SMP note: Close has to be last thing done (open file descriptors work - as locks on driver_data[] entries) */ - driver_select(prt, (int)(long)fd, ERL_DRV_USE, 0); /* close(fd); */ - if (ofd >= 0) { - driver_select(prt, (int)(long)ofd, ERL_DRV_USE, 0); /* close(ofd); */ - } -} - -static void outputv(ErlDrvData e, ErlIOVec* ev) -{ - int fd = (int)(long)e; - ErlDrvPort ix = driver_data[fd].port_num; - int pb = driver_data[fd].packet_bytes; - int ofd = driver_data[fd].ofd; - ssize_t n; - ErlDrvSizeT sz; - char lb[4]; - char* lbp; - ErlDrvSizeT len = ev->size; - - /* (len > ((unsigned long)-1 >> (4-pb)*8)) */ - /* if (pb >= 0 && (len & (((ErlDrvSizeT)1 << (pb*8))) - 1) != len) {*/ - if (((pb == 2) && (len > 0xffff)) || (pb == 1 && len > 0xff)) { - driver_failure_posix(ix, EINVAL); - return; /* -1; */ - } - /* Handles 0 <= pb <= 4 only */ - put_int32((Uint32) len, lb); - lbp = lb + (4-pb); - - ev->iov[0].iov_base = lbp; - ev->iov[0].iov_len = pb; - ev->size += pb; - if ((sz = driver_sizeq(ix)) > 0) { - driver_enqv(ix, ev, 0); - if (sz + ev->size >= (1 << 13)) - set_busy_port(ix, 1); - } - else { - int vsize = ev->vsize > MAX_VSIZE ? MAX_VSIZE : ev->vsize; - - n = writev(ofd, (const void *) (ev->iov), vsize); - if (n == ev->size) - return; /* 0;*/ - if (n < 0) { - if ((errno != EINTR) && (errno != ERRNO_BLOCK)) { - driver_failure_posix(ix, errno); - return; /* -1;*/ - } - n = 0; - } - driver_enqv(ix, ev, n); /* n is the skip value */ - driver_select(ix, ofd, ERL_DRV_WRITE|ERL_DRV_USE, 1); - } - /* return 0;*/ -} - - -static void output(ErlDrvData e, char* buf, ErlDrvSizeT len) -{ - int fd = (int)(long)e; - ErlDrvPort ix = driver_data[fd].port_num; - int pb = driver_data[fd].packet_bytes; - int ofd = driver_data[fd].ofd; - ssize_t n; - ErlDrvSizeT sz; - char lb[4]; - char* lbp; - struct iovec iv[2]; - - /* (len > ((unsigned long)-1 >> (4-pb)*8)) */ - if (((pb == 2) && (len > 0xffff)) || (pb == 1 && len > 0xff)) { - driver_failure_posix(ix, EINVAL); - return; /* -1; */ - } - put_int32(len, lb); - lbp = lb + (4-pb); - - if ((sz = driver_sizeq(ix)) > 0) { - driver_enq(ix, lbp, pb); - driver_enq(ix, buf, len); - if (sz + len + pb >= (1 << 13)) - set_busy_port(ix, 1); - } - else { - iv[0].iov_base = lbp; - iv[0].iov_len = pb; /* should work for pb=0 */ - iv[1].iov_base = buf; - iv[1].iov_len = len; - n = writev(ofd, iv, 2); - if (n == pb+len) - return; /* 0; */ - if (n < 0) { - if ((errno != EINTR) && (errno != ERRNO_BLOCK)) { - driver_failure_posix(ix, errno); - return; /* -1; */ - } - n = 0; - } - if (n < pb) { - driver_enq(ix, lbp+n, pb-n); - driver_enq(ix, buf, len); - } - else { - n -= pb; - driver_enq(ix, buf+n, len-n); - } - driver_select(ix, ofd, ERL_DRV_WRITE|ERL_DRV_USE, 1); - } - return; /* 0; */ -} - -static int port_inp_failure(ErlDrvPort port_num, int ready_fd, int res) - /* Result: 0 (eof) or -1 (error) */ -{ - int err = errno; - - ASSERT(res <= 0); - (void) driver_select(port_num, ready_fd, ERL_DRV_READ|ERL_DRV_WRITE, 0); - clear_fd_data(ready_fd); - if (res == 0) { - if (driver_data[ready_fd].report_exit) { - CHLD_STAT_LOCK; - - if (driver_data[ready_fd].alive) { - /* - * We have eof and want to report exit status, but the process - * hasn't exited yet. When it does report_exit_status() will - * driver_select() this fd which will make sure that we get - * back here with driver_data[ready_fd].alive == 0 and - * driver_data[ready_fd].status set. - */ - CHLD_STAT_UNLOCK; - return 0; - } - else { - int status = driver_data[ready_fd].status; - CHLD_STAT_UNLOCK; - - /* We need not be prepared for stopped/continued processes. */ - if (WIFSIGNALED(status)) - status = 128 + WTERMSIG(status); - else - status = WEXITSTATUS(status); - - driver_report_exit(driver_data[ready_fd].port_num, status); - } - } - driver_failure_eof(port_num); - } else { - driver_failure_posix(port_num, err); - } - return 0; -} - -/* fd is the drv_data that is returned from the */ -/* initial start routine */ -/* ready_fd is the descriptor that is ready to read */ - -static void ready_input(ErlDrvData e, ErlDrvEvent ready_fd) -{ - int fd = (int)(long)e; - ErlDrvPort port_num; - int packet_bytes; - int res; - Uint h; - - port_num = driver_data[fd].port_num; - packet_bytes = driver_data[fd].packet_bytes; - - if (packet_bytes == 0) { - byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF, - ERTS_SYS_READ_BUF_SZ); - res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ); - if (res < 0) { - if ((errno != EINTR) && (errno != ERRNO_BLOCK)) - port_inp_failure(port_num, ready_fd, res); - } - else if (res == 0) - port_inp_failure(port_num, ready_fd, res); - else - driver_output(port_num, (char*) read_buf, res); - erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf); - } - else if (fd_data[ready_fd].remain > 0) { /* We try to read the remainder */ - /* space is allocated in buf */ - res = read(ready_fd, fd_data[ready_fd].cpos, - fd_data[ready_fd].remain); - if (res < 0) { - if ((errno != EINTR) && (errno != ERRNO_BLOCK)) - port_inp_failure(port_num, ready_fd, res); - } - else if (res == 0) { - port_inp_failure(port_num, ready_fd, res); - } - else if (res == fd_data[ready_fd].remain) { /* we're done */ - driver_output(port_num, fd_data[ready_fd].buf, - fd_data[ready_fd].sz); - clear_fd_data(ready_fd); - } - else { /* if (res < fd_data[ready_fd].remain) */ - fd_data[ready_fd].cpos += res; - fd_data[ready_fd].remain -= res; - } - } - else if (fd_data[ready_fd].remain == 0) { /* clean fd */ - byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF, - ERTS_SYS_READ_BUF_SZ); - /* We make one read attempt and see what happens */ - res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ); - if (res < 0) { - if ((errno != EINTR) && (errno != ERRNO_BLOCK)) - port_inp_failure(port_num, ready_fd, res); - } - else if (res == 0) { /* eof */ - port_inp_failure(port_num, ready_fd, res); - } - else if (res < packet_bytes - fd_data[ready_fd].psz) { - memcpy(fd_data[ready_fd].pbuf+fd_data[ready_fd].psz, - read_buf, res); - fd_data[ready_fd].psz += res; - } - else { /* if (res >= packet_bytes) */ - unsigned char* cpos = read_buf; - int bytes_left = res; - - while (1) { - int psz = fd_data[ready_fd].psz; - char* pbp = fd_data[ready_fd].pbuf + psz; - - while(bytes_left && (psz < packet_bytes)) { - *pbp++ = *cpos++; - bytes_left--; - psz++; - } - - if (psz < packet_bytes) { - fd_data[ready_fd].psz = psz; - break; - } - fd_data[ready_fd].psz = 0; - - switch (packet_bytes) { - case 1: h = get_int8(fd_data[ready_fd].pbuf); break; - case 2: h = get_int16(fd_data[ready_fd].pbuf); break; - case 4: h = get_int32(fd_data[ready_fd].pbuf); break; - default: ASSERT(0); return; /* -1; */ - } - - if (h <= (bytes_left)) { - driver_output(port_num, (char*) cpos, h); - cpos += h; - bytes_left -= h; - continue; - } - else { /* The last message we got was split */ - char *buf = erts_alloc_fnf(ERTS_ALC_T_FD_ENTRY_BUF, h); - if (!buf) { - errno = ENOMEM; - port_inp_failure(port_num, ready_fd, -1); - } - else { - erts_smp_atomic_add_nob(&sys_misc_mem_sz, h); - sys_memcpy(buf, cpos, bytes_left); - fd_data[ready_fd].buf = buf; - fd_data[ready_fd].sz = h; - fd_data[ready_fd].remain = h - bytes_left; - fd_data[ready_fd].cpos = buf + bytes_left; - } - break; - } - } - } - erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf); - } -} - - -/* fd is the drv_data that is returned from the */ -/* initial start routine */ -/* ready_fd is the descriptor that is ready to read */ - -static void ready_output(ErlDrvData e, ErlDrvEvent ready_fd) -{ - int fd = (int)(long)e; - ErlDrvPort ix = driver_data[fd].port_num; - int n; - struct iovec* iv; - int vsize; - - - if ((iv = (struct iovec*) driver_peekq(ix, &vsize)) == NULL) { - driver_select(ix, ready_fd, ERL_DRV_WRITE, 0); - return; /* 0; */ - } - vsize = vsize > MAX_VSIZE ? MAX_VSIZE : vsize; - if ((n = writev(ready_fd, iv, vsize)) > 0) { - if (driver_deq(ix, n) == 0) - set_busy_port(ix, 0); - } - else if (n < 0) { - if (errno == ERRNO_BLOCK || errno == EINTR) - return; /* 0; */ - else { - int res = errno; - driver_select(ix, ready_fd, ERL_DRV_WRITE, 0); - driver_failure_posix(ix, res); - return; /* -1; */ - } - } - return; /* 0; */ -} - -static void stop_select(ErlDrvEvent fd, void* _) -{ - close((int)fd); -} - - void erts_do_break_handling(void) { struct termios temp_mode; @@ -2489,13 +1004,19 @@ erts_sys_getenv(char *key, char *value, size_t *size) return res; } +int +erts_sys_unsetenv(char *key) +{ + int res; + erts_smp_rwmtx_rwlock(&environ_rwmtx); + res = unsetenv(key); + erts_smp_rwmtx_rwunlock(&environ_rwmtx); + return res; +} + void sys_init_io(void) { - fd_data = (struct fd_data *) - erts_alloc(ERTS_ALC_T_FD_TAB, max_files * sizeof(struct fd_data)); - erts_smp_atomic_add_nob(&sys_misc_mem_sz, - max_files * sizeof(struct fd_data)); } #if (0) /* unused? */ @@ -2524,6 +1045,52 @@ void erts_sys_alloc_init(void) { } +#if ERTS_HAVE_ERTS_SYS_ALIGNED_ALLOC +void *erts_sys_aligned_alloc(UWord alignment, UWord size) +{ +#ifdef HAVE_POSIX_MEMALIGN + void *ptr = NULL; + int error; + ASSERT(alignment && (alignment & (alignment-1)) == 0); /* power of 2 */ + error = posix_memalign(&ptr, (size_t) alignment, (size_t) size); +#if HAVE_ERTS_MSEG + if (error || !ptr) { + erts_mseg_clear_cache(); + error = posix_memalign(&ptr, (size_t) alignment, (size_t) size); + } +#endif + if (error) { + errno = error; + return NULL; + } + if (!ptr) + errno = ENOMEM; + ASSERT(!ptr || (((UWord) ptr) & (alignment - 1)) == 0); + return ptr; +#else +# error "Missing erts_sys_aligned_alloc() implementation" +#endif +} + +void erts_sys_aligned_free(UWord alignment, void *ptr) +{ + ASSERT(alignment && (alignment & (alignment-1)) == 0); /* power of 2 */ + free(ptr); +} + +void *erts_sys_aligned_realloc(UWord alignment, void *ptr, UWord size, UWord old_size) +{ + void *new_ptr = erts_sys_aligned_alloc(alignment, size); + if (new_ptr) { + UWord copy_size = old_size < size ? old_size : size; + sys_memcpy(new_ptr, ptr, (size_t) copy_size); + erts_sys_aligned_free(alignment, ptr); + } + return new_ptr; +} + +#endif + void *erts_sys_alloc(ErtsAlcType_t t, void *x, Uint sz) { void *res = malloc((size_t) sz); @@ -2574,33 +1141,43 @@ void sys_preload_end(Preload* p) /* Nothing */ } -/* Read a key from console (?) */ - +/* Read a key from console, used by break.c + Here we assume that all schedulers are stopped so that erl_poll + does not interfere with the select below. +*/ int sys_get_key(fd) int fd; { - int c; + int c, ret; unsigned char rbuf[64]; + fd_set fds; fflush(stdout); /* Flush query ??? */ - if ((c = read(fd,rbuf,64)) <= 0) { - return c; + FD_ZERO(&fds); + FD_SET(fd,&fds); + + ret = select(fd+1, &fds, NULL, NULL, NULL); + + if (ret == 1) { + do { + c = read(fd,rbuf,64); + } while (c < 0 && errno == EAGAIN); + if (c <= 0) + return c; } return rbuf[0]; } -#ifdef DEBUG - extern int erts_initialized; void -erl_assert_error(char* expr, char* file, int line) +erl_assert_error(const char* expr, const char* func, const char* file, int line) { fflush(stdout); - fprintf(stderr, "Assertion failed: %s in %s, line %d\n", - expr, file, line); + fprintf(stderr, "%s:%d:%s() Assertion failed: %s\n", + file, line, func, expr); fflush(stderr); #if !defined(ERTS_SMP) && 0 /* Writing a crashdump from a failed assertion when smp support @@ -2615,6 +1192,8 @@ erl_assert_error(char* expr, char* file, int line) abort(); } +#ifdef DEBUG + void erl_debug(char* fmt, ...) { @@ -2631,179 +1210,6 @@ erl_debug(char* fmt, ...) #endif /* DEBUG */ -static ERTS_INLINE void -report_exit_status(ErtsSysReportExit *rep, int status) -{ - Port *pp; -#ifdef ERTS_SMP - CHLD_STAT_UNLOCK; - pp = erts_thr_id2port_sflgs(rep->port, - ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP); - CHLD_STAT_LOCK; -#else - pp = erts_id2port_sflgs(rep->port, - NULL, - 0, - ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP); -#endif - if (pp) { - if (rep->ifd >= 0) { - driver_data[rep->ifd].alive = 0; - driver_data[rep->ifd].status = status; - (void) driver_select(ERTS_Port2ErlDrvPort(pp), - rep->ifd, - (ERL_DRV_READ|ERL_DRV_USE), - 1); - } - if (rep->ofd >= 0) { - driver_data[rep->ofd].alive = 0; - driver_data[rep->ofd].status = status; - (void) driver_select(ERTS_Port2ErlDrvPort(pp), - rep->ofd, - (ERL_DRV_WRITE|ERL_DRV_USE), - 1); - } -#ifdef ERTS_SMP - erts_thr_port_release(pp); -#else - erts_port_release(pp); -#endif - } - erts_free(ERTS_ALC_T_PRT_REP_EXIT, rep); -} - -#if !CHLDWTHR /* ---------------------------------------------------------- */ - -#define ERTS_REPORT_EXIT_STATUS report_exit_status - -static int check_children(void) -{ - int res = 0; - int pid; - int status; - -#ifndef ERTS_SMP - if (children_died) -#endif - { - sys_sigblock(SIGCHLD); - CHLD_STAT_LOCK; - while ((pid = waitpid(-1, &status, WNOHANG)) > 0) - note_child_death(pid, status); -#ifndef ERTS_SMP - children_died = 0; -#endif - CHLD_STAT_UNLOCK; - sys_sigrelease(SIGCHLD); - res = 1; - } - return res; -} - -#ifdef ERTS_SMP - -void -erts_check_children(void) -{ - (void) check_children(); -} - -#endif - -#elif CHLDWTHR && defined(ERTS_SMP) /* ------------------------------------- */ - -#define ERTS_REPORT_EXIT_STATUS report_exit_status - -#define check_children() (0) - - -#else /* CHLDWTHR && !defined(ERTS_SMP) ------------------------------------ */ - -#define ERTS_REPORT_EXIT_STATUS initiate_report_exit_status - -static ERTS_INLINE void -initiate_report_exit_status(ErtsSysReportExit *rep, int status) -{ - rep->next = report_exit_transit_list; - rep->status = status; - report_exit_transit_list = rep; - erts_sys_schedule_interrupt(1); -} - -static int check_children(void) -{ - int res; - ErtsSysReportExit *rep; - CHLD_STAT_LOCK; - rep = report_exit_transit_list; - res = rep != NULL; - while (rep) { - ErtsSysReportExit *curr_rep = rep; - rep = rep->next; - report_exit_status(curr_rep, curr_rep->status); - } - report_exit_transit_list = NULL; - CHLD_STAT_UNLOCK; - return res; -} - -#endif /* ------------------------------------------------------------------ */ - -static void note_child_death(int pid, int status) -{ - ErtsSysReportExit **repp = &report_exit_list; - ErtsSysReportExit *rep = report_exit_list; - - while (rep) { - if (pid == rep->pid) { - *repp = rep->next; - ERTS_REPORT_EXIT_STATUS(rep, status); - break; - } - repp = &rep->next; - rep = rep->next; - } -} - -#if CHLDWTHR - -static void * -child_waiter(void *unused) -{ - int pid; - int status; - -#ifdef ERTS_ENABLE_LOCK_CHECK - erts_lc_set_thread_name("child waiter"); -#endif - - while(1) { -#ifdef DEBUG - int waitpid_errno; -#endif - pid = waitpid(-1, &status, 0); -#ifdef DEBUG - waitpid_errno = errno; -#endif - CHLD_STAT_LOCK; - if (pid < 0) { - ASSERT(waitpid_errno == ECHILD); - } - else { - children_alive--; - ASSERT(children_alive >= 0); - note_child_death(pid, status); - } - while (!children_alive) - CHLD_STAT_WAIT; /* Wait for children to wait on... :) */ - CHLD_STAT_UNLOCK; - } - - return NULL; -} - -#endif - /* * Called from schedule() when it runs out of runnable processes, * or when Erlang code has performed INPUT_REDUCTIONS reduction @@ -2812,13 +1218,8 @@ child_waiter(void *unused) void erl_sys_schedule(int runnable) { -#ifdef ERTS_SMP ERTS_CHK_IO(!runnable); -#else - ERTS_CHK_IO(runnable ? 0 : !check_children()); -#endif ERTS_SMP_LC_ASSERT(!erts_thr_progress_is_blocking()); - (void) check_children(); } @@ -2846,10 +1247,6 @@ smp_sig_notify(char c) static void * signal_dispatcher_thread_func(void *unused) { -#if !CHLDWTHR - int initialized = 0; - int notify_check_children = 0; -#endif #ifdef ERTS_ENABLE_LOCK_CHECK erts_lc_set_thread_name("signal_dispatcher"); #endif @@ -2861,7 +1258,7 @@ signal_dispatcher_thread_func(void *unused) if (res < 0) { if (errno == EINTR) continue; - erl_exit(ERTS_ABORT_EXIT, + erts_exit(ERTS_ABORT_EXIT, "signal-dispatcher thread got unexpected error: %s (%d)\n", erl_errno_id(errno), errno); @@ -2887,19 +1284,7 @@ signal_dispatcher_thread_func(void *unused) */ switch (buf[i]) { case 0: /* Emulator initialized */ -#if !CHLDWTHR - initialized = 1; - if (!notify_check_children) -#endif - break; -#if !CHLDWTHR - case 'C': /* SIGCHLD */ - if (initialized) - erts_smp_notify_check_children_needed(); - else - notify_check_children = 1; - break; -#endif + break; case 'I': /* SIGINT */ break_requested(); break; @@ -2909,15 +1294,8 @@ signal_dispatcher_thread_func(void *unused) case '1': /* SIGUSR1 */ sigusr1_exit(); break; -#ifdef QUANTIFY - case '2': /* SIGUSR2 */ - quantify_save_data(); /* Might take a substantial amount of - time, but this is a test/debug - build */ - break; -#endif default: - erl_exit(ERTS_ABORT_EXIT, + erts_exit(ERTS_ABORT_EXIT, "signal-dispatcher thread received unknown " "signal notification: '%c'\n", buf[i]); @@ -2933,9 +1311,10 @@ init_smp_sig_notify(void) { erts_smp_thr_opts_t thr_opts = ERTS_SMP_THR_OPTS_DEFAULT_INITER; thr_opts.detached = 1; + thr_opts.name = "sys_sig_dispatcher"; if (pipe(sig_notify_fds) < 0) { - erl_exit(ERTS_ABORT_EXIT, + erts_exit(ERTS_ABORT_EXIT, "Failed to create signal-dispatcher pipe: %s (%d)\n", erl_errno_id(errno), errno); @@ -2947,6 +1326,19 @@ init_smp_sig_notify(void) NULL, &thr_opts); } + +static void +init_smp_sig_suspend(void) { +#ifdef ERTS_SYS_SUSPEND_SIGNAL + if (pipe(sig_suspend_fds) < 0) { + erts_exit(ERTS_ABORT_EXIT, + "Failed to create sig_suspend pipe: %s (%d)\n", + erl_errno_id(errno), + errno); + } +#endif +} + #ifdef __DARWIN__ int erts_darwin_main_thread_pipe[2]; @@ -2956,7 +1348,7 @@ static void initialize_darwin_main_thread_pipes(void) { if (pipe(erts_darwin_main_thread_pipe) < 0 || pipe(erts_darwin_main_thread_result_pipe) < 0) { - erl_exit(1,"Fatal error initializing Darwin main thread stealing"); + erts_exit(ERTS_ERROR_EXIT,"Fatal error initializing Darwin main thread stealing"); } } @@ -2974,9 +1366,11 @@ erts_sys_main_thread(void) #endif smp_sig_notify(0); /* Notify initialized */ - while (1) { - /* Wait for a signal to arrive... */ + + /* Wait for a signal to arrive... */ + #ifdef __DARWIN__ + while (1) { /* * The wx driver needs to be able to steal the main thread for Cocoa to * work properly. @@ -2991,12 +1385,24 @@ erts_sys_main_thread(void) void* (*func)(void*); void* arg; void *resp; - read(erts_darwin_main_thread_pipe[0],&func,sizeof(void* (*)(void*))); - read(erts_darwin_main_thread_pipe[0],&arg, sizeof(void*)); + res = read(erts_darwin_main_thread_pipe[0],&func,sizeof(void* (*)(void*))); + if (res != sizeof(void* (*)(void*))) + break; + res = read(erts_darwin_main_thread_pipe[0],&arg,sizeof(void*)); + if (res != sizeof(void*)) + break; resp = (*func)(arg); write(erts_darwin_main_thread_result_pipe[1],&resp,sizeof(void *)); } -#else + + if (res == -1 && errno != EINTR) + break; + } + /* Something broke with the main thread pipe, so we ignore it for now. + Most probably erts has closed this pipe and is about to exit. */ +#endif /* #ifdef __DARWIN__ */ + + while (1) { #ifdef DEBUG int res = #else @@ -3005,7 +1411,6 @@ erts_sys_main_thread(void) select(0, NULL, NULL, NULL, NULL); ASSERT(res < 0); ASSERT(errno == EINTR); -#endif } } @@ -3097,6 +1502,7 @@ erl_sys_args(int* argc, char** argv) #ifdef ERTS_SMP init_smp_sig_notify(); + init_smp_sig_suspend(); #endif /* Handled arguments have been marked with NULL. Slide arguments diff --git a/erts/emulator/sys/unix/sys_drivers.c b/erts/emulator/sys/unix/sys_drivers.c new file mode 100644 index 0000000000..812112fb91 --- /dev/null +++ b/erts/emulator/sys/unix/sys_drivers.c @@ -0,0 +1,1862 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 1996-2016. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#ifdef ISC32 +#define _POSIX_SOURCE +#define _XOPEN_SOURCE +#endif + +#include <sys/times.h> /* ! */ +#include <time.h> +#include <signal.h> +#include <sys/wait.h> +#include <sys/uio.h> +#include <termios.h> +#include <ctype.h> +#include <sys/utsname.h> +#include <sys/select.h> +#include <arpa/inet.h> + +#ifdef ISC32 +#include <sys/bsdtypes.h> +#endif + +#include <termios.h> +#ifdef HAVE_FCNTL_H +#include <fcntl.h> +#endif +#ifdef HAVE_SYS_IOCTL_H +#include <sys/ioctl.h> +#endif + +#define WANT_NONBLOCKING /* must define this to pull in defs from sys.h */ +#include "sys.h" + +#ifdef USE_THREADS +#include "erl_threads.h" +#endif + +extern char **environ; +extern erts_smp_rwmtx_t environ_rwmtx; + +extern erts_smp_atomic_t sys_misc_mem_sz; + +static Eterm forker_port; + +#define MAX_VSIZE 16 /* Max number of entries allowed in an I/O + * vector sock_sendv(). + */ +/* + * Don't need global.h, but erl_cpu_topology.h won't compile otherwise + */ +#include "global.h" +#include "erl_cpu_topology.h" + +#include "erl_sys_driver.h" +#include "sys_uds.h" + +#include "erl_child_setup.h" + +#if defined IOV_MAX +#define MAXIOV IOV_MAX +#elif defined UIO_MAXIOV +#define MAXIOV UIO_MAXIOV +#else +#define MAXIOV 16 +#endif + +#ifdef USE_THREADS +# define FDBLOCK 1 +#else +# define FDBLOCK 0 +#endif + +/* Used by the fd driver iff the fd could not be set to non-blocking */ +typedef struct ErtsSysBlocking_ { + ErlDrvPDL pdl; + int res; + int err; + unsigned int pkey; +} ErtsSysBlocking; + +typedef struct fd_data { + int fd; + char pbuf[4]; /* hold partial packet bytes */ + int psz; /* size of pbuf */ + char *buf; + char *cpos; + int sz; + int remain; /* for input on fd */ +} ErtsSysFdData; + +typedef struct driver_data { + ErlDrvPort port_num; + ErtsSysFdData *ofd; + ErtsSysFdData *ifd; + int packet_bytes; + int pid; + int alive; + int status; + int terminating; + ErtsSysBlocking *blocking; +} ErtsSysDriverData; + +#define DIR_SEPARATOR_CHAR '/' + +#if defined(__ANDROID__) +#define SHELL "/system/bin/sh" +#else +#define SHELL "/bin/sh" +#endif /* __ANDROID__ */ + +#if defined(DEBUG) +#define ERL_BUILD_TYPE_MARKER ".debug" +#elif defined(PURIFY) +#define ERL_BUILD_TYPE_MARKER ".purify" +#elif defined(QUANTIFY) +#define ERL_BUILD_TYPE_MARKER ".quantify" +#elif defined(PURECOV) +#define ERL_BUILD_TYPE_MARKER ".purecov" +#elif defined(VALGRIND) +#define ERL_BUILD_TYPE_MARKER ".valgrind" +#else /* opt */ +#define ERL_BUILD_TYPE_MARKER +#endif + +#ifdef DEBUG +#define close(fd) do { int res = close(fd); ASSERT(res > -1); } while(0) +#endif + +#define CHILD_SETUP_PROG_NAME "erl_child_setup" ERL_BUILD_TYPE_MARKER + +// #define HARD_DEBUG +#ifdef HARD_DEBUG +#define driver_select(port_num, fd, flags, onoff) \ + do { \ + if (((flags) & ERL_DRV_READ) && onoff) \ + fprintf(stderr,"%010d %p: read select %d\r\n", __LINE__, port_num, (int)fd); \ + if (((flags) & ERL_DRV_WRITE) && onoff) \ + fprintf(stderr,"%010d %p: writ select %d\r\n", __LINE__, port_num, (int)fd); \ + if (((flags) & ERL_DRV_READ) && !onoff) \ + fprintf(stderr,"%010d %p: read unsele %d\r\n", __LINE__, port_num, (int)fd); \ + if (((flags) & ERL_DRV_WRITE) && !onoff) \ + fprintf(stderr,"%010d %p: writ unsele %d\r\n", __LINE__, port_num, (int)fd); \ + driver_select_nkp(port_num, fd, flags, onoff); \ + } while(0) +#endif + +/* + * Decreasing the size of it below 16384 is not allowed. + */ + +#define ERTS_SYS_READ_BUF_SZ (64*1024) + +/* I. Initialization */ + +void +erl_sys_late_init(void) +{ + SysDriverOpts opts; +#ifdef ERTS_SMP + Port *port; +#endif + + sys_signal(SIGPIPE, SIG_IGN); /* Ignore - we'll handle the write failure */ + + opts.packet_bytes = 0; + opts.use_stdio = 1; + opts.redir_stderr = 0; + opts.read_write = 0; + opts.hide_window = 0; + opts.wd = NULL; + opts.envir = NULL; + opts.exit_status = 0; + opts.overlapped_io = 0; + opts.spawn_type = ERTS_SPAWN_ANY; + opts.argv = NULL; + opts.parallelism = erts_port_parallelism; + +#ifdef ERTS_SMP + port = +#endif + erts_open_driver(&forker_driver, make_internal_pid(0), "forker", &opts, NULL, NULL); +#ifdef ERTS_SMP + erts_mtx_unlock(port->lock); +#endif +} + +/* II. Prototypes */ + +/* II.I Spawn prototypes */ +static ErlDrvData spawn_start(ErlDrvPort, char*, SysDriverOpts*); +static ErlDrvSSizeT spawn_control(ErlDrvData, unsigned int, char *, + ErlDrvSizeT, char **, ErlDrvSizeT); + +/* II.II Vanilla prototypes */ +static ErlDrvData vanilla_start(ErlDrvPort, char*, SysDriverOpts*); + + +/* II.III FD prototypes */ +static ErlDrvData fd_start(ErlDrvPort, char*, SysDriverOpts*); +#if FDBLOCK +static void fd_async(void *); +static void fd_ready_async(ErlDrvData drv_data, ErlDrvThreadData thread_data); +#endif +static ErlDrvSSizeT fd_control(ErlDrvData, unsigned int, char *, ErlDrvSizeT, + char **, ErlDrvSizeT); +static void fd_stop(ErlDrvData); +static void fd_flush(ErlDrvData); + +/* II.IV Common prototypes */ +static void stop(ErlDrvData); +static void ready_input(ErlDrvData, ErlDrvEvent); +static void ready_output(ErlDrvData, ErlDrvEvent); +static void output(ErlDrvData, char*, ErlDrvSizeT); +static void outputv(ErlDrvData, ErlIOVec*); +static void stop_select(ErlDrvEvent, void*); + +/* II.V Forker prototypes */ +static ErlDrvData forker_start(ErlDrvPort, char*, SysDriverOpts*); +static void forker_stop(ErlDrvData); +static void forker_ready_input(ErlDrvData, ErlDrvEvent); +static void forker_ready_output(ErlDrvData, ErlDrvEvent); +static ErlDrvSSizeT forker_control(ErlDrvData, unsigned int, char *, + ErlDrvSizeT, char **, ErlDrvSizeT); + +/* III Driver entries */ + +/* III.I The spawn driver */ +struct erl_drv_entry spawn_driver_entry = { + NULL, + spawn_start, + stop, + output, + ready_input, + ready_output, + "spawn", + NULL, + NULL, + spawn_control, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + ERL_DRV_EXTENDED_MARKER, + ERL_DRV_EXTENDED_MAJOR_VERSION, + ERL_DRV_EXTENDED_MINOR_VERSION, + ERL_DRV_FLAG_USE_PORT_LOCKING | ERL_DRV_FLAG_USE_INIT_ACK, + NULL, NULL, + stop_select +}; + +/* III.II The fd driver */ +struct erl_drv_entry fd_driver_entry = { + NULL, + fd_start, + fd_stop, + output, + ready_input, + ready_output, + "fd", + NULL, + NULL, + fd_control, + NULL, + outputv, +#if FDBLOCK + fd_ready_async, /* ready_async */ +#else + NULL, +#endif + fd_flush, /* flush */ + NULL, /* call */ + NULL, /* event */ + ERL_DRV_EXTENDED_MARKER, + ERL_DRV_EXTENDED_MAJOR_VERSION, + ERL_DRV_EXTENDED_MINOR_VERSION, + 0, /* ERL_DRV_FLAGs */ + NULL, /* handle2 */ + NULL, /* process_exit */ + stop_select +}; + +/* III.III The vanilla driver */ +struct erl_drv_entry vanilla_driver_entry = { + NULL, + vanilla_start, + stop, + output, + ready_input, + ready_output, + "vanilla", + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, /* flush */ + NULL, /* call */ + NULL, /* event */ + ERL_DRV_EXTENDED_MARKER, + ERL_DRV_EXTENDED_MAJOR_VERSION, + ERL_DRV_EXTENDED_MINOR_VERSION, + 0, /* ERL_DRV_FLAGs */ + NULL, /* handle2 */ + NULL, /* process_exit */ + stop_select +}; + +/* III.III The forker driver */ +struct erl_drv_entry forker_driver_entry = { + NULL, + forker_start, + forker_stop, + NULL, + forker_ready_input, + forker_ready_output, + "spawn_forker", + NULL, + NULL, + forker_control, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + ERL_DRV_EXTENDED_MARKER, + ERL_DRV_EXTENDED_MAJOR_VERSION, + ERL_DRV_EXTENDED_MINOR_VERSION, + 0, + NULL, NULL, + stop_select +}; + +/* Untility functions */ + +static int set_blocking_data(ErtsSysDriverData *dd) { + + dd->blocking = erts_alloc(ERTS_ALC_T_SYS_BLOCKING, sizeof(ErtsSysBlocking)); + + erts_smp_atomic_add_nob(&sys_misc_mem_sz, sizeof(ErtsSysBlocking)); + + dd->blocking->pdl = driver_pdl_create(dd->port_num); + dd->blocking->res = 0; + dd->blocking->err = 0; + dd->blocking->pkey = driver_async_port_key(dd->port_num); + + return 1; +} + +static void init_fd_data(ErtsSysFdData *fd_data, int fd) +{ + fd_data->fd = fd; + fd_data->buf = NULL; + fd_data->cpos = NULL; + fd_data->remain = 0; + fd_data->sz = 0; + fd_data->psz = 0; +} + +static ErtsSysDriverData * +create_driver_data(ErlDrvPort port_num, + int ifd, + int ofd, + int packet_bytes, + int read_write, + int exit_status, + int pid, + int is_blocking) +{ + Port *prt; + ErtsSysDriverData *driver_data; + char *data; + int size = sizeof(ErtsSysDriverData); + + if (read_write & DO_READ) + size += sizeof(ErtsSysFdData); + + if ((read_write & DO_WRITE) && + ((ifd != ofd || ofd == -1) || !(read_write & DO_READ))) + size += sizeof(ErtsSysFdData); + + data = erts_alloc(ERTS_ALC_T_DRV_TAB,size); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, size); + + driver_data = (ErtsSysDriverData*)data; + data += sizeof(*driver_data); + + prt = erts_drvport2port(port_num); + if (prt != ERTS_INVALID_ERL_DRV_PORT) + prt->os_pid = pid; + + driver_data->packet_bytes = packet_bytes; + driver_data->port_num = port_num; + driver_data->pid = pid; + driver_data->alive = exit_status ? 1 : 0; + driver_data->status = 0; + driver_data->terminating = 0; + driver_data->blocking = NULL; + + if (read_write & DO_READ) { + driver_data->ifd = (ErtsSysFdData*)data; + data += sizeof(*driver_data->ifd); + init_fd_data(driver_data->ifd, ifd); + driver_select(port_num, ifd, (ERL_DRV_READ|ERL_DRV_USE), 1); + } else { + driver_data->ifd = NULL; + } + + if (read_write & DO_WRITE) { + if (ofd != -1 && ifd == ofd && read_write & DO_READ) { + /* This is for when ifd and ofd are the same fd */ + driver_data->ofd = driver_data->ifd; + } else { + driver_data->ofd = (ErtsSysFdData*)data; + data += sizeof(*driver_data->ofd); + init_fd_data(driver_data->ofd, ofd); + } + if (is_blocking && FDBLOCK) + if (!set_blocking_data(driver_data)) { + erts_free(ERTS_ALC_T_DRV_TAB, driver_data); + return NULL; + } + } else { + driver_data->ofd = NULL; + } + + return driver_data; +} + +/* Spawn driver */ + +static void close_pipes(int ifd[2], int ofd[2]) +{ + close(ifd[0]); + close(ifd[1]); + close(ofd[0]); + close(ofd[1]); +} + +static char **build_unix_environment(char *block) +{ + int i; + int j; + int len; + char *cp; + char **cpp; + char** old_env; + + ERTS_SMP_LC_ASSERT(erts_smp_lc_rwmtx_is_rlocked(&environ_rwmtx)); + + cp = block; + len = 0; + while (*cp != '\0') { + cp += strlen(cp) + 1; + len++; + } + old_env = environ; + while (*old_env++ != NULL) { + len++; + } + + cpp = (char **) erts_alloc_fnf(ERTS_ALC_T_ENVIRONMENT, + sizeof(char *) * (len+1)); + if (cpp == NULL) { + return NULL; + } + + cp = block; + len = 0; + while (*cp != '\0') { + cpp[len] = cp; + cp += strlen(cp) + 1; + len++; + } + + i = len; + for (old_env = environ; *old_env; old_env++) { + char* old = *old_env; + + for (j = 0; j < len; j++) { + char *s, *t; + + /* check if cpp[j] equals old + before the = sign, + i.e. + "TMPDIR=/tmp/" */ + s = cpp[j]; + t = old; + while (*s == *t && *s != '=') { + s++, t++; + } + if (*s == '=' && *t == '=') { + break; + } + } + + if (j == len) { /* New version not found */ + cpp[len++] = old; + } + } + + for (j = 0; j < i; ) { + size_t last = strlen(cpp[j])-1; + if (cpp[j][last] == '=' && strchr(cpp[j], '=') == cpp[j]+last) { + cpp[j] = cpp[--len]; + if (len < i) { + i--; + } else { + j++; + } + } + else { + j++; + } + } + + cpp[len] = NULL; + return cpp; +} + +static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, + SysDriverOpts* opts) +{ +#define CMD_LINE_PREFIX_STR "exec " +#define CMD_LINE_PREFIX_STR_SZ (sizeof(CMD_LINE_PREFIX_STR) - 1) + + int len; + char **new_environ; + ErtsSysDriverData *dd; + char *cmd_line; + char wd_buff[MAXPATHLEN+1]; + char *wd; + int ifd[2], ofd[2], stderrfd; + + if (pipe(ifd) < 0) return ERL_DRV_ERROR_ERRNO; + errno = EMFILE; /* default for next three conditions */ + if (ifd[0] >= sys_max_files() || pipe(ofd) < 0) { + close(ifd[0]); + close(ifd[1]); + return ERL_DRV_ERROR_ERRNO; + } + if (ofd[1] >= sys_max_files()) { + close_pipes(ifd, ofd); + errno = EMFILE; + return ERL_DRV_ERROR_ERRNO; + } + + SET_NONBLOCKING(ifd[0]); + SET_NONBLOCKING(ofd[1]); + + stderrfd = opts->redir_stderr ? ifd[1] : dup(2); + + if (stderrfd >= sys_max_files() || stderrfd < 0) { + close_pipes(ifd, ofd); + if (stderrfd > -1) + close(stderrfd); + return ERL_DRV_ERROR_ERRNO; + } + + if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { + /* started with spawn_executable, not with spawn */ + len = strlen(name); + cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP, len + 1); + if (!cmd_line) { + close_pipes(ifd, ofd); + errno = ENOMEM; + return ERL_DRV_ERROR_ERRNO; + } + memcpy((void *) cmd_line,(void *) name, len); + cmd_line[len] = '\0'; + len = len + 1; + if (access(cmd_line,X_OK) != 0) { + int save_errno = errno; + erts_free(ERTS_ALC_T_TMP, cmd_line); + close_pipes(ifd, ofd); + errno = save_errno; + return ERL_DRV_ERROR_ERRNO; + } + } else { + /* make the string suitable for giving to "sh" */ + len = strlen(name); + cmd_line = (char *) erts_alloc_fnf(ERTS_ALC_T_TMP, + CMD_LINE_PREFIX_STR_SZ + len + 1); + if (!cmd_line) { + close_pipes(ifd, ofd); + errno = ENOMEM; + return ERL_DRV_ERROR_ERRNO; + } + memcpy((void *) cmd_line, + (void *) CMD_LINE_PREFIX_STR, + CMD_LINE_PREFIX_STR_SZ); + memcpy((void *) (cmd_line + CMD_LINE_PREFIX_STR_SZ), (void *) name, len); + cmd_line[CMD_LINE_PREFIX_STR_SZ + len] = '\0'; + len = CMD_LINE_PREFIX_STR_SZ + len + 1; + } + + erts_smp_rwmtx_rlock(&environ_rwmtx); + + if (opts->envir == NULL) { + new_environ = environ; + } else if ((new_environ = build_unix_environment(opts->envir)) == NULL) { + erts_smp_rwmtx_runlock(&environ_rwmtx); + close_pipes(ifd, ofd); + erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); + errno = ENOMEM; + return ERL_DRV_ERROR_ERRNO; + } + + if (opts->wd == NULL) { + if ((wd = getcwd(wd_buff, MAXPATHLEN+1)) == NULL) { + /* on some OSs this call opens a fd in the + background which means that this can + return EMFILE */ + int err = errno; + close_pipes(ifd, ofd); + erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); + if (new_environ != environ) + erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); + erts_smp_rwmtx_runlock(&environ_rwmtx); + errno = err; + return ERL_DRV_ERROR_ERRNO; + } + } else { + wd = opts->wd; + } + + { + struct iovec *io_vector; + int iov_len = 5; + char nullbuff[] = "\0"; + int j, i = 0, res; + Sint32 buffsz = 0, env_len = 0, argv_len = 0, + flags = (opts->use_stdio ? FORKER_FLAG_USE_STDIO : 0) + | (opts->exit_status ? FORKER_FLAG_EXIT_STATUS : 0) + | (opts->read_write & DO_READ ? FORKER_FLAG_DO_READ : 0) + | (opts->read_write & DO_WRITE ? FORKER_FLAG_DO_WRITE : 0); + + /* count number of elements in environment */ + while(new_environ[env_len] != NULL) + env_len++; + iov_len += 1 + env_len; /* num envs including size int */ + + /* count number of element in argument list */ + if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { + if (opts->argv != NULL) { + while(opts->argv[argv_len] != NULL) + argv_len++; + } else { + argv_len++; + } + iov_len += 1 + argv_len; /* num argvs including size int */ + } + + io_vector = erts_alloc_fnf(ERTS_ALC_T_TMP, sizeof(struct iovec) * iov_len); + + if (!io_vector) { + close_pipes(ifd, ofd); + erts_smp_rwmtx_runlock(&environ_rwmtx); + erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); + if (new_environ != environ) + erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); + errno = ENOMEM; + return ERL_DRV_ERROR_ERRNO; + } + + io_vector[i].iov_base = (void*)&buffsz; + io_vector[i++].iov_len = sizeof(buffsz); + + io_vector[i].iov_base = (void*)&flags; + flags = htonl(flags); + io_vector[i++].iov_len = sizeof(flags); + buffsz += sizeof(flags); + + io_vector[i].iov_base = cmd_line; + io_vector[i++].iov_len = len; + buffsz += len; + + io_vector[i].iov_base = wd; + io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1; + buffsz += io_vector[i++].iov_len; + + io_vector[i].iov_base = nullbuff; + io_vector[i++].iov_len = 1; + buffsz += io_vector[i-1].iov_len; + + io_vector[i].iov_base = (void*)&env_len; + env_len = htonl(env_len); + io_vector[i++].iov_len = sizeof(env_len); + buffsz += io_vector[i-1].iov_len; + + for (j = 0; new_environ[j] != NULL; j++) { + io_vector[i].iov_base = new_environ[j]; + io_vector[i++].iov_len = strlen(new_environ[j]) + 1; + buffsz += io_vector[i-1].iov_len; + } + + /* only append arguments if this was a spawn_executable */ + if (opts->spawn_type == ERTS_SPAWN_EXECUTABLE) { + + io_vector[i].iov_base = (void*)&argv_len; + argv_len = htonl(argv_len); + io_vector[i++].iov_len = sizeof(argv_len); + buffsz += io_vector[i-1].iov_len; + + if (opts->argv) { + /* If there are arguments we copy in the references to + them into the iov */ + for (j = 0; opts->argv[j]; j++) { + if (opts->argv[j] == erts_default_arg0) + io_vector[i].iov_base = cmd_line; + else + io_vector[i].iov_base = opts->argv[j]; + io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1; + buffsz += io_vector[i++].iov_len; + } + } else { + io_vector[i].iov_base = cmd_line; + io_vector[i].iov_len = strlen(io_vector[i].iov_base) + 1; + buffsz += io_vector[i++].iov_len; + } + } + + /* we send the request to do the fork */ + if ((res = writev(ofd[1], io_vector, iov_len > MAXIOV ? MAXIOV : iov_len)) < 0) { + if (errno == ERRNO_BLOCK) { + res = 0; + } else { + int err = errno; + close_pipes(ifd, ofd); + erts_free(ERTS_ALC_T_TMP, io_vector); + if (new_environ != environ) + erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); + erts_smp_rwmtx_runlock(&environ_rwmtx); + erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); + errno = err; + return ERL_DRV_ERROR_ERRNO; + } + } + + if (res < (buffsz + sizeof(buffsz))) { + /* we only wrote part of the command payload. Enqueue the rest. */ + for (i = 0; i < iov_len; i++) { + driver_enq(port_num, io_vector[i].iov_base, io_vector[i].iov_len); + } + driver_deq(port_num, res); + driver_select(port_num, ofd[1], ERL_DRV_WRITE|ERL_DRV_USE, 1); + } + + erts_free(ERTS_ALC_T_TMP, io_vector); + } + + erts_free(ERTS_ALC_T_TMP, (void *) cmd_line); + + if (new_environ != environ) + erts_free(ERTS_ALC_T_ENVIRONMENT, (void *) new_environ); + + erts_smp_rwmtx_runlock(&environ_rwmtx); + + dd = create_driver_data(port_num, ifd[0], ofd[1], opts->packet_bytes, + DO_WRITE | DO_READ, opts->exit_status, + 0, 0); + + { + /* send ofd[0] + ifd[1] + stderrfd to forker port */ + ErtsSysForkerProto *proto = + erts_alloc(ERTS_ALC_T_DRV_CTRL_DATA, + sizeof(ErtsSysForkerProto)); + memset(proto, 0, sizeof(ErtsSysForkerProto)); + proto->action = ErtsSysForkerProtoAction_Start; + proto->u.start.fds[0] = ofd[0]; + proto->u.start.fds[1] = ifd[1]; + proto->u.start.fds[2] = stderrfd; + proto->u.start.port_id = opts->exit_status ? erts_drvport2id(port_num) : THE_NON_VALUE; + if (erl_drv_port_control(forker_port, 'S', (char*)proto, sizeof(*proto))) { + /* The forker port has been killed, we close both fd's which will + make open_port throw an epipe error */ + close(ofd[0]); + close(ifd[1]); + } + } + + /* we set these fds to negative to mark if + they should be closed after the handshake */ + if (!(opts->read_write & DO_READ)) + dd->ifd->fd *= -1; + + if (!(opts->read_write & DO_WRITE)) + dd->ofd->fd *= -1; + + return (ErlDrvData)dd; +#undef CMD_LINE_PREFIX_STR +#undef CMD_LINE_PREFIX_STR_SZ +} + +static ErlDrvSSizeT spawn_control(ErlDrvData e, unsigned int cmd, char *buf, + ErlDrvSizeT len, char **rbuf, ErlDrvSizeT rlen) +{ + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + ErtsSysForkerProto *proto = (ErtsSysForkerProto *)buf; + + ASSERT(len == sizeof(*proto)); + ASSERT(proto->action == ErtsSysForkerProtoAction_SigChld); + + dd->status = proto->u.sigchld.error_number; + dd->alive = -1; + + if (dd->ifd) + driver_select(dd->port_num, abs(dd->ifd->fd), ERL_DRV_READ | ERL_DRV_USE, 1); + + if (dd->ofd) + driver_select(dd->port_num, abs(dd->ofd->fd), ERL_DRV_WRITE | ERL_DRV_USE, 1); + + return 0; +} + +#define FD_DEF_HEIGHT 24 +#define FD_DEF_WIDTH 80 +/* Control op */ +#define FD_CTRL_OP_GET_WINSIZE 100 + +static int fd_get_window_size(int fd, Uint32 *width, Uint32 *height) +{ +#ifdef TIOCGWINSZ + struct winsize ws; + if (ioctl(fd,TIOCGWINSZ,&ws) == 0) { + *width = (Uint32) ws.ws_col; + *height = (Uint32) ws.ws_row; + return 0; + } +#endif + return -1; +} + +static ErlDrvSSizeT fd_control(ErlDrvData drv_data, + unsigned int command, + char *buf, ErlDrvSizeT len, + char **rbuf, ErlDrvSizeT rlen) +{ + int fd = (int)(long)drv_data; + char resbuff[2*sizeof(Uint32)]; + switch (command) { + case FD_CTRL_OP_GET_WINSIZE: + { + Uint32 w,h; + if (fd_get_window_size(fd,&w,&h)) + return 0; + memcpy(resbuff,&w,sizeof(Uint32)); + memcpy(resbuff+sizeof(Uint32),&h,sizeof(Uint32)); + } + break; + default: + return 0; + } + if (rlen < 2*sizeof(Uint32)) { + *rbuf = driver_alloc(2*sizeof(Uint32)); + } + memcpy(*rbuf,resbuff,2*sizeof(Uint32)); + return 2*sizeof(Uint32); +} + +static ErlDrvData fd_start(ErlDrvPort port_num, char* name, + SysDriverOpts* opts) +{ + int non_blocking = 0; + + if (((opts->read_write & DO_READ) && opts->ifd >= sys_max_files()) || + ((opts->read_write & DO_WRITE) && opts->ofd >= sys_max_files())) + return ERL_DRV_ERROR_GENERAL; + + /* + * Historical: + * + * "Note about nonblocking I/O. + * + * At least on Solaris, setting the write end of a TTY to nonblocking, + * will set the input end to nonblocking as well (and vice-versa). + * If erl is run in a pipeline like this: cat | erl + * the input end of the TTY will be the standard input of cat. + * And cat is not prepared to handle nonblocking I/O." + * + * Actually, the reason for this is not that the tty itself gets set + * in non-blocking mode, but that the "input end" (cat's stdin) and + * the "output end" (erlang's stdout) are typically the "same" file + * descriptor, dup()'ed from a single fd by one of this process' + * ancestors. + * + * The workaround for this problem used to be a rather bad kludge, + * interposing an extra process ("internal cat") between erlang's + * stdout and the original stdout, allowing erlang to set its stdout + * in non-blocking mode without affecting the stdin of the preceding + * process in the pipeline - and being a kludge, it caused all kinds + * of weird problems. + * + * So, this is the current logic: + * + * The only reason to set non-blocking mode on the output fd at all is + * if it's something that can cause a write() to block, of course, + * i.e. primarily if it points to a tty, socket, pipe, or fifo. + * + * If we don't set non-blocking mode when we "should" have, and output + * becomes blocked, the entire runtime system will be suspended - this + * is normally bad of course, and can happen fairly "easily" - e.g. user + * hits ^S on tty - but doesn't necessarily happen. + * + * If we do set non-blocking mode when we "shouldn't" have, the runtime + * system will end up seeing EOF on the input fd (due to the preceding + * process dying), which typically will cause the entire runtime system + * to terminate immediately (due to whatever erlang process is seeing + * the EOF taking it as a signal to halt the system). This is *very* bad. + * + * I.e. we should take a conservative approach, and only set non- + * blocking mode when we a) need to, and b) are reasonably certain + * that it won't be a problem. And as in the example above, the problem + * occurs when input fd and output fd point to different "things". + * + * However, determining that they are not just the same "type" of + * "thing", but actually the same instance of that type of thing, is + * unreasonably complex in many/most cases. + * + * Also, with pipes, sockets, and fifos it's far from obvious that the + * user *wants* non-blocking output: If you're running erlang inside + * some complex pipeline, you're probably not running a real-time system + * that must never stop, but rather *want* it to suspend if the output + * channel is "full". + * + * So, the bottom line: We will only set the output fd non-blocking if + * it points to a tty, and either a) the input fd also points to a tty, + * or b) we can make sure that setting the output fd non-blocking + * doesn't interfere with someone else's input, via a somewhat milder + * kludge than the above. + * + * Also keep in mind that while this code is almost exclusively run as + * a result of an erlang open_port({fd,0,1}, ...), that isn't the only + * case - it can be called with any old pre-existing file descriptors, + * the relations between which (if they're even two) we can only guess + * at - still, we try our best... + * + * Added note OTP 18: Some systems seem to use stdout/stderr to log data + * using unix pipes, so we cannot allow the system to block on a write. + * Therefore we use an async thread to write the data to fd's that could + * not be set to non-blocking. When no async threads are available we + * fall back on the old behaviour. + * + * Also the guarantee about what is delivered to the OS has changed. + * Pre 18 the fd driver did no flushing of data before terminating. + * Now it does. This is because we want to be able to guarantee that things + * such as escripts and friends really have outputted all data before + * terminating. This could potentially block the termination of the system + * for a very long time, but if the user wants to terminate fast she should + * use erlang:halt with flush=false. + */ + + /* Try to figure out if we can use non-blocking writes */ + if (opts->read_write & DO_WRITE) { + + /* If we don't have a read end, all bets are off - no non-blocking. */ + if (opts->read_write & DO_READ) { + + if (isatty(opts->ofd)) { /* output fd is a tty:-) */ + + if (isatty(opts->ifd)) { /* input fd is also a tty */ + + /* To really do this "right", we should also check that + input and output fd point to the *same* tty - but + this seems like overkill; ttyname() isn't for free, + and this is a very common case - and it's hard to + imagine a scenario where setting non-blocking mode + here would cause problems - go ahead and do it. */ + + non_blocking = 1; + SET_NONBLOCKING(opts->ofd); + + } else { /* output fd is a tty, input fd isn't */ + + /* This is a "problem case", but also common (see the + example above) - i.e. it makes sense to try a bit + harder before giving up on non-blocking mode: Try to + re-open the tty that the output fd points to, and if + successful replace the original one with the "new" fd + obtained this way, and set *that* one in non-blocking + mode. (Yes, this is a kludge.) + + However, re-opening the tty may fail in a couple of + (unusual) cases: + + 1) The name of the tty (or an equivalent one, i.e. + same major/minor number) can't be found, because + it actually lives somewhere other than /dev (or + wherever ttyname() looks for it), and isn't + equivalent to any of those that do live in the + "standard" place - this should be *very* unusual. + + 2) Permissions on the tty don't allow us to open it - + it's perfectly possible to have an fd open to an + object whose permissions wouldn't allow us to open + it. This is not as unusual as it sounds, one case + is if the user has su'ed to someone else (not + root) - we have a read/write fd open to the tty + (because it has been inherited all the way down + here), but we have neither read nor write + permission for the tty. + + In these cases, we finally give up, and don't set the + output fd in non-blocking mode. */ + + char *tty; + int nfd; + + if ((tty = ttyname(opts->ofd)) != NULL && + (nfd = open(tty, O_WRONLY)) != -1) { + dup2(nfd, opts->ofd); + close(nfd); + non_blocking = 1; + SET_NONBLOCKING(opts->ofd); + } + } + } + } + } + return (ErlDrvData)create_driver_data(port_num, opts->ifd, opts->ofd, + opts->packet_bytes, + opts->read_write, 0, -1, + !non_blocking); +} + +static void clear_fd_data(ErtsSysFdData *fdd) +{ + if (fdd->sz > 0) { + erts_free(ERTS_ALC_T_FD_ENTRY_BUF, (void *) fdd->buf); + ASSERT(erts_smp_atomic_read_nob(&sys_misc_mem_sz) >= fdd->sz); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -1*fdd->sz); + } + fdd->buf = NULL; + fdd->sz = 0; + fdd->remain = 0; + fdd->cpos = NULL; + fdd->psz = 0; +} + +static void nbio_stop_fd(ErlDrvPort prt, ErtsSysFdData *fdd) +{ + driver_select(prt, abs(fdd->fd), DO_READ|DO_WRITE, 0); + clear_fd_data(fdd); + SET_BLOCKING(abs(fdd->fd)); + +} + +static void fd_stop(ErlDrvData ev) /* Does not close the fds */ +{ + ErtsSysDriverData* dd = (ErtsSysDriverData*)ev; + ErlDrvPort prt = dd->port_num; + int sz = sizeof(ErtsSysDriverData); + +#if FDBLOCK + if (dd->blocking) { + erts_free(ERTS_ALC_T_SYS_BLOCKING, dd->blocking); + dd->blocking = NULL; + sz += sizeof(ErtsSysBlocking); + } +#endif + + if (dd->ifd) { + sz += sizeof(ErtsSysFdData); + nbio_stop_fd(prt, dd->ifd); + } + if (dd->ofd && dd->ofd != dd->ifd) { + sz += sizeof(ErtsSysFdData); + nbio_stop_fd(prt, dd->ofd); + } + + erts_free(ERTS_ALC_T_DRV_TAB, dd); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sz); +} + +static void fd_flush(ErlDrvData ev) +{ + ErtsSysDriverData* dd = (ErtsSysDriverData*)ev; + if (!dd->terminating) + dd->terminating = 1; +} + +static ErlDrvData vanilla_start(ErlDrvPort port_num, char* name, + SysDriverOpts* opts) +{ + int flags, fd; + ErlDrvData res; + + flags = (opts->read_write == DO_READ ? O_RDONLY : + opts->read_write == DO_WRITE ? O_WRONLY|O_CREAT|O_TRUNC : + O_RDWR|O_CREAT); + if ((fd = open(name, flags, 0666)) < 0) + return ERL_DRV_ERROR_GENERAL; + if (fd >= sys_max_files()) { + close(fd); + return ERL_DRV_ERROR_GENERAL; + } + SET_NONBLOCKING(fd); + + res = (ErlDrvData)(long)create_driver_data(port_num, fd, fd, + opts->packet_bytes, + opts->read_write, 0, -1, 0); + return res; +} + +/* Note that driver_data[fd].ifd == fd if the port was opened for reading, */ +/* otherwise (i.e. write only) driver_data[fd].ofd = fd. */ + +static void stop(ErlDrvData ev) +{ + ErtsSysDriverData* dd = (ErtsSysDriverData*)ev; + ErlDrvPort prt = dd->port_num; + + if (dd->ifd) { + nbio_stop_fd(prt, dd->ifd); + driver_select(prt, abs(dd->ifd->fd), ERL_DRV_USE, 0); /* close(ifd); */ + } + + if (dd->ofd && dd->ofd != dd->ifd) { + nbio_stop_fd(prt, dd->ofd); + driver_select(prt, abs(dd->ofd->fd), ERL_DRV_USE, 0); /* close(ofd); */ + } + + erts_free(ERTS_ALC_T_DRV_TAB, dd); +} + +/* used by fd_driver */ +static void outputv(ErlDrvData e, ErlIOVec* ev) +{ + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + ErlDrvPort ix = dd->port_num; + int pb = dd->packet_bytes; + int ofd = dd->ofd ? dd->ofd->fd : -1; + ssize_t n; + ErlDrvSizeT sz; + char lb[4]; + char* lbp; + ErlDrvSizeT len = ev->size; + + /* (len > ((unsigned long)-1 >> (4-pb)*8)) */ + /* if (pb >= 0 && (len & (((ErlDrvSizeT)1 << (pb*8))) - 1) != len) {*/ + if (((pb == 2) && (len > 0xffff)) || (pb == 1 && len > 0xff)) { + driver_failure_posix(ix, EINVAL); + return; /* -1; */ + } + /* Handles 0 <= pb <= 4 only */ + put_int32((Uint32) len, lb); + lbp = lb + (4-pb); + + ev->iov[0].iov_base = lbp; + ev->iov[0].iov_len = pb; + ev->size += pb; + + if (dd->blocking && FDBLOCK) + driver_pdl_lock(dd->blocking->pdl); + + if ((sz = driver_sizeq(ix)) > 0) { + driver_enqv(ix, ev, 0); + + if (dd->blocking && FDBLOCK) + driver_pdl_unlock(dd->blocking->pdl); + + if (sz + ev->size >= (1 << 13)) + set_busy_port(ix, 1); + } + else if (!dd->blocking || !FDBLOCK) { + /* We try to write directly if the fd in non-blocking */ + int vsize = ev->vsize > MAX_VSIZE ? MAX_VSIZE : ev->vsize; + + n = writev(ofd, (const void *) (ev->iov), vsize); + if (n == ev->size) + return; /* 0;*/ + if (n < 0) { + if ((errno != EINTR) && (errno != ERRNO_BLOCK)) { + driver_failure_posix(ix, errno); + return; /* -1;*/ + } + n = 0; + } + driver_enqv(ix, ev, n); /* n is the skip value */ + driver_select(ix, ofd, ERL_DRV_WRITE|ERL_DRV_USE, 1); + } +#if FDBLOCK + else { + if (ev->size != 0) { + driver_enqv(ix, ev, 0); + driver_pdl_unlock(dd->blocking->pdl); + driver_async(ix, &dd->blocking->pkey, + fd_async, dd, NULL); + } else { + driver_pdl_unlock(dd->blocking->pdl); + } + } +#endif + /* return 0;*/ +} + +/* Used by spawn_driver and vanilla driver */ +static void output(ErlDrvData e, char* buf, ErlDrvSizeT len) +{ + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + ErlDrvPort ix = dd->port_num; + int pb = dd->packet_bytes; + int ofd = dd->ofd ? dd->ofd->fd : -1; + ssize_t n; + ErlDrvSizeT sz; + char lb[4]; + char* lbp; + struct iovec iv[2]; + + /* (len > ((unsigned long)-1 >> (4-pb)*8)) */ + if (((pb == 2) && (len > 0xffff)) + || (pb == 1 && len > 0xff) + || dd->pid == 0 /* Attempt at output before port is ready */) { + driver_failure_posix(ix, EINVAL); + return; /* -1; */ + } + put_int32(len, lb); + lbp = lb + (4-pb); + + if ((sz = driver_sizeq(ix)) > 0) { + driver_enq(ix, lbp, pb); + driver_enq(ix, buf, len); + if (sz + len + pb >= (1 << 13)) + set_busy_port(ix, 1); + } + else { + iv[0].iov_base = lbp; + iv[0].iov_len = pb; /* should work for pb=0 */ + iv[1].iov_base = buf; + iv[1].iov_len = len; + n = writev(ofd, iv, 2); + if (n == pb+len) + return; /* 0; */ + if (n < 0) { + if ((errno != EINTR) && (errno != ERRNO_BLOCK)) { + driver_failure_posix(ix, errno); + return; /* -1; */ + } + n = 0; + } + if (n < pb) { + driver_enq(ix, lbp+n, pb-n); + driver_enq(ix, buf, len); + } + else { + n -= pb; + driver_enq(ix, buf+n, len-n); + } + driver_select(ix, ofd, ERL_DRV_WRITE|ERL_DRV_USE, 1); + } + return; /* 0; */ +} + +static int port_inp_failure(ErtsSysDriverData *dd, int res) + /* Result: 0 (eof) or -1 (error) */ +{ + int err = errno; + + ASSERT(res <= 0); + if (dd->ifd) { + driver_select(dd->port_num, dd->ifd->fd, ERL_DRV_READ|ERL_DRV_WRITE, 0); + clear_fd_data(dd->ifd); + } + + if (dd->blocking && FDBLOCK) { + driver_pdl_lock(dd->blocking->pdl); + if (driver_sizeq(dd->port_num) > 0) { + driver_pdl_unlock(dd->blocking->pdl); + /* We have stuff in the output queue, so we just + set the state to terminating and wait for fd_async_ready + to terminate the port */ + if (res == 0) + dd->terminating = 2; + else + dd->terminating = -err; + return 0; + } + driver_pdl_unlock(dd->blocking->pdl); + } + + if (res == 0) { + if (dd->alive == 1) { + /* + * We have eof and want to report exit status, but the process + * hasn't exited yet. When it does ready_input will + * driver_select() this fd which will make sure that we get + * back here with dd->alive == -1 and dd->status set. + */ + return 0; + } + else if (dd->alive == -1) { + int status = dd->status; + + /* We need not be prepared for stopped/continued processes. */ + if (WIFSIGNALED(status)) + status = 128 + WTERMSIG(status); + else + status = WEXITSTATUS(status); + driver_report_exit(dd->port_num, status); + } + driver_failure_eof(dd->port_num); + } else if (dd->ifd) { + erl_drv_init_ack(dd->port_num, ERL_DRV_ERROR_ERRNO); + } else { + driver_failure_posix(dd->port_num, err); + } + return 0; +} + +/* fd is the drv_data that is returned from the */ +/* initial start routine */ +/* ready_fd is the descriptor that is ready to read */ + +static void ready_input(ErlDrvData e, ErlDrvEvent ready_fd) +{ + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + ErlDrvPort port_num; + int packet_bytes; + int res; + Uint h; + + port_num = dd->port_num; + packet_bytes = dd->packet_bytes; + + ASSERT(abs(dd->ifd->fd) == ready_fd); + + if (dd->pid == 0) { + /* the pid is sent from erl_child_setup. spawn driver only. */ + ErtsSysForkerProto proto; + int res; + + if((res = read(ready_fd, &proto, sizeof(proto))) <= 0) { + /* hmm, child setup seems to have closed the pipe too early... + we close the port as there is not much else we can do */ + if (res < 0 && errno == ERRNO_BLOCK) + return; + driver_select(port_num, ready_fd, ERL_DRV_READ, 0); + if (res == 0) + errno = EPIPE; + port_inp_failure(dd, -1); + return; + } + + ASSERT(proto.action == ErtsSysForkerProtoAction_Go); + dd->pid = proto.u.go.os_pid; + + if (dd->pid == -1) { + /* Setup failed! The only reason why this should happen is if + the fork fails. */ + errno = proto.u.go.error_number; + port_inp_failure(dd, -1); + return; + } + + proto.action = ErtsSysForkerProtoAction_Ack; + + if (driver_sizeq(port_num) > 0) { + driver_enq(port_num, (char*)&proto, sizeof(proto)); + } else { + if (write(abs(dd->ofd->fd), &proto, sizeof(proto)) < 0) + if (errno == ERRNO_BLOCK || errno == EINTR) + driver_enq(port_num, (char*)&proto, sizeof(proto)); + /* do nothing on failure here. If the ofd is broken, then + the ifd will probably also be broken and trigger + a port_inp_failure */ + } + + if (dd->ifd->fd < 0) { + driver_select(port_num, abs(dd->ifd->fd), ERL_DRV_READ|ERL_DRV_USE, 0); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sizeof(ErtsSysFdData)); + dd->ifd = NULL; + } + + if (dd->ofd->fd < 0 || driver_sizeq(port_num) > 0) + /* we select in order to close fd or write to queue, + child setup will close this fd if fd < 0 */ + driver_select(port_num, abs(dd->ofd->fd), ERL_DRV_WRITE|ERL_DRV_USE, 1); + + erl_drv_set_os_pid(port_num, dd->pid); + erl_drv_init_ack(port_num, e); + return; + } + + if (packet_bytes == 0) { + byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF, + ERTS_SYS_READ_BUF_SZ); + res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ); + if (res < 0) { + if ((errno != EINTR) && (errno != ERRNO_BLOCK)) + port_inp_failure(dd, res); + } + else if (res == 0) + port_inp_failure(dd, res); + else + driver_output(port_num, (char*) read_buf, res); + erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf); + } + else if (dd->ifd->remain > 0) { /* We try to read the remainder */ + /* space is allocated in buf */ + res = read(ready_fd, dd->ifd->cpos, + dd->ifd->remain); + if (res < 0) { + if ((errno != EINTR) && (errno != ERRNO_BLOCK)) + port_inp_failure(dd, res); + } + else if (res == 0) { + port_inp_failure(dd, res); + } + else if (res == dd->ifd->remain) { /* we're done */ + driver_output(port_num, dd->ifd->buf, + dd->ifd->sz); + clear_fd_data(dd->ifd); + } + else { /* if (res < dd->ifd->remain) */ + dd->ifd->cpos += res; + dd->ifd->remain -= res; + } + } + else if (dd->ifd->remain == 0) { /* clean fd */ + byte *read_buf = (byte *) erts_alloc(ERTS_ALC_T_SYS_READ_BUF, + ERTS_SYS_READ_BUF_SZ); + /* We make one read attempt and see what happens */ + res = read(ready_fd, read_buf, ERTS_SYS_READ_BUF_SZ); + if (res < 0) { + if ((errno != EINTR) && (errno != ERRNO_BLOCK)) + port_inp_failure(dd, res); + } + else if (res == 0) { /* eof */ + port_inp_failure(dd, res); + } + else if (res < packet_bytes - dd->ifd->psz) { + memcpy(dd->ifd->pbuf+dd->ifd->psz, + read_buf, res); + dd->ifd->psz += res; + } + else { /* if (res >= packet_bytes) */ + unsigned char* cpos = read_buf; + int bytes_left = res; + + while (1) { + int psz = dd->ifd->psz; + char* pbp = dd->ifd->pbuf + psz; + + while(bytes_left && (psz < packet_bytes)) { + *pbp++ = *cpos++; + bytes_left--; + psz++; + } + + if (psz < packet_bytes) { + dd->ifd->psz = psz; + break; + } + dd->ifd->psz = 0; + + switch (packet_bytes) { + case 1: h = get_int8(dd->ifd->pbuf); break; + case 2: h = get_int16(dd->ifd->pbuf); break; + case 4: h = get_int32(dd->ifd->pbuf); break; + default: ASSERT(0); return; /* -1; */ + } + + if (h <= (bytes_left)) { + driver_output(port_num, (char*) cpos, h); + cpos += h; + bytes_left -= h; + continue; + } + else { /* The last message we got was split */ + char *buf = erts_alloc_fnf(ERTS_ALC_T_FD_ENTRY_BUF, h); + if (!buf) { + errno = ENOMEM; + port_inp_failure(dd, -1); + } + else { + erts_smp_atomic_add_nob(&sys_misc_mem_sz, h); + sys_memcpy(buf, cpos, bytes_left); + dd->ifd->buf = buf; + dd->ifd->sz = h; + dd->ifd->remain = h - bytes_left; + dd->ifd->cpos = buf + bytes_left; + } + break; + } + } + } + erts_free(ERTS_ALC_T_SYS_READ_BUF, (void *) read_buf); + } +} + + +/* fd is the drv_data that is returned from the */ +/* initial start routine */ +/* ready_fd is the descriptor that is ready to read */ + +static void ready_output(ErlDrvData e, ErlDrvEvent ready_fd) +{ + ErtsSysDriverData *dd = (ErtsSysDriverData*)e; + ErlDrvPort ix = dd->port_num; + int n; + struct iovec* iv; + int vsize; + + if ((iv = (struct iovec*) driver_peekq(ix, &vsize)) == NULL) { + driver_select(ix, ready_fd, ERL_DRV_WRITE, 0); + if (dd->pid > 0 && dd->ofd->fd < 0) { + /* The port was opened with 'in' option, which means we + should close the output fd as soon as the command has + been sent. */ + driver_select(ix, ready_fd, ERL_DRV_WRITE|ERL_DRV_USE, 0); + erts_smp_atomic_add_nob(&sys_misc_mem_sz, -sizeof(ErtsSysFdData)); + dd->ofd = NULL; + } + if (dd->terminating) + driver_failure_atom(dd->port_num,"normal"); + return; /* 0; */ + } + vsize = vsize > MAX_VSIZE ? MAX_VSIZE : vsize; + if ((n = writev(ready_fd, iv, vsize)) > 0) { + if (driver_deq(ix, n) == 0) + set_busy_port(ix, 0); + } + else if (n < 0) { + if (errno == ERRNO_BLOCK || errno == EINTR) + return; /* 0; */ + else { + int res = errno; + driver_select(ix, ready_fd, ERL_DRV_WRITE, 0); + driver_failure_posix(ix, res); + return; /* -1; */ + } + } + return; /* 0; */ +} + +static void stop_select(ErlDrvEvent fd, void* _) +{ + close((int)fd); +} + +#if FDBLOCK + +static void +fd_async(void *async_data) +{ + int res; + ErtsSysDriverData *dd = (ErtsSysDriverData *)async_data; + SysIOVec *iov0; + SysIOVec *iov; + int iovlen; + int err = 0; + /* much of this code is stolen from efile_drv:invoke_writev */ + driver_pdl_lock(dd->blocking->pdl); + iov0 = driver_peekq(dd->port_num, &iovlen); + iovlen = iovlen < MAXIOV ? iovlen : MAXIOV; + iov = erts_alloc_fnf(ERTS_ALC_T_SYS_WRITE_BUF, + sizeof(SysIOVec)*iovlen); + if (!iov) { + res = -1; + err = ENOMEM; + driver_pdl_unlock(dd->blocking->pdl); + } else { + memcpy(iov,iov0,iovlen*sizeof(SysIOVec)); + driver_pdl_unlock(dd->blocking->pdl); + + do { + res = writev(dd->ofd->fd, iov, iovlen); + } while (res < 0 && errno == EINTR); + if (res < 0) + err = errno; + err = errno; + + erts_free(ERTS_ALC_T_SYS_WRITE_BUF, iov); + } + dd->blocking->res = res; + dd->blocking->err = err; +} + +void fd_ready_async(ErlDrvData drv_data, + ErlDrvThreadData thread_data) { + ErtsSysDriverData *dd = (ErtsSysDriverData *)thread_data; + ErlDrvPort port_num = dd->port_num; + + ASSERT(dd->blocking); + + if (dd->blocking->res > 0) { + driver_pdl_lock(dd->blocking->pdl); + if (driver_deq(port_num, dd->blocking->res) == 0) { + driver_pdl_unlock(dd->blocking->pdl); + set_busy_port(port_num, 0); + if (dd->terminating) { + /* The port is has been ordered to terminate + from either fd_flush or port_inp_failure */ + if (dd->terminating == 1) + driver_failure_atom(port_num, "normal"); + else if (dd->terminating == 2) + driver_failure_eof(port_num); + else if (dd->terminating < 0) + driver_failure_posix(port_num, -dd->terminating); + return; /* -1; */ + } + } else { + driver_pdl_unlock(dd->blocking->pdl); + /* still data left to write in queue */ + driver_async(port_num, &dd->blocking->pkey, fd_async, dd, NULL); + return /* 0; */; + } + } else if (dd->blocking->res < 0) { + if (dd->blocking->err == ERRNO_BLOCK) { + set_busy_port(port_num, 1); + /* still data left to write in queue */ + driver_async(port_num, &dd->blocking->pkey, fd_async, dd, NULL); + } else + driver_failure_posix(port_num, dd->blocking->err); + return; /* -1; */ + } + return; /* 0; */ +} + +#endif + +/* Forker driver */ + +static int forker_fd; + +static ErlDrvData forker_start(ErlDrvPort port_num, char* name, + SysDriverOpts* opts) +{ + + int i; + int fds[2]; + int res, unbind; + char bindir[MAXPATHLEN]; + size_t bindirsz = sizeof(bindir); + Uint csp_path_sz; + char *child_setup_prog; + + forker_port = erts_drvport2id(port_num); + + res = erts_sys_getenv_raw("BINDIR", bindir, &bindirsz); + if (res != 0) { + if (res < 0) + erts_exit(1, + "Environment variable BINDIR is not set\n"); + if (res > 0) + erts_exit(1, + "Value of environment variable BINDIR is too large\n"); + } + if (bindir[0] != DIR_SEPARATOR_CHAR) + erts_exit(1, + "Environment variable BINDIR does not contain an" + " absolute path\n"); + csp_path_sz = (strlen(bindir) + + 1 /* DIR_SEPARATOR_CHAR */ + + sizeof(CHILD_SETUP_PROG_NAME) + + 1); + child_setup_prog = erts_alloc(ERTS_ALC_T_CS_PROG_PATH, csp_path_sz); + erts_snprintf(child_setup_prog, csp_path_sz, + "%s%c%s", + bindir, + DIR_SEPARATOR_CHAR, + CHILD_SETUP_PROG_NAME); + if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0) { + erts_exit(ERTS_ABORT_EXIT, + "Could not open unix domain socket in spawn_init: %d\n", + errno); + } + + forker_fd = fds[0]; + + unbind = erts_sched_bind_atfork_prepare(); + + i = fork(); + + if (i == 0) { + /* The child */ + char *cs_argv[FORKER_ARGV_NO_OF_ARGS] = + {CHILD_SETUP_PROG_NAME, NULL, NULL}; + char buff[128]; + + erts_sched_bind_atfork_child(unbind); + + snprintf(buff, 128, "%d", sys_max_files()); + cs_argv[FORKER_ARGV_MAX_FILES] = buff; + + /* We preallocate fd 3 for the uds fd */ + if (fds[1] != 3) { + dup2(fds[1], 3); + } + +#if defined(USE_SETPGRP_NOARGS) /* SysV */ + (void) setpgrp(); +#elif defined(USE_SETPGRP) /* BSD */ + (void) setpgrp(0, getpid()); +#else /* POSIX */ + (void) setsid(); +#endif + + execv(child_setup_prog, cs_argv); + _exit(1); + } + + erts_sched_bind_atfork_parent(unbind); + + erts_free(ERTS_ALC_T_CS_PROG_PATH, child_setup_prog); + + close(fds[1]); + + SET_NONBLOCKING(forker_fd); + + driver_select(port_num, forker_fd, ERL_DRV_READ|ERL_DRV_USE, 1); + + return (ErlDrvData)port_num; +} + +static void forker_stop(ErlDrvData e) +{ + /* we probably should do something here, + the port has been closed by the user. */ +} + +static void forker_ready_input(ErlDrvData e, ErlDrvEvent fd) +{ + int res; + ErtsSysForkerProto *proto; + + proto = erts_alloc(ERTS_ALC_T_DRV_CTRL_DATA, sizeof(*proto)); + + if ((res = read(fd, proto, sizeof(*proto))) < 0) { + if (errno == ERRNO_BLOCK) + return; + erts_exit(ERTS_DUMP_EXIT, "Failed to read from erl_child_setup: %d\n", errno); + } + + if (res == 0) + erts_exit(ERTS_DUMP_EXIT, "erl_child_setup closed\n"); + + ASSERT(res == sizeof(*proto)); + +#ifdef FORKER_PROTO_START_ACK + if (proto->action == ErtsSysForkerProtoAction_StartAck) { + /* Ideally we would like to not have to ack each Start + command being sent over the uds, but it would seem + that some operating systems (only observed on FreeBSD) + throw away data on the uds when the socket becomes full, + so we have to. + */ + ErlDrvPort port_num = (ErlDrvPort)e; + int vlen; + SysIOVec *iov = driver_peekq(port_num, &vlen); + ErtsSysForkerProto *proto = (ErtsSysForkerProto *)iov[0].iov_base; + + close(proto->u.start.fds[0]); + close(proto->u.start.fds[1]); + if (proto->u.start.fds[1] != proto->u.start.fds[2]) + close(proto->u.start.fds[2]); + + driver_deq(port_num, sizeof(*proto)); + + if (driver_sizeq(port_num) > 0) + driver_select(port_num, forker_fd, ERL_DRV_WRITE|ERL_DRV_USE, 1); + } else +#endif + { + ASSERT(proto->action == ErtsSysForkerProtoAction_SigChld); + + /* ideally this would be a port_command call, but as command is + already used by the spawn_driver, we use control instead. + Note that when using erl_drv_port_control it is an asynchronous + control. */ + erl_drv_port_control(proto->u.sigchld.port_id, 'S', + (char*)proto, sizeof(*proto)); + } + +} + +static void forker_ready_output(ErlDrvData e, ErlDrvEvent fd) +{ + ErlDrvPort port_num = (ErlDrvPort)e; + +#ifndef FORKER_PROTO_START_ACK + while (driver_sizeq(port_num) > 0) { +#endif + int vlen; + SysIOVec *iov = driver_peekq(port_num, &vlen); + ErtsSysForkerProto *proto = (ErtsSysForkerProto *)iov[0].iov_base; + ASSERT(iov[0].iov_len >= (sizeof(*proto))); + if (sys_uds_write(forker_fd, (char*)proto, sizeof(*proto), + proto->u.start.fds, 3, 0) < 0) { + if (errno == ERRNO_BLOCK) + return; + erts_exit(ERTS_DUMP_EXIT, "Failed to write to erl_child_setup: %d\n", errno); + } +#ifndef FORKER_PROTO_START_ACK + close(proto->u.start.fds[0]); + close(proto->u.start.fds[1]); + if (proto->u.start.fds[1] != proto->u.start.fds[2]) + close(proto->u.start.fds[2]); + driver_deq(port_num, sizeof(*proto)); + } +#endif + + driver_select(port_num, forker_fd, ERL_DRV_WRITE, 0); +} + +static ErlDrvSSizeT forker_control(ErlDrvData e, unsigned int cmd, char *buf, + ErlDrvSizeT len, char **rbuf, ErlDrvSizeT rlen) +{ + ErtsSysForkerProto *proto = (ErtsSysForkerProto *)buf; + ErlDrvPort port_num = (ErlDrvPort)e; + int res; + + driver_enq(port_num, buf, len); + if (driver_sizeq(port_num) > sizeof(*proto)) { + return 0; + } + + if ((res = sys_uds_write(forker_fd, (char*)proto, sizeof(*proto), + proto->u.start.fds, 3, 0)) < 0) { + if (errno == ERRNO_BLOCK) { + driver_select(port_num, forker_fd, ERL_DRV_WRITE|ERL_DRV_USE, 1); + return 0; + } + erts_exit(ERTS_DUMP_EXIT, "Failed to write to erl_child_setup: %d\n", errno); + } + +#ifndef FORKER_PROTO_START_ACK + ASSERT(res == sizeof(*proto)); + close(proto->u.start.fds[0]); + close(proto->u.start.fds[1]); + if (proto->u.start.fds[1] != proto->u.start.fds[2]) + close(proto->u.start.fds[2]); + driver_deq(port_num, sizeof(*proto)); +#endif + + return 0; +} diff --git a/erts/emulator/sys/unix/sys_float.c b/erts/emulator/sys/unix/sys_float.c index 689be98969..6435da086f 100644 --- a/erts/emulator/sys/unix/sys_float.c +++ b/erts/emulator/sys/unix/sys_float.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2001-2013. All Rights Reserved. + * Copyright Ericsson AB 2001-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -32,7 +33,7 @@ void erts_sys_init_float(void) { # ifdef SIGFPE - sys_sigset(SIGFPE, SIG_IGN); /* Ignore so we can test for NaN and Inf */ + sys_signal(SIGFPE, SIG_IGN); /* Ignore so we can test for NaN and Inf */ # endif } @@ -46,13 +47,13 @@ static void erts_init_fp_exception(void) { /* XXX: the wrappers prevent using a pthread destructor to deallocate the key's value; so when/where do we do that? */ - erts_tsd_key_create(&fpe_key); + erts_tsd_key_create(&fpe_key,"fp_exception"); } void erts_thread_init_fp_exception(void) { unsigned long *fpe = erts_alloc(ERTS_ALC_T_FP_EXCEPTION, sizeof(*fpe)); - *fpe = 0L; + *fpe = 0; erts_tsd_set(fpe_key, fpe); } @@ -85,11 +86,11 @@ static void set_current_fp_exception(unsigned long pc) void erts_fp_check_init_error(volatile unsigned long *fpexnp) { - char buf[64]; + char buf[128]; snprintf(buf, sizeof buf, "ERTS_FP_CHECK_INIT at %p: detected unhandled FPE at %p\r\n", __builtin_return_address(0), (void*)*fpexnp); if (write(2, buf, strlen(buf)) <= 0) - erl_exit(ERTS_ABORT_EXIT, "%s", buf); + erts_exit(ERTS_ABORT_EXIT, "%s", buf); *fpexnp = 0; #if defined(__i386__) || defined(__x86_64__) erts_restore_fpu(); @@ -101,6 +102,17 @@ void erts_fp_check_init_error(volatile unsigned long *fpexnp) #define __DARWIN__ 1 #endif +/* + * Define two processor and possibly OS-specific primitives: + * + * static void unmask_fpe(void); + * -- unmask invalid, overflow, and divide-by-zero exceptions + * + * static int mask_fpe(void); + * -- mask invalid, overflow, and divide-by-zero exceptions + * -- return non-zero if the previous state was unmasked + */ + #if (defined(__i386__) || defined(__x86_64__)) && defined(__GNUC__) static void unmask_x87(void) @@ -112,7 +124,6 @@ static void unmask_x87(void) __asm__ __volatile__("fldcw %0" : : "m"(cw)); } -/* mask x87 FPE, return true if the previous state was unmasked */ static int mask_x87(void) { unsigned short cw; @@ -135,7 +146,6 @@ static void unmask_sse2(void) __asm__ __volatile__("ldmxcsr %0" : : "m"(mxcsr)); } -/* mask SSE2 FPE, return true if the previous state was unmasked */ static int mask_sse2(void) { unsigned int mxcsr; @@ -256,21 +266,19 @@ static int cpu_has_sse2(void) } #endif /* !__x86_64__ */ -static void unmask_fpe(void) +static void unmask_fpe_internal(void) { - __asm__ __volatile__("fnclex"); unmask_x87(); if (cpu_has_sse2()) unmask_sse2(); } -static void unmask_fpe_conditional(int unmasked) +static void unmask_fpe(void) { - if (unmasked) - unmask_fpe(); + __asm__ __volatile__("fnclex"); + unmask_fpe_internal(); } -/* mask x86 FPE, return true if the previous state was unmasked */ static int mask_fpe(void) { int unmasked; @@ -284,9 +292,7 @@ static int mask_fpe(void) void erts_restore_fpu(void) { __asm__ __volatile__("fninit"); - unmask_x87(); - if (cpu_has_sse2()) - unmask_sse2(); + unmask_fpe_internal(); } #elif defined(__sparc__) && defined(__linux__) @@ -309,13 +315,6 @@ static void unmask_fpe(void) __asm__ __volatile__(LDX " %0, %%fsr" : : "m"(fsr)); } -static void unmask_fpe_conditional(int unmasked) -{ - if (unmasked) - unmask_fpe(); -} - -/* mask SPARC FPE, return true if the previous state was unmasked */ static int mask_fpe(void) { unsigned long fsr; @@ -430,13 +429,6 @@ static void unmask_fpe(void) set_fpscr(0x80|0x40|0x10); /* VE, OE, ZE; not UE or XE */ } -static void unmask_fpe_conditional(int unmasked) -{ - if (unmasked) - unmask_fpe(); -} - -/* mask PowerPC FPE, return true if the previous state was unmasked */ static int mask_fpe(void) { int unmasked; @@ -446,20 +438,13 @@ static int mask_fpe(void) return unmasked; } -#else +#else /* !(x86 || (sparc && linux) || (powerpc && (linux || darwin))) */ static void unmask_fpe(void) { fpsetmask(FP_X_INV | FP_X_OFL | FP_X_DZ); } -static void unmask_fpe_conditional(int unmasked) -{ - if (unmasked) - unmask_fpe(); -} - -/* mask IEEE FPE, return true if previous state was unmasked */ static int mask_fpe(void) { const fp_except unmasked_mask = FP_X_INV | FP_X_OFL | FP_X_DZ; @@ -471,6 +456,16 @@ static int mask_fpe(void) #endif +/* + * Define a processor and OS-specific SIGFPE handler. + * + * The effect of receiving a SIGFPE should be: + * 1. Update the processor context: + * a) on x86: mask FP exceptions, do not skip faulting instruction + * b) on SPARC and PowerPC: unmask FP exceptions, skip faulting instruction + * 2. call set_current_fp_exception with the PC of the faulting instruction + */ + #if (defined(__linux__) && (defined(__i386__) || defined(__x86_64__) || defined(__sparc__) || defined(__powerpc__))) || (defined(__DARWIN__) && (defined(__i386__) || defined(__x86_64__) || defined(__ppc__))) || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__))) || ((defined(__NetBSD__) || defined(__OpenBSD__)) && defined(__x86_64__)) || (defined(__sun__) && defined(__x86_64__)) #if defined(__linux__) && defined(__i386__) @@ -498,18 +493,8 @@ static int mask_fpe(void) #define mc_pc(mc) ((mc)->gregs[REG_RIP]) #elif defined(__linux__) && defined(__i386__) #define mc_pc(mc) ((mc)->gregs[REG_EIP]) -#elif defined(__DARWIN__) && defined(__i386__) -#ifdef DARWIN_MODERN_MCONTEXT -#define mc_pc(mc) ((mc)->__ss.__eip) -#else -#define mc_pc(mc) ((mc)->ss.eip) -#endif -#elif defined(__DARWIN__) && defined(__x86_64__) -#ifdef DARWIN_MODERN_MCONTEXT -#define mc_pc(mc) ((mc)->__ss.__rip) -#else -#define mc_pc(mc) ((mc)->ss.rip) -#endif +#elif defined(__DARWIN__) +# error "Floating-point exceptions not supported on MacOS X" #elif defined(__FreeBSD__) && defined(__x86_64__) #define mc_pc(mc) ((mc)->mc_rip) #elif defined(__FreeBSD__) && defined(__i386__) @@ -529,8 +514,7 @@ static void fpe_sig_action(int sig, siginfo_t *si, void *puc) ucontext_t *uc = puc; unsigned long pc; -#if defined(__linux__) -#if defined(__x86_64__) +#if defined(__linux__) && defined(__x86_64__) mcontext_t *mc = &uc->uc_mcontext; fpregset_t fpstate = mc->fpregs; pc = mc_pc(mc); @@ -542,26 +526,26 @@ static void fpe_sig_action(int sig, siginfo_t *si, void *puc) set encoding makes that a poor solution here. */ fpstate->mxcsr = 0x1F80; fpstate->swd &= ~0xFF; -#elif defined(__i386__) +#elif defined(__linux__) && defined(__i386__) mcontext_t *mc = &uc->uc_mcontext; fpregset_t fpstate = mc->fpregs; pc = mc_pc(mc); if ((fpstate->status >> 16) == X86_FXSR_MAGIC) ((struct _fpstate*)fpstate)->mxcsr = 0x1F80; fpstate->sw &= ~0xFF; -#elif defined(__sparc__) && defined(__arch64__) +#elif defined(__linux__) && defined(__sparc__) && defined(__arch64__) /* on SPARC the 3rd parameter points to a sigcontext not a ucontext */ struct sigcontext *sc = (struct sigcontext*)puc; pc = sc->sigc_regs.tpc; sc->sigc_regs.tpc = sc->sigc_regs.tnpc; sc->sigc_regs.tnpc += 4; -#elif defined(__sparc__) +#elif defined(__linux__) && defined(__sparc__) /* on SPARC the 3rd parameter points to a sigcontext not a ucontext */ struct sigcontext *sc = (struct sigcontext*)puc; pc = sc->si_regs.pc; sc->si_regs.pc = sc->si_regs.npc; sc->si_regs.npc = (unsigned long)sc->si_regs.npc + 4; -#elif defined(__powerpc__) +#elif defined(__linux__) && defined(__powerpc__) #if defined(__powerpc64__) mcontext_t *mc = &uc->uc_mcontext; unsigned long *regs = &mc->gp_regs[0]; @@ -572,19 +556,8 @@ static void fpe_sig_action(int sig, siginfo_t *si, void *puc) pc = regs[PT_NIP]; regs[PT_NIP] += 4; regs[PT_FPSCR] = 0x80|0x40|0x10; /* VE, OE, ZE; not UE or XE */ -#endif #elif defined(__DARWIN__) && (defined(__i386__) || defined(__x86_64__)) -#ifdef DARWIN_MODERN_MCONTEXT - mcontext_t mc = uc->uc_mcontext; - pc = mc_pc(mc); - mc->__fs.__fpu_mxcsr = 0x1F80; - *(unsigned short *)&mc->__fs.__fpu_fsw &= ~0xFF; -#else - mcontext_t mc = uc->uc_mcontext; - pc = mc_pc(mc); - mc->fs.fpu_mxcsr = 0x1F80; - *(unsigned short *)&mc->fs.fpu_fsw &= ~0xFF; -#endif /* DARWIN_MODERN_MCONTEXT */ +# error "Floating-point exceptions not supported on MacOS X" #elif defined(__DARWIN__) && defined(__ppc__) mcontext_t mc = uc->uc_mcontext; pc = mc->ss.srr0; @@ -640,7 +613,7 @@ static void fpe_sig_action(int sig, siginfo_t *si, void *puc) #endif #if 0 { - char buf[64]; + char buf[128]; snprintf(buf, sizeof buf, "%s: FPE at %p\r\n", __FUNCTION__, (void*)pc); write(2, buf, strlen(buf)); } @@ -667,7 +640,7 @@ static void fpe_sig_handler(int sig) static void erts_thread_catch_fp_exceptions(void) { - sys_sigset(SIGFPE, fpe_sig_handler); + sys_signal(SIGFPE, fpe_sig_handler); unmask_fpe(); } @@ -725,7 +698,8 @@ int erts_sys_block_fpe(void) void erts_sys_unblock_fpe(int unmasked) { - unmask_fpe_conditional(unmasked); + if (unmasked) + unmask_fpe(); } #endif @@ -837,11 +811,6 @@ sys_chars_to_double(char* buf, double* fp) int matherr(struct exception *exc) { -#if !defined(NO_FPE_SIGNALS) - volatile unsigned long *fpexnp = erts_get_current_fp_exception(); - if (fpexnp != NULL) - *fpexnp = (unsigned long)__builtin_return_address(0); -#endif return 1; } diff --git a/erts/emulator/sys/unix/sys_time.c b/erts/emulator/sys/unix/sys_time.c index fcce54a2c4..4f26639703 100644 --- a/erts/emulator/sys/unix/sys_time.c +++ b/erts/emulator/sys/unix/sys_time.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2005-2009. All Rights Reserved. + * Copyright Ericsson AB 2005-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -31,8 +32,21 @@ # undef _FILE_OFFSET_BITS #endif +#include <stdlib.h> #include "sys.h" #include "global.h" +#include "erl_os_monotonic_time_extender.h" + +#undef ERTS_HAVE_ERTS_OS_TIMES_IMPL__ +#undef ERTS_HAVE_ERTS_SYS_HRTIME_IMPL__ + +#if defined(OS_MONOTONIC_TIME_USING_MACH_CLOCK_GET_TIME) \ + || defined(OS_SYSTEM_TIME_USING_MACH_CLOCK_GET_TIME) \ + || defined(SYS_HRTIME_USING_MACH_CLOCK_GET_TIME) +# include <mach/clock.h> +# include <mach/mach.h> +# define ERTS_MACH_CLOCKS +#endif #ifdef NO_SYSCONF # define TICKS_PER_SEC() HZ @@ -51,11 +65,27 @@ # include <fcntl.h> #endif +static void init_perf_counter(void); + /******************* Routines for time measurement *********************/ -int erts_ticks_per_sec = 0; /* Will be SYS_CLK_TCK in erl_unix_sys.h */ -int erts_ticks_per_sec_wrap = 0; /* Will be SYS_CLK_TCK_WRAP */ -static int ticks_bsr = 0; /* Shift wrapped tick value this much to the right */ +#undef ERTS_SYS_TIME_INTERNAL_STATE_WRITE_FREQ__ +#undef ERTS_SYS_TIME_INTERNAL_STATE_READ_ONLY__ +#undef ERTS_SYS_TIME_INTERNAL_STATE_READ_MOSTLY__ + +#if defined(OS_MONOTONIC_TIME_USING_TIMES) + +static Uint32 +get_tick_count(void) +{ + struct tms unused; + return (Uint32) times(&unused); +} + +#define ERTS_SYS_TIME_INTERNAL_STATE_READ_ONLY__ +#define ERTS_SYS_TIME_INTERNAL_STATE_READ_MOSTLY__ + +#endif /* * init timers, chose a tick length, and return it. @@ -63,40 +93,939 @@ static int ticks_bsr = 0; /* Shift wrapped tick value this much to the right */ * does almost everything. Other platforms have to * emulate Unix in this sense. */ -int sys_init_time(void) + +ErtsSysTimeData__ erts_sys_time_data__ erts_align_attribute(ERTS_CACHE_LINE_SIZE); + +#if defined(__linux__) && defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) + +#define ERTS_SYS_TIME_INTERNAL_STATE_WRITE_FREQ__ + +static ErtsMonotonicTime clock_gettime_monotonic(void); +static ErtsMonotonicTime clock_gettime_monotonic_verified(void); +#if defined(HAVE_CLOCK_GETTIME_MONOTONIC_RAW) +static ErtsMonotonicTime clock_gettime_monotonic_raw(void); +#endif +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) +static void clock_gettime_times(ErtsMonotonicTime *, ErtsSystemTime *); +static void clock_gettime_times_verified(ErtsMonotonicTime *, ErtsSystemTime *); +#if defined(HAVE_CLOCK_GETTIME_MONOTONIC_RAW) +static void clock_gettime_times_raw(ErtsMonotonicTime *, ErtsSystemTime *); +#endif +#endif + +#endif /* defined(__linux__) && defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) */ + +#ifdef ERTS_MACH_CLOCKS +# define ERTS_SYS_TIME_INTERNAL_STATE_READ_ONLY__ +typedef struct { + clock_id_t id; + clock_serv_t srv; + char *name; +} ErtsMachClock; + +typedef struct { + host_name_port_t host; + struct { + ErtsMachClock monotonic; + ErtsMachClock wall; + } clock; +} ErtsMachClocks; +static void mach_clocks_init(void); +static void mach_clocks_fini(void); +# ifdef HAVE_CLOCK_GET_ATTRIBUTES +# define ERTS_HAVE_MACH_CLOCK_GETRES +static Sint64 +mach_clock_getres(ErtsMachClock *clk); +# endif +#endif /* ERTS_MACH_CLOCKS */ + +#ifdef ERTS_SYS_TIME_INTERNAL_STATE_READ_ONLY__ +struct sys_time_internal_state_read_only__ { +#if defined(OS_MONOTONIC_TIME_USING_TIMES) + int times_shift; +#endif +#ifdef ERTS_MACH_CLOCKS + ErtsMachClocks mach; +#endif +}; +#endif + +#ifdef ERTS_SYS_TIME_INTERNAL_STATE_READ_MOSTLY__ +struct sys_time_internal_state_read_mostly__ { +#if defined(OS_MONOTONIC_TIME_USING_TIMES) + ErtsOsMonotonicTimeExtendState os_mtime_xtnd; +#endif +}; +#endif + +#ifdef ERTS_SYS_TIME_INTERNAL_STATE_WRITE_FREQ__ +struct sys_time_internal_state_write_freq__ { + erts_smp_mtx_t mtx; +#if defined(__linux__) && defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) + ErtsMonotonicTime last_delivered; +#endif +}; +#endif + +#if defined(ERTS_SYS_TIME_INTERNAL_STATE_READ_ONLY__) \ + || defined(ERTS_SYS_TIME_INTERNAL_STATE_WRITE_FREQ__) +static struct { +#ifdef ERTS_SYS_TIME_INTERNAL_STATE_READ_ONLY__ + union { + struct sys_time_internal_state_read_only__ o; + char align__[(((sizeof(struct sys_time_internal_state_read_only__) - 1) + / ASSUMED_CACHE_LINE_SIZE) + 1) + * ASSUMED_CACHE_LINE_SIZE]; + } r; +#endif +#ifdef ERTS_SYS_TIME_INTERNAL_STATE_READ_MOSTLY__ + union { + struct sys_time_internal_state_read_mostly__ m; + char align__[(((sizeof(struct sys_time_internal_state_read_mostly__) - 1) + / ASSUMED_CACHE_LINE_SIZE) + 1) + * ASSUMED_CACHE_LINE_SIZE]; + } wr; +#endif +#ifdef ERTS_SYS_TIME_INTERNAL_STATE_WRITE_FREQ__ + union { + struct sys_time_internal_state_write_freq__ f; + char align__[(((sizeof(struct sys_time_internal_state_write_freq__) - 1) + / ASSUMED_CACHE_LINE_SIZE) + 1) + * ASSUMED_CACHE_LINE_SIZE]; + } w; +#endif +} internal_state erts_align_attribute(ERTS_CACHE_LINE_SIZE); +#endif + +void +sys_init_time(ErtsSysInitTimeResult *init_resp) { +#if defined(ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT) + int major, minor, build, vsn; +#endif +#if defined(ERTS_MACH_CLOCKS) + mach_clocks_init(); +#endif +#if !defined(ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT) + + init_resp->have_os_monotonic_time = 0; + +#else /* defined(ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT) */ + +#ifdef ERTS_HAVE_CORRECTED_OS_MONOTONIC_TIME + init_resp->have_corrected_os_monotonic_time = 1; +#else + init_resp->have_corrected_os_monotonic_time = 0; +#endif + + init_resp->os_monotonic_time_info.resolution = (Uint64) 1000*1000*1000; +#if defined(ERTS_HAVE_MACH_CLOCK_GETRES) && defined(MONOTONIC_CLOCK_ID) + init_resp->os_monotonic_time_info.resolution + = mach_clock_getres(&internal_state.r.o.mach.clock.monotonic); +#elif defined(HAVE_CLOCK_GETRES) && defined(MONOTONIC_CLOCK_ID) + { + struct timespec ts; + if (clock_getres(MONOTONIC_CLOCK_ID, &ts) == 0) { + if (ts.tv_sec == 0 && ts.tv_nsec != 0) + init_resp->os_monotonic_time_info.resolution /= ts.tv_nsec; + else if (ts.tv_sec >= 1) + init_resp->os_monotonic_time_info.resolution = 1; + } + } +#endif + +#ifdef MONOTONIC_CLOCK_ID_STR + init_resp->os_monotonic_time_info.clock_id = MONOTONIC_CLOCK_ID_STR; +#else + init_resp->os_monotonic_time_info.clock_id = NULL; +#endif + + init_resp->os_monotonic_time_info.locked_use = 0; + +#if defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) + init_resp->os_monotonic_time_info.func = "clock_gettime"; +#elif defined(OS_MONOTONIC_TIME_USING_MACH_CLOCK_GET_TIME) + init_resp->os_monotonic_time_info.func = "clock_get_time"; +#elif defined(OS_MONOTONIC_TIME_USING_GETHRTIME) + init_resp->os_monotonic_time_info.func = "gethrtime"; +#elif defined(OS_MONOTONIC_TIME_USING_TIMES) + init_resp->os_monotonic_time_info.func = "times"; +#else +# error Unknown erts_os_monotonic_time() implementation +#endif + + init_resp->have_os_monotonic_time = 1; + + os_version(&major, &minor, &build); + + vsn = ERTS_MK_VSN_INT(major, minor, build); + + +#if defined(__linux__) && defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) + if (vsn >= ERTS_MK_VSN_INT(2, 6, 33)) { + erts_sys_time_data__.r.o.os_monotonic_time = + clock_gettime_monotonic; +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + erts_sys_time_data__.r.o.os_times = + clock_gettime_times; +#endif + } + else { + /* + * Linux versions prior to 2.6.33 have a + * known bug that sometimes cause the NTP + * adjusted monotonic clock to take small + * steps backwards. Use raw monotonic clock + * if it is present; otherwise, fall back + * on locked verification of values. + */ + init_resp->have_corrected_os_monotonic_time = 0; +#if defined(HAVE_CLOCK_GETTIME_MONOTONIC_RAW) + /* We know that CLOCK_MONOTONIC_RAW is defined, + but we don't know if we got a kernel that + supports it. Support for CLOCK_MONOTONIC_RAW + appeared in kernel 2.6.28... */ + if (vsn >= ERTS_MK_VSN_INT(2, 6, 28)) { + erts_sys_time_data__.r.o.os_monotonic_time = + clock_gettime_monotonic_raw; +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + erts_sys_time_data__.r.o.os_times = + clock_gettime_times_raw; +#endif + init_resp->os_monotonic_time_info.clock_id = + "CLOCK_MONOTONIC_RAW"; + } + else +#endif /* defined(HAVE_CLOCK_GETTIME_MONOTONIC_RAW) */ + { + erts_sys_time_data__.r.o.os_monotonic_time = + clock_gettime_monotonic_verified; +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + erts_sys_time_data__.r.o.os_times = + clock_gettime_times_verified; +#endif + erts_smp_mtx_init(&internal_state.w.f.mtx, + "os_monotonic_time"); + internal_state.w.f.last_delivered + = clock_gettime_monotonic(); + init_resp->os_monotonic_time_info.locked_use = 1; + } + } +#else /* !(defined(__linux__) && defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME)) */ + { + char flavor[1024]; + + os_flavor(flavor, sizeof(flavor)); + + if (sys_strcmp(flavor, "sunos") == 0) { + /* + * Don't trust hrtime on multi processors + * on SunOS prior to SunOS 5.8 + */ + if (vsn < ERTS_MK_VSN_INT(5, 8, 0)) { +#if defined(HAVE_SYSCONF) && defined(_SC_NPROCESSORS_CONF) + if (sysconf(_SC_NPROCESSORS_CONF) > 1) +#endif + init_resp->have_os_monotonic_time = 0; + } + } + } +#endif /* !(defined(__linux__) && defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME)) */ + +#endif /* defined(ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT) */ + +#ifdef ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT + init_resp->os_monotonic_time_unit = ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT; +#endif + init_resp->sys_clock_resolution = SYS_CLOCK_RESOLUTION; + /* - * This (erts_ticks_per_sec) is only for times() (CLK_TCK), - * the resolution is always one millisecond.. + * This (erts_sys_time_data__.r.o.ticks_per_sec) is only for + * times() (CLK_TCK), the resolution is always one millisecond.. */ - if ((erts_ticks_per_sec = TICKS_PER_SEC()) < 0) - erl_exit(1, "Can't get clock ticks/sec\n"); - if (erts_ticks_per_sec >= 1000) { - /* Workaround for beta linux kernels, need to be done in runtime - to make erlang run on both 2.4 and 2.5 kernels. In the future, - the kernel ticks might as - well be used as a high res timer instead, but that's for when the - majority uses kernels with HZ == 1024 */ - ticks_bsr = 3; - } else { - ticks_bsr = 0; + if ((erts_sys_time_data__.r.o.ticks_per_sec = TICKS_PER_SEC()) < 0) + erts_exit(ERTS_ABORT_EXIT, "Can't get clock ticks/sec\n"); + +#if defined(OS_MONOTONIC_TIME_USING_TIMES) +#if ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT +# error Time unit is supposed to be determined at runtime... +#endif + { + ErtsMonotonicTime resolution = erts_sys_time_data__.r.o.ticks_per_sec; + ErtsMonotonicTime time_unit = resolution; + int shift = 0; + + while (time_unit < 1000*1000) { + time_unit <<= 1; + shift++; + } + + init_resp->os_monotonic_time_info.resolution = resolution; + init_resp->os_monotonic_time_unit = time_unit; + init_resp->os_monotonic_time_info.extended = 1; + internal_state.r.o.times_shift = shift; + + erts_init_os_monotonic_time_extender(&internal_state.wr.m.os_mtime_xtnd, + get_tick_count, + (1 << 29) / resolution); + } +#endif /* defined(OS_MONOTONIC_TIME_USING_TIMES) */ + +#ifdef WALL_CLOCK_ID_STR + init_resp->os_system_time_info.clock_id = WALL_CLOCK_ID_STR; +#else + init_resp->os_system_time_info.clock_id = NULL; +#endif + + init_resp->os_system_time_info.locked_use = 0; + init_resp->os_system_time_info.resolution = (Uint64) 1000*1000*1000; +#if defined(ERTS_HAVE_MACH_CLOCK_GETRES) && defined(WALL_CLOCK_ID) + init_resp->os_system_time_info.resolution + = mach_clock_getres(&internal_state.r.o.mach.clock.wall); +#elif defined(HAVE_CLOCK_GETRES) && defined(WALL_CLOCK_ID) + { + struct timespec ts; + if (clock_getres(WALL_CLOCK_ID, &ts) == 0) { + if (ts.tv_sec == 0 && ts.tv_nsec != 0) + init_resp->os_system_time_info.resolution /= ts.tv_nsec; + else if (ts.tv_sec >= 1) + init_resp->os_system_time_info.resolution = 1; + } + } +#endif + +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + init_resp->os_system_time_info.func = "clock_gettime"; +#elif defined(OS_SYSTEM_TIME_USING_MACH_CLOCK_GET_TIME) + init_resp->os_system_time_info.func = "clock_get_time"; +#elif defined(OS_SYSTEM_TIME_GETTIMEOFDAY) + init_resp->os_system_time_info.func = "gettimeofday"; + init_resp->os_system_time_info.resolution = 1000*1000; + init_resp->os_system_time_info.clock_id = NULL; +#else +# error Missing erts_os_system_time() implementation +#endif + + init_perf_counter(); + +} + +void +erts_late_sys_init_time(void) +{ +#if defined(OS_MONOTONIC_TIME_USING_TIMES) + erts_late_init_os_monotonic_time_extender(&internal_state.wr.m.os_mtime_xtnd); +#endif +} + +static ERTS_INLINE ErtsSystemTime +adj_stime_time_unit(ErtsSystemTime stime, Uint32 res) +{ + if (res == ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT) + return stime; + if (res == (Uint32) 1000*1000*1000 + && ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT == 1000*1000) + return stime/1000; + if (res == (Uint32) 1000*1000 + && ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT == 1000*1000*1000) + return stime*1000; + return ((ErtsSystemTime) + erts_time_unit_conversion(stime, + (Uint32) res, + (Uint32) ERTS_MONOTONIC_TIME_UNIT)); +} + +#define ERTS_TimeSpec2Sint64(TS) \ + ((((Sint64) (TS)->tv_sec) * ((Sint64) 1000*1000*1000)) \ + + ((Sint64) (TS)->tv_nsec)) + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\ + * POSIX clock_gettime() * +\* */ + +#if defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) \ + || defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + +static ERTS_INLINE Sint64 +posix_clock_gettime(clockid_t id, char *name) +{ + struct timespec ts; + + if (clock_gettime(id, &ts) != 0) { + int err = errno; + char *errstr = err ? strerror(err) : "unknown"; + erts_exit(ERTS_ABORT_EXIT, + "clock_gettime(%s, _) failed: %s (%d)\n", + name, errstr, err); + } + return ERTS_TimeSpec2Sint64(&ts); +} + +#endif /* defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) \ + || defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) */ + +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + +ErtsSystemTime +erts_os_system_time(void) +{ + Sint64 stime = posix_clock_gettime(WALL_CLOCK_ID, + WALL_CLOCK_ID_STR); + return adj_stime_time_unit((ErtsSystemTime) stime, + (Uint32) 1000*1000*1000); +} + +#endif /* defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) */ + +#if defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) + +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + +#define ERTS_HAVE_ERTS_OS_TIMES_IMPL__ + +static ERTS_INLINE void +posix_clock_gettime_times(clockid_t mid, char *mname, + ErtsMonotonicTime *mtimep, + clockid_t sid, char *sname, + ErtsSystemTime *stimep) +{ + struct timespec mts, sts; + int mres, sres, merr, serr; + + mres = clock_gettime(mid, &mts); + merr = errno; + sres = clock_gettime(sid, &sts); + serr = errno; + + if (mres != 0) { + char *errstr = merr ? strerror(merr) : "unknown"; + erts_exit(ERTS_ABORT_EXIT, + "clock_gettime(%s, _) failed: %s (%d)\n", + mname, errstr, merr); + } + if (sres != 0) { + char *errstr = serr ? strerror(serr) : "unknown"; + erts_exit(ERTS_ABORT_EXIT, + "clock_gettime(%s, _) failed: %s (%d)\n", + sname, errstr, serr); + } + + *mtimep = (ErtsMonotonicTime) ERTS_TimeSpec2Sint64(&mts); + *stimep = (ErtsSystemTime) ERTS_TimeSpec2Sint64(&sts); +} + +#endif /* defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) */ + +#if defined(__linux__) + +static ErtsMonotonicTime clock_gettime_monotonic_verified(void) +{ + ErtsMonotonicTime mtime; + + mtime = (ErtsMonotonicTime) posix_clock_gettime(MONOTONIC_CLOCK_ID, + MONOTONIC_CLOCK_ID_STR); + + erts_smp_mtx_lock(&internal_state.w.f.mtx); + if (mtime < internal_state.w.f.last_delivered) + mtime = internal_state.w.f.last_delivered; + else + internal_state.w.f.last_delivered = mtime; + erts_smp_mtx_unlock(&internal_state.w.f.mtx); + + return mtime; +} + +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + +static void clock_gettime_times_verified(ErtsMonotonicTime *mtimep, + ErtsSystemTime *stimep) +{ + posix_clock_gettime_times(MONOTONIC_CLOCK_ID, + MONOTONIC_CLOCK_ID_STR, + mtimep, + WALL_CLOCK_ID, + WALL_CLOCK_ID_STR, + stimep); + + erts_smp_mtx_lock(&internal_state.w.f.mtx); + if (*mtimep < internal_state.w.f.last_delivered) + *mtimep = internal_state.w.f.last_delivered; + else + internal_state.w.f.last_delivered = *mtimep; + erts_smp_mtx_unlock(&internal_state.w.f.mtx); +} + +#endif /* defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) */ + +static ErtsMonotonicTime clock_gettime_monotonic(void) +{ + return (ErtsMonotonicTime) posix_clock_gettime(MONOTONIC_CLOCK_ID, + MONOTONIC_CLOCK_ID_STR); +} + +#if defined(HAVE_CLOCK_GETTIME_MONOTONIC_RAW) + +static ErtsMonotonicTime clock_gettime_monotonic_raw(void) +{ + return (ErtsMonotonicTime) posix_clock_gettime(CLOCK_MONOTONIC_RAW, + "CLOCK_MONOTONIC_RAW"); +} + +#endif /* defined(HAVE_CLOCK_GETTIME_MONOTONIC_RAW) */ + +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + +static void clock_gettime_times(ErtsMonotonicTime *mtimep, + ErtsSystemTime *stimep) +{ + posix_clock_gettime_times(MONOTONIC_CLOCK_ID, + MONOTONIC_CLOCK_ID_STR, + mtimep, + WALL_CLOCK_ID, + WALL_CLOCK_ID_STR, + stimep); +} + +#if defined(HAVE_CLOCK_GETTIME_MONOTONIC_RAW) + +static void clock_gettime_times_raw(ErtsMonotonicTime *mtimep, + ErtsSystemTime *stimep) +{ + posix_clock_gettime_times(CLOCK_MONOTONIC_RAW, + "CLOCK_MONOTONIC_RAW", + mtimep, + WALL_CLOCK_ID, + WALL_CLOCK_ID_STR, + stimep); +} + +#endif /* defined(HAVE_CLOCK_GETTIME_MONOTONIC_RAW) */ + +#endif /* defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) */ + +#else /* !defined(__linux__) */ + +ErtsMonotonicTime erts_os_monotonic_time(void) +{ + return posix_clock_gettime(MONOTONIC_CLOCK_ID, + MONOTONIC_CLOCK_ID_STR); +} + +#if defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) + +void erts_os_times(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep) +{ + posix_clock_gettime_times(MONOTONIC_CLOCK_ID, + MONOTONIC_CLOCK_ID_STR, + mtimep, + WALL_CLOCK_ID, + WALL_CLOCK_ID_STR, + stimep); +} + +#endif /* defined(OS_SYSTEM_TIME_USING_CLOCK_GETTIME) */ + +#endif /* !defined(__linux__) */ + +#endif /* defined(OS_MONOTONIC_TIME_USING_CLOCK_GETTIME) */ + +#if defined(SYS_HRTIME_USING_CLOCK_GETTIME) +# define ERTS_HAVE_ERTS_SYS_HRTIME_IMPL__ + +ErtsSysHrTime +erts_sys_hrtime(void) +{ + return (ErtsSysHrTime) posix_clock_gettime(HRTIME_CLOCK_ID, + HRTIME_CLOCK_ID_STR); +} + +#endif /* defined(SYS_HRTIME_USING_CLOCK_GETTIME) */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\ + * MACH clock_get_time() * +\* */ + +#if defined(ERTS_MACH_CLOCKS) + +static void +mach_clocks_fini(void) +{ + mach_port_t task = mach_task_self(); + mach_port_deallocate(task, internal_state.r.o.mach.host); +#if defined(OS_MONOTONIC_TIME_USING_MACH_CLOCK_GET_TIME) + mach_port_deallocate(task, internal_state.r.o.mach.clock.monotonic.srv); +#endif +#if defined(OS_SYSTEM_TIME_USING_MACH_CLOCK_GET_TIME) + mach_port_deallocate(task, internal_state.r.o.mach.clock.wall.srv); +#endif +} + +static void +mach_clocks_init(void) +{ + kern_return_t kret; + host_name_port_t host; + clock_id_t id; + clock_serv_t *clck_srv_p; + char *name; + + host = internal_state.r.o.mach.host = mach_host_self(); + +#if defined(OS_MONOTONIC_TIME_USING_MACH_CLOCK_GET_TIME) \ + || defined(SYS_HRTIME_USING_MACH_CLOCK_GET_TIME) + id = internal_state.r.o.mach.clock.monotonic.id = MONOTONIC_CLOCK_ID; + name = internal_state.r.o.mach.clock.monotonic.name = MONOTONIC_CLOCK_ID_STR; + clck_srv_p = &internal_state.r.o.mach.clock.monotonic.srv; + kret = host_get_clock_service(host, id, clck_srv_p); + if (kret != KERN_SUCCESS) { + erts_exit(ERTS_ABORT_EXIT, + "host_get_clock_service(_, %s, _) failed\n", + name); + } +#endif + +#if defined(OS_SYSTEM_TIME_USING_MACH_CLOCK_GET_TIME) + id = internal_state.r.o.mach.clock.wall.id = WALL_CLOCK_ID; + name = internal_state.r.o.mach.clock.wall.name = WALL_CLOCK_ID_STR; + clck_srv_p = &internal_state.r.o.mach.clock.wall.srv; + kret = host_get_clock_service(host, id, clck_srv_p); + if (kret != KERN_SUCCESS) { + erts_exit(ERTS_ABORT_EXIT, + "host_get_clock_service(_, %s, _) failed\n", + name); + } +#endif + + if (atexit(mach_clocks_fini) != 0) { + int err = errno; + char *errstr = err ? strerror(err) : "unknown"; + erts_exit(ERTS_ABORT_EXIT, + "Failed to register mach_clocks_fini() " + "for call at exit: %s (%d)\n", + errstr, err); + } +} + +#ifdef ERTS_HAVE_MACH_CLOCK_GETRES + +static Sint64 +mach_clock_getres(ErtsMachClock *clk) +{ + kern_return_t kret; + natural_t attr[1]; + mach_msg_type_number_t cnt; + + cnt = sizeof(attr); + kret = clock_get_attributes(clk->srv, + CLOCK_GET_TIME_RES, + (clock_attr_t) attr, + &cnt); + if (kret != KERN_SUCCESS || cnt != 1) { + erts_exit(ERTS_ABORT_EXIT, + "clock_get_attributes(%s, _) failed\n", + clk->name); + } + + return (Sint64) attr[0]; +} + +#endif /* ERTS_HAVE_MACH_CLOCK_GETRES */ + +static ERTS_INLINE Sint64 +mach_clock_get_time(ErtsMachClock *clk) +{ + kern_return_t kret; + mach_timespec_t time_spec; + + kret = clock_get_time(clk->srv, &time_spec); + if (kret != KERN_SUCCESS) + erts_exit(ERTS_ABORT_EXIT, "clock_get_time(%s, _) failed\n", clk->name); + + return ERTS_TimeSpec2Sint64(&time_spec); +} + +#endif /* defined(ERTS_MACH_CLOCKS) */ + +#if defined(OS_SYSTEM_TIME_USING_MACH_CLOCK_GET_TIME) + +#define ERTS_HAVE_ERTS_OS_TIMES_IMPL__ + +ErtsSystemTime +erts_os_system_time(void) +{ + Sint64 stime = mach_clock_get_time(&internal_state.r.o.mach.clock.wall); + return adj_stime_time_unit((ErtsSystemTime) stime, + (Uint32) 1000*1000*1000); +} + +#endif /* defined(OS_SYSTEM_TIME_USING_MACH_CLOCK_GET_TIME) */ + +#if defined(OS_MONOTONIC_TIME_USING_MACH_CLOCK_GET_TIME) + +ErtsMonotonicTime +erts_os_monotonic_time(void) +{ + return (ErtsMonotonicTime) + mach_clock_get_time(&internal_state.r.o.mach.clock.monotonic); +} + +#if defined(OS_SYSTEM_TIME_USING_MACH_CLOCK_GET_TIME) + +#define ERTS_HAVE_ERTS_OS_TIMES_IMPL__ + +void +erts_os_times(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep) +{ + kern_return_t mkret, skret; + mach_timespec_t mon_time_spec, sys_time_spec; + + mkret = clock_get_time(internal_state.r.o.mach.clock.monotonic.srv, + &mon_time_spec); + skret = clock_get_time(internal_state.r.o.mach.clock.wall.srv, + &sys_time_spec); + + if (mkret != KERN_SUCCESS) + erts_exit(ERTS_ABORT_EXIT, + "clock_get_time(%s, _) failed\n", + internal_state.r.o.mach.clock.monotonic.name); + if (skret != KERN_SUCCESS) + erts_exit(ERTS_ABORT_EXIT, + "clock_get_time(%s, _) failed\n", + internal_state.r.o.mach.clock.wall.name); + + *mtimep = (ErtsMonotonicTime) ERTS_TimeSpec2Sint64(&mon_time_spec); + *stimep = (ErtsSystemTime) ERTS_TimeSpec2Sint64(&sys_time_spec); +} + +#endif /* defined(OS_SYSTEM_TIME_USING_MACH_CLOCK_GET_TIME) */ + +#endif /* defined(OS_MONOTONIC_TIME_USING_MACH_CLOCK_GET_TIME) */ + +#if defined(SYS_HRTIME_USING_MACH_CLOCK_GET_TIME) + +#define ERTS_HAVE_ERTS_SYS_HRTIME_IMPL__ + +ErtsSysHrTime +erts_sys_hrtime(void) +{ + return (ErtsSysHrTime) + mach_clock_get_time(&internal_state.r.o.mach.clock.monotonic); +} + +#endif /* defined(SYS_HRTIME_USING_MACH_CLOCK_GET_TIME) */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\ + * Solaris gethrtime() - OS monotonic time * +\* */ + +#if defined(OS_MONOTONIC_TIME_USING_GETHRTIME) + +ErtsMonotonicTime erts_os_monotonic_time(void) +{ + return (ErtsMonotonicTime) gethrtime(); +} + +#endif /* defined(OS_MONOTONIC_TIME_USING_GETHRTIME) */ + +#if defined(SYS_HRTIME_USING_GETHRTIME) + +#define ERTS_HAVE_ERTS_SYS_HRTIME_IMPL__ + +ErtsSysHrTime +erts_sys_hrtime(void) +{ + return (ErtsSysHrTime) gethrtime(); +} + +#endif /* defined(SYS_HRTIME_USING_GETHRTIME) */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\ + * gettimeofday() - OS system time * +\* */ + +#if defined(OS_SYSTEM_TIME_GETTIMEOFDAY) + +ErtsSystemTime +erts_os_system_time(void) +{ + ErtsSystemTime stime; + struct timeval tv; + + if (gettimeofday(&tv, NULL) != 0) { + int err = errno; + char *errstr = err ? strerror(err) : "unknown"; + erts_exit(ERTS_ABORT_EXIT, + "gettimeofday(_, NULL) failed: %s (%d)\n", + errstr, err); + } + + stime = (ErtsSystemTime) tv.tv_sec; + stime *= (ErtsSystemTime) 1000*1000; + stime += (ErtsSystemTime) tv.tv_usec; + + return adj_stime_time_unit(stime, (Uint32) 1000*1000); +} + +#endif /* defined(OS_SYSTEM_TIME_GETTIMEOFDAY) */ + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\ + * times() - OS monotonic time * +\* */ + +#if defined(OS_MONOTONIC_TIME_USING_TIMES) + +ErtsMonotonicTime +erts_os_monotonic_time(void) +{ + Uint32 ticks = get_tick_count(); + ERTS_CHK_EXTEND_OS_MONOTONIC_TIME(&internal_state.wr.m.os_mtime_xtnd, + ticks); + return ERTS_EXTEND_OS_MONOTONIC_TIME(&internal_state.wr.m.os_mtime_xtnd, + ticks) << internal_state.r.o.times_shift; +} + +#endif + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\ + * Fallbacks * +\* */ + +#ifndef ERTS_HAVE_ERTS_SYS_HRTIME_IMPL__ + +ErtsSysHrTime +erts_sys_hrtime(void) +{ + return (ErtsSysHrTime) ERTS_MONOTONIC_TO_NSEC(erts_os_system_time()); +} + +#endif + +#if !defined(ERTS_HAVE_ERTS_OS_TIMES_IMPL__) \ + && defined(ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT) + +void +erts_os_times(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep) +{ + *mtimep = erts_os_monotonic_time(); + *stimep = erts_os_system_time(); +} + +#endif + +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *\ + * Performance counter functions * +\* */ + + +/* What resolution to spin to in micro seconds */ +#define RESOLUTION 100 +/* How many iterations to spin */ +#define ITERATIONS 1 +/* How many significant figures to round to */ +#define SIGFIGS 3 + +static ErtsSysPerfCounter calculate_perf_counter_unit(void) { + int i; + ErtsSysPerfCounter pre, post; + double value = 0; + double round_factor; +#if defined(HAVE_GETHRTIME) && defined(GETHRTIME_WITH_CLOCK_GETTIME) + struct timespec basetime,comparetime; +#define __GETTIME(arg) clock_gettime(CLOCK_MONOTONIC,arg) +#define __GETUSEC(arg) (arg.tv_nsec / 1000) +#else + SysTimeval basetime,comparetime; +#define __GETTIME(arg) sys_gettimeofday(arg) +#define __GETUSEC(arg) arg.tv_usec +#endif + + for (i = 0; i < ITERATIONS; i++) { + /* Make sure usec just flipped over at current resolution */ + __GETTIME(&basetime); + do { + __GETTIME(&comparetime); + } while ((__GETUSEC(basetime) / RESOLUTION) == (__GETUSEC(comparetime) / RESOLUTION)); + + pre = erts_sys_perf_counter(); + + __GETTIME(&basetime); + do { + __GETTIME(&comparetime); + } while ((__GETUSEC(basetime) / RESOLUTION) == (__GETUSEC(comparetime) / RESOLUTION)); + + post = erts_sys_perf_counter(); + + value += post - pre; } - erts_ticks_per_sec_wrap = (erts_ticks_per_sec >> ticks_bsr); - return SYS_CLOCK_RESOLUTION; + /* After this value is ticks per us */ + value /= (RESOLUTION*ITERATIONS); + + /* We round to 3 significant figures */ + round_factor = pow(10.0, SIGFIGS - ceil(log10(value))); + value = ((ErtsSysPerfCounter)(value * round_factor + 0.5)) / round_factor; + + /* convert to ticks per second */ + return 1000000 * value; } -clock_t sys_times_wrap(void) +static int have_rdtscp(void) { - SysTimes dummy; - clock_t result = (sys_times(&dummy) >> ticks_bsr); - return result; +#if defined(ETHR_X86_RUNTIME_CONF__) + /* On early x86 cpu's the tsc varies with the current speed of the cpu, + which means that the time per tick vary depending on the current + load of the cpu. We do not want this as it would give very scewed + numbers when the cpu is mostly idle. + The linux kernel seems to think that checking for constant and + reliable is enough to trust the counter so we do the same. + + If this test is not good enough, I don't know what we'll do. + Maybe fallback on erts_sys_hrtime always, but that would be a shame as + rdtsc is about 3 times faster than hrtime... */ + return ETHR_X86_RUNTIME_CONF_HAVE_CONSTANT_TSC__ && + ETHR_X86_RUNTIME_CONF_HAVE_TSC_RELIABLE__; +#else + return 0; +#endif } +static ErtsSysPerfCounter rdtsc(void) +{ + /* It may have been a good idea to put the cpuid instruction before + the rdtsc, but I decided against it because it is not really + needed for msacc, and it slows it down by quite a bit (5-7 times slower). + As a result though, this timestamp becomes much less + accurate as it might be re-ordered to be executed way before or after this + function is called. + */ + ErtsSysPerfCounter ts; +#if defined(__x86_64__) + __asm__ __volatile__ ("rdtsc\n\t" + "shl $32, %%rdx\n\t" + "or %%rdx, %0" : "=a" (ts) : : "rdx"); +#elif defined(__i386__) + __asm__ __volatile__ ("rdtsc\n\t" + : "=A" (ts) ); +#endif + return ts; +} +static void init_perf_counter(void) +{ + if (have_rdtscp()) { + erts_sys_time_data__.r.o.perf_counter = rdtsc; + erts_sys_time_data__.r.o.perf_counter_unit = calculate_perf_counter_unit(); + } else { + erts_sys_time_data__.r.o.perf_counter = erts_sys_hrtime; + erts_sys_time_data__.r.o.perf_counter_unit = ERTS_HRTIME_UNIT; + } +} +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ #ifdef HAVE_GETHRVTIME_PROCFS_IOCTL +/* The code below only has effect on solaris < 10, + needed in order for gehhrvtime to work properly */ int sys_start_hrvtime(void) { long msacct = PR_MSACCT; diff --git a/erts/emulator/sys/unix/sys_uds.c b/erts/emulator/sys/unix/sys_uds.c new file mode 100644 index 0000000000..dd0a3b03ff --- /dev/null +++ b/erts/emulator/sys/unix/sys_uds.c @@ -0,0 +1,155 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2002-2016. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +#include "sys_uds.h" + +int +sys_uds_readv(int fd, struct iovec *iov, size_t iov_len, + int *fds, int fd_count, int flags) { + struct msghdr msg; + struct cmsghdr *cmsg = NULL; + char ancillary_buff[256] = {0}; + int res, i = 0; + + /* setup a place to fill in message contents */ + memset(&msg, 0, sizeof(struct msghdr)); + msg.msg_iov = iov; + msg.msg_iovlen = iov_len; + + /* provide space for the ancillary data */ + msg.msg_control = ancillary_buff; + msg.msg_controllen = sizeof(ancillary_buff); + + if((res = recvmsg(fd, &msg, flags)) < 0) { +#if defined(__APPLE__) && defined(__MACH__) && !defined(__DARWIN__) + /* When some OS X versions run out of fd's + they give EMSGSIZE instead of EMFILE. + We remap this as we want the correct + error to appear for the user */ + if (errno == EMSGSIZE) + errno = EMFILE; +#endif + return res; + } + + if((msg.msg_flags & MSG_CTRUNC) == MSG_CTRUNC) + { + /* We assume that we have given enough space for any header + that are sent to us. So the only remaining reason to get + this flag set is if the caller has run out of file descriptors. + */ + errno = EMFILE; + return -1; + } + + for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg) ) { + if ((cmsg->cmsg_level == SOL_SOCKET) && + (cmsg->cmsg_type == SCM_RIGHTS)) { + int *cmsg_data = (int *)CMSG_DATA(cmsg); + while ((char*)cmsg_data < (char*)cmsg + cmsg->cmsg_len) { + if (i < fd_count) { + fds[i++] = *cmsg_data++; + } else { + /* for some strange reason, we have received more FD's + than we wanted... close them if we are not running + debug. */ + if(i >= fd_count) abort(); + close(*cmsg_data++); + } + } + } + } + + return res; +} + +int +sys_uds_read(int fd, char *buff, size_t len, + int *fds, int fd_count, int flags) { + struct iovec iov; + iov.iov_base = buff; + iov.iov_len = len; + return sys_uds_readv(fd, &iov, 1, fds, fd_count, flags); +} + + +int +sys_uds_writev(int fd, struct iovec *iov, size_t iov_len, + int *fds, int fd_count, int flags) { + + struct msghdr msg; + struct cmsghdr *cmsg = NULL; + int res, i; + + /* initialize socket message */ + memset(&msg, 0, sizeof(struct msghdr)); + + /* We flatten the iov if it is too long */ + if (iov_len > MAXIOV) { + int size = 0; + char *buff; + for (i = 0; i < iov_len; i++) + size += iov[i].iov_len; + buff = malloc(size); + + for (i = 0; i < iov_len; i++) { + memcpy(buff, iov[i].iov_base, iov[i].iov_len); + buff += iov[i].iov_len; + } + + iov[0].iov_base = buff - size; + iov[0].iov_len = size; + msg.msg_iov = iov; + msg.msg_iovlen = 1; + } else { + msg.msg_iov = iov; + msg.msg_iovlen = iov_len; + } + + /* initialize the ancillary data */ + msg.msg_control = calloc(1, CMSG_SPACE(sizeof(int) * fd_count)); + msg.msg_controllen = CMSG_SPACE(sizeof(int) * fd_count); + + /* copy the fd array into the ancillary data */ + cmsg = CMSG_FIRSTHDR(&msg); + if(!cmsg) abort(); + cmsg->cmsg_level = SOL_SOCKET; + cmsg->cmsg_type = SCM_RIGHTS; + cmsg->cmsg_len = CMSG_LEN(sizeof(int) * fd_count); + memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * fd_count); + + res = sendmsg(fd, &msg, flags); + + if (iov_len > MAXIOV) + free(iov[0].iov_base); + + free(msg.msg_control); + + return res; +} + +int +sys_uds_write(int fd, char *buff, size_t len, + int *fds, int fd_count, int flags) { + struct iovec iov; + iov.iov_base = buff; + iov.iov_len = len; + return sys_uds_writev(fd, &iov, 1, fds, fd_count, flags); +} diff --git a/erts/emulator/sys/unix/sys_uds.h b/erts/emulator/sys/unix/sys_uds.h new file mode 100644 index 0000000000..a598102d5c --- /dev/null +++ b/erts/emulator/sys/unix/sys_uds.h @@ -0,0 +1,57 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2002-2016. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +#ifndef _ERL_UNIX_UDS_H +#define _ERL_UNIX_UDS_H + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#if defined(__sun__) && !defined(_XOPEN_SOURCE) +#define _XOPEN_SOURCE 500 +#endif + +#include <limits.h> + +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/uio.h> + +#if defined IOV_MAX +#define MAXIOV IOV_MAX +#elif defined UIO_MAXIOV +#define MAXIOV UIO_MAXIOV +#else +#define MAXIOV 16 +#endif + +#include "sys.h" + +int sys_uds_readv(int fd, struct iovec *iov, size_t iov_len, + int *fds, int fd_count, int flags); +int sys_uds_read(int fd, char *buff, size_t len, + int *fds, int fd_count, int flags); +int sys_uds_writev(int fd, struct iovec *iov, size_t iov_len, + int *fds, int fd_count, int flags); +int sys_uds_write(int fd, char *buff, size_t len, + int *fds, int fd_count, int flags); + +#endif /* #ifndef _ERL_UNIX_UDS_H */ diff --git a/erts/emulator/sys/win32/dosmap.c b/erts/emulator/sys/win32/dosmap.c index 15416a66c5..95fbba384b 100644 --- a/erts/emulator/sys/win32/dosmap.c +++ b/erts/emulator/sys/win32/dosmap.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1998-2009. All Rights Reserved. + * Copyright Ericsson AB 1998-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ diff --git a/erts/emulator/sys/win32/driver_int.h b/erts/emulator/sys/win32/driver_int.h index 97e188816e..50097d3fd2 100644 --- a/erts/emulator/sys/win32/driver_int.h +++ b/erts/emulator/sys/win32/driver_int.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1997-2009. All Rights Reserved. + * Copyright Ericsson AB 1997-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ diff --git a/erts/emulator/sys/win32/erl_main.c b/erts/emulator/sys/win32/erl_main.c index 5471bffb52..9eee300f48 100644 --- a/erts/emulator/sys/win32/erl_main.c +++ b/erts/emulator/sys/win32/erl_main.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2000-2009. All Rights Reserved. + * Copyright Ericsson AB 2000-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ diff --git a/erts/emulator/sys/win32/erl_poll.c b/erts/emulator/sys/win32/erl_poll.c index 7a1d129cd5..f23c7ab03d 100644 --- a/erts/emulator/sys/win32/erl_poll.c +++ b/erts/emulator/sys/win32/erl_poll.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2007-2011. All Rights Reserved. + * Copyright Ericsson AB 2007-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -25,6 +26,8 @@ #include "sys.h" #include "erl_alloc.h" #include "erl_poll.h" +#include "erl_time.h" +#include "erl_msacc.h" /* * Some debug macros @@ -285,7 +288,7 @@ struct ErtsPollSet_ { #ifdef ERTS_SMP erts_smp_mtx_t mtx; #endif - erts_smp_atomic32_t timeout; + erts_atomic64_t timeout_time; }; #ifdef ERTS_SMP @@ -363,6 +366,26 @@ do { \ wait_standby(PS); \ } while(0) +static ERTS_INLINE void +init_timeout_time(ErtsPollSet ps) +{ + erts_atomic64_init_nob(&ps->timeout_time, + (erts_aint64_t) ERTS_MONOTONIC_TIME_MAX); +} + +static ERTS_INLINE void +set_timeout_time(ErtsPollSet ps, ErtsMonotonicTime time) +{ + erts_atomic64_set_relb(&ps->timeout_time, + (erts_aint64_t) time); +} + +static ERTS_INLINE ErtsMonotonicTime +get_timeout_time(ErtsPollSet ps) +{ + return (ErtsMonotonicTime) erts_atomic64_read_acqb(&ps->timeout_time); +} + #define ERTS_POLL_NOT_WOKEN ((erts_aint32_t) 0) #define ERTS_POLL_WOKEN_IO_READY ((erts_aint32_t) 1) #define ERTS_POLL_WOKEN_INTR ((erts_aint32_t) 2) @@ -401,7 +424,7 @@ static ERTS_INLINE int wakeup_cause(ErtsPollSet ps) { int res; - erts_aint32_t wakeup_state = erts_atomic32_read_nob(&ps->wakeup_state); + erts_aint32_t wakeup_state = erts_atomic32_read_acqb(&ps->wakeup_state); switch (wakeup_state) { case ERTS_POLL_WOKEN_IO_READY: res = 0; @@ -414,7 +437,7 @@ wakeup_cause(ErtsPollSet ps) break; default: res = 0; - erl_exit(ERTS_ABORT_EXIT, + erts_exit(ERTS_ABORT_EXIT, "%s:%d: Internal error: Invalid wakeup_state=%d\n", __FILE__, __LINE__, (int) wakeup_state); } @@ -422,15 +445,29 @@ wakeup_cause(ErtsPollSet ps) } static ERTS_INLINE DWORD -poll_wait_timeout(ErtsPollSet ps, SysTimeval *tvp) +poll_wait_timeout(ErtsPollSet ps, ErtsMonotonicTime timeout_time) { - time_t timeout = tvp->tv_sec * 1000 + tvp->tv_usec / 1000; + ErtsMonotonicTime current_time, diff_time, timeout; - if (timeout <= 0) { + if (timeout_time == ERTS_POLL_NO_TIMEOUT) { + no_timeout: + set_timeout_time(ps, ERTS_MONOTONIC_TIME_MIN); woke_up(ps); return (DWORD) 0; } + current_time = erts_get_monotonic_time(NULL); + diff_time = timeout_time - current_time; + if (diff_time <= 0) + goto no_timeout; + + /* Round up to nearest milli second */ + timeout = (ERTS_MONOTONIC_TO_MSEC(diff_time - 1) + 1); + if (timeout > INT_MAX) + timeout = INT_MAX; /* Also prevents DWORD overflow */ + + set_timeout_time(ps, current_time + ERTS_MSEC_TO_MONOTONIC(timeout)); + ResetEvent(ps->event_io_ready); /* * Since we don't know the internals of ResetEvent() we issue @@ -442,10 +479,6 @@ poll_wait_timeout(ErtsPollSet ps, SysTimeval *tvp) if (erts_atomic32_read_nob(&ps->wakeup_state) != ERTS_POLL_NOT_WOKEN) return (DWORD) 0; - if (timeout > ((time_t) ERTS_AINT32_T_MAX)) - timeout = ERTS_AINT32_T_MAX; /* Also prevents DWORD overflow */ - - erts_smp_atomic32_set_relb(&ps->timeout, (erts_aint32_t) timeout); return (DWORD) timeout; } @@ -454,9 +487,8 @@ wake_poller(ErtsPollSet ps, int io_ready) { erts_aint32_t wakeup_state; if (io_ready) { - /* We may set the event multiple times. This is, however, harmless. */ - wakeup_state = erts_atomic32_read_nob(&ps->wakeup_state); - erts_atomic32_set_relb(&ps->wakeup_state, ERTS_POLL_WOKEN_IO_READY); + wakeup_state = erts_atomic32_xchg_relb(&ps->wakeup_state, + ERTS_POLL_WOKEN_IO_READY); } else { ERTS_THR_MEMORY_BARRIER; @@ -544,7 +576,7 @@ static void signal_standby(ErtsPollSet ps) --(ps->standby_wait_counter); if (ps->standby_wait_counter < 0) { LeaveCriticalSection(&(ps->standby_crit)); - erl_exit(1,"Standby signalled by more threads than expected"); + erts_exit(ERTS_ERROR_EXIT,"Standby signalled by more threads than expected"); } if (!(ps->standby_wait_counter)) { SetEvent(ps->standby_wait_event); @@ -706,7 +738,7 @@ static void *break_waiter(void *param) erts_mtx_unlock(&break_waiter_lock); break; default: - erl_exit(1,"Unexpected event in break_waiter"); + erts_exit(ERTS_ERROR_EXIT,"Unexpected event in break_waiter"); } } } @@ -1012,12 +1044,12 @@ void erts_poll_interrupt(ErtsPollSet ps, int set /* bool */) void erts_poll_interrupt_timed(ErtsPollSet ps, int set /* bool */, - erts_short_time_t msec) + ErtsMonotonicTime timeout_time) { - HARDTRACEF(("In erts_poll_interrupt_timed(%d,%ld)",set,msec)); + HARDTRACEF(("In erts_poll_interrupt_timed(%d,%ld)",set,timeout_time)); if (!set) reset_interrupt(ps); - else if (erts_smp_atomic32_read_acqb(&ps->timeout) > (erts_aint32_t) msec) + else if (get_timeout_time(ps) > timeout_time) set_interrupt(ps); HARDTRACEF(("Out erts_poll_interrupt_timed")); } @@ -1085,14 +1117,14 @@ void erts_poll_controlv(ErtsPollSet ps, pcev[i].events, pcev[i].on); } - ERTS_POLLSET_LOCK(ps); + ERTS_POLLSET_UNLOCK(ps); HARDTRACEF(("Out erts_poll_controlv")); } int erts_poll_wait(ErtsPollSet ps, ErtsPollResFd pr[], int *len, - SysTimeval *tvp) + ErtsMonotonicTime timeout_time) { int no_fds; DWORD timeout; @@ -1125,7 +1157,7 @@ int erts_poll_wait(ErtsPollSet ps, HARDDEBUGF(("Oups!")); /* Oups, got signalled before we took the lock, can't reset */ if(!is_io_ready(ps)) { - erl_exit(1,"Internal error: " + erts_exit(ERTS_ERROR_EXIT,"Internal error: " "Inconsistent io structures in erl_poll.\n"); } START_WAITER(ps,w); @@ -1149,23 +1181,26 @@ int erts_poll_wait(ErtsPollSet ps, no_fds = ERTS_POLL_MAX_RES; #endif - timeout = poll_wait_timeout(ps, tvp); + timeout = poll_wait_timeout(ps, timeout_time); /*HARDDEBUGF(("timeout = %ld",(long) timeout));*/ if (timeout > 0 && !erts_atomic32_read_nob(&break_waiter_state)) { HANDLE harr[2] = {ps->event_io_ready, break_happened_event}; int num_h = 2; + ERTS_MSACC_PUSH_STATE_M(); HARDDEBUGF(("Start waiting %d [%d]",num_h, (int) timeout)); ERTS_POLLSET_UNLOCK(ps); #ifdef ERTS_SMP erts_thr_progress_prepare_wait(NULL); #endif + ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP); WaitForMultipleObjects(num_h, harr, FALSE, timeout); #ifdef ERTS_SMP erts_thr_progress_finalize_wait(NULL); #endif + ERTS_MSACC_POP_STATE_M(); ERTS_POLLSET_LOCK(ps); HARDDEBUGF(("Stop waiting %d [%d]",num_h, (int) timeout)); woke_up(ps); @@ -1183,7 +1218,7 @@ int erts_poll_wait(ErtsPollSet ps, ERTS_SET_BREAK_REQUESTED; break; case BREAK_WAITER_GOT_HALT: - erl_exit(0,""); + erts_exit(0,""); break; default: break; @@ -1242,7 +1277,7 @@ int erts_poll_wait(ErtsPollSet ps, erts_mtx_unlock(&w->mtx); } done: - erts_smp_atomic32_set_nob(&ps->timeout, ERTS_AINT32_T_MAX); + set_timeout_time(ps, ERTS_MONOTONIC_TIME_MAX); *len = num; ERTS_POLLSET_UNLOCK(ps); HARDTRACEF(("Out erts_poll_wait")); @@ -1326,7 +1361,7 @@ ErtsPollSet erts_poll_create_pollset(void) #ifdef ERTS_SMP erts_smp_mtx_init(&ps->mtx, "pollset"); #endif - erts_smp_atomic32_init_nob(&ps->timeout, ERTS_AINT32_T_MAX); + init_timeout_time(ps); HARDTRACEF(("Out erts_poll_create_pollset")); return ps; diff --git a/erts/emulator/sys/win32/erl_win32_sys_ddll.c b/erts/emulator/sys/win32/erl_win32_sys_ddll.c index 4c8d83ab16..274133a346 100644 --- a/erts/emulator/sys/win32/erl_win32_sys_ddll.c +++ b/erts/emulator/sys/win32/erl_win32_sys_ddll.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2006-2013. All Rights Reserved. + * Copyright Ericsson AB 2006-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -38,7 +39,7 @@ #include "erl_nif.h" #define EXT_LEN 4 -#define FILE_EXT ".dll" +#define FILE_EXT_WCHAR L".dll" static DWORD tls_index = 0; static TWinDynDriverCallbacks wddc; @@ -51,17 +52,22 @@ void erl_sys_ddll_init(void) { #define ERL_NIF_API_FUNC_DECL(RET,NAME,ARGS) nif_callbacks.NAME = NAME #include "erl_nif_api_funcs.h" #undef ERL_NIF_API_FUNC_DECL - + nif_callbacks.erts_alc_test = erts_alc_test; + return; } /* * Open a shared object + * Expecting 'full_name' as an UTF-8 string. */ -int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err) +int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err) { + HINSTANCE hinstance; int len; - char dlname[MAXPATHLEN + 1]; + wchar_t* wcp; + Sint used; + int code; if ((len = sys_strlen(full_name)) >= MAXPATHLEN - EXT_LEN) { if (err != NULL) { @@ -69,10 +75,26 @@ int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* } return ERL_DE_LOAD_ERROR_NAME_TO_LONG; } - sys_strcpy(dlname, full_name); - sys_strcpy(dlname+len, FILE_EXT); - return erts_sys_ddll_open_noext(dlname, handle, err); + + wcp = (wchar_t*)erts_convert_filename_to_wchar((byte*)full_name, len, + NULL, 0, + ERTS_ALC_T_TMP, &used, EXT_LEN); + wcscpy(&wcp[used/2 - 1], FILE_EXT_WCHAR); + + if ((hinstance = LoadLibraryW(wcp)) == NULL) { + code = ERL_DE_DYNAMIC_ERROR_OFFSET - GetLastError(); + if (err != NULL) { + err->str = erts_sys_ddll_error(code); + } + } + else { + code = ERL_DE_NO_ERROR; + *handle = (void *) hinstance; + } + erts_free(ERTS_ALC_T_TMP, wcp); + return code; } + int erts_sys_ddll_open_noext(char *dlname, void **handle, ErtsSysDdllError* err) { HINSTANCE hinstance; diff --git a/erts/emulator/sys/win32/erl_win_dyn_driver.h b/erts/emulator/sys/win32/erl_win_dyn_driver.h index a7c53c904d..6f28d513c2 100644 --- a/erts/emulator/sys/win32/erl_win_dyn_driver.h +++ b/erts/emulator/sys/win32/erl_win_dyn_driver.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2003-2013. All Rights Reserved. + * Copyright Ericsson AB 2003-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -80,8 +81,8 @@ WDD_TYPEDEF(int, erl_drv_output_term, (ErlDrvTermData, ErlDrvTermData*, int)); WDD_TYPEDEF(int, driver_output_term, (ErlDrvPort, ErlDrvTermData*, int)); WDD_TYPEDEF(int, erl_drv_send_term, (ErlDrvTermData, ErlDrvTermData, ErlDrvTermData*, int)); WDD_TYPEDEF(int, driver_send_term, (ErlDrvPort, ErlDrvTermData, ErlDrvTermData*, int)); +WDD_TYPEDEF(unsigned int, driver_async_port_key, (ErlDrvPort)); WDD_TYPEDEF(long, driver_async, (ErlDrvPort,unsigned int*,void (*)(void*),void*,void (*)(void*))); -WDD_TYPEDEF(int, driver_async_cancel, (unsigned int)); WDD_TYPEDEF(int, driver_lock_driver, (ErlDrvPort)); WDD_TYPEDEF(void *, driver_dl_open, (char *)); WDD_TYPEDEF(void *, driver_dl_sym, (void *, char *)); @@ -102,6 +103,11 @@ WDD_TYPEDEF(ErlDrvSInt, driver_pdl_inc_refc, (ErlDrvPDL)); WDD_TYPEDEF(ErlDrvSInt, driver_pdl_dec_refc, (ErlDrvPDL)); WDD_TYPEDEF(void, driver_system_info, (ErlDrvSysInfo *, size_t)); WDD_TYPEDEF(int, driver_get_now, (ErlDrvNowData *)); +WDD_TYPEDEF(ErlDrvTime, erl_drv_monotonic_time, (ErlDrvTimeUnit)); +WDD_TYPEDEF(ErlDrvTime, erl_drv_time_offset, (ErlDrvTimeUnit)); +WDD_TYPEDEF(ErlDrvTime, erl_drv_convert_time_unit, (ErlDrvTime, + ErlDrvTimeUnit, + ErlDrvTimeUnit)); WDD_TYPEDEF(int, driver_monitor_process, (ErlDrvPort port, ErlDrvTermData process, ErlDrvMonitor *monitor)); @@ -144,8 +150,8 @@ WDD_TYPEDEF(ErlDrvTid, erl_drv_thread_self, (void)); WDD_TYPEDEF(int, erl_drv_equal_tids, (ErlDrvTid tid1, ErlDrvTid tid2)); WDD_TYPEDEF(void, erl_drv_thread_exit, (void *resp)); WDD_TYPEDEF(int, erl_drv_thread_join, (ErlDrvTid, void **respp)); -WDD_TYPEDEF(int, erl_drv_putenv, (char *key, char *value)); -WDD_TYPEDEF(int, erl_drv_getenv, (char *key, char *value, size_t *value_size)); +WDD_TYPEDEF(int, erl_drv_putenv, (const char *key, char *value)); +WDD_TYPEDEF(int, erl_drv_getenv, (const char *key, char *value, size_t *value_size)); typedef struct { WDD_FTYPE(null_func) *null_func; @@ -197,8 +203,8 @@ typedef struct { WDD_FTYPE(driver_output_term) *driver_output_term; WDD_FTYPE(erl_drv_send_term) *erl_drv_send_term; WDD_FTYPE(driver_send_term) *driver_send_term; + WDD_FTYPE(driver_async_port_key) *driver_async_port_key; WDD_FTYPE(driver_async) *driver_async; - WDD_FTYPE(driver_async_cancel) *driver_async_cancel; WDD_FTYPE(driver_lock_driver) *driver_lock_driver; WDD_FTYPE(driver_dl_open) *driver_dl_open; WDD_FTYPE(driver_dl_sym) *driver_dl_sym; @@ -216,6 +222,9 @@ typedef struct { WDD_FTYPE(driver_pdl_dec_refc) *driver_pdl_dec_refc; WDD_FTYPE(driver_system_info) *driver_system_info; WDD_FTYPE(driver_get_now) *driver_get_now; + WDD_FTYPE(erl_drv_monotonic_time) *erl_drv_monotonic_time; + WDD_FTYPE(erl_drv_time_offset) *erl_drv_time_offset; + WDD_FTYPE(erl_drv_convert_time_unit) *erl_drv_convert_time_unit; WDD_FTYPE(driver_monitor_process) *driver_monitor_process; WDD_FTYPE(driver_demonitor_process) *driver_demonitor_process; WDD_FTYPE(driver_get_monitored_process) *driver_get_monitored_process; @@ -308,8 +317,8 @@ extern TWinDynDriverCallbacks WinDynDriverCallbacks; #define driver_output_term (WinDynDriverCallbacks.driver_output_term) #define erl_drv_send_term (WinDynDriverCallbacks.erl_drv_send_term) #define driver_send_term (WinDynDriverCallbacks.driver_send_term) +#define driver_async_port_key (WinDynDriverCallbacks.driver_async_port_key) #define driver_async (WinDynDriverCallbacks.driver_async) -#define driver_async_cancel (WinDynDriverCallbacks.driver_async_cancel) #define driver_lock_driver (WinDynDriverCallbacks.driver_lock_driver) #define driver_dl_open (WinDynDriverCallbacks.driver_dl_open) #define driver_dl_sym (WinDynDriverCallbacks.driver_dl_sym) @@ -327,6 +336,9 @@ extern TWinDynDriverCallbacks WinDynDriverCallbacks; #define driver_pdl_dec_refc (WinDynDriverCallbacks.driver_pdl_dec_refc) #define driver_system_info (WinDynDriverCallbacks.driver_system_info) #define driver_get_now (WinDynDriverCallbacks.driver_get_now) +#define erl_drv_monotonic_time (WinDynDriverCallbacks.erl_drv_monotonic_time) +#define erl_drv_time_offset (WinDynDriverCallbacks.erl_drv_time_offset) +#define erl_drv_convert_time_unit (WinDynDriverCallbacks.erl_drv_convert_time_unit) #define driver_monitor_process \ (WinDynDriverCallbacks.driver_monitor_process) #define driver_demonitor_process \ @@ -443,8 +455,8 @@ do { \ ((W).driver_output_term) = driver_output_term; \ ((W).erl_drv_send_term) = erl_drv_send_term; \ ((W).driver_send_term) = driver_send_term; \ +((W).driver_async_port_key) = driver_async_port_key; \ ((W).driver_async) = driver_async; \ -((W).driver_async_cancel) = driver_async_cancel; \ ((W).driver_lock_driver) = driver_lock_driver; \ ((W).driver_dl_open) = driver_dl_open; \ ((W).driver_dl_sym) = driver_dl_sym; \ @@ -462,6 +474,9 @@ do { \ ((W).driver_pdl_dec_refc) = driver_pdl_dec_refc; \ ((W).driver_system_info) = driver_system_info; \ ((W).driver_get_now) = driver_get_now; \ +((W).erl_drv_monotonic_time) = erl_drv_monotonic_time; \ +((W).erl_drv_time_offset) = erl_drv_time_offset; \ +((W).erl_drv_convert_time_unit) = erl_drv_convert_time_unit; \ ((W).driver_monitor_process) = driver_monitor_process; \ ((W).driver_demonitor_process) = driver_demonitor_process; \ ((W).driver_get_monitored_process) = driver_get_monitored_process; \ diff --git a/erts/emulator/sys/win32/erl_win_sys.h b/erts/emulator/sys/win32/erl_win_sys.h index 5ce1a61303..04fbf23109 100644 --- a/erts/emulator/sys/win32/erl_win_sys.h +++ b/erts/emulator/sys/win32/erl_win_sys.h @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1997-2012. All Rights Reserved. + * Copyright Ericsson AB 1997-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -60,16 +61,18 @@ #include <windows.h> #undef WIN32_LEAN_AND_MEAN -/* - * Define MAXPATHLEN in terms of MAXPATH if available. - */ - -#ifndef MAXPATH -#define MAXPATH MAX_PATH -#endif /* MAXPATH */ #ifndef MAXPATHLEN -#define MAXPATHLEN MAXPATH +#define MAXPATHLEN 4096 +/* + erts-6.0 (OTP 17.0): + We now accept windows paths longer than 260 (MAX_PATH) by conversion to + UNC path format. In order to also return long paths from the driver we + increased MAXPATHLEN from 260 to larger (but arbitrary) value 4096. + It would of course be nicer to instead dynamically allocate large enough + tmp buffers when efile_drv needs to return really long paths, and do that + for unix as well. + */ #endif /* MAXPATHLEN */ /* @@ -82,7 +85,6 @@ #define NO_ERF #define NO_ERFC -#define NO_SYSLOG #define NO_SYSCONF #define NO_DAEMON #define NO_PWD @@ -95,6 +97,8 @@ # define ERTS_I64_LITERAL(X) X##i64 #endif +#define ERTS_HAVE_ERTS_SYS_ALIGNED_ALLOC 1 + /* * Practial Windows specific macros. */ @@ -102,22 +106,21 @@ #define CreateAutoEvent(state) CreateEvent(NULL, FALSE, state, NULL) #define CreateManualEvent(state) CreateEvent(NULL, TRUE, state, NULL) +/* + * Min number of async threads + */ +#define ERTS_MIN_NO_OF_ASYNC_THREADS 0 /* * Our own type of "FD's" */ +#define ERTS_SYS_FD_INVALID INVALID_HANDLE_VALUE #define ERTS_SYS_FD_TYPE HANDLE #define NO_FSTAT_ON_SYS_FD_TYPE 1 /* They are events, not files */ -#define HAVE_ERTS_CHECK_IO_DEBUG -int erts_check_io_debug(void); - /* * For erl_time_sup */ -#define HAVE_GETHRTIME - -#define sys_init_hrtime() /* Nothing */ #define SYS_CLK_TCK 1000 #define SYS_CLOCK_RESOLUTION 1 @@ -159,18 +162,95 @@ typedef struct { #if defined (__GNUC__) typedef unsigned long long Uint64; typedef long long Sint64; - -typedef long long SysHrTime; +# ifdef ULLONG_MAX +# define ERTS_UINT64_MAX ULLONG_MAX +# endif +# ifdef LLONG_MAX +# define ERTS_SINT64_MAX LLONG_MAX +# endif +# ifdef LLONG_MIN +# define ERTS_SINT64_MIN LLONG_MIN +# endif + +typedef long long ErtsMonotonicTime; +typedef long long ErtsSysHrTime; #else typedef ULONGLONG Uint64; typedef LONGLONG Sint64; -typedef LONGLONG SysHrTime; +typedef LONGLONG ErtsMonotonicTime; +typedef LONGLONG ErtsSysHrTime; #endif -extern int sys_init_time(void); +typedef ErtsMonotonicTime ErtsSystemTime; +typedef ErtsMonotonicTime ErtsSysPerfCounter; + +ErtsSystemTime erts_os_system_time(void); + +#define ERTS_MONOTONIC_TIME_MIN ((ErtsMonotonicTime) (1ULL << 63)) +#define ERTS_MONOTONIC_TIME_MAX (~ERTS_MONOTONIC_TIME_MIN) + +#define ERTS_HAVE_OS_MONOTONIC_TIME_SUPPORT 1 +#define ERTS_COMPILE_TIME_MONOTONIC_TIME_UNIT 0 + +struct erts_sys_time_read_only_data__ { + ErtsMonotonicTime (*os_monotonic_time)(void); + void (*os_times)(ErtsMonotonicTime *, ErtsSystemTime*); + ErtsSysHrTime (*sys_hrtime)(void); +}; + +typedef struct { + union { + struct erts_sys_time_read_only_data__ o; + char align__[(((sizeof(struct erts_sys_time_read_only_data__) - 1) + / ASSUMED_CACHE_LINE_SIZE) + 1) + * ASSUMED_CACHE_LINE_SIZE]; + } r; +} ErtsSysTimeData__; + +extern ErtsSysTimeData__ erts_sys_time_data__; + +ERTS_GLB_INLINE ErtsMonotonicTime erts_os_monotonic_time(void); +ERTS_GLB_INLINE void erts_os_times(ErtsMonotonicTime *, + ErtsSystemTime *); +ERTS_GLB_INLINE ErtsSysHrTime erts_sys_hrtime(void); +ERTS_GLB_INLINE ErtsSysPerfCounter erts_sys_perf_counter(void); + +#if ERTS_GLB_INLINE_INCL_FUNC_DEF + +ERTS_GLB_INLINE ErtsMonotonicTime +erts_os_monotonic_time(void) +{ + return (*erts_sys_time_data__.r.o.os_monotonic_time)(); +} + +ERTS_GLB_INLINE void +erts_os_times(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep) +{ + (*erts_sys_time_data__.r.o.os_times)(mtimep, stimep); +} + +ERTS_GLB_INLINE ErtsSysHrTime +erts_sys_hrtime(void) +{ + return (*erts_sys_time_data__.r.o.sys_hrtime)(); +} + +ERTS_GLB_INLINE ErtsSysPerfCounter +erts_sys_perf_counter(void) +{ + return (*erts_sys_time_data__.r.o.sys_hrtime)(); +} + +ERTS_GLB_INLINE ErtsSysPerfCounter +erts_sys_perf_counter_unit(void) +{ + return 1000 * 1000 * 1000; +} + +#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */ + extern void sys_gettimeofday(SysTimeval *tv); -extern SysHrTime sys_gethrtime(void); extern clock_t sys_times(SysTimes *buffer); extern char *win_build_environment(char *); @@ -202,6 +282,8 @@ extern volatile int erl_fp_exception; int _finite(double x); #endif +#define erts_isfinite _finite + /*#define NO_FPE_SIGNALS*/ #define erts_get_current_fp_exception() NULL #define __ERTS_FP_CHECK_INIT(fpexnp) do {} while (0) @@ -231,4 +313,16 @@ typedef long ssize_t; int init_async(int); int exit_async(void); #endif + +#define ERTS_HAVE_TRY_CATCH 1 + +#define ERTS_SYS_TRY_CATCH(EXPR,CATCH) \ + __try { \ + EXPR; \ + } \ + __except(GetExceptionCode() == EXCEPTION_ACCESS_VIOLATION ? EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH) \ + { \ + CATCH; \ + } + #endif diff --git a/erts/emulator/sys/win32/sys.c b/erts/emulator/sys/win32/sys.c index 922967c698..cf821b05cb 100755..100644 --- a/erts/emulator/sys/win32/sys.c +++ b/erts/emulator/sys/win32/sys.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1996-2013. All Rights Reserved. + * Copyright Ericsson AB 1996-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -32,12 +33,12 @@ #include "erl_threads.h" #include "../../drivers/win32/win_con.h" #include "erl_cpu_topology.h" - +#include <malloc.h> void erts_sys_init_float(void); void erl_start(int, char**); -void erl_exit(int n, char*, ...); +void erts_exit(int n, char*, ...); void erl_error(char*, va_list); void erl_crash_dump(char*, int, char*, ...); @@ -69,17 +70,14 @@ static void async_read_file(struct async_io* aio, LPVOID buf, DWORD numToRead); static int async_write_file(struct async_io* aio, LPVOID buf, DWORD numToWrite); static int get_overlapped_result(struct async_io* aio, LPDWORD pBytesRead, BOOL wait); -static BOOL create_child_process(char *, HANDLE, HANDLE, +static BOOL create_child_process(wchar_t *, HANDLE, HANDLE, HANDLE, LPHANDLE, LPDWORD, BOOL, - LPVOID, LPTSTR, unsigned, - char **, int *); + LPVOID, wchar_t*, unsigned, + wchar_t **, int *); static int create_pipe(LPHANDLE, LPHANDLE, BOOL, BOOL); -static int application_type(const char* originalName, char fullPath[MAX_PATH], +static int application_type(const wchar_t* originalName, wchar_t fullPath[MAX_PATH], BOOL search_in_path, BOOL handle_quotes, int *error_return); -static int application_type_w(const WCHAR *originalName, WCHAR fullPath[MAX_PATH], - BOOL search_in_path, BOOL handle_quotes, - int *error_return); HANDLE erts_service_event; @@ -202,7 +200,7 @@ erts_sys_misc_mem_sz(void) */ void sys_tty_reset(int exit_code) { - if (exit_code > 0) + if (exit_code == ERTS_ERROR_EXIT) ConWaitForExit(); else ConNormalExit(); @@ -250,6 +248,27 @@ void erl_sys_args(int* argc, char** argv) #endif } +/* + * Function returns 1 if we can read from all values in between + * start and stop. + */ +int +erts_sys_is_area_readable(char *start, char *stop) { + volatile char tmp; + __try + { + while(start < stop) { + tmp = *start; + start++; + } + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + return 0; + } + return 1; +} + int erts_sys_prepare_crash_dump(int secs) { Port *heart_port; @@ -1173,7 +1192,7 @@ spawn_init(void) } static ErlDrvData -spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts) +spawn_start(ErlDrvPort port_num, char* utf8_name, SysDriverOpts* opts) { HANDLE hToChild = INVALID_HANDLE_VALUE; /* Write handle to child. */ HANDLE hFromChild = INVALID_HANDLE_VALUE; /* Read handle from child. */ @@ -1189,7 +1208,9 @@ spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts) SECURITY_ATTRIBUTES sa = {sizeof(SECURITY_ATTRIBUTES), NULL, TRUE}; char* envir = opts->envir; int errno_return = -1; - + wchar_t *name; + int len; + if (opts->read_write & DO_READ) neededSelects++; if (opts->read_write & DO_WRITE) @@ -1247,26 +1268,40 @@ spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts) * Spawn the port program. */ - DEBUGF(("Spawning \"%s\"\n", name)); + if ((len = MultiByteToWideChar(CP_UTF8, 0, utf8_name, -1, NULL, 0)) > 0) { + name = erts_alloc(ERTS_ALC_T_TMP, len*sizeof(wchar_t)); + MultiByteToWideChar(CP_UTF8, 0, utf8_name, -1, name, len); + } else { /* Not valid utf-8, just convert byte to wchar */ + int i; + len = strlen(utf8_name); + name = erts_alloc(ERTS_ALC_T_TMP, (1+len)*sizeof(wchar_t)); + for(i=0; i<len; i++) { + name[i] = (wchar_t) utf8_name[i]; + } + name[i] = L'\0'; + } + DEBUGF(("Spawning \"%S\"\n", name)); envir = win_build_environment(envir); /* Always a unicode environment */ - ok = create_child_process(name, - hChildStdin, - hChildStdout, - hChildStderr, - &dp->port_pid, - &pid, - opts->hide_window, - (LPVOID) envir, - (LPTSTR) opts->wd, - opts->spawn_type, - opts->argv, - &errno_return); + ok = create_child_process(name, + hChildStdin, + hChildStdout, + hChildStderr, + &dp->port_pid, + &pid, + opts->hide_window, + (LPVOID) envir, + (wchar_t *) opts->wd, + opts->spawn_type, + (wchar_t **) opts->argv, + &errno_return); CloseHandle(hChildStdin); CloseHandle(hChildStdout); if (close_child_stderr && hChildStderr != INVALID_HANDLE_VALUE && hChildStderr != 0) { CloseHandle(hChildStderr); } + erts_free(ERTS_ALC_T_TMP, name); + if (envir != NULL) { erts_free(ERTS_ALC_T_ENVIRONMENT, envir); } @@ -1299,10 +1334,8 @@ spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts) retval = set_driver_data(dp, hFromChild, hToChild, opts->read_write, opts->exit_status); if (retval != ERL_DRV_ERROR_GENERAL && retval != ERL_DRV_ERROR_ERRNO) { - Port *prt = erts_drvport2port(port_num); - /* We assume that this cannot generate a negative number */ - ASSERT(prt != ERTS_INVALID_ERL_DRV_PORT); - prt->os_pid = (SWord) pid; + /* We assume that this cannot generate a negative number */ + erl_drv_set_os_pid(port_num, pid); } } @@ -1344,9 +1377,9 @@ create_file_thread(AsyncIo* aio, int mode) * Example: input = "\"Program Files\"\\erl arg1 arg2" * gives 19 as result. * The length returned is equivalent with length(argv[0]) if the - * comman line should have been prepared by _setargv for the main function + * command line should have been prepared by _setargv for the main function */ -int parse_command(char* cmd){ +int parse_command(wchar_t* cmd){ #define NORMAL 2 #define STRING 1 #define STOP 0 @@ -1354,17 +1387,17 @@ int parse_command(char* cmd){ int state = NORMAL; while (cmd[i]) { switch (cmd[i]) { - case '"': + case L'"': if (state == NORMAL) state = STRING; else state = NORMAL; break; - case '\\': - if ((state == STRING) && (cmd[i+1]=='"')) + case L'\\': + if ((state == STRING) && (cmd[i+1]==L'"')) i++; break; - case ' ': + case L' ': if (state == NORMAL) state = STOP; break; @@ -1379,39 +1412,46 @@ int parse_command(char* cmd){ return i; } -static BOOL need_quotes(WCHAR *str) -{ - int in_quote = 0; - int backslashed = 0; - int naked_space = 0; - while (*str != L'\0') { - switch (*str) { - case L'\\' : - backslashed = !backslashed; - break; - case L'"': - if (backslashed) { - backslashed=0; - } else { - in_quote = !in_quote; - } - break; - case L' ': - backslashed = 0; - if (!(backslashed || in_quote)) { - naked_space++; - } - break; - default: - backslashed = 0; +/* + * Translating of command line arguments to correct format. In the examples + * below the '' are not part of the actual string. + * 'io:format("hello").' -> 'io:format(\"hello\").' + * 'io:format("is anybody in there?").' -> '"io:format(\"is anybody in there?\")."' + * 'Just nod if you can hear me.' -> '"Just nod if you can hear me."' + * 'Is there ""anyone at home?' -> '"Is there \"\"anyone at home?"' + * 'Relax."' -> 'Relax.\"' + * + * If new == NULL we just calculate the length. + * + * The reason for having to quote all of the is becasue CreateProcessW removes + * one level of escaping since it takes a single long command line rather + * than the argument chunks that unix uses. + */ +static int escape_and_quote(wchar_t *str, wchar_t *new, BOOL *quoted) { + int i, j = 0; + if (new == NULL) + *quoted = FALSE; + else if (*quoted) + new[j++] = L'"'; + for ( i = 0; str[i] != L'\0'; i++,j++) { + if (str[i] == L' ' && new == NULL && *quoted == FALSE) { + *quoted = TRUE; + j++; + } + /* check if we have to escape quotes */ + if (str[i] == L'"') { + if (new) new[j] = L'\\'; + j++; } - ++str; + if (new) new[j] = str[i]; } - return (naked_space > 0); + if (*quoted) { + if (new) new[j] = L'"'; + j++; + } + return j; } - - /* *---------------------------------------------------------------------- @@ -1443,7 +1483,7 @@ static BOOL need_quotes(WCHAR *str) static BOOL create_child_process ( - char *origcmd, /* Command line for child process (including + wchar_t *origcmd, /* Command line for child process (including * name of executable). Or whole executable if st is * ERTS_SPAWN_EXECUTABLE */ @@ -1454,57 +1494,53 @@ create_child_process LPDWORD pdwID, /* Pointer to variable to received Process ID */ BOOL hide, /* Hide the window unconditionally. */ LPVOID env, /* Environment for the child */ - LPTSTR wd, /* Working dir for the child */ + wchar_t *wd, /* Working dir for the child */ unsigned st, /* Flags for spawn, tells us how to interpret origcmd */ - char **argv, /* Argument vector if given. */ + wchar_t **argv, /* Argument vector if given. */ int *errno_return /* Place to put an errno in in case of failure */ ) -{ +{ PROCESS_INFORMATION piProcInfo = {0}; BOOL ok = FALSE; int applType; /* Not to be changed for different types of executables */ int staticCreateFlags = GetPriorityClass(GetCurrentProcess()); int createFlags = DETACHED_PROCESS; - char *newcmdline = NULL; + wchar_t *newcmdline = NULL; int cmdlength; - char* thecommand; - LPTSTR appname = NULL; + wchar_t* thecommand; + wchar_t* appname = NULL; HANDLE hProcess = GetCurrentProcess(); - - *errno_return = -1; + STARTUPINFOW siStartInfo = {0}; + wchar_t execPath[MAX_PATH]; + *errno_return = -1; + siStartInfo.cb = sizeof(STARTUPINFOW); + siStartInfo.dwFlags = STARTF_USESTDHANDLES; + siStartInfo.hStdInput = hStdin; + siStartInfo.hStdOutput = hStdout; + siStartInfo.hStdError = hStderr; if (st != ERTS_SPAWN_EXECUTABLE) { - STARTUPINFO siStartInfo = {0}; - char execPath[MAX_PATH]; - - siStartInfo.cb = sizeof(STARTUPINFO); - siStartInfo.dwFlags = STARTF_USESTDHANDLES; - siStartInfo.hStdInput = hStdin; - siStartInfo.hStdOutput = hStdout; - siStartInfo.hStdError = hStderr; - /* * Parse out the program name from the command line (it can be quoted and * contain spaces). */ - newcmdline = erts_alloc(ERTS_ALC_T_TMP, 2048); cmdlength = parse_command(origcmd); - thecommand = (char *) erts_alloc(ERTS_ALC_T_TMP, cmdlength+1); - strncpy(thecommand, origcmd, cmdlength); - thecommand[cmdlength] = '\0'; - DEBUGF(("spawn command: %s\n", thecommand)); - - applType = application_type(thecommand, execPath, TRUE, - TRUE, errno_return); - DEBUGF(("application_type returned for (%s) is %d\n", thecommand, applType)); + newcmdline = (wchar_t *) erts_alloc(ERTS_ALC_T_TMP, (MAX_PATH+wcslen(origcmd)-cmdlength)*sizeof(wchar_t)); + thecommand = (wchar_t *) erts_alloc(ERTS_ALC_T_TMP, (cmdlength+1)*sizeof(wchar_t)); + wcsncpy(thecommand, origcmd, cmdlength); + thecommand[cmdlength] = L'\0'; + DEBUGF(("spawn command: %S\n", thecommand)); + + applType = application_type(thecommand, execPath, TRUE, TRUE, errno_return); + DEBUGF(("application_type returned for (%S) is %d\n", thecommand, applType)); erts_free(ERTS_ALC_T_TMP, (void *) thecommand); if (applType == APPL_NONE) { erts_free(ERTS_ALC_T_TMP,newcmdline); return FALSE; } - newcmdline[0] = '\0'; + newcmdline[0] = L'\0'; if (applType == APPL_DOS) { /* @@ -1513,11 +1549,11 @@ create_child_process * a normal process inside of a hidden console application, * and then run that hidden console as a detached process. */ - + siStartInfo.wShowWindow = SW_HIDE; siStartInfo.dwFlags |= STARTF_USESHOWWINDOW; createFlags = CREATE_NEW_CONSOLE; - strcat(newcmdline, "cmd.exe /c "); + wcscat(newcmdline, L"cmd.exe /c "); } else if (hide) { DEBUGF(("hiding window\n")); siStartInfo.wShowWindow = SW_HIDE; @@ -1525,35 +1561,25 @@ create_child_process createFlags = 0; } - strcat(newcmdline, execPath); - strcat(newcmdline, origcmd+cmdlength); - DEBUGF(("Creating child process: %s, createFlags = %d\n", newcmdline, createFlags)); - ok = CreateProcessA(appname, - newcmdline, - NULL, - NULL, - TRUE, - createFlags | staticCreateFlags | - CREATE_UNICODE_ENVIRONMENT, - env, - wd, - &siStartInfo, + wcscat(newcmdline, execPath); + wcscat(newcmdline, origcmd+cmdlength); + DEBUGF(("Creating child process: %S, createFlags = %d\n", newcmdline, createFlags)); + ok = CreateProcessW(appname, + newcmdline, + NULL, + NULL, + TRUE, + createFlags | staticCreateFlags | + CREATE_UNICODE_ENVIRONMENT, + env, + wd, + &siStartInfo, &piProcInfo); } else { /* ERTS_SPAWN_EXECUTABLE, filename and args are in unicode ({utf16,little}) */ int run_cmd = 0; - STARTUPINFOW siStartInfo = {0}; - WCHAR execPath[MAX_PATH]; - - siStartInfo.cb = sizeof(STARTUPINFOW); - siStartInfo.dwFlags = STARTF_USESTDHANDLES; - siStartInfo.hStdInput = hStdin; - siStartInfo.hStdOutput = hStdout; - siStartInfo.hStdError = hStderr; - - applType = application_type_w((WCHAR *) origcmd, execPath, FALSE, FALSE, - errno_return); + applType = application_type(origcmd, execPath, FALSE, FALSE, errno_return); if (applType == APPL_NONE) { return FALSE; } @@ -1573,100 +1599,87 @@ create_child_process createFlags = 0; } if (run_cmd) { - WCHAR cmdPath[MAX_PATH]; + wchar_t cmdPath[MAX_PATH]; int cmdType; - cmdType = application_type_w(L"cmd.exe", cmdPath, TRUE, FALSE, errno_return); + cmdType = application_type(L"cmd.exe", cmdPath, TRUE, FALSE, errno_return); if (cmdType == APPL_NONE || cmdType == APPL_DOS) { return FALSE; } - appname = (char *) erts_alloc(ERTS_ALC_T_TMP, (wcslen(cmdPath)+1)*sizeof(WCHAR)); - wcscpy((WCHAR *) appname,cmdPath); + appname = (wchar_t *) erts_alloc(ERTS_ALC_T_TMP, (wcslen(cmdPath)+1)*sizeof(wchar_t)); + wcscpy(appname,cmdPath); } else { - appname = (char *) erts_alloc(ERTS_ALC_T_TMP, (wcslen(execPath)+1)*sizeof(WCHAR)); - wcscpy((WCHAR *) appname, execPath); + appname = (wchar_t *) erts_alloc(ERTS_ALC_T_TMP, (wcslen(execPath)+1)*sizeof(wchar_t)); + wcscpy(appname, execPath); } if (argv == NULL) { - BOOL orig_need_q = need_quotes(execPath); - WCHAR *ptr; - int ocl = wcslen(execPath); + BOOL orig_need_q; + wchar_t *ptr; + int ocl = escape_and_quote(execPath, NULL, &orig_need_q); if (run_cmd) { - newcmdline = (char *) erts_alloc(ERTS_ALC_T_TMP, - (ocl + ((orig_need_q) ? 3 : 1) - + 11)*sizeof(WCHAR)); - memcpy(newcmdline,L"cmd.exe /c ",11*sizeof(WCHAR)); - ptr = (WCHAR *) (newcmdline + (11*sizeof(WCHAR))); + newcmdline = (wchar_t *) erts_alloc(ERTS_ALC_T_TMP, + (ocl + 1 + 11)*sizeof(wchar_t)); + memcpy(newcmdline,L"cmd.exe /c ",11*sizeof(wchar_t)); + ptr = newcmdline + 11; } else { - newcmdline = (char *) erts_alloc(ERTS_ALC_T_TMP, - (ocl + ((orig_need_q) ? 3 : 1))*sizeof(WCHAR)); - ptr = (WCHAR *) newcmdline; - } - if (orig_need_q) { - *ptr++ = L'"'; + newcmdline = (wchar_t *) erts_alloc(ERTS_ALC_T_TMP, + (ocl + 1)*sizeof(wchar_t)); + ptr = (wchar_t *) newcmdline; } - memcpy(ptr,execPath,ocl*sizeof(WCHAR)); - ptr += ocl; - if (orig_need_q) { - *ptr++ = L'"'; - } - *ptr = L'\0'; + ptr += escape_and_quote(execPath, ptr, &orig_need_q); + ptr[0] = L'\0'; } else { - int sum = 1; /* '\0' */ - WCHAR **ar = (WCHAR **) argv; - WCHAR *n; - char *save_arg0 = NULL; - if (argv[0] == erts_default_arg0 || run_cmd) { + int sum = 0; + BOOL *qte = NULL; + wchar_t **ar = argv; + wchar_t *n; + wchar_t *save_arg0 = NULL; + if (argv[0] == (wchar_t *) erts_default_arg0 || run_cmd) { save_arg0 = argv[0]; - argv[0] = (char *) execPath; + argv[0] = execPath; } if (run_cmd) { sum += 11; /* cmd.exe /c */ } + + while (*ar != NULL) ar++; + qte = erts_alloc(ERTS_ALC_T_TMP, (ar - argv)*sizeof(BOOL)); + + ar = argv; while (*ar != NULL) { - sum += wcslen(*ar); - if (need_quotes(*ar)) { - sum += 2; /* quotes */ - } + sum += escape_and_quote(*ar,NULL,qte+(ar - argv)); sum++; /* space */ ++ar; } - ar = (WCHAR **) argv; - newcmdline = erts_alloc(ERTS_ALC_T_TMP, sum*sizeof(WCHAR)); - n = (WCHAR *) newcmdline; + ar = argv; + newcmdline = (wchar_t *) erts_alloc(ERTS_ALC_T_TMP, sum*sizeof(wchar_t)); + n = newcmdline; if (run_cmd) { - memcpy(n,L"cmd.exe /c ",11*sizeof(WCHAR)); + memcpy(n,L"cmd.exe /c ",11*sizeof(wchar_t)); n += 11; } while (*ar != NULL) { - int q = need_quotes(*ar); - sum = wcslen(*ar); - if (q) { - *n++ = L'"'; - } - memcpy(n,*ar,sum*sizeof(WCHAR)); - n += sum; - if (q) { - *n++ = L'"'; - } + n += escape_and_quote(*ar,n,qte+(ar - argv)); *n++ = L' '; ++ar; } - *(n-1) = L'\0'; + *(n-1) = L'\0'; /* overwrite last space with '\0' */ if (save_arg0 != NULL) { argv[0] = save_arg0; } + erts_free(ERTS_ALC_T_TMP, qte); } - DEBUGF(("Creating child process: %s, createFlags = %d\n", newcmdline, createFlags)); - ok = CreateProcessW((WCHAR *) appname, - (WCHAR *) newcmdline, - NULL, - NULL, - TRUE, - createFlags | staticCreateFlags | - CREATE_UNICODE_ENVIRONMENT, - env, - (WCHAR *) wd, - &siStartInfo, + DEBUGF((stderr,"Creating child process: %S, createFlags = %d\n", newcmdline, createFlags)); + ok = CreateProcessW((wchar_t *) appname, + (wchar_t *) newcmdline, + NULL, + NULL, + TRUE, + createFlags | staticCreateFlags | + CREATE_UNICODE_ENVIRONMENT, + env, + wd, + &siStartInfo, &piProcInfo); } /* end SPAWN_EXECUTABLE */ @@ -1775,180 +1788,23 @@ static int create_pipe(HANDLE *phRead, HANDLE *phWrite, BOOL inheritRead, BOOL o return TRUE; } - - - -static int application_type -( - const char *originalName, /* Name of the application to find. */ - char fullPath[MAX_PATH], /* Filled with complete path to - * application. */ - BOOL search_in_path, /* If we should search the system wide path */ - BOOL handle_quotes, /* If we should handle quotes around executable */ - int *error_return /* A place to put an error code */ - ) -{ - int applType, i; - HANDLE hFile; - char *ext, *rest; - char buf[2]; - DWORD read; - IMAGE_DOS_HEADER header; - static char extensions[][5] = {"", ".com", ".exe", ".bat"}; - int is_quoted; - int len; - - /* Look for the program as an external program. First try the name - * as it is, then try adding .com, .exe, and .bat, in that order, to - * the name, looking for an executable. - * NOTE! that we does not support execution of .com programs on Windows NT - * - * - * Using the raw SearchPath() procedure doesn't do quite what is - * necessary. If the name of the executable already contains a '.' - * character, it will not try appending the specified extension when - * searching (in other words, SearchPath will not find the program - * "a.b.exe" if the arguments specified "a.b" and ".exe"). - * So, first look for the file as it is named. Then manually append - * the extensions, looking for a match. (') - */ - - len = strlen(originalName); - is_quoted = handle_quotes && len > 0 && originalName[0] == '"' && - originalName[len-1] == '"'; - - applType = APPL_NONE; - *error_return = ENOENT; - for (i = 0; i < (int) (sizeof(extensions) / sizeof(extensions[0])); i++) { - if(is_quoted) { - lstrcpyn(fullPath, originalName+1, MAX_PATH - 7); - len = strlen(fullPath); - if(len > 0) { - fullPath[len-1] = '\0'; - } - } else { - lstrcpyn(fullPath, originalName, MAX_PATH - 5); - } - lstrcat(fullPath, extensions[i]); - SearchPath((search_in_path) ? NULL : ".", fullPath, NULL, MAX_PATH, fullPath, &rest); - - /* - * Ignore matches on directories or data files, return if identified - * a known type. - */ - - if (GetFileAttributes(fullPath) & FILE_ATTRIBUTE_DIRECTORY) { - continue; - } - - ext = strrchr(fullPath, '.'); - if ((ext != NULL) && (strcmpi(ext, ".bat") == 0)) { - *error_return = EACCES; - applType = APPL_DOS; - break; - } - - hFile = CreateFile(fullPath, GENERIC_READ, FILE_SHARE_READ, NULL, - OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL); - if (hFile == INVALID_HANDLE_VALUE) { - continue; - } - - *error_return = EACCES; /* If considered an error, - it's an access error */ - header.e_magic = 0; - ReadFile(hFile, (void *) &header, sizeof(header), &read, NULL); - if (header.e_magic != IMAGE_DOS_SIGNATURE) { - /* - * Doesn't have the magic number for relocatable executables. If - * filename ends with .com, assume it's a DOS application anyhow. - * Note that we didn't make this assumption at first, because some - * supposed .com files are really 32-bit executables with all the - * magic numbers and everything. - */ - - CloseHandle(hFile); - if ((ext != NULL) && (strcmpi(ext, ".com") == 0)) { - applType = APPL_DOS; - break; - } - continue; - } - if (header.e_lfarlc != sizeof(header)) { - /* - * All Windows 3.X and Win32 and some DOS programs have this value - * set here. If it doesn't, assume that since it already had the - * other magic number it was a DOS application. - */ - - CloseHandle(hFile); - applType = APPL_DOS; - break; - } - - /* - * The DWORD at header.e_lfanew points to yet another magic number. - */ - - buf[0] = '\0'; - SetFilePointer(hFile, header.e_lfanew, NULL, FILE_BEGIN); - ReadFile(hFile, (void *) buf, 2, &read, NULL); - CloseHandle(hFile); - - if ((buf[0] == 'L') && (buf[1] == 'E')) { - applType = APPL_DOS; - } else if ((buf[0] == 'N') && (buf[1] == 'E')) { - applType = APPL_WIN3X; - } else if ((buf[0] == 'P') && (buf[1] == 'E')) { - applType = APPL_WIN32; - } else { - continue; - } - break; - } - - if (applType == APPL_NONE) { - return APPL_NONE; - } - - if ((applType == APPL_DOS) || (applType == APPL_WIN3X)) { - /* - * Replace long path name of executable with short path name for - * 16-bit applications. Otherwise the application may not be able - * to correctly parse its own command line to separate off the - * application name from the arguments. - */ - - GetShortPathName(fullPath, fullPath, MAX_PATH); - } - if (is_quoted) { - /* restore quotes on quoted program name */ - len = strlen(fullPath); - memmove(fullPath+1,fullPath,len); - fullPath[0]='"'; - fullPath[len+1]='"'; - fullPath[len+2]='\0'; - } - return applType; -} - -static int application_type_w (const WCHAR *originalName, /* Name of the application to find. */ - WCHAR wfullpath[MAX_PATH],/* Filled with complete path to +static int application_type (const wchar_t *originalName, /* Name of the application to find. */ + wchar_t wfullpath[MAX_PATH],/* Filled with complete path to * application. */ - BOOL search_in_path, /* If we should search the system wide path */ - BOOL handle_quotes, /* If we should handle quotes around executable */ - int *error_return) /* A place to put an error code */ + BOOL search_in_path, /* If we should search the system wide path */ + BOOL handle_quotes, /* If we should handle quotes around executable */ + int *error_return) /* A place to put an error code */ { int applType, i; HANDLE hFile; - WCHAR *ext, *rest; + wchar_t *ext, *rest; char buf[2]; DWORD read; IMAGE_DOS_HEADER header; - static WCHAR extensions[][5] = {L"", L".com", L".exe", L".bat"}; + static wchar_t extensions[][5] = {L"", L".com", L".exe", L".bat"}; int is_quoted; int len; - WCHAR xfullpath[MAX_PATH]; + wchar_t xfullpath[MAX_PATH]; len = wcslen(originalName); is_quoted = handle_quotes && len > 0 && originalName[0] == L'"' && @@ -2063,7 +1919,7 @@ static int application_type_w (const WCHAR *originalName, /* Name of the applica if (is_quoted) { /* restore quotes on quoted program name */ len = wcslen(wfullpath); - memmove(wfullpath+1,wfullpath,len*sizeof(WCHAR)); + memmove(wfullpath+1,wfullpath,len*sizeof(wchar_t)); wfullpath[0]=L'"'; wfullpath[len+1]=L'"'; wfullpath[len+2]=L'\0'; @@ -2348,7 +2204,7 @@ static void fd_stop(ErlDrvData data) ASSERT(dp->out.flushEvent); SetEvent(dp->out.flushEvent); } while (WaitForSingleObject(dp->out.flushReplyEvent, 10) == WAIT_TIMEOUT - || !(dp->out.flags & DF_THREAD_FLUSHED)); + && !(dp->out.flags & DF_THREAD_FLUSHED)); } } @@ -2912,6 +2768,30 @@ void erts_sys_free(ErtsAlcType_t t, void *x, void *p) free(p); } +void *erts_sys_aligned_alloc(UWord alignment, UWord size) +{ + void *ptr; + ASSERT(alignment && (alignment & (alignment-1)) == 0); /* power of 2 */ + ptr = _aligned_malloc((size_t) size, (size_t) alignment); + ASSERT(!ptr || (((UWord) ptr) & (alignment - 1)) == 0); + return ptr; +} + +void erts_sys_aligned_free(UWord alignment, void *ptr) +{ + ASSERT(alignment && (alignment & (alignment-1)) == 0); /* power of 2 */ + _aligned_free(ptr); +} + +void *erts_sys_aligned_realloc(UWord alignment, void *ptr, UWord size, UWord old_size) +{ + void *new_ptr; + ASSERT(alignment && (alignment & (alignment-1)) == 0); /* power of 2 */ + new_ptr = _aligned_realloc(ptr, (size_t) size, (size_t) alignment); + ASSERT(!new_ptr || (((UWord) new_ptr) & (alignment - 1)) == 0); + return new_ptr; +} + static Preload* preloaded = NULL; static unsigned* res_name = NULL; static int num_preloaded = 0; @@ -3199,8 +3079,10 @@ erl_bin_write(buf, sz, max) } } +#endif /* DEBUG */ + void -erl_assert_error(char* expr, char* file, int line) +erl_assert_error(const char* expr, const char* func, const char* file, int line) { char message[1024]; @@ -3214,7 +3096,6 @@ erl_assert_error(char* expr, char* file, int line) DebugBreak(); } -#endif /* DEBUG */ static void check_supported_os_version(void) @@ -3228,13 +3109,13 @@ check_supported_os_version(void) || int_os_version.dwMajorVersion < major || (int_os_version.dwMajorVersion == major && int_os_version.dwMinorVersion < minor)) - erl_exit(-1, + erts_exit(1, "Windows version not supported " "(min required: winnt %d.%d)\n", major, minor); } #else - erl_exit(-1, + erts_exit(1, "Windows version not supported " "(min required: win %d.%d)\n", nt_major, nt_minor); @@ -3291,25 +3172,31 @@ thr_create_prepare_child(void *vtcdp) void erts_sys_pre_init(void) { +#ifdef USE_THREADS + erts_thr_init_data_t eid = ERTS_THR_INIT_DATA_DEF_INITER; +#endif int_os_version.dwOSVersionInfoSize = sizeof(OSVERSIONINFO); GetVersionEx(&int_os_version); check_supported_os_version(); + #ifdef USE_THREADS - { - erts_thr_init_data_t eid = ERTS_THR_INIT_DATA_DEF_INITER; + eid.thread_create_child_func = thr_create_prepare_child; + /* Before creation in parent */ + eid.thread_create_prepare_func = thr_create_prepare; + /* After creation in parent */ + eid.thread_create_parent_func = thr_create_cleanup; - eid.thread_create_child_func = thr_create_prepare_child; - /* Before creation in parent */ - eid.thread_create_prepare_func = thr_create_prepare; - /* After creation in parent */ - eid.thread_create_parent_func = thr_create_cleanup, + erts_thr_init(&eid); +#endif + + erts_init_sys_time_sup(); - erts_thr_init(&eid); +#ifdef USE_THREADS #ifdef ERTS_ENABLE_LOCK_COUNT - erts_lcnt_init(); + erts_lcnt_init(); #endif - } #endif + erts_smp_atomic_init_nob(&sys_misc_mem_sz, 0); } @@ -3331,7 +3218,7 @@ void erl_sys_init(void) noinherit_std_handle(STD_ERROR_HANDLE); #ifdef ERTS_SMP - erts_smp_tsd_key_create(&win32_errstr_key); + erts_smp_tsd_key_create(&win32_errstr_key,"win32_errstr_key"); InitializeCriticalSection(&htbc_lock); #endif erts_smp_atomic_init_nob(&pipe_creation_counter,0); @@ -3385,6 +3272,12 @@ void erl_sys_init(void) } void +erl_sys_late_init(void) +{ + /* do nothing */ +} + +void erts_sys_schedule_interrupt(int set) { erts_check_io_interrupt(set); @@ -3392,9 +3285,9 @@ erts_sys_schedule_interrupt(int set) #ifdef ERTS_SMP void -erts_sys_schedule_interrupt_timed(int set, erts_short_time_t msec) +erts_sys_schedule_interrupt_timed(int set, ErtsMonotonicTime timeout_time) { - erts_check_io_interrupt_timed(set, msec); + erts_check_io_interrupt_timed(set, timeout_time); } #endif diff --git a/erts/emulator/sys/win32/sys_env.c b/erts/emulator/sys/win32/sys_env.c index 754f4c6e4c..21ef71ad9a 100644 --- a/erts/emulator/sys/win32/sys_env.c +++ b/erts/emulator/sys/win32/sys_env.c @@ -1,18 +1,19 @@ /*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 2002-2012. All Rights Reserved.
+ * Copyright Ericsson AB 2002-2016. All Rights Reserved.
*
- * The contents of this file are subject to the Erlang Public License,
- * Version 1.1, (the "License"); you may not use this file except in
- * compliance with the License. You should have received a copy of the
- * Erlang Public License along with this software. If not, it can be
- * retrieved online at http://www.erlang.org/.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
*
- * Software distributed under the License is distributed on an "AS IS"
- * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
- * the License for the specific language governing rights and limitations
- * under the License.
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
*
* %CopyrightEnd%
*/
@@ -141,6 +142,24 @@ void fini_getenv_state(GETENV_STATE *state) erts_smp_rwmtx_runlock(&environ_rwmtx);
}
+int erts_sys_unsetenv(char *key)
+{
+ int res = 0;
+ WCHAR *wkey = (WCHAR *) key;
+
+ SetLastError(0);
+ erts_smp_rwmtx_rlock(&environ_rwmtx);
+ GetEnvironmentVariableW(wkey,
+ NULL,
+ 0);
+ if (GetLastError() != ERROR_ENVVAR_NOT_FOUND) {
+ res = (SetEnvironmentVariableW(wkey,
+ NULL) ? 0 : 1);
+ }
+ erts_smp_rwmtx_runlock(&environ_rwmtx);
+ return res;
+}
+
char*
win_build_environment(char* new_env)
{
diff --git a/erts/emulator/sys/win32/sys_float.c b/erts/emulator/sys/win32/sys_float.c index 0e19746cf5..2b2d6ab7d3 100644 --- a/erts/emulator/sys/win32/sys_float.c +++ b/erts/emulator/sys/win32/sys_float.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1997-2013. All Rights Reserved. + * Copyright Ericsson AB 1997-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -138,8 +139,7 @@ sys_double_to_chars_ext(double fp, char *buffer, size_t buffer_size, size_t deci int matherr(struct _exception *exc) { - erl_fp_exception = 1; - DEBUGF(("FP exception (matherr) (0x%x) (%d)\n", exc->type, erl_fp_exception)); + DEBUGF(("FP exception (matherr) (0x%x)\n", exc->type)); return 1; } diff --git a/erts/emulator/sys/win32/sys_interrupt.c b/erts/emulator/sys/win32/sys_interrupt.c index 36c43e93da..df838960eb 100644 --- a/erts/emulator/sys/win32/sys_interrupt.c +++ b/erts/emulator/sys/win32/sys_interrupt.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1997-2012. All Rights Reserved. + * Copyright Ericsson AB 1997-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -80,7 +81,7 @@ BOOL WINAPI ctrl_handler_ignore_break(DWORD dwCtrlType) return TRUE; /* else pour through... */ case CTRL_CLOSE_EVENT: - erl_exit(0, ""); + erts_exit(0, ""); break; } return TRUE; @@ -105,7 +106,7 @@ BOOL WINAPI ctrl_handler_replace_intr(DWORD dwCtrlType) /* else pour through... */ case CTRL_CLOSE_EVENT: case CTRL_SHUTDOWN_EVENT: - erl_exit(0, ""); + erts_exit(0, ""); break; } return TRUE; @@ -132,7 +133,7 @@ BOOL WINAPI ctrl_handler(DWORD dwCtrlType) return TRUE; /* else pour through... */ case CTRL_CLOSE_EVENT: - erl_exit(0, ""); + erts_exit(0, ""); break; } return TRUE; diff --git a/erts/emulator/sys/win32/sys_time.c b/erts/emulator/sys/win32/sys_time.c index b84c8f85ce..e8c67b3928 100644 --- a/erts/emulator/sys/win32/sys_time.c +++ b/erts/emulator/sys/win32/sys_time.c @@ -1,18 +1,19 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1997-2013. All Rights Reserved. + * Copyright Ericsson AB 1997-2016. All Rights Reserved. * - * The contents of this file are subject to the Erlang Public License, - * Version 1.1, (the "License"); you may not use this file except in - * compliance with the License. You should have received a copy of the - * Erlang Public License along with this software. If not, it can be - * retrieved online at http://www.erlang.org/. - * - * Software distributed under the License is distributed on an "AS IS" - * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See - * the License for the specific language governing rights and limitations - * under the License. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. * * %CopyrightEnd% */ @@ -25,6 +26,11 @@ #endif #include "sys.h" #include "assert.h" +#include "erl_os_monotonic_time_extender.h" +#include "erl_time.h" + +/* Need to look more closely at qpc before use... */ +#define ERTS_DISABLE_USE_OF_QPC_FOR_MONOTONIC_TIME 1 #define LL_LITERAL(X) ERTS_I64_LITERAL(X) @@ -61,11 +67,6 @@ (epoch) = ((ull.QuadPart / TICKS_PER_SECOND) - EPOCH_JULIAN_DIFF); \ } while(0) -static SysHrTime wrap = 0; -static DWORD last_tick_count = 0; -static erts_smp_mtx_t wrap_lock; -static ULONGLONG (WINAPI *pGetTickCount64)(void) = NULL; - /* Getting timezone information is a heavy operation, so we want to do this only once */ @@ -76,27 +77,349 @@ static int days_in_month[2][13] = { {0,31,28,31,30,31,30,31,31,30,31,30,31}, {0,31,29,31,30,31,30,31,31,30,31,30,31}}; -int -sys_init_time(void) +#define ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT 10 + +/* + * erts_os_monotonic_time() + */ + +struct sys_time_internal_state_read_only__ { + ULONGLONG (WINAPI *pGetTickCount64)(void); + BOOL (WINAPI *pQueryPerformanceCounter)(LARGE_INTEGER *); + Sint32 pcf; + int using_get_tick_count_time_unit; +}; + +struct sys_time_internal_state_read_mostly__ { + ErtsOsMonotonicTimeExtendState os_mtime_xtnd; +}; + +struct sys_time_internal_state_write_freq__ { + erts_smp_mtx_t mtime_mtx; + ULONGLONG wrap; + ULONGLONG last_tick_count; +}; + +__declspec(align(ASSUMED_CACHE_LINE_SIZE)) struct { + union { + struct sys_time_internal_state_read_only__ o; + char align__[(((sizeof(struct sys_time_internal_state_read_only__) - 1) + / ASSUMED_CACHE_LINE_SIZE) + 1) + * ASSUMED_CACHE_LINE_SIZE]; + } r; + union { + struct sys_time_internal_state_read_mostly__ m; + char align__[(((sizeof(struct sys_time_internal_state_read_mostly__) - 1) + / ASSUMED_CACHE_LINE_SIZE) + 1) + * ASSUMED_CACHE_LINE_SIZE]; + } wr; + union { + struct sys_time_internal_state_write_freq__ f; + char align__[(((sizeof(struct sys_time_internal_state_write_freq__) - 1) + / ASSUMED_CACHE_LINE_SIZE) + 1) + * ASSUMED_CACHE_LINE_SIZE]; + } w; +} internal_state; + +__declspec(align(ASSUMED_CACHE_LINE_SIZE)) ErtsSysTimeData__ erts_sys_time_data__; + + +static ERTS_INLINE ErtsSystemTime +SystemTime2MilliSec(SYSTEMTIME *stp) +{ + ErtsSystemTime stime; + FILETIME ft; + ULARGE_INTEGER ull; + + SystemTimeToFileTime(stp, &ft); + FILETIME_TO_ULI(ull,ft); + /* now in 100 ns units */ + stime = (ErtsSystemTime) ull.QuadPart; + stime -= (((ErtsSystemTime) EPOCH_JULIAN_DIFF) + * ((ErtsSystemTime) (10*1000*1000))); + stime /= (ErtsSystemTime) (10*1000); /* ms */ + return stime; +} + +static ErtsMonotonicTime +os_monotonic_time_qpc(void) { + LARGE_INTEGER pc; + + if (!(*internal_state.r.o.pQueryPerformanceCounter)(&pc)) + erts_exit(ERTS_ABORT_EXIT, "QueryPerformanceCounter() failed\n"); + + return (ErtsMonotonicTime) pc.QuadPart; +} + +static void +os_times_qpc(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep) +{ + LARGE_INTEGER pc; + SYSTEMTIME st; + ErtsSystemTime stime; + BOOL qpcr; + + qpcr = (*internal_state.r.o.pQueryPerformanceCounter)(&pc); + GetSystemTime(&st); + + if (!qpcr) + erts_exit(ERTS_ABORT_EXIT, "QueryPerformanceCounter() failed\n"); + + *mtimep = (ErtsMonotonicTime) pc.QuadPart; + + stime = SystemTime2MilliSec(&st); + + *stimep = ((ErtsSystemTime) + erts_time_unit_conversion((Uint64) stime, + (Uint32) 1000, + internal_state.r.o.pcf)); +} + +static Uint32 +get_tick_count(void) +{ + return (Uint32) GetTickCount(); +} + +static ErtsMonotonicTime +os_monotonic_time_gtc32(void) +{ + ErtsMonotonicTime mtime; + Uint32 ticks = (Uint32) GetTickCount(); + ERTS_CHK_EXTEND_OS_MONOTONIC_TIME(&internal_state.wr.m.os_mtime_xtnd, + ticks); + mtime = ERTS_EXTEND_OS_MONOTONIC_TIME(&internal_state.wr.m.os_mtime_xtnd, + ticks); + mtime <<= ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; + return mtime; +} + +static void +os_times_gtc32(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep) +{ + SYSTEMTIME st; + ErtsSystemTime stime, mtime; + Uint32 ticks; + + ticks = (Uint32) GetTickCount(); + GetSystemTime(&st); + + ERTS_CHK_EXTEND_OS_MONOTONIC_TIME(&internal_state.wr.m.os_mtime_xtnd, + ticks); + mtime = ERTS_EXTEND_OS_MONOTONIC_TIME(&internal_state.wr.m.os_mtime_xtnd, + ticks); + mtime <<= ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; + *mtimep = mtime; + + stime = SystemTime2MilliSec(&st); + stime <<= ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; + *stimep = stime; + +} + +static ErtsMonotonicTime +os_monotonic_time_gtc64(void) +{ + ULONGLONG ticks = (*internal_state.r.o.pGetTickCount64)(); + ErtsMonotonicTime mtime = (ErtsMonotonicTime) ticks; + return mtime << ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; +} + +static void +os_times_gtc64(ErtsMonotonicTime *mtimep, ErtsSystemTime *stimep) +{ + SYSTEMTIME st; + ErtsSystemTime stime, mtime; + ULONGLONG ticks; + + ticks = (*internal_state.r.o.pGetTickCount64)(); + GetSystemTime(&st); + + mtime = (ErtsMonotonicTime) ticks; + mtime <<= ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; + *mtimep = mtime; + + stime = SystemTime2MilliSec(&st); + stime <<= ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; + *stimep = stime; +} + +static ErtsSysHrTime +sys_hrtime_qpc(void) +{ + LARGE_INTEGER pc; + + if (!(*internal_state.r.o.pQueryPerformanceCounter)(&pc)) + erts_exit(ERTS_ABORT_EXIT, "QueryPerformanceCounter() failed\n"); + + ASSERT(pc.QuadPart > 0); + + return (ErtsSysHrTime) erts_time_unit_conversion((Uint64) pc.QuadPart, + internal_state.r.o.pcf, + (Uint32) 1000*1000*1000); +} + +static ErtsSysHrTime +sys_hrtime_gtc32(void) +{ + ErtsSysHrTime time; + Uint32 ticks = (Uint32) GetTickCount(); + ERTS_CHK_EXTEND_OS_MONOTONIC_TIME(&internal_state.wr.m.os_mtime_xtnd, + tick_count); + time = (ErtsSysHrTime) ERTS_EXTEND_OS_MONOTONIC_TIME(&internal_state.wr.m.os_mtime_xtnd, + ticks); + time *= (ErtsSysHrTime) (1000 * 1000); + return time; +} + +static ErtsSysHrTime +sys_hrtime_gtc64(void) +{ + ErtsSysHrTime time = (*internal_state.r.o.pGetTickCount64)(); + time *= (ErtsSysHrTime) (1000*1000); + return time; +} + +/* + * Init + */ + +void +sys_init_time(ErtsSysInitTimeResult *init_resp) +{ + ErtsMonotonicTime (*os_mtime_func)(void); + void (*os_times_func)(ErtsMonotonicTime *, ErtsSystemTime *); + ErtsSysHrTime (*sys_hrtime_func)(void) = NULL; + ErtsMonotonicTime time_unit; char kernel_dll_name[] = "kernel32"; HMODULE module; + init_resp->os_monotonic_time_info.clock_id = NULL; + module = GetModuleHandle(kernel_dll_name); - pGetTickCount64 = (module != NULL) ? - (ULONGLONG (WINAPI *)(void)) - GetProcAddress(module,"GetTickCount64") : - NULL; + if (!module) { + get_tick_count: + erts_smp_mtx_init(&internal_state.w.f.mtime_mtx, + "os_monotonic_time"); + internal_state.w.f.wrap = 0; + internal_state.w.f.last_tick_count = 0; + + init_resp->os_monotonic_time_info.func = "GetTickCount"; + init_resp->os_monotonic_time_info.locked_use = 0; + /* 10-16 ms resolution according to MicroSoft documentation */ + init_resp->os_monotonic_time_info.resolution = 100; /* 10 ms */ + time_unit = (ErtsMonotonicTime) 1000; + time_unit <<= ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; + internal_state.r.o.using_get_tick_count_time_unit = 1; + os_mtime_func = os_monotonic_time_gtc32; + os_times_func = os_times_gtc32; + init_resp->os_monotonic_time_info.extended = 1; + erts_init_os_monotonic_time_extender(&internal_state.wr.m.os_mtime_xtnd, + get_tick_count, + 60*60*24*7); /* Check once a week */ + if (!sys_hrtime_func) + sys_hrtime_func = sys_hrtime_gtc32; + } + else { + int major, minor, build; + + os_version(&major, &minor, &build); + + if (major < 6) { + + get_tick_count64: + + internal_state.r.o.pGetTickCount64 + = ((ULONGLONG (WINAPI *)(void)) + GetProcAddress(module, "GetTickCount64")); + if (!internal_state.r.o.pGetTickCount64) + goto get_tick_count; + + init_resp->os_monotonic_time_info.func = "GetTickCount64"; + init_resp->os_monotonic_time_info.locked_use = 0; + /* 10-16 ms resolution according to MicroSoft documentation */ + init_resp->os_monotonic_time_info.resolution = 100; /* 10 ms */ + time_unit = (ErtsMonotonicTime) 1000; + time_unit <<= ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; + internal_state.r.o.using_get_tick_count_time_unit = 1; + os_mtime_func = os_monotonic_time_gtc64; + os_times_func = os_times_gtc64; + if (!sys_hrtime_func) + sys_hrtime_func = sys_hrtime_gtc64; + } + else { /* Vista or newer... */ + + LARGE_INTEGER pf; + BOOL (WINAPI *QPF)(LARGE_INTEGER *); + + QPF = ((BOOL (WINAPI *)(LARGE_INTEGER *)) + GetProcAddress(module, "QueryPerformanceFrequency")); + if (!QPF) + goto get_tick_count64; + if (!(*QPF)(&pf)) + goto get_tick_count64; + + internal_state.r.o.pQueryPerformanceCounter + = ((BOOL (WINAPI *)(LARGE_INTEGER *)) + GetProcAddress(module, "QueryPerformanceCounter")); + if (!internal_state.r.o.pQueryPerformanceCounter) + goto get_tick_count64; + + if (pf.QuadPart > (((LONGLONG) 1) << 32)) + goto get_tick_count64; + + internal_state.r.o.pcf = (Uint32) pf.QuadPart; + sys_hrtime_func = sys_hrtime_qpc; + + /* + * We only use QueryPerformanceCounter() for + * os-monotonic-time if its frequency is equal + * to, or larger than GHz in order to ensure + * that the user wont be able to observe faulty + * order between values retrieved on different threads. + */ + if (pf.QuadPart < (LONGLONG) 1000*1000*1000) + goto get_tick_count64; + + if (ERTS_DISABLE_USE_OF_QPC_FOR_MONOTONIC_TIME) + goto get_tick_count64; + + init_resp->os_monotonic_time_info.func = "QueryPerformanceCounter"; + init_resp->os_monotonic_time_info.locked_use = 0; + time_unit = (ErtsMonotonicTime) pf.QuadPart; + internal_state.r.o.using_get_tick_count_time_unit = 0; + init_resp->os_monotonic_time_info.resolution = time_unit; + os_mtime_func = os_monotonic_time_qpc; + os_times_func = os_times_qpc; + } + } + + erts_sys_time_data__.r.o.os_monotonic_time = os_mtime_func; + erts_sys_time_data__.r.o.os_times = os_times_func; + erts_sys_time_data__.r.o.sys_hrtime = sys_hrtime_func; + init_resp->os_monotonic_time_unit = time_unit; + init_resp->have_os_monotonic_time = 1; + init_resp->have_corrected_os_monotonic_time = 0; + init_resp->sys_clock_resolution = 1; + + init_resp->os_system_time_info.func = "GetSystemTime"; + init_resp->os_system_time_info.clock_id = NULL; + init_resp->os_system_time_info.resolution = 100; + init_resp->os_system_time_info.locked_use = 0; if(GetTimeZoneInformation(&static_tzi) && static_tzi.StandardDate.wMonth != 0 && static_tzi.DaylightDate.wMonth != 0) { have_static_tzi = 1; } +} - erts_smp_mtx_init(&wrap_lock, "sys_gethrtime"); - - return 1; +void +erts_late_sys_init_time(void) +{ + if (erts_sys_time_data__.r.o.os_monotonic_time == os_monotonic_time_gtc32) + erts_late_init_os_monotonic_time_extender(&internal_state.wr.m.os_mtime_xtnd); } /* Returns a switchtimes for DST as UTC filetimes given data from a @@ -377,41 +700,27 @@ sys_gettimeofday(SysTimeval *tv) EPOCH_JULIAN_DIFF); } -extern int erts_initialized; -SysHrTime -sys_gethrtime(void) +ErtsSystemTime +erts_os_system_time(void) { - if (pGetTickCount64 != NULL) { - return ((SysHrTime) pGetTickCount64()) * LL_LITERAL(1000000); - } else { - DWORD ticks; - SysHrTime res; - erts_smp_mtx_lock(&wrap_lock); - ticks = (SysHrTime) (GetTickCount() & 0x7FFFFFFF); - if (ticks < (SysHrTime) last_tick_count) { - /* Detect a race that should no longer be here... */ - if ((((SysHrTime) last_tick_count) - ((SysHrTime) ticks)) > 1000) { - wrap += LL_LITERAL(1) << 31; - } else { - /* - * XXX Debug: Violates locking order, remove all this, - * after testing! - */ - erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); - erts_dsprintf(dsbufp, "Did not wrap when last_tick %d " - "and tick %d", - last_tick_count, ticks); - erts_send_error_to_logger_nogl(dsbufp); - ticks = last_tick_count; - } - } - last_tick_count = ticks; - res = ((((LONGLONG) ticks) + wrap) * LL_LITERAL(1000000)); - erts_smp_mtx_unlock(&wrap_lock); - return res; + SYSTEMTIME st; + ErtsSystemTime stime; + + GetSystemTime(&st); + stime = SystemTime2MilliSec(&st); + + if (internal_state.r.o.using_get_tick_count_time_unit) { + stime <<= ERTS_GET_TICK_COUNT_TIME_UNIT_SHIFT; + return stime; } + + return ((ErtsSystemTime) + erts_time_unit_conversion((Uint64) stime, + (Uint32) 1000, + internal_state.r.o.pcf)); } + clock_t sys_times(SysTimes *buffer) { clock_t kernel_ticks = (GetTickCount() / |