/*
* %CopyrightBegin%
*
* Copyright Ericsson AB 2006-2016. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* %CopyrightEnd%
*/
/*
* Description: Poll interface suitable for ERTS with or without
* SMP support.
*
* The interface is currently implemented using:
* - select
* - poll
* - /dev/poll
* - epoll with poll or select as fallback
* - kqueue with poll or select as fallback
*
* Some time in the future it will also be
* implemented using Solaris ports.
*
* Author: Rickard Green
*/
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#if defined(__DARWIN__) || defined(__APPLE__) && defined(__MACH__)
/* Setting _DARWIN_UNLIMITED_SELECT before including sys/select.h enables
* the version of select() that does not place a limit on the fd_set.
*/
# define _DARWIN_UNLIMITED_SELECT
#endif
#ifndef WANT_NONBLOCKING
# define WANT_NONBLOCKING
#endif
#include "erl_poll.h"
#if ERTS_POLL_USE_KQUEUE
# include <sys/types.h>
# include <sys/event.h>
# include <sys/time.h>
#endif
#if ERTS_POLL_USE_SELECT
# ifdef SYS_SELECT_H
# include <sys/select.h>
# endif
#endif
#ifdef NO_SYSCONF
# if ERTS_POLL_USE_SELECT
# include <sys/param.h>
# else
# include <limits.h>
# endif
#endif
#include "erl_thr_progress.h"
#include "erl_driver.h"
#include "erl_alloc.h"
#include "erl_msacc.h"
#include "erl_misc_utils.h"
#if !defined(ERTS_POLL_USE_EPOLL) \
&& !defined(ERTS_POLL_USE_DEVPOLL) \
&& !defined(ERTS_POLL_USE_POLL) \
&& !defined(ERTS_POLL_USE_SELECT)
#error "Missing implementation of erts_poll()"
#endif
#if defined(ERTS_KERNEL_POLL_VERSION) && !ERTS_POLL_USE_KERNEL_POLL
#error "Missing kernel poll implementation of erts_poll()"
#endif
#if defined(ERTS_NO_KERNEL_POLL_VERSION) && ERTS_POLL_USE_KERNEL_POLL
#error "Kernel poll used when it shouldn't be used"
#endif
#if 0
#define ERTS_POLL_DEBUG_PRINT
#endif
#ifdef _DARWIN_UNLIMITED_SELECT
typedef struct {
size_t sz;
fd_set* ptr;
} ERTS_fd_set;
# define ERTS_FD_CLR(fd, fds) FD_CLR((fd), (fds)->ptr)
# define ERTS_FD_SET(fd, fds) FD_SET((fd), (fds)->ptr)
# define ERTS_FD_ISSET(fd,fds) FD_ISSET((fd), (fds)->ptr)
# define ERTS_FD_ZERO(fds) memset((fds)->ptr, 0, (fds)->sz)
# define ERTS_FD_SIZE(n) ((((n)+NFDBITS-1)/NFDBITS)*sizeof(fd_mask))
static void ERTS_FD_COPY(ERTS_fd_set *src, ERTS_fd_set *dst)
{
if (dst->sz != src->sz) {
dst->ptr = dst->ptr
? erts_realloc(ERTS_ALC_T_SELECT_FDS, dst->ptr, src->sz)
: erts_alloc(ERTS_ALC_T_SELECT_FDS, src->sz);
dst->sz = src->sz;
}
memcpy(dst->ptr, src->ptr, src->sz);
}
static ERTS_INLINE
int ERTS_SELECT(int nfds, ERTS_fd_set *readfds, ERTS_fd_set *writefds,
ERTS_fd_set *exceptfds, struct timeval *timeout)
{
ASSERT(!readfds || readfds->sz >= ERTS_FD_SIZE(nfds));
ASSERT(!writefds || writefds->sz >= ERTS_FD_SIZE(nfds));
ASSERT(!exceptfds);
return select(nfds,
(readfds ? readfds->ptr : NULL ),
(writefds ? writefds->ptr : NULL),
NULL,
timeout);
}
#else /* !_DARWIN_UNLIMITED_SELECT */
# define ERTS_fd_set fd_set
# define ERTS_FD_CLR FD_CLR
# define ERTS_FD_ISSET FD_ISSET
# define ERTS_FD_SET FD_SET
# define ERTS_FD_ZERO FD_ZERO
# define ERTS_FD_COPY(src,dst) (*(dst) = *(src))
# define ERTS_SELECT select
#endif
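/*
* Illustrative sketch (not compiled in): with the wrappers above, code
* can wait for readability on a single fd in the same way whether or
* not _DARWIN_UNLIMITED_SELECT is in effect. The names 'my_fd' and
* 'readable' are hypothetical; in the unlimited-select case 'readable'
* is an ERTS_fd_set whose 'ptr' buffer must have been sized with
* ERTS_FD_SIZE() beforehand.
*
*   struct timeval tv = {0, 0};
*   ERTS_FD_SET(my_fd, &readable);
*   if (ERTS_SELECT(my_fd + 1, &readable, NULL, NULL, &tv) > 0
*       && ERTS_FD_ISSET(my_fd, &readable)) {
*       ... my_fd is ready for reading ...
*   }
*/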
#define ERTS_POLL_USE_BATCH_UPDATE_POLLSET (ERTS_POLL_USE_DEVPOLL \
|| ERTS_POLL_USE_KQUEUE)
#define ERTS_POLL_USE_CONCURRENT_UPDATE ERTS_POLL_USE_EPOLL
#define ERTS_POLL_COALESCE_KP_RES (ERTS_POLL_USE_KQUEUE || ERTS_POLL_USE_EPOLL)
#define ERTS_POLLSET_LOCK(PS) \
erts_mtx_lock(&(PS)->mtx)
#define ERTS_POLLSET_UNLOCK(PS) \
erts_mtx_unlock(&(PS)->mtx)
#define ERTS_POLLSET_SET_POLLED_CHK(PS) \
((int) erts_atomic32_xchg_nob(&(PS)->polled, (erts_aint32_t) 1))
#define ERTS_POLLSET_UNSET_POLLED(PS) \
erts_atomic32_set_nob(&(PS)->polled, (erts_aint32_t) 0)
#define ERTS_POLLSET_IS_POLLED(PS) \
((int) erts_atomic32_read_nob(&(PS)->polled))
#define ERTS_POLLSET_SET_HAVE_UPDATE_REQUESTS(PS) \
erts_atomic32_set_nob(&(PS)->have_update_requests, (erts_aint32_t) 1)
#define ERTS_POLLSET_UNSET_HAVE_UPDATE_REQUESTS(PS) \
erts_atomic32_set_nob(&(PS)->have_update_requests, (erts_aint32_t) 0)
#define ERTS_POLLSET_HAVE_UPDATE_REQUESTS(PS) \
((int) erts_atomic32_read_nob(&(PS)->have_update_requests))
#if ERTS_POLL_USE_FALLBACK
# if ERTS_POLL_USE_POLL
# define ERTS_POLL_NEED_FALLBACK(PS) ((PS)->no_poll_fds > 1)
# elif ERTS_POLL_USE_SELECT
# define ERTS_POLL_NEED_FALLBACK(PS) ((PS)->no_select_fds > 1)
# endif
#endif
/*
* --- Data types ------------------------------------------------------------
*/
#define ERTS_POLLSET_UPDATE_REQ_BLOCK_SIZE 128
typedef struct ErtsPollSetUpdateRequestsBlock_ ErtsPollSetUpdateRequestsBlock;
struct ErtsPollSetUpdateRequestsBlock_ {
ErtsPollSetUpdateRequestsBlock *next;
int len;
int fds[ERTS_POLLSET_UPDATE_REQ_BLOCK_SIZE];
};
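/*
* Update requests are queued per pollset as a singly linked list of the
* fixed-size blocks defined above. The first block is embedded in the
* pollset structure ('update_requests'); additional blocks are allocated
* on demand by enqueue_update_request() and released again by
* free_update_requests_block() once handle_update_requests() has drained
* them.
*/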
# define ERTS_POLL_FD_FLG_INURQ (((unsigned short) 1) << 0)
#if ERTS_POLL_USE_FALLBACK
# define ERTS_POLL_FD_FLG_INFLBCK (((unsigned short) 1) << 1)
# define ERTS_POLL_FD_FLG_USEFLBCK (((unsigned short) 1) << 2)
#endif
# define ERTS_POLL_FD_FLG_RST (((unsigned short) 1) << 3)
typedef struct {
#if ERTS_POLL_USE_POLL
int pix;
#endif
ErtsPollEvents used_events;
ErtsPollEvents events;
#if ERTS_POLL_COALESCE_KP_RES
unsigned short res_ev_ix;
#endif
unsigned short flags;
} ErtsFdStatus;
#if ERTS_POLL_COALESCE_KP_RES
/* res_ev_ix max value */
#define ERTS_POLL_MAX_RES ((1 << sizeof(unsigned short)*8) - 1)
#endif
#if ERTS_POLL_USE_KQUEUE
#define ERTS_POLL_KQ_OP_HANDLED 1
#define ERTS_POLL_KQ_OP_DEL_R 2
#define ERTS_POLL_KQ_OP_DEL_W 3
#define ERTS_POLL_KQ_OP_ADD_R 4
#define ERTS_POLL_KQ_OP_ADD_W 5
#define ERTS_POLL_KQ_OP_ADD2_R 6
#define ERTS_POLL_KQ_OP_ADD2_W 7
#endif
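/*
* The operation codes above are carried in the 'udata' field of each
* kevent that batch_update_pollset() writes. When kevent() reports an
* error for an entry, write_batch_buf() uses the code to decide how to
* recover, e.g. ignoring failed deletes or moving the fd to the
* fallback pollset after a failed add.
*/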
/*
* This struct is not really exported, but it's nice to
* get unique names in debugger for kp/nkp
*/
struct ERTS_POLL_EXPORT(erts_pollset) {
ErtsPollSet next;
int internal_fd_limit;
ErtsFdStatus *fds_status;
erts_atomic_t no_of_user_fds;
int fds_status_len;
#if ERTS_POLL_USE_KERNEL_POLL
int kp_fd;
int res_events_len;
#if ERTS_POLL_USE_EPOLL
struct epoll_event *res_events;
#elif ERTS_POLL_USE_KQUEUE
struct kevent *res_events;
#elif ERTS_POLL_USE_DEVPOLL
struct pollfd *res_events;
#endif
#endif /* ERTS_POLL_USE_KERNEL_POLL */
#if ERTS_POLL_USE_POLL
int next_poll_fds_ix;
int no_poll_fds;
int poll_fds_len;
struct pollfd *poll_fds;
#elif ERTS_POLL_USE_SELECT
int next_sel_fd;
int max_fd;
#if ERTS_POLL_USE_FALLBACK
int no_select_fds;
#endif
ERTS_fd_set input_fds;
ERTS_fd_set res_input_fds;
ERTS_fd_set output_fds;
ERTS_fd_set res_output_fds;
#endif
ErtsPollSetUpdateRequestsBlock update_requests;
ErtsPollSetUpdateRequestsBlock *curr_upd_req_block;
erts_atomic32_t have_update_requests;
erts_atomic32_t polled;
erts_mtx_t mtx;
int wake_fds[2];
#if ERTS_POLL_USE_TIMERFD
int timer_fd;
#endif
#if ERTS_POLL_USE_FALLBACK
int fallback_used;
#endif
erts_atomic32_t wakeup_state;
erts_atomic64_t timeout_time;
#ifdef ERTS_POLL_COUNT_AVOIDED_WAKEUPS
erts_atomic_t no_avoided_wakeups;
erts_atomic_t no_avoided_interrupts;
erts_atomic_t no_interrupt_timed;
#endif
};
void erts_silence_warn_unused_result(long unused);
static void fatal_error(char *format, ...);
static void fatal_error_async_signal_safe(char *error_str);
static int max_fds = -1;
static ErtsPollSet pollsets;
static erts_mtx_t pollsets_lock;
#if ERTS_POLL_USE_POLL
static ERTS_INLINE short
ev2pollev(ErtsPollEvents ev)
{
#if !ERTS_POLL_USE_FALLBACK || ERTS_POLL_USE_KQUEUE
return ERTS_POLL_EV_E2N(ev);
#else /* Note, we only map events we are interested in */
short res_ev = (short) 0;
if (ev & ERTS_POLL_EV_IN)
res_ev |= ERTS_POLL_EV_NKP_IN;
if (ev & ERTS_POLL_EV_OUT)
res_ev |= ERTS_POLL_EV_NKP_OUT;
return res_ev;
#endif
}
static ERTS_INLINE ErtsPollEvents
pollev2ev(short ev)
{
#if !ERTS_POLL_USE_FALLBACK || ERTS_POLL_USE_KQUEUE
return ERTS_POLL_EV_N2E(ev);
#else /* Note, we only map events we are interested in */
ErtsPollEvents res_ev = (ErtsPollEvents) 0;
if (ev & ERTS_POLL_EV_NKP_IN)
res_ev |= ERTS_POLL_EV_IN;
if (ev & ERTS_POLL_EV_NKP_OUT)
res_ev |= ERTS_POLL_EV_OUT;
if (ev & ERTS_POLL_EV_NKP_ERR)
res_ev |= ERTS_POLL_EV_ERR;
if (ev & ERTS_POLL_EV_NKP_NVAL)
res_ev |= ERTS_POLL_EV_NVAL;
return res_ev;
#endif
}
#endif
#ifdef HARD_DEBUG
static void check_poll_result(ErtsPollResFd pr[], int len);
#if ERTS_POLL_USE_DEVPOLL
static void check_poll_status(ErtsPollSet ps);
#endif /* ERTS_POLL_USE_DEVPOLL */
#endif /* HARD_DEBUG */
#ifdef ERTS_POLL_DEBUG_PRINT
static void print_misc_debug_info(void);
#endif
static ERTS_INLINE void
init_timeout_time(ErtsPollSet ps)
{
erts_atomic64_init_nob(&ps->timeout_time,
(erts_aint64_t) ERTS_MONOTONIC_TIME_MAX);
}
static ERTS_INLINE void
set_timeout_time(ErtsPollSet ps, ErtsMonotonicTime time)
{
erts_atomic64_set_relb(&ps->timeout_time,
(erts_aint64_t) time);
}
static ERTS_INLINE ErtsMonotonicTime
get_timeout_time(ErtsPollSet ps)
{
return (ErtsMonotonicTime) erts_atomic64_read_acqb(&ps->timeout_time);
}
#define ERTS_POLL_NOT_WOKEN 0
#define ERTS_POLL_WOKEN -1
#define ERTS_POLL_WOKEN_INTR 1
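/*
* Wakeup state machine: 'wakeup_state' is ERTS_POLL_NOT_WOKEN while a
* poller may still be sleeping, ERTS_POLL_WOKEN once it has been (or is
* about to be) woken without an interrupt request, and
* ERTS_POLL_WOKEN_INTR when the wakeup was an explicit interrupt.
* wake_poller() writes to the wakeup pipe only on the transition away
* from ERTS_POLL_NOT_WOKEN (except in the async-signal-safe case, where
* it always writes), and is_interrupted_reset() reads and resets the
* state when erts_poll_wait() is about to return.
*/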
static ERTS_INLINE void
reset_wakeup_state(ErtsPollSet ps)
{
erts_atomic32_set_mb(&ps->wakeup_state, ERTS_POLL_NOT_WOKEN);
}
static ERTS_INLINE int
is_woken(ErtsPollSet ps)
{
return erts_atomic32_read_acqb(&ps->wakeup_state) != ERTS_POLL_NOT_WOKEN;
}
static ERTS_INLINE int
is_interrupted_reset(ErtsPollSet ps)
{
return (erts_atomic32_xchg_acqb(&ps->wakeup_state, ERTS_POLL_NOT_WOKEN)
== ERTS_POLL_WOKEN_INTR);
}
static ERTS_INLINE void
woke_up(ErtsPollSet ps)
{
erts_aint32_t wakeup_state = erts_atomic32_read_acqb(&ps->wakeup_state);
if (wakeup_state == ERTS_POLL_NOT_WOKEN)
(void) erts_atomic32_cmpxchg_nob(&ps->wakeup_state,
ERTS_POLL_WOKEN,
ERTS_POLL_NOT_WOKEN);
ASSERT(erts_atomic32_read_nob(&ps->wakeup_state) != ERTS_POLL_NOT_WOKEN);
}
/*
* --- Wakeup pipe -----------------------------------------------------------
*/
static ERTS_INLINE void
wake_poller(ErtsPollSet ps, int interrupted, int async_signal_safe)
{
int wake;
if (async_signal_safe)
wake = 1;
else {
erts_aint32_t wakeup_state;
if (!interrupted)
wakeup_state = erts_atomic32_cmpxchg_relb(&ps->wakeup_state,
ERTS_POLL_WOKEN,
ERTS_POLL_NOT_WOKEN);
else
wakeup_state = erts_atomic32_xchg_relb(&ps->wakeup_state,
ERTS_POLL_WOKEN_INTR);
wake = wakeup_state == ERTS_POLL_NOT_WOKEN;
}
/*
* NOTE: This function might be called from signal handlers in the
* non-smp case; therefore, it has to be async-signal safe in
* the non-smp case.
*/
if (wake) {
ssize_t res;
if (ps->wake_fds[1] < 0)
return; /* Not initialized yet */
do {
/* write() is async-signal safe (according to POSIX) */
res = write(ps->wake_fds[1], "!", 1);
} while (res < 0 && errno == EINTR);
if (res <= 0 && errno != ERRNO_BLOCK) {
if (async_signal_safe)
fatal_error_async_signal_safe(__FILE__
":XXX:wake_poller(): "
"Failed to write on wakeup pipe\n");
else
fatal_error("%s:%d:wake_poller(): "
"Failed to write to wakeup pipe fd=%d: "
"%s (%d)\n",
__FILE__, __LINE__,
ps->wake_fds[1],
erl_errno_id(errno), errno);
}
}
}
static ERTS_INLINE void
cleanup_wakeup_pipe(ErtsPollSet ps)
{
int fd = ps->wake_fds[0];
int res;
do {
char buf[32];
res = read(fd, buf, sizeof(buf));
} while (res > 0 || (res < 0 && errno == EINTR));
if (res < 0 && errno != ERRNO_BLOCK) {
fatal_error("%s:%d:cleanup_wakeup_pipe(): "
"Failed to read on wakeup pipe fd=%d: "
"%s (%d)\n",
__FILE__, __LINE__,
fd,
erl_errno_id(errno), errno);
}
}
static void
create_wakeup_pipe(ErtsPollSet ps)
{
int do_wake = 0;
int wake_fds[2];
ps->wake_fds[0] = -1;
ps->wake_fds[1] = -1;
if (pipe(wake_fds) < 0) {
fatal_error("%s:%d:create_wakeup_pipe(): "
"Failed to create pipe: %s (%d)\n",
__FILE__,
__LINE__,
erl_errno_id(errno),
errno);
}
SET_NONBLOCKING(wake_fds[0]);
SET_NONBLOCKING(wake_fds[1]);
#ifdef ERTS_POLL_DEBUG_PRINT
erts_printf("wakeup fds = {%d, %d}\n", wake_fds[0], wake_fds[1]);
#endif
ERTS_POLL_EXPORT(erts_poll_control)(ps,
wake_fds[0],
ERTS_POLL_EV_IN,
1, &do_wake);
#if ERTS_POLL_USE_FALLBACK
/* We depend on the wakeup pipe being handled by kernel poll */
if (ps->fds_status[wake_fds[0]].flags & ERTS_POLL_FD_FLG_INFLBCK)
fatal_error("%s:%d:create_wakeup_pipe(): Internal error\n",
__FILE__, __LINE__);
#endif
if (ps->internal_fd_limit <= wake_fds[1])
ps->internal_fd_limit = wake_fds[1] + 1;
if (ps->internal_fd_limit <= wake_fds[0])
ps->internal_fd_limit = wake_fds[0] + 1;
ps->wake_fds[0] = wake_fds[0];
ps->wake_fds[1] = wake_fds[1];
}
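/*
* Summary of the wakeup mechanism: the read end of the pipe,
* wake_fds[0], is part of the pollset itself, so a sleeping poller is
* woken as soon as wake_poller() writes a single byte to wake_fds[1].
* cleanup_wakeup_pipe() drains the read end once the wakeup has been
* observed. Both ends are non-blocking, so neither a full nor an empty
* pipe can block the emulator.
*/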
/*
* --- timer fd -----------------------------------------------------------
*/
#if ERTS_POLL_USE_TIMERFD
/* We use a timerfd together with epoll_wait() to get high-accuracy
timeouts, i.e. when we want to sleep with sub-millisecond accuracy. */
static void
create_timerfd(ErtsPollSet ps)
{
int do_wake = 0;
int timer_fd;
timer_fd = timerfd_create(CLOCK_MONOTONIC,0);
ERTS_POLL_EXPORT(erts_poll_control)(ps,
timer_fd,
ERTS_POLL_EV_IN,
1, &do_wake);
#if ERTS_POLL_USE_FALLBACK
/* We depend on the timer fd being handled by kernel poll */
if (ps->fds_status[timer_fd].flags & ERTS_POLL_FD_FLG_INFLBCK)
fatal_error("%s:%d:create_wakeup_pipe(): Internal error\n",
__FILE__, __LINE__);
#endif
if (ps->internal_fd_limit <= timer_fd)
ps->internal_fd_limit = timer_fd + 1;
ps->timer_fd = timer_fd;
}
static ERTS_INLINE void
timerfd_set(ErtsPollSet ps, struct itimerspec *its)
{
#ifdef DEBUG
struct itimerspec old_its;
int res;
res = timerfd_settime(ps->timer_fd, 0, its, &old_its);
ASSERT(res == 0);
ASSERT(old_its.it_interval.tv_sec == 0 &&
old_its.it_interval.tv_nsec == 0 &&
old_its.it_value.tv_sec == 0 &&
old_its.it_value.tv_nsec == 0);
#else
timerfd_settime(ps->timer_fd, 0, its, NULL);
#endif
}
static ERTS_INLINE int
timerfd_clear(ErtsPollSet ps, int res, int max_res) {
struct itimerspec its;
/* we always have to clear the timer */
its.it_interval.tv_sec = 0;
its.it_interval.tv_nsec = 0;
its.it_value.tv_sec = 0;
its.it_value.tv_nsec = 0;
timerfd_settime(ps->timer_fd, 0, &its, NULL);
/* only timeout fd triggered */
if (res == 1 && ps->res_events[0].data.fd == ps->timer_fd)
return 0;
return res;
}
#endif /* ERTS_POLL_USE_TIMERFD */
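/*
* Illustrative sketch (not compiled in) of how the timer fd is used by
* check_fd_events() when epoll is the kernel poll implementation and a
* timeout has been requested; 'its' is an itimerspec filled in by
* get_timeout_itimerspec():
*
*   timerfd_set(ps, &its);                           arm one-shot timer
*   res = epoll_wait(ps->kp_fd, ps->res_events,
*                    max_res, -1);                   sleep; no ms rounding
*   res = timerfd_clear(ps, res, max_res);           disarm; drop pure timeouts
*
* This gives sub-millisecond timeout accuracy, which the millisecond
* timeout argument of plain epoll_wait() cannot provide.
*/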
/*
* --- Poll set update requests ----------------------------------------------
*/
static ERTS_INLINE void
enqueue_update_request(ErtsPollSet ps, int fd)
{
ErtsPollSetUpdateRequestsBlock *urqbp;
ASSERT(fd < ps->fds_status_len);
if (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INURQ)
return;
if (ps->update_requests.len == 0)
ERTS_POLLSET_SET_HAVE_UPDATE_REQUESTS(ps);
urqbp = ps->curr_upd_req_block;
if (urqbp->len == ERTS_POLLSET_UPDATE_REQ_BLOCK_SIZE) {
ASSERT(!urqbp->next);
urqbp = erts_alloc(ERTS_ALC_T_POLLSET_UPDREQ,
sizeof(ErtsPollSetUpdateRequestsBlock));
ps->curr_upd_req_block->next = urqbp;
ps->curr_upd_req_block = urqbp;
urqbp->next = NULL;
urqbp->len = 0;
}
ps->fds_status[fd].flags |= ERTS_POLL_FD_FLG_INURQ;
urqbp->fds[urqbp->len++] = fd;
}
static ERTS_INLINE void
free_update_requests_block(ErtsPollSet ps,
ErtsPollSetUpdateRequestsBlock *urqbp)
{
if (urqbp != &ps->update_requests)
erts_free(ERTS_ALC_T_POLLSET_UPDREQ, (void *) urqbp);
else {
urqbp->next = NULL;
urqbp->len = 0;
}
}
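/*
* Life cycle of an update request: poll_control() records the wanted
* event change in fds_status[fd] and calls enqueue_update_request(),
* which adds the fd to the current block at most once (guarded by
* ERTS_POLL_FD_FLG_INURQ). The poller later calls
* handle_update_requests(), which walks all blocks, applies the changes
* to the underlying poll mechanism, and releases extra blocks via
* free_update_requests_block().
*/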
/*
* --- Growing poll set structures -------------------------------------------
*/
#ifndef ERTS_KERNEL_POLL_VERSION /* only one shared implementation */
#define ERTS_FD_TABLE_MIN_LENGTH 1024
#define ERTS_FD_TABLE_EXP_THRESHOLD (2048*1024)
int erts_poll_new_table_len (int old_len, int need_len)
{
int new_len;
ASSERT(need_len > old_len);
if (need_len < ERTS_FD_TABLE_MIN_LENGTH) {
new_len = ERTS_FD_TABLE_MIN_LENGTH;
}
else {
new_len = old_len;
do {
if (new_len < ERTS_FD_TABLE_EXP_THRESHOLD)
new_len *= 2;
else
new_len += ERTS_FD_TABLE_EXP_THRESHOLD;
} while (new_len < need_len);
}
ASSERT(new_len >= need_len);
return new_len;
}
#endif
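/*
* Growth policy example: tables start at ERTS_FD_TABLE_MIN_LENGTH (1024)
* entries and double until they reach ERTS_FD_TABLE_EXP_THRESHOLD
* (2048*1024) entries, after which they grow linearly by the threshold.
* For instance, old_len = 1024 and need_len = 5000 gives
* 1024 -> 2048 -> 4096 -> 8192, i.e. a new length of 8192.
*/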
#if ERTS_POLL_USE_KERNEL_POLL
static void
grow_res_events(ErtsPollSet ps, int new_len)
{
size_t new_size = sizeof(
#if ERTS_POLL_USE_EPOLL
struct epoll_event
#elif ERTS_POLL_USE_DEVPOLL
struct pollfd
#elif ERTS_POLL_USE_KQUEUE
struct kevent
#endif
) * erts_poll_new_table_len(ps->res_events_len, new_len);
/* We do not need to save previously stored data */
if (ps->res_events)
erts_free(ERTS_ALC_T_POLL_RES_EVS, ps->res_events);
ps->res_events = erts_alloc(ERTS_ALC_T_POLL_RES_EVS, new_size);
ps->res_events_len = new_len;
}
#endif /* ERTS_POLL_USE_KERNEL_POLL */
#if ERTS_POLL_USE_POLL
static void
grow_poll_fds(ErtsPollSet ps, int min_ix)
{
int i;
int new_len = erts_poll_new_table_len(ps->poll_fds_len, min_ix + 1);
if (new_len > max_fds)
new_len = max_fds;
ps->poll_fds = (ps->poll_fds_len
? erts_realloc(ERTS_ALC_T_POLL_FDS,
ps->poll_fds,
sizeof(struct pollfd)*new_len)
: erts_alloc(ERTS_ALC_T_POLL_FDS,
sizeof(struct pollfd)*new_len));
for (i = ps->poll_fds_len; i < new_len; i++) {
ps->poll_fds[i].fd = -1;
ps->poll_fds[i].events = (short) 0;
ps->poll_fds[i].revents = (short) 0;
}
ps->poll_fds_len = new_len;
}
#endif
#ifdef _DARWIN_UNLIMITED_SELECT
static void
grow_select_fds(int fd, ERTS_fd_set* fds)
{
int new_len = erts_poll_new_table_len(fds->sz, fd + 1);
if (new_len > max_fds)
new_len = max_fds;
new_len = ERTS_FD_SIZE(new_len);
fds->ptr = fds->sz
? erts_realloc(ERTS_ALC_T_SELECT_FDS, fds->ptr, new_len)
: erts_alloc(ERTS_ALC_T_SELECT_FDS, new_len);
memset((char*)fds->ptr + fds->sz, 0, new_len - fds->sz);
fds->sz = new_len;
}
static ERTS_INLINE void
ensure_select_fds(int fd, ERTS_fd_set* in, ERTS_fd_set* out)
{
ASSERT(in->sz == out->sz);
if (ERTS_FD_SIZE(fd+1) > in->sz) {
grow_select_fds(fd, in);
grow_select_fds(fd, out);
}
}
#else
# define ensure_select_fds(fd, in, out) do {} while(0)
#endif /* _DARWIN_UNLIMITED_SELECT */
static void
grow_fds_status(ErtsPollSet ps, int min_fd)
{
int i;
int new_len = erts_poll_new_table_len(ps->fds_status_len, min_fd + 1);
ASSERT(min_fd < max_fds);
if (new_len > max_fds)
new_len = max_fds;
ps->fds_status = (ps->fds_status_len
? erts_realloc(ERTS_ALC_T_FD_STATUS,
ps->fds_status,
sizeof(ErtsFdStatus)*new_len)
: erts_alloc(ERTS_ALC_T_FD_STATUS,
sizeof(ErtsFdStatus)*new_len));
for (i = ps->fds_status_len; i < new_len; i++) {
#if ERTS_POLL_USE_POLL
ps->fds_status[i].pix = -1;
#endif
ps->fds_status[i].used_events = (ErtsPollEvents) 0;
ps->fds_status[i].events = (ErtsPollEvents) 0;
#if ERTS_POLL_COALESCE_KP_RES
ps->fds_status[i].res_ev_ix = (unsigned short) ERTS_POLL_MAX_RES;
#endif
ps->fds_status[i].flags = (unsigned short) 0;
}
ps->fds_status_len = new_len;
}
/*
* --- Selecting fd to poll on -----------------------------------------------
*/
#if ERTS_POLL_USE_FALLBACK
static int update_fallback_pollset(ErtsPollSet ps, int fd);
#endif
static ERTS_INLINE int
need_update(ErtsPollSet ps, int fd)
{
#if ERTS_POLL_USE_KERNEL_POLL
int reset;
#endif
ASSERT(fd < ps->fds_status_len);
#if ERTS_POLL_USE_KERNEL_POLL
reset = (int) (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_RST);
if (reset && !ps->fds_status[fd].used_events) {
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_RST;
reset = 0;
}
#else
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_RST;
#endif
if (ps->fds_status[fd].used_events != ps->fds_status[fd].events)
return 1;
#if ERTS_POLL_USE_KERNEL_POLL
return reset;
#else
return 0;
#endif
}
#if ERTS_POLL_USE_BATCH_UPDATE_POLLSET
#if ERTS_POLL_USE_KQUEUE
#define ERTS_POLL_MIN_BATCH_BUF_SIZE 128
#else
#define ERTS_POLL_MIN_BATCH_BUF_SIZE 64
#endif
typedef struct {
int len;
int size;
#if ERTS_POLL_USE_DEVPOLL
struct pollfd *buf;
#elif ERTS_POLL_USE_KQUEUE
struct kevent *buf;
struct kevent *ebuf;
#endif
} ErtsPollBatchBuf;
static ERTS_INLINE void
setup_batch_buf(ErtsPollSet ps, ErtsPollBatchBuf *bbp)
{
bbp->len = 0;
#if ERTS_POLL_USE_DEVPOLL
bbp->size = ps->res_events_len;
bbp->buf = ps->res_events;
#elif ERTS_POLL_USE_KQUEUE
bbp->size = ps->res_events_len/2;
bbp->buf = ps->res_events;
bbp->ebuf = bbp->buf + bbp->size;
#endif
}
#if ERTS_POLL_USE_DEVPOLL
static void
write_batch_buf(ErtsPollSet ps, ErtsPollBatchBuf *bbp)
{
ssize_t wres;
char *buf = (char *) bbp->buf;
size_t buf_size = sizeof(struct pollfd)*bbp->len;
while (1) {
wres = write(ps->kp_fd, (void *) buf, buf_size);
if (wres < 0) {
if (errno == EINTR)
continue;
fatal_error("%s:%d:write_batch_buf(): "
"Failed to write to /dev/poll: "
"%s (%d)\n",
__FILE__, __LINE__,
erl_errno_id(errno), errno);
}
buf_size -= wres;
if (buf_size <= 0)
break;
buf += wres;
}
if (buf_size < 0) {
fatal_error("%s:%d:write_devpoll_buf(): Internal error\n",
__FILE__, __LINE__);
}
bbp->len = 0;
}
#elif ERTS_POLL_USE_KQUEUE
static void
write_batch_buf(ErtsPollSet ps, ErtsPollBatchBuf *bbp)
{
int res;
int len = bbp->len;
struct kevent *buf = bbp->buf;
struct timespec ts = {0, 0};
do {
res = kevent(ps->kp_fd, buf, len, NULL, 0, &ts);
} while (res < 0 && errno == EINTR);
if (res < 0) {
int i;
struct kevent *ebuf = bbp->ebuf;
do {
res = kevent(ps->kp_fd, buf, len, ebuf, len, &ts);
} while (res < 0 && errno == EINTR);
if (res < 0) {
fatal_error("%s:%d: kevent() failed: %s (%d)\n",
__FILE__, __LINE__, erl_errno_id(errno), errno);
}
for (i = 0; i < res; i++) {
if (ebuf[i].flags & EV_ERROR) {
short filter;
int fd = (int) ebuf[i].ident;
switch ((int) (long) ebuf[i].udata) {
/*
* Since we use a lazy update approach, EV_DELETE will
* frequently fail; this is because kqueue automatically
* removes a file descriptor from the poll set when it is
* closed.
*/
case ERTS_POLL_KQ_OP_DEL_R:
case ERTS_POLL_KQ_OP_DEL_W:
case ERTS_POLL_KQ_OP_HANDLED:
break;
/*
* According to the kqueue man page EVFILT_READ support
* does not imply EVFILT_WRITE support; therefore,
* if an EV_ADD fails, we may have to remove other
* events on this fd in the kqueue pollset before
* adding fd to the fallback pollset.
*/
case ERTS_POLL_KQ_OP_ADD_W:
if (ps->fds_status[fd].used_events & ERTS_POLL_EV_IN) {
filter = EVFILT_READ;
goto rm_add_fb;
}
goto add_fb;
case ERTS_POLL_KQ_OP_ADD_R:
if (ps->fds_status[fd].used_events & ERTS_POLL_EV_OUT) {
filter = EVFILT_WRITE;
goto rm_add_fb;
}
goto add_fb;
case ERTS_POLL_KQ_OP_ADD2_W:
case ERTS_POLL_KQ_OP_ADD2_R: {
int j;
for (j = i+1; j < res; j++) {
if (fd == (int) ebuf[j].ident) {
ebuf[j].udata = (void *) ERTS_POLL_KQ_OP_HANDLED;
if (!(ebuf[j].flags & EV_ERROR)) {
switch ((int) (long) ebuf[j].udata) {
case ERTS_POLL_KQ_OP_ADD2_W:
filter = EVFILT_WRITE;
goto rm_add_fb;
case ERTS_POLL_KQ_OP_ADD2_R:
filter = EVFILT_READ;
goto rm_add_fb;
default:
fatal_error("%s:%d:write_batch_buf(): "
"Internal error",
__FILE__, __LINE__);
break;
}
}
goto add_fb;
}
}
/* The other add succeeded... */
filter = ((((int) (long) ebuf[i].udata)
== ERTS_POLL_KQ_OP_ADD2_W)
? EVFILT_READ
: EVFILT_WRITE);
rm_add_fb:
{
struct kevent kev;
struct timespec ts = {0, 0};
EV_SET(&kev, fd, filter, EV_DELETE, 0, 0, 0);
(void) kevent(ps->kp_fd, &kev, 1, NULL, 0, &ts);
}
add_fb:
ps->fds_status[fd].flags |= ERTS_POLL_FD_FLG_USEFLBCK;
ASSERT(ps->fds_status[fd].used_events);
ps->fds_status[fd].used_events = 0;
erts_atomic_dec_nob(&ps->no_of_user_fds);
update_fallback_pollset(ps, fd);
ASSERT(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK);
break;
}
default:
fatal_error("%s:%d:write_batch_buf(): Internal error",
__FILE__, __LINE__);
break;
}
}
}
}
bbp->len = 0;
}
#endif /* ERTS_POLL_USE_KQUEUE */
static ERTS_INLINE void
batch_update_pollset(ErtsPollSet ps, int fd, ErtsPollBatchBuf *bbp)
{
int buf_len;
#if ERTS_POLL_USE_DEVPOLL
short events;
struct pollfd *buf;
#elif ERTS_POLL_USE_KQUEUE
struct kevent *buf;
#endif
#ifdef ERTS_POLL_DEBUG_PRINT
erts_printf("Doing lazy update on fd=%d\n", fd);
#endif
if (!need_update(ps, fd))
return;
/* Make sure we have room for at least the maximum number of
entries per fd */
if (bbp->size - bbp->len < 2)
write_batch_buf(ps, bbp);
buf_len = bbp->len;
buf = bbp->buf;
ASSERT(fd < ps->fds_status_len);
#if ERTS_POLL_USE_DEVPOLL
events = ERTS_POLL_EV_E2N(ps->fds_status[fd].events);
if (!events) {
buf[buf_len].events = POLLREMOVE;
erts_atomic_dec_nob(&ps->no_of_user_fds);
}
else if (!ps->fds_status[fd].used_events) {
buf[buf_len].events = events;
erts_atomic_inc_nob(&ps->no_of_user_fds);
}
else {
if ((ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_RST)
|| (ps->fds_status[fd].used_events & ~events)) {
/* Reset or removed events... */
buf[buf_len].fd = fd;
buf[buf_len].events = POLLREMOVE;
buf[buf_len++].revents = 0;
}
buf[buf_len].events = events;
}
buf[buf_len].fd = fd;
buf[buf_len++].revents = 0;
#elif ERTS_POLL_USE_KQUEUE
if (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK) {
if (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_USEFLBCK)
update_fallback_pollset(ps, fd);
else { /* Remove from fallback and try kqueue */
ErtsPollEvents events = ps->fds_status[fd].events;
ps->fds_status[fd].events = (ErtsPollEvents) 0;
update_fallback_pollset(ps, fd);
ASSERT(!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK));
if (events) {
ps->fds_status[fd].events = events;
goto try_kqueue;
}
}
}
else {
ErtsPollEvents events, used_events;
int mod_w, mod_r;
try_kqueue:
events = ERTS_POLL_EV_E2N(ps->fds_status[fd].events);
used_events = ERTS_POLL_EV_E2N(ps->fds_status[fd].used_events);
if (!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_RST)) {
if (!used_events &&
(events & ERTS_POLL_EV_IN) && (events & ERTS_POLL_EV_OUT))
goto do_add_rw;
mod_r = ((events & ERTS_POLL_EV_IN)
!= (used_events & ERTS_POLL_EV_IN));
mod_w = ((events & ERTS_POLL_EV_OUT)
!= (used_events & ERTS_POLL_EV_OUT));
goto do_mod;
}
else { /* Reset */
if ((events & ERTS_POLL_EV_IN) && (events & ERTS_POLL_EV_OUT)) {
do_add_rw:
EV_SET(&buf[buf_len], fd, EVFILT_READ, EV_ADD,
0, 0, (void *) ERTS_POLL_KQ_OP_ADD2_R);
buf_len++;
EV_SET(&buf[buf_len], fd, EVFILT_WRITE, EV_ADD,
0, 0, (void *) ERTS_POLL_KQ_OP_ADD2_W);
buf_len++;
}
else {
mod_r = 1;
mod_w = 1;
do_mod:
if (mod_r) {
if (events & ERTS_POLL_EV_IN) {
EV_SET(&buf[buf_len], fd, EVFILT_READ, EV_ADD,
0, 0, (void *) ERTS_POLL_KQ_OP_ADD_R);
buf_len++;
}
else if (used_events & ERTS_POLL_EV_IN) {
EV_SET(&buf[buf_len], fd, EVFILT_READ, EV_DELETE,
0, 0, (void *) ERTS_POLL_KQ_OP_DEL_R);
buf_len++;
}
}
if (mod_w) {
if (events & ERTS_POLL_EV_OUT) {
EV_SET(&buf[buf_len], fd, EVFILT_WRITE, EV_ADD,
0, 0, (void *) ERTS_POLL_KQ_OP_ADD_W);
buf_len++;
}
else if (used_events & ERTS_POLL_EV_OUT) {
EV_SET(&buf[buf_len], fd, EVFILT_WRITE, EV_DELETE,
0, 0, (void *) ERTS_POLL_KQ_OP_DEL_W);
buf_len++;
}
}
}
}
if (used_events) {
if (!events) {
erts_atomic_dec_nob(&ps->no_of_user_fds);
}
}
else {
if (events)
erts_atomic_inc_nob(&ps->no_of_user_fds);
}
ASSERT((events & ~(ERTS_POLL_EV_IN|ERTS_POLL_EV_OUT)) == 0);
ASSERT((used_events & ~(ERTS_POLL_EV_IN|ERTS_POLL_EV_OUT)) == 0);
}
#endif
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_RST;
ps->fds_status[fd].used_events = ps->fds_status[fd].events;
bbp->len = buf_len;
}
#else /* !ERTS_POLL_USE_BATCH_UPDATE_POLLSET */
#if ERTS_POLL_USE_EPOLL
static int
#if ERTS_POLL_USE_CONCURRENT_UPDATE
conc_update_pollset(ErtsPollSet ps, int fd, int *update_fallback)
#else
update_pollset(ErtsPollSet ps, int fd)
#endif
{
int res;
int op;
struct epoll_event epe_templ;
struct epoll_event epe;
ASSERT(fd < ps->fds_status_len);
if (!need_update(ps, fd))
return 0;
#ifdef ERTS_POLL_DEBUG_PRINT
erts_printf("Doing update on fd=%d\n", fd);
#endif
if (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK) {
#if ERTS_POLL_USE_CONCURRENT_UPDATE
if (!*update_fallback) {
*update_fallback = 1;
return 0;
}
#endif
if (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_USEFLBCK) {
return update_fallback_pollset(ps, fd);
}
else { /* Remove from fallback and try epoll */
ErtsPollEvents events = ps->fds_status[fd].events;
ps->fds_status[fd].events = (ErtsPollEvents) 0;
res = update_fallback_pollset(ps, fd);
ASSERT(!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK));
if (!events)
return res;
ps->fds_status[fd].events = events;
}
}
epe_templ.events = ERTS_POLL_EV_E2N(ps->fds_status[fd].events);
epe_templ.data.fd = fd;
#ifdef VALGRIND
/* Silence invalid valgrind warning ... */
memset((void *) &epe.data, 0, sizeof(epoll_data_t));
#endif
if (epe_templ.events && ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_RST) {
do {
/* We init 'epe' every time since epoll_ctl() may modify it
(not declared const and not documented as const). */
epe.events = epe_templ.events;
epe.data.fd = epe_templ.data.fd;
res = epoll_ctl(ps->kp_fd, EPOLL_CTL_DEL, fd, &epe);
} while (res != 0 && errno == EINTR);
erts_atomic_dec_nob(&ps->no_of_user_fds);
ps->fds_status[fd].used_events = 0;
}
if (!epe_templ.events) {
/* A note on EPOLL_CTL_DEL: Linux kernel versions before 2.6.9
need a non-NULL event pointer even though it is ignored... */
op = EPOLL_CTL_DEL;
erts_atomic_dec_nob(&ps->no_of_user_fds);
}
else if (!ps->fds_status[fd].used_events) {
op = EPOLL_CTL_ADD;
erts_atomic_inc_nob(&ps->no_of_user_fds);
}
else {
op = EPOLL_CTL_MOD;
}
do {
/* We init 'epe' every time since epoll_ctl() may modify it
(not declared const and not documented as const). */
epe.events = epe_templ.events;
epe.data.fd = epe_templ.data.fd;
res = epoll_ctl(ps->kp_fd, op, fd, &epe);
} while (res != 0 && errno == EINTR);
#if defined(ERTS_POLL_DEBUG_PRINT) && 1
{
int saved_errno = errno;
erts_printf("%s = epoll_ctl(%d, %s, %d, {Ox%x, %d})\n",
res == 0 ? "0" : erl_errno_id(errno),
ps->kp_fd,
(op == EPOLL_CTL_ADD
? "EPOLL_CTL_ADD"
: (op == EPOLL_CTL_MOD
? "EPOLL_CTL_MOD"
: (op == EPOLL_CTL_DEL
? "EPOLL_CTL_DEL"
: "UNKNOWN"))),
fd,
epe_templ.events,
fd);
errno = saved_errno;
}
#endif
if (res == 0)
ps->fds_status[fd].used_events = ps->fds_status[fd].events;
else {
switch (op) {
case EPOLL_CTL_MOD:
epe.events = 0;
do {
/* We init 'epe' every time since epoll_ctl() may modify it
(not declared const and not documented as const). */
epe.events = 0;
epe.data.fd = fd;
res = epoll_ctl(ps->kp_fd, EPOLL_CTL_DEL, fd, &epe);
} while (res != 0 && errno == EINTR);
ps->fds_status[fd].used_events = 0;
/* Fall through ... */
case EPOLL_CTL_ADD: {
ps->fds_status[fd].flags |= ERTS_POLL_FD_FLG_USEFLBCK;
erts_atomic_dec_nob(&ps->no_of_user_fds);
#if ERTS_POLL_USE_CONCURRENT_UPDATE
if (!*update_fallback) {
*update_fallback = 1;
return 0;
}
#endif
ASSERT(!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK));
res = update_fallback_pollset(ps, fd);
ASSERT(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK);
break;
}
case EPOLL_CTL_DEL: {
/*
* Since we use a lazy update approach, EPOLL_CTL_DEL will
* frequently fail; this is because epoll automatically removes
* a file descriptor from the poll set when it is closed.
*/
ps->fds_status[fd].used_events = 0;
res = 0;
break;
}
default:
fatal_error("%s:%d:update_pollset(): Internal error\n",
__FILE__, __LINE__);
break;
}
}
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_RST;
return res;
}
#if ERTS_POLL_USE_CONCURRENT_UPDATE
static int
update_pollset(ErtsPollSet ps, int fd)
{
int update_fallback = 1;
return conc_update_pollset(ps, fd, &update_fallback);
}
#endif
#endif /* ERTS_POLL_USE_EPOLL */
#endif /* ERTS_POLL_USE_BATCH_UPDATE_POLLSET */
#if ERTS_POLL_USE_POLL || ERTS_POLL_USE_SELECT || ERTS_POLL_USE_FALLBACK
#if ERTS_POLL_USE_FALLBACK
static int update_fallback_pollset(ErtsPollSet ps, int fd)
#else
static int update_pollset(ErtsPollSet ps, int fd)
#endif
{
#ifdef ERTS_POLL_DEBUG_PRINT
#if ERTS_POLL_USE_FALLBACK
erts_printf("Doing fallback update on fd=%d\n", fd);
#else
erts_printf("Doing update on fd=%d\n", fd);
#endif
#endif
ASSERT(fd < ps->fds_status_len);
#if ERTS_POLL_USE_FALLBACK
ASSERT(ps->fds_status[fd].used_events
? (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK)
: (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_USEFLBCK));
#endif
if (!need_update(ps, fd))
return 0;
#if ERTS_POLL_USE_FALLBACK
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_RST;
#endif
#if ERTS_POLL_USE_POLL /* --- poll -------------------------------- */
if (!ps->fds_status[fd].events) {
int pix = ps->fds_status[fd].pix;
int last_pix;
if (pix < 0) {
#if ERTS_POLL_USE_FALLBACK
ASSERT(!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK));
#endif
return -1;
}
#if ERTS_POLL_USE_FALLBACK
ASSERT(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK);
#endif
erts_atomic_dec_nob(&ps->no_of_user_fds);
last_pix = --ps->no_poll_fds;
if (pix != last_pix) {
/* Move last pix to this pix */
ps->poll_fds[pix].fd = ps->poll_fds[last_pix].fd;
ps->poll_fds[pix].events = ps->poll_fds[last_pix].events;
ps->poll_fds[pix].revents = ps->poll_fds[last_pix].revents;
ps->fds_status[ps->poll_fds[pix].fd].pix = pix;
}
/* Clear last pix */
ps->poll_fds[last_pix].fd = -1;
ps->poll_fds[last_pix].events = (short) 0;
ps->poll_fds[last_pix].revents = (short) 0;
/* Clear this fd status */
ps->fds_status[fd].pix = -1;
ps->fds_status[fd].used_events = (ErtsPollEvents) 0;
#if ERTS_POLL_USE_FALLBACK
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_INFLBCK;
#endif
}
else {
int pix = ps->fds_status[fd].pix;
if (pix < 0) {
#if ERTS_POLL_USE_FALLBACK
ASSERT(!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK)
|| fd == ps->kp_fd);
#endif
erts_atomic_inc_nob(&ps->no_of_user_fds);
ps->fds_status[fd].pix = pix = ps->no_poll_fds++;
if (pix >= ps->poll_fds_len)
grow_poll_fds(ps, pix);
ps->poll_fds[pix].fd = fd;
ps->fds_status[fd].pix = pix;
#if ERTS_POLL_USE_FALLBACK
ps->fds_status[fd].flags |= ERTS_POLL_FD_FLG_INFLBCK;
#endif
}
#if ERTS_POLL_USE_FALLBACK
ASSERT(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK);
#endif
/* Events to be used in next poll */
ps->poll_fds[pix].events = ev2pollev(ps->fds_status[fd].events);
if (ps->poll_fds[pix].revents) {
/* Remove result events that we should not poll for anymore */
ps->poll_fds[pix].revents
&= ev2pollev(~(~ps->fds_status[fd].used_events
& ps->fds_status[fd].events));
}
/* Save events to be used in next poll */
ps->fds_status[fd].used_events = ps->fds_status[fd].events;
}
return 0;
#elif ERTS_POLL_USE_SELECT /* --- select ------------------------------ */
{
ErtsPollEvents events = ps->fds_status[fd].events;
ensure_select_fds(fd, &ps->input_fds, &ps->output_fds);
if ((ERTS_POLL_EV_IN & events)
!= (ERTS_POLL_EV_IN & ps->fds_status[fd].used_events)) {
if (ERTS_POLL_EV_IN & events) {
ERTS_FD_SET(fd, &ps->input_fds);
}
else {
ERTS_FD_CLR(fd, &ps->input_fds);
}
}
if ((ERTS_POLL_EV_OUT & events)
!= (ERTS_POLL_EV_OUT & ps->fds_status[fd].used_events)) {
if (ERTS_POLL_EV_OUT & events) {
ERTS_FD_SET(fd, &ps->output_fds);
}
else {
ERTS_FD_CLR(fd, &ps->output_fds);
}
}
if (!ps->fds_status[fd].used_events) {
ASSERT(events);
erts_atomic_inc_nob(&ps->no_of_user_fds);
#if ERTS_POLL_USE_FALLBACK
ps->no_select_fds++;
ps->fds_status[fd].flags |= ERTS_POLL_FD_FLG_INFLBCK;
#endif
}
else if (!events) {
ASSERT(ps->fds_status[fd].used_events);
erts_atomic_dec_nob(&ps->no_of_user_fds);
ps->fds_status[fd].events = events;
#if ERTS_POLL_USE_FALLBACK
ps->no_select_fds--;
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_INFLBCK;
#endif
}
ps->fds_status[fd].used_events = events;
if (events && fd > ps->max_fd)
ps->max_fd = fd;
else if (!events && fd == ps->max_fd) {
int max = ps->max_fd;
for (max = ps->max_fd; max >= 0; max--)
if (ps->fds_status[max].used_events)
break;
ps->max_fd = max;
}
}
return 0;
#endif
}
#endif /* ERTS_POLL_USE_POLL || ERTS_POLL_USE_SELECT || ERTS_POLL_USE_FALLBACK */
static void
handle_update_requests(ErtsPollSet ps)
{
ErtsPollSetUpdateRequestsBlock *urqbp = &ps->update_requests;
#if ERTS_POLL_USE_BATCH_UPDATE_POLLSET
ErtsPollBatchBuf bb;
setup_batch_buf(ps, &bb);
#endif
while (urqbp) {
ErtsPollSetUpdateRequestsBlock *free_urqbp = urqbp;
int i;
int len = urqbp->len;
for (i = 0; i < len; i++) {
int fd = urqbp->fds[i];
ASSERT(fd < ps->fds_status_len);
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_INURQ;
#if ERTS_POLL_USE_BATCH_UPDATE_POLLSET
batch_update_pollset(ps, fd, &bb);
#else
update_pollset(ps, fd);
#endif
}
free_urqbp = urqbp;
urqbp = urqbp->next;
free_update_requests_block(ps, free_urqbp);
}
#if ERTS_POLL_USE_BATCH_UPDATE_POLLSET
if (bb.len)
write_batch_buf(ps, &bb);
#endif
ps->curr_upd_req_block = &ps->update_requests;
#if ERTS_POLL_USE_DEVPOLL && defined(HARD_DEBUG)
check_poll_status(ps);
#endif
ERTS_POLLSET_UNSET_HAVE_UPDATE_REQUESTS(ps);
}
static ERTS_INLINE ErtsPollEvents
poll_control(ErtsPollSet ps, int fd, ErtsPollEvents events, int on, int *do_wake)
{
ErtsPollEvents new_events;
if (fd < ps->internal_fd_limit || fd >= max_fds) {
if (fd < 0) {
new_events = ERTS_POLL_EV_ERR;
goto done;
}
#if ERTS_POLL_USE_KERNEL_POLL
if (fd == ps->kp_fd) {
new_events = ERTS_POLL_EV_NVAL;
goto done;
}
#endif
if (fd == ps->wake_fds[0] || fd == ps->wake_fds[1]) {
new_events = ERTS_POLL_EV_NVAL;
goto done;
}
#if ERTS_POLL_USE_TIMERFD
if (fd == ps->timer_fd) {
new_events = ERTS_POLL_EV_NVAL;
goto done;
}
#endif
}
if (fd >= ps->fds_status_len)
grow_fds_status(ps, fd);
ASSERT(fd < ps->fds_status_len);
new_events = ps->fds_status[fd].events;
if (events == 0) {
*do_wake = 0;
goto done;
}
if (on)
new_events |= events;
else
new_events &= ~events;
if (new_events == (ErtsPollEvents) 0) {
ps->fds_status[fd].flags |= ERTS_POLL_FD_FLG_RST;
#if ERTS_POLL_USE_FALLBACK
ps->fds_status[fd].flags &= ~ERTS_POLL_FD_FLG_USEFLBCK;
#endif
}
ps->fds_status[fd].events = new_events;
if (new_events == ps->fds_status[fd].used_events
&& !(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_RST)
) {
*do_wake = 0;
goto done;
}
#if ERTS_POLL_USE_CONCURRENT_UPDATE
if (ERTS_POLLSET_IS_POLLED(ps)) {
int update_fallback = 0;
conc_update_pollset(ps, fd, &update_fallback);
if (!update_fallback) {
*do_wake = 0; /* no need to wake kernel poller */
goto done;
}
}
#endif
enqueue_update_request(ps, fd);
/*
* If new events have been added, we need to wake up the
* polling thread, but if events have been removed we don't.
*/
if ((new_events && (ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_RST))
|| (~ps->fds_status[fd].used_events & new_events))
*do_wake = 1;
done:
#ifdef ERTS_POLL_DEBUG_PRINT
erts_printf("0x%x = poll_control(ps, %d, 0x%x, %s) do_wake=%d\n",
(int) new_events, fd, (int) events, (on ? "on" : "off"), *do_wake);
#endif
return new_events;
}
void
ERTS_POLL_EXPORT(erts_poll_controlv)(ErtsPollSet ps,
ErtsPollControlEntry pcev[],
int len)
{
int i;
int do_wake;
int final_do_wake = 0;
ERTS_POLLSET_LOCK(ps);
for (i = 0; i < len; i++) {
do_wake = 0;
pcev[i].events = poll_control(ps,
pcev[i].fd,
pcev[i].events,
pcev[i].on,
&do_wake);
final_do_wake |= do_wake;
}
ERTS_POLLSET_UNLOCK(ps);
if (final_do_wake)
wake_poller(ps, 0, 0);
}
ErtsPollEvents
ERTS_POLL_EXPORT(erts_poll_control)(ErtsPollSet ps,
ErtsSysFdType fd,
ErtsPollEvents events,
int on,
int* do_wake) /* In: Wake up polling thread */
/* Out: Poller is woken */
{
ErtsPollEvents res;
ERTS_POLLSET_LOCK(ps);
res = poll_control(ps, fd, events, on, do_wake);
ERTS_POLLSET_UNLOCK(ps);
if (*do_wake) {
wake_poller(ps, 0, 0);
}
return res;
}
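/*
* Illustrative usage sketch (not compiled in); the check-I/O layer uses
* erts_poll_control() (via the ERTS_POLL_EXPORT name) roughly like this
* to start and later stop waiting for input on a driver fd. The names
* 'ps' and 'drv_fd' are hypothetical:
*
*   int wake = 1;
*   ErtsPollEvents state;
*   state = erts_poll_control(ps, drv_fd, ERTS_POLL_EV_IN, 1, &wake);
*   ...
*   state = erts_poll_control(ps, drv_fd, ERTS_POLL_EV_IN, 0, &wake);
*
* On input 'wake' says whether a sleeping poller may be woken; on return
* it says whether a wakeup was actually issued. The return value is the
* new event set for the fd, or ERTS_POLL_EV_ERR/ERTS_POLL_EV_NVAL for
* invalid or reserved fds.
*/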
/*
* --- Wait on poll set ------------------------------------------------------
*/
#if ERTS_POLL_USE_KERNEL_POLL
static ERTS_INLINE int
save_kp_result(ErtsPollSet ps, ErtsPollResFd pr[], int max_res, int chk_fds_res)
{
int res = 0;
int i;
int n = chk_fds_res < max_res ? chk_fds_res : max_res;
int wake_fd = ps->wake_fds[0];
#if ERTS_POLL_USE_TIMERFD
int timer_fd = ps->timer_fd;
#endif
for (i = 0; i < n; i++) {
#if ERTS_POLL_USE_EPOLL /* --- epoll ------------------------------- */
if (ps->res_events[i].events) {
int fd = ps->res_events[i].data.fd;
int ix;
ErtsPollEvents revents;
if (fd == wake_fd) {
cleanup_wakeup_pipe(ps);
continue;
}
#if ERTS_POLL_USE_TIMERFD
if (fd == timer_fd) {
continue;
}
#endif
ASSERT(!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK));
/* epoll_wait() can repeat the same fd in the result array... */
ix = (int) ps->fds_status[fd].res_ev_ix;
ASSERT(ix >= 0);
if (ix >= res || pr[ix].fd != fd) {
ix = res;
pr[ix].fd = fd;
pr[ix].events = (ErtsPollEvents) 0;
}
revents = ERTS_POLL_EV_N2E(ps->res_events[i].events);
pr[ix].events |= revents;
if (revents) {
if (res == ix) {
ps->fds_status[fd].res_ev_ix = (unsigned short) ix;
res++;
}
}
}
#elif ERTS_POLL_USE_KQUEUE /* --- kqueue ------------------------------ */
struct kevent *ev;
int fd;
int ix;
ev = &ps->res_events[i];
fd = (int) ev->ident;
ASSERT(fd < ps->fds_status_len);
ASSERT(!(ps->fds_status[fd].flags & ERTS_POLL_FD_FLG_INFLBCK));
ix = (int) ps->fds_status[fd].res_ev_ix;
ASSERT(ix >= 0);
if (ix >= res || pr[ix].fd != fd) {
ix = res;
pr[ix].fd = (int) ev->ident;
pr[ix].events = (ErtsPollEvents) 0;
}
if (ev->filter == EVFILT_READ) {
if (fd == wake_fd) {
cleanup_wakeup_pipe(ps);
continue;
}
pr[ix].events |= ERTS_POLL_EV_IN;
}
else if (ev->filter == EVFILT_WRITE)
pr[ix].events |= ERTS_POLL_EV_OUT;
if (ev->flags & (EV_ERROR|EV_EOF)) {
if ((ev->flags & EV_ERROR) && (((int) ev->data) == EBADF))
pr[ix].events |= ERTS_POLL_EV_NVAL;
else
pr[ix].events |= ERTS_POLL_EV_ERR;
}
if (pr[ix].events) {
if (res == ix) {
ps->fds_status[fd].res_ev_ix = (unsigned short) ix;
res++;
}
}
#elif ERTS_POLL_USE_DEVPOLL /* --- devpoll ----------------------------- */
if (ps->res_events[i].revents) {
int fd = ps->res_events[i].fd;
ErtsPollEvents revents;
if (fd == wake_fd) {
cleanup_wakeup_pipe(ps);
continue;
}
#if ERTS_POLL_USE_TIMERFD
if (fd == timer_fd) {
continue;
}
#endif
revents = ERTS_POLL_EV_N2E(ps->res_events[i].events);
pr[res].fd = fd;
pr[res].events = revents;
res++;
}
#endif
}
return res;
}
#endif /* ERTS_POLL_USE_KERNEL_POLL */
#if ERTS_POLL_USE_FALLBACK
static int
get_kp_results(ErtsPollSet ps, ErtsPollResFd pr[], int max_res)
{
int res;
#if ERTS_POLL_USE_KQUEUE
struct timespec ts = {0, 0};
#endif
if (max_res > ps->res_events_len)
grow_res_events(ps, max_res);
do {
#if ERTS_POLL_USE_EPOLL
res = epoll_wait(ps->kp_fd, ps->res_events, max_res, 0);
#elif ERTS_POLL_USE_KQUEUE
res = kevent(ps->kp_fd, NULL, 0, ps->res_events, max_res, &ts);
#endif
} while (res < 0 && errno == EINTR);
if (res < 0) {
fatal_error("%s:%d: %s() failed: %s (%d)\n",
__FILE__, __LINE__,
#if ERTS_POLL_USE_EPOLL
"epoll_wait",
#elif ERTS_POLL_USE_KQUEUE
"kevent",
#endif
erl_errno_id(errno), errno);
}
return save_kp_result(ps, pr, max_res, res);
}
#endif /* ERTS_POLL_USE_FALLBACK */
static ERTS_INLINE int
save_poll_result(ErtsPollSet ps, ErtsPollResFd pr[], int max_res,
int chk_fds_res, int ebadf)
{
#if ERTS_POLL_USE_DEVPOLL
return save_kp_result(ps, pr, max_res, chk_fds_res);
#elif ERTS_POLL_USE_FALLBACK
if (!ps->fallback_used)
return save_kp_result(ps, pr, max_res, chk_fds_res);
else
#endif /* ERTS_POLL_USE_FALLBACK */
{
#if ERTS_POLL_USE_POLL /* --- poll -------------------------------- */
int res = 0;
#if !ERTS_POLL_USE_FALLBACK
int wake_fd = ps->wake_fds[0];
#endif
int i, first_ix, end_ix;
/*
* In order to be somewhat fair, we continue on the poll_fds
* index where we stopped last time.
*/
first_ix = i = ((ps->next_poll_fds_ix < ps->no_poll_fds)
? ps->next_poll_fds_ix
: 0);
end_ix = ps->no_poll_fds;
while (1) {
while (i < end_ix && res < max_res) {
if (ps->poll_fds[i].revents != (short) 0) {
int fd = ps->poll_fds[i].fd;
ErtsPollEvents revents;
#if ERTS_POLL_USE_FALLBACK
if (fd == ps->kp_fd) {
res += get_kp_results(ps, &pr[res], max_res-res);
i++;
continue;
}
#else
if (fd == wake_fd) {
cleanup_wakeup_pipe(ps);
i++;
continue;
}
#endif
revents = pollev2ev(ps->poll_fds[i].revents);
pr[res].fd = fd;
pr[res].events = revents;
res++;
}
i++;
}
if (res == max_res || i == first_ix)
break;
ASSERT(i == ps->no_poll_fds);
i = 0;
end_ix = first_ix;
}
ps->next_poll_fds_ix = i;
return res;
#elif ERTS_POLL_USE_SELECT /* --- select ------------------------------ */
int res = 0;
#if !ERTS_POLL_USE_FALLBACK
int wake_fd = ps->wake_fds[0];
#endif
int fd, first_fd, end_fd;
/*
* In order to be fair, we continue on the fd where we stopped
* last time.
*/
first_fd = fd = ps->next_sel_fd <= ps->max_fd ? ps->next_sel_fd : 0;
end_fd = ps->max_fd + 1;
if (!ebadf) {
while (1) {
while (fd < end_fd && res < max_res) {
pr[res].events = (ErtsPollEvents) 0;
if (ERTS_FD_ISSET(fd, &ps->res_input_fds)) {
#if ERTS_POLL_USE_FALLBACK
if (fd == ps->kp_fd) {
res += get_kp_results(ps, &pr[res], max_res-res);
fd++;
continue;
}
#else
if (fd == wake_fd) {
cleanup_wakeup_pipe(ps);
fd++;
continue;
}
#endif
pr[res].events |= ERTS_POLL_EV_IN;
}
if (ERTS_FD_ISSET(fd, &ps->res_output_fds))
pr[res].events |= ERTS_POLL_EV_OUT;
if (pr[res].events) {
pr[res].fd = fd;
res++;
}
fd++;
}
if (res == max_res || fd == first_fd)
break;
ASSERT(fd == ps->max_fd + 1);
fd = 0;
end_fd = first_fd;
}
}
else {
/*
* Bad file descriptors in poll set.
*
* This only happens when running poorly written
* drivers. This code could be optimized, but we
* don't bother since it should never happen...
*/
while (1) {
while (fd < end_fd && res < max_res) {
if (ps->fds_status[fd].events) {
int sres;
ERTS_fd_set *iset = NULL;
ERTS_fd_set *oset = NULL;
if (ps->fds_status[fd].events & ERTS_POLL_EV_IN) {
iset = &ps->res_input_fds;
ERTS_FD_ZERO(iset);
ERTS_FD_SET(fd, iset);
}
if (ps->fds_status[fd].events & ERTS_POLL_EV_OUT) {
oset = &ps->res_output_fds;
ERTS_FD_ZERO(oset);
ERTS_FD_SET(fd, oset);
}
do {
/* Initialize 'tv' each time;
select() may modify it */
SysTimeval tv = {0, 0};
sres = ERTS_SELECT(ps->max_fd+1, iset, oset, NULL, &tv);
} while (sres < 0 && errno == EINTR);
if (sres < 0) {
#if ERTS_POLL_USE_FALLBACK
if (fd == ps->kp_fd) {
res += get_kp_results(ps,
&pr[res],
max_res-res);
fd++;
continue;
}
#else
if (fd == wake_fd) {
cleanup_wakeup_pipe(ps);
fd++;
continue;
}
#endif
pr[res].fd = fd;
pr[res].events = ERTS_POLL_EV_NVAL;
res++;
}
else if (sres > 0) {
pr[res].fd = fd;
if (iset && ERTS_FD_ISSET(fd, iset)) {
#if ERTS_POLL_USE_FALLBACK
if (fd == ps->kp_fd) {
res += get_kp_results(ps,
&pr[res],
max_res-res);
fd++;
continue;
}
#else
if (fd == wake_fd) {
cleanup_wakeup_pipe(ps);
fd++;
continue;
}
#endif
pr[res].events |= ERTS_POLL_EV_IN;
}
if (oset && ERTS_FD_ISSET(fd, oset)) {
pr[res].events |= ERTS_POLL_EV_OUT;
}
ASSERT(pr[res].events);
res++;
}
}
fd++;
}
if (res == max_res || fd == first_fd)
break;
ASSERT(fd == ps->max_fd + 1);
fd = 0;
end_fd = first_fd;
}
}
ps->next_sel_fd = fd;
return res;
#endif
}
}
static ERTS_INLINE ErtsMonotonicTime
get_timeout(ErtsPollSet ps,
int resolution,
ErtsMonotonicTime timeout_time)
{
ErtsMonotonicTime timeout, save_timeout_time;
if (timeout_time == ERTS_POLL_NO_TIMEOUT) {
save_timeout_time = ERTS_MONOTONIC_TIME_MIN;
timeout = 0;
}
else {
ErtsMonotonicTime diff_time, current_time;
current_time = erts_get_monotonic_time(NULL);
diff_time = timeout_time - current_time;
if (diff_time <= 0) {
save_timeout_time = ERTS_MONOTONIC_TIME_MIN;
timeout = 0;
}
else {
save_timeout_time = current_time;
switch (resolution) {
case 1000:
/* Round up to the nearest whole millisecond */
timeout = ERTS_MONOTONIC_TO_MSEC(diff_time - 1) + 1;
if (timeout > (ErtsMonotonicTime) INT_MAX)
timeout = (ErtsMonotonicTime) INT_MAX;
save_timeout_time += ERTS_MSEC_TO_MONOTONIC(timeout);
timeout -= ERTS_PREMATURE_TIMEOUT(timeout, 1000);
break;
case 1000000:
/* Round up to the nearest whole microsecond */
timeout = ERTS_MONOTONIC_TO_USEC(diff_time - 1) + 1;
save_timeout_time += ERTS_USEC_TO_MONOTONIC(timeout);
timeout -= ERTS_PREMATURE_TIMEOUT(timeout, 1000*1000);
break;
case 1000000000:
/* Round up to the nearest whole nanosecond */
timeout = ERTS_MONOTONIC_TO_NSEC(diff_time - 1) + 1;
save_timeout_time += ERTS_NSEC_TO_MONOTONIC(timeout);
timeout -= ERTS_PREMATURE_TIMEOUT(timeout, 1000*1000*1000);
break;
default:
ERTS_INTERNAL_ERROR("Invalid resolution");
timeout = 0;
save_timeout_time = 0;
break;
}
}
}
set_timeout_time(ps, save_timeout_time);
return timeout;
}
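/*
* Example of the rounding above at millisecond resolution: if the
* remaining time corresponds to 2.3 ms worth of monotonic ticks, then
* ERTS_MONOTONIC_TO_MSEC(diff_time - 1) + 1 rounds it up to 3 ms, so we
* never wake up before the requested timeout. The saved 'timeout_time'
* becomes the current time plus the rounded-up timeout, which
* erts_poll_interrupt_timed() later compares against when deciding
* whether a wakeup is needed.
*/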
#if ERTS_POLL_USE_SELECT
static ERTS_INLINE int
get_timeout_timeval(ErtsPollSet ps,
SysTimeval *tvp,
ErtsMonotonicTime timeout_time)
{
ErtsMonotonicTime timeout = get_timeout(ps,
1000*1000,
timeout_time);
if (!timeout) {
tvp->tv_sec = 0;
tvp->tv_usec = 0;
return 0;
}
else {
ErtsMonotonicTime sec = timeout/(1000*1000);
tvp->tv_sec = sec;
tvp->tv_usec = timeout - sec*(1000*1000);
ASSERT(tvp->tv_sec >= 0);
ASSERT(tvp->tv_usec >= 0);
ASSERT(tvp->tv_usec < 1000*1000);
return !0;
}
}
#endif
#if ERTS_POLL_USE_KQUEUE || (ERTS_POLL_USE_POLL && defined(HAVE_PPOLL)) || ERTS_POLL_USE_TIMERFD
static ERTS_INLINE int
get_timeout_timespec(ErtsPollSet ps,
struct timespec *tsp,
ErtsMonotonicTime timeout_time)
{
ErtsMonotonicTime timeout = get_timeout(ps,
1000*1000*1000,
timeout_time);
if (!timeout) {
tsp->tv_sec = 0;
tsp->tv_nsec = 0;
return 0;
}
else {
ErtsMonotonicTime sec = timeout/(1000*1000*1000);
tsp->tv_sec = sec;
tsp->tv_nsec = timeout - sec*(1000*1000*1000);
ASSERT(tsp->tv_sec >= 0);
ASSERT(tsp->tv_nsec >= 0);
ASSERT(tsp->tv_nsec < 1000*1000*1000);
return !0;
}
}
#endif
#if ERTS_POLL_USE_TIMERFD
static ERTS_INLINE int
get_timeout_itimerspec(ErtsPollSet ps,
struct itimerspec *itsp,
ErtsMonotonicTime timeout_time)
{
itsp->it_interval.tv_sec = 0;
itsp->it_interval.tv_nsec = 0;
return get_timeout_timespec(ps, &itsp->it_value, timeout_time);
}
#endif
static ERTS_INLINE int
check_fd_events(ErtsPollSet ps, ErtsMonotonicTime timeout_time, int max_res)
{
int res;
ERTS_MSACC_PUSH_STATE_M();
if (erts_atomic_read_nob(&ps->no_of_user_fds) == 0
&& timeout_time == ERTS_POLL_NO_TIMEOUT) {
/* Nothing to poll and zero timeout; done... */
return 0;
}
else {
int timeout;
#if ERTS_POLL_USE_FALLBACK
if (!(ps->fallback_used = ERTS_POLL_NEED_FALLBACK(ps))) {
#if ERTS_POLL_USE_EPOLL /* --- epoll ------------------------------- */
if (max_res > ps->res_events_len)
grow_res_events(ps, max_res);
#if ERTS_POLL_USE_TIMERFD
{
struct itimerspec its;
timeout = get_timeout_itimerspec(ps, &its, timeout_time);
if (timeout) {
erts_thr_progress_prepare_wait(NULL);
ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP);
timerfd_set(ps, &its);
res = epoll_wait(ps->kp_fd, ps->res_events, max_res, -1);
res = timerfd_clear(ps, res, max_res);
} else {
res = epoll_wait(ps->kp_fd, ps->res_events, max_res, 0);
}
}
#else /* !ERTS_POLL_USE_TIMERFD */
timeout = (int) get_timeout(ps, 1000, timeout_time);
if (timeout) {
erts_thr_progress_prepare_wait(NULL);
ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP);
}
res = epoll_wait(ps->kp_fd, ps->res_events, max_res, timeout);
#endif /* !ERTS_POLL_USE_TIMERFD */
#elif ERTS_POLL_USE_KQUEUE /* --- kqueue ------------------------------ */
struct timespec ts;
if (max_res > ps->res_events_len)
grow_res_events(ps, max_res);
timeout = get_timeout_timespec(ps, &ts, timeout_time);
if (timeout) {
erts_thr_progress_prepare_wait(NULL);
ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP);
}
res = kevent(ps->kp_fd, NULL, 0, ps->res_events, max_res, &ts);
#endif /* ----------------------------------------- */
}
else /* use fallback (i.e. poll() or select()) */
#endif /* ERTS_POLL_USE_FALLBACK */
{
#if ERTS_POLL_USE_DEVPOLL /* --- devpoll ----------------------------- */
/*
* The ioctl() will fail with EINVAL on Solaris 10 if dp_nfds
* is set too high. dp_nfds should not be set greater than
* the maximum number of file descriptors in the poll set.
*/
struct dvpoll poll_res;
int nfds = (int) erts_atomic_read_nob(&ps->no_of_user_fds);
nfds++; /* Wakeup pipe */
timeout = (int) get_timeout(ps, 1000, timeout_time);
poll_res.dp_nfds = nfds < max_res ? nfds : max_res;
if (poll_res.dp_nfds > ps->res_events_len)
grow_res_events(ps, poll_res.dp_nfds);
poll_res.dp_fds = ps->res_events;
if (timeout) {
erts_thr_progress_prepare_wait(NULL);
ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP);
}
poll_res.dp_timeout = timeout;
res = ioctl(ps->kp_fd, DP_POLL, &poll_res);
#elif ERTS_POLL_USE_POLL && defined(HAVE_PPOLL) /* --- ppoll ---------------- */
struct timespec ts;
timeout = get_timeout_timespec(ps, &ts, timeout_time);
if (timeout) {
erts_thr_progress_prepare_wait(NULL);
ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP);
}
res = ppoll(ps->poll_fds, ps->no_poll_fds, &ts, NULL);
#elif ERTS_POLL_USE_POLL /* --- poll --------------------------------- */
timeout = (int) get_timeout(ps, 1000, timeout_time);
if (timeout) {
erts_thr_progress_prepare_wait(NULL);
ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP);
}
res = poll(ps->poll_fds, ps->no_poll_fds, timeout);
#elif ERTS_POLL_USE_SELECT /* --- select ------------------------------ */
SysTimeval to;
timeout = get_timeout_timeval(ps, &to, timeout_time);
ERTS_FD_COPY(&ps->input_fds, &ps->res_input_fds);
ERTS_FD_COPY(&ps->output_fds, &ps->res_output_fds);
if (timeout) {
erts_thr_progress_prepare_wait(NULL);
ERTS_MSACC_SET_STATE_CACHED_M(ERTS_MSACC_STATE_SLEEP);
}
res = ERTS_SELECT(ps->max_fd + 1,
&ps->res_input_fds,
&ps->res_output_fds,
NULL,
&to);
if (timeout) {
erts_thr_progress_finalize_wait(NULL);
ERTS_MSACC_POP_STATE_M();
}
if (res < 0
&& errno == EBADF
&& ERTS_POLLSET_HAVE_UPDATE_REQUESTS(ps)) {
/*
* This may have happened because another thread deselected
* a fd in our poll set and then closed it, i.e. the driver
* behaved correctly. We want to avoid looking for a bad
* fd that may not even exist anymore. Therefore, handle
* update requests and try again.
*
* We don't know how much of the timeout is left; therefore,
* we use a zero timeout. If no error occurs and no events
* have triggered, we fake an EAGAIN error and let the caller
* restart us.
*/
to.tv_sec = 0;
to.tv_usec = 0;
ERTS_POLLSET_LOCK(ps);
handle_update_requests(ps);
ERTS_POLLSET_UNLOCK(ps);
res = ERTS_SELECT(ps->max_fd + 1,
&ps->res_input_fds,
&ps->res_output_fds,
NULL,
&to);
if (res == 0) {
errno = EAGAIN;
res = -1;
}
}
return res;
#endif /* ----------------------------------------- */
}
if (timeout) {
erts_thr_progress_finalize_wait(NULL);
ERTS_MSACC_POP_STATE_M();
}
return res;
}
}
int
ERTS_POLL_EXPORT(erts_poll_wait)(ErtsPollSet ps,
ErtsPollResFd pr[],
int *len,
ErtsMonotonicTime timeout_time)
{
ErtsMonotonicTime to;
int res, no_fds;
int ebadf = 0;
int ps_locked = 0;
no_fds = *len;
#ifdef ERTS_POLL_MAX_RES
if (no_fds >= ERTS_POLL_MAX_RES)
no_fds = ERTS_POLL_MAX_RES;
#endif
*len = 0;
#ifdef ERTS_POLL_DEBUG_PRINT
erts_printf("Entering erts_poll_wait(), timeout_time=%bps\n",
timeout_time);
#endif
if (ERTS_POLLSET_SET_POLLED_CHK(ps)) {
res = EINVAL; /* Another thread is in erts_poll_wait()
on this pollset... */
goto done;
}
to = (is_woken(ps)
? ERTS_POLL_NO_TIMEOUT /* Use zero timeout */
: timeout_time);
if (ERTS_POLLSET_HAVE_UPDATE_REQUESTS(ps)) {
ERTS_POLLSET_LOCK(ps);
handle_update_requests(ps);
ERTS_POLLSET_UNLOCK(ps);
}
while (1) {
res = check_fd_events(ps, to, no_fds);
if (res != 0)
break;
if (to == ERTS_POLL_NO_TIMEOUT)
break;
if (erts_get_monotonic_time(NULL) >= timeout_time)
break;
}
woke_up(ps);
if (res == 0) {
res = ETIMEDOUT;
}
else if (res < 0) {
#if ERTS_POLL_USE_SELECT
if (errno == EBADF) {
ebadf = 1;
goto save_results;
}
#endif
res = errno;
}
else {
#if ERTS_POLL_USE_SELECT
save_results:
#endif
ps_locked = 1;
ERTS_POLLSET_LOCK(ps);
no_fds = save_poll_result(ps, pr, no_fds, res, ebadf);
#ifdef HARD_DEBUG
check_poll_result(pr, no_fds);
#endif
res = (no_fds == 0 ? (is_interrupted_reset(ps) ? EINTR : EAGAIN) : 0);
*len = no_fds;
}
if (ps_locked)
ERTS_POLLSET_UNLOCK(ps);
ERTS_POLLSET_UNSET_POLLED(ps);
done:
set_timeout_time(ps, ERTS_MONOTONIC_TIME_MAX);
#ifdef ERTS_POLL_DEBUG_PRINT
erts_printf("Leaving %s = erts_poll_wait()\n",
res == 0 ? "0" : erl_errno_id(res));
#endif
return res;
}
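/*
* Illustrative wait loop (not compiled in); a poller thread in the
* check-I/O layer drives this function roughly as follows. The names
* 'pr' and 'handle_ready_fd' are hypothetical:
*
*   ErtsPollResFd pr[256];
*   int len = sizeof(pr)/sizeof(pr[0]);
*   int res = erts_poll_wait(ps, pr, &len, timeout_time);
*   if (res == 0) {
*       int i;
*       for (i = 0; i < len; i++)
*           handle_ready_fd(pr[i].fd, pr[i].events);
*   }
*
* A non-zero return value is an errno-style reason (e.g. ETIMEDOUT,
* EINTR or EAGAIN) rather than a hard error; results in 'pr' are valid
* only when 0 is returned, and then only the first 'len' entries.
*/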
/*
* --- Interrupt a thread doing erts_poll_wait() -----------------------------
*/
void
ERTS_POLL_EXPORT(erts_poll_interrupt)(ErtsPollSet ps, int set)
{
if (!set)
reset_wakeup_state(ps);
else
wake_poller(ps, 1, 0);
}
/*
* erts_poll_interrupt_timed():
* If 'set' != 0, interrupt a thread blocked in erts_poll_wait() if it
* is not guaranteed to time out before 'timeout_time'.
*/
void
ERTS_POLL_EXPORT(erts_poll_interrupt_timed)(ErtsPollSet ps,
int set,
ErtsMonotonicTime timeout_time)
{
if (!set)
reset_wakeup_state(ps);
else {
ErtsMonotonicTime max_wait_time = get_timeout_time(ps);
if (max_wait_time > timeout_time)
wake_poller(ps, 1, 0);
#ifdef ERTS_POLL_COUNT_AVOIDED_WAKEUPS
else {
if (ERTS_POLLSET_IS_POLLED(ps))
erts_atomic_inc_nob(&ps->no_avoided_wakeups);
erts_atomic_inc_nob(&ps->no_avoided_interrupts);
}
erts_atomic_inc_nob(&ps->no_interrupt_timed);
#endif
}
}
int
ERTS_POLL_EXPORT(erts_poll_max_fds)(void)
{
return max_fds;
}
/*
* --- Initialization --------------------------------------------------------
*/
void
ERTS_POLL_EXPORT(erts_poll_init)(void)
{
erts_mtx_init(&pollsets_lock, "pollsets_lock", NIL,
ERTS_LOCK_FLAGS_PROPERTY_STATIC | ERTS_LOCK_FLAGS_CATEGORY_IO);
pollsets = NULL;
errno = 0;
#if !defined(NO_SYSCONF)
max_fds = sysconf(_SC_OPEN_MAX);
#elif ERTS_POLL_USE_SELECT
max_fds = NOFILE;
#else
max_fds = OPEN_MAX;
#endif
#if ERTS_POLL_USE_SELECT && defined(FD_SETSIZE) && \
!defined(_DARWIN_UNLIMITED_SELECT)
if (max_fds > FD_SETSIZE)
max_fds = FD_SETSIZE;
#endif
if (max_fds < 0)
fatal_error("erts_poll_init(): Failed to get max number of files: %s\n",
erl_errno_id(errno));
#ifdef ERTS_POLL_DEBUG_PRINT
print_misc_debug_info();
#endif
}
ErtsPollSet
ERTS_POLL_EXPORT(erts_poll_create_pollset)(void)
{
#if ERTS_POLL_USE_KERNEL_POLL
int kp_fd;
#endif
ErtsPollSet ps = erts_alloc(ERTS_ALC_T_POLLSET,
sizeof(struct ERTS_POLL_EXPORT(erts_pollset)));
ps->internal_fd_limit = 0;
ps->fds_status = NULL;
ps->fds_status_len = 0;
erts_atomic_init_nob(&ps->no_of_user_fds, 0);
#if ERTS_POLL_USE_KERNEL_POLL
ps->kp_fd = -1;
#if ERTS_POLL_USE_EPOLL
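    /* The size argument to epoll_create() is only a hint (ignored by
       modern kernels); it merely has to be positive. */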
kp_fd = epoll_create(256);
ps->res_events_len = 0;
ps->res_events = NULL;
#elif ERTS_POLL_USE_DEVPOLL
kp_fd = open("/dev/poll", O_RDWR);
ps->res_events_len = 0;
ps->res_events = NULL;
#elif ERTS_POLL_USE_KQUEUE
kp_fd = kqueue();
ps->res_events_len = 0;
ps->res_events = NULL;
#endif
if (kp_fd < 0)
fatal_error("erts_poll_create_pollset(): Failed to "
#if ERTS_POLL_USE_EPOLL
"create epoll set"
#elif ERTS_POLL_USE_DEVPOLL
"to open /dev/poll"
#elif ERTS_POLL_USE_KQUEUE
"create kqueue"
#endif
": %s (%d)\n",
erl_errno_id(errno), errno);
#endif /* ERTS_POLL_USE_KERNEL_POLL */
#if ERTS_POLL_USE_BATCH_UPDATE_POLLSET
    /* res_events is also used as a write buffer */
grow_res_events(ps, ERTS_POLL_MIN_BATCH_BUF_SIZE);
#endif
#if ERTS_POLL_USE_POLL
ps->next_poll_fds_ix = 0;
ps->no_poll_fds = 0;
ps->poll_fds_len = 0;
ps->poll_fds = NULL;
#elif ERTS_POLL_USE_SELECT
ps->next_sel_fd = 0;
ps->max_fd = -1;
#if ERTS_POLL_USE_FALLBACK
ps->no_select_fds = 0;
#endif
#ifdef _DARWIN_UNLIMITED_SELECT
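    /* With unlimited select the fd sets start out unallocated and are
       sized dynamically as fds are added. */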
ps->input_fds.sz = 0;
ps->input_fds.ptr = NULL;
ps->res_input_fds.sz = 0;
ps->res_input_fds.ptr = NULL;
ps->output_fds.sz = 0;
ps->output_fds.ptr = NULL;
ps->res_output_fds.sz = 0;
ps->res_output_fds.ptr = NULL;
#else
ERTS_FD_ZERO(&ps->input_fds);
ERTS_FD_ZERO(&ps->res_input_fds);
ERTS_FD_ZERO(&ps->output_fds);
ERTS_FD_ZERO(&ps->res_output_fds);
#endif
#endif
ps->update_requests.next = NULL;
ps->update_requests.len = 0;
ps->curr_upd_req_block = &ps->update_requests;
erts_atomic32_init_nob(&ps->have_update_requests, 0);
erts_atomic32_init_nob(&ps->polled, 0);
erts_mtx_init(&ps->mtx, "pollset", NIL, ERTS_LOCK_FLAGS_CATEGORY_IO);
erts_atomic32_init_nob(&ps->wakeup_state, (erts_aint32_t) 0);
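    /* Create the wakeup pipe used to interrupt a thread blocked in
       erts_poll_wait(). */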
create_wakeup_pipe(ps);
#if ERTS_POLL_USE_TIMERFD
create_timerfd(ps);
#endif
#if ERTS_POLL_USE_FALLBACK
if (kp_fd >= ps->fds_status_len)
grow_fds_status(ps, kp_fd);
/* Force kernel poll fd into fallback (poll/select) set */
ps->fds_status[kp_fd].flags
|= ERTS_POLL_FD_FLG_INFLBCK|ERTS_POLL_FD_FLG_USEFLBCK;
{
int do_wake = 0;
ERTS_POLL_EXPORT(erts_poll_control)(ps, kp_fd, ERTS_POLL_EV_IN, 1,
&do_wake);
}
#endif
#if ERTS_POLL_USE_KERNEL_POLL
if (ps->internal_fd_limit <= kp_fd)
ps->internal_fd_limit = kp_fd + 1;
ps->kp_fd = kp_fd;
#endif
init_timeout_time(ps);
#ifdef ERTS_POLL_COUNT_AVOIDED_WAKEUPS
erts_atomic_init_nob(&ps->no_avoided_wakeups, 0);
erts_atomic_init_nob(&ps->no_avoided_interrupts, 0);
erts_atomic_init_nob(&ps->no_interrupt_timed, 0);
#endif
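    /* Process any update requests queued during the setup above. */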
handle_update_requests(ps);
#if ERTS_POLL_USE_FALLBACK
ps->fallback_used = 0;
#endif
erts_atomic_set_nob(&ps->no_of_user_fds, 0); /* Don't count wakeup pipe and fallback fd */
erts_mtx_lock(&pollsets_lock);
ps->next = pollsets;
pollsets = ps;
erts_mtx_unlock(&pollsets_lock);
return ps;
}
void
ERTS_POLL_EXPORT(erts_poll_destroy_pollset)(ErtsPollSet ps)
{
if (ps->fds_status)
erts_free(ERTS_ALC_T_FD_STATUS, (void *) ps->fds_status);
#if ERTS_POLL_USE_EPOLL
if (ps->kp_fd >= 0)
close(ps->kp_fd);
if (ps->res_events)
erts_free(ERTS_ALC_T_POLL_RES_EVS, (void *) ps->res_events);
#elif ERTS_POLL_USE_DEVPOLL
if (ps->kp_fd >= 0)
close(ps->kp_fd);
if (ps->res_events)
erts_free(ERTS_ALC_T_POLL_RES_EVS, (void *) ps->res_events);
#elif ERTS_POLL_USE_POLL
if (ps->poll_fds)
erts_free(ERTS_ALC_T_POLL_FDS, (void *) ps->poll_fds);
#elif ERTS_POLL_USE_SELECT
#ifdef _DARWIN_UNLIMITED_SELECT
if (ps->input_fds.ptr)
erts_free(ERTS_ALC_T_SELECT_FDS, (void *) ps->input_fds.ptr);
if (ps->res_input_fds.ptr)
erts_free(ERTS_ALC_T_SELECT_FDS, (void *) ps->res_input_fds.ptr);
if (ps->output_fds.ptr)
erts_free(ERTS_ALC_T_SELECT_FDS, (void *) ps->output_fds.ptr);
if (ps->res_output_fds.ptr)
erts_free(ERTS_ALC_T_SELECT_FDS, (void *) ps->res_output_fds.ptr);
#endif
#endif
{
ErtsPollSetUpdateRequestsBlock *urqbp = ps->update_requests.next;
while (urqbp) {
ErtsPollSetUpdateRequestsBlock *free_urqbp = urqbp;
urqbp = urqbp->next;
free_update_requests_block(ps, free_urqbp);
}
}
erts_mtx_destroy(&ps->mtx);
if (ps->wake_fds[0] >= 0)
close(ps->wake_fds[0]);
if (ps->wake_fds[1] >= 0)
close(ps->wake_fds[1]);
#if ERTS_POLL_USE_TIMERFD
if (ps->timer_fd >= 0)
close(ps->timer_fd);
#endif
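    /* Unlink the pollset from the global 'pollsets' list. */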
erts_mtx_lock(&pollsets_lock);
if (ps == pollsets)
pollsets = pollsets->next;
else {
ErtsPollSet prev_ps;
for (prev_ps = pollsets; ps != prev_ps->next; prev_ps = prev_ps->next)
;
ASSERT(ps == prev_ps->next);
prev_ps->next = ps->next;
}
erts_mtx_unlock(&pollsets_lock);
erts_free(ERTS_ALC_T_POLLSET, (void *) ps);
}
/*
* --- Info ------------------------------------------------------------------
*/
void
ERTS_POLL_EXPORT(erts_poll_info)(ErtsPollSet ps, ErtsPollInfo *pip)
{
int pending_updates;
Uint size = 0;
ERTS_POLLSET_LOCK(ps);
size += sizeof(struct ERTS_POLL_EXPORT(erts_pollset));
size += ps->fds_status_len*sizeof(ErtsFdStatus);
#if ERTS_POLL_USE_EPOLL
size += ps->res_events_len*sizeof(struct epoll_event);
#elif ERTS_POLL_USE_DEVPOLL
size += ps->res_events_len*sizeof(struct pollfd);
#elif ERTS_POLL_USE_KQUEUE
size += ps->res_events_len*sizeof(struct kevent);
#endif
#if ERTS_POLL_USE_POLL
size += ps->poll_fds_len*sizeof(struct pollfd);
#elif ERTS_POLL_USE_SELECT
#ifdef _DARWIN_UNLIMITED_SELECT
size += ps->input_fds.sz + ps->res_input_fds.sz
+ ps->output_fds.sz + ps->res_output_fds.sz;
#endif
#endif
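    /* Count pending update requests and account for the extra request
       blocks in the memory size. */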
{
ErtsPollSetUpdateRequestsBlock *urqbp = ps->update_requests.next;
pending_updates = ps->update_requests.len;
while (urqbp) {
size += sizeof(ErtsPollSetUpdateRequestsBlock);
pending_updates += urqbp->len;
urqbp = urqbp->next;
}
}
pip->primary =
#if ERTS_POLL_USE_KQUEUE
"kqueue"
#elif ERTS_POLL_USE_EPOLL
"epoll"
#elif ERTS_POLL_USE_DEVPOLL
"/dev/poll"
#elif ERTS_POLL_USE_POLL
"poll"
#elif ERTS_POLL_USE_SELECT
"select"
#endif
;
pip->fallback =
#if !ERTS_POLL_USE_FALLBACK
NULL
#elif ERTS_POLL_USE_POLL
"poll"
#elif ERTS_POLL_USE_SELECT
"select"
#endif
;
pip->kernel_poll =
#if !ERTS_POLL_USE_KERNEL_POLL
NULL
#elif ERTS_POLL_USE_KQUEUE
"kqueue"
#elif ERTS_POLL_USE_EPOLL
"epoll"
#elif ERTS_POLL_USE_DEVPOLL
"/dev/poll"
#endif
;
pip->memory_size = size;
pip->poll_set_size = (int) erts_atomic_read_nob(&ps->no_of_user_fds);
pip->poll_set_size++; /* Wakeup pipe */
#if ERTS_POLL_USE_TIMERFD
pip->poll_set_size++; /* timerfd */
#endif
pip->fallback_poll_set_size =
#if !ERTS_POLL_USE_FALLBACK
0
#elif ERTS_POLL_USE_POLL
ps->no_poll_fds
#elif ERTS_POLL_USE_SELECT
ps->no_select_fds
#endif
;
#if ERTS_POLL_USE_FALLBACK
    /* If only kp_fd is in the fallback poll set, the fallback isn't actually in use... */
if (pip->fallback_poll_set_size == 1)
pip->fallback_poll_set_size = 0;
else
pip->poll_set_size++; /* kp_fd */
#endif
pip->lazy_updates =
1
;
pip->pending_updates =
pending_updates
;
pip->batch_updates =
#if ERTS_POLL_USE_BATCH_UPDATE_POLLSET
1
#else
0
#endif
;
pip->concurrent_updates =
#if ERTS_POLL_USE_CONCURRENT_UPDATE
1
#else
0
#endif
;
pip->max_fds = max_fds;
#ifdef ERTS_POLL_COUNT_AVOIDED_WAKEUPS
pip->no_avoided_wakeups = erts_atomic_read_nob(&ps->no_avoided_wakeups);
pip->no_avoided_interrupts = erts_atomic_read_nob(&ps->no_avoided_interrupts);
pip->no_interrupt_timed = erts_atomic_read_nob(&ps->no_interrupt_timed);
#endif
ERTS_POLLSET_UNLOCK(ps);
}
/*
* Fatal error...
*/
#ifndef ERTS_GOT_SIGUSR1
# define ERTS_GOT_SIGUSR1 0
#endif
static void
fatal_error(char *format, ...)
{
va_list ap;
if (ERTS_SOMEONE_IS_CRASH_DUMPING || ERTS_GOT_SIGUSR1) {
	/*
	 * Writing a crash dump and receiving SIGUSR1 (which results in a
	 * crash dump) both close all file descriptors. This typically
	 * causes a fatal error for erts_poll() (wakeup pipes and kernel
	 * poll fds are closed).
	 *
	 * We ignore the error and let the crash dump writing continue...
	 */
return;
}
va_start(ap, format);
erts_vfprintf(stderr, format, ap);
va_end(ap);
abort();
}
static void
fatal_error_async_signal_safe(char *error_str)
{
if (ERTS_SOMEONE_IS_CRASH_DUMPING || ERTS_GOT_SIGUSR1) {
/* See comment above in fatal_error() */
return;
}
if (error_str) {
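	/* Stay async-signal-safe: measure the string with a plain loop
	   and write() it directly to fd 2 (stderr). */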
int len = 0;
while (error_str[len])
len++;
if (len) {
/* async signal safe */
erts_silence_warn_unused_result(write(2, error_str, len));
}
}
abort();
}
/*
* --- Debug -----------------------------------------------------------------
*/
void
ERTS_POLL_EXPORT(erts_poll_get_selected_events)(ErtsPollSet ps,
ErtsPollEvents ev[],
int len)
{
int fd;
ERTS_POLLSET_LOCK(ps);
for (fd = 0; fd < len; fd++) {
if (fd >= ps->fds_status_len)
ev[fd] = 0;
else {
ev[fd] = ps->fds_status[fd].events;
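	    /* Flag internal fds (wakeup pipe, timer fd, kernel poll fd)
	       with ERTS_POLL_EV_NVAL. */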
if (
fd == ps->wake_fds[0] || fd == ps->wake_fds[1] ||
#if ERTS_POLL_USE_TIMERFD
fd == ps->timer_fd ||
#endif
#if ERTS_POLL_USE_KERNEL_POLL
fd == ps->kp_fd ||
#endif
0)
ev[fd] |= ERTS_POLL_EV_NVAL;
}
}
ERTS_POLLSET_UNLOCK(ps);
}
#ifdef HARD_DEBUG
static void
check_poll_result(ErtsPollResFd pr[], int len)
{
int i, j;
for (i = 0; i < len; i++) {
ASSERT(pr[i].fd >= 0);
ASSERT(pr[i].fd < max_fds);
for (j = 0; j < len; j++) {
ASSERT(i == j || pr[i].fd != pr[j].fd);
}
}
}
#if ERTS_POLL_USE_DEVPOLL
static void
check_poll_status(ErtsPollSet ps)
{
int i;
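    /* For each fd, ask /dev/poll via the DP_ISPOLLED ioctl whether it
       is being polled and verify that this matches our fds_status
       bookkeeping. */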
for (i = 0; i < ps->fds_status_len; i++) {
int ires;
struct pollfd dp_fd;
short events = ERTS_POLL_EV_E2N(ps->fds_status[i].events);
dp_fd.fd = i;
dp_fd.events = (short) 0;
dp_fd.revents = (short) 0;
ires = ioctl(ps->kp_fd, DP_ISPOLLED, &dp_fd);
if (ires == 0) {
ASSERT(!events);
}
else if (ires == 1) {
ASSERT(events);
ASSERT(events == dp_fd.revents);
}
else {
ASSERT(0);
}
ASSERT(dp_fd.fd == i);
ASSERT(ps->fds_status[i].events == ps->fds_status[i].used_events);
}
}
#endif /* ERTS_POLL_USE_DEVPOLL */
#endif /* HARD_DEBUG */
#ifdef ERTS_POLL_DEBUG_PRINT
static void
print_misc_debug_info(void)
{
erts_printf("erts_poll using: %s lazy_updates:%s batch_updates:%s\n",
#if ERTS_POLL_USE_KQUEUE
"kqueue"
#elif ERTS_POLL_USE_EPOLL
"epoll"
#elif ERTS_POLL_USE_DEVPOLL
"/dev/poll"
#endif
#if ERTS_POLL_USE_FALLBACK
"-"
#endif
#if ERTS_POLL_USE_POLL
"poll"
#elif ERTS_POLL_USE_SELECT
"select"
#endif
,
"true"
,
#if ERTS_POLL_USE_BATCH_UPDATE_POLLSET
"true"
#else
"false"
#endif
);
erts_printf("ERTS_POLL_EV_IN=0x%x\n"
"ERTS_POLL_EV_OUT=0x%x\n"
"ERTS_POLL_EV_NVAL=0x%x\n"
"ERTS_POLL_EV_ERR=0x%x\n",
ERTS_POLL_EV_IN,
ERTS_POLL_EV_OUT,
ERTS_POLL_EV_NVAL,
ERTS_POLL_EV_ERR);
#ifdef FD_SETSIZE
erts_printf("FD_SETSIZE=%d\n", FD_SETSIZE);
#endif
}
#endif
#ifdef ERTS_ENABLE_LOCK_COUNT
static void erts_lcnt_enable_pollset_lock_count(ErtsPollSet pollset, int enable) {
if(enable) {
erts_lcnt_install_new_lock_info(&pollset->mtx.lcnt, "pollset_rm", NIL,
ERTS_LOCK_TYPE_MUTEX | ERTS_LOCK_FLAGS_CATEGORY_IO);
} else {
erts_lcnt_uninstall(&pollset->mtx.lcnt);
}
}
void ERTS_POLL_EXPORT(erts_lcnt_update_pollset_locks)(int enable) {
ErtsPollSet iterator;
erts_mtx_lock(&pollsets_lock);
for(iterator = pollsets; iterator != NULL; iterator = iterator->next) {
erts_lcnt_enable_pollset_lock_count(iterator, enable);
}
erts_mtx_unlock(&pollsets_lock);
}
#endif