From 300b419486c1ca88e33938f182d5d5a8b90fb73f Mon Sep 17 00:00:00 2001 From: Rickard Green Date: Thu, 17 Jun 2010 10:23:50 +0200 Subject: Rewrite ethread library Large parts of the ethread library have been rewritten. The ethread library is an Erlang runtime system internal, portable thread library used by the runtime system itself. Most notable improvement is a reader optimized rwlock implementation which dramatically improve the performance of read-lock/read-unlock operations on multi processor systems by avoiding ping-ponging of the rwlock cache lines. The reader optimized rwlock implementation is used by miscellaneous rwlocks in the runtime system that are known to be read-locked frequently, and can be enabled on ETS tables by passing the `{read_concurrency, true}' option upon table creation. See the documentation of `ets:new/2' for more information. The ethread library can now also use the libatomic_ops library for atomic memory accesses. This makes it possible for the Erlang runtime system to utilize optimized atomic operations on more platforms than before. Use the `--with-libatomic_ops=PATH' configure command line argument when specifying where the libatomic_ops installation is located. The libatomic_ops library can be downloaded from: http://www.hpl.hp.com/research/linux/atomic_ops/ The changed API of the ethread library has also caused modifications in the Erlang runtime system. Preparations for the to come "delayed deallocation" feature has also been done since it depends on the ethread library. Note: When building for x86, the ethread library will now use instructions that first appeared on the pentium 4 processor. If you want the runtime system to be compatible with older processors (back to 486) you need to pass the `--enable-ethread-pre-pentium4-compatibility' configure command line argument when configuring the system. --- erts/emulator/beam/erl_process.c | 1832 +++++++++++++++++++++++++++++--------- 1 file changed, 1419 insertions(+), 413 deletions(-) (limited to 'erts/emulator/beam/erl_process.c') diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c index 41031f5468..abf4c9d96a 100644 --- a/erts/emulator/beam/erl_process.c +++ b/erts/emulator/beam/erl_process.c @@ -46,7 +46,13 @@ #define ERTS_PROC_MIN_CONTEXT_SWITCH_REDS_COST (CONTEXT_REDS/10) -#define ERTS_SCHED_SLEEP_SPINCOUNT 10000 +#define ERTS_SCHED_SPIN_UNTIL_YIELD 100 + +#define ERTS_SCHED_SYS_SLEEP_SPINCOUNT 10 +#define ERTS_SCHED_TSE_SLEEP_SPINCOUNT_FACT 1000 +#define ERTS_SCHED_TSE_SLEEP_SPINCOUNT \ + (ERTS_SCHED_SYS_SLEEP_SPINCOUNT*ERTS_SCHED_TSE_SLEEP_SPINCOUNT_FACT) +#define ERTS_SCHED_SUSPEND_SLEEP_SPINCOUNT 0 #define ERTS_WAKEUP_OTHER_LIMIT (100*CONTEXT_REDS/2) #define ERTS_WAKEUP_OTHER_DEC 10 @@ -106,6 +112,10 @@ Uint erts_no_schedulers; Uint erts_max_processes = ERTS_DEFAULT_MAX_PROCESSES; Uint erts_process_tab_index_mask; +#ifdef ERTS_SMP +Uint erts_max_main_threads; +#endif + int erts_sched_thread_suggested_stack_size = -1; #ifdef ERTS_ENABLE_LOCK_CHECK @@ -116,16 +126,34 @@ ErtsLcPSDLocks erts_psd_required_locks[ERTS_PSD_SIZE]; int erts_disable_proc_not_running_opt; -#define ERTS_SCHED_CHANGING_ONLINE 1 -#define ERTS_SCHED_CHANGING_MULTI_SCHED 2 +#define ERTS_SCHDLR_SSPND_CHNG_WAITER (((long) 1) << 0) +#define ERTS_SCHDLR_SSPND_CHNG_MSB (((long) 1) << 1) +#define ERTS_SCHDLR_SSPND_CHNG_ONLN (((long) 1) << 2) + +#ifndef DEBUG + +#define ERTS_SCHDLR_SSPND_CHNG_SET(VAL, OLD_VAL) \ + erts_smp_atomic_set(&schdlr_sspnd.changing, (VAL)) + +#else + +#define ERTS_SCHDLR_SSPND_CHNG_SET(VAL, OLD_VAL) \ +do { \ + long old_val__ = erts_smp_atomic_xchg(&schdlr_sspnd.changing, \ + (VAL)); \ + ASSERT(old_val__ == (OLD_VAL)); \ +} while (0) + +#endif + static struct { erts_smp_mtx_t mtx; erts_smp_cnd_t cnd; - int changing; int online; int curr_online; int wait_curr_online; + erts_smp_atomic_t changing; erts_smp_atomic_t active; struct { erts_smp_atomic_t ongoing; @@ -231,6 +259,17 @@ typedef union { ErtsAlignedSchedulerData *erts_aligned_scheduler_data; +#ifdef ERTS_SMP + +typedef union { + ErtsSchedulerSleepInfo ssi; + char align[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsSchedulerSleepInfo))]; +} ErtsAlignedSchedulerSleepInfo; + +static ErtsAlignedSchedulerSleepInfo *aligned_sched_sleep_info; + +#endif + #ifndef BM_COUNTERS static int processes_busy; #endif @@ -288,8 +327,15 @@ ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(proclist, 200, ERTS_ALC_T_PROC_LIST) -#define ERTS_RUNQ_IX(IX) (&erts_aligned_run_queues[(IX)].runq) -#define ERTS_SCHEDULER_IX(IX) (&erts_aligned_scheduler_data[(IX)].esd) +#define ERTS_RUNQ_IX(IX) \ + (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_run_queues), \ + &erts_aligned_run_queues[(IX)].runq) +#define ERTS_SCHEDULER_IX(IX) \ + (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_schedulers), \ + &erts_aligned_scheduler_data[(IX)].esd) +#define ERTS_SCHED_SLEEP_INFO_IX(IX) \ + (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_schedulers), \ + &aligned_sched_sleep_info[(IX)].ssi) #define ERTS_FOREACH_RUNQ(RQVAR, DO) \ do { \ @@ -353,6 +399,11 @@ static void signal_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size #endif +static int reader_group_lookup(int logical); +static void create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, + int *cpudata_size); +static void destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata); + static void early_cpu_bind_init(void); static void late_cpu_bind_init(void); @@ -582,6 +633,76 @@ erts_psd_set_init(Process *p, ErtsProcLocks plocks, int ix, void *data) #ifdef ERTS_SMP +void +erts_sched_finish_poke(ErtsSchedulerSleepInfo *ssi, long flags) +{ + switch (flags & ERTS_SSI_FLGS_SLEEP_TYPE) { + case ERTS_SSI_FLG_POLL_SLEEPING: + erts_sys_schedule_interrupt(1); + break; + case ERTS_SSI_FLG_TSE_SLEEPING: + erts_tse_set(ssi->event); + break; + case 0: + break; + default: + erl_exit(ERTS_ABORT_EXIT, "%s:%d: Internal error\n", + __FILE__, __LINE__); + break; + } +} + +#ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN +void +erts_smp_notify_check_children_needed(void) +{ + int i; + + for (i = 0; i < erts_no_schedulers; i++) { + long aux_work; + ErtsSchedulerSleepInfo *ssi; + ssi = ERTS_SCHED_SLEEP_INFO_IX(i); + aux_work = erts_smp_atomic_bor(&ssi->aux_work, + ERTS_SSI_AUX_WORK_CHECK_CHILDREN); + if (!(aux_work & ERTS_SSI_AUX_WORK_CHECK_CHILDREN)) + erts_sched_poke(ssi); + } +} +#endif + +#ifdef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK +static ERTS_INLINE long +blockable_aux_work(ErtsSchedulerData *esdp, + ErtsSchedulerSleepInfo *ssi, + long aux_work) +{ + if (aux_work & ERTS_SSI_BLOCKABLE_AUX_WORK_MASK) { +#ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN + if (aux_work & ERTS_SSI_AUX_WORK_CHECK_CHILDREN) { + aux_work = erts_smp_atomic_band(&ssi->aux_work, + ~ERTS_SSI_AUX_WORK_CHECK_CHILDREN); + aux_work &= ~ERTS_SSI_AUX_WORK_CHECK_CHILDREN; + erts_check_children(); + } +#endif + } + return aux_work; +} + +#endif + +#ifdef ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK +static ERTS_INLINE long +nonblockable_aux_work(ErtsSchedulerData *esdp, + ErtsSchedulerSleepInfo *ssi, + long aux_work) +{ + if (aux_work & ERTS_SSI_NONBLOCKABLE_AUX_WORK_MASK) { + + } +} +#endif + static void prepare_for_block(void *vrq) { @@ -634,8 +755,32 @@ erts_active_schedulers(void) return as; } +static ERTS_INLINE int +prepare_for_sys_schedule(void) +{ +#ifdef ERTS_SMP + while (!erts_port_task_have_outstanding_io_tasks() + && !erts_smp_atomic_xchg(&doing_sys_schedule, 1)) { + if (!erts_port_task_have_outstanding_io_tasks()) + return 1; + erts_smp_atomic_set(&doing_sys_schedule, 0); + } + return 0; +#else + return !erts_port_task_have_outstanding_io_tasks(); +#endif +} + #ifdef ERTS_SMP +static ERTS_INLINE void +sched_change_waiting_sys_to_waiting(Uint no, ErtsRunQueue *rq) +{ + ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(rq)); + ASSERT(rq->waiting < 0); + rq->waiting *= -1; +} + static ERTS_INLINE void sched_waiting(Uint no, ErtsRunQueue *rq) { @@ -695,101 +840,304 @@ non_empty_runq(ErtsRunQueue *rq) } } -static ERTS_INLINE int -sched_spin_wake(ErtsRunQueue *rq) +static long +sched_prep_spin_wait(ErtsSchedulerSleepInfo *ssi) { -#if ERTS_SCHED_SLEEP_SPINCOUNT == 0 - return 0; -#else - long val; - ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(rq)); + long oflgs; + long nflgs = (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING); + long xflgs = 0; - val = erts_smp_atomic_read(&rq->spin_waiter); - ASSERT(val >= 0); - if (val != 0) { - erts_smp_atomic_inc(&rq->spin_wake); - return 1; - } - return 0; -#endif + do { + oflgs = erts_smp_atomic_cmpxchg(&ssi->flags, nflgs, xflgs); + if (oflgs == xflgs) + return nflgs; + xflgs = oflgs; + } while (!(oflgs & ERTS_SSI_FLG_SUSPENDED)); + return oflgs; } -static ERTS_INLINE int -sched_spin_wake_all(ErtsRunQueue *rq) +static long +sched_prep_cont_spin_wait(ErtsSchedulerSleepInfo *ssi) { -#if ERTS_SCHED_SLEEP_SPINCOUNT == 0 - return 0; -#else - long val; - ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(rq)); + long oflgs; + long nflgs = (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING); + long xflgs = ERTS_SSI_FLG_WAITING; - val = erts_smp_atomic_read(&rq->spin_waiter); - ASSERT(val >= 0); - if (val != 0) - erts_smp_atomic_add(&rq->spin_wake, val); - return val; -#endif + do { + oflgs = erts_smp_atomic_cmpxchg(&ssi->flags, nflgs, xflgs); + if (oflgs == xflgs) + return nflgs; + xflgs = oflgs; + nflgs |= oflgs & ERTS_SSI_FLG_SUSPENDED; + } while (oflgs & ERTS_SSI_FLG_WAITING); + return oflgs; +} + +static long +sched_spin_wait(ErtsSchedulerSleepInfo *ssi, int spincount) +{ + long until_yield = ERTS_SCHED_SPIN_UNTIL_YIELD; + int sc = spincount; + long flgs; + + do { + flgs = erts_smp_atomic_read(&ssi->flags); + if ((flgs & (ERTS_SSI_FLG_SLEEPING|ERTS_SSI_FLG_WAITING)) + != (ERTS_SSI_FLG_SLEEPING|ERTS_SSI_FLG_WAITING)) { + break; + } + ERTS_SPIN_BODY; + if (--until_yield == 0) { + until_yield = ERTS_SCHED_SPIN_UNTIL_YIELD; + erts_thr_yield(); + } + } while (--sc > 0); + return flgs; +} + +static long +sched_set_sleeptype(ErtsSchedulerSleepInfo *ssi, long sleep_type) +{ + long oflgs; + long nflgs = ERTS_SSI_FLG_SLEEPING|ERTS_SSI_FLG_WAITING|sleep_type; + long xflgs = ERTS_SSI_FLG_SLEEPING|ERTS_SSI_FLG_WAITING; + + if (sleep_type == ERTS_SSI_FLG_TSE_SLEEPING) + erts_tse_reset(ssi->event); + + while (1) { + oflgs = erts_smp_atomic_cmpxchg(&ssi->flags, nflgs, xflgs); + if (oflgs == xflgs) + return nflgs; + if ((oflgs & (ERTS_SSI_FLG_SLEEPING|ERTS_SSI_FLG_WAITING)) + != (ERTS_SSI_FLG_SLEEPING|ERTS_SSI_FLG_WAITING)) { + return oflgs; + } + xflgs = oflgs; + nflgs |= oflgs & ERTS_SSI_FLG_SUSPENDED; + } } +#define ERTS_SCHED_WAIT_WOKEN(FLGS) \ + (((FLGS) & (ERTS_SSI_FLG_WAITING|ERTS_SSI_FLG_SUSPENDED)) \ + != ERTS_SSI_FLG_WAITING) + static void -sched_sys_wait(Uint no, ErtsRunQueue *rq) +scheduler_wait(long *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq) { - long dt; -#if ERTS_SCHED_SLEEP_SPINCOUNT != 0 - int val; - int spincount = ERTS_SCHED_SLEEP_SPINCOUNT; + ErtsSchedulerSleepInfo *ssi = esdp->ssi; + int spincount; + long flgs; +#if defined(ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK) \ + || defined(ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK) + long aux_work; +#endif + ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(rq)); + erts_smp_spin_lock(&rq->sleepers.lock); + flgs = sched_prep_spin_wait(ssi); + if (flgs & ERTS_SSI_FLG_SUSPENDED) { + /* Go suspend instead... */ + erts_smp_spin_unlock(&rq->sleepers.lock); + return; + } + + ssi->prev = NULL; + ssi->next = rq->sleepers.list; + if (rq->sleepers.list) + rq->sleepers.list->prev = ssi; + rq->sleepers.list = ssi; + erts_smp_spin_unlock(&rq->sleepers.lock); + + /* + * If all schedulers are waiting, one of them *should* + * be waiting in erl_sys_schedule() + */ + + if (!prepare_for_sys_schedule()) { + + sched_waiting(esdp->no, rq); + + erts_smp_runq_unlock(rq); + + spincount = ERTS_SCHED_TSE_SLEEP_SPINCOUNT; + + tse_wait: + +#ifdef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = erts_smp_atomic_read(&ssi->aux_work); + tse_blockable_aux_work: + aux_work = blockable_aux_work(esdp, ssi, aux_work); #endif + erts_smp_activity_begin(ERTS_ACTIVITY_WAIT, NULL, NULL, NULL); - sched_waiting_sys(no, rq); + while (1) { -#if ERTS_SCHED_SLEEP_SPINCOUNT != 0 - erts_smp_atomic_inc(&rq->spin_waiter); - erts_smp_runq_unlock(rq); +#ifdef ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK +#ifndef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = erts_smp_atomic_read(&ssi->aux_work); +#endif + nonblockable_aux_work(esdp, ssi, aux_work); +#endif - erl_sys_schedule(1); /* Might give us something to do */ + flgs = sched_spin_wait(ssi, spincount); + if (flgs & ERTS_SSI_FLG_SLEEPING) { + ASSERT(flgs & ERTS_SSI_FLG_WAITING); + flgs = sched_set_sleeptype(ssi, ERTS_SSI_FLG_TSE_SLEEPING); + if (flgs & ERTS_SSI_FLG_SLEEPING) { + int res; + ASSERT(flgs & ERTS_SSI_FLG_TSE_SLEEPING); + ASSERT(flgs & ERTS_SSI_FLG_WAITING); + do { + res = erts_tse_wait(ssi->event); + } while (res == EINTR); + } + } - dt = do_time_read_and_reset(); - if (dt) bump_timer(dt); + if (!(flgs & ERTS_SSI_FLG_WAITING)) { + ASSERT(!(flgs & ERTS_SSI_FLG_SLEEPING)); + break; + } + + flgs = sched_prep_cont_spin_wait(ssi); + spincount = ERTS_SCHED_TSE_SLEEP_SPINCOUNT; + + if (!(flgs & ERTS_SSI_FLG_WAITING)) { + ASSERT(!(flgs & ERTS_SSI_FLG_SLEEPING)); + break; + } + +#ifdef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = erts_smp_atomic_read(&ssi->aux_work); + if (aux_work & ERTS_SSI_BLOCKABLE_AUX_WORK_MASK) { + erts_smp_activity_end(ERTS_ACTIVITY_WAIT, NULL, NULL, NULL); + goto tse_blockable_aux_work; + } +#endif - while (spincount-- > 0) { - val = erts_smp_atomic_read(&rq->spin_wake); - ASSERT(val >= 0); - if (val != 0) { - erts_smp_runq_lock(rq); - val = erts_smp_atomic_read(&rq->spin_wake); - ASSERT(val >= 0); - if (val != 0) - goto woken; - if (spincount == 0) - goto sleep; - erts_smp_runq_unlock(rq); } - } - erts_smp_runq_lock(rq); - val = erts_smp_atomic_read(&rq->spin_wake); - ASSERT(val >= 0); - if (val != 0) { - woken: - erts_smp_atomic_dec(&rq->spin_wake); - ASSERT(erts_smp_atomic_read(&rq->spin_wake) >= 0); - erts_smp_atomic_dec(&rq->spin_waiter); - ASSERT(erts_smp_atomic_read(&rq->spin_waiter) >= 0); + erts_smp_activity_end(ERTS_ACTIVITY_WAIT, NULL, NULL, NULL); + + if (flgs & ~ERTS_SSI_FLG_SUSPENDED) + erts_smp_atomic_band(&ssi->flags, ERTS_SSI_FLG_SUSPENDED); + + erts_smp_runq_lock(rq); + sched_active(esdp->no, rq); + } else { - sleep: - erts_smp_atomic_dec(&rq->spin_waiter); - ASSERT(erts_smp_atomic_read(&rq->spin_waiter) >= 0); + long dt; + + erts_smp_atomic_set(&function_calls, 0); + *fcalls = 0; + + sched_waiting_sys(esdp->no, rq); + + erts_smp_runq_unlock(rq); + + spincount = ERTS_SCHED_SYS_SLEEP_SPINCOUNT; + + while (spincount-- > 0) { + + sys_poll_aux_work: + + erl_sys_schedule(1); /* Might give us something to do */ + + dt = do_time_read_and_reset(); + if (dt) bump_timer(dt); + + sys_aux_work: + +#ifdef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = erts_smp_atomic_read(&ssi->aux_work); + aux_work = blockable_aux_work(esdp, ssi, aux_work); +#endif +#ifdef ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK +#ifndef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = erts_smp_atomic_read(&ssi->aux_work); +#endif + nonblockable_aux_work(esdp, ssi, aux_work); +#endif + + flgs = erts_smp_atomic_read(&ssi->flags); + if (!(flgs & ERTS_SSI_FLG_WAITING)) { + ASSERT(!(flgs & ERTS_SSI_FLG_SLEEPING)); + goto sys_woken; + } + if (!(flgs & ERTS_SSI_FLG_SLEEPING)) { + flgs = sched_prep_cont_spin_wait(ssi); + if (!(flgs & ERTS_SSI_FLG_WAITING)) { + ASSERT(!(flgs & ERTS_SSI_FLG_SLEEPING)); + goto sys_woken; + } + } + + /* + * If we got new I/O tasks we aren't allowed to + * call erl_sys_schedule() until it is handled. + */ + if (erts_port_task_have_outstanding_io_tasks()) { + erts_smp_atomic_set(&doing_sys_schedule, 0); + /* + * Got to check that we still got I/O tasks; otherwise + * we have to continue checking for I/O... + */ + if (!prepare_for_sys_schedule()) { + spincount *= ERTS_SCHED_TSE_SLEEP_SPINCOUNT_FACT; + goto tse_wait; + } + } + } + + erts_smp_runq_lock(rq); + /* * If we got new I/O tasks we aren't allowed to * sleep in erl_sys_schedule(). */ - if (!erts_port_task_have_outstanding_io_tasks()) { -#endif + if (erts_port_task_have_outstanding_io_tasks()) { + erts_smp_atomic_set(&doing_sys_schedule, 0); + /* + * Got to check that we still got I/O tasks; otherwise + * we have to wait in erl_sys_schedule() after all... + */ + if (prepare_for_sys_schedule()) + goto do_sys_schedule; + + /* + * Not allowed to wait in erl_sys_schedule; + * do tse wait instead... + */ + sched_change_waiting_sys_to_waiting(esdp->no, rq); + erts_smp_runq_unlock(rq); + spincount = 0; + goto tse_wait; + } + else { + do_sys_schedule: erts_sys_schedule_interrupt(0); + flgs = sched_set_sleeptype(ssi, ERTS_SSI_FLG_POLL_SLEEPING); + if (!(flgs & ERTS_SSI_FLG_SLEEPING)) { + if (!(flgs & ERTS_SSI_FLG_WAITING)) + goto sys_locked_woken; + erts_smp_runq_unlock(rq); + flgs = sched_prep_cont_spin_wait(ssi); + if (!(flgs & ERTS_SSI_FLG_WAITING)) { + ASSERT(!(flgs & ERTS_SSI_FLG_SLEEPING)); + goto sys_woken; + } + ASSERT(!erts_port_task_have_outstanding_io_tasks()); + goto sys_poll_aux_work; + } + + ASSERT(flgs & ERTS_SSI_FLG_POLL_SLEEPING); + ASSERT(flgs & ERTS_SSI_FLG_WAITING); + erts_smp_runq_unlock(rq); erl_sys_schedule(0); @@ -797,134 +1145,103 @@ sched_sys_wait(Uint no, ErtsRunQueue *rq) dt = do_time_read_and_reset(); if (dt) bump_timer(dt); - erts_smp_runq_lock(rq); + flgs = sched_prep_cont_spin_wait(ssi); + if (flgs & ERTS_SSI_FLG_WAITING) + goto sys_aux_work; -#if ERTS_SCHED_SLEEP_SPINCOUNT != 0 + sys_woken: + erts_smp_runq_lock(rq); + sys_locked_woken: + erts_smp_atomic_set(&doing_sys_schedule, 0); + if (flgs & ~ERTS_SSI_FLG_SUSPENDED) + erts_smp_atomic_band(&ssi->flags, ERTS_SSI_FLG_SUSPENDED); + sched_active_sys(esdp->no, rq); } } -#endif - - sched_active_sys(no, rq); -} -static void -sched_cnd_wait(Uint no, ErtsRunQueue *rq) -{ -#if ERTS_SCHED_SLEEP_SPINCOUNT != 0 - int val; - int spincount = ERTS_SCHED_SLEEP_SPINCOUNT; ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(rq)); -#endif - - sched_waiting(no, rq); - erts_smp_activity_begin(ERTS_ACTIVITY_WAIT, - prepare_for_block, - resume_after_block, - (void *) rq); - -#if ERTS_SCHED_SLEEP_SPINCOUNT == 0 - erts_smp_cnd_wait(&rq->cnd, &rq->mtx); -#else - erts_smp_atomic_inc(&rq->spin_waiter); - erts_smp_mtx_unlock(&rq->mtx); - - while (spincount-- > 0) { - val = erts_smp_atomic_read(&rq->spin_wake); - ASSERT(val >= 0); - if (val != 0) { - erts_smp_mtx_lock(&rq->mtx); - val = erts_smp_atomic_read(&rq->spin_wake); - ASSERT(val >= 0); - if (val != 0) - goto woken; - if (spincount == 0) - goto sleep; - erts_smp_mtx_unlock(&rq->mtx); - } - } - - erts_smp_mtx_lock(&rq->mtx); - val = erts_smp_atomic_read(&rq->spin_wake); - ASSERT(val >= 0); - if (val == 0) { - sleep: - erts_smp_atomic_dec(&rq->spin_waiter); - ASSERT(erts_smp_atomic_read(&rq->spin_waiter) >= 0); - erts_smp_cnd_wait(&rq->cnd, &rq->mtx); - } - else { - woken: - erts_smp_atomic_dec(&rq->spin_wake); - ASSERT(erts_smp_atomic_read(&rq->spin_wake) >= 0); - erts_smp_atomic_dec(&rq->spin_waiter); - ASSERT(erts_smp_atomic_read(&rq->spin_waiter) >= 0); - } -#endif - - erts_smp_activity_end(ERTS_ACTIVITY_WAIT, - prepare_for_block, - resume_after_block, - (void *) rq); - - sched_active(no, rq); } -static void -wake_one_scheduler(void) -{ - ASSERT(erts_common_run_queue); - ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(erts_common_run_queue)); - if (erts_common_run_queue->waiting) { - if (!sched_spin_wake(erts_common_run_queue)) { - if (erts_common_run_queue->waiting == -1) /* One scheduler waiting - and doing so in - sys_schedule */ - erts_sys_schedule_interrupt(1); - else - erts_smp_cnd_signal(&erts_common_run_queue->cnd); - } +static ERTS_INLINE long +ssi_flags_set_wake(ErtsSchedulerSleepInfo *ssi) +{ + /* reset all flags but suspended */ + long oflgs; + long nflgs = 0; + long xflgs = ERTS_SSI_FLG_SLEEPING|ERTS_SSI_FLG_WAITING; + while (1) { + oflgs = erts_smp_atomic_cmpxchg(&ssi->flags, nflgs, xflgs); + if (oflgs == xflgs) + return oflgs; + nflgs = oflgs & ERTS_SSI_FLG_SUSPENDED; + xflgs = oflgs; } } static void -wake_scheduler(ErtsRunQueue *rq, int incq) +wake_scheduler(ErtsRunQueue *rq, int incq, int one) { - ASSERT(!erts_common_run_queue); - ASSERT(-1 <= rq->waiting && rq->waiting <= 1); - ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(rq)); - if (rq->waiting && !rq->woken) { - if (!sched_spin_wake(rq)) { - if (rq->waiting < 0) - erts_sys_schedule_interrupt(1); - else - erts_smp_cnd_signal(&rq->cnd); + int res; + ErtsSchedulerSleepInfo *ssi; + ErtsSchedulerSleepList *sl; + + /* + * The unlocked run queue is not strictly necessary + * from a thread safety or deadlock prevention + * perspective. It will, however, cost us performance + * if it is locked during wakup of another scheduler, + * so all code *should* handle this without having + * the lock on the run queue. + */ + ERTS_SMP_LC_ASSERT(!erts_smp_lc_runq_is_locked(rq)); + + sl = &rq->sleepers; + + erts_smp_spin_lock(&sl->lock); + ssi = sl->list; + if (!ssi) + erts_smp_spin_unlock(&sl->lock); + else if (one) { + long flgs; + if (ssi->prev) + ssi->prev->next = ssi->next; + else { + ASSERT(sl->list == ssi); + sl->list = ssi->next; } - rq->woken = 1; - if (incq) + if (ssi->next) + ssi->next->prev = ssi->prev; + + res = sl->list != NULL; + erts_smp_spin_unlock(&sl->lock); + + flgs = ssi_flags_set_wake(ssi); + erts_sched_finish_poke(ssi, flgs); + + if (incq && !erts_common_run_queue && (flgs & ERTS_SSI_FLG_WAITING)) non_empty_runq(rq); } + else { + sl->list = NULL; + erts_smp_spin_unlock(&sl->lock); + do { + ErtsSchedulerSleepInfo *wake_ssi = ssi; + ssi = ssi->next; + erts_sched_finish_poke(ssi, ssi_flags_set_wake(wake_ssi)); + } while (ssi); + } } static void wake_all_schedulers(void) { - if (erts_common_run_queue) { - erts_smp_runq_lock(erts_common_run_queue); - if (erts_common_run_queue->waiting) { - if (erts_common_run_queue->waiting < 0) - erts_sys_schedule_interrupt(1); - sched_spin_wake_all(erts_common_run_queue); - erts_smp_cnd_broadcast(&erts_common_run_queue->cnd); - } - erts_smp_runq_unlock(erts_common_run_queue); - } + if (erts_common_run_queue) + wake_scheduler(erts_common_run_queue, 0, 0); else { int ix; for (ix = 0; ix < erts_no_run_queues; ix++) { ErtsRunQueue *rq = ERTS_RUNQ_IX(ix); - erts_smp_runq_lock(rq); - wake_scheduler(rq, 0); - erts_smp_runq_unlock(rq); + wake_scheduler(rq, 0, 1); } } } @@ -939,14 +1256,14 @@ chk_wake_sched(ErtsRunQueue *crq, int ix, int activate) wrq = ERTS_RUNQ_IX(ix); iflgs = erts_smp_atomic_read(&wrq->info_flags); if (!(iflgs & (ERTS_RUNQ_IFLG_SUSPENDED|ERTS_RUNQ_IFLG_NONEMPTY))) { - erts_smp_xrunq_lock(crq, wrq); if (activate) { if (ix == erts_smp_atomic_cmpxchg(&balance_info.active_runqs, ix+1, ix)) { + erts_smp_xrunq_lock(crq, wrq); wrq->flags &= ~ERTS_RUNQ_FLG_INACTIVE; + erts_smp_xrunq_unlock(crq, wrq); } } - wake_scheduler(wrq, 0); - erts_smp_xrunq_unlock(crq, wrq); + wake_scheduler(wrq, 0, 1); return 1; } return 0; @@ -992,15 +1309,13 @@ static ERTS_INLINE void smp_notify_inc_runq(ErtsRunQueue *runq) { #ifdef ERTS_SMP - if (erts_common_run_queue) - wake_one_scheduler(); - else - wake_scheduler(runq, 1); + if (runq) + wake_scheduler(runq, 1, 1); #endif } void -erts_smp_notify_inc_runq__(ErtsRunQueue *runq) +erts_smp_notify_inc_runq(ErtsRunQueue *runq) { smp_notify_inc_runq(runq); } @@ -1146,20 +1461,23 @@ static void evacuate_run_queue(ErtsRunQueue *evac_rq, ErtsRunQueue *rq) { Port *prt; + int notify_to_rq = 0; int prio; int prt_locked = 0; int rq_locked = 0; int evac_rq_locked = 1; + ErtsMigrateResult mres; erts_smp_runq_lock(evac_rq); + erts_smp_atomic_bor(&evac_rq->scheduler->ssi->flags, ERTS_SSI_FLG_SUSPENDED); + evac_rq->flags &= ~ERTS_RUNQ_FLGS_IMMIGRATE_QMASK; evac_rq->flags |= (ERTS_RUNQ_FLGS_EMIGRATE_QMASK | ERTS_RUNQ_FLGS_EVACUATE_QMASK | ERTS_RUNQ_FLG_SUSPENDED); erts_smp_atomic_bor(&evac_rq->info_flags, ERTS_RUNQ_IFLG_SUSPENDED); - /* * Need to set up evacuation paths first since we * may release the run queue lock on evac_rq @@ -1187,9 +1505,11 @@ evacuate_run_queue(ErtsRunQueue *evac_rq, ErtsRunQueue *rq) /* Evacuate scheduled ports */ prt = evac_rq->ports.start; while (prt) { - (void) erts_port_migrate(prt, &prt_locked, + mres = erts_port_migrate(prt, &prt_locked, evac_rq, &evac_rq_locked, rq, &rq_locked); + if (mres == ERTS_MIGRATE_SUCCESS) + notify_to_rq = 1; if (prt_locked) erts_smp_port_unlock(prt); if (!evac_rq_locked) { @@ -1218,9 +1538,11 @@ evacuate_run_queue(ErtsRunQueue *evac_rq, ErtsRunQueue *rq) goto end_of_proc; } - (void) erts_proc_migrate(proc, &proc_locks, + mres = erts_proc_migrate(proc, &proc_locks, evac_rq, &evac_rq_locked, rq, &rq_locked); + if (mres == ERTS_MIGRATE_SUCCESS) + notify_to_rq = 1; if (proc_locks) erts_smp_proc_unlock(proc, proc_locks); if (!evac_rq_locked) { @@ -1252,10 +1574,13 @@ evacuate_run_queue(ErtsRunQueue *evac_rq, ErtsRunQueue *rq) if (rq_locked) erts_smp_runq_unlock(rq); - if (!evac_rq_locked) - erts_smp_runq_lock(evac_rq); - wake_scheduler(evac_rq, 0); - erts_smp_runq_unlock(evac_rq); + if (evac_rq_locked) + erts_smp_runq_unlock(evac_rq); + + if (notify_to_rq) + smp_notify_inc_runq(rq); + + wake_scheduler(evac_rq, 0, 1); } static int @@ -1483,31 +1808,6 @@ try_steal_task(ErtsRunQueue *rq) return res; } -#ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN -void -erts_smp_notify_check_children_needed(void) -{ - int i; - for (i = 0; i < erts_no_schedulers; i++) { - erts_smp_runq_lock(ERTS_SCHEDULER_IX(i)->run_queue); - ERTS_SCHEDULER_IX(i)->check_children = 1; - if (!erts_common_run_queue) - wake_scheduler(ERTS_SCHEDULER_IX(i)->run_queue, 0); - erts_smp_runq_unlock(ERTS_SCHEDULER_IX(i)->run_queue); - } - if (ongoing_multi_scheduling_block()) { - /* Also blocked schedulers need to check children */ - erts_smp_mtx_lock(&schdlr_sspnd.mtx); - for (i = 0; i < erts_no_schedulers; i++) - ERTS_SCHEDULER_IX(i)->blocked_check_children = 1; - erts_smp_cnd_broadcast(&schdlr_sspnd.cnd); - erts_smp_mtx_unlock(&schdlr_sspnd.mtx); - } - if (erts_common_run_queue) - wake_all_schedulers(); -} -#endif - /* Run queue balancing */ typedef struct { @@ -2100,6 +2400,8 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) erts_smp_atomic_init(&no_empty_run_queues, 0); #endif + erts_no_run_queues = n; + for (ix = 0; ix < n; ix++) { int pix, rix; ErtsRunQueue *rq = ERTS_RUNQ_IX(ix); @@ -2114,8 +2416,10 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) erts_smp_mtx_init_x(&rq->mtx, "run_queue", make_small(ix + 1)); erts_smp_cnd_init(&rq->cnd); - erts_smp_atomic_init(&rq->spin_waiter, 0); - erts_smp_atomic_init(&rq->spin_wake, 0); +#ifdef ERTS_SMP + erts_smp_spinlock_init(&rq->sleepers.lock, "run_queue_sleep_list"); + rq->sleepers.list = NULL; +#endif rq->waiting = 0; rq->woken = 0; @@ -2166,7 +2470,6 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) } erts_common_run_queue = !mrq ? ERTS_RUNQ_IX(0) : NULL; - erts_no_run_queues = n; #ifdef ERTS_SMP @@ -2179,11 +2482,36 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) * erts_no_run_queues)); } +#endif + + n = (int) no_schedulers; + erts_no_schedulers = n; + +#ifdef ERTS_SMP + /* Create and initialize scheduler sleep info */ + + aligned_sched_sleep_info = erts_alloc(ERTS_ALC_T_SCHDLR_SLP_INFO, + (sizeof(ErtsAlignedSchedulerSleepInfo) + *(n+1))); + if ((((Uint) aligned_sched_sleep_info) & ERTS_CACHE_LINE_MASK) == 0) + aligned_sched_sleep_info = ((ErtsAlignedSchedulerSleepInfo *) + ((((Uint) aligned_sched_sleep_info) + & ~ERTS_CACHE_LINE_MASK) + + ERTS_CACHE_LINE_SIZE)); + for (ix = 0; ix < n; ix++) { + ErtsSchedulerSleepInfo *ssi = ERTS_SCHED_SLEEP_INFO_IX(ix); +#if 0 /* no need to initialize these... */ + ssi->next = NULL; + ssi->prev = NULL; +#endif + erts_smp_atomic_init(&ssi->flags, 0); + ssi->event = NULL; /* initialized in sched_thread_func */ + erts_smp_atomic_init(&ssi->aux_work, 0); + } #endif /* Create and initialize scheduler specific data */ - n = (int) no_schedulers; erts_aligned_scheduler_data = erts_alloc(ERTS_ALC_T_SCHDLR_DATA, (sizeof(ErtsAlignedSchedulerData) *(n+1))); @@ -2200,6 +2528,7 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) #ifdef ERTS_SMP erts_bits_init_state(&esdp->erl_bits_state); esdp->match_pseudo_process = NULL; + esdp->ssi = ERTS_SCHED_SLEEP_INFO_IX(ix); esdp->free_process = NULL; #if HALFWORD_HEAP /* Registers need to be heap allocated (correct memory range) for tracing to work */ @@ -2228,11 +2557,6 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) } #ifdef ERTS_SMP -#ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN - esdp->check_children = 0; - esdp->blocked_check_children = 0; -#endif - erts_smp_atomic_init(&esdp->suspended, 0); erts_smp_atomic_init(&esdp->chk_cpu_bind, 0); #endif } @@ -2241,7 +2565,7 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) erts_smp_mtx_init(&schdlr_sspnd.mtx, "schdlr_sspnd"); erts_smp_cnd_init(&schdlr_sspnd.cnd); - schdlr_sspnd.changing = 0; + erts_smp_atomic_init(&schdlr_sspnd.changing, 0); schdlr_sspnd.online = no_schedulers_online; schdlr_sspnd.curr_online = no_schedulers; erts_smp_atomic_init(&schdlr_sspnd.msb.ongoing, 0); @@ -2264,7 +2588,8 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) if (no_schedulers_online < no_schedulers) { if (erts_common_run_queue) { for (ix = no_schedulers_online; ix < no_schedulers; ix++) - erts_smp_atomic_set(&(ERTS_SCHEDULER_IX(ix)->suspended), 1); + erts_smp_atomic_bor(&ERTS_SCHED_SLEEP_INFO_IX(ix)->flags, + ERTS_SSI_FLG_SUSPENDED); } else { for (ix = no_schedulers_online; ix < erts_no_run_queues; ix++) @@ -2275,7 +2600,8 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) schdlr_sspnd.wait_curr_online = no_schedulers_online; schdlr_sspnd.curr_online *= 2; /* Boot strapping... */ - schdlr_sspnd.changing = ERTS_SCHED_CHANGING_ONLINE; + ERTS_SCHDLR_SSPND_CHNG_SET((ERTS_SCHDLR_SSPND_CHNG_ONLN + | ERTS_SCHDLR_SSPND_CHNG_WAITER), 0); erts_smp_atomic_init(&doing_sys_schedule, 0); @@ -2422,14 +2748,116 @@ susp_sched_resume_block(void *unused) erts_smp_mtx_lock(&schdlr_sspnd.mtx); } +static void +scheduler_ix_resume_wake(Uint ix) +{ + ErtsSchedulerSleepInfo *ssi = ERTS_SCHED_SLEEP_INFO_IX(ix); + long xflgs = (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_TSE_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED); + long oflgs; + do { + oflgs = erts_smp_atomic_cmpxchg(&ssi->flags, 0, xflgs); + if (oflgs == xflgs) { + erts_sched_finish_poke(ssi, oflgs); + break; + } + xflgs = oflgs; + } while (oflgs & ERTS_SSI_FLG_SUSPENDED); +} + +static long +sched_prep_spin_suspended(ErtsSchedulerSleepInfo *ssi, long xpct) +{ + long oflgs; + long nflgs = (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED); + long xflgs = xpct; + + do { + oflgs = erts_smp_atomic_cmpxchg(&ssi->flags, nflgs, xflgs); + if (oflgs == xflgs) + return nflgs; + xflgs = oflgs; + } while (oflgs & ERTS_SSI_FLG_SUSPENDED); + + return oflgs; +} + +static long +sched_spin_suspended(ErtsSchedulerSleepInfo *ssi, int spincount) +{ + int until_yield = ERTS_SCHED_SPIN_UNTIL_YIELD; + int sc = spincount; + long flgs; + + do { + flgs = erts_smp_atomic_read(&ssi->flags); + if ((flgs & (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED)) + != (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED)) { + break; + } + ERTS_SPIN_BODY; + if (--until_yield == 0) { + until_yield = ERTS_SCHED_SPIN_UNTIL_YIELD; + erts_thr_yield(); + } + } while (--sc > 0); + return flgs; +} + +static long +sched_set_suspended_sleeptype(ErtsSchedulerSleepInfo *ssi) +{ + long oflgs; + long nflgs = (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_TSE_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED); + long xflgs = (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED); + + erts_tse_reset(ssi->event); + + while (1) { + oflgs = erts_smp_atomic_cmpxchg(&ssi->flags, nflgs, xflgs); + if (oflgs == xflgs) + return nflgs; + if ((oflgs & (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED)) + != (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED)) { + return oflgs; + } + xflgs = oflgs; + } +} + static void suspend_scheduler(ErtsSchedulerData *esdp) { + long flgs; + int changing; long no = (long) esdp->no; ErtsRunQueue *rq = esdp->run_queue; + ErtsSchedulerSleepInfo *ssi = esdp->ssi; long active_schedulers; int curr_online = 1; int wake = 0; + int reset_read_group = 0; +#if defined(ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK) \ + || defined(ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK) + long aux_work; +#endif /* * Schedulers may be suspended in two different ways: @@ -2451,113 +2879,151 @@ suspend_scheduler(ErtsSchedulerData *esdp) if (scheduler2cpu_map[esdp->no].bound_id >= 0 && erts_unbind_from_cpu(erts_cpuinfo) == 0) { esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; + reset_read_group = 1; } erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); + if (reset_read_group) + erts_smp_rwmtx_set_reader_group(0); + + if (esdp->no <= erts_max_main_threads) + erts_thr_set_main_status(0, 0); + if (erts_system_profile_flags.scheduler) profile_scheduler(make_small(esdp->no), am_inactive); erts_smp_mtx_lock(&schdlr_sspnd.mtx); - active_schedulers = erts_smp_atomic_dectest(&schdlr_sspnd.active); - ASSERT(active_schedulers >= 1); - if (schdlr_sspnd.changing == ERTS_SCHED_CHANGING_MULTI_SCHED) { - if (active_schedulers == schdlr_sspnd.msb.wait_active) - wake = 1; - if (active_schedulers == 1) - schdlr_sspnd.changing = 0; - } + flgs = sched_prep_spin_suspended(ssi, ERTS_SSI_FLG_SUSPENDED); + if (flgs & ERTS_SSI_FLG_SUSPENDED) { - while (1) { - -#ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN - int check_children; - erts_smp_runq_lock(esdp->run_queue); - check_children = esdp->check_children; - esdp->check_children = 0; - erts_smp_runq_unlock(esdp->run_queue); - if (check_children) { - erts_smp_mtx_unlock(&schdlr_sspnd.mtx); - erts_check_children(); - erts_smp_mtx_lock(&schdlr_sspnd.mtx); + active_schedulers = erts_smp_atomic_dectest(&schdlr_sspnd.active); + ASSERT(active_schedulers >= 1); + changing = erts_smp_atomic_read(&schdlr_sspnd.changing); + if (changing & ERTS_SCHDLR_SSPND_CHNG_MSB) { + if (active_schedulers == schdlr_sspnd.msb.wait_active) + wake = 1; + if (active_schedulers == 1) { + changing = erts_smp_atomic_band(&schdlr_sspnd.changing, + ~ERTS_SCHDLR_SSPND_CHNG_MSB); + changing &= ~ERTS_SCHDLR_SSPND_CHNG_MSB; + } } -#endif - if (schdlr_sspnd.changing == ERTS_SCHED_CHANGING_ONLINE) { - int changed = 0; - if (no > schdlr_sspnd.online && curr_online) { - schdlr_sspnd.curr_online--; - curr_online = 0; - changed = 1; + while (1) { + if (changing & ERTS_SCHDLR_SSPND_CHNG_ONLN) { + int changed = 0; + if (no > schdlr_sspnd.online && curr_online) { + schdlr_sspnd.curr_online--; + curr_online = 0; + changed = 1; + } + else if (no <= schdlr_sspnd.online && !curr_online) { + schdlr_sspnd.curr_online++; + curr_online = 1; + changed = 1; + } + if (changed + && schdlr_sspnd.curr_online == schdlr_sspnd.wait_curr_online) + wake = 1; + if (schdlr_sspnd.online == schdlr_sspnd.curr_online) { + changing = erts_smp_atomic_band(&schdlr_sspnd.changing, + ~ERTS_SCHDLR_SSPND_CHNG_ONLN); + changing &= ~ERTS_SCHDLR_SSPND_CHNG_ONLN; + } } - else if (no <= schdlr_sspnd.online && !curr_online) { - schdlr_sspnd.curr_online++; - curr_online = 1; - changed = 1; + + if (wake) { + erts_smp_cnd_signal(&schdlr_sspnd.cnd); + wake = 0; } - if (changed - && schdlr_sspnd.curr_online == schdlr_sspnd.wait_curr_online) - wake = 1; - if (schdlr_sspnd.online == schdlr_sspnd.curr_online) - schdlr_sspnd.changing = 0; - } - if (wake) { - erts_smp_cnd_broadcast(&schdlr_sspnd.cnd); - wake = 0; - } + flgs = erts_smp_atomic_read(&ssi->flags); + if (!(flgs & ERTS_SSI_FLG_SUSPENDED)) + break; + erts_smp_mtx_unlock(&schdlr_sspnd.mtx); - if (!(rq->flags & (ERTS_RUNQ_FLG_SHARED_RUNQ|ERTS_RUNQ_FLG_SUSPENDED))) - break; - if ((rq->flags & ERTS_RUNQ_FLG_SHARED_RUNQ) - && !erts_smp_atomic_read(&esdp->suspended)) - break; +#ifdef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = erts_smp_atomic_read(&ssi->aux_work); + blockable_aux_work: + blockable_aux_work(esdp, ssi, aux_work); +#endif - erts_smp_activity_begin(ERTS_ACTIVITY_WAIT, - susp_sched_prep_block, - susp_sched_resume_block, - NULL); - while (1) { + erts_smp_activity_begin(ERTS_ACTIVITY_WAIT, NULL, NULL, NULL); + while (1) { + long flgs; +#ifdef ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK +#ifndef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = erts_smp_atomic_read(&ssi->aux_work); +#endif + nonblockable_aux_work(esdp, ssi, aux_work); +#endif -#ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN - if (esdp->blocked_check_children) - break; + flgs = sched_spin_suspended(ssi, ERTS_SCHED_SUSPEND_SLEEP_SPINCOUNT); + if (flgs == (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED)) { + flgs = sched_set_suspended_sleeptype(ssi); + if (flgs == (ERTS_SSI_FLG_SLEEPING + | ERTS_SSI_FLG_TSE_SLEEPING + | ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED)) { + int res; + do { + res = erts_tse_wait(ssi->event); + } while (res == EINTR); + } + } + + flgs = sched_prep_spin_suspended(ssi, (ERTS_SSI_FLG_WAITING + | ERTS_SSI_FLG_SUSPENDED)); + if (!(flgs & ERTS_SSI_FLG_SUSPENDED)) + break; + changing = erts_smp_atomic_read(&schdlr_sspnd.changing); + if (changing & ~ERTS_SCHDLR_SSPND_CHNG_WAITER) + break; + + +#ifdef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = erts_smp_atomic_read(&ssi->aux_work); + if (aux_work & ERTS_SSI_BLOCKABLE_AUX_WORK_MASK) { + erts_smp_activity_end(ERTS_ACTIVITY_WAIT, NULL, NULL, NULL); + goto blockable_aux_work; + } #endif - erts_smp_cnd_wait(&schdlr_sspnd.cnd, &schdlr_sspnd.mtx); + } - if (schdlr_sspnd.changing == ERTS_SCHED_CHANGING_ONLINE) - break; + erts_smp_activity_end(ERTS_ACTIVITY_WAIT, NULL, NULL, NULL); - if (!(rq->flags & (ERTS_RUNQ_FLG_SHARED_RUNQ - | ERTS_RUNQ_FLG_SUSPENDED))) - break; - if ((rq->flags & ERTS_RUNQ_FLG_SHARED_RUNQ) - && !erts_smp_atomic_read(&esdp->suspended)) - break; + erts_smp_mtx_lock(&schdlr_sspnd.mtx); + changing = erts_smp_atomic_read(&schdlr_sspnd.changing); } -#ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN - esdp->blocked_check_children = 0; -#endif + active_schedulers = erts_smp_atomic_inctest(&schdlr_sspnd.active); + changing = erts_smp_atomic_read(&schdlr_sspnd.changing); + if ((changing & ERTS_SCHDLR_SSPND_CHNG_MSB) + && schdlr_sspnd.online == active_schedulers) { + erts_smp_atomic_band(&schdlr_sspnd.changing, + ~ERTS_SCHDLR_SSPND_CHNG_MSB); + } - erts_smp_activity_end(ERTS_ACTIVITY_WAIT, - susp_sched_prep_block, - susp_sched_resume_block, - NULL); - } + ASSERT(no <= schdlr_sspnd.online); + ASSERT(!erts_smp_atomic_read(&schdlr_sspnd.msb.ongoing)); - active_schedulers = erts_smp_atomic_inctest(&schdlr_sspnd.active); - if (schdlr_sspnd.changing == ERTS_SCHED_CHANGING_MULTI_SCHED - && schdlr_sspnd.online == active_schedulers) { - schdlr_sspnd.changing = 0; } + erts_smp_mtx_unlock(&schdlr_sspnd.mtx); + ASSERT(curr_online); + if (erts_system_profile_flags.scheduler) profile_scheduler(make_small(esdp->no), am_active); + if (esdp->no <= erts_max_main_threads) + erts_thr_set_main_status(1, (int) esdp->no); + erts_smp_runq_lock(esdp->run_queue); non_empty_runq(esdp->run_queue); @@ -2622,8 +3088,10 @@ erts_schedulers_state(Uint *total, int yield_allowed) { int res; + long changing; erts_smp_mtx_lock(&schdlr_sspnd.mtx); - if (yield_allowed && schdlr_sspnd.changing) + changing = erts_smp_atomic_read(&schdlr_sspnd.changing); + if (yield_allowed && (changing & ~ERTS_SCHDLR_SSPND_CHNG_WAITER)) res = ERTS_SCHDLR_SSPND_YIELD_RESTART; else { *active = *online = schdlr_sspnd.online; @@ -2643,6 +3111,7 @@ erts_set_schedulers_online(Process *p, Sint *old_no) { int ix, res, no, have_unlocked_plocks; + long changing; if (new_no < 1 || erts_no_schedulers < new_no) return ERTS_SCHDLR_SSPND_EINVAL; @@ -2652,7 +3121,8 @@ erts_set_schedulers_online(Process *p, have_unlocked_plocks = 0; no = (int) new_no; - if (schdlr_sspnd.changing) { + changing = erts_smp_atomic_read(&schdlr_sspnd.changing); + if (changing) { res = ERTS_SCHDLR_SSPND_YIELD_RESTART; } else { @@ -2661,17 +3131,19 @@ erts_set_schedulers_online(Process *p, res = ERTS_SCHDLR_SSPND_DONE; } else { - schdlr_sspnd.changing = ERTS_SCHED_CHANGING_ONLINE; + ERTS_SCHDLR_SSPND_CHNG_SET((ERTS_SCHDLR_SSPND_CHNG_ONLN + | ERTS_SCHDLR_SSPND_CHNG_WAITER), 0); schdlr_sspnd.online = no; if (no > online) { int ix; schdlr_sspnd.wait_curr_online = no; - if (ongoing_multi_scheduling_block()) - /* No schedulers to resume */; + if (ongoing_multi_scheduling_block()) { + for (ix = online; ix < no; ix++) + erts_sched_poke(ERTS_SCHED_SLEEP_INFO_IX(ix)); + } else if (erts_common_run_queue) { for (ix = online; ix < no; ix++) - erts_smp_atomic_set(&ERTS_SCHEDULER_IX(ix)->suspended, - 0); + scheduler_ix_resume_wake(ix); } else { if (plocks) { @@ -2685,6 +3157,7 @@ erts_set_schedulers_online(Process *p, erts_smp_runq_lock(rq); ERTS_RUNQ_RESET_SUSPEND_INFO(rq, 0x5); erts_smp_runq_unlock(rq); + scheduler_ix_resume_wake(ix); } /* * Spread evacuation paths among all online @@ -2699,7 +3172,6 @@ erts_set_schedulers_online(Process *p, erts_smp_mtx_unlock(&balance_info.update_mtx); erts_smp_mtx_lock(&schdlr_sspnd.mtx); } - erts_smp_cnd_broadcast(&schdlr_sspnd.cnd); res = ERTS_SCHDLR_SSPND_DONE; } else /* if (no < online) */ { @@ -2716,12 +3188,17 @@ erts_set_schedulers_online(Process *p, schdlr_sspnd.wait_curr_online = no+1; } - if (ongoing_multi_scheduling_block()) - erts_smp_cnd_broadcast(&schdlr_sspnd.cnd); + if (ongoing_multi_scheduling_block()) { + for (ix = no; ix < online; ix++) + erts_sched_poke(ERTS_SCHED_SLEEP_INFO_IX(ix)); + } else if (erts_common_run_queue) { - for (ix = no; ix < online; ix++) - erts_smp_atomic_set(&ERTS_SCHEDULER_IX(ix)->suspended, - 1); + for (ix = no; ix < online; ix++) { + ErtsSchedulerSleepInfo *ssi; + ssi = ERTS_SCHED_SLEEP_INFO_IX(ix); + erts_smp_atomic_bor(&ssi->flags, + ERTS_SSI_FLG_SUSPENDED); + } wake_all_schedulers(); } else { @@ -2748,7 +3225,10 @@ erts_set_schedulers_online(Process *p, erts_smp_atomic_set(&balance_info.used_runqs, no); erts_smp_mtx_unlock(&balance_info.update_mtx); erts_smp_mtx_lock(&schdlr_sspnd.mtx); - ERTS_FOREACH_OP_RUNQ(rq, wake_scheduler(rq, 0)); + for (ix = no; ix < online; ix++) { + ErtsRunQueue *rq = ERTS_RUNQ_IX(ix); + wake_scheduler(rq, 0, 1); + } } } @@ -2762,6 +3242,12 @@ erts_set_schedulers_online(Process *p, susp_sched_prep_block, susp_sched_resume_block, NULL); + ASSERT(res != ERTS_SCHDLR_SSPND_DONE + ? (ERTS_SCHDLR_SSPND_CHNG_WAITER + & erts_smp_atomic_read(&schdlr_sspnd.changing)) + : (ERTS_SCHDLR_SSPND_CHNG_WAITER + == erts_smp_atomic_read(&schdlr_sspnd.changing))); + erts_smp_atomic_band(&schdlr_sspnd.changing, ~ERTS_SCHDLR_SSPND_CHNG_WAITER); } } @@ -2776,11 +3262,12 @@ ErtsSchedSuspendResult erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all) { int ix, res, have_unlocked_plocks = 0; + long changing; ErtsProcList *plp; erts_smp_mtx_lock(&schdlr_sspnd.mtx); - - if (schdlr_sspnd.changing) { + changing = erts_smp_atomic_read(&schdlr_sspnd.changing); + if (changing) { res = ERTS_SCHDLR_SSPND_YIELD_RESTART; /* Yield */ } else if (on) { /* ------ BLOCK ------ */ @@ -2794,19 +3281,22 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all) res = ERTS_SCHDLR_SSPND_DONE_MSCHED_BLOCKED; } else { + int online = schdlr_sspnd.online; p->flags |= F_HAVE_BLCKD_MSCHED; if (plocks) { have_unlocked_plocks = 1; erts_smp_proc_unlock(p, plocks); } + ASSERT(0 == erts_smp_atomic_read(&schdlr_sspnd.msb.ongoing)); erts_smp_atomic_set(&schdlr_sspnd.msb.ongoing, 1); - if (schdlr_sspnd.online == 1) { + if (online == 1) { res = ERTS_SCHDLR_SSPND_DONE_MSCHED_BLOCKED; ASSERT(erts_smp_atomic_read(&schdlr_sspnd.active) == 1); ASSERT(p->scheduler_data->no == 1); } else { - schdlr_sspnd.changing = ERTS_SCHED_CHANGING_MULTI_SCHED; + ERTS_SCHDLR_SSPND_CHNG_SET((ERTS_SCHDLR_SSPND_CHNG_MSB + | ERTS_SCHDLR_SSPND_CHNG_WAITER), 0); if (p->scheduler_data->no == 1) { res = ERTS_SCHDLR_SSPND_DONE_MSCHED_BLOCKED; schdlr_sspnd.msb.wait_active = 1; @@ -2820,17 +3310,19 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all) schdlr_sspnd.msb.wait_active = 2; } if (erts_common_run_queue) { - for (ix = 1; ix < schdlr_sspnd.online; ix++) - erts_smp_atomic_set(&ERTS_SCHEDULER_IX(ix)->suspended, 1); + for (ix = 1; ix < online; ix++) + erts_smp_atomic_bor(&ERTS_SCHED_SLEEP_INFO_IX(ix)->flags, + ERTS_SSI_FLG_SUSPENDED); wake_all_schedulers(); } else { erts_smp_mtx_unlock(&schdlr_sspnd.mtx); erts_smp_mtx_lock(&balance_info.update_mtx); erts_smp_atomic_set(&balance_info.used_runqs, 1); - for (ix = 0; ix < schdlr_sspnd.online; ix++) { + for (ix = 0; ix < online; ix++) { ErtsRunQueue *rq = ERTS_RUNQ_IX(ix); erts_smp_runq_lock(rq); + ASSERT(!(rq->flags & ERTS_RUNQ_FLG_SUSPENDED)); ERTS_RUNQ_RESET_MIGRATION_PATHS(rq, 0x7); erts_smp_runq_unlock(rq); } @@ -2855,6 +3347,13 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all) susp_sched_prep_block, susp_sched_resume_block, NULL); + ASSERT(res != ERTS_SCHDLR_SSPND_DONE_MSCHED_BLOCKED + ? (ERTS_SCHDLR_SSPND_CHNG_WAITER + & erts_smp_atomic_read(&schdlr_sspnd.changing)) + : (ERTS_SCHDLR_SSPND_CHNG_WAITER + == erts_smp_atomic_read(&schdlr_sspnd.changing))); + erts_smp_atomic_band(&schdlr_sspnd.changing, + ~ERTS_SCHDLR_SSPND_CHNG_WAITER); } plp = proclist_create(p); plp->next = schdlr_sspnd.msb.procs; @@ -2898,7 +3397,7 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all) if (schdlr_sspnd.msb.procs) res = ERTS_SCHDLR_SSPND_DONE_MSCHED_BLOCKED; else { - schdlr_sspnd.changing = ERTS_SCHED_CHANGING_MULTI_SCHED; + ERTS_SCHDLR_SSPND_CHNG_SET(ERTS_SCHDLR_SSPND_CHNG_MSB, 0); #ifdef DEBUG ERTS_FOREACH_RUNQ(rq, { @@ -2925,13 +3424,13 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all) if (schdlr_sspnd.online == 1) { /* No schedulers to resume */ ASSERT(erts_smp_atomic_read(&schdlr_sspnd.active) == 1); - schdlr_sspnd.changing = 0; + ERTS_SCHDLR_SSPND_CHNG_SET(0, ERTS_SCHDLR_SSPND_CHNG_MSB); } else if (erts_common_run_queue) { for (ix = 1; ix < schdlr_sspnd.online; ix++) - erts_smp_atomic_set(&ERTS_SCHEDULER_IX(ix)->suspended, 0); + erts_smp_atomic_band(&ERTS_SCHED_SLEEP_INFO_IX(ix)->flags, + ~ERTS_SSI_FLG_SUSPENDED); wake_all_schedulers(); - erts_smp_cnd_broadcast(&schdlr_sspnd.cnd); } else { int online = schdlr_sspnd.online; @@ -2948,6 +3447,7 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all) erts_smp_runq_lock(rq); ERTS_RUNQ_RESET_SUSPEND_INFO(rq, 0x4); erts_smp_runq_unlock(rq); + scheduler_ix_resume_wake(ix); } /* Spread evacuation paths among all online run queues */ @@ -2963,7 +3463,6 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all) erts_smp_runq_unlock(ERTS_RUNQ_IX(0)); erts_smp_mtx_unlock(&balance_info.update_mtx); erts_smp_mtx_lock(&schdlr_sspnd.mtx); - erts_smp_cnd_broadcast(&schdlr_sspnd.cnd); } res = ERTS_SCHDLR_SSPND_DONE; } @@ -3035,18 +3534,34 @@ erts_multi_scheduling_blockers(Process *p) static void * sched_thread_func(void *vesdp) { +#ifdef ERTS_SMP + Uint no = ((ErtsSchedulerData *) vesdp)->no; +#endif #ifdef ERTS_ENABLE_LOCK_CHECK { char buf[31]; - Uint no = ((ErtsSchedulerData *) vesdp)->no; erts_snprintf(&buf[0], 31, "scheduler %bpu", no); erts_lc_set_thread_name(&buf[0]); } #endif - erts_alloc_reg_scheduler_id(((ErtsSchedulerData *) vesdp)->no); + erts_alloc_reg_scheduler_id(no); erts_tsd_set(sched_data_key, vesdp); #ifdef ERTS_SMP + + if (no <= erts_max_main_threads) { + erts_thr_set_main_status(1, (int) no); + if (erts_reader_groups) { + int rg = (int) no; + if (rg > erts_reader_groups) + rg = (((int) no) - 1) % erts_reader_groups + 1; + erts_smp_rwmtx_set_reader_group(rg); + } + } + erts_proc_lock_prepare_proc_lock_waiter(); + ERTS_SCHED_SLEEP_INFO_IX(no - 1)->event = erts_tse_fetch(); + + #endif erts_register_blockable_thread(); #ifdef HIPE @@ -3055,30 +3570,30 @@ sched_thread_func(void *vesdp) erts_thread_init_float(); erts_smp_mtx_lock(&schdlr_sspnd.mtx); - ASSERT(schdlr_sspnd.changing == ERTS_SCHED_CHANGING_ONLINE); + ASSERT(erts_smp_atomic_read(&schdlr_sspnd.changing) + & ERTS_SCHDLR_SSPND_CHNG_ONLN); - schdlr_sspnd.curr_online--; + if (--schdlr_sspnd.curr_online == schdlr_sspnd.wait_curr_online) { + erts_smp_atomic_band(&schdlr_sspnd.changing, + ~ERTS_SCHDLR_SSPND_CHNG_ONLN); + if (((ErtsSchedulerData *) vesdp)->no != 1) + erts_smp_cnd_signal(&schdlr_sspnd.cnd); + } - if (((ErtsSchedulerData *) vesdp)->no != 1) { - if (schdlr_sspnd.online == schdlr_sspnd.curr_online) { - schdlr_sspnd.changing = 0; - erts_smp_cnd_broadcast(&schdlr_sspnd.cnd); + if (((ErtsSchedulerData *) vesdp)->no == 1) { + if (schdlr_sspnd.curr_online != schdlr_sspnd.wait_curr_online) { + erts_smp_activity_begin(ERTS_ACTIVITY_WAIT, + susp_sched_prep_block, + susp_sched_resume_block, + NULL); + while (schdlr_sspnd.curr_online != schdlr_sspnd.wait_curr_online) + erts_smp_cnd_wait(&schdlr_sspnd.cnd, &schdlr_sspnd.mtx); + erts_smp_activity_end(ERTS_ACTIVITY_WAIT, + susp_sched_prep_block, + susp_sched_resume_block, + NULL); } - } - else if (schdlr_sspnd.curr_online == schdlr_sspnd.wait_curr_online) - schdlr_sspnd.changing = 0; - else { - erts_smp_activity_begin(ERTS_ACTIVITY_WAIT, - susp_sched_prep_block, - susp_sched_resume_block, - NULL); - while (schdlr_sspnd.curr_online != schdlr_sspnd.wait_curr_online) - erts_smp_cnd_wait(&schdlr_sspnd.cnd, &schdlr_sspnd.mtx); - erts_smp_activity_end(ERTS_ACTIVITY_WAIT, - susp_sched_prep_block, - susp_sched_resume_block, - NULL); - ASSERT(!schdlr_sspnd.changing); + ERTS_SCHDLR_SSPND_CHNG_SET(0, ERTS_SCHDLR_SSPND_CHNG_WAITER); } erts_smp_mtx_unlock(&schdlr_sspnd.mtx); @@ -3428,6 +3943,7 @@ processor_order_cmp(const void *vx, const void *vy) static void check_cpu_bind(ErtsSchedulerData *esdp) { + int rg = 0; int res; int cpu_id; erts_smp_runq_unlock(esdp->run_queue); @@ -3459,6 +3975,12 @@ check_cpu_bind(ErtsSchedulerData *esdp) erts_send_error_to_logger_nogl(dsbufp); } } + if (erts_reader_groups) { + if (esdp->cpu_id >= 0) + rg = reader_group_lookup(esdp->cpu_id); + else + rg = (((int) esdp->no) - 1) % erts_reader_groups + 1; + } erts_smp_runq_lock(esdp->run_queue); #ifdef ERTS_SMP if (erts_common_run_queue) @@ -3469,6 +3991,8 @@ check_cpu_bind(ErtsSchedulerData *esdp) #endif erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); + if (erts_reader_groups) + erts_smp_rwmtx_set_reader_group(rg); } static void @@ -3497,11 +4021,13 @@ signal_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size) wake_all_schedulers(); } else { - ERTS_FOREACH_RUNQ(rq, - { + for (s_ix = 0; s_ix < erts_no_run_queues; s_ix++) { + ErtsRunQueue *rq = ERTS_RUNQ_IX(s_ix); + erts_smp_runq_lock(rq); rq->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; - wake_scheduler(rq, 0); - }); + erts_smp_runq_unlock(rq); + wake_scheduler(rq, 0, 1); + }; } #else check_cpu_bind(erts_get_scheduler_data()); @@ -3541,6 +4067,490 @@ erts_init_scheduler_bind_type(char *how) return ERTS_INIT_SCHED_BIND_TYPE_SUCCESS; } +/* + * reader groups map + */ + +typedef struct { + int level[ERTS_TOPOLOGY_MAX_DEPTH+1]; +} erts_avail_cput; + +typedef struct { + int *map; + int size; + int groups; +} erts_reader_groups_map_test; + +typedef struct { + int id; + int sub_levels; + int reader_groups; +} erts_rg_count_t; + +typedef struct { + int logical; + int reader_group; +} erts_reader_groups_map_t; + +typedef struct { + erts_reader_groups_map_t *map; + int map_size; + int logical_processors; + int groups; +} erts_make_reader_groups_map_test; + +static int reader_groups_available_cpu_check; +static int reader_groups_logical_processors; +static int reader_groups_map_size; +static erts_reader_groups_map_t *reader_groups_map; + +#define ERTS_TOPOLOGY_RG ERTS_TOPOLOGY_MAX_DEPTH + +static void +make_reader_groups_map(erts_make_reader_groups_map_test *test); + +static Eterm +get_reader_groups_map(Process *c_p, + erts_reader_groups_map_t *map, + int map_size, + int logical_processors) +{ +#ifdef DEBUG + Eterm *endp; +#endif + Eterm res = NIL, tuple; + Eterm *hp; + int i; + + hp = HAlloc(c_p, logical_processors*(2+3)); +#ifdef DEBUG + endp = hp + logical_processors*(2+3); +#endif + for (i = map_size - 1; i >= 0; i--) { + if (map[i].logical >= 0) { + tuple = TUPLE2(hp, + make_small(map[i].logical), + make_small(map[i].reader_group)); + hp += 3; + res = CONS(hp, tuple, res); + hp += 2; + } + } + ASSERT(hp == endp); + return res; +} + +Eterm +erts_debug_reader_groups_map(Process *c_p, int groups) +{ + Eterm res; + erts_make_reader_groups_map_test test; + + test.groups = groups; + make_reader_groups_map(&test); + if (!test.map) + res = NIL; + else { + res = get_reader_groups_map(c_p, + test.map, + test.map_size, + test.logical_processors); + erts_free(ERTS_ALC_T_TMP, test.map); + } + return res; +} + + +Eterm +erts_get_reader_groups_map(Process *c_p) +{ + Eterm res; + erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); + res = get_reader_groups_map(c_p, + reader_groups_map, + reader_groups_map_size, + reader_groups_logical_processors); + erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + return res; +} + +static void +make_available_cpu_topology(erts_avail_cput *no, + erts_avail_cput *avail, + erts_cpu_topology_t *cpudata, + int *size, + int test) +{ + int len = *size; + erts_cpu_topology_t last; + int a, i, j; + + no->level[ERTS_TOPOLOGY_NODE] = -1; + no->level[ERTS_TOPOLOGY_PROCESSOR] = -1; + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE] = -1; + no->level[ERTS_TOPOLOGY_CORE] = -1; + no->level[ERTS_TOPOLOGY_THREAD] = -1; + no->level[ERTS_TOPOLOGY_LOGICAL] = -1; + + last.node = INT_MIN; + last.processor = INT_MIN; + last.processor_node = INT_MIN; + last.core = INT_MIN; + last.thread = INT_MIN; + last.logical = INT_MIN; + + a = 0; + + for (i = 0; i < len; i++) { + + if (!test && !erts_is_cpu_available(erts_cpuinfo, cpudata[i].logical)) + continue; + + if (last.node != cpudata[i].node) + goto node; + if (last.processor != cpudata[i].processor) + goto processor; + if (last.processor_node != cpudata[i].processor_node) + goto processor_node; + if (last.core != cpudata[i].core) + goto core; + ASSERT(last.thread != cpudata[i].thread); + goto thread; + + node: + no->level[ERTS_TOPOLOGY_NODE]++; + processor: + no->level[ERTS_TOPOLOGY_PROCESSOR]++; + processor_node: + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; + core: + no->level[ERTS_TOPOLOGY_CORE]++; + thread: + no->level[ERTS_TOPOLOGY_THREAD]++; + + no->level[ERTS_TOPOLOGY_LOGICAL]++; + + for (j = 0; j < ERTS_TOPOLOGY_LOGICAL; j++) + avail[a].level[j] = no->level[j]; + + avail[a].level[ERTS_TOPOLOGY_LOGICAL] = cpudata[i].logical; + avail[a].level[ERTS_TOPOLOGY_RG] = 0; + + ASSERT(last.logical != cpudata[a].logical); + + last = cpudata[i]; + a++; + } + + no->level[ERTS_TOPOLOGY_NODE]++; + no->level[ERTS_TOPOLOGY_PROCESSOR]++; + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; + no->level[ERTS_TOPOLOGY_CORE]++; + no->level[ERTS_TOPOLOGY_THREAD]++; + no->level[ERTS_TOPOLOGY_LOGICAL]++; + + *size = a; +} + +static int +reader_group_lookup(int logical) +{ + int start = logical % reader_groups_map_size; + int ix = start; + + do { + if (reader_groups_map[ix].logical == logical) { + ASSERT(reader_groups_map[ix].reader_group > 0); + return reader_groups_map[ix].reader_group; + } + ix++; + if (ix == reader_groups_map_size) + ix = 0; + } while (ix != start); + + erl_exit(ERTS_ABORT_EXIT, "Logical cpu id %d not found\n", logical); +} + +static void +reader_group_insert(erts_reader_groups_map_t *map, int map_size, + int logical, int reader_group) +{ + int start = logical % map_size; + int ix = start; + + do { + if (map[ix].logical < 0) { + map[ix].logical = logical; + map[ix].reader_group = reader_group; + return; + } + ix++; + if (ix == map_size) + ix = 0; + } while (ix != start); + + erl_exit(ERTS_ABORT_EXIT, "Reader groups map full\n"); +} + + +static int +sub_levels(erts_rg_count_t *rgc, int level, int aix, int avail_sz, erts_avail_cput *avail) +{ + int sub_level = level+1; + int last = -1; + rgc->sub_levels = 0; + + do { + if (last != avail[aix].level[sub_level]) { + rgc->sub_levels++; + last = avail[aix].level[sub_level]; + } + aix++; + } + while (aix < avail_sz && rgc->id == avail[aix].level[level]); + rgc->reader_groups = 0; + return aix; +} + +static int +write_reader_groups(int *rgp, erts_rg_count_t *rgcp, + int level, int a, + int avail_sz, erts_avail_cput *avail) +{ + int rg = *rgp; + int sub_level = level+1; + int sl_per_gr = rgcp->sub_levels / rgcp->reader_groups; + int xsl = rgcp->sub_levels % rgcp->reader_groups; + int sls = 0; + int last = -1; + int xsl_rg_lim = (rgcp->reader_groups - xsl) + rg + 1; + + ASSERT(level < 0 || avail[a].level[level] == rgcp->id) + + do { + if (last != avail[a].level[sub_level]) { + if (!sls) { + sls = sl_per_gr; + rg++; + if (rg >= xsl_rg_lim) + sls++; + } + last = avail[a].level[sub_level]; + sls--; + } + avail[a].level[ERTS_TOPOLOGY_RG] = rg; + a++; + } while (a < avail_sz && (level < 0 + || avail[a].level[level] == rgcp->id)); + + ASSERT(rgcp->reader_groups == rg - *rgp); + + *rgp = rg; + + return a; +} + +static int +rg_count_sub_levels_compare(const void *vx, const void *vy) +{ + erts_rg_count_t *x = (erts_rg_count_t *) vx; + erts_rg_count_t *y = (erts_rg_count_t *) vy; + if (x->sub_levels != y->sub_levels) + return y->sub_levels - x->sub_levels; + return x->id - y->id; +} + +static int +rg_count_id_compare(const void *vx, const void *vy) +{ + erts_rg_count_t *x = (erts_rg_count_t *) vx; + erts_rg_count_t *y = (erts_rg_count_t *) vy; + return x->id - y->id; +} + +static void +make_reader_groups_map(erts_make_reader_groups_map_test *test) +{ + int i, spread_level, avail_sz; + erts_avail_cput no, *avail; + erts_cpu_topology_t *cpudata; + erts_reader_groups_map_t *map; + int map_sz; + int groups = erts_reader_groups; + + if (test) { + test->map = NULL; + test->map_size = 0; + groups = test->groups; + } + + if (!groups) + return; + + if (!test) { + if (reader_groups_map) + erts_free(ERTS_ALC_T_RDR_GRPS_MAP, reader_groups_map); + + reader_groups_logical_processors = 0; + reader_groups_map_size = 0; + reader_groups_map = NULL; + } + + create_tmp_cpu_topology_copy(&cpudata, &avail_sz); + + if (!cpudata) + return; + + cpu_bind_order_sort(cpudata, + avail_sz, + ERTS_CPU_BIND_NO_SPREAD, + 1); + + avail = erts_alloc(ERTS_ALC_T_TMP, + sizeof(erts_avail_cput)*avail_sz); + + make_available_cpu_topology(&no, avail, cpudata, + &avail_sz, test != NULL); + + destroy_tmp_cpu_topology_copy(cpudata); + + map_sz = avail_sz*2+1; + + if (test) { + map = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_reader_groups_map_t) + * map_sz)); + test->map = map; + test->map_size = map_sz; + test->logical_processors = avail_sz; + } + else { + map = erts_alloc(ERTS_ALC_T_RDR_GRPS_MAP, + (sizeof(erts_reader_groups_map_t) + * map_sz)); + reader_groups_map = map; + reader_groups_logical_processors = avail_sz; + reader_groups_map_size = map_sz; + + } + + for (i = 0; i < map_sz; i++) { + map[i].logical = -1; + map[i].reader_group = 0; + } + + spread_level = ERTS_TOPOLOGY_CORE; + for (i = ERTS_TOPOLOGY_NODE; i < ERTS_TOPOLOGY_THREAD; i++) { + if (no.level[i] > groups) { + spread_level = i; + break; + } + } + + if (no.level[spread_level] <= groups) { + int a, rg, last = -1; + rg = 0; + ASSERT(spread_level == ERTS_TOPOLOGY_CORE); + for (a = 0; a < avail_sz; a++) { + if (last != avail[a].level[spread_level]) { + rg++; + last = avail[a].level[spread_level]; + } + reader_group_insert(map, + map_sz, + avail[a].level[ERTS_TOPOLOGY_LOGICAL], + rg); + } + } + else { /* groups < no.level[spread_level] */ + erts_rg_count_t *rg_count; + int a, rg, tl, toplevels; + + tl = spread_level-1; + + if (spread_level == ERTS_TOPOLOGY_NODE) + toplevels = 1; + else + toplevels = no.level[tl]; + + rg_count = erts_alloc(ERTS_ALC_T_TMP, + toplevels*sizeof(erts_rg_count_t)); + + if (toplevels == 1) { + rg_count[0].id = 0; + rg_count[0].sub_levels = no.level[spread_level]; + rg_count[0].reader_groups = groups; + } + else { + int rgs_per_tl, rgs; + rgs = groups; + rgs_per_tl = rgs / toplevels; + + a = 0; + for (i = 0; i < toplevels; i++) { + rg_count[i].id = avail[a].level[tl]; + a = sub_levels(&rg_count[i], tl, a, avail_sz, avail); + } + + qsort(rg_count, + toplevels, + sizeof(erts_rg_count_t), + rg_count_sub_levels_compare); + + for (i = 0; i < toplevels; i++) { + if (rg_count[i].sub_levels < rgs_per_tl) { + rg_count[i].reader_groups = rg_count[i].sub_levels; + rgs -= rg_count[i].sub_levels; + } + else { + rg_count[i].reader_groups = rgs_per_tl; + rgs -= rgs_per_tl; + } + } + + while (rgs > 0) { + for (i = 0; i < toplevels; i++) { + if (rg_count[i].sub_levels == rg_count[i].reader_groups) + break; + else { + rg_count[i].reader_groups++; + if (--rgs == 0) + break; + } + } + } + + qsort(rg_count, + toplevels, + sizeof(erts_rg_count_t), + rg_count_id_compare); + } + + a = i = rg = 0; + while (a < avail_sz) { + a = write_reader_groups(&rg, &rg_count[i], tl, + a, avail_sz, avail); + i++; + } + + ASSERT(groups == rg); + + for (a = 0; a < avail_sz; a++) + reader_group_insert(map, + map_sz, + avail[a].level[ERTS_TOPOLOGY_LOGICAL], + avail[a].level[ERTS_TOPOLOGY_RG]); + + erts_free(ERTS_ALC_T_TMP, rg_count); + } + + erts_free(ERTS_ALC_T_TMP, avail); +} + +/* + * CPU topology + */ + typedef struct { int *id; int used; @@ -4054,6 +5064,8 @@ erts_set_cpu_topology(Process *c_p, Eterm term) sizeof(erts_cpu_topology_t)*cpudata_size); } + make_reader_groups_map(NULL); + signal_schedulers_bind_change(cpudata, cpudata_size); done: @@ -4457,6 +5469,11 @@ early_cpu_bind_init(void) cpu_bind_order = ERTS_CPU_BIND_UNDEFINED; + reader_groups_available_cpu_check = 1; + reader_groups_logical_processors = 0; + reader_groups_map_size = 0; + reader_groups_map = NULL; + if (!erts_get_cpu_topology(erts_cpuinfo, system_cpudata) || ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(system_cpudata, system_cpudata_size)) { @@ -4492,6 +5509,8 @@ late_cpu_bind_init(void) : ERTS_CPU_BIND_NONE); } + make_reader_groups_map(NULL); + if (cpu_bind_order != ERTS_CPU_BIND_NONE) { erts_cpu_topology_t *cpudata; int cpudata_size; @@ -5357,7 +6376,7 @@ dequeue_process(ErtsRunQueue *runq, Process *p) } /* schedule a process */ -static ERTS_INLINE void +static ERTS_INLINE ErtsRunQueue * internal_add_to_runq(ErtsRunQueue *runq, Process *p) { Uint32 prev_status = p->status; @@ -5368,12 +6387,12 @@ internal_add_to_runq(ErtsRunQueue *runq, Process *p) ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(runq)); if (p->status_flags & ERTS_PROC_SFLG_INRUNQ) - return; + return NULL; else if (p->runq_flags & ERTS_PROC_RUNQ_FLG_RUNNING) { ASSERT(p->status != P_SUSPENDED); ERTS_DBG_CHK_PROCS_RUNQ_NOPROC(runq, p); p->status_flags |= ERTS_PROC_SFLG_PENDADD2SCHEDQ; - return; + return NULL; } ASSERT(!p->scheduler_data); #endif @@ -5412,20 +6431,23 @@ internal_add_to_runq(ErtsRunQueue *runq, Process *p) profile_runnable_proc(p, am_active); } - smp_notify_inc_runq(add_runq); - if (add_runq != runq) erts_smp_runq_unlock(add_runq); + + return add_runq; } void erts_add_to_runq(Process *p) { + ErtsRunQueue *notify_runq; ErtsRunQueue *runq = erts_get_runq_proc(p); erts_smp_runq_lock(runq); - internal_add_to_runq(runq, p); + notify_runq = internal_add_to_runq(runq, p); erts_smp_runq_unlock(runq); + smp_notify_inc_runq(notify_runq); + } /* Possibly remove a scheduled process we need to suspend */ @@ -5564,8 +6586,6 @@ erts_proc_migrate(Process *p, ErtsProcLocks *plcks, p->run_queue = to_rq; enqueue_process(to_rq, p); - smp_notify_inc_runq(to_rq); - return ERTS_MIGRATE_SUCCESS; } #endif /* ERTS_SMP */ @@ -5762,30 +6782,6 @@ erts_set_process_priority(Process *p, Eterm new_value) return old_value; } -#ifdef ERTS_SMP - -static ERTS_INLINE int -prepare_for_sys_schedule(void) -{ - while (!erts_port_task_have_outstanding_io_tasks() - && !erts_smp_atomic_xchg(&doing_sys_schedule, 1)) { - if (!erts_port_task_have_outstanding_io_tasks()) - return 1; - erts_smp_atomic_set(&doing_sys_schedule, 0); - } - return 0; -} - -#else - -static ERTS_INLINE int -prepare_for_sys_schedule(void) -{ - return !erts_port_task_have_outstanding_io_tasks(); -} - -#endif - /* note that P_RUNNING is only set so that we don't try to remove ** running processes from the schedule queue if they exit - a running ** process not being in the schedule queue!! @@ -5920,8 +6916,11 @@ Process *schedule(Process *p, int calls) p->status_flags &= ~ERTS_PROC_SFLG_RUNNING; if (p->status_flags & ERTS_PROC_SFLG_PENDADD2SCHEDQ) { + ErtsRunQueue *notify_runq; p->status_flags &= ~ERTS_PROC_SFLG_PENDADD2SCHEDQ; - internal_add_to_runq(rq, p); + notify_runq = internal_add_to_runq(rq, p); + if (notify_runq != rq) + smp_notify_inc_runq(notify_runq); } #endif @@ -5989,7 +6988,10 @@ Process *schedule(Process *p, int calls) | ERTS_RUNQ_FLG_CHK_CPU_BIND | ERTS_RUNQ_FLG_SUSPENDED)) { if ((rq->flags & ERTS_RUNQ_FLG_SUSPENDED) - || erts_smp_atomic_read(&esdp->suspended)) { + || (erts_smp_atomic_read(&esdp->ssi->flags) + & ERTS_SSI_FLG_SUSPENDED)) { + ASSERT(erts_smp_atomic_read(&esdp->ssi->flags) + & ERTS_SSI_FLG_SUSPENDED); suspend_scheduler(esdp); } if ((rq->flags & ERTS_RUNQ_FLG_CHK_CPU_BIND) @@ -5998,12 +7000,21 @@ Process *schedule(Process *p, int calls) } } -#ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN - if (esdp->check_children) { - esdp->check_children = 0; - erts_smp_runq_unlock(rq); - erts_check_children(); - erts_smp_runq_lock(rq); +#if defined(ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK) \ + || defined(ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK) + { + ErtsSchedulerSleepInfo *ssi = esdp->ssi; + long aux_work = erts_smp_atomic_read(&ssi->aux_work); + if (aux_work) { + erts_smp_runq_unlock(rq); +#ifdef ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK + aux_work = blockable_aux_work(esdp, ssi, aux_work); +#endif +#ifdef ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK + nonblockable_aux_work(esdp, ssi, aux_work); +#endif + erts_smp_runq_lock(rq); + } } #endif @@ -6035,7 +7046,10 @@ Process *schedule(Process *p, int calls) if (rq->flags & (ERTS_RUNQ_FLG_SHARED_RUNQ | ERTS_RUNQ_FLG_SUSPENDED)) { if ((rq->flags & ERTS_RUNQ_FLG_SUSPENDED) - || erts_smp_atomic_read(&esdp->suspended)) { + || (erts_smp_atomic_read(&esdp->ssi->flags) + & ERTS_SSI_FLG_SUSPENDED)) { + ASSERT(erts_smp_atomic_read(&esdp->ssi->flags) + & ERTS_SSI_FLG_SUSPENDED); non_empty_runq(rq); goto continue_check_activities_to_run; } @@ -6052,17 +7066,7 @@ Process *schedule(Process *p, int calls) } } - if (prepare_for_sys_schedule()) { - erts_smp_atomic_set(&function_calls, 0); - fcalls = 0; - sched_sys_wait(esdp->no, rq); - erts_smp_atomic_set(&doing_sys_schedule, 0); - } - else { - /* If all schedulers are waiting, one of them *should* - be waiting in erl_sys_schedule() */ - sched_cnd_wait(esdp->no, rq); - } + scheduler_wait(&fcalls, esdp, rq); non_empty_runq(rq); @@ -6124,7 +7128,7 @@ Process *schedule(Process *p, int calls) else { if (erts_common_run_queue) { if (erts_common_run_queue->waiting) - wake_one_scheduler(); + wake_scheduler(erts_common_run_queue, 0, 1); } else if (erts_smp_atomic_read(&no_empty_run_queues) != 0) { wake_scheduler_on_empty_runq(rq); @@ -6439,8 +7443,8 @@ erts_schedule_misc_op(void (*func)(void *), void *arg) else rq->misc.start = molp; rq->misc.end = molp; - smp_notify_inc_runq(rq); erts_smp_runq_unlock(rq); + smp_notify_inc_runq(rq); } static void @@ -6682,7 +7686,7 @@ erl_create_process(Process* parent, /* Parent of process (default group leader). Eterm args, /* Arguments for function (must be well-formed list). */ ErlSpawnOpts* so) /* Options for spawn. */ { - ErtsRunQueue *rq; + ErtsRunQueue *rq, *notify_runq; Process *p; Sint arity; /* Number of arguments. */ #ifndef HYBRID @@ -6999,10 +8003,12 @@ erl_create_process(Process* parent, /* Parent of process (default group leader). #endif p->status = P_WAITING; - internal_add_to_runq(rq, p); + notify_runq = internal_add_to_runq(rq, p); erts_smp_runq_unlock(rq); + smp_notify_inc_runq(notify_runq); + res = p->id; erts_smp_proc_unlock(p, ERTS_PROC_LOCKS_ALL); -- cgit v1.2.3