diff options
Diffstat (limited to 'erts/emulator/beam/erl_cpu_topology.c')
-rw-r--r-- | erts/emulator/beam/erl_cpu_topology.c | 2162 |
1 files changed, 2162 insertions, 0 deletions
diff --git a/erts/emulator/beam/erl_cpu_topology.c b/erts/emulator/beam/erl_cpu_topology.c new file mode 100644 index 0000000000..befab6f3b7 --- /dev/null +++ b/erts/emulator/beam/erl_cpu_topology.c @@ -0,0 +1,2162 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2010. All Rights Reserved. + * + * The contents of this file are subject to the Erlang Public License, + * Version 1.1, (the "License"); you may not use this file except in + * compliance with the License. You should have received a copy of the + * Erlang Public License along with this software. If not, it can be + * retrieved online at http://www.erlang.org/. + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * %CopyrightEnd% + */ + +/* + * Description: CPU topology and related functionality + * + * Author: Rickard Green + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <ctype.h> + +#include "global.h" +#include "error.h" +#include "bif.h" +#include "erl_cpu_topology.h" + +#define ERTS_MAX_READER_GROUPS 8 + +/* + * Cpu topology hierarchy. + */ +#define ERTS_TOPOLOGY_NODE 0 +#define ERTS_TOPOLOGY_PROCESSOR 1 +#define ERTS_TOPOLOGY_PROCESSOR_NODE 2 +#define ERTS_TOPOLOGY_CORE 3 +#define ERTS_TOPOLOGY_THREAD 4 +#define ERTS_TOPOLOGY_LOGICAL 5 + +#define ERTS_TOPOLOGY_MAX_DEPTH 6 + +typedef struct { + int bind_id; + int bound_id; +} ErtsCpuBindData; + +static erts_cpu_info_t *cpuinfo; + +static int max_main_threads; +static int reader_groups; + +static ErtsCpuBindData *scheduler2cpu_map; +static erts_smp_rwmtx_t cpuinfo_rwmtx; + +typedef enum { + ERTS_CPU_BIND_UNDEFINED, + ERTS_CPU_BIND_SPREAD, + ERTS_CPU_BIND_PROCESSOR_SPREAD, + ERTS_CPU_BIND_THREAD_SPREAD, + ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD, + ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD, + ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD, + ERTS_CPU_BIND_NO_SPREAD, + ERTS_CPU_BIND_NONE +} ErtsCpuBindOrder; + +#define ERTS_CPU_BIND_DEFAULT_BIND \ + ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD + +ErtsCpuBindOrder cpu_bind_order; + +static erts_cpu_topology_t *user_cpudata; +static int user_cpudata_size; +static erts_cpu_topology_t *system_cpudata; +static int system_cpudata_size; + +typedef struct { + int level[ERTS_TOPOLOGY_MAX_DEPTH+1]; +} erts_avail_cput; + +typedef struct { + int *map; + int size; + int groups; +} erts_reader_groups_map_test; + +typedef struct { + int id; + int sub_levels; + int reader_groups; +} erts_rg_count_t; + +typedef struct { + int logical; + int reader_group; +} erts_reader_groups_map_t; + +typedef struct { + erts_reader_groups_map_t *map; + int map_size; + int logical_processors; + int groups; +} erts_make_reader_groups_map_test; + +static int reader_groups_available_cpu_check; +static int reader_groups_logical_processors; +static int reader_groups_map_size; +static erts_reader_groups_map_t *reader_groups_map; + +#define ERTS_TOPOLOGY_RG ERTS_TOPOLOGY_MAX_DEPTH + +#define ERTS_MAX_CPU_TOPOLOGY_ID ((int) 0xffff) + +#ifdef ERTS_SMP +static void cpu_bind_order_sort(erts_cpu_topology_t *cpudata, + int size, + ErtsCpuBindOrder bind_order, + int mk_seq); +static void write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size); +#endif + +static void make_reader_groups_map(erts_make_reader_groups_map_test *test); +static int reader_group_lookup(int logical); + +static void create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, + int *cpudata_size); +static void destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata); + +static int +int_cmp(const void *vx, const void *vy) +{ + return *((int *) vx) - *((int *) vy); +} + +static int +cpu_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->core != y->core) + return x->core - y->core; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->node != y->node) + return x->node - y->node; + return 0; +} + +static int +cpu_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_thread_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + return 0; +} + +static int +cpu_thread_no_node_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->node != y->node) + return x->node - y->node; + if (x->core != y->core) + return x->core - y->core; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_no_node_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->core != y->core) + return x->core - y->core; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_no_node_thread_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->core != y->core) + return x->core - y->core; + return 0; +} + +static int +cpu_no_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->thread != y->thread) + return x->thread - y->thread; + return 0; +} + +static ERTS_INLINE void +make_cpudata_id_seq(erts_cpu_topology_t *cpudata, int size, int no_node) +{ + int ix; + int node = -1; + int processor = -1; + int processor_node = -1; + int processor_node_node = -1; + int core = -1; + int thread = -1; + int old_node = -1; + int old_processor = -1; + int old_processor_node = -1; + int old_core = -1; + int old_thread = -1; + + for (ix = 0; ix < size; ix++) { + if (!no_node || cpudata[ix].node >= 0) { + if (old_node == cpudata[ix].node) + cpudata[ix].node = node; + else { + old_node = cpudata[ix].node; + old_processor = processor = -1; + if (!no_node) + old_processor_node = processor_node = -1; + old_core = core = -1; + old_thread = thread = -1; + if (no_node || cpudata[ix].node >= 0) + cpudata[ix].node = ++node; + } + } + if (old_processor == cpudata[ix].processor) + cpudata[ix].processor = processor; + else { + old_processor = cpudata[ix].processor; + if (!no_node) + processor_node_node = old_processor_node = processor_node = -1; + old_core = core = -1; + old_thread = thread = -1; + cpudata[ix].processor = ++processor; + } + if (no_node && cpudata[ix].processor_node < 0) + old_processor_node = -1; + else { + if (old_processor_node == cpudata[ix].processor_node) { + if (no_node) + cpudata[ix].node = cpudata[ix].processor_node = node; + else { + if (processor_node_node >= 0) + cpudata[ix].node = processor_node_node; + cpudata[ix].processor_node = processor_node; + } + } + else { + old_processor_node = cpudata[ix].processor_node; + old_core = core = -1; + old_thread = thread = -1; + if (no_node) + cpudata[ix].node = cpudata[ix].processor_node = ++node; + else { + cpudata[ix].node = processor_node_node = ++node; + cpudata[ix].processor_node = ++processor_node; + } + } + } + if (!no_node && cpudata[ix].processor_node < 0) + cpudata[ix].processor_node = 0; + if (old_core == cpudata[ix].core) + cpudata[ix].core = core; + else { + old_core = cpudata[ix].core; + old_thread = thread = -1; + cpudata[ix].core = ++core; + } + if (old_thread == cpudata[ix].thread) + cpudata[ix].thread = thread; + else + old_thread = cpudata[ix].thread = ++thread; + } +} + +static void +cpu_bind_order_sort(erts_cpu_topology_t *cpudata, + int size, + ErtsCpuBindOrder bind_order, + int mk_seq) +{ + if (size > 1) { + int no_node = 0; + int (*cmp_func)(const void *, const void *); + switch (bind_order) { + case ERTS_CPU_BIND_SPREAD: + cmp_func = cpu_spread_order_cmp; + break; + case ERTS_CPU_BIND_PROCESSOR_SPREAD: + cmp_func = cpu_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_THREAD_SPREAD: + cmp_func = cpu_thread_spread_order_cmp; + break; + case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: + no_node = 1; + cmp_func = cpu_thread_no_node_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: + no_node = 1; + cmp_func = cpu_no_node_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: + no_node = 1; + cmp_func = cpu_no_node_thread_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_SPREAD: + cmp_func = cpu_no_spread_order_cmp; + break; + default: + cmp_func = NULL; + erl_exit(ERTS_ABORT_EXIT, + "Bad cpu bind type: %d\n", + (int) cpu_bind_order); + break; + } + + if (mk_seq) + make_cpudata_id_seq(cpudata, size, no_node); + + qsort(cpudata, size, sizeof(erts_cpu_topology_t), cmp_func); + } +} + +static int +processor_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->node != y->node) + return x->node - y->node; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->thread != y->thread) + return x->thread - y->thread; + return 0; +} + +#ifdef ERTS_SMP +void +erts_sched_check_cpu_bind_prep_suspend(ErtsSchedulerData *esdp) +{ + int reset_read_group = 0; + ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(esdp->run_queue)); + /* Unbind from cpu */ + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + if (scheduler2cpu_map[esdp->no].bound_id >= 0 + && erts_unbind_from_cpu(cpuinfo) == 0) { + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; + reset_read_group = 1; + } + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + + if (reset_read_group) + erts_smp_rwmtx_set_reader_group(0); + + if (esdp->no <= max_main_threads) + erts_thr_set_main_status(0, 0); +} + +void +erts_sched_check_cpu_bind_post_suspend(ErtsSchedulerData *esdp) +{ + ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(esdp->run_queue)); + if (esdp->no <= max_main_threads) + erts_thr_set_main_status(1, (int) esdp->no); + + /* Make sure we check if we should bind to a cpu or not... */ + if (esdp->run_queue->flags & ERTS_RUNQ_FLG_SHARED_RUNQ) + erts_smp_atomic_set(&esdp->chk_cpu_bind, 1); + else + esdp->run_queue->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; +} + +#endif + +void +erts_sched_check_cpu_bind(ErtsSchedulerData *esdp) +{ + int rg = 0; + int res; + int cpu_id; + erts_smp_runq_unlock(esdp->run_queue); + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + cpu_id = scheduler2cpu_map[esdp->no].bind_id; + if (cpu_id >= 0 && cpu_id != scheduler2cpu_map[esdp->no].bound_id) { + res = erts_bind_to_cpu(cpuinfo, cpu_id); + if (res == 0) + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = cpu_id; + else { + erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); + erts_dsprintf(dsbufp, "Scheduler %d failed to bind to cpu %d: %s\n", + (int) esdp->no, cpu_id, erl_errno_id(-res)); + erts_send_error_to_logger_nogl(dsbufp); + if (scheduler2cpu_map[esdp->no].bound_id >= 0) + goto unbind; + } + } + else if (cpu_id < 0) { + unbind: + /* Get rid of old binding */ + res = erts_unbind_from_cpu(cpuinfo); + if (res == 0) + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; + else if (res != -ENOTSUP) { + erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); + erts_dsprintf(dsbufp, "Scheduler %d failed to unbind from cpu %d: %s\n", + (int) esdp->no, cpu_id, erl_errno_id(-res)); + erts_send_error_to_logger_nogl(dsbufp); + } + } + if (reader_groups) { + if (esdp->cpu_id >= 0) + rg = reader_group_lookup(esdp->cpu_id); + else + rg = (((int) esdp->no) - 1) % reader_groups + 1; + } + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + erts_smp_runq_lock(esdp->run_queue); +#ifdef ERTS_SMP + if (erts_common_run_queue) + erts_smp_atomic_set(&esdp->chk_cpu_bind, 0); + else { + esdp->run_queue->flags &= ~ERTS_RUNQ_FLG_CHK_CPU_BIND; + } +#endif + if (reader_groups) + erts_smp_rwmtx_set_reader_group(rg); +} + +#ifdef ERTS_SMP +void +erts_sched_init_check_cpu_bind(ErtsSchedulerData *esdp) +{ + int no = (int) esdp->no; + if (no <= max_main_threads) { + erts_thr_set_main_status(1, (int) no); + if (reader_groups) { + int rg = (int) no; + if (rg > reader_groups) + rg = (((int) no) - 1) % reader_groups + 1; + erts_smp_rwmtx_set_reader_group(rg); + } + } +} +#endif + +static void +write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size) +{ + int s_ix = 1; + int cpu_ix; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + if (cpu_bind_order != ERTS_CPU_BIND_NONE && size) { + + cpu_bind_order_sort(cpudata, size, cpu_bind_order, 1); + + for (cpu_ix = 0; cpu_ix < size && cpu_ix < erts_no_schedulers; cpu_ix++) + if (erts_is_cpu_available(cpuinfo, cpudata[cpu_ix].logical)) + scheduler2cpu_map[s_ix++].bind_id = cpudata[cpu_ix].logical; + } + + if (s_ix <= erts_no_schedulers) + for (; s_ix <= erts_no_schedulers; s_ix++) + scheduler2cpu_map[s_ix].bind_id = -1; +} + +int +erts_init_scheduler_bind_type_string(char *how) +{ + if (erts_bind_to_cpu(cpuinfo, -1) == -ENOTSUP) + return ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED; + + if (!system_cpudata && !user_cpudata) + return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY; + + if (sys_strcmp(how, "db") == 0) + cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (sys_strcmp(how, "s") == 0) + cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (sys_strcmp(how, "ps") == 0) + cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "ts") == 0) + cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (sys_strcmp(how, "tnnps") == 0) + cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "nnps") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "nnts") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (sys_strcmp(how, "ns") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (sys_strcmp(how, "u") == 0) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else + return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE; + + return ERTS_INIT_SCHED_BIND_TYPE_SUCCESS; +} + +static Eterm +bound_schedulers_term(ErtsCpuBindOrder order) +{ + switch (order) { + case ERTS_CPU_BIND_SPREAD: { + ERTS_DECL_AM(spread); + return AM_spread; + } + case ERTS_CPU_BIND_PROCESSOR_SPREAD: { + ERTS_DECL_AM(processor_spread); + return AM_processor_spread; + } + case ERTS_CPU_BIND_THREAD_SPREAD: { + ERTS_DECL_AM(thread_spread); + return AM_thread_spread; + } + case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: { + ERTS_DECL_AM(thread_no_node_processor_spread); + return AM_thread_no_node_processor_spread; + } + case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: { + ERTS_DECL_AM(no_node_processor_spread); + return AM_no_node_processor_spread; + } + case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: { + ERTS_DECL_AM(no_node_thread_spread); + return AM_no_node_thread_spread; + } + case ERTS_CPU_BIND_NO_SPREAD: { + ERTS_DECL_AM(no_spread); + return AM_no_spread; + } + case ERTS_CPU_BIND_NONE: { + ERTS_DECL_AM(unbound); + return AM_unbound; + } + default: + ASSERT(0); + return THE_NON_VALUE; + } +} + +Eterm +erts_bound_schedulers_term(Process *c_p) +{ + ErtsCpuBindOrder order; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + order = cpu_bind_order; + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return bound_schedulers_term(order); +} + +Eterm +erts_bind_schedulers(Process *c_p, Eterm how) +{ + int notify = 0; + Eterm res; + erts_cpu_topology_t *cpudata; + int cpudata_size; + ErtsCpuBindOrder old_cpu_bind_order; + + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + + if (erts_bind_to_cpu(cpuinfo, -1) == -ENOTSUP) { + ERTS_BIF_PREP_ERROR(res, c_p, EXC_NOTSUP); + } + else { + + old_cpu_bind_order = cpu_bind_order; + + if (ERTS_IS_ATOM_STR("default_bind", how)) + cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (ERTS_IS_ATOM_STR("spread", how)) + cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (ERTS_IS_ATOM_STR("processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("no_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (ERTS_IS_ATOM_STR("unbound", how)) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else { + cpu_bind_order = old_cpu_bind_order; + ERTS_BIF_PREP_ERROR(res, c_p, BADARG); + goto done; + } + + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + + if (!cpudata) { + cpu_bind_order = old_cpu_bind_order; + ERTS_BIF_PREP_ERROR(res, c_p, BADARG); + goto done; + } + + write_schedulers_bind_change(cpudata, cpudata_size); + notify = 1; + + destroy_tmp_cpu_topology_copy(cpudata); + + res = bound_schedulers_term(old_cpu_bind_order); + } + + done: + + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + + if (notify) + erts_sched_notify_check_cpu_bind(); + + return res; +} + +int +erts_sched_bind_atthrcreate_prepare(void) +{ + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + return esdp != NULL && erts_is_scheduler_bound(esdp); +} + +int +erts_sched_bind_atthrcreate_child(int unbind) +{ + int res = 0; + if (unbind) { + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + res = erts_unbind_from_cpu(cpuinfo); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + } + return res; +} + +void +erts_sched_bind_atthrcreate_parent(int unbind) +{ + +} + +int +erts_sched_bind_atfork_prepare(void) +{ + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + int unbind = esdp != NULL && erts_is_scheduler_bound(esdp); + if (unbind) + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + return unbind; +} + +int +erts_sched_bind_atfork_child(int unbind) +{ + if (unbind) { + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx) + || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + return erts_unbind_from_cpu(cpuinfo); + } + return 0; +} + +char * +erts_sched_bind_atvfork_child(int unbind) +{ + if (unbind) { + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx) + || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + return erts_get_unbind_from_cpu_str(cpuinfo); + } + return "false"; +} + +void +erts_sched_bind_atfork_parent(int unbind) +{ + if (unbind) + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); +} + +Eterm +erts_fake_scheduler_bindings(Process *p, Eterm how) +{ + ErtsCpuBindOrder fake_cpu_bind_order; + erts_cpu_topology_t *cpudata; + int cpudata_size; + Eterm res; + + if (ERTS_IS_ATOM_STR("default_bind", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (ERTS_IS_ATOM_STR("spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (ERTS_IS_ATOM_STR("processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("no_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (ERTS_IS_ATOM_STR("unbound", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NONE; + else { + ERTS_BIF_PREP_ERROR(res, p, BADARG); + return res; + } + + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + + if (!cpudata || fake_cpu_bind_order == ERTS_CPU_BIND_NONE) + ERTS_BIF_PREP_RET(res, am_false); + else { + int i; + Eterm *hp; + + cpu_bind_order_sort(cpudata, cpudata_size, fake_cpu_bind_order, 1); + +#ifdef ERTS_FAKE_SCHED_BIND_PRINT_SORTED_CPU_DATA + + erts_fprintf(stderr, "node: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].node); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "processor: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].processor); + erts_fprintf(stderr, "\n"); + if (fake_cpu_bind_order != ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD + && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD + && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD) { + erts_fprintf(stderr, "processor_node:"); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].processor_node); + erts_fprintf(stderr, "\n"); + } + erts_fprintf(stderr, "core: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].core); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "thread: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].thread); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "logical: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].logical); + erts_fprintf(stderr, "\n"); +#endif + + hp = HAlloc(p, cpudata_size+1); + ERTS_BIF_PREP_RET(res, make_tuple(hp)); + *hp++ = make_arityval((Uint) cpudata_size); + for (i = 0; i < cpudata_size; i++) + *hp++ = make_small((Uint) cpudata[i].logical); + } + + destroy_tmp_cpu_topology_copy(cpudata); + + return res; +} + +Eterm +erts_get_schedulers_binds(Process *c_p) +{ + int ix; + ERTS_DECL_AM(unbound); + Eterm *hp = HAlloc(c_p, erts_no_schedulers+1); + Eterm res = make_tuple(hp); + + *(hp++) = make_arityval(erts_no_schedulers); + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + for (ix = 1; ix <= erts_no_schedulers; ix++) + *(hp++) = (scheduler2cpu_map[ix].bound_id >= 0 + ? make_small(scheduler2cpu_map[ix].bound_id) + : AM_unbound); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +/* + * CPU topology + */ + +typedef struct { + int *id; + int used; + int size; +} ErtsCpuTopIdSeq; + +typedef struct { + ErtsCpuTopIdSeq logical; + ErtsCpuTopIdSeq thread; + ErtsCpuTopIdSeq core; + ErtsCpuTopIdSeq processor_node; + ErtsCpuTopIdSeq processor; + ErtsCpuTopIdSeq node; +} ErtsCpuTopEntry; + +static void +init_cpu_top_entry(ErtsCpuTopEntry *cte) +{ + int size = 10; + cte->logical.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->logical.size = size; + cte->thread.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->thread.size = size; + cte->core.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->core.size = size; + cte->processor_node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->processor_node.size = size; + cte->processor.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->processor.size = size; + cte->node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->node.size = size; +} + +static void +destroy_cpu_top_entry(ErtsCpuTopEntry *cte) +{ + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->logical.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->thread.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->core.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor_node.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->node.id); +} + +static int +get_cput_value_or_range(int *v, int *vr, char **str) +{ + long l; + char *c = *str; + errno = 0; + if (!isdigit((unsigned char)*c)) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; + l = strtol(c, &c, 10); + if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; + *v = (int) l; + if (*c == '-') { + c++; + if (!isdigit((unsigned char)*c)) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + l = strtol(c, &c, 10); + if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + *vr = (int) l; + } + *str = c; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +get_cput_id_seq(ErtsCpuTopIdSeq *idseq, char **str) +{ + int ix = 0; + int need_size = 0; + char *c = *str; + + while (1) { + int res; + int val; + int nids; + int val_range = -1; + res = get_cput_value_or_range(&val, &val_range, &c); + if (res != ERTS_INIT_CPU_TOPOLOGY_OK) + return res; + if (val_range < 0 || val_range == val) + nids = 1; + else { + if (val_range > val) + nids = val_range - val + 1; + else + nids = val - val_range + 1; + } + need_size += nids; + if (need_size > idseq->size) { + idseq->size = need_size + 10; + idseq->id = erts_realloc(ERTS_ALC_T_TMP_CPU_IDS, + idseq->id, + sizeof(int)*idseq->size); + } + if (nids == 1) + idseq->id[ix++] = val; + else if (val_range > val) { + for (; val <= val_range; val++) + idseq->id[ix++] = val; + } + else { + for (; val >= val_range; val--) + idseq->id[ix++] = val; + } + if (*c != ',') + break; + c++; + } + *str = c; + idseq->used = ix; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +get_cput_entry(ErtsCpuTopEntry *cput, char **str) +{ + int h; + char *c = *str; + + cput->logical.used = 0; + cput->thread.id[0] = 0; + cput->thread.used = 1; + cput->core.id[0] = 0; + cput->core.used = 1; + cput->processor_node.id[0] = -1; + cput->processor_node.used = 1; + cput->processor.id[0] = 0; + cput->processor.used = 1; + cput->node.id[0] = -1; + cput->node.used = 1; + + h = ERTS_TOPOLOGY_MAX_DEPTH; + while (*c != ':' && *c != '\0') { + int res; + ErtsCpuTopIdSeq *idseqp; + switch (*c++) { + case 'L': + if (h <= ERTS_TOPOLOGY_LOGICAL) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->logical; + h = ERTS_TOPOLOGY_LOGICAL; + break; + case 't': + case 'T': + if (h <= ERTS_TOPOLOGY_THREAD) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->thread; + h = ERTS_TOPOLOGY_THREAD; + break; + case 'c': + case 'C': + if (h <= ERTS_TOPOLOGY_CORE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->core; + h = ERTS_TOPOLOGY_CORE; + break; + case 'p': + case 'P': + if (h <= ERTS_TOPOLOGY_PROCESSOR) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->processor; + h = ERTS_TOPOLOGY_PROCESSOR; + break; + case 'n': + case 'N': + if (h <= ERTS_TOPOLOGY_PROCESSOR) { + do_node: + if (h <= ERTS_TOPOLOGY_NODE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->node; + h = ERTS_TOPOLOGY_NODE; + } + else { + int p_node = 0; + char *p_chk = c; + while (*p_chk != '\0' && *p_chk != ':') { + if (*p_chk == 'p' || *p_chk == 'P') { + p_node = 1; + break; + } + p_chk++; + } + if (!p_node) + goto do_node; + if (h <= ERTS_TOPOLOGY_PROCESSOR_NODE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->processor_node; + h = ERTS_TOPOLOGY_PROCESSOR_NODE; + } + break; + default: + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE; + } + res = get_cput_id_seq(idseqp, &c); + if (res != ERTS_INIT_CPU_TOPOLOGY_OK) + return res; + } + + if (cput->logical.used < 1) + return ERTS_INIT_CPU_TOPOLOGY_MISSING_LID; + + if (*c == ':') { + c++; + } + + if (cput->thread.used != 1 + && cput->thread.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->core.used != 1 + && cput->core.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->processor_node.used != 1 + && cput->processor_node.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->processor.used != 1 + && cput->processor.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->node.used != 1 + && cput->node.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + + *str = c; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +verify_topology(erts_cpu_topology_t *cpudata, int size) +{ + if (size > 0) { + int *logical; + int node, processor, no_nodes, i; + + /* Verify logical ids */ + logical = erts_alloc(ERTS_ALC_T_TMP, sizeof(int)*size); + + for (i = 0; i < size; i++) + logical[i] = cpudata[i].logical; + + qsort(logical, size, sizeof(int), int_cmp); + for (i = 0; i < size-1; i++) { + if (logical[i] == logical[i+1]) { + erts_free(ERTS_ALC_T_TMP, logical); + return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS; + } + } + + erts_free(ERTS_ALC_T_TMP, logical); + + qsort(cpudata, size, sizeof(erts_cpu_topology_t), processor_order_cmp); + + /* Verify unique entities */ + + for (i = 1; i < size; i++) { + if (cpudata[i-1].processor == cpudata[i].processor + && cpudata[i-1].node == cpudata[i].node + && (cpudata[i-1].processor_node + == cpudata[i].processor_node) + && cpudata[i-1].core == cpudata[i].core + && cpudata[i-1].thread == cpudata[i].thread) { + return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES; + } + } + + /* Verify numa nodes */ + node = cpudata[0].node; + processor = cpudata[0].processor; + no_nodes = cpudata[0].node < 0 && cpudata[0].processor_node < 0; + for (i = 1; i < size; i++) { + if (no_nodes) { + if (cpudata[i].node >= 0 || cpudata[i].processor_node >= 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + } + else { + if (cpudata[i].processor == processor && cpudata[i].node != node) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + node = cpudata[i].node; + processor = cpudata[i].processor; + if (node >= 0 && cpudata[i].processor_node >= 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + if (node < 0 && cpudata[i].processor_node < 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + } + } + } + + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +int +erts_init_cpu_topology_string(char *topology_str) +{ + ErtsCpuTopEntry cput; + int need_size; + char *c; + int ix; + int error = ERTS_INIT_CPU_TOPOLOGY_OK; + + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata_size = 10; + + user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + + init_cpu_top_entry(&cput); + + ix = 0; + need_size = 0; + + c = topology_str; + if (*c == '\0') { + error = ERTS_INIT_CPU_TOPOLOGY_MISSING; + goto fail; + } + do { + int r; + error = get_cput_entry(&cput, &c); + if (error != ERTS_INIT_CPU_TOPOLOGY_OK) + goto fail; + need_size += cput.logical.used; + if (user_cpudata_size < need_size) { + user_cpudata_size = need_size + 10; + user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, + user_cpudata, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + } + + ASSERT(cput.thread.used == 1 + || cput.thread.used == cput.logical.used); + ASSERT(cput.core.used == 1 + || cput.core.used == cput.logical.used); + ASSERT(cput.processor_node.used == 1 + || cput.processor_node.used == cput.logical.used); + ASSERT(cput.processor.used == 1 + || cput.processor.used == cput.logical.used); + ASSERT(cput.node.used == 1 + || cput.node.used == cput.logical.used); + + for (r = 0; r < cput.logical.used; r++) { + user_cpudata[ix].logical = cput.logical.id[r]; + user_cpudata[ix].thread = + cput.thread.id[cput.thread.used == 1 ? 0 : r]; + user_cpudata[ix].core = + cput.core.id[cput.core.used == 1 ? 0 : r]; + user_cpudata[ix].processor_node = + cput.processor_node.id[cput.processor_node.used == 1 ? 0 : r]; + user_cpudata[ix].processor = + cput.processor.id[cput.processor.used == 1 ? 0 : r]; + user_cpudata[ix].node = + cput.node.id[cput.node.used == 1 ? 0 : r]; + ix++; + } + } while (*c != '\0'); + + if (user_cpudata_size != ix) { + user_cpudata_size = ix; + user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, + user_cpudata, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + } + + error = verify_topology(user_cpudata, user_cpudata_size); + if (error == ERTS_INIT_CPU_TOPOLOGY_OK) { + destroy_cpu_top_entry(&cput); + return ERTS_INIT_CPU_TOPOLOGY_OK; + } + + fail: + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata_size = 0; + destroy_cpu_top_entry(&cput); + return error; +} + +#define ERTS_GET_CPU_TOPOLOGY_ERROR -1 +#define ERTS_GET_USED_CPU_TOPOLOGY 0 +#define ERTS_GET_DETECTED_CPU_TOPOLOGY 1 +#define ERTS_GET_DEFINED_CPU_TOPOLOGY 2 + +static Eterm get_cpu_topology_term(Process *c_p, int type); + +Eterm +erts_set_cpu_topology(Process *c_p, Eterm term) +{ + erts_cpu_topology_t *cpudata = NULL; + int cpudata_size = 0; + Eterm res; + + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + res = get_cpu_topology_term(c_p, ERTS_GET_USED_CPU_TOPOLOGY); + if (term == am_undefined) { + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata = NULL; + user_cpudata_size = 0; + + if (cpu_bind_order != ERTS_CPU_BIND_NONE && system_cpudata) { + cpudata_size = system_cpudata_size; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + + sys_memcpy((void *) cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*cpudata_size); + } + } + else if (is_not_list(term)) { + error: + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + res = THE_NON_VALUE; + goto done; + } + else { + Eterm list = term; + int ix = 0; + + cpudata_size = 100; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + + while (is_list(list)) { + Eterm *lp = list_val(list); + Eterm cpu = CAR(lp); + Eterm* tp; + Sint id; + + if (is_not_tuple(cpu)) + goto error; + + tp = tuple_val(cpu); + + if (arityval(tp[0]) != 7 || tp[1] != am_cpu) + goto error; + + if (ix >= cpudata_size) { + cpudata_size += 100; + cpudata = erts_realloc(ERTS_ALC_T_TMP, + cpudata, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + } + + id = signed_val(tp[2]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].node = (int) id; + + id = signed_val(tp[3]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].processor = (int) id; + + id = signed_val(tp[4]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].processor_node = (int) id; + + id = signed_val(tp[5]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].core = (int) id; + + id = signed_val(tp[6]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].thread = (int) id; + + id = signed_val(tp[7]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].logical = (int) id; + + list = CDR(lp); + ix++; + } + + if (is_not_nil(list)) + goto error; + + cpudata_size = ix; + + if (ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(cpudata, cpudata_size)) + goto error; + + if (user_cpudata_size != cpudata_size) { + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + sizeof(erts_cpu_topology_t)*cpudata_size); + user_cpudata_size = cpudata_size; + } + + sys_memcpy((void *) user_cpudata, + (void *) cpudata, + sizeof(erts_cpu_topology_t)*cpudata_size); + } + + make_reader_groups_map(NULL); + + write_schedulers_bind_change(cpudata, cpudata_size); + + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + erts_sched_notify_check_cpu_bind(); + + done: + + if (cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); + + return res; +} + +static void +create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, int *cpudata_size) +{ + if (user_cpudata) { + *cpudata_size = user_cpudata_size; + *cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * (*cpudata_size))); + sys_memcpy((void *) *cpudata, + (void *) user_cpudata, + sizeof(erts_cpu_topology_t)*(*cpudata_size)); + } + else if (system_cpudata) { + *cpudata_size = system_cpudata_size; + *cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * (*cpudata_size))); + sys_memcpy((void *) *cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*(*cpudata_size)); + } + else { + *cpudata = NULL; + *cpudata_size = 0; + } +} + +static void +destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata) +{ + if (cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); +} + + +static Eterm +bld_topology_term(Eterm **hpp, + Uint *hszp, + erts_cpu_topology_t *cpudata, + int size) +{ + Eterm res = NIL; + int i; + + if (size == 0) + return am_undefined; + + for (i = size-1; i >= 0; i--) { + res = erts_bld_cons(hpp, + hszp, + erts_bld_tuple(hpp, + hszp, + 7, + am_cpu, + make_small(cpudata[i].node), + make_small(cpudata[i].processor), + make_small(cpudata[i].processor_node), + make_small(cpudata[i].core), + make_small(cpudata[i].thread), + make_small(cpudata[i].logical)), + res); + } + return res; +} + +static Eterm +get_cpu_topology_term(Process *c_p, int type) +{ +#ifdef DEBUG + Eterm *hp_end; +#endif + Eterm *hp; + Uint hsz; + Eterm res = THE_NON_VALUE; + erts_cpu_topology_t *cpudata = NULL; + int size = 0; + + switch (type) { + case ERTS_GET_USED_CPU_TOPOLOGY: + if (user_cpudata) + goto defined; + else + goto detected; + case ERTS_GET_DETECTED_CPU_TOPOLOGY: + detected: + if (!system_cpudata) + res = am_undefined; + else { + size = system_cpudata_size; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * size)); + sys_memcpy((void *) cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*size); + } + break; + case ERTS_GET_DEFINED_CPU_TOPOLOGY: + defined: + if (!user_cpudata) + res = am_undefined; + else { + size = user_cpudata_size; + cpudata = user_cpudata; + } + break; + default: + erl_exit(ERTS_ABORT_EXIT, "Bad cpu topology type: %d\n", type); + break; + } + + if (res == am_undefined) { + ASSERT(!cpudata); + return res; + } + + hsz = 0; + + bld_topology_term(NULL, &hsz, + cpudata, size); + + hp = HAlloc(c_p, hsz); + +#ifdef DEBUG + hp_end = hp + hsz; +#endif + + res = bld_topology_term(&hp, NULL, + cpudata, size); + + ASSERT(hp_end == hp); + + if (cpudata && cpudata != system_cpudata && cpudata != user_cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); + + return res; +} + +Eterm +erts_get_cpu_topology_term(Process *c_p, Eterm which) +{ + Eterm res; + int type; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + if (ERTS_IS_ATOM_STR("used", which)) + type = ERTS_GET_USED_CPU_TOPOLOGY; + else if (ERTS_IS_ATOM_STR("detected", which)) + type = ERTS_GET_DETECTED_CPU_TOPOLOGY; + else if (ERTS_IS_ATOM_STR("defined", which)) + type = ERTS_GET_DEFINED_CPU_TOPOLOGY; + else + type = ERTS_GET_CPU_TOPOLOGY_ERROR; + if (type == ERTS_GET_CPU_TOPOLOGY_ERROR) + res = THE_NON_VALUE; + else + res = get_cpu_topology_term(c_p, type); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +static void +get_logical_processors(int *conf, int *onln, int *avail) +{ + if (conf) + *conf = erts_get_cpu_configured(cpuinfo); + if (onln) + *onln = erts_get_cpu_online(cpuinfo); + if (avail) + *avail = erts_get_cpu_available(cpuinfo); +} + +void +erts_get_logical_processors(int *conf, int *onln, int *avail) +{ + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + get_logical_processors(conf, onln, avail); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); +} + +void +erts_pre_early_init_cpu_topology(int *max_rg_p, + int *conf_p, + int *onln_p, + int *avail_p) +{ + *max_rg_p = ERTS_MAX_READER_GROUPS; + cpuinfo = erts_cpu_info_create(); + get_logical_processors(conf_p, onln_p, avail_p); +} + +void +erts_early_init_cpu_topology(int no_schedulers, + int *max_main_threads_p, + int max_reader_groups, + int *reader_groups_p) +{ + user_cpudata = NULL; + user_cpudata_size = 0; + + system_cpudata_size = erts_get_cpu_topology_size(cpuinfo); + system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * system_cpudata_size)); + + cpu_bind_order = ERTS_CPU_BIND_UNDEFINED; + + reader_groups_available_cpu_check = 1; + reader_groups_logical_processors = 0; + reader_groups_map_size = 0; + reader_groups_map = NULL; + + if (!erts_get_cpu_topology(cpuinfo, system_cpudata) + || ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(system_cpudata, + system_cpudata_size)) { + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + system_cpudata = NULL; + system_cpudata_size = 0; + } + + max_main_threads = erts_get_cpu_configured(cpuinfo); + if (max_main_threads > no_schedulers) + max_main_threads = no_schedulers; + *max_main_threads_p = max_main_threads; + + reader_groups = max_main_threads; + if (reader_groups <= 1 || max_reader_groups <= 1) + reader_groups = 0; + if (reader_groups > max_reader_groups) + reader_groups = max_reader_groups; + *reader_groups_p = reader_groups; +} + +void +erts_init_cpu_topology(void) +{ + int ix; + + erts_smp_rwmtx_init(&cpuinfo_rwmtx, "cpu_info"); + + scheduler2cpu_map = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(ErtsCpuBindData) + * (erts_no_schedulers+1))); + for (ix = 1; ix <= erts_no_schedulers; ix++) { + scheduler2cpu_map[ix].bind_id = -1; + scheduler2cpu_map[ix].bound_id = -1; + } + + if (cpu_bind_order == ERTS_CPU_BIND_UNDEFINED) { + int ncpus = erts_get_cpu_configured(cpuinfo); + if (ncpus < 1 || erts_no_schedulers < ncpus) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else + cpu_bind_order = ((system_cpudata || user_cpudata) + && (erts_bind_to_cpu(cpuinfo, -1) != -ENOTSUP) + ? ERTS_CPU_BIND_DEFAULT_BIND + : ERTS_CPU_BIND_NONE); + } + + make_reader_groups_map(NULL); + + if (cpu_bind_order == ERTS_CPU_BIND_NONE) + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + else { + erts_cpu_topology_t *cpudata; + int cpudata_size; + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + write_schedulers_bind_change(cpudata, cpudata_size); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + erts_sched_notify_check_cpu_bind(); + destroy_tmp_cpu_topology_copy(cpudata); + } +} + +int +erts_update_cpu_info(void) +{ + int changed; + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + changed = erts_cpu_info_update(cpuinfo); + if (changed) { + erts_cpu_topology_t *cpudata; + int cpudata_size; + + if (system_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + + system_cpudata_size = erts_get_cpu_topology_size(cpuinfo); + if (!system_cpudata_size) + system_cpudata = NULL; + else { + system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * system_cpudata_size)); + + if (!erts_get_cpu_topology(cpuinfo, system_cpudata) + || (ERTS_INIT_CPU_TOPOLOGY_OK + != verify_topology(system_cpudata, + system_cpudata_size))) { + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + system_cpudata = NULL; + system_cpudata_size = 0; + } + } + + make_reader_groups_map(NULL); + + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + write_schedulers_bind_change(cpudata, cpudata_size); + destroy_tmp_cpu_topology_copy(cpudata); + } + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + if (changed) + erts_sched_notify_check_cpu_bind(); + return changed; +} + +/* + * reader groups map + */ + +static Eterm +get_reader_groups_map(Process *c_p, + erts_reader_groups_map_t *map, + int map_size, + int logical_processors) +{ +#ifdef DEBUG + Eterm *endp; +#endif + Eterm res = NIL, tuple; + Eterm *hp; + int i; + + hp = HAlloc(c_p, logical_processors*(2+3)); +#ifdef DEBUG + endp = hp + logical_processors*(2+3); +#endif + for (i = map_size - 1; i >= 0; i--) { + if (map[i].logical >= 0) { + tuple = TUPLE2(hp, + make_small(map[i].logical), + make_small(map[i].reader_group)); + hp += 3; + res = CONS(hp, tuple, res); + hp += 2; + } + } + ASSERT(hp == endp); + return res; +} + +Eterm +erts_debug_reader_groups_map(Process *c_p, int groups) +{ + Eterm res; + erts_make_reader_groups_map_test test; + + test.groups = groups; + make_reader_groups_map(&test); + if (!test.map) + res = NIL; + else { + res = get_reader_groups_map(c_p, + test.map, + test.map_size, + test.logical_processors); + erts_free(ERTS_ALC_T_TMP, test.map); + } + return res; +} + + +Eterm +erts_get_reader_groups_map(Process *c_p) +{ + Eterm res; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + res = get_reader_groups_map(c_p, + reader_groups_map, + reader_groups_map_size, + reader_groups_logical_processors); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +static void +make_available_cpu_topology(erts_avail_cput *no, + erts_avail_cput *avail, + erts_cpu_topology_t *cpudata, + int *size, + int test) +{ + int len = *size; + erts_cpu_topology_t last; + int a, i, j; + + no->level[ERTS_TOPOLOGY_NODE] = -1; + no->level[ERTS_TOPOLOGY_PROCESSOR] = -1; + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE] = -1; + no->level[ERTS_TOPOLOGY_CORE] = -1; + no->level[ERTS_TOPOLOGY_THREAD] = -1; + no->level[ERTS_TOPOLOGY_LOGICAL] = -1; + + last.node = INT_MIN; + last.processor = INT_MIN; + last.processor_node = INT_MIN; + last.core = INT_MIN; + last.thread = INT_MIN; + last.logical = INT_MIN; + + a = 0; + + for (i = 0; i < len; i++) { + + if (!test && !erts_is_cpu_available(cpuinfo, cpudata[i].logical)) + continue; + + if (last.node != cpudata[i].node) + goto node; + if (last.processor != cpudata[i].processor) + goto processor; + if (last.processor_node != cpudata[i].processor_node) + goto processor_node; + if (last.core != cpudata[i].core) + goto core; + ASSERT(last.thread != cpudata[i].thread); + goto thread; + + node: + no->level[ERTS_TOPOLOGY_NODE]++; + processor: + no->level[ERTS_TOPOLOGY_PROCESSOR]++; + processor_node: + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; + core: + no->level[ERTS_TOPOLOGY_CORE]++; + thread: + no->level[ERTS_TOPOLOGY_THREAD]++; + + no->level[ERTS_TOPOLOGY_LOGICAL]++; + + for (j = 0; j < ERTS_TOPOLOGY_LOGICAL; j++) + avail[a].level[j] = no->level[j]; + + avail[a].level[ERTS_TOPOLOGY_LOGICAL] = cpudata[i].logical; + avail[a].level[ERTS_TOPOLOGY_RG] = 0; + + ASSERT(last.logical != cpudata[i].logical); + + last = cpudata[i]; + a++; + } + + no->level[ERTS_TOPOLOGY_NODE]++; + no->level[ERTS_TOPOLOGY_PROCESSOR]++; + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; + no->level[ERTS_TOPOLOGY_CORE]++; + no->level[ERTS_TOPOLOGY_THREAD]++; + no->level[ERTS_TOPOLOGY_LOGICAL]++; + + *size = a; +} + +static int +reader_group_lookup(int logical) +{ + int start = logical % reader_groups_map_size; + int ix = start; + + do { + if (reader_groups_map[ix].logical == logical) { + ASSERT(reader_groups_map[ix].reader_group > 0); + return reader_groups_map[ix].reader_group; + } + ix++; + if (ix == reader_groups_map_size) + ix = 0; + } while (ix != start); + + erl_exit(ERTS_ABORT_EXIT, "Logical cpu id %d not found\n", logical); +} + +static void +reader_group_insert(erts_reader_groups_map_t *map, int map_size, + int logical, int reader_group) +{ + int start = logical % map_size; + int ix = start; + + do { + if (map[ix].logical < 0) { + map[ix].logical = logical; + map[ix].reader_group = reader_group; + return; + } + ix++; + if (ix == map_size) + ix = 0; + } while (ix != start); + + erl_exit(ERTS_ABORT_EXIT, "Reader groups map full\n"); +} + + +static int +sub_levels(erts_rg_count_t *rgc, int level, int aix, int avail_sz, erts_avail_cput *avail) +{ + int sub_level = level+1; + int last = -1; + rgc->sub_levels = 0; + + do { + if (last != avail[aix].level[sub_level]) { + rgc->sub_levels++; + last = avail[aix].level[sub_level]; + } + aix++; + } + while (aix < avail_sz && rgc->id == avail[aix].level[level]); + rgc->reader_groups = 0; + return aix; +} + +static int +write_reader_groups(int *rgp, erts_rg_count_t *rgcp, + int level, int a, + int avail_sz, erts_avail_cput *avail) +{ + int rg = *rgp; + int sub_level = level+1; + int sl_per_gr = rgcp->sub_levels / rgcp->reader_groups; + int xsl = rgcp->sub_levels % rgcp->reader_groups; + int sls = 0; + int last = -1; + int xsl_rg_lim = (rgcp->reader_groups - xsl) + rg + 1; + + ASSERT(level < 0 || avail[a].level[level] == rgcp->id) + + do { + if (last != avail[a].level[sub_level]) { + if (!sls) { + sls = sl_per_gr; + rg++; + if (rg >= xsl_rg_lim) + sls++; + } + last = avail[a].level[sub_level]; + sls--; + } + avail[a].level[ERTS_TOPOLOGY_RG] = rg; + a++; + } while (a < avail_sz && (level < 0 + || avail[a].level[level] == rgcp->id)); + + ASSERT(rgcp->reader_groups == rg - *rgp); + + *rgp = rg; + + return a; +} + +static int +rg_count_sub_levels_compare(const void *vx, const void *vy) +{ + erts_rg_count_t *x = (erts_rg_count_t *) vx; + erts_rg_count_t *y = (erts_rg_count_t *) vy; + if (x->sub_levels != y->sub_levels) + return y->sub_levels - x->sub_levels; + return x->id - y->id; +} + +static int +rg_count_id_compare(const void *vx, const void *vy) +{ + erts_rg_count_t *x = (erts_rg_count_t *) vx; + erts_rg_count_t *y = (erts_rg_count_t *) vy; + return x->id - y->id; +} + +static void +make_reader_groups_map(erts_make_reader_groups_map_test *test) +{ + int i, spread_level, avail_sz; + erts_avail_cput no, *avail; + erts_cpu_topology_t *cpudata; + erts_reader_groups_map_t *map; + int map_sz; + int groups = reader_groups; + + if (test) { + test->map = NULL; + test->map_size = 0; + groups = test->groups; + } + + if (!groups) + return; + + if (!test) { + if (reader_groups_map) + erts_free(ERTS_ALC_T_RDR_GRPS_MAP, reader_groups_map); + + reader_groups_logical_processors = 0; + reader_groups_map_size = 0; + reader_groups_map = NULL; + } + + create_tmp_cpu_topology_copy(&cpudata, &avail_sz); + + if (!cpudata) + return; + + cpu_bind_order_sort(cpudata, + avail_sz, + ERTS_CPU_BIND_NO_SPREAD, + 1); + + avail = erts_alloc(ERTS_ALC_T_TMP, + sizeof(erts_avail_cput)*avail_sz); + + make_available_cpu_topology(&no, avail, cpudata, + &avail_sz, test != NULL); + + destroy_tmp_cpu_topology_copy(cpudata); + + map_sz = avail_sz*2+1; + + if (test) { + map = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_reader_groups_map_t) + * map_sz)); + test->map = map; + test->map_size = map_sz; + test->logical_processors = avail_sz; + } + else { + map = erts_alloc(ERTS_ALC_T_RDR_GRPS_MAP, + (sizeof(erts_reader_groups_map_t) + * map_sz)); + reader_groups_map = map; + reader_groups_logical_processors = avail_sz; + reader_groups_map_size = map_sz; + + } + + for (i = 0; i < map_sz; i++) { + map[i].logical = -1; + map[i].reader_group = 0; + } + + spread_level = ERTS_TOPOLOGY_CORE; + for (i = ERTS_TOPOLOGY_NODE; i < ERTS_TOPOLOGY_THREAD; i++) { + if (no.level[i] > groups) { + spread_level = i; + break; + } + } + + if (no.level[spread_level] <= groups) { + int a, rg, last = -1; + rg = 0; + ASSERT(spread_level == ERTS_TOPOLOGY_CORE); + for (a = 0; a < avail_sz; a++) { + if (last != avail[a].level[spread_level]) { + rg++; + last = avail[a].level[spread_level]; + } + reader_group_insert(map, + map_sz, + avail[a].level[ERTS_TOPOLOGY_LOGICAL], + rg); + } + } + else { /* groups < no.level[spread_level] */ + erts_rg_count_t *rg_count; + int a, rg, tl, toplevels; + + tl = spread_level-1; + + if (spread_level == ERTS_TOPOLOGY_NODE) + toplevels = 1; + else + toplevels = no.level[tl]; + + rg_count = erts_alloc(ERTS_ALC_T_TMP, + toplevels*sizeof(erts_rg_count_t)); + + if (toplevels == 1) { + rg_count[0].id = 0; + rg_count[0].sub_levels = no.level[spread_level]; + rg_count[0].reader_groups = groups; + } + else { + int rgs_per_tl, rgs; + rgs = groups; + rgs_per_tl = rgs / toplevels; + + a = 0; + for (i = 0; i < toplevels; i++) { + rg_count[i].id = avail[a].level[tl]; + a = sub_levels(&rg_count[i], tl, a, avail_sz, avail); + } + + qsort(rg_count, + toplevels, + sizeof(erts_rg_count_t), + rg_count_sub_levels_compare); + + for (i = 0; i < toplevels; i++) { + if (rg_count[i].sub_levels < rgs_per_tl) { + rg_count[i].reader_groups = rg_count[i].sub_levels; + rgs -= rg_count[i].sub_levels; + } + else { + rg_count[i].reader_groups = rgs_per_tl; + rgs -= rgs_per_tl; + } + } + + while (rgs > 0) { + for (i = 0; i < toplevels; i++) { + if (rg_count[i].sub_levels == rg_count[i].reader_groups) + break; + else { + rg_count[i].reader_groups++; + if (--rgs == 0) + break; + } + } + } + + qsort(rg_count, + toplevels, + sizeof(erts_rg_count_t), + rg_count_id_compare); + } + + a = i = rg = 0; + while (a < avail_sz) { + a = write_reader_groups(&rg, &rg_count[i], tl, + a, avail_sz, avail); + i++; + } + + ASSERT(groups == rg); + + for (a = 0; a < avail_sz; a++) + reader_group_insert(map, + map_sz, + avail[a].level[ERTS_TOPOLOGY_LOGICAL], + avail[a].level[ERTS_TOPOLOGY_RG]); + + erts_free(ERTS_ALC_T_TMP, rg_count); + } + + erts_free(ERTS_ALC_T_TMP, avail); +} |