diff options
Diffstat (limited to 'erts/emulator')
27 files changed, 3648 insertions, 2453 deletions
diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in index 76d782b159..4ed0ccabc6 100644 --- a/erts/emulator/Makefile.in +++ b/erts/emulator/Makefile.in @@ -734,7 +734,7 @@ RUN_OBJS = \ $(OBJDIR)/erl_fun.o $(OBJDIR)/erl_bif_port.o \ $(OBJDIR)/erl_term.o $(OBJDIR)/erl_node_tables.o \ $(OBJDIR)/erl_monitors.o $(OBJDIR)/erl_process_dump.o \ - $(OBJDIR)/erl_bif_timer.o \ + $(OBJDIR)/erl_bif_timer.o $(OBJDIR)/erl_cpu_topology.o \ $(OBJDIR)/erl_drv_thread.o $(OBJDIR)/erl_bif_chksum.o \ $(OBJDIR)/erl_bif_re.o $(OBJDIR)/erl_unicode.o \ $(OBJDIR)/packet_parser.o $(OBJDIR)/safe_hash.o \ diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c index 16b6aeac3f..694460d702 100644 --- a/erts/emulator/beam/dist.c +++ b/erts/emulator/beam/dist.c @@ -97,6 +97,8 @@ dist_msg_dbg(ErtsDistExternal *edep, char *what, byte *buf, int sz) #define PASS_THROUGH 'p' /* This code should go */ int erts_is_alive; /* System must be blocked on change */ +int erts_dist_buf_busy_limit; + /* distribution trap functions */ Export* dsend2_trap = NULL; @@ -160,7 +162,7 @@ Uint erts_dist_cache_size(void) static ErtsProcList * get_suspended_on_de(DistEntry *dep, Uint32 unset_qflgs) { - ERTS_SMP_LC_ASSERT(erts_smp_lc_spinlock_is_locked(&dep->qlock)); + ERTS_SMP_LC_ASSERT(erts_smp_lc_mtx_is_locked(&dep->qlock)); dep->qflgs &= ~unset_qflgs; if (dep->qflgs & ERTS_DE_QFLG_EXIT) { /* No resume when exit has been scheduled */ @@ -453,17 +455,17 @@ int erts_do_net_exits(DistEntry *dep, Eterm reason) if (dep->status & ERTS_DE_SFLG_EXITING) { #ifdef DEBUG - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qflgs & ERTS_DE_QFLG_EXIT); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); #endif } else { dep->status |= ERTS_DE_SFLG_EXITING; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(!(dep->qflgs & ERTS_DE_QFLG_EXIT)); dep->qflgs |= ERTS_DE_QFLG_EXIT; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } erts_smp_de_links_lock(dep); @@ -577,7 +579,7 @@ static void clear_dist_entry(DistEntry *dep) erts_smp_de_links_unlock(dep); #endif - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); if (!dep->out_queue.last) obuf = dep->finalized_out_queue.first; @@ -593,7 +595,7 @@ static void clear_dist_entry(DistEntry *dep) dep->status = 0; suspendees = get_suspended_on_de(dep, ERTS_DE_QFLGS_ALL); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); erts_smp_atomic_set(&dep->dist_cmd_scheduled, 0); dep->send = NULL; erts_smp_de_rwunlock(dep); @@ -611,10 +613,10 @@ static void clear_dist_entry(DistEntry *dep) } if (obufsize) { - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize >= obufsize); dep->qsize -= obufsize; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } } @@ -1453,8 +1455,6 @@ int erts_net_message(Port *prt, return -1; } -#define ERTS_DE_BUSY_LIMIT (128*1024) - static int dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) { @@ -1538,18 +1538,18 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) } else { ErtsProcList *plp = NULL; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); dep->qsize += size_obuf(obuf); - if (dep->qsize >= ERTS_DE_BUSY_LIMIT) + if (dep->qsize >= erts_dist_buf_busy_limit) dep->qflgs |= ERTS_DE_QFLG_BUSY; if (!force_busy && (dep->qflgs & ERTS_DE_QFLG_BUSY)) { - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); plp = erts_proclist_create(c_p); plp->next = NULL; erts_suspend(c_p, ERTS_PROC_LOCK_MAIN, NULL); suspended = 1; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); } /* Enqueue obuf on dist entry */ @@ -1575,7 +1575,7 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) } } - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); erts_schedule_dist_command(NULL, dep); erts_smp_de_runlock(dep); @@ -1708,10 +1708,8 @@ erts_dist_command(Port *prt, int reds_limit) { Sint reds = ERTS_PORT_REDS_DIST_CMD_START; int prt_busy; - int de_busy; Uint32 status; Uint32 flags; - Uint32 qflgs; Sint obufsize = 0; ErtsDistOutputQueue oq, foq; DistEntry *dep = prt->dist_entry; @@ -1746,13 +1744,12 @@ erts_dist_command(Port *prt, int reds_limit) * a mess. */ - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); oq.first = dep->out_queue.first; oq.last = dep->out_queue.last; dep->out_queue.first = NULL; dep->out_queue.last = NULL; - qflgs = dep->qflgs; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); foq.first = dep->finalized_out_queue.first; foq.last = dep->finalized_out_queue.last; @@ -1763,17 +1760,8 @@ erts_dist_command(Port *prt, int reds_limit) goto preempted; prt_busy = (int) (prt->status & ERTS_PORT_SFLG_PORT_BUSY); - de_busy = (int) (qflgs & ERTS_DE_QFLG_BUSY); - if (prt_busy) { - if (!de_busy) { - erts_smp_spin_lock(&dep->qlock); - dep->qflgs |= ERTS_DE_QFLG_BUSY; - erts_smp_spin_unlock(&dep->qlock); - de_busy = 1; - } - } - else if (foq.first) { + if (!prt_busy && foq.first) { int preempt = 0; do { Uint size; @@ -1791,10 +1779,7 @@ erts_dist_command(Port *prt, int reds_limit) free_dist_obuf(fob); preempt = reds > reds_limit || (prt->status & ERTS_PORT_SFLGS_DEAD); if (prt->status & ERTS_PORT_SFLG_PORT_BUSY) { - erts_smp_spin_lock(&dep->qlock); - dep->qflgs |= ERTS_DE_QFLG_BUSY; - erts_smp_spin_unlock(&dep->qlock); - de_busy = prt_busy = 1; + prt_busy = 1; break; } } while (foq.first && !preempt); @@ -1877,10 +1862,7 @@ erts_dist_command(Port *prt, int reds_limit) free_dist_obuf(fob); preempt = reds > reds_limit || (prt->status & ERTS_PORT_SFLGS_DEAD); if (prt->status & ERTS_PORT_SFLG_PORT_BUSY) { - erts_smp_spin_lock(&dep->qlock); - dep->qflgs |= ERTS_DE_QFLG_BUSY; - erts_smp_spin_unlock(&dep->qlock); - de_busy = prt_busy = 1; + prt_busy = 1; if (oq.first && !preempt) goto finalize_only; } @@ -1907,22 +1889,23 @@ erts_dist_command(Port *prt, int reds_limit) * dist entry in a non-busy state and resume suspended * processes. */ - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize >= obufsize); dep->qsize -= obufsize; obufsize = 0; - if (de_busy && !prt_busy && dep->qsize < ERTS_DE_BUSY_LIMIT) { + if (!prt_busy + && (dep->qflgs & ERTS_DE_QFLG_BUSY) + && dep->qsize < erts_dist_buf_busy_limit) { ErtsProcList *suspendees; int resumed; suspendees = get_suspended_on_de(dep, ERTS_DE_QFLG_BUSY); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); resumed = erts_resume_processes(suspendees); reds += resumed*ERTS_PORT_REDS_DIST_CMD_RESUMED; - de_busy = 0; } else - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } ASSERT(!oq.first && !oq.last); @@ -1931,10 +1914,10 @@ erts_dist_command(Port *prt, int reds_limit) if (obufsize != 0) { ASSERT(obufsize > 0); - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize >= obufsize); dep->qsize -= obufsize; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } ASSERT(foq.first || !foq.last); @@ -1984,9 +1967,9 @@ erts_dist_command(Port *prt, int reds_limit) foq.last = NULL; #ifdef DEBUG - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize == obufsize); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); #endif } else { @@ -1995,14 +1978,14 @@ erts_dist_command(Port *prt, int reds_limit) * Unhandle buffers need to be put back first * in out_queue. */ - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); dep->qsize -= obufsize; obufsize = 0; oq.last->next = dep->out_queue.first; dep->out_queue.first = oq.first; if (!dep->out_queue.last) dep->out_queue.last = oq.last; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } erts_schedule_dist_command(prt, NULL); @@ -2026,10 +2009,10 @@ erts_kill_dist_connection(DistEntry *dep, Uint32 connection_id) dep->status |= ERTS_DE_SFLG_EXITING; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(!(dep->qflgs & ERTS_DE_QFLG_EXIT)); dep->qflgs |= ERTS_DE_QFLG_EXIT; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); erts_schedule_dist_command(NULL, dep); } @@ -2400,13 +2383,13 @@ BIF_RETTYPE setnode_3(BIF_ALIST_3) ErtsProcList *plp = erts_proclist_create(BIF_P); plp->next = NULL; erts_suspend(BIF_P, ERTS_PROC_LOCK_MAIN, NULL); - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); if (dep->suspended.last) dep->suspended.last->next = plp; else dep->suspended.first = plp; dep->suspended.last = plp; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); goto yield; } @@ -2434,9 +2417,9 @@ BIF_RETTYPE setnode_3(BIF_ALIST_3) ASSERT(dep->send); #ifdef DEBUG - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize == 0); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); #endif erts_set_dist_entry_connected(dep, BIF_ARG_2, flags); diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h index fa19c7fb45..64caf34550 100644 --- a/erts/emulator/beam/dist.h +++ b/erts/emulator/beam/dist.h @@ -99,7 +99,8 @@ typedef struct { #define ERTS_DE_IS_CONNECTED(DEP) \ (!ERTS_DE_IS_NOT_CONNECTED((DEP))) - +#define ERTS_DE_BUSY_LIMIT (1024*1024) +extern int erts_dist_buf_busy_limit; extern int erts_is_alive; /* @@ -153,10 +154,10 @@ erts_dsig_prepare(ErtsDSigData *dsdp, } if (no_suspend) { failure = ERTS_DSIG_PREP_CONNECTED; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); if (dep->qflgs & ERTS_DE_QFLG_BUSY) failure = ERTS_DSIG_PREP_WOULD_SUSPEND; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); if (failure == ERTS_DSIG_PREP_WOULD_SUSPEND) goto fail; } diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types index 7df9f19af0..408ffd12f7 100644 --- a/erts/emulator/beam/erl_alloc.types +++ b/erts/emulator/beam/erl_alloc.types @@ -247,7 +247,7 @@ type CPUDATA LONG_LIVED SYSTEM cpu_data type TMP_CPU_IDS SHORT_LIVED SYSTEM tmp_cpu_ids type EXT_TERM_DATA SHORT_LIVED PROCESSES external_term_data type ZLIB STANDARD SYSTEM zlib -type RDR_GRPS_MAP LONG_LIVED SYSTEM reader_groups_map +type CPU_GRPS_MAP LONG_LIVED SYSTEM cpu_groups_map +if smp type ASYNC SHORT_LIVED SYSTEM async diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index 40d8dc097c..89e3b3209c 100644 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -38,6 +38,7 @@ #include "erl_instrument.h" #include "dist.h" #include "erl_gc.h" +#include "erl_cpu_topology.h" #ifdef HIPE #include "hipe_arch.h" #endif @@ -1687,6 +1688,8 @@ info_1_tuple(Process* BIF_P, /* Pointer to current process. */ return erts_get_cpu_topology_term(BIF_P, *tp); } else if (ERTS_IS_ATOM_STR("cpu_topology", sel) && arity == 2) { Eterm res = erts_get_cpu_topology_term(BIF_P, *tp); + if (res == THE_NON_VALUE) + goto badarg; ERTS_BIF_PREP_TRAP1(ret, erts_format_cpu_topology_trap, BIF_P, res); return ret; #if defined(PURIFY) || defined(VALGRIND) @@ -2345,9 +2348,7 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) /* Arguments that are unusual follow ... */ else if (ERTS_IS_ATOM_STR("logical_processors", BIF_ARG_1)) { int no; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - no = erts_get_cpu_configured(erts_cpuinfo); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + erts_get_logical_processors(&no, NULL, NULL); if (no > 0) BIF_RET(make_small((Uint) no)); else { @@ -2357,9 +2358,7 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) } else if (ERTS_IS_ATOM_STR("logical_processors_online", BIF_ARG_1)) { int no; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - no = erts_get_cpu_online(erts_cpuinfo); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + erts_get_logical_processors(NULL, &no, NULL); if (no > 0) BIF_RET(make_small((Uint) no)); else { @@ -2369,9 +2368,7 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) } else if (ERTS_IS_ATOM_STR("logical_processors_available", BIF_ARG_1)) { int no; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - no = erts_get_cpu_available(erts_cpuinfo); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + erts_get_logical_processors(NULL, NULL, &no); if (no > 0) BIF_RET(make_small((Uint) no)); else { @@ -2533,6 +2530,13 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) BIF_RET(erts_nif_taints(BIF_P)); } else if (ERTS_IS_ATOM_STR("reader_groups_map", BIF_ARG_1)) { BIF_RET(erts_get_reader_groups_map(BIF_P)); + } else if (ERTS_IS_ATOM_STR("dist_buf_busy_limit", BIF_ARG_1)) { + Uint hsz = 0; + + (void) erts_bld_uint(NULL, &hsz, erts_dist_buf_busy_limit); + hp = hsz ? HAlloc(BIF_P, hsz) : NULL; + res = erts_bld_uint(&hp, NULL, erts_dist_buf_busy_limit); + BIF_RET(res); } BIF_ERROR(BIF_P, BADARG); diff --git a/erts/emulator/beam/erl_cpu_topology.c b/erts/emulator/beam/erl_cpu_topology.c new file mode 100644 index 0000000000..db95c4a5d4 --- /dev/null +++ b/erts/emulator/beam/erl_cpu_topology.c @@ -0,0 +1,2359 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2010. All Rights Reserved. + * + * The contents of this file are subject to the Erlang Public License, + * Version 1.1, (the "License"); you may not use this file except in + * compliance with the License. You should have received a copy of the + * Erlang Public License along with this software. If not, it can be + * retrieved online at http://www.erlang.org/. + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * %CopyrightEnd% + */ + +/* + * Description: CPU topology and related functionality + * + * Author: Rickard Green + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <ctype.h> + +#include "global.h" +#include "error.h" +#include "bif.h" +#include "erl_cpu_topology.h" + +#define ERTS_MAX_READER_GROUPS 8 + +/* + * Cpu topology hierarchy. + */ +#define ERTS_TOPOLOGY_NODE 0 +#define ERTS_TOPOLOGY_PROCESSOR 1 +#define ERTS_TOPOLOGY_PROCESSOR_NODE 2 +#define ERTS_TOPOLOGY_CORE 3 +#define ERTS_TOPOLOGY_THREAD 4 +#define ERTS_TOPOLOGY_LOGICAL 5 + +#define ERTS_TOPOLOGY_MAX_DEPTH 6 + +typedef struct { + int bind_id; + int bound_id; +} ErtsCpuBindData; + +static erts_cpu_info_t *cpuinfo; + +static int max_main_threads; +static int reader_groups; + +static ErtsCpuBindData *scheduler2cpu_map; +static erts_smp_rwmtx_t cpuinfo_rwmtx; + +typedef enum { + ERTS_CPU_BIND_UNDEFINED, + ERTS_CPU_BIND_SPREAD, + ERTS_CPU_BIND_PROCESSOR_SPREAD, + ERTS_CPU_BIND_THREAD_SPREAD, + ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD, + ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD, + ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD, + ERTS_CPU_BIND_NO_SPREAD, + ERTS_CPU_BIND_NONE +} ErtsCpuBindOrder; + +#define ERTS_CPU_BIND_DEFAULT_BIND \ + ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD + +static int no_cpu_groups_callbacks; +static ErtsCpuBindOrder cpu_bind_order; + +static erts_cpu_topology_t *user_cpudata; +static int user_cpudata_size; +static erts_cpu_topology_t *system_cpudata; +static int system_cpudata_size; + +typedef struct { + int level[ERTS_TOPOLOGY_MAX_DEPTH+1]; +} erts_avail_cput; + +typedef struct { + int id; + int sub_levels; + int cpu_groups; +} erts_cpu_groups_count_t; + +typedef struct { + int logical; + int cpu_group; +} erts_cpu_groups_map_array_t; + +typedef struct erts_cpu_groups_callback_list_t_ erts_cpu_groups_callback_list_t; +struct erts_cpu_groups_callback_list_t_ { + erts_cpu_groups_callback_list_t *next; + erts_cpu_groups_callback_t callback; + void *arg; +}; + +typedef struct erts_cpu_groups_map_t_ erts_cpu_groups_map_t; +struct erts_cpu_groups_map_t_ { + erts_cpu_groups_map_t *next; + int groups; + erts_cpu_groups_map_array_t *array; + int size; + int logical_processors; + erts_cpu_groups_callback_list_t *callback_list; +}; + +typedef struct { + erts_cpu_groups_callback_t callback; + int ix; + void *arg; +} erts_cpu_groups_callback_call_t; + +static erts_cpu_groups_map_t *cpu_groups_maps; + +static erts_cpu_groups_map_t *reader_groups_map; + +#define ERTS_TOPOLOGY_CG ERTS_TOPOLOGY_MAX_DEPTH + +#define ERTS_MAX_CPU_TOPOLOGY_ID ((int) 0xffff) + +#ifdef ERTS_SMP +static void cpu_bind_order_sort(erts_cpu_topology_t *cpudata, + int size, + ErtsCpuBindOrder bind_order, + int mk_seq); +static void write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size); +#endif + +static void reader_groups_callback(int, ErtsSchedulerData *, int, void *); +static erts_cpu_groups_map_t *add_cpu_groups(int groups, + erts_cpu_groups_callback_t callback, + void *arg); +static void update_cpu_groups_maps(void); +static void make_cpu_groups_map(erts_cpu_groups_map_t *map, int test); +static int cpu_groups_lookup(erts_cpu_groups_map_t *map, + ErtsSchedulerData *esdp); + +static void create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, + int *cpudata_size); +static void destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata); + +static int +int_cmp(const void *vx, const void *vy) +{ + return *((int *) vx) - *((int *) vy); +} + +static int +cpu_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->core != y->core) + return x->core - y->core; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->node != y->node) + return x->node - y->node; + return 0; +} + +static int +cpu_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_thread_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + return 0; +} + +static int +cpu_thread_no_node_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->node != y->node) + return x->node - y->node; + if (x->core != y->core) + return x->core - y->core; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_no_node_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->core != y->core) + return x->core - y->core; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_no_node_thread_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->core != y->core) + return x->core - y->core; + return 0; +} + +static int +cpu_no_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->thread != y->thread) + return x->thread - y->thread; + return 0; +} + +static ERTS_INLINE void +make_cpudata_id_seq(erts_cpu_topology_t *cpudata, int size, int no_node) +{ + int ix; + int node = -1; + int processor = -1; + int processor_node = -1; + int processor_node_node = -1; + int core = -1; + int thread = -1; + int old_node = -1; + int old_processor = -1; + int old_processor_node = -1; + int old_core = -1; + int old_thread = -1; + + for (ix = 0; ix < size; ix++) { + if (!no_node || cpudata[ix].node >= 0) { + if (old_node == cpudata[ix].node) + cpudata[ix].node = node; + else { + old_node = cpudata[ix].node; + old_processor = processor = -1; + if (!no_node) + old_processor_node = processor_node = -1; + old_core = core = -1; + old_thread = thread = -1; + if (no_node || cpudata[ix].node >= 0) + cpudata[ix].node = ++node; + } + } + if (old_processor == cpudata[ix].processor) + cpudata[ix].processor = processor; + else { + old_processor = cpudata[ix].processor; + if (!no_node) + processor_node_node = old_processor_node = processor_node = -1; + old_core = core = -1; + old_thread = thread = -1; + cpudata[ix].processor = ++processor; + } + if (no_node && cpudata[ix].processor_node < 0) + old_processor_node = -1; + else { + if (old_processor_node == cpudata[ix].processor_node) { + if (no_node) + cpudata[ix].node = cpudata[ix].processor_node = node; + else { + if (processor_node_node >= 0) + cpudata[ix].node = processor_node_node; + cpudata[ix].processor_node = processor_node; + } + } + else { + old_processor_node = cpudata[ix].processor_node; + old_core = core = -1; + old_thread = thread = -1; + if (no_node) + cpudata[ix].node = cpudata[ix].processor_node = ++node; + else { + cpudata[ix].node = processor_node_node = ++node; + cpudata[ix].processor_node = ++processor_node; + } + } + } + if (!no_node && cpudata[ix].processor_node < 0) + cpudata[ix].processor_node = 0; + if (old_core == cpudata[ix].core) + cpudata[ix].core = core; + else { + old_core = cpudata[ix].core; + old_thread = thread = -1; + cpudata[ix].core = ++core; + } + if (old_thread == cpudata[ix].thread) + cpudata[ix].thread = thread; + else + old_thread = cpudata[ix].thread = ++thread; + } +} + +static void +cpu_bind_order_sort(erts_cpu_topology_t *cpudata, + int size, + ErtsCpuBindOrder bind_order, + int mk_seq) +{ + if (size > 1) { + int no_node = 0; + int (*cmp_func)(const void *, const void *); + switch (bind_order) { + case ERTS_CPU_BIND_SPREAD: + cmp_func = cpu_spread_order_cmp; + break; + case ERTS_CPU_BIND_PROCESSOR_SPREAD: + cmp_func = cpu_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_THREAD_SPREAD: + cmp_func = cpu_thread_spread_order_cmp; + break; + case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: + no_node = 1; + cmp_func = cpu_thread_no_node_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: + no_node = 1; + cmp_func = cpu_no_node_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: + no_node = 1; + cmp_func = cpu_no_node_thread_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_SPREAD: + cmp_func = cpu_no_spread_order_cmp; + break; + default: + cmp_func = NULL; + erl_exit(ERTS_ABORT_EXIT, + "Bad cpu bind type: %d\n", + (int) cpu_bind_order); + break; + } + + if (mk_seq) + make_cpudata_id_seq(cpudata, size, no_node); + + qsort(cpudata, size, sizeof(erts_cpu_topology_t), cmp_func); + } +} + +static int +processor_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->node != y->node) + return x->node - y->node; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->thread != y->thread) + return x->thread - y->thread; + return 0; +} + +#ifdef ERTS_SMP +void +erts_sched_check_cpu_bind_prep_suspend(ErtsSchedulerData *esdp) +{ + erts_cpu_groups_map_t *cgm; + erts_cpu_groups_callback_list_t *cgcl; + erts_cpu_groups_callback_call_t *cgcc; + int cgcc_ix; + + /* Unbind from cpu */ + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + if (scheduler2cpu_map[esdp->no].bound_id >= 0 + && erts_unbind_from_cpu(cpuinfo) == 0) { + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; + } + + cgcc = erts_alloc(ERTS_ALC_T_TMP, + (no_cpu_groups_callbacks + * sizeof(erts_cpu_groups_callback_call_t))); + cgcc_ix = 0; + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) { + cgcc[cgcc_ix].callback = cgcl->callback; + cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp); + cgcc[cgcc_ix].arg = cgcl->arg; + cgcc_ix++; + } + } + ASSERT(no_cpu_groups_callbacks == cgcc_ix); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + + for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++) + cgcc[cgcc_ix].callback(1, + esdp, + cgcc[cgcc_ix].ix, + cgcc[cgcc_ix].arg); + + erts_free(ERTS_ALC_T_TMP, cgcc); + + if (esdp->no <= max_main_threads) + erts_thr_set_main_status(0, 0); + +} + +void +erts_sched_check_cpu_bind_post_suspend(ErtsSchedulerData *esdp) +{ + ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(esdp->run_queue)); + + if (esdp->no <= max_main_threads) + erts_thr_set_main_status(1, (int) esdp->no); + + /* Make sure we check if we should bind to a cpu or not... */ + if (esdp->run_queue->flags & ERTS_RUNQ_FLG_SHARED_RUNQ) + erts_smp_atomic_set(&esdp->chk_cpu_bind, 1); + else + esdp->run_queue->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; +} + +#endif + +void +erts_sched_check_cpu_bind(ErtsSchedulerData *esdp) +{ + int res, cpu_id, cgcc_ix; + erts_cpu_groups_map_t *cgm; + erts_cpu_groups_callback_list_t *cgcl; + erts_cpu_groups_callback_call_t *cgcc; +#ifdef ERTS_SMP + if (erts_common_run_queue) + erts_smp_atomic_set(&esdp->chk_cpu_bind, 0); + else { + esdp->run_queue->flags &= ~ERTS_RUNQ_FLG_CHK_CPU_BIND; + } +#endif + erts_smp_runq_unlock(esdp->run_queue); + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + cpu_id = scheduler2cpu_map[esdp->no].bind_id; + if (cpu_id >= 0 && cpu_id != scheduler2cpu_map[esdp->no].bound_id) { + res = erts_bind_to_cpu(cpuinfo, cpu_id); + if (res == 0) + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = cpu_id; + else { + erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); + erts_dsprintf(dsbufp, "Scheduler %d failed to bind to cpu %d: %s\n", + (int) esdp->no, cpu_id, erl_errno_id(-res)); + erts_send_error_to_logger_nogl(dsbufp); + if (scheduler2cpu_map[esdp->no].bound_id >= 0) + goto unbind; + } + } + else if (cpu_id < 0) { + unbind: + /* Get rid of old binding */ + res = erts_unbind_from_cpu(cpuinfo); + if (res == 0) + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; + else if (res != -ENOTSUP) { + erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); + erts_dsprintf(dsbufp, "Scheduler %d failed to unbind from cpu %d: %s\n", + (int) esdp->no, cpu_id, erl_errno_id(-res)); + erts_send_error_to_logger_nogl(dsbufp); + } + } + + cgcc = erts_alloc(ERTS_ALC_T_TMP, + (no_cpu_groups_callbacks + * sizeof(erts_cpu_groups_callback_call_t))); + cgcc_ix = 0; + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) { + cgcc[cgcc_ix].callback = cgcl->callback; + cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp); + cgcc[cgcc_ix].arg = cgcl->arg; + cgcc_ix++; + } + } + + ASSERT(no_cpu_groups_callbacks == cgcc_ix); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + + for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++) + cgcc[cgcc_ix].callback(0, + esdp, + cgcc[cgcc_ix].ix, + cgcc[cgcc_ix].arg); + + erts_free(ERTS_ALC_T_TMP, cgcc); + + erts_smp_runq_lock(esdp->run_queue); +} + +#ifdef ERTS_SMP +void +erts_sched_init_check_cpu_bind(ErtsSchedulerData *esdp) +{ + int cgcc_ix; + erts_cpu_groups_map_t *cgm; + erts_cpu_groups_callback_list_t *cgcl; + erts_cpu_groups_callback_call_t *cgcc; + + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + + cgcc = erts_alloc(ERTS_ALC_T_TMP, + (no_cpu_groups_callbacks + * sizeof(erts_cpu_groups_callback_call_t))); + cgcc_ix = 0; + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) { + cgcc[cgcc_ix].callback = cgcl->callback; + cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp); + cgcc[cgcc_ix].arg = cgcl->arg; + cgcc_ix++; + } + } + + ASSERT(no_cpu_groups_callbacks == cgcc_ix); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + + for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++) + cgcc[cgcc_ix].callback(0, + esdp, + cgcc[cgcc_ix].ix, + cgcc[cgcc_ix].arg); + + erts_free(ERTS_ALC_T_TMP, cgcc); + + if (esdp->no <= max_main_threads) + erts_thr_set_main_status(1, (int) esdp->no); +} +#endif + +static void +write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size) +{ + int s_ix = 1; + int cpu_ix; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + if (cpu_bind_order != ERTS_CPU_BIND_NONE && size) { + + cpu_bind_order_sort(cpudata, size, cpu_bind_order, 1); + + for (cpu_ix = 0; cpu_ix < size && cpu_ix < erts_no_schedulers; cpu_ix++) + if (erts_is_cpu_available(cpuinfo, cpudata[cpu_ix].logical)) + scheduler2cpu_map[s_ix++].bind_id = cpudata[cpu_ix].logical; + } + + if (s_ix <= erts_no_schedulers) + for (; s_ix <= erts_no_schedulers; s_ix++) + scheduler2cpu_map[s_ix].bind_id = -1; +} + +int +erts_init_scheduler_bind_type_string(char *how) +{ + if (erts_bind_to_cpu(cpuinfo, -1) == -ENOTSUP) + return ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED; + + if (!system_cpudata && !user_cpudata) + return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY; + + if (sys_strcmp(how, "db") == 0) + cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (sys_strcmp(how, "s") == 0) + cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (sys_strcmp(how, "ps") == 0) + cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "ts") == 0) + cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (sys_strcmp(how, "tnnps") == 0) + cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "nnps") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "nnts") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (sys_strcmp(how, "ns") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (sys_strcmp(how, "u") == 0) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else + return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE; + + return ERTS_INIT_SCHED_BIND_TYPE_SUCCESS; +} + +static Eterm +bound_schedulers_term(ErtsCpuBindOrder order) +{ + switch (order) { + case ERTS_CPU_BIND_SPREAD: { + ERTS_DECL_AM(spread); + return AM_spread; + } + case ERTS_CPU_BIND_PROCESSOR_SPREAD: { + ERTS_DECL_AM(processor_spread); + return AM_processor_spread; + } + case ERTS_CPU_BIND_THREAD_SPREAD: { + ERTS_DECL_AM(thread_spread); + return AM_thread_spread; + } + case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: { + ERTS_DECL_AM(thread_no_node_processor_spread); + return AM_thread_no_node_processor_spread; + } + case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: { + ERTS_DECL_AM(no_node_processor_spread); + return AM_no_node_processor_spread; + } + case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: { + ERTS_DECL_AM(no_node_thread_spread); + return AM_no_node_thread_spread; + } + case ERTS_CPU_BIND_NO_SPREAD: { + ERTS_DECL_AM(no_spread); + return AM_no_spread; + } + case ERTS_CPU_BIND_NONE: { + ERTS_DECL_AM(unbound); + return AM_unbound; + } + default: + ASSERT(0); + return THE_NON_VALUE; + } +} + +Eterm +erts_bound_schedulers_term(Process *c_p) +{ + ErtsCpuBindOrder order; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + order = cpu_bind_order; + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return bound_schedulers_term(order); +} + +Eterm +erts_bind_schedulers(Process *c_p, Eterm how) +{ + int notify = 0; + Eterm res; + erts_cpu_topology_t *cpudata; + int cpudata_size; + ErtsCpuBindOrder old_cpu_bind_order; + + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + + if (erts_bind_to_cpu(cpuinfo, -1) == -ENOTSUP) { + ERTS_BIF_PREP_ERROR(res, c_p, EXC_NOTSUP); + } + else { + + old_cpu_bind_order = cpu_bind_order; + + if (ERTS_IS_ATOM_STR("default_bind", how)) + cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (ERTS_IS_ATOM_STR("spread", how)) + cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (ERTS_IS_ATOM_STR("processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("no_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (ERTS_IS_ATOM_STR("unbound", how)) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else { + cpu_bind_order = old_cpu_bind_order; + ERTS_BIF_PREP_ERROR(res, c_p, BADARG); + goto done; + } + + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + + if (!cpudata) { + cpu_bind_order = old_cpu_bind_order; + ERTS_BIF_PREP_ERROR(res, c_p, BADARG); + goto done; + } + + write_schedulers_bind_change(cpudata, cpudata_size); + notify = 1; + + destroy_tmp_cpu_topology_copy(cpudata); + + res = bound_schedulers_term(old_cpu_bind_order); + } + + done: + + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + + if (notify) + erts_sched_notify_check_cpu_bind(); + + return res; +} + +int +erts_sched_bind_atthrcreate_prepare(void) +{ + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + return esdp != NULL && erts_is_scheduler_bound(esdp); +} + +int +erts_sched_bind_atthrcreate_child(int unbind) +{ + int res = 0; + if (unbind) { + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + res = erts_unbind_from_cpu(cpuinfo); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + } + return res; +} + +void +erts_sched_bind_atthrcreate_parent(int unbind) +{ + +} + +int +erts_sched_bind_atfork_prepare(void) +{ + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + int unbind = esdp != NULL && erts_is_scheduler_bound(esdp); + if (unbind) + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + return unbind; +} + +int +erts_sched_bind_atfork_child(int unbind) +{ + if (unbind) { + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx) + || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + return erts_unbind_from_cpu(cpuinfo); + } + return 0; +} + +char * +erts_sched_bind_atvfork_child(int unbind) +{ + if (unbind) { + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx) + || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + return erts_get_unbind_from_cpu_str(cpuinfo); + } + return "false"; +} + +void +erts_sched_bind_atfork_parent(int unbind) +{ + if (unbind) + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); +} + +Eterm +erts_fake_scheduler_bindings(Process *p, Eterm how) +{ + ErtsCpuBindOrder fake_cpu_bind_order; + erts_cpu_topology_t *cpudata; + int cpudata_size; + Eterm res; + + if (ERTS_IS_ATOM_STR("default_bind", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (ERTS_IS_ATOM_STR("spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (ERTS_IS_ATOM_STR("processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("no_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (ERTS_IS_ATOM_STR("unbound", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NONE; + else { + ERTS_BIF_PREP_ERROR(res, p, BADARG); + return res; + } + + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + + if (!cpudata || fake_cpu_bind_order == ERTS_CPU_BIND_NONE) + ERTS_BIF_PREP_RET(res, am_false); + else { + int i; + Eterm *hp; + + cpu_bind_order_sort(cpudata, cpudata_size, fake_cpu_bind_order, 1); + +#ifdef ERTS_FAKE_SCHED_BIND_PRINT_SORTED_CPU_DATA + + erts_fprintf(stderr, "node: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].node); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "processor: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].processor); + erts_fprintf(stderr, "\n"); + if (fake_cpu_bind_order != ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD + && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD + && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD) { + erts_fprintf(stderr, "processor_node:"); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].processor_node); + erts_fprintf(stderr, "\n"); + } + erts_fprintf(stderr, "core: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].core); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "thread: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].thread); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "logical: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].logical); + erts_fprintf(stderr, "\n"); +#endif + + hp = HAlloc(p, cpudata_size+1); + ERTS_BIF_PREP_RET(res, make_tuple(hp)); + *hp++ = make_arityval((Uint) cpudata_size); + for (i = 0; i < cpudata_size; i++) + *hp++ = make_small((Uint) cpudata[i].logical); + } + + destroy_tmp_cpu_topology_copy(cpudata); + + return res; +} + +Eterm +erts_get_schedulers_binds(Process *c_p) +{ + int ix; + ERTS_DECL_AM(unbound); + Eterm *hp = HAlloc(c_p, erts_no_schedulers+1); + Eterm res = make_tuple(hp); + + *(hp++) = make_arityval(erts_no_schedulers); + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + for (ix = 1; ix <= erts_no_schedulers; ix++) + *(hp++) = (scheduler2cpu_map[ix].bound_id >= 0 + ? make_small(scheduler2cpu_map[ix].bound_id) + : AM_unbound); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +/* + * CPU topology + */ + +typedef struct { + int *id; + int used; + int size; +} ErtsCpuTopIdSeq; + +typedef struct { + ErtsCpuTopIdSeq logical; + ErtsCpuTopIdSeq thread; + ErtsCpuTopIdSeq core; + ErtsCpuTopIdSeq processor_node; + ErtsCpuTopIdSeq processor; + ErtsCpuTopIdSeq node; +} ErtsCpuTopEntry; + +static void +init_cpu_top_entry(ErtsCpuTopEntry *cte) +{ + int size = 10; + cte->logical.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->logical.size = size; + cte->thread.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->thread.size = size; + cte->core.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->core.size = size; + cte->processor_node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->processor_node.size = size; + cte->processor.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->processor.size = size; + cte->node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->node.size = size; +} + +static void +destroy_cpu_top_entry(ErtsCpuTopEntry *cte) +{ + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->logical.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->thread.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->core.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor_node.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->node.id); +} + +static int +get_cput_value_or_range(int *v, int *vr, char **str) +{ + long l; + char *c = *str; + errno = 0; + if (!isdigit((unsigned char)*c)) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; + l = strtol(c, &c, 10); + if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; + *v = (int) l; + if (*c == '-') { + c++; + if (!isdigit((unsigned char)*c)) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + l = strtol(c, &c, 10); + if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + *vr = (int) l; + } + *str = c; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +get_cput_id_seq(ErtsCpuTopIdSeq *idseq, char **str) +{ + int ix = 0; + int need_size = 0; + char *c = *str; + + while (1) { + int res; + int val; + int nids; + int val_range = -1; + res = get_cput_value_or_range(&val, &val_range, &c); + if (res != ERTS_INIT_CPU_TOPOLOGY_OK) + return res; + if (val_range < 0 || val_range == val) + nids = 1; + else { + if (val_range > val) + nids = val_range - val + 1; + else + nids = val - val_range + 1; + } + need_size += nids; + if (need_size > idseq->size) { + idseq->size = need_size + 10; + idseq->id = erts_realloc(ERTS_ALC_T_TMP_CPU_IDS, + idseq->id, + sizeof(int)*idseq->size); + } + if (nids == 1) + idseq->id[ix++] = val; + else if (val_range > val) { + for (; val <= val_range; val++) + idseq->id[ix++] = val; + } + else { + for (; val >= val_range; val--) + idseq->id[ix++] = val; + } + if (*c != ',') + break; + c++; + } + *str = c; + idseq->used = ix; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +get_cput_entry(ErtsCpuTopEntry *cput, char **str) +{ + int h; + char *c = *str; + + cput->logical.used = 0; + cput->thread.id[0] = 0; + cput->thread.used = 1; + cput->core.id[0] = 0; + cput->core.used = 1; + cput->processor_node.id[0] = -1; + cput->processor_node.used = 1; + cput->processor.id[0] = 0; + cput->processor.used = 1; + cput->node.id[0] = -1; + cput->node.used = 1; + + h = ERTS_TOPOLOGY_MAX_DEPTH; + while (*c != ':' && *c != '\0') { + int res; + ErtsCpuTopIdSeq *idseqp; + switch (*c++) { + case 'L': + if (h <= ERTS_TOPOLOGY_LOGICAL) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->logical; + h = ERTS_TOPOLOGY_LOGICAL; + break; + case 't': + case 'T': + if (h <= ERTS_TOPOLOGY_THREAD) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->thread; + h = ERTS_TOPOLOGY_THREAD; + break; + case 'c': + case 'C': + if (h <= ERTS_TOPOLOGY_CORE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->core; + h = ERTS_TOPOLOGY_CORE; + break; + case 'p': + case 'P': + if (h <= ERTS_TOPOLOGY_PROCESSOR) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->processor; + h = ERTS_TOPOLOGY_PROCESSOR; + break; + case 'n': + case 'N': + if (h <= ERTS_TOPOLOGY_PROCESSOR) { + do_node: + if (h <= ERTS_TOPOLOGY_NODE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->node; + h = ERTS_TOPOLOGY_NODE; + } + else { + int p_node = 0; + char *p_chk = c; + while (*p_chk != '\0' && *p_chk != ':') { + if (*p_chk == 'p' || *p_chk == 'P') { + p_node = 1; + break; + } + p_chk++; + } + if (!p_node) + goto do_node; + if (h <= ERTS_TOPOLOGY_PROCESSOR_NODE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->processor_node; + h = ERTS_TOPOLOGY_PROCESSOR_NODE; + } + break; + default: + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE; + } + res = get_cput_id_seq(idseqp, &c); + if (res != ERTS_INIT_CPU_TOPOLOGY_OK) + return res; + } + + if (cput->logical.used < 1) + return ERTS_INIT_CPU_TOPOLOGY_MISSING_LID; + + if (*c == ':') { + c++; + } + + if (cput->thread.used != 1 + && cput->thread.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->core.used != 1 + && cput->core.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->processor_node.used != 1 + && cput->processor_node.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->processor.used != 1 + && cput->processor.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->node.used != 1 + && cput->node.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + + *str = c; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +verify_topology(erts_cpu_topology_t *cpudata, int size) +{ + if (size > 0) { + int *logical; + int node, processor, no_nodes, i; + + /* Verify logical ids */ + logical = erts_alloc(ERTS_ALC_T_TMP, sizeof(int)*size); + + for (i = 0; i < size; i++) + logical[i] = cpudata[i].logical; + + qsort(logical, size, sizeof(int), int_cmp); + for (i = 0; i < size-1; i++) { + if (logical[i] == logical[i+1]) { + erts_free(ERTS_ALC_T_TMP, logical); + return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS; + } + } + + erts_free(ERTS_ALC_T_TMP, logical); + + qsort(cpudata, size, sizeof(erts_cpu_topology_t), processor_order_cmp); + + /* Verify unique entities */ + + for (i = 1; i < size; i++) { + if (cpudata[i-1].processor == cpudata[i].processor + && cpudata[i-1].node == cpudata[i].node + && (cpudata[i-1].processor_node + == cpudata[i].processor_node) + && cpudata[i-1].core == cpudata[i].core + && cpudata[i-1].thread == cpudata[i].thread) { + return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES; + } + } + + /* Verify numa nodes */ + node = cpudata[0].node; + processor = cpudata[0].processor; + no_nodes = cpudata[0].node < 0 && cpudata[0].processor_node < 0; + for (i = 1; i < size; i++) { + if (no_nodes) { + if (cpudata[i].node >= 0 || cpudata[i].processor_node >= 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + } + else { + if (cpudata[i].processor == processor && cpudata[i].node != node) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + node = cpudata[i].node; + processor = cpudata[i].processor; + if (node >= 0 && cpudata[i].processor_node >= 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + if (node < 0 && cpudata[i].processor_node < 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + } + } + } + + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +int +erts_init_cpu_topology_string(char *topology_str) +{ + ErtsCpuTopEntry cput; + int need_size; + char *c; + int ix; + int error = ERTS_INIT_CPU_TOPOLOGY_OK; + + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata_size = 10; + + user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + + init_cpu_top_entry(&cput); + + ix = 0; + need_size = 0; + + c = topology_str; + if (*c == '\0') { + error = ERTS_INIT_CPU_TOPOLOGY_MISSING; + goto fail; + } + do { + int r; + error = get_cput_entry(&cput, &c); + if (error != ERTS_INIT_CPU_TOPOLOGY_OK) + goto fail; + need_size += cput.logical.used; + if (user_cpudata_size < need_size) { + user_cpudata_size = need_size + 10; + user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, + user_cpudata, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + } + + ASSERT(cput.thread.used == 1 + || cput.thread.used == cput.logical.used); + ASSERT(cput.core.used == 1 + || cput.core.used == cput.logical.used); + ASSERT(cput.processor_node.used == 1 + || cput.processor_node.used == cput.logical.used); + ASSERT(cput.processor.used == 1 + || cput.processor.used == cput.logical.used); + ASSERT(cput.node.used == 1 + || cput.node.used == cput.logical.used); + + for (r = 0; r < cput.logical.used; r++) { + user_cpudata[ix].logical = cput.logical.id[r]; + user_cpudata[ix].thread = + cput.thread.id[cput.thread.used == 1 ? 0 : r]; + user_cpudata[ix].core = + cput.core.id[cput.core.used == 1 ? 0 : r]; + user_cpudata[ix].processor_node = + cput.processor_node.id[cput.processor_node.used == 1 ? 0 : r]; + user_cpudata[ix].processor = + cput.processor.id[cput.processor.used == 1 ? 0 : r]; + user_cpudata[ix].node = + cput.node.id[cput.node.used == 1 ? 0 : r]; + ix++; + } + } while (*c != '\0'); + + if (user_cpudata_size != ix) { + user_cpudata_size = ix; + user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, + user_cpudata, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + } + + error = verify_topology(user_cpudata, user_cpudata_size); + if (error == ERTS_INIT_CPU_TOPOLOGY_OK) { + destroy_cpu_top_entry(&cput); + return ERTS_INIT_CPU_TOPOLOGY_OK; + } + + fail: + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata_size = 0; + destroy_cpu_top_entry(&cput); + return error; +} + +#define ERTS_GET_CPU_TOPOLOGY_ERROR -1 +#define ERTS_GET_USED_CPU_TOPOLOGY 0 +#define ERTS_GET_DETECTED_CPU_TOPOLOGY 1 +#define ERTS_GET_DEFINED_CPU_TOPOLOGY 2 + +static Eterm get_cpu_topology_term(Process *c_p, int type); + +Eterm +erts_set_cpu_topology(Process *c_p, Eterm term) +{ + erts_cpu_topology_t *cpudata = NULL; + int cpudata_size = 0; + Eterm res; + + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + res = get_cpu_topology_term(c_p, ERTS_GET_USED_CPU_TOPOLOGY); + if (term == am_undefined) { + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata = NULL; + user_cpudata_size = 0; + + if (cpu_bind_order != ERTS_CPU_BIND_NONE && system_cpudata) { + cpudata_size = system_cpudata_size; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + + sys_memcpy((void *) cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*cpudata_size); + } + } + else if (is_not_list(term)) { + error: + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + res = THE_NON_VALUE; + goto done; + } + else { + Eterm list = term; + int ix = 0; + + cpudata_size = 100; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + + while (is_list(list)) { + Eterm *lp = list_val(list); + Eterm cpu = CAR(lp); + Eterm* tp; + Sint id; + + if (is_not_tuple(cpu)) + goto error; + + tp = tuple_val(cpu); + + if (arityval(tp[0]) != 7 || tp[1] != am_cpu) + goto error; + + if (ix >= cpudata_size) { + cpudata_size += 100; + cpudata = erts_realloc(ERTS_ALC_T_TMP, + cpudata, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + } + + id = signed_val(tp[2]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].node = (int) id; + + id = signed_val(tp[3]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].processor = (int) id; + + id = signed_val(tp[4]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].processor_node = (int) id; + + id = signed_val(tp[5]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].core = (int) id; + + id = signed_val(tp[6]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].thread = (int) id; + + id = signed_val(tp[7]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].logical = (int) id; + + list = CDR(lp); + ix++; + } + + if (is_not_nil(list)) + goto error; + + cpudata_size = ix; + + if (ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(cpudata, cpudata_size)) + goto error; + + if (user_cpudata_size != cpudata_size) { + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + sizeof(erts_cpu_topology_t)*cpudata_size); + user_cpudata_size = cpudata_size; + } + + sys_memcpy((void *) user_cpudata, + (void *) cpudata, + sizeof(erts_cpu_topology_t)*cpudata_size); + } + + update_cpu_groups_maps(); + + write_schedulers_bind_change(cpudata, cpudata_size); + + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + erts_sched_notify_check_cpu_bind(); + + done: + + if (cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); + + return res; +} + +static void +create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, int *cpudata_size) +{ + if (user_cpudata) { + *cpudata_size = user_cpudata_size; + *cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * (*cpudata_size))); + sys_memcpy((void *) *cpudata, + (void *) user_cpudata, + sizeof(erts_cpu_topology_t)*(*cpudata_size)); + } + else if (system_cpudata) { + *cpudata_size = system_cpudata_size; + *cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * (*cpudata_size))); + sys_memcpy((void *) *cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*(*cpudata_size)); + } + else { + *cpudata = NULL; + *cpudata_size = 0; + } +} + +static void +destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata) +{ + if (cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); +} + + +static Eterm +bld_topology_term(Eterm **hpp, + Uint *hszp, + erts_cpu_topology_t *cpudata, + int size) +{ + Eterm res = NIL; + int i; + + if (size == 0) + return am_undefined; + + for (i = size-1; i >= 0; i--) { + res = erts_bld_cons(hpp, + hszp, + erts_bld_tuple(hpp, + hszp, + 7, + am_cpu, + make_small(cpudata[i].node), + make_small(cpudata[i].processor), + make_small(cpudata[i].processor_node), + make_small(cpudata[i].core), + make_small(cpudata[i].thread), + make_small(cpudata[i].logical)), + res); + } + return res; +} + +static Eterm +get_cpu_topology_term(Process *c_p, int type) +{ +#ifdef DEBUG + Eterm *hp_end; +#endif + Eterm *hp; + Uint hsz; + Eterm res = THE_NON_VALUE; + erts_cpu_topology_t *cpudata = NULL; + int size = 0; + + switch (type) { + case ERTS_GET_USED_CPU_TOPOLOGY: + if (user_cpudata) + goto defined; + else + goto detected; + case ERTS_GET_DETECTED_CPU_TOPOLOGY: + detected: + if (!system_cpudata) + res = am_undefined; + else { + size = system_cpudata_size; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * size)); + sys_memcpy((void *) cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*size); + } + break; + case ERTS_GET_DEFINED_CPU_TOPOLOGY: + defined: + if (!user_cpudata) + res = am_undefined; + else { + size = user_cpudata_size; + cpudata = user_cpudata; + } + break; + default: + erl_exit(ERTS_ABORT_EXIT, "Bad cpu topology type: %d\n", type); + break; + } + + if (res == am_undefined) { + ASSERT(!cpudata); + return res; + } + + hsz = 0; + + bld_topology_term(NULL, &hsz, + cpudata, size); + + hp = HAlloc(c_p, hsz); + +#ifdef DEBUG + hp_end = hp + hsz; +#endif + + res = bld_topology_term(&hp, NULL, + cpudata, size); + + ASSERT(hp_end == hp); + + if (cpudata && cpudata != system_cpudata && cpudata != user_cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); + + return res; +} + +Eterm +erts_get_cpu_topology_term(Process *c_p, Eterm which) +{ + Eterm res; + int type; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + if (ERTS_IS_ATOM_STR("used", which)) + type = ERTS_GET_USED_CPU_TOPOLOGY; + else if (ERTS_IS_ATOM_STR("detected", which)) + type = ERTS_GET_DETECTED_CPU_TOPOLOGY; + else if (ERTS_IS_ATOM_STR("defined", which)) + type = ERTS_GET_DEFINED_CPU_TOPOLOGY; + else + type = ERTS_GET_CPU_TOPOLOGY_ERROR; + if (type == ERTS_GET_CPU_TOPOLOGY_ERROR) + res = THE_NON_VALUE; + else + res = get_cpu_topology_term(c_p, type); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +static void +get_logical_processors(int *conf, int *onln, int *avail) +{ + if (conf) + *conf = erts_get_cpu_configured(cpuinfo); + if (onln) + *onln = erts_get_cpu_online(cpuinfo); + if (avail) + *avail = erts_get_cpu_available(cpuinfo); +} + +void +erts_get_logical_processors(int *conf, int *onln, int *avail) +{ + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + get_logical_processors(conf, onln, avail); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); +} + +void +erts_pre_early_init_cpu_topology(int *max_rg_p, + int *conf_p, + int *onln_p, + int *avail_p) +{ + cpu_groups_maps = NULL; + no_cpu_groups_callbacks = 0; + *max_rg_p = ERTS_MAX_READER_GROUPS; + cpuinfo = erts_cpu_info_create(); + get_logical_processors(conf_p, onln_p, avail_p); +} + +void +erts_early_init_cpu_topology(int no_schedulers, + int *max_main_threads_p, + int max_reader_groups, + int *reader_groups_p) +{ + user_cpudata = NULL; + user_cpudata_size = 0; + + system_cpudata_size = erts_get_cpu_topology_size(cpuinfo); + system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * system_cpudata_size)); + + cpu_bind_order = ERTS_CPU_BIND_UNDEFINED; + + if (!erts_get_cpu_topology(cpuinfo, system_cpudata) + || ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(system_cpudata, + system_cpudata_size)) { + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + system_cpudata = NULL; + system_cpudata_size = 0; + } + + max_main_threads = erts_get_cpu_configured(cpuinfo); + if (max_main_threads > no_schedulers) + max_main_threads = no_schedulers; + *max_main_threads_p = max_main_threads; + + reader_groups = max_main_threads; + if (reader_groups <= 1 || max_reader_groups <= 1) + reader_groups = 0; + if (reader_groups > max_reader_groups) + reader_groups = max_reader_groups; + *reader_groups_p = reader_groups; +} + +void +erts_init_cpu_topology(void) +{ + int ix; + + erts_smp_rwmtx_init(&cpuinfo_rwmtx, "cpu_info"); + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + + scheduler2cpu_map = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(ErtsCpuBindData) + * (erts_no_schedulers+1))); + for (ix = 1; ix <= erts_no_schedulers; ix++) { + scheduler2cpu_map[ix].bind_id = -1; + scheduler2cpu_map[ix].bound_id = -1; + } + + if (cpu_bind_order == ERTS_CPU_BIND_UNDEFINED) { + int ncpus = erts_get_cpu_configured(cpuinfo); + if (ncpus < 1 || erts_no_schedulers < ncpus) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else + cpu_bind_order = ((system_cpudata || user_cpudata) + && (erts_bind_to_cpu(cpuinfo, -1) != -ENOTSUP) + ? ERTS_CPU_BIND_DEFAULT_BIND + : ERTS_CPU_BIND_NONE); + } + + reader_groups_map = add_cpu_groups(reader_groups, + reader_groups_callback, + NULL); + + if (cpu_bind_order == ERTS_CPU_BIND_NONE) + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + else { + erts_cpu_topology_t *cpudata; + int cpudata_size; + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + write_schedulers_bind_change(cpudata, cpudata_size); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + erts_sched_notify_check_cpu_bind(); + destroy_tmp_cpu_topology_copy(cpudata); + } +} + +int +erts_update_cpu_info(void) +{ + int changed; + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + changed = erts_cpu_info_update(cpuinfo); + if (changed) { + erts_cpu_topology_t *cpudata; + int cpudata_size; + + if (system_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + + system_cpudata_size = erts_get_cpu_topology_size(cpuinfo); + if (!system_cpudata_size) + system_cpudata = NULL; + else { + system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * system_cpudata_size)); + + if (!erts_get_cpu_topology(cpuinfo, system_cpudata) + || (ERTS_INIT_CPU_TOPOLOGY_OK + != verify_topology(system_cpudata, + system_cpudata_size))) { + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + system_cpudata = NULL; + system_cpudata_size = 0; + } + } + + update_cpu_groups_maps(); + + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + write_schedulers_bind_change(cpudata, cpudata_size); + destroy_tmp_cpu_topology_copy(cpudata); + } + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + if (changed) + erts_sched_notify_check_cpu_bind(); + return changed; +} + +/* + * reader groups map + */ + +void +reader_groups_callback(int suspending, + ErtsSchedulerData *esdp, + int group, + void *unused) +{ + if (reader_groups && esdp->no <= max_main_threads) + erts_smp_rwmtx_set_reader_group(suspending ? 0 : group+1); +} + +static Eterm get_cpu_groups_map(Process *c_p, + erts_cpu_groups_map_t *map, + int offset); +Eterm +erts_debug_reader_groups_map(Process *c_p, int groups) +{ + Eterm res; + erts_cpu_groups_map_t test; + + test.array = NULL; + test.groups = groups; + make_cpu_groups_map(&test, 1); + if (!test.array) + res = NIL; + else { + res = get_cpu_groups_map(c_p, &test, 1); + erts_free(ERTS_ALC_T_TMP, test.array); + } + return res; +} + + +Eterm +erts_get_reader_groups_map(Process *c_p) +{ + Eterm res; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + res = get_cpu_groups_map(c_p, reader_groups_map, 1); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +/* + * CPU groups + */ + +static Eterm +get_cpu_groups_map(Process *c_p, + erts_cpu_groups_map_t *map, + int offset) +{ +#ifdef DEBUG + Eterm *endp; +#endif + Eterm res = NIL, tuple; + Eterm *hp; + int i; + + hp = HAlloc(c_p, map->logical_processors*(2+3)); +#ifdef DEBUG + endp = hp + map->logical_processors*(2+3); +#endif + for (i = map->size - 1; i >= 0; i--) { + if (map->array[i].logical >= 0) { + tuple = TUPLE2(hp, + make_small(map->array[i].logical), + make_small(map->array[i].cpu_group + offset)); + hp += 3; + res = CONS(hp, tuple, res); + hp += 2; + } + } + ASSERT(hp == endp); + return res; +} + +static void +make_available_cpu_topology(erts_avail_cput *no, + erts_avail_cput *avail, + erts_cpu_topology_t *cpudata, + int *size, + int test) +{ + int len = *size; + erts_cpu_topology_t last; + int a, i, j; + + no->level[ERTS_TOPOLOGY_NODE] = -1; + no->level[ERTS_TOPOLOGY_PROCESSOR] = -1; + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE] = -1; + no->level[ERTS_TOPOLOGY_CORE] = -1; + no->level[ERTS_TOPOLOGY_THREAD] = -1; + no->level[ERTS_TOPOLOGY_LOGICAL] = -1; + + last.node = INT_MIN; + last.processor = INT_MIN; + last.processor_node = INT_MIN; + last.core = INT_MIN; + last.thread = INT_MIN; + last.logical = INT_MIN; + + a = 0; + + for (i = 0; i < len; i++) { + + if (!test && !erts_is_cpu_available(cpuinfo, cpudata[i].logical)) + continue; + + if (last.node != cpudata[i].node) + goto node; + if (last.processor != cpudata[i].processor) + goto processor; + if (last.processor_node != cpudata[i].processor_node) + goto processor_node; + if (last.core != cpudata[i].core) + goto core; + ASSERT(last.thread != cpudata[i].thread); + goto thread; + + node: + no->level[ERTS_TOPOLOGY_NODE]++; + processor: + no->level[ERTS_TOPOLOGY_PROCESSOR]++; + processor_node: + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; + core: + no->level[ERTS_TOPOLOGY_CORE]++; + thread: + no->level[ERTS_TOPOLOGY_THREAD]++; + + no->level[ERTS_TOPOLOGY_LOGICAL]++; + + for (j = 0; j < ERTS_TOPOLOGY_LOGICAL; j++) + avail[a].level[j] = no->level[j]; + + avail[a].level[ERTS_TOPOLOGY_LOGICAL] = cpudata[i].logical; + avail[a].level[ERTS_TOPOLOGY_CG] = 0; + + ASSERT(last.logical != cpudata[i].logical); + + last = cpudata[i]; + a++; + } + + no->level[ERTS_TOPOLOGY_NODE]++; + no->level[ERTS_TOPOLOGY_PROCESSOR]++; + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; + no->level[ERTS_TOPOLOGY_CORE]++; + no->level[ERTS_TOPOLOGY_THREAD]++; + no->level[ERTS_TOPOLOGY_LOGICAL]++; + + *size = a; +} + +static void +cpu_group_insert(erts_cpu_groups_map_t *map, + int logical, int cpu_group) +{ + int start = logical % map->size; + int ix = start; + + do { + if (map->array[ix].logical < 0) { + map->array[ix].logical = logical; + map->array[ix].cpu_group = cpu_group; + return; + } + ix++; + if (ix == map->size) + ix = 0; + } while (ix != start); + + erl_exit(ERTS_ABORT_EXIT, "Reader groups map full\n"); +} + + +static int +sub_levels(erts_cpu_groups_count_t *cgc, int level, int aix, + int avail_sz, erts_avail_cput *avail) +{ + int sub_level = level+1; + int last = -1; + cgc->sub_levels = 0; + + do { + if (last != avail[aix].level[sub_level]) { + cgc->sub_levels++; + last = avail[aix].level[sub_level]; + } + aix++; + } + while (aix < avail_sz && cgc->id == avail[aix].level[level]); + cgc->cpu_groups = 0; + return aix; +} + +static int +write_cpu_groups(int *cgp, erts_cpu_groups_count_t *cgcp, + int level, int a, + int avail_sz, erts_avail_cput *avail) +{ + int cg = *cgp; + int sub_level = level+1; + int sl_per_gr = cgcp->sub_levels / cgcp->cpu_groups; + int xsl = cgcp->sub_levels % cgcp->cpu_groups; + int sls = 0; + int last = -1; + int xsl_cg_lim = (cgcp->cpu_groups - xsl) + cg + 1; + + ASSERT(level < 0 || avail[a].level[level] == cgcp->id); + + do { + if (last != avail[a].level[sub_level]) { + if (!sls) { + sls = sl_per_gr; + cg++; + if (cg >= xsl_cg_lim) + sls++; + } + last = avail[a].level[sub_level]; + sls--; + } + avail[a].level[ERTS_TOPOLOGY_CG] = cg; + a++; + } while (a < avail_sz && (level < 0 + || avail[a].level[level] == cgcp->id)); + + ASSERT(cgcp->cpu_groups == cg - *cgp); + + *cgp = cg; + + return a; +} + +static int +cg_count_sub_levels_compare(const void *vx, const void *vy) +{ + erts_cpu_groups_count_t *x = (erts_cpu_groups_count_t *) vx; + erts_cpu_groups_count_t *y = (erts_cpu_groups_count_t *) vy; + if (x->sub_levels != y->sub_levels) + return y->sub_levels - x->sub_levels; + return x->id - y->id; +} + +static int +cg_count_id_compare(const void *vx, const void *vy) +{ + erts_cpu_groups_count_t *x = (erts_cpu_groups_count_t *) vx; + erts_cpu_groups_count_t *y = (erts_cpu_groups_count_t *) vy; + return x->id - y->id; +} + +static void +make_cpu_groups_map(erts_cpu_groups_map_t *map, int test) +{ + int i, spread_level, avail_sz; + erts_avail_cput no, *avail; + erts_cpu_topology_t *cpudata; + ErtsAlcType_t alc_type = (test + ? ERTS_ALC_T_TMP + : ERTS_ALC_T_CPU_GRPS_MAP); + + if (map->array) + erts_free(alc_type, map->array); + + map->array = NULL; + map->logical_processors = 0; + map->size = 0; + + if (!map->groups) + return; + + create_tmp_cpu_topology_copy(&cpudata, &avail_sz); + + if (!cpudata) + return; + + cpu_bind_order_sort(cpudata, + avail_sz, + ERTS_CPU_BIND_NO_SPREAD, + 1); + + avail = erts_alloc(ERTS_ALC_T_TMP, + sizeof(erts_avail_cput)*avail_sz); + + make_available_cpu_topology(&no, avail, cpudata, + &avail_sz, test); + + destroy_tmp_cpu_topology_copy(cpudata); + + map->size = avail_sz*2+1; + + map->array = erts_alloc(alc_type, + (sizeof(erts_cpu_groups_map_array_t) + * map->size));; + map->logical_processors = avail_sz; + + for (i = 0; i < map->size; i++) { + map->array[i].logical = -1; + map->array[i].cpu_group = -1; + } + + spread_level = ERTS_TOPOLOGY_CORE; + for (i = ERTS_TOPOLOGY_NODE; i < ERTS_TOPOLOGY_THREAD; i++) { + if (no.level[i] > map->groups) { + spread_level = i; + break; + } + } + + if (no.level[spread_level] <= map->groups) { + int a, cg, last = -1; + cg = -1; + ASSERT(spread_level == ERTS_TOPOLOGY_CORE); + for (a = 0; a < avail_sz; a++) { + if (last != avail[a].level[spread_level]) { + cg++; + last = avail[a].level[spread_level]; + } + cpu_group_insert(map, + avail[a].level[ERTS_TOPOLOGY_LOGICAL], + cg); + } + } + else { /* map->groups < no.level[spread_level] */ + erts_cpu_groups_count_t *cg_count; + int a, cg, tl, toplevels; + + tl = spread_level-1; + + if (spread_level == ERTS_TOPOLOGY_NODE) + toplevels = 1; + else + toplevels = no.level[tl]; + + cg_count = erts_alloc(ERTS_ALC_T_TMP, + toplevels*sizeof(erts_cpu_groups_count_t)); + + if (toplevels == 1) { + cg_count[0].id = 0; + cg_count[0].sub_levels = no.level[spread_level]; + cg_count[0].cpu_groups = map->groups; + } + else { + int cgs_per_tl, cgs; + cgs = map->groups; + cgs_per_tl = cgs / toplevels; + + a = 0; + for (i = 0; i < toplevels; i++) { + cg_count[i].id = avail[a].level[tl]; + a = sub_levels(&cg_count[i], tl, a, avail_sz, avail); + } + + qsort(cg_count, + toplevels, + sizeof(erts_cpu_groups_count_t), + cg_count_sub_levels_compare); + + for (i = 0; i < toplevels; i++) { + if (cg_count[i].sub_levels < cgs_per_tl) { + cg_count[i].cpu_groups = cg_count[i].sub_levels; + cgs -= cg_count[i].sub_levels; + } + else { + cg_count[i].cpu_groups = cgs_per_tl; + cgs -= cgs_per_tl; + } + } + + while (cgs > 0) { + for (i = 0; i < toplevels; i++) { + if (cg_count[i].sub_levels == cg_count[i].cpu_groups) + break; + else { + cg_count[i].cpu_groups++; + if (--cgs == 0) + break; + } + } + } + + qsort(cg_count, + toplevels, + sizeof(erts_cpu_groups_count_t), + cg_count_id_compare); + } + + a = i = 0; + cg = -1; + while (a < avail_sz) { + a = write_cpu_groups(&cg, &cg_count[i], tl, + a, avail_sz, avail); + i++; + } + + ASSERT(map->groups == cg + 1); + + for (a = 0; a < avail_sz; a++) + cpu_group_insert(map, + avail[a].level[ERTS_TOPOLOGY_LOGICAL], + avail[a].level[ERTS_TOPOLOGY_CG]); + + erts_free(ERTS_ALC_T_TMP, cg_count); + } + + erts_free(ERTS_ALC_T_TMP, avail); +} + +static erts_cpu_groups_map_t * +add_cpu_groups(int groups, + erts_cpu_groups_callback_t callback, + void *arg) +{ + int use_groups = groups; + erts_cpu_groups_callback_list_t *cgcl; + erts_cpu_groups_map_t *cgm; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + if (use_groups > max_main_threads) + use_groups = max_main_threads; + + if (!use_groups) + return NULL; + + no_cpu_groups_callbacks++; + cgcl = erts_alloc(ERTS_ALC_T_CPU_GRPS_MAP, + sizeof(erts_cpu_groups_callback_list_t)); + cgcl->callback = callback; + cgcl->arg = arg; + + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + if (cgm->groups == use_groups) { + cgcl->next = cgm->callback_list; + cgm->callback_list = cgcl; + return cgm; + } + } + + + cgm = erts_alloc(ERTS_ALC_T_CPU_GRPS_MAP, + sizeof(erts_cpu_groups_map_t)); + cgm->next = cpu_groups_maps; + cgm->groups = use_groups; + cgm->array = NULL; + cgm->size = 0; + cgm->logical_processors = 0; + cgm->callback_list = cgcl; + + cgcl->next = NULL; + + make_cpu_groups_map(cgm, 0); + + cpu_groups_maps = cgm; + + return cgm; +} + +static void +remove_cpu_groups(erts_cpu_groups_callback_t callback, void *arg) +{ + erts_cpu_groups_map_t *prev_cgm, *cgm; + erts_cpu_groups_callback_list_t *prev_cgcl, *cgcl; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + no_cpu_groups_callbacks--; + + prev_cgm = NULL; + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + prev_cgcl = NULL; + for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) { + if (cgcl->callback == callback && cgcl->arg == arg) { + if (prev_cgcl) + prev_cgcl->next = cgcl->next; + else + cgm->callback_list = cgcl->next; + erts_free(ERTS_ALC_T_CPU_GRPS_MAP, cgcl); + if (!cgm->callback_list) { + if (prev_cgm) + prev_cgm->next = cgm->next; + else + cpu_groups_maps = cgm->next; + if (cgm->array) + erts_free(ERTS_ALC_T_CPU_GRPS_MAP, cgm->array); + erts_free(ERTS_ALC_T_CPU_GRPS_MAP, cgm); + } + return; + } + prev_cgcl = cgcl; + } + prev_cgm = cgm; + } + + erl_exit(ERTS_ABORT_EXIT, "Cpu groups not found\n"); +} + +static int +cpu_groups_lookup(erts_cpu_groups_map_t *map, + ErtsSchedulerData *esdp) +{ + int start, logical, ix; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx) + || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + if (esdp->cpu_id < 0) + return (((int) esdp->no) - 1) % map->groups; + + logical = esdp->cpu_id; + start = logical % map->size; + ix = start; + + do { + if (map->array[ix].logical == logical) { + int group = map->array[ix].cpu_group; + ASSERT(0 <= group && group < map->groups); + return group; + } + ix++; + if (ix == map->size) + ix = 0; + } while (ix != start); + + erl_exit(ERTS_ABORT_EXIT, "Logical cpu id %d not found\n", logical); +} + +static void +update_cpu_groups_maps(void) +{ + erts_cpu_groups_map_t *cgm; + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) + make_cpu_groups_map(cgm, 0); +} + +void +erts_add_cpu_groups(int groups, + erts_cpu_groups_callback_t callback, + void *arg) +{ + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + add_cpu_groups(groups, callback, arg); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); +} + +void erts_remove_cpu_groups(erts_cpu_groups_callback_t callback, + void *arg) +{ + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + remove_cpu_groups(callback, arg); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); +} diff --git a/erts/emulator/beam/erl_cpu_topology.h b/erts/emulator/beam/erl_cpu_topology.h new file mode 100644 index 0000000000..c5a9520b61 --- /dev/null +++ b/erts/emulator/beam/erl_cpu_topology.h @@ -0,0 +1,105 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2010. All Rights Reserved. + * + * The contents of this file are subject to the Erlang Public License, + * Version 1.1, (the "License"); you may not use this file except in + * compliance with the License. You should have received a copy of the + * Erlang Public License along with this software. If not, it can be + * retrieved online at http://www.erlang.org/. + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * %CopyrightEnd% + */ + +/* + * Description: CPU topology and related functionality + * + * Author: Rickard Green + */ + +#ifndef ERL_CPU_TOPOLOGY_H__ +#define ERL_CPU_TOPOLOGY_H__ + +void erts_pre_early_init_cpu_topology(int *max_rg_p, + int *conf_p, + int *onln_p, + int *avail_p); +void erts_early_init_cpu_topology(int no_schedulers, + int *max_main_threads_p, + int max_reader_groups, + int *reader_groups_p); +void erts_init_cpu_topology(void); + + +#define ERTS_INIT_SCHED_BIND_TYPE_SUCCESS 0 +#define ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED 1 +#define ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY 2 +#define ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE 3 + +int erts_init_scheduler_bind_type_string(char *how); + + +#define ERTS_INIT_CPU_TOPOLOGY_OK 0 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID 1 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE 2 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY 3 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE 4 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES 5 +#define ERTS_INIT_CPU_TOPOLOGY_MISSING_LID 6 +#define ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS 7 +#define ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES 8 +#define ERTS_INIT_CPU_TOPOLOGY_MISSING 9 + +int erts_init_cpu_topology_string(char *topology_str); + +void erts_sched_check_cpu_bind(ErtsSchedulerData *esdp); +#ifdef ERTS_SMP +void erts_sched_init_check_cpu_bind(ErtsSchedulerData *esdp); +void erts_sched_check_cpu_bind_prep_suspend(ErtsSchedulerData *esdp); +void erts_sched_check_cpu_bind_post_suspend(ErtsSchedulerData *esdp); +#endif + +int erts_update_cpu_info(void); + +Eterm erts_bind_schedulers(Process *c_p, Eterm how); +Eterm erts_get_schedulers_binds(Process *c_p); + +Eterm erts_get_reader_groups_map(Process *c_p); + +Eterm erts_set_cpu_topology(Process *c_p, Eterm term); +Eterm erts_get_cpu_topology_term(Process *c_p, Eterm which); + +int erts_update_cpu_info(void); +void erts_get_logical_processors(int *conf, int *onln, int *avail); + +int erts_sched_bind_atthrcreate_prepare(void); +int erts_sched_bind_atthrcreate_child(int unbind); +void erts_sched_bind_atthrcreate_parent(int unbind); + +int erts_sched_bind_atfork_prepare(void); +int erts_sched_bind_atfork_child(int unbind); +char *erts_sched_bind_atvfork_child(int unbind); +void erts_sched_bind_atfork_parent(int unbind); + +Eterm erts_fake_scheduler_bindings(Process *p, Eterm how); +Eterm erts_debug_cpu_groups_map(Process *c_p, int groups); + + +typedef void (*erts_cpu_groups_callback_t)(int, + ErtsSchedulerData *, + int, + void *); + +void erts_add_cpu_groups(int groups, + erts_cpu_groups_callback_t callback, + void *arg); +void erts_remove_cpu_groups(erts_cpu_groups_callback_t callback, + void *arg); + +#endif diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c index 52d5f86ee0..1c2c0fe4cb 100644 --- a/erts/emulator/beam/erl_db.c +++ b/erts/emulator/beam/erl_db.c @@ -2754,13 +2754,13 @@ void init_db(void) (sizeof(erts_meta_main_tab_lock_t) * (ERTS_META_MAIN_TAB_LOCK_TAB_SIZE+1))); - if ((((Uint) meta_main_tab_locks) & ERTS_CACHE_LINE_MASK) != 0) + if ((((UWord) meta_main_tab_locks) & ERTS_CACHE_LINE_MASK) != 0) meta_main_tab_locks = ((erts_meta_main_tab_lock_t *) - ((((Uint) meta_main_tab_locks) + ((((UWord) meta_main_tab_locks) & ~ERTS_CACHE_LINE_MASK) + ERTS_CACHE_LINE_SIZE)); - ASSERT((((Uint) meta_main_tab_locks) & ERTS_CACHE_LINE_MASK) == 0); + ASSERT((((UWord) meta_main_tab_locks) & ERTS_CACHE_LINE_MASK) == 0); for (i = 0; i < ERTS_META_MAIN_TAB_LOCK_TAB_SIZE; i++) { erts_smp_rwmtx_init_opt_x(&meta_main_tab_locks[i].rwmtx, &rwmtx_opt, diff --git a/erts/emulator/beam/erl_drv_thread.c b/erts/emulator/beam/erl_drv_thread.c index d42820ddf3..17b08a71d4 100644 --- a/erts/emulator/beam/erl_drv_thread.c +++ b/erts/emulator/beam/erl_drv_thread.c @@ -528,7 +528,7 @@ erl_drv_tsd_get(ErlDrvTSDKey key) if (!dtid) return NULL; #endif - if (ERL_DRV_TSD_LEN__ < key) + if (ERL_DRV_TSD_LEN__ <= key) return NULL; return ERL_DRV_TSD__[key]; } diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index 4ae656a3ad..a2fd5921a2 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -41,6 +41,7 @@ #include "erl_printf_term.h" #include "erl_misc_utils.h" #include "packet_parser.h" +#include "erl_cpu_topology.h" #ifdef HIPE #include "hipe_mode_switch.h" /* for hipe_mode_switch_init() */ @@ -63,6 +64,8 @@ extern void ConNormalExit(void); extern void ConWaitForExit(void); #endif +static void erl_init(int ncpu); + #define ERTS_MIN_COMPAT_REL 7 #ifdef ERTS_SMP @@ -76,9 +79,6 @@ int erts_initialized = 0; static erts_tid_t main_thread; #endif -erts_cpu_info_t *erts_cpuinfo; - -int erts_reader_groups; int erts_use_sender_punish; /* @@ -111,7 +111,6 @@ int erts_compat_rel; static int use_multi_run_queue; static int no_schedulers; static int no_schedulers_online; -static int max_reader_groups; #ifdef DEBUG Uint32 verbose; /* See erl_debug.h for information about verbose */ @@ -230,18 +229,18 @@ void erl_error(char *fmt, va_list args) erts_vfprintf(stderr, fmt, args); } -static void early_init(int *argc, char **argv); +static int early_init(int *argc, char **argv); void erts_short_init(void) { - early_init(NULL, NULL); - erl_init(); + int ncpu = early_init(NULL, NULL); + erl_init(ncpu); erts_initialized = 1; } -void -erl_init(void) +static void +erl_init(int ncpu) { init_benchmarking(); @@ -252,11 +251,11 @@ erl_init(void) erts_init_monitors(); erts_init_gc(); init_time(); - erts_init_process(); + erts_init_process(ncpu); erts_init_scheduling(use_multi_run_queue, no_schedulers, no_schedulers_online); - + erts_init_cpu_topology(); /* Must be after init_scheduling */ H_MIN_SIZE = erts_next_heap_size(H_MIN_SIZE, 0); BIN_VH_MIN_SIZE = erts_next_heap_size(BIN_VH_MIN_SIZE, 0); @@ -535,7 +534,8 @@ void erts_usage(void) erts_fprintf(stderr, "-W<i|w> set error logger warnings mapping,\n"); erts_fprintf(stderr, " see error_logger documentation for details\n"); - + erts_fprintf(stderr, "-zdbbl size set the distribution buffer busy limit in kilobytes\n"); + erts_fprintf(stderr, " valid range is [1-%d]\n", INT_MAX/1024); erts_fprintf(stderr, "\n"); erts_fprintf(stderr, "Note that if the emulator is started with erlexec (typically\n"); erts_fprintf(stderr, "from the erl script), these flags should be specified with +.\n"); @@ -587,7 +587,7 @@ static void ethr_ll_free(void *ptr) #endif -static void +static int early_init(int *argc, char **argv) /* * Only put things here which are * really important initialize @@ -600,6 +600,10 @@ early_init(int *argc, char **argv) /* int ncpuavail; int schdlrs; int schdlrs_onln; + int max_main_threads; + int max_reader_groups; + int reader_groups; + use_multi_run_queue = 1; erts_printf_eterm_func = erts_printf_term; erts_disable_tolerant_timeofday = 0; @@ -615,13 +619,11 @@ early_init(int *argc, char **argv) /* erts_use_sender_punish = 1; - erts_cpuinfo = erts_cpu_info_create(); - -#ifdef ERTS_SMP - ncpu = erts_get_cpu_configured(erts_cpuinfo); - ncpuonln = erts_get_cpu_online(erts_cpuinfo); - ncpuavail = erts_get_cpu_available(erts_cpuinfo); -#else + erts_pre_early_init_cpu_topology(&max_reader_groups, + &ncpu, + &ncpuonln, + &ncpuavail); +#ifndef ERTS_SMP ncpu = 1; ncpuonln = 1; ncpuavail = 1; @@ -664,15 +666,9 @@ early_init(int *argc, char **argv) /* ? ncpuavail : (ncpuonln > 0 ? ncpuonln : no_schedulers)); -#ifdef ERTS_SMP - erts_max_main_threads = no_schedulers_online; -#endif - schdlrs = no_schedulers; schdlrs_onln = no_schedulers_online; - max_reader_groups = ERTS_MAX_READER_GROUPS; - if (argc && argv) { int i = 1; while (i < *argc) { @@ -768,9 +764,13 @@ early_init(int *argc, char **argv) /* erts_alloc_init(argc, argv, &alloc_opts); /* Handles (and removes) -M flags. */ - - erts_early_init_scheduling(); /* Require allocators */ - erts_init_utils(); /* Require allocators */ + /* Require allocators */ + erts_early_init_scheduling(); + erts_init_utils(); + erts_early_init_cpu_topology(no_schedulers, + &max_main_threads, + max_reader_groups, + &reader_groups); #ifdef USE_THREADS { @@ -784,24 +784,13 @@ early_init(int *argc, char **argv) /* elid.mem.ll.alloc = ethr_ll_alloc; elid.mem.ll.realloc = ethr_ll_realloc; elid.mem.ll.free = ethr_ll_free; - -#ifdef ERTS_SMP - elid.main_threads = erts_max_main_threads; -#else - elid.main_threads = 1; -#endif - elid.reader_groups = (elid.main_threads > 1 - ? elid.main_threads - : 0); - if (max_reader_groups <= 1) - elid.reader_groups = 0; - if (elid.reader_groups > max_reader_groups) - elid.reader_groups = max_reader_groups; - erts_reader_groups = elid.reader_groups; + elid.main_threads = max_main_threads; + elid.reader_groups = reader_groups; erts_thr_late_init(&elid); } #endif + #ifdef ERTS_ENABLE_LOCK_CHECK erts_lc_late_init(); #endif @@ -818,7 +807,9 @@ early_init(int *argc, char **argv) /* erl_sys_args(argc, argv); erts_ets_realloc_always_moves = 0; + erts_dist_buf_busy_limit = ERTS_DE_BUSY_LIMIT; + return ncpu; } #ifndef ERTS_SMP @@ -852,8 +843,7 @@ erl_start(int argc, char **argv) char envbuf[21]; /* enough for any 64-bit integer */ size_t envbufsz; int async_max_threads = erts_async_max_threads; - - early_init(&argc, argv); + int ncpu = early_init(&argc, argv); envbufsz = sizeof(envbuf); if (erts_sys_getenv(ERL_MAX_ETS_TABLES_ENV, envbuf, &envbufsz) == 0) @@ -1110,7 +1100,7 @@ erl_start(int argc, char **argv) char *sub_param = argv[i]+2; if (has_prefix("bt", sub_param)) { arg = get_arg(sub_param+2, argv[i+1], &i); - res = erts_init_scheduler_bind_type(arg); + res = erts_init_scheduler_bind_type_string(arg); if (res != ERTS_INIT_SCHED_BIND_TYPE_SUCCESS) { switch (res) { case ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED: @@ -1135,7 +1125,7 @@ erl_start(int argc, char **argv) } else if (has_prefix("ct", sub_param)) { arg = get_arg(sub_param+2, argv[i+1], &i); - res = erts_init_cpu_topology(arg); + res = erts_init_cpu_topology_string(arg); if (res != ERTS_INIT_CPU_TOPOLOGY_OK) { switch (res) { case ERTS_INIT_CPU_TOPOLOGY_INVALID_ID: @@ -1346,6 +1336,26 @@ erl_start(int argc, char **argv) } break; + case 'z': { + char *sub_param = argv[i]+2; + int new_limit; + + if (has_prefix("dbbl", sub_param)) { + arg = get_arg(sub_param+4, argv[i+1], &i); + new_limit = atoi(arg); + if (new_limit < 1 || INT_MAX/1024 < new_limit) { + erts_fprintf(stderr, "Invalid dbbl limit: %d\n", new_limit); + erts_usage(); + } else { + erts_dist_buf_busy_limit = new_limit*1024; + } + } else { + erts_fprintf(stderr, "bad -z option %s\n", argv[i]); + erts_usage(); + } + break; + } + default: erts_fprintf(stderr, "%s unknown flag %s\n", argv[0], argv[i]); erts_usage(); @@ -1386,7 +1396,7 @@ erl_start(int argc, char **argv) boot_argc = argc - i; /* Number of arguments to init */ boot_argv = &argv[i]; - erl_init(); + erl_init(ncpu); init_shared_memory(boot_argc, boot_argv); load_preloaded(); diff --git a/erts/emulator/beam/erl_lock_check.c b/erts/emulator/beam/erl_lock_check.c index d6138fa4e4..04c7dbd2ec 100644 --- a/erts/emulator/beam/erl_lock_check.c +++ b/erts/emulator/beam/erl_lock_check.c @@ -128,8 +128,8 @@ static erts_lc_lock_order_t erts_lock_order[] = { { "removed_fd_pre_alloc_lock", NULL }, { "state_prealloc", NULL }, { "schdlr_sspnd", NULL }, - { "cpu_bind", NULL }, { "run_queue", "address" }, + { "cpu_info", NULL }, { "pollset", "address" }, #ifdef __WIN32__ { "pollwaiter", "address" }, diff --git a/erts/emulator/beam/erl_node_tables.c b/erts/emulator/beam/erl_node_tables.c index d0b08bf72e..8cdda395df 100644 --- a/erts/emulator/beam/erl_node_tables.c +++ b/erts/emulator/beam/erl_node_tables.c @@ -107,7 +107,7 @@ dist_table_alloc(void *dep_tmpl) dep->nlinks = NULL; dep->monitors = NULL; - erts_smp_spinlock_init_x(&dep->qlock, "dist_entry_out_queue", chnl_nr); + erts_smp_mtx_init_x(&dep->qlock, "dist_entry_out_queue", chnl_nr); dep->qflgs = 0; dep->qsize = 0; dep->out_queue.first = NULL; @@ -172,7 +172,7 @@ dist_table_free(void *vdep) ASSERT(!dep->cache); erts_smp_rwmtx_destroy(&dep->rwmtx); erts_smp_mtx_destroy(&dep->lnk_mtx); - erts_smp_spinlock_destroy(&dep->qlock); + erts_smp_mtx_destroy(&dep->qlock); #ifdef DEBUG sys_memset(vdep, 0x77, sizeof(DistEntry)); @@ -755,9 +755,9 @@ void erts_init_node_tables(void) erts_this_dist_entry->nlinks = NULL; erts_this_dist_entry->monitors = NULL; - erts_smp_spinlock_init_x(&erts_this_dist_entry->qlock, - "dist_entry_out_queue", - make_small(ERST_INTERNAL_CHANNEL_NO)); + erts_smp_mtx_init_x(&erts_this_dist_entry->qlock, + "dist_entry_out_queue", + make_small(ERST_INTERNAL_CHANNEL_NO)); erts_this_dist_entry->qflgs = 0; erts_this_dist_entry->qsize = 0; erts_this_dist_entry->out_queue.first = NULL; diff --git a/erts/emulator/beam/erl_node_tables.h b/erts/emulator/beam/erl_node_tables.h index eb759b87e9..b0a63ae035 100644 --- a/erts/emulator/beam/erl_node_tables.h +++ b/erts/emulator/beam/erl_node_tables.h @@ -131,7 +131,7 @@ typedef struct dist_entry_ { ErtsLink *nlinks; /* Link tree with subtrees */ ErtsMonitor *monitors; /* Monitor tree */ - erts_smp_spinlock_t qlock; /* Protects qflgs and out_queue */ + erts_smp_mtx_t qlock; /* Protects qflgs and out_queue */ Uint32 qflgs; Sint qsize; ErtsDistOutputQueue out_queue; diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c index 901167a315..f252c2cbe2 100644 --- a/erts/emulator/beam/erl_process.c +++ b/erts/emulator/beam/erl_process.c @@ -24,7 +24,6 @@ #endif #include <stddef.h> /* offsetof() */ -#include <ctype.h> #include "sys.h" #include "erl_vm.h" #include "global.h" @@ -39,6 +38,7 @@ #include "erl_threads.h" #include "erl_binary.h" #include "beam_bp.h" +#include "erl_cpu_topology.h" #define ERTS_RUNQ_CHECK_BALANCE_REDS_PER_SCHED (2000*CONTEXT_REDS) #define ERTS_RUNQ_CALL_CHECK_BALANCE_REDS \ @@ -63,8 +63,6 @@ #define ERTS_WAKEUP_OTHER_DEC 10 #define ERTS_WAKEUP_OTHER_FIXED_INC (CONTEXT_REDS/10) -#define ERTS_MAX_CPU_TOPOLOGY_ID ((int) 0xffff) - #if 0 || defined(DEBUG) #define ERTS_FAKE_SCHED_BIND_PRINT_SORTED_CPU_DATA #endif @@ -119,10 +117,6 @@ Uint erts_process_tab_index_mask; static int wakeup_other_limit; -#ifdef ERTS_SMP -Uint erts_max_main_threads; -#endif - int erts_sched_thread_suggested_stack_size = -1; #ifdef ERTS_ENABLE_LOCK_CHECK @@ -195,48 +189,6 @@ do { \ #endif -/* - * Cpu topology hierarchy. - */ -#define ERTS_TOPOLOGY_NODE 0 -#define ERTS_TOPOLOGY_PROCESSOR 1 -#define ERTS_TOPOLOGY_PROCESSOR_NODE 2 -#define ERTS_TOPOLOGY_CORE 3 -#define ERTS_TOPOLOGY_THREAD 4 -#define ERTS_TOPOLOGY_LOGICAL 5 - -#define ERTS_TOPOLOGY_MAX_DEPTH 6 - -typedef struct { - int bind_id; - int bound_id; -} ErtsCpuBindData; - -static ErtsCpuBindData *scheduler2cpu_map; -erts_smp_rwmtx_t erts_cpu_bind_rwmtx; - -typedef enum { - ERTS_CPU_BIND_UNDEFINED, - ERTS_CPU_BIND_SPREAD, - ERTS_CPU_BIND_PROCESSOR_SPREAD, - ERTS_CPU_BIND_THREAD_SPREAD, - ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD, - ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD, - ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD, - ERTS_CPU_BIND_NO_SPREAD, - ERTS_CPU_BIND_NONE -} ErtsCpuBindOrder; - -#define ERTS_CPU_BIND_DEFAULT_BIND \ - ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD - -ErtsCpuBindOrder cpu_bind_order; - -static erts_cpu_topology_t *user_cpudata; -static int user_cpudata_size; -static erts_cpu_topology_t *system_cpudata; -static int system_cpudata_size; - erts_sched_stat_t erts_sched_stat; ErtsRunQueue *erts_common_run_queue; @@ -259,11 +211,6 @@ ErtsSchedulerData *erts_scheduler_data; ErtsAlignedRunQueue *erts_aligned_run_queues; Uint erts_no_run_queues; -typedef union { - ErtsSchedulerData esd; - char align[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsSchedulerData))]; -} ErtsAlignedSchedulerData; - ErtsAlignedSchedulerData *erts_aligned_scheduler_data; #ifdef ERTS_SMP @@ -334,12 +281,6 @@ ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(proclist, 200, ERTS_ALC_T_PROC_LIST) -#define ERTS_RUNQ_IX(IX) \ - (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_run_queues), \ - &erts_aligned_run_queues[(IX)].runq) -#define ERTS_SCHEDULER_IX(IX) \ - (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_schedulers), \ - &erts_aligned_scheduler_data[(IX)].esd) #define ERTS_SCHED_SLEEP_INFO_IX(IX) \ (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_schedulers), \ &aligned_sched_sleep_info[(IX)].ssi) @@ -398,22 +339,8 @@ static int stack_element_dump(int to, void *to_arg, Process* p, Eterm* sp, #ifdef ERTS_SMP static void handle_pending_exiters(ErtsProcList *); -static void cpu_bind_order_sort(erts_cpu_topology_t *cpudata, - int size, - ErtsCpuBindOrder bind_order, - int mk_seq); -static void signal_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size); - #endif -static int reader_group_lookup(int logical); -static void create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, - int *cpudata_size); -static void destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata); - -static void early_cpu_bind_init(void); -static void late_cpu_bind_init(void); - #if defined(ERTS_SMP) && defined(ERTS_ENABLE_LOCK_CHECK) int erts_smp_lc_runq_is_locked(ErtsRunQueue *runq) @@ -469,13 +396,13 @@ erts_pre_init_process(void) /* initialize the scheduler */ void -erts_init_process(void) +erts_init_process(int ncpu) { Uint proc_bits = ERTS_PROC_BITS; #ifdef ERTS_SMP erts_disable_proc_not_running_opt = 0; - erts_init_proc_lock(); + erts_init_proc_lock(ncpu); #endif init_proclist_alloc(); @@ -1060,6 +987,8 @@ scheduler_wait(long *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq) sys_poll_aux_work: + ASSERT(!erts_port_task_have_outstanding_io_tasks()); + erl_sys_schedule(1); /* Might give us something to do */ dt = do_time_read_and_reset(); @@ -1155,6 +1084,8 @@ scheduler_wait(long *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq) erts_smp_runq_unlock(rq); + ASSERT(!erts_port_task_have_outstanding_io_tasks()); + erl_sys_schedule(0); dt = do_time_read_and_reset(); @@ -1242,7 +1173,7 @@ wake_scheduler(ErtsRunQueue *rq, int incq, int one) do { ErtsSchedulerSleepInfo *wake_ssi = ssi; ssi = ssi->next; - erts_sched_finish_poke(ssi, ssi_flags_set_wake(wake_ssi)); + erts_sched_finish_poke(wake_ssi, ssi_flags_set_wake(wake_ssi)); } while (ssi); } } @@ -1335,6 +1266,31 @@ erts_smp_notify_inc_runq(ErtsRunQueue *runq) smp_notify_inc_runq(runq); } +void +erts_sched_notify_check_cpu_bind(void) +{ +#ifdef ERTS_SMP + int ix; + if (erts_common_run_queue) { + for (ix = 0; ix < erts_no_schedulers; ix++) + erts_smp_atomic_set(&ERTS_SCHEDULER_IX(ix)->chk_cpu_bind, 1); + wake_all_schedulers(); + } + else { + for (ix = 0; ix < erts_no_run_queues; ix++) { + ErtsRunQueue *rq = ERTS_RUNQ_IX(ix); + erts_smp_runq_lock(rq); + rq->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; + erts_smp_runq_unlock(rq); + wake_scheduler(rq, 0, 1); + }; + } +#else + erts_sched_check_cpu_bind(erts_get_scheduler_data()); +#endif +} + + #ifdef ERTS_SMP ErtsRunQueue * @@ -2379,7 +2335,6 @@ erts_debug_nbalance(void) void erts_early_init_scheduling(void) { - early_cpu_bind_init(); wakeup_other_limit = ERTS_WAKEUP_OTHER_LIMIT_MEDIUM; } @@ -2528,9 +2483,9 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) aligned_sched_sleep_info = erts_alloc(ERTS_ALC_T_SCHDLR_SLP_INFO, (sizeof(ErtsAlignedSchedulerSleepInfo) *(n+1))); - if ((((Uint) aligned_sched_sleep_info) & ERTS_CACHE_LINE_MASK) == 0) + if ((((UWord) aligned_sched_sleep_info) & ERTS_CACHE_LINE_MASK) == 0) aligned_sched_sleep_info = ((ErtsAlignedSchedulerSleepInfo *) - ((((Uint) aligned_sched_sleep_info) + ((((UWord) aligned_sched_sleep_info) & ~ERTS_CACHE_LINE_MASK) + ERTS_CACHE_LINE_SIZE)); for (ix = 0; ix < n; ix++) { @@ -2656,8 +2611,6 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) /* init port tasks */ erts_port_task_init(); - - late_cpu_bind_init(); } ErtsRunQueue * @@ -2883,12 +2836,10 @@ suspend_scheduler(ErtsSchedulerData *esdp) long flgs; int changing; long no = (long) esdp->no; - ErtsRunQueue *rq = esdp->run_queue; ErtsSchedulerSleepInfo *ssi = esdp->ssi; long active_schedulers; int curr_online = 1; int wake = 0; - int reset_read_group = 0; #if defined(ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK) \ || defined(ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK) long aux_work; @@ -2909,20 +2860,7 @@ suspend_scheduler(ErtsSchedulerData *esdp) erts_smp_runq_unlock(esdp->run_queue); - /* Unbind from cpu */ - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - if (scheduler2cpu_map[esdp->no].bound_id >= 0 - && erts_unbind_from_cpu(erts_cpuinfo) == 0) { - esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; - reset_read_group = 1; - } - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - - if (reset_read_group) - erts_smp_rwmtx_set_reader_group(0); - - if (esdp->no <= erts_max_main_threads) - erts_thr_set_main_status(0, 0); + erts_sched_check_cpu_bind_prep_suspend(esdp); if (erts_system_profile_flags.scheduler) profile_scheduler(make_small(esdp->no), am_inactive); @@ -3056,17 +2994,10 @@ suspend_scheduler(ErtsSchedulerData *esdp) if (erts_system_profile_flags.scheduler) profile_scheduler(make_small(esdp->no), am_active); - if (esdp->no <= erts_max_main_threads) - erts_thr_set_main_status(1, (int) esdp->no); - erts_smp_runq_lock(esdp->run_queue); non_empty_runq(esdp->run_queue); - /* Make sure we check if we should bind to a cpu or not... */ - if (rq->flags & ERTS_RUNQ_FLG_SHARED_RUNQ) - erts_smp_atomic_set(&esdp->chk_cpu_bind, 1); - else - rq->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; + erts_sched_check_cpu_bind_post_suspend(esdp); } #define ERTS_RUNQ_RESET_SUSPEND_INFO(RQ, DBG_ID) \ @@ -3583,15 +3514,7 @@ sched_thread_func(void *vesdp) erts_tsd_set(sched_data_key, vesdp); #ifdef ERTS_SMP - if (no <= erts_max_main_threads) { - erts_thr_set_main_status(1, (int) no); - if (erts_reader_groups) { - int rg = (int) no; - if (rg > erts_reader_groups) - rg = (((int) no) - 1) % erts_reader_groups + 1; - erts_smp_rwmtx_set_reader_group(rg); - } - } + erts_sched_init_check_cpu_bind((ErtsSchedulerData *) vesdp); erts_proc_lock_prepare_proc_lock_waiter(); ERTS_SCHED_SLEEP_INFO_IX(no - 1)->event = erts_tse_fetch(); @@ -3693,1907 +3616,6 @@ erts_start_schedulers(void) #endif /* ERTS_SMP */ -static int -int_cmp(const void *vx, const void *vy) -{ - return *((int *) vx) - *((int *) vy); -} - -static int -cpu_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->core != y->core) - return x->core - y->core; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->node != y->node) - return x->node - y->node; - return 0; -} - -static int -cpu_processor_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->core != y->core) - return x->core - y->core; - if (x->node != y->node) - return x->node - y->node; - if (x->processor != y->processor) - return x->processor - y->processor; - return 0; -} - -static int -cpu_thread_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->node != y->node) - return x->node - y->node; - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->core != y->core) - return x->core - y->core; - return 0; -} - -static int -cpu_thread_no_node_processor_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->node != y->node) - return x->node - y->node; - if (x->core != y->core) - return x->core - y->core; - if (x->processor != y->processor) - return x->processor - y->processor; - return 0; -} - -static int -cpu_no_node_processor_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->node != y->node) - return x->node - y->node; - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->core != y->core) - return x->core - y->core; - if (x->processor != y->processor) - return x->processor - y->processor; - return 0; -} - -static int -cpu_no_node_thread_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->node != y->node) - return x->node - y->node; - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->core != y->core) - return x->core - y->core; - return 0; -} - -static int -cpu_no_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->node != y->node) - return x->node - y->node; - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->core != y->core) - return x->core - y->core; - if (x->thread != y->thread) - return x->thread - y->thread; - return 0; -} - -static ERTS_INLINE void -make_cpudata_id_seq(erts_cpu_topology_t *cpudata, int size, int no_node) -{ - int ix; - int node = -1; - int processor = -1; - int processor_node = -1; - int processor_node_node = -1; - int core = -1; - int thread = -1; - int old_node = -1; - int old_processor = -1; - int old_processor_node = -1; - int old_core = -1; - int old_thread = -1; - - for (ix = 0; ix < size; ix++) { - if (!no_node || cpudata[ix].node >= 0) { - if (old_node == cpudata[ix].node) - cpudata[ix].node = node; - else { - old_node = cpudata[ix].node; - old_processor = processor = -1; - if (!no_node) - old_processor_node = processor_node = -1; - old_core = core = -1; - old_thread = thread = -1; - if (no_node || cpudata[ix].node >= 0) - cpudata[ix].node = ++node; - } - } - if (old_processor == cpudata[ix].processor) - cpudata[ix].processor = processor; - else { - old_processor = cpudata[ix].processor; - if (!no_node) - processor_node_node = old_processor_node = processor_node = -1; - old_core = core = -1; - old_thread = thread = -1; - cpudata[ix].processor = ++processor; - } - if (no_node && cpudata[ix].processor_node < 0) - old_processor_node = -1; - else { - if (old_processor_node == cpudata[ix].processor_node) { - if (no_node) - cpudata[ix].node = cpudata[ix].processor_node = node; - else { - if (processor_node_node >= 0) - cpudata[ix].node = processor_node_node; - cpudata[ix].processor_node = processor_node; - } - } - else { - old_processor_node = cpudata[ix].processor_node; - old_core = core = -1; - old_thread = thread = -1; - if (no_node) - cpudata[ix].node = cpudata[ix].processor_node = ++node; - else { - cpudata[ix].node = processor_node_node = ++node; - cpudata[ix].processor_node = ++processor_node; - } - } - } - if (!no_node && cpudata[ix].processor_node < 0) - cpudata[ix].processor_node = 0; - if (old_core == cpudata[ix].core) - cpudata[ix].core = core; - else { - old_core = cpudata[ix].core; - old_thread = thread = -1; - cpudata[ix].core = ++core; - } - if (old_thread == cpudata[ix].thread) - cpudata[ix].thread = thread; - else - old_thread = cpudata[ix].thread = ++thread; - } -} - -static void -cpu_bind_order_sort(erts_cpu_topology_t *cpudata, - int size, - ErtsCpuBindOrder bind_order, - int mk_seq) -{ - if (size > 1) { - int no_node = 0; - int (*cmp_func)(const void *, const void *); - switch (bind_order) { - case ERTS_CPU_BIND_SPREAD: - cmp_func = cpu_spread_order_cmp; - break; - case ERTS_CPU_BIND_PROCESSOR_SPREAD: - cmp_func = cpu_processor_spread_order_cmp; - break; - case ERTS_CPU_BIND_THREAD_SPREAD: - cmp_func = cpu_thread_spread_order_cmp; - break; - case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: - no_node = 1; - cmp_func = cpu_thread_no_node_processor_spread_order_cmp; - break; - case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: - no_node = 1; - cmp_func = cpu_no_node_processor_spread_order_cmp; - break; - case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: - no_node = 1; - cmp_func = cpu_no_node_thread_spread_order_cmp; - break; - case ERTS_CPU_BIND_NO_SPREAD: - cmp_func = cpu_no_spread_order_cmp; - break; - default: - cmp_func = NULL; - erl_exit(ERTS_ABORT_EXIT, - "Bad cpu bind type: %d\n", - (int) cpu_bind_order); - break; - } - - if (mk_seq) - make_cpudata_id_seq(cpudata, size, no_node); - - qsort(cpudata, size, sizeof(erts_cpu_topology_t), cmp_func); - } -} - -static int -processor_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->node != y->node) - return x->node - y->node; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->core != y->core) - return x->core - y->core; - if (x->thread != y->thread) - return x->thread - y->thread; - return 0; -} - -static void -check_cpu_bind(ErtsSchedulerData *esdp) -{ - int rg = 0; - int res; - int cpu_id; - erts_smp_runq_unlock(esdp->run_queue); - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - cpu_id = scheduler2cpu_map[esdp->no].bind_id; - if (cpu_id >= 0 && cpu_id != scheduler2cpu_map[esdp->no].bound_id) { - res = erts_bind_to_cpu(erts_cpuinfo, cpu_id); - if (res == 0) - esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = cpu_id; - else { - erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); - erts_dsprintf(dsbufp, "Scheduler %d failed to bind to cpu %d: %s\n", - (int) esdp->no, cpu_id, erl_errno_id(-res)); - erts_send_error_to_logger_nogl(dsbufp); - if (scheduler2cpu_map[esdp->no].bound_id >= 0) - goto unbind; - } - } - else if (cpu_id < 0) { - unbind: - /* Get rid of old binding */ - res = erts_unbind_from_cpu(erts_cpuinfo); - if (res == 0) - esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; - else if (res != -ENOTSUP) { - erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); - erts_dsprintf(dsbufp, "Scheduler %d failed to unbind from cpu %d: %s\n", - (int) esdp->no, cpu_id, erl_errno_id(-res)); - erts_send_error_to_logger_nogl(dsbufp); - } - } - if (erts_reader_groups) { - if (esdp->cpu_id >= 0) - rg = reader_group_lookup(esdp->cpu_id); - else - rg = (((int) esdp->no) - 1) % erts_reader_groups + 1; - } - erts_smp_runq_lock(esdp->run_queue); -#ifdef ERTS_SMP - if (erts_common_run_queue) - erts_smp_atomic_set(&esdp->chk_cpu_bind, 0); - else { - esdp->run_queue->flags &= ~ERTS_RUNQ_FLG_CHK_CPU_BIND; - } -#endif - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - - if (erts_reader_groups) - erts_smp_rwmtx_set_reader_group(rg); -} - -static void -signal_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size) -{ - int s_ix = 1; - int cpu_ix; - - if (cpu_bind_order != ERTS_CPU_BIND_NONE && size) { - - cpu_bind_order_sort(cpudata, size, cpu_bind_order, 1); - - for (cpu_ix = 0; cpu_ix < size && cpu_ix < erts_no_schedulers; cpu_ix++) - if (erts_is_cpu_available(erts_cpuinfo, cpudata[cpu_ix].logical)) - scheduler2cpu_map[s_ix++].bind_id = cpudata[cpu_ix].logical; - } - - if (s_ix <= erts_no_schedulers) - for (; s_ix <= erts_no_schedulers; s_ix++) - scheduler2cpu_map[s_ix].bind_id = -1; - -#ifdef ERTS_SMP - if (erts_common_run_queue) { - for (s_ix = 0; s_ix < erts_no_schedulers; s_ix++) - erts_smp_atomic_set(&ERTS_SCHEDULER_IX(s_ix)->chk_cpu_bind, 1); - wake_all_schedulers(); - } - else { - for (s_ix = 0; s_ix < erts_no_run_queues; s_ix++) { - ErtsRunQueue *rq = ERTS_RUNQ_IX(s_ix); - erts_smp_runq_lock(rq); - rq->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; - erts_smp_runq_unlock(rq); - wake_scheduler(rq, 0, 1); - }; - } -#else - check_cpu_bind(erts_get_scheduler_data()); -#endif -} - -int -erts_init_scheduler_bind_type(char *how) -{ - if (erts_bind_to_cpu(erts_cpuinfo, -1) == -ENOTSUP) - return ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED; - - if (!system_cpudata && !user_cpudata) - return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY; - - if (sys_strcmp(how, "db") == 0) - cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; - else if (sys_strcmp(how, "s") == 0) - cpu_bind_order = ERTS_CPU_BIND_SPREAD; - else if (sys_strcmp(how, "ps") == 0) - cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; - else if (sys_strcmp(how, "ts") == 0) - cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; - else if (sys_strcmp(how, "tnnps") == 0) - cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; - else if (sys_strcmp(how, "nnps") == 0) - cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; - else if (sys_strcmp(how, "nnts") == 0) - cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; - else if (sys_strcmp(how, "ns") == 0) - cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; - else if (sys_strcmp(how, "u") == 0) - cpu_bind_order = ERTS_CPU_BIND_NONE; - else - return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE; - - return ERTS_INIT_SCHED_BIND_TYPE_SUCCESS; -} - -/* - * reader groups map - */ - -typedef struct { - int level[ERTS_TOPOLOGY_MAX_DEPTH+1]; -} erts_avail_cput; - -typedef struct { - int *map; - int size; - int groups; -} erts_reader_groups_map_test; - -typedef struct { - int id; - int sub_levels; - int reader_groups; -} erts_rg_count_t; - -typedef struct { - int logical; - int reader_group; -} erts_reader_groups_map_t; - -typedef struct { - erts_reader_groups_map_t *map; - int map_size; - int logical_processors; - int groups; -} erts_make_reader_groups_map_test; - -static int reader_groups_available_cpu_check; -static int reader_groups_logical_processors; -static int reader_groups_map_size; -static erts_reader_groups_map_t *reader_groups_map; - -#define ERTS_TOPOLOGY_RG ERTS_TOPOLOGY_MAX_DEPTH - -static void -make_reader_groups_map(erts_make_reader_groups_map_test *test); - -static Eterm -get_reader_groups_map(Process *c_p, - erts_reader_groups_map_t *map, - int map_size, - int logical_processors) -{ -#ifdef DEBUG - Eterm *endp; -#endif - Eterm res = NIL, tuple; - Eterm *hp; - int i; - - hp = HAlloc(c_p, logical_processors*(2+3)); -#ifdef DEBUG - endp = hp + logical_processors*(2+3); -#endif - for (i = map_size - 1; i >= 0; i--) { - if (map[i].logical >= 0) { - tuple = TUPLE2(hp, - make_small(map[i].logical), - make_small(map[i].reader_group)); - hp += 3; - res = CONS(hp, tuple, res); - hp += 2; - } - } - ASSERT(hp == endp); - return res; -} - -Eterm -erts_debug_reader_groups_map(Process *c_p, int groups) -{ - Eterm res; - erts_make_reader_groups_map_test test; - - test.groups = groups; - make_reader_groups_map(&test); - if (!test.map) - res = NIL; - else { - res = get_reader_groups_map(c_p, - test.map, - test.map_size, - test.logical_processors); - erts_free(ERTS_ALC_T_TMP, test.map); - } - return res; -} - - -Eterm -erts_get_reader_groups_map(Process *c_p) -{ - Eterm res; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - res = get_reader_groups_map(c_p, - reader_groups_map, - reader_groups_map_size, - reader_groups_logical_processors); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - return res; -} - -static void -make_available_cpu_topology(erts_avail_cput *no, - erts_avail_cput *avail, - erts_cpu_topology_t *cpudata, - int *size, - int test) -{ - int len = *size; - erts_cpu_topology_t last; - int a, i, j; - - no->level[ERTS_TOPOLOGY_NODE] = -1; - no->level[ERTS_TOPOLOGY_PROCESSOR] = -1; - no->level[ERTS_TOPOLOGY_PROCESSOR_NODE] = -1; - no->level[ERTS_TOPOLOGY_CORE] = -1; - no->level[ERTS_TOPOLOGY_THREAD] = -1; - no->level[ERTS_TOPOLOGY_LOGICAL] = -1; - - last.node = INT_MIN; - last.processor = INT_MIN; - last.processor_node = INT_MIN; - last.core = INT_MIN; - last.thread = INT_MIN; - last.logical = INT_MIN; - - a = 0; - - for (i = 0; i < len; i++) { - - if (!test && !erts_is_cpu_available(erts_cpuinfo, cpudata[i].logical)) - continue; - - if (last.node != cpudata[i].node) - goto node; - if (last.processor != cpudata[i].processor) - goto processor; - if (last.processor_node != cpudata[i].processor_node) - goto processor_node; - if (last.core != cpudata[i].core) - goto core; - ASSERT(last.thread != cpudata[i].thread); - goto thread; - - node: - no->level[ERTS_TOPOLOGY_NODE]++; - processor: - no->level[ERTS_TOPOLOGY_PROCESSOR]++; - processor_node: - no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; - core: - no->level[ERTS_TOPOLOGY_CORE]++; - thread: - no->level[ERTS_TOPOLOGY_THREAD]++; - - no->level[ERTS_TOPOLOGY_LOGICAL]++; - - for (j = 0; j < ERTS_TOPOLOGY_LOGICAL; j++) - avail[a].level[j] = no->level[j]; - - avail[a].level[ERTS_TOPOLOGY_LOGICAL] = cpudata[i].logical; - avail[a].level[ERTS_TOPOLOGY_RG] = 0; - - ASSERT(last.logical != cpudata[a].logical); - - last = cpudata[i]; - a++; - } - - no->level[ERTS_TOPOLOGY_NODE]++; - no->level[ERTS_TOPOLOGY_PROCESSOR]++; - no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; - no->level[ERTS_TOPOLOGY_CORE]++; - no->level[ERTS_TOPOLOGY_THREAD]++; - no->level[ERTS_TOPOLOGY_LOGICAL]++; - - *size = a; -} - -static int -reader_group_lookup(int logical) -{ - int start = logical % reader_groups_map_size; - int ix = start; - - do { - if (reader_groups_map[ix].logical == logical) { - ASSERT(reader_groups_map[ix].reader_group > 0); - return reader_groups_map[ix].reader_group; - } - ix++; - if (ix == reader_groups_map_size) - ix = 0; - } while (ix != start); - - erl_exit(ERTS_ABORT_EXIT, "Logical cpu id %d not found\n", logical); -} - -static void -reader_group_insert(erts_reader_groups_map_t *map, int map_size, - int logical, int reader_group) -{ - int start = logical % map_size; - int ix = start; - - do { - if (map[ix].logical < 0) { - map[ix].logical = logical; - map[ix].reader_group = reader_group; - return; - } - ix++; - if (ix == map_size) - ix = 0; - } while (ix != start); - - erl_exit(ERTS_ABORT_EXIT, "Reader groups map full\n"); -} - - -static int -sub_levels(erts_rg_count_t *rgc, int level, int aix, int avail_sz, erts_avail_cput *avail) -{ - int sub_level = level+1; - int last = -1; - rgc->sub_levels = 0; - - do { - if (last != avail[aix].level[sub_level]) { - rgc->sub_levels++; - last = avail[aix].level[sub_level]; - } - aix++; - } - while (aix < avail_sz && rgc->id == avail[aix].level[level]); - rgc->reader_groups = 0; - return aix; -} - -static int -write_reader_groups(int *rgp, erts_rg_count_t *rgcp, - int level, int a, - int avail_sz, erts_avail_cput *avail) -{ - int rg = *rgp; - int sub_level = level+1; - int sl_per_gr = rgcp->sub_levels / rgcp->reader_groups; - int xsl = rgcp->sub_levels % rgcp->reader_groups; - int sls = 0; - int last = -1; - int xsl_rg_lim = (rgcp->reader_groups - xsl) + rg + 1; - - ASSERT(level < 0 || avail[a].level[level] == rgcp->id) - - do { - if (last != avail[a].level[sub_level]) { - if (!sls) { - sls = sl_per_gr; - rg++; - if (rg >= xsl_rg_lim) - sls++; - } - last = avail[a].level[sub_level]; - sls--; - } - avail[a].level[ERTS_TOPOLOGY_RG] = rg; - a++; - } while (a < avail_sz && (level < 0 - || avail[a].level[level] == rgcp->id)); - - ASSERT(rgcp->reader_groups == rg - *rgp); - - *rgp = rg; - - return a; -} - -static int -rg_count_sub_levels_compare(const void *vx, const void *vy) -{ - erts_rg_count_t *x = (erts_rg_count_t *) vx; - erts_rg_count_t *y = (erts_rg_count_t *) vy; - if (x->sub_levels != y->sub_levels) - return y->sub_levels - x->sub_levels; - return x->id - y->id; -} - -static int -rg_count_id_compare(const void *vx, const void *vy) -{ - erts_rg_count_t *x = (erts_rg_count_t *) vx; - erts_rg_count_t *y = (erts_rg_count_t *) vy; - return x->id - y->id; -} - -static void -make_reader_groups_map(erts_make_reader_groups_map_test *test) -{ - int i, spread_level, avail_sz; - erts_avail_cput no, *avail; - erts_cpu_topology_t *cpudata; - erts_reader_groups_map_t *map; - int map_sz; - int groups = erts_reader_groups; - - if (test) { - test->map = NULL; - test->map_size = 0; - groups = test->groups; - } - - if (!groups) - return; - - if (!test) { - if (reader_groups_map) - erts_free(ERTS_ALC_T_RDR_GRPS_MAP, reader_groups_map); - - reader_groups_logical_processors = 0; - reader_groups_map_size = 0; - reader_groups_map = NULL; - } - - create_tmp_cpu_topology_copy(&cpudata, &avail_sz); - - if (!cpudata) - return; - - cpu_bind_order_sort(cpudata, - avail_sz, - ERTS_CPU_BIND_NO_SPREAD, - 1); - - avail = erts_alloc(ERTS_ALC_T_TMP, - sizeof(erts_avail_cput)*avail_sz); - - make_available_cpu_topology(&no, avail, cpudata, - &avail_sz, test != NULL); - - destroy_tmp_cpu_topology_copy(cpudata); - - map_sz = avail_sz*2+1; - - if (test) { - map = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_reader_groups_map_t) - * map_sz)); - test->map = map; - test->map_size = map_sz; - test->logical_processors = avail_sz; - } - else { - map = erts_alloc(ERTS_ALC_T_RDR_GRPS_MAP, - (sizeof(erts_reader_groups_map_t) - * map_sz)); - reader_groups_map = map; - reader_groups_logical_processors = avail_sz; - reader_groups_map_size = map_sz; - - } - - for (i = 0; i < map_sz; i++) { - map[i].logical = -1; - map[i].reader_group = 0; - } - - spread_level = ERTS_TOPOLOGY_CORE; - for (i = ERTS_TOPOLOGY_NODE; i < ERTS_TOPOLOGY_THREAD; i++) { - if (no.level[i] > groups) { - spread_level = i; - break; - } - } - - if (no.level[spread_level] <= groups) { - int a, rg, last = -1; - rg = 0; - ASSERT(spread_level == ERTS_TOPOLOGY_CORE); - for (a = 0; a < avail_sz; a++) { - if (last != avail[a].level[spread_level]) { - rg++; - last = avail[a].level[spread_level]; - } - reader_group_insert(map, - map_sz, - avail[a].level[ERTS_TOPOLOGY_LOGICAL], - rg); - } - } - else { /* groups < no.level[spread_level] */ - erts_rg_count_t *rg_count; - int a, rg, tl, toplevels; - - tl = spread_level-1; - - if (spread_level == ERTS_TOPOLOGY_NODE) - toplevels = 1; - else - toplevels = no.level[tl]; - - rg_count = erts_alloc(ERTS_ALC_T_TMP, - toplevels*sizeof(erts_rg_count_t)); - - if (toplevels == 1) { - rg_count[0].id = 0; - rg_count[0].sub_levels = no.level[spread_level]; - rg_count[0].reader_groups = groups; - } - else { - int rgs_per_tl, rgs; - rgs = groups; - rgs_per_tl = rgs / toplevels; - - a = 0; - for (i = 0; i < toplevels; i++) { - rg_count[i].id = avail[a].level[tl]; - a = sub_levels(&rg_count[i], tl, a, avail_sz, avail); - } - - qsort(rg_count, - toplevels, - sizeof(erts_rg_count_t), - rg_count_sub_levels_compare); - - for (i = 0; i < toplevels; i++) { - if (rg_count[i].sub_levels < rgs_per_tl) { - rg_count[i].reader_groups = rg_count[i].sub_levels; - rgs -= rg_count[i].sub_levels; - } - else { - rg_count[i].reader_groups = rgs_per_tl; - rgs -= rgs_per_tl; - } - } - - while (rgs > 0) { - for (i = 0; i < toplevels; i++) { - if (rg_count[i].sub_levels == rg_count[i].reader_groups) - break; - else { - rg_count[i].reader_groups++; - if (--rgs == 0) - break; - } - } - } - - qsort(rg_count, - toplevels, - sizeof(erts_rg_count_t), - rg_count_id_compare); - } - - a = i = rg = 0; - while (a < avail_sz) { - a = write_reader_groups(&rg, &rg_count[i], tl, - a, avail_sz, avail); - i++; - } - - ASSERT(groups == rg); - - for (a = 0; a < avail_sz; a++) - reader_group_insert(map, - map_sz, - avail[a].level[ERTS_TOPOLOGY_LOGICAL], - avail[a].level[ERTS_TOPOLOGY_RG]); - - erts_free(ERTS_ALC_T_TMP, rg_count); - } - - erts_free(ERTS_ALC_T_TMP, avail); -} - -/* - * CPU topology - */ - -typedef struct { - int *id; - int used; - int size; -} ErtsCpuTopIdSeq; - -typedef struct { - ErtsCpuTopIdSeq logical; - ErtsCpuTopIdSeq thread; - ErtsCpuTopIdSeq core; - ErtsCpuTopIdSeq processor_node; - ErtsCpuTopIdSeq processor; - ErtsCpuTopIdSeq node; -} ErtsCpuTopEntry; - -static void -init_cpu_top_entry(ErtsCpuTopEntry *cte) -{ - int size = 10; - cte->logical.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->logical.size = size; - cte->thread.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->thread.size = size; - cte->core.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->core.size = size; - cte->processor_node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->processor_node.size = size; - cte->processor.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->processor.size = size; - cte->node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->node.size = size; -} - -static void -destroy_cpu_top_entry(ErtsCpuTopEntry *cte) -{ - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->logical.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->thread.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->core.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor_node.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->node.id); -} - -static int -get_cput_value_or_range(int *v, int *vr, char **str) -{ - long l; - char *c = *str; - errno = 0; - if (!isdigit((unsigned char)*c)) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; - l = strtol(c, &c, 10); - if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; - *v = (int) l; - if (*c == '-') { - c++; - if (!isdigit((unsigned char)*c)) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - l = strtol(c, &c, 10); - if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - *vr = (int) l; - } - *str = c; - return ERTS_INIT_CPU_TOPOLOGY_OK; -} - -static int -get_cput_id_seq(ErtsCpuTopIdSeq *idseq, char **str) -{ - int ix = 0; - int need_size = 0; - char *c = *str; - - while (1) { - int res; - int val; - int nids; - int val_range = -1; - res = get_cput_value_or_range(&val, &val_range, &c); - if (res != ERTS_INIT_CPU_TOPOLOGY_OK) - return res; - if (val_range < 0 || val_range == val) - nids = 1; - else { - if (val_range > val) - nids = val_range - val + 1; - else - nids = val - val_range + 1; - } - need_size += nids; - if (need_size > idseq->size) { - idseq->size = need_size + 10; - idseq->id = erts_realloc(ERTS_ALC_T_TMP_CPU_IDS, - idseq->id, - sizeof(int)*idseq->size); - } - if (nids == 1) - idseq->id[ix++] = val; - else if (val_range > val) { - for (; val <= val_range; val++) - idseq->id[ix++] = val; - } - else { - for (; val >= val_range; val--) - idseq->id[ix++] = val; - } - if (*c != ',') - break; - c++; - } - *str = c; - idseq->used = ix; - return ERTS_INIT_CPU_TOPOLOGY_OK; -} - -static int -get_cput_entry(ErtsCpuTopEntry *cput, char **str) -{ - int h; - char *c = *str; - - cput->logical.used = 0; - cput->thread.id[0] = 0; - cput->thread.used = 1; - cput->core.id[0] = 0; - cput->core.used = 1; - cput->processor_node.id[0] = -1; - cput->processor_node.used = 1; - cput->processor.id[0] = 0; - cput->processor.used = 1; - cput->node.id[0] = -1; - cput->node.used = 1; - - h = ERTS_TOPOLOGY_MAX_DEPTH; - while (*c != ':' && *c != '\0') { - int res; - ErtsCpuTopIdSeq *idseqp; - switch (*c++) { - case 'L': - if (h <= ERTS_TOPOLOGY_LOGICAL) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->logical; - h = ERTS_TOPOLOGY_LOGICAL; - break; - case 't': - case 'T': - if (h <= ERTS_TOPOLOGY_THREAD) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->thread; - h = ERTS_TOPOLOGY_THREAD; - break; - case 'c': - case 'C': - if (h <= ERTS_TOPOLOGY_CORE) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->core; - h = ERTS_TOPOLOGY_CORE; - break; - case 'p': - case 'P': - if (h <= ERTS_TOPOLOGY_PROCESSOR) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->processor; - h = ERTS_TOPOLOGY_PROCESSOR; - break; - case 'n': - case 'N': - if (h <= ERTS_TOPOLOGY_PROCESSOR) { - do_node: - if (h <= ERTS_TOPOLOGY_NODE) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->node; - h = ERTS_TOPOLOGY_NODE; - } - else { - int p_node = 0; - char *p_chk = c; - while (*p_chk != '\0' && *p_chk != ':') { - if (*p_chk == 'p' || *p_chk == 'P') { - p_node = 1; - break; - } - p_chk++; - } - if (!p_node) - goto do_node; - if (h <= ERTS_TOPOLOGY_PROCESSOR_NODE) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->processor_node; - h = ERTS_TOPOLOGY_PROCESSOR_NODE; - } - break; - default: - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE; - } - res = get_cput_id_seq(idseqp, &c); - if (res != ERTS_INIT_CPU_TOPOLOGY_OK) - return res; - } - - if (cput->logical.used < 1) - return ERTS_INIT_CPU_TOPOLOGY_MISSING_LID; - - if (*c == ':') { - c++; - } - - if (cput->thread.used != 1 - && cput->thread.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - if (cput->core.used != 1 - && cput->core.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - if (cput->processor_node.used != 1 - && cput->processor_node.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - if (cput->processor.used != 1 - && cput->processor.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - if (cput->node.used != 1 - && cput->node.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - - *str = c; - return ERTS_INIT_CPU_TOPOLOGY_OK; -} - -static int -verify_topology(erts_cpu_topology_t *cpudata, int size) -{ - if (size > 0) { - int *logical; - int node, processor, no_nodes, i; - - /* Verify logical ids */ - logical = erts_alloc(ERTS_ALC_T_TMP, sizeof(int)*size); - - for (i = 0; i < size; i++) - logical[i] = cpudata[i].logical; - - qsort(logical, size, sizeof(int), int_cmp); - for (i = 0; i < size-1; i++) { - if (logical[i] == logical[i+1]) { - erts_free(ERTS_ALC_T_TMP, logical); - return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS; - } - } - - erts_free(ERTS_ALC_T_TMP, logical); - - qsort(cpudata, size, sizeof(erts_cpu_topology_t), processor_order_cmp); - - /* Verify unique entities */ - - for (i = 1; i < size; i++) { - if (cpudata[i-1].processor == cpudata[i].processor - && cpudata[i-1].node == cpudata[i].node - && (cpudata[i-1].processor_node - == cpudata[i].processor_node) - && cpudata[i-1].core == cpudata[i].core - && cpudata[i-1].thread == cpudata[i].thread) { - return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES; - } - } - - /* Verify numa nodes */ - node = cpudata[0].node; - processor = cpudata[0].processor; - no_nodes = cpudata[0].node < 0 && cpudata[0].processor_node < 0; - for (i = 1; i < size; i++) { - if (no_nodes) { - if (cpudata[i].node >= 0 || cpudata[i].processor_node >= 0) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; - } - else { - if (cpudata[i].processor == processor && cpudata[i].node != node) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; - node = cpudata[i].node; - processor = cpudata[i].processor; - if (node >= 0 && cpudata[i].processor_node >= 0) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; - if (node < 0 && cpudata[i].processor_node < 0) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; - } - } - } - - return ERTS_INIT_CPU_TOPOLOGY_OK; -} - -int -erts_init_cpu_topology(char *topology_str) -{ - ErtsCpuTopEntry cput; - int need_size; - char *c; - int ix; - int error = ERTS_INIT_CPU_TOPOLOGY_OK; - - if (user_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); - user_cpudata_size = 10; - - user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, - (sizeof(erts_cpu_topology_t) - * user_cpudata_size)); - - init_cpu_top_entry(&cput); - - ix = 0; - need_size = 0; - - c = topology_str; - if (*c == '\0') { - error = ERTS_INIT_CPU_TOPOLOGY_MISSING; - goto fail; - } - do { - int r; - error = get_cput_entry(&cput, &c); - if (error != ERTS_INIT_CPU_TOPOLOGY_OK) - goto fail; - need_size += cput.logical.used; - if (user_cpudata_size < need_size) { - user_cpudata_size = need_size + 10; - user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, - user_cpudata, - (sizeof(erts_cpu_topology_t) - * user_cpudata_size)); - } - - ASSERT(cput.thread.used == 1 - || cput.thread.used == cput.logical.used); - ASSERT(cput.core.used == 1 - || cput.core.used == cput.logical.used); - ASSERT(cput.processor_node.used == 1 - || cput.processor_node.used == cput.logical.used); - ASSERT(cput.processor.used == 1 - || cput.processor.used == cput.logical.used); - ASSERT(cput.node.used == 1 - || cput.node.used == cput.logical.used); - - for (r = 0; r < cput.logical.used; r++) { - user_cpudata[ix].logical = cput.logical.id[r]; - user_cpudata[ix].thread = - cput.thread.id[cput.thread.used == 1 ? 0 : r]; - user_cpudata[ix].core = - cput.core.id[cput.core.used == 1 ? 0 : r]; - user_cpudata[ix].processor_node = - cput.processor_node.id[cput.processor_node.used == 1 ? 0 : r]; - user_cpudata[ix].processor = - cput.processor.id[cput.processor.used == 1 ? 0 : r]; - user_cpudata[ix].node = - cput.node.id[cput.node.used == 1 ? 0 : r]; - ix++; - } - } while (*c != '\0'); - - if (user_cpudata_size != ix) { - user_cpudata_size = ix; - user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, - user_cpudata, - (sizeof(erts_cpu_topology_t) - * user_cpudata_size)); - } - - error = verify_topology(user_cpudata, user_cpudata_size); - if (error == ERTS_INIT_CPU_TOPOLOGY_OK) { - destroy_cpu_top_entry(&cput); - return ERTS_INIT_CPU_TOPOLOGY_OK; - } - - fail: - if (user_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); - user_cpudata_size = 0; - destroy_cpu_top_entry(&cput); - return error; -} - -#define ERTS_GET_CPU_TOPOLOGY_ERROR -1 -#define ERTS_GET_USED_CPU_TOPOLOGY 0 -#define ERTS_GET_DETECTED_CPU_TOPOLOGY 1 -#define ERTS_GET_DEFINED_CPU_TOPOLOGY 2 - -static Eterm get_cpu_topology_term(Process *c_p, int type); - -Eterm -erts_set_cpu_topology(Process *c_p, Eterm term) -{ - erts_cpu_topology_t *cpudata = NULL; - int cpudata_size = 0; - Eterm res; - - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - res = get_cpu_topology_term(c_p, ERTS_GET_USED_CPU_TOPOLOGY); - if (term == am_undefined) { - if (user_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); - user_cpudata = NULL; - user_cpudata_size = 0; - - if (cpu_bind_order != ERTS_CPU_BIND_NONE && system_cpudata) { - cpudata_size = system_cpudata_size; - cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * cpudata_size)); - - sys_memcpy((void *) cpudata, - (void *) system_cpudata, - sizeof(erts_cpu_topology_t)*cpudata_size); - } - } - else if (is_not_list(term)) { - error: - res = THE_NON_VALUE; - goto done; - } - else { - Eterm list = term; - int ix = 0; - - cpudata_size = 100; - cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * cpudata_size)); - - while (is_list(list)) { - Eterm *lp = list_val(list); - Eterm cpu = CAR(lp); - Eterm* tp; - Sint id; - - if (is_not_tuple(cpu)) - goto error; - - tp = tuple_val(cpu); - - if (arityval(tp[0]) != 7 || tp[1] != am_cpu) - goto error; - - if (ix >= cpudata_size) { - cpudata_size += 100; - cpudata = erts_realloc(ERTS_ALC_T_TMP, - cpudata, - (sizeof(erts_cpu_topology_t) - * cpudata_size)); - } - - id = signed_val(tp[2]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].node = (int) id; - - id = signed_val(tp[3]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].processor = (int) id; - - id = signed_val(tp[4]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].processor_node = (int) id; - - id = signed_val(tp[5]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].core = (int) id; - - id = signed_val(tp[6]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].thread = (int) id; - - id = signed_val(tp[7]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].logical = (int) id; - - list = CDR(lp); - ix++; - } - - if (is_not_nil(list)) - goto error; - - cpudata_size = ix; - - if (ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(cpudata, cpudata_size)) - goto error; - - if (user_cpudata_size != cpudata_size) { - if (user_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); - user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, - sizeof(erts_cpu_topology_t)*cpudata_size); - user_cpudata_size = cpudata_size; - } - - sys_memcpy((void *) user_cpudata, - (void *) cpudata, - sizeof(erts_cpu_topology_t)*cpudata_size); - } - - make_reader_groups_map(NULL); - - signal_schedulers_bind_change(cpudata, cpudata_size); - - done: - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - - if (cpudata) - erts_free(ERTS_ALC_T_TMP, cpudata); - - return res; -} - -static Eterm -bound_schedulers_term(ErtsCpuBindOrder order) -{ - switch (order) { - case ERTS_CPU_BIND_SPREAD: { - ERTS_DECL_AM(spread); - return AM_spread; - } - case ERTS_CPU_BIND_PROCESSOR_SPREAD: { - ERTS_DECL_AM(processor_spread); - return AM_processor_spread; - } - case ERTS_CPU_BIND_THREAD_SPREAD: { - ERTS_DECL_AM(thread_spread); - return AM_thread_spread; - } - case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: { - ERTS_DECL_AM(thread_no_node_processor_spread); - return AM_thread_no_node_processor_spread; - } - case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: { - ERTS_DECL_AM(no_node_processor_spread); - return AM_no_node_processor_spread; - } - case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: { - ERTS_DECL_AM(no_node_thread_spread); - return AM_no_node_thread_spread; - } - case ERTS_CPU_BIND_NO_SPREAD: { - ERTS_DECL_AM(no_spread); - return AM_no_spread; - } - case ERTS_CPU_BIND_NONE: { - ERTS_DECL_AM(unbound); - return AM_unbound; - } - default: - ASSERT(0); - return THE_NON_VALUE; - } -} - -Eterm -erts_bound_schedulers_term(Process *c_p) -{ - ErtsCpuBindOrder order; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - order = cpu_bind_order; - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - return bound_schedulers_term(order); -} - -static void -create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, int *cpudata_size) -{ - if (user_cpudata) { - *cpudata_size = user_cpudata_size; - *cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * (*cpudata_size))); - sys_memcpy((void *) *cpudata, - (void *) user_cpudata, - sizeof(erts_cpu_topology_t)*(*cpudata_size)); - } - else if (system_cpudata) { - *cpudata_size = system_cpudata_size; - *cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * (*cpudata_size))); - sys_memcpy((void *) *cpudata, - (void *) system_cpudata, - sizeof(erts_cpu_topology_t)*(*cpudata_size)); - } - else { - *cpudata = NULL; - *cpudata_size = 0; - } -} - -static void -destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata) -{ - if (cpudata) - erts_free(ERTS_ALC_T_TMP, cpudata); -} - -Eterm -erts_bind_schedulers(Process *c_p, Eterm how) -{ - Eterm res; - erts_cpu_topology_t *cpudata; - int cpudata_size; - ErtsCpuBindOrder old_cpu_bind_order; - - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - - if (erts_bind_to_cpu(erts_cpuinfo, -1) == -ENOTSUP) { - ERTS_BIF_PREP_ERROR(res, c_p, EXC_NOTSUP); - } - else { - - old_cpu_bind_order = cpu_bind_order; - - if (ERTS_IS_ATOM_STR("default_bind", how)) - cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; - else if (ERTS_IS_ATOM_STR("spread", how)) - cpu_bind_order = ERTS_CPU_BIND_SPREAD; - else if (ERTS_IS_ATOM_STR("processor_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("thread_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; - else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; - else if (ERTS_IS_ATOM_STR("no_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; - else if (ERTS_IS_ATOM_STR("unbound", how)) - cpu_bind_order = ERTS_CPU_BIND_NONE; - else { - cpu_bind_order = old_cpu_bind_order; - ERTS_BIF_PREP_ERROR(res, c_p, BADARG); - goto done; - } - - create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); - - if (!cpudata) { - cpu_bind_order = old_cpu_bind_order; - ERTS_BIF_PREP_ERROR(res, c_p, BADARG); - goto done; - } - - signal_schedulers_bind_change(cpudata, cpudata_size); - - destroy_tmp_cpu_topology_copy(cpudata); - - res = bound_schedulers_term(old_cpu_bind_order); - } - - done: - - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - - return res; -} - -Eterm -erts_fake_scheduler_bindings(Process *p, Eterm how) -{ - ErtsCpuBindOrder fake_cpu_bind_order; - erts_cpu_topology_t *cpudata; - int cpudata_size; - Eterm res; - - if (ERTS_IS_ATOM_STR("default_bind", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; - else if (ERTS_IS_ATOM_STR("spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_SPREAD; - else if (ERTS_IS_ATOM_STR("processor_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("thread_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; - else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; - else if (ERTS_IS_ATOM_STR("no_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; - else if (ERTS_IS_ATOM_STR("unbound", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_NONE; - else { - ERTS_BIF_PREP_ERROR(res, p, BADARG); - return res; - } - - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - - if (!cpudata || fake_cpu_bind_order == ERTS_CPU_BIND_NONE) - ERTS_BIF_PREP_RET(res, am_false); - else { - int i; - Eterm *hp; - - cpu_bind_order_sort(cpudata, cpudata_size, fake_cpu_bind_order, 1); - -#ifdef ERTS_FAKE_SCHED_BIND_PRINT_SORTED_CPU_DATA - - erts_fprintf(stderr, "node: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].node); - erts_fprintf(stderr, "\n"); - erts_fprintf(stderr, "processor: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].processor); - erts_fprintf(stderr, "\n"); - if (fake_cpu_bind_order != ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD - && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD - && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD) { - erts_fprintf(stderr, "processor_node:"); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].processor_node); - erts_fprintf(stderr, "\n"); - } - erts_fprintf(stderr, "core: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].core); - erts_fprintf(stderr, "\n"); - erts_fprintf(stderr, "thread: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].thread); - erts_fprintf(stderr, "\n"); - erts_fprintf(stderr, "logical: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].logical); - erts_fprintf(stderr, "\n"); -#endif - - hp = HAlloc(p, cpudata_size+1); - ERTS_BIF_PREP_RET(res, make_tuple(hp)); - *hp++ = make_arityval((Uint) cpudata_size); - for (i = 0; i < cpudata_size; i++) - *hp++ = make_small((Uint) cpudata[i].logical); - } - - destroy_tmp_cpu_topology_copy(cpudata); - - return res; -} - -Eterm -erts_get_schedulers_binds(Process *c_p) -{ - int ix; - ERTS_DECL_AM(unbound); - Eterm *hp = HAlloc(c_p, erts_no_schedulers+1); - Eterm res = make_tuple(hp); - - *(hp++) = make_arityval(erts_no_schedulers); - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - for (ix = 1; ix <= erts_no_schedulers; ix++) - *(hp++) = (scheduler2cpu_map[ix].bound_id >= 0 - ? make_small(scheduler2cpu_map[ix].bound_id) - : AM_unbound); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - return res; -} - -static Eterm -bld_topology_term(Eterm **hpp, - Uint *hszp, - erts_cpu_topology_t *cpudata, - int size) -{ - Eterm res = NIL; - int i; - - if (size == 0) - return am_undefined; - - for (i = size-1; i >= 0; i--) { - res = erts_bld_cons(hpp, - hszp, - erts_bld_tuple(hpp, - hszp, - 7, - am_cpu, - make_small(cpudata[i].node), - make_small(cpudata[i].processor), - make_small(cpudata[i].processor_node), - make_small(cpudata[i].core), - make_small(cpudata[i].thread), - make_small(cpudata[i].logical)), - res); - } - return res; -} - -static Eterm -get_cpu_topology_term(Process *c_p, int type) -{ -#ifdef DEBUG - Eterm *hp_end; -#endif - Eterm *hp; - Uint hsz; - Eterm res = THE_NON_VALUE; - erts_cpu_topology_t *cpudata = NULL; - int size = 0; - - switch (type) { - case ERTS_GET_USED_CPU_TOPOLOGY: - if (user_cpudata) - goto defined; - else - goto detected; - case ERTS_GET_DETECTED_CPU_TOPOLOGY: - detected: - if (!system_cpudata) - res = am_undefined; - else { - size = system_cpudata_size; - cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * size)); - sys_memcpy((void *) cpudata, - (void *) system_cpudata, - sizeof(erts_cpu_topology_t)*size); - } - break; - case ERTS_GET_DEFINED_CPU_TOPOLOGY: - defined: - if (!user_cpudata) - res = am_undefined; - else { - size = user_cpudata_size; - cpudata = user_cpudata; - } - break; - default: - erl_exit(ERTS_ABORT_EXIT, "Bad cpu topology type: %d\n", type); - break; - } - - if (res == am_undefined) { - ASSERT(!cpudata); - return res; - } - - hsz = 0; - - bld_topology_term(NULL, &hsz, - cpudata, size); - - hp = HAlloc(c_p, hsz); - -#ifdef DEBUG - hp_end = hp + hsz; -#endif - - res = bld_topology_term(&hp, NULL, - cpudata, size); - - ASSERT(hp_end == hp); - - if (cpudata && cpudata != system_cpudata && cpudata != user_cpudata) - erts_free(ERTS_ALC_T_TMP, cpudata); - - return res; -} - -Eterm -erts_get_cpu_topology_term(Process *c_p, Eterm which) -{ - Eterm res; - int type; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - if (ERTS_IS_ATOM_STR("used", which)) - type = ERTS_GET_USED_CPU_TOPOLOGY; - else if (ERTS_IS_ATOM_STR("detected", which)) - type = ERTS_GET_DETECTED_CPU_TOPOLOGY; - else if (ERTS_IS_ATOM_STR("defined", which)) - type = ERTS_GET_DEFINED_CPU_TOPOLOGY; - else - type = ERTS_GET_CPU_TOPOLOGY_ERROR; - if (type == ERTS_GET_CPU_TOPOLOGY_ERROR) - res = THE_NON_VALUE; - else - res = get_cpu_topology_term(c_p, type); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - return res; -} - -static void -early_cpu_bind_init(void) -{ - user_cpudata = NULL; - user_cpudata_size = 0; - - system_cpudata_size = erts_get_cpu_topology_size(erts_cpuinfo); - system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, - (sizeof(erts_cpu_topology_t) - * system_cpudata_size)); - - cpu_bind_order = ERTS_CPU_BIND_UNDEFINED; - - reader_groups_available_cpu_check = 1; - reader_groups_logical_processors = 0; - reader_groups_map_size = 0; - reader_groups_map = NULL; - - if (!erts_get_cpu_topology(erts_cpuinfo, system_cpudata) - || ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(system_cpudata, - system_cpudata_size)) { - erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); - system_cpudata = NULL; - system_cpudata_size = 0; - } -} - -static void -late_cpu_bind_init(void) -{ - int ix; - - erts_smp_rwmtx_init(&erts_cpu_bind_rwmtx, "cpu_bind"); - - scheduler2cpu_map = erts_alloc(ERTS_ALC_T_CPUDATA, - (sizeof(ErtsCpuBindData) - * (erts_no_schedulers+1))); - for (ix = 1; ix <= erts_no_schedulers; ix++) { - scheduler2cpu_map[ix].bind_id = -1; - scheduler2cpu_map[ix].bound_id = -1; - } - - if (cpu_bind_order == ERTS_CPU_BIND_UNDEFINED) { - int ncpus = erts_get_cpu_configured(erts_cpuinfo); - if (ncpus < 1 || erts_no_schedulers < ncpus) - cpu_bind_order = ERTS_CPU_BIND_NONE; - else - cpu_bind_order = ((system_cpudata || user_cpudata) - && (erts_bind_to_cpu(erts_cpuinfo, -1) != -ENOTSUP) - ? ERTS_CPU_BIND_DEFAULT_BIND - : ERTS_CPU_BIND_NONE); - } - - make_reader_groups_map(NULL); - - if (cpu_bind_order != ERTS_CPU_BIND_NONE) { - erts_cpu_topology_t *cpudata; - int cpudata_size; - create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); - signal_schedulers_bind_change(cpudata, cpudata_size); - destroy_tmp_cpu_topology_copy(cpudata); - } -} - -int -erts_update_cpu_info(void) -{ - int changed; - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - changed = erts_cpu_info_update(erts_cpuinfo); - if (changed) { - erts_cpu_topology_t *cpudata; - int cpudata_size; - - if (system_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); - - system_cpudata_size = erts_get_cpu_topology_size(erts_cpuinfo); - if (!system_cpudata_size) - system_cpudata = NULL; - else { - system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, - (sizeof(erts_cpu_topology_t) - * system_cpudata_size)); - - if (!erts_get_cpu_topology(erts_cpuinfo, system_cpudata) - || (ERTS_INIT_CPU_TOPOLOGY_OK - != verify_topology(system_cpudata, - system_cpudata_size))) { - erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); - system_cpudata = NULL; - system_cpudata_size = 0; - } - } - - create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); - signal_schedulers_bind_change(cpudata, cpudata_size); - destroy_tmp_cpu_topology_copy(cpudata); - } - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - return changed; -} - #ifdef ERTS_SMP static void @@ -7069,7 +5091,7 @@ Process *schedule(Process *p, int calls) } if ((rq->flags & ERTS_RUNQ_FLG_CHK_CPU_BIND) || erts_smp_atomic_read(&esdp->chk_cpu_bind)) { - check_cpu_bind(esdp); + erts_sched_check_cpu_bind(esdp); } } @@ -7165,7 +5187,9 @@ Process *schedule(Process *p, int calls) erts_smp_atomic_set(&function_calls, 0); fcalls = 0; + ASSERT(!erts_port_task_have_outstanding_io_tasks()); + #ifdef ERTS_SMP /* erts_sys_schedule_interrupt(0); */ #endif @@ -7497,6 +5521,15 @@ erts_schedule_misc_op(void (*func)(void *), void *arg) ErtsRunQueue *rq = erts_get_runq_current(NULL); ErtsMiscOpList *molp = misc_op_list_alloc(); + if (!rq) { + /* + * This can only happen when the sys msg dispatcher + * thread schedules misc ops (this happens *very* + * seldom; only when trace drivers are unloaded). + */ + rq = ERTS_RUNQ_IX(0); + } + erts_smp_runq_lock(rq); while (rq->misc.evac_runq) { diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h index 4365e409e5..c038e57b65 100644 --- a/erts/emulator/beam/erl_process.h +++ b/erts/emulator/beam/erl_process.h @@ -89,7 +89,6 @@ extern int erts_sched_thread_suggested_stack_size; #define ERTS_SCHED_THREAD_MAX_STACK_SIZE 8192 /* Kilo words */ #ifdef ERTS_SMP -extern Uint erts_max_main_threads; #include "erl_bits.h" #endif @@ -426,6 +425,13 @@ struct ErtsSchedulerData_ { #endif }; +typedef union { + ErtsSchedulerData esd; + char align[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsSchedulerData))]; +} ErtsAlignedSchedulerData; + +extern ErtsAlignedSchedulerData *erts_aligned_scheduler_data; + #ifndef ERTS_SMP extern ErtsSchedulerData *erts_scheduler_data; #endif @@ -1007,27 +1013,12 @@ extern struct erts_system_profile_flags_t erts_system_profile_flags; (p)->flags &= ~F_TIMO; \ } while (0) - -#define ERTS_INIT_SCHED_BIND_TYPE_SUCCESS 0 -#define ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED 1 -#define ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY 2 -#define ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE 3 - -int erts_init_scheduler_bind_type(char *how); - -#define ERTS_INIT_CPU_TOPOLOGY_OK 0 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID 1 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE 2 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY 3 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE 4 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES 5 -#define ERTS_INIT_CPU_TOPOLOGY_MISSING_LID 6 -#define ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS 7 -#define ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES 8 -#define ERTS_INIT_CPU_TOPOLOGY_MISSING 9 - -int erts_init_cpu_topology(char *topology_str); -int erts_update_cpu_info(void); +#define ERTS_RUNQ_IX(IX) \ + (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_run_queues), \ + &erts_aligned_run_queues[(IX)].runq) +#define ERTS_SCHEDULER_IX(IX) \ + (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_schedulers), \ + &erts_aligned_scheduler_data[(IX)].esd) void erts_pre_init_process(void); void erts_late_init_process(void); @@ -1058,8 +1049,9 @@ Eterm erts_multi_scheduling_blockers(Process *); void erts_start_schedulers(void); void erts_smp_notify_check_children_needed(void); #endif +void erts_sched_notify_check_cpu_bind(void); Uint erts_active_schedulers(void); -void erts_init_process(void); +void erts_init_process(int); Eterm erts_process_status(Process *, ErtsProcLocks, Process *, Eterm); Uint erts_run_queues_len(Uint *); void erts_add_to_runq(Process *); diff --git a/erts/emulator/beam/erl_process_lock.c b/erts/emulator/beam/erl_process_lock.c index a4d12139e9..1bebcdb911 100644 --- a/erts/emulator/beam/erl_process_lock.c +++ b/erts/emulator/beam/erl_process_lock.c @@ -117,10 +117,9 @@ static int aux_thr_proc_lock_spin_count; static void cleanup_tse(void); void -erts_init_proc_lock(void) +erts_init_proc_lock(int cpus) { int i; - int cpus; erts_smp_spinlock_init(&qs_lock, "proc_lck_qs_alloc"); for (i = 0; i < ERTS_NO_OF_PIX_LOCKS; i++) { #ifdef ERTS_ENABLE_LOCK_COUNT @@ -138,7 +137,6 @@ erts_init_proc_lock(void) lc_id.proc_lock_msgq = erts_lc_get_lock_order_id("proc_msgq"); lc_id.proc_lock_status = erts_lc_get_lock_order_id("proc_status"); #endif - cpus = erts_get_cpu_configured(erts_cpuinfo); if (cpus > 1) { proc_lock_spin_count = ERTS_PROC_LOCK_SPIN_COUNT_BASE; proc_lock_spin_count += (ERTS_PROC_LOCK_SPIN_COUNT_SCHED_INC diff --git a/erts/emulator/beam/erl_process_lock.h b/erts/emulator/beam/erl_process_lock.h index 7cfc9893fa..4fe30c7209 100644 --- a/erts/emulator/beam/erl_process_lock.h +++ b/erts/emulator/beam/erl_process_lock.h @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2007-2009. All Rights Reserved. + * Copyright Ericsson AB 2007-2010. All Rights Reserved. * * The contents of this file are subject to the Erlang Public License, * Version 1.1, (the "License"); you may not use this file except in @@ -334,7 +334,7 @@ erts_proc_lock_flags_cmpxchg(erts_proc_lock_t *lck, ErtsProcLocks new, extern erts_pix_lock_t erts_pix_locks[ERTS_NO_OF_PIX_LOCKS]; -void erts_init_proc_lock(void); +void erts_init_proc_lock(int cpus); void erts_proc_lock_prepare_proc_lock_waiter(void); void erts_proc_lock_failed(Process *, erts_pix_lock_t *, diff --git a/erts/emulator/beam/erl_threads.h b/erts/emulator/beam/erl_threads.h index 0b7269262e..a74cf79b8c 100644 --- a/erts/emulator/beam/erl_threads.h +++ b/erts/emulator/beam/erl_threads.h @@ -27,9 +27,6 @@ #define ERTS_SPIN_BODY ETHR_SPIN_BODY -#define ERTS_MAX_READER_GROUPS 8 -extern int erts_reader_groups; - #include "sys.h" #ifdef USE_THREADS diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index ecd3c8f68a..12536f6cde 100644 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -1728,11 +1728,6 @@ Uint erts_current_reductions(Process* current, Process *p); int erts_print_system_version(int to, void *arg, Process *c_p); -/* - * Interface to erl_init - */ -void erl_init(void); - #define seq_trace_output(token, msg, type, receiver, process) \ seq_trace_output_generic((token), (msg), (type), (receiver), (process), NIL) #define seq_trace_output_exit(token, msg, type, receiver, exitfrom) \ diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c index 79022d5dd7..9ed92bbe03 100644 --- a/erts/emulator/beam/io.c +++ b/erts/emulator/beam/io.c @@ -2802,17 +2802,25 @@ driver_deliver_term(ErlDrvPort port, break; case ERL_DRV_INT: /* signed int argument */ ERTS_DDT_CHK_ENOUGH_ARGS(1); +#if HALFWORD_HEAP + erts_bld_sint64(NULL, &need, (Sint64)ptr[0]); +#else /* check for bignum */ if (!IS_SSMALL((Sint)ptr[0])) need += BIG_UINT_HEAP_SIZE; /* use small_to_big */ +#endif ptr++; depth++; break; case ERL_DRV_UINT: /* unsigned int argument */ ERTS_DDT_CHK_ENOUGH_ARGS(1); +#if HALFWORD_HEAP + erts_bld_uint64(NULL, &need, (Uint64)ptr[0]); +#else /* check for bignum */ if (!IS_USMALL(0, (Uint)ptr[0])) need += BIG_UINT_HEAP_SIZE; /* use small_to_big */ +#endif ptr++; depth++; break; @@ -2979,22 +2987,30 @@ driver_deliver_term(ErlDrvPort port, break; case ERL_DRV_INT: /* signed int argument */ +#if HALFWORD_HEAP + mess = erts_bld_sint64(&hp, NULL, (Sint64)ptr[0]); +#else if (IS_SSMALL((Sint)ptr[0])) mess = make_small((Sint)ptr[0]); else { mess = small_to_big((Sint)ptr[0], hp); hp += BIG_UINT_HEAP_SIZE; } +#endif ptr++; break; case ERL_DRV_UINT: /* unsigned int argument */ +#if HALFWORD_HEAP + mess = erts_bld_uint64(&hp, NULL, (Uint64)ptr[0]); +#else if (IS_USMALL(0, (Uint)ptr[0])) mess = make_small((Uint)ptr[0]); else { mess = uint_to_big((Uint)ptr[0], hp); hp += BIG_UINT_HEAP_SIZE; } +#endif ptr++; break; diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h index 0031568af6..93203a08a9 100644 --- a/erts/emulator/beam/sys.h +++ b/erts/emulator/beam/sys.h @@ -466,8 +466,6 @@ static const int zero_value = 0, one_value = 1; # endif /* !__WIN32__ */ #endif /* WANT_NONBLOCKING */ -extern erts_cpu_info_t *erts_cpuinfo; /* erl_init.c */ - __decl_noreturn void __noreturn erl_exit(int n, char*, ...); /* Some special erl_exit() codes: */ diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c index 3de48194fb..18f7cdd15a 100644 --- a/erts/emulator/drivers/common/inet_drv.c +++ b/erts/emulator/drivers/common/inet_drv.c @@ -54,6 +54,9 @@ #ifdef HAVE_IFADDRS_H #include <ifaddrs.h> #endif +#ifdef HAVE_NETPACKET_PACKET_H +#include <netpacket/packet.h> +#endif /* All platforms fail on malloc errors. */ #define FATAL_MALLOC @@ -85,6 +88,7 @@ #include <winsock2.h> #endif #include <windows.h> +#include <iphlpapi.h> #include <Ws2tcpip.h> /* NEED VC 6.0 !!! */ @@ -467,6 +471,7 @@ static int my_strncasecmp(const char *s1, const char *s2, size_t n) #define INET_REQ_IFGET 22 #define INET_REQ_IFSET 23 #define INET_REQ_SUBSCRIBE 24 +#define INET_REQ_GETIFADDRS 25 /* TCP requests */ #define TCP_REQ_ACCEPT 40 #define TCP_REQ_LISTEN 41 @@ -632,15 +637,12 @@ static int my_strncasecmp(const char *s1, const char *s2, size_t n) #define IS_BUSY(d) \ (((d)->state & INET_F_BUSY) == INET_F_BUSY) +#define INET_MAX_OPT_BUFFER (64*1024) + #define INET_DEF_BUFFER 1460 /* default buffer size */ #define INET_MIN_BUFFER 1 /* internal min buffer */ -#define INET_MAX_BUFFER (1024*64) /* internal max buffer */ -/* Note: INET_HIGH_WATERMARK MUST be less than 2*INET_MAX_BUFFER */ #define INET_HIGH_WATERMARK (1024*8) /* 8k pending high => busy */ -/* Note: INET_LOW_WATERMARK MUST be less than INET_MAX_BUFFER and -** less than INET_HIGH_WATERMARK -*/ #define INET_LOW_WATERMARK (1024*4) /* 4k pending => allow more */ #define INET_INFINITY 0xffffffff /* infinity value */ @@ -1251,139 +1253,136 @@ static int load_ip_and_port LOAD_ATOM((spec), (i), (flag) ? am_true : am_false); #endif /* HAVE_SCTP */ +/* Assume a cache line size of 64 bytes */ +#define INET_DRV_CACHE_LINE_SIZE ((ErlDrvUInt) 64) +#define INET_DRV_CACHE_LINE_MASK (INET_DRV_CACHE_LINE_SIZE - 1) + /* ** Binary Buffer Managment ** We keep a stack of usable buffers */ -#define BUFFER_STACK_SIZE 16 +#define BUFFER_STACK_SIZE 14 +#define BUFFER_STACK_MAX_MEM_SIZE (1024*1024) -static erts_smp_spinlock_t inet_buffer_stack_lock; -static ErlDrvBinary* buffer_stack[BUFFER_STACK_SIZE]; -static int buffer_stack_pos = 0; - - -/* - * XXX - * The erts_smp_spin_* functions should not be used by drivers (but this - * driver is special). Replace when driver locking api has been implemented. - * /rickard - */ -#define BUFSTK_LOCK erts_smp_spin_lock(&inet_buffer_stack_lock); -#define BUFSTK_UNLOCK erts_smp_spin_unlock(&inet_buffer_stack_lock); +ErlDrvTSDKey buffer_stack_key; -#ifdef DEBUG -static int tot_buf_allocated = 0; /* memory in use for i_buf */ -static int tot_buf_stacked = 0; /* memory on stack */ -static int max_buf_allocated = 0; /* max allocated */ - -#define COUNT_BUF_ALLOC(sz) do { \ - BUFSTK_LOCK; \ - tot_buf_allocated += (sz); \ - if (tot_buf_allocated > max_buf_allocated) \ - max_buf_allocated = tot_buf_allocated; \ - BUFSTK_UNLOCK; \ -} while(0) - -#define COUNT_BUF_FREE(sz) do { \ - BUFSTK_LOCK; \ - tot_buf_allocated -= (sz); \ - BUFSTK_UNLOCK; \ - } while(0) - -#define COUNT_BUF_STACK(sz) do { \ - BUFSTK_LOCK; \ - tot_buf_stacked += (sz); \ - BUFSTK_UNLOCK; \ - } while(0) +typedef struct { + int mem_size; + int pos; + ErlDrvBinary* stk[BUFFER_STACK_SIZE]; +} InetDrvBufStkBase; -#else +typedef struct { + InetDrvBufStkBase buf; + char align[(((sizeof(InetDrvBufStkBase) - 1) / INET_DRV_CACHE_LINE_SIZE) + 1) + * INET_DRV_CACHE_LINE_SIZE]; +} InetDrvBufStk; + +static InetDrvBufStk *get_bufstk(void) +{ + InetDrvBufStk *bs = erl_drv_tsd_get(buffer_stack_key); + if (bs) + return bs; + bs = driver_alloc(sizeof(InetDrvBufStk) + + INET_DRV_CACHE_LINE_SIZE - 1); + if (!bs) + return NULL; + if ((((ErlDrvUInt) bs) & INET_DRV_CACHE_LINE_MASK) != 0) + bs = ((InetDrvBufStk *) + ((((ErlDrvUInt) bs) & ~INET_DRV_CACHE_LINE_MASK) + + INET_DRV_CACHE_LINE_SIZE)); + erl_drv_tsd_set(buffer_stack_key, bs); + bs->buf.pos = 0; + bs->buf.mem_size = 0; -#define COUNT_BUF_ALLOC(sz) -#define COUNT_BUF_FREE(sz) -#define COUNT_BUF_STACK(sz) + ASSERT(bs == erl_drv_tsd_get(buffer_stack_key)); -#endif + return bs; +} static ErlDrvBinary* alloc_buffer(long minsz) { - ErlDrvBinary* buf = NULL; + InetDrvBufStk *bs = get_bufstk(); - BUFSTK_LOCK; + DEBUGF(("alloc_buffer: %ld\r\n", minsz)); - DEBUGF(("alloc_buffer: sz = %ld, tot = %d, max = %d\r\n", - minsz, tot_buf_allocated, max_buf_allocated)); + if (bs && bs->buf.pos > 0) { + long size; + ErlDrvBinary* buf = bs->buf.stk[--bs->buf.pos]; + size = buf->orig_size; + bs->buf.mem_size -= size; + ASSERT(0 <= bs->buf.mem_size + && bs->buf.mem_size <= BUFFER_STACK_MAX_MEM_SIZE); + if (size >= minsz) + return buf; - if (buffer_stack_pos > 0) { - int origsz; + driver_free_binary(buf); + } - buf = buffer_stack[--buffer_stack_pos]; - origsz = buf->orig_size; - BUFSTK_UNLOCK; - COUNT_BUF_STACK(-origsz); - if (origsz < minsz) { - if ((buf = driver_realloc_binary(buf, minsz)) == NULL) - return NULL; - COUNT_BUF_ALLOC(buf->orig_size - origsz); + ASSERT(!bs || bs->buf.pos != 0 || bs->buf.mem_size == 0); + + return driver_alloc_binary(minsz); +} + +/*#define CHECK_DOUBLE_RELEASE 1*/ +#ifdef CHECK_DOUBLE_RELEASE +static void +check_double_release(InetDrvBufStk *bs, ErlDrvBinary* buf) +{ +#ifdef __GNUC__ +#warning CHECK_DOUBLE_RELEASE is enabled, this is a custom build emulator +#endif + int i; + for (i = 0; i < bs->buf.pos; ++i) { + if (bs->buf.stk[i] == buf) { + erl_exit(ERTS_ABORT_EXIT, + "Multiple buffer release in inet_drv, this " + "is a bug, save the core and send it to " + "[email protected]!"); } } - else { - BUFSTK_UNLOCK; - if ((buf = driver_alloc_binary(minsz)) == NULL) - return NULL; - COUNT_BUF_ALLOC(buf->orig_size); - } - return buf; } +#endif -/* -** Max buffer memory "cached" BUFFER_STACK_SIZE * INET_MAX_BUFFER -** (16 * 64k ~ 1M) -*/ -/*#define CHECK_DOUBLE_RELEASE 1*/ static void release_buffer(ErlDrvBinary* buf) { + InetDrvBufStk *bs; + long size; + DEBUGF(("release_buffer: %ld\r\n", (buf==NULL) ? 0 : buf->orig_size)); - if (buf == NULL) + + if (!buf) return; - BUFSTK_LOCK; - if ((buf->orig_size > INET_MAX_BUFFER) || - (buffer_stack_pos >= BUFFER_STACK_SIZE)) { - BUFSTK_UNLOCK; - COUNT_BUF_FREE(buf->orig_size); + + size = buf->orig_size; + + if (size > BUFFER_STACK_MAX_MEM_SIZE) + goto free_binary; + + bs = get_bufstk(); + if (!bs + || (bs->buf.mem_size + size > BUFFER_STACK_MAX_MEM_SIZE) + || (bs->buf.pos >= BUFFER_STACK_SIZE)) { + free_binary: driver_free_binary(buf); } else { #ifdef CHECK_DOUBLE_RELEASE -#ifdef __GNUC__ -#warning CHECK_DOUBLE_RELEASE is enabled, this is a custom build emulator + check_double_release(bs, buf); #endif - int i; - for (i = 0; i < buffer_stack_pos; ++i) { - if (buffer_stack[i] == buf) { - erl_exit(1,"Multiple buffer release in inet_drv, this is a " - "bug, save the core and send it to " - "[email protected]!"); - } - } -#endif - buffer_stack[buffer_stack_pos++] = buf; - BUFSTK_UNLOCK; - COUNT_BUF_STACK(buf->orig_size); + ASSERT(bs->buf.pos != 0 || bs->buf.mem_size == 0); + + bs->buf.mem_size += size; + bs->buf.stk[bs->buf.pos++] = buf; + + ASSERT(0 <= bs->buf.mem_size + && bs->buf.mem_size <= BUFFER_STACK_MAX_MEM_SIZE); } } static ErlDrvBinary* realloc_buffer(ErlDrvBinary* buf, long newsz) { - ErlDrvBinary* bin; -#ifdef DEBUG - long orig_size = buf->orig_size; -#endif - - if ((bin = driver_realloc_binary(buf,newsz)) != NULL) { - COUNT_BUF_ALLOC(newsz - orig_size); - ; - } - return bin; + return driver_realloc_binary(buf, newsz); } /* use a TRICK, access the refc field to see if any one else has @@ -1397,10 +1396,8 @@ static void free_buffer(ErlDrvBinary* buf) if (buf != NULL) { if (driver_binary_get_refc(buf) == 1) release_buffer(buf); - else { - COUNT_BUF_FREE(buf->orig_size); + else driver_free_binary(buf); - } } } @@ -3404,20 +3401,14 @@ static int inet_init() if (!sock_init()) goto error; - buffer_stack_pos = 0; - - erts_smp_spinlock_init(&inet_buffer_stack_lock, "inet_buffer_stack_lock"); + if (0 != erl_drv_tsd_key_create("inet_buffer_stack_key", &buffer_stack_key)) + goto error; ASSERT(sizeof(struct in_addr) == 4); # if defined(HAVE_IN6) && defined(AF_INET6) ASSERT(sizeof(struct in6_addr) == 16); # endif -#ifdef DEBUG - tot_buf_allocated = 0; - max_buf_allocated = 0; - tot_buf_stacked = 0; -#endif INIT_ATOM(ok); INIT_ATOM(tcp); INIT_ATOM(udp); @@ -3824,39 +3815,81 @@ do { if ((end)-(ptr) < (n)) goto error; } while(0) static char* sockaddr_to_buf(struct sockaddr* addr, char* ptr, char* end) { if (addr->sa_family == AF_INET || addr->sa_family == 0) { - struct in_addr a; - buf_check(ptr,end,sizeof(struct in_addr)); - a = ((struct sockaddr_in*) addr)->sin_addr; - sys_memcpy(ptr, (char*)&a, sizeof(struct in_addr)); - return ptr + sizeof(struct in_addr); + struct in_addr *p = &(((struct sockaddr_in*) addr)->sin_addr); + buf_check(ptr, end, 1 + sizeof(struct in_addr)); + *ptr = INET_AF_INET; + sys_memcpy(ptr+1, (char*)p, sizeof(struct in_addr)); + return ptr + 1 + sizeof(struct in_addr); } #if defined(HAVE_IN6) && defined(AF_INET6) else if (addr->sa_family == AF_INET6) { - struct in6_addr a; - buf_check(ptr,end,sizeof(struct in6_addr)); - a = ((struct sockaddr_in6*) addr)->sin6_addr; - sys_memcpy(ptr, (char*)&a, sizeof(struct in6_addr)); - return ptr + sizeof(struct in6_addr); + struct in6_addr *p = &(((struct sockaddr_in6*) addr)->sin6_addr); + buf_check(ptr, end, 1 + sizeof(struct in6_addr)); + *ptr = INET_AF_INET6; + sys_memcpy(ptr+1, (char*)p, sizeof(struct in6_addr)); + return ptr + 1 + sizeof(struct in6_addr); + } +#endif +#if defined(AF_LINK) + else if (addr->sa_family == AF_LINK) { + struct sockaddr_dl *sdl_p = (struct sockaddr_dl*) addr; + buf_check(ptr, end, 2 + sdl_p->sdl_alen); + put_int16(sdl_p->sdl_alen, ptr); ptr += 2; + sys_memcpy(ptr, sdl_p->sdl_data + sdl_p->sdl_nlen, sdl_p->sdl_alen); + return ptr + sdl_p->sdl_alen; + } +#endif +#if defined(AF_PACKET) && defined(HAVE_NETPACKET_PACKET_H) + else if(addr->sa_family == AF_PACKET) { + struct sockaddr_ll *sll_p = (struct sockaddr_ll*) addr; + buf_check(ptr, end, 2 + sll_p->sll_halen); + put_int16(sll_p->sll_halen, ptr); ptr += 2; + sys_memcpy(ptr, sll_p->sll_addr, sll_p->sll_halen); + return ptr + sll_p->sll_halen; } #endif + return ptr; error: return NULL; - } static char* buf_to_sockaddr(char* ptr, char* end, struct sockaddr* addr) { - buf_check(ptr,end,sizeof(struct in_addr)); - sys_memcpy((char*) &((struct sockaddr_in*)addr)->sin_addr, ptr, - sizeof(struct in_addr)); - addr->sa_family = AF_INET; - return ptr + sizeof(struct in_addr); - + buf_check(ptr,end,1); + switch (*ptr++) { + case INET_AF_INET: { + struct in_addr *p = &((struct sockaddr_in*)addr)->sin_addr; + buf_check(ptr,end,sizeof(struct in_addr)); + sys_memcpy((char*) p, ptr, sizeof(struct in_addr)); + addr->sa_family = AF_INET; + return ptr + sizeof(struct in_addr); + } + case INET_AF_INET6: { + struct in6_addr *p = &((struct sockaddr_in6*)addr)->sin6_addr; + buf_check(ptr,end,sizeof(struct in6_addr)); + sys_memcpy((char*) p, ptr, sizeof(struct in6_addr)); + addr->sa_family = AF_INET6; + return ptr + sizeof(struct in6_addr); + } + } error: return NULL; } +#if defined (IFF_POINTOPOINT) +#define IFGET_FLAGS(cflags) IFGET_FLAGS_P2P(cflags, IFF_POINTOPOINT) +#elif defined IFF_POINTTOPOINT +#define IFGET_FLAGS(cflags) IFGET_FLAGS_P2P(cflags, IFF_POINTTOPOINT) +#endif + +#define IFGET_FLAGS_P2P(cflags, iff_ptp) \ + ((((cflags) & IFF_UP) ? INET_IFF_UP : 0) | \ + (((cflags) & IFF_BROADCAST) ? INET_IFF_BROADCAST : 0) | \ + (((cflags) & IFF_LOOPBACK) ? INET_IFF_LOOPBACK : 0) | \ + (((cflags) & iff_ptp) ? INET_IFF_POINTTOPOINT : 0) | \ + (((cflags) & IFF_UP) ? INET_IFF_RUNNING : 0) | /* emulate running ? */ \ + (((cflags) & IFF_MULTICAST) ? INET_IFF_MULTICAST : 0)) #if defined(__WIN32__) && defined(SIO_GET_INTERFACE_LIST) @@ -3894,7 +3927,6 @@ static int inet_ctl_getiflist(inet_descriptor* desc, char** rbuf, int rsize) return ctl_reply(INET_REP_OK, sbuf, sptr - sbuf, rbuf, rsize); } - /* input is an ip-address in string format i.e A.B.C.D ** scan the INTERFACE_LIST to get the options */ @@ -3980,27 +4012,12 @@ static int inet_ctl_ifget(inet_descriptor* desc, char* buf, int len, break; case INET_IFOPT_FLAGS: { - long eflags = 0; int flags = ifp->iiFlags; /* just enumerate the interfaces (no names) */ - /* translate flags */ - if (flags & IFF_UP) - eflags |= INET_IFF_UP; - if (flags & IFF_BROADCAST) - eflags |= INET_IFF_BROADCAST; - if (flags & IFF_LOOPBACK) - eflags |= INET_IFF_LOOPBACK; - if (flags & IFF_POINTTOPOINT) - eflags |= INET_IFF_POINTTOPOINT; - if (flags & IFF_UP) /* emulate runnign ? */ - eflags |= INET_IFF_RUNNING; - if (flags & IFF_MULTICAST) - eflags |= INET_IFF_MULTICAST; - buf_check(sptr, s_end, 5); *sptr++ = INET_IFOPT_FLAGS; - put_int32(eflags, sptr); + put_int32(IFGET_FLAGS(flags), sptr); sptr += 4; break; } @@ -4021,7 +4038,6 @@ static int inet_ctl_ifset(inet_descriptor* desc, char* buf, int len, return ctl_reply(INET_REP_OK, NULL, 0, rbuf, rsize); } - #elif defined(SIOCGIFCONF) && defined(SIOCSIFFLAGS) /* cygwin has SIOCGIFCONF but not SIOCSIFFLAGS (Nov 2002) */ @@ -4032,69 +4048,77 @@ static int inet_ctl_ifset(inet_descriptor* desc, char* buf, int len, #define SIZEA(p) (sizeof (p)) #endif - -static int inet_ctl_getiflist(inet_descriptor* desc, char** rbuf, int rsize) -{ - struct ifconf ifc; - struct ifreq *ifr; - char *buf; - int buflen, ifc_len, i; - char *sbuf, *sp; - - /* Courtesy of Per Bergqvist and W. Richard Stevens */ - - ifc_len = 0; - buflen = 100 * sizeof(struct ifreq); - buf = ALLOC(buflen); +static int get_ifconf(SOCKET s, struct ifconf *ifcp) { + int ifc_len = 0; + int buflen = 100 * sizeof(struct ifreq); + char *buf = ALLOC(buflen); for (;;) { - ifc.ifc_len = buflen; - ifc.ifc_buf = buf; - if (ioctl(desc->s, SIOCGIFCONF, (char *)&ifc) < 0) { + ifcp->ifc_len = buflen; + ifcp->ifc_buf = buf; + if (ioctl(s, SIOCGIFCONF, (char *)ifcp) < 0) { int res = sock_errno(); if (res != EINVAL || ifc_len) { FREE(buf); - return ctl_error(res, rbuf, rsize); + return -1; } } else { - if (ifc.ifc_len == ifc_len) break; /* buf large enough */ - ifc_len = ifc.ifc_len; + if (ifcp->ifc_len == ifc_len) break; /* buf large enough */ + ifc_len = ifcp->ifc_len; } buflen += 10 * sizeof(struct ifreq); buf = (char *)REALLOC(buf, buflen); } - - sp = sbuf = ALLOC(ifc_len+1); + return 0; +} + +static void free_ifconf(struct ifconf *ifcp) { + FREE(ifcp->ifc_buf); +} + +static int inet_ctl_getiflist(inet_descriptor* desc, char** rbuf, int rsize) +{ + struct ifconf ifc; + struct ifreq *ifrp; + char *sbuf, *sp; + int i; + + /* Courtesy of Per Bergqvist and W. Richard Stevens */ + + if (get_ifconf(desc->s, &ifc) < 0) { + return ctl_error(sock_errno(), rbuf, rsize); + } + + sp = sbuf = ALLOC(ifc.ifc_len+1); *sp++ = INET_REP_OK; i = 0; for (;;) { int n; - - ifr = (struct ifreq *) VOIDP(buf + i); - n = sizeof(ifr->ifr_name) + SIZEA(ifr->ifr_addr); - if (n < sizeof(*ifr)) n = sizeof(*ifr); - if (i+n > ifc_len) break; + + ifrp = (struct ifreq *) VOIDP(ifc.ifc_buf + i); + n = sizeof(ifrp->ifr_name) + SIZEA(ifrp->ifr_addr); + if (n < sizeof(*ifrp)) n = sizeof(*ifrp); + if (i+n > ifc.ifc_len) break; i += n; - - switch (ifr->ifr_addr.sa_family) { + + switch (ifrp->ifr_addr.sa_family) { #if defined(HAVE_IN6) && defined(AF_INET6) case AF_INET6: #endif case AF_INET: - ASSERT(sp+IFNAMSIZ+1 < sbuf+buflen+1) - strncpy(sp, ifr->ifr_name, IFNAMSIZ); + ASSERT(sp+IFNAMSIZ+1 < sbuf+ifc.ifc_len+1) + strncpy(sp, ifrp->ifr_name, IFNAMSIZ); sp[IFNAMSIZ] = '\0'; sp += strlen(sp), ++sp; } - - if (i >= ifc_len) break; + + if (i >= ifc.ifc_len) break; } - FREE(buf); + free_ifconf(&ifc); *rbuf = sbuf; return sp - sbuf; } - /* FIXME: temporary hack */ #ifndef IFHWADDRLEN #define IFHWADDRLEN 6 @@ -4133,37 +4157,52 @@ static int inet_ctl_ifget(inet_descriptor* desc, char* buf, int len, #ifdef SIOCGIFHWADDR if (ioctl(desc->s, SIOCGIFHWADDR, (char *)&ifreq) < 0) break; - buf_check(sptr, s_end, 1+IFHWADDRLEN); + buf_check(sptr, s_end, 1+2+IFHWADDRLEN); *sptr++ = INET_IFOPT_HWADDR; + put_int16(IFHWADDRLEN, sptr); sptr += 2; /* raw memcpy (fix include autoconf later) */ sys_memcpy(sptr, (char*)(&ifreq.ifr_hwaddr.sa_data), IFHWADDRLEN); sptr += IFHWADDRLEN; -#elif defined(HAVE_GETIFADDRS) - struct ifaddrs *ifa, *ifp; - int found = 0; - - if (getifaddrs(&ifa) == -1) - goto error; +#elif defined(SIOCGENADDR) + if (ioctl(desc->s, SIOCGENADDR, (char *)&ifreq) < 0) + break; + buf_check(sptr, s_end, 1+2+sizeof(ifreq.ifr_enaddr)); + *sptr++ = INET_IFOPT_HWADDR; + put_int16(sizeof(ifreq.ifr_enaddr), sptr); sptr += 2; + /* raw memcpy (fix include autoconf later) */ + sys_memcpy(sptr, (char*)(&ifreq.ifr_enaddr), + sizeof(ifreq.ifr_enaddr)); + sptr += sizeof(ifreq.ifr_enaddr); +#elif defined(HAVE_GETIFADDRS) && defined(AF_LINK) + struct ifaddrs *ifa, *ifp; + struct sockaddr_dl *sdlp; + int found = 0; + + if (getifaddrs(&ifa) == -1) + goto error; - for (ifp = ifa; ifp; ifp = ifp->ifa_next) { - if ((ifp->ifa_addr->sa_family == AF_LINK) && - (sys_strcmp(ifp->ifa_name, ifreq.ifr_name) == 0)) { - found = 1; - break; - } - } + for (ifp = ifa; ifp; ifp = ifp->ifa_next) { + if ((ifp->ifa_addr->sa_family == AF_LINK) && + (sys_strcmp(ifp->ifa_name, ifreq.ifr_name) == 0)) { + found = 1; + break; + } + } - if (found == 0) { - freeifaddrs(ifa); - break; - } + if (found == 0) { + freeifaddrs(ifa); + break; + } + sdlp = (struct sockaddr_dl *)ifp->ifa_addr; - buf_check(sptr, s_end, 1+IFHWADDRLEN); - *sptr++ = INET_IFOPT_HWADDR; - sys_memcpy(sptr, ((struct sockaddr_dl *)ifp->ifa_addr)->sdl_data + - ((struct sockaddr_dl *)ifp->ifa_addr)->sdl_nlen, IFHWADDRLEN); - freeifaddrs(ifa); - sptr += IFHWADDRLEN; + buf_check(sptr, s_end, 1+2+sdlp->sdl_alen); + *sptr++ = INET_IFOPT_HWADDR; + put_int16(sdlp->sdl_alen, sptr); sptr += 2; + sys_memcpy(sptr, + sdlp->sdl_data + sdlp->sdl_nlen, + sdlp->sdl_alen); + freeifaddrs(ifa); + sptr += sdlp->sdl_alen; #endif break; } @@ -4240,29 +4279,15 @@ static int inet_ctl_ifget(inet_descriptor* desc, char* buf, int len, case INET_IFOPT_FLAGS: { int flags; - int eflags = 0; if (ioctl(desc->s, SIOCGIFFLAGS, (char*)&ifreq) < 0) flags = 0; else flags = ifreq.ifr_flags; - /* translate flags */ - if (flags & IFF_UP) - eflags |= INET_IFF_UP; - if (flags & IFF_BROADCAST) - eflags |= INET_IFF_BROADCAST; - if (flags & IFF_LOOPBACK) - eflags |= INET_IFF_LOOPBACK; - if (flags & IFF_POINTOPOINT) - eflags |= INET_IFF_POINTTOPOINT; - if (flags & IFF_RUNNING) - eflags |= INET_IFF_RUNNING; - if (flags & IFF_MULTICAST) - eflags |= INET_IFF_MULTICAST; buf_check(sptr, s_end, 5); *sptr++ = INET_IFOPT_FLAGS; - put_int32(eflags, sptr); + put_int32(IFGET_FLAGS(flags), sptr); sptr += 4; break; } @@ -4300,17 +4325,22 @@ static int inet_ctl_ifset(inet_descriptor* desc, char* buf, int len, (void) ioctl(desc->s, SIOCSIFADDR, (char*)&ifreq); break; - case INET_IFOPT_HWADDR: - buf_check(buf, b_end, IFHWADDRLEN); + case INET_IFOPT_HWADDR: { + unsigned int len; + buf_check(buf, b_end, 2); + len = get_int16(buf); buf += 2; + buf_check(buf, b_end, len); #ifdef SIOCSIFHWADDR /* raw memcpy (fix include autoconf later) */ - sys_memcpy((char*)(&ifreq.ifr_hwaddr.sa_data), buf, IFHWADDRLEN); + sys_memset((char*)(&ifreq.ifr_hwaddr.sa_data), + '\0', sizeof(ifreq.ifr_hwaddr.sa_data)); + sys_memcpy((char*)(&ifreq.ifr_hwaddr.sa_data), buf, len); (void) ioctl(desc->s, SIOCSIFHWADDR, (char *)&ifreq); #endif - buf += IFHWADDRLEN; + buf += len; break; - + } case INET_IFOPT_BROADADDR: #ifdef SIOCSIFBRDADDR @@ -4415,6 +4445,551 @@ static int inet_ctl_ifset(inet_descriptor* desc, char* buf, int len, #endif + + +/* Latin-1 to utf8 */ + +static int utf8_len(const char *c, int m) { + int l; + for (l = 0; m; c++, l++, m--) { + if (*c == '\0') break; + if ((*c & 0x7f) != *c) l++; + } + return l; +} + +static void utf8_encode(const char *c, int m, char *p) { + for (; m; c++, m--) { + if (*c == '\0') break; + if ((*c & 0x7f) != *c) { + *p++ = (char) (0xC0 | (0x03 & (*c >> 6))); + *p++ = (char) (0x80 | (0x3F & *c)); + } else { + *p++ = (char) *c; + } + } +} + +#if defined(__WIN32__) + +static void set_netmask_bytes(char *c, int len, int pref_len) { + int i, m; + for (i = 0, m = pref_len >> 3; i < m && i < len; i++) c[i] = '\xFF'; + if (i < len) c[i++] = 0xFF << (8 - (pref_len & 7)); + for (; i < len; i++) c[i] = '\0'; +} + + +int eq_masked_bytes(char *a, char *b, int pref_len) { + int i, m; + for (i = 0, m = pref_len >> 3; i < m; i++) { + if (a[i] != b[i]) return 0; + } + m = pref_len & 7; + if (m) { + m = 0xFF & (0xFF << (8 - m)); + if ((a[i] & m) != (b[i] & m)) return 0; + } + return !0; +} + +static int inet_ctl_getifaddrs(inet_descriptor* desc_p, + char **rbuf_pp, int rsize) +{ + int i; + DWORD ret, n; + IP_INTERFACE_INFO *info_p; + MIB_IPADDRTABLE *ip_addrs_p; + IP_ADAPTER_ADDRESSES *ip_adaddrs_p, *ia_p; + + char *buf_p; + char *buf_alloc_p; + int buf_size =512; +# define BUF_ENSURE(Size) \ + do { \ + int NEED_, GOT_ = buf_p - buf_alloc_p; \ + NEED_ = GOT_ + (Size); \ + if (NEED_ > buf_size) { \ + buf_size = NEED_ + 512; \ + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); \ + buf_p = buf_alloc_p + GOT_; \ + } \ + } while(0) +# define SOCKADDR_TO_BUF(opt, sa) \ + do { \ + if (sa) { \ + char *P_; \ + *buf_p++ = (opt); \ + while (! (P_ = sockaddr_to_buf((sa), buf_p, \ + buf_alloc_p+buf_size))) { \ + int GOT_ = buf_p - buf_alloc_p; \ + buf_size += 512; \ + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); \ + buf_p = buf_alloc_p + GOT_; \ + } \ + if (P_ == buf_p) { \ + buf_p--; \ + } else { \ + buf_p = P_; \ + } \ + } \ + } while (0) + + { + /* Try GetAdaptersAddresses, if it is available */ + unsigned long ip_adaddrs_size = 16 * 1024; + ULONG family = AF_UNSPEC; + ULONG flags = + GAA_FLAG_INCLUDE_PREFIX | GAA_FLAG_SKIP_ANYCAST | + GAA_FLAG_SKIP_DNS_SERVER | GAA_FLAG_SKIP_FRIENDLY_NAME | + GAA_FLAG_SKIP_MULTICAST; + ULONG (WINAPI *fpGetAdaptersAddresses) + (ULONG, ULONG, PVOID, PIP_ADAPTER_ADDRESSES, PULONG); + HMODULE iphlpapi = GetModuleHandle("iphlpapi"); + fpGetAdaptersAddresses = (void *) + (iphlpapi ? + GetProcAddress(iphlpapi, "GetAdaptersAddresses") : + NULL); + if (fpGetAdaptersAddresses) { + ip_adaddrs_p = ALLOC(ip_adaddrs_size); + for (i = 17; i; i--) { + ret = fpGetAdaptersAddresses( + family, flags, NULL, ip_adaddrs_p, &ip_adaddrs_size); + ip_adaddrs_p = REALLOC(ip_adaddrs_p, ip_adaddrs_size); + if (ret == NO_ERROR) break; + if (ret == ERROR_BUFFER_OVERFLOW) continue; + i = 0; + } + if (! i) { + FREE(ip_adaddrs_p); + ip_adaddrs_p = NULL; + } + } else ip_adaddrs_p = NULL; + } + + { + /* Load the IP_INTERFACE_INFO table (only IPv4 interfaces), + * reliable source of interface names on XP + */ + unsigned long info_size = 4 * 1024; + info_p = ALLOC(info_size); + for (i = 17; i; i--) { + ret = GetInterfaceInfo(info_p, &info_size); + info_p = REALLOC(info_p, info_size); + if (ret == NO_ERROR) break; + if (ret == ERROR_INSUFFICIENT_BUFFER) continue; + i = 0; + } + if (! i) { + FREE(info_p); + info_p = NULL; + } + } + + if (! ip_adaddrs_p) { + /* If GetAdaptersAddresses gave nothing we fall back to + * MIB_IPADDRTABLE (only IPv4 interfaces) + */ + unsigned long ip_addrs_size = 16 * sizeof(*ip_addrs_p); + ip_addrs_p = ALLOC(ip_addrs_size); + for (i = 17; i; i--) { + ret = GetIpAddrTable(ip_addrs_p, &ip_addrs_size, FALSE); + ip_addrs_p = REALLOC(ip_addrs_p, ip_addrs_size); + if (ret == NO_ERROR) break; + if (ret == ERROR_INSUFFICIENT_BUFFER) continue; + i = 0; + } + if (! i) { + if (info_p) FREE(info_p); + FREE(ip_addrs_p); + return ctl_reply(INET_REP_OK, NULL, 0, rbuf_pp, rsize); + } + } else ip_addrs_p = NULL; + + buf_p = buf_alloc_p = ALLOC(buf_size); + *buf_p++ = INET_REP_OK; + + /* Iterate over MIB_IPADDRTABLE or IP_ADAPTER_ADDRESSES */ + for (ia_p = NULL, ip_addrs_p ? ((void *)(i = 0)) : (ia_p = ip_adaddrs_p); + ip_addrs_p ? (i < ip_addrs_p->dwNumEntries) : (ia_p != NULL); + ip_addrs_p ? ((void *)(i++)) : (ia_p = ia_p->Next)) { + MIB_IPADDRROW *ipaddrrow_p = NULL; + DWORD flags = INET_IFF_MULTICAST; + DWORD index = 0; + WCHAR *wname_p = NULL; + MIB_IFROW ifrow; + + if (ip_addrs_p) { + ipaddrrow_p = ip_addrs_p->table + i; + index = ipaddrrow_p->dwIndex; + } else { + index = ia_p->IfIndex; + if (ia_p->Flags & IP_ADAPTER_NO_MULTICAST) { + flags &= ~INET_IFF_MULTICAST; + } + } +index: + if (! index) goto done; + sys_memzero(&ifrow, sizeof(ifrow)); + ifrow.dwIndex = index; + if (GetIfEntry(&ifrow) != NO_ERROR) break; + /* Find the interface name - first try MIB_IFROW.wzname */ + if (ifrow.wszName[0] != 0) { + wname_p = ifrow.wszName; + } else { + /* Then try IP_ADAPTER_INDEX_MAP.Name (only IPv4 adapters) */ + int j; + for (j = 0; j < info_p->NumAdapters; j++) { + if (info_p->Adapter[j].Index == (ULONG) ifrow.dwIndex) { + if (info_p->Adapter[j].Name[0] != 0) { + wname_p = info_p->Adapter[j].Name; + } + break; + } + } + } + if (wname_p) { + int len; + /* Convert interface name to UTF-8 */ + len = + WideCharToMultiByte( + CP_UTF8, 0, wname_p, -1, NULL, 0, NULL, NULL); + if (! len) break; + BUF_ENSURE(len); + WideCharToMultiByte( + CP_UTF8, 0, wname_p, -1, buf_p, len, NULL, NULL); + buf_p += len; + } else { + /* Found no name - + * use "MIB_IFROW.dwIndex: MIB_IFROW.bDescr" as name instead */ + int l; + l = utf8_len(ifrow.bDescr, ifrow.dwDescrLen); + BUF_ENSURE(9 + l+1); + buf_p += + erts_sprintf( + buf_p, "%lu: ", (unsigned long) ifrow.dwIndex); + utf8_encode(ifrow.bDescr, ifrow.dwDescrLen, buf_p); + buf_p += l; + *buf_p++ = '\0'; + } + /* Interface flags, often make up broadcast and multicast flags */ + switch (ifrow.dwType) { + case IF_TYPE_ETHERNET_CSMACD: + flags |= INET_IFF_BROADCAST; + break; + case IF_TYPE_SOFTWARE_LOOPBACK: + flags |= INET_IFF_LOOPBACK; + flags &= ~INET_IFF_MULTICAST; + break; + default: + flags &= ~INET_IFF_MULTICAST; + break; + } + if (ifrow.dwAdminStatus) { + flags |= INET_IFF_UP; + switch (ifrow.dwOperStatus) { + case IF_OPER_STATUS_CONNECTING: + flags |= INET_IFF_POINTTOPOINT; + break; + case IF_OPER_STATUS_CONNECTED: + flags |= INET_IFF_RUNNING | INET_IFF_POINTTOPOINT; + break; + case IF_OPER_STATUS_OPERATIONAL: + flags |= INET_IFF_RUNNING; + break; + } + } + BUF_ENSURE(1 + 4); + *buf_p++ = INET_IFOPT_FLAGS; + put_int32(flags, buf_p); buf_p += 4; + if (ipaddrrow_p) { + /* Legacy implementation through GetIpAddrTable */ + struct sockaddr_in sin; + /* IP Address */ + sys_memzero(&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ipaddrrow_p->dwAddr; + BUF_ENSURE(1); + /* Netmask */ + SOCKADDR_TO_BUF(INET_IFOPT_ADDR, (struct sockaddr *) &sin); + sin.sin_addr.s_addr = ipaddrrow_p->dwMask; + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_NETMASK, (struct sockaddr *) &sin); + if (flags & INET_IFF_BROADCAST) { + /* Broadcast address - fake it*/ + sin.sin_addr.s_addr = ipaddrrow_p->dwAddr; + sin.sin_addr.s_addr |= ~ipaddrrow_p->dwMask; + BUF_ENSURE(1); + SOCKADDR_TO_BUF( + INET_IFOPT_BROADADDR, (struct sockaddr *) &sin); + } + } else { + IP_ADAPTER_UNICAST_ADDRESS *p; + /* IP Address(es) */ + for (p = ia_p->FirstUnicastAddress; + p; + p = p->Next) + { + IP_ADAPTER_PREFIX *q; + ULONG shortest_length; + struct sockaddr *shortest_p, *sa_p = p->Address.lpSockaddr; + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_ADDR, sa_p); + shortest_p = NULL; + shortest_length = 0; + for (q = ia_p->FirstPrefix; + q; + q = q->Next) { + struct sockaddr *sp_p = q->Address.lpSockaddr; + if (sa_p->sa_family != sp_p->sa_family) continue; + switch (sa_p->sa_family) { + case AF_INET: { + struct sockaddr_in sin; + DWORD sa, sp, mask; + sa = ntohl((DWORD) + ((struct sockaddr_in *) + sa_p)->sin_addr.s_addr); + sp = ntohl((DWORD) + ((struct sockaddr_in *) + sp_p)->sin_addr.s_addr); + mask = 0xFFFFFFFF << (32 - q->PrefixLength); + if ((sa & mask) != (sp & mask)) continue; + if ((! shortest_p) + || q->PrefixLength < shortest_length) { + shortest_p = sp_p; + shortest_length = q->PrefixLength; + } + } break; + case AF_INET6: { + struct sockaddr_in6 sin6; + if (!eq_masked_bytes((char *) + &((struct sockaddr_in6 *) + sa_p)->sin6_addr, + (char *) + &((struct sockaddr_in6 *) + sp_p)->sin6_addr, + q->PrefixLength)) { + continue; + } + if ((! shortest_p) + || q->PrefixLength < shortest_length) { + shortest_p = sp_p; + shortest_length = q->PrefixLength; + } + } break; + } + } + if (! shortest_p) { + /* Found no shortest prefix */ + shortest_p = sa_p; + switch (shortest_p->sa_family) { + case AF_INET: { + /* Fall back to old classfull network addresses */ + DWORD addr = ntohl(((struct sockaddr_in *)shortest_p) + ->sin_addr.s_addr); + if (! (addr & 0x800000)) { + /* Class A */ + shortest_length = 8; + } else if (! (addr & 0x400000)) { + /* Class B */ + shortest_length = 16; + } else if (! (addr & 0x200000)) { + /* Class C */ + shortest_length = 24; + } else { + shortest_length = 32; + } + } break; + case AF_INET6: { + /* Just play it safe */ + shortest_length = 128; + } break; + } + } + switch (shortest_p->sa_family) { + case AF_INET: { + struct sockaddr_in sin; + DWORD mask = 0xFFFFFFFF << (32 - shortest_length); + sys_memzero(&sin, sizeof(sin)); + sin.sin_family = shortest_p->sa_family; + sin.sin_addr.s_addr = htonl(mask); + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_NETMASK, + (struct sockaddr *) &sin); + if (flags & INET_IFF_BROADCAST) { + DWORD sp = + ntohl((DWORD) + ((struct sockaddr_in *)shortest_p) + -> sin_addr.s_addr); + sin.sin_addr.s_addr = htonl(sp | ~mask); + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_BROADADDR, + (struct sockaddr *) &sin); + } + } break; + case AF_INET6: { + struct sockaddr_in6 sin6; + sys_memzero(&sin6, sizeof(sin6)); + sin6.sin6_family = shortest_p->sa_family; + set_netmask_bytes((char *) &sin6.sin6_addr, + 16, + shortest_length); + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_NETMASK, + (struct sockaddr *) &sin6); + } break; + } + } + } + if (ifrow.dwPhysAddrLen) { + /* Hardware Address */ + BUF_ENSURE(1 + 2 + ifrow.dwPhysAddrLen); + *buf_p++ = INET_IFOPT_HWADDR; + put_int16(ifrow.dwPhysAddrLen, buf_p); buf_p += 2; + sys_memcpy(buf_p, ifrow.bPhysAddr, ifrow.dwPhysAddrLen); + buf_p += ifrow.dwPhysAddrLen; + } + +done: + /* That is all for this interface */ + BUF_ENSURE(1); + *buf_p++ = '\0'; + if (ia_p && + ia_p->Ipv6IfIndex && + ia_p->Ipv6IfIndex != index) + { + /* Oops, there was an other interface for IPv6. Possible? XXX */ + index = ia_p->Ipv6IfIndex; + goto index; + } + } + + if (ip_adaddrs_p) FREE(ip_adaddrs_p); + if (info_p) FREE(info_p); + if (ip_addrs_p) FREE(ip_addrs_p); + + buf_size = buf_p - buf_alloc_p; + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); + /* buf_p is now unreliable */ + *rbuf_pp = buf_alloc_p; + return buf_size; +# undef BUF_ENSURE +} + +#elif defined(HAVE_GETIFADDRS) + +static int inet_ctl_getifaddrs(inet_descriptor* desc_p, + char **rbuf_pp, int rsize) +{ + struct ifaddrs *ifa_p, *ifa_free_p; + + int buf_size; + char *buf_p; + char *buf_alloc_p; + + buf_size = 512; + buf_alloc_p = ALLOC(buf_size); + buf_p = buf_alloc_p; +# define BUF_ENSURE(Size) \ + do { \ + int NEED_, GOT_ = buf_p - buf_alloc_p; \ + NEED_ = GOT_ + (Size); \ + if (NEED_ > buf_size) { \ + buf_size = NEED_ + 512; \ + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); \ + buf_p = buf_alloc_p + GOT_; \ + } \ + } while (0) +# define SOCKADDR_TO_BUF(opt, sa) \ + do { \ + if (sa) { \ + char *P_; \ + *buf_p++ = (opt); \ + while (! (P_ = sockaddr_to_buf((sa), buf_p, \ + buf_alloc_p+buf_size))) { \ + int GOT_ = buf_p - buf_alloc_p; \ + buf_size += 512; \ + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); \ + buf_p = buf_alloc_p + GOT_; \ + } \ + if (P_ == buf_p) { \ + buf_p--; \ + } else { \ + buf_p = P_; \ + } \ + } \ + } while (0) + + if (getifaddrs(&ifa_p) < 0) { + return ctl_error(sock_errno(), rbuf_pp, rsize); + } + ifa_free_p = ifa_p; + *buf_p++ = INET_REP_OK; + for (; ifa_p; ifa_p = ifa_p->ifa_next) { + int len = utf8_len(ifa_p->ifa_name, -1); + BUF_ENSURE(len+1 + 1+4 + 1); + utf8_encode(ifa_p->ifa_name, -1, buf_p); + buf_p += len; + *buf_p++ = '\0'; + *buf_p++ = INET_IFOPT_FLAGS; + put_int32(IFGET_FLAGS(ifa_p->ifa_flags), buf_p); buf_p += 4; + if (ifa_p->ifa_addr->sa_family == AF_INET +#if defined(AF_INET6) + || ifa_p->ifa_addr->sa_family == AF_INET6 +#endif + ) { + SOCKADDR_TO_BUF(INET_IFOPT_ADDR, ifa_p->ifa_addr); + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_NETMASK, ifa_p->ifa_netmask); + if (ifa_p->ifa_flags & IFF_POINTOPOINT) { + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_DSTADDR, ifa_p->ifa_dstaddr); + } else if (ifa_p->ifa_flags & IFF_BROADCAST) { + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_BROADADDR, ifa_p->ifa_broadaddr); + } + } +#if defined(AF_LINK) || defined(AF_PACKET) + else if ( +#if defined(AF_LINK) + ifa_p->ifa_addr->sa_family == AF_LINK +#else + 0 +#endif +#if defined(AF_PACKET) + || ifa_p->ifa_addr->sa_family == AF_PACKET +#endif + ) { + char *bp = buf_p; + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_HWADDR, ifa_p->ifa_addr); + if (buf_p - bp < 4) buf_p = bp; /* Empty hwaddr */ + } +#endif + BUF_ENSURE(1); + *buf_p++ = '\0'; + } + buf_size = buf_p - buf_alloc_p; + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); + /* buf_p is now unreliable */ + freeifaddrs(ifa_free_p); + *rbuf_pp = buf_alloc_p; + return buf_size; +# undef BUF_ENSURE +} + +#else + +static int inet_ctl_getifaddrs(inet_descriptor* desc_p, + char **rbuf_pp, int rsize) +{ + return ctl_error(ENOTSUP, rbuf_pp, rsize); +} + +#endif + + + #ifdef VXWORKS /* ** THIS is a terrible creature, a bug in the TCP part @@ -4576,8 +5151,7 @@ static int inet_set_opts(inet_descriptor* desc, char* ptr, int len) case INET_LOPT_BUFFER: DEBUGF(("inet_set_opts(%ld): s=%d, BUFFER=%d\r\n", (long)desc->port, desc->s, ival)); - if (ival > INET_MAX_BUFFER) ival = INET_MAX_BUFFER; - else if (ival < INET_MIN_BUFFER) ival = INET_MIN_BUFFER; + if (ival < INET_MIN_BUFFER) ival = INET_MIN_BUFFER; desc->bufsz = ival; continue; @@ -4642,7 +5216,6 @@ static int inet_set_opts(inet_descriptor* desc, char* ptr, int len) if (desc->stype == SOCK_STREAM) { tcp_descriptor* tdesc = (tcp_descriptor*) desc; if (ival < 0) ival = 0; - else if (ival > INET_MAX_BUFFER*2) ival = INET_MAX_BUFFER*2; if (tdesc->low > ival) tdesc->low = ival; tdesc->high = ival; @@ -4653,7 +5226,6 @@ static int inet_set_opts(inet_descriptor* desc, char* ptr, int len) if (desc->stype == SOCK_STREAM) { tcp_descriptor* tdesc = (tcp_descriptor*) desc; if (ival < 0) ival = 0; - else if (ival > INET_MAX_BUFFER) ival = INET_MAX_BUFFER; if (tdesc->high < ival) tdesc->high = ival; tdesc->low = ival; @@ -4999,9 +5571,6 @@ static int sctp_set_opts(inet_descriptor* desc, char* ptr, int len) case INET_LOPT_BUFFER: desc->bufsz = get_int32(curr); curr += 4; - if (desc->bufsz > INET_MAX_BUFFER) - desc->bufsz = INET_MAX_BUFFER; - else if (desc->bufsz < INET_MIN_BUFFER) desc->bufsz = INET_MIN_BUFFER; res = 0; /* This does not affect the kernel buffer size */ @@ -5293,12 +5862,15 @@ static int sctp_set_opts(inet_descriptor* desc, char* ptr, int len) if (pmtud_enable) cflags |= SPP_PMTUD_ENABLE; if (pmtud_disable) cflags |= SPP_PMTUD_DISABLE; +# ifdef HAVE_STRUCT_SCTP_PADDRPARAMS_SPP_SACKDELAY + /* The followings are missing in FreeBSD 7.1 */ sackdelay_enable =eflags& SCTP_FLAG_SACDELAY_ENABLE; sackdelay_disable=eflags& SCTP_FLAG_SACDELAY_DISABLE; if (sackdelay_enable && sackdelay_disable) return -1; if (sackdelay_enable) cflags |= SPP_SACKDELAY_ENABLE; if (sackdelay_disable) cflags |= SPP_SACKDELAY_DISABLE; +# endif arg.pap.spp_flags = cflags; # endif @@ -5436,7 +6008,7 @@ static int inet_fill_opts(inet_descriptor* desc, #define PLACE_FOR(Size,Ptr) \ do { \ int need = dest_used + (Size); \ - if (need > INET_MAX_BUFFER) { \ + if (need > INET_MAX_OPT_BUFFER) { \ RETURN_ERROR(); \ } \ if (need > dest_allocated) { \ @@ -5660,7 +6232,7 @@ static int inet_fill_opts(inet_descriptor* desc, buf += 4; data_provided = (int) *buf++; arg_sz = get_int32(buf); - if (arg_sz > INET_MAX_BUFFER) { + if (arg_sz > INET_MAX_OPT_BUFFER) { RETURN_ERROR(); } buf += 4; @@ -5774,7 +6346,7 @@ static int sctp_fill_opts(inet_descriptor* desc, char* buf, int buflen, "miscalculated buffer size"); \ } \ need = (Index) + (N); \ - if (need > INET_MAX_BUFFER/sizeof(ErlDrvTermData)) { \ + if (need > INET_MAX_OPT_BUFFER/sizeof(ErlDrvTermData)) {\ RETURN_ERROR((Spec), -ENOMEM); \ } \ if (need > spec_allocated) { \ @@ -6199,13 +6771,15 @@ static int sctp_fill_opts(inet_descriptor* desc, char* buf, int buflen, if (ap.spp_flags & SPP_PMTUD_DISABLE) { i = LOAD_ATOM (spec, i, am_pmtud_disable); n++; } - +# ifdef HAVE_STRUCT_SCTP_PADDRPARAMS_SPP_SACKDELAY + /* SPP_SACKDELAY_* not in FreeBSD 7.1 */ if (ap.spp_flags & SPP_SACKDELAY_ENABLE) { i = LOAD_ATOM (spec, i, am_sackdelay_enable); n++; } if (ap.spp_flags & SPP_SACKDELAY_DISABLE) { i = LOAD_ATOM (spec, i, am_sackdelay_disable); n++; } # endif +# endif PLACE_FOR(spec, i, LOAD_NIL_CNT + LOAD_LIST_CNT + 2*LOAD_TUPLE_CNT); @@ -6625,7 +7199,7 @@ static int inet_ctl(inet_descriptor* desc, int cmd, char* buf, int len, } } DEBUGF(("inet_ctl(%ld): GETSTAT\r\n", (long) desc->port)); - if (dstlen > INET_MAX_BUFFER) /* sanity check */ + if (dstlen > INET_MAX_OPT_BUFFER) /* sanity check */ return 0; if (dstlen > rsize) { if ((dst = (char*) ALLOC(dstlen)) == NULL) @@ -6641,7 +7215,7 @@ static int inet_ctl(inet_descriptor* desc, int cmd, char* buf, int len, char* dst; int dstlen = 1 /* Reply code */ + len*5; DEBUGF(("inet_ctl(%ld): INET_REQ_SUBSCRIBE\r\n", (long) desc->port)); - if (dstlen > INET_MAX_BUFFER) /* sanity check */ + if (dstlen > INET_MAX_OPT_BUFFER) /* sanity check */ return 0; if (dstlen > rsize) { if ((dst = (char*) ALLOC(dstlen)) == NULL) @@ -6676,6 +7250,13 @@ static int inet_ctl(inet_descriptor* desc, int cmd, char* buf, int len, return inet_ctl_getiflist(desc, rbuf, rsize); } + case INET_REQ_GETIFADDRS: { + DEBUGF(("inet_ctl(%ld): GETIFADDRS\r\n", (long)desc->port)); + if (!IS_OPEN(desc)) + return ctl_xerror(EXBADPORT, rbuf, rsize); + return inet_ctl_getifaddrs(desc, rbuf, rsize); + } + case INET_REQ_IFGET: { DEBUGF(("inet_ctl(%ld): IFGET\r\n", (long)desc->port)); if (!IS_OPEN(desc)) diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c index af4ab693dc..01ba773688 100644 --- a/erts/emulator/sys/unix/sys.c +++ b/erts/emulator/sys/unix/sys.c @@ -75,6 +75,7 @@ static erts_smp_rwmtx_t environ_rwmtx; #include "erl_sys_driver.h" #include "erl_check_io.h" +#include "erl_cpu_topology.h" #ifndef DISABLE_VFORK #define DISABLE_VFORK 0 @@ -399,7 +400,7 @@ typedef struct { #ifdef ERTS_THR_HAVE_SIG_FUNCS sigset_t saved_sigmask; #endif - int unbind_child; + int sched_bind_data; } erts_thr_create_data_t; /* @@ -410,15 +411,13 @@ static void * thr_create_prepare(void) { erts_thr_create_data_t *tcdp; - ErtsSchedulerData *esdp; tcdp = erts_alloc(ERTS_ALC_T_TMP, sizeof(erts_thr_create_data_t)); #ifdef ERTS_THR_HAVE_SIG_FUNCS erts_thr_sigmask(SIG_BLOCK, &thr_create_sigmask, &tcdp->saved_sigmask); #endif - esdp = erts_get_scheduler_data(); - tcdp->unbind_child = esdp && erts_is_scheduler_bound(esdp); + tcdp->sched_bind_data = erts_sched_bind_atthrcreate_prepare(); return (void *) tcdp; } @@ -430,6 +429,8 @@ thr_create_cleanup(void *vtcdp) { erts_thr_create_data_t *tcdp = (erts_thr_create_data_t *) vtcdp; + erts_sched_bind_atthrcreate_parent(tcdp->sched_bind_data); + #ifdef ERTS_THR_HAVE_SIG_FUNCS /* Restore signalmask... */ erts_thr_sigmask(SIG_SETMASK, &tcdp->saved_sigmask, NULL); @@ -456,12 +457,7 @@ thr_create_prepare_child(void *vtcdp) erts_thread_disable_fpe(); #endif - if (tcdp->unbind_child) { - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - erts_unbind_from_cpu(erts_cpuinfo); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - } - + erts_sched_bind_atthrcreate_child(tcdp->sched_bind_data); } #endif /* #ifdef USE_THREADS */ @@ -1461,9 +1457,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op CHLD_STAT_LOCK; - unbind = erts_is_scheduler_bound(NULL); - if (unbind) - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); + unbind = erts_sched_bind_atfork_prepare(); #if !DISABLE_VFORK /* See fork/vfork discussion before this function. */ @@ -1476,7 +1470,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op if (pid == 0) { /* The child! Setup child... */ - if (unbind && erts_unbind_from_cpu(erts_cpuinfo) != 0) + if (erts_sched_bind_atfork_child(unbind) != 0) goto child_error; /* OBSERVE! @@ -1577,8 +1571,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op cs_argv[CS_ARGV_PROGNAME_IX] = child_setup_prog; cs_argv[CS_ARGV_WD_IX] = opts->wd ? opts->wd : "."; - cs_argv[CS_ARGV_UNBIND_IX] - = (unbind ? erts_get_unbind_from_cpu_str(erts_cpuinfo) : "false"); + cs_argv[CS_ARGV_UNBIND_IX] = erts_sched_bind_atvfork_child(unbind); cs_argv[CS_ARGV_FD_CR_IX] = fd_close_range; for (i = 0; i < CS_ARGV_NO_OF_DUP2_OPS; i++) cs_argv[CS_ARGV_DUP2_OP_IX(i)] = &dup2_op[i][0]; @@ -1627,8 +1620,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op } #endif - if (unbind) - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + erts_sched_bind_atfork_parent(unbind); if (pid == -1) { saved_errno = errno; diff --git a/erts/emulator/sys/win32/sys.c b/erts/emulator/sys/win32/sys.c index 15d4cd7361..d24347b3aa 100644 --- a/erts/emulator/sys/win32/sys.c +++ b/erts/emulator/sys/win32/sys.c @@ -31,7 +31,7 @@ #include "global.h" #include "erl_threads.h" #include "../../drivers/win32/win_con.h" - +#include "erl_cpu_topology.h" void erts_sys_init_float(void); @@ -2973,13 +2973,50 @@ check_supported_os_version(void) } #ifdef USE_THREADS -#ifdef ERTS_ENABLE_LOCK_COUNT + +typedef struct { + int sched_bind_data; +} erts_thr_create_data_t; + +/* + * thr_create_prepare() is called in parent thread before thread creation. + * Returned value is passed as argument to thr_create_cleanup(). + */ +static void * +thr_create_prepare(void) +{ + erts_thr_create_data_t *tcdp; + + tcdp = erts_alloc(ERTS_ALC_T_TMP, sizeof(erts_thr_create_data_t)); + tcdp->sched_bind_data = erts_sched_bind_atthrcreate_prepare(); + + return (void *) tcdp; +} + + +/* thr_create_cleanup() is called in parent thread after thread creation. */ +static void +thr_create_cleanup(void *vtcdp) +{ + erts_thr_create_data_t *tcdp = (erts_thr_create_data_t *) vtcdp; + + erts_sched_bind_atthrcreate_parent(tcdp->sched_bind_data); + + erts_free(ERTS_ALC_T_TMP, tcdp); +} + static void thr_create_prepare_child(void *vtcdp) { + erts_thr_create_data_t *tcdp = (erts_thr_create_data_t *) vtcdp; + +#ifdef ERTS_ENABLE_LOCK_COUNT erts_lcnt_thread_setup(); -} #endif /* ERTS_ENABLE_LOCK_COUNT */ + + erts_sched_bind_atthrcreate_child(tcdp->sched_bind_data); +} + #endif /* USE_THREADS */ void @@ -2991,9 +3028,13 @@ erts_sys_pre_init(void) #ifdef USE_THREADS { erts_thr_init_data_t eid = ERTS_THR_INIT_DATA_DEF_INITER; -#ifdef ERTS_ENABLE_LOCK_COUNT + eid.thread_create_child_func = thr_create_prepare_child; -#endif + /* Before creation in parent */ + eid.thread_create_prepare_func = thr_create_prepare; + /* After creation in parent */ + eid.thread_create_parent_func = thr_create_cleanup, + erts_thr_init(&eid); #ifdef ERTS_ENABLE_LOCK_COUNT erts_lcnt_init(); diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl index 7c19274696..79252d0593 100644 --- a/erts/emulator/test/distribution_SUITE.erl +++ b/erts/emulator/test/distribution_SUITE.erl @@ -27,6 +27,7 @@ -export([all/1, ping/1, bulk_send/1, bulk_send_small/1, bulk_send_big/1, + bulk_send_bigbig/1, local_send/1, local_send_small/1, local_send_big/1, local_send_legal/1, link_to_busy/1, exit_to_busy/1, lost_exit/1, link_to_dead/1, link_to_dead_new_node/1, @@ -50,7 +51,8 @@ -export([sender/3, receiver2/2, dummy_waiter/0, dead_process/0, roundtrip/1, bounce/1, do_dist_auto_connect/1, inet_rpc_server/1, dist_parallel_sender/3, dist_parallel_receiver/0, - dist_evil_parallel_receiver/0]). + dist_evil_parallel_receiver/0, + sendersender/4, sendersender2/4]). all(suite) -> [ ping, bulk_send, local_send, link_to_busy, exit_to_busy, @@ -121,7 +123,7 @@ bulk_send(doc) -> "the time. This tests that a process that is suspended on a ", "busy port will eventually be resumed."]; bulk_send(suite) -> - [bulk_send_small, bulk_send_big]. + [bulk_send_small, bulk_send_big, bulk_send_bigbig]. bulk_send_small(Config) when is_list(Config) -> ?line bulk_send(64, 32). @@ -129,6 +131,9 @@ bulk_send_small(Config) when is_list(Config) -> bulk_send_big(Config) when is_list(Config) -> ?line bulk_send(32, 64). +bulk_send_bigbig(Config) when is_list(Config) -> + ?line bulk_sendsend(32*5, 4). + bulk_send(Terms, BinSize) -> ?line Dog = test_server:timetrap(test_server:seconds(30)), @@ -145,6 +150,53 @@ bulk_send(Terms, BinSize) -> ?line test_server:timetrap_cancel(Dog), {comment, integer_to_list(trunc(Size/1024/Elapsed+0.5)) ++ " K/s"}. +bulk_sendsend(Terms, BinSize) -> + {Rate1, MonitorCount1} = bulk_sendsend2(Terms, BinSize, 5), + {Rate2, MonitorCount2} = bulk_sendsend2(Terms, BinSize, 995), + Ratio = if MonitorCount2 == 0 -> MonitorCount1 / 1.0; + true -> MonitorCount1 / MonitorCount2 + end, + %% A somewhat arbitrary ratio, but hopefully one that will accomodate + %% a wide range of CPU speeds. + true = (Ratio > 8.0), + {comment, + integer_to_list(Rate1) ++ " K/s, " ++ + integer_to_list(Rate2) ++ " K/s, " ++ + integer_to_list(MonitorCount1) ++ " monitor msgs, " ++ + integer_to_list(MonitorCount2) ++ " monitor msgs, " ++ + float_to_list(Ratio) ++ " monitor ratio"}. + +bulk_sendsend2(Terms, BinSize, BusyBufSize) -> + ?line Dog = test_server:timetrap(test_server:seconds(30)), + + ?line io:format("Sending ~w binaries, each of size ~w K", + [Terms, BinSize]), + ?line {ok, NodeRecv} = start_node(bulk_receiver), + ?line Recv = spawn(NodeRecv, erlang, apply, [fun receiver/2, [0, 0]]), + ?line Bin = list_to_binary(lists:duplicate(BinSize*1024, 253)), + ?line Size = Terms*size(Bin), + + %% SLF LEFT OFF HERE. + %% When the caller uses small hunks, like 4k via + %% bulk_sendsend(32*5, 4), then (on my laptop at least), we get + %% zero monitor messages. But if we use "+zdbbl 5", then we + %% get a lot of monitor messages. So, if we can count up the + %% total number of monitor messages that we get when running both + %% default busy size and "+zdbbl 5", and if the 5 case gets + %% "many many more" monitor messages, then we know we're working. + + ?line {ok, NodeSend} = start_node(bulk_sender, "+zdbbl " ++ integer_to_list(BusyBufSize)), + ?line _Send = spawn(NodeSend, erlang, apply, [fun sendersender/4, [self(), Recv, Bin, Terms]]), + ?line {Elapsed, {TermsN, SizeN}, MonitorCount} = + receive {sendersender, BigRes} -> + BigRes + end, + ?line stop_node(NodeRecv), + ?line stop_node(NodeSend), + + ?line test_server:timetrap_cancel(Dog), + {trunc(SizeN/1024/Elapsed+0.5), MonitorCount}. + sender(To, _Bin, 0) -> To ! {done, self()}, receive @@ -155,6 +207,43 @@ sender(To, Bin, Left) -> To ! {term, Bin}, sender(To, Bin, Left-1). +%% Sender process to be run on a slave node + +sendersender(Parent, To, Bin, Left) -> + erlang:system_monitor(self(), [busy_dist_port]), + [spawn(fun() -> sendersender2(To, Bin, Left, false) end) || + _ <- lists:seq(1,1)], + {USec, {Res, MonitorCount}} = + timer:tc(?MODULE, sendersender2, [To, Bin, Left, true]), + Parent ! {sendersender, {USec/1000000, Res, MonitorCount}}. + +sendersender2(To, Bin, Left, SendDone) -> + sendersender3(To, Bin, Left, SendDone, 0). + +sendersender3(To, _Bin, 0, SendDone, MonitorCount) -> + if SendDone -> + To ! {done, self()}; + true -> + ok + end, + receive + {monitor, _Pid, _Type, _Info} = M -> + sendersender3(To, _Bin, 0, SendDone, MonitorCount + 1) + after 0 -> + if SendDone -> + receive + Any when is_tuple(Any), size(Any) == 2 -> + {Any, MonitorCount} + end; + true -> + exit(normal) + end + end; +sendersender3(To, Bin, Left, SendDone, MonitorCount) -> + To ! {term, Bin}, + %%timer:sleep(50), + sendersender3(To, Bin, Left-1, SendDone, MonitorCount). + %% Receiver process to be run on a slave node. receiver(Terms, Size) -> diff --git a/erts/emulator/test/send_term_SUITE.erl b/erts/emulator/test/send_term_SUITE.erl index 5fd01a9ac5..819aa34886 100644 --- a/erts/emulator/test/send_term_SUITE.erl +++ b/erts/emulator/test/send_term_SUITE.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2005-2009. All Rights Reserved. +%% Copyright Ericsson AB 2005-2010. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -61,7 +61,7 @@ basic(Config) when is_list(Config) -> ?line ExpectExt2Term = term(P, 5), %% ERL_DRV_INT, ERL_DRV_UINT - ?line case erlang:system_info(wordsize) of + ?line case erlang:system_info({wordsize, external}) of 4 -> ?line {-1, 4294967295} = term(P, 6); 8 -> diff --git a/erts/emulator/test/system_info_SUITE.erl b/erts/emulator/test/system_info_SUITE.erl index ba433d4e11..cd940f3ddf 100644 --- a/erts/emulator/test/system_info_SUITE.erl +++ b/erts/emulator/test/system_info_SUITE.erl @@ -132,6 +132,7 @@ misc_smoke_tests(Config) when is_list(Config) -> ?line true = is_binary(erlang:system_info(procs)), ?line true = is_binary(erlang:system_info(loaded)), ?line true = is_binary(erlang:system_info(dist)), + ?line ok = try erlang:system_info({cpu_topology,erts_get_cpu_topology_error_case}), fail catch error:badarg -> ok end, ?line ok. |