diff options
Diffstat (limited to 'erts/emulator')
42 files changed, 4896 insertions, 2929 deletions
diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in index 76d782b159..4ed0ccabc6 100644 --- a/erts/emulator/Makefile.in +++ b/erts/emulator/Makefile.in @@ -734,7 +734,7 @@ RUN_OBJS = \ $(OBJDIR)/erl_fun.o $(OBJDIR)/erl_bif_port.o \ $(OBJDIR)/erl_term.o $(OBJDIR)/erl_node_tables.o \ $(OBJDIR)/erl_monitors.o $(OBJDIR)/erl_process_dump.o \ - $(OBJDIR)/erl_bif_timer.o \ + $(OBJDIR)/erl_bif_timer.o $(OBJDIR)/erl_cpu_topology.o \ $(OBJDIR)/erl_drv_thread.o $(OBJDIR)/erl_bif_chksum.o \ $(OBJDIR)/erl_bif_re.o $(OBJDIR)/erl_unicode.o \ $(OBJDIR)/packet_parser.o $(OBJDIR)/safe_hash.o \ diff --git a/erts/emulator/beam/dist.c b/erts/emulator/beam/dist.c index 16b6aeac3f..694460d702 100644 --- a/erts/emulator/beam/dist.c +++ b/erts/emulator/beam/dist.c @@ -97,6 +97,8 @@ dist_msg_dbg(ErtsDistExternal *edep, char *what, byte *buf, int sz) #define PASS_THROUGH 'p' /* This code should go */ int erts_is_alive; /* System must be blocked on change */ +int erts_dist_buf_busy_limit; + /* distribution trap functions */ Export* dsend2_trap = NULL; @@ -160,7 +162,7 @@ Uint erts_dist_cache_size(void) static ErtsProcList * get_suspended_on_de(DistEntry *dep, Uint32 unset_qflgs) { - ERTS_SMP_LC_ASSERT(erts_smp_lc_spinlock_is_locked(&dep->qlock)); + ERTS_SMP_LC_ASSERT(erts_smp_lc_mtx_is_locked(&dep->qlock)); dep->qflgs &= ~unset_qflgs; if (dep->qflgs & ERTS_DE_QFLG_EXIT) { /* No resume when exit has been scheduled */ @@ -453,17 +455,17 @@ int erts_do_net_exits(DistEntry *dep, Eterm reason) if (dep->status & ERTS_DE_SFLG_EXITING) { #ifdef DEBUG - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qflgs & ERTS_DE_QFLG_EXIT); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); #endif } else { dep->status |= ERTS_DE_SFLG_EXITING; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(!(dep->qflgs & ERTS_DE_QFLG_EXIT)); dep->qflgs |= ERTS_DE_QFLG_EXIT; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } erts_smp_de_links_lock(dep); @@ -577,7 +579,7 @@ static void clear_dist_entry(DistEntry *dep) erts_smp_de_links_unlock(dep); #endif - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); if (!dep->out_queue.last) obuf = dep->finalized_out_queue.first; @@ -593,7 +595,7 @@ static void clear_dist_entry(DistEntry *dep) dep->status = 0; suspendees = get_suspended_on_de(dep, ERTS_DE_QFLGS_ALL); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); erts_smp_atomic_set(&dep->dist_cmd_scheduled, 0); dep->send = NULL; erts_smp_de_rwunlock(dep); @@ -611,10 +613,10 @@ static void clear_dist_entry(DistEntry *dep) } if (obufsize) { - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize >= obufsize); dep->qsize -= obufsize; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } } @@ -1453,8 +1455,6 @@ int erts_net_message(Port *prt, return -1; } -#define ERTS_DE_BUSY_LIMIT (128*1024) - static int dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) { @@ -1538,18 +1538,18 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) } else { ErtsProcList *plp = NULL; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); dep->qsize += size_obuf(obuf); - if (dep->qsize >= ERTS_DE_BUSY_LIMIT) + if (dep->qsize >= erts_dist_buf_busy_limit) dep->qflgs |= ERTS_DE_QFLG_BUSY; if (!force_busy && (dep->qflgs & ERTS_DE_QFLG_BUSY)) { - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); plp = erts_proclist_create(c_p); plp->next = NULL; erts_suspend(c_p, ERTS_PROC_LOCK_MAIN, NULL); suspended = 1; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); } /* Enqueue obuf on dist entry */ @@ -1575,7 +1575,7 @@ dsig_send(ErtsDSigData *dsdp, Eterm ctl, Eterm msg, int force_busy) } } - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); erts_schedule_dist_command(NULL, dep); erts_smp_de_runlock(dep); @@ -1708,10 +1708,8 @@ erts_dist_command(Port *prt, int reds_limit) { Sint reds = ERTS_PORT_REDS_DIST_CMD_START; int prt_busy; - int de_busy; Uint32 status; Uint32 flags; - Uint32 qflgs; Sint obufsize = 0; ErtsDistOutputQueue oq, foq; DistEntry *dep = prt->dist_entry; @@ -1746,13 +1744,12 @@ erts_dist_command(Port *prt, int reds_limit) * a mess. */ - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); oq.first = dep->out_queue.first; oq.last = dep->out_queue.last; dep->out_queue.first = NULL; dep->out_queue.last = NULL; - qflgs = dep->qflgs; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); foq.first = dep->finalized_out_queue.first; foq.last = dep->finalized_out_queue.last; @@ -1763,17 +1760,8 @@ erts_dist_command(Port *prt, int reds_limit) goto preempted; prt_busy = (int) (prt->status & ERTS_PORT_SFLG_PORT_BUSY); - de_busy = (int) (qflgs & ERTS_DE_QFLG_BUSY); - if (prt_busy) { - if (!de_busy) { - erts_smp_spin_lock(&dep->qlock); - dep->qflgs |= ERTS_DE_QFLG_BUSY; - erts_smp_spin_unlock(&dep->qlock); - de_busy = 1; - } - } - else if (foq.first) { + if (!prt_busy && foq.first) { int preempt = 0; do { Uint size; @@ -1791,10 +1779,7 @@ erts_dist_command(Port *prt, int reds_limit) free_dist_obuf(fob); preempt = reds > reds_limit || (prt->status & ERTS_PORT_SFLGS_DEAD); if (prt->status & ERTS_PORT_SFLG_PORT_BUSY) { - erts_smp_spin_lock(&dep->qlock); - dep->qflgs |= ERTS_DE_QFLG_BUSY; - erts_smp_spin_unlock(&dep->qlock); - de_busy = prt_busy = 1; + prt_busy = 1; break; } } while (foq.first && !preempt); @@ -1877,10 +1862,7 @@ erts_dist_command(Port *prt, int reds_limit) free_dist_obuf(fob); preempt = reds > reds_limit || (prt->status & ERTS_PORT_SFLGS_DEAD); if (prt->status & ERTS_PORT_SFLG_PORT_BUSY) { - erts_smp_spin_lock(&dep->qlock); - dep->qflgs |= ERTS_DE_QFLG_BUSY; - erts_smp_spin_unlock(&dep->qlock); - de_busy = prt_busy = 1; + prt_busy = 1; if (oq.first && !preempt) goto finalize_only; } @@ -1907,22 +1889,23 @@ erts_dist_command(Port *prt, int reds_limit) * dist entry in a non-busy state and resume suspended * processes. */ - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize >= obufsize); dep->qsize -= obufsize; obufsize = 0; - if (de_busy && !prt_busy && dep->qsize < ERTS_DE_BUSY_LIMIT) { + if (!prt_busy + && (dep->qflgs & ERTS_DE_QFLG_BUSY) + && dep->qsize < erts_dist_buf_busy_limit) { ErtsProcList *suspendees; int resumed; suspendees = get_suspended_on_de(dep, ERTS_DE_QFLG_BUSY); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); resumed = erts_resume_processes(suspendees); reds += resumed*ERTS_PORT_REDS_DIST_CMD_RESUMED; - de_busy = 0; } else - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } ASSERT(!oq.first && !oq.last); @@ -1931,10 +1914,10 @@ erts_dist_command(Port *prt, int reds_limit) if (obufsize != 0) { ASSERT(obufsize > 0); - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize >= obufsize); dep->qsize -= obufsize; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } ASSERT(foq.first || !foq.last); @@ -1984,9 +1967,9 @@ erts_dist_command(Port *prt, int reds_limit) foq.last = NULL; #ifdef DEBUG - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize == obufsize); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); #endif } else { @@ -1995,14 +1978,14 @@ erts_dist_command(Port *prt, int reds_limit) * Unhandle buffers need to be put back first * in out_queue. */ - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); dep->qsize -= obufsize; obufsize = 0; oq.last->next = dep->out_queue.first; dep->out_queue.first = oq.first; if (!dep->out_queue.last) dep->out_queue.last = oq.last; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); } erts_schedule_dist_command(prt, NULL); @@ -2026,10 +2009,10 @@ erts_kill_dist_connection(DistEntry *dep, Uint32 connection_id) dep->status |= ERTS_DE_SFLG_EXITING; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(!(dep->qflgs & ERTS_DE_QFLG_EXIT)); dep->qflgs |= ERTS_DE_QFLG_EXIT; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); erts_schedule_dist_command(NULL, dep); } @@ -2400,13 +2383,13 @@ BIF_RETTYPE setnode_3(BIF_ALIST_3) ErtsProcList *plp = erts_proclist_create(BIF_P); plp->next = NULL; erts_suspend(BIF_P, ERTS_PROC_LOCK_MAIN, NULL); - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); if (dep->suspended.last) dep->suspended.last->next = plp; else dep->suspended.first = plp; dep->suspended.last = plp; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); goto yield; } @@ -2434,9 +2417,9 @@ BIF_RETTYPE setnode_3(BIF_ALIST_3) ASSERT(dep->send); #ifdef DEBUG - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); ASSERT(dep->qsize == 0); - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); #endif erts_set_dist_entry_connected(dep, BIF_ARG_2, flags); diff --git a/erts/emulator/beam/dist.h b/erts/emulator/beam/dist.h index fa19c7fb45..9ccc3e5ba9 100644 --- a/erts/emulator/beam/dist.h +++ b/erts/emulator/beam/dist.h @@ -38,6 +38,7 @@ #define DFLAG_UNICODE_IO 0x1000 #define DFLAG_DIST_HDR_ATOM_CACHE 0x2000 #define DFLAG_SMALL_ATOM_TAGS 0x4000 +#define DFLAGS_INTERNAL_TAGS 0x8000 /* All flags that should be enabled when term_to_binary/1 is used. */ #define TERM_TO_BINARY_DFLAGS (DFLAG_EXTENDED_REFERENCES \ @@ -99,7 +100,8 @@ typedef struct { #define ERTS_DE_IS_CONNECTED(DEP) \ (!ERTS_DE_IS_NOT_CONNECTED((DEP))) - +#define ERTS_DE_BUSY_LIMIT (1024*1024) +extern int erts_dist_buf_busy_limit; extern int erts_is_alive; /* @@ -153,10 +155,10 @@ erts_dsig_prepare(ErtsDSigData *dsdp, } if (no_suspend) { failure = ERTS_DSIG_PREP_CONNECTED; - erts_smp_spin_lock(&dep->qlock); + erts_smp_mtx_lock(&dep->qlock); if (dep->qflgs & ERTS_DE_QFLG_BUSY) failure = ERTS_DSIG_PREP_WOULD_SUSPEND; - erts_smp_spin_unlock(&dep->qlock); + erts_smp_mtx_unlock(&dep->qlock); if (failure == ERTS_DSIG_PREP_WOULD_SUSPEND) goto fail; } diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c index 07b4167b27..7793f60f4f 100644 --- a/erts/emulator/beam/erl_alloc.c +++ b/erts/emulator/beam/erl_alloc.c @@ -1348,6 +1348,13 @@ handle_args(int *argc, char **argv, erts_alc_hndl_args_init_t *init) argv[j++] = argv[i]; } *argc = j; +#if HALFWORD_HEAP + /* If halfword heap, silently ignore any disabling of internal + allocators */ + for (i = 0; i < aui_sz; ++i) + aui[i]->enable = 1; +#endif + } diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types index 7df9f19af0..408ffd12f7 100644 --- a/erts/emulator/beam/erl_alloc.types +++ b/erts/emulator/beam/erl_alloc.types @@ -247,7 +247,7 @@ type CPUDATA LONG_LIVED SYSTEM cpu_data type TMP_CPU_IDS SHORT_LIVED SYSTEM tmp_cpu_ids type EXT_TERM_DATA SHORT_LIVED PROCESSES external_term_data type ZLIB STANDARD SYSTEM zlib -type RDR_GRPS_MAP LONG_LIVED SYSTEM reader_groups_map +type CPU_GRPS_MAP LONG_LIVED SYSTEM cpu_groups_map +if smp type ASYNC SHORT_LIVED SYSTEM async diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index 40d8dc097c..75d8db880c 100644 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -38,6 +38,7 @@ #include "erl_instrument.h" #include "dist.h" #include "erl_gc.h" +#include "erl_cpu_topology.h" #ifdef HIPE #include "hipe_arch.h" #endif @@ -1687,6 +1688,8 @@ info_1_tuple(Process* BIF_P, /* Pointer to current process. */ return erts_get_cpu_topology_term(BIF_P, *tp); } else if (ERTS_IS_ATOM_STR("cpu_topology", sel) && arity == 2) { Eterm res = erts_get_cpu_topology_term(BIF_P, *tp); + if (res == THE_NON_VALUE) + goto badarg; ERTS_BIF_PREP_TRAP1(ret, erts_format_cpu_topology_trap, BIF_P, res); return ret; #if defined(PURIFY) || defined(VALGRIND) @@ -1999,6 +2002,8 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) BIF_RET(db_get_trace_control_word_0(BIF_P)); } else if (ERTS_IS_ATOM_STR("ets_realloc_moves", BIF_ARG_1)) { BIF_RET((erts_ets_realloc_always_moves) ? am_true : am_false); + } else if (ERTS_IS_ATOM_STR("ets_always_compress", BIF_ARG_1)) { + BIF_RET((erts_ets_always_compress) ? am_true : am_false); } else if (ERTS_IS_ATOM_STR("snifs", BIF_ARG_1)) { Uint size = 0; Uint *szp; @@ -2345,9 +2350,7 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) /* Arguments that are unusual follow ... */ else if (ERTS_IS_ATOM_STR("logical_processors", BIF_ARG_1)) { int no; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - no = erts_get_cpu_configured(erts_cpuinfo); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + erts_get_logical_processors(&no, NULL, NULL); if (no > 0) BIF_RET(make_small((Uint) no)); else { @@ -2357,9 +2360,7 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) } else if (ERTS_IS_ATOM_STR("logical_processors_online", BIF_ARG_1)) { int no; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - no = erts_get_cpu_online(erts_cpuinfo); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + erts_get_logical_processors(NULL, &no, NULL); if (no > 0) BIF_RET(make_small((Uint) no)); else { @@ -2369,9 +2370,7 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) } else if (ERTS_IS_ATOM_STR("logical_processors_available", BIF_ARG_1)) { int no; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - no = erts_get_cpu_available(erts_cpuinfo); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + erts_get_logical_processors(NULL, NULL, &no); if (no > 0) BIF_RET(make_small((Uint) no)); else { @@ -2533,6 +2532,13 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) BIF_RET(erts_nif_taints(BIF_P)); } else if (ERTS_IS_ATOM_STR("reader_groups_map", BIF_ARG_1)) { BIF_RET(erts_get_reader_groups_map(BIF_P)); + } else if (ERTS_IS_ATOM_STR("dist_buf_busy_limit", BIF_ARG_1)) { + Uint hsz = 0; + + (void) erts_bld_uint(NULL, &hsz, erts_dist_buf_busy_limit); + hp = hsz ? HAlloc(BIF_P, hsz) : NULL; + res = erts_bld_uint(&hp, NULL, erts_dist_buf_busy_limit); + BIF_RET(res); } BIF_ERROR(BIF_P, BADARG); diff --git a/erts/emulator/beam/erl_cpu_topology.c b/erts/emulator/beam/erl_cpu_topology.c new file mode 100644 index 0000000000..db95c4a5d4 --- /dev/null +++ b/erts/emulator/beam/erl_cpu_topology.c @@ -0,0 +1,2359 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2010. All Rights Reserved. + * + * The contents of this file are subject to the Erlang Public License, + * Version 1.1, (the "License"); you may not use this file except in + * compliance with the License. You should have received a copy of the + * Erlang Public License along with this software. If not, it can be + * retrieved online at http://www.erlang.org/. + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * %CopyrightEnd% + */ + +/* + * Description: CPU topology and related functionality + * + * Author: Rickard Green + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <ctype.h> + +#include "global.h" +#include "error.h" +#include "bif.h" +#include "erl_cpu_topology.h" + +#define ERTS_MAX_READER_GROUPS 8 + +/* + * Cpu topology hierarchy. + */ +#define ERTS_TOPOLOGY_NODE 0 +#define ERTS_TOPOLOGY_PROCESSOR 1 +#define ERTS_TOPOLOGY_PROCESSOR_NODE 2 +#define ERTS_TOPOLOGY_CORE 3 +#define ERTS_TOPOLOGY_THREAD 4 +#define ERTS_TOPOLOGY_LOGICAL 5 + +#define ERTS_TOPOLOGY_MAX_DEPTH 6 + +typedef struct { + int bind_id; + int bound_id; +} ErtsCpuBindData; + +static erts_cpu_info_t *cpuinfo; + +static int max_main_threads; +static int reader_groups; + +static ErtsCpuBindData *scheduler2cpu_map; +static erts_smp_rwmtx_t cpuinfo_rwmtx; + +typedef enum { + ERTS_CPU_BIND_UNDEFINED, + ERTS_CPU_BIND_SPREAD, + ERTS_CPU_BIND_PROCESSOR_SPREAD, + ERTS_CPU_BIND_THREAD_SPREAD, + ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD, + ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD, + ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD, + ERTS_CPU_BIND_NO_SPREAD, + ERTS_CPU_BIND_NONE +} ErtsCpuBindOrder; + +#define ERTS_CPU_BIND_DEFAULT_BIND \ + ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD + +static int no_cpu_groups_callbacks; +static ErtsCpuBindOrder cpu_bind_order; + +static erts_cpu_topology_t *user_cpudata; +static int user_cpudata_size; +static erts_cpu_topology_t *system_cpudata; +static int system_cpudata_size; + +typedef struct { + int level[ERTS_TOPOLOGY_MAX_DEPTH+1]; +} erts_avail_cput; + +typedef struct { + int id; + int sub_levels; + int cpu_groups; +} erts_cpu_groups_count_t; + +typedef struct { + int logical; + int cpu_group; +} erts_cpu_groups_map_array_t; + +typedef struct erts_cpu_groups_callback_list_t_ erts_cpu_groups_callback_list_t; +struct erts_cpu_groups_callback_list_t_ { + erts_cpu_groups_callback_list_t *next; + erts_cpu_groups_callback_t callback; + void *arg; +}; + +typedef struct erts_cpu_groups_map_t_ erts_cpu_groups_map_t; +struct erts_cpu_groups_map_t_ { + erts_cpu_groups_map_t *next; + int groups; + erts_cpu_groups_map_array_t *array; + int size; + int logical_processors; + erts_cpu_groups_callback_list_t *callback_list; +}; + +typedef struct { + erts_cpu_groups_callback_t callback; + int ix; + void *arg; +} erts_cpu_groups_callback_call_t; + +static erts_cpu_groups_map_t *cpu_groups_maps; + +static erts_cpu_groups_map_t *reader_groups_map; + +#define ERTS_TOPOLOGY_CG ERTS_TOPOLOGY_MAX_DEPTH + +#define ERTS_MAX_CPU_TOPOLOGY_ID ((int) 0xffff) + +#ifdef ERTS_SMP +static void cpu_bind_order_sort(erts_cpu_topology_t *cpudata, + int size, + ErtsCpuBindOrder bind_order, + int mk_seq); +static void write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size); +#endif + +static void reader_groups_callback(int, ErtsSchedulerData *, int, void *); +static erts_cpu_groups_map_t *add_cpu_groups(int groups, + erts_cpu_groups_callback_t callback, + void *arg); +static void update_cpu_groups_maps(void); +static void make_cpu_groups_map(erts_cpu_groups_map_t *map, int test); +static int cpu_groups_lookup(erts_cpu_groups_map_t *map, + ErtsSchedulerData *esdp); + +static void create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, + int *cpudata_size); +static void destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata); + +static int +int_cmp(const void *vx, const void *vy) +{ + return *((int *) vx) - *((int *) vy); +} + +static int +cpu_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->core != y->core) + return x->core - y->core; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->node != y->node) + return x->node - y->node; + return 0; +} + +static int +cpu_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_thread_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + return 0; +} + +static int +cpu_thread_no_node_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->node != y->node) + return x->node - y->node; + if (x->core != y->core) + return x->core - y->core; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_no_node_processor_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->core != y->core) + return x->core - y->core; + if (x->processor != y->processor) + return x->processor - y->processor; + return 0; +} + +static int +cpu_no_node_thread_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->thread != y->thread) + return x->thread - y->thread; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->core != y->core) + return x->core - y->core; + return 0; +} + +static int +cpu_no_spread_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->node != y->node) + return x->node - y->node; + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->thread != y->thread) + return x->thread - y->thread; + return 0; +} + +static ERTS_INLINE void +make_cpudata_id_seq(erts_cpu_topology_t *cpudata, int size, int no_node) +{ + int ix; + int node = -1; + int processor = -1; + int processor_node = -1; + int processor_node_node = -1; + int core = -1; + int thread = -1; + int old_node = -1; + int old_processor = -1; + int old_processor_node = -1; + int old_core = -1; + int old_thread = -1; + + for (ix = 0; ix < size; ix++) { + if (!no_node || cpudata[ix].node >= 0) { + if (old_node == cpudata[ix].node) + cpudata[ix].node = node; + else { + old_node = cpudata[ix].node; + old_processor = processor = -1; + if (!no_node) + old_processor_node = processor_node = -1; + old_core = core = -1; + old_thread = thread = -1; + if (no_node || cpudata[ix].node >= 0) + cpudata[ix].node = ++node; + } + } + if (old_processor == cpudata[ix].processor) + cpudata[ix].processor = processor; + else { + old_processor = cpudata[ix].processor; + if (!no_node) + processor_node_node = old_processor_node = processor_node = -1; + old_core = core = -1; + old_thread = thread = -1; + cpudata[ix].processor = ++processor; + } + if (no_node && cpudata[ix].processor_node < 0) + old_processor_node = -1; + else { + if (old_processor_node == cpudata[ix].processor_node) { + if (no_node) + cpudata[ix].node = cpudata[ix].processor_node = node; + else { + if (processor_node_node >= 0) + cpudata[ix].node = processor_node_node; + cpudata[ix].processor_node = processor_node; + } + } + else { + old_processor_node = cpudata[ix].processor_node; + old_core = core = -1; + old_thread = thread = -1; + if (no_node) + cpudata[ix].node = cpudata[ix].processor_node = ++node; + else { + cpudata[ix].node = processor_node_node = ++node; + cpudata[ix].processor_node = ++processor_node; + } + } + } + if (!no_node && cpudata[ix].processor_node < 0) + cpudata[ix].processor_node = 0; + if (old_core == cpudata[ix].core) + cpudata[ix].core = core; + else { + old_core = cpudata[ix].core; + old_thread = thread = -1; + cpudata[ix].core = ++core; + } + if (old_thread == cpudata[ix].thread) + cpudata[ix].thread = thread; + else + old_thread = cpudata[ix].thread = ++thread; + } +} + +static void +cpu_bind_order_sort(erts_cpu_topology_t *cpudata, + int size, + ErtsCpuBindOrder bind_order, + int mk_seq) +{ + if (size > 1) { + int no_node = 0; + int (*cmp_func)(const void *, const void *); + switch (bind_order) { + case ERTS_CPU_BIND_SPREAD: + cmp_func = cpu_spread_order_cmp; + break; + case ERTS_CPU_BIND_PROCESSOR_SPREAD: + cmp_func = cpu_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_THREAD_SPREAD: + cmp_func = cpu_thread_spread_order_cmp; + break; + case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: + no_node = 1; + cmp_func = cpu_thread_no_node_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: + no_node = 1; + cmp_func = cpu_no_node_processor_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: + no_node = 1; + cmp_func = cpu_no_node_thread_spread_order_cmp; + break; + case ERTS_CPU_BIND_NO_SPREAD: + cmp_func = cpu_no_spread_order_cmp; + break; + default: + cmp_func = NULL; + erl_exit(ERTS_ABORT_EXIT, + "Bad cpu bind type: %d\n", + (int) cpu_bind_order); + break; + } + + if (mk_seq) + make_cpudata_id_seq(cpudata, size, no_node); + + qsort(cpudata, size, sizeof(erts_cpu_topology_t), cmp_func); + } +} + +static int +processor_order_cmp(const void *vx, const void *vy) +{ + erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; + erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; + + if (x->processor != y->processor) + return x->processor - y->processor; + if (x->node != y->node) + return x->node - y->node; + if (x->processor_node != y->processor_node) + return x->processor_node - y->processor_node; + if (x->core != y->core) + return x->core - y->core; + if (x->thread != y->thread) + return x->thread - y->thread; + return 0; +} + +#ifdef ERTS_SMP +void +erts_sched_check_cpu_bind_prep_suspend(ErtsSchedulerData *esdp) +{ + erts_cpu_groups_map_t *cgm; + erts_cpu_groups_callback_list_t *cgcl; + erts_cpu_groups_callback_call_t *cgcc; + int cgcc_ix; + + /* Unbind from cpu */ + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + if (scheduler2cpu_map[esdp->no].bound_id >= 0 + && erts_unbind_from_cpu(cpuinfo) == 0) { + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; + } + + cgcc = erts_alloc(ERTS_ALC_T_TMP, + (no_cpu_groups_callbacks + * sizeof(erts_cpu_groups_callback_call_t))); + cgcc_ix = 0; + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) { + cgcc[cgcc_ix].callback = cgcl->callback; + cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp); + cgcc[cgcc_ix].arg = cgcl->arg; + cgcc_ix++; + } + } + ASSERT(no_cpu_groups_callbacks == cgcc_ix); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + + for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++) + cgcc[cgcc_ix].callback(1, + esdp, + cgcc[cgcc_ix].ix, + cgcc[cgcc_ix].arg); + + erts_free(ERTS_ALC_T_TMP, cgcc); + + if (esdp->no <= max_main_threads) + erts_thr_set_main_status(0, 0); + +} + +void +erts_sched_check_cpu_bind_post_suspend(ErtsSchedulerData *esdp) +{ + ERTS_SMP_LC_ASSERT(erts_smp_lc_runq_is_locked(esdp->run_queue)); + + if (esdp->no <= max_main_threads) + erts_thr_set_main_status(1, (int) esdp->no); + + /* Make sure we check if we should bind to a cpu or not... */ + if (esdp->run_queue->flags & ERTS_RUNQ_FLG_SHARED_RUNQ) + erts_smp_atomic_set(&esdp->chk_cpu_bind, 1); + else + esdp->run_queue->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; +} + +#endif + +void +erts_sched_check_cpu_bind(ErtsSchedulerData *esdp) +{ + int res, cpu_id, cgcc_ix; + erts_cpu_groups_map_t *cgm; + erts_cpu_groups_callback_list_t *cgcl; + erts_cpu_groups_callback_call_t *cgcc; +#ifdef ERTS_SMP + if (erts_common_run_queue) + erts_smp_atomic_set(&esdp->chk_cpu_bind, 0); + else { + esdp->run_queue->flags &= ~ERTS_RUNQ_FLG_CHK_CPU_BIND; + } +#endif + erts_smp_runq_unlock(esdp->run_queue); + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + cpu_id = scheduler2cpu_map[esdp->no].bind_id; + if (cpu_id >= 0 && cpu_id != scheduler2cpu_map[esdp->no].bound_id) { + res = erts_bind_to_cpu(cpuinfo, cpu_id); + if (res == 0) + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = cpu_id; + else { + erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); + erts_dsprintf(dsbufp, "Scheduler %d failed to bind to cpu %d: %s\n", + (int) esdp->no, cpu_id, erl_errno_id(-res)); + erts_send_error_to_logger_nogl(dsbufp); + if (scheduler2cpu_map[esdp->no].bound_id >= 0) + goto unbind; + } + } + else if (cpu_id < 0) { + unbind: + /* Get rid of old binding */ + res = erts_unbind_from_cpu(cpuinfo); + if (res == 0) + esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; + else if (res != -ENOTSUP) { + erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); + erts_dsprintf(dsbufp, "Scheduler %d failed to unbind from cpu %d: %s\n", + (int) esdp->no, cpu_id, erl_errno_id(-res)); + erts_send_error_to_logger_nogl(dsbufp); + } + } + + cgcc = erts_alloc(ERTS_ALC_T_TMP, + (no_cpu_groups_callbacks + * sizeof(erts_cpu_groups_callback_call_t))); + cgcc_ix = 0; + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) { + cgcc[cgcc_ix].callback = cgcl->callback; + cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp); + cgcc[cgcc_ix].arg = cgcl->arg; + cgcc_ix++; + } + } + + ASSERT(no_cpu_groups_callbacks == cgcc_ix); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + + for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++) + cgcc[cgcc_ix].callback(0, + esdp, + cgcc[cgcc_ix].ix, + cgcc[cgcc_ix].arg); + + erts_free(ERTS_ALC_T_TMP, cgcc); + + erts_smp_runq_lock(esdp->run_queue); +} + +#ifdef ERTS_SMP +void +erts_sched_init_check_cpu_bind(ErtsSchedulerData *esdp) +{ + int cgcc_ix; + erts_cpu_groups_map_t *cgm; + erts_cpu_groups_callback_list_t *cgcl; + erts_cpu_groups_callback_call_t *cgcc; + + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + + cgcc = erts_alloc(ERTS_ALC_T_TMP, + (no_cpu_groups_callbacks + * sizeof(erts_cpu_groups_callback_call_t))); + cgcc_ix = 0; + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) { + cgcc[cgcc_ix].callback = cgcl->callback; + cgcc[cgcc_ix].ix = cpu_groups_lookup(cgm, esdp); + cgcc[cgcc_ix].arg = cgcl->arg; + cgcc_ix++; + } + } + + ASSERT(no_cpu_groups_callbacks == cgcc_ix); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + + for (cgcc_ix = 0; cgcc_ix < no_cpu_groups_callbacks; cgcc_ix++) + cgcc[cgcc_ix].callback(0, + esdp, + cgcc[cgcc_ix].ix, + cgcc[cgcc_ix].arg); + + erts_free(ERTS_ALC_T_TMP, cgcc); + + if (esdp->no <= max_main_threads) + erts_thr_set_main_status(1, (int) esdp->no); +} +#endif + +static void +write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size) +{ + int s_ix = 1; + int cpu_ix; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + if (cpu_bind_order != ERTS_CPU_BIND_NONE && size) { + + cpu_bind_order_sort(cpudata, size, cpu_bind_order, 1); + + for (cpu_ix = 0; cpu_ix < size && cpu_ix < erts_no_schedulers; cpu_ix++) + if (erts_is_cpu_available(cpuinfo, cpudata[cpu_ix].logical)) + scheduler2cpu_map[s_ix++].bind_id = cpudata[cpu_ix].logical; + } + + if (s_ix <= erts_no_schedulers) + for (; s_ix <= erts_no_schedulers; s_ix++) + scheduler2cpu_map[s_ix].bind_id = -1; +} + +int +erts_init_scheduler_bind_type_string(char *how) +{ + if (erts_bind_to_cpu(cpuinfo, -1) == -ENOTSUP) + return ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED; + + if (!system_cpudata && !user_cpudata) + return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY; + + if (sys_strcmp(how, "db") == 0) + cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (sys_strcmp(how, "s") == 0) + cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (sys_strcmp(how, "ps") == 0) + cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "ts") == 0) + cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (sys_strcmp(how, "tnnps") == 0) + cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "nnps") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (sys_strcmp(how, "nnts") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (sys_strcmp(how, "ns") == 0) + cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (sys_strcmp(how, "u") == 0) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else + return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE; + + return ERTS_INIT_SCHED_BIND_TYPE_SUCCESS; +} + +static Eterm +bound_schedulers_term(ErtsCpuBindOrder order) +{ + switch (order) { + case ERTS_CPU_BIND_SPREAD: { + ERTS_DECL_AM(spread); + return AM_spread; + } + case ERTS_CPU_BIND_PROCESSOR_SPREAD: { + ERTS_DECL_AM(processor_spread); + return AM_processor_spread; + } + case ERTS_CPU_BIND_THREAD_SPREAD: { + ERTS_DECL_AM(thread_spread); + return AM_thread_spread; + } + case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: { + ERTS_DECL_AM(thread_no_node_processor_spread); + return AM_thread_no_node_processor_spread; + } + case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: { + ERTS_DECL_AM(no_node_processor_spread); + return AM_no_node_processor_spread; + } + case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: { + ERTS_DECL_AM(no_node_thread_spread); + return AM_no_node_thread_spread; + } + case ERTS_CPU_BIND_NO_SPREAD: { + ERTS_DECL_AM(no_spread); + return AM_no_spread; + } + case ERTS_CPU_BIND_NONE: { + ERTS_DECL_AM(unbound); + return AM_unbound; + } + default: + ASSERT(0); + return THE_NON_VALUE; + } +} + +Eterm +erts_bound_schedulers_term(Process *c_p) +{ + ErtsCpuBindOrder order; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + order = cpu_bind_order; + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return bound_schedulers_term(order); +} + +Eterm +erts_bind_schedulers(Process *c_p, Eterm how) +{ + int notify = 0; + Eterm res; + erts_cpu_topology_t *cpudata; + int cpudata_size; + ErtsCpuBindOrder old_cpu_bind_order; + + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + + if (erts_bind_to_cpu(cpuinfo, -1) == -ENOTSUP) { + ERTS_BIF_PREP_ERROR(res, c_p, EXC_NOTSUP); + } + else { + + old_cpu_bind_order = cpu_bind_order; + + if (ERTS_IS_ATOM_STR("default_bind", how)) + cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (ERTS_IS_ATOM_STR("spread", how)) + cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (ERTS_IS_ATOM_STR("processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("no_spread", how)) + cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (ERTS_IS_ATOM_STR("unbound", how)) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else { + cpu_bind_order = old_cpu_bind_order; + ERTS_BIF_PREP_ERROR(res, c_p, BADARG); + goto done; + } + + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + + if (!cpudata) { + cpu_bind_order = old_cpu_bind_order; + ERTS_BIF_PREP_ERROR(res, c_p, BADARG); + goto done; + } + + write_schedulers_bind_change(cpudata, cpudata_size); + notify = 1; + + destroy_tmp_cpu_topology_copy(cpudata); + + res = bound_schedulers_term(old_cpu_bind_order); + } + + done: + + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + + if (notify) + erts_sched_notify_check_cpu_bind(); + + return res; +} + +int +erts_sched_bind_atthrcreate_prepare(void) +{ + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + return esdp != NULL && erts_is_scheduler_bound(esdp); +} + +int +erts_sched_bind_atthrcreate_child(int unbind) +{ + int res = 0; + if (unbind) { + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + res = erts_unbind_from_cpu(cpuinfo); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + } + return res; +} + +void +erts_sched_bind_atthrcreate_parent(int unbind) +{ + +} + +int +erts_sched_bind_atfork_prepare(void) +{ + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + int unbind = esdp != NULL && erts_is_scheduler_bound(esdp); + if (unbind) + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + return unbind; +} + +int +erts_sched_bind_atfork_child(int unbind) +{ + if (unbind) { + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx) + || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + return erts_unbind_from_cpu(cpuinfo); + } + return 0; +} + +char * +erts_sched_bind_atvfork_child(int unbind) +{ + if (unbind) { + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx) + || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + return erts_get_unbind_from_cpu_str(cpuinfo); + } + return "false"; +} + +void +erts_sched_bind_atfork_parent(int unbind) +{ + if (unbind) + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); +} + +Eterm +erts_fake_scheduler_bindings(Process *p, Eterm how) +{ + ErtsCpuBindOrder fake_cpu_bind_order; + erts_cpu_topology_t *cpudata; + int cpudata_size; + Eterm res; + + if (ERTS_IS_ATOM_STR("default_bind", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; + else if (ERTS_IS_ATOM_STR("spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_SPREAD; + else if (ERTS_IS_ATOM_STR("processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; + else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; + else if (ERTS_IS_ATOM_STR("no_spread", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; + else if (ERTS_IS_ATOM_STR("unbound", how)) + fake_cpu_bind_order = ERTS_CPU_BIND_NONE; + else { + ERTS_BIF_PREP_ERROR(res, p, BADARG); + return res; + } + + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + + if (!cpudata || fake_cpu_bind_order == ERTS_CPU_BIND_NONE) + ERTS_BIF_PREP_RET(res, am_false); + else { + int i; + Eterm *hp; + + cpu_bind_order_sort(cpudata, cpudata_size, fake_cpu_bind_order, 1); + +#ifdef ERTS_FAKE_SCHED_BIND_PRINT_SORTED_CPU_DATA + + erts_fprintf(stderr, "node: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].node); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "processor: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].processor); + erts_fprintf(stderr, "\n"); + if (fake_cpu_bind_order != ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD + && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD + && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD) { + erts_fprintf(stderr, "processor_node:"); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].processor_node); + erts_fprintf(stderr, "\n"); + } + erts_fprintf(stderr, "core: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].core); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "thread: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].thread); + erts_fprintf(stderr, "\n"); + erts_fprintf(stderr, "logical: "); + for (i = 0; i < cpudata_size; i++) + erts_fprintf(stderr, " %2d", cpudata[i].logical); + erts_fprintf(stderr, "\n"); +#endif + + hp = HAlloc(p, cpudata_size+1); + ERTS_BIF_PREP_RET(res, make_tuple(hp)); + *hp++ = make_arityval((Uint) cpudata_size); + for (i = 0; i < cpudata_size; i++) + *hp++ = make_small((Uint) cpudata[i].logical); + } + + destroy_tmp_cpu_topology_copy(cpudata); + + return res; +} + +Eterm +erts_get_schedulers_binds(Process *c_p) +{ + int ix; + ERTS_DECL_AM(unbound); + Eterm *hp = HAlloc(c_p, erts_no_schedulers+1); + Eterm res = make_tuple(hp); + + *(hp++) = make_arityval(erts_no_schedulers); + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + for (ix = 1; ix <= erts_no_schedulers; ix++) + *(hp++) = (scheduler2cpu_map[ix].bound_id >= 0 + ? make_small(scheduler2cpu_map[ix].bound_id) + : AM_unbound); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +/* + * CPU topology + */ + +typedef struct { + int *id; + int used; + int size; +} ErtsCpuTopIdSeq; + +typedef struct { + ErtsCpuTopIdSeq logical; + ErtsCpuTopIdSeq thread; + ErtsCpuTopIdSeq core; + ErtsCpuTopIdSeq processor_node; + ErtsCpuTopIdSeq processor; + ErtsCpuTopIdSeq node; +} ErtsCpuTopEntry; + +static void +init_cpu_top_entry(ErtsCpuTopEntry *cte) +{ + int size = 10; + cte->logical.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->logical.size = size; + cte->thread.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->thread.size = size; + cte->core.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->core.size = size; + cte->processor_node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->processor_node.size = size; + cte->processor.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->processor.size = size; + cte->node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, + sizeof(int)*size); + cte->node.size = size; +} + +static void +destroy_cpu_top_entry(ErtsCpuTopEntry *cte) +{ + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->logical.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->thread.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->core.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor_node.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor.id); + erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->node.id); +} + +static int +get_cput_value_or_range(int *v, int *vr, char **str) +{ + long l; + char *c = *str; + errno = 0; + if (!isdigit((unsigned char)*c)) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; + l = strtol(c, &c, 10); + if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; + *v = (int) l; + if (*c == '-') { + c++; + if (!isdigit((unsigned char)*c)) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + l = strtol(c, &c, 10); + if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + *vr = (int) l; + } + *str = c; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +get_cput_id_seq(ErtsCpuTopIdSeq *idseq, char **str) +{ + int ix = 0; + int need_size = 0; + char *c = *str; + + while (1) { + int res; + int val; + int nids; + int val_range = -1; + res = get_cput_value_or_range(&val, &val_range, &c); + if (res != ERTS_INIT_CPU_TOPOLOGY_OK) + return res; + if (val_range < 0 || val_range == val) + nids = 1; + else { + if (val_range > val) + nids = val_range - val + 1; + else + nids = val - val_range + 1; + } + need_size += nids; + if (need_size > idseq->size) { + idseq->size = need_size + 10; + idseq->id = erts_realloc(ERTS_ALC_T_TMP_CPU_IDS, + idseq->id, + sizeof(int)*idseq->size); + } + if (nids == 1) + idseq->id[ix++] = val; + else if (val_range > val) { + for (; val <= val_range; val++) + idseq->id[ix++] = val; + } + else { + for (; val >= val_range; val--) + idseq->id[ix++] = val; + } + if (*c != ',') + break; + c++; + } + *str = c; + idseq->used = ix; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +get_cput_entry(ErtsCpuTopEntry *cput, char **str) +{ + int h; + char *c = *str; + + cput->logical.used = 0; + cput->thread.id[0] = 0; + cput->thread.used = 1; + cput->core.id[0] = 0; + cput->core.used = 1; + cput->processor_node.id[0] = -1; + cput->processor_node.used = 1; + cput->processor.id[0] = 0; + cput->processor.used = 1; + cput->node.id[0] = -1; + cput->node.used = 1; + + h = ERTS_TOPOLOGY_MAX_DEPTH; + while (*c != ':' && *c != '\0') { + int res; + ErtsCpuTopIdSeq *idseqp; + switch (*c++) { + case 'L': + if (h <= ERTS_TOPOLOGY_LOGICAL) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->logical; + h = ERTS_TOPOLOGY_LOGICAL; + break; + case 't': + case 'T': + if (h <= ERTS_TOPOLOGY_THREAD) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->thread; + h = ERTS_TOPOLOGY_THREAD; + break; + case 'c': + case 'C': + if (h <= ERTS_TOPOLOGY_CORE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->core; + h = ERTS_TOPOLOGY_CORE; + break; + case 'p': + case 'P': + if (h <= ERTS_TOPOLOGY_PROCESSOR) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->processor; + h = ERTS_TOPOLOGY_PROCESSOR; + break; + case 'n': + case 'N': + if (h <= ERTS_TOPOLOGY_PROCESSOR) { + do_node: + if (h <= ERTS_TOPOLOGY_NODE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->node; + h = ERTS_TOPOLOGY_NODE; + } + else { + int p_node = 0; + char *p_chk = c; + while (*p_chk != '\0' && *p_chk != ':') { + if (*p_chk == 'p' || *p_chk == 'P') { + p_node = 1; + break; + } + p_chk++; + } + if (!p_node) + goto do_node; + if (h <= ERTS_TOPOLOGY_PROCESSOR_NODE) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; + idseqp = &cput->processor_node; + h = ERTS_TOPOLOGY_PROCESSOR_NODE; + } + break; + default: + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE; + } + res = get_cput_id_seq(idseqp, &c); + if (res != ERTS_INIT_CPU_TOPOLOGY_OK) + return res; + } + + if (cput->logical.used < 1) + return ERTS_INIT_CPU_TOPOLOGY_MISSING_LID; + + if (*c == ':') { + c++; + } + + if (cput->thread.used != 1 + && cput->thread.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->core.used != 1 + && cput->core.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->processor_node.used != 1 + && cput->processor_node.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->processor.used != 1 + && cput->processor.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + if (cput->node.used != 1 + && cput->node.used != cput->logical.used) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; + + *str = c; + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +static int +verify_topology(erts_cpu_topology_t *cpudata, int size) +{ + if (size > 0) { + int *logical; + int node, processor, no_nodes, i; + + /* Verify logical ids */ + logical = erts_alloc(ERTS_ALC_T_TMP, sizeof(int)*size); + + for (i = 0; i < size; i++) + logical[i] = cpudata[i].logical; + + qsort(logical, size, sizeof(int), int_cmp); + for (i = 0; i < size-1; i++) { + if (logical[i] == logical[i+1]) { + erts_free(ERTS_ALC_T_TMP, logical); + return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS; + } + } + + erts_free(ERTS_ALC_T_TMP, logical); + + qsort(cpudata, size, sizeof(erts_cpu_topology_t), processor_order_cmp); + + /* Verify unique entities */ + + for (i = 1; i < size; i++) { + if (cpudata[i-1].processor == cpudata[i].processor + && cpudata[i-1].node == cpudata[i].node + && (cpudata[i-1].processor_node + == cpudata[i].processor_node) + && cpudata[i-1].core == cpudata[i].core + && cpudata[i-1].thread == cpudata[i].thread) { + return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES; + } + } + + /* Verify numa nodes */ + node = cpudata[0].node; + processor = cpudata[0].processor; + no_nodes = cpudata[0].node < 0 && cpudata[0].processor_node < 0; + for (i = 1; i < size; i++) { + if (no_nodes) { + if (cpudata[i].node >= 0 || cpudata[i].processor_node >= 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + } + else { + if (cpudata[i].processor == processor && cpudata[i].node != node) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + node = cpudata[i].node; + processor = cpudata[i].processor; + if (node >= 0 && cpudata[i].processor_node >= 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + if (node < 0 && cpudata[i].processor_node < 0) + return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; + } + } + } + + return ERTS_INIT_CPU_TOPOLOGY_OK; +} + +int +erts_init_cpu_topology_string(char *topology_str) +{ + ErtsCpuTopEntry cput; + int need_size; + char *c; + int ix; + int error = ERTS_INIT_CPU_TOPOLOGY_OK; + + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata_size = 10; + + user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + + init_cpu_top_entry(&cput); + + ix = 0; + need_size = 0; + + c = topology_str; + if (*c == '\0') { + error = ERTS_INIT_CPU_TOPOLOGY_MISSING; + goto fail; + } + do { + int r; + error = get_cput_entry(&cput, &c); + if (error != ERTS_INIT_CPU_TOPOLOGY_OK) + goto fail; + need_size += cput.logical.used; + if (user_cpudata_size < need_size) { + user_cpudata_size = need_size + 10; + user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, + user_cpudata, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + } + + ASSERT(cput.thread.used == 1 + || cput.thread.used == cput.logical.used); + ASSERT(cput.core.used == 1 + || cput.core.used == cput.logical.used); + ASSERT(cput.processor_node.used == 1 + || cput.processor_node.used == cput.logical.used); + ASSERT(cput.processor.used == 1 + || cput.processor.used == cput.logical.used); + ASSERT(cput.node.used == 1 + || cput.node.used == cput.logical.used); + + for (r = 0; r < cput.logical.used; r++) { + user_cpudata[ix].logical = cput.logical.id[r]; + user_cpudata[ix].thread = + cput.thread.id[cput.thread.used == 1 ? 0 : r]; + user_cpudata[ix].core = + cput.core.id[cput.core.used == 1 ? 0 : r]; + user_cpudata[ix].processor_node = + cput.processor_node.id[cput.processor_node.used == 1 ? 0 : r]; + user_cpudata[ix].processor = + cput.processor.id[cput.processor.used == 1 ? 0 : r]; + user_cpudata[ix].node = + cput.node.id[cput.node.used == 1 ? 0 : r]; + ix++; + } + } while (*c != '\0'); + + if (user_cpudata_size != ix) { + user_cpudata_size = ix; + user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, + user_cpudata, + (sizeof(erts_cpu_topology_t) + * user_cpudata_size)); + } + + error = verify_topology(user_cpudata, user_cpudata_size); + if (error == ERTS_INIT_CPU_TOPOLOGY_OK) { + destroy_cpu_top_entry(&cput); + return ERTS_INIT_CPU_TOPOLOGY_OK; + } + + fail: + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata_size = 0; + destroy_cpu_top_entry(&cput); + return error; +} + +#define ERTS_GET_CPU_TOPOLOGY_ERROR -1 +#define ERTS_GET_USED_CPU_TOPOLOGY 0 +#define ERTS_GET_DETECTED_CPU_TOPOLOGY 1 +#define ERTS_GET_DEFINED_CPU_TOPOLOGY 2 + +static Eterm get_cpu_topology_term(Process *c_p, int type); + +Eterm +erts_set_cpu_topology(Process *c_p, Eterm term) +{ + erts_cpu_topology_t *cpudata = NULL; + int cpudata_size = 0; + Eterm res; + + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + res = get_cpu_topology_term(c_p, ERTS_GET_USED_CPU_TOPOLOGY); + if (term == am_undefined) { + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata = NULL; + user_cpudata_size = 0; + + if (cpu_bind_order != ERTS_CPU_BIND_NONE && system_cpudata) { + cpudata_size = system_cpudata_size; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + + sys_memcpy((void *) cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*cpudata_size); + } + } + else if (is_not_list(term)) { + error: + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + res = THE_NON_VALUE; + goto done; + } + else { + Eterm list = term; + int ix = 0; + + cpudata_size = 100; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + + while (is_list(list)) { + Eterm *lp = list_val(list); + Eterm cpu = CAR(lp); + Eterm* tp; + Sint id; + + if (is_not_tuple(cpu)) + goto error; + + tp = tuple_val(cpu); + + if (arityval(tp[0]) != 7 || tp[1] != am_cpu) + goto error; + + if (ix >= cpudata_size) { + cpudata_size += 100; + cpudata = erts_realloc(ERTS_ALC_T_TMP, + cpudata, + (sizeof(erts_cpu_topology_t) + * cpudata_size)); + } + + id = signed_val(tp[2]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].node = (int) id; + + id = signed_val(tp[3]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].processor = (int) id; + + id = signed_val(tp[4]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].processor_node = (int) id; + + id = signed_val(tp[5]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].core = (int) id; + + id = signed_val(tp[6]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].thread = (int) id; + + id = signed_val(tp[7]); + if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) + goto error; + cpudata[ix].logical = (int) id; + + list = CDR(lp); + ix++; + } + + if (is_not_nil(list)) + goto error; + + cpudata_size = ix; + + if (ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(cpudata, cpudata_size)) + goto error; + + if (user_cpudata_size != cpudata_size) { + if (user_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); + user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + sizeof(erts_cpu_topology_t)*cpudata_size); + user_cpudata_size = cpudata_size; + } + + sys_memcpy((void *) user_cpudata, + (void *) cpudata, + sizeof(erts_cpu_topology_t)*cpudata_size); + } + + update_cpu_groups_maps(); + + write_schedulers_bind_change(cpudata, cpudata_size); + + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + erts_sched_notify_check_cpu_bind(); + + done: + + if (cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); + + return res; +} + +static void +create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, int *cpudata_size) +{ + if (user_cpudata) { + *cpudata_size = user_cpudata_size; + *cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * (*cpudata_size))); + sys_memcpy((void *) *cpudata, + (void *) user_cpudata, + sizeof(erts_cpu_topology_t)*(*cpudata_size)); + } + else if (system_cpudata) { + *cpudata_size = system_cpudata_size; + *cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * (*cpudata_size))); + sys_memcpy((void *) *cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*(*cpudata_size)); + } + else { + *cpudata = NULL; + *cpudata_size = 0; + } +} + +static void +destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata) +{ + if (cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); +} + + +static Eterm +bld_topology_term(Eterm **hpp, + Uint *hszp, + erts_cpu_topology_t *cpudata, + int size) +{ + Eterm res = NIL; + int i; + + if (size == 0) + return am_undefined; + + for (i = size-1; i >= 0; i--) { + res = erts_bld_cons(hpp, + hszp, + erts_bld_tuple(hpp, + hszp, + 7, + am_cpu, + make_small(cpudata[i].node), + make_small(cpudata[i].processor), + make_small(cpudata[i].processor_node), + make_small(cpudata[i].core), + make_small(cpudata[i].thread), + make_small(cpudata[i].logical)), + res); + } + return res; +} + +static Eterm +get_cpu_topology_term(Process *c_p, int type) +{ +#ifdef DEBUG + Eterm *hp_end; +#endif + Eterm *hp; + Uint hsz; + Eterm res = THE_NON_VALUE; + erts_cpu_topology_t *cpudata = NULL; + int size = 0; + + switch (type) { + case ERTS_GET_USED_CPU_TOPOLOGY: + if (user_cpudata) + goto defined; + else + goto detected; + case ERTS_GET_DETECTED_CPU_TOPOLOGY: + detected: + if (!system_cpudata) + res = am_undefined; + else { + size = system_cpudata_size; + cpudata = erts_alloc(ERTS_ALC_T_TMP, + (sizeof(erts_cpu_topology_t) + * size)); + sys_memcpy((void *) cpudata, + (void *) system_cpudata, + sizeof(erts_cpu_topology_t)*size); + } + break; + case ERTS_GET_DEFINED_CPU_TOPOLOGY: + defined: + if (!user_cpudata) + res = am_undefined; + else { + size = user_cpudata_size; + cpudata = user_cpudata; + } + break; + default: + erl_exit(ERTS_ABORT_EXIT, "Bad cpu topology type: %d\n", type); + break; + } + + if (res == am_undefined) { + ASSERT(!cpudata); + return res; + } + + hsz = 0; + + bld_topology_term(NULL, &hsz, + cpudata, size); + + hp = HAlloc(c_p, hsz); + +#ifdef DEBUG + hp_end = hp + hsz; +#endif + + res = bld_topology_term(&hp, NULL, + cpudata, size); + + ASSERT(hp_end == hp); + + if (cpudata && cpudata != system_cpudata && cpudata != user_cpudata) + erts_free(ERTS_ALC_T_TMP, cpudata); + + return res; +} + +Eterm +erts_get_cpu_topology_term(Process *c_p, Eterm which) +{ + Eterm res; + int type; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + if (ERTS_IS_ATOM_STR("used", which)) + type = ERTS_GET_USED_CPU_TOPOLOGY; + else if (ERTS_IS_ATOM_STR("detected", which)) + type = ERTS_GET_DETECTED_CPU_TOPOLOGY; + else if (ERTS_IS_ATOM_STR("defined", which)) + type = ERTS_GET_DEFINED_CPU_TOPOLOGY; + else + type = ERTS_GET_CPU_TOPOLOGY_ERROR; + if (type == ERTS_GET_CPU_TOPOLOGY_ERROR) + res = THE_NON_VALUE; + else + res = get_cpu_topology_term(c_p, type); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +static void +get_logical_processors(int *conf, int *onln, int *avail) +{ + if (conf) + *conf = erts_get_cpu_configured(cpuinfo); + if (onln) + *onln = erts_get_cpu_online(cpuinfo); + if (avail) + *avail = erts_get_cpu_available(cpuinfo); +} + +void +erts_get_logical_processors(int *conf, int *onln, int *avail) +{ + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + get_logical_processors(conf, onln, avail); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); +} + +void +erts_pre_early_init_cpu_topology(int *max_rg_p, + int *conf_p, + int *onln_p, + int *avail_p) +{ + cpu_groups_maps = NULL; + no_cpu_groups_callbacks = 0; + *max_rg_p = ERTS_MAX_READER_GROUPS; + cpuinfo = erts_cpu_info_create(); + get_logical_processors(conf_p, onln_p, avail_p); +} + +void +erts_early_init_cpu_topology(int no_schedulers, + int *max_main_threads_p, + int max_reader_groups, + int *reader_groups_p) +{ + user_cpudata = NULL; + user_cpudata_size = 0; + + system_cpudata_size = erts_get_cpu_topology_size(cpuinfo); + system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * system_cpudata_size)); + + cpu_bind_order = ERTS_CPU_BIND_UNDEFINED; + + if (!erts_get_cpu_topology(cpuinfo, system_cpudata) + || ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(system_cpudata, + system_cpudata_size)) { + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + system_cpudata = NULL; + system_cpudata_size = 0; + } + + max_main_threads = erts_get_cpu_configured(cpuinfo); + if (max_main_threads > no_schedulers) + max_main_threads = no_schedulers; + *max_main_threads_p = max_main_threads; + + reader_groups = max_main_threads; + if (reader_groups <= 1 || max_reader_groups <= 1) + reader_groups = 0; + if (reader_groups > max_reader_groups) + reader_groups = max_reader_groups; + *reader_groups_p = reader_groups; +} + +void +erts_init_cpu_topology(void) +{ + int ix; + + erts_smp_rwmtx_init(&cpuinfo_rwmtx, "cpu_info"); + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + + scheduler2cpu_map = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(ErtsCpuBindData) + * (erts_no_schedulers+1))); + for (ix = 1; ix <= erts_no_schedulers; ix++) { + scheduler2cpu_map[ix].bind_id = -1; + scheduler2cpu_map[ix].bound_id = -1; + } + + if (cpu_bind_order == ERTS_CPU_BIND_UNDEFINED) { + int ncpus = erts_get_cpu_configured(cpuinfo); + if (ncpus < 1 || erts_no_schedulers < ncpus) + cpu_bind_order = ERTS_CPU_BIND_NONE; + else + cpu_bind_order = ((system_cpudata || user_cpudata) + && (erts_bind_to_cpu(cpuinfo, -1) != -ENOTSUP) + ? ERTS_CPU_BIND_DEFAULT_BIND + : ERTS_CPU_BIND_NONE); + } + + reader_groups_map = add_cpu_groups(reader_groups, + reader_groups_callback, + NULL); + + if (cpu_bind_order == ERTS_CPU_BIND_NONE) + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + else { + erts_cpu_topology_t *cpudata; + int cpudata_size; + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + write_schedulers_bind_change(cpudata, cpudata_size); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + erts_sched_notify_check_cpu_bind(); + destroy_tmp_cpu_topology_copy(cpudata); + } +} + +int +erts_update_cpu_info(void) +{ + int changed; + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + changed = erts_cpu_info_update(cpuinfo); + if (changed) { + erts_cpu_topology_t *cpudata; + int cpudata_size; + + if (system_cpudata) + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + + system_cpudata_size = erts_get_cpu_topology_size(cpuinfo); + if (!system_cpudata_size) + system_cpudata = NULL; + else { + system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, + (sizeof(erts_cpu_topology_t) + * system_cpudata_size)); + + if (!erts_get_cpu_topology(cpuinfo, system_cpudata) + || (ERTS_INIT_CPU_TOPOLOGY_OK + != verify_topology(system_cpudata, + system_cpudata_size))) { + erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); + system_cpudata = NULL; + system_cpudata_size = 0; + } + } + + update_cpu_groups_maps(); + + create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); + write_schedulers_bind_change(cpudata, cpudata_size); + destroy_tmp_cpu_topology_copy(cpudata); + } + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); + if (changed) + erts_sched_notify_check_cpu_bind(); + return changed; +} + +/* + * reader groups map + */ + +void +reader_groups_callback(int suspending, + ErtsSchedulerData *esdp, + int group, + void *unused) +{ + if (reader_groups && esdp->no <= max_main_threads) + erts_smp_rwmtx_set_reader_group(suspending ? 0 : group+1); +} + +static Eterm get_cpu_groups_map(Process *c_p, + erts_cpu_groups_map_t *map, + int offset); +Eterm +erts_debug_reader_groups_map(Process *c_p, int groups) +{ + Eterm res; + erts_cpu_groups_map_t test; + + test.array = NULL; + test.groups = groups; + make_cpu_groups_map(&test, 1); + if (!test.array) + res = NIL; + else { + res = get_cpu_groups_map(c_p, &test, 1); + erts_free(ERTS_ALC_T_TMP, test.array); + } + return res; +} + + +Eterm +erts_get_reader_groups_map(Process *c_p) +{ + Eterm res; + erts_smp_rwmtx_rlock(&cpuinfo_rwmtx); + res = get_cpu_groups_map(c_p, reader_groups_map, 1); + erts_smp_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + +/* + * CPU groups + */ + +static Eterm +get_cpu_groups_map(Process *c_p, + erts_cpu_groups_map_t *map, + int offset) +{ +#ifdef DEBUG + Eterm *endp; +#endif + Eterm res = NIL, tuple; + Eterm *hp; + int i; + + hp = HAlloc(c_p, map->logical_processors*(2+3)); +#ifdef DEBUG + endp = hp + map->logical_processors*(2+3); +#endif + for (i = map->size - 1; i >= 0; i--) { + if (map->array[i].logical >= 0) { + tuple = TUPLE2(hp, + make_small(map->array[i].logical), + make_small(map->array[i].cpu_group + offset)); + hp += 3; + res = CONS(hp, tuple, res); + hp += 2; + } + } + ASSERT(hp == endp); + return res; +} + +static void +make_available_cpu_topology(erts_avail_cput *no, + erts_avail_cput *avail, + erts_cpu_topology_t *cpudata, + int *size, + int test) +{ + int len = *size; + erts_cpu_topology_t last; + int a, i, j; + + no->level[ERTS_TOPOLOGY_NODE] = -1; + no->level[ERTS_TOPOLOGY_PROCESSOR] = -1; + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE] = -1; + no->level[ERTS_TOPOLOGY_CORE] = -1; + no->level[ERTS_TOPOLOGY_THREAD] = -1; + no->level[ERTS_TOPOLOGY_LOGICAL] = -1; + + last.node = INT_MIN; + last.processor = INT_MIN; + last.processor_node = INT_MIN; + last.core = INT_MIN; + last.thread = INT_MIN; + last.logical = INT_MIN; + + a = 0; + + for (i = 0; i < len; i++) { + + if (!test && !erts_is_cpu_available(cpuinfo, cpudata[i].logical)) + continue; + + if (last.node != cpudata[i].node) + goto node; + if (last.processor != cpudata[i].processor) + goto processor; + if (last.processor_node != cpudata[i].processor_node) + goto processor_node; + if (last.core != cpudata[i].core) + goto core; + ASSERT(last.thread != cpudata[i].thread); + goto thread; + + node: + no->level[ERTS_TOPOLOGY_NODE]++; + processor: + no->level[ERTS_TOPOLOGY_PROCESSOR]++; + processor_node: + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; + core: + no->level[ERTS_TOPOLOGY_CORE]++; + thread: + no->level[ERTS_TOPOLOGY_THREAD]++; + + no->level[ERTS_TOPOLOGY_LOGICAL]++; + + for (j = 0; j < ERTS_TOPOLOGY_LOGICAL; j++) + avail[a].level[j] = no->level[j]; + + avail[a].level[ERTS_TOPOLOGY_LOGICAL] = cpudata[i].logical; + avail[a].level[ERTS_TOPOLOGY_CG] = 0; + + ASSERT(last.logical != cpudata[i].logical); + + last = cpudata[i]; + a++; + } + + no->level[ERTS_TOPOLOGY_NODE]++; + no->level[ERTS_TOPOLOGY_PROCESSOR]++; + no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; + no->level[ERTS_TOPOLOGY_CORE]++; + no->level[ERTS_TOPOLOGY_THREAD]++; + no->level[ERTS_TOPOLOGY_LOGICAL]++; + + *size = a; +} + +static void +cpu_group_insert(erts_cpu_groups_map_t *map, + int logical, int cpu_group) +{ + int start = logical % map->size; + int ix = start; + + do { + if (map->array[ix].logical < 0) { + map->array[ix].logical = logical; + map->array[ix].cpu_group = cpu_group; + return; + } + ix++; + if (ix == map->size) + ix = 0; + } while (ix != start); + + erl_exit(ERTS_ABORT_EXIT, "Reader groups map full\n"); +} + + +static int +sub_levels(erts_cpu_groups_count_t *cgc, int level, int aix, + int avail_sz, erts_avail_cput *avail) +{ + int sub_level = level+1; + int last = -1; + cgc->sub_levels = 0; + + do { + if (last != avail[aix].level[sub_level]) { + cgc->sub_levels++; + last = avail[aix].level[sub_level]; + } + aix++; + } + while (aix < avail_sz && cgc->id == avail[aix].level[level]); + cgc->cpu_groups = 0; + return aix; +} + +static int +write_cpu_groups(int *cgp, erts_cpu_groups_count_t *cgcp, + int level, int a, + int avail_sz, erts_avail_cput *avail) +{ + int cg = *cgp; + int sub_level = level+1; + int sl_per_gr = cgcp->sub_levels / cgcp->cpu_groups; + int xsl = cgcp->sub_levels % cgcp->cpu_groups; + int sls = 0; + int last = -1; + int xsl_cg_lim = (cgcp->cpu_groups - xsl) + cg + 1; + + ASSERT(level < 0 || avail[a].level[level] == cgcp->id); + + do { + if (last != avail[a].level[sub_level]) { + if (!sls) { + sls = sl_per_gr; + cg++; + if (cg >= xsl_cg_lim) + sls++; + } + last = avail[a].level[sub_level]; + sls--; + } + avail[a].level[ERTS_TOPOLOGY_CG] = cg; + a++; + } while (a < avail_sz && (level < 0 + || avail[a].level[level] == cgcp->id)); + + ASSERT(cgcp->cpu_groups == cg - *cgp); + + *cgp = cg; + + return a; +} + +static int +cg_count_sub_levels_compare(const void *vx, const void *vy) +{ + erts_cpu_groups_count_t *x = (erts_cpu_groups_count_t *) vx; + erts_cpu_groups_count_t *y = (erts_cpu_groups_count_t *) vy; + if (x->sub_levels != y->sub_levels) + return y->sub_levels - x->sub_levels; + return x->id - y->id; +} + +static int +cg_count_id_compare(const void *vx, const void *vy) +{ + erts_cpu_groups_count_t *x = (erts_cpu_groups_count_t *) vx; + erts_cpu_groups_count_t *y = (erts_cpu_groups_count_t *) vy; + return x->id - y->id; +} + +static void +make_cpu_groups_map(erts_cpu_groups_map_t *map, int test) +{ + int i, spread_level, avail_sz; + erts_avail_cput no, *avail; + erts_cpu_topology_t *cpudata; + ErtsAlcType_t alc_type = (test + ? ERTS_ALC_T_TMP + : ERTS_ALC_T_CPU_GRPS_MAP); + + if (map->array) + erts_free(alc_type, map->array); + + map->array = NULL; + map->logical_processors = 0; + map->size = 0; + + if (!map->groups) + return; + + create_tmp_cpu_topology_copy(&cpudata, &avail_sz); + + if (!cpudata) + return; + + cpu_bind_order_sort(cpudata, + avail_sz, + ERTS_CPU_BIND_NO_SPREAD, + 1); + + avail = erts_alloc(ERTS_ALC_T_TMP, + sizeof(erts_avail_cput)*avail_sz); + + make_available_cpu_topology(&no, avail, cpudata, + &avail_sz, test); + + destroy_tmp_cpu_topology_copy(cpudata); + + map->size = avail_sz*2+1; + + map->array = erts_alloc(alc_type, + (sizeof(erts_cpu_groups_map_array_t) + * map->size));; + map->logical_processors = avail_sz; + + for (i = 0; i < map->size; i++) { + map->array[i].logical = -1; + map->array[i].cpu_group = -1; + } + + spread_level = ERTS_TOPOLOGY_CORE; + for (i = ERTS_TOPOLOGY_NODE; i < ERTS_TOPOLOGY_THREAD; i++) { + if (no.level[i] > map->groups) { + spread_level = i; + break; + } + } + + if (no.level[spread_level] <= map->groups) { + int a, cg, last = -1; + cg = -1; + ASSERT(spread_level == ERTS_TOPOLOGY_CORE); + for (a = 0; a < avail_sz; a++) { + if (last != avail[a].level[spread_level]) { + cg++; + last = avail[a].level[spread_level]; + } + cpu_group_insert(map, + avail[a].level[ERTS_TOPOLOGY_LOGICAL], + cg); + } + } + else { /* map->groups < no.level[spread_level] */ + erts_cpu_groups_count_t *cg_count; + int a, cg, tl, toplevels; + + tl = spread_level-1; + + if (spread_level == ERTS_TOPOLOGY_NODE) + toplevels = 1; + else + toplevels = no.level[tl]; + + cg_count = erts_alloc(ERTS_ALC_T_TMP, + toplevels*sizeof(erts_cpu_groups_count_t)); + + if (toplevels == 1) { + cg_count[0].id = 0; + cg_count[0].sub_levels = no.level[spread_level]; + cg_count[0].cpu_groups = map->groups; + } + else { + int cgs_per_tl, cgs; + cgs = map->groups; + cgs_per_tl = cgs / toplevels; + + a = 0; + for (i = 0; i < toplevels; i++) { + cg_count[i].id = avail[a].level[tl]; + a = sub_levels(&cg_count[i], tl, a, avail_sz, avail); + } + + qsort(cg_count, + toplevels, + sizeof(erts_cpu_groups_count_t), + cg_count_sub_levels_compare); + + for (i = 0; i < toplevels; i++) { + if (cg_count[i].sub_levels < cgs_per_tl) { + cg_count[i].cpu_groups = cg_count[i].sub_levels; + cgs -= cg_count[i].sub_levels; + } + else { + cg_count[i].cpu_groups = cgs_per_tl; + cgs -= cgs_per_tl; + } + } + + while (cgs > 0) { + for (i = 0; i < toplevels; i++) { + if (cg_count[i].sub_levels == cg_count[i].cpu_groups) + break; + else { + cg_count[i].cpu_groups++; + if (--cgs == 0) + break; + } + } + } + + qsort(cg_count, + toplevels, + sizeof(erts_cpu_groups_count_t), + cg_count_id_compare); + } + + a = i = 0; + cg = -1; + while (a < avail_sz) { + a = write_cpu_groups(&cg, &cg_count[i], tl, + a, avail_sz, avail); + i++; + } + + ASSERT(map->groups == cg + 1); + + for (a = 0; a < avail_sz; a++) + cpu_group_insert(map, + avail[a].level[ERTS_TOPOLOGY_LOGICAL], + avail[a].level[ERTS_TOPOLOGY_CG]); + + erts_free(ERTS_ALC_T_TMP, cg_count); + } + + erts_free(ERTS_ALC_T_TMP, avail); +} + +static erts_cpu_groups_map_t * +add_cpu_groups(int groups, + erts_cpu_groups_callback_t callback, + void *arg) +{ + int use_groups = groups; + erts_cpu_groups_callback_list_t *cgcl; + erts_cpu_groups_map_t *cgm; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + if (use_groups > max_main_threads) + use_groups = max_main_threads; + + if (!use_groups) + return NULL; + + no_cpu_groups_callbacks++; + cgcl = erts_alloc(ERTS_ALC_T_CPU_GRPS_MAP, + sizeof(erts_cpu_groups_callback_list_t)); + cgcl->callback = callback; + cgcl->arg = arg; + + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + if (cgm->groups == use_groups) { + cgcl->next = cgm->callback_list; + cgm->callback_list = cgcl; + return cgm; + } + } + + + cgm = erts_alloc(ERTS_ALC_T_CPU_GRPS_MAP, + sizeof(erts_cpu_groups_map_t)); + cgm->next = cpu_groups_maps; + cgm->groups = use_groups; + cgm->array = NULL; + cgm->size = 0; + cgm->logical_processors = 0; + cgm->callback_list = cgcl; + + cgcl->next = NULL; + + make_cpu_groups_map(cgm, 0); + + cpu_groups_maps = cgm; + + return cgm; +} + +static void +remove_cpu_groups(erts_cpu_groups_callback_t callback, void *arg) +{ + erts_cpu_groups_map_t *prev_cgm, *cgm; + erts_cpu_groups_callback_list_t *prev_cgcl, *cgcl; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + no_cpu_groups_callbacks--; + + prev_cgm = NULL; + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) { + prev_cgcl = NULL; + for (cgcl = cgm->callback_list; cgcl; cgcl = cgcl->next) { + if (cgcl->callback == callback && cgcl->arg == arg) { + if (prev_cgcl) + prev_cgcl->next = cgcl->next; + else + cgm->callback_list = cgcl->next; + erts_free(ERTS_ALC_T_CPU_GRPS_MAP, cgcl); + if (!cgm->callback_list) { + if (prev_cgm) + prev_cgm->next = cgm->next; + else + cpu_groups_maps = cgm->next; + if (cgm->array) + erts_free(ERTS_ALC_T_CPU_GRPS_MAP, cgm->array); + erts_free(ERTS_ALC_T_CPU_GRPS_MAP, cgm); + } + return; + } + prev_cgcl = cgcl; + } + prev_cgm = cgm; + } + + erl_exit(ERTS_ABORT_EXIT, "Cpu groups not found\n"); +} + +static int +cpu_groups_lookup(erts_cpu_groups_map_t *map, + ErtsSchedulerData *esdp) +{ + int start, logical, ix; + + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rlocked(&cpuinfo_rwmtx) + || erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + if (esdp->cpu_id < 0) + return (((int) esdp->no) - 1) % map->groups; + + logical = esdp->cpu_id; + start = logical % map->size; + ix = start; + + do { + if (map->array[ix].logical == logical) { + int group = map->array[ix].cpu_group; + ASSERT(0 <= group && group < map->groups); + return group; + } + ix++; + if (ix == map->size) + ix = 0; + } while (ix != start); + + erl_exit(ERTS_ABORT_EXIT, "Logical cpu id %d not found\n", logical); +} + +static void +update_cpu_groups_maps(void) +{ + erts_cpu_groups_map_t *cgm; + ERTS_SMP_LC_ASSERT(erts_lc_rwmtx_is_rwlocked(&cpuinfo_rwmtx)); + + for (cgm = cpu_groups_maps; cgm; cgm = cgm->next) + make_cpu_groups_map(cgm, 0); +} + +void +erts_add_cpu_groups(int groups, + erts_cpu_groups_callback_t callback, + void *arg) +{ + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + add_cpu_groups(groups, callback, arg); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); +} + +void erts_remove_cpu_groups(erts_cpu_groups_callback_t callback, + void *arg) +{ + erts_smp_rwmtx_rwlock(&cpuinfo_rwmtx); + remove_cpu_groups(callback, arg); + erts_smp_rwmtx_rwunlock(&cpuinfo_rwmtx); +} diff --git a/erts/emulator/beam/erl_cpu_topology.h b/erts/emulator/beam/erl_cpu_topology.h new file mode 100644 index 0000000000..c5a9520b61 --- /dev/null +++ b/erts/emulator/beam/erl_cpu_topology.h @@ -0,0 +1,105 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2010. All Rights Reserved. + * + * The contents of this file are subject to the Erlang Public License, + * Version 1.1, (the "License"); you may not use this file except in + * compliance with the License. You should have received a copy of the + * Erlang Public License along with this software. If not, it can be + * retrieved online at http://www.erlang.org/. + * + * Software distributed under the License is distributed on an "AS IS" + * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See + * the License for the specific language governing rights and limitations + * under the License. + * + * %CopyrightEnd% + */ + +/* + * Description: CPU topology and related functionality + * + * Author: Rickard Green + */ + +#ifndef ERL_CPU_TOPOLOGY_H__ +#define ERL_CPU_TOPOLOGY_H__ + +void erts_pre_early_init_cpu_topology(int *max_rg_p, + int *conf_p, + int *onln_p, + int *avail_p); +void erts_early_init_cpu_topology(int no_schedulers, + int *max_main_threads_p, + int max_reader_groups, + int *reader_groups_p); +void erts_init_cpu_topology(void); + + +#define ERTS_INIT_SCHED_BIND_TYPE_SUCCESS 0 +#define ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED 1 +#define ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY 2 +#define ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE 3 + +int erts_init_scheduler_bind_type_string(char *how); + + +#define ERTS_INIT_CPU_TOPOLOGY_OK 0 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID 1 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE 2 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY 3 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE 4 +#define ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES 5 +#define ERTS_INIT_CPU_TOPOLOGY_MISSING_LID 6 +#define ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS 7 +#define ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES 8 +#define ERTS_INIT_CPU_TOPOLOGY_MISSING 9 + +int erts_init_cpu_topology_string(char *topology_str); + +void erts_sched_check_cpu_bind(ErtsSchedulerData *esdp); +#ifdef ERTS_SMP +void erts_sched_init_check_cpu_bind(ErtsSchedulerData *esdp); +void erts_sched_check_cpu_bind_prep_suspend(ErtsSchedulerData *esdp); +void erts_sched_check_cpu_bind_post_suspend(ErtsSchedulerData *esdp); +#endif + +int erts_update_cpu_info(void); + +Eterm erts_bind_schedulers(Process *c_p, Eterm how); +Eterm erts_get_schedulers_binds(Process *c_p); + +Eterm erts_get_reader_groups_map(Process *c_p); + +Eterm erts_set_cpu_topology(Process *c_p, Eterm term); +Eterm erts_get_cpu_topology_term(Process *c_p, Eterm which); + +int erts_update_cpu_info(void); +void erts_get_logical_processors(int *conf, int *onln, int *avail); + +int erts_sched_bind_atthrcreate_prepare(void); +int erts_sched_bind_atthrcreate_child(int unbind); +void erts_sched_bind_atthrcreate_parent(int unbind); + +int erts_sched_bind_atfork_prepare(void); +int erts_sched_bind_atfork_child(int unbind); +char *erts_sched_bind_atvfork_child(int unbind); +void erts_sched_bind_atfork_parent(int unbind); + +Eterm erts_fake_scheduler_bindings(Process *p, Eterm how); +Eterm erts_debug_cpu_groups_map(Process *c_p, int groups); + + +typedef void (*erts_cpu_groups_callback_t)(int, + ErtsSchedulerData *, + int, + void *); + +void erts_add_cpu_groups(int groups, + erts_cpu_groups_callback_t callback, + void *arg); +void erts_remove_cpu_groups(erts_cpu_groups_callback_t callback, + void *arg); + +#endif diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c index 1c2c0fe4cb..8577354d27 100644 --- a/erts/emulator/beam/erl_db.c +++ b/erts/emulator/beam/erl_db.c @@ -179,6 +179,7 @@ extern DbTableMethod db_tree; int user_requested_db_max_tabs; int erts_ets_realloc_always_moves; +int erts_ets_always_compress; static int db_max_tabs; static DbTable *meta_pid_to_tab; /* Pid mapped to owned tables */ static DbTable *meta_pid_to_fixed_tab; /* Pid mapped to fixed tables */ @@ -931,7 +932,7 @@ BIF_RETTYPE ets_update_counter_3(BIF_ALIST_3) position > arityval(handle.dbterm->tpl[0])) { goto finalize; } - oldcnt = handle.dbterm->tpl[position]; + oldcnt = db_do_read_element(&handle, position); if (is_big(oldcnt)) { halloc_size += BIG_NEED_SIZE(big_arity(oldcnt)); } @@ -1276,7 +1277,7 @@ BIF_RETTYPE ets_new_2(BIF_ALIST_2) UWord heir_data; Uint32 status; Sint keypos; - int is_named, is_fine_locked, frequent_read; + int is_named, is_fine_locked, frequent_read, is_compressed; int cret; DeclareTmpHeap(meta_tuple,3,BIF_P); DbTableMethod* meth; @@ -1296,6 +1297,7 @@ BIF_RETTYPE ets_new_2(BIF_ALIST_2) frequent_read = 0; heir = am_none; heir_data = (UWord) am_undefined; + is_compressed = erts_ets_always_compress; list = BIF_ARG_2; while(is_list(list)) { @@ -1358,6 +1360,9 @@ BIF_RETTYPE ets_new_2(BIF_ALIST_2) else if (val == am_named_table) { is_named = 1; } + else if (val == am_compressed) { + is_compressed = 1; + } else if (val == am_set || val == am_protected) ; else break; @@ -1418,6 +1423,7 @@ BIF_RETTYPE ets_new_2(BIF_ALIST_2) erts_smp_atomic_init(&tb->common.nitems, 0); tb->common.fixations = NULL; + tb->common.compress = is_compressed; cret = meth->db_create(BIF_P, tb); ASSERT(cret == DB_ERROR_NONE); @@ -2572,7 +2578,7 @@ BIF_RETTYPE ets_match_object_3(BIF_ALIST_3) BIF_RETTYPE ets_info_1(BIF_ALIST_1) { static Eterm fields[] = {am_protection, am_keypos, am_type, am_named_table, - am_node, am_size, am_name, am_heir, am_owner, am_memory}; + am_node, am_size, am_name, am_heir, am_owner, am_memory, am_compressed}; Eterm results[sizeof(fields)/sizeof(Eterm)]; DbTable* tb; Eterm res; @@ -2837,6 +2843,7 @@ void init_db(void) erts_smp_atomic_init(&meta_pid_to_tab->common.nitems, 0); meta_pid_to_tab->common.slot = -1; meta_pid_to_tab->common.meth = &db_hash; + meta_pid_to_tab->common.compress = 0; erts_refc_init(&meta_pid_to_tab->common.ref, 1); erts_refc_init(&meta_pid_to_tab->common.fixref, 0); @@ -2869,6 +2876,7 @@ void init_db(void) erts_smp_atomic_init(&meta_pid_to_fixed_tab->common.nitems, 0); meta_pid_to_fixed_tab->common.slot = -1; meta_pid_to_fixed_tab->common.meth = &db_hash; + meta_pid_to_fixed_tab->common.compress = 0; erts_refc_init(&meta_pid_to_fixed_tab->common.ref, 1); erts_refc_init(&meta_pid_to_fixed_tab->common.fixref, 0); @@ -3077,7 +3085,7 @@ retry: db_unlock(tb,LCK_WRITE); heir_data = tb->common.heir_data; if (!is_immed(heir_data)) { - Eterm* tpv = DBTERM_BUF((DbTerm*)heir_data); /* tuple_val */ + Eterm* tpv = ((DbTerm*)heir_data)->tpl; /* tuple_val */ ASSERT(arityval(*tpv) == 1); heir_data = tpv[1]; } @@ -3251,7 +3259,8 @@ erts_db_process_exiting(Process *c_p, ErtsProcLocks c_p_locks) pp = &(*pp)->next) { if ((*pp)->pid == pid) { DbFixation* fix = *pp; - erts_refc_add(&tb->common.fixref,-fix->counter,0); + long diff = -(long)fix->counter; + erts_refc_add(&tb->common.fixref,diff,0); *pp = fix->next; erts_db_free(ERTS_ALC_T_DB_FIXATION, tb, fix, sizeof(DbFixation)); @@ -3469,8 +3478,8 @@ static void set_heir(Process* me, DbTable* tb, Eterm heir, UWord heir_data) UseTmpHeap(2,me); /* Make a dummy 1-tuple around data to use db_get_term() */ - heir_data = (UWord) db_get_term(&tb->common, NULL, 0, - TUPLE1(tmp,heir_data)); + heir_data = (UWord) db_store_term(&tb->common, NULL, 0, + TUPLE1(tmp,heir_data)); UnUseTmpHeap(2,me); ASSERT(!is_immed(heir_data)); } @@ -3481,7 +3490,7 @@ static void free_heir_data(DbTable* tb) { if (tb->common.heir != am_none && !is_immed(tb->common.heir_data)) { DbTerm* p = (DbTerm*) tb->common.heir_data; - db_free_term_data(p); + db_cleanup_offheap_comp(p); erts_db_free(ERTS_ALC_T_DB_TERM, tb, (void *)p, sizeof(DbTerm) + (p->size-1)*sizeof(Eterm)); } @@ -3618,10 +3627,13 @@ static Eterm table_info(Process* p, DbTable* tb, Eterm What) ret = erts_this_dist_entry->sysname; } else if (What == am_named_table) { ret = is_atom(tb->common.id) ? am_true : am_false; + } else if (What == am_compressed) { + ret = tb->common.compress ? am_true : am_false; + } /* * For debugging purposes */ - } else if (What == am_data) { + else if (What == am_data) { print_table(ERTS_PRINT_STDOUT, NULL, 1, tb); ret = am_true; } else if (What == am_atom_put("fixed",5)) { diff --git a/erts/emulator/beam/erl_db.h b/erts/emulator/beam/erl_db.h index 7da28fad29..cb2da603f0 100644 --- a/erts/emulator/beam/erl_db.h +++ b/erts/emulator/beam/erl_db.h @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1996-2009. All Rights Reserved. + * Copyright Ericsson AB 1996-2010. All Rights Reserved. * * The contents of this file are subject to the Erlang Public License, * Version 1.1, (the "License"); you may not use this file except in @@ -61,6 +61,7 @@ void erts_db_foreach_offheap(DbTable *, extern int user_requested_db_max_tabs; /* set in erl_init */ extern int erts_ets_realloc_always_moves; /* set in erl_init */ +extern int erts_ets_always_compress; /* set in erl_init */ extern Export ets_select_delete_continue_exp; extern Export ets_select_count_continue_exp; extern Export ets_select_continue_exp; diff --git a/erts/emulator/beam/erl_db_hash.c b/erts/emulator/beam/erl_db_hash.c index fa707f4eed..14ee63100a 100644 --- a/erts/emulator/beam/erl_db_hash.c +++ b/erts/emulator/beam/erl_db_hash.c @@ -267,11 +267,11 @@ static ERTS_INLINE Sint next_slot_w(DbTableHash* tb, Uint ix, */ #define BIN_FLAG_ALL_OBJECTS BIN_FLAG_USR1 -/* - * Size calculations - */ -#define SIZ_OVERHEAD ((sizeof(HashDbTerm)/sizeof(Eterm)) - 1) -#define SIZ_DBTERM(HDT) (SIZ_OVERHEAD + (HDT)->dbterm.size) + +static ERTS_INLINE void free_term(DbTableHash *tb, HashDbTerm* p) +{ + db_free_term((DbTable*)tb, p, offsetof(HashDbTerm, dbterm)); +} /* * Local types @@ -358,10 +358,8 @@ static HashDbTerm* search_list(DbTableHash* tb, Eterm key, HashValue hval, HashDbTerm *list); static void shrink(DbTableHash* tb, int nactive); static void grow(DbTableHash* tb, int nactive); -static void free_term(DbTableHash *tb, HashDbTerm* p); -static Eterm put_term_list(Process* p, HashDbTerm* ptr1, HashDbTerm* ptr2); -static HashDbTerm* get_term(DbTableHash* tb, HashDbTerm* old, - Eterm obj, HashValue hval); +static Eterm build_term_list(Process* p, HashDbTerm* ptr1, HashDbTerm* ptr2, + DbTableHash*); static int analyze_pattern(DbTableHash *tb, Eterm pattern, struct mp_info *mpi); @@ -442,6 +440,7 @@ static ERTS_INLINE int has_live_key(DbTableHash* tb, HashDbTerm* b, if (b->hvalue != hval) return 0; else { Eterm itemKey = GETKEY(tb, b->dbterm.tpl); + ASSERT(!is_header(itemKey)); return EQ(key,itemKey); } } @@ -454,10 +453,38 @@ static ERTS_INLINE int has_key(DbTableHash* tb, HashDbTerm* b, if (b->hvalue != hval && b->hvalue != INVALID_HASH) return 0; else { Eterm itemKey = GETKEY(tb, b->dbterm.tpl); + ASSERT(!is_header(itemKey)); return EQ(key,itemKey); } } +static ERTS_INLINE HashDbTerm* new_dbterm(DbTableHash* tb, Eterm obj) +{ + HashDbTerm* p; + if (tb->common.compress) { + p = db_store_term_comp(&tb->common, NULL, offsetof(HashDbTerm,dbterm), obj); + } + else { + p = db_store_term(&tb->common, NULL, offsetof(HashDbTerm,dbterm), obj); + } + return p; +} + +static ERTS_INLINE HashDbTerm* replace_dbterm(DbTableHash* tb, HashDbTerm* old, + Eterm obj) +{ + HashDbTerm* ret; + ASSERT(old != NULL); + if (tb->common.compress) { + ret = db_store_term_comp(&tb->common, &(old->dbterm), offsetof(HashDbTerm,dbterm), obj); + } + else { + ret = db_store_term(&tb->common, &(old->dbterm), offsetof(HashDbTerm,dbterm), obj); + } + return ret; +} + + /* ** External interface @@ -764,7 +791,7 @@ int db_put_hash(DbTable *tbl, Eterm obj, int key_clash_fail) ret = DB_ERROR_BADKEY; goto Ldone; } - q = get_term(tb, b, obj, hval); + q = replace_dbterm(tb, b, obj); q->next = bnext; q->hvalue = hval; /* In case of INVALID_HASH */ *bp = q; @@ -784,7 +811,7 @@ int db_put_hash(DbTable *tbl, Eterm obj, int key_clash_fail) HashDbTerm** qp = bp; q = b; do { - if (eq(make_tuple(q->dbterm.tpl), obj)) { + if (db_eq(&tb->common,obj,&q->dbterm)) { if (q->hvalue == INVALID_HASH) { erts_smp_atomic_inc(&tb->common.nitems); q->hvalue = hval; @@ -803,7 +830,8 @@ int db_put_hash(DbTable *tbl, Eterm obj, int key_clash_fail) /*else DB_DUPLICATE_BAG */ Lnew: - q = get_term(tb, NULL, obj, hval); + q = new_dbterm(tb, obj); + q->hvalue = hval; q->next = b; *bp = q; nitems = erts_smp_atomic_inctest(&tb->common.nitems); @@ -844,7 +872,7 @@ int db_get_hash(Process *p, DbTable *tbl, Eterm key, Eterm *ret) while(b2 != NULL && has_key(tb,b2,key,hval)) b2 = b2->next; } - copy = put_term_list(p, b1, b2); + copy = build_term_list(p, b1, b2, tb); CHECK_TABLES(); *ret = copy; goto done; @@ -967,13 +995,10 @@ static int db_get_element_hash(Process *p, DbTable *tbl, while(b1 != 0) { if (has_live_key(tb,b1,key,hval)) { - Eterm copy; - if (ndex > arityval(b1->dbterm.tpl[0])) { retval = DB_ERROR_BADITEM; goto done; } - if (tb->common.status & (DB_BAG | DB_DUPLICATE_BAG)) { HashDbTerm* b; HashDbTerm* b2 = b1->next; @@ -987,15 +1012,12 @@ static int db_get_element_hash(Process *p, DbTable *tbl, } b2 = b2->next; } - b = b1; while(b != b2) { if (b->hvalue != INVALID_HASH) { Eterm *hp; - Uint sz = size_object(b->dbterm.tpl[ndex])+2; - - hp = HAlloc(p, sz); - copy = copy_struct(b->dbterm.tpl[ndex], sz-2, &hp, &MSO(p)); + Eterm copy = db_copy_element_from_ets(&tb->common, p, + &b->dbterm, ndex, &hp, 2); elem_list = CONS(hp, copy, elem_list); hp += 2; } @@ -1004,8 +1026,8 @@ static int db_get_element_hash(Process *p, DbTable *tbl, *ret = elem_list; } else { - COPY_OBJECT(b1->dbterm.tpl[ndex], p, ©); - *ret = copy; + Eterm* hp; + *ret = db_copy_element_from_ets(&tb->common, p, &b1->dbterm, ndex, &hp, 0); } retval = DB_ERROR_NONE; goto done; @@ -1040,6 +1062,7 @@ int db_erase_bag_exact2(DbTable *tbl, Eterm key, Eterm value) ASSERT(!IS_FIXED(tb)); ASSERT((tb->common.status & DB_BAG)); + ASSERT(!tb->common.compress); while(b != 0) { if (has_live_key(tb,b,key,hval)) { @@ -1139,7 +1162,7 @@ static int db_erase_object_hash(DbTable *tbl, Eterm object, Eterm *ret) while(b != 0) { if (has_live_key(tb,b,key,hval)) { ++nkeys; - if (eq(object, make_tuple(b->dbterm.tpl))) { + if (db_eq(&tb->common,object, &b->dbterm)) { --nitems_diff; if (nkeys==1 && IS_FIXED(tb)) { /* Pseudo remove */ add_fixed_deletion(tb,ix); @@ -1188,7 +1211,7 @@ static int db_slot_hash(Process *p, DbTable *tbl, Eterm slot_term, Eterm *ret) lck = RLOCK_HASH(tb, slot); nactive = NACTIVE(tb); if (slot < nactive) { - *ret = put_term_list(p, BUCKET(tb, slot), 0); + *ret = build_term_list(p, BUCKET(tb, slot), 0, tb); retval = DB_ERROR_NONE; } else if (slot == nactive) { @@ -1232,8 +1255,6 @@ static int db_select_continue_hash(Process *p, int num_left = 1000; HashDbTerm *current = 0; Eterm match_list; - Uint32 dummy; - unsigned sz; Eterm *hp; Eterm match_res; Sint got; @@ -1285,26 +1306,14 @@ static int db_select_continue_hash(Process *p, } for(;;) { if (current->hvalue != INVALID_HASH && - (match_res = - db_prog_match(p,mp, - make_tuple(current->dbterm.tpl), - NULL,0,&dummy), + (match_res = db_prog_match_and_copy(&tb->common, p, mp, all_objects, + ¤t->dbterm, &hp, 2), is_value(match_res))) { - if (all_objects) { - hp = HAlloc(p, current->dbterm.size + 2); - match_res = copy_shallow(DBTERM_BUF(¤t->dbterm), - current->dbterm.size, - &hp, - &MSO(p)); - } else { - sz = size_object(match_res); - - hp = HAlloc(p, sz + 2); - match_res = copy_struct(match_res, sz, &hp, &MSO(p)); - } - match_list = CONS(hp, match_res, match_list); + + match_list = CONS(hp, match_res, match_list); ++got; } + --num_left; save_slot_ix = slot_ix; if ((current = next(tb, (Uint*)&slot_ix, &lck, current)) == NULL) { @@ -1395,9 +1404,7 @@ static int db_select_chunk_hash(Process *p, DbTable *tbl, HashDbTerm *current = 0; unsigned current_list_pos = 0; Eterm match_list; - Uint32 dummy; Eterm match_res; - unsigned sz; Eterm *hp; int num_left = 1000; Uint got = 0; @@ -1464,22 +1471,9 @@ static int db_select_chunk_hash(Process *p, DbTable *tbl, for(;;) { if (current != NULL) { if (current->hvalue != INVALID_HASH) { - match_res = db_prog_match(p,mpi.mp, - make_tuple(current->dbterm.tpl), - NULL,0,&dummy); + match_res = db_prog_match_and_copy(&tb->common, p, mpi.mp, 0, + ¤t->dbterm, &hp, 2); if (is_value(match_res)) { - if (mpi.all_objects) { - hp = HAlloc(p, current->dbterm.size + 2); - match_res = copy_shallow(DBTERM_BUF(¤t->dbterm), - current->dbterm.size, - &hp, - &MSO(p)); - } else { - sz = size_object(match_res); - - hp = HAlloc(p, sz + 2); - match_res = copy_struct(match_res, sz, &hp, &MSO(p)); - } match_list = CONS(hp, match_res, match_list); ++got; } @@ -1594,7 +1588,6 @@ static int db_select_count_hash(Process *p, Uint slot_ix = 0; HashDbTerm* current = NULL; unsigned current_list_pos = 0; - Uint32 dummy; Eterm *hp; int num_left = 1000; Uint got = 0; @@ -1644,8 +1637,8 @@ static int db_select_count_hash(Process *p, for(;;) { if (current != NULL) { if (current->hvalue != INVALID_HASH) { - if (db_prog_match(p, mpi.mp, make_tuple(current->dbterm.tpl), - NULL,0, &dummy) == am_true) { + if (db_prog_match_and_copy(&tb->common, p, mpi.mp, 0, + ¤t->dbterm, NULL,0) == am_true) { ++got; } --num_left; @@ -1713,7 +1706,6 @@ static int db_select_delete_hash(Process *p, Uint slot_ix = 0; HashDbTerm **current = NULL; unsigned current_list_pos = 0; - Uint32 dummy; Eterm *hp; int num_left = 1000; Uint got = 0; @@ -1794,9 +1786,8 @@ static int db_select_delete_hash(Process *p, } else { int did_erase = 0; - if ((db_prog_match(p,mpi.mp, - make_tuple((*current)->dbterm.tpl), - NULL,0,&dummy)) == am_true) { + if (db_prog_match_and_copy(&tb->common, p, mpi.mp, 0, + &(*current)->dbterm, NULL, 0) == am_true) { if (NFIXED(tb) > fixated_by_me) { /* fixated by others? */ if (slot_ix != last_pseudo_delete) { add_fixed_deletion(tb, slot_ix); @@ -1859,7 +1850,6 @@ static int db_select_delete_continue_hash(Process *p, Uint slot_ix; Uint last_pseudo_delete = (Uint)-1; HashDbTerm **current = NULL; - Uint32 dummy; Eterm *hp; int num_left = 1000; Uint got; @@ -1907,8 +1897,8 @@ static int db_select_delete_continue_hash(Process *p, } else { int did_erase = 0; - if ((db_prog_match(p,mp,make_tuple((*current)->dbterm.tpl), - NULL,0,&dummy)) == am_true) { + if (db_prog_match_and_copy(&tb->common, p, mp, 0, + &(*current)->dbterm, NULL, 0) == am_true) { if (NFIXED(tb) > fixated_by_me) { /* fixated by others? */ if (slot_ix != last_pseudo_delete) { add_fixed_deletion(tb, slot_ix); @@ -1970,7 +1960,6 @@ static int db_select_count_continue_hash(Process *p, DbTableHash *tb = &tbl->hash; Uint slot_ix; HashDbTerm* current; - Uint32 dummy; Eterm *hp; int num_left = 1000; Uint got; @@ -2008,8 +1997,8 @@ static int db_select_count_continue_hash(Process *p, current = current->next; continue; } - if (db_prog_match(p, mp, make_tuple(current->dbterm.tpl), - NULL,0,&dummy) == am_true) { + if (db_prog_match_and_copy(&tb->common, p, mp, 0, ¤t->dbterm, + NULL, 0) == am_true) { ++got; } --num_left; @@ -2454,31 +2443,19 @@ static int free_seg(DbTableHash *tb, int free_records) } -static HashDbTerm* get_term(DbTableHash* tb, HashDbTerm* old, - Eterm obj, HashValue hval) -{ - HashDbTerm* p = db_get_term((DbTableCommon *) tb, - (old != NULL) ? &(old->dbterm) : NULL, - ((char *) &(old->dbterm)) - ((char *) old), - obj); - p->hvalue = hval; - /*p->next = NULL;*/ /*No Need */ - return p; -} - - /* ** Copy terms from ptr1 until ptr2 ** works for ptr1 == ptr2 == 0 => [] ** or ptr2 == 0 */ -static Eterm put_term_list(Process* p, HashDbTerm* ptr1, HashDbTerm* ptr2) +static Eterm build_term_list(Process* p, HashDbTerm* ptr1, HashDbTerm* ptr2, + DbTableHash* tb) { int sz = 0; HashDbTerm* ptr; Eterm list = NIL; Eterm copy; - Eterm *hp; + Eterm *hp, *hend; ptr = ptr1; while(ptr != ptr2) { @@ -2490,26 +2467,20 @@ static Eterm put_term_list(Process* p, HashDbTerm* ptr1, HashDbTerm* ptr2) } hp = HAlloc(p, sz); + hend = hp + sz; ptr = ptr1; while(ptr != ptr2) { if (ptr->hvalue != INVALID_HASH) { - copy = copy_shallow(DBTERM_BUF(&ptr->dbterm), ptr->dbterm.size, &hp, &MSO(p)); + copy = db_copy_object_from_ets(&tb->common, &ptr->dbterm, &hp, &MSO(p)); list = CONS(hp, copy, list); hp += 2; } ptr = ptr->next; } - return list; -} + HRelease(p,hend,hp); -static void free_term(DbTableHash *tb, HashDbTerm* p) -{ - db_free_term_data(&(p->dbterm)); - erts_db_free(ERTS_ALC_T_DB_TERM, - (DbTable *) tb, - (void *) p, - SIZ_DBTERM(p)*sizeof(Eterm)); + return list; } /* Grow table with one new bucket. @@ -2720,8 +2691,8 @@ static int db_lookup_dbterm_hash(DbTable *tbl, Eterm key, DbUpdateHandle* handle handle->tb = tbl; handle->bp = (void**) prevp; handle->dbterm = &b->dbterm; - handle->new_size = b->dbterm.size; handle->mustResize = 0; + handle->new_size = b->dbterm.size; handle->lck = lck; /* KEEP hval WLOCKED, db_finalize_dbterm_hash will WUNLOCK */ return 1; @@ -2742,37 +2713,14 @@ static void db_finalize_dbterm_hash(DbUpdateHandle* handle) erts_smp_rwmtx_t* lck = (erts_smp_rwmtx_t*) handle->lck; ERTS_SMP_LC_ASSERT(IS_HASH_WLOCKED(&tbl->hash,lck)); /* locked by db_lookup_dbterm_hash */ - ASSERT(&oldp->dbterm == handle->dbterm); - if (handle->mustResize) { - ErlOffHeap tmp_offheap; - Eterm* top; - Eterm copy; - DbTerm* newDbTerm; - HashDbTerm* newp = erts_db_alloc(ERTS_ALC_T_DB_TERM, tbl, - sizeof(HashDbTerm)+sizeof(Eterm)*(handle->new_size-1)); - sys_memcpy(newp, oldp, sizeof(HashDbTerm)-sizeof(DbTerm)); /* copy only hashtab header */ - *(handle->bp) = newp; - newDbTerm = &newp->dbterm; - - newDbTerm->size = handle->new_size; - tmp_offheap.first = NULL; - tmp_offheap.overhead = 0; - - /* make a flat copy */ - top = DBTERM_BUF(newDbTerm); - copy = copy_struct(make_tuple(handle->dbterm->tpl), - handle->new_size, - &top, &tmp_offheap); - newDbTerm->first_oh = tmp_offheap.first; - DBTERM_SET_TPL(newDbTerm,tuple_val(copy)); + ASSERT((&oldp->dbterm == handle->dbterm) == !(tbl->common.compress && handle->mustResize)); + if (handle->mustResize) { + db_finalize_resize(handle, offsetof(HashDbTerm,dbterm)); WUNLOCK_HASH(lck); - - db_free_term_data(handle->dbterm); - erts_db_free(ERTS_ALC_T_DB_TERM, tbl, - (void *) (((char *) handle->dbterm) - (sizeof(HashDbTerm) - sizeof(DbTerm))), - sizeof(HashDbTerm) + sizeof(Eterm)*(handle->dbterm->size-1)); + + free_term(&tbl->hash, oldp); } else { WUNLOCK_HASH(lck); @@ -2781,7 +2729,7 @@ static void db_finalize_dbterm_hash(DbUpdateHandle* handle) handle->dbterm = 0; #endif return; -} +} static int db_delete_all_objects_hash(Process* p, DbTable* tbl) { diff --git a/erts/emulator/beam/erl_db_tree.c b/erts/emulator/beam/erl_db_tree.c index 5644e85f97..8108494fc5 100644 --- a/erts/emulator/beam/erl_db_tree.c +++ b/erts/emulator/beam/erl_db_tree.c @@ -122,12 +122,41 @@ static void release_stack(DbTableTree* tb, DbTreeStack* stack) } } -static void reset_static_stack(DbTableTree* tb) +static ERTS_INLINE void reset_static_stack(DbTableTree* tb) { tb->static_stack.pos = 0; tb->static_stack.slot = 0; } +static ERTS_INLINE void free_term(DbTableTree *tb, TreeDbTerm* p) +{ + db_free_term((DbTable*)tb, p, offsetof(TreeDbTerm, dbterm)); +} + +static ERTS_INLINE TreeDbTerm* new_dbterm(DbTableTree *tb, Eterm obj) +{ + TreeDbTerm* p; + if (tb->common.compress) { + p = db_store_term_comp(&tb->common, NULL, offsetof(TreeDbTerm,dbterm), obj); + } + else { + p = db_store_term(&tb->common, NULL, offsetof(TreeDbTerm,dbterm), obj); + } + return p; +} +static ERTS_INLINE TreeDbTerm* replace_dbterm(DbTableTree *tb, TreeDbTerm* old, + Eterm obj) +{ + TreeDbTerm* p; + ASSERT(old != NULL); + if (tb->common.compress) { + p = db_store_term_comp(&tb->common, &(old->dbterm), offsetof(TreeDbTerm,dbterm), obj); + } + else { + p = db_store_term(&tb->common, &(old->dbterm), offsetof(TreeDbTerm,dbterm), obj); + } + return p; +} /* ** Some macros for "direction stacks" @@ -178,12 +207,6 @@ static void do_dump_tree2(int to, void *to_arg, int show, TreeDbTerm *t, #endif /* - * Size calculations - */ -#define SIZ_OVERHEAD ((sizeof(TreeDbTerm)/sizeof(Eterm)) - 1) -#define SIZ_DBTERM(TDT) (SIZ_OVERHEAD + (TDT)->dbterm.size) - -/* ** Datatypes */ @@ -263,9 +286,6 @@ static TreeDbTerm *linkout_tree(DbTableTree *tb, Eterm key); static TreeDbTerm *linkout_object_tree(DbTableTree *tb, Eterm object); static int do_free_tree_cont(DbTableTree *tb, int num_left); -static TreeDbTerm* get_term(DbTableTree *tb, - TreeDbTerm* old, - Eterm obj); static void free_term(DbTableTree *tb, TreeDbTerm* p); static int balance_left(TreeDbTerm **this); static int balance_right(TreeDbTerm **this); @@ -622,7 +642,7 @@ static int db_put_tree(DbTable *tbl, Eterm obj, int key_clash_fail) erts_smp_atomic_dec(&tb->common.nitems); return DB_ERROR_SYSRES; } - *this = get_term(tb, NULL, obj); + *this = new_dbterm(tb, obj); (*this)->balance = 0; (*this)->left = (*this)->right = NULL; break; @@ -636,7 +656,7 @@ static int db_put_tree(DbTable *tbl, Eterm obj, int key_clash_fail) tstack[tpos++] = this; this = &((*this)->right); } else if (!key_clash_fail) { /* Equal key and this is a set, replace. */ - *this = get_term(tb, *this, obj); + *this = replace_dbterm(tb, *this, obj); break; } else { return DB_ERROR_BADKEY; /* key already exists */ @@ -714,7 +734,7 @@ static int db_get_tree(Process *p, DbTable *tbl, Eterm key, Eterm *ret) { DbTableTree *tb = &tbl->tree; Eterm copy; - Eterm *hp; + Eterm *hp, *hend; TreeDbTerm *this; /* @@ -728,11 +748,11 @@ static int db_get_tree(Process *p, DbTable *tbl, Eterm key, Eterm *ret) *ret = NIL; } else { hp = HAlloc(p, this->dbterm.size + 2); - copy = copy_shallow(DBTERM_BUF(&this->dbterm), - this->dbterm.size, - &hp, - &MSO(p)); + hend = hp + this->dbterm.size + 2; + copy = db_copy_object_from_ets(&tb->common, &this->dbterm, &hp, &MSO(p)); *ret = CONS(hp, copy, NIL); + hp += 2; + HRelease(p,hend,hp); } return DB_ERROR_NONE; } @@ -766,18 +786,10 @@ static int db_get_element_tree(Process *p, DbTable *tbl, if (this == NULL) { return DB_ERROR_BADKEY; } else { - Eterm element; - Uint sz; if (ndex > arityval(this->dbterm.tpl[0])) { return DB_ERROR_BADPARAM; } - element = this->dbterm.tpl[ndex]; - sz = size_object(element); - hp = HAlloc(p, sz); - *ret = copy_struct(element, - sz, - &hp, - &MSO(p)); + *ret = db_copy_element_from_ets(&tb->common, p, &this->dbterm, ndex, &hp, 0); } return DB_ERROR_NONE; } @@ -815,7 +827,7 @@ static int db_slot_tree(Process *p, DbTable *tbl, DbTableTree *tb = &tbl->tree; Sint slot; TreeDbTerm *st; - Eterm *hp; + Eterm *hp, *hend; Eterm copy; /* @@ -847,11 +859,11 @@ static int db_slot_tree(Process *p, DbTable *tbl, return DB_ERROR_UNSPEC; } hp = HAlloc(p, st->dbterm.size + 2); - copy = copy_shallow(DBTERM_BUF(&st->dbterm), - st->dbterm.size, - &hp, - &MSO(p)); + hend = hp + st->dbterm.size + 2; + copy = db_copy_object_from_ets(&tb->common, &st->dbterm, &hp, &MSO(p)); *ret = CONS(hp, copy, NIL); + hp += 2; + HRelease(p,hend,hp); return DB_ERROR_NONE; } @@ -1738,7 +1750,7 @@ static int db_select_delete_tree(Process *p, DbTable *tbl, ** Other interface routines (not directly coupled to one bif) */ -/* Display hash table contents (for dump) */ +/* Display tree contents (for dump) */ static void db_print_tree(int to, void *to_arg, int show, DbTable *tbl) @@ -1926,7 +1938,7 @@ static TreeDbTerm *linkout_object_tree(DbTableTree *tb, tstack[tpos++] = this; this = &((*this)->right); } else { /* Equal key, found the only possible matching object*/ - if (!eq(object,make_tuple((*this)->dbterm.tpl))) { + if (!db_eq(&tb->common,object,&(*this)->dbterm)) { return NULL; } q = (*this); @@ -2079,15 +2091,6 @@ static void do_dump_tree(int to, void *to_arg, TreeDbTerm *t) } } -static void free_term(DbTableTree *tb, TreeDbTerm* p) -{ - db_free_term_data(&(p->dbterm)); - erts_db_free(ERTS_ALC_T_DB_TERM, - (DbTable *) tb, - (void *) p, - SIZ_DBTERM(p)*sizeof(Uint)); -} - static int do_free_tree_cont(DbTableTree *tb, int num_left) { TreeDbTerm *root; @@ -2118,17 +2121,6 @@ static int do_free_tree_cont(DbTableTree *tb, int num_left) return 1; } -static TreeDbTerm* get_term(DbTableTree *tb, - TreeDbTerm* old, - Eterm obj) -{ - TreeDbTerm* p = db_get_term((DbTableCommon *) tb, - (old != NULL) ? &(old->dbterm) : NULL, - ((char *) &(old->dbterm)) - ((char *) old), - obj); - return p; -} - /* * Deletion helpers */ @@ -2570,46 +2562,21 @@ static int db_lookup_dbterm_tree(DbTable *tbl, Eterm key, DbUpdateHandle* handle handle->tb = tbl; handle->dbterm = &(*pp)->dbterm; + handle->mustResize = 0; handle->bp = (void**) pp; handle->new_size = (*pp)->dbterm.size; - handle->mustResize = 0; return 1; } static void db_finalize_dbterm_tree(DbUpdateHandle* handle) { if (handle->mustResize) { - ErlOffHeap tmp_offheap; - Eterm* top; - Eterm copy; - DbTerm* newDbTerm; - DbTableTree *tb = &handle->tb->tree; TreeDbTerm* oldp = (TreeDbTerm*) *handle->bp; - TreeDbTerm* newp = erts_db_alloc(ERTS_ALC_T_DB_TERM, - handle->tb, - sizeof(TreeDbTerm)+sizeof(Eterm)*(handle->new_size-1)); - memcpy(newp, oldp, sizeof(TreeDbTerm)-sizeof(DbTerm)); /* copy only tree header */ - *(handle->bp) = newp; - reset_static_stack(tb); - newDbTerm = &newp->dbterm; - - newDbTerm->size = handle->new_size; - tmp_offheap.first = NULL; - tmp_offheap.overhead = 0; - - /* make a flat copy */ - top = DBTERM_BUF(newDbTerm); - copy = copy_struct(make_tuple(handle->dbterm->tpl), - handle->new_size, - &top, &tmp_offheap); - newDbTerm->first_oh = tmp_offheap.first; - DBTERM_SET_TPL(newDbTerm,tuple_val(copy)); - - db_free_term_data(handle->dbterm); - erts_db_free(ERTS_ALC_T_DB_TERM, - handle->tb, - (void *) (((char *) handle->dbterm) - (sizeof(TreeDbTerm) - sizeof(DbTerm))), - sizeof(TreeDbTerm) + sizeof(Eterm)*(handle->dbterm->size-1)); + + db_finalize_resize(handle, offsetof(TreeDbTerm,dbterm)); + reset_static_stack(&handle->tb->tree); + + free_term(&handle->tb->tree, oldp); } #ifdef DEBUG handle->dbterm = 0; @@ -3009,7 +2976,7 @@ static int doit_select(DbTableTree *tb, TreeDbTerm *this, void *ptr, { struct select_context *sc = (struct select_context *) ptr; Eterm ret; - Uint32 dummy; + Eterm* hp; sc->lastobj = this->dbterm.tpl; @@ -3024,24 +2991,9 @@ static int doit_select(DbTableTree *tb, TreeDbTerm *this, void *ptr, this->dbterm.tpl)) > 0))) { return 0; } - ret = db_prog_match(sc->p, sc->mp, - make_tuple(this->dbterm.tpl), - NULL,0, &dummy); + ret = db_prog_match_and_copy(&tb->common,sc->p,sc->mp,sc->all_objects, + &this->dbterm, &hp, 2); if (is_value(ret)) { - Uint sz; - Eterm *hp; - if (sc->all_objects) { - hp = HAlloc(sc->p, this->dbterm.size + 2); - ret = copy_shallow(DBTERM_BUF(&this->dbterm), - this->dbterm.size, - &hp, - &MSO(sc->p)); - } else { - sz = size_object(ret); - hp = HAlloc(sc->p, sz + 2); - ret = copy_struct(ret, sz, - &hp, &MSO(sc->p)); - } sc->accum = CONS(hp, ret, sc->accum); } if (MBUF(sc->p)) { @@ -3062,7 +3014,6 @@ static int doit_select_count(DbTableTree *tb, TreeDbTerm *this, void *ptr, { struct select_count_context *sc = (struct select_count_context *) ptr; Eterm ret; - Uint32 dummy; sc->lastobj = this->dbterm.tpl; @@ -3073,9 +3024,8 @@ static int doit_select_count(DbTableTree *tb, TreeDbTerm *this, void *ptr, this->dbterm.tpl)) > 0)) { return 0; } - ret = db_prog_match(sc->p, sc->mp, - make_tuple(this->dbterm.tpl), - NULL,0, &dummy); + ret = db_prog_match_and_copy(&tb->common, sc->p, sc->mp, 0, + &this->dbterm, NULL, 0); if (ret == am_true) { ++(sc->got); } @@ -3090,7 +3040,7 @@ static int doit_select_chunk(DbTableTree *tb, TreeDbTerm *this, void *ptr, { struct select_context *sc = (struct select_context *) ptr; Eterm ret; - Uint32 dummy; + Eterm* hp; sc->lastobj = this->dbterm.tpl; @@ -3106,25 +3056,10 @@ static int doit_select_chunk(DbTableTree *tb, TreeDbTerm *this, void *ptr, return 0; } - ret = db_prog_match(sc->p, sc->mp, - make_tuple(this->dbterm.tpl), - NULL,0, &dummy); + ret = db_prog_match_and_copy(&tb->common, sc->p, sc->mp, sc->all_objects, + &this->dbterm, &hp, 2); if (is_value(ret)) { - Uint sz; - Eterm *hp; - ++(sc->got); - if (sc->all_objects) { - hp = HAlloc(sc->p, this->dbterm.size + 2); - ret = copy_shallow(DBTERM_BUF(&this->dbterm), - this->dbterm.size, - &hp, - &MSO(sc->p)); - } else { - sz = size_object(ret); - hp = HAlloc(sc->p, sz + 2); - ret = copy_struct(ret, sz, &hp, &MSO(sc->p)); - } sc->accum = CONS(hp, ret, sc->accum); } if (MBUF(sc->p)) { @@ -3146,7 +3081,6 @@ static int doit_select_delete(DbTableTree *tb, TreeDbTerm *this, void *ptr, { struct select_delete_context *sc = (struct select_delete_context *) ptr; Eterm ret; - Uint32 dummy; Eterm key; if (sc->erase_lastterm) @@ -3159,9 +3093,8 @@ static int doit_select_delete(DbTableTree *tb, TreeDbTerm *this, void *ptr, GETKEY_WITH_POS(sc->keypos, this->dbterm.tpl)) > 0) return 0; - ret = db_prog_match(sc->p, sc->mp, - make_tuple(this->dbterm.tpl), - NULL,0, &dummy); + ret = db_prog_match_and_copy(&tb->common, sc->p, sc->mp, 0, + &this->dbterm, NULL, 0); if (ret == am_true) { key = GETKEY(sc->tb, this->dbterm.tpl); linkout_tree(sc->tb, key); diff --git a/erts/emulator/beam/erl_db_util.c b/erts/emulator/beam/erl_db_util.c index 2f34561234..e773361619 100644 --- a/erts/emulator/beam/erl_db_util.c +++ b/erts/emulator/beam/erl_db_util.c @@ -25,7 +25,6 @@ #ifdef HAVE_CONFIG_H # include "config.h" #endif - #include "sys.h" #include "erl_vm.h" #include "global.h" @@ -890,6 +889,8 @@ static Eterm match_spec_test(Process *p, Eterm against, Eterm spec, int trace); static Eterm seq_trace_fake(Process *p, Eterm arg1); +static void db_free_tmp_uncompressed(DbTerm* obj); + /* ** Interface routines. @@ -1604,6 +1605,7 @@ static Eterm dpm_array_to_list(Process *psp, Eterm *arr, int arity) } return ret; } + /* ** Execution of the match program, this is Pam. ** May return THE_NON_VALUE, which is a bailout. @@ -2391,33 +2393,46 @@ void db_do_update_element(DbUpdateHandle* handle, if (is_both_immed(newval,oldval)) { handle->dbterm->tpl[position] = newval; +#ifdef DEBUG_CLONE + if (handle->dbterm->debug_clone) { + handle->dbterm->debug_clone[position] = newval; + } +#endif return; } - else if (!handle->mustResize && is_boxed(newval)) { - newp = boxed_val(newval); - switch (*newp & _TAG_HEADER_MASK) { - case _TAG_HEADER_POS_BIG: - case _TAG_HEADER_NEG_BIG: - case _TAG_HEADER_FLOAT: - case _TAG_HEADER_HEAP_BIN: - newval_sz = header_arity(*newp) + 1; - if (is_boxed(oldval)) { - oldp = boxed_val(oldval); - switch (*oldp & _TAG_HEADER_MASK) { - case _TAG_HEADER_POS_BIG: - case _TAG_HEADER_NEG_BIG: - case _TAG_HEADER_FLOAT: - case _TAG_HEADER_HEAP_BIN: - oldval_sz = header_arity(*oldp) + 1; - if (oldval_sz == newval_sz) { - /* "self contained" terms of same size, do memcpy */ - sys_memcpy(oldp, newp, newval_sz*sizeof(Eterm)); - return; + if (!handle->mustResize) { + if (handle->tb->common.compress) { + handle->dbterm = db_alloc_tmp_uncompressed(&handle->tb->common, + handle->dbterm); + handle->mustResize = 1; + oldval = handle->dbterm->tpl[position]; + } + else if (is_boxed(newval)) { + newp = boxed_val(newval); + switch (*newp & _TAG_HEADER_MASK) { + case _TAG_HEADER_POS_BIG: + case _TAG_HEADER_NEG_BIG: + case _TAG_HEADER_FLOAT: + case _TAG_HEADER_HEAP_BIN: + newval_sz = header_arity(*newp) + 1; + if (is_boxed(oldval)) { + oldp = boxed_val(oldval); + switch (*oldp & _TAG_HEADER_MASK) { + case _TAG_HEADER_POS_BIG: + case _TAG_HEADER_NEG_BIG: + case _TAG_HEADER_FLOAT: + case _TAG_HEADER_HEAP_BIN: + oldval_sz = header_arity(*oldp) + 1; + if (oldval_sz == newval_sz) { + /* "self contained" terms of same size, do memcpy */ + sys_memcpy(oldp, newp, newval_sz*sizeof(Eterm)); + return; + } + goto both_size_set; } - goto both_size_set; } + goto new_size_set; } - goto new_size_set; } } /* Not possible for simple memcpy or dbterm is already non-contiguous, */ @@ -2436,84 +2451,372 @@ both_size_set: handle->mustResize = 1; } +static ERTS_INLINE byte* db_realloc_term(DbTableCommon* tb, void* old, + Uint old_sz, Uint new_sz, Uint offset) +{ + byte* ret; + if (erts_ets_realloc_always_moves) { + ret = erts_db_alloc(ERTS_ALC_T_DB_TERM, (DbTable*)tb, new_sz); + sys_memcpy(ret, old, offset); + erts_db_free(ERTS_ALC_T_DB_TERM, (DbTable*)tb, old, old_sz); + } else { + ret = erts_db_realloc(ERTS_ALC_T_DB_TERM, (DbTable*)tb, + old, old_sz, new_sz); + } + return ret; +} + +/* Allocated size of a compressed dbterm +*/ +static ERTS_INLINE Uint db_alloced_size_comp(DbTerm* obj) +{ + return obj->tpl[arityval(*obj->tpl) + 1]; +} + +void db_free_term(DbTable *tb, void* basep, Uint offset) +{ + DbTerm* db = (DbTerm*) ((byte*)basep + offset); + Uint size; + if (tb->common.compress) { + db_cleanup_offheap_comp(db); + size = db_alloced_size_comp(db); + } + else { + ErlOffHeap tmp_oh; + tmp_oh.first = db->first_oh; + erts_cleanup_offheap(&tmp_oh); + size = offset + offsetof(DbTerm,tpl) + db->size*sizeof(Eterm); + } + erts_db_free(ERTS_ALC_T_DB_TERM, tb, basep, size); +} + +static ERTS_INLINE Uint align_up(Uint value, Uint pow2) +{ + ASSERT((pow2 & (pow2-1)) == 0); + return (value + (pow2-1)) & ~(pow2-1); +} + +/* Compressed size of an uncompressed term +*/ +static Uint db_size_dbterm_comp(DbTableCommon* tb, Eterm obj) +{ + Eterm* tpl = tuple_val(obj); + int i; + Uint size = sizeof(DbTerm) + + arityval(*tpl) * sizeof(Eterm) + + sizeof(Uint); /* "alloc_size" */ + + for (i = arityval(*tpl); i>0; i--) { + if (i != tb->keypos && is_not_immed(tpl[i])) { + size += erts_encode_ext_size_ets(tpl[i]); + } + } + size += size_object(tpl[tb->keypos]) * sizeof(Eterm); + return align_up(size, sizeof(Uint)); +} + +/* Conversion between top tuple element and pointer to compressed data +*/ +static ERTS_INLINE Eterm ext2elem(Eterm* tpl, byte* ext) +{ + return (((Uint)(ext - (byte*)tpl)) << _TAG_PRIMARY_SIZE) | TAG_PRIMARY_HEADER; +} +static ERTS_INLINE byte* elem2ext(Eterm* tpl, Uint ix) +{ + ASSERT(is_header(tpl[ix])); + return (byte*)tpl + (tpl[ix] >> _TAG_PRIMARY_SIZE); +} + +static void* copy_to_comp(DbTableCommon* tb, Eterm obj, DbTerm* dest, + Uint alloc_size) +{ + ErlOffHeap tmp_offheap; + Eterm* src = tuple_val(obj); + Eterm* tpl = dest->tpl; + Eterm key = src[tb->keypos]; + int arity = arityval(src[0]); + union { + Eterm* ep; + byte* cp; + UWord ui; + }top; + int i; + + top.ep = tpl+ 1 + arity + 1; + tpl[0] = src[0]; + tpl[arity + 1] = alloc_size; + + tmp_offheap.first = NULL; + tpl[tb->keypos] = copy_struct(key, size_object(key), &top.ep, &tmp_offheap); + dest->first_oh = tmp_offheap.first; + for (i=1; i<=arity; i++) { + if (i != tb->keypos) { + if (is_immed(src[i])) { + tpl[i] = src[i]; + } + else { + tpl[i] = ext2elem(tpl, top.cp); + top.cp = erts_encode_ext_ets(src[i], top.cp, &dest->first_oh); + } + } + } + +#ifdef DEBUG_CLONE + { + Eterm* dbg_top = erts_alloc(ERTS_ALC_T_DB_TERM, dest->size * sizeof(Eterm)); + dest->debug_clone = dbg_top; + tmp_offheap.first = dest->first_oh; + copy_struct(obj, dest->size, &dbg_top, &tmp_offheap); + dest->first_oh = tmp_offheap.first; + ASSERT(dbg_top == dest->debug_clone + dest->size); + } +#endif + return top.cp; +} /* ** Copy the object into a possibly new DbTerm, ** offset is the offset of the DbTerm from the start -** of the sysAllocaed structure, The possibly realloced and copied +** of the allocated structure, The possibly realloced and copied ** structure is returned. Make sure (((char *) old) - offset) is a ** pointer to a ERTS_ALC_T_DB_TERM allocated data area. */ -void* db_get_term(DbTableCommon *tb, DbTerm* old, Uint offset, Eterm obj) +void* db_store_term(DbTableCommon *tb, DbTerm* old, Uint offset, Eterm obj) { + byte* basep; + DbTerm* newp; + Eterm* top; int size = size_object(obj); - void *structp = ((char*) old) - offset; - DbTerm* p; - Eterm copy; - Eterm *top; ErlOffHeap tmp_offheap; if (old != 0) { + basep = ((byte*) old) - offset; tmp_offheap.first = old->first_oh; - tmp_offheap.overhead = 0; erts_cleanup_offheap(&tmp_offheap); old->first_oh = tmp_offheap.first; if (size == old->size) { - p = old; - } else { + newp = old; + } + else { Uint new_sz = offset + sizeof(DbTerm) + sizeof(Eterm)*(size-1); Uint old_sz = offset + sizeof(DbTerm) + sizeof(Eterm)*(old->size-1); - if (erts_ets_realloc_always_moves) { - void *nstructp = erts_db_alloc(ERTS_ALC_T_DB_TERM, - (DbTable *) tb, - new_sz); - memcpy(nstructp,structp,offset); - erts_db_free(ERTS_ALC_T_DB_TERM, - (DbTable *) tb, - structp, - old_sz); - structp = nstructp; - } else { - structp = erts_db_realloc(ERTS_ALC_T_DB_TERM, - (DbTable *) tb, - structp, - old_sz, - new_sz); - } - p = (DbTerm*) ((void *)(((char *) structp) + offset)); + basep = db_realloc_term(tb, basep, old_sz, new_sz, offset); + newp = (DbTerm*) (basep + offset); } } else { - structp = erts_db_alloc(ERTS_ALC_T_DB_TERM, - (DbTable *) tb, - (offset - + sizeof(DbTerm) - + sizeof(Eterm)*(size-1))); - p = (DbTerm*) ((void *)(((char *) structp) + offset)); - } - p->size = size; + basep = erts_db_alloc(ERTS_ALC_T_DB_TERM, (DbTable *)tb, + (offset + sizeof(DbTerm) + sizeof(Eterm)*(size-1))); + newp = (DbTerm*) (basep + offset); + } + newp->size = size; + top = newp->tpl; tmp_offheap.first = NULL; - tmp_offheap.overhead = 0; + copy_struct(obj, size, &top, &tmp_offheap); + newp->first_oh = tmp_offheap.first; +#ifdef DEBUG_CLONE + newp->debug_clone = NULL; +#endif + return basep; +} + + +void* db_store_term_comp(DbTableCommon *tb, DbTerm* old, Uint offset, Eterm obj) +{ + Uint new_sz = offset + db_size_dbterm_comp(tb, obj); + byte* basep; + DbTerm* newp; + byte* top; + + ASSERT(tb->compress); + if (old != 0) { + Uint old_sz = db_alloced_size_comp(old); + db_cleanup_offheap_comp(old); + + basep = ((byte*) old) - offset; + if (new_sz == old_sz) { + newp = old; + } + else { + basep = db_realloc_term(tb, basep, old_sz, new_sz, offset); + newp = (DbTerm*) (basep + offset); + } + } + else { + basep = erts_db_alloc(ERTS_ALC_T_DB_TERM, (DbTable*)tb, new_sz); + newp = (DbTerm*) (basep + offset); + } + + newp->size = size_object(obj); + top = copy_to_comp(tb, obj, newp, new_sz); + ASSERT(top <= basep + new_sz); + + // SVERK: realloc? + + return basep; +} + + +void db_finalize_resize(DbUpdateHandle* handle, Uint offset) +{ + DbTable* tbl = handle->tb; + DbTerm* newDbTerm; + Uint alloc_sz = offset + + (tbl->common.compress ? + db_size_dbterm_comp(&tbl->common, make_tuple(handle->dbterm->tpl)) : + sizeof(DbTerm)+sizeof(Eterm)*(handle->new_size-1)); + byte* newp = erts_db_alloc(ERTS_ALC_T_DB_TERM, tbl, alloc_sz); + byte* oldp = *(handle->bp); + + sys_memcpy(newp, oldp, offset); /* copy only hash/tree header */ + *(handle->bp) = newp; + newDbTerm = (DbTerm*) (newp + offset); + newDbTerm->size = handle->new_size; + + /* make a flat copy */ + + if (tbl->common.compress) { + copy_to_comp(&tbl->common, make_tuple(handle->dbterm->tpl), + newDbTerm, alloc_sz); + db_free_tmp_uncompressed(handle->dbterm); + } + else { + Eterm* top; + ErlOffHeap tmp_offheap; + tmp_offheap.first = NULL; + top = newDbTerm->tpl; + copy_struct(make_tuple(handle->dbterm->tpl), handle->new_size, + &top, &tmp_offheap); + newDbTerm->first_oh = tmp_offheap.first; +#ifdef DEBUG_CLONE + newDbTerm->debug_clone = NULL; +#endif + ASSERT((byte*)top <= (newp + alloc_sz)); + } +} - top = DBTERM_BUF(p); - copy = copy_struct(obj, size, &top, &tmp_offheap); - p->first_oh = tmp_offheap.first; - DBTERM_SET_TPL(p,tuple_val(copy)); +Eterm db_copy_from_comp(DbTableCommon* tb, DbTerm* bp, Eterm** hpp, + ErlOffHeap* off_heap) +{ + Eterm* hp = *hpp; + int i, arity = arityval(bp->tpl[0]); + + hp[0] = bp->tpl[0]; + *hpp += arity + 1; + + hp[tb->keypos] = copy_struct(bp->tpl[tb->keypos], + size_object(bp->tpl[tb->keypos]), + hpp, off_heap); + for (i=arity; i>0; i--) { + if (i != tb->keypos) { + if (is_immed(bp->tpl[i])) { + hp[i] = bp->tpl[i]; + } + else { + hp[i] = erts_decode_ext_ets(hpp, off_heap, + elem2ext(bp->tpl, i)); + } + } + } + ASSERT((*hpp - hp) <= bp->size); +#ifdef DEBUG_CLONE + ASSERT(eq(make_tuple(hp),make_tuple(bp->debug_clone))); +#endif + return make_tuple(hp); +} - return structp; +Eterm db_copy_element_from_ets(DbTableCommon* tb, Process* p, + DbTerm* obj, Uint pos, + Eterm** hpp, Uint extra) +{ + if (is_immed(obj->tpl[pos])) { + *hpp = HAlloc(p, extra); + return obj->tpl[pos]; + } + if (tb->compress && pos != tb->keypos) { + byte* ext = elem2ext(obj->tpl, pos); + Sint sz = erts_decode_ext_size_ets(ext, db_alloced_size_comp(obj)) + extra; + Eterm* hp = HAlloc(p, sz); + Eterm* endp = hp + sz; + Eterm copy = erts_decode_ext_ets(&hp, &MSO(p), ext); + *hpp = hp; + hp += extra; + HRelease(p, endp, hp); +#ifdef DEBUG_CLONE + ASSERT(eq(copy,obj->debug_clone[pos])); +#endif + return copy; + } + else { + Uint sz = size_object(obj->tpl[pos]); + *hpp = HAlloc(p, sz + extra); + return copy_struct(obj->tpl[pos], sz, hpp, &MSO(p)); + } } -void db_free_term_data(DbTerm* p) +/* Our own "cleanup_offheap" + * as refc-binaries may be unaligned in compressed terms +*/ +void db_cleanup_offheap_comp(DbTerm* obj) +{ + union erl_off_heap_ptr u; + ProcBin tmp; + + for (u.hdr = obj->first_oh; u.hdr; u.hdr = u.hdr->next) { + if ((UWord)u.voidp % sizeof(UWord) != 0) { /* unaligned ptr */ + sys_memcpy(&tmp, u.voidp, sizeof(tmp)); + /* Warning, must pass (void*)-variable to memcpy. Otherwise it will + cause Bus error on Sparc due to false compile time assumptions + about word aligned memory (type cast is not enough) */ + u.pb = &tmp; + } + switch (thing_subtag(u.hdr->thing_word)) { + case REFC_BINARY_SUBTAG: + if (erts_refc_dectest(&u.pb->val->refc, 0) == 0) { + erts_bin_free(u.pb->val); + } + break; + case FUN_SUBTAG: + ASSERT(u.pb != &tmp); + if (erts_refc_dectest(&u.fun->fe->refc, 0) == 0) { + erts_erase_fun_entry(u.fun->fe); + } + break; + default: + ASSERT(is_external_header(u.hdr->thing_word)); + ASSERT(u.pb != &tmp); + erts_deref_node_entry(u.ext->node); + break; + } + } +#ifdef DEBUG_CLONE + if (obj->debug_clone != NULL) { + erts_free(ERTS_ALC_T_DB_TERM, obj->debug_clone); + obj->debug_clone = NULL; + } +#endif +} + +int db_eq_comp(DbTableCommon* tb, Eterm a, DbTerm* b) { ErlOffHeap tmp_offheap; - tmp_offheap.first = p->first_oh; - tmp_offheap.overhead = 0; + Eterm* allocp; + Eterm* hp; + Eterm tmp_b; + int is_eq; + + ASSERT(tb->compress); + hp = allocp = erts_alloc(ERTS_ALC_T_TMP, b->size*sizeof(Eterm)); + tmp_offheap.first = NULL; + tmp_b = db_copy_from_comp(tb, b, &hp, &tmp_offheap); + is_eq = eq(a,tmp_b); erts_cleanup_offheap(&tmp_offheap); + erts_free(ERTS_ALC_T_TMP, allocp); + return is_eq; } - /* ** Check if object represents a "match" variable ** i.e and atom $N where N is an integer @@ -4404,7 +4707,65 @@ static Eterm seq_trace_fake(Process *p, Eterm arg1) } return result; } - + +DbTerm* db_alloc_tmp_uncompressed(DbTableCommon* tb, DbTerm* org) +{ + ErlOffHeap tmp_offheap; + DbTerm* res = erts_alloc(ERTS_ALC_T_TMP, + sizeof(DbTerm) + org->size*sizeof(Eterm)); + Eterm* hp = res->tpl; + tmp_offheap.first = NULL; + db_copy_from_comp(tb, org, &hp, &tmp_offheap); + res->first_oh = tmp_offheap.first; + res->size = org->size; +#ifdef DEBUG_CLONE + res->debug_clone = NULL; +#endif + return res; +} + +void db_free_tmp_uncompressed(DbTerm* obj) +{ + ErlOffHeap off_heap; + off_heap.first = obj->first_oh; + erts_cleanup_offheap(&off_heap); +#ifdef DEBUG_CLONE + ASSERT(obj->debug_clone == NULL); +#endif + erts_free(ERTS_ALC_T_TMP, obj); +} + +Eterm db_prog_match_and_copy(DbTableCommon* tb, Process* c_p, Binary* bprog, + int all, DbTerm* obj, Eterm** hpp, Uint extra) +{ + Uint32 dummy; + Eterm res; + + if (tb->compress) { + obj = db_alloc_tmp_uncompressed(tb, obj); + } + + res = db_prog_match(c_p, bprog, make_tuple(obj->tpl), NULL, 0, &dummy); + + if (is_value(res) && hpp!=NULL) { + if (all) { + *hpp = HAlloc(c_p, obj->size + extra); + res = copy_shallow(obj->tpl, obj->size, hpp, &MSO(c_p)); + } + else { + Uint sz = size_object(res); + *hpp = HAlloc(c_p, sz + extra); + res = copy_struct(res, sz, hpp, &MSO(c_p)); + } + } + + if (tb->compress) { + db_free_tmp_uncompressed(obj); + } + return res; +} + + #ifdef DMC_DEBUG /* ** Disassemble match program diff --git a/erts/emulator/beam/erl_db_util.h b/erts/emulator/beam/erl_db_util.h index 0f333e8b34..10ba755e80 100644 --- a/erts/emulator/beam/erl_db_util.h +++ b/erts/emulator/beam/erl_db_util.h @@ -52,22 +52,27 @@ is broken.*/ #define DB_ERROR_UNSPEC -10 /* Unspecified error */ +/*#define DEBUG_CLONE*/ /* * A datatype for a database entry stored out of a process heap */ typedef struct db_term { struct erl_off_heap_header* first_oh; /* Off heap data for term. */ - Uint size; /* Size of term in "words" */ - Eterm tpl[1]; /* Untagged "constant pointer" to top tuple */ - /* (assumed to be first in buffer) */ + Uint size; /* Heap size of term in "words" */ +#ifdef DEBUG_CLONE + Eterm* debug_clone; /* An uncompressed copy */ +#endif + Eterm tpl[1]; /* Term data. Top tuple always first */ + + /* Compression: is_immed and key element are uncompressed. + Compressed elements are stored in external format after each other + last in dbterm. The top tuple elements contains byte offsets, to + the start of the data, tagged as headers. + The allocated size of the dbterm in bytes is stored at tpl[arity+1]. + */ } DbTerm; -/* "Assign" a value to DbTerm.tpl */ -#define DBTERM_SET_TPL(dbtermPtr,tplPtr) ASSERT((tplPtr)==(dbtermPtr->tpl)) -/* Get start of term buffer */ -#define DBTERM_BUF(dbtermPtr) ((dbtermPtr)->tpl) - union db_table; typedef union db_table DbTable; @@ -186,6 +191,12 @@ typedef struct db_table_method } DbTableMethod; +typedef struct db_fixation { + Eterm pid; + Uint counter; + struct db_fixation *next; +} DbFixation; + /* * This structure contains data for all different types of database * tables. Note that these fields must match the same fields @@ -194,13 +205,6 @@ typedef struct db_table_method * operations may be the same on different types of tables. */ -typedef struct db_fixation { - Eterm pid; - Uint counter; - struct db_fixation *next; -} DbFixation; - - typedef struct db_table_common { erts_refc_t ref; erts_refc_t fixref; /* fixation counter */ @@ -226,6 +230,7 @@ typedef struct db_table_common { Uint32 status; /* bit masks defined below */ int slot; /* slot index in meta_main_tab */ int keypos; /* defaults to 1 */ + int compress; } DbTableCommon; /* These are status bit patterns */ @@ -252,6 +257,54 @@ typedef struct db_table_common { #define IS_FIXED(T) (NFIXED(T) != 0) Eterm erts_ets_copy_object(Eterm, Process*); +Eterm db_copy_from_comp(DbTableCommon* tb, DbTerm* bp, Eterm** hpp, + ErlOffHeap* off_heap); +int db_eq_comp(DbTableCommon* tb, Eterm a, DbTerm* b); +DbTerm* db_alloc_tmp_uncompressed(DbTableCommon* tb, DbTerm* org); + +ERTS_GLB_INLINE Eterm db_copy_object_from_ets(DbTableCommon* tb, DbTerm* bp, + Eterm** hpp, ErlOffHeap* off_heap); +ERTS_GLB_INLINE int db_eq(DbTableCommon* tb, Eterm a, DbTerm* b); +ERTS_GLB_INLINE Eterm db_do_read_element(DbUpdateHandle* handle, Sint position); + +#if ERTS_GLB_INLINE_INCL_FUNC_DEF +ERTS_GLB_INLINE Eterm db_copy_object_from_ets(DbTableCommon* tb, DbTerm* bp, + Eterm** hpp, ErlOffHeap* off_heap) +{ + if (tb->compress) { + return db_copy_from_comp(tb, bp, hpp, off_heap); + } + else { + return copy_shallow(bp->tpl, bp->size, hpp, off_heap); + } +} + +ERTS_GLB_INLINE int db_eq(DbTableCommon* tb, Eterm a, DbTerm* b) +{ + if (!tb->compress) { + return eq(a, make_tuple(b->tpl)); + } + else { + return db_eq_comp(tb, a, b); + } +} + +/* Must be called to read elements after db_lookup_dbterm. +** Will decompress if needed. */ +ERTS_GLB_INLINE Eterm db_do_read_element(DbUpdateHandle* handle, Sint position) +{ + Eterm elem = handle->dbterm->tpl[position]; + if (!is_header(elem)) { + return elem; + } + ASSERT(((DbTableCommon*)handle->tb)->compress); + ASSERT(!handle->mustResize); + handle->dbterm = db_alloc_tmp_uncompressed((DbTableCommon*)handle->tb, handle->dbterm); + handle->mustResize = 1; + return handle->dbterm->tpl[position]; +} + +#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */ /* optimised version of copy_object (normal case? atomic object) */ #define COPY_OBJECT(obj, p, objp) \ @@ -277,14 +330,19 @@ Eterm db_set_trace_control_word_1(Process *p, Eterm val); void db_initialize_util(void); Eterm db_getkey(int keypos, Eterm obj); -void db_free_term_data(DbTerm* p); -void* db_get_term(DbTableCommon *tb, DbTerm* old, Uint offset, Eterm obj); +void db_cleanup_offheap_comp(DbTerm* p); +void db_free_term(DbTable *tb, void* basep, Uint offset); +void* db_store_term(DbTableCommon *tb, DbTerm* old, Uint offset, Eterm obj); +void* db_store_term_comp(DbTableCommon *tb, DbTerm* old, Uint offset, Eterm obj); +Eterm db_copy_element_from_ets(DbTableCommon* tb, Process* p, DbTerm* obj, + Uint pos, Eterm** hpp, Uint extra); int db_has_variable(Eterm obj); int db_is_variable(Eterm obj); +Eterm db_do_read_element(DbUpdateHandle* handle, Sint position); void db_do_update_element(DbUpdateHandle* handle, Sint position, Eterm newval); -void db_finalize_update_element(DbUpdateHandle* handle); +void db_finalize_resize(DbUpdateHandle* handle, Uint offset); Eterm db_add_counter(Eterm** hpp, Eterm counter, Eterm incr); Eterm db_match_set_lint(Process *p, Eterm matchexpr, Uint flags); Binary *db_match_set_compile(Process *p, Eterm matchexpr, @@ -366,6 +424,8 @@ Binary *db_match_compile(Eterm *matchexpr, Eterm *guards, Eterm *body, int num_matches, Uint flags, DMCErrInfo *err_info); +Eterm db_prog_match_and_copy(DbTableCommon* tb, Process* c_p, Binary* bprog, + int all, DbTerm* obj, Eterm** hpp, Uint extra); /* Returns newly allocated MatchProg binary with refc == 0*/ Eterm db_prog_match(Process *p, Binary *prog, Eterm term, Eterm *termp, int arity, Uint32 *return_flags /* Zeroed on enter */); diff --git a/erts/emulator/beam/erl_drv_thread.c b/erts/emulator/beam/erl_drv_thread.c index d42820ddf3..17b08a71d4 100644 --- a/erts/emulator/beam/erl_drv_thread.c +++ b/erts/emulator/beam/erl_drv_thread.c @@ -528,7 +528,7 @@ erl_drv_tsd_get(ErlDrvTSDKey key) if (!dtid) return NULL; #endif - if (ERL_DRV_TSD_LEN__ < key) + if (ERL_DRV_TSD_LEN__ <= key) return NULL; return ERL_DRV_TSD__[key]; } diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index 4ae656a3ad..a9f4f041ac 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -41,6 +41,7 @@ #include "erl_printf_term.h" #include "erl_misc_utils.h" #include "packet_parser.h" +#include "erl_cpu_topology.h" #ifdef HIPE #include "hipe_mode_switch.h" /* for hipe_mode_switch_init() */ @@ -63,6 +64,8 @@ extern void ConNormalExit(void); extern void ConWaitForExit(void); #endif +static void erl_init(int ncpu); + #define ERTS_MIN_COMPAT_REL 7 #ifdef ERTS_SMP @@ -76,9 +79,6 @@ int erts_initialized = 0; static erts_tid_t main_thread; #endif -erts_cpu_info_t *erts_cpuinfo; - -int erts_reader_groups; int erts_use_sender_punish; /* @@ -111,7 +111,6 @@ int erts_compat_rel; static int use_multi_run_queue; static int no_schedulers; static int no_schedulers_online; -static int max_reader_groups; #ifdef DEBUG Uint32 verbose; /* See erl_debug.h for information about verbose */ @@ -230,18 +229,18 @@ void erl_error(char *fmt, va_list args) erts_vfprintf(stderr, fmt, args); } -static void early_init(int *argc, char **argv); +static int early_init(int *argc, char **argv); void erts_short_init(void) { - early_init(NULL, NULL); - erl_init(); + int ncpu = early_init(NULL, NULL); + erl_init(ncpu); erts_initialized = 1; } -void -erl_init(void) +static void +erl_init(int ncpu) { init_benchmarking(); @@ -252,11 +251,11 @@ erl_init(void) erts_init_monitors(); erts_init_gc(); init_time(); - erts_init_process(); + erts_init_process(ncpu); erts_init_scheduling(use_multi_run_queue, no_schedulers, no_schedulers_online); - + erts_init_cpu_topology(); /* Must be after init_scheduling */ H_MIN_SIZE = erts_next_heap_size(H_MIN_SIZE, 0); BIN_VH_MIN_SIZE = erts_next_heap_size(BIN_VH_MIN_SIZE, 0); @@ -535,7 +534,8 @@ void erts_usage(void) erts_fprintf(stderr, "-W<i|w> set error logger warnings mapping,\n"); erts_fprintf(stderr, " see error_logger documentation for details\n"); - + erts_fprintf(stderr, "-zdbbl size set the distribution buffer busy limit in kilobytes\n"); + erts_fprintf(stderr, " valid range is [1-%d]\n", INT_MAX/1024); erts_fprintf(stderr, "\n"); erts_fprintf(stderr, "Note that if the emulator is started with erlexec (typically\n"); erts_fprintf(stderr, "from the erl script), these flags should be specified with +.\n"); @@ -587,7 +587,7 @@ static void ethr_ll_free(void *ptr) #endif -static void +static int early_init(int *argc, char **argv) /* * Only put things here which are * really important initialize @@ -600,6 +600,10 @@ early_init(int *argc, char **argv) /* int ncpuavail; int schdlrs; int schdlrs_onln; + int max_main_threads; + int max_reader_groups; + int reader_groups; + use_multi_run_queue = 1; erts_printf_eterm_func = erts_printf_term; erts_disable_tolerant_timeofday = 0; @@ -615,13 +619,11 @@ early_init(int *argc, char **argv) /* erts_use_sender_punish = 1; - erts_cpuinfo = erts_cpu_info_create(); - -#ifdef ERTS_SMP - ncpu = erts_get_cpu_configured(erts_cpuinfo); - ncpuonln = erts_get_cpu_online(erts_cpuinfo); - ncpuavail = erts_get_cpu_available(erts_cpuinfo); -#else + erts_pre_early_init_cpu_topology(&max_reader_groups, + &ncpu, + &ncpuonln, + &ncpuavail); +#ifndef ERTS_SMP ncpu = 1; ncpuonln = 1; ncpuavail = 1; @@ -664,15 +666,9 @@ early_init(int *argc, char **argv) /* ? ncpuavail : (ncpuonln > 0 ? ncpuonln : no_schedulers)); -#ifdef ERTS_SMP - erts_max_main_threads = no_schedulers_online; -#endif - schdlrs = no_schedulers; schdlrs_onln = no_schedulers_online; - max_reader_groups = ERTS_MAX_READER_GROUPS; - if (argc && argv) { int i = 1; while (i < *argc) { @@ -768,9 +764,13 @@ early_init(int *argc, char **argv) /* erts_alloc_init(argc, argv, &alloc_opts); /* Handles (and removes) -M flags. */ - - erts_early_init_scheduling(); /* Require allocators */ - erts_init_utils(); /* Require allocators */ + /* Require allocators */ + erts_early_init_scheduling(); + erts_init_utils(); + erts_early_init_cpu_topology(no_schedulers, + &max_main_threads, + max_reader_groups, + &reader_groups); #ifdef USE_THREADS { @@ -784,24 +784,13 @@ early_init(int *argc, char **argv) /* elid.mem.ll.alloc = ethr_ll_alloc; elid.mem.ll.realloc = ethr_ll_realloc; elid.mem.ll.free = ethr_ll_free; - -#ifdef ERTS_SMP - elid.main_threads = erts_max_main_threads; -#else - elid.main_threads = 1; -#endif - elid.reader_groups = (elid.main_threads > 1 - ? elid.main_threads - : 0); - if (max_reader_groups <= 1) - elid.reader_groups = 0; - if (elid.reader_groups > max_reader_groups) - elid.reader_groups = max_reader_groups; - erts_reader_groups = elid.reader_groups; + elid.main_threads = max_main_threads; + elid.reader_groups = reader_groups; erts_thr_late_init(&elid); } #endif + #ifdef ERTS_ENABLE_LOCK_CHECK erts_lc_late_init(); #endif @@ -818,7 +807,10 @@ early_init(int *argc, char **argv) /* erl_sys_args(argc, argv); erts_ets_realloc_always_moves = 0; + erts_ets_always_compress = 0; + erts_dist_buf_busy_limit = ERTS_DE_BUSY_LIMIT; + return ncpu; } #ifndef ERTS_SMP @@ -852,8 +844,7 @@ erl_start(int argc, char **argv) char envbuf[21]; /* enough for any 64-bit integer */ size_t envbufsz; int async_max_threads = erts_async_max_threads; - - early_init(&argc, argv); + int ncpu = early_init(&argc, argv); envbufsz = sizeof(envbuf); if (erts_sys_getenv(ERL_MAX_ETS_TABLES_ENV, envbuf, &envbufsz) == 0) @@ -1038,15 +1029,20 @@ erl_start(int argc, char **argv) break; case 'e': - /* set maximum number of ets tables */ - arg = get_arg(argv[i]+2, argv[i+1], &i); - if (( user_requested_db_max_tabs = atoi(arg) ) < 0) { - erts_fprintf(stderr, "bad maximum number of ets tables %s\n", arg); - erts_usage(); + if (sys_strcmp("c", argv[i]+2) == 0) { + erts_ets_always_compress = 1; + } + else { + /* set maximum number of ets tables */ + arg = get_arg(argv[i]+2, argv[i+1], &i); + if (( user_requested_db_max_tabs = atoi(arg) ) < 0) { + erts_fprintf(stderr, "bad maximum number of ets tables %s\n", arg); + erts_usage(); + } + VERBOSE(DEBUG_SYSTEM, + ("using maximum number of ets tables %d\n", + user_requested_db_max_tabs)); } - VERBOSE(DEBUG_SYSTEM, - ("using maximum number of ets tables %d\n", - user_requested_db_max_tabs)); break; case 'i': @@ -1110,7 +1106,7 @@ erl_start(int argc, char **argv) char *sub_param = argv[i]+2; if (has_prefix("bt", sub_param)) { arg = get_arg(sub_param+2, argv[i+1], &i); - res = erts_init_scheduler_bind_type(arg); + res = erts_init_scheduler_bind_type_string(arg); if (res != ERTS_INIT_SCHED_BIND_TYPE_SUCCESS) { switch (res) { case ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED: @@ -1135,7 +1131,7 @@ erl_start(int argc, char **argv) } else if (has_prefix("ct", sub_param)) { arg = get_arg(sub_param+2, argv[i+1], &i); - res = erts_init_cpu_topology(arg); + res = erts_init_cpu_topology_string(arg); if (res != ERTS_INIT_CPU_TOPOLOGY_OK) { switch (res) { case ERTS_INIT_CPU_TOPOLOGY_INVALID_ID: @@ -1346,6 +1342,26 @@ erl_start(int argc, char **argv) } break; + case 'z': { + char *sub_param = argv[i]+2; + int new_limit; + + if (has_prefix("dbbl", sub_param)) { + arg = get_arg(sub_param+4, argv[i+1], &i); + new_limit = atoi(arg); + if (new_limit < 1 || INT_MAX/1024 < new_limit) { + erts_fprintf(stderr, "Invalid dbbl limit: %d\n", new_limit); + erts_usage(); + } else { + erts_dist_buf_busy_limit = new_limit*1024; + } + } else { + erts_fprintf(stderr, "bad -z option %s\n", argv[i]); + erts_usage(); + } + break; + } + default: erts_fprintf(stderr, "%s unknown flag %s\n", argv[0], argv[i]); erts_usage(); @@ -1386,7 +1402,7 @@ erl_start(int argc, char **argv) boot_argc = argc - i; /* Number of arguments to init */ boot_argv = &argv[i]; - erl_init(); + erl_init(ncpu); init_shared_memory(boot_argc, boot_argv); load_preloaded(); diff --git a/erts/emulator/beam/erl_lock_check.c b/erts/emulator/beam/erl_lock_check.c index d6138fa4e4..04c7dbd2ec 100644 --- a/erts/emulator/beam/erl_lock_check.c +++ b/erts/emulator/beam/erl_lock_check.c @@ -128,8 +128,8 @@ static erts_lc_lock_order_t erts_lock_order[] = { { "removed_fd_pre_alloc_lock", NULL }, { "state_prealloc", NULL }, { "schdlr_sspnd", NULL }, - { "cpu_bind", NULL }, { "run_queue", "address" }, + { "cpu_info", NULL }, { "pollset", "address" }, #ifdef __WIN32__ { "pollwaiter", "address" }, diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c index 1dd9c8bd4a..a680097c2d 100644 --- a/erts/emulator/beam/erl_nif.c +++ b/erts/emulator/beam/erl_nif.c @@ -99,6 +99,16 @@ static Eterm* alloc_heap_heavy(ErlNifEnv* env, unsigned need, Eterm* hp) return hp; } +#if SIZEOF_LONG != ERTS_SIZEOF_ETERM +static ERTS_INLINE void ensure_heap(ErlNifEnv* env, unsigned may_need) +{ + if (env->hp + may_need > env->hp_end) { + alloc_heap_heavy(env, may_need, env->hp); + env->hp -= may_need; + } +} +#endif + void erts_pre_nif(ErlNifEnv* env, Process* p, struct erl_module_nif* mod_nif) { env->mod_nif = mod_nif; @@ -730,9 +740,8 @@ int enif_get_long(ErlNifEnv* env, Eterm term, long* ip) { #if SIZEOF_LONG == ERTS_SIZEOF_ETERM return term_to_Sint(term, ip); -#elif SIZEOF_INT == ERTS_SIZEOF_ETERM - Sint i; - return term_to_Sint(term, &i) ? (*ip = (long) i, 1) : 0; +#elif SIZEOF_LONG == 8 + return term_to_Sint64(term, ip); #else # error Unknown long word size #endif @@ -742,9 +751,8 @@ int enif_get_ulong(ErlNifEnv* env, Eterm term, unsigned long* ip) { #if SIZEOF_LONG == ERTS_SIZEOF_ETERM return term_to_Uint(term, ip); -#elif SIZEOF_INT == ERTS_SIZEOF_ETERM - Uint u; - return term_to_Uint(term, &u) ? (*ip = (unsigned long) u, 1) : 0; +#elif SIZEOF_LONG == 8 + return term_to_Uint64(term, ip); #else # error Unknown long word size #endif @@ -821,12 +829,22 @@ ERL_NIF_TERM enif_make_uint(ErlNifEnv* env, unsigned i) ERL_NIF_TERM enif_make_long(ErlNifEnv* env, long i) { +#if SIZEOF_LONG == ERTS_SIZEOF_ETERM return IS_SSMALL(i) ? make_small(i) : small_to_big(i, alloc_heap(env,2)); +#elif SIZEOF_LONG == 8 + ensure_heap(env,3); + return erts_sint64_to_big(i, &env->hp); +#endif } ERL_NIF_TERM enif_make_ulong(ErlNifEnv* env, unsigned long i) { +#if SIZEOF_LONG == ERTS_SIZEOF_ETERM return IS_USMALL(0,i) ? make_small(i) : uint_to_big(i,alloc_heap(env,2)); +#elif SIZEOF_LONG == 8 + ensure_heap(env,3); + return erts_uint64_to_big(i, &env->hp); +#endif } #if HAVE_INT64 && SIZEOF_LONG != 8 diff --git a/erts/emulator/beam/erl_node_tables.c b/erts/emulator/beam/erl_node_tables.c index d0b08bf72e..8cdda395df 100644 --- a/erts/emulator/beam/erl_node_tables.c +++ b/erts/emulator/beam/erl_node_tables.c @@ -107,7 +107,7 @@ dist_table_alloc(void *dep_tmpl) dep->nlinks = NULL; dep->monitors = NULL; - erts_smp_spinlock_init_x(&dep->qlock, "dist_entry_out_queue", chnl_nr); + erts_smp_mtx_init_x(&dep->qlock, "dist_entry_out_queue", chnl_nr); dep->qflgs = 0; dep->qsize = 0; dep->out_queue.first = NULL; @@ -172,7 +172,7 @@ dist_table_free(void *vdep) ASSERT(!dep->cache); erts_smp_rwmtx_destroy(&dep->rwmtx); erts_smp_mtx_destroy(&dep->lnk_mtx); - erts_smp_spinlock_destroy(&dep->qlock); + erts_smp_mtx_destroy(&dep->qlock); #ifdef DEBUG sys_memset(vdep, 0x77, sizeof(DistEntry)); @@ -755,9 +755,9 @@ void erts_init_node_tables(void) erts_this_dist_entry->nlinks = NULL; erts_this_dist_entry->monitors = NULL; - erts_smp_spinlock_init_x(&erts_this_dist_entry->qlock, - "dist_entry_out_queue", - make_small(ERST_INTERNAL_CHANNEL_NO)); + erts_smp_mtx_init_x(&erts_this_dist_entry->qlock, + "dist_entry_out_queue", + make_small(ERST_INTERNAL_CHANNEL_NO)); erts_this_dist_entry->qflgs = 0; erts_this_dist_entry->qsize = 0; erts_this_dist_entry->out_queue.first = NULL; diff --git a/erts/emulator/beam/erl_node_tables.h b/erts/emulator/beam/erl_node_tables.h index eb759b87e9..b0a63ae035 100644 --- a/erts/emulator/beam/erl_node_tables.h +++ b/erts/emulator/beam/erl_node_tables.h @@ -131,7 +131,7 @@ typedef struct dist_entry_ { ErtsLink *nlinks; /* Link tree with subtrees */ ErtsMonitor *monitors; /* Monitor tree */ - erts_smp_spinlock_t qlock; /* Protects qflgs and out_queue */ + erts_smp_mtx_t qlock; /* Protects qflgs and out_queue */ Uint32 qflgs; Sint qsize; ErtsDistOutputQueue out_queue; diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c index b47ce97c46..f252c2cbe2 100644 --- a/erts/emulator/beam/erl_process.c +++ b/erts/emulator/beam/erl_process.c @@ -24,7 +24,6 @@ #endif #include <stddef.h> /* offsetof() */ -#include <ctype.h> #include "sys.h" #include "erl_vm.h" #include "global.h" @@ -39,6 +38,7 @@ #include "erl_threads.h" #include "erl_binary.h" #include "beam_bp.h" +#include "erl_cpu_topology.h" #define ERTS_RUNQ_CHECK_BALANCE_REDS_PER_SCHED (2000*CONTEXT_REDS) #define ERTS_RUNQ_CALL_CHECK_BALANCE_REDS \ @@ -63,8 +63,6 @@ #define ERTS_WAKEUP_OTHER_DEC 10 #define ERTS_WAKEUP_OTHER_FIXED_INC (CONTEXT_REDS/10) -#define ERTS_MAX_CPU_TOPOLOGY_ID ((int) 0xffff) - #if 0 || defined(DEBUG) #define ERTS_FAKE_SCHED_BIND_PRINT_SORTED_CPU_DATA #endif @@ -119,10 +117,6 @@ Uint erts_process_tab_index_mask; static int wakeup_other_limit; -#ifdef ERTS_SMP -Uint erts_max_main_threads; -#endif - int erts_sched_thread_suggested_stack_size = -1; #ifdef ERTS_ENABLE_LOCK_CHECK @@ -195,48 +189,6 @@ do { \ #endif -/* - * Cpu topology hierarchy. - */ -#define ERTS_TOPOLOGY_NODE 0 -#define ERTS_TOPOLOGY_PROCESSOR 1 -#define ERTS_TOPOLOGY_PROCESSOR_NODE 2 -#define ERTS_TOPOLOGY_CORE 3 -#define ERTS_TOPOLOGY_THREAD 4 -#define ERTS_TOPOLOGY_LOGICAL 5 - -#define ERTS_TOPOLOGY_MAX_DEPTH 6 - -typedef struct { - int bind_id; - int bound_id; -} ErtsCpuBindData; - -static ErtsCpuBindData *scheduler2cpu_map; -erts_smp_rwmtx_t erts_cpu_bind_rwmtx; - -typedef enum { - ERTS_CPU_BIND_UNDEFINED, - ERTS_CPU_BIND_SPREAD, - ERTS_CPU_BIND_PROCESSOR_SPREAD, - ERTS_CPU_BIND_THREAD_SPREAD, - ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD, - ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD, - ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD, - ERTS_CPU_BIND_NO_SPREAD, - ERTS_CPU_BIND_NONE -} ErtsCpuBindOrder; - -#define ERTS_CPU_BIND_DEFAULT_BIND \ - ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD - -ErtsCpuBindOrder cpu_bind_order; - -static erts_cpu_topology_t *user_cpudata; -static int user_cpudata_size; -static erts_cpu_topology_t *system_cpudata; -static int system_cpudata_size; - erts_sched_stat_t erts_sched_stat; ErtsRunQueue *erts_common_run_queue; @@ -259,11 +211,6 @@ ErtsSchedulerData *erts_scheduler_data; ErtsAlignedRunQueue *erts_aligned_run_queues; Uint erts_no_run_queues; -typedef union { - ErtsSchedulerData esd; - char align[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsSchedulerData))]; -} ErtsAlignedSchedulerData; - ErtsAlignedSchedulerData *erts_aligned_scheduler_data; #ifdef ERTS_SMP @@ -334,12 +281,6 @@ ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(proclist, 200, ERTS_ALC_T_PROC_LIST) -#define ERTS_RUNQ_IX(IX) \ - (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_run_queues), \ - &erts_aligned_run_queues[(IX)].runq) -#define ERTS_SCHEDULER_IX(IX) \ - (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_schedulers), \ - &erts_aligned_scheduler_data[(IX)].esd) #define ERTS_SCHED_SLEEP_INFO_IX(IX) \ (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_schedulers), \ &aligned_sched_sleep_info[(IX)].ssi) @@ -398,22 +339,8 @@ static int stack_element_dump(int to, void *to_arg, Process* p, Eterm* sp, #ifdef ERTS_SMP static void handle_pending_exiters(ErtsProcList *); -static void cpu_bind_order_sort(erts_cpu_topology_t *cpudata, - int size, - ErtsCpuBindOrder bind_order, - int mk_seq); -static void signal_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size); - #endif -static int reader_group_lookup(int logical); -static void create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, - int *cpudata_size); -static void destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata); - -static void early_cpu_bind_init(void); -static void late_cpu_bind_init(void); - #if defined(ERTS_SMP) && defined(ERTS_ENABLE_LOCK_CHECK) int erts_smp_lc_runq_is_locked(ErtsRunQueue *runq) @@ -469,13 +396,13 @@ erts_pre_init_process(void) /* initialize the scheduler */ void -erts_init_process(void) +erts_init_process(int ncpu) { Uint proc_bits = ERTS_PROC_BITS; #ifdef ERTS_SMP erts_disable_proc_not_running_opt = 0; - erts_init_proc_lock(); + erts_init_proc_lock(ncpu); #endif init_proclist_alloc(); @@ -1060,6 +987,8 @@ scheduler_wait(long *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq) sys_poll_aux_work: + ASSERT(!erts_port_task_have_outstanding_io_tasks()); + erl_sys_schedule(1); /* Might give us something to do */ dt = do_time_read_and_reset(); @@ -1155,6 +1084,8 @@ scheduler_wait(long *fcalls, ErtsSchedulerData *esdp, ErtsRunQueue *rq) erts_smp_runq_unlock(rq); + ASSERT(!erts_port_task_have_outstanding_io_tasks()); + erl_sys_schedule(0); dt = do_time_read_and_reset(); @@ -1242,7 +1173,7 @@ wake_scheduler(ErtsRunQueue *rq, int incq, int one) do { ErtsSchedulerSleepInfo *wake_ssi = ssi; ssi = ssi->next; - erts_sched_finish_poke(ssi, ssi_flags_set_wake(wake_ssi)); + erts_sched_finish_poke(wake_ssi, ssi_flags_set_wake(wake_ssi)); } while (ssi); } } @@ -1335,6 +1266,31 @@ erts_smp_notify_inc_runq(ErtsRunQueue *runq) smp_notify_inc_runq(runq); } +void +erts_sched_notify_check_cpu_bind(void) +{ +#ifdef ERTS_SMP + int ix; + if (erts_common_run_queue) { + for (ix = 0; ix < erts_no_schedulers; ix++) + erts_smp_atomic_set(&ERTS_SCHEDULER_IX(ix)->chk_cpu_bind, 1); + wake_all_schedulers(); + } + else { + for (ix = 0; ix < erts_no_run_queues; ix++) { + ErtsRunQueue *rq = ERTS_RUNQ_IX(ix); + erts_smp_runq_lock(rq); + rq->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; + erts_smp_runq_unlock(rq); + wake_scheduler(rq, 0, 1); + }; + } +#else + erts_sched_check_cpu_bind(erts_get_scheduler_data()); +#endif +} + + #ifdef ERTS_SMP ErtsRunQueue * @@ -2379,7 +2335,6 @@ erts_debug_nbalance(void) void erts_early_init_scheduling(void) { - early_cpu_bind_init(); wakeup_other_limit = ERTS_WAKEUP_OTHER_LIMIT_MEDIUM; } @@ -2656,8 +2611,6 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online) /* init port tasks */ erts_port_task_init(); - - late_cpu_bind_init(); } ErtsRunQueue * @@ -2883,12 +2836,10 @@ suspend_scheduler(ErtsSchedulerData *esdp) long flgs; int changing; long no = (long) esdp->no; - ErtsRunQueue *rq = esdp->run_queue; ErtsSchedulerSleepInfo *ssi = esdp->ssi; long active_schedulers; int curr_online = 1; int wake = 0; - int reset_read_group = 0; #if defined(ERTS_SCHED_NEED_NONBLOCKABLE_AUX_WORK) \ || defined(ERTS_SCHED_NEED_BLOCKABLE_AUX_WORK) long aux_work; @@ -2909,20 +2860,7 @@ suspend_scheduler(ErtsSchedulerData *esdp) erts_smp_runq_unlock(esdp->run_queue); - /* Unbind from cpu */ - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - if (scheduler2cpu_map[esdp->no].bound_id >= 0 - && erts_unbind_from_cpu(erts_cpuinfo) == 0) { - esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; - reset_read_group = 1; - } - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - - if (reset_read_group) - erts_smp_rwmtx_set_reader_group(0); - - if (esdp->no <= erts_max_main_threads) - erts_thr_set_main_status(0, 0); + erts_sched_check_cpu_bind_prep_suspend(esdp); if (erts_system_profile_flags.scheduler) profile_scheduler(make_small(esdp->no), am_inactive); @@ -3056,17 +2994,10 @@ suspend_scheduler(ErtsSchedulerData *esdp) if (erts_system_profile_flags.scheduler) profile_scheduler(make_small(esdp->no), am_active); - if (esdp->no <= erts_max_main_threads) - erts_thr_set_main_status(1, (int) esdp->no); - erts_smp_runq_lock(esdp->run_queue); non_empty_runq(esdp->run_queue); - /* Make sure we check if we should bind to a cpu or not... */ - if (rq->flags & ERTS_RUNQ_FLG_SHARED_RUNQ) - erts_smp_atomic_set(&esdp->chk_cpu_bind, 1); - else - rq->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; + erts_sched_check_cpu_bind_post_suspend(esdp); } #define ERTS_RUNQ_RESET_SUSPEND_INFO(RQ, DBG_ID) \ @@ -3583,15 +3514,7 @@ sched_thread_func(void *vesdp) erts_tsd_set(sched_data_key, vesdp); #ifdef ERTS_SMP - if (no <= erts_max_main_threads) { - erts_thr_set_main_status(1, (int) no); - if (erts_reader_groups) { - int rg = (int) no; - if (rg > erts_reader_groups) - rg = (((int) no) - 1) % erts_reader_groups + 1; - erts_smp_rwmtx_set_reader_group(rg); - } - } + erts_sched_init_check_cpu_bind((ErtsSchedulerData *) vesdp); erts_proc_lock_prepare_proc_lock_waiter(); ERTS_SCHED_SLEEP_INFO_IX(no - 1)->event = erts_tse_fetch(); @@ -3693,1907 +3616,6 @@ erts_start_schedulers(void) #endif /* ERTS_SMP */ -static int -int_cmp(const void *vx, const void *vy) -{ - return *((int *) vx) - *((int *) vy); -} - -static int -cpu_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->core != y->core) - return x->core - y->core; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->node != y->node) - return x->node - y->node; - return 0; -} - -static int -cpu_processor_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->core != y->core) - return x->core - y->core; - if (x->node != y->node) - return x->node - y->node; - if (x->processor != y->processor) - return x->processor - y->processor; - return 0; -} - -static int -cpu_thread_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->node != y->node) - return x->node - y->node; - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->core != y->core) - return x->core - y->core; - return 0; -} - -static int -cpu_thread_no_node_processor_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->node != y->node) - return x->node - y->node; - if (x->core != y->core) - return x->core - y->core; - if (x->processor != y->processor) - return x->processor - y->processor; - return 0; -} - -static int -cpu_no_node_processor_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->node != y->node) - return x->node - y->node; - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->core != y->core) - return x->core - y->core; - if (x->processor != y->processor) - return x->processor - y->processor; - return 0; -} - -static int -cpu_no_node_thread_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->node != y->node) - return x->node - y->node; - if (x->thread != y->thread) - return x->thread - y->thread; - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->core != y->core) - return x->core - y->core; - return 0; -} - -static int -cpu_no_spread_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->node != y->node) - return x->node - y->node; - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->core != y->core) - return x->core - y->core; - if (x->thread != y->thread) - return x->thread - y->thread; - return 0; -} - -static ERTS_INLINE void -make_cpudata_id_seq(erts_cpu_topology_t *cpudata, int size, int no_node) -{ - int ix; - int node = -1; - int processor = -1; - int processor_node = -1; - int processor_node_node = -1; - int core = -1; - int thread = -1; - int old_node = -1; - int old_processor = -1; - int old_processor_node = -1; - int old_core = -1; - int old_thread = -1; - - for (ix = 0; ix < size; ix++) { - if (!no_node || cpudata[ix].node >= 0) { - if (old_node == cpudata[ix].node) - cpudata[ix].node = node; - else { - old_node = cpudata[ix].node; - old_processor = processor = -1; - if (!no_node) - old_processor_node = processor_node = -1; - old_core = core = -1; - old_thread = thread = -1; - if (no_node || cpudata[ix].node >= 0) - cpudata[ix].node = ++node; - } - } - if (old_processor == cpudata[ix].processor) - cpudata[ix].processor = processor; - else { - old_processor = cpudata[ix].processor; - if (!no_node) - processor_node_node = old_processor_node = processor_node = -1; - old_core = core = -1; - old_thread = thread = -1; - cpudata[ix].processor = ++processor; - } - if (no_node && cpudata[ix].processor_node < 0) - old_processor_node = -1; - else { - if (old_processor_node == cpudata[ix].processor_node) { - if (no_node) - cpudata[ix].node = cpudata[ix].processor_node = node; - else { - if (processor_node_node >= 0) - cpudata[ix].node = processor_node_node; - cpudata[ix].processor_node = processor_node; - } - } - else { - old_processor_node = cpudata[ix].processor_node; - old_core = core = -1; - old_thread = thread = -1; - if (no_node) - cpudata[ix].node = cpudata[ix].processor_node = ++node; - else { - cpudata[ix].node = processor_node_node = ++node; - cpudata[ix].processor_node = ++processor_node; - } - } - } - if (!no_node && cpudata[ix].processor_node < 0) - cpudata[ix].processor_node = 0; - if (old_core == cpudata[ix].core) - cpudata[ix].core = core; - else { - old_core = cpudata[ix].core; - old_thread = thread = -1; - cpudata[ix].core = ++core; - } - if (old_thread == cpudata[ix].thread) - cpudata[ix].thread = thread; - else - old_thread = cpudata[ix].thread = ++thread; - } -} - -static void -cpu_bind_order_sort(erts_cpu_topology_t *cpudata, - int size, - ErtsCpuBindOrder bind_order, - int mk_seq) -{ - if (size > 1) { - int no_node = 0; - int (*cmp_func)(const void *, const void *); - switch (bind_order) { - case ERTS_CPU_BIND_SPREAD: - cmp_func = cpu_spread_order_cmp; - break; - case ERTS_CPU_BIND_PROCESSOR_SPREAD: - cmp_func = cpu_processor_spread_order_cmp; - break; - case ERTS_CPU_BIND_THREAD_SPREAD: - cmp_func = cpu_thread_spread_order_cmp; - break; - case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: - no_node = 1; - cmp_func = cpu_thread_no_node_processor_spread_order_cmp; - break; - case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: - no_node = 1; - cmp_func = cpu_no_node_processor_spread_order_cmp; - break; - case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: - no_node = 1; - cmp_func = cpu_no_node_thread_spread_order_cmp; - break; - case ERTS_CPU_BIND_NO_SPREAD: - cmp_func = cpu_no_spread_order_cmp; - break; - default: - cmp_func = NULL; - erl_exit(ERTS_ABORT_EXIT, - "Bad cpu bind type: %d\n", - (int) cpu_bind_order); - break; - } - - if (mk_seq) - make_cpudata_id_seq(cpudata, size, no_node); - - qsort(cpudata, size, sizeof(erts_cpu_topology_t), cmp_func); - } -} - -static int -processor_order_cmp(const void *vx, const void *vy) -{ - erts_cpu_topology_t *x = (erts_cpu_topology_t *) vx; - erts_cpu_topology_t *y = (erts_cpu_topology_t *) vy; - - if (x->processor != y->processor) - return x->processor - y->processor; - if (x->node != y->node) - return x->node - y->node; - if (x->processor_node != y->processor_node) - return x->processor_node - y->processor_node; - if (x->core != y->core) - return x->core - y->core; - if (x->thread != y->thread) - return x->thread - y->thread; - return 0; -} - -static void -check_cpu_bind(ErtsSchedulerData *esdp) -{ - int rg = 0; - int res; - int cpu_id; - erts_smp_runq_unlock(esdp->run_queue); - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - cpu_id = scheduler2cpu_map[esdp->no].bind_id; - if (cpu_id >= 0 && cpu_id != scheduler2cpu_map[esdp->no].bound_id) { - res = erts_bind_to_cpu(erts_cpuinfo, cpu_id); - if (res == 0) - esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = cpu_id; - else { - erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); - erts_dsprintf(dsbufp, "Scheduler %d failed to bind to cpu %d: %s\n", - (int) esdp->no, cpu_id, erl_errno_id(-res)); - erts_send_error_to_logger_nogl(dsbufp); - if (scheduler2cpu_map[esdp->no].bound_id >= 0) - goto unbind; - } - } - else if (cpu_id < 0) { - unbind: - /* Get rid of old binding */ - res = erts_unbind_from_cpu(erts_cpuinfo); - if (res == 0) - esdp->cpu_id = scheduler2cpu_map[esdp->no].bound_id = -1; - else if (res != -ENOTSUP) { - erts_dsprintf_buf_t *dsbufp = erts_create_logger_dsbuf(); - erts_dsprintf(dsbufp, "Scheduler %d failed to unbind from cpu %d: %s\n", - (int) esdp->no, cpu_id, erl_errno_id(-res)); - erts_send_error_to_logger_nogl(dsbufp); - } - } - if (erts_reader_groups) { - if (esdp->cpu_id >= 0) - rg = reader_group_lookup(esdp->cpu_id); - else - rg = (((int) esdp->no) - 1) % erts_reader_groups + 1; - } - erts_smp_runq_lock(esdp->run_queue); -#ifdef ERTS_SMP - if (erts_common_run_queue) - erts_smp_atomic_set(&esdp->chk_cpu_bind, 0); - else { - esdp->run_queue->flags &= ~ERTS_RUNQ_FLG_CHK_CPU_BIND; - } -#endif - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - - if (erts_reader_groups) - erts_smp_rwmtx_set_reader_group(rg); -} - -static void -signal_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size) -{ - int s_ix = 1; - int cpu_ix; - - if (cpu_bind_order != ERTS_CPU_BIND_NONE && size) { - - cpu_bind_order_sort(cpudata, size, cpu_bind_order, 1); - - for (cpu_ix = 0; cpu_ix < size && cpu_ix < erts_no_schedulers; cpu_ix++) - if (erts_is_cpu_available(erts_cpuinfo, cpudata[cpu_ix].logical)) - scheduler2cpu_map[s_ix++].bind_id = cpudata[cpu_ix].logical; - } - - if (s_ix <= erts_no_schedulers) - for (; s_ix <= erts_no_schedulers; s_ix++) - scheduler2cpu_map[s_ix].bind_id = -1; - -#ifdef ERTS_SMP - if (erts_common_run_queue) { - for (s_ix = 0; s_ix < erts_no_schedulers; s_ix++) - erts_smp_atomic_set(&ERTS_SCHEDULER_IX(s_ix)->chk_cpu_bind, 1); - wake_all_schedulers(); - } - else { - for (s_ix = 0; s_ix < erts_no_run_queues; s_ix++) { - ErtsRunQueue *rq = ERTS_RUNQ_IX(s_ix); - erts_smp_runq_lock(rq); - rq->flags |= ERTS_RUNQ_FLG_CHK_CPU_BIND; - erts_smp_runq_unlock(rq); - wake_scheduler(rq, 0, 1); - }; - } -#else - check_cpu_bind(erts_get_scheduler_data()); -#endif -} - -int -erts_init_scheduler_bind_type(char *how) -{ - if (erts_bind_to_cpu(erts_cpuinfo, -1) == -ENOTSUP) - return ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED; - - if (!system_cpudata && !user_cpudata) - return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY; - - if (sys_strcmp(how, "db") == 0) - cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; - else if (sys_strcmp(how, "s") == 0) - cpu_bind_order = ERTS_CPU_BIND_SPREAD; - else if (sys_strcmp(how, "ps") == 0) - cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; - else if (sys_strcmp(how, "ts") == 0) - cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; - else if (sys_strcmp(how, "tnnps") == 0) - cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; - else if (sys_strcmp(how, "nnps") == 0) - cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; - else if (sys_strcmp(how, "nnts") == 0) - cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; - else if (sys_strcmp(how, "ns") == 0) - cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; - else if (sys_strcmp(how, "u") == 0) - cpu_bind_order = ERTS_CPU_BIND_NONE; - else - return ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE; - - return ERTS_INIT_SCHED_BIND_TYPE_SUCCESS; -} - -/* - * reader groups map - */ - -typedef struct { - int level[ERTS_TOPOLOGY_MAX_DEPTH+1]; -} erts_avail_cput; - -typedef struct { - int *map; - int size; - int groups; -} erts_reader_groups_map_test; - -typedef struct { - int id; - int sub_levels; - int reader_groups; -} erts_rg_count_t; - -typedef struct { - int logical; - int reader_group; -} erts_reader_groups_map_t; - -typedef struct { - erts_reader_groups_map_t *map; - int map_size; - int logical_processors; - int groups; -} erts_make_reader_groups_map_test; - -static int reader_groups_available_cpu_check; -static int reader_groups_logical_processors; -static int reader_groups_map_size; -static erts_reader_groups_map_t *reader_groups_map; - -#define ERTS_TOPOLOGY_RG ERTS_TOPOLOGY_MAX_DEPTH - -static void -make_reader_groups_map(erts_make_reader_groups_map_test *test); - -static Eterm -get_reader_groups_map(Process *c_p, - erts_reader_groups_map_t *map, - int map_size, - int logical_processors) -{ -#ifdef DEBUG - Eterm *endp; -#endif - Eterm res = NIL, tuple; - Eterm *hp; - int i; - - hp = HAlloc(c_p, logical_processors*(2+3)); -#ifdef DEBUG - endp = hp + logical_processors*(2+3); -#endif - for (i = map_size - 1; i >= 0; i--) { - if (map[i].logical >= 0) { - tuple = TUPLE2(hp, - make_small(map[i].logical), - make_small(map[i].reader_group)); - hp += 3; - res = CONS(hp, tuple, res); - hp += 2; - } - } - ASSERT(hp == endp); - return res; -} - -Eterm -erts_debug_reader_groups_map(Process *c_p, int groups) -{ - Eterm res; - erts_make_reader_groups_map_test test; - - test.groups = groups; - make_reader_groups_map(&test); - if (!test.map) - res = NIL; - else { - res = get_reader_groups_map(c_p, - test.map, - test.map_size, - test.logical_processors); - erts_free(ERTS_ALC_T_TMP, test.map); - } - return res; -} - - -Eterm -erts_get_reader_groups_map(Process *c_p) -{ - Eterm res; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - res = get_reader_groups_map(c_p, - reader_groups_map, - reader_groups_map_size, - reader_groups_logical_processors); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - return res; -} - -static void -make_available_cpu_topology(erts_avail_cput *no, - erts_avail_cput *avail, - erts_cpu_topology_t *cpudata, - int *size, - int test) -{ - int len = *size; - erts_cpu_topology_t last; - int a, i, j; - - no->level[ERTS_TOPOLOGY_NODE] = -1; - no->level[ERTS_TOPOLOGY_PROCESSOR] = -1; - no->level[ERTS_TOPOLOGY_PROCESSOR_NODE] = -1; - no->level[ERTS_TOPOLOGY_CORE] = -1; - no->level[ERTS_TOPOLOGY_THREAD] = -1; - no->level[ERTS_TOPOLOGY_LOGICAL] = -1; - - last.node = INT_MIN; - last.processor = INT_MIN; - last.processor_node = INT_MIN; - last.core = INT_MIN; - last.thread = INT_MIN; - last.logical = INT_MIN; - - a = 0; - - for (i = 0; i < len; i++) { - - if (!test && !erts_is_cpu_available(erts_cpuinfo, cpudata[i].logical)) - continue; - - if (last.node != cpudata[i].node) - goto node; - if (last.processor != cpudata[i].processor) - goto processor; - if (last.processor_node != cpudata[i].processor_node) - goto processor_node; - if (last.core != cpudata[i].core) - goto core; - ASSERT(last.thread != cpudata[i].thread); - goto thread; - - node: - no->level[ERTS_TOPOLOGY_NODE]++; - processor: - no->level[ERTS_TOPOLOGY_PROCESSOR]++; - processor_node: - no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; - core: - no->level[ERTS_TOPOLOGY_CORE]++; - thread: - no->level[ERTS_TOPOLOGY_THREAD]++; - - no->level[ERTS_TOPOLOGY_LOGICAL]++; - - for (j = 0; j < ERTS_TOPOLOGY_LOGICAL; j++) - avail[a].level[j] = no->level[j]; - - avail[a].level[ERTS_TOPOLOGY_LOGICAL] = cpudata[i].logical; - avail[a].level[ERTS_TOPOLOGY_RG] = 0; - - ASSERT(last.logical != cpudata[a].logical); - - last = cpudata[i]; - a++; - } - - no->level[ERTS_TOPOLOGY_NODE]++; - no->level[ERTS_TOPOLOGY_PROCESSOR]++; - no->level[ERTS_TOPOLOGY_PROCESSOR_NODE]++; - no->level[ERTS_TOPOLOGY_CORE]++; - no->level[ERTS_TOPOLOGY_THREAD]++; - no->level[ERTS_TOPOLOGY_LOGICAL]++; - - *size = a; -} - -static int -reader_group_lookup(int logical) -{ - int start = logical % reader_groups_map_size; - int ix = start; - - do { - if (reader_groups_map[ix].logical == logical) { - ASSERT(reader_groups_map[ix].reader_group > 0); - return reader_groups_map[ix].reader_group; - } - ix++; - if (ix == reader_groups_map_size) - ix = 0; - } while (ix != start); - - erl_exit(ERTS_ABORT_EXIT, "Logical cpu id %d not found\n", logical); -} - -static void -reader_group_insert(erts_reader_groups_map_t *map, int map_size, - int logical, int reader_group) -{ - int start = logical % map_size; - int ix = start; - - do { - if (map[ix].logical < 0) { - map[ix].logical = logical; - map[ix].reader_group = reader_group; - return; - } - ix++; - if (ix == map_size) - ix = 0; - } while (ix != start); - - erl_exit(ERTS_ABORT_EXIT, "Reader groups map full\n"); -} - - -static int -sub_levels(erts_rg_count_t *rgc, int level, int aix, int avail_sz, erts_avail_cput *avail) -{ - int sub_level = level+1; - int last = -1; - rgc->sub_levels = 0; - - do { - if (last != avail[aix].level[sub_level]) { - rgc->sub_levels++; - last = avail[aix].level[sub_level]; - } - aix++; - } - while (aix < avail_sz && rgc->id == avail[aix].level[level]); - rgc->reader_groups = 0; - return aix; -} - -static int -write_reader_groups(int *rgp, erts_rg_count_t *rgcp, - int level, int a, - int avail_sz, erts_avail_cput *avail) -{ - int rg = *rgp; - int sub_level = level+1; - int sl_per_gr = rgcp->sub_levels / rgcp->reader_groups; - int xsl = rgcp->sub_levels % rgcp->reader_groups; - int sls = 0; - int last = -1; - int xsl_rg_lim = (rgcp->reader_groups - xsl) + rg + 1; - - ASSERT(level < 0 || avail[a].level[level] == rgcp->id) - - do { - if (last != avail[a].level[sub_level]) { - if (!sls) { - sls = sl_per_gr; - rg++; - if (rg >= xsl_rg_lim) - sls++; - } - last = avail[a].level[sub_level]; - sls--; - } - avail[a].level[ERTS_TOPOLOGY_RG] = rg; - a++; - } while (a < avail_sz && (level < 0 - || avail[a].level[level] == rgcp->id)); - - ASSERT(rgcp->reader_groups == rg - *rgp); - - *rgp = rg; - - return a; -} - -static int -rg_count_sub_levels_compare(const void *vx, const void *vy) -{ - erts_rg_count_t *x = (erts_rg_count_t *) vx; - erts_rg_count_t *y = (erts_rg_count_t *) vy; - if (x->sub_levels != y->sub_levels) - return y->sub_levels - x->sub_levels; - return x->id - y->id; -} - -static int -rg_count_id_compare(const void *vx, const void *vy) -{ - erts_rg_count_t *x = (erts_rg_count_t *) vx; - erts_rg_count_t *y = (erts_rg_count_t *) vy; - return x->id - y->id; -} - -static void -make_reader_groups_map(erts_make_reader_groups_map_test *test) -{ - int i, spread_level, avail_sz; - erts_avail_cput no, *avail; - erts_cpu_topology_t *cpudata; - erts_reader_groups_map_t *map; - int map_sz; - int groups = erts_reader_groups; - - if (test) { - test->map = NULL; - test->map_size = 0; - groups = test->groups; - } - - if (!groups) - return; - - if (!test) { - if (reader_groups_map) - erts_free(ERTS_ALC_T_RDR_GRPS_MAP, reader_groups_map); - - reader_groups_logical_processors = 0; - reader_groups_map_size = 0; - reader_groups_map = NULL; - } - - create_tmp_cpu_topology_copy(&cpudata, &avail_sz); - - if (!cpudata) - return; - - cpu_bind_order_sort(cpudata, - avail_sz, - ERTS_CPU_BIND_NO_SPREAD, - 1); - - avail = erts_alloc(ERTS_ALC_T_TMP, - sizeof(erts_avail_cput)*avail_sz); - - make_available_cpu_topology(&no, avail, cpudata, - &avail_sz, test != NULL); - - destroy_tmp_cpu_topology_copy(cpudata); - - map_sz = avail_sz*2+1; - - if (test) { - map = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_reader_groups_map_t) - * map_sz)); - test->map = map; - test->map_size = map_sz; - test->logical_processors = avail_sz; - } - else { - map = erts_alloc(ERTS_ALC_T_RDR_GRPS_MAP, - (sizeof(erts_reader_groups_map_t) - * map_sz)); - reader_groups_map = map; - reader_groups_logical_processors = avail_sz; - reader_groups_map_size = map_sz; - - } - - for (i = 0; i < map_sz; i++) { - map[i].logical = -1; - map[i].reader_group = 0; - } - - spread_level = ERTS_TOPOLOGY_CORE; - for (i = ERTS_TOPOLOGY_NODE; i < ERTS_TOPOLOGY_THREAD; i++) { - if (no.level[i] > groups) { - spread_level = i; - break; - } - } - - if (no.level[spread_level] <= groups) { - int a, rg, last = -1; - rg = 0; - ASSERT(spread_level == ERTS_TOPOLOGY_CORE); - for (a = 0; a < avail_sz; a++) { - if (last != avail[a].level[spread_level]) { - rg++; - last = avail[a].level[spread_level]; - } - reader_group_insert(map, - map_sz, - avail[a].level[ERTS_TOPOLOGY_LOGICAL], - rg); - } - } - else { /* groups < no.level[spread_level] */ - erts_rg_count_t *rg_count; - int a, rg, tl, toplevels; - - tl = spread_level-1; - - if (spread_level == ERTS_TOPOLOGY_NODE) - toplevels = 1; - else - toplevels = no.level[tl]; - - rg_count = erts_alloc(ERTS_ALC_T_TMP, - toplevels*sizeof(erts_rg_count_t)); - - if (toplevels == 1) { - rg_count[0].id = 0; - rg_count[0].sub_levels = no.level[spread_level]; - rg_count[0].reader_groups = groups; - } - else { - int rgs_per_tl, rgs; - rgs = groups; - rgs_per_tl = rgs / toplevels; - - a = 0; - for (i = 0; i < toplevels; i++) { - rg_count[i].id = avail[a].level[tl]; - a = sub_levels(&rg_count[i], tl, a, avail_sz, avail); - } - - qsort(rg_count, - toplevels, - sizeof(erts_rg_count_t), - rg_count_sub_levels_compare); - - for (i = 0; i < toplevels; i++) { - if (rg_count[i].sub_levels < rgs_per_tl) { - rg_count[i].reader_groups = rg_count[i].sub_levels; - rgs -= rg_count[i].sub_levels; - } - else { - rg_count[i].reader_groups = rgs_per_tl; - rgs -= rgs_per_tl; - } - } - - while (rgs > 0) { - for (i = 0; i < toplevels; i++) { - if (rg_count[i].sub_levels == rg_count[i].reader_groups) - break; - else { - rg_count[i].reader_groups++; - if (--rgs == 0) - break; - } - } - } - - qsort(rg_count, - toplevels, - sizeof(erts_rg_count_t), - rg_count_id_compare); - } - - a = i = rg = 0; - while (a < avail_sz) { - a = write_reader_groups(&rg, &rg_count[i], tl, - a, avail_sz, avail); - i++; - } - - ASSERT(groups == rg); - - for (a = 0; a < avail_sz; a++) - reader_group_insert(map, - map_sz, - avail[a].level[ERTS_TOPOLOGY_LOGICAL], - avail[a].level[ERTS_TOPOLOGY_RG]); - - erts_free(ERTS_ALC_T_TMP, rg_count); - } - - erts_free(ERTS_ALC_T_TMP, avail); -} - -/* - * CPU topology - */ - -typedef struct { - int *id; - int used; - int size; -} ErtsCpuTopIdSeq; - -typedef struct { - ErtsCpuTopIdSeq logical; - ErtsCpuTopIdSeq thread; - ErtsCpuTopIdSeq core; - ErtsCpuTopIdSeq processor_node; - ErtsCpuTopIdSeq processor; - ErtsCpuTopIdSeq node; -} ErtsCpuTopEntry; - -static void -init_cpu_top_entry(ErtsCpuTopEntry *cte) -{ - int size = 10; - cte->logical.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->logical.size = size; - cte->thread.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->thread.size = size; - cte->core.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->core.size = size; - cte->processor_node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->processor_node.size = size; - cte->processor.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->processor.size = size; - cte->node.id = erts_alloc(ERTS_ALC_T_TMP_CPU_IDS, - sizeof(int)*size); - cte->node.size = size; -} - -static void -destroy_cpu_top_entry(ErtsCpuTopEntry *cte) -{ - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->logical.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->thread.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->core.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor_node.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->processor.id); - erts_free(ERTS_ALC_T_TMP_CPU_IDS, cte->node.id); -} - -static int -get_cput_value_or_range(int *v, int *vr, char **str) -{ - long l; - char *c = *str; - errno = 0; - if (!isdigit((unsigned char)*c)) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; - l = strtol(c, &c, 10); - if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID; - *v = (int) l; - if (*c == '-') { - c++; - if (!isdigit((unsigned char)*c)) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - l = strtol(c, &c, 10); - if (errno != 0 || l < 0 || ERTS_MAX_CPU_TOPOLOGY_ID < l) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - *vr = (int) l; - } - *str = c; - return ERTS_INIT_CPU_TOPOLOGY_OK; -} - -static int -get_cput_id_seq(ErtsCpuTopIdSeq *idseq, char **str) -{ - int ix = 0; - int need_size = 0; - char *c = *str; - - while (1) { - int res; - int val; - int nids; - int val_range = -1; - res = get_cput_value_or_range(&val, &val_range, &c); - if (res != ERTS_INIT_CPU_TOPOLOGY_OK) - return res; - if (val_range < 0 || val_range == val) - nids = 1; - else { - if (val_range > val) - nids = val_range - val + 1; - else - nids = val - val_range + 1; - } - need_size += nids; - if (need_size > idseq->size) { - idseq->size = need_size + 10; - idseq->id = erts_realloc(ERTS_ALC_T_TMP_CPU_IDS, - idseq->id, - sizeof(int)*idseq->size); - } - if (nids == 1) - idseq->id[ix++] = val; - else if (val_range > val) { - for (; val <= val_range; val++) - idseq->id[ix++] = val; - } - else { - for (; val >= val_range; val--) - idseq->id[ix++] = val; - } - if (*c != ',') - break; - c++; - } - *str = c; - idseq->used = ix; - return ERTS_INIT_CPU_TOPOLOGY_OK; -} - -static int -get_cput_entry(ErtsCpuTopEntry *cput, char **str) -{ - int h; - char *c = *str; - - cput->logical.used = 0; - cput->thread.id[0] = 0; - cput->thread.used = 1; - cput->core.id[0] = 0; - cput->core.used = 1; - cput->processor_node.id[0] = -1; - cput->processor_node.used = 1; - cput->processor.id[0] = 0; - cput->processor.used = 1; - cput->node.id[0] = -1; - cput->node.used = 1; - - h = ERTS_TOPOLOGY_MAX_DEPTH; - while (*c != ':' && *c != '\0') { - int res; - ErtsCpuTopIdSeq *idseqp; - switch (*c++) { - case 'L': - if (h <= ERTS_TOPOLOGY_LOGICAL) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->logical; - h = ERTS_TOPOLOGY_LOGICAL; - break; - case 't': - case 'T': - if (h <= ERTS_TOPOLOGY_THREAD) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->thread; - h = ERTS_TOPOLOGY_THREAD; - break; - case 'c': - case 'C': - if (h <= ERTS_TOPOLOGY_CORE) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->core; - h = ERTS_TOPOLOGY_CORE; - break; - case 'p': - case 'P': - if (h <= ERTS_TOPOLOGY_PROCESSOR) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->processor; - h = ERTS_TOPOLOGY_PROCESSOR; - break; - case 'n': - case 'N': - if (h <= ERTS_TOPOLOGY_PROCESSOR) { - do_node: - if (h <= ERTS_TOPOLOGY_NODE) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->node; - h = ERTS_TOPOLOGY_NODE; - } - else { - int p_node = 0; - char *p_chk = c; - while (*p_chk != '\0' && *p_chk != ':') { - if (*p_chk == 'p' || *p_chk == 'P') { - p_node = 1; - break; - } - p_chk++; - } - if (!p_node) - goto do_node; - if (h <= ERTS_TOPOLOGY_PROCESSOR_NODE) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY; - idseqp = &cput->processor_node; - h = ERTS_TOPOLOGY_PROCESSOR_NODE; - } - break; - default: - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE; - } - res = get_cput_id_seq(idseqp, &c); - if (res != ERTS_INIT_CPU_TOPOLOGY_OK) - return res; - } - - if (cput->logical.used < 1) - return ERTS_INIT_CPU_TOPOLOGY_MISSING_LID; - - if (*c == ':') { - c++; - } - - if (cput->thread.used != 1 - && cput->thread.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - if (cput->core.used != 1 - && cput->core.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - if (cput->processor_node.used != 1 - && cput->processor_node.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - if (cput->processor.used != 1 - && cput->processor.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - if (cput->node.used != 1 - && cput->node.used != cput->logical.used) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE; - - *str = c; - return ERTS_INIT_CPU_TOPOLOGY_OK; -} - -static int -verify_topology(erts_cpu_topology_t *cpudata, int size) -{ - if (size > 0) { - int *logical; - int node, processor, no_nodes, i; - - /* Verify logical ids */ - logical = erts_alloc(ERTS_ALC_T_TMP, sizeof(int)*size); - - for (i = 0; i < size; i++) - logical[i] = cpudata[i].logical; - - qsort(logical, size, sizeof(int), int_cmp); - for (i = 0; i < size-1; i++) { - if (logical[i] == logical[i+1]) { - erts_free(ERTS_ALC_T_TMP, logical); - return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS; - } - } - - erts_free(ERTS_ALC_T_TMP, logical); - - qsort(cpudata, size, sizeof(erts_cpu_topology_t), processor_order_cmp); - - /* Verify unique entities */ - - for (i = 1; i < size; i++) { - if (cpudata[i-1].processor == cpudata[i].processor - && cpudata[i-1].node == cpudata[i].node - && (cpudata[i-1].processor_node - == cpudata[i].processor_node) - && cpudata[i-1].core == cpudata[i].core - && cpudata[i-1].thread == cpudata[i].thread) { - return ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES; - } - } - - /* Verify numa nodes */ - node = cpudata[0].node; - processor = cpudata[0].processor; - no_nodes = cpudata[0].node < 0 && cpudata[0].processor_node < 0; - for (i = 1; i < size; i++) { - if (no_nodes) { - if (cpudata[i].node >= 0 || cpudata[i].processor_node >= 0) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; - } - else { - if (cpudata[i].processor == processor && cpudata[i].node != node) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; - node = cpudata[i].node; - processor = cpudata[i].processor; - if (node >= 0 && cpudata[i].processor_node >= 0) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; - if (node < 0 && cpudata[i].processor_node < 0) - return ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES; - } - } - } - - return ERTS_INIT_CPU_TOPOLOGY_OK; -} - -int -erts_init_cpu_topology(char *topology_str) -{ - ErtsCpuTopEntry cput; - int need_size; - char *c; - int ix; - int error = ERTS_INIT_CPU_TOPOLOGY_OK; - - if (user_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); - user_cpudata_size = 10; - - user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, - (sizeof(erts_cpu_topology_t) - * user_cpudata_size)); - - init_cpu_top_entry(&cput); - - ix = 0; - need_size = 0; - - c = topology_str; - if (*c == '\0') { - error = ERTS_INIT_CPU_TOPOLOGY_MISSING; - goto fail; - } - do { - int r; - error = get_cput_entry(&cput, &c); - if (error != ERTS_INIT_CPU_TOPOLOGY_OK) - goto fail; - need_size += cput.logical.used; - if (user_cpudata_size < need_size) { - user_cpudata_size = need_size + 10; - user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, - user_cpudata, - (sizeof(erts_cpu_topology_t) - * user_cpudata_size)); - } - - ASSERT(cput.thread.used == 1 - || cput.thread.used == cput.logical.used); - ASSERT(cput.core.used == 1 - || cput.core.used == cput.logical.used); - ASSERT(cput.processor_node.used == 1 - || cput.processor_node.used == cput.logical.used); - ASSERT(cput.processor.used == 1 - || cput.processor.used == cput.logical.used); - ASSERT(cput.node.used == 1 - || cput.node.used == cput.logical.used); - - for (r = 0; r < cput.logical.used; r++) { - user_cpudata[ix].logical = cput.logical.id[r]; - user_cpudata[ix].thread = - cput.thread.id[cput.thread.used == 1 ? 0 : r]; - user_cpudata[ix].core = - cput.core.id[cput.core.used == 1 ? 0 : r]; - user_cpudata[ix].processor_node = - cput.processor_node.id[cput.processor_node.used == 1 ? 0 : r]; - user_cpudata[ix].processor = - cput.processor.id[cput.processor.used == 1 ? 0 : r]; - user_cpudata[ix].node = - cput.node.id[cput.node.used == 1 ? 0 : r]; - ix++; - } - } while (*c != '\0'); - - if (user_cpudata_size != ix) { - user_cpudata_size = ix; - user_cpudata = erts_realloc(ERTS_ALC_T_CPUDATA, - user_cpudata, - (sizeof(erts_cpu_topology_t) - * user_cpudata_size)); - } - - error = verify_topology(user_cpudata, user_cpudata_size); - if (error == ERTS_INIT_CPU_TOPOLOGY_OK) { - destroy_cpu_top_entry(&cput); - return ERTS_INIT_CPU_TOPOLOGY_OK; - } - - fail: - if (user_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); - user_cpudata_size = 0; - destroy_cpu_top_entry(&cput); - return error; -} - -#define ERTS_GET_CPU_TOPOLOGY_ERROR -1 -#define ERTS_GET_USED_CPU_TOPOLOGY 0 -#define ERTS_GET_DETECTED_CPU_TOPOLOGY 1 -#define ERTS_GET_DEFINED_CPU_TOPOLOGY 2 - -static Eterm get_cpu_topology_term(Process *c_p, int type); - -Eterm -erts_set_cpu_topology(Process *c_p, Eterm term) -{ - erts_cpu_topology_t *cpudata = NULL; - int cpudata_size = 0; - Eterm res; - - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - res = get_cpu_topology_term(c_p, ERTS_GET_USED_CPU_TOPOLOGY); - if (term == am_undefined) { - if (user_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); - user_cpudata = NULL; - user_cpudata_size = 0; - - if (cpu_bind_order != ERTS_CPU_BIND_NONE && system_cpudata) { - cpudata_size = system_cpudata_size; - cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * cpudata_size)); - - sys_memcpy((void *) cpudata, - (void *) system_cpudata, - sizeof(erts_cpu_topology_t)*cpudata_size); - } - } - else if (is_not_list(term)) { - error: - res = THE_NON_VALUE; - goto done; - } - else { - Eterm list = term; - int ix = 0; - - cpudata_size = 100; - cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * cpudata_size)); - - while (is_list(list)) { - Eterm *lp = list_val(list); - Eterm cpu = CAR(lp); - Eterm* tp; - Sint id; - - if (is_not_tuple(cpu)) - goto error; - - tp = tuple_val(cpu); - - if (arityval(tp[0]) != 7 || tp[1] != am_cpu) - goto error; - - if (ix >= cpudata_size) { - cpudata_size += 100; - cpudata = erts_realloc(ERTS_ALC_T_TMP, - cpudata, - (sizeof(erts_cpu_topology_t) - * cpudata_size)); - } - - id = signed_val(tp[2]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].node = (int) id; - - id = signed_val(tp[3]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].processor = (int) id; - - id = signed_val(tp[4]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].processor_node = (int) id; - - id = signed_val(tp[5]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].core = (int) id; - - id = signed_val(tp[6]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].thread = (int) id; - - id = signed_val(tp[7]); - if (id < -1 || ERTS_MAX_CPU_TOPOLOGY_ID < id) - goto error; - cpudata[ix].logical = (int) id; - - list = CDR(lp); - ix++; - } - - if (is_not_nil(list)) - goto error; - - cpudata_size = ix; - - if (ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(cpudata, cpudata_size)) - goto error; - - if (user_cpudata_size != cpudata_size) { - if (user_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, user_cpudata); - user_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, - sizeof(erts_cpu_topology_t)*cpudata_size); - user_cpudata_size = cpudata_size; - } - - sys_memcpy((void *) user_cpudata, - (void *) cpudata, - sizeof(erts_cpu_topology_t)*cpudata_size); - } - - make_reader_groups_map(NULL); - - signal_schedulers_bind_change(cpudata, cpudata_size); - - done: - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - - if (cpudata) - erts_free(ERTS_ALC_T_TMP, cpudata); - - return res; -} - -static Eterm -bound_schedulers_term(ErtsCpuBindOrder order) -{ - switch (order) { - case ERTS_CPU_BIND_SPREAD: { - ERTS_DECL_AM(spread); - return AM_spread; - } - case ERTS_CPU_BIND_PROCESSOR_SPREAD: { - ERTS_DECL_AM(processor_spread); - return AM_processor_spread; - } - case ERTS_CPU_BIND_THREAD_SPREAD: { - ERTS_DECL_AM(thread_spread); - return AM_thread_spread; - } - case ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD: { - ERTS_DECL_AM(thread_no_node_processor_spread); - return AM_thread_no_node_processor_spread; - } - case ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD: { - ERTS_DECL_AM(no_node_processor_spread); - return AM_no_node_processor_spread; - } - case ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD: { - ERTS_DECL_AM(no_node_thread_spread); - return AM_no_node_thread_spread; - } - case ERTS_CPU_BIND_NO_SPREAD: { - ERTS_DECL_AM(no_spread); - return AM_no_spread; - } - case ERTS_CPU_BIND_NONE: { - ERTS_DECL_AM(unbound); - return AM_unbound; - } - default: - ASSERT(0); - return THE_NON_VALUE; - } -} - -Eterm -erts_bound_schedulers_term(Process *c_p) -{ - ErtsCpuBindOrder order; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - order = cpu_bind_order; - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - return bound_schedulers_term(order); -} - -static void -create_tmp_cpu_topology_copy(erts_cpu_topology_t **cpudata, int *cpudata_size) -{ - if (user_cpudata) { - *cpudata_size = user_cpudata_size; - *cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * (*cpudata_size))); - sys_memcpy((void *) *cpudata, - (void *) user_cpudata, - sizeof(erts_cpu_topology_t)*(*cpudata_size)); - } - else if (system_cpudata) { - *cpudata_size = system_cpudata_size; - *cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * (*cpudata_size))); - sys_memcpy((void *) *cpudata, - (void *) system_cpudata, - sizeof(erts_cpu_topology_t)*(*cpudata_size)); - } - else { - *cpudata = NULL; - *cpudata_size = 0; - } -} - -static void -destroy_tmp_cpu_topology_copy(erts_cpu_topology_t *cpudata) -{ - if (cpudata) - erts_free(ERTS_ALC_T_TMP, cpudata); -} - -Eterm -erts_bind_schedulers(Process *c_p, Eterm how) -{ - Eterm res; - erts_cpu_topology_t *cpudata; - int cpudata_size; - ErtsCpuBindOrder old_cpu_bind_order; - - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - - if (erts_bind_to_cpu(erts_cpuinfo, -1) == -ENOTSUP) { - ERTS_BIF_PREP_ERROR(res, c_p, EXC_NOTSUP); - } - else { - - old_cpu_bind_order = cpu_bind_order; - - if (ERTS_IS_ATOM_STR("default_bind", how)) - cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; - else if (ERTS_IS_ATOM_STR("spread", how)) - cpu_bind_order = ERTS_CPU_BIND_SPREAD; - else if (ERTS_IS_ATOM_STR("processor_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("thread_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; - else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; - else if (ERTS_IS_ATOM_STR("no_spread", how)) - cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; - else if (ERTS_IS_ATOM_STR("unbound", how)) - cpu_bind_order = ERTS_CPU_BIND_NONE; - else { - cpu_bind_order = old_cpu_bind_order; - ERTS_BIF_PREP_ERROR(res, c_p, BADARG); - goto done; - } - - create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); - - if (!cpudata) { - cpu_bind_order = old_cpu_bind_order; - ERTS_BIF_PREP_ERROR(res, c_p, BADARG); - goto done; - } - - signal_schedulers_bind_change(cpudata, cpudata_size); - - destroy_tmp_cpu_topology_copy(cpudata); - - res = bound_schedulers_term(old_cpu_bind_order); - } - - done: - - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - - return res; -} - -Eterm -erts_fake_scheduler_bindings(Process *p, Eterm how) -{ - ErtsCpuBindOrder fake_cpu_bind_order; - erts_cpu_topology_t *cpudata; - int cpudata_size; - Eterm res; - - if (ERTS_IS_ATOM_STR("default_bind", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_DEFAULT_BIND; - else if (ERTS_IS_ATOM_STR("spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_SPREAD; - else if (ERTS_IS_ATOM_STR("processor_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("thread_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_SPREAD; - else if (ERTS_IS_ATOM_STR("thread_no_node_processor_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("no_node_processor_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD; - else if (ERTS_IS_ATOM_STR("no_node_thread_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD; - else if (ERTS_IS_ATOM_STR("no_spread", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_NO_SPREAD; - else if (ERTS_IS_ATOM_STR("unbound", how)) - fake_cpu_bind_order = ERTS_CPU_BIND_NONE; - else { - ERTS_BIF_PREP_ERROR(res, p, BADARG); - return res; - } - - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - - if (!cpudata || fake_cpu_bind_order == ERTS_CPU_BIND_NONE) - ERTS_BIF_PREP_RET(res, am_false); - else { - int i; - Eterm *hp; - - cpu_bind_order_sort(cpudata, cpudata_size, fake_cpu_bind_order, 1); - -#ifdef ERTS_FAKE_SCHED_BIND_PRINT_SORTED_CPU_DATA - - erts_fprintf(stderr, "node: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].node); - erts_fprintf(stderr, "\n"); - erts_fprintf(stderr, "processor: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].processor); - erts_fprintf(stderr, "\n"); - if (fake_cpu_bind_order != ERTS_CPU_BIND_THREAD_NO_NODE_PROCESSOR_SPREAD - && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_PROCESSOR_SPREAD - && fake_cpu_bind_order != ERTS_CPU_BIND_NO_NODE_THREAD_SPREAD) { - erts_fprintf(stderr, "processor_node:"); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].processor_node); - erts_fprintf(stderr, "\n"); - } - erts_fprintf(stderr, "core: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].core); - erts_fprintf(stderr, "\n"); - erts_fprintf(stderr, "thread: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].thread); - erts_fprintf(stderr, "\n"); - erts_fprintf(stderr, "logical: "); - for (i = 0; i < cpudata_size; i++) - erts_fprintf(stderr, " %2d", cpudata[i].logical); - erts_fprintf(stderr, "\n"); -#endif - - hp = HAlloc(p, cpudata_size+1); - ERTS_BIF_PREP_RET(res, make_tuple(hp)); - *hp++ = make_arityval((Uint) cpudata_size); - for (i = 0; i < cpudata_size; i++) - *hp++ = make_small((Uint) cpudata[i].logical); - } - - destroy_tmp_cpu_topology_copy(cpudata); - - return res; -} - -Eterm -erts_get_schedulers_binds(Process *c_p) -{ - int ix; - ERTS_DECL_AM(unbound); - Eterm *hp = HAlloc(c_p, erts_no_schedulers+1); - Eterm res = make_tuple(hp); - - *(hp++) = make_arityval(erts_no_schedulers); - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - for (ix = 1; ix <= erts_no_schedulers; ix++) - *(hp++) = (scheduler2cpu_map[ix].bound_id >= 0 - ? make_small(scheduler2cpu_map[ix].bound_id) - : AM_unbound); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - return res; -} - -static Eterm -bld_topology_term(Eterm **hpp, - Uint *hszp, - erts_cpu_topology_t *cpudata, - int size) -{ - Eterm res = NIL; - int i; - - if (size == 0) - return am_undefined; - - for (i = size-1; i >= 0; i--) { - res = erts_bld_cons(hpp, - hszp, - erts_bld_tuple(hpp, - hszp, - 7, - am_cpu, - make_small(cpudata[i].node), - make_small(cpudata[i].processor), - make_small(cpudata[i].processor_node), - make_small(cpudata[i].core), - make_small(cpudata[i].thread), - make_small(cpudata[i].logical)), - res); - } - return res; -} - -static Eterm -get_cpu_topology_term(Process *c_p, int type) -{ -#ifdef DEBUG - Eterm *hp_end; -#endif - Eterm *hp; - Uint hsz; - Eterm res = THE_NON_VALUE; - erts_cpu_topology_t *cpudata = NULL; - int size = 0; - - switch (type) { - case ERTS_GET_USED_CPU_TOPOLOGY: - if (user_cpudata) - goto defined; - else - goto detected; - case ERTS_GET_DETECTED_CPU_TOPOLOGY: - detected: - if (!system_cpudata) - res = am_undefined; - else { - size = system_cpudata_size; - cpudata = erts_alloc(ERTS_ALC_T_TMP, - (sizeof(erts_cpu_topology_t) - * size)); - sys_memcpy((void *) cpudata, - (void *) system_cpudata, - sizeof(erts_cpu_topology_t)*size); - } - break; - case ERTS_GET_DEFINED_CPU_TOPOLOGY: - defined: - if (!user_cpudata) - res = am_undefined; - else { - size = user_cpudata_size; - cpudata = user_cpudata; - } - break; - default: - erl_exit(ERTS_ABORT_EXIT, "Bad cpu topology type: %d\n", type); - break; - } - - if (res == am_undefined) { - ASSERT(!cpudata); - return res; - } - - hsz = 0; - - bld_topology_term(NULL, &hsz, - cpudata, size); - - hp = HAlloc(c_p, hsz); - -#ifdef DEBUG - hp_end = hp + hsz; -#endif - - res = bld_topology_term(&hp, NULL, - cpudata, size); - - ASSERT(hp_end == hp); - - if (cpudata && cpudata != system_cpudata && cpudata != user_cpudata) - erts_free(ERTS_ALC_T_TMP, cpudata); - - return res; -} - -Eterm -erts_get_cpu_topology_term(Process *c_p, Eterm which) -{ - Eterm res; - int type; - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - if (ERTS_IS_ATOM_STR("used", which)) - type = ERTS_GET_USED_CPU_TOPOLOGY; - else if (ERTS_IS_ATOM_STR("detected", which)) - type = ERTS_GET_DETECTED_CPU_TOPOLOGY; - else if (ERTS_IS_ATOM_STR("defined", which)) - type = ERTS_GET_DEFINED_CPU_TOPOLOGY; - else - type = ERTS_GET_CPU_TOPOLOGY_ERROR; - if (type == ERTS_GET_CPU_TOPOLOGY_ERROR) - res = THE_NON_VALUE; - else - res = get_cpu_topology_term(c_p, type); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - return res; -} - -static void -early_cpu_bind_init(void) -{ - user_cpudata = NULL; - user_cpudata_size = 0; - - system_cpudata_size = erts_get_cpu_topology_size(erts_cpuinfo); - system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, - (sizeof(erts_cpu_topology_t) - * system_cpudata_size)); - - cpu_bind_order = ERTS_CPU_BIND_UNDEFINED; - - reader_groups_available_cpu_check = 1; - reader_groups_logical_processors = 0; - reader_groups_map_size = 0; - reader_groups_map = NULL; - - if (!erts_get_cpu_topology(erts_cpuinfo, system_cpudata) - || ERTS_INIT_CPU_TOPOLOGY_OK != verify_topology(system_cpudata, - system_cpudata_size)) { - erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); - system_cpudata = NULL; - system_cpudata_size = 0; - } -} - -static void -late_cpu_bind_init(void) -{ - int ix; - - erts_smp_rwmtx_init(&erts_cpu_bind_rwmtx, "cpu_bind"); - - scheduler2cpu_map = erts_alloc(ERTS_ALC_T_CPUDATA, - (sizeof(ErtsCpuBindData) - * (erts_no_schedulers+1))); - for (ix = 1; ix <= erts_no_schedulers; ix++) { - scheduler2cpu_map[ix].bind_id = -1; - scheduler2cpu_map[ix].bound_id = -1; - } - - if (cpu_bind_order == ERTS_CPU_BIND_UNDEFINED) { - int ncpus = erts_get_cpu_configured(erts_cpuinfo); - if (ncpus < 1 || erts_no_schedulers < ncpus) - cpu_bind_order = ERTS_CPU_BIND_NONE; - else - cpu_bind_order = ((system_cpudata || user_cpudata) - && (erts_bind_to_cpu(erts_cpuinfo, -1) != -ENOTSUP) - ? ERTS_CPU_BIND_DEFAULT_BIND - : ERTS_CPU_BIND_NONE); - } - - make_reader_groups_map(NULL); - - if (cpu_bind_order != ERTS_CPU_BIND_NONE) { - erts_cpu_topology_t *cpudata; - int cpudata_size; - create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); - signal_schedulers_bind_change(cpudata, cpudata_size); - destroy_tmp_cpu_topology_copy(cpudata); - } -} - -int -erts_update_cpu_info(void) -{ - int changed; - erts_smp_rwmtx_rwlock(&erts_cpu_bind_rwmtx); - changed = erts_cpu_info_update(erts_cpuinfo); - if (changed) { - erts_cpu_topology_t *cpudata; - int cpudata_size; - - if (system_cpudata) - erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); - - system_cpudata_size = erts_get_cpu_topology_size(erts_cpuinfo); - if (!system_cpudata_size) - system_cpudata = NULL; - else { - system_cpudata = erts_alloc(ERTS_ALC_T_CPUDATA, - (sizeof(erts_cpu_topology_t) - * system_cpudata_size)); - - if (!erts_get_cpu_topology(erts_cpuinfo, system_cpudata) - || (ERTS_INIT_CPU_TOPOLOGY_OK - != verify_topology(system_cpudata, - system_cpudata_size))) { - erts_free(ERTS_ALC_T_CPUDATA, system_cpudata); - system_cpudata = NULL; - system_cpudata_size = 0; - } - } - - create_tmp_cpu_topology_copy(&cpudata, &cpudata_size); - signal_schedulers_bind_change(cpudata, cpudata_size); - destroy_tmp_cpu_topology_copy(cpudata); - } - erts_smp_rwmtx_rwunlock(&erts_cpu_bind_rwmtx); - return changed; -} - #ifdef ERTS_SMP static void @@ -7069,7 +5091,7 @@ Process *schedule(Process *p, int calls) } if ((rq->flags & ERTS_RUNQ_FLG_CHK_CPU_BIND) || erts_smp_atomic_read(&esdp->chk_cpu_bind)) { - check_cpu_bind(esdp); + erts_sched_check_cpu_bind(esdp); } } @@ -7165,7 +5187,9 @@ Process *schedule(Process *p, int calls) erts_smp_atomic_set(&function_calls, 0); fcalls = 0; + ASSERT(!erts_port_task_have_outstanding_io_tasks()); + #ifdef ERTS_SMP /* erts_sys_schedule_interrupt(0); */ #endif @@ -7497,6 +5521,15 @@ erts_schedule_misc_op(void (*func)(void *), void *arg) ErtsRunQueue *rq = erts_get_runq_current(NULL); ErtsMiscOpList *molp = misc_op_list_alloc(); + if (!rq) { + /* + * This can only happen when the sys msg dispatcher + * thread schedules misc ops (this happens *very* + * seldom; only when trace drivers are unloaded). + */ + rq = ERTS_RUNQ_IX(0); + } + erts_smp_runq_lock(rq); while (rq->misc.evac_runq) { diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h index 4365e409e5..c038e57b65 100644 --- a/erts/emulator/beam/erl_process.h +++ b/erts/emulator/beam/erl_process.h @@ -89,7 +89,6 @@ extern int erts_sched_thread_suggested_stack_size; #define ERTS_SCHED_THREAD_MAX_STACK_SIZE 8192 /* Kilo words */ #ifdef ERTS_SMP -extern Uint erts_max_main_threads; #include "erl_bits.h" #endif @@ -426,6 +425,13 @@ struct ErtsSchedulerData_ { #endif }; +typedef union { + ErtsSchedulerData esd; + char align[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsSchedulerData))]; +} ErtsAlignedSchedulerData; + +extern ErtsAlignedSchedulerData *erts_aligned_scheduler_data; + #ifndef ERTS_SMP extern ErtsSchedulerData *erts_scheduler_data; #endif @@ -1007,27 +1013,12 @@ extern struct erts_system_profile_flags_t erts_system_profile_flags; (p)->flags &= ~F_TIMO; \ } while (0) - -#define ERTS_INIT_SCHED_BIND_TYPE_SUCCESS 0 -#define ERTS_INIT_SCHED_BIND_TYPE_NOT_SUPPORTED 1 -#define ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_CPU_TOPOLOGY 2 -#define ERTS_INIT_SCHED_BIND_TYPE_ERROR_NO_BAD_TYPE 3 - -int erts_init_scheduler_bind_type(char *how); - -#define ERTS_INIT_CPU_TOPOLOGY_OK 0 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID 1 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_RANGE 2 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_HIERARCHY 3 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_ID_TYPE 4 -#define ERTS_INIT_CPU_TOPOLOGY_INVALID_NODES 5 -#define ERTS_INIT_CPU_TOPOLOGY_MISSING_LID 6 -#define ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_LIDS 7 -#define ERTS_INIT_CPU_TOPOLOGY_NOT_UNIQUE_ENTITIES 8 -#define ERTS_INIT_CPU_TOPOLOGY_MISSING 9 - -int erts_init_cpu_topology(char *topology_str); -int erts_update_cpu_info(void); +#define ERTS_RUNQ_IX(IX) \ + (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_run_queues), \ + &erts_aligned_run_queues[(IX)].runq) +#define ERTS_SCHEDULER_IX(IX) \ + (ASSERT_EXPR(0 <= (IX) && (IX) < erts_no_schedulers), \ + &erts_aligned_scheduler_data[(IX)].esd) void erts_pre_init_process(void); void erts_late_init_process(void); @@ -1058,8 +1049,9 @@ Eterm erts_multi_scheduling_blockers(Process *); void erts_start_schedulers(void); void erts_smp_notify_check_children_needed(void); #endif +void erts_sched_notify_check_cpu_bind(void); Uint erts_active_schedulers(void); -void erts_init_process(void); +void erts_init_process(int); Eterm erts_process_status(Process *, ErtsProcLocks, Process *, Eterm); Uint erts_run_queues_len(Uint *); void erts_add_to_runq(Process *); diff --git a/erts/emulator/beam/erl_process_lock.c b/erts/emulator/beam/erl_process_lock.c index a4d12139e9..1bebcdb911 100644 --- a/erts/emulator/beam/erl_process_lock.c +++ b/erts/emulator/beam/erl_process_lock.c @@ -117,10 +117,9 @@ static int aux_thr_proc_lock_spin_count; static void cleanup_tse(void); void -erts_init_proc_lock(void) +erts_init_proc_lock(int cpus) { int i; - int cpus; erts_smp_spinlock_init(&qs_lock, "proc_lck_qs_alloc"); for (i = 0; i < ERTS_NO_OF_PIX_LOCKS; i++) { #ifdef ERTS_ENABLE_LOCK_COUNT @@ -138,7 +137,6 @@ erts_init_proc_lock(void) lc_id.proc_lock_msgq = erts_lc_get_lock_order_id("proc_msgq"); lc_id.proc_lock_status = erts_lc_get_lock_order_id("proc_status"); #endif - cpus = erts_get_cpu_configured(erts_cpuinfo); if (cpus > 1) { proc_lock_spin_count = ERTS_PROC_LOCK_SPIN_COUNT_BASE; proc_lock_spin_count += (ERTS_PROC_LOCK_SPIN_COUNT_SCHED_INC diff --git a/erts/emulator/beam/erl_process_lock.h b/erts/emulator/beam/erl_process_lock.h index 7cfc9893fa..4fe30c7209 100644 --- a/erts/emulator/beam/erl_process_lock.h +++ b/erts/emulator/beam/erl_process_lock.h @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2007-2009. All Rights Reserved. + * Copyright Ericsson AB 2007-2010. All Rights Reserved. * * The contents of this file are subject to the Erlang Public License, * Version 1.1, (the "License"); you may not use this file except in @@ -334,7 +334,7 @@ erts_proc_lock_flags_cmpxchg(erts_proc_lock_t *lck, ErtsProcLocks new, extern erts_pix_lock_t erts_pix_locks[ERTS_NO_OF_PIX_LOCKS]; -void erts_init_proc_lock(void); +void erts_init_proc_lock(int cpus); void erts_proc_lock_prepare_proc_lock_waiter(void); void erts_proc_lock_failed(Process *, erts_pix_lock_t *, diff --git a/erts/emulator/beam/erl_threads.h b/erts/emulator/beam/erl_threads.h index 0b7269262e..a74cf79b8c 100644 --- a/erts/emulator/beam/erl_threads.h +++ b/erts/emulator/beam/erl_threads.h @@ -27,9 +27,6 @@ #define ERTS_SPIN_BODY ETHR_SPIN_BODY -#define ERTS_MAX_READER_GROUPS 8 -extern int erts_reader_groups; - #include "sys.h" #ifdef USE_THREADS diff --git a/erts/emulator/beam/erl_vm.h b/erts/emulator/beam/erl_vm.h index cd63401581..48fa99934e 100644 --- a/erts/emulator/beam/erl_vm.h +++ b/erts/emulator/beam/erl_vm.h @@ -199,6 +199,7 @@ extern int BIN_VH_MIN_SIZE; /* minimum virtual (bin) heap */ extern int erts_atom_table_size;/* Atom table size */ #define ORIG_CREATION 0 +#define INTERNAL_CREATION 255 /* macros for extracting bytes from uint16's */ diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index d7c8aa84e9..328aa2be6a 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -49,10 +49,8 @@ #define in_area(ptr,start,nbytes) ((Uint)((char*)(ptr) - (char*)(start)) < (nbytes)) #define MAX_STRING_LEN 0xffff -#define dec_set_creation(nodename,creat) \ - (((nodename) == erts_this_node->sysname && (creat) == ORIG_CREATION) \ - ? erts_this_node->creation \ - : (creat)) + +#define is_valid_creation(Cre) ((unsigned)(Cre) < MAX_CREATION || (Cre) == INTERNAL_CREATION) #undef ERTS_DEBUG_USE_DIST_SEP #ifdef DEBUG @@ -83,14 +81,14 @@ * */ -static byte* enc_term(ErtsAtomCacheMap *, Eterm, byte*, Uint32); +static byte* enc_term(ErtsAtomCacheMap *, Eterm, byte*, Uint32, struct erl_off_heap_header** off_heap); static Uint is_external_string(Eterm obj, int* p_is_string); static byte* enc_atom(ErtsAtomCacheMap *, Eterm, byte*, Uint32); static byte* enc_pid(ErtsAtomCacheMap *, Eterm, byte*, Uint32); static byte* dec_term(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*); static byte* dec_atom(ErtsDistExternal *, byte*, Eterm*); static byte* dec_pid(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*); -static Sint decoded_size(byte *ep, byte* endp, int only_heap_bins); +static Sint decoded_size(byte *ep, byte* endp, int only_heap_bins, int internal_tags); static Uint encode_size_struct2(ErtsAtomCacheMap *, Eterm, unsigned); @@ -461,6 +459,12 @@ Uint erts_encode_ext_size(Eterm term) + 1 /* VERSION_MAGIC */; } +Uint erts_encode_ext_size_ets(Eterm term) +{ + return encode_size_struct2(NULL, term, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS); +} + + void erts_encode_dist_ext(Eterm term, byte **ext, Uint32 flags, ErtsAtomCacheMap *acmp) { byte *ep = *ext; @@ -468,7 +472,7 @@ void erts_encode_dist_ext(Eterm term, byte **ext, Uint32 flags, ErtsAtomCacheMap if (!(flags & DFLAG_DIST_HDR_ATOM_CACHE)) #endif *ep++ = VERSION_MAGIC; - ep = enc_term(acmp, term, ep, flags); + ep = enc_term(acmp, term, ep, flags, NULL); if (!ep) erl_exit(ERTS_ABORT_EXIT, "%s:%d:erts_encode_dist_ext(): Internal data structure error\n", @@ -480,7 +484,7 @@ void erts_encode_ext(Eterm term, byte **ext) { byte *ep = *ext; *ep++ = VERSION_MAGIC; - ep = enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS); + ep = enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS, NULL); if (!ep) erl_exit(ERTS_ABORT_EXIT, "%s:%d:erts_encode_ext(): Internal data structure error\n", @@ -488,6 +492,12 @@ void erts_encode_ext(Eterm term, byte **ext) *ext = ep; } +byte* erts_encode_ext_ets(Eterm term, byte *ep, struct erl_off_heap_header** off_heap) +{ + return enc_term(NULL, term, ep, TERM_TO_BINARY_DFLAGS|DFLAGS_INTERNAL_TAGS, + off_heap); +} + ErtsDistExternal * erts_make_dist_ext_copy(ErtsDistExternal *edep, Uint xsize) { @@ -813,7 +823,7 @@ erts_decode_dist_ext_size(ErtsDistExternal *edep, int no_refc_bins) goto fail; ep = edep->extp+1; } - res = decoded_size(ep, edep->ext_endp, no_refc_bins); + res = decoded_size(ep, edep->ext_endp, no_refc_bins, 0); if (res >= 0) return res; fail: @@ -825,9 +835,17 @@ Sint erts_decode_ext_size(byte *ext, Uint size, int no_refc_bins) { if (size == 0 || *ext != VERSION_MAGIC) return -1; - return decoded_size(ext+1, ext+size, no_refc_bins); + return decoded_size(ext+1, ext+size, no_refc_bins, 0); } +Sint erts_decode_ext_size_ets(byte *ext, Uint size) +{ + Sint sz = decoded_size(ext, ext+size, 0, 1); + ASSERT(sz >= 0); + return sz; +} + + /* ** hpp is set to either a &p->htop or ** a pointer to a memory pointer (form message buffers) @@ -887,7 +905,13 @@ Eterm erts_decode_ext(Eterm **hpp, ErlOffHeap *off_heap, byte **ext) return obj; } - +Eterm erts_decode_ext_ets(Eterm **hpp, ErlOffHeap *off_heap, byte *ext) +{ + Eterm obj; + ext = dec_term(NULL, hpp, ext, off_heap, &obj); + ASSERT(ext); + return obj; +} /**********************************************************************/ @@ -964,6 +988,7 @@ term_to_binary_1(Process* p, Eterm Term) return erts_term_to_binary(p, Term, 0, TERM_TO_BINARY_DFLAGS); } + Eterm term_to_binary_2(Process* p, Eterm Term, Eterm Flags) { @@ -1075,7 +1100,7 @@ binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size) goto error; size = (Sint) dest_len; } - res = decoded_size(state->extp, state->extp + size, 0); + res = decoded_size(state->extp, state->extp + size, 0, 0); if (res < 0) goto error; return res; @@ -1183,7 +1208,8 @@ BIF_RETTYPE binary_to_term_2(BIF_ALIST_2) opt = CAR(list_val(opts)); if (opt == am_safe) { fakedep.flags |= ERTS_DIST_EXT_BTT_SAFE; - } else { + } + else { goto error; } opts = CDR(list_val(opts)); @@ -1255,7 +1281,7 @@ erts_term_to_binary(Process* p, Eterm Term, int level, Uint flags) bytes = erts_alloc(ERTS_ALC_T_TMP, size); } - if ((endp = enc_term(NULL, Term, bytes, flags)) + if ((endp = enc_term(NULL, Term, bytes, flags, NULL)) == NULL) { erl_exit(1, "%s, line %d: bad term: %x\n", __FILE__, __LINE__, Term); @@ -1300,7 +1326,7 @@ erts_term_to_binary(Process* p, Eterm Term, int level, Uint flags) bin = new_binary(p, (byte *)NULL, size); bytes = binary_bytes(bin); bytes[0] = VERSION_MAGIC; - if ((endp = enc_term(NULL, Term, bytes+1, flags)) + if ((endp = enc_term(NULL, Term, bytes+1, flags, NULL)) == NULL) { erl_exit(1, "%s, line %d: bad term: %x\n", __FILE__, __LINE__, Term); @@ -1330,6 +1356,21 @@ enc_atom(ErtsAtomCacheMap *acmp, Eterm atom, byte *ep, Uint32 dflags) ASSERT(is_atom(atom)); + if (dflags & DFLAGS_INTERNAL_TAGS) { + Uint aval = atom_val(atom); + ASSERT(aval < (1<<24)); + if (aval >= (1 << 16)) { + *ep++ = ATOM_INTERNAL_REF3; + put_int24(aval, ep); + ep += 3; + } + else { + *ep++ = ATOM_INTERNAL_REF2; + put_int16(aval, ep); + ep += 2; + } + return ep; + } /* * term_to_binary/1,2 and the initial distribution message * don't use the cache. @@ -1379,7 +1420,8 @@ enc_pid(ErtsAtomCacheMap *acmp, Eterm pid, byte* ep, Uint32 dflags) ep += 4; put_int32(os, ep); ep += 4; - *ep++ = pid_creation(pid); + *ep++ = (is_internal_pid(pid) && (dflags & DFLAGS_INTERNAL_TAGS)) ? + INTERNAL_CREATION : pid_creation(pid); return ep; } @@ -1418,6 +1460,23 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) } ep += len; break; + case ATOM_INTERNAL_REF2: + n = get_int16(ep); + ep += 2; + if (n >= atom_table_size()) { + goto error; + } + *objp = make_atom(n); + break; + case ATOM_INTERNAL_REF3: + n = get_int24(ep); + ep += 3; + if (n >= atom_table_size()) { + goto error; + } + *objp = make_atom(n); + break; + default: error: *objp = NIL; /* Don't leave a hole in the heap */ @@ -1426,6 +1485,19 @@ dec_atom(ErtsDistExternal *edep, byte* ep, Eterm* objp) return ep; } +static ERTS_INLINE ErlNode* dec_get_node(Eterm sysname, Uint creation) +{ + switch (creation) { + case INTERNAL_CREATION: + return erts_this_node; + case ORIG_CREATION: + if (sysname == erts_this_node->sysname) { + creation = erts_this_node->creation; + } + } + return erts_find_or_insert_node(sysname,creation); +} + static byte* dec_pid(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp) { @@ -1449,18 +1521,20 @@ dec_pid(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Ete ep += 4; if (ser > ERTS_MAX_PID_SERIAL) return NULL; - if ((cre = get_int8(ep)) >= MAX_CREATION) - return NULL; + cre = get_int8(ep); ep += 1; + if (!is_valid_creation(cre)) { + return NULL; + } + data = make_pid_data(ser, num); + /* * We are careful to create the node entry only after all * validity tests are done. */ - cre = dec_set_creation(sysname,cre); - node = erts_find_or_insert_node(sysname,cre); + node = dec_get_node(sysname, cre); - data = make_pid_data(ser, num); if(node == erts_this_node) { *objp = make_internal_pid(data); } else { @@ -1485,7 +1559,8 @@ dec_pid(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Ete #define ENC_LAST_ARRAY_ELEMENT ((Eterm) 3) static byte* -enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags) +enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags, + struct erl_off_heap_header** off_heap) { DECLARE_WSTACK(s); Uint n; @@ -1637,12 +1712,14 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags) Uint32 *ref_num; ASSERT(dflags & DFLAG_EXTENDED_REFERENCES); + *ep++ = NEW_REFERENCE_EXT; i = ref_no_of_numbers(obj); put_int16(i, ep); ep += 2; ep = enc_atom(acmp,ref_node_name(obj),ep,dflags); - *ep++ = ref_creation(obj); + *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_ref(obj)) ? + INTERNAL_CREATION : ref_creation(obj); ref_num = ref_numbers(obj); for (j = 0; j < i; j++) { put_int32(ref_num[j], ep); @@ -1658,7 +1735,8 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags) j = port_number(obj); put_int32(j, ep); ep += 4; - *ep++ = port_creation(obj); + *ep++ = ((dflags & DFLAGS_INTERNAL_TAGS) && is_internal_port(obj)) ? + INTERNAL_CREATION : port_creation(obj); break; case LIST_DEF: @@ -1738,6 +1816,41 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags) byte* bytes; ERTS_GET_BINARY_BYTES(obj, bytes, bitoffs, bitsize); + if (dflags & DFLAGS_INTERNAL_TAGS) { + ProcBin* pb = (ProcBin*) binary_val(obj); + Uint bytesize = pb->size; + if (pb->thing_word == HEADER_SUB_BIN) { + ErlSubBin* sub = (ErlSubBin*)pb; + pb = (ProcBin*) binary_val(sub->orig); + ASSERT(bytesize == sub->size); + bytesize += (bitoffs + bitsize + 7) / 8; + } + if (pb->thing_word == HEADER_PROC_BIN + && heap_bin_size(bytesize) > PROC_BIN_SIZE) { + ProcBin tmp; + if (bitoffs || bitsize) { + *ep++ = BIT_BINARY_INTERNAL_REF; + *ep++ = bitoffs; + *ep++ = bitsize; + } + else { + *ep++ = BINARY_INTERNAL_REF; + } + if (pb->flags) { + erts_emasculate_writable_binary(pb); + } + erts_refc_inc(&pb->val->refc, 2); + + sys_memcpy(&tmp, pb, sizeof(ProcBin)); + tmp.next = *off_heap; + tmp.bytes = bytes; + tmp.size = bytesize; + sys_memcpy(ep, &tmp, sizeof(ProcBin)); + *off_heap = (struct erl_off_heap_header*) ep; + ep += sizeof(ProcBin); + break; + } + } if (bitsize == 0) { /* Plain old byte-sized binary. */ *ep++ = BINARY_EXT; @@ -1773,8 +1886,8 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags) *ep++ = SMALL_INTEGER_EXT; *ep++ = bitsize; } - break; } + break; case EXPORT_DEF: { Export* exp = *((Export **) (export_val(obj) + 1)); @@ -1782,7 +1895,7 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags) *ep++ = EXPORT_EXT; ep = enc_atom(acmp, exp->code[0], ep, dflags); ep = enc_atom(acmp, exp->code[1], ep, dflags); - ep = enc_term(acmp, make_small(exp->code[2]), ep, dflags); + ep = enc_term(acmp, make_small(exp->code[2]), ep, dflags, off_heap); } else { /* Tag, arity */ *ep++ = SMALL_TUPLE_EXT; @@ -1818,8 +1931,8 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags) put_int32(funp->num_free, ep); ep += 4; ep = enc_atom(acmp, funp->fe->module, ep, dflags); - ep = enc_term(acmp, make_small(funp->fe->old_index), ep, dflags); - ep = enc_term(acmp, make_small(funp->fe->old_uniq), ep, dflags); + ep = enc_term(acmp, make_small(funp->fe->old_index), ep, dflags, off_heap); + ep = enc_term(acmp, make_small(funp->fe->old_uniq), ep, dflags, off_heap); ep = enc_pid(acmp, funp->creator, ep, dflags); fun_env: @@ -1872,7 +1985,8 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags) return ep; } -static Uint +static +Uint is_external_string(Eterm list, int* p_is_string) { Uint len = 0; @@ -2162,13 +2276,13 @@ dec_term_atom_common: goto error; } ep += 4; - if ((cre = get_int8(ep)) >= MAX_CREATION) { + cre = get_int8(ep); + ep++; + if (!is_valid_creation(cre)) { goto error; } - ep++; - cre = dec_set_creation(sysname,cre); - node = erts_find_or_insert_node(sysname, cre); + node = dec_get_node(sysname, cre); if(node == erts_this_node) { *objp = make_internal_port(num); } @@ -2205,9 +2319,11 @@ dec_term_atom_common: goto error; ep += 4; - if ((cre = get_int8(ep)) >= MAX_CREATION) - goto error; + cre = get_int8(ep); ep += 1; + if (!is_valid_creation(cre)) { + goto error; + } goto ref_ext_common; case NEW_REFERENCE_EXT: @@ -2220,10 +2336,11 @@ dec_term_atom_common: if ((ep = dec_atom(edep, ep, &sysname)) == NULL) goto error; - if ((cre = get_int8(ep)) >= MAX_CREATION) - goto error; + cre = get_int8(ep); ep += 1; - + if (!is_valid_creation(cre)) { + goto error; + } r0 = get_int32(ep); ep += 4; if (r0 >= MAX_REFERENCE) @@ -2231,8 +2348,7 @@ dec_term_atom_common: ref_ext_common: - cre = dec_set_creation(sysname, cre); - node = erts_find_or_insert_node(sysname, cre); + node = dec_get_node(sysname, cre); if(node == erts_this_node) { RefThing *rtp = (RefThing *) hp; ref_num = (Uint32 *) (hp + REF_THING_HEAD_SIZE); @@ -2560,6 +2676,66 @@ dec_term_atom_common: } break; } + case ATOM_INTERNAL_REF2: + n = get_int16(ep); + ep += 2; + if (n >= atom_table_size()) { + goto error; + } + *objp = make_atom(n); + break; + case ATOM_INTERNAL_REF3: + n = get_int24(ep); + ep += 3; + if (n >= atom_table_size()) { + goto error; + } + *objp = make_atom(n); + break; + + case BINARY_INTERNAL_REF: + { + ProcBin* pb = (ProcBin*) hp; + sys_memcpy(pb, ep, sizeof(ProcBin)); + ep += sizeof(ProcBin); + + erts_refc_inc(&pb->val->refc, 1); + hp += PROC_BIN_SIZE; + pb->next = off_heap->first; + off_heap->first = (struct erl_off_heap_header*)pb; + pb->flags = 0; + *objp = make_binary(pb); + break; + } + case BIT_BINARY_INTERNAL_REF: + { + Sint bitoffs = *ep++; + Sint bitsize = *ep++; + ProcBin* pb = (ProcBin*) hp; + ErlSubBin* sub; + sys_memcpy(pb, ep, sizeof(ProcBin)); + ep += sizeof(ProcBin); + + erts_refc_inc(&pb->val->refc, 1); + hp += PROC_BIN_SIZE; + pb->next = off_heap->first; + off_heap->first = (struct erl_off_heap_header*)pb; + pb->flags = 0; + + sub = (ErlSubBin*)hp; + sub->thing_word = HEADER_SUB_BIN; + sub->size = pb->size - (bitoffs + bitsize + 7)/8; + sub->offs = 0; + sub->bitoffs = bitoffs; + sub->bitsize = bitsize; + sub->is_writable = 0; + sub->orig = make_binary(pb); + + hp += ERL_SUB_BIN_SIZE; + *objp = make_binary(sub); + break; + } + default: error: /* UNDO: @@ -2642,20 +2818,29 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) case NIL_DEF: result++; break; - case ATOM_DEF: { - int alen = atom_tab(atom_val(obj))->len; - if ((MAX_ATOM_LENGTH <= 255 || alen <= 255) - && (dflags & DFLAG_SMALL_ATOM_TAGS)) { - /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */ - result += 1 + 1 + alen; + case ATOM_DEF: + if (dflags & DFLAGS_INTERNAL_TAGS) { + if (atom_val(obj) >= (1<<16)) { + result += 1 + 3; + } + else { + result += 1 + 2; + } } else { - /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */ - result += 1 + 2 + alen; + int alen = atom_tab(atom_val(obj))->len; + if ((MAX_ATOM_LENGTH <= 255 || alen <= 255) + && (dflags & DFLAG_SMALL_ATOM_TAGS)) { + /* Make sure a SMALL_ATOM_EXT fits: SMALL_ATOM_EXT l t1 t2... */ + result += 1 + 1 + alen; + } + else { + /* Make sure an ATOM_EXT fits: ATOM_EXT l1 l0 t1 t2... */ + result += 1 + 2 + alen; + } + insert_acache_map(acmp, obj); } - insert_acache_map(acmp, obj); break; - } case SMALL_DEF: { Sint val = signed_val(obj); @@ -2734,8 +2919,25 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } break; case BINARY_DEF: + if (dflags & DFLAGS_INTERNAL_TAGS) { + ProcBin* pb = (ProcBin*) binary_val(obj); + Uint sub_extra = 0; + Uint tot_bytes = pb->size; + if (pb->thing_word == HEADER_SUB_BIN) { + ErlSubBin* sub = (ErlSubBin*) pb; + pb = (ProcBin*) binary_val(sub->orig); + sub_extra = 2; /* bitoffs and bitsize */ + tot_bytes += (sub->bitoffs + sub->bitsize+ 7) / 8; + } + if (pb->thing_word == HEADER_PROC_BIN + && heap_bin_size(tot_bytes) > PROC_BIN_SIZE) { + + result += 1 + sub_extra + sizeof(ProcBin); + break; + } + } result += 1 + 4 + binary_size(obj) + - 5; /* For unaligned binary */ + 5; /* For unaligned binary */ break; case FUN_DEF: { @@ -2807,7 +3009,7 @@ encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags) } static Sint -decoded_size(byte *ep, byte* endp, int no_refc_bins) +decoded_size(byte *ep, byte* endp, int no_refc_bins, int internal_tags) { int heap_size = 0; int terms; @@ -3017,6 +3219,29 @@ decoded_size(byte *ep, byte* endp, int no_refc_bins) heap_size += ERL_FUN_SIZE + num_free; break; } + case ATOM_INTERNAL_REF2: + SKIP(2+atom_extra_skip); + atom_extra_skip = 0; + break; + case ATOM_INTERNAL_REF3: + SKIP(3+atom_extra_skip); + atom_extra_skip = 0; + break; + + case BINARY_INTERNAL_REF: + if (!internal_tags) { + return -1; + } + SKIP(sizeof(ProcBin)); + heap_size += PROC_BIN_SIZE; + break; + case BIT_BINARY_INTERNAL_REF: + if (!internal_tags) { + return -1; + } + SKIP(2+sizeof(ProcBin)); + heap_size += PROC_BIN_SIZE + ERL_SUB_BIN_SIZE; + break; default: return -1; } diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h index cee48bbeb0..d8287b96a4 100644 --- a/erts/emulator/beam/external.h +++ b/erts/emulator/beam/external.h @@ -54,6 +54,10 @@ #define DIST_HEADER 'D' #define ATOM_CACHE_REF 'R' +#define ATOM_INTERNAL_REF2 'I' +#define ATOM_INTERNAL_REF3 'K' +#define BINARY_INTERNAL_REF 'J' +#define BIT_BINARY_INTERNAL_REF 'L' #define COMPRESSED 'P' #if 0 @@ -156,7 +160,9 @@ Uint erts_encode_dist_ext_size(Eterm, Uint32, ErtsAtomCacheMap *); void erts_encode_dist_ext(Eterm, byte **, Uint32, ErtsAtomCacheMap *); Uint erts_encode_ext_size(Eterm); +Uint erts_encode_ext_size_ets(Eterm); void erts_encode_ext(Eterm, byte **); +byte* erts_encode_ext_ets(Eterm, byte *, struct erl_off_heap_header** ext_off_heap); #ifdef ERTS_WANT_EXTERNAL_TAGS ERTS_GLB_INLINE void erts_peek_dist_header(ErtsDistHeaderPeek *, byte *, Uint); @@ -172,7 +178,9 @@ Sint erts_decode_dist_ext_size(ErtsDistExternal *, int); Eterm erts_decode_dist_ext(Eterm **, ErlOffHeap *, ErtsDistExternal *); Sint erts_decode_ext_size(byte*, Uint, int); +Sint erts_decode_ext_size_ets(byte*, Uint); Eterm erts_decode_ext(Eterm **, ErlOffHeap *, byte**); +Eterm erts_decode_ext_ets(Eterm **, ErlOffHeap *, byte*); Eterm erts_term_to_binary(Process* p, Eterm Term, int level, Uint flags); diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index ecd3c8f68a..89c6625550 100644 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -522,6 +522,7 @@ union erl_off_heap_ptr { struct erl_fun_thing* fun; struct external_thing_* ext; Eterm* ep; + void* voidp; }; /* arrays that get malloced at startup */ @@ -1728,11 +1729,6 @@ Uint erts_current_reductions(Process* current, Process *p); int erts_print_system_version(int to, void *arg, Process *c_p); -/* - * Interface to erl_init - */ -void erl_init(void); - #define seq_trace_output(token, msg, type, receiver, process) \ seq_trace_output_generic((token), (msg), (type), (receiver), (process), NIL) #define seq_trace_output_exit(token, msg, type, receiver, exitfrom) \ diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c index 79022d5dd7..9ed92bbe03 100644 --- a/erts/emulator/beam/io.c +++ b/erts/emulator/beam/io.c @@ -2802,17 +2802,25 @@ driver_deliver_term(ErlDrvPort port, break; case ERL_DRV_INT: /* signed int argument */ ERTS_DDT_CHK_ENOUGH_ARGS(1); +#if HALFWORD_HEAP + erts_bld_sint64(NULL, &need, (Sint64)ptr[0]); +#else /* check for bignum */ if (!IS_SSMALL((Sint)ptr[0])) need += BIG_UINT_HEAP_SIZE; /* use small_to_big */ +#endif ptr++; depth++; break; case ERL_DRV_UINT: /* unsigned int argument */ ERTS_DDT_CHK_ENOUGH_ARGS(1); +#if HALFWORD_HEAP + erts_bld_uint64(NULL, &need, (Uint64)ptr[0]); +#else /* check for bignum */ if (!IS_USMALL(0, (Uint)ptr[0])) need += BIG_UINT_HEAP_SIZE; /* use small_to_big */ +#endif ptr++; depth++; break; @@ -2979,22 +2987,30 @@ driver_deliver_term(ErlDrvPort port, break; case ERL_DRV_INT: /* signed int argument */ +#if HALFWORD_HEAP + mess = erts_bld_sint64(&hp, NULL, (Sint64)ptr[0]); +#else if (IS_SSMALL((Sint)ptr[0])) mess = make_small((Sint)ptr[0]); else { mess = small_to_big((Sint)ptr[0], hp); hp += BIG_UINT_HEAP_SIZE; } +#endif ptr++; break; case ERL_DRV_UINT: /* unsigned int argument */ +#if HALFWORD_HEAP + mess = erts_bld_uint64(&hp, NULL, (Uint64)ptr[0]); +#else if (IS_USMALL(0, (Uint)ptr[0])) mess = make_small((Uint)ptr[0]); else { mess = uint_to_big((Uint)ptr[0], hp); hp += BIG_UINT_HEAP_SIZE; } +#endif ptr++; break; diff --git a/erts/emulator/beam/packet_parser.c b/erts/emulator/beam/packet_parser.c index 5bcd567b5f..a66d60aa22 100644 --- a/erts/emulator/beam/packet_parser.c +++ b/erts/emulator/beam/packet_parser.c @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 2008-2009. All Rights Reserved. + * Copyright Ericsson AB 2008-2010. All Rights Reserved. * * The contents of this file are subject to the Erlang Public License, * Version 1.1, (the "License"); you may not use this file except in @@ -47,11 +47,6 @@ (((unsigned char*) (s))[1] << 8) | \ (((unsigned char*) (s))[0])) -#define put_int24(s, x) ((((unsigned char*)(s))[0] = ((x) >> 16) & 0xff), \ - (((unsigned char*)(s))[1] = ((x) >> 8) & 0xff), \ - (((unsigned char*)(s))[2] = (x) & 0xff)) - - #if !defined(__WIN32__) && !defined(HAVE_STRNCASECMP) #define STRNCASECMP my_strncasecmp @@ -833,7 +828,7 @@ int packet_parse_ssl(const char* buf, int len, char prefix[4]; /* <<1:8,Length:24,Data/binary>> */ prefix[0] = 1; - put_int24(&prefix[1],len-3); + put_int24(len-3,&prefix[1]); return pcb->ssl_tls(arg, 22, major, minor, buf+3, len-3, prefix, sizeof(prefix)); } else { diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h index 0031568af6..0d15272aa8 100644 --- a/erts/emulator/beam/sys.h +++ b/erts/emulator/beam/sys.h @@ -466,8 +466,6 @@ static const int zero_value = 0, one_value = 1; # endif /* !__WIN32__ */ #endif /* WANT_NONBLOCKING */ -extern erts_cpu_info_t *erts_cpuinfo; /* erl_init.c */ - __decl_noreturn void __noreturn erl_exit(int n, char*, ...); /* Some special erl_exit() codes: */ @@ -1168,6 +1166,15 @@ void* sys_calloc2(Uint, Uint); ((char*)(s))[3] = (char)(i) & 0xff;} \ while (0) +#define get_int24(s) ((((unsigned char*) (s))[0] << 16) | \ + (((unsigned char*) (s))[1] << 8) | \ + (((unsigned char*) (s))[2])) + +#define put_int24(i, s) do {((char*)(s))[0] = (char)((i) >> 16) & 0xff; \ + ((char*)(s))[1] = (char)((i) >> 8) & 0xff; \ + ((char*)(s))[2] = (char)(i) & 0xff;} \ + while (0) + #define get_int16(s) ((((unsigned char*) (s))[0] << 8) | \ (((unsigned char*) (s))[1])) @@ -1181,6 +1188,7 @@ void* sys_calloc2(Uint, Uint); #define put_int8(i, s) do {((unsigned char*)(s))[0] = (i) & 0xff;} while (0) + /* * Use DEBUGF as you would use printf, but use double parentheses: * diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c index 3de48194fb..18f7cdd15a 100644 --- a/erts/emulator/drivers/common/inet_drv.c +++ b/erts/emulator/drivers/common/inet_drv.c @@ -54,6 +54,9 @@ #ifdef HAVE_IFADDRS_H #include <ifaddrs.h> #endif +#ifdef HAVE_NETPACKET_PACKET_H +#include <netpacket/packet.h> +#endif /* All platforms fail on malloc errors. */ #define FATAL_MALLOC @@ -85,6 +88,7 @@ #include <winsock2.h> #endif #include <windows.h> +#include <iphlpapi.h> #include <Ws2tcpip.h> /* NEED VC 6.0 !!! */ @@ -467,6 +471,7 @@ static int my_strncasecmp(const char *s1, const char *s2, size_t n) #define INET_REQ_IFGET 22 #define INET_REQ_IFSET 23 #define INET_REQ_SUBSCRIBE 24 +#define INET_REQ_GETIFADDRS 25 /* TCP requests */ #define TCP_REQ_ACCEPT 40 #define TCP_REQ_LISTEN 41 @@ -632,15 +637,12 @@ static int my_strncasecmp(const char *s1, const char *s2, size_t n) #define IS_BUSY(d) \ (((d)->state & INET_F_BUSY) == INET_F_BUSY) +#define INET_MAX_OPT_BUFFER (64*1024) + #define INET_DEF_BUFFER 1460 /* default buffer size */ #define INET_MIN_BUFFER 1 /* internal min buffer */ -#define INET_MAX_BUFFER (1024*64) /* internal max buffer */ -/* Note: INET_HIGH_WATERMARK MUST be less than 2*INET_MAX_BUFFER */ #define INET_HIGH_WATERMARK (1024*8) /* 8k pending high => busy */ -/* Note: INET_LOW_WATERMARK MUST be less than INET_MAX_BUFFER and -** less than INET_HIGH_WATERMARK -*/ #define INET_LOW_WATERMARK (1024*4) /* 4k pending => allow more */ #define INET_INFINITY 0xffffffff /* infinity value */ @@ -1251,139 +1253,136 @@ static int load_ip_and_port LOAD_ATOM((spec), (i), (flag) ? am_true : am_false); #endif /* HAVE_SCTP */ +/* Assume a cache line size of 64 bytes */ +#define INET_DRV_CACHE_LINE_SIZE ((ErlDrvUInt) 64) +#define INET_DRV_CACHE_LINE_MASK (INET_DRV_CACHE_LINE_SIZE - 1) + /* ** Binary Buffer Managment ** We keep a stack of usable buffers */ -#define BUFFER_STACK_SIZE 16 +#define BUFFER_STACK_SIZE 14 +#define BUFFER_STACK_MAX_MEM_SIZE (1024*1024) -static erts_smp_spinlock_t inet_buffer_stack_lock; -static ErlDrvBinary* buffer_stack[BUFFER_STACK_SIZE]; -static int buffer_stack_pos = 0; - - -/* - * XXX - * The erts_smp_spin_* functions should not be used by drivers (but this - * driver is special). Replace when driver locking api has been implemented. - * /rickard - */ -#define BUFSTK_LOCK erts_smp_spin_lock(&inet_buffer_stack_lock); -#define BUFSTK_UNLOCK erts_smp_spin_unlock(&inet_buffer_stack_lock); +ErlDrvTSDKey buffer_stack_key; -#ifdef DEBUG -static int tot_buf_allocated = 0; /* memory in use for i_buf */ -static int tot_buf_stacked = 0; /* memory on stack */ -static int max_buf_allocated = 0; /* max allocated */ - -#define COUNT_BUF_ALLOC(sz) do { \ - BUFSTK_LOCK; \ - tot_buf_allocated += (sz); \ - if (tot_buf_allocated > max_buf_allocated) \ - max_buf_allocated = tot_buf_allocated; \ - BUFSTK_UNLOCK; \ -} while(0) - -#define COUNT_BUF_FREE(sz) do { \ - BUFSTK_LOCK; \ - tot_buf_allocated -= (sz); \ - BUFSTK_UNLOCK; \ - } while(0) - -#define COUNT_BUF_STACK(sz) do { \ - BUFSTK_LOCK; \ - tot_buf_stacked += (sz); \ - BUFSTK_UNLOCK; \ - } while(0) +typedef struct { + int mem_size; + int pos; + ErlDrvBinary* stk[BUFFER_STACK_SIZE]; +} InetDrvBufStkBase; -#else +typedef struct { + InetDrvBufStkBase buf; + char align[(((sizeof(InetDrvBufStkBase) - 1) / INET_DRV_CACHE_LINE_SIZE) + 1) + * INET_DRV_CACHE_LINE_SIZE]; +} InetDrvBufStk; + +static InetDrvBufStk *get_bufstk(void) +{ + InetDrvBufStk *bs = erl_drv_tsd_get(buffer_stack_key); + if (bs) + return bs; + bs = driver_alloc(sizeof(InetDrvBufStk) + + INET_DRV_CACHE_LINE_SIZE - 1); + if (!bs) + return NULL; + if ((((ErlDrvUInt) bs) & INET_DRV_CACHE_LINE_MASK) != 0) + bs = ((InetDrvBufStk *) + ((((ErlDrvUInt) bs) & ~INET_DRV_CACHE_LINE_MASK) + + INET_DRV_CACHE_LINE_SIZE)); + erl_drv_tsd_set(buffer_stack_key, bs); + bs->buf.pos = 0; + bs->buf.mem_size = 0; -#define COUNT_BUF_ALLOC(sz) -#define COUNT_BUF_FREE(sz) -#define COUNT_BUF_STACK(sz) + ASSERT(bs == erl_drv_tsd_get(buffer_stack_key)); -#endif + return bs; +} static ErlDrvBinary* alloc_buffer(long minsz) { - ErlDrvBinary* buf = NULL; + InetDrvBufStk *bs = get_bufstk(); - BUFSTK_LOCK; + DEBUGF(("alloc_buffer: %ld\r\n", minsz)); - DEBUGF(("alloc_buffer: sz = %ld, tot = %d, max = %d\r\n", - minsz, tot_buf_allocated, max_buf_allocated)); + if (bs && bs->buf.pos > 0) { + long size; + ErlDrvBinary* buf = bs->buf.stk[--bs->buf.pos]; + size = buf->orig_size; + bs->buf.mem_size -= size; + ASSERT(0 <= bs->buf.mem_size + && bs->buf.mem_size <= BUFFER_STACK_MAX_MEM_SIZE); + if (size >= minsz) + return buf; - if (buffer_stack_pos > 0) { - int origsz; + driver_free_binary(buf); + } - buf = buffer_stack[--buffer_stack_pos]; - origsz = buf->orig_size; - BUFSTK_UNLOCK; - COUNT_BUF_STACK(-origsz); - if (origsz < minsz) { - if ((buf = driver_realloc_binary(buf, minsz)) == NULL) - return NULL; - COUNT_BUF_ALLOC(buf->orig_size - origsz); + ASSERT(!bs || bs->buf.pos != 0 || bs->buf.mem_size == 0); + + return driver_alloc_binary(minsz); +} + +/*#define CHECK_DOUBLE_RELEASE 1*/ +#ifdef CHECK_DOUBLE_RELEASE +static void +check_double_release(InetDrvBufStk *bs, ErlDrvBinary* buf) +{ +#ifdef __GNUC__ +#warning CHECK_DOUBLE_RELEASE is enabled, this is a custom build emulator +#endif + int i; + for (i = 0; i < bs->buf.pos; ++i) { + if (bs->buf.stk[i] == buf) { + erl_exit(ERTS_ABORT_EXIT, + "Multiple buffer release in inet_drv, this " + "is a bug, save the core and send it to " + "[email protected]!"); } } - else { - BUFSTK_UNLOCK; - if ((buf = driver_alloc_binary(minsz)) == NULL) - return NULL; - COUNT_BUF_ALLOC(buf->orig_size); - } - return buf; } +#endif -/* -** Max buffer memory "cached" BUFFER_STACK_SIZE * INET_MAX_BUFFER -** (16 * 64k ~ 1M) -*/ -/*#define CHECK_DOUBLE_RELEASE 1*/ static void release_buffer(ErlDrvBinary* buf) { + InetDrvBufStk *bs; + long size; + DEBUGF(("release_buffer: %ld\r\n", (buf==NULL) ? 0 : buf->orig_size)); - if (buf == NULL) + + if (!buf) return; - BUFSTK_LOCK; - if ((buf->orig_size > INET_MAX_BUFFER) || - (buffer_stack_pos >= BUFFER_STACK_SIZE)) { - BUFSTK_UNLOCK; - COUNT_BUF_FREE(buf->orig_size); + + size = buf->orig_size; + + if (size > BUFFER_STACK_MAX_MEM_SIZE) + goto free_binary; + + bs = get_bufstk(); + if (!bs + || (bs->buf.mem_size + size > BUFFER_STACK_MAX_MEM_SIZE) + || (bs->buf.pos >= BUFFER_STACK_SIZE)) { + free_binary: driver_free_binary(buf); } else { #ifdef CHECK_DOUBLE_RELEASE -#ifdef __GNUC__ -#warning CHECK_DOUBLE_RELEASE is enabled, this is a custom build emulator + check_double_release(bs, buf); #endif - int i; - for (i = 0; i < buffer_stack_pos; ++i) { - if (buffer_stack[i] == buf) { - erl_exit(1,"Multiple buffer release in inet_drv, this is a " - "bug, save the core and send it to " - "[email protected]!"); - } - } -#endif - buffer_stack[buffer_stack_pos++] = buf; - BUFSTK_UNLOCK; - COUNT_BUF_STACK(buf->orig_size); + ASSERT(bs->buf.pos != 0 || bs->buf.mem_size == 0); + + bs->buf.mem_size += size; + bs->buf.stk[bs->buf.pos++] = buf; + + ASSERT(0 <= bs->buf.mem_size + && bs->buf.mem_size <= BUFFER_STACK_MAX_MEM_SIZE); } } static ErlDrvBinary* realloc_buffer(ErlDrvBinary* buf, long newsz) { - ErlDrvBinary* bin; -#ifdef DEBUG - long orig_size = buf->orig_size; -#endif - - if ((bin = driver_realloc_binary(buf,newsz)) != NULL) { - COUNT_BUF_ALLOC(newsz - orig_size); - ; - } - return bin; + return driver_realloc_binary(buf, newsz); } /* use a TRICK, access the refc field to see if any one else has @@ -1397,10 +1396,8 @@ static void free_buffer(ErlDrvBinary* buf) if (buf != NULL) { if (driver_binary_get_refc(buf) == 1) release_buffer(buf); - else { - COUNT_BUF_FREE(buf->orig_size); + else driver_free_binary(buf); - } } } @@ -3404,20 +3401,14 @@ static int inet_init() if (!sock_init()) goto error; - buffer_stack_pos = 0; - - erts_smp_spinlock_init(&inet_buffer_stack_lock, "inet_buffer_stack_lock"); + if (0 != erl_drv_tsd_key_create("inet_buffer_stack_key", &buffer_stack_key)) + goto error; ASSERT(sizeof(struct in_addr) == 4); # if defined(HAVE_IN6) && defined(AF_INET6) ASSERT(sizeof(struct in6_addr) == 16); # endif -#ifdef DEBUG - tot_buf_allocated = 0; - max_buf_allocated = 0; - tot_buf_stacked = 0; -#endif INIT_ATOM(ok); INIT_ATOM(tcp); INIT_ATOM(udp); @@ -3824,39 +3815,81 @@ do { if ((end)-(ptr) < (n)) goto error; } while(0) static char* sockaddr_to_buf(struct sockaddr* addr, char* ptr, char* end) { if (addr->sa_family == AF_INET || addr->sa_family == 0) { - struct in_addr a; - buf_check(ptr,end,sizeof(struct in_addr)); - a = ((struct sockaddr_in*) addr)->sin_addr; - sys_memcpy(ptr, (char*)&a, sizeof(struct in_addr)); - return ptr + sizeof(struct in_addr); + struct in_addr *p = &(((struct sockaddr_in*) addr)->sin_addr); + buf_check(ptr, end, 1 + sizeof(struct in_addr)); + *ptr = INET_AF_INET; + sys_memcpy(ptr+1, (char*)p, sizeof(struct in_addr)); + return ptr + 1 + sizeof(struct in_addr); } #if defined(HAVE_IN6) && defined(AF_INET6) else if (addr->sa_family == AF_INET6) { - struct in6_addr a; - buf_check(ptr,end,sizeof(struct in6_addr)); - a = ((struct sockaddr_in6*) addr)->sin6_addr; - sys_memcpy(ptr, (char*)&a, sizeof(struct in6_addr)); - return ptr + sizeof(struct in6_addr); + struct in6_addr *p = &(((struct sockaddr_in6*) addr)->sin6_addr); + buf_check(ptr, end, 1 + sizeof(struct in6_addr)); + *ptr = INET_AF_INET6; + sys_memcpy(ptr+1, (char*)p, sizeof(struct in6_addr)); + return ptr + 1 + sizeof(struct in6_addr); + } +#endif +#if defined(AF_LINK) + else if (addr->sa_family == AF_LINK) { + struct sockaddr_dl *sdl_p = (struct sockaddr_dl*) addr; + buf_check(ptr, end, 2 + sdl_p->sdl_alen); + put_int16(sdl_p->sdl_alen, ptr); ptr += 2; + sys_memcpy(ptr, sdl_p->sdl_data + sdl_p->sdl_nlen, sdl_p->sdl_alen); + return ptr + sdl_p->sdl_alen; + } +#endif +#if defined(AF_PACKET) && defined(HAVE_NETPACKET_PACKET_H) + else if(addr->sa_family == AF_PACKET) { + struct sockaddr_ll *sll_p = (struct sockaddr_ll*) addr; + buf_check(ptr, end, 2 + sll_p->sll_halen); + put_int16(sll_p->sll_halen, ptr); ptr += 2; + sys_memcpy(ptr, sll_p->sll_addr, sll_p->sll_halen); + return ptr + sll_p->sll_halen; } #endif + return ptr; error: return NULL; - } static char* buf_to_sockaddr(char* ptr, char* end, struct sockaddr* addr) { - buf_check(ptr,end,sizeof(struct in_addr)); - sys_memcpy((char*) &((struct sockaddr_in*)addr)->sin_addr, ptr, - sizeof(struct in_addr)); - addr->sa_family = AF_INET; - return ptr + sizeof(struct in_addr); - + buf_check(ptr,end,1); + switch (*ptr++) { + case INET_AF_INET: { + struct in_addr *p = &((struct sockaddr_in*)addr)->sin_addr; + buf_check(ptr,end,sizeof(struct in_addr)); + sys_memcpy((char*) p, ptr, sizeof(struct in_addr)); + addr->sa_family = AF_INET; + return ptr + sizeof(struct in_addr); + } + case INET_AF_INET6: { + struct in6_addr *p = &((struct sockaddr_in6*)addr)->sin6_addr; + buf_check(ptr,end,sizeof(struct in6_addr)); + sys_memcpy((char*) p, ptr, sizeof(struct in6_addr)); + addr->sa_family = AF_INET6; + return ptr + sizeof(struct in6_addr); + } + } error: return NULL; } +#if defined (IFF_POINTOPOINT) +#define IFGET_FLAGS(cflags) IFGET_FLAGS_P2P(cflags, IFF_POINTOPOINT) +#elif defined IFF_POINTTOPOINT +#define IFGET_FLAGS(cflags) IFGET_FLAGS_P2P(cflags, IFF_POINTTOPOINT) +#endif + +#define IFGET_FLAGS_P2P(cflags, iff_ptp) \ + ((((cflags) & IFF_UP) ? INET_IFF_UP : 0) | \ + (((cflags) & IFF_BROADCAST) ? INET_IFF_BROADCAST : 0) | \ + (((cflags) & IFF_LOOPBACK) ? INET_IFF_LOOPBACK : 0) | \ + (((cflags) & iff_ptp) ? INET_IFF_POINTTOPOINT : 0) | \ + (((cflags) & IFF_UP) ? INET_IFF_RUNNING : 0) | /* emulate running ? */ \ + (((cflags) & IFF_MULTICAST) ? INET_IFF_MULTICAST : 0)) #if defined(__WIN32__) && defined(SIO_GET_INTERFACE_LIST) @@ -3894,7 +3927,6 @@ static int inet_ctl_getiflist(inet_descriptor* desc, char** rbuf, int rsize) return ctl_reply(INET_REP_OK, sbuf, sptr - sbuf, rbuf, rsize); } - /* input is an ip-address in string format i.e A.B.C.D ** scan the INTERFACE_LIST to get the options */ @@ -3980,27 +4012,12 @@ static int inet_ctl_ifget(inet_descriptor* desc, char* buf, int len, break; case INET_IFOPT_FLAGS: { - long eflags = 0; int flags = ifp->iiFlags; /* just enumerate the interfaces (no names) */ - /* translate flags */ - if (flags & IFF_UP) - eflags |= INET_IFF_UP; - if (flags & IFF_BROADCAST) - eflags |= INET_IFF_BROADCAST; - if (flags & IFF_LOOPBACK) - eflags |= INET_IFF_LOOPBACK; - if (flags & IFF_POINTTOPOINT) - eflags |= INET_IFF_POINTTOPOINT; - if (flags & IFF_UP) /* emulate runnign ? */ - eflags |= INET_IFF_RUNNING; - if (flags & IFF_MULTICAST) - eflags |= INET_IFF_MULTICAST; - buf_check(sptr, s_end, 5); *sptr++ = INET_IFOPT_FLAGS; - put_int32(eflags, sptr); + put_int32(IFGET_FLAGS(flags), sptr); sptr += 4; break; } @@ -4021,7 +4038,6 @@ static int inet_ctl_ifset(inet_descriptor* desc, char* buf, int len, return ctl_reply(INET_REP_OK, NULL, 0, rbuf, rsize); } - #elif defined(SIOCGIFCONF) && defined(SIOCSIFFLAGS) /* cygwin has SIOCGIFCONF but not SIOCSIFFLAGS (Nov 2002) */ @@ -4032,69 +4048,77 @@ static int inet_ctl_ifset(inet_descriptor* desc, char* buf, int len, #define SIZEA(p) (sizeof (p)) #endif - -static int inet_ctl_getiflist(inet_descriptor* desc, char** rbuf, int rsize) -{ - struct ifconf ifc; - struct ifreq *ifr; - char *buf; - int buflen, ifc_len, i; - char *sbuf, *sp; - - /* Courtesy of Per Bergqvist and W. Richard Stevens */ - - ifc_len = 0; - buflen = 100 * sizeof(struct ifreq); - buf = ALLOC(buflen); +static int get_ifconf(SOCKET s, struct ifconf *ifcp) { + int ifc_len = 0; + int buflen = 100 * sizeof(struct ifreq); + char *buf = ALLOC(buflen); for (;;) { - ifc.ifc_len = buflen; - ifc.ifc_buf = buf; - if (ioctl(desc->s, SIOCGIFCONF, (char *)&ifc) < 0) { + ifcp->ifc_len = buflen; + ifcp->ifc_buf = buf; + if (ioctl(s, SIOCGIFCONF, (char *)ifcp) < 0) { int res = sock_errno(); if (res != EINVAL || ifc_len) { FREE(buf); - return ctl_error(res, rbuf, rsize); + return -1; } } else { - if (ifc.ifc_len == ifc_len) break; /* buf large enough */ - ifc_len = ifc.ifc_len; + if (ifcp->ifc_len == ifc_len) break; /* buf large enough */ + ifc_len = ifcp->ifc_len; } buflen += 10 * sizeof(struct ifreq); buf = (char *)REALLOC(buf, buflen); } - - sp = sbuf = ALLOC(ifc_len+1); + return 0; +} + +static void free_ifconf(struct ifconf *ifcp) { + FREE(ifcp->ifc_buf); +} + +static int inet_ctl_getiflist(inet_descriptor* desc, char** rbuf, int rsize) +{ + struct ifconf ifc; + struct ifreq *ifrp; + char *sbuf, *sp; + int i; + + /* Courtesy of Per Bergqvist and W. Richard Stevens */ + + if (get_ifconf(desc->s, &ifc) < 0) { + return ctl_error(sock_errno(), rbuf, rsize); + } + + sp = sbuf = ALLOC(ifc.ifc_len+1); *sp++ = INET_REP_OK; i = 0; for (;;) { int n; - - ifr = (struct ifreq *) VOIDP(buf + i); - n = sizeof(ifr->ifr_name) + SIZEA(ifr->ifr_addr); - if (n < sizeof(*ifr)) n = sizeof(*ifr); - if (i+n > ifc_len) break; + + ifrp = (struct ifreq *) VOIDP(ifc.ifc_buf + i); + n = sizeof(ifrp->ifr_name) + SIZEA(ifrp->ifr_addr); + if (n < sizeof(*ifrp)) n = sizeof(*ifrp); + if (i+n > ifc.ifc_len) break; i += n; - - switch (ifr->ifr_addr.sa_family) { + + switch (ifrp->ifr_addr.sa_family) { #if defined(HAVE_IN6) && defined(AF_INET6) case AF_INET6: #endif case AF_INET: - ASSERT(sp+IFNAMSIZ+1 < sbuf+buflen+1) - strncpy(sp, ifr->ifr_name, IFNAMSIZ); + ASSERT(sp+IFNAMSIZ+1 < sbuf+ifc.ifc_len+1) + strncpy(sp, ifrp->ifr_name, IFNAMSIZ); sp[IFNAMSIZ] = '\0'; sp += strlen(sp), ++sp; } - - if (i >= ifc_len) break; + + if (i >= ifc.ifc_len) break; } - FREE(buf); + free_ifconf(&ifc); *rbuf = sbuf; return sp - sbuf; } - /* FIXME: temporary hack */ #ifndef IFHWADDRLEN #define IFHWADDRLEN 6 @@ -4133,37 +4157,52 @@ static int inet_ctl_ifget(inet_descriptor* desc, char* buf, int len, #ifdef SIOCGIFHWADDR if (ioctl(desc->s, SIOCGIFHWADDR, (char *)&ifreq) < 0) break; - buf_check(sptr, s_end, 1+IFHWADDRLEN); + buf_check(sptr, s_end, 1+2+IFHWADDRLEN); *sptr++ = INET_IFOPT_HWADDR; + put_int16(IFHWADDRLEN, sptr); sptr += 2; /* raw memcpy (fix include autoconf later) */ sys_memcpy(sptr, (char*)(&ifreq.ifr_hwaddr.sa_data), IFHWADDRLEN); sptr += IFHWADDRLEN; -#elif defined(HAVE_GETIFADDRS) - struct ifaddrs *ifa, *ifp; - int found = 0; - - if (getifaddrs(&ifa) == -1) - goto error; +#elif defined(SIOCGENADDR) + if (ioctl(desc->s, SIOCGENADDR, (char *)&ifreq) < 0) + break; + buf_check(sptr, s_end, 1+2+sizeof(ifreq.ifr_enaddr)); + *sptr++ = INET_IFOPT_HWADDR; + put_int16(sizeof(ifreq.ifr_enaddr), sptr); sptr += 2; + /* raw memcpy (fix include autoconf later) */ + sys_memcpy(sptr, (char*)(&ifreq.ifr_enaddr), + sizeof(ifreq.ifr_enaddr)); + sptr += sizeof(ifreq.ifr_enaddr); +#elif defined(HAVE_GETIFADDRS) && defined(AF_LINK) + struct ifaddrs *ifa, *ifp; + struct sockaddr_dl *sdlp; + int found = 0; + + if (getifaddrs(&ifa) == -1) + goto error; - for (ifp = ifa; ifp; ifp = ifp->ifa_next) { - if ((ifp->ifa_addr->sa_family == AF_LINK) && - (sys_strcmp(ifp->ifa_name, ifreq.ifr_name) == 0)) { - found = 1; - break; - } - } + for (ifp = ifa; ifp; ifp = ifp->ifa_next) { + if ((ifp->ifa_addr->sa_family == AF_LINK) && + (sys_strcmp(ifp->ifa_name, ifreq.ifr_name) == 0)) { + found = 1; + break; + } + } - if (found == 0) { - freeifaddrs(ifa); - break; - } + if (found == 0) { + freeifaddrs(ifa); + break; + } + sdlp = (struct sockaddr_dl *)ifp->ifa_addr; - buf_check(sptr, s_end, 1+IFHWADDRLEN); - *sptr++ = INET_IFOPT_HWADDR; - sys_memcpy(sptr, ((struct sockaddr_dl *)ifp->ifa_addr)->sdl_data + - ((struct sockaddr_dl *)ifp->ifa_addr)->sdl_nlen, IFHWADDRLEN); - freeifaddrs(ifa); - sptr += IFHWADDRLEN; + buf_check(sptr, s_end, 1+2+sdlp->sdl_alen); + *sptr++ = INET_IFOPT_HWADDR; + put_int16(sdlp->sdl_alen, sptr); sptr += 2; + sys_memcpy(sptr, + sdlp->sdl_data + sdlp->sdl_nlen, + sdlp->sdl_alen); + freeifaddrs(ifa); + sptr += sdlp->sdl_alen; #endif break; } @@ -4240,29 +4279,15 @@ static int inet_ctl_ifget(inet_descriptor* desc, char* buf, int len, case INET_IFOPT_FLAGS: { int flags; - int eflags = 0; if (ioctl(desc->s, SIOCGIFFLAGS, (char*)&ifreq) < 0) flags = 0; else flags = ifreq.ifr_flags; - /* translate flags */ - if (flags & IFF_UP) - eflags |= INET_IFF_UP; - if (flags & IFF_BROADCAST) - eflags |= INET_IFF_BROADCAST; - if (flags & IFF_LOOPBACK) - eflags |= INET_IFF_LOOPBACK; - if (flags & IFF_POINTOPOINT) - eflags |= INET_IFF_POINTTOPOINT; - if (flags & IFF_RUNNING) - eflags |= INET_IFF_RUNNING; - if (flags & IFF_MULTICAST) - eflags |= INET_IFF_MULTICAST; buf_check(sptr, s_end, 5); *sptr++ = INET_IFOPT_FLAGS; - put_int32(eflags, sptr); + put_int32(IFGET_FLAGS(flags), sptr); sptr += 4; break; } @@ -4300,17 +4325,22 @@ static int inet_ctl_ifset(inet_descriptor* desc, char* buf, int len, (void) ioctl(desc->s, SIOCSIFADDR, (char*)&ifreq); break; - case INET_IFOPT_HWADDR: - buf_check(buf, b_end, IFHWADDRLEN); + case INET_IFOPT_HWADDR: { + unsigned int len; + buf_check(buf, b_end, 2); + len = get_int16(buf); buf += 2; + buf_check(buf, b_end, len); #ifdef SIOCSIFHWADDR /* raw memcpy (fix include autoconf later) */ - sys_memcpy((char*)(&ifreq.ifr_hwaddr.sa_data), buf, IFHWADDRLEN); + sys_memset((char*)(&ifreq.ifr_hwaddr.sa_data), + '\0', sizeof(ifreq.ifr_hwaddr.sa_data)); + sys_memcpy((char*)(&ifreq.ifr_hwaddr.sa_data), buf, len); (void) ioctl(desc->s, SIOCSIFHWADDR, (char *)&ifreq); #endif - buf += IFHWADDRLEN; + buf += len; break; - + } case INET_IFOPT_BROADADDR: #ifdef SIOCSIFBRDADDR @@ -4415,6 +4445,551 @@ static int inet_ctl_ifset(inet_descriptor* desc, char* buf, int len, #endif + + +/* Latin-1 to utf8 */ + +static int utf8_len(const char *c, int m) { + int l; + for (l = 0; m; c++, l++, m--) { + if (*c == '\0') break; + if ((*c & 0x7f) != *c) l++; + } + return l; +} + +static void utf8_encode(const char *c, int m, char *p) { + for (; m; c++, m--) { + if (*c == '\0') break; + if ((*c & 0x7f) != *c) { + *p++ = (char) (0xC0 | (0x03 & (*c >> 6))); + *p++ = (char) (0x80 | (0x3F & *c)); + } else { + *p++ = (char) *c; + } + } +} + +#if defined(__WIN32__) + +static void set_netmask_bytes(char *c, int len, int pref_len) { + int i, m; + for (i = 0, m = pref_len >> 3; i < m && i < len; i++) c[i] = '\xFF'; + if (i < len) c[i++] = 0xFF << (8 - (pref_len & 7)); + for (; i < len; i++) c[i] = '\0'; +} + + +int eq_masked_bytes(char *a, char *b, int pref_len) { + int i, m; + for (i = 0, m = pref_len >> 3; i < m; i++) { + if (a[i] != b[i]) return 0; + } + m = pref_len & 7; + if (m) { + m = 0xFF & (0xFF << (8 - m)); + if ((a[i] & m) != (b[i] & m)) return 0; + } + return !0; +} + +static int inet_ctl_getifaddrs(inet_descriptor* desc_p, + char **rbuf_pp, int rsize) +{ + int i; + DWORD ret, n; + IP_INTERFACE_INFO *info_p; + MIB_IPADDRTABLE *ip_addrs_p; + IP_ADAPTER_ADDRESSES *ip_adaddrs_p, *ia_p; + + char *buf_p; + char *buf_alloc_p; + int buf_size =512; +# define BUF_ENSURE(Size) \ + do { \ + int NEED_, GOT_ = buf_p - buf_alloc_p; \ + NEED_ = GOT_ + (Size); \ + if (NEED_ > buf_size) { \ + buf_size = NEED_ + 512; \ + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); \ + buf_p = buf_alloc_p + GOT_; \ + } \ + } while(0) +# define SOCKADDR_TO_BUF(opt, sa) \ + do { \ + if (sa) { \ + char *P_; \ + *buf_p++ = (opt); \ + while (! (P_ = sockaddr_to_buf((sa), buf_p, \ + buf_alloc_p+buf_size))) { \ + int GOT_ = buf_p - buf_alloc_p; \ + buf_size += 512; \ + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); \ + buf_p = buf_alloc_p + GOT_; \ + } \ + if (P_ == buf_p) { \ + buf_p--; \ + } else { \ + buf_p = P_; \ + } \ + } \ + } while (0) + + { + /* Try GetAdaptersAddresses, if it is available */ + unsigned long ip_adaddrs_size = 16 * 1024; + ULONG family = AF_UNSPEC; + ULONG flags = + GAA_FLAG_INCLUDE_PREFIX | GAA_FLAG_SKIP_ANYCAST | + GAA_FLAG_SKIP_DNS_SERVER | GAA_FLAG_SKIP_FRIENDLY_NAME | + GAA_FLAG_SKIP_MULTICAST; + ULONG (WINAPI *fpGetAdaptersAddresses) + (ULONG, ULONG, PVOID, PIP_ADAPTER_ADDRESSES, PULONG); + HMODULE iphlpapi = GetModuleHandle("iphlpapi"); + fpGetAdaptersAddresses = (void *) + (iphlpapi ? + GetProcAddress(iphlpapi, "GetAdaptersAddresses") : + NULL); + if (fpGetAdaptersAddresses) { + ip_adaddrs_p = ALLOC(ip_adaddrs_size); + for (i = 17; i; i--) { + ret = fpGetAdaptersAddresses( + family, flags, NULL, ip_adaddrs_p, &ip_adaddrs_size); + ip_adaddrs_p = REALLOC(ip_adaddrs_p, ip_adaddrs_size); + if (ret == NO_ERROR) break; + if (ret == ERROR_BUFFER_OVERFLOW) continue; + i = 0; + } + if (! i) { + FREE(ip_adaddrs_p); + ip_adaddrs_p = NULL; + } + } else ip_adaddrs_p = NULL; + } + + { + /* Load the IP_INTERFACE_INFO table (only IPv4 interfaces), + * reliable source of interface names on XP + */ + unsigned long info_size = 4 * 1024; + info_p = ALLOC(info_size); + for (i = 17; i; i--) { + ret = GetInterfaceInfo(info_p, &info_size); + info_p = REALLOC(info_p, info_size); + if (ret == NO_ERROR) break; + if (ret == ERROR_INSUFFICIENT_BUFFER) continue; + i = 0; + } + if (! i) { + FREE(info_p); + info_p = NULL; + } + } + + if (! ip_adaddrs_p) { + /* If GetAdaptersAddresses gave nothing we fall back to + * MIB_IPADDRTABLE (only IPv4 interfaces) + */ + unsigned long ip_addrs_size = 16 * sizeof(*ip_addrs_p); + ip_addrs_p = ALLOC(ip_addrs_size); + for (i = 17; i; i--) { + ret = GetIpAddrTable(ip_addrs_p, &ip_addrs_size, FALSE); + ip_addrs_p = REALLOC(ip_addrs_p, ip_addrs_size); + if (ret == NO_ERROR) break; + if (ret == ERROR_INSUFFICIENT_BUFFER) continue; + i = 0; + } + if (! i) { + if (info_p) FREE(info_p); + FREE(ip_addrs_p); + return ctl_reply(INET_REP_OK, NULL, 0, rbuf_pp, rsize); + } + } else ip_addrs_p = NULL; + + buf_p = buf_alloc_p = ALLOC(buf_size); + *buf_p++ = INET_REP_OK; + + /* Iterate over MIB_IPADDRTABLE or IP_ADAPTER_ADDRESSES */ + for (ia_p = NULL, ip_addrs_p ? ((void *)(i = 0)) : (ia_p = ip_adaddrs_p); + ip_addrs_p ? (i < ip_addrs_p->dwNumEntries) : (ia_p != NULL); + ip_addrs_p ? ((void *)(i++)) : (ia_p = ia_p->Next)) { + MIB_IPADDRROW *ipaddrrow_p = NULL; + DWORD flags = INET_IFF_MULTICAST; + DWORD index = 0; + WCHAR *wname_p = NULL; + MIB_IFROW ifrow; + + if (ip_addrs_p) { + ipaddrrow_p = ip_addrs_p->table + i; + index = ipaddrrow_p->dwIndex; + } else { + index = ia_p->IfIndex; + if (ia_p->Flags & IP_ADAPTER_NO_MULTICAST) { + flags &= ~INET_IFF_MULTICAST; + } + } +index: + if (! index) goto done; + sys_memzero(&ifrow, sizeof(ifrow)); + ifrow.dwIndex = index; + if (GetIfEntry(&ifrow) != NO_ERROR) break; + /* Find the interface name - first try MIB_IFROW.wzname */ + if (ifrow.wszName[0] != 0) { + wname_p = ifrow.wszName; + } else { + /* Then try IP_ADAPTER_INDEX_MAP.Name (only IPv4 adapters) */ + int j; + for (j = 0; j < info_p->NumAdapters; j++) { + if (info_p->Adapter[j].Index == (ULONG) ifrow.dwIndex) { + if (info_p->Adapter[j].Name[0] != 0) { + wname_p = info_p->Adapter[j].Name; + } + break; + } + } + } + if (wname_p) { + int len; + /* Convert interface name to UTF-8 */ + len = + WideCharToMultiByte( + CP_UTF8, 0, wname_p, -1, NULL, 0, NULL, NULL); + if (! len) break; + BUF_ENSURE(len); + WideCharToMultiByte( + CP_UTF8, 0, wname_p, -1, buf_p, len, NULL, NULL); + buf_p += len; + } else { + /* Found no name - + * use "MIB_IFROW.dwIndex: MIB_IFROW.bDescr" as name instead */ + int l; + l = utf8_len(ifrow.bDescr, ifrow.dwDescrLen); + BUF_ENSURE(9 + l+1); + buf_p += + erts_sprintf( + buf_p, "%lu: ", (unsigned long) ifrow.dwIndex); + utf8_encode(ifrow.bDescr, ifrow.dwDescrLen, buf_p); + buf_p += l; + *buf_p++ = '\0'; + } + /* Interface flags, often make up broadcast and multicast flags */ + switch (ifrow.dwType) { + case IF_TYPE_ETHERNET_CSMACD: + flags |= INET_IFF_BROADCAST; + break; + case IF_TYPE_SOFTWARE_LOOPBACK: + flags |= INET_IFF_LOOPBACK; + flags &= ~INET_IFF_MULTICAST; + break; + default: + flags &= ~INET_IFF_MULTICAST; + break; + } + if (ifrow.dwAdminStatus) { + flags |= INET_IFF_UP; + switch (ifrow.dwOperStatus) { + case IF_OPER_STATUS_CONNECTING: + flags |= INET_IFF_POINTTOPOINT; + break; + case IF_OPER_STATUS_CONNECTED: + flags |= INET_IFF_RUNNING | INET_IFF_POINTTOPOINT; + break; + case IF_OPER_STATUS_OPERATIONAL: + flags |= INET_IFF_RUNNING; + break; + } + } + BUF_ENSURE(1 + 4); + *buf_p++ = INET_IFOPT_FLAGS; + put_int32(flags, buf_p); buf_p += 4; + if (ipaddrrow_p) { + /* Legacy implementation through GetIpAddrTable */ + struct sockaddr_in sin; + /* IP Address */ + sys_memzero(&sin, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ipaddrrow_p->dwAddr; + BUF_ENSURE(1); + /* Netmask */ + SOCKADDR_TO_BUF(INET_IFOPT_ADDR, (struct sockaddr *) &sin); + sin.sin_addr.s_addr = ipaddrrow_p->dwMask; + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_NETMASK, (struct sockaddr *) &sin); + if (flags & INET_IFF_BROADCAST) { + /* Broadcast address - fake it*/ + sin.sin_addr.s_addr = ipaddrrow_p->dwAddr; + sin.sin_addr.s_addr |= ~ipaddrrow_p->dwMask; + BUF_ENSURE(1); + SOCKADDR_TO_BUF( + INET_IFOPT_BROADADDR, (struct sockaddr *) &sin); + } + } else { + IP_ADAPTER_UNICAST_ADDRESS *p; + /* IP Address(es) */ + for (p = ia_p->FirstUnicastAddress; + p; + p = p->Next) + { + IP_ADAPTER_PREFIX *q; + ULONG shortest_length; + struct sockaddr *shortest_p, *sa_p = p->Address.lpSockaddr; + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_ADDR, sa_p); + shortest_p = NULL; + shortest_length = 0; + for (q = ia_p->FirstPrefix; + q; + q = q->Next) { + struct sockaddr *sp_p = q->Address.lpSockaddr; + if (sa_p->sa_family != sp_p->sa_family) continue; + switch (sa_p->sa_family) { + case AF_INET: { + struct sockaddr_in sin; + DWORD sa, sp, mask; + sa = ntohl((DWORD) + ((struct sockaddr_in *) + sa_p)->sin_addr.s_addr); + sp = ntohl((DWORD) + ((struct sockaddr_in *) + sp_p)->sin_addr.s_addr); + mask = 0xFFFFFFFF << (32 - q->PrefixLength); + if ((sa & mask) != (sp & mask)) continue; + if ((! shortest_p) + || q->PrefixLength < shortest_length) { + shortest_p = sp_p; + shortest_length = q->PrefixLength; + } + } break; + case AF_INET6: { + struct sockaddr_in6 sin6; + if (!eq_masked_bytes((char *) + &((struct sockaddr_in6 *) + sa_p)->sin6_addr, + (char *) + &((struct sockaddr_in6 *) + sp_p)->sin6_addr, + q->PrefixLength)) { + continue; + } + if ((! shortest_p) + || q->PrefixLength < shortest_length) { + shortest_p = sp_p; + shortest_length = q->PrefixLength; + } + } break; + } + } + if (! shortest_p) { + /* Found no shortest prefix */ + shortest_p = sa_p; + switch (shortest_p->sa_family) { + case AF_INET: { + /* Fall back to old classfull network addresses */ + DWORD addr = ntohl(((struct sockaddr_in *)shortest_p) + ->sin_addr.s_addr); + if (! (addr & 0x800000)) { + /* Class A */ + shortest_length = 8; + } else if (! (addr & 0x400000)) { + /* Class B */ + shortest_length = 16; + } else if (! (addr & 0x200000)) { + /* Class C */ + shortest_length = 24; + } else { + shortest_length = 32; + } + } break; + case AF_INET6: { + /* Just play it safe */ + shortest_length = 128; + } break; + } + } + switch (shortest_p->sa_family) { + case AF_INET: { + struct sockaddr_in sin; + DWORD mask = 0xFFFFFFFF << (32 - shortest_length); + sys_memzero(&sin, sizeof(sin)); + sin.sin_family = shortest_p->sa_family; + sin.sin_addr.s_addr = htonl(mask); + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_NETMASK, + (struct sockaddr *) &sin); + if (flags & INET_IFF_BROADCAST) { + DWORD sp = + ntohl((DWORD) + ((struct sockaddr_in *)shortest_p) + -> sin_addr.s_addr); + sin.sin_addr.s_addr = htonl(sp | ~mask); + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_BROADADDR, + (struct sockaddr *) &sin); + } + } break; + case AF_INET6: { + struct sockaddr_in6 sin6; + sys_memzero(&sin6, sizeof(sin6)); + sin6.sin6_family = shortest_p->sa_family; + set_netmask_bytes((char *) &sin6.sin6_addr, + 16, + shortest_length); + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_NETMASK, + (struct sockaddr *) &sin6); + } break; + } + } + } + if (ifrow.dwPhysAddrLen) { + /* Hardware Address */ + BUF_ENSURE(1 + 2 + ifrow.dwPhysAddrLen); + *buf_p++ = INET_IFOPT_HWADDR; + put_int16(ifrow.dwPhysAddrLen, buf_p); buf_p += 2; + sys_memcpy(buf_p, ifrow.bPhysAddr, ifrow.dwPhysAddrLen); + buf_p += ifrow.dwPhysAddrLen; + } + +done: + /* That is all for this interface */ + BUF_ENSURE(1); + *buf_p++ = '\0'; + if (ia_p && + ia_p->Ipv6IfIndex && + ia_p->Ipv6IfIndex != index) + { + /* Oops, there was an other interface for IPv6. Possible? XXX */ + index = ia_p->Ipv6IfIndex; + goto index; + } + } + + if (ip_adaddrs_p) FREE(ip_adaddrs_p); + if (info_p) FREE(info_p); + if (ip_addrs_p) FREE(ip_addrs_p); + + buf_size = buf_p - buf_alloc_p; + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); + /* buf_p is now unreliable */ + *rbuf_pp = buf_alloc_p; + return buf_size; +# undef BUF_ENSURE +} + +#elif defined(HAVE_GETIFADDRS) + +static int inet_ctl_getifaddrs(inet_descriptor* desc_p, + char **rbuf_pp, int rsize) +{ + struct ifaddrs *ifa_p, *ifa_free_p; + + int buf_size; + char *buf_p; + char *buf_alloc_p; + + buf_size = 512; + buf_alloc_p = ALLOC(buf_size); + buf_p = buf_alloc_p; +# define BUF_ENSURE(Size) \ + do { \ + int NEED_, GOT_ = buf_p - buf_alloc_p; \ + NEED_ = GOT_ + (Size); \ + if (NEED_ > buf_size) { \ + buf_size = NEED_ + 512; \ + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); \ + buf_p = buf_alloc_p + GOT_; \ + } \ + } while (0) +# define SOCKADDR_TO_BUF(opt, sa) \ + do { \ + if (sa) { \ + char *P_; \ + *buf_p++ = (opt); \ + while (! (P_ = sockaddr_to_buf((sa), buf_p, \ + buf_alloc_p+buf_size))) { \ + int GOT_ = buf_p - buf_alloc_p; \ + buf_size += 512; \ + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); \ + buf_p = buf_alloc_p + GOT_; \ + } \ + if (P_ == buf_p) { \ + buf_p--; \ + } else { \ + buf_p = P_; \ + } \ + } \ + } while (0) + + if (getifaddrs(&ifa_p) < 0) { + return ctl_error(sock_errno(), rbuf_pp, rsize); + } + ifa_free_p = ifa_p; + *buf_p++ = INET_REP_OK; + for (; ifa_p; ifa_p = ifa_p->ifa_next) { + int len = utf8_len(ifa_p->ifa_name, -1); + BUF_ENSURE(len+1 + 1+4 + 1); + utf8_encode(ifa_p->ifa_name, -1, buf_p); + buf_p += len; + *buf_p++ = '\0'; + *buf_p++ = INET_IFOPT_FLAGS; + put_int32(IFGET_FLAGS(ifa_p->ifa_flags), buf_p); buf_p += 4; + if (ifa_p->ifa_addr->sa_family == AF_INET +#if defined(AF_INET6) + || ifa_p->ifa_addr->sa_family == AF_INET6 +#endif + ) { + SOCKADDR_TO_BUF(INET_IFOPT_ADDR, ifa_p->ifa_addr); + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_NETMASK, ifa_p->ifa_netmask); + if (ifa_p->ifa_flags & IFF_POINTOPOINT) { + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_DSTADDR, ifa_p->ifa_dstaddr); + } else if (ifa_p->ifa_flags & IFF_BROADCAST) { + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_BROADADDR, ifa_p->ifa_broadaddr); + } + } +#if defined(AF_LINK) || defined(AF_PACKET) + else if ( +#if defined(AF_LINK) + ifa_p->ifa_addr->sa_family == AF_LINK +#else + 0 +#endif +#if defined(AF_PACKET) + || ifa_p->ifa_addr->sa_family == AF_PACKET +#endif + ) { + char *bp = buf_p; + BUF_ENSURE(1); + SOCKADDR_TO_BUF(INET_IFOPT_HWADDR, ifa_p->ifa_addr); + if (buf_p - bp < 4) buf_p = bp; /* Empty hwaddr */ + } +#endif + BUF_ENSURE(1); + *buf_p++ = '\0'; + } + buf_size = buf_p - buf_alloc_p; + buf_alloc_p = REALLOC(buf_alloc_p, buf_size); + /* buf_p is now unreliable */ + freeifaddrs(ifa_free_p); + *rbuf_pp = buf_alloc_p; + return buf_size; +# undef BUF_ENSURE +} + +#else + +static int inet_ctl_getifaddrs(inet_descriptor* desc_p, + char **rbuf_pp, int rsize) +{ + return ctl_error(ENOTSUP, rbuf_pp, rsize); +} + +#endif + + + #ifdef VXWORKS /* ** THIS is a terrible creature, a bug in the TCP part @@ -4576,8 +5151,7 @@ static int inet_set_opts(inet_descriptor* desc, char* ptr, int len) case INET_LOPT_BUFFER: DEBUGF(("inet_set_opts(%ld): s=%d, BUFFER=%d\r\n", (long)desc->port, desc->s, ival)); - if (ival > INET_MAX_BUFFER) ival = INET_MAX_BUFFER; - else if (ival < INET_MIN_BUFFER) ival = INET_MIN_BUFFER; + if (ival < INET_MIN_BUFFER) ival = INET_MIN_BUFFER; desc->bufsz = ival; continue; @@ -4642,7 +5216,6 @@ static int inet_set_opts(inet_descriptor* desc, char* ptr, int len) if (desc->stype == SOCK_STREAM) { tcp_descriptor* tdesc = (tcp_descriptor*) desc; if (ival < 0) ival = 0; - else if (ival > INET_MAX_BUFFER*2) ival = INET_MAX_BUFFER*2; if (tdesc->low > ival) tdesc->low = ival; tdesc->high = ival; @@ -4653,7 +5226,6 @@ static int inet_set_opts(inet_descriptor* desc, char* ptr, int len) if (desc->stype == SOCK_STREAM) { tcp_descriptor* tdesc = (tcp_descriptor*) desc; if (ival < 0) ival = 0; - else if (ival > INET_MAX_BUFFER) ival = INET_MAX_BUFFER; if (tdesc->high < ival) tdesc->high = ival; tdesc->low = ival; @@ -4999,9 +5571,6 @@ static int sctp_set_opts(inet_descriptor* desc, char* ptr, int len) case INET_LOPT_BUFFER: desc->bufsz = get_int32(curr); curr += 4; - if (desc->bufsz > INET_MAX_BUFFER) - desc->bufsz = INET_MAX_BUFFER; - else if (desc->bufsz < INET_MIN_BUFFER) desc->bufsz = INET_MIN_BUFFER; res = 0; /* This does not affect the kernel buffer size */ @@ -5293,12 +5862,15 @@ static int sctp_set_opts(inet_descriptor* desc, char* ptr, int len) if (pmtud_enable) cflags |= SPP_PMTUD_ENABLE; if (pmtud_disable) cflags |= SPP_PMTUD_DISABLE; +# ifdef HAVE_STRUCT_SCTP_PADDRPARAMS_SPP_SACKDELAY + /* The followings are missing in FreeBSD 7.1 */ sackdelay_enable =eflags& SCTP_FLAG_SACDELAY_ENABLE; sackdelay_disable=eflags& SCTP_FLAG_SACDELAY_DISABLE; if (sackdelay_enable && sackdelay_disable) return -1; if (sackdelay_enable) cflags |= SPP_SACKDELAY_ENABLE; if (sackdelay_disable) cflags |= SPP_SACKDELAY_DISABLE; +# endif arg.pap.spp_flags = cflags; # endif @@ -5436,7 +6008,7 @@ static int inet_fill_opts(inet_descriptor* desc, #define PLACE_FOR(Size,Ptr) \ do { \ int need = dest_used + (Size); \ - if (need > INET_MAX_BUFFER) { \ + if (need > INET_MAX_OPT_BUFFER) { \ RETURN_ERROR(); \ } \ if (need > dest_allocated) { \ @@ -5660,7 +6232,7 @@ static int inet_fill_opts(inet_descriptor* desc, buf += 4; data_provided = (int) *buf++; arg_sz = get_int32(buf); - if (arg_sz > INET_MAX_BUFFER) { + if (arg_sz > INET_MAX_OPT_BUFFER) { RETURN_ERROR(); } buf += 4; @@ -5774,7 +6346,7 @@ static int sctp_fill_opts(inet_descriptor* desc, char* buf, int buflen, "miscalculated buffer size"); \ } \ need = (Index) + (N); \ - if (need > INET_MAX_BUFFER/sizeof(ErlDrvTermData)) { \ + if (need > INET_MAX_OPT_BUFFER/sizeof(ErlDrvTermData)) {\ RETURN_ERROR((Spec), -ENOMEM); \ } \ if (need > spec_allocated) { \ @@ -6199,13 +6771,15 @@ static int sctp_fill_opts(inet_descriptor* desc, char* buf, int buflen, if (ap.spp_flags & SPP_PMTUD_DISABLE) { i = LOAD_ATOM (spec, i, am_pmtud_disable); n++; } - +# ifdef HAVE_STRUCT_SCTP_PADDRPARAMS_SPP_SACKDELAY + /* SPP_SACKDELAY_* not in FreeBSD 7.1 */ if (ap.spp_flags & SPP_SACKDELAY_ENABLE) { i = LOAD_ATOM (spec, i, am_sackdelay_enable); n++; } if (ap.spp_flags & SPP_SACKDELAY_DISABLE) { i = LOAD_ATOM (spec, i, am_sackdelay_disable); n++; } # endif +# endif PLACE_FOR(spec, i, LOAD_NIL_CNT + LOAD_LIST_CNT + 2*LOAD_TUPLE_CNT); @@ -6625,7 +7199,7 @@ static int inet_ctl(inet_descriptor* desc, int cmd, char* buf, int len, } } DEBUGF(("inet_ctl(%ld): GETSTAT\r\n", (long) desc->port)); - if (dstlen > INET_MAX_BUFFER) /* sanity check */ + if (dstlen > INET_MAX_OPT_BUFFER) /* sanity check */ return 0; if (dstlen > rsize) { if ((dst = (char*) ALLOC(dstlen)) == NULL) @@ -6641,7 +7215,7 @@ static int inet_ctl(inet_descriptor* desc, int cmd, char* buf, int len, char* dst; int dstlen = 1 /* Reply code */ + len*5; DEBUGF(("inet_ctl(%ld): INET_REQ_SUBSCRIBE\r\n", (long) desc->port)); - if (dstlen > INET_MAX_BUFFER) /* sanity check */ + if (dstlen > INET_MAX_OPT_BUFFER) /* sanity check */ return 0; if (dstlen > rsize) { if ((dst = (char*) ALLOC(dstlen)) == NULL) @@ -6676,6 +7250,13 @@ static int inet_ctl(inet_descriptor* desc, int cmd, char* buf, int len, return inet_ctl_getiflist(desc, rbuf, rsize); } + case INET_REQ_GETIFADDRS: { + DEBUGF(("inet_ctl(%ld): GETIFADDRS\r\n", (long)desc->port)); + if (!IS_OPEN(desc)) + return ctl_xerror(EXBADPORT, rbuf, rsize); + return inet_ctl_getifaddrs(desc, rbuf, rsize); + } + case INET_REQ_IFGET: { DEBUGF(("inet_ctl(%ld): IFGET\r\n", (long)desc->port)); if (!IS_OPEN(desc)) diff --git a/erts/emulator/drivers/win32/win_efile.c b/erts/emulator/drivers/win32/win_efile.c index 04bd1139f5..6de08e2fa6 100644..100755 --- a/erts/emulator/drivers/win32/win_efile.c +++ b/erts/emulator/drivers/win32/win_efile.c @@ -862,6 +862,7 @@ efile_fileinfo(Efile_error* errInfo, Efile_info* pInfo, findbuf.nFileSizeLow = 0; findbuf.cFileName[0] = '\0'; + pInfo->links = 1; pInfo->modifyTime.year = 1980; pInfo->modifyTime.month = 1; pInfo->modifyTime.day = 1; @@ -874,6 +875,33 @@ efile_fileinfo(Efile_error* errInfo, Efile_info* pInfo, SYSTEMTIME SystemTime; FILETIME LocalFTime; + /*first check if we are a symlink */ + if (!info_for_link && (findbuf.dwFileAttributes & + FILE_ATTRIBUTE_REPARSE_POINT)){ + /* + * given that we know this is a symlink, + we should be able to find its target */ + char target_name[256]; + if (efile_readlink(errInfo, name, target_name,256) == 1) { + return efile_fileinfo(errInfo, pInfo, + target_name, info_for_link); + } + } + +#if 0 + /* number of links: */ + { + HANDLE handle; /* Handle returned by CreateFile() */ + BY_HANDLE_FILE_INFORMATION fileInfo; /* from CreateFile() */ + if (handle = CreateFile(name, GENERIC_READ, 0,NULL, + OPEN_EXISTING, 0, NULL)) { + GetFileInformationByHandle(handle, &fileInfo); + pInfo->links = fileInfo.nNumberOfLinks; + CloseHandle(handle); + } + } +#endif + #define GET_TIME(dst, src) \ if (!FileTimeToLocalFileTime(&findbuf.src, &LocalFTime) || \ !FileTimeToSystemTime(&LocalFTime, &SystemTime)) { \ @@ -908,7 +936,10 @@ if (!FileTimeToLocalFileTime(&findbuf.src, &LocalFTime) || \ pInfo->size_low = findbuf.nFileSizeLow; pInfo->size_high = findbuf.nFileSizeHigh; - if (findbuf.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) + if (info_for_link && (findbuf.dwFileAttributes & + FILE_ATTRIBUTE_REPARSE_POINT)) + pInfo->type = FT_SYMLINK; + else if (findbuf.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) pInfo->type = FT_DIRECTORY; else pInfo->type = FT_REGULAR; @@ -919,7 +950,6 @@ if (!FileTimeToLocalFileTime(&findbuf.src, &LocalFTime) || \ pInfo->access = FA_READ|FA_WRITE; pInfo->mode = dos_to_posix_mode(findbuf.dwFileAttributes, name); - pInfo->links = 1; pInfo->major_device = drive; pInfo->minor_device = 0; pInfo->inode = 0; @@ -1082,12 +1112,17 @@ char* buf; /* Buffer to write. */ size_t count; /* Number of bytes to write. */ { DWORD written; /* Bytes written in last operation. */ + OVERLAPPED overlapped; + OVERLAPPED* pOverlapped = NULL; if (flags & EFILE_MODE_APPEND) { - (void) SetFilePointer((HANDLE) fd, 0, NULL, FILE_END); + memset(&overlapped, 0, sizeof(overlapped)); + overlapped.Offset = 0xffffffff; + overlapped.OffsetHigh = 0xffffffff; + pOverlapped = &overlapped; } while (count > 0) { - if (!WriteFile((HANDLE) fd, buf, count, &written, NULL)) + if (!WriteFile((HANDLE) fd, buf, count, &written, pOverlapped)) return set_error(errInfo); buf += written; count -= written; @@ -1107,11 +1142,16 @@ efile_writev(Efile_error* errInfo, /* Where to return error codes */ size_t size) /* Number of bytes to write */ { int cnt; /* Buffers so far written */ + OVERLAPPED overlapped; + OVERLAPPED* pOverlapped = NULL; ASSERT(iovcnt >= 0); if (flags & EFILE_MODE_APPEND) { - (void) SetFilePointer((HANDLE) fd, 0, NULL, FILE_END); + memset(&overlapped, 0, sizeof(overlapped)); + overlapped.Offset = 0xffffffff; + overlapped.OffsetHigh = 0xffffffff; + pOverlapped = &overlapped; } for (cnt = 0; cnt < iovcnt; cnt++) { if (iov[cnt].iov_base && iov[cnt].iov_len > 0) { @@ -1123,7 +1163,7 @@ efile_writev(Efile_error* errInfo, /* Where to return error codes */ iov[cnt].iov_base + p, iov[cnt].iov_len - p, &w, - NULL)) + pOverlapped)) return set_error(errInfo); } } @@ -1343,6 +1383,48 @@ dos_to_posix_mode(int attr, const char *name) int efile_readlink(Efile_error* errInfo, char* name, char* buffer, size_t size) { + /* + * load dll and see if we have CreateSymbolicLink at runtime: + * (Vista only) + */ + HINSTANCE hModule = NULL; + if ((hModule = LoadLibrary("kernel32.dll")) != NULL) { + typedef DWORD (WINAPI * GETFINALPATHNAMEBYHANDLEPTR)( + HANDLE hFile, + LPCSTR lpFilePath, + DWORD cchFilePath, + DWORD dwFlags); + + GETFINALPATHNAMEBYHANDLEPTR pGetFinalPathNameByHandle = + (GETFINALPATHNAMEBYHANDLEPTR)GetProcAddress(hModule, "GetFinalPathNameByHandleA"); + + if (pGetFinalPathNameByHandle == NULL) { + FreeLibrary(hModule); + } else { + /* first check if file is a symlink; {error, einval} otherwise */ + DWORD fileAttributes = GetFileAttributes(name); + if ((fileAttributes & FILE_ATTRIBUTE_REPARSE_POINT)) { + BOOLEAN success = 0; + HANDLE h = CreateFile(name, GENERIC_READ, 0,NULL, OPEN_EXISTING, 0, NULL); + if(h != INVALID_HANDLE_VALUE) { + success = pGetFinalPathNameByHandle(h, buffer, size,0); + /* GetFinalPathNameByHandle prepends path with "\\?\": */ + sprintf(buffer, buffer+4); + CloseHandle(h); + } + FreeLibrary(hModule); + if (success) { + return 1; + } else { + return set_error(errInfo); + } + } else { + FreeLibrary(hModule); + errno = EINVAL; + return check_error(-1, errInfo); + } + } + } errno = ENOTSUP; return check_error(-1, errInfo); } @@ -1427,13 +1509,46 @@ efile_altname(Efile_error* errInfo, char* orig_name, char* buffer, size_t size) int efile_link(Efile_error* errInfo, char* old, char* new) { - errno = ENOTSUP; - return check_error(-1, errInfo); + if(!CreateHardLink(new, old, NULL)) { + return set_error(errInfo); + } + return 1; } int efile_symlink(Efile_error* errInfo, char* old, char* new) { + /* + * Load dll and see if we have CreateSymbolicLink at runtime: + * (Vista only) + */ + HINSTANCE hModule = NULL; + if ((hModule = LoadLibrary("kernel32.dll")) != NULL) { + typedef BOOLEAN (WINAPI * CREATESYMBOLICLINKFUNCPTR) ( + LPCSTR lpSymlinkFileName, + LPCSTR lpTargetFileName, + DWORD dwFlags); + + CREATESYMBOLICLINKFUNCPTR pCreateSymbolicLink = + (CREATESYMBOLICLINKFUNCPTR) GetProcAddress(hModule, + "CreateSymbolicLinkA"); + /* A for MBCS, W for UNICODE... char* above implies 'A'! */ + if (pCreateSymbolicLink != NULL) { + DWORD attr = GetFileAttributes(old); + int flag = (attr != INVALID_FILE_ATTRIBUTES && + attr & FILE_ATTRIBUTE_DIRECTORY) ? 1 : 0; + /* SYMBOLIC_LINK_FLAG_DIRECTORY = 1 */ + BOOLEAN success = pCreateSymbolicLink(new, old, flag); + FreeLibrary(hModule); + + if (success) { + return 1; + } else { + return set_error(errInfo); + } + } else + FreeLibrary(hModule); + } errno = ENOTSUP; return check_error(-1, errInfo); } diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c index af4ab693dc..01ba773688 100644 --- a/erts/emulator/sys/unix/sys.c +++ b/erts/emulator/sys/unix/sys.c @@ -75,6 +75,7 @@ static erts_smp_rwmtx_t environ_rwmtx; #include "erl_sys_driver.h" #include "erl_check_io.h" +#include "erl_cpu_topology.h" #ifndef DISABLE_VFORK #define DISABLE_VFORK 0 @@ -399,7 +400,7 @@ typedef struct { #ifdef ERTS_THR_HAVE_SIG_FUNCS sigset_t saved_sigmask; #endif - int unbind_child; + int sched_bind_data; } erts_thr_create_data_t; /* @@ -410,15 +411,13 @@ static void * thr_create_prepare(void) { erts_thr_create_data_t *tcdp; - ErtsSchedulerData *esdp; tcdp = erts_alloc(ERTS_ALC_T_TMP, sizeof(erts_thr_create_data_t)); #ifdef ERTS_THR_HAVE_SIG_FUNCS erts_thr_sigmask(SIG_BLOCK, &thr_create_sigmask, &tcdp->saved_sigmask); #endif - esdp = erts_get_scheduler_data(); - tcdp->unbind_child = esdp && erts_is_scheduler_bound(esdp); + tcdp->sched_bind_data = erts_sched_bind_atthrcreate_prepare(); return (void *) tcdp; } @@ -430,6 +429,8 @@ thr_create_cleanup(void *vtcdp) { erts_thr_create_data_t *tcdp = (erts_thr_create_data_t *) vtcdp; + erts_sched_bind_atthrcreate_parent(tcdp->sched_bind_data); + #ifdef ERTS_THR_HAVE_SIG_FUNCS /* Restore signalmask... */ erts_thr_sigmask(SIG_SETMASK, &tcdp->saved_sigmask, NULL); @@ -456,12 +457,7 @@ thr_create_prepare_child(void *vtcdp) erts_thread_disable_fpe(); #endif - if (tcdp->unbind_child) { - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); - erts_unbind_from_cpu(erts_cpuinfo); - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); - } - + erts_sched_bind_atthrcreate_child(tcdp->sched_bind_data); } #endif /* #ifdef USE_THREADS */ @@ -1461,9 +1457,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op CHLD_STAT_LOCK; - unbind = erts_is_scheduler_bound(NULL); - if (unbind) - erts_smp_rwmtx_rlock(&erts_cpu_bind_rwmtx); + unbind = erts_sched_bind_atfork_prepare(); #if !DISABLE_VFORK /* See fork/vfork discussion before this function. */ @@ -1476,7 +1470,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op if (pid == 0) { /* The child! Setup child... */ - if (unbind && erts_unbind_from_cpu(erts_cpuinfo) != 0) + if (erts_sched_bind_atfork_child(unbind) != 0) goto child_error; /* OBSERVE! @@ -1577,8 +1571,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op cs_argv[CS_ARGV_PROGNAME_IX] = child_setup_prog; cs_argv[CS_ARGV_WD_IX] = opts->wd ? opts->wd : "."; - cs_argv[CS_ARGV_UNBIND_IX] - = (unbind ? erts_get_unbind_from_cpu_str(erts_cpuinfo) : "false"); + cs_argv[CS_ARGV_UNBIND_IX] = erts_sched_bind_atvfork_child(unbind); cs_argv[CS_ARGV_FD_CR_IX] = fd_close_range; for (i = 0; i < CS_ARGV_NO_OF_DUP2_OPS; i++) cs_argv[CS_ARGV_DUP2_OP_IX(i)] = &dup2_op[i][0]; @@ -1627,8 +1620,7 @@ static ErlDrvData spawn_start(ErlDrvPort port_num, char* name, SysDriverOpts* op } #endif - if (unbind) - erts_smp_rwmtx_runlock(&erts_cpu_bind_rwmtx); + erts_sched_bind_atfork_parent(unbind); if (pid == -1) { saved_errno = errno; diff --git a/erts/emulator/sys/win32/sys.c b/erts/emulator/sys/win32/sys.c index 15d4cd7361..39b04b26a9 100644 --- a/erts/emulator/sys/win32/sys.c +++ b/erts/emulator/sys/win32/sys.c @@ -31,7 +31,7 @@ #include "global.h" #include "erl_threads.h" #include "../../drivers/win32/win_con.h" - +#include "erl_cpu_topology.h" void erts_sys_init_float(void); @@ -97,7 +97,7 @@ static int driver_write(long, HANDLE, byte*, int); static void common_stop(int); static int create_file_thread(struct async_io* aio, int mode); #ifdef ERTS_SMP -static void close_active_handles(ErlDrvPort, const HANDLE* handles, int cnt); +static void close_active_handle(ErlDrvPort, HANDLE handle); static DWORD WINAPI threaded_handle_closer(LPVOID param); #endif static DWORD WINAPI threaded_reader(LPVOID param); @@ -137,7 +137,11 @@ static BOOL win_console = FALSE; static OSVERSIONINFO int_os_version; /* Version information for Win32. */ -#ifdef ERTS_SMP +/*#define USE_CANCELIOEX + Disabled the use of CancelIoEx as its been seen to cause problem with some + drivers. Not sure what to blame; faulty drivers or some form of invalid use. +*/ +#if defined(ERTS_SMP) && defined(USE_CANCELIOEX) static BOOL (WINAPI *fpCancelIoEx)(HANDLE,LPOVERLAPPED); #endif @@ -684,6 +688,7 @@ release_driver_data(DriverData* dp) erts_smp_mtx_lock(&sys_driver_data_lock); #ifdef ERTS_SMP +#ifdef USE_CANCELIOEX if (fpCancelIoEx != NULL) { if (dp->in.thread == (HANDLE) -1 && dp->in.fd != INVALID_HANDLE_VALUE) { (*fpCancelIoEx)(dp->in.fd, NULL); @@ -692,10 +697,12 @@ release_driver_data(DriverData* dp) (*fpCancelIoEx)(dp->out.fd, NULL); } } - else { + else +#endif + { /* This is a workaround for the fact that CancelIo cant cancel requests issued by another thread and that we cant use - CancelIoEx as that's only availabele in Vista etc. + CancelIoEx as that's only available in Vista etc. R14: Avoid scheduler deadlock by only wait for 10ms, and then spawn a thread that will keep waiting in in order to close handles. */ HANDLE handles[2]; @@ -706,7 +713,7 @@ release_driver_data(DriverData* dp) dp->in.fd = INVALID_HANDLE_VALUE; DEBUGF(("Waiting for the in event thingie")); if (WaitForSingleObject(dp->in.ov.hEvent,timeout) == WAIT_TIMEOUT) { - handles[i++] = dp->in.ov.hEvent; + close_active_handle(dp->port_num, dp->in.ov.hEvent); dp->in.ov.hEvent = NULL; timeout = 0; } @@ -717,14 +724,11 @@ release_driver_data(DriverData* dp) dp->out.fd = INVALID_HANDLE_VALUE; DEBUGF(("Waiting for the out event thingie")); if (WaitForSingleObject(dp->out.ov.hEvent,timeout) == WAIT_TIMEOUT) { - handles[i++] = dp->out.ov.hEvent; + close_active_handle(dp->port_num, dp->out.ov.hEvent); dp->out.ov.hEvent = NULL; } DEBUGF(("...done\n")); } - if (i > 0) { - close_active_handles(dp->port_num, handles, i); - } } #else if (dp->in.thread == (HANDLE) -1 && dp->in.fd != INVALID_HANDLE_VALUE) { @@ -772,42 +776,82 @@ release_driver_data(DriverData* dp) #ifdef ERTS_SMP -struct handles_to_be_closed -{ - int cnt; - HANDLE handles[2]; +struct handles_to_be_closed { + HANDLE handles[MAXIMUM_WAIT_OBJECTS]; + unsigned cnt; }; +static struct handles_to_be_closed* htbc_curr = NULL; +CRITICAL_SECTION htbc_lock; -static void close_active_handles(ErlDrvPort port_num, const HANDLE* handles, int cnt) +static void close_active_handle(ErlDrvPort port_num, HANDLE handle) { - DWORD tid; - HANDLE thread; + struct handles_to_be_closed* htbc; int i; - struct handles_to_be_closed* htbc = erts_alloc(ERTS_ALC_T_DRV_TAB, - sizeof(struct handles_to_be_closed)); - htbc->cnt = cnt; - for (i=0; i < cnt; ++i) { - htbc->handles[i] = handles[i]; - (void) driver_select(port_num, (ErlDrvEvent)handles[i], - ERL_DRV_USE_NO_CALLBACK, 0); + EnterCriticalSection(&htbc_lock); + htbc = htbc_curr; + if (htbc == NULL || htbc->cnt >= MAXIMUM_WAIT_OBJECTS) { + DWORD tid; + HANDLE thread; + + htbc = (struct handles_to_be_closed*) erts_alloc(ERTS_ALC_T_DRV_TAB, + sizeof(*htbc)); + htbc->handles[0] = CreateAutoEvent(FALSE); + htbc->cnt = 1; + thread = (HANDLE *) _beginthreadex(NULL, 0, threaded_handle_closer, htbc, 0, &tid); + CloseHandle(thread); } - thread = (HANDLE *) _beginthreadex(NULL, 0, threaded_handle_closer, htbc, 0, &tid); - CloseHandle(thread); + htbc->handles[htbc->cnt++] = handle; + driver_select(port_num, (ErlDrvEvent)handle, ERL_DRV_USE_NO_CALLBACK, 0); + SetEvent(htbc->handles[0]); + htbc_curr = htbc; + LeaveCriticalSection(&htbc_lock); } - static DWORD WINAPI threaded_handle_closer(LPVOID param) { struct handles_to_be_closed* htbc = (struct handles_to_be_closed*) param; - int i; - DEBUGF(("threaded_handle_closer waiting for %d handles\r\n",htbc->cnt)); - WaitForMultipleObjects(htbc->cnt, htbc->handles, TRUE, INFINITE); - for (i=0; i < htbc->cnt; ++i) { - CloseHandle(htbc->handles[i]); + unsigned ix; + DWORD res; + DEBUGF(("threaded_handle_closer %p started\r\n", htbc)); + EnterCriticalSection(&htbc_lock); + for (;;) { + { + HANDLE* handles = htbc->handles; + unsigned cnt = htbc->cnt; + DWORD timeout = (htbc == htbc_curr) ? INFINITE : 10*1000; + + LeaveCriticalSection(&htbc_lock); + DEBUGF(("threaded_handle_closer %p waiting for %d handles\r\n", htbc, cnt)); + res = WaitForMultipleObjects(cnt, handles, FALSE, timeout); + } + EnterCriticalSection(&htbc_lock); + switch (res) { + case WAIT_OBJECT_0: + case WAIT_TIMEOUT: + break; /* got some more handles to wait for maybe */ + default: + ix = res - WAIT_OBJECT_0; + if (ix > 0 && ix < htbc->cnt) { + CloseHandle(htbc->handles[ix]); + htbc->handles[ix] = htbc->handles[--htbc->cnt]; + } + } + if (htbc != htbc_curr) { + if (htbc->cnt == 1) { /* no real handles left */ + break; + } + /* The thread with most free slots will be "current" */ + if (htbc->cnt < htbc_curr->cnt) { + htbc_curr = htbc; + DEBUGF(("threaded_handle_closer %p made current\r\n", htbc)); + } + } } + LeaveCriticalSection(&htbc_lock); + CloseHandle(htbc->handles[0]); erts_free(ERTS_ALC_T_DRV_TAB, htbc); - DEBUGF(("threaded_handle_closer terminating\r\n")); + DEBUGF(("threaded_handle_closer %p terminating\r\n", htbc)); return 0; } #endif /* ERTS_SMP */ @@ -1101,11 +1145,10 @@ static int spawn_init() { int i; -#ifdef ERTS_SMP +#if defined(ERTS_SMP) && defined(USE_CANCELIOEX) HMODULE module = GetModuleHandle("kernel32"); - fpCancelIoEx = (module != NULL) ? - (BOOL (WINAPI *)(HANDLE,LPOVERLAPPED)) - GetProcAddress(module,"CancelIoEx") : NULL; + fpCancelIoEx = (BOOL (WINAPI *)(HANDLE,LPOVERLAPPED)) + ((module != NULL) ? GetProcAddress(module,"CancelIoEx") : NULL); DEBUGF(("fpCancelIoEx = %p\r\n", fpCancelIoEx)); #endif driver_data = (struct driver_data *) @@ -2973,13 +3016,50 @@ check_supported_os_version(void) } #ifdef USE_THREADS -#ifdef ERTS_ENABLE_LOCK_COUNT + +typedef struct { + int sched_bind_data; +} erts_thr_create_data_t; + +/* + * thr_create_prepare() is called in parent thread before thread creation. + * Returned value is passed as argument to thr_create_cleanup(). + */ +static void * +thr_create_prepare(void) +{ + erts_thr_create_data_t *tcdp; + + tcdp = erts_alloc(ERTS_ALC_T_TMP, sizeof(erts_thr_create_data_t)); + tcdp->sched_bind_data = erts_sched_bind_atthrcreate_prepare(); + + return (void *) tcdp; +} + + +/* thr_create_cleanup() is called in parent thread after thread creation. */ +static void +thr_create_cleanup(void *vtcdp) +{ + erts_thr_create_data_t *tcdp = (erts_thr_create_data_t *) vtcdp; + + erts_sched_bind_atthrcreate_parent(tcdp->sched_bind_data); + + erts_free(ERTS_ALC_T_TMP, tcdp); +} + static void thr_create_prepare_child(void *vtcdp) { + erts_thr_create_data_t *tcdp = (erts_thr_create_data_t *) vtcdp; + +#ifdef ERTS_ENABLE_LOCK_COUNT erts_lcnt_thread_setup(); -} #endif /* ERTS_ENABLE_LOCK_COUNT */ + + erts_sched_bind_atthrcreate_child(tcdp->sched_bind_data); +} + #endif /* USE_THREADS */ void @@ -2991,9 +3071,13 @@ erts_sys_pre_init(void) #ifdef USE_THREADS { erts_thr_init_data_t eid = ERTS_THR_INIT_DATA_DEF_INITER; -#ifdef ERTS_ENABLE_LOCK_COUNT + eid.thread_create_child_func = thr_create_prepare_child; -#endif + /* Before creation in parent */ + eid.thread_create_prepare_func = thr_create_prepare; + /* After creation in parent */ + eid.thread_create_parent_func = thr_create_cleanup, + erts_thr_init(&eid); #ifdef ERTS_ENABLE_LOCK_COUNT erts_lcnt_init(); @@ -3027,6 +3111,7 @@ void erl_sys_init(void) #ifdef ERTS_SMP erts_smp_tsd_key_create(&win32_errstr_key); + InitializeCriticalSection(&htbc_lock); #endif erts_smp_atomic_init(&pipe_creation_counter,0); /* diff --git a/erts/emulator/test/distribution_SUITE.erl b/erts/emulator/test/distribution_SUITE.erl index 7c19274696..79252d0593 100644 --- a/erts/emulator/test/distribution_SUITE.erl +++ b/erts/emulator/test/distribution_SUITE.erl @@ -27,6 +27,7 @@ -export([all/1, ping/1, bulk_send/1, bulk_send_small/1, bulk_send_big/1, + bulk_send_bigbig/1, local_send/1, local_send_small/1, local_send_big/1, local_send_legal/1, link_to_busy/1, exit_to_busy/1, lost_exit/1, link_to_dead/1, link_to_dead_new_node/1, @@ -50,7 +51,8 @@ -export([sender/3, receiver2/2, dummy_waiter/0, dead_process/0, roundtrip/1, bounce/1, do_dist_auto_connect/1, inet_rpc_server/1, dist_parallel_sender/3, dist_parallel_receiver/0, - dist_evil_parallel_receiver/0]). + dist_evil_parallel_receiver/0, + sendersender/4, sendersender2/4]). all(suite) -> [ ping, bulk_send, local_send, link_to_busy, exit_to_busy, @@ -121,7 +123,7 @@ bulk_send(doc) -> "the time. This tests that a process that is suspended on a ", "busy port will eventually be resumed."]; bulk_send(suite) -> - [bulk_send_small, bulk_send_big]. + [bulk_send_small, bulk_send_big, bulk_send_bigbig]. bulk_send_small(Config) when is_list(Config) -> ?line bulk_send(64, 32). @@ -129,6 +131,9 @@ bulk_send_small(Config) when is_list(Config) -> bulk_send_big(Config) when is_list(Config) -> ?line bulk_send(32, 64). +bulk_send_bigbig(Config) when is_list(Config) -> + ?line bulk_sendsend(32*5, 4). + bulk_send(Terms, BinSize) -> ?line Dog = test_server:timetrap(test_server:seconds(30)), @@ -145,6 +150,53 @@ bulk_send(Terms, BinSize) -> ?line test_server:timetrap_cancel(Dog), {comment, integer_to_list(trunc(Size/1024/Elapsed+0.5)) ++ " K/s"}. +bulk_sendsend(Terms, BinSize) -> + {Rate1, MonitorCount1} = bulk_sendsend2(Terms, BinSize, 5), + {Rate2, MonitorCount2} = bulk_sendsend2(Terms, BinSize, 995), + Ratio = if MonitorCount2 == 0 -> MonitorCount1 / 1.0; + true -> MonitorCount1 / MonitorCount2 + end, + %% A somewhat arbitrary ratio, but hopefully one that will accomodate + %% a wide range of CPU speeds. + true = (Ratio > 8.0), + {comment, + integer_to_list(Rate1) ++ " K/s, " ++ + integer_to_list(Rate2) ++ " K/s, " ++ + integer_to_list(MonitorCount1) ++ " monitor msgs, " ++ + integer_to_list(MonitorCount2) ++ " monitor msgs, " ++ + float_to_list(Ratio) ++ " monitor ratio"}. + +bulk_sendsend2(Terms, BinSize, BusyBufSize) -> + ?line Dog = test_server:timetrap(test_server:seconds(30)), + + ?line io:format("Sending ~w binaries, each of size ~w K", + [Terms, BinSize]), + ?line {ok, NodeRecv} = start_node(bulk_receiver), + ?line Recv = spawn(NodeRecv, erlang, apply, [fun receiver/2, [0, 0]]), + ?line Bin = list_to_binary(lists:duplicate(BinSize*1024, 253)), + ?line Size = Terms*size(Bin), + + %% SLF LEFT OFF HERE. + %% When the caller uses small hunks, like 4k via + %% bulk_sendsend(32*5, 4), then (on my laptop at least), we get + %% zero monitor messages. But if we use "+zdbbl 5", then we + %% get a lot of monitor messages. So, if we can count up the + %% total number of monitor messages that we get when running both + %% default busy size and "+zdbbl 5", and if the 5 case gets + %% "many many more" monitor messages, then we know we're working. + + ?line {ok, NodeSend} = start_node(bulk_sender, "+zdbbl " ++ integer_to_list(BusyBufSize)), + ?line _Send = spawn(NodeSend, erlang, apply, [fun sendersender/4, [self(), Recv, Bin, Terms]]), + ?line {Elapsed, {TermsN, SizeN}, MonitorCount} = + receive {sendersender, BigRes} -> + BigRes + end, + ?line stop_node(NodeRecv), + ?line stop_node(NodeSend), + + ?line test_server:timetrap_cancel(Dog), + {trunc(SizeN/1024/Elapsed+0.5), MonitorCount}. + sender(To, _Bin, 0) -> To ! {done, self()}, receive @@ -155,6 +207,43 @@ sender(To, Bin, Left) -> To ! {term, Bin}, sender(To, Bin, Left-1). +%% Sender process to be run on a slave node + +sendersender(Parent, To, Bin, Left) -> + erlang:system_monitor(self(), [busy_dist_port]), + [spawn(fun() -> sendersender2(To, Bin, Left, false) end) || + _ <- lists:seq(1,1)], + {USec, {Res, MonitorCount}} = + timer:tc(?MODULE, sendersender2, [To, Bin, Left, true]), + Parent ! {sendersender, {USec/1000000, Res, MonitorCount}}. + +sendersender2(To, Bin, Left, SendDone) -> + sendersender3(To, Bin, Left, SendDone, 0). + +sendersender3(To, _Bin, 0, SendDone, MonitorCount) -> + if SendDone -> + To ! {done, self()}; + true -> + ok + end, + receive + {monitor, _Pid, _Type, _Info} = M -> + sendersender3(To, _Bin, 0, SendDone, MonitorCount + 1) + after 0 -> + if SendDone -> + receive + Any when is_tuple(Any), size(Any) == 2 -> + {Any, MonitorCount} + end; + true -> + exit(normal) + end + end; +sendersender3(To, Bin, Left, SendDone, MonitorCount) -> + To ! {term, Bin}, + %%timer:sleep(50), + sendersender3(To, Bin, Left-1, SendDone, MonitorCount). + %% Receiver process to be run on a slave node. receiver(Terms, Size) -> diff --git a/erts/emulator/test/nif_SUITE.erl b/erts/emulator/test/nif_SUITE.erl index f45cfa3e4a..42947aa6be 100644 --- a/erts/emulator/test/nif_SUITE.erl +++ b/erts/emulator/test/nif_SUITE.erl @@ -73,7 +73,7 @@ basic(Config) when is_list(Config) -> ?line true = (lib_version() =/= undefined), ?line [{load,1,1,101},{lib_version,1,2,102}] = call_history(), ?line [] = call_history(), - ?line [?MODULE] = erlang:system_info(taints), + ?line true = lists:member(?MODULE, erlang:system_info(taints)), ok. reload(doc) -> ["Test reload callback in nif lib"]; @@ -107,7 +107,8 @@ reload(Config) when is_list(Config) -> ?line true = erlang:purge_module(nif_mod), ?line [{unload,1,3,103}] = nif_mod_call_history(), - ?line [?MODULE, nif_mod] = erlang:system_info(taints), + ?line true = lists:member(?MODULE, erlang:system_info(taints)), + ?line true = lists:member(nif_mod, erlang:system_info(taints)), ?line verify_tmpmem(TmpMem), ok. @@ -197,7 +198,8 @@ upgrade(Config) when is_list(Config) -> ?line true = erlang:purge_module(nif_mod), ?line [{unload,2,4,204}] = nif_mod_call_history(), - ?line [?MODULE, nif_mod] = erlang:system_info(taints), + ?line true = lists:member(?MODULE, erlang:system_info(taints)), + ?line true = lists:member(nif_mod, erlang:system_info(taints)), ?line verify_tmpmem(TmpMem), ok. @@ -727,7 +729,8 @@ resource_takeover(Config) when is_list(Config) -> ?line ok = forget_resource(AN4), ?line [] = nif_mod_call_history(), - ?line [?MODULE, nif_mod] = erlang:system_info(taints), + ?line true = lists:member(?MODULE, erlang:system_info(taints)), + ?line true = lists:member(nif_mod, erlang:system_info(taints)), ?line verify_tmpmem(TmpMem), ok. diff --git a/erts/emulator/test/port_SUITE.erl b/erts/emulator/test/port_SUITE.erl index a7476ca9bb..7fe532abd0 100644 --- a/erts/emulator/test/port_SUITE.erl +++ b/erts/emulator/test/port_SUITE.erl @@ -2302,14 +2302,35 @@ load_driver(Dir, Driver) -> end. -close_deaf_port(doc) -> ["Send data to port program that does not read it, then close port."]; +close_deaf_port(doc) -> ["Send data to port program that does not read it, then close port." + "Primary targeting Windows to test threaded_handle_closer in sys.c"]; close_deaf_port(suite) -> []; close_deaf_port(Config) when is_list(Config) -> ?line Dog = test_server:timetrap(test_server:seconds(100)), ?line DataDir = ?config(data_dir, Config), ?line DeadPort = os:find_executable("dead_port", DataDir), - ?line Port = open_port({spawn,DeadPort++" 60"},[]), ?line erlang:port_command(Port,"Hello, can you hear me!?!?"), ?line port_close(Port), - ok. + + Res = close_deaf_port_1(0, DeadPort), + io:format("Waiting for OS procs to terminate...\n"), + receive after 5*1000 -> ok end, + ?line test_server:timetrap_cancel(Dog), + Res. + +close_deaf_port_1(1000, _) -> + ok; +close_deaf_port_1(N, Cmd) -> + Timeout = integer_to_list(random:uniform(5*1000)), + ?line try open_port({spawn_executable,Cmd},[{args,[Timeout]}]) of + Port -> + ?line erlang:port_command(Port,"Hello, can you hear me!?!?"), + ?line port_close(Port), + close_deaf_port_1(N+1, Cmd) + catch + _:eagain -> + {comment, "Could not spawn more than " ++ integer_to_list(N) ++ " OS processes."} + end. + + diff --git a/erts/emulator/test/port_SUITE_data/dead_port.c b/erts/emulator/test/port_SUITE_data/dead_port.c index 6fa77112be..68e96fbf14 100644 --- a/erts/emulator/test/port_SUITE_data/dead_port.c +++ b/erts/emulator/test/port_SUITE_data/dead_port.c @@ -72,14 +72,14 @@ char *argv[]; { int x; if (argc < 2) { - fprintf(stderr,"Usage %s <seconds>\n",argv[0]); + fprintf(stderr,"Usage %s <milliseconds>\n",argv[0]); return 1; } if ((x = atoi(argv[1])) <= 0) { - fprintf(stderr,"Usage %s <seconds>\n",argv[0]); + fprintf(stderr,"Usage %s <milliseconds>\n",argv[0]); return 1; } - delay(x*1000); + delay(x); return 0; } diff --git a/erts/emulator/test/send_term_SUITE.erl b/erts/emulator/test/send_term_SUITE.erl index 5fd01a9ac5..819aa34886 100644 --- a/erts/emulator/test/send_term_SUITE.erl +++ b/erts/emulator/test/send_term_SUITE.erl @@ -1,7 +1,7 @@ %% %% %CopyrightBegin% %% -%% Copyright Ericsson AB 2005-2009. All Rights Reserved. +%% Copyright Ericsson AB 2005-2010. All Rights Reserved. %% %% The contents of this file are subject to the Erlang Public License, %% Version 1.1, (the "License"); you may not use this file except in @@ -61,7 +61,7 @@ basic(Config) when is_list(Config) -> ?line ExpectExt2Term = term(P, 5), %% ERL_DRV_INT, ERL_DRV_UINT - ?line case erlang:system_info(wordsize) of + ?line case erlang:system_info({wordsize, external}) of 4 -> ?line {-1, 4294967295} = term(P, 6); 8 -> diff --git a/erts/emulator/test/system_info_SUITE.erl b/erts/emulator/test/system_info_SUITE.erl index ba433d4e11..cd940f3ddf 100644 --- a/erts/emulator/test/system_info_SUITE.erl +++ b/erts/emulator/test/system_info_SUITE.erl @@ -132,6 +132,7 @@ misc_smoke_tests(Config) when is_list(Config) -> ?line true = is_binary(erlang:system_info(procs)), ?line true = is_binary(erlang:system_info(loaded)), ?line true = is_binary(erlang:system_info(dist)), + ?line ok = try erlang:system_info({cpu_topology,erts_get_cpu_topology_error_case}), fail catch error:badarg -> ok end, ?line ok. |