diff options
author | Kjell Winblad <[email protected]> | 2019-04-10 15:50:54 +0200 |
---|---|---|
committer | Kjell Winblad <[email protected]> | 2019-04-10 15:50:54 +0200 |
commit | 3d712ffc9d08dad3a42ea988869eaef8cd99b53b (patch) | |
tree | 25ce2b16b9f847088997b0d4e77af46dbb6d2d66 | |
parent | 326c3cb70c1b37c794b781a42c50725766098810 (diff) | |
parent | c5e9766712436bea2b91bccd062f66a3ad1841bb (diff) | |
download | otp-3d712ffc9d08dad3a42ea988869eaef8cd99b53b.tar.gz otp-3d712ffc9d08dad3a42ea988869eaef8cd99b53b.tar.bz2 otp-3d712ffc9d08dad3a42ea988869eaef8cd99b53b.zip |
Merge branch 'kjell/stdlib/ets_decentralized_counters/PR-2190/OTP-15623'
* kjell/stdlib/ets_decentralized_counters/PR-2190/OTP-15623:
Decentralized counters for ETS ordered_set with write_concurrency
-rw-r--r-- | erts/doc/src/erl.xml | 23 | ||||
-rw-r--r-- | erts/emulator/Makefile.in | 2 | ||||
-rw-r--r-- | erts/emulator/beam/erl_bif_info.c | 2 | ||||
-rw-r--r-- | erts/emulator/beam/erl_cpu_topology.c | 41 | ||||
-rw-r--r-- | erts/emulator/beam/erl_cpu_topology.h | 22 | ||||
-rw-r--r-- | erts/emulator/beam/erl_db.c | 149 | ||||
-rw-r--r-- | erts/emulator/beam/erl_db.h | 12 | ||||
-rw-r--r-- | erts/emulator/beam/erl_db_catree.c | 60 | ||||
-rw-r--r-- | erts/emulator/beam/erl_db_catree.h | 5 | ||||
-rw-r--r-- | erts/emulator/beam/erl_db_hash.c | 63 | ||||
-rw-r--r-- | erts/emulator/beam/erl_db_tree.c | 97 | ||||
-rw-r--r-- | erts/emulator/beam/erl_db_tree_util.h | 18 | ||||
-rw-r--r-- | erts/emulator/beam/erl_db_util.h | 19 | ||||
-rw-r--r-- | erts/emulator/beam/erl_flxctr.c | 370 | ||||
-rw-r--r-- | erts/emulator/beam/erl_flxctr.h | 406 | ||||
-rw-r--r-- | erts/emulator/beam/erl_init.c | 30 | ||||
-rw-r--r-- | erts/emulator/beam/erl_process.h | 2 | ||||
-rw-r--r-- | lib/stdlib/test/ets_SUITE.erl | 140 |
18 files changed, 1357 insertions, 104 deletions
diff --git a/erts/doc/src/erl.xml b/erts/doc/src/erl.xml index 88ddb03e97..471d7caa5a 100644 --- a/erts/doc/src/erl.xml +++ b/erts/doc/src/erl.xml @@ -636,6 +636,29 @@ produces a crash dump. On Unix systems, sending an emulator process a <c>SIGUSR1</c> signal also forces a crash dump.</p> </item> + <tag><marker id="+dcg"/><c><![CDATA[+rg DecentralizedCounterGroupsLimit]]></c></tag> + <item> + <p>Limits the number of decentralized counter groups used by + decentralized counters optimized for update operations in the + Erlang runtime system. By default, the limit is 256.</p> + <p>When the number of schedulers is less than or equal to the + limit, each scheduler has its own group. When the + number of schedulers is larger than the groups limit, + schedulers share groups. Shared groups degrade + the performance for updating counters while many reader groups + degrade the performance for reading counters. So, the limit is a tradeoff + between performance for update operations and performance for + read operations. Each group consumes 64 bytes in each + counter.</p> + <p>Notice that a runtime system using decentralized + counter groups benefits from <seealso marker="#+sbt">binding + schedulers to logical processors</seealso>, as the groups are + distributed better between schedulers with this option.</p> + <p>This option only affects decentralized counters used for + the counters that are keeping track of the memory consumption + and the number of terms in ETS tables of type ordered_set with + the write_concurrency option activated.</p> + </item> <tag><marker id="+e"/><c><![CDATA[+e Number]]></c></tag> <item> <p>Sets the maximum number of ETS tables. This limit is diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in index 21351df656..448f41b523 100644 --- a/erts/emulator/Makefile.in +++ b/erts/emulator/Makefile.in @@ -894,7 +894,7 @@ RUN_OBJS += \ $(OBJDIR)/erl_ptab.o $(OBJDIR)/erl_map.o \ $(OBJDIR)/erl_msacc.o $(OBJDIR)/erl_lock_flags.o \ $(OBJDIR)/erl_io_queue.o $(OBJDIR)/erl_db_catree.o \ - $(ESOCK_RUN_OBJS) + $(ESOCK_RUN_OBJS) $(OBJDIR)/erl_flxctr.o LTTNG_OBJS = $(OBJDIR)/erlang_lttng.o diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index 39d42d9757..a2f6284c9d 100644 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -3028,6 +3028,8 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) BIF_RET(erts_nif_taints(BIF_P)); } else if (ERTS_IS_ATOM_STR("reader_groups_map", BIF_ARG_1)) { BIF_RET(erts_get_reader_groups_map(BIF_P)); + } else if (ERTS_IS_ATOM_STR("decentralized_counter_groups_map", BIF_ARG_1)) { + BIF_RET(erts_get_decentralized_counter_groups_map(BIF_P)); } else if (ERTS_IS_ATOM_STR("dist_buf_busy_limit", BIF_ARG_1)) { Uint hsz = 0; diff --git a/erts/emulator/beam/erl_cpu_topology.c b/erts/emulator/beam/erl_cpu_topology.c index 6f8d2f8c35..6a4f43297e 100644 --- a/erts/emulator/beam/erl_cpu_topology.c +++ b/erts/emulator/beam/erl_cpu_topology.c @@ -34,6 +34,7 @@ #include "error.h" #include "bif.h" #include "erl_cpu_topology.h" +#include "erl_flxctr.h" #define ERTS_MAX_READER_GROUPS 64 @@ -58,6 +59,7 @@ static erts_cpu_info_t *cpuinfo; static int max_main_threads; static int reader_groups; +static int decentralized_counter_groups; static ErtsCpuBindData *scheduler2cpu_map; static erts_rwmtx_t cpuinfo_rwmtx; @@ -127,6 +129,8 @@ static erts_cpu_groups_map_t *cpu_groups_maps; static erts_cpu_groups_map_t *reader_groups_map; +static erts_cpu_groups_map_t *decentralized_counter_groups_map; + #define ERTS_TOPOLOGY_CG ERTS_TOPOLOGY_MAX_DEPTH #define ERTS_MAX_CPU_TOPOLOGY_ID ((int) 0xffff) @@ -138,6 +142,7 @@ static void cpu_bind_order_sort(erts_cpu_topology_t *cpudata, static void write_schedulers_bind_change(erts_cpu_topology_t *cpudata, int size); static void reader_groups_callback(int, ErtsSchedulerData *, int, void *); +static void flxctr_groups_callback(int, ErtsSchedulerData *, int, void *); static erts_cpu_groups_map_t *add_cpu_groups(int groups, erts_cpu_groups_callback_t callback, void *arg); @@ -1646,7 +1651,8 @@ erts_get_logical_processors(int *conf, int *onln, int *avail) } void -erts_pre_early_init_cpu_topology(int *max_rg_p, +erts_pre_early_init_cpu_topology(int *max_dcg_p, + int *max_rg_p, int *conf_p, int *onln_p, int *avail_p) @@ -1654,6 +1660,7 @@ erts_pre_early_init_cpu_topology(int *max_rg_p, cpu_groups_maps = NULL; no_cpu_groups_callbacks = 0; *max_rg_p = ERTS_MAX_READER_GROUPS; + *max_dcg_p = ERTS_MAX_FLXCTR_GROUPS; cpuinfo = erts_cpu_info_create(); get_logical_processors(conf_p, onln_p, avail_p); } @@ -1662,7 +1669,9 @@ void erts_early_init_cpu_topology(int no_schedulers, int *max_main_threads_p, int max_reader_groups, - int *reader_groups_p) + int *reader_groups_p, + int max_decentralized_counter_groups, + int *decentralized_counter_groups_p) { user_cpudata = NULL; user_cpudata_size = 0; @@ -1687,6 +1696,12 @@ erts_early_init_cpu_topology(int no_schedulers, max_main_threads = no_schedulers; *max_main_threads_p = max_main_threads; + decentralized_counter_groups = max_main_threads; + if (decentralized_counter_groups <= 1 || max_decentralized_counter_groups <= 1) + decentralized_counter_groups = 1; + if (decentralized_counter_groups > max_decentralized_counter_groups) + decentralized_counter_groups = max_decentralized_counter_groups; + *decentralized_counter_groups_p = decentralized_counter_groups; reader_groups = max_main_threads; if (reader_groups <= 1 || max_reader_groups <= 1) reader_groups = 0; @@ -1718,6 +1733,9 @@ erts_init_cpu_topology(void) reader_groups_map = add_cpu_groups(reader_groups, reader_groups_callback, NULL); + decentralized_counter_groups_map = add_cpu_groups(decentralized_counter_groups, + flxctr_groups_callback, + NULL); if (cpu_bind_order == ERTS_CPU_BIND_NONE) erts_rwmtx_rwunlock(&cpuinfo_rwmtx); @@ -1789,6 +1807,15 @@ reader_groups_callback(int suspending, erts_rwmtx_set_reader_group(suspending ? 0 : group+1); } +void +flxctr_groups_callback(int suspending, + ErtsSchedulerData *esdp, + int group, + void *unused) +{ + erts_flxctr_set_slot(suspending ? 0 : group+1); +} + static Eterm get_cpu_groups_map(Process *c_p, erts_cpu_groups_map_t *map, int offset); @@ -1821,6 +1848,16 @@ erts_get_reader_groups_map(Process *c_p) return res; } +Eterm +erts_get_decentralized_counter_groups_map(Process *c_p) +{ + Eterm res; + erts_rwmtx_rlock(&cpuinfo_rwmtx); + res = get_cpu_groups_map(c_p, decentralized_counter_groups_map, 1); + erts_rwmtx_runlock(&cpuinfo_rwmtx); + return res; +} + /* * CPU groups */ diff --git a/erts/emulator/beam/erl_cpu_topology.h b/erts/emulator/beam/erl_cpu_topology.h index 88bcad79ab..4a428d7972 100644 --- a/erts/emulator/beam/erl_cpu_topology.h +++ b/erts/emulator/beam/erl_cpu_topology.h @@ -27,14 +27,19 @@ #ifndef ERL_CPU_TOPOLOGY_H__ #define ERL_CPU_TOPOLOGY_H__ -void erts_pre_early_init_cpu_topology(int *max_rg_p, - int *conf_p, - int *onln_p, - int *avail_p); -void erts_early_init_cpu_topology(int no_schedulers, - int *max_main_threads_p, - int max_reader_groups, - int *reader_groups_p); +void +erts_pre_early_init_cpu_topology(int *max_dcg_p, + int *max_rg_p, + int *conf_p, + int *onln_p, + int *avail_p); +void +erts_early_init_cpu_topology(int no_schedulers, + int *max_main_threads_p, + int max_reader_groups, + int *reader_groups_p, + int max_decentralized_counter_groups, + int *decentralized_counter_groups_p); void erts_init_cpu_topology(void); @@ -70,6 +75,7 @@ Eterm erts_bind_schedulers(Process *c_p, Eterm how); Eterm erts_get_schedulers_binds(Process *c_p); Eterm erts_get_reader_groups_map(Process *c_p); +Eterm erts_get_decentralized_counter_groups_map(Process *c_p); Eterm erts_set_cpu_topology(Process *c_p, Eterm term); Eterm erts_get_cpu_topology_term(Process *c_p, Eterm which); diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c index 0a50af4d1a..c0f5c506f4 100644 --- a/erts/emulator/beam/erl_db.c +++ b/erts/emulator/beam/erl_db.c @@ -42,6 +42,7 @@ #include "bif.h" #include "big.h" #include "erl_binary.h" +#include "bif.h" erts_atomic_t erts_ets_misc_mem_size; @@ -64,6 +65,11 @@ do { \ } \ }while(0) +#define DB_GET_APPROX_NITEMS(DB) \ + erts_flxctr_read_approx(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID) +#define DB_GET_APPROX_MEM_CONSUMED(DB) \ + erts_flxctr_read_approx(&(DB)->common.counters, ERTS_DB_TABLE_MEM_COUNTER_ID) + static BIF_RETTYPE db_bif_fail(Process* p, Uint freason, Uint bif_ix, Export* bif_exp) { @@ -398,8 +404,9 @@ static void free_dbtable(void *vtb) { DbTable *tb = (DbTable *) vtb; - - ASSERT(erts_atomic_read_nob(&tb->common.memory_size) == sizeof(DbTable)); + ASSERT(erts_flxctr_is_snapshot_ongoing(&tb->common.counters) || + sizeof(DbTable) == erts_flxctr_read_approx(&tb->common.counters, + ERTS_DB_TABLE_MEM_COUNTER_ID)); erts_rwmtx_destroy(&tb->common.rwlock); erts_mtx_destroy(&tb->common.fixlock); @@ -408,7 +415,8 @@ free_dbtable(void *vtb) if (tb->common.btid) erts_bin_release(tb->common.btid); - erts_db_free(ERTS_ALC_T_DB_TABLE, tb, (void *) tb, sizeof(DbTable)); + erts_flxctr_destroy(&tb->common.counters, ERTS_ALC_T_DB_TABLE); + erts_free(ERTS_ALC_T_DB_TABLE, tb); } static void schedule_free_dbtable(DbTable* tb) @@ -1731,12 +1739,16 @@ BIF_RETTYPE ets_new_2(BIF_ALIST_2) */ { DbTable init_tb; - - erts_atomic_init_nob(&init_tb.common.memory_size, 0); + erts_flxctr_init(&init_tb.common.counters, 0, 2, ERTS_ALC_T_DB_TABLE); tb = (DbTable*) erts_db_alloc(ERTS_ALC_T_DB_TABLE, &init_tb, sizeof(DbTable)); - erts_atomic_init_nob(&tb->common.memory_size, - erts_atomic_read_nob(&init_tb.common.memory_size)); + erts_flxctr_init(&tb->common.counters, + status & DB_CA_ORDERED_SET, + 2, + ERTS_ALC_T_DB_TABLE); + erts_flxctr_add(&tb->common.counters, + ERTS_DB_TABLE_MEM_COUNTER_ID, + DB_GET_APPROX_MEM_CONSUMED(&init_tb)); } tb->common.meth = meth; @@ -1750,8 +1762,6 @@ BIF_RETTYPE ets_new_2(BIF_ALIST_2) tb->common.owner = BIF_P->common.id; set_heir(BIF_P, tb, heir, heir_data); - erts_atomic_init_nob(&tb->common.nitems, 0); - tb->common.fixing_procs = NULL; tb->common.compress = is_compressed; #ifdef ETS_DBG_FORCE_TRAP @@ -2128,19 +2138,18 @@ BIF_RETTYPE ets_internal_delete_all_2(BIF_ALIST_2) { SWord initial_reds = ERTS_BIF_REDS_LEFT(BIF_P); SWord reds = initial_reds; - Eterm nitems; + Eterm nitems_holder = THE_NON_VALUE; DbTable* tb; - CHECK_TABLES(); DB_BIF_GET_TABLE(tb, DB_WRITE, LCK_WRITE, BIF_ets_internal_delete_all_2); if (BIF_ARG_2 == am_undefined) { - nitems = erts_make_integer(erts_atomic_read_nob(&tb->common.nitems), - BIF_P); - - reds = tb->common.meth->db_delete_all_objects(BIF_P, tb, reds); - + reds = tb->common.meth->db_delete_all_objects(BIF_P, + tb, + reds, + &nitems_holder); + ASSERT(nitems_holder != THE_NON_VALUE); ASSERT(!(tb->common.status & DB_BUSY)); if (reds < 0) { @@ -2159,7 +2168,7 @@ BIF_RETTYPE ets_internal_delete_all_2(BIF_ALIST_2) db_unlock(tb, LCK_WRITE); BUMP_ALL_REDS(BIF_P); BIF_TRAP2(bif_export[BIF_ets_internal_delete_all_2], BIF_P, - BIF_ARG_1, nitems); + BIF_ARG_1, nitems_holder); } else { /* Done, no trapping needed */ @@ -2169,15 +2178,19 @@ BIF_RETTYPE ets_internal_delete_all_2(BIF_ALIST_2) } else { /* - * The table lookup succeeded and second argument is nitems + * The table lookup succeeded and second argument is nitems_holder * and not 'undefined', which means we have trapped at least once * and are now done. */ - nitems = BIF_ARG_2; + nitems_holder = BIF_ARG_2; } - db_unlock(tb, LCK_WRITE); + { + Eterm nitems = + tb->common.meth->db_delete_all_objects_get_nitems_from_holder(BIF_P, + nitems_holder); BIF_RET(nitems); + } } static void delete_all_objects_continue(Process* p, DbTable* tb) @@ -2190,7 +2203,7 @@ static void delete_all_objects_continue(Process* p, DbTable* tb) if ((tb->common.status & (DB_DELETE|DB_BUSY)) != DB_BUSY) return; - reds = tb->common.meth->db_delete_all_objects(p, tb, reds); + reds = tb->common.meth->db_delete_all_objects(p, tb, reds, NULL); if (reds < 0) { BUMP_ALL_REDS(p); @@ -3277,13 +3290,29 @@ BIF_RETTYPE ets_info_1(BIF_ALIST_1) int i; Eterm* hp; Uint freason; + Sint size = -1; + Sint memory = -1; + Eterm table; + int is_ctrs_read_result_set = 0; /*Process* rp = NULL;*/ /* If/when we implement lockless private tables: Eterm owner; */ - - if ((tb = db_get_table(BIF_P, BIF_ARG_1, DB_INFO, LCK_READ, &freason)) == NULL) { - if (freason == BADARG && (is_atom(BIF_ARG_1) || is_ref(BIF_ARG_1))) + if(is_tuple(BIF_ARG_1) && + is_tuple_arity(BIF_ARG_1, 2) && + erts_flxctr_is_snapshot_result(tuple_val(BIF_ARG_1)[1])) { + Eterm counter_read_result = tuple_val(BIF_ARG_1)[1]; + table = tuple_val(BIF_ARG_1)[2]; + size = erts_flxctr_get_snapshot_result_after_trap(counter_read_result, + ERTS_DB_TABLE_NITEMS_COUNTER_ID); + memory = erts_flxctr_get_snapshot_result_after_trap(counter_read_result, + ERTS_DB_TABLE_MEM_COUNTER_ID); + is_ctrs_read_result_set = 1; + } else { + table = BIF_ARG_1; + } + if ((tb = db_get_table(BIF_P, table, DB_INFO, LCK_READ, &freason)) == NULL) { + if (freason == BADARG && (is_atom(table) || is_ref(table))) BIF_RET(am_undefined); else return db_bif_fail(BIF_P, freason, BIF_ets_info_1, NULL); @@ -3314,9 +3343,35 @@ BIF_RETTYPE ets_info_1(BIF_ALIST_1) BIF_ERROR(BIF_P, BADARG); } }*/ + + if (!is_ctrs_read_result_set) { + ErtsFlxCtrSnapshotResult res = + erts_flxctr_snapshot(&tb->common.counters, ERTS_ALC_T_DB_TABLE, BIF_P); + if (ERTS_FLXCTR_GET_RESULT_AFTER_TRAP == res.type) { + Eterm tuple; + db_unlock(tb, LCK_READ); + hp = HAlloc(BIF_P, 3); + tuple = TUPLE2(hp, res.trap_resume_state, table); + BIF_TRAP1(bif_export[BIF_ets_info_1], BIF_P, tuple); + } else if (res.type == ERTS_FLXCTR_TRY_AGAIN_AFTER_TRAP) { + db_unlock(tb, LCK_READ); + BIF_TRAP1(bif_export[BIF_ets_info_1], BIF_P, table); + } else { + size = res.result[ERTS_DB_TABLE_NITEMS_COUNTER_ID]; + memory = res.result[ERTS_DB_TABLE_MEM_COUNTER_ID]; + is_ctrs_read_result_set = 1; + } + } for (i = 0; i < sizeof(fields)/sizeof(Eterm); i++) { - results[i] = table_info(BIF_P, tb, fields[i]); - ASSERT(is_value(results[i])); + if (is_ctrs_read_result_set && am_size == fields[i]) { + results[i] = erts_make_integer(size, BIF_P); + } else if (is_ctrs_read_result_set && am_memory == fields[i]) { + Sint words = (Sint) ((memory + sizeof(Sint) - 1) / sizeof(Sint)); + results[i] = erts_make_integer(words, BIF_P); + } else { + results[i] = table_info(BIF_P, tb, fields[i]); + ASSERT(is_value(results[i])); + } } db_unlock(tb, LCK_READ); @@ -3344,14 +3399,43 @@ BIF_RETTYPE ets_info_2(BIF_ALIST_2) DbTable* tb; Eterm ret = THE_NON_VALUE; Uint freason; - + if (erts_flxctr_is_snapshot_result(BIF_ARG_1)) { + Sint res; + if (am_memory == BIF_ARG_2) { + res = erts_flxctr_get_snapshot_result_after_trap(BIF_ARG_1, + ERTS_DB_TABLE_MEM_COUNTER_ID); + res = (Sint) ((res + sizeof(Sint) - 1) / sizeof(Sint)); + } else { + res = erts_flxctr_get_snapshot_result_after_trap(BIF_ARG_1, + ERTS_DB_TABLE_NITEMS_COUNTER_ID); + } + BIF_RET(erts_make_integer(res, BIF_P)); + } if ((tb = db_get_table(BIF_P, BIF_ARG_1, DB_INFO, LCK_READ, &freason)) == NULL) { if (freason == BADARG && (is_atom(BIF_ARG_1) || is_ref(BIF_ARG_1))) BIF_RET(am_undefined); else return db_bif_fail(BIF_P, freason, BIF_ets_info_2, NULL); } - ret = table_info(BIF_P, tb, BIF_ARG_2); + if (BIF_ARG_2 == am_size || BIF_ARG_2 == am_memory) { + ErtsFlxCtrSnapshotResult res = + erts_flxctr_snapshot(&tb->common.counters, ERTS_ALC_T_DB_TABLE, BIF_P); + if (ERTS_FLXCTR_GET_RESULT_AFTER_TRAP == res.type) { + db_unlock(tb, LCK_READ); + BIF_TRAP2(bif_export[BIF_ets_info_2], BIF_P, res.trap_resume_state, BIF_ARG_2); + } else if (res.type == ERTS_FLXCTR_TRY_AGAIN_AFTER_TRAP) { + db_unlock(tb, LCK_READ); + BIF_TRAP2(bif_export[BIF_ets_info_2], BIF_P, BIF_ARG_1, BIF_ARG_2); + } else if (BIF_ARG_2 == am_size) { + ret = erts_make_integer(res.result[ERTS_DB_TABLE_NITEMS_COUNTER_ID], BIF_P); + } else { /* BIF_ARG_2 == am_memory */ + Sint r = res.result[ERTS_DB_TABLE_MEM_COUNTER_ID]; + r = (Sint) ((r + sizeof(Sint) - 1) / sizeof(Sint)); + ret = erts_make_integer(r, BIF_P); + } + } else { + ret = table_info(BIF_P, tb, BIF_ARG_2); + } db_unlock(tb, LCK_READ); if (is_non_value(ret)) { BIF_ERROR(BIF_P, BADARG); @@ -4121,7 +4205,8 @@ static Eterm table_info(Process* p, DbTable* tb, Eterm What) int use_monotonic; if (What == am_size) { - ret = make_small(erts_atomic_read_nob(&tb->common.nitems)); + Uint size = (Uint) (DB_GET_APPROX_NITEMS(tb)); + ret = erts_make_integer(size, p); } else if (What == am_type) { if (tb->common.status & DB_SET) { ret = am_set; @@ -4136,7 +4221,7 @@ static Eterm table_info(Process* p, DbTable* tb, Eterm What) ret = am_bag; } } else if (What == am_memory) { - Uint words = (Uint) ((erts_atomic_read_nob(&tb->common.memory_size) + Uint words = (Uint) ((DB_GET_APPROX_MEM_CONSUMED(tb) + sizeof(Uint) - 1) / sizeof(Uint)); @@ -4294,9 +4379,9 @@ static void print_table(fmtfn_t to, void *to_arg, int show, DbTable* tb) tb->common.meth->db_print(to, to_arg, show, tb); - erts_print(to, to_arg, "Objects: %d\n", (int)erts_atomic_read_nob(&tb->common.nitems)); + erts_print(to, to_arg, "Objects: %d\n", (int)DB_GET_APPROX_NITEMS(tb)); erts_print(to, to_arg, "Words: %bpu\n", - (Uint) ((erts_atomic_read_nob(&tb->common.memory_size) + (Uint) ((DB_GET_APPROX_MEM_CONSUMED(tb) + sizeof(Uint) - 1) / sizeof(Uint))); diff --git a/erts/emulator/beam/erl_db.h b/erts/emulator/beam/erl_db.h index dc77fbb60c..b22f35a5ef 100644 --- a/erts/emulator/beam/erl_db.h +++ b/erts/emulator/beam/erl_db.h @@ -160,7 +160,9 @@ do { \ erts_aint_t sz__ = (((erts_aint_t) (ALLOC_SZ)) \ - ((erts_aint_t) (FREE_SZ))); \ ASSERT((TAB)); \ - erts_atomic_add_nob(&(TAB)->common.memory_size, sz__); \ + erts_flxctr_add(&(TAB)->common.counters, \ + ERTS_DB_TABLE_MEM_COUNTER_ID, \ + sz__); \ } while (0) #define ERTS_ETS_MISC_MEM_ADD(SZ) \ @@ -305,10 +307,10 @@ erts_db_free(ErtsAlcType_t type, DbTable *tab, void *ptr, Uint size) ASSERT(ptr != 0); ASSERT(size == ERTS_ALC_DBG_BLK_SZ(ptr)); ERTS_DB_ALC_MEM_UPDATE_(tab, size, 0); - - ASSERT(((void *) tab) != ptr - || erts_atomic_read_nob(&tab->common.memory_size) == 0); - + ASSERT(((void *) tab) != ptr || + tab->common.counters.is_decentralized || + 0 == erts_flxctr_read_centralized(&tab->common.counters, + ERTS_DB_TABLE_MEM_COUNTER_ID)); erts_free(type, ptr); } diff --git a/erts/emulator/beam/erl_db_catree.c b/erts/emulator/beam/erl_db_catree.c index 0402c6b7b4..962fe4c4f8 100644 --- a/erts/emulator/beam/erl_db_catree.c +++ b/erts/emulator/beam/erl_db_catree.c @@ -149,7 +149,12 @@ static SWord db_free_table_continue_catree(DbTable *tbl, SWord); static void db_foreach_offheap_catree(DbTable *, void (*)(ErlOffHeap *, void *), void *); -static SWord db_delete_all_objects_catree(Process* p, DbTable* tbl, SWord reds); +static SWord db_delete_all_objects_catree(Process* p, + DbTable* tbl, + SWord reds, + Eterm* nitems_holder_wb); +static Eterm db_delete_all_objects_get_nitems_from_holder_catree(Process* p, + Eterm nitems_holder); static int db_lookup_dbterm_catree(Process *, DbTable *, Eterm key, Eterm obj, DbUpdateHandle*); @@ -191,6 +196,7 @@ DbTableMethod db_catree = db_select_replace_continue_catree, db_take_catree, db_delete_all_objects_catree, + db_delete_all_objects_get_nitems_from_holder_catree, db_free_table_catree, db_free_table_continue_catree, db_print_catree, @@ -1357,6 +1363,8 @@ static SWord do_free_base_node_cont(DbTableCATree *tb, SWord num_left) PUSH_NODE(&tb->free_stack_elems, root); root = p; } else { + DEC_NITEMS((DbTable*)tb); + tb->nr_of_deleted_items++; free_term((DbTable*)tb, root); if (--num_left >= 0) { break; @@ -1397,6 +1405,7 @@ int db_create_catree(Process *p, DbTable *tbl) root = create_base_node(tb, NULL); tb->deletion = 0; tb->base_nodes_to_free_list = NULL; + tb->nr_of_deleted_items = 0; erts_atomic_init_relb(&(tb->root), (erts_aint_t)root); return DB_ERROR_NONE; } @@ -2050,6 +2059,7 @@ static SWord db_free_table_continue_catree(DbTable *tbl, SWord reds) PUSH_NODE(&tb->free_stack_rnodes, GET_ROOT(tb)); tb->is_routing_nodes_freed = 0; tb->base_nodes_to_free_list = NULL; + tb->nr_of_deleted_items = 0; } if ( ! tb->is_routing_nodes_freed ) { reds = do_free_routing_nodes_catree_cont(tb, reds); @@ -2079,13 +2089,57 @@ static SWord db_free_table_continue_catree(DbTable *tbl, SWord reds) return 1; } -static SWord db_delete_all_objects_catree(Process* p, DbTable* tbl, SWord reds) +static +int db_catree_nr_of_items_deleted_wb_dtor(Binary *context_bin) { + (void)context_bin; + return 1; +} + +typedef struct { + Uint nr_of_deleted_items; +} DbCATreeNrOfItemsDeletedWb; + +static Eterm +create_and_install_num_of_deleted_items_wb_bin(Process *p, DbTableCATree *tb) +{ + Binary* bin = + erts_create_magic_binary(sizeof(DbCATreeNrOfItemsDeletedWb), + db_catree_nr_of_items_deleted_wb_dtor); + DbCATreeNrOfItemsDeletedWb* data = ERTS_MAGIC_BIN_DATA(bin); + Eterm* hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE); + Eterm mref = erts_mk_magic_ref(&hp, &MSO(p), bin); + data->nr_of_deleted_items = 0; + tb->nr_of_deleted_items_wb = bin; + erts_refc_inctest(&bin->intern.refc, 2); + return mref; +} + +static Eterm db_delete_all_objects_get_nitems_from_holder_catree(Process* p, + Eterm mref) { + Binary* bin = erts_magic_ref2bin(mref); + DbCATreeNrOfItemsDeletedWb* data = ERTS_MAGIC_BIN_DATA(bin); + return erts_make_integer(data->nr_of_deleted_items, p); +} + +static SWord db_delete_all_objects_catree(Process* p, + DbTable* tbl, + SWord reds, + Eterm* nitems_holder_wb) +{ + DbTableCATree *tb = &tbl->catree; + DbCATreeNrOfItemsDeletedWb* data; + if (!tb->deletion) { + *nitems_holder_wb = + create_and_install_num_of_deleted_items_wb_bin(p, tb); + } reds = db_free_table_continue_catree(tbl, reds); if (reds < 0) return reds; + data = ERTS_MAGIC_BIN_DATA(tb->nr_of_deleted_items_wb); + data->nr_of_deleted_items = tb->nr_of_deleted_items; + erts_bin_release(tb->nr_of_deleted_items_wb); db_create_catree(p, tbl); - erts_atomic_set_nob(&tbl->catree.common.nitems, 0); return reds; } diff --git a/erts/emulator/beam/erl_db_catree.h b/erts/emulator/beam/erl_db_catree.h index 418837be8e..fde442eaf5 100644 --- a/erts/emulator/beam/erl_db_catree.h +++ b/erts/emulator/beam/erl_db_catree.h @@ -87,6 +87,10 @@ typedef struct db_table_catree { CATreeNodeStack free_stack_rnodes; DbTableCATreeNode *base_nodes_to_free_list; int is_routing_nodes_freed; + /* The fields below are used by delete_all_objects and + select_delete(DeleteAll)*/ + Uint nr_of_deleted_items; + Binary* nr_of_deleted_items_wb; } DbTableCATree; typedef struct { @@ -104,7 +108,6 @@ void db_initialize_catree(void); int db_create_catree(Process *p, DbTable *tbl); - TreeDbTerm** catree_find_root(Eterm key, CATreeRootIterator*); TreeDbTerm** catree_find_next_from_pb_key_root(Eterm key, CATreeRootIterator*); diff --git a/erts/emulator/beam/erl_db_hash.c b/erts/emulator/beam/erl_db_hash.c index f225730029..ceaccf7e44 100644 --- a/erts/emulator/beam/erl_db_hash.c +++ b/erts/emulator/beam/erl_db_hash.c @@ -85,6 +85,14 @@ #include "erl_db_hash.h" +#define ADD_NITEMS(DB, TO_ADD) \ + erts_flxctr_add(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID, TO_ADD) +#define INC_NITEMS(DB) \ + erts_flxctr_inc_read_centralized(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID) +#define DEC_NITEMS(DB) \ + erts_flxctr_dec_read_centralized(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID) +#define RESET_NITEMS(DB) \ + erts_flxctr_reset(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID) /* * The following symbols can be manipulated to "tune" the linear hash array */ @@ -121,7 +129,9 @@ : ((struct segment**) erts_atomic_read_nob(&(tb)->segtab))) #endif #define NACTIVE(tb) ((int)erts_atomic_read_nob(&(tb)->nactive)) -#define NITEMS(tb) ((int)erts_atomic_read_nob(&(tb)->common.nitems)) +#define NITEMS(tb) \ + ((Sint)erts_flxctr_read_centralized(&(tb)->common.counters, \ + ERTS_DB_TABLE_NITEMS_COUNTER_ID)) #define SLOT_IX_TO_SEG_IX(i) (((i)+(EXT_SEGSZ-FIRST_SEGSZ)) >> EXT_SEGSZ_EXP) @@ -444,7 +454,12 @@ static void db_foreach_offheap_hash(DbTable *, void (*)(ErlOffHeap *, void *), void *); -static SWord db_delete_all_objects_hash(Process* p, DbTable* tbl, SWord reds); +static SWord db_delete_all_objects_hash(Process* p, + DbTable* tbl, + SWord reds, + Eterm* nitems_holder_wb); +static Eterm db_delete_all_objects_get_nitems_from_holder_hash(Process* p, + Eterm nitems_holder); #ifdef HARDDEBUG static void db_check_table_hash(DbTableHash *tb); #endif @@ -548,6 +563,7 @@ DbTableMethod db_hash = db_select_replace_continue_hash, db_take_hash, db_delete_all_objects_hash, + db_delete_all_objects_get_nitems_from_holder_hash, db_free_empty_table_hash, db_free_table_continue_hash, db_print_hash, @@ -806,7 +822,7 @@ int db_put_hash(DbTable *tbl, Eterm obj, int key_clash_fail) if (tb->common.status & DB_SET) { HashDbTerm* bnext = b->next; if (is_pseudo_deleted(b)) { - erts_atomic_inc_nob(&tb->common.nitems); + INC_NITEMS(tb); b->pseudo_deleted = 0; } else if (key_clash_fail) { @@ -835,7 +851,7 @@ int db_put_hash(DbTable *tbl, Eterm obj, int key_clash_fail) do { if (db_eq(&tb->common,obj,&q->dbterm)) { if (is_pseudo_deleted(q)) { - erts_atomic_inc_nob(&tb->common.nitems); + INC_NITEMS(tb); q->pseudo_deleted = 0; ASSERT(q->hvalue == hval); if (q != b) { /* must move to preserve key insertion order */ @@ -858,7 +874,7 @@ Lnew: q->pseudo_deleted = 0; q->next = b; *bp = q; - nitems = erts_atomic_inc_read_nob(&tb->common.nitems); + nitems = INC_NITEMS(tb); WUNLOCK_HASH(lck); { int nactive = NACTIVE(tb); @@ -1056,7 +1072,7 @@ int db_erase_hash(DbTable *tbl, Eterm key, Eterm *ret) } WUNLOCK_HASH(lck); if (nitems_diff) { - erts_atomic_add_nob(&tb->common.nitems, nitems_diff); + ADD_NITEMS(tb, nitems_diff); try_shrink(tb); } free_term_list(tb, free_us); @@ -1117,7 +1133,7 @@ static int db_erase_object_hash(DbTable *tbl, Eterm object, Eterm *ret) } WUNLOCK_HASH(lck); if (nitems_diff) { - erts_atomic_add_nob(&tb->common.nitems, nitems_diff); + ADD_NITEMS(tb, nitems_diff); try_shrink(tb); } free_term_list(tb, free_us); @@ -2023,7 +2039,7 @@ static int select_delete_on_match_res(traverse_context_t* ctx_base, Sint slot_ix del->next = ctx->free_us; ctx->free_us = del; } - erts_atomic_dec_nob(&ctx->base.tb->common.nitems); + DEC_NITEMS(ctx->base.tb); return 1; } @@ -2300,7 +2316,7 @@ static int db_take_hash(Process *p, DbTable *tbl, Eterm key, Eterm *ret) } WUNLOCK_HASH(lck); if (nitems_diff) { - erts_atomic_add_nob(&tb->common.nitems, nitems_diff); + ADD_NITEMS(tb, nitems_diff); try_shrink(tb); } free_term_list(tb, free_us); @@ -2360,7 +2376,7 @@ static SWord db_mark_all_deleted_hash(DbTable *tbl, SWord reds) fixdel->slot = NACTIVE(tb) - 1; fixdel->all = 1; fixdel->trap = 0; - erts_atomic_set_nob(&tb->common.nitems, 0); + RESET_NITEMS(tb); return loops < 0 ? 0 : loops / LOOPS_PER_REDUCTION; } @@ -2468,7 +2484,8 @@ static SWord db_free_table_continue_hash(DbTable *tbl, SWord reds) (void*)tb->locks, sizeof(DbTableHashFineLocks)); tb->locks = NULL; } - ASSERT(erts_atomic_read_nob(&tb->common.memory_size) == sizeof(DbTable)); + ASSERT(sizeof(DbTable) == erts_flxctr_read_approx(&tb->common.counters, + ERTS_DB_TABLE_MEM_COUNTER_ID)); return reds; /* Done */ } @@ -3080,7 +3097,7 @@ db_lookup_dbterm_hash(Process *p, DbTable *tbl, Eterm key, Eterm obj, ASSERT(q->hvalue == hval); q->pseudo_deleted = 0; *bp = b = q; - erts_atomic_inc_nob(&tb->common.nitems); + INC_NITEMS(tb); } HRelease(p, hend, htop); @@ -3123,7 +3140,7 @@ db_finalize_dbterm_hash(int cret, DbUpdateHandle* handle) } WUNLOCK_HASH(lck); - erts_atomic_dec_nob(&tb->common.nitems); + DEC_NITEMS(tb); try_shrink(tb); } else { if (handle->flags & DB_MUST_RESIZE) { @@ -3132,7 +3149,7 @@ db_finalize_dbterm_hash(int cret, DbUpdateHandle* handle) } if (handle->flags & DB_INC_TRY_GROW) { int nactive; - int nitems = erts_atomic_inc_read_nob(&tb->common.nitems); + int nitems = INC_NITEMS(tb); WUNLOCK_HASH(lck); nactive = NACTIVE(tb); @@ -3153,8 +3170,17 @@ db_finalize_dbterm_hash(int cret, DbUpdateHandle* handle) return; } -static SWord db_delete_all_objects_hash(Process* p, DbTable* tbl, SWord reds) +static SWord db_delete_all_objects_hash(Process* p, + DbTable* tbl, + SWord reds, + Eterm* nitems_holder_wb) { + if (nitems_holder_wb != NULL) { + Uint nr_of_items = + erts_flxctr_read_centralized(&tbl->common.counters, + ERTS_DB_TABLE_NITEMS_COUNTER_ID); + *nitems_holder_wb = erts_make_integer(nr_of_items, p); + } if (IS_FIXED(tbl)) { reds = db_mark_all_deleted_hash(tbl, reds); } else { @@ -3163,11 +3189,16 @@ static SWord db_delete_all_objects_hash(Process* p, DbTable* tbl, SWord reds) return reds; db_create_hash(p, tbl); - erts_atomic_set_nob(&tbl->hash.common.nitems, 0); + RESET_NITEMS(tbl); } return reds; } +static Eterm db_delete_all_objects_get_nitems_from_holder_hash(Process* p, + Eterm nitems_holder){ + return nitems_holder; +} + void db_foreach_offheap_hash(DbTable *tbl, void (*func)(ErlOffHeap *, void *), void * arg) diff --git a/erts/emulator/beam/erl_db_tree.c b/erts/emulator/beam/erl_db_tree.c index f9ba04f399..492ea81b63 100644 --- a/erts/emulator/beam/erl_db_tree.c +++ b/erts/emulator/beam/erl_db_tree.c @@ -51,9 +51,20 @@ #include "erl_db_tree_util.h" #define GETKEY_WITH_POS(Keypos, Tplp) (*((Tplp) + Keypos)) -#define NITEMS(tb) ((int)erts_atomic_read_nob(&(tb)->common.nitems)) -#define TREE_MAX_ELEMENTS 0xFFFFFFFFUL +#define NITEMS_CENTRALIZED(tb) \ + ((Sint)erts_flxctr_read_centralized(&(tb)->common.counters, \ + ERTS_DB_TABLE_NITEMS_COUNTER_ID)) +#define ADD_NITEMS(DB, TO_ADD) \ + erts_flxctr_add(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID, TO_ADD) +#define INC_NITEMS(DB) \ + erts_flxctr_inc(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID) +#define INC_NITEMS_CENTRALIZED(DB) \ + erts_flxctr_inc_read_centralized(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID) +#define RESET_NITEMS(DB) \ + erts_flxctr_reset(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID) +#define IS_CENTRALIZED_CTR(tb) (!(tb)->common.counters.is_decentralized) +#define APPROX_MEM_CONSUMED(tb) erts_flxctr_read_approx(&(tb)->common.counters, ERTS_DB_TABLE_MEM_COUNTER_ID) #define TOPN_NODE(Dtt, Pos) \ (((Pos) < Dtt->pos) ? \ @@ -296,7 +307,7 @@ int tree_balance_right(TreeDbTerm **this); static int delsub(TreeDbTerm **this); static TreeDbTerm *slot_search(Process *p, TreeDbTerm *root, Sint slot, DbTable *tb, DbTableTree *stack_container, - CATreeRootIterator *iter); + CATreeRootIterator *iter, int* is_EOT); static TreeDbTerm *find_node(DbTableCommon *tb, TreeDbTerm *root, Eterm key, DbTableTree *stack_container); static TreeDbTerm **find_node2(DbTableCommon *tb, TreeDbTerm **root, Eterm key); @@ -433,8 +444,12 @@ static void db_foreach_offheap_tree(DbTable *, void (*)(ErlOffHeap *, void *), void *); -static SWord db_delete_all_objects_tree(Process* p, DbTable* tbl, SWord reds); - +static SWord db_delete_all_objects_tree(Process* p, + DbTable* tbl, + SWord reds, + Eterm* nitems_holder_wb); +static Eterm db_delete_all_objects_get_nitems_from_holder_tree(Process* p, + Eterm nitems_holder); #ifdef HARDDEBUG static void db_check_table_tree(DbTable *tbl); #endif @@ -478,6 +493,7 @@ DbTableMethod db_tree = db_select_replace_continue_tree, db_take_tree, db_delete_all_objects_tree, + db_delete_all_objects_get_nitems_from_holder_tree, db_free_empty_table_tree, db_free_table_continue_tree, db_print_tree, @@ -595,7 +611,8 @@ int db_last_tree_common(Process *p, DbTable *tbl, TreeDbTerm *root, } if (stack) { PUSH_NODE(stack, this); - stack->slot = NITEMS(tbl); + /* Always centralized counters when static stack is used */ + stack->slot = NITEMS_CENTRALIZED(tbl); release_stack(tbl,stack_container,stack); } *ret = db_copy_key(p, tbl, &this->dbterm); @@ -661,10 +678,7 @@ int db_put_tree_common(DbTableCommon *tb, TreeDbTerm **root, Eterm obj, for (;;) if (!*this) { /* Found our place */ state = 1; - if (erts_atomic_inc_read_nob(&tb->nitems) >= TREE_MAX_ELEMENTS) { - erts_atomic_dec_nob(&tb->nitems); - return DB_ERROR_SYSRES; - } + INC_NITEMS(((DbTable*)tb)); *this = new_dbterm(tb, obj); (*this)->balance = 0; (*this)->left = (*this)->right = NULL; @@ -888,7 +902,7 @@ int db_slot_tree_common(Process *p, DbTable *tbl, TreeDbTerm *root, TreeDbTerm *st; Eterm *hp, *hend; Eterm copy; - + int is_EOT = 0; /* * The notion of a "slot" is not natural in a tree, but we try to * simulate it by giving the n'th node in the tree instead. @@ -899,10 +913,10 @@ int db_slot_tree_common(Process *p, DbTable *tbl, TreeDbTerm *root, if (is_not_small(slot_term) || ((slot = signed_val(slot_term)) < 0) || - (slot > NITEMS(tbl))) + (IS_CENTRALIZED_CTR(tbl) && slot > NITEMS_CENTRALIZED(tbl))) return DB_ERROR_BADPARAM; - if (slot == NITEMS(tbl)) { + if (IS_CENTRALIZED_CTR(tbl) && slot == NITEMS_CENTRALIZED(tbl)) { *ret = am_EOT; return DB_ERROR_NONE; } @@ -912,7 +926,11 @@ int db_slot_tree_common(Process *p, DbTable *tbl, TreeDbTerm *root, * are counted from 1 and up. */ ++slot; - st = slot_search(p, root, slot, tbl, stack_container, iter); + st = slot_search(p, root, slot, tbl, stack_container, iter, &is_EOT); + if (is_EOT) { + *ret = am_EOT; + return DB_ERROR_NONE; + } if (st == NULL) { *ret = am_false; return DB_ERROR_UNSPEC; @@ -2244,7 +2262,8 @@ void db_print_tree_common(fmtfn_t to, void *to_arg, erts_print(to, to_arg, "\n" "------------------------------------------------\n"); #else - erts_print(to, to_arg, "Ordered set (AVL tree), Elements: %d\n", NITEMS(tbl)); + erts_print(to, to_arg, "Ordered set (AVL tree), Elements: %d\n", + erts_flxctr_read_approx(&tbl->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID)); #endif } @@ -2281,24 +2300,41 @@ static SWord db_free_table_continue_tree(DbTable *tbl, SWord reds) (DbTable *) tb, (void *) tb->static_stack.array, sizeof(TreeDbTerm *) * STACK_NEED); - ASSERT((erts_atomic_read_nob(&tb->common.memory_size) - == sizeof(DbTable)) || - (erts_atomic_read_nob(&tb->common.memory_size) - == (sizeof(DbTable) + sizeof(DbFixation)))); + ASSERT(erts_flxctr_is_snapshot_ongoing(&tb->common.counters) || + ((APPROX_MEM_CONSUMED(tb) + == sizeof(DbTable)) || + (APPROX_MEM_CONSUMED(tb) + == (sizeof(DbTable) + sizeof(DbFixation))))); } return reds; } -static SWord db_delete_all_objects_tree(Process* p, DbTable* tbl, SWord reds) +static SWord db_delete_all_objects_tree(Process* p, + DbTable* tbl, + SWord reds, + Eterm* nitems_holder_wb) { + if (nitems_holder_wb != NULL) { + Uint nr_of_items = + erts_flxctr_read_centralized(&tbl->common.counters, + ERTS_DB_TABLE_NITEMS_COUNTER_ID); + *nitems_holder_wb = erts_make_integer(nr_of_items, p); + } reds = db_free_table_continue_tree(tbl, reds); if (reds < 0) return reds; db_create_tree(p, tbl); - erts_atomic_set_nob(&tbl->tree.common.nitems, 0); + RESET_NITEMS(tbl); return reds; } +static Eterm db_delete_all_objects_get_nitems_from_holder_tree(Process* p, + Eterm holder) +{ + (void)p; + return holder; +} + static void do_db_tree_foreach_offheap(TreeDbTerm *, void (*)(ErlOffHeap *, void *), void *); @@ -2383,7 +2419,7 @@ static TreeDbTerm *linkout_tree(DbTableCommon *tb, TreeDbTerm **root, tstack[tpos++] = this; state = delsub(this); } - erts_atomic_dec_nob(&tb->nitems); + DEC_NITEMS(((DbTable*)tb)); break; } } @@ -2450,7 +2486,7 @@ static TreeDbTerm *linkout_object_tree(DbTableCommon *tb, TreeDbTerm **root, tstack[tpos++] = this; state = delsub(this); } - erts_atomic_dec_nob(&tb->nitems); + DEC_NITEMS(((DbTable*)tb)); break; } } @@ -2745,7 +2781,8 @@ static int delsub(TreeDbTerm **this) static TreeDbTerm *slot_search(Process *p, TreeDbTerm *root, Sint slot, DbTable *tb, DbTableTree *stack_container, - CATreeRootIterator *iter) + CATreeRootIterator *iter, + int* is_EOT) { TreeDbTerm *this; TreeDbTerm *tmp; @@ -2837,8 +2874,12 @@ static TreeDbTerm *slot_search(Process *p, TreeDbTerm *root, break; next_root: - if (!iter) + if (!iter) { + if (stack->slot == (slot-1)) { + *is_EOT = 1; + } break; /* EOT */ + } ASSERT(slot > stack->slot); if (lastobj) { @@ -2846,8 +2887,12 @@ next_root: lastobj = NULL; } pp = catree_find_next_root(iter, &lastkey); - if (!pp) + if (!pp) { + if (stack->slot == (slot-1)) { + *is_EOT = 1; + } break; /* EOT */ + } root = *pp; stack->pos = 0; find_next(&tb->common, root, stack, lastkey); diff --git a/erts/emulator/beam/erl_db_tree_util.h b/erts/emulator/beam/erl_db_tree_util.h index 02df74678d..ba4a8f79e5 100644 --- a/erts/emulator/beam/erl_db_tree_util.h +++ b/erts/emulator/beam/erl_db_tree_util.h @@ -25,6 +25,8 @@ ** Internal functions and macros used by both the CA tree and the AVL tree */ + +#if defined(ARCH_32) /* ** A stack of this size is enough for an AVL tree with more than ** 0xFFFFFFFF elements. May be subject to change if @@ -34,8 +36,19 @@ ** Where n denotes the number of nodes, h(n) the height of the tree ** with n nodes and log is the binary logarithm. */ - #define STACK_NEED 50 +#elif defined(ARCH_64) +/* +** A stack of this size is enough for an AVL tree with more than +** 2^61 elements. +** The Maximal height of an AVL tree is calculated as above. +*/ +#define STACK_NEED 90 +#else +#error "Unsported architecture" +#endif + + #define PUSH_NODE(Dtt, Tdt) \ ((Dtt)->array[(Dtt)->pos++] = Tdt) @@ -50,6 +63,9 @@ #define EMPTY_NODE(Dtt) (TOP_NODE(Dtt) == NULL) +#define DEC_NITEMS(DB) \ + erts_flxctr_dec(&(DB)->common.counters, ERTS_DB_TABLE_NITEMS_COUNTER_ID) + static ERTS_INLINE void free_term(DbTable *tb, TreeDbTerm* p) { db_free_term(tb, p, offsetof(TreeDbTerm, dbterm)); diff --git a/erts/emulator/beam/erl_db_util.h b/erts/emulator/beam/erl_db_util.h index e3d3c0e804..97f2848679 100644 --- a/erts/emulator/beam/erl_db_util.h +++ b/erts/emulator/beam/erl_db_util.h @@ -21,6 +21,7 @@ #ifndef _DB_UTIL_H #define _DB_UTIL_H +#include "erl_flxctr.h" #include "global.h" #include "erl_message.h" #include "erl_bif_unique.h" @@ -207,8 +208,12 @@ typedef struct db_table_method enum DbIterSafety*); int (*db_take)(Process *, DbTable *, Eterm, Eterm *); - SWord (*db_delete_all_objects)(Process* p, DbTable* db, SWord reds); - + SWord (*db_delete_all_objects)(Process* p, + DbTable* db, + SWord reds, + Eterm* nitems_holder_wb); + Eterm (*db_delete_all_objects_get_nitems_from_holder)(Process* p, + Eterm nitems_holder); int (*db_free_empty_table)(DbTable* db); SWord (*db_free_table_continue)(DbTable* db, SWord reds); @@ -257,6 +262,9 @@ typedef struct { DbTable *prev; } DbTableList; +#define ERTS_DB_TABLE_NITEMS_COUNTER_ID 0 +#define ERTS_DB_TABLE_MEM_COUNTER_ID 1 + /* * This structure contains data for all different types of database * tables. Note that these fields must match the same fields @@ -281,8 +289,11 @@ typedef struct db_table_common { Eterm the_name; /* an atom */ Binary *btid; DbTableMethod* meth; /* table methods */ - erts_atomic_t nitems; /* Total number of items in table */ - erts_atomic_t memory_size;/* Total memory size. NOTE: in bytes! */ + /* The ErtsFlxCtr below contains: + * - Total number of items in table + * - Total memory size (NOTE: in bytes!) */ + ErtsFlxCtr counters; + char extra_for_flxctr[ERTS_FLXCTR_NR_OF_EXTRA_BYTES(2)]; struct { /* Last fixation time */ ErtsMonotonicTime monotonic; ErtsMonotonicTime offset; diff --git a/erts/emulator/beam/erl_flxctr.c b/erts/emulator/beam/erl_flxctr.c new file mode 100644 index 0000000000..35f4a21508 --- /dev/null +++ b/erts/emulator/beam/erl_flxctr.c @@ -0,0 +1,370 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2019. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +/* + * Author: Kjell Winblad + */ + +#include "erl_flxctr.h" + +static int reader_groups_array_size = 0; +#define ERTS_FLXCTR_DECENTRALIZED_NO_SLOTS (reader_groups_array_size) + +static int erts_flxctr_read_ctx_bin_dtor(Binary *context_bin); +static int erts_flxctr_wait_dtor(Binary *context_bin); + +typedef struct { + ErtsThrPrgrLaterOp later_op; + Process* process; + ErtsFlxCtrDecentralizedCtrArray* array; + ErtsFlxCtrDecentralizedCtrArray* next_array; + ErtsAlcType_t alloc_type; + int nr_of_counters; + Sint result[ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE]; +} DecentralizedReadSnapshotInfo; + +typedef enum { + ERTS_FLXCTR_SNAPSHOT_NOT_ONGOING = 0, + ERTS_FLXCTR_SNAPSHOT_ONGOING = 1, + ERTS_FLXCTR_SNAPSHOT_ONGOING_TP_THREAD_DO_FREE = 2 +} erts_flxctr_snapshot_status; + +static void +thr_prg_wake_up_and_count(void* bin_p) +{ + Binary* bin = bin_p; + DecentralizedReadSnapshotInfo* info = ERTS_MAGIC_BIN_DATA(bin); + Process* p = info->process; + ErtsFlxCtrDecentralizedCtrArray* array = info->array; + ErtsFlxCtrDecentralizedCtrArray* next = info->next_array; + int i, sched; + /* Reset result array */ + for (i = 0; i < info->nr_of_counters; i++) { + info->result[i] = 0; + } + /* Read result from snapshot */ + for (sched = 0; sched < ERTS_FLXCTR_DECENTRALIZED_NO_SLOTS; sched++) { + for (i = 0; i < info->nr_of_counters; i++) { + info->result[i] = info->result[i] + + erts_atomic_read_nob(&array->array[sched].counters[i]); + } + } + /* Update the next decentralized counter array */ + for (i = 0; i < info->nr_of_counters; i++) { + erts_atomic_add_nob(&next->array[0].counters[i], info->result[i]); + } + /* Announce that the snapshot is done */ + { + Sint expected = ERTS_FLXCTR_SNAPSHOT_ONGOING; + if (expected != erts_atomic_cmpxchg_mb(&next->snapshot_status, + ERTS_FLXCTR_SNAPSHOT_NOT_ONGOING, + expected)) { + /* The CAS failed which means that this thread need to free the next array. */ + erts_free(info->alloc_type, next->block_start); + } + } + /* Resume the process that requested the snapshot */ + erts_proc_lock(p, ERTS_PROC_LOCK_STATUS); + if (!ERTS_PROC_IS_EXITING(p)) { + erts_resume(p, ERTS_PROC_LOCK_STATUS); + } + /* Free the memory that is no longer needed */ + erts_free(info->alloc_type, array->block_start); + erts_proc_unlock(p, ERTS_PROC_LOCK_STATUS); + erts_proc_dec_refc(p); + erts_bin_release(bin); +} + +typedef struct { + ErtsThrPrgrLaterOp later_op; + Process* process; +} ErtsFlxCtrWakeUpLaterInfo; + +static void +thr_prg_wake_up_later(void* bin_p) +{ + Binary* bin = bin_p; + ErtsFlxCtrWakeUpLaterInfo* info = ERTS_MAGIC_BIN_DATA(bin); + Process* p = info->process; + /* Resume the requesting process */ + erts_proc_lock(p, ERTS_PROC_LOCK_STATUS); + if (!ERTS_PROC_IS_EXITING(p)) { + erts_resume(p, ERTS_PROC_LOCK_STATUS); + } + erts_proc_unlock(p, ERTS_PROC_LOCK_STATUS); + /* Free data */ + erts_proc_dec_refc(p); + erts_bin_release(bin); +} + +static +int erts_flxctr_read_ctx_bin_dtor(Binary *context_bin) { + (void)context_bin; + return 1; +} + +static +int erts_flxctr_wait_dtor(Binary *context_bin) { + (void)context_bin; + return 1; +} + +static void suspend_until_thr_prg(Process* p) +{ + Binary* state_bin; + ErtsFlxCtrWakeUpLaterInfo* info; + state_bin = erts_create_magic_binary(sizeof(ErtsFlxCtrWakeUpLaterInfo), + erts_flxctr_wait_dtor); + info = ERTS_MAGIC_BIN_DATA(state_bin); + info->process = p; + erts_refc_inctest(&state_bin->intern.refc, 1); + erts_suspend(p, ERTS_PROC_LOCK_MAIN, NULL); + erts_proc_inc_refc(p); + ERTS_VBUMP_ALL_REDS(p); + erts_schedule_thr_prgr_later_op(thr_prg_wake_up_later, state_bin, &info->later_op); +} + + +static ErtsFlxCtrDecentralizedCtrArray* +create_decentralized_ctr_array(ErtsAlcType_t alloc_type, Uint nr_of_counters) { + /* Allocate an ErtsFlxCtrDecentralizedCtrArray and make sure that + the array field is located at the start of a cache line */ + char* bytes = + erts_alloc(alloc_type, + sizeof(ErtsFlxCtrDecentralizedCtrArray) + + (sizeof(ErtsFlxCtrDecentralizedCtrArrayElem) * + ERTS_FLXCTR_DECENTRALIZED_NO_SLOTS) + + ERTS_CACHE_LINE_SIZE); + void* block_start = bytes; + int bytes_to_next_cacheline_border; + ErtsFlxCtrDecentralizedCtrArray* array; + int i, sched; + bytes = &bytes[offsetof(ErtsFlxCtrDecentralizedCtrArray, array)]; + bytes_to_next_cacheline_border = + ERTS_CACHE_LINE_SIZE - (((Uint)bytes) % ERTS_CACHE_LINE_SIZE); + array = (ErtsFlxCtrDecentralizedCtrArray*) + (&bytes[bytes_to_next_cacheline_border - + (int)offsetof(ErtsFlxCtrDecentralizedCtrArray, array)]); + ASSERT(((Uint)array->array) % ERTS_CACHE_LINE_SIZE == 0); + ASSERT(((Uint)array - (Uint)block_start) <= ERTS_CACHE_LINE_SIZE); + /* Initialize fields */ + erts_atomic_init_nob(&array->snapshot_status, ERTS_FLXCTR_SNAPSHOT_ONGOING); + for (sched = 0; sched < ERTS_FLXCTR_DECENTRALIZED_NO_SLOTS; sched++) { + for (i = 0; i < nr_of_counters; i++) { + erts_atomic_init_nob(&array->array[sched].counters[i], 0); + } + } + array->block_start = block_start; + return array; +} + +void erts_flxctr_setup(int decentralized_counter_groups) +{ + reader_groups_array_size = decentralized_counter_groups+1; +} + +void erts_flxctr_init(ErtsFlxCtr* c, + int is_decentralized, + Uint nr_of_counters, + ErtsAlcType_t alloc_type) +{ + ASSERT(nr_of_counters <= ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE); + c->is_decentralized = is_decentralized; + c->nr_of_counters = nr_of_counters; + if (c->is_decentralized) { + ErtsFlxCtrDecentralizedCtrArray* array = + create_decentralized_ctr_array(alloc_type, nr_of_counters); + erts_atomic_set_nob(&array->snapshot_status, + ERTS_FLXCTR_SNAPSHOT_NOT_ONGOING); + erts_atomic_init_nob(&c->u.counters_ptr, (Sint)array); + ASSERT(((Uint)array->array) % ERTS_CACHE_LINE_SIZE == 0); + } else { + int i; + for (i = 0; i < nr_of_counters; i++) { + erts_atomic_init_nob(&c->u.counters[i], 0); + } + } +} + +void erts_flxctr_destroy(ErtsFlxCtr* c, ErtsAlcType_t type) +{ + if (c->is_decentralized) { + if (erts_flxctr_is_snapshot_ongoing(c)) { + ErtsFlxCtrDecentralizedCtrArray* array = + ERTS_FLXCTR_GET_CTR_ARRAY_PTR(c); + /* Try to delegate the resposibilty of freeing to + thr_prg_wake_up_and_count */ + Sint expected = ERTS_FLXCTR_SNAPSHOT_ONGOING; + if (expected != + erts_atomic_cmpxchg_mb(&array->snapshot_status, + ERTS_FLXCTR_SNAPSHOT_ONGOING_TP_THREAD_DO_FREE, + expected)) { + /* The delegation was unsuccessful which means that no + snapshot is ongoing anymore and the freeing needs + to be done here */ + ERTS_ASSERT(!erts_flxctr_is_snapshot_ongoing(c)); + erts_free(type, array->block_start); + } + } else { + erts_free(type, ERTS_FLXCTR_GET_CTR_ARRAY_PTR(c)->block_start); + } + } +} + +ErtsFlxCtrSnapshotResult +erts_flxctr_snapshot(ErtsFlxCtr* c, + ErtsAlcType_t alloc_type, + Process* p) +{ + if (c->is_decentralized) { + ErtsFlxCtrDecentralizedCtrArray* array = ERTS_FLXCTR_GET_CTR_ARRAY_PTR(c); + if (erts_flxctr_is_snapshot_ongoing(c)) { + /* Let the caller try again later */ + ErtsFlxCtrSnapshotResult res = + {.type = ERTS_FLXCTR_TRY_AGAIN_AFTER_TRAP}; + suspend_until_thr_prg(p); + return res; + } else { + Eterm* hp; + Binary* state_bin; + Eterm state_mref; + DecentralizedReadSnapshotInfo* info; + ErtsFlxCtrDecentralizedCtrArray* new_array = + create_decentralized_ctr_array(alloc_type, c->nr_of_counters); + int success = + ((Sint)array) == erts_atomic_cmpxchg_mb(&c->u.counters_ptr, + (Sint)new_array, + (Sint)array); + if (!success) { + /* Let the caller try again later */ + ErtsFlxCtrSnapshotResult res = + {.type = ERTS_FLXCTR_TRY_AGAIN_AFTER_TRAP}; + suspend_until_thr_prg(p); + erts_free(alloc_type, new_array->block_start); + return res; + } + /* Create binary with info about the operation that can be + sent to the caller and to a thread progress function */ + state_bin = + erts_create_magic_binary(sizeof(DecentralizedReadSnapshotInfo), + erts_flxctr_read_ctx_bin_dtor); + hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE); + state_mref = erts_mk_magic_ref(&hp, &MSO(p), state_bin); + info = ERTS_MAGIC_BIN_DATA(state_bin); + info->alloc_type = alloc_type; + info->array = array; + info->next_array = new_array; + info->process = p; + info->nr_of_counters = c->nr_of_counters; + erts_proc_inc_refc(p); + erts_refc_inctest(&state_bin->intern.refc, 2); + erts_suspend(p, ERTS_PROC_LOCK_MAIN, NULL); + ERTS_VBUMP_ALL_REDS(p); + erts_schedule_thr_prgr_later_op(thr_prg_wake_up_and_count, + state_bin, + &info->later_op); + { + ErtsFlxCtrSnapshotResult res = { + .type = ERTS_FLXCTR_GET_RESULT_AFTER_TRAP, + .trap_resume_state = state_mref}; + return res; + } + } + } else { + ErtsFlxCtrSnapshotResult res; + int i; + res.type = ERTS_FLXCTR_DONE; + for (i = 0; i < c->nr_of_counters; i++){ + res.result[i] = erts_flxctr_read_centralized(c, i); + } + return res; + } +} + + +Sint erts_flxctr_get_snapshot_result_after_trap(Eterm result_holder, + Uint counter_nr) +{ + Binary* bin = erts_magic_ref2bin(result_holder); + DecentralizedReadSnapshotInfo* data = ERTS_MAGIC_BIN_DATA(bin);; + return data->result[counter_nr]; +} + +int erts_flxctr_is_snapshot_result(Eterm term) +{ + if (is_internal_magic_ref(term)) { + Binary* bin = erts_magic_ref2bin(term); + return ERTS_MAGIC_BIN_DESTRUCTOR(bin) == erts_flxctr_read_ctx_bin_dtor; + } else return 0; +} + +Sint erts_flxctr_read_approx(ErtsFlxCtr* c, + Uint counter_nr) +{ + if (c->is_decentralized) { + ErtsFlxCtrDecentralizedCtrArray* counter = ERTS_FLXCTR_GET_CTR_ARRAY_PTR(c); + Sint sum = 0; + int sched; + for (sched = 0; sched < ERTS_FLXCTR_DECENTRALIZED_NO_SLOTS; sched++) { + sum = sum + erts_atomic_read_nob(&counter->array[sched].counters[counter_nr]); + } + return sum; + } else { + return erts_flxctr_read_centralized(c, counter_nr); + } +} + +int erts_flxctr_is_snapshot_ongoing(ErtsFlxCtr* c) +{ + return c->is_decentralized && + (ERTS_FLXCTR_SNAPSHOT_NOT_ONGOING != + erts_atomic_read_acqb(&ERTS_FLXCTR_GET_CTR_ARRAY_PTR(c)->snapshot_status)); +} + +int erts_flxctr_suspend_until_thr_prg_if_snapshot_ongoing(ErtsFlxCtr* c, Process* p) +{ + if (erts_flxctr_is_snapshot_ongoing(c)) { + suspend_until_thr_prg(p); + return 1; + } else { + return 0; + } +} + +void erts_flxctr_reset(ErtsFlxCtr* c, + Uint counter_nr) +{ + if (c->is_decentralized) { + int sched; + ErtsFlxCtrDecentralizedCtrArray* counter = + ERTS_FLXCTR_GET_CTR_ARRAY_PTR(c); + for (sched = 0; sched < ERTS_FLXCTR_DECENTRALIZED_NO_SLOTS; sched++) { + erts_atomic_set_nob(&counter->array[sched].counters[counter_nr], 0); + } + } else { + erts_atomic_set_nob(&c->u.counters[counter_nr], 0); + } +} + + +void erts_flxctr_set_slot(int group) { + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + esdp->flxctr_slot_no = group; +} diff --git a/erts/emulator/beam/erl_flxctr.h b/erts/emulator/beam/erl_flxctr.h new file mode 100644 index 0000000000..5cab02b9eb --- /dev/null +++ b/erts/emulator/beam/erl_flxctr.h @@ -0,0 +1,406 @@ +/* + * %CopyrightBegin% + * + * Copyright Ericsson AB 2019. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * %CopyrightEnd% + */ + +/** + * @file erl_flxctr.h + * + * @brief This file contains the API of a flexible counter. The + * counter can be configured during its initialization to be + * centralized or decentralized. The centralized configuration makes + * it possible to read the counter value extremely efficiently, but + * updates of the counter value can easily cause contention. The + * decentralized configuration has the reverse trade-off (i.e., + * updates are efficient and scalable but reading the counter value is + * slow and may cause contention). + * + * @author Kjell Winblad + */ + +#ifndef ERL_FLXCTR_H__ +#define ERL_FLXCTR_H__ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif +#include "sys.h" +#include "erl_vm.h" +#include "global.h" +#include "error.h" +#include "bif.h" +#include "big.h" +#include "erl_binary.h" +#include "bif.h" +#include <stddef.h> + +/* Public Interface */ + +#define ERTS_MAX_FLXCTR_GROUPS 256 +#define ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE (ERTS_CACHE_LINE_SIZE / sizeof(erts_atomic_t)) + +typedef struct { + int nr_of_counters; + int is_decentralized; + union { + erts_atomic_t counters_ptr; + erts_atomic_t counters[1]; + } u; +} ErtsFlxCtr; + +#define ERTS_FLXCTR_NR_OF_EXTRA_BYTES(NR_OF_COUNTERS) \ + ((NR_OF_COUNTERS-1) * sizeof(erts_atomic_t)) + +/* Called by early_init */ +void erts_flxctr_setup(int decentralized_counter_groups); + +/** + * @brief Initializes an ErtsFlxCtr. The macro + * ERTS_FLXCTR_NR_OF_EXTRA_BYTES should be used to determine how much + * extra space that needs to be allocated directly after the + * ErtsFlxCtr when is_decentralized is set to zero. Each ErtsFlxCtr + * instance may contain up to ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE + * counters. These counters are numbered from zero to + * (ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE-1). Most of the functions in + * this module take a parameter named counter_nr that controls which + * of the ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE counters in the given + * ErtsFlxCtr that should be operated on. + * + * @param c The counter to initialize + * @param is_decentralized Non-zero value to make c decentralized + * @param nr_of_counters The number of counters included in c + * (max ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE) + * @param alloc_type + */ +void erts_flxctr_init(ErtsFlxCtr* c, + int is_decentralized, + Uint nr_of_counters, + ErtsAlcType_t alloc_type); + +/** + * @brief Destroys an initialized counter. + * + * @param c The counter that should be destroyed + * @param alloc_type The allocation type (needs to be the same as the + * one passed to erts_flxctr_init when c was + * initialized) + */ +void erts_flxctr_destroy(ErtsFlxCtr* c, ErtsAlcType_t alloc_type); + +/** + * @brief Adds to_add to the counter with counter_nr in c + * + * @param c the ErtsFlxCtr to operate on + * @param counter_nr The number of the counter in c to modify + * @param to_add The amount that should be added to the specified counter + */ +ERTS_GLB_INLINE +void erts_flxctr_add(ErtsFlxCtr* c, + Uint counter_nr, + int to_add); + +/** + * @brief Increases the specified counter by 1 + * + * @param c The ErtsFlxCtr instance to operate on + * @param counter_nr The number of the counter within c to operate on + */ +ERTS_GLB_INLINE +void erts_flxctr_inc(ErtsFlxCtr* c, + Uint counter_nr); + +/** + * @brief Decreases the specified counter by 1 + */ +ERTS_GLB_INLINE +void erts_flxctr_dec(ErtsFlxCtr* c, + Uint counter_nr); + +/** + * @brief This function tries to return the current value of the + * specified counter but may return an incorrect result if the counter + * is decentralized and other threads are accessing the counter + * concurrently. + * + * @param c The ErtsFlxCtr instance to operate on + * @param counter_nr The number of the counter within c to operate on + * + * @return A snapshot of the specifed counter if c is centralized or a + * possibly incorrect estimate of the counter value if c is + * decentralized + */ +Sint erts_flxctr_read_approx(ErtsFlxCtr* c, + Uint counter_nr); + +/** + * @brief This function can only be used together with an ErtsFlxCtr + * that is configured to be centralized. The function increments the + * specified counter by 1 and returns the value of the counter after + * the increment. + */ +ERTS_GLB_INLINE +Sint erts_flxctr_inc_read_centralized(ErtsFlxCtr* c, + Uint counter_nr); + +/** + * @brief This function can only be used together with a ErtsFlxCtr + * that is configured to be centralized. The function decrements the + * specified counter by 1 and returns the value of the counter after + * the operation. + */ +ERTS_GLB_INLINE +Sint erts_flxctr_dec_read_centralized(ErtsFlxCtr* c, + Uint counter_nr); + +/** + * @brief This function can only be used together with an ErtsFlxCtr + * that is configured to be centralized. The function returns the + * current value of the specified counter. + */ +ERTS_GLB_INLINE +Sint erts_flxctr_read_centralized(ErtsFlxCtr* c, + Uint counter_nr); + + +typedef enum { + ERTS_FLXCTR_TRY_AGAIN_AFTER_TRAP, + ERTS_FLXCTR_DONE, + ERTS_FLXCTR_GET_RESULT_AFTER_TRAP +} ErtsFlxctrSnapshotResultType; + +typedef struct { + ErtsFlxctrSnapshotResultType type; + Eterm trap_resume_state; + Sint result[ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE]; +} ErtsFlxCtrSnapshotResult; + +/** + * @brief This function initiates an atomic snapshot of an ErtsFlxCtr + * to read out the values of one or more of the counters that are + * stored in the given ErtsFlxCtr. The caller needs to perform + * different actions after the return of this function depending on + * the value of the type field in the returned struct: + * + * - The caller needs to trap and try again after the trap if the + * return value has the type ERTS_FLXCTR_TRY_AGAIN_AFTER_TRAP. + * + * - The caller can get the result directly from the result field of + * the returned struct if the return value has the type + * ERTS_FLXCTR_DONE. The value at index i in the result field + * correspond to counter number i. + * + * - Finally, if the return value has the type + * ERTS_FLXCTR_GET_RESULT_AFTER_TRAP, then the caller needs to save + * the value of the field trap_resume_state from the returned struct + * and trap. After the trap, the values of the counters can be + * obtained by using the function + * erts_flxctr_get_snapshot_result_after_trap. Note that the + * function erts_flxctr_is_snapshot_result can be used to check if a + * value is obtained from the trap_resume_state field in the + * returned struct (this can be useful when the calling function + * wakes up again after the trap). + * + * The snapshot operation that is initiated by this function should be + * considered to be ongoing from the issuing of this function until a + * struct with the type field set to ERTS_FLXCTR_DONE has been + * returned from the function or until the caller of this function has + * woken up after trapping. + * + * @param c The ErtsFlxCtr that the snapshot shall be taken from + * @param alloc_type The allocation type (needs to be the same as the + * type passed to erts_flxctr_init when c was + * initialized) + * @param p The Erlang process that is doing the call + * + * @return See the description above + * + */ +ErtsFlxCtrSnapshotResult +erts_flxctr_snapshot(ErtsFlxCtr* c, + ErtsAlcType_t alloc_type, + Process* p); + +/** + * @brief Checks if the parameter term is a snapshot result (i.e., + * something obtained from the trap_resume_state field of an + * ErtsFlxCtrSnapshotResult struct that has been returned from + * erts_flxctr_snapshot). + * + * @param term The term to check + * + * @return A nonzero value iff the term is a snapshot result + */ +int erts_flxctr_is_snapshot_result(Eterm term); + +/** + * @brief Returns the result of a snapshot for a counter given a + * snapshot result returned by a call to erts_flxctr_snapshot (i.e., + * the value stored in the trap_resume_state field of a struct + * returned by erts_flxctr_snapshot). The caller needs to trap between + * the return of erts_flxctr_snapshot and the call to this function. + */ +Sint erts_flxctr_get_snapshot_result_after_trap(Eterm trap_resume_state, + Uint counter_nr); + +/** + * @brief Resets the specified counter to 0. This function is unsafe + * to call while a snapshot operation may be active (initiated with + * the erts_flxctr_snapshot function). + */ +void erts_flxctr_reset(ErtsFlxCtr* c, + Uint counter_nr); + +/** + * @brief Checks if a snapshot operation is active (snapshots are + * initiated with the erts_flxctr_snapshot function). + * + * @return nonzero value iff a snapshot was active at some point + * between the invocation and return of the function + */ +int erts_flxctr_is_snapshot_ongoing(ErtsFlxCtr* c); + +/** + * @brief This function checks if a snapshot operation is ongoing + * (snapshots are initiated with the erts_flxctr_snapshot function) + * and suspend the given process until thread progress has happened if + * it detected an ongoing snapshot operation. The caller needs to trap + * if a non-zero value is returned. + * + * @param c The ErtsFlxCtr to check + * @param p The calling process + * + * @return nonzero value if the given process has got suspended + */ +int erts_flxctr_suspend_until_thr_prg_if_snapshot_ongoing(ErtsFlxCtr* c, Process* p); + +/* End: Public Interface */ + +/* Internal Declarations */ + +#define ERTS_FLXCTR_GET_CTR_ARRAY_PTR(C) \ + ((ErtsFlxCtrDecentralizedCtrArray*) erts_atomic_read_acqb(&(C)->u.counters_ptr)) +#define ERTS_FLXCTR_GET_CTR_PTR(C, SCHEDULER_ID, COUNTER_ID) \ + &(ERTS_FLXCTR_GET_CTR_ARRAY_PTR(C))->array[SCHEDULER_ID].counters[COUNTER_ID] + + +typedef union { + erts_atomic_t counters[ERTS_FLXCTR_ATOMICS_PER_CACHE_LINE]; + char pad[ERTS_CACHE_LINE_SIZE]; +} ErtsFlxCtrDecentralizedCtrArrayElem; + +typedef struct ErtsFlxCtrDecentralizedCtrArray { + void* block_start; + erts_atomic_t snapshot_status; + ErtsFlxCtrDecentralizedCtrArrayElem array[]; +} ErtsFlxCtrDecentralizedCtrArray; + +void erts_flxctr_set_slot(int group); + +ERTS_GLB_INLINE +int erts_flxctr_get_slot_index(void); + +/* End: Internal Declarations */ + + +/* Implementation of inlined functions */ + +#if ERTS_GLB_INLINE_INCL_FUNC_DEF + +ERTS_GLB_INLINE +int erts_flxctr_get_slot_index(void) +{ + ErtsSchedulerData *esdp = erts_get_scheduler_data(); + ASSERT(esdp && !ERTS_SCHEDULER_IS_DIRTY(esdp)); + ASSERT(esdp->flxctr_slot_no > 0); + return esdp->flxctr_slot_no; +} + +ERTS_GLB_INLINE +void erts_flxctr_add(ErtsFlxCtr* c, + Uint counter_nr, + int to_add) +{ + ASSERT(counter_nr < c->nr_of_counters); + if (c->is_decentralized) { + erts_atomic_add_nob(ERTS_FLXCTR_GET_CTR_PTR(c, + erts_flxctr_get_slot_index(), + counter_nr), + to_add); + } else { + erts_atomic_add_nob(&c->u.counters[counter_nr], to_add); + } +} + +ERTS_GLB_INLINE +void erts_flxctr_inc(ErtsFlxCtr* c, + Uint counter_nr) +{ + ASSERT(counter_nr < c->nr_of_counters); + if (c->is_decentralized) { + erts_atomic_inc_nob(ERTS_FLXCTR_GET_CTR_PTR(c, + erts_flxctr_get_slot_index(), + counter_nr)); + } else { + erts_atomic_inc_read_nob(&c->u.counters[counter_nr]); + } +} + +ERTS_GLB_INLINE +void erts_flxctr_dec(ErtsFlxCtr* c, + Uint counter_nr) +{ + ASSERT(counter_nr < c->nr_of_counters); + if (c->is_decentralized) { + erts_atomic_dec_nob(ERTS_FLXCTR_GET_CTR_PTR(c, + erts_flxctr_get_slot_index(), + counter_nr)); + } else { + erts_atomic_dec_nob(&c->u.counters[counter_nr]); + } +} + +ERTS_GLB_INLINE +Sint erts_flxctr_inc_read_centralized(ErtsFlxCtr* c, + Uint counter_nr) +{ + ASSERT(counter_nr < c->nr_of_counters); + ASSERT(!c->is_decentralized); + return erts_atomic_inc_read_nob(&c->u.counters[counter_nr]); +} + +ERTS_GLB_INLINE +Sint erts_flxctr_dec_read_centralized(ErtsFlxCtr* c, + Uint counter_nr) +{ + ASSERT(counter_nr < c->nr_of_counters); + ASSERT(!c->is_decentralized); + return erts_atomic_dec_read_nob(&c->u.counters[counter_nr]); +} + +ERTS_GLB_INLINE +Sint erts_flxctr_read_centralized(ErtsFlxCtr* c, + Uint counter_nr) +{ + ASSERT(counter_nr < c->nr_of_counters); + ASSERT(!c->is_decentralized); + return erts_atomic_read_nob(&((erts_atomic_t*)(c->u.counters))[counter_nr]); +} + +#endif /* #if ERTS_GLB_INLINE_INCL_FUNC_DEF */ + +#endif /* ERL_FLXCTR_H__ */ diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index 12750b9aa6..547e4064a2 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -593,6 +593,7 @@ void erts_usage(void) erts_fprintf(stderr, " no_time_warp|single_time_warp|multi_time_warp\n"); erts_fprintf(stderr, "-d don't write a crash dump for internally detected errors\n"); erts_fprintf(stderr, " (halt(String) will still produce a crash dump)\n"); + erts_fprintf(stderr, "-dcg set the limit for the number of decentralized counter groups\n"); erts_fprintf(stderr, "-fn[u|a|l] Control how filenames are interpreted\n"); erts_fprintf(stderr, "-hms size set minimum heap size in words (default %d)\n", H_DEFAULT_SIZE); @@ -785,6 +786,8 @@ early_init(int *argc, char **argv) /* int dirty_io_scheds; int max_reader_groups; int reader_groups; + int max_decentralized_counter_groups; + int decentralized_counter_groups; char envbuf[21]; /* enough for any 64-bit integer */ size_t envbufsz; @@ -804,7 +807,8 @@ early_init(int *argc, char **argv) /* erts_initialized = 0; - erts_pre_early_init_cpu_topology(&max_reader_groups, + erts_pre_early_init_cpu_topology(&max_decentralized_counter_groups, + &max_reader_groups, &ncpu, &ncpuonln, &ncpuavail); @@ -865,6 +869,24 @@ early_init(int *argc, char **argv) /* } if (argv[i][0] == '-') { switch (argv[i][1]) { + case 'd': { + char *sub_param = argv[i]+2; + if (has_prefix("cg", sub_param)) { + char *arg = get_arg(sub_param+2, argv[i+1], &i); + if (sscanf(arg, "%d", &max_decentralized_counter_groups) != 1) { + erts_fprintf(stderr, + "bad decentralized counter groups limit: %s\n", arg); + erts_usage(); + } + if (max_decentralized_counter_groups < 0) { + erts_fprintf(stderr, + "bad decentralized counter groups limit: %d\n", + max_decentralized_counter_groups); + erts_usage(); + } + } + break; + } case 'r': { char *sub_param = argv[i]+2; if (has_prefix("g", sub_param)) { @@ -1186,8 +1208,10 @@ early_init(int *argc, char **argv) /* erts_early_init_cpu_topology(no_schedulers, &max_main_threads, max_reader_groups, - &reader_groups); - + &reader_groups, + max_decentralized_counter_groups, + &decentralized_counter_groups); + erts_flxctr_setup(decentralized_counter_groups); { erts_thr_late_init_data_t elid = ERTS_THR_LATE_INIT_DATA_DEF_INITER; elid.mem.std.alloc = ethr_std_alloc; diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h index 711b73417d..6118c671ee 100644 --- a/erts/emulator/beam/erl_process.h +++ b/erts/emulator/beam/erl_process.h @@ -641,6 +641,7 @@ struct ErtsSchedulerData_ { ErtsSchedType type; Uint no; /* Scheduler number for normal schedulers */ Uint dirty_no; /* Scheduler number for dirty schedulers */ + int flxctr_slot_no; /* slot nr when a flxctr is used */ struct enif_environment_t *current_nif; Process *dirty_shadow_process; Port *current_port; @@ -1847,6 +1848,7 @@ int erts_resume_processes(ErtsProcList *); void erts_deep_process_dump(fmtfn_t, void *); Eterm erts_get_reader_groups_map(Process *c_p); +Eterm erts_get_decentralized_counter_groups_map(Process *c_p); Eterm erts_debug_reader_groups_map(Process *c_p, int groups); Uint erts_debug_nbalance(void); diff --git a/lib/stdlib/test/ets_SUITE.erl b/lib/stdlib/test/ets_SUITE.erl index 87ca9bd32c..cd32f25abd 100644 --- a/lib/stdlib/test/ets_SUITE.erl +++ b/lib/stdlib/test/ets_SUITE.erl @@ -42,6 +42,8 @@ select_bound_chunk/1, t_delete_all_objects/1, t_insert_list/1, t_test_ms/1, t_select_delete/1,t_select_replace/1,t_select_replace_next_bug/1,t_ets_dets/1]). +-export([test_table_size_concurrency/1,test_table_memory_concurrency/1, + test_delete_table_while_size_snapshot/1, test_delete_table_while_size_snapshot_helper/0]). -export([ordered/1, ordered_match/1, interface_equality/1, fixtable_next/1, fixtable_insert/1, rename/1, rename_unnamed/1, evil_rename/1, @@ -156,7 +158,11 @@ all() -> whereis_table, delete_unfix_race, test_throughput_benchmark, - {group, benchmark}]. + {group, benchmark}, + test_table_size_concurrency, + test_table_memory_concurrency, + test_delete_table_while_size_snapshot]. + groups() -> [{new, [], @@ -828,7 +834,11 @@ adjust_xmem([_T1,_T2,_T3,_T4], {A0,B0,C0,D0} = _Mem0, EstCnt) -> {TabSz, EstSz} = erts_debug:get_internal_state('DbTable_words'), HTabSz = TabSz + EstCnt*EstSz, - {A0+TabSz, B0+HTabSz, C0+HTabSz, D0+HTabSz}. + OrdSetExtra = case erlang:system_info(wordsize) of + 8 -> 40; % larger stack on 64 bit architectures + _ -> 0 + end, + {A0+TabSz+OrdSetExtra, B0+HTabSz, C0+HTabSz, D0+HTabSz}. %% Misc. whitebox tests t_whitebox(Config) when is_list(Config) -> @@ -4102,6 +4112,11 @@ slot_do(Opts) -> fill_tab(Tab,foo), Elts = ets:info(Tab,size), Elts = slot_loop(Tab,0,0), + case ets:info(Tab, type) of + ordered_set -> + '$end_of_table' = ets:slot(Tab,Elts); + _ -> ok + end, true = ets:delete(Tab), verify_etsmem(EtsMem). @@ -4453,6 +4468,127 @@ info_do(Opts) -> undefined = ets:info(non_existing_table_xxyy,safe_fixed), verify_etsmem(EtsMem). +size_loop(_T, 0, _, _) -> + ok; +size_loop(T, I, PrevSize, WhatToTest) -> + Size = ets:info(T, WhatToTest), + case Size < PrevSize of + true -> ct:fail("Bad ets:info/2"); + _ -> ok + end, + size_loop(T, I -1, Size, WhatToTest). + +add_loop(_T, 0) -> + ok; +add_loop(T, I) -> + ets:insert(T, {I}), + add_loop(T, I -1). + + +test_table_counter_concurrency(WhatToTest) -> + ItemsToAdd = 1000000, + SizeLoopSize = 1000, + T = ets:new(k, [public, ordered_set, {write_concurrency, true}]), + 0 = ets:info(T, size), + P = self(), + SpawnedSizeProcs = + [spawn(fun() -> + size_loop(T, SizeLoopSize, 0, WhatToTest), + P ! done + end) + || _ <- lists:seq(1, 6)], + spawn(fun() -> + add_loop(T, ItemsToAdd), + P ! done_add + end), + [receive + done -> ok; + done_add -> ok + end + || _ <- [ok|SpawnedSizeProcs]], + case WhatToTest =:= size of + true -> + ItemsToAdd = ets:info(T, size); + _ -> + ok + end, + ok. + +test_table_size_concurrency(Config) when is_list(Config) -> + test_table_counter_concurrency(size). + +test_table_memory_concurrency(Config) when is_list(Config) -> + test_table_counter_concurrency(memory). + +%% Tests that calling the ets:delete operation on a table T with +%% decentralized counters works while ets:info(T, size) operations are +%% active +test_delete_table_while_size_snapshot(Config) when is_list(Config) -> + %% Run test case in a slave node as other test suites in stdlib + %% depend on that pids are ordered in creation order which is no + %% longer the case when many processes have been started before + Node = start_slave(), + ok = rpc:call(Node, ?MODULE, test_delete_table_while_size_snapshot_helper, []), + test_server:stop_node(Node), + ok. + +test_delete_table_while_size_snapshot_helper()-> + TopParent = self(), + repeat_par( + fun() -> + Table = ets:new(t, [public, ordered_set, + {write_concurrency, true}]), + Parent = self(), + NrOfSizeProcs = 100, + Pids = [ spawn(fun()-> size_process(Table, Parent) end) + || _ <- lists:seq(1, NrOfSizeProcs)], + timer:sleep(1), + ets:delete(Table), + [receive + table_gone -> ok; + Problem -> TopParent ! Problem + end || _ <- Pids] + end, + 15000), + receive + Problem -> throw(Problem) + after 0 -> ok + end. + +size_process(Table, Parent) -> + try ets:info(Table, size) of + N when is_integer(N) -> + size_process(Table, Parent); + undefined -> Parent ! table_gone; + E -> Parent ! {got_unexpected, E} + catch + E -> Parent ! {got_unexpected_exception, E} + end. + +start_slave() -> + MicroSecs = erlang:monotonic_time(), + Name = "ets_" ++ integer_to_list(MicroSecs), + Pa = filename:dirname(code:which(?MODULE)), + {ok, Node} = test_server:start_node(list_to_atom(Name), slave, [{args, "-pa " ++ Pa}]), + Node. + +repeat_par(FunToRepeat, NrOfTimes) -> + repeat_par_help(FunToRepeat, NrOfTimes, NrOfTimes). + +repeat_par_help(_FunToRepeat, 0, OrgNrOfTimes) -> + repeat(fun()-> receive done -> ok end end, OrgNrOfTimes); +repeat_par_help(FunToRepeat, NrOfTimes, OrgNrOfTimes) -> + Parent = self(), + case NrOfTimes rem 5 of + 0 -> timer:sleep(1); + _ -> ok + end, + spawn(fun()-> + FunToRepeat(), + Parent ! done + end), + repeat_par_help(FunToRepeat, NrOfTimes-1, OrgNrOfTimes). + %% Test various duplicate_bags stuff. dups(Config) when is_list(Config) -> repeat_for_opts(fun dups_do/1). |