diff options
Diffstat (limited to 'erts')
38 files changed, 2768 insertions, 248 deletions
diff --git a/erts/doc/src/absform.xml b/erts/doc/src/absform.xml index 55aef9f8ab..4acc03b133 100644 --- a/erts/doc/src/absform.xml +++ b/erts/doc/src/absform.xml @@ -290,6 +290,18 @@ <item>If E is <c><![CDATA[fun Fc_1 ; ... ; Fc_k end]]></c> where each <c><![CDATA[Fc_i]]></c> is a function clause then Rep(E) = <c><![CDATA[{'fun',LINE,{clauses,[Rep(Fc_1), ..., Rep(Fc_k)]}}]]></c>.</item> + <item>If E is <c><![CDATA[fun Name Fc_1 ; ... ; Name Fc_k end]]></c> + where <c><![CDATA[Name]]></c> is a variable and each + <c><![CDATA[Fc_i]]></c> is a function clause then Rep(E) = + <c><![CDATA[{named_fun,LINE,Name,[Rep(Fc_1), ..., Rep(Fc_k)]}]]></c>. + </item> + <item>If E is <c><![CDATA[query [E_0 || W_1, ..., W_k] end]]></c>, + where each <c><![CDATA[W_i]]></c> is a generator or a filter, then + Rep(E) = <c><![CDATA[{'query',LINE,{lc,LINE,Rep(E_0),[Rep(W_1), ..., Rep(W_k)]}}]]></c>. + For Rep(W), see below.</item> + <item>If E is <c><![CDATA[E_0.Field]]></c>, a Mnesia record access + inside a query, then + Rep(E) = <c><![CDATA[{record_field,LINE,Rep(E_0),Rep(Field)}]]></c>.</item> <item>If E is <c><![CDATA[( E_0 )]]></c>, then Rep(E) = <c><![CDATA[Rep(E_0)]]></c>, i.e., parenthesized expressions cannot be distinguished from their bodies.</item> diff --git a/erts/doc/src/erlc.xml b/erts/doc/src/erlc.xml index 10cab344b0..c3fc3b1686 100644 --- a/erts/doc/src/erlc.xml +++ b/erts/doc/src/erlc.xml @@ -234,6 +234,16 @@ erlc +export_all file.erl</pre> from the shell.</p> <p>Supported options: -I, -o, -D, -v, -W, -b.</p> </item> + <tag>.S</tag> + <item> + <p>Erlang assembler source code. It generates a <c><![CDATA[.beam]]></c> file.</p> + <p>Supported options: same as for .erl.</p> + </item> + <tag>.core</tag> + <item> + <p>Erlang core source code. It generates a <c><![CDATA[.beam]]></c> file.</p> + <p>Supported options: same as for .erl.</p> + </item> <tag>.yrl</tag> <item> <p>Yecc source code. It generates an <c><![CDATA[.erl]]></c> file.</p> diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index eee4badfb8..96547ba743 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -116,6 +116,7 @@ atom binary_longest_prefix_trap atom binary_longest_suffix_trap atom binary_match_trap atom binary_matches_trap +atom binary_to_term_trap atom block atom blocked atom bm diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab index dd50df636c..3ec534f0bc 100644 --- a/erts/emulator/beam/bif.tab +++ b/erts/emulator/beam/bif.tab @@ -45,7 +45,6 @@ bif erlang:apply/3 bif erlang:atom_to_list/1 bif erlang:binary_to_list/1 bif erlang:binary_to_list/3 -bif erlang:binary_to_term/1 bif erlang:crc32/1 bif erlang:crc32/2 bif erlang:crc32_combine/3 @@ -152,6 +151,8 @@ bif erts_internal:port_command/3 bif erts_internal:port_control/3 bif erts_internal:port_close/1 bif erts_internal:port_connect/2 +bif erts_internal:binary_to_term/1 +bif erts_internal:binary_to_term/2 bif erts_internal:request_system_task/3 bif erts_internal:check_process_code/2 @@ -479,11 +480,6 @@ bif erlang:call_on_load_function/1 bif erlang:finish_after_on_load/2 # -# New Bifs in R13B4 -# -bif erlang:binary_to_term/2 - -# # The binary match bifs (New in R14A - EEP9) # diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c index 1c3e955f47..1728b200f7 100644 --- a/erts/emulator/beam/erl_bif_ddll.c +++ b/erts/emulator/beam/erl_bif_ddll.c @@ -182,7 +182,7 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3) Eterm name_term = BIF_ARG_2; Eterm options = BIF_ARG_3; char *path = NULL; - ErlDrvSizeT path_len; + Sint path_len; char *name = NULL; DE_Handle *dh; erts_driver_t *drv; @@ -198,6 +198,7 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3) int kill_ports = 0; int do_build_load_error = 0; int build_this_load_error = 0; + int encoding; for(l = options; is_list(l); l = CDR(list_val(l))) { Eterm opt = CAR(list_val(l)); @@ -257,18 +258,23 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3) goto error; } - if (erts_iolist_size(path_term, &path_len)) { - goto error; + encoding = erts_get_native_filename_encoding(); + if (encoding == ERL_FILENAME_WIN_WCHAR) { + /* Do not convert the lib name to utf-16le yet, do that in win32 specific code */ + /* since lib_name is used in error messages */ + encoding = ERL_FILENAME_UTF8; } - path = erts_alloc(ERTS_ALC_T_DDLL_TMP_BUF, path_len + 1 /* might need path separator */ + sys_strlen(name) + 1); - if (erts_iolist_to_buf(path_term, path, path_len) != 0) { + path = erts_convert_filename_to_encoding(path_term, NULL, 0, + ERTS_ALC_T_DDLL_TMP_BUF, 1, 0, + encoding, &path_len, + sys_strlen(name) + 2); /* might need path separator */ + if (!path) { goto error; } - while (path_len > 0 && (path[path_len-1] == '\\' || path[path_len-1] == '/')) { - --path_len; - } + ASSERT(path_len > 0 && path[path_len-1] == 0); + while (--path_len > 0 && (path[path_len-1] == '\\' || path[path_len-1] == '/')) + ; path[path_len++] = '/'; - /*path[path_len] = '\0';*/ sys_strcpy(path+path_len,name); #if DDLL_SMP @@ -1524,7 +1530,7 @@ static int do_load_driver_entry(DE_Handle *dh, char *path, char *name) assert_drv_list_rwlocked(); - if ((res = erts_sys_ddll_open(path, &(dh->handle))) != ERL_DE_NO_ERROR) { + if ((res = erts_sys_ddll_open(path, &(dh->handle), NULL)) != ERL_DE_NO_ERROR) { return res; } diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index 2b40f9272d..414ae2f046 100755 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -3613,8 +3613,9 @@ BIF_RETTYPE erts_debug_set_internal_state_2(BIF_ALIST_2) default: BIF_ERROR(BIF_P, BADARG); break; } - res = erts_set_gc_state(BIF_P, enable); - BIF_RET(res ? am_true : am_false); + res = (BIF_P->flags & F_DISABLE_GC) ? am_false : am_true; + erts_set_gc_state(BIF_P, enable); + BIF_RET(res); } else if (ERTS_IS_ATOM_STR("send_fake_exit_signal", BIF_ARG_1)) { /* Used by signal_SUITE (emulator) */ diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c index 864349491a..f298422267 100644 --- a/erts/emulator/beam/erl_bif_port.c +++ b/erts/emulator/beam/erl_bif_port.c @@ -808,7 +808,7 @@ open_port(Process* p, Eterm name, Eterm settings, int *err_typep, int *err_nump) if (encoding == ERL_FILENAME_WIN_WCHAR) { encoding = ERL_FILENAME_UTF8; } - if ((name_buf = erts_convert_filename_to_encoding(name, NULL, 0, ERTS_ALC_T_TMP,0,1, encoding, NULL)) + if ((name_buf = erts_convert_filename_to_encoding(name, NULL, 0, ERTS_ALC_T_TMP,0,1, encoding, NULL, 0)) == NULL) { goto badarg; } diff --git a/erts/emulator/beam/erl_bif_re.c b/erts/emulator/beam/erl_bif_re.c index 99c31738a5..448c6f6f6d 100644 --- a/erts/emulator/beam/erl_bif_re.c +++ b/erts/emulator/beam/erl_bif_re.c @@ -1196,8 +1196,8 @@ re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3) ovsize = 3*(unsigned_val(tp[2])+1); code_size = binary_size(tp[5]); - if ((code_tmp = (const pcre *) - erts_get_aligned_binary_bytes(tp[5], &temp_alloc)) == NULL) { + code_tmp = (const pcre *) erts_get_aligned_binary_bytes(tp[5], &temp_alloc); + if (code_tmp == NULL || code_size < 4) { erts_free_aligned_binary_bytes(temp_alloc); BIF_ERROR(p, BADARG); } diff --git a/erts/emulator/beam/erl_debug.c b/erts/emulator/beam/erl_debug.c index dc79d45be7..873a9860da 100644 --- a/erts/emulator/beam/erl_debug.c +++ b/erts/emulator/beam/erl_debug.c @@ -1,7 +1,7 @@ /* * %CopyrightBegin% * - * Copyright Ericsson AB 1998-2012. All Rights Reserved. + * Copyright Ericsson AB 1998-2013. All Rights Reserved. * * The contents of this file are subject to the Erlang Public License, * Version 1.1, (the "License"); you may not use this file except in diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index 8c4fffa75b..1af80dd04b 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -553,8 +553,8 @@ void erts_usage(void) erts_fprintf(stderr, " numbers is %d\n", ERTS_MAX_NO_OF_SCHEDULERS); erts_fprintf(stderr, "-SP p1:p2 specify schedulers (p1) and schedulers online (p2)\n"); - erts_fprintf(stderr, " as percentages of logical processors configured and logical\n"); - erts_fprintf(stderr, " processors available, respectively\n"); + erts_fprintf(stderr, " as percentages of logical processors configured and logical\n"); + erts_fprintf(stderr, " processors available, respectively\n"); erts_fprintf(stderr, "-t size set the maximum number of atoms the " "emulator can handle\n"); erts_fprintf(stderr, " valid range is [%d-%d]\n", diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c index e87959f0ab..dc285b3cf7 100644 --- a/erts/emulator/beam/erl_nif.c +++ b/erts/emulator/beam/erl_nif.c @@ -1407,7 +1407,7 @@ void* enif_dlopen(const char* lib, ErtsSysDdllError errdesc = ERTS_SYS_DDLL_ERROR_INIT; void* handle; void* init_func; - if (erts_sys_ddll_open2(lib, &handle, &errdesc) == ERL_DE_NO_ERROR) { + if (erts_sys_ddll_open(lib, &handle, &errdesc) == ERL_DE_NO_ERROR) { if (erts_sys_ddll_load_nif_init(handle, &init_func, &errdesc) == ERL_DE_NO_ERROR) { erts_sys_ddll_call_nif_init(init_func); } @@ -1587,7 +1587,8 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) encoding = ERL_FILENAME_UTF8; } lib_name = erts_convert_filename_to_encoding(BIF_ARG_1, NULL, 0, - ERTS_ALC_T_TMP, 1, 0, encoding, NULL); + ERTS_ALC_T_TMP, 1, 0, encoding, + NULL, 0); if (!lib_name) { BIF_ERROR(BIF_P, BADARG); } @@ -1626,7 +1627,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) "module '%T' not allowed", mod_atom); } else if (init_func == NULL && - (err=erts_sys_ddll_open2(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) { + (err=erts_sys_ddll_open(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) { const char slogan[] = "Failed to load NIF library"; if (strstr(errdesc.str, lib_name) != NULL) { ret = load_nif_error(BIF_P, "load_failed", "%s: '%s'", slogan, errdesc.str); diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c index 0a41fb596d..21fd8dd50a 100644 --- a/erts/emulator/beam/erl_process.c +++ b/erts/emulator/beam/erl_process.c @@ -8271,25 +8271,22 @@ save_gc_task(Process *c_p, ErtsProcSysTask *st, int prio) int erts_set_gc_state(Process *c_p, int enable) { - int res; ErtsProcSysTaskQs *dgc_tsk_qs; ASSERT(c_p == erts_get_current_process()); ASSERT((ERTS_PSFLG_RUNNING|ERTS_PSFLG_RUNNING_SYS) & erts_smp_atomic32_read_nob(&c_p->state)); ERTS_SMP_LC_ASSERT(ERTS_PROC_LOCK_MAIN == erts_proc_lc_my_proc_locks(c_p)); - res = !(c_p->flags & F_DISABLE_GC); - if (!enable) { c_p->flags |= F_DISABLE_GC; - return res; + return 0; } c_p->flags &= ~F_DISABLE_GC; dgc_tsk_qs = ERTS_PROC_GET_DELAYED_GC_TASK_QS(c_p); if (!dgc_tsk_qs) - return res; + return 0; /* Move delayed gc tasks into sys tasks queues. */ @@ -8387,7 +8384,7 @@ erts_set_gc_state(Process *c_p, int enable) if (dgc_tsk_qs) proc_sys_task_queues_free(dgc_tsk_qs); - return res; + return 1; } void diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 7e3c6681d9..3a968594f3 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1990,12 +1990,14 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf, size_t statbuf_ { int encoding = erts_get_native_filename_encoding(); return erts_convert_filename_to_encoding(name, statbuf, statbuf_size, alloc_type, - allow_empty, allow_atom, encoding, used); + allow_empty, allow_atom, encoding, + used, 0); } char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbuf_size, ErtsAlcType_t alloc_type, int allow_empty, - int allow_atom, int encoding, Sint *used) + int allow_atom, int encoding, Sint *used, + Uint extra) { char* name_buf = NULL; @@ -2008,13 +2010,14 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu } if (encoding == ERL_FILENAME_WIN_WCHAR) { need += 2; + extra *= 2; } else { ++need; } if (used) *used = (Sint) need; - if (need > statbuf_size) { - name_buf = (char *) erts_alloc(alloc_type, need); + if (need+extra > statbuf_size) { + name_buf = (char *) erts_alloc(alloc_type, need+extra); } else { name_buf = statbuf; } @@ -2035,8 +2038,8 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu /*Add 0 termination only*/ if (used) *used = (Sint) size+1; - if (size+1 > statbuf_size) { - name_buf = (char *) erts_alloc(alloc_type, size+1); + if (size+1+extra > statbuf_size) { + name_buf = (char *) erts_alloc(alloc_type, size+1+extra); } else { name_buf = statbuf; } @@ -2045,7 +2048,7 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu } else { name_buf = erts_convert_filename_to_wchar(bytes, size, statbuf, statbuf_size, - alloc_type, used, 0); + alloc_type, used, extra); } erts_free_aligned_binary_bytes(temp_alloc); } else { diff --git a/erts/emulator/beam/erl_zlib.c b/erts/emulator/beam/erl_zlib.c index 47fd92988e..8e33144f96 100644 --- a/erts/emulator/beam/erl_zlib.c +++ b/erts/emulator/beam/erl_zlib.c @@ -87,6 +87,46 @@ int ZEXPORT erl_zlib_deflate_finish(z_stream *streamp) return deflateEnd(streamp); } +int ZEXPORT erl_zlib_inflate_start(z_stream *streamp, const Bytef* source, + uLong sourceLen) +{ + streamp->next_in = (Bytef*)source; + streamp->avail_in = (uInt)sourceLen; + streamp->total_out = streamp->avail_out = 0; + streamp->next_out = NULL; + erl_zlib_alloc_init(streamp); + return inflateInit(streamp); +} +/* + * Inflate a chunk, The destination length is the limit. + * Returns Z_OK if more to process, Z_STREAM_END if we are done. + */ +int ZEXPORT erl_zlib_inflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen) +{ + int err; + uLongf last_tot = streamp->total_out; + + streamp->next_out = dest; + streamp->avail_out = (uInt)*destLen; + + if ((uLong)streamp->avail_out != *destLen) return Z_BUF_ERROR; + + err = inflate(streamp, Z_NO_FLUSH); + ASSERT(err != Z_STREAM_ERROR); + *destLen = streamp->total_out - last_tot; + return err; +} + +/* + * When we are done, free up the inflate structure + * Retyurns Z_OK or Error + */ +int ZEXPORT erl_zlib_inflate_finish(z_stream *streamp) +{ + return inflateEnd(streamp); +} + + int ZEXPORT erl_zlib_compress2 (Bytef* dest, uLongf* destLen, const Bytef* source, uLong sourceLen, int level) diff --git a/erts/emulator/beam/erl_zlib.h b/erts/emulator/beam/erl_zlib.h index 5ac849d21c..160166c66b 100644 --- a/erts/emulator/beam/erl_zlib.h +++ b/erts/emulator/beam/erl_zlib.h @@ -39,6 +39,12 @@ int ZEXPORT erl_zlib_deflate_start(z_stream *streamp, const Bytef* source, int ZEXPORT erl_zlib_deflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen); int ZEXPORT erl_zlib_deflate_finish(z_stream *streamp); +int ZEXPORT erl_zlib_inflate_start(z_stream *streamp, const Bytef* source, + uLong sourceLen); +int ZEXPORT erl_zlib_inflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen); +int ZEXPORT erl_zlib_inflate_finish(z_stream *streamp); + + /* Use instead of compress */ #define erl_zlib_compress(dest,destLen,source,sourceLen) \ diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index 22b0a02937..2cb44a5b64 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -61,6 +61,9 @@ */ # define ERTS_DEBUG_USE_DIST_SEP # endif +# define IF_DEBUG(X) X +#else +# define IF_DEBUG(X) #endif /* Does Sint fit in Sint32? @@ -89,10 +92,11 @@ static int enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, static Uint is_external_string(Eterm obj, int* p_is_string); static byte* enc_atom(ErtsAtomCacheMap *, Eterm, byte*, Uint32); static byte* enc_pid(ErtsAtomCacheMap *, Eterm, byte*, Uint32); -static byte* dec_term(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*); +struct B2TContext_t; +static byte* dec_term(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*, struct B2TContext_t*); static byte* dec_atom(ErtsDistExternal *, byte*, Eterm*); static byte* dec_pid(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*); -static Sint decoded_size(byte *ep, byte* endp, int internal_tags); +static Sint decoded_size(byte *ep, byte* endp, int internal_tags, struct B2TContext_t*); static BIF_RETTYPE term_to_binary_trap_1(BIF_ALIST_1); static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint flags, @@ -102,11 +106,19 @@ static Uint encode_size_struct2(ErtsAtomCacheMap *, Eterm, unsigned); static int encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj, unsigned dflags, Sint *reds, Uint *res); +static Export binary_to_term_trap_export; +static BIF_RETTYPE binary_to_term_trap_1(BIF_ALIST_1); +static Eterm binary_to_term_int(Process* p, Uint32 flags, Eterm bin, Binary* context_b); + void erts_init_external(void) { #if 1 /* In R16 */ erts_init_trap_export(&term_to_binary_trap_export, am_erlang, am_term_to_binary_trap, 1, &term_to_binary_trap_1); + + erts_init_trap_export(&binary_to_term_trap_export, + am_erlang, am_binary_to_term_trap, 1, + &binary_to_term_trap_1); #else sys_memset((void *) &term_to_binary_trap_export, 0, sizeof(Export)); term_to_binary_trap_export.address = &term_to_binary_trap_export.code[3]; @@ -877,7 +889,7 @@ erts_decode_dist_ext_size(ErtsDistExternal *edep) goto fail; ep = edep->extp+1; } - res = decoded_size(ep, edep->ext_endp, 0); + res = decoded_size(ep, edep->ext_endp, 0, NULL); if (res >= 0) return res; fail: @@ -889,12 +901,12 @@ Sint erts_decode_ext_size(byte *ext, Uint size) { if (size == 0 || *ext != VERSION_MAGIC) return -1; - return decoded_size(ext+1, ext+size, 0); + return decoded_size(ext+1, ext+size, 0, NULL); } Sint erts_decode_ext_size_ets(byte *ext, Uint size) { - Sint sz = decoded_size(ext, ext+size, 1); + Sint sz = decoded_size(ext, ext+size, 1, NULL); ASSERT(sz >= 0); return sz; } @@ -927,7 +939,7 @@ erts_decode_dist_ext(Eterm** hpp, goto error; ep++; } - ep = dec_term(edep, hpp, ep, off_heap, &obj); + ep = dec_term(edep, hpp, ep, off_heap, &obj, NULL); if (!ep) goto error; @@ -948,7 +960,7 @@ Eterm erts_decode_ext(Eterm **hpp, ErlOffHeap *off_heap, byte **ext) byte *ep = *ext; if (*ep++ != VERSION_MAGIC) return THE_NON_VALUE; - ep = dec_term(NULL, hpp, ep, off_heap, &obj); + ep = dec_term(NULL, hpp, ep, off_heap, &obj, NULL); if (!ep) { #ifdef DEBUG bin_write(ERTS_PRINT_STDERR,NULL,*ext,500); @@ -962,7 +974,7 @@ Eterm erts_decode_ext(Eterm **hpp, ErlOffHeap *off_heap, byte **ext) Eterm erts_decode_ext_ets(Eterm **hpp, ErlOffHeap *off_heap, byte *ext) { Eterm obj; - ext = dec_term(NULL, hpp, ext, off_heap, &obj); + ext = dec_term(NULL, hpp, ext, off_heap, &obj, NULL); ASSERT(ext); return obj; } @@ -1043,9 +1055,14 @@ static BIF_RETTYPE term_to_binary_trap_1(BIF_ALIST_1) Binary *bin = ((ProcBin *) binary_val(bt))->val; Eterm res = erts_term_to_binary_int(BIF_P, Term, 0, 0,bin); if (is_tuple(res)) { + ASSERT(BIF_P->flags & F_DISABLE_GC); BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res); } else { - BIF_RET(res); + if (erts_set_gc_state(BIF_P, 1) + || MSO(BIF_P).overhead > BIN_VHEAP_SZ(BIF_P)) + ERTS_BIF_YIELD_RETURN(BIF_P, res); + else + BIF_RET(res); } } @@ -1053,8 +1070,10 @@ BIF_RETTYPE term_to_binary_1(BIF_ALIST_1) { Eterm res = erts_term_to_binary_int(BIF_P, BIF_ARG_1, 0, TERM_TO_BINARY_DFLAGS, NULL); if (is_tuple(res)) { + erts_set_gc_state(BIF_P, 0); BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res); } else { + ASSERT(!(BIF_P->flags & F_DISABLE_GC)); BIF_RET(res); } } @@ -1106,12 +1125,72 @@ BIF_RETTYPE term_to_binary_2(BIF_ALIST_2) res = erts_term_to_binary_int(p, Term, level, flags, bin); if (is_tuple(res)) { + erts_set_gc_state(p, 0); BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res); } else { + ASSERT(!(BIF_P->flags & F_DISABLE_GC)); BIF_RET(res); } } + +enum B2TState { /* order is somewhat significant */ + B2TPrepare, + B2TUncompressChunk, + B2TSizeInit, + B2TSize, + B2TDecodeInit, + B2TDecode, + B2TDecodeList, + B2TDecodeTuple, + B2TDecodeString, + B2TDecodeBinary, + + B2TDone, + B2TDecodeFail, + B2TBadArg +}; + +typedef struct { + int heap_size; + int terms; + byte* ep; + int atom_extra_skip; +} B2TSizeContext; + +typedef struct { + byte* ep; + Eterm res; + Eterm* next; + Eterm* hp_start; + Eterm* hp; + Eterm* hp_end; + int remaining_n; + char* remaining_bytes; +} B2TDecodeContext; + +typedef struct { + z_stream stream; + byte* dbytes; + Uint dleft; +} B2TUncompressContext; + +typedef struct B2TContext_t { + Sint heap_size; + byte* aligned_alloc; + ErtsBinary2TermState b2ts; + Uint32 flags; + SWord reds; + Eterm trap_bin; + enum B2TState state; + union { + B2TSizeContext sc; + B2TDecodeContext dc; + B2TUncompressContext uc; + } u; +} B2TContext; + + static uLongf binary2term_uncomp_size(byte* data, Sint size) { z_stream stream; @@ -1141,48 +1220,62 @@ static uLongf binary2term_uncomp_size(byte* data, Sint size) return err == Z_STREAM_END ? uncomp_size : 0; } -static ERTS_INLINE Sint -binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size) +static ERTS_INLINE int +binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size, + B2TContext* ctx) { - Sint res; byte *bytes = data; Sint size = data_size; state->exttmp = 0; if (size < 1 || *bytes != VERSION_MAGIC) { - error: - if (state->exttmp) - erts_free(ERTS_ALC_T_TMP, state->extp); - state->extp = NULL; - state->exttmp = 0; return -1; } bytes++; size--; if (size < 5 || *bytes != COMPRESSED) { state->extp = bytes; + if (ctx) + ctx->state = B2TSizeInit; } else { uLongf dest_len = (Uint32) get_int32(bytes+1); bytes += 5; size -= 5; if (dest_len > 32*1024*1024 - || (state->extp = erts_alloc_fnf(ERTS_ALC_T_TMP, dest_len)) == NULL) { + || (state->extp = erts_alloc_fnf(ERTS_ALC_T_EXT_TERM_DATA, dest_len)) == NULL) { + /* + * Try avoid out-of-memory crash due to corrupted 'dest_len' + * by checking the actual length of the uncompressed data. + * The only way to do that is to uncompress it. Sad but true. + */ if (dest_len != binary2term_uncomp_size(bytes, size)) { - goto error; + return -1; } - state->extp = erts_alloc(ERTS_ALC_T_TMP, dest_len); + state->extp = erts_alloc(ERTS_ALC_T_EXT_TERM_DATA, dest_len); + ctx->reds -= dest_len; } state->exttmp = 1; - if (erl_zlib_uncompress(state->extp, &dest_len, bytes, size) != Z_OK) - goto error; + if (ctx) { + if (erl_zlib_inflate_start(&ctx->u.uc.stream, bytes, size) != Z_OK) + return -1; + + ctx->u.uc.dbytes = state->extp; + ctx->u.uc.dleft = dest_len; + ctx->state = B2TUncompressChunk; + } + else { + uLongf dlen = dest_len; + if (erl_zlib_uncompress(state->extp, &dlen, bytes, size) != Z_OK + || dlen != dest_len) { + return -1; + } + } size = (Sint) dest_len; } - res = decoded_size(state->extp, state->extp + size, 0); - if (res < 0) - goto error; - return res; + state->extsize = size; + return 0; } static ERTS_INLINE void @@ -1190,7 +1283,7 @@ binary2term_abort(ErtsBinary2TermState *state) { if (state->exttmp) { state->exttmp = 0; - erts_free(ERTS_ALC_T_TMP, state->extp); + erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp); } } @@ -1198,11 +1291,11 @@ static ERTS_INLINE Eterm binary2term_create(ErtsDistExternal *edep, ErtsBinary2TermState *state, Eterm **hpp, ErlOffHeap *ohp) { Eterm res; - if (!dec_term(edep, hpp, state->extp, ohp, &res)) + if (!dec_term(edep, hpp, state->extp, ohp, &res, NULL)) res = THE_NON_VALUE; if (state->exttmp) { state->exttmp = 0; - erts_free(ERTS_ALC_T_TMP, state->extp); + erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp); } return res; } @@ -1210,7 +1303,18 @@ binary2term_create(ErtsDistExternal *edep, ErtsBinary2TermState *state, Eterm ** Sint erts_binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size) { - return binary2term_prepare(state, data, data_size); + Sint res; + + if (binary2term_prepare(state, data, data_size, NULL) < 0 || + (res=decoded_size(state->extp, state->extp + state->extsize, 0, NULL)) < 0) { + + if (state->exttmp) + erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp); + state->extp = NULL; + state->exttmp = 0; + return -1; + } + return res; } void @@ -1225,68 +1329,233 @@ erts_binary2term_create(ErtsBinary2TermState *state, Eterm **hpp, ErlOffHeap *oh return binary2term_create(NULL,state, hpp, ohp); } -BIF_RETTYPE binary_to_term_1(BIF_ALIST_1) +static void b2t_destroy_context(B2TContext* context) { - Sint heap_size; - Eterm res; + erts_free_aligned_binary_bytes_extra(context->aligned_alloc, + ERTS_ALC_T_EXT_TERM_DATA); + context->aligned_alloc = NULL; + binary2term_abort(&context->b2ts); + if (context->state == B2TUncompressChunk) { + erl_zlib_inflate_finish(&context->u.uc.stream); + } +} + +static void b2t_context_destructor(Binary *context_bin) +{ + B2TContext* ctx = (B2TContext*) ERTS_MAGIC_BIN_DATA(context_bin); + ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(context_bin) == b2t_context_destructor); + + b2t_destroy_context(ctx); +} + +static BIF_RETTYPE binary_to_term_trap_1(BIF_ALIST_1) +{ + Binary *context_bin = ((ProcBin *) binary_val(BIF_ARG_1))->val; + ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(context_bin) == b2t_context_destructor); + + return binary_to_term_int(BIF_P, 0, THE_NON_VALUE, context_bin); +} + + +#define B2T_BYTES_PER_REDUCTION 128 +#define B2T_MEMCPY_FACTOR 8 + +/* Define for testing */ +/*#define EXTREME_B2T_TRAPPING 1*/ + +#ifdef EXTREME_B2T_TRAPPING +static unsigned b2t_rand(void) +{ + static unsigned prev = 17; + prev = (prev * 214013 + 2531011); + return prev; +} +#endif + + +static B2TContext* b2t_export_context(Process* p, B2TContext* src) +{ + Binary* context_b = erts_create_magic_binary(sizeof(B2TContext), + b2t_context_destructor); + B2TContext* ctx = ERTS_MAGIC_BIN_DATA(context_b); Eterm* hp; - Eterm* endp; - Sint size; - byte* bytes; - byte* temp_alloc = NULL; - ErtsBinary2TermState b2ts; + sys_memcpy(ctx, src, sizeof(B2TContext)); + if (ctx->state >= B2TDecode && ctx->u.dc.next == &src->u.dc.res) { + ctx->u.dc.next = &ctx->u.dc.res; + } + hp = HAlloc(p, PROC_BIN_SIZE); + ctx->trap_bin = erts_mk_magic_binary_term(&hp, &MSO(p), context_b); + return ctx; +} - if ((bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc)) == NULL) { - error: - erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(BIF_P, BADARG); +static Eterm binary_to_term_int(Process* p, Uint32 flags, Eterm bin, Binary* context_b) +{ +#ifdef EXTREME_B2T_TRAPPING + SWord initial_reds = 1 + b2t_rand() % 4; +#else + SWord initial_reds = (Uint)(ERTS_BIF_REDS_LEFT(p) * B2T_BYTES_PER_REDUCTION); +#endif + B2TContext c_buff; + B2TContext *ctx; + int is_first_call; + + if (context_b == NULL) { + /* Setup enough to get started */ + is_first_call = 1; + ctx = &c_buff; + ctx->state = B2TPrepare; + ctx->aligned_alloc = NULL; + ctx->flags = flags; + IF_DEBUG(ctx->trap_bin = THE_NON_VALUE;) + } else { + is_first_call = 0; + ctx = ERTS_MAGIC_BIN_DATA(context_b); + ASSERT(ctx->state != B2TPrepare); } - size = binary_size(BIF_ARG_1); + ctx->reds = initial_reds; + + do { + switch (ctx->state) { + case B2TPrepare: { + byte* bytes; + Uint bin_size; + bytes = erts_get_aligned_binary_bytes_extra(bin, + &ctx->aligned_alloc, + ERTS_ALC_T_EXT_TERM_DATA, + 0); + if (bytes == NULL) { + ctx->b2ts.exttmp = 0; + ctx->state = B2TBadArg; + break; + } + bin_size = binary_size(bin); + if (ctx->aligned_alloc) { + ctx->reds -= bin_size / 8; + } + if (binary2term_prepare(&ctx->b2ts, bytes, bin_size, ctx) < 0) { + ctx->state = B2TBadArg; + } + break; + } + case B2TUncompressChunk: { + uLongf chunk = ctx->reds; + int zret; + + if (chunk > ctx->u.uc.dleft) + chunk = ctx->u.uc.dleft; + zret = erl_zlib_inflate_chunk(&ctx->u.uc.stream, + ctx->u.uc.dbytes, &chunk); + ctx->u.uc.dbytes += chunk; + ctx->u.uc.dleft -= chunk; + if (zret == Z_OK && ctx->u.uc.dleft > 0) { + ctx->reds = 0; + } + else if (erl_zlib_inflate_finish(&ctx->u.uc.stream) == Z_OK + && zret == Z_STREAM_END + && ctx->u.uc.dleft == 0) { + ctx->reds -= chunk; + ctx->state = B2TSizeInit; + } + else { + ctx->state = B2TBadArg; + } + break; + } + case B2TSizeInit: + ctx->u.sc.ep = NULL; + ctx->state = B2TSize; + /*fall through*/ + case B2TSize: + ctx->heap_size = decoded_size(ctx->b2ts.extp, + ctx->b2ts.extp + ctx->b2ts.extsize, + 0, ctx); + break; + + case B2TDecodeInit: + if (ctx == &c_buff && ctx->b2ts.extsize > ctx->reds) { + /* dec_term will maybe trap, allocate space for magic bin + before result term to make it easy to trim with HRelease. + */ + ctx = b2t_export_context(p, &c_buff); + } + ctx->u.dc.ep = ctx->b2ts.extp; + ctx->u.dc.res = (Eterm) (UWord) NULL; + ctx->u.dc.next = &ctx->u.dc.res; + ctx->u.dc.hp_start = HAlloc(p, ctx->heap_size); + ctx->u.dc.hp = ctx->u.dc.hp_start; + ctx->u.dc.hp_end = ctx->u.dc.hp_start + ctx->heap_size; + ctx->state = B2TDecode; + /*fall through*/ + case B2TDecode: + case B2TDecodeList: + case B2TDecodeTuple: + case B2TDecodeString: + case B2TDecodeBinary: { + ErtsDistExternal fakedep; + fakedep.flags = ctx->flags; + dec_term(&fakedep, NULL, NULL, &MSO(p), NULL, ctx); + break; + } + case B2TDecodeFail: + HRelease(p, ctx->u.dc.hp_end, ctx->u.dc.hp_start); + /*fall through*/ + case B2TBadArg: + b2t_destroy_context(ctx); + if (!is_first_call) { + erts_set_gc_state(p, 1); + } + BUMP_REDS(p, (initial_reds - ctx->reds) / B2T_BYTES_PER_REDUCTION); + BIF_ERROR(p, BADARG & ~EXF_SAVETRACE); - heap_size = binary2term_prepare(&b2ts, bytes, size); - if (heap_size < 0) - goto error; + case B2TDone: + b2t_destroy_context(ctx); - hp = HAlloc(BIF_P, heap_size); - endp = hp + heap_size; + if (ctx->u.dc.hp > ctx->u.dc.hp_end) { + erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n", + __FILE__, __LINE__, ctx->u.dc.hp - ctx->u.dc.hp_end); + } + HRelease(p, ctx->u.dc.hp_end, ctx->u.dc.hp); - res = binary2term_create(NULL, &b2ts, &hp, &MSO(BIF_P)); + if (!is_first_call) { + erts_set_gc_state(p, 1); + } + BUMP_REDS(p, (initial_reds - ctx->reds) / B2T_BYTES_PER_REDUCTION); + return ctx->u.dc.res; - erts_free_aligned_binary_bytes(temp_alloc); + default: + ASSERT(!"Unknown state in binary_to_term"); + } + }while (ctx->reds > 0 || ctx->state >= B2TDone); - if (hp > endp) { - erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n", - __FILE__, __LINE__, hp-endp); + if (ctx == &c_buff) { + ASSERT(ctx->trap_bin == THE_NON_VALUE); + ctx = b2t_export_context(p, &c_buff); } + ASSERT(ctx->trap_bin != THE_NON_VALUE); - HRelease(BIF_P, endp, hp); - - if (res == THE_NON_VALUE) - goto error; + if (is_first_call) { + erts_set_gc_state(p, 0); + } + BUMP_ALL_REDS(p); + BIF_TRAP1(&binary_to_term_trap_export, p, ctx->trap_bin); +} - return res; +BIF_RETTYPE erts_internal_binary_to_term_1(BIF_ALIST_1) +{ + return binary_to_term_int(BIF_P, 0, BIF_ARG_1, NULL); } -BIF_RETTYPE binary_to_term_2(BIF_ALIST_2) +BIF_RETTYPE erts_internal_binary_to_term_2(BIF_ALIST_2) { - Sint heap_size; - Eterm res; Eterm opts; Eterm opt; - Eterm* hp; - Eterm* endp; - Sint size; - byte* bytes; - byte* temp_alloc = NULL; - ErtsBinary2TermState b2ts; - ErtsDistExternal fakedep; + Uint32 flags = 0; - fakedep.flags = 0; opts = BIF_ARG_2; while (is_list(opts)) { opt = CAR(list_val(opts)); if (opt == am_safe) { - fakedep.flags |= ERTS_DIST_EXT_BTT_SAFE; + flags |= ERTS_DIST_EXT_BTT_SAFE; } else { goto error; @@ -1297,35 +1566,10 @@ BIF_RETTYPE binary_to_term_2(BIF_ALIST_2) if (is_not_nil(opts)) goto error; - if ((bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc)) == NULL) { - error: - erts_free_aligned_binary_bytes(temp_alloc); - BIF_ERROR(BIF_P, BADARG); - } - size = binary_size(BIF_ARG_1); - - heap_size = binary2term_prepare(&b2ts, bytes, size); - if (heap_size < 0) - goto error; - - hp = HAlloc(BIF_P, heap_size); - endp = hp + heap_size; - - res = binary2term_create(&fakedep, &b2ts, &hp, &MSO(BIF_P)); - - erts_free_aligned_binary_bytes(temp_alloc); - - if (hp > endp) { - erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n", - __FILE__, __LINE__, hp-endp); - } - - HRelease(BIF_P, endp, hp); - - if (res == THE_NON_VALUE) - goto error; + return binary_to_term_int(BIF_P, flags, BIF_ARG_1, NULL); - return res; +error: + BIF_ERROR(BIF_P, BADARG); } Eterm @@ -1473,12 +1717,10 @@ erts_term_to_binary(Process* p, Eterm Term, int level, Uint flags) { /* #define EXTREME_TTB_TRAPPING 1 */ #ifndef EXTREME_TTB_TRAPPING -#define TERM_TO_BINARY_LOOP_FACTOR 500 -#define TERM_TO_BINARY_SIZE_FACTOR 500000 -#define TERM_TO_BINARY_COMPRESS_CHUNK 500000 +#define TERM_TO_BINARY_LOOP_FACTOR 32 +#define TERM_TO_BINARY_COMPRESS_CHUNK (1 << 18) #else #define TERM_TO_BINARY_LOOP_FACTOR 1 -#define TERM_TO_BINARY_SIZE_FACTOR 10 #define TERM_TO_BINARY_COMPRESS_CHUNK 10 #endif @@ -1514,7 +1756,7 @@ typedef struct { } s; } TTBContext; -static void context_destructor(Binary *context_bin) +static void ttb_context_destructor(Binary *context_bin) { TTBContext *context = ERTS_MAGIC_BIN_DATA(context_bin); if (context->alive) { @@ -1567,7 +1809,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla do { \ if (context_b == NULL) { \ context_b = erts_create_magic_binary(sizeof(TTBContext), \ - context_destructor); \ + ttb_context_destructor); \ context = ERTS_MAGIC_BIN_DATA(context_b); \ memcpy(context,&c_buff,sizeof(TTBContext)); \ } \ @@ -1615,7 +1857,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla /* Finish in one go */ res = erts_term_to_binary_simple(p, Term, size, level, flags); - BUMP_REDS(p, size / TERM_TO_BINARY_SIZE_FACTOR); + BUMP_REDS(p, 1); return res; } @@ -2604,21 +2846,112 @@ undo_offheap_in_area(ErlOffHeap* off_heap, Eterm* start, Eterm* end) #endif /* DEBUG */ } + /* Decode term from external format into *objp. ** On failure return NULL and (R13B04) *hpp will be unchanged. */ static byte* -dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp) +dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, + Eterm* objp, B2TContext* ctx) { - Eterm* hp_saved = *hpp; + Eterm* hp_saved; int n; ErtsAtomEncoding char_enc; - register Eterm* hp = *hpp; /* Please don't take the address of hp */ - Eterm* next = objp; + register Eterm* hp; /* Please don't take the address of hp */ + Eterm* next; + SWord reds; + + if (ctx) { + hp_saved = ctx->u.dc.hp_start; + reds = ctx->reds; + next = ctx->u.dc.next; + ep = ctx->u.dc.ep; + hpp = &ctx->u.dc.hp; + + if (ctx->state != B2TDecode) { + int n_limit = reds; + + n = ctx->u.dc.remaining_n; + if (ctx->state == B2TDecodeBinary) { + n_limit *= B2T_MEMCPY_FACTOR; + ASSERT(n_limit >= reds); + reds -= n / B2T_MEMCPY_FACTOR; + } + else + reds -= n; - *next = (Eterm) (UWord) NULL; + if (n > n_limit) { + ctx->u.dc.remaining_n -= n_limit; + n = n_limit; + reds = 0; + } + else { + ctx->u.dc.remaining_n = 0; + } + + switch (ctx->state) { + case B2TDecodeList: + objp = next - 2; + while (n > 0) { + objp[0] = (Eterm) COMPRESS_POINTER(next); + objp[1] = make_list(next); + next = objp; + objp -= 2; + n--; + } + break; + + case B2TDecodeTuple: + objp = next - 1; + while (n-- > 0) { + objp[0] = (Eterm) COMPRESS_POINTER(next); + next = objp; + objp--; + } + break; + + case B2TDecodeString: + hp = *hpp; + hp[-1] = make_list(hp); /* overwrite the premature NIL */ + while (n-- > 0) { + hp[0] = make_small(*ep++); + hp[1] = make_list(hp+2); + hp += 2; + } + hp[-1] = NIL; + *hpp = hp; + break; + + case B2TDecodeBinary: + sys_memcpy(ctx->u.dc.remaining_bytes, ep, n); + ctx->u.dc.remaining_bytes += n; + ep += n; + break; + + default: + ASSERT(!"Unknown state"); + } + if (!ctx->u.dc.remaining_n) { + ctx->state = B2TDecode; + } + if (reds <= 0) { + ctx->u.dc.next = next; + ctx->u.dc.ep = ep; + ctx->reds = 0; + return NULL; + } + } + } + else { + hp_saved = *hpp; + reds = ERTS_SWORD_MAX; + next = objp; + *next = (Eterm) (UWord) NULL; + } + hp = *hpp; while (next != NULL) { + objp = next; next = (Eterm *) EXPAND_POINTER(*objp); @@ -2738,7 +3071,16 @@ dec_term_atom_common: *objp = make_tuple(hp); *hp++ = make_arityval(n); hp += n; - objp = hp - 1; + objp = hp - 1; + if (ctx) { + if (reds < n) { + ASSERT(reds > 0); + ctx->state = B2TDecodeTuple; + ctx->u.dc.remaining_n = n - reds; + n = reds; + } + reds -= n; + } while (n-- > 0) { objp[0] = (Eterm) COMPRESS_POINTER(next); next = objp; @@ -2756,17 +3098,27 @@ dec_term_atom_common: break; } *objp = make_list(hp); - hp += 2*n; + hp += 2 * n; objp = hp - 2; objp[0] = (Eterm) COMPRESS_POINTER((objp+1)); objp[1] = (Eterm) COMPRESS_POINTER(next); next = objp; objp -= 2; - while (--n > 0) { + n--; + if (ctx) { + if (reds < n) { + ctx->state = B2TDecodeList; + ctx->u.dc.remaining_n = n - reds; + n = reds; + } + reds -= n; + } + while (n > 0) { objp[0] = (Eterm) COMPRESS_POINTER(next); - objp[1] = make_list(objp + 2); + objp[1] = make_list(next); next = objp; objp -= 2; + n--; } break; case STRING_EXT: @@ -2777,6 +3129,14 @@ dec_term_atom_common: break; } *objp = make_list(hp); + if (ctx) { + if (reds < n) { + ctx->state = B2TDecodeString; + ctx->u.dc.remaining_n = n - reds; + n = reds; + } + reds -= n; + } while (n-- > 0) { hp[0] = make_small(*ep++); hp[1] = make_list(hp+2); @@ -2984,7 +3344,6 @@ dec_term_atom_common: dbin->flags = 0; dbin->orig_size = n; erts_refc_init(&dbin->refc, 1); - sys_memcpy(dbin->orig_bytes, ep, n); pb = (ProcBin *) hp; hp += PROC_BIN_SIZE; pb->thing_word = HEADER_PROC_BIN; @@ -2995,7 +3354,20 @@ dec_term_atom_common: pb->bytes = (byte*) dbin->orig_bytes; pb->flags = 0; *objp = make_binary(pb); - } + if (ctx) { + int n_limit = reds * B2T_MEMCPY_FACTOR; + if (n > n_limit) { + ctx->state = B2TDecodeBinary; + ctx->u.dc.remaining_n = n - n_limit; + ctx->u.dc.remaining_bytes = dbin->orig_bytes + n_limit; + n = n_limit; + reds = 0; + } + else + reds -= n / B2T_MEMCPY_FACTOR; + } + sys_memcpy(dbin->orig_bytes, ep, n); + } ep += n; break; } @@ -3018,13 +3390,14 @@ dec_term_atom_common: sys_memcpy(hb->data, ep, n); bin = make_binary(hb); hp += heap_bin_size(n); + ep += n; } else { Binary* dbin = erts_bin_nrml_alloc(n); ProcBin* pb; + dbin->flags = 0; dbin->orig_size = n; erts_refc_init(&dbin->refc, 1); - sys_memcpy(dbin->orig_bytes, ep, n); pb = (ProcBin *) hp; pb->thing_word = HEADER_PROC_BIN; pb->size = n; @@ -3035,8 +3408,23 @@ dec_term_atom_common: pb->flags = 0; bin = make_binary(pb); hp += PROC_BIN_SIZE; - } - ep += n; + if (ctx) { + int n_limit = reds * B2T_MEMCPY_FACTOR; + if (n > n_limit) { + ctx->state = B2TDecodeBinary; + ctx->u.dc.remaining_n = n - n_limit; + ctx->u.dc.remaining_bytes = dbin->orig_bytes + n_limit; + n = n_limit; + reds = 0; + } + else + reds -= n / B2T_MEMCPY_FACTOR; + } + sys_memcpy(dbin->orig_bytes, ep, n); + ep += n; + n = pb->size; + } + if (bitsize == 8 || n == 0) { *objp = bin; } else { @@ -3067,7 +3455,7 @@ dec_term_atom_common: goto error; } *hpp = hp; - ep = dec_term(edep, hpp, ep, off_heap, &temp); + ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL); hp = *hpp; if (ep == NULL) { goto error; @@ -3127,7 +3515,7 @@ dec_term_atom_common: } *hpp = hp; /* Index */ - if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) { + if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) { goto error; } if (!is_small(temp)) { @@ -3136,7 +3524,7 @@ dec_term_atom_common: old_index = unsigned_val(temp); /* Uniq */ - if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) { + if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) { goto error; } if (!is_small(temp)) { @@ -3204,7 +3592,7 @@ dec_term_atom_common: } /* Index */ - if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) { + if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) { goto error; } if (!is_small(temp)) { @@ -3213,7 +3601,7 @@ dec_term_atom_common: old_index = unsigned_val(temp); /* Uniq */ - if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) { + if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) { goto error; } if (!is_small(temp)) { @@ -3313,8 +3701,31 @@ dec_term_atom_common: } undo_offheap_in_area(off_heap, hp_saved, hp); *hpp = hp_saved; - return NULL; + if (ctx) { + ctx->state = B2TDecodeFail; + ctx->reds = reds; + } + return NULL; } + + if (--reds <= 0) { + if (ctx) { + if (next || ctx->state != B2TDecode) { + ctx->u.dc.ep = ep; + ctx->u.dc.next = next; + ctx->u.dc.hp = hp; + ctx->reds = 0; + return NULL; + } + } + else { + reds = ERTS_SWORD_MAX; + } + } + } + if (ctx) { + ctx->state = B2TDone; + ctx->reds = reds; } *hpp = hp; return ep; @@ -3602,18 +4013,37 @@ encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj, } static Sint -decoded_size(byte *ep, byte* endp, int internal_tags) +decoded_size(byte *ep, byte* endp, int internal_tags, B2TContext* ctx) { - int heap_size = 0; + int heap_size; int terms; - int atom_extra_skip = 0; + int atom_extra_skip; Uint n; + SWord reds; + + if (ctx) { + reds = ctx->reds; + if (ctx->u.sc.ep) { + heap_size = ctx->u.sc.heap_size; + terms = ctx->u.sc.terms; + ep = ctx->u.sc.ep; + atom_extra_skip = ctx->u.sc.atom_extra_skip; + goto init_done; + } + } + else + reds = 0; /* not used but compiler warns anyway */ + + heap_size = 0; + terms = 1; + atom_extra_skip = 0; +init_done: #define SKIP(sz) \ do { \ if ((sz) <= endp-ep) { \ ep += (sz); \ - } else { return -1; }; \ + } else { goto error; }; \ } while (0) #define SKIP2(sz1, sz2) \ @@ -3621,25 +4051,24 @@ decoded_size(byte *ep, byte* endp, int internal_tags) Uint sz = (sz1) + (sz2); \ if (sz1 < sz && (sz) <= endp-ep) { \ ep += (sz); \ - } else { return -1; } \ + } else { goto error; } \ } while (0) #define CHKSIZE(sz) \ do { \ - if ((sz) > endp-ep) { return -1; } \ + if ((sz) > endp-ep) { goto error; } \ } while (0) #define ADDTERMS(n) \ do { \ int before = terms; \ terms += (n); \ - if (terms < before) return -1; \ + if (terms < before) goto error; \ } while (0) - - for (terms=1; terms > 0; terms--) { - int tag; - + ASSERT(terms > 0); + do { + int tag; CHKSIZE(1); tag = ep++[0]; switch (tag) { @@ -3660,7 +4089,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags) CHKSIZE(4); n = get_int32(ep); if (n > BIG_ARITY_MAX*sizeof(ErtsDigit)) { - return -1; + goto error; } SKIP2(n,4+1); /* skip, size,sign,digits */ heap_size += 1+1+(n+sizeof(Eterm)-1)/sizeof(Eterm); /* XXX: 1 too much? */ @@ -3669,7 +4098,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags) CHKSIZE(2); n = get_int16(ep); if (n > MAX_ATOM_CHARACTERS) { - return -1; + goto error; } SKIP(n+2+atom_extra_skip); atom_extra_skip = 0; @@ -3679,7 +4108,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags) n = get_int16(ep); ep += 2; if (n > MAX_ATOM_SZ_LIMIT) { - return -1; + goto error; } SKIP(n+atom_extra_skip); atom_extra_skip = 0; @@ -3688,7 +4117,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags) CHKSIZE(1); n = get_int8(ep); if (n > MAX_ATOM_CHARACTERS) { - return -1; + goto error; } SKIP(n+1+atom_extra_skip); atom_extra_skip = 0; @@ -3698,7 +4127,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags) n = get_int8(ep); ep++; if (n > MAX_ATOM_SZ_LIMIT) { - return -1; + goto error; } SKIP(n+atom_extra_skip); atom_extra_skip = 0; @@ -3727,7 +4156,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags) id_words = get_int16(ep); if (id_words > ERTS_MAX_REF_NUMBERS) - return -1; + goto error; ep += 2; atom_extra_skip = 1 + 4*id_words; @@ -3829,7 +4258,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags) num_free = get_int32(ep); ep += 4; if (num_free > MAX_ARG) { - return -1; + goto error; } terms += 4 + num_free; heap_size += ERL_FUN_SIZE + num_free; @@ -3846,24 +4275,47 @@ decoded_size(byte *ep, byte* endp, int internal_tags) case BINARY_INTERNAL_REF: if (!internal_tags) { - return -1; + goto error; } SKIP(sizeof(ProcBin)); heap_size += PROC_BIN_SIZE; break; case BIT_BINARY_INTERNAL_REF: if (!internal_tags) { - return -1; + goto error; } SKIP(2+sizeof(ProcBin)); heap_size += PROC_BIN_SIZE + ERL_SUB_BIN_SIZE; break; default: - return -1; + goto error; } - } + terms--; + + if (ctx && --reds <= 0 && terms > 0) { + ctx->u.sc.heap_size = heap_size; + ctx->u.sc.terms = terms; + ctx->u.sc.ep = ep; + ctx->u.sc.atom_extra_skip = atom_extra_skip; + ctx->reds = 0; + return 0; + } + }while (terms > 0); + /* 'terms' may be non-zero if it has wrapped around */ - return terms==0 ? heap_size : -1; + if (terms == 0) { + if (ctx) { + ctx->state = B2TDecodeInit; + ctx->reds = reds; + } + return heap_size; + } + +error: + if (ctx) { + ctx->state = B2TBadArg; + } + return -1; #undef SKIP #undef SKIP2 #undef CHKSIZE diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h index ff29e84972..83001b2c7e 100644 --- a/erts/emulator/beam/external.h +++ b/erts/emulator/beam/external.h @@ -146,6 +146,7 @@ typedef struct { typedef struct { byte *extp; int exttmp; + Uint extsize; } ErtsBinary2TermState; /* -------------------------------------------------------------------------- */ diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index 94bc1b172a..6e5d352e5b 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -921,7 +921,8 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, ErtsAlcType_t alloc_type, int allow_empty, int allow_atom, int encoding, - Sint *used /* out */); + Sint *used /* out */, + Uint extra); char* erts_convert_filename_to_wchar(byte* bytes, Uint size, char *statbuf, size_t statbuf_size, ErtsAlcType_t alloc_type, Sint* used, diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c index d4623c0450..49af86b36a 100644 --- a/erts/emulator/beam/io.c +++ b/erts/emulator/beam/io.c @@ -7061,7 +7061,7 @@ void *driver_dl_open(char * path) int res; int *last_error_p = erts_smp_tsd_get(driver_list_last_error_key); int locked = maybe_lock_driver_list(); - if ((res = erts_sys_ddll_open(path, &ptr)) == 0) { + if ((res = erts_sys_ddll_open(path, &ptr, NULL)) == 0) { maybe_unlock_driver_list(locked); return ptr; } else { diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h index 31252ed78f..189d9ebac8 100644 --- a/erts/emulator/beam/sys.h +++ b/erts/emulator/beam/sys.h @@ -279,18 +279,21 @@ typedef unsigned long UWord; typedef long SWord; #define SWORD_CONSTANT(Const) Const##L #define UWORD_CONSTANT(Const) Const##UL +#define ERTS_UWORD_MAX ULONG_MAX #define ERTS_SWORD_MAX LONG_MAX #elif SIZEOF_VOID_P == SIZEOF_INT typedef unsigned int UWord; typedef int SWord; #define SWORD_CONSTANT(Const) Const #define UWORD_CONSTANT(Const) Const##U +#define ERTS_UWORD_MAX UINT_MAX #define ERTS_SWORD_MAX INT_MAX #elif SIZEOF_VOID_P == SIZEOF_LONG_LONG typedef unsigned long long UWord; typedef long long SWord; #define SWORD_CONSTANT(Const) Const##LL #define UWORD_CONSTANT(Const) Const##ULL +#define ERTS_UWORD_MAX ULLONG_MAX #define ERTS_SWORD_MAX LLONG_MAX #else #error Found no appropriate type to use for 'Eterm', 'Uint' and 'Sint' @@ -304,6 +307,7 @@ typedef unsigned long Uint; typedef long Sint; #define SWORD_CONSTANT(Const) Const##L #define UWORD_CONSTANT(Const) Const##UL +#define ERTS_UWORD_MAX ULONG_MAX #define ERTS_SWORD_MAX LONG_MAX #define ERTS_SIZEOF_ETERM SIZEOF_LONG #define ErtsStrToSint strtol @@ -313,6 +317,7 @@ typedef unsigned int Uint; typedef int Sint; #define SWORD_CONSTANT(Const) Const #define UWORD_CONSTANT(Const) Const##U +#define ERTS_UWORD_MAX UINT_MAX #define ERTS_SWORD_MAX INT_MAX #define ERTS_SIZEOF_ETERM SIZEOF_INT #define ErtsStrToSint strtol @@ -322,6 +327,7 @@ typedef unsigned long long Uint; typedef long long Sint; #define SWORD_CONSTANT(Const) Const##LL #define UWORD_CONSTANT(Const) Const##ULL +#define ERTS_UWORD_MAX ULLONG_MAX #define ERTS_SWORD_MAX LLONG_MAX #define ERTS_SIZEOF_ETERM SIZEOF_LONG_LONG #if defined(__WIN32__) @@ -661,8 +667,7 @@ typedef struct { #define ERTS_SYS_DDLL_ERROR_INIT {NULL} extern void erts_sys_ddll_free_error(ErtsSysDdllError*); extern void erl_sys_ddll_init(void); /* to initialize mutexes etc */ -extern int erts_sys_ddll_open2(const char *path, void **handle, ErtsSysDdllError*); -#define erts_sys_ddll_open(P,H) erts_sys_ddll_open2(P,H,NULL) +extern int erts_sys_ddll_open(const char *path, void **handle, ErtsSysDdllError*); extern int erts_sys_ddll_open_noext(char *path, void **handle, ErtsSysDdllError*); extern int erts_sys_ddll_load_driver_init(void *handle, void **function); extern int erts_sys_ddll_load_nif_init(void *handle, void **function,ErtsSysDdllError*); diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c index 978a766de9..80937dfcc8 100644 --- a/erts/emulator/drivers/common/inet_drv.c +++ b/erts/emulator/drivers/common/inet_drv.c @@ -1490,8 +1490,8 @@ static int load_ip_and_port unsigned int alen = len; char abuf [len]; int res = inet_get_address(abuf, (inet_address*) addr, &alen); - ASSERT(res==0); - res = 0; + ASSERT(res==0); (void)res; + /* Now "abuf" contains: Family(1b), Port(2b), IP(4|16b) */ /* NB: the following functions are safe to use, as they create tuples diff --git a/erts/emulator/internal_doc/CarrierMigration.md b/erts/emulator/internal_doc/CarrierMigration.md new file mode 100644 index 0000000000..b93c11c6ec --- /dev/null +++ b/erts/emulator/internal_doc/CarrierMigration.md @@ -0,0 +1,201 @@ +Carrier Migration +================= + +The ERTS memory allocators manage memory blocks in two types of raw +memory chunks. We call these chunks of raw memory +*carriers*. Singleblock carriers which only contain one large block, +and multiblock carriers which contain multiple blocks. A carrier is +typically created using `mmap()` on unix systems. However, how a +carrier is created is of minor importance. An allocator instance +typically manages a mixture of single- and multiblock carriers. + +Problem +------- + +When a carrier is empty, i.e. contains only one large free block, it +is deallocated. Since multiblock carriers can contain both allocated +blocks and free blocks at the same time, an allocator instance might +be stuck with a large amount of poorly utilized carriers if the memory +load decrease. After a peak in memory usage it is expected that not +all memory can be returned since the blocks still allocated is likely +to be dispersed over multiple carriers. Such poorly utilized carriers +can usually be reused if the memory load increase again. However, +since each scheduler thread manages its own set of allocator +instances, and memory load is not necessarily connected to CPU load we +might get into a situation where there are lots of poorly utilized +multiblock carriers on some allocator instances while we need to +allocate new multiblock carriers on other allocator instances. In +scenarios like this, the demand for multiblock carriers in the system +might increase at the same time as the actual memory demand in the +system has decreased which is both unwanted and quite unexpected for +the end user. + +Solution +-------- + +In order to prevent scenarios like this we've implemented support for +migration of multiblock carriers between allocator instances of the +same type. + +### Management of Free Blocks ### + +In order to be able to remove a carrier from one allocator instance +and add it to another we need to be able to move references to the +free blocks of the carrier between the allocator instances. The +allocator instance specific data structure referring to the free +blocks it manages often refers to the same carrier from multiple +places. For example, when the address order bestfit strategy is used +this data structure is a binary search tree spanning all carriers that +the allocator instance manages. Free blocks in one specific carrier +can be referred to from potentially every other carrier that is +managed, and the amount of such references can be huge. That is, the +work of removing the free blocks of such a carrier from the search +tree will be huge. One way of solving this could be to not migrate +carriers that contain lots of free blocks, but this would prevent us +from migrating carriers that potentially needs to be migrated in order +to solve the problem we set out to solve. + +By using one data structure of free blocks in each carrier and an +allocator instance wide data structure of carriers managed by the +allocator instance, the work needed in order to remove and add +carriers can be kept to a minimum. When migration of carriers is +enabled on a specific allocator type, we require that an allocation +strategy with such an implementation is used. Currently we've +implemented this for three different allocation strategies. All of +these strategies use a search tree of carriers sorted so that we can +find the carrier with the lowest address that can satisfy the +request. Internally in carriers we use yet another search tree that +either implement address order first fit, address order best fit, +or best fit. The abbreviations used for these different allocation +strategies are `aoff`, and `aoffcaobf`, `aoffcbf`. + +### Carrier Pool ### + +In order to migrate carriers between allocator instances we move them +through a pool of carriers. In order for a carrier migration to +complete, one scheduler needs to move the carrier into the pool, and +another scheduler needs to take the carrier out of the pool. + +The pool is implemented as a lock free, circular, double linked, +list. The list contains a sentinel which is used as the starting point +when inserting to, or fetching from the pool. Carriers in the pool are +elements in this list. + +The list can be modified by all scheduler threads +simultaneously. During modifications the double linked list is allowed +to get a bit "out of shape". For example, following the `next` pointer +to the next element and then following the `prev` pointer does not +always take you back to were you started. The following is however +always true: + +* Repeatedly following `next` pointers will eventually take you to the + sentinel. +* Repeatedly following `prev` pointers will eventually take you to the + sentinel. +* Following a `next` or a `prev` pointer will take you to either an + element in the pool, or an element that used to be in the pool. + +When inserting a new element we search for a place to insert the +element by only following `next` pointers, and we always begin by +skipping the first element encountered. When trying to fetch an +element we do the same thing, but instead only follow `prev` pointers. + +By going different directions when inserting and fetching, we avoid +contention between threads inserting and threads fetching as much as +possible. By skipping one element when we begin searching, we preserve +the sentinel unmodified as much as possible. This is beneficial since +all search operations need to read the content of the sentinel. If we +were to modify the sentinel, the cache line containing the sentinel +would unnecessarily be bounced between processors. + +The `prev`, and `next` fields in the elements of the list contains the +value of the pointer, a modification marker, and a deleted +marker. Memory operations on these fields are done using atomic memory +operations. When a thread has set the modification marker in a field, +no-one except the thread that set the marker is allowed to modify the +field. If multiple modification markers needs to be set, we always +begin with `next` fields followed by `prev` fields in the order +following the actual pointers. This guarantees that no deadlocks will +occur. + +When a carrier is being removed from a pool, we mark it with a thread +progress value that needs to be reached before we are allowed to +modify the `next`, and `prev` fields. That is, until we reach this +thread progress we are not allowed to insert the carrier into the pool +again, and we are not allowed to deallocate the carrier. This ensures +that threads inspecting the pool always will be able to traverse the +pool and reach valid elements. Once we have reached the thread +progress value that the carrier was tagged with, we know that no +threads may have references to it via the pool. + +### Migration ### + +There exist one pool for each allocator type enabling migration of +carriers between scheduler specific allocator instances of the same +allocator type. + +Each allocator instance keeps track of the current utilization of its +multiblock carriers. When the utilization falls below the "abandon +carrier utilization limit" it starts to inspect the utilization of the +current carrier when deallocations are made. If also the utilization +of the carrier falls below the "abandon carrier utilization limit" it +unlinks the carrier from its data structure of available free blocks +and inserts the carrier into the pool. + +Since the carrier has been unlinked from the data structure of +available free blocks, no more allocations will be made in the +carrier. The allocator instance putting the carrier into the pool, +however, still has the responsibility of performing deallocations in +it while it remains in the pool. + +Each carrier has a flag field containing information about allocator +instance owning the carrier, a flag indicating if the carrier is in +the pool or not, and a flag indicating if it is busy or not. When the +carrier is in the pool, the owning allocator instance needs to mark it +as busy while operating on it. If another thread inspects it in order +to try to fetch it from the pool, it will abort the fetch if it is +busy. When fetching the carrier from the pool, ownership will changed +and further deallocations in the carrier will be redirected to the new +owner using the delayed dealloc functionality. + +If a carrier in the pool becomes empty, it will be withdrawn from the +pool. All carriers that become empty are also always passed to its +originating allocator instance for deallocation using the delayed +dealloc functionality. Since carriers this way always will be +deallocated by the allocator instance that allocated the carrier the +underlying functionality of allocating and deallocating carriers can +remain simple and doesn't have to bother about multiple threads. In a +NUMA system we will also not mix carriers originating from multiple +NUMA nodes. + +When an allocator instance needs more carrier space, it always begins +by inspecting its own carriers that are waiting for thread progress +before they can be deallocated. If no such carrier could be found, it +then inspects the pool. If no carrier could be fetched from the pool, +it will allocate a new carrier. Regardless of where the allocator +instance gets the carrier from it the just links in the carrier into +its data structure of free blocks. + +### Result ### + +The use of this strategy of abandoning carriers with poor utilization +and reusing these in allocator instances with an increased carrier +demand is extremely effective and completely eliminates the problems +that otherwise sometimes occurred when CPU load dropped while memory +load did not. + +When using the `aoffcaobf` or `aoff` strategies compared to `gf` or +`bf`, we loose some performance since we get more modifications in the +data structure of free blocks. This performance penalty is however +reduced using the `aoffcbf` strategy. A tradeoff between memory +consumption and performance is however inevitable, and it is up to +the user to decide what is most important. + +Further work +------------ + +It would be quite easy to extend this to allow migration of multiblock +carriers between all allocator types. More or less the only obstacle +is maintenance of the statistics information. + + diff --git a/erts/emulator/internal_doc/CodeLoading.md b/erts/emulator/internal_doc/CodeLoading.md new file mode 100644 index 0000000000..151b9cd57c --- /dev/null +++ b/erts/emulator/internal_doc/CodeLoading.md @@ -0,0 +1,186 @@ +Non-Blocking Code Loading +========================= + +Introduction +------------ + +Before OTP R16 when an Erlang code module was loaded, all other +execution in the VM were halted while the load operation was carried +out in single threaded mode. This might not be a big problem for +initial loading of modules during VM boot, but it can be a severe +problem for availability when upgrading modules or adding new code on +a VM with running payload. This problem grows with the number of cores +as both the time it takes to wait for all schedulers to stop increases +as well as the potential amount of halted ongoing work. + +In OTP R16, modules are loaded without blocking the VM. +Erlang processes may continue executing undisturbed in parallel during +the entire load operation. The code loading is carried out by a normal +Erlang process that is scheduled like all the others. The load +operation is completed by making the loaded code visible to all +processes in a consistent way with one single atomic +instruction. Non-blocking code loading will improve real-time +characteristics when modules are loaded/upgraded on a running SMP +system. + + +The Load Phases +--------------- + +The loading of a module is divided into two phases; a *prepare phase* +and a *finishing phase*. The prepare phase contains reading the BEAM +file format and all the preparations of the loaded code that can +easily be done without interference with the running code. The +finishing phase will make the loaded (and prepared) code accessible +from the running code. Old module versions (replaced or deleted) will +also be made inaccessible by the finishing phase. + +The prepare phase is designed to allow several "loader" processes to +prepare separate modules in parallel while the finishing phase can +only be done by one loader process at a time. A second loader process +trying to enter finishing phase will be suspended until the first +loader is done. This will only block the process, the scheduler is +free to schedule other work while the second loader is waiting. (See +`erts_try_seize_code_write_permission` and +`erts_release_code_write_permission`). + +The ability to prepare several modules in parallel is not currently +used as almost all code loading is serialized by the code_server +process. The BIF interface is however prepared for this. + + erlang:prepare_loading(Module, Code) -> LoaderState + erlang:finish_loading([LoaderState]) + +The idea is that `prepare_loading` could be called in parallel for +different modules and returns a "magic binary" containing the internal +state of each prepared module. Function `finish_loading` could take a +list of such states and do the finishing of all of them in one go. + +Currenlty we use the legacy BIF `erlang:load_module` which is now +implemented in Erlang by calling the above two functions in +sequence. Function `finish_loading` is limited to only accepts a list +with one module state as we do not yet use the multi module loading +feature. + + +The Finishing Sequence +---------------------- + +During VM execution, code is accessed through a number of data +structures. These *code access structures* are + +* Export table. One entry for every exported function. +* Module table. One entry for each loaded module. +* "beam_catches". Identifies jump destinations for catch instructions. +* "beam_ranges". Map code address to function and line in source file. + +The most frequently used of these structures is the export table that +is accessed in run time for every executed external function call to +get the address of the callee. For performance reasons, we want to +access all these structures without any overhead from thread +synchronization. Earlier this was solved with an emergency break. Stop +the entire VM to mutate these code access structures, otherwise treat +them as read-only. + +The solution in R16 is instead to *replicate* the code access +structures. We have one set of active structures read by the running +code. When new code is loaded the active structures are copied, the +copy is updated to include the newly loaded module and then a switch +is made to make the updated copy the new active set. The active set is +identified by a single global atomic variable +`the_active_code_index`. The switch can thus be made by a single +atomic write operation. The running code have to read this atomic +variable when using the active access structures, which means one +atomic read operation per external function call for example. The +performance penalty from this extra atomic read is however very small +as it can be done without any memory barriers at all (as described +below). With this solution we also preserve the transactional feature +of a load operation. Running code will never see the intermediate +result of a half loaded module. + +The finishing phase is carried out in the following sequence by the +BIF `erlang:finish_loading`: + +1. Seize exclusive code write permission (suspend process if needed + until we get it). + +2. Make a full copy of all the active access structures. This copy is + called the staging area and is identified by the global atomic + variable `the_staging_code_index`. + +3. Update all access structures in the staging area to include the + newly prepared module. + +4. Schedule a thread progress event. That is a time in the future when + all schedulers have yielded and executed a full memory barrier. + +5. Suspend the loader process. + +6. After thread progress, commit the staging area by assigning + `the_staging_code_index` to `the_active_code_index`. + +7. Release the code write permission allowing other processes to stage + new code. + +8. Resume the loader process allowing it to return from + `erlang:finish_loading`. + + +### Thread Progress + +The waiting for thread progress in 4-6 is necessary in order for +processes to read `the_active_code_index` atomic during normal +execution without any expensive memory barriers. When we write a new +value into `the_active_code_index` in step 6, we know that all +schedulers will see an updated and consistent view of all the new +active access structures once they become reachable through +`the_active_code_index`. + +The total lack of memory barrier when reading `the_active_code_index` +has one interesting consequence however. Different processes may see +the new code at different point in time depending on when different +cores happen to refresh their hardware caches. This may sound unsafe +but it actually does not matter. The only property we must guarantee +is that the ability to see the new code must spread with process +communication. After receiving a message that was triggered by new +code, the receiver must be guaranteed to also see the new code. This +will be guaranteed as all types of process communication involves +memory barriers in order for the receiver to be sure to read what the +sender has written. This implicit memory barrier will then also make +sure that the receiver reads the new value of `the_active_code_index` +and thereby also sees the new code. This is true for all kinds of +inter process communication (TCP, ETS, process name registering, +tracing, drivers, NIFs, etc) not just Erlang messages. + +### Code Index Reuse + +To optimize the copy operation in step 2, code access structures are +reused. In current solution we have three sets of code access +structures, identified by a code index of 0, 1 and 2. These indexes +are used in a round robin fashion. Instead of having to initialize a +completely new copy of all access structures for every load operation +we just have to update with the changes that have happened since the +last two code load operations. We could get by with only two code +indexes (0 and 1), but that would require yet another round of waiting +for thread progress before step 2 in the `finish_loading` sequence. We +cannot start reusing a code index as staging area until we know that +no lingering scheduler thread is still using it as the active code +index. With three generations of code indexes, the waiting for thread +progress in step 4-6 will give this guarantee for us. Thread progress +will wait for all running schedulers to reschedule at least one +time. No ongoing execution reading code access structures reached from +an old value of `the_active_code_index` can exist after a second round +of thread progress. + +The design choice between two or three generations of code access +structures is a trade-off between memory consumption and code loading +latency. + +### A Consistent Code View + +Some native BIFs may need to get a consistent snapshot view of the +active code. To do this it is important to only read +`the_active_code_index` one time and then use that index value for all +code accessing during the BIF. If a load operation is executed in +parallel, reading `the_active_code_index` a second time might result +in a different value, and thereby a different view of the code. diff --git a/erts/emulator/internal_doc/DelayedDealloc.md b/erts/emulator/internal_doc/DelayedDealloc.md new file mode 100644 index 0000000000..b7d87b839f --- /dev/null +++ b/erts/emulator/internal_doc/DelayedDealloc.md @@ -0,0 +1,175 @@ +Delayed Dealloc +=============== + +Problem +------- + +An easy way to handle memory allocation in a multi-threaded +environment is to protect the memory allocator with a global lock +which threads performing memory allocations or deallocations have to +have locked during the whole operation. This solution of course scales +very poorly, due to heavy lock contention. An improved solution of +this scheme is to use multiple thread specific instances of such an +allocator. That is, each thread allocates in its own allocator +instance which is protected by a lock. In the general case references +to memory need to be passed between threads. In the case where a +thread that needs to deallocate memory that originates from another +threads allocator instance a lock conflict is possible. In a system as +the Erlang VM where memory allocation/deallocation is frequent and +references to memory also are passed around between threads this +solution will also scale poorly due to lock contention. + +Functionality Used to Adress This problem +----------------------------------------- + +In order to reduce contention due to locking of allocator instances we +introduced completely lock free instances tied to each scheduler +thread, and an extra locked instance for other threads. The scheduler +threads in the system is expected to do the major part of the +work. Other threads may still be needed but should not perform any +major and/or time critical work. The limited amount of contention that +appears on the locked allocator instance can more or less be +disregarded. + +Since we still need to be able to pass references to memory between +scheduler threads we need some way to manage this. An allocator +instance belonging to one scheduler thread is only allowed to be +manipulated by that scheduler thread. When other threads need to +deallocate memory originating from a foreign allocator instance, they +only pass the memory block to a "message box" containing deallocation +jobs attached to the originating allocator instance. When a scheduler +thread detects such deallocation job it performs the actual +deallocation. + +The "message box" is implemented using a lock free single linked list +through the memory blocks to deallocate. The order of the elements in +this list is not important. Insertion of new free blocks will be made +somewhere near the end of this list. Requirering that the new blocks +need to be inserted at the end would cause unnecessary contention when +large amount of memory blocks are inserted simultaneous by multiple +threads. + +The data structure refering to this single linked list cover two cache +lines. One cache line containing information about the head of the +list, and one cache line containing information about the tail of the +list. This in order to reduce cache line ping ponging of this data +structure. The head of the list will only be manipulated by the thread +owning the allocator instance, and the tail will be manipulated by +other threads inserting deallocation jobs. + +### Tail ### + +In the tail part of the data structure we find a pointer to the last +element of the list, or at least something that is near the end of the +list. In the uncontended case it will point to the end of the list, +but when simultaneous insert operations are performed it will point to +something near the end of the list. + +When insterting an element one will try to write a pointer to the new +element in the next pointer of the element pointed to by the last +pointer. This is done using an atomic compare and swap that expects +the next pointer to be `NULL`. If this succeds the thread performing +this operation moves the last pointer to point to the newly inserted +element. + +If the atomic compare and swap described above failed, the last +pointer didn't point to the last element. In this case we need to +insert the new element somewhere inbetween the element that the last +pointer pointed to and the actual last element. If we do it this way +the last pointer will eventually end up at the last element when +threads stop adding new elements. When trying to insert somewhere near +the end and failing to do so, the inserting thread sometimes moves to +the next element and somtimes tries with the same element again. This +in order to spread the inserted elements during heavy contention. That +is, we try to spread the modifications of memory to different +locations instead of letting all threads continue to try to modify the +same location in memory. + +### Head ### + +The head contains pointers to begining of the list (`head.first`), and +to the first block which other threads may refer to +(`head.unref_end`). Blocks between these pointers are only refered to +by the head part of the data structure which is only used by the +thread owning the allocator instance. When these two pointers are not +equal the thread owning the allocator instance deallocate block after +block until `head.first` reach `head.unref_end`. + +We of course periodically need to move the `head.unref_end` closer to +the end in order to be able to continue deallocating memory +blocks. Since all threads inserting new elements in the linked list +will enter the list using the last pointer we can use this +knowledge. If we call `erts_thr_progress_later()` and wait until we +have reached that thread progress we know that no managed threads can +refer the elements up to the element pointed to by the last pointer at +the time when we called `erts_thr_progress_later()`. This since, all +managed threads must have left the code implementing this at least +once, and they always enters into the list via the last pointer. The +`tail.next` field contains information about next `head.unref_end` +pointer and thread progress that needs to be reached before we can +move `head.unref_end`. + +Unfortunately not only threads managed by the thread progress +functionality may insert memory blocks. Other threads also needs to be +taken care of. Other threads will not be as frequent users of this +functionality as managed threads, so using a less efficient scheme for +them is not that big of a problem. In order to handle unmanaged +threads we use two reference counters. When an unmanaged thread enters +this implementation it increments the reference counter currently +used, and when it leaves this implementation it decrements the same +reference counter. When the consumer thread calls +`erts_thr_progress_later()` in order to determine when it is safe to +move `head.unref_end`, it also swaps reference counters for unmanaged +threads. The previous current represents outstanding references from +the time up to this point. The new current represents future reference +following this point. When the consumer thread detects that we have +both reached the desired thread progress and when the previous current +reference counter reach zero it is safe to move the `head.unref_end`. + +The reason for using two reference counters is that we need to know +that the reference counter eventually will reach zero. If we only used +one reference counter it would potentially be held above zero for ever +by different unmanaged threads. + +### Empty List ### + +If no new memory blocks are inserted into the list, it should +eventually be emptied. All pointers to the list however expect to +always point to something. This is solved by inserting an empty +"marker" element, which only has to purpose of being there in the +absense of other elements. That is when the list is empty it only +contains this "marker" element. + +### Contention ### + +When elements are continously inserted by threads not owning the +allocator instance, the thread owning the allocator instance will be +able to work more or less undisturbed by other threads at the head end +of the list. At the tail end large amounts of simultaneous inserts may +cause contention, but we reduce such contention by spreading inserts +of new elements near the end instead of requiring all new elements to +be inserted at the end. + +### Schedulers and The Locked Allocator Instance ### + +Also the locked allocator instance for use by non-scheduler threads +have a message box for deallocation jobs just as all the other +allocator instances. The reason for this is that other threads may +allocate memory pass it to a scheduler that then needs to deallocate +it. We do not want the scheduler to have to wait for the lock on this +locked instance. Since also locked instances has message boxes for +deallocation jobs, the scheduler can just insert the job and avoid the +locking. + + +### A Benchmark Result ### + +When running the ehb benchmark, large amount of messages are passed +around between schedulers. All message passing will in some way or the +other cause memory allocation and deallocation. Since messages are +passed between different schedulers we will get contention on the +allocator instances where messages were allocated. By the introduction +of the delayed dealloc feature, we got a speedup of between 25-45%, +depending on configuration of the benchmark, when running on a +relatively new machine with an Intel i7 quad core processor with +hyper-threading using 8 schedulers.
\ No newline at end of file diff --git a/erts/emulator/internal_doc/PTables.md b/erts/emulator/internal_doc/PTables.md new file mode 100644 index 0000000000..6fe0e7665d --- /dev/null +++ b/erts/emulator/internal_doc/PTables.md @@ -0,0 +1,356 @@ +Process and Port Tables +======================= + +Problems +-------- + +The process table is a mapping from process identifiers to process +structure pointers. The process structure contains miscellaneous +information about a process, as for example pointers to its heap, +message queue, etc. When the runtime system needs to operate on a +process, it looks up the process structure in the process table using +the process identifier. An example of this is when passing a message +to a process. + +The process table has for a very long time just been an array of +pointers to process structures. Since process identifiers internally +in the runtime system are 28-bit integers it is quite easy to map a +process identifier to index into the array. The 28-bits were divided +into two sets. The least significant set of bits was used as index +into the array. The most significant set of bits was only used to be +able to distinguish between a number of identifiers with which map to +the same index in the array. As long as process table sizes of a power +of two was used we had 2^28 unique process identifiers. + +When the first SMP support was implemented, the table still was kept +more or less the same way, but protected by two types of locks. One +lock that protected the whole table against modifications and an array +of locks protecting different parts of the table. The exact locking +strategy previously used isn't interesting. What is interesting is +that it suffered from heavy lock contention especially when lots of +modifications was being made, but also when only performing lookups. + +In order to be able to detect when it is safe to deallocate a +previously used process structure, reference counting of the structure +was used. Also this was problematic, since simultaneous lookups needed +to modify the reference counter which also caused contention on the +cache line where the reference counter was located. This since all +modifications needs to be communicated between all involved +processors. + +The port table is very similar to the process table. The major +difference, at least in concept, is that it is a mapping from port +identifiers to port structures. It had a similar implementation, but +with some differences. Instead of being an array of pointers it was an +array of structures, and instead of being protected by two types of +locks it was only protected by one global lock. This table also +suffered from lock contention in various situations. + +Solution +-------- + +The process table was the major problem to address since processes are +much more frequently used than ports. The first implementation only +implemented this for processes, but since the port table is very +similar and very similar problems occur on the port table, the process +table implementation was later generalized so that it could also be +used implementing the port table. For simplicity I will only talk +about the process table in the following text, but the same will apply +to the port table unless otherwise stated. + +If we disregard the locking issues, the original solution is very +appealing. The mapping from process identifier to index into the array +is very fast, and this property is something we would like to +keep. The vast majority of operations on these tables are lookups so +optimizing for lookups is what we want to do. + +### Lookup ### + +Using a set of bits in the process identifier as index into an array +seems hard to beat. By replacing the array of pointers with an array +of our pointer sized atomic data type, a lookup will consist of the +following: + +1. Mapping the 28-bit integer to an index into the array. + + More about this mapping later. + +2. Read the pointer using an atomic memory operation at determined + index in array. + + On all platforms that we provide atomic memory operations, this is + just a `volatile` read, preventing the compiler to use values in + registers, forcing the a read from memory. + +3. Depending on use, issue appropriate memory barrier. + + A common barrier used is a barrier with acquire semantics. On + x86/x86_64 this maps to a compiler barrier preventing the compiler + to reorder instructions, but on other hardware often some kind of + light weight hardware memory barrier is also needed. + + When comparing with a locked approach, at least one heavy weight + memory barrier will be issued when locking the lock on most, if + not all, hardware architectures (including x86/x86_64), and often + some kind of light weight memory barrier will be issued when + unlocking the lock. + +When looking at this very simple solution with very little overhead +you might wonder why we didn't implement it this way from the +beginning. It all boils down to the read operation of the pointer. We +need some way to know that it is safe to access the memory pointed +to. One way of doing this is to place a reference counter in the +process structure. Increment of the reference counter at lookup needs +to be done atomically with the lookup. A lock can typically provide +this service for us, which was the approach we previously +used. Another approach could be to co-locate the reference counter +with the pointer in the table. The major problem with this approach is +the modifications of the reference counter. This since these +modification would have to be communicated between all involved +processor cause contention on the cache line containing the reference +counter. The new lookup approach above is possible since we can use +the "thread progress" functionality in order to determine when it is +safe to deallocate the process structure. We'll get back to this when +describing deletion in the table. + +Using this new lookup approach we wont modify any memory at all which +is important. A lookup conceptually only read memory, now this is true +in the implementation also which is important from a scalability +perspective. The previous implementation modified the cache line +containing the reference counter two times, and the cache line +containing the corresponding lock two times at each lookup. + +### Modifications of the Table ### + +A lightweight lookup in the table was the most important feature, but +we also wanted to improve modifications of the table. The process +table is modified when a new process is spawned, i.e. a new pointer is +inserted into the table, and when a process terminates, i.e. a pointer +is deleted in the table. + +Assuming that we spawn fewer processes than the maximum amount of +unique process identifiers in the system, one has always been able to +determine the order of process creation just by comparing process +identifiers. If PidX is larger than PidY, then PidX was created after +PidY assuming both identifiers originates from the same node. However, +since we have a quite limited amount of unique identifiers today +(2^28), this property cannot be relied upon if we create large amount +of processes. But never the less, this is a property the system always +have had. + +If we would have had a huge amount of unique identifiers available, it +would have tempting to drop or modify this ordering property as +described above. The ordering property could for example be based on +the scheduler performing the spawn operation. It would have been +possible to reserve large ranges of identifiers exclusive for each +scheduler thread which could be used minimizing the need for +communication when allocating identifiers. The amount of identifiers +we got to work with today is, however, not even close to be enough for +such an approach. + +Since we have a limited amount of unique identifiers, we need to be +careful not to waste them. If previously used identifiers are reused +too quick, identifiers originating from terminated processes will +refer to newly created processes, and mixups will occur. The +previously used approach was quite good at not wasting +identifiers. Using a modified version of the same approach also lets +us keep the ordering property that we have always had. + +#### Insert #### + +The original approach is more or less to search for next free index or +slot in the array. The search starts from the last slot allocated. If +we reach the end of the array we increase a "wrapped counter" and then +continue the search. The process identifier is constructed by writing +the index to the least significant set of bits, and the "wrapped +counter" to the most significant set of bits. The amount of bits in +each set of bits is decided at boot time, so that maximum index will +just fit into the least significant set of bits. + +In the modified lock free version of this approach we more or less do +it the same way, but with some important modifications trying to avoid +unnecessary contention when multiple schedulers create processes +simultaneously. Since multiple threads might be trying to search for +the next free slot at the same time from the same starting point we +want subsequent slots to be located in different cache lines. Multiple +schedulers simultaneously writing new pointers into the table are +therefore very likely to write into adjacent slots. If adjacent slots +are located in the same cache line all modification of this cache line +needs to be communicated between all involved processors which will be +very expensive and scale very poor. By locating adjacent slots in +different cache lines only true conflicts will trigger communication +between involved processors, i.e., avoiding false sharing. + +A cache line is larger than a pointer, typically 8 or 16 times larger, +so using one cache line for each slot only containing one pointer +would be a waste of space. Each cache line will be able to hold a +fixed amount of slots. The first slot of the table will be the first +slot of the first cache line, the second slot of the table will be the +first slot of the second cache line until we reach the end of the +array. The next slot after that will be the second slot of the first +cache line, etc, moving forward one cache line internal slot each time +we wrap. This way we will be able to fit the same amount of pointers +into an array of the same size while always keeping adjacent slots in +different cache lines. + +The mapping from identifier to slot or index into the array gets a bit +more complicated than before. Instead of a `shift` and a bitwise +`and`, we get two `shift`s, two bitwise `and`s, and an `add` (see +implementation of `erts_ptab_data2pix()` in `erl_ptab.h`). However, by +storing this information optimized for lookup we only need a `shift` +and a bitwise `and` on 32-bit platforms. On 64-bit platforms we got +enough room for the 28-bit identifier in the least significant +halfword, and the index in the most significant halfword, in other +words, we just need to read the most significant halfword to get the +index. That is, this operation is as fast, or faster than before. The +downside is that on 32-bit platforms we need to convert this +information into the 28-bit identifier number when printing, or when +ordering identifiers from the same node. These operations are, +however, extremely infrequent compared to lookups. + +When we insert a new element in the table we do the following: + +1. We begin by reserving space in the table by atomically + incrementing a counter of processes in the table. If our increment + brings the counter above the maximum size of the table, the + operation fail and a `system_limit` exception is raised. + +2. The table contains a 64-bit atomic variable of the last identifier + used. Only the least significant bits will be used when actually + creating the identifier. This identifier is where the search + begin. + +3. We increment last identifier value used. In order determine the + slot that corresponds to this identifier we call + `erts_ptab_data2pix()` that maps identifier to slot. We read the + content of the slot. If the slot is free we try to write a + reservation marker using an atomic compare and swap. If this fails + we repeat this step until it succeeds. + +4. Change the table variable of last identifier used. Since multiple + writes might occur at the same time this value may already have + been changed by to an identifier larger that the one we got. In + this case we can continue; otherwise, we need to change it to the + identifier we got. + +5. We now do some initializations of the process structure that + cannot be done before we know the process identifier, and have to + be done before we publish the structure in the table. This, for + example, includes storing the identifier in the process structure. + +6. Now we can publish the structure in the table by writing the the + pointer to the process structure in the slot previously reserved + in 3. + +Using this approach we keep the properties like identifier ordering, +and identifier reuse while improving performance and scalability. It +has one flaw, though. There is no guarantee that the operation will +terminate. This can quite easily be fixed though, and will be fixed in +the next release. We will get back to this below. + +#### Delete #### + +When a process terminates, we mark the process as terminated in the +process structure, the counter of number of processes in the table is +decreased, and the reference to the process structure is removed by +writing a `NULL` pointer into the corresponding slot. The scheduler +thread performing this then schedule a thread progress later job which +will do the final cleanup and deallocate the process structure. The +thread progress functionality will make sure that this job will not +execute until it is certain that all managed threads have dropped all +references to the process structure. + +### BIF Iterating Over the Table ### + +The `erlang:processes/1` and `erlang:port/1` BIFs iterate over the +tables and return corresponding identifiers. These BIF should return a +consistent snapshot of the table content during some time when the BIF +is executing. In order to implement this we use locking in a strange +way. We use an "inverted rwlock". + +When performing lookups in the table we do not need to bother about +the locking at all, but when modifying the table we read lock the +rwlock protecting the table which allows for multiple writers during +normal operation. When the BIF that iterates over the table need +access to the table it write locks the rwlock and reads content of the +table. The BIF do not read the whole table in one go but instead read +small chunks at time only write locking while reading. The actual +implementation of the BIFs is out of the scope of this document. + +An out of the box rwlock will typically suffer from contention on the +single cache line containing the state of the rwlock even in the case +we are only read locking. Instead of using such an rwlock, we have our +own implementation of reader optimized rwlocks which keeps track of +reader threads in separate thread specific cache lines. This in order +to avoid contention on a singe cache line. As long as we only do read +lock operations, threads only need to read a global cache line and +modify its own cache line, and by this minimize communication between +involved processors. The iterating BIFs are normally very infrequently +used, so in the normal case we will only do read lock operations on +the table global rwlock. + +### Future Improvements ### + +The first improvement is to fix the guarantee so that insert +operations will be guaranteed to terminate. When the operation starts +we verify that there actually exist a free slot that we can use. The +problem is that we might not find it since it may move when multiple +threads modify the table at the same time as we are trying to find the +slot. The easy fix is to abort the operation if an empty slot could +not be found in a finite number operation, and then restart the +operation under a write lock. This will be implemented in next +release, but furter work should be made trying to find a better +solution. + +This and also previous implementation do not work well when the table +is nearly full. We will both get long search times for free slots, and +we will reuse identifiers more frequently since we more frequently +wrap during the search. These tables works best when the table is much +larger than the amount of simultaneous existing processes. One easy +improvement is to always have room for more processes than we allow in +the table. This will also be implemented in the next release, but this +should probably also be worked more on trying to find an even better +solution. + +It would also be nice to get rid of the rwlock all together. The use +of a reader optimized rwlock makes sure we do not any contention on +the lock, but unnecessary memory barriers will be issued due to the +lock. The main issue here is to modify iterating BIFs so that they do +not require exclusive access to the table while reading a sequence of +slots. In principle this should be rather easy, the code can handle +sequences of variable sizes, so shrinking the sequence size of slots +to one would solv the problem. This will, however, need some tweeks +and modifications of not trival code, but is something that should be +looked at in the future. + +By increasing the size of identifiers, at least on 64-bit machines +(which isn't as easy as it first might seem) we get further room for +improvement. Besides the obvious improvement of not reusing +identifiers as fast as we currently do, it makes it possible to +further avoid contention when inserting elements in the table. At +least if we drop this ordering property, which isn't that useful +anyway. + +### Some Benchmark Results ### + +In order to test modifications of the process table we ran a couple of +benchmarks where lots of processes are spawned and terminated +simultaneously, and got a speedup of between 150-200%. Running a +similar benchmark but with ports we got a speedup of about 130%. + +The BIF `erlang:is_process_alive/1` is the closest you can get to a +process table lookup only. The BIF looks up the process corresponding +to the process identifier passed as argument, and then checks if it is +alive. By running multiple processes looping over this BIF checking +the same process, we get a speedup between 20000-23000%. Conceptually +this operation only involve read operations. In the implementation +used in R16B also only read operation are performed, while the +previous implementation need to lock structures in order to read the +data, suffering from both lock contention and contention due to +modifications of cache lines used by lock internal data structures and +the reference counter on the process being looked up. + +The benchmarks were run on a relatively new machine with an Intel i7 +quad core processor with hyper-threading using 8 schedulers. On a +machine with more communication overhead and/or larger amount of +logical processors the speedups are expected to be even larger. diff --git a/erts/emulator/internal_doc/PortSignals.md b/erts/emulator/internal_doc/PortSignals.md new file mode 100644 index 0000000000..b1afb7c5cb --- /dev/null +++ b/erts/emulator/internal_doc/PortSignals.md @@ -0,0 +1,267 @@ +Port Signals +============ + +Problems +-------- + +Erlang ports conceptually are very similar to Erlang processes. Erlang +processes execute Erlang code in the virtual machine, while an Erlang +port execute native code typically used for communication with the +outside world. For example, when an Erlang process wants to +communicate using TCP over the network, it communicates via an Erlang +port implementing the TCP socket interface in native code. Both Erlang +Processes and Ports communicate using asynchronous signaling. The +native code executed by an Erlang port is a collection of callback +functions, called a driver. Each callback more or less implements the +code of a signal to, or from the port. + +Even though processes and ports conceptually always have been very +similar, the implementations have been very different. Originally, +more or less all port signals were handled synchronously at the time +they occurred. Very early in the development of the SMP support for +the runtime system we recognized that this was a huge problem for +signals between ports and the outside world. That is, I/O events to +and from the outside world, or I/O signals. This was one of the first +things that had to be rewritten in order to be able to do I/O in +parallel at all. The solution was to implement scheduling of these +signals. I/O signals corresponding to different ports could then be +executed in parallel on different scheduler threads. Signals from +processes to ports was not as big of a problem as the I/O signals, and +the implementation of those was left as they were. + +Each port is protected by its own lock to protect against simultaneous +execution in multiple threads. Previously when a process, executing on +a scheduler thread, sent a port a signal, it locked the port lock and +synchronously executed the code corresponding to the signal. If the +lock was busy, the scheduler thread blocked waiting until it could +lock the lock. If multiple processes executing simultaneously on +different scheduler threads, sent signals to the same port, schedulers +suffered from heavy lock contention. Such contention could also occur +between I/O signals for the port executing on one scheduler thread, +and a signal from a process to the port executing on another scheduler +thread. Beside the contention issues, we also loose potential work to +execute in parallel on different scheduler threads. This since the +process sending the *asynchronous* signal is blocked while the code +implementing the signal is executed synchronously. + +Solution +-------- + +In order to prevent multiple schedulers from trying to execute signals +to/from the same port simultaneously, we need to be able to ensure +that all signals to/from a port are executed in sequence on one +scheduler. More or less, the only way to do this is to schedule all +types of signals. Signals corresponding to a port can then be executed +in sequence by one single scheduler thread. If only one thread tries +to execute the port, no contention will appear on the port +lock. Besides getting rid of the contention, processes sending signals +to the port can also continue execution of their own Erlang code on +other schedulers at the same time as the signaling code is executing +on another scheduler. + +When implementing this there are a couple of important properties that +we either need, or want to preserve: + +* Signal ordering guarantee. Signals from process `X` to port `Y`, + *must* be delivered to `Y` in the same order as sent from `X`. + +* Signal latency. Due to the previous synchronous implementation, + latency of signals sent from processes to ports have usually been + very low. During contention the latency has of course + increased. Users expect latency of these signals to be low, a + sudden increase in latency would not be appreciated by our users. + +* Compatible flow control. Ports have for a very long time had the + possibility to use the busy port functionality when implementing + flow control. One may argue that this functionality fits very bad + with the conceptually completely asynchronous signaling, but the + functionality has been there for ages and is expected to be + there. When a port sets itself into a busy state, `command` + signals should not be delivered, and senders of such signals + should suspend until the port sets itself in a not busy state. + +### Scheduling of Port Signals ### + +A run queue has four queues for processes of different priority and +one queue for ports. The scheduler thread associated with the run +queue switch evenly between execution of processes and execution of +ports while both processes and ports exist in the queue. This is not +completely true, but not important for this discussion. A port that is +in a run queue also has a queue of tasks to execute. Each task +corresponds to an in- or outgoing signal. When the port is selected +for execution each task will be executed in sequence. The run queue +locks not only protected the queues of ports, but also the queues of +port tasks. + +Since we go from a state where I/O signals are the only port related +signals scheduled, to a state where potentially all port related +signals may be scheduled we may drastically increase the load on the +run queue lock. The amount of scheduled port tasks very much depend on +the Erlang application executing, which we do not control, and we do +not want to get increased contention on the run queue locks. We +therefore need another approach of protecting the port task queue. + +#### Task Queue #### + +We chose a "semi locked" approach, with one public locked task queue, +and a private, lock free, queue like, task data structure. This "semi +locked" approach is similar to how the message boxes of processes are +managed. The lock is port specific and only used for protection of +port tasks, so the run queue lock is now needed in more or less the +same way for ports as for processes. This ensures that we wont see an +increased lock contention on run queue locks due to this rewrite of +the port functionality. + +When an executing port runs out of work to execute in the private task +data structure, it moves the public task queue into the private task +data structure while holding the lock. Once tasks has been moved to +the private data structure no lock protects them. This way the port +can continue working on tasks in the private data structure without +having to fight for the lock. + +I/O signals may however be aborted. This could be solved by letting +the port specific scheduling lock also protect the private task data +structure, but then the port very frequently would have to fight with +others enqueueing new tasks. In order to handle this while keeping the +private task data structure lock free, we use a similar "non +aggressive" approach as we use when handling processes that gets +suspended while in the run queue. Instead of removing the aborted port +task, we just mark it as aborted using an atomic memory +operation. When a task is selected for execution, we first verify that +it has not been aborted. If aborted we, just drop the task. + +A task that can be aborted is referred via another data structure from +other parts of the system, so that a thread that needs to abort the +task can reach it. In order to be sure to safely deallocate a task +that is no longer used, we first clear this reference and then use the +thread progress functionality in order to make sure no references can +exist to the task. Unfortunately, also unmanaged threads might abort +tasks. This is very infrequent, but might occur. This could be handled +locally for each port, but would require extra information in each +port structure which very infrequently would be used. Instead of +implementing this in each port, we implemented general functionality +that can be used from unmanaged threads to delay thread progress. + +The private "queue like" task data structure could have been an +ordinary queue if it wasn't for the busy port functionality. When the +port has flagged itself as busy, `command` signals are not allowed to +be delivered and need to be blocked. Other signals sent from the same +sender following a `command` signal that has been blocked also have to +be blocked; otherwise, we would violate the ordering guarantee. At the +same time, other signals that have no dependencies to blocked +`command` signals are expected to be delivered. + +The above requirements makes the private task data structure a rather +complex data structure. It has a queue of unprocessed tasks, and a +busy queue. The busy queue contains blocked tasks corresponding to +`command` signals, and tasks with dependencies to such tasks. The busy +queue is accompanied by a table over blocked tasks based on sender +with a references into last task in the busy queue from a specific +sender. This since we need check for dependencies when new tasks are +processed in the queue of unprocessed tasks. When a new task is +processed that needs to be blocked it isn't enqueued at the end of the +busy queue, but instead directly after the last task with the same +sender. This in order to easily be able to detect when we have tasks +that no longer have any dependencies to tasks corresponding to +`command` signals which should be moved out of the busy queue. When +the port executes, it switches between processing tasks from the busy +queue, and processing directly from the unprocessed queue based on its +busy state. When processing directly from the unprocessed queue it +might, of course, have to move a task into the busy queue instead of +executing it. + +#### Busy Port Queue #### + +Since it is the port itself which decides when it is time to enter a +busy state, it needs to be executing in order to enter the busy +state. As a result of `command` signals being scheduled, we may get +into a situation where the port gets flooded by a huge amount of +`command` signals before it even gets a chance to set itself into a +busy state. This since it has not been scheduled for execution +yet. That is, under these circumstances the busy port functionality +loose the flow control properties it was intended to provide. + +In order to solve this, we introduced a new busy feature, namely "busy +port queue". The port has a limit of `command` data that is allowed to +be enqueued in the task queue. When this limit is reached, the port +will automatically enter a busy port queue state. When in this state, +senders of `command` signals will be suspended, but `command` signals +will still be delivered to the port unless it is also in a busy port +state. This limit is known as the high limit. + +There is also a low limit. When the amount of queued `command` data +falls below this limit and the port is in a busy port queue state, the +busy port queue state is automatically disabled. The low limit should +typically be significantly lower than the high limit in order to +prevent frequent oscillation around the busy port queue state. + +By introduction of this new busy state we still can provide the flow +control. Old driver do not even have to be changed. The limits can, +however, be configured and even disabled by the port. By default the +high limit is 8 KB and the low limit is 4 KB. + +### Preparation of Signal Send ### + +Previously all operations sending signals to ports began by acquiring +the port lock, then performed preparations for sending the signal, and +then finaly sent the signal. The preparations typically included +inspecting the state of the port, and preparing the data to pass along +with the signal. The preparation of data is frequently quite time +consuming, and did not really depend on the port. That is we would +like to do this without having the port lock locked. + +In order to improve this, state information was re-organized in the +port structer, so that we can access it using atomic memory +operations. This together with the new port table implementation, +enabled us to lookup the port and inspect the state before acquiring +the port lock, which in turn made it possible to perform preparations +of signal data before acquiring the port lock. + +### Preserving Low Latency ### + +If we disregard the contended cases, we will inevitably get a higher +latency when scheduling signals for execution at a later time than by +executing the signal immediately. In order to preserve the low latency +we now first check if this is a contended case or not. If it is, we +schedule the signal for later execution; otherwise, we execute the +signal immediately. It is a contended case if other signals already +are scheduled on the port, or if we fail to acquire the port +lock. That is we will not block waiting for the lock. + +Doing it this way we will preserve the low latency at the expense of +lost potential parallel execution of the signal and other code in the +process sending the signal. This default behaviour can however be +changed on port basis or system wide, forcing scheduling of all +signals from processes to ports that are not part of a synchronous +communication. That is, an unconditional request/response pair of +asynchronous signals. In this case it is no potential for parallelism, +and by that no point forcing scheduling of the request signal. + +The immediate execution of signals may also cause a scheduler that is +about to execute scheduled tasks to block waiting for the port +lock. This is however more or less the only scenario where a scheduler +needs to wait for the port lock. The maximum time it has to wait is +the time it takes to execute one signal, since we always schedule +signals when contention occurs. + +### Signal Operations ### + +Besides implementing the functionality enabling the scheduling, +preparation of signal data without port lock, etc, each operation +sending signals to ports had to be quite extensively re-written. This +in order to move all sub-operations that can be done without the lock +to a place before we have acquired the lock, and also since signals +now sometimes are executed immediately and sometimes scheduled for +execution at a later time which put different requirements on the data +to pass along with the signal. + +### Some Benchmark Results ### + +When running some simple benchmarks where contention only occur due to +I/O signals contending with signals from one single process we got a +speedup of 5-15%. When multiple processes send signals to one single +port the improvements can be much larger, but the scenario with one +process contending with I/O is the most common one. + +The benchmarks were run on a relatively new machine with an Intel i7 +quad core processor with hyper-threading using 8 schedulers.
\ No newline at end of file diff --git a/erts/emulator/internal_doc/ProcessManagementOptimizations.md b/erts/emulator/internal_doc/ProcessManagementOptimizations.md new file mode 100644 index 0000000000..9e83633bef --- /dev/null +++ b/erts/emulator/internal_doc/ProcessManagementOptimizations.md @@ -0,0 +1,172 @@ +Process Management Optimizations +================================ + +Problems +-------- + +Early versions of the SMP support for the runtime system completely +relied on locking in order to protect data accesses from multiple +threads. In some cases this isn't that problematic, but in some cases +it really is. It complicates the code, ensuring all locks needed are +actually held, and ensuring that all locks are acquired in such an +order that no deadlock occur. Acquiring locks in the right order often +also involve releasing locks held, forcing threads to reread data +already read. A good recipe for creation of bugs. Trying to use more +fine-grained locking in order to increase possible parallelism in the +system makes the complexity situation even worse. Having to acquire a +bunch of locks when doing operations also often cause heavy lock +contention which cause poor scalability. + +Management of processes internally in the runtime system suffered from +these problems. When changing state on a process, for example from +`waiting` to `runnable`, a lock on the process needed to be +locked. When inserting a process into a run queue also a lock +protecting the run queue had to be locked. When migrating a process +from one run queue to another run queue, locks on both run queues and +on the process had to be locked. + +This last example is a quite common case in during normal +operation. For example, when a scheduler thread runs out of work it +tries to steal work from another scheduler threads run queue. When +searching for a victim to steal from there was a lot of juggling of +run queue locks involved, and during the actual theft finalized by +having to lock both run queues and the process. When one scheduler +runs out of work, often others also do, causing lots of lock +contention. + +Solution +-------- + +### Process ### + +In order to avoid these situations we wanted to be able to do most of +the fundamental operations on a process without having to acquire a +lock on the process. Some examples of such fundamental operations are, +moving a process between run queues, detecting if we need to insert it +into a run queue or not, detecting if it is alive or not. + +All of this information in the process structure that was needed by +these operations was protected by the process `status` lock, but the +information was spread across a number of fields. The fields used was +typically state fields that could contain a small number of different +states. By reordering this information a bit we could *easily* fit +this information into a 32-bit wide field of bit flags (only 12-flags +were needed). By moving this information we could remove five 32-bit +wide fields and one pointer field from the process structure! The move +also enabled us to easily read and change the state using atomic +memory operations. + +### Run Queue ### + +As with processes we wanted to be able to do the most fundamental +operations without having to acquire a lock on it. The most important +being able to determine if we should enqueue a process in a specific +run queue or not. This involves being able to read actual load, and +load balancing information. + +The load balancing functionality is triggered at repeated fixed +intervals. The load balancing more or less strives to even out run +queue lengths over the system. When balancing is triggered, +information about every run queue is gathered, migrations paths and +run queue length limits are set up. Migration paths and limits are +fixed until the next balancing has been done. The most important +information about each run queue is the maximum run queue length since +last balancing. All of this information were previously stored in the +run queues themselves. + +When a process has become runnable, for example due to reception of a +message, we need to determine which run queue to enqueue it +in. Previously this at least involved locking the run queue that the +process currently was assigned to while holding the status lock on the +process. Depending on load we sometimes also had to acquire a lock on +another run queue in order to be able to determine if it should be +migrated to that run queue or not. + +In order to be able to decide which run queue to use without having to +lock any run queues, we moved all fixed balancing information out of +the run queues into a global memory block. That is, migration paths +and run queue limits. Information that need to be frequently updated, +like for example maximum run queue length, were kept in the run queue, +but instead of operating on this information under locks we now use +atomic memory operations when accessing this information. This made it +possible to first determine which run queue to use, without locking +any run queues, and when decided, lock the chosen run queue and insert +the process. + +#### Fixed Balancing Information #### + +When determining which run queue to choose we need to read the fixed +balancing information that we moved out of the run queues. This +information is global, read only between load balancing operations, +but will be changed during a load balancing. We do not want to +introduce a global lock that needs to be acquired when accessing this +information. A reader optimized rwlock could avoid some of the +overhead since the data is most frequently read, but it would +unavoidably cause disruption during load balancing, since this +information is very frequently read. The likelihood of a large +disruption due to this also increase as number of schedulers grows. + +Instead of using a global lock protecting modifications of this +information, we write a completely new version of it at each load +balancing. The new version is written in another memory block than the +previous one, and published by issuing a write memory barrier and then +storing a pointer to the new memory block in a global variable using +an atomic write operation. + +When schedulers need to read this information, they read the pointer +to currently used information using an atomic read operation, and then +issue a data dependency read barrier, which on most architectures is a +no-op. That is, it is very little overhead getting access to this +information. + +Instead of allocating and deallocating memory blocks for the different +versions of the balancing information we keep old memory blocks and +reuse them when it is safe to do so. In order to be able to determine +when it is safe to reuse a block we use the thread progress +functionality, ensuring that no threads have any references to the +memory block when we reuse it. + +#### Be Less Aggressive #### + +We implemented a test version using lock free run queues. This +implementation did however not perform as good as the version using +one lock per run queue. The reason for this was not investigated +enough to say why this was. Since the locked version performed better +we kept it, at least for now. The lock free version, however, forced +us to use other solutions, some of them we kept. + +Previously when a process that was in a run queue got suspended, we +removed it from the queue straight away. This involved locking the +process, locking the run queue, and then unlinking it from the double +linked list implementing the queue. Removing a process from a lock +free queue gets really complicated. Instead, of removing it from the +queue, we just leave it in the queue and mark it as suspended. When +later selected for execution we check if the process is suspended, if +so just dropped it. During its time in the queue, it might also get +resumed again, if so execute it when it get selected for execution. + +By keeping this part when reverting back to a locked implementation, +we could remove a pointer field in each process structure, and avoid +unnecessary operations on the process and the queue which might cause +contention. + +### Combined Modifications ### + +By combining the modifications of the process state management and the +run queue management, we can do large parts of the work involved when +managing processes with regards to scheduling and migration without +having any locks locked at all. In these situations we previously had +to have multiple locks locked. This of course caused a lot of rewrites +across large parts of the runtime system, but the rewrite both +simplified code and eliminated locking at a number of places. The +major benefit is, of course, reduced contention. + +### A Benchmark Result ### + +When running the chameneosredux benchmark, schedulers frequently run +out of work trying to steal work from each other. That is, either +succeeding in migrating, or trying to migrate processes which is a +scenario which we wanted to optimize. By the introduction of these +improvements, we got a speedup of 25-35% when running this benchmark +on a relatively new machine with an Intel i7 quad core processor with +hyper-threading using 8 schedulers.
\ No newline at end of file diff --git a/erts/emulator/internal_doc/ThreadProgress.md b/erts/emulator/internal_doc/ThreadProgress.md new file mode 100644 index 0000000000..6118bcf0f6 --- /dev/null +++ b/erts/emulator/internal_doc/ThreadProgress.md @@ -0,0 +1,308 @@ +Thread Progress +=============== + +Problems +-------- + +### Knowing When Threads Have Completed Accesses to a Data Structure ### + +When multiple threads access the same data structure you often need to +know when all threads have completed their accesses. For example, in +order to know when it is safe to deallocate the data structure. One +simple way to accomplish this is to reference count all accesses to +the data structure. The problem with this approach is that the cache +line where the reference counter is located needs to be communicated +between all involved processors. Such communication can become +extremely expensive and will scale poorly if the reference counter is +frequently accessed. That is, we want to use some other approach of +keeping track of threads than reference counting. + +### Knowing That Modifications of Memory is Consistently Observed ### + +Different hardware architectures have different memory models. Some +architectures allows very aggressive reordering of memory accesses +while other architectures only reorder a few specific cases. Common to +all modern hardware is, however, that some type of reordering will +occur. When using locks to protect all memory accesses made from +multiple threads such reorderings will not be visible. The locking +primitives will ensure that the memory accesses will be ordered. When +using lock free algorithms one do however have to take this reordering +made by the hardware into account. + +Hardware memory barriers or memory fences are instructions that can be +used to enforce order between memory accesses. Different hardware +architectures provide different memory barriers. Lock free algorithms +need to use memory barriers in order to ensure that memory accesses +are not reordered in such ways that the algorithm breaks down. Memory +barriers are also expensive instructions, so you typically want to +minimize the use of these instructions. + +Functionality Used to Address These Problems +------------------------------------------- + +The "thread progress" functionality in the Erlang VM is used to +address these problems. The name "thread progress" was chosen since we +want to use it to determine when all threads in a set of threads have +made such progress so that two specific events have taken place for +all them. + +The set of threads that we are interested in we call managed +threads. The managed threads are the only threads that we get any +information about. These threads *have* to frequently report +progress. Not all threads in the system are able to frequently report +progress. Such threads cannot be allowed in the set of managed threads +and are called unmanaged threads. An example of unmanaged threads are +threads in the async thread pool. Async threads can be blocked for +very long times and by this be prevented from frequently reporting +progress. Currently only scheduler threads and a couple of other +threads are managed threads. + +### Thread Progress Events ### + +Any thread in the system may use the thread progress functionality in +order to determine when the following events have occured at least +once in all managed threads: + +1. The thread has returned from other code to a known state in the + thread progress functionality, which is independent of any other + code. +2. The thread has executed a full memory barrier. + +These events, of course, need to occur ordered to other memory +operations. The operation of determining this begins by initiating the +thread progress operation. The thread that initiated the thread +progress operation after this poll for the completion of the +operation. Both of these events must occur at least once *after* the +thread progress operation has been initiated, and at least once +*before* the operation has completed in each managed thread. This is +ordered using communication via memory which makes it possible to draw +conclusion about the memory state after the thread progress operation +has completed. Lets call the progress made from initiation to +comletion for "thread progress". + +Assuming that the thread progress functionality is efficient, a lot of +algorithms can both be simplified and made more efficient than using +the first approach that comes to mind. A couple of examples follows. + +By being able to determine when the first event above has occurred we +can easily know when all managed threads have completed accesses to a +data structure. This can be determined the following way. We have an +implementation of some functionality `F` using a data structure +`D`. The reference to `D` is always looked up before `D` is being +accessed, and the references to `D` is always dropped before we leave +the code implementing `F`. If we remove the possibility to look up `D` +and then wait until the first event has occurred in all managed +threads, no managed threads can have any references to the data +structure `D`. This could for example have been achieved by using +reference counting, but the cache line containing the reference +counter would in this case be ping ponged between all processors +accessing `D` at every access. + +By being able to determine when the second event has occurred it is +quite easy to do complex modifications of memory that needs to be seen +consistently by other threads without having to resort to locking. By +doing the modifications, then issuing a full memory barrier, then wait +until the second event has occurred in all managed threads, and then +publish the modifications, we know that all managed threads reading +this memory will get a consistent view of the modifications. Managed +threads reading this will not have to issue any extra memory barriers +at all. + +Implementation of the Thread Progress Functionality +--------------------------------------------------- + +### Requirement on the Implementation ### + +In order to be able to determine when all managed threads have reached +the states that we are interested in we need to communicate between +all involved threads. We of course want to minimize this +communication. + +We also want threads to be able to determine when thread progress has +been made relatively fast. That is we need to have some balance +between comunication overhead and time to complete the operation. + +### API ### + +I will only present the most important functions in the API here. + +* `ErtsThrPrgrVal erts_thr_progress_later(void)` - Initiation of the + operation. The thread progress value returned can be used testing + for completion of the operation. +* `int erts_thr_progress_has_reached(ErtsThrPrgrVal val)` - Returns + a non zero value when we have reached the thread progress value + passed as argument. That is, when a non zero value is returned the + operation has completed. + +When a thread calls `my_val = erts_thr_progress_later()` and waits for +`erts_thr_progress_has_reached(my_val)` to return a non zero value it +knows that thread progress has been made. + +While waiting for `erts_thr_progress_has_reached()` to return a non +zero value we typically do not want to block waiting, but instead want +to continue working with other stuff. If we run out of other stuff to +work on we typically do want to block waiting until we have reached +the thread progress value that we are waiting for. In order to be able +to do this we provide functionality for waking up a thread when a +certain thread progress value has been reached: + +* `void erts_thr_progress_wakeup(ErtsSchedulerData *esdp, + ErtsThrPrgrVal val)` - Request wake up. The calling thread will be + woken when thread progress has reached val. + +Managed threads frequently need to update their thread progress by +calling the following functions: + +* `int erts_thr_progress_update(ErtsSchedulerData *esdp)` - Update + thread progress. If a non zero value is returned + `erts_thr_progress_leader_update()` has to be called without any + locks held. +* `int erts_thr_progress_leader_update(ErtsSchedulerData *esdp)` - + Leader update thread progress. + +Unmanaged threads can delay thread progress beeing made: + +* `ErtsThrPrgrDelayHandle erts_thr_progress_unmanaged_delay(void)` - + Delay thread progress. +* `void erts_thr_progress_unmanaged_continue(ErtsThrPrgrDelayHandle + handle)` - Let thread progress continue. + +Scheduler threads can schedule an operation to be executed by the +scheduler itself when thread progress has been made: + +* `void erts_schedule_thr_prgr_later_op(void (*funcp)(void *), void + *argp, ErtsThrPrgrLaterOp *memp)` - Schedule a call to `funcp`. The + call `(*funcp)(argp)` will be executed when thread progress has been + made since the call to `erts_schedule_thr_prgr_later_op()` was + made. + +### Implementation ### + +In order to determine when the events has happened we use a global +counter that is incremented when all managed threads have called +`erts_thr_progress_update()` (or `erts_thr_progress_leader_update()`). +This could naively be implemented using a "thread confirmed" counter. +This would however cause an explosion of communication where all +involved processors would need to communicate with each other at each +update. + +Instead of confirming at a global location each thread confirms that +it accepts in increment of the global counter in its own cache +line. These confirmation cache lines are located in sequence in an +array, and each confirmation cache line will only be written by one +and only one thread. One of the managed threads always have the leader +responsibility. This responsibility may jump between threads, but as +long as there are some activity in the system always one of them will +have the leader responsibility. The thread with the leader +responsibility will call `erts_thr_progress_leader_update()` which +will check that all other threads have confirmed an increment of the +global counter before doing the increment of the global counter. The +leader thread is the only thread reading the confirmation cache +lines. + +Doing it this way we will get a communication pattern of information +going from the leader thread out to all other managed threads and then +back from the other threads to the leader thread. This since only the +leader thread will write to the global counter and all other threads +will only read it, and since each confirmation cache lines will only +be written by one specific thread and only read by the leader +thread. When each managed thread is distributed over different +processors, the communication between processors will be a reflection +of this communication pattern between threads. + +The value returned from `erts_thr_progress_later()` equals the, by +this thread, latest confirmed value plus two. The global value may be +latest confirmed value or latest confirmed value minus one. In order +to be certain that all other managed threads actually will call +`erts_thr_progress_update()` at least once before we reach the value +returned from `erts_thr_progress_later()`, the global counter plus one +is not enough. This since all other threads may already have confirmed +current global value plus one at the time when we call +`erts_thr_progress_later()`. They are however guaranteed not to have +confirmed global value plus two at this time. + +The above described implementation more or less minimizes the +comunication needed before we can increment the global counter. The +amount of communication in the system due to the thread progress +functionality however also depend on the frequency with which managed +threads call `erts_thr_progress_update()`. Today each scheduler thread +calls `erts_thr_progress_update()` more or less each time an Erlang +process is scheduled out. One way of further reducing communication +due to the thread progress functionality is to only call +`erts_thr_progress_update()` every second, or third time an Erlang +process is scheduled out, or even less frequently than that. However, +by doing updates of thread progress less frequently all operations +depending on the thread progress functionality will also take a longer +time. + +#### Delay of Thread Progress by Unmanaged Threads #### + +In order to implement delay of thread progress from unmanaged threads +we use two reference counters. One being `current` and one being +`waiting`. When an unmanaged thread wants to delay thread progress it +increments `current` and gets a handle back to the reference counter +it incremented. When it later wants to enable continuation of thread +progress it uses the handle to decrement the reference counter it +previously incremented. + +When the leader threads is about to increment the global thread +progress counter it verifies that the `waiting` counter is zero before +doing so. If not zero, the leader isn't allowed to increment the +global counter, and needs to wait before it can do this. When it is +zero, it swaps the `waiting` and `current` counters before increasing +the global counter. From now on the new `waiting` counter will +decrease, so that it eventualy will reach zero, making it possible to +increment the global counter the next time. If we only used one +reference counter it would potentially be held above zero for ever by +different unmanaged threads. + +When an unmanaged thread increment the `current` counter it will not +prevent the next increment of the global counter, but instead the +increment after that. This is sufficient since the global counter +needs to be incremented two times before thread progress has been +made. It is also desirable not to prevent the first increment, since +the likelyhood increases that the delay is withdrawn before any +increment of the global counter is delayed. That is, the operation +will cause as little disruption as possible. + +However, this feature of delaying thread progress from unmanaged +threads should preferably be used as little as possible, since heavy +use of it will cause contention on the reference counter cache +lines. The functionality is however very useful in code which normally +only executes in managed threads, but which may under some infrequent +circumstances be executed in other threads. + +#### Overhead #### + +The overhead caused by the thread progress functionality is more or +less fixed using the same amount of schedulers regardless of the +number of uses of the functionality. Already today quite a lot of +functionality use it, and we plan to use it even more. When rewriting +old implementations of ERTS internal functionality to use the thread +progress functionality, this implies removing communication in the old +implementation. Otherwise it is simply no point rewriting the old +implementation to use the thread progress functionality. Since the +thread progress overhead is more or less fixed, the rewrite will cause +a reduction of the total communication in the system. + +##### An Example ##### + +The main structure of an ETS table was originally managed using +reference counting. Already a long time ago we replaced this strategy +since the reference counter caused contention on each access of the +table. The solution used was to schedule "confirm deletion" jobs on +each scheduler in order to know when it was safe to deallocate the +table structure of a removed table. These confirm deletion jobs needed +to be allocated. That is, we had to allocate and deallocate as many +blocks as schedulers in order to deallocate one block. This of course +was a quite an expensive operation, but we only needed to do this once +when removing a table. It was more important to get rid of the +contention on the reference counter which was present on every +operation on the table. + +When the thread progress functionality had been introduced, we could +remove the code implementing the "confirm deletion" jobs, and then +just schedule a thread progress later operation which deallocates the +structure. Besides simplifying the code a lot, we got an increase of +more than 10% of the number of transactions per second handled on a +mnesia tpcb benchmark executing on a quad core machine. diff --git a/erts/emulator/internal_doc/Tracing.md b/erts/emulator/internal_doc/Tracing.md new file mode 100644 index 0000000000..30bc5327a7 --- /dev/null +++ b/erts/emulator/internal_doc/Tracing.md @@ -0,0 +1,220 @@ +Non-blocking trace setting +========================== + +Introduction +------------ + +Before OTP R16 when trace settings were changed by `erlang:trace_pattern`, +all other execution in the VM were halted while the trace operation +was carried out in single threaded mode. Similar to code loading, this +can impose a severe problem for availability that grows with the +number of cores. + +In OTP R16, trace breakpoints are set in the code without blocking the +VM. Erlang processes may continue executing undisturbed in parallel +during the entire operation. The same base technique is used as for +code loading. A staging area of breakpoints is prepared and then made +active with a single atomic operation. + + +Redesign of Breakpoint Wheel +---------------------------- + +To make it easier to manage breakpoints without single threaded mode a +redesign of the breakpoint mechanism has been made. The old +"breakpoint wheel" data structure was a circular double-linked list of +breakpoints for each instrumented function. It was invented before the +SMP emulator. To support it in the SMP emulator, is was essentially +expanded to one breakpoint wheel per scheduler. As more breakpoint +types have been added, the implementation have become messy and hard +to understand and maintain. + +In the new design the old wheel was dropped and instead replaced by +one struct (`GenericBp`) to hold the data for all types of breakpoints +for each instrumented function. A bit-flag field is used to indicate +what different type of break actions that are enabled. + + +Same Same but Different +----------------------- +Even though `trace_pattern` use the same technique as the non-blocking +code loading with replicated generations of data structures and an +atomic switch, the implementations are quite separate from each +other. One initial idea was to use the existing mechanism of code +loading to do a dummy load operation that would make a copy of the +affected modules. That copy could then be instrumented with +breakpoints before making it reachable with the same atomic switch as +done for code loading. This approach seems straight forward but has a +number of shortcomings, one being the large memory footprint when many +modules are instrumented. Another problem is how execution will reach +the new instrumented code. Normally loaded code can only be reached +through external functions calls. Trace settings must be activated +instantaneously without the need of external function calls. + +The choosen solution is instead for tracing to use the technique of +replication applied on the data structures for breakpoints. Two +generations of breakpoints are kept and indentified by index of 0 and +1. The global atomic variables `erts_active_bp_index` will determine +which generation of breakpoints running code will use. + +### Atomicy Without Atomic Operations + +Not using the code loading generations (or any other code duplication) +means that `trace_pattern` must at some point write to the active beam +code in order for running processes to reach the staged breakpoints +structures. This can be done with one single atomic write operation +per instrumented function. The beam instruction words are however read +with normal memory loads and not through the atomic API. The only +guarantee we need is that the written instruction word is seen as +atomic. Either fully written or not at all. This is true for word +aligned write operation on all hardware architectures we use. + + +Adding a new Breakpoint +----------------------- +This is a simplified sequence describing what `trace_pattern` goes +through when adding a new breakpoint. + +1. Seize exclusive code write permission (suspend process until we get it). + +2. Allocate breakpoint structure `GenericBp` including both generations. + Set the active part as disabled with a zeroed flagfield. Save the original + instruction word in the breakpoint. + +3. Write a pointer to the breakpoint at offset -4 from the first + instruction "func_info" header. + +4. Set the staging part of the breakpoint as enabled with specified + breakpoint data. + +5. Wait for thread progress. + +6. Write a `op_i_generic_breakpoint` as the first instruction for the function. + This instruction will execute the breakpoint that it finds at offset -4. + +7. Wait for thread progress. + +8. Commit the breadpoint by switching `erts_active_bp_index`. + +9. Wait for thread progress. + +10. Prepare for next call to `trace_pattern` by updating the new staging part + (the old active) of the breakpoint to be identic to the the new active part. + +11. Release code write permission and return from `trace_pattern`. + + +The code write permission "lock" seized in step 1 is the same as used +by code loading. This will ensure that only one process at a time can +stage new trace settings but it will also prevent concurrent code +loading and make sure we see a consistent view of the beam code during +the entire sequence. + +Between step 6 and 8, runninng processes might execute the written +`op_i_generic_breakpoint` instruction. They will get the breakpoint +structure written in step 3, read `erts_active_bp_index` and execute +the corresponding part of the breakpoint. Before the switch in step 8 +becomes visible they will however execute the disabled part of the +breakpoint structure and do nothing other than executing the saved +original instruction. + + +To Updating and Remove Breakpoints +---------------------------------- + +The above sequence did only describe adding a new breakpoint. We do +basically the same sequence to update the settings of an existing +breakpoint except step 2,3 and 6 can be skipped as it has already been +done. + +To remove a breakpoint some more steps are needed. The idea is to +first stage the breakpoint as disabled, do the switch, wait for thread +progress and then remove the disabled breakpoint by restoring the +original beam instruction. + +Here is a more complete sequence that contains both adding, updating +and removing breakpoints. + +1. Seize exclusive code write permission (suspend process until we get it). + +2. Allocate new breakpoint structures with a disabled active part and + the original beam instruction. Write a pointer to the breakpoint in + "func_info" header at offset -4. + +3. Update the staging part of all affected breakpoints. Disable + breakpoints that are to be removed. + +4. Wait for thread progress. + +5. Write a `op_i_generic_breakpoint` as the first instruction for all + functions with new breakpoints. + +6. Wait for thread progress. + +7. Commit all staged breadpoints by switching `erts_active_bp_index`. + +8. Wait for thread progress. + + +9. Restore original beam instruction for disabled breakpoints. + +10. Wait for thread progress. + +11. Prepare for next call to `trace_pattern` by updating the new + staging area (the old active) for all enabled breakpoints. + +12. Deallocate disabled breakpoint structures. + +13. Release code write permission and return from `trace_pattern`. + + +### All that Waiting for Thread Progress + +There are four rounds of waiting for thread progress in the above +sequence. In the code loading sequence we sacrificed memory overhead +of three generations to avoid a second round of thread progress. The +latency of `trace_pattern` should not be such a big problem for +however, as it is normally not called in a rapid sequence. + +The waiting in step 4 is to make sure all threads will see an updated +view of the breakpoint structures once they become reachable through +the `op_i_generic_breakpoint` instruction written in step 5. + +The waiting in step 6 is to make the activation of the new trace +settings "as atomic as possible". Different cores might see the new +value of `erts_active_bp_index` at different times as it is read +without any memory barrier. But this is the best we can do without +more expensive thread synchronization. + +The waiting in step 8 is to make sure we dont't restore the original +bream instructions for disabled breakpoints until we know that no +thread is still accessing the old enabled part of a disabled +breakpoint. + +The waiting in step 10 is to make sure no lingering thread is still +accessing disabled breakpoint structures to be deallocated in step +12. + + +Global Tracing +-------------- + +Call tracing with `global` option only affects external function +calls. This was earlier handled by inserting a special trace +instruction in export entries without the use of breakpoints. With the +new non-blocking tracing we want to avoid special handling for global +tracing and make use of the staging and atomic switching within the +breakpoint mechanism. The solution was to create the same type of +breakpoint structure for a global call trace. The difference to local +tracing is that we insert the `op_i_generic_breakpoint` instruction +(with its pointer at offset -4) in the export entry rather than in the +code. + + +Future work +----------- + +We still go to single threaded mode when new code is loaded for a +module that is traced, or when loading code when there is a default +trace pattern set. That is not impossible to fix, but that requires +much closer cooperation between tracing BIFs and the loader BIFs. diff --git a/erts/emulator/sys/unix/erl_unix_sys_ddll.c b/erts/emulator/sys/unix/erl_unix_sys_ddll.c index 12c47d0088..8760b58839 100644 --- a/erts/emulator/sys/unix/erl_unix_sys_ddll.c +++ b/erts/emulator/sys/unix/erl_unix_sys_ddll.c @@ -101,7 +101,7 @@ void erl_sys_ddll_init(void) { /* * Open a shared object */ -int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err) +int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err) { #if defined(HAVE_DLOPEN) char* dlname; diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c index 61f9f6a59a..59e34eb819 100644 --- a/erts/emulator/sys/unix/sys.c +++ b/erts/emulator/sys/unix/sys.c @@ -547,6 +547,25 @@ erts_sys_pre_init(void) #endif #endif /* USE_THREADS */ erts_smp_atomic_init_nob(&sys_misc_mem_sz, 0); + + { + /* + * Unfortunately we depend on fd 0,1,2 in the old shell code. + * So if for some reason we do not have those open when we start + * we have to open them here. Not doing this can cause the emulator + * to deadlock when reaping the fd_driver ports :( + */ + int fd; + /* Make sure fd 0 is open */ + if ((fd = open("/dev/null", O_RDONLY)) != 0) + close(fd); + /* Make sure fds 1 and 2 are open */ + while (fd < 3) { + fd = open("/dev/null", O_WRONLY); + } + close(fd); + } + } void diff --git a/erts/emulator/sys/win32/erl_win32_sys_ddll.c b/erts/emulator/sys/win32/erl_win32_sys_ddll.c index 2d3f073cc2..338f0d7386 100644 --- a/erts/emulator/sys/win32/erl_win32_sys_ddll.c +++ b/erts/emulator/sys/win32/erl_win32_sys_ddll.c @@ -59,7 +59,7 @@ void erl_sys_ddll_init(void) { * Open a shared object * Expecting 'full_name' as an UTF-8 string. */ -int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err) +int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err) { HINSTANCE hinstance; int len; diff --git a/erts/emulator/test/binary_SUITE.erl b/erts/emulator/test/binary_SUITE.erl index a340a805b5..bce4278337 100644 --- a/erts/emulator/test/binary_SUITE.erl +++ b/erts/emulator/test/binary_SUITE.erl @@ -447,26 +447,26 @@ terms(Config) when is_list(Config) -> Sz1 when is_integer(Sz1), size(Bin1) =< Sz1 -> ok end, - Term = binary_to_term(Bin), - Term = binary_to_term(Bin, [safe]), + Term = binary_to_term_stress(Bin), + Term = binary_to_term_stress(Bin, [safe]), Unaligned = make_unaligned_sub_binary(Bin), - Term = binary_to_term(Unaligned), - Term = binary_to_term(Unaligned, []), - Term = binary_to_term(Bin, [safe]), + Term = binary_to_term_stress(Unaligned), + Term = binary_to_term_stress(Unaligned, []), + Term = binary_to_term_stress(Bin, [safe]), BinC = erlang:term_to_binary(Term, [compressed]), - Term = binary_to_term(BinC), + Term = binary_to_term_stress(BinC), true = size(BinC) =< size(Bin), Bin = term_to_binary(Term, [{compressed,0}]), terms_compression_levels(Term, size(Bin), 1), UnalignedC = make_unaligned_sub_binary(BinC), - Term = binary_to_term(UnalignedC) + Term = binary_to_term_stress(UnalignedC) end, ?line test_terms(TestFun), ok. terms_compression_levels(Term, UncompressedSz, Level) when Level < 10 -> BinC = erlang:term_to_binary(Term, [{compressed,Level}]), - Term = binary_to_term(BinC), + Term = binary_to_term_stress(BinC), Sz = byte_size(BinC), true = Sz =< UncompressedSz, terms_compression_levels(Term, UncompressedSz, Level+1); @@ -476,9 +476,9 @@ terms_float(Config) when is_list(Config) -> ?line test_floats(fun(Term) -> Bin0 = term_to_binary(Term), Bin0 = term_to_binary(Term, [{minor_version,0}]), - Term = binary_to_term(Bin0), + Term = binary_to_term_stress(Bin0), Bin1 = term_to_binary(Term, [{minor_version,1}]), - Term = binary_to_term(Bin1), + Term = binary_to_term_stress(Bin1), true = size(Bin1) < size(Bin0), Size0 = erlang:external_size(Term), Size00 = erlang:external_size(Term, [{minor_version, 0}]), @@ -490,7 +490,7 @@ terms_float(Config) when is_list(Config) -> float_middle_endian(Config) when is_list(Config) -> %% Testing for roundtrip is not enough. ?line <<131,70,63,240,0,0,0,0,0,0>> = term_to_binary(1.0, [{minor_version,1}]), - ?line 1.0 = binary_to_term(<<131,70,63,240,0,0,0,0,0,0>>). + ?line 1.0 = binary_to_term_stress(<<131,70,63,240,0,0,0,0,0,0>>). external_size(Config) when is_list(Config) -> %% Build a term whose external size only fits in a big num (on 32-bit CPU). @@ -608,10 +608,10 @@ bad_binary_to_term(Config) when is_list(Config) -> ok. bad_bin_to_term(BadBin) -> - {'EXIT',{badarg,_}} = (catch binary_to_term(BadBin)). + {'EXIT',{badarg,_}} = (catch binary_to_term_stress(BadBin)). bad_bin_to_term(BadBin,Opts) -> - {'EXIT',{badarg,_}} = (catch binary_to_term(BadBin,Opts)). + {'EXIT',{badarg,_}} = (catch binary_to_term_stress(BadBin,Opts)). safe_binary_to_term2(doc) -> "Test safety options for binary_to_term/2"; safe_binary_to_term2(Config) when is_list(Config) -> @@ -622,7 +622,7 @@ safe_binary_to_term2(Config) when is_list(Config) -> BadRef = <<131,114,0,3,BadHostAtom/binary,0,<<0,0,0,255>>/binary, Empty/binary,Empty/binary>>, ?line bad_bin_to_term(BadRef, [safe]), % good ref, with a bad atom - ?line fullsweep_after = binary_to_term(<<131,100,0,15,"fullsweep_after">>, [safe]), % should be a good atom + ?line fullsweep_after = binary_to_term_stress(<<131,100,0,15,"fullsweep_after">>, [safe]), % should be a good atom BadExtFun = <<131,113,100,0,4,98,108,117,101,100,0,4,109,111,111,110,97,3>>, ?line bad_bin_to_term(BadExtFun, [safe]), ok. @@ -679,14 +679,14 @@ corrupter0(Term) -> corrupter(Bin, Pos) when Pos >= 0 -> ?line {ShorterBin, Rest} = split_binary(Bin, Pos), - ?line catch binary_to_term(ShorterBin), %% emulator shouldn't crash + ?line catch binary_to_term_stress(ShorterBin), %% emulator shouldn't crash ?line MovedBin = list_to_binary([ShorterBin]), - ?line catch binary_to_term(MovedBin), %% emulator shouldn't crash + ?line catch binary_to_term_stress(MovedBin), %% emulator shouldn't crash %% Bit faults, shouldn't crash <<Byte,Tail/binary>> = Rest, Fun = fun(M) -> FaultyByte = Byte bxor M, - catch binary_to_term(<<ShorterBin/binary, + catch binary_to_term_stress(<<ShorterBin/binary, FaultyByte, Tail/binary>>) end, ?line lists:foreach(Fun,[1,2,4,8,16,32,64,128,255]), ?line corrupter(Bin, Pos-1); @@ -700,7 +700,7 @@ more_bad_terms(Config) when is_list(Config) -> ?line ok = io:format("File: ~s\n", [BadFile]), ?line case file:read_file(BadFile) of {ok,Bin} -> - ?line {'EXIT',{badarg,_}} = (catch binary_to_term(Bin)), + ?line {'EXIT',{badarg,_}} = (catch binary_to_term_stress(Bin)), ok; Other -> ?line ?t:fail(Other) @@ -709,7 +709,7 @@ more_bad_terms(Config) when is_list(Config) -> otp_5484(Config) when is_list(Config) -> ?line {'EXIT',_} = (catch - binary_to_term( + binary_to_term_stress( <<131, 104,2, %Tuple, 2 elements 103, %Pid @@ -722,7 +722,7 @@ otp_5484(Config) when is_list(Config) -> ?line {'EXIT',_} = (catch - binary_to_term( + binary_to_term_stress( <<131, 104,2, %Tuple, 2 elements 103, %Pid @@ -734,13 +734,13 @@ otp_5484(Config) when is_list(Config) -> ?line {'EXIT',_} = (catch - binary_to_term( + binary_to_term_stress( %% A old-type fun in a list containing a bad creator pid. <<131,108,0,0,0,1,117,0,0,0,0,103,100,0,13,110,111,110,111,100,101,64,110,111,104,111,115,116,255,255,0,25,255,0,0,0,0,100,0,1,116,97,0,98,6,142,121,72,106>>)), ?line {'EXIT',_} = (catch - binary_to_term( + binary_to_term_stress( %% A new-type fun in a list containing a bad creator pid. %% <<131, @@ -752,7 +752,7 @@ otp_5484(Config) when is_list(Config) -> ?line {'EXIT',_} = (catch - binary_to_term( + binary_to_term_stress( %% A new-type fun in a list containing a bad module. <<131, 108,0,0,0,1, %List, 1 element @@ -763,7 +763,7 @@ otp_5484(Config) when is_list(Config) -> ?line {'EXIT',_} = (catch - binary_to_term( + binary_to_term_stress( %% A new-type fun in a list containing a bad index. <<131, 108,0,0,0,1, %List, 1 element @@ -775,7 +775,7 @@ otp_5484(Config) when is_list(Config) -> ?line {'EXIT',_} = (catch - binary_to_term( + binary_to_term_stress( %% A new-type fun in a list containing a bad unique value. <<131, 108,0,0,0,1, %List, 1 element @@ -788,46 +788,46 @@ otp_5484(Config) when is_list(Config) -> %% An absurdly large atom. ?line {'EXIT',_} = - (catch binary_to_term(iolist_to_binary([<<131,100,65000:16>>| + (catch binary_to_term_stress(iolist_to_binary([<<131,100,65000:16>>| lists:duplicate(65000, 42)]))), %% Longer than 255 characters. ?line {'EXIT',_} = - (catch binary_to_term(iolist_to_binary([<<131,100,256:16>>| + (catch binary_to_term_stress(iolist_to_binary([<<131,100,256:16>>| lists:duplicate(256, 42)]))), %% OTP-7218. Thanks to Matthew Dempsky. Also make sure that we %% cover the other error cases for external funs (EXPORT_EXT). ?line {'EXIT',_} = - (catch binary_to_term( + (catch binary_to_term_stress( <<131, 113, %EXPORT_EXP 97,13, %Integer: 13 97,13, %Integer: 13 97,13>>)), %Integer: 13 ?line {'EXIT',_} = - (catch binary_to_term( + (catch binary_to_term_stress( <<131, 113, %EXPORT_EXP 100,0,1,64, %Atom: '@' 97,13, %Integer: 13 97,13>>)), %Integer: 13 ?line {'EXIT',_} = - (catch binary_to_term( + (catch binary_to_term_stress( <<131, 113, %EXPORT_EXP 100,0,1,64, %Atom: '@' 100,0,1,64, %Atom: '@' 106>>)), %NIL ?line {'EXIT',_} = - (catch binary_to_term( + (catch binary_to_term_stress( <<131, 113, %EXPORT_EXP 100,0,1,64, %Atom: '@' 100,0,1,64, %Atom: '@' 98,255,255,255,255>>)), %Integer: -1 ?line {'EXIT',_} = - (catch binary_to_term( + (catch binary_to_term_stress( <<131, 113, %EXPORT_EXP 100,0,1,64, %Atom: '@' @@ -835,7 +835,7 @@ otp_5484(Config) when is_list(Config) -> 113,97,13,97,13,97,13>>)), %fun 13:13/13 %% Bad funs. - ?line {'EXIT',_} = (catch binary_to_term(fake_fun(0, lists:seq(0, 256)))), + ?line {'EXIT',_} = (catch binary_to_term_stress(fake_fun(0, lists:seq(0, 256)))), ok. fake_fun(Arity, Env0) -> @@ -869,7 +869,7 @@ try_bad_lengths(B) -> try_bad_lengths(B, L) when L > 16#FFFFFFF0 -> Bin = <<B/binary,L:32>>, io:format("~p\n", [Bin]), - {'EXIT',_} = (catch binary_to_term(Bin)), + {'EXIT',_} = (catch binary_to_term_stress(Bin)), try_bad_lengths(B, L-1); try_bad_lengths(_, _) -> ok. @@ -923,7 +923,7 @@ otp_6817_try_bin(Bin) -> %% If the bug is present, the heap pointer will moved when the invalid term %% is found and we will have a linked list passing through the limbo area %% between the heap top and the stack pointer. - catch binary_to_term(Bin), + catch binary_to_term_stress(Bin), %% If the bug is present, we will overwrite the pointers in the limbo area. Filler = erlang:make_tuple(1024, 16#3FA), @@ -935,7 +935,7 @@ otp_6817_try_bin(Bin) -> otp_8117(doc) -> "Some bugs in binary_to_term when 32-bit integers are negative."; otp_8117(suite) -> []; otp_8117(Config) when is_list(Config) -> - [otp_8117_do(Op,-(1 bsl N)) || Op <- ['fun',list,tuple], + [otp_8117_do(Op,-(1 bsl N)) || Op <- ['fun',named_fun,list,tuple], N <- lists:seq(0,31)], ok. @@ -944,6 +944,11 @@ otp_8117_do('fun',Neg) -> FunBin = term_to_binary(fun() -> ok end), ?line <<B1:27/binary,_NumFree:32,Rest/binary>> = FunBin, ?line bad_bin_to_term(<<B1/binary,Neg:32,Rest/binary>>); +otp_8117_do(named_fun,Neg) -> + % Named fun with negative num_free + FunBin = term_to_binary(fun F() -> F end), + ?line <<B1:27/binary,_NumFree:32,Rest/binary>> = FunBin, + ?line bad_bin_to_term(<<B1/binary,Neg:32,Rest/binary>>); otp_8117_do(list,Neg) -> %% List with negative length ?line bad_bin_to_term(<<131,104,2,108,Neg:32,97,11,104,1,97,12,97,13,106,97,14>>); @@ -1233,7 +1238,7 @@ bit_sized_binary_sizes(Config) when is_list(Config) -> bsbs_1(A) -> BinSize = 32+A, io:format("A: ~p BinSize: ~p", [A,BinSize]), - Bin = binary_to_term(<<131,$M,5:32,A,0,0,0,0,0>>), + Bin = binary_to_term_stress(<<131,$M,5:32,A,0,0,0,0,0>>), BinSize = bit_size(Bin). deep(Config) when is_list(Config) -> @@ -1250,7 +1255,7 @@ deep(Config) when is_list(Config) -> deep_roundtrip(T) -> B = term_to_binary(T), - T = binary_to_term(B). + T = binary_to_term_stress(B). obsolete_funs(Config) when is_list(Config) -> erts_debug:set_internal_state(available_internal_state, true), @@ -1285,29 +1290,29 @@ obsolete_fun(Fun) -> Tuple = no_fun_roundtrip(Fun). no_fun_roundtrip(Term) -> - binary_to_term(erts_debug:get_internal_state({term_to_binary_no_funs,Term})). + binary_to_term_stress(erts_debug:get_internal_state({term_to_binary_no_funs,Term})). %% Test non-standard encodings never generated by term_to_binary/1 %% but recognized by binary_to_term/1. robustness(Config) when is_list(Config) -> - ?line [] = binary_to_term(<<131,107,0,0>>), %Empty string. - ?line [] = binary_to_term(<<131,108,0,0,0,0,106>>), %Zero-length list. + ?line [] = binary_to_term_stress(<<131,107,0,0>>), %Empty string. + ?line [] = binary_to_term_stress(<<131,108,0,0,0,0,106>>), %Zero-length list. %% {[],a} where [] is a zero-length list. - ?line {[],a} = binary_to_term(<<131,104,2,108,0,0,0,0,106,100,0,1,97>>), + ?line {[],a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0,106,100,0,1,97>>), %% {42,a} where 42 is a zero-length list with 42 in the tail. - ?line {42,a} = binary_to_term(<<131,104,2,108,0,0,0,0,97,42,100,0,1,97>>), + ?line {42,a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0,97,42,100,0,1,97>>), %% {{x,y},a} where {x,y} is a zero-length list with {x,y} in the tail. - ?line {{x,y},a} = binary_to_term(<<131,104,2,108,0,0,0,0, + ?line {{x,y},a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0, 104,2,100,0,1,120,100,0,1, 121,100,0,1,97>>), %% Bignums fitting in 32 bits. - ?line 16#7FFFFFFF = binary_to_term(<<131,98,127,255,255,255>>), - ?line -1 = binary_to_term(<<131,98,255,255,255,255>>), + ?line 16#7FFFFFFF = binary_to_term_stress(<<131,98,127,255,255,255>>), + ?line -1 = binary_to_term_stress(<<131,98,255,255,255,255>>), ok. @@ -1325,7 +1330,7 @@ run_otp_8180(Name) -> ?line {ok,Bins} = file:consult(Name), [begin io:format("~p\n", [Bin]), - ?line {'EXIT',{badarg,_}} = (catch binary_to_term(Bin)) + ?line {'EXIT',{badarg,_}} = (catch binary_to_term_stress(Bin)) end || Bin <- Bins], ok. @@ -1394,3 +1399,52 @@ unaligned_sub_bin(Bin0, Offs) -> Bin. id(I) -> I. + + +%% Stress binary_to_term with different initial reductions +binary_to_term_stress(Bin) -> + binary_to_term_stress(Bin, no_opts). + +binary_to_term_stress(Bin, Opts) -> + Reds = get_reds(), + T = b2t(erlang:system_info(context_reductions), + Bin, Opts, catch_binary_to_term(Bin, Opts)), + set_reds(Reds), + T = case Opts of + no_opts -> binary_to_term(Bin); + _ -> binary_to_term(Bin,Opts) + end. + +catch_binary_to_term(Bin, no_opts) -> + try binary_to_term(Bin) + catch + error:badarg -> binary_to_term_throws_badarg + end; +catch_binary_to_term(Bin, Opts) -> + try binary_to_term(Bin, Opts) + catch + error:badarg -> binary_to_term_throws_badarg + end. + +b2t(0, _Bin, _Opts, Term) -> + Term; +b2t(Reds, Bin, Opts, Term) -> + set_reds(Reds), + Term = catch_binary_to_term(Bin,Opts), + b2t(Reds div 3, Bin, Opts, Term). + +set_reds(Reds) -> + try erts_debug:set_internal_state(reds_left, Reds) + catch + error:undef -> + erts_debug:set_internal_state(available_internal_state, true), + set_reds(Reds) + end. + +get_reds() -> + try erts_debug:get_internal_state(reds_left) + catch + error:undef -> + erts_debug:set_internal_state(available_internal_state, true), + get_reds() + end. diff --git a/erts/emulator/test/fun_SUITE.erl b/erts/emulator/test/fun_SUITE.erl index 36ba4e0f48..8ad5f290ed 100644 --- a/erts/emulator/test/fun_SUITE.erl +++ b/erts/emulator/test/fun_SUITE.erl @@ -262,6 +262,16 @@ equality(Config) when is_list(Config) -> ?line false = eq(FF2, FF4), ?line false = eq(FF3, FF4), + %% EEP37 + H1 = fun Fact(N) when N > 0 -> N * Fact(N - 1); Fact(0) -> 1 end, + H2 = fun Pow(N, M) when M > 0 -> N * Pow(N, M - 1); Pow(_, 0) -> 1 end, + H1_copy = copy_term(H1), + + true = eq(H1, H1), + true = eq(H1, H1_copy), + true = eq(H2, H2), + false = eq(H1, H2), + ok. eq(X, X) -> true; diff --git a/erts/preloaded/ebin/erlang.beam b/erts/preloaded/ebin/erlang.beam Binary files differindex fb222cb64b..73fac27161 100644 --- a/erts/preloaded/ebin/erlang.beam +++ b/erts/preloaded/ebin/erlang.beam diff --git a/erts/preloaded/ebin/erts_internal.beam b/erts/preloaded/ebin/erts_internal.beam Binary files differindex 9ab806718b..12b36913a9 100644 --- a/erts/preloaded/ebin/erts_internal.beam +++ b/erts/preloaded/ebin/erts_internal.beam diff --git a/erts/preloaded/src/erlang.erl b/erts/preloaded/src/erlang.erl index a21da2ecc9..0ed677c3d8 100644 --- a/erts/preloaded/src/erlang.erl +++ b/erts/preloaded/src/erlang.erl @@ -362,15 +362,25 @@ binary_to_list(_Binary, _Start, _Stop) -> %% binary_to_term/1 -spec binary_to_term(Binary) -> term() when Binary :: ext_binary(). -binary_to_term(_Binary) -> - erlang:nif_error(undefined). +binary_to_term(Binary) -> + %% This BIF may throw badarg while trapping + try + erts_internal:binary_to_term(Binary) + catch + error:Reason -> erlang:error(Reason,[Binary]) + end. %% binary_to_term/2 -spec binary_to_term(Binary, Opts) -> term() when Binary :: ext_binary(), Opts :: [safe]. -binary_to_term(_Binary, _Opts) -> - erlang:nif_error(undefined). +binary_to_term(Binary, Opts) -> + %% This BIF may throw badarg while trapping + try + erts_internal:binary_to_term(Binary,Opts) + catch + error:Reason -> erlang:error(Reason,[Binary,Opts]) + end. %% bit_size/1 %% Shadowed by erl_bif_types: erlang:bit_size/1 diff --git a/erts/preloaded/src/erts_internal.erl b/erts/preloaded/src/erts_internal.erl index c8e8e7e069..d6a185482e 100644 --- a/erts/preloaded/src/erts_internal.erl +++ b/erts/preloaded/src/erts_internal.erl @@ -29,7 +29,7 @@ -module(erts_internal). -export([await_port_send_result/3]). - +-export([binary_to_term/1, binary_to_term/2]). -export([port_command/3, port_connect/2, port_close/1, port_control/3, port_call/3, port_info/1, port_info/2]). @@ -160,3 +160,13 @@ request_system_task(_Pid, _Prio, _Request) -> check_process_code(_Module, _OptionList) -> erlang:nif_error(undefined). +-spec binary_to_term(Binary) -> term() when + Binary :: binary(). +binary_to_term(_Binary) -> + erlang:nif_error(undefined). + +-spec binary_to_term(Binary, Opts) -> term() when + Binary :: binary(), + Opts :: [safe]. +binary_to_term(_Binary, _Opts) -> + erlang:nif_error(undefined). |