Diffstat (limited to 'erts')
-rw-r--r--  erts/doc/src/absform.xml                                      |  12
-rw-r--r--  erts/doc/src/erlc.xml                                         |  10
-rw-r--r--  erts/emulator/beam/atom.names                                 |   1
-rw-r--r--  erts/emulator/beam/bif.tab                                    |   8
-rw-r--r--  erts/emulator/beam/erl_bif_ddll.c                             |  26
-rwxr-xr-x  erts/emulator/beam/erl_bif_info.c                             |   5
-rw-r--r--  erts/emulator/beam/erl_bif_port.c                             |   2
-rw-r--r--  erts/emulator/beam/erl_bif_re.c                               |   4
-rw-r--r--  erts/emulator/beam/erl_debug.c                                |   2
-rw-r--r--  erts/emulator/beam/erl_init.c                                 |   4
-rw-r--r--  erts/emulator/beam/erl_nif.c                                  |   7
-rw-r--r--  erts/emulator/beam/erl_process.c                              |   9
-rw-r--r--  erts/emulator/beam/erl_unicode.c                              |  17
-rw-r--r--  erts/emulator/beam/erl_zlib.c                                 |  40
-rw-r--r--  erts/emulator/beam/erl_zlib.h                                 |   6
-rw-r--r--  erts/emulator/beam/external.c                                 | 748
-rw-r--r--  erts/emulator/beam/external.h                                 |   1
-rwxr-xr-x  erts/emulator/beam/global.h                                   |   3
-rw-r--r--  erts/emulator/beam/io.c                                       |   2
-rw-r--r--  erts/emulator/beam/sys.h                                      |   9
-rw-r--r--  erts/emulator/drivers/common/inet_drv.c                       |   4
-rw-r--r--  erts/emulator/internal_doc/CarrierMigration.md                | 201
-rw-r--r--  erts/emulator/internal_doc/CodeLoading.md                     | 186
-rw-r--r--  erts/emulator/internal_doc/DelayedDealloc.md                  | 175
-rw-r--r--  erts/emulator/internal_doc/PTables.md                         | 356
-rw-r--r--  erts/emulator/internal_doc/PortSignals.md                     | 267
-rw-r--r--  erts/emulator/internal_doc/ProcessManagementOptimizations.md  | 172
-rw-r--r--  erts/emulator/internal_doc/ThreadProgress.md                  | 308
-rw-r--r--  erts/emulator/internal_doc/Tracing.md                         | 220
-rw-r--r--  erts/emulator/sys/unix/erl_unix_sys_ddll.c                    |   2
-rw-r--r--  erts/emulator/sys/unix/sys.c                                  |  19
-rw-r--r--  erts/emulator/sys/win32/erl_win32_sys_ddll.c                  |   2
-rw-r--r--  erts/emulator/test/binary_SUITE.erl                           | 148
-rw-r--r--  erts/emulator/test/fun_SUITE.erl                              |  10
-rw-r--r--  erts/preloaded/ebin/erlang.beam                               | bin 97588 -> 97872 bytes
-rw-r--r--  erts/preloaded/ebin/erts_internal.beam                        | bin 3784 -> 4096 bytes
-rw-r--r--  erts/preloaded/src/erlang.erl                                 |  18
-rw-r--r--  erts/preloaded/src/erts_internal.erl                          |  12
38 files changed, 2768 insertions, 248 deletions
diff --git a/erts/doc/src/absform.xml b/erts/doc/src/absform.xml
index 55aef9f8ab..4acc03b133 100644
--- a/erts/doc/src/absform.xml
+++ b/erts/doc/src/absform.xml
@@ -290,6 +290,18 @@
<item>If E is <c><![CDATA[fun Fc_1 ; ... ; Fc_k end]]></c>
where each <c><![CDATA[Fc_i]]></c> is a function clause then Rep(E) =
<c><![CDATA[{'fun',LINE,{clauses,[Rep(Fc_1), ..., Rep(Fc_k)]}}]]></c>.</item>
+ <item>If E is <c><![CDATA[fun Name Fc_1 ; ... ; Name Fc_k end]]></c>
+ where <c><![CDATA[Name]]></c> is a variable and each
+ <c><![CDATA[Fc_i]]></c> is a function clause then Rep(E) =
+ <c><![CDATA[{named_fun,LINE,Name,[Rep(Fc_1), ..., Rep(Fc_k)]}]]></c>.
+ </item>
+ <item>If E is <c><![CDATA[query [E_0 || W_1, ..., W_k] end]]></c>,
+ where each <c><![CDATA[W_i]]></c> is a generator or a filter, then
+ Rep(E) = <c><![CDATA[{'query',LINE,{lc,LINE,Rep(E_0),[Rep(W_1), ..., Rep(W_k)]}}]]></c>.
+ For Rep(W), see below.</item>
+ <item>If E is <c><![CDATA[E_0.Field]]></c>, a Mnesia record access
+ inside a query, then
+ Rep(E) = <c><![CDATA[{record_field,LINE,Rep(E_0),Rep(Field)}]]></c>.</item>
<item>If E is <c><![CDATA[( E_0 )]]></c>, then
Rep(E) = <c><![CDATA[Rep(E_0)]]></c>,
i.e., parenthesized expressions cannot be distinguished from their bodies.</item>
diff --git a/erts/doc/src/erlc.xml b/erts/doc/src/erlc.xml
index 10cab344b0..c3fc3b1686 100644
--- a/erts/doc/src/erlc.xml
+++ b/erts/doc/src/erlc.xml
@@ -234,6 +234,16 @@ erlc +export_all file.erl</pre>
from the shell.</p>
<p>Supported options: -I, -o, -D, -v, -W, -b.</p>
</item>
+ <tag>.S</tag>
+ <item>
+ <p>Erlang assembler source code. It generates a <c><![CDATA[.beam]]></c> file.</p>
+ <p>Supported options: same as for .erl.</p>
+ </item>
+ <tag>.core</tag>
+ <item>
+ <p>Erlang core source code. It generates a <c><![CDATA[.beam]]></c> file.</p>
+ <p>Supported options: same as for .erl.</p>
+ </item>
<tag>.yrl</tag>
<item>
<p>Yecc source code. It generates an <c><![CDATA[.erl]]></c> file.</p>
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index eee4badfb8..96547ba743 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -116,6 +116,7 @@ atom binary_longest_prefix_trap
atom binary_longest_suffix_trap
atom binary_match_trap
atom binary_matches_trap
+atom binary_to_term_trap
atom block
atom blocked
atom bm
diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab
index dd50df636c..3ec534f0bc 100644
--- a/erts/emulator/beam/bif.tab
+++ b/erts/emulator/beam/bif.tab
@@ -45,7 +45,6 @@ bif erlang:apply/3
bif erlang:atom_to_list/1
bif erlang:binary_to_list/1
bif erlang:binary_to_list/3
-bif erlang:binary_to_term/1
bif erlang:crc32/1
bif erlang:crc32/2
bif erlang:crc32_combine/3
@@ -152,6 +151,8 @@ bif erts_internal:port_command/3
bif erts_internal:port_control/3
bif erts_internal:port_close/1
bif erts_internal:port_connect/2
+bif erts_internal:binary_to_term/1
+bif erts_internal:binary_to_term/2
bif erts_internal:request_system_task/3
bif erts_internal:check_process_code/2
@@ -479,11 +480,6 @@ bif erlang:call_on_load_function/1
bif erlang:finish_after_on_load/2
#
-# New Bifs in R13B4
-#
-bif erlang:binary_to_term/2
-
-#
# The binary match bifs (New in R14A - EEP9)
#
diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c
index 1c3e955f47..1728b200f7 100644
--- a/erts/emulator/beam/erl_bif_ddll.c
+++ b/erts/emulator/beam/erl_bif_ddll.c
@@ -182,7 +182,7 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3)
Eterm name_term = BIF_ARG_2;
Eterm options = BIF_ARG_3;
char *path = NULL;
- ErlDrvSizeT path_len;
+ Sint path_len;
char *name = NULL;
DE_Handle *dh;
erts_driver_t *drv;
@@ -198,6 +198,7 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3)
int kill_ports = 0;
int do_build_load_error = 0;
int build_this_load_error = 0;
+ int encoding;
for(l = options; is_list(l); l = CDR(list_val(l))) {
Eterm opt = CAR(list_val(l));
@@ -257,18 +258,23 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3)
goto error;
}
- if (erts_iolist_size(path_term, &path_len)) {
- goto error;
+ encoding = erts_get_native_filename_encoding();
+ if (encoding == ERL_FILENAME_WIN_WCHAR) {
+ /* Do not convert the lib name to utf-16le yet, do that in win32 specific code */
+ /* since lib_name is used in error messages */
+ encoding = ERL_FILENAME_UTF8;
}
- path = erts_alloc(ERTS_ALC_T_DDLL_TMP_BUF, path_len + 1 /* might need path separator */ + sys_strlen(name) + 1);
- if (erts_iolist_to_buf(path_term, path, path_len) != 0) {
+ path = erts_convert_filename_to_encoding(path_term, NULL, 0,
+ ERTS_ALC_T_DDLL_TMP_BUF, 1, 0,
+ encoding, &path_len,
+ sys_strlen(name) + 2); /* might need path separator */
+ if (!path) {
goto error;
}
- while (path_len > 0 && (path[path_len-1] == '\\' || path[path_len-1] == '/')) {
- --path_len;
- }
+ ASSERT(path_len > 0 && path[path_len-1] == 0);
+ while (--path_len > 0 && (path[path_len-1] == '\\' || path[path_len-1] == '/'))
+ ;
path[path_len++] = '/';
- /*path[path_len] = '\0';*/
sys_strcpy(path+path_len,name);
#if DDLL_SMP
@@ -1524,7 +1530,7 @@ static int do_load_driver_entry(DE_Handle *dh, char *path, char *name)
assert_drv_list_rwlocked();
- if ((res = erts_sys_ddll_open(path, &(dh->handle))) != ERL_DE_NO_ERROR) {
+ if ((res = erts_sys_ddll_open(path, &(dh->handle), NULL)) != ERL_DE_NO_ERROR) {
return res;
}
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index 2b40f9272d..414ae2f046 100755
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -3613,8 +3613,9 @@ BIF_RETTYPE erts_debug_set_internal_state_2(BIF_ALIST_2)
default: BIF_ERROR(BIF_P, BADARG); break;
}
- res = erts_set_gc_state(BIF_P, enable);
- BIF_RET(res ? am_true : am_false);
+ res = (BIF_P->flags & F_DISABLE_GC) ? am_false : am_true;
+ erts_set_gc_state(BIF_P, enable);
+ BIF_RET(res);
}
else if (ERTS_IS_ATOM_STR("send_fake_exit_signal", BIF_ARG_1)) {
/* Used by signal_SUITE (emulator) */
diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c
index 864349491a..f298422267 100644
--- a/erts/emulator/beam/erl_bif_port.c
+++ b/erts/emulator/beam/erl_bif_port.c
@@ -808,7 +808,7 @@ open_port(Process* p, Eterm name, Eterm settings, int *err_typep, int *err_nump)
if (encoding == ERL_FILENAME_WIN_WCHAR) {
encoding = ERL_FILENAME_UTF8;
}
- if ((name_buf = erts_convert_filename_to_encoding(name, NULL, 0, ERTS_ALC_T_TMP,0,1, encoding, NULL))
+ if ((name_buf = erts_convert_filename_to_encoding(name, NULL, 0, ERTS_ALC_T_TMP,0,1, encoding, NULL, 0))
== NULL) {
goto badarg;
}
diff --git a/erts/emulator/beam/erl_bif_re.c b/erts/emulator/beam/erl_bif_re.c
index 99c31738a5..448c6f6f6d 100644
--- a/erts/emulator/beam/erl_bif_re.c
+++ b/erts/emulator/beam/erl_bif_re.c
@@ -1196,8 +1196,8 @@ re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3)
ovsize = 3*(unsigned_val(tp[2])+1);
code_size = binary_size(tp[5]);
- if ((code_tmp = (const pcre *)
- erts_get_aligned_binary_bytes(tp[5], &temp_alloc)) == NULL) {
+ code_tmp = (const pcre *) erts_get_aligned_binary_bytes(tp[5], &temp_alloc);
+ if (code_tmp == NULL || code_size < 4) {
erts_free_aligned_binary_bytes(temp_alloc);
BIF_ERROR(p, BADARG);
}
diff --git a/erts/emulator/beam/erl_debug.c b/erts/emulator/beam/erl_debug.c
index dc79d45be7..873a9860da 100644
--- a/erts/emulator/beam/erl_debug.c
+++ b/erts/emulator/beam/erl_debug.c
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 1998-2012. All Rights Reserved.
+ * Copyright Ericsson AB 1998-2013. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index 8c4fffa75b..1af80dd04b 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -553,8 +553,8 @@ void erts_usage(void)
erts_fprintf(stderr, " numbers is %d\n",
ERTS_MAX_NO_OF_SCHEDULERS);
erts_fprintf(stderr, "-SP p1:p2 specify schedulers (p1) and schedulers online (p2)\n");
- erts_fprintf(stderr, " as percentages of logical processors configured and logical\n");
- erts_fprintf(stderr, " processors available, respectively\n");
+ erts_fprintf(stderr, " as percentages of logical processors configured and logical\n");
+ erts_fprintf(stderr, " processors available, respectively\n");
erts_fprintf(stderr, "-t size set the maximum number of atoms the "
"emulator can handle\n");
erts_fprintf(stderr, " valid range is [%d-%d]\n",
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index e87959f0ab..dc285b3cf7 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -1407,7 +1407,7 @@ void* enif_dlopen(const char* lib,
ErtsSysDdllError errdesc = ERTS_SYS_DDLL_ERROR_INIT;
void* handle;
void* init_func;
- if (erts_sys_ddll_open2(lib, &handle, &errdesc) == ERL_DE_NO_ERROR) {
+ if (erts_sys_ddll_open(lib, &handle, &errdesc) == ERL_DE_NO_ERROR) {
if (erts_sys_ddll_load_nif_init(handle, &init_func, &errdesc) == ERL_DE_NO_ERROR) {
erts_sys_ddll_call_nif_init(init_func);
}
@@ -1587,7 +1587,8 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
encoding = ERL_FILENAME_UTF8;
}
lib_name = erts_convert_filename_to_encoding(BIF_ARG_1, NULL, 0,
- ERTS_ALC_T_TMP, 1, 0, encoding, NULL);
+ ERTS_ALC_T_TMP, 1, 0, encoding,
+ NULL, 0);
if (!lib_name) {
BIF_ERROR(BIF_P, BADARG);
}
@@ -1626,7 +1627,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
"module '%T' not allowed", mod_atom);
}
else if (init_func == NULL &&
- (err=erts_sys_ddll_open2(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) {
+ (err=erts_sys_ddll_open(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) {
const char slogan[] = "Failed to load NIF library";
if (strstr(errdesc.str, lib_name) != NULL) {
ret = load_nif_error(BIF_P, "load_failed", "%s: '%s'", slogan, errdesc.str);
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 0a41fb596d..21fd8dd50a 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -8271,25 +8271,22 @@ save_gc_task(Process *c_p, ErtsProcSysTask *st, int prio)
int
erts_set_gc_state(Process *c_p, int enable)
{
- int res;
ErtsProcSysTaskQs *dgc_tsk_qs;
ASSERT(c_p == erts_get_current_process());
ASSERT((ERTS_PSFLG_RUNNING|ERTS_PSFLG_RUNNING_SYS)
& erts_smp_atomic32_read_nob(&c_p->state));
ERTS_SMP_LC_ASSERT(ERTS_PROC_LOCK_MAIN == erts_proc_lc_my_proc_locks(c_p));
- res = !(c_p->flags & F_DISABLE_GC);
-
if (!enable) {
c_p->flags |= F_DISABLE_GC;
- return res;
+ return 0;
}
c_p->flags &= ~F_DISABLE_GC;
dgc_tsk_qs = ERTS_PROC_GET_DELAYED_GC_TASK_QS(c_p);
if (!dgc_tsk_qs)
- return res;
+ return 0;
/* Move delayed gc tasks into sys tasks queues. */
@@ -8387,7 +8384,7 @@ erts_set_gc_state(Process *c_p, int enable)
if (dgc_tsk_qs)
proc_sys_task_queues_free(dgc_tsk_qs);
- return res;
+ return 1;
}
void
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 7e3c6681d9..3a968594f3 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1990,12 +1990,14 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf, size_t statbuf_
{
int encoding = erts_get_native_filename_encoding();
return erts_convert_filename_to_encoding(name, statbuf, statbuf_size, alloc_type,
- allow_empty, allow_atom, encoding, used);
+ allow_empty, allow_atom, encoding,
+ used, 0);
}
char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbuf_size,
ErtsAlcType_t alloc_type, int allow_empty,
- int allow_atom, int encoding, Sint *used)
+ int allow_atom, int encoding, Sint *used,
+ Uint extra)
{
char* name_buf = NULL;
@@ -2008,13 +2010,14 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu
}
if (encoding == ERL_FILENAME_WIN_WCHAR) {
need += 2;
+ extra *= 2;
} else {
++need;
}
if (used)
*used = (Sint) need;
- if (need > statbuf_size) {
- name_buf = (char *) erts_alloc(alloc_type, need);
+ if (need+extra > statbuf_size) {
+ name_buf = (char *) erts_alloc(alloc_type, need+extra);
} else {
name_buf = statbuf;
}
@@ -2035,8 +2038,8 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu
/*Add 0 termination only*/
if (used)
*used = (Sint) size+1;
- if (size+1 > statbuf_size) {
- name_buf = (char *) erts_alloc(alloc_type, size+1);
+ if (size+1+extra > statbuf_size) {
+ name_buf = (char *) erts_alloc(alloc_type, size+1+extra);
} else {
name_buf = statbuf;
}
@@ -2045,7 +2048,7 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu
} else {
name_buf = erts_convert_filename_to_wchar(bytes, size,
statbuf, statbuf_size,
- alloc_type, used, 0);
+ alloc_type, used, extra);
}
erts_free_aligned_binary_bytes(temp_alloc);
} else {
diff --git a/erts/emulator/beam/erl_zlib.c b/erts/emulator/beam/erl_zlib.c
index 47fd92988e..8e33144f96 100644
--- a/erts/emulator/beam/erl_zlib.c
+++ b/erts/emulator/beam/erl_zlib.c
@@ -87,6 +87,46 @@ int ZEXPORT erl_zlib_deflate_finish(z_stream *streamp)
return deflateEnd(streamp);
}
+int ZEXPORT erl_zlib_inflate_start(z_stream *streamp, const Bytef* source,
+ uLong sourceLen)
+{
+ streamp->next_in = (Bytef*)source;
+ streamp->avail_in = (uInt)sourceLen;
+ streamp->total_out = streamp->avail_out = 0;
+ streamp->next_out = NULL;
+ erl_zlib_alloc_init(streamp);
+ return inflateInit(streamp);
+}
+/*
+ * Inflate a chunk; the destination length is the limit.
+ * Returns Z_OK if more to process, Z_STREAM_END if we are done.
+ */
+int ZEXPORT erl_zlib_inflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen)
+{
+ int err;
+ uLongf last_tot = streamp->total_out;
+
+ streamp->next_out = dest;
+ streamp->avail_out = (uInt)*destLen;
+
+ if ((uLong)streamp->avail_out != *destLen) return Z_BUF_ERROR;
+
+ err = inflate(streamp, Z_NO_FLUSH);
+ ASSERT(err != Z_STREAM_ERROR);
+ *destLen = streamp->total_out - last_tot;
+ return err;
+}
+
+/*
+ * When we are done, free up the inflate structure
+ * Returns Z_OK or an error.
+ */
+int ZEXPORT erl_zlib_inflate_finish(z_stream *streamp)
+{
+ return inflateEnd(streamp);
+}
+
+
int ZEXPORT erl_zlib_compress2 (Bytef* dest, uLongf* destLen,
const Bytef* source, uLong sourceLen,
int level)
diff --git a/erts/emulator/beam/erl_zlib.h b/erts/emulator/beam/erl_zlib.h
index 5ac849d21c..160166c66b 100644
--- a/erts/emulator/beam/erl_zlib.h
+++ b/erts/emulator/beam/erl_zlib.h
@@ -39,6 +39,12 @@ int ZEXPORT erl_zlib_deflate_start(z_stream *streamp, const Bytef* source,
int ZEXPORT erl_zlib_deflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen);
int ZEXPORT erl_zlib_deflate_finish(z_stream *streamp);
+int ZEXPORT erl_zlib_inflate_start(z_stream *streamp, const Bytef* source,
+ uLong sourceLen);
+int ZEXPORT erl_zlib_inflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen);
+int ZEXPORT erl_zlib_inflate_finish(z_stream *streamp);
+
+
/* Use instead of compress
*/
#define erl_zlib_compress(dest,destLen,source,sourceLen) \
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 22b0a02937..2cb44a5b64 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -61,6 +61,9 @@
*/
# define ERTS_DEBUG_USE_DIST_SEP
# endif
+# define IF_DEBUG(X) X
+#else
+# define IF_DEBUG(X)
#endif
/* Does Sint fit in Sint32?
@@ -89,10 +92,11 @@ static int enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep,
static Uint is_external_string(Eterm obj, int* p_is_string);
static byte* enc_atom(ErtsAtomCacheMap *, Eterm, byte*, Uint32);
static byte* enc_pid(ErtsAtomCacheMap *, Eterm, byte*, Uint32);
-static byte* dec_term(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*);
+struct B2TContext_t;
+static byte* dec_term(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*, struct B2TContext_t*);
static byte* dec_atom(ErtsDistExternal *, byte*, Eterm*);
static byte* dec_pid(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*);
-static Sint decoded_size(byte *ep, byte* endp, int internal_tags);
+static Sint decoded_size(byte *ep, byte* endp, int internal_tags, struct B2TContext_t*);
static BIF_RETTYPE term_to_binary_trap_1(BIF_ALIST_1);
static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint flags,
@@ -102,11 +106,19 @@ static Uint encode_size_struct2(ErtsAtomCacheMap *, Eterm, unsigned);
static int encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj,
unsigned dflags, Sint *reds, Uint *res);
+static Export binary_to_term_trap_export;
+static BIF_RETTYPE binary_to_term_trap_1(BIF_ALIST_1);
+static Eterm binary_to_term_int(Process* p, Uint32 flags, Eterm bin, Binary* context_b);
+
void erts_init_external(void) {
#if 1 /* In R16 */
erts_init_trap_export(&term_to_binary_trap_export,
am_erlang, am_term_to_binary_trap, 1,
&term_to_binary_trap_1);
+
+ erts_init_trap_export(&binary_to_term_trap_export,
+ am_erlang, am_binary_to_term_trap, 1,
+ &binary_to_term_trap_1);
#else
sys_memset((void *) &term_to_binary_trap_export, 0, sizeof(Export));
term_to_binary_trap_export.address = &term_to_binary_trap_export.code[3];
@@ -877,7 +889,7 @@ erts_decode_dist_ext_size(ErtsDistExternal *edep)
goto fail;
ep = edep->extp+1;
}
- res = decoded_size(ep, edep->ext_endp, 0);
+ res = decoded_size(ep, edep->ext_endp, 0, NULL);
if (res >= 0)
return res;
fail:
@@ -889,12 +901,12 @@ Sint erts_decode_ext_size(byte *ext, Uint size)
{
if (size == 0 || *ext != VERSION_MAGIC)
return -1;
- return decoded_size(ext+1, ext+size, 0);
+ return decoded_size(ext+1, ext+size, 0, NULL);
}
Sint erts_decode_ext_size_ets(byte *ext, Uint size)
{
- Sint sz = decoded_size(ext, ext+size, 1);
+ Sint sz = decoded_size(ext, ext+size, 1, NULL);
ASSERT(sz >= 0);
return sz;
}
@@ -927,7 +939,7 @@ erts_decode_dist_ext(Eterm** hpp,
goto error;
ep++;
}
- ep = dec_term(edep, hpp, ep, off_heap, &obj);
+ ep = dec_term(edep, hpp, ep, off_heap, &obj, NULL);
if (!ep)
goto error;
@@ -948,7 +960,7 @@ Eterm erts_decode_ext(Eterm **hpp, ErlOffHeap *off_heap, byte **ext)
byte *ep = *ext;
if (*ep++ != VERSION_MAGIC)
return THE_NON_VALUE;
- ep = dec_term(NULL, hpp, ep, off_heap, &obj);
+ ep = dec_term(NULL, hpp, ep, off_heap, &obj, NULL);
if (!ep) {
#ifdef DEBUG
bin_write(ERTS_PRINT_STDERR,NULL,*ext,500);
@@ -962,7 +974,7 @@ Eterm erts_decode_ext(Eterm **hpp, ErlOffHeap *off_heap, byte **ext)
Eterm erts_decode_ext_ets(Eterm **hpp, ErlOffHeap *off_heap, byte *ext)
{
Eterm obj;
- ext = dec_term(NULL, hpp, ext, off_heap, &obj);
+ ext = dec_term(NULL, hpp, ext, off_heap, &obj, NULL);
ASSERT(ext);
return obj;
}
@@ -1043,9 +1055,14 @@ static BIF_RETTYPE term_to_binary_trap_1(BIF_ALIST_1)
Binary *bin = ((ProcBin *) binary_val(bt))->val;
Eterm res = erts_term_to_binary_int(BIF_P, Term, 0, 0,bin);
if (is_tuple(res)) {
+ ASSERT(BIF_P->flags & F_DISABLE_GC);
BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res);
} else {
- BIF_RET(res);
+ if (erts_set_gc_state(BIF_P, 1)
+ || MSO(BIF_P).overhead > BIN_VHEAP_SZ(BIF_P))
+ ERTS_BIF_YIELD_RETURN(BIF_P, res);
+ else
+ BIF_RET(res);
}
}
@@ -1053,8 +1070,10 @@ BIF_RETTYPE term_to_binary_1(BIF_ALIST_1)
{
Eterm res = erts_term_to_binary_int(BIF_P, BIF_ARG_1, 0, TERM_TO_BINARY_DFLAGS, NULL);
if (is_tuple(res)) {
+ erts_set_gc_state(BIF_P, 0);
BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res);
} else {
+ ASSERT(!(BIF_P->flags & F_DISABLE_GC));
BIF_RET(res);
}
}
@@ -1106,12 +1125,72 @@ BIF_RETTYPE term_to_binary_2(BIF_ALIST_2)
res = erts_term_to_binary_int(p, Term, level, flags, bin);
if (is_tuple(res)) {
+ erts_set_gc_state(p, 0);
BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res);
} else {
+ ASSERT(!(BIF_P->flags & F_DISABLE_GC));
BIF_RET(res);
}
}
+
+enum B2TState { /* order is somewhat significant */
+ B2TPrepare,
+ B2TUncompressChunk,
+ B2TSizeInit,
+ B2TSize,
+ B2TDecodeInit,
+ B2TDecode,
+ B2TDecodeList,
+ B2TDecodeTuple,
+ B2TDecodeString,
+ B2TDecodeBinary,
+
+ B2TDone,
+ B2TDecodeFail,
+ B2TBadArg
+};
+
+typedef struct {
+ int heap_size;
+ int terms;
+ byte* ep;
+ int atom_extra_skip;
+} B2TSizeContext;
+
+typedef struct {
+ byte* ep;
+ Eterm res;
+ Eterm* next;
+ Eterm* hp_start;
+ Eterm* hp;
+ Eterm* hp_end;
+ int remaining_n;
+ char* remaining_bytes;
+} B2TDecodeContext;
+
+typedef struct {
+ z_stream stream;
+ byte* dbytes;
+ Uint dleft;
+} B2TUncompressContext;
+
+typedef struct B2TContext_t {
+ Sint heap_size;
+ byte* aligned_alloc;
+ ErtsBinary2TermState b2ts;
+ Uint32 flags;
+ SWord reds;
+ Eterm trap_bin;
+ enum B2TState state;
+ union {
+ B2TSizeContext sc;
+ B2TDecodeContext dc;
+ B2TUncompressContext uc;
+ } u;
+} B2TContext;
+
+
static uLongf binary2term_uncomp_size(byte* data, Sint size)
{
z_stream stream;
@@ -1141,48 +1220,62 @@ static uLongf binary2term_uncomp_size(byte* data, Sint size)
return err == Z_STREAM_END ? uncomp_size : 0;
}
-static ERTS_INLINE Sint
-binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size)
+static ERTS_INLINE int
+binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size,
+ B2TContext* ctx)
{
- Sint res;
byte *bytes = data;
Sint size = data_size;
state->exttmp = 0;
if (size < 1 || *bytes != VERSION_MAGIC) {
- error:
- if (state->exttmp)
- erts_free(ERTS_ALC_T_TMP, state->extp);
- state->extp = NULL;
- state->exttmp = 0;
return -1;
}
bytes++;
size--;
if (size < 5 || *bytes != COMPRESSED) {
state->extp = bytes;
+ if (ctx)
+ ctx->state = B2TSizeInit;
}
else {
uLongf dest_len = (Uint32) get_int32(bytes+1);
bytes += 5;
size -= 5;
if (dest_len > 32*1024*1024
- || (state->extp = erts_alloc_fnf(ERTS_ALC_T_TMP, dest_len)) == NULL) {
+ || (state->extp = erts_alloc_fnf(ERTS_ALC_T_EXT_TERM_DATA, dest_len)) == NULL) {
+ /*
+ * Try to avoid an out-of-memory crash due to a corrupted 'dest_len'
+ * by checking the actual length of the uncompressed data.
+ * The only way to do that is to uncompress it. Sad but true.
+ */
if (dest_len != binary2term_uncomp_size(bytes, size)) {
- goto error;
+ return -1;
}
- state->extp = erts_alloc(ERTS_ALC_T_TMP, dest_len);
+ state->extp = erts_alloc(ERTS_ALC_T_EXT_TERM_DATA, dest_len);
+ ctx->reds -= dest_len;
}
state->exttmp = 1;
- if (erl_zlib_uncompress(state->extp, &dest_len, bytes, size) != Z_OK)
- goto error;
+ if (ctx) {
+ if (erl_zlib_inflate_start(&ctx->u.uc.stream, bytes, size) != Z_OK)
+ return -1;
+
+ ctx->u.uc.dbytes = state->extp;
+ ctx->u.uc.dleft = dest_len;
+ ctx->state = B2TUncompressChunk;
+ }
+ else {
+ uLongf dlen = dest_len;
+ if (erl_zlib_uncompress(state->extp, &dlen, bytes, size) != Z_OK
+ || dlen != dest_len) {
+ return -1;
+ }
+ }
size = (Sint) dest_len;
}
- res = decoded_size(state->extp, state->extp + size, 0);
- if (res < 0)
- goto error;
- return res;
+ state->extsize = size;
+ return 0;
}
static ERTS_INLINE void
@@ -1190,7 +1283,7 @@ binary2term_abort(ErtsBinary2TermState *state)
{
if (state->exttmp) {
state->exttmp = 0;
- erts_free(ERTS_ALC_T_TMP, state->extp);
+ erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp);
}
}
@@ -1198,11 +1291,11 @@ static ERTS_INLINE Eterm
binary2term_create(ErtsDistExternal *edep, ErtsBinary2TermState *state, Eterm **hpp, ErlOffHeap *ohp)
{
Eterm res;
- if (!dec_term(edep, hpp, state->extp, ohp, &res))
+ if (!dec_term(edep, hpp, state->extp, ohp, &res, NULL))
res = THE_NON_VALUE;
if (state->exttmp) {
state->exttmp = 0;
- erts_free(ERTS_ALC_T_TMP, state->extp);
+ erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp);
}
return res;
}
@@ -1210,7 +1303,18 @@ binary2term_create(ErtsDistExternal *edep, ErtsBinary2TermState *state, Eterm **
Sint
erts_binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size)
{
- return binary2term_prepare(state, data, data_size);
+ Sint res;
+
+ if (binary2term_prepare(state, data, data_size, NULL) < 0 ||
+ (res=decoded_size(state->extp, state->extp + state->extsize, 0, NULL)) < 0) {
+
+ if (state->exttmp)
+ erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp);
+ state->extp = NULL;
+ state->exttmp = 0;
+ return -1;
+ }
+ return res;
}
void
@@ -1225,68 +1329,233 @@ erts_binary2term_create(ErtsBinary2TermState *state, Eterm **hpp, ErlOffHeap *oh
return binary2term_create(NULL,state, hpp, ohp);
}
-BIF_RETTYPE binary_to_term_1(BIF_ALIST_1)
+static void b2t_destroy_context(B2TContext* context)
{
- Sint heap_size;
- Eterm res;
+ erts_free_aligned_binary_bytes_extra(context->aligned_alloc,
+ ERTS_ALC_T_EXT_TERM_DATA);
+ context->aligned_alloc = NULL;
+ binary2term_abort(&context->b2ts);
+ if (context->state == B2TUncompressChunk) {
+ erl_zlib_inflate_finish(&context->u.uc.stream);
+ }
+}
+
+static void b2t_context_destructor(Binary *context_bin)
+{
+ B2TContext* ctx = (B2TContext*) ERTS_MAGIC_BIN_DATA(context_bin);
+ ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(context_bin) == b2t_context_destructor);
+
+ b2t_destroy_context(ctx);
+}
+
+static BIF_RETTYPE binary_to_term_trap_1(BIF_ALIST_1)
+{
+ Binary *context_bin = ((ProcBin *) binary_val(BIF_ARG_1))->val;
+ ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(context_bin) == b2t_context_destructor);
+
+ return binary_to_term_int(BIF_P, 0, THE_NON_VALUE, context_bin);
+}
+
+
+#define B2T_BYTES_PER_REDUCTION 128
+#define B2T_MEMCPY_FACTOR 8
+
+/* Define for testing */
+/*#define EXTREME_B2T_TRAPPING 1*/
+
+#ifdef EXTREME_B2T_TRAPPING
+static unsigned b2t_rand(void)
+{
+ static unsigned prev = 17;
+ prev = (prev * 214013 + 2531011);
+ return prev;
+}
+#endif
+
+
+static B2TContext* b2t_export_context(Process* p, B2TContext* src)
+{
+ Binary* context_b = erts_create_magic_binary(sizeof(B2TContext),
+ b2t_context_destructor);
+ B2TContext* ctx = ERTS_MAGIC_BIN_DATA(context_b);
Eterm* hp;
- Eterm* endp;
- Sint size;
- byte* bytes;
- byte* temp_alloc = NULL;
- ErtsBinary2TermState b2ts;
+ sys_memcpy(ctx, src, sizeof(B2TContext));
+ if (ctx->state >= B2TDecode && ctx->u.dc.next == &src->u.dc.res) {
+ ctx->u.dc.next = &ctx->u.dc.res;
+ }
+ hp = HAlloc(p, PROC_BIN_SIZE);
+ ctx->trap_bin = erts_mk_magic_binary_term(&hp, &MSO(p), context_b);
+ return ctx;
+}
- if ((bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc)) == NULL) {
- error:
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(BIF_P, BADARG);
+static Eterm binary_to_term_int(Process* p, Uint32 flags, Eterm bin, Binary* context_b)
+{
+#ifdef EXTREME_B2T_TRAPPING
+ SWord initial_reds = 1 + b2t_rand() % 4;
+#else
+ SWord initial_reds = (Uint)(ERTS_BIF_REDS_LEFT(p) * B2T_BYTES_PER_REDUCTION);
+#endif
+ B2TContext c_buff;
+ B2TContext *ctx;
+ int is_first_call;
+
+ if (context_b == NULL) {
+ /* Setup enough to get started */
+ is_first_call = 1;
+ ctx = &c_buff;
+ ctx->state = B2TPrepare;
+ ctx->aligned_alloc = NULL;
+ ctx->flags = flags;
+ IF_DEBUG(ctx->trap_bin = THE_NON_VALUE;)
+ } else {
+ is_first_call = 0;
+ ctx = ERTS_MAGIC_BIN_DATA(context_b);
+ ASSERT(ctx->state != B2TPrepare);
}
- size = binary_size(BIF_ARG_1);
+ ctx->reds = initial_reds;
+
+ do {
+ switch (ctx->state) {
+ case B2TPrepare: {
+ byte* bytes;
+ Uint bin_size;
+ bytes = erts_get_aligned_binary_bytes_extra(bin,
+ &ctx->aligned_alloc,
+ ERTS_ALC_T_EXT_TERM_DATA,
+ 0);
+ if (bytes == NULL) {
+ ctx->b2ts.exttmp = 0;
+ ctx->state = B2TBadArg;
+ break;
+ }
+ bin_size = binary_size(bin);
+ if (ctx->aligned_alloc) {
+ ctx->reds -= bin_size / 8;
+ }
+ if (binary2term_prepare(&ctx->b2ts, bytes, bin_size, ctx) < 0) {
+ ctx->state = B2TBadArg;
+ }
+ break;
+ }
+ case B2TUncompressChunk: {
+ uLongf chunk = ctx->reds;
+ int zret;
+
+ if (chunk > ctx->u.uc.dleft)
+ chunk = ctx->u.uc.dleft;
+ zret = erl_zlib_inflate_chunk(&ctx->u.uc.stream,
+ ctx->u.uc.dbytes, &chunk);
+ ctx->u.uc.dbytes += chunk;
+ ctx->u.uc.dleft -= chunk;
+ if (zret == Z_OK && ctx->u.uc.dleft > 0) {
+ ctx->reds = 0;
+ }
+ else if (erl_zlib_inflate_finish(&ctx->u.uc.stream) == Z_OK
+ && zret == Z_STREAM_END
+ && ctx->u.uc.dleft == 0) {
+ ctx->reds -= chunk;
+ ctx->state = B2TSizeInit;
+ }
+ else {
+ ctx->state = B2TBadArg;
+ }
+ break;
+ }
+ case B2TSizeInit:
+ ctx->u.sc.ep = NULL;
+ ctx->state = B2TSize;
+ /*fall through*/
+ case B2TSize:
+ ctx->heap_size = decoded_size(ctx->b2ts.extp,
+ ctx->b2ts.extp + ctx->b2ts.extsize,
+ 0, ctx);
+ break;
+
+ case B2TDecodeInit:
+ if (ctx == &c_buff && ctx->b2ts.extsize > ctx->reds) {
+ /* dec_term will maybe trap, allocate space for magic bin
+ before result term to make it easy to trim with HRelease.
+ */
+ ctx = b2t_export_context(p, &c_buff);
+ }
+ ctx->u.dc.ep = ctx->b2ts.extp;
+ ctx->u.dc.res = (Eterm) (UWord) NULL;
+ ctx->u.dc.next = &ctx->u.dc.res;
+ ctx->u.dc.hp_start = HAlloc(p, ctx->heap_size);
+ ctx->u.dc.hp = ctx->u.dc.hp_start;
+ ctx->u.dc.hp_end = ctx->u.dc.hp_start + ctx->heap_size;
+ ctx->state = B2TDecode;
+ /*fall through*/
+ case B2TDecode:
+ case B2TDecodeList:
+ case B2TDecodeTuple:
+ case B2TDecodeString:
+ case B2TDecodeBinary: {
+ ErtsDistExternal fakedep;
+ fakedep.flags = ctx->flags;
+ dec_term(&fakedep, NULL, NULL, &MSO(p), NULL, ctx);
+ break;
+ }
+ case B2TDecodeFail:
+ HRelease(p, ctx->u.dc.hp_end, ctx->u.dc.hp_start);
+ /*fall through*/
+ case B2TBadArg:
+ b2t_destroy_context(ctx);
+ if (!is_first_call) {
+ erts_set_gc_state(p, 1);
+ }
+ BUMP_REDS(p, (initial_reds - ctx->reds) / B2T_BYTES_PER_REDUCTION);
+ BIF_ERROR(p, BADARG & ~EXF_SAVETRACE);
- heap_size = binary2term_prepare(&b2ts, bytes, size);
- if (heap_size < 0)
- goto error;
+ case B2TDone:
+ b2t_destroy_context(ctx);
- hp = HAlloc(BIF_P, heap_size);
- endp = hp + heap_size;
+ if (ctx->u.dc.hp > ctx->u.dc.hp_end) {
+ erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n",
+ __FILE__, __LINE__, ctx->u.dc.hp - ctx->u.dc.hp_end);
+ }
+ HRelease(p, ctx->u.dc.hp_end, ctx->u.dc.hp);
- res = binary2term_create(NULL, &b2ts, &hp, &MSO(BIF_P));
+ if (!is_first_call) {
+ erts_set_gc_state(p, 1);
+ }
+ BUMP_REDS(p, (initial_reds - ctx->reds) / B2T_BYTES_PER_REDUCTION);
+ return ctx->u.dc.res;
- erts_free_aligned_binary_bytes(temp_alloc);
+ default:
+ ASSERT(!"Unknown state in binary_to_term");
+ }
+ }while (ctx->reds > 0 || ctx->state >= B2TDone);
- if (hp > endp) {
- erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n",
- __FILE__, __LINE__, hp-endp);
+ if (ctx == &c_buff) {
+ ASSERT(ctx->trap_bin == THE_NON_VALUE);
+ ctx = b2t_export_context(p, &c_buff);
}
+ ASSERT(ctx->trap_bin != THE_NON_VALUE);
- HRelease(BIF_P, endp, hp);
-
- if (res == THE_NON_VALUE)
- goto error;
+ if (is_first_call) {
+ erts_set_gc_state(p, 0);
+ }
+ BUMP_ALL_REDS(p);
+ BIF_TRAP1(&binary_to_term_trap_export, p, ctx->trap_bin);
+}
- return res;
+BIF_RETTYPE erts_internal_binary_to_term_1(BIF_ALIST_1)
+{
+ return binary_to_term_int(BIF_P, 0, BIF_ARG_1, NULL);
}
-BIF_RETTYPE binary_to_term_2(BIF_ALIST_2)
+BIF_RETTYPE erts_internal_binary_to_term_2(BIF_ALIST_2)
{
- Sint heap_size;
- Eterm res;
Eterm opts;
Eterm opt;
- Eterm* hp;
- Eterm* endp;
- Sint size;
- byte* bytes;
- byte* temp_alloc = NULL;
- ErtsBinary2TermState b2ts;
- ErtsDistExternal fakedep;
+ Uint32 flags = 0;
- fakedep.flags = 0;
opts = BIF_ARG_2;
while (is_list(opts)) {
opt = CAR(list_val(opts));
if (opt == am_safe) {
- fakedep.flags |= ERTS_DIST_EXT_BTT_SAFE;
+ flags |= ERTS_DIST_EXT_BTT_SAFE;
}
else {
goto error;
@@ -1297,35 +1566,10 @@ BIF_RETTYPE binary_to_term_2(BIF_ALIST_2)
if (is_not_nil(opts))
goto error;
- if ((bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc)) == NULL) {
- error:
- erts_free_aligned_binary_bytes(temp_alloc);
- BIF_ERROR(BIF_P, BADARG);
- }
- size = binary_size(BIF_ARG_1);
-
- heap_size = binary2term_prepare(&b2ts, bytes, size);
- if (heap_size < 0)
- goto error;
-
- hp = HAlloc(BIF_P, heap_size);
- endp = hp + heap_size;
-
- res = binary2term_create(&fakedep, &b2ts, &hp, &MSO(BIF_P));
-
- erts_free_aligned_binary_bytes(temp_alloc);
-
- if (hp > endp) {
- erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n",
- __FILE__, __LINE__, hp-endp);
- }
-
- HRelease(BIF_P, endp, hp);
-
- if (res == THE_NON_VALUE)
- goto error;
+ return binary_to_term_int(BIF_P, flags, BIF_ARG_1, NULL);
- return res;
+error:
+ BIF_ERROR(BIF_P, BADARG);
}
Eterm
@@ -1473,12 +1717,10 @@ erts_term_to_binary(Process* p, Eterm Term, int level, Uint flags) {
/* #define EXTREME_TTB_TRAPPING 1 */
#ifndef EXTREME_TTB_TRAPPING
-#define TERM_TO_BINARY_LOOP_FACTOR 500
-#define TERM_TO_BINARY_SIZE_FACTOR 500000
-#define TERM_TO_BINARY_COMPRESS_CHUNK 500000
+#define TERM_TO_BINARY_LOOP_FACTOR 32
+#define TERM_TO_BINARY_COMPRESS_CHUNK (1 << 18)
#else
#define TERM_TO_BINARY_LOOP_FACTOR 1
-#define TERM_TO_BINARY_SIZE_FACTOR 10
#define TERM_TO_BINARY_COMPRESS_CHUNK 10
#endif
@@ -1514,7 +1756,7 @@ typedef struct {
} s;
} TTBContext;
-static void context_destructor(Binary *context_bin)
+static void ttb_context_destructor(Binary *context_bin)
{
TTBContext *context = ERTS_MAGIC_BIN_DATA(context_bin);
if (context->alive) {
@@ -1567,7 +1809,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
do { \
if (context_b == NULL) { \
context_b = erts_create_magic_binary(sizeof(TTBContext), \
- context_destructor); \
+ ttb_context_destructor); \
context = ERTS_MAGIC_BIN_DATA(context_b); \
memcpy(context,&c_buff,sizeof(TTBContext)); \
} \
@@ -1615,7 +1857,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
/* Finish in one go */
res = erts_term_to_binary_simple(p, Term, size,
level, flags);
- BUMP_REDS(p, size / TERM_TO_BINARY_SIZE_FACTOR);
+ BUMP_REDS(p, 1);
return res;
}
@@ -2604,21 +2846,112 @@ undo_offheap_in_area(ErlOffHeap* off_heap, Eterm* start, Eterm* end)
#endif /* DEBUG */
}
+
/* Decode term from external format into *objp.
** On failure return NULL and (R13B04) *hpp will be unchanged.
*/
static byte*
-dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp)
+dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap,
+ Eterm* objp, B2TContext* ctx)
{
- Eterm* hp_saved = *hpp;
+ Eterm* hp_saved;
int n;
ErtsAtomEncoding char_enc;
- register Eterm* hp = *hpp; /* Please don't take the address of hp */
- Eterm* next = objp;
+ register Eterm* hp; /* Please don't take the address of hp */
+ Eterm* next;
+ SWord reds;
+
+ if (ctx) {
+ hp_saved = ctx->u.dc.hp_start;
+ reds = ctx->reds;
+ next = ctx->u.dc.next;
+ ep = ctx->u.dc.ep;
+ hpp = &ctx->u.dc.hp;
+
+ if (ctx->state != B2TDecode) {
+ int n_limit = reds;
+
+ n = ctx->u.dc.remaining_n;
+ if (ctx->state == B2TDecodeBinary) {
+ n_limit *= B2T_MEMCPY_FACTOR;
+ ASSERT(n_limit >= reds);
+ reds -= n / B2T_MEMCPY_FACTOR;
+ }
+ else
+ reds -= n;
- *next = (Eterm) (UWord) NULL;
+ if (n > n_limit) {
+ ctx->u.dc.remaining_n -= n_limit;
+ n = n_limit;
+ reds = 0;
+ }
+ else {
+ ctx->u.dc.remaining_n = 0;
+ }
+
+ switch (ctx->state) {
+ case B2TDecodeList:
+ objp = next - 2;
+ while (n > 0) {
+ objp[0] = (Eterm) COMPRESS_POINTER(next);
+ objp[1] = make_list(next);
+ next = objp;
+ objp -= 2;
+ n--;
+ }
+ break;
+
+ case B2TDecodeTuple:
+ objp = next - 1;
+ while (n-- > 0) {
+ objp[0] = (Eterm) COMPRESS_POINTER(next);
+ next = objp;
+ objp--;
+ }
+ break;
+
+ case B2TDecodeString:
+ hp = *hpp;
+ hp[-1] = make_list(hp); /* overwrite the premature NIL */
+ while (n-- > 0) {
+ hp[0] = make_small(*ep++);
+ hp[1] = make_list(hp+2);
+ hp += 2;
+ }
+ hp[-1] = NIL;
+ *hpp = hp;
+ break;
+
+ case B2TDecodeBinary:
+ sys_memcpy(ctx->u.dc.remaining_bytes, ep, n);
+ ctx->u.dc.remaining_bytes += n;
+ ep += n;
+ break;
+
+ default:
+ ASSERT(!"Unknown state");
+ }
+ if (!ctx->u.dc.remaining_n) {
+ ctx->state = B2TDecode;
+ }
+ if (reds <= 0) {
+ ctx->u.dc.next = next;
+ ctx->u.dc.ep = ep;
+ ctx->reds = 0;
+ return NULL;
+ }
+ }
+ }
+ else {
+ hp_saved = *hpp;
+ reds = ERTS_SWORD_MAX;
+ next = objp;
+ *next = (Eterm) (UWord) NULL;
+ }
+ hp = *hpp;
while (next != NULL) {
+
objp = next;
next = (Eterm *) EXPAND_POINTER(*objp);
@@ -2738,7 +3071,16 @@ dec_term_atom_common:
*objp = make_tuple(hp);
*hp++ = make_arityval(n);
hp += n;
- objp = hp - 1;
+ objp = hp - 1;
+ if (ctx) {
+ if (reds < n) {
+ ASSERT(reds > 0);
+ ctx->state = B2TDecodeTuple;
+ ctx->u.dc.remaining_n = n - reds;
+ n = reds;
+ }
+ reds -= n;
+ }
while (n-- > 0) {
objp[0] = (Eterm) COMPRESS_POINTER(next);
next = objp;
@@ -2756,17 +3098,27 @@ dec_term_atom_common:
break;
}
*objp = make_list(hp);
- hp += 2*n;
+ hp += 2 * n;
objp = hp - 2;
objp[0] = (Eterm) COMPRESS_POINTER((objp+1));
objp[1] = (Eterm) COMPRESS_POINTER(next);
next = objp;
objp -= 2;
- while (--n > 0) {
+ n--;
+ if (ctx) {
+ if (reds < n) {
+ ctx->state = B2TDecodeList;
+ ctx->u.dc.remaining_n = n - reds;
+ n = reds;
+ }
+ reds -= n;
+ }
+ while (n > 0) {
objp[0] = (Eterm) COMPRESS_POINTER(next);
- objp[1] = make_list(objp + 2);
+ objp[1] = make_list(next);
next = objp;
objp -= 2;
+ n--;
}
break;
case STRING_EXT:
@@ -2777,6 +3129,14 @@ dec_term_atom_common:
break;
}
*objp = make_list(hp);
+ if (ctx) {
+ if (reds < n) {
+ ctx->state = B2TDecodeString;
+ ctx->u.dc.remaining_n = n - reds;
+ n = reds;
+ }
+ reds -= n;
+ }
while (n-- > 0) {
hp[0] = make_small(*ep++);
hp[1] = make_list(hp+2);
@@ -2984,7 +3344,6 @@ dec_term_atom_common:
dbin->flags = 0;
dbin->orig_size = n;
erts_refc_init(&dbin->refc, 1);
- sys_memcpy(dbin->orig_bytes, ep, n);
pb = (ProcBin *) hp;
hp += PROC_BIN_SIZE;
pb->thing_word = HEADER_PROC_BIN;
@@ -2995,7 +3354,20 @@ dec_term_atom_common:
pb->bytes = (byte*) dbin->orig_bytes;
pb->flags = 0;
*objp = make_binary(pb);
- }
+ if (ctx) {
+ int n_limit = reds * B2T_MEMCPY_FACTOR;
+ if (n > n_limit) {
+ ctx->state = B2TDecodeBinary;
+ ctx->u.dc.remaining_n = n - n_limit;
+ ctx->u.dc.remaining_bytes = dbin->orig_bytes + n_limit;
+ n = n_limit;
+ reds = 0;
+ }
+ else
+ reds -= n / B2T_MEMCPY_FACTOR;
+ }
+ sys_memcpy(dbin->orig_bytes, ep, n);
+ }
ep += n;
break;
}
@@ -3018,13 +3390,14 @@ dec_term_atom_common:
sys_memcpy(hb->data, ep, n);
bin = make_binary(hb);
hp += heap_bin_size(n);
+ ep += n;
} else {
Binary* dbin = erts_bin_nrml_alloc(n);
ProcBin* pb;
+
dbin->flags = 0;
dbin->orig_size = n;
erts_refc_init(&dbin->refc, 1);
- sys_memcpy(dbin->orig_bytes, ep, n);
pb = (ProcBin *) hp;
pb->thing_word = HEADER_PROC_BIN;
pb->size = n;
@@ -3035,8 +3408,23 @@ dec_term_atom_common:
pb->flags = 0;
bin = make_binary(pb);
hp += PROC_BIN_SIZE;
- }
- ep += n;
+ if (ctx) {
+ int n_limit = reds * B2T_MEMCPY_FACTOR;
+ if (n > n_limit) {
+ ctx->state = B2TDecodeBinary;
+ ctx->u.dc.remaining_n = n - n_limit;
+ ctx->u.dc.remaining_bytes = dbin->orig_bytes + n_limit;
+ n = n_limit;
+ reds = 0;
+ }
+ else
+ reds -= n / B2T_MEMCPY_FACTOR;
+ }
+ sys_memcpy(dbin->orig_bytes, ep, n);
+ ep += n;
+ n = pb->size;
+ }
+
if (bitsize == 8 || n == 0) {
*objp = bin;
} else {
@@ -3067,7 +3455,7 @@ dec_term_atom_common:
goto error;
}
*hpp = hp;
- ep = dec_term(edep, hpp, ep, off_heap, &temp);
+ ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL);
hp = *hpp;
if (ep == NULL) {
goto error;
@@ -3127,7 +3515,7 @@ dec_term_atom_common:
}
*hpp = hp;
/* Index */
- if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) {
+ if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) {
goto error;
}
if (!is_small(temp)) {
@@ -3136,7 +3524,7 @@ dec_term_atom_common:
old_index = unsigned_val(temp);
/* Uniq */
- if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) {
+ if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) {
goto error;
}
if (!is_small(temp)) {
@@ -3204,7 +3592,7 @@ dec_term_atom_common:
}
/* Index */
- if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) {
+ if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) {
goto error;
}
if (!is_small(temp)) {
@@ -3213,7 +3601,7 @@ dec_term_atom_common:
old_index = unsigned_val(temp);
/* Uniq */
- if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) {
+ if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) {
goto error;
}
if (!is_small(temp)) {
@@ -3313,8 +3701,31 @@ dec_term_atom_common:
}
undo_offheap_in_area(off_heap, hp_saved, hp);
*hpp = hp_saved;
- return NULL;
+ if (ctx) {
+ ctx->state = B2TDecodeFail;
+ ctx->reds = reds;
+ }
+ return NULL;
}
+
+ if (--reds <= 0) {
+ if (ctx) {
+ if (next || ctx->state != B2TDecode) {
+ ctx->u.dc.ep = ep;
+ ctx->u.dc.next = next;
+ ctx->u.dc.hp = hp;
+ ctx->reds = 0;
+ return NULL;
+ }
+ }
+ else {
+ reds = ERTS_SWORD_MAX;
+ }
+ }
+ }
+ if (ctx) {
+ ctx->state = B2TDone;
+ ctx->reds = reds;
}
*hpp = hp;
return ep;
@@ -3602,18 +4013,37 @@ encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj,
}
static Sint
-decoded_size(byte *ep, byte* endp, int internal_tags)
+decoded_size(byte *ep, byte* endp, int internal_tags, B2TContext* ctx)
{
- int heap_size = 0;
+ int heap_size;
int terms;
- int atom_extra_skip = 0;
+ int atom_extra_skip;
Uint n;
+ SWord reds;
+
+ if (ctx) {
+ reds = ctx->reds;
+ if (ctx->u.sc.ep) {
+ heap_size = ctx->u.sc.heap_size;
+ terms = ctx->u.sc.terms;
+ ep = ctx->u.sc.ep;
+ atom_extra_skip = ctx->u.sc.atom_extra_skip;
+ goto init_done;
+ }
+ }
+ else
+ reds = 0; /* not used but compiler warns anyway */
+
+ heap_size = 0;
+ terms = 1;
+ atom_extra_skip = 0;
+init_done:
#define SKIP(sz) \
do { \
if ((sz) <= endp-ep) { \
ep += (sz); \
- } else { return -1; }; \
+ } else { goto error; }; \
} while (0)
#define SKIP2(sz1, sz2) \
@@ -3621,25 +4051,24 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
Uint sz = (sz1) + (sz2); \
if (sz1 < sz && (sz) <= endp-ep) { \
ep += (sz); \
- } else { return -1; } \
+ } else { goto error; } \
} while (0)
#define CHKSIZE(sz) \
do { \
- if ((sz) > endp-ep) { return -1; } \
+ if ((sz) > endp-ep) { goto error; } \
} while (0)
#define ADDTERMS(n) \
do { \
int before = terms; \
terms += (n); \
- if (terms < before) return -1; \
+ if (terms < before) goto error; \
} while (0)
-
- for (terms=1; terms > 0; terms--) {
- int tag;
-
+ ASSERT(terms > 0);
+ do {
+ int tag;
CHKSIZE(1);
tag = ep++[0];
switch (tag) {
@@ -3660,7 +4089,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
CHKSIZE(4);
n = get_int32(ep);
if (n > BIG_ARITY_MAX*sizeof(ErtsDigit)) {
- return -1;
+ goto error;
}
SKIP2(n,4+1); /* skip, size,sign,digits */
heap_size += 1+1+(n+sizeof(Eterm)-1)/sizeof(Eterm); /* XXX: 1 too much? */
@@ -3669,7 +4098,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
CHKSIZE(2);
n = get_int16(ep);
if (n > MAX_ATOM_CHARACTERS) {
- return -1;
+ goto error;
}
SKIP(n+2+atom_extra_skip);
atom_extra_skip = 0;
@@ -3679,7 +4108,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
n = get_int16(ep);
ep += 2;
if (n > MAX_ATOM_SZ_LIMIT) {
- return -1;
+ goto error;
}
SKIP(n+atom_extra_skip);
atom_extra_skip = 0;
@@ -3688,7 +4117,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
CHKSIZE(1);
n = get_int8(ep);
if (n > MAX_ATOM_CHARACTERS) {
- return -1;
+ goto error;
}
SKIP(n+1+atom_extra_skip);
atom_extra_skip = 0;
@@ -3698,7 +4127,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
n = get_int8(ep);
ep++;
if (n > MAX_ATOM_SZ_LIMIT) {
- return -1;
+ goto error;
}
SKIP(n+atom_extra_skip);
atom_extra_skip = 0;
@@ -3727,7 +4156,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
id_words = get_int16(ep);
if (id_words > ERTS_MAX_REF_NUMBERS)
- return -1;
+ goto error;
ep += 2;
atom_extra_skip = 1 + 4*id_words;
@@ -3829,7 +4258,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
num_free = get_int32(ep);
ep += 4;
if (num_free > MAX_ARG) {
- return -1;
+ goto error;
}
terms += 4 + num_free;
heap_size += ERL_FUN_SIZE + num_free;
@@ -3846,24 +4275,47 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
case BINARY_INTERNAL_REF:
if (!internal_tags) {
- return -1;
+ goto error;
}
SKIP(sizeof(ProcBin));
heap_size += PROC_BIN_SIZE;
break;
case BIT_BINARY_INTERNAL_REF:
if (!internal_tags) {
- return -1;
+ goto error;
}
SKIP(2+sizeof(ProcBin));
heap_size += PROC_BIN_SIZE + ERL_SUB_BIN_SIZE;
break;
default:
- return -1;
+ goto error;
}
- }
+ terms--;
+
+ if (ctx && --reds <= 0 && terms > 0) {
+ ctx->u.sc.heap_size = heap_size;
+ ctx->u.sc.terms = terms;
+ ctx->u.sc.ep = ep;
+ ctx->u.sc.atom_extra_skip = atom_extra_skip;
+ ctx->reds = 0;
+ return 0;
+ }
+ }while (terms > 0);
+
/* 'terms' may be non-zero if it has wrapped around */
- return terms==0 ? heap_size : -1;
+ if (terms == 0) {
+ if (ctx) {
+ ctx->state = B2TDecodeInit;
+ ctx->reds = reds;
+ }
+ return heap_size;
+ }
+
+error:
+ if (ctx) {
+ ctx->state = B2TBadArg;
+ }
+ return -1;
#undef SKIP
#undef SKIP2
#undef CHKSIZE
diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h
index ff29e84972..83001b2c7e 100644
--- a/erts/emulator/beam/external.h
+++ b/erts/emulator/beam/external.h
@@ -146,6 +146,7 @@ typedef struct {
typedef struct {
byte *extp;
int exttmp;
+ Uint extsize;
} ErtsBinary2TermState;
/* -------------------------------------------------------------------------- */
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 94bc1b172a..6e5d352e5b 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -921,7 +921,8 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf,
ErtsAlcType_t alloc_type,
int allow_empty, int allow_atom,
int encoding,
- Sint *used /* out */);
+ Sint *used /* out */,
+ Uint extra);
char* erts_convert_filename_to_wchar(byte* bytes, Uint size,
char *statbuf, size_t statbuf_size,
ErtsAlcType_t alloc_type, Sint* used,
diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c
index d4623c0450..49af86b36a 100644
--- a/erts/emulator/beam/io.c
+++ b/erts/emulator/beam/io.c
@@ -7061,7 +7061,7 @@ void *driver_dl_open(char * path)
int res;
int *last_error_p = erts_smp_tsd_get(driver_list_last_error_key);
int locked = maybe_lock_driver_list();
- if ((res = erts_sys_ddll_open(path, &ptr)) == 0) {
+ if ((res = erts_sys_ddll_open(path, &ptr, NULL)) == 0) {
maybe_unlock_driver_list(locked);
return ptr;
} else {
diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h
index 31252ed78f..189d9ebac8 100644
--- a/erts/emulator/beam/sys.h
+++ b/erts/emulator/beam/sys.h
@@ -279,18 +279,21 @@ typedef unsigned long UWord;
typedef long SWord;
#define SWORD_CONSTANT(Const) Const##L
#define UWORD_CONSTANT(Const) Const##UL
+#define ERTS_UWORD_MAX ULONG_MAX
#define ERTS_SWORD_MAX LONG_MAX
#elif SIZEOF_VOID_P == SIZEOF_INT
typedef unsigned int UWord;
typedef int SWord;
#define SWORD_CONSTANT(Const) Const
#define UWORD_CONSTANT(Const) Const##U
+#define ERTS_UWORD_MAX UINT_MAX
#define ERTS_SWORD_MAX INT_MAX
#elif SIZEOF_VOID_P == SIZEOF_LONG_LONG
typedef unsigned long long UWord;
typedef long long SWord;
#define SWORD_CONSTANT(Const) Const##LL
#define UWORD_CONSTANT(Const) Const##ULL
+#define ERTS_UWORD_MAX ULLONG_MAX
#define ERTS_SWORD_MAX LLONG_MAX
#else
#error Found no appropriate type to use for 'Eterm', 'Uint' and 'Sint'
@@ -304,6 +307,7 @@ typedef unsigned long Uint;
typedef long Sint;
#define SWORD_CONSTANT(Const) Const##L
#define UWORD_CONSTANT(Const) Const##UL
+#define ERTS_UWORD_MAX ULONG_MAX
#define ERTS_SWORD_MAX LONG_MAX
#define ERTS_SIZEOF_ETERM SIZEOF_LONG
#define ErtsStrToSint strtol
@@ -313,6 +317,7 @@ typedef unsigned int Uint;
typedef int Sint;
#define SWORD_CONSTANT(Const) Const
#define UWORD_CONSTANT(Const) Const##U
+#define ERTS_UWORD_MAX UINT_MAX
#define ERTS_SWORD_MAX INT_MAX
#define ERTS_SIZEOF_ETERM SIZEOF_INT
#define ErtsStrToSint strtol
@@ -322,6 +327,7 @@ typedef unsigned long long Uint;
typedef long long Sint;
#define SWORD_CONSTANT(Const) Const##LL
#define UWORD_CONSTANT(Const) Const##ULL
+#define ERTS_UWORD_MAX ULLONG_MAX
#define ERTS_SWORD_MAX LLONG_MAX
#define ERTS_SIZEOF_ETERM SIZEOF_LONG_LONG
#if defined(__WIN32__)
@@ -661,8 +667,7 @@ typedef struct {
#define ERTS_SYS_DDLL_ERROR_INIT {NULL}
extern void erts_sys_ddll_free_error(ErtsSysDdllError*);
extern void erl_sys_ddll_init(void); /* to initialize mutexes etc */
-extern int erts_sys_ddll_open2(const char *path, void **handle, ErtsSysDdllError*);
-#define erts_sys_ddll_open(P,H) erts_sys_ddll_open2(P,H,NULL)
+extern int erts_sys_ddll_open(const char *path, void **handle, ErtsSysDdllError*);
extern int erts_sys_ddll_open_noext(char *path, void **handle, ErtsSysDdllError*);
extern int erts_sys_ddll_load_driver_init(void *handle, void **function);
extern int erts_sys_ddll_load_nif_init(void *handle, void **function,ErtsSysDdllError*);
diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c
index 978a766de9..80937dfcc8 100644
--- a/erts/emulator/drivers/common/inet_drv.c
+++ b/erts/emulator/drivers/common/inet_drv.c
@@ -1490,8 +1490,8 @@ static int load_ip_and_port
unsigned int alen = len;
char abuf [len];
int res = inet_get_address(abuf, (inet_address*) addr, &alen);
- ASSERT(res==0);
- res = 0;
+ ASSERT(res==0); (void)res;
+
/* Now "abuf" contains: Family(1b), Port(2b), IP(4|16b) */
/* NB: the following functions are safe to use, as they create tuples
diff --git a/erts/emulator/internal_doc/CarrierMigration.md b/erts/emulator/internal_doc/CarrierMigration.md
new file mode 100644
index 0000000000..b93c11c6ec
--- /dev/null
+++ b/erts/emulator/internal_doc/CarrierMigration.md
@@ -0,0 +1,201 @@
+Carrier Migration
+=================
+
+The ERTS memory allocators manage memory blocks in two types of raw
+memory chunks. We call these chunks of raw memory *carriers*:
+singleblock carriers, which contain only one large block, and
+multiblock carriers, which contain multiple blocks. A carrier is
+typically created using `mmap()` on Unix systems. However, how a
+carrier is created is of minor importance. An allocator instance
+typically manages a mixture of single- and multiblock carriers.
+
+Problem
+-------
+
+When a carrier is empty, i.e. contains only one large free block, it
+is deallocated. Since multiblock carriers can contain both allocated
+blocks and free blocks at the same time, an allocator instance might
+be stuck with a large number of poorly utilized carriers if the
+memory load decreases. After a peak in memory usage it is expected
+that not all memory can be returned, since the blocks still allocated
+are likely to be dispersed over multiple carriers. Such poorly
+utilized carriers can usually be reused if the memory load increases
+again. However, since each scheduler thread manages its own set of
+allocator instances, and memory load is not necessarily connected to
+CPU load, we might get into a situation where there are lots of
+poorly utilized multiblock carriers on some allocator instances while
+we need to allocate new multiblock carriers on other allocator
+instances. In scenarios like this, the demand for multiblock carriers
+in the system might increase at the same time as the actual memory
+demand in the system has decreased, which is both unwanted and quite
+unexpected for the end user.
+
+Solution
+--------
+
+In order to prevent scenarios like this we've implemented support for
+migration of multiblock carriers between allocator instances of the
+same type.
+
+### Management of Free Blocks ###
+
+In order to be able to remove a carrier from one allocator instance
+and add it to another, we need to be able to move references to the
+free blocks of the carrier between the allocator instances. The
+allocator-instance-specific data structure referring to the free
+blocks it manages often refers to the same carrier from multiple
+places. For example, when the address order best-fit strategy is
+used, this data structure is a binary search tree spanning all
+carriers that the allocator instance manages. Free blocks in one
+specific carrier can be referred to from potentially every other
+carrier that is managed, and the number of such references can be
+huge. That is, the work of removing the free blocks of such a carrier
+from the search tree will be huge. One way of solving this could be
+to not migrate carriers that contain lots of free blocks, but this
+would prevent us from migrating carriers that need to be migrated in
+order to solve the problem we set out to solve.
+
+By using one data structure of free blocks in each carrier and an
+allocator-instance-wide data structure of carriers managed by the
+allocator instance, the work needed in order to remove and add
+carriers can be kept to a minimum. When migration of carriers is
+enabled on a specific allocator type, we require that an allocation
+strategy with such an implementation is used. Currently, we have
+implemented this for three different allocation strategies. All of
+these strategies use a search tree of carriers sorted so that we can
+find the carrier with the lowest address that can satisfy the
+request. Internally, each carrier uses yet another search tree that
+implements address order first fit, address order best fit, or best
+fit. The abbreviations used for these different allocation strategies
+are `aoff`, `aoffcaobf`, and `aoffcbf`.
+
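
As a rough illustration only (not part of the patch; all type and field
names below are hypothetical simplifications of the real allocator
structures), the two-level arrangement described above can be sketched
in C as follows:

```c
/* Hypothetical sketch of the two-level free-block management. */
typedef struct tree_node {
    struct tree_node *parent, *left, *right;
} tree_node_t;

typedef struct {
    tree_node_t *root;
} tree_t;

typedef struct carrier {
    tree_node_t cnode;       /* this carrier's node in the allocator-wide
                              * carrier tree, sorted by address            */
    tree_t      free_blocks; /* search tree over the free blocks that live
                              * inside this carrier only                   */
} carrier_t;

typedef struct {
    tree_t carriers;         /* all carriers managed by this instance;
                              * removing or adding a carrier only touches
                              * this tree, never other carriers' trees     */
} allocator_instance_t;
```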
+### Carrier Pool ###
+
+In order to migrate carriers between allocator instances we move them
+through a pool of carriers. In order for a carrier migration to
+complete, one scheduler needs to move the carrier into the pool, and
+another scheduler needs to take the carrier out of the pool.
+
+The pool is implemented as a lock-free, circular, doubly linked
+list. The list contains a sentinel, which is used as the starting
+point when inserting into, or fetching from, the pool. Carriers in
+the pool are elements in this list.
+
+The list can be modified by all scheduler threads
+simultaneously. During modifications the doubly linked list is
+allowed to get a bit "out of shape". For example, following the
+`next` pointer to the next element and then following the `prev`
+pointer does not always take you back to where you started. The
+following is, however, always true:
+
+* Repeatedly following `next` pointers will eventually take you to the
+ sentinel.
+* Repeatedly following `prev` pointers will eventually take you to the
+ sentinel.
+* Following a `next` or a `prev` pointer will take you to either an
+ element in the pool, or an element that used to be in the pool.
+
+When inserting a new element we search for a place to insert the
+element by only following `next` pointers, and we always begin by
+skipping the first element encountered. When trying to fetch an
+element we do the same thing, but instead only follow `prev` pointers.
+
+By going in different directions when inserting and fetching, we avoid
+contention between threads inserting and threads fetching as much as
+possible. By skipping one element when we begin searching, we preserve
+the sentinel unmodified as much as possible. This is beneficial since
+all search operations need to read the content of the sentinel. If we
+were to modify the sentinel, the cache line containing the sentinel
+would unnecessarily be bounced between processors.
+
+The `prev`, and `next` fields in the elements of the list contains the
+value of the pointer, a modification marker, and a deleted
+marker. Memory operations on these fields are done using atomic memory
+operations. When a thread has set the modification marker in a field,
+no-one except the thread that set the marker is allowed to modify the
+field. If multiple modification markers needs to be set, we always
+begin with `next` fields followed by `prev` fields in the order
+following the actual pointers. This guarantees that no deadlocks will
+occur.
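+
+The sketch below illustrates, using C11 atomics rather than the ERTS
+atomic API, how such a field can pack a pointer together with the two
+markers in one atomic word. The names and bit layout are assumptions
+made for illustration only, not the actual implementation.
+
+    #include <stdatomic.h>
+    #include <stdint.h>
+
+    /* Hypothetical encoding: the two low bits of an aligned pointer
+     * hold the markers. */
+    #define POOL_FLG_MOD 0x1   /* modification marker */
+    #define POOL_FLG_DEL 0x2   /* deleted marker */
+
+    typedef struct pool_elem {
+        _Atomic uintptr_t next;    /* pointer value | markers */
+        _Atomic uintptr_t prev;
+    } pool_elem_t;
+
+    /* Try to set the modification marker. Only the thread that
+     * succeeds may change the pointer part of the field. */
+    static int try_mark_mod(_Atomic uintptr_t *field, uintptr_t ptr_val)
+    {
+        uintptr_t expected = ptr_val;              /* no markers set */
+        return atomic_compare_exchange_strong(field, &expected,
+                                               ptr_val | POOL_FLG_MOD);
+    }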
+
+When a carrier is being removed from a pool, we mark it with a thread
+progress value that needs to be reached before we are allowed to
+modify the `next` and `prev` fields. That is, until we reach this
+thread progress we are not allowed to insert the carrier into the pool
+again, and we are not allowed to deallocate the carrier. This ensures
+that threads inspecting the pool will always be able to traverse the
+pool and reach valid elements. Once we have reached the thread
+progress value that the carrier was tagged with, we know that no
+threads may have references to it via the pool.
+
+### Migration ###
+
+There exists one pool for each allocator type, enabling migration of
+carriers between scheduler-specific allocator instances of the same
+allocator type.
+
+Each allocator instance keeps track of the current utilization of its
+multiblock carriers. When the utilization falls below the "abandon
+carrier utilization limit" it starts to inspect the utilization of the
+current carrier when deallocations are made. If the utilization of
+the carrier also falls below the "abandon carrier utilization limit", it
+unlinks the carrier from its data structure of available free blocks
+and inserts the carrier into the pool.
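+
+A minimal sketch of that decision is shown below. The struct fields,
+the function name, and the way the utilization is computed are
+assumptions made for illustration; the real check lives in the
+alloc_util implementation.
+
+    #include <stddef.h>
+
+    typedef struct {
+        size_t carriers_size;   /* total size of all mb carriers      */
+        size_t blocks_size;     /* total size of allocated blocks     */
+        double acul;            /* abandon carrier utilization limit  */
+    } alc_instance_t;
+
+    typedef struct {
+        size_t size;            /* carrier size                       */
+        size_t blocks_size;     /* allocated block size in carrier    */
+    } carrier_t;
+
+    /* Abandon only if both the instance and the current carrier fall
+     * below the abandon carrier utilization limit. */
+    static int should_abandon(const alc_instance_t *ai, const carrier_t *crr)
+    {
+        double inst_util = (double) ai->blocks_size / ai->carriers_size;
+        double crr_util  = (double) crr->blocks_size / crr->size;
+        return inst_util < ai->acul && crr_util < ai->acul;
+    }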
+
+Since the carrier has been unlinked from the data structure of
+available free blocks, no more allocations will be made in the
+carrier. The allocator instance putting the carrier into the pool,
+however, still has the responsibility of performing deallocations in
+it while it remains in the pool.
+
+Each carrier has a flag field containing information about the
+allocator instance owning the carrier, a flag indicating if the carrier is in
+the pool or not, and a flag indicating if it is busy or not. When the
+carrier is in the pool, the owning allocator instance needs to mark it
+as busy while operating on it. If another thread inspects it in order
+to try to fetch it from the pool, it will abort the fetch if it is
+busy. When the carrier is fetched from the pool, ownership changes
+and further deallocations in the carrier will be redirected to the new
+owner using the delayed dealloc functionality.
+
+If a carrier in the pool becomes empty, it will be withdrawn from the
+pool. All carriers that become empty are also always passed to their
+originating allocator instance for deallocation using the delayed
+dealloc functionality. Since carriers in this way always will be
+deallocated by the allocator instance that allocated them, the
+underlying functionality of allocating and deallocating carriers can
+remain simple and does not have to bother about multiple threads. In a
+NUMA system we will also not mix carriers originating from multiple
+NUMA nodes.
+
+When an allocator instance needs more carrier space, it always begins
+by inspecting its own carriers that are waiting for thread progress
+before they can be deallocated. If no such carrier could be found, it
+then inspects the pool. If no carrier could be fetched from the pool,
+it will allocate a new carrier. Regardless of where the allocator
+instance gets the carrier from, it just links the carrier into
+its data structure of free blocks.
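+
+The order of these attempts can be sketched as below. The helper
+functions and type names are hypothetical stand-ins for the real
+alloc_util internals.
+
+    #include <stddef.h>
+
+    typedef struct alc_instance alc_instance_t;   /* opaque here */
+    typedef struct carrier carrier_t;
+
+    /* Hypothetical helpers standing in for the real internals. */
+    carrier_t *check_pending_dealloc_carriers(alc_instance_t *ai);
+    carrier_t *cpool_fetch(alc_instance_t *ai);
+    carrier_t *create_carrier(alc_instance_t *ai);
+
+    static carrier_t *get_more_carrier_space(alc_instance_t *ai)
+    {
+        carrier_t *crr;
+
+        /* 1. Reuse one of our own carriers waiting for thread progress. */
+        if ((crr = check_pending_dealloc_carriers(ai)) != NULL)
+            return crr;
+        /* 2. Try to fetch an abandoned carrier from the pool. */
+        if ((crr = cpool_fetch(ai)) != NULL)
+            return crr;
+        /* 3. Fall back to allocating a brand new carrier. */
+        return create_carrier(ai);
+    }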
+
+### Result ###
+
+The use of this strategy of abandoning carriers with poor utilization
+and reusing these in allocator instances with an increased carrier
+demand is extremely effective and completely eliminates the problems
+that otherwise sometimes occurred when CPU load dropped while memory
+load did not.
+
+When using the `aoffcaobf` or `aoff` strategies compared to `gf` or
+`bf`, we lose some performance since we get more modifications in the
+data structure of free blocks. This performance penalty is however
+reduced using the `aoffcbf` strategy. A tradeoff between memory
+consumption and performance is however inevitable, and it is up to
+the user to decide what is most important.
+
+Further work
+------------
+
+It would be quite easy to extend this to allow migration of multiblock
+carriers between all allocator types. More or less the only obstacle
+is maintenance of the statistics information.
+
+
diff --git a/erts/emulator/internal_doc/CodeLoading.md b/erts/emulator/internal_doc/CodeLoading.md
new file mode 100644
index 0000000000..151b9cd57c
--- /dev/null
+++ b/erts/emulator/internal_doc/CodeLoading.md
@@ -0,0 +1,186 @@
+Non-Blocking Code Loading
+=========================
+
+Introduction
+------------
+
+Before OTP R16, when an Erlang code module was loaded, all other
+execution in the VM was halted while the load operation was carried
+out in single threaded mode. This might not be a big problem for
+initial loading of modules during VM boot, but it can be a severe
+problem for availability when upgrading modules or adding new code on
+a VM with running payload. This problem grows with the number of cores,
+as both the time it takes to wait for all schedulers to stop and the
+potential amount of halted ongoing work increase.
+
+In OTP R16, modules are loaded without blocking the VM.
+Erlang processes may continue executing undisturbed in parallel during
+the entire load operation. The code loading is carried out by a normal
+Erlang process that is scheduled like all the others. The load
+operation is completed by making the loaded code visible to all
+processes in a consistent way with one single atomic
+instruction. Non-blocking code loading will improve real-time
+characteristics when modules are loaded/upgraded on a running SMP
+system.
+
+
+The Load Phases
+---------------
+
+The loading of a module is divided into two phases; a *prepare phase*
+and a *finishing phase*. The prepare phase consists of reading the BEAM
+file format and all the preparations of the loaded code that can
+easily be done without interfering with the running code. The
+finishing phase will make the loaded (and prepared) code accessible
+from the running code. Old module versions (replaced or deleted) will
+also be made inaccessible by the finishing phase.
+
+The prepare phase is designed to allow several "loader" processes to
+prepare separate modules in parallel while the finishing phase can
+only be done by one loader process at a time. A second loader process
+trying to enter finishing phase will be suspended until the first
+loader is done. This will only block the process; the scheduler is
+free to schedule other work while the second loader is waiting. (See
+`erts_try_seize_code_write_permission` and
+`erts_release_code_write_permission`).
+
+The ability to prepare several modules in parallel is not currently
+used as almost all code loading is serialized by the code_server
+process. The BIF interface is however prepared for this.
+
+ erlang:prepare_loading(Module, Code) -> LoaderState
+ erlang:finish_loading([LoaderState])
+
+The idea is that `prepare_loading` could be called in parallel for
+different modules and returns a "magic binary" containing the internal
+state of each prepared module. Function `finish_loading` could take a
+list of such states and do the finishing of all of them in one go.
+
+Currently we use the legacy BIF `erlang:load_module` which is now
+implemented in Erlang by calling the above two functions in
+sequence. Function `finish_loading` is limited to accepting only a list
+with one module state, as we do not yet use the multi-module loading
+feature.
+
+
+The Finishing Sequence
+----------------------
+
+During VM execution, code is accessed through a number of data
+structures. These *code access structures* are
+
+* Export table. One entry for every exported function.
+* Module table. One entry for each loaded module.
+* "beam_catches". Identifies jump destinations for catch instructions.
+* "beam_ranges". Map code address to function and line in source file.
+
+The most frequently used of these structures is the export table, which
+is accessed at run time for every executed external function call to
+get the address of the callee. For performance reasons, we want to
+access all these structures without any overhead from thread
+synchronization. Earlier this was solved with an emergency brake: stop
+the entire VM to mutate these code access structures, and otherwise treat
+them as read-only.
+
+The solution in R16 is instead to *replicate* the code access
+structures. We have one set of active structures read by the running
+code. When new code is loaded the active structures are copied, the
+copy is updated to include the newly loaded module and then a switch
+is made to make the updated copy the new active set. The active set is
+identified by a single global atomic variable
+`the_active_code_index`. The switch can thus be made by a single
+atomic write operation. The running code has to read this atomic
+variable when using the active access structures, which means one
+atomic read operation per external function call for example. The
+performance penalty from this extra atomic read is however very small
+as it can be done without any memory barriers at all (as described
+below). With this solution we also preserve the transactional feature
+of a load operation. Running code will never see the intermediate
+result of a half-loaded module.
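+
+As a rough illustration, the read and the commit of the code index
+could look like the sketch below. It uses C11 atomics instead of the
+emulator's own atomics API, and the function names are made up; the
+point is only that the fast-path read needs no memory barrier because
+the thread progress event in the finishing sequence provides one.
+
+    #include <stdatomic.h>
+
+    static _Atomic int the_active_code_index;
+    static _Atomic int the_staging_code_index;
+
+    /* Fast path, e.g. on every external function call: no barrier. */
+    static inline int active_code_ix(void)
+    {
+        return atomic_load_explicit(&the_active_code_index,
+                                    memory_order_relaxed);
+    }
+
+    /* Commit after thread progress has guaranteed that all schedulers
+     * have executed a full memory barrier (step 6 of the sequence
+     * described below). */
+    static void commit_staging_code_ix(void)
+    {
+        int six = atomic_load_explicit(&the_staging_code_index,
+                                       memory_order_relaxed);
+        atomic_store_explicit(&the_active_code_index, six,
+                              memory_order_relaxed);
+    }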
+
+The finishing phase is carried out in the following sequence by the
+BIF `erlang:finish_loading`:
+
+1. Seize exclusive code write permission (suspend process if needed
+ until we get it).
+
+2. Make a full copy of all the active access structures. This copy is
+ called the staging area and is identified by the global atomic
+ variable `the_staging_code_index`.
+
+3. Update all access structures in the staging area to include the
+ newly prepared module.
+
+4. Schedule a thread progress event. That is a time in the future when
+ all schedulers have yielded and executed a full memory barrier.
+
+5. Suspend the loader process.
+
+6. After thread progress, commit the staging area by assigning
+ `the_staging_code_index` to `the_active_code_index`.
+
+7. Release the code write permission allowing other processes to stage
+ new code.
+
+8. Resume the loader process allowing it to return from
+ `erlang:finish_loading`.
+
+
+### Thread Progress
+
+The waiting for thread progress in 4-6 is necessary in order for
+processes to read the `the_active_code_index` atomic variable during normal
+execution without any expensive memory barriers. When we write a new
+value into `the_active_code_index` in step 6, we know that all
+schedulers will see an updated and consistent view of all the new
+active access structures once they become reachable through
+`the_active_code_index`.
+
+The total lack of memory barriers when reading `the_active_code_index`
+has one interesting consequence however. Different processes may see
+the new code at different points in time depending on when different
+cores happen to refresh their hardware caches. This may sound unsafe
+but it actually does not matter. The only property we must guarantee
+is that the ability to see the new code must spread with process
+communication. After receiving a message that was triggered by new
+code, the receiver must be guaranteed to also see the new code. This
+will be guaranteed as all types of process communication involve
+memory barriers in order for the receiver to be sure to read what the
+sender has written. This implicit memory barrier will then also make
+sure that the receiver reads the new value of `the_active_code_index`
+and thereby also sees the new code. This is true for all kinds of
+inter-process communication (TCP, ETS, process name registering,
+tracing, drivers, NIFs, etc.), not just Erlang messages.
+
+### Code Index Reuse
+
+To optimize the copy operation in step 2, code access structures are
+reused. In the current solution we have three sets of code access
+structures, identified by a code index of 0, 1 and 2. These indexes
+are used in a round robin fashion. Instead of having to initialize a
+completely new copy of all access structures for every load operation
+we just have to update with the changes that have happened since the
+last two code load operations. We could get by with only two code
+indexes (0 and 1), but that would require yet another round of waiting
+for thread progress before step 2 in the `finish_loading` sequence. We
+cannot start reusing a code index as staging area until we know that
+no lingering scheduler thread is still using it as the active code
+index. With three generations of code indexes, the waiting for thread
+progress in step 4-6 will give this guarantee for us. Thread progress
+will wait for all running schedulers to reschedule at least one
+time. No ongoing execution reading code access structures reached from
+an old value of `the_active_code_index` can exist after a second round
+of thread progress.
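+
+The round robin rotation itself is trivial; a sketch is shown below.
+Treat the constant and function names as assumptions made for
+illustration.
+
+    #define NUM_CODE_IX 3   /* three generations of code access structures */
+
+    /* The staging index is always the generation after the active one:
+     * 0 -> 1 -> 2 -> 0 -> ... */
+    static int next_code_ix(int active_ix)
+    {
+        return (active_ix + 1) % NUM_CODE_IX;
+    }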
+
+The design choice between two or three generations of code access
+structures is a trade-off between memory consumption and code loading
+latency.
+
+### A Consistent Code View
+
+Some native BIFs may need to get a consistent snapshot view of the
+active code. To do this it is important to only read
+`the_active_code_index` one time and then use that index value for all
+code accesses during the BIF. If a load operation is executed in
+parallel, reading `the_active_code_index` a second time might result
+in a different value, and thereby a different view of the code.
diff --git a/erts/emulator/internal_doc/DelayedDealloc.md b/erts/emulator/internal_doc/DelayedDealloc.md
new file mode 100644
index 0000000000..b7d87b839f
--- /dev/null
+++ b/erts/emulator/internal_doc/DelayedDealloc.md
@@ -0,0 +1,175 @@
+Delayed Dealloc
+===============
+
+Problem
+-------
+
+An easy way to handle memory allocation in a multi-threaded
+environment is to protect the memory allocator with a global lock
+which threads performing memory allocations or deallocations have to
+have locked during the whole operation. This solution of course scales
+very poorly, due to heavy lock contention. An improved solution of
+this scheme is to use multiple thread specific instances of such an
+allocator. That is, each thread allocates in its own allocator
+instance which is protected by a lock. In the general case references
+to memory need to be passed between threads. When a thread needs to
+deallocate memory that originates from another thread's allocator
+instance, a lock conflict is possible. In a system like the Erlang VM,
+where memory allocation/deallocation is frequent and references to
+memory are also passed around between threads, this solution will also
+scale poorly due to lock contention.
+
+Functionality Used to Address This Problem
+-----------------------------------------
+
+In order to reduce contention due to locking of allocator instances we
+introduced completely lock free instances tied to each scheduler
+thread, and an extra locked instance for other threads. The scheduler
+threads in the system is expected to do the major part of the
+work. Other threads may still be needed but should not perform any
+major and/or time critical work. The limited amount of contention that
+appears on the locked allocator instance can more or less be
+disregarded.
+
+Since we still need to be able to pass references to memory between
+scheduler threads we need some way to manage this. An allocator
+instance belonging to one scheduler thread is only allowed to be
+manipulated by that scheduler thread. When other threads need to
+deallocate memory originating from a foreign allocator instance, they
+only pass the memory block to a "message box" containing deallocation
+jobs attached to the originating allocator instance. When a scheduler
+thread detects such a deallocation job, it performs the actual
+deallocation.
+
+The "message box" is implemented using a lock free single linked list
+through the memory blocks to deallocate. The order of the elements in
+this list is not important. Insertion of new free blocks will be made
+somewhere near the end of this list. Requirering that the new blocks
+need to be inserted at the end would cause unnecessary contention when
+large amount of memory blocks are inserted simultaneous by multiple
+threads.
+
+The data structure referring to this singly linked list covers two cache
+lines. One cache line contains information about the head of the
+list, and one cache line contains information about the tail of the
+list. This is in order to reduce cache line ping-ponging of this data
+structure. The head of the list will only be manipulated by the thread
+owning the allocator instance, and the tail will be manipulated by
+other threads inserting deallocation jobs.
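+
+The layout could be sketched roughly as below, with a padding field
+keeping the head and tail parts on separate cache lines. The names,
+the cache line size, and the field set are assumptions for
+illustration.
+
+    #include <stdatomic.h>
+
+    #define CACHE_LINE_SIZE 64            /* assumed value */
+
+    typedef struct ddq_block ddq_block_t; /* a block to deallocate */
+
+    typedef struct {
+        struct {
+            ddq_block_t *first;           /* touched only by the owner    */
+            ddq_block_t *unref_end;
+        } head;
+        char pad[CACHE_LINE_SIZE];        /* keep tail on its own line    */
+        struct {
+            _Atomic(ddq_block_t *) last;  /* touched by inserting threads */
+        } tail;
+    } ddq_t;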
+
+### Tail ###
+
+In the tail part of the data structure we find a pointer to the last
+element of the list, or at least something that is near the end of the
+list. In the uncontended case it will point to the end of the list,
+but when simultaneous insert operations are performed it will point to
+something near the end of the list.
+
+When inserting an element one will try to write a pointer to the new
+element in the next pointer of the element pointed to by the last
+pointer. This is done using an atomic compare and swap that expects
+the next pointer to be `NULL`. If this succeeds, the thread performing
+this operation moves the last pointer to point to the newly inserted
+element.
+
+If the atomic compare and swap described above failed, the last
+pointer didn't point to the last element. In this case we need to
+insert the new element somewhere in between the element that the last
+pointer pointed to and the actual last element. If we do it this way
+the last pointer will eventually end up at the last element when
+threads stop adding new elements. When trying to insert somewhere near
+the end and failing to do so, the inserting thread sometimes moves to
+the next element and sometimes tries with the same element again. This
+in order to spread the inserted elements during heavy contention. That
+is, we try to spread the modifications of memory to different
+locations instead of letting all threads continue to try to modify the
+same location in memory.
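+
+A simplified enqueue could look like the sketch below, written with
+C11 atomics rather than the ERTS atomic API. It always steps forward
+on a failed compare and swap, whereas the real code sometimes retries
+the same element; it also assumes the list always contains at least
+one element (the marker element described further down).
+
+    #include <stdatomic.h>
+    #include <stddef.h>
+
+    typedef struct ddq_block {
+        _Atomic(struct ddq_block *) next;
+        /* ... the memory block itself ... */
+    } ddq_block_t;
+
+    static void ddq_enqueue(_Atomic(ddq_block_t *) *last, ddq_block_t *blk)
+    {
+        ddq_block_t *ins = atomic_load(last);
+
+        atomic_store(&blk->next, NULL);
+        for (;;) {
+            ddq_block_t *expected = NULL;
+            if (atomic_compare_exchange_strong(&ins->next, &expected, blk)) {
+                /* Try to advance the last pointer; it is fine if another
+                 * thread has already moved it further. */
+                atomic_compare_exchange_strong(last, &ins, blk);
+                return;
+            }
+            /* Someone appended after `ins`; step towards the end and
+             * retry, spreading contention over several elements. */
+            ins = expected;
+        }
+    }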
+
+### Head ###
+
+The head contains pointers to the beginning of the list (`head.first`), and
+to the first block which other threads may refer to
+(`head.unref_end`). Blocks between these pointers are only referred to
+by the head part of the data structure which is only used by the
+thread owning the allocator instance. When these two pointers are not
+equal, the thread owning the allocator instance deallocates block after
+block until `head.first` reaches `head.unref_end`.
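+
+The owner's side of that drain is conceptually just the loop below.
+The names are illustrative, the atomics on the `next` field are
+omitted, and the actual deallocation is abstracted behind a function
+pointer.
+
+    /* Simplified: only the owner thread runs this. */
+    struct blk { struct blk *next; };
+
+    static void dealloc_unreferenced(struct blk **first,
+                                     struct blk *unref_end,
+                                     void (*dealloc)(void *))
+    {
+        while (*first != unref_end) {
+            struct blk *b = *first;
+            *first = b->next;     /* only the owner touches head.first */
+            dealloc(b);
+        }
+    }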
+
+We of course periodically need to move the `head.unref_end` closer to
+the end in order to be able to continue deallocating memory
+blocks. Since all threads inserting new elements in the linked list
+will enter the list using the last pointer we can use this
+knowledge. If we call `erts_thr_progress_later()` and wait until we
+have reached that thread progress we know that no managed threads can
+refer the elements up to the element pointed to by the last pointer at
+the time when we called `erts_thr_progress_later()`. This is because all
+managed threads must have left the code implementing this at least
+once, and they always enter the list via the last pointer. The
+`tail.next` field contains information about next `head.unref_end`
+pointer and thread progress that needs to be reached before we can
+move `head.unref_end`.
+
+Unfortunately not only threads managed by the thread progress
+functionality may insert memory blocks. Other threads also need to be
+taken care of. Other threads will not be as frequent users of this
+functionality as managed threads, so using a less efficient scheme for
+them is not that big of a problem. In order to handle unmanaged
+threads we use two reference counters. When an unmanaged thread enters
+this implementation it increments the reference counter currently
+used, and when it leaves this implementation it decrements the same
+reference counter. When the consumer thread calls
+`erts_thr_progress_later()` in order to determine when it is safe to
+move `head.unref_end`, it also swaps reference counters for unmanaged
+threads. The previous current represents outstanding references from
+the time up to this point. The new current represents future references
+following this point. When the consumer thread detects that we have
+both reached the desired thread progress and that the previous current
+reference counter has reached zero, it is safe to move `head.unref_end`.
+
+The reason for using two reference counters is that we need to know
+that the reference counter eventually will reach zero. If we only used
+one reference counter it would potentially be held above zero forever
+by different unmanaged threads.
+
+### Empty List ###
+
+If no new memory blocks are inserted into the list, it should
+eventually be emptied. All pointers to the list however expect to
+always point to something. This is solved by inserting an empty
+"marker" element, which only has the purpose of being there in the
+absence of other elements. That is, when the list is empty it only
+contains this "marker" element.
+
+### Contention ###
+
+When elements are continuously inserted by threads not owning the
+allocator instance, the thread owning the allocator instance will be
+able to work more or less undisturbed by other threads at the head end
+of the list. At the tail end large amounts of simultaneous inserts may
+cause contention, but we reduce such contention by spreading inserts
+of new elements near the end instead of requiring all new elements to
+be inserted at the end.
+
+### Schedulers and The Locked Allocator Instance ###
+
+The locked allocator instance for use by non-scheduler threads also
+has a message box for deallocation jobs just as all the other
+allocator instances. The reason for this is that other threads may
+allocate memory and pass it to a scheduler that then needs to deallocate
+it. We do not want the scheduler to have to wait for the lock on this
+locked instance. Since locked instances also have message boxes for
+deallocation jobs, the scheduler can just insert the job and avoid the
+locking.
+
+
+### A Benchmark Result ###
+
+When running the ehb benchmark, large amounts of messages are passed
+around between schedulers. All message passing will in one way or
+another cause memory allocation and deallocation. Since messages are
+passed between different schedulers we will get contention on the
+allocator instances where messages were allocated. By the introduction
+of the delayed dealloc feature, we got a speedup of between 25-45%,
+depending on configuration of the benchmark, when running on a
+relatively new machine with an Intel i7 quad core processor with
+hyper-threading using 8 schedulers. \ No newline at end of file
diff --git a/erts/emulator/internal_doc/PTables.md b/erts/emulator/internal_doc/PTables.md
new file mode 100644
index 0000000000..6fe0e7665d
--- /dev/null
+++ b/erts/emulator/internal_doc/PTables.md
@@ -0,0 +1,356 @@
+Process and Port Tables
+=======================
+
+Problems
+--------
+
+The process table is a mapping from process identifiers to process
+structure pointers. The process structure contains miscellaneous
+information about a process, as for example pointers to its heap,
+message queue, etc. When the runtime system needs to operate on a
+process, it looks up the process structure in the process table using
+the process identifier. An example of this is when passing a message
+to a process.
+
+The process table has for a very long time just been an array of
+pointers to process structures. Since process identifiers internally
+in the runtime system are 28-bit integers it is quite easy to map a
+process identifier to an index into the array. The 28 bits were divided
+into two sets. The least significant set of bits was used as an index
+into the array. The most significant set of bits was only used to be
+able to distinguish between a number of identifiers which map to
+the same index in the array. As long as process table sizes that are
+powers of two were used, we had 2^28 unique process identifiers.
+
+When the first SMP support was implemented, the table still was kept
+more or less the same way, but protected by two types of locks. One
+lock that protected the whole table against modifications and an array
+of locks protecting different parts of the table. The exact locking
+strategy previously used isn't interesting. What is interesting is
+that it suffered from heavy lock contention, especially when lots of
+modifications were being made, but also when only performing lookups.
+
+In order to be able to detect when it is safe to deallocate a
+previously used process structure, reference counting of the structure
+was used. Also this was problematic, since simultaneous lookups needed
+to modify the reference counter which also caused contention on the
+cache line where the reference counter was located. This is because all
+modifications need to be communicated between all involved
+processors.
+
+The port table is very similar to the process table. The major
+difference, at least in concept, is that it is a mapping from port
+identifiers to port structures. It had a similar implementation, but
+with some differences. Instead of being an array of pointers it was an
+array of structures, and instead of being protected by two types of
+locks it was only protected by one global lock. This table also
+suffered from lock contention in various situations.
+
+Solution
+--------
+
+The process table was the major problem to address since processes are
+much more frequently used than ports. The first implementation only
+implemented this for processes, but since the port table is very
+similar and very similar problems occur on the port table, the process
+table implementation was later generalized so that it could also be
+used to implement the port table. For simplicity I will only talk
+about the process table in the following text, but the same will apply
+to the port table unless otherwise stated.
+
+If we disregard the locking issues, the original solution is very
+appealing. The mapping from process identifier to index into the array
+is very fast, and this property is something we would like to
+keep. The vast majority of operations on these tables are lookups so
+optimizing for lookups is what we want to do.
+
+### Lookup ###
+
+Using a set of bits in the process identifier as index into an array
+seems hard to beat. By replacing the array of pointers with an array
+of our pointer sized atomic data type, a lookup will consist of the
+following:
+
+1. Mapping the 28-bit integer to an index into the array.
+
+ More about this mapping later.
+
+2. Read the pointer using an atomic memory operation at the determined
+   index in the array.
+
+   On all platforms where we provide atomic memory operations, this is
+   just a `volatile` read, preventing the compiler from using values in
+   registers and forcing a read from memory.
+
+3. Depending on use, issue appropriate memory barrier.
+
+   A common barrier used is a barrier with acquire semantics. On
+   x86/x86_64 this maps to a compiler barrier preventing the compiler
+   from reordering instructions, but on other hardware often some kind of
+   lightweight hardware memory barrier is also needed.
+
+   When comparing with a locked approach, at least one heavyweight
+   memory barrier will be issued when locking the lock on most, if
+   not all, hardware architectures (including x86/x86_64), and often
+   some kind of lightweight memory barrier will be issued when
+   unlocking the lock.
+
+When looking at this very simple solution with very little overhead
+you might wonder why we didn't implement it this way from the
+beginning. It all boils down to the read operation of the pointer. We
+need some way to know that it is safe to access the memory pointed
+to. One way of doing this is to place a reference counter in the
+process structure. Increment of the reference counter at lookup needs
+to be done atomically with the lookup. A lock can typically provide
+this service for us, which was the approach we previously
+used. Another approach could be to co-locate the reference counter
+with the pointer in the table. The major problem with this approach is
+the modifications of the reference counter. This since these
+modification would have to be communicated between all involved
+processor cause contention on the cache line containing the reference
+counter. The new lookup approach above is possible since we can use
+the "thread progress" functionality in order to determine when it is
+safe to deallocate the process structure. We'll get back to this when
+describing deletion in the table.
+
+Using this new lookup approach we won't modify any memory at all, which
+is important. A lookup conceptually only reads memory; now this is true
+in the implementation as well, which is important from a scalability
+perspective. The previous implementation modified the cache line
+containing the reference counter two times, and the cache line
+containing the corresponding lock two times at each lookup.
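+
+Expressed as a sketch (C11 atomics instead of the ERTS atomics API,
+and hypothetical names), the whole lookup is essentially the
+following; the safety of the returned pointer relies on the thread
+progress based deletion described below.
+
+    #include <stdatomic.h>
+
+    typedef struct process Process;              /* opaque here */
+
+    extern _Atomic(Process *) ptab_slots[];      /* the table */
+    int ptab_pid2ix(unsigned int pid);           /* hypothetical mapping */
+
+    static Process *ptab_lookup(unsigned int pid)
+    {
+        int ix = ptab_pid2ix(pid);
+        /* One atomic read with acquire semantics; no lock and no
+         * reference count update. */
+        return atomic_load_explicit(&ptab_slots[ix], memory_order_acquire);
+    }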
+
+### Modifications of the Table ###
+
+A lightweight lookup in the table was the most important feature, but
+we also wanted to improve modifications of the table. The process
+table is modified when a new process is spawned, i.e. a new pointer is
+inserted into the table, and when a process terminates, i.e. a pointer
+is deleted in the table.
+
+Assuming that we spawn fewer processes than the maximum amount of
+unique process identifiers in the system, one has always been able to
+determine the order of process creation just by comparing process
+identifiers. If PidX is larger than PidY, then PidX was created after
+PidY, assuming both identifiers originate from the same node. However,
+since we have a quite limited amount of unique identifiers today
+(2^28), this property cannot be relied upon if we create large amounts
+of processes. But nevertheless, this is a property the system has always
+had.
+
+If we had had a huge amount of unique identifiers available, it
+would have been tempting to drop or modify this ordering property as
+described above. The ordering property could for example be based on
+the scheduler performing the spawn operation. It would have been
+possible to reserve large ranges of identifiers exclusively for each
+scheduler thread, which could be used to minimize the need for
+communication when allocating identifiers. The amount of identifiers
+we have to work with today is, however, not even close to being enough
+for such an approach.
+
+Since we have a limited amount of unique identifiers, we need to be
+careful not to waste them. If previously used identifiers are reused
+too quickly, identifiers originating from terminated processes will
+refer to newly created processes, and mixups will occur. The
+previously used approach was quite good at not wasting
+identifiers. Using a modified version of the same approach also lets
+us keep the ordering property that we have always had.
+
+#### Insert ####
+
+The original approach is more or less to search for the next free index
+or slot in the array. The search starts from the last slot allocated. If
+we reach the end of the array we increase a "wrapped counter" and then
+continue the search. The process identifier is constructed by writing
+the index to the least significant set of bits, and the "wrapped
+counter" to the most significant set of bits. The amount of bits in
+each set of bits is decided at boot time, so that the maximum index will
+just fit into the least significant set of bits.
+
+In the modified lock free version of this approach we more or less do
+it the same way, but with some important modifications trying to avoid
+unnecessary contention when multiple schedulers create processes
+simultaneously. Since multiple threads might be trying to search for
+the next free slot at the same time from the same starting point we
+want subsequent slots to be located in different cache lines. Multiple
+schedulers simultaneously writing new pointers into the table are
+therefore very likely to write into adjacent slots. If adjacent slots
+are located in the same cache line, all modifications of this cache line
+need to be communicated between all involved processors, which will be
+very expensive and scale very poorly. By locating adjacent slots in
+different cache lines only true conflicts will trigger communication
+between involved processors, i.e., avoiding false sharing.
+
+A cache line is larger than a pointer, typically 8 or 16 times larger,
+so using one cache line for each slot only containing one pointer
+would be a waste of space. Each cache line will be able to hold a
+fixed amount of slots. The first slot of the table will be the first
+slot of the first cache line, the second slot of the table will be the
+first slot of the second cache line until we reach the end of the
+array. The next slot after that will be the second slot of the first
+cache line, etc, moving forward one cache line internal slot each time
+we wrap. This way we will be able to fit the same amount of pointers
+into an array of the same size while always keeping adjacent slots in
+different cache lines.
+
+The mapping from identifier to slot or index into the array gets a bit
+more complicated than before. Instead of a `shift` and a bitwise
+`and`, we get two `shift`s, two bitwise `and`s, and an `add` (see
+implementation of `erts_ptab_data2pix()` in `erl_ptab.h`). However, by
+storing this information optimized for lookup we only need a `shift`
+and a bitwise `and` on 32-bit platforms. On 64-bit platforms we have
+enough room for the 28-bit identifier in the least significant
+halfword, and the index in the most significant halfword, in other
+words, we just need to read the most significant halfword to get the
+index. That is, this operation is as fast, or faster than before. The
+downside is that on 32-bit platforms we need to convert this
+information into the 28-bit identifier number when printing, or when
+ordering identifiers from the same node. These operations are,
+however, extremely infrequent compared to lookups.
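+
+A simplified version of the mapping (not the actual
+`erts_ptab_data2pix()` code; all names and the exact parameterization
+are assumptions) could look like this, interleaving consecutive
+identifiers over different cache lines:
+
+    /* nslots, nlines and slots_per_line are assumed to be powers of two,
+     * with nslots == nlines * slots_per_line. */
+    typedef struct {
+        unsigned data_mask;       /* nslots - 1           */
+        unsigned line_mask;       /* nlines - 1           */
+        unsigned line_shift;      /* log2(slots_per_line) */
+        unsigned in_line_shift;   /* log2(nlines)         */
+    } ptab_layout_t;
+
+    static unsigned ptab_data2pix(const ptab_layout_t *l, unsigned data)
+    {
+        unsigned n = data & l->data_mask;
+        /* index = (n mod nlines) * slots_per_line + (n div nlines),
+         * i.e. two shifts, two bitwise ands and an add. */
+        return ((n & l->line_mask) << l->line_shift) + (n >> l->in_line_shift);
+    }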
+
+When we insert a new element in the table we do the following:
+
+1. We begin by reserving space in the table by atomically
+ incrementing a counter of processes in the table. If our increment
+ brings the counter above the maximum size of the table, the
+   operation fails and a `system_limit` exception is raised.
+
+2. The table contains a 64-bit atomic variable of the last identifier
+ used. Only the least significant bits will be used when actually
+ creating the identifier. This identifier is where the search
+   begins.
+
+3. We increment the last identifier value used. In order to determine the
+   slot that corresponds to this identifier we call
+   `erts_ptab_data2pix()`, which maps the identifier to a slot. We read the
+ content of the slot. If the slot is free we try to write a
+ reservation marker using an atomic compare and swap. If this fails
+ we repeat this step until it succeeds.
+
+4. Change the table variable of the last identifier used. Since multiple
+   writes might occur at the same time this value may already have
+   been changed to an identifier larger than the one we got. In
+   this case we can continue; otherwise, we need to change it to the
+   identifier we got.
+
+5. We now do some initializations of the process structure that
+ cannot be done before we know the process identifier, and have to
+ be done before we publish the structure in the table. This, for
+ example, includes storing the identifier in the process structure.
+
+6. Now we can publish the structure in the table by writing the
+   pointer to the process structure in the slot previously reserved
+   in step 3.
+
+Using this approach we keep the properties like identifier ordering,
+and identifier reuse while improving performance and scalability. It
+has one flaw, though. There is no guarantee that the operation will
+terminate. This can quite easily be fixed though, and will be fixed in
+the next release. We will get back to this below.
+
+#### Delete ####
+
+When a process terminates, we mark the process as terminated in the
+process structure, the counter of number of processes in the table is
+decreased, and the reference to the process structure is removed by
+writing a `NULL` pointer into the corresponding slot. The scheduler
+thread performing this then schedules a thread progress later job which
+will do the final cleanup and deallocate the process structure. The
+thread progress functionality will make sure that this job will not
+execute until it is certain that all managed threads have dropped all
+references to the process structure.
+
+### BIF Iterating Over the Table ###
+
+The `erlang:processes/0` and `erlang:ports/0` BIFs iterate over the
+tables and return corresponding identifiers. These BIFs should return a
+consistent snapshot of the table content during some time when the BIF
+is executing. In order to implement this we use locking in a strange
+way. We use an "inverted rwlock".
+
+When performing lookups in the table we do not need to bother about
+the locking at all, but when modifying the table we read lock the
+rwlock protecting the table which allows for multiple writers during
+normal operation. When the BIF that iterates over the table needs
+access to the table it write locks the rwlock and reads the content of the
+table. The BIF does not read the whole table in one go but instead reads
+small chunks at a time, only write locking while reading. The actual
+implementation of the BIFs is out of the scope of this document.
+
+An out of the box rwlock will typically suffer from contention on the
+single cache line containing the state of the rwlock even in the case
+we are only read locking. Instead of using such an rwlock, we have our
+own implementation of reader optimized rwlocks which keeps track of
+reader threads in separate thread specific cache lines. This in order
+to avoid contention on a singe cache line. As long as we only do read
+lock operations, threads only need to read a global cache line and
+modify its own cache line, and by this minimize communication between
+involved processors. The iterating BIFs are normally very infrequently
+used, so in the normal case we will only do read lock operations on
+the table global rwlock.
+
+### Future Improvements ###
+
+The first improvement is to fix the guarantee so that insert
+operations will be guaranteed to terminate. When the operation starts
+we verify that there actually exists a free slot that we can use. The
+problem is that we might not find it since it may move when multiple
+threads modify the table at the same time as we are trying to find the
+slot. The easy fix is to abort the operation if an empty slot could
+not be found in a finite number of operations, and then restart the
+operation under a write lock. This will be implemented in the next
+release, but further work should be done trying to find a better
+solution.
+
+This and also the previous implementation do not work well when the table
+is nearly full. We will both get long search times for free slots, and
+we will reuse identifiers more frequently since we more frequently
+wrap during the search. These tables work best when the table is much
+larger than the number of simultaneously existing processes. One easy
+improvement is to always have room for more processes than we allow in
+the table. This will also be implemented in the next release, but more
+work should probably be done on trying to find an even better
+solution.
+
+It would also be nice to get rid of the rwlock altogether. The use
+of a reader optimized rwlock makes sure we do not get any contention on
+the lock, but unnecessary memory barriers will be issued due to the
+lock. The main issue here is to modify iterating BIFs so that they do
+not require exclusive access to the table while reading a sequence of
+slots. In principle this should be rather easy; the code can handle
+sequences of variable sizes, so shrinking the sequence size of slots
+to one would solve the problem. This will, however, need some tweaks
+and modifications of non-trivial code, but is something that should be
+looked at in the future.
+
+By increasing the size of identifiers, at least on 64-bit machines
+(which isn't as easy as it first might seem) we get further room for
+improvement. Besides the obvious improvement of not reusing
+identifiers as fast as we currently do, it makes it possible to
+further avoid contention when inserting elements in the table. At
+least if we drop this ordering property, which isn't that useful
+anyway.
+
+### Some Benchmark Results ###
+
+In order to test modifications of the process table we ran a couple of
+benchmarks where lots of processes are spawned and terminated
+simultaneously, and got a speedup of between 150-200%. Running a
+similar benchmark but with ports we got a speedup of about 130%.
+
+The BIF `erlang:is_process_alive/1` is the closest you can get to a
+process table lookup only. The BIF looks up the process corresponding
+to the process identifier passed as argument, and then checks if it is
+alive. By running multiple processes looping over this BIF checking
+the same process, we get a speedup between 20000-23000%. Conceptually
+this operation only involves read operations. In the implementation
+used in R16B only read operations are performed as well, while the
+previous implementation needed to lock structures in order to read the
+data, suffering from both lock contention and contention due to
+modifications of cache lines used by lock internal data structures and
+the reference counter on the process being looked up.
+
+The benchmarks were run on a relatively new machine with an Intel i7
+quad core processor with hyper-threading using 8 schedulers. On a
+machine with more communication overhead and/or larger amount of
+logical processors the speedups are expected to be even larger.
diff --git a/erts/emulator/internal_doc/PortSignals.md b/erts/emulator/internal_doc/PortSignals.md
new file mode 100644
index 0000000000..b1afb7c5cb
--- /dev/null
+++ b/erts/emulator/internal_doc/PortSignals.md
@@ -0,0 +1,267 @@
+Port Signals
+============
+
+Problems
+--------
+
+Erlang ports conceptually are very similar to Erlang processes. Erlang
+processes execute Erlang code in the virtual machine, while an Erlang
+port executes native code typically used for communication with the
+outside world. For example, when an Erlang process wants to
+communicate using TCP over the network, it communicates via an Erlang
+port implementing the TCP socket interface in native code. Both Erlang
+processes and ports communicate using asynchronous signaling. The
+native code executed by an Erlang port is a collection of callback
+functions, called a driver. Each callback more or less implements the
+code of a signal to, or from the port.
+
+Even though processes and ports conceptually always have been very
+similar, the implementations have been very different. Originally,
+more or less all port signals were handled synchronously at the time
+they occurred. Very early in the development of the SMP support for
+the runtime system we recognized that this was a huge problem for
+signals between ports and the outside world. That is, I/O events to
+and from the outside world, or I/O signals. This was one of the first
+things that had to be rewritten in order to be able to do I/O in
+parallel at all. The solution was to implement scheduling of these
+signals. I/O signals corresponding to different ports could then be
+executed in parallel on different scheduler threads. Signals from
+processes to ports were not as big of a problem as the I/O signals, and
+the implementation of those was left as it was.
+
+Each port is protected by its own lock to protect against simultaneous
+execution in multiple threads. Previously when a process, executing on
+a scheduler thread, sent a port a signal, it locked the port lock and
+synchronously executed the code corresponding to the signal. If the
+lock was busy, the scheduler thread blocked waiting until it could
+lock the lock. If multiple processes executing simultaneously on
+different scheduler threads sent signals to the same port, schedulers
+suffered from heavy lock contention. Such contention could also occur
+between I/O signals for the port executing on one scheduler thread,
+and a signal from a process to the port executing on another scheduler
+thread. Besides the contention issues, we also lose potential work to
+execute in parallel on different scheduler threads. This is because the
+process sending the *asynchronous* signal is blocked while the code
+implementing the signal is executed synchronously.
+
+Solution
+--------
+
+In order to prevent multiple schedulers from trying to execute signals
+to/from the same port simultaneously, we need to be able to ensure
+that all signals to/from a port are executed in sequence on one
+scheduler. More or less, the only way to do this is to schedule all
+types of signals. Signals corresponding to a port can then be executed
+in sequence by one single scheduler thread. If only one thread tries
+to execute the port, no contention will appear on the port
+lock. Besides getting rid of the contention, processes sending signals
+to the port can also continue execution of their own Erlang code on
+other schedulers at the same time as the signaling code is executing
+on another scheduler.
+
+When implementing this there are a couple of important properties that
+we either need, or want to preserve:
+
+* Signal ordering guarantee. Signals from process `X` to port `Y`,
+ *must* be delivered to `Y` in the same order as sent from `X`.
+
+* Signal latency. Due to the previous synchronous implementation,
+  latency of signals sent from processes to ports has usually been
+ very low. During contention the latency has of course
+ increased. Users expect latency of these signals to be low, a
+ sudden increase in latency would not be appreciated by our users.
+
+* Compatible flow control. Ports have for a very long time had the
+ possibility to use the busy port functionality when implementing
+  flow control. One may argue that this functionality fits very badly
+ with the conceptually completely asynchronous signaling, but the
+ functionality has been there for ages and is expected to be
+ there. When a port sets itself into a busy state, `command`
+ signals should not be delivered, and senders of such signals
+ should suspend until the port sets itself in a not busy state.
+
+### Scheduling of Port Signals ###
+
+A run queue has four queues for processes of different priority and
+one queue for ports. The scheduler thread associated with the run
+queue switches evenly between execution of processes and execution of
+ports while both processes and ports exist in the queue. This is not
+completely true, but not important for this discussion. A port that is
+in a run queue also has a queue of tasks to execute. Each task
+corresponds to an in- or outgoing signal. When the port is selected
+for execution each task will be executed in sequence. The run queue
+locks not only protected the queues of ports, but also the queues of
+port tasks.
+
+Since we go from a state where I/O signals are the only port related
+signals scheduled, to a state where potentially all port related
+signals may be scheduled we may drastically increase the load on the
+run queue lock. The amount of scheduled port tasks very much depends on
+the Erlang application executing, which we do not control, and we do
+not want to get increased contention on the run queue locks. We
+therefore need another approach to protecting the port task queue.
+
+#### Task Queue ####
+
+We chose a "semi locked" approach, with one public locked task queue,
+and a private, lock free, queue like, task data structure. This "semi
+locked" approach is similar to how the message boxes of processes are
+managed. The lock is port specific and only used for protection of
+port tasks, so the run queue lock is now needed in more or less the
+same way for ports as for processes. This ensures that we won't see
+increased lock contention on run queue locks due to this rewrite of
+the port functionality.
+
+When an executing port runs out of work to execute in the private task
+data structure, it moves the public task queue into the private task
+data structure while holding the lock. Once tasks has been moved to
+the private data structure no lock protects them. This way the port
+can continue working on tasks in the private data structure without
+having to fight for the lock.
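+
+Conceptually the scheme looks like the sketch below (the names, the
+lock type, and the singly linked layout are illustrative
+simplifications, not the actual port task queue code).
+
+    #include <stddef.h>
+    #include <pthread.h>
+
+    typedef struct port_task {
+        struct port_task *next;
+        /* ... */
+    } port_task_t;
+
+    typedef struct {
+        pthread_mutex_t lock;    /* protects in_first/in_last only      */
+        port_task_t *in_first;   /* public queue, other threads enqueue */
+        port_task_t *in_last;
+        port_task_t *local;      /* private part, owner only, no lock   */
+    } port_task_queue_t;
+
+    static port_task_t *fetch_next_task(port_task_queue_t *q)
+    {
+        port_task_t *task;
+
+        if (q->local == NULL) {
+            /* Out of private work: move the whole public queue over. */
+            pthread_mutex_lock(&q->lock);
+            q->local = q->in_first;
+            q->in_first = q->in_last = NULL;
+            pthread_mutex_unlock(&q->lock);
+        }
+        if ((task = q->local) != NULL)
+            q->local = task->next;
+        return task;
+    }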
+
+I/O signals may however be aborted. This could be solved by letting
+the port specific scheduling lock also protect the private task data
+structure, but then the port very frequently would have to fight with
+others enqueueing new tasks. In order to handle this while keeping the
+private task data structure lock free, we use a similar "non
+aggressive" approach as we use when handling processes that gets
+suspended while in the run queue. Instead of removing the aborted port
+task, we just mark it as aborted using an atomic memory
+operation. When a task is selected for execution, we first verify that
+it has not been aborted. If aborted, we just drop the task.
+
+A task that can be aborted is referred to via another data structure from
+other parts of the system, so that a thread that needs to abort the
+task can reach it. In order to be sure to safely deallocate a task
+that is no longer used, we first clear this reference and then use the
+thread progress functionality in order to make sure no references can
+exist to the task. Unfortunately, also unmanaged threads might abort
+tasks. This is very infrequent, but might occur. This could be handled
+locally for each port, but would require extra information in each
+port structure which very infrequently would be used. Instead of
+implementing this in each port, we implemented general functionality
+that can be used from unmanaged threads to delay thread progress.
+
+The private "queue like" task data structure could have been an
+ordinary queue if it wasn't for the busy port functionality. When the
+port has flagged itself as busy, `command` signals are not allowed to
+be delivered and need to be blocked. Other signals sent from the same
+sender following a `command` signal that has been blocked also have to
+be blocked; otherwise, we would violate the ordering guarantee. At the
+same time, other signals that have no dependencies to blocked
+`command` signals are expected to be delivered.
+
+The above requirements make the private task data structure a rather
+complex data structure. It has a queue of unprocessed tasks, and a
+busy queue. The busy queue contains blocked tasks corresponding to
+`command` signals, and tasks with dependencies to such tasks. The busy
+queue is accompanied by a table of blocked tasks, keyed on sender,
+with a reference to the last task in the busy queue from a specific
+sender. This is because we need to check for dependencies when new tasks are
+processed in the queue of unprocessed tasks. When a new task is
+processed that needs to be blocked it isn't enqueued at the end of the
+busy queue, but instead directly after the last task with the same
+sender. This in order to easily be able to detect when we have tasks
+that no longer have any dependencies to tasks corresponding to
+`command` signals which should be moved out of the busy queue. When
+the port executes, it switches between processing tasks from the busy
+queue, and processing directly from the unprocessed queue based on its
+busy state. When processing directly from the unprocessed queue it
+might, of course, have to move a task into the busy queue instead of
+executing it.
+
+#### Busy Port Queue ####
+
+Since it is the port itself which decides when it is time to enter a
+busy state, it needs to be executing in order to enter the busy
+state. As a result of `command` signals being scheduled, we may get
+into a situation where the port gets flooded by a huge amount of
+`command` signals before it even gets a chance to set itself into a
+busy state. This is because it has not been scheduled for execution
+yet. That is, under these circumstances the busy port functionality
+loses the flow control properties it was intended to provide.
+
+In order to solve this, we introduced a new busy feature, namely "busy
+port queue". The port has a limit of `command` data that is allowed to
+be enqueued in the task queue. When this limit is reached, the port
+will automatically enter a busy port queue state. When in this state,
+senders of `command` signals will be suspended, but `command` signals
+will still be delivered to the port unless it is also in a busy port
+state. This limit is known as the high limit.
+
+There is also a low limit. When the amount of queued `command` data
+falls below this limit and the port is in a busy port queue state, the
+busy port queue state is automatically disabled. The low limit should
+typically be significantly lower than the high limit in order to
+prevent frequent oscillation around the busy port queue state.
+
+By the introduction of this new busy state we can still provide the flow
+control. Old drivers do not even have to be changed. The limits can,
+however, be configured and even disabled by the port. By default the
+high limit is 8 KB and the low limit is 4 KB.
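+
+The state transition itself is simple; a sketch is shown below. The
+struct and field names are made up for illustration, while the default
+limits match the ones mentioned above.
+
+    #include <stddef.h>
+
+    typedef struct {
+        size_t queued_bytes;     /* command data currently enqueued    */
+        size_t high_limit;       /* enter busy port queue state (8 KB) */
+        size_t low_limit;        /* leave busy port queue state (4 KB) */
+        int busy_port_queue;     /* current state                      */
+    } port_queue_state_t;
+
+    static void update_busy_port_queue(port_queue_state_t *p)
+    {
+        if (!p->busy_port_queue && p->queued_bytes >= p->high_limit)
+            p->busy_port_queue = 1;   /* suspend senders of command signals */
+        else if (p->busy_port_queue && p->queued_bytes < p->low_limit)
+            p->busy_port_queue = 0;   /* resume suspended senders */
+    }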
+
+### Preparation of Signal Send ###
+
+Previously all operations sending signals to ports began by acquiring
+the port lock, then performed preparations for sending the signal, and
+then finally sent the signal. The preparations typically included
+inspecting the state of the port, and preparing the data to pass along
+with the signal. The preparation of data is frequently quite time
+consuming, and did not really depend on the port. That is, we would
+like to do this without holding the port lock.
+
+In order to improve this, state information was re-organized in the
+port structure, so that we can access it using atomic memory
+operations. This together with the new port table implementation,
+enabled us to lookup the port and inspect the state before acquiring
+the port lock, which in turn made it possible to perform preparations
+of signal data before acquiring the port lock.
+
+### Preserving Low Latency ###
+
+If we disregard the contended cases, we will inevitably get a higher
+latency when scheduling signals for execution at a later time than by
+executing the signal immediately. In order to preserve the low latency
+we now first check if this is a contended case or not. If it is, we
+schedule the signal for later execution; otherwise, we execute the
+signal immediately. It is a contended case if other signals already
+are scheduled on the port, or if we fail to acquire the port
+lock. That is, we will not block waiting for the lock.
+
+Doing it this way we will preserve the low latency at the expense of
+lost potential parallel execution of the signal and other code in the
+process sending the signal. This default behaviour can however be
+changed on a per-port basis or system wide, forcing scheduling of all
+signals from processes to ports that are not part of a synchronous
+communication, that is, an unconditional request/response pair of
+asynchronous signals. In such a case there is no potential for parallelism,
+and therefore no point in forcing scheduling of the request signal.
+
+The immediate execution of signals may also cause a scheduler that is
+about to execute scheduled tasks to block waiting for the port
+lock. This is however more or less the only scenario where a scheduler
+needs to wait for the port lock. The maximum time it has to wait is
+the time it takes to execute one signal, since we always schedule
+signals when contention occurs.
+
+### Signal Operations ###
+
+Besides implementing the functionality enabling the scheduling,
+preparation of signal data without port lock, etc, each operation
+sending signals to ports had to be quite extensively re-written. This
+in order to move all sub-operations that can be done without the lock
+to a place before we have acquired the lock, and also since signals
+now sometimes are executed immediately and sometimes scheduled for
+execution at a later time, which puts different requirements on the data
+to pass along with the signal.
+
+### Some Benchmark Results ###
+
+When running some simple benchmarks where contention only occurs due to
+I/O signals contending with signals from one single process, we got a
+speedup of 5-15%. When multiple processes send signals to one single
+port the improvements can be much larger, but the scenario with one
+process contending with I/O is the most common one.
+
+The benchmarks were run on a relatively new machine with an Intel i7
+quad core processor with hyper-threading using 8 schedulers. \ No newline at end of file
diff --git a/erts/emulator/internal_doc/ProcessManagementOptimizations.md b/erts/emulator/internal_doc/ProcessManagementOptimizations.md
new file mode 100644
index 0000000000..9e83633bef
--- /dev/null
+++ b/erts/emulator/internal_doc/ProcessManagementOptimizations.md
@@ -0,0 +1,172 @@
+Process Management Optimizations
+================================
+
+Problems
+--------
+
+Early versions of the SMP support for the runtime system completely
+relied on locking in order to protect data accesses from multiple
+threads. In some cases this isn't that problematic, but in some cases
+it really is. The code is complicated by having to ensure that all
+needed locks are actually held, and that all locks are acquired in
+such an order that no deadlocks occur. Acquiring locks in the right
+order often also involves releasing locks already held, forcing
+threads to reread data already read. A good recipe for bugs. Trying to
+use more fine-grained locking in order to increase possible
+parallelism in the system makes the complexity situation even
+worse. Having to acquire a bunch of locks when doing operations also
+often causes heavy lock contention, which causes poor scalability.
+
+Management of processes internally in the runtime system suffered from
+these problems. When changing the state of a process, for example from
+`waiting` to `runnable`, a lock on the process had to be
+acquired. When inserting a process into a run queue, a lock protecting
+the run queue also had to be acquired. When migrating a process from
+one run queue to another run queue, locks on both run queues and on
+the process had to be held.
+
+This last example is a quite common case during normal operation. For
+example, when a scheduler thread runs out of work it tries to steal
+work from another scheduler thread's run queue. When searching for a
+victim to steal from, there was a lot of juggling of run queue locks
+involved, and the actual theft was finalized by locking both run
+queues and the process. When one scheduler runs out of work, often
+others do too, causing lots of lock contention.
+
+Solution
+--------
+
+### Process ###
+
+In order to avoid these situations we wanted to be able to do most of
+the fundamental operations on a process without having to acquire a
+lock on the process. Some examples of such fundamental operations are
+moving a process between run queues, determining whether it needs to
+be inserted into a run queue or not, and determining whether it is
+alive or not.
+
+All of the information in the process structure that was needed by
+these operations was protected by the process `status` lock, but the
+information was spread across a number of fields. The fields used were
+typically state fields that could contain a small number of different
+states. By reordering this information a bit we could *easily* fit
+this information into a 32-bit wide field of bit flags (only 12 flags
+were needed). By moving this information we could remove five 32-bit
+wide fields and one pointer field from the process structure! The move
+also enabled us to easily read and change the state using atomic
+memory operations.
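+
+A hedged sketch of the idea follows. The flag names, the layout, and
+the use of C11 atomics are illustrative assumptions; the real
+implementation uses the ERTS atomic API and its own set of state
+flags.
+
+```c
+#include <stdatomic.h>
+#include <stdint.h>
+
+/* Illustrative state flags packed into a single 32-bit word. */
+#define PROC_STATE_ACTIVE     (((uint32_t) 1) << 0)
+#define PROC_STATE_IN_RUNQ    (((uint32_t) 1) << 1)
+#define PROC_STATE_EXITING    (((uint32_t) 1) << 2)
+#define PROC_STATE_SUSPENDED  (((uint32_t) 1) << 3)
+
+typedef struct {
+    _Atomic uint32_t state;  /* replaces several fields + status lock */
+    /* ... other process fields ... */
+} Process;
+
+/* Atomically mark the process as active. The returned old value tells
+ * the caller whether it also needs to enqueue the process in a run
+ * queue, without ever taking a lock on the process. */
+static int
+mark_runnable(Process *p)
+{
+    uint32_t old = atomic_fetch_or(&p->state, PROC_STATE_ACTIVE);
+    return !(old & (PROC_STATE_ACTIVE | PROC_STATE_IN_RUNQ));
+}
+```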
+
+### Run Queue ###
+
+As with processes, we wanted to be able to do the most fundamental
+operations on a run queue without having to acquire a lock on it. The
+most important one is being able to determine whether we should
+enqueue a process in a specific run queue or not. This involves being
+able to read actual load and load balancing information.
+
+The load balancing functionality is triggered at repeated fixed
+intervals. The load balancing more or less strives to even out run
+queue lengths over the system. When balancing is triggered,
+information about every run queue is gathered, and migration paths and
+run queue length limits are set up. Migration paths and limits are
+fixed until the next balancing has been done. The most important
+information about each run queue is the maximum run queue length since
+the last balancing. All of this information was previously stored in
+the run queues themselves.
+
+When a process has become runnable, for example due to reception of a
+message, we need to determine which run queue to enqueue it
+in. Previously this at least involved locking the run queue that the
+process currently was assigned to while holding the status lock on the
+process. Depending on load we sometimes also had to acquire a lock on
+another run queue in order to be able to determine if it should be
+migrated to that run queue or not.
+
+In order to be able to decide which run queue to use without having to
+lock any run queues, we moved all fixed balancing information out of
+the run queues into a global memory block, that is, migration paths
+and run queue limits. Information that needs to be frequently updated,
+such as the maximum run queue length, was kept in the run queue, but
+instead of operating on this information under locks we now use atomic
+memory operations when accessing it. This made it possible to first
+determine which run queue to use without locking any run queues, and
+once decided, lock the chosen run queue and insert the process.
+
+#### Fixed Balancing Information ####
+
+When determining which run queue to choose we need to read the fixed
+balancing information that we moved out of the run queues. This
+information is global, read only between load balancing operations,
+but will be changed during a load balancing. We do not want to
+introduce a global lock that needs to be acquired when accessing this
+information. A reader-optimized rwlock could avoid some of the
+overhead since the data is most frequently read, but it would
+unavoidably cause disruption during load balancing, since this
+information is very frequently read. The likelihood of a large
+disruption due to this also increases as the number of schedulers
+grows.
+
+Instead of using a global lock protecting modifications of this
+information, we write a completely new version of it at each load
+balancing. The new version is written in another memory block than the
+previous one, and published by issuing a write memory barrier and then
+storing a pointer to the new memory block in a global variable using
+an atomic write operation.
+
+When schedulers need to read this information, they read the pointer
+to the currently used information using an atomic read operation, and
+then issue a data dependency read barrier, which on most architectures
+is a no-op. That is, there is very little overhead in getting access
+to this information.
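+
+A minimal sketch of this publish/read scheme follows, using C11
+atomics as illustrative stand-ins for the ERTS atomic API and memory
+barriers; the `BalanceInfo` layout is an assumption.
+
+```c
+#include <stdatomic.h>
+
+/* Illustrative layout; the real block holds migration paths and run
+ * queue length limits for the whole system. */
+typedef struct {
+    int nscheds;
+    /* ... migration paths, run queue length limits ... */
+} BalanceInfo;
+
+static _Atomic(BalanceInfo *) current_balance_info;
+
+/* Load balancer: build a completely new block and publish it. The
+ * release store orders all writes to *new_info before the pointer
+ * becomes visible (the "write memory barrier" described above). */
+void publish_balance_info(BalanceInfo *new_info)
+{
+    atomic_store_explicit(&current_balance_info, new_info,
+                          memory_order_release);
+    /* The previous block is recycled later, once thread progress
+     * guarantees that no thread still references it. */
+}
+
+/* Reader: atomic load plus a data dependency barrier; with
+ * memory_order_consume this is a no-op on most architectures
+ * (many compilers promote it to acquire). */
+const BalanceInfo *read_balance_info(void)
+{
+    return atomic_load_explicit(&current_balance_info,
+                                memory_order_consume);
+}
+```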
+
+Instead of allocating and deallocating memory blocks for the different
+versions of the balancing information we keep old memory blocks and
+reuse them when it is safe to do so. In order to be able to determine
+when it is safe to reuse a block we use the thread progress
+functionality, ensuring that no threads have any references to the
+memory block when we reuse it.
+
+#### Be Less Aggressive ####
+
+We implemented a test version using lock-free run queues. This
+implementation did, however, not perform as well as the version using
+one lock per run queue. The reason for this was not investigated
+thoroughly enough to say why. Since the locked version performed
+better we kept it, at least for now. The lock-free version, however,
+forced us to use other solutions, some of which we kept.
+
+Previously, when a process that was in a run queue got suspended, we
+removed it from the queue straight away. This involved locking the
+process, locking the run queue, and then unlinking it from the doubly
+linked list implementing the queue. Removing a process from a
+lock-free queue gets really complicated. Instead of removing it from
+the queue, we just leave it in the queue and mark it as suspended.
+When it is later selected for execution we check whether the process
+is suspended, and if so just drop it. During its time in the queue it
+might also get resumed again, in which case we execute it when it gets
+selected for execution.
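+
+A hedged sketch of the dequeue side of this, reusing the illustrative
+`Process` state flags from the earlier sketch; `RunQueue` and
+`dequeue()` are hypothetical stand-ins for the ERTS internals.
+
+```c
+typedef struct RunQueue RunQueue;               /* opaque stand-in */
+extern Process *dequeue(RunQueue *rq);          /* hypothetical */
+
+/* Drop suspended processes lazily at dequeue time instead of
+ * unlinking them from the queue when they get suspended. */
+static Process *
+select_next_process(RunQueue *rq)
+{
+    Process *p;
+    while ((p = dequeue(rq)) != NULL) {
+        uint32_t state = atomic_load(&p->state);
+        if (state & PROC_STATE_SUSPENDED)
+            continue;   /* still suspended: just drop it */
+        return p;       /* runnable, or resumed while queued */
+    }
+    return NULL;
+}
+```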
+
+By keeping this part when reverting back to a locked implementation,
+we could remove a pointer field in each process structure, and avoid
+unnecessary operations on the process and the queue which might cause
+contention.
+
+### Combined Modifications ###
+
+By combining the modifications of the process state management and the
+run queue management, we can do large parts of the work involved in
+managing processes with regard to scheduling and migration without
+holding any locks at all. In these situations we previously had to
+hold multiple locks. This of course caused a lot of rewrites across
+large parts of the runtime system, but the rewrite both simplified
+code and eliminated locking in a number of places. The major benefit
+is, of course, reduced contention.
+
+### A Benchmark Result ###
+
+When running the chameneosredux benchmark, schedulers frequently run
+out of work and try to steal work from each other, either succeeding
+in migrating processes or at least attempting to. This is exactly the
+scenario that we wanted to optimize. With these improvements, we got a
+speedup of 25-35% when running this benchmark on a relatively new
+machine with an Intel i7 quad core processor with hyper-threading,
+using 8 schedulers. \ No newline at end of file
diff --git a/erts/emulator/internal_doc/ThreadProgress.md b/erts/emulator/internal_doc/ThreadProgress.md
new file mode 100644
index 0000000000..6118bcf0f6
--- /dev/null
+++ b/erts/emulator/internal_doc/ThreadProgress.md
@@ -0,0 +1,308 @@
+Thread Progress
+===============
+
+Problems
+--------
+
+### Knowing When Threads Have Completed Accesses to a Data Structure ###
+
+When multiple threads access the same data structure you often need to
+know when all threads have completed their accesses. For example, in
+order to know when it is safe to deallocate the data structure. One
+simple way to accomplish this is to reference count all accesses to
+the data structure. The problem with this approach is that the cache
+line where the reference counter is located needs to be communicated
+between all involved processors. Such communication can become
+extremely expensive and will scale poorly if the reference counter is
+frequently accessed. That is, we want to use some other approach to
+keeping track of threads than reference counting.
+
+### Knowing That Modifications of Memory are Consistently Observed ###
+
+Different hardware architectures have different memory models. Some
+architectures allow very aggressive reordering of memory accesses
+while other architectures only reorder a few specific cases. Common to
+all modern hardware is, however, that some type of reordering will
+occur. When using locks to protect all memory accesses made from
+multiple threads, such reorderings will not be visible. The locking
+primitives will ensure that the memory accesses will be ordered. When
+using lock-free algorithms, one does, however, have to take this
+reordering made by the hardware into account.
+
+Hardware memory barriers or memory fences are instructions that can be
+used to enforce order between memory accesses. Different hardware
+architectures provide different memory barriers. Lock free algorithms
+need to use memory barriers in order to ensure that memory accesses
+are not reordered in such ways that the algorithm breaks down. Memory
+barriers are also expensive instructions, so you typically want to
+minimize the use of these instructions.
+
+Functionality Used to Address These Problems
+-------------------------------------------
+
+The "thread progress" functionality in the Erlang VM is used to
+address these problems. The name "thread progress" was chosen since we
+want to use it to determine when all threads in a set of threads have
+made such progress so that two specific events have taken place for
+all them.
+
+The set of threads that we are interested in we call managed
+threads. The managed threads are the only threads that we get any
+information about. These threads *have* to frequently report
+progress. Not all threads in the system are able to frequently report
+progress. Such threads cannot be allowed in the set of managed threads
+and are called unmanaged threads. An example of unmanaged threads is
+the threads in the async thread pool. Async threads can be blocked for
+very long times and thereby be prevented from frequently reporting
+progress. Currently only scheduler threads and a couple of other
+threads are managed threads.
+
+### Thread Progress Events ###
+
+Any thread in the system may use the thread progress functionality in
+order to determine when the following events have occurred at least
+once in all managed threads:
+
+1. The thread has returned from other code to a known state in the
+ thread progress functionality, which is independent of any other
+ code.
+2. The thread has executed a full memory barrier.
+
+These events, of course, need to be ordered with respect to other
+memory operations. The operation of determining this begins by
+initiating the thread progress operation. The thread that initiated
+the thread progress operation then polls for the completion of the
+operation. Both of these events must occur at least once *after* the
+thread progress operation has been initiated, and at least once
+*before* the operation has completed, in each managed thread. This is
+ordered using communication via memory, which makes it possible to
+draw conclusions about the memory state after the thread progress
+operation has completed. Let's call the progress made from initiation
+to completion "thread progress".
+
+Assuming that the thread progress functionality is efficient, a lot of
+algorithms can both be simplified and made more efficient compared to
+the first approach that comes to mind. A couple of examples follow.
+
+By being able to determine when the first event above has occurred we
+can easily know when all managed threads have completed accesses to a
+data structure. This can be determined in the following way. We have
+an implementation of some functionality `F` using a data structure
+`D`. The reference to `D` is always looked up before `D` is being
+accessed, and the reference to `D` is always dropped before we leave
+the code implementing `F`. If we remove the possibility to look up `D`
+and then wait until the first event has occurred in all managed
+threads, no managed thread can have any references to the data
+structure `D`. This could for example have been achieved by using
+reference counting, but the cache line containing the reference
+counter would in this case be ping-ponged between all processors
+accessing `D` at every access.
+
+By being able to determine when the second event has occurred, it is
+quite easy to do complex modifications of memory that need to be seen
+consistently by other threads without having to resort to locking. By
+doing the modifications, then issuing a full memory barrier, then
+waiting until the second event has occurred in all managed threads,
+and then publishing the modifications, we know that all managed
+threads reading this memory will get a consistent view of the
+modifications. Managed threads reading this will not have to issue any
+extra memory barriers at all.
+
+Implementation of the Thread Progress Functionality
+---------------------------------------------------
+
+### Requirement on the Implementation ###
+
+In order to be able to determine when all managed threads have reached
+the states that we are interested in we need to communicate between
+all involved threads. We of course want to minimize this
+communication.
+
+We also want threads to be able to determine when thread progress has
+been made relatively fast. That is, we need some balance between
+communication overhead and the time to complete the operation.
+
+### API ###
+
+I will only present the most important functions in the API here.
+
+* `ErtsThrPrgrVal erts_thr_progress_later(void)` - Initiation of the
+  operation. The thread progress value returned can be used for
+  testing completion of the operation.
+* `int erts_thr_progress_has_reached(ErtsThrPrgrVal val)` - Returns
+  a non-zero value when we have reached the thread progress value
+  passed as argument. That is, when a non-zero value is returned the
+  operation has completed.
+
+When a thread calls `my_val = erts_thr_progress_later()` and waits for
+`erts_thr_progress_has_reached(my_val)` to return a non-zero value, it
+knows that thread progress has been made.
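+
+A minimal usage sketch of these two calls follows. The surrounding
+helpers (`do_other_work()`, the choice of `free()`) are hypothetical;
+only the two thread progress calls are taken from the list above, and
+the declarations are assumed to come from the ERTS thread progress
+header.
+
+```c
+#include <stdlib.h>
+
+extern void do_other_work(void);   /* hypothetical stand-in */
+
+/* Defer freeing a block until all managed threads have made
+ * thread progress, i.e. can no longer hold a reference to it. */
+void retire_block(void *block)
+{
+    ErtsThrPrgrVal wait_for = erts_thr_progress_later();
+
+    while (!erts_thr_progress_has_reached(wait_for)) {
+        /* Do not block here; keep doing useful work (or request a
+         * wakeup as described below and go to sleep). */
+        do_other_work();
+    }
+
+    /* No managed thread can still reference the block. */
+    free(block);
+}
+```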
+
+While waiting for `erts_thr_progress_has_reached()` to return a
+non-zero value we typically do not want to block, but instead want to
+continue working on other things. If we run out of other things to
+work on, we typically do want to block until we have reached the
+thread progress value that we are waiting for. In order to be able to
+do this we provide functionality for waking up a thread when a certain
+thread progress value has been reached:
+
+* `void erts_thr_progress_wakeup(ErtsSchedulerData *esdp,
+ ErtsThrPrgrVal val)` - Request wake up. The calling thread will be
+ woken when thread progress has reached val.
+
+Managed threads frequently need to update their thread progress by
+calling the following functions:
+
+* `int erts_thr_progress_update(ErtsSchedulerData *esdp)` - Update
+  thread progress. If a non-zero value is returned,
+  `erts_thr_progress_leader_update()` has to be called without any
+  locks held.
+* `int erts_thr_progress_leader_update(ErtsSchedulerData *esdp)` -
+ Leader update thread progress.
+
+Unmanaged threads can delay thread progress being made:
+
+* `ErtsThrPrgrDelayHandle erts_thr_progress_unmanaged_delay(void)` -
+ Delay thread progress.
+* `void erts_thr_progress_unmanaged_continue(ErtsThrPrgrDelayHandle
+ handle)` - Let thread progress continue.
+
+Scheduler threads can schedule an operation to be executed by the
+scheduler itself when thread progress has been made:
+
+* `void erts_schedule_thr_prgr_later_op(void (*funcp)(void *), void
+ *argp, ErtsThrPrgrLaterOp *memp)` - Schedule a call to `funcp`. The
+ call `(*funcp)(argp)` will be executed when thread progress has been
+ made since the call to `erts_schedule_thr_prgr_later_op()` was
+ made.
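+
+As a hedged illustration of the last call, here is a sketch of
+deferred deallocation of a table structure (the pattern described in
+the ETS example at the end of this document). The `Table` type and
+`table_destroy()` are hypothetical; the `ErtsThrPrgrLaterOp` field
+must live until the callback has been executed.
+
+```c
+typedef struct Table {
+    /* ... table data ... */
+    ErtsThrPrgrLaterOp later_op;   /* storage for the later op */
+} Table;
+
+extern void table_destroy(Table *tb);   /* hypothetical */
+
+static void table_free_cb(void *arg)
+{
+    Table *tb = arg;
+    table_destroy(tb);   /* safe: thread progress has been made */
+}
+
+void schedule_table_free(Table *tb)
+{
+    erts_schedule_thr_prgr_later_op(table_free_cb, tb, &tb->later_op);
+}
+```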
+
+### Implementation ###
+
+In order to determine when the events have happened we use a global
+counter that is incremented when all managed threads have called
+`erts_thr_progress_update()` (or `erts_thr_progress_leader_update()`).
+This could naively be implemented using a "thread confirmed" counter.
+This would however cause an explosion of communication where all
+involved processors would need to communicate with each other at each
+update.
+
+Instead of confirming at a global location, each thread confirms that
+it accepts an increment of the global counter in its own cache
+line. These confirmation cache lines are located in sequence in an
+array, and each confirmation cache line will only be written by one
+and only one thread. One of the managed threads always has the leader
+responsibility. This responsibility may jump between threads, but as
+long as there is some activity in the system, one of them will always
+have the leader responsibility. The thread with the leader
+responsibility will call `erts_thr_progress_leader_update()`, which
+will check that all other threads have confirmed an increment of the
+global counter before incrementing the global counter. The leader
+thread is the only thread reading the confirmation cache lines.
+
+Doing it this way we will get a communication pattern of information
+going from the leader thread out to all other managed threads and then
+back from the other threads to the leader thread. This is because only
+the leader thread will write to the global counter and all other
+threads will only read it, and because each confirmation cache line
+will only be written by one specific thread and only read by the
+leader thread. When the managed threads are distributed over different
+processors, the communication between processors will be a reflection
+of this communication pattern between threads.
+
+The value returned from `erts_thr_progress_later()` equals the value
+most recently confirmed by this thread plus two. The global value may
+be the latest confirmed value or the latest confirmed value minus
+one. In order to be certain that all other managed threads actually
+will call `erts_thr_progress_update()` at least once before we reach
+the value returned from `erts_thr_progress_later()`, the global
+counter plus one is not enough. This is because all other threads may
+already have confirmed the current global value plus one at the time
+when we call `erts_thr_progress_later()`. They are, however,
+guaranteed not to have confirmed the global value plus two at this
+time. For example, if this thread has most recently confirmed value
+10, the global counter is 9 or 10, and other threads may already have
+confirmed 11, but none of them can have confirmed 12, which is the
+value returned.
+
+The implementation described above more or less minimizes the
+communication needed before we can increment the global counter. The
+amount of communication in the system due to the thread progress
+functionality does, however, also depend on the frequency with which
+managed threads call `erts_thr_progress_update()`. Today each
+scheduler thread calls `erts_thr_progress_update()` more or less each
+time an Erlang process is scheduled out. One way of further reducing
+communication due to the thread progress functionality is to only call
+`erts_thr_progress_update()` every second or third time an Erlang
+process is scheduled out, or even less frequently than that. However,
+by updating thread progress less frequently, all operations depending
+on the thread progress functionality will also take longer.
+
+#### Delay of Thread Progress by Unmanaged Threads ####
+
+In order to implement delay of thread progress from unmanaged threads
+we use two reference counters, one being `current` and one being
+`waiting`. When an unmanaged thread wants to delay thread progress, it
+increments `current` and gets back a handle to the reference counter
+it incremented. When it later wants to enable continuation of thread
+progress, it uses the handle to decrement the reference counter it
+previously incremented.
+
+When the leader thread is about to increment the global thread
+progress counter it verifies that the `waiting` counter is zero before
+doing so. If it is not zero, the leader isn't allowed to increment the
+global counter and needs to wait before it can do this. When it is
+zero, it swaps the `waiting` and `current` counters before increasing
+the global counter. From now on the new `waiting` counter will
+decrease, so that it eventually will reach zero, making it possible to
+increment the global counter the next time. If we only used one
+reference counter it could potentially be held above zero forever by
+different unmanaged threads.
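+
+A hedged sketch of the two-counter scheme follows, using C11 atomics
+and simplified ordering; the real implementation uses the ERTS atomic
+API and is more careful about memory ordering and leader hand-over.
+
+```c
+#include <stdatomic.h>
+
+static _Atomic int delay_counters[2];
+static _Atomic int current_ix;   /* index of the "current" counter */
+
+/* Unmanaged thread: delay thread progress; the returned index is
+ * the handle used to later allow progress to continue. */
+int unmanaged_delay(void)
+{
+    int ix = atomic_load(&current_ix);
+    atomic_fetch_add(&delay_counters[ix], 1);
+    return ix;
+}
+
+/* Unmanaged thread: let thread progress continue. */
+void unmanaged_continue(int handle)
+{
+    atomic_fetch_sub(&delay_counters[handle], 1);
+}
+
+/* Leader, just before incrementing the global counter: the increment
+ * is only allowed when the "waiting" counter has drained to zero,
+ * at which point "waiting" and "current" are swapped. */
+int leader_may_increment_global(void)
+{
+    int cur = atomic_load(&current_ix);
+    int waiting = cur ^ 1;
+    if (atomic_load(&delay_counters[waiting]) != 0)
+        return 0;                        /* must wait */
+    atomic_store(&current_ix, waiting);  /* old current becomes waiting */
+    return 1;
+}
+```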
+
+When an unmanaged thread increments the `current` counter it will not
+prevent the next increment of the global counter, but instead the
+increment after that. This is sufficient since the global counter
+needs to be incremented two times before thread progress has been
+made. It is also desirable not to prevent the first increment, since
+the likelihood increases that the delay is withdrawn before any
+increment of the global counter is delayed. That is, the operation
+will cause as little disruption as possible.
+
+However, this feature of delaying thread progress from unmanaged
+threads should preferably be used as little as possible, since heavy
+use of it will cause contention on the reference counter cache
+lines. The functionality is nevertheless very useful in code which
+normally only executes in managed threads, but which may under some
+infrequent circumstances be executed in other threads.
+
+#### Overhead ####
+
+The overhead caused by the thread progress functionality is more or
+less fixed for a given number of schedulers, regardless of how much
+the functionality is used. Already today quite a lot of functionality
+uses it, and we plan to use it even more. When rewriting old
+implementations of ERTS internal functionality to use the thread
+progress functionality, this implies removing communication from the
+old implementation; otherwise there is simply no point in rewriting
+the old implementation to use the thread progress functionality. Since
+the thread progress overhead is more or less fixed, the rewrite will
+cause a reduction of the total communication in the system.
+
+##### An Example #####
+
+The main structure of an ETS table was originally managed using
+reference counting. A long time ago we replaced this strategy, since
+the reference counter caused contention on each access of the
+table. The solution used was to schedule "confirm deletion" jobs on
+each scheduler in order to know when it was safe to deallocate the
+table structure of a removed table. These confirm deletion jobs needed
+to be allocated. That is, we had to allocate and deallocate as many
+blocks as there are schedulers in order to deallocate one block. This
+was of course quite an expensive operation, but we only needed to do
+it once when removing a table. It was more important to get rid of the
+contention on the reference counter, which was present on every
+operation on the table.
+
+When the thread progress functionality had been introduced, we could
+remove the code implementing the "confirm deletion" jobs and instead
+just schedule a thread progress later operation which deallocates the
+structure. Besides simplifying the code a lot, we got an increase of
+more than 10% in the number of transactions per second handled on a
+mnesia tpcb benchmark executing on a quad core machine.
diff --git a/erts/emulator/internal_doc/Tracing.md b/erts/emulator/internal_doc/Tracing.md
new file mode 100644
index 0000000000..30bc5327a7
--- /dev/null
+++ b/erts/emulator/internal_doc/Tracing.md
@@ -0,0 +1,220 @@
+Non-blocking trace setting
+==========================
+
+Introduction
+------------
+
+Before OTP R16, when trace settings were changed by
+`erlang:trace_pattern`, all other execution in the VM was halted while
+the trace operation was carried out in single-threaded mode. Similar
+to code loading, this can impose a severe problem for availability
+that grows with the number of cores.
+
+In OTP R16, trace breakpoints are set in the code without blocking the
+VM. Erlang processes may continue executing undisturbed in parallel
+during the entire operation. The same base technique is used as for
+code loading. A staging area of breakpoints is prepared and then made
+active with a single atomic operation.
+
+
+Redesign of Breakpoint Wheel
+----------------------------
+
+To make it easier to manage breakpoints without single-threaded mode,
+a redesign of the breakpoint mechanism has been made. The old
+"breakpoint wheel" data structure was a circular double-linked list of
+breakpoints for each instrumented function. It was invented before the
+SMP emulator. To support it in the SMP emulator, it was essentially
+expanded to one breakpoint wheel per scheduler. As more breakpoint
+types were added, the implementation became messy and hard to
+understand and maintain.
+
+In the new design the old wheel was dropped and replaced by one struct
+(`GenericBp`) that holds the data for all types of breakpoints for
+each instrumented function. A bit-flag field is used to indicate which
+types of break actions are enabled.
+
+
+Same Same but Different
+-----------------------
+Even though `trace_pattern` uses the same technique as the
+non-blocking code loading, with replicated generations of data
+structures and an atomic switch, the implementations are quite
+separate from each other. One initial idea was to use the existing
+mechanism of code loading to do a dummy load operation that would make
+a copy of the affected modules. That copy could then be instrumented
+with breakpoints before making it reachable with the same atomic
+switch as done for code loading. This approach seems straightforward
+but has a number of shortcomings, one being the large memory footprint
+when many modules are instrumented. Another problem is how execution
+would reach the new instrumented code. Normally loaded code can only
+be reached through external function calls. Trace settings must be
+activated instantaneously without the need for external function
+calls.
+
+The chosen solution is instead for tracing to apply the technique of
+replication to the data structures for breakpoints. Two generations of
+breakpoints are kept, identified by index 0 and 1. The global atomic
+variable `erts_active_bp_index` determines which generation of
+breakpoints running code will use.
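+
+A hedged sketch of the shape of this data follows; the field names and
+types are simplified assumptions, and the declaration of
+`erts_active_bp_index` here is only illustrative.
+
+```c
+#include <stdatomic.h>
+#include <stdint.h>
+
+typedef struct {
+    uint32_t flags;         /* which break actions are enabled */
+    /* ... per-type breakpoint data (trace, count, time, ...) ... */
+} GenericBpData;
+
+typedef struct {
+    uint64_t orig_instr;    /* saved original instruction word */
+    GenericBpData data[2];  /* generation 0 and generation 1 */
+} GenericBp;
+
+extern _Atomic int erts_active_bp_index;   /* simplified declaration */
+
+/* Running code executes the active generation... */
+static GenericBpData *active_bp(GenericBp *bp)
+{
+    return &bp->data[atomic_load(&erts_active_bp_index)];
+}
+
+/* ...while trace_pattern prepares the other generation and later
+ * commits it by switching erts_active_bp_index (step 8 below). */
+static GenericBpData *staging_bp(GenericBp *bp)
+{
+    return &bp->data[atomic_load(&erts_active_bp_index) ^ 1];
+}
+```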
+
+### Atomicity Without Atomic Operations
+
+Not using the code loading generations (or any other code duplication)
+means that `trace_pattern` must at some point write to the active beam
+code in order for running processes to reach the staged breakpoint
+structures. This can be done with one single atomic write operation
+per instrumented function. The beam instruction words are, however,
+read with normal memory loads and not through the atomic API. The only
+guarantee we need is that the written instruction word is seen as
+atomic: either fully written or not at all. This is true for
+word-aligned write operations on all hardware architectures we use.
+
+
+Adding a new Breakpoint
+-----------------------
+This is a simplified sequence describing what `trace_pattern` goes
+through when adding a new breakpoint.
+
+1. Seize exclusive code write permission (suspend process until we get it).
+
+2. Allocate breakpoint structure `GenericBp` including both generations.
+   Set the active part as disabled with a zeroed flag field. Save the
+   original instruction word in the breakpoint.
+
+3. Write a pointer to the breakpoint at offset -4 from the first
+   instruction, in the "func_info" header.
+
+4. Set the staging part of the breakpoint as enabled with specified
+ breakpoint data.
+
+5. Wait for thread progress.
+
+6. Write an `op_i_generic_breakpoint` as the first instruction for the
+   function. This instruction will execute the breakpoint that it finds
+   at offset -4.
+
+7. Wait for thread progress.
+
+8. Commit the breakpoint by switching `erts_active_bp_index`.
+
+9. Wait for thread progress.
+
+10. Prepare for the next call to `trace_pattern` by updating the new
+    staging part (the old active) of the breakpoint to be identical to
+    the new active part.
+
+11. Release code write permission and return from `trace_pattern`.
+
+
+The code write permission "lock" seized in step 1 is the same as the
+one used by code loading. This will ensure that only one process at a
+time can stage new trace settings, but it will also prevent concurrent
+code loading and make sure we see a consistent view of the beam code
+during the entire sequence.
+
+Between steps 6 and 8, running processes might execute the written
+`op_i_generic_breakpoint` instruction. They will get the breakpoint
+structure written in step 3, read `erts_active_bp_index` and execute
+the corresponding part of the breakpoint. Before the switch in step 8
+becomes visible they will, however, execute the disabled part of the
+breakpoint structure and do nothing other than execute the saved
+original instruction.
+
+
+Updating and Removing Breakpoints
+---------------------------------
+
+The above sequence only describes adding a new breakpoint. We do
+basically the same sequence to update the settings of an existing
+breakpoint, except that steps 2, 3 and 6 can be skipped as they have
+already been done.
+
+To remove a breakpoint some more steps are needed. The idea is to
+first stage the breakpoint as disabled, do the switch, wait for thread
+progress and then remove the disabled breakpoint by restoring the
+original beam instruction.
+
+Here is a more complete sequence that contains both adding, updating
+and removing breakpoints.
+
+1. Seize exclusive code write permission (suspend process until we get it).
+
+2. Allocate new breakpoint structures with a disabled active part and
+   the original beam instruction. Write a pointer to the breakpoint in
+   the "func_info" header at offset -4.
+
+3. Update the staging part of all affected breakpoints. Disable
+ breakpoints that are to be removed.
+
+4. Wait for thread progress.
+
+5. Write an `op_i_generic_breakpoint` as the first instruction for all
+   functions with new breakpoints.
+
+6. Wait for thread progress.
+
+7. Commit all staged breakpoints by switching `erts_active_bp_index`.
+
+8. Wait for thread progress.
+
+
+9. Restore original beam instruction for disabled breakpoints.
+
+10. Wait for thread progress.
+
+11. Prepare for the next call to `trace_pattern` by updating the new
+    staging area (the old active) for all enabled breakpoints.
+
+12. Deallocate disabled breakpoint structures.
+
+13. Release code write permission and return from `trace_pattern`.
+
+
+### All that Waiting for Thread Progress
+
+There are four rounds of waiting for thread progress in the above
+sequence. In the code loading sequence we sacrificed memory overhead
+of three generations to avoid a second round of thread progress. The
+latency of `trace_pattern` should, however, not be such a big problem,
+as it is normally not called in a rapid sequence.
+
+The waiting in step 4 is to make sure all threads will see an updated
+view of the breakpoint structures once they become reachable through
+the `op_i_generic_breakpoint` instruction written in step 5.
+
+The waiting in step 6 is to make the activation of the new trace
+settings "as atomic as possible". Different cores might see the new
+value of `erts_active_bp_index` at different times as it is read
+without any memory barrier. But this is the best we can do without
+more expensive thread synchronization.
+
+The waiting in step 8 is to make sure we don't restore the original
+beam instructions for disabled breakpoints until we know that no
+thread is still accessing the old enabled part of a disabled
+breakpoint.
+
+The waiting in step 10 is to make sure no lingering thread is still
+accessing disabled breakpoint structures to be deallocated in step
+12.
+
+
+Global Tracing
+--------------
+
+Call tracing with the `global` option only affects external function
+calls. This was earlier handled by inserting a special trace
+instruction in export entries without the use of breakpoints. With the
+new non-blocking tracing we want to avoid special handling for global
+tracing and make use of the staging and atomic switching within the
+breakpoint mechanism. The solution was to create the same type of
+breakpoint structure for a global call trace. The difference from
+local tracing is that we insert the `op_i_generic_breakpoint`
+instruction (with its pointer at offset -4) in the export entry rather
+than in the code.
+
+
+Future work
+-----------
+
+We still go to single-threaded mode when new code is loaded for a
+module that is traced, or when loading code while there is a default
+trace pattern set. That is not impossible to fix, but it would require
+much closer cooperation between the tracing BIFs and the loader BIFs.
diff --git a/erts/emulator/sys/unix/erl_unix_sys_ddll.c b/erts/emulator/sys/unix/erl_unix_sys_ddll.c
index 12c47d0088..8760b58839 100644
--- a/erts/emulator/sys/unix/erl_unix_sys_ddll.c
+++ b/erts/emulator/sys/unix/erl_unix_sys_ddll.c
@@ -101,7 +101,7 @@ void erl_sys_ddll_init(void) {
/*
* Open a shared object
*/
-int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err)
+int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err)
{
#if defined(HAVE_DLOPEN)
char* dlname;
diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c
index 61f9f6a59a..59e34eb819 100644
--- a/erts/emulator/sys/unix/sys.c
+++ b/erts/emulator/sys/unix/sys.c
@@ -547,6 +547,25 @@ erts_sys_pre_init(void)
#endif
#endif /* USE_THREADS */
erts_smp_atomic_init_nob(&sys_misc_mem_sz, 0);
+
+ {
+ /*
+ * Unfortunately we depend on fd 0,1,2 in the old shell code.
+ * So if for some reason we do not have those open when we start
+ * we have to open them here. Not doing this can cause the emulator
+ * to deadlock when reaping the fd_driver ports :(
+ */
+ int fd;
+ /* Make sure fd 0 is open */
+ if ((fd = open("/dev/null", O_RDONLY)) != 0)
+ close(fd);
+ /* Make sure fds 1 and 2 are open */
+ while (fd < 3) {
+ fd = open("/dev/null", O_WRONLY);
+ }
+ close(fd);
+ }
+
}
void
diff --git a/erts/emulator/sys/win32/erl_win32_sys_ddll.c b/erts/emulator/sys/win32/erl_win32_sys_ddll.c
index 2d3f073cc2..338f0d7386 100644
--- a/erts/emulator/sys/win32/erl_win32_sys_ddll.c
+++ b/erts/emulator/sys/win32/erl_win32_sys_ddll.c
@@ -59,7 +59,7 @@ void erl_sys_ddll_init(void) {
* Open a shared object
* Expecting 'full_name' as an UTF-8 string.
*/
-int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err)
+int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err)
{
HINSTANCE hinstance;
int len;
diff --git a/erts/emulator/test/binary_SUITE.erl b/erts/emulator/test/binary_SUITE.erl
index a340a805b5..bce4278337 100644
--- a/erts/emulator/test/binary_SUITE.erl
+++ b/erts/emulator/test/binary_SUITE.erl
@@ -447,26 +447,26 @@ terms(Config) when is_list(Config) ->
Sz1 when is_integer(Sz1), size(Bin1) =< Sz1 ->
ok
end,
- Term = binary_to_term(Bin),
- Term = binary_to_term(Bin, [safe]),
+ Term = binary_to_term_stress(Bin),
+ Term = binary_to_term_stress(Bin, [safe]),
Unaligned = make_unaligned_sub_binary(Bin),
- Term = binary_to_term(Unaligned),
- Term = binary_to_term(Unaligned, []),
- Term = binary_to_term(Bin, [safe]),
+ Term = binary_to_term_stress(Unaligned),
+ Term = binary_to_term_stress(Unaligned, []),
+ Term = binary_to_term_stress(Bin, [safe]),
BinC = erlang:term_to_binary(Term, [compressed]),
- Term = binary_to_term(BinC),
+ Term = binary_to_term_stress(BinC),
true = size(BinC) =< size(Bin),
Bin = term_to_binary(Term, [{compressed,0}]),
terms_compression_levels(Term, size(Bin), 1),
UnalignedC = make_unaligned_sub_binary(BinC),
- Term = binary_to_term(UnalignedC)
+ Term = binary_to_term_stress(UnalignedC)
end,
?line test_terms(TestFun),
ok.
terms_compression_levels(Term, UncompressedSz, Level) when Level < 10 ->
BinC = erlang:term_to_binary(Term, [{compressed,Level}]),
- Term = binary_to_term(BinC),
+ Term = binary_to_term_stress(BinC),
Sz = byte_size(BinC),
true = Sz =< UncompressedSz,
terms_compression_levels(Term, UncompressedSz, Level+1);
@@ -476,9 +476,9 @@ terms_float(Config) when is_list(Config) ->
?line test_floats(fun(Term) ->
Bin0 = term_to_binary(Term),
Bin0 = term_to_binary(Term, [{minor_version,0}]),
- Term = binary_to_term(Bin0),
+ Term = binary_to_term_stress(Bin0),
Bin1 = term_to_binary(Term, [{minor_version,1}]),
- Term = binary_to_term(Bin1),
+ Term = binary_to_term_stress(Bin1),
true = size(Bin1) < size(Bin0),
Size0 = erlang:external_size(Term),
Size00 = erlang:external_size(Term, [{minor_version, 0}]),
@@ -490,7 +490,7 @@ terms_float(Config) when is_list(Config) ->
float_middle_endian(Config) when is_list(Config) ->
%% Testing for roundtrip is not enough.
?line <<131,70,63,240,0,0,0,0,0,0>> = term_to_binary(1.0, [{minor_version,1}]),
- ?line 1.0 = binary_to_term(<<131,70,63,240,0,0,0,0,0,0>>).
+ ?line 1.0 = binary_to_term_stress(<<131,70,63,240,0,0,0,0,0,0>>).
external_size(Config) when is_list(Config) ->
%% Build a term whose external size only fits in a big num (on 32-bit CPU).
@@ -608,10 +608,10 @@ bad_binary_to_term(Config) when is_list(Config) ->
ok.
bad_bin_to_term(BadBin) ->
- {'EXIT',{badarg,_}} = (catch binary_to_term(BadBin)).
+ {'EXIT',{badarg,_}} = (catch binary_to_term_stress(BadBin)).
bad_bin_to_term(BadBin,Opts) ->
- {'EXIT',{badarg,_}} = (catch binary_to_term(BadBin,Opts)).
+ {'EXIT',{badarg,_}} = (catch binary_to_term_stress(BadBin,Opts)).
safe_binary_to_term2(doc) -> "Test safety options for binary_to_term/2";
safe_binary_to_term2(Config) when is_list(Config) ->
@@ -622,7 +622,7 @@ safe_binary_to_term2(Config) when is_list(Config) ->
BadRef = <<131,114,0,3,BadHostAtom/binary,0,<<0,0,0,255>>/binary,
Empty/binary,Empty/binary>>,
?line bad_bin_to_term(BadRef, [safe]), % good ref, with a bad atom
- ?line fullsweep_after = binary_to_term(<<131,100,0,15,"fullsweep_after">>, [safe]), % should be a good atom
+ ?line fullsweep_after = binary_to_term_stress(<<131,100,0,15,"fullsweep_after">>, [safe]), % should be a good atom
BadExtFun = <<131,113,100,0,4,98,108,117,101,100,0,4,109,111,111,110,97,3>>,
?line bad_bin_to_term(BadExtFun, [safe]),
ok.
@@ -679,14 +679,14 @@ corrupter0(Term) ->
corrupter(Bin, Pos) when Pos >= 0 ->
?line {ShorterBin, Rest} = split_binary(Bin, Pos),
- ?line catch binary_to_term(ShorterBin), %% emulator shouldn't crash
+ ?line catch binary_to_term_stress(ShorterBin), %% emulator shouldn't crash
?line MovedBin = list_to_binary([ShorterBin]),
- ?line catch binary_to_term(MovedBin), %% emulator shouldn't crash
+ ?line catch binary_to_term_stress(MovedBin), %% emulator shouldn't crash
%% Bit faults, shouldn't crash
<<Byte,Tail/binary>> = Rest,
Fun = fun(M) -> FaultyByte = Byte bxor M,
- catch binary_to_term(<<ShorterBin/binary,
+ catch binary_to_term_stress(<<ShorterBin/binary,
FaultyByte, Tail/binary>>) end,
?line lists:foreach(Fun,[1,2,4,8,16,32,64,128,255]),
?line corrupter(Bin, Pos-1);
@@ -700,7 +700,7 @@ more_bad_terms(Config) when is_list(Config) ->
?line ok = io:format("File: ~s\n", [BadFile]),
?line case file:read_file(BadFile) of
{ok,Bin} ->
- ?line {'EXIT',{badarg,_}} = (catch binary_to_term(Bin)),
+ ?line {'EXIT',{badarg,_}} = (catch binary_to_term_stress(Bin)),
ok;
Other ->
?line ?t:fail(Other)
@@ -709,7 +709,7 @@ more_bad_terms(Config) when is_list(Config) ->
otp_5484(Config) when is_list(Config) ->
?line {'EXIT',_} =
(catch
- binary_to_term(
+ binary_to_term_stress(
<<131,
104,2, %Tuple, 2 elements
103, %Pid
@@ -722,7 +722,7 @@ otp_5484(Config) when is_list(Config) ->
?line {'EXIT',_} =
(catch
- binary_to_term(
+ binary_to_term_stress(
<<131,
104,2, %Tuple, 2 elements
103, %Pid
@@ -734,13 +734,13 @@ otp_5484(Config) when is_list(Config) ->
?line {'EXIT',_} =
(catch
- binary_to_term(
+ binary_to_term_stress(
%% A old-type fun in a list containing a bad creator pid.
<<131,108,0,0,0,1,117,0,0,0,0,103,100,0,13,110,111,110,111,100,101,64,110,111,104,111,115,116,255,255,0,25,255,0,0,0,0,100,0,1,116,97,0,98,6,142,121,72,106>>)),
?line {'EXIT',_} =
(catch
- binary_to_term(
+ binary_to_term_stress(
%% A new-type fun in a list containing a bad creator pid.
%%
<<131,
@@ -752,7 +752,7 @@ otp_5484(Config) when is_list(Config) ->
?line {'EXIT',_} =
(catch
- binary_to_term(
+ binary_to_term_stress(
%% A new-type fun in a list containing a bad module.
<<131,
108,0,0,0,1, %List, 1 element
@@ -763,7 +763,7 @@ otp_5484(Config) when is_list(Config) ->
?line {'EXIT',_} =
(catch
- binary_to_term(
+ binary_to_term_stress(
%% A new-type fun in a list containing a bad index.
<<131,
108,0,0,0,1, %List, 1 element
@@ -775,7 +775,7 @@ otp_5484(Config) when is_list(Config) ->
?line {'EXIT',_} =
(catch
- binary_to_term(
+ binary_to_term_stress(
%% A new-type fun in a list containing a bad unique value.
<<131,
108,0,0,0,1, %List, 1 element
@@ -788,46 +788,46 @@ otp_5484(Config) when is_list(Config) ->
%% An absurdly large atom.
?line {'EXIT',_} =
- (catch binary_to_term(iolist_to_binary([<<131,100,65000:16>>|
+ (catch binary_to_term_stress(iolist_to_binary([<<131,100,65000:16>>|
lists:duplicate(65000, 42)]))),
%% Longer than 255 characters.
?line {'EXIT',_} =
- (catch binary_to_term(iolist_to_binary([<<131,100,256:16>>|
+ (catch binary_to_term_stress(iolist_to_binary([<<131,100,256:16>>|
lists:duplicate(256, 42)]))),
%% OTP-7218. Thanks to Matthew Dempsky. Also make sure that we
%% cover the other error cases for external funs (EXPORT_EXT).
?line {'EXIT',_} =
- (catch binary_to_term(
+ (catch binary_to_term_stress(
<<131,
113, %EXPORT_EXP
97,13, %Integer: 13
97,13, %Integer: 13
97,13>>)), %Integer: 13
?line {'EXIT',_} =
- (catch binary_to_term(
+ (catch binary_to_term_stress(
<<131,
113, %EXPORT_EXP
100,0,1,64, %Atom: '@'
97,13, %Integer: 13
97,13>>)), %Integer: 13
?line {'EXIT',_} =
- (catch binary_to_term(
+ (catch binary_to_term_stress(
<<131,
113, %EXPORT_EXP
100,0,1,64, %Atom: '@'
100,0,1,64, %Atom: '@'
106>>)), %NIL
?line {'EXIT',_} =
- (catch binary_to_term(
+ (catch binary_to_term_stress(
<<131,
113, %EXPORT_EXP
100,0,1,64, %Atom: '@'
100,0,1,64, %Atom: '@'
98,255,255,255,255>>)), %Integer: -1
?line {'EXIT',_} =
- (catch binary_to_term(
+ (catch binary_to_term_stress(
<<131,
113, %EXPORT_EXP
100,0,1,64, %Atom: '@'
@@ -835,7 +835,7 @@ otp_5484(Config) when is_list(Config) ->
113,97,13,97,13,97,13>>)), %fun 13:13/13
%% Bad funs.
- ?line {'EXIT',_} = (catch binary_to_term(fake_fun(0, lists:seq(0, 256)))),
+ ?line {'EXIT',_} = (catch binary_to_term_stress(fake_fun(0, lists:seq(0, 256)))),
ok.
fake_fun(Arity, Env0) ->
@@ -869,7 +869,7 @@ try_bad_lengths(B) ->
try_bad_lengths(B, L) when L > 16#FFFFFFF0 ->
Bin = <<B/binary,L:32>>,
io:format("~p\n", [Bin]),
- {'EXIT',_} = (catch binary_to_term(Bin)),
+ {'EXIT',_} = (catch binary_to_term_stress(Bin)),
try_bad_lengths(B, L-1);
try_bad_lengths(_, _) -> ok.
@@ -923,7 +923,7 @@ otp_6817_try_bin(Bin) ->
%% If the bug is present, the heap pointer will moved when the invalid term
%% is found and we will have a linked list passing through the limbo area
%% between the heap top and the stack pointer.
- catch binary_to_term(Bin),
+ catch binary_to_term_stress(Bin),
%% If the bug is present, we will overwrite the pointers in the limbo area.
Filler = erlang:make_tuple(1024, 16#3FA),
@@ -935,7 +935,7 @@ otp_6817_try_bin(Bin) ->
otp_8117(doc) -> "Some bugs in binary_to_term when 32-bit integers are negative.";
otp_8117(suite) -> [];
otp_8117(Config) when is_list(Config) ->
- [otp_8117_do(Op,-(1 bsl N)) || Op <- ['fun',list,tuple],
+ [otp_8117_do(Op,-(1 bsl N)) || Op <- ['fun',named_fun,list,tuple],
N <- lists:seq(0,31)],
ok.
@@ -944,6 +944,11 @@ otp_8117_do('fun',Neg) ->
FunBin = term_to_binary(fun() -> ok end),
?line <<B1:27/binary,_NumFree:32,Rest/binary>> = FunBin,
?line bad_bin_to_term(<<B1/binary,Neg:32,Rest/binary>>);
+otp_8117_do(named_fun,Neg) ->
+ % Named fun with negative num_free
+ FunBin = term_to_binary(fun F() -> F end),
+ ?line <<B1:27/binary,_NumFree:32,Rest/binary>> = FunBin,
+ ?line bad_bin_to_term(<<B1/binary,Neg:32,Rest/binary>>);
otp_8117_do(list,Neg) ->
%% List with negative length
?line bad_bin_to_term(<<131,104,2,108,Neg:32,97,11,104,1,97,12,97,13,106,97,14>>);
@@ -1233,7 +1238,7 @@ bit_sized_binary_sizes(Config) when is_list(Config) ->
bsbs_1(A) ->
BinSize = 32+A,
io:format("A: ~p BinSize: ~p", [A,BinSize]),
- Bin = binary_to_term(<<131,$M,5:32,A,0,0,0,0,0>>),
+ Bin = binary_to_term_stress(<<131,$M,5:32,A,0,0,0,0,0>>),
BinSize = bit_size(Bin).
deep(Config) when is_list(Config) ->
@@ -1250,7 +1255,7 @@ deep(Config) when is_list(Config) ->
deep_roundtrip(T) ->
B = term_to_binary(T),
- T = binary_to_term(B).
+ T = binary_to_term_stress(B).
obsolete_funs(Config) when is_list(Config) ->
erts_debug:set_internal_state(available_internal_state, true),
@@ -1285,29 +1290,29 @@ obsolete_fun(Fun) ->
Tuple = no_fun_roundtrip(Fun).
no_fun_roundtrip(Term) ->
- binary_to_term(erts_debug:get_internal_state({term_to_binary_no_funs,Term})).
+ binary_to_term_stress(erts_debug:get_internal_state({term_to_binary_no_funs,Term})).
%% Test non-standard encodings never generated by term_to_binary/1
%% but recognized by binary_to_term/1.
robustness(Config) when is_list(Config) ->
- ?line [] = binary_to_term(<<131,107,0,0>>), %Empty string.
- ?line [] = binary_to_term(<<131,108,0,0,0,0,106>>), %Zero-length list.
+ ?line [] = binary_to_term_stress(<<131,107,0,0>>), %Empty string.
+ ?line [] = binary_to_term_stress(<<131,108,0,0,0,0,106>>), %Zero-length list.
%% {[],a} where [] is a zero-length list.
- ?line {[],a} = binary_to_term(<<131,104,2,108,0,0,0,0,106,100,0,1,97>>),
+ ?line {[],a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0,106,100,0,1,97>>),
%% {42,a} where 42 is a zero-length list with 42 in the tail.
- ?line {42,a} = binary_to_term(<<131,104,2,108,0,0,0,0,97,42,100,0,1,97>>),
+ ?line {42,a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0,97,42,100,0,1,97>>),
%% {{x,y},a} where {x,y} is a zero-length list with {x,y} in the tail.
- ?line {{x,y},a} = binary_to_term(<<131,104,2,108,0,0,0,0,
+ ?line {{x,y},a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0,
104,2,100,0,1,120,100,0,1,
121,100,0,1,97>>),
%% Bignums fitting in 32 bits.
- ?line 16#7FFFFFFF = binary_to_term(<<131,98,127,255,255,255>>),
- ?line -1 = binary_to_term(<<131,98,255,255,255,255>>),
+ ?line 16#7FFFFFFF = binary_to_term_stress(<<131,98,127,255,255,255>>),
+ ?line -1 = binary_to_term_stress(<<131,98,255,255,255,255>>),
ok.
@@ -1325,7 +1330,7 @@ run_otp_8180(Name) ->
?line {ok,Bins} = file:consult(Name),
[begin
io:format("~p\n", [Bin]),
- ?line {'EXIT',{badarg,_}} = (catch binary_to_term(Bin))
+ ?line {'EXIT',{badarg,_}} = (catch binary_to_term_stress(Bin))
end || Bin <- Bins],
ok.
@@ -1394,3 +1399,52 @@ unaligned_sub_bin(Bin0, Offs) ->
Bin.
id(I) -> I.
+
+
+%% Stress binary_to_term with different initial reductions
+binary_to_term_stress(Bin) ->
+ binary_to_term_stress(Bin, no_opts).
+
+binary_to_term_stress(Bin, Opts) ->
+ Reds = get_reds(),
+ T = b2t(erlang:system_info(context_reductions),
+ Bin, Opts, catch_binary_to_term(Bin, Opts)),
+ set_reds(Reds),
+ T = case Opts of
+ no_opts -> binary_to_term(Bin);
+ _ -> binary_to_term(Bin,Opts)
+ end.
+
+catch_binary_to_term(Bin, no_opts) ->
+ try binary_to_term(Bin)
+ catch
+ error:badarg -> binary_to_term_throws_badarg
+ end;
+catch_binary_to_term(Bin, Opts) ->
+ try binary_to_term(Bin, Opts)
+ catch
+ error:badarg -> binary_to_term_throws_badarg
+ end.
+
+b2t(0, _Bin, _Opts, Term) ->
+ Term;
+b2t(Reds, Bin, Opts, Term) ->
+ set_reds(Reds),
+ Term = catch_binary_to_term(Bin,Opts),
+ b2t(Reds div 3, Bin, Opts, Term).
+
+set_reds(Reds) ->
+ try erts_debug:set_internal_state(reds_left, Reds)
+ catch
+ error:undef ->
+ erts_debug:set_internal_state(available_internal_state, true),
+ set_reds(Reds)
+ end.
+
+get_reds() ->
+ try erts_debug:get_internal_state(reds_left)
+ catch
+ error:undef ->
+ erts_debug:set_internal_state(available_internal_state, true),
+ get_reds()
+ end.
diff --git a/erts/emulator/test/fun_SUITE.erl b/erts/emulator/test/fun_SUITE.erl
index 36ba4e0f48..8ad5f290ed 100644
--- a/erts/emulator/test/fun_SUITE.erl
+++ b/erts/emulator/test/fun_SUITE.erl
@@ -262,6 +262,16 @@ equality(Config) when is_list(Config) ->
?line false = eq(FF2, FF4),
?line false = eq(FF3, FF4),
+ %% EEP37
+ H1 = fun Fact(N) when N > 0 -> N * Fact(N - 1); Fact(0) -> 1 end,
+ H2 = fun Pow(N, M) when M > 0 -> N * Pow(N, M - 1); Pow(_, 0) -> 1 end,
+ H1_copy = copy_term(H1),
+
+ true = eq(H1, H1),
+ true = eq(H1, H1_copy),
+ true = eq(H2, H2),
+ false = eq(H1, H2),
+
ok.
eq(X, X) -> true;
diff --git a/erts/preloaded/ebin/erlang.beam b/erts/preloaded/ebin/erlang.beam
index fb222cb64b..73fac27161 100644
--- a/erts/preloaded/ebin/erlang.beam
+++ b/erts/preloaded/ebin/erlang.beam
Binary files differ
diff --git a/erts/preloaded/ebin/erts_internal.beam b/erts/preloaded/ebin/erts_internal.beam
index 9ab806718b..12b36913a9 100644
--- a/erts/preloaded/ebin/erts_internal.beam
+++ b/erts/preloaded/ebin/erts_internal.beam
Binary files differ
diff --git a/erts/preloaded/src/erlang.erl b/erts/preloaded/src/erlang.erl
index a21da2ecc9..0ed677c3d8 100644
--- a/erts/preloaded/src/erlang.erl
+++ b/erts/preloaded/src/erlang.erl
@@ -362,15 +362,25 @@ binary_to_list(_Binary, _Start, _Stop) ->
%% binary_to_term/1
-spec binary_to_term(Binary) -> term() when
Binary :: ext_binary().
-binary_to_term(_Binary) ->
- erlang:nif_error(undefined).
+binary_to_term(Binary) ->
+ %% This BIF may throw badarg while trapping
+ try
+ erts_internal:binary_to_term(Binary)
+ catch
+ error:Reason -> erlang:error(Reason,[Binary])
+ end.
%% binary_to_term/2
-spec binary_to_term(Binary, Opts) -> term() when
Binary :: ext_binary(),
Opts :: [safe].
-binary_to_term(_Binary, _Opts) ->
- erlang:nif_error(undefined).
+binary_to_term(Binary, Opts) ->
+ %% This BIF may throw badarg while trapping
+ try
+ erts_internal:binary_to_term(Binary,Opts)
+ catch
+ error:Reason -> erlang:error(Reason,[Binary,Opts])
+ end.
%% bit_size/1
%% Shadowed by erl_bif_types: erlang:bit_size/1
diff --git a/erts/preloaded/src/erts_internal.erl b/erts/preloaded/src/erts_internal.erl
index c8e8e7e069..d6a185482e 100644
--- a/erts/preloaded/src/erts_internal.erl
+++ b/erts/preloaded/src/erts_internal.erl
@@ -29,7 +29,7 @@
-module(erts_internal).
-export([await_port_send_result/3]).
-
+-export([binary_to_term/1, binary_to_term/2]).
-export([port_command/3, port_connect/2, port_close/1,
port_control/3, port_call/3, port_info/1, port_info/2]).
@@ -160,3 +160,13 @@ request_system_task(_Pid, _Prio, _Request) ->
check_process_code(_Module, _OptionList) ->
erlang:nif_error(undefined).
+-spec binary_to_term(Binary) -> term() when
+ Binary :: binary().
+binary_to_term(_Binary) ->
+ erlang:nif_error(undefined).
+
+-spec binary_to_term(Binary, Opts) -> term() when
+ Binary :: binary(),
+ Opts :: [safe].
+binary_to_term(_Binary, _Opts) ->
+ erlang:nif_error(undefined).