38 files changed, 2768 insertions, 248 deletions
diff --git a/erts/doc/src/absform.xml b/erts/doc/src/absform.xml
index 55aef9f8ab..4acc03b133 100644
--- a/erts/doc/src/absform.xml
+++ b/erts/doc/src/absform.xml
@@ -290,6 +290,18 @@
       <item>If E is <c><![CDATA[fun Fc_1 ; ... ; Fc_k end]]></c> 
        where each <c><![CDATA[Fc_i]]></c> is a function clause then Rep(E) =
       <c><![CDATA[{'fun',LINE,{clauses,[Rep(Fc_1), ..., Rep(Fc_k)]}}]]></c>.</item>
+      <item>If E is <c><![CDATA[fun Name Fc_1 ; ... ; Name Fc_k end]]></c>
+       where <c><![CDATA[Name]]></c> is a variable and each
+       <c><![CDATA[Fc_i]]></c> is a function clause then Rep(E) =
+       <c><![CDATA[{named_fun,LINE,Name,[Rep(Fc_1), ..., Rep(Fc_k)]}]]></c>.
+       </item>
+      <item>If E is <c><![CDATA[query [E_0 || W_1, ..., W_k] end]]></c>,
+       where each <c><![CDATA[W_i]]></c> is a generator or a filter, then
+       Rep(E) = <c><![CDATA[{'query',LINE,{lc,LINE,Rep(E_0),[Rep(W_1), ..., Rep(W_k)]}}]]></c>.
+       For Rep(W), see below.</item>
+      <item>If E is <c><![CDATA[E_0.Field]]></c>, a Mnesia record access
+       inside a query, then
+       Rep(E) = <c><![CDATA[{record_field,LINE,Rep(E_0),Rep(Field)}]]></c>.</item>
       <item>If E is <c><![CDATA[( E_0 )]]></c>, then
        Rep(E) = <c><![CDATA[Rep(E_0)]]></c>,
        i.e., parenthesized expressions cannot be distinguished from their bodies.</item>
diff --git a/erts/doc/src/erlc.xml b/erts/doc/src/erlc.xml
index 10cab344b0..c3fc3b1686 100644
--- a/erts/doc/src/erlc.xml
+++ b/erts/doc/src/erlc.xml
@@ -234,6 +234,16 @@ erlc +export_all file.erl</pre>
           from the shell.</p>
         <p>Supported options: -I, -o, -D, -v, -W, -b.</p>
       </item>
+      <tag>.S</tag>
+      <item>
+        <p>Erlang assembler source code. It generates a <c><![CDATA[.beam]]></c> file.</p>
+        <p>Supported options: same as for .erl.</p>
+      </item>
+      <tag>.core</tag>
+      <item>
+        <p>Core Erlang source code. It generates a <c><![CDATA[.beam]]></c> file.</p>
+        <p>Supported options: same as for .erl.</p>
+      </item>
       <tag>.yrl</tag>
       <item>
         <p>Yecc source code. It generates an <c><![CDATA[.erl]]></c> file.</p>
diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names
index eee4badfb8..96547ba743 100644
--- a/erts/emulator/beam/atom.names
+++ b/erts/emulator/beam/atom.names
@@ -116,6 +116,7 @@ atom binary_longest_prefix_trap
 atom binary_longest_suffix_trap
 atom binary_match_trap
 atom binary_matches_trap
+atom binary_to_term_trap
 atom block
 atom blocked
 atom bm
diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab
index dd50df636c..3ec534f0bc 100644
--- a/erts/emulator/beam/bif.tab
+++ b/erts/emulator/beam/bif.tab
@@ -45,7 +45,6 @@ bif erlang:apply/3
 bif erlang:atom_to_list/1
 bif erlang:binary_to_list/1
 bif erlang:binary_to_list/3
-bif erlang:binary_to_term/1	
 bif erlang:crc32/1
 bif erlang:crc32/2
 bif erlang:crc32_combine/3
@@ -152,6 +151,8 @@ bif erts_internal:port_command/3
 bif erts_internal:port_control/3
 bif erts_internal:port_close/1
 bif erts_internal:port_connect/2
+bif erts_internal:binary_to_term/1
+bif erts_internal:binary_to_term/2
 
 bif erts_internal:request_system_task/3
 bif erts_internal:check_process_code/2
@@ -479,11 +480,6 @@ bif erlang:call_on_load_function/1
 bif erlang:finish_after_on_load/2
 
 #
-# New Bifs in R13B4
-#
-bif erlang:binary_to_term/2
-
-#
 # The binary match bifs (New in R14A - EEP9)
 #
 
diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c
index 1c3e955f47..1728b200f7 100644
--- a/erts/emulator/beam/erl_bif_ddll.c
+++ b/erts/emulator/beam/erl_bif_ddll.c
@@ -182,7 +182,7 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3)
     Eterm name_term = BIF_ARG_2;
     Eterm options = BIF_ARG_3;
     char *path = NULL;
-    ErlDrvSizeT path_len;
+    Sint path_len;
     char *name = NULL;
     DE_Handle *dh;
     erts_driver_t *drv;
@@ -198,6 +198,7 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3)
     int kill_ports = 0;
     int do_build_load_error = 0;
     int build_this_load_error = 0;
+    int encoding;
 
     for(l = options; is_list(l); l =  CDR(list_val(l))) {
 	Eterm opt = CAR(list_val(l));
@@ -257,18 +258,23 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3)
 	goto error;
     }
 
-    if (erts_iolist_size(path_term, &path_len)) {
-	goto error;
+    encoding = erts_get_native_filename_encoding();
+    if (encoding == ERL_FILENAME_WIN_WCHAR) {
+        /* Do not convert the lib name to utf-16le yet, do that in win32 specific code */
+        /* since lib_name is used in error messages */
+        encoding = ERL_FILENAME_UTF8;
     }
-    path = erts_alloc(ERTS_ALC_T_DDLL_TMP_BUF, path_len + 1 /* might need path separator */ + sys_strlen(name) + 1);
-    if (erts_iolist_to_buf(path_term, path, path_len) != 0) {
+    path = erts_convert_filename_to_encoding(path_term, NULL, 0,
+					     ERTS_ALC_T_DDLL_TMP_BUF, 1, 0,
+					     encoding, &path_len,
+					     sys_strlen(name) + 2); /* might need path separator */
+    if (!path) {
 	goto error;
     }
-    while (path_len > 0 && (path[path_len-1] == '\\' || path[path_len-1] == '/')) {
-	--path_len;
-    }
+    ASSERT(path_len > 0 && path[path_len-1] == 0);
+    while (--path_len > 0 && (path[path_len-1] == '\\' || path[path_len-1] == '/'))
+	;
     path[path_len++] = '/';
-    /*path[path_len] = '\0';*/
     sys_strcpy(path+path_len,name);
 
 #if DDLL_SMP
@@ -1524,7 +1530,7 @@ static int do_load_driver_entry(DE_Handle *dh, char *path, char *name)
 
     assert_drv_list_rwlocked();
 
-    if ((res =  erts_sys_ddll_open(path, &(dh->handle))) != ERL_DE_NO_ERROR) {
+    if ((res =  erts_sys_ddll_open(path, &(dh->handle), NULL)) != ERL_DE_NO_ERROR) {
 	return res;
     }
     
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index 2b40f9272d..414ae2f046 100755
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -3613,8 +3613,9 @@ BIF_RETTYPE erts_debug_set_internal_state_2(BIF_ALIST_2)
 	    default: BIF_ERROR(BIF_P, BADARG); break;
 	    }
  
-	    res = erts_set_gc_state(BIF_P, enable);
-	    BIF_RET(res ? am_true : am_false);
+            res = (BIF_P->flags & F_DISABLE_GC) ? am_false : am_true;
+	    erts_set_gc_state(BIF_P, enable);
+	    BIF_RET(res);
 	}
 	else if (ERTS_IS_ATOM_STR("send_fake_exit_signal", BIF_ARG_1)) {
 	    /* Used by signal_SUITE (emulator) */
diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c
index 864349491a..f298422267 100644
--- a/erts/emulator/beam/erl_bif_port.c
+++ b/erts/emulator/beam/erl_bif_port.c
@@ -808,7 +808,7 @@ open_port(Process* p, Eterm name, Eterm settings, int *err_typep, int *err_nump)
 	    if (encoding == ERL_FILENAME_WIN_WCHAR) {
 		encoding = ERL_FILENAME_UTF8;
 	    }
-	    if ((name_buf = erts_convert_filename_to_encoding(name, NULL, 0, ERTS_ALC_T_TMP,0,1, encoding, NULL))
+	    if ((name_buf = erts_convert_filename_to_encoding(name, NULL, 0, ERTS_ALC_T_TMP,0,1, encoding, NULL, 0))
 		== NULL) {
 		goto badarg;
 	    }
diff --git a/erts/emulator/beam/erl_bif_re.c b/erts/emulator/beam/erl_bif_re.c
index 99c31738a5..448c6f6f6d 100644
--- a/erts/emulator/beam/erl_bif_re.c
+++ b/erts/emulator/beam/erl_bif_re.c
@@ -1196,8 +1196,8 @@ re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3)
 
 	ovsize = 3*(unsigned_val(tp[2])+1);
 	code_size = binary_size(tp[5]);
-	if ((code_tmp = (const pcre *) 
-	     erts_get_aligned_binary_bytes(tp[5], &temp_alloc)) == NULL) {
+	code_tmp = (const pcre *) erts_get_aligned_binary_bytes(tp[5], &temp_alloc);
+	if (code_tmp == NULL || code_size < 4) {
 	    erts_free_aligned_binary_bytes(temp_alloc);
 	    BIF_ERROR(p, BADARG);
 	}
diff --git a/erts/emulator/beam/erl_debug.c b/erts/emulator/beam/erl_debug.c
index dc79d45be7..873a9860da 100644
--- a/erts/emulator/beam/erl_debug.c
+++ b/erts/emulator/beam/erl_debug.c
@@ -1,7 +1,7 @@
 /*
  * %CopyrightBegin%
  *
- * Copyright Ericsson AB 1998-2012. All Rights Reserved.
+ * Copyright Ericsson AB 1998-2013. All Rights Reserved.
  *
  * The contents of this file are subject to the Erlang Public License,
  * Version 1.1, (the "License"); you may not use this file except in
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index 8c4fffa75b..1af80dd04b 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -553,8 +553,8 @@ void erts_usage(void)
     erts_fprintf(stderr, "            numbers is %d\n",
 		 ERTS_MAX_NO_OF_SCHEDULERS);
     erts_fprintf(stderr, "-SP p1:p2   specify schedulers (p1) and schedulers online (p2)\n");
-    erts_fprintf(stderr, "	      as percentages of logical processors configured and logical\n");
-    erts_fprintf(stderr, "	      processors available, respectively\n");
+    erts_fprintf(stderr, "            as percentages of logical processors configured and logical\n");
+    erts_fprintf(stderr, "            processors available, respectively\n");
     erts_fprintf(stderr, "-t size     set the maximum number of atoms the "
 			 "emulator can handle\n");
     erts_fprintf(stderr, "            valid range is [%d-%d]\n",
diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c
index e87959f0ab..dc285b3cf7 100644
--- a/erts/emulator/beam/erl_nif.c
+++ b/erts/emulator/beam/erl_nif.c
@@ -1407,7 +1407,7 @@ void* enif_dlopen(const char* lib,
     ErtsSysDdllError errdesc = ERTS_SYS_DDLL_ERROR_INIT;
     void* handle;
     void* init_func;
-    if (erts_sys_ddll_open2(lib, &handle, &errdesc) == ERL_DE_NO_ERROR) {
+    if (erts_sys_ddll_open(lib, &handle, &errdesc) == ERL_DE_NO_ERROR) {
 	if (erts_sys_ddll_load_nif_init(handle, &init_func, &errdesc) == ERL_DE_NO_ERROR) {
 	    erts_sys_ddll_call_nif_init(init_func);
 	}
@@ -1587,7 +1587,8 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
         encoding = ERL_FILENAME_UTF8;
     }
     lib_name = erts_convert_filename_to_encoding(BIF_ARG_1, NULL, 0,
-                                                 ERTS_ALC_T_TMP, 1, 0, encoding, NULL);
+                                                 ERTS_ALC_T_TMP, 1, 0, encoding,
+						 NULL, 0);
     if (!lib_name) {
 	BIF_ERROR(BIF_P, BADARG);
     }
@@ -1626,7 +1627,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2)
 			     "module '%T' not allowed", mod_atom);
     }    
     else if (init_func == NULL &&
-	     (err=erts_sys_ddll_open2(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) {
+	     (err=erts_sys_ddll_open(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) {
 	const char slogan[] = "Failed to load NIF library";
 	if (strstr(errdesc.str, lib_name) != NULL) {
 	    ret = load_nif_error(BIF_P, "load_failed", "%s: '%s'", slogan, errdesc.str);
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 0a41fb596d..21fd8dd50a 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -8271,25 +8271,22 @@ save_gc_task(Process *c_p, ErtsProcSysTask *st, int prio)
 int
 erts_set_gc_state(Process *c_p, int enable)
 {
-    int res;
     ErtsProcSysTaskQs *dgc_tsk_qs;
     ASSERT(c_p == erts_get_current_process());
     ASSERT((ERTS_PSFLG_RUNNING|ERTS_PSFLG_RUNNING_SYS)
 	   & erts_smp_atomic32_read_nob(&c_p->state));
     ERTS_SMP_LC_ASSERT(ERTS_PROC_LOCK_MAIN == erts_proc_lc_my_proc_locks(c_p));
 
-    res = !(c_p->flags & F_DISABLE_GC);
-
     if (!enable) {
 	c_p->flags |= F_DISABLE_GC;
-	return res;
+	return 0;
     }
 
     c_p->flags &= ~F_DISABLE_GC;
 
     dgc_tsk_qs = ERTS_PROC_GET_DELAYED_GC_TASK_QS(c_p);
     if (!dgc_tsk_qs)
-	return res;
+	return 0;
 
     /* Move delayed gc tasks into sys tasks queues. */
 
@@ -8387,7 +8384,7 @@ erts_set_gc_state(Process *c_p, int enable)
     if (dgc_tsk_qs)
 	proc_sys_task_queues_free(dgc_tsk_qs);
 
-    return res;
+    return 1;
 }
 
 void
diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c
index 7e3c6681d9..3a968594f3 100644
--- a/erts/emulator/beam/erl_unicode.c
+++ b/erts/emulator/beam/erl_unicode.c
@@ -1990,12 +1990,14 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf, size_t statbuf_
 {
     int encoding = erts_get_native_filename_encoding();
     return erts_convert_filename_to_encoding(name, statbuf, statbuf_size, alloc_type,
-					     allow_empty, allow_atom, encoding, used);
+					     allow_empty, allow_atom, encoding,
+					     used, 0);
 }
 
 char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbuf_size,
 					ErtsAlcType_t alloc_type, int allow_empty,
-					int allow_atom, int encoding, Sint *used)
+					int allow_atom, int encoding, Sint *used,
+					Uint extra)
 {
     char* name_buf = NULL;
 
@@ -2008,13 +2010,14 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu
 	}
 	if (encoding == ERL_FILENAME_WIN_WCHAR) {
 	    need += 2;
+	    extra *= 2;
 	} else {
 	    ++need;
 	}
 	if (used) 
 	    *used = (Sint) need;
-	if (need > statbuf_size) {
-	    name_buf = (char *) erts_alloc(alloc_type, need);
+	if (need+extra > statbuf_size) {
+	    name_buf = (char *) erts_alloc(alloc_type, need+extra);
 	} else {
 	    name_buf = statbuf;
 	}
@@ -2035,8 +2038,8 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu
 	    /*Add 0 termination only*/
 	    if (used) 
 		*used = (Sint) size+1;
-	    if (size+1 > statbuf_size) {
-		name_buf = (char *) erts_alloc(alloc_type, size+1);
+	    if (size+1+extra > statbuf_size) {
+		name_buf = (char *) erts_alloc(alloc_type, size+1+extra);
 	    } else {
 		name_buf = statbuf;
 	    }
@@ -2045,7 +2048,7 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu
 	} else {
             name_buf = erts_convert_filename_to_wchar(bytes, size,
                                                       statbuf, statbuf_size,
-                                                      alloc_type, used, 0);
+                                                      alloc_type, used, extra);
         }
 	erts_free_aligned_binary_bytes(temp_alloc);
     } else {
diff --git a/erts/emulator/beam/erl_zlib.c b/erts/emulator/beam/erl_zlib.c
index 47fd92988e..8e33144f96 100644
--- a/erts/emulator/beam/erl_zlib.c
+++ b/erts/emulator/beam/erl_zlib.c
@@ -87,6 +87,46 @@ int ZEXPORT erl_zlib_deflate_finish(z_stream *streamp)
     return deflateEnd(streamp);
 }
 
+int ZEXPORT erl_zlib_inflate_start(z_stream *streamp, const Bytef* source,
+                                   uLong sourceLen)
+{
+    streamp->next_in = (Bytef*)source;
+    streamp->avail_in = (uInt)sourceLen;
+    streamp->total_out = streamp->avail_out = 0;
+    streamp->next_out = NULL;
+    erl_zlib_alloc_init(streamp);
+    return inflateInit(streamp);
+}
+/*
+ * Inflate a chunk. The destination length is the limit.
+ * Returns Z_OK if more to process, Z_STREAM_END if we are done.
+ */
+int ZEXPORT erl_zlib_inflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen)
+{
+    int err;
+    uLongf last_tot = streamp->total_out;
+
+    streamp->next_out = dest;
+    streamp->avail_out = (uInt)*destLen;
+
+    if ((uLong)streamp->avail_out != *destLen) return Z_BUF_ERROR;
+
+    err = inflate(streamp, Z_NO_FLUSH);
+    ASSERT(err != Z_STREAM_ERROR);
+    *destLen = streamp->total_out - last_tot;
+    return err;
+}
+
+/*
+ * When we are done, free up the inflate structure
+ * Returns Z_OK or an error code
+ */
+int ZEXPORT erl_zlib_inflate_finish(z_stream *streamp)
+{
+    return inflateEnd(streamp);
+}
+
+
 int ZEXPORT erl_zlib_compress2 (Bytef* dest, uLongf* destLen,
 				const Bytef* source, uLong sourceLen,
 				int level)
diff --git a/erts/emulator/beam/erl_zlib.h b/erts/emulator/beam/erl_zlib.h
index 5ac849d21c..160166c66b 100644
--- a/erts/emulator/beam/erl_zlib.h
+++ b/erts/emulator/beam/erl_zlib.h
@@ -39,6 +39,12 @@ int ZEXPORT erl_zlib_deflate_start(z_stream *streamp, const Bytef* source,
 int ZEXPORT erl_zlib_deflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen);
 int ZEXPORT erl_zlib_deflate_finish(z_stream *streamp);
 
+int ZEXPORT erl_zlib_inflate_start(z_stream *streamp, const Bytef* source,
+                                   uLong sourceLen);
+int ZEXPORT erl_zlib_inflate_chunk(z_stream *streamp, Bytef* dest, uLongf* destLen);
+int ZEXPORT erl_zlib_inflate_finish(z_stream *streamp);
+
+
 /* Use instead of compress
 */
 #define erl_zlib_compress(dest,destLen,source,sourceLen) \
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 22b0a02937..2cb44a5b64 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -61,6 +61,9 @@
  */
 #    define ERTS_DEBUG_USE_DIST_SEP
 #  endif
+#  define IF_DEBUG(X) X
+#else
+#  define IF_DEBUG(X)
 #endif
 
 /* Does Sint fit in Sint32?
@@ -89,10 +92,11 @@ static int enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep,
 static Uint is_external_string(Eterm obj, int* p_is_string);
 static byte* enc_atom(ErtsAtomCacheMap *, Eterm, byte*, Uint32);
 static byte* enc_pid(ErtsAtomCacheMap *, Eterm, byte*, Uint32);
-static byte* dec_term(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*);
+struct B2TContext_t;
+static byte* dec_term(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*, struct B2TContext_t*);
 static byte* dec_atom(ErtsDistExternal *, byte*, Eterm*);
 static byte* dec_pid(ErtsDistExternal *, Eterm**, byte*, ErlOffHeap*, Eterm*);
-static Sint decoded_size(byte *ep, byte* endp, int internal_tags);
+static Sint decoded_size(byte *ep, byte* endp, int internal_tags, struct B2TContext_t*);
 static BIF_RETTYPE term_to_binary_trap_1(BIF_ALIST_1);
 
 static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint flags, 
@@ -102,11 +106,19 @@ static Uint encode_size_struct2(ErtsAtomCacheMap *, Eterm, unsigned);
 static int encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj, 
 				  unsigned dflags, Sint *reds, Uint *res);
 
+static Export binary_to_term_trap_export;
+static BIF_RETTYPE binary_to_term_trap_1(BIF_ALIST_1);
+static Eterm binary_to_term_int(Process* p, Uint32 flags, Eterm bin, Binary* context_b);
+
 void erts_init_external(void) {
 #if 1 /* In R16 */
     erts_init_trap_export(&term_to_binary_trap_export,
 			  am_erlang, am_term_to_binary_trap, 1,
 			  &term_to_binary_trap_1);
+
+    erts_init_trap_export(&binary_to_term_trap_export,
+			  am_erlang, am_binary_to_term_trap, 1,
+			  &binary_to_term_trap_1);
 #else
     sys_memset((void *) &term_to_binary_trap_export, 0, sizeof(Export));
     term_to_binary_trap_export.address = &term_to_binary_trap_export.code[3];
@@ -877,7 +889,7 @@ erts_decode_dist_ext_size(ErtsDistExternal *edep)
 	    goto fail;
 	ep = edep->extp+1;
     }
-    res = decoded_size(ep, edep->ext_endp, 0);
+    res = decoded_size(ep, edep->ext_endp, 0, NULL);
     if (res >= 0)
 	return res;
  fail:
@@ -889,12 +901,12 @@ Sint erts_decode_ext_size(byte *ext, Uint size)
 {
     if (size == 0 || *ext != VERSION_MAGIC)
 	return -1;
-    return decoded_size(ext+1, ext+size, 0);
+    return decoded_size(ext+1, ext+size, 0, NULL);
 }
 
 Sint erts_decode_ext_size_ets(byte *ext, Uint size)
 {
-    Sint sz = decoded_size(ext, ext+size, 1);
+    Sint sz = decoded_size(ext, ext+size, 1, NULL);
     ASSERT(sz >= 0);
     return sz;
 }
@@ -927,7 +939,7 @@ erts_decode_dist_ext(Eterm** hpp,
 	    goto error;
 	ep++;
     }
-    ep = dec_term(edep, hpp, ep, off_heap, &obj);
+    ep = dec_term(edep, hpp, ep, off_heap, &obj, NULL);
     if (!ep)
 	goto error;
 
@@ -948,7 +960,7 @@ Eterm erts_decode_ext(Eterm **hpp, ErlOffHeap *off_heap, byte **ext)
     byte *ep = *ext;
     if (*ep++ != VERSION_MAGIC)
 	return THE_NON_VALUE;
-    ep = dec_term(NULL, hpp, ep, off_heap, &obj);
+    ep = dec_term(NULL, hpp, ep, off_heap, &obj, NULL);
     if (!ep) {
 #ifdef DEBUG
 	bin_write(ERTS_PRINT_STDERR,NULL,*ext,500);
@@ -962,7 +974,7 @@ Eterm erts_decode_ext(Eterm **hpp, ErlOffHeap *off_heap, byte **ext)
 Eterm erts_decode_ext_ets(Eterm **hpp, ErlOffHeap *off_heap, byte *ext)
 {
     Eterm obj;
-    ext = dec_term(NULL, hpp, ext, off_heap, &obj);
+    ext = dec_term(NULL, hpp, ext, off_heap, &obj, NULL);
     ASSERT(ext);
     return obj;
 }
@@ -1043,9 +1055,14 @@ static BIF_RETTYPE term_to_binary_trap_1(BIF_ALIST_1)
     Binary *bin = ((ProcBin *) binary_val(bt))->val;
     Eterm res = erts_term_to_binary_int(BIF_P, Term, 0, 0,bin);
     if (is_tuple(res)) {
+	ASSERT(BIF_P->flags & F_DISABLE_GC);
 	BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res);
     } else {
-	BIF_RET(res);
+        if (erts_set_gc_state(BIF_P, 1)
+            || MSO(BIF_P).overhead > BIN_VHEAP_SZ(BIF_P))
+            ERTS_BIF_YIELD_RETURN(BIF_P, res);
+        else
+            BIF_RET(res);
     }
 }
     
@@ -1053,8 +1070,10 @@ BIF_RETTYPE term_to_binary_1(BIF_ALIST_1)
 {
     Eterm res = erts_term_to_binary_int(BIF_P, BIF_ARG_1, 0, TERM_TO_BINARY_DFLAGS, NULL);
     if (is_tuple(res)) {
+	erts_set_gc_state(BIF_P, 0);
 	BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res);
     } else {
+	ASSERT(!(BIF_P->flags & F_DISABLE_GC));
 	BIF_RET(res);
     }
 }
@@ -1106,12 +1125,72 @@ BIF_RETTYPE term_to_binary_2(BIF_ALIST_2)
 
     res = erts_term_to_binary_int(p, Term, level, flags, bin);
     if (is_tuple(res)) {
+	erts_set_gc_state(p, 0);
 	BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res);
     } else {
+	ASSERT(!(BIF_P->flags & F_DISABLE_GC));
 	BIF_RET(res);
     }
 }
 
+
+enum B2TState { /* order is somewhat significant */
+    B2TPrepare,
+    B2TUncompressChunk,
+    B2TSizeInit,
+    B2TSize,
+    B2TDecodeInit,
+    B2TDecode,
+    B2TDecodeList,
+    B2TDecodeTuple,
+    B2TDecodeString,
+    B2TDecodeBinary,
+
+    B2TDone,
+    B2TDecodeFail,
+    B2TBadArg
+};
+
+typedef struct {
+    int heap_size;
+    int terms;
+    byte* ep;
+    int atom_extra_skip;
+} B2TSizeContext;
+
+typedef struct {
+    byte*  ep;
+    Eterm  res;
+    Eterm* next;
+    Eterm* hp_start;
+    Eterm* hp;
+    Eterm* hp_end;
+    int remaining_n;
+    char* remaining_bytes;
+} B2TDecodeContext;
+
+typedef struct {
+    z_stream stream;
+    byte* dbytes;
+    Uint dleft;
+} B2TUncompressContext;
+
+typedef struct B2TContext_t {
+    Sint heap_size;
+    byte* aligned_alloc;
+    ErtsBinary2TermState b2ts;
+    Uint32 flags;
+    SWord reds;
+    Eterm trap_bin;
+    enum B2TState state;
+    union {
+	B2TSizeContext sc;
+	B2TDecodeContext dc;
+	B2TUncompressContext uc;
+    } u;
+} B2TContext;
+
+
 static uLongf binary2term_uncomp_size(byte* data, Sint size)
 {
     z_stream stream;
@@ -1141,48 +1220,62 @@ static uLongf binary2term_uncomp_size(byte* data, Sint size)
     return err == Z_STREAM_END ? uncomp_size : 0;
 }
 
-static ERTS_INLINE Sint
-binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size)
+static ERTS_INLINE int
+binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size,
+		    B2TContext* ctx)
 {
-    Sint res;
     byte *bytes = data;
     Sint size = data_size;
 
     state->exttmp = 0;
 
     if (size < 1 || *bytes != VERSION_MAGIC) {
-    error:
-	if (state->exttmp)
-	    erts_free(ERTS_ALC_T_TMP, state->extp);
-	state->extp = NULL;
-	state->exttmp = 0;
 	return -1;
     }
     bytes++;
     size--;
     if (size < 5 || *bytes != COMPRESSED) {
 	state->extp = bytes;
+        if (ctx)
+	    ctx->state = B2TSizeInit;
     }
     else  {
 	uLongf dest_len = (Uint32) get_int32(bytes+1);
 	bytes += 5;
 	size -= 5;	
 	if (dest_len > 32*1024*1024
-	    || (state->extp = erts_alloc_fnf(ERTS_ALC_T_TMP, dest_len)) == NULL) {
+	    || (state->extp = erts_alloc_fnf(ERTS_ALC_T_EXT_TERM_DATA, dest_len)) == NULL) {
+            /*
+             * Try to avoid out-of-memory crash due to corrupted 'dest_len'
+             * by checking the actual length of the uncompressed data.
+             * The only way to do that is to uncompress it. Sad but true.
+             */
 	    if (dest_len != binary2term_uncomp_size(bytes, size)) {
-		goto error;
+                return -1;
 	    }
-	    state->extp = erts_alloc(ERTS_ALC_T_TMP, dest_len);
+	    state->extp = erts_alloc(ERTS_ALC_T_EXT_TERM_DATA, dest_len);
+            if (ctx) ctx->reds -= dest_len; /* ctx is NULL when called via erts_binary2term_prepare() */
 	}
 	state->exttmp = 1;
-	if (erl_zlib_uncompress(state->extp, &dest_len, bytes, size) != Z_OK)
-	    goto error;
+        if (ctx) {
+	    if (erl_zlib_inflate_start(&ctx->u.uc.stream, bytes, size) != Z_OK)
+		return -1;
+
+	    ctx->u.uc.dbytes = state->extp;
+	    ctx->u.uc.dleft = dest_len;
+	    ctx->state = B2TUncompressChunk;
+        }
+	else {
+	    uLongf dlen = dest_len;
+	    if (erl_zlib_uncompress(state->extp, &dlen, bytes, size) != Z_OK
+		|| dlen != dest_len) {
+		return -1;
+	    }
+        }
 	size = (Sint) dest_len;
     }
-    res = decoded_size(state->extp, state->extp + size, 0);
-    if (res < 0)
-	goto error;
-    return res;
+    state->extsize = size;
+    return 0;
 }
 
 static ERTS_INLINE void
@@ -1190,7 +1283,7 @@ binary2term_abort(ErtsBinary2TermState *state)
 {
     if (state->exttmp) {
 	state->exttmp = 0;
-	erts_free(ERTS_ALC_T_TMP, state->extp);
+	erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp);
     }
 }
 
@@ -1198,11 +1291,11 @@ static ERTS_INLINE Eterm
 binary2term_create(ErtsDistExternal *edep, ErtsBinary2TermState *state, Eterm **hpp, ErlOffHeap *ohp)
 {
     Eterm res;
-    if (!dec_term(edep, hpp, state->extp, ohp, &res))
+    if (!dec_term(edep, hpp, state->extp, ohp, &res, NULL))
 	res = THE_NON_VALUE;
     if (state->exttmp) {
 	state->exttmp = 0;
-	erts_free(ERTS_ALC_T_TMP, state->extp);
+	erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp);
     }
     return res;
 }
@@ -1210,7 +1303,18 @@ binary2term_create(ErtsDistExternal *edep, ErtsBinary2TermState *state, Eterm **
 Sint
 erts_binary2term_prepare(ErtsBinary2TermState *state, byte *data, Sint data_size)
 {
-    return binary2term_prepare(state, data, data_size);
+    Sint res;
+
+    if (binary2term_prepare(state, data, data_size, NULL) < 0 ||
+        (res=decoded_size(state->extp, state->extp + state->extsize, 0, NULL)) < 0) {
+
+        if (state->exttmp)
+            erts_free(ERTS_ALC_T_EXT_TERM_DATA, state->extp);
+        state->extp = NULL;
+	state->exttmp = 0;
+	return -1;
+    }
+    return res;
 }
 
 void
@@ -1225,68 +1329,233 @@ erts_binary2term_create(ErtsBinary2TermState *state, Eterm **hpp, ErlOffHeap *oh
     return binary2term_create(NULL,state, hpp, ohp);
 }
 
-BIF_RETTYPE binary_to_term_1(BIF_ALIST_1)
+static void b2t_destroy_context(B2TContext* context)
 {
-    Sint heap_size;
-    Eterm res;
+    erts_free_aligned_binary_bytes_extra(context->aligned_alloc,
+                                         ERTS_ALC_T_EXT_TERM_DATA);
+    context->aligned_alloc = NULL;
+    binary2term_abort(&context->b2ts);
+    if (context->state == B2TUncompressChunk) {
+	erl_zlib_inflate_finish(&context->u.uc.stream);
+    }
+}
+
+static void b2t_context_destructor(Binary *context_bin)
+{
+    B2TContext* ctx = (B2TContext*) ERTS_MAGIC_BIN_DATA(context_bin);
+    ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(context_bin) == b2t_context_destructor);
+
+    b2t_destroy_context(ctx);
+}
+
+static BIF_RETTYPE binary_to_term_trap_1(BIF_ALIST_1)
+{
+    Binary *context_bin = ((ProcBin *) binary_val(BIF_ARG_1))->val;
+    ASSERT(ERTS_MAGIC_BIN_DESTRUCTOR(context_bin) == b2t_context_destructor);
+
+    return binary_to_term_int(BIF_P, 0, THE_NON_VALUE, context_bin);
+}
+
+
+#define B2T_BYTES_PER_REDUCTION 128
+#define B2T_MEMCPY_FACTOR 8
+
+/* Define for testing */
+/*#define EXTREME_B2T_TRAPPING 1*/
+
+#ifdef EXTREME_B2T_TRAPPING
+static unsigned b2t_rand(void)
+{
+    static unsigned prev = 17;
+    prev = (prev * 214013 + 2531011);
+    return prev;
+}
+#endif
+
+
+static B2TContext* b2t_export_context(Process* p, B2TContext* src)
+{
+    Binary* context_b = erts_create_magic_binary(sizeof(B2TContext),
+                                                 b2t_context_destructor);
+    B2TContext* ctx = ERTS_MAGIC_BIN_DATA(context_b);
     Eterm* hp;
-    Eterm* endp;
-    Sint size;
-    byte* bytes;
-    byte* temp_alloc = NULL;
-    ErtsBinary2TermState b2ts;
+    sys_memcpy(ctx, src, sizeof(B2TContext));
+    if (ctx->state >= B2TDecode && ctx->u.dc.next == &src->u.dc.res) {
+        ctx->u.dc.next = &ctx->u.dc.res;
+    }
+    hp = HAlloc(p, PROC_BIN_SIZE);
+    ctx->trap_bin = erts_mk_magic_binary_term(&hp, &MSO(p), context_b);
+    return ctx;
+}
 
-    if ((bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc)) == NULL) {
-    error:
-	erts_free_aligned_binary_bytes(temp_alloc);
-	BIF_ERROR(BIF_P, BADARG);
+static Eterm binary_to_term_int(Process* p, Uint32 flags, Eterm bin, Binary* context_b)
+{
+#ifdef EXTREME_B2T_TRAPPING
+    SWord initial_reds = 1 + b2t_rand() % 4;
+#else
+    SWord initial_reds = (Uint)(ERTS_BIF_REDS_LEFT(p) * B2T_BYTES_PER_REDUCTION);
+#endif
+    B2TContext c_buff;
+    B2TContext *ctx;
+    int is_first_call;
+
+    if (context_b == NULL) {
+	/* Setup enough to get started */
+        is_first_call = 1;
+        ctx = &c_buff;
+	ctx->state = B2TPrepare;
+        ctx->aligned_alloc = NULL;
+        ctx->flags = flags;
+        IF_DEBUG(ctx->trap_bin = THE_NON_VALUE;)
+    } else {
+        is_first_call = 0;
+	ctx = ERTS_MAGIC_BIN_DATA(context_b);
+        ASSERT(ctx->state != B2TPrepare);
     }
-    size = binary_size(BIF_ARG_1);
+    ctx->reds = initial_reds;
+
+    do {
+        switch (ctx->state) {
+        case B2TPrepare: {
+	    byte* bytes;
+            Uint bin_size;
+            bytes = erts_get_aligned_binary_bytes_extra(bin,
+                                                        &ctx->aligned_alloc,
+                                                        ERTS_ALC_T_EXT_TERM_DATA,
+                                                        0);
+            if (bytes == NULL) {
+                ctx->b2ts.exttmp = 0;
+                ctx->state = B2TBadArg;
+                break;
+            }
+            bin_size = binary_size(bin);
+            if (ctx->aligned_alloc) {
+                ctx->reds -= bin_size / 8;
+            }
+            if (binary2term_prepare(&ctx->b2ts, bytes, bin_size, ctx) < 0) {
+		ctx->state = B2TBadArg;
+	    }
+            break;
+        }
+	case B2TUncompressChunk: {
+            uLongf chunk = ctx->reds;
+            int zret;
+
+            if (chunk > ctx->u.uc.dleft)
+                chunk = ctx->u.uc.dleft;
+            zret = erl_zlib_inflate_chunk(&ctx->u.uc.stream,
+                                          ctx->u.uc.dbytes, &chunk);
+            ctx->u.uc.dbytes += chunk;
+            ctx->u.uc.dleft  -= chunk;
+            if (zret == Z_OK && ctx->u.uc.dleft > 0) {
+                ctx->reds = 0;
+            }
+            else if (erl_zlib_inflate_finish(&ctx->u.uc.stream) == Z_OK
+                     && zret == Z_STREAM_END
+                     && ctx->u.uc.dleft == 0) {
+                ctx->reds -= chunk;
+                ctx->state = B2TSizeInit;
+            }
+            else {
+                ctx->state = B2TBadArg;
+            }
+            break;
+        }
+	case B2TSizeInit:
+	    ctx->u.sc.ep = NULL;
+	    ctx->state = B2TSize;
+	    /*fall through*/
+        case B2TSize:
+            ctx->heap_size = decoded_size(ctx->b2ts.extp,
+					  ctx->b2ts.extp + ctx->b2ts.extsize,
+                                          0, ctx);
+            break;
+
+        case B2TDecodeInit:
+            if (ctx == &c_buff && ctx->b2ts.extsize > ctx->reds) {
+                /* dec_term will maybe trap, allocate space for magic bin
+                   before result term to make it easy to trim with HRelease.
+                 */
+                ctx = b2t_export_context(p, &c_buff);
+            }
+            ctx->u.dc.ep = ctx->b2ts.extp;
+            ctx->u.dc.res = (Eterm) (UWord) NULL;
+            ctx->u.dc.next = &ctx->u.dc.res;
+            ctx->u.dc.hp_start = HAlloc(p, ctx->heap_size);
+            ctx->u.dc.hp       = ctx->u.dc.hp_start;
+            ctx->u.dc.hp_end   = ctx->u.dc.hp_start + ctx->heap_size;
+            ctx->state = B2TDecode;
+            /*fall through*/
+	case B2TDecode:
+        case B2TDecodeList:
+        case B2TDecodeTuple:
+        case B2TDecodeString:
+        case B2TDecodeBinary: {
+	    ErtsDistExternal fakedep;
+            fakedep.flags = ctx->flags;
+            dec_term(&fakedep, NULL, NULL, &MSO(p), NULL, ctx);
+            break;
+	}
+        case B2TDecodeFail:
+            HRelease(p, ctx->u.dc.hp_end, ctx->u.dc.hp_start);
+            /*fall through*/
+        case B2TBadArg:
+            b2t_destroy_context(ctx);
+            if (!is_first_call) {
+                erts_set_gc_state(p, 1);
+            }
+            BUMP_REDS(p, (initial_reds - ctx->reds) / B2T_BYTES_PER_REDUCTION);
+            BIF_ERROR(p, BADARG & ~EXF_SAVETRACE);
 
-    heap_size = binary2term_prepare(&b2ts, bytes, size);
-    if (heap_size < 0)
-	goto error;
+        case B2TDone:
+            b2t_destroy_context(ctx);
 
-    hp = HAlloc(BIF_P, heap_size);
-    endp = hp + heap_size;
+            if (ctx->u.dc.hp > ctx->u.dc.hp_end) {
+                erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n",
+                         __FILE__, __LINE__, ctx->u.dc.hp - ctx->u.dc.hp_end);
+            }
+            HRelease(p, ctx->u.dc.hp_end, ctx->u.dc.hp);
 
-    res = binary2term_create(NULL, &b2ts, &hp, &MSO(BIF_P));
+            if (!is_first_call) {
+                erts_set_gc_state(p, 1);
+            }
+            BUMP_REDS(p, (initial_reds - ctx->reds) / B2T_BYTES_PER_REDUCTION);
+            return ctx->u.dc.res;
 
-    erts_free_aligned_binary_bytes(temp_alloc);
+        default:
+            ASSERT(!"Unknown state in binary_to_term");
+        }
+    }while (ctx->reds > 0 || ctx->state >= B2TDone);
 
-    if (hp > endp) {
-	erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n",
-		 __FILE__, __LINE__, hp-endp);
+    if (ctx == &c_buff) {
+        ASSERT(ctx->trap_bin == THE_NON_VALUE);
+        ctx = b2t_export_context(p, &c_buff);
     }
+    ASSERT(ctx->trap_bin != THE_NON_VALUE);
 
-    HRelease(BIF_P, endp, hp);
-
-    if (res == THE_NON_VALUE)
-	goto error;
+    if (is_first_call) {
+        erts_set_gc_state(p, 0);
+    }
+    BUMP_ALL_REDS(p);
+    BIF_TRAP1(&binary_to_term_trap_export, p, ctx->trap_bin);
+}
 
-    return res;
+BIF_RETTYPE erts_internal_binary_to_term_1(BIF_ALIST_1)
+{
+    return binary_to_term_int(BIF_P, 0, BIF_ARG_1, NULL);
 }
 
-BIF_RETTYPE binary_to_term_2(BIF_ALIST_2)
+BIF_RETTYPE erts_internal_binary_to_term_2(BIF_ALIST_2)
 {
-    Sint heap_size;
-    Eterm res;
     Eterm opts;
     Eterm opt;
-    Eterm* hp;
-    Eterm* endp;
-    Sint size;
-    byte* bytes;
-    byte* temp_alloc = NULL;
-    ErtsBinary2TermState b2ts;
-    ErtsDistExternal fakedep;
+    Uint32 flags = 0;
 
-    fakedep.flags = 0;
     opts = BIF_ARG_2;
     while (is_list(opts)) {
         opt = CAR(list_val(opts));
         if (opt == am_safe) {
-	    fakedep.flags |= ERTS_DIST_EXT_BTT_SAFE;
+            flags |= ERTS_DIST_EXT_BTT_SAFE;
         }
 	else {
             goto error;
@@ -1297,35 +1566,10 @@ BIF_RETTYPE binary_to_term_2(BIF_ALIST_2)
     if (is_not_nil(opts))
         goto error;
 
-    if ((bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc)) == NULL) {
-    error:
-	erts_free_aligned_binary_bytes(temp_alloc);
-	BIF_ERROR(BIF_P, BADARG);
-    }
-    size = binary_size(BIF_ARG_1);
-
-    heap_size = binary2term_prepare(&b2ts, bytes, size);
-    if (heap_size < 0)
-	goto error;
-
-    hp = HAlloc(BIF_P, heap_size);
-    endp = hp + heap_size;
-
-    res = binary2term_create(&fakedep, &b2ts, &hp, &MSO(BIF_P));
-
-    erts_free_aligned_binary_bytes(temp_alloc);
-
-    if (hp > endp) {
-	erl_exit(1, ":%s, line %d: heap overrun by %d words(s)\n",
-		 __FILE__, __LINE__, hp-endp);
-    }
-
-    HRelease(BIF_P, endp, hp);
-
-    if (res == THE_NON_VALUE)
-	goto error;
+    return binary_to_term_int(BIF_P, flags, BIF_ARG_1, NULL);
 
-    return res;
+error:
+    BIF_ERROR(BIF_P, BADARG);
 }
 
 Eterm
@@ -1473,12 +1717,10 @@ erts_term_to_binary(Process* p, Eterm Term, int level, Uint flags) {
 /* #define EXTREME_TTB_TRAPPING 1 */
 
 #ifndef EXTREME_TTB_TRAPPING
-#define TERM_TO_BINARY_LOOP_FACTOR 500
-#define TERM_TO_BINARY_SIZE_FACTOR 500000
-#define TERM_TO_BINARY_COMPRESS_CHUNK 500000
+#define TERM_TO_BINARY_LOOP_FACTOR 32
+#define TERM_TO_BINARY_COMPRESS_CHUNK (1 << 18)
 #else
 #define TERM_TO_BINARY_LOOP_FACTOR 1
-#define TERM_TO_BINARY_SIZE_FACTOR 10
 #define TERM_TO_BINARY_COMPRESS_CHUNK 10
 #endif
 
@@ -1514,7 +1756,7 @@ typedef struct {
     } s;
 } TTBContext;
 
-static void context_destructor(Binary *context_bin) 
+static void ttb_context_destructor(Binary *context_bin)
 {
     TTBContext *context = ERTS_MAGIC_BIN_DATA(context_bin);
     if (context->alive) {
@@ -1567,7 +1809,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
     do {								\
 	if (context_b == NULL) {					\
 	    context_b = erts_create_magic_binary(sizeof(TTBContext),    \
-                                                 context_destructor);   \
+                                                 ttb_context_destructor);   \
 	    context =  ERTS_MAGIC_BIN_DATA(context_b);			\
 	    memcpy(context,&c_buff,sizeof(TTBContext));			\
 	}								\
@@ -1615,7 +1857,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
 		    /* Finish in one go */
 		    res = erts_term_to_binary_simple(p, Term, size, 
 						     level, flags);
-		    BUMP_REDS(p, size / TERM_TO_BINARY_SIZE_FACTOR);
+		    BUMP_REDS(p, 1);
 		    return res;
 		}
 
@@ -2604,21 +2846,112 @@ undo_offheap_in_area(ErlOffHeap* off_heap, Eterm* start, Eterm* end)
 #endif /* DEBUG */
 }
 
+
 /* Decode term from external format into *objp.
 ** On failure return NULL and (R13B04) *hpp will be unchanged.
 */
 static byte*
-dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Eterm* objp)
+dec_term(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap,
+         Eterm* objp, B2TContext* ctx)
 {
-    Eterm* hp_saved = *hpp;
+    Eterm* hp_saved;
     int n;
     ErtsAtomEncoding char_enc;
-    register Eterm* hp = *hpp;	/* Please don't take the address of hp */
-    Eterm* next = objp;
+    register Eterm* hp;        /* Please don't take the address of hp */
+    Eterm* next;
+    SWord reds;
+
+    if (ctx) {
+        hp_saved = ctx->u.dc.hp_start;
+        reds     = ctx->reds;
+        next     = ctx->u.dc.next;
+        ep       = ctx->u.dc.ep;
+        hpp      = &ctx->u.dc.hp;
+
+        if (ctx->state != B2TDecode) {
+            int n_limit = reds;
+
+	    n = ctx->u.dc.remaining_n;
+            if (ctx->state == B2TDecodeBinary) {
+                n_limit *= B2T_MEMCPY_FACTOR;
+                ASSERT(n_limit >= reds);
+		reds -= n / B2T_MEMCPY_FACTOR;
+            }
+	    else
+		reds -= n;
 
-    *next = (Eterm) (UWord) NULL;
+            if (n > n_limit) {
+                ctx->u.dc.remaining_n -= n_limit;
+                n = n_limit;
+                reds = 0;
+            }
+            else {
+                ctx->u.dc.remaining_n = 0;
+            }
+
+            switch (ctx->state) {
+            case B2TDecodeList:
+                objp = next - 2;
+                while (n > 0) {
+                    objp[0] = (Eterm) COMPRESS_POINTER(next);
+                    objp[1] = make_list(next);
+                    next = objp;
+                    objp -= 2;
+                    n--;
+                }
+                break;
+
+            case B2TDecodeTuple:
+                objp = next - 1;
+                while (n-- > 0) {
+                    objp[0] = (Eterm) COMPRESS_POINTER(next);
+                    next = objp;
+                    objp--;
+                }
+                break;
+
+            case B2TDecodeString:
+                hp = *hpp;
+                hp[-1] = make_list(hp);  /* overwrite the premature NIL */
+                while (n-- > 0) {
+                    hp[0] = make_small(*ep++);
+                    hp[1] = make_list(hp+2);
+                    hp += 2;
+                }
+                hp[-1] = NIL;
+                *hpp = hp;
+                break;
+
+            case B2TDecodeBinary:
+                sys_memcpy(ctx->u.dc.remaining_bytes, ep, n);
+                ctx->u.dc.remaining_bytes += n;
+                ep += n;
+                break;
+
+            default:
+                ASSERT(!"Unknown state");
+            }
+            if (!ctx->u.dc.remaining_n) {
+                ctx->state = B2TDecode;
+            }
+            if (reds <= 0) {
+                ctx->u.dc.next = next;
+                ctx->u.dc.ep = ep;
+                ctx->reds = 0;
+                return NULL;
+            }
+        }
+    }
+    else {
+        hp_saved = *hpp;
+        reds = ERTS_SWORD_MAX;
+        next = objp;
+        *next = (Eterm) (UWord) NULL;
+    }
+    hp = *hpp;
 
     while (next != NULL) {
+
 	objp = next;
 	next = (Eterm *) EXPAND_POINTER(*objp);
 
@@ -2738,7 +3071,16 @@ dec_term_atom_common:
 	    *objp = make_tuple(hp);
 	    *hp++ = make_arityval(n);
 	    hp += n;
-	    objp = hp - 1;
+            objp = hp - 1;
+            if (ctx) {
+                if (reds < n) {
+                    ASSERT(reds > 0);
+                    ctx->state = B2TDecodeTuple;
+                    ctx->u.dc.remaining_n = n - reds;
+                    n = reds;
+                }
+		reds -= n;
+	    }
 	    while (n-- > 0) {
 		objp[0] = (Eterm) COMPRESS_POINTER(next);
 		next = objp;
@@ -2756,17 +3098,27 @@ dec_term_atom_common:
 		break;
 	    }
 	    *objp = make_list(hp);
-	    hp += 2*n;
+            hp += 2 * n;
 	    objp = hp - 2;
 	    objp[0] = (Eterm) COMPRESS_POINTER((objp+1));
 	    objp[1] = (Eterm) COMPRESS_POINTER(next);
 	    next = objp;
 	    objp -= 2;
-	    while (--n > 0) {
+            n--;
+	    if (ctx) {
+                if (reds < n) {
+		    ctx->state = B2TDecodeList;
+		    ctx->u.dc.remaining_n = n - reds;
+		    n = reds;
+		}
+		reds -= n;
+	    }
+            while (n > 0) {
 		objp[0] = (Eterm) COMPRESS_POINTER(next);
-		objp[1] = make_list(objp + 2);
+		objp[1] = make_list(next);
 		next = objp;
 		objp -= 2;
+                n--;
 	    }
 	    break;
 	case STRING_EXT:
@@ -2777,6 +3129,14 @@ dec_term_atom_common:
 		break;
 	    }
 	    *objp = make_list(hp);
+            if (ctx) {
+                if (reds < n) {
+                    ctx->state = B2TDecodeString;
+                    ctx->u.dc.remaining_n = n - reds;
+                    n = reds;
+		}
+                reds -= n;
+            }
 	    while (n-- > 0) {
 		hp[0] = make_small(*ep++);
 		hp[1] = make_list(hp+2);
@@ -2984,7 +3344,6 @@ dec_term_atom_common:
 		    dbin->flags = 0;
 		    dbin->orig_size = n;
 		    erts_refc_init(&dbin->refc, 1);
-		    sys_memcpy(dbin->orig_bytes, ep, n);
 		    pb = (ProcBin *) hp;
 		    hp += PROC_BIN_SIZE;
 		    pb->thing_word = HEADER_PROC_BIN;
@@ -2995,7 +3354,20 @@ dec_term_atom_common:
 		    pb->bytes = (byte*) dbin->orig_bytes;
 		    pb->flags = 0;
 		    *objp = make_binary(pb);
-		}
+                    if (ctx) {
+                        int n_limit = reds * B2T_MEMCPY_FACTOR;
+                        if (n > n_limit) {
+                            ctx->state = B2TDecodeBinary;
+                            ctx->u.dc.remaining_n = n - n_limit;
+                            ctx->u.dc.remaining_bytes = dbin->orig_bytes + n_limit;
+                            n = n_limit;
+                            reds = 0;
+                        }
+                        else
+                            reds -= n / B2T_MEMCPY_FACTOR;
+                    }
+                    sys_memcpy(dbin->orig_bytes, ep, n);
+                }
 		ep += n;
 		break;
 	    }
@@ -3018,13 +3390,14 @@ dec_term_atom_common:
 		    sys_memcpy(hb->data, ep, n);
 		    bin = make_binary(hb);
 		    hp += heap_bin_size(n);
+                    ep += n;
 		} else {
 		    Binary* dbin = erts_bin_nrml_alloc(n);
 		    ProcBin* pb;
+
 		    dbin->flags = 0;
 		    dbin->orig_size = n;
 		    erts_refc_init(&dbin->refc, 1);
-		    sys_memcpy(dbin->orig_bytes, ep, n);
 		    pb = (ProcBin *) hp;
 		    pb->thing_word = HEADER_PROC_BIN;
 		    pb->size = n;
@@ -3035,8 +3408,23 @@ dec_term_atom_common:
 		    pb->flags = 0;
 		    bin = make_binary(pb);
 		    hp += PROC_BIN_SIZE;
-		}
-		ep += n;
+                    if (ctx) {
+                        int n_limit = reds * B2T_MEMCPY_FACTOR;
+                        if (n > n_limit) {
+                            ctx->state = B2TDecodeBinary;
+                            ctx->u.dc.remaining_n = n - n_limit;
+                            ctx->u.dc.remaining_bytes = dbin->orig_bytes + n_limit;
+                            n = n_limit;
+                            reds = 0;
+                        }
+                        else
+                            reds -= n / B2T_MEMCPY_FACTOR;
+                    }
+                    sys_memcpy(dbin->orig_bytes, ep, n);
+                    ep += n;
+                    n = pb->size;
+                }
+
 		if (bitsize == 8 || n == 0) {
 		    *objp = bin;
 		} else {
@@ -3067,7 +3455,7 @@ dec_term_atom_common:
 		    goto error;
 		}
 		*hpp = hp;
-		ep = dec_term(edep, hpp, ep, off_heap, &temp);
+		ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL);
 		hp = *hpp;
 		if (ep == NULL) {
 		    goto error;
@@ -3127,7 +3515,7 @@ dec_term_atom_common:
 		}
 		*hpp = hp;
 		/* Index */
-		if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) {
+		if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) {
 		    goto error;
 		}
 		if (!is_small(temp)) {
@@ -3136,7 +3524,7 @@ dec_term_atom_common:
 		old_index = unsigned_val(temp);
 
 		/* Uniq */
-		if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) {
+		if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) {
 		    goto error;
 		}
 		if (!is_small(temp)) {
@@ -3204,7 +3592,7 @@ dec_term_atom_common:
 		}
 
 		/* Index */
-		if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) {
+		if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) {
 		    goto error;
 		}
 		if (!is_small(temp)) {
@@ -3213,7 +3601,7 @@ dec_term_atom_common:
 		old_index = unsigned_val(temp);
 
 		/* Uniq */
-		if ((ep = dec_term(edep, hpp, ep, off_heap, &temp)) == NULL) {
+		if ((ep = dec_term(edep, hpp, ep, off_heap, &temp, NULL)) == NULL) {
 		    goto error;
 		}
 		if (!is_small(temp)) {
@@ -3313,8 +3701,31 @@ dec_term_atom_common:
 	    }
 	    undo_offheap_in_area(off_heap, hp_saved, hp);
 	    *hpp = hp_saved;
-	    return NULL;
+            if (ctx) {
+                ctx->state = B2TDecodeFail;
+		ctx->reds = reds;
+            }
+            return NULL;
 	}
+
+        if (--reds <= 0) {
+            if (ctx) {
+                if (next || ctx->state != B2TDecode) {
+                    ctx->u.dc.ep = ep;
+                    ctx->u.dc.next = next;
+                    ctx->u.dc.hp = hp;
+                    ctx->reds = 0;
+                    return NULL;
+                }
+            }
+            else {
+                reds = ERTS_SWORD_MAX;
+            }
+        }
+    }
+    if (ctx) {
+        ctx->state = B2TDone;
+	ctx->reds = reds;
     }
     *hpp = hp;
     return ep;
@@ -3602,18 +4013,37 @@ encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj,
 }
 
 static Sint
-decoded_size(byte *ep, byte* endp, int internal_tags)
+decoded_size(byte *ep, byte* endp, int internal_tags, B2TContext* ctx)
 {
-    int heap_size = 0;
+    int heap_size;
     int terms;
-    int atom_extra_skip = 0;
+    int atom_extra_skip;
     Uint n;
+    SWord reds;
+
+    if (ctx) {
+        reds = ctx->reds;
+        if (ctx->u.sc.ep) {
+            heap_size = ctx->u.sc.heap_size;
+            terms = ctx->u.sc.terms;
+            ep = ctx->u.sc.ep;
+            atom_extra_skip = ctx->u.sc.atom_extra_skip;
+            goto init_done;
+        }
+    }
+    else
+        reds = 0; /* not used but compiler warns anyway */
+
+    heap_size = 0;
+    terms = 1;
+    atom_extra_skip = 0;
+init_done:
 
 #define SKIP(sz)				\
     do {					\
 	if ((sz) <= endp-ep) {			\
 	    ep += (sz);				\
-        } else { return -1; };			\
+        } else { goto error; };			\
     } while (0)
 
 #define SKIP2(sz1, sz2)				\
@@ -3621,25 +4051,24 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 	Uint sz = (sz1) + (sz2);		\
 	if (sz1 < sz && (sz) <= endp-ep) {	\
 	    ep += (sz);				\
-        } else { return -1; }			\
+        } else { goto error; }			\
     } while (0)
 
 #define CHKSIZE(sz)				\
     do {					\
-	 if ((sz) > endp-ep) { return -1; }	\
+	 if ((sz) > endp-ep) { goto error; }	\
     } while (0)
 
 #define ADDTERMS(n)				\
     do {					\
         int before = terms;		        \
 	terms += (n);                           \
-	if (terms < before) return -1;     	\
+	if (terms < before) goto error;     	\
     } while (0)
 
-
-    for (terms=1; terms > 0; terms--) {
-	int tag;
-
+    ASSERT(terms > 0);
+    do {
+        int tag;
 	CHKSIZE(1);
 	tag = ep++[0];
 	switch (tag) {
@@ -3660,7 +4089,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 	    CHKSIZE(4);
 	    n = get_int32(ep);
 	    if (n > BIG_ARITY_MAX*sizeof(ErtsDigit)) {
-		return -1;
+		goto error;
 	    }
 	    SKIP2(n,4+1);		/* skip, size,sign,digits */
 	    heap_size += 1+1+(n+sizeof(Eterm)-1)/sizeof(Eterm); /* XXX: 1 too much? */
@@ -3669,7 +4098,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 	    CHKSIZE(2);
 	    n = get_int16(ep);
 	    if (n > MAX_ATOM_CHARACTERS) {
-		return -1;
+		goto error;
 	    }
 	    SKIP(n+2+atom_extra_skip);
 	    atom_extra_skip = 0;
@@ -3679,7 +4108,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 	    n = get_int16(ep);
 	    ep += 2;
 	    if (n > MAX_ATOM_SZ_LIMIT) {
-		return -1;
+		goto error;
 	    }
 	    SKIP(n+atom_extra_skip);
 	    atom_extra_skip = 0;
@@ -3688,7 +4117,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 	    CHKSIZE(1);
 	    n = get_int8(ep);
 	    if (n > MAX_ATOM_CHARACTERS) {
-		return -1;
+		goto error;
 	    }
 	    SKIP(n+1+atom_extra_skip);
 	    atom_extra_skip = 0;
@@ -3698,7 +4127,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 	    n = get_int8(ep);
 	    ep++;
 	    if (n > MAX_ATOM_SZ_LIMIT) {
-		return -1;
+		goto error;
 	    }
 	    SKIP(n+atom_extra_skip);
 	    atom_extra_skip = 0;
@@ -3727,7 +4156,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 		id_words = get_int16(ep);
 		    
 		if (id_words > ERTS_MAX_REF_NUMBERS)
-		    return -1;
+		    goto error;
 
 		ep += 2;
 		atom_extra_skip = 1 + 4*id_words;
@@ -3829,7 +4258,7 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 		num_free = get_int32(ep);
 		ep += 4;
 		if (num_free > MAX_ARG) {
-		    return -1;
+		    goto error;
 		}
 		terms += 4 + num_free;
 		heap_size += ERL_FUN_SIZE + num_free;
@@ -3846,24 +4275,47 @@ decoded_size(byte *ep, byte* endp, int internal_tags)
 
 	case BINARY_INTERNAL_REF:
 	    if (!internal_tags) {
-		return -1;
+		goto error;
 	    }
 	    SKIP(sizeof(ProcBin));
 	    heap_size += PROC_BIN_SIZE;
 	    break;
 	case BIT_BINARY_INTERNAL_REF:
 	    if (!internal_tags) {
-		return -1;
+		goto error;
 	    }
 	    SKIP(2+sizeof(ProcBin));
 	    heap_size += PROC_BIN_SIZE + ERL_SUB_BIN_SIZE;
 	    break;
 	default:
-	    return -1;
+	    goto error;
 	}
-    }
+        terms--;
+
+        if (ctx && --reds <= 0 && terms > 0) {
+            ctx->u.sc.heap_size = heap_size;
+            ctx->u.sc.terms = terms;
+            ctx->u.sc.ep = ep;
+            ctx->u.sc.atom_extra_skip = atom_extra_skip;
+            ctx->reds = 0;
+            return 0;
+        }
+    }while (terms > 0);
+
     /* 'terms' may be non-zero if it has wrapped around */
-    return terms==0 ? heap_size : -1;
+    if (terms == 0) {
+        if (ctx) {
+            ctx->state = B2TDecodeInit;
+            ctx->reds = reds;
+        }
+        return heap_size;
+    }
+
+error:
+    if (ctx) {
+        ctx->state = B2TBadArg;
+    }
+    return -1;
 #undef SKIP
 #undef SKIP2
 #undef CHKSIZE
diff --git a/erts/emulator/beam/external.h b/erts/emulator/beam/external.h
index ff29e84972..83001b2c7e 100644
--- a/erts/emulator/beam/external.h
+++ b/erts/emulator/beam/external.h
@@ -146,6 +146,7 @@ typedef struct {
 typedef struct {
     byte *extp;
     int exttmp;
+    Uint extsize;
 } ErtsBinary2TermState;
 
 /* -------------------------------------------------------------------------- */
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 94bc1b172a..6e5d352e5b 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -921,7 +921,8 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf,
 					ErtsAlcType_t alloc_type,
 					int allow_empty, int allow_atom,
 					int encoding,
-					Sint *used /* out */);
+					Sint *used /* out */,
+					Uint extra);
 char* erts_convert_filename_to_wchar(byte* bytes, Uint size,
                                      char *statbuf, size_t statbuf_size,
                                      ErtsAlcType_t alloc_type, Sint* used,
diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c
index d4623c0450..49af86b36a 100644
--- a/erts/emulator/beam/io.c
+++ b/erts/emulator/beam/io.c
@@ -7061,7 +7061,7 @@ void *driver_dl_open(char * path)
     int res;
     int *last_error_p = erts_smp_tsd_get(driver_list_last_error_key);
     int locked = maybe_lock_driver_list();
-    if ((res = erts_sys_ddll_open(path, &ptr)) == 0) {
+    if ((res = erts_sys_ddll_open(path, &ptr, NULL)) == 0) {
 	maybe_unlock_driver_list(locked);
 	return ptr;
     } else {
diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h
index 31252ed78f..189d9ebac8 100644
--- a/erts/emulator/beam/sys.h
+++ b/erts/emulator/beam/sys.h
@@ -279,18 +279,21 @@ typedef unsigned long UWord;
 typedef long          SWord;
 #define SWORD_CONSTANT(Const) Const##L
 #define UWORD_CONSTANT(Const) Const##UL
+#define ERTS_UWORD_MAX ULONG_MAX
 #define ERTS_SWORD_MAX LONG_MAX
 #elif SIZEOF_VOID_P == SIZEOF_INT
 typedef unsigned int UWord;
 typedef int          SWord;
 #define SWORD_CONSTANT(Const) Const
 #define UWORD_CONSTANT(Const) Const##U
+#define ERTS_UWORD_MAX UINT_MAX
 #define ERTS_SWORD_MAX INT_MAX
 #elif SIZEOF_VOID_P == SIZEOF_LONG_LONG
 typedef unsigned long long UWord;
 typedef long long          SWord;
 #define SWORD_CONSTANT(Const) Const##LL
 #define UWORD_CONSTANT(Const) Const##ULL
+#define ERTS_UWORD_MAX ULLONG_MAX
 #define ERTS_SWORD_MAX LLONG_MAX
 #else
 #error Found no appropriate type to use for 'Eterm', 'Uint' and 'Sint'
@@ -304,6 +307,7 @@ typedef unsigned long Uint;
 typedef long          Sint;
 #define SWORD_CONSTANT(Const) Const##L
 #define UWORD_CONSTANT(Const) Const##UL
+#define ERTS_UWORD_MAX ULONG_MAX
 #define ERTS_SWORD_MAX LONG_MAX
 #define ERTS_SIZEOF_ETERM SIZEOF_LONG
 #define ErtsStrToSint strtol
@@ -313,6 +317,7 @@ typedef unsigned int Uint;
 typedef int          Sint;
 #define SWORD_CONSTANT(Const) Const
 #define UWORD_CONSTANT(Const) Const##U
+#define ERTS_UWORD_MAX UINT_MAX
 #define ERTS_SWORD_MAX INT_MAX
 #define ERTS_SIZEOF_ETERM SIZEOF_INT
 #define ErtsStrToSint strtol
@@ -322,6 +327,7 @@ typedef unsigned long long Uint;
 typedef long long          Sint;
 #define SWORD_CONSTANT(Const) Const##LL
 #define UWORD_CONSTANT(Const) Const##ULL
+#define ERTS_UWORD_MAX ULLONG_MAX
 #define ERTS_SWORD_MAX LLONG_MAX
 #define ERTS_SIZEOF_ETERM SIZEOF_LONG_LONG
 #if defined(__WIN32__)
@@ -661,8 +667,7 @@ typedef struct {
 #define ERTS_SYS_DDLL_ERROR_INIT {NULL}
 extern void erts_sys_ddll_free_error(ErtsSysDdllError*);
 extern void erl_sys_ddll_init(void); /* to initialize mutexes etc */
-extern int erts_sys_ddll_open2(const char *path, void **handle, ErtsSysDdllError*);
-#define erts_sys_ddll_open(P,H) erts_sys_ddll_open2(P,H,NULL)
+extern int erts_sys_ddll_open(const char *path, void **handle, ErtsSysDdllError*);
 extern int erts_sys_ddll_open_noext(char *path, void **handle, ErtsSysDdllError*);
 extern int erts_sys_ddll_load_driver_init(void *handle, void **function);
 extern int erts_sys_ddll_load_nif_init(void *handle, void **function,ErtsSysDdllError*);
diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c
index 978a766de9..80937dfcc8 100644
--- a/erts/emulator/drivers/common/inet_drv.c
+++ b/erts/emulator/drivers/common/inet_drv.c
@@ -1490,8 +1490,8 @@ static int load_ip_and_port
     unsigned int alen = len;
     char         abuf  [len];
     int res = inet_get_address(abuf, (inet_address*) addr, &alen);
-    ASSERT(res==0);
-    res = 0;
+    ASSERT(res==0); (void)res;
+
     /* Now "abuf" contains: Family(1b), Port(2b), IP(4|16b) */
 
     /* NB: the following functions are safe to use, as they create tuples
diff --git a/erts/emulator/internal_doc/CarrierMigration.md b/erts/emulator/internal_doc/CarrierMigration.md
new file mode 100644
index 0000000000..b93c11c6ec
--- /dev/null
+++ b/erts/emulator/internal_doc/CarrierMigration.md
@@ -0,0 +1,201 @@
+Carrier Migration
+=================
+
+The ERTS memory allocators manage memory blocks in two types of raw
+memory chunks. We call these chunks of raw memory
+*carriers*. There are singleblock carriers, which contain only one large
+block, and multiblock carriers, which contain multiple blocks. A carrier is
+typically created using `mmap()` on unix systems. However, how a
+carrier is created is of minor importance. An allocator instance
+typically manages a mixture of single- and multiblock carriers.
+
+Problem
+-------
+
+When a carrier is empty, i.e. contains only one large free block, it
+is deallocated. Since multiblock carriers can contain both allocated
+blocks and free blocks at the same time, an allocator instance might
+be stuck with a large amount of poorly utilized carriers if the memory
+load decreases. After a peak in memory usage it is expected that not
+all memory can be returned since the blocks still allocated are likely
+to be dispersed over multiple carriers. Such poorly utilized carriers
+can usually be reused if the memory load increases again. However,
+since each scheduler thread manages its own set of allocator
+instances, and memory load is not necessarily connected to CPU load we
+might get into a situation where there are lots of poorly utilized
+multiblock carriers on some allocator instances while we need to
+allocate new multiblock carriers on other allocator instances. In
+scenarios like this, the demand for multiblock carriers in the system
+might increase at the same time as the actual memory demand in the
+system has decreased which is both unwanted and quite unexpected for
+the end user.
+
+Solution
+--------
+
+In order to prevent scenarios like this we've implemented support for
+migration of multiblock carriers between allocator instances of the
+same type.
+
+### Management of Free Blocks ###
+
+In order to be able to remove a carrier from one allocator instance
+and add it to another we need to be able to move references to the
+free blocks of the carrier between the allocator instances. The
+allocator instance specific data structure referring to the free
+blocks it manages often refers to the same carrier from multiple
+places. For example, when the address order bestfit strategy is used
+this data structure is a binary search tree spanning all carriers that
+the allocator instance manages. Free blocks in one specific carrier
+can be referred to from potentially every other carrier that is
+managed, and the amount of such references can be huge. That is, the
+work of removing the free blocks of such a carrier from the search
+tree will be huge. One way of solving this could be to not migrate
+carriers that contain lots of free blocks, but this would prevent us
+from migrating carriers that potentially need to be migrated in order
+to solve the problem we set out to solve.
+
+By using one data structure of free blocks in each carrier and an
+allocator instance wide data structure of carriers managed by the
+allocator instance, the work needed in order to remove and add
+carriers can be kept to a minimum. When migration of carriers is
+enabled on a specific allocator type, we require that an allocation
+strategy with such an implementation is used. Currently we've
+implemented this for three different allocation strategies. All of
+these strategies use a search tree of carriers sorted so that we can
+find the carrier with the lowest address that can satisfy the
+request. Internally in carriers we use yet another search tree that
+either implements address order first fit, address order best fit,
+or best fit. The abbreviations used for these different allocation
+strategies are `aoff`, `aoffcaobf`, and `aoffcbf`.
+
+### Carrier Pool ###
+
+In order to migrate carriers between allocator instances we move them
+through a pool of carriers. In order for a carrier migration to
+complete, one scheduler needs to move the carrier into the pool, and
+another scheduler needs to take the carrier out of the pool.
+
+The pool is implemented as a lock-free, circular, doubly linked
+list. The list contains a sentinel which is used as the starting point
+when inserting to, or fetching from the pool. Carriers in the pool are
+elements in this list.
+
+The list can be modified by all scheduler threads
+simultaneously. During modifications the doubly linked list is allowed
+to get a bit "out of shape". For example, following the `next` pointer
+to the next element and then following the `prev` pointer does not
+always take you back to where you started. The following is however
+always true:
+
+*   Repeatedly following `next` pointers will eventually take you to the
+    sentinel.
+*   Repeatedly following `prev` pointers will eventually take you to the
+    sentinel.
+*   Following a `next` or a `prev` pointer will take you to either an
+    element in the pool, or an element that used to be in the pool.
+
+When inserting a new element we search for a place to insert the
+element by only following `next` pointers, and we always begin by
+skipping the first element encountered. When trying to fetch an
+element we do the same thing, but instead only follow `prev` pointers.
+
+By going different directions when inserting and fetching, we avoid
+contention between threads inserting and threads fetching as much as
+possible. By skipping one element when we begin searching, we preserve
+the sentinel unmodified as much as possible. This is beneficial since
+all search operations need to read the content of the sentinel. If we
+were to modify the sentinel, the cache line containing the sentinel
+would unnecessarily be bounced between processors.
+
+The `prev` and `next` fields in the elements of the list contain the
+value of the pointer, a modification marker, and a deleted
+marker. Memory operations on these fields are done using atomic memory
+operations. When a thread has set the modification marker in a field,
+no-one except the thread that set the marker is allowed to modify the
+field. If multiple modification markers need to be set, we always
+begin with `next` fields followed by `prev` fields in the order
+following the actual pointers. This guarantees that no deadlocks will
+occur.
+
+When a carrier is being removed from a pool, we mark it with a thread
+progress value that needs to be reached before we are allowed to
+modify the `next`, and `prev` fields. That is, until we reach this
+thread progress we are not allowed to insert the carrier into the pool
+again, and we are not allowed to deallocate the carrier. This ensures
+that threads inspecting the pool always will be able to traverse the
+pool and reach valid elements. Once we have reached the thread
+progress value that the carrier was tagged with, we know that no
+threads may have references to it via the pool.
+
+### Migration ###
+
+There exists one pool for each allocator type enabling migration of
+carriers between scheduler specific allocator instances of the same
+allocator type.
+
+Each allocator instance keeps track of the current utilization of its
+multiblock carriers. When the utilization falls below the "abandon
+carrier utilization limit" it starts to inspect the utilization of the
+current carrier when deallocations are made. If also the utilization
+of the carrier falls below the "abandon carrier utilization limit" it
+unlinks the carrier from its data structure of available free blocks
+and inserts the carrier into the pool.
+
+Since the carrier has been unlinked from the data structure of
+available free blocks, no more allocations will be made in the
+carrier. The allocator instance putting the carrier into the pool,
+however, still has the responsibility of performing deallocations in
+it while it remains in the pool.
+
+Each carrier has a flag field containing information about the allocator
+instance owning the carrier, a flag indicating if the carrier is in
+the pool or not, and a flag indicating if it is busy or not. When the
+carrier is in the pool, the owning allocator instance needs to mark it
+as busy while operating on it. If another thread inspects it in order
+to try to fetch it from the pool, it will abort the fetch if it is
+busy. When fetching the carrier from the pool, ownership will change
+and further deallocations in the carrier will be redirected to the new
+owner using the delayed dealloc functionality.
+
+If a carrier in the pool becomes empty, it will be withdrawn from the
+pool. All carriers that become empty are also always passed to their
+originating allocator instance for deallocation using the delayed
+dealloc functionality. Since carriers this way always will be
+deallocated by the allocator instance that allocated the carrier the
+underlying functionality of allocating and deallocating carriers can
+remain simple and doesn't have to bother about multiple threads. In a
+NUMA system we will also not mix carriers originating from multiple
+NUMA nodes.
+
+When an allocator instance needs more carrier space, it always begins
+by inspecting its own carriers that are waiting for thread progress
+before they can be deallocated. If no such carrier could be found, it
+then inspects the pool. If no carrier could be fetched from the pool,
+it will allocate a new carrier. Regardless of where the allocator
+instance gets the carrier from, it just links the carrier into
+its data structure of free blocks.
+
+### Result ###
+
+The use of this strategy of abandoning carriers with poor utilization
+and reusing these in allocator instances with an increased carrier
+demand is extremely effective and completely eliminates the problems
+that otherwise sometimes occurred when CPU load dropped while memory
+load did not.
+
+When using the `aoffcaobf` or `aoff` strategies compared to `gf` or
+`bf`, we lose some performance since we get more modifications in the
+data structure of free blocks. This performance penalty is however
+reduced using the `aoffcbf` strategy. A tradeoff between memory
+consumption and performance is however inevitable, and it is up to
+the user to decide what is most important. 
+
+Further work
+------------
+
+It would be quite easy to extend this to allow migration of multiblock
+carriers between all allocator types. More or less the only obstacle
+is maintenance of the statistics information.
+
+
diff --git a/erts/emulator/internal_doc/CodeLoading.md b/erts/emulator/internal_doc/CodeLoading.md
new file mode 100644
index 0000000000..151b9cd57c
--- /dev/null
+++ b/erts/emulator/internal_doc/CodeLoading.md
@@ -0,0 +1,186 @@
+Non-Blocking Code Loading
+=========================
+
+Introduction
+------------
+
+Before OTP R16 when an Erlang code module was loaded, all other
+execution in the VM was halted while the load operation was carried
+out in single threaded mode. This might not be a big problem for
+initial loading of modules during VM boot, but it can be a severe
+problem for availability when upgrading modules or adding new code on
+a VM with running payload. This problem grows with the number of cores
+as both the time it takes to wait for all schedulers to stop increases
+as well as the potential amount of halted ongoing work.
+
+In OTP R16, modules are loaded without blocking the VM.
+Erlang processes may continue executing undisturbed in parallel during
+the entire load operation. The code loading is carried out by a normal
+Erlang process that is scheduled like all the others. The load
+operation is completed by making the loaded code visible to all
+processes in a consistent way with one single atomic
+instruction. Non-blocking code loading will improve real-time
+characteristics when modules are loaded/upgraded on a running SMP
+system.
+
+
+The Load Phases
+---------------
+
+The loading of a module is divided into two phases; a *prepare phase*
+and a *finishing phase*. The prepare phase contains reading the BEAM
+file format and all the preparations of the loaded code that can
+easily be done without interference with the running code. The
+finishing phase will make the loaded (and prepared) code accessible
+from the running code. Old module versions (replaced or deleted) will
+also be made inaccessible by the finishing phase.
+
+The prepare phase is designed to allow several "loader" processes to
+prepare separate modules in parallel while the finishing phase can
+only be done by one loader process at a time. A second loader process
+trying to enter finishing phase will be suspended until the first
+loader is done. This will only block the process, the scheduler is
+free to schedule other work while the second loader is waiting. (See
+`erts_try_seize_code_write_permission` and
+`erts_release_code_write_permission`).
+
+The ability to prepare several modules in parallel is not currently
+used as almost all code loading is serialized by the code_server
+process. The BIF interface is however prepared for this.
+
+      erlang:prepare_loading(Module, Code) -> LoaderState
+      erlang:finish_loading([LoaderState])
+
+The idea is that `prepare_loading` could be called in parallel for
+different modules and returns a "magic binary" containing the internal
+state of each prepared module. Function `finish_loading` could take a
+list of such states and do the finishing of all of them in one go.
+
+Currently we use the legacy BIF `erlang:load_module` which is now
+implemented in Erlang by calling the above two functions in
+sequence. Function `finish_loading` is limited to only accept a list
+with one module state as we do not yet use the multi module loading
+feature.
+
+
+The Finishing Sequence
+----------------------
+
+During VM execution, code is accessed through a number of data
+structures. These *code access structures* are
+
+* Export table. One entry for every exported function.
+* Module table. One entry for each loaded module.
+* "beam_catches". Identifies jump destinations for catch instructions.
+* "beam_ranges". Map code address to function and line in source file.
+
+The most frequently used of these structures is the export table that
+is accessed in run time for every executed external function call to
+get the address of the callee. For performance reasons, we want to
+access all these structures without any overhead from thread
+synchronization. Earlier this was solved with an emergency brake. Stop
+the entire VM to mutate these code access structures, otherwise treat
+them as read-only.
+
+The solution in R16 is instead to *replicate* the code access
+structures. We have one set of active structures read by the running
+code. When new code is loaded the active structures are copied, the
+copy is updated to include the newly loaded module and then a switch
+is made to make the updated copy the new active set. The active set is
+identified by a single global atomic variable
+`the_active_code_index`. The switch can thus be made by a single
+atomic write operation. The running code has to read this atomic
+variable when using the active access structures, which means one
+atomic read operation per external function call for example. The
+performance penalty from this extra atomic read is however very small
+as it can be done without any memory barriers at all (as described
+below). With this solution we also preserve the transactional feature
+of a load operation. Running code will never see the intermediate
+result of a half loaded module.
+
+The finishing phase is carried out in the following sequence by the
+BIF `erlang:finish_loading`:
+
+1. Seize exclusive code write permission (suspend process if needed
+   until we get it).
+
+2. Make a full copy of all the active access structures. This copy is
+   called the staging area and is identified by the global atomic
+   variable `the_staging_code_index`.
+
+3. Update all access structures in the staging area to include the
+   newly prepared module.
+
+4. Schedule a thread progress event. That is a time in the future when
+   all schedulers have yielded and executed a full memory barrier.
+
+5. Suspend the loader process.
+
+6. After thread progress, commit the staging area by assigning
+   `the_staging_code_index` to `the_active_code_index`.
+
+7. Release the code write permission allowing other processes to stage
+   new code.
+
+8. Resume the loader process allowing it to return from
+   `erlang:finish_loading`.
+
+
+### Thread Progress
+
+The waiting for thread progress in 4-6 is necessary in order for
+processes to read `the_active_code_index` atomic during normal
+execution without any expensive memory barriers. When we write a new
+value into `the_active_code_index` in step 6, we know that all
+schedulers will see an updated and consistent view of all the new
+active access structures once they become reachable through
+`the_active_code_index`.
+
+The total lack of memory barrier when reading `the_active_code_index`
+has one interesting consequence however. Different processes may see
+the new code at different points in time depending on when different
+cores happen to refresh their hardware caches. This may sound unsafe
+but it actually does not matter. The only property we must guarantee
+is that the ability to see the new code must spread with process
+communication. After receiving a message that was triggered by new
+code, the receiver must be guaranteed to also see the new code. This
+will be guaranteed as all types of process communication involve
+memory barriers in order for the receiver to be sure to read what the
+sender has written. This implicit memory barrier will then also make
+sure that the receiver reads the new value of `the_active_code_index`
+and thereby also sees the new code. This is true for all kinds of
+inter process communication (TCP, ETS, process name registering,
+tracing, drivers, NIFs, etc) not just Erlang messages.
+
+### Code Index Reuse
+
+To optimize the copy operation in step 2, code access structures are
+reused. In the current solution we have three sets of code access
+structures, identified by a code index of 0, 1 and 2. These indexes
+are used in a round robin fashion. Instead of having to initialize a
+completely new copy of all access structures for every load operation
+we just have to update with the changes that have happened since the
+last two code load operations. We could get by with only two code
+indexes (0 and 1), but that would require yet another round of waiting
+for thread progress before step 2 in the `finish_loading` sequence. We
+cannot start reusing a code index as staging area until we know that
+no lingering scheduler thread is still using it as the active code
+index. With three generations of code indexes, the waiting for thread
+progress in step 4-6 will give this guarantee for us. Thread progress
+will wait for all running schedulers to reschedule at least one
+time. No ongoing execution reading code access structures reached from
+an old value of `the_active_code_index` can exist after a second round
+of thread progress.
+
+The design choice between two or three generations of code access
+structures is a trade-off between memory consumption and code loading
+latency.
+
+### A Consistent Code View
+
+Some native BIFs may need to get a consistent snapshot view of the
+active code. To do this it is important to only read
+`the_active_code_index` one time and then use that index value for all
+code accessing during the BIF. If a load operation is executed in
+parallel, reading `the_active_code_index` a second time might result
+in a different value, and thereby a different view of the code.
diff --git a/erts/emulator/internal_doc/DelayedDealloc.md b/erts/emulator/internal_doc/DelayedDealloc.md
new file mode 100644
index 0000000000..b7d87b839f
--- /dev/null
+++ b/erts/emulator/internal_doc/DelayedDealloc.md
@@ -0,0 +1,175 @@
+Delayed Dealloc
+===============
+
+Problem
+-------
+
+An easy way to handle memory allocation in a multi-threaded
+environment is to protect the memory allocator with a global lock
+which threads performing memory allocations or deallocations have to
+have locked during the whole operation. This solution of course scales
+very poorly, due to heavy lock contention. An improved solution of
+this scheme is to use multiple thread specific instances of such an
+allocator. That is, each thread allocates in its own allocator
+instance which is protected by a lock. In the general case references
+to memory need to be passed between threads. In the case where a
+thread needs to deallocate memory that originates from another
+thread's allocator instance a lock conflict is possible. In a system as
+the Erlang VM where memory allocation/deallocation is frequent and
+references to memory also are passed around between threads this
+solution will also scale poorly due to lock contention.
+
+Functionality Used to Address This Problem
+------------------------------------------
+
+In order to reduce contention due to locking of allocator instances we
+introduced completely lock free instances tied to each scheduler
+thread, and an extra locked instance for other threads. The scheduler
+threads in the system are expected to do the major part of the
+work. Other threads may still be needed but should not perform any
+major and/or time critical work. The limited amount of contention that
+appears on the locked allocator instance can more or less be
+disregarded.
+
+Since we still need to be able to pass references to memory between
+scheduler threads we need some way to manage this. An allocator
+instance belonging to one scheduler thread is only allowed to be
+manipulated by that scheduler thread. When other threads need to
+deallocate memory originating from a foreign allocator instance, they
+only pass the memory block to a "message box" containing deallocation
+jobs attached to the originating allocator instance. When a scheduler
+thread detects such a deallocation job it performs the actual
+deallocation.
+
+The "message box" is implemented using a lock free single linked list
+through the memory blocks to deallocate. The order of the elements in
+this list is not important. Insertion of new free blocks will be made
+somewhere near the end of this list. Requiring that the new blocks
+need to be inserted at the end would cause unnecessary contention when
+large amounts of memory blocks are inserted simultaneously by multiple
+threads.
+
+The data structure referring to this single linked list covers two cache
+lines. One cache line containing information about the head of the
+list, and one cache line containing information about the tail of the
+list. This in order to reduce cache line ping ponging of this data
+structure. The head of the list will only be manipulated by the thread
+owning the allocator instance, and the tail will be manipulated by
+other threads inserting deallocation jobs.
+
+### Tail ###
+
+In the tail part of the data structure we find a pointer to the last
+element of the list, or at least something that is near the end of the
+list. In the uncontended case it will point to the end of the list,
+but when simultaneous insert operations are performed it will point to
+something near the end of the list.
+
+When inserting an element one will try to write a pointer to the new
+element in the next pointer of the element pointed to by the last
+pointer. This is done using an atomic compare and swap that expects
+the next pointer to be `NULL`. If this succeeds the thread performing
+this operation moves the last pointer to point to the newly inserted
+element.
+
+If the atomic compare and swap described above failed, the last
+pointer didn't point to the last element. In this case we need to
+insert the new element somewhere in between the element that the last
+pointer pointed to and the actual last element. If we do it this way
+the last pointer will eventually end up at the last element when
+threads stop adding new elements. When trying to insert somewhere near
+the end and failing to do so, the inserting thread sometimes moves to
+the next element and sometimes tries with the same element again. This
+in order to spread the inserted elements during heavy contention. That
+is, we try to spread the modifications of memory to different
+locations instead of letting all threads continue to try to modify the
+same location in memory.
+
+### Head ###
+
+The head contains pointers to the beginning of the list (`head.first`),
+and to the first block which other threads may refer to
+(`head.unref_end`). Blocks between these pointers are only referred to
+by the head part of the data structure which is only used by the
+thread owning the allocator instance. When these two pointers are not
+equal the thread owning the allocator instance deallocates block after
+block until `head.first` reaches `head.unref_end`.
+
+We of course periodically need to move the `head.unref_end` closer to
+the end in order to be able to continue deallocating memory
+blocks. Since all threads inserting new elements in the linked list
+will enter the list using the last pointer we can use this
+knowledge. If we call `erts_thr_progress_later()` and wait until we
+have reached that thread progress we know that no managed threads can
+refer to the elements up to the element pointed to by the last pointer at
+the time when we called `erts_thr_progress_later()`. This is because all
+managed threads must have left the code implementing this at least
+once, and they always enter the list via the last pointer. The
+`tail.next` field contains information about next `head.unref_end`
+pointer and thread progress that needs to be reached before we can
+move `head.unref_end`.
+
+Unfortunately not only threads managed by the thread progress
+functionality may insert memory blocks. Other threads also need to be
+taken care of. Other threads will not be as frequent users of this
+functionality as managed threads, so using a less efficient scheme for
+them is not that big of a problem. In order to handle unmanaged
+threads we use two reference counters. When an unmanaged thread enters
+this implementation it increments the reference counter currently
+used, and when it leaves this implementation it decrements the same
+reference counter. When the consumer thread calls
+`erts_thr_progress_later()` in order to determine when it is safe to
+move `head.unref_end`, it also swaps reference counters for unmanaged
+threads. The previous current represents outstanding references from
+the time up to this point. The new current represents future references
+following this point. When the consumer thread detects both that we have
+reached the desired thread progress and that the previous current
+reference counter has reached zero, it is safe to move `head.unref_end`.
+
+The reason for using two reference counters is that we need to know
+that the reference counter eventually will reach zero. If we only used
+one reference counter it would potentially be held above zero for ever
+by different unmanaged threads.
+
+### Empty List ###
+
+If no new memory blocks are inserted into the list, it should
+eventually be emptied. All pointers to the list however expect to
+always point to something. This is solved by inserting an empty
+"marker" element, which only has the purpose of being there in the
+absence of other elements. That is, when the list is empty it only
+contains this "marker" element.
+
+### Contention ###
+
+When elements are continuously inserted by threads not owning the
+allocator instance, the thread owning the allocator instance will be
+able to work more or less undisturbed by other threads at the head end
+of the list. At the tail end large amounts of simultaneous inserts may
+cause contention, but we reduce such contention by spreading inserts
+of new elements near the end instead of requiring all new elements to
+be inserted at the end.
+
+### Schedulers and The Locked Allocator Instance ###
+
+Also the locked allocator instance for use by non-scheduler threads
+has a message box for deallocation jobs just as all the other
+allocator instances. The reason for this is that other threads may
+allocate memory and pass it to a scheduler that then needs to deallocate
+it. We do not want the scheduler to have to wait for the lock on this
+locked instance. Since also locked instances have message boxes for
+deallocation jobs, the scheduler can just insert the job and avoid the
+locking.
+
+
+### A Benchmark Result ###
+
+When running the ehb benchmark, large amounts of messages are passed
+around between schedulers. All message passing will in one way or
+another cause memory allocation and deallocation. Since messages are
+passed between different schedulers we will get contention on the
+allocator instances where messages were allocated. By the introduction
+of the delayed dealloc feature, we got a speedup of between 25-45%,
+depending on configuration of the benchmark, when running on a
+relatively new machine with an Intel i7 quad core processor with
+hyper-threading using 8 schedulers.
+\ No newline at end of file
diff --git a/erts/emulator/internal_doc/PTables.md b/erts/emulator/internal_doc/PTables.md
new file mode 100644
index 0000000000..6fe0e7665d
--- /dev/null
+++ b/erts/emulator/internal_doc/PTables.md
@@ -0,0 +1,356 @@
+Process and Port Tables
+=======================
+
+Problems
+--------
+
+The process table is a mapping from process identifiers to process
+structure pointers. The process structure contains miscellaneous
+information about a process, as for example pointers to its heap,
+message queue, etc. When the runtime system needs to operate on a
+process, it looks up the process structure in the process table using
+the process identifier. An example of this is when passing a message
+to a process.
+
+The process table has for a very long time just been an array of
+pointers to process structures. Since process identifiers internally
+in the runtime system are 28-bit integers it is quite easy to map a
+process identifier to index into the array. The 28-bits were divided
+into two sets. The least significant set of bits was used as index
+into the array. The most significant set of bits was only used to be
+able to distinguish between a number of identifiers which map to
+the same index in the array. As long as process table sizes of a power
+of two were used we had 2^28 unique process identifiers.
+
+When the first SMP support was implemented, the table still was kept
+more or less the same way, but protected by two types of locks. One
+lock that protected the whole table against modifications and an array
+of locks protecting different parts of the table. The exact locking
+strategy previously used isn't interesting. What is interesting is
+that it suffered from heavy lock contention especially when lots of
+modifications were being made, but also when only performing lookups.
+
+In order to be able to detect when it is safe to deallocate a
+previously used process structure, reference counting of the structure
+was used. Also this was problematic, since simultaneous lookups needed
+to modify the reference counter which also caused contention on the
+cache line where the reference counter was located. This since all
+modifications need to be communicated between all involved
+processors.
+
+The port table is very similar to the process table. The major
+difference, at least in concept, is that it is a mapping from port
+identifiers to port structures. It had a similar implementation, but
+with some differences. Instead of being an array of pointers it was an
+array of structures, and instead of being protected by two types of
+locks it was only protected by one global lock. This table also
+suffered from lock contention in various situations.
+
+Solution
+--------
+
+The process table was the major problem to address since processes are
+much more frequently used than ports. The first implementation only
+implemented this for processes, but since the port table is very
+similar and very similar problems occur on the port table, the process
+table implementation was later generalized so that it could also be
+used implementing the port table. For simplicity I will only talk
+about the process table in the following text, but the same will apply
+to the port table unless otherwise stated.
+
+If we disregard the locking issues, the original solution is very
+appealing. The mapping from process identifier to index into the array
+is very fast, and this property is something we would like to
+keep. The vast majority of operations on these tables are lookups so
+optimizing for lookups is what we want to do.
+
+### Lookup ###
+
+Using a set of bits in the process identifier as index into an array
+seems hard to beat. By replacing the array of pointers with an array
+of our pointer sized atomic data type, a lookup will consist of the
+following:
+
+1.  Mapping the 28-bit integer to an index into the array.
+
+    More about this mapping later.
+
+2.  Read the pointer using an atomic memory operation at determined
+    index in array.
+
+    On all platforms for which we provide atomic memory operations, this
+    is just a `volatile` read, preventing the compiler from using values
+    in registers, forcing a read from memory.
+
+3.  Depending on use, issue appropriate memory barrier.
+
+    A common barrier used is a barrier with acquire semantics. On
+    x86/x86_64 this maps to a compiler barrier preventing the compiler
+    from reordering instructions, but on other hardware often some kind of
+    light weight hardware memory barrier is also needed.
+
+    When comparing with a locked approach, at least one heavy weight
+    memory barrier will be issued when locking the lock on most, if
+    not all, hardware architectures (including x86/x86_64), and often
+    some kind of light weight memory barrier will be issued when
+    unlocking the lock. 
+
+When looking at this very simple solution with very little overhead
+you might wonder why we didn't implement it this way from the
+beginning. It all boils down to the read operation of the pointer. We
+need some way to know that it is safe to access the memory pointed
+to. One way of doing this is to place a reference counter in the
+process structure. Increment of the reference counter at lookup needs
+to be done atomically with the lookup. A lock can typically provide
+this service for us, which was the approach we previously
+used. Another approach could be to co-locate the reference counter
+with the pointer in the table. The major problem with this approach is
+the modifications of the reference counter. This is because these
+modifications would have to be communicated between all involved
+processors, causing contention on the cache line containing the reference
+counter. The new lookup approach above is possible since we can use
+the "thread progress" functionality in order to determine when it is
+safe to deallocate the process structure. We'll get back to this when
+describing deletion in the table.
+
+Using this new lookup approach we won't modify any memory at all which
+is important. A lookup conceptually only reads memory; now this is true
+in the implementation also, which is important from a scalability
+perspective. The previous implementation modified the cache line
+containing the reference counter two times, and the cache line
+containing the corresponding lock two times at each lookup.
+
+### Modifications of the Table ###
+
+A lightweight lookup in the table was the most important feature, but
+we also wanted to improve modifications of the table. The process
+table is modified when a new process is spawned, i.e. a new pointer is
+inserted into the table, and when a process terminates, i.e. a pointer
+is deleted in the table.
+
+Assuming that we spawn fewer processes than the maximum amount of
+unique process identifiers in the system, one has always been able to
+determine the order of process creation just by comparing process
+identifiers. If PidX is larger than PidY, then PidX was created after
+PidY assuming both identifiers originate from the same node. However,
+since we have a quite limited amount of unique identifiers today
+(2^28), this property cannot be relied upon if we create a large amount
+of processes. Nevertheless, this is a property the system always
+has had.
+
+If we would have had a huge amount of unique identifiers available, it
+would have been tempting to drop or modify this ordering property as
+described above. The ordering property could for example be based on
+the scheduler performing the spawn operation. It would have been
+possible to reserve large ranges of identifiers exclusive for each
+scheduler thread which could be used minimizing the need for
+communication when allocating identifiers. The amount of identifiers
+we got to work with today is, however, not even close to being enough for
+such an approach.
+
+Since we have a limited amount of unique identifiers, we need to be
+careful not to waste them. If previously used identifiers are reused
+too quickly, identifiers originating from terminated processes will
+refer to newly created processes, and mixups will occur. The
+previously used approach was quite good at not wasting
+identifiers. Using a modified version of the same approach also lets
+us keep the ordering property that we have always had.
+
+#### Insert ####
+
+The original approach is more or less to search for next free index or
+slot in the array. The search starts from the last slot allocated. If
+we reach the end of the array we increase a "wrapped counter" and then
+continue the search. The process identifier is constructed by writing
+the index to the least significant set of bits, and the "wrapped
+counter" to the most significant set of bits. The amount of bits in
+each set of bits is decided at boot time, so that maximum index will
+just fit into the least significant set of bits.
+
+In the modified lock free version of this approach we more or less do
+it the same way, but with some important modifications trying to avoid
+unnecessary contention when multiple schedulers create processes
+simultaneously. Multiple threads might be searching for the next free
+slot at the same time from the same starting point, so schedulers
+simultaneously writing new pointers into the table are very likely to
+write into adjacent slots. We therefore want subsequent slots to be
+located in different cache lines. If adjacent slots
+are located in the same cache line all modifications of this cache line
+need to be communicated between all involved processors which will be
+very expensive and scale very poorly. By locating adjacent slots in
+different cache lines only true conflicts will trigger communication
+between involved processors, i.e., avoiding false sharing.
+
+A cache line is larger than a pointer, typically 8 or 16 times larger,
+so using one cache line for each slot only containing one pointer
+would be a waste of space. Each cache line will be able to hold a
+fixed amount of slots. The first slot of the table will be the first
+slot of the first cache line, the second slot of the table will be the
+first slot of the second cache line until we reach the end of the
+array. The next slot after that will be the second slot of the first
+cache line, etc, moving forward one cache line internal slot each time
+we wrap. This way we will be able to fit the same amount of pointers
+into an array of the same size while always keeping adjacent slots in
+different cache lines.
+
+The mapping from identifier to slot or index into the array gets a bit
+more complicated than before. Instead of a `shift` and a bitwise
+`and`, we get two `shift`s, two bitwise `and`s, and an `add` (see
+implementation of `erts_ptab_data2pix()` in `erl_ptab.h`). However, by
+storing this information optimized for lookup we only need a `shift`
+and a bitwise `and` on 32-bit platforms. On 64-bit platforms we got
+enough room for the 28-bit identifier in the least significant
+halfword, and the index in the most significant halfword, in other
+words, we just need to read the most significant halfword to get the
+index. That is, this operation is as fast, or faster than before. The
+downside is that on 32-bit platforms we need to convert this
+information into the 28-bit identifier number when printing, or when
+ordering identifiers from the same node. These operations are,
+however, extremely infrequent compared to lookups.
+
+When we insert a new element in the table we do the following:
+
+1.  We begin by reserving space in the table by atomically
+    incrementing a counter of processes in the table. If our increment
+    brings the counter above the maximum size of the table, the
+    operation fails and a `system_limit` exception is raised.
+
+2.  The table contains a 64-bit atomic variable of the last identifier
+    used. Only the least significant bits will be used when actually
+    creating the identifier. This identifier is where the search
+    begins.
+
+3.  We increment the last identifier value used. In order to determine the
+    slot that corresponds to this identifier we call
+    `erts_ptab_data2pix()` that maps identifier to slot. We read the
+    content of the slot. If the slot is free we try to write a
+    reservation marker using an atomic compare and swap. If this fails
+    we repeat this step until it succeeds. 
+
+4.  Change the table variable of last identifier used. Since multiple
+    writes might occur at the same time this value may already have
+    been changed to an identifier larger than the one we got. In
+    this case we can continue; otherwise, we need to change it to the
+    identifier we got.
+
+5.  We now do some initializations of the process structure that
+    cannot be done before we know the process identifier, and have to
+    be done before we publish the structure in the table. This, for
+    example, includes storing the identifier in the process structure. 
+
+6.  Now we can publish the structure in the table by writing the
+    pointer to the process structure in the slot previously reserved
+    in 3.
+
+Using this approach we keep the properties like identifier ordering,
+and identifier reuse while improving performance and scalability. It
+has one flaw, though. There is no guarantee that the operation will
+terminate. This can quite easily be fixed though, and will be fixed in
+the next release. We will get back to this below.
+
+#### Delete ####
+
+When a process terminates, we mark the process as terminated in the
+process structure, the counter of number of processes in the table is
+decreased, and the reference to the process structure is removed by
+writing a `NULL` pointer into the corresponding slot. The scheduler
+thread performing this then schedules a thread progress later job which
+will do the final cleanup and deallocate the process structure. The
+thread progress functionality will make sure that this job will not
+execute until it is certain that all managed threads have dropped all
+references to the process structure.
+
+### BIF Iterating Over the Table ###
+
+The `erlang:processes/0` and `erlang:ports/0` BIFs iterate over the
+tables and return corresponding identifiers. These BIFs should return a
+consistent snapshot of the table content during some time when the BIF
+is executing. In order to implement this we use locking in a strange
+way. We use an "inverted rwlock".
+
+When performing lookups in the table we do not need to bother about
+the locking at all, but when modifying the table we read lock the
+rwlock protecting the table which allows for multiple writers during
+normal operation. When the BIF that iterates over the table needs
+access to the table it write locks the rwlock and reads content of the
+table. The BIF does not read the whole table in one go but instead reads
+small chunks at a time, only write locking while reading. The actual
+implementation of the BIFs is out of the scope of this document.
+
+An out of the box rwlock will typically suffer from contention on the
+single cache line containing the state of the rwlock even in the case
+we are only read locking. Instead of using such an rwlock, we have our
+own implementation of reader optimized rwlocks which keeps track of
+reader threads in separate thread specific cache lines. This in order
+to avoid contention on a single cache line. As long as we only do read
+lock operations, threads only need to read a global cache line and
+modify their own cache line, and by this minimize communication between
+involved processors. The iterating BIFs are normally very infrequently
+used, so in the normal case we will only do read lock operations on
+the table global rwlock.
+
+### Future Improvements ###
+
+The first improvement is to fix the guarantee so that insert
+operations will be guaranteed to terminate. When the operation starts
+we verify that there actually exists a free slot that we can use. The
+problem is that we might not find it since it may move when multiple
+threads modify the table at the same time as we are trying to find the
+slot. The easy fix is to abort the operation if an empty slot could
+not be found in a finite number of operations, and then restart the
+operation under a write lock. This will be implemented in the next
+release, but further work should be made trying to find a better
+solution.
+
+This and also previous implementation do not work well when the table
+is nearly full. We will both get long search times for free slots, and
+we will reuse identifiers more frequently since we more frequently
+wrap during the search. These tables work best when the table is much
+larger than the number of simultaneously existing processes. One easy
+improvement is to always have room for more processes than we allow in
+the table. This will also be implemented in the next release, but this
+should probably also be worked more on trying to find an even better
+solution.
+
+It would also be nice to get rid of the rwlock altogether. The use
+of a reader optimized rwlock makes sure we do not get any contention on
+the lock, but unnecessary memory barriers will be issued due to the
+lock. The main issue here is to modify iterating BIFs so that they do
+not require exclusive access to the table while reading a sequence of
+slots. In principle this should be rather easy, the code can handle
+sequences of variable sizes, so shrinking the sequence size of slots
+to one would solve the problem. This will, however, need some tweaks
+and modifications of non-trivial code, but is something that should be
+looked at in the future.
+
+By increasing the size of identifiers, at least on 64-bit machines
+(which isn't as easy as it first might seem) we get further room for
+improvement. Besides the obvious improvement of not reusing
+identifiers as fast as we currently do, it makes it possible to
+further avoid contention when inserting elements in the table. At
+least if we drop this ordering property, which isn't that useful
+anyway.
+
+### Some Benchmark Results ###
+
+In order to test modifications of the process table we ran a couple of
+benchmarks where lots of processes are spawned and terminated
+simultaneously, and got a speedup of between 150-200%. Running a
+similar benchmark but with ports we got a speedup of about 130%.
+
+The BIF `erlang:is_process_alive/1` is the closest you can get to a
+process table lookup only. The BIF looks up the process corresponding
+to the process identifier passed as argument, and then checks if it is
+alive. By running multiple processes looping over this BIF checking
+the same process, we get a speedup between 20000-23000%. Conceptually
+this operation only involves read operations. In the implementation
+used in R16B also only read operations are performed, while the
+previous implementation needed to lock structures in order to read the
+data, suffering from both lock contention and contention due to
+modifications of cache lines used by lock internal data structures and
+the reference counter on the process being looked up.
+
+The benchmarks were run on a relatively new machine with an Intel i7
+quad core processor with hyper-threading using 8 schedulers. On a
+machine with more communication overhead and/or larger amount of
+logical processors the speedups are expected to be even larger.
diff --git a/erts/emulator/internal_doc/PortSignals.md b/erts/emulator/internal_doc/PortSignals.md
new file mode 100644
index 0000000000..b1afb7c5cb
--- /dev/null
+++ b/erts/emulator/internal_doc/PortSignals.md
@@ -0,0 +1,267 @@
+Port Signals
+============
+
+Problems
+--------
+
+Erlang ports conceptually are very similar to Erlang processes. Erlang
+processes execute Erlang code in the virtual machine, while an Erlang
+port executes native code typically used for communication with the
+outside world. For example, when an Erlang process wants to
+communicate using TCP over the network, it communicates via an Erlang
+port implementing the TCP socket interface in native code. Both Erlang
+Processes and Ports communicate using asynchronous signaling. The
+native code executed by an Erlang port is a collection of callback
+functions, called a driver. Each callback more or less implements the
+code of a signal to, or from the port.
+
+Even though processes and ports conceptually always have been very
+similar, the implementations have been very different. Originally,
+more or less all port signals were handled synchronously at the time
+they occurred. Very early in the development of the SMP support for
+the runtime system we recognized that this was a huge problem for
+signals between ports and the outside world. That is, I/O events to
+and from the outside world, or I/O signals. This was one of the first
+things that had to be rewritten in order to be able to do I/O in
+parallel at all. The solution was to implement scheduling of these
+signals. I/O signals corresponding to different ports could then be
+executed in parallel on different scheduler threads. Signals from
+processes to ports were not as big of a problem as the I/O signals, and
+the implementation of those was left as they were.
+
+Each port is protected by its own lock to protect against simultaneous
+execution in multiple threads. Previously when a process, executing on
+a scheduler thread, sent a port a signal, it locked the port lock and
+synchronously executed the code corresponding to the signal. If the
+lock was busy, the scheduler thread blocked waiting until it could
+lock the lock. If multiple processes executing simultaneously on
+different scheduler threads, sent signals to the same port, schedulers
+suffered from heavy lock contention. Such contention could also occur
+between I/O signals for the port executing on one scheduler thread,
+and a signal from a process to the port executing on another scheduler
+thread. Besides the contention issues, we also lose potential work to
+execute in parallel on different scheduler threads. This since the
+process sending the *asynchronous* signal is blocked while the code
+implementing the signal is executed synchronously.
+
+Solution
+--------
+
+In order to prevent multiple schedulers from trying to execute signals
+to/from the same port simultaneously, we need to be able to ensure
+that all signals to/from a port are executed in sequence on one
+scheduler. More or less, the only way to do this is to schedule all
+types of signals. Signals corresponding to a port can then be executed
+in sequence by one single scheduler thread. If only one thread tries
+to execute the port, no contention will appear on the port
+lock. Besides getting rid of the contention, processes sending signals
+to the port can also continue execution of their own Erlang code on
+other schedulers at the same time as the signaling code is executing
+on another scheduler.
+
+When implementing this there are a couple of important properties that
+we either need, or want to preserve:
+
+*   Signal ordering guarantee. Signals from process `X` to port `Y`,
+    *must* be delivered to `Y` in the same order as sent from `X`.
+
+*   Signal latency. Due to the previous synchronous implementation,
+    latency of signals sent from processes to ports has usually been
+    very low. During contention the latency has of course
+    increased. Users expect latency of these signals to be low, a
+    sudden increase in latency would not be appreciated by our users.
+
+*   Compatible flow control. Ports have for a very long time had the
+    possibility to use the busy port functionality when implementing
+    flow control. One may argue that this functionality fits very badly
+    with the conceptually completely asynchronous signaling, but the
+    functionality has been there for ages and is expected to be
+    there. When a port sets itself into a busy state, `command`
+    signals should not be delivered, and senders of such signals
+    should suspend until the port sets itself in a not busy state.
+
+### Scheduling of Port Signals ###
+
+A run queue has four queues for processes of different priority and
+one queue for ports. The scheduler thread associated with the run
+queue switches evenly between execution of processes and execution of
+ports while both processes and ports exist in the queue. This is not
+completely true, but not important for this discussion. A port that is
+in a run queue also has a queue of tasks to execute. Each task
+corresponds to an in- or outgoing signal. When the port is selected
+for execution each task will be executed in sequence. The run queue
+locks not only protected the queues of ports, but also the queues of
+port tasks.
+
+Since we go from a state where I/O signals are the only port related
+signals scheduled, to a state where potentially all port related
+signals may be scheduled we may drastically increase the load on the
+run queue lock. The amount of scheduled port tasks very much depends on
+the Erlang application executing, which we do not control, and we do
+not want to get increased contention on the run queue locks. We
+therefore need another approach of protecting the port task queue.
+
+#### Task Queue ####
+
+We chose a "semi locked" approach, with one public locked task queue,
+and a private, lock free, queue like, task data structure. This "semi
+locked" approach is similar to how the message boxes of processes are
+managed. The lock is port specific and only used for protection of
+port tasks, so the run queue lock is now needed in more or less the
+same way for ports as for processes. This ensures that we won't see an
+increased lock contention on run queue locks due to this rewrite of
+the port functionality.
+
+When an executing port runs out of work to execute in the private task
+data structure, it moves the public task queue into the private task
+data structure while holding the lock. Once tasks have been moved to
+the private data structure no lock protects them. This way the port
+can continue working on tasks in the private data structure without
+having to fight for the lock.
+
+I/O signals may however be aborted. This could be solved by letting
+the port specific scheduling lock also protect the private task data
+structure, but then the port very frequently would have to fight with
+others enqueueing new tasks. In order to handle this while keeping the
+private task data structure lock free, we use a similar "non
+aggressive" approach as we use when handling processes that get
+suspended while in the run queue. Instead of removing the aborted port
+task, we just mark it as aborted using an atomic memory
+operation. When a task is selected for execution, we first verify that
+it has not been aborted. If aborted, we just drop the task.
+
+A task that can be aborted is referred via another data structure from
+other parts of the system, so that a thread that needs to abort the
+task can reach it. In order to be sure to safely deallocate a task
+that is no longer used, we first clear this reference and then use the
+thread progress functionality in order to make sure no references can
+exist to the task. Unfortunately, also unmanaged threads might abort
+tasks. This is very infrequent, but might occur. This could be handled
+locally for each port, but would require extra information in each
+port structure which very infrequently would be used. Instead of
+implementing this in each port, we implemented general functionality
+that can be used from unmanaged threads to delay thread progress.
+
+The private "queue like" task data structure could have been an
+ordinary queue if it wasn't for the busy port functionality. When the
+port has flagged itself as busy, `command` signals are not allowed to
+be delivered and need to be blocked. Other signals sent from the same
+sender following a `command` signal that has been blocked also have to
+be blocked; otherwise, we would violate the ordering guarantee. At the
+same time, other signals that have no dependencies to blocked
+`command` signals are expected to be delivered.
+
+The above requirements make the private task data structure a rather
+complex data structure. It has a queue of unprocessed tasks, and a
+busy queue. The busy queue contains blocked tasks corresponding to
+`command` signals, and tasks with dependencies to such tasks. The busy
+queue is accompanied by a table over blocked tasks based on sender
+with a reference to the last task in the busy queue from a specific
+sender. This since we need to check for dependencies when new tasks are
+processed in the queue of unprocessed tasks. When a new task is
+processed that needs to be blocked it isn't enqueued at the end of the
+busy queue, but instead directly after the last task with the same
+sender. This in order to easily be able to detect when we have tasks
+that no longer have any dependencies to tasks corresponding to
+`command` signals which should be moved out of the busy queue. When
+the port executes, it switches between processing tasks from the busy
+queue, and processing directly from the unprocessed queue based on its
+busy state. When processing directly from the unprocessed queue it
+might, of course, have to move a task into the busy queue instead of
+executing it.
+
+#### Busy Port Queue ####
+
+Since it is the port itself which decides when it is time to enter a
+busy state, it needs to be executing in order to enter the busy
+state. As a result of `command` signals being scheduled, we may get
+into a situation where the port gets flooded by a huge amount of
+`command` signals before it even gets a chance to set itself into a
+busy state. This since it has not been scheduled for execution
+yet. That is, under these circumstances the busy port functionality
+loses the flow control properties it was intended to provide.
+
+In order to solve this, we introduced a new busy feature, namely "busy
+port queue". The port has a limit of `command` data that is allowed to
+be enqueued in the task queue. When this limit is reached, the port
+will automatically enter a busy port queue state. When in this state,
+senders of `command` signals will be suspended, but `command` signals
+will still be delivered to the port unless it is also in a busy port
+state. This limit is known as the high limit.
+
+There is also a low limit. When the amount of queued `command` data
+falls below this limit and the port is in a busy port queue state, the
+busy port queue state is automatically disabled. The low limit should
+typically be significantly lower than the high limit in order to
+prevent frequent oscillation around the busy port queue state.
+
+By introduction of this new busy state we still can provide the flow
+control. Old drivers do not even have to be changed. The limits can,
+however, be configured and even disabled by the port. By default the
+high limit is 8 KB and the low limit is 4 KB.
+
+### Preparation of Signal Send ###
+
+Previously all operations sending signals to ports began by acquiring
+the port lock, then performed preparations for sending the signal, and
+then finally sent the signal. The preparations typically included
+inspecting the state of the port, and preparing the data to pass along
+with the signal. The preparation of data is frequently quite time
+consuming, and did not really depend on the port. That is we would
+like to do this without having the port lock locked.
+
+In order to improve this, state information was re-organized in the
+port structure, so that we can access it using atomic memory
+operations. This together with the new port table implementation,
+enabled us to lookup the port and inspect the state before acquiring
+the port lock, which in turn made it possible to perform preparations
+of signal data before acquiring the port lock.
+
+### Preserving Low Latency ###
+
+If we disregard the contended cases, we will inevitably get a higher
+latency when scheduling signals for execution at a later time than by
+executing the signal immediately. In order to preserve the low latency
+we now first check if this is a contended case or not. If it is, we
+schedule the signal for later execution; otherwise, we execute the
+signal immediately. It is a contended case if other signals already
+are scheduled on the port, or if we fail to acquire the port
+lock. That is we will not block waiting for the lock.
+
+Doing it this way we will preserve the low latency at the expense of
+lost potential parallel execution of the signal and other code in the
+process sending the signal. This default behaviour can however be
+changed on port basis or system wide, forcing scheduling of all
+signals from processes to ports that are not part of a synchronous
+communication. That is, an unconditional request/response pair of
+asynchronous signals. In this case there is no potential for parallelism,
+and by that no point forcing scheduling of the request signal.
+
+The immediate execution of signals may also cause a scheduler that is
+about to execute scheduled tasks to block waiting for the port
+lock. This is however more or less the only scenario where a scheduler
+needs to wait for the port lock. The maximum time it has to wait is
+the time it takes to execute one signal, since we always schedule
+signals when contention occurs.
+
+### Signal Operations ###
+
+Besides implementing the functionality enabling the scheduling,
+preparation of signal data without port lock, etc, each operation
+sending signals to ports had to be quite extensively re-written. This
+in order to move all sub-operations that can be done without the lock
+to a place before we have acquired the lock, and also since signals
+now sometimes are executed immediately and sometimes scheduled for
+execution at a later time which puts different requirements on the data
+to pass along with the signal.
+
+### Some Benchmark Results ###
+
+When running some simple benchmarks where contention only occurs due to
+I/O signals contending with signals from one single process we got a
+speedup of 5-15%. When multiple processes send signals to one single
+port the improvements can be much larger, but the scenario with one
+process contending with I/O is the most common one.
+
+The benchmarks were run on a relatively new machine with an Intel i7
+quad core processor with hyper-threading using 8 schedulers.
+\ No newline at end of file
diff --git a/erts/emulator/internal_doc/ProcessManagementOptimizations.md b/erts/emulator/internal_doc/ProcessManagementOptimizations.md
new file mode 100644
index 0000000000..9e83633bef
--- /dev/null
+++ b/erts/emulator/internal_doc/ProcessManagementOptimizations.md
@@ -0,0 +1,172 @@
+Process Management Optimizations
+================================
+
+Problems
+--------
+
+Early versions of the SMP support for the runtime system completely
+relied on locking in order to protect data accesses from multiple
+threads. In some cases this isn't that problematic, but in some cases
+it really is. It complicates the code, ensuring all locks needed are
+actually held, and ensuring that all locks are acquired in such an
+order that no deadlock occurs. Acquiring locks in the right order often
+also involves releasing locks held, forcing threads to reread data
+already read. A good recipe for creation of bugs. Trying to use more
+fine-grained locking in order to increase possible parallelism in the
+system makes the complexity situation even worse. Having to acquire a
+bunch of locks when doing operations also often causes heavy lock
+contention which causes poor scalability.
+
+Management of processes internally in the runtime system suffered from
+these problems. When changing state on a process, for example from
+`waiting` to `runnable`, a lock on the process needed to be
+locked. When inserting a process into a run queue also a lock
+protecting the run queue had to be locked. When migrating a process
+from one run queue to another run queue, locks on both run queues and
+on the process had to be locked.
+
+This last example is a quite common case during normal
+operation. For example, when a scheduler thread runs out of work it
+tries to steal work from another scheduler thread's run queue. When
+searching for a victim to steal from there was a lot of juggling of
+run queue locks involved, and during the actual theft finalized by
+having to lock both run queues and the process. When one scheduler
+runs out of work, often others also do, causing lots of lock
+contention.
+
+Solution
+--------
+
+### Process ###
+
+In order to avoid these situations we wanted to be able to do most of
+the fundamental operations on a process without having to acquire a
+lock on the process. Some examples of such fundamental operations are,
+moving a process between run queues, detecting if we need to insert it
+into a run queue or not, detecting if it is alive or not.
+
+All of this information in the process structure that was needed by
+these operations was protected by the process `status` lock, but the
+information was spread across a number of fields. The fields used were
+typically state fields that could contain a small number of different
+states. By reordering this information a bit we could *easily* fit
+this information into a 32-bit wide field of bit flags (only 12 flags
+were needed). By moving this information we could remove five 32-bit
+wide fields and one pointer field from the process structure! The move
+also enabled us to easily read and change the state using atomic
+memory operations.
+
+### Run Queue ###
+
+As with processes we wanted to be able to do the most fundamental
+operations without having to acquire a lock on it. The most important
+being able to determine if we should enqueue a process in a specific
+run queue or not. This involves being able to read actual load, and
+load balancing information.
+
+The load balancing functionality is triggered at repeated fixed
+intervals. The load balancing more or less strives to even out run
+queue lengths over the system. When balancing is triggered,
+information about every run queue is gathered, migration paths and
+run queue length limits are set up. Migration paths and limits are
+fixed until the next balancing has been done. The most important
+information about each run queue is the maximum run queue length since
+last balancing. All of this information was previously stored in the
+run queues themselves.
+
+When a process has become runnable, for example due to reception of a
+message, we need to determine which run queue to enqueue it
+in. Previously this at least involved locking the run queue that the
+process currently was assigned to while holding the status lock on the
+process. Depending on load we sometimes also had to acquire a lock on
+another run queue in order to be able to determine if it should be
+migrated to that run queue or not.
+
+In order to be able to decide which run queue to use without having to
+lock any run queues, we moved all fixed balancing information out of
+the run queues into a global memory block. That is, migration paths
+and run queue limits. Information that needs to be frequently updated,
+like for example maximum run queue length, was kept in the run queue,
+but instead of operating on this information under locks we now use
+atomic memory operations when accessing this information. This made it
+possible to first determine which run queue to use, without locking
+any run queues, and when decided, lock the chosen run queue and insert
+the process.
+
+#### Fixed Balancing Information ####
+
+When determining which run queue to choose we need to read the fixed
+balancing information that we moved out of the run queues. This
+information is global, read only between load balancing operations,
+but will be changed during a load balancing. We do not want to
+introduce a global lock that needs to be acquired when accessing this
+information. A reader optimized rwlock could avoid some of the
+overhead since the data is most frequently read, but it would
+unavoidably cause disruption during load balancing, since this
+information is very frequently read. The likelihood of a large
+disruption due to this also increases as the number of schedulers grows.
+
+Instead of using a global lock protecting modifications of this
+information, we write a completely new version of it at each load
+balancing. The new version is written in another memory block than the
+previous one, and published by issuing a write memory barrier and then
+storing a pointer to the new memory block in a global variable using
+an atomic write operation.
+
+When schedulers need to read this information, they read the pointer
+to currently used information using an atomic read operation, and then
+issue a data dependency read barrier, which on most architectures is a
+no-op. That is, it is very little overhead getting access to this
+information.
+
+Instead of allocating and deallocating memory blocks for the different
+versions of the balancing information we keep old memory blocks and
+reuse them when it is safe to do so. In order to be able to determine
+when it is safe to reuse a block we use the thread progress
+functionality, ensuring that no threads have any references to the
+memory block when we reuse it.
+
+#### Be Less Aggressive ####
+
+We implemented a test version using lock free run queues. This
+implementation did however not perform as well as the version using
+one lock per run queue. The reason for this was not investigated
+enough to say why this was. Since the locked version performed better
+we kept it, at least for now. The lock free version, however, forced
+us to use other solutions, some of them we kept.
+
+Previously when a process that was in a run queue got suspended, we
+removed it from the queue straight away. This involved locking the
+process, locking the run queue, and then unlinking it from the double
+linked list implementing the queue. Removing a process from a lock
+free queue gets really complicated. Instead of removing it from the
+queue, we just leave it in the queue and mark it as suspended. When
+later selected for execution we check if the process is suspended; if
+so, we just drop it. During its time in the queue, it might also get
+resumed again; if so, we execute it when it gets selected for execution.
+
+By keeping this part when reverting back to a locked implementation,
+we could remove a pointer field in each process structure, and avoid
+unnecessary operations on the process and the queue which might cause
+contention.
+
+### Combined Modifications ###
+
+By combining the modifications of the process state management and the
+run queue management, we can do large parts of the work involved when
+managing processes with regards to scheduling and migration without
+having any locks locked at all. In these situations we previously had
+to have multiple locks locked. This of course caused a lot of rewrites
+across large parts of the runtime system, but the rewrite both
+simplified code and eliminated locking at a number of places. The
+major benefit is, of course, reduced contention.
+
+### A Benchmark Result ###
+
+When running the chameneosredux benchmark, schedulers frequently run
+out of work trying to steal work from each other. That is, either
+succeeding in migrating, or trying to migrate processes which is a
+scenario which we wanted to optimize. By the introduction of these
+improvements, we got a speedup of 25-35% when running this benchmark
+on a relatively new machine with an Intel i7 quad core processor with
+hyper-threading using 8 schedulers.
+\ No newline at end of file
diff --git a/erts/emulator/internal_doc/ThreadProgress.md b/erts/emulator/internal_doc/ThreadProgress.md
new file mode 100644
index 0000000000..6118bcf0f6
--- /dev/null
+++ b/erts/emulator/internal_doc/ThreadProgress.md
@@ -0,0 +1,308 @@
+Thread Progress
+===============
+
+Problems
+--------
+
+### Knowing When Threads Have Completed Accesses to a Data Structure ###
+
+When multiple threads access the same data structure you often need to
+know when all threads have completed their accesses. For example, in
+order to know when it is safe to deallocate the data structure. One
+simple way to accomplish this is to reference count all accesses to
+the data structure. The problem with this approach is that the cache
+line where the reference counter is located needs to be communicated
+between all involved processors. Such communication can become
+extremely expensive and will scale poorly if the reference counter is
+frequently accessed. That is, we want to use some other approach of
+keeping track of threads than reference counting.
+
+### Knowing That Modifications of Memory are Consistently Observed ###
+
+Different hardware architectures have different memory models. Some
+architectures allow very aggressive reordering of memory accesses
+while other architectures only reorder a few specific cases. Common to
+all modern hardware is, however, that some type of reordering will
+occur. When using locks to protect all memory accesses made from
+multiple threads such reorderings will not be visible. The locking
+primitives will ensure that the memory accesses will be ordered. When
+using lock free algorithms one does however have to take this reordering
+made by the hardware into account.
+
+Hardware memory barriers or memory fences are instructions that can be
+used to enforce order between memory accesses. Different hardware
+architectures provide different memory barriers. Lock free algorithms
+need to use memory barriers in order to ensure that memory accesses
+are not reordered in such ways that the algorithm breaks down. Memory
+barriers are also expensive instructions, so you typically want to
+minimize the use of these instructions.
+
+Functionality Used to Address These Problems
+-------------------------------------------
+
+The "thread progress" functionality in the Erlang VM is used to
+address these problems. The name "thread progress" was chosen since we
+want to use it to determine when all threads in a set of threads have
+made such progress so that two specific events have taken place for
+all of them.
+
+The set of threads that we are interested in we call managed
+threads. The managed threads are the only threads that we get any
+information about. These threads *have* to frequently report
+progress. Not all threads in the system are able to frequently report
+progress. Such threads cannot be allowed in the set of managed threads
+and are called unmanaged threads. An example of unmanaged threads are
+threads in the async thread pool. Async threads can be blocked for
+very long times and by this be prevented from frequently reporting
+progress. Currently only scheduler threads and a couple of other
+threads are managed threads.
+
+### Thread Progress Events ###
+
+Any thread in the system may use the thread progress functionality in
+order to determine when the following events have occurred at least
+once in all managed threads:
+
+1.  The thread has returned from other code to a known state in the
+    thread progress functionality, which is independent of any other
+    code. 
+2.  The thread has executed a full memory barrier.
+
+These events, of course, need to occur ordered to other memory
+operations. The operation of determining this begins by initiating the
+thread progress operation. The thread that initiated the thread
+progress operation then polls for the completion of the
+operation. Both of these events must occur at least once *after* the
+thread progress operation has been initiated, and at least once
+*before* the operation has completed in each managed thread. This is
+ordered using communication via memory which makes it possible to draw
+conclusions about the memory state after the thread progress operation
+has completed. Let's call the progress made from initiation to
+completion "thread progress".
+
+Assuming that the thread progress functionality is efficient, a lot of
+algorithms can both be simplified and made more efficient than using
+the first approach that comes to mind. A couple of examples follow.
+
+By being able to determine when the first event above has occurred we
+can easily know when all managed threads have completed accesses to a
+data structure. This can be determined the following way. We have an
+implementation of some functionality `F` using a data structure
+`D`. The reference to `D` is always looked up before `D` is being
+accessed, and the references to `D` are always dropped before we leave
+the code implementing `F`. If we remove the possibility to look up `D`
+and then wait until the first event has occurred in all managed
+threads, no managed threads can have any references to the data
+structure `D`. This could for example have been achieved by using
+reference counting, but the cache line containing the reference
+counter would in this case be ping ponged between all processors
+accessing `D` at every access.
+
+By being able to determine when the second event has occurred it is
+quite easy to do complex modifications of memory that need to be seen
+consistently by other threads without having to resort to locking. By
+doing the modifications, then issuing a full memory barrier, then waiting
+until the second event has occurred in all managed threads, and then
+publishing the modifications, we know that all managed threads reading
+this memory will get a consistent view of the modifications. Managed
+threads reading this will not have to issue any extra memory barriers
+at all.
+
+Implementation of the Thread Progress Functionality
+---------------------------------------------------
+
+### Requirement on the Implementation ###
+
+In order to be able to determine when all managed threads have reached
+the states that we are interested in we need to communicate between
+all involved threads. We of course want to minimize this
+communication.
+
+We also want threads to be able to determine when thread progress has
+been made relatively fast. That is, we need to have some balance
+between communication overhead and time to complete the operation.
+
+### API ###
+
+I will only present the most important functions in the API here.
+
+*   `ErtsThrPrgrVal erts_thr_progress_later(void)` - Initiation of the
+    operation. The thread progress value returned can be used to test
+    for completion of the operation.
+*   `int erts_thr_progress_has_reached(ErtsThrPrgrVal val)` - Returns
+    a non zero value when we have reached the thread progress value
+    passed as argument. That is, when a non zero value is returned the
+    operation has completed.
+
+When a thread calls `my_val = erts_thr_progress_later()` and waits for
+`erts_thr_progress_has_reached(my_val)` to return a non zero value it
+knows that thread progress has been made.
+
+While waiting for `erts_thr_progress_has_reached()` to return a non
+zero value we typically do not want to block waiting, but instead want
+to continue working with other stuff. If we run out of other stuff to
+work on we typically do want to block waiting until we have reached
+the thread progress value that we are waiting for. In order to be able
+to do this we provide functionality for waking up a thread when a
+certain thread progress value has been reached:
+
+*   `void erts_thr_progress_wakeup(ErtsSchedulerData *esdp,
+    ErtsThrPrgrVal val)` - Request wake up. The calling thread will be
+    woken when thread progress has reached val. 
+
+Managed threads frequently need to update their thread progress by
+calling the following functions:
+
+*   `int erts_thr_progress_update(ErtsSchedulerData *esdp)` - Update
+    thread progress. If a non zero value is returned
+    `erts_thr_progress_leader_update()` has to be called without any
+    locks held.
+*   `int erts_thr_progress_leader_update(ErtsSchedulerData *esdp)` -
+    Leader update thread progress.
+
+Unmanaged threads can delay thread progress being made:
+
+*   `ErtsThrPrgrDelayHandle erts_thr_progress_unmanaged_delay(void)` -
+    Delay thread progress.
+*   `void erts_thr_progress_unmanaged_continue(ErtsThrPrgrDelayHandle
+    handle)` - Let thread progress continue.
+
+Scheduler threads can schedule an operation to be executed by the
+scheduler itself when thread progress has been made:
+
+* `void erts_schedule_thr_prgr_later_op(void (*funcp)(void *), void
+  *argp, ErtsThrPrgrLaterOp *memp)` - Schedule a call to `funcp`. The
+  call `(*funcp)(argp)` will be executed when thread progress has been
+  made since the call to `erts_schedule_thr_prgr_later_op()` was
+  made.
+
+### Implementation ###
+
+In order to determine when the events have happened we use a global
+counter that is incremented when all managed threads have called
+`erts_thr_progress_update()` (or `erts_thr_progress_leader_update()`).
+This could naively be implemented using a "thread confirmed" counter.
+This would however cause an explosion of communication where all
+involved processors would need to communicate with each other at each
+update.
+
+Instead of confirming at a global location each thread confirms that
+it accepts an increment of the global counter in its own cache
+line. These confirmation cache lines are located in sequence in an
+array, and each confirmation cache line will only be written by one
+and only one thread. One of the managed threads always has the leader
+responsibility. This responsibility may jump between threads, but as
+long as there is some activity in the system always one of them will
+have the leader responsibility. The thread with the leader
+responsibility will call `erts_thr_progress_leader_update()` which
+will check that all other threads have confirmed an increment of the
+global counter before doing the increment of the global counter. The
+leader thread is the only thread reading the confirmation cache
+lines.
+
+Doing it this way we will get a communication pattern of information
+going from the leader thread out to all other managed threads and then
+back from the other threads to the leader thread. This since only the
+leader thread will write to the global counter and all other threads
+will only read it, and since each confirmation cache line will only
+be written by one specific thread and only read by the leader
+thread. When each managed thread is distributed over different
+processors, the communication between processors will be a reflection
+of this communication pattern between threads.
+
+The value returned from `erts_thr_progress_later()` equals the, by
+this thread, latest confirmed value plus two. The global value may be
+latest confirmed value or latest confirmed value minus one. In order
+to be certain that all other managed threads actually will call
+`erts_thr_progress_update()` at least once before we reach the value
+returned from `erts_thr_progress_later()`, the global counter plus one
+is not enough. This since all other threads may already have confirmed
+current global value plus one at the time when we call
+`erts_thr_progress_later()`. They are however guaranteed not to have
+confirmed global value plus two at this time.
+
+The above described implementation more or less minimizes the
+communication needed before we can increment the global counter. The
+amount of communication in the system due to the thread progress
+functionality however also depends on the frequency with which managed
+threads call `erts_thr_progress_update()`. Today each scheduler thread
+calls `erts_thr_progress_update()` more or less each time an Erlang
+process is scheduled out. One way of further reducing communication
+due to the thread progress functionality is to only call
+`erts_thr_progress_update()` every second, or third time an Erlang
+process is scheduled out, or even less frequently than that. However,
+by doing updates of thread progress less frequently all operations
+depending on the thread progress functionality will also take a longer
+time.
+
+#### Delay of Thread Progress by Unmanaged Threads ####
+
+In order to implement delay of thread progress from unmanaged threads
+we use two reference counters. One being `current` and one being
+`waiting`. When an unmanaged thread wants to delay thread progress it
+increments `current` and gets a handle back to the reference counter
+it incremented. When it later wants to enable continuation of thread
+progress it uses the handle to decrement the reference counter it
+previously incremented.
+
+When the leader thread is about to increment the global thread
+progress counter it verifies that the `waiting` counter is zero before
+doing so. If not zero, the leader isn't allowed to increment the
+global counter, and needs to wait before it can do this. When it is
+zero, it swaps the `waiting` and `current` counters before increasing
+the global counter. From now on the new `waiting` counter will
+decrease, so that it eventually will reach zero, making it possible to
+increment the global counter the next time. If we only used one
+reference counter it would potentially be held above zero for ever by
+different unmanaged threads.
+
+When an unmanaged thread increments the `current` counter it will not
+prevent the next increment of the global counter, but instead the
+increment after that. This is sufficient since the global counter
+needs to be incremented two times before thread progress has been
+made. It is also desirable not to prevent the first increment, since
+the likelihood increases that the delay is withdrawn before any
+increment of the global counter is delayed. That is, the operation
+will cause as little disruption as possible.
+
+However, this feature of delaying thread progress from unmanaged
+threads should preferably be used as little as possible, since heavy
+use of it will cause contention on the reference counter cache
+lines. The functionality is however very useful in code which normally
+only executes in managed threads, but which may under some infrequent
+circumstances be executed in other threads.
+
+#### Overhead ####
+
+The overhead caused by the thread progress functionality is more or
+less fixed using the same amount of schedulers regardless of the
+number of uses of the functionality. Already today quite a lot of
+functionality uses it, and we plan to use it even more. When rewriting
+old implementations of ERTS internal functionality to use the thread
+progress functionality, this implies removing communication in the old
+implementation. Otherwise there is simply no point in rewriting the old
+implementation to use the thread progress functionality. Since the
+thread progress overhead is more or less fixed, the rewrite will cause
+a reduction of the total communication in the system.
+
+##### An Example #####
+
+The main structure of an ETS table was originally managed using
+reference counting. Already a long time ago we replaced this strategy
+since the reference counter caused contention on each access of the
+table. The solution used was to schedule "confirm deletion" jobs on
+each scheduler in order to know when it was safe to deallocate the
+table structure of a removed table. These confirm deletion jobs needed
+to be allocated. That is, we had to allocate and deallocate as many
+blocks as schedulers in order to deallocate one block. This of course
+was quite an expensive operation, but we only needed to do this once
+when removing a table. It was more important to get rid of the
+contention on the reference counter which was present on every
+operation on the table.
+
+When the thread progress functionality had been introduced, we could
+remove the code implementing the "confirm deletion" jobs, and then
+just schedule a thread progress later operation which deallocates the
+structure. Besides simplifying the code a lot, we got an increase of
+more than 10% of the number of transactions per second handled on a
+mnesia tpcb benchmark executing on a quad core machine.
diff --git a/erts/emulator/internal_doc/Tracing.md b/erts/emulator/internal_doc/Tracing.md
new file mode 100644
index 0000000000..30bc5327a7
--- /dev/null
+++ b/erts/emulator/internal_doc/Tracing.md
@@ -0,0 +1,220 @@
+Non-blocking trace setting
+==========================
+
+Introduction
+------------
+
+Before OTP R16 when trace settings were changed by `erlang:trace_pattern`,
+all other execution in the VM was halted while the trace operation
+was carried out in single threaded mode. Similar to code loading, this
+can impose a severe problem for availability that grows with the
+number of cores.
+
+In OTP R16, trace breakpoints are set in the code without blocking the
+VM. Erlang processes may continue executing undisturbed in parallel
+during the entire operation. The same base technique is used as for
+code loading. A staging area of breakpoints is prepared and then made
+active with a single atomic operation.
+
+
+Redesign of Breakpoint Wheel
+----------------------------
+
+To make it easier to manage breakpoints without single threaded mode a
+redesign of the breakpoint mechanism has been made. The old
+"breakpoint wheel" data structure was a circular double-linked list of
+breakpoints for each instrumented function. It was invented before the
+SMP emulator. To support it in the SMP emulator, it was essentially
+expanded to one breakpoint wheel per scheduler. As more breakpoint
+types have been added, the implementation has become messy and hard
+to understand and maintain.
+
+In the new design the old wheel was dropped and instead replaced by
+one struct (`GenericBp`) to hold the data for all types of breakpoints
+for each instrumented function. A bit-flag field is used to indicate
+which types of break actions are enabled.
+
+
+Same Same but Different
+-----------------------
+Even though `trace_pattern` uses the same technique as the non-blocking
+code loading with replicated generations of data structures and an
+atomic switch, the implementations are quite separate from each
+other. One initial idea was to use the existing mechanism of code
+loading to do a dummy load operation that would make a copy of the
+affected modules. That copy could then be instrumented with
+breakpoints before making it reachable with the same atomic switch as
+done for code loading. This approach seems straightforward but has a
+number of shortcomings, one being the large memory footprint when many
+modules are instrumented. Another problem is how execution will reach
+the new instrumented code. Normally loaded code can only be reached
+through external function calls. Trace settings must be activated
+instantaneously without the need of external function calls.
+
+The chosen solution is instead for tracing to use the technique of
+replication applied on the data structures for breakpoints. Two
+generations of breakpoints are kept and identified by an index of 0 or 1.
+The global atomic variable `erts_active_bp_index` will determine
+which generation of breakpoints running code will use.
+
+### Atomicity Without Atomic Operations
+
+Not using the code loading generations (or any other code duplication)
+means that `trace_pattern` must at some point write to the active beam
+code in order for running processes to reach the staged breakpoint
+structures. This can be done with one single atomic write operation
+per instrumented function. The beam instruction words are however read
+with normal memory loads and not through the atomic API. The only
+guarantee we need is that the written instruction word is seen as
+atomic. Either fully written or not at all. This is true for word
+aligned write operations on all hardware architectures we use.
+
+
+Adding a new Breakpoint
+-----------------------
+This is a simplified sequence describing what `trace_pattern` goes
+through when adding a new breakpoint.
+
+1. Seize exclusive code write permission (suspend process until we get it).
+
+2. Allocate breakpoint structure `GenericBp` including both generations.
+   Set the active part as disabled with a zeroed flagfield. Save the original
+   instruction word in the breakpoint.
+
+3. Write a pointer to the breakpoint at offset -4 from the first
+   instruction "func_info" header.
+
+4. Set the staging part of the breakpoint as enabled with specified
+   breakpoint data.
+
+5. Wait for thread progress.
+
+6. Write a `op_i_generic_breakpoint` as the first instruction for the function.
+   This instruction will execute the breakpoint that it finds at offset -4.
+
+7. Wait for thread progress.
+
+8. Commit the breakpoint by switching `erts_active_bp_index`.
+
+9. Wait for thread progress.
+
+10. Prepare for next call to `trace_pattern` by updating the new staging part
+    (the old active) of the breakpoint to be identical to the new active part.
+
+11. Release code write permission and return from `trace_pattern`.
+
+
+The code write permission "lock" seized in step 1 is the same as used
+by code loading. This will ensure that only one process at a time can
+stage new trace settings but it will also prevent concurrent code
+loading and make sure we see a consistent view of the beam code during
+the entire sequence.
+
+Between step 6 and 8, running processes might execute the written
+`op_i_generic_breakpoint` instruction. They will get the breakpoint
+structure written in step 3, read `erts_active_bp_index` and execute
+the corresponding part of the breakpoint. Before the switch in step 8
+becomes visible they will however execute the disabled part of the
+breakpoint structure and do nothing other than executing the saved
+original instruction.
+
+
+To Update and Remove Breakpoints
+----------------------------------
+
+The above sequence did only describe adding a new breakpoint. We do
+basically the same sequence to update the settings of an existing
+breakpoint except that steps 2, 3 and 6 can be skipped as they have already been
+done.
+
+To remove a breakpoint some more steps are needed. The idea is to
+first stage the breakpoint as disabled, do the switch, wait for thread
+progress and then remove the disabled breakpoint by restoring the
+original beam instruction.
+
+Here is a more complete sequence that contains both adding, updating
+and removing breakpoints.
+
+1. Seize exclusive code write permission (suspend process until we get it).
+
+2. Allocate new breakpoint structures with a disabled active part and
+   the original beam instruction. Write a pointer to the breakpoint in
+   "func_info" header at offset -4.
+
+3. Update the staging part of all affected breakpoints. Disable
+   breakpoints that are to be removed.
+
+4. Wait for thread progress.
+
+5. Write a `op_i_generic_breakpoint` as the first instruction for all
+   functions with new breakpoints.
+
+6. Wait for thread progress.
+
+7. Commit all staged breakpoints by switching `erts_active_bp_index`.
+
+8. Wait for thread progress.
+
+
+9. Restore original beam instruction for disabled breakpoints.
+
+10. Wait for thread progress.
+
+11. Prepare for next call to `trace_pattern` by updating the new
+    staging area (the old active) for all enabled breakpoints.
+
+12. Deallocate disabled breakpoint structures.
+
+13. Release code write permission and return from `trace_pattern`.
+
+
+### All that Waiting for Thread Progress
+
+There are four rounds of waiting for thread progress in the above
+sequence. In the code loading sequence we sacrificed memory overhead
+of three generations to avoid a second round of thread progress. The
+latency of `trace_pattern` should not be such a big problem,
+however, as it is normally not called in a rapid sequence.
+
+The waiting in step 4 is to make sure all threads will see an updated
+view of the breakpoint structures once they become reachable through
+the `op_i_generic_breakpoint` instruction written in step 5.
+
+The waiting in step 6 is to make the activation of the new trace
+settings "as atomic as possible". Different cores might see the new
+value of `erts_active_bp_index` at different times as it is read
+without any memory barrier. But this is the best we can do without
+more expensive thread synchronization.
+
+The waiting in step 8 is to make sure we don't restore the original
+beam instructions for disabled breakpoints until we know that no
+thread is still accessing the old enabled part of a disabled
+breakpoint.
+
+The waiting in step 10 is to make sure no lingering thread is still
+accessing disabled breakpoint structures to be deallocated in step
+12.
+
+
+Global Tracing
+--------------
+
+Call tracing with `global` option only affects external function
+calls. This was earlier handled by inserting a special trace
+instruction in export entries without the use of breakpoints. With the
+new non-blocking tracing we want to avoid special handling for global
+tracing and make use of the staging and atomic switching within the
+breakpoint mechanism. The solution was to create the same type of
+breakpoint structure for a global call trace. The difference to local
+tracing is that we insert the `op_i_generic_breakpoint` instruction
+(with its pointer at offset -4) in the export entry rather than in the
+code.
+
+
+Future work
+-----------
+
+We still go to single threaded mode when new code is loaded for a
+module that is traced, or when loading code when there is a default
+trace pattern set. That is not impossible to fix, but that requires
+much closer cooperation between tracing BIFs and the loader BIFs.
diff --git a/erts/emulator/sys/unix/erl_unix_sys_ddll.c b/erts/emulator/sys/unix/erl_unix_sys_ddll.c
index 12c47d0088..8760b58839 100644
--- a/erts/emulator/sys/unix/erl_unix_sys_ddll.c
+++ b/erts/emulator/sys/unix/erl_unix_sys_ddll.c
@@ -101,7 +101,7 @@ void erl_sys_ddll_init(void) {
 /* 
  * Open a shared object
  */
-int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err)
+int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err)
 {
 #if defined(HAVE_DLOPEN)
     char* dlname; 
diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c
index 61f9f6a59a..59e34eb819 100644
--- a/erts/emulator/sys/unix/sys.c
+++ b/erts/emulator/sys/unix/sys.c
@@ -547,6 +547,25 @@ erts_sys_pre_init(void)
 #endif
 #endif /* USE_THREADS */
     erts_smp_atomic_init_nob(&sys_misc_mem_sz, 0);
+
+    {
+      /*
+       * Unfortunately we depend on fd 0,1,2 in the old shell code.
+       * So if for some reason we do not have those open when we start
+       * we have to open them here. Not doing this can cause the emulator
+       * to deadlock when reaping the fd_driver ports :(
+       */
+      int fd;
+      /* Make sure fd 0 is open */
+      if ((fd = open("/dev/null", O_RDONLY)) != 0)
+	close(fd);
+      /* Make sure fds 1 and 2 are open */
+      while (fd < 3) {
+	fd = open("/dev/null", O_WRONLY);
+      }
+      close(fd);
+    }
+
 }
 
 void
diff --git a/erts/emulator/sys/win32/erl_win32_sys_ddll.c b/erts/emulator/sys/win32/erl_win32_sys_ddll.c
index 2d3f073cc2..338f0d7386 100644
--- a/erts/emulator/sys/win32/erl_win32_sys_ddll.c
+++ b/erts/emulator/sys/win32/erl_win32_sys_ddll.c
@@ -59,7 +59,7 @@ void erl_sys_ddll_init(void) {
  * Open a shared object
  * Expecting 'full_name' as an UTF-8 string.
  */
-int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err)
+int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err)
 {
     HINSTANCE hinstance;
     int len;
diff --git a/erts/emulator/test/binary_SUITE.erl b/erts/emulator/test/binary_SUITE.erl
index a340a805b5..bce4278337 100644
--- a/erts/emulator/test/binary_SUITE.erl
+++ b/erts/emulator/test/binary_SUITE.erl
@@ -447,26 +447,26 @@ terms(Config) when is_list(Config) ->
               Sz1 when is_integer(Sz1), size(Bin1) =< Sz1 ->
                   ok
               end,
-		      Term = binary_to_term(Bin),
-		      Term = binary_to_term(Bin, [safe]),
+		      Term = binary_to_term_stress(Bin),
+		      Term = binary_to_term_stress(Bin, [safe]),
 		      Unaligned = make_unaligned_sub_binary(Bin),
-		      Term = binary_to_term(Unaligned),
-		      Term = binary_to_term(Unaligned, []),
-		      Term = binary_to_term(Bin, [safe]),
+		      Term = binary_to_term_stress(Unaligned),
+		      Term = binary_to_term_stress(Unaligned, []),
+		      Term = binary_to_term_stress(Bin, [safe]),
 		      BinC = erlang:term_to_binary(Term, [compressed]),
-		      Term = binary_to_term(BinC),
+		      Term = binary_to_term_stress(BinC),
 		      true = size(BinC) =< size(Bin),
 		      Bin = term_to_binary(Term, [{compressed,0}]),
 		      terms_compression_levels(Term, size(Bin), 1),
 		      UnalignedC = make_unaligned_sub_binary(BinC),
-		      Term = binary_to_term(UnalignedC)
+		      Term = binary_to_term_stress(UnalignedC)
 	      end,
     ?line test_terms(TestFun),
     ok.
 
 terms_compression_levels(Term, UncompressedSz, Level) when Level < 10 ->
     BinC = erlang:term_to_binary(Term, [{compressed,Level}]),
-    Term = binary_to_term(BinC),
+    Term = binary_to_term_stress(BinC),
     Sz = byte_size(BinC),
     true = Sz =< UncompressedSz,
     terms_compression_levels(Term, UncompressedSz, Level+1);
@@ -476,9 +476,9 @@ terms_float(Config) when is_list(Config) ->
     ?line test_floats(fun(Term) ->
 			      Bin0 = term_to_binary(Term),
 			      Bin0 = term_to_binary(Term, [{minor_version,0}]),
-			      Term = binary_to_term(Bin0),
+			      Term = binary_to_term_stress(Bin0),
 			      Bin1 = term_to_binary(Term, [{minor_version,1}]),
-			      Term = binary_to_term(Bin1),
+			      Term = binary_to_term_stress(Bin1),
 			      true = size(Bin1) < size(Bin0),
                   Size0 = erlang:external_size(Term),
                   Size00 = erlang:external_size(Term, [{minor_version, 0}]),
@@ -490,7 +490,7 @@ terms_float(Config) when is_list(Config) ->
 float_middle_endian(Config) when is_list(Config) ->
     %% Testing for roundtrip is not enough.
     ?line <<131,70,63,240,0,0,0,0,0,0>> = term_to_binary(1.0, [{minor_version,1}]),
-    ?line 1.0 = binary_to_term(<<131,70,63,240,0,0,0,0,0,0>>).
+    ?line 1.0 = binary_to_term_stress(<<131,70,63,240,0,0,0,0,0,0>>).
 
 external_size(Config) when is_list(Config) ->
     %% Build a term whose external size only fits in a big num (on 32-bit CPU).
@@ -608,10 +608,10 @@ bad_binary_to_term(Config) when is_list(Config) ->
     ok.
 
 bad_bin_to_term(BadBin) ->
-    {'EXIT',{badarg,_}} = (catch binary_to_term(BadBin)).
+    {'EXIT',{badarg,_}} = (catch binary_to_term_stress(BadBin)).
 
 bad_bin_to_term(BadBin,Opts) ->
-    {'EXIT',{badarg,_}} = (catch binary_to_term(BadBin,Opts)).
+    {'EXIT',{badarg,_}} = (catch binary_to_term_stress(BadBin,Opts)).
 
 safe_binary_to_term2(doc) -> "Test safety options for binary_to_term/2";
 safe_binary_to_term2(Config) when is_list(Config) ->
@@ -622,7 +622,7 @@ safe_binary_to_term2(Config) when is_list(Config) ->
     BadRef = <<131,114,0,3,BadHostAtom/binary,0,<<0,0,0,255>>/binary,
 	      Empty/binary,Empty/binary>>,
     ?line bad_bin_to_term(BadRef, [safe]), % good ref, with a bad atom
-    ?line fullsweep_after = binary_to_term(<<131,100,0,15,"fullsweep_after">>, [safe]), % should be a good atom
+    ?line fullsweep_after = binary_to_term_stress(<<131,100,0,15,"fullsweep_after">>, [safe]), % should be a good atom
     BadExtFun = <<131,113,100,0,4,98,108,117,101,100,0,4,109,111,111,110,97,3>>,
     ?line bad_bin_to_term(BadExtFun, [safe]),
     ok.
@@ -679,14 +679,14 @@ corrupter0(Term) ->
 
 corrupter(Bin, Pos) when Pos >= 0 ->
     ?line {ShorterBin, Rest} = split_binary(Bin, Pos),
-    ?line catch binary_to_term(ShorterBin), %% emulator shouldn't crash
+    ?line catch binary_to_term_stress(ShorterBin), %% emulator shouldn't crash
     ?line MovedBin = list_to_binary([ShorterBin]),
-    ?line catch binary_to_term(MovedBin), %% emulator shouldn't crash
+    ?line catch binary_to_term_stress(MovedBin), %% emulator shouldn't crash
 
     %% Bit faults, shouldn't crash
     <<Byte,Tail/binary>> = Rest,
     Fun = fun(M) -> FaultyByte = Byte bxor M,                    
-		    catch binary_to_term(<<ShorterBin/binary,
+		    catch binary_to_term_stress(<<ShorterBin/binary,
 					  FaultyByte, Tail/binary>>) end,
     ?line lists:foreach(Fun,[1,2,4,8,16,32,64,128,255]),    
     ?line corrupter(Bin, Pos-1);
@@ -700,7 +700,7 @@ more_bad_terms(Config) when is_list(Config) ->
     ?line ok = io:format("File: ~s\n", [BadFile]),
     ?line case file:read_file(BadFile) of
 	      {ok,Bin} ->
-		  ?line {'EXIT',{badarg,_}} = (catch binary_to_term(Bin)),
+		  ?line {'EXIT',{badarg,_}} = (catch binary_to_term_stress(Bin)),
 		  ok;
 	      Other ->
 		  ?line ?t:fail(Other)
@@ -709,7 +709,7 @@ more_bad_terms(Config) when is_list(Config) ->
 otp_5484(Config) when is_list(Config) ->
     ?line {'EXIT',_} =
 	(catch
-	     binary_to_term(
+	     binary_to_term_stress(
 	       <<131,
 		104,2,				%Tuple, 2 elements
 		103,				%Pid
@@ -722,7 +722,7 @@ otp_5484(Config) when is_list(Config) ->
 
     ?line {'EXIT',_} =
 	(catch
-	     binary_to_term(
+	     binary_to_term_stress(
 	       <<131,
 		104,2,				%Tuple, 2 elements
 		103,				%Pid
@@ -734,13 +734,13 @@ otp_5484(Config) when is_list(Config) ->
 
     ?line {'EXIT',_} =
 	(catch
-	     binary_to_term(
+	     binary_to_term_stress(
 	       %% A old-type fun in a list containing a bad creator pid.
 	       <<131,108,0,0,0,1,117,0,0,0,0,103,100,0,13,110,111,110,111,100,101,64,110,111,104,111,115,116,255,255,0,25,255,0,0,0,0,100,0,1,116,97,0,98,6,142,121,72,106>>)),
 
     ?line {'EXIT',_} =
 	(catch
-	     binary_to_term(
+	     binary_to_term_stress(
 	       %% A new-type fun in a list containing a bad creator pid.
 	       %% 
 	       <<131,
@@ -752,7 +752,7 @@ otp_5484(Config) when is_list(Config) ->
 
     ?line {'EXIT',_} =
 	(catch
-	     binary_to_term(
+	     binary_to_term_stress(
 	       %% A new-type fun in a list containing a bad module.
 	       <<131,
 		108,0,0,0,1,			%List, 1 element
@@ -763,7 +763,7 @@ otp_5484(Config) when is_list(Config) ->
 
     ?line {'EXIT',_} =
 	(catch
-	     binary_to_term(
+	     binary_to_term_stress(
 	       %% A new-type fun in a list containing a bad index.
 	       <<131,
 		108,0,0,0,1,			%List, 1 element
@@ -775,7 +775,7 @@ otp_5484(Config) when is_list(Config) ->
 
     ?line {'EXIT',_} =
 	(catch
-	     binary_to_term(
+	     binary_to_term_stress(
 	       %% A new-type fun in a list containing a bad unique value.
 	       <<131,
 		108,0,0,0,1,			%List, 1 element
@@ -788,46 +788,46 @@ otp_5484(Config) when is_list(Config) ->
 
     %% An absurdly large atom.
     ?line {'EXIT',_} = 
-	(catch binary_to_term(iolist_to_binary([<<131,100,65000:16>>|
+	(catch binary_to_term_stress(iolist_to_binary([<<131,100,65000:16>>|
 						lists:duplicate(65000, 42)]))),
 
     %% Longer than 255 characters.
     ?line {'EXIT',_} = 
-	(catch binary_to_term(iolist_to_binary([<<131,100,256:16>>|
+	(catch binary_to_term_stress(iolist_to_binary([<<131,100,256:16>>|
 						lists:duplicate(256, 42)]))),
 
     %% OTP-7218. Thanks to Matthew Dempsky. Also make sure that we
     %% cover the other error cases for external funs (EXPORT_EXT).
     ?line {'EXIT',_} = 
-	(catch binary_to_term(
+	(catch binary_to_term_stress(
 		 <<131,
 		  113,				%EXPORT_EXP
 		  97,13,			%Integer: 13
 		  97,13,			%Integer: 13
 		  97,13>>)),			%Integer: 13
     ?line {'EXIT',_} = 
-	(catch binary_to_term(
+	(catch binary_to_term_stress(
 		 <<131,
 		  113,				%EXPORT_EXP
 		  100,0,1,64,			%Atom: '@'
 		  97,13,			%Integer: 13
 		  97,13>>)),			%Integer: 13
     ?line {'EXIT',_} = 
-	(catch binary_to_term(
+	(catch binary_to_term_stress(
 		 <<131,
 		  113,				%EXPORT_EXP
 		  100,0,1,64,			%Atom: '@'
 		  100,0,1,64,			%Atom: '@'
 		  106>>)),			%NIL
     ?line {'EXIT',_} = 
-	(catch binary_to_term(
+	(catch binary_to_term_stress(
 		 <<131,
 		  113,				%EXPORT_EXP
 		  100,0,1,64,			%Atom: '@'
 		  100,0,1,64,			%Atom: '@'
 		  98,255,255,255,255>>)),	%Integer: -1
     ?line {'EXIT',_} = 
-	(catch binary_to_term(
+	(catch binary_to_term_stress(
 		 <<131,
 		  113,				%EXPORT_EXP
 		  100,0,1,64,			%Atom: '@'
@@ -835,7 +835,7 @@ otp_5484(Config) when is_list(Config) ->
 		  113,97,13,97,13,97,13>>)),	%fun 13:13/13
 
     %% Bad funs.
-    ?line {'EXIT',_} = (catch binary_to_term(fake_fun(0, lists:seq(0, 256)))),
+    ?line {'EXIT',_} = (catch binary_to_term_stress(fake_fun(0, lists:seq(0, 256)))),
     ok.
 
 fake_fun(Arity, Env0) ->
@@ -869,7 +869,7 @@ try_bad_lengths(B) ->
 try_bad_lengths(B, L) when L > 16#FFFFFFF0 ->
     Bin = <<B/binary,L:32>>,
     io:format("~p\n", [Bin]),
-    {'EXIT',_} = (catch binary_to_term(Bin)),
+    {'EXIT',_} = (catch binary_to_term_stress(Bin)),
     try_bad_lengths(B, L-1);
 try_bad_lengths(_, _) -> ok.
 
@@ -923,7 +923,7 @@ otp_6817_try_bin(Bin) ->
     %% If the bug is present, the heap pointer will moved when the invalid term
     %% is found and we will have a linked list passing through the limbo area
     %% between the heap top and the stack pointer.
-    catch binary_to_term(Bin),
+    catch binary_to_term_stress(Bin),
 
     %% If the bug is present, we will overwrite the pointers in the limbo area.
     Filler = erlang:make_tuple(1024, 16#3FA),
@@ -935,7 +935,7 @@ otp_6817_try_bin(Bin) ->
 otp_8117(doc) -> "Some bugs in binary_to_term when 32-bit integers are negative.";
 otp_8117(suite) -> [];
 otp_8117(Config) when is_list(Config) ->
-    [otp_8117_do(Op,-(1 bsl N)) || Op <- ['fun',list,tuple],
+    [otp_8117_do(Op,-(1 bsl N)) || Op <- ['fun',named_fun,list,tuple],
 				   N <- lists:seq(0,31)],
     ok.
 
@@ -944,6 +944,11 @@ otp_8117_do('fun',Neg) ->
     FunBin = term_to_binary(fun() -> ok end),
     ?line <<B1:27/binary,_NumFree:32,Rest/binary>> = FunBin,   
     ?line bad_bin_to_term(<<B1/binary,Neg:32,Rest/binary>>);
+otp_8117_do(named_fun,Neg) ->
+    %% Named fun with negative num_free
+    FunBin = term_to_binary(fun F() -> F end),
+    ?line <<B1:27/binary,_NumFree:32,Rest/binary>> = FunBin,
+    ?line bad_bin_to_term(<<B1/binary,Neg:32,Rest/binary>>);
 otp_8117_do(list,Neg) ->
     %% List with negative length
     ?line bad_bin_to_term(<<131,104,2,108,Neg:32,97,11,104,1,97,12,97,13,106,97,14>>);
@@ -1233,7 +1238,7 @@ bit_sized_binary_sizes(Config) when is_list(Config) ->
 bsbs_1(A) ->
     BinSize = 32+A,
     io:format("A: ~p BinSize: ~p", [A,BinSize]),
-    Bin = binary_to_term(<<131,$M,5:32,A,0,0,0,0,0>>),
+    Bin = binary_to_term_stress(<<131,$M,5:32,A,0,0,0,0,0>>),
     BinSize = bit_size(Bin).
 
 deep(Config) when is_list(Config) ->
@@ -1250,7 +1255,7 @@ deep(Config) when is_list(Config) ->
 
 deep_roundtrip(T) ->
     B = term_to_binary(T),
-    T = binary_to_term(B).
+    T = binary_to_term_stress(B).
 
 obsolete_funs(Config) when is_list(Config) ->
     erts_debug:set_internal_state(available_internal_state, true),
@@ -1285,29 +1290,29 @@ obsolete_fun(Fun) ->
     Tuple = no_fun_roundtrip(Fun).
 
 no_fun_roundtrip(Term) ->
-    binary_to_term(erts_debug:get_internal_state({term_to_binary_no_funs,Term})).
+    binary_to_term_stress(erts_debug:get_internal_state({term_to_binary_no_funs,Term})).
 
 %% Test non-standard encodings never generated by term_to_binary/1
 %% but recognized by binary_to_term/1.
 
 robustness(Config) when is_list(Config) ->
-    ?line [] = binary_to_term(<<131,107,0,0>>),	%Empty string.
-    ?line [] = binary_to_term(<<131,108,0,0,0,0,106>>),	%Zero-length list.
+    ?line [] = binary_to_term_stress(<<131,107,0,0>>),	%Empty string.
+    ?line [] = binary_to_term_stress(<<131,108,0,0,0,0,106>>),	%Zero-length list.
 
     %% {[],a} where [] is a zero-length list.
-    ?line {[],a} = binary_to_term(<<131,104,2,108,0,0,0,0,106,100,0,1,97>>),
+    ?line {[],a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0,106,100,0,1,97>>),
 
     %% {42,a} where 42 is a zero-length list with 42 in the tail.
-    ?line {42,a} = binary_to_term(<<131,104,2,108,0,0,0,0,97,42,100,0,1,97>>),
+    ?line {42,a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0,97,42,100,0,1,97>>),
 
     %% {{x,y},a} where {x,y} is a zero-length list with {x,y} in the tail.
-    ?line {{x,y},a} = binary_to_term(<<131,104,2,108,0,0,0,0,
+    ?line {{x,y},a} = binary_to_term_stress(<<131,104,2,108,0,0,0,0,
 				      104,2,100,0,1,120,100,0,1,
 				      121,100,0,1,97>>),
 
     %% Bignums fitting in 32 bits.
-    ?line 16#7FFFFFFF = binary_to_term(<<131,98,127,255,255,255>>),
-    ?line -1 = binary_to_term(<<131,98,255,255,255,255>>),
+    ?line 16#7FFFFFFF = binary_to_term_stress(<<131,98,127,255,255,255>>),
+    ?line -1 = binary_to_term_stress(<<131,98,255,255,255,255>>),
     
     ok.
 
@@ -1325,7 +1330,7 @@ run_otp_8180(Name) ->
     ?line {ok,Bins} = file:consult(Name),
     [begin
 	 io:format("~p\n", [Bin]),
-	 ?line {'EXIT',{badarg,_}} = (catch binary_to_term(Bin))
+	 ?line {'EXIT',{badarg,_}} = (catch binary_to_term_stress(Bin))
      end || Bin <- Bins],
     ok.
 
@@ -1394,3 +1399,75 @@ unaligned_sub_bin(Bin0, Offs) ->
     Bin.
 
 id(I) -> I.
+
+
+%% Stress binary_to_term with different initial reductions.
+%%
+%% Decodes Bin with binary_to_term/1. The atom no_opts is the marker
+%% the helpers below use to select binary_to_term/1 over binary_to_term/2.
+binary_to_term_stress(Bin) ->
+    binary_to_term_stress(Bin, no_opts).
+
+%% Decodes Bin once through catch_binary_to_term/2 to get a reference
+%% outcome, then lets b2t/4 repeat the decoding with reds_left set to
+%% erlang:system_info(context_reductions), that value div 3, and so on.
+%% Afterwards the reds_left value read on entry is written back and Bin
+%% is decoded one last time without any catch, so a badarg reaches the
+%% caller as an ordinary exception. Matching on T asserts that this last
+%% result equals the earlier ones.
+binary_to_term_stress(Bin, Opts) ->
+    Reds = get_reds(),
+    T = b2t(erlang:system_info(context_reductions),
+	    Bin, Opts, catch_binary_to_term(Bin, Opts)),
+    set_reds(Reds),
+    T = case Opts of
+	    no_opts -> binary_to_term(Bin);
+	    _ ->       binary_to_term(Bin,Opts)
+	end.
+
+%% Calls binary_to_term/1 (when Opts is no_opts) or binary_to_term/2 and
+%% returns the decoded term. An error:badarg is converted into the atom
+%% binary_to_term_throws_badarg so that outcomes can be compared by
+%% pattern matching; any other exception propagates.
+catch_binary_to_term(Bin, no_opts) ->
+    try binary_to_term(Bin)
+    catch
+	error:badarg -> binary_to_term_throws_badarg
+    end;
+catch_binary_to_term(Bin, Opts) ->
+    try binary_to_term(Bin, Opts)
+    catch
+	error:badarg -> binary_to_term_throws_badarg
+    end.
+
+%% Repeats the decoding with reds_left set to Reds, Reds div 3, ... and
+%% stops when the value reaches 0, returning Term. Every outcome must
+%% match Term (the reference outcome); a differing one fails with badmatch.
+b2t(0, _Bin, _Opts, Term) ->
+    Term;
+b2t(Reds, Bin, Opts, Term) ->
+    set_reds(Reds),
+    Term = catch_binary_to_term(Bin,Opts),
+    b2t(Reds div 3, Bin, Opts, Term).
+
+%% Sets the reds_left internal state to Reds via erts_debug. If the call
+%% fails with undef, available_internal_state is switched on and the call
+%% is retried (presumably the interface was not enabled yet -- confirm).
+set_reds(Reds) ->
+    try	erts_debug:set_internal_state(reds_left, Reds)
+    catch
+	error:undef ->
+	    erts_debug:set_internal_state(available_internal_state, true),
+	    set_reds(Reds)
+    end.
+
+%% Reads the reds_left internal state via erts_debug. If the call fails
+%% with undef, available_internal_state is switched on and the call is
+%% retried (presumably the interface was not enabled yet -- confirm).
+get_reds() ->
+    try	erts_debug:get_internal_state(reds_left)
+    catch
+	error:undef ->
+	    erts_debug:set_internal_state(available_internal_state, true),
+	    get_reds()
+    end.
diff --git a/erts/emulator/test/fun_SUITE.erl b/erts/emulator/test/fun_SUITE.erl
index 36ba4e0f48..8ad5f290ed 100644
--- a/erts/emulator/test/fun_SUITE.erl
+++ b/erts/emulator/test/fun_SUITE.erl
@@ -262,6 +262,16 @@ equality(Config) when is_list(Config) ->
     ?line false = eq(FF2, FF4),
     ?line false = eq(FF3, FF4),
 
+    %% EEP37
+    H1 = fun Fact(N) when N > 0 -> N * Fact(N - 1); Fact(0) -> 1 end,
+    H2 = fun Pow(N, M) when M > 0 -> N * Pow(N, M - 1); Pow(_, 0) -> 1 end,
+    H1_copy = copy_term(H1),
+
+    true = eq(H1, H1),
+    true = eq(H1, H1_copy),
+    true = eq(H2, H2),
+    false = eq(H1, H2),
+
     ok.
 
 eq(X, X) -> true;
diff --git a/erts/preloaded/ebin/erlang.beam b/erts/preloaded/ebin/erlang.beam
index fb222cb64b..73fac27161 100644
--- a/erts/preloaded/ebin/erlang.beam
+++ b/erts/preloaded/ebin/erlang.beam
diff --git a/erts/preloaded/ebin/erts_internal.beam b/erts/preloaded/ebin/erts_internal.beam
index 9ab806718b..12b36913a9 100644
--- a/erts/preloaded/ebin/erts_internal.beam
+++ b/erts/preloaded/ebin/erts_internal.beam
diff --git a/erts/preloaded/src/erlang.erl b/erts/preloaded/src/erlang.erl
index a21da2ecc9..0ed677c3d8 100644
--- a/erts/preloaded/src/erlang.erl
+++ b/erts/preloaded/src/erlang.erl
@@ -362,15 +362,29 @@ binary_to_list(_Binary, _Start, _Stop) ->
 %% binary_to_term/1
 -spec binary_to_term(Binary) -> term() when
       Binary :: ext_binary().
-binary_to_term(_Binary) ->
-    erlang:nif_error(undefined).
+binary_to_term(Binary) ->
+    %% This BIF may throw badarg while trapping, so any error from
+    %% erts_internal:binary_to_term/1 is caught here and raised again
+    %% with erlang:error/2 and this function's own argument list.
+    try
+	erts_internal:binary_to_term(Binary)
+    catch
+	error:Reason -> erlang:error(Reason,[Binary])
+    end.
 
 %% binary_to_term/2
 -spec binary_to_term(Binary, Opts) -> term() when
       Binary :: ext_binary(),
       Opts :: [safe].
-binary_to_term(_Binary, _Opts) ->
-    erlang:nif_error(undefined).
+binary_to_term(Binary, Opts) ->
+    %% This BIF may throw badarg while trapping, so any error from
+    %% erts_internal:binary_to_term/2 is caught here and raised again
+    %% with erlang:error/2 and this function's own argument list.
+    try
+	erts_internal:binary_to_term(Binary,Opts)
+    catch
+	error:Reason -> erlang:error(Reason,[Binary,Opts])
+    end.
 
 %% bit_size/1
 %% Shadowed by erl_bif_types: erlang:bit_size/1
diff --git a/erts/preloaded/src/erts_internal.erl b/erts/preloaded/src/erts_internal.erl
index c8e8e7e069..d6a185482e 100644
--- a/erts/preloaded/src/erts_internal.erl
+++ b/erts/preloaded/src/erts_internal.erl
@@ -29,7 +29,7 @@
 -module(erts_internal).
 
 -export([await_port_send_result/3]).
-
+-export([binary_to_term/1, binary_to_term/2]).
 -export([port_command/3, port_connect/2, port_close/1,
 	 port_control/3, port_call/3, port_info/1, port_info/2]).
 
@@ -160,3 +160,17 @@ request_system_task(_Pid, _Prio, _Request) ->
 check_process_code(_Module, _OptionList) ->
     erlang:nif_error(undefined).
 
+%% Stub for binary_to_term/1, called by erlang:binary_to_term/1. The
+%% Erlang body only raises nif_error(undefined).
+-spec binary_to_term(Binary) -> term() when
+      Binary :: binary().
+binary_to_term(_Binary) ->
+    erlang:nif_error(undefined).
+
+%% Stub for binary_to_term/2, called by erlang:binary_to_term/2. The
+%% Erlang body only raises nif_error(undefined).
+-spec binary_to_term(Binary, Opts) -> term() when
+      Binary :: binary(),
+      Opts :: [safe].
+binary_to_term(_Binary, _Opts) ->
+    erlang:nif_error(undefined).