Diffstat (limited to 'erts')
41 files changed, 2447 insertions, 128 deletions
diff --git a/erts/doc/src/erlang.xml b/erts/doc/src/erlang.xml index 711473afd2..124302a2cb 100644 --- a/erts/doc/src/erlang.xml +++ b/erts/doc/src/erlang.xml @@ -6008,6 +6008,13 @@ ok <seealso marker="#system_info_multi_scheduling">erlang:system_info(multi_scheduling)</seealso>, and <seealso marker="#system_info_schedulers">erlang:system_info(schedulers)</seealso>.</p> </item> + <tag><marker id="system_info_otp_correction_package"><c>otp_correction_package</c></marker></tag> + <item> + <p>Returns a string containing the OTP correction package version + number that currenly executing VM is part of. Note that other + OTP applications in the system may be part of other OTP correction + packages.</p> + </item> <tag><marker id="system_info_otp_release"><c>otp_release</c></marker></tag> <item> <p>Returns a string containing the OTP release number.</p> diff --git a/erts/doc/src/erlc.xml b/erts/doc/src/erlc.xml index 10cab344b0..c3fc3b1686 100644 --- a/erts/doc/src/erlc.xml +++ b/erts/doc/src/erlc.xml @@ -234,6 +234,16 @@ erlc +export_all file.erl</pre> from the shell.</p> <p>Supported options: -I, -o, -D, -v, -W, -b.</p> </item> + <tag>.S</tag> + <item> + <p>Erlang assembler source code. It generates a <c><![CDATA[.beam]]></c> file.</p> + <p>Supported options: same as for .erl.</p> + </item> + <tag>.core</tag> + <item> + <p>Erlang core source code. It generates a <c><![CDATA[.beam]]></c> file.</p> + <p>Supported options: same as for .erl.</p> + </item> <tag>.yrl</tag> <item> <p>Yecc source code. It generates an <c><![CDATA[.erl]]></c> file.</p> diff --git a/erts/doc/src/escript.xml b/erts/doc/src/escript.xml index 180447cac4..d2b09d4515 100644 --- a/erts/doc/src/escript.xml +++ b/erts/doc/src/escript.xml @@ -44,6 +44,7 @@ <p><c>escript</c> runs a script written in Erlang.</p> <p>Here follows an example.</p> <pre> +$ <input>chmod u+x factorial</input> $ <input>cat factorial</input> #!/usr/bin/env escript %% -*- erlang -*- @@ -66,12 +67,13 @@ usage() -> fac(0) -> 1; fac(N) -> N * fac(N-1). -$ <input>factorial 5</input> +$ <input>./factorial 5</input> factorial 5 = 120 -$ <input>factorial</input> +$ <input>./factorial</input> usage: factorial integer -$ <input>factorial five</input> -usage: factorial integer </pre> +$ <input>./factorial five</input> +usage: factorial integer + </pre> <p>The header of the Erlang script in the example differs from a normal Erlang module. The first line is intended to be the interpreter line, which invokes <c>escript</c>. However if you diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in index 5638683f88..b270099566 100644 --- a/erts/emulator/Makefile.in +++ b/erts/emulator/Makefile.in @@ -575,7 +575,7 @@ GENERATE += $(TTF_DIR)/erl_alloc_types.h # version include file $(TARGET)/erl_version.h: ../vsn.mk - $(gen_verbose)LANG=C $(PERL) utils/make_version -o $@ $(SYSTEM_VSN) $(VSN)$(SERIALNO) $(TARGET) + $(gen_verbose)LANG=C $(PERL) utils/make_version -o $@ $(SYSTEM_VSN) $(SYSTEM_CP_VSN) $(VSN)$(SERIALNO) $(TARGET) GENERATE += $(TARGET)/erl_version.h # driver table diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c index 0609e86a39..b413f0e859 100644 --- a/erts/emulator/beam/beam_emu.c +++ b/erts/emulator/beam/beam_emu.c @@ -4326,7 +4326,19 @@ void process_main(void) flags = Arg(2); BsGetFieldSize(tmp_arg2, (flags >> 3), ClauseFail(), size); if (size >= SMALL_BITS) { - Uint wordsneeded = 1+WSIZE(NBYTES((Uint) size)); + Uint wordsneeded; + /* check bits size before potential gc. 
+ * We do not want a gc and then realize we don't need + * the allocated space (i.e. if the op fails) + * + * remember to reacquire the matchbuffer after gc. + */ + + mb = ms_matchbuffer(tmp_arg1); + if (mb->size - mb->offset < size) { + ClauseFail(); + } + wordsneeded = 1+WSIZE(NBYTES((Uint) size)); TestHeapPreserve(wordsneeded, Arg(1), tmp_arg1); } mb = ms_matchbuffer(tmp_arg1); diff --git a/erts/emulator/beam/erl_bif_ddll.c b/erts/emulator/beam/erl_bif_ddll.c index 1c3e955f47..1728b200f7 100644 --- a/erts/emulator/beam/erl_bif_ddll.c +++ b/erts/emulator/beam/erl_bif_ddll.c @@ -182,7 +182,7 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3) Eterm name_term = BIF_ARG_2; Eterm options = BIF_ARG_3; char *path = NULL; - ErlDrvSizeT path_len; + Sint path_len; char *name = NULL; DE_Handle *dh; erts_driver_t *drv; @@ -198,6 +198,7 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3) int kill_ports = 0; int do_build_load_error = 0; int build_this_load_error = 0; + int encoding; for(l = options; is_list(l); l = CDR(list_val(l))) { Eterm opt = CAR(list_val(l)); @@ -257,18 +258,23 @@ BIF_RETTYPE erl_ddll_try_load_3(BIF_ALIST_3) goto error; } - if (erts_iolist_size(path_term, &path_len)) { - goto error; + encoding = erts_get_native_filename_encoding(); + if (encoding == ERL_FILENAME_WIN_WCHAR) { + /* Do not convert the lib name to utf-16le yet, do that in win32 specific code */ + /* since lib_name is used in error messages */ + encoding = ERL_FILENAME_UTF8; } - path = erts_alloc(ERTS_ALC_T_DDLL_TMP_BUF, path_len + 1 /* might need path separator */ + sys_strlen(name) + 1); - if (erts_iolist_to_buf(path_term, path, path_len) != 0) { + path = erts_convert_filename_to_encoding(path_term, NULL, 0, + ERTS_ALC_T_DDLL_TMP_BUF, 1, 0, + encoding, &path_len, + sys_strlen(name) + 2); /* might need path separator */ + if (!path) { goto error; } - while (path_len > 0 && (path[path_len-1] == '\\' || path[path_len-1] == '/')) { - --path_len; - } + ASSERT(path_len > 0 && path[path_len-1] == 0); + while (--path_len > 0 && (path[path_len-1] == '\\' || path[path_len-1] == '/')) + ; path[path_len++] = '/'; - /*path[path_len] = '\0';*/ sys_strcpy(path+path_len,name); #if DDLL_SMP @@ -1524,7 +1530,7 @@ static int do_load_driver_entry(DE_Handle *dh, char *path, char *name) assert_drv_list_rwlocked(); - if ((res = erts_sys_ddll_open(path, &(dh->handle))) != ERL_DE_NO_ERROR) { + if ((res = erts_sys_ddll_open(path, &(dh->handle), NULL)) != ERL_DE_NO_ERROR) { return res; } diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c index 414ae2f046..e0b654cb22 100755 --- a/erts/emulator/beam/erl_bif_info.c +++ b/erts/emulator/beam/erl_bif_info.c @@ -64,8 +64,10 @@ static Export *gather_gc_info_res_trap; #define DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1) +static char otp_correction_package[] = ERLANG_OTP_CORRECTION_PACKAGE; /* Keep erts_system_version as a global variable for easy access from a core */ static char erts_system_version[] = ("Erlang/OTP " ERLANG_OTP_RELEASE + "%s" " [erts-" ERLANG_VERSION "]" #if !HEAP_ON_C_STACK && !HALFWORD_HEAP " [no-c-stack-objects]" @@ -304,11 +306,28 @@ make_link_list(Process *p, ErtsLink *root, Eterm tail) int erts_print_system_version(int to, void *arg, Process *c_p) { + int i, rc = -1; + char *rc_str = ""; + char rc_buf[100]; + char *ocp = otp_correction_package; #ifdef ERTS_SMP Uint total, online, active; (void) erts_schedulers_state(&total, &online, &active, 0); #endif - return erts_print(to, arg, erts_system_version + for (i = 0; i < 
sizeof(otp_correction_package)-4; i++) { + if (ocp[i] == '-' && ocp[i+1] == 'r' && ocp[i+2] == 'c') + rc = atoi(&ocp[i+3]); + } + if (rc >= 0) { + if (rc == 0) + rc_str = " [DEVELOPMENT]"; + else { + erts_snprintf(rc_buf, sizeof(rc_buf), " [RELEASE CANDIDATE %d]", rc); + rc_str = rc_buf; + } + } + return erts_print(to, arg, erts_system_version, + rc_str #ifdef ERTS_SMP , total, online #endif @@ -2417,6 +2436,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1) DECL_AM(unknown); BIF_RET(AM_unknown); } + } else if (ERTS_IS_ATOM_STR("otp_correction_package", BIF_ARG_1)) { + int n = sizeof(ERLANG_OTP_CORRECTION_PACKAGE)-1; + hp = HAlloc(BIF_P, 2*n); + BIF_RET(buf_to_intlist(&hp, ERLANG_OTP_CORRECTION_PACKAGE, n, NIL)); } else if (ERTS_IS_ATOM_STR("otp_release", BIF_ARG_1)) { int n = sizeof(ERLANG_OTP_RELEASE)-1; hp = HAlloc(BIF_P, 2*n); diff --git a/erts/emulator/beam/erl_bif_port.c b/erts/emulator/beam/erl_bif_port.c index 864349491a..f298422267 100644 --- a/erts/emulator/beam/erl_bif_port.c +++ b/erts/emulator/beam/erl_bif_port.c @@ -808,7 +808,7 @@ open_port(Process* p, Eterm name, Eterm settings, int *err_typep, int *err_nump) if (encoding == ERL_FILENAME_WIN_WCHAR) { encoding = ERL_FILENAME_UTF8; } - if ((name_buf = erts_convert_filename_to_encoding(name, NULL, 0, ERTS_ALC_T_TMP,0,1, encoding, NULL)) + if ((name_buf = erts_convert_filename_to_encoding(name, NULL, 0, ERTS_ALC_T_TMP,0,1, encoding, NULL, 0)) == NULL) { goto badarg; } diff --git a/erts/emulator/beam/erl_bif_re.c b/erts/emulator/beam/erl_bif_re.c index 99c31738a5..448c6f6f6d 100644 --- a/erts/emulator/beam/erl_bif_re.c +++ b/erts/emulator/beam/erl_bif_re.c @@ -1196,8 +1196,8 @@ re_run(Process *p, Eterm arg1, Eterm arg2, Eterm arg3) ovsize = 3*(unsigned_val(tp[2])+1); code_size = binary_size(tp[5]); - if ((code_tmp = (const pcre *) - erts_get_aligned_binary_bytes(tp[5], &temp_alloc)) == NULL) { + code_tmp = (const pcre *) erts_get_aligned_binary_bytes(tp[5], &temp_alloc); + if (code_tmp == NULL || code_size < 4) { erts_free_aligned_binary_bytes(temp_alloc); BIF_ERROR(p, BADARG); } diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c index 8c4fffa75b..1af80dd04b 100644 --- a/erts/emulator/beam/erl_init.c +++ b/erts/emulator/beam/erl_init.c @@ -553,8 +553,8 @@ void erts_usage(void) erts_fprintf(stderr, " numbers is %d\n", ERTS_MAX_NO_OF_SCHEDULERS); erts_fprintf(stderr, "-SP p1:p2 specify schedulers (p1) and schedulers online (p2)\n"); - erts_fprintf(stderr, " as percentages of logical processors configured and logical\n"); - erts_fprintf(stderr, " processors available, respectively\n"); + erts_fprintf(stderr, " as percentages of logical processors configured and logical\n"); + erts_fprintf(stderr, " processors available, respectively\n"); erts_fprintf(stderr, "-t size set the maximum number of atoms the " "emulator can handle\n"); erts_fprintf(stderr, " valid range is [%d-%d]\n", diff --git a/erts/emulator/beam/erl_nif.c b/erts/emulator/beam/erl_nif.c index e87959f0ab..dc285b3cf7 100644 --- a/erts/emulator/beam/erl_nif.c +++ b/erts/emulator/beam/erl_nif.c @@ -1407,7 +1407,7 @@ void* enif_dlopen(const char* lib, ErtsSysDdllError errdesc = ERTS_SYS_DDLL_ERROR_INIT; void* handle; void* init_func; - if (erts_sys_ddll_open2(lib, &handle, &errdesc) == ERL_DE_NO_ERROR) { + if (erts_sys_ddll_open(lib, &handle, &errdesc) == ERL_DE_NO_ERROR) { if (erts_sys_ddll_load_nif_init(handle, &init_func, &errdesc) == ERL_DE_NO_ERROR) { erts_sys_ddll_call_nif_init(init_func); } @@ -1587,7 +1587,8 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) 
encoding = ERL_FILENAME_UTF8; } lib_name = erts_convert_filename_to_encoding(BIF_ARG_1, NULL, 0, - ERTS_ALC_T_TMP, 1, 0, encoding, NULL); + ERTS_ALC_T_TMP, 1, 0, encoding, + NULL, 0); if (!lib_name) { BIF_ERROR(BIF_P, BADARG); } @@ -1626,7 +1627,7 @@ BIF_RETTYPE load_nif_2(BIF_ALIST_2) "module '%T' not allowed", mod_atom); } else if (init_func == NULL && - (err=erts_sys_ddll_open2(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) { + (err=erts_sys_ddll_open(lib_name, &handle, &errdesc)) != ERL_DE_NO_ERROR) { const char slogan[] = "Failed to load NIF library"; if (strstr(errdesc.str, lib_name) != NULL) { ret = load_nif_error(BIF_P, "load_failed", "%s: '%s'", slogan, errdesc.str); diff --git a/erts/emulator/beam/erl_unicode.c b/erts/emulator/beam/erl_unicode.c index 7e3c6681d9..3a968594f3 100644 --- a/erts/emulator/beam/erl_unicode.c +++ b/erts/emulator/beam/erl_unicode.c @@ -1990,12 +1990,14 @@ char *erts_convert_filename_to_native(Eterm name, char *statbuf, size_t statbuf_ { int encoding = erts_get_native_filename_encoding(); return erts_convert_filename_to_encoding(name, statbuf, statbuf_size, alloc_type, - allow_empty, allow_atom, encoding, used); + allow_empty, allow_atom, encoding, + used, 0); } char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbuf_size, ErtsAlcType_t alloc_type, int allow_empty, - int allow_atom, int encoding, Sint *used) + int allow_atom, int encoding, Sint *used, + Uint extra) { char* name_buf = NULL; @@ -2008,13 +2010,14 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu } if (encoding == ERL_FILENAME_WIN_WCHAR) { need += 2; + extra *= 2; } else { ++need; } if (used) *used = (Sint) need; - if (need > statbuf_size) { - name_buf = (char *) erts_alloc(alloc_type, need); + if (need+extra > statbuf_size) { + name_buf = (char *) erts_alloc(alloc_type, need+extra); } else { name_buf = statbuf; } @@ -2035,8 +2038,8 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu /*Add 0 termination only*/ if (used) *used = (Sint) size+1; - if (size+1 > statbuf_size) { - name_buf = (char *) erts_alloc(alloc_type, size+1); + if (size+1+extra > statbuf_size) { + name_buf = (char *) erts_alloc(alloc_type, size+1+extra); } else { name_buf = statbuf; } @@ -2045,7 +2048,7 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, size_t statbu } else { name_buf = erts_convert_filename_to_wchar(bytes, size, statbuf, statbuf_size, - alloc_type, used, 0); + alloc_type, used, extra); } erts_free_aligned_binary_bytes(temp_alloc); } else { diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c index 94acceaaf1..5e7a5cab6e 100644 --- a/erts/emulator/beam/external.c +++ b/erts/emulator/beam/external.c @@ -4049,7 +4049,9 @@ init_done: switch (tag) { case INTEGER_EXT: SKIP(4); +#if !defined(ARCH_64) || HALFWORD_HEAP heap_size += BIG_UINT_HEAP_SIZE; +#endif break; case SMALL_INTEGER_EXT: SKIP(1); diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h index c44f532366..c183c519ff 100755 --- a/erts/emulator/beam/global.h +++ b/erts/emulator/beam/global.h @@ -923,7 +923,8 @@ char *erts_convert_filename_to_encoding(Eterm name, char *statbuf, ErtsAlcType_t alloc_type, int allow_empty, int allow_atom, int encoding, - Sint *used /* out */); + Sint *used /* out */, + Uint extra); char* erts_convert_filename_to_wchar(byte* bytes, Uint size, char *statbuf, size_t statbuf_size, ErtsAlcType_t alloc_type, Sint* used, diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c 
index d4623c0450..49af86b36a 100644 --- a/erts/emulator/beam/io.c +++ b/erts/emulator/beam/io.c @@ -7061,7 +7061,7 @@ void *driver_dl_open(char * path) int res; int *last_error_p = erts_smp_tsd_get(driver_list_last_error_key); int locked = maybe_lock_driver_list(); - if ((res = erts_sys_ddll_open(path, &ptr)) == 0) { + if ((res = erts_sys_ddll_open(path, &ptr, NULL)) == 0) { maybe_unlock_driver_list(locked); return ptr; } else { diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h index d0ccf8a2fc..189d9ebac8 100644 --- a/erts/emulator/beam/sys.h +++ b/erts/emulator/beam/sys.h @@ -667,8 +667,7 @@ typedef struct { #define ERTS_SYS_DDLL_ERROR_INIT {NULL} extern void erts_sys_ddll_free_error(ErtsSysDdllError*); extern void erl_sys_ddll_init(void); /* to initialize mutexes etc */ -extern int erts_sys_ddll_open2(const char *path, void **handle, ErtsSysDdllError*); -#define erts_sys_ddll_open(P,H) erts_sys_ddll_open2(P,H,NULL) +extern int erts_sys_ddll_open(const char *path, void **handle, ErtsSysDdllError*); extern int erts_sys_ddll_open_noext(char *path, void **handle, ErtsSysDdllError*); extern int erts_sys_ddll_load_driver_init(void *handle, void **function); extern int erts_sys_ddll_load_nif_init(void *handle, void **function,ErtsSysDdllError*); diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c index 978a766de9..80937dfcc8 100644 --- a/erts/emulator/drivers/common/inet_drv.c +++ b/erts/emulator/drivers/common/inet_drv.c @@ -1490,8 +1490,8 @@ static int load_ip_and_port unsigned int alen = len; char abuf [len]; int res = inet_get_address(abuf, (inet_address*) addr, &alen); - ASSERT(res==0); - res = 0; + ASSERT(res==0); (void)res; + /* Now "abuf" contains: Family(1b), Port(2b), IP(4|16b) */ /* NB: the following functions are safe to use, as they create tuples diff --git a/erts/emulator/internal_doc/CarrierMigration.md b/erts/emulator/internal_doc/CarrierMigration.md new file mode 100644 index 0000000000..b93c11c6ec --- /dev/null +++ b/erts/emulator/internal_doc/CarrierMigration.md @@ -0,0 +1,201 @@ +Carrier Migration +================= + +The ERTS memory allocators manage memory blocks in two types of raw +memory chunks. We call these chunks of raw memory +*carriers*. Singleblock carriers which only contain one large block, +and multiblock carriers which contain multiple blocks. A carrier is +typically created using `mmap()` on unix systems. However, how a +carrier is created is of minor importance. An allocator instance +typically manages a mixture of single- and multiblock carriers. + +Problem +------- + +When a carrier is empty, i.e. contains only one large free block, it +is deallocated. Since multiblock carriers can contain both allocated +blocks and free blocks at the same time, an allocator instance might +be stuck with a large amount of poorly utilized carriers if the memory +load decrease. After a peak in memory usage it is expected that not +all memory can be returned since the blocks still allocated is likely +to be dispersed over multiple carriers. Such poorly utilized carriers +can usually be reused if the memory load increase again. However, +since each scheduler thread manages its own set of allocator +instances, and memory load is not necessarily connected to CPU load we +might get into a situation where there are lots of poorly utilized +multiblock carriers on some allocator instances while we need to +allocate new multiblock carriers on other allocator instances. 
In +scenarios like this, the demand for multiblock carriers in the system +might increase at the same time as the actual memory demand in the +system has decreased which is both unwanted and quite unexpected for +the end user. + +Solution +-------- + +In order to prevent scenarios like this we've implemented support for +migration of multiblock carriers between allocator instances of the +same type. + +### Management of Free Blocks ### + +In order to be able to remove a carrier from one allocator instance +and add it to another we need to be able to move references to the +free blocks of the carrier between the allocator instances. The +allocator instance specific data structure referring to the free +blocks it manages often refers to the same carrier from multiple +places. For example, when the address order bestfit strategy is used +this data structure is a binary search tree spanning all carriers that +the allocator instance manages. Free blocks in one specific carrier +can be referred to from potentially every other carrier that is +managed, and the amount of such references can be huge. That is, the +work of removing the free blocks of such a carrier from the search +tree will be huge. One way of solving this could be to not migrate +carriers that contain lots of free blocks, but this would prevent us +from migrating carriers that potentially needs to be migrated in order +to solve the problem we set out to solve. + +By using one data structure of free blocks in each carrier and an +allocator instance wide data structure of carriers managed by the +allocator instance, the work needed in order to remove and add +carriers can be kept to a minimum. When migration of carriers is +enabled on a specific allocator type, we require that an allocation +strategy with such an implementation is used. Currently we've +implemented this for three different allocation strategies. All of +these strategies use a search tree of carriers sorted so that we can +find the carrier with the lowest address that can satisfy the +request. Internally in carriers we use yet another search tree that +either implement address order first fit, address order best fit, +or best fit. The abbreviations used for these different allocation +strategies are `aoff`, and `aoffcaobf`, `aoffcbf`. + +### Carrier Pool ### + +In order to migrate carriers between allocator instances we move them +through a pool of carriers. In order for a carrier migration to +complete, one scheduler needs to move the carrier into the pool, and +another scheduler needs to take the carrier out of the pool. + +The pool is implemented as a lock free, circular, double linked, +list. The list contains a sentinel which is used as the starting point +when inserting to, or fetching from the pool. Carriers in the pool are +elements in this list. + +The list can be modified by all scheduler threads +simultaneously. During modifications the double linked list is allowed +to get a bit "out of shape". For example, following the `next` pointer +to the next element and then following the `prev` pointer does not +always take you back to were you started. The following is however +always true: + +* Repeatedly following `next` pointers will eventually take you to the + sentinel. +* Repeatedly following `prev` pointers will eventually take you to the + sentinel. +* Following a `next` or a `prev` pointer will take you to either an + element in the pool, or an element that used to be in the pool. 
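As a rough illustration of these invariants, the sketch below models a pool element using portable C11 atomics. All names here (`cpool_elem_t`, the marker bits, `cpool_walk`) are invented for the example, and the pool is assumed to have been initialised so that the sentinel's `next` and `prev` point back to the sentinel itself; the actual ERTS implementation uses its own atomic primitives and a different encoding, so treat this as a sketch of the idea only, not the real code.

    #include <stdatomic.h>
    #include <stdint.h>

    /* Hypothetical pool element: the low bits of prev/next carry the
     * modification and deleted markers, the remaining bits hold the
     * pointer itself. */
    #define CPOOL_MOD_MARKER ((uintptr_t) 1)
    #define CPOOL_DEL_MARKER ((uintptr_t) 2)
    #define CPOOL_PTR_MASK   (~(CPOOL_MOD_MARKER | CPOOL_DEL_MARKER))

    typedef struct cpool_elem {
        _Atomic uintptr_t next;    /* pointer | markers */
        _Atomic uintptr_t prev;    /* pointer | markers */
        /* ... carrier specific data ... */
    } cpool_elem_t;

    /* Fixed starting point of the pool; assumed to be initialised so
     * that next and prev point to the sentinel itself when the pool
     * is empty. */
    static cpool_elem_t sentinel;

    static cpool_elem_t *cpool_ptr(uintptr_t field)
    {
        return (cpool_elem_t *) (field & CPOOL_PTR_MASK);
    }

    /* Walk forward from the sentinel.  By the invariants listed above
     * this loop always terminates: repeatedly following `next` takes
     * us back to the sentinel, even if the list is momentarily "out
     * of shape" due to concurrent inserts and fetches. */
    static void cpool_walk(void (*visit)(cpool_elem_t *))
    {
        uintptr_t f = atomic_load_explicit(&sentinel.next, memory_order_acquire);
        cpool_elem_t *e = cpool_ptr(f);
        while (e != &sentinel) {
            visit(e);
            e = cpool_ptr(atomic_load_explicit(&e->next, memory_order_acquire));
        }
    }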
+ +When inserting a new element we search for a place to insert the +element by only following `next` pointers, and we always begin by +skipping the first element encountered. When trying to fetch an +element we do the same thing, but instead only follow `prev` pointers. + +By going different directions when inserting and fetching, we avoid +contention between threads inserting and threads fetching as much as +possible. By skipping one element when we begin searching, we preserve +the sentinel unmodified as much as possible. This is beneficial since +all search operations need to read the content of the sentinel. If we +were to modify the sentinel, the cache line containing the sentinel +would unnecessarily be bounced between processors. + +The `prev`, and `next` fields in the elements of the list contains the +value of the pointer, a modification marker, and a deleted +marker. Memory operations on these fields are done using atomic memory +operations. When a thread has set the modification marker in a field, +no-one except the thread that set the marker is allowed to modify the +field. If multiple modification markers needs to be set, we always +begin with `next` fields followed by `prev` fields in the order +following the actual pointers. This guarantees that no deadlocks will +occur. + +When a carrier is being removed from a pool, we mark it with a thread +progress value that needs to be reached before we are allowed to +modify the `next`, and `prev` fields. That is, until we reach this +thread progress we are not allowed to insert the carrier into the pool +again, and we are not allowed to deallocate the carrier. This ensures +that threads inspecting the pool always will be able to traverse the +pool and reach valid elements. Once we have reached the thread +progress value that the carrier was tagged with, we know that no +threads may have references to it via the pool. + +### Migration ### + +There exist one pool for each allocator type enabling migration of +carriers between scheduler specific allocator instances of the same +allocator type. + +Each allocator instance keeps track of the current utilization of its +multiblock carriers. When the utilization falls below the "abandon +carrier utilization limit" it starts to inspect the utilization of the +current carrier when deallocations are made. If also the utilization +of the carrier falls below the "abandon carrier utilization limit" it +unlinks the carrier from its data structure of available free blocks +and inserts the carrier into the pool. + +Since the carrier has been unlinked from the data structure of +available free blocks, no more allocations will be made in the +carrier. The allocator instance putting the carrier into the pool, +however, still has the responsibility of performing deallocations in +it while it remains in the pool. + +Each carrier has a flag field containing information about allocator +instance owning the carrier, a flag indicating if the carrier is in +the pool or not, and a flag indicating if it is busy or not. When the +carrier is in the pool, the owning allocator instance needs to mark it +as busy while operating on it. If another thread inspects it in order +to try to fetch it from the pool, it will abort the fetch if it is +busy. When fetching the carrier from the pool, ownership will changed +and further deallocations in the carrier will be redirected to the new +owner using the delayed dealloc functionality. + +If a carrier in the pool becomes empty, it will be withdrawn from the +pool. 
All carriers that become empty are also always passed to its +originating allocator instance for deallocation using the delayed +dealloc functionality. Since carriers this way always will be +deallocated by the allocator instance that allocated the carrier the +underlying functionality of allocating and deallocating carriers can +remain simple and doesn't have to bother about multiple threads. In a +NUMA system we will also not mix carriers originating from multiple +NUMA nodes. + +When an allocator instance needs more carrier space, it always begins +by inspecting its own carriers that are waiting for thread progress +before they can be deallocated. If no such carrier could be found, it +then inspects the pool. If no carrier could be fetched from the pool, +it will allocate a new carrier. Regardless of where the allocator +instance gets the carrier from it the just links in the carrier into +its data structure of free blocks. + +### Result ### + +The use of this strategy of abandoning carriers with poor utilization +and reusing these in allocator instances with an increased carrier +demand is extremely effective and completely eliminates the problems +that otherwise sometimes occurred when CPU load dropped while memory +load did not. + +When using the `aoffcaobf` or `aoff` strategies compared to `gf` or +`bf`, we loose some performance since we get more modifications in the +data structure of free blocks. This performance penalty is however +reduced using the `aoffcbf` strategy. A tradeoff between memory +consumption and performance is however inevitable, and it is up to +the user to decide what is most important. + +Further work +------------ + +It would be quite easy to extend this to allow migration of multiblock +carriers between all allocator types. More or less the only obstacle +is maintenance of the statistics information. + + diff --git a/erts/emulator/internal_doc/CodeLoading.md b/erts/emulator/internal_doc/CodeLoading.md new file mode 100644 index 0000000000..151b9cd57c --- /dev/null +++ b/erts/emulator/internal_doc/CodeLoading.md @@ -0,0 +1,186 @@ +Non-Blocking Code Loading +========================= + +Introduction +------------ + +Before OTP R16 when an Erlang code module was loaded, all other +execution in the VM were halted while the load operation was carried +out in single threaded mode. This might not be a big problem for +initial loading of modules during VM boot, but it can be a severe +problem for availability when upgrading modules or adding new code on +a VM with running payload. This problem grows with the number of cores +as both the time it takes to wait for all schedulers to stop increases +as well as the potential amount of halted ongoing work. + +In OTP R16, modules are loaded without blocking the VM. +Erlang processes may continue executing undisturbed in parallel during +the entire load operation. The code loading is carried out by a normal +Erlang process that is scheduled like all the others. The load +operation is completed by making the loaded code visible to all +processes in a consistent way with one single atomic +instruction. Non-blocking code loading will improve real-time +characteristics when modules are loaded/upgraded on a running SMP +system. + + +The Load Phases +--------------- + +The loading of a module is divided into two phases; a *prepare phase* +and a *finishing phase*. The prepare phase contains reading the BEAM +file format and all the preparations of the loaded code that can +easily be done without interference with the running code. 
The +finishing phase will make the loaded (and prepared) code accessible +from the running code. Old module versions (replaced or deleted) will +also be made inaccessible by the finishing phase. + +The prepare phase is designed to allow several "loader" processes to +prepare separate modules in parallel while the finishing phase can +only be done by one loader process at a time. A second loader process +trying to enter finishing phase will be suspended until the first +loader is done. This will only block the process, the scheduler is +free to schedule other work while the second loader is waiting. (See +`erts_try_seize_code_write_permission` and +`erts_release_code_write_permission`). + +The ability to prepare several modules in parallel is not currently +used as almost all code loading is serialized by the code_server +process. The BIF interface is however prepared for this. + + erlang:prepare_loading(Module, Code) -> LoaderState + erlang:finish_loading([LoaderState]) + +The idea is that `prepare_loading` could be called in parallel for +different modules and returns a "magic binary" containing the internal +state of each prepared module. Function `finish_loading` could take a +list of such states and do the finishing of all of them in one go. + +Currenlty we use the legacy BIF `erlang:load_module` which is now +implemented in Erlang by calling the above two functions in +sequence. Function `finish_loading` is limited to only accepts a list +with one module state as we do not yet use the multi module loading +feature. + + +The Finishing Sequence +---------------------- + +During VM execution, code is accessed through a number of data +structures. These *code access structures* are + +* Export table. One entry for every exported function. +* Module table. One entry for each loaded module. +* "beam_catches". Identifies jump destinations for catch instructions. +* "beam_ranges". Map code address to function and line in source file. + +The most frequently used of these structures is the export table that +is accessed in run time for every executed external function call to +get the address of the callee. For performance reasons, we want to +access all these structures without any overhead from thread +synchronization. Earlier this was solved with an emergency break. Stop +the entire VM to mutate these code access structures, otherwise treat +them as read-only. + +The solution in R16 is instead to *replicate* the code access +structures. We have one set of active structures read by the running +code. When new code is loaded the active structures are copied, the +copy is updated to include the newly loaded module and then a switch +is made to make the updated copy the new active set. The active set is +identified by a single global atomic variable +`the_active_code_index`. The switch can thus be made by a single +atomic write operation. The running code have to read this atomic +variable when using the active access structures, which means one +atomic read operation per external function call for example. The +performance penalty from this extra atomic read is however very small +as it can be done without any memory barriers at all (as described +below). With this solution we also preserve the transactional feature +of a load operation. Running code will never see the intermediate +result of a half loaded module. + +The finishing phase is carried out in the following sequence by the +BIF `erlang:finish_loading`: + +1. 
Seize exclusive code write permission (suspend process if needed + until we get it). + +2. Make a full copy of all the active access structures. This copy is + called the staging area and is identified by the global atomic + variable `the_staging_code_index`. + +3. Update all access structures in the staging area to include the + newly prepared module. + +4. Schedule a thread progress event. That is a time in the future when + all schedulers have yielded and executed a full memory barrier. + +5. Suspend the loader process. + +6. After thread progress, commit the staging area by assigning + `the_staging_code_index` to `the_active_code_index`. + +7. Release the code write permission allowing other processes to stage + new code. + +8. Resume the loader process allowing it to return from + `erlang:finish_loading`. + + +### Thread Progress + +The waiting for thread progress in 4-6 is necessary in order for +processes to read `the_active_code_index` atomic during normal +execution without any expensive memory barriers. When we write a new +value into `the_active_code_index` in step 6, we know that all +schedulers will see an updated and consistent view of all the new +active access structures once they become reachable through +`the_active_code_index`. + +The total lack of memory barrier when reading `the_active_code_index` +has one interesting consequence however. Different processes may see +the new code at different point in time depending on when different +cores happen to refresh their hardware caches. This may sound unsafe +but it actually does not matter. The only property we must guarantee +is that the ability to see the new code must spread with process +communication. After receiving a message that was triggered by new +code, the receiver must be guaranteed to also see the new code. This +will be guaranteed as all types of process communication involves +memory barriers in order for the receiver to be sure to read what the +sender has written. This implicit memory barrier will then also make +sure that the receiver reads the new value of `the_active_code_index` +and thereby also sees the new code. This is true for all kinds of +inter process communication (TCP, ETS, process name registering, +tracing, drivers, NIFs, etc) not just Erlang messages. + +### Code Index Reuse + +To optimize the copy operation in step 2, code access structures are +reused. In current solution we have three sets of code access +structures, identified by a code index of 0, 1 and 2. These indexes +are used in a round robin fashion. Instead of having to initialize a +completely new copy of all access structures for every load operation +we just have to update with the changes that have happened since the +last two code load operations. We could get by with only two code +indexes (0 and 1), but that would require yet another round of waiting +for thread progress before step 2 in the `finish_loading` sequence. We +cannot start reusing a code index as staging area until we know that +no lingering scheduler thread is still using it as the active code +index. With three generations of code indexes, the waiting for thread +progress in step 4-6 will give this guarantee for us. Thread progress +will wait for all running schedulers to reschedule at least one +time. No ongoing execution reading code access structures reached from +an old value of `the_active_code_index` can exist after a second round +of thread progress. 
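A minimal sketch of this rotation using C11 atomics is shown below. The variable names mirror the prose (`the_active_code_index`, `the_staging_code_index`), but the code is purely illustrative and not the actual ERTS implementation; the copying of the access structures and the wait for thread progress are only hinted at in comments.

    #include <stdatomic.h>

    #define NUM_CODE_IX 3   /* three generations used round robin */

    static _Atomic unsigned the_active_code_index;
    static _Atomic unsigned the_staging_code_index;

    /* Hot path, e.g. every external function call: a plain relaxed
     * load, no memory barrier needed (see "Thread Progress" above). */
    static unsigned active_code_ix(void)
    {
        return atomic_load_explicit(&the_active_code_index,
                                    memory_order_relaxed);
    }

    /* Step 2 of the finishing sequence: take the next generation as
     * staging area and copy the active access structures into it. */
    static unsigned start_staging(void)
    {
        unsigned ix = (active_code_ix() + 1) % NUM_CODE_IX;
        atomic_store_explicit(&the_staging_code_index, ix,
                              memory_order_relaxed);
        /* copy export table, module table, beam_catches, beam_ranges
         * for generation `ix`, then update them with the new module */
        return ix;
    }

    /* Step 6: after thread progress has guaranteed that every
     * scheduler has executed a full memory barrier, one atomic write
     * publishes the staged generation. */
    static void commit_staging(void)
    {
        unsigned ix = atomic_load_explicit(&the_staging_code_index,
                                           memory_order_relaxed);
        atomic_store_explicit(&the_active_code_index, ix,
                              memory_order_relaxed);
    }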
+ +The design choice between two or three generations of code access +structures is a trade-off between memory consumption and code loading +latency. + +### A Consistent Code View + +Some native BIFs may need to get a consistent snapshot view of the +active code. To do this it is important to only read +`the_active_code_index` one time and then use that index value for all +code accessing during the BIF. If a load operation is executed in +parallel, reading `the_active_code_index` a second time might result +in a different value, and thereby a different view of the code. diff --git a/erts/emulator/internal_doc/DelayedDealloc.md b/erts/emulator/internal_doc/DelayedDealloc.md new file mode 100644 index 0000000000..b7d87b839f --- /dev/null +++ b/erts/emulator/internal_doc/DelayedDealloc.md @@ -0,0 +1,175 @@ +Delayed Dealloc +=============== + +Problem +------- + +An easy way to handle memory allocation in a multi-threaded +environment is to protect the memory allocator with a global lock +which threads performing memory allocations or deallocations have to +have locked during the whole operation. This solution of course scales +very poorly, due to heavy lock contention. An improved solution of +this scheme is to use multiple thread specific instances of such an +allocator. That is, each thread allocates in its own allocator +instance which is protected by a lock. In the general case references +to memory need to be passed between threads. In the case where a +thread that needs to deallocate memory that originates from another +threads allocator instance a lock conflict is possible. In a system as +the Erlang VM where memory allocation/deallocation is frequent and +references to memory also are passed around between threads this +solution will also scale poorly due to lock contention. + +Functionality Used to Adress This problem +----------------------------------------- + +In order to reduce contention due to locking of allocator instances we +introduced completely lock free instances tied to each scheduler +thread, and an extra locked instance for other threads. The scheduler +threads in the system is expected to do the major part of the +work. Other threads may still be needed but should not perform any +major and/or time critical work. The limited amount of contention that +appears on the locked allocator instance can more or less be +disregarded. + +Since we still need to be able to pass references to memory between +scheduler threads we need some way to manage this. An allocator +instance belonging to one scheduler thread is only allowed to be +manipulated by that scheduler thread. When other threads need to +deallocate memory originating from a foreign allocator instance, they +only pass the memory block to a "message box" containing deallocation +jobs attached to the originating allocator instance. When a scheduler +thread detects such deallocation job it performs the actual +deallocation. + +The "message box" is implemented using a lock free single linked list +through the memory blocks to deallocate. The order of the elements in +this list is not important. Insertion of new free blocks will be made +somewhere near the end of this list. Requirering that the new blocks +need to be inserted at the end would cause unnecessary contention when +large amount of memory blocks are inserted simultaneous by multiple +threads. + +The data structure refering to this single linked list cover two cache +lines. 
One cache line containing information about the head of the +list, and one cache line containing information about the tail of the +list. This in order to reduce cache line ping ponging of this data +structure. The head of the list will only be manipulated by the thread +owning the allocator instance, and the tail will be manipulated by +other threads inserting deallocation jobs. + +### Tail ### + +In the tail part of the data structure we find a pointer to the last +element of the list, or at least something that is near the end of the +list. In the uncontended case it will point to the end of the list, +but when simultaneous insert operations are performed it will point to +something near the end of the list. + +When insterting an element one will try to write a pointer to the new +element in the next pointer of the element pointed to by the last +pointer. This is done using an atomic compare and swap that expects +the next pointer to be `NULL`. If this succeds the thread performing +this operation moves the last pointer to point to the newly inserted +element. + +If the atomic compare and swap described above failed, the last +pointer didn't point to the last element. In this case we need to +insert the new element somewhere inbetween the element that the last +pointer pointed to and the actual last element. If we do it this way +the last pointer will eventually end up at the last element when +threads stop adding new elements. When trying to insert somewhere near +the end and failing to do so, the inserting thread sometimes moves to +the next element and somtimes tries with the same element again. This +in order to spread the inserted elements during heavy contention. That +is, we try to spread the modifications of memory to different +locations instead of letting all threads continue to try to modify the +same location in memory. + +### Head ### + +The head contains pointers to begining of the list (`head.first`), and +to the first block which other threads may refer to +(`head.unref_end`). Blocks between these pointers are only refered to +by the head part of the data structure which is only used by the +thread owning the allocator instance. When these two pointers are not +equal the thread owning the allocator instance deallocate block after +block until `head.first` reach `head.unref_end`. + +We of course periodically need to move the `head.unref_end` closer to +the end in order to be able to continue deallocating memory +blocks. Since all threads inserting new elements in the linked list +will enter the list using the last pointer we can use this +knowledge. If we call `erts_thr_progress_later()` and wait until we +have reached that thread progress we know that no managed threads can +refer the elements up to the element pointed to by the last pointer at +the time when we called `erts_thr_progress_later()`. This since, all +managed threads must have left the code implementing this at least +once, and they always enters into the list via the last pointer. The +`tail.next` field contains information about next `head.unref_end` +pointer and thread progress that needs to be reached before we can +move `head.unref_end`. + +Unfortunately not only threads managed by the thread progress +functionality may insert memory blocks. Other threads also needs to be +taken care of. Other threads will not be as frequent users of this +functionality as managed threads, so using a less efficient scheme for +them is not that big of a problem. 
In order to handle unmanaged +threads we use two reference counters. When an unmanaged thread enters +this implementation it increments the reference counter currently +used, and when it leaves this implementation it decrements the same +reference counter. When the consumer thread calls +`erts_thr_progress_later()` in order to determine when it is safe to +move `head.unref_end`, it also swaps reference counters for unmanaged +threads. The previous current represents outstanding references from +the time up to this point. The new current represents future reference +following this point. When the consumer thread detects that we have +both reached the desired thread progress and when the previous current +reference counter reach zero it is safe to move the `head.unref_end`. + +The reason for using two reference counters is that we need to know +that the reference counter eventually will reach zero. If we only used +one reference counter it would potentially be held above zero for ever +by different unmanaged threads. + +### Empty List ### + +If no new memory blocks are inserted into the list, it should +eventually be emptied. All pointers to the list however expect to +always point to something. This is solved by inserting an empty +"marker" element, which only has to purpose of being there in the +absense of other elements. That is when the list is empty it only +contains this "marker" element. + +### Contention ### + +When elements are continously inserted by threads not owning the +allocator instance, the thread owning the allocator instance will be +able to work more or less undisturbed by other threads at the head end +of the list. At the tail end large amounts of simultaneous inserts may +cause contention, but we reduce such contention by spreading inserts +of new elements near the end instead of requiring all new elements to +be inserted at the end. + +### Schedulers and The Locked Allocator Instance ### + +Also the locked allocator instance for use by non-scheduler threads +have a message box for deallocation jobs just as all the other +allocator instances. The reason for this is that other threads may +allocate memory pass it to a scheduler that then needs to deallocate +it. We do not want the scheduler to have to wait for the lock on this +locked instance. Since also locked instances has message boxes for +deallocation jobs, the scheduler can just insert the job and avoid the +locking. + + +### A Benchmark Result ### + +When running the ehb benchmark, large amount of messages are passed +around between schedulers. All message passing will in some way or the +other cause memory allocation and deallocation. Since messages are +passed between different schedulers we will get contention on the +allocator instances where messages were allocated. By the introduction +of the delayed dealloc feature, we got a speedup of between 25-45%, +depending on configuration of the benchmark, when running on a +relatively new machine with an Intel i7 quad core processor with +hyper-threading using 8 schedulers.
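The insertion scheme described in the "Tail" section above can be sketched with C11 atomics as follows. The type and function names (`ddq_block_t`, `ddq_enqueue`) are invented for this example, and the real ERTS code differs in several details (marker element, spreading of retries, thread progress handling), so this only shows the compare-and-swap pattern.

    #include <stdatomic.h>
    #include <stddef.h>

    /* Hypothetical deallocation job linked into a foreign allocator
     * instance's "message box". */
    typedef struct ddq_block {
        struct ddq_block *_Atomic next;
    } ddq_block_t;

    typedef struct {
        ddq_block_t *_Atomic last;  /* tail part, written by producers */
        /* the head part (first, unref_end, ...) lives in a separate
         * cache line and is only touched by the owning scheduler */
    } ddq_tail_t;

    /* Enqueue a deallocation job from a thread that does not own the
     * allocator instance: try to CAS the next pointer of the element
     * `last` points at, expecting NULL; on failure step towards the
     * real end and retry.  (The real implementation sometimes retries
     * on the same element to spread contended inserts.) */
    static void ddq_enqueue(ddq_tail_t *q, ddq_block_t *blk)
    {
        ddq_block_t *expected = NULL;
        ddq_block_t *e;

        atomic_store_explicit(&blk->next, NULL, memory_order_relaxed);
        e = atomic_load_explicit(&q->last, memory_order_acquire);

        while (!atomic_compare_exchange_strong_explicit(&e->next, &expected,
                                                        blk,
                                                        memory_order_release,
                                                        memory_order_relaxed)) {
            /* e->next was not NULL; expected now holds it */
            e = expected;
            expected = NULL;
        }
        /* best effort: advance last; it may lag behind the true end */
        atomic_store_explicit(&q->last, blk, memory_order_release);
    }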
\ No newline at end of file diff --git a/erts/emulator/internal_doc/PTables.md b/erts/emulator/internal_doc/PTables.md new file mode 100644 index 0000000000..6fe0e7665d --- /dev/null +++ b/erts/emulator/internal_doc/PTables.md @@ -0,0 +1,356 @@ +Process and Port Tables +======================= + +Problems +-------- + +The process table is a mapping from process identifiers to process +structure pointers. The process structure contains miscellaneous +information about a process, as for example pointers to its heap, +message queue, etc. When the runtime system needs to operate on a +process, it looks up the process structure in the process table using +the process identifier. An example of this is when passing a message +to a process. + +The process table has for a very long time just been an array of +pointers to process structures. Since process identifiers internally +in the runtime system are 28-bit integers it is quite easy to map a +process identifier to index into the array. The 28-bits were divided +into two sets. The least significant set of bits was used as index +into the array. The most significant set of bits was only used to be +able to distinguish between a number of identifiers with which map to +the same index in the array. As long as process table sizes of a power +of two was used we had 2^28 unique process identifiers. + +When the first SMP support was implemented, the table still was kept +more or less the same way, but protected by two types of locks. One +lock that protected the whole table against modifications and an array +of locks protecting different parts of the table. The exact locking +strategy previously used isn't interesting. What is interesting is +that it suffered from heavy lock contention especially when lots of +modifications was being made, but also when only performing lookups. + +In order to be able to detect when it is safe to deallocate a +previously used process structure, reference counting of the structure +was used. Also this was problematic, since simultaneous lookups needed +to modify the reference counter which also caused contention on the +cache line where the reference counter was located. This since all +modifications needs to be communicated between all involved +processors. + +The port table is very similar to the process table. The major +difference, at least in concept, is that it is a mapping from port +identifiers to port structures. It had a similar implementation, but +with some differences. Instead of being an array of pointers it was an +array of structures, and instead of being protected by two types of +locks it was only protected by one global lock. This table also +suffered from lock contention in various situations. + +Solution +-------- + +The process table was the major problem to address since processes are +much more frequently used than ports. The first implementation only +implemented this for processes, but since the port table is very +similar and very similar problems occur on the port table, the process +table implementation was later generalized so that it could also be +used implementing the port table. For simplicity I will only talk +about the process table in the following text, but the same will apply +to the port table unless otherwise stated. + +If we disregard the locking issues, the original solution is very +appealing. The mapping from process identifier to index into the array +is very fast, and this property is something we would like to +keep. 
The vast majority of operations on these tables are lookups so +optimizing for lookups is what we want to do. + +### Lookup ### + +Using a set of bits in the process identifier as index into an array +seems hard to beat. By replacing the array of pointers with an array +of our pointer sized atomic data type, a lookup will consist of the +following: + +1. Mapping the 28-bit integer to an index into the array. + + More about this mapping later. + +2. Read the pointer using an atomic memory operation at determined + index in array. + + On all platforms that we provide atomic memory operations, this is + just a `volatile` read, preventing the compiler to use values in + registers, forcing the a read from memory. + +3. Depending on use, issue appropriate memory barrier. + + A common barrier used is a barrier with acquire semantics. On + x86/x86_64 this maps to a compiler barrier preventing the compiler + to reorder instructions, but on other hardware often some kind of + light weight hardware memory barrier is also needed. + + When comparing with a locked approach, at least one heavy weight + memory barrier will be issued when locking the lock on most, if + not all, hardware architectures (including x86/x86_64), and often + some kind of light weight memory barrier will be issued when + unlocking the lock. + +When looking at this very simple solution with very little overhead +you might wonder why we didn't implement it this way from the +beginning. It all boils down to the read operation of the pointer. We +need some way to know that it is safe to access the memory pointed +to. One way of doing this is to place a reference counter in the +process structure. Increment of the reference counter at lookup needs +to be done atomically with the lookup. A lock can typically provide +this service for us, which was the approach we previously +used. Another approach could be to co-locate the reference counter +with the pointer in the table. The major problem with this approach is +the modifications of the reference counter. This since these +modification would have to be communicated between all involved +processor cause contention on the cache line containing the reference +counter. The new lookup approach above is possible since we can use +the "thread progress" functionality in order to determine when it is +safe to deallocate the process structure. We'll get back to this when +describing deletion in the table. + +Using this new lookup approach we wont modify any memory at all which +is important. A lookup conceptually only read memory, now this is true +in the implementation also which is important from a scalability +perspective. The previous implementation modified the cache line +containing the reference counter two times, and the cache line +containing the corresponding lock two times at each lookup. + +### Modifications of the Table ### + +A lightweight lookup in the table was the most important feature, but +we also wanted to improve modifications of the table. The process +table is modified when a new process is spawned, i.e. a new pointer is +inserted into the table, and when a process terminates, i.e. a pointer +is deleted in the table. + +Assuming that we spawn fewer processes than the maximum amount of +unique process identifiers in the system, one has always been able to +determine the order of process creation just by comparing process +identifiers. If PidX is larger than PidY, then PidX was created after +PidY assuming both identifiers originates from the same node. 
However, +since we have a quite limited amount of unique identifiers today +(2^28), this property cannot be relied upon if we create large amount +of processes. But never the less, this is a property the system always +have had. + +If we would have had a huge amount of unique identifiers available, it +would have tempting to drop or modify this ordering property as +described above. The ordering property could for example be based on +the scheduler performing the spawn operation. It would have been +possible to reserve large ranges of identifiers exclusive for each +scheduler thread which could be used minimizing the need for +communication when allocating identifiers. The amount of identifiers +we got to work with today is, however, not even close to be enough for +such an approach. + +Since we have a limited amount of unique identifiers, we need to be +careful not to waste them. If previously used identifiers are reused +too quick, identifiers originating from terminated processes will +refer to newly created processes, and mixups will occur. The +previously used approach was quite good at not wasting +identifiers. Using a modified version of the same approach also lets +us keep the ordering property that we have always had. + +#### Insert #### + +The original approach is more or less to search for next free index or +slot in the array. The search starts from the last slot allocated. If +we reach the end of the array we increase a "wrapped counter" and then +continue the search. The process identifier is constructed by writing +the index to the least significant set of bits, and the "wrapped +counter" to the most significant set of bits. The amount of bits in +each set of bits is decided at boot time, so that maximum index will +just fit into the least significant set of bits. + +In the modified lock free version of this approach we more or less do +it the same way, but with some important modifications trying to avoid +unnecessary contention when multiple schedulers create processes +simultaneously. Since multiple threads might be trying to search for +the next free slot at the same time from the same starting point we +want subsequent slots to be located in different cache lines. Multiple +schedulers simultaneously writing new pointers into the table are +therefore very likely to write into adjacent slots. If adjacent slots +are located in the same cache line all modification of this cache line +needs to be communicated between all involved processors which will be +very expensive and scale very poor. By locating adjacent slots in +different cache lines only true conflicts will trigger communication +between involved processors, i.e., avoiding false sharing. + +A cache line is larger than a pointer, typically 8 or 16 times larger, +so using one cache line for each slot only containing one pointer +would be a waste of space. Each cache line will be able to hold a +fixed amount of slots. The first slot of the table will be the first +slot of the first cache line, the second slot of the table will be the +first slot of the second cache line until we reach the end of the +array. The next slot after that will be the second slot of the first +cache line, etc, moving forward one cache line internal slot each time +we wrap. This way we will be able to fit the same amount of pointers +into an array of the same size while always keeping adjacent slots in +different cache lines. + +The mapping from identifier to slot or index into the array gets a bit +more complicated than before. 
Instead of a `shift` and a bitwise +`and`, we get two `shift`s, two bitwise `and`s, and an `add` (see +implementation of `erts_ptab_data2pix()` in `erl_ptab.h`). However, by +storing this information optimized for lookup we only need a `shift` +and a bitwise `and` on 32-bit platforms. On 64-bit platforms we got +enough room for the 28-bit identifier in the least significant +halfword, and the index in the most significant halfword, in other +words, we just need to read the most significant halfword to get the +index. That is, this operation is as fast, or faster than before. The +downside is that on 32-bit platforms we need to convert this +information into the 28-bit identifier number when printing, or when +ordering identifiers from the same node. These operations are, +however, extremely infrequent compared to lookups. + +When we insert a new element in the table we do the following: + +1. We begin by reserving space in the table by atomically + incrementing a counter of processes in the table. If our increment + brings the counter above the maximum size of the table, the + operation fail and a `system_limit` exception is raised. + +2. The table contains a 64-bit atomic variable of the last identifier + used. Only the least significant bits will be used when actually + creating the identifier. This identifier is where the search + begin. + +3. We increment last identifier value used. In order determine the + slot that corresponds to this identifier we call + `erts_ptab_data2pix()` that maps identifier to slot. We read the + content of the slot. If the slot is free we try to write a + reservation marker using an atomic compare and swap. If this fails + we repeat this step until it succeeds. + +4. Change the table variable of last identifier used. Since multiple + writes might occur at the same time this value may already have + been changed by to an identifier larger that the one we got. In + this case we can continue; otherwise, we need to change it to the + identifier we got. + +5. We now do some initializations of the process structure that + cannot be done before we know the process identifier, and have to + be done before we publish the structure in the table. This, for + example, includes storing the identifier in the process structure. + +6. Now we can publish the structure in the table by writing the the + pointer to the process structure in the slot previously reserved + in 3. + +Using this approach we keep the properties like identifier ordering, +and identifier reuse while improving performance and scalability. It +has one flaw, though. There is no guarantee that the operation will +terminate. This can quite easily be fixed though, and will be fixed in +the next release. We will get back to this below. + +#### Delete #### + +When a process terminates, we mark the process as terminated in the +process structure, the counter of number of processes in the table is +decreased, and the reference to the process structure is removed by +writing a `NULL` pointer into the corresponding slot. The scheduler +thread performing this then schedule a thread progress later job which +will do the final cleanup and deallocate the process structure. The +thread progress functionality will make sure that this job will not +execute until it is certain that all managed threads have dropped all +references to the process structure. + +### BIF Iterating Over the Table ### + +The `erlang:processes/1` and `erlang:port/1` BIFs iterate over the +tables and return corresponding identifiers. 
These BIFs should return a consistent snapshot of the table content during
some point in time while the BIF is executing. In order to implement this
we use locking in an unusual way. We use an "inverted rwlock".

When performing lookups in the table we do not need to bother about the
locking at all, but when modifying the table we read lock the rwlock
protecting the table, which allows for multiple writers during normal
operation. When the BIF that iterates over the table needs access to the
table, it write locks the rwlock and reads the content of the table. The
BIF does not read the whole table in one go, but instead reads small chunks
at a time, only write locking while reading. The actual implementation of
the BIFs is outside the scope of this document.

An out of the box rwlock will typically suffer from contention on the
single cache line containing the state of the rwlock, even in the case
where we are only read locking. Instead of using such an rwlock, we have
our own implementation of reader optimized rwlocks, which keeps track of
reader threads in separate thread specific cache lines. This is in order to
avoid contention on a single cache line. As long as we only do read lock
operations, each thread only needs to read a global cache line and modify
its own cache line, which minimizes communication between involved
processors. The iterating BIFs are normally used very infrequently, so in
the normal case we will only do read lock operations on the table global
rwlock.

### Future Improvements ###

The first improvement is to fix the guarantee so that insert operations are
guaranteed to terminate. When the operation starts we verify that there
actually exists a free slot that we can use. The problem is that we might
not find it, since it may move while multiple threads modify the table at
the same time as we are trying to find the slot. The easy fix is to abort
the operation if an empty slot could not be found in a finite number of
operations, and then restart the operation under a write lock. This will be
implemented in the next release, but further work should be done on finding
a better solution.

This, and also the previous, implementation does not work well when the
table is nearly full. We will both get long search times for free slots,
and we will reuse identifiers more frequently since we wrap more frequently
during the search. These tables work best when the table is much larger
than the number of simultaneously existing processes. One easy improvement
is to always have room in the table for more processes than we allow. This
will also be implemented in the next release, but this should probably also
be worked on more, trying to find an even better solution.

It would also be nice to get rid of the rwlock altogether. The use of a
reader optimized rwlock makes sure we do not get any contention on the
lock, but unnecessary memory barriers will be issued due to the lock. The
main issue here is to modify the iterating BIFs so that they do not require
exclusive access to the table while reading a sequence of slots. In
principle this should be rather easy; the code can handle sequences of
variable sizes, so shrinking the sequence size to one slot would solve the
problem. This will, however, need some tweaks and modifications of not
trivial code, but is something that should be looked at in the future.

By increasing the size of identifiers, at least on 64-bit machines (which
isn't as easy as it first might seem), we get further room for improvement.
Besides the obvious improvement of not reusing +identifiers as fast as we currently do, it makes it possible to +further avoid contention when inserting elements in the table. At +least if we drop this ordering property, which isn't that useful +anyway. + +### Some Benchmark Results ### + +In order to test modifications of the process table we ran a couple of +benchmarks where lots of processes are spawned and terminated +simultaneously, and got a speedup of between 150-200%. Running a +similar benchmark but with ports we got a speedup of about 130%. + +The BIF `erlang:is_process_alive/1` is the closest you can get to a +process table lookup only. The BIF looks up the process corresponding +to the process identifier passed as argument, and then checks if it is +alive. By running multiple processes looping over this BIF checking +the same process, we get a speedup between 20000-23000%. Conceptually +this operation only involve read operations. In the implementation +used in R16B also only read operation are performed, while the +previous implementation need to lock structures in order to read the +data, suffering from both lock contention and contention due to +modifications of cache lines used by lock internal data structures and +the reference counter on the process being looked up. + +The benchmarks were run on a relatively new machine with an Intel i7 +quad core processor with hyper-threading using 8 schedulers. On a +machine with more communication overhead and/or larger amount of +logical processors the speedups are expected to be even larger. diff --git a/erts/emulator/internal_doc/PortSignals.md b/erts/emulator/internal_doc/PortSignals.md new file mode 100644 index 0000000000..b1afb7c5cb --- /dev/null +++ b/erts/emulator/internal_doc/PortSignals.md @@ -0,0 +1,267 @@ +Port Signals +============ + +Problems +-------- + +Erlang ports conceptually are very similar to Erlang processes. Erlang +processes execute Erlang code in the virtual machine, while an Erlang +port execute native code typically used for communication with the +outside world. For example, when an Erlang process wants to +communicate using TCP over the network, it communicates via an Erlang +port implementing the TCP socket interface in native code. Both Erlang +Processes and Ports communicate using asynchronous signaling. The +native code executed by an Erlang port is a collection of callback +functions, called a driver. Each callback more or less implements the +code of a signal to, or from the port. + +Even though processes and ports conceptually always have been very +similar, the implementations have been very different. Originally, +more or less all port signals were handled synchronously at the time +they occurred. Very early in the development of the SMP support for +the runtime system we recognized that this was a huge problem for +signals between ports and the outside world. That is, I/O events to +and from the outside world, or I/O signals. This was one of the first +things that had to be rewritten in order to be able to do I/O in +parallel at all. The solution was to implement scheduling of these +signals. I/O signals corresponding to different ports could then be +executed in parallel on different scheduler threads. Signals from +processes to ports was not as big of a problem as the I/O signals, and +the implementation of those was left as they were. + +Each port is protected by its own lock to protect against simultaneous +execution in multiple threads. 
Previously when a process, executing on +a scheduler thread, sent a port a signal, it locked the port lock and +synchronously executed the code corresponding to the signal. If the +lock was busy, the scheduler thread blocked waiting until it could +lock the lock. If multiple processes executing simultaneously on +different scheduler threads, sent signals to the same port, schedulers +suffered from heavy lock contention. Such contention could also occur +between I/O signals for the port executing on one scheduler thread, +and a signal from a process to the port executing on another scheduler +thread. Beside the contention issues, we also loose potential work to +execute in parallel on different scheduler threads. This since the +process sending the *asynchronous* signal is blocked while the code +implementing the signal is executed synchronously. + +Solution +-------- + +In order to prevent multiple schedulers from trying to execute signals +to/from the same port simultaneously, we need to be able to ensure +that all signals to/from a port are executed in sequence on one +scheduler. More or less, the only way to do this is to schedule all +types of signals. Signals corresponding to a port can then be executed +in sequence by one single scheduler thread. If only one thread tries +to execute the port, no contention will appear on the port +lock. Besides getting rid of the contention, processes sending signals +to the port can also continue execution of their own Erlang code on +other schedulers at the same time as the signaling code is executing +on another scheduler. + +When implementing this there are a couple of important properties that +we either need, or want to preserve: + +* Signal ordering guarantee. Signals from process `X` to port `Y`, + *must* be delivered to `Y` in the same order as sent from `X`. + +* Signal latency. Due to the previous synchronous implementation, + latency of signals sent from processes to ports have usually been + very low. During contention the latency has of course + increased. Users expect latency of these signals to be low, a + sudden increase in latency would not be appreciated by our users. + +* Compatible flow control. Ports have for a very long time had the + possibility to use the busy port functionality when implementing + flow control. One may argue that this functionality fits very bad + with the conceptually completely asynchronous signaling, but the + functionality has been there for ages and is expected to be + there. When a port sets itself into a busy state, `command` + signals should not be delivered, and senders of such signals + should suspend until the port sets itself in a not busy state. + +### Scheduling of Port Signals ### + +A run queue has four queues for processes of different priority and +one queue for ports. The scheduler thread associated with the run +queue switch evenly between execution of processes and execution of +ports while both processes and ports exist in the queue. This is not +completely true, but not important for this discussion. A port that is +in a run queue also has a queue of tasks to execute. Each task +corresponds to an in- or outgoing signal. When the port is selected +for execution each task will be executed in sequence. The run queue +locks not only protected the queues of ports, but also the queues of +port tasks. 
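As a rough structural illustration of this arrangement, with invented names
and none of the real bookkeeping (it is not the actual ERTS declarations),
the layout could be pictured like this; before the rewrite described below,
the run queue lock conceptually protected everything in it, including each
port's task queue:

```c
/* Illustrative sketch only -- not the actual ERTS structures. */
typedef struct port_task {
    struct port_task *next;   /* one scheduled in- or outgoing signal  */
    /* signal specific data ... */
} PortTask;

typedef struct port {
    struct port *next;        /* next port in the run queue            */
    PortTask *first_task;     /* tasks executed in order when the port */
    PortTask *last_task;      /* is selected for execution             */
} Port;

typedef struct process {
    struct process *next;     /* next process in its priority queue    */
} Process;

typedef struct run_queue {
    /* lock; this also used to protect every port's task queue */
    Process *procs[4];        /* max, high, normal and low priority    */
    Port *ports;              /* ports with tasks waiting to execute   */
} RunQueue;
```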
+ +Since we go from a state where I/O signals are the only port related +signals scheduled, to a state where potentially all port related +signals may be scheduled we may drastically increase the load on the +run queue lock. The amount of scheduled port tasks very much depend on +the Erlang application executing, which we do not control, and we do +not want to get increased contention on the run queue locks. We +therefore need another approach of protecting the port task queue. + +#### Task Queue #### + +We chose a "semi locked" approach, with one public locked task queue, +and a private, lock free, queue like, task data structure. This "semi +locked" approach is similar to how the message boxes of processes are +managed. The lock is port specific and only used for protection of +port tasks, so the run queue lock is now needed in more or less the +same way for ports as for processes. This ensures that we wont see an +increased lock contention on run queue locks due to this rewrite of +the port functionality. + +When an executing port runs out of work to execute in the private task +data structure, it moves the public task queue into the private task +data structure while holding the lock. Once tasks has been moved to +the private data structure no lock protects them. This way the port +can continue working on tasks in the private data structure without +having to fight for the lock. + +I/O signals may however be aborted. This could be solved by letting +the port specific scheduling lock also protect the private task data +structure, but then the port very frequently would have to fight with +others enqueueing new tasks. In order to handle this while keeping the +private task data structure lock free, we use a similar "non +aggressive" approach as we use when handling processes that gets +suspended while in the run queue. Instead of removing the aborted port +task, we just mark it as aborted using an atomic memory +operation. When a task is selected for execution, we first verify that +it has not been aborted. If aborted we, just drop the task. + +A task that can be aborted is referred via another data structure from +other parts of the system, so that a thread that needs to abort the +task can reach it. In order to be sure to safely deallocate a task +that is no longer used, we first clear this reference and then use the +thread progress functionality in order to make sure no references can +exist to the task. Unfortunately, also unmanaged threads might abort +tasks. This is very infrequent, but might occur. This could be handled +locally for each port, but would require extra information in each +port structure which very infrequently would be used. Instead of +implementing this in each port, we implemented general functionality +that can be used from unmanaged threads to delay thread progress. + +The private "queue like" task data structure could have been an +ordinary queue if it wasn't for the busy port functionality. When the +port has flagged itself as busy, `command` signals are not allowed to +be delivered and need to be blocked. Other signals sent from the same +sender following a `command` signal that has been blocked also have to +be blocked; otherwise, we would violate the ordering guarantee. At the +same time, other signals that have no dependencies to blocked +`command` signals are expected to be delivered. + +The above requirements makes the private task data structure a rather +complex data structure. It has a queue of unprocessed tasks, and a +busy queue. 
The busy queue contains blocked tasks corresponding to +`command` signals, and tasks with dependencies to such tasks. The busy +queue is accompanied by a table over blocked tasks based on sender +with a references into last task in the busy queue from a specific +sender. This since we need check for dependencies when new tasks are +processed in the queue of unprocessed tasks. When a new task is +processed that needs to be blocked it isn't enqueued at the end of the +busy queue, but instead directly after the last task with the same +sender. This in order to easily be able to detect when we have tasks +that no longer have any dependencies to tasks corresponding to +`command` signals which should be moved out of the busy queue. When +the port executes, it switches between processing tasks from the busy +queue, and processing directly from the unprocessed queue based on its +busy state. When processing directly from the unprocessed queue it +might, of course, have to move a task into the busy queue instead of +executing it. + +#### Busy Port Queue #### + +Since it is the port itself which decides when it is time to enter a +busy state, it needs to be executing in order to enter the busy +state. As a result of `command` signals being scheduled, we may get +into a situation where the port gets flooded by a huge amount of +`command` signals before it even gets a chance to set itself into a +busy state. This since it has not been scheduled for execution +yet. That is, under these circumstances the busy port functionality +loose the flow control properties it was intended to provide. + +In order to solve this, we introduced a new busy feature, namely "busy +port queue". The port has a limit of `command` data that is allowed to +be enqueued in the task queue. When this limit is reached, the port +will automatically enter a busy port queue state. When in this state, +senders of `command` signals will be suspended, but `command` signals +will still be delivered to the port unless it is also in a busy port +state. This limit is known as the high limit. + +There is also a low limit. When the amount of queued `command` data +falls below this limit and the port is in a busy port queue state, the +busy port queue state is automatically disabled. The low limit should +typically be significantly lower than the high limit in order to +prevent frequent oscillation around the busy port queue state. + +By introduction of this new busy state we still can provide the flow +control. Old driver do not even have to be changed. The limits can, +however, be configured and even disabled by the port. By default the +high limit is 8 KB and the low limit is 4 KB. + +### Preparation of Signal Send ### + +Previously all operations sending signals to ports began by acquiring +the port lock, then performed preparations for sending the signal, and +then finaly sent the signal. The preparations typically included +inspecting the state of the port, and preparing the data to pass along +with the signal. The preparation of data is frequently quite time +consuming, and did not really depend on the port. That is we would +like to do this without having the port lock locked. + +In order to improve this, state information was re-organized in the +port structer, so that we can access it using atomic memory +operations. 
This together with the new port table implementation, +enabled us to lookup the port and inspect the state before acquiring +the port lock, which in turn made it possible to perform preparations +of signal data before acquiring the port lock. + +### Preserving Low Latency ### + +If we disregard the contended cases, we will inevitably get a higher +latency when scheduling signals for execution at a later time than by +executing the signal immediately. In order to preserve the low latency +we now first check if this is a contended case or not. If it is, we +schedule the signal for later execution; otherwise, we execute the +signal immediately. It is a contended case if other signals already +are scheduled on the port, or if we fail to acquire the port +lock. That is we will not block waiting for the lock. + +Doing it this way we will preserve the low latency at the expense of +lost potential parallel execution of the signal and other code in the +process sending the signal. This default behaviour can however be +changed on port basis or system wide, forcing scheduling of all +signals from processes to ports that are not part of a synchronous +communication. That is, an unconditional request/response pair of +asynchronous signals. In this case it is no potential for parallelism, +and by that no point forcing scheduling of the request signal. + +The immediate execution of signals may also cause a scheduler that is +about to execute scheduled tasks to block waiting for the port +lock. This is however more or less the only scenario where a scheduler +needs to wait for the port lock. The maximum time it has to wait is +the time it takes to execute one signal, since we always schedule +signals when contention occurs. + +### Signal Operations ### + +Besides implementing the functionality enabling the scheduling, +preparation of signal data without port lock, etc, each operation +sending signals to ports had to be quite extensively re-written. This +in order to move all sub-operations that can be done without the lock +to a place before we have acquired the lock, and also since signals +now sometimes are executed immediately and sometimes scheduled for +execution at a later time which put different requirements on the data +to pass along with the signal. + +### Some Benchmark Results ### + +When running some simple benchmarks where contention only occur due to +I/O signals contending with signals from one single process we got a +speedup of 5-15%. When multiple processes send signals to one single +port the improvements can be much larger, but the scenario with one +process contending with I/O is the most common one. + +The benchmarks were run on a relatively new machine with an Intel i7 +quad core processor with hyper-threading using 8 schedulers.
\ No newline at end of file diff --git a/erts/emulator/internal_doc/ProcessManagementOptimizations.md b/erts/emulator/internal_doc/ProcessManagementOptimizations.md new file mode 100644 index 0000000000..9e83633bef --- /dev/null +++ b/erts/emulator/internal_doc/ProcessManagementOptimizations.md @@ -0,0 +1,172 @@ +Process Management Optimizations +================================ + +Problems +-------- + +Early versions of the SMP support for the runtime system completely +relied on locking in order to protect data accesses from multiple +threads. In some cases this isn't that problematic, but in some cases +it really is. It complicates the code, ensuring all locks needed are +actually held, and ensuring that all locks are acquired in such an +order that no deadlock occur. Acquiring locks in the right order often +also involve releasing locks held, forcing threads to reread data +already read. A good recipe for creation of bugs. Trying to use more +fine-grained locking in order to increase possible parallelism in the +system makes the complexity situation even worse. Having to acquire a +bunch of locks when doing operations also often cause heavy lock +contention which cause poor scalability. + +Management of processes internally in the runtime system suffered from +these problems. When changing state on a process, for example from +`waiting` to `runnable`, a lock on the process needed to be +locked. When inserting a process into a run queue also a lock +protecting the run queue had to be locked. When migrating a process +from one run queue to another run queue, locks on both run queues and +on the process had to be locked. + +This last example is a quite common case in during normal +operation. For example, when a scheduler thread runs out of work it +tries to steal work from another scheduler threads run queue. When +searching for a victim to steal from there was a lot of juggling of +run queue locks involved, and during the actual theft finalized by +having to lock both run queues and the process. When one scheduler +runs out of work, often others also do, causing lots of lock +contention. + +Solution +-------- + +### Process ### + +In order to avoid these situations we wanted to be able to do most of +the fundamental operations on a process without having to acquire a +lock on the process. Some examples of such fundamental operations are, +moving a process between run queues, detecting if we need to insert it +into a run queue or not, detecting if it is alive or not. + +All of this information in the process structure that was needed by +these operations was protected by the process `status` lock, but the +information was spread across a number of fields. The fields used was +typically state fields that could contain a small number of different +states. By reordering this information a bit we could *easily* fit +this information into a 32-bit wide field of bit flags (only 12-flags +were needed). By moving this information we could remove five 32-bit +wide fields and one pointer field from the process structure! The move +also enabled us to easily read and change the state using atomic +memory operations. + +### Run Queue ### + +As with processes we wanted to be able to do the most fundamental +operations without having to acquire a lock on it. The most important +being able to determine if we should enqueue a process in a specific +run queue or not. This involves being able to read actual load, and +load balancing information. 
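As a condensed sketch of what these lock free checks could look like, here
is an illustration combining the state flags from the Process section above
with a lock free run queue choice, written with C11 atomics and invented
names. ERTS has its own atomic API, many more state flags, and bases the
choice of run queue on the migration paths and limits described below; the
simple "shortest queue" scan here is only for brevity. `enqueue_locked()`,
`run_queues` and `nr_of_run_queues` are assumed helpers and globals.

```c
#include <stdatomic.h>
#include <stdint.h>

#define PS_ACTIVE   (1u << 0)   /* process has work to do            */
#define PS_IN_RUNQ  (1u << 1)   /* process is already in a run queue */
#define PS_EXITING  (1u << 2)   /* process is terminating            */
/* illustrative subset; the real field holds 12 flags */

typedef struct process {
    _Atomic uint32_t state;     /* all of the above in one 32-bit word */
    /* ... */
} Process;

typedef struct run_queue {
    _Atomic uint32_t len;       /* current length, readable without the lock */
    /* lock, queues, ... */
} RunQueue;

extern RunQueue *run_queues;    /* assumed: one run queue per scheduler */
extern int nr_of_run_queues;
void enqueue_locked(RunQueue *rq, Process *p);   /* assumed helper */

/* Called when p becomes runnable, e.g. due to reception of a message. */
void make_runnable(Process *p)
{
    uint32_t old_state = atomic_load_explicit(&p->state, memory_order_relaxed);
    uint32_t new_state;
    do {
        if (old_state & PS_EXITING)
            return;                              /* nothing to schedule */
        new_state = old_state | PS_ACTIVE | PS_IN_RUNQ;
    } while (!atomic_compare_exchange_weak_explicit(&p->state,
                                                    &old_state, new_state,
                                                    memory_order_acq_rel,
                                                    memory_order_relaxed));
    if (old_state & PS_IN_RUNQ)
        return;                                  /* already enqueued */

    /* Pick a run queue using only lock free load information, then lock
     * just the chosen queue and insert the process. */
    RunQueue *best = &run_queues[0];
    for (int i = 1; i < nr_of_run_queues; i++)
        if (atomic_load_explicit(&run_queues[i].len, memory_order_relaxed)
            < atomic_load_explicit(&best->len, memory_order_relaxed))
            best = &run_queues[i];
    enqueue_locked(best, p);
}
```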
+ +The load balancing functionality is triggered at repeated fixed +intervals. The load balancing more or less strives to even out run +queue lengths over the system. When balancing is triggered, +information about every run queue is gathered, migrations paths and +run queue length limits are set up. Migration paths and limits are +fixed until the next balancing has been done. The most important +information about each run queue is the maximum run queue length since +last balancing. All of this information were previously stored in the +run queues themselves. + +When a process has become runnable, for example due to reception of a +message, we need to determine which run queue to enqueue it +in. Previously this at least involved locking the run queue that the +process currently was assigned to while holding the status lock on the +process. Depending on load we sometimes also had to acquire a lock on +another run queue in order to be able to determine if it should be +migrated to that run queue or not. + +In order to be able to decide which run queue to use without having to +lock any run queues, we moved all fixed balancing information out of +the run queues into a global memory block. That is, migration paths +and run queue limits. Information that need to be frequently updated, +like for example maximum run queue length, were kept in the run queue, +but instead of operating on this information under locks we now use +atomic memory operations when accessing this information. This made it +possible to first determine which run queue to use, without locking +any run queues, and when decided, lock the chosen run queue and insert +the process. + +#### Fixed Balancing Information #### + +When determining which run queue to choose we need to read the fixed +balancing information that we moved out of the run queues. This +information is global, read only between load balancing operations, +but will be changed during a load balancing. We do not want to +introduce a global lock that needs to be acquired when accessing this +information. A reader optimized rwlock could avoid some of the +overhead since the data is most frequently read, but it would +unavoidably cause disruption during load balancing, since this +information is very frequently read. The likelihood of a large +disruption due to this also increase as number of schedulers grows. + +Instead of using a global lock protecting modifications of this +information, we write a completely new version of it at each load +balancing. The new version is written in another memory block than the +previous one, and published by issuing a write memory barrier and then +storing a pointer to the new memory block in a global variable using +an atomic write operation. + +When schedulers need to read this information, they read the pointer +to currently used information using an atomic read operation, and then +issue a data dependency read barrier, which on most architectures is a +no-op. That is, it is very little overhead getting access to this +information. + +Instead of allocating and deallocating memory blocks for the different +versions of the balancing information we keep old memory blocks and +reuse them when it is safe to do so. In order to be able to determine +when it is safe to reuse a block we use the thread progress +functionality, ensuring that no threads have any references to the +memory block when we reuse it. + +#### Be Less Aggressive #### + +We implemented a test version using lock free run queues. 
This +implementation did however not perform as good as the version using +one lock per run queue. The reason for this was not investigated +enough to say why this was. Since the locked version performed better +we kept it, at least for now. The lock free version, however, forced +us to use other solutions, some of them we kept. + +Previously when a process that was in a run queue got suspended, we +removed it from the queue straight away. This involved locking the +process, locking the run queue, and then unlinking it from the double +linked list implementing the queue. Removing a process from a lock +free queue gets really complicated. Instead, of removing it from the +queue, we just leave it in the queue and mark it as suspended. When +later selected for execution we check if the process is suspended, if +so just dropped it. During its time in the queue, it might also get +resumed again, if so execute it when it get selected for execution. + +By keeping this part when reverting back to a locked implementation, +we could remove a pointer field in each process structure, and avoid +unnecessary operations on the process and the queue which might cause +contention. + +### Combined Modifications ### + +By combining the modifications of the process state management and the +run queue management, we can do large parts of the work involved when +managing processes with regards to scheduling and migration without +having any locks locked at all. In these situations we previously had +to have multiple locks locked. This of course caused a lot of rewrites +across large parts of the runtime system, but the rewrite both +simplified code and eliminated locking at a number of places. The +major benefit is, of course, reduced contention. + +### A Benchmark Result ### + +When running the chameneosredux benchmark, schedulers frequently run +out of work trying to steal work from each other. That is, either +succeeding in migrating, or trying to migrate processes which is a +scenario which we wanted to optimize. By the introduction of these +improvements, we got a speedup of 25-35% when running this benchmark +on a relatively new machine with an Intel i7 quad core processor with +hyper-threading using 8 schedulers.
\ No newline at end of file diff --git a/erts/emulator/internal_doc/ThreadProgress.md b/erts/emulator/internal_doc/ThreadProgress.md new file mode 100644 index 0000000000..6118bcf0f6 --- /dev/null +++ b/erts/emulator/internal_doc/ThreadProgress.md @@ -0,0 +1,308 @@ +Thread Progress +=============== + +Problems +-------- + +### Knowing When Threads Have Completed Accesses to a Data Structure ### + +When multiple threads access the same data structure you often need to +know when all threads have completed their accesses. For example, in +order to know when it is safe to deallocate the data structure. One +simple way to accomplish this is to reference count all accesses to +the data structure. The problem with this approach is that the cache +line where the reference counter is located needs to be communicated +between all involved processors. Such communication can become +extremely expensive and will scale poorly if the reference counter is +frequently accessed. That is, we want to use some other approach of +keeping track of threads than reference counting. + +### Knowing That Modifications of Memory is Consistently Observed ### + +Different hardware architectures have different memory models. Some +architectures allows very aggressive reordering of memory accesses +while other architectures only reorder a few specific cases. Common to +all modern hardware is, however, that some type of reordering will +occur. When using locks to protect all memory accesses made from +multiple threads such reorderings will not be visible. The locking +primitives will ensure that the memory accesses will be ordered. When +using lock free algorithms one do however have to take this reordering +made by the hardware into account. + +Hardware memory barriers or memory fences are instructions that can be +used to enforce order between memory accesses. Different hardware +architectures provide different memory barriers. Lock free algorithms +need to use memory barriers in order to ensure that memory accesses +are not reordered in such ways that the algorithm breaks down. Memory +barriers are also expensive instructions, so you typically want to +minimize the use of these instructions. + +Functionality Used to Address These Problems +------------------------------------------- + +The "thread progress" functionality in the Erlang VM is used to +address these problems. The name "thread progress" was chosen since we +want to use it to determine when all threads in a set of threads have +made such progress so that two specific events have taken place for +all them. + +The set of threads that we are interested in we call managed +threads. The managed threads are the only threads that we get any +information about. These threads *have* to frequently report +progress. Not all threads in the system are able to frequently report +progress. Such threads cannot be allowed in the set of managed threads +and are called unmanaged threads. An example of unmanaged threads are +threads in the async thread pool. Async threads can be blocked for +very long times and by this be prevented from frequently reporting +progress. Currently only scheduler threads and a couple of other +threads are managed threads. + +### Thread Progress Events ### + +Any thread in the system may use the thread progress functionality in +order to determine when the following events have occured at least +once in all managed threads: + +1. 
The thread has returned from other code to a known state in the
   thread progress functionality, which is independent of any other
   code.
2. The thread has executed a full memory barrier.

These events, of course, need to occur ordered with respect to other memory
operations. The operation of determining this begins by initiating the
thread progress operation. The thread that initiated the thread progress
operation thereafter polls for the completion of the operation. Both of
these events must occur at least once *after* the thread progress operation
has been initiated, and at least once *before* the operation has completed,
in each managed thread. This is ordered using communication via memory,
which makes it possible to draw conclusions about the memory state after
the thread progress operation has completed. Let's call the progress made
from initiation to completion "thread progress".

Assuming that the thread progress functionality is efficient, a lot of
algorithms can both be simplified and made more efficient compared to using
the first approach that comes to mind. A couple of examples follow.

By being able to determine when the first event above has occurred we can
easily know when all managed threads have completed their accesses to a
data structure. This can be determined in the following way. We have an
implementation of some functionality `F` using a data structure `D`. The
reference to `D` is always looked up before `D` is accessed, and the
references to `D` are always dropped before we leave the code implementing
`F`. If we remove the possibility to look up `D` and then wait until the
first event has occurred in all managed threads, no managed thread can have
any references to the data structure `D`. This could, for example, have
been achieved by using reference counting, but the cache line containing
the reference counter would in that case be ping-ponged between all
processors accessing `D` at every access.

By being able to determine when the second event has occurred, it is quite
easy to do complex modifications of memory that need to be seen
consistently by other threads, without having to resort to locking. By
doing the modifications, then issuing a full memory barrier, then waiting
until the second event has occurred in all managed threads, and then
publishing the modifications, we know that all managed threads reading this
memory will get a consistent view of the modifications. Managed threads
reading this will not have to issue any extra memory barriers at all.

Implementation of the Thread Progress Functionality
---------------------------------------------------

### Requirements on the Implementation ###

In order to be able to determine when all managed threads have reached the
states that we are interested in, we need to communicate between all
involved threads. We of course want to minimize this communication.

We also want threads to be able to determine when thread progress has been
made relatively quickly. That is, we need to strike a balance between
communication overhead and the time it takes to complete the operation.

### API ###

I will only present the most important functions in the API here.

* `ErtsThrPrgrVal erts_thr_progress_later(void)` - Initiation of the
  operation. The thread progress value returned can be used to test for
  completion of the operation.
* `int erts_thr_progress_has_reached(ErtsThrPrgrVal val)` - Returns
  a non zero value when we have reached the thread progress value
  passed as argument. That is, when a non zero value is returned, the
  operation has completed.
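As an illustration of how these two functions are typically combined,
consider the following sketch of the deallocation pattern described
earlier; `MyTable` is an invented structure, and `unpublish()` and
`free_table()` are assumed helpers provided by the surrounding code, not
part of the real API.

```c
/* Sketch only: assumes the declarations above (erl_thr_progress.h)
 * are visible. */
typedef struct my_table {
    /* ... payload ... */
    ErtsThrPrgrVal unref_val;   /* progress value to reach before freeing */
} MyTable;

void unpublish(MyTable *tab);   /* assumed: makes the table impossible to look up */
void free_table(MyTable *tab);  /* assumed: final deallocation */

static void retire_table(MyTable *tab)
{
    /* Make sure no managed thread can find the table any more, then
     * note the progress value that must be reached before no managed
     * thread can still hold a reference to it. */
    unpublish(tab);
    tab->unref_val = erts_thr_progress_later();
}

static void try_free_table(MyTable *tab)
{
    /* Called now and then, e.g. from a scheduler loop, while other
     * useful work is done in between. */
    if (erts_thr_progress_has_reached(tab->unref_val))
        free_table(tab);
}
```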
When a thread calls `my_val = erts_thr_progress_later()` and waits for
`erts_thr_progress_has_reached(my_val)` to return a non zero value, it
knows that thread progress has been made.

While waiting for `erts_thr_progress_has_reached()` to return a non zero
value we typically do not want to block, but instead want to continue
working on other things. If we run out of other things to work on we
typically do want to block until we have reached the thread progress value
that we are waiting for. In order to be able to do this we provide
functionality for waking up a thread when a certain thread progress value
has been reached:

* `void erts_thr_progress_wakeup(ErtsSchedulerData *esdp,
  ErtsThrPrgrVal val)` - Request wake up. The calling thread will be
  woken when thread progress has reached `val`.

Managed threads frequently need to update their thread progress by
calling the following functions:

* `int erts_thr_progress_update(ErtsSchedulerData *esdp)` - Update
  thread progress. If a non zero value is returned,
  `erts_thr_progress_leader_update()` has to be called without any
  locks held.
* `int erts_thr_progress_leader_update(ErtsSchedulerData *esdp)` -
  Leader update of thread progress.

Unmanaged threads can delay thread progress from being made:

* `ErtsThrPrgrDelayHandle erts_thr_progress_unmanaged_delay(void)` -
  Delay thread progress.
* `void erts_thr_progress_unmanaged_continue(ErtsThrPrgrDelayHandle
  handle)` - Let thread progress continue.

Scheduler threads can schedule an operation to be executed by the
scheduler itself when thread progress has been made:

* `void erts_schedule_thr_prgr_later_op(void (*funcp)(void *), void
  *argp, ErtsThrPrgrLaterOp *memp)` - Schedule a call to `funcp`. The
  call `(*funcp)(argp)` will be executed when thread progress has been
  made since the call to `erts_schedule_thr_prgr_later_op()` was
  made.

### Implementation ###

In order to determine when the events have happened we use a global counter
that is incremented when all managed threads have called
`erts_thr_progress_update()` (or `erts_thr_progress_leader_update()`).
This could naively be implemented using a "thread confirmed" counter. That
would however cause an explosion of communication where all involved
processors would need to communicate with each other at each update.

Instead of confirming at a global location, each thread confirms that it
accepts an increment of the global counter in its own cache line. These
confirmation cache lines are located in sequence in an array, and each
confirmation cache line will only ever be written by one and only one
thread. One of the managed threads always has the leader responsibility.
This responsibility may jump between threads, but as long as there is some
activity in the system, one of them will always have the leader
responsibility. The thread with the leader responsibility will call
`erts_thr_progress_leader_update()`, which will check that all other
threads have confirmed an increment of the global counter before doing the
increment of the global counter. The leader thread is the only thread
reading the confirmation cache lines.

Doing it this way we get a communication pattern where information goes
from the leader thread out to all other managed threads and then back from
the other threads to the leader thread.
This is because only the leader thread writes to the global counter and all
other threads only read it, and because each confirmation cache line is
only written by one specific thread and only read by the leader thread.
When the managed threads are distributed over different processors, the
communication between processors will be a reflection of this communication
pattern between threads.

The value returned from `erts_thr_progress_later()` equals this thread's
latest confirmed value plus two. The global value may be the latest
confirmed value, or the latest confirmed value minus one. In order to be
certain that all other managed threads actually will call
`erts_thr_progress_update()` at least once before we reach the value
returned from `erts_thr_progress_later()`, the global counter plus one is
not enough. This is because all other threads may already have confirmed
the current global value plus one at the time when we call
`erts_thr_progress_later()`. They are, however, guaranteed not to have
confirmed the global value plus two at this time.

The above described implementation more or less minimizes the communication
needed before we can increment the global counter. The amount of
communication in the system due to the thread progress functionality does,
however, also depend on the frequency with which managed threads call
`erts_thr_progress_update()`. Today each scheduler thread calls
`erts_thr_progress_update()` more or less each time an Erlang process is
scheduled out. One way of further reducing communication due to the thread
progress functionality is to only call `erts_thr_progress_update()` every
second or third time an Erlang process is scheduled out, or even less
frequently than that. However, by doing updates of thread progress less
frequently, all operations depending on the thread progress functionality
will also take a longer time.

#### Delay of Thread Progress by Unmanaged Threads ####

In order to implement delay of thread progress from unmanaged threads we
use two reference counters, one being `current` and one being `waiting`.
When an unmanaged thread wants to delay thread progress it increments
`current` and gets a handle back to the reference counter it incremented.
When it later wants to enable continuation of thread progress it uses the
handle to decrement the reference counter it previously incremented.

When the leader thread is about to increment the global thread progress
counter it verifies that the `waiting` counter is zero before doing so. If
it is not zero, the leader isn't allowed to increment the global counter,
and needs to wait before it can do this. When it is zero, it swaps the
`waiting` and `current` counters before increasing the global counter. From
now on the new `waiting` counter will only decrease, so that it eventually
will reach zero, making it possible to increment the global counter the
next time. If we only used one reference counter it could potentially be
held above zero forever by different unmanaged threads.

When an unmanaged thread increments the `current` counter it will not
prevent the next increment of the global counter, but instead the increment
after that. This is sufficient since the global counter needs to be
incremented two times before thread progress has been made. It is also
desirable not to prevent the first increment, since the likelihood
increases that the delay is withdrawn before any increment of the global
counter is delayed.
That is, the operation +will cause as little disruption as possible. + +However, this feature of delaying thread progress from unmanaged +threads should preferably be used as little as possible, since heavy +use of it will cause contention on the reference counter cache +lines. The functionality is however very useful in code which normally +only executes in managed threads, but which may under some infrequent +circumstances be executed in other threads. + +#### Overhead #### + +The overhead caused by the thread progress functionality is more or +less fixed using the same amount of schedulers regardless of the +number of uses of the functionality. Already today quite a lot of +functionality use it, and we plan to use it even more. When rewriting +old implementations of ERTS internal functionality to use the thread +progress functionality, this implies removing communication in the old +implementation. Otherwise it is simply no point rewriting the old +implementation to use the thread progress functionality. Since the +thread progress overhead is more or less fixed, the rewrite will cause +a reduction of the total communication in the system. + +##### An Example ##### + +The main structure of an ETS table was originally managed using +reference counting. Already a long time ago we replaced this strategy +since the reference counter caused contention on each access of the +table. The solution used was to schedule "confirm deletion" jobs on +each scheduler in order to know when it was safe to deallocate the +table structure of a removed table. These confirm deletion jobs needed +to be allocated. That is, we had to allocate and deallocate as many +blocks as schedulers in order to deallocate one block. This of course +was a quite an expensive operation, but we only needed to do this once +when removing a table. It was more important to get rid of the +contention on the reference counter which was present on every +operation on the table. + +When the thread progress functionality had been introduced, we could +remove the code implementing the "confirm deletion" jobs, and then +just schedule a thread progress later operation which deallocates the +structure. Besides simplifying the code a lot, we got an increase of +more than 10% of the number of transactions per second handled on a +mnesia tpcb benchmark executing on a quad core machine. diff --git a/erts/emulator/internal_doc/Tracing.md b/erts/emulator/internal_doc/Tracing.md new file mode 100644 index 0000000000..30bc5327a7 --- /dev/null +++ b/erts/emulator/internal_doc/Tracing.md @@ -0,0 +1,220 @@ +Non-blocking trace setting +========================== + +Introduction +------------ + +Before OTP R16 when trace settings were changed by `erlang:trace_pattern`, +all other execution in the VM were halted while the trace operation +was carried out in single threaded mode. Similar to code loading, this +can impose a severe problem for availability that grows with the +number of cores. + +In OTP R16, trace breakpoints are set in the code without blocking the +VM. Erlang processes may continue executing undisturbed in parallel +during the entire operation. The same base technique is used as for +code loading. A staging area of breakpoints is prepared and then made +active with a single atomic operation. + + +Redesign of Breakpoint Wheel +---------------------------- + +To make it easier to manage breakpoints without single threaded mode a +redesign of the breakpoint mechanism has been made. 
The old "breakpoint wheel" data structure was a circular double-linked list
of breakpoints for each instrumented function. It was invented before the
SMP emulator. To support it in the SMP emulator, it was essentially
expanded to one breakpoint wheel per scheduler. As more breakpoint types
have been added, the implementation has become messy and hard to understand
and maintain.

In the new design the old wheel was dropped and replaced by one struct
(`GenericBp`) holding the data for all types of breakpoints for each
instrumented function. A bit-flag field is used to indicate which different
types of break actions are enabled.


Same Same but Different
-----------------------
Even though `trace_pattern` uses the same technique as the non-blocking
code loading, with replicated generations of data structures and an atomic
switch, the implementations are quite separate from each other. One initial
idea was to use the existing mechanism of code loading to do a dummy load
operation that would make a copy of the affected modules. That copy could
then be instrumented with breakpoints before making it reachable with the
same atomic switch as done for code loading. This approach seems
straightforward but has a number of shortcomings, one being the large
memory footprint when many modules are instrumented. Another problem is how
execution will reach the new instrumented code. Normally loaded code can
only be reached through external function calls. Trace settings must be
activated instantaneously without the need for external function calls.

The chosen solution is instead for tracing to use the technique of
replication applied to the data structures for breakpoints. Two generations
of breakpoints are kept and identified by an index of 0 or 1. The global
atomic variable `erts_active_bp_index` determines which generation of
breakpoints running code will use.

### Atomicity Without Atomic Operations

Not using the code loading generations (or any other code duplication)
means that `trace_pattern` must at some point write to the active beam code
in order for running processes to reach the staged breakpoint structures.
This can be done with one single atomic write operation per instrumented
function. The beam instruction words are however read with normal memory
loads and not through the atomic API. The only guarantee we need is that
the written instruction word is seen as atomic. Either fully written or not
at all. This is true for word aligned write operations on all hardware
architectures we use.


Adding a new Breakpoint
-----------------------
This is a simplified sequence describing what `trace_pattern` goes through
when adding a new breakpoint.

1. Seize exclusive code write permission (suspend the process until we get
   it).

2. Allocate a breakpoint structure `GenericBp` including both generations.
   Set the active part as disabled with a zeroed flag field. Save the
   original instruction word in the breakpoint.

3. Write a pointer to the breakpoint at offset -4 from the first
   instruction "func_info" header.

4. Set the staging part of the breakpoint as enabled with the specified
   breakpoint data.

5. Wait for thread progress.

6. Write an `op_i_generic_breakpoint` as the first instruction for the
   function. This instruction will execute the breakpoint that it finds at
   offset -4.

7. Wait for thread progress.

8. Commit the breakpoint by switching `erts_active_bp_index`.

9. Wait for thread progress.
10. Prepare for the next call to `trace_pattern` by updating the new
    staging part (the old active) of the breakpoint to be identical to the
    new active part.

11. Release code write permission and return from `trace_pattern`.


The code write permission "lock" seized in step 1 is the same one as used
by code loading. This will ensure that only one process at a time can stage
new trace settings, but it will also prevent concurrent code loading and
make sure we see a consistent view of the beam code during the entire
sequence.

Between step 6 and 8, running processes might execute the written
`op_i_generic_breakpoint` instruction. They will get the breakpoint
structure written in step 3, read `erts_active_bp_index` and execute the
corresponding part of the breakpoint. Before the switch in step 8 becomes
visible they will however execute the disabled part of the breakpoint
structure and do nothing other than executing the saved original
instruction.


Updating and Removing Breakpoints
---------------------------------

The above sequence only described adding a new breakpoint. We do basically
the same sequence to update the settings of an existing breakpoint, except
that steps 2, 3 and 6 can be skipped as they have already been done.

To remove a breakpoint some more steps are needed. The idea is to first
stage the breakpoint as disabled, do the switch, wait for thread progress
and then remove the disabled breakpoint by restoring the original beam
instruction.

Here is a more complete sequence that covers adding, updating and removing
breakpoints.

1. Seize exclusive code write permission (suspend the process until we get
   it).

2. Allocate new breakpoint structures with a disabled active part and the
   original beam instruction. Write a pointer to the breakpoint in the
   "func_info" header at offset -4.

3. Update the staging part of all affected breakpoints. Disable
   breakpoints that are to be removed.

4. Wait for thread progress.

5. Write an `op_i_generic_breakpoint` as the first instruction for all
   functions with new breakpoints.

6. Wait for thread progress.

7. Commit all staged breakpoints by switching `erts_active_bp_index`.

8. Wait for thread progress.

9. Restore the original beam instruction for disabled breakpoints.

10. Wait for thread progress.

11. Prepare for the next call to `trace_pattern` by updating the new
    staging area (the old active) for all enabled breakpoints.

12. Deallocate disabled breakpoint structures.

13. Release code write permission and return from `trace_pattern`.


### All that Waiting for Thread Progress

There are four rounds of waiting for thread progress in the above sequence.
In the code loading sequence we sacrificed the memory overhead of three
generations to avoid a second round of thread progress. The latency of
`trace_pattern` should, however, not be such a big problem, as it is
normally not called in a rapid sequence.

The waiting in step 4 is to make sure all threads will see an updated view
of the breakpoint structures once they become reachable through the
`op_i_generic_breakpoint` instruction written in step 5.

The waiting in step 6 is to make the activation of the new trace settings
"as atomic as possible". Different cores might see the new value of
`erts_active_bp_index` at different times, as it is read without any memory
barrier. But this is the best we can do without more expensive thread
synchronization.
+ +The waiting in step 8 is to make sure we dont't restore the original +bream instructions for disabled breakpoints until we know that no +thread is still accessing the old enabled part of a disabled +breakpoint. + +The waiting in step 10 is to make sure no lingering thread is still +accessing disabled breakpoint structures to be deallocated in step +12. + + +Global Tracing +-------------- + +Call tracing with `global` option only affects external function +calls. This was earlier handled by inserting a special trace +instruction in export entries without the use of breakpoints. With the +new non-blocking tracing we want to avoid special handling for global +tracing and make use of the staging and atomic switching within the +breakpoint mechanism. The solution was to create the same type of +breakpoint structure for a global call trace. The difference to local +tracing is that we insert the `op_i_generic_breakpoint` instruction +(with its pointer at offset -4) in the export entry rather than in the +code. + + +Future work +----------- + +We still go to single threaded mode when new code is loaded for a +module that is traced, or when loading code when there is a default +trace pattern set. That is not impossible to fix, but that requires +much closer cooperation between tracing BIFs and the loader BIFs. diff --git a/erts/emulator/sys/unix/erl_unix_sys_ddll.c b/erts/emulator/sys/unix/erl_unix_sys_ddll.c index 12c47d0088..8760b58839 100644 --- a/erts/emulator/sys/unix/erl_unix_sys_ddll.c +++ b/erts/emulator/sys/unix/erl_unix_sys_ddll.c @@ -101,7 +101,7 @@ void erl_sys_ddll_init(void) { /* * Open a shared object */ -int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err) +int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err) { #if defined(HAVE_DLOPEN) char* dlname; diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c index 61f9f6a59a..59e34eb819 100644 --- a/erts/emulator/sys/unix/sys.c +++ b/erts/emulator/sys/unix/sys.c @@ -547,6 +547,25 @@ erts_sys_pre_init(void) #endif #endif /* USE_THREADS */ erts_smp_atomic_init_nob(&sys_misc_mem_sz, 0); + + { + /* + * Unfortunately we depend on fd 0,1,2 in the old shell code. + * So if for some reason we do not have those open when we start + * we have to open them here. Not doing this can cause the emulator + * to deadlock when reaping the fd_driver ports :( + */ + int fd; + /* Make sure fd 0 is open */ + if ((fd = open("/dev/null", O_RDONLY)) != 0) + close(fd); + /* Make sure fds 1 and 2 are open */ + while (fd < 3) { + fd = open("/dev/null", O_WRONLY); + } + close(fd); + } + } void diff --git a/erts/emulator/sys/win32/erl_win32_sys_ddll.c b/erts/emulator/sys/win32/erl_win32_sys_ddll.c index 2d3f073cc2..338f0d7386 100644 --- a/erts/emulator/sys/win32/erl_win32_sys_ddll.c +++ b/erts/emulator/sys/win32/erl_win32_sys_ddll.c @@ -59,7 +59,7 @@ void erl_sys_ddll_init(void) { * Open a shared object * Expecting 'full_name' as an UTF-8 string. 
*/ -int erts_sys_ddll_open2(const char *full_name, void **handle, ErtsSysDdllError* err) +int erts_sys_ddll_open(const char *full_name, void **handle, ErtsSysDdllError* err) { HINSTANCE hinstance; int len; diff --git a/erts/emulator/test/binary_SUITE.erl b/erts/emulator/test/binary_SUITE.erl index f493f8c1ac..a390c536bb 100644 --- a/erts/emulator/test/binary_SUITE.erl +++ b/erts/emulator/test/binary_SUITE.erl @@ -506,8 +506,8 @@ external_size(Config) when is_list(Config) -> io:format("Unaligned size: ~p\n", [Sz2]), ?line ?t:fail() end, - ?line erlang:external_size(Bin) =:= erlang:external_size(Bin, [{minor_version, 1}]), - ?line erlang:external_size(Unaligned) =:= erlang:external_size(Unaligned, [{minor_version, 1}]). + true = (erlang:external_size(Bin) =:= erlang:external_size(Bin, [{minor_version, 1}])), + true = (erlang:external_size(Unaligned) =:= erlang:external_size(Unaligned, [{minor_version, 1}])). external_size_1(Term, Size0, Limit) when Size0 < Limit -> case erlang:external_size(Term) of @@ -1241,16 +1241,27 @@ bsbs_1(A) -> Bin = binary_to_term_stress(<<131,$M,5:32,A,0,0,0,0,0>>), BinSize = bit_size(Bin). +%% lists:foldl(_,_,lists:seq(_,_)) with less heap consumption +lists_foldl_seq(Fun, Acc0, N, To) when N =< To -> + Acc1 = Fun(N, Acc0), + lists_foldl_seq(Fun, Acc1, N+1, To); + +lists_foldl_seq(_, Acc, _, _) -> + Acc. + deep(Config) when is_list(Config) -> - ?line deep_roundtrip(lists:foldl(fun(E, A) -> - [E,A] - end, [], lists:seq(1, 1000000))), - ?line deep_roundtrip(lists:foldl(fun(E, A) -> - {E,A} - end, [], lists:seq(1, 1000000))), - ?line deep_roundtrip(lists:foldl(fun(E, A) -> - fun() -> {E,A} end - end, [], lists:seq(1, 1000000))), + deep_roundtrip(lists_foldl_seq(fun(E, A) -> + [E,A] + end, [], 1, 1000000)), + erlang:garbage_collect(), + deep_roundtrip(lists_foldl_seq(fun(E, A) -> + {E,A} + end, [], 1, 1000000)), + erlang:garbage_collect(), + deep_roundtrip(lists_foldl_seq(fun(E, A) -> + fun() -> {E,A} end + end, [], 1, 1000000)), + erlang:garbage_collect(), ok. deep_roundtrip(T) -> diff --git a/erts/emulator/test/driver_SUITE.erl b/erts/emulator/test/driver_SUITE.erl index 7087542899..06211406b4 100644 --- a/erts/emulator/test/driver_SUITE.erl +++ b/erts/emulator/test/driver_SUITE.erl @@ -2075,6 +2075,21 @@ thr_msg_blast(Config) when is_list(Config) -> Res end. +-define(IN_RANGE(LoW_, VaLuE_, HiGh_), + case in_range(LoW_, VaLuE_, HiGh_) of + true -> ok; + false -> + case erlang:system_info(lock_checking) of + true -> + ?t:format("~p:~p: Ignore bad sched count due to " + "lock checking~n", + [?MODULE,?LINE]); + false -> + ?t:fail({unexpected_sched_counts, VaLuE_}) + end + end). + + consume_timeslice(Config) when is_list(Config) -> %% %% Verify that erl_drv_consume_timeslice() works. @@ -2131,15 +2146,8 @@ consume_timeslice(Config) when is_list(Config) -> Proc1 ! Go, wait_command_msgs(Port, 10), [{Port, Sprt1}, {Proc1, Sproc1}] = count_pp_sched_stop([Port, Proc1]), - case Sprt1 of - 10 -> - true = in_range(5, Sproc1-10, 7); - _ -> - case erlang:system_info(lock_checking) of - true -> ?t:format("Ignore bad sched count due to lock checking", []); - false -> ?t:fail({unexpected_sched_counts, Sprt1, Sproc1}) - end - end, + ?IN_RANGE(10, Sprt1, 10), + ?IN_RANGE(5, Sproc1-10, 7), "disabled" = port_control(Port, $D, ""), Proc2 = spawn_link(fun () -> @@ -2160,15 +2168,8 @@ consume_timeslice(Config) when is_list(Config) -> Proc2 ! 
Go, wait_command_msgs(Port, 10), [{Port, Sprt2}, {Proc2, Sproc2}] = count_pp_sched_stop([Port, Proc2]), - case Sprt2 of - 10 -> - true = in_range(1, Sproc2-10, 2); - _ -> - case erlang:system_info(lock_checking) of - true -> ?t:format("Ignore bad sched count due to lock checking", []); - false -> ?t:fail({unexpected_sched_counts, Sprt2, Sproc2}) - end - end, + ?IN_RANGE(10, Sprt2, 10), + ?IN_RANGE(1, Sproc2-10, 2), "enabled" = port_control(Port, $E, ""), Proc3 = spawn_link(fun () -> @@ -2188,15 +2189,8 @@ consume_timeslice(Config) when is_list(Config) -> Proc3 ! Go, wait_command_msgs(Port, 10), [{Port, Sprt3}, {Proc3, Sproc3}] = count_pp_sched_stop([Port, Proc3]), - case Sprt3 of - 10 -> - true = in_range(5, Sproc3-10, 7); - _ -> - case erlang:system_info(lock_checking) of - true -> ?t:format("Ignore bad sched count due to lock checking", []); - false -> ?t:fail({unexpected_sched_counts, Sprt3, Sproc3}) - end - end, + ?IN_RANGE(10, Sprt3, 10), + ?IN_RANGE(5, Sproc3-10, 7), "disabled" = port_control(Port, $D, ""), Proc4 = spawn_link(fun () -> @@ -2216,15 +2210,8 @@ consume_timeslice(Config) when is_list(Config) -> Proc4 ! Go, wait_command_msgs(Port, 10), [{Port, Sprt4}, {Proc4, Sproc4}] = count_pp_sched_stop([Port, Proc4]), - case Sprt4 of - 10 -> - true = in_range(1, Sproc4-10, 2); - _ -> - case erlang:system_info(lock_checking) of - true -> ?t:format("Ignore bad sched count due to lock checking", []); - false -> ?t:fail({unexpected_sched_counts, Sprt4, Sproc4}) - end - end, + ?IN_RANGE(10, Sprt4, 10), + ?IN_RANGE(1, Sproc4-10, 2), SOnl = erlang:system_info(schedulers_online), %% If only one scheduler use port with parallelism set to true, @@ -2272,8 +2259,8 @@ consume_timeslice(Config) when is_list(Config) -> wait_procs_exit([W5, Proc5]), wait_command_msgs(Port2, 10), [{Port2, Sprt5}, {Proc5, Sproc5}] = count_pp_sched_stop([Port2, Proc5]), - true = in_range(2, Sproc5, 3), - true = in_range(7, Sprt5, 20), + ?IN_RANGE(2, Sproc5, 3), + ?IN_RANGE(6, Sprt5, 20), count_pp_sched_start(), "disabled" = port_control(Port2, $D, ""), @@ -2307,8 +2294,8 @@ consume_timeslice(Config) when is_list(Config) -> wait_procs_exit([W6, Proc6]), wait_command_msgs(Port2, 10), [{Port2, Sprt6}, {Proc6, Sproc6}] = count_pp_sched_stop([Port2, Proc6]), - true = in_range(2, Sproc6, 3), - true = in_range(3, Sprt6, 6), + ?IN_RANGE(2, Sproc6, 3), + ?IN_RANGE(2, Sprt6, 6), process_flag(scheduler, 0), @@ -2316,6 +2303,7 @@ consume_timeslice(Config) when is_list(Config) -> receive {Port2, closed} -> ok end, ok. + wait_command_msgs(_, 0) -> ok; wait_command_msgs(Port, N) -> diff --git a/erts/emulator/test/exception_SUITE.erl b/erts/emulator/test/exception_SUITE.erl index 109cec25cb..09a7a87a9a 100644 --- a/erts/emulator/test/exception_SUITE.erl +++ b/erts/emulator/test/exception_SUITE.erl @@ -589,6 +589,13 @@ line_numbers(Config) when is_list(Config) -> [{file,ModFile},{line,_}]}|_]}} = (catch build_binary2(8, bad_binary)), + <<"abc",357:16>> = build_binary3(<<"abc">>), + {'EXIT',{badarg,[{?MODULE,build_binary3,1, + [{file,"bit_syntax.erl"},{line,72511}]}, + {?MODULE,line_numbers,1, + [{file,ModFile},{line,_}]}|_]}} = + (catch build_binary3(no_binary)), + {'EXIT',{function_clause, [{?MODULE,do_call_abs,[y,y], [{file,"gc_bif.erl"},{line,18}]}, @@ -691,6 +698,10 @@ build_binary2(Size, Bin) -> %Line 72505 id(0), %Line 72506 <<7:Size,Bin/binary>>. %Line 72507 +build_binary3(Bin) -> %Line 72509 + id(0), %Line 72510 + <<Bin/binary,357:16>>. %Line 72511 + -file("gc_bif.erl", 17). do_call_abs(x, Arg) -> %Line 18 abs(Arg). 
%Line 19 diff --git a/erts/emulator/test/scheduler_SUITE.erl b/erts/emulator/test/scheduler_SUITE.erl index 81539faa09..6a43e2b0e7 100644 --- a/erts/emulator/test/scheduler_SUITE.erl +++ b/erts/emulator/test/scheduler_SUITE.erl @@ -1495,7 +1495,7 @@ mcall(Node, Funs) -> end, Refs). erl_rel_flag_var() -> - "ERL_"++erlang:system_info(otp_release)++"_FLAGS". + "ERL_OTP"++erlang:system_info(otp_release)++"_FLAGS". clear_erl_rel_flags() -> EnvVar = erl_rel_flag_var(), diff --git a/erts/emulator/utils/make_version b/erts/emulator/utils/make_version index 7757fa8138..02b68f2b39 100755 --- a/erts/emulator/utils/make_version +++ b/erts/emulator/utils/make_version @@ -41,6 +41,9 @@ if ($ARGV[0] eq '-o') { my $release = shift; defined $release or die "No release specified"; +my $correction_package = shift; +defined $correction_package or die "No correction package specified"; + my $version = shift; defined $version or die "No version name specified"; @@ -53,6 +56,7 @@ open(FILE, ">$outputfile") or die "Can't create $outputfile: $!"; print FILE <<EOF; /* This file was created by 'make_version' -- don't modify. */ #define ERLANG_OTP_RELEASE "$release" +#define ERLANG_OTP_CORRECTION_PACKAGE "$correction_package" #define ERLANG_VERSION "$version" #define ERLANG_COMPILE_DATE "$time_str" #define ERLANG_ARCHITECTURE "$architecture" diff --git a/erts/epmd/src/epmd_cli.c b/erts/epmd/src/epmd_cli.c index 8817bde8d7..bd30bc35d9 100644 --- a/erts/epmd/src/epmd_cli.c +++ b/erts/epmd/src/epmd_cli.c @@ -118,7 +118,7 @@ void epmd_call(EpmdVars *g,int what) if (!g->silent) { rval = erts_snprintf(buf, OUTBUF_SIZE, "epmd: up and running on port %d with data:\n", j); - write(1, buf, rval); + fwrite(buf, 1, rval, stdout); } while(1) { if ((rval = read(fd,buf,OUTBUF_SIZE)) <= 0) { @@ -126,7 +126,7 @@ void epmd_call(EpmdVars *g,int what) epmd_cleanup_exit(g,0); } if (!g->silent) - write(1, buf, rval); /* Potentially UTF-8 encoded */ + fwrite(buf, 1, rval, stdout); /* Potentially UTF-8 encoded */ } } diff --git a/erts/epmd/test/epmd_SUITE.erl b/erts/epmd/test/epmd_SUITE.erl index cc24a556a3..a752abf33b 100644 --- a/erts/epmd/test/epmd_SUITE.erl +++ b/erts/epmd/test/epmd_SUITE.erl @@ -69,6 +69,8 @@ returns_valid_empty_extra/1, returns_valid_populated_extra_with_nulls/1, + names_stdout/1, + buffer_overrun_1/1, buffer_overrun_2/1, no_nonlocal_register/1, @@ -118,6 +120,7 @@ all() -> too_large, alive_req_too_small_1, alive_req_too_small_2, alive_req_too_large, returns_valid_empty_extra, returns_valid_populated_extra_with_nulls, + names_stdout, {group, buffer_overrun}, no_nonlocal_register, no_nonlocal_kill, no_live_killing]. @@ -759,6 +762,24 @@ returns_valid_populated_extra_with_nulls(Config) when is_list(Config) -> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +names_stdout(doc) -> + ["Test that epmd -names prints registered nodes to stdout"]; +names_stdout(suite) -> + []; +names_stdout(Config) when is_list(Config) -> + ?line ok = epmdrun(), + ?line {ok,Sock} = register_node("foobar"), + ?line ok = epmdrun("-names"), + ?line {ok, Data} = receive {_Port, {data, D}} -> {ok, D} + after 10000 -> {error, timeout} + end, + ?line {match,_} = re:run(Data, "^epmd: up and running", [multiline]), + ?line {match,_} = re:run(Data, "^name foobar at port", [multiline]), + ?line ok = close(Sock), + ok. 
+ +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + buffer_overrun_1(suite) -> []; buffer_overrun_1(doc) -> @@ -968,7 +989,7 @@ epmdrun(Epmd,Args0) -> O -> " "++O end, - osrun("\"" ++ Epmd ++ "\"" ++ Args ++ " " ?EPMDARGS " -port " ++ integer_to_list(?PORT)). + osrun("\"" ++ Epmd ++ "\"" ++ " " ?EPMDARGS " -port " ++ integer_to_list(?PORT) ++ Args). %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% diff --git a/erts/etc/common/erlexec.c b/erts/etc/common/erlexec.c index 1d7811d570..c30203c632 100644 --- a/erts/etc/common/erlexec.c +++ b/erts/etc/common/erlexec.c @@ -1972,35 +1972,8 @@ get_file_args(char *filename, argv_buf *abp, argv_buf *xabp) } static void -write_erl_otp_flags(char *bufp) -{ - /* ERL_OTP<MAJOR-VSN>_FLAGS */ - int ix = 0; - char *otp_p; - char otp[] = OTP_SYSTEM_VERSION; - - bufp[ix++] = 'E'; - bufp[ix++] = 'R'; - bufp[ix++] = 'L'; - bufp[ix++] = '_'; - bufp[ix++] = 'O'; - bufp[ix++] = 'T'; - bufp[ix++] = 'P'; - for (otp_p = &otp[0]; '0' <= *otp_p && *otp_p <= '9'; otp_p++) - bufp[ix++] = *otp_p; - bufp[ix++] = '_'; - bufp[ix++] = 'F'; - bufp[ix++] = 'L'; - bufp[ix++] = 'A'; - bufp[ix++] = 'G'; - bufp[ix++] = 'S'; - bufp[ix] = '\0'; -} - -static void initial_argv_massage(int *argc, char ***argv) { - char erl_otp_flags_buf[] = "ERL_OTP" OTP_SYSTEM_VERSION "_FLAGS"; argv_buf ab = {0}, xab = {0}; int ix, vix, ac; char **av; @@ -2016,8 +1989,7 @@ initial_argv_massage(int *argc, char ***argv) vix = 0; - write_erl_otp_flags(erl_otp_flags_buf); - av = build_args_from_env(erl_otp_flags_buf); + av = build_args_from_env("ERL_OTP" OTP_SYSTEM_VERSION "_FLAGS"); if (av) avv[vix++].argv = av; diff --git a/erts/etc/unix/etp-commands.in b/erts/etc/unix/etp-commands.in index 73887931cc..8520d58f47 100644 --- a/erts/etc/unix/etp-commands.in +++ b/erts/etc/unix/etp-commands.in @@ -652,7 +652,7 @@ end define etp-ct-atom-1 # Args: int # -# Determines if integer is a atom first character +# Determines if integer is an atom first character # # Non-reentrant # Returns: $etp_ct_atom @@ -1278,6 +1278,250 @@ document etpf-stackdump %--------------------------------------------------------------------------- end +define etp-heapdump +# Args: Process* +# +# Non-reentrant + etp-heapdump-1 ($arg0)->heap ($arg0)->htop +end + +document etp-heapdump +%--------------------------------------------------------------------------- +% etp-heapdump Process* +% +% Take a Process* and print a heapdump for the process heap. +%--------------------------------------------------------------------------- +end + +define etp-heapdump-old +# Args: Process* +# +# Non-reentrant + etp-heapdump-1 ($arg0)->old_heap ($arg0)->old_htop +end + +document etp-heapdump-old +%--------------------------------------------------------------------------- +% etp-heapdump-old Process* +% +% Take a Process* and print a heapdump for the process old heap (gen-heap).
+%--------------------------------------------------------------------------- +end + + +define etp-heapdump-1 +# Args: Eterm* heap, Eterm* htop +# +# Non-reentrant + set $etp_heapdump_heap = (Eterm*)($arg0) + set $etp_heapdump_p = (Eterm*)($arg0) + set $etp_heapdump_end = (Eterm*)($arg1) + set $etp_heapdump_skips = 0 + printf "%% heapdump (%u):\n", $etp_heapdump_end-$etp_heapdump_p + while $etp_heapdump_p < $etp_heapdump_end + set $etp_heapdump_ix = 0 + printf " %p: ", $etp_heapdump_p + while $etp_heapdump_p < $etp_heapdump_end && $etp_heapdump_ix < 8 + if ($etp_heapdump_skips > 0) + printf "| 0x%08x ", ($etp_heapdump_p) + set $etp_heapdump_skips-- + else + etp-term-dump $etp_heapdump_p[0] + end + set $etp_heapdump_p++ + set $etp_heapdump_ix++ + end + printf "\n" + end +end + + +define etp-term-dump +# Args: Eterm + if (($arg0) & 0x3) == 0 + etp-term-dump-header ($arg0) + else + if (($arg0) & 0x3) == 1 + # Cons pointer + set $etp_term_dump_cons_p = ((Eterm*)(($arg0) & ~0x3)) + if $etp_term_dump_cons_p > $etp_heapdump_heap && $etp_term_dump_cons_p < $etp_heapdump_end + printf "| C:0x%08x ", $etp_term_dump_cons_p + #printf "| C: --> %5d ", $etp_heapdump_p - $etp_term_dump_cons_p - 1 + else + printf "| C:0x%08x ", $etp_term_dump_cons_p + end + else + if (($arg0) & 0x3) == 2 + # Box pointer + printf "| B:0x%08x ", ($arg0) + else + if (($arg0) & 0x3) == 3 + # immediate + etp-term-dump-immediate ($arg0) + else + printf "| U:0x%08x ", ($arg0) + end + end + end + end +end + +define etp-term-dump-immediate +# Args: immediate term + if (($arg0) & 0xF) == 0xf + # Fixnum + etp-ct-printable-1 ((long)((Sint)($arg0)>>4)) + if $etp_ct_printable + if $etp_ct_printable < 0 + printf "| I: %c (%3ld) ", (long)((Sint)($arg0)>>4), (long)((Sint)($arg0)>>4) + else + printf "| I: \\%c (%3ld) ", (long)((Sint)($arg0)>>4), (long)((Sint)($arg0)>>4) + end + else + printf "| I:%10ld ", (long)((Sint)($arg0)>>4) + end + else + if (($arg0) & 0xF) == 0x3 + etp-term-dump-pid ($arg0) + else + if (($arg0) & 0xF) == 0x7 + printf "| port:0x%05x ", ($arg0) + else + # Immediate2 - 0xB + if (($arg0) & 0x3f) == 0x0b + etp-term-dump-atom ($arg0) + else + if (($arg0) & 0x3f) == 0x1b + printf "| #Catch<%06d> ", ($arg0)>>6 + else + if (($arg0) == $etp_nil) + printf "| [] (NIL) " + else + printf "| I:0x%08x ", ($arg0) + end + end + end + end + end + end +end + +define etp-term-dump-atom +# Args: atom term + set $etp_atom_1_ap = (Atom*)erts_atom_table.seg_table[(Eterm)($arg0)>>16][((Eterm)($arg0)>>6)&0x3FF] + set $etp_atom_1_i = ($etp_atom_1_ap)->len + set $etp_atom_1_p = ($etp_atom_1_ap)->name + set $etp_atom_1_quote = 1 + set $etp_atom_indent = 13 + + if ($etp_atom_1_i < 11) + if ($etp_atom_1_i > 0) + etp-ct-atom-1 (*$etp_atom_1_p) + if $etp_ct_atom + set $etp_atom_indent = 13 + else + set $etp_atom_indent = 11 + end + end + # perform indentation + printf "|" + while ($etp_atom_1_i < $etp_atom_indent) + printf " " + set $etp_atom_1_i++ + end + set $etp_atom_1_i = ($etp_atom_1_ap)->len + # Check if atom has to be quoted + if ($etp_atom_1_i > 0) + etp-ct-atom-1 (*$etp_atom_1_p) + if $etp_ct_atom + # Atom start character + set $etp_atom_1_p++ + set $etp_atom_1_i-- + set $etp_atom_1_quote = 0 + else + set $etp_atom_1_i = 0 + end + end + while $etp_atom_1_i > 0 + etp-ct-name-1 (*$etp_atom_1_p) + if $etp_ct_name + # Name character + set $etp_atom_1_p++ + set $etp_atom_1_i-- + else + set $etp_atom_1_quote = 1 + set $etp_atom_1_i = 0 + end + end + # Print the atom + if $etp_atom_1_quote + printf "'" + end + set $etp_atom_1_i = 
($etp_atom_1_ap)->len + set $etp_atom_1_p = ($etp_atom_1_ap)->name + while $etp_atom_1_i > 0 + etp-char-1 (*$etp_atom_1_p) '\'' + set $etp_atom_1_p++ + set $etp_atom_1_i-- + end + if $etp_atom_1_quote + printf "'" + end + printf " " + else + printf "| A:0x%08x ", ($arg0) + end +end + +define etp-term-dump-pid +# Args: Eterm pid +# +# Non-reentrant +# + set $etp_pid_1 = (Eterm)($arg0) + if ($etp_pid_1 & 0xF) == 0x3 + if (etp_arch_bits == 64 && etp_halfword == 0) + if (etp_big_endian) + set $etp_pid_data = (unsigned) ((((Uint64) $etp_pid_1) >> 36) & 0x0fffffff) + else + set $etp_pid_data = (unsigned) ((((Uint64) $etp_pid_1) >> 4) & 0x0fffffff) + end + else + set $etp_pid_data = (unsigned) (((((Uint32) $etp_pid_1) >> 4) & ~erts_proc.r.o.pix_mask) | ((((Uint32) $etp_pid_1) >> (erts_proc.r.o.pix_cl_shift + 4)) & erts_proc.r.o.pix_cl_mask) | (((((Uint32) $etp_pid_1) >> 4) & erts_proc.r.o.pix_cli_mask) << erts_proc.r.o.pix_cli_shift)) + end + # Internal pid + printf "| <0.%04u.%03u> ", $etp_pid_data & 0x7fff, ($etp_pid_data >> 15) & 0x1fff + else + printf "| #NotPid<%#x> ", ($arg0) + end +end + +define etp-term-dump-header +# Args: Header term + if (($arg0) & 0x3f) == 0 + printf "| H:%4d-tuple ", ($arg0) >> 6 + else + set $etp_heapdump_skips = ($arg0) >> 6 + if ((($arg0) & 0x3f) == 0x18) + printf "| H: float %3d ", ($arg0) >> 6 + else + if ((($arg0) & 0x3f) == 0x28) + # sub-binary + printf "| H: sub-bin " + else + if ((($arg0) & 0x3f) == 0x8) + # pos-bignum + printf "| H:bignum %3u ", ($arg0) >> 6 + else + printf "| header %5d ", ($arg0) >> 6 + end + end + end + end +end + + + define etp-pid2pix-1 # Args: Eterm # @@ -1445,7 +1689,7 @@ define etp-process-info # Args: Process* # printf " Pid: " - etp-1 $arg0->common.id + etp-1 ($arg0)->common.id printf "\n State: " etp-proc-state $arg0 if $proxy_process != 0 @@ -1523,11 +1767,104 @@ end document etp-processes %--------------------------------------------------------------------------- % etp-processes -% +% % Print misc info about all processes %--------------------------------------------------------------------------- end +define etp-processes-memory + if (!erts_initialized) + printf "No processes, since system isn't initialized!\n" + else + set $proc_ix = 0 + printf "--- (%ld processes in wheel)\n", erts_proc.r.o.max + while $proc_ix < erts_proc.r.o.max + set $proc = (Process *) *((UWord *) &erts_proc.r.o.tab[$proc_ix]) + if ($proc != ((Process *) 0) && $proc != &erts_invalid_process) + etp-process-memory-info $proc + end + set $proc_ix++ + end + printf "---\n", + end +end + +document etp-processes-memory +%--------------------------------------------------------------------------- +% etp-processes-memory +% +% Print memory info about all processes +%--------------------------------------------------------------------------- +end + +define etp-process-memory-info +# Args: Process* +# + if ((*(((Uint32 *) &(((Process *) $arg0)->state)))) & 0x400000) + set $proxy_process = 1 + else + set $proxy_process = 0 + end + printf " " + etp-1 $arg0->common.id + printf ": (Process *) %p ", $arg0 + if $proxy_process != 0 + printf "(Process *) %p ", $arg0 + printf " *** PROXY process struct *** refer to next: \n" + etp-pid2proc-1 $arg0->common.id + printf " -" + etp-process-memory-info $proc + else + printf " [Heap: %5ld", $arg0->heap_sz + if ($arg0->old_heap) + printf " | %5ld", $arg0->old_hend - $arg0->old_heap + else + printf " | none " + end + printf "] [Mbuf: %5ld", $arg0->mbuf_sz + if (etp_smp_compiled) + printf " | %3ld (%3ld | %3ld)", ($arg0->msg.len + 
$arg0->msg_inq.len), $arg0->msg.len, $arg0->msg_inq.len + else + printf " | %3ld", $arg0->msg.len + end + printf "] " + if ($arg0->i) + printf " I: " + etp-cp-1 $arg0->i + printf " " + end + + if ($arg0->current) + etp-1 $arg0->current[0] + printf ":" + etp-1 $arg0->current[1] + printf "/%d ", $arg0->current[2] + end + + if (*(((Uint32 *) &(((Process *) $arg0)->state))) & 0x4) == 0 + if ($arg0->common.u.alive.reg) + etp-1 $arg0->common.u.alive.reg->name + printf " " + end + end + + if ($arg0->cp) + printf " CP: " + etp-cp-1 $arg0->cp + printf " " + end + printf "\n" + end +end + +document etp-process-memory-info +%--------------------------------------------------------------------------- +% etp-process-memory-info Process* +% +% Print memory info about process +%--------------------------------------------------------------------------- +end + define etp-port-id2pix-1 # Args: Eterm # diff --git a/erts/etc/win32/erlang.ico b/erts/etc/win32/erlang.ico Binary files differindex cee8b58af9..7b62d31aa9 100644 --- a/erts/etc/win32/erlang.ico +++ b/erts/etc/win32/erlang.ico diff --git a/erts/preloaded/ebin/erlang.beam b/erts/preloaded/ebin/erlang.beam Binary files differindex 73fac27161..3c77d6ae0f 100644 --- a/erts/preloaded/ebin/erlang.beam +++ b/erts/preloaded/ebin/erlang.beam diff --git a/erts/preloaded/src/erlang.erl b/erts/preloaded/src/erlang.erl index 0ed677c3d8..f99d5bfdd0 100644 --- a/erts/preloaded/src/erlang.erl +++ b/erts/preloaded/src/erlang.erl @@ -2246,6 +2246,7 @@ tuple_to_list(_Tuple) -> (modified_timing_level) -> integer() | undefined; (multi_scheduling) -> disabled | blocked | enabled; (multi_scheduling_blockers) -> [PID :: pid()]; + (otp_correction_package) -> string(); (otp_release) -> string(); (port_count) -> non_neg_integer(); (port_limit) -> pos_integer(); diff --git a/erts/vsn.mk b/erts/vsn.mk index 30aa870144..8e77a9a26e 100644 --- a/erts/vsn.mk +++ b/erts/vsn.mk @@ -18,7 +18,11 @@ # VSN = 6.0 -SYSTEM_VSN = 17.0-rc0 + +# OTP major version +SYSTEM_VSN = 17 +# OTP correction package version +SYSTEM_CP_VSN = 17.0-rc0 # Port number 4365 in 4.2 # Port number 4366 in 4.3 |
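
As a minimal illustration of the `global` versus `local` call tracing discussed in the "Global Tracing" notes earlier in this diff: the sketch below is not part of the patch; the module name `trace_sketch` and the choice of `lists:seq/2` as the traced function are made up for the example, and only the standard `erlang:trace/3` and `erlang:trace_pattern/3` BIFs are used.

    %% Illustrative sketch only -- not part of this diff.
    -module(trace_sketch).
    -export([demo/0]).

    demo() ->
        Tracer = spawn(fun loop/0),
        erlang:trace(self(), true, [call, {tracer, Tracer}]),
        %% `global`: only external calls made through the export entry
        %% (Mod:Fun(...) or apply/3) hit the breakpoint placed there.
        erlang:trace_pattern({lists, seq, 2}, true, [global]),
        %% `local`: the breakpoint sits in the loaded code itself, so
        %% module-internal calls to the function are traced as well.
        erlang:trace_pattern({lists, seq, 2}, true, [local]),
        lists:seq(1, 3).

    loop() ->
        receive Msg -> io:format("trace: ~p~n", [Msg]), loop() end.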