Diffstat (limited to 'erts/emulator')
-rw-r--r--  erts/emulator/Makefile.in | 2
-rw-r--r--  erts/emulator/beam/beam_emu.c | 20
-rw-r--r--  erts/emulator/beam/bif.c | 4
-rw-r--r--  erts/emulator/beam/erl_alloc.c | 6
-rw-r--r--  erts/emulator/beam/erl_alloc.types | 4
-rwxr-xr-x  erts/emulator/beam/erl_bif_info.c | 25
-rw-r--r--  erts/emulator/beam/erl_gc.c | 18
-rw-r--r--  erts/emulator/beam/erl_init.c | 34
-rw-r--r--  erts/emulator/beam/erl_port_task.c | 5
-rw-r--r--  erts/emulator/beam/erl_process.c | 452
-rw-r--r--  erts/emulator/beam/erl_process.h | 115
-rw-r--r--  erts/emulator/beam/external.c | 207
-rwxr-xr-x  erts/emulator/beam/global.h | 346
-rw-r--r--  erts/emulator/beam/utils.c | 48
-rw-r--r--  erts/emulator/drivers/common/efile_drv.c | 15
-rw-r--r--  erts/emulator/drivers/common/gzio.c | 53
-rw-r--r--  erts/emulator/drivers/common/gzio.h | 16
-rw-r--r--  erts/emulator/drivers/common/gzio_zutil.h | 9
-rw-r--r--  erts/emulator/drivers/common/inet_drv.c | 23
-rw-r--r--  erts/emulator/internal_doc/CarrierMigration.md | 201
-rw-r--r--  erts/emulator/internal_doc/CodeLoading.md | 186
-rw-r--r--  erts/emulator/internal_doc/DelayedDealloc.md | 175
-rw-r--r--  erts/emulator/internal_doc/PTables.md | 356
-rw-r--r--  erts/emulator/internal_doc/PortSignals.md | 267
-rw-r--r--  erts/emulator/internal_doc/ProcessManagementOptimizations.md | 172
-rw-r--r--  erts/emulator/internal_doc/ThreadProgress.md | 308
-rw-r--r--  erts/emulator/internal_doc/Tracing.md | 220
-rw-r--r--  erts/emulator/sys/unix/sys.c | 19
-rw-r--r--  erts/emulator/test/binary_SUITE.erl | 91
-rw-r--r--  erts/emulator/test/driver_SUITE.erl | 68
-rw-r--r--  erts/emulator/test/exception_SUITE.erl | 11
-rw-r--r--  erts/emulator/test/scheduler_SUITE.erl | 2
-rwxr-xr-x  erts/emulator/utils/make_version | 4
-rw-r--r--  erts/emulator/zlib/adler32.c | 104
-rw-r--r--  erts/emulator/zlib/compress.c | 9
-rw-r--r--  erts/emulator/zlib/crc32.c | 121
-rw-r--r--  erts/emulator/zlib/crc32.h | 4
-rw-r--r--  erts/emulator/zlib/deflate.c | 521
-rw-r--r--  erts/emulator/zlib/deflate.h | 47
-rw-r--r--  erts/emulator/zlib/example.c | 570
-rw-r--r--  erts/emulator/zlib/gzguts.h | 209
-rw-r--r--  erts/emulator/zlib/inffast.c | 86
-rw-r--r--  erts/emulator/zlib/inffast.h | 6
-rw-r--r--  erts/emulator/zlib/inffixed.h | 6
-rw-r--r--  erts/emulator/zlib/inflate.c | 418
-rw-r--r--  erts/emulator/zlib/inflate.h | 33
-rw-r--r--  erts/emulator/zlib/inftrees.c | 95
-rw-r--r--  erts/emulator/zlib/inftrees.h | 29
-rw-r--r--  erts/emulator/zlib/trees.c | 147
-rw-r--r--  erts/emulator/zlib/trees.h | 4
-rw-r--r--  erts/emulator/zlib/uncompr.c | 8
-rw-r--r--  erts/emulator/zlib/zconf.h | 291
-rw-r--r--  erts/emulator/zlib/zlib.h | 1379
-rw-r--r--  erts/emulator/zlib/zutil.c | 68
-rw-r--r--  erts/emulator/zlib/zutil.h | 145
55 files changed, 5379 insertions, 2403 deletions
diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in
index 5638683f88..b270099566 100644
--- a/erts/emulator/Makefile.in
+++ b/erts/emulator/Makefile.in
@@ -575,7 +575,7 @@ GENERATE += $(TTF_DIR)/erl_alloc_types.h
# version include file
$(TARGET)/erl_version.h: ../vsn.mk
- $(gen_verbose)LANG=C $(PERL) utils/make_version -o $@ $(SYSTEM_VSN) $(VSN)$(SERIALNO) $(TARGET)
+ $(gen_verbose)LANG=C $(PERL) utils/make_version -o $@ $(SYSTEM_VSN) $(SYSTEM_CP_VSN) $(VSN)$(SERIALNO) $(TARGET)
GENERATE += $(TARGET)/erl_version.h
# driver table
diff --git a/erts/emulator/beam/beam_emu.c b/erts/emulator/beam/beam_emu.c
index 78ab6fa30f..b413f0e859 100644
--- a/erts/emulator/beam/beam_emu.c
+++ b/erts/emulator/beam/beam_emu.c
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 1996-2013. All Rights Reserved.
+ * Copyright Ericsson AB 1996-2014. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
@@ -48,7 +48,7 @@
# define OpCase(OpCode) case op_##OpCode
# define CountCase(OpCode) case op_count_##OpCode
# define OpCode(OpCode) ((Uint*)op_##OpCode)
-# define Goto(Rel) {Go = (int)(Rel); goto emulator_loop;}
+# define Goto(Rel) {Go = (int)(UWord)(Rel); goto emulator_loop;}
# define LabelAddr(Addr) &&##Addr
#else
# define OpCase(OpCode) lb_##OpCode
@@ -133,7 +133,7 @@ do { \
/* We don't check the range if an ordinary switch is used */
#ifdef NO_JUMP_TABLE
-#define VALID_INSTR(IP) (0 <= (int)(IP) && ((int)(IP) < (NUMBER_OF_OPCODES*2+10)))
+#define VALID_INSTR(IP) ((UWord)(IP) < (NUMBER_OF_OPCODES*2+10))
#else
#define VALID_INSTR(IP) \
((SWord)LabelAddr(emulator_loop) <= (SWord)(IP) && \
@@ -4326,7 +4326,19 @@ void process_main(void)
flags = Arg(2);
BsGetFieldSize(tmp_arg2, (flags >> 3), ClauseFail(), size);
if (size >= SMALL_BITS) {
- Uint wordsneeded = 1+WSIZE(NBYTES((Uint) size));
+ Uint wordsneeded;
+ /* Check the bit size before a potential GC.
+ * We do not want to GC and then realize we don't need
+ * the allocated space (i.e. if the op fails).
+ *
+ * Remember to reacquire the matchbuffer after a GC.
+ */
+
+ mb = ms_matchbuffer(tmp_arg1);
+ if (mb->size - mb->offset < size) {
+ ClauseFail();
+ }
+ wordsneeded = 1+WSIZE(NBYTES((Uint) size));
TestHeapPreserve(wordsneeded, Arg(1), tmp_arg1);
}
mb = ms_matchbuffer(tmp_arg1);
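
The bs_get_integer hunk above moves the "enough bits left?" check in front of TestHeapPreserve, so a match that is going to fail never forces a garbage collection whose allocation would immediately be discarded; the match buffer is re-fetched afterwards because a GC may move it. A minimal sketch of that ordering, with hypothetical names rather than the emulator's macros:

```c
#include <stddef.h>

/* Sketch of the "validate before allocating" ordering introduced above.
 * Types and names here are illustrative, not the emulator's real API. */
typedef struct { size_t size, offset; } MatchBuf;

static int get_bits(MatchBuf *mb, size_t bits_needed,
                    int (*ensure_heap)(size_t words), size_t words_needed)
{
    if (mb->size - mb->offset < bits_needed)
        return 0;                    /* fail first: no GC, no wasted heap */
    if (!ensure_heap(words_needed))  /* may GC; in the emulator the match
                                        buffer must be re-fetched after this */
        return 0;
    mb->offset += bits_needed;       /* commit the match */
    return 1;
}
```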
diff --git a/erts/emulator/beam/bif.c b/erts/emulator/beam/bif.c
index 96666d98ed..61c1abedb5 100644
--- a/erts/emulator/beam/bif.c
+++ b/erts/emulator/beam/bif.c
@@ -4488,7 +4488,7 @@ BIF_RETTYPE system_flag_2(BIF_ALIST_2)
BIF_P->group_leader,
"A call to erlang:system_flag(cpu_topology, _) was made.\n"
"The cpu_topology argument is deprecated and scheduled\n"
- "for removal in erts-5.10/OTP-R16. For more information\n"
+ "for removal in Erlang/OTP 18. For more information\n"
"see the erlang:system_flag/2 documentation.\n");
BIF_TRAP1(set_cpu_topology_trap, BIF_P, BIF_ARG_2);
} else if (ERTS_IS_ATOM_STR("scheduler_bind_type", BIF_ARG_1)) {
@@ -4496,7 +4496,7 @@ BIF_RETTYPE system_flag_2(BIF_ALIST_2)
BIF_P->group_leader,
"A call to erlang:system_flag(scheduler_bind_type, _) was\n"
"made. The scheduler_bind_type argument is deprecated and\n"
- "scheduled for removal in erts-5.10/OTP-R16. For more\n"
+ "scheduled for removal in Erlang/OTP 18. For more\n"
"information see the erlang:system_flag/2 documentation.\n");
return erts_bind_schedulers(BIF_P, BIF_ARG_2);
}
diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c
index b5ba9bb94a..8094c6ee2e 100644
--- a/erts/emulator/beam/erl_alloc.c
+++ b/erts/emulator/beam/erl_alloc.c
@@ -75,9 +75,9 @@
#define ERTS_ALC_DEFAULT_ENABLED_ACUL_EHEAP_ALLOC 45
#define ERTS_ALC_DEFAULT_ENABLED_ACUL_LL_ALLOC 85
-#define ERTS_ALC_DEFAULT_ACUL 0
-#define ERTS_ALC_DEFAULT_ACUL_EHEAP_ALLOC 0
-#define ERTS_ALC_DEFAULT_ACUL_LL_ALLOC 0
+#define ERTS_ALC_DEFAULT_ACUL ERTS_ALC_DEFAULT_ENABLED_ACUL
+#define ERTS_ALC_DEFAULT_ACUL_EHEAP_ALLOC ERTS_ALC_DEFAULT_ENABLED_ACUL_EHEAP_ALLOC
+#define ERTS_ALC_DEFAULT_ACUL_LL_ALLOC ERTS_ALC_DEFAULT_ENABLED_ACUL_LL_ALLOC
#ifndef ERTS_SMP
# undef ERTS_ALC_DEFAULT_ACUL
diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index 32308fae9b..b4e52770e3 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -1,7 +1,7 @@
#
# %CopyrightBegin%
#
-# Copyright Ericsson AB 2003-2013. All Rights Reserved.
+# Copyright Ericsson AB 2003-2014. All Rights Reserved.
#
# The contents of this file are subject to the Erlang Public License,
# Version 1.1, (the "License"); you may not use this file except in
@@ -150,7 +150,7 @@ type LINK_LH STANDARD PROCESSES link_lh
type SUSPEND_MON STANDARD PROCESSES suspend_monitor
type PEND_SUSPEND SHORT_LIVED PROCESSES pending_suspend
type PROC_LIST SHORT_LIVED PROCESSES proc_list
-type EXTRA_ROOT SHORT_LIVED PROCESSES extra_root
+type SAVED_ESTACK SHORT_LIVED PROCESSES saved_estack
type FUN_ENTRY LONG_LIVED CODE fun_entry
type ATOM_TXT LONG_LIVED ATOM atom_text
type BEAM_REGISTER EHEAP PROCESSES beam_register
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index 414ae2f046..e0b654cb22 100755
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -64,8 +64,10 @@ static Export *gather_gc_info_res_trap;
#define DECL_AM(S) Eterm AM_ ## S = am_atom_put(#S, sizeof(#S) - 1)
+static char otp_correction_package[] = ERLANG_OTP_CORRECTION_PACKAGE;
/* Keep erts_system_version as a global variable for easy access from a core */
static char erts_system_version[] = ("Erlang/OTP " ERLANG_OTP_RELEASE
+ "%s"
" [erts-" ERLANG_VERSION "]"
#if !HEAP_ON_C_STACK && !HALFWORD_HEAP
" [no-c-stack-objects]"
@@ -304,11 +306,28 @@ make_link_list(Process *p, ErtsLink *root, Eterm tail)
int
erts_print_system_version(int to, void *arg, Process *c_p)
{
+ int i, rc = -1;
+ char *rc_str = "";
+ char rc_buf[100];
+ char *ocp = otp_correction_package;
#ifdef ERTS_SMP
Uint total, online, active;
(void) erts_schedulers_state(&total, &online, &active, 0);
#endif
- return erts_print(to, arg, erts_system_version
+ for (i = 0; i < sizeof(otp_correction_package)-4; i++) {
+ if (ocp[i] == '-' && ocp[i+1] == 'r' && ocp[i+2] == 'c')
+ rc = atoi(&ocp[i+3]);
+ }
+ if (rc >= 0) {
+ if (rc == 0)
+ rc_str = " [DEVELOPMENT]";
+ else {
+ erts_snprintf(rc_buf, sizeof(rc_buf), " [RELEASE CANDIDATE %d]", rc);
+ rc_str = rc_buf;
+ }
+ }
+ return erts_print(to, arg, erts_system_version,
+ rc_str
#ifdef ERTS_SMP
, total, online
#endif
@@ -2417,6 +2436,10 @@ BIF_RETTYPE system_info_1(BIF_ALIST_1)
DECL_AM(unknown);
BIF_RET(AM_unknown);
}
+ } else if (ERTS_IS_ATOM_STR("otp_correction_package", BIF_ARG_1)) {
+ int n = sizeof(ERLANG_OTP_CORRECTION_PACKAGE)-1;
+ hp = HAlloc(BIF_P, 2*n);
+ BIF_RET(buf_to_intlist(&hp, ERLANG_OTP_CORRECTION_PACKAGE, n, NIL));
} else if (ERTS_IS_ATOM_STR("otp_release", BIF_ARG_1)) {
int n = sizeof(ERLANG_OTP_RELEASE)-1;
hp = HAlloc(BIF_P, 2*n);
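
erts_print_system_version() above scans the correction-package string for a "-rcN" suffix and appends " [DEVELOPMENT]" for rc0 or " [RELEASE CANDIDATE N]" otherwise. A self-contained sketch of the same scan; the helper function and the sample version string are hypothetical:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative re-implementation of the "-rcN" suffix scan above;
 * not the emulator code itself. */
static const char *release_candidate_banner(const char *ocp, char *buf, size_t bufsz)
{
    int rc = -1;
    size_t i, len = strlen(ocp);

    for (i = 0; i + 3 < len; i++)        /* need room for "-rc" plus a digit */
        if (ocp[i] == '-' && ocp[i+1] == 'r' && ocp[i+2] == 'c')
            rc = atoi(&ocp[i+3]);

    if (rc < 0)
        return "";                        /* plain release: no suffix */
    if (rc == 0)
        return " [DEVELOPMENT]";
    snprintf(buf, bufsz, " [RELEASE CANDIDATE %d]", rc);
    return buf;
}

int main(void)
{
    char buf[64];
    printf("%s\n", release_candidate_banner("17.0-rc2", buf, sizeof buf));
    return 0;
}
```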
diff --git a/erts/emulator/beam/erl_gc.c b/erts/emulator/beam/erl_gc.c
index fd86c658d6..ab8448e8a1 100644
--- a/erts/emulator/beam/erl_gc.c
+++ b/erts/emulator/beam/erl_gc.c
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 2002-2013. All Rights Reserved.
+ * Copyright Ericsson AB 2002-2014. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
@@ -1975,17 +1975,6 @@ setup_rootset(Process *p, Eterm *objv, int nobj, Rootset *rootset)
++n;
}
- /*
- * A trapping BIF can add to rootset by setting the extra_root
- * in the process_structure.
- */
- if (p->extra_root != NULL) {
- roots[n].v = p->extra_root->objv;
- roots[n].sz = p->extra_root->sz;
- ++n;
- }
-
-
ASSERT((is_nil(p->seq_trace_token) ||
is_tuple(follow_moved(p->seq_trace_token)) ||
is_atom(p->seq_trace_token)));
@@ -2563,11 +2552,6 @@ offset_one_rootset(Process *p, Sint offs, char* area, Uint area_size,
p->dictionary->used,
offs, area, area_size);
}
- if (p->extra_root != NULL) {
- offset_heap_ptr(p->extra_root->objv,
- p->extra_root->sz,
- offs, area, area_size);
- }
offset_heap_ptr(&p->fvalue, 1, offs, area, area_size);
offset_heap_ptr(&p->ftrace, 1, offs, area, area_size);
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index 8c4fffa75b..19088fd913 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -537,6 +537,12 @@ void erts_usage(void)
erts_fprintf(stderr, " see the erl(1) documentation for more info.\n");
erts_fprintf(stderr, "-sct cput set cpu topology,\n");
erts_fprintf(stderr, " see the erl(1) documentation for more info.\n");
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT
+ erts_fprintf(stderr, "-sub bool enable/disable scheduler utilization balancing,\n");
+#else
+ erts_fprintf(stderr, "-sub false disable scheduler utilization balancing,\n");
+#endif
+ erts_fprintf(stderr, " see the erl(1) documentation for more info.\n");
erts_fprintf(stderr, "-sws val set scheduler wakeup strategy, valid values are:\n");
erts_fprintf(stderr, " default|legacy.\n");
erts_fprintf(stderr, "-swct val set scheduler wake cleanup threshold, valid values are:\n");
@@ -553,8 +559,8 @@ void erts_usage(void)
erts_fprintf(stderr, " numbers is %d\n",
ERTS_MAX_NO_OF_SCHEDULERS);
erts_fprintf(stderr, "-SP p1:p2 specify schedulers (p1) and schedulers online (p2)\n");
- erts_fprintf(stderr, " as percentages of logical processors configured and logical\n");
- erts_fprintf(stderr, " processors available, respectively\n");
+ erts_fprintf(stderr, " as percentages of logical processors configured and logical\n");
+ erts_fprintf(stderr, " processors available, respectively\n");
erts_fprintf(stderr, "-t size set the maximum number of atoms the "
"emulator can handle\n");
erts_fprintf(stderr, " valid range is [%d-%d]\n",
@@ -1433,8 +1439,10 @@ erl_start(int argc, char **argv)
}
else if (has_prefix("cl", sub_param)) {
arg = get_arg(sub_param+2, argv[i+1], &i);
- if (sys_strcmp("true", arg) == 0)
+ if (sys_strcmp("true", arg) == 0) {
erts_sched_compact_load = 1;
+ erts_sched_balance_util = 0;
+ }
else if (sys_strcmp("false", arg) == 0)
erts_sched_compact_load = 0;
else {
@@ -1512,6 +1520,26 @@ erl_start(int argc, char **argv)
erts_usage();
}
}
+ else if (has_prefix("ub", sub_param)) {
+ arg = get_arg(sub_param+2, argv[i+1], &i);
+ if (sys_strcmp("true", arg) == 0) {
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT
+ erts_sched_balance_util = 1;
+#else
+ erts_fprintf(stderr,
+ "scheduler utilization balancing not "
+ "supported on this system\n");
+ erts_usage();
+#endif
+ }
+ else if (sys_strcmp("false", arg) == 0)
+ erts_sched_balance_util = 0;
+ else {
+ erts_fprintf(stderr, "bad scheduler utilization balancing "
+ " value '%s'\n", arg);
+ erts_usage();
+ }
+ }
else if (has_prefix("wct", sub_param)) {
arg = get_arg(sub_param+3, argv[i+1], &i);
if (erts_sched_set_wake_cleanup_threshold(arg) != 0) {
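
The new "+sub" option above accepts "true" only when the emulator is built with utilization-balancing support, "+scl true" switches balancing off again, and erts_init_scheduling() (further down in erl_process.c) clears compact load when balancing is enabled. A compact sketch of that interplay; the parsing helper is hypothetical, only the two flag variables mirror the ones in the diff:

```c
#include <string.h>

static int sched_compact_load = 1;   /* +scl default: enabled  */
static int sched_balance_util = 0;   /* +sub default: disabled */

/* Sketch of how the two scheduler flags interact, per the hunks above.
 * have_util_balancing stands in for ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT. */
static int set_flag(const char *flag, const char *val, int have_util_balancing)
{
    int on = strcmp(val, "true") == 0;
    if (!on && strcmp(val, "false") != 0)
        return -1;                              /* bad boolean value */
    if (strcmp(flag, "scl") == 0) {
        sched_compact_load = on;
        if (on)
            sched_balance_util = 0;             /* +scl true turns +sub off */
    } else if (strcmp(flag, "sub") == 0) {
        if (on && !have_util_balancing)
            return -1;                          /* not supported on this system */
        sched_balance_util = on;
        /* erts_init_scheduling() later clears compact load when balancing is on */
    }
    return 0;
}
```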
diff --git a/erts/emulator/beam/erl_port_task.c b/erts/emulator/beam/erl_port_task.c
index 547a42beb2..d4108067d0 100644
--- a/erts/emulator/beam/erl_port_task.c
+++ b/erts/emulator/beam/erl_port_task.c
@@ -877,6 +877,11 @@ enqueue_port(ErtsRunQueue *runq, Port *pp)
ASSERT(runq->ports.start && runq->ports.end);
erts_smp_inc_runq_len(runq, &runq->ports.info, ERTS_PORT_PRIO_LEVEL);
+
+#ifdef ERTS_SMP
+ if (runq->halt_in_progress)
+ erts_non_empty_runq(runq);
+#endif
}
static ERTS_INLINE Port *
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 21fd8dd50a..74cd84a998 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 1996-2013. All Rights Reserved.
+ * Copyright Ericsson AB 1996-2014. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
@@ -144,6 +144,7 @@ extern BeamInstr beam_exit[];
extern BeamInstr beam_continue_exit[];
int erts_sched_compact_load;
+int erts_sched_balance_util = 0;
Uint erts_no_schedulers;
#define ERTS_THR_PRGR_LATER_CLEANUP_OP_THRESHOLD_VERY_LAZY (4*1024*1024)
@@ -608,6 +609,7 @@ erts_late_init_process(void)
static void
init_sched_wall_time(ErtsSchedWallTime *swtp)
{
+ swtp->need = erts_sched_balance_util;
swtp->enabled = 0;
swtp->start = 0;
swtp->working.total = 0;
@@ -630,27 +632,253 @@ sched_wall_time_ts(void)
#endif
}
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+
+#ifdef ARCH_64
+
+static ERTS_INLINE Uint64
+aschedtime_read(ErtsAtomicSchedTime *var)
+{
+ return (Uint64) erts_atomic_read_nob((erts_atomic_t *) var);
+}
+
+static ERTS_INLINE void
+aschedtime_set(ErtsAtomicSchedTime *var, Uint64 val)
+{
+ erts_atomic_set_nob((erts_atomic_t *) var, (erts_aint_t) val);
+}
+
+static ERTS_INLINE void
+aschedtime_init(ErtsAtomicSchedTime *var)
+{
+ erts_atomic_init_nob((erts_atomic_t *) var, (erts_aint_t) 0);
+}
+
+#elif defined(ARCH_32)
+
+static ERTS_INLINE Uint64
+aschedtime_read(ErtsAtomicSchedTime *var)
+{
+ erts_dw_aint_t dw;
+ erts_dw_atomic_read_nob((erts_dw_atomic_t *) var, &dw);
+#ifdef ETHR_SU_DW_NAINT_T__
+ return (Uint64) dw.dw_sint;
+#else
+ {
+ Uint64 res;
+ res = (Uint64) ((Uint32) dw.sint[ERTS_DW_AINT_HIGH_WORD]);
+ res <<= 32;
+ res |= (Uint64) ((Uint32) dw.sint[ERTS_DW_AINT_LOW_WORD]);
+ return res;
+ }
+#endif
+}
+
+static ERTS_INLINE void
+aschedtime_set(ErtsAtomicSchedTime *var, Uint64 val)
+{
+ erts_dw_aint_t dw;
+#ifdef ETHR_SU_DW_NAINT_T__
+ dw.dw_sint = (ETHR_SU_DW_NAINT_T__) val;
+#else
+ dw.sint[ERTS_DW_AINT_LOW_WORD] = (erts_aint_t) (val & 0xffffffff);
+ dw.sint[ERTS_DW_AINT_HIGH_WORD] = (erts_aint_t) ((val >> 32) & 0xffffffff);
+#endif
+ erts_dw_atomic_set_nob((erts_dw_atomic_t *) var, &dw);
+}
+
+static ERTS_INLINE void
+aschedtime_init(ErtsAtomicSchedTime *var)
+{
+ erts_dw_aint_t dw;
+ dw.sint[ERTS_DW_AINT_LOW_WORD] = (erts_aint_t) 0;
+ dw.sint[ERTS_DW_AINT_HIGH_WORD] = (erts_aint_t) 0;
+ erts_dw_atomic_init_nob((erts_dw_atomic_t *) var, &dw);
+}
+
+#else
+# error :-/
+#endif
+
+#define ERTS_GET_AVG_MAX_UNLOCKED_TRY 50
+#define ERTS_SCHED_AVG_UTIL_WRITE_MARKER (~((Uint64) 0))
+
+/* Intervals in nanoseconds */
+#define ERTS_SCHED_UTIL_SHORT_INTERVAL ((Uint64) 1*1000*1000*1000)
+#define ERTS_SCHED_UTIL_LONG_INTERVAL ((Uint64) 10*1000*1000*1000)
+
+
+#define ERTS_SCHED_UTIL_IGNORE_IMBALANCE_DIFF 5000 /* ppm */
+
+static ERTS_INLINE Uint64
+calc_sched_worktime(int is_working, Uint64 now, Uint64 last,
+ Uint64 interval, Uint64 old_worktime)
+{
+ Uint64 worktime;
+ Uint64 new;
+
+ if (now <= last)
+ return old_worktime;
+
+ new = now - last;
+
+ if (new >= interval)
+ return is_working ? interval : (Uint64) 0;
+
+
+ /*
+ * Division by 1000 in order to avoid
+ * overflow. If this is changed, update the
+ * assertions in init_runq_sched_util().
+ */
+ worktime = old_worktime;
+ worktime *= (interval - new)/1000;
+ worktime /= (interval/1000);
+ if (is_working)
+ worktime += new;
+
+ ASSERT(0 <= worktime && worktime <= interval);
+
+ return worktime;
+}
+
+static ERTS_INLINE void
+update_avg_sched_util(ErtsSchedulerData *esdp, Uint64 now, int is_working)
+{
+ ErtsRunQueue *rq;
+ int worked;
+ Uint64 swt, lwt, last;
+
+ rq = esdp->run_queue;
+ last = aschedtime_read(&rq->sched_util.last);
+
+ if (now <= last) {
+ ASSERT(last == ERTS_SCHED_AVG_UTIL_WRITE_MARKER);
+ return;
+ }
+
+ ASSERT(now >= last);
+
+ worked = rq->sched_util.is_working;
+
+ swt = calc_sched_worktime(worked, now, last, ERTS_SCHED_UTIL_SHORT_INTERVAL,
+ rq->sched_util.worktime.short_interval);
+ lwt = calc_sched_worktime(worked, now, last, ERTS_SCHED_UTIL_LONG_INTERVAL,
+ rq->sched_util.worktime.long_interval);
+
+ aschedtime_set(&rq->sched_util.last, ERTS_SCHED_AVG_UTIL_WRITE_MARKER);
+ ERTS_THR_WRITE_MEMORY_BARRIER;
+ rq->sched_util.is_working = is_working;
+ rq->sched_util.worktime.short_interval = swt;
+ rq->sched_util.worktime.long_interval = lwt;
+ ERTS_THR_WRITE_MEMORY_BARRIER;
+ aschedtime_set(&rq->sched_util.last, now);
+}
+
+int
+erts_get_sched_util(ErtsRunQueue *rq, int initially_locked, int short_interval)
+{
+ /* Average scheduler utilization in ppm */
+ int util, is_working, try = 0, locked = initially_locked;
+ Uint64 worktime, old_worktime, now, last, interval, *old_worktimep;
+
+ if (short_interval) {
+ old_worktimep = &rq->sched_util.worktime.short_interval;
+ interval = ERTS_SCHED_UTIL_SHORT_INTERVAL;
+ }
+ else {
+ old_worktimep = &rq->sched_util.worktime.long_interval;
+ interval = ERTS_SCHED_UTIL_LONG_INTERVAL;
+ }
+
+ while (1) {
+ Uint64 chk_last;
+ last = aschedtime_read(&rq->sched_util.last);
+ ERTS_THR_READ_MEMORY_BARRIER;
+ is_working = rq->sched_util.is_working;
+ old_worktime = *old_worktimep;
+ ERTS_THR_READ_MEMORY_BARRIER;
+ chk_last = aschedtime_read(&rq->sched_util.last);
+ if (chk_last == last)
+ break;
+ if (!locked) {
+ if (++try >= ERTS_GET_AVG_MAX_UNLOCKED_TRY) {
+ /* Writer will eventually block on runq-lock */
+ erts_smp_runq_lock(rq);
+ locked = 1;
+ }
+ }
+ }
+
+ if (!initially_locked && locked)
+ erts_smp_runq_unlock(rq);
+
+ now = sched_wall_time_ts();
+ worktime = calc_sched_worktime(is_working, now, last, interval, old_worktime);
+
+ util = (int) ((worktime * 1000000)/interval);
+
+ ASSERT(0 <= util && util <= 1000000);
+
+ return util;
+}
+
+static void
+init_runq_sched_util(ErtsRunQueueSchedUtil *rqsu, int enabled)
+{
+ aschedtime_init(&rqsu->last);
+ if (!enabled)
+ aschedtime_set(&rqsu->last, ERTS_SCHED_AVG_UTIL_WRITE_MARKER);
+ rqsu->is_working = 0;
+ rqsu->worktime.short_interval = (Uint64) 0;
+ rqsu->worktime.long_interval = (Uint64) 0;
+
+#ifdef DEBUG
+ {
+ Uint64 intrvl;
+ /*
+ * If one of these asserts fails we may have an
+ * overflow in calc_sched_worktime(), which has
+ * to be fixed either by shrinking the interval
+ * size or by fixing the calculation of worktime
+ * in calc_sched_worktime().
+ */
+ intrvl = ERTS_SCHED_UTIL_SHORT_INTERVAL;
+ ASSERT(intrvl*(intrvl/1000) > intrvl);
+ intrvl = ERTS_SCHED_UTIL_LONG_INTERVAL;
+ ASSERT(intrvl*(intrvl/1000) > intrvl);
+ }
+#endif
+}
+
+#endif /* ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT */
+
static ERTS_INLINE void
sched_wall_time_change(ErtsSchedulerData *esdp, int working)
{
- if (esdp->sched_wall_time.enabled) {
+ if (esdp->sched_wall_time.need) {
Uint64 ts = sched_wall_time_ts();
- if (working) {
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ update_avg_sched_util(esdp, ts, working);
+#endif
+ if (esdp->sched_wall_time.enabled) {
+ if (working) {
#ifdef DEBUG
- ASSERT(!esdp->sched_wall_time.working.currently);
- esdp->sched_wall_time.working.currently = 1;
+ ASSERT(!esdp->sched_wall_time.working.currently);
+ esdp->sched_wall_time.working.currently = 1;
#endif
- ts -= esdp->sched_wall_time.start;
- esdp->sched_wall_time.working.start = ts;
- }
- else {
+ ts -= esdp->sched_wall_time.start;
+ esdp->sched_wall_time.working.start = ts;
+ }
+ else {
#ifdef DEBUG
- ASSERT(esdp->sched_wall_time.working.currently);
- esdp->sched_wall_time.working.currently = 0;
+ ASSERT(esdp->sched_wall_time.working.currently);
+ esdp->sched_wall_time.working.currently = 0;
#endif
- ts -= esdp->sched_wall_time.start;
- ts -= esdp->sched_wall_time.working.start;
- esdp->sched_wall_time.working.total += ts;
+ ts -= esdp->sched_wall_time.start;
+ ts -= esdp->sched_wall_time.working.start;
+ esdp->sched_wall_time.working.total += ts;
+ }
}
}
}
@@ -705,10 +933,13 @@ reply_sched_wall_time(void *vswtrp)
ASSERT(esdp);
if (swtrp->set) {
- if (!swtrp->enable && esdp->sched_wall_time.enabled)
+ if (!swtrp->enable && esdp->sched_wall_time.enabled) {
+ esdp->sched_wall_time.need = erts_sched_balance_util;
esdp->sched_wall_time.enabled = 0;
+ }
else if (swtrp->enable && !esdp->sched_wall_time.enabled) {
Uint64 ts = sched_wall_time_ts();
+ esdp->sched_wall_time.need = 1;
esdp->sched_wall_time.enabled = 1;
esdp->sched_wall_time.start = ts;
esdp->sched_wall_time.working.total = 0;
@@ -2084,9 +2315,8 @@ ongoing_multi_scheduling_block(void)
}
static ERTS_INLINE void
-empty_runq(ErtsRunQueue *rq)
+empty_runq_aux(ErtsRunQueue *rq, Uint32 old_flags)
{
- Uint32 old_flags = ERTS_RUNQ_FLGS_UNSET(rq, ERTS_RUNQ_FLG_NONEMPTY|ERTS_RUNQ_FLG_PROTECTED);
if (old_flags & ERTS_RUNQ_FLG_NONEMPTY) {
#ifdef DEBUG
erts_aint32_t empty = erts_smp_atomic32_read_nob(&no_empty_run_queues);
@@ -2107,6 +2337,23 @@ empty_runq(ErtsRunQueue *rq)
}
static ERTS_INLINE void
+empty_runq(ErtsRunQueue *rq)
+{
+ Uint32 old_flags = ERTS_RUNQ_FLGS_UNSET(rq, ERTS_RUNQ_FLG_NONEMPTY|ERTS_RUNQ_FLG_PROTECTED);
+ empty_runq_aux(rq, old_flags);
+}
+
+static ERTS_INLINE Uint32
+empty_protected_runq(ErtsRunQueue *rq)
+{
+ Uint32 old_flags = ERTS_RUNQ_FLGS_BSET(rq,
+ ERTS_RUNQ_FLG_NONEMPTY|ERTS_RUNQ_FLG_PROTECTED,
+ ERTS_RUNQ_FLG_PROTECTED);
+ empty_runq_aux(rq, old_flags);
+ return old_flags;
+}
+
+static ERTS_INLINE void
non_empty_runq(ErtsRunQueue *rq)
{
Uint32 old_flags = ERTS_RUNQ_FLGS_SET(rq, ERTS_RUNQ_FLG_NONEMPTY);
@@ -2130,6 +2377,18 @@ non_empty_runq(ErtsRunQueue *rq)
}
}
+void
+erts_empty_runq(ErtsRunQueue *rq)
+{
+ empty_runq(rq);
+}
+
+void
+erts_non_empty_runq(ErtsRunQueue *rq)
+{
+ non_empty_runq(rq);
+}
+
static erts_aint32_t
sched_prep_spin_wait(ErtsSchedulerSleepInfo *ssi)
{
@@ -2632,7 +2891,7 @@ ssi_flags_set_wake(ErtsSchedulerSleepInfo *ssi)
}
static void
-wake_scheduler(ErtsRunQueue *rq, int incq)
+wake_scheduler(ErtsRunQueue *rq)
{
ErtsSchedulerSleepInfo *ssi;
erts_aint32_t flgs;
@@ -2651,9 +2910,6 @@ wake_scheduler(ErtsRunQueue *rq, int incq)
flgs = ssi_flags_set_wake(ssi);
erts_sched_finish_poke(ssi, flgs);
-
- if (incq && (flgs & ERTS_SSI_FLG_WAITING))
- non_empty_runq(rq);
}
#define ERTS_NO_USED_RUNQS_SHIFT 16
@@ -2744,7 +3000,7 @@ chk_wake_sched(ErtsRunQueue *crq, int ix, int activate)
if (try_inc_no_active_runqs(ix+1))
(void) ERTS_RUNQ_FLGS_UNSET(wrq, ERTS_RUNQ_FLG_INACTIVE);
}
- wake_scheduler(wrq, 0);
+ wake_scheduler(wrq);
return 1;
}
return 0;
@@ -2792,7 +3048,7 @@ smp_notify_inc_runq(ErtsRunQueue *runq)
{
#ifdef ERTS_SMP
if (runq)
- wake_scheduler(runq, 1);
+ wake_scheduler(runq);
#endif
}
@@ -2810,7 +3066,7 @@ erts_sched_notify_check_cpu_bind(void)
for (ix = 0; ix < erts_no_run_queues; ix++) {
ErtsRunQueue *rq = ERTS_RUNQ_IX(ix);
(void) ERTS_RUNQ_FLGS_SET(rq, ERTS_RUNQ_FLG_CHK_CPU_BIND);
- wake_scheduler(rq, 0);
+ wake_scheduler(rq);
}
#else
erts_sched_check_cpu_bind(erts_get_scheduler_data());
@@ -2938,6 +3194,11 @@ check_immigration_need(ErtsRunQueue *c_rq, ErtsMigrationPath *mp, int prio)
if (!f_rq)
return NULL;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (mp->sched_util)
+ return NULL;
+#endif
+
f_rq_flags = ERTS_RUNQ_FLGS_GET(f_rq);
if (f_rq_flags & ERTS_RUNQ_FLG_PROTECTED)
return NULL;
@@ -3077,7 +3338,7 @@ suspend_run_queue(ErtsRunQueue *rq)
ERTS_SSI_FLG_SUSPENDED);
(void) ERTS_RUNQ_FLGS_SET(rq, ERTS_RUNQ_FLG_SUSPENDED);
- wake_scheduler(rq, 0);
+ wake_scheduler(rq);
}
static void scheduler_ix_resume_wake(Uint ix);
@@ -3169,6 +3430,9 @@ evacuate_run_queue(ErtsRunQueue *rq,
to_rq->misc.start = start;
to_rq->misc.end = end;
+
+ non_empty_runq(to_rq);
+
erts_smp_runq_unlock(to_rq);
smp_notify_inc_runq(to_rq);
erts_smp_runq_lock(to_rq);
@@ -3381,7 +3645,7 @@ try_steal_task(ErtsRunQueue *rq)
Uint32 flags;
/* Protect jobs we steal from getting stolen from us... */
- flags = ERTS_RUNQ_FLGS_SET(rq, ERTS_RUNQ_FLG_PROTECTED);
+ flags = empty_protected_runq(rq);
if (flags & ERTS_RUNQ_FLG_SUSPENDED)
return 0; /* go suspend instead... */
@@ -3460,6 +3724,9 @@ typedef struct {
int full_reds_history_change;
int oowc;
int max_len;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ int sched_util;
+#endif
} ErtsRunQueueBalance;
static ErtsRunQueueBalance *run_queue_info;
@@ -3623,6 +3890,9 @@ check_balance(ErtsRunQueue *c_rq)
Sint64 scheds_reds, full_scheds_reds;
int forced, active, current_active, oowc, half_full_scheds, full_scheds,
mmax_len, blnc_no_rqs, qix, pix, freds_hist_ix;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ int sched_util_balancing;
+#endif
if (erts_smp_atomic32_xchg_nob(&balance_info.checking_balance, 1)) {
c_rq->check_balance_reds = INT_MAX;
@@ -3678,6 +3948,10 @@ check_balance(ErtsRunQueue *c_rq)
return;
}
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ sched_util_balancing = 0;
+#endif
+
freds_hist_ix = balance_info.full_reds_history_index;
balance_info.full_reds_history_index++;
if (balance_info.full_reds_history_index >= ERTS_FULL_REDS_HISTORY_SIZE)
@@ -3708,7 +3982,12 @@ check_balance(ErtsRunQueue *c_rq)
run_queue_info[qix].oowc = rq->out_of_work_count;
run_queue_info[qix].max_len = rq->max_len;
rq->check_balance_reds = INT_MAX;
-
+
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (erts_sched_balance_util)
+ run_queue_info[qix].sched_util = erts_get_sched_util(rq, 1, 0);
+#endif
+
erts_smp_runq_unlock(rq);
}
@@ -3778,8 +4057,38 @@ check_balance(ErtsRunQueue *c_rq)
mmax_len = run_queue_info[qix].max_len;
}
- if (!erts_sched_compact_load)
+ if (!erts_sched_compact_load) {
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (erts_sched_balance_util && full_scheds < blnc_no_rqs) {
+ int avg_util = 0;
+
+ for (qix = 0; qix < blnc_no_rqs; qix++)
+ avg_util += run_queue_info[qix].sched_util;
+
+ avg_util /= blnc_no_rqs; /* in ppm */
+
+ sched_util_balancing = 1;
+ /*
+ * In order to avoid renaming a large number of fields
+ * we write utilization values instead of length values
+ * in the 'max_len' and 'migration_limit' fields...
+ */
+ for (qix = 0; qix < blnc_no_rqs; qix++) {
+ run_queue_info[qix].flags = 0; /* Reset for later use... */
+ for (pix = 0; pix < ERTS_NO_PRIO_LEVELS; pix++) {
+ run_queue_info[qix].prio[pix].emigrate_to = -1;
+ run_queue_info[qix].prio[pix].immigrate_from = -1;
+ run_queue_info[qix].prio[pix].avail = 100;
+ run_queue_info[qix].prio[pix].max_len = run_queue_info[qix].sched_util;
+ run_queue_info[qix].prio[pix].migration_limit = avg_util;
+ }
+ }
+ active = blnc_no_rqs;
+ goto setup_migration_paths;
+ }
+#endif
goto all_active;
+ }
if (!forced && half_full_scheds != blnc_no_rqs) {
int min = 1;
@@ -3896,15 +4205,30 @@ check_balance(ErtsRunQueue *c_rq)
}
}
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ setup_migration_paths:
+#endif
+
/* Setup migration paths for all priorities */
for (pix = 0; pix < ERTS_NO_PRIO_LEVELS; pix++) {
int low = 0, high = 0;
for (qix = 0; qix < blnc_no_rqs; qix++) {
int len_diff = run_queue_info[qix].prio[pix].max_len;
len_diff -= run_queue_info[qix].prio[pix].migration_limit;
+
#ifdef DBG_PRINT
if (pix == 2) erts_fprintf(stderr, "%d ", len_diff);
#endif
+
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (sched_util_balancing
+ && -ERTS_SCHED_UTIL_IGNORE_IMBALANCE_DIFF <= len_diff
+ && len_diff <= ERTS_SCHED_UTIL_IGNORE_IMBALANCE_DIFF) {
+ /* ignore minor imbalance */
+ len_diff = 0;
+ }
+#endif
+
run_queue_compare[qix].qix = qix;
run_queue_compare[qix].len = len_diff;
if (len_diff != 0) {
@@ -4031,6 +4355,9 @@ erts_fprintf(stderr, "--------------------------------\n");
Uint32 flags = run_queue_info[qix].flags;
ErtsMigrationPath *mp = &new_mpaths->mpath[qix];
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ mp->sched_util = sched_util_balancing;
+#endif
mp->flags = flags;
mp->misc_evac_runq = NULL;
@@ -4628,6 +4955,11 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online)
set_wakeup_other_data();
#endif
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (erts_sched_balance_util)
+ erts_sched_compact_load = 0;
+#endif
+
ASSERT(no_schedulers_online <= no_schedulers);
ASSERT(no_schedulers_online >= 1);
ASSERT(no_schedulers >= 1);
@@ -4696,6 +5028,11 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online)
rq->ports.info.reds = 0;
rq->ports.start = NULL;
rq->ports.end = NULL;
+
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ init_runq_sched_util(&rq->sched_util, erts_sched_balance_util);
+#endif
+
}
#ifdef ERTS_SMP
@@ -4794,6 +5131,7 @@ erts_init_scheduling(int no_schedulers, int no_schedulers_online)
esdp->reductions = 0;
init_sched_wall_time(&esdp->sched_wall_time);
+
erts_port_task_handle_init(&esdp->nosuspend_port_task_handle);
}
@@ -5761,7 +6099,7 @@ erts_set_schedulers_online(Process *p,
for (ix = no; ix < online; ix++) {
ErtsRunQueue *rq = ERTS_RUNQ_IX(ix);
- wake_scheduler(rq, 0);
+ wake_scheduler(rq);
}
}
}
@@ -5860,7 +6198,7 @@ erts_block_multi_scheduling(Process *p, ErtsProcLocks plocks, int on, int all)
for (ix = 1; ix < online; ix++) {
ErtsRunQueue *rq = ERTS_RUNQ_IX(ix);
- wake_scheduler(rq, 0);
+ wake_scheduler(rq);
}
if (erts_smp_atomic32_read_nob(&schdlr_sspnd.active)
@@ -7265,19 +7603,13 @@ Process *schedule(Process *p, int calls)
#ifdef ERTS_SMP
ErtsMigrationPaths *mps;
ErtsMigrationPath *mp;
-
-#ifdef ERTS_SMP
- {
- ErtsProcList *pnd_xtrs = rq->procs.pending_exiters;
- if (erts_proclist_fetch(&pnd_xtrs, NULL)) {
- rq->procs.pending_exiters = NULL;
- erts_smp_runq_unlock(rq);
- handle_pending_exiters(pnd_xtrs);
- erts_smp_runq_lock(rq);
- }
-
+ ErtsProcList *pnd_xtrs = rq->procs.pending_exiters;
+ if (erts_proclist_fetch(&pnd_xtrs, NULL)) {
+ rq->procs.pending_exiters = NULL;
+ erts_smp_runq_unlock(rq);
+ handle_pending_exiters(pnd_xtrs);
+ erts_smp_runq_lock(rq);
}
-#endif
if (rq->check_balance_reds <= 0)
check_balance(rq);
@@ -7294,7 +7626,7 @@ Process *schedule(Process *p, int calls)
continue_check_activities_to_run:
flags = ERTS_RUNQ_FLGS_GET_NOB(rq);
continue_check_activities_to_run_known_flags:
-
+ ASSERT(flags & ERTS_RUNQ_FLG_NONEMPTY);
if (flags & (ERTS_RUNQ_FLG_CHK_CPU_BIND|ERTS_RUNQ_FLG_SUSPENDED)) {
@@ -7346,20 +7678,16 @@ Process *schedule(Process *p, int calls)
rq->wakeup_other = 0;
rq->wakeup_other_reds = 0;
- empty_runq(rq);
-
flags = ERTS_RUNQ_FLGS_GET_NOB(rq);
- if (flags & ERTS_RUNQ_FLG_SUSPENDED) {
- non_empty_runq(rq);
+ if (flags & ERTS_RUNQ_FLG_SUSPENDED)
goto continue_check_activities_to_run_known_flags;
- }
- else if (!(flags & ERTS_RUNQ_FLG_INACTIVE)) {
- if (try_steal_task(rq)) {
- non_empty_runq(rq);
+ if (flags & ERTS_RUNQ_FLG_INACTIVE)
+ empty_runq(rq);
+ else {
+ if (try_steal_task(rq))
goto continue_check_activities_to_run;
- }
- (void) ERTS_RUNQ_FLGS_UNSET(rq, ERTS_RUNQ_FLG_PROTECTED);
+ empty_runq(rq);
/*
* Check for ERTS_RUNQ_FLG_SUSPENDED has to be done
@@ -7368,10 +7696,10 @@ Process *schedule(Process *p, int calls)
flags = ERTS_RUNQ_FLGS_GET_NOB(rq);
if (flags & ERTS_RUNQ_FLG_SUSPENDED) {
non_empty_runq(rq);
+ flags |= ERTS_RUNQ_FLG_NONEMPTY;
goto continue_check_activities_to_run_known_flags;
}
}
-
#endif
scheduler_wait(&fcalls, esdp, rq);
@@ -8486,6 +8814,10 @@ erts_schedule_misc_op(void (*func)(void *), void *arg)
rq->misc.start = molp;
rq->misc.end = molp;
+#ifdef ERTS_SMP
+ non_empty_runq(rq);
+#endif
+
erts_smp_runq_unlock(rq);
smp_notify_inc_runq(rq);
@@ -8755,7 +9087,6 @@ erl_create_process(Process* parent, /* Parent of process (default group leader).
p->htop = p->heap;
p->heap_sz = sz;
p->catches = 0;
- p->extra_root = NULL;
p->bin_vheap_sz = p->min_vheap_size;
p->bin_old_vheap_sz = p->min_vheap_size;
@@ -9372,8 +9703,11 @@ save_pending_exiter(Process *p)
erts_proclist_store_last(&rq->procs.pending_exiters, plp);
+ non_empty_runq(rq);
+
erts_smp_runq_unlock(rq);
- wake_scheduler(rq, 1);
+
+ wake_scheduler(rq);
}
#endif
@@ -10219,12 +10553,6 @@ erts_continue_exit_process(Process *p)
if (pbt)
erts_free(ERTS_ALC_T_BPD, (void *) pbt);
- if (p->extra_root != NULL) {
- (p->extra_root->cleanup)(p->extra_root); /* Should deallocate
- whole structure */
- p->extra_root = NULL;
- }
-
delete_process(p);
#ifdef ERTS_SMP
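
calc_sched_worktime() above maintains, per run queue, an exponentially decayed busy time over a short (1 s) and a long (10 s) window: the previous value is scaled by (interval - elapsed)/interval, with both factors divided by 1000 first so the 64-bit multiply cannot overflow, and the elapsed time is added back when the scheduler was working; erts_get_sched_util() then reports worktime/interval in ppm. A standalone sketch with a small worked example (all names illustrative):

```c
#include <stdio.h>
#include <stdint.h>

/* Decayed per-run-queue busy time, as in calc_sched_worktime() above.
 * Times are in nanoseconds; interval is the averaging window. */
static uint64_t decay_worktime(int is_working, uint64_t now, uint64_t last,
                               uint64_t interval, uint64_t old_worktime)
{
    uint64_t elapsed, worktime;

    if (now <= last)
        return old_worktime;
    elapsed = now - last;
    if (elapsed >= interval)                 /* window fully replaced */
        return is_working ? interval : 0;

    /* Scale the old value by (interval - elapsed)/interval; divide both
     * factors by 1000 first so the multiply cannot overflow 64 bits. */
    worktime = old_worktime;
    worktime *= (interval - elapsed) / 1000;
    worktime /= interval / 1000;
    if (is_working)
        worktime += elapsed;
    return worktime;
}

int main(void)
{
    const uint64_t one_sec = 1000000000ull;  /* the short interval, 1 s */
    /* Example: fully busy over the last window (worktime == interval),
     * then idle for 250 ms. */
    uint64_t wt = decay_worktime(0, 1250000000ull, one_sec, one_sec, one_sec);
    printf("utilization ~ %u ppm\n", (unsigned)((wt * 1000000) / one_sec));
    /* prints 750000 ppm, i.e. about 75% busy over the last second */
    return 0;
}
```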
diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h
index 043621125c..6155f99b85 100644
--- a/erts/emulator/beam/erl_process.h
+++ b/erts/emulator/beam/erl_process.h
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 1996-2013. All Rights Reserved.
+ * Copyright Ericsson AB 1996-2014. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
@@ -70,6 +70,9 @@ typedef struct process Process;
struct ErtsNodesMonitor_;
+#define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT 0
+#define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT 0
+
#define ERTS_MAX_NO_OF_SCHEDULERS 1024
#define ERTS_DEFAULT_MAX_PROCESSES (1 << 18)
@@ -98,6 +101,7 @@ struct saved_calls {
extern Export exp_send, exp_receive, exp_timeout;
extern int erts_sched_compact_load;
+extern int erts_sched_balance_util;
extern Uint erts_no_schedulers;
extern Uint erts_no_run_queues;
extern int erts_sched_thread_suggested_stack_size;
@@ -198,6 +202,10 @@ extern int erts_sched_thread_suggested_stack_size;
#define ERTS_RUNQ_FLGS_SET(RQ, FLGS) \
((Uint32) erts_smp_atomic32_read_bor_relb(&(RQ)->flags, \
(erts_aint32_t) (FLGS)))
+#define ERTS_RUNQ_FLGS_BSET(RQ, MSK, FLGS) \
+ ((Uint32) erts_smp_atomic32_read_bset_relb(&(RQ)->flags, \
+ (erts_aint32_t) (MSK), \
+ (erts_aint32_t) (FLGS)))
#define ERTS_RUNQ_FLGS_UNSET(RQ, FLGS) \
((Uint32) erts_smp_atomic32_read_band_relb(&(RQ)->flags, \
(erts_aint32_t) ~(FLGS)))
@@ -316,9 +324,40 @@ typedef struct {
int reds;
} ErtsRunQueueInfo;
+
+#ifdef HAVE_GETHRTIME
+# undef ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT
+# define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT 1
+#endif
+
#ifdef ERTS_SMP
+#undef ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+#define ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT_OPT
+
+#ifdef ARCH_64
+typedef erts_atomic_t ErtsAtomicSchedTime;
+#elif defined(ARCH_32)
+typedef erts_dw_atomic_t ErtsAtomicSchedTime;
+#else
+# error :-/
+#endif
+
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+typedef struct {
+ ErtsAtomicSchedTime last;
+ struct {
+ Uint64 short_interval;
+ Uint64 long_interval;
+ } worktime;
+ int is_working;
+} ErtsRunQueueSchedUtil;
+#endif
+
typedef struct {
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ int sched_util;
+#endif
Uint32 flags;
ErtsRunQueue *misc_evac_runq;
struct {
@@ -385,6 +424,9 @@ struct ErtsRunQueue_ {
Port *start;
Port *end;
} ports;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ ErtsRunQueueSchedUtil sched_util;
+#endif
};
#ifdef ERTS_SMP
@@ -414,6 +456,7 @@ do { \
} while (0)
typedef struct {
+ int need; /* "+sbu true" or scheduler_wall_time enabled */
int enabled;
Uint64 start;
struct {
@@ -542,6 +585,12 @@ int erts_smp_lc_runq_is_locked(ErtsRunQueue *);
#ifdef ERTS_INCLUDE_SCHEDULER_INTERNALS
+#ifdef ERTS_SMP
+void erts_empty_runq(ErtsRunQueue *rq);
+void erts_non_empty_runq(ErtsRunQueue *rq);
+#endif
+
+
/*
* Run queue locked during modifications. We use atomic ops since
* other threads peek at values without run queue lock.
@@ -574,6 +623,10 @@ erts_smp_inc_runq_len(ErtsRunQueue *rq, ErtsRunQueueInfo *rqi, int prio)
erts_smp_atomic32_set_relb(&rqi->len, len);
+#ifdef ERTS_SMP
+ if (rq->len == 0)
+ erts_non_empty_runq(rq);
+#endif
rq->len++;
if (rq->max_len < rq->len)
rq->max_len = len;
@@ -711,13 +764,6 @@ struct ErtsPendingSuspend_ {
#endif
-typedef struct ErlExtraRootSet_ ErlExtraRootSet;
-struct ErlExtraRootSet_ {
- Eterm *objv;
- Uint sz;
- void (*cleanup)(ErlExtraRootSet *);
-};
-
/* Defines to ease the change of memory architecture */
# define HEAP_START(p) (p)->heap
# define HEAP_TOP(p) (p)->htop
@@ -811,8 +857,6 @@ struct process {
ErlMessageQueue msg; /* Message queue */
- ErlExtraRootSet *extra_root; /* Used by trapping BIF's */
-
union {
ErtsBifTimer *bif_timers; /* Bif timers aiming at this process */
void *terminate;
@@ -1695,6 +1739,13 @@ erts_proc_set_error_handler(Process *p, ErtsProcLocks plocks, Eterm handler)
extern erts_atomic_t erts_migration_paths;
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+int erts_get_sched_util(ErtsRunQueue *rq,
+ int initially_locked,
+ int short_interval);
+#endif
+
+
ERTS_GLB_INLINE ErtsMigrationPaths *erts_get_migration_paths_managed(void);
ERTS_GLB_INLINE ErtsMigrationPaths *erts_get_migration_paths(void);
ERTS_GLB_INLINE ErtsRunQueue *erts_check_emigration_need(ErtsRunQueue *c_rq,
@@ -1746,22 +1797,36 @@ erts_check_emigration_need(ErtsRunQueue *c_rq, int prio)
return mp->prio[prio].runq;
}
-
- if (prio == ERTS_PORT_PRIO_LEVEL)
- len = RUNQ_READ_LEN(&c_rq->ports.info.len);
+#if ERTS_HAVE_SCHED_UTIL_BALANCING_SUPPORT
+ if (mp->sched_util) {
+ ErtsRunQueue *rq = mp->prio[prio].runq;
+ /* No migration if other is non-empty */
+ if (!(ERTS_RUNQ_FLGS_GET(rq) & ERTS_RUNQ_FLG_NONEMPTY)
+ && erts_get_sched_util(rq, 0, 1) < mp->prio[prio].limit.other
+ && erts_get_sched_util(c_rq, 0, 1) > mp->prio[prio].limit.this) {
+ return rq;
+ }
+ }
else
- len = RUNQ_READ_LEN(&c_rq->procs.prio_info[prio].len);
-
- if (len > mp->prio[prio].limit.this) {
- ErtsRunQueue *n_rq = mp->prio[prio].runq;
- if (n_rq) {
- if (prio == ERTS_PORT_PRIO_LEVEL)
- len = RUNQ_READ_LEN(&n_rq->ports.info.len);
- else
- len = RUNQ_READ_LEN(&n_rq->procs.prio_info[prio].len);
-
- if (len < mp->prio[prio].limit.other)
- return n_rq;
+#endif
+ {
+
+ if (prio == ERTS_PORT_PRIO_LEVEL)
+ len = RUNQ_READ_LEN(&c_rq->ports.info.len);
+ else
+ len = RUNQ_READ_LEN(&c_rq->procs.prio_info[prio].len);
+
+ if (len > mp->prio[prio].limit.this) {
+ ErtsRunQueue *n_rq = mp->prio[prio].runq;
+ if (n_rq) {
+ if (prio == ERTS_PORT_PRIO_LEVEL)
+ len = RUNQ_READ_LEN(&n_rq->ports.info.len);
+ else
+ len = RUNQ_READ_LEN(&n_rq->procs.prio_info[prio].len);
+
+ if (len < mp->prio[prio].limit.other)
+ return n_rq;
+ }
}
}
}
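
On 32-bit builds, ErtsAtomicSchedTime above is a double-word atomic, and aschedtime_set()/aschedtime_read() pack the 64-bit timestamp into its low and high signed word slots (readers additionally retry when they observe the write marker). A minimal sketch of just the packing, using plain fields instead of the ethread double-word atomics:

```c
#include <stdint.h>
#include <assert.h>

/* Sketch of the 64-bit <-> two-32-bit-word packing used by
 * aschedtime_set()/aschedtime_read() on 32-bit builds. The real code
 * stores the words through an erts_dw_atomic_t; here they are plain fields. */
struct dw { int32_t low; int32_t high; };

static void dw_set(struct dw *d, uint64_t val)
{
    d->low  = (int32_t)(uint32_t)(val & 0xffffffffu);
    d->high = (int32_t)(uint32_t)((val >> 32) & 0xffffffffu);
}

static uint64_t dw_get(const struct dw *d)
{
    uint64_t res = (uint64_t)(uint32_t)d->high;
    res <<= 32;
    res |= (uint64_t)(uint32_t)d->low;
    return res;
}

int main(void)
{
    struct dw d;
    dw_set(&d, 0x123456789abcdef0ull);
    assert(dw_get(&d) == 0x123456789abcdef0ull);
    return 0;
}
```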
diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 2cb44a5b64..5e7a5cab6e 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 1996-2013. All Rights Reserved.
+ * Copyright Ericsson AB 1996-2014. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
@@ -87,7 +87,8 @@
static Export term_to_binary_trap_export;
static byte* enc_term(ErtsAtomCacheMap *, Eterm, byte*, Uint32, struct erl_off_heap_header** off_heap);
-static int enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
+struct TTBEncodeContext_;
+static int enc_term_int(struct TTBEncodeContext_*,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
struct erl_off_heap_header** off_heap, Sint *reds, byte **res);
static Uint is_external_string(Eterm obj, int* p_is_string);
static byte* enc_atom(ErtsAtomCacheMap *, Eterm, byte*, Uint32);
@@ -103,7 +104,8 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
Binary *context_b);
static Uint encode_size_struct2(ErtsAtomCacheMap *, Eterm, unsigned);
-static int encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj,
+struct TTBSizeContext_;
+static int encode_size_struct_int(struct TTBSizeContext_*, ErtsAtomCacheMap *acmp, Eterm obj,
unsigned dflags, Sint *reds, Uint *res);
static Export binary_to_term_trap_export;
@@ -1086,7 +1088,6 @@ BIF_RETTYPE term_to_binary_2(BIF_ALIST_2)
int level = 0;
Uint flags = TERM_TO_BINARY_DFLAGS;
Eterm res;
- Binary *bin = NULL;
while (is_list(Flags)) {
Eterm arg = CAR(list_val(Flags));
@@ -1123,7 +1124,7 @@ BIF_RETTYPE term_to_binary_2(BIF_ALIST_2)
goto error;
}
- res = erts_term_to_binary_int(p, Term, level, flags, bin);
+ res = erts_term_to_binary_int(p, Term, level, flags, NULL);
if (is_tuple(res)) {
erts_set_gc_state(p, 0);
BIF_TRAP1(&term_to_binary_trap_export,BIF_P,res);
@@ -1726,14 +1727,20 @@ erts_term_to_binary(Process* p, Eterm Term, int level, Uint flags) {
typedef enum { TTBSize, TTBEncode, TTBCompress } TTBState;
-typedef struct {
+typedef struct TTBSizeContext_ {
Uint flags;
int level;
+ Uint result;
+ Eterm obj;
+ ErtsEStack estack;
} TTBSizeContext;
-typedef struct {
+typedef struct TTBEncodeContext_ {
Uint flags;
int level;
+ byte* ep;
+ Eterm obj;
+ ErtsWStack wstack;
Binary *result_bin;
} TTBEncodeContext;
@@ -1763,8 +1770,10 @@ static void ttb_context_destructor(Binary *context_bin)
context->alive = 0;
switch (context->state) {
case TTBSize:
+ DESTROY_SAVED_ESTACK(&context->s.sc.estack);
break;
case TTBEncode:
+ DESTROY_SAVED_WSTACK(&context->s.ec.wstack);
if (context->s.ec.result_bin != NULL) { /* Set to NULL if ever made alive! */
ASSERT(erts_refc_read(&(context->s.ec.result_bin->refc),0) == 0);
erts_bin_free(context->s.ec.result_bin);
@@ -1829,6 +1838,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
/* Setup enough to get started */
context->state = TTBSize;
context->alive = 1;
+ context->s.sc.estack.start = NULL;
context->s.sc.flags = flags;
context->s.sc.level = level;
} else {
@@ -1844,7 +1854,8 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
int level;
Uint flags;
/* Try for fast path */
- if (encode_size_struct_int(p, NULL, Term, context->s.sc.flags, &reds, &size) < 0) {
+ if (encode_size_struct_int(&context->s.sc, NULL, Term,
+ context->s.sc.flags, &reds, &size) < 0) {
EXPORT_CONTEXT();
/* Same state */
RETURN_STATE();
@@ -1870,6 +1881,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
context->state = TTBEncode;
context->s.ec.flags = flags;
context->s.ec.level = level;
+ context->s.ec.wstack.wstart = NULL;
context->s.ec.result_bin = result_bin;
break;
}
@@ -1881,7 +1893,7 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
Binary *result_bin;
flags = context->s.ec.flags;
- if (enc_term_int(p,NULL,Term, bytes+1, flags, NULL, &reds, &endp) < 0) {
+ if (enc_term_int(&context->s.ec, NULL,Term, bytes+1, flags, NULL, &reds, &endp) < 0) {
EXPORT_CONTEXT();
RETURN_STATE();
}
@@ -2289,27 +2301,6 @@ dec_pid(ErtsDistExternal *edep, Eterm** hpp, byte* ep, ErlOffHeap* off_heap, Ete
#define ENC_PATCH_FUN_SIZE ((Eterm) 2)
#define ENC_LAST_ARRAY_ELEMENT ((Eterm) 3)
-/* Free extra rootset (used when trapping) */
-static void cleanup_ttb_extra_root(ErlExtraRootSet *rs)
-{
- if (rs->objv != NULL) {
- erts_free(ERTS_ALC_T_EXTRA_ROOT, rs->objv);
- }
- erts_free(ERTS_ALC_T_EXTRA_ROOT, rs);
-}
-
-/* Same as above, but we have an extra "stack" beyond GC reach, i.e. an array of two extra roots */
-static void cleanup_ttb_extra_root_2(ErlExtraRootSet *rs)
-{
- if (rs->objv != NULL) {
- erts_free(ERTS_ALC_T_EXTRA_ROOT, rs->objv);
- }
- if (rs[1].objv != NULL) {
- erts_free(ERTS_ALC_T_EXTRA_ROOT, rs[1].objv);
- }
-
- erts_free(ERTS_ALC_T_EXTRA_ROOT, rs);
-}
static byte*
enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
@@ -2321,39 +2312,43 @@ enc_term(ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
}
static int
-enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
+enc_term_int(TTBEncodeContext* ctx, ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dflags,
struct erl_off_heap_header** off_heap, Sint *reds, byte **res)
{
- DECLARE_ESTACK(s);
- DECLARE_WSTACK(com);
+ DECLARE_WSTACK(s);
Uint n;
Uint i;
Uint j;
Uint* ptr;
Eterm val;
FloatDef f;
- int count_reds = (p != NULL && reds != NULL);
Sint r = 0;
+#if HALFWORD_HEAP
+ UWord wobj;
+#endif
+
- if (count_reds) {
- ESTACK_CHANGE_ALLOCATOR(s, ERTS_ALC_T_EXTRA_ROOT);
- WSTACK_CHANGE_ALLOCATOR(com, ERTS_ALC_T_EXTRA_ROOT);
+ if (ctx) {
+ WSTACK_CHANGE_ALLOCATOR(s, ERTS_ALC_T_SAVED_ESTACK);
r = *reds;
- }
- if (p && p->extra_root) { /* restore saved stacks and byte pointer */
- ESTACK_RESTORE(s,p->extra_root[0].objv, p->extra_root[0].sz);
- obj = ESTACK_POP(s);
- WSTACK_RESTORE(com, p->extra_root[1].objv, p->extra_root[1].sz);
- ep = (byte *) WSTACK_POP(com);
+ if (ctx->wstack.wstart) { /* restore saved stacks and byte pointer */
+ WSTACK_RESTORE(s, &ctx->wstack);
+ ep = ctx->ep;
+ obj = ctx->obj;
+ }
}
goto L_jump_start;
outer_loop:
- while (!ESTACK_ISEMPTY(s)) {
- obj = ESTACK_POP(s);
- switch (val = WSTACK_POP(com)) {
+ while (!WSTACK_ISEMPTY(s)) {
+#if HALFWORD_HEAP
+ obj = (Eterm) (wobj = WSTACK_POP(s));
+#else
+ obj = WSTACK_POP(s);
+#endif
+ switch (val = WSTACK_POP(s)) {
case ENC_TERM:
break;
case ENC_ONE_CONS:
@@ -2364,55 +2359,52 @@ enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dfla
obj = CAR(cons);
tl = CDR(cons);
- WSTACK_PUSH(com, is_list(tl) ? ENC_ONE_CONS : ENC_TERM);
- ESTACK_PUSH(s, tl);
+ WSTACK_PUSH(s, is_list(tl) ? ENC_ONE_CONS : ENC_TERM);
+ WSTACK_PUSH(s, tl);
}
break;
case ENC_PATCH_FUN_SIZE:
- /* obj will be discarded, it was NIL */
{
- byte* size_p = (byte *) WSTACK_POP(com);
+#if HALFWORD_HEAP
+ byte* size_p = (byte *) wobj;
+#else
+ byte* size_p = (byte *) obj;
+#endif
put_int32(ep - size_p, size_p);
}
goto outer_loop;
case ENC_LAST_ARRAY_ELEMENT:
/* obj is the tuple */
{
- Eterm* ptr = tuple_val(obj);
- i = arityval(*ptr);
- obj = ptr[i];
+#if HALFWORD_HEAP
+ Eterm* ptr = (Eterm *) wobj;
+#else
+ Eterm* ptr = (Eterm *) obj;
+#endif
+ obj = *ptr;
}
break;
default: /* ENC_LAST_ARRAY_ELEMENT+1 and upwards */
{
- Eterm* ptr = tuple_val(obj);
- i = arityval(*ptr);
- ESTACK_PUSH(s, obj); /* put back tuple and next element index */
- WSTACK_PUSH(com, val-1);
- obj = ptr[i - (val - ENC_LAST_ARRAY_ELEMENT)]; /* the index is counting down */
+#if HALFWORD_HEAP
+ Eterm* ptr = (Eterm *) wobj;
+#else
+ Eterm* ptr = (Eterm *) obj;
+#endif
+ WSTACK_PUSH(s, val-1);
+ obj = *ptr++;
+ WSTACK_PUSH(s, (UWord)ptr);
}
break;
}
L_jump_start:
- if (count_reds && --r == 0) {
+ if (ctx && --r == 0) {
*reds = r;
- ESTACK_PUSH(s,obj); /* push back current object, to be popped on restore */
- WSTACK_PUSH(com,((UWord) ep));
- if (p->extra_root == NULL) {
- /* NB. Allocate an array of two "extra-roots", of which only the first element
- is seen and handled by the GC. Index 1 holds the Wstack. */
- p->extra_root = erts_alloc(ERTS_ALC_T_EXTRA_ROOT, sizeof(ErlExtraRootSet)*2);
- p->extra_root->objv = NULL;
- p->extra_root->sz = 0;
- p->extra_root->cleanup = cleanup_ttb_extra_root_2;
- p->extra_root[1].objv = NULL;
- p->extra_root[1].sz = 0;
- p->extra_root[1].cleanup = NULL; /* Never used */
- }
- ESTACK_SAVE(s, p->extra_root[0].objv, p->extra_root[0].sz);
- WSTACK_SAVE(com, p->extra_root[1].objv, (p->extra_root[1].sz));
+ ctx->obj = obj;
+ ctx->ep = ep;
+ WSTACK_SAVE(s, &ctx->wstack);
return -1;
}
switch(tag_val_def(obj)) {
@@ -2558,8 +2550,8 @@ enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dfla
ep += 4;
}
if (i > 0) {
- WSTACK_PUSH(com, ENC_LAST_ARRAY_ELEMENT+i-1);
- ESTACK_PUSH(s, obj);
+ WSTACK_PUSH(s, ENC_LAST_ARRAY_ELEMENT+i-1);
+ WSTACK_PUSH(s, (UWord)ptr);
}
break;
@@ -2703,9 +2695,8 @@ enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dfla
int ei;
*ep++ = NEW_FUN_EXT;
- WSTACK_PUSH(com, (UWord) ep); /* Position for patching in size */
- WSTACK_PUSH(com, ENC_PATCH_FUN_SIZE);
- ESTACK_PUSH(s,NIL); /* Will be thrown away */
+ WSTACK_PUSH(s, ENC_PATCH_FUN_SIZE);
+ WSTACK_PUSH(s, (UWord) ep); /* Position for patching in size */
ep += 4;
*ep = funp->arity;
ep += 1;
@@ -2722,8 +2713,8 @@ enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dfla
fun_env:
for (ei = funp->num_free-1; ei > 0; ei--) {
- WSTACK_PUSH(com, ENC_TERM);
- ESTACK_PUSH(s, (UWord) funp->env[ei]);
+ WSTACK_PUSH(s, ENC_TERM);
+ WSTACK_PUSH(s, (UWord) funp->env[ei]);
}
if (funp->num_free != 0) {
obj = funp->env[0];
@@ -2766,13 +2757,9 @@ enc_term_int(Process *p,ErtsAtomCacheMap *acmp, Eterm obj, byte* ep, Uint32 dfla
break;
}
}
- DESTROY_ESTACK(s);
- DESTROY_WSTACK(com);
- if (p && p->extra_root) {
- cleanup_ttb_extra_root_2(p->extra_root);
- p->extra_root = NULL;
- }
- if (count_reds) {
+ DESTROY_WSTACK(s);
+ if (ctx) {
+ ASSERT(ctx->wstack.wstart == NULL);
*reds = r;
}
*res = ep;
@@ -3742,26 +3729,24 @@ static Uint encode_size_struct2(ErtsAtomCacheMap *acmp, Eterm obj, unsigned dfla
}
static int
-encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj,
+encode_size_struct_int(TTBSizeContext* ctx, ErtsAtomCacheMap *acmp, Eterm obj,
unsigned dflags, Sint *reds, Uint *res)
{
DECLARE_ESTACK(s);
Uint m, i, arity;
Uint result = 0;
- int count_reds = (p != NULL && reds != 0);
Sint r = 0;
- if (count_reds) {
- ESTACK_CHANGE_ALLOCATOR(s, ERTS_ALC_T_EXTRA_ROOT);
+ if (ctx) {
+ ESTACK_CHANGE_ALLOCATOR(s, ERTS_ALC_T_SAVED_ESTACK);
r = *reds;
- }
-
- if (p && p->extra_root) { /* restore saved stack */
- ESTACK_RESTORE(s,p->extra_root->objv, p->extra_root->sz + 1);
- result = ESTACK_POP(s); /*Untagged, beyond p->extra_root->sz */
- obj = ESTACK_POP(s);
- }
+ if (ctx->estack.start) { /* restore saved stack */
+ ESTACK_RESTORE(s, &ctx->estack);
+ result = ctx->result;
+ obj = ctx->obj;
+ }
+ }
goto L_jump_start;
@@ -3787,18 +3772,11 @@ encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj,
}
L_jump_start:
- if (count_reds && --r == 0) {
+ if (ctx && --r == 0) {
*reds = r;
- ESTACK_PUSH(s,obj); /* push back current object */
- ESTACK_PUSH(s,result); /* Untagged, will be out of GC reach */
- if (p->extra_root == NULL) {
- p->extra_root = erts_alloc(ERTS_ALC_T_EXTRA_ROOT, sizeof(ErlExtraRootSet));
- p->extra_root->objv = NULL;
- p->extra_root->sz = 0;
- p->extra_root->cleanup = cleanup_ttb_extra_root;
- }
- ESTACK_SAVE(s, p->extra_root->objv, p->extra_root->sz);
- --p->extra_root->sz; /* Hide result from GC */
+ ctx->obj = obj;
+ ctx->result = result;
+ ESTACK_SAVE(s, &ctx->estack);
return -1;
}
switch (tag_val_def(obj)) {
@@ -4001,11 +3979,8 @@ encode_size_struct_int(Process *p, ErtsAtomCacheMap *acmp, Eterm obj,
}
DESTROY_ESTACK(s);
- if (p && p->extra_root) {
- cleanup_ttb_extra_root(p->extra_root);
- p->extra_root = NULL;
- }
- if (count_reds) {
+ if (ctx) {
+ ASSERT(ctx->estack.start == NULL);
*reds = r;
}
*res = result;
@@ -4074,7 +4049,9 @@ init_done:
switch (tag) {
case INTEGER_EXT:
SKIP(4);
+#if !defined(ARCH_64) || HALFWORD_HEAP
heap_size += BIG_UINT_HEAP_SIZE;
+#endif
break;
case SMALL_INTEGER_EXT:
SKIP(1);
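
The external.c rework above drops the per-process extra_root scheme: a trapping term_to_binary/2 now keeps its traversal state on an ESTACK/WSTACK, saves that stack into the TTB context when reductions run out, and restores it on the next trap; garbage collection is disabled for the process while the context is alive (erts_set_gc_state(p, 0) above), so the saved stack never needs to be a GC root. A simplified sketch of that save/restore lifecycle, with hypothetical names and the stack-growing path omitted:

```c
#include <stdlib.h>
#include <string.h>

/* Simplified sketch of the save/restore lifecycle that the new
 * TTBEncodeContext_/ErtsWStack code above implements; names and the
 * omission of the grow path are illustrative, not the emulator's API. */
#define DEF_SIZE 16

typedef struct {
    unsigned long def[DEF_SIZE];        /* small default buffer, like WSTK_DEF_STACK */
    unsigned long *start, *sp, *end;
} WorkStack;

typedef struct {                        /* lives in the trap context (magic binary) */
    unsigned long *saved;
    size_t count, cap;
} SavedStack;

static void ws_init(WorkStack *s)
{ s->start = s->sp = s->def; s->end = s->def + DEF_SIZE; }

static void ws_push(WorkStack *s, unsigned long w)
{ *s->sp++ = w; }                       /* the real code grows the stack when full */

/* Out of reductions: copy the live entries to the heap so the BIF can
 * return to the scheduler and be re-trapped later. */
static void ws_save(WorkStack *s, SavedStack *dst)
{
    dst->count = (size_t)(s->sp - s->start);
    dst->cap = DEF_SIZE;
    dst->saved = malloc(dst->cap * sizeof(unsigned long));
    memcpy(dst->saved, s->start, dst->count * sizeof(unsigned long));
}

/* On resume: adopt the heap copy and continue where the last slice stopped. */
static void ws_restore(WorkStack *s, SavedStack *src)
{
    s->start = src->saved;
    s->sp = s->start + src->count;
    s->end = s->start + src->cap;
    src->saved = NULL;
}

static void ws_destroy(WorkStack *s)
{
    if (s->start != s->def)
        free(s->start);                 /* only an adopted heap copy is freed */
}
```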
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 6e5d352e5b..c183c519ff 100755
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 1996-2013. All Rights Reserved.
+ * Copyright Ericsson AB 1996-2014. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
@@ -370,231 +370,233 @@ extern int stackdump_on_exit;
* DESTROY_ESTACK(Stack)
*/
+typedef struct {
+ UWord* start;
+ UWord* sp;
+ UWord* end;
+ ErtsAlcType_t alloc_type;
+}ErtsEStack;
-void erl_grow_stack(ErtsAlcType_t a_type, Eterm** start, Eterm** sp, Eterm** end);
-#define ESTK_CONCAT(a,b) a##b
-#define ESTK_SUBSCRIPT(s,i) *((Eterm *)((byte *)ESTK_CONCAT(s,_start) + (i)))
#define DEF_ESTACK_SIZE (16)
-#define DECLARE_ESTACK(s) \
- Eterm ESTK_CONCAT(s,_default_stack)[DEF_ESTACK_SIZE]; \
- Eterm* ESTK_CONCAT(s,_start) = ESTK_CONCAT(s,_default_stack); \
- Eterm* ESTK_CONCAT(s,_sp) = ESTK_CONCAT(s,_start); \
- Eterm* ESTK_CONCAT(s,_end) = ESTK_CONCAT(s,_start) + DEF_ESTACK_SIZE;\
- ErtsAlcType_t ESTK_CONCAT(s,_alloc_type) = ERTS_ALC_T_ESTACK
+void erl_grow_estack(ErtsEStack*, Eterm* def_stack);
+#define ESTK_CONCAT(a,b) a##b
+#define ESTK_DEF_STACK(s) ESTK_CONCAT(s,_default_estack)
+
+#define DECLARE_ESTACK(s) \
+ UWord ESTK_DEF_STACK(s)[DEF_ESTACK_SIZE]; \
+ ErtsEStack s = { \
+ ESTK_DEF_STACK(s), /* start */ \
+ ESTK_DEF_STACK(s), /* sp */ \
+ ESTK_DEF_STACK(s) + DEF_ESTACK_SIZE, /* end */ \
+ ERTS_ALC_T_ESTACK /* alloc_type */ \
+ }
#define ESTACK_CHANGE_ALLOCATOR(s,t) \
do { \
- if (ESTK_CONCAT(s,_start) != ESTK_CONCAT(s,_default_stack)) { \
+ if (s.start != ESTK_DEF_STACK(s)) { \
erl_exit(1, "Internal error - trying to change allocator " \
"type of active estack\n"); \
} \
- ESTK_CONCAT(s,_alloc_type) = (t); \
+ s.alloc_type = (t); \
} while (0)
+#define DESTROY_ESTACK(s) \
+do { \
+ if (s.start != ESTK_DEF_STACK(s)) { \
+ erts_free(s.alloc_type, s.start); \
+ } \
+} while(0)
+
+
/*
- * Do not free the stack after this, it may have pointers into what
- * was saved in 'v'. 'v' and 'vsize' are changed by this macro. If
- * 'v' points to anything, it should have been allocated by a previous
- * call to this macro. Be careful to set a correct allocator prior to
- * saving.
- * 'v' can be any lvalue pointer, it will point to an array of UWord
- * after calling this macro.
+ * Do not free the stack after this, it may have pointers into what
+ * was saved in 'dst'.
*/
-#define ESTACK_SAVE(s,v,vsize) /* v and vsize are "name parameters" */ \
-do { \
- Uint _esz = ESTACK_COUNT(s); \
- if (ESTK_CONCAT(s,_start) == ESTK_CONCAT(s,_default_stack)) { \
- if ((v) == NULL) { \
- (v) = erts_alloc(ESTK_CONCAT(s,_alloc_type), \
- DEF_ESTACK_SIZE * sizeof(Eterm)); \
- } \
- memcpy((v),ESTK_CONCAT(s,_start),_esz*sizeof(Eterm)); \
- } else { \
- (v) = (void *) ESTK_CONCAT(s,_start); \
- } \
- (vsize) = _esz; \
+#define ESTACK_SAVE(s,dst)\
+do {\
+ if (s.start == ESTK_DEF_STACK(s)) {\
+ UWord _wsz = ESTACK_COUNT(s);\
+ (dst)->start = erts_alloc(s.alloc_type,\
+ DEF_ESTACK_SIZE * sizeof(UWord));\
+ memcpy((dst)->start, s.start,_wsz*sizeof(UWord));\
+ (dst)->sp = (dst)->start + _wsz;\
+ (dst)->end = (dst)->start + DEF_ESTACK_SIZE;\
+ (dst)->alloc_type = s.alloc_type;\
+ } else\
+ *(dst) = s;\
} while (0)
-/*
- * Use on empty stack, only the allocator can be changed before this
- * The vector parameter is reset to NULL if the vector is moved to stack,
- * otherwise it's kept for reuse, so a saved and restored vector might
- * need freeing using the correct allocator parameter.
- * 'v' can be any lvalue pointer, it's cast to an (Eterm *).
+#define DESTROY_SAVED_ESTACK(estack)\
+do {\
+ if ((estack)->start) {\
+ erts_free((estack)->alloc_type, (estack)->start);\
+ (estack)->start = NULL;\
+ }\
+} while(0)
+
+/*
+ * Use on empty stack, only the allocator can be changed before this.
+ * The src stack is reset to NULL.
*/
-#define ESTACK_RESTORE(s, v, vsize) /*v is a "name parameter"*/ \
-do { \
- if ((vsize) > DEF_ESTACK_SIZE) { \
- Uint _ca = DEF_ESTACK_SIZE; \
- while (_ca < (vsize)) \
- _ca = _ca * 2; \
- ESTK_CONCAT(s,_start) = (Eterm *) (v); \
- ESTK_CONCAT(s,_end) = ((Eterm *)(v)) + _ca; \
- ESTK_CONCAT(s,_sp) = ESTK_CONCAT(s,_start) + (vsize); \
- (v) = NULL; \
- } else { \
- memcpy(ESTK_CONCAT(s,_start),(v),(vsize)*sizeof(Eterm));\
- ESTK_CONCAT(s,_sp) = ESTK_CONCAT(s,_start) + (vsize); \
- } \
- } while (0)
+#define ESTACK_RESTORE(s, src) \
+do { \
+ ASSERT(s.start == ESTK_DEF_STACK(s)); \
+ s = *(src); /* struct copy */ \
+ (src)->start = NULL; \
+ ASSERT(s.sp >= s.start); \
+ ASSERT(s.sp <= s.end); \
+} while (0)
-#define ESTACK_IS_STATIC(s) (ESTK_CONCAT(s,_start) == ESTK_CONCAT(s,_default_stack))
+#define ESTACK_IS_STATIC(s) (s.start == ESTK_DEF_STACK(s))
-#define DESTROY_ESTACK(s) \
-do { \
- if (ESTK_CONCAT(s,_start) != ESTK_CONCAT(s,_default_stack)) { \
- erts_free(ESTK_CONCAT(s,_alloc_type), ESTK_CONCAT(s,_start)); \
- } \
+#define ESTACK_PUSH(s, x) \
+do { \
+ if (s.sp == s.end) { \
+ erl_grow_estack(&s, ESTK_DEF_STACK(s)); \
+ } \
+ *s.sp++ = (x); \
} while(0)
-#define ESTACK_PUSH(s, x) \
-do { \
- if (ESTK_CONCAT(s,_sp) == ESTK_CONCAT(s,_end)) { \
- erl_grow_stack(ESTK_CONCAT(s,_alloc_type),&ESTK_CONCAT(s,_start), \
- &ESTK_CONCAT(s,_sp), &ESTK_CONCAT(s,_end)); \
- } \
- *ESTK_CONCAT(s,_sp)++ = (x); \
+#define ESTACK_PUSH2(s, x, y) \
+do { \
+ if (s.sp > s.end - 2) { \
+ erl_grow_estack(&s, ESTK_DEF_STACK(s)); \
+ } \
+ *s.sp++ = (x); \
+ *s.sp++ = (y); \
} while(0)
-#define ESTACK_PUSH2(s, x, y) \
-do { \
- if (ESTK_CONCAT(s,_sp) > ESTK_CONCAT(s,_end) - 2) { \
- erl_grow_stack(ESTK_CONCAT(s,_alloc_type),&ESTK_CONCAT(s,_start), \
- &ESTK_CONCAT(s,_sp), &ESTK_CONCAT(s,_end)); \
- } \
- *ESTK_CONCAT(s,_sp)++ = (x); \
- *ESTK_CONCAT(s,_sp)++ = (y); \
+#define ESTACK_PUSH3(s, x, y, z) \
+do { \
+ if (s.sp > s.end - 3) { \
+ erl_grow_estack(&s, ESTK_DEF_STACK(s)); \
+ } \
+ *s.sp++ = (x); \
+ *s.sp++ = (y); \
+ *s.sp++ = (z); \
} while(0)
-#define ESTACK_PUSH3(s, x, y, z) \
-do { \
- if (ESTK_CONCAT(s,_sp) > ESTK_CONCAT(s,_end) - 3) { \
- erl_grow_stack(&ESTK_CONCAT(s,_start), &ESTK_CONCAT(s,_sp), \
- &ESTK_CONCAT(s,_end)); \
- } \
- *ESTK_CONCAT(s,_sp)++ = (x); \
- *ESTK_CONCAT(s,_sp)++ = (y); \
- *ESTK_CONCAT(s,_sp)++ = (z); \
-} while(0)
+#define ESTACK_COUNT(s) (s.sp - s.start)
+#define ESTACK_ISEMPTY(s) (s.sp == s.start)
+#define ESTACK_POP(s) (*(--s.sp))
-#define ESTACK_COUNT(s) (ESTK_CONCAT(s,_sp) - ESTK_CONCAT(s,_start))
-#define ESTACK_ISEMPTY(s) (ESTK_CONCAT(s,_sp) == ESTK_CONCAT(s,_start))
-#define ESTACK_POP(s) (*(--ESTK_CONCAT(s,_sp)))
+/*
+ * WSTACK: same as ESTACK but with UWord instead of Eterm
+ */
+typedef struct {
+ UWord* wstart;
+ UWord* wsp;
+ UWord* wend;
+ ErtsAlcType_t alloc_type;
+}ErtsWStack;
-void erl_grow_wstack(ErtsAlcType_t a_type, UWord** start, UWord** sp, UWord** end);
-#define WSTK_CONCAT(a,b) a##b
-#define WSTK_SUBSCRIPT(s,i) *((UWord *)((byte *)WSTK_CONCAT(s,_start) + (i)))
#define DEF_WSTACK_SIZE (16)
-#define DECLARE_WSTACK(s) \
- UWord WSTK_CONCAT(s,_default_stack)[DEF_WSTACK_SIZE]; \
- UWord* WSTK_CONCAT(s,_start) = WSTK_CONCAT(s,_default_stack); \
- UWord* WSTK_CONCAT(s,_sp) = WSTK_CONCAT(s,_start); \
- UWord* WSTK_CONCAT(s,_end) = WSTK_CONCAT(s,_start) + DEF_WSTACK_SIZE; \
- ErtsAlcType_t WSTK_CONCAT(s,_alloc_type) = ERTS_ALC_T_ESTACK
+void erl_grow_wstack(ErtsWStack*, UWord* def_stack);
+#define WSTK_CONCAT(a,b) a##b
+#define WSTK_DEF_STACK(s) WSTK_CONCAT(s,_default_wstack)
+
+#define DECLARE_WSTACK(s) \
+ UWord WSTK_DEF_STACK(s)[DEF_WSTACK_SIZE]; \
+ ErtsWStack s = { \
+ WSTK_DEF_STACK(s), /* wstart */ \
+ WSTK_DEF_STACK(s), /* wsp */ \
+ WSTK_DEF_STACK(s) + DEF_WSTACK_SIZE, /* wend */ \
+ ERTS_ALC_T_ESTACK /* alloc_type */ \
+ }
#define WSTACK_CHANGE_ALLOCATOR(s,t) \
do { \
- if (WSTK_CONCAT(s,_start) != WSTK_CONCAT(s,_default_stack)) { \
+ if (s.wstart != WSTK_DEF_STACK(s)) { \
erl_exit(1, "Internal error - trying to change allocator " \
"type of active wstack\n"); \
} \
- WSTK_CONCAT(s,_alloc_type) = (t); \
+ s.alloc_type = (t); \
} while (0)
-#define DESTROY_WSTACK(s) \
-do { \
- if (WSTK_CONCAT(s,_start) != WSTK_CONCAT(s,_default_stack)) { \
- erts_free(WSTK_CONCAT(s,_alloc_type), WSTK_CONCAT(s,_start)); \
- } \
+#define DESTROY_WSTACK(s) \
+do { \
+ if (s.wstart != WSTK_DEF_STACK(s)) { \
+ erts_free(s.alloc_type, s.wstart); \
+ } \
} while(0)
+
/*
- * Do not free the stack after this, it may have pointers into what
- * was saved in 'v'. 'v' and 'vsize' are changed by this macro. If
- * 'v' points to anything, it should have been allocated by a previous
- * call to this macro. Be careful to set a correct allocator prior to
- * saving.
- * 'v' can be any lvalue pointer, it will point to an array of UWord
- * after calling this macro.
+ * Do not free the stack after this, it may have pointers into what
+ * was saved in 'dst'.
*/
-#define WSTACK_SAVE(s,v,vsize) /* v and vsize are "name parameters" */ \
-do { \
- Uint _wsz = WSTACK_COUNT(s); \
- if (WSTK_CONCAT(s,_start) == WSTK_CONCAT(s,_default_stack)) { \
- if ((v) == NULL) { \
- (v) = erts_alloc(WSTK_CONCAT(s,_alloc_type), \
- DEF_WSTACK_SIZE * sizeof(UWord)); \
- } \
- memcpy((v),WSTK_CONCAT(s,_start),_wsz*sizeof(UWord)); \
- } else { \
- (v) = (void *) WSTK_CONCAT(s,_start); \
- } \
- (vsize) = _wsz; \
+#define WSTACK_SAVE(s,dst)\
+do {\
+ if (s.wstart == WSTK_DEF_STACK(s)) {\
+ UWord _wsz = WSTACK_COUNT(s);\
+ (dst)->wstart = erts_alloc(s.alloc_type,\
+ DEF_WSTACK_SIZE * sizeof(UWord));\
+ memcpy((dst)->wstart, s.wstart,_wsz*sizeof(UWord));\
+ (dst)->wsp = (dst)->wstart + _wsz;\
+ (dst)->wend = (dst)->wstart + DEF_WSTACK_SIZE;\
+ (dst)->alloc_type = s.alloc_type;\
+ } else\
+ *(dst) = s;\
} while (0)
-/*
- * Use on empty stack, only the allocator can be changed before this
- * The vector parameter is reset to NULL if the vector is moved to stack,
- * otherwise it's kept for reuse, so a saved and restored vector might
- * need freeing using the correct allocator parameter.
- * 'v' can be any lvalue pointer, it's cast to an (UWord *).
+#define DESTROY_SAVED_WSTACK(wstack)\
+do {\
+ if ((wstack)->wstart) {\
+ erts_free((wstack)->alloc_type, (wstack)->wstart);\
+ (wstack)->wstart = NULL;\
+ }\
+} while(0)
+
+/*
+ * Use on empty stack, only the allocator can be changed before this.
+ * The src stack is reset to NULL.
*/
-#define WSTACK_RESTORE(s, v, vsize) /*v is a "name parameter"*/ \
-do { \
- if ((vsize) > DEF_WSTACK_SIZE) { \
- Uint _ca = DEF_WSTACK_SIZE; \
- while (_ca < (vsize)) \
- _ca = _ca * 2; \
- WSTK_CONCAT(s,_start) = (UWord *) (v); \
- WSTK_CONCAT(s,_end) = ((UWord *)(v)) + _ca; \
- WSTK_CONCAT(s,_sp) = WSTK_CONCAT(s,_start) + (vsize); \
- (v) = NULL; \
- } else { \
- memcpy(WSTK_CONCAT(s,_start),(v),(vsize)*sizeof(UWord));\
- WSTK_CONCAT(s,_sp) = WSTK_CONCAT(s,_start) + (vsize); \
- } \
- } while (0)
+#define WSTACK_RESTORE(s, src) \
+do { \
+ ASSERT(s.wstart == WSTK_DEF_STACK(s)); \
+ s = *(src); /* struct copy */ \
+ (src)->wstart = NULL; \
+ ASSERT(s.wsp >= s.wstart); \
+ ASSERT(s.wsp <= s.wend); \
+} while (0)
-#define WSTACK_IS_STATIC(s) (WSTK_CONCAT(s,_start) == WSTK_CONCAT(s,_default_stack))
+#define WSTACK_IS_STATIC(s) (s.wstart == WSTK_DEF_STACK(s))
-#define WSTACK_PUSH(s, x) \
-do { \
- if (WSTK_CONCAT(s,_sp) == WSTK_CONCAT(s,_end)) { \
- erl_grow_wstack(WSTK_CONCAT(s,_alloc_type), &WSTK_CONCAT(s,_start), \
- &WSTK_CONCAT(s,_sp), &WSTK_CONCAT(s,_end)); \
- } \
- *WSTK_CONCAT(s,_sp)++ = (x); \
+#define WSTACK_PUSH(s, x) \
+do { \
+ if (s.wsp == s.wend) { \
+ erl_grow_wstack(&s, WSTK_DEF_STACK(s)); \
+ } \
+ *s.wsp++ = (x); \
} while(0)
-#define WSTACK_PUSH2(s, x, y) \
-do { \
- if (WSTK_CONCAT(s,_sp) > WSTK_CONCAT(s,_end) - 2) { \
- erl_grow_wstack(WSTK_CONCAT(s,_alloc_type), &WSTK_CONCAT(s,_start), \
- &WSTK_CONCAT(s,_sp), &WSTK_CONCAT(s,_end)); \
- } \
- *WSTK_CONCAT(s,_sp)++ = (x); \
- *WSTK_CONCAT(s,_sp)++ = (y); \
+#define WSTACK_PUSH2(s, x, y) \
+do { \
+ if (s.wsp > s.wend - 2) { \
+ erl_grow_wstack(&s, WSTK_DEF_STACK(s)); \
+ } \
+ *s.wsp++ = (x); \
+ *s.wsp++ = (y); \
} while(0)
-#define WSTACK_PUSH3(s, x, y, z) \
-do { \
- if (WSTK_CONCAT(s,_sp) > WSTK_CONCAT(s,_end) - 3) { \
- erl_grow_wstack(WSTK_CONCAT(s,_alloc_type), &WSTK_CONCAT(s,_start), \
- &WSTK_CONCAT(s,_sp), &WSTK_CONCAT(s,_end)); \
- } \
- *WSTK_CONCAT(s,_sp)++ = (x); \
- *WSTK_CONCAT(s,_sp)++ = (y); \
- *WSTK_CONCAT(s,_sp)++ = (z); \
+#define WSTACK_PUSH3(s, x, y, z) \
+do { \
+ if (s.wsp > s.wend - 3) { \
+ erl_grow_wstack(&s, WSTK_DEF_STACK(s)); \
+ } \
+ *s.wsp++ = (x); \
+ *s.wsp++ = (y); \
+ *s.wsp++ = (z); \
} while(0)
-#define WSTACK_COUNT(s) (WSTK_CONCAT(s,_sp) - WSTK_CONCAT(s,_start))
+#define WSTACK_COUNT(s) (s.wsp - s.wstart)
+#define WSTACK_ISEMPTY(s) (s.wsp == s.wstart)
+#define WSTACK_POP(s) (*(--s.wsp))
-#define WSTACK_ISEMPTY(s) (WSTK_CONCAT(s,_sp) == WSTK_CONCAT(s,_start))
-#define WSTACK_POP(s) (*(--WSTK_CONCAT(s,_sp)))
/* binary.c */
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index 297c4bf439..7f8bdcb2ca 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -1,7 +1,7 @@
/*
* %CopyrightBegin%
*
- * Copyright Ericsson AB 1996-2013. All Rights Reserved.
+ * Copyright Ericsson AB 1996-2014. All Rights Reserved.
*
* The contents of this file are subject to the Erlang Public License,
* Version 1.1, (the "License"); you may not use this file except in
@@ -185,39 +185,41 @@ erts_set_hole_marker(Eterm* ptr, Uint sz)
* Helper function for the ESTACK macros defined in global.h.
*/
void
-erl_grow_stack(ErtsAlcType_t a_type, Eterm** start, Eterm** sp, Eterm** end)
+erl_grow_estack(ErtsEStack* s, Eterm* default_estack)
{
- Uint old_size = (*end - *start);
+ Uint old_size = (s->end - s->start);
Uint new_size = old_size * 2;
- Uint sp_offs = *sp - *start;
- if (new_size > 2 * DEF_ESTACK_SIZE) {
- *start = erts_realloc(a_type, (void *) *start, new_size*sizeof(Eterm));
+ Uint sp_offs = s->sp - s->start;
+ if (s->start != default_estack) {
+ s->start = erts_realloc(s->alloc_type, s->start,
+ new_size*sizeof(Eterm));
} else {
- Eterm* new_ptr = erts_alloc(a_type, new_size*sizeof(Eterm));
- sys_memcpy(new_ptr, *start, old_size*sizeof(Eterm));
- *start = new_ptr;
+ Eterm* new_ptr = erts_alloc(s->alloc_type, new_size*sizeof(Eterm));
+ sys_memcpy(new_ptr, s->start, old_size*sizeof(Eterm));
+ s->start = new_ptr;
}
- *end = *start + new_size;
- *sp = *start + sp_offs;
+ s->end = s->start + new_size;
+ s->sp = s->start + sp_offs;
}
/*
- * Helper function for the ESTACK macros defined in global.h.
+ * Helper function for the WSTACK macros defined in global.h.
*/
void
-erl_grow_wstack(ErtsAlcType_t a_type, UWord** start, UWord** sp, UWord** end)
+erl_grow_wstack(ErtsWStack* s, UWord* default_wstack)
{
- Uint old_size = (*end - *start);
+ Uint old_size = (s->wend - s->wstart);
Uint new_size = old_size * 2;
- Uint sp_offs = *sp - *start;
- if (new_size > 2 * DEF_ESTACK_SIZE) {
- *start = erts_realloc(a_type, (void *) *start, new_size*sizeof(UWord));
+ Uint sp_offs = s->wsp - s->wstart;
+ if (s->wstart != default_wstack) {
+ s->wstart = erts_realloc(s->alloc_type, s->wstart,
+ new_size*sizeof(UWord));
} else {
- UWord* new_ptr = erts_alloc(a_type, new_size*sizeof(UWord));
- sys_memcpy(new_ptr, *start, old_size*sizeof(UWord));
- *start = new_ptr;
+ UWord* new_ptr = erts_alloc(s->alloc_type, new_size*sizeof(UWord));
+ sys_memcpy(new_ptr, s->wstart, old_size*sizeof(UWord));
+ s->wstart = new_ptr;
}
- *end = *start + new_size;
- *sp = *start + sp_offs;
+ s->wend = s->wstart + new_size;
+ s->wsp = s->wstart + sp_offs;
}
/* CTYPE macros */
@@ -2846,7 +2848,7 @@ pop_next:
return 0;
not_equal:
- DESTROY_ESTACK(stack);
+ DESTROY_WSTACK(stack);
return j;
#undef CMP_NODES
diff --git a/erts/emulator/drivers/common/efile_drv.c b/erts/emulator/drivers/common/efile_drv.c
index 8de578d8b7..dca979c13a 100644
--- a/erts/emulator/drivers/common/efile_drv.c
+++ b/erts/emulator/drivers/common/efile_drv.c
@@ -111,7 +111,6 @@
#include "erl_driver.h"
#include "erl_efile.h"
#include "erl_threads.h"
-#include "zlib.h"
#include "gzio.h"
#include "dtrace-wrapper.h"
#include <ctype.h>
@@ -818,7 +817,7 @@ file_start(ErlDrvPort port, char* command)
static void do_close(int flags, SWord fd) {
if (flags & EFILE_COMPRESSED) {
- erts_gzclose((gzFile)(fd));
+ erts_gzclose((ErtsGzFile)(fd));
} else {
efile_closefile((int) fd);
}
@@ -1136,7 +1135,7 @@ static void invoke_read(void *data)
}
read_size = size;
if (d->flags & EFILE_COMPRESSED) {
- read_size = erts_gzread((gzFile)d->fd,
+ read_size = erts_gzread((ErtsGzFile)d->fd,
d->c.read.binp->orig_bytes + d->c.read.bin_offset,
size);
status = (read_size != (size_t) -1);
@@ -1209,7 +1208,7 @@ static void invoke_read_line(void *data)
size = need - d->c.read_line.read_size;
}
if (d->flags & EFILE_COMPRESSED) {
- read_size = erts_gzread((gzFile)d->fd,
+ read_size = erts_gzread((ErtsGzFile)d->fd,
d->c.read_line.binp->orig_bytes +
d->c.read_line.read_offset + d->c.read_line.read_size,
size);
@@ -1250,7 +1249,7 @@ static void invoke_read_line(void *data)
d->c.read_line.read_size -= too_much;
ASSERT(d->c.read_line.read_size >= 0);
if (d->flags & EFILE_COMPRESSED) {
- Sint64 location = erts_gzseek((gzFile)d->fd,
+ Sint64 location = erts_gzseek((ErtsGzFile)d->fd,
-((Sint64) too_much), EFILE_SEEK_CUR);
if (location == -1) {
d->result_ok = 0;
@@ -1535,7 +1534,7 @@ static void invoke_writev(void *data) {
*/
errno = EINVAL;
if (! (status =
- erts_gzwrite((gzFile)d->fd,
+ erts_gzwrite((ErtsGzFile)d->fd,
iov[i].iov_base,
iov[i].iov_len)) == iov[i].iov_len) {
d->errInfo.posix_errno =
@@ -1797,7 +1796,7 @@ static void invoke_lseek(void *data)
d->errInfo.posix_errno = EINVAL;
status = 0;
} else {
- d->c.lseek.location = erts_gzseek((gzFile)d->fd,
+ d->c.lseek.location = erts_gzseek((ErtsGzFile)d->fd,
offset, d->c.lseek.origin);
if (d->c.lseek.location == -1) {
d->errInfo.posix_errno = errno;
@@ -1885,7 +1884,7 @@ static void invoke_open(void *data)
if (status || (d->errInfo.posix_errno != EISDIR)) {
mode = (d->flags & EFILE_MODE_READ) ? "rb" : "wb";
d->fd = (SWord) erts_gzopen(d->b, mode);
- if ((gzFile)d->fd) {
+ if ((ErtsGzFile)d->fd) {
status = 1;
} else {
if (errno == 0) {
diff --git a/erts/emulator/drivers/common/gzio.c b/erts/emulator/drivers/common/gzio.c
index e085c262b0..653f3954b1 100644
--- a/erts/emulator/drivers/common/gzio.c
+++ b/erts/emulator/drivers/common/gzio.c
@@ -77,7 +77,7 @@ typedef struct gz_stream {
* this structure. */
} gz_stream;
-local gzFile gz_open OF((const char *path, const char *mode));
+local ErtsGzFile gz_open OF((const char *path, const char *mode));
local int get_byte OF((gz_stream *s));
local void check_header OF((gz_stream *s));
local int destroy OF((gz_stream *s));
@@ -144,7 +144,7 @@ local uLong getLong OF((gz_stream *s));
can be checked to distinguish the two cases (if errno is zero, the
zlib error is Z_MEM_ERROR).
*/
-local gzFile gz_open (path, mode)
+local ErtsGzFile gz_open (path, mode)
const char *path;
const char *mode;
{
@@ -179,7 +179,7 @@ local gzFile gz_open (path, mode)
s->path = (char*)ALLOC(FILENAME_BYTELEN(path)+FILENAME_CHARSIZE);
if (s->path == NULL) {
- return s->destroy(s), (gzFile)Z_NULL;
+ return s->destroy(s), (ErtsGzFile)Z_NULL;
}
FILENAME_COPY(s->path, path); /* do this early for debugging */
@@ -197,7 +197,7 @@ local gzFile gz_open (path, mode)
} while (*p++ && m < fmode + sizeof(fmode) - 1);
*m = '\0';
if (s->mode == '\0')
- return s->destroy(s), (gzFile)Z_NULL;
+ return s->destroy(s), (ErtsGzFile)Z_NULL;
if (s->mode == 'w') {
err = deflateInit2(&(s->stream), level,
@@ -207,7 +207,7 @@ local gzFile gz_open (path, mode)
s->stream.next_out = s->outbuf = (Byte*)ALLOC(Z_BUFSIZE);
if (err != Z_OK || s->outbuf == Z_NULL) {
- return s->destroy(s), (gzFile)Z_NULL;
+ return s->destroy(s), (ErtsGzFile)Z_NULL;
}
} else {
/*
@@ -221,7 +221,7 @@ local gzFile gz_open (path, mode)
s->stream.next_in = s->inbuf = (Byte*)ALLOC(Z_BUFSIZE);
if (err != Z_OK || s->inbuf == Z_NULL) {
- return s->destroy(s), (gzFile)Z_NULL;
+ return s->destroy(s), (ErtsGzFile)Z_NULL;
}
}
s->stream.avail_out = Z_BUFSIZE;
@@ -229,17 +229,16 @@ local gzFile gz_open (path, mode)
errno = 0;
#if defined(FILENAMES_16BIT)
{
- char wfmode[160];
- int i=0,j;
- for(j=0;fmode[j] != '\0';++j) {
- wfmode[i++]=fmode[j];
- wfmode[i++]='\0';
+ WCHAR wfmode[80];
+ int i = 0;
+ int j;
+ for(j = 0; fmode[j] != '\0'; ++j) {
+ wfmode[i++] = (WCHAR) fmode[j];
}
- wfmode[i++] = '\0';
- wfmode[i++] = '\0';
- s->file = F_OPEN(path, wfmode);
+ wfmode[i++] = L'\0';
+ s->file = _wfopen((WCHAR *)path, wfmode);
if (s->file == NULL) {
- return s->destroy(s), (gzFile)Z_NULL;
+ return s->destroy(s), (ErtsGzFile)Z_NULL;
}
}
#elif defined(UNIX)
@@ -249,18 +248,18 @@ local gzFile gz_open (path, mode)
s->file = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0666);
}
if (s->file == -1) {
- return s->destroy(s), (gzFile)Z_NULL;
+ return s->destroy(s), (ErtsGzFile)Z_NULL;
}
#else
- s->file = F_OPEN(path, fmode);
+ s->file = fopen(path, fmode);
if (s->file == NULL) {
- return s->destroy(s), (gzFile)Z_NULL;
+ return s->destroy(s), (ErtsGzFile)Z_NULL;
}
#endif
if (s->mode == 'r') {
check_header(s); /* skip the .gz header */
}
- return (gzFile)s;
+ return (ErtsGzFile)s;
}
/* ===========================================================================
@@ -296,7 +295,7 @@ local int gz_rewind (gz_stream *s)
/* ===========================================================================
Opens a gzip (.gz) file for reading or writing.
*/
-gzFile erts_gzopen (path, mode)
+ErtsGzFile erts_gzopen (path, mode)
const char *path;
const char *mode;
{
@@ -447,7 +446,7 @@ local int destroy (s)
gzread returns the number of bytes actually read (0 for end of file).
*/
int
-erts_gzread(gzFile file, voidp buf, unsigned len)
+erts_gzread(ErtsGzFile file, voidp buf, unsigned len)
{
gz_stream *s = (gz_stream*)file;
Bytef *start = buf; /* starting point for crc computation */
@@ -557,7 +556,7 @@ erts_gzread(gzFile file, voidp buf, unsigned len)
gzwrite returns the number of bytes actually written (0 in case of error).
*/
int
-erts_gzwrite(gzFile file, voidp buf, unsigned len)
+erts_gzwrite(ErtsGzFile file, voidp buf, unsigned len)
{
gz_stream *s = (gz_stream*)file;
@@ -593,7 +592,7 @@ erts_gzwrite(gzFile file, voidp buf, unsigned len)
*/
int
-erts_gzseek(gzFile file, int offset, int whence)
+erts_gzseek(ErtsGzFile file, int offset, int whence)
{
int pos;
gz_stream* s = (gz_stream *) file;
@@ -655,7 +654,7 @@ erts_gzseek(gzFile file, int offset, int whence)
degrade compression.
*/
int
-erts_gzflush(gzFile file, int flush)
+erts_gzflush(ErtsGzFile file, int flush)
{
uInt len;
int done = 0;
@@ -714,7 +713,7 @@ local uLong getLong (s)
and deallocates all the (de)compression state.
*/
int
-erts_gzclose(gzFile file)
+erts_gzclose(ErtsGzFile file)
{
int err;
gz_stream *s = (gz_stream*)file;
@@ -723,9 +722,9 @@ erts_gzclose(gzFile file)
if (s->mode == 'w') {
err = erts_gzflush (file, Z_FINISH);
- if (err != Z_OK) return s->destroy(file);
+ if (err != Z_OK) return s->destroy(s);
}
- return s->destroy(file);
+ return s->destroy(s);
}
diff --git a/erts/emulator/drivers/common/gzio.h b/erts/emulator/drivers/common/gzio.h
index 3f1e546140..ea50d922ec 100644
--- a/erts/emulator/drivers/common/gzio.h
+++ b/erts/emulator/drivers/common/gzio.h
@@ -17,11 +17,15 @@
* %CopyrightEnd%
*/
-gzFile erts_gzopen (const char *path, const char *mode);
-int erts_gzread(gzFile file, voidp buf, unsigned len);
-int erts_gzwrite(gzFile file, voidp buf, unsigned len);
-int erts_gzseek(gzFile, int, int);
-int erts_gzflush(gzFile file, int flush);
-int erts_gzclose(gzFile file);
+#include "zlib.h"
+
+typedef struct erts_gzFile* ErtsGzFile;
+
+ErtsGzFile erts_gzopen (const char *path, const char *mode);
+int erts_gzread(ErtsGzFile file, voidp buf, unsigned len);
+int erts_gzwrite(ErtsGzFile file, voidp buf, unsigned len);
+int erts_gzseek(ErtsGzFile, int, int);
+int erts_gzflush(ErtsGzFile file, int flush);
+int erts_gzclose(ErtsGzFile file);
ErlDrvBinary* erts_gzinflate_buffer(char*, uLong);
ErlDrvBinary* erts_gzdeflate_buffer(char*, uLong);
diff --git a/erts/emulator/drivers/common/gzio_zutil.h b/erts/emulator/drivers/common/gzio_zutil.h
index 00eccc80fc..854205cc2c 100644
--- a/erts/emulator/drivers/common/gzio_zutil.h
+++ b/erts/emulator/drivers/common/gzio_zutil.h
@@ -23,12 +23,6 @@
* that may change or not exist at all.
*/
-#ifndef HAVE_LIBZ
-/* Use our "real" copy of zutil.h if we don't use shared zlib */
-#include "zutil.h"
-
-#else /* HAVE_LIBZ: Shared zlib is used */
-
#define local static
#define DEF_MEM_LEVEL 8
#define zmemcpy sys_memcpy
@@ -77,6 +71,3 @@
# define OS_CODE 0x03 /* assume Unix */
#endif
-
-#endif /* HAVE_LIBZ */
-
diff --git a/erts/emulator/drivers/common/inet_drv.c b/erts/emulator/drivers/common/inet_drv.c
index 80937dfcc8..4a861b121c 100644
--- a/erts/emulator/drivers/common/inet_drv.c
+++ b/erts/emulator/drivers/common/inet_drv.c
@@ -854,9 +854,10 @@ static int my_strncasecmp(const char *s1, const char *s2, size_t n)
#define INET_IFNAMSIZ 16
/* INET Ignore states */
-#define INET_IGNORE_NONE 0
-#define INET_IGNORE_READ 1
-#define INET_IGNORE_WRITE 1 << 1
+#define INET_IGNORE_NONE 0
+#define INET_IGNORE_READ (1 << 0)
+#define INET_IGNORE_WRITE (1 << 1)
+#define INET_IGNORE_PASSIVE (1 << 2)
/* Max length of Erlang Term Buffer (for outputting structured terms): */
#ifdef HAVE_SCTP
@@ -8307,11 +8308,19 @@ static ErlDrvSSizeT inet_ctl(inet_descriptor* desc, int cmd, char* buf,
if (*buf == 1 && !desc->is_ignored) {
sock_select(desc, (FD_READ|FD_WRITE|FD_CLOSE|ERL_DRV_USE_NO_CALLBACK), 0);
- desc->is_ignored = INET_IGNORE_READ;
+ if (desc->active)
+ desc->is_ignored = INET_IGNORE_READ;
+ else
+ desc->is_ignored = INET_IGNORE_PASSIVE;
} else if (*buf == 0 && desc->is_ignored) {
- int flags = (FD_READ|FD_CLOSE|((desc->is_ignored & INET_IGNORE_WRITE)?FD_WRITE:0));
+ int flags = FD_CLOSE;
+ if (desc->is_ignored & INET_IGNORE_READ)
+ flags |= FD_READ;
+ if (desc->is_ignored & INET_IGNORE_WRITE)
+ flags |= FD_WRITE;
desc->is_ignored = INET_IGNORE_NONE;
- sock_select(desc, flags, 1);
+ if (flags != FD_CLOSE)
+ sock_select(desc, flags, 1);
} else
return ctl_error(EINVAL, rbuf, rsize);
@@ -8988,6 +8997,8 @@ static ErlDrvSSizeT tcp_inet_ctl(ErlDrvData e, unsigned int cmd,
driver_set_timer(desc->inet.port, timeout);
if (!INETP(desc)->is_ignored)
sock_select(INETP(desc),(FD_READ|FD_CLOSE),1);
+ else
+ INETP(desc)->is_ignored |= INET_IGNORE_READ;
}
}
return ctl_reply(INET_REP_OK, tbuf, 2, rbuf, rsize);
diff --git a/erts/emulator/internal_doc/CarrierMigration.md b/erts/emulator/internal_doc/CarrierMigration.md
new file mode 100644
index 0000000000..b93c11c6ec
--- /dev/null
+++ b/erts/emulator/internal_doc/CarrierMigration.md
@@ -0,0 +1,201 @@
+Carrier Migration
+=================
+
+The ERTS memory allocators manage memory blocks in two types of raw
+memory chunks. We call these chunks of raw memory *carriers*:
+singleblock carriers, which contain only one large block, and
+multiblock carriers, which contain multiple blocks. A carrier is
+typically created using `mmap()` on unix systems. However, how a
+carrier is created is of minor importance. An allocator instance
+typically manages a mixture of single- and multiblock carriers.
+
+Problem
+-------
+
+When a carrier is empty, i.e. contains only one large free block, it
+is deallocated. Since multiblock carriers can contain both allocated
+blocks and free blocks at the same time, an allocator instance might
+be stuck with a large amount of poorly utilized carriers if the memory
+load decreases. After a peak in memory usage it is expected that not
+all memory can be returned, since the blocks still allocated are likely
+to be dispersed over multiple carriers. Such poorly utilized carriers
+can usually be reused if the memory load increases again. However,
+since each scheduler thread manages its own set of allocator
+instances, and memory load is not necessarily connected to CPU load, we
+might get into a situation where there are lots of poorly utilized
+multiblock carriers on some allocator instances while we need to
+allocate new multiblock carriers on other allocator instances. In
+scenarios like this, the demand for multiblock carriers in the system
+might increase at the same time as the actual memory demand in the
+system has decreased, which is both unwanted and quite unexpected for
+the end user.
+
+Solution
+--------
+
+In order to prevent scenarios like this, we've implemented support for
+migration of multiblock carriers between allocator instances of the
+same type.
+
+### Management of Free Blocks ###
+
+In order to be able to remove a carrier from one allocator instance
+and add it to another, we need to be able to move references to the
+free blocks of the carrier between the allocator instances. The
+allocator-instance-specific data structure referring to the free
+blocks it manages often refers to the same carrier from multiple
+places. For example, when the address order best-fit strategy is used,
+this data structure is a binary search tree spanning all carriers that
+the allocator instance manages. Free blocks in one specific carrier
+can be referred to from potentially every other carrier that is
+managed, and the amount of such references can be huge. That is, the
+work of removing the free blocks of such a carrier from the search
+tree will be huge. One way of solving this could be to not migrate
+carriers that contain lots of free blocks, but this would prevent us
+from migrating carriers that potentially need to be migrated in order
+to solve the problem we set out to solve.
+
+By using one data structure of free blocks in each carrier and an
+allocator-instance-wide data structure of carriers managed by the
+allocator instance, the work needed in order to remove and add
+carriers can be kept to a minimum. When migration of carriers is
+enabled on a specific allocator type, we require that an allocation
+strategy with such an implementation is used. Currently we've
+implemented this for three different allocation strategies. All of
+these strategies use a search tree of carriers sorted so that we can
+find the carrier with the lowest address that can satisfy the
+request. Internally in carriers we use yet another search tree that
+implements either address order first fit, address order best fit,
+or best fit. The abbreviations used for these different allocation
+strategies are `aoff`, `aoffcaobf`, and `aoffcbf`.
+
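+As a rough illustration (the actual types live in `erl_alloc_util.c`
+and differ in detail; all names below are made up), the two levels of
+search trees could be sketched like this:
+
+    /* Hypothetical sketch, not the real allocator data structures. */
+    typedef struct free_block {
+        struct free_block *smaller, *larger; /* free blocks inside ONE carrier */
+        unsigned long size;
+    } FreeBlock;
+
+    typedef struct carrier {
+        struct carrier *left, *right;  /* node in the allocator's carrier tree */
+        FreeBlock *free_blocks;        /* root of this carrier's own tree */
+    } Carrier;
+
+    typedef struct allocator_instance {
+        Carrier *carriers;             /* all carriers, sorted by address */
+    } AllocatorInstance;
+
+With this split, unlinking a carrier from `carriers` (and linking it
+into another instance) is a small bounded operation, independent of how
+many free blocks the carrier holds.
+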
+### Carrier Pool ###
+
+In order to migrate carriers between allocator instances we move them
+through a pool of carriers. In order for a carrier migration to
+complete, one scheduler needs to move the carrier into the pool, and
+another scheduler needs to take the carrier out of the pool.
+
+The pool is implemented as a lock-free, circular, doubly linked
+list. The list contains a sentinel which is used as the starting point
+when inserting into, or fetching from, the pool. Carriers in the pool
+are elements in this list.
+
+The list can be modified by all scheduler threads
+simultaneously. During modifications the doubly linked list is allowed
+to get a bit "out of shape". For example, following the `next` pointer
+to the next element and then following the `prev` pointer does not
+always take you back to where you started. The following is however
+always true:
+
+* Repeatedly following `next` pointers will eventually take you to the
+ sentinel.
+* Repeatedly following `prev` pointers will eventually take you to the
+ sentinel.
+* Following a `next` or a `prev` pointer will take you to either an
+ element in the pool, or an element that used to be in the pool.
+
+When inserting a new element we search for a place to insert the
+element by only following `next` pointers, and we always begin by
+skipping the first element encountered. When trying to fetch an
+element we do the same thing, but instead only follow `prev` pointers.
+
+By going in different directions when inserting and fetching, we avoid
+contention between threads inserting and threads fetching as much as
+possible. By skipping one element when we begin searching, we preserve
+the sentinel unmodified as much as possible. This is beneficial since
+all search operations need to read the content of the sentinel. If we
+were to modify the sentinel, the cache line containing the sentinel
+would unnecessarily be bounced between processors.
+
+The `prev` and `next` fields in the elements of the list contain the
+value of the pointer, a modification marker, and a deleted
+marker. Memory operations on these fields are done using atomic memory
+operations. When a thread has set the modification marker in a field,
+no one except the thread that set the marker is allowed to modify the
+field. If multiple modification markers need to be set, we always
+begin with `next` fields followed by `prev` fields, in the order
+following the actual pointers. This guarantees that no deadlocks will
+occur.
+
+When a carrier is being removed from a pool, we mark it with a thread
+progress value that needs to be reached before we are allowed to
+modify the `next` and `prev` fields. That is, until we reach this
+thread progress we are not allowed to insert the carrier into the pool
+again, and we are not allowed to deallocate the carrier. This ensures
+that threads inspecting the pool will always be able to traverse the
+pool and reach valid elements. Once we have reached the thread
+progress value that the carrier was tagged with, we know that no
+threads may have references to it via the pool.
+
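+To make the field layout concrete, one element of the pool list might
+be sketched as below. This is only an illustration with made-up names
+and bit positions; the actual implementation lives in
+`erl_alloc_util.c`:
+
+    #include <stdint.h>
+
+    /* Hypothetical sketch of one packed link field. */
+    #define POOL_FLG_MOD ((uintptr_t) 1 << 0)   /* modification marker */
+    #define POOL_FLG_DEL ((uintptr_t) 1 << 1)   /* deleted marker      */
+    #define POOL_PTR(f)  ((void *) ((f) & ~(uintptr_t) 3))
+
+    typedef struct pool_element {
+        uintptr_t next;  /* pointer | flags, updated with atomic CAS */
+        uintptr_t prev;  /* pointer | flags, updated with atomic CAS */
+    } PoolElement;
+
+A thread that needs to rewrite several links first sets `POOL_FLG_MOD`
+in the affected `next` fields and then in the `prev` fields, in the
+order following the actual pointers, which is what rules out deadlocks.
+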
+### Migration ###
+
+There exists one pool for each allocator type, enabling migration of
+carriers between scheduler-specific allocator instances of the same
+allocator type.
+
+Each allocator instance keeps track of the current utilization of its
+multiblock carriers. When the utilization falls below the "abandon
+carrier utilization limit", it starts to inspect the utilization of the
+current carrier when deallocations are made. If the utilization of the
+carrier also falls below the "abandon carrier utilization limit", it
+unlinks the carrier from its data structure of available free blocks
+and inserts the carrier into the pool.
+
+Since the carrier has been unlinked from the data structure of
+available free blocks, no more allocations will be made in the
+carrier. The allocator instance putting the carrier into the pool,
+however, still has the responsibility of performing deallocations in
+it while it remains in the pool.
+
+Each carrier has a flag field containing information about the
+allocator instance owning the carrier, a flag indicating if the
+carrier is in the pool or not, and a flag indicating if it is busy or
+not. When the carrier is in the pool, the owning allocator instance
+needs to mark it as busy while operating on it. If another thread
+inspects it in order to try to fetch it from the pool, it will abort
+the fetch if it is busy. When fetching the carrier from the pool,
+ownership will change and further deallocations in the carrier will be
+redirected to the new owner using the delayed dealloc functionality.
+
+If a carrier in the pool becomes empty, it will be withdrawn from the
+pool. All carriers that become empty are also always passed to their
+originating allocator instance for deallocation using the delayed
+dealloc functionality. Since carriers will this way always be
+deallocated by the allocator instance that allocated them, the
+underlying functionality of allocating and deallocating carriers can
+remain simple and doesn't have to bother with multiple threads. In a
+NUMA system we will also not mix carriers originating from multiple
+NUMA nodes.
+
+When an allocator instance needs more carrier space, it always begins
+by inspecting its own carriers that are waiting for thread progress
+before they can be deallocated. If no such carrier could be found, it
+then inspects the pool. If no carrier could be fetched from the pool,
+it will allocate a new carrier. Regardless of where the allocator
+instance gets the carrier from, it just links the carrier into its
+data structure of free blocks.
+
+### Result ###
+
+The use of this strategy of abandoning carriers with poor utilization
+and reusing these in allocator instances with an increased carrier
+demand is extremely effective and completely eliminates the problems
+that otherwise sometimes occurred when CPU load dropped while memory
+load did not.
+
+When using the `aoffcaobf` or `aoff` strategies compared to `gf` or
+`bf`, we lose some performance since we get more modifications in the
+data structure of free blocks. This performance penalty is however
+reduced when using the `aoffcbf` strategy. A tradeoff between memory
+consumption and performance is however inevitable, and it is up to
+the user to decide what is most important.
+
+Further work
+------------
+
+It would be quite easy to extend this to allow migration of multiblock
+carriers between all allocator types. More or less the only obstacle
+is maintenance of the statistics information.
+
+
diff --git a/erts/emulator/internal_doc/CodeLoading.md b/erts/emulator/internal_doc/CodeLoading.md
new file mode 100644
index 0000000000..151b9cd57c
--- /dev/null
+++ b/erts/emulator/internal_doc/CodeLoading.md
@@ -0,0 +1,186 @@
+Non-Blocking Code Loading
+=========================
+
+Introduction
+------------
+
+Before OTP R16, when an Erlang code module was loaded, all other
+execution in the VM was halted while the load operation was carried
+out in single threaded mode. This might not be a big problem for
+initial loading of modules during VM boot, but it can be a severe
+problem for availability when upgrading modules or adding new code on
+a VM with running payload. This problem grows with the number of
+cores, as both the time it takes to wait for all schedulers to stop
+and the potential amount of halted ongoing work increase.
+
+In OTP R16, modules are loaded without blocking the VM.
+Erlang processes may continue executing undisturbed in parallel during
+the entire load operation. The code loading is carried out by a normal
+Erlang process that is scheduled like all the others. The load
+operation is completed by making the loaded code visible to all
+processes in a consistent way with one single atomic
+instruction. Non-blocking code loading will improve real-time
+characteristics when modules are loaded/upgraded on a running SMP
+system.
+
+
+The Load Phases
+---------------
+
+The loading of a module is divided into two phases: a *prepare phase*
+and a *finishing phase*. The prepare phase consists of reading the BEAM
+file format and doing all the preparations of the loaded code that can
+easily be done without interference with the running code. The
+finishing phase will make the loaded (and prepared) code accessible
+from the running code. Old module versions (replaced or deleted) will
+also be made inaccessible by the finishing phase.
+
+The prepare phase is designed to allow several "loader" processes to
+prepare separate modules in parallel, while the finishing phase can
+only be done by one loader process at a time. A second loader process
+trying to enter the finishing phase will be suspended until the first
+loader is done. This only blocks the process; the scheduler is
+free to schedule other work while the second loader is waiting. (See
+`erts_try_seize_code_write_permission` and
+`erts_release_code_write_permission`).
+
+The ability to prepare several modules in parallel is not currently
+used as almost all code loading is serialized by the code_server
+process. The BIF interface is however prepared for this.
+
+ erlang:prepare_loading(Module, Code) -> LoaderState
+ erlang:finish_loading([LoaderState])
+
+The idea is that `prepare_loading` could be called in parallel for
+different modules and returns a "magic binary" containing the internal
+state of each prepared module. Function `finish_loading` could take a
+list of such states and do the finishing of all of them in one go.
+
+Currently we use the legacy BIF `erlang:load_module`, which is now
+implemented in Erlang by calling the above two functions in
+sequence. Function `finish_loading` is limited to only accept a list
+with one module state, as we do not yet use the multi-module loading
+feature.
+
+
+The Finishing Sequence
+----------------------
+
+During VM execution, code is accessed through a number of data
+structures. These *code access structures* are
+
+* Export table. One entry for every exported function.
+* Module table. One entry for each loaded module.
+* "beam_catches". Identifies jump destinations for catch instructions.
+* "beam_ranges". Map code address to function and line in source file.
+
+The most frequently used of these structures is the export table, which
+is accessed at run time for every executed external function call to
+get the address of the callee. For performance reasons, we want to
+access all these structures without any overhead from thread
+synchronization. Earlier this was solved with an emergency brake: stop
+the entire VM to mutate these code access structures, and otherwise
+treat them as read-only.
+
+The solution in R16 is instead to *replicate* the code access
+structures. We have one set of active structures read by the running
+code. When new code is loaded, the active structures are copied, the
+copy is updated to include the newly loaded module, and then a switch
+is made to make the updated copy the new active set. The active set is
+identified by a single global atomic variable
+`the_active_code_index`. The switch can thus be made by a single
+atomic write operation. The running code has to read this atomic
+variable when using the active access structures, which means one
+atomic read operation per external function call, for example. The
+performance penalty from this extra atomic read is however very small,
+as it can be done without any memory barriers at all (as described
+below). With this solution we also preserve the transactional feature
+of a load operation. Running code will never see the intermediate
+result of a half-loaded module.
+
+The finishing phase is carried out in the following sequence by the
+BIF `erlang:finish_loading`:
+
+1. Seize exclusive code write permission (suspend process if needed
+ until we get it).
+
+2. Make a full copy of all the active access structures. This copy is
+ called the staging area and is identified by the global atomic
+ variable `the_staging_code_index`.
+
+3. Update all access structures in the staging area to include the
+ newly prepared module.
+
+4. Schedule a thread progress event. That is a time in the future when
+ all schedulers have yielded and executed a full memory barrier.
+
+5. Suspend the loader process.
+
+6. After thread progress, commit the staging area by assigning
+ `the_staging_code_index` to `the_active_code_index`.
+
+7. Release the code write permission allowing other processes to stage
+ new code.
+
+8. Resume the loader process allowing it to return from
+ `erlang:finish_loading`.
+
+
+### Thread Progress
+
+The waiting for thread progress in steps 4-6 is necessary in order for
+processes to read the `the_active_code_index` atomic during normal
+execution without any expensive memory barriers. When we write a new
+value into `the_active_code_index` in step 6, we know that all
+schedulers will see an updated and consistent view of all the new
+active access structures once they become reachable through
+`the_active_code_index`.
+
+The total lack of memory barriers when reading `the_active_code_index`
+has one interesting consequence however. Different processes may see
+the new code at different points in time depending on when different
+cores happen to refresh their hardware caches. This may sound unsafe
+but it actually does not matter. The only property we must guarantee
+is that the ability to see the new code must spread with process
+communication. After receiving a message that was triggered by new
+code, the receiver must be guaranteed to also see the new code. This
+will be guaranteed as all types of process communication involve
+memory barriers in order for the receiver to be sure to read what the
+sender has written. This implicit memory barrier will then also make
+sure that the receiver reads the new value of `the_active_code_index`
+and thereby also sees the new code. This is true for all kinds of
+inter-process communication (TCP, ETS, process name registering,
+tracing, drivers, NIFs, etc.), not just Erlang messages.
+
+### Code Index Reuse
+
+To optimize the copy operation in step 2, code access structures are
+reused. In the current solution we have three sets of code access
+structures, identified by a code index of 0, 1 and 2. These indexes
+are used in a round-robin fashion. Instead of having to initialize a
+completely new copy of all access structures for every load operation,
+we just have to update with the changes that have happened since the
+last two code load operations. We could get by with only two code
+indexes (0 and 1), but that would require yet another round of waiting
+for thread progress before step 2 in the `finish_loading` sequence. We
+cannot start reusing a code index as staging area until we know that
+no lingering scheduler thread is still using it as the active code
+index. With three generations of code indexes, the waiting for thread
+progress in steps 4-6 will give this guarantee for us. Thread progress
+will wait for all running schedulers to reschedule at least one
+time. No ongoing execution reading code access structures reached from
+an old value of `the_active_code_index` can exist after a second round
+of thread progress.
+
+The design choice between two or three generations of code access
+structures is a trade-off between memory consumption and code loading
+latency.
+
+### A Consistent Code View
+
+Some native BIFs may need to get a consistent snapshot view of the
+active code. To do this, it is important to only read
+`the_active_code_index` one time and then use that index value for all
+code accesses during the BIF. If a load operation is executed in
+parallel, reading `the_active_code_index` a second time might result
+in a different value, and thereby a different view of the code.
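+
+As a rough illustration of this rule, a BIF body might be structured as
+in the sketch below. `erts_active_code_ix()` is the real accessor
+(declared in `code_ix.h`), while the two lookup helpers are made up for
+the example; the point is only that the index is read once and then
+passed to every code access:
+
+    /* Hypothetical sketch, not actual emulator code. */
+    BIF_RETTYPE some_introspection_bif(Process* p, Eterm module)
+    {
+        ErtsCodeIndex code_ix = erts_active_code_ix(); /* read ONCE */
+        Module* modp = lookup_module(module, code_ix);
+        Eterm exports = list_exports(p, modp, code_ix); /* same snapshot */
+        BIF_RET(exports);
+    }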
diff --git a/erts/emulator/internal_doc/DelayedDealloc.md b/erts/emulator/internal_doc/DelayedDealloc.md
new file mode 100644
index 0000000000..b7d87b839f
--- /dev/null
+++ b/erts/emulator/internal_doc/DelayedDealloc.md
@@ -0,0 +1,175 @@
+Delayed Dealloc
+===============
+
+Problem
+-------
+
+An easy way to handle memory allocation in a multi-threaded
+environment is to protect the memory allocator with a global lock
+which threads performing memory allocations or deallocations have to
+hold during the whole operation. This solution of course scales
+very poorly, due to heavy lock contention. An improvement of
+this scheme is to use multiple thread-specific instances of such an
+allocator. That is, each thread allocates in its own allocator
+instance, which is protected by a lock. In the general case, references
+to memory need to be passed between threads. When a thread needs to
+deallocate memory that originates from another thread's allocator
+instance, a lock conflict is possible. In a system such as the Erlang
+VM, where memory allocation/deallocation is frequent and references to
+memory also are passed around between threads, this solution will also
+scale poorly due to lock contention.
+
+Functionality Used to Address This Problem
+-----------------------------------------
+
+In order to reduce contention due to locking of allocator instances,
+we introduced completely lock-free instances tied to each scheduler
+thread, and an extra locked instance for other threads. The scheduler
+threads in the system are expected to do the major part of the
+work. Other threads may still be needed but should not perform any
+major and/or time critical work. The limited amount of contention that
+appears on the locked allocator instance can more or less be
+disregarded.
+
+Since we still need to be able to pass references to memory between
+scheduler threads, we need some way to manage this. An allocator
+instance belonging to one scheduler thread is only allowed to be
+manipulated by that scheduler thread. When other threads need to
+deallocate memory originating from a foreign allocator instance, they
+only pass the memory block to a "message box" containing deallocation
+jobs attached to the originating allocator instance. When a scheduler
+thread detects such a deallocation job, it performs the actual
+deallocation.
+
+The "message box" is implemented using a lock free single linked list
+through the memory blocks to deallocate. The order of the elements in
+this list is not important. Insertion of new free blocks will be made
+somewhere near the end of this list. Requirering that the new blocks
+need to be inserted at the end would cause unnecessary contention when
+large amount of memory blocks are inserted simultaneous by multiple
+threads.
+
+The data structure referring to this singly linked list covers two
+cache lines: one cache line containing information about the head of
+the list, and one cache line containing information about the tail of
+the list. This is in order to reduce cache-line ping-ponging of this
+data structure. The head of the list will only be manipulated by the
+thread owning the allocator instance, and the tail will be manipulated
+by other threads inserting deallocation jobs.
+
+### Tail ###
+
+In the tail part of the data structure we find a pointer to the last
+element of the list, or at least something that is near the end of the
+list. In the uncontended case it will point to the end of the list,
+but when simultaneous insert operations are performed it will point to
+something near the end of the list.
+
+When inserting an element, one will try to write a pointer to the new
+element into the next pointer of the element pointed to by the last
+pointer. This is done using an atomic compare and swap that expects
+the next pointer to be `NULL`. If this succeeds, the thread performing
+this operation moves the last pointer to point to the newly inserted
+element.
+
+If the atomic compare and swap described above failed, the last
+pointer didn't point to the last element. In this case we need to
+insert the new element somewhere in between the element that the last
+pointer pointed to and the actual last element. If we do it this way,
+the last pointer will eventually end up at the last element when
+threads stop adding new elements. When trying to insert somewhere near
+the end and failing to do so, the inserting thread sometimes moves to
+the next element and sometimes tries with the same element again. This
+is in order to spread the inserted elements during heavy contention.
+That is, we try to spread the modifications of memory to different
+locations instead of letting all threads continue to try to modify the
+same location in memory.
+
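+A minimal sketch of this insert operation is shown below. It uses C11
+atomics instead of the emulator's `erts_atomic` API, and the names are
+made up; the real code also implements the "step forward or retry the
+same element" heuristic described above:
+
+    #include <stdatomic.h>
+    #include <stddef.h>
+
+    typedef struct ddq_block {
+        _Atomic(struct ddq_block *) next;
+    } DDQBlock;
+
+    /* Append blk somewhere near the end of the list. */
+    static void ddq_enqueue(_Atomic(DDQBlock *) *last, DDQBlock *blk)
+    {
+        DDQBlock *at = atomic_load(last);
+        atomic_store(&blk->next, NULL);
+        for (;;) {
+            DDQBlock *expected = NULL;
+            if (atomic_compare_exchange_strong(&at->next, &expected, blk)) {
+                /* Best effort: move the last pointer forward. It is fine if
+                 * it lags behind; it only needs to point near the end. */
+                atomic_store(last, blk);
+                return;
+            }
+            /* Someone else linked in an element after 'at'; expected now
+             * holds that element, so step towards the real end and retry. */
+            at = expected;
+        }
+    }
+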
+### Head ###
+
+The head contains pointers to the beginning of the list (`head.first`),
+and to the first block which other threads may refer to
+(`head.unref_end`). Blocks between these pointers are only referred to
+by the head part of the data structure, which is only used by the
+thread owning the allocator instance. When these two pointers are not
+equal, the thread owning the allocator instance deallocates block after
+block until `head.first` reaches `head.unref_end`.
+
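+Continuing the sketch from the previous section, the owner-side
+consumption could look roughly like this (illustrative only; the real
+code keeps more bookkeeping and uses the `erts_atomic` API):
+
+    typedef struct ddq_head {
+        DDQBlock *first;      /* oldest block, owner-private            */
+        DDQBlock *unref_end;  /* first block other threads may refer to */
+    } DDQHead;
+
+    /* Owner thread: deallocate every block no other thread can reference. */
+    static void ddq_consume(DDQHead *head, void (*dealloc)(void *))
+    {
+        while (head->first != head->unref_end) {
+            DDQBlock *blk = head->first;
+            head->first = atomic_load(&blk->next); /* link written by inserters */
+            dealloc(blk);
+        }
+    }
+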
+We of course periodically need to move the `head.unref_end` closer to
+the end in order to be able to continue deallocating memory
+blocks. Since all threads inserting new elements in the linked list
+will enter the list using the last pointer, we can use this
+knowledge. If we call `erts_thr_progress_later()` and wait until we
+have reached that thread progress, we know that no managed threads can
+refer to the elements up to the element pointed to by the last pointer
+at the time when we called `erts_thr_progress_later()`. This is because
+all managed threads must have left the code implementing this at least
+once, and they always enter the list via the last pointer. The
+`tail.next` field contains information about the next `head.unref_end`
+pointer and the thread progress that needs to be reached before we can
+move `head.unref_end`.
+
+Unfortunately not only threads managed by the thread progress
+functionality may insert memory blocks. Other threads also need to be
+taken care of. Other threads will not be as frequent users of this
+functionality as managed threads, so using a less efficient scheme for
+them is not that big of a problem. In order to handle unmanaged
+threads we use two reference counters. When an unmanaged thread enters
+this implementation it increments the reference counter currently
+used, and when it leaves this implementation it decrements the same
+reference counter. When the consumer thread calls
+`erts_thr_progress_later()` in order to determine when it is safe to
+move `head.unref_end`, it also swaps reference counters for unmanaged
+threads. The previously current counter represents outstanding
+references from the time up to this point. The new current counter
+represents future references following this point. When the consumer
+thread detects that we have both reached the desired thread progress
+and that the previously current reference counter has reached zero, it
+is safe to move `head.unref_end`.
+
+The reason for using two reference counters is that we need to know
+that the reference counter eventually will reach zero. If we only used
+one reference counter, it could potentially be held above zero forever
+by different unmanaged threads.
+
+### Empty List ###
+
+If no new memory blocks are inserted into the list, it should
+eventually be emptied. All pointers to the list, however, are expected
+to always point to something. This is solved by inserting an empty
+"marker" element, which only has the purpose of being there in the
+absence of other elements. That is, when the list is empty it only
+contains this "marker" element.
+
+### Contention ###
+
+When elements are continuously inserted by threads not owning the
+allocator instance, the thread owning the allocator instance will be
+able to work more or less undisturbed by other threads at the head end
+of the list. At the tail end, large amounts of simultaneous inserts may
+cause contention, but we reduce such contention by spreading inserts
+of new elements near the end instead of requiring all new elements to
+be inserted at the end.
+
+### Schedulers and The Locked Allocator Instance ###
+
+The locked allocator instance for use by non-scheduler threads also
+has a message box for deallocation jobs, just as all the other
+allocator instances. The reason for this is that other threads may
+allocate memory and pass it to a scheduler that then needs to
+deallocate it. We do not want the scheduler to have to wait for the
+lock on this locked instance. Since the locked instances also have
+message boxes for deallocation jobs, the scheduler can just insert the
+job and avoid the locking.
+
+
+### A Benchmark Result ###
+
+When running the ehb benchmark, large amounts of messages are passed
+around between schedulers. All message passing will in one way or
+another cause memory allocation and deallocation. Since messages are
+passed between different schedulers, we will get contention on the
+allocator instances where messages were allocated. With the introduction
+of the delayed dealloc feature, we got a speedup of 25-45%,
+depending on the configuration of the benchmark, when running on a
+relatively new machine with an Intel i7 quad-core processor with
+hyper-threading, using 8 schedulers.
\ No newline at end of file
diff --git a/erts/emulator/internal_doc/PTables.md b/erts/emulator/internal_doc/PTables.md
new file mode 100644
index 0000000000..6fe0e7665d
--- /dev/null
+++ b/erts/emulator/internal_doc/PTables.md
@@ -0,0 +1,356 @@
+Process and Port Tables
+=======================
+
+Problems
+--------
+
+The process table is a mapping from process identifiers to process
+structure pointers. The process structure contains miscellaneous
+information about a process, such as pointers to its heap,
+message queue, etc. When the runtime system needs to operate on a
+process, it looks up the process structure in the process table using
+the process identifier. An example of this is when passing a message
+to a process.
+
+The process table has for a very long time just been an array of
+pointers to process structures. Since process identifiers internally
+in the runtime system are 28-bit integers, it is quite easy to map a
+process identifier to an index into the array. The 28 bits were divided
+into two sets. The least significant set of bits was used as index
+into the array. The most significant set of bits was only used to be
+able to distinguish between a number of identifiers which map to
+the same index in the array. As long as process table sizes that are a
+power of two were used, we had 2^28 unique process identifiers.
+
+When the first SMP support was implemented, the table was still kept
+more or less the same way, but protected by two types of locks: one
+lock that protected the whole table against modifications, and an array
+of locks protecting different parts of the table. The exact locking
+strategy previously used isn't interesting. What is interesting is
+that it suffered from heavy lock contention, especially when lots of
+modifications were being made, but also when only performing lookups.
+
+In order to be able to detect when it is safe to deallocate a
+previously used process structure, reference counting of the structure
+was used. This was also problematic, since simultaneous lookups needed
+to modify the reference counter, which also caused contention on the
+cache line where the reference counter was located. This is because all
+modifications need to be communicated between all involved
+processors.
+
+The port table is very similar to the process table. The major
+difference, at least in concept, is that it is a mapping from port
+identifiers to port structures. It had a similar implementation, but
+with some differences. Instead of being an array of pointers it was an
+array of structures, and instead of being protected by two types of
+locks it was only protected by one global lock. This table also
+suffered from lock contention in various situations.
+
+Solution
+--------
+
+The process table was the major problem to address, since processes are
+much more frequently used than ports. The first implementation only
+implemented this for processes, but since the port table is very
+similar and very similar problems occur on the port table, the process
+table implementation was later generalized so that it could also be
+used to implement the port table. For simplicity I will only talk
+about the process table in the following text, but the same will apply
+to the port table unless otherwise stated.
+
+If we disregard the locking issues, the original solution is very
+appealing. The mapping from process identifier to index into the array
+is very fast, and this property is something we would like to
+keep. The vast majority of operations on these tables are lookups so
+optimizing for lookups is what we want to do.
+
+### Lookup ###
+
+Using a set of bits in the process identifier as index into an array
+seems hard to beat. By replacing the array of pointers with an array
+of our pointer-sized atomic data type, a lookup will consist of the
+following:
+
+1. Mapping the 28-bit integer to an index into the array.
+
+ More about this mapping later.
+
+2. Read the pointer using an atomic memory operation at the determined
+ index in the array.
+
+ On all platforms where we provide atomic memory operations, this is
+ just a `volatile` read, preventing the compiler from using values in
+ registers and forcing a read from memory.
+
+3. Depending on use, issue an appropriate memory barrier.
+
+ A common barrier used is a barrier with acquire semantics. On
+ x86/x86_64 this maps to a compiler barrier preventing the compiler
+ from reordering instructions, but on other hardware some kind of
+ lightweight hardware memory barrier is often also needed.
+
+ When comparing with a locked approach, at least one heavyweight
+ memory barrier will be issued when locking the lock on most, if
+ not all, hardware architectures (including x86/x86_64), and often
+ some kind of lightweight memory barrier will be issued when
+ unlocking the lock.
+
+When looking at this very simple solution with very little overhead,
+you might wonder why we didn't implement it this way from the
+beginning. It all boils down to the read operation of the pointer. We
+need some way to know that it is safe to access the memory pointed
+to. One way of doing this is to place a reference counter in the
+process structure. Incrementing the reference counter at lookup needs
+to be done atomically with the lookup. A lock can typically provide
+this service for us, which was the approach we previously
+used. Another approach could be to co-locate the reference counter
+with the pointer in the table. The major problem with this approach is
+the modifications of the reference counter. This is because these
+modifications would have to be communicated between all involved
+processors, causing contention on the cache line containing the
+reference counter. The new lookup approach above is possible since we
+can use the "thread progress" functionality in order to determine when
+it is safe to deallocate the process structure. We'll get back to this
+when describing deletion in the table.
+
+Using this new lookup approach we won't modify any memory at all,
+which is important. A lookup conceptually only reads memory, and now
+this is true in the implementation as well, which matters from a scalability
+perspective. The previous implementation modified the cache line
+containing the reference counter two times, and the cache line
+containing the corresponding lock two times at each lookup.
+
+### Modifications of the Table ###
+
+A lightweight lookup in the table was the most important feature, but
+we also wanted to improve modifications of the table. The process
+table is modified when a new process is spawned, i.e. a new pointer is
+inserted into the table, and when a process terminates, i.e. a pointer
+is deleted in the table.
+
+Assuming that we spawn fewer processes than the maximum amount of
+unique process identifiers in the system, one has always been able to
+determine the order of process creation just by comparing process
+identifiers. If PidX is larger than PidY, then PidX was created after
+PidY, assuming both identifiers originate from the same node. However,
+since we have a quite limited amount of unique identifiers today
+(2^28), this property cannot be relied upon if we create a large
+amount of processes. But nevertheless, this is a property the system
+has always had.
+
+If we had had a huge amount of unique identifiers available, it
+would have been tempting to drop or modify this ordering property as
+described above. The ordering property could for example be based on
+the scheduler performing the spawn operation. It would have been
+possible to reserve large ranges of identifiers exclusively for each
+scheduler thread, which could be used to minimize the need for
+communication when allocating identifiers. The amount of identifiers
+we have to work with today is, however, not even close to being enough
+for such an approach.
+
+Since we have a limited amount of unique identifiers, we need to be
+careful not to waste them. If previously used identifiers are reused
+too quickly, identifiers originating from terminated processes will
+refer to newly created processes, and mix-ups will occur. The
+previously used approach was quite good at not wasting
+identifiers. Using a modified version of the same approach also lets
+us keep the ordering property that we have always had.
+
+#### Insert ####
+
+The original approach is more or less to search for the next free index or
+slot in the array. The search starts from the last slot allocated. If
+we reach the end of the array we increase a "wrapped counter" and then
+continue the search. The process identifier is constructed by writing
+the index to the least significant set of bits, and the "wrapped
+counter" to the most significant set of bits. The amount of bits in
+each set of bits is decided at boot time, so that the maximum index will
+just fit into the least significant set of bits.
+
+In the modified lock free version of this approach we more or less do
+it the same way, but with some important modifications trying to avoid
+unnecessary contention when multiple schedulers create processes
+simultaneously. Since multiple threads might be trying to search for
+the next free slot at the same time from the same starting point we
+want subsequent slots to be located in different cache lines. Multiple
+schedulers simultaneously writing new pointers into the table are
+therefore very likely to write into adjacent slots. If adjacent slots
+are located in the same cache line, all modifications of this cache line
+need to be communicated between all involved processors, which will be
+very expensive and scale very poorly. By locating adjacent slots in
+different cache lines only true conflicts will trigger communication
+between involved processors, i.e., avoiding false sharing.
+
+A cache line is larger than a pointer, typically 8 or 16 times larger,
+so using one cache line for each slot only containing one pointer
+would be a waste of space. Each cache line will be able to hold a
+fixed amount of slots. The first slot of the table will be the first
+slot of the first cache line, the second slot of the table will be the
+first slot of the second cache line, and so on until we reach the end of the
+array. The next slot after that will be the second slot of the first
+cache line, etc, moving forward one cache line internal slot each time
+we wrap. This way we will be able to fit the same amount of pointers
+into an array of the same size while always keeping adjacent slots in
+different cache lines.
+
+The mapping from identifier to slot or index into the array gets a bit
+more complicated than before. Instead of a `shift` and a bitwise
+`and`, we get two `shift`s, two bitwise `and`s, and an `add` (see
+implementation of `erts_ptab_data2pix()` in `erl_ptab.h`). However, by
+storing this information optimized for lookup we only need a `shift`
+and a bitwise `and` on 32-bit platforms. On 64-bit platforms we have
+enough room for the 28-bit identifier in the least significant
+halfword, and the index in the most significant halfword. In other
+words, we just need to read the most significant halfword to get the
+index. That is, this operation is as fast, or faster than before. The
+downside is that on 32-bit platforms we need to convert this
+information into the 28-bit identifier number when printing, or when
+ordering identifiers from the same node. These operations are,
+however, extremely infrequent compared to lookups.
+
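+As a hedged sketch, a data-to-index mapping with the properties
+described above could look as follows. The constants are illustrative
+only; the real implementation is `erts_ptab_data2pix()` in
+`erl_ptab.h`, which also covers the 64-bit case where the index is
+stored directly in the most significant halfword.
+
+    /* Stripe consecutive identifiers over different cache lines.
+     * Assumes both constants are powers of two. */
+    #define SLOTS_PER_CACHE_LINE 8            /* 64-byte lines, 8-byte slots */
+    #define TABLE_SIZE           (1 << 16)    /* illustrative table size */
+    #define CACHE_LINES          (TABLE_SIZE / SLOTS_PER_CACHE_LINE)
+
+    static unsigned data2ix(unsigned data)
+    {
+        unsigned n    = data & (TABLE_SIZE - 1);   /* wrap to table size    */
+        unsigned line = n & (CACHE_LINES - 1);     /* which cache line      */
+        unsigned slot = n / CACHE_LINES;           /* slot within that line */
+        return line * SLOTS_PER_CACHE_LINE + slot; /* 2 shifts, 2 ands, 1 add */
+    }
+
+Consecutive identifiers then land in different cache lines, while an
+identifier that wraps around the number of cache lines reuses the same
+line one internal slot further in, as described above.
+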
+When we insert a new element in the table we do the following (a C
+sketch of the central steps follows the list):
+
+1. We begin by reserving space in the table by atomically
+ incrementing a counter of processes in the table. If our increment
+ brings the counter above the maximum size of the table, the
+   operation fails and a `system_limit` exception is raised.
+
+2. The table contains a 64-bit atomic variable of the last identifier
+ used. Only the least significant bits will be used when actually
+ creating the identifier. This identifier is where the search
+   begins.
+
+3. We increment the last identifier value used. In order to determine
+   the slot that corresponds to this identifier we call
+   `erts_ptab_data2pix()`, which maps identifier to slot. We read the
+ content of the slot. If the slot is free we try to write a
+ reservation marker using an atomic compare and swap. If this fails
+ we repeat this step until it succeeds.
+
+4. Change the table variable of last identifier used. Since multiple
+ writes might occur at the same time this value may already have
+   been changed to an identifier larger than the one we got. In
+ this case we can continue; otherwise, we need to change it to the
+ identifier we got.
+
+5. We now do some initializations of the process structure that
+ cannot be done before we know the process identifier, and have to
+ be done before we publish the structure in the table. This, for
+ example, includes storing the identifier in the process structure.
+
+6. Now we can publish the structure in the table by writing the
+ pointer to the process structure in the slot previously reserved
+ in 3.
+
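+Below is a hedged C11 sketch of the central steps 2-4, assuming the
+`data2ix()` mapping sketched earlier and `NULL` marking a free slot;
+the real code in `erl_ptab.c` differs in detail.
+
+    #include <stdatomic.h>
+    #include <stdint.h>
+
+    #define TABLE_SIZE (1 << 16)
+    #define RESERVED   ((void *) 1)            /* never a valid pointer */
+
+    extern unsigned data2ix(unsigned data);    /* see the earlier sketch */
+
+    static _Atomic uint64_t last_data;         /* last identifier data used */
+    static _Atomic(void *)  slots[TABLE_SIZE];
+
+    /* Reserve a slot; returns its index and the identifier data used. */
+    static unsigned reserve_slot(uint64_t *datap)
+    {
+        uint64_t data = atomic_load(&last_data);
+        for (;;) {
+            data++;                                   /* candidate id (step 3) */
+            unsigned ix = data2ix((unsigned) data);
+            void *expected = NULL;
+            if (atomic_compare_exchange_strong(&slots[ix], &expected, RESERVED)) {
+                /* Step 4: move last_data forward, unless some other
+                 * thread already moved it past our value. */
+                uint64_t seen = atomic_load(&last_data);
+                while (seen < data
+                       && !atomic_compare_exchange_weak(&last_data, &seen, data))
+                    ;
+                *datap = data;
+                return ix;
+            }
+            /* Slot occupied; try the next identifier. */
+        }
+    }
+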
+Using this approach we keep the properties like identifier ordering,
+and identifier reuse while improving performance and scalability. It
+has one flaw, though: there is no guarantee that the operation will
+terminate. This can quite easily be fixed, and will be fixed in
+the next release. We will get back to this below.
+
+#### Delete ####
+
+When a process terminates, we mark the process as terminated in the
+process structure, the counter of number of processes in the table is
+decreased, and the reference to the process structure is removed by
+writing a `NULL` pointer into the corresponding slot. The scheduler
+thread performing this then schedules a thread progress later job which
+will do the final cleanup and deallocate the process structure. The
+thread progress functionality will make sure that this job will not
+execute until it is certain that all managed threads have dropped all
+references to the process structure.
+
+### BIF Iterating Over the Table ###
+
+The `erlang:processes/0` and `erlang:ports/0` BIFs iterate over the
+tables and return corresponding identifiers. These BIFs should return a
+consistent snapshot of the table content during some time when the BIF
+is executing. In order to implement this we use locking in a strange
+way. We use an "inverted rwlock".
+
+When performing lookups in the table we do not need to bother about
+the locking at all, but when modifying the table we read lock the
+rwlock protecting the table which allows for multiple writers during
+normal operation. When the BIF that iterates over the table needs
+access to the table it write locks the rwlock and reads the content of
+the table. The BIF does not read the whole table in one go but instead
+reads small chunks at a time, only write locking while reading. The actual
+implementation of the BIFs is out of the scope of this document.
+
+An out of the box rwlock will typically suffer from contention on the
+single cache line containing the state of the rwlock even in the case
+we are only read locking. Instead of using such an rwlock, we have our
+own implementation of reader optimized rwlocks which keeps track of
+reader threads in separate thread specific cache lines. This is in
+order to avoid contention on a single cache line. As long as we only do
+read lock operations, each thread only needs to read a global cache line
+and modify its own cache line, thereby minimizing communication between
+involved processors. The iterating BIFs are normally very infrequently
+used, so in the normal case we will only do read lock operations on
+the table global rwlock.
+
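+A minimal sketch of the idea behind such a reader optimized rwlock is
+shown below, assuming C11 atomics and a fixed set of reader threads;
+the real implementation in ERTS is considerably more elaborate.
+
+    #include <stdatomic.h>
+
+    #define MAX_READERS 64
+    #define CACHE_LINE  64
+
+    typedef struct {
+        /* One counter per reader thread, each in its own cache line, so
+         * a read lock only touches the thread's own line plus the
+         * (rarely written) writer flag. */
+        struct {
+            _Atomic int count;
+            char pad[CACHE_LINE - sizeof(_Atomic int)];
+        } rdr[MAX_READERS];
+        _Atomic int writer;              /* set while a writer holds the lock */
+    } rop_rwlock;
+
+    static void read_lock(rop_rwlock *l, int tid)
+    {
+        for (;;) {
+            atomic_fetch_add(&l->rdr[tid].count, 1);
+            if (!atomic_load(&l->writer))
+                return;                  /* common case: no writer */
+            atomic_fetch_sub(&l->rdr[tid].count, 1);
+            while (atomic_load(&l->writer))
+                ;                        /* spin until the writer is done */
+        }
+    }
+
+    static void read_unlock(rop_rwlock *l, int tid)
+    {
+        atomic_fetch_sub(&l->rdr[tid].count, 1);
+    }
+
+    static void write_lock(rop_rwlock *l)
+    {
+        int expected = 0;
+        while (!atomic_compare_exchange_weak(&l->writer, &expected, 1))
+            expected = 0;                /* wait out other writers */
+        for (int i = 0; i < MAX_READERS; i++)
+            while (atomic_load(&l->rdr[i].count))
+                ;                        /* wait for all readers to drain */
+    }
+
+    static void write_unlock(rop_rwlock *l)
+    {
+        atomic_store(&l->writer, 0);
+    }
+
+In the table's case, insert and delete operations take the read lock
+while the iterating BIFs take the write lock, which is why it is
+referred to as an "inverted" rwlock above.
+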
+### Future Improvements ###
+
+The first improvement is to fix the guarantee so that insert
+operations will be guaranteed to terminate. When the operation starts
+we verify that there actually exists a free slot that we can use. The
+problem is that we might not find it since it may move when multiple
+threads modify the table at the same time as we are trying to find the
+slot. The easy fix is to abort the operation if an empty slot could
+not be found in a finite number of operations, and then restart the
+operation under a write lock. This will be implemented in the next
+release, but further work should be done trying to find a better
+solution.
+
+Neither this nor the previous implementation works well when the table
+is nearly full. We will both get long search times for free slots, and
+we will reuse identifiers more frequently since we more frequently
+wrap during the search. These tables work best when the table is much
+larger than the number of simultaneously existing processes. One easy
+improvement is to always have room for more processes than we allow in
+the table. This will also be implemented in the next release, but more
+work should probably be done to find an even better
+solution.
+
+It would also be nice to get rid of the rwlock altogether. The use
+of a reader optimized rwlock makes sure we do not get any contention on
+the lock, but unnecessary memory barriers will be issued due to the
+lock. The main issue here is to modify the iterating BIFs so that they
+do not require exclusive access to the table while reading a sequence of
+slots. In principle this should be rather easy; the code can handle
+sequences of variable sizes, so shrinking the sequence size of slots
+to one would solve the problem. This will, however, need some tweaks
+and modifications of non-trivial code, but is something that should be
+looked at in the future.
+
+By increasing the size of identifiers, at least on 64-bit machines
+(which isn't as easy as it first might seem), we get further room for
+improvement. Besides the obvious improvement of not reusing
+identifiers as fast as we currently do, it makes it possible to
+further avoid contention when inserting elements in the table. At
+least if we drop this ordering property, which isn't that useful
+anyway.
+
+### Some Benchmark Results ###
+
+In order to test modifications of the process table we ran a couple of
+benchmarks where lots of processes are spawned and terminated
+simultaneously, and got a speedup of between 150-200%. Running a
+similar benchmark but with ports we got a speedup of about 130%.
+
+The BIF `erlang:is_process_alive/1` is the closest you can get to a
+process table lookup only. The BIF looks up the process corresponding
+to the process identifier passed as argument, and then checks if it is
+alive. By running multiple processes looping over this BIF checking
+the same process, we get a speedup between 20000-23000%. Conceptually
+this operation only involves read operations. In the implementation
+used in R16B, only read operations are performed as well, while the
+previous implementation needed to lock structures in order to read the
+data, suffering from both lock contention and contention due to
+modifications of cache lines used by lock internal data structures and
+the reference counter on the process being looked up.
+
+The benchmarks were run on a relatively new machine with an Intel i7
+quad core processor with hyper-threading using 8 schedulers. On a
+machine with more communication overhead and/or a larger number of
+logical processors the speedups are expected to be even larger.
diff --git a/erts/emulator/internal_doc/PortSignals.md b/erts/emulator/internal_doc/PortSignals.md
new file mode 100644
index 0000000000..b1afb7c5cb
--- /dev/null
+++ b/erts/emulator/internal_doc/PortSignals.md
@@ -0,0 +1,267 @@
+Port Signals
+============
+
+Problems
+--------
+
+Erlang ports conceptually are very similar to Erlang processes. Erlang
+processes execute Erlang code in the virtual machine, while an Erlang
+port executes native code typically used for communication with the
+outside world. For example, when an Erlang process wants to
+communicate using TCP over the network, it communicates via an Erlang
+port implementing the TCP socket interface in native code. Both Erlang
+processes and ports communicate using asynchronous signaling. The
+native code executed by an Erlang port is a collection of callback
+functions, called a driver. Each callback more or less implements the
+code of a signal to, or from the port.
+
+Even though processes and ports conceptually always have been very
+similar, the implementations have been very different. Originally,
+more or less all port signals were handled synchronously at the time
+they occurred. Very early in the development of the SMP support for
+the runtime system we recognized that this was a huge problem for
+signals between ports and the outside world. That is, I/O events to
+and from the outside world, or I/O signals. This was one of the first
+things that had to be rewritten in order to be able to do I/O in
+parallel at all. The solution was to implement scheduling of these
+signals. I/O signals corresponding to different ports could then be
+executed in parallel on different scheduler threads. Signals from
+processes to ports were not as big of a problem as the I/O signals, and
+the implementation of those was left as it was.
+
+Each port is protected by its own lock to protect against simultaneous
+execution in multiple threads. Previously when a process, executing on
+a scheduler thread, sent a port a signal, it locked the port lock and
+synchronously executed the code corresponding to the signal. If the
+lock was busy, the scheduler thread blocked waiting until it could
+lock the lock. If multiple processes, executing simultaneously on
+different scheduler threads, sent signals to the same port, the schedulers
+suffered from heavy lock contention. Such contention could also occur
+between I/O signals for the port executing on one scheduler thread,
+and a signal from a process to the port executing on another scheduler
+thread. Besides the contention issues, we also lose potential work to
+execute in parallel on different scheduler threads. This is because the
+process sending the *asynchronous* signal is blocked while the code
+implementing the signal is executed synchronously.
+
+Solution
+--------
+
+In order to prevent multiple schedulers from trying to execute signals
+to/from the same port simultaneously, we need to be able to ensure
+that all signals to/from a port are executed in sequence on one
+scheduler. More or less, the only way to do this is to schedule all
+types of signals. Signals corresponding to a port can then be executed
+in sequence by one single scheduler thread. If only one thread tries
+to execute the port, no contention will appear on the port
+lock. Besides getting rid of the contention, processes sending signals
+to the port can also continue execution of their own Erlang code on
+other schedulers at the same time as the signaling code is executing
+on another scheduler.
+
+When implementing this there are a couple of important properties that
+we either need, or want to preserve:
+
+* Signal ordering guarantee. Signals from process `X` to port `Y`,
+ *must* be delivered to `Y` in the same order as sent from `X`.
+
+* Signal latency. Due to the previous synchronous implementation,
+  latency of signals sent from processes to ports has usually been
+  very low. During contention the latency has of course
+  increased. Users expect latency of these signals to be low; a
+ sudden increase in latency would not be appreciated by our users.
+
+* Compatible flow control. Ports have for a very long time had the
+ possibility to use the busy port functionality when implementing
+  flow control. One may argue that this functionality fits very badly
+ with the conceptually completely asynchronous signaling, but the
+ functionality has been there for ages and is expected to be
+ there. When a port sets itself into a busy state, `command`
+ signals should not be delivered, and senders of such signals
+ should suspend until the port sets itself in a not busy state.
+
+### Scheduling of Port Signals ###
+
+A run queue has four queues for processes of different priority and
+one queue for ports. The scheduler thread associated with the run
+queue switches evenly between execution of processes and execution of
+ports while both processes and ports exist in the queue. This is not
+completely true, but not important for this discussion. A port that is
+in a run queue also has a queue of tasks to execute. Each task
+corresponds to an in- or outgoing signal. When the port is selected
+for execution each task will be executed in sequence. The run queue
+locks not only protected the queues of ports, but also the queues of
+port tasks.
+
+Since we go from a state where I/O signals are the only port related
+signals scheduled, to a state where potentially all port related
+signals may be scheduled we may drastically increase the load on the
+run queue lock. The amount of scheduled port tasks very much depends on
+the Erlang application executing, which we do not control, and we do
+not want to get increased contention on the run queue locks. We
+therefore need another approach to protecting the port task queue.
+
+#### Task Queue ####
+
+We chose a "semi locked" approach, with one public locked task queue,
+and a private, lock free, queue like, task data structure. This "semi
+locked" approach is similar to how the message boxes of processes are
+managed. The lock is port specific and only used for protection of
+port tasks, so the run queue lock is now needed in more or less the
+same way for ports as for processes. This ensures that we won't see an
+increased lock contention on run queue locks due to this rewrite of
+the port functionality.
+
+When an executing port runs out of work to execute in the private task
+data structure, it moves the public task queue into the private task
+data structure while holding the lock. Once tasks have been moved to
+the private data structure no lock protects them. This way the port
+can continue working on tasks in the private data structure without
+having to fight for the lock.
+
+I/O signals may however be aborted. This could be solved by letting
+the port specific scheduling lock also protect the private task data
+structure, but then the port very frequently would have to fight with
+others enqueueing new tasks. In order to handle this while keeping the
+private task data structure lock free, we use a similar "non
+aggressive" approach as we use when handling processes that gets
+suspended while in the run queue. Instead of removing the aborted port
+task, we just mark it as aborted using an atomic memory
+operation. When a task is selected for execution, we first verify that
+it has not been aborted. If aborted, we just drop the task.
+
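+A hedged sketch of this "semi locked" approach is shown below, using a
+POSIX mutex for the public lock; the names and layout are illustrative
+and not the actual ERTS structures.
+
+    #include <pthread.h>
+    #include <stdatomic.h>
+    #include <stddef.h>
+
+    typedef struct port_task {
+        struct port_task *next;
+        _Atomic int aborted;        /* set by other threads to abort the task */
+        /* ... signal specific data ... */
+    } PortTask;
+
+    typedef struct {
+        pthread_mutex_t lock;       /* protects only the public queue */
+        PortTask *pub_first, *pub_last;
+        PortTask *priv_first;       /* only touched by the executing thread */
+    } PortTaskQueue;
+
+    /* Called by the executing port when it runs out of private work:
+     * fetch everything enqueued publicly since last time in one short
+     * critical section; afterwards no lock protects those tasks. */
+    static void fetch_public_tasks(PortTaskQueue *q)
+    {
+        pthread_mutex_lock(&q->lock);
+        q->priv_first = q->pub_first;
+        q->pub_first = q->pub_last = NULL;
+        pthread_mutex_unlock(&q->lock);
+    }
+
+    /* Pop the next private task, silently dropping aborted ones. */
+    static PortTask *next_private_task(PortTaskQueue *q)
+    {
+        while (q->priv_first) {
+            PortTask *t = q->priv_first;
+            q->priv_first = t->next;
+            if (!atomic_load(&t->aborted))
+                return t;
+            /* Aborted: just drop it, as described above. */
+        }
+        return NULL;
+    }
+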
+A task that can be aborted is referenced via another data structure from
+other parts of the system, so that a thread that needs to abort the
+task can reach it. In order to be sure to safely deallocate a task
+that is no longer used, we first clear this reference and then use the
+thread progress functionality in order to make sure no references can
+exist to the task. Unfortunately, unmanaged threads might also abort
+tasks. This is very infrequent, but might occur. This could be handled
+locally for each port, but would require extra information in each
+port structure which very infrequently would be used. Instead of
+implementing this in each port, we implemented general functionality
+that can be used from unmanaged threads to delay thread progress.
+
+The private "queue like" task data structure could have been an
+ordinary queue if it wasn't for the busy port functionality. When the
+port has flagged itself as busy, `command` signals are not allowed to
+be delivered and need to be blocked. Other signals sent from the same
+sender following a `command` signal that has been blocked also have to
+be blocked; otherwise, we would violate the ordering guarantee. At the
+same time, other signals that have no dependencies to blocked
+`command` signals are expected to be delivered.
+
+The above requirements makes the private task data structure a rather
+complex data structure. It has a queue of unprocessed tasks, and a
+busy queue. The busy queue contains blocked tasks corresponding to
+`command` signals, and tasks with dependencies to such tasks. The busy
+queue is accompanied by a table of blocked tasks keyed on sender,
+with a reference to the last task in the busy queue from a specific
+sender. This is because we need to check for dependencies when new tasks
+are processed in the queue of unprocessed tasks. When a new task is
+processed that needs to be blocked it isn't enqueued at the end of the
+busy queue, but instead directly after the last task with the same
+sender. This makes it easy to detect when we have tasks
+that no longer have any dependencies to tasks corresponding to
+`command` signals which should be moved out of the busy queue. When
+the port executes, it switches between processing tasks from the busy
+queue, and processing directly from the unprocessed queue based on its
+busy state. When processing directly from the unprocessed queue it
+might, of course, have to move a task into the busy queue instead of
+executing it.
+
+#### Busy Port Queue ####
+
+Since it is the port itself which decides when it is time to enter a
+busy state, it needs to be executing in order to enter the busy
+state. As a result of `command` signals being scheduled, we may get
+into a situation where the port gets flooded by a huge amount of
+`command` signals before it even gets a chance to set itself into a
+busy state. This is because it has not been scheduled for execution
+yet. That is, under these circumstances the busy port functionality
+loses the flow control properties it was intended to provide.
+
+In order to solve this, we introduced a new busy feature, namely "busy
+port queue". The port has a limit of `command` data that is allowed to
+be enqueued in the task queue. When this limit is reached, the port
+will automatically enter a busy port queue state. When in this state,
+senders of `command` signals will be suspended, but `command` signals
+will still be delivered to the port unless it is also in a busy port
+state. This limit is known as the high limit.
+
+There is also a low limit. When the amount of queued `command` data
+falls below this limit and the port is in a busy port queue state, the
+busy port queue state is automatically disabled. The low limit should
+typically be significantly lower than the high limit in order to
+prevent frequent oscillation around the busy port queue state.
+
+By introducing this new busy state we can still provide the flow
+control. Old drivers do not even have to be changed. The limits can,
+however, be configured and even disabled by the port. By default the
+high limit is 8 KB and the low limit is 4 KB.
+
+### Preparation of Signal Send ###
+
+Previously all operations sending signals to ports began by acquiring
+the port lock, then performed preparations for sending the signal, and
+then finally sent the signal. The preparations typically included
+inspecting the state of the port, and preparing the data to pass along
+with the signal. The preparation of data is frequently quite time
+consuming, and does not really depend on the port. That is, we would
+like to do this without having the port lock locked.
+
+In order to improve this, state information was re-organized in the
+port structure, so that we can access it using atomic memory
+operations. This together with the new port table implementation,
+enabled us to lookup the port and inspect the state before acquiring
+the port lock, which in turn made it possible to perform preparations
+of signal data before acquiring the port lock.
+
+### Preserving Low Latency ###
+
+If we disregard the contended cases, we will inevitably get a higher
+latency when scheduling signals for execution at a later time than by
+executing the signal immediately. In order to preserve the low latency
+we now first check if this is a contended case or not. If it is, we
+schedule the signal for later execution; otherwise, we execute the
+signal immediately. It is a contended case if other signals already
+are scheduled on the port, or if we fail to acquire the port
+lock. That is, we will not block waiting for the lock.
+
+Doing it this way we will preserve the low latency at the expense of
+lost potential parallel execution of the signal and other code in the
+process sending the signal. This default behaviour can however be
+changed on a per-port basis or system wide, forcing scheduling of all
+signals from processes to ports that are not part of a synchronous
+communication, that is, an unconditional request/response pair of
+asynchronous signals. In such a case there is no potential for
+parallelism, and thus no point in forcing scheduling of the request signal.
+
+The immediate execution of signals may also cause a scheduler that is
+about to execute scheduled tasks to block waiting for the port
+lock. This is however more or less the only scenario where a scheduler
+needs to wait for the port lock. The maximum time it has to wait is
+the time it takes to execute one signal, since we always schedule
+signals when contention occurs.
+
+### Signal Operations ###
+
+Besides implementing the functionality enabling the scheduling,
+preparation of signal data without port lock, etc, each operation
+sending signals to ports had to be quite extensively re-written. This
+was in order to move all sub-operations that can be done without the lock
+to a place before we have acquired the lock, and also since signals
+now sometimes are executed immediately and sometimes scheduled for
+execution at a later time, which puts different requirements on the data
+to pass along with the signal.
+
+### Some Benchmark Results ###
+
+When running some simple benchmarks where contention only occurs due to
+I/O signals contending with signals from one single process we got a
+speedup of 5-15%. When multiple processes send signals to one single
+port the improvements can be much larger, but the scenario with one
+process contending with I/O is the most common one.
+
+The benchmarks were run on a relatively new machine with an Intel i7
+quad core processor with hyper-threading using 8 schedulers.
\ No newline at end of file
diff --git a/erts/emulator/internal_doc/ProcessManagementOptimizations.md b/erts/emulator/internal_doc/ProcessManagementOptimizations.md
new file mode 100644
index 0000000000..9e83633bef
--- /dev/null
+++ b/erts/emulator/internal_doc/ProcessManagementOptimizations.md
@@ -0,0 +1,172 @@
+Process Management Optimizations
+================================
+
+Problems
+--------
+
+Early versions of the SMP support for the runtime system completely
+relied on locking in order to protect data accesses from multiple
+threads. In some cases this isn't that problematic, but in some cases
+it really is. It complicates the code, ensuring all locks needed are
+actually held, and ensuring that all locks are acquired in such an
+order that no deadlocks occur. Acquiring locks in the right order often
+also involves releasing locks already held, forcing threads to reread data
+already read. A good recipe for creating bugs. Trying to use more
+fine-grained locking in order to increase possible parallelism in the
+system makes the complexity situation even worse. Having to acquire a
+bunch of locks when doing operations also often causes heavy lock
+contention, which causes poor scalability.
+
+Management of processes internally in the runtime system suffered from
+these problems. When changing state on a process, for example from
+`waiting` to `runnable`, a lock on the process needed to be
+locked. When inserting a process into a run queue, a lock
+protecting the run queue also had to be locked. When migrating a process
+from one run queue to another run queue, locks on both run queues and
+on the process had to be locked.
+
+This last example is a quite common case during normal
+operation. For example, when a scheduler thread runs out of work it
+tries to steal work from another scheduler thread's run queue. When
+searching for a victim to steal from there was a lot of juggling of
+run queue locks involved, and the actual theft was finalized by
+having to lock both run queues and the process. When one scheduler
+runs out of work, often others also do, causing lots of lock
+contention.
+
+Solution
+--------
+
+### Process ###
+
+In order to avoid these situations we wanted to be able to do most of
+the fundamental operations on a process without having to acquire a
+lock on the process. Some examples of such fundamental operations are:
+moving a process between run queues, detecting if we need to insert it
+into a run queue or not, detecting if it is alive or not.
+
+All of the information in the process structure that was needed by
+these operations was protected by the process `status` lock, but the
+information was spread across a number of fields. The fields used were
+typically state fields that could contain a small number of different
+states. By reordering this information a bit we could *easily* fit
+this information into a 32-bit wide field of bit flags (only 12 flags
+were needed). By moving this information we could remove five 32-bit
+wide fields and one pointer field from the process structure! The move
+also enabled us to easily read and change the state using atomic
+memory operations.
+
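+A hedged sketch of what such a flag field might look like follows; the
+flag names and helpers are illustrative, the actual definitions live in
+`erl_process.h`.
+
+    #include <stdatomic.h>
+    #include <stdint.h>
+
+    #define PSTATE_FREE       (1u << 0)
+    #define PSTATE_EXITING    (1u << 1)
+    #define PSTATE_SUSPENDED  (1u << 2)
+    #define PSTATE_ACTIVE     (1u << 3)  /* has work to do */
+    #define PSTATE_IN_RUNQ    (1u << 4)  /* already enqueued in a run queue */
+    /* ... roughly a dozen flags in total ... */
+
+    typedef struct {
+        _Atomic uint32_t state;
+        /* ... rest of the process structure ... */
+    } Process;
+
+    /* Mark the process as active and return the previous state, so the
+     * caller can decide, without holding any lock, whether it also has
+     * to enqueue the process in a run queue. */
+    static uint32_t set_active(Process *p)
+    {
+        return atomic_fetch_or(&p->state, PSTATE_ACTIVE);
+    }
+
+    static int needs_enqueue(uint32_t old_state)
+    {
+        return !(old_state & (PSTATE_IN_RUNQ | PSTATE_SUSPENDED | PSTATE_FREE));
+    }
+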
+### Run Queue ###
+
+As with processes we wanted to be able to do the most fundamental
+operations without having to acquire a lock on the run queue. The most
+important one is being able to determine whether we should enqueue a
+process in a specific
+run queue or not. This involves being able to read actual load, and
+load balancing information.
+
+The load balancing functionality is triggered at repeated fixed
+intervals. The load balancing more or less strives to even out run
+queue lengths over the system. When balancing is triggered,
+information about every run queue is gathered, migration paths and
+run queue length limits are set up. Migration paths and limits are
+fixed until the next balancing has been done. The most important
+information about each run queue is the maximum run queue length since
+last balancing. All of this information was previously stored in the
+run queues themselves.
+
+When a process has become runnable, for example due to reception of a
+message, we need to determine which run queue to enqueue it
+in. Previously this at least involved locking the run queue that the
+process currently was assigned to while holding the status lock on the
+process. Depending on load we sometimes also had to acquire a lock on
+another run queue in order to be able to determine if it should be
+migrated to that run queue or not.
+
+In order to be able to decide which run queue to use without having to
+lock any run queues, we moved all fixed balancing information out of
+the run queues into a global memory block. That is, migration paths
+and run queue limits. Information that needs to be frequently updated,
+like for example maximum run queue length, was kept in the run queue,
+but instead of operating on this information under locks we now use
+atomic memory operations when accessing this information. This made it
+possible to first determine which run queue to use, without locking
+any run queues, and when decided, lock the chosen run queue and insert
+the process.
+
+#### Fixed Balancing Information ####
+
+When determining which run queue to choose we need to read the fixed
+balancing information that we moved out of the run queues. This
+information is global, read only between load balancing operations,
+but will be changed during a load balancing. We do not want to
+introduce a global lock that needs to be acquired when accessing this
+information. A reader optimized rwlock could avoid some of the
+overhead since the data is most frequently read, but it would
+unavoidably cause disruption during load balancing, since this
+information is very frequently read. The likelihood of a large
+disruption due to this also increases as the number of schedulers grows.
+
+Instead of using a global lock protecting modifications of this
+information, we write a completely new version of it at each load
+balancing. The new version is written in another memory block than the
+previous one, and published by issuing a write memory barrier and then
+storing a pointer to the new memory block in a global variable using
+an atomic write operation.
+
+When schedulers need to read this information, they read the pointer
+to the currently used information using an atomic read operation, and then
+issue a data dependency read barrier, which on most architectures is a
+no-op. That is, there is very little overhead in getting access to this
+information.
+
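+A hedged C11 sketch of this publish/read pattern is shown below;
+`memory_order_release` plays the role of the write memory barrier and
+`memory_order_consume` the role of the data dependency read barrier.
+The structure contents are illustrative.
+
+    #include <stdatomic.h>
+
+    typedef struct {
+        /* migration paths, run queue length limits, ... */
+        int placeholder;
+    } BalanceInfo;
+
+    static _Atomic(BalanceInfo *) current_balance_info;
+
+    /* The balancing operation fills in a completely new block and then
+     * publishes it; the release store orders the block contents before
+     * the pointer becomes visible to readers. */
+    static void publish_balance_info(BalanceInfo *fresh)
+    {
+        atomic_store_explicit(&current_balance_info, fresh,
+                              memory_order_release);
+    }
+
+    /* Schedulers read the pointer with consume (data dependency)
+     * ordering, which is a no-op on most architectures. */
+    static const BalanceInfo *read_balance_info(void)
+    {
+        return atomic_load_explicit(&current_balance_info,
+                                    memory_order_consume);
+    }
+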
+Instead of allocating and deallocating memory blocks for the different
+versions of the balancing information we keep old memory blocks and
+reuse them when it is safe to do so. In order to be able to determine
+when it is safe to reuse a block we use the thread progress
+functionality, ensuring that no threads have any references to the
+memory block when we reuse it.
+
+#### Be Less Aggressive ####
+
+We implemented a test version using lock free run queues. This
+implementation did however not perform as well as the version using
+one lock per run queue. The reason for this was not investigated
+thoroughly enough to say why this was the case. Since the locked version
+performed better we kept it, at least for now. The lock free version,
+however, forced us to use other solutions, some of which we kept.
+
+Previously when a process that was in a run queue got suspended, we
+removed it from the queue straight away. This involved locking the
+process, locking the run queue, and then unlinking it from the doubly
+linked list implementing the queue. Removing a process from a lock
+free queue gets really complicated. Instead of removing it from the
+queue, we just leave it in the queue and mark it as suspended. When
+later selected for execution we check if the process is suspended; if
+so, we just drop it. During its time in the queue, it might also get
+resumed again; if so, we execute it when it gets selected for execution.
+
+By keeping this part when reverting back to a locked implementation,
+we could remove a pointer field in each process structure, and avoid
+unnecessary operations on the process and the queue which might cause
+contention.
+
+### Combined Modifications ###
+
+By combining the modifications of the process state management and the
+run queue management, we can do large parts of the work involved when
+managing processes with regards to scheduling and migration without
+having any locks locked at all. In these situations we previously had
+to have multiple locks locked. This of course caused a lot of rewrites
+across large parts of the runtime system, but the rewrite both
+simplified code and eliminated locking at a number of places. The
+major benefit is, of course, reduced contention.
+
+### A Benchmark Result ###
+
+When running the chameneosredux benchmark, schedulers frequently run
+out of work trying to steal work from each other. That is, either
+succeeding in migrating, or trying to migrate processes, which is a
+scenario we wanted to optimize. By the introduction of these
+improvements, we got a speedup of 25-35% when running this benchmark
+on a relatively new machine with an Intel i7 quad core processor with
+hyper-threading using 8 schedulers.
\ No newline at end of file
diff --git a/erts/emulator/internal_doc/ThreadProgress.md b/erts/emulator/internal_doc/ThreadProgress.md
new file mode 100644
index 0000000000..6118bcf0f6
--- /dev/null
+++ b/erts/emulator/internal_doc/ThreadProgress.md
@@ -0,0 +1,308 @@
+Thread Progress
+===============
+
+Problems
+--------
+
+### Knowing When Threads Have Completed Accesses to a Data Structure ###
+
+When multiple threads access the same data structure you often need to
+know when all threads have completed their accesses. For example, in
+order to know when it is safe to deallocate the data structure. One
+simple way to accomplish this is to reference count all accesses to
+the data structure. The problem with this approach is that the cache
+line where the reference counter is located needs to be communicated
+between all involved processors. Such communication can become
+extremely expensive and will scale poorly if the reference counter is
+frequently accessed. That is, we want to use some other approach of
+keeping track of threads than reference counting.
+
+### Knowing That Modifications of Memory are Consistently Observed ###
+
+Different hardware architectures have different memory models. Some
+architectures allow very aggressive reordering of memory accesses
+while other architectures only reorder a few specific cases. Common to
+all modern hardware is, however, that some type of reordering will
+occur. When using locks to protect all memory accesses made from
+multiple threads such reorderings will not be visible. The locking
+primitives will ensure that the memory accesses will be ordered. When
+using lock free algorithms one does however have to take this reordering
+made by the hardware into account.
+
+Hardware memory barriers or memory fences are instructions that can be
+used to enforce order between memory accesses. Different hardware
+architectures provide different memory barriers. Lock free algorithms
+need to use memory barriers in order to ensure that memory accesses
+are not reordered in such ways that the algorithm breaks down. Memory
+barriers are also expensive instructions, so you typically want to
+minimize the use of these instructions.
+
+Functionality Used to Address These Problems
+-------------------------------------------
+
+The "thread progress" functionality in the Erlang VM is used to
+address these problems. The name "thread progress" was chosen since we
+want to use it to determine when all threads in a set of threads have
+made such progress so that two specific events have taken place for
+all of them.
+
+The set of threads that we are interested in we call managed
+threads. The managed threads are the only threads that we get any
+information about. These threads *have* to frequently report
+progress. Not all threads in the system are able to frequently report
+progress. Such threads cannot be allowed in the set of managed threads
+and are called unmanaged threads. An example of unmanaged threads is
+the threads in the async thread pool. Async threads can be blocked for
+very long times and thereby be prevented from frequently reporting
+progress. Currently only scheduler threads and a couple of other
+threads are managed threads.
+
+### Thread Progress Events ###
+
+Any thread in the system may use the thread progress functionality in
+order to determine when the following events have occurred at least
+once in all managed threads:
+
+1. The thread has returned from other code to a known state in the
+ thread progress functionality, which is independent of any other
+ code.
+2. The thread has executed a full memory barrier.
+
+These events, of course, need to occur ordered to other memory
+operations. The operation of determining this begins by initiating the
+thread progress operation. The thread that initiated the thread
+progress operation then polls for the completion of the
+operation. Both of these events must occur at least once *after* the
+thread progress operation has been initiated, and at least once
+*before* the operation has completed in each managed thread. This is
+ordered using communication via memory which makes it possible to draw
+conclusions about the memory state after the thread progress operation
+has completed. Let's call the progress made from initiation to
+completion "thread progress".
+
+Assuming that the thread progress functionality is efficient, a lot of
+algorithms can both be simplified and made more efficient than using
+the first approach that comes to mind. A couple of examples follow.
+
+By being able to determine when the first event above has occurred we
+can easily know when all managed threads have completed accesses to a
+data structure. This can be determined the following way. We have an
+implementation of some functionality `F` using a data structure
+`D`. The reference to `D` is always looked up before `D` is being
+accessed, and the reference to `D` is always dropped before we leave
+the code implementing `F`. If we remove the possibility to look up `D`
+and then wait until the first event has occurred in all managed
+threads, no managed threads can have any references to the data
+structure `D`. This could for example have been achieved by using
+reference counting, but the cache line containing the reference
+counter would in this case be ping ponged between all processors
+accessing `D` at every access.
+
+By being able to determine when the second event has occurred it is
+quite easy to do complex modifications of memory that need to be seen
+consistently by other threads without having to resort to locking. By
+doing the modifications, then issuing a full memory barrier, then waiting
+until the second event has occurred in all managed threads, and then
+publishing the modifications, we know that all managed threads reading
+this memory will get a consistent view of the modifications. Managed
+threads reading this will not have to issue any extra memory barriers
+at all.
+
+Implementation of the Thread Progress Functionality
+---------------------------------------------------
+
+### Requirement on the Implementation ###
+
+In order to be able to determine when all managed threads have reached
+the states that we are interested in we need to communicate between
+all involved threads. We of course want to minimize this
+communication.
+
+We also want threads to be able to determine when thread progress has
+been made relatively fast. That is, we need to have some balance
+between communication overhead and time to complete the operation.
+
+### API ###
+
+I will only present the most important functions in the API here.
+
+* `ErtsThrPrgrVal erts_thr_progress_later(void)` - Initiation of the
+  operation. The thread progress value returned can be used when
+  testing for completion of the operation.
+* `int erts_thr_progress_has_reached(ErtsThrPrgrVal val)` - Returns
+ a non zero value when we have reached the thread progress value
+ passed as argument. That is, when a non zero value is returned the
+ operation has completed.
+
+When a thread calls `my_val = erts_thr_progress_later()` and waits for
+`erts_thr_progress_has_reached(my_val)` to return a non zero value it
+knows that thread progress has been made.
+
+While waiting for `erts_thr_progress_has_reached()` to return a non
+zero value we typically do not want to block waiting, but instead want
+to continue working with other stuff. If we run out of other stuff to
+work on we typically do want to block waiting until we have reached
+the thread progress value that we are waiting for. In order to be able
+to do this we provide functionality for waking up a thread when a
+certain thread progress value has been reached:
+
+* `void erts_thr_progress_wakeup(ErtsSchedulerData *esdp,
+ ErtsThrPrgrVal val)` - Request wake up. The calling thread will be
+ woken when thread progress has reached val.
+
+Managed threads frequently need to update their thread progress by
+calling the following functions:
+
+* `int erts_thr_progress_update(ErtsSchedulerData *esdp)` - Update
+ thread progress. If a non zero value is returned
+ `erts_thr_progress_leader_update()` has to be called without any
+ locks held.
+* `int erts_thr_progress_leader_update(ErtsSchedulerData *esdp)` -
+ Leader update thread progress.
+
+Unmanaged threads can delay thread progress being made:
+
+* `ErtsThrPrgrDelayHandle erts_thr_progress_unmanaged_delay(void)` -
+ Delay thread progress.
+* `void erts_thr_progress_unmanaged_continue(ErtsThrPrgrDelayHandle
+ handle)` - Let thread progress continue.
+
+Scheduler threads can schedule an operation to be executed by the
+scheduler itself when thread progress has been made:
+
+* `void erts_schedule_thr_prgr_later_op(void (*funcp)(void *), void
+ *argp, ErtsThrPrgrLaterOp *memp)` - Schedule a call to `funcp`. The
+ call `(*funcp)(argp)` will be executed when thread progress has been
+ made since the call to `erts_schedule_thr_prgr_later_op()` was
+ made.
+
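+As a hedged usage sketch of the last function above: deferring
+deallocation of a shared structure until no managed thread can
+reference it any longer. The `MyThing` type and the unpublish step are
+illustrative, and the relevant ERTS internal headers are assumed to be
+included; only the `ErtsThrPrgrLaterOp` storage and the
+`erts_schedule_thr_prgr_later_op()` call come from the API described
+above.
+
+    #include <stdlib.h>
+
+    typedef struct {
+        ErtsThrPrgrLaterOp lop;    /* storage for the scheduled later op */
+        /* ... the actual payload ... */
+    } MyThing;
+
+    static void deallocate_my_thing(void *vp)
+    {
+        /* Runs once thread progress has been made, i.e. no managed
+         * thread can still hold a reference to the block. */
+        free(vp);
+    }
+
+    static void retire_my_thing(MyThing *thing)
+    {
+        /* First make sure no new references can be created (unpublish
+         * the structure), then let thread progress decide when the old
+         * ones are gone. */
+        erts_schedule_thr_prgr_later_op(deallocate_my_thing,
+                                        thing, &thing->lop);
+    }
+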
+### Implementation ###
+
+In order to determine when the events have happened we use a global
+counter that is incremented when all managed threads have called
+`erts_thr_progress_update()` (or `erts_thr_progress_leader_update()`).
+This could naively be implemented using a "thread confirmed" counter.
+This would however cause an explosion of communication where all
+involved processors would need to communicate with each other at each
+update.
+
+Instead of confirming at a global location, each thread confirms that
+it accepts an increment of the global counter in its own cache
+line. These confirmation cache lines are located in sequence in an
+array, and each confirmation cache line will only be written by one
+and only one thread. One of the managed threads always has the leader
+responsibility. This responsibility may jump between threads, but as
+long as there is some activity in the system one of them will always
+have the leader responsibility. The thread with the leader
+responsibility will call `erts_thr_progress_leader_update()` which
+will check that all other threads have confirmed an increment of the
+global counter before doing the increment of the global counter. The
+leader thread is the only thread reading the confirmation cache
+lines.
+
+Doing it this way we will get a communication pattern of information
+going from the leader thread out to all other managed threads and then
+back from the other threads to the leader thread. This is because only the
+leader thread will write to the global counter and all other threads
+will only read it, and since each confirmation cache line will only
+be written by one specific thread and only read by the leader
+thread. When each managed thread is distributed over different
+processors, the communication between processors will be a reflection
+of this communication pattern between threads.
+
+The value returned from `erts_thr_progress_later()` equals the, by
+this thread, latest confirmed value plus two. The global value may be
+the latest confirmed value or the latest confirmed value minus one. In order
+to be certain that all other managed threads actually will call
+`erts_thr_progress_update()` at least once before we reach the value
+returned from `erts_thr_progress_later()`, the global counter plus one
+is not enough. This is because all other threads may already have confirmed
+current global value plus one at the time when we call
+`erts_thr_progress_later()`. They are however guaranteed not to have
+confirmed global value plus two at this time.
+
+The above described implementation more or less minimizes the
+communication needed before we can increment the global counter. The
+amount of communication in the system due to the thread progress
+functionality however also depends on the frequency with which managed
+threads call `erts_thr_progress_update()`. Today each scheduler thread
+calls `erts_thr_progress_update()` more or less each time an Erlang
+process is scheduled out. One way of further reducing communication
+due to the thread progress functionality is to only call
+`erts_thr_progress_update()` every second, or third time an Erlang
+process is scheduled out, or even less frequently than that. However,
+by doing updates of thread progress less frequently all operations
+depending on the thread progress functionality will also take a longer
+time.
+
+#### Delay of Thread Progress by Unmanaged Threads ####
+
+In order to implement delay of thread progress from unmanaged threads
+we use two reference counters. One being `current` and one being
+`waiting`. When an unmanaged thread wants to delay thread progress it
+increments `current` and gets a handle back to the reference counter
+it incremented. When it later wants to enable continuation of thread
+progress it uses the handle to decrement the reference counter it
+previously incremented.
+
+When the leader thread is about to increment the global thread
+progress counter it verifies that the `waiting` counter is zero before
+doing so. If not zero, the leader isn't allowed to increment the
+global counter, and needs to wait before it can do this. When it is
+zero, it swaps the `waiting` and `current` counters before increasing
+the global counter. From now on the new `waiting` counter will
+decrease, so that it eventually will reach zero, making it possible to
+increment the global counter the next time. If we only used one
+reference counter it would potentially be held above zero forever by
+different unmanaged threads.
+
+When an unmanaged thread increments the `current` counter it will not
+prevent the next increment of the global counter, but instead the
+increment after that. This is sufficient since the global counter
+needs to be incremented two times before thread progress has been
+made. It is also desirable not to prevent the first increment, since
+the likelihood increases that the delay is withdrawn before any
+increment of the global counter is delayed. That is, the operation
+will cause as little disruption as possible.
+
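+A hedged sketch of this two-counter scheme, using C11 atomics, is
+shown below; the real implementation differs in detail, for example in
+how handles are represented and in how the leader waits.
+
+    #include <stdatomic.h>
+
+    static _Atomic long delay_cnt[2];   /* the two reference counters */
+    static _Atomic int  current_ix;     /* which of them is "current" */
+
+    /* Unmanaged thread: delay thread progress. The returned handle
+     * records which counter was incremented. */
+    static int unmanaged_delay(void)
+    {
+        int ix = atomic_load(&current_ix);
+        atomic_fetch_add(&delay_cnt[ix], 1);
+        return ix;
+    }
+
+    /* Unmanaged thread: let thread progress continue. */
+    static void unmanaged_continue(int handle)
+    {
+        atomic_fetch_sub(&delay_cnt[handle], 1);
+    }
+
+    /* Leader: may the global counter be incremented? If so, swap the
+     * roles of the counters so the old "current" becomes "waiting". */
+    static int leader_may_increment(void)
+    {
+        int cur     = atomic_load(&current_ix);
+        int waiting = cur ^ 1;
+        if (atomic_load(&delay_cnt[waiting]) != 0)
+            return 0;                   /* delayed; leader must wait */
+        atomic_store(&current_ix, waiting);
+        return 1;
+    }
+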
+However, this feature of delaying thread progress from unmanaged
+threads should preferably be used as little as possible, since heavy
+use of it will cause contention on the reference counter cache
+lines. The functionality is however very useful in code which normally
+only executes in managed threads, but which may under some infrequent
+circumstances be executed in other threads.
+
+#### Overhead ####
+
+The overhead caused by the thread progress functionality is more or
+less fixed using the same amount of schedulers regardless of the
+number of uses of the functionality. Already today quite a lot of
+functionality uses it, and we plan to use it even more. When rewriting
+old implementations of ERTS internal functionality to use the thread
+progress functionality, this implies removing communication in the old
+implementation. Otherwise there is simply no point in rewriting the old
+implementation to use the thread progress functionality. Since the
+thread progress overhead is more or less fixed, the rewrite will cause
+a reduction of the total communication in the system.
+
+##### An Example #####
+
+The main structure of an ETS table was originally managed using
+reference counting. Already a long time ago we replaced this strategy
+since the reference counter caused contention on each access of the
+table. The solution used was to schedule "confirm deletion" jobs on
+each scheduler in order to know when it was safe to deallocate the
+table structure of a removed table. These confirm deletion jobs needed
+to be allocated. That is, we had to allocate and deallocate as many
+blocks as schedulers in order to deallocate one block. This of course
+was quite an expensive operation, but we only needed to do this once
+when removing a table. It was more important to get rid of the
+contention on the reference counter which was present on every
+operation on the table.
+
+When the thread progress functionality had been introduced, we could
+remove the code implementing the "confirm deletion" jobs, and then
+just schedule a thread progress later operation which deallocates the
+structure. Besides simplifying the code a lot, we got an increase of
+more than 10% of the number of transactions per second handled on a
+mnesia tpcb benchmark executing on a quad core machine.
diff --git a/erts/emulator/internal_doc/Tracing.md b/erts/emulator/internal_doc/Tracing.md
new file mode 100644
index 0000000000..30bc5327a7
--- /dev/null
+++ b/erts/emulator/internal_doc/Tracing.md
@@ -0,0 +1,220 @@
+Non-blocking trace setting
+==========================
+
+Introduction
+------------
+
+Before OTP R16 when trace settings were changed by `erlang:trace_pattern`,
+all other execution in the VM was halted while the trace operation
+was carried out in single threaded mode. Similar to code loading, this
+can impose a severe problem for availability that grows with the
+number of cores.
+
+In OTP R16, trace breakpoints are set in the code without blocking the
+VM. Erlang processes may continue executing undisturbed in parallel
+during the entire operation. The same base technique is used as for
+code loading. A staging area of breakpoints is prepared and then made
+active with a single atomic operation.
+
+
+Redesign of Breakpoint Wheel
+----------------------------
+
+To make it easier to manage breakpoints without single threaded mode a
+redesign of the breakpoint mechanism has been made. The old
+"breakpoint wheel" data structure was a circular double-linked list of
+breakpoints for each instrumented function. It was invented before the
+SMP emulator. To support it in the SMP emulator, it was essentially
+expanded to one breakpoint wheel per scheduler. As more breakpoint
+types have been added, the implementation has become messy and hard
+to understand and maintain.
+
+In the new design the old wheel was dropped and instead replaced by
+one struct (`GenericBp`) to hold the data for all types of breakpoints
+for each instrumented function. A bit-flag field is used to indicate
+what different types of break actions are enabled.
+
+
+Same Same but Different
+-----------------------
+Even though `trace_pattern` uses the same technique as the non-blocking
+code loading with replicated generations of data structures and an
+atomic switch, the implementations are quite separate from each
+other. One initial idea was to use the existing mechanism of code
+loading to do a dummy load operation that would make a copy of the
+affected modules. That copy could then be instrumented with
+breakpoints before making it reachable with the same atomic switch as
+done for code loading. This approach seems straight forward but has a
+number of shortcomings, one being the large memory footprint when many
+modules are instrumented. Another problem is how execution will reach
+the new instrumented code. Normally loaded code can only be reached
+through external functions calls. Trace settings must be activated
+instantaneously without the need of external function calls.
+
+The chosen solution is instead for tracing to use the technique of
+replication applied on the data structures for breakpoints. Two
+generations of breakpoints are kept and identified by an index of 0 and
+1. The global atomic variable `erts_active_bp_index` will determine
+which generation of breakpoints running code will use.
+
+### Atomicity Without Atomic Operations
+
+Not using the code loading generations (or any other code duplication)
+means that `trace_pattern` must at some point write to the active beam
+code in order for running processes to reach the staged breakpoint
+structures. This can be done with one single atomic write operation
+per instrumented function. The beam instruction words are however read
+with normal memory loads and not through the atomic API. The only
+guarantee we need is that the written instruction word is seen as
+atomic: either fully written or not at all. This is true for word
+aligned write operations on all hardware architectures we use.
+
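+As a sketch of the two memory accesses involved (building on the struct
+sketch above, with C11 atomics standing in for the ERTS atomic API and
+invented function names):
+
+    #include <stdatomic.h>
+
+    extern atomic_uint active_bp_index;  /* stands in for erts_active_bp_index */
+
+    /* Writer side: publish the breakpoint instruction with one word-aligned
+     * store; the hardware guarantees it is seen fully written or not at all. */
+    static void publish_breakpoint(BeamInstr *first_instr, BeamInstr bp_instr)
+    {
+        *first_instr = bp_instr;
+    }
+
+    /* Reader side: a relaxed load of the active index selects which
+     * generation of the breakpoint data to execute. */
+    static GenericBpData *breakpoint_to_run(GenericBp *bp)
+    {
+        unsigned ix = atomic_load_explicit(&active_bp_index,
+                                           memory_order_relaxed);
+        return &bp->data[ix];
+    }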
+
+Adding a new Breakpoint
+-----------------------
+This is a simplified sequence describing what `trace_pattern` goes
+through when adding a new breakpoint.
+
+1. Seize exclusive code write permission (suspend process until we get it).
+
+2. Allocate breakpoint structure `GenericBp` including both generations.
+   Set the active part as disabled with a zeroed flag field. Save the original
+ instruction word in the breakpoint.
+
+3. Write a pointer to the breakpoint at offset -4 from the first
+ instruction "func_info" header.
+
+4. Set the staging part of the breakpoint as enabled with specified
+ breakpoint data.
+
+5. Wait for thread progress.
+
+6. Write an `op_i_generic_breakpoint` as the first instruction for the function.
+ This instruction will execute the breakpoint that it finds at offset -4.
+
+7. Wait for thread progress.
+
+8. Commit the breakpoint by switching `erts_active_bp_index`.
+
+9. Wait for thread progress.
+
+10. Prepare for the next call to `trace_pattern` by updating the new staging part
+    (the old active) of the breakpoint to be identical to the new active part.
+
+11. Release code write permission and return from `trace_pattern`.
+
+
+The code write permission "lock" seized in step 1 is the same as the one used
+by code loading. It ensures that only one process at a time can
+stage new trace settings, but it also prevents concurrent code
+loading and makes sure we see a consistent view of the beam code during
+the entire sequence.
+
+Between steps 6 and 8, running processes might execute the written
+`op_i_generic_breakpoint` instruction. They will get the breakpoint
+structure written in step 3, read `erts_active_bp_index` and execute
+the corresponding part of the breakpoint. Before the switch in step 8
+becomes visible, however, they will execute the disabled part of the
+breakpoint structure and do nothing other than execute the saved
+original instruction.
+
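+Condensed into pseudo-C for reference, with every helper name below invented
+for the illustration (the real code is structured differently):
+
+    extern void seize_code_write_permission(void);    /* fictitious helpers */
+    extern void release_code_write_permission(void);
+    extern void wait_thread_progress(void);
+
+    void add_breakpoint_sketch(void)
+    {
+        seize_code_write_permission();        /* step 1 */
+        /* steps 2-4: allocate the GenericBp, save the original instruction,
+         * point the func_info header at it, enable the staging part */
+        wait_thread_progress();               /* step 5 */
+        /* step 6: single word store of op_i_generic_breakpoint */
+        wait_thread_progress();               /* step 7 */
+        /* step 8: switch erts_active_bp_index */
+        wait_thread_progress();               /* step 9 */
+        /* step 10: copy the new active part into the new staging part */
+        release_code_write_permission();      /* step 11 */
+    }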
+
+Updating and Removing Breakpoints
+---------------------------------
+
+The above sequence only described adding a new breakpoint. We do
+basically the same sequence to update the settings of an existing
+breakpoint, except that steps 2, 3 and 6 can be skipped as they have
+already been done.
+
+To remove a breakpoint some more steps are needed. The idea is to
+first stage the breakpoint as disabled, do the switch, wait for thread
+progress and then remove the disabled breakpoint by restoring the
+original beam instruction.
+
+Here is a more complete sequence that covers adding, updating
+and removing breakpoints.
+
+1. Seize exclusive code write permission (suspend process until we get it).
+
+2. Allocate new breakpoint structures with a disabled active part and
+ the original beam instruction. Write a pointer to the breakpoint in
+ "func_info" header at offset -4.
+
+3. Update the staging part of all affected breakpoints. Disable
+ breakpoints that are to be removed.
+
+4. Wait for thread progress.
+
+5. Write an `op_i_generic_breakpoint` as the first instruction for all
+ functions with new breakpoints.
+
+6. Wait for thread progress.
+
+7. Commit all staged breakpoints by switching `erts_active_bp_index`.
+
+8. Wait for thread progress.
+
+9. Restore the original beam instruction for disabled breakpoints.
+
+10. Wait for thread progress.
+
+11. Prepare for the next call to `trace_pattern` by updating the new
+ staging area (the old active) for all enabled breakpoints.
+
+12. Deallocate disabled breakpoint structures.
+
+13. Release code write permission and return from `trace_pattern`.
+
+
+### All that Waiting for Thread Progress
+
+There are four rounds of waiting for thread progress in the above
+sequence. In the code loading sequence we sacrificed the memory overhead
+of three generations to avoid a second round of thread progress. The
+latency of `trace_pattern` should however not be such a big problem,
+as it is normally not called in a rapid sequence.
+
+The waiting in step 4 is to make sure all threads will see an updated
+view of the breakpoint structures once they become reachable through
+the `op_i_generic_breakpoint` instruction written in step 5.
+
+The waiting in step 6 is to make the activation of the new trace
+settings "as atomic as possible". Different cores might see the new
+value of `erts_active_bp_index` at different times as it is read
+without any memory barrier. But this is the best we can do without
+more expensive thread synchronization.
+
+The waiting in step 8 is to make sure we don't restore the original
+beam instructions for disabled breakpoints until we know that no
+thread is still accessing the old enabled part of a disabled
+breakpoint.
+
+The waiting in step 10 is to make sure no lingering thread is still
+accessing disabled breakpoint structures to be deallocated in step
+12.
+
+
+Global Tracing
+--------------
+
+Call tracing with the `global` option only affects external function
+calls. This was earlier handled by inserting a special trace
+instruction in export entries without the use of breakpoints. With the
+new non-blocking tracing we want to avoid special handling for global
+tracing and make use of the staging and atomic switching within the
+breakpoint mechanism. The solution was to create the same type of
+breakpoint structure for a global call trace. The difference from local
+tracing is that we insert the `op_i_generic_breakpoint` instruction
+(with its pointer at offset -4) in the export entry rather than in the
+code.
+
+
+Future work
+-----------
+
+We still go to single threaded mode when new code is loaded for a
+module that is traced, or when code is loaded while there is a default
+trace pattern set. That is not impossible to fix, but it requires
+much closer cooperation between the tracing BIFs and the loader BIFs.
diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c
index 61f9f6a59a..59e34eb819 100644
--- a/erts/emulator/sys/unix/sys.c
+++ b/erts/emulator/sys/unix/sys.c
@@ -547,6 +547,25 @@ erts_sys_pre_init(void)
#endif
#endif /* USE_THREADS */
erts_smp_atomic_init_nob(&sys_misc_mem_sz, 0);
+
+ {
+ /*
+ * Unfortunately we depend on fd 0,1,2 in the old shell code.
+ * So if for some reason we do not have those open when we start
+ * we have to open them here. Not doing this can cause the emulator
+ * to deadlock when reaping the fd_driver ports :(
+ */
+ int fd;
+ /* Make sure fd 0 is open */
+ if ((fd = open("/dev/null", O_RDONLY)) != 0)
+ close(fd);
+ /* Make sure fds 1 and 2 are open */
+ while (fd < 3) {
+ fd = open("/dev/null", O_WRONLY);
+ }
+ close(fd);
+ }
+
}
void
diff --git a/erts/emulator/test/binary_SUITE.erl b/erts/emulator/test/binary_SUITE.erl
index bce4278337..a390c536bb 100644
--- a/erts/emulator/test/binary_SUITE.erl
+++ b/erts/emulator/test/binary_SUITE.erl
@@ -1,7 +1,7 @@
%%
%% %CopyrightBegin%
%%
-%% Copyright Ericsson AB 1997-2013. All Rights Reserved.
+%% Copyright Ericsson AB 1997-2014. All Rights Reserved.
%%
%% The contents of this file are subject to the Erlang Public License,
%% Version 1.1, (the "License"); you may not use this file except in
@@ -58,10 +58,10 @@
ordering/1,unaligned_order/1,gc_test/1,
bit_sized_binary_sizes/1,
otp_6817/1,deep/1,obsolete_funs/1,robustness/1,otp_8117/1,
- otp_8180/1, ttb_trap/1]).
+ otp_8180/1, trapping/1]).
%% Internal exports.
--export([sleeper/0,ttb_loop/2]).
+-export([sleeper/0,trapping_loop/4]).
suite() -> [{ct_hooks,[ts_install_cth]},
{timetrap,{minutes,2}}].
@@ -76,7 +76,7 @@ all() ->
bad_term_to_binary, more_bad_terms, otp_5484, otp_5933,
ordering, unaligned_order, gc_test,
bit_sized_binary_sizes, otp_6817, otp_8117, deep,
- obsolete_funs, robustness, otp_8180, ttb_trap].
+ obsolete_funs, robustness, otp_8180, trapping].
groups() ->
[].
@@ -506,8 +506,8 @@ external_size(Config) when is_list(Config) ->
io:format("Unaligned size: ~p\n", [Sz2]),
?line ?t:fail()
end,
- ?line erlang:external_size(Bin) =:= erlang:external_size(Bin, [{minor_version, 1}]),
- ?line erlang:external_size(Unaligned) =:= erlang:external_size(Unaligned, [{minor_version, 1}]).
+ true = (erlang:external_size(Bin) =:= erlang:external_size(Bin, [{minor_version, 1}])),
+ true = (erlang:external_size(Unaligned) =:= erlang:external_size(Unaligned, [{minor_version, 1}])).
external_size_1(Term, Size0, Limit) when Size0 < Limit ->
case erlang:external_size(Term) of
@@ -1241,16 +1241,27 @@ bsbs_1(A) ->
Bin = binary_to_term_stress(<<131,$M,5:32,A,0,0,0,0,0>>),
BinSize = bit_size(Bin).
+%% lists:foldl(_,_,lists:seq(_,_)) with less heap consumption
+lists_foldl_seq(Fun, Acc0, N, To) when N =< To ->
+ Acc1 = Fun(N, Acc0),
+ lists_foldl_seq(Fun, Acc1, N+1, To);
+
+lists_foldl_seq(_, Acc, _, _) ->
+ Acc.
+
deep(Config) when is_list(Config) ->
- ?line deep_roundtrip(lists:foldl(fun(E, A) ->
- [E,A]
- end, [], lists:seq(1, 1000000))),
- ?line deep_roundtrip(lists:foldl(fun(E, A) ->
- {E,A}
- end, [], lists:seq(1, 1000000))),
- ?line deep_roundtrip(lists:foldl(fun(E, A) ->
- fun() -> {E,A} end
- end, [], lists:seq(1, 1000000))),
+ deep_roundtrip(lists_foldl_seq(fun(E, A) ->
+ [E,A]
+ end, [], 1, 1000000)),
+ erlang:garbage_collect(),
+ deep_roundtrip(lists_foldl_seq(fun(E, A) ->
+ {E,A}
+ end, [], 1, 1000000)),
+ erlang:garbage_collect(),
+ deep_roundtrip(lists_foldl_seq(fun(E, A) ->
+ fun() -> {E,A} end
+ end, [], 1, 1000000)),
+ erlang:garbage_collect(),
ok.
deep_roundtrip(T) ->
@@ -1334,36 +1345,44 @@ run_otp_8180(Name) ->
end || Bin <- Bins],
ok.
-%% Test that exit and GC during term_to_binary trap does not crash.
-ttb_trap(Config) when is_list(Config)->
- case erlang:system_info(wordsize) of
- N when N < 8 ->
- {skipped, "Only on 64bit machines"};
- _ ->
- do_ttb_trap(5)
- end.
+%% Test that exit and GC during trapping term_to_binary and binary_to_term
+%% does not crash.
+trapping(Config) when is_list(Config)->
+ do_trapping(5, term_to_binary,
+ fun() -> [lists:duplicate(2000000,2000000)] end),
+ do_trapping(5, binary_to_term,
+ fun() -> [term_to_binary(lists:duplicate(2000000,2000000))] end).
-do_ttb_trap(0) ->
+do_trapping(0, _, _) ->
ok;
-do_ttb_trap(N) ->
- Pid = spawn(?MODULE,ttb_loop,[1000,self()]),
+do_trapping(N, Bif, ArgFun) ->
+ io:format("N=~p: Do ~p ~s gc.\n", [N, Bif, case N rem 2 of 0 -> "with"; 1 -> "without" end]),
+ Pid = spawn(?MODULE,trapping_loop,[Bif, ArgFun, 1000, self()]),
receive ok -> ok end,
receive after 100 -> ok end,
- erlang:garbage_collect(Pid),
- receive after 100 -> ok end,
+ Ref = make_ref(),
+ case N rem 2 of
+ 0 -> erlang:garbage_collect(Pid, [{async,Ref}]),
+ receive after 100 -> ok end;
+ 1 -> void
+ end,
exit(Pid,kill),
+ case N rem 2 of
+ 0 -> receive {garbage_collect, Ref, _} -> ok end;
+ 1 -> void
+ end,
receive after 1 -> ok end,
- do_ttb_trap(N-1).
+ do_trapping(N-1, Bif, ArgFun).
-ttb_loop(N,Pid) ->
- Term = lists:duplicate(2000000,2000000),
+trapping_loop(Bif, ArgFun, N, Pid) ->
+ Args = ArgFun(),
Pid ! ok,
- ttb_loop2(N,Term).
-ttb_loop2(0,_T) ->
+ trapping_loop2(Bif,Args,N).
+trapping_loop2(_,_,0) ->
ok;
-ttb_loop2(N,T) ->
- apply(erlang,term_to_binary,[T]),
- ttb_loop2(N-1,T).
+trapping_loop2(Bif,Args,N) ->
+ apply(erlang,Bif,Args),
+ trapping_loop2(Bif, Args, N-1).
%% Utilities.
diff --git a/erts/emulator/test/driver_SUITE.erl b/erts/emulator/test/driver_SUITE.erl
index 7087542899..06211406b4 100644
--- a/erts/emulator/test/driver_SUITE.erl
+++ b/erts/emulator/test/driver_SUITE.erl
@@ -2075,6 +2075,21 @@ thr_msg_blast(Config) when is_list(Config) ->
Res
end.
+-define(IN_RANGE(LoW_, VaLuE_, HiGh_),
+ case in_range(LoW_, VaLuE_, HiGh_) of
+ true -> ok;
+ false ->
+ case erlang:system_info(lock_checking) of
+ true ->
+ ?t:format("~p:~p: Ignore bad sched count due to "
+ "lock checking~n",
+ [?MODULE,?LINE]);
+ false ->
+ ?t:fail({unexpected_sched_counts, VaLuE_})
+ end
+ end).
+
+
consume_timeslice(Config) when is_list(Config) ->
%%
%% Verify that erl_drv_consume_timeslice() works.
@@ -2131,15 +2146,8 @@ consume_timeslice(Config) when is_list(Config) ->
Proc1 ! Go,
wait_command_msgs(Port, 10),
[{Port, Sprt1}, {Proc1, Sproc1}] = count_pp_sched_stop([Port, Proc1]),
- case Sprt1 of
- 10 ->
- true = in_range(5, Sproc1-10, 7);
- _ ->
- case erlang:system_info(lock_checking) of
- true -> ?t:format("Ignore bad sched count due to lock checking", []);
- false -> ?t:fail({unexpected_sched_counts, Sprt1, Sproc1})
- end
- end,
+ ?IN_RANGE(10, Sprt1, 10),
+ ?IN_RANGE(5, Sproc1-10, 7),
"disabled" = port_control(Port, $D, ""),
Proc2 = spawn_link(fun () ->
@@ -2160,15 +2168,8 @@ consume_timeslice(Config) when is_list(Config) ->
Proc2 ! Go,
wait_command_msgs(Port, 10),
[{Port, Sprt2}, {Proc2, Sproc2}] = count_pp_sched_stop([Port, Proc2]),
- case Sprt2 of
- 10 ->
- true = in_range(1, Sproc2-10, 2);
- _ ->
- case erlang:system_info(lock_checking) of
- true -> ?t:format("Ignore bad sched count due to lock checking", []);
- false -> ?t:fail({unexpected_sched_counts, Sprt2, Sproc2})
- end
- end,
+ ?IN_RANGE(10, Sprt2, 10),
+ ?IN_RANGE(1, Sproc2-10, 2),
"enabled" = port_control(Port, $E, ""),
Proc3 = spawn_link(fun () ->
@@ -2188,15 +2189,8 @@ consume_timeslice(Config) when is_list(Config) ->
Proc3 ! Go,
wait_command_msgs(Port, 10),
[{Port, Sprt3}, {Proc3, Sproc3}] = count_pp_sched_stop([Port, Proc3]),
- case Sprt3 of
- 10 ->
- true = in_range(5, Sproc3-10, 7);
- _ ->
- case erlang:system_info(lock_checking) of
- true -> ?t:format("Ignore bad sched count due to lock checking", []);
- false -> ?t:fail({unexpected_sched_counts, Sprt3, Sproc3})
- end
- end,
+ ?IN_RANGE(10, Sprt3, 10),
+ ?IN_RANGE(5, Sproc3-10, 7),
"disabled" = port_control(Port, $D, ""),
Proc4 = spawn_link(fun () ->
@@ -2216,15 +2210,8 @@ consume_timeslice(Config) when is_list(Config) ->
Proc4 ! Go,
wait_command_msgs(Port, 10),
[{Port, Sprt4}, {Proc4, Sproc4}] = count_pp_sched_stop([Port, Proc4]),
- case Sprt4 of
- 10 ->
- true = in_range(1, Sproc4-10, 2);
- _ ->
- case erlang:system_info(lock_checking) of
- true -> ?t:format("Ignore bad sched count due to lock checking", []);
- false -> ?t:fail({unexpected_sched_counts, Sprt4, Sproc4})
- end
- end,
+ ?IN_RANGE(10, Sprt4, 10),
+ ?IN_RANGE(1, Sproc4-10, 2),
SOnl = erlang:system_info(schedulers_online),
%% If only one scheduler use port with parallelism set to true,
@@ -2272,8 +2259,8 @@ consume_timeslice(Config) when is_list(Config) ->
wait_procs_exit([W5, Proc5]),
wait_command_msgs(Port2, 10),
[{Port2, Sprt5}, {Proc5, Sproc5}] = count_pp_sched_stop([Port2, Proc5]),
- true = in_range(2, Sproc5, 3),
- true = in_range(7, Sprt5, 20),
+ ?IN_RANGE(2, Sproc5, 3),
+ ?IN_RANGE(6, Sprt5, 20),
count_pp_sched_start(),
"disabled" = port_control(Port2, $D, ""),
@@ -2307,8 +2294,8 @@ consume_timeslice(Config) when is_list(Config) ->
wait_procs_exit([W6, Proc6]),
wait_command_msgs(Port2, 10),
[{Port2, Sprt6}, {Proc6, Sproc6}] = count_pp_sched_stop([Port2, Proc6]),
- true = in_range(2, Sproc6, 3),
- true = in_range(3, Sprt6, 6),
+ ?IN_RANGE(2, Sproc6, 3),
+ ?IN_RANGE(2, Sprt6, 6),
process_flag(scheduler, 0),
@@ -2316,6 +2303,7 @@ consume_timeslice(Config) when is_list(Config) ->
receive {Port2, closed} -> ok end,
ok.
+
wait_command_msgs(_, 0) ->
ok;
wait_command_msgs(Port, N) ->
diff --git a/erts/emulator/test/exception_SUITE.erl b/erts/emulator/test/exception_SUITE.erl
index 109cec25cb..09a7a87a9a 100644
--- a/erts/emulator/test/exception_SUITE.erl
+++ b/erts/emulator/test/exception_SUITE.erl
@@ -589,6 +589,13 @@ line_numbers(Config) when is_list(Config) ->
[{file,ModFile},{line,_}]}|_]}} =
(catch build_binary2(8, bad_binary)),
+ <<"abc",357:16>> = build_binary3(<<"abc">>),
+ {'EXIT',{badarg,[{?MODULE,build_binary3,1,
+ [{file,"bit_syntax.erl"},{line,72511}]},
+ {?MODULE,line_numbers,1,
+ [{file,ModFile},{line,_}]}|_]}} =
+ (catch build_binary3(no_binary)),
+
{'EXIT',{function_clause,
[{?MODULE,do_call_abs,[y,y],
[{file,"gc_bif.erl"},{line,18}]},
@@ -691,6 +698,10 @@ build_binary2(Size, Bin) -> %Line 72505
id(0), %Line 72506
<<7:Size,Bin/binary>>. %Line 72507
+build_binary3(Bin) -> %Line 72509
+ id(0), %Line 72510
+ <<Bin/binary,357:16>>. %Line 72511
+
-file("gc_bif.erl", 17).
do_call_abs(x, Arg) -> %Line 18
abs(Arg). %Line 19
diff --git a/erts/emulator/test/scheduler_SUITE.erl b/erts/emulator/test/scheduler_SUITE.erl
index 81539faa09..6a43e2b0e7 100644
--- a/erts/emulator/test/scheduler_SUITE.erl
+++ b/erts/emulator/test/scheduler_SUITE.erl
@@ -1495,7 +1495,7 @@ mcall(Node, Funs) ->
end, Refs).
erl_rel_flag_var() ->
- "ERL_"++erlang:system_info(otp_release)++"_FLAGS".
+ "ERL_OTP"++erlang:system_info(otp_release)++"_FLAGS".
clear_erl_rel_flags() ->
EnvVar = erl_rel_flag_var(),
diff --git a/erts/emulator/utils/make_version b/erts/emulator/utils/make_version
index 7757fa8138..02b68f2b39 100755
--- a/erts/emulator/utils/make_version
+++ b/erts/emulator/utils/make_version
@@ -41,6 +41,9 @@ if ($ARGV[0] eq '-o') {
my $release = shift;
defined $release or die "No release specified";
+my $correction_package = shift;
+defined $correction_package or die "No correction package specified";
+
my $version = shift;
defined $version or die "No version name specified";
@@ -53,6 +56,7 @@ open(FILE, ">$outputfile") or die "Can't create $outputfile: $!";
print FILE <<EOF;
/* This file was created by 'make_version' -- don't modify. */
#define ERLANG_OTP_RELEASE "$release"
+#define ERLANG_OTP_CORRECTION_PACKAGE "$correction_package"
#define ERLANG_VERSION "$version"
#define ERLANG_COMPILE_DATE "$time_str"
#define ERLANG_ARCHITECTURE "$architecture"
diff --git a/erts/emulator/zlib/adler32.c b/erts/emulator/zlib/adler32.c
index 4368c31d70..c693a42b7c 100644
--- a/erts/emulator/zlib/adler32.c
+++ b/erts/emulator/zlib/adler32.c
@@ -1,19 +1,20 @@
/* adler32.c -- compute the Adler-32 checksum of a data stream
- * Copyright (C) 1995-2004 Mark Adler
+ * Copyright (C) 1995-2011 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* @(#) $Id$ */
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
-#define ZLIB_INTERNAL
-#include "zlib.h"
+#include "zutil.h"
+
+#define local static
+
+local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
-#define BASE 65521UL /* largest prime smaller than 65536 */
+#define BASE 65521 /* largest prime smaller than 65536 */
#define NMAX 5552
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
@@ -23,39 +24,44 @@
#define DO8(buf,i) DO4(buf,i); DO4(buf,i+4);
#define DO16(buf) DO8(buf,0); DO8(buf,8);
-/* use NO_DIVIDE if your processor does not do division in hardware */
+/* use NO_DIVIDE if your processor does not do division in hardware --
+ try it both ways to see which is faster */
#ifdef NO_DIVIDE
-# define MOD(a) \
+/* note that this assumes BASE is 65521, where 65536 % 65521 == 15
+ (thank you to John Reiser for pointing this out) */
+# define CHOP(a) \
+ do { \
+ unsigned long tmp = a >> 16; \
+ a &= 0xffffUL; \
+ a += (tmp << 4) - tmp; \
+ } while (0)
+# define MOD28(a) \
do { \
- if (a >= (BASE << 16)) a -= (BASE << 16); \
- if (a >= (BASE << 15)) a -= (BASE << 15); \
- if (a >= (BASE << 14)) a -= (BASE << 14); \
- if (a >= (BASE << 13)) a -= (BASE << 13); \
- if (a >= (BASE << 12)) a -= (BASE << 12); \
- if (a >= (BASE << 11)) a -= (BASE << 11); \
- if (a >= (BASE << 10)) a -= (BASE << 10); \
- if (a >= (BASE << 9)) a -= (BASE << 9); \
- if (a >= (BASE << 8)) a -= (BASE << 8); \
- if (a >= (BASE << 7)) a -= (BASE << 7); \
- if (a >= (BASE << 6)) a -= (BASE << 6); \
- if (a >= (BASE << 5)) a -= (BASE << 5); \
- if (a >= (BASE << 4)) a -= (BASE << 4); \
- if (a >= (BASE << 3)) a -= (BASE << 3); \
- if (a >= (BASE << 2)) a -= (BASE << 2); \
- if (a >= (BASE << 1)) a -= (BASE << 1); \
+ CHOP(a); \
if (a >= BASE) a -= BASE; \
} while (0)
-# define MOD4(a) \
+# define MOD(a) \
do { \
- if (a >= (BASE << 4)) a -= (BASE << 4); \
- if (a >= (BASE << 3)) a -= (BASE << 3); \
- if (a >= (BASE << 2)) a -= (BASE << 2); \
- if (a >= (BASE << 1)) a -= (BASE << 1); \
+ CHOP(a); \
+ MOD28(a); \
+ } while (0)
+# define MOD63(a) \
+ do { /* this assumes a is not negative */ \
+ z_off64_t tmp = a >> 32; \
+ a &= 0xffffffffL; \
+ a += (tmp << 8) - (tmp << 5) + tmp; \
+ tmp = a >> 16; \
+ a &= 0xffffL; \
+ a += (tmp << 4) - tmp; \
+ tmp = a >> 16; \
+ a &= 0xffffL; \
+ a += (tmp << 4) - tmp; \
if (a >= BASE) a -= BASE; \
} while (0)
#else
# define MOD(a) a %= BASE
-# define MOD4(a) a %= BASE
+# define MOD28(a) a %= BASE
+# define MOD63(a) a %= BASE
#endif
/* ========================================================================= */
@@ -94,7 +100,7 @@ uLong ZEXPORT adler32(adler, buf, len)
}
if (adler >= BASE)
adler -= BASE;
- MOD4(sum2); /* only added so many BASE's */
+ MOD28(sum2); /* only added so many BASE's */
return adler | (sum2 << 16);
}
@@ -130,25 +136,47 @@ uLong ZEXPORT adler32(adler, buf, len)
}
/* ========================================================================= */
-uLong ZEXPORT adler32_combine(adler1, adler2, len2)
+local uLong adler32_combine_(adler1, adler2, len2)
uLong adler1;
uLong adler2;
- z_off_t len2;
+ z_off64_t len2;
{
unsigned long sum1;
unsigned long sum2;
unsigned rem;
+ /* for negative len, return invalid adler32 as a clue for debugging */
+ if (len2 < 0)
+ return 0xffffffffUL;
+
/* the derivation of this formula is left as an exercise for the reader */
- rem = (unsigned)(len2 % BASE);
+ MOD63(len2); /* assumes len2 >= 0 */
+ rem = (unsigned)len2;
sum1 = adler1 & 0xffff;
sum2 = rem * sum1;
MOD(sum2);
sum1 += (adler2 & 0xffff) + BASE - 1;
sum2 += ((adler1 >> 16) & 0xffff) + ((adler2 >> 16) & 0xffff) + BASE - rem;
- if (sum1 > BASE) sum1 -= BASE;
- if (sum1 > BASE) sum1 -= BASE;
- if (sum2 > (BASE << 1)) sum2 -= (BASE << 1);
- if (sum2 > BASE) sum2 -= BASE;
+ if (sum1 >= BASE) sum1 -= BASE;
+ if (sum1 >= BASE) sum1 -= BASE;
+ if (sum2 >= (BASE << 1)) sum2 -= (BASE << 1);
+ if (sum2 >= BASE) sum2 -= BASE;
return sum1 | (sum2 << 16);
}
+
+/* ========================================================================= */
+uLong ZEXPORT adler32_combine(adler1, adler2, len2)
+ uLong adler1;
+ uLong adler2;
+ z_off_t len2;
+{
+ return adler32_combine_(adler1, adler2, len2);
+}
+
+uLong ZEXPORT adler32_combine64(adler1, adler2, len2)
+ uLong adler1;
+ uLong adler2;
+ z_off64_t len2;
+{
+ return adler32_combine_(adler1, adler2, len2);
+}
diff --git a/erts/emulator/zlib/compress.c b/erts/emulator/zlib/compress.c
index 28bceb15f8..8ecef0f790 100644
--- a/erts/emulator/zlib/compress.c
+++ b/erts/emulator/zlib/compress.c
@@ -1,10 +1,8 @@
/* compress.c -- compress a memory buffer
- * Copyright (C) 1995-2003 Jean-loup Gailly.
+ * Copyright (C) 1995-2005 Jean-loup Gailly.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* @(#) $Id$ */
#ifdef HAVE_CONFIG_H
@@ -34,7 +32,7 @@ int ZEXPORT compress2 (dest, destLen, source, sourceLen, level)
z_stream stream;
int err;
- stream.next_in = (Bytef*)source;
+ stream.next_in = (z_const Bytef *)source;
stream.avail_in = (uInt)sourceLen;
#ifdef MAXSEG_64K
/* Check for source > 64K on 16-bit machine: */
@@ -80,5 +78,6 @@ int ZEXPORT compress (dest, destLen, source, sourceLen)
uLong ZEXPORT compressBound (sourceLen)
uLong sourceLen;
{
- return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) + 11;
+ return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) +
+ (sourceLen >> 25) + 13;
}
diff --git a/erts/emulator/zlib/crc32.c b/erts/emulator/zlib/crc32.c
index b9c10bb9b3..ba506d8dd3 100644
--- a/erts/emulator/zlib/crc32.c
+++ b/erts/emulator/zlib/crc32.c
@@ -1,19 +1,14 @@
/* crc32.c -- compute the CRC-32 of a data stream
- * Copyright (C) 1995-2005 Mark Adler
+ * Copyright (C) 1995-2006, 2010, 2011, 2012 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Thanks to Rodney Brown <[email protected]> for his contribution of faster
* CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
* tables for updating the shift register in one step with three exclusive-ors
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
* instead of four steps with four exclusive-ors. This results in about a
* factor of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
*/
-/* %ExternalCopyright% */
-
/* @(#) $Id$ */
/*
@@ -22,6 +17,8 @@
of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
first call get_crc_table() to initialize the tables before allowing more than
one thread to use crc32().
+
+ DYNAMIC_CRC_TABLE and MAKECRCH can be #defined to write out crc32.h.
*/
#ifdef MAKECRCH
@@ -31,35 +28,19 @@
# endif /* !DYNAMIC_CRC_TABLE */
#endif /* MAKECRCH */
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
+
#include "zutil.h" /* for STDC and FAR definitions */
#define local static
-/* Find a four-byte integer type for crc32_little() and crc32_big(). */
-#ifndef NOBYFOUR
-# ifdef STDC /* need ANSI C limits.h to determine sizes */
-# include <limits.h>
-# define BYFOUR
-# if (UINT_MAX == 0xffffffffUL)
- typedef unsigned int u4;
-# else
-# if (ULONG_MAX == 0xffffffffUL)
- typedef unsigned long u4;
-# else
-# if (USHRT_MAX == 0xffffffffUL)
- typedef unsigned short u4;
-# else
-# undef BYFOUR /* can't find a four-byte integer type! */
-# endif
-# endif
-# endif
-# endif /* STDC */
-#endif /* !NOBYFOUR */
-
/* Definitions for doing the crc four data bytes at a time. */
+#if !defined(NOBYFOUR) && defined(Z_U4)
+# define BYFOUR
+#endif
#ifdef BYFOUR
-# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
- (((w)&0xff00)<<8)+(((w)&0xff)<<24))
local unsigned long crc32_little OF((unsigned long,
const unsigned char FAR *, unsigned));
local unsigned long crc32_big OF((unsigned long,
@@ -73,14 +54,16 @@
local unsigned long gf2_matrix_times OF((unsigned long *mat,
unsigned long vec));
local void gf2_matrix_square OF((unsigned long *square, unsigned long *mat));
+local uLong crc32_combine_ OF((uLong crc1, uLong crc2, z_off64_t len2));
+
#ifdef DYNAMIC_CRC_TABLE
local volatile int crc_table_empty = 1;
-local unsigned long FAR crc_table[TBLS][256];
+local z_crc_t FAR crc_table[TBLS][256];
local void make_crc_table OF((void));
#ifdef MAKECRCH
- local void write_table OF((FILE *, const unsigned long FAR *));
+ local void write_table OF((FILE *, const z_crc_t FAR *));
#endif /* MAKECRCH */
/*
Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
@@ -110,9 +93,9 @@ local void make_crc_table OF((void));
*/
local void make_crc_table()
{
- unsigned long c;
+ z_crc_t c;
int n, k;
- unsigned long poly; /* polynomial exclusive-or pattern */
+ z_crc_t poly; /* polynomial exclusive-or pattern */
/* terms of polynomial defining this crc (except x^32): */
static volatile int first = 1; /* flag to limit concurrent making */
static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
@@ -124,13 +107,13 @@ local void make_crc_table()
first = 0;
/* make exclusive-or pattern from polynomial (0xedb88320UL) */
- poly = 0UL;
- for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++)
- poly |= 1UL << (31 - p[n]);
+ poly = 0;
+ for (n = 0; n < (int)(sizeof(p)/sizeof(unsigned char)); n++)
+ poly |= (z_crc_t)1 << (31 - p[n]);
/* generate a crc for every 8-bit value */
for (n = 0; n < 256; n++) {
- c = (unsigned long)n;
+ c = (z_crc_t)n;
for (k = 0; k < 8; k++)
c = c & 1 ? poly ^ (c >> 1) : c >> 1;
crc_table[0][n] = c;
@@ -141,11 +124,11 @@ local void make_crc_table()
and then the byte reversal of those as well as the first table */
for (n = 0; n < 256; n++) {
c = crc_table[0][n];
- crc_table[4][n] = REV(c);
+ crc_table[4][n] = ZSWAP32(c);
for (k = 1; k < 4; k++) {
c = crc_table[0][c & 0xff] ^ (c >> 8);
crc_table[k][n] = c;
- crc_table[k + 4][n] = REV(c);
+ crc_table[k + 4][n] = ZSWAP32(c);
}
}
#endif /* BYFOUR */
@@ -167,7 +150,7 @@ local void make_crc_table()
if (out == NULL) return;
fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
- fprintf(out, "local const unsigned long FAR ");
+ fprintf(out, "local const z_crc_t FAR ");
fprintf(out, "crc_table[TBLS][256] =\n{\n {\n");
write_table(out, crc_table[0]);
# ifdef BYFOUR
@@ -187,12 +170,13 @@ local void make_crc_table()
#ifdef MAKECRCH
local void write_table(out, table)
FILE *out;
- const unsigned long FAR *table;
+ const z_crc_t FAR *table;
{
int n;
for (n = 0; n < 256; n++)
- fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n],
+ fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ",
+ (unsigned long)(table[n]),
n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
}
#endif /* MAKECRCH */
@@ -207,13 +191,13 @@ local void write_table(out, table)
/* =========================================================================
* This function can be used by asm versions of crc32()
*/
-const unsigned long FAR * ZEXPORT get_crc_table()
+const z_crc_t FAR * ZEXPORT get_crc_table()
{
#ifdef DYNAMIC_CRC_TABLE
if (crc_table_empty)
make_crc_table();
#endif /* DYNAMIC_CRC_TABLE */
- return (const unsigned long FAR *)crc_table;
+ return (const z_crc_t FAR *)crc_table;
}
/* ========================================================================= */
@@ -224,7 +208,7 @@ const unsigned long FAR * ZEXPORT get_crc_table()
unsigned long ZEXPORT crc32(crc, buf, len)
unsigned long crc;
const unsigned char FAR *buf;
- unsigned len;
+ uInt len;
{
if (buf == Z_NULL) return 0UL;
@@ -235,7 +219,7 @@ unsigned long ZEXPORT crc32(crc, buf, len)
#ifdef BYFOUR
if (sizeof(void *) == sizeof(ptrdiff_t)) {
- u4 endian;
+ z_crc_t endian;
endian = 1;
if (*((unsigned char *)(&endian)))
@@ -269,17 +253,17 @@ local unsigned long crc32_little(crc, buf, len)
const unsigned char FAR *buf;
unsigned len;
{
- register u4 c;
- register const u4 FAR *buf4;
+ register z_crc_t c;
+ register const z_crc_t FAR *buf4;
- c = (u4)crc;
+ c = (z_crc_t)crc;
c = ~c;
while (len && ((ptrdiff_t)buf & 3)) {
c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
len--;
}
- buf4 = (const u4 FAR *)(const void FAR *)buf;
+ buf4 = (const z_crc_t FAR *)(const void FAR *)buf;
while (len >= 32) {
DOLIT32;
len -= 32;
@@ -309,17 +293,17 @@ local unsigned long crc32_big(crc, buf, len)
const unsigned char FAR *buf;
unsigned len;
{
- register u4 c;
- register const u4 FAR *buf4;
+ register z_crc_t c;
+ register const z_crc_t FAR *buf4;
- c = REV((u4)crc);
+ c = ZSWAP32((z_crc_t)crc);
c = ~c;
while (len && ((ptrdiff_t)buf & 3)) {
c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
len--;
}
- buf4 = (const u4 FAR *)(const void FAR *)buf;
+ buf4 = (const z_crc_t FAR *)(const void FAR *)buf;
buf4--;
while (len >= 32) {
DOBIG32;
@@ -336,7 +320,7 @@ local unsigned long crc32_big(crc, buf, len)
c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
} while (--len);
c = ~c;
- return (unsigned long)(REV(c));
+ return (unsigned long)(ZSWAP32(c));
}
#endif /* BYFOUR */
@@ -372,22 +356,22 @@ local void gf2_matrix_square(square, mat)
}
/* ========================================================================= */
-uLong ZEXPORT crc32_combine(crc1, crc2, len2)
+local uLong crc32_combine_(crc1, crc2, len2)
uLong crc1;
uLong crc2;
- z_off_t len2;
+ z_off64_t len2;
{
int n;
unsigned long row;
unsigned long even[GF2_DIM]; /* even-power-of-two zeros operator */
unsigned long odd[GF2_DIM]; /* odd-power-of-two zeros operator */
- /* degenerate case */
- if (len2 == 0)
+ /* degenerate case (also disallow negative lengths) */
+ if (len2 <= 0)
return crc1;
/* put operator for one zero bit in odd */
- odd[0] = 0xedb88320L; /* CRC-32 polynomial */
+ odd[0] = 0xedb88320UL; /* CRC-32 polynomial */
row = 1;
for (n = 1; n < GF2_DIM; n++) {
odd[n] = row;
@@ -426,3 +410,20 @@ uLong ZEXPORT crc32_combine(crc1, crc2, len2)
crc1 ^= crc2;
return crc1;
}
+
+/* ========================================================================= */
+uLong ZEXPORT crc32_combine(crc1, crc2, len2)
+ uLong crc1;
+ uLong crc2;
+ z_off_t len2;
+{
+ return crc32_combine_(crc1, crc2, len2);
+}
+
+uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
+ uLong crc1;
+ uLong crc2;
+ z_off64_t len2;
+{
+ return crc32_combine_(crc1, crc2, len2);
+}
diff --git a/erts/emulator/zlib/crc32.h b/erts/emulator/zlib/crc32.h
index 49cd69a4c2..9e0c778102 100644
--- a/erts/emulator/zlib/crc32.h
+++ b/erts/emulator/zlib/crc32.h
@@ -2,9 +2,7 @@
* Generated automatically by crc32.c
*/
-/* %ExternalCopyright% */
-
-local const unsigned long FAR crc_table[TBLS][256] =
+local const z_crc_t FAR crc_table[TBLS][256] =
{
{
0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
diff --git a/erts/emulator/zlib/deflate.c b/erts/emulator/zlib/deflate.c
index 92f4be57c5..943c26dfb2 100644
--- a/erts/emulator/zlib/deflate.c
+++ b/erts/emulator/zlib/deflate.c
@@ -1,10 +1,8 @@
/* deflate.c -- compress data using the deflation algorithm
- * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/*
* ALGORITHM
*
@@ -39,7 +37,7 @@
* REFERENCES
*
* Deutsch, L.P.,"DEFLATE Compressed Data Format Specification".
- * Available in http://www.ietf.org/rfc/rfc1951.txt
+ * Available in http://tools.ietf.org/html/rfc1951
*
* A description of the Rabin and Karp algorithm is given in the book
* "Algorithms" by R. Sedgewick, Addison-Wesley, p252.
@@ -57,7 +55,7 @@
#include "deflate.h"
const char deflate_copyright[] =
- " deflate 1.2.3 Copyright 1995-2005 Jean-loup Gailly ";
+ " deflate 1.2.8 Copyright 1995-2013 Jean-loup Gailly and Mark Adler ";
/*
If you use the zlib library in a product, an acknowledgment is welcome
in the documentation of your product. If for some reason you cannot
@@ -84,19 +82,18 @@ local block_state deflate_fast OF((deflate_state *s, int flush));
#ifndef FASTEST
local block_state deflate_slow OF((deflate_state *s, int flush));
#endif
+local block_state deflate_rle OF((deflate_state *s, int flush));
+local block_state deflate_huff OF((deflate_state *s, int flush));
local void lm_init OF((deflate_state *s));
local void putShortMSB OF((deflate_state *s, uInt b));
local void flush_pending OF((z_streamp strm));
local int read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
-#ifndef FASTEST
#ifdef ASMV
void match_init OF((void)); /* asm code initialization */
uInt longest_match OF((deflate_state *s, IPos cur_match));
#else
local uInt longest_match OF((deflate_state *s, IPos cur_match));
#endif
-#endif
-local uInt longest_match_fast OF((deflate_state *s, IPos cur_match));
#ifdef DEBUG
local void check_match OF((deflate_state *s, IPos start, IPos match,
@@ -115,11 +112,6 @@ local void check_match OF((deflate_state *s, IPos start, IPos match,
#endif
/* Matches of length 3 are discarded if their distance exceeds TOO_FAR */
-#define MIN_LOOKAHEAD (MAX_MATCH+MIN_MATCH+1)
-/* Minimum amount of lookahead, except at the end of the input file.
- * See deflate.c for comments about the MIN_MATCH+1.
- */
-
/* Values for max_lazy_match, good_match and max_chain_length, depending on
* the desired pack level (0..9). The values given below have been tuned to
* exclude worst case performance for pathological files. Better values may be
@@ -166,6 +158,9 @@ local const config configuration_table[10] = {
struct static_tree_desc_s {int dummy;}; /* for buggy compilers */
#endif
+/* rank Z_BLOCK between Z_NO_FLUSH and Z_PARTIAL_FLUSH */
+#define RANK(f) (((f) << 1) - ((f) > 4 ? 9 : 0))
+
/* ===========================================================================
* Update a hash value with the given input byte
* IN assertion: all calls to to UPDATE_HASH are made with consecutive
@@ -246,10 +241,19 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
strm->msg = Z_NULL;
if (strm->zalloc == (alloc_func)0) {
+#ifdef Z_SOLO
+ return Z_STREAM_ERROR;
+#else
strm->zalloc = zcalloc;
strm->opaque = (voidpf)0;
+#endif
}
- if (strm->zfree == (free_func)0) strm->zfree = zcfree;
+ if (strm->zfree == (free_func)0)
+#ifdef Z_SOLO
+ return Z_STREAM_ERROR;
+#else
+ strm->zfree = zcfree;
+#endif
#ifdef FASTEST
if (level != 0) level = 1;
@@ -293,6 +297,8 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
s->prev = (Posf *) ZALLOC(strm, s->w_size, sizeof(Pos));
s->head = (Posf *) ZALLOC(strm, s->hash_size, sizeof(Pos));
+ s->high_water = 0; /* nothing written to s->window yet */
+
s->lit_bufsize = 1 << (memLevel + 6); /* 16K elements by default */
overlay = (ushf *) ZALLOC(strm, s->lit_bufsize, sizeof(ush)+2);
@@ -302,7 +308,7 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
if (s->window == Z_NULL || s->prev == Z_NULL || s->head == Z_NULL ||
s->pending_buf == Z_NULL) {
s->status = FINISH_STATE;
- strm->msg = (char*)ERR_MSG(Z_MEM_ERROR);
+ strm->msg = ERR_MSG(Z_MEM_ERROR);
deflateEnd (strm);
return Z_MEM_ERROR;
}
@@ -323,43 +329,70 @@ int ZEXPORT deflateSetDictionary (strm, dictionary, dictLength)
uInt dictLength;
{
deflate_state *s;
- uInt length = dictLength;
- uInt n;
- IPos hash_head = 0;
+ uInt str, n;
+ int wrap;
+ unsigned avail;
+ z_const unsigned char *next;
- if (strm == Z_NULL || strm->state == Z_NULL || dictionary == Z_NULL ||
- strm->state->wrap == 2 ||
- (strm->state->wrap == 1 && strm->state->status != INIT_STATE))
+ if (strm == Z_NULL || strm->state == Z_NULL || dictionary == Z_NULL)
return Z_STREAM_ERROR;
-
s = strm->state;
- if (s->wrap)
- strm->adler = adler32(strm->adler, dictionary, dictLength);
+ wrap = s->wrap;
+ if (wrap == 2 || (wrap == 1 && s->status != INIT_STATE) || s->lookahead)
+ return Z_STREAM_ERROR;
- if (length < MIN_MATCH) return Z_OK;
- if (length > MAX_DIST(s)) {
- length = MAX_DIST(s);
- dictionary += dictLength - length; /* use the tail of the dictionary */
+ /* when using zlib wrappers, compute Adler-32 for provided dictionary */
+ if (wrap == 1)
+ strm->adler = adler32(strm->adler, dictionary, dictLength);
+ s->wrap = 0; /* avoid computing Adler-32 in read_buf */
+
+ /* if dictionary would fill window, just replace the history */
+ if (dictLength >= s->w_size) {
+ if (wrap == 0) { /* already empty otherwise */
+ CLEAR_HASH(s);
+ s->strstart = 0;
+ s->block_start = 0L;
+ s->insert = 0;
+ }
+ dictionary += dictLength - s->w_size; /* use the tail */
+ dictLength = s->w_size;
}
- zmemcpy(s->window, dictionary, length);
- s->strstart = length;
- s->block_start = (long)length;
- /* Insert all strings in the hash table (except for the last two bytes).
- * s->lookahead stays null, so s->ins_h will be recomputed at the next
- * call of fill_window.
- */
- s->ins_h = s->window[0];
- UPDATE_HASH(s, s->ins_h, s->window[1]);
- for (n = 0; n <= length - MIN_MATCH; n++) {
- INSERT_STRING(s, n, hash_head);
+ /* insert dictionary into window and hash */
+ avail = strm->avail_in;
+ next = strm->next_in;
+ strm->avail_in = dictLength;
+ strm->next_in = (z_const Bytef *)dictionary;
+ fill_window(s);
+ while (s->lookahead >= MIN_MATCH) {
+ str = s->strstart;
+ n = s->lookahead - (MIN_MATCH-1);
+ do {
+ UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+#ifndef FASTEST
+ s->prev[str & s->w_mask] = s->head[s->ins_h];
+#endif
+ s->head[s->ins_h] = (Pos)str;
+ str++;
+ } while (--n);
+ s->strstart = str;
+ s->lookahead = MIN_MATCH-1;
+ fill_window(s);
}
- if (hash_head) hash_head = 0; /* to make compiler happy */
+ s->strstart += s->lookahead;
+ s->block_start = (long)s->strstart;
+ s->insert = s->lookahead;
+ s->lookahead = 0;
+ s->match_length = s->prev_length = MIN_MATCH-1;
+ s->match_available = 0;
+ strm->next_in = next;
+ strm->avail_in = avail;
+ s->wrap = wrap;
return Z_OK;
}
/* ========================================================================= */
-int ZEXPORT deflateReset (strm)
+int ZEXPORT deflateResetKeep (strm)
z_streamp strm;
{
deflate_state *s;
@@ -389,12 +422,23 @@ int ZEXPORT deflateReset (strm)
s->last_flush = Z_NO_FLUSH;
_tr_init(s);
- lm_init(s);
return Z_OK;
}
/* ========================================================================= */
+int ZEXPORT deflateReset (strm)
+ z_streamp strm;
+{
+ int ret;
+
+ ret = deflateResetKeep(strm);
+ if (ret == Z_OK)
+ lm_init(strm->state);
+ return ret;
+}
+
+/* ========================================================================= */
int ZEXPORT deflateSetHeader (strm, head)
z_streamp strm;
gz_headerp head;
@@ -406,14 +450,42 @@ int ZEXPORT deflateSetHeader (strm, head)
}
/* ========================================================================= */
+int ZEXPORT deflatePending (strm, pending, bits)
+ unsigned *pending;
+ int *bits;
+ z_streamp strm;
+{
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ if (pending != Z_NULL)
+ *pending = strm->state->pending;
+ if (bits != Z_NULL)
+ *bits = strm->state->bi_valid;
+ return Z_OK;
+}
+
+/* ========================================================================= */
int ZEXPORT deflatePrime (strm, bits, value)
z_streamp strm;
int bits;
int value;
{
+ deflate_state *s;
+ int put;
+
if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
- strm->state->bi_valid = bits;
- strm->state->bi_buf = (ush)(value & ((1 << bits) - 1));
+ s = strm->state;
+ if ((Bytef *)(s->d_buf) < s->pending_out + ((Buf_size + 7) >> 3))
+ return Z_BUF_ERROR;
+ do {
+ put = Buf_size - s->bi_valid;
+ if (put > bits)
+ put = bits;
+ s->bi_buf |= (ush)((value & ((1 << put) - 1)) << s->bi_valid);
+ s->bi_valid += put;
+ _tr_flush_bits(s);
+ value >>= put;
+ bits -= put;
+ } while (bits);
return Z_OK;
}
@@ -440,9 +512,12 @@ int ZEXPORT deflateParams(strm, level, strategy)
}
func = configuration_table[s->level].func;
- if (func != configuration_table[level].func && strm->total_in != 0) {
+ if ((strategy != s->strategy || func != configuration_table[level].func) &&
+ strm->total_in != 0) {
/* Flush the last buffer: */
- err = deflate(strm, Z_PARTIAL_FLUSH);
+ err = deflate(strm, Z_BLOCK);
+ if (err == Z_BUF_ERROR && s->pending == 0)
+ err = Z_OK;
}
if (s->level != level) {
s->level = level;
@@ -486,33 +561,66 @@ int ZEXPORT deflateTune(strm, good_length, max_lazy, nice_length, max_chain)
* resulting from using fixed blocks instead of stored blocks, which deflate
* can emit on compressed data for some combinations of the parameters.
*
- * This function could be more sophisticated to provide closer upper bounds
- * for every combination of windowBits and memLevel, as well as wrap.
- * But even the conservative upper bound of about 14% expansion does not
- * seem onerous for output buffer allocation.
+ * This function could be more sophisticated to provide closer upper bounds for
+ * every combination of windowBits and memLevel. But even the conservative
+ * upper bound of about 14% expansion does not seem onerous for output buffer
+ * allocation.
*/
uLong ZEXPORT deflateBound(strm, sourceLen)
z_streamp strm;
uLong sourceLen;
{
deflate_state *s;
- uLong destLen;
+ uLong complen, wraplen;
+ Bytef *str;
- /* conservative upper bound */
- destLen = sourceLen +
- ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 11;
+ /* conservative upper bound for compressed data */
+ complen = sourceLen +
+ ((sourceLen + 7) >> 3) + ((sourceLen + 63) >> 6) + 5;
- /* if can't get parameters, return conservative bound */
+ /* if can't get parameters, return conservative bound plus zlib wrapper */
if (strm == Z_NULL || strm->state == Z_NULL)
- return destLen;
+ return complen + 6;
- /* if not default parameters, return conservative bound */
+ /* compute wrapper length */
s = strm->state;
+ switch (s->wrap) {
+ case 0: /* raw deflate */
+ wraplen = 0;
+ break;
+ case 1: /* zlib wrapper */
+ wraplen = 6 + (s->strstart ? 4 : 0);
+ break;
+ case 2: /* gzip wrapper */
+ wraplen = 18;
+ if (s->gzhead != Z_NULL) { /* user-supplied gzip header */
+ if (s->gzhead->extra != Z_NULL)
+ wraplen += 2 + s->gzhead->extra_len;
+ str = s->gzhead->name;
+ if (str != Z_NULL)
+ do {
+ wraplen++;
+ } while (*str++);
+ str = s->gzhead->comment;
+ if (str != Z_NULL)
+ do {
+ wraplen++;
+ } while (*str++);
+ if (s->gzhead->hcrc)
+ wraplen += 2;
+ }
+ break;
+ default: /* for compiler happiness */
+ wraplen = 6;
+ }
+
+ /* if not default parameters, return conservative bound */
if (s->w_bits != 15 || s->hash_bits != 8 + 7)
- return destLen;
+ return complen + wraplen;
/* default settings: return tight bound for that case */
- return compressBound(sourceLen);
+ return sourceLen + (sourceLen >> 12) + (sourceLen >> 14) +
+ (sourceLen >> 25) + 13 - 6 + wraplen;
}
/* =========================================================================
@@ -537,19 +645,22 @@ local void putShortMSB (s, b)
local void flush_pending(strm)
z_streamp strm;
{
- unsigned len = strm->state->pending;
+ unsigned len;
+ deflate_state *s = strm->state;
+ _tr_flush_bits(s);
+ len = s->pending;
if (len > strm->avail_out) len = strm->avail_out;
if (len == 0) return;
- zmemcpy(strm->next_out, strm->state->pending_out, len);
+ zmemcpy(strm->next_out, s->pending_out, len);
strm->next_out += len;
- strm->state->pending_out += len;
+ s->pending_out += len;
strm->total_out += len;
strm->avail_out -= len;
- strm->state->pending -= len;
- if (strm->state->pending == 0) {
- strm->state->pending_out = strm->state->pending_buf;
+ s->pending -= len;
+ if (s->pending == 0) {
+ s->pending_out = s->pending_buf;
}
}
@@ -562,7 +673,7 @@ int ZEXPORT deflate (strm, flush)
deflate_state *s;
if (strm == Z_NULL || strm->state == Z_NULL ||
- flush > Z_FINISH || flush < 0) {
+ flush > Z_BLOCK || flush < 0) {
return Z_STREAM_ERROR;
}
s = strm->state;
@@ -586,7 +697,7 @@ int ZEXPORT deflate (strm, flush)
put_byte(s, 31);
put_byte(s, 139);
put_byte(s, 8);
- if (s->gzhead == NULL) {
+ if (s->gzhead == Z_NULL) {
put_byte(s, 0);
put_byte(s, 0);
put_byte(s, 0);
@@ -613,7 +724,7 @@ int ZEXPORT deflate (strm, flush)
(s->strategy >= Z_HUFFMAN_ONLY || s->level < 2 ?
4 : 0));
put_byte(s, s->gzhead->os & 0xff);
- if (s->gzhead->extra != NULL) {
+ if (s->gzhead->extra != Z_NULL) {
put_byte(s, s->gzhead->extra_len & 0xff);
put_byte(s, (s->gzhead->extra_len >> 8) & 0xff);
}
@@ -655,7 +766,7 @@ int ZEXPORT deflate (strm, flush)
}
#ifdef GZIP
if (s->status == EXTRA_STATE) {
- if (s->gzhead->extra != NULL) {
+ if (s->gzhead->extra != Z_NULL) {
uInt beg = s->pending; /* start of bytes to update crc */
while (s->gzindex < (s->gzhead->extra_len & 0xffff)) {
@@ -683,7 +794,7 @@ int ZEXPORT deflate (strm, flush)
s->status = NAME_STATE;
}
if (s->status == NAME_STATE) {
- if (s->gzhead->name != NULL) {
+ if (s->gzhead->name != Z_NULL) {
uInt beg = s->pending; /* start of bytes to update crc */
int val;
@@ -714,7 +825,7 @@ int ZEXPORT deflate (strm, flush)
s->status = COMMENT_STATE;
}
if (s->status == COMMENT_STATE) {
- if (s->gzhead->comment != NULL) {
+ if (s->gzhead->comment != Z_NULL) {
uInt beg = s->pending; /* start of bytes to update crc */
int val;
@@ -776,7 +887,7 @@ int ZEXPORT deflate (strm, flush)
* flushes. For repeated and useless calls with Z_FINISH, we keep
* returning Z_STREAM_END instead of Z_BUF_ERROR.
*/
- } else if (strm->avail_in == 0 && flush <= old_flush &&
+ } else if (strm->avail_in == 0 && RANK(flush) <= RANK(old_flush) &&
flush != Z_FINISH) {
ERR_RETURN(strm, Z_BUF_ERROR);
}
@@ -792,7 +903,9 @@ int ZEXPORT deflate (strm, flush)
(flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
block_state bstate;
- bstate = (*(configuration_table[s->level].func))(s, flush);
+ bstate = s->strategy == Z_HUFFMAN_ONLY ? deflate_huff(s, flush) :
+ (s->strategy == Z_RLE ? deflate_rle(s, flush) :
+ (*(configuration_table[s->level].func))(s, flush));
if (bstate == finish_started || bstate == finish_done) {
s->status = FINISH_STATE;
@@ -813,13 +926,18 @@ int ZEXPORT deflate (strm, flush)
if (bstate == block_done) {
if (flush == Z_PARTIAL_FLUSH) {
_tr_align(s);
- } else { /* FULL_FLUSH or SYNC_FLUSH */
+ } else if (flush != Z_BLOCK) { /* FULL_FLUSH or SYNC_FLUSH */
_tr_stored_block(s, (char*)0, 0L, 0);
/* For a full flush, this empty block will be recognized
* as a special marker by inflate_sync().
*/
if (flush == Z_FULL_FLUSH) {
CLEAR_HASH(s); /* forget history */
+ if (s->lookahead == 0) {
+ s->strstart = 0;
+ s->block_start = 0L;
+ s->insert = 0;
+ }
}
}
flush_pending(strm);
@@ -914,12 +1032,12 @@ int ZEXPORT deflateCopy (dest, source)
ss = source->state;
- zmemcpy(dest, source, sizeof(z_stream));
+ zmemcpy((voidpf)dest, (voidpf)source, sizeof(z_stream));
ds = (deflate_state *) ZALLOC(dest, 1, sizeof(deflate_state));
if (ds == Z_NULL) return Z_MEM_ERROR;
dest->state = (struct internal_state FAR *) ds;
- zmemcpy(ds, ss, sizeof(deflate_state));
+ zmemcpy((voidpf)ds, (voidpf)ss, sizeof(deflate_state));
ds->strm = dest;
ds->window = (Bytef *) ZALLOC(dest, ds->w_size, 2*sizeof(Byte));
@@ -935,8 +1053,8 @@ int ZEXPORT deflateCopy (dest, source)
}
/* following zmemcpy do not work for 16-bit MSDOS */
zmemcpy(ds->window, ss->window, ds->w_size * 2 * sizeof(Byte));
- zmemcpy(ds->prev, ss->prev, ds->w_size * sizeof(Pos));
- zmemcpy(ds->head, ss->head, ds->hash_size * sizeof(Pos));
+ zmemcpy((voidpf)ds->prev, (voidpf)ss->prev, ds->w_size * sizeof(Pos));
+ zmemcpy((voidpf)ds->head, (voidpf)ss->head, ds->hash_size * sizeof(Pos));
zmemcpy(ds->pending_buf, ss->pending_buf, (uInt)ds->pending_buf_size);
ds->pending_out = ds->pending_buf + (ss->pending_out - ss->pending_buf);
@@ -970,15 +1088,15 @@ local int read_buf(strm, buf, size)
strm->avail_in -= len;
+ zmemcpy(buf, strm->next_in, len);
if (strm->state->wrap == 1) {
- strm->adler = adler32(strm->adler, strm->next_in, len);
+ strm->adler = adler32(strm->adler, buf, len);
}
#ifdef GZIP
else if (strm->state->wrap == 2) {
- strm->adler = crc32(strm->adler, strm->next_in, len);
+ strm->adler = crc32(strm->adler, buf, len);
}
#endif
- zmemcpy(buf, strm->next_in, len);
strm->next_in += len;
strm->total_in += len;
@@ -1005,6 +1123,7 @@ local void lm_init (s)
s->strstart = 0;
s->block_start = 0L;
s->lookahead = 0;
+ s->insert = 0;
s->match_length = s->prev_length = MIN_MATCH-1;
s->match_available = 0;
s->ins_h = 0;
@@ -1172,12 +1291,13 @@ local uInt longest_match(s, cur_match)
return s->lookahead;
}
#endif /* ASMV */
-#endif /* FASTEST */
+
+#else /* FASTEST */
/* ---------------------------------------------------------------------------
- * Optimized version for level == 1 or strategy == Z_RLE only
+ * Optimized version for FASTEST only
*/
-local uInt longest_match_fast(s, cur_match)
+local uInt longest_match(s, cur_match)
deflate_state *s;
IPos cur_match; /* current match */
{
@@ -1230,6 +1350,8 @@ local uInt longest_match_fast(s, cur_match)
return (uInt)len <= s->lookahead ? (uInt)len : s->lookahead;
}
+#endif /* FASTEST */
+
#ifdef DEBUG
/* ===========================================================================
* Check that the match at match_start is indeed a match.
@@ -1276,6 +1398,8 @@ local void fill_window(s)
unsigned more; /* Amount of free space at the end of the window. */
uInt wsize = s->w_size;
+ Assert(s->lookahead < MIN_LOOKAHEAD, "already enough lookahead");
+
do {
more = (unsigned)(s->window_size -(ulg)s->lookahead -(ulg)s->strstart);
@@ -1308,7 +1432,6 @@ local void fill_window(s)
later. (Using level 0 permanently is not an optimal usage of
zlib, so we don't care about this pathological case.)
*/
- /* %%% avoid this when Z_RLE */
n = s->hash_size;
p = &s->head[n];
do {
@@ -1329,7 +1452,7 @@ local void fill_window(s)
#endif
more += wsize;
}
- if (s->strm->avail_in == 0) return;
+ if (s->strm->avail_in == 0) break;
/* If there was no sliding:
* strstart <= WSIZE+MAX_DIST-1 && lookahead <= MIN_LOOKAHEAD - 1 &&
@@ -1348,39 +1471,88 @@ local void fill_window(s)
s->lookahead += n;
/* Initialize the hash value now that we have some input: */
- if (s->lookahead >= MIN_MATCH) {
- s->ins_h = s->window[s->strstart];
- UPDATE_HASH(s, s->ins_h, s->window[s->strstart+1]);
+ if (s->lookahead + s->insert >= MIN_MATCH) {
+ uInt str = s->strstart - s->insert;
+ s->ins_h = s->window[str];
+ UPDATE_HASH(s, s->ins_h, s->window[str + 1]);
#if MIN_MATCH != 3
Call UPDATE_HASH() MIN_MATCH-3 more times
#endif
+ while (s->insert) {
+ UPDATE_HASH(s, s->ins_h, s->window[str + MIN_MATCH-1]);
+#ifndef FASTEST
+ s->prev[str & s->w_mask] = s->head[s->ins_h];
+#endif
+ s->head[s->ins_h] = (Pos)str;
+ str++;
+ s->insert--;
+ if (s->lookahead + s->insert < MIN_MATCH)
+ break;
+ }
}
/* If the whole input has less than MIN_MATCH bytes, ins_h is garbage,
* but this is not important since only literal bytes will be emitted.
*/
} while (s->lookahead < MIN_LOOKAHEAD && s->strm->avail_in != 0);
+
+ /* If the WIN_INIT bytes after the end of the current data have never been
+ * written, then zero those bytes in order to avoid memory check reports of
+ * the use of uninitialized (or uninitialised as Julian writes) bytes by
+ * the longest match routines. Update the high water mark for the next
+ * time through here. WIN_INIT is set to MAX_MATCH since the longest match
+ * routines allow scanning to strstart + MAX_MATCH, ignoring lookahead.
+ */
+ if (s->high_water < s->window_size) {
+ ulg curr = s->strstart + (ulg)(s->lookahead);
+ ulg init;
+
+ if (s->high_water < curr) {
+ /* Previous high water mark below current data -- zero WIN_INIT
+ * bytes or up to end of window, whichever is less.
+ */
+ init = s->window_size - curr;
+ if (init > WIN_INIT)
+ init = WIN_INIT;
+ zmemzero(s->window + curr, (unsigned)init);
+ s->high_water = curr + init;
+ }
+ else if (s->high_water < (ulg)curr + WIN_INIT) {
+ /* High water mark at or above current data, but below current data
+ * plus WIN_INIT -- zero out to current data plus WIN_INIT, or up
+ * to end of window, whichever is less.
+ */
+ init = (ulg)curr + WIN_INIT - s->high_water;
+ if (init > s->window_size - s->high_water)
+ init = s->window_size - s->high_water;
+ zmemzero(s->window + s->high_water, (unsigned)init);
+ s->high_water += init;
+ }
+ }
+
+ Assert((ulg)s->strstart <= s->window_size - MIN_LOOKAHEAD,
+ "not enough room for search");
}
/* ===========================================================================
* Flush the current block, with given end-of-file flag.
* IN assertion: strstart is set to the end of the current match.
*/
-#define FLUSH_BLOCK_ONLY(s, eof) { \
+#define FLUSH_BLOCK_ONLY(s, last) { \
_tr_flush_block(s, (s->block_start >= 0L ? \
(charf *)&s->window[(unsigned)s->block_start] : \
(charf *)Z_NULL), \
(ulg)((long)s->strstart - s->block_start), \
- (eof)); \
+ (last)); \
s->block_start = s->strstart; \
flush_pending(s->strm); \
Tracev((stderr,"[FLUSH]")); \
}
/* Same but force premature exit if necessary. */
-#define FLUSH_BLOCK(s, eof) { \
- FLUSH_BLOCK_ONLY(s, eof); \
- if (s->strm->avail_out == 0) return (eof) ? finish_started : need_more; \
+#define FLUSH_BLOCK(s, last) { \
+ FLUSH_BLOCK_ONLY(s, last); \
+ if (s->strm->avail_out == 0) return (last) ? finish_started : need_more; \
}
/* ===========================================================================
@@ -1439,8 +1611,14 @@ local block_state deflate_stored(s, flush)
FLUSH_BLOCK(s, 0);
}
}
- FLUSH_BLOCK(s, flush == Z_FINISH);
- return flush == Z_FINISH ? finish_done : block_done;
+ s->insert = 0;
+ if (flush == Z_FINISH) {
+ FLUSH_BLOCK(s, 1);
+ return finish_done;
+ }
+ if ((long)s->strstart > s->block_start)
+ FLUSH_BLOCK(s, 0);
+ return block_done;
}
/* ===========================================================================
@@ -1454,7 +1632,7 @@ local block_state deflate_fast(s, flush)
deflate_state *s;
int flush;
{
- IPos hash_head = NIL; /* head of the hash chain */
+ IPos hash_head; /* head of the hash chain */
int bflush; /* set if current block must be flushed */
for (;;) {
@@ -1474,6 +1652,7 @@ local block_state deflate_fast(s, flush)
/* Insert the string window[strstart .. strstart+2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
+ hash_head = NIL;
if (s->lookahead >= MIN_MATCH) {
INSERT_STRING(s, s->strstart, hash_head);
}
@@ -1486,19 +1665,8 @@ local block_state deflate_fast(s, flush)
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
-#ifdef FASTEST
- if ((s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) ||
- (s->strategy == Z_RLE && s->strstart - hash_head == 1)) {
- s->match_length = longest_match_fast (s, hash_head);
- }
-#else
- if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) {
- s->match_length = longest_match (s, hash_head);
- } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) {
- s->match_length = longest_match_fast (s, hash_head);
- }
-#endif
- /* longest_match() or longest_match_fast() sets match_start */
+ s->match_length = longest_match (s, hash_head);
+ /* longest_match() sets match_start */
}
if (s->match_length >= MIN_MATCH) {
check_match(s, s->strstart, s->match_start, s->match_length);
@@ -1546,8 +1714,14 @@ local block_state deflate_fast(s, flush)
}
if (bflush) FLUSH_BLOCK(s, 0);
}
- FLUSH_BLOCK(s, flush == Z_FINISH);
- return flush == Z_FINISH ? finish_done : block_done;
+ s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1;
+ if (flush == Z_FINISH) {
+ FLUSH_BLOCK(s, 1);
+ return finish_done;
+ }
+ if (s->last_lit)
+ FLUSH_BLOCK(s, 0);
+ return block_done;
}
#ifndef FASTEST
@@ -1560,7 +1734,7 @@ local block_state deflate_slow(s, flush)
deflate_state *s;
int flush;
{
- IPos hash_head = NIL; /* head of hash chain */
+ IPos hash_head; /* head of hash chain */
int bflush; /* set if current block must be flushed */
/* Process the input block. */
@@ -1581,6 +1755,7 @@ local block_state deflate_slow(s, flush)
/* Insert the string window[strstart .. strstart+2] in the
* dictionary, and set hash_head to the head of the hash chain:
*/
+ hash_head = NIL;
if (s->lookahead >= MIN_MATCH) {
INSERT_STRING(s, s->strstart, hash_head);
}
@@ -1596,12 +1771,8 @@ local block_state deflate_slow(s, flush)
* of window index 0 (in particular we have to avoid a match
* of the string with itself at the start of the input file).
*/
- if (s->strategy != Z_HUFFMAN_ONLY && s->strategy != Z_RLE) {
- s->match_length = longest_match (s, hash_head);
- } else if (s->strategy == Z_RLE && s->strstart - hash_head == 1) {
- s->match_length = longest_match_fast (s, hash_head);
- }
- /* longest_match() or longest_match_fast() sets match_start */
+ s->match_length = longest_match (s, hash_head);
+ /* longest_match() sets match_start */
if (s->match_length <= 5 && (s->strategy == Z_FILTERED
#if TOO_FAR <= 32767
@@ -1674,12 +1845,17 @@ local block_state deflate_slow(s, flush)
_tr_tally_lit(s, s->window[s->strstart-1], bflush);
s->match_available = 0;
}
- FLUSH_BLOCK(s, flush == Z_FINISH);
- return flush == Z_FINISH ? finish_done : block_done;
+ s->insert = s->strstart < MIN_MATCH-1 ? s->strstart : MIN_MATCH-1;
+ if (flush == Z_FINISH) {
+ FLUSH_BLOCK(s, 1);
+ return finish_done;
+ }
+ if (s->last_lit)
+ FLUSH_BLOCK(s, 0);
+ return block_done;
}
#endif /* FASTEST */
-#if 0
/* ===========================================================================
* For Z_RLE, simply look for runs of bytes, generate matches only of distance
* one. Do not maintain a hash table. (It will be regenerated if this run of
@@ -1689,43 +1865,52 @@ local block_state deflate_rle(s, flush)
deflate_state *s;
int flush;
{
- int bflush; /* set if current block must be flushed */
- uInt run; /* length of run */
- uInt max; /* maximum length of run */
- uInt prev; /* byte at distance one to match */
- Bytef *scan; /* scan for end of run */
+ int bflush; /* set if current block must be flushed */
+ uInt prev; /* byte at distance one to match */
+ Bytef *scan, *strend; /* scan goes up to strend for length of run */
for (;;) {
/* Make sure that we always have enough lookahead, except
* at the end of the input file. We need MAX_MATCH bytes
- * for the longest encodable run.
+ * for the longest run, plus one for the unrolled loop.
*/
- if (s->lookahead < MAX_MATCH) {
+ if (s->lookahead <= MAX_MATCH) {
fill_window(s);
- if (s->lookahead < MAX_MATCH && flush == Z_NO_FLUSH) {
+ if (s->lookahead <= MAX_MATCH && flush == Z_NO_FLUSH) {
return need_more;
}
if (s->lookahead == 0) break; /* flush the current block */
}
/* See how many times the previous byte repeats */
- run = 0;
- if (s->strstart > 0) { /* if there is a previous byte, that is */
- max = s->lookahead < MAX_MATCH ? s->lookahead : MAX_MATCH;
+ s->match_length = 0;
+ if (s->lookahead >= MIN_MATCH && s->strstart > 0) {
scan = s->window + s->strstart - 1;
- prev = *scan++;
- do {
- if (*scan++ != prev)
- break;
- } while (++run < max);
+ prev = *scan;
+ if (prev == *++scan && prev == *++scan && prev == *++scan) {
+ strend = s->window + s->strstart + MAX_MATCH;
+ do {
+ } while (prev == *++scan && prev == *++scan &&
+ prev == *++scan && prev == *++scan &&
+ prev == *++scan && prev == *++scan &&
+ prev == *++scan && prev == *++scan &&
+ scan < strend);
+ s->match_length = MAX_MATCH - (int)(strend - scan);
+ if (s->match_length > s->lookahead)
+ s->match_length = s->lookahead;
+ }
+ Assert(scan <= s->window+(uInt)(s->window_size-1), "wild scan");
}
/* Emit match if have run of MIN_MATCH or longer, else emit literal */
- if (run >= MIN_MATCH) {
- check_match(s, s->strstart, s->strstart - 1, run);
- _tr_tally_dist(s, 1, run - MIN_MATCH, bflush);
- s->lookahead -= run;
- s->strstart += run;
+ if (s->match_length >= MIN_MATCH) {
+ check_match(s, s->strstart, s->strstart - 1, s->match_length);
+
+ _tr_tally_dist(s, 1, s->match_length - MIN_MATCH, bflush);
+
+ s->lookahead -= s->match_length;
+ s->strstart += s->match_length;
+ s->match_length = 0;
} else {
/* No match, output a literal byte */
Tracevv((stderr,"%c", s->window[s->strstart]));
@@ -1735,7 +1920,51 @@ local block_state deflate_rle(s, flush)
}
if (bflush) FLUSH_BLOCK(s, 0);
}
- FLUSH_BLOCK(s, flush == Z_FINISH);
- return flush == Z_FINISH ? finish_done : block_done;
+ s->insert = 0;
+ if (flush == Z_FINISH) {
+ FLUSH_BLOCK(s, 1);
+ return finish_done;
+ }
+ if (s->last_lit)
+ FLUSH_BLOCK(s, 0);
+ return block_done;
+}
+
+/* ===========================================================================
+ * For Z_HUFFMAN_ONLY, do not look for matches. Do not maintain a hash table.
+ * (It will be regenerated if this run of deflate switches away from Huffman.)
+ */
+local block_state deflate_huff(s, flush)
+ deflate_state *s;
+ int flush;
+{
+ int bflush; /* set if current block must be flushed */
+
+ for (;;) {
+ /* Make sure that we have a literal to write. */
+ if (s->lookahead == 0) {
+ fill_window(s);
+ if (s->lookahead == 0) {
+ if (flush == Z_NO_FLUSH)
+ return need_more;
+ break; /* flush the current block */
+ }
+ }
+
+ /* Output a literal byte */
+ s->match_length = 0;
+ Tracevv((stderr,"%c", s->window[s->strstart]));
+ _tr_tally_lit (s, s->window[s->strstart], bflush);
+ s->lookahead--;
+ s->strstart++;
+ if (bflush) FLUSH_BLOCK(s, 0);
+ }
+ s->insert = 0;
+ if (flush == Z_FINISH) {
+ FLUSH_BLOCK(s, 1);
+ return finish_done;
+ }
+ if (s->last_lit)
+ FLUSH_BLOCK(s, 0);
+ return block_done;
}
-#endif
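The hunks above drop the old #if 0 guard around deflate_rle() and add deflate_huff(), so the Z_RLE and Z_HUFFMAN_ONLY strategies now run in dedicated compression loops instead of being funneled through longest_match(). As a caller-side reference, here is a minimal sketch that is not part of the patch; the helper name and buffers are hypothetical, and only documented zlib calls are used:

/* Hypothetical single-shot compression using the Z_RLE strategy; pass
 * Z_HUFFMAN_ONLY as the last argument to exercise deflate_huff() instead. */
#include <string.h>
#include "zlib.h"

static int compress_rle(const unsigned char *src, uLong srclen,
                        unsigned char *dst, uLong *dstlen)
{
    z_stream strm;
    int ret;

    memset(&strm, 0, sizeof(strm));
    ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                       15 /* windowBits */, 8 /* memLevel */, Z_RLE);
    if (ret != Z_OK)
        return ret;

    strm.next_in = (Bytef *)src;
    strm.avail_in = (uInt)srclen;
    strm.next_out = dst;
    strm.avail_out = (uInt)*dstlen;

    ret = deflate(&strm, Z_FINISH);   /* one call; dst must be large enough */
    *dstlen = strm.total_out;
    deflateEnd(&strm);
    return ret == Z_STREAM_END ? Z_OK : Z_BUF_ERROR;
}

deflateParams() can also switch strategy between deflate() calls, which is why the comments above note that the hash table is regenerated when a run of deflate moves away from Z_RLE or Z_HUFFMAN_ONLY.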
diff --git a/erts/emulator/zlib/deflate.h b/erts/emulator/zlib/deflate.h
index 92b037c9d2..ce0299edd1 100644
--- a/erts/emulator/zlib/deflate.h
+++ b/erts/emulator/zlib/deflate.h
@@ -1,10 +1,8 @@
/* deflate.h -- internal compression state
- * Copyright (C) 1995-2004 Jean-loup Gailly
+ * Copyright (C) 1995-2012 Jean-loup Gailly
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
@@ -50,6 +48,9 @@
#define MAX_BITS 15
/* All codes must not exceed MAX_BITS bits */
+#define Buf_size 16
+/* size of bit buffer in bi_buf */
+
#define INIT_STATE 42
#define EXTRA_STATE 69
#define NAME_STATE 73
@@ -103,7 +104,7 @@ typedef struct internal_state {
int wrap; /* bit 0 true for zlib, bit 1 true for gzip */
gz_headerp gzhead; /* gzip header information to write */
uInt gzindex; /* where in extra, name, or comment */
- Byte method; /* STORED (for zip only) or DEFLATED */
+ Byte method; /* can only be DEFLATED */
int last_flush; /* value of flush param for previous deflate call */
/* used by deflate.c: */
@@ -190,7 +191,7 @@ typedef struct internal_state {
int nice_match; /* Stop searching when current match exceeds this */
/* used by trees.c: */
- /* Didn't use ct_data typedef below to supress compiler warning */
+ /* Didn't use ct_data typedef below to suppress compiler warning */
struct ct_data_s dyn_ltree[HEAP_SIZE]; /* literal and length tree */
struct ct_data_s dyn_dtree[2*D_CODES+1]; /* distance tree */
struct ct_data_s bl_tree[2*BL_CODES+1]; /* Huffman tree for bit lengths */
@@ -246,7 +247,7 @@ typedef struct internal_state {
ulg opt_len; /* bit length of current block with optimal trees */
ulg static_len; /* bit length of current block with static trees */
uInt matches; /* number of string matches in current block */
- int last_eob_len; /* bit length of EOB code for last block */
+ uInt insert; /* bytes at end of window left to insert */
#ifdef DEBUG
ulg compressed_len; /* total bit length of compressed file mod 2^32 */
@@ -262,6 +263,13 @@ typedef struct internal_state {
* are always zero.
*/
+ ulg high_water;
+ /* High water mark offset in window for initialized bytes -- bytes above
+ * this are set to zero in order to avoid memory check warnings when
+ * longest match routines access bytes past the input. This is then
+ * updated to the new high water mark.
+ */
+
} FAR deflate_state;
/* Output a byte on the stream.
@@ -280,14 +288,19 @@ typedef struct internal_state {
* distances are limited to MAX_DIST instead of WSIZE.
*/
+#define WIN_INIT MAX_MATCH
+/* Number of bytes after end of data in window to initialize in order to avoid
+ memory checker errors from longest match routines */
+
/* in trees.c */
-void _tr_init OF((deflate_state *s));
-int _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
-void _tr_flush_block OF((deflate_state *s, charf *buf, ulg stored_len,
- int eof));
-void _tr_align OF((deflate_state *s));
-void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len,
- int eof));
+void ZLIB_INTERNAL _tr_init OF((deflate_state *s));
+int ZLIB_INTERNAL _tr_tally OF((deflate_state *s, unsigned dist, unsigned lc));
+void ZLIB_INTERNAL _tr_flush_block OF((deflate_state *s, charf *buf,
+ ulg stored_len, int last));
+void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s));
+void ZLIB_INTERNAL _tr_align OF((deflate_state *s));
+void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
+ ulg stored_len, int last));
#define d_code(dist) \
((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
@@ -300,11 +313,11 @@ void _tr_stored_block OF((deflate_state *s, charf *buf, ulg stored_len,
/* Inline versions of _tr_tally for speed: */
#if defined(GEN_TREES_H) || !defined(STDC)
- extern uch _length_code[];
- extern uch _dist_code[];
+ extern uch ZLIB_INTERNAL _length_code[];
+ extern uch ZLIB_INTERNAL _dist_code[];
#else
- extern const uch _length_code[];
- extern const uch _dist_code[];
+ extern const uch ZLIB_INTERNAL _length_code[];
+ extern const uch ZLIB_INTERNAL _dist_code[];
#endif
# define _tr_tally_lit(s, c, flush) \
diff --git a/erts/emulator/zlib/example.c b/erts/emulator/zlib/example.c
deleted file mode 100644
index ebe828f72d..0000000000
--- a/erts/emulator/zlib/example.c
+++ /dev/null
@@ -1,570 +0,0 @@
-/* example.c -- usage example of the zlib compression library
- * Copyright (C) 1995-2004 Jean-loup Gailly.
- * For conditions of distribution and use, see copyright notice in zlib.h
- */
-
-/* %ExternalCopyright% */
-
-/* @(#) $Id$ */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
-#include <stdio.h>
-#include "zlib.h"
-
-#ifdef STDC
-# include <string.h>
-# include <stdlib.h>
-#endif
-
-#if defined(VMS) || defined(RISCOS)
-# define TESTFILE "foo-gz"
-#else
-# define TESTFILE "foo.gz"
-#endif
-
-#define CHECK_ERR(err, msg) { \
- if (err != Z_OK) { \
- fprintf(stderr, "%s error: %d\n", msg, err); \
- exit(1); \
- } \
-}
-
-const char hello[] = "hello, hello!";
-/* "hello world" would be more standard, but the repeated "hello"
- * stresses the compression code better, sorry...
- */
-
-const char dictionary[] = "hello";
-uLong dictId; /* Adler32 value of the dictionary */
-
-void test_compress OF((Byte *compr, uLong comprLen,
- Byte *uncompr, uLong uncomprLen));
-void test_gzio OF((const char *fname,
- Byte *uncompr, uLong uncomprLen));
-void test_deflate OF((Byte *compr, uLong comprLen));
-void test_inflate OF((Byte *compr, uLong comprLen,
- Byte *uncompr, uLong uncomprLen));
-void test_large_deflate OF((Byte *compr, uLong comprLen,
- Byte *uncompr, uLong uncomprLen));
-void test_large_inflate OF((Byte *compr, uLong comprLen,
- Byte *uncompr, uLong uncomprLen));
-void test_flush OF((Byte *compr, uLong *comprLen));
-void test_sync OF((Byte *compr, uLong comprLen,
- Byte *uncompr, uLong uncomprLen));
-void test_dict_deflate OF((Byte *compr, uLong comprLen));
-void test_dict_inflate OF((Byte *compr, uLong comprLen,
- Byte *uncompr, uLong uncomprLen));
-int main OF((int argc, char *argv[]));
-
-/* ===========================================================================
- * Test compress() and uncompress()
- */
-void test_compress(compr, comprLen, uncompr, uncomprLen)
- Byte *compr, *uncompr;
- uLong comprLen, uncomprLen;
-{
- int err;
- uLong len = (uLong)strlen(hello)+1;
-
- err = compress(compr, &comprLen, (const Bytef*)hello, len);
- CHECK_ERR(err, "compress");
-
- strcpy((char*)uncompr, "garbage");
-
- err = uncompress(uncompr, &uncomprLen, compr, comprLen);
- CHECK_ERR(err, "uncompress");
-
- if (strcmp((char*)uncompr, hello)) {
- fprintf(stderr, "bad uncompress\n");
- exit(1);
- } else {
- printf("uncompress(): %s\n", (char *)uncompr);
- }
-}
-
-/* ===========================================================================
- * Test read/write of .gz files
- */
-void test_gzio(fname, uncompr, uncomprLen)
- const char *fname; /* compressed file name */
- Byte *uncompr;
- uLong uncomprLen;
-{
-#ifdef NO_GZCOMPRESS
- fprintf(stderr, "NO_GZCOMPRESS -- gz* functions cannot compress\n");
-#else
- int err;
- int len = (int)strlen(hello)+1;
- gzFile file;
- z_off_t pos;
-
- file = gzopen(fname, "wb");
- if (file == NULL) {
- fprintf(stderr, "gzopen error\n");
- exit(1);
- }
- gzputc(file, 'h');
- if (gzputs(file, "ello") != 4) {
- fprintf(stderr, "gzputs err: %s\n", gzerror(file, &err));
- exit(1);
- }
- if (gzprintf(file, ", %s!", "hello") != 8) {
- fprintf(stderr, "gzprintf err: %s\n", gzerror(file, &err));
- exit(1);
- }
- gzseek(file, 1L, SEEK_CUR); /* add one zero byte */
- gzclose(file);
-
- file = gzopen(fname, "rb");
- if (file == NULL) {
- fprintf(stderr, "gzopen error\n");
- exit(1);
- }
- strcpy((char*)uncompr, "garbage");
-
- if (gzread(file, uncompr, (unsigned)uncomprLen) != len) {
- fprintf(stderr, "gzread err: %s\n", gzerror(file, &err));
- exit(1);
- }
- if (strcmp((char*)uncompr, hello)) {
- fprintf(stderr, "bad gzread: %s\n", (char*)uncompr);
- exit(1);
- } else {
- printf("gzread(): %s\n", (char*)uncompr);
- }
-
- pos = gzseek(file, -8L, SEEK_CUR);
- if (pos != 6 || gztell(file) != pos) {
- fprintf(stderr, "gzseek error, pos=%ld, gztell=%ld\n",
- (long)pos, (long)gztell(file));
- exit(1);
- }
-
- if (gzgetc(file) != ' ') {
- fprintf(stderr, "gzgetc error\n");
- exit(1);
- }
-
- if (gzungetc(' ', file) != ' ') {
- fprintf(stderr, "gzungetc error\n");
- exit(1);
- }
-
- gzgets(file, (char*)uncompr, (int)uncomprLen);
- if (strlen((char*)uncompr) != 7) { /* " hello!" */
- fprintf(stderr, "gzgets err after gzseek: %s\n", gzerror(file, &err));
- exit(1);
- }
- if (strcmp((char*)uncompr, hello + 6)) {
- fprintf(stderr, "bad gzgets after gzseek\n");
- exit(1);
- } else {
- printf("gzgets() after gzseek: %s\n", (char*)uncompr);
- }
-
- gzclose(file);
-#endif
-}
-
-/* ===========================================================================
- * Test deflate() with small buffers
- */
-void test_deflate(compr, comprLen)
- Byte *compr;
- uLong comprLen;
-{
- z_stream c_stream; /* compression stream */
- int err;
- uLong len = (uLong)strlen(hello)+1;
-
- c_stream.zalloc = (alloc_func)0;
- c_stream.zfree = (free_func)0;
- c_stream.opaque = (voidpf)0;
-
- err = deflateInit(&c_stream, Z_DEFAULT_COMPRESSION);
- CHECK_ERR(err, "deflateInit");
-
- c_stream.next_in = (Bytef*)hello;
- c_stream.next_out = compr;
-
- while (c_stream.total_in != len && c_stream.total_out < comprLen) {
- c_stream.avail_in = c_stream.avail_out = 1; /* force small buffers */
- err = deflate(&c_stream, Z_NO_FLUSH);
- CHECK_ERR(err, "deflate");
- }
- /* Finish the stream, still forcing small buffers: */
- for (;;) {
- c_stream.avail_out = 1;
- err = deflate(&c_stream, Z_FINISH);
- if (err == Z_STREAM_END) break;
- CHECK_ERR(err, "deflate");
- }
-
- err = deflateEnd(&c_stream);
- CHECK_ERR(err, "deflateEnd");
-}
-
-/* ===========================================================================
- * Test inflate() with small buffers
- */
-void test_inflate(compr, comprLen, uncompr, uncomprLen)
- Byte *compr, *uncompr;
- uLong comprLen, uncomprLen;
-{
- int err;
- z_stream d_stream; /* decompression stream */
-
- strcpy((char*)uncompr, "garbage");
-
- d_stream.zalloc = (alloc_func)0;
- d_stream.zfree = (free_func)0;
- d_stream.opaque = (voidpf)0;
-
- d_stream.next_in = compr;
- d_stream.avail_in = 0;
- d_stream.next_out = uncompr;
-
- err = inflateInit(&d_stream);
- CHECK_ERR(err, "inflateInit");
-
- while (d_stream.total_out < uncomprLen && d_stream.total_in < comprLen) {
- d_stream.avail_in = d_stream.avail_out = 1; /* force small buffers */
- err = inflate(&d_stream, Z_NO_FLUSH);
- if (err == Z_STREAM_END) break;
- CHECK_ERR(err, "inflate");
- }
-
- err = inflateEnd(&d_stream);
- CHECK_ERR(err, "inflateEnd");
-
- if (strcmp((char*)uncompr, hello)) {
- fprintf(stderr, "bad inflate\n");
- exit(1);
- } else {
- printf("inflate(): %s\n", (char *)uncompr);
- }
-}
-
-/* ===========================================================================
- * Test deflate() with large buffers and dynamic change of compression level
- */
-void test_large_deflate(compr, comprLen, uncompr, uncomprLen)
- Byte *compr, *uncompr;
- uLong comprLen, uncomprLen;
-{
- z_stream c_stream; /* compression stream */
- int err;
-
- c_stream.zalloc = (alloc_func)0;
- c_stream.zfree = (free_func)0;
- c_stream.opaque = (voidpf)0;
-
- err = deflateInit(&c_stream, Z_BEST_SPEED);
- CHECK_ERR(err, "deflateInit");
-
- c_stream.next_out = compr;
- c_stream.avail_out = (uInt)comprLen;
-
- /* At this point, uncompr is still mostly zeroes, so it should compress
- * very well:
- */
- c_stream.next_in = uncompr;
- c_stream.avail_in = (uInt)uncomprLen;
- err = deflate(&c_stream, Z_NO_FLUSH);
- CHECK_ERR(err, "deflate");
- if (c_stream.avail_in != 0) {
- fprintf(stderr, "deflate not greedy\n");
- exit(1);
- }
-
- /* Feed in already compressed data and switch to no compression: */
- deflateParams(&c_stream, Z_NO_COMPRESSION, Z_DEFAULT_STRATEGY);
- c_stream.next_in = compr;
- c_stream.avail_in = (uInt)comprLen/2;
- err = deflate(&c_stream, Z_NO_FLUSH);
- CHECK_ERR(err, "deflate");
-
- /* Switch back to compressing mode: */
- deflateParams(&c_stream, Z_BEST_COMPRESSION, Z_FILTERED);
- c_stream.next_in = uncompr;
- c_stream.avail_in = (uInt)uncomprLen;
- err = deflate(&c_stream, Z_NO_FLUSH);
- CHECK_ERR(err, "deflate");
-
- err = deflate(&c_stream, Z_FINISH);
- if (err != Z_STREAM_END) {
- fprintf(stderr, "deflate should report Z_STREAM_END\n");
- exit(1);
- }
- err = deflateEnd(&c_stream);
- CHECK_ERR(err, "deflateEnd");
-}
-
-/* ===========================================================================
- * Test inflate() with large buffers
- */
-void test_large_inflate(compr, comprLen, uncompr, uncomprLen)
- Byte *compr, *uncompr;
- uLong comprLen, uncomprLen;
-{
- int err;
- z_stream d_stream; /* decompression stream */
-
- strcpy((char*)uncompr, "garbage");
-
- d_stream.zalloc = (alloc_func)0;
- d_stream.zfree = (free_func)0;
- d_stream.opaque = (voidpf)0;
-
- d_stream.next_in = compr;
- d_stream.avail_in = (uInt)comprLen;
-
- err = inflateInit(&d_stream);
- CHECK_ERR(err, "inflateInit");
-
- for (;;) {
- d_stream.next_out = uncompr; /* discard the output */
- d_stream.avail_out = (uInt)uncomprLen;
- err = inflate(&d_stream, Z_NO_FLUSH);
- if (err == Z_STREAM_END) break;
- CHECK_ERR(err, "large inflate");
- }
-
- err = inflateEnd(&d_stream);
- CHECK_ERR(err, "inflateEnd");
-
- if (d_stream.total_out != 2*uncomprLen + comprLen/2) {
- fprintf(stderr, "bad large inflate: %ld\n", d_stream.total_out);
- exit(1);
- } else {
- printf("large_inflate(): OK\n");
- }
-}
-
-/* ===========================================================================
- * Test deflate() with full flush
- */
-void test_flush(compr, comprLen)
- Byte *compr;
- uLong *comprLen;
-{
- z_stream c_stream; /* compression stream */
- int err;
- uInt len = (uInt)strlen(hello)+1;
-
- c_stream.zalloc = (alloc_func)0;
- c_stream.zfree = (free_func)0;
- c_stream.opaque = (voidpf)0;
-
- err = deflateInit(&c_stream, Z_DEFAULT_COMPRESSION);
- CHECK_ERR(err, "deflateInit");
-
- c_stream.next_in = (Bytef*)hello;
- c_stream.next_out = compr;
- c_stream.avail_in = 3;
- c_stream.avail_out = (uInt)*comprLen;
- err = deflate(&c_stream, Z_FULL_FLUSH);
- CHECK_ERR(err, "deflate");
-
- compr[3]++; /* force an error in first compressed block */
- c_stream.avail_in = len - 3;
-
- err = deflate(&c_stream, Z_FINISH);
- if (err != Z_STREAM_END) {
- CHECK_ERR(err, "deflate");
- }
- err = deflateEnd(&c_stream);
- CHECK_ERR(err, "deflateEnd");
-
- *comprLen = c_stream.total_out;
-}
-
-/* ===========================================================================
- * Test inflateSync()
- */
-void test_sync(compr, comprLen, uncompr, uncomprLen)
- Byte *compr, *uncompr;
- uLong comprLen, uncomprLen;
-{
- int err;
- z_stream d_stream; /* decompression stream */
-
- strcpy((char*)uncompr, "garbage");
-
- d_stream.zalloc = (alloc_func)0;
- d_stream.zfree = (free_func)0;
- d_stream.opaque = (voidpf)0;
-
- d_stream.next_in = compr;
- d_stream.avail_in = 2; /* just read the zlib header */
-
- err = inflateInit(&d_stream);
- CHECK_ERR(err, "inflateInit");
-
- d_stream.next_out = uncompr;
- d_stream.avail_out = (uInt)uncomprLen;
-
- inflate(&d_stream, Z_NO_FLUSH);
- CHECK_ERR(err, "inflate");
-
- d_stream.avail_in = (uInt)comprLen-2; /* read all compressed data */
- err = inflateSync(&d_stream); /* but skip the damaged part */
- CHECK_ERR(err, "inflateSync");
-
- err = inflate(&d_stream, Z_FINISH);
- if (err != Z_DATA_ERROR) {
- fprintf(stderr, "inflate should report DATA_ERROR\n");
- /* Because of incorrect adler32 */
- exit(1);
- }
- err = inflateEnd(&d_stream);
- CHECK_ERR(err, "inflateEnd");
-
- printf("after inflateSync(): hel%s\n", (char *)uncompr);
-}
-
-/* ===========================================================================
- * Test deflate() with preset dictionary
- */
-void test_dict_deflate(compr, comprLen)
- Byte *compr;
- uLong comprLen;
-{
- z_stream c_stream; /* compression stream */
- int err;
-
- c_stream.zalloc = (alloc_func)0;
- c_stream.zfree = (free_func)0;
- c_stream.opaque = (voidpf)0;
-
- err = deflateInit(&c_stream, Z_BEST_COMPRESSION);
- CHECK_ERR(err, "deflateInit");
-
- err = deflateSetDictionary(&c_stream,
- (const Bytef*)dictionary, sizeof(dictionary));
- CHECK_ERR(err, "deflateSetDictionary");
-
- dictId = c_stream.adler;
- c_stream.next_out = compr;
- c_stream.avail_out = (uInt)comprLen;
-
- c_stream.next_in = (Bytef*)hello;
- c_stream.avail_in = (uInt)strlen(hello)+1;
-
- err = deflate(&c_stream, Z_FINISH);
- if (err != Z_STREAM_END) {
- fprintf(stderr, "deflate should report Z_STREAM_END\n");
- exit(1);
- }
- err = deflateEnd(&c_stream);
- CHECK_ERR(err, "deflateEnd");
-}
-
-/* ===========================================================================
- * Test inflate() with a preset dictionary
- */
-void test_dict_inflate(compr, comprLen, uncompr, uncomprLen)
- Byte *compr, *uncompr;
- uLong comprLen, uncomprLen;
-{
- int err;
- z_stream d_stream; /* decompression stream */
-
- strcpy((char*)uncompr, "garbage");
-
- d_stream.zalloc = (alloc_func)0;
- d_stream.zfree = (free_func)0;
- d_stream.opaque = (voidpf)0;
-
- d_stream.next_in = compr;
- d_stream.avail_in = (uInt)comprLen;
-
- err = inflateInit(&d_stream);
- CHECK_ERR(err, "inflateInit");
-
- d_stream.next_out = uncompr;
- d_stream.avail_out = (uInt)uncomprLen;
-
- for (;;) {
- err = inflate(&d_stream, Z_NO_FLUSH);
- if (err == Z_STREAM_END) break;
- if (err == Z_NEED_DICT) {
- if (d_stream.adler != dictId) {
- fprintf(stderr, "unexpected dictionary");
- exit(1);
- }
- err = inflateSetDictionary(&d_stream, (const Bytef*)dictionary,
- sizeof(dictionary));
- }
- CHECK_ERR(err, "inflate with dict");
- }
-
- err = inflateEnd(&d_stream);
- CHECK_ERR(err, "inflateEnd");
-
- if (strcmp((char*)uncompr, hello)) {
- fprintf(stderr, "bad inflate with dict\n");
- exit(1);
- } else {
- printf("inflate with dictionary: %s\n", (char *)uncompr);
- }
-}
-
-/* ===========================================================================
- * Usage: example [output.gz [input.gz]]
- */
-
-int main(argc, argv)
- int argc;
- char *argv[];
-{
- Byte *compr, *uncompr;
- uLong comprLen = 10000*sizeof(int); /* don't overflow on MSDOS */
- uLong uncomprLen = comprLen;
- static const char* myVersion = ZLIB_VERSION;
-
- if (zlibVersion()[0] != myVersion[0]) {
- fprintf(stderr, "incompatible zlib version\n");
- exit(1);
-
- } else if (strcmp(zlibVersion(), ZLIB_VERSION) != 0) {
- fprintf(stderr, "warning: different zlib version\n");
- }
-
- printf("zlib version %s = 0x%04x, compile flags = 0x%lx\n",
- ZLIB_VERSION, ZLIB_VERNUM, zlibCompileFlags());
-
- compr = (Byte*)calloc((uInt)comprLen, 1);
- uncompr = (Byte*)calloc((uInt)uncomprLen, 1);
- /* compr and uncompr are cleared to avoid reading uninitialized
- * data and to ensure that uncompr compresses well.
- */
- if (compr == Z_NULL || uncompr == Z_NULL) {
- printf("out of memory\n");
- exit(1);
- }
- test_compress(compr, comprLen, uncompr, uncomprLen);
-
- test_gzio((argc > 1 ? argv[1] : TESTFILE),
- uncompr, uncomprLen);
-
- test_deflate(compr, comprLen);
- test_inflate(compr, comprLen, uncompr, uncomprLen);
-
- test_large_deflate(compr, comprLen, uncompr, uncomprLen);
- test_large_inflate(compr, comprLen, uncompr, uncomprLen);
-
- test_flush(compr, &comprLen);
- test_sync(compr, comprLen, uncompr, uncomprLen);
- comprLen = uncomprLen;
-
- test_dict_deflate(compr, comprLen);
- test_dict_inflate(compr, comprLen, uncompr, uncomprLen);
-
- free(compr);
- free(uncompr);
-
- return 0;
-}
diff --git a/erts/emulator/zlib/gzguts.h b/erts/emulator/zlib/gzguts.h
new file mode 100644
index 0000000000..d87659d031
--- /dev/null
+++ b/erts/emulator/zlib/gzguts.h
@@ -0,0 +1,209 @@
+/* gzguts.h -- zlib internal header definitions for gz* operations
+ * Copyright (C) 2004, 2005, 2010, 2011, 2012, 2013 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ */
+
+#ifdef _LARGEFILE64_SOURCE
+# ifndef _LARGEFILE_SOURCE
+# define _LARGEFILE_SOURCE 1
+# endif
+# ifdef _FILE_OFFSET_BITS
+# undef _FILE_OFFSET_BITS
+# endif
+#endif
+
+#ifdef HAVE_HIDDEN
+# define ZLIB_INTERNAL __attribute__((visibility ("hidden")))
+#else
+# define ZLIB_INTERNAL
+#endif
+
+#include <stdio.h>
+#include "zlib.h"
+#ifdef STDC
+# include <string.h>
+# include <stdlib.h>
+# include <limits.h>
+#endif
+#include <fcntl.h>
+
+#ifdef _WIN32
+# include <stddef.h>
+#endif
+
+#if defined(__TURBOC__) || defined(_MSC_VER) || defined(_WIN32)
+# include <io.h>
+#endif
+
+#ifdef WINAPI_FAMILY
+# define open _open
+# define read _read
+# define write _write
+# define close _close
+#endif
+
+#ifdef NO_DEFLATE /* for compatibility with old definition */
+# define NO_GZCOMPRESS
+#endif
+
+#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550)
+# ifndef HAVE_VSNPRINTF
+# define HAVE_VSNPRINTF
+# endif
+#endif
+
+#if defined(__CYGWIN__)
+# ifndef HAVE_VSNPRINTF
+# define HAVE_VSNPRINTF
+# endif
+#endif
+
+#if defined(MSDOS) && defined(__BORLANDC__) && (BORLANDC > 0x410)
+# ifndef HAVE_VSNPRINTF
+# define HAVE_VSNPRINTF
+# endif
+#endif
+
+#ifndef HAVE_VSNPRINTF
+# ifdef MSDOS
+/* vsnprintf may exist on some MS-DOS compilers (DJGPP?),
+ but for now we just assume it doesn't. */
+# define NO_vsnprintf
+# endif
+# ifdef __TURBOC__
+# define NO_vsnprintf
+# endif
+# ifdef WIN32
+/* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
+# if !defined(vsnprintf) && !defined(NO_vsnprintf)
+# if !defined(_MSC_VER) || ( defined(_MSC_VER) && _MSC_VER < 1500 )
+# define vsnprintf _vsnprintf
+# endif
+# endif
+# endif
+# ifdef __SASC
+# define NO_vsnprintf
+# endif
+# ifdef VMS
+# define NO_vsnprintf
+# endif
+# ifdef __OS400__
+# define NO_vsnprintf
+# endif
+# ifdef __MVS__
+# define NO_vsnprintf
+# endif
+#endif
+
+/* unlike snprintf (which is required in C99, yet still not supported by
+ Microsoft more than a decade later!), _snprintf does not guarantee null
+ termination of the result -- however this is only used in gzlib.c where
+ the result is assured to fit in the space provided */
+#ifdef _MSC_VER
+# define snprintf _snprintf
+#endif
+
+#ifndef local
+# define local static
+#endif
+/* compile with -Dlocal if your debugger can't find static symbols */
+
+/* gz* functions always use library allocation functions */
+#ifndef STDC
+ extern voidp malloc OF((uInt size));
+ extern void free OF((voidpf ptr));
+#endif
+
+/* get errno and strerror definition */
+#if defined UNDER_CE
+# include <windows.h>
+# define zstrerror() gz_strwinerror((DWORD)GetLastError())
+#else
+# ifndef NO_STRERROR
+# include <errno.h>
+# define zstrerror() strerror(errno)
+# else
+# define zstrerror() "stdio error (consult errno)"
+# endif
+#endif
+
+/* provide prototypes for these when building zlib without LFS */
+#if !defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0
+ ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+ ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
+ ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
+ ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
+#endif
+
+/* default memLevel */
+#if MAX_MEM_LEVEL >= 8
+# define DEF_MEM_LEVEL 8
+#else
+# define DEF_MEM_LEVEL MAX_MEM_LEVEL
+#endif
+
+/* default i/o buffer size -- double this for output when reading (this and
+ twice this must be able to fit in an unsigned type) */
+#define GZBUFSIZE 8192
+
+/* gzip modes, also provide a little integrity check on the passed structure */
+#define GZ_NONE 0
+#define GZ_READ 7247
+#define GZ_WRITE 31153
+#define GZ_APPEND 1 /* mode set to GZ_WRITE after the file is opened */
+
+/* values for gz_state how */
+#define LOOK 0 /* look for a gzip header */
+#define COPY 1 /* copy input directly */
+#define GZIP 2 /* decompress a gzip stream */
+
+/* internal gzip file state data structure */
+typedef struct {
+ /* exposed contents for gzgetc() macro */
+ struct gzFile_s x; /* "x" for exposed */
+ /* x.have: number of bytes available at x.next */
+ /* x.next: next output data to deliver or write */
+ /* x.pos: current position in uncompressed data */
+ /* used for both reading and writing */
+ int mode; /* see gzip modes above */
+ int fd; /* file descriptor */
+ char *path; /* path or fd for error messages */
+ unsigned size; /* buffer size, zero if not allocated yet */
+ unsigned want; /* requested buffer size, default is GZBUFSIZE */
+ unsigned char *in; /* input buffer */
+ unsigned char *out; /* output buffer (double-sized when reading) */
+ int direct; /* 0 if processing gzip, 1 if transparent */
+ /* just for reading */
+ int how; /* 0: get header, 1: copy, 2: decompress */
+ z_off64_t start; /* where the gzip data started, for rewinding */
+ int eof; /* true if end of input file reached */
+ int past; /* true if read requested past end */
+ /* just for writing */
+ int level; /* compression level */
+ int strategy; /* compression strategy */
+ /* seek request */
+ z_off64_t skip; /* amount to skip (already rewound if backwards) */
+ int seek; /* true if seek request pending */
+ /* error information */
+ int err; /* error code */
+ char *msg; /* error message */
+ /* zlib inflate or deflate stream */
+ z_stream strm; /* stream structure in-place (not a pointer) */
+} gz_state;
+typedef gz_state FAR *gz_statep;
+
+/* shared functions */
+void ZLIB_INTERNAL gz_error OF((gz_statep, int, const char *));
+#if defined UNDER_CE
+char ZLIB_INTERNAL *gz_strwinerror OF((DWORD error));
+#endif
+
+/* GT_OFF(x), where x is an unsigned value, is true if x > maximum z_off64_t
+ value -- needed when comparing unsigned to z_off64_t, which is signed
+ (possible z_off64_t types off_t, off64_t, and long are all signed) */
+#ifdef INT_MAX
+# define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > INT_MAX)
+#else
+unsigned ZLIB_INTERNAL gz_intmax OF((void));
+# define GT_OFF(x) (sizeof(int) == sizeof(z_off64_t) && (x) > gz_intmax())
+#endif
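gzguts.h is internal only, but the gz_state it declares is what backs the public gz* file API. For orientation, a hedged round-trip sketch using only documented zlib calls follows; the helper name and file path are hypothetical and not part of the patch:

#include <stdio.h>
#include "zlib.h"

static int gz_roundtrip(const char *path)
{
    char buf[64];
    gzFile f = gzopen(path, "wb");   /* allocates a gz_state in GZ_WRITE mode */
    if (f == NULL)
        return -1;
    if (gzputs(f, "hello, zlib\n") < 0) {
        gzclose(f);
        return -1;
    }
    gzclose(f);

    f = gzopen(path, "rb");          /* GZ_READ mode; starts in LOOK state */
    if (f == NULL)
        return -1;
    if (gzgets(f, buf, (int)sizeof(buf)) == NULL) {
        gzclose(f);
        return -1;
    }
    printf("read back: %s", buf);    /* buf keeps the trailing newline */
    return gzclose(f) == Z_OK ? 0 : -1;
}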
diff --git a/erts/emulator/zlib/inffast.c b/erts/emulator/zlib/inffast.c
index eb81884888..5187743fde 100644
--- a/erts/emulator/zlib/inffast.c
+++ b/erts/emulator/zlib/inffast.c
@@ -1,10 +1,8 @@
/* inffast.c -- fast decoding
- * Copyright (C) 1995-2004 Mark Adler
+ * Copyright (C) 1995-2008, 2010, 2013 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
@@ -69,13 +67,13 @@
requires strm->avail_out >= 258 for each loop to avoid checking for
output space.
*/
-void inflate_fast(strm, start)
+void ZLIB_INTERNAL inflate_fast(strm, start)
z_streamp strm;
unsigned start; /* inflate()'s starting value for strm->avail_out */
{
struct inflate_state FAR *state;
- unsigned char FAR *in; /* local strm->next_in */
- unsigned char FAR *last; /* while in < last, enough input available */
+ z_const unsigned char FAR *in; /* local strm->next_in */
+ z_const unsigned char FAR *last; /* have enough input while in < last */
unsigned char FAR *out; /* local strm->next_out */
unsigned char FAR *beg; /* inflate()'s initial strm->next_out */
unsigned char FAR *end; /* while out < end, enough space available */
@@ -84,7 +82,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
#endif
unsigned wsize; /* window size or zero if not using window */
unsigned whave; /* valid bytes in the window */
- unsigned write; /* window write index */
+ unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if wsize != 0 */
unsigned long hold; /* local strm->hold */
unsigned bits; /* local strm->bits */
@@ -92,7 +90,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
code const FAR *dcode; /* local strm->distcode */
unsigned lmask; /* mask for first level of length codes */
unsigned dmask; /* mask for first level of distance codes */
- code this; /* retrieved table entry */
+ code here; /* retrieved table entry */
unsigned op; /* code bits, operation, extra bits, or */
/* window position, window bytes to copy */
unsigned len; /* match length, unused bytes */
@@ -111,7 +109,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
#endif
wsize = state->wsize;
whave = state->whave;
- write = state->write;
+ wnext = state->wnext;
window = state->window;
hold = state->hold;
bits = state->bits;
@@ -129,20 +127,20 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
hold += (unsigned long)(PUP(in)) << bits;
bits += 8;
}
- this = lcode[hold & lmask];
+ here = lcode[hold & lmask];
dolen:
- op = (unsigned)(this.bits);
+ op = (unsigned)(here.bits);
hold >>= op;
bits -= op;
- op = (unsigned)(this.op);
+ op = (unsigned)(here.op);
if (op == 0) { /* literal */
- Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ?
+ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
"inflate: literal '%c'\n" :
- "inflate: literal 0x%02x\n", this.val));
- PUP(out) = (unsigned char)(this.val);
+ "inflate: literal 0x%02x\n", here.val));
+ PUP(out) = (unsigned char)(here.val);
}
else if (op & 16) { /* length base */
- len = (unsigned)(this.val);
+ len = (unsigned)(here.val);
op &= 15; /* number of extra bits */
if (op) {
if (bits < op) {
@@ -160,14 +158,14 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
hold += (unsigned long)(PUP(in)) << bits;
bits += 8;
}
- this = dcode[hold & dmask];
+ here = dcode[hold & dmask];
dodist:
- op = (unsigned)(this.bits);
+ op = (unsigned)(here.bits);
hold >>= op;
bits -= op;
- op = (unsigned)(this.op);
+ op = (unsigned)(here.op);
if (op & 16) { /* distance base */
- dist = (unsigned)(this.val);
+ dist = (unsigned)(here.val);
op &= 15; /* number of extra bits */
if (bits < op) {
hold += (unsigned long)(PUP(in)) << bits;
@@ -192,12 +190,34 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
if (dist > op) { /* see if copy from window */
op = dist - op; /* distance back in window */
if (op > whave) {
- strm->msg = (char *)"invalid distance too far back";
- state->mode = BAD;
- break;
+ if (state->sane) {
+ strm->msg =
+ (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+ if (len <= op - whave) {
+ do {
+ PUP(out) = 0;
+ } while (--len);
+ continue;
+ }
+ len -= op - whave;
+ do {
+ PUP(out) = 0;
+ } while (--op > whave);
+ if (op == 0) {
+ from = out - dist;
+ do {
+ PUP(out) = PUP(from);
+ } while (--len);
+ continue;
+ }
+#endif
}
from = window - OFF;
- if (write == 0) { /* very common case */
+ if (wnext == 0) { /* very common case */
from += wsize - op;
if (op < len) { /* some from window */
len -= op;
@@ -207,17 +227,17 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
from = out - dist; /* rest from output */
}
}
- else if (write < op) { /* wrap around window */
- from += wsize + write - op;
- op -= write;
+ else if (wnext < op) { /* wrap around window */
+ from += wsize + wnext - op;
+ op -= wnext;
if (op < len) { /* some from end of window */
len -= op;
do {
PUP(out) = PUP(from);
} while (--op);
from = window - OFF;
- if (write < len) { /* some from start of window */
- op = write;
+ if (wnext < len) { /* some from start of window */
+ op = wnext;
len -= op;
do {
PUP(out) = PUP(from);
@@ -227,7 +247,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
}
}
else { /* contiguous in window */
- from += write - op;
+ from += wnext - op;
if (op < len) { /* some from window */
len -= op;
do {
@@ -264,7 +284,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
}
}
else if ((op & 64) == 0) { /* 2nd level distance code */
- this = dcode[this.val + (hold & ((1U << op) - 1))];
+ here = dcode[here.val + (hold & ((1U << op) - 1))];
goto dodist;
}
else {
@@ -274,7 +294,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
}
}
else if ((op & 64) == 0) { /* 2nd level length code */
- this = lcode[this.val + (hold & ((1U << op) - 1))];
+ here = lcode[here.val + (hold & ((1U << op) - 1))];
goto dolen;
}
else if (op & 32) { /* end-of-block */
@@ -310,7 +330,7 @@ unsigned start; /* inflate()'s starting value for strm->avail_out */
inflate_fast() speedups that turned out slower (on a PowerPC G3 750CXe):
- Using bit fields for code structure
- Different op definition to avoid & for extra bits (do & for table bits)
- - Three separate decoding do-loops for direct, window, and write == 0
+ - Three separate decoding do-loops for direct, window, and wnext == 0
- Special case for distance > 1 copies to do overlapped load and store copy
- Explicit branch predictions (based on measured branch probabilities)
- Deferring match copy and interspersed it with decoding subsequent codes
diff --git a/erts/emulator/zlib/inffast.h b/erts/emulator/zlib/inffast.h
index 623ed83c08..e5c1aa4ca8 100644
--- a/erts/emulator/zlib/inffast.h
+++ b/erts/emulator/zlib/inffast.h
@@ -1,13 +1,11 @@
/* inffast.h -- header to use inffast.c
- * Copyright (C) 1995-2003 Mark Adler
+ * Copyright (C) 1995-2003, 2010 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
*/
-void inflate_fast OF((z_streamp strm, unsigned start));
+void ZLIB_INTERNAL inflate_fast OF((z_streamp strm, unsigned start));
diff --git a/erts/emulator/zlib/inffixed.h b/erts/emulator/zlib/inffixed.h
index 75ed4b5978..d628327769 100644
--- a/erts/emulator/zlib/inffixed.h
+++ b/erts/emulator/zlib/inffixed.h
@@ -2,9 +2,9 @@
* Generated automatically by makefixed().
*/
- /* WARNING: this file should *not* be used by applications. It
- is part of the implementation of the compression library and
- is subject to change. Applications should only use zlib.h.
+ /* WARNING: this file should *not* be used by applications.
+ It is part of the implementation of this library and is
+ subject to change. Applications should only use zlib.h.
*/
static const code lenfix[512] = {
diff --git a/erts/emulator/zlib/inflate.c b/erts/emulator/zlib/inflate.c
index 1764447c66..532330b06b 100644
--- a/erts/emulator/zlib/inflate.c
+++ b/erts/emulator/zlib/inflate.c
@@ -1,13 +1,8 @@
/* inflate.c -- zlib decompression
- * Copyright (C) 1995-2005 Mark Adler
+ * Copyright (C) 1995-2012 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
-#ifdef HAVE_CONFIG_H
-# include "config.h"
-#endif
/*
* Change history:
*
@@ -50,7 +45,7 @@
* - Rearrange window copies in inflate_fast() for speed and simplification
* - Unroll last copy for window match in inflate_fast()
* - Use local copies of window variables in inflate_fast() for speed
- * - Pull out common write == 0 case for speed in inflate_fast()
+ * - Pull out common wnext == 0 case for speed in inflate_fast()
* - Make op and len in inflate_fast() unsigned for consistency
* - Add FAR to lcode and dcode declarations in inflate_fast()
* - Simplified bad distance check in inflate_fast()
@@ -85,6 +80,9 @@
* The history for versions after 1.2.0 are in ChangeLog in zlib distribution.
*/
+#ifdef HAVE_CONFIG_H
+# include "config.h"
+#endif
#include "zutil.h"
#include "inftrees.h"
#include "inflate.h"
@@ -98,14 +96,15 @@
/* function prototypes */
local void fixedtables OF((struct inflate_state FAR *state));
-local int updatewindow OF((z_streamp strm, unsigned out));
+local int updatewindow OF((z_streamp strm, const unsigned char FAR *end,
+ unsigned copy));
#ifdef BUILDFIXED
void makefixed OF((void));
#endif
-local unsigned syncsearch OF((unsigned FAR *have, unsigned char FAR *buf,
+local unsigned syncsearch OF((unsigned FAR *have, const unsigned char FAR *buf,
unsigned len));
-int ZEXPORT inflateReset(strm)
+int ZEXPORT inflateResetKeep(strm)
z_streamp strm;
{
struct inflate_state FAR *state;
@@ -114,36 +113,71 @@ z_streamp strm;
state = (struct inflate_state FAR *)strm->state;
strm->total_in = strm->total_out = state->total = 0;
strm->msg = Z_NULL;
- strm->adler = 1; /* to support ill-conceived Java test suite */
+ if (state->wrap) /* to support ill-conceived Java test suite */
+ strm->adler = state->wrap & 1;
state->mode = HEAD;
state->last = 0;
state->havedict = 0;
state->dmax = 32768U;
state->head = Z_NULL;
- state->wsize = 0;
- state->whave = 0;
- state->write = 0;
state->hold = 0;
state->bits = 0;
state->lencode = state->distcode = state->next = state->codes;
+ state->sane = 1;
+ state->back = -1;
Tracev((stderr, "inflate: reset\n"));
return Z_OK;
}
-int ZEXPORT inflatePrime(strm, bits, value)
+int ZEXPORT inflateReset(strm)
z_streamp strm;
-int bits;
-int value;
{
struct inflate_state FAR *state;
if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
state = (struct inflate_state FAR *)strm->state;
- if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR;
- value &= (1L << bits) - 1;
- state->hold += value << state->bits;
- state->bits += bits;
- return Z_OK;
+ state->wsize = 0;
+ state->whave = 0;
+ state->wnext = 0;
+ return inflateResetKeep(strm);
+}
+
+int ZEXPORT inflateReset2(strm, windowBits)
+z_streamp strm;
+int windowBits;
+{
+ int wrap;
+ struct inflate_state FAR *state;
+
+ /* get the state */
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+
+ /* extract wrap request from windowBits parameter */
+ if (windowBits < 0) {
+ wrap = 0;
+ windowBits = -windowBits;
+ }
+ else {
+ wrap = (windowBits >> 4) + 1;
+#ifdef GUNZIP
+ if (windowBits < 48)
+ windowBits &= 15;
+#endif
+ }
+
+ /* set number of window bits, free window if different */
+ if (windowBits && (windowBits < 8 || windowBits > 15))
+ return Z_STREAM_ERROR;
+ if (state->window != Z_NULL && state->wbits != (unsigned)windowBits) {
+ ZFREE(strm, state->window);
+ state->window = Z_NULL;
+ }
+
+ /* update state and reset the rest of it */
+ state->wrap = wrap;
+ state->wbits = (unsigned)windowBits;
+ return inflateReset(strm);
}
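inflateReset2() reuses the windowBits encoding documented for inflateInit2(): a negative value selects a raw deflate stream, adding 16 selects a gzip wrapper, adding 32 enables zlib/gzip auto-detection, and the low bits give the window size, which is how the wrap value is derived above. A hedged caller-side sketch, with a hypothetical helper name:

#include <string.h>
#include "zlib.h"

static int open_inflater(z_stream *strm, int kind)
{
    int windowBits;

    memset(strm, 0, sizeof(*strm));  /* zalloc/zfree/opaque = 0: use defaults */
    switch (kind) {
    case 0:  windowBits = 15;       break;  /* zlib-wrapped data (wrap = 1) */
    case 1:  windowBits = -15;      break;  /* raw deflate, no wrapper (wrap = 0) */
    case 2:  windowBits = 15 + 16;  break;  /* gzip-wrapped data (wrap = 2) */
    default: windowBits = 15 + 32;  break;  /* auto-detect zlib or gzip */
    }
    return inflateInit2(strm, windowBits);
}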
int ZEXPORT inflateInit2_(strm, windowBits, version, stream_size)
@@ -152,6 +186,7 @@ int windowBits;
const char *version;
int stream_size;
{
+ int ret;
struct inflate_state FAR *state;
if (version == Z_NULL || version[0] != ZLIB_VERSION[0] ||
@@ -160,33 +195,31 @@ int stream_size;
if (strm == Z_NULL) return Z_STREAM_ERROR;
strm->msg = Z_NULL; /* in case we return an error */
if (strm->zalloc == (alloc_func)0) {
+#ifdef Z_SOLO
+ return Z_STREAM_ERROR;
+#else
strm->zalloc = zcalloc;
strm->opaque = (voidpf)0;
+#endif
}
- if (strm->zfree == (free_func)0) strm->zfree = zcfree;
+ if (strm->zfree == (free_func)0)
+#ifdef Z_SOLO
+ return Z_STREAM_ERROR;
+#else
+ strm->zfree = zcfree;
+#endif
state = (struct inflate_state FAR *)
ZALLOC(strm, 1, sizeof(struct inflate_state));
if (state == Z_NULL) return Z_MEM_ERROR;
Tracev((stderr, "inflate: allocated\n"));
strm->state = (struct internal_state FAR *)state;
- if (windowBits < 0) {
- state->wrap = 0;
- windowBits = -windowBits;
- }
- else {
- state->wrap = (windowBits >> 4) + 1;
-#ifdef GUNZIP
- if (windowBits < 48) windowBits &= 15;
-#endif
- }
- if (windowBits < 8 || windowBits > 15) {
+ state->window = Z_NULL;
+ ret = inflateReset2(strm, windowBits);
+ if (ret != Z_OK) {
ZFREE(strm, state);
strm->state = Z_NULL;
- return Z_STREAM_ERROR;
}
- state->wbits = (unsigned)windowBits;
- state->window = Z_NULL;
- return inflateReset(strm);
+ return ret;
}
int ZEXPORT inflateInit_(strm, version, stream_size)
@@ -197,6 +230,27 @@ int stream_size;
return inflateInit2_(strm, DEF_WBITS, version, stream_size);
}
+int ZEXPORT inflatePrime(strm, bits, value)
+z_streamp strm;
+int bits;
+int value;
+{
+ struct inflate_state FAR *state;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ if (bits < 0) {
+ state->hold = 0;
+ state->bits = 0;
+ return Z_OK;
+ }
+ if (bits > 16 || state->bits + bits > 32) return Z_STREAM_ERROR;
+ value &= (1L << bits) - 1;
+ state->hold += value << state->bits;
+ state->bits += bits;
+ return Z_OK;
+}
+
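inflatePrime() now also accepts a negative bit count to discard the bit buffer, and it is typically paired with a raw-mode reset when decompression resumes in the middle of a byte, as in random access into a gzip member. A hedged sketch follows; the helper and parameter names are hypothetical:

#include "zlib.h"

static int resume_raw(z_stream *strm, int leftover_bits, int leftover_value)
{
    int ret = inflateReset2(strm, -15);   /* raw deflate, no header expected */
    if (ret != Z_OK)
        return ret;
    if (leftover_bits > 0)                /* re-inject bits saved from the
                                             previous stopping point */
        ret = inflatePrime(strm, leftover_bits, leftover_value);
    return ret;
}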
/*
Return state with length and distance decoding tables and index sizes set to
fixed code decoding. Normally this returns fixed tables from inffixed.h.
@@ -291,8 +345,8 @@ void makefixed()
low = 0;
for (;;) {
if ((low % 7) == 0) printf("\n ");
- printf("{%u,%u,%d}", state.lencode[low].op, state.lencode[low].bits,
- state.lencode[low].val);
+ printf("{%u,%u,%d}", (low & 127) == 99 ? 64 : state.lencode[low].op,
+ state.lencode[low].bits, state.lencode[low].val);
if (++low == size) break;
putchar(',');
}
@@ -325,12 +379,13 @@ void makefixed()
output will fall in the output data, making match copies simpler and faster.
The advantage may be dependent on the size of the processor's data caches.
*/
-local int updatewindow(strm, out)
+local int updatewindow(strm, end, copy)
z_streamp strm;
-unsigned out;
+const Bytef *end;
+unsigned copy;
{
struct inflate_state FAR *state;
- unsigned copy, dist;
+ unsigned dist;
state = (struct inflate_state FAR *)strm->state;
@@ -345,30 +400,29 @@ unsigned out;
/* if window not in use yet, initialize */
if (state->wsize == 0) {
state->wsize = 1U << state->wbits;
- state->write = 0;
+ state->wnext = 0;
state->whave = 0;
}
/* copy state->wsize or less output bytes into the circular window */
- copy = out - strm->avail_out;
if (copy >= state->wsize) {
- zmemcpy(state->window, strm->next_out - state->wsize, state->wsize);
- state->write = 0;
+ zmemcpy(state->window, end - state->wsize, state->wsize);
+ state->wnext = 0;
state->whave = state->wsize;
}
else {
- dist = state->wsize - state->write;
+ dist = state->wsize - state->wnext;
if (dist > copy) dist = copy;
- zmemcpy(state->window + state->write, strm->next_out - copy, dist);
+ zmemcpy(state->window + state->wnext, end - copy, dist);
copy -= dist;
if (copy) {
- zmemcpy(state->window, strm->next_out - copy, copy);
- state->write = copy;
+ zmemcpy(state->window, end - copy, copy);
+ state->wnext = copy;
state->whave = state->wsize;
}
else {
- state->write += dist;
- if (state->write == state->wsize) state->write = 0;
+ state->wnext += dist;
+ if (state->wnext == state->wsize) state->wnext = 0;
if (state->whave < state->wsize) state->whave += dist;
}
}
@@ -469,11 +523,6 @@ unsigned out;
bits -= bits & 7; \
} while (0)
-/* Reverse the bytes in a 32-bit value */
-#define REVERSE(q) \
- ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
- (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
-
/*
inflate() uses a state machine to process as much input data and generate as
much output data as possible before returning. The state machine is
@@ -561,7 +610,7 @@ z_streamp strm;
int flush;
{
struct inflate_state FAR *state;
- unsigned char FAR *next; /* next input */
+ z_const unsigned char FAR *next; /* next input */
unsigned char FAR *put; /* next output */
unsigned have, left; /* available input and output */
unsigned long hold; /* bit buffer */
@@ -569,7 +618,7 @@ int flush;
unsigned in, out; /* save starting available input and output */
unsigned copy; /* number of stored or match bytes to copy */
unsigned char FAR *from; /* where to copy match bytes from */
- code this; /* current decoding table entry */
+ code here; /* current decoding table entry */
code last; /* parent table entry */
unsigned len; /* length to copy for repeats, bits to drop */
int ret; /* return code */
@@ -624,7 +673,9 @@ int flush;
}
DROPBITS(4);
len = BITS(4) + 8;
- if (len > state->wbits) {
+ if (state->wbits == 0)
+ state->wbits = len;
+ else if (len > state->wbits) {
strm->msg = (char *)"invalid window size";
state->mode = BAD;
break;
@@ -765,7 +816,7 @@ int flush;
#endif
case DICTID:
NEEDBITS(32);
- strm->adler = state->check = REVERSE(hold);
+ strm->adler = state->check = ZSWAP32(hold);
INITBITS();
state->mode = DICT;
case DICT:
@@ -776,7 +827,7 @@ int flush;
strm->adler = state->check = adler32(0L, Z_NULL, 0);
state->mode = TYPE;
case TYPE:
- if (flush == Z_BLOCK) goto inf_leave;
+ if (flush == Z_BLOCK || flush == Z_TREES) goto inf_leave;
case TYPEDO:
if (state->last) {
BYTEBITS();
@@ -796,7 +847,11 @@ int flush;
fixedtables(state);
Tracev((stderr, "inflate: fixed codes block%s\n",
state->last ? " (last)" : ""));
- state->mode = LEN; /* decode codes */
+ state->mode = LEN_; /* decode codes */
+ if (flush == Z_TREES) {
+ DROPBITS(2);
+ goto inf_leave;
+ }
break;
case 2: /* dynamic block */
Tracev((stderr, "inflate: dynamic codes block%s\n",
@@ -821,6 +876,9 @@ int flush;
Tracev((stderr, "inflate: stored length %u\n",
state->length));
INITBITS();
+ state->mode = COPY_;
+ if (flush == Z_TREES) goto inf_leave;
+ case COPY_:
state->mode = COPY;
case COPY:
copy = state->length;
@@ -866,7 +924,7 @@ int flush;
while (state->have < 19)
state->lens[order[state->have++]] = 0;
state->next = state->codes;
- state->lencode = (code const FAR *)(state->next);
+ state->lencode = (const code FAR *)(state->next);
state->lenbits = 7;
ret = inflate_table(CODES, state->lens, 19, &(state->next),
&(state->lenbits), state->work);
@@ -881,19 +939,18 @@ int flush;
case CODELENS:
while (state->have < state->nlen + state->ndist) {
for (;;) {
- this = state->lencode[BITS(state->lenbits)];
- if ((unsigned)(this.bits) <= bits) break;
+ here = state->lencode[BITS(state->lenbits)];
+ if ((unsigned)(here.bits) <= bits) break;
PULLBYTE();
}
- if (this.val < 16) {
- NEEDBITS(this.bits);
- DROPBITS(this.bits);
- state->lens[state->have++] = this.val;
+ if (here.val < 16) {
+ DROPBITS(here.bits);
+ state->lens[state->have++] = here.val;
}
else {
- if (this.val == 16) {
- NEEDBITS(this.bits + 2);
- DROPBITS(this.bits);
+ if (here.val == 16) {
+ NEEDBITS(here.bits + 2);
+ DROPBITS(here.bits);
if (state->have == 0) {
strm->msg = (char *)"invalid bit length repeat";
state->mode = BAD;
@@ -903,16 +960,16 @@ int flush;
copy = 3 + BITS(2);
DROPBITS(2);
}
- else if (this.val == 17) {
- NEEDBITS(this.bits + 3);
- DROPBITS(this.bits);
+ else if (here.val == 17) {
+ NEEDBITS(here.bits + 3);
+ DROPBITS(here.bits);
len = 0;
copy = 3 + BITS(3);
DROPBITS(3);
}
else {
- NEEDBITS(this.bits + 7);
- DROPBITS(this.bits);
+ NEEDBITS(here.bits + 7);
+ DROPBITS(here.bits);
len = 0;
copy = 11 + BITS(7);
DROPBITS(7);
@@ -930,9 +987,18 @@ int flush;
/* handle error breaks in while */
if (state->mode == BAD) break;
- /* build code tables */
+ /* check for end-of-block code (better have one) */
+ if (state->lens[256] == 0) {
+ strm->msg = (char *)"invalid code -- missing end-of-block";
+ state->mode = BAD;
+ break;
+ }
+
+ /* build code tables -- note: do not change the lenbits or distbits
+ values here (9 and 6) without reading the comments in inftrees.h
+ concerning the ENOUGH constants, which depend on those values */
state->next = state->codes;
- state->lencode = (code const FAR *)(state->next);
+ state->lencode = (const code FAR *)(state->next);
state->lenbits = 9;
ret = inflate_table(LENS, state->lens, state->nlen, &(state->next),
&(state->lenbits), state->work);
@@ -941,7 +1007,7 @@ int flush;
state->mode = BAD;
break;
}
- state->distcode = (code const FAR *)(state->next);
+ state->distcode = (const code FAR *)(state->next);
state->distbits = 6;
ret = inflate_table(DISTS, state->lens + state->nlen, state->ndist,
&(state->next), &(state->distbits), state->work);
@@ -951,88 +1017,102 @@ int flush;
break;
}
Tracev((stderr, "inflate: codes ok\n"));
+ state->mode = LEN_;
+ if (flush == Z_TREES) goto inf_leave;
+ case LEN_:
state->mode = LEN;
case LEN:
if (have >= 6 && left >= 258) {
RESTORE();
inflate_fast(strm, out);
LOAD();
+ if (state->mode == TYPE)
+ state->back = -1;
break;
}
+ state->back = 0;
for (;;) {
- this = state->lencode[BITS(state->lenbits)];
- if ((unsigned)(this.bits) <= bits) break;
+ here = state->lencode[BITS(state->lenbits)];
+ if ((unsigned)(here.bits) <= bits) break;
PULLBYTE();
}
- if (this.op && (this.op & 0xf0) == 0) {
- last = this;
+ if (here.op && (here.op & 0xf0) == 0) {
+ last = here;
for (;;) {
- this = state->lencode[last.val +
+ here = state->lencode[last.val +
(BITS(last.bits + last.op) >> last.bits)];
- if ((unsigned)(last.bits + this.bits) <= bits) break;
+ if ((unsigned)(last.bits + here.bits) <= bits) break;
PULLBYTE();
}
DROPBITS(last.bits);
+ state->back += last.bits;
}
- DROPBITS(this.bits);
- state->length = (unsigned)this.val;
- if ((int)(this.op) == 0) {
- Tracevv((stderr, this.val >= 0x20 && this.val < 0x7f ?
+ DROPBITS(here.bits);
+ state->back += here.bits;
+ state->length = (unsigned)here.val;
+ if ((int)(here.op) == 0) {
+ Tracevv((stderr, here.val >= 0x20 && here.val < 0x7f ?
"inflate: literal '%c'\n" :
- "inflate: literal 0x%02x\n", this.val));
+ "inflate: literal 0x%02x\n", here.val));
state->mode = LIT;
break;
}
- if (this.op & 32) {
+ if (here.op & 32) {
Tracevv((stderr, "inflate: end of block\n"));
+ state->back = -1;
state->mode = TYPE;
break;
}
- if (this.op & 64) {
+ if (here.op & 64) {
strm->msg = (char *)"invalid literal/length code";
state->mode = BAD;
break;
}
- state->extra = (unsigned)(this.op) & 15;
+ state->extra = (unsigned)(here.op) & 15;
state->mode = LENEXT;
case LENEXT:
if (state->extra) {
NEEDBITS(state->extra);
state->length += BITS(state->extra);
DROPBITS(state->extra);
+ state->back += state->extra;
}
Tracevv((stderr, "inflate: length %u\n", state->length));
+ state->was = state->length;
state->mode = DIST;
case DIST:
for (;;) {
- this = state->distcode[BITS(state->distbits)];
- if ((unsigned)(this.bits) <= bits) break;
+ here = state->distcode[BITS(state->distbits)];
+ if ((unsigned)(here.bits) <= bits) break;
PULLBYTE();
}
- if ((this.op & 0xf0) == 0) {
- last = this;
+ if ((here.op & 0xf0) == 0) {
+ last = here;
for (;;) {
- this = state->distcode[last.val +
+ here = state->distcode[last.val +
(BITS(last.bits + last.op) >> last.bits)];
- if ((unsigned)(last.bits + this.bits) <= bits) break;
+ if ((unsigned)(last.bits + here.bits) <= bits) break;
PULLBYTE();
}
DROPBITS(last.bits);
+ state->back += last.bits;
}
- DROPBITS(this.bits);
- if (this.op & 64) {
+ DROPBITS(here.bits);
+ state->back += here.bits;
+ if (here.op & 64) {
strm->msg = (char *)"invalid distance code";
state->mode = BAD;
break;
}
- state->offset = (unsigned)this.val;
- state->extra = (unsigned)(this.op) & 15;
+ state->offset = (unsigned)here.val;
+ state->extra = (unsigned)(here.op) & 15;
state->mode = DISTEXT;
case DISTEXT:
if (state->extra) {
NEEDBITS(state->extra);
state->offset += BITS(state->extra);
DROPBITS(state->extra);
+ state->back += state->extra;
}
#ifdef INFLATE_STRICT
if (state->offset > state->dmax) {
@@ -1041,11 +1121,6 @@ int flush;
break;
}
#endif
- if (state->offset > state->whave + out - left) {
- strm->msg = (char *)"invalid distance too far back";
- state->mode = BAD;
- break;
- }
Tracevv((stderr, "inflate: distance %u\n", state->offset));
state->mode = MATCH;
case MATCH:
@@ -1053,12 +1128,32 @@ int flush;
copy = out - left;
if (state->offset > copy) { /* copy from window */
copy = state->offset - copy;
- if (copy > state->write) {
- copy -= state->write;
+ if (copy > state->whave) {
+ if (state->sane) {
+ strm->msg = (char *)"invalid distance too far back";
+ state->mode = BAD;
+ break;
+ }
+#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+ Trace((stderr, "inflate.c too far\n"));
+ copy -= state->whave;
+ if (copy > state->length) copy = state->length;
+ if (copy > left) copy = left;
+ left -= copy;
+ state->length -= copy;
+ do {
+ *put++ = 0;
+ } while (--copy);
+ if (state->length == 0) state->mode = LEN;
+ break;
+#endif
+ }
+ if (copy > state->wnext) {
+ copy -= state->wnext;
from = state->window + (state->wsize - copy);
}
else
- from = state->window + (state->write - copy);
+ from = state->window + (state->wnext - copy);
if (copy > state->length) copy = state->length;
}
else { /* copy from output */
@@ -1093,7 +1188,7 @@ int flush;
#ifdef GUNZIP
state->flags ? hold :
#endif
- REVERSE(hold)) != state->check) {
+ ZSWAP32(hold)) != state->check) {
strm->msg = (char *)"incorrect data check";
state->mode = BAD;
break;
@@ -1137,8 +1232,9 @@ int flush;
*/
inf_leave:
RESTORE();
- if (state->wsize || (state->mode < CHECK && out != strm->avail_out))
- if (updatewindow(strm, out)) {
+ if (state->wsize || (out != strm->avail_out && state->mode < BAD &&
+ (state->mode < CHECK || flush != Z_FINISH)))
+ if (updatewindow(strm, strm->next_out, out - strm->avail_out)) {
state->mode = MEM;
return Z_MEM_ERROR;
}
@@ -1151,7 +1247,8 @@ int flush;
strm->adler = state->check =
UPDATE(state->check, strm->next_out - out, out);
strm->data_type = state->bits + (state->last ? 64 : 0) +
- (state->mode == TYPE ? 128 : 0);
+ (state->mode == TYPE ? 128 : 0) +
+ (state->mode == LEN_ || state->mode == COPY_ ? 256 : 0);
if (((in == 0 && out == 0) || flush == Z_FINISH) && ret == Z_OK)
ret = Z_BUF_ERROR;
return ret;
@@ -1171,13 +1268,37 @@ z_streamp strm;
return Z_OK;
}
+int ZEXPORT inflateGetDictionary(strm, dictionary, dictLength)
+z_streamp strm;
+Bytef *dictionary;
+uInt *dictLength;
+{
+ struct inflate_state FAR *state;
+
+ /* check state */
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+
+ /* copy dictionary */
+ if (state->whave && dictionary != Z_NULL) {
+ zmemcpy(dictionary, state->window + state->wnext,
+ state->whave - state->wnext);
+ zmemcpy(dictionary + state->whave - state->wnext,
+ state->window, state->wnext);
+ }
+ if (dictLength != Z_NULL)
+ *dictLength = state->whave;
+ return Z_OK;
+}
+
int ZEXPORT inflateSetDictionary(strm, dictionary, dictLength)
z_streamp strm;
const Bytef *dictionary;
uInt dictLength;
{
struct inflate_state FAR *state;
- unsigned long id;
+ unsigned long dictid;
+ int ret;
/* check state */
if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
@@ -1185,29 +1306,21 @@ uInt dictLength;
if (state->wrap != 0 && state->mode != DICT)
return Z_STREAM_ERROR;
- /* check for correct dictionary id */
+ /* check for correct dictionary identifier */
if (state->mode == DICT) {
- id = adler32(0L, Z_NULL, 0);
- id = adler32(id, dictionary, dictLength);
- if (id != state->check)
+ dictid = adler32(0L, Z_NULL, 0);
+ dictid = adler32(dictid, dictionary, dictLength);
+ if (dictid != state->check)
return Z_DATA_ERROR;
}
- /* copy dictionary to window */
- if (updatewindow(strm, strm->avail_out)) {
+ /* copy dictionary to window using updatewindow(), which will amend the
+ existing dictionary if appropriate */
+ ret = updatewindow(strm, dictionary + dictLength, dictLength);
+ if (ret) {
state->mode = MEM;
return Z_MEM_ERROR;
}
- if (dictLength > state->wsize) {
- zmemcpy(state->window, dictionary + dictLength - state->wsize,
- state->wsize);
- state->whave = state->wsize;
- }
- else {
- zmemcpy(state->window + state->wsize - dictLength, dictionary,
- dictLength);
- state->whave = dictLength;
- }
state->havedict = 1;
Tracev((stderr, "inflate: dictionary set\n"));
return Z_OK;
@@ -1243,7 +1356,7 @@ gz_headerp head;
*/
local unsigned syncsearch(have, buf, len)
unsigned FAR *have;
-unsigned char FAR *buf;
+const unsigned char FAR *buf;
unsigned len;
{
unsigned got;
@@ -1355,8 +1468,8 @@ z_streamp source;
}
/* copy state */
- zmemcpy(dest, source, sizeof(z_stream));
- zmemcpy(copy, state, sizeof(struct inflate_state));
+ zmemcpy((voidpf)dest, (voidpf)source, sizeof(z_stream));
+ zmemcpy((voidpf)copy, (voidpf)state, sizeof(struct inflate_state));
if (state->lencode >= state->codes &&
state->lencode <= state->codes + ENOUGH - 1) {
copy->lencode = copy->codes + (state->lencode - state->codes);
@@ -1371,3 +1484,32 @@ z_streamp source;
dest->state = (struct internal_state FAR *)copy;
return Z_OK;
}
+
+int ZEXPORT inflateUndermine(strm, subvert)
+z_streamp strm;
+int subvert;
+{
+ struct inflate_state FAR *state;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return Z_STREAM_ERROR;
+ state = (struct inflate_state FAR *)strm->state;
+ state->sane = !subvert;
+#ifdef INFLATE_ALLOW_INVALID_DISTANCE_TOOFAR_ARRR
+ return Z_OK;
+#else
+ state->sane = 1;
+ return Z_DATA_ERROR;
+#endif
+}
+
+long ZEXPORT inflateMark(strm)
+z_streamp strm;
+{
+ struct inflate_state FAR *state;
+
+ if (strm == Z_NULL || strm->state == Z_NULL) return -1L << 16;
+ state = (struct inflate_state FAR *)strm->state;
+ return ((long)(state->back) << 16) +
+ (state->mode == COPY ? state->length :
+ (state->mode == MATCH ? state->was - state->length : 0));
+}
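
The inflateGetDictionary() entry point added above copies whatever history the inflate window currently holds: whave bytes in total, written oldest-first by taking the tail starting at wnext and then the wrapped head. A minimal usage sketch, assuming a caller that only wants to snapshot that window; the buffer size 32768 (the largest zlib window, 1 << 15) and the helper name snapshot_window are illustrative, not part of this patch:

    #include "zlib.h"

    /* Sketch: capture the current window so it could later be re-installed
     * with inflateSetDictionary(); inflateGetDictionary() fills *len with
     * whave and returns Z_OK, or Z_STREAM_ERROR on a bad stream pointer. */
    static int snapshot_window(z_streamp strm, Bytef dict[32768], uInt *len)
    {
        *len = 0;
        return inflateGetDictionary(strm, dict, len);
    }
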
diff --git a/erts/emulator/zlib/inflate.h b/erts/emulator/zlib/inflate.h
index 59164091c5..95f4986d40 100644
--- a/erts/emulator/zlib/inflate.h
+++ b/erts/emulator/zlib/inflate.h
@@ -1,10 +1,8 @@
/* inflate.h -- internal inflate state definition
- * Copyright (C) 1995-2004 Mark Adler
+ * Copyright (C) 1995-2009 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
@@ -34,11 +32,13 @@ typedef enum {
TYPE, /* i: waiting for type bits, including last-flag bit */
TYPEDO, /* i: same, but skip check to exit inflate on new block */
STORED, /* i: waiting for stored size (length and complement) */
+ COPY_, /* i/o: same as COPY below, but only first time in */
COPY, /* i/o: waiting for input or output to copy stored block */
TABLE, /* i: waiting for dynamic block table lengths */
LENLENS, /* i: waiting for code length code lengths */
CODELENS, /* i: waiting for length/lit and distance code lengths */
- LEN, /* i: waiting for length/lit code */
+ LEN_, /* i: same as LEN below, but only first time in */
+ LEN, /* i: waiting for length/lit/eob code */
LENEXT, /* i: waiting for length extra bits */
DIST, /* i: waiting for distance code */
DISTEXT, /* i: waiting for distance extra bits */
@@ -55,19 +55,21 @@ typedef enum {
/*
State transitions between above modes -
- (most modes can go to the BAD or MEM mode -- not shown for clarity)
+ (most modes can go to BAD or MEM on error -- not shown for clarity)
Process header:
- HEAD -> (gzip) or (zlib)
- (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME
- NAME -> COMMENT -> HCRC -> TYPE
+ HEAD -> (gzip) or (zlib) or (raw)
+ (gzip) -> FLAGS -> TIME -> OS -> EXLEN -> EXTRA -> NAME -> COMMENT ->
+ HCRC -> TYPE
(zlib) -> DICTID or TYPE
DICTID -> DICT -> TYPE
+ (raw) -> TYPEDO
Read deflate blocks:
- TYPE -> STORED or TABLE or LEN or CHECK
- STORED -> COPY -> TYPE
- TABLE -> LENLENS -> CODELENS -> LEN
- Read deflate codes:
+ TYPE -> TYPEDO -> STORED or TABLE or LEN_ or CHECK
+ STORED -> COPY_ -> COPY -> TYPE
+ TABLE -> LENLENS -> CODELENS -> LEN_
+ LEN_ -> LEN
+ Read deflate codes in fixed or dynamic block:
LEN -> LENEXT or LIT or TYPE
LENEXT -> DIST -> DISTEXT -> MATCH -> LEN
LIT -> LEN
@@ -75,7 +77,7 @@ typedef enum {
CHECK -> LENGTH -> DONE
*/
-/* state maintained between inflate() calls. Approximately 7K bytes. */
+/* state maintained between inflate() calls. Approximately 10K bytes. */
struct inflate_state {
inflate_mode mode; /* current inflate mode */
int last; /* true if processing last block */
@@ -90,7 +92,7 @@ struct inflate_state {
unsigned wbits; /* log base 2 of requested window size */
unsigned wsize; /* window size or zero if not using window */
unsigned whave; /* valid bytes in the window */
- unsigned write; /* window write index */
+ unsigned wnext; /* window write index */
unsigned char FAR *window; /* allocated sliding window, if needed */
/* bit accumulator */
unsigned long hold; /* input bit accumulator */
@@ -114,4 +116,7 @@ struct inflate_state {
unsigned short lens[320]; /* temporary storage for code lengths */
unsigned short work[288]; /* work area for code table building */
code codes[ENOUGH]; /* space for code tables */
+ int sane; /* if false, allow invalid distance too far */
+ int back; /* bits back of last unprocessed length/lit */
+ unsigned was; /* initial length of match */
};
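
The sane, back and was fields added to the state above exist to support the new inflateUndermine() and inflateMark() functions in inflate.c. As a hedged illustration of how inflateMark()'s packed return value can be taken apart, going only by the expression in the patch (state->back in the upper 16 bits, a COPY/MATCH byte count in the lower 16, and -1L << 16 for a bad stream); an arithmetic right shift on negative values is assumed, and the helper name is illustrative:

    #include "zlib.h"

    /* Sketch: split inflateMark()'s packed result into its two parts. */
    static void decode_mark(z_streamp strm, long *bits, unsigned *bytes)
    {
        long mark = inflateMark(strm);
        *bits  = mark >> 16;                /* state->back; -1 at a block boundary */
        *bytes = (unsigned)(mark & 0xffff); /* length in COPY, was - length in MATCH, else 0 */
    }
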
diff --git a/erts/emulator/zlib/inftrees.c b/erts/emulator/zlib/inftrees.c
index 832fe28668..3766fa2646 100644
--- a/erts/emulator/zlib/inftrees.c
+++ b/erts/emulator/zlib/inftrees.c
@@ -1,10 +1,8 @@
/* inftrees.c -- generate Huffman trees for efficient decoding
- * Copyright (C) 1995-2005 Mark Adler
+ * Copyright (C) 1995-2013 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
@@ -14,7 +12,7 @@
#define MAXBITS 15
const char inflate_copyright[] =
- " inflate 1.2.3 Copyright 1995-2005 Mark Adler ";
+ " inflate 1.2.8 Copyright 1995-2013 Mark Adler ";
/*
If you use the zlib library in a product, an acknowledgment is welcome
in the documentation of your product. If for some reason you cannot
@@ -34,7 +32,7 @@ const char inflate_copyright[] =
table index bits. It will differ if the request is greater than the
longest code or if it is less than the shortest code.
*/
-int inflate_table(type, lens, codes, table, bits, work)
+int ZLIB_INTERNAL inflate_table(type, lens, codes, table, bits, work)
codetype type;
unsigned short FAR *lens;
unsigned codes;
@@ -55,7 +53,7 @@ unsigned short FAR *work;
unsigned fill; /* index for replicating entries */
unsigned low; /* low bits for current root entry */
unsigned mask; /* mask for low root bits */
- code this; /* table entry for duplication */
+ code here; /* table entry for duplication */
code FAR *next; /* next available space in table */
const unsigned short FAR *base; /* base value table to use */
const unsigned short FAR *extra; /* extra bits table to use */
@@ -67,7 +65,7 @@ unsigned short FAR *work;
35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
static const unsigned short lext[31] = { /* Length codes 257..285 extra */
16, 16, 16, 16, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18,
- 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 201, 196};
+ 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 16, 72, 78};
static const unsigned short dbase[32] = { /* Distance codes 0..29 base */
1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
@@ -120,15 +118,15 @@ unsigned short FAR *work;
if (count[max] != 0) break;
if (root > max) root = max;
if (max == 0) { /* no symbols to code at all */
- this.op = (unsigned char)64; /* invalid code marker */
- this.bits = (unsigned char)1;
- this.val = (unsigned short)0;
- *(*table)++ = this; /* make a table to force an error */
- *(*table)++ = this;
+ here.op = (unsigned char)64; /* invalid code marker */
+ here.bits = (unsigned char)1;
+ here.val = (unsigned short)0;
+ *(*table)++ = here; /* make a table to force an error */
+ *(*table)++ = here;
*bits = 1;
return 0; /* no symbols, but wait for decoding to report error */
}
- for (min = 1; min <= MAXBITS; min++)
+ for (min = 1; min < max; min++)
if (count[min] != 0) break;
if (root < min) root = min;
@@ -171,11 +169,10 @@ unsigned short FAR *work;
entered in the tables.
used keeps track of how many table entries have been allocated from the
- provided *table space. It is checked when a LENS table is being made
- against the space in *table, ENOUGH, minus the maximum space needed by
- the worst case distance code, MAXD. This should never happen, but the
- sufficiency of ENOUGH has not been proven exhaustively, hence the check.
- This assumes that when type == LENS, bits == 9.
+ provided *table space. It is checked for LENS and DIST tables against
+ the constants ENOUGH_LENS and ENOUGH_DISTS to guard against changes in
+ the initial root table size constants. See the comments in inftrees.h
+ for more information.
sym increments through all symbols, and the loop terminates when
all codes of length max, i.e. all codes, have been processed. This
@@ -214,24 +211,25 @@ unsigned short FAR *work;
mask = used - 1; /* mask for comparing low */
/* check available table space */
- if (type == LENS && used >= ENOUGH - MAXD)
+ if ((type == LENS && used > ENOUGH_LENS) ||
+ (type == DISTS && used > ENOUGH_DISTS))
return 1;
/* process all codes and make table entries */
for (;;) {
/* create table entry */
- this.bits = (unsigned char)(len - drop);
+ here.bits = (unsigned char)(len - drop);
if ((int)(work[sym]) < end) {
- this.op = (unsigned char)0;
- this.val = work[sym];
+ here.op = (unsigned char)0;
+ here.val = work[sym];
}
else if ((int)(work[sym]) > end) {
- this.op = (unsigned char)(extra[work[sym]]);
- this.val = base[work[sym]];
+ here.op = (unsigned char)(extra[work[sym]]);
+ here.val = base[work[sym]];
}
else {
- this.op = (unsigned char)(32 + 64); /* end of block */
- this.val = 0;
+ here.op = (unsigned char)(32 + 64); /* end of block */
+ here.val = 0;
}
/* replicate for those indices with low len bits equal to huff */
@@ -240,7 +238,7 @@ unsigned short FAR *work;
min = fill; /* save offset to next table */
do {
fill -= incr;
- next[(huff >> drop) + fill] = this;
+ next[(huff >> drop) + fill] = here;
} while (fill != 0);
/* backwards increment the len-bit code huff */
@@ -282,7 +280,8 @@ unsigned short FAR *work;
/* check for enough space */
used += 1U << curr;
- if (type == LENS && used >= ENOUGH - MAXD)
+ if ((type == LENS && used > ENOUGH_LENS) ||
+ (type == DISTS && used > ENOUGH_DISTS))
return 1;
/* point entry in root table to sub-table */
@@ -293,38 +292,14 @@ unsigned short FAR *work;
}
}
- /*
- Fill in rest of table for incomplete codes. This loop is similar to the
- loop above in incrementing huff for table indices. It is assumed that
- len is equal to curr + drop, so there is no loop needed to increment
- through high index bits. When the current sub-table is filled, the loop
- drops back to the root table to fill in any remaining entries there.
- */
- this.op = (unsigned char)64; /* invalid code marker */
- this.bits = (unsigned char)(len - drop);
- this.val = (unsigned short)0;
- while (huff != 0) {
- /* when done with sub-table, drop back to root table */
- if (drop != 0 && (huff & mask) != low) {
- drop = 0;
- len = root;
- next = *table;
- this.bits = (unsigned char)len;
- }
-
- /* put invalid code marker in table */
- next[huff >> drop] = this;
-
- /* backwards increment the len-bit code huff */
- incr = 1U << (len - 1);
- while (huff & incr)
- incr >>= 1;
- if (incr != 0) {
- huff &= incr - 1;
- huff += incr;
- }
- else
- huff = 0;
+ /* fill in remaining table entry if code is incomplete (guaranteed to have
+ at most one remaining entry, since if the code is incomplete, the
+ maximum code length that was allowed to get this far is one bit) */
+ if (huff != 0) {
+ here.op = (unsigned char)64; /* invalid code marker */
+ here.bits = (unsigned char)(len - drop);
+ here.val = (unsigned short)0;
+ next[huff] = here;
}
/* set return parameters */
diff --git a/erts/emulator/zlib/inftrees.h b/erts/emulator/zlib/inftrees.h
index 808100f70a..baa53a0b1a 100644
--- a/erts/emulator/zlib/inftrees.h
+++ b/erts/emulator/zlib/inftrees.h
@@ -1,10 +1,8 @@
/* inftrees.h -- header to use inftrees.c
- * Copyright (C) 1995-2005 Mark Adler
+ * Copyright (C) 1995-2005, 2010 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
@@ -37,21 +35,28 @@ typedef struct {
01000000 - invalid code
*/
-/* Maximum size of dynamic tree. The maximum found in a long but non-
- exhaustive search was 1444 code structures (852 for length/literals
- and 592 for distances, the latter actually the result of an
- exhaustive search). The true maximum is not known, but the value
- below is more than safe. */
-#define ENOUGH 2048
-#define MAXD 592
+/* Maximum size of the dynamic table. The maximum number of code structures is
+ 1444, which is the sum of 852 for literal/length codes and 592 for distance
+ codes. These values were found by exhaustive searches using the program
+ examples/enough.c found in the zlib distribution. The arguments to that
+ program are the number of symbols, the initial root table size, and the
+ maximum bit length of a code. "enough 286 9 15" for literal/length codes
+ returns 852, and "enough 30 6 15" for distance codes returns 592.
+ The initial root table size (9 or 6) is found in the fifth argument of the
+ inflate_table() calls in inflate.c and infback.c. If the root table size is
+ changed, then these maximum sizes would need to be recalculated and
+ updated. */
+#define ENOUGH_LENS 852
+#define ENOUGH_DISTS 592
+#define ENOUGH (ENOUGH_LENS+ENOUGH_DISTS)
-/* Type of code to build for inftable() */
+/* Type of code to build for inflate_table() */
typedef enum {
CODES,
LENS,
DISTS
} codetype;
-extern int inflate_table OF((codetype type, unsigned short FAR *lens,
+int ZLIB_INTERNAL inflate_table OF((codetype type, unsigned short FAR *lens,
unsigned codes, code FAR * FAR *table,
unsigned FAR *bits, unsigned short FAR *work));
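
The replacement comment above reduces the table budget to concrete arithmetic: 852 entries for a 9-bit-root length/literal table plus 592 for a 6-bit-root distance table, 1444 in total, instead of the old ENOUGH=2048/MAXD=592 pair. A hedged compile-time check of that sum, using the portable negative-array-size idiom so it also works in pre-C11 code:

    /* fails to compile if the budgets above stop summing to 1444 */
    typedef char enough_totals_match[(ENOUGH_LENS + ENOUGH_DISTS == 1444 &&
                                      ENOUGH == 1444) ? 1 : -1];
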
diff --git a/erts/emulator/zlib/trees.c b/erts/emulator/zlib/trees.c
index 7d9f77f451..465e944e5b 100644
--- a/erts/emulator/zlib/trees.c
+++ b/erts/emulator/zlib/trees.c
@@ -1,10 +1,9 @@
/* trees.c -- output deflated data using Huffman coding
- * Copyright (C) 1995-2005 Jean-loup Gailly
+ * Copyright (C) 1995-2012 Jean-loup Gailly
+ * detect_data_type() function provided freely by Cosmin Truta, 2006
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/*
* ALGORITHM
*
@@ -78,11 +77,6 @@ local const uch bl_order[BL_CODES]
* probability, to avoid transmitting the lengths for unused bit length codes.
*/
-#define Buf_size (8 * 2*sizeof(char))
-/* Number of bits used within bi_buf. (bi_buf might be implemented on
- * more than 16 bits on some systems.)
- */
-
/* ===========================================================================
* Local data. These are initialized only once.
*/
@@ -155,9 +149,9 @@ local void send_tree OF((deflate_state *s, ct_data *tree, int max_code));
local int build_bl_tree OF((deflate_state *s));
local void send_all_trees OF((deflate_state *s, int lcodes, int dcodes,
int blcodes));
-local void compress_block OF((deflate_state *s, ct_data *ltree,
- ct_data *dtree));
-local void set_data_type OF((deflate_state *s));
+local void compress_block OF((deflate_state *s, const ct_data *ltree,
+ const ct_data *dtree));
+local int detect_data_type OF((deflate_state *s));
local unsigned bi_reverse OF((unsigned value, int length));
local void bi_windup OF((deflate_state *s));
local void bi_flush OF((deflate_state *s));
@@ -208,12 +202,12 @@ local void send_bits(s, value, length)
* unused bits in value.
*/
if (s->bi_valid > (int)Buf_size - length) {
- s->bi_buf |= (value << s->bi_valid);
+ s->bi_buf |= (ush)value << s->bi_valid;
put_short(s, s->bi_buf);
s->bi_buf = (ush)value >> (Buf_size - s->bi_valid);
s->bi_valid += length - Buf_size;
} else {
- s->bi_buf |= value << s->bi_valid;
+ s->bi_buf |= (ush)value << s->bi_valid;
s->bi_valid += length;
}
}
@@ -223,12 +217,12 @@ local void send_bits(s, value, length)
{ int len = length;\
if (s->bi_valid > (int)Buf_size - len) {\
int val = value;\
- s->bi_buf |= (val << s->bi_valid);\
+ s->bi_buf |= (ush)val << s->bi_valid;\
put_short(s, s->bi_buf);\
s->bi_buf = (ush)val >> (Buf_size - s->bi_valid);\
s->bi_valid += len - Buf_size;\
} else {\
- s->bi_buf |= (value) << s->bi_valid;\
+ s->bi_buf |= (ush)(value) << s->bi_valid;\
s->bi_valid += len;\
}\
}
@@ -255,11 +249,13 @@ local void tr_static_init()
if (static_init_done) return;
/* For some embedded targets, global variables are not initialized: */
+#ifdef NO_INIT_GLOBAL_POINTERS
static_l_desc.static_tree = static_ltree;
static_l_desc.extra_bits = extra_lbits;
static_d_desc.static_tree = static_dtree;
static_d_desc.extra_bits = extra_dbits;
static_bl_desc.extra_bits = extra_blbits;
+#endif
/* Initialize the mapping length (0..255) -> length code (0..28) */
length = 0;
@@ -353,13 +349,14 @@ void gen_trees_header()
static_dtree[i].Len, SEPARATOR(i, D_CODES-1, 5));
}
- fprintf(header, "const uch _dist_code[DIST_CODE_LEN] = {\n");
+ fprintf(header, "const uch ZLIB_INTERNAL _dist_code[DIST_CODE_LEN] = {\n");
for (i = 0; i < DIST_CODE_LEN; i++) {
fprintf(header, "%2u%s", _dist_code[i],
SEPARATOR(i, DIST_CODE_LEN-1, 20));
}
- fprintf(header, "const uch _length_code[MAX_MATCH-MIN_MATCH+1]= {\n");
+ fprintf(header,
+ "const uch ZLIB_INTERNAL _length_code[MAX_MATCH-MIN_MATCH+1]= {\n");
for (i = 0; i < MAX_MATCH-MIN_MATCH+1; i++) {
fprintf(header, "%2u%s", _length_code[i],
SEPARATOR(i, MAX_MATCH-MIN_MATCH, 20));
@@ -384,7 +381,7 @@ void gen_trees_header()
/* ===========================================================================
* Initialize the tree data structures for a new zlib stream.
*/
-void _tr_init(s)
+void ZLIB_INTERNAL _tr_init(s)
deflate_state *s;
{
tr_static_init();
@@ -400,7 +397,6 @@ void _tr_init(s)
s->bi_buf = 0;
s->bi_valid = 0;
- s->last_eob_len = 8; /* enough lookahead for inflate */
#ifdef DEBUG
s->compressed_len = 0L;
s->bits_sent = 0L;
@@ -869,13 +865,13 @@ local void send_all_trees(s, lcodes, dcodes, blcodes)
/* ===========================================================================
* Send a stored block
*/
-void _tr_stored_block(s, buf, stored_len, eof)
+void ZLIB_INTERNAL _tr_stored_block(s, buf, stored_len, last)
deflate_state *s;
charf *buf; /* input block */
ulg stored_len; /* length of input block */
- int eof; /* true if this is the last block for a file */
+ int last; /* one if this is the last block for a file */
{
- send_bits(s, (STORED_BLOCK<<1)+eof, 3); /* send block type */
+ send_bits(s, (STORED_BLOCK<<1)+last, 3); /* send block type */
#ifdef DEBUG
s->compressed_len = (s->compressed_len + 3 + 7) & (ulg)~7L;
s->compressed_len += (stored_len + 4) << 3;
@@ -884,17 +880,19 @@ void _tr_stored_block(s, buf, stored_len, eof)
}
/* ===========================================================================
+ * Flush the bits in the bit buffer to pending output (leaves at most 7 bits)
+ */
+void ZLIB_INTERNAL _tr_flush_bits(s)
+ deflate_state *s;
+{
+ bi_flush(s);
+}
+
+/* ===========================================================================
* Send one empty static block to give enough lookahead for inflate.
* This takes 10 bits, of which 7 may remain in the bit buffer.
- * The current inflate code requires 9 bits of lookahead. If the
- * last two codes for the previous block (real code plus EOB) were coded
- * on 5 bits or less, inflate may have only 5+3 bits of lookahead to decode
- * the last real code. In this case we send two empty static blocks instead
- * of one. (There are no problems if the previous block is stored or fixed.)
- * To simplify the code, we assume the worst case of last real code encoded
- * on one bit only.
*/
-void _tr_align(s)
+void ZLIB_INTERNAL _tr_align(s)
deflate_state *s;
{
send_bits(s, STATIC_TREES<<1, 3);
@@ -903,31 +901,17 @@ void _tr_align(s)
s->compressed_len += 10L; /* 3 for block type, 7 for EOB */
#endif
bi_flush(s);
- /* Of the 10 bits for the empty block, we have already sent
- * (10 - bi_valid) bits. The lookahead for the last real code (before
- * the EOB of the previous block) was thus at least one plus the length
- * of the EOB plus what we have just sent of the empty static block.
- */
- if (1 + s->last_eob_len + 10 - s->bi_valid < 9) {
- send_bits(s, STATIC_TREES<<1, 3);
- send_code(s, END_BLOCK, static_ltree);
-#ifdef DEBUG
- s->compressed_len += 10L;
-#endif
- bi_flush(s);
- }
- s->last_eob_len = 7;
}
/* ===========================================================================
* Determine the best encoding for the current block: dynamic trees, static
* trees or store, and output the encoded block to the zip file.
*/
-void _tr_flush_block(s, buf, stored_len, eof)
+void ZLIB_INTERNAL _tr_flush_block(s, buf, stored_len, last)
deflate_state *s;
charf *buf; /* input block, or NULL if too old */
ulg stored_len; /* length of input block */
- int eof; /* true if this is the last block for a file */
+ int last; /* one if this is the last block for a file */
{
ulg opt_lenb, static_lenb; /* opt_len and static_len in bytes */
int max_blindex = 0; /* index of last bit length code of non zero freq */
@@ -936,8 +920,8 @@ void _tr_flush_block(s, buf, stored_len, eof)
if (s->level > 0) {
/* Check if the file is binary or text */
- if (stored_len > 0 && s->strm->data_type == Z_UNKNOWN)
- set_data_type(s);
+ if (s->strm->data_type == Z_UNKNOWN)
+ s->strm->data_type = detect_data_type(s);
/* Construct the literal and distance trees */
build_tree(s, (tree_desc *)(&(s->l_desc)));
@@ -983,23 +967,25 @@ void _tr_flush_block(s, buf, stored_len, eof)
* successful. If LIT_BUFSIZE <= WSIZE, it is never too late to
* transform a block into a stored block.
*/
- _tr_stored_block(s, buf, stored_len, eof);
+ _tr_stored_block(s, buf, stored_len, last);
#ifdef FORCE_STATIC
} else if (static_lenb >= 0) { /* force static trees */
#else
} else if (s->strategy == Z_FIXED || static_lenb == opt_lenb) {
#endif
- send_bits(s, (STATIC_TREES<<1)+eof, 3);
- compress_block(s, (ct_data *)static_ltree, (ct_data *)static_dtree);
+ send_bits(s, (STATIC_TREES<<1)+last, 3);
+ compress_block(s, (const ct_data *)static_ltree,
+ (const ct_data *)static_dtree);
#ifdef DEBUG
s->compressed_len += 3 + s->static_len;
#endif
} else {
- send_bits(s, (DYN_TREES<<1)+eof, 3);
+ send_bits(s, (DYN_TREES<<1)+last, 3);
send_all_trees(s, s->l_desc.max_code+1, s->d_desc.max_code+1,
max_blindex+1);
- compress_block(s, (ct_data *)s->dyn_ltree, (ct_data *)s->dyn_dtree);
+ compress_block(s, (const ct_data *)s->dyn_ltree,
+ (const ct_data *)s->dyn_dtree);
#ifdef DEBUG
s->compressed_len += 3 + s->opt_len;
#endif
@@ -1010,21 +996,21 @@ void _tr_flush_block(s, buf, stored_len, eof)
*/
init_block(s);
- if (eof) {
+ if (last) {
bi_windup(s);
#ifdef DEBUG
s->compressed_len += 7; /* align on byte boundary */
#endif
}
Tracev((stderr,"\ncomprlen %lu(%lu) ", s->compressed_len>>3,
- s->compressed_len-7*eof));
+ s->compressed_len-7*last));
}
/* ===========================================================================
* Save the match info and tally the frequency counts. Return true if
* the current block must be flushed.
*/
-int _tr_tally (s, dist, lc)
+int ZLIB_INTERNAL _tr_tally (s, dist, lc)
deflate_state *s;
unsigned dist; /* distance of matched string */
unsigned lc; /* match length-MIN_MATCH or unmatched char (if dist==0) */
@@ -1076,8 +1062,8 @@ int _tr_tally (s, dist, lc)
*/
local void compress_block(s, ltree, dtree)
deflate_state *s;
- ct_data *ltree; /* literal tree */
- ct_data *dtree; /* distance tree */
+ const ct_data *ltree; /* literal tree */
+ const ct_data *dtree; /* distance tree */
{
unsigned dist; /* distance of matched string */
int lc; /* match length or unmatched char (if dist == 0) */
@@ -1119,28 +1105,48 @@ local void compress_block(s, ltree, dtree)
} while (lx < s->last_lit);
send_code(s, END_BLOCK, ltree);
- s->last_eob_len = ltree[END_BLOCK].Len;
}
/* ===========================================================================
- * Set the data type to BINARY or TEXT, using a crude approximation:
- * set it to Z_TEXT if all symbols are either printable characters (33 to 255)
- * or white spaces (9 to 13, or 32); or set it to Z_BINARY otherwise.
+ * Check if the data type is TEXT or BINARY, using the following algorithm:
+ * - TEXT if the two conditions below are satisfied:
+ * a) There are no non-portable control characters belonging to the
+ * "black list" (0..6, 14..25, 28..31).
+ * b) There is at least one printable character belonging to the
+ * "white list" (9 {TAB}, 10 {LF}, 13 {CR}, 32..255).
+ * - BINARY otherwise.
+ * - The following partially-portable control characters form a
+ * "gray list" that is ignored in this detection algorithm:
+ * (7 {BEL}, 8 {BS}, 11 {VT}, 12 {FF}, 26 {SUB}, 27 {ESC}).
* IN assertion: the fields Freq of dyn_ltree are set.
*/
-local void set_data_type(s)
+local int detect_data_type(s)
deflate_state *s;
{
+ /* black_mask is the bit mask of black-listed bytes
+ * set bits 0..6, 14..25, and 28..31
+ * 0xf3ffc07f = binary 11110011111111111100000001111111
+ */
+ unsigned long black_mask = 0xf3ffc07fUL;
int n;
- for (n = 0; n < 9; n++)
+ /* Check for non-textual ("black-listed") bytes. */
+ for (n = 0; n <= 31; n++, black_mask >>= 1)
+ if ((black_mask & 1) && (s->dyn_ltree[n].Freq != 0))
+ return Z_BINARY;
+
+ /* Check for textual ("white-listed") bytes. */
+ if (s->dyn_ltree[9].Freq != 0 || s->dyn_ltree[10].Freq != 0
+ || s->dyn_ltree[13].Freq != 0)
+ return Z_TEXT;
+ for (n = 32; n < LITERALS; n++)
if (s->dyn_ltree[n].Freq != 0)
- break;
- if (n == 9)
- for (n = 14; n < 32; n++)
- if (s->dyn_ltree[n].Freq != 0)
- break;
- s->strm->data_type = (n == 32) ? Z_TEXT : Z_BINARY;
+ return Z_TEXT;
+
+ /* There are no "black-listed" or "white-listed" bytes:
+ * this stream either is empty or has tolerated ("gray-listed") bytes only.
+ */
+ return Z_BINARY;
}
/* ===========================================================================
@@ -1206,7 +1212,6 @@ local void copy_block(s, buf, len, header)
int header; /* true if block header must be written */
{
bi_windup(s); /* align on byte boundary */
- s->last_eob_len = 8; /* enough lookahead for inflate */
if (header) {
put_short(s, (ush)len);
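
The rewritten detect_data_type() above leans entirely on the black_mask constant, so it is worth confirming that 0xf3ffc07f really is the mask with bits 0..6, 14..25 and 28..31 set, the "black list" ranges named in its comment. A small hedged check (the function name is illustrative only):

    /* returns the mask with bits 0..6, 14..25 and 28..31 set */
    static unsigned long black_list_mask(void)
    {
        unsigned long m = 0;
        int n;
        for (n = 0;  n <= 6;  n++) m |= 1UL << n;   /* 0x0000007f */
        for (n = 14; n <= 25; n++) m |= 1UL << n;   /* adds 0x03ffc000 */
        for (n = 28; n <= 31; n++) m |= 1UL << n;   /* adds 0xf0000000 */
        return m;   /* == 0xf3ffc07fUL, the constant in detect_data_type() */
    }
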
diff --git a/erts/emulator/zlib/trees.h b/erts/emulator/zlib/trees.h
index 72facf900f..d35639d82a 100644
--- a/erts/emulator/zlib/trees.h
+++ b/erts/emulator/zlib/trees.h
@@ -70,7 +70,7 @@ local const ct_data static_dtree[D_CODES] = {
{{19},{ 5}}, {{11},{ 5}}, {{27},{ 5}}, {{ 7},{ 5}}, {{23},{ 5}}
};
-const uch _dist_code[DIST_CODE_LEN] = {
+const uch ZLIB_INTERNAL _dist_code[DIST_CODE_LEN] = {
0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8,
8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10,
10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
@@ -99,7 +99,7 @@ const uch _dist_code[DIST_CODE_LEN] = {
29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29
};
-const uch _length_code[MAX_MATCH-MIN_MATCH+1]= {
+const uch ZLIB_INTERNAL _length_code[MAX_MATCH-MIN_MATCH+1]= {
0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
diff --git a/erts/emulator/zlib/uncompr.c b/erts/emulator/zlib/uncompr.c
index cbc93cb1eb..864d571719 100644
--- a/erts/emulator/zlib/uncompr.c
+++ b/erts/emulator/zlib/uncompr.c
@@ -1,10 +1,8 @@
/* uncompr.c -- decompress a memory buffer
- * Copyright (C) 1995-2003 Jean-loup Gailly.
+ * Copyright (C) 1995-2003, 2010 Jean-loup Gailly.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* @(#) $Id$ */
#ifdef HAVE_CONFIG_H
@@ -21,8 +19,6 @@
been saved previously by the compressor and transmitted to the decompressor
by some mechanism outside the scope of this compression library.)
Upon exit, destLen is the actual size of the compressed buffer.
- This function can be used to decompress a whole file at once if the
- input file is mmap'ed.
uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
enough memory, Z_BUF_ERROR if there was not enough room in the output
@@ -37,7 +33,7 @@ int ZEXPORT uncompress (dest, destLen, source, sourceLen)
z_stream stream;
int err;
- stream.next_in = (Bytef*)source;
+ stream.next_in = (z_const Bytef *)source;
stream.avail_in = (uInt)sourceLen;
/* Check for source > 64K on 16-bit machine: */
if ((uLong)stream.avail_in != sourceLen) return Z_BUF_ERROR;
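
For reference, a hedged usage sketch of the function above; the 64 KB output buffer is an illustrative assumption, and sourceLen (the compressed size) is assumed to have been carried alongside the data by the caller, as the comment requires:

    #include "zlib.h"

    static int decode_blob(const Bytef *source, uLong sourceLen)
    {
        static Bytef dest[65536];
        uLong destLen = sizeof(dest);   /* size of dest on entry */
        int err = uncompress(dest, &destLen, source, sourceLen);
        /* on Z_OK, destLen now holds the number of bytes actually produced */
        return err;
    }
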
diff --git a/erts/emulator/zlib/zconf.h b/erts/emulator/zlib/zconf.h
index b7979d48d3..9987a77553 100644
--- a/erts/emulator/zlib/zconf.h
+++ b/erts/emulator/zlib/zconf.h
@@ -1,10 +1,8 @@
/* zconf.h -- configuration of the zlib compression library
- * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * Copyright (C) 1995-2013 Jean-loup Gailly.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* @(#) $Id$ */
#ifndef ZCONF_H
@@ -13,52 +11,145 @@
/*
* If you *really* need a unique prefix for all types and library functions,
* compile with -DZ_PREFIX. The "standard" zlib should be compiled without it.
+ * Even better than compiling with -DZ_PREFIX would be to use configure to set
+ * this permanently in zconf.h using "./configure --zprefix".
*/
-#ifdef Z_PREFIX
-# define deflateInit_ z_deflateInit_
+#ifdef Z_PREFIX /* may be set to #if 1 by ./configure */
+# define Z_PREFIX_SET
+
+/* all linked symbols */
+# define _dist_code z__dist_code
+# define _length_code z__length_code
+# define _tr_align z__tr_align
+# define _tr_flush_bits z__tr_flush_bits
+# define _tr_flush_block z__tr_flush_block
+# define _tr_init z__tr_init
+# define _tr_stored_block z__tr_stored_block
+# define _tr_tally z__tr_tally
+# define adler32 z_adler32
+# define adler32_combine z_adler32_combine
+# define adler32_combine64 z_adler32_combine64
+# ifndef Z_SOLO
+# define compress z_compress
+# define compress2 z_compress2
+# define compressBound z_compressBound
+# endif
+# define crc32 z_crc32
+# define crc32_combine z_crc32_combine
+# define crc32_combine64 z_crc32_combine64
# define deflate z_deflate
+# define deflateBound z_deflateBound
+# define deflateCopy z_deflateCopy
# define deflateEnd z_deflateEnd
-# define inflateInit_ z_inflateInit_
-# define inflate z_inflate
-# define inflateEnd z_inflateEnd
# define deflateInit2_ z_deflateInit2_
-# define deflateSetDictionary z_deflateSetDictionary
-# define deflateCopy z_deflateCopy
-# define deflateReset z_deflateReset
+# define deflateInit_ z_deflateInit_
# define deflateParams z_deflateParams
-# define deflateBound z_deflateBound
+# define deflatePending z_deflatePending
# define deflatePrime z_deflatePrime
+# define deflateReset z_deflateReset
+# define deflateResetKeep z_deflateResetKeep
+# define deflateSetDictionary z_deflateSetDictionary
+# define deflateSetHeader z_deflateSetHeader
+# define deflateTune z_deflateTune
+# define deflate_copyright z_deflate_copyright
+# define get_crc_table z_get_crc_table
+# ifndef Z_SOLO
+# define gz_error z_gz_error
+# define gz_intmax z_gz_intmax
+# define gz_strwinerror z_gz_strwinerror
+# define gzbuffer z_gzbuffer
+# define gzclearerr z_gzclearerr
+# define gzclose z_gzclose
+# define gzclose_r z_gzclose_r
+# define gzclose_w z_gzclose_w
+# define gzdirect z_gzdirect
+# define gzdopen z_gzdopen
+# define gzeof z_gzeof
+# define gzerror z_gzerror
+# define gzflush z_gzflush
+# define gzgetc z_gzgetc
+# define gzgetc_ z_gzgetc_
+# define gzgets z_gzgets
+# define gzoffset z_gzoffset
+# define gzoffset64 z_gzoffset64
+# define gzopen z_gzopen
+# define gzopen64 z_gzopen64
+# ifdef _WIN32
+# define gzopen_w z_gzopen_w
+# endif
+# define gzprintf z_gzprintf
+# define gzvprintf z_gzvprintf
+# define gzputc z_gzputc
+# define gzputs z_gzputs
+# define gzread z_gzread
+# define gzrewind z_gzrewind
+# define gzseek z_gzseek
+# define gzseek64 z_gzseek64
+# define gzsetparams z_gzsetparams
+# define gztell z_gztell
+# define gztell64 z_gztell64
+# define gzungetc z_gzungetc
+# define gzwrite z_gzwrite
+# endif
+# define inflate z_inflate
+# define inflateBack z_inflateBack
+# define inflateBackEnd z_inflateBackEnd
+# define inflateBackInit_ z_inflateBackInit_
+# define inflateCopy z_inflateCopy
+# define inflateEnd z_inflateEnd
+# define inflateGetHeader z_inflateGetHeader
# define inflateInit2_ z_inflateInit2_
+# define inflateInit_ z_inflateInit_
+# define inflateMark z_inflateMark
+# define inflatePrime z_inflatePrime
+# define inflateReset z_inflateReset
+# define inflateReset2 z_inflateReset2
# define inflateSetDictionary z_inflateSetDictionary
+# define inflateGetDictionary z_inflateGetDictionary
# define inflateSync z_inflateSync
# define inflateSyncPoint z_inflateSyncPoint
-# define inflateCopy z_inflateCopy
-# define inflateReset z_inflateReset
-# define inflateBack z_inflateBack
-# define inflateBackEnd z_inflateBackEnd
-# define compress z_compress
-# define compress2 z_compress2
-# define compressBound z_compressBound
-# define uncompress z_uncompress
-# define adler32 z_adler32
-# define crc32 z_crc32
-# define get_crc_table z_get_crc_table
+# define inflateUndermine z_inflateUndermine
+# define inflateResetKeep z_inflateResetKeep
+# define inflate_copyright z_inflate_copyright
+# define inflate_fast z_inflate_fast
+# define inflate_table z_inflate_table
+# ifndef Z_SOLO
+# define uncompress z_uncompress
+# endif
# define zError z_zError
+# ifndef Z_SOLO
+# define zcalloc z_zcalloc
+# define zcfree z_zcfree
+# endif
+# define zlibCompileFlags z_zlibCompileFlags
+# define zlibVersion z_zlibVersion
+/* all zlib typedefs in zlib.h and zconf.h */
+# define Byte z_Byte
+# define Bytef z_Bytef
# define alloc_func z_alloc_func
+# define charf z_charf
# define free_func z_free_func
+# ifndef Z_SOLO
+# define gzFile z_gzFile
+# endif
+# define gz_header z_gz_header
+# define gz_headerp z_gz_headerp
# define in_func z_in_func
+# define intf z_intf
# define out_func z_out_func
-# define Byte z_Byte
# define uInt z_uInt
-# define uLong z_uLong
-# define Bytef z_Bytef
-# define charf z_charf
-# define intf z_intf
# define uIntf z_uIntf
+# define uLong z_uLong
# define uLongf z_uLongf
-# define voidpf z_voidpf
# define voidp z_voidp
+# define voidpc z_voidpc
+# define voidpf z_voidpf
+
+/* all zlib structs in zlib.h and zconf.h */
+# define gz_header_s z_gz_header_s
+# define internal_state z_internal_state
+
#endif
#if defined(__MSDOS__) && !defined(MSDOS)
@@ -127,6 +218,12 @@
# endif
#endif
+#if defined(ZLIB_CONST) && !defined(z_const)
+# define z_const const
+#else
+# define z_const
+#endif
+
/* Some Mac compilers merge all .h files incorrectly: */
#if defined(__MWERKS__)||defined(applec)||defined(THINK_C)||defined(__SC__)
# define NO_DUMMY_DECL
@@ -173,6 +270,14 @@
# endif
#endif
+#ifndef Z_ARG /* function prototypes for stdarg */
+# if defined(STDC) || defined(Z_HAVE_STDARG_H)
+# define Z_ARG(args) args
+# else
+# define Z_ARG(args) ()
+# endif
+#endif
+
/* The following definitions for FAR are needed only for MSDOS mixed
* model programming (small or medium model with some far allocations).
* This was tested only with MSC; for other MSDOS compilers you may have
@@ -286,49 +391,121 @@ typedef uLong FAR uLongf;
typedef Byte *voidp;
#endif
-#if 0 /* HAVE_UNISTD_H -- this line is updated by ./configure */
-# include <sys/types.h> /* for off_t */
-# include <unistd.h> /* for SEEK_* and off_t */
-# ifdef VMS
-# include <unixio.h> /* for off_t */
+#if !defined(Z_U4) && !defined(Z_SOLO) && defined(STDC)
+# include <limits.h>
+# if (UINT_MAX == 0xffffffffUL)
+# define Z_U4 unsigned
+# elif (ULONG_MAX == 0xffffffffUL)
+# define Z_U4 unsigned long
+# elif (USHRT_MAX == 0xffffffffUL)
+# define Z_U4 unsigned short
+# endif
+#endif
+
+#ifdef Z_U4
+ typedef Z_U4 z_crc_t;
+#else
+ typedef unsigned long z_crc_t;
+#endif
+
+#ifdef HAVE_UNISTD_H /* may be set to #if 1 by ./configure */
+# define Z_HAVE_UNISTD_H
+#endif
+
+#ifdef HAVE_STDARG_H /* may be set to #if 1 by ./configure */
+# define Z_HAVE_STDARG_H
+#endif
+
+#ifdef STDC
+# ifndef Z_SOLO
+# include <sys/types.h> /* for off_t */
+# endif
+#endif
+
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+# ifndef Z_SOLO
+# include <stdarg.h> /* for va_list */
# endif
-# define z_off_t off_t
#endif
-#ifndef SEEK_SET
+
+#ifdef _WIN32
+# ifndef Z_SOLO
+# include <stddef.h> /* for wchar_t */
+# endif
+#endif
+
+/* a little trick to accommodate both "#define _LARGEFILE64_SOURCE" and
+ * "#define _LARGEFILE64_SOURCE 1" as requesting 64-bit operations, (even
+ * though the former does not conform to the LFS document), but considering
+ * both "#undef _LARGEFILE64_SOURCE" and "#define _LARGEFILE64_SOURCE 0" as
+ * equivalently requesting no 64-bit operations
+ */
+#if defined(_LARGEFILE64_SOURCE) && -_LARGEFILE64_SOURCE - -1 == 1
+# undef _LARGEFILE64_SOURCE
+#endif
+
+#if defined(__WATCOMC__) && !defined(Z_HAVE_UNISTD_H)
+# define Z_HAVE_UNISTD_H
+#endif
+#ifndef Z_SOLO
+# if defined(Z_HAVE_UNISTD_H) || defined(_LARGEFILE64_SOURCE)
+# include <unistd.h> /* for SEEK_*, off_t, and _LFS64_LARGEFILE */
+# ifdef VMS
+# include <unixio.h> /* for off_t */
+# endif
+# ifndef z_off_t
+# define z_off_t off_t
+# endif
+# endif
+#endif
+
+#if defined(_LFS64_LARGEFILE) && _LFS64_LARGEFILE-0
+# define Z_LFS64
+#endif
+
+#if defined(_LARGEFILE64_SOURCE) && defined(Z_LFS64)
+# define Z_LARGE64
+#endif
+
+#if defined(_FILE_OFFSET_BITS) && _FILE_OFFSET_BITS-0 == 64 && defined(Z_LFS64)
+# define Z_WANT64
+#endif
+
+#if !defined(SEEK_SET) && !defined(Z_SOLO)
# define SEEK_SET 0 /* Seek from beginning of file. */
# define SEEK_CUR 1 /* Seek from current position. */
# define SEEK_END 2 /* Set file pointer to EOF plus "offset" */
#endif
+
#ifndef z_off_t
# define z_off_t long
#endif
-#if defined(__OS400__)
-# define NO_vsnprintf
-#endif
-
-#if defined(__MVS__)
-# define NO_vsnprintf
-# ifdef FAR
-# undef FAR
+#if !defined(_WIN32) && defined(Z_LARGE64)
+# define z_off64_t off64_t
+#else
+# if defined(_WIN32) && !defined(__GNUC__) && !defined(Z_SOLO)
+# define z_off64_t __int64
+# else
+# define z_off64_t z_off_t
# endif
#endif
/* MVS linker does not support external names larger than 8 bytes */
#if defined(__MVS__)
-# pragma map(deflateInit_,"DEIN")
-# pragma map(deflateInit2_,"DEIN2")
-# pragma map(deflateEnd,"DEEND")
-# pragma map(deflateBound,"DEBND")
-# pragma map(inflateInit_,"ININ")
-# pragma map(inflateInit2_,"ININ2")
-# pragma map(inflateEnd,"INEND")
-# pragma map(inflateSync,"INSY")
-# pragma map(inflateSetDictionary,"INSEDI")
-# pragma map(compressBound,"CMBND")
-# pragma map(inflate_table,"INTABL")
-# pragma map(inflate_fast,"INFA")
-# pragma map(inflate_copyright,"INCOPY")
+ #pragma map(deflateInit_,"DEIN")
+ #pragma map(deflateInit2_,"DEIN2")
+ #pragma map(deflateEnd,"DEEND")
+ #pragma map(deflateBound,"DEBND")
+ #pragma map(inflateInit_,"ININ")
+ #pragma map(inflateInit2_,"ININ2")
+ #pragma map(inflateEnd,"INEND")
+ #pragma map(inflateSync,"INSY")
+ #pragma map(inflateSetDictionary,"INSEDI")
+ #pragma map(compressBound,"CMBND")
+ #pragma map(inflate_table,"INTABL")
+ #pragma map(inflate_fast,"INFA")
+ #pragma map(inflate_copyright,"INCOPY")
#endif
#endif /* ZCONF_H */
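
Two of the additions above work together with the zlib.h change further down: defining ZLIB_CONST before including zlib.h turns z_const into const, which const-qualifies next_in in z_stream. A hedged sketch of the intended use; the helper name feed_input is illustrative:

    #define ZLIB_CONST              /* must come before the include */
    #include "zlib.h"

    static void feed_input(z_streamp strm, const Bytef *buf, uInt len)
    {
        strm->next_in  = buf;       /* no const cast needed any more */
        strm->avail_in = len;
    }
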
diff --git a/erts/emulator/zlib/zlib.h b/erts/emulator/zlib/zlib.h
index 9209774383..3e0c7672ac 100644
--- a/erts/emulator/zlib/zlib.h
+++ b/erts/emulator/zlib/zlib.h
@@ -1,7 +1,7 @@
/* zlib.h -- interface of the 'zlib' general purpose compression library
- version 1.2.3, July 18th, 2005
+ version 1.2.8, April 28th, 2013
- Copyright (C) 1995-2005 Jean-loup Gailly and Mark Adler
+ Copyright (C) 1995-2013 Jean-loup Gailly and Mark Adler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
@@ -24,12 +24,10 @@
The data format used by the zlib library is described by RFCs (Request for
- Comments) 1950 to 1952 in the files http://www.ietf.org/rfc/rfc1950.txt
- (zlib format), rfc1951.txt (deflate format) and rfc1952.txt (gzip format).
+ Comments) 1950 to 1952 in the files http://tools.ietf.org/html/rfc1950
+ (zlib format), rfc1951 (deflate format) and rfc1952 (gzip format).
*/
-/* %ExternalCopyright% */
-
#ifndef ZLIB_H
#define ZLIB_H
@@ -39,41 +37,44 @@
extern "C" {
#endif
-#define ZLIB_VERSION "1.2.3"
-#define ZLIB_VERNUM 0x1230
+#define ZLIB_VERSION "1.2.8"
+#define ZLIB_VERNUM 0x1280
+#define ZLIB_VER_MAJOR 1
+#define ZLIB_VER_MINOR 2
+#define ZLIB_VER_REVISION 8
+#define ZLIB_VER_SUBREVISION 0
/*
- The 'zlib' compression library provides in-memory compression and
- decompression functions, including integrity checks of the uncompressed
- data. This version of the library supports only one compression method
- (deflation) but other algorithms will be added later and will have the same
- stream interface.
-
- Compression can be done in a single step if the buffers are large
- enough (for example if an input file is mmap'ed), or can be done by
- repeated calls of the compression function. In the latter case, the
- application must provide more input and/or consume the output
+ The 'zlib' compression library provides in-memory compression and
+ decompression functions, including integrity checks of the uncompressed data.
+ This version of the library supports only one compression method (deflation)
+ but other algorithms will be added later and will have the same stream
+ interface.
+
+ Compression can be done in a single step if the buffers are large enough,
+ or can be done by repeated calls of the compression function. In the latter
+ case, the application must provide more input and/or consume the output
(providing more output space) before each call.
- The compressed data format used by default by the in-memory functions is
+ The compressed data format used by default by the in-memory functions is
the zlib format, which is a zlib wrapper documented in RFC 1950, wrapped
around a deflate stream, which is itself documented in RFC 1951.
- The library also supports reading and writing files in gzip (.gz) format
+ The library also supports reading and writing files in gzip (.gz) format
with an interface similar to that of stdio using the functions that start
with "gz". The gzip format is different from the zlib format. gzip is a
gzip wrapper, documented in RFC 1952, wrapped around a deflate stream.
- This library can optionally read and write gzip streams in memory as well.
+ This library can optionally read and write gzip streams in memory as well.
- The zlib format was designed to be compact and fast for use in memory
+ The zlib format was designed to be compact and fast for use in memory
and on communications channels. The gzip format was designed for single-
file compression on file systems, has a larger header than zlib to maintain
directory information, and uses a different, slower check method than zlib.
- The library does not install any signal handler. The decoder checks
- the consistency of the compressed data, so the library should never
- crash even in case of corrupted input.
+ The library does not install any signal handler. The decoder checks
+ the consistency of the compressed data, so the library should never crash
+ even in case of corrupted input.
*/
typedef voidpf (*alloc_func) OF((voidpf opaque, uInt items, uInt size));
@@ -82,15 +83,15 @@ typedef void (*free_func) OF((voidpf opaque, voidpf address));
struct internal_state;
typedef struct z_stream_s {
- Bytef *next_in; /* next input byte */
+ z_const Bytef *next_in; /* next input byte */
uInt avail_in; /* number of bytes available at next_in */
- uLong total_in; /* total nb of input bytes read so far */
+ uLong total_in; /* total number of input bytes read so far */
Bytef *next_out; /* next output byte should be put there */
uInt avail_out; /* remaining free space at next_out */
- uLong total_out; /* total nb of bytes output so far */
+ uLong total_out; /* total number of bytes output so far */
- char *msg; /* last error message, NULL if no error */
+ z_const char *msg; /* last error message, NULL if no error */
struct internal_state FAR *state; /* not visible by applications */
alloc_func zalloc; /* used to allocate the internal state */
@@ -128,45 +129,45 @@ typedef struct gz_header_s {
typedef gz_header FAR *gz_headerp;
/*
- The application must update next_in and avail_in when avail_in has
- dropped to zero. It must update next_out and avail_out when avail_out
- has dropped to zero. The application must initialize zalloc, zfree and
- opaque before calling the init function. All other fields are set by the
- compression library and must not be updated by the application.
-
- The opaque value provided by the application will be passed as the first
- parameter for calls of zalloc and zfree. This can be useful for custom
- memory management. The compression library attaches no meaning to the
+ The application must update next_in and avail_in when avail_in has dropped
+ to zero. It must update next_out and avail_out when avail_out has dropped
+ to zero. The application must initialize zalloc, zfree and opaque before
+ calling the init function. All other fields are set by the compression
+ library and must not be updated by the application.
+
+ The opaque value provided by the application will be passed as the first
+ parameter for calls of zalloc and zfree. This can be useful for custom
+ memory management. The compression library attaches no meaning to the
opaque value.
- zalloc must return Z_NULL if there is not enough memory for the object.
+ zalloc must return Z_NULL if there is not enough memory for the object.
If zlib is used in a multi-threaded application, zalloc and zfree must be
thread safe.
- On 16-bit systems, the functions zalloc and zfree must be able to allocate
- exactly 65536 bytes, but will not be required to allocate more than this
- if the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS,
- pointers returned by zalloc for objects of exactly 65536 bytes *must*
- have their offset normalized to zero. The default allocation function
- provided by this library ensures this (see zutil.c). To reduce memory
- requirements and avoid any allocation of 64K objects, at the expense of
- compression ratio, compile the library with -DMAX_WBITS=14 (see zconf.h).
-
- The fields total_in and total_out can be used for statistics or
- progress reports. After compression, total_in holds the total size of
- the uncompressed data and may be saved for use in the decompressor
- (particularly if the decompressor wants to decompress everything in
- a single step).
+ On 16-bit systems, the functions zalloc and zfree must be able to allocate
+ exactly 65536 bytes, but will not be required to allocate more than this if
+ the symbol MAXSEG_64K is defined (see zconf.h). WARNING: On MSDOS, pointers
+ returned by zalloc for objects of exactly 65536 bytes *must* have their
+ offset normalized to zero. The default allocation function provided by this
+ library ensures this (see zutil.c). To reduce memory requirements and avoid
+ any allocation of 64K objects, at the expense of compression ratio, compile
+ the library with -DMAX_WBITS=14 (see zconf.h).
+
+ The fields total_in and total_out can be used for statistics or progress
+ reports. After compression, total_in holds the total size of the
+ uncompressed data and may be saved for use in the decompressor (particularly
+ if the decompressor wants to decompress everything in a single step).
*/
/* constants */
#define Z_NO_FLUSH 0
-#define Z_PARTIAL_FLUSH 1 /* will be removed, use Z_SYNC_FLUSH instead */
+#define Z_PARTIAL_FLUSH 1
#define Z_SYNC_FLUSH 2
#define Z_FULL_FLUSH 3
#define Z_FINISH 4
#define Z_BLOCK 5
+#define Z_TREES 6
/* Allowed flush values; see deflate() and inflate() below for details */
#define Z_OK 0
@@ -178,8 +179,8 @@ typedef gz_header FAR *gz_headerp;
#define Z_MEM_ERROR (-4)
#define Z_BUF_ERROR (-5)
#define Z_VERSION_ERROR (-6)
-/* Return codes for the compression/decompression functions. Negative
- * values are errors, positive values are used for special but normal events.
+/* Return codes for the compression/decompression functions. Negative values
+ * are errors, positive values are used for special but normal events.
*/
#define Z_NO_COMPRESSION 0
@@ -209,119 +210,141 @@ typedef gz_header FAR *gz_headerp;
#define zlib_version zlibVersion()
/* for compatibility with versions < 1.0.2 */
+
/* basic functions */
ZEXTERN const char * ZEXPORT zlibVersion OF((void));
/* The application can compare zlibVersion and ZLIB_VERSION for consistency.
- If the first character differs, the library code actually used is
- not compatible with the zlib.h header file used by the application.
- This check is automatically made by deflateInit and inflateInit.
+ If the first character differs, the library code actually used is not
+ compatible with the zlib.h header file used by the application. This check
+ is automatically made by deflateInit and inflateInit.
*/
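
A hedged one-line illustration of the check described above (the same comparison deflateInit and inflateInit make internally):

    if (zlibVersion()[0] != ZLIB_VERSION[0]) {
        /* the linked library is not compatible with this zlib.h */
    }
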
/*
ZEXTERN int ZEXPORT deflateInit OF((z_streamp strm, int level));
- Initializes the internal stream state for compression. The fields
- zalloc, zfree and opaque must be initialized before by the caller.
- If zalloc and zfree are set to Z_NULL, deflateInit updates them to
- use default allocation functions.
+ Initializes the internal stream state for compression. The fields
+ zalloc, zfree and opaque must be initialized before by the caller. If
+ zalloc and zfree are set to Z_NULL, deflateInit updates them to use default
+ allocation functions.
The compression level must be Z_DEFAULT_COMPRESSION, or between 0 and 9:
- 1 gives best speed, 9 gives best compression, 0 gives no compression at
- all (the input data is simply copied a block at a time).
- Z_DEFAULT_COMPRESSION requests a default compromise between speed and
- compression (currently equivalent to level 6).
+ 1 gives best speed, 9 gives best compression, 0 gives no compression at all
+ (the input data is simply copied a block at a time). Z_DEFAULT_COMPRESSION
+ requests a default compromise between speed and compression (currently
+ equivalent to level 6).
- deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not
- enough memory, Z_STREAM_ERROR if level is not a valid compression level,
+ deflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if level is not a valid compression level, or
Z_VERSION_ERROR if the zlib library version (zlib_version) is incompatible
- with the version assumed by the caller (ZLIB_VERSION).
- msg is set to null if there is no error message. deflateInit does not
- perform any compression: this will be done by deflate().
+ with the version assumed by the caller (ZLIB_VERSION). msg is set to null
+ if there is no error message. deflateInit does not perform any compression:
+ this will be done by deflate().
*/
ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
/*
deflate compresses as much data as possible, and stops when the input
- buffer becomes empty or the output buffer becomes full. It may introduce some
- output latency (reading input without producing any output) except when
+ buffer becomes empty or the output buffer becomes full. It may introduce
+ some output latency (reading input without producing any output) except when
forced to flush.
- The detailed semantics are as follows. deflate performs one or both of the
+ The detailed semantics are as follows. deflate performs one or both of the
following actions:
- Compress more input starting at next_in and update next_in and avail_in
- accordingly. If not all input can be processed (because there is not
+ accordingly. If not all input can be processed (because there is not
enough room in the output buffer), next_in and avail_in are updated and
processing will resume at this point for the next call of deflate().
- Provide more output starting at next_out and update next_out and avail_out
- accordingly. This action is forced if the parameter flush is non zero.
+ accordingly. This action is forced if the parameter flush is non zero.
Forcing flush frequently degrades the compression ratio, so this parameter
- should be set only when necessary (in interactive applications).
- Some output may be provided even if flush is not set.
-
- Before the call of deflate(), the application should ensure that at least
- one of the actions is possible, by providing more input and/or consuming
- more output, and updating avail_in or avail_out accordingly; avail_out
- should never be zero before the call. The application can consume the
- compressed output when it wants, for example when the output buffer is full
- (avail_out == 0), or after each call of deflate(). If deflate returns Z_OK
- and with zero avail_out, it must be called again after making room in the
- output buffer because there might be more output pending.
+ should be set only when necessary (in interactive applications). Some
+ output may be provided even if flush is not set.
+
+ Before the call of deflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming more
+ output, and updating avail_in or avail_out accordingly; avail_out should
+ never be zero before the call. The application can consume the compressed
+ output when it wants, for example when the output buffer is full (avail_out
+ == 0), or after each call of deflate(). If deflate returns Z_OK and with
+ zero avail_out, it must be called again after making room in the output
+ buffer because there might be more output pending.
Normally the parameter flush is set to Z_NO_FLUSH, which allows deflate to
- decide how much data to accumualte before producing output, in order to
+ decide how much data to accumulate before producing output, in order to
maximize compression.
If the parameter flush is set to Z_SYNC_FLUSH, all pending output is
flushed to the output buffer and the output is aligned on a byte boundary, so
- that the decompressor can get all input data available so far. (In particular
- avail_in is zero after the call if enough output space has been provided
- before the call.) Flushing may degrade compression for some compression
- algorithms and so it should be used only when necessary.
+ that the decompressor can get all input data available so far. (In
+ particular avail_in is zero after the call if enough output space has been
+ provided before the call.) Flushing may degrade compression for some
+ compression algorithms and so it should be used only when necessary. This
+ completes the current deflate block and follows it with an empty stored block
+ that is three bits plus filler bits to the next byte, followed by four bytes
+ (00 00 ff ff).
+
+ If flush is set to Z_PARTIAL_FLUSH, all pending output is flushed to the
+ output buffer, but the output is not aligned to a byte boundary. All of the
+ input data so far will be available to the decompressor, as for Z_SYNC_FLUSH.
+ This completes the current deflate block and follows it with an empty fixed
+ codes block that is 10 bits long. This assures that enough bytes are output
+ in order for the decompressor to finish the block before the empty fixed code
+ block.
+
+ If flush is set to Z_BLOCK, a deflate block is completed and emitted, as
+ for Z_SYNC_FLUSH, but the output is not aligned on a byte boundary, and up to
+ seven bits of the current block are held to be written as the next byte after
+ the next deflate block is completed. In this case, the decompressor may not
+ be provided enough bits at this point in order to complete decompression of
+ the data provided so far to the compressor. It may need to wait for the next
+ block to be emitted. This is for advanced applications that need to control
+ the emission of deflate blocks.
If flush is set to Z_FULL_FLUSH, all output is flushed as with
Z_SYNC_FLUSH, and the compression state is reset so that decompression can
restart from this point if previous compressed data has been damaged or if
- random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
+ random access is desired. Using Z_FULL_FLUSH too often can seriously degrade
compression.
If deflate returns with avail_out == 0, this function must be called again
with the same value of the flush parameter and more output space (updated
avail_out), until the flush is complete (deflate returns with non-zero
- avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
+ avail_out). In the case of a Z_FULL_FLUSH or Z_SYNC_FLUSH, make sure that
avail_out is greater than six to avoid repeated flush markers due to
avail_out == 0 on return.
If the parameter flush is set to Z_FINISH, pending input is processed,
- pending output is flushed and deflate returns with Z_STREAM_END if there
- was enough output space; if deflate returns with Z_OK, this function must be
+ pending output is flushed and deflate returns with Z_STREAM_END if there was
+ enough output space; if deflate returns with Z_OK, this function must be
called again with Z_FINISH and more output space (updated avail_out) but no
- more input data, until it returns with Z_STREAM_END or an error. After
- deflate has returned Z_STREAM_END, the only possible operations on the
- stream are deflateReset or deflateEnd.
+ more input data, until it returns with Z_STREAM_END or an error. After
+ deflate has returned Z_STREAM_END, the only possible operations on the stream
+ are deflateReset or deflateEnd.
Z_FINISH can be used immediately after deflateInit if all the compression
- is to be done in a single step. In this case, avail_out must be at least
- the value returned by deflateBound (see below). If deflate does not return
- Z_STREAM_END, then it must be called again as described above.
+ is to be done in a single step. In this case, avail_out must be at least the
+ value returned by deflateBound (see below). Then deflate is guaranteed to
+ return Z_STREAM_END. If not enough output space is provided, deflate will
+ not return Z_STREAM_END, and it must be called again as described above.
deflate() sets strm->adler to the adler32 checksum of all input read
so far (that is, total_in bytes).
deflate() may update strm->data_type if it can make a good guess about
- the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
- binary. This field is only for information purposes and does not affect
- the compression algorithm in any manner.
+ the input data type (Z_BINARY or Z_TEXT). In doubt, the data is considered
+ binary. This field is only for information purposes and does not affect the
+ compression algorithm in any manner.
deflate() returns Z_OK if some progress has been made (more input
processed or more output produced), Z_STREAM_END if all input has been
consumed and all output has been produced (only when flush is set to
Z_FINISH), Z_STREAM_ERROR if the stream state was inconsistent (for example
- if next_in or next_out was NULL), Z_BUF_ERROR if no progress is possible
- (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
+ if next_in or next_out was Z_NULL), Z_BUF_ERROR if no progress is possible
+ (for example avail_in or avail_out was zero). Note that Z_BUF_ERROR is not
fatal, and deflate() can be called again with more input and more output
space to continue compressing.
*/
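
A hedged sketch of the calling pattern described above (modeled on zlib's zpipe.c example, not part of the header): all input is supplied up front, deflate() is called with Z_FINISH, and the output buffer is drained each time it fills. CHUNK, dst and the helper name are illustrative.

#include <stdio.h>
#include "zlib.h"

#define CHUNK 16384

/* Sketch: compress in[0..in_len-1] to dst, calling deflate() repeatedly
 * until the Z_FINISH flush completes (Z_STREAM_END), per the text above. */
static int deflate_buffer_to_file(z_streamp strm, unsigned char *in,
                                  uInt in_len, FILE *dst)
{
    unsigned char out[CHUNK];
    int ret;

    strm->next_in  = in;
    strm->avail_in = in_len;
    do {
        strm->next_out  = out;
        strm->avail_out = CHUNK;
        ret = deflate(strm, Z_FINISH);               /* Z_OK: more output pending */
        fwrite(out, 1, CHUNK - strm->avail_out, dst);
    } while (ret == Z_OK);
    return ret == Z_STREAM_END ? Z_OK : ret;         /* Z_OK when done */
}
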
@@ -330,13 +353,13 @@ ZEXTERN int ZEXPORT deflate OF((z_streamp strm, int flush));
ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
/*
All dynamically allocated data structures for this stream are freed.
- This function discards any unprocessed input and does not flush any
- pending output.
+ This function discards any unprocessed input and does not flush any pending
+ output.
deflateEnd returns Z_OK if success, Z_STREAM_ERROR if the
stream state was inconsistent, Z_DATA_ERROR if the stream was freed
- prematurely (some input or output was discarded). In the error case,
- msg may be set but then points to a static string (which must not be
+ prematurely (some input or output was discarded). In the error case, msg
+ may be set but then points to a static string (which must not be
deallocated).
*/
@@ -344,10 +367,10 @@ ZEXTERN int ZEXPORT deflateEnd OF((z_streamp strm));
/*
ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
- Initializes the internal stream state for decompression. The fields
+ Initializes the internal stream state for decompression. The fields
next_in, avail_in, zalloc, zfree and opaque must be initialized before by
- the caller. If next_in is not Z_NULL and avail_in is large enough (the exact
- value depends on the compression method), inflateInit determines the
+ the caller. If next_in is not Z_NULL and avail_in is large enough (the
+ exact value depends on the compression method), inflateInit determines the
compression method from the zlib header and allocates all data structures
accordingly; otherwise the allocation will be deferred to the first call of
inflate. If zalloc and zfree are set to Z_NULL, inflateInit updates them to
@@ -355,95 +378,116 @@ ZEXTERN int ZEXPORT inflateInit OF((z_streamp strm));
inflateInit returns Z_OK if success, Z_MEM_ERROR if there was not enough
memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
- version assumed by the caller. msg is set to null if there is no error
- message. inflateInit does not perform any decompression apart from reading
- the zlib header if present: this will be done by inflate(). (So next_in and
- avail_in may be modified, but next_out and avail_out are unchanged.)
+ version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+ invalid, such as a null pointer to the structure. msg is set to null if
+ there is no error message. inflateInit does not perform any decompression
+ apart from possibly reading the zlib header if present: actual decompression
+ will be done by inflate(). (So next_in and avail_in may be modified, but
+ next_out and avail_out are unused and unchanged.) The current implementation
+ of inflateInit() does not process any header information -- that is deferred
+ until inflate() is called.
*/
ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
/*
inflate decompresses as much data as possible, and stops when the input
- buffer becomes empty or the output buffer becomes full. It may introduce
+ buffer becomes empty or the output buffer becomes full. It may introduce
some output latency (reading input without producing any output) except when
forced to flush.
- The detailed semantics are as follows. inflate performs one or both of the
+ The detailed semantics are as follows. inflate performs one or both of the
following actions:
- Decompress more input starting at next_in and update next_in and avail_in
- accordingly. If not all input can be processed (because there is not
- enough room in the output buffer), next_in is updated and processing
- will resume at this point for the next call of inflate().
+ accordingly. If not all input can be processed (because there is not
+ enough room in the output buffer), next_in is updated and processing will
+ resume at this point for the next call of inflate().
- Provide more output starting at next_out and update next_out and avail_out
- accordingly. inflate() provides as much output as possible, until there
- is no more input data or no more space in the output buffer (see below
- about the flush parameter).
-
- Before the call of inflate(), the application should ensure that at least
- one of the actions is possible, by providing more input and/or consuming
- more output, and updating the next_* and avail_* values accordingly.
- The application can consume the uncompressed output when it wants, for
- example when the output buffer is full (avail_out == 0), or after each
- call of inflate(). If inflate returns Z_OK and with zero avail_out, it
- must be called again after making room in the output buffer because there
- might be more output pending.
-
- The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH,
- Z_FINISH, or Z_BLOCK. Z_SYNC_FLUSH requests that inflate() flush as much
- output as possible to the output buffer. Z_BLOCK requests that inflate() stop
- if and when it gets to the next deflate block boundary. When decoding the
- zlib or gzip format, this will cause inflate() to return immediately after
- the header and before the first block. When doing a raw inflate, inflate()
- will go ahead and process the first block, and will return when it gets to
- the end of that block, or when it runs out of data.
+ accordingly. inflate() provides as much output as possible, until there is
+ no more input data or no more space in the output buffer (see below about
+ the flush parameter).
+
+ Before the call of inflate(), the application should ensure that at least
+ one of the actions is possible, by providing more input and/or consuming more
+ output, and updating the next_* and avail_* values accordingly. The
+ application can consume the uncompressed output when it wants, for example
+ when the output buffer is full (avail_out == 0), or after each call of
+ inflate(). If inflate returns Z_OK and with zero avail_out, it must be
+ called again after making room in the output buffer because there might be
+ more output pending.
+
+ The flush parameter of inflate() can be Z_NO_FLUSH, Z_SYNC_FLUSH, Z_FINISH,
+ Z_BLOCK, or Z_TREES. Z_SYNC_FLUSH requests that inflate() flush as much
+ output as possible to the output buffer. Z_BLOCK requests that inflate()
+ stop if and when it gets to the next deflate block boundary. When decoding
+ the zlib or gzip format, this will cause inflate() to return immediately
+ after the header and before the first block. When doing a raw inflate,
+ inflate() will go ahead and process the first block, and will return when it
+ gets to the end of that block, or when it runs out of data.
The Z_BLOCK option assists in appending to or combining deflate streams.
Also to assist in this, on return inflate() will set strm->data_type to the
- number of unused bits in the last byte taken from strm->next_in, plus 64
- if inflate() is currently decoding the last block in the deflate stream,
- plus 128 if inflate() returned immediately after decoding an end-of-block
- code or decoding the complete header up to just before the first byte of the
- deflate stream. The end-of-block will not be indicated until all of the
- uncompressed data from that block has been written to strm->next_out. The
- number of unused bits may in general be greater than seven, except when
- bit 7 of data_type is set, in which case the number of unused bits will be
- less than eight.
+ number of unused bits in the last byte taken from strm->next_in, plus 64 if
+ inflate() is currently decoding the last block in the deflate stream, plus
+ 128 if inflate() returned immediately after decoding an end-of-block code or
+ decoding the complete header up to just before the first byte of the deflate
+ stream. The end-of-block will not be indicated until all of the uncompressed
+ data from that block has been written to strm->next_out. The number of
+ unused bits may in general be greater than seven, except when bit 7 of
+ data_type is set, in which case the number of unused bits will be less than
+ eight. data_type is set as noted here every time inflate() returns for all
+ flush options, and so can be used to determine the amount of currently
+ consumed input in bits.
+
+ The Z_TREES option behaves as Z_BLOCK does, but it also returns when the
+ end of each deflate block header is reached, before any actual data in that
+ block is decoded. This allows the caller to determine the length of the
+ deflate block header for later use in random access within a deflate block.
+ 256 is added to the value of strm->data_type when inflate() returns
+ immediately after reaching the end of the deflate block header.
inflate() should normally be called until it returns Z_STREAM_END or an
- error. However if all decompression is to be performed in a single step
- (a single call of inflate), the parameter flush should be set to
- Z_FINISH. In this case all pending input is processed and all pending
- output is flushed; avail_out must be large enough to hold all the
- uncompressed data. (The size of the uncompressed data may have been saved
- by the compressor for this purpose.) The next operation on this stream must
- be inflateEnd to deallocate the decompression state. The use of Z_FINISH
- is never required, but can be used to inform inflate that a faster approach
- may be used for the single inflate() call.
+ error. However if all decompression is to be performed in a single step (a
+ single call of inflate), the parameter flush should be set to Z_FINISH. In
+ this case all pending input is processed and all pending output is flushed;
+ avail_out must be large enough to hold all of the uncompressed data for the
+ operation to complete. (The size of the uncompressed data may have been
+ saved by the compressor for this purpose.) The use of Z_FINISH is not
+ required to perform an inflation in one step. However it may be used to
+ inform inflate that a faster approach can be used for the single inflate()
+ call. Z_FINISH also informs inflate to not maintain a sliding window if the
+ stream completes, which reduces inflate's memory footprint. If the stream
+ does not complete, either because not all of the stream is provided or not
+ enough output space is provided, then a sliding window will be allocated and
+ inflate() can be called again to continue the operation as if Z_NO_FLUSH had
+ been used.
In this implementation, inflate() always flushes as much output as
possible to the output buffer, and always uses the faster approach on the
- first call. So the only effect of the flush parameter in this implementation
- is on the return value of inflate(), as noted below, or when it returns early
- because Z_BLOCK is used.
+ first call. So the effects of the flush parameter in this implementation are
+ on the return value of inflate() as noted below, when inflate() returns early
+ when Z_BLOCK or Z_TREES is used, and when inflate() avoids the allocation of
+ memory for a sliding window when Z_FINISH is used.
If a preset dictionary is needed after this call (see inflateSetDictionary
- below), inflate sets strm->adler to the adler32 checksum of the dictionary
+ below), inflate sets strm->adler to the Adler-32 checksum of the dictionary
chosen by the compressor and returns Z_NEED_DICT; otherwise it sets
- strm->adler to the adler32 checksum of all output produced so far (that is,
+ strm->adler to the Adler-32 checksum of all output produced so far (that is,
total_out bytes) and returns Z_OK, Z_STREAM_END or an error code as described
- below. At the end of the stream, inflate() checks that its computed adler32
+ below. At the end of the stream, inflate() checks that its computed adler32
checksum is equal to that saved by the compressor and returns Z_STREAM_END
only if the checksum is correct.
- inflate() will decompress and check either zlib-wrapped or gzip-wrapped
- deflate data. The header type is detected automatically. Any information
- contained in the gzip header is not retained, so applications that need that
- information should instead use raw inflate, see inflateInit2() below, or
- inflateBack() and perform their own processing of the gzip header and
- trailer.
+ inflate() can decompress and check either zlib-wrapped or gzip-wrapped
+ deflate data. The header type is detected automatically, if requested when
+ initializing with inflateInit2(). Any information contained in the gzip
+ header is not retained, so applications that need that information should
+ instead use raw inflate, see inflateInit2() below, or inflateBack() and
+ perform their own processing of the gzip header and trailer. When processing
+ gzip-wrapped deflate data, strm->adler is set to the CRC-32 of the output
+ produced so far. The CRC-32 is checked against the gzip trailer.
inflate() returns Z_OK if some progress has been made (more input processed
or more output produced), Z_STREAM_END if the end of the compressed data has
@@ -451,27 +495,28 @@ ZEXTERN int ZEXPORT inflate OF((z_streamp strm, int flush));
preset dictionary is needed at this point, Z_DATA_ERROR if the input data was
corrupted (input stream not conforming to the zlib format or incorrect check
value), Z_STREAM_ERROR if the stream structure was inconsistent (for example
- if next_in or next_out was NULL), Z_MEM_ERROR if there was not enough memory,
+ next_in or next_out was Z_NULL), Z_MEM_ERROR if there was not enough memory,
Z_BUF_ERROR if no progress is possible or if there was not enough room in the
- output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
+ output buffer when Z_FINISH is used. Note that Z_BUF_ERROR is not fatal, and
inflate() can be called again with more input and more output space to
- continue decompressing. If Z_DATA_ERROR is returned, the application may then
- call inflateSync() to look for a good compression block if a partial recovery
- of the data is desired.
+ continue decompressing. If Z_DATA_ERROR is returned, the application may
+ then call inflateSync() to look for a good compression block if a partial
+ recovery of the data is desired.
*/
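
For symmetry, a sketch of the inflate() loop just described (again modeled on zpipe.c, names illustrative): the compressed data is already in memory and the output buffer is drained whenever it fills.

#include <stdio.h>
#include "zlib.h"

#define CHUNK 16384

/* Sketch: decompress in[0..in_len-1] to dst; keep looping while the output
 * buffer fills, stop at Z_STREAM_END or on a fatal error. */
static int inflate_buffer_to_file(z_streamp strm, unsigned char *in,
                                  uInt in_len, FILE *dst)
{
    unsigned char out[CHUNK];
    int ret;

    strm->next_in  = in;
    strm->avail_in = in_len;
    do {
        strm->next_out  = out;
        strm->avail_out = CHUNK;
        ret = inflate(strm, Z_NO_FLUSH);
        if (ret != Z_OK && ret != Z_STREAM_END)
            return ret;                   /* Z_NEED_DICT, Z_DATA_ERROR, ... */
        fwrite(out, 1, CHUNK - strm->avail_out, dst);
    } while (ret != Z_STREAM_END && strm->avail_out == 0);
    return ret == Z_STREAM_END ? Z_OK : Z_BUF_ERROR;  /* truncated input */
}
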
ZEXTERN int ZEXPORT inflateEnd OF((z_streamp strm));
/*
All dynamically allocated data structures for this stream are freed.
- This function discards any unprocessed input and does not flush any
- pending output.
+ This function discards any unprocessed input and does not flush any pending
+ output.
inflateEnd returns Z_OK if success, Z_STREAM_ERROR if the stream state
- was inconsistent. In the error case, msg may be set but then points to a
+ was inconsistent. In the error case, msg may be set but then points to a
static string (which must not be deallocated).
*/
+
/* Advanced functions */
/*
@@ -486,55 +531,57 @@ ZEXTERN int ZEXPORT deflateInit2 OF((z_streamp strm,
int memLevel,
int strategy));
- This is another version of deflateInit with more compression options. The
- fields next_in, zalloc, zfree and opaque must be initialized before by
- the caller.
+ This is another version of deflateInit with more compression options. The
+ fields next_in, zalloc, zfree and opaque must be initialized before by the
+ caller.
- The method parameter is the compression method. It must be Z_DEFLATED in
+ The method parameter is the compression method. It must be Z_DEFLATED in
this version of the library.
The windowBits parameter is the base two logarithm of the window size
- (the size of the history buffer). It should be in the range 8..15 for this
- version of the library. Larger values of this parameter result in better
- compression at the expense of memory usage. The default value is 15 if
+ (the size of the history buffer). It should be in the range 8..15 for this
+ version of the library. Larger values of this parameter result in better
+ compression at the expense of memory usage. The default value is 15 if
deflateInit is used instead.
- windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
- determines the window size. deflate() will then generate raw deflate data
+ windowBits can also be -8..-15 for raw deflate. In this case, -windowBits
+ determines the window size. deflate() will then generate raw deflate data
with no zlib header or trailer, and will not compute an adler32 check value.
- windowBits can also be greater than 15 for optional gzip encoding. Add
+ windowBits can also be greater than 15 for optional gzip encoding. Add
16 to windowBits to write a simple gzip header and trailer around the
- compressed data instead of a zlib wrapper. The gzip header will have no
- file name, no extra data, no comment, no modification time (set to zero),
- no header crc, and the operating system will be set to 255 (unknown). If a
+ compressed data instead of a zlib wrapper. The gzip header will have no
+ file name, no extra data, no comment, no modification time (set to zero), no
+ header crc, and the operating system will be set to 255 (unknown). If a
gzip stream is being written, strm->adler is a crc32 instead of an adler32.
The memLevel parameter specifies how much memory should be allocated
- for the internal compression state. memLevel=1 uses minimum memory but
- is slow and reduces compression ratio; memLevel=9 uses maximum memory
- for optimal speed. The default value is 8. See zconf.h for total memory
- usage as a function of windowBits and memLevel.
+ for the internal compression state. memLevel=1 uses minimum memory but is
+ slow and reduces compression ratio; memLevel=9 uses maximum memory for
+ optimal speed. The default value is 8. See zconf.h for total memory usage
+ as a function of windowBits and memLevel.
- The strategy parameter is used to tune the compression algorithm. Use the
+ The strategy parameter is used to tune the compression algorithm. Use the
value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only (no
string match), or Z_RLE to limit match distances to one (run-length
- encoding). Filtered data consists mostly of small values with a somewhat
- random distribution. In this case, the compression algorithm is tuned to
- compress them better. The effect of Z_FILTERED is to force more Huffman
+ encoding). Filtered data consists mostly of small values with a somewhat
+ random distribution. In this case, the compression algorithm is tuned to
+ compress them better. The effect of Z_FILTERED is to force more Huffman
coding and less string matching; it is somewhat intermediate between
- Z_DEFAULT and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as
- Z_HUFFMAN_ONLY, but give better compression for PNG image data. The strategy
- parameter only affects the compression ratio but not the correctness of the
- compressed output even if it is not set appropriately. Z_FIXED prevents the
- use of dynamic Huffman codes, allowing for a simpler decoder for special
- applications.
-
- deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_STREAM_ERROR if a parameter is invalid (such as an invalid
- method). msg is set to null if there is no error message. deflateInit2 does
- not perform any compression: this will be done by deflate().
+ Z_DEFAULT_STRATEGY and Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as
+ fast as Z_HUFFMAN_ONLY, but give better compression for PNG image data. The
+ strategy parameter only affects the compression ratio but not the
+ correctness of the compressed output even if it is not set appropriately.
+ Z_FIXED prevents the use of dynamic Huffman codes, allowing for a simpler
+ decoder for special applications.
+
+ deflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ memory, Z_STREAM_ERROR if any parameter is invalid (such as an invalid
+ method), or Z_VERSION_ERROR if the zlib library version (zlib_version) is
+ incompatible with the version assumed by the caller (ZLIB_VERSION). msg is
+ set to null if there is no error message. deflateInit2 does not perform any
+ compression: this will be done by deflate().
*/
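
To make the windowBits conventions above concrete, a small editorial sketch (helper names are made up; memLevel and strategy use the defaults mentioned in the text, and strm's allocation fields are assumed to already be Z_NULL or valid):

#include "zlib.h"

/* Sketch: 15 + 16 asks for a gzip wrapper, -15 for raw deflate data with
 * a 32K window; memLevel 8 and Z_DEFAULT_STRATEGY are the defaults. */
static int open_gzip_deflate(z_streamp strm)
{
    return deflateInit2(strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                        15 + 16, 8, Z_DEFAULT_STRATEGY);
}

static int open_raw_deflate(z_streamp strm)
{
    return deflateInit2(strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
                        -15, 8, Z_DEFAULT_STRATEGY);
}
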
ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
@@ -542,38 +589,43 @@ ZEXTERN int ZEXPORT deflateSetDictionary OF((z_streamp strm,
uInt dictLength));
/*
Initializes the compression dictionary from the given byte sequence
- without producing any compressed output. This function must be called
- immediately after deflateInit, deflateInit2 or deflateReset, before any
- call of deflate. The compressor and decompressor must use exactly the same
- dictionary (see inflateSetDictionary).
+ without producing any compressed output. When using the zlib format, this
+ function must be called immediately after deflateInit, deflateInit2 or
+ deflateReset, and before any call of deflate. When doing raw deflate, this
+ function must be called either before any call of deflate, or immediately
+ after the completion of a deflate block, i.e. after all input has been
+ consumed and all output has been delivered when using any of the flush
+ options Z_BLOCK, Z_PARTIAL_FLUSH, Z_SYNC_FLUSH, or Z_FULL_FLUSH. The
+ compressor and decompressor must use exactly the same dictionary (see
+ inflateSetDictionary).
The dictionary should consist of strings (byte sequences) that are likely
to be encountered later in the data to be compressed, with the most commonly
- used strings preferably put towards the end of the dictionary. Using a
+ used strings preferably put towards the end of the dictionary. Using a
dictionary is most useful when the data to be compressed is short and can be
predicted with good accuracy; the data can then be compressed better than
with the default empty dictionary.
Depending on the size of the compression data structures selected by
deflateInit or deflateInit2, a part of the dictionary may in effect be
- discarded, for example if the dictionary is larger than the window size in
- deflate or deflate2. Thus the strings most likely to be useful should be
- put at the end of the dictionary, not at the front. In addition, the
- current implementation of deflate will use at most the window size minus
- 262 bytes of the provided dictionary.
+ discarded, for example if the dictionary is larger than the window size
+ provided in deflateInit or deflateInit2. Thus the strings most likely to be
+ useful should be put at the end of the dictionary, not at the front. In
+ addition, the current implementation of deflate will use at most the window
+ size minus 262 bytes of the provided dictionary.
Upon return of this function, strm->adler is set to the adler32 value
of the dictionary; the decompressor may later use this value to determine
- which dictionary has been used by the compressor. (The adler32 value
+ which dictionary has been used by the compressor. (The adler32 value
applies to the whole dictionary even if only a subset of the dictionary is
actually used by the compressor.) If a raw deflate was requested, then the
adler32 value is not computed and strm->adler is not set.
deflateSetDictionary returns Z_OK if success, or Z_STREAM_ERROR if a
- parameter is invalid (such as NULL dictionary) or the stream state is
+ parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is
inconsistent (for example if deflate has already been called for this stream
- or if the compression method is bsort). deflateSetDictionary does not
- perform any compression: this will be done by deflate().
+ or if not at a block boundary for raw deflate). deflateSetDictionary does
+ not perform any compression: this will be done by deflate().
*/
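
A brief sketch of the dictionary handshake described above; the dictionary bytes are hypothetical and must be identical on the inflate side (see inflateSetDictionary below):

#include "zlib.h"

/* Sketch: install a preset dictionary right after initialization (zlib
 * format), as required above; strm->adler then holds the dictionary's
 * adler32, which inflate() later reports alongside Z_NEED_DICT. */
static const Bytef dict[] = "strings likely to recur in the data";

static int open_deflate_with_dict(z_streamp strm)
{
    int ret = deflateInit(strm, Z_DEFAULT_COMPRESSION);
    if (ret != Z_OK)
        return ret;
    return deflateSetDictionary(strm, dict, (uInt)(sizeof(dict) - 1));
}
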
ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
@@ -583,26 +635,26 @@ ZEXTERN int ZEXPORT deflateCopy OF((z_streamp dest,
This function can be useful when several compression strategies will be
tried, for example when there are several ways of pre-processing the input
- data with a filter. The streams that will be discarded should then be freed
+ data with a filter. The streams that will be discarded should then be freed
by calling deflateEnd. Note that deflateCopy duplicates the internal
- compression state which can be quite large, so this strategy is slow and
- can consume lots of memory.
+ compression state which can be quite large, so this strategy is slow and can
+ consume lots of memory.
deflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
- (such as zalloc being NULL). msg is left unchanged in both source and
+ (such as zalloc being Z_NULL). msg is left unchanged in both source and
destination.
*/
ZEXTERN int ZEXPORT deflateReset OF((z_streamp strm));
/*
This function is equivalent to deflateEnd followed by deflateInit,
- but does not free and reallocate all the internal compression state.
- The stream will keep the same compression level and any other attributes
- that may have been set by deflateInit2.
+ but does not free and reallocate all the internal compression state. The
+ stream will keep the same compression level and any other attributes that
+ may have been set by deflateInit2.
- deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent (such as zalloc or state being NULL).
+ deflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL).
*/
ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
@@ -612,18 +664,18 @@ ZEXTERN int ZEXPORT deflateParams OF((z_streamp strm,
Dynamically update the compression level and compression strategy. The
interpretation of level and strategy is as in deflateInit2. This can be
used to switch between compression and straight copy of the input data, or
- to switch to a different kind of input data requiring a different
- strategy. If the compression level is changed, the input available so far
- is compressed with the old level (and may be flushed); the new level will
- take effect only at the next call of deflate().
+ to switch to a different kind of input data requiring a different strategy.
+ If the compression level is changed, the input available so far is
+ compressed with the old level (and may be flushed); the new level will take
+ effect only at the next call of deflate().
Before the call of deflateParams, the stream state must be set as for
- a call of deflate(), since the currently available input may have to
- be compressed and flushed. In particular, strm->avail_out must be non-zero.
+ a call of deflate(), since the currently available input may have to be
+ compressed and flushed. In particular, strm->avail_out must be non-zero.
deflateParams returns Z_OK if success, Z_STREAM_ERROR if the source
- stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR
- if strm->avail_out was zero.
+ stream state was inconsistent or if a parameter was invalid, Z_BUF_ERROR if
+ strm->avail_out was zero.
*/
ZEXTERN int ZEXPORT deflateTune OF((z_streamp strm,
@@ -647,31 +699,53 @@ ZEXTERN uLong ZEXPORT deflateBound OF((z_streamp strm,
uLong sourceLen));
/*
deflateBound() returns an upper bound on the compressed size after
- deflation of sourceLen bytes. It must be called after deflateInit()
- or deflateInit2(). This would be used to allocate an output buffer
- for deflation in a single pass, and so would be called before deflate().
+ deflation of sourceLen bytes. It must be called after deflateInit() or
+ deflateInit2(), and after deflateSetHeader(), if used. This would be used
+ to allocate an output buffer for deflation in a single pass, and so would be
+ called before deflate(). If that first deflate() call is provided the
+ sourceLen input bytes, an output buffer allocated to the size returned by
+ deflateBound(), and the flush value Z_FINISH, then deflate() is guaranteed
+ to return Z_STREAM_END. Note that it is possible for the compressed size to
+ be larger than the value returned by deflateBound() if flush options other
+ than Z_FINISH or Z_NO_FLUSH are used.
*/
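
A sketch of the single-pass pattern this enables (buffer management and names are illustrative): size the output with deflateBound() so a single Z_FINISH call is guaranteed to return Z_STREAM_END.

#include <stdlib.h>
#include "zlib.h"

/* Sketch: one-shot compression into a buffer sized by deflateBound().
 * strm is assumed to be freshly initialized with deflateInit()/Init2(). */
static unsigned char *compress_once(z_streamp strm, unsigned char *in,
                                    uLong in_len, uLong *out_len)
{
    uLong bound = deflateBound(strm, in_len);
    unsigned char *out = malloc(bound);

    if (out == NULL)
        return NULL;
    strm->next_in   = in;
    strm->avail_in  = (uInt)in_len;
    strm->next_out  = out;
    strm->avail_out = (uInt)bound;
    if (deflate(strm, Z_FINISH) != Z_STREAM_END) {   /* should not happen */
        free(out);
        return NULL;
    }
    *out_len = strm->total_out;
    return out;
}
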
+ZEXTERN int ZEXPORT deflatePending OF((z_streamp strm,
+ unsigned *pending,
+ int *bits));
+/*
+ deflatePending() returns the number of bytes and bits of output that have
+ been generated, but not yet provided in the available output. The bytes not
+ provided would be due to the available output space having been consumed.
+ The number of bits of output not provided is between 0 and 7, where they
+ await more bits to join them in order to fill out a full byte. If pending
+ or bits are Z_NULL, then those values are not set.
+
+ deflatePending returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent.
+ */
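
For illustration only, how this new query might be wrapped by code that owns an active deflate stream (the wrapper name is made up):

#include "zlib.h"

/* Sketch: ask how much output deflate() is still buffering internally. */
static int report_pending(z_streamp strm, unsigned *bytes, int *bits)
{
    /* On Z_OK, *bytes whole bytes plus *bits (0..7) extra bits are still
       waiting to be delivered in future output; either pointer may be
       Z_NULL if the caller does not care. */
    return deflatePending(strm, bytes, bits);
}
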
+
ZEXTERN int ZEXPORT deflatePrime OF((z_streamp strm,
int bits,
int value));
/*
deflatePrime() inserts bits in the deflate output stream. The intent
- is that this function is used to start off the deflate output with the
- bits leftover from a previous deflate stream when appending to it. As such,
- this function can only be used for raw deflate, and must be used before the
- first deflate() call after a deflateInit2() or deflateReset(). bits must be
- less than or equal to 16, and that many of the least significant bits of
- value will be inserted in the output.
-
- deflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent.
+ is that this function is used to start off the deflate output with the bits
+ leftover from a previous deflate stream when appending to it. As such, this
+ function can only be used for raw deflate, and must be used before the first
+ deflate() call after a deflateInit2() or deflateReset(). bits must be less
+ than or equal to 16, and that many of the least significant bits of value
+ will be inserted in the output.
+
+ deflatePrime returns Z_OK if success, Z_BUF_ERROR if there was not enough
+ room in the internal buffer to insert the bits, or Z_STREAM_ERROR if the
+ source stream state was inconsistent.
*/
ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
gz_headerp head));
/*
- deflateSetHeader() provides gzip header information for when a gzip
+ deflateSetHeader() provides gzip header information for when a gzip
stream is requested by deflateInit2(). deflateSetHeader() may be called
after deflateInit2() or deflateReset() and before the first call of
deflate(). The text, time, os, extra field, name, and comment information
@@ -684,11 +758,11 @@ ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
1.3.x) do not support header crc's, and will report that it is a "multi-part
gzip file" and give up.
- If deflateSetHeader is not used, the default gzip header has text false,
+ If deflateSetHeader is not used, the default gzip header has text false,
the time set to zero, and os set to 255, with no extra, name, or comment
fields. The gzip header is returned to the default state by deflateReset().
- deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ deflateSetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
stream state was inconsistent.
*/
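
A hedged sketch of supplying header fields (field values are illustrative; the gz_header must remain valid until deflate() has actually written the header, hence the static storage here):

#include <string.h>
#include "zlib.h"

/* Sketch: attach a file name to the gzip header of a stream opened with
 * deflateInit2(..., 15 + 16, ...). */
static int set_gzip_header(z_streamp strm, char *fname)
{
    static gz_header head;            /* read by deflate() while writing */

    memset(&head, 0, sizeof(head));
    head.name = (Bytef *)fname;       /* zero-terminated name to store */
    head.os   = 3;                    /* Unix, instead of the default 255 */
    return deflateSetHeader(strm, &head);
}
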
@@ -696,43 +770,50 @@ ZEXTERN int ZEXPORT deflateSetHeader OF((z_streamp strm,
ZEXTERN int ZEXPORT inflateInit2 OF((z_streamp strm,
int windowBits));
- This is another version of inflateInit with an extra parameter. The
+ This is another version of inflateInit with an extra parameter. The
fields next_in, avail_in, zalloc, zfree and opaque must be initialized
before by the caller.
The windowBits parameter is the base two logarithm of the maximum window
size (the size of the history buffer). It should be in the range 8..15 for
- this version of the library. The default value is 15 if inflateInit is used
- instead. windowBits must be greater than or equal to the windowBits value
+ this version of the library. The default value is 15 if inflateInit is used
+ instead. windowBits must be greater than or equal to the windowBits value
provided to deflateInit2() while compressing, or it must be equal to 15 if
- deflateInit2() was not used. If a compressed stream with a larger window
+ deflateInit2() was not used. If a compressed stream with a larger window
size is given as input, inflate() will return with the error code
Z_DATA_ERROR instead of trying to allocate a larger window.
- windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
- determines the window size. inflate() will then process raw deflate data,
+ windowBits can also be zero to request that inflate use the window size in
+ the zlib header of the compressed stream.
+
+ windowBits can also be -8..-15 for raw inflate. In this case, -windowBits
+ determines the window size. inflate() will then process raw deflate data,
not looking for a zlib or gzip header, not generating a check value, and not
- looking for any check values for comparison at the end of the stream. This
+ looking for any check values for comparison at the end of the stream. This
is for use with other formats that use the deflate compressed data format
- such as zip. Those formats provide their own check values. If a custom
+ such as zip. Those formats provide their own check values. If a custom
format is developed using the raw deflate format for compressed data, it is
recommended that a check value such as an adler32 or a crc32 be applied to
the uncompressed data as is done in the zlib, gzip, and zip formats. For
- most applications, the zlib format should be used as is. Note that comments
+ most applications, the zlib format should be used as is. Note that comments
   above on the use in deflateInit2() apply to the magnitude of windowBits.
- windowBits can also be greater than 15 for optional gzip decoding. Add
+ windowBits can also be greater than 15 for optional gzip decoding. Add
32 to windowBits to enable zlib and gzip decoding with automatic header
detection, or add 16 to decode only the gzip format (the zlib format will
- return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is
- a crc32 instead of an adler32.
+ return a Z_DATA_ERROR). If a gzip stream is being decoded, strm->adler is a
+ crc32 instead of an adler32.
inflateInit2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
- memory, Z_STREAM_ERROR if a parameter is invalid (such as a null strm). msg
- is set to null if there is no error message. inflateInit2 does not perform
- any decompression apart from reading the zlib header if present: this will
- be done by inflate(). (So next_in and avail_in may be modified, but next_out
- and avail_out are unchanged.)
+ memory, Z_VERSION_ERROR if the zlib library version is incompatible with the
+ version assumed by the caller, or Z_STREAM_ERROR if the parameters are
+ invalid, such as a null pointer to the structure. msg is set to null if
+ there is no error message. inflateInit2 does not perform any decompression
+ apart from possibly reading the zlib header if present: actual decompression
+ will be done by inflate(). (So next_in and avail_in may be modified, but
+ next_out and avail_out are unused and unchanged.) The current implementation
+ of inflateInit2() does not process any header information -- that is
+ deferred until inflate() is called.
*/
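
Again a small editorial sketch of the windowBits conventions, this time for decompression (helper names are made up; strm's allocation fields are assumed to already be Z_NULL or valid):

#include "zlib.h"

/* Sketch: 15 + 32 enables automatic zlib/gzip detection from the header,
 * 15 + 16 accepts only gzip, and -15 processes raw deflate data. */
static int open_auto_inflate(z_streamp strm)
{
    return inflateInit2(strm, 15 + 32);
}

static int open_raw_inflate(z_streamp strm)
{
    return inflateInit2(strm, -15);
}
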
ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
@@ -740,36 +821,56 @@ ZEXTERN int ZEXPORT inflateSetDictionary OF((z_streamp strm,
uInt dictLength));
/*
Initializes the decompression dictionary from the given uncompressed byte
- sequence. This function must be called immediately after a call of inflate,
- if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
+ sequence. This function must be called immediately after a call of inflate,
+ if that call returned Z_NEED_DICT. The dictionary chosen by the compressor
can be determined from the adler32 value returned by that call of inflate.
The compressor and decompressor must use exactly the same dictionary (see
- deflateSetDictionary). For raw inflate, this function can be called
- immediately after inflateInit2() or inflateReset() and before any call of
- inflate() to set the dictionary. The application must insure that the
- dictionary that was used for compression is provided.
+ deflateSetDictionary). For raw inflate, this function can be called at any
+ time to set the dictionary. If the provided dictionary is smaller than the
+ window and there is already data in the window, then the provided dictionary
+ will amend what's there. The application must ensure that the dictionary
+ that was used for compression is provided.
inflateSetDictionary returns Z_OK if success, Z_STREAM_ERROR if a
- parameter is invalid (such as NULL dictionary) or the stream state is
+ parameter is invalid (e.g. dictionary being Z_NULL) or the stream state is
inconsistent, Z_DATA_ERROR if the given dictionary doesn't match the
- expected one (incorrect adler32 value). inflateSetDictionary does not
+ expected one (incorrect adler32 value). inflateSetDictionary does not
perform any decompression: this will be done by subsequent calls of
inflate().
*/
-ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+ZEXTERN int ZEXPORT inflateGetDictionary OF((z_streamp strm,
+ Bytef *dictionary,
+ uInt *dictLength));
/*
- Skips invalid compressed data until a full flush point (see above the
- description of deflate with Z_FULL_FLUSH) can be found, or until all
- available input is skipped. No output is provided.
+ Returns the sliding dictionary being maintained by inflate. dictLength is
+ set to the number of bytes in the dictionary, and that many bytes are copied
+ to dictionary. dictionary must have enough space, where 32768 bytes is
+ always enough. If inflateGetDictionary() is called with dictionary equal to
+ Z_NULL, then only the dictionary length is returned, and nothing is copied.
+ Similarly, if dictLength is Z_NULL, then it is not set.
+
+ inflateGetDictionary returns Z_OK on success, or Z_STREAM_ERROR if the
+ stream state is inconsistent.
+*/
- inflateSync returns Z_OK if a full flush point has been found, Z_BUF_ERROR
- if no more input was provided, Z_DATA_ERROR if no flush point has been found,
- or Z_STREAM_ERROR if the stream structure was inconsistent. In the success
- case, the application may save the current current value of total_in which
- indicates where valid compressed data was found. In the error case, the
- application may repeatedly call inflateSync, providing more input each time,
- until success or end of the input data.
+ZEXTERN int ZEXPORT inflateSync OF((z_streamp strm));
+/*
+ Skips invalid compressed data until a possible full flush point (see above
+ for the description of deflate with Z_FULL_FLUSH) can be found, or until all
+ available input is skipped. No output is provided.
+
+ inflateSync searches for a 00 00 FF FF pattern in the compressed data.
+ All full flush points have this pattern, but not all occurrences of this
+ pattern are full flush points.
+
+ inflateSync returns Z_OK if a possible full flush point has been found,
+ Z_BUF_ERROR if no more input was provided, Z_DATA_ERROR if no flush point
+ has been found, or Z_STREAM_ERROR if the stream structure was inconsistent.
+ In the success case, the application may save the current value of
+ total_in which indicates where valid compressed data was found. In the
+ error case, the application may repeatedly call inflateSync, providing more
+ input each time, until success or end of the input data.
*/
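
A sketch of the recovery loop the last paragraph suggests; refill is a hypothetical callback that appends more compressed input to the stream and returns non-zero when none is left:

#include "zlib.h"

/* Sketch: after inflate() reports Z_DATA_ERROR, scan forward to the next
 * possible full flush point, topping up the input as needed. */
static int resync(z_streamp strm, int (*refill)(z_streamp))
{
    int ret = inflateSync(strm);

    while (ret == Z_BUF_ERROR || ret == Z_DATA_ERROR) {
        if (refill(strm) != 0)        /* hypothetical: non-zero = input exhausted */
            return ret;
        ret = inflateSync(strm);
    }
    return ret;                       /* Z_OK at a possible flush point */
}
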
ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
@@ -784,18 +885,30 @@ ZEXTERN int ZEXPORT inflateCopy OF((z_streamp dest,
inflateCopy returns Z_OK if success, Z_MEM_ERROR if there was not
enough memory, Z_STREAM_ERROR if the source stream state was inconsistent
- (such as zalloc being NULL). msg is left unchanged in both source and
+ (such as zalloc being Z_NULL). msg is left unchanged in both source and
destination.
*/
ZEXTERN int ZEXPORT inflateReset OF((z_streamp strm));
/*
This function is equivalent to inflateEnd followed by inflateInit,
- but does not free and reallocate all the internal decompression state.
- The stream will keep attributes that may have been set by inflateInit2.
+ but does not free and reallocate all the internal decompression state. The
+ stream will keep attributes that may have been set by inflateInit2.
- inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
- stream state was inconsistent (such as zalloc or state being NULL).
+ inflateReset returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL).
+*/
+
+ZEXTERN int ZEXPORT inflateReset2 OF((z_streamp strm,
+ int windowBits));
+/*
+ This function is the same as inflateReset, but it also permits changing
+ the wrap and window size requests. The windowBits parameter is interpreted
+ the same as it is for inflateInit2.
+
+ inflateReset2 returns Z_OK if success, or Z_STREAM_ERROR if the source
+ stream state was inconsistent (such as zalloc or state being Z_NULL), or if
+ the windowBits parameter is invalid.
*/
ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
@@ -803,54 +916,87 @@ ZEXTERN int ZEXPORT inflatePrime OF((z_streamp strm,
int value));
/*
This function inserts bits in the inflate input stream. The intent is
- that this function is used to start inflating at a bit position in the
- middle of a byte. The provided bits will be used before any bytes are used
- from next_in. This function should only be used with raw inflate, and
- should be used before the first inflate() call after inflateInit2() or
- inflateReset(). bits must be less than or equal to 16, and that many of the
- least significant bits of value will be inserted in the input.
-
- inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
+ that this function is used to start inflating at a bit position in the
+ middle of a byte. The provided bits will be used before any bytes are used
+ from next_in. This function should only be used with raw inflate, and
+ should be used before the first inflate() call after inflateInit2() or
+ inflateReset(). bits must be less than or equal to 16, and that many of the
+ least significant bits of value will be inserted in the input.
+
+ If bits is negative, then the input stream bit buffer is emptied. Then
+ inflatePrime() can be called again to put bits in the buffer. This is used
+ to clear out bits leftover after feeding inflate a block description prior
+ to feeding inflate codes.
+
+ inflatePrime returns Z_OK if success, or Z_STREAM_ERROR if the source
stream state was inconsistent.
*/
+ZEXTERN long ZEXPORT inflateMark OF((z_streamp strm));
+/*
+ This function returns two values, one in the lower 16 bits of the return
+ value, and the other in the remaining upper bits, obtained by shifting the
+ return value down 16 bits. If the upper value is -1 and the lower value is
+ zero, then inflate() is currently decoding information outside of a block.
+ If the upper value is -1 and the lower value is non-zero, then inflate is in
+ the middle of a stored block, with the lower value equaling the number of
+ bytes from the input remaining to copy. If the upper value is not -1, then
+ it is the number of bits back from the current bit position in the input of
+ the code (literal or length/distance pair) currently being processed. In
+ that case the lower value is the number of bytes already emitted for that
+ code.
+
+ A code is being processed if inflate is waiting for more input to complete
+ decoding of the code, or if it has completed decoding but is waiting for
+ more output space to write the literal or match data.
+
+ inflateMark() is used to mark locations in the input data for random
+ access, which may be at bit positions, and to note those cases where the
+ output of a code may span boundaries of random access blocks. The current
+ location in the input stream can be determined from avail_in and data_type
+ as noted in the description for the Z_BLOCK flush parameter for inflate.
+
+ inflateMark returns the value noted above or -1 << 16 if the provided
+ source stream state was inconsistent.
+*/
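
Because the packing of the return value is easy to misread, a short sketch of how the two fields described above would be separated (helper name is made up):

#include "zlib.h"

/* Sketch: split the inflateMark() result into its upper and lower halves. */
static void decode_mark(z_streamp strm, long *upper, long *lower)
{
    long mark = inflateMark(strm);

    *upper = mark >> 16;          /* -1, or bits back to the code being decoded */
    *lower = mark & 0xffff;       /* stored-block bytes left, or bytes emitted  */
}
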
+
ZEXTERN int ZEXPORT inflateGetHeader OF((z_streamp strm,
gz_headerp head));
/*
- inflateGetHeader() requests that gzip header information be stored in the
+ inflateGetHeader() requests that gzip header information be stored in the
provided gz_header structure. inflateGetHeader() may be called after
inflateInit2() or inflateReset(), and before the first call of inflate().
As inflate() processes the gzip stream, head->done is zero until the header
is completed, at which time head->done is set to one. If a zlib stream is
being decoded, then head->done is set to -1 to indicate that there will be
- no gzip header information forthcoming. Note that Z_BLOCK can be used to
- force inflate() to return immediately after header processing is complete
- and before any actual data is decompressed.
+ no gzip header information forthcoming. Note that Z_BLOCK or Z_TREES can be
+ used to force inflate() to return immediately after header processing is
+ complete and before any actual data is decompressed.
- The text, time, xflags, and os fields are filled in with the gzip header
+ The text, time, xflags, and os fields are filled in with the gzip header
contents. hcrc is set to true if there is a header CRC. (The header CRC
- was valid if done is set to one.) If extra is not Z_NULL, then extra_max
+ was valid if done is set to one.) If extra is not Z_NULL, then extra_max
contains the maximum number of bytes to write to extra. Once done is true,
extra_len contains the actual extra field length, and extra contains the
extra field, or that field truncated if extra_max is less than extra_len.
If name is not Z_NULL, then up to name_max characters are written there,
terminated with a zero unless the length is greater than name_max. If
comment is not Z_NULL, then up to comm_max characters are written there,
- terminated with a zero unless the length is greater than comm_max. When
- any of extra, name, or comment are not Z_NULL and the respective field is
- not present in the header, then that field is set to Z_NULL to signal its
+ terminated with a zero unless the length is greater than comm_max. When any
+ of extra, name, or comment are not Z_NULL and the respective field is not
+ present in the header, then that field is set to Z_NULL to signal its
absence. This allows the use of deflateSetHeader() with the returned
structure to duplicate the header. However if those fields are set to
allocated memory, then the application will need to save those pointers
elsewhere so that they can be eventually freed.
- If inflateGetHeader is not used, then the header information is simply
+ If inflateGetHeader is not used, then the header information is simply
discarded. The header is always checked for validity, including the header
CRC if present. inflateReset() will reset the process to discard the header
information. The application would need to call inflateGetHeader() again to
retrieve the header from the next gzip stream.
- inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
+ inflateGetHeader returns Z_OK if success, or Z_STREAM_ERROR if the source
stream state was inconsistent.
*/
@@ -871,12 +1017,13 @@ ZEXTERN int ZEXPORT inflateBackInit OF((z_streamp strm, int windowBits,
See inflateBack() for the usage of these routines.
inflateBackInit will return Z_OK on success, Z_STREAM_ERROR if any of
- the paramaters are invalid, Z_MEM_ERROR if the internal state could not
- be allocated, or Z_VERSION_ERROR if the version of the library does not
- match the version of the header file.
+ the parameters are invalid, Z_MEM_ERROR if the internal state could not be
+ allocated, or Z_VERSION_ERROR if the version of the library does not match
+ the version of the header file.
*/
-typedef unsigned (*in_func) OF((void FAR *, unsigned char FAR * FAR *));
+typedef unsigned (*in_func) OF((void FAR *,
+ z_const unsigned char FAR * FAR *));
typedef int (*out_func) OF((void FAR *, unsigned char FAR *, unsigned));
ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
@@ -884,24 +1031,25 @@ ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
out_func out, void FAR *out_desc));
/*
inflateBack() does a raw inflate with a single call using a call-back
- interface for input and output. This is more efficient than inflate() for
- file i/o applications in that it avoids copying between the output and the
- sliding window by simply making the window itself the output buffer. This
- function trusts the application to not change the output buffer passed by
- the output function, at least until inflateBack() returns.
+ interface for input and output. This is potentially more efficient than
+ inflate() for file i/o applications, in that it avoids copying between the
+ output and the sliding window by simply making the window itself the output
+ buffer. inflate() can be faster on modern CPUs when used with large
+ buffers. inflateBack() trusts the application to not change the output
+ buffer passed by the output function, at least until inflateBack() returns.
inflateBackInit() must be called first to allocate the internal state
and to initialize the state with the user-provided window buffer.
inflateBack() may then be used multiple times to inflate a complete, raw
- deflate stream with each call. inflateBackEnd() is then called to free
- the allocated state.
+ deflate stream with each call. inflateBackEnd() is then called to free the
+ allocated state.
A raw deflate stream is one with no zlib or gzip header or trailer.
This routine would normally be used in a utility that reads zip or gzip
files and writes out uncompressed files. The utility would decode the
- header and process the trailer on its own, hence this routine expects
- only the raw deflate stream to decompress. This is different from the
- normal behavior of inflate(), which expects either a zlib or gzip header and
+ header and process the trailer on its own, hence this routine expects only
+ the raw deflate stream to decompress. This is different from the normal
+ behavior of inflate(), which expects either a zlib or gzip header and
trailer around the deflate stream.
inflateBack() uses two subroutines supplied by the caller that are then
@@ -927,7 +1075,7 @@ ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
calling inflateBack(). If strm->next_in is Z_NULL, then in() will be called
immediately for input. If strm->next_in is not Z_NULL, then strm->avail_in
must also be initialized, and then if strm->avail_in is not zero, input will
- initially be taken from strm->next_in[0 .. strm->avail_in - 1].
+ initially be taken from strm->next_in[0 .. strm->avail_in - 1].
The in_desc and out_desc parameters of inflateBack() is passed as the
first parameter of in() and out() respectively when they are called. These
@@ -937,15 +1085,15 @@ ZEXTERN int ZEXPORT inflateBack OF((z_streamp strm,
On return, inflateBack() will set strm->next_in and strm->avail_in to
pass back any unused input that was provided by the last in() call. The
return values of inflateBack() can be Z_STREAM_END on success, Z_BUF_ERROR
- if in() or out() returned an error, Z_DATA_ERROR if there was a format
- error in the deflate stream (in which case strm->msg is set to indicate the
- nature of the error), or Z_STREAM_ERROR if the stream was not properly
- initialized. In the case of Z_BUF_ERROR, an input or output error can be
- distinguished using strm->next_in which will be Z_NULL only if in() returned
- an error. If strm->next is not Z_NULL, then the Z_BUF_ERROR was due to
- out() returning non-zero. (in() will always be called before out(), so
- strm->next_in is assured to be defined if out() returns non-zero.) Note
- that inflateBack() cannot return Z_OK.
+ if in() or out() returned an error, Z_DATA_ERROR if there was a format error
+ in the deflate stream (in which case strm->msg is set to indicate the nature
+ of the error), or Z_STREAM_ERROR if the stream was not properly initialized.
+ In the case of Z_BUF_ERROR, an input or output error can be distinguished
+ using strm->next_in which will be Z_NULL only if in() returned an error. If
+ strm->next_in is not Z_NULL, then the Z_BUF_ERROR was due to out() returning
+ non-zero. (in() will always be called before out(), so strm->next_in is
+ assured to be defined if out() returns non-zero.) Note that inflateBack()
+ cannot return Z_OK.
*/
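   For readers unfamiliar with the callback style, the following is a minimal
   sketch of driving inflateBack() from a memory buffer; the names mem_in,
   mem_out and inflate_raw are hypothetical helpers, and the input is assumed
   to be a raw deflate stream.

    #include <stdio.h>
    #include "zlib.h"

    struct mem_src { z_const unsigned char *buf; unsigned len; };

    /* in(): hand all remaining input to inflateBack() in one piece */
    static unsigned mem_in(void *desc, z_const unsigned char **buf)
    {
        struct mem_src *s = (struct mem_src *)desc;
        unsigned n = s->len;
        *buf = s->buf;
        s->len = 0;
        return n;                    /* 0 signals end of input / error */
    }

    /* out(): consume produced output; non-zero return aborts inflateBack() */
    static int mem_out(void *desc, unsigned char *buf, unsigned len)
    {
        return fwrite(buf, 1, len, (FILE *)desc) != len;
    }

    static int inflate_raw(z_const unsigned char *src, unsigned srclen, FILE *dst)
    {
        z_stream strm;
        unsigned char window[32768]; /* 1 << windowBits for windowBits == 15 */
        struct mem_src s;
        int ret;

        s.buf = src;
        s.len = srclen;
        strm.zalloc = Z_NULL; strm.zfree = Z_NULL; strm.opaque = Z_NULL;
        strm.next_in = Z_NULL; strm.avail_in = 0;  /* in() called immediately */
        if (inflateBackInit(&strm, 15, window) != Z_OK)
            return -1;
        ret = inflateBack(&strm, mem_in, &s, mem_out, dst);
        inflateBackEnd(&strm);
        return ret == Z_STREAM_END ? 0 : -1;       /* Z_OK is never returned */
    }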
ZEXTERN int ZEXPORT inflateBackEnd OF((z_streamp strm));
@@ -997,27 +1145,27 @@ ZEXTERN uLong ZEXPORT zlibCompileFlags OF((void));
27-31: 0 (reserved)
*/
+#ifndef Z_SOLO
/* utility functions */
/*
- The following utility functions are implemented on top of the
- basic stream-oriented functions. To simplify the interface, some
- default options are assumed (compression level and memory usage,
- standard memory allocation functions). The source code of these
- utility functions can easily be modified if you need special options.
+ The following utility functions are implemented on top of the basic
+ stream-oriented functions. To simplify the interface, some default options
+ are assumed (compression level and memory usage, standard memory allocation
+ functions). The source code of these utility functions can be modified if
+ you need special options.
*/
ZEXTERN int ZEXPORT compress OF((Bytef *dest, uLongf *destLen,
const Bytef *source, uLong sourceLen));
/*
Compresses the source buffer into the destination buffer. sourceLen is
- the byte length of the source buffer. Upon entry, destLen is the total
- size of the destination buffer, which must be at least the value returned
- by compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ the byte length of the source buffer. Upon entry, destLen is the total size
+ of the destination buffer, which must be at least the value returned by
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
compressed buffer.
- This function can be used to compress a whole file at once if the
- input file is mmap'ed.
+
compress returns Z_OK if success, Z_MEM_ERROR if there was not
enough memory, Z_BUF_ERROR if there was not enough room in the output
buffer.
@@ -1027,11 +1175,11 @@ ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
const Bytef *source, uLong sourceLen,
int level));
/*
- Compresses the source buffer into the destination buffer. The level
+ Compresses the source buffer into the destination buffer. The level
parameter has the same meaning as in deflateInit. sourceLen is the byte
- length of the source buffer. Upon entry, destLen is the total size of the
+ length of the source buffer. Upon entry, destLen is the total size of the
destination buffer, which must be at least the value returned by
- compressBound(sourceLen). Upon exit, destLen is the actual size of the
+ compressBound(sourceLen). Upon exit, destLen is the actual size of the
compressed buffer.
compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
@@ -1042,159 +1190,255 @@ ZEXTERN int ZEXPORT compress2 OF((Bytef *dest, uLongf *destLen,
ZEXTERN uLong ZEXPORT compressBound OF((uLong sourceLen));
/*
compressBound() returns an upper bound on the compressed size after
- compress() or compress2() on sourceLen bytes. It would be used before
- a compress() or compress2() call to allocate the destination buffer.
+ compress() or compress2() on sourceLen bytes. It would be used before a
+ compress() or compress2() call to allocate the destination buffer.
*/
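   A minimal sketch of the intended pattern, with a hypothetical helper name:
   size the destination with compressBound(), then call compress2().

    #include <stdlib.h>
    #include "zlib.h"

    static int compress_buf(const Bytef *src, uLong srclen,
                            Bytef **out, uLongf *outlen)
    {
        uLong bound = compressBound(srclen);   /* worst-case output size */
        Bytef *dest = (Bytef *)malloc(bound);
        int ret;

        if (dest == NULL)
            return Z_MEM_ERROR;
        *outlen = bound;                       /* in: capacity, out: actual size */
        ret = compress2(dest, outlen, src, srclen, 9);
        if (ret != Z_OK) {
            free(dest);
            return ret;
        }
        *out = dest;
        return Z_OK;
    }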
ZEXTERN int ZEXPORT uncompress OF((Bytef *dest, uLongf *destLen,
const Bytef *source, uLong sourceLen));
/*
Decompresses the source buffer into the destination buffer. sourceLen is
- the byte length of the source buffer. Upon entry, destLen is the total
- size of the destination buffer, which must be large enough to hold the
- entire uncompressed data. (The size of the uncompressed data must have
- been saved previously by the compressor and transmitted to the decompressor
- by some mechanism outside the scope of this compression library.)
- Upon exit, destLen is the actual size of the compressed buffer.
- This function can be used to decompress a whole file at once if the
- input file is mmap'ed.
+ the byte length of the source buffer. Upon entry, destLen is the total size
+ of the destination buffer, which must be large enough to hold the entire
+ uncompressed data. (The size of the uncompressed data must have been saved
+ previously by the compressor and transmitted to the decompressor by some
+ mechanism outside the scope of this compression library.) Upon exit, destLen
+ is the actual size of the uncompressed buffer.
uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
enough memory, Z_BUF_ERROR if there was not enough room in the output
- buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete.
+ buffer, or Z_DATA_ERROR if the input data was corrupted or incomplete. In
+ the case where there is not enough room, uncompress() will fill the output
+ buffer with the uncompressed data up to that point.
*/
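   A corresponding sketch for decompression; the helper name is hypothetical,
   and orig_len stands for the uncompressed size that the application
   transmitted by its own means.

    #include "zlib.h"

    static int expand_buf(Bytef *dest, uLong orig_len,
                          const Bytef *src, uLong srclen)
    {
        uLongf destlen = orig_len;             /* room available in dest */
        int ret = uncompress(dest, &destlen, src, srclen);
        /* on Z_OK, destlen now holds the actual uncompressed size */
        return ret;
    }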
+ /* gzip file access functions */
-typedef voidp gzFile;
+/*
+ This library supports reading and writing files in gzip (.gz) format with
+ an interface similar to that of stdio, using the functions that start with
+ "gz". The gzip format is different from the zlib format. gzip is a gzip
+ wrapper, documented in RFC 1952, wrapped around a deflate stream.
+*/
+
+typedef struct gzFile_s *gzFile; /* semi-opaque gzip file descriptor */
-ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
/*
- Opens a gzip (.gz) file for reading or writing. The mode parameter
- is as in fopen ("rb" or "wb") but can also include a compression level
- ("wb9") or a strategy: 'f' for filtered data as in "wb6f", 'h' for
- Huffman only compression as in "wb1h", or 'R' for run-length encoding
- as in "wb1R". (See the description of deflateInit2 for more information
- about the strategy parameter.)
+ZEXTERN gzFile ZEXPORT gzopen OF((const char *path, const char *mode));
+
+ Opens a gzip (.gz) file for reading or writing. The mode parameter is as
+ in fopen ("rb" or "wb") but can also include a compression level ("wb9") or
+ a strategy: 'f' for filtered data as in "wb6f", 'h' for Huffman-only
+ compression as in "wb1h", 'R' for run-length encoding as in "wb1R", or 'F'
+ for fixed code compression as in "wb9F". (See the description of
+ deflateInit2 for more information about the strategy parameter.) 'T' will
+ request transparent writing or appending with no compression and not using
+ the gzip format.
+
+ "a" can be used instead of "w" to request that the gzip stream that will
+ be written be appended to the file. "+" will result in an error, since
+ reading and writing to the same gzip file is not supported. The addition of
+ "x" when writing will create the file exclusively, which fails if the file
+ already exists. On systems that support it, the addition of "e" when
+ reading or writing will set the flag to close the file on an execve() call.
+
+ These functions, as well as gzip, will read and decode a sequence of gzip
+ streams in a file. The append function of gzopen() can be used to create
+ such a file. (Also see gzflush() for another way to do this.) When
+ appending, gzopen does not test whether the file begins with a gzip stream,
+ nor does it look for the end of the gzip streams to begin appending. gzopen
+ will simply append a gzip stream to the existing file.
gzopen can be used to read a file which is not in gzip format; in this
- case gzread will directly read from the file without decompression.
+ case gzread will directly read from the file without decompression. When
+ reading, this will be detected automatically by looking for the magic two-
+ byte gzip header.
+
+ gzopen returns NULL if the file could not be opened, if there was
+ insufficient memory to allocate the gzFile state, or if an invalid mode was
+ specified (an 'r', 'w', or 'a' was not provided, or '+' was provided).
+ errno can be checked to determine if the reason gzopen failed was that the
+ file could not be opened.
+*/
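   A small sketch of the modes described above; the helper name and path are
   hypothetical. The first open compresses at level 9, the second appends a
   separate gzip stream to the same file.

    #include "zlib.h"

    static int write_two_streams(const char *path)
    {
        gzFile gz = gzopen(path, "wb9");       /* write, compression level 9 */
        if (gz == NULL)
            return -1;                         /* errno tells if open() failed */
        gzputs(gz, "first stream\n");
        gzclose(gz);

        gz = gzopen(path, "ab");               /* append a second gzip stream */
        if (gz == NULL)
            return -1;
        gzputs(gz, "second stream\n");
        return gzclose(gz) == Z_OK ? 0 : -1;
    }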
- gzopen returns NULL if the file could not be opened or if there was
- insufficient memory to allocate the (de)compression state; errno
- can be checked to distinguish the two cases (if errno is zero, the
- zlib error is Z_MEM_ERROR). */
+ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+/*
+ gzdopen associates a gzFile with the file descriptor fd. File descriptors
+ are obtained from calls like open, dup, creat, pipe or fileno (if the file
+ has been previously opened with fopen). The mode parameter is as in gzopen.
+
+ The next call of gzclose on the returned gzFile will also close the file
+ descriptor fd, just like fclose(fdopen(fd, mode)) closes the file descriptor
+ fd. If you want to keep fd open, use fd = dup(fd_keep); gz = gzdopen(fd,
+ mode);. The duplicated descriptor should be saved to avoid a leak, since
+ gzdopen does not close fd if it fails. If you are using fileno() to get the
+ file descriptor from a FILE *, then you will have to use dup() to avoid
+ double-close()ing the file descriptor. Both gzclose() and fclose() will
+ close the associated file descriptor, so they need to have different file
+ descriptors.
+
+ gzdopen returns NULL if there was insufficient memory to allocate the
+ gzFile state, if an invalid mode was specified (an 'r', 'w', or 'a' was not
+ provided, or '+' was provided), or if fd is -1. The file descriptor is not
+ used until the next gz* read, write, seek, or close operation, so gzdopen
+ will not detect if fd is invalid (unless fd is -1).
+*/
-ZEXTERN gzFile ZEXPORT gzdopen OF((int fd, const char *mode));
+ZEXTERN int ZEXPORT gzbuffer OF((gzFile file, unsigned size));
/*
- gzdopen() associates a gzFile with the file descriptor fd. File
- descriptors are obtained from calls like open, dup, creat, pipe or
- fileno (in the file has been previously opened with fopen).
- The mode parameter is as in gzopen.
- The next call of gzclose on the returned gzFile will also close the
- file descriptor fd, just like fclose(fdopen(fd), mode) closes the file
- descriptor fd. If you want to keep fd open, use gzdopen(dup(fd), mode).
- gzdopen returns NULL if there was insufficient memory to allocate
- the (de)compression state.
+ Set the internal buffer size used by this library's functions. The
+ default buffer size is 8192 bytes. This function must be called after
+ gzopen() or gzdopen(), and before any other calls that read or write the
+ file. The buffer memory allocation is always deferred to the first read or
+ write. Two buffers are allocated, either both of the specified size when
+ writing, or one of the specified size and the other twice that size when
+ reading. A larger buffer size of, for example, 64K or 128K bytes will
+ noticeably increase the speed of decompression (reading).
+
+ The new buffer size also affects the maximum length for gzprintf().
+
+ gzbuffer() returns 0 on success, or -1 on failure, such as being called
+ too late.
*/
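   A sketch of the required call order, with a hypothetical helper: gzbuffer()
   must come between the open and the first read or write.

    #include "zlib.h"

    static gzFile open_for_fast_reading(const char *path)
    {
        gzFile gz = gzopen(path, "rb");
        if (gz != NULL)
            (void)gzbuffer(gz, 128 * 1024);    /* larger buffers speed up reads */
        return gz;
    }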
ZEXTERN int ZEXPORT gzsetparams OF((gzFile file, int level, int strategy));
/*
- Dynamically update the compression level or strategy. See the description
+ Dynamically update the compression level or strategy. See the description
of deflateInit2 for the meaning of these parameters.
+
gzsetparams returns Z_OK if success, or Z_STREAM_ERROR if the file was not
opened for writing.
*/
-ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
+ZEXTERN int ZEXPORT gzread OF((gzFile file, voidp buf, unsigned len));
/*
- Reads the given number of uncompressed bytes from the compressed file.
- If the input file was not in gzip format, gzread copies the given number
- of bytes into the buffer.
- gzread returns the number of uncompressed bytes actually read (0 for
- end of file, -1 for error). */
+ Reads the given number of uncompressed bytes from the compressed file. If
+ the input file is not in gzip format, gzread copies the given number of
+ bytes into the buffer directly from the file.
+
+ After reaching the end of a gzip stream in the input, gzread will continue
+ to read, looking for another gzip stream. Any number of gzip streams may be
+ concatenated in the input file, and will all be decompressed by gzread().
+ If something other than a gzip stream is encountered after a gzip stream,
+ that remaining trailing garbage is ignored (and no error is returned).
+
+ gzread can be used to read a gzip file that is being concurrently written.
+ Upon reaching the end of the input, gzread will return with the available
+ data. If the error code returned by gzerror is Z_OK or Z_BUF_ERROR, then
+ gzclearerr can be used to clear the end of file indicator in order to permit
+ gzread to be tried again. Z_OK indicates that a gzip stream was completed
+ on the last gzread. Z_BUF_ERROR indicates that the input file ended in the
+ middle of a gzip stream. Note that gzread does not return -1 in the event
+ of an incomplete gzip stream. This error is deferred until gzclose(), which
+ will return Z_BUF_ERROR if the last gzread ended in the middle of a gzip
+ stream. Alternatively, gzerror can be used before gzclose to detect this
+ case.
+
+ gzread returns the number of uncompressed bytes actually read, less than
+ len for end of file, or -1 for error.
+*/
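   A sketch of a read loop along these lines, with hypothetical helper names:
   a zero return is treated as end of file, and gzerror() is consulted on a
   negative return.

    #include <stdio.h>
    #include "zlib.h"

    static int copy_out(gzFile gz, FILE *dst)
    {
        unsigned char buf[16384];
        int n;

        while ((n = gzread(gz, buf, sizeof(buf))) > 0)
            if (fwrite(buf, 1, (size_t)n, dst) != (size_t)n)
                return -1;
        if (n < 0) {
            int err;
            (void)gzerror(gz, &err);           /* err is the zlib error code */
            return -1;
        }
        return 0;                              /* n == 0: end of file */
    }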
-ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
- voidpc buf, unsigned len));
+ZEXTERN int ZEXPORT gzwrite OF((gzFile file,
+ voidpc buf, unsigned len));
/*
Writes the given number of uncompressed bytes into the compressed file.
- gzwrite returns the number of uncompressed bytes actually written
- (0 in case of error).
+ gzwrite returns the number of uncompressed bytes written or 0 in case of
+ error.
*/
-ZEXTERN int ZEXPORTVA gzprintf OF((gzFile file, const char *format, ...));
+ZEXTERN int ZEXPORTVA gzprintf Z_ARG((gzFile file, const char *format, ...));
/*
- Converts, formats, and writes the args to the compressed file under
- control of the format string, as in fprintf. gzprintf returns the number of
- uncompressed bytes actually written (0 in case of error). The number of
- uncompressed bytes written is limited to 4095. The caller should assure that
- this limit is not exceeded. If it is exceeded, then gzprintf() will return
- return an error (0) with nothing written. In this case, there may also be a
- buffer overflow with unpredictable consequences, which is possible only if
- zlib was compiled with the insecure functions sprintf() or vsprintf()
- because the secure snprintf() or vsnprintf() functions were not available.
+ Converts, formats, and writes the arguments to the compressed file under
+ control of the format string, as in fprintf. gzprintf returns the number of
+ uncompressed bytes actually written, or 0 in case of error. The number of
+ uncompressed bytes written is limited to 8191, or one less than the buffer
+ size given to gzbuffer(). The caller should assure that this limit is not
+ exceeded. If it is exceeded, then gzprintf() will return an error (0) with
+ nothing written. In this case, there may also be a buffer overflow with
+ unpredictable consequences, which is possible only if zlib was compiled with
+ the insecure functions sprintf() or vsprintf() because the secure snprintf()
+ or vsnprintf() functions were not available. This can be determined using
+ zlibCompileFlags().
*/
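   A short sketch combining gzwrite() and gzprintf(); the record layout and
   names are hypothetical, and the formatted line stays far below the
   8191-byte limit mentioned above.

    #include "zlib.h"

    static int write_record(gzFile gz, const void *record, unsigned len,
                            const char *name)
    {
        if (gzwrite(gz, record, len) != (int)len)
            return -1;                         /* 0 or a short count means error */
        if (gzprintf(gz, "name=%s len=%u\n", name, len) <= 0)
            return -1;
        return 0;
    }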
ZEXTERN int ZEXPORT gzputs OF((gzFile file, const char *s));
/*
- Writes the given null-terminated string to the compressed file, excluding
+ Writes the given null-terminated string to the compressed file, excluding
the terminating null character.
- gzputs returns the number of characters written, or -1 in case of error.
+
+ gzputs returns the number of characters written, or -1 in case of error.
*/
ZEXTERN char * ZEXPORT gzgets OF((gzFile file, char *buf, int len));
/*
- Reads bytes from the compressed file until len-1 characters are read, or
- a newline character is read and transferred to buf, or an end-of-file
- condition is encountered. The string is then terminated with a null
- character.
- gzgets returns buf, or Z_NULL in case of error.
+ Reads bytes from the compressed file until len-1 characters are read, or a
+ newline character is read and transferred to buf, or an end-of-file
+ condition is encountered. If any characters are read or if len == 1, the
+ string is terminated with a null character. If no characters are read due
+ to an end-of-file or len < 1, then the buffer is left untouched.
+
+ gzgets returns buf which is a null-terminated string, or it returns NULL
+ for end-of-file or in case of error. If there was an error, the contents at
+ buf are indeterminate.
*/
-ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
+ZEXTERN int ZEXPORT gzputc OF((gzFile file, int c));
/*
- Writes c, converted to an unsigned char, into the compressed file.
- gzputc returns the value that was written, or -1 in case of error.
+ Writes c, converted to an unsigned char, into the compressed file. gzputc
+ returns the value that was written, or -1 in case of error.
*/
-ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
+ZEXTERN int ZEXPORT gzgetc OF((gzFile file));
/*
- Reads one byte from the compressed file. gzgetc returns this byte
- or -1 in case of end of file or error.
+ Reads one byte from the compressed file. gzgetc returns this byte or -1
+ in case of end of file or error. This is implemented as a macro for speed.
+ As such, it does not do all of the checking the other functions do. I.e.
+ it does not check to see if file is NULL, nor whether the structure file
+ points to has been clobbered or not.
*/
-ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
+ZEXTERN int ZEXPORT gzungetc OF((int c, gzFile file));
/*
- Push one character back onto the stream to be read again later.
- Only one character of push-back is allowed. gzungetc() returns the
- character pushed, or -1 on failure. gzungetc() will fail if a
- character has been pushed but not read yet, or if c is -1. The pushed
- character will be discarded if the stream is repositioned with gzseek()
- or gzrewind().
+ Push one character back onto the stream to be read as the first character
+ on the next read. At least one character of push-back is allowed.
+ gzungetc() returns the character pushed, or -1 on failure. gzungetc() will
+ fail if c is -1, and may fail if a character has been pushed but not read
+ yet. If gzungetc is used immediately after gzopen or gzdopen, at least the
+ output buffer size of pushed characters is allowed. (See gzbuffer above.)
+ The pushed character will be discarded if the stream is repositioned with
+ gzseek() or gzrewind().
*/
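   A sketch of the single-character push-back guarantee, using a hypothetical
   peek helper.

    #include "zlib.h"

    static int gz_peek(gzFile gz)
    {
        int c = gzgetc(gz);
        if (c != -1)
            c = gzungetc(c, gz);               /* returns c, or -1 on failure */
        return c;
    }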
-ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
+ZEXTERN int ZEXPORT gzflush OF((gzFile file, int flush));
/*
- Flushes all pending output into the compressed file. The parameter
- flush is as in the deflate() function. The return value is the zlib
- error number (see function gzerror below). gzflush returns Z_OK if
- the flush parameter is Z_FINISH and all output could be flushed.
- gzflush should be called only when strictly necessary because it can
- degrade compression.
+ Flushes all pending output into the compressed file. The parameter flush
+ is as in the deflate() function. The return value is the zlib error number
+ (see function gzerror below). gzflush is only permitted when writing.
+
+ If the flush parameter is Z_FINISH, the remaining data is written and the
+ gzip stream is completed in the output. If gzwrite() is called again, a new
+ gzip stream will be started in the output. gzread() is able to read such
+ concatenated gzip streams.
+
+ gzflush should be called only when strictly necessary because it will
+ degrade compression if called too often.
*/
-ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
- z_off_t offset, int whence));
/*
- Sets the starting position for the next gzread or gzwrite on the
- given compressed file. The offset represents a number of bytes in the
- uncompressed data stream. The whence parameter is defined as in lseek(2);
+ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile file,
+ z_off_t offset, int whence));
+
+ Sets the starting position for the next gzread or gzwrite on the given
+ compressed file. The offset represents a number of bytes in the
+ uncompressed data stream. The whence parameter is defined as in lseek(2);
the value SEEK_END is not supported.
+
If the file is opened for reading, this function is emulated but can be
- extremely slow. If the file is opened for writing, only forward seeks are
+ extremely slow. If the file is opened for writing, only forward seeks are
supported; gzseek then compresses a sequence of zeroes up to the new
starting position.
- gzseek returns the resulting offset location as measured in bytes from
+ gzseek returns the resulting offset location as measured in bytes from
the beginning of the uncompressed stream, or -1 in case of error, in
particular if the file is opened for writing and the new starting position
would be before the current position.
@@ -1204,68 +1448,134 @@ ZEXTERN int ZEXPORT gzrewind OF((gzFile file));
/*
Rewinds the given file. This function is supported only for reading.
- gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
+ gzrewind(file) is equivalent to (int)gzseek(file, 0L, SEEK_SET)
*/
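   A sketch of seeking within the uncompressed data, with a hypothetical
   helper; when the file is open for reading, the seek is emulated by
   decompressing and discarding data up to the requested offset.

    #include <stdio.h>
    #include "zlib.h"

    static int skip_uncompressed_prefix(gzFile gz, z_off_t skip)
    {
        return gzseek(gz, skip, SEEK_SET) == skip ? 0 : -1;
    }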
+/*
ZEXTERN z_off_t ZEXPORT gztell OF((gzFile file));
+
+ Returns the starting position for the next gzread or gzwrite on the given
+ compressed file. This position represents a number of bytes in the
+ uncompressed data stream, and is zero when starting, even if appending or
+ reading a gzip stream from the middle of a file using gzdopen().
+
+ gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+*/
+
/*
- Returns the starting position for the next gzread or gzwrite on the
- given compressed file. This position represents a number of bytes in the
- uncompressed data stream.
+ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile file));
- gztell(file) is equivalent to gzseek(file, 0L, SEEK_CUR)
+ Returns the current offset in the file being read or written. This offset
+ includes the count of bytes that precede the gzip stream, for example when
+ appending or when using gzdopen() for reading. When reading, the offset
+ does not include as yet unused buffered input. This information can be used
+ for a progress indicator. On error, gzoffset() returns -1.
*/
ZEXTERN int ZEXPORT gzeof OF((gzFile file));
/*
- Returns 1 when EOF has previously been detected reading the given
- input stream, otherwise zero.
+ Returns true (1) if the end-of-file indicator has been set while reading,
+ false (0) otherwise. Note that the end-of-file indicator is set only if the
+ read tried to go past the end of the input, but came up short. Therefore,
+ just like feof(), gzeof() may return false even if there is no more data to
+ read, in the event that the last read request was for the exact number of
+ bytes remaining in the input file. This will happen if the input file size
+ is an exact multiple of the buffer size.
+
+ If gzeof() returns true, then the read functions will return no more data,
+ unless the end-of-file indicator is reset by gzclearerr() and the input file
+ has grown since the previous end of file was detected.
*/
ZEXTERN int ZEXPORT gzdirect OF((gzFile file));
/*
- Returns 1 if file is being read directly without decompression, otherwise
- zero.
+ Returns true (1) if file is being copied directly while reading, or false
+ (0) if file is a gzip stream being decompressed.
+
+ If the input file is empty, gzdirect() will return true, since the input
+ does not contain a gzip stream.
+
+ If gzdirect() is used immediately after gzopen() or gzdopen() it will
+ cause buffers to be allocated to allow reading the file to determine if it
+ is a gzip file. Therefore if gzbuffer() is used, it should be called before
+ gzdirect().
+
+ When writing, gzdirect() returns true (1) if transparent writing was
+ requested ("wT" for the gzopen() mode), or false (0) otherwise. (Note:
+ gzdirect() is not needed when writing. Transparent writing must be
+ explicitly requested, so the application already knows the answer. When
+ linking statically, using gzdirect() will include all of the zlib code for
+ gzip file reading and decompression, which may not be desired.)
*/
ZEXTERN int ZEXPORT gzclose OF((gzFile file));
/*
- Flushes all pending output if necessary, closes the compressed file
- and deallocates all the (de)compression state. The return value is the zlib
- error number (see function gzerror below).
+ Flushes all pending output if necessary, closes the compressed file and
+ deallocates the (de)compression state. Note that once file is closed, you
+ cannot call gzerror with file, since its structures have been deallocated.
+ gzclose must not be called more than once on the same file, just as free
+ must not be called more than once on the same allocation.
+
+ gzclose will return Z_STREAM_ERROR if file is not valid, Z_ERRNO on a
+ file operation error, Z_MEM_ERROR if out of memory, Z_BUF_ERROR if the
+ last read ended in the middle of a gzip stream, or Z_OK on success.
+*/
+
+ZEXTERN int ZEXPORT gzclose_r OF((gzFile file));
+ZEXTERN int ZEXPORT gzclose_w OF((gzFile file));
+/*
+ Same as gzclose(), but gzclose_r() is only for use when reading, and
+ gzclose_w() is only for use when writing or appending. The advantage to
+ using these instead of gzclose() is that they avoid linking in zlib
+ compression or decompression code that is not used when only reading or only
+ writing respectively. If gzclose() is used, then both compression and
+ decompression code will be included in the application when linking to a static
+ zlib library.
*/
ZEXTERN const char * ZEXPORT gzerror OF((gzFile file, int *errnum));
/*
- Returns the error message for the last error which occurred on the
- given compressed file. errnum is set to zlib error number. If an
- error occurred in the file system and not in the compression library,
- errnum is set to Z_ERRNO and the application may consult errno
- to get the exact error code.
+ Returns the error message for the last error which occurred on the given
+ compressed file. errnum is set to zlib error number. If an error occurred
+ in the file system and not in the compression library, errnum is set to
+ Z_ERRNO and the application may consult errno to get the exact error code.
+
+ The application must not modify the returned string. Future calls to
+ this function may invalidate the previously returned string. If file is
+ closed, then the string previously returned by gzerror will no longer be
+ available.
+
+ gzerror() should be used to distinguish errors from end-of-file for those
+ functions above that do not distinguish those cases in their return values.
*/
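   A sketch of the pattern suggested above, with a hypothetical helper: after
   gzgets() returns NULL, gzerror() separates a real error from plain end of
   file.

    #include <stdio.h>
    #include "zlib.h"

    static int read_line_or_eof(gzFile gz, char *line, int len)
    {
        int err;
        const char *msg;

        if (gzgets(gz, line, len) != NULL)
            return 1;                          /* a line was read */
        msg = gzerror(gz, &err);
        if (err == Z_OK || err == Z_BUF_ERROR)
            return 0;                          /* end of file (possibly a short stream) */
        fprintf(stderr, "gz read error: %s\n", msg);
        return -1;
    }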
ZEXTERN void ZEXPORT gzclearerr OF((gzFile file));
/*
- Clears the error and end-of-file flags for file. This is analogous to the
- clearerr() function in stdio. This is useful for continuing to read a gzip
+ Clears the error and end-of-file flags for file. This is analogous to the
+ clearerr() function in stdio. This is useful for continuing to read a gzip
file that is being written concurrently.
*/
+#endif /* !Z_SOLO */
+
/* checksum functions */
/*
These functions are not related to compression but are exported
- anyway because they might be useful in applications using the
- compression library.
+ anyway because they might be useful in applications using the compression
+ library.
*/
ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
/*
Update a running Adler-32 checksum with the bytes buf[0..len-1] and
- return the updated checksum. If buf is NULL, this function returns
- the required initial value for the checksum.
- An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
- much faster. Usage example:
+ return the updated checksum. If buf is Z_NULL, this function returns the
+ required initial value for the checksum.
+
+ An Adler-32 checksum is almost as reliable as a CRC32 but can be computed
+ much faster.
+
+ Usage example:
uLong adler = adler32(0L, Z_NULL, 0);
@@ -1275,21 +1585,25 @@ ZEXTERN uLong ZEXPORT adler32 OF((uLong adler, const Bytef *buf, uInt len));
if (adler != original_adler) error();
*/
+/*
ZEXTERN uLong ZEXPORT adler32_combine OF((uLong adler1, uLong adler2,
z_off_t len2));
-/*
+
Combine two Adler-32 checksums into one. For two sequences of bytes, seq1
and seq2 with lengths len1 and len2, Adler-32 checksums were calculated for
each, adler1 and adler2. adler32_combine() returns the Adler-32 checksum of
- seq1 and seq2 concatenated, requiring only adler1, adler2, and len2.
+ seq1 and seq2 concatenated, requiring only adler1, adler2, and len2. Note
+ that the z_off_t type (like off_t) is a signed integer. If len2 is
+ negative, the result has no meaning or utility.
*/
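   A sketch of combining checksums computed over two independent pieces; the
   helper name is hypothetical.

    #include "zlib.h"

    static uLong adler_of_both(const Bytef *p1, uInt len1,
                               const Bytef *p2, uInt len2)
    {
        uLong a1 = adler32(adler32(0L, Z_NULL, 0), p1, len1);
        uLong a2 = adler32(adler32(0L, Z_NULL, 0), p2, len2);
        /* equals adler32 computed over p1 followed immediately by p2 */
        return adler32_combine(a1, a2, (z_off_t)len2);
    }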
ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
/*
Update a running CRC-32 with the bytes buf[0..len-1] and return the
- updated CRC-32. If buf is NULL, this function returns the required initial
- value for the for the crc. Pre- and post-conditioning (one's complement) is
+ updated CRC-32. If buf is Z_NULL, this function returns the required
+ initial value for the crc. Pre- and post-conditioning (one's complement) is
performed within this function so it shouldn't be done by the application.
+
Usage example:
uLong crc = crc32(0L, Z_NULL, 0);
@@ -1300,9 +1614,9 @@ ZEXTERN uLong ZEXPORT crc32 OF((uLong crc, const Bytef *buf, uInt len));
if (crc != original_crc) error();
*/
+/*
ZEXTERN uLong ZEXPORT crc32_combine OF((uLong crc1, uLong crc2, z_off_t len2));
-/*
Combine two CRC-32 check values into one. For two sequences of bytes,
seq1 and seq2 with lengths len1 and len2, CRC-32 check values were
calculated for each, crc1 and crc2. crc32_combine() returns the CRC-32
@@ -1331,26 +1645,121 @@ ZEXTERN int ZEXPORT inflateBackInit_ OF((z_streamp strm, int windowBits,
const char *version,
int stream_size));
#define deflateInit(strm, level) \
- deflateInit_((strm), (level), ZLIB_VERSION, sizeof(z_stream))
+ deflateInit_((strm), (level), ZLIB_VERSION, (int)sizeof(z_stream))
#define inflateInit(strm) \
- inflateInit_((strm), ZLIB_VERSION, sizeof(z_stream))
+ inflateInit_((strm), ZLIB_VERSION, (int)sizeof(z_stream))
#define deflateInit2(strm, level, method, windowBits, memLevel, strategy) \
deflateInit2_((strm),(level),(method),(windowBits),(memLevel),\
- (strategy), ZLIB_VERSION, sizeof(z_stream))
+ (strategy), ZLIB_VERSION, (int)sizeof(z_stream))
#define inflateInit2(strm, windowBits) \
- inflateInit2_((strm), (windowBits), ZLIB_VERSION, sizeof(z_stream))
+ inflateInit2_((strm), (windowBits), ZLIB_VERSION, \
+ (int)sizeof(z_stream))
#define inflateBackInit(strm, windowBits, window) \
inflateBackInit_((strm), (windowBits), (window), \
- ZLIB_VERSION, sizeof(z_stream))
+ ZLIB_VERSION, (int)sizeof(z_stream))
+
+#ifndef Z_SOLO
+
+/* gzgetc() macro and its supporting function and exposed data structure. Note
+ * that the real internal state is much larger than the exposed structure.
+ * This abbreviated structure exposes just enough for the gzgetc() macro. The
+ * user should not mess with these exposed elements, since their names or
+ * behavior could change in the future, perhaps even capriciously. They can
+ * only be used by the gzgetc() macro. You have been warned.
+ */
+struct gzFile_s {
+ unsigned have;
+ unsigned char *next;
+ z_off64_t pos;
+};
+ZEXTERN int ZEXPORT gzgetc_ OF((gzFile file)); /* backward compatibility */
+#ifdef Z_PREFIX_SET
+# undef z_gzgetc
+# define z_gzgetc(g) \
+ ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g))
+#else
+# define gzgetc(g) \
+ ((g)->have ? ((g)->have--, (g)->pos++, *((g)->next)++) : gzgetc(g))
+#endif
+/* provide 64-bit offset functions if _LARGEFILE64_SOURCE defined, and/or
+ * change the regular functions to 64 bits if _FILE_OFFSET_BITS is 64 (if
+ * both are true, the application gets the *64 functions, and the regular
+ * functions are changed to 64 bits) -- in case these are set on systems
+ * without large file support, _LFS64_LARGEFILE must also be true
+ */
+#ifdef Z_LARGE64
+ ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+ ZEXTERN z_off64_t ZEXPORT gzseek64 OF((gzFile, z_off64_t, int));
+ ZEXTERN z_off64_t ZEXPORT gztell64 OF((gzFile));
+ ZEXTERN z_off64_t ZEXPORT gzoffset64 OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off64_t));
+ ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off64_t));
+#endif
+
+#if !defined(ZLIB_INTERNAL) && defined(Z_WANT64)
+# ifdef Z_PREFIX_SET
+# define z_gzopen z_gzopen64
+# define z_gzseek z_gzseek64
+# define z_gztell z_gztell64
+# define z_gzoffset z_gzoffset64
+# define z_adler32_combine z_adler32_combine64
+# define z_crc32_combine z_crc32_combine64
+# else
+# define gzopen gzopen64
+# define gzseek gzseek64
+# define gztell gztell64
+# define gzoffset gzoffset64
+# define adler32_combine adler32_combine64
+# define crc32_combine crc32_combine64
+# endif
+# ifndef Z_LARGE64
+ ZEXTERN gzFile ZEXPORT gzopen64 OF((const char *, const char *));
+ ZEXTERN z_off_t ZEXPORT gzseek64 OF((gzFile, z_off_t, int));
+ ZEXTERN z_off_t ZEXPORT gztell64 OF((gzFile));
+ ZEXTERN z_off_t ZEXPORT gzoffset64 OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
+# endif
+#else
+ ZEXTERN gzFile ZEXPORT gzopen OF((const char *, const char *));
+ ZEXTERN z_off_t ZEXPORT gzseek OF((gzFile, z_off_t, int));
+ ZEXTERN z_off_t ZEXPORT gztell OF((gzFile));
+ ZEXTERN z_off_t ZEXPORT gzoffset OF((gzFile));
+ ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
+#endif
+
+#else /* Z_SOLO */
+ ZEXTERN uLong ZEXPORT adler32_combine OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine OF((uLong, uLong, z_off_t));
+
+#endif /* !Z_SOLO */
+
+/* hack for buggy compilers */
#if !defined(ZUTIL_H) && !defined(NO_DUMMY_DECL)
- struct internal_state {int dummy;}; /* hack for buggy compilers */
+ struct internal_state {int dummy;};
#endif
+/* undocumented functions */
ZEXTERN const char * ZEXPORT zError OF((int));
-ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp z));
-ZEXTERN const uLongf * ZEXPORT get_crc_table OF((void));
+ZEXTERN int ZEXPORT inflateSyncPoint OF((z_streamp));
+ZEXTERN const z_crc_t FAR * ZEXPORT get_crc_table OF((void));
+ZEXTERN int ZEXPORT inflateUndermine OF((z_streamp, int));
+ZEXTERN int ZEXPORT inflateResetKeep OF((z_streamp));
+ZEXTERN int ZEXPORT deflateResetKeep OF((z_streamp));
+#if defined(_WIN32) && !defined(Z_SOLO)
+ZEXTERN gzFile ZEXPORT gzopen_w OF((const wchar_t *path,
+ const char *mode));
+#endif
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
+# ifndef Z_SOLO
+ZEXTERN int ZEXPORTVA gzvprintf Z_ARG((gzFile file,
+ const char *format,
+ va_list va));
+# endif
+#endif
#ifdef __cplusplus
}
diff --git a/erts/emulator/zlib/zutil.c b/erts/emulator/zlib/zutil.c
index fa5b43126a..27a8af4a2b 100644
--- a/erts/emulator/zlib/zutil.c
+++ b/erts/emulator/zlib/zutil.c
@@ -1,22 +1,23 @@
/* zutil.c -- target dependent utility functions for the compression library
- * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * Copyright (C) 1995-2005, 2010, 2011, 2012 Jean-loup Gailly.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* @(#) $Id$ */
#ifdef HAVE_CONFIG_H
# include "config.h"
#endif
#include "zutil.h"
+#ifndef Z_SOLO
+# include "gzguts.h"
+#endif
#ifndef NO_DUMMY_DECL
struct internal_state {int dummy;}; /* for buggy compilers */
#endif
-const char * const z_errmsg[10] = {
+z_const char * const z_errmsg[10] = {
"need dictionary", /* Z_NEED_DICT 2 */
"stream end", /* Z_STREAM_END 1 */
"", /* Z_OK 0 */
@@ -39,25 +40,25 @@ uLong ZEXPORT zlibCompileFlags()
uLong flags;
flags = 0;
- switch (sizeof(uInt)) {
+ switch ((int)(sizeof(uInt))) {
case 2: break;
case 4: flags += 1; break;
case 8: flags += 2; break;
default: flags += 3;
}
- switch (sizeof(uLong)) {
+ switch ((int)(sizeof(uLong))) {
case 2: break;
case 4: flags += 1 << 2; break;
case 8: flags += 2 << 2; break;
default: flags += 3 << 2;
}
- switch (sizeof(voidpf)) {
+ switch ((int)(sizeof(voidpf))) {
case 2: break;
case 4: flags += 1 << 4; break;
case 8: flags += 2 << 4; break;
default: flags += 3 << 4;
}
- switch (sizeof(z_off_t)) {
+ switch ((int)(sizeof(z_off_t))) {
case 2: break;
case 4: flags += 1 << 6; break;
case 8: flags += 2 << 6; break;
@@ -90,27 +91,27 @@ uLong ZEXPORT zlibCompileFlags()
#ifdef FASTEST
flags += 1L << 21;
#endif
-#ifdef STDC
+#if defined(STDC) || defined(Z_HAVE_STDARG_H)
# ifdef NO_vsnprintf
- flags += 1L << 25;
+ flags += 1L << 25;
# ifdef HAS_vsprintf_void
- flags += 1L << 26;
+ flags += 1L << 26;
# endif
# else
# ifdef HAS_vsnprintf_void
- flags += 1L << 26;
+ flags += 1L << 26;
# endif
# endif
#else
- flags += 1L << 24;
+ flags += 1L << 24;
# ifdef NO_snprintf
- flags += 1L << 25;
+ flags += 1L << 25;
# ifdef HAS_sprintf_void
- flags += 1L << 26;
+ flags += 1L << 26;
# endif
# else
# ifdef HAS_snprintf_void
- flags += 1L << 26;
+ flags += 1L << 26;
# endif
# endif
#endif
@@ -122,9 +123,9 @@ uLong ZEXPORT zlibCompileFlags()
# ifndef verbose
# define verbose 0
# endif
-int z_verbose = verbose;
+int ZLIB_INTERNAL z_verbose = verbose;
-void z_error (m)
+void ZLIB_INTERNAL z_error (m)
char *m;
{
fprintf(stderr, "%s\n", m);
@@ -151,7 +152,7 @@ const char * ZEXPORT zError(err)
#ifndef HAVE_MEMCPY
-void zmemcpy(dest, source, len)
+void ZLIB_INTERNAL zmemcpy(dest, source, len)
Bytef* dest;
const Bytef* source;
uInt len;
@@ -162,7 +163,7 @@ void zmemcpy(dest, source, len)
} while (--len != 0);
}
-int zmemcmp(s1, s2, len)
+int ZLIB_INTERNAL zmemcmp(s1, s2, len)
const Bytef* s1;
const Bytef* s2;
uInt len;
@@ -175,7 +176,7 @@ int zmemcmp(s1, s2, len)
return 0;
}
-void zmemzero(dest, len)
+void ZLIB_INTERNAL zmemzero(dest, len)
Bytef* dest;
uInt len;
{
@@ -186,6 +187,7 @@ void zmemzero(dest, len)
}
#endif
+#ifndef Z_SOLO
#ifdef SYS16BIT
@@ -218,7 +220,7 @@ local ptr_table table[MAX_PTR];
* a protected system like OS/2. Use Microsoft C instead.
*/
-voidpf zcalloc (voidpf opaque, unsigned items, unsigned size)
+voidpf ZLIB_INTERNAL zcalloc (voidpf opaque, unsigned items, unsigned size)
{
voidpf buf = opaque; /* just to make some compilers happy */
ulg bsize = (ulg)items*size;
@@ -242,7 +244,7 @@ voidpf zcalloc (voidpf opaque, unsigned items, unsigned size)
return buf;
}
-void zcfree (voidpf opaque, voidpf ptr)
+void ZLIB_INTERNAL zcfree (voidpf opaque, voidpf ptr)
{
int n;
if (*(ush*)&ptr != 0) { /* object < 64K */
@@ -277,13 +279,13 @@ void zcfree (voidpf opaque, voidpf ptr)
# define _hfree hfree
#endif
-voidpf zcalloc (voidpf opaque, unsigned items, unsigned size)
+voidpf ZLIB_INTERNAL zcalloc (voidpf opaque, uInt items, uInt size)
{
if (opaque) opaque = 0; /* to make compiler happy */
return _halloc((long)items, size);
}
-void zcfree (voidpf opaque, voidpf ptr)
+void ZLIB_INTERNAL zcfree (voidpf opaque, voidpf ptr)
{
if (opaque) opaque = 0; /* to make compiler happy */
_hfree(ptr);
@@ -302,26 +304,24 @@ extern voidp calloc OF((uInt items, uInt size));
extern void free OF((voidpf ptr));
#endif
-extern void* sys_alloc(unsigned);
-extern void* sys_free(void *);
-
-voidpf zcalloc (opaque, items, size)
+voidpf ZLIB_INTERNAL zcalloc (opaque, items, size)
voidpf opaque;
unsigned items;
unsigned size;
{
- unsigned sz = items * size;
- voidpf* ptr = (voidpf) sys_alloc(sz);
if (opaque) items += size - size; /* make compiler happy */
- return ptr;
+ return sizeof(uInt) > 2 ? (voidpf)malloc(items * size) :
+ (voidpf)calloc(items, size);
}
-void zcfree (opaque, ptr)
+void ZLIB_INTERNAL zcfree (opaque, ptr)
voidpf opaque;
voidpf ptr;
{
- sys_free(ptr);
+ free(ptr);
if (opaque) return; /* make compiler happy */
}
#endif /* MY_ZCALLOC */
+
+#endif /* !Z_SOLO */
diff --git a/erts/emulator/zlib/zutil.h b/erts/emulator/zlib/zutil.h
index a8872e1c88..24ab06b1cf 100644
--- a/erts/emulator/zlib/zutil.h
+++ b/erts/emulator/zlib/zutil.h
@@ -1,10 +1,8 @@
/* zutil.h -- internal interface and configuration of the compression library
- * Copyright (C) 1995-2005 Jean-loup Gailly.
+ * Copyright (C) 1995-2013 Jean-loup Gailly.
* For conditions of distribution and use, see copyright notice in zlib.h
*/
-/* %ExternalCopyright% */
-
/* WARNING: this file should *not* be used by applications. It is
part of the implementation of the compression library and is
subject to change. Applications should only use zlib.h.
@@ -15,30 +13,24 @@
#ifndef ZUTIL_H
#define ZUTIL_H
-#define ZLIB_INTERNAL
+#ifdef HAVE_HIDDEN
+# define ZLIB_INTERNAL __attribute__((visibility ("hidden")))
+#else
+# define ZLIB_INTERNAL
+#endif
+
#include "zlib.h"
-#ifdef STDC
-# ifndef _WIN32_WCE
+#if defined(STDC) && !defined(Z_SOLO)
+# if !(defined(_WIN32_WCE) && defined(_MSC_VER))
# include <stddef.h>
# endif
# include <string.h>
# include <stdlib.h>
#endif
-#ifdef NO_ERRNO_H
-# ifdef _WIN32_WCE
- /* The Microsoft C Run-Time Library for Windows CE doesn't have
- * errno. We define it as a global variable to simplify porting.
- * Its value is always 0 and should not be used. We rename it to
- * avoid conflict with other libraries that use the same workaround.
- */
-# define errno z_errno
-# endif
- extern int errno;
-#else
-# ifndef _WIN32_WCE
-# include <errno.h>
-# endif
+
+#ifdef Z_SOLO
+ typedef long ptrdiff_t; /* guess -- will be caught if guess is wrong */
#endif
#ifndef local
@@ -52,13 +44,13 @@ typedef unsigned short ush;
typedef ush FAR ushf;
typedef unsigned long ulg;
-extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
+extern z_const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
/* (size given to avoid silly warnings with Visual C++) */
#define ERR_MSG(err) z_errmsg[Z_NEED_DICT-(err)]
#define ERR_RETURN(strm,err) \
- return (strm->msg = (char*)ERR_MSG(err), (err))
+ return (strm->msg = ERR_MSG(err), (err))
/* To be used only when the state is known to be valid */
/* common constants */
@@ -90,16 +82,18 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
#if defined(MSDOS) || (defined(WINDOWS) && !defined(WIN32))
# define OS_CODE 0x00
-# if defined(__TURBOC__) || defined(__BORLANDC__)
-# if(__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__))
- /* Allow compilation with ANSI keywords only enabled */
- void _Cdecl farfree( void *block );
- void *_Cdecl farmalloc( unsigned long nbytes );
-# else
-# include <alloc.h>
+# ifndef Z_SOLO
+# if defined(__TURBOC__) || defined(__BORLANDC__)
+# if (__STDC__ == 1) && (defined(__LARGE__) || defined(__COMPACT__))
+ /* Allow compilation with ANSI keywords only enabled */
+ void _Cdecl farfree( void *block );
+ void *_Cdecl farmalloc( unsigned long nbytes );
+# else
+# include <alloc.h>
+# endif
+# else /* MSC or DJGPP */
+# include <malloc.h>
# endif
-# else /* MSC or DJGPP */
-# include <malloc.h>
# endif
#endif
@@ -119,18 +113,20 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
#ifdef OS2
# define OS_CODE 0x06
-# ifdef M_I86
- #include <malloc.h>
+# if defined(M_I86) && !defined(Z_SOLO)
+# include <malloc.h>
# endif
#endif
#if defined(MACOS) || defined(TARGET_OS_MAC)
# define OS_CODE 0x07
-# if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os
-# include <unix.h> /* for fdopen */
-# else
-# ifndef fdopen
-# define fdopen(fd,mode) NULL /* No fdopen() */
+# ifndef Z_SOLO
+# if defined(__MWERKS__) && __dest_os != __be_os && __dest_os != __win32_os
+# include <unix.h> /* for fdopen */
+# else
+# ifndef fdopen
+# define fdopen(fd,mode) NULL /* No fdopen() */
+# endif
# endif
# endif
#endif
@@ -142,7 +138,6 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
#ifdef WIN32
# ifndef __CYGWIN__ /* Cygwin is Unix, not Win32 */
# define OS_CODE 0x0b
-# define F_OPEN(name, mode) _wfopen((WCHAR *)(name), (WCHAR *)(mode)) /* Unicode */
# endif
#endif
@@ -154,7 +149,7 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
# define fdopen(fd,mode) NULL /* No fdopen() */
#endif
-#if (defined(_MSC_VER) && (_MSC_VER > 600))
+#if (defined(_MSC_VER) && (_MSC_VER > 600)) && !defined __INTERIX
# if defined(_WIN32_WCE)
# define fdopen(fd,mode) NULL /* No fdopen() */
# ifndef _PTRDIFF_T_DEFINED
@@ -166,6 +161,19 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
# endif
#endif
+#if defined(__BORLANDC__) && !defined(MSDOS)
+ #pragma warn -8004
+ #pragma warn -8008
+ #pragma warn -8066
+#endif
+
+/* provide prototypes for these when building zlib without LFS */
+#if !defined(_WIN32) && \
+ (!defined(_LARGEFILE64_SOURCE) || _LFS64_LARGEFILE-0 == 0)
+ ZEXTERN uLong ZEXPORT adler32_combine64 OF((uLong, uLong, z_off_t));
+ ZEXTERN uLong ZEXPORT crc32_combine64 OF((uLong, uLong, z_off_t));
+#endif
+
/* common defaults */
#ifndef OS_CODE
@@ -178,40 +186,7 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
/* functions */
-#if defined(STDC99) || (defined(__TURBOC__) && __TURBOC__ >= 0x550)
-# ifndef HAVE_VSNPRINTF
-# define HAVE_VSNPRINTF
-# endif
-#endif
-#if defined(__CYGWIN__)
-# ifndef HAVE_VSNPRINTF
-# define HAVE_VSNPRINTF
-# endif
-#endif
-#ifndef HAVE_VSNPRINTF
-# ifdef MSDOS
- /* vsnprintf may exist on some MS-DOS compilers (DJGPP?),
- but for now we just assume it doesn't. */
-# define NO_vsnprintf
-# endif
-# ifdef __TURBOC__
-# define NO_vsnprintf
-# endif
-# ifdef WIN32
- /* In Win32, vsnprintf is available as the "non-ANSI" _vsnprintf. */
-# if !defined(vsnprintf) && !defined(NO_vsnprintf)
-# define vsnprintf _vsnprintf
-# endif
-# endif
-# ifdef __SASC
-# define NO_vsnprintf
-# endif
-#endif
-#ifdef VMS
-# define NO_vsnprintf
-#endif
-
-#if defined(pyr)
+#if defined(pyr) || defined(Z_SOLO)
# define NO_MEMCPY
#endif
#if defined(SMALL_MEDIUM) && !defined(_MSC_VER) && !defined(__SC__)
@@ -235,16 +210,16 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
# define zmemzero(dest, len) memset(dest, 0, len)
# endif
#else
- extern void zmemcpy OF((Bytef* dest, const Bytef* source, uInt len));
- extern int zmemcmp OF((const Bytef* s1, const Bytef* s2, uInt len));
- extern void zmemzero OF((Bytef* dest, uInt len));
+ void ZLIB_INTERNAL zmemcpy OF((Bytef* dest, const Bytef* source, uInt len));
+ int ZLIB_INTERNAL zmemcmp OF((const Bytef* s1, const Bytef* s2, uInt len));
+ void ZLIB_INTERNAL zmemzero OF((Bytef* dest, uInt len));
#endif
/* Diagnostic functions */
#ifdef DEBUG
# include <stdio.h>
- extern int z_verbose;
- extern void z_error OF((char *m));
+ extern int ZLIB_INTERNAL z_verbose;
+ extern void ZLIB_INTERNAL z_error OF((char *m));
# define Assert(cond,msg) {if(!(cond)) z_error(msg);}
# define Trace(x) {if (z_verbose>=0) fprintf x ;}
# define Tracev(x) {if (z_verbose>0) fprintf x ;}
@@ -260,13 +235,19 @@ extern const char * const z_errmsg[10]; /* indexed by 2-zlib_error */
# define Tracecv(c,x)
#endif
-
-voidpf zcalloc OF((voidpf opaque, unsigned items, unsigned size));
-void zcfree OF((voidpf opaque, voidpf ptr));
+#ifndef Z_SOLO
+ voidpf ZLIB_INTERNAL zcalloc OF((voidpf opaque, unsigned items,
+ unsigned size));
+ void ZLIB_INTERNAL zcfree OF((voidpf opaque, voidpf ptr));
+#endif
#define ZALLOC(strm, items, size) \
(*((strm)->zalloc))((strm)->opaque, (items), (size))
#define ZFREE(strm, addr) (*((strm)->zfree))((strm)->opaque, (voidpf)(addr))
#define TRY_FREE(s, p) {if (p) ZFREE(s, p);}
+/* Reverse the bytes in a 32-bit value */
+#define ZSWAP32(q) ((((q) >> 24) & 0xff) + (((q) >> 8) & 0xff00) + \
+ (((q) & 0xff00) << 8) + (((q) & 0xff) << 24))
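/* For illustration: ZSWAP32(0x01020304UL) evaluates to 0x04030201UL; the byte
 * order of a 32-bit value is reversed using only shifts, masks and adds, so
 * the result does not depend on host endianness.
 */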
+
#endif /* ZUTIL_H */