25 files changed, 2057 insertions, 747 deletions
diff --git a/erts/emulator/Makefile.in b/erts/emulator/Makefile.in
index afca3b85df..708d4ca0a3 100644
--- a/erts/emulator/Makefile.in
+++ b/erts/emulator/Makefile.in
@@ -739,7 +739,7 @@ RUN_OBJS = \
 	$(OBJDIR)/packet_parser.o	$(OBJDIR)/safe_hash.o \
 	$(OBJDIR)/erl_zlib.o		$(OBJDIR)/erl_nif.o \
 	$(OBJDIR)/erl_bif_binary.o      $(OBJDIR)/erl_ao_firstfit_alloc.o \
-	$(OBJDIR)/erl_sched_spec_pre_alloc.o
+	$(OBJDIR)/erl_thr_queue.o	$(OBJDIR)/erl_sched_spec_pre_alloc.o
 
 ifeq ($(TARGET),win32)
 DRV_OBJS = \
diff --git a/erts/emulator/beam/erl_alloc.c b/erts/emulator/beam/erl_alloc.c
index 705ace26fa..33d6cf5f2f 100644
--- a/erts/emulator/beam/erl_alloc.c
+++ b/erts/emulator/beam/erl_alloc.c
@@ -41,6 +41,7 @@
 #include "erl_monitors.h"
 #include "erl_bif_timer.h"
 #include "erl_cpu_topology.h"
+#include "erl_thr_queue.h"
 #if defined(ERTS_ALC_T_DRV_SEL_D_STATE) || defined(ERTS_ALC_T_DRV_EV_D_STATE)
 #include "erl_check_io.h"
 #endif
@@ -524,6 +525,10 @@ erts_alloc_init(int *argc, char **argv, ErtsAllocInitOpts *eaiop)
 	= sizeof(ErtsDrvSelectDataState);
     fix_type_sizes[ERTS_ALC_FIX_TYPE_IX(ERTS_ALC_T_MSG_REF)]
 	= sizeof(ErlMessage);
+#ifdef ERTS_SMP
+    fix_type_sizes[ERTS_ALC_FIX_TYPE_IX(ERTS_ALC_T_THR_Q_EL_SL)]
+	= sizeof(ErtsThrQElement_t);
+#endif
 #ifdef HARD_DEBUG
     hdbg_init();
 #endif
@@ -3070,10 +3075,10 @@ erts_request_alloc_info(struct process *c_p,
 
 #ifdef ERTS_SMP
     if (erts_no_schedulers > 1)
-	erts_smp_schedule_misc_aux_work(1,
-					erts_no_schedulers,
-					reply_alloc_info,
-					(void *) air);
+	erts_schedule_multi_misc_aux_work(1,
+					  erts_no_schedulers,
+					  reply_alloc_info,
+					  (void *) air);
 #endif
 
     reply_alloc_info((void *) air);
diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index 79d3433fc0..962db8b831 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -50,6 +50,15 @@
 #   command line argument to make_alloc_types. The variable X is false
 #   after a "+disable X" statement or if it has never been mentioned.
 
++if smp
++disable threads_no_smp
++else
++if threads
++enable threads_no_smp
++else
++disable threads_no_smp
++endif
++endif
 
 # --- Allocator declarations -------------------------------------------------
 #
@@ -192,7 +201,7 @@ type	LINEBUF		STANDARD	SYSTEM		line_buf
 type	IOQ		STANDARD	SYSTEM		io_queue
 type	BITS_BUF	STANDARD	SYSTEM		bits_buf
 type	TMP_DIST_BUF	TEMPORARY	SYSTEM		tmp_dist_buf
-type	ASYNC_Q		LONG_LIVED	SYSTEM		async_queue
+type	ASYNC_DATA	LONG_LIVED	SYSTEM		internal_async_data
 type	ESTACK		TEMPORARY	SYSTEM		estack
 type	PORT_CALL_BUF	TEMPORARY	SYSTEM		port_call_buf
 type	DB_TABLE	ETS		ETS		db_tab
@@ -253,6 +262,22 @@ type	EXT_TERM_DATA	SHORT_LIVED	PROCESSES	external_term_data
 type	ZLIB		STANDARD	SYSTEM		zlib
 type	CPU_GRPS_MAP	LONG_LIVED	SYSTEM		cpu_groups_map
 type	AUX_WORK_TMO	LONG_LIVED	SYSTEM		aux_work_timeouts
+type	MISC_AUX_WORK_Q	LONG_LIVED	SYSTEM		misc_aux_work_q
+
++if threads_no_smp
+# Need thread safe allocs, but std_alloc and fix_alloc are not;
+# use driver_alloc which is...
+type	THR_Q_EL	DRIVER 	   	SYSTEM		thr_q_element
+type	THR_Q_EL_SL	DRIVER		SYSTEM		sl_thr_q_element
+type	MISC_AUX_WORK	DRIVER		SYSTEM		misc_aux_work
++else
+type	THR_Q_EL	STANDARD   	SYSTEM		thr_q_element
+type	THR_Q_EL_SL	FIXED_SIZE	SYSTEM		sl_thr_q_element
+type	MISC_AUX_WORK	SHORT_LIVED	SYSTEM		misc_aux_work
++endif
+type	THR_Q		STANDARD	SYSTEM		thr_queue
+type	THR_Q_SL	SHORT_LIVED	SYSTEM		short_lived_thr_queue
+type	THR_Q_LL	LONG_LIVED	SYSTEM		long_lived_thr_queue
 
 +if smp
 type	ASYNC		SHORT_LIVED	SYSTEM		async
@@ -268,8 +293,6 @@ type	XPORTS_LIST	SHORT_LIVED	SYSTEM		extra_port_list
 type	PROC_LCK_WTR	LONG_LIVED	SYSTEM		proc_lock_waiter
 type	PROC_LCK_QS	LONG_LIVED	SYSTEM		proc_lock_queues
 type	RUNQ_BLNS	LONG_LIVED	SYSTEM		run_queue_balancing
-type	MISC_AUX_WORK_Q	LONG_LIVED	SYSTEM		misc_aux_work_q
-type	MISC_AUX_WORK	SHORT_LIVED	SYSTEM		misc_aux_work
 type	THR_PRGR_IDATA	LONG_LIVED	SYSTEM		thr_prgr_internal_data
 type	THR_PRGR_DATA	LONG_LIVED	SYSTEM		thr_prgr_data
 type	T_THR_PRGR_DATA	SHORT_LIVED	SYSTEM		temp_thr_prgr_data
@@ -285,12 +308,6 @@ type	ETHR_STD	STANDARD	SYSTEM		ethread_standard
 type	ETHR_SL		SHORT_LIVED	SYSTEM		ethread_short_lived
 type	ETHR_LL		LONG_LIVED	SYSTEM		ethread_long_lived
 
-+ifnot smp
-
-type	ARCALLBACK	LONG_LIVED	SYSTEM		async_ready_callback
-
-+endif
-
 +endif
 
 +if shared_heap
diff --git a/erts/emulator/beam/erl_async.c b/erts/emulator/beam/erl_async.c
index 91b64411d4..2dc7237f7c 100644
--- a/erts/emulator/beam/erl_async.c
+++ b/erts/emulator/beam/erl_async.c
@@ -24,10 +24,18 @@
 #include "erl_sys_driver.h"
 #include "global.h"
 #include "erl_threads.h"
+#include "erl_thr_queue.h"
+#include "erl_async.h"
+
+#define ERTS_MAX_ASYNC_READY_CALLS_IN_SEQ 20
+
+#define ERTS_ASYNC_PRINT_JOB 0
+
+#if !defined(ERTS_SMP) && defined(USE_THREADS) && !ERTS_USE_ASYNC_READY_Q
+#  error "Need async ready queue in non-smp case"
+#endif
 
 typedef struct _erl_async {
-    struct _erl_async* next;
-    struct _erl_async* prev;
     DE_Handle*         hndl;   /* The DE_Handle is needed when port is gone */
     Eterm              port;
     long               async_id;
@@ -35,345 +43,498 @@ typedef struct _erl_async {
     ErlDrvPDL          pdl;
     void (*async_invoke)(void*);
     void (*async_free)(void*);
-} ErlAsync;
+#if ERTS_USE_ASYNC_READY_Q
+    Uint               sched_id;
+    union {
+	ErtsThrQPrepEnQ_t *prep_enq;
+	ErtsThrQFinDeQ_t   fin_deq;
+    } q;
+#endif
+} ErtsAsync;
+
+#if ERTS_USE_ASYNC_READY_Q
+
+/*
+ * We can do without the enqueue mutex since it isn't needed for
+ * thread safety. Its only purpose is to put async threads to sleep
+ * during a blast of ready async jobs, in order to reduce
+ * contention on the enqueue end of the async ready queues. During
+ * such a blast without the enqueue mutex, much cpu time is consumed
+ * by the async threads without them making much progress, which in
+ * turn slows down the progress of scheduler threads.
+ */
+#define ERTS_USE_ASYNC_READY_ENQ_MTX 1
+
+#if ERTS_USE_ASYNC_READY_ENQ_MTX
 
 typedef struct {
-    erts_mtx_t mtx;
-    erts_cnd_t cv;
-    erts_tid_t thr;
-    int   len;
-#ifndef ERTS_SMP
-    int   hndl;
+    erts_mtx_t enq_mtx;
+} ErtsAsyncReadyQXData;
+
 #endif
-    ErlAsync* head;
-    ErlAsync* tail;
-#ifdef ERTS_ENABLE_LOCK_CHECK
-    int no;
+
+typedef struct {
+#if ERTS_USE_ASYNC_READY_ENQ_MTX
+    union {
+	ErtsAsyncReadyQXData data;
+	char align__[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(
+		sizeof(ErtsAsyncReadyQXData))];
+    } x;
 #endif
-} AsyncQueue;
+    ErtsThrQ_t thr_q;
+    ErtsThrQFinDeQ_t fin_deq;
+} ErtsAsyncReadyQ;
 
-static erts_smp_spinlock_t async_id_lock;
-static long async_id = 0;
 
+typedef union {
+    ErtsAsyncReadyQ arq;
+    char align__[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsAsyncReadyQ))];
+} ErtsAlgndAsyncReadyQ;
 
-#ifndef ERTS_SMP
+#endif /* ERTS_USE_ASYNC_READY_Q */
 
-erts_mtx_t async_ready_mtx;
-static ErlAsync* async_ready_list = NULL;
+typedef struct {
+    ErtsThrQ_t thr_q;
+    erts_tid_t thr_id;
+} ErtsAsyncQ;
+
+typedef union {
+    ErtsAsyncQ aq;
+    char align__[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsAsyncQ))];
+} ErtsAlgndAsyncQ;
 
+typedef struct {
+    int no_initialized;
+    erts_mtx_t mtx;
+    erts_cnd_t cnd;
+    erts_atomic_t id;
+} ErtsAsyncInit;
+
+typedef struct {
+    union {
+	ErtsAsyncInit data;
+	char align__[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsAsyncInit))];
+    } init;
+    ErtsAlgndAsyncQ *queue;
+#if ERTS_USE_ASYNC_READY_Q
+    ErtsAlgndAsyncReadyQ *ready_queue;
 #endif
+} ErtsAsyncData;
 
-/*
-** Initialize worker threads (if supported)
-*/
+int erts_async_max_threads; /* Initialized by erl_init.c */
+int erts_async_thread_suggested_stack_size; /* Initialized by erl_init.c */
 
-/* Detach from driver */
-static void async_detach(DE_Handle* dh)
-{
-    return;
-}
+static ErtsAsyncData *async;
 
+#ifndef USE_THREADS
 
-#ifdef USE_THREADS
+void
+erts_init_async(void)
+{
 
-static AsyncQueue* async_q;
+}
 
-static void* async_main(void*);
-static void async_add(ErlAsync*, AsyncQueue*);
+#else
 
-#ifndef ERTS_SMP
-typedef struct ErtsAsyncReadyCallback_ ErtsAsyncReadyCallback;
-struct ErtsAsyncReadyCallback_ {
-    struct ErtsAsyncReadyCallback_ *next;
-    void (*callback)(void);
-};
+static void *async_main(void *);
 
-static ErtsAsyncReadyCallback *callbacks;
-static int async_handle;
+static ERTS_INLINE ErtsAsyncQ *
+async_q(int i)
+{
+    return &async->queue[i].aq;
+}
+
+#if ERTS_USE_ASYNC_READY_Q
 
-int erts_register_async_ready_callback(void (*funcp)(void))
+static ERTS_INLINE ErtsAsyncReadyQ *
+async_ready_q(Uint sched_id)
 {
-    ErtsAsyncReadyCallback *cb = erts_alloc(ERTS_ALC_T_ARCALLBACK,
-					    sizeof(ErtsAsyncReadyCallback));
-    cb->next = callbacks;
-    cb->callback = funcp;
-    erts_mtx_lock(&async_ready_mtx);
-    callbacks = cb;
-    erts_mtx_unlock(&async_ready_mtx);
-    return async_handle;
+    return &async->ready_queue[((int)sched_id)-1].arq;
 }
+
 #endif
 
-int init_async(int hndl)
+void
+erts_init_async(void)
 {
-    erts_thr_opts_t thr_opts = ERTS_THR_OPTS_DEFAULT_INITER;
-    AsyncQueue* q;
-    int i;
+    async = NULL;
+    if (erts_async_max_threads > 0) {
+#if ERTS_USE_ASYNC_READY_Q
+	ErtsThrQInit_t qinit = ERTS_THR_Q_INIT_DEFAULT;
+#endif
+	erts_thr_opts_t thr_opts = ERTS_THR_OPTS_DEFAULT_INITER;
+	char *ptr;
+	size_t tot_size = 0;
+	int i;
+
+	tot_size += ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsAsyncData));
+	tot_size += sizeof(ErtsAlgndAsyncQ)*erts_async_max_threads;
+#if ERTS_USE_ASYNC_READY_Q
+	tot_size += sizeof(ErtsAlgndAsyncReadyQ)*erts_no_schedulers;
+#endif
 
-    thr_opts.detached = 0;
-    thr_opts.suggested_stack_size = erts_async_thread_suggested_stack_size;
-
-#ifndef ERTS_SMP
-    callbacks = NULL;
-    async_handle = hndl;
-    erts_mtx_init(&async_ready_mtx, "async_ready");
-    async_ready_list = NULL;
-#endif
-
-    async_id = 0;
-    erts_smp_spinlock_init(&async_id_lock, "async_id");
-
-    async_q = q = (AsyncQueue*)
-	(erts_async_max_threads
-	 ? erts_alloc(ERTS_ALC_T_ASYNC_Q,
-		      erts_async_max_threads * sizeof(AsyncQueue))
-	 : NULL);
-    for (i = 0; i < erts_async_max_threads; i++) {
-	q->head = NULL;
-	q->tail = NULL;
-	q->len = 0;
-#ifndef ERTS_SMP
-	q->hndl = hndl;
-#endif
-#ifdef ERTS_ENABLE_LOCK_CHECK
-	q->no = i;
-#endif
-	erts_mtx_init(&q->mtx, "asyncq");
-	erts_cnd_init(&q->cv);
-	erts_thr_create(&q->thr, async_main, (void*)q, &thr_opts);
-	q++;
-    }
-    return 0;
-}
+	ptr = erts_alloc_permanent_cache_aligned(ERTS_ALC_T_ASYNC_DATA,
+						 tot_size);
 
+	async = (ErtsAsyncData *) ptr;
+	ptr += ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsAsyncData));
 
-int exit_async()
-{
-    int i;
+	async->init.data.no_initialized = 0;
+	erts_mtx_init(&async->init.data.mtx, "async_init_mtx");
+	erts_cnd_init(&async->init.data.cnd);
+	erts_atomic_init_nob(&async->init.data.id, 0);
 
-    /* terminate threads */
-    for (i = 0; i < erts_async_max_threads; i++) {
-	ErlAsync* a = (ErlAsync*) erts_alloc(ERTS_ALC_T_ASYNC,
-					     sizeof(ErlAsync));
-	a->port = NIL;
-	async_add(a, &async_q[i]);
-    }
+	async->queue = (ErtsAlgndAsyncQ *) ptr;
+	ptr += sizeof(ErtsAlgndAsyncQ)*erts_async_max_threads;
 
-    for (i = 0; i < erts_async_max_threads; i++) {
-	erts_thr_join(async_q[i].thr, NULL);
-	erts_mtx_destroy(&async_q[i].mtx);
-	erts_cnd_destroy(&async_q[i].cv);
-    }
-#ifndef ERTS_SMP
-    erts_mtx_destroy(&async_ready_mtx);
+#if ERTS_USE_ASYNC_READY_Q
+
+	qinit.live.queue = ERTS_THR_Q_LIVE_LONG;
+	qinit.live.objects = ERTS_THR_Q_LIVE_SHORT;
+	qinit.notify = erts_notify_check_async_ready_queue;
+
+	async->ready_queue = (ErtsAlgndAsyncReadyQ *) ptr;
+	ptr += sizeof(ErtsAlgndAsyncReadyQ)*erts_no_schedulers;
+
+	for (i = 1; i <= erts_no_schedulers; i++) {
+	    ErtsAsyncReadyQ *arq = async_ready_q(i);
+#if ERTS_USE_ASYNC_READY_ENQ_MTX
+	    erts_mtx_init(&arq->x.data.enq_mtx, "async_enq_mtx");
 #endif
-    if (async_q)
-	erts_free(ERTS_ALC_T_ASYNC_Q, (void *) async_q);
-    return 0;
+	    erts_thr_q_finalize_dequeue_state_init(&arq->fin_deq);
+	    qinit.arg = (void *) (SWord) i;
+	    erts_thr_q_initialize(&arq->thr_q, &qinit);
+	}
+
+#endif
+
+	/* Create async threads... */
+
+	thr_opts.detached = 0;
+	thr_opts.suggested_stack_size
+	    = erts_async_thread_suggested_stack_size;
+
+	for (i = 0; i < erts_async_max_threads; i++) {
+	    ErtsAsyncQ *aq = async_q(i);
+	    erts_thr_create(&aq->thr_id, async_main, (void*) aq, &thr_opts);
+	}
+
+	/* Wait for async threads to initialize... */
+
+	erts_mtx_lock(&async->init.data.mtx);
+	while (async->init.data.no_initialized != erts_async_max_threads)
+	    erts_cnd_wait(&async->init.data.cnd, &async->init.data.mtx);
+	erts_mtx_unlock(&async->init.data.mtx);
+
+	erts_mtx_destroy(&async->init.data.mtx);
+	erts_cnd_destroy(&async->init.data.cnd);
+
+    }
 }
 
+#if ERTS_USE_ASYNC_READY_Q
 
-static void async_add(ErlAsync* a, AsyncQueue* q)
+void * /* ready queue of scheduler sched_id; NULL if no async threads */
+erts_get_async_ready_queue(Uint sched_id)
+{
+    return async ? (void *) async_ready_q(sched_id) : NULL;
+}
+
+#endif
+
+static ERTS_INLINE void async_add(ErtsAsync *a, ErtsAsyncQ* q)
 {
     if (is_internal_port(a->port)) {
-	ERTS_LC_ASSERT(erts_drvportid2port(a->port));
+#if ERTS_USE_ASYNC_READY_Q
+	ErtsAsyncReadyQ *arq = async_ready_q(a->sched_id);
+	a->q.prep_enq = erts_thr_q_prepare_enqueue(&arq->thr_q);
+#endif
 	/* make sure the driver will stay around */
-	driver_lock_driver(internal_port_index(a->port));
+	if (a->hndl)
+	    erts_ddll_reference_referenced_driver(a->hndl);
     }
 
-    erts_mtx_lock(&q->mtx);
+#if ERTS_ASYNC_PRINT_JOB
+    erts_fprintf(stderr, "-> %ld\n", a->async_id);
+#endif
 
-    if (q->len == 0) {
-	q->head = a;
-	q->tail = a;
-	q->len = 1;
-	erts_cnd_signal(&q->cv);
-    }
-    else { /* no need to signal (since the worker is working) */
-	a->next = q->head;
-	q->head->prev = a;
-	q->head = a;
-	q->len++;
-    }
-    erts_mtx_unlock(&q->mtx);
+    erts_thr_q_enqueue(&q->thr_q, a);
 }
 
-static ErlAsync* async_get(AsyncQueue* q)
+static ERTS_INLINE ErtsAsync *async_get(ErtsThrQ_t *q,
+					erts_tse_t *tse,
+					ErtsThrQPrepEnQ_t **prep_enq)
 {
-    ErlAsync* a;
+#if ERTS_USE_ASYNC_READY_Q
+    int saved_fin_deq = 0;
+    ErtsThrQFinDeQ_t fin_deq;
+#endif
 
-    erts_mtx_lock(&q->mtx);
-    while((a = q->tail) == NULL) {
-	erts_cnd_wait(&q->cv, &q->mtx);
-    }
+    while (1) {
+	ErtsAsync *a = (ErtsAsync *) erts_thr_q_dequeue(q);
+	if (a) {
+
+#if ERTS_USE_ASYNC_READY_Q
+	    *prep_enq = a->q.prep_enq;
+	    erts_thr_q_get_finalize_dequeue_data(q, &a->q.fin_deq);
+	    if (saved_fin_deq)
+		erts_thr_q_append_finalize_dequeue_data(&a->q.fin_deq, &fin_deq);
+#endif
+
+	    return a;
+	}
+
+	if (ERTS_THR_Q_DIRTY != erts_thr_q_clean(q)) {
+	    ErtsThrQFinDeQ_t tmp_fin_deq;
+
+	    erts_tse_reset(tse);
+
+#if ERTS_USE_ASYNC_READY_Q
+	chk_fin_deq:
+	    if (erts_thr_q_get_finalize_dequeue_data(q, &tmp_fin_deq)) {
+		if (!saved_fin_deq) {
+		    erts_thr_q_finalize_dequeue_state_init(&fin_deq);
+		    saved_fin_deq = 1;
+		}
+		erts_thr_q_append_finalize_dequeue_data(&fin_deq,
+							&tmp_fin_deq);
+	    }
+#endif
+
+	    switch (erts_thr_q_inspect(q, 1)) {
+	    case ERTS_THR_Q_DIRTY:
+		break;
 #ifdef ERTS_SMP
-    ASSERT(a && q->tail == a);
+	    case ERTS_THR_Q_NEED_THR_PRGR: {
+		ErtsThrPrgrVal prgr = erts_thr_q_need_thr_progress(q);
+		erts_thr_progress_wakeup(NULL, prgr);
+		/*
+		 * We do no dequeue finalizing in hope that a new async
+		 * job will arrive before we are woken due to thread
+		 * progress...
+		 */
+		erts_tse_wait(tse);
+		break;
+	    }
 #endif
-    if (q->head == q->tail) {
-	q->head = q->tail = NULL;
-	q->len = 0;
-    }
-    else {
-	q->tail->prev->next = NULL;
-	q->tail = q->tail->prev;
-	q->len--;
+	    case ERTS_THR_Q_CLEAN:
+
+#if ERTS_USE_ASYNC_READY_Q
+		if (saved_fin_deq) {
+		    if (erts_thr_q_finalize_dequeue(&fin_deq))
+			goto chk_fin_deq;
+		    else
+			saved_fin_deq = 0;
+		}
+#endif
+
+		erts_tse_wait(tse);
+		break;
+
+	    default:
+		ASSERT(0);
+		break;
+	    }
+
+	}
     }
-    erts_mtx_unlock(&q->mtx);
-    return a;
 }
 
-
-static int async_del(long id)
+static ERTS_INLINE void call_async_ready(ErtsAsync *a)
 {
-    int i;
-    /* scan all queue for an entry with async_id == 'id' */
-
-    for (i = 0; i < erts_async_max_threads; i++) {
-	ErlAsync* a;
-	erts_mtx_lock(&async_q[i].mtx);
-	
-	a = async_q[i].head;
-	while(a != NULL) {
-	    if (a->async_id == id) {
-		if (a->prev != NULL)
-		    a->prev->next = a->next;
-		else
-		    async_q[i].head = a->next;
-		if (a->next != NULL)
-		    a->next->prev = a->prev;
-		else
-		    async_q[i].tail = a->prev;
-		async_q[i].len--;
-		erts_mtx_unlock(&async_q[i].mtx);
-		if (a->async_free != NULL)
-		    a->async_free(a->async_data);
-		async_detach(a->hndl);
-		erts_free(ERTS_ALC_T_ASYNC, a);
-		return 1;
-	    }
-	    a = a->next;
+    Port *p = erts_id2port_sflgs(a->port,
+				 NULL,
+				 0,
+				 ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP);
+    if (!p) {
+	if (a->async_free)
+	    a->async_free(a->async_data);
+    }
+    else {
+	if (async_ready(p, a->async_data)) {
+	    if (a->async_free)
+		a->async_free(a->async_data);
 	}
-	erts_mtx_unlock(&async_q[i].mtx);
+	erts_port_release(p);
     }
-    return 0;
+    if (a->hndl)
+	erts_ddll_dereference_driver(a->hndl);
 }
 
-static void* async_main(void* arg)
+static ERTS_INLINE void async_reply(ErtsAsync *a, ErtsThrQPrepEnQ_t *prep_enq)
 {
-    AsyncQueue* q = (AsyncQueue*) arg;
+#if ERTS_USE_ASYNC_READY_Q
+    ErtsAsyncReadyQ *arq;
 
-#ifdef ERTS_ENABLE_LOCK_CHECK
-    {
-	char buf[27];
-	erts_snprintf(&buf[0], 27, "async %d", q->no);
-	erts_lc_set_thread_name(&buf[0]);
-    }
+    if (a->pdl)
+	driver_pdl_dec_refc(a->pdl);
+
+#if ERTS_ASYNC_PRINT_JOB
+    erts_fprintf(stderr, "=>> %ld\n", a->async_id);
 #endif
 
-    while(1) {
-	ErlAsync* a = async_get(q);
+    arq = async_ready_q(a->sched_id);
 
-	if (a->port == NIL) { /* TIME TO DIE SIGNAL */
-	    erts_free(ERTS_ALC_T_ASYNC, (void *) a);
-	    break;
-	}
-	else {
-	    (*a->async_invoke)(a->async_data);
-	    /* Major problem if the code for async_invoke
-	       or async_free is removed during a blocking operation */
+#if ERTS_USE_ASYNC_READY_ENQ_MTX
+	erts_mtx_lock(&arq->x.data.enq_mtx);
+#endif
+
+	erts_thr_q_enqueue_prepared(&arq->thr_q, (void *) a, prep_enq);
+
+#if ERTS_USE_ASYNC_READY_ENQ_MTX
+	erts_mtx_unlock(&arq->x.data.enq_mtx);
+#endif
+
+#else /* ERTS_USE_ASYNC_READY_Q */
+
+	call_async_ready(a);
+	if (a->pdl)
+	    driver_pdl_dec_refc(a->pdl);
+	erts_free(ERTS_ALC_T_ASYNC, (void *) a);
+
+#endif /* ERTS_USE_ASYNC_READY_Q */
+}
+
+
+static void
+async_wakeup(void *vtse)
+{
+    erts_tse_set((erts_tse_t *) vtse);
+}
+
+static erts_tse_t *async_thread_init(ErtsAsyncQ *aq)
+{
+    ErtsThrQInit_t qinit = ERTS_THR_Q_INIT_DEFAULT;
+    erts_tse_t *tse = erts_tse_fetch();
 #ifdef ERTS_SMP
-	    {
-		Port *p;
-		p = erts_id2port_sflgs(a->port,
-				       NULL,
-				       0,
-				       ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP);
-		if (!p) {
-		    if (a->async_free)
-			(*a->async_free)(a->async_data);
-		}
-		else {
-		    if (async_ready(p, a->async_data)) {
-			if (a->async_free)
-			    (*a->async_free)(a->async_data);
-		    }
-		    async_detach(a->hndl);
-		    erts_port_release(p);
-		}
-		if (a->pdl) {
-		    driver_pdl_dec_refc(a->pdl);
-		}
-		erts_free(ERTS_ALC_T_ASYNC, (void *) a);
-	    }
-#else
-	    if (a->pdl) {
-		driver_pdl_dec_refc(a->pdl);
-	    }
-	    erts_mtx_lock(&async_ready_mtx);
-	    a->next = async_ready_list;
-	    async_ready_list = a;
-	    erts_mtx_unlock(&async_ready_mtx);
-	    sys_async_ready(q->hndl);
+    ErtsThrPrgrCallbacks callbacks;
+
+    callbacks.arg = (void *) tse;
+    callbacks.wakeup = async_wakeup;
+    callbacks.prepare_wait = NULL;
+    callbacks.wait = NULL;
+
+    erts_thr_progress_register_unmanaged_thread(&callbacks);
 #endif
-	}
-    }
 
-    return NULL;
+    qinit.live.queue = ERTS_THR_Q_LIVE_LONG;
+    qinit.live.objects = ERTS_THR_Q_LIVE_SHORT;
+    qinit.arg = (void *) tse;
+    qinit.notify = async_wakeup;
+#if ERTS_USE_ASYNC_READY_Q
+    qinit.auto_finalize_dequeue = 0;
+#endif
+
+    erts_thr_q_initialize(&aq->thr_q, &qinit);
+
+    /* Inform main thread that we are done initializing... */
+    erts_mtx_lock(&async->init.data.mtx);
+    async->init.data.no_initialized++;
+    erts_cnd_signal(&async->init.data.cnd);
+    erts_mtx_unlock(&async->init.data.mtx);
+
+    return tse;
 }
 
+static void *async_main(void* arg)
+{
+    ErtsAsyncQ *aq = (ErtsAsyncQ *) arg;
+    erts_tse_t *tse = async_thread_init(aq);
+
+    while (1) {
+	ErtsThrQPrepEnQ_t *prep_enq;
+	ErtsAsync *a = async_get(&aq->thr_q, tse, &prep_enq);
+	if (is_nil(a->port))
+	    break; /* Time to die */
 
+#if ERTS_ASYNC_PRINT_JOB
+	erts_fprintf(stderr, "<- %ld\n", a->async_id);
 #endif
 
-#ifndef ERTS_SMP
+	a->async_invoke(a->async_data);
+
+	async_reply(a, prep_enq);
+    }
+
+    return NULL;
+}
+
+#endif /* USE_THREADS */
 
-int check_async_ready(void)
+void
+erts_exit_flush_async(void)
 {
 #ifdef USE_THREADS
-    ErtsAsyncReadyCallback *cbs;
+    int i;
+    ErtsAsync a;
+    a.port = NIL;
+    /*
+     * Terminate threads in order to flush queues. We do not
+     * bother to clean everything up since we are about to
+     * terminate the runtime system and a cleanup would only
+     * delay the termination.
+     */
+    for (i = 0; i < erts_async_max_threads; i++)
+	async_add(&a, async_q(i));
+    for (i = 0; i < erts_async_max_threads; i++)
+	erts_thr_join(async->queue[i].aq.thr_id, NULL);
 #endif
-    ErlAsync* a;
-    int count = 0;
+}
 
-    erts_mtx_lock(&async_ready_mtx);
-    a = async_ready_list;
-    async_ready_list = NULL;
-#ifdef USE_THREADS
-    cbs = callbacks;
-#endif
-    erts_mtx_unlock(&async_ready_mtx);
-
-    while(a != NULL) {
-	ErlAsync* a_next = a->next;
-	/* Every port not dead */
-	Port *p = erts_id2port_sflgs(a->port,
-				     NULL,
-				     0,
-				     ERTS_PORT_SFLGS_INVALID_DRIVER_LOOKUP);
-	if (!p) {
-	    if (a->async_free)
-		(*a->async_free)(a->async_data);
-	}
-	else {
-	    count++;
-	    if (async_ready(p, a->async_data)) {
-		if (a->async_free != NULL)
-		    (*a->async_free)(a->async_data);
-	    }
-	    async_detach(a->hndl);
-	    erts_port_release(p);
+#if defined(USE_THREADS) && ERTS_USE_ASYNC_READY_Q
+
+int erts_check_async_ready(void *varq)
+{
+    ErtsAsyncReadyQ *arq = (ErtsAsyncReadyQ *) varq;
+    int res = 1;
+    int i;
+
+    for (i = 0; i < ERTS_MAX_ASYNC_READY_CALLS_IN_SEQ; i++) {
+	ErtsAsync *a = (ErtsAsync *) erts_thr_q_dequeue(&arq->thr_q);
+	if (!a) {
+	    res = 0;
+	    break;
 	}
+
+#if ERTS_ASYNC_PRINT_JOB
+	erts_fprintf(stderr, "<<= %ld\n", a->async_id);
+#endif
+	erts_thr_q_append_finalize_dequeue_data(&arq->fin_deq, &a->q.fin_deq);
+	call_async_ready(a);
 	erts_free(ERTS_ALC_T_ASYNC, (void *) a);
-	a = a_next;
     }
-#ifdef USE_THREADS
-    for (; cbs; cbs = cbs->next)
-	(*cbs->callback)();
-#endif
-    return count;
+
+    erts_thr_q_finalize_dequeue(&arq->fin_deq);
+    
+    return res;
 }
 
+int erts_async_ready_clean(void *varq, void *val)
+{
+    ErtsAsyncReadyQ *arq = (ErtsAsyncReadyQ *) varq;
+    ErtsThrQCleanState_t cstate;
+
+    cstate = erts_thr_q_clean(&arq->thr_q);
+
+    if (erts_thr_q_finalize_dequeue(&arq->fin_deq))
+	return ERTS_ASYNC_READY_DIRTY;
+
+    switch (cstate) {
+    case ERTS_THR_Q_DIRTY:
+	return ERTS_ASYNC_READY_DIRTY;
+#ifdef ERTS_SMP
+    case ERTS_THR_Q_NEED_THR_PRGR:
+	*((ErtsThrPrgrVal *) val)
+	    = erts_thr_q_need_thr_progress(&arq->thr_q);
+	return ERTS_ASYNC_READY_NEED_THR_PRGR;
 #endif
+    case ERTS_THR_Q_CLEAN:
+	break;
+    }
+    return ERTS_ASYNC_READY_CLEAN;
+}
 
+#endif
 
 /*
 ** Schedule async_invoke on a worker thread
@@ -393,19 +554,29 @@ long driver_async(ErlDrvPort ix, unsigned int* key,
 		  void (*async_invoke)(void*), void* async_data,
 		  void (*async_free)(void*))
 {
-    ErlAsync* a = (ErlAsync*) erts_alloc(ERTS_ALC_T_ASYNC, sizeof(ErlAsync));
-    Port* prt = erts_drvport2port(ix);
+    ErtsAsync* a;
+    Port* prt;
     long id;
     unsigned int qix;
+#if ERTS_USE_ASYNC_READY_Q
+    Uint sched_id;
 
+    sched_id = erts_get_scheduler_id();
+    if (!sched_id)
+	sched_id = 1;
+#endif
 
+    prt = erts_drvport2port(ix);
     if (!prt)
 	return -1;
 
     ERTS_SMP_LC_ASSERT(erts_lc_is_port_locked(prt));
 
-    a->next = NULL;
-    a->prev = NULL;
+    a = (ErtsAsync*) erts_alloc(ERTS_ALC_T_ASYNC, sizeof(ErtsAsync));
+
+#if ERTS_USE_ASYNC_READY_Q
+    a->sched_id = sched_id;
+#endif
     a->hndl = (DE_Handle*)prt->drv_ptr->handle;
     a->port = prt->id;
     a->pdl = NULL;
@@ -413,12 +584,16 @@ long driver_async(ErlDrvPort ix, unsigned int* key,
     a->async_invoke = async_invoke;
     a->async_free = async_free;
 
-    erts_smp_spin_lock(&async_id_lock);
-    async_id = (async_id + 1) & 0x7fffffff;
-    if (async_id == 0)
-	async_id++;
-    id = async_id;
-    erts_smp_spin_unlock(&async_id_lock);
+    if (!async)
+	id = 0;
+    else {
+	do {
+	    id = erts_atomic_inc_read_nob(&async->init.data.id);
+	} while (id == 0);
+	if (id < 0)
+	    id *= -1;
+	ASSERT(id > 0);
+    }
 
     a->async_id = id;
 
@@ -437,7 +612,7 @@ long driver_async(ErlDrvPort ix, unsigned int* key,
 	    driver_pdl_inc_refc(prt->port_data_lock);
 	    a->pdl = prt->port_data_lock;
 	}
-	async_add(a, &async_q[qix]);
+	async_add(a, async_q(qix));
 	return id;
     }
 #endif
@@ -455,10 +630,16 @@ long driver_async(ErlDrvPort ix, unsigned int* key,
 
 int driver_async_cancel(unsigned int id)
 {
-#ifdef USE_THREADS
-    if (erts_async_max_threads > 0)
-	return async_del(id);
-#endif
+    /*
+     * Not supported anymore. Always fail (which is backward
+     * compatible).
+     *
+     * This functionality could be implemented again. However,
+     * it is (and always has been) completely useless since
+     * it doesn't give you any guarantees whatsoever. The user
+     * needs to (and has always had to) synchronize in their
+     * own code in order to get any guarantees.
+     */
     return 0;
 }
 
diff --git a/erts/emulator/beam/erl_async.h b/erts/emulator/beam/erl_async.h
new file mode 100644
index 0000000000..95374a8fc9
--- /dev/null
+++ b/erts/emulator/beam/erl_async.h
@@ -0,0 +1,66 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2011. All Rights Reserved.
+ *
+ * The contents of this file are subject to the Erlang Public License,
+ * Version 1.1, (the "License"); you may not use this file except in
+ * compliance with the License. You should have received a copy of the
+ * Erlang Public License along with this software. If not, it can be
+ * retrieved online at http://www.erlang.org/.
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+#ifndef ERL_ASYNC_H__
+#define ERL_ASYNC_H__
+
+#define ERTS_MAX_NO_OF_ASYNC_THREADS 1024
+extern int erts_async_max_threads;
+#define ERTS_ASYNC_THREAD_MIN_STACK_SIZE 16	/* Kilo words */
+#define ERTS_ASYNC_THREAD_MAX_STACK_SIZE 8192	/* Kilo words */
+extern int erts_async_thread_suggested_stack_size;
+
+#ifdef USE_THREADS
+
+#ifdef ERTS_SMP
+/*
+ * With smp support we can choose to have, or not to
+ * have an async ready queue.
+ */
+#define ERTS_USE_ASYNC_READY_Q 1
+#endif
+
+#ifndef ERTS_SMP
+/* In non-smp case we *need* the async ready queue */
+#  undef ERTS_USE_ASYNC_READY_Q
+#  define ERTS_USE_ASYNC_READY_Q 1
+#endif
+
+#ifndef ERTS_USE_ASYNC_READY_Q
+#  define ERTS_USE_ASYNC_READY_Q 0
+#endif
+
+#if ERTS_USE_ASYNC_READY_Q
+int erts_check_async_ready(void *);
+int erts_async_ready_clean(void *, void *);
+void *erts_get_async_ready_queue(Uint sched_id);
+#define ERTS_ASYNC_READY_CLEAN 0
+#define ERTS_ASYNC_READY_DIRTY 1
+#ifdef ERTS_SMP
+#define ERTS_ASYNC_READY_NEED_THR_PRGR 2
+#endif
+#endif /* ERTS_USE_ASYNC_READY_Q */
+
+#endif /* USE_THREADS */
+
+void erts_init_async(void);
+void erts_exit_flush_async(void);
+
+
+#endif /* ERL_ASYNC_H__ */
diff --git a/erts/emulator/beam/erl_bif_info.c b/erts/emulator/beam/erl_bif_info.c
index 7119306a52..a79feaebdb 100644
--- a/erts/emulator/beam/erl_bif_info.c
+++ b/erts/emulator/beam/erl_bif_info.c
@@ -39,6 +39,7 @@
 #include "dist.h"
 #include "erl_gc.h"
 #include "erl_cpu_topology.h"
+#include "erl_async.h"
 #include "erl_thr_progress.h"
 #ifdef HIPE
 #include "hipe_arch.h"
diff --git a/erts/emulator/beam/erl_db.c b/erts/emulator/beam/erl_db.c
index d8b4294a30..0079c13287 100644
--- a/erts/emulator/beam/erl_db.c
+++ b/erts/emulator/beam/erl_db.c
@@ -280,8 +280,7 @@ static void schedule_free_dbtable(DbTable* tb)
     ASSERT(scheds >= 1);
     ASSERT(erts_refc_read(&tb->common.ref, 0) == 0);
     erts_refc_init(&tb->common.ref, scheds);
-    ERTS_THR_MEMORY_BARRIER;
-    erts_smp_schedule_misc_aux_work(0, scheds, chk_free_dbtable, tb);
+    erts_schedule_multi_misc_aux_work(0, scheds, chk_free_dbtable, tb);
 #else
     free_dbtable(tb);
 #endif
diff --git a/erts/emulator/beam/erl_driver.h b/erts/emulator/beam/erl_driver.h
index 401967a8de..ae0c9def90 100644
--- a/erts/emulator/beam/erl_driver.h
+++ b/erts/emulator/beam/erl_driver.h
@@ -28,6 +28,14 @@
 #  include "config.h"
 #endif
 
+#define ERL_DRV_DEPRECATED_FUNC
+#ifdef __GNUC__
+#  if __GNUC__ >= 3
+#    undef ERL_DRV_DEPRECATED_FUNC
+#    define ERL_DRV_DEPRECATED_FUNC __attribute__((deprecated))
+#  endif
+#endif
+
 #ifdef SIZEOF_CHAR
 #  define SIZEOF_CHAR_SAVED__ SIZEOF_CHAR
 #  undef SIZEOF_CHAR
@@ -582,8 +590,11 @@ EXTERN long driver_async(ErlDrvPort ix,
 			 void* async_data,
 			 void (*async_free)(void*));
 
-
-EXTERN int driver_async_cancel(unsigned int key);
+/*
+ * driver_async_cancel() is deprecated. It is scheduled for removal
+ * in OTP-R16. For more information see the erl_driver(3) documentation.
+ */
+EXTERN int driver_async_cancel(unsigned int key) ERL_DRV_DEPRECATED_FUNC;
 
 /* Locks the driver in the machine "forever", there is
    no unlock function. Note that this is almost never useful, as an open
diff --git a/erts/emulator/beam/erl_init.c b/erts/emulator/beam/erl_init.c
index 647074a47f..9a09f08618 100644
--- a/erts/emulator/beam/erl_init.c
+++ b/erts/emulator/beam/erl_init.c
@@ -43,6 +43,8 @@
 #include "packet_parser.h"
 #include "erl_cpu_topology.h"
 #include "erl_thr_progress.h"
+#include "erl_thr_queue.h"
+#include "erl_async.h"
 
 #ifdef HIPE
 #include "hipe_mode_switch.h"	/* for hipe_mode_switch_init() */
@@ -100,8 +102,6 @@ int erts_backtrace_depth;	/* How many functions to show in a backtrace
 				 * in error codes.
 				 */
 
-int erts_async_max_threads;  /* number of threads for async support */
-int erts_async_thread_suggested_stack_size;
 erts_smp_atomic32_t erts_max_gen_gcs;
 
 Eterm erts_error_logger_warnings; /* What to map warning logs to, am_error, 
@@ -280,6 +280,7 @@ erl_init(int ncpu)
     erts_init_node_tables();
     init_dist();
     erl_drv_thr_init();
+    erts_init_async();
     init_io();
     init_copy();
     init_load();
@@ -606,6 +607,8 @@ early_init(int *argc, char **argv) /*
     int max_main_threads;
     int max_reader_groups;
     int reader_groups;
+    char envbuf[21]; /* enough for any 64-bit integer */
+    size_t envbufsz;
 
     use_multi_run_queue = 1;
     erts_printf_eterm_func = erts_printf_term;
@@ -677,6 +680,16 @@ early_init(int *argc, char **argv) /*
     schdlrs = no_schedulers;
     schdlrs_onln = no_schedulers_online;
 
+    envbufsz = sizeof(envbuf);
+
+    /* erts_sys_getenv() not initialized yet; need erts_sys_getenv__() */
+    if (erts_sys_getenv__("ERL_THREAD_POOL_SIZE", envbuf, &envbufsz) != 0)
+	erts_async_max_threads = 0;
+    else if ((erts_async_max_threads = atoi(envbuf)) < 0)
+	erts_async_max_threads = 0; /* negative size: run without pool */
+    else if (erts_async_max_threads > ERTS_MAX_NO_OF_ASYNC_THREADS)
+	erts_async_max_threads = ERTS_MAX_NO_OF_ASYNC_THREADS;
+
     if (argc && argv) {
 	int i = 1;
 	while (i < *argc) {
@@ -704,6 +717,20 @@ early_init(int *argc, char **argv) /*
 		    }
 		    break;
 		}
+		case 'A': {
+		    /* set number of threads in thread pool */
+		    char *arg = get_arg(argv[i]+2, argv[i+1], &i);
+		    if (((erts_async_max_threads = atoi(arg)) < 0) ||
+			(erts_async_max_threads > ERTS_MAX_NO_OF_ASYNC_THREADS)) {
+			erts_fprintf(stderr,
+				     "bad number of async threads %s\n",
+				     arg);
+			erts_usage();
+		    }
+		    VERBOSE(DEBUG_SYSTEM, ("using %d async-threads\n",
+					   erts_async_max_threads));
+		    break;
+		}
 		case 'S' : {
 		    int tot, onln;
 		    char *arg = get_arg(argv[i]+2, argv[i+1], &i);
@@ -784,10 +811,14 @@ early_init(int *argc, char **argv) /*
      * ** Aux thread (see erl_process.c)
      * ** Sys message dispatcher thread (see erl_trace.c)
      *
-     * * No unmanaged threads that need to register.
+     * * Unmanaged threads that need to register:
+     * ** Async threads (see erl_async.c)
      */
-    erts_thr_progress_init(no_schedulers, no_schedulers+1, 0);
+    erts_thr_progress_init(no_schedulers,
+			   no_schedulers+2,
+			   erts_async_max_threads);
 #endif
+    erts_thr_q_init();
     erts_init_utils();
     erts_early_init_cpu_topology(no_schedulers,
 				 &max_main_threads,
@@ -867,7 +898,6 @@ erl_start(int argc, char **argv)
     int have_break_handler = 1;
     char envbuf[21]; /* enough for any 64-bit integer */
     size_t envbufsz;
-    int async_max_threads = erts_async_max_threads;
     int ncpu = early_init(&argc, argv);
 
     envbufsz = sizeof(envbuf);
@@ -883,11 +913,6 @@ erl_start(int argc, char **argv)
 				  (erts_aint32_t) max_gen_gcs);
     }
 
-    envbufsz = sizeof(envbuf);
-    if (erts_sys_getenv("ERL_THREAD_POOL_SIZE", envbuf, &envbufsz) == 0) {
-	async_max_threads = atoi(envbuf);
-    }
-
 #if (defined(__APPLE__) && defined(__MACH__)) || defined(__DARWIN__)
     /*
      * The default stack size on MacOS X is too small for pcre.
@@ -1317,17 +1342,8 @@ erl_start(int argc, char **argv)
 	    break;
 	}
 
-	case 'A':
-	    /* set number of threads in thread pool */
-	    arg = get_arg(argv[i]+2, argv[i+1], &i);
-	    if (((async_max_threads = atoi(arg)) < 0) ||
-		(async_max_threads > ERTS_MAX_NO_OF_ASYNC_THREADS)) {
-		erts_fprintf(stderr, "bad number of async threads %s\n", arg);
-		erts_usage();
-	    }
-
-	    VERBOSE(DEBUG_SYSTEM, ("using %d async-threads\n",
-				   async_max_threads));
+	case 'A': /* Was handled in early init just read past it */
+	    (void) get_arg(argv[i]+2, argv[i+1], &i);
 	    break;
 
 	case 'a':
@@ -1416,10 +1432,6 @@ erl_start(int argc, char **argv)
 	i++;
     }
 
-#ifdef USE_THREADS
-    erts_async_max_threads = async_max_threads;
-#endif
-
     /* Delayed check of +P flag */
     if (erts_max_processes < ERTS_MIN_PROCESSES
 	|| erts_max_processes > ERTS_MAX_PROCESSES
@@ -1465,6 +1477,10 @@ erl_start(int argc, char **argv)
     erts_sys_main_thread(); /* May or may not return! */
 #else
     erts_thr_set_main_status(1, 1);
+#if ERTS_USE_ASYNC_READY_Q
+    erts_get_scheduler_data()->aux_work_data.async_ready.queue
+	= erts_get_async_ready_queue(1);
+#endif
     set_main_stack_size();
     process_main();
 #endif
@@ -1537,14 +1553,7 @@ system_cleanup(int exit_code)
     erts_cleanup_incgc();
 #endif
 
-#if defined(USE_THREADS)
-    exit_async();
-#endif
-
-    /*
-     * A lot more cleaning could/should have been done...
-     */
-
+    erts_exit_flush_async();
 }
 
 /*
diff --git a/erts/emulator/beam/erl_lock_check.c b/erts/emulator/beam/erl_lock_check.c
index 02d1407a2d..44da6b6c51 100644
--- a/erts/emulator/beam/erl_lock_check.c
+++ b/erts/emulator/beam/erl_lock_check.c
@@ -110,10 +110,6 @@ static erts_lc_lock_order_t erts_lock_order[] = {
     {	"fun_tab",				NULL			},
     {	"environ",				NULL			},
 #endif
-    {	"asyncq",				"address"		},
-#ifndef ERTS_SMP
-    {	"async_ready",				NULL			},
-#endif
     {	"efile_drv",				"address"		},
 #if defined(ENABLE_CHILD_WAITER_THREAD) || defined(ERTS_SMP)
     {	"child_status",				NULL			},
@@ -138,6 +134,7 @@ static erts_lc_lock_order_t erts_lock_order[] = {
     {	"alcu_init_atoms",			NULL			},
     {	"mseg_init_atoms",			NULL			},
     {	"drv_tsd",				NULL			},
+    {	"async_enq_mtx",			NULL			},
 #ifdef ERTS_SMP
     {	"sys_msg_q", 				NULL			},
     {	"atom_tab",				NULL			},
@@ -173,14 +170,12 @@ static erts_lc_lock_order_t erts_lock_order[] = {
     {	"timeofday",				NULL			},
     {	"breakpoints",				NULL			},
     {	"pollsets_lock",			NULL			},
-    {	"async_id",				NULL			},
     {	"pix_lock",				"address"		},
     {	"run_queues_lists",			NULL			},
-    {	"misc_aux_work_queue",			"index"			},
-    {	"misc_aux_work_pre_alloc_lock",		"address"		},
     {	"sched_stat",				NULL			},
     {	"run_queue_sleep_list",			"address"		},
 #endif
+    {	"async_init_mtx",			NULL			},
 #ifdef ERTS_SMP
     {	"proc_lck_qs_alloc",			NULL 			},
 #endif
diff --git a/erts/emulator/beam/erl_process.c b/erts/emulator/beam/erl_process.c
index 6adeef2e69..a3c1c9577b 100644
--- a/erts/emulator/beam/erl_process.c
+++ b/erts/emulator/beam/erl_process.c
@@ -40,6 +40,8 @@
 #include "beam_bp.h"
 #include "erl_cpu_topology.h"
 #include "erl_thr_progress.h"
+#include "erl_thr_queue.h"
+#include "erl_async.h"
 
 #define ERTS_RUNQ_CHECK_BALANCE_REDS_PER_SCHED (2000*CONTEXT_REDS)
 #define ERTS_RUNQ_CALL_CHECK_BALANCE_REDS \
@@ -125,7 +127,6 @@ ErtsLcPSDLocks erts_psd_required_locks[ERTS_PSD_SIZE];
 #endif
 
 #ifdef ERTS_SMP
-
 int erts_disable_proc_not_running_opt;
 
 static ErtsAuxWorkData *aux_thread_aux_work_data;
@@ -361,6 +362,15 @@ dbg_chk_aux_work_val(erts_aint32_t value)
 #ifdef ERTS_SSI_AUX_WORK_MISC
     valid |= ERTS_SSI_AUX_WORK_MISC;
 #endif
+#ifdef ERTS_SSI_AUX_WORK_MISC_THR_PRGR
+    valid |= ERTS_SSI_AUX_WORK_MISC_THR_PRGR;
+#endif
+#ifdef ERTS_SSI_AUX_WORK_ASYNC_READY
+    valid |= ERTS_SSI_AUX_WORK_ASYNC_READY;
+#endif
+#ifdef ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN
+    valid |= ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN;
+#endif
 
 #ifdef ERTS_SSI_AUX_WORK_FIX_ALLOC_LOWER_LIM
     valid |= ERTS_SSI_AUX_WORK_FIX_ALLOC_LOWER_LIM;
@@ -707,37 +717,37 @@ unset_aux_work_flags(ErtsSchedulerSleepInfo *ssi, erts_aint32_t flgs)
     return erts_atomic32_read_band_nob(&ssi->aux_work, ~flgs);
 }
 
-#ifdef ERTS_SMP
-
 typedef struct erts_misc_aux_work_t_ erts_misc_aux_work_t;
 struct erts_misc_aux_work_t_ {
-    erts_misc_aux_work_t *next;
     void (*func)(void *);
     void *arg;
 };
 
-typedef struct {
-    erts_smp_mtx_t mtx;
-    erts_misc_aux_work_t *first;
-    erts_misc_aux_work_t *last;
-} erts_misc_aux_work_q_t;
+ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(misc_aux_work,
+				 erts_misc_aux_work_t,
+				 200,
+				 ERTS_ALC_T_MISC_AUX_WORK)
 
 typedef union {
-    erts_misc_aux_work_q_t data;
-    char align[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(erts_misc_aux_work_q_t))];
+    ErtsThrQ_t q;
+    char align[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsThrQ_t))];
 } erts_algnd_misc_aux_work_q_t;
 
 static erts_algnd_misc_aux_work_q_t *misc_aux_work_queues;
 
-ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(misc_aux_work,
-				 erts_misc_aux_work_t,
-				 200,
-				 ERTS_ALC_T_MISC_AUX_WORK)
+static void
+notify_aux_work(void *vssi)
+{
+    set_aux_work_flags_wakeup_nob((ErtsSchedulerSleepInfo *) vssi,
+				  ERTS_SSI_AUX_WORK_MISC);
+}
 
 static void
 init_misc_aux_work(void)
 {
     int ix;
+    ErtsThrQInit_t qinit = ERTS_THR_Q_INIT_DEFAULT;
+    qinit.notify = notify_aux_work;
 
     init_misc_aux_work_alloc();
 
@@ -746,88 +756,189 @@ init_misc_aux_work(void)
 					   sizeof(erts_algnd_misc_aux_work_q_t)
 					   * (erts_no_schedulers+1));
 
-    for (ix = 0; ix <= erts_no_schedulers; ix++) {
-	erts_smp_mtx_init_x(&misc_aux_work_queues[ix].data.mtx,
-			    "misc_aux_work_queue",
-			    make_small(ix));
-	misc_aux_work_queues[ix].data.first = NULL;
-	misc_aux_work_queues[ix].data.last = NULL;
+#ifdef ERTS_SMP
+    ix = 0; /* aux_thread + schedulers */
+#else
+    ix = 1; /* scheduler only */
+#endif
+
+    for (; ix <= erts_no_schedulers; ix++) {
+	qinit.arg = (void *) ERTS_SCHED_SLEEP_INFO_IX(ix-1);
+	erts_thr_q_initialize(&misc_aux_work_queues[ix].q, &qinit);
+    }
+}
+
+static erts_aint32_t
+misc_aux_work_clean(ErtsThrQ_t *q,
+		    ErtsAuxWorkData *awdp,
+		    erts_aint32_t aux_work)
+{
+    switch (erts_thr_q_clean(q)) {
+    case ERTS_THR_Q_DIRTY:
+	set_aux_work_flags(awdp->ssi, ERTS_SSI_AUX_WORK_MISC);
+	return aux_work | ERTS_SSI_AUX_WORK_MISC;
+#ifdef ERTS_SMP
+    case ERTS_THR_Q_NEED_THR_PRGR:
+	set_aux_work_flags(awdp->ssi, ERTS_SSI_AUX_WORK_MISC_THR_PRGR);
+	erts_thr_progress_wakeup(awdp->esdp,
+				 erts_thr_q_need_thr_progress(q));
+#endif
+    case ERTS_THR_Q_CLEAN:
+	break;
     }
+    return aux_work;
 }
 
 static erts_aint32_t
 handle_misc_aux_work(ErtsAuxWorkData *awdp,
 		     erts_aint32_t aux_work)
 {
-    int ix = (int) awdp->sched_id;
-    erts_misc_aux_work_t *mawp;
+    ErtsThrQ_t *q = &misc_aux_work_queues[awdp->sched_id].q;
 
     unset_aux_work_flags(awdp->ssi, ERTS_SSI_AUX_WORK_MISC);
-
-    erts_smp_mtx_lock(&misc_aux_work_queues[ix].data.mtx);
-    mawp = misc_aux_work_queues[ix].data.first;
-    misc_aux_work_queues[ix].data.first = NULL;
-    misc_aux_work_queues[ix].data.last = NULL;
-    erts_smp_mtx_unlock(&misc_aux_work_queues[ix].data.mtx);
-
-    while (mawp) {
-	erts_misc_aux_work_t *free_mawp;
+    while (1) {
+	erts_misc_aux_work_t *mawp = erts_thr_q_dequeue(q);
+	if (!mawp)
+	    break;
 	mawp->func(mawp->arg);
-	free_mawp = mawp;
-	mawp = mawp->next;
-	misc_aux_work_free(free_mawp);
+	misc_aux_work_free(mawp);
     }
 
-    return aux_work & ~ERTS_SSI_AUX_WORK_MISC;
+    return misc_aux_work_clean(q, awdp, aux_work & ~ERTS_SSI_AUX_WORK_MISC);
 }
 
-static void
-smp_schedule_misc_aux_work(int ix,
-			   void (*func)(void *),
-			   void *arg)
+#ifdef ERTS_SMP
+
+static erts_aint32_t
+handle_misc_aux_work_thr_prgr(ErtsAuxWorkData *awdp,
+			      erts_aint32_t aux_work)
 {
-    erts_aint32_t aux_work;
+    if (!erts_thr_progress_has_reached(awdp->misc.thr_prgr))
+	return aux_work;
+
+    unset_aux_work_flags(awdp->ssi, ERTS_SSI_AUX_WORK_MISC_THR_PRGR);
+
+    return misc_aux_work_clean(&misc_aux_work_queues[awdp->sched_id].q,
+			       awdp,
+			       aux_work & ~ERTS_SSI_AUX_WORK_MISC_THR_PRGR);
+}
+
+#endif
+
+static ERTS_INLINE void
+schedule_misc_aux_work(int sched_id,
+		       void (*func)(void *),
+		       void *arg)
+{
+    ErtsThrQ_t *q;
     erts_misc_aux_work_t *mawp;
-    ErtsSchedulerSleepInfo *ssi;
 
-    mawp = misc_aux_work_alloc();
+#ifdef ERTS_SMP
+    ASSERT(0 <= sched_id && sched_id <= erts_no_schedulers);
+#else
+    ASSERT(sched_id == 1);
+#endif
 
+    q = &misc_aux_work_queues[sched_id].q;
+    mawp = misc_aux_work_alloc();
     mawp->func = func;
     mawp->arg = arg;
-    mawp->next = NULL;
-
-    erts_smp_mtx_lock(&misc_aux_work_queues[ix].data.mtx);
-    if (!misc_aux_work_queues[ix].data.last)
-	misc_aux_work_queues[ix].data.first = mawp;
-    else
-	misc_aux_work_queues[ix].data.last->next = mawp;
-    misc_aux_work_queues[ix].data.last = mawp;
-    erts_smp_mtx_unlock(&misc_aux_work_queues[ix].data.mtx);
+    erts_thr_q_enqueue(q, mawp);
+}
 
-    set_aux_work_flags_wakeup_nob(ERTS_SCHED_SLEEP_INFO_IX(ix-1),
-				  ERTS_SSI_AUX_WORK_MISC);
+void
+erts_schedule_misc_aux_work(int sched_id,
+			    void (*func)(void *),
+			    void *arg)
+{
+    schedule_misc_aux_work(sched_id, func, arg);
 }
 
 void
-erts_smp_schedule_misc_aux_work(int ignore_self,
-				int max_sched,
-				void (*func)(void *),
-				void *arg)
+erts_schedule_multi_misc_aux_work(int ignore_self,
+				  int max_sched,
+				  void (*func)(void *),
+				  void *arg)
 {
-    int ix, ignore_ix = -1;
+    int id, self = 0;
 
     if (ignore_self) {
 	ErtsSchedulerData *esdp = erts_get_scheduler_data();
 	if (esdp)
-	    ignore_ix = (int) esdp->no;
+	    self = (int) esdp->no;
     }
 
     ASSERT(0 < max_sched && max_sched <= erts_no_schedulers);
 
-    for (ix = 1; ix <= max_sched; ix++) {
-	if (ix == ignore_ix)
+    for (id = 1; id <= max_sched; id++) {
+	if (id == self)
 	    continue;
-	smp_schedule_misc_aux_work(ix, func, arg);
+	schedule_misc_aux_work(id, func, arg);
+   }
+}
+
+#if ERTS_USE_ASYNC_READY_Q
+
+void
+erts_notify_check_async_ready_queue(void *vno)
+{
+    int ix = ((int) (SWord) vno) -1;
+    set_aux_work_flags_wakeup_nob(ERTS_SCHED_SLEEP_INFO_IX(ix),
+				  ERTS_SSI_AUX_WORK_ASYNC_READY);
+}
+
+static erts_aint32_t
+handle_async_ready(ErtsAuxWorkData *awdp,
+		   erts_aint32_t aux_work)
+{   /* Handle the ASYNC_READY aux work flag; returns updated aux_work. */
+    ErtsSchedulerSleepInfo *ssi = awdp->ssi;
+    unset_aux_work_flags(ssi, ERTS_SSI_AUX_WORK_ASYNC_READY);
+    if (erts_check_async_ready(awdp->async_ready.queue)) {
+	if (set_aux_work_flags(ssi, ERTS_SSI_AUX_WORK_ASYNC_READY)
+	    & ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN) {
+	    unset_aux_work_flags(ssi, ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN);
+	    aux_work &= ~ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN;
+	}
+	return aux_work; /* ASYNC_READY was set again above */
+    }
+#ifdef ERTS_SMP
+    awdp->async_ready.need_thr_prgr = 0;
+#endif
+    set_aux_work_flags(ssi, ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN);
+    return ((aux_work & ~ERTS_SSI_AUX_WORK_ASYNC_READY)
+	    | ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN); /* hand over to clean */
+}
+
+static erts_aint32_t
+handle_async_ready_clean(ErtsAuxWorkData *awdp,
+			 erts_aint32_t aux_work)
+{
+    void *thr_prgr_p;
+    /* Clean the async ready queue; returns the updated aux_work flags. */
+#ifdef ERTS_SMP
+    if (awdp->async_ready.need_thr_prgr
+	&& !erts_thr_progress_has_reached(awdp->async_ready.thr_prgr)) {
+	return aux_work & ~ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN; /* not yet */
+    }
+    /* Progress reached (or none needed): continue cleaning. */
+    awdp->async_ready.need_thr_prgr = 0;
+    thr_prgr_p = (void *) &awdp->async_ready.thr_prgr;
+#else
+    thr_prgr_p = NULL;
+#endif
+
+    switch (erts_async_ready_clean(awdp->async_ready.queue, thr_prgr_p)) {
+    case ERTS_ASYNC_READY_CLEAN:
+	unset_aux_work_flags(awdp->ssi, ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN);
+	return aux_work & ~ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN;
+#ifdef ERTS_SMP
+    case ERTS_ASYNC_READY_NEED_THR_PRGR:
+	erts_thr_progress_wakeup(awdp->esdp,
+				 awdp->async_ready.thr_prgr);
+	awdp->async_ready.need_thr_prgr = 1; /* checked on the next call */
+#endif
+    default: /* ERTS_ASYNC_READY_NEED_THR_PRGR also ends up here */
+	return aux_work;
     }
 }
 
@@ -964,14 +1075,14 @@ prep_setup_completed_dealloc(void *vproc)
     erts_aint32_t count = (erts_aint32_t) (erts_no_schedulers+1);
     if (erts_atomic32_dec_read_mb(&completed_dealloc_count) == count) {
 	/* scheduler threads */
-	erts_smp_schedule_misc_aux_work(0,
-					erts_no_schedulers,
-					setup_completed_dealloc,
-					vproc);
+	erts_schedule_multi_misc_aux_work(0,
+					  erts_no_schedulers,
+					  setup_completed_dealloc,
+					  vproc);
 	/* aux_thread */
-	smp_schedule_misc_aux_work(0,
-				   setup_completed_dealloc,
-				   vproc);
+	erts_schedule_misc_aux_work(0,
+				    setup_completed_dealloc,
+				    vproc);
     }
 }
 
@@ -992,14 +1103,14 @@ erts_debug_wait_deallocations(Process *c_p)
 	erts_suspend(c_p, ERTS_PROC_LOCK_MAIN, NULL);
 	erts_smp_proc_inc_refc(c_p);
 	/* scheduler threads */
-	erts_smp_schedule_misc_aux_work(0,
-					erts_no_schedulers,
-					prep_setup_completed_dealloc,
-					(void *) c_p);
+	erts_schedule_multi_misc_aux_work(0,
+					  erts_no_schedulers,
+					  prep_setup_completed_dealloc,
+					  (void *) c_p);
 	/* aux_thread */
-	smp_schedule_misc_aux_work(0,
-				   prep_setup_completed_dealloc,
-				   (void *) c_p);
+	erts_schedule_misc_aux_work(0,
+				    prep_setup_completed_dealloc,
+				    (void *) c_p);
 	return 1;
     }
     return 0;
@@ -1062,10 +1173,24 @@ handle_aux_work(ErtsAuxWorkData *awdp, erts_aint32_t aux_work)
 	ERTS_DBG_CHK_AUX_WORK_VAL(aux_work);
     }
 #ifdef ERTS_SMP
+    if (aux_work & ERTS_SSI_AUX_WORK_MISC_THR_PRGR) {
+	aux_work = handle_misc_aux_work_thr_prgr(awdp, aux_work);
+	ERTS_DBG_CHK_AUX_WORK_VAL(aux_work);
+    }
+#endif
     if (aux_work & ERTS_SSI_AUX_WORK_MISC) {
 	aux_work = handle_misc_aux_work(awdp, aux_work);
 	ERTS_DBG_CHK_AUX_WORK_VAL(aux_work);
     }
+#if ERTS_USE_ASYNC_READY_Q
+    if (aux_work & ERTS_SSI_AUX_WORK_ASYNC_READY) {
+	aux_work = handle_async_ready(awdp, aux_work);
+	ERTS_DBG_CHK_AUX_WORK_VAL(aux_work);
+    }
+    if (aux_work & ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN) {
+	aux_work = handle_async_ready_clean(awdp, aux_work);
+	ERTS_DBG_CHK_AUX_WORK_VAL(aux_work);
+    }
 #endif
 #ifdef ERTS_SMP_SCHEDULERS_NEED_TO_CHECK_CHILDREN
     if (aux_work & ERTS_SSI_AUX_WORK_CHECK_CHILDREN) {
@@ -3191,10 +3316,18 @@ init_aux_work_data(ErtsAuxWorkData *awdp, ErtsSchedulerData *esdp)
     awdp->esdp = esdp;
     awdp->ssi = esdp ? esdp->ssi : NULL;
 #ifdef ERTS_SMP
+    awdp->misc.thr_prgr = ERTS_THR_PRGR_VAL_WAITING;
     awdp->dd.thr_prgr = ERTS_THR_PRGR_VAL_WAITING;
     awdp->dd.completed_callback = NULL;
     awdp->dd.completed_arg = NULL;
 #endif
+#ifdef ERTS_USE_ASYNC_READY_Q
+#ifdef ERTS_SMP
+    awdp->async_ready.need_thr_prgr = 0;
+    awdp->async_ready.thr_prgr = ERTS_THR_PRGR_VAL_WAITING;
+#endif
+    awdp->async_ready.queue = NULL;
+#endif
 }
 
 void
@@ -3385,9 +3518,10 @@ erts_init_scheduling(int mrq, int no_schedulers, int no_schedulers_online)
 	init_aux_work_data(&esdp->aux_work_data, esdp);
     }
 
-#ifdef ERTS_SMP
     init_misc_aux_work();
 
+#ifdef ERTS_SMP
+
     erts_atomic32_init_nob(&completed_dealloc_count, 0); /* debug only */
 
     aux_thread_aux_work_data =
@@ -4408,6 +4542,9 @@ sched_thread_func(void *vesdp)
 #if HAVE_ERTS_MSEG
     erts_mseg_late_init();
 #endif
+#if ERTS_USE_ASYNC_READY_Q
+    esdp->aux_work_data.async_ready.queue = erts_get_async_ready_queue(no);
+#endif
 
     erts_sched_init_check_cpu_bind(esdp);
 
diff --git a/erts/emulator/beam/erl_process.h b/erts/emulator/beam/erl_process.h
index 8a0944236c..4027fade35 100644
--- a/erts/emulator/beam/erl_process.h
+++ b/erts/emulator/beam/erl_process.h
@@ -54,6 +54,7 @@ typedef struct process Process;
 #include "erl_atom_table.h"
 #include "external.h"
 #include "erl_mseg.h"
+#include "erl_async.h"
 
 #ifdef HIPE
 #include "hipe_process.h"
@@ -251,13 +252,18 @@ typedef enum {
 #define ERTS_SSI_AUX_WORK_SET_TMO		(((erts_aint32_t) 1) << 0)
 #define ERTS_SSI_AUX_WORK_CHECK_CHILDREN	(((erts_aint32_t) 1) << 1)
 #define ERTS_SSI_AUX_WORK_MISC			(((erts_aint32_t) 1) << 2)
-#define ERTS_SSI_AUX_WORK_FIX_ALLOC_LOWER_LIM	(((erts_aint32_t) 1) << 3)
-#define ERTS_SSI_AUX_WORK_FIX_ALLOC_DEALLOC	(((erts_aint32_t) 1) << 4)
 #ifdef ERTS_SMP
-#define ERTS_SSI_AUX_WORK_DD			(((erts_aint32_t) 1) << 5)
-#define ERTS_SSI_AUX_WORK_DD_THR_PRGR		(((erts_aint32_t) 1) << 6)
+#define ERTS_SSI_AUX_WORK_MISC_THR_PRGR		(((erts_aint32_t) 1) << 3)
 #endif
-#define ERTS_SSI_AUX_WORK_MSEG_CACHE_CHECK	(((erts_aint32_t) 1) << 7)
+#define ERTS_SSI_AUX_WORK_FIX_ALLOC_LOWER_LIM	(((erts_aint32_t) 1) << 4)
+#define ERTS_SSI_AUX_WORK_FIX_ALLOC_DEALLOC	(((erts_aint32_t) 1) << 5)
+#define ERTS_SSI_AUX_WORK_ASYNC_READY		(((erts_aint32_t) 1) << 6)
+#define ERTS_SSI_AUX_WORK_ASYNC_READY_CLEAN	(((erts_aint32_t) 1) << 7)
+#ifdef ERTS_SMP
+#define ERTS_SSI_AUX_WORK_DD			(((erts_aint32_t) 1) << 8)
+#define ERTS_SSI_AUX_WORK_DD_THR_PRGR		(((erts_aint32_t) 1) << 9)
+#endif
+#define ERTS_SSI_AUX_WORK_MSEG_CACHE_CHECK	(((erts_aint32_t) 1) << 10)
 
 #if !HAVE_ERTS_MSEG
 #  undef ERTS_SSI_AUX_WORK_MSEG_CACHE_CHECK
@@ -404,6 +410,9 @@ typedef struct {
     ErtsSchedulerSleepInfo *ssi;
     struct {
 	int ix;
+#ifdef ERTS_SMP
+	ErtsThrPrgrVal thr_prgr;
+#endif
     } misc;
 #ifdef ERTS_SMP
     struct {
@@ -412,6 +421,15 @@ typedef struct {
 	void (*completed_arg)(void *);
     } dd;
 #endif
+#ifdef ERTS_USE_ASYNC_READY_Q
+    struct {
+#ifdef ERTS_SMP
+	int need_thr_prgr;
+	ErtsThrPrgrVal thr_prgr;
+#endif
+	void *queue;
+    } async_ready;
+#endif
 } ErtsAuxWorkData;
 
 struct ErtsSchedulerData_ {
@@ -1090,12 +1108,17 @@ Eterm erts_multi_scheduling_blockers(Process *);
 void erts_start_schedulers(void);
 void erts_alloc_notify_delayed_dealloc(int);
 void erts_smp_notify_check_children_needed(void);
-void
-erts_smp_schedule_misc_aux_work(int ignore_self,
-				int max_sched,
-				void (*func)(void *),
-				void *arg);
 #endif
+#if ERTS_USE_ASYNC_READY_Q
+void erts_notify_check_async_ready_queue(void *);
+#endif
+void erts_schedule_misc_aux_work(int sched_id,
+				 void (*func)(void *),
+				 void *arg);
+void erts_schedule_multi_misc_aux_work(int ignore_self,
+				       int max_sched,
+				       void (*func)(void *),
+				       void *arg);
 erts_aint32_t erts_set_aux_work_timeout(int, erts_aint32_t, int);
 void erts_sched_notify_check_cpu_bind(void);
 Uint erts_active_schedulers(void);
diff --git a/erts/emulator/beam/erl_thr_queue.c b/erts/emulator/beam/erl_thr_queue.c
new file mode 100644
index 0000000000..9ac4cd4b8e
--- /dev/null
+++ b/erts/emulator/beam/erl_thr_queue.c
@@ -0,0 +1,745 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2011. All Rights Reserved.
+ *
+ * The contents of this file are subject to the Erlang Public License,
+ * Version 1.1, (the "License"); you may not use this file except in
+ * compliance with the License. You should have received a copy of the
+ * Erlang Public License along with this software. If not, it can be
+ * retrieved online at http://www.erlang.org/.
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+/*
+ * Description: Lock-free queue for communication between threads.
+ *
+ *              Currently only a many-to-one version has been
+ *              implemented, i.e., many threads can enqueue but
+ *              only one thread can dequeue at a time. It doesn't
+ *              have to be the same thread dequeuing every time, but
+ *              synchronization so that only one thread dequeues
+ *              at a time has to be provided by other means.
+ *
+ *              When/If the need for a many-to-many queue arises,
+ *              this implementation can relatively easily be extended
+ *              to support that too.
+ *
+ *              Usage instructions below.
+ *
+ * Author: 	Rickard Green
+ */
+
+/*
+ * ------ Usage instructions -----------------------------------------------
+ *
+ * Dequeuing generates garbage that needs to be cleaned up.
+ * erts_thr_q_dequeue() automatically cleans, but garbage may have to be
+ * cleaned up also when the queue is empty. This is done by calling
+ * erts_thr_q_clean(). In the SMP case thread progress may have to be made
+ * before cleaning can continue. If so, erts_thr_q_need_thr_progress() in
+ * combination with erts_thr_progress_wakeup() can be used in order to
+ * request a wakeup at appropriate time.
+ *
+ * Enqueuing implies memory allocation and dequeuing implies memory
+ * deallocation. Memory allocation can be moved to another more suitable
+ * thread using  erts_thr_q_prepare_enqueue() together with
+ * erts_thr_q_enqueue_prepared() instead of using erts_thr_q_enqueue().
+ * Memory deallocation can be moved to another more suitable thread by
+ * disabling auto_finalize_dequeue when initializing the queue and then using
+ * erts_thr_q_get_finalize_dequeue_data() together with
+ * erts_thr_q_finalize_dequeue() after dequeuing or cleaning.
+ *
+ * Ending the life of the queue using either erts_thr_q_destroy()
+ * or erts_thr_q_finalize() implies cleaning the queue. Both functions
+ * return the cleaning result and may have to be called multiple times
+ * until the queue is clean. Once one of these functions has been called
+ * enqueuing is not allowed. This has to be synchronized by the user.
+ * If auto_finalize_dequeue has been disabled, the finalize dequeue
+ * functionality has to be called after ending the life of the queue just
+ * as when dequeuing or cleaning on a queue that is alive.
+ *
+ * -------------------------------------------------------------------------
+ */
+
+#ifdef HAVE_CONFIG_H
+#  include "config.h"
+#endif
+
+#include "erl_thr_queue.h"
+
+#if defined(DEBUG)
+#define ERTS_THR_Q_DBG_CHK_DATA 1
+#else
+#define ERTS_THR_Q_DBG_CHK_DATA 0
+#endif
+
+#define ERTS_THR_Q_MAX_CLEAN_REACHED_HEAD_COUNT 100
+#define ERTS_THR_Q_MAX_SCHED_CLEAN_OPS 50
+#define ERTS_THR_Q_MAX_DEQUEUE_CLEAN_OPS 3
+
+#define ERTS_THR_Q_MAX_FINI_DEQ_OPS 50
+
+#ifdef ERTS_SMP
+ERTS_SCHED_PREF_QUICK_ALLOC_IMPL(sl_element,
+				 ErtsThrQElement_t,
+				 1000,
+				 ERTS_ALC_T_THR_Q_EL_SL)
+#else
+
+static void
+init_sl_element_alloc(void)
+{
+}
+
+static ErtsThrQElement_t *
+sl_element_alloc(void)
+{
+    return erts_alloc(ERTS_ALC_T_THR_Q_EL_SL,
+		      sizeof(ErtsThrQElement_t));
+}
+
+static void
+sl_element_free(ErtsThrQElement_t *p)
+{
+    erts_free(ERTS_ALC_T_THR_Q_EL_SL, p);
+}
+
+#endif
+
+typedef union {
+    ErtsThrQ_t q;
+    char align__[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsThrQ_t))];
+} ErtsAlignedThrQ_t;
+
+void
+erts_thr_q_init(void)
+{
+    init_sl_element_alloc();
+}
+
+static void noop_callback(void *arg) { }
+
+void
+erts_thr_q_initialize(ErtsThrQ_t *q, ErtsThrQInit_t *qi)
+{   /* Initialize the caller-provided queue 'q' using the settings in 'qi'. */
+#ifndef USE_THREADS
+    q->init = *qi;
+    if (!q->init.notify)
+	q->init.notify = noop_callback; /* notify is never left NULL */
+    q->first = NULL;
+    q->last = NULL;
+    q->q.blk = NULL; /* erts_thr_q_create() sets this to its block */
+#else /* threaded build: the queue initially holds only the marker element */
+    erts_atomic_init_nob(&q->tail.data.marker.next.atmc, ERTS_AINT_NULL);
+    q->tail.data.marker.data.ptr = NULL;
+    erts_atomic_init_nob(&q->tail.data.last,
+			 (erts_aint_t) &q->tail.data.marker);
+    erts_atomic_init_nob(&q->tail.data.um_refc[0], 0);
+    erts_atomic_init_nob(&q->tail.data.um_refc[1], 0);
+    erts_atomic32_init_nob(&q->tail.data.um_refc_ix, 0);
+    q->tail.data.live = qi->live.objects;
+    q->tail.data.arg = qi->arg;
+    q->tail.data.notify = qi->notify;
+    if (!q->tail.data.notify)
+	q->tail.data.notify = noop_callback; /* notify is never left NULL */
+    /* Dequeue side: head, first and unref_end all start at the marker. */
+    q->head.head.ptr = &q->tail.data.marker;
+    q->head.live = qi->live.objects;
+    q->head.first = &q->tail.data.marker;
+    q->head.unref_end = &q->tail.data.marker;
+    q->head.clean_reached_head_count = 0;
+    q->head.deq_fini.automatic = qi->auto_finalize_dequeue;
+    q->head.deq_fini.start = NULL;
+    q->head.deq_fini.end = NULL;
+#ifdef ERTS_SMP
+    q->head.next.thr_progress = erts_thr_progress_current();
+    q->head.next.thr_progress_reached = 1;
+#endif
+    q->head.next.um_refc_ix = 1;
+    q->head.next.unref_end = &q->tail.data.marker;
+    q->head.used_marker = 1;
+    q->head.arg = qi->arg;
+    q->head.notify = q->tail.data.notify; /* same callback as the tail side */
+    q->q.finalizing = 0;
+    q->q.live = qi->live.queue;
+    q->q.blk = NULL; /* erts_thr_q_create() sets this to its block */
+#endif
+}
+
+ErtsThrQCleanState_t
+erts_thr_q_finalize(ErtsThrQ_t *q)
+{   /* End the life of 'q': drain it, then clean; returns the clean state. */
+#ifdef USE_THREADS
+    q->q.finalizing = 1; /* lets clean() call destroy() when empty and clean */
+#endif
+    while (erts_thr_q_dequeue(q)); /* discard elements until none is returned */
+    return erts_thr_q_clean(q);
+}
+
+ErtsThrQ_t *
+erts_thr_q_create(ErtsThrQInit_t *qi)
+{
+    ErtsAlcType_t atype;
+    ErtsThrQ_t *q, *qblk;
+    UWord qw;
+    /* Allocate + initialize a queue; alloc type chosen by qi->live.queue. */
+    switch (qi->live.queue) {
+    case ERTS_THR_Q_LIVE_SHORT:
+	atype = ERTS_ALC_T_THR_Q_SL;
+	break;
+    case ERTS_THR_Q_LIVE_LONG:
+	atype = ERTS_ALC_T_THR_Q_LL;
+	break;
+    default:
+	atype = ERTS_ALC_T_THR_Q;
+	break;
+    }
+    /* Over-allocate so the queue can start on a cache line boundary. */
+    qw = (UWord) erts_alloc(atype,
+			    sizeof(ErtsThrQ_t) + (ERTS_CACHE_LINE_SIZE-1));
+    qblk = (ErtsThrQ_t *) qw; /* block start; this is what destroy() frees */
+    if (qw & ERTS_CACHE_LINE_MASK)
+	qw = (qw & ~ERTS_CACHE_LINE_MASK) + ERTS_CACHE_LINE_SIZE;
+    ASSERT((qw & ERTS_CACHE_LINE_MASK) == 0);
+    q = (ErtsThrQ_t *) qw;
+    erts_thr_q_initialize(q, qi);
+    q->q.blk = qblk; /* after initialize, which resets blk to NULL */
+    return q;
+}
+
+ErtsThrQCleanState_t
+erts_thr_q_destroy(ErtsThrQ_t *q)
+{   /* Finalize a queue that was made by erts_thr_q_create(). */
+    if (!q->q.blk) /* blk is set by erts_thr_q_create(); NULL otherwise */
+	erl_exit(ERTS_ABORT_EXIT,
+		 "Trying to destroy not created thread queue\n");
+    return erts_thr_q_finalize(q); /* the cleaning result */
+}
+
+#ifdef USE_THREADS
+
+static void
+destroy(ErtsThrQ_t *q)
+{   /* Free the memory block recorded in q->q.blk. */
+    ErtsAlcType_t atype;
+    switch (q->q.live) { /* same type mapping as in erts_thr_q_create() */
+    case ERTS_THR_Q_LIVE_SHORT:
+	atype = ERTS_ALC_T_THR_Q_SL;
+	break;
+    case ERTS_THR_Q_LIVE_LONG:
+	atype = ERTS_ALC_T_THR_Q_LL;
+	break;
+    default:
+	atype = ERTS_ALC_T_THR_Q;
+	break;
+    }
+    erts_free(atype, q->q.blk); /* created queues lie inside blk */
+}
+
+#endif
+
+static ERTS_INLINE ErtsThrQElement_t *
+element_live_alloc(ErtsThrQLive_t live)
+{
+    switch (live) {
+    case ERTS_THR_Q_LIVE_SHORT:
+	return sl_element_alloc();
+    default:
+	return (ErtsThrQElement_t *) erts_alloc(ERTS_ALC_T_THR_Q_EL,
+						sizeof(ErtsThrQElement_t));
+    }
+}
+
+static ERTS_INLINE ErtsThrQElement_t *
+element_alloc(ErtsThrQ_t *q)
+{
+    ErtsThrQLive_t live;
+#ifdef USE_THREADS
+    live = q->tail.data.live;
+#else
+    live = q->init.live.objects;
+#endif
+    return element_live_alloc(live);
+}
+
+static ERTS_INLINE void
+element_live_free(ErtsThrQLive_t live, ErtsThrQElement_t *el)
+{
+    switch (live) {
+    case ERTS_THR_Q_LIVE_SHORT:
+	sl_element_free(el);
+	break;
+    default:
+	erts_free(ERTS_ALC_T_THR_Q_EL, el);
+    }
+}
+
+static ERTS_INLINE void
+element_free(ErtsThrQ_t *q, ErtsThrQElement_t *el)
+{
+    ErtsThrQLive_t live;
+#ifdef USE_THREADS
+    live = q->head.live;
+#else
+    live = q->init.live.objects;
+#endif
+    element_live_free(live, el);
+}
+
+#ifdef USE_THREADS
+
+static ERTS_INLINE ErtsThrQElement_t *
+enqueue_managed(ErtsThrQ_t *q, ErtsThrQElement_t *this, int want_last)
+{
+    erts_aint_t ilast, itmp;
+    /* Append 'this' to q. Returns NULL unless want_last is set. */
+    erts_atomic_init_nob(&this->next.atmc, ERTS_AINT_NULL);
+    /* Enqueue at end of list... */
+    /* Start at the recorded last element; follow next until it is NULL. */
+    ilast = erts_atomic_read_nob(&q->tail.data.last);
+    while (1) {
+	ErtsThrQElement_t *last = (ErtsThrQElement_t *) ilast;
+	itmp = erts_atomic_cmpxchg_mb(&last->next.atmc,
+				      (erts_aint_t) this,
+				      ERTS_AINT_NULL);
+	if (itmp == ERTS_AINT_NULL)
+	    break; /* next was NULL: 'this' is now linked in */
+	ilast = itmp; /* next was already set; retry on that element */
+    }
+
+    /* Move last pointer forward... */
+    while (1) {
+	if (want_last) {
+	    if (erts_atomic_read_rb(&this->next.atmc) != ERTS_AINT_NULL) {
+		/* Someone else will move it forward */
+		ilast = erts_atomic_read_rb(&q->tail.data.last);
+		return (ErtsThrQElement_t *) ilast;
+	    }
+	}
+	else {
+	    if (erts_atomic_read_nob(&this->next.atmc) != ERTS_AINT_NULL) {
+		/* Someone else will move it forward */
+		return NULL;
+	    }
+	}
+	itmp = erts_atomic_cmpxchg_mb(&q->tail.data.last,
+				      (erts_aint_t) this,
+				      ilast);
+	if (ilast == itmp) /* last was still ilast; it now refers to 'this' */
+	    return want_last ? this : NULL;
+	ilast = itmp; /* last had changed; retry with the value found */
+    }
+}
+
+static ErtsThrQCleanState_t
+clean(ErtsThrQ_t *q, int max_ops, int do_notify)
+{   /* Release up to max_ops already dequeued elements; report what remains. */
+    erts_aint_t ilast;
+    int um_refc_ix;
+    int ops;
+
+    for (ops = 0; ops < max_ops; ops++) {
+	ErtsThrQElement_t *tmp;
+    restart:
+	ASSERT(q->head.first);
+	if (q->head.first == q->head.head.ptr) { /* caught up with dequeuer */
+	    q->head.clean_reached_head_count++;
+	    if (q->head.clean_reached_head_count
+		>= ERTS_THR_Q_MAX_CLEAN_REACHED_HEAD_COUNT) {
+		q->head.clean_reached_head_count = 0;
+		break; /* every Nth time: run the full check after the loop */
+	    }
+	    goto inspect_head; /* short path: only look at the head element */
+	}
+	if (q->head.first == q->head.unref_end)	    /* rest not yet released */
+	    break;
+	if (q->head.first == &q->tail.data.marker) {
+	    q->head.used_marker = 0; /* marker is embedded in q: skip, never free */
+	    q->head.first = q->head.first->next.ptr;
+	    goto restart; /* skipping the marker does not count as an op */
+	}
+	tmp = q->head.first;
+	q->head.first = q->head.first->next.ptr;
+	if (q->head.deq_fini.automatic)
+	    element_free(q, tmp);
+	else { /* put tmp on the deq_fini list; it is freed by the caller later */
+	    tmp->data.ptr = (void *) (UWord) q->head.live; /* read back at free */
+	    if (!q->head.deq_fini.start)
+		q->head.deq_fini.start = tmp;
+	    else if (q->head.deq_fini.end->next.ptr == &q->tail.data.marker)
+		q->head.deq_fini.end->next.ptr = tmp; /* bypass skipped marker */
+	    q->head.deq_fini.end = tmp;
+	}
+    }
+
+    ilast = erts_atomic_read_nob(&q->tail.data.last);
+    if (q->head.first == ((ErtsThrQElement_t *) ilast)
+	&& ((ErtsThrQElement_t *) ilast) == &q->tail.data.marker
+	&& q->head.first == &q->tail.data.marker) {
+	/* Empty and clean queue: only the embedded marker is left */
+	if (q->q.finalizing)
+	    destroy(q);
+	return ERTS_THR_Q_CLEAN;
+    }
+
+#ifdef ERTS_SMP
+    if (q->head.next.thr_progress_reached
+	|| erts_thr_progress_has_reached(q->head.next.thr_progress)) {
+	q->head.next.thr_progress_reached = 1; /* cache the positive answer */
+#endif
+	um_refc_ix = q->head.next.um_refc_ix;
+	if (erts_atomic_read_acqb(&q->tail.data.um_refc[um_refc_ix]) == 0) {
+	    /* Move unreferenced end pointer forward... */
+	    q->head.clean_reached_head_count = 0;
+	    q->head.unref_end = q->head.next.unref_end;
+
+	    if (!q->head.used_marker
+		&& q->head.unref_end == (ErtsThrQElement_t *) ilast) {
+		q->head.used_marker = 1; /* append marker after the last element */
+		ilast = (erts_aint_t) enqueue_managed(q,
+						      &q->tail.data.marker,
+						      1);
+		if (q->head.head.ptr == q->head.unref_end) {
+		    ErtsThrQElement_t *next;
+		    next = ((ErtsThrQElement_t *)
+			    erts_atomic_read_acqb(&q->head.head.ptr->next.atmc));
+		    if (next == &q->tail.data.marker) { /* step head onto marker */
+			q->head.head.ptr->next.ptr = &q->tail.data.marker;
+			q->head.head.ptr = &q->tail.data.marker;
+		    }
+		}
+	    }
+
+	    if (q->head.unref_end == (ErtsThrQElement_t *) ilast)
+		ERTS_THR_MEMORY_BARRIER;
+	    else { /* start a new round: remember ilast as the next unref_end */
+		q->head.next.unref_end = (ErtsThrQElement_t *) ilast;
+		ERTS_THR_MEMORY_BARRIER;
+#ifdef ERTS_SMP
+		q->head.next.thr_progress = erts_thr_progress_later();
+#endif
+		erts_atomic32_set_relb(&q->tail.data.um_refc_ix,
+				       um_refc_ix); /* enqueue() now counts here */
+		q->head.next.um_refc_ix = um_refc_ix == 0 ? 1 : 0; /* other slot */
+#ifdef ERTS_SMP
+		q->head.next.thr_progress_reached = 0;
+#endif
+	    }
+	}
+#ifdef ERTS_SMP
+    }
+#endif
+
+    if (q->head.first == q->head.head.ptr) {
+    inspect_head:
+	if (!q->head.used_marker) {
+	    erts_aint_t inext;
+	    inext = erts_atomic_read_acqb(&q->head.head.ptr->next.atmc);
+	    if (inext == ERTS_AINT_NULL) { /* nothing is linked after head */
+		q->head.used_marker = 1;
+		(void) enqueue_managed(q, &q->tail.data.marker, 0);
+		inext = erts_atomic_read_acqb(&q->head.head.ptr->next.atmc);
+		if (inext == (erts_aint_t) &q->tail.data.marker) {
+		    q->head.head.ptr->next.ptr = &q->tail.data.marker;
+		    q->head.head.ptr = &q->tail.data.marker;
+#ifdef ERTS_SMP
+		    if (!q->head.next.thr_progress_reached)
+			return ERTS_THR_Q_NEED_THR_PRGR;
+#else
+		    if (do_notify)
+			q->head.notify(q->head.arg);
+#endif
+		    return ERTS_THR_Q_DIRTY; /* old head element is left to clean */
+		}
+	    }
+	}
+	return ERTS_THR_Q_CLEAN;
+    }
+
+    if (q->head.first != q->head.unref_end) { /* more elements can be released */
+	if (do_notify)
+	    q->head.notify(q->head.arg);
+	return ERTS_THR_Q_DIRTY;
+    }
+
+#ifdef ERTS_SMP
+    if (!q->head.next.thr_progress_reached)
+	return ERTS_THR_Q_NEED_THR_PRGR;
+#endif
+
+    return ERTS_THR_Q_CLEAN; /* Waiting for unmanaged threads to complete... */
+}
+
+#endif
+
+ErtsThrQCleanState_t
+erts_thr_q_clean(ErtsThrQ_t *q)
+{   /* Run clean() for at most ERTS_THR_Q_MAX_SCHED_CLEAN_OPS ops, no notify. */
+#ifdef USE_THREADS
+    return clean(q, ERTS_THR_Q_MAX_SCHED_CLEAN_OPS, 0);
+#else
+    return ERTS_THR_Q_CLEAN; /* without USE_THREADS always reported as clean */
+#endif
+}
+
+ErtsThrQCleanState_t
+erts_thr_q_inspect(ErtsThrQ_t *q, int ensure_empty)
+{   /* Report the clean state of q without freeing any elements. */
+#ifdef USE_THREADS
+    if (ensure_empty) { /* DIRTY unless nothing but the marker follows head */
+	erts_aint_t inext;
+	inext = erts_atomic_read_acqb(&q->head.head.ptr->next.atmc);
+	if (inext != ERTS_AINT_NULL) {
+	    if (&q->tail.data.marker != (ErtsThrQElement_t *) inext)
+		return ERTS_THR_Q_DIRTY; /* a data element is waiting */
+	    else { /* step head over the marker and look once more */
+		q->head.head.ptr->next.ptr = (ErtsThrQElement_t *) inext;
+		q->head.head.ptr = (ErtsThrQElement_t *) inext;
+		inext = erts_atomic_read_acqb(&q->head.head.ptr->next.atmc);
+		if (inext != ERTS_AINT_NULL)
+		    return ERTS_THR_Q_DIRTY;
+	    }
+	}
+    }
+
+    if (q->head.first == q->head.head.ptr) {
+	if (!q->head.used_marker) {
+	    erts_aint_t inext;
+	    inext = erts_atomic_read_acqb(&q->head.head.ptr->next.atmc);
+	    if (inext == ERTS_AINT_NULL)
+		return ERTS_THR_Q_DIRTY; /* clean() would enqueue the marker here */
+	}
+	return ERTS_THR_Q_CLEAN;
+    }
+
+    if (q->head.first != q->head.unref_end)
+	return ERTS_THR_Q_DIRTY; /* clean() still has elements to release */
+
+#ifdef ERTS_SMP
+    if (!q->head.next.thr_progress_reached)
+	return ERTS_THR_Q_NEED_THR_PRGR;
+#endif
+#endif
+    return ERTS_THR_Q_CLEAN; /* also the unconditional result without USE_THREADS */
+}
+
+static void
+enqueue(ErtsThrQ_t *q, void *data, ErtsThrQElement_t *this)
+{   /* Put 'data' in element 'this', link it last in q and notify if needed. */
+#ifndef USE_THREADS
+    ASSERT(data);
+    this->next.ptr = NULL;
+    this->data.ptr = data;
+
+    if (q->last) {
+	q->last->next.ptr = this;
+	q->last = this; /* advance tail; else a later enqueue unlinks 'this' */
+    } else {
+	q->first = q->last = this;
+	q->init.notify(q->init.arg); /* queue went from empty to non-empty */
+    }
+#else
+    int notify;
+    int um_refc_ix = 0;
+#ifdef ERTS_SMP
+    int unmanaged_thread;
+#endif
+
+#if ERTS_THR_Q_DBG_CHK_DATA
+    if (!data)
+	erl_exit(ERTS_ABORT_EXIT, "Missing data in enqueue\n");
+#endif
+
+    ASSERT(!q->q.finalizing);
+
+    this->data.ptr = data;
+
+#ifdef ERTS_SMP
+    unmanaged_thread = !erts_thr_progress_is_managed_thread();
+    if (unmanaged_thread)
+#endif
+    { /* Take a reference in the current um_refc slot; clean() waits for 0 */
+	um_refc_ix = erts_atomic32_read_acqb(&q->tail.data.um_refc_ix);
+	while (1) {
+	    int tmp_um_refc_ix;
+	    erts_atomic_inc_acqb(&q->tail.data.um_refc[um_refc_ix]);
+	    tmp_um_refc_ix = erts_atomic32_read_acqb(&q->tail.data.um_refc_ix);
+	    if (tmp_um_refc_ix == um_refc_ix)
+		break;
+	    erts_atomic_dec_relb(&q->tail.data.um_refc[um_refc_ix]); /* ix changed */
+	    um_refc_ix = tmp_um_refc_ix; /* retry with the slot now in use */
+	}
+    }
+
+    notify = this == enqueue_managed(q, this, 1);
+    /* notify is set when enqueue_managed() handed back 'this' (want_last = 1) */
+
+#ifdef ERTS_SMP
+    if (unmanaged_thread)
+#endif
+    {
+	if (notify)
+	    erts_atomic_dec_relb(&q->tail.data.um_refc[um_refc_ix]);
+	else if (erts_atomic_dec_read_relb(&q->tail.data.um_refc[um_refc_ix]) == 0)
+	    notify = 1; /* last reference dropped: clean() may make progress */
+    }
+    if (notify)
+	q->tail.data.notify(q->tail.data.arg);
+#endif
+}
+
+void
+erts_thr_q_enqueue(ErtsThrQ_t *q, void *data)
+{   /* Enqueue data on q in an element obtained from element_alloc(). */
+    enqueue(q, data, element_alloc(q));
+}
+
+ErtsThrQPrepEnQ_t *
+erts_thr_q_prepare_enqueue(ErtsThrQ_t *q)
+{   /* Allocate an element now, for erts_thr_q_enqueue_prepared() later. */
+    return (ErtsThrQPrepEnQ_t *) element_alloc(q); /* handed out as opaque type */
+}
+
+int
+erts_thr_q_get_finalize_dequeue_data(ErtsThrQ_t *q, ErtsThrQFinDeQ_t *fdp)
+{   /* Move q's pending finalize-dequeue list into *fdp; non-zero if any. */
+#ifndef USE_THREADS
+    return 0; /* *fdp is left untouched in this configuration */
+#else
+#ifdef DEBUG
+    if (!q->head.deq_fini.start) {
+	ASSERT(!q->head.deq_fini.end);
+    }
+    else { /* no listed element before 'end' may still be used by the queue */
+	ErtsThrQElement_t *e = q->head.deq_fini.start;
+	ErtsThrQElement_t *end = q->head.deq_fini.end;
+	while (e != end) {
+	    ASSERT(q->head.head.ptr != e);
+	    ASSERT(q->head.first != e);
+	    ASSERT(q->head.unref_end != e);
+	    e = e->next.ptr;
+	}
+    }	
+#endif
+    fdp->start = q->head.deq_fini.start;
+    fdp->end = q->head.deq_fini.end;
+    if (fdp->end)
+	fdp->end->next.ptr = NULL; /* terminate the list that is handed over */
+    q->head.deq_fini.start = NULL;
+    q->head.deq_fini.end = NULL;
+    return fdp->start != NULL;
+#endif
+}
+
+void
+erts_thr_q_append_finalize_dequeue_data(ErtsThrQFinDeQ_t *fdp0,
+					ErtsThrQFinDeQ_t *fdp1)
+{   /* Append the list in *fdp1 to the list in *fdp0; *fdp1 is not reset. */
+#ifdef USE_THREADS
+    if (fdp1->start) { /* nothing to do for an empty *fdp1 */
+	if (fdp0->end)
+	    fdp0->end->next.ptr = fdp1->start;
+	else
+	    fdp0->start = fdp1->start; /* *fdp0 was empty */
+	fdp0->end = fdp1->end;
+    }
+#endif
+}
+
+
+int erts_thr_q_finalize_dequeue(ErtsThrQFinDeQ_t *state)
+{   /* Free at most ERTS_THR_Q_MAX_FINI_DEQ_OPS listed elements per call. */
+#ifdef USE_THREADS
+    ErtsThrQElement_t *start = state->start;
+    if (start) {
+	ErtsThrQLive_t live;
+	int i;
+	for (i = 0; i < ERTS_THR_Q_MAX_FINI_DEQ_OPS; i++) {
+	    ErtsThrQElement_t *tmp;
+	    if (!start)
+		break;
+	    tmp = start;
+	    start = start->next.ptr; /* read the link before tmp is freed */
+	    live = (ErtsThrQLive_t) (UWord) tmp->data.ptr; /* stored by clean() */
+	    element_live_free(live, tmp);
+	}
+	state->start = start; /* resume point for the next call */
+	if (start)
+	    return 1; /* More to do */
+	state->end = NULL;
+    }
+#endif
+    return 0; /* list is empty (always the case without USE_THREADS) */
+}
+
+void
+erts_thr_q_finalize_dequeue_state_init(ErtsThrQFinDeQ_t *state)
+{   /* Make *state an empty finalize-dequeue list (no-op w/o USE_THREADS). */
+#ifdef USE_THREADS
+    state->start = NULL;
+    state->end = NULL;
+#endif
+}
+
+
+void
+erts_thr_q_enqueue_prepared(ErtsThrQ_t *q, void *data, ErtsThrQPrepEnQ_t *prep)
+{   /* Enqueue data in the element from erts_thr_q_prepare_enqueue(). */
+    ASSERT(prep);
+    enqueue(q, data, (ErtsThrQElement_t *) prep); /* undo the opaque cast */
+}
+
+void *
+erts_thr_q_dequeue(ErtsThrQ_t *q)
+{   /* Return the data of the oldest queued element, or NULL if none. */
+#ifndef USE_THREADS
+    void *res;
+    ErtsThrQElement_t *tmp;
+
+    if (!q->first)
+	return NULL;
+    tmp = q->first;
+    res = tmp->data.ptr;
+    q->first = tmp->next.ptr;
+    if (!q->first)
+	q->last = NULL; /* queue became empty */
+
+    element_free(q, tmp);
+
+    return res;
+#else
+    erts_aint_t inext;
+    void *res;
+
+    inext = erts_atomic_read_acqb(&q->head.head.ptr->next.atmc);
+    if (inext == ERTS_AINT_NULL)
+	return NULL; /* nothing linked after the current head */
+    q->head.head.ptr->next.ptr = (ErtsThrQElement_t *) inext;
+    q->head.head.ptr = (ErtsThrQElement_t *) inext;
+    if (q->head.head.ptr == &q->tail.data.marker) { /* marker carries no data */
+	inext = erts_atomic_read_acqb(&q->head.head.ptr->next.atmc);
+	if (inext == ERTS_AINT_NULL)
+	    return NULL; /* head stays on the marker */
+	q->head.head.ptr->next.ptr = (ErtsThrQElement_t *) inext;
+	q->head.head.ptr = (ErtsThrQElement_t *) inext;
+    }
+    res = q->head.head.ptr->data.ptr; /* element itself is released by clean() */
+#if ERTS_THR_Q_DBG_CHK_DATA
+    q->head.head.ptr->data.ptr = NULL;
+    if (!res)
+	erl_exit(ERTS_ABORT_EXIT, "Missing data in dequeue\n");
+#endif
+    clean(q, /* returned clean state is ignored here */
+	  (q->head.deq_fini.automatic
+	   ? ERTS_THR_Q_MAX_DEQUEUE_CLEAN_OPS
+	   : ERTS_THR_Q_MAX_SCHED_CLEAN_OPS), 1);
+    return res;
+#endif
+}
diff --git a/erts/emulator/beam/erl_thr_queue.h b/erts/emulator/beam/erl_thr_queue.h
new file mode 100644
index 0000000000..407c23f5eb
--- /dev/null
+++ b/erts/emulator/beam/erl_thr_queue.h
@@ -0,0 +1,211 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2011. All Rights Reserved.
+ *
+ * The contents of this file are subject to the Erlang Public License,
+ * Version 1.1, (the "License"); you may not use this file except in
+ * compliance with the License. You should have received a copy of the
+ * Erlang Public License along with this software. If not, it can be
+ * retrieved online at http://www.erlang.org/.
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+/*
+ * Description: Lock-free queue for communication between threads.
+ *
+ *              Currently only a many-to-one version has been
+ *              implemented, i.e., many threads can enqueue but
+ *              only one thread can dequeue at a time. It doesn't
+ *              have to be the same thread dequeuing every time, but
+ *              synchronization so that only one thread dequeues
+ *              at a time has to be provided by other means.
+ *
+ *              When/If the need for a many-to-many queue arises,
+ *              this implementation can relatively easily be extended
+ *              to support that too.
+ *
+ *              Usage instructions can be found in erl_thr_queue.c
+ *
+ * Author: 	Rickard Green
+ */
+
+#ifndef ERL_THR_QUEUE_H__
+#define ERL_THR_QUEUE_H__
+
+#include "sys.h"
+#include "erl_threads.h"
+#include "erl_alloc.h"
+#include "erl_thr_progress.h"
+
+typedef enum { /* NOTE(review): presumably an expected-lifetime hint -- confirm */
+    ERTS_THR_Q_LIVE_UNDEF,
+    ERTS_THR_Q_LIVE_SHORT,
+    ERTS_THR_Q_LIVE_LONG
+} ErtsThrQLive_t; /* the value is what element_live_free() gets as first arg */
+
+#define ERTS_THR_Q_INIT_DEFAULT	/* initializer for ErtsThrQInit_t */	\
+{									\
+    {									\
+	ERTS_THR_Q_LIVE_UNDEF,	/* live.queue */			\
+	ERTS_THR_Q_LIVE_SHORT	/* live.objects */			\
+    },									\
+    NULL,	/* arg */						\
+    NULL,	/* notify */						\
+    1		/* auto_finalize_dequeue */				\
+}
+
+typedef struct ErtsThrQ_t_ ErtsThrQ_t;
+
+typedef struct { /* Creation parameters; see ERTS_THR_Q_INIT_DEFAULT */
+    struct {
+	ErtsThrQLive_t queue;
+	ErtsThrQLive_t objects;
+    } live;
+    void *arg; /* argument passed to notify() */
+    void (*notify)(void *); /* callback, invoked as notify(arg) */
+    int auto_finalize_dequeue; /* NOTE(review): presumably deq_fini.automatic */
+} ErtsThrQInit_t;
+
+typedef struct ErtsThrQElement_t_ ErtsThrQElement_t;
+typedef struct ErtsThrQElement_t ErtsThrQPrepEnQ_t; /* NOTE(review): tag has no trailing '_', so this is a distinct incomplete type; only used via casts -- confirm intended */
+
+typedef union { /* Element link with an atomic and a plain view */
+    erts_atomic_t atmc; /* used for the acquire reads of a next pointer */
+    ErtsThrQElement_t *ptr; /* used by the dequeuing side once the link is read */
+} ErtsThrQPtr_t;
+
+struct ErtsThrQElement_t_ { /* One queue element */
+    ErtsThrQPtr_t next; /* following element; NULL/ERTS_AINT_NULL when last */
+    union {
+	erts_atomic_t atmc;
+	void *ptr; /* enqueued data; reused for the live value on the fini list */
+    } data;
+};
+
+typedef struct { /* List of dequeued elements waiting to be freed */
+    ErtsThrQElement_t *start; /* first element, NULL when the list is empty */
+    ErtsThrQElement_t *end; /* last element */
+} ErtsThrQFinDeQ_t;
+
+typedef enum { /* Result of the clean/inspect operations */
+    ERTS_THR_Q_CLEAN, /* nothing more to release for now */
+#ifdef ERTS_SMP
+    ERTS_THR_Q_NEED_THR_PRGR, /* awaited thread progress not yet reached */
+#endif
+    ERTS_THR_Q_DIRTY, /* elements remain that a clean call can work on */
+} ErtsThrQCleanState_t;
+
+#ifdef USE_THREADS
+
+typedef struct { /* Tail part of a queue; see "Modified by threads enqueuing" */
+    ErtsThrQElement_t marker; /* embedded dummy element, enqueued by clean() */
+    erts_atomic_t last; /* pointer to the last element, updated by cmpxchg */
+    erts_atomic_t um_refc[2]; /* reference counts taken in enqueue() */
+    erts_atomic32_t um_refc_ix; /* um_refc slot that enqueue() currently uses */
+    ErtsThrQLive_t live;
+#ifdef ERTS_SMP
+    erts_atomic32_t thr_prgr_clean_scheduled;
+#endif
+    void *arg; /* argument for notify() */
+    void (*notify)(void *); /* called by enqueue() when its notify flag is set */
+} ErtsThrQTail_t;
+
+struct ErtsThrQ_t_ {
+    /*
+     * This structure needs to be cache line aligned for best
+     * performance.
+     */
+    union {
+	/* Modified by threads enqueuing */
+	ErtsThrQTail_t data;
+	char align__[ERTS_ALC_CACHE_LINE_ALIGN_SIZE(sizeof(ErtsThrQTail_t))];
+    } tail;
+    /*
+     * Everything below this point is *only* accessed by the
+     * thread dequeuing.
+     */
+    struct {
+	ErtsThrQPtr_t head; /* element most recently dequeued */
+	ErtsThrQLive_t live; /* stored in elements put on the deq_fini list */
+	ErtsThrQElement_t *first; /* oldest element not yet released by clean() */
+	ErtsThrQElement_t *unref_end; /* clean() does not release past this */
+	int clean_reached_head_count; /* times clean() found first == head */
+	struct {
+	    int automatic; /* non-zero: clean() frees elements itself */
+	    ErtsThrQElement_t *start; /* else: elements are listed here ... */
+	    ErtsThrQElement_t *end; /* ... for erts_thr_q_finalize_dequeue() */
+	} deq_fini;
+	struct { /* state for the next move of unref_end */
+#ifdef ERTS_SMP
+	    ErtsThrPrgrVal thr_progress; /* value from erts_thr_progress_later() */
+	    int thr_progress_reached; /* cached: thr_progress has been reached */
+#endif
+	    int um_refc_ix; /* tail.data.um_refc slot that must drop to zero */
+	    ErtsThrQElement_t *unref_end; /* becomes unref_end at the next move */
+	} next;
+	int used_marker; /* non-zero while the marker is linked into the queue */
+	void *arg; /* argument for notify() */
+	void (*notify)(void *); /* called by clean() when do_notify is set */
+    } head;
+    struct {
+	int finalizing; /* non-zero: clean() destroys q once empty and clean */
+	ErtsThrQLive_t live;
+	void *blk;
+    } q;
+};
+
+#else /* !USE_THREADS */
+
+struct ErtsThrQ_t_ { /* Plain singly linked FIFO used without USE_THREADS */
+    ErtsThrQInit_t init; /* init.notify(init.arg) is called by enqueue() */
+    ErtsThrQElement_t *first; /* next element to dequeue, NULL when empty */
+    ErtsThrQElement_t *last; /* tail element, NULL when empty */
+    struct {
+	void *blk;
+    } q;
+};
+
+#endif
+
+void erts_thr_q_init(void);
+void erts_thr_q_initialize(ErtsThrQ_t *, ErtsThrQInit_t *);
+ErtsThrQCleanState_t erts_thr_q_finalize(ErtsThrQ_t *);
+ErtsThrQ_t *erts_thr_q_create(ErtsThrQInit_t *);
+ErtsThrQCleanState_t erts_thr_q_destroy(ErtsThrQ_t *);
+ErtsThrQCleanState_t erts_thr_q_clean(ErtsThrQ_t *);
+ErtsThrQCleanState_t erts_thr_q_inspect(ErtsThrQ_t *, int);
+ErtsThrQPrepEnQ_t *erts_thr_q_prepare_enqueue(ErtsThrQ_t *);
+void erts_thr_q_enqueue_prepared(ErtsThrQ_t *, void *, ErtsThrQPrepEnQ_t *);
+void erts_thr_q_enqueue(ErtsThrQ_t *, void *);
+void * erts_thr_q_dequeue(ErtsThrQ_t *);
+int erts_thr_q_get_finalize_dequeue_data(ErtsThrQ_t *,
+					 ErtsThrQFinDeQ_t *);
+void erts_thr_q_append_finalize_dequeue_data(ErtsThrQFinDeQ_t *,
+					     ErtsThrQFinDeQ_t *);
+int erts_thr_q_finalize_dequeue(ErtsThrQFinDeQ_t *);
+void erts_thr_q_finalize_dequeue_state_init(ErtsThrQFinDeQ_t *);
+
+#ifdef ERTS_SMP
+ERTS_GLB_INLINE ErtsThrPrgrVal erts_thr_q_need_thr_progress(ErtsThrQ_t *q);
+#endif
+
+#if ERTS_GLB_INLINE_INCL_FUNC_DEF
+
+#ifdef ERTS_SMP
+ERTS_GLB_INLINE ErtsThrPrgrVal
+erts_thr_q_need_thr_progress(ErtsThrQ_t *q)
+{   /* Thread progress value that the queue's next clean step waits for. */
+    return q->head.next.thr_progress;
+}
+#endif
+
+#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */
+
+#endif /* ERL_THR_QUEUE_H__ */
diff --git a/erts/emulator/beam/global.h b/erts/emulator/beam/global.h
index 684e910fc3..4a4973baab 100644
--- a/erts/emulator/beam/global.h
+++ b/erts/emulator/beam/global.h
@@ -42,12 +42,6 @@
 typedef struct port Port;
 #include "erl_port_task.h"
 
-#define ERTS_MAX_NO_OF_ASYNC_THREADS 1024
-extern int erts_async_max_threads;
-#define ERTS_ASYNC_THREAD_MIN_STACK_SIZE 16	/* Kilo words */
-#define ERTS_ASYNC_THREAD_MAX_STACK_SIZE 8192	/* Kilo words */
-extern int erts_async_thread_suggested_stack_size;
-
 typedef struct erts_driver_t_ erts_driver_t;
 
 #define SMALL_IO_QUEUE 5   /* Number of fixed elements */
diff --git a/erts/emulator/beam/io.c b/erts/emulator/beam/io.c
index 151c776a3d..fff720634d 100644
--- a/erts/emulator/beam/io.c
+++ b/erts/emulator/beam/io.c
@@ -42,6 +42,7 @@
 #include "erl_bits.h"
 #include "erl_version.h"
 #include "error.h"
+#include "erl_async.h"
 
 extern ErlDrvEntry fd_driver_entry;
 extern ErlDrvEntry vanilla_driver_entry;
@@ -4579,7 +4580,10 @@ int driver_lock_driver(ErlDrvPort ix)
 
     erts_smp_mtx_lock(&erts_driver_list_lock);
 
-    if (prt == NULL) return -1;
+    if (prt == NULL) {
+	erts_smp_mtx_unlock(&erts_driver_list_lock);
+	return -1;
+    }
 
     ERTS_SMP_LC_ASSERT(erts_lc_is_port_locked(prt));
     if ((dh = (DE_Handle*)prt->drv_ptr->handle ) == NULL) {
diff --git a/erts/emulator/beam/sys.h b/erts/emulator/beam/sys.h
index b63fe98f27..f9cbcc5892 100644
--- a/erts/emulator/beam/sys.h
+++ b/erts/emulator/beam/sys.h
@@ -475,15 +475,6 @@ __decl_noreturn void __noreturn erl_exit(int n, char*, ...);
 #define ERTS_ABORT_EXIT	(INT_MIN + 1)	/* no crash dump; only abort() */
 #define ERTS_DUMP_EXIT	(127)		/* crash dump; then exit() */
 
-
-#ifndef ERTS_SMP
-int check_async_ready(void);
-#ifdef USE_THREADS
-void sys_async_ready(int hndl);
-int erts_register_async_ready_callback(void (*funcp)(void));
-#endif
-#endif
-
 Eterm erts_check_io_info(void *p);
 
 /* Size of misc memory allocated from system dependent code */
@@ -671,6 +662,8 @@ int erts_sys_putenv(char *key_value, int sep_ix);
    *size), a value > 0 if value buffer is too small (*size is set to needed
    size), and a value < 0 on failure. */
 int erts_sys_getenv(char *key, char *value, size_t *size);
+/* erts_sys_getenv__() is only allowed to be used in early init phase */
+int erts_sys_getenv__(char *key, char *value, size_t *size);
 
 /* Easier to use, but not as efficient, environment functions */
 char *erts_read_env(char *key);
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index 65485241aa..1bd178f280 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -43,6 +43,7 @@
 #include "erl_smp.h"
 #include "erl_time.h"
 #include "erl_thr_progress.h"
+#include "erl_thr_queue.h"
 #include "erl_sched_spec_pre_alloc.h"
 
 #undef M_TRIM_THRESHOLD
diff --git a/erts/emulator/sys/unix/sys.c b/erts/emulator/sys/unix/sys.c
index d7c4812dad..c6b63350e5 100644
--- a/erts/emulator/sys/unix/sys.c
+++ b/erts/emulator/sys/unix/sys.c
@@ -128,7 +128,6 @@ static ErtsSysReportExit *report_exit_list;
 static ErtsSysReportExit *report_exit_transit_list;
 #endif
 
-extern int  check_async_ready(void);
 extern int  driver_interrupt(int, int);
 extern void do_break(void);
 
@@ -1120,31 +1119,6 @@ struct erl_drv_entry vanilla_driver_entry = {
     stop_select
 };
 
-#if defined(USE_THREADS) && !defined(ERTS_SMP)
-static int  async_drv_init(void);
-static ErlDrvData async_drv_start(ErlDrvPort, char*, SysDriverOpts*);
-static void async_drv_stop(ErlDrvData);
-static void async_drv_input(ErlDrvData, ErlDrvEvent);
-
-/* INTERNAL use only */
-
-struct erl_drv_entry async_driver_entry = {
-    async_drv_init,
-    async_drv_start,
-    async_drv_stop,
-    NULL,
-    async_drv_input,
-    NULL,
-    "async",
-    NULL,
-    NULL,
-    NULL,
-    NULL,
-    NULL,
-    NULL
-};
-#endif
-
 /* Handle SIGCHLD signals. */
 #if (defined(SIG_SIGSET) || defined(SIG_SIGNAL))
 static RETSIGTYPE onchld(void)
@@ -2329,87 +2303,6 @@ static void stop_select(ErlDrvEvent fd, void* _)
     close((int)fd);
 }
 
-/*
-** Async opertation support
-*/
-#if defined(USE_THREADS) && !defined(ERTS_SMP)
-static void
-sys_async_ready_failed(int fd, int r, int err)
-{
-    char buf[120];
-    sprintf(buf, "sys_async_ready(): Fatal error: fd=%d, r=%d, errno=%d\n",
-	     fd, r, err);
-    erts_silence_warn_unused_result(write(2, buf, strlen(buf)));
-    abort();
-}
-
-/* called from threads !! */
-void sys_async_ready(int fd)
-{
-    int r;
-    while (1) {
-	r = write(fd, "0", 1);  /* signal main thread fd MUST be async_fd[1] */
-	if (r == 1) {
-	    DEBUGF(("sys_async_ready(): r = 1\r\n"));
-	    break;
-	}
-	if (r < 0 && errno == EINTR) {
-	    DEBUGF(("sys_async_ready(): r = %d\r\n", r));
-	    continue;
-	}
-	sys_async_ready_failed(fd, r, errno);
-    }
-}
-
-static int async_drv_init(void)
-{
-    async_fd[0] = -1;
-    async_fd[1] = -1;
-    return 0;
-}
-
-static ErlDrvData async_drv_start(ErlDrvPort port_num,
-				  char* name, SysDriverOpts* opts)
-{
-    if (async_fd[0] != -1)
-	return ERL_DRV_ERROR_GENERAL;
-    if (pipe(async_fd) < 0)
-	return ERL_DRV_ERROR_GENERAL;
-
-    DEBUGF(("async_drv_start: %d\r\n", port_num));
-
-    SET_NONBLOCKING(async_fd[0]);
-    driver_select(port_num, async_fd[0], ERL_DRV_READ, 1);
-
-    if (init_async(async_fd[1]) < 0)
-	return ERL_DRV_ERROR_GENERAL;
-    return (ErlDrvData)port_num;
-}
-
-static void async_drv_stop(ErlDrvData e)
-{
-    int port_num = (int)(long)e;
-
-    DEBUGF(("async_drv_stop: %d\r\n", port_num));
-
-    exit_async();
-
-    driver_select(port_num, async_fd[0], ERL_DRV_READ, 0);
-
-    close(async_fd[0]);
-    close(async_fd[1]);
-    async_fd[0] = async_fd[1] = -1;
-}
-
-
-static void async_drv_input(ErlDrvData e, ErlDrvEvent fd)
-{
-    char *buf[32];
-    DEBUGF(("async_drv_input\r\n"));
-    while (read((int) fd, (void *) buf, 32) > 0); /* fd MUST be async_fd[0] */
-    check_async_ready();  /* invoke all async_ready */
-}
-#endif
 
 void erts_do_break_handling(void)
 {
@@ -2483,12 +2376,10 @@ erts_sys_putenv(char *buffer, int sep_ix)
 }
 
 int
-erts_sys_getenv(char *key, char *value, size_t *size)
+erts_sys_getenv__(char *key, char *value, size_t *size)
 {
-    char *orig_value;
     int res;
-    erts_smp_rwmtx_rlock(&environ_rwmtx);
-    orig_value = getenv(key);
+    char *orig_value = getenv(key);
     if (!orig_value)
 	res = -1;
     else {
@@ -2503,6 +2394,15 @@ erts_sys_getenv(char *key, char *value, size_t *size)
 	    res = 0;
 	}
     }
+    return res;
+}
+
+int
+erts_sys_getenv(char *key, char *value, size_t *size)
+{   /* Locked wrapper: erts_sys_getenv__() under a read lock on environ_rwmtx. */
+    int res;
+    erts_smp_rwmtx_rlock(&environ_rwmtx);
+    res = erts_sys_getenv__(key, value, size);
     erts_smp_rwmtx_runlock(&environ_rwmtx);
     return res;
 }
@@ -2514,31 +2414,6 @@ sys_init_io(void)
 	erts_alloc(ERTS_ALC_T_FD_TAB, max_files * sizeof(struct fd_data));
     erts_smp_atomic_add_nob(&sys_misc_mem_sz,
 			    max_files * sizeof(struct fd_data));
-
-#ifdef USE_THREADS
-#ifdef ERTS_SMP
-    if (init_async(-1) < 0)
-	erl_exit(1, "Failed to initialize async-threads\n");
-#else
-    {
-	/* This is speical stuff, starting a driver from the 
-	 * system routines, but is a nice way of handling stuff
-	 * the erlang way
-	 */
-	SysDriverOpts dopts;
-	int ret;
-
-	sys_memset((void*)&dopts, 0, sizeof(SysDriverOpts));
-	add_driver_entry(&async_driver_entry);
-	ret = erts_open_driver(NULL, NIL, "async", &dopts, NULL);
-	DEBUGF(("open_driver = %d\n", ret));
-	if (ret < 0)
-	    erl_exit(1, "Failed to open async driver\n");
-	erts_port[ret].status |= ERTS_PORT_SFLG_IMMORTAL;
-    }
-#endif
-#endif
-
 }
 
 #if (0) /* unused? */
@@ -2765,15 +2640,7 @@ initiate_report_exit_status(ErtsSysReportExit *rep, int status)
     rep->next = report_exit_transit_list;
     rep->status = status;
     report_exit_transit_list = rep;
-    /*
-     * We need the scheduler thread to call check_children().
-     * If the scheduler thread is sleeping in a poll with a
-     * timeout, we need to wake the scheduler thread. We use the
-     * functionality of the async driver to do this, instead of
-     * implementing yet another driver doing the same thing. A
-     * little bit ugly, but it works...
-     */
-    sys_async_ready(async_fd[1]);
+    erts_sys_schedule_interrupt(1);
 }
 
 static int check_children(void)
@@ -2860,19 +2727,11 @@ erl_sys_schedule(int runnable)
 {
 #ifdef ERTS_SMP
     ERTS_CHK_IO(!runnable);
-    ERTS_SMP_LC_ASSERT(!erts_thr_progress_is_blocking());
 #else
-    if (runnable) {
-	ERTS_CHK_IO(0);		/* Poll for I/O */
-	check_async_ready();	/* Check async completions */
-    } else {
-	int wait_for_io = !check_async_ready();
-	if (wait_for_io)
-	    wait_for_io = !check_children();
-	ERTS_CHK_IO(wait_for_io);
-    }
-    (void) check_children();
+    ERTS_CHK_IO(runnable ? 0 : !check_children());
 #endif
+    ERTS_SMP_LC_ASSERT(!erts_thr_progress_is_blocking());
+    (void) check_children();
 }
 
 
diff --git a/erts/emulator/sys/vxworks/sys.c b/erts/emulator/sys/vxworks/sys.c
index 97a2ae7f7b..d6d1fe64e0 100644
--- a/erts/emulator/sys/vxworks/sys.c
+++ b/erts/emulator/sys/vxworks/sys.c
@@ -1520,6 +1520,12 @@ erts_sys_getenv(char *key, char *value, size_t *size)
     return res;
 }
 
+int
+erts_sys_getenv__(char *key, char *value, size_t *size)
+{   /* On this platform simply forwards to erts_sys_getenv(). */
+    return erts_sys_getenv(key, value, size);
+}
+
 void
 sys_init_io(void)
 {
diff --git a/erts/emulator/sys/win32/sys.c b/erts/emulator/sys/win32/sys.c
index ace1e1fca0..02d16b83a2 100644
--- a/erts/emulator/sys/win32/sys.c
+++ b/erts/emulator/sys/win32/sys.c
@@ -566,51 +566,6 @@ struct erl_drv_entry vanilla_driver_entry = {
     stop_select
 };
 
-#if defined(USE_THREADS) && !defined(ERTS_SMP)
-
-static int  async_drv_init(void);
-static ErlDrvData async_drv_start(ErlDrvPort, char*, SysDriverOpts*);
-static void async_drv_stop(ErlDrvData);
-static void async_drv_input(ErlDrvData, ErlDrvEvent);
-
-/* INTERNAL use only */
-
-void null_output(ErlDrvData drv_data, char* buf, int len)
-{
-}
-
-void null_ready_output(ErlDrvData drv_data, ErlDrvEvent event)
-{
-}
-
-struct erl_drv_entry async_driver_entry = {
-    async_drv_init,
-    async_drv_start,
-    async_drv_stop,
-    null_output,
-    async_drv_input,
-    null_ready_output,
-    "async",
-    NULL, /* finish */
-    NULL, /* handle */
-    NULL, /* control */
-    NULL, /* timeout */
-    NULL, /* outputv */
-    NULL, /* ready_async */
-    NULL, /* flush */
-    NULL, /* call */
-    NULL, /* event */
-    ERL_DRV_EXTENDED_MARKER,
-    ERL_DRV_EXTENDED_MAJOR_VERSION,
-    ERL_DRV_EXTENDED_MINOR_VERSION,
-    0,	/* ERL_DRV_FLAGs */
-    NULL,
-    NULL, /* process_exit */
-    stop_select
-};
-
-#endif
-
 /*
  * Initialises a DriverData structure.
  *
@@ -2825,30 +2780,6 @@ sys_init_io(void)
        We estimate the number to twice the amount of ports. 
        We really dont know on windows, do we? */
     max_files = 2*erts_max_ports;
-    
-#ifdef USE_THREADS
-#ifdef ERTS_SMP
-    if (init_async(-1) < 0)
-	erl_exit(1, "Failed to initialize async-threads\n");
-#else
-    {
-	/* This is special stuff, starting a driver from the 
-	 * system routines, but is a nice way of handling stuff
-	 * the erlang way
-	 */
-	SysDriverOpts dopts;
-	int ret;
-
-	sys_memset((void*)&dopts, 0, sizeof(SysDriverOpts));
-	add_driver_entry(&async_driver_entry);
-	ret = erts_open_driver(NULL, NIL, "async", &dopts, NULL);
-	DEBUGF(("open_driver = %d\n", ret));
-	if (ret < 0)
-	    erl_exit(1, "Failed to open async driver\n");
-	erts_port[ret].status |= ERTS_PORT_SFLG_IMMORTAL;
-    }
-#endif
-#endif
 }
 
 #ifdef ERTS_SMP
@@ -3382,75 +3313,7 @@ erts_sys_schedule_interrupt_timed(int set, long msec)
 void
 erl_sys_schedule(int runnable)
 {
-#ifdef ERTS_SMP
     erts_check_io(!runnable);
     ERTS_SMP_LC_ASSERT(!erts_thr_progress_is_blocking());
-#else
-    if (runnable) {
-	erts_check_io(0);	/* Poll for I/O */
-	check_async_ready();	/* Check async completions */
-    } else {
-	erts_check_io(check_async_ready() ? 0 : 1);
-    }
-#endif
-}
-
-#if defined(USE_THREADS) && !defined(ERTS_SMP)
-/*
- * Async operation support.
- */
-
-static ErlDrvEvent async_drv_event;
-
-void
-sys_async_ready(int fd)
-{
-    SetEvent((HANDLE)async_drv_event);
 }
 
-static int
-async_drv_init(void)
-{
-    async_drv_event = (ErlDrvEvent) NULL;
-    return 0;
-}
-
-static ErlDrvData
-async_drv_start(ErlDrvPort port_num, char* name, SysDriverOpts* opts)
-{
-    if (async_drv_event != (ErlDrvEvent) NULL) {
-	return ERL_DRV_ERROR_GENERAL;
-    }
-    if ((async_drv_event = (ErlDrvEvent)CreateAutoEvent(FALSE)) == (ErlDrvEvent) NULL) {
-	return ERL_DRV_ERROR_GENERAL;
-    }
-
-    driver_select(port_num, async_drv_event, ERL_DRV_READ|ERL_DRV_USE, 1);
-    if (init_async(async_drv_event) < 0) {
-	return ERL_DRV_ERROR_GENERAL;
-    }
-    return (ErlDrvData)port_num;
-}
-
-static void
-async_drv_stop(ErlDrvData port_num)
-{
-    exit_async();
-    driver_select((ErlDrvPort)port_num, async_drv_event, ERL_DRV_READ|ERL_DRV_USE, 0);
-    /*CloseHandle((HANDLE)async_drv_event);*/
-    async_drv_event = (ErlDrvEvent) NULL;
-}
-
-
-static void
-async_drv_input(ErlDrvData port_num, ErlDrvEvent e) 
-{
-    check_async_ready();
-
-    /*
-     * Our event is auto-resetting.
-     */
-}
-
-#endif
-
diff --git a/erts/emulator/sys/win32/sys_env.c b/erts/emulator/sys/win32/sys_env.c
index 02c8433a10..7acc7f07ee 100644
--- a/erts/emulator/sys/win32/sys_env.c
+++ b/erts/emulator/sys/win32/sys_env.c
@@ -55,19 +55,17 @@ erts_sys_putenv(char *key_value, int sep_ix)
 }
 
 int
-erts_sys_getenv(char *key, char *value, size_t *size)
+erts_sys_getenv__(char *key, char *value, size_t *size)
 {
     size_t req_size = 0;
     int res = 0;
     DWORD new_size;
 
-    erts_smp_rwmtx_rlock(&environ_rwmtx);
     SetLastError(0);
     new_size = GetEnvironmentVariable((LPCTSTR) key,
 				      (LPTSTR) value,
 				      (DWORD) *size);
     res = !new_size && GetLastError() == ERROR_ENVVAR_NOT_FOUND ? -1 : 0;
-    erts_smp_rwmtx_runlock(&environ_rwmtx);
     if (res < 0)
 	return res;
     res = new_size > *size ? 1 : 0;
@@ -75,6 +73,16 @@ erts_sys_getenv(char *key, char *value, size_t *size)
     return res;
 }
 
+int
+erts_sys_getenv(char *key, char *value, size_t *size)
+{   /* Locked wrapper: erts_sys_getenv__() under a read lock on environ_rwmtx. */
+    int res;
+    erts_smp_rwmtx_rlock(&environ_rwmtx);
+    res = erts_sys_getenv__(key, value, size);
+    erts_smp_rwmtx_runlock(&environ_rwmtx);
+    return res;
+}
+
 struct win32_getenv_state {
     char *env;
     char *next;
diff --git a/erts/emulator/test/driver_SUITE.erl b/erts/emulator/test/driver_SUITE.erl
index bcb0257ed1..c07dbc5871 100644
--- a/erts/emulator/test/driver_SUITE.erl
+++ b/erts/emulator/test/driver_SUITE.erl
@@ -76,7 +76,8 @@
 	 driver_select_use/1,
 	 thread_mseg_alloc_cache_clean/1,
 	 otp_9302/1,
-	 thr_free_drv/1]).
+	 thr_free_drv/1,
+	 async_blast/1]).
 
 -export([bin_prefix/2]).
 
@@ -145,7 +146,8 @@ all() ->
      smp_select, driver_select_use,
      thread_mseg_alloc_cache_clean,
      otp_9302,
-     thr_free_drv].
+     thr_free_drv,
+     async_blast].
 
 groups() -> 
     [{timer, [],
@@ -1911,17 +1913,30 @@ otp_9302(Config) when is_list(Config) ->
     ?line port_command(Port, ""),
     ?line {msg, block} = get_port_msg(Port, infinity),
     ?line {msg, job} = get_port_msg(Port, infinity),
-    ?line case erlang:system_info(thread_pool_size) of
-	      0 ->
-		  {msg, cancel} = get_port_msg(Port, infinity);
-	      _ ->
-		  ok
-	  end,
-    ?line {msg, job} = get_port_msg(Port, infinity),
+    ?line C = case erlang:system_info(thread_pool_size) of
+		  0 ->
+		      ?line {msg, cancel} = get_port_msg(Port, infinity),
+		      ?line {msg, job} = get_port_msg(Port, infinity),
+		      ?line false;
+		  _ ->
+		      case get_port_msg(Port, infinity) of
+			  {msg, cancel} -> %% Cancel always fail in Rel >= 15
+			      ?line {msg, job} = get_port_msg(Port, infinity),
+			      ?line false;
+			  {msg, job} ->
+			      ?line ok,
+			      ?line true
+		      end
+	      end,
     ?line {msg, end_of_jobs} = get_port_msg(Port, infinity),
     ?line no_msg = get_port_msg(Port, 2000),
     ?line port_close(Port),
-    ?line ok.
+    ?line case C of
+	      true ->
+		  ?line {comment, "Async job cancelled"};
+	      false ->
+		  ?line {comment, "Async job not cancelled"}
+	  end.
 
 thr_free_drv(Config) when is_list(Config) ->
     ?line Path = ?config(data_dir, Config),
@@ -1954,6 +1969,48 @@ thr_free_drv_control(Port, N) ->
 %	    io:format("N=~p, SID=~p", [N, erlang:system_info(scheduler_id)]),
 	    thr_free_drv_control(Port, N+1)
     end.
+	    
+async_blast(Config) when is_list(Config) -> % 100 processes, one async_blast_drv port each
+    ?line Path = ?config(data_dir, Config),
+    ?line erl_ddll:start(),
+    ?line ok = load_driver(Path, async_blast_drv),
+    ?line SchedOnln = erlang:system_info(schedulers_online),
+    ?line MemBefore = driver_alloc_size(),
+    ?line Start = os:timestamp(),
+    ?line Blast = fun () -> % open port, trigger it, wait for {Port, done}, close
+			  Port = open_port({spawn, async_blast_drv}, []),
+			  true = is_port(Port),
+			  port_command(Port, ""),
+			  receive
+			      {Port, done} ->
+				  ok
+			  end,
+			  port_close(Port)
+		  end,
+    ?line Ps = lists:map(fun (N) ->
+				 spawn_opt(Blast,
+					   [{scheduler,
+					     (N rem SchedOnln)+ 1}, % spread over online schedulers
+					    monitor])
+			 end,
+			 lists:seq(1, 100)),
+    ?line MemMid = driver_alloc_size(), % only printed below, not checked
+    ?line lists:foreach(fun ({Pid, Mon}) -> % wait until every blast process is down
+				receive
+				    {'DOWN',Mon,process,Pid,_} -> ok
+				end
+			end, Ps),
+    ?line End = os:timestamp(),
+    ?line MemAfter = driver_alloc_size(),
+    ?line io:format("MemBefore=~p, MemMid=~p, MemAfter=~p~n",
+		    [MemBefore, MemMid, MemAfter]),
+    ?line AsyncBlastTime = timer:now_diff(End,Start)/1000000, % seconds
+    ?line io:format("AsyncBlastTime=~p~n", [AsyncBlastTime]),
+    ?line MemBefore = MemAfter, % badmatch (test failure) unless both sizes are equal
+    ?line erlang:display({async_blast_time, AsyncBlastTime}),
+    ?line ok.
+
+
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %% 		Utilities
diff --git a/erts/emulator/test/driver_SUITE_data/Makefile.src b/erts/emulator/test/driver_SUITE_data/Makefile.src
index 62ab5169c0..dd48f6a0f7 100644
--- a/erts/emulator/test/driver_SUITE_data/Makefile.src
+++ b/erts/emulator/test/driver_SUITE_data/Makefile.src
@@ -13,7 +13,8 @@ MISC_DRVS =		outputv_drv@dll@ \
 			missing_callback_drv@dll@ \
 			thr_alloc_drv@dll@ \
 			otp_9302_drv@dll@ \
-			thr_free_drv@dll@
+			thr_free_drv@dll@ \
+			async_blast_drv@dll@
 
 SYS_INFO_DRVS = 	sys_info_1_0_drv@dll@ \
 			sys_info_1_1_drv@dll@ \
diff --git a/erts/emulator/test/driver_SUITE_data/async_blast_drv.c b/erts/emulator/test/driver_SUITE_data/async_blast_drv.c
new file mode 100644
index 0000000000..3821f7e3dc
--- /dev/null
+++ b/erts/emulator/test/driver_SUITE_data/async_blast_drv.c
@@ -0,0 +1,124 @@
+/*
+ * %CopyrightBegin%
+ *
+ * Copyright Ericsson AB 2011. All Rights Reserved.
+ *
+ * The contents of this file are subject to the Erlang Public License,
+ * Version 1.1, (the "License"); you may not use this file except in
+ * compliance with the License. You should have received a copy of the
+ * Erlang Public License along with this software. If not, it can be
+ * retrieved online at http://www.erlang.org/.
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * %CopyrightEnd%
+ */
+
+#include "erl_driver.h"
+
+#define NO_ASYNC_JOBS 10000 /* async jobs queued by one output() call */
+
+/*
+ * Driver callbacks; defined below and referenced from the entry table.
+ */
+static ErlDrvData start(ErlDrvPort port, char *command);
+static void stop(ErlDrvData drv_data);
+static void output(ErlDrvData drv_data, char *buf, int len);
+static void ready_async(ErlDrvData drv_data, ErlDrvThreadData thread_data);
+
+static ErlDrvEntry async_blast_drv_entry = { /* positional: slot order matters */
+    NULL /* init */,
+    start /* start: creates the per-port state */,
+    stop /* stop: frees the per-port state */,
+    output /* output: queues the async jobs */,
+    NULL /* ready_input */,
+    NULL /* ready_output */,
+    "async_blast_drv" /* driver_name */,
+    NULL /* finish */,
+    NULL /* handle */,
+    NULL /* control */,
+    NULL /* timeout */,
+    NULL /* outputv */,
+    ready_async /* ready_async: counts completed jobs */,
+    NULL /* flush */,
+    NULL /* call */,
+    NULL /* event */,
+    ERL_DRV_EXTENDED_MARKER,
+    ERL_DRV_EXTENDED_MAJOR_VERSION,
+    ERL_DRV_EXTENDED_MINOR_VERSION,
+    ERL_DRV_FLAG_USE_PORT_LOCKING /* driver_flags */,
+    NULL /* handle2 */,
+    NULL /* process_exit (the monitor callback slot) */
+}; /* any members past this point are zero-initialized */
+
+typedef struct { /* per-port state; allocated in start(), freed in stop() */
+    ErlDrvPort port;		/* owning port, stored by start() */
+    ErlDrvTermData caller;	/* driver_caller() result saved by output() */
+    int counter;		/* jobs not yet seen by ready_async(); 0 = idle */
+} async_blast_data_t;
+
+
+DRIVER_INIT(async_blast_drv)
+{
+    return &async_blast_drv_entry; /* the callback table defined above */
+}
+
+static void stop(ErlDrvData drv_data)
+{
+    driver_free((void *) drv_data); /* the state block start() allocated */
+}
+
+/* Start callback: allocate this port's state block with no jobs pending. */
+static ErlDrvData start(ErlDrvPort port, char *command)
+{
+    async_blast_data_t *state = driver_alloc(sizeof *state);
+
+    if (state == NULL) {
+	return ERL_DRV_ERROR_GENERAL;
+    }
+
+    state->counter = 0;
+    state->port = port;
+    return (ErlDrvData) state;
+}
+
+static void async_invoke(void *data)
+{
+    /* No work: output() passes this to driver_async() as the job body. */
+}
+#include <stdio.h>
+
+/* Each call counts one job as done; at zero, {Port, done} goes to the caller. */
+static void ready_async(ErlDrvData drv_data, ErlDrvThreadData thread_data)
+{
+    async_blast_data_t *state = (async_blast_data_t *) drv_data;
+    if (--state->counter == 0) {
+	ErlDrvTermData done_msg[] = {
+	    ERL_DRV_PORT, driver_mk_port(state->port),
+	    ERL_DRV_ATOM, driver_mk_atom("done"),
+	    ERL_DRV_TUPLE, 2
+	};
+	driver_send_term(state->port, state->caller, done_msg,
+			 sizeof(done_msg) / sizeof(done_msg[0]));
+    }
+}
+
+/* Output callback: when idle, remember the caller and queue NO_ASYNC_JOBS jobs. */
+static void output(ErlDrvData drv_data, char *buf, int len)
+{
+    async_blast_data_t *state = (async_blast_data_t *) drv_data;
+    int queued;
+    if (state->counter != 0)
+	return; /* an earlier batch has not counted down to zero yet */
+    state->caller = driver_caller(state->port);
+    state->counter = NO_ASYNC_JOBS;
+    for (queued = 0; queued < NO_ASYNC_JOBS; queued++) {
+	if (driver_async(state->port, NULL, async_invoke, NULL, NULL) < 0) {
+	    driver_failure_atom(state->port, "driver_async_failed");
+	    break;
+	}
+    }
+}