From 19972f42719c6e9357e33e123d00b681213022dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?John=20H=C3=B6gberg?= <john@erlang.org>
Date: Tue, 23 Oct 2018 09:37:16 +0200
Subject: Fix trapping in lists:reverse/2

The first stage wasn't bounded by reductions, and it bumped far
more reductions than it should have due to a logic bug.
---
 erts/emulator/beam/erl_bif_lists.c | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/erts/emulator/beam/erl_bif_lists.c b/erts/emulator/beam/erl_bif_lists.c
index 395be67a90..9b2b8aaab6 100644
--- a/erts/emulator/beam/erl_bif_lists.c
+++ b/erts/emulator/beam/erl_bif_lists.c
@@ -283,7 +283,7 @@ static BIF_RETTYPE lists_reverse_alloc(Process *c_p,
 {
     static const Uint CELLS_PER_RED = 40;
 
-    Eterm *heap_top, *heap_end;
+    Eterm *alloc_top, *alloc_end;
     Uint cells_left, max_cells;
     Eterm list, tail;
     Eterm lookahead;
@@ -305,18 +305,18 @@ static BIF_RETTYPE lists_reverse_alloc(Process *c_p,
         BIF_ERROR(c_p, BADARG);
     }
 
-    heap_top = HAlloc(c_p, 2 * (max_cells - cells_left));
-    heap_end = heap_top + 2 * (max_cells - cells_left);
+    alloc_top = HAlloc(c_p, 2 * (max_cells - cells_left));
+    alloc_end = alloc_top + 2 * (max_cells - cells_left);
 
-    while (heap_top < heap_end) {
+    while (alloc_top < alloc_end) {
         Eterm *pair = list_val(list);
 
-        tail = CONS(heap_top, CAR(pair), tail);
+        tail = CONS(alloc_top, CAR(pair), tail);
         list = CDR(pair);
 
         ASSERT(is_list(list) || is_nil(list));
 
-        heap_top += 2;
+        alloc_top += 2;
     }
 
     if (is_nil(list)) {
@@ -333,7 +333,7 @@ static BIF_RETTYPE lists_reverse_onheap(Process *c_p,
 {
     static const Uint CELLS_PER_RED = 60;
 
-    Eterm *heap_top, *heap_end;
+    Eterm *alloc_start, *alloc_top, *alloc_end;
     Uint cells_left, max_cells;
     Eterm list, tail;
 
@@ -343,21 +343,27 @@ static BIF_RETTYPE lists_reverse_onheap(Process *c_p,
     cells_left = max_cells = CELLS_PER_RED * (1 + ERTS_BIF_REDS_LEFT(c_p));
 
     ASSERT(HEAP_LIMIT(c_p) >= HEAP_TOP(c_p) + 2);
-    heap_end = HEAP_LIMIT(c_p) - 2;
-    heap_top = HEAP_TOP(c_p);
+    alloc_start = HEAP_TOP(c_p);
+    alloc_end = HEAP_LIMIT(c_p) - 2;
+    alloc_top = alloc_start;
 
-    while (heap_top < heap_end && is_list(list)) {
+    /* Don't process more cells than we have reductions for. */
+    alloc_end = MIN(alloc_top + (cells_left * 2), alloc_end);
+
+    while (alloc_top < alloc_end && is_list(list)) {
         Eterm *pair = list_val(list);
 
-        tail = CONS(heap_top, CAR(pair), tail);
+        tail = CONS(alloc_top, CAR(pair), tail);
         list = CDR(pair);
 
-        heap_top += 2;
+        alloc_top += 2;
     }
 
-    cells_left -= (heap_top - heap_end) / 2;
+    cells_left -= (alloc_top - alloc_start) / 2;
+    HEAP_TOP(c_p) = alloc_top;
+
+    ASSERT(cells_left >= 0 && cells_left <= max_cells);
     BUMP_REDS(c_p, (max_cells - cells_left) / CELLS_PER_RED);
-    HEAP_TOP(c_p) = heap_top;
 
     if (is_nil(list)) {
         BIF_RET(tail);
-- 
cgit v1.2.3


From 0aad6ef3bf360f2971d5c8b22620aecc3cdae3ef Mon Sep 17 00:00:00 2001
From: Dmytro Lytovchenko <dmytro.lytovchenko@erlang-solutions.com>
Date: Tue, 23 Oct 2018 09:14:11 +0200
Subject: Clarify a magical allocation size

---
 erts/emulator/beam/external.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/erts/emulator/beam/external.c b/erts/emulator/beam/external.c
index 621ba108ba..9a66e491f3 100644
--- a/erts/emulator/beam/external.c
+++ b/erts/emulator/beam/external.c
@@ -1953,7 +1953,8 @@ static Eterm erts_term_to_binary_int(Process* p, Eterm Term, int level, Uint fla
 
 #define RETURN_STATE()							\
     do {								\
-	hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE+3);                    \
+	static const int TUPLE2_SIZE = 2 + 1;				\
+	hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE + TUPLE2_SIZE);        \
 	c_term = erts_mk_magic_ref(&hp, &MSO(p), context_b);            \
 	res = TUPLE2(hp, Term, c_term);					\
 	BUMP_ALL_REDS(p);                                               \
-- 
cgit v1.2.3


From 1056d2d1fd49f669a2001f03890e13c9cba76c1e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?John=20H=C3=B6gberg?= <john@erlang.org>
Date: Thu, 25 Oct 2018 08:33:08 +0200
Subject: Inline erts_cmp

This greatly increases the performance of '--'/2 which does a lot
of term comparisons.
---
 erts/emulator/beam/atom.c      |  2 +-
 erts/emulator/beam/erl_utils.h | 61 ++++++++++++++++++++++++++++++++++++++--
 erts/emulator/beam/utils.c     | 64 ++++--------------------------------------
 3 files changed, 64 insertions(+), 63 deletions(-)

diff --git a/erts/emulator/beam/atom.c b/erts/emulator/beam/atom.c
index 5381611fab..59b51fd15e 100644
--- a/erts/emulator/beam/atom.c
+++ b/erts/emulator/beam/atom.c
@@ -174,7 +174,7 @@ atom_alloc(Atom* tmpl)
 
     /*
      * Precompute ordinal value of first 3 bytes + 7 bits.
-     * This is used by utils.c:erts_cmp_atoms().
+     * This is used by erl_utils.h:erts_cmp_atoms().
      * We cannot use the full 32 bits of the first 4 bytes,
      * since we use the sign of the difference between two
      * ordinal values to represent their relative order.
diff --git a/erts/emulator/beam/erl_utils.h b/erts/emulator/beam/erl_utils.h
index b3bfa69052..880febba8b 100644
--- a/erts/emulator/beam/erl_utils.h
+++ b/erts/emulator/beam/erl_utils.h
@@ -22,6 +22,7 @@
 #define ERL_UTILS_H__
 
 #include "sys.h"
+#include "atom.h"
 #include "erl_printf.h"
 
 struct process;
@@ -112,10 +113,12 @@ int eq(Eterm, Eterm);
 
 #define EQ(x,y) (((x) == (y)) || (is_not_both_immed((x),(y)) && eq((x),(y))))
 
-int erts_cmp_atoms(Eterm a, Eterm b);
-Sint erts_cmp(Eterm, Eterm, int, int);
-Sint erts_cmp_compound(Eterm, Eterm, int, int);
+ERTS_GLB_INLINE Sint erts_cmp(Eterm, Eterm, int, int);
+ERTS_GLB_INLINE int erts_cmp_atoms(Eterm a, Eterm b);
+
 Sint cmp(Eterm a, Eterm b);
+Sint erts_cmp_compound(Eterm, Eterm, int, int);
+
 #define CMP(A,B)                         erts_cmp(A,B,0,0)
 #define CMP_TERM(A,B)                    erts_cmp(A,B,1,0)
 #define CMP_EQ_ONLY(A,B)                 erts_cmp(A,B,0,1)
@@ -150,4 +153,56 @@ Sint cmp(Eterm a, Eterm b);
 	if (erts_cmp_compound(X,Y,0,EqOnly) Op 0) { Action; };	\
     }
 
+#define erts_float_comp(x,y) (((x)<(y)) ? -1 : (((x)==(y)) ? 0 : 1))
+
+#if ERTS_GLB_INLINE_INCL_FUNC_DEF
+
+ERTS_GLB_INLINE int erts_cmp_atoms(Eterm a, Eterm b) {
+    Atom *aa = atom_tab(atom_val(a));
+    Atom *bb = atom_tab(atom_val(b));
+
+    byte *name_a, *name_b;
+    int len_a, len_b, diff;
+
+    diff = aa->ord0 - bb->ord0;
+
+    if (diff != 0) {
+        return diff;
+    }
+
+    name_a = &aa->name[3];
+    name_b = &bb->name[3];
+    len_a = aa->len-3;
+    len_b = bb->len-3;
+
+    if (len_a > 0 && len_b > 0) {
+        diff = sys_memcmp(name_a, name_b, MIN(len_a, len_b));
+
+        if (diff != 0) {
+            return diff;
+        }
+    }
+
+    return len_a - len_b;
+}
+
+ERTS_GLB_INLINE Sint erts_cmp(Eterm a, Eterm b, int exact, int eq_only) {
+    if (is_atom(a) && is_atom(b)) {
+        return erts_cmp_atoms(a, b);
+    } else if (is_both_small(a, b)) {
+        return (signed_val(a) - signed_val(b));
+    } else if (is_float(a) && is_float(b)) {
+        FloatDef af, bf;
+
+        GET_DOUBLE(a, af);
+        GET_DOUBLE(b, bf);
+
+        return erts_float_comp(af.fd, bf.fd);
+    }
+
+    return erts_cmp_compound(a,b,exact,eq_only);
+}
+
+#endif /* ERTS_GLB_INLINE_INCL_FUNC_DEF */
+
 #endif
diff --git a/erts/emulator/beam/utils.c b/erts/emulator/beam/utils.c
index 08f8ca9788..d81bd89a48 100644
--- a/erts/emulator/beam/utils.c
+++ b/erts/emulator/beam/utils.c
@@ -2615,27 +2615,6 @@ not_equal:
 }
 
 
-/* 
- * Lexically compare two strings of bytes (string s1 length l1 and s2 l2).
- *
- *	s1 < s2	return -1
- *	s1 = s2	return  0
- *	s1 > s2 return +1
- */
-static int cmpbytes(byte *s1, int l1, byte *s2, int l2)
-{
-    int i;
-    i = 0;
-    while((i < l1) && (i < l2)) {
-	if (s1[i] < s2[i]) return(-1);
-	if (s1[i] > s2[i]) return(1);
-	i++;
-    }
-    if (l1 < l2) return(-1);
-    if (l1 > l2) return(1);
-    return(0);
-}
-
 
 /*
  * Compare objects.
@@ -2649,20 +2628,6 @@ static int cmpbytes(byte *s1, int l1, byte *s2, int l2)
  *
  */
 
-
-#define float_comp(x,y)    (((x)<(y)) ? -1 : (((x)==(y)) ? 0 : 1))
-
-int erts_cmp_atoms(Eterm a, Eterm b)
-{
-    Atom *aa = atom_tab(atom_val(a));
-    Atom *bb = atom_tab(atom_val(b));
-    int diff = aa->ord0 - bb->ord0;
-    if (diff)
-	return diff;
-    return cmpbytes(aa->name+3, aa->len-3,
-		    bb->name+3, bb->len-3);
-}
-
 /* cmp(Eterm a, Eterm b)
  *  For compatibility with HiPE - arith-based compare.
  */
@@ -2673,22 +2638,6 @@ Sint cmp(Eterm a, Eterm b)
 
 Sint erts_cmp_compound(Eterm a, Eterm b, int exact, int eq_only);
 
-Sint erts_cmp(Eterm a, Eterm b, int exact, int eq_only)
-{
-    if (is_atom(a) && is_atom(b)) {
-        return erts_cmp_atoms(a, b);
-    } else if (is_both_small(a, b)) {
-        return (signed_val(a) - signed_val(b));
-    } else if (is_float(a) && is_float(b)) {
-        FloatDef af, bf;
-        GET_DOUBLE(a, af);
-        GET_DOUBLE(b, bf);
-        return float_comp(af.fd, bf.fd);
-    }
-    return erts_cmp_compound(a,b,exact,eq_only);
-}
-
-
 /* erts_cmp(Eterm a, Eterm b, int exact)
  * exact = 1 -> term-based compare
  * exact = 0 -> arith-based compare
@@ -2985,7 +2934,7 @@ tailrecur_ne:
 
 		    GET_DOUBLE(a, af);
 		    GET_DOUBLE(b, bf);
-		    ON_CMP_GOTO(float_comp(af.fd, bf.fd));
+		    ON_CMP_GOTO(erts_float_comp(af.fd, bf.fd));
 		}
 	    case (_TAG_HEADER_POS_BIG >> _TAG_PRIMARY_SIZE):
 	    case (_TAG_HEADER_NEG_BIG >> _TAG_PRIMARY_SIZE):
@@ -3022,10 +2971,7 @@ tailrecur_ne:
 		    ErlFunThing* f2 = (ErlFunThing *) fun_val(b);
 		    Sint diff;
 
-		    diff = cmpbytes(atom_tab(atom_val(f1->fe->module))->name,
-				    atom_tab(atom_val(f1->fe->module))->len,
-				    atom_tab(atom_val(f2->fe->module))->name,
-				    atom_tab(atom_val(f2->fe->module))->len);
+                    diff = erts_cmp_atoms((f1->fe)->module, (f2->fe)->module);
 		    if (diff != 0) {
 			RETURN_NEQ(diff);
 		    }
@@ -3219,7 +3165,7 @@ tailrecur_ne:
 	    if (f2.fd < MAX_LOSSLESS_FLOAT && f2.fd > MIN_LOSSLESS_FLOAT) {
 		/* Float is within the no loss limit */
 		f1.fd = signed_val(aw);
-		j = float_comp(f1.fd, f2.fd);
+		j = erts_float_comp(f1.fd, f2.fd);
 	    }
 #if ERTS_SIZEOF_ETERM == 8
 	    else if (f2.fd > (double) (MAX_SMALL + 1)) {
@@ -3266,7 +3212,7 @@ tailrecur_ne:
 		if (big_to_double(aw, &f1.fd) < 0) {
 		    j = big_sign(aw) ? -1 : 1;
 		} else {
-		    j = float_comp(f1.fd, f2.fd);
+		    j = erts_float_comp(f1.fd, f2.fd);
 		}
 	    } else {
 		big = double_to_big(f2.fd, big_buf, sizeof(big_buf)/sizeof(Eterm));
@@ -3282,7 +3228,7 @@ tailrecur_ne:
 	    if (f1.fd < MAX_LOSSLESS_FLOAT && f1.fd > MIN_LOSSLESS_FLOAT) {
 		/* Float is within the no loss limit */
 		f2.fd = signed_val(bw);
-		j = float_comp(f1.fd, f2.fd);
+		j = erts_float_comp(f1.fd, f2.fd);
 	    }
 #if ERTS_SIZEOF_ETERM == 8
 	    else if (f1.fd > (double) (MAX_SMALL + 1)) {
-- 
cgit v1.2.3


From eb9ee88f4cc640065f4902e270d834bfb596d5fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?John=20H=C3=B6gberg?= <john@erlang.org>
Date: Mon, 15 Oct 2018 18:17:12 +0200
Subject: Optimize operator '--' and yield on large inputs

The removal set now uses a red-black tree instead of an array on
large inputs, decreasing runtime complexity from `n*n` to
`n*log(n)`. It will also exit early when there are no more items
left in the removal set, drastically improving performance and
memory use when the items to be removed are present near the head
of the list.

This got a lot more complicated than before as the overhead of
always using a red-black tree was unacceptable when either of the
inputs were small, but this compromise has okay-to-decent
performance regardless of input size.

Co-authored-by: Dmytro Lytovchenko <dmytro.lytovchenko@erlang-solutions.com>
---
 erts/emulator/beam/erl_alloc.types            |   1 +
 erts/emulator/beam/erl_bif_lists.c            | 796 +++++++++++++++++++++++---
 lib/stdlib/doc/src/lists.xml                  |   8 -
 lib/stdlib/test/lists_SUITE.erl               |  42 ++
 system/doc/efficiency_guide/commoncaveats.xml |  48 --
 system/doc/efficiency_guide/retired_myths.xml |  14 +
 6 files changed, 766 insertions(+), 143 deletions(-)

diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index 5409b89bab..98ba0e5f07 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -274,6 +274,7 @@ type	ML_YIELD_STATE	SHORT_LIVED	SYSTEM		monitor_link_yield_state
 type	ML_DIST		STANDARD	SYSTEM		monitor_link_dist
 type	PF3_ARGS	SHORT_LIVED	PROCESSES	process_flag_3_arguments
 type	SETUP_CONN_ARG	SHORT_LIVED	PROCESSES	setup_connection_argument
+type    LIST_TRAP       SHORT_LIVED     PROCESSES       list_bif_trap_state
 
 type	ENVIRONMENT	SYSTEM		SYSTEM		environment
 
diff --git a/erts/emulator/beam/erl_bif_lists.c b/erts/emulator/beam/erl_bif_lists.c
index 9b2b8aaab6..a793b34852 100644
--- a/erts/emulator/beam/erl_bif_lists.c
+++ b/erts/emulator/beam/erl_bif_lists.c
@@ -29,12 +29,13 @@
 #include "sys.h"
 #include "erl_vm.h"
 #include "global.h"
-#include "erl_process.h"
-#include "error.h"
 #include "bif.h"
+#include "erl_binary.h"
+
 
 static Eterm keyfind(int Bif, Process* p, Eterm Key, Eterm Pos, Eterm List);
 
+
 static BIF_RETTYPE append(Process* p, Eterm A, Eterm B)
 {
     Eterm list;
@@ -146,103 +147,724 @@ BIF_RETTYPE append_2(BIF_ALIST_2)
     return append(BIF_P, BIF_ARG_1, BIF_ARG_2);
 }
 
-/*
- * erlang:'--'/2
- */
+/* erlang:'--'/2
+ *
+ * Subtracts a list from another (LHS -- RHS), removing the first occurrence of
+ * each element in LHS from RHS. There is no type coercion so the elements must
+ * match exactly.
+ *
+ * The BIF is broken into several stages that can all trap individually, and it
+ * chooses its algorithm based on input size. If either input is small it will
+ * use a linear scan tuned to which side it's on, and if both inputs are large
+ * enough it will convert RHS into a multiset to provide good asymptotic
+ * behavior. */
+
+#define SUBTRACT_LHS_THRESHOLD 16
+#define SUBTRACT_RHS_THRESHOLD 16
+
+typedef enum {
+    SUBTRACT_STAGE_START,
+    SUBTRACT_STAGE_LEN_LHS,
+
+    /* Naive linear scan that's efficient when
+     * LEN_LHS <= SUBTRACT_LHS_THRESHOLD. */
+    SUBTRACT_STAGE_NAIVE_LHS,
+
+    SUBTRACT_STAGE_LEN_RHS,
+
+    /* As SUBTRACT_STAGE_NAIVE_LHS but for RHS. */
+    SUBTRACT_STAGE_NAIVE_RHS,
+
+    /* Creates a multiset from RHS for faster lookups before sweeping through
+     * LHS. The set is implemented as a red-black tree and duplicate elements
+     * are handled by a counter on each node. */
+    SUBTRACT_STAGE_SET_BUILD,
+    SUBTRACT_STAGE_SET_FINISH
+} ErtsSubtractCtxStage;
+
+typedef struct subtract_node__ {
+    struct subtract_node__ *parent;
+    struct subtract_node__ *left;
+    struct subtract_node__ *right;
+    int is_red;
+
+    Eterm key;
+    Uint count;
+} subtract_tree_t;
+
+typedef struct {
+    ErtsSubtractCtxStage stage;
+
+    Eterm lhs_original;
+    Eterm rhs_original;
+
+    Uint lhs_remaining;
+    Uint rhs_remaining;
+
+    Eterm iterator;
+
+    Eterm *result_cdr;
+    Eterm result;
+
+    union {
+        Eterm lhs_elements[SUBTRACT_LHS_THRESHOLD];
+        Eterm rhs_elements[SUBTRACT_RHS_THRESHOLD];
+
+        struct {
+            subtract_tree_t *tree;
+
+            /* A memory area for the tree's nodes, saving us the need to have
+             * one allocation per node. */
+            subtract_tree_t *alloc_start;
+            subtract_tree_t *alloc;
+        } rhs_set;
+    } u;
+} ErtsSubtractContext;
+
+#define ERTS_RBT_PREFIX subtract
+#define ERTS_RBT_T subtract_tree_t
+#define ERTS_RBT_KEY_T Eterm
+#define ERTS_RBT_FLAGS_T int
+#define ERTS_RBT_INIT_EMPTY_TNODE(T) \
+    do { \
+        (T)->parent = NULL; \
+        (T)->left = NULL; \
+        (T)->right = NULL; \
+    } while(0)
+#define ERTS_RBT_IS_RED(T) ((T)->is_red)
+#define ERTS_RBT_SET_RED(T) ((T)->is_red = 1)
+#define ERTS_RBT_IS_BLACK(T) (!ERTS_RBT_IS_RED(T))
+#define ERTS_RBT_SET_BLACK(T) ((T)->is_red = 0)
+#define ERTS_RBT_GET_FLAGS(T) ((T)->is_red)
+#define ERTS_RBT_SET_FLAGS(T, F) ((T)->is_red = F)
+#define ERTS_RBT_GET_PARENT(T) ((T)->parent)
+#define ERTS_RBT_SET_PARENT(T, P) ((T)->parent = P)
+#define ERTS_RBT_GET_RIGHT(T) ((T)->right)
+#define ERTS_RBT_SET_RIGHT(T, R) ((T)->right = (R))
+#define ERTS_RBT_GET_LEFT(T) ((T)->left)
+#define ERTS_RBT_SET_LEFT(T, L) ((T)->left = (L))
+#define ERTS_RBT_GET_KEY(T) ((T)->key)
+#define ERTS_RBT_CMP_KEYS(KX, KY) CMP_TERM(KX, KY)
+#define ERTS_RBT_WANT_LOOKUP_INSERT
+#define ERTS_RBT_WANT_LOOKUP
+#define ERTS_RBT_WANT_DELETE
+#define ERTS_RBT_UNDEF
+
+#include "erl_rbtree.h"
+
+static int subtract_continue(Process *p, ErtsSubtractContext *context);
+
+static void subtract_ctx_dtor(ErtsSubtractContext *context) {
+    switch (context->stage) {
+        case SUBTRACT_STAGE_SET_BUILD:
+        case SUBTRACT_STAGE_SET_FINISH:
+            erts_free(ERTS_ALC_T_LIST_TRAP, context->u.rhs_set.alloc_start);
+            break;
+        default:
+            break;
+    }
+}
 
-#define SMALL_VEC_SIZE 10
-static Eterm subtract(Process* p, Eterm A, Eterm B)
-{
-    Eterm  list;
-    Eterm* hp;
-    Uint  need;
-    Eterm  res;
-    Eterm small_vec[SMALL_VEC_SIZE];	/* Preallocated memory for small lists */
-    Eterm* vec_p;
-    Eterm* vp;
-    Sint i;
-    Sint n;
-    Sint m;
-    
-    if ((n = erts_list_length(A)) < 0) {
-	BIF_ERROR(p, BADARG);
+static int subtract_ctx_bin_dtor(Binary *context_bin) {
+    ErtsSubtractContext *context = ERTS_MAGIC_BIN_DATA(context_bin);
+    subtract_ctx_dtor(context);
+    return 1;
+}
+
+static void subtract_ctx_move(ErtsSubtractContext *from,
+                              ErtsSubtractContext *to) {
+    int uses_result_cdr = 0;
+
+    to->stage = from->stage;
+
+    to->lhs_original = from->lhs_original;
+    to->rhs_original = from->rhs_original;
+
+    to->lhs_remaining = from->lhs_remaining;
+    to->rhs_remaining = from->rhs_remaining;
+
+    to->iterator = from->iterator;
+    to->result = from->result;
+
+    switch (to->stage) {
+        case SUBTRACT_STAGE_NAIVE_LHS:
+            sys_memcpy(to->u.lhs_elements,
+                       from->u.lhs_elements,
+                       sizeof(Eterm) * to->lhs_remaining);
+            break;
+        case SUBTRACT_STAGE_NAIVE_RHS:
+            sys_memcpy(to->u.rhs_elements,
+                       from->u.rhs_elements,
+                       sizeof(Eterm) * to->rhs_remaining);
+
+            uses_result_cdr = 1;
+            break;
+        case SUBTRACT_STAGE_SET_FINISH:
+            uses_result_cdr = 1;
+            /* FALL THROUGH */
+        case SUBTRACT_STAGE_SET_BUILD:
+            to->u.rhs_set.alloc_start = from->u.rhs_set.alloc_start;
+            to->u.rhs_set.alloc = from->u.rhs_set.alloc;
+            to->u.rhs_set.tree = from->u.rhs_set.tree;
+            break;
+        default:
+            break;
     }
-    if ((m = erts_list_length(B)) < 0) {
-	BIF_ERROR(p, BADARG);
+
+    if (uses_result_cdr) {
+        if (from->result_cdr == &from->result) {
+            to->result_cdr = &to->result;
+        } else {
+            to->result_cdr = from->result_cdr;
+        }
     }
-    
-    if (n == 0)
-	BIF_RET(NIL);
-    if (m == 0)
-	BIF_RET(A);
-    
-    /* allocate element vector */
-    if (n <= SMALL_VEC_SIZE)
-	vec_p = small_vec;
-    else
-	vec_p = (Eterm*) erts_alloc(ERTS_ALC_T_TMP, n * sizeof(Eterm));
-    
-    /* PUT ALL ELEMENTS IN VP */
-    vp = vec_p;
-    list = A;
-    i = n;
-    while(i--) {
-	Eterm* listp = list_val(list);
-	*vp++ = CAR(listp);
-	list = CDR(listp);
+}
+
+static Eterm subtract_create_trap_state(Process *p,
+                                        ErtsSubtractContext *context) {
+    Binary *state_bin;
+    Eterm *hp;
+
+    state_bin = erts_create_magic_binary(sizeof(ErtsSubtractContext),
+                                         subtract_ctx_bin_dtor);
+
+    subtract_ctx_move(context, ERTS_MAGIC_BIN_DATA(state_bin));
+
+    hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE);
+
+    return erts_mk_magic_ref(&hp, &MSO(p), state_bin);
+}
+
+static int subtract_enter_len_lhs(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_LEN_LHS;
+
+    context->iterator = context->lhs_original;
+    context->lhs_remaining = 0;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_enter_len_rhs(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_LEN_RHS;
+
+    context->iterator = context->rhs_original;
+    context->rhs_remaining = 0;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_get_length(Process *p, Eterm *iterator_p, Uint *count_p) {
+    static const Sint ELEMENTS_PER_RED = 32;
+
+    Sint budget, count;
+    Eterm iterator;
+
+    budget = ELEMENTS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    iterator = *iterator_p;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    for (count = 0; count < budget && is_list(iterator); count++) {
+        iterator = CDR(list_val(iterator));
     }
-    
-    /* UNMARK ALL DELETED CELLS */
-    list = B;
-    m = 0;  /* number of deleted elements */
-    while(is_list(list)) {
-	Eterm* listp = list_val(list);
-	Eterm  elem = CAR(listp);
-	i = n;
-	vp = vec_p;
-	while(i--) {
-	    if (is_value(*vp) && eq(*vp, elem)) {
-		*vp = THE_NON_VALUE;
-		m++;
-		break;
-	    }
-	    vp++;
-	}
-	list = CDR(listp);
+
+    if (!is_list(iterator) && !is_nil(iterator)) {
+        return -1;
     }
-    
-    if (m == n)      /* All deleted ? */
-	res = NIL;
-    else if (m == 0)  /* None deleted ? */
-	res = A;
-    else {			/* REBUILD LIST */
-	res = NIL;
-	need = 2*(n - m);
-	hp = HAlloc(p, need);
-	vp = vec_p + n - 1;
-	while(vp >= vec_p) {
-	    if (is_value(*vp)) {
-		res = CONS(hp, *vp, res);
-		hp += 2;
-	    }
-	    vp--;
-	}
+
+    BUMP_REDS(p, count / ELEMENTS_PER_RED);
+
+    *iterator_p = iterator;
+    *count_p += count;
+
+    if (is_nil(iterator)) {
+        return 1;
     }
-    if (vec_p != small_vec)
-	erts_free(ERTS_ALC_T_TMP, (void *) vec_p);
-    BIF_RET(res);
+
+    return 0;
 }
 
-BIF_RETTYPE ebif_minusminus_2(BIF_ALIST_2)
-{
-    return subtract(BIF_P, BIF_ARG_1, BIF_ARG_2);
+static int subtract_enter_naive_lhs(Process *p, ErtsSubtractContext *context) {
+    Eterm iterator;
+    int i = 0;
+
+    context->stage = SUBTRACT_STAGE_NAIVE_LHS;
+
+    context->iterator = context->rhs_original;
+    context->result = NIL;
+
+    iterator = context->lhs_original;
+
+    while (is_list(iterator)) {
+        const Eterm *cell = list_val(iterator);
+
+        ASSERT(i < SUBTRACT_LHS_THRESHOLD);
+
+        context->u.lhs_elements[i++] = CAR(cell);
+        iterator = CDR(cell);
+    }
+
+    ASSERT(i == context->lhs_remaining);
+
+    return subtract_continue(p, context);
 }
 
-BIF_RETTYPE subtract_2(BIF_ALIST_2)
-{
-    return subtract(BIF_P, BIF_ARG_1, BIF_ARG_2);
+static int subtract_naive_lhs(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 16;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+    while (checks < budget && is_list(context->iterator)) {
+        const Eterm *cell;
+        Eterm value, next;
+        int found_at;
+
+        cell = list_val(context->iterator);
+
+        value = CAR(cell);
+        next = CDR(cell);
+
+        for (found_at = 0; found_at < context->lhs_remaining; found_at++) {
+            if (EQ(value, context->u.lhs_elements[found_at])) {
+                /* We shift the array one step down as we have to preserve
+                 * order.
+                 *
+                 * Note that we can't exit early as that would suppress errors
+                 * in the right-hand side (this runs prior to determining the
+                 * length of RHS). */
+
+                context->lhs_remaining--;
+                sys_memmove(&context->u.lhs_elements[found_at],
+                            &context->u.lhs_elements[found_at + 1],
+                            (context->lhs_remaining - found_at) * sizeof(Eterm));
+                break;
+            }
+        }
+
+        checks += MAX(1, context->lhs_remaining);
+        context->iterator = next;
+    }
+
+    BUMP_REDS(p, MIN(checks, budget) / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        return 0;
+    } else if (!is_nil(context->iterator)) {
+        return -1;
+    }
+
+    if (context->lhs_remaining > 0) {
+        Eterm *hp;
+        int i;
+
+        hp = HAlloc(p, context->lhs_remaining * 2);
+
+        for (i = context->lhs_remaining - 1; i >= 0; i--) {
+            Eterm value = context->u.lhs_elements[i];
+
+            context->result = CONS(hp, value, context->result);
+            hp += 2;
+        }
+    }
+
+    ASSERT(context->lhs_remaining > 0 || context->result == NIL);
+
+    return 1;
+}
+
+static int subtract_enter_naive_rhs(Process *p, ErtsSubtractContext *context) {
+    Eterm iterator;
+    int i = 0;
+
+    context->stage = SUBTRACT_STAGE_NAIVE_RHS;
+
+    context->iterator = context->lhs_original;
+    context->result_cdr = &context->result;
+    context->result = NIL;
+
+    iterator = context->rhs_original;
+
+    while (is_list(iterator)) {
+        const Eterm *cell = list_val(iterator);
+
+        ASSERT(i < SUBTRACT_RHS_THRESHOLD);
+
+        context->u.rhs_elements[i++] = CAR(cell);
+        iterator = CDR(cell);
+    }
+
+    ASSERT(i == context->rhs_remaining);
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_naive_rhs(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 16;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (checks < budget && is_list(context->iterator)) {
+        const Eterm *cell;
+        Eterm value, next;
+        int found_at;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        for (found_at = context->rhs_remaining - 1; found_at >= 0; found_at--) {
+            if (EQ(value, context->u.rhs_elements[found_at])) {
+                break;
+            }
+        }
+
+        if (found_at < 0) {
+            /* Destructively add the value to the result. This is safe
+             * since the GC is disabled and the unfinished term is never
+             * leaked to the outside world. */
+            Eterm *hp = HAllocX(p, 2, context->lhs_remaining * 2);
+
+            *context->result_cdr = make_list(hp);
+            context->result_cdr = &CDR(hp);
+
+            CAR(hp) = value;
+        } else if (found_at >= 0) {
+            Eterm swap;
+
+            if (context->rhs_remaining-- == 1) {
+                /* We've run out of items to remove, so the rest of the
+                 * result will be equal to the remainder of the input. We know
+                 * that LHS is well-formed as any errors would've been reported
+                 * during length determination. */
+                *context->result_cdr = next;
+
+                BUMP_REDS(p, MIN(budget, checks) / CHECKS_PER_RED);
+
+                return 1;
+            }
+
+            swap = context->u.rhs_elements[context->rhs_remaining];
+            context->u.rhs_elements[found_at] = swap;
+        }
+
+        checks += context->rhs_remaining;
+        context->iterator = next;
+        context->lhs_remaining--;
+    }
+
+    /* The result only has to be terminated when returning it to the user, but
+     * we're doing it when trapping as well to prevent headaches when
+     * debugging. */
+    *context->result_cdr = NIL;
+
+    BUMP_REDS(p, MIN(budget, checks) / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        ASSERT(context->lhs_remaining > 0 && context->rhs_remaining > 0);
+        return 0;
+    }
+
+    return 1;
+}
+
+static int subtract_enter_set_build(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_SET_BUILD;
+
+    context->u.rhs_set.alloc_start =
+        erts_alloc(ERTS_ALC_T_LIST_TRAP,
+                   context->rhs_remaining * sizeof(subtract_tree_t));
+
+    context->u.rhs_set.alloc = context->u.rhs_set.alloc_start;
+    context->u.rhs_set.tree = NULL;
+
+    context->iterator = context->rhs_original;
+
+    return subtract_continue(p, context);
 }
 
+static int subtract_set_build(Process *p, ErtsSubtractContext *context) {
+    const static Sint INSERTIONS_PER_RED = 16;
+    Sint budget, insertions;
+
+    budget = INSERTIONS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    insertions = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (insertions < budget && is_list(context->iterator)) {
+        subtract_tree_t *existing_node, *new_node;
+        const Eterm *cell;
+        Eterm value, next;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        new_node = context->u.rhs_set.alloc;
+        new_node->key = value;
+        new_node->count = 1;
+
+        existing_node = subtract_rbt_lookup_insert(&context->u.rhs_set.tree,
+                                                   new_node);
+
+        if (existing_node != NULL) {
+            existing_node->count++;
+        } else {
+            context->u.rhs_set.alloc++;
+        }
+
+        context->iterator = next;
+        insertions++;
+    }
+
+    BUMP_REDS(p, insertions / INSERTIONS_PER_RED);
+
+    ASSERT(is_list(context->iterator) || is_nil(context->iterator));
+    ASSERT(context->u.rhs_set.tree != NULL);
+
+    return is_nil(context->iterator);
+}
+
+static int subtract_enter_set_finish(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_SET_FINISH;
+
+    context->result_cdr = &context->result;
+    context->result = NIL;
+
+    context->iterator = context->lhs_original;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_set_finish(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 8;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (checks < budget && is_list(context->iterator)) {
+        subtract_tree_t *node;
+        const Eterm *cell;
+        Eterm value, next;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        ASSERT(context->rhs_remaining > 0);
+
+        node = subtract_rbt_lookup(context->u.rhs_set.tree, value);
+
+        if (node == NULL) {
+            Eterm *hp = HAllocX(p, 2, context->lhs_remaining * 2);
+
+            *context->result_cdr = make_list(hp);
+            context->result_cdr = &CDR(hp);
+
+            CAR(hp) = value;
+        } else {
+            if (context->rhs_remaining-- == 1) {
+                *context->result_cdr = next;
+
+                BUMP_REDS(p, checks / CHECKS_PER_RED);
+
+                return 1;
+            }
+
+            if (node->count-- == 1) {
+                subtract_rbt_delete(&context->u.rhs_set.tree, node);
+            }
+        }
+
+        context->iterator = next;
+        context->lhs_remaining--;
+        checks++;
+    }
+
+    *context->result_cdr = NIL;
+
+    BUMP_REDS(p, checks / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        ASSERT(context->lhs_remaining > 0 && context->rhs_remaining > 0);
+        return 0;
+    }
+
+    return 1;
+}
+
+static int subtract_continue(Process *p, ErtsSubtractContext *context) {
+    switch (context->stage) {
+        case SUBTRACT_STAGE_START: {
+            return subtract_enter_len_lhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_LEN_LHS: {
+            int res = subtract_get_length(p,
+                                          &context->iterator,
+                                          &context->lhs_remaining);
+
+            if (res != 1) {
+                return res;
+            }
+
+            if (context->lhs_remaining <= SUBTRACT_LHS_THRESHOLD) {
+                return subtract_enter_naive_lhs(p, context);
+            }
+
+            return subtract_enter_len_rhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_NAIVE_LHS: {
+            return subtract_naive_lhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_LEN_RHS: {
+            int res = subtract_get_length(p,
+                                          &context->iterator,
+                                          &context->rhs_remaining);
+
+            if (res != 1) {
+                return res;
+            }
+
+            /* We've walked through both lists fully now so we no longer need
+             * to check for errors past this point. */
+
+            if (context->rhs_remaining <= SUBTRACT_RHS_THRESHOLD) {
+                return subtract_enter_naive_rhs(p, context);
+            }
+
+            return subtract_enter_set_build(p, context);
+        }
+
+        case SUBTRACT_STAGE_NAIVE_RHS: {
+            return subtract_naive_rhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_SET_BUILD: {
+            int res = subtract_set_build(p, context);
+
+            if (res != 1) {
+                return res;
+            }
+
+            return subtract_enter_set_finish(p, context);
+        }
+
+        case SUBTRACT_STAGE_SET_FINISH: {
+            return subtract_set_finish(p, context);
+        }
+
+        default:
+            ERTS_ASSERT(!"unreachable");
+    }
+}
+
+static int subtract_start(Process *p, Eterm lhs, Eterm rhs,
+                          ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_START;
+
+    context->lhs_original = lhs;
+    context->rhs_original = rhs;
+
+    return subtract_continue(p, context);
+}
+
+/* erlang:'--'/2 */
+static Eterm subtract(Export *bif_entry, BIF_ALIST_2) {
+    Eterm lhs = BIF_ARG_1, rhs = BIF_ARG_2;
+
+    if ((is_list(lhs) || is_nil(lhs)) && (is_list(rhs) || is_nil(rhs))) {
+        /* We start with the context on the stack in the hopes that we won't
+         * have to trap. */
+        ErtsSubtractContext context;
+        int res;
+
+        res = subtract_start(BIF_P, lhs, rhs, &context);
+
+        if (res == 0) {
+            Eterm state_mref;
+
+            state_mref = subtract_create_trap_state(BIF_P, &context);
+            erts_set_gc_state(BIF_P, 0);
+
+            BIF_TRAP2(bif_entry, BIF_P, state_mref, NIL);
+        }
+
+        subtract_ctx_dtor(&context);
+
+        if (res < 0) {
+            BIF_ERROR(BIF_P, BADARG);
+        }
+
+        BIF_RET(context.result);
+    } else if (is_internal_magic_ref(lhs)) {
+        ErtsSubtractContext *context;
+        int (*dtor)(Binary*);
+        Binary *magic_bin;
+
+        int res;
+
+        magic_bin = erts_magic_ref2bin(lhs);
+        dtor = ERTS_MAGIC_BIN_DESTRUCTOR(magic_bin);
+
+        if (dtor != subtract_ctx_bin_dtor) {
+            BIF_ERROR(BIF_P, BADARG);
+        }
+
+        ASSERT(BIF_P->flags & F_DISABLE_GC);
+        ASSERT(rhs == NIL);
+
+        context = ERTS_MAGIC_BIN_DATA(magic_bin);
+        res = subtract_continue(BIF_P, context);
+
+        if (res == 0) {
+            BIF_TRAP2(bif_entry, BIF_P, lhs, NIL);
+        }
+
+        erts_set_gc_state(BIF_P, 1);
+
+        if (res < 0) {
+            ERTS_BIF_ERROR_TRAPPED2(BIF_P, BADARG, bif_entry,
+                                    context->lhs_original,
+                                    context->rhs_original);
+        }
+
+        BIF_RET(context->result);
+    }
+
+    ASSERT(!(BIF_P->flags & F_DISABLE_GC));
+
+    BIF_ERROR(BIF_P, BADARG);
+}
+
+BIF_RETTYPE ebif_minusminus_2(BIF_ALIST_2) {
+    return subtract(bif_export[BIF_ebif_minusminus_2], BIF_CALL_ARGS);
+}
+
+BIF_RETTYPE subtract_2(BIF_ALIST_2) {
+    return subtract(bif_export[BIF_subtract_2], BIF_CALL_ARGS);
+}
+
+
 BIF_RETTYPE lists_member_2(BIF_ALIST_2)
 {
     Eterm term;
diff --git a/lib/stdlib/doc/src/lists.xml b/lib/stdlib/doc/src/lists.xml
index c3d5d7e07a..e4215a5336 100644
--- a/lib/stdlib/doc/src/lists.xml
+++ b/lib/stdlib/doc/src/lists.xml
@@ -850,14 +850,6 @@ splitwith(Pred, List) ->
 > <input>lists:subtract("123212", "212").</input>
 "312".</pre>
         <p><c>lists:subtract(A, B)</c> is equivalent to <c>A -- B</c>.</p>
-        <warning>
-          <p>The complexity of <c>lists:subtract(A, B)</c> is proportional to
-            <c>length(A)*length(B)</c>, meaning that it is very slow if both
-            <c>A</c> and <c>B</c> are long lists. (If both lists are long, it
-            is a much better choice to use ordered lists and
-            <seealso marker="ordsets#subtract/2">
-            <c>ordsets:subtract/2</c></seealso>.</p>
-        </warning>
       </desc>
     </func>
 
diff --git a/lib/stdlib/test/lists_SUITE.erl b/lib/stdlib/test/lists_SUITE.erl
index 837ab4e97e..9a94bcc012 100644
--- a/lib/stdlib/test/lists_SUITE.erl
+++ b/lib/stdlib/test/lists_SUITE.erl
@@ -2597,6 +2597,13 @@ subtract(Config) when is_list(Config) ->
     {'EXIT',_} = (catch sub([a|b], [])),
     {'EXIT',_} = (catch sub([a|b], [a])),
 
+    %% Trapping, both crashing and otherwise.
+    [sub_trapping(N) || N <- lists:seq(0, 18)],
+
+    %% The current implementation chooses which algorithm to use based on
+    %% certain thresholds, and we need proper coverage for all corner cases.
+    [sub_thresholds(N) || N <- lists:seq(0, 32)],
+
     ok.
 
 sub_non_matching(A, B) ->
@@ -2606,6 +2613,41 @@ sub(A, B) ->
     Res = A -- B,
     Res = lists:subtract(A, B).
 
+sub_trapping(N) ->
+    List = lists:duplicate(N + (1 bsl N), gurka),
+    ImproperList = List ++ crash,
+
+    {'EXIT',_} = (catch sub_trapping_1(ImproperList, [])),
+    {'EXIT',_} = (catch sub_trapping_1(List, ImproperList)),
+
+    List = List -- lists:duplicate(N + (1 bsl N), gaffel),
+    ok = sub_trapping_1(List, []).
+
+sub_trapping_1([], _) -> ok;
+sub_trapping_1(L, R) -> sub_trapping_1(L -- R, [gurka | R]).
+
+sub_thresholds(N) ->
+    %% This needs to be long enough to cause trapping.
+    OtherLen = 1 bsl 18,
+    Other = lists:seq(0, OtherLen - 1),
+
+    Disjoint = lists:seq(-N, -1),
+    Subset = lists:seq(1, N),
+
+    %% LHS is disjoint from RHS, so all elements must be retained.
+    Disjoint = Disjoint -- Other,
+
+    %% LHS is covered by RHS, so all elements must be removed.
+    [] = Subset -- Other,
+
+    %% RHS is disjoint from LHS, so all elements must be retained.
+    Other = Other -- Disjoint,
+
+    %% RHS is covered by LHS, so N elements must be removed.
+    N = OtherLen - length(Other -- Subset),
+
+    ok.
+
 %% Test lists:droplast/1
 droplast(Config) when is_list(Config) ->
     [] = lists:droplast([x]),
diff --git a/system/doc/efficiency_guide/commoncaveats.xml b/system/doc/efficiency_guide/commoncaveats.xml
index b41ffc3902..367da09ba3 100644
--- a/system/doc/efficiency_guide/commoncaveats.xml
+++ b/system/doc/efficiency_guide/commoncaveats.xml
@@ -169,53 +169,5 @@ multiple_setelement(T0) ->
         {Bin1,Bin2} = split_binary(Bin, Num)</code>
    </section>
 
-  <section>
-    <title>Operator "--"</title>
-     <p>The "<c>--</c>" operator has a complexity
-     proportional to the product of the length of its operands.
-     This means that the operator is very slow if both of its operands
-     are long lists:</p>
-
-        <p><em>DO NOT</em></p>
-        <code type="none"><![CDATA[
-        HugeList1 -- HugeList2]]></code>
-
-     <p>Instead use the <seealso marker="stdlib:ordsets">ordsets</seealso>
-     module in STDLIB:</p>
-
-        <p><em>DO</em></p>
-        <code type="none">
-        HugeSet1 = ordsets:from_list(HugeList1),
-        HugeSet2 = ordsets:from_list(HugeList2),
-        ordsets:subtract(HugeSet1, HugeSet2)</code>
-
-     <p>Obviously, that code does not work if the original order
-     of the list is important. If the order of the list must be
-     preserved, do as follows:</p>
-
-        <p><em>DO</em></p>
-        <code type="none"><![CDATA[
-        Set = gb_sets:from_list(HugeList2),
-        [E || E <- HugeList1, not gb_sets:is_element(E, Set)]]]></code>
-
-     <note><p>This code behaves differently from "<c>--</c>"
-     if the lists contain duplicate elements (one occurrence
-     of an element in HugeList2 removes <em>all</em>
-     occurrences in HugeList1.)</p>
-     <p>Also, this code compares lists elements using the
-     "<c>==</c>" operator, while "<c>--</c>" uses the "<c>=:=</c>" operator.
-     If that difference is important, <c>sets</c> can be used instead of
-     <c>gb_sets</c>, but <c>sets:from_list/1</c> is much
-     slower than <c>gb_sets:from_list/1</c> for long lists.</p></note>
-
-     <p>Using the "<c>--</c>" operator to delete an element
-     from a list is not a performance problem:</p>
-
-        <p><em>OK</em></p>
-        <code type="none">
-        HugeList1 -- [Element]</code>
-
-   </section>
-
 </chapter>
 
diff --git a/system/doc/efficiency_guide/retired_myths.xml b/system/doc/efficiency_guide/retired_myths.xml
index 9b914a3b6e..144c942c2b 100644
--- a/system/doc/efficiency_guide/retired_myths.xml
+++ b/system/doc/efficiency_guide/retired_myths.xml
@@ -60,4 +60,18 @@
     That leads us to the myth that tail-recursive functions are faster
     than body-recursive functions.</p>
   </section>
+
+  <section>
+    <title>Myth: List subtraction ("--" operator) is slow</title>
+
+    <p>List subtraction used to have a run-time complexity proportional to the
+      product of the length of its operands, so it was extremely slow when both
+      lists were long.</p>
+
+    <p>As of OTP 22 the run-time complexity is "n log n" and the operation will
+      complete quickly even when both lists are very long. In fact, it is
+      faster and uses less memory than the commonly used workaround to convert
+      both lists to ordered sets before subtracting them with
+      <c>ordsets:subtract/2</c>.</p>
+  </section>
 </chapter>
-- 
cgit v1.2.3