Optimize operator '--' and yield on large inputs

The removal set now uses a red-black tree instead of an array on large inputs, decreasing runtime complexity from `n*n` to `n*log(n)`. It will also exit early when there are no more items left in the removal set, drastically improving performance and memory use when the items to be removed are present near the head of the list. This got a lot more complicated than before as the overhead of always using a red-black tree was unacceptable when either of the inputs were small, but this compromise has okay-to-decent performance regardless of input size. Co-authored-by: Dmytro Lytovchenko <[email protected]>
author: John Högberg <[email protected]> 2018-10-15 18:17:12 +0200
committer: Lukas Larsson <[email protected]> 2018-11-02 17:15:32 +0100
commit: bf5886b790f8f386ed425f543506a4bebb48448c (patch)
tree: b2ef444c7bc5fc21b9e1e4cf905041ca70832c78
parent: db6059a9217767a6e42e93cec05089c0ec977d20 (diff)
download: otp-bf5886b790f8f386ed425f543506a4bebb48448c.tar.gz
otp-bf5886b790f8f386ed425f543506a4bebb48448c.tar.bz2
otp-bf5886b790f8f386ed425f543506a4bebb48448c.zip
6 files changed, 767 insertions, 143 deletions
diff --git a/erts/emulator/beam/erl_alloc.types b/erts/emulator/beam/erl_alloc.types
index 252bf1cc7e..af1133b853 100644
--- a/erts/emulator/beam/erl_alloc.types
+++ b/erts/emulator/beam/erl_alloc.types
@@ -322,6 +322,7 @@ type	THR_PRGR_DATA	LONG_LIVED	SYSTEM		thr_prgr_data
 type	T_THR_PRGR_DATA	SHORT_LIVED	SYSTEM		temp_thr_prgr_data
 type	RELEASE_LAREA	SHORT_LIVED	SYSTEM		release_literal_area
 +endif
+type    LIST_TRAP       SHORT_LIVED     PROCESSES       list_bif_trap_state
 
 #
 # Types used for special emulators
diff --git a/erts/emulator/beam/erl_bif_lists.c b/erts/emulator/beam/erl_bif_lists.c
index 73d327da3e..94a41c285a 100644
--- a/erts/emulator/beam/erl_bif_lists.c
+++ b/erts/emulator/beam/erl_bif_lists.c
@@ -29,12 +29,13 @@
 #include "sys.h"
 #include "erl_vm.h"
 #include "global.h"
-#include "erl_process.h"
-#include "error.h"
 #include "bif.h"
+#include "erl_binary.h"
+
 
 static Eterm keyfind(int Bif, Process* p, Eterm Key, Eterm Pos, Eterm List);
 
+
 static BIF_RETTYPE append(Process* p, Eterm A, Eterm B)
 {
     Eterm list;
@@ -146,103 +147,725 @@ BIF_RETTYPE append_2(BIF_ALIST_2)
     return append(BIF_P, BIF_ARG_1, BIF_ARG_2);
 }
 
-/*
- * erlang:'--'/2
- */
+/* erlang:'--'/2
+ *
+ * Subtracts a list from another (LHS -- RHS), removing the first occurrence of
+ * each element in LHS from RHS. There is no type coercion so the elements must
+ * match exactly.
+ *
+ * The BIF is broken into several stages that can all trap individually, and it
+ * chooses its algorithm based on input size. If either input is small it will
+ * use a linear scan tuned to which side it's on, and if both inputs are large
+ * enough it will convert RHS into a multiset to provide good asymptotic
+ * behavior. */
 
-#define SMALL_VEC_SIZE 10
-static Eterm subtract(Process* p, Eterm A, Eterm B)
-{
-    Eterm  list;
-    Eterm* hp;
-    Uint  need;
-    Eterm  res;
-    Eterm small_vec[SMALL_VEC_SIZE];	/* Preallocated memory for small lists */
-    Eterm* vec_p;
-    Eterm* vp;
-    Sint i;
-    Sint n;
-    Sint m;
-    
-    if ((n = erts_list_length(A)) < 0) {
-	BIF_ERROR(p, BADARG);
+#define SUBTRACT_LHS_THRESHOLD 16
+#define SUBTRACT_RHS_THRESHOLD 16
+
+typedef enum {
+    SUBTRACT_STAGE_START,
+    SUBTRACT_STAGE_LEN_LHS,
+
+    /* Naive linear scan that's efficient when
+     * LEN_LHS <= SUBTRACT_LHS_THRESHOLD. */
+    SUBTRACT_STAGE_NAIVE_LHS,
+
+    SUBTRACT_STAGE_LEN_RHS,
+
+    /* As SUBTRACT_STAGE_NAIVE_LHS but for RHS. */
+    SUBTRACT_STAGE_NAIVE_RHS,
+
+    /* Creates a multiset from RHS for faster lookups before sweeping through
+     * LHS. The set is implemented as a red-black tree and duplicate elements
+     * are handled by a counter on each node. */
+    SUBTRACT_STAGE_SET_BUILD,
+    SUBTRACT_STAGE_SET_FINISH
+} ErtsSubtractCtxStage;
+
+typedef struct subtract_node__ {
+    struct subtract_node__ *parent;
+    struct subtract_node__ *left;
+    struct subtract_node__ *right;
+    int is_red;
+
+    Eterm key;
+    Uint count;
+} subtract_tree_t;
+
+typedef struct {
+    ErtsSubtractCtxStage stage;
+
+    Eterm lhs_original;
+    Eterm rhs_original;
+
+    Uint lhs_remaining;
+    Uint rhs_remaining;
+
+    Eterm iterator;
+
+    Eterm *result_cdr;
+    Eterm result;
+
+    union {
+        Eterm lhs_elements[SUBTRACT_LHS_THRESHOLD];
+        Eterm rhs_elements[SUBTRACT_RHS_THRESHOLD];
+
+        struct {
+            subtract_tree_t *tree;
+
+            /* A memory area for the tree's nodes, saving us the need to have
+             * one allocation per node. */
+            subtract_tree_t *alloc_start;
+            subtract_tree_t *alloc;
+        } rhs_set;
+    } u;
+} ErtsSubtractContext;
+
+#define ERTS_RBT_PREFIX subtract
+#define ERTS_RBT_T subtract_tree_t
+#define ERTS_RBT_KEY_T Eterm
+#define ERTS_RBT_FLAGS_T int
+#define ERTS_RBT_INIT_EMPTY_TNODE(T) \
+    do { \
+        (T)->parent = NULL; \
+        (T)->left = NULL; \
+        (T)->right = NULL; \
+    } while(0)
+#define ERTS_RBT_IS_RED(T) ((T)->is_red)
+#define ERTS_RBT_SET_RED(T) ((T)->is_red = 1)
+#define ERTS_RBT_IS_BLACK(T) (!ERTS_RBT_IS_RED(T))
+#define ERTS_RBT_SET_BLACK(T) ((T)->is_red = 0)
+#define ERTS_RBT_GET_FLAGS(T) ((T)->is_red)
+#define ERTS_RBT_SET_FLAGS(T, F) ((T)->is_red = F)
+#define ERTS_RBT_GET_PARENT(T) ((T)->parent)
+#define ERTS_RBT_SET_PARENT(T, P) ((T)->parent = P)
+#define ERTS_RBT_GET_RIGHT(T) ((T)->right)
+#define ERTS_RBT_SET_RIGHT(T, R) ((T)->right = (R))
+#define ERTS_RBT_GET_LEFT(T) ((T)->left)
+#define ERTS_RBT_SET_LEFT(T, L) ((T)->left = (L))
+#define ERTS_RBT_GET_KEY(T) ((T)->key)
+#define ERTS_RBT_IS_LT(KX, KY) (CMP_TERM(KX, KY) < 0)
+#define ERTS_RBT_IS_EQ(KX, KY) EQ(KX, KY)
+#define ERTS_RBT_WANT_LOOKUP_INSERT
+#define ERTS_RBT_WANT_LOOKUP
+#define ERTS_RBT_WANT_DELETE
+#define ERTS_RBT_UNDEF
+
+#include "erl_rbtree.h"
+
+static int subtract_continue(Process *p, ErtsSubtractContext *context);
+
+static void subtract_ctx_dtor(ErtsSubtractContext *context) {
+    switch (context->stage) {
+        case SUBTRACT_STAGE_SET_BUILD:
+        case SUBTRACT_STAGE_SET_FINISH:
+            erts_free(ERTS_ALC_T_LIST_TRAP, context->u.rhs_set.alloc_start);
+            break;
+        default:
+            break;
     }
-    if ((m = erts_list_length(B)) < 0) {
-	BIF_ERROR(p, BADARG);
+}
+
+static int subtract_ctx_bin_dtor(Binary *context_bin) {
+    ErtsSubtractContext *context = ERTS_MAGIC_BIN_DATA(context_bin);
+    subtract_ctx_dtor(context);
+    return 1;
+}
+
+static void subtract_ctx_move(ErtsSubtractContext *from,
+                              ErtsSubtractContext *to) {
+    int uses_result_cdr = 0;
+
+    to->stage = from->stage;
+
+    to->lhs_original = from->lhs_original;
+    to->rhs_original = from->rhs_original;
+
+    to->lhs_remaining = from->lhs_remaining;
+    to->rhs_remaining = from->rhs_remaining;
+
+    to->iterator = from->iterator;
+    to->result = from->result;
+
+    switch (to->stage) {
+        case SUBTRACT_STAGE_NAIVE_LHS:
+            sys_memcpy(to->u.lhs_elements,
+                       from->u.lhs_elements,
+                       sizeof(Eterm) * to->lhs_remaining);
+            break;
+        case SUBTRACT_STAGE_NAIVE_RHS:
+            sys_memcpy(to->u.rhs_elements,
+                       from->u.rhs_elements,
+                       sizeof(Eterm) * to->rhs_remaining);
+
+            uses_result_cdr = 1;
+            break;
+        case SUBTRACT_STAGE_SET_FINISH:
+            uses_result_cdr = 1;
+            /* FALL THROUGH */
+        case SUBTRACT_STAGE_SET_BUILD:
+            to->u.rhs_set.alloc_start = from->u.rhs_set.alloc_start;
+            to->u.rhs_set.alloc = from->u.rhs_set.alloc;
+            to->u.rhs_set.tree = from->u.rhs_set.tree;
+            break;
+        default:
+            break;
     }
-    
-    if (n == 0)
-	BIF_RET(NIL);
-    if (m == 0)
-	BIF_RET(A);
-    
-    /* allocate element vector */
-    if (n <= SMALL_VEC_SIZE)
-	vec_p = small_vec;
-    else
-	vec_p = (Eterm*) erts_alloc(ERTS_ALC_T_TMP, n * sizeof(Eterm));
-    
-    /* PUT ALL ELEMENTS IN VP */
-    vp = vec_p;
-    list = A;
-    i = n;
-    while(i--) {
-	Eterm* listp = list_val(list);
-	*vp++ = CAR(listp);
-	list = CDR(listp);
+
+    if (uses_result_cdr) {
+        if (from->result_cdr == &from->result) {
+            to->result_cdr = &to->result;
+        } else {
+            to->result_cdr = from->result_cdr;
+        }
     }
-    
-    /* UNMARK ALL DELETED CELLS */
-    list = B;
-    m = 0;  /* number of deleted elements */
-    while(is_list(list)) {
-	Eterm* listp = list_val(list);
-	Eterm  elem = CAR(listp);
-	i = n;
-	vp = vec_p;
-	while(i--) {
-	    if (is_value(*vp) && eq(*vp, elem)) {
-		*vp = THE_NON_VALUE;
-		m++;
-		break;
-	    }
-	    vp++;
-	}
-	list = CDR(listp);
+}
+
+static Eterm subtract_create_trap_state(Process *p,
+                                        ErtsSubtractContext *context) {
+    Binary *state_bin;
+    Eterm *hp;
+
+    state_bin = erts_create_magic_binary(sizeof(ErtsSubtractContext),
+                                         subtract_ctx_bin_dtor);
+
+    subtract_ctx_move(context, ERTS_MAGIC_BIN_DATA(state_bin));
+
+    hp = HAlloc(p, ERTS_MAGIC_REF_THING_SIZE);
+
+    return erts_mk_magic_ref(&hp, &MSO(p), state_bin);
+}
+
+static int subtract_enter_len_lhs(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_LEN_LHS;
+
+    context->iterator = context->lhs_original;
+    context->lhs_remaining = 0;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_enter_len_rhs(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_LEN_RHS;
+
+    context->iterator = context->rhs_original;
+    context->rhs_remaining = 0;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_get_length(Process *p, Eterm *iterator_p, Uint *count_p) {
+    static const Sint ELEMENTS_PER_RED = 32;
+
+    Sint budget, count;
+    Eterm iterator;
+
+    budget = ELEMENTS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    iterator = *iterator_p;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    for (count = 0; count < budget && is_list(iterator); count++) {
+        iterator = CDR(list_val(iterator));
     }
-    
-    if (m == n)      /* All deleted ? */
-	res = NIL;
-    else if (m == 0)  /* None deleted ? */
-	res = A;
-    else {			/* REBUILD LIST */
-	res = NIL;
-	need = 2*(n - m);
-	hp = HAlloc(p, need);
-	vp = vec_p + n - 1;
-	while(vp >= vec_p) {
-	    if (is_value(*vp)) {
-		res = CONS(hp, *vp, res);
-		hp += 2;
-	    }
-	    vp--;
-	}
+
+    if (!is_list(iterator) && !is_nil(iterator)) {
+        return -1;
+    }
+
+    BUMP_REDS(p, count / ELEMENTS_PER_RED);
+
+    *iterator_p = iterator;
+    *count_p += count;
+
+    if (is_nil(iterator)) {
+        return 1;
     }
-    if (vec_p != small_vec)
-	erts_free(ERTS_ALC_T_TMP, (void *) vec_p);
-    BIF_RET(res);
+
+    return 0;
 }
 
-BIF_RETTYPE ebif_minusminus_2(BIF_ALIST_2)
-{
-    return subtract(BIF_P, BIF_ARG_1, BIF_ARG_2);
+static int subtract_enter_naive_lhs(Process *p, ErtsSubtractContext *context) {
+    Eterm iterator;
+    int i = 0;
+
+    context->stage = SUBTRACT_STAGE_NAIVE_LHS;
+
+    context->iterator = context->rhs_original;
+    context->result = NIL;
+
+    iterator = context->lhs_original;
+
+    while (is_list(iterator)) {
+        const Eterm *cell = list_val(iterator);
+
+        ASSERT(i < SUBTRACT_LHS_THRESHOLD);
+
+        context->u.lhs_elements[i++] = CAR(cell);
+        iterator = CDR(cell);
+    }
+
+    ASSERT(i == context->lhs_remaining);
+
+    return subtract_continue(p, context);
 }
 
-BIF_RETTYPE subtract_2(BIF_ALIST_2)
-{
-    return subtract(BIF_P, BIF_ARG_1, BIF_ARG_2);
+static int subtract_naive_lhs(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 16;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+    while (checks < budget && is_list(context->iterator)) {
+        const Eterm *cell;
+        Eterm value, next;
+        int found_at;
+
+        cell = list_val(context->iterator);
+
+        value = CAR(cell);
+        next = CDR(cell);
+
+        for (found_at = 0; found_at < context->lhs_remaining; found_at++) {
+            if (EQ(value, context->u.lhs_elements[found_at])) {
+                /* We shift the array one step down as we have to preserve
+                 * order.
+                 *
+                 * Note that we can't exit early as that would suppress errors
+                 * in the right-hand side (this runs prior to determining the
+                 * length of RHS). */
+
+                context->lhs_remaining--;
+                sys_memmove(&context->u.lhs_elements[found_at],
+                            &context->u.lhs_elements[found_at + 1],
+                            (context->lhs_remaining - found_at) * sizeof(Eterm));
+                break;
+            }
+        }
+
+        checks += MAX(1, context->lhs_remaining);
+        context->iterator = next;
+    }
+
+    BUMP_REDS(p, MIN(checks, budget) / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        return 0;
+    } else if (!is_nil(context->iterator)) {
+        return -1;
+    }
+
+    if (context->lhs_remaining > 0) {
+        Eterm *hp;
+        int i;
+
+        hp = HAlloc(p, context->lhs_remaining * 2);
+
+        for (i = context->lhs_remaining - 1; i >= 0; i--) {
+            Eterm value = context->u.lhs_elements[i];
+
+            context->result = CONS(hp, value, context->result);
+            hp += 2;
+        }
+    }
+
+    ASSERT(context->lhs_remaining > 0 || context->result == NIL);
+
+    return 1;
+}
+
+static int subtract_enter_naive_rhs(Process *p, ErtsSubtractContext *context) {
+    Eterm iterator;
+    int i = 0;
+
+    context->stage = SUBTRACT_STAGE_NAIVE_RHS;
+
+    context->iterator = context->lhs_original;
+    context->result_cdr = &context->result;
+    context->result = NIL;
+
+    iterator = context->rhs_original;
+
+    while (is_list(iterator)) {
+        const Eterm *cell = list_val(iterator);
+
+        ASSERT(i < SUBTRACT_RHS_THRESHOLD);
+
+        context->u.rhs_elements[i++] = CAR(cell);
+        iterator = CDR(cell);
+    }
+
+    ASSERT(i == context->rhs_remaining);
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_naive_rhs(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 16;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (checks < budget && is_list(context->iterator)) {
+        const Eterm *cell;
+        Eterm value, next;
+        int found_at;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        for (found_at = context->rhs_remaining - 1; found_at >= 0; found_at--) {
+            if (EQ(value, context->u.rhs_elements[found_at])) {
+                break;
+            }
+        }
+
+        if (found_at < 0) {
+            /* Destructively add the value to the result. This is safe
+             * since the GC is disabled and the unfinished term is never
+             * leaked to the outside world. */
+            Eterm *hp = HAllocX(p, 2, context->lhs_remaining * 2);
+
+            *context->result_cdr = make_list(hp);
+            context->result_cdr = &CDR(hp);
+
+            CAR(hp) = value;
+        } else if (found_at >= 0) {
+            Eterm swap;
+
+            if (context->rhs_remaining-- == 1) {
+                /* We've run out of items to remove, so the rest of the
+                 * result will be equal to the remainder of the input. We know
+                 * that LHS is well-formed as any errors would've been reported
+                 * during length determination. */
+                *context->result_cdr = next;
+
+                BUMP_REDS(p, MIN(budget, checks) / CHECKS_PER_RED);
+
+                return 1;
+            }
+
+            swap = context->u.rhs_elements[context->rhs_remaining];
+            context->u.rhs_elements[found_at] = swap;
+        }
+
+        checks += context->rhs_remaining;
+        context->iterator = next;
+        context->lhs_remaining--;
+    }
+
+    /* The result only has to be terminated when returning it to the user, but
+     * we're doing it when trapping as well to prevent headaches when
+     * debugging. */
+    *context->result_cdr = NIL;
+
+    BUMP_REDS(p, MIN(budget, checks) / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        ASSERT(context->lhs_remaining > 0 && context->rhs_remaining > 0);
+        return 0;
+    }
+
+    return 1;
+}
+
+static int subtract_enter_set_build(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_SET_BUILD;
+
+    context->u.rhs_set.alloc_start =
+        erts_alloc(ERTS_ALC_T_LIST_TRAP,
+                   context->rhs_remaining * sizeof(subtract_tree_t));
+
+    context->u.rhs_set.alloc = context->u.rhs_set.alloc_start;
+    context->u.rhs_set.tree = NULL;
+
+    context->iterator = context->rhs_original;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_set_build(Process *p, ErtsSubtractContext *context) {
+    const static Sint INSERTIONS_PER_RED = 16;
+    Sint budget, insertions;
+
+    budget = INSERTIONS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    insertions = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (insertions < budget && is_list(context->iterator)) {
+        subtract_tree_t *existing_node, *new_node;
+        const Eterm *cell;
+        Eterm value, next;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        new_node = context->u.rhs_set.alloc;
+        new_node->key = value;
+        new_node->count = 1;
+
+        existing_node = subtract_rbt_lookup_insert(&context->u.rhs_set.tree,
+                                                   new_node);
+
+        if (existing_node != NULL) {
+            existing_node->count++;
+        } else {
+            context->u.rhs_set.alloc++;
+        }
+
+        context->iterator = next;
+        insertions++;
+    }
+
+    BUMP_REDS(p, insertions / INSERTIONS_PER_RED);
+
+    ASSERT(is_list(context->iterator) || is_nil(context->iterator));
+    ASSERT(context->u.rhs_set.tree != NULL);
+
+    return is_nil(context->iterator);
+}
+
+static int subtract_enter_set_finish(Process *p, ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_SET_FINISH;
+
+    context->result_cdr = &context->result;
+    context->result = NIL;
+
+    context->iterator = context->lhs_original;
+
+    return subtract_continue(p, context);
+}
+
+static int subtract_set_finish(Process *p, ErtsSubtractContext *context) {
+    const Sint CHECKS_PER_RED = 8;
+    Sint checks, budget;
+
+    budget = CHECKS_PER_RED * ERTS_BIF_REDS_LEFT(p);
+    checks = 0;
+
+#ifdef DEBUG
+    budget = budget / 10 + 1;
+#endif
+
+    while (checks < budget && is_list(context->iterator)) {
+        subtract_tree_t *node;
+        const Eterm *cell;
+        Eterm value, next;
+
+        cell = list_val(context->iterator);
+        value = CAR(cell);
+        next = CDR(cell);
+
+        ASSERT(context->rhs_remaining > 0);
+
+        node = subtract_rbt_lookup(context->u.rhs_set.tree, value);
+
+        if (node == NULL) {
+            Eterm *hp = HAllocX(p, 2, context->lhs_remaining * 2);
+
+            *context->result_cdr = make_list(hp);
+            context->result_cdr = &CDR(hp);
+
+            CAR(hp) = value;
+        } else {
+            if (context->rhs_remaining-- == 1) {
+                *context->result_cdr = next;
+
+                BUMP_REDS(p, checks / CHECKS_PER_RED);
+
+                return 1;
+            }
+
+            if (node->count-- == 1) {
+                subtract_rbt_delete(&context->u.rhs_set.tree, node);
+            }
+        }
+
+        context->iterator = next;
+        context->lhs_remaining--;
+        checks++;
+    }
+
+    *context->result_cdr = NIL;
+
+    BUMP_REDS(p, checks / CHECKS_PER_RED);
+
+    if (is_list(context->iterator)) {
+        ASSERT(context->lhs_remaining > 0 && context->rhs_remaining > 0);
+        return 0;
+    }
+
+    return 1;
+}
+
+static int subtract_continue(Process *p, ErtsSubtractContext *context) {
+    switch (context->stage) {
+        case SUBTRACT_STAGE_START: {
+            return subtract_enter_len_lhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_LEN_LHS: {
+            int res = subtract_get_length(p,
+                                          &context->iterator,
+                                          &context->lhs_remaining);
+
+            if (res != 1) {
+                return res;
+            }
+
+            if (context->lhs_remaining <= SUBTRACT_LHS_THRESHOLD) {
+                return subtract_enter_naive_lhs(p, context);
+            }
+
+            return subtract_enter_len_rhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_NAIVE_LHS: {
+            return subtract_naive_lhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_LEN_RHS: {
+            int res = subtract_get_length(p,
+                                          &context->iterator,
+                                          &context->rhs_remaining);
+
+            if (res != 1) {
+                return res;
+            }
+
+            /* We've walked through both lists fully now so we no longer need
+             * to check for errors past this point. */
+
+            if (context->rhs_remaining <= SUBTRACT_RHS_THRESHOLD) {
+                return subtract_enter_naive_rhs(p, context);
+            }
+
+            return subtract_enter_set_build(p, context);
+        }
+
+        case SUBTRACT_STAGE_NAIVE_RHS: {
+            return subtract_naive_rhs(p, context);
+        }
+
+        case SUBTRACT_STAGE_SET_BUILD: {
+            int res = subtract_set_build(p, context);
+
+            if (res != 1) {
+                return res;
+            }
+
+            return subtract_enter_set_finish(p, context);
+        }
+
+        case SUBTRACT_STAGE_SET_FINISH: {
+            return subtract_set_finish(p, context);
+        }
+
+        default:
+            ERTS_ASSERT(!"unreachable");
+    }
+}
+
+static int subtract_start(Process *p, Eterm lhs, Eterm rhs,
+                          ErtsSubtractContext *context) {
+    context->stage = SUBTRACT_STAGE_START;
+
+    context->lhs_original = lhs;
+    context->rhs_original = rhs;
+
+    return subtract_continue(p, context);
 }
 
+/* erlang:'--'/2 */
+static Eterm subtract(Export *bif_entry, BIF_ALIST_2) {
+    Eterm lhs = BIF_ARG_1, rhs = BIF_ARG_2;
+
+    if ((is_list(lhs) || is_nil(lhs)) && (is_list(rhs) || is_nil(rhs))) {
+        /* We start with the context on the stack in the hopes that we won't
+         * have to trap. */
+        ErtsSubtractContext context;
+        int res;
+
+        res = subtract_start(BIF_P, lhs, rhs, &context);
+
+        if (res == 0) {
+            Eterm state_mref;
+
+            state_mref = subtract_create_trap_state(BIF_P, &context);
+            erts_set_gc_state(BIF_P, 0);
+
+            BIF_TRAP2(bif_entry, BIF_P, state_mref, NIL);
+        }
+
+        subtract_ctx_dtor(&context);
+
+        if (res < 0) {
+            BIF_ERROR(BIF_P, BADARG);
+        }
+
+        BIF_RET(context.result);
+    } else if (is_internal_magic_ref(lhs)) {
+        ErtsSubtractContext *context;
+        int (*dtor)(Binary*);
+        Binary *magic_bin;
+
+        int res;
+
+        magic_bin = erts_magic_ref2bin(lhs);
+        dtor = ERTS_MAGIC_BIN_DESTRUCTOR(magic_bin);
+
+        if (dtor != subtract_ctx_bin_dtor) {
+            BIF_ERROR(BIF_P, BADARG);
+        }
+
+        ASSERT(BIF_P->flags & F_DISABLE_GC);
+        ASSERT(rhs == NIL);
+
+        context = ERTS_MAGIC_BIN_DATA(magic_bin);
+        res = subtract_continue(BIF_P, context);
+
+        if (res == 0) {
+            BIF_TRAP2(bif_entry, BIF_P, lhs, NIL);
+        }
+
+        erts_set_gc_state(BIF_P, 1);
+
+        if (res < 0) {
+            ERTS_BIF_ERROR_TRAPPED2(BIF_P, BADARG, bif_entry,
+                                    context->lhs_original,
+                                    context->rhs_original);
+        }
+
+        BIF_RET(context->result);
+    }
+
+    ASSERT(!(BIF_P->flags & F_DISABLE_GC));
+
+    BIF_ERROR(BIF_P, BADARG);
+}
+
+BIF_RETTYPE ebif_minusminus_2(BIF_ALIST_2) {
+    return subtract(bif_export[BIF_ebif_minusminus_2], BIF_CALL_ARGS);
+}
+
+BIF_RETTYPE subtract_2(BIF_ALIST_2) {
+    return subtract(bif_export[BIF_subtract_2], BIF_CALL_ARGS);
+}
+
+
 BIF_RETTYPE lists_member_2(BIF_ALIST_2)
 {
     Eterm term;
diff --git a/lib/stdlib/doc/src/lists.xml b/lib/stdlib/doc/src/lists.xml
index 7efafedc82..55227aaee5 100644
--- a/lib/stdlib/doc/src/lists.xml
+++ b/lib/stdlib/doc/src/lists.xml
@@ -838,14 +838,6 @@ splitwith(Pred, List) ->
 > <input>lists:subtract("123212", "212").</input>
 "312".</pre>
         <p><c>lists:subtract(A, B)</c> is equivalent to <c>A -- B</c>.</p>
-        <warning>
-          <p>The complexity of <c>lists:subtract(A, B)</c> is proportional to
-            <c>length(A)*length(B)</c>, meaning that it is very slow if both
-            <c>A</c> and <c>B</c> are long lists. (If both lists are long, it
-            is a much better choice to use ordered lists and
-            <seealso marker="ordsets#subtract/2">
-            <c>ordsets:subtract/2</c></seealso>.</p>
-        </warning>
       </desc>
     </func>
 
diff --git a/lib/stdlib/test/lists_SUITE.erl b/lib/stdlib/test/lists_SUITE.erl
index 7c99244b36..c380b3bba1 100644
--- a/lib/stdlib/test/lists_SUITE.erl
+++ b/lib/stdlib/test/lists_SUITE.erl
@@ -2597,6 +2597,13 @@ subtract(Config) when is_list(Config) ->
     {'EXIT',_} = (catch sub([a|b], [])),
     {'EXIT',_} = (catch sub([a|b], [a])),
 
+    %% Trapping, both crashing and otherwise.
+    [sub_trapping(N) || N <- lists:seq(0, 18)],
+
+    %% The current implementation chooses which algorithm to use based on
+    %% certain thresholds, and we need proper coverage for all corner cases.
+    [sub_thresholds(N) || N <- lists:seq(0, 32)],
+
     ok.
 
 sub_non_matching(A, B) ->
@@ -2606,6 +2613,41 @@ sub(A, B) ->
     Res = A -- B,
     Res = lists:subtract(A, B).
 
+sub_trapping(N) ->
+    List = lists:duplicate(N + (1 bsl N), gurka),
+    ImproperList = List ++ crash,
+
+    {'EXIT',_} = (catch sub_trapping_1(ImproperList, [])),
+    {'EXIT',_} = (catch sub_trapping_1(List, ImproperList)),
+
+    List = List -- lists:duplicate(N + (1 bsl N), gaffel),
+    ok = sub_trapping_1(List, []).
+
+sub_trapping_1([], _) -> ok;
+sub_trapping_1(L, R) -> sub_trapping_1(L -- R, [gurka | R]).
+
+sub_thresholds(N) ->
+    %% This needs to be long enough to cause trapping.
+    OtherLen = 1 bsl 18,
+    Other = lists:seq(0, OtherLen - 1),
+
+    Disjoint = lists:seq(-N, -1),
+    Subset = lists:seq(1, N),
+
+    %% LHS is disjoint from RHS, so all elements must be retained.
+    Disjoint = Disjoint -- Other,
+
+    %% LHS is covered by RHS, so all elements must be removed.
+    [] = Subset -- Other,
+
+    %% RHS is disjoint from LHS, so all elements must be retained.
+    Other = Other -- Disjoint,
+
+    %% RHS is covered by LHS, so N elements must be removed.
+    N = OtherLen - length(Other -- Subset),
+
+    ok.
+
 %% Test lists:droplast/1
 droplast(Config) when is_list(Config) ->
     [] = lists:droplast([x]),
diff --git a/system/doc/efficiency_guide/commoncaveats.xml b/system/doc/efficiency_guide/commoncaveats.xml
index b41ffc3902..367da09ba3 100644
--- a/system/doc/efficiency_guide/commoncaveats.xml
+++ b/system/doc/efficiency_guide/commoncaveats.xml
@@ -169,53 +169,5 @@ multiple_setelement(T0) ->
         {Bin1,Bin2} = split_binary(Bin, Num)</code>
    </section>
 
-  <section>
-    <title>Operator "--"</title>
-     <p>The "<c>--</c>" operator has a complexity
-     proportional to the product of the length of its operands.
-     This means that the operator is very slow if both of its operands
-     are long lists:</p>
-
-        <p><em>DO NOT</em></p>
-        <code type="none"><![CDATA[
-        HugeList1 -- HugeList2]]></code>
-
-     <p>Instead use the <seealso marker="stdlib:ordsets">ordsets</seealso>
-     module in STDLIB:</p>
-
-        <p><em>DO</em></p>
-        <code type="none">
-        HugeSet1 = ordsets:from_list(HugeList1),
-        HugeSet2 = ordsets:from_list(HugeList2),
-        ordsets:subtract(HugeSet1, HugeSet2)</code>
-
-     <p>Obviously, that code does not work if the original order
-     of the list is important. If the order of the list must be
-     preserved, do as follows:</p>
-
-        <p><em>DO</em></p>
-        <code type="none"><![CDATA[
-        Set = gb_sets:from_list(HugeList2),
-        [E || E <- HugeList1, not gb_sets:is_element(E, Set)]]]></code>
-
-     <note><p>This code behaves differently from "<c>--</c>"
-     if the lists contain duplicate elements (one occurrence
-     of an element in HugeList2 removes <em>all</em>
-     occurrences in HugeList1.)</p>
-     <p>Also, this code compares lists elements using the
-     "<c>==</c>" operator, while "<c>--</c>" uses the "<c>=:=</c>" operator.
-     If that difference is important, <c>sets</c> can be used instead of
-     <c>gb_sets</c>, but <c>sets:from_list/1</c> is much
-     slower than <c>gb_sets:from_list/1</c> for long lists.</p></note>
-
-     <p>Using the "<c>--</c>" operator to delete an element
-     from a list is not a performance problem:</p>
-
-        <p><em>OK</em></p>
-        <code type="none">
-        HugeList1 -- [Element]</code>
-
-   </section>
-
 </chapter>
 
diff --git a/system/doc/efficiency_guide/retired_myths.xml b/system/doc/efficiency_guide/retired_myths.xml
index 9b914a3b6e..144c942c2b 100644
--- a/system/doc/efficiency_guide/retired_myths.xml
+++ b/system/doc/efficiency_guide/retired_myths.xml
@@ -60,4 +60,18 @@
     That leads us to the myth that tail-recursive functions are faster
     than body-recursive functions.</p>
   </section>
+
+  <section>
+    <title>Myth: List subtraction ("--" operator) is slow</title>
+
+    <p>List subtraction used to have a run-time complexity proportional to the
+      product of the length of its operands, so it was extremely slow when both
+      lists were long.</p>
+
+    <p>As of OTP 22 the run-time complexity is "n log n" and the operation will
+      complete quickly even when both lists are very long. In fact, it is
+      faster and uses less memory than the commonly used workaround to convert
+      both lists to ordered sets before subtracting them with
+      <c>ordsets:subtract/2</c>.</p>
+  </section>
 </chapter>
author	John Högberg <[email protected]>	2018-10-15 18:17:12 +0200
committer	Lukas Larsson <[email protected]>	2018-11-02 17:15:32 +0100
commit	bf5886b790f8f386ed425f543506a4bebb48448c (patch)
tree	b2ef444c7bc5fc21b9e1e4cf905041ca70832c78
parent	db6059a9217767a6e42e93cec05089c0ec977d20 (diff)
download	otp-bf5886b790f8f386ed425f543506a4bebb48448c.tar.gz otp-bf5886b790f8f386ed425f543506a4bebb48448c.tar.bz2 otp-bf5886b790f8f386ed425f543506a4bebb48448c.zip