From fdc8980231b1e791ec4b8f8f3d61a7ba7dda539b Mon Sep 17 00:00:00 2001 From: Patrik Nyblom Date: Mon, 2 Nov 2009 15:31:21 +0100 Subject: Initial commit of the binary EEP --- erts/emulator/beam/atom.names | 2 + erts/emulator/beam/bif.tab | 34 ++ erts/emulator/beam/binary.c | 976 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1012 insertions(+) (limited to 'erts/emulator/beam') diff --git a/erts/emulator/beam/atom.names b/erts/emulator/beam/atom.names index 9ce21089ba..e63d56b32a 100644 --- a/erts/emulator/beam/atom.names +++ b/erts/emulator/beam/atom.names @@ -65,6 +65,7 @@ atom EXIT='EXIT' atom aborted atom abs_path atom absoluteURI +atom ac atom active atom all atom all_but_first @@ -102,6 +103,7 @@ atom bif_return_trap atom binary atom block atom blocked +atom bm atom bnot atom bor atom bxor diff --git a/erts/emulator/beam/bif.tab b/erts/emulator/beam/bif.tab index b6fa06354a..8d81aa7eba 100644 --- a/erts/emulator/beam/bif.tab +++ b/erts/emulator/beam/bif.tab @@ -754,6 +754,40 @@ bif erlang:load_nif/2 bif erlang:call_on_load_function/1 bif erlang:finish_after_on_load/2 +# The binary match bifs (New in R13B04 - EEP9) +# + +# +# The searching/splitting/substituting thingies +# +bif binary:match_compile/1 +bif binary:match/2 +bif binary:match/3 +bif binary:matches/2 +bif binary:matches/3 +# bif binary:split/2 +# bif binary:split/3 +# bif binary:substitute/3 +# bif binary:globally_substitute/3 +# bif binary:duplicate/2 + +# +# XXX:PaN Usecase for these two? Creeping Biffilism? +# +# bif binary:from_unsigned/1 +# bif binary:to_unsigned/1 + +# +# XXX:PaN The following are suggested to be implemented in the erlang code... +# - or are they meant to be guard bif's? +# +# binary:first/1 +# binary:first/2 +# binary:last/1 +# binary:last/2 +# binary:nth/2 +# binary:extract/3 + # # New Bifs in R13B4 # diff --git a/erts/emulator/beam/binary.c b/erts/emulator/beam/binary.c index 59c20398d5..29c1af2114 100644 --- a/erts/emulator/beam/binary.c +++ b/erts/emulator/beam/binary.c @@ -675,3 +675,979 @@ bitstr_list_len(Eterm obj) DESTROY_ESTACK(s); return (Sint) -1; } + +#define HARDDEBUG + +/* + * The native implementation functions for the module binary. + * Searching is implemented using aither Boyer-More or Aho-Corasick + * depending on number of searchstrings (BM if one, AC if more than one). + * Native implementation is for efficiency, nothing really *needs* to be + * implemented in native code. + */ + +/* + * A micro allocator used when building search structures, just a convenience + * for building structures inside a pre alocated magic binary using conventional + * malloc-like interface. + */ + +#define MYALIGN(Size) (SIZEOF_VOID_P * (((Size) / SIZEOF_VOID_P) + \ + !!(((Size) % SIZEOF_VOID_P)))) + +#ifdef DEBUG +#define CHECK_ALLOCATOR(My) ASSERT((My).current <= ((My).mem + (My).size)) +#else +#define CHECK_ALLOCATOR(My) /* nothing */ +#endif + +typedef struct _my_allocator { + Uint size; + byte *current; + byte *mem; +} MyAllocator; + +static void init_my_allocator(MyAllocator *my, Uint siz, byte *array) +{ + ASSERT((siz % SIZEOF_VOID_P) == 0); + my->size = siz; + my->mem = array; + my->current = my->mem; +} + +static void *my_alloc(MyAllocator *my, Uint size) +{ + void *ptr = my->current; + my->current += MYALIGN(size); + return ptr; +} + +/* + * The search functionality. + * + * The search is byte oriented, which works nicely for UTF-8 as well as latin1 data + */ + +#define ALPHABET_SIZE 256 + +typedef struct _ac_node { +#ifdef HARDDEBUG + Uint32 id; /* To identify h pointer targets when dumping */ +#endif + Uint32 d; /* Depth in trie, also represents the length + (-1) of the matched string if in + final set */ + Sint32 final; /* Members in final set represent matches. + * The set representation is scattered + * among the nodes in this way: + * >0 -> this represents a member of + * the final set, <0 -> member of + * final set somewhere in the failure chain, + * 0 -> not member of the final set */ + struct _ac_node *h; /* h(Hode) is the failure function */ + struct _ac_node *g[ALPHABET_SIZE]; /* g(Node,Character) is the + transition function */ +} ACNode; + +typedef struct _ac_trie { +#ifdef HARDDEBUG + Uint32 idc; +#endif + Uint32 counter; /* Number of added patterns */ + ACNode *root; /* pointer to the root state */ +} ACTrie; + +typedef struct _bm_data { + int ret_tuple; + byte *x; + Sint len; + Sint *goodshift; + Sint badshift[ALPHABET_SIZE]; +} BMData; + +#ifdef HARDDEBUG +static void dump_bm_data(BMData *bm); +static void dump_ac_trie(ACTrie *act); +static void dump_ac_node(ACNode *node, int indent, int ch); +#endif + +/* + * The needed size of binary data for a search structure - given the accumulated + * string lengths. + */ +#define BM_SIZE(StrLen) /* StrLen: length of searchstring */ \ +((MYALIGN(sizeof(Sint) * (StrLen))) + /* goodshift array */ \ + MYALIGN(StrLen) + /* searchstring saved */ \ + (MYALIGN(sizeof(BMData)))) /* Structure */ + +#define AC_SIZE(StrLens) /* StrLens: sum of all searchstring lengths */ \ +((MYALIGN(sizeof(ACNode)) * \ +((StrLens)+1)) + /* The actual nodes (including rootnode) */ \ + MYALIGN(sizeof(ACTrie))) /* Structure */ + + +#ifndef MAX +#define MAX(A,B) (((A) > (B)) ? (A) : B) +#endif + +/* + * Callback for the magic binary + */ +static void cleanup_my_data(Binary *bp) +{ + return; +} + +/* + * Initiate a (allocated) micro allocator and fill in the base + * for an Aho-Corasick search trie, given the accumulated length of the search strings. + */ +static ACTrie *create_acdata(MyAllocator *my, Process *p, Uint len, + ACNode ***qbuff /* out */,Eterm *the_bin /* out */) +{ + Uint datasize = AC_SIZE(len); + ACTrie *act; + ACNode *acn; + Eterm *hp; + Binary *mb = erts_create_magic_binary(datasize,cleanup_my_data); + byte *data = ERTS_MAGIC_BIN_DATA(mb); + + init_my_allocator(my, datasize, data); + act = my_alloc(my, sizeof(ACTrie)); /* Important that this is the first + allocation */ + act->counter = 0; + act->root = acn = my_alloc(my, sizeof(ACNode)); + acn->d = 0; + acn->final = 0; + acn->h = NULL; + memset(acn->g, 0, sizeof(ACNode *) * ALPHABET_SIZE); +#ifdef HARDDEBUG + act->idc = 0; + acn->id = 0; +#endif + *qbuff = erts_alloc(ERTS_ALC_T_TMP, sizeof(ACNode *) * len); + hp = HAlloc(p, PROC_BIN_SIZE); + *the_bin = erts_mk_magic_binary_term(&hp, &MSO(p), mb); + return act; +} + +/* + * The same initialization of allocator and basic data for Boyer-More. + */ +static BMData *create_bmdata(MyAllocator *my, Process *p, byte *x, Uint len, Eterm *the_bin /* out */) +{ + Uint datasize = BM_SIZE(len); + BMData *bmd; + Eterm *hp; + Binary *mb = erts_create_magic_binary(datasize,cleanup_my_data); + byte *data = ERTS_MAGIC_BIN_DATA(mb); + init_my_allocator(my, datasize, data); + bmd = my_alloc(my, sizeof(BMData)); + bmd->x = my_alloc(my,len); + memcpy(bmd->x,x,len); + bmd->len = len; + bmd->goodshift = my_alloc(my,sizeof(Uint) * len); + bmd->ret_tuple = 0; + hp = HAlloc(p, PROC_BIN_SIZE); + *the_bin = erts_mk_magic_binary_term(&hp, &MSO(p), mb); + return bmd; +} + +/* + * Compilation of search structures + */ + +/* + * Aho Corasick - Build a Trie and fill in the failure functions + * when all strings are added. + * The algorithm is nicely described by Dieter Bühler of University of Tübingen: + * http://www-sr.informatik.uni-tuebingen.de/~buehler/AC/AC.html + */ + +/* + * Helper called ance for each search pattern + */ +static void ac_add_one_pattern(MyAllocator *my, ACTrie *act, byte *x, Uint len) +{ + ACNode *acn = act->root; + Uint32 n = ++act->counter; /* Always increase conter, even if it's a duplicate + as this shall identify the pattern in the + final set and eventually be returned to + the caller (in Erlang) */ + Uint i = 0; + + while(i < len) { + if (acn->g[x[i]] != NULL) { + /* node exists, continue */ + acn = acn->g[x[i]]; + ++i; + } else { + /* allocate a new node */ + ACNode *nn = my_alloc(my,sizeof(ACNode)); +#ifdef HARDDEBUG + nn->id = ++(act->idc); +#endif + nn->d = i+1; + nn->h = act->root; + nn->final = 0; + memset(nn->g, 0, sizeof(ACNode *) * ALPHABET_SIZE); + acn->g[x[i]] = nn; + ++i; + acn = nn; + } + } + if (acn->final == 0) { /* New pattern, add to final set */ + acn->final = n; + } +} + +/* + * Called when all search patterns are added. + */ +static void ac_compute_failure_functions(ACTrie *act, ACNode **qbuff) +{ + ACNode *root = act->root; + ACNode *parent; + int i; + int qh = 0,qt = 0; + ACNode *child, *r; + + /* Set all children of the root to have the root as failure function */ + for (i = 0; i < ALPHABET_SIZE; ++i) { + if (root->g[i] != NULL) { + root->g[i]->h = root; + /* Add to que for later traversal */ + qbuff[qt++] = root->g[i]; + } + } + + /* So, now we've handled children of the root state, traverse the + rest of the trie BF... */ + while (qh < qt) { + parent = qbuff[qh++]; + for (i = 0; i < ALPHABET_SIZE; ++ i) { + if ((child = parent->g[i]) != NULL) { + /* Visit this node to */ + qbuff[qt++] = child; + /* Search for correct failure function, follow the parents failure + function until you find a similar transition funtion to this + childs */ + r = parent->h; + while (r != NULL && r->g[i] == NULL) { + r = r->h; + } + if (r == NULL) { + /* Replace NULL failures with the root as we go */ + child->h = (root->g[i] == NULL) ? root : root->g[i]; + } else { + child->h = r->g[i]; + /* + * The "final" set is scattered among the nodes. When + * the failure function points to a member of the final set, + * we have a match, but we might not see it in the current node + * if we dont mark it as a special type of final, i.e. foolow + * the failure function and you will find a real member of final + * set. This is marked with a negative string id and only done if + * this node does not represent a member in the final set. + */ + if (!(child->final) && (child->h->final)) { + child->final = -1; + } + } + } + } + } + /* Finally the failure function of the root should point to itself */ + root->h = root; +} + +/* + * The actual searching for needles in the haystack... + * Find first match using Aho-Coracick Trie + * return pattern number and fill in mpos + mlen if found, otherwise return 0 + * Return the matching pattern that *starts* first, not ends + * first (difference when overlapping), hence the candidate thing. + * Basic AC finds the first end before the first start... + * + */ +static Uint ac_find_first_match(ACTrie *act, byte *haystack, Uint len, + Uint *mpos, Uint *mlen) +{ + ACNode *q = act->root; + Uint i = 0; + ACNode *candidate = NULL, *r; + Uint candidate_start = 0 /* Init not needed, just quiet the compiler */; + Uint rstart; + + while (i < len) { + while (q->g[haystack[i]] == NULL && q->h != q) { + q = q->h; + } + if (q->g[haystack[i]] != NULL) { + q = q->g[haystack[i]]; + } +#ifdef HARDDEBUG + erts_printf("ch = %c, Current: %u\n", (int) haystack[i], (unsigned) q->id); +#endif + ++i; + if (candidate != NULL && (i - q->d) > candidate_start) { + break; + } + if (q->final) { + r = q; + while (r->final < 0) + r = r->h; + rstart = i - r->d; + if (candidate == NULL || rstart < candidate_start) { + candidate_start = rstart; + candidate = r; + } + } + } + if (!candidate) { + return 0; + } +#ifdef HARDDEBUG + dump_ac_node(candidate,0,'?'); +#endif + *mpos = candidate_start; + *mlen = candidate->d; + return candidate->final; +} + +typedef struct _findall_data { + Uint pos; + Uint len; +#ifdef HARDDEBUG + Uint id; +#endif + Eterm epos; + Eterm elen; +#if 0 + Eterm eid; +#endif +} FindallData; +/* + * Returns number of non overlapping matches + */ +static Uint ac_find_all_non_overlapping(ACTrie *act, byte *haystack, Uint len, + FindallData **data) +{ + ACNode *q = act->root; + Uint i = 0; + Uint rstart; + ACNode *r; + Uint m = 0, save_m; + Uint allocated = 0; + FindallData *out = NULL; + + + while (i < len) { + while (q->g[haystack[i]] == NULL && q->h != q) { + q = q->h; + } + if (q->g[haystack[i]] != NULL) { + q = q->g[haystack[i]]; + } +#ifdef HARDDEBUG + erts_printf("ch = %c, Current: %u\n", (int) haystack[i], (unsigned) q->id); +#endif + ++i; + if (q->final) { + r = q; + while (r->final) { + while (r->final < 0) + r = r->h; +#ifdef HARDDEBUG + erts_printf("Trying to add %u\n",(unsigned) r->final); +#endif + rstart = i - r->d; + save_m = m; + while (m > 0 && out[m-1].pos > rstart) { +#ifdef HARDDEBUG + erts_printf("Popping %u\n",(unsigned) out[m-1].id); +#endif + --m; + } + if (m == 0 || out[m-1].pos + out[m-1].len <= rstart) { + if (m >= allocated) { + if (!allocated) { + allocated = 10; + out = erts_alloc(ERTS_ALC_T_TMP, sizeof(FindallData) * allocated); + } else { + allocated *= 2; + out = erts_realloc(ERTS_ALC_T_TMP, out, + sizeof(FindallData) * allocated); + } + } + out[m].pos = rstart; + out[m].len = r->d; +#ifdef HARDDEBUG + out[m].id = r->final; +#endif + ++m; +#ifdef HARDDEBUG + erts_printf("Pushing %u\n",(unsigned) out[m-1].id); +#endif + } else { +#ifdef HARDDEBUG + erts_printf("Backtracking %d steps\n",save_m - m); +#endif + m = save_m; + } + r = r->h; + } + } + } + *data = out; + return m; +} + +/* + * Boyer More - most obviously implemented more or less exactly as Christian Charras + * and Thierry Lecroq describes it in "Handbook of Exact String-Matching Algorithms" + * http://www-igm.univ-mlv.fr/~lecroq/string/ + */ + +/* + * Call this to compute badshifts array + */ +static void compute_badshifts(BMData *bmd) +{ + Sint i; + Sint m = bmd->len; + + for (i = 0; i < ALPHABET_SIZE; ++i) { + bmd->badshift[i] = m; + } + for (i = 0; i < m - 1; ++i) { + bmd->badshift[bmd->x[i]] = m - i - 1; + } +} + +/* Helper for "compute_goodshifts" */ +static void compute_suffixes(byte *x, Sint m, Sint *suffixes) +{ + int f,g,i; + + suffixes[m - 1] = m; + + f = 0; /* To avoid use before set warning */ + + g = m - 1; + + for (i = m - 2; i >= 0; --i) { + if (i > g && suffixes[i + m - f] < i - g) { + suffixes[i] = suffixes[i + m - 1 - f]; + } else { + if (i < g) { + g = i; + } + f = i; + while ( g >= 0 && x[g] == x[g + m - 1 - f] ) { + --g; + } + suffixes[i] = f - g; + } + } +} + +/* + * Call this to compute goodshift array + */ +static void compute_goodshifts(BMData *bmd) +{ + Sint m = bmd->len; + byte *x = bmd->x; + Sint i, j; + Sint *suffixes = erts_alloc(ERTS_ALC_T_TMP, m * sizeof(Uint)); + + compute_suffixes(x, m, suffixes); + + for (i = 0; i < m; ++i) { + bmd->goodshift[i] = m; + } + + j = 0; + + for (i = m - 1; i >= -1; --i) { + if (i == -1 || suffixes[i] == i + 1) { + while (j < m - 1 - i) { + if (bmd->goodshift[j] == m) { + bmd->goodshift[j] = m - 1 - i; + } + ++j; + } + } + } + for (i = 0; i <= m - 2; ++i) { + bmd->goodshift[m - 1 - suffixes[i]] = m - 1 - i; + } + erts_free(ERTS_ALC_T_TMP, suffixes); +} + +static Sint bm_find_first_match(BMData *bmd, byte *haystack, Uint len) +{ + Sint blen = bmd->len; + Sint *gs = bmd->goodshift; + Sint *bs = bmd->badshift; + byte *needle = bmd->x; + Sint i; + Sint j = 0; + + while (j <= len - blen) { + for (i = blen - 1; i >= 0 && needle[i] == haystack[i + j]; --i) + ; + if (i < 0) { /* found */ + return j; + } + j += MAX(gs[i],bs[haystack[i+j]] - blen + 1 + i); + } + return -1; +} + +/* + * Interface functions (i.e. "bif's") + */ + +/* + * Search functionality interfaces + */ +BIF_RETTYPE binary_match_compile_1(BIF_ALIST_1) +{ + Eterm t, b, comp_term = NIL; + Uint characters; + Uint words; + int return_tuple = 0; + + characters = 0; + words = 0; + + if (is_list(BIF_ARG_1)) { + return_tuple = 1; + t = BIF_ARG_1; + while (is_list(t)) { + b = CAR(list_val(t)); + t = CDR(list_val(t)); + if (!is_binary(b)) { + goto badarg; + } + if (binary_bitsize(b) != 0) { + goto badarg; + } + ++words; + characters += binary_size(b); + } + if (is_not_nil(t)) { + goto badarg; + } + if (words > 1) { + comp_term = BIF_ARG_1; + } else { + comp_term = CAR(list_val(BIF_ARG_1)); + } + } else if (is_binary(BIF_ARG_1)) { + if (binary_bitsize(BIF_ARG_1) != 0) { + goto badarg; + } + words = 1; + comp_term = BIF_ARG_1; + characters = binary_size(BIF_ARG_1); + } + + if (characters == 0) { + goto badarg; + } + ASSERT(words > 0); + + if (words == 1) { + Eterm ret; + byte *bytes; + Uint bitoffs, bitsize; + byte *temp_alloc = NULL; + MyAllocator my; + BMData *bmd; + Eterm *hp; + + ERTS_GET_BINARY_BYTES(comp_term, bytes, bitoffs, bitsize); + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(comp_term, &temp_alloc); + } + bmd = create_bmdata(&my, BIF_P, bytes, characters, &ret); + bmd->ret_tuple = return_tuple; + compute_badshifts(bmd); + compute_goodshifts(bmd); + erts_free_aligned_binary_bytes(temp_alloc); + CHECK_ALLOCATOR(my); + hp = HAlloc(BIF_P,3); + ret = TUPLE2(hp, am_bm, ret); + BIF_RET(ret); + } else { + Eterm ret; + ACTrie *act; + MyAllocator my; + Eterm *hp; + ACNode **qbuff; + + act = create_acdata(&my, BIF_P, characters, &qbuff, &ret); + t = comp_term; + while (is_list(t)) { + byte *bytes; + Uint bitoffs, bitsize; + byte *temp_alloc = NULL; + b = CAR(list_val(t)); + t = CDR(list_val(t)); + ERTS_GET_BINARY_BYTES(b, bytes, bitoffs, bitsize); + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(b, &temp_alloc); + } + ac_add_one_pattern(&my,act,bytes,binary_size(b)); + erts_free_aligned_binary_bytes(temp_alloc); + } + ac_compute_failure_functions(act,qbuff); + CHECK_ALLOCATOR(my); + erts_free(ERTS_ALC_T_TMP,qbuff); + hp = HAlloc(BIF_P,3); + ret = TUPLE2(hp, am_ac, ret); + BIF_RET(ret); + } + badarg: + BIF_ERROR(BIF_P,BADARG); +} + +BIF_RETTYPE binary_match_3(BIF_ALIST_3) +{ + Uint hsstart, hslen; + Eterm *tp; + if (is_not_binary(BIF_ARG_1)) { + goto badarg; + } + if (BIF_ARG_3 == ((Eterm) 0)) { + /* Invalid term, we're called from binary_match_2... */ + hsstart = 0; + hslen = binary_size(BIF_ARG_1); + } else if (is_tuple(BIF_ARG_3)) { + tp = tuple_val(BIF_ARG_3); + if (arityval(*tp) != 2) { + goto badarg; + } + if (!term_to_Uint(tp[1], &hsstart) || ((hsstart >> 16) >> 16) != 0) { + goto badarg; + } + if (!term_to_Uint(tp[2], &hslen) || ((hslen >> 16) >> 16) != 0) { + goto badarg; + } + if (hslen < hsstart) { + goto badarg; + } + if (hslen > binary_size(BIF_ARG_1)-1) { + goto badarg; /* XXX:PaN or should we take as much as we have ? */ + } + hslen = hslen + 1 - hsstart; + } else { + goto badarg; + } + if (hslen == 0) { + BIF_RET(am_nomatch); + } + if (is_tuple(BIF_ARG_2)) { + tp = tuple_val(BIF_ARG_2); + if (arityval(*tp) != 2 || is_not_atom(tp[1])) { + goto badarg; + } + if (tp[1] == am_bm && ERTS_TERM_IS_MAGIC_BINARY(tp[2])) { + Binary *mbp; + BMData *bm; + Sint pos; + byte *bytes; + Uint bitoffs, bitsize; + byte *temp_alloc = NULL; + Eterm ret; + Eterm *hp; + mbp = ((ProcBin *) binary_val(tp[2]))->val; + if (ERTS_MAGIC_BIN_DESTRUCTOR(mbp) != cleanup_my_data) { + goto badarg; + } + bm = (BMData *) ERTS_MAGIC_BIN_DATA(mbp); +#ifdef HARDDEBUG + dump_bm_data(bm); +#endif + ERTS_GET_BINARY_BYTES(BIF_ARG_1, bytes, bitoffs, bitsize); + if (bitsize != 0) { + goto badarg; + } + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + } + pos = bm_find_first_match(bm, bytes + hsstart, hslen); + if (pos < 0) { + ret = am_nomatch; + } else { + Eterm erlen = erts_make_integer((Uint) bm->len, BIF_P); + ret = erts_make_integer(pos+hsstart,BIF_P); + if (bm->ret_tuple) { + hp = HAlloc(BIF_P,3); + ret = TUPLE2(hp, ret, erlen); + } + } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(ret); + } else if (tp[1] == am_ac && ERTS_TERM_IS_MAGIC_BINARY(tp[2])) { + Binary *mbp; + ACTrie *act; + Uint pos, msn,rlen; + byte *bytes; + Uint bitoffs, bitsize; + byte *temp_alloc = NULL; + Eterm ret; + Eterm *hp; + + mbp = ((ProcBin *) binary_val(tp[2]))->val; + if (ERTS_MAGIC_BIN_DESTRUCTOR(mbp) != cleanup_my_data) { + goto badarg; + } + act = (ACTrie *) ERTS_MAGIC_BIN_DATA(mbp); +#ifdef HARDDEBUG + dump_ac_trie(act); +#endif + ERTS_GET_BINARY_BYTES(BIF_ARG_1, bytes, bitoffs, bitsize); + if (bitsize != 0) { + goto badarg; + } + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + } + msn = ac_find_first_match(act, bytes + hsstart, + hslen, &pos, &rlen); + if (msn == 0) { + ret = am_nomatch; + } else { + Eterm epos = erts_make_integer(pos+hsstart,BIF_P); + Eterm erlen = erts_make_integer(rlen,BIF_P); + hp = HAlloc(BIF_P,3); + ret = TUPLE2(hp, epos, erlen); + } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(ret); + } else { + goto badarg; + } + } else { + goto badarg; /* Compilation on the fly NYI */ + } + badarg: + BIF_ERROR(BIF_P,BADARG); +} +BIF_RETTYPE binary_match_2(BIF_ALIST_2) +{ + return binary_match_3(BIF_P,BIF_ARG_1,BIF_ARG_2,((Eterm) 0)); +} + +BIF_RETTYPE binary_matches_3(BIF_ALIST_3) +{ + Uint hsstart, hslen; + Eterm *tp; + if (is_not_binary(BIF_ARG_1)) { + goto badarg; + } + if (BIF_ARG_3 == ((Eterm) 0)) { + /* Invalid term, we're called from binary_match_2... */ + hsstart = 0; + hslen = binary_size(BIF_ARG_1); + } else if (is_tuple(BIF_ARG_3)) { + tp = tuple_val(BIF_ARG_3); + if (arityval(*tp) != 2) { + goto badarg; + } + if (!term_to_Uint(tp[1], &hsstart) || ((hsstart >> 16) >> 16) != 0) { + goto badarg; + } + if (!term_to_Uint(tp[2], &hslen) || ((hslen >> 16) >> 16) != 0) { + goto badarg; + } + if (hslen < hsstart) { + goto badarg; + } + if (hslen > binary_size(BIF_ARG_1)-1) { + goto badarg; /* XXX:PaN or should we take as much as we have ? */ + } + hslen = hslen + 1 - hsstart; + } else { + goto badarg; + } + if (hslen == 0) { + BIF_RET(am_nomatch); + } + if (is_tuple(BIF_ARG_2)) { + tp = tuple_val(BIF_ARG_2); + if (arityval(*tp) != 2 || is_not_atom(tp[1])) { + goto badarg; + } + if (tp[1] == am_bm && ERTS_TERM_IS_MAGIC_BINARY(tp[2])) { + Binary *mbp; + BMData *bm; + Sint pos; + byte *bytes; + Uint bitoffs, bitsize; + byte *temp_alloc = NULL; + Eterm ret; + Eterm *hp; + mbp = ((ProcBin *) binary_val(tp[2]))->val; + if (ERTS_MAGIC_BIN_DESTRUCTOR(mbp) != cleanup_my_data) { + goto badarg; + } + bm = (BMData *) ERTS_MAGIC_BIN_DATA(mbp); +#ifdef HARDDEBUG + dump_bm_data(bm); +#endif + ERTS_GET_BINARY_BYTES(BIF_ARG_1, bytes, bitoffs, bitsize); + if (bitsize != 0) { + goto badarg; + } + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + } + pos = bm_find_first_match(bm, bytes + hsstart, hslen); + if (pos < 0) { + ret = am_nomatch; + } else { + Eterm erlen = erts_make_integer((Uint) bm->len, BIF_P); + ret = erts_make_integer(pos,BIF_P); + if (bm->ret_tuple) { + hp = HAlloc(BIF_P,3); + ret = TUPLE2(hp, ret, erlen); + } + } + erts_free_aligned_binary_bytes(temp_alloc); + BIF_RET(ret); + } else if (tp[1] == am_ac && ERTS_TERM_IS_MAGIC_BINARY(tp[2])) { + Binary *mbp; + ACTrie *act; + Uint rlen; + Sint i; + FindallData *fad; + byte *bytes; + Uint bitoffs, bitsize; + byte *temp_alloc = NULL; + Eterm ret,tpl; + Eterm *hp; + + mbp = ((ProcBin *) binary_val(tp[2]))->val; + if (ERTS_MAGIC_BIN_DESTRUCTOR(mbp) != cleanup_my_data) { + goto badarg; + } + act = (ACTrie *) ERTS_MAGIC_BIN_DATA(mbp); +#ifdef HARDDEBUG + dump_ac_trie(act); +#endif + ERTS_GET_BINARY_BYTES(BIF_ARG_1, bytes, bitoffs, bitsize); + if (bitsize != 0) { + goto badarg; + } + if (bitoffs != 0) { + bytes = erts_get_aligned_binary_bytes(BIF_ARG_1, &temp_alloc); + } + rlen = ac_find_all_non_overlapping(act, bytes + hsstart, + hslen, &fad); + if (rlen == 0) { + ret = am_nomatch; + } else { + for (i = 0; i < rlen; ++i) { + fad[i].epos = erts_make_integer(fad[i].pos,BIF_P); + fad[i].elen = erts_make_integer(fad[i].len,BIF_P); + } + hp = HAlloc(BIF_P,rlen * (3 + 2)); + ret = NIL; + for (i = rlen - 1; i >= 0; --i) { + tpl = TUPLE2(hp, fad[i].epos, fad[i].elen); + hp +=3; + ret = CONS(hp,tpl,ret); + hp += 2; + } + } + erts_free_aligned_binary_bytes(temp_alloc); + if (fad != NULL) { + erts_free(ERTS_ALC_T_TMP,fad); + } + BIF_RET(ret); + } else { + goto badarg; + } + } else { + goto badarg; /* Compilation on the fly NYI */ + } + badarg: + BIF_ERROR(BIF_P,BADARG); +} +BIF_RETTYPE binary_matches_2(BIF_ALIST_2) +{ + return binary_matches_3(BIF_P,BIF_ARG_1,BIF_ARG_2,((Eterm) 0)); +} + +/* + * Hard debug functions (dump) for the search structures + */ + +#ifdef HARDDEBUG +static void dump_bm_data(BMData *bm) +{ + int i,j; + erts_printf("Dumping Boyer-More structure.\n"); + erts_printf("=============================\n"); + erts_printf("Return tuple: %d\n",bm->ret_tuple); + erts_printf("Searchstring [%ld]:\n", bm->len); + erts_printf("<<"); + for (i = 0; i < bm->len; ++i) { + if (i > 0) { + erts_printf(", "); + } + erts_printf("%d", (int) bm->x[i]); + if (bm->x[i] >= 'A') { + erts_printf(" ($%c)",(char) bm->x[i]); + } + } + erts_printf(">>\n"); + erts_printf("GoodShift array:\n"); + for (i = 0; i < bm->len; ++i) { + erts_printf("GoodShift[%d]: %ld\n", i, bm->goodshift[i]); + } + erts_printf("BadShift array:\n"); + j = 0; + for (i = 0; i < ALPHABET_SIZE; i += j) { + for (j = 0; i + j < ALPHABET_SIZE && j < 6; ++j) { + erts_printf("BS[%03d]:%02ld, ", i+j, bm->badshift[i+j]); + } + erts_printf("\n"); + } +} + +static void dump_ac_node(ACNode *node, int indent, int ch) { + int i; + char *spaces = erts_alloc(ERTS_ALC_T_TMP, 10 * indent + 1); + memset(spaces,' ',10*indent); + spaces[10*indent] = '\0'; + erts_printf("%s-> %c\n",spaces,ch); + erts_printf("%sId: %u\n",spaces,(unsigned) node->id); + erts_printf("%sD: %u\n",spaces,(unsigned)node->d); + erts_printf("%sFinal: %d\n",spaces,(int)node->final); + erts_printf("%sFail: %u\n",spaces,(unsigned)node->h->id); + erts_free(ERTS_ALC_T_TMP,spaces); + for(i=0;ig[i] != NULL && node->g[i] != node) { + dump_ac_node(node->g[i],indent+1,i); + } + } +} + + +static void dump_ac_trie(ACTrie *act) +{ + erts_printf("Aho Corasick Trie dump.\n"); + erts_printf("=======================\n"); + erts_printf("Node counter: %u\n", (unsigned) act->idc); + erts_printf("Searchstring counter: %u\n", (unsigned) act->counter); + erts_printf("Trie:\n"); + dump_ac_node(act->root, 0, '0'); + return; +} +#endif -- cgit v1.2.3