From db41ff2fd39928a973eea25c3babfa5193f27319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Valim?= Date: Mon, 30 Apr 2018 00:17:07 +0800 Subject: Optimize binary match The idea is to use memchr on the first lookup for binary:match/2 and also after every match on binary:matches/2. We only use memchr in case of matches because benchmarks showed that using memchr even when we had false positives could negatively affect performance. This speeds up binary matching and binary splitting by 4x in some cases and by 70x in other scenarios (when the last character in the needle does not occur in the subject). The reason to use memchr is that it is highly specialized in most modern operating systems, often defaulting to SIMD operations. The implementation uses the reduction count to figure out how many bytes should be read with memchr. We could increase those numbers but they do not seem to make a large difference. --- erts/emulator/beam/erl_bif_binary.c | 69 +++++++++++++++++++++++++++++----- lib/stdlib/test/stdlib.spec | 2 +- lib/stdlib/test/stdlib_bench_SUITE.erl | 67 ++++++++++++++++++++++----------- 3 files changed, 106 insertions(+), 32 deletions(-) diff --git a/erts/emulator/beam/erl_bif_binary.c b/erts/emulator/beam/erl_bif_binary.c index 469f6a1ea8..245e04cb21 100644 --- a/erts/emulator/beam/erl_bif_binary.c +++ b/erts/emulator/beam/erl_bif_binary.c @@ -796,6 +796,7 @@ static void compute_goodshifts(BMData *bmd) } #define BM_LOOP_FACTOR 10 /* Should we have a higher value? */ +#define MC_LOOP_FACTOR 8 static void bm_init_find_first_match(BinaryFindContext *ctx) { @@ -819,13 +820,38 @@ static BFReturn bm_find_first_match(BinaryFindContext *ctx, byte *haystack) Sint i; Sint j = state->pos; register Uint reds = *reductions; + byte *pos_pointer; + Sint needle_last = blen - 1; + Sint mem_read = len - needle_last - j; - while (j <= len - blen) { + if (mem_read <= 0) { + return BF_NOT_FOUND; + } + mem_read = MIN(mem_read, reds * MC_LOOP_FACTOR); + ASSERT(mem_read > 0); + + pos_pointer = memchr(&haystack[j + needle_last], needle[needle_last], mem_read); + if (pos_pointer == NULL) { + reds -= mem_read / MC_LOOP_FACTOR; + j += mem_read; + } else { + reds -= (pos_pointer - &haystack[j]) / MC_LOOP_FACTOR; + j = pos_pointer - haystack - needle_last; + } + + // Ensure we have at least one reduction before entering the loop + ++reds; + + for(;;) { + if (j > len - blen) { + *reductions = reds; + return BF_NOT_FOUND; + } if (--reds == 0) { state->pos = j; return BF_RESTART; } - for (i = blen - 1; i >= 0 && needle[i] == haystack[i + j]; --i) + for (i = needle_last; i >= 0 && needle[i] == haystack[i + j]; --i) ; if (i < 0) { /* found */ *reductions = reds; @@ -835,8 +861,6 @@ static BFReturn bm_find_first_match(BinaryFindContext *ctx, byte *haystack) } j += MAX(gs[i],bs[haystack[i+j]] - blen + 1 + i); } - *reductions = reds; - return BF_NOT_FOUND; } static void bm_init_find_all(BinaryFindContext *ctx) @@ -875,14 +899,38 @@ static BFReturn bm_find_all_non_overlapping(BinaryFindContext *ctx, byte *haysta Sint *gs = bmd->goodshift; Sint *bs = bmd->badshift; byte *needle = bmd->x; - Sint i; + Sint i = -1; /* Use memchr on start and on every match */ Sint j = state->pos; Uint m = state->m; Uint allocated = state->allocated; FindallData *out = state->out; register Uint reds = *reductions; + byte *pos_pointer; + Sint needle_last = blen - 1; + Sint mem_read; - while (j <= len - blen) { + for(;;) { + if (i < 0) { + mem_read = len - needle_last - j; + if(mem_read <= 0) { + goto done; + } + mem_read = MIN(mem_read, reds * MC_LOOP_FACTOR); + ASSERT(mem_read > 0); + pos_pointer = memchr(&haystack[j + needle_last], needle[needle_last], mem_read); + if (pos_pointer == NULL) { + reds -= mem_read / MC_LOOP_FACTOR; + j += mem_read; + } else { + reds -= (pos_pointer - &haystack[j]) / MC_LOOP_FACTOR; + j = pos_pointer - haystack - needle_last; + } + // Ensure we have at least one reduction when resuming the loop + ++reds; + } + if (j > len - blen) { + goto done; + } if (--reds == 0) { state->pos = j; state->m = m; @@ -890,7 +938,7 @@ static BFReturn bm_find_all_non_overlapping(BinaryFindContext *ctx, byte *haysta state->out = out; return BF_RESTART; } - for (i = blen - 1; i >= 0 && needle[i] == haystack[i + j]; --i) + for (i = needle_last; i >= 0 && needle[i] == haystack[i + j]; --i) ; if (i < 0) { /* found */ if (m >= allocated) { @@ -912,6 +960,7 @@ static BFReturn bm_find_all_non_overlapping(BinaryFindContext *ctx, byte *haysta j += MAX(gs[i],bs[haystack[i+j]] - blen + 1 + i); } } + done: state->m = m; state->out = out; *reductions = reds; @@ -931,6 +980,7 @@ static int do_binary_match_compile(Eterm argument, Eterm *tag, Binary **binp) Eterm t, b, comp_term = NIL; Uint characters; Uint words; + Uint size; characters = 0; words = 0; @@ -946,11 +996,12 @@ static int do_binary_match_compile(Eterm argument, Eterm *tag, Binary **binp) if (binary_bitsize(b) != 0) { goto badarg; } - if (binary_size(b) == 0) { + size = binary_size(b); + if (size == 0) { goto badarg; } ++words; - characters += binary_size(b); + characters += size; } if (is_not_nil(t)) { goto badarg; diff --git a/lib/stdlib/test/stdlib.spec b/lib/stdlib/test/stdlib.spec index 9c625091a8..4de7c1a0eb 100644 --- a/lib/stdlib/test/stdlib.spec +++ b/lib/stdlib/test/stdlib.spec @@ -1,4 +1,4 @@ {suites,"../stdlib_test",all}. {skip_groups,"../stdlib_test",stdlib_bench_SUITE, - [base64,gen_server,gen_statem,unicode], + [binary,base64,gen_server,gen_statem,unicode], "Benchmark only"}. diff --git a/lib/stdlib/test/stdlib_bench_SUITE.erl b/lib/stdlib/test/stdlib_bench_SUITE.erl index 2364e8376f..b937eeb06a 100644 --- a/lib/stdlib/test/stdlib_bench_SUITE.erl +++ b/lib/stdlib/test/stdlib_bench_SUITE.erl @@ -29,7 +29,7 @@ suite() -> [{ct_hooks,[{ts_install_cth,[{nodenames,2}]}]}]. all() -> - [{group,unicode},{group,base64}, + [{group,unicode},{group,base64},{group,binary}, {group,gen_server},{group,gen_statem}, {group,gen_server_comparison},{group,gen_statem_comparison}]. @@ -38,6 +38,11 @@ groups() -> [norm_nfc_list, norm_nfc_deep_l, norm_nfc_binary, string_lexemes_list, string_lexemes_binary ]}, + {binary, [{repeat, 5}], + [match_single_pattern_no_match, + matches_single_pattern_no_match, + matches_single_pattern_eventual_match, + matches_single_pattern_frequent_match]}, {base64,[{repeat,5}], [decode_binary, decode_binary_to_string, decode_list, decode_list_to_string, @@ -157,41 +162,59 @@ norm_data(Config) -> %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +match_single_pattern_no_match(_Config) -> + Binary = binary:copy(<<"ugbcfuysabfuqyfikgfsdalpaskfhgjsdgfjwsalp">>, 1000000), + comment(test(binary, match, [Binary, <<"o">>])). + +matches_single_pattern_no_match(_Config) -> + Binary = binary:copy(<<"ugbcfuysabfuqyfikgfsdalpaskfhgjsdgfjwsalp">>, 1000000), + comment(test(binary, matches, [Binary, <<"o">>])). + +matches_single_pattern_eventual_match(_Config) -> + Binary = binary:copy(<<"ugbcfuysabfuqyfikgfsdalpaskfhgjsdgfjwsal\n">>, 1000000), + comment(test(binary, matches, [Binary, <<"\n">>])). + +matches_single_pattern_frequent_match(_Config) -> + Binary = binary:copy(<<"abc\n">>, 1000000), + comment(test(binary, matches, [Binary, <<"abc">>])). + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% + decode_binary(_Config) -> - comment(test(decode, encoded_binary())). + comment(test(base64, decode, [encoded_binary()])). decode_binary_to_string(_Config) -> - comment(test(decode_to_string, encoded_binary())). + comment(test(base64, decode_to_string, [encoded_binary()])). decode_list(_Config) -> - comment(test(decode, encoded_list())). + comment(test(base64, decode, [encoded_list()])). decode_list_to_string(_Config) -> - comment(test(decode_to_string, encoded_list())). + comment(test(base64, decode_to_string, [encoded_list()])). encode_binary(_Config) -> - comment(test(encode, binary())). + comment(test(base64, encode, [binary()])). encode_binary_to_string(_Config) -> - comment(test(encode_to_string, binary())). + comment(test(base64, encode_to_string, [binary()])). encode_list(_Config) -> - comment(test(encode, list())). + comment(test(base64, encode, [list()])). encode_list_to_string(_Config) -> - comment(test(encode_to_string, list())). + comment(test(base64, encode_to_string, [list()])). mime_binary_decode(_Config) -> - comment(test(mime_decode, encoded_binary())). + comment(test(base64, mime_decode, [encoded_binary()])). mime_binary_decode_to_string(_Config) -> - comment(test(mime_decode_to_string, encoded_binary())). + comment(test(base64, mime_decode_to_string, [encoded_binary()])). mime_list_decode(_Config) -> - comment(test(mime_decode, encoded_list())). + comment(test(base64, mime_decode, [encoded_list()])). mime_list_decode_to_string(_Config) -> - comment(test(mime_decode_to_string, encoded_list())). + comment(test(base64, mime_decode_to_string, [encoded_list()])). -define(SIZE, 10000). -define(N, 1000). @@ -209,15 +232,15 @@ binary() -> list() -> random_byte_list(?SIZE). -test(Func, Data) -> - F = fun() -> loop(?N, Func, Data) end, +test(Mod, Fun, Args) -> + F = fun() -> loop(?N, Mod, Fun, Args) end, {Time, ok} = timer:tc(fun() -> lspawn(F) end), - report_base64(Time). + report_mfa(Time, Mod). -loop(0, _F, _D) -> garbage_collect(), ok; -loop(N, F, D) -> - _ = base64:F(D), - loop(N - 1, F, D). +loop(0, _M, _F, _A) -> garbage_collect(), ok; +loop(N, M, F, A) -> + _ = apply(M, F, A), + loop(N - 1, M, F, A). lspawn(Fun) -> {Pid, Ref} = spawn_monitor(fun() -> exit(Fun()) end), @@ -225,10 +248,10 @@ lspawn(Fun) -> {'DOWN', Ref, process, Pid, Rep} -> Rep end. -report_base64(Time) -> +report_mfa(Time, Mod) -> Tps = round((?N*1000000)/Time), ct_event:notify(#event{name = benchmark_data, - data = [{suite, "stdlib_base64"}, + data = [{suite, "stdlib_" ++ atom_to_list(Mod)}, {value, Tps}]}), Tps. -- cgit v1.2.3