1 files changed, 127 insertions, 19 deletions
diff --git a/lib/stdlib/src/rand.erl b/lib/stdlib/src/rand.erl
index 9854c778a1..d3eda5e144 100644
--- a/lib/stdlib/src/rand.erl
+++ b/lib/stdlib/src/rand.erl
@@ -43,13 +43,13 @@
 %% Debug
 -export([make_float/3, float2str/1, bc64/1]).
 
--compile({inline, [exs64_next/1, exsplus_next/1,
+-compile({inline, [exs64_next/1, exsplus_next/1, exsss_next/1,
 		   exs1024_next/1, exs1024_calc/2,
                    exro928_next_state/4,
                    exrop_next/1, exrop_next_s/2,
 		   get_52/1, normal_kiwi/1]}).
 
--define(DEFAULT_ALG_HANDLER, exrop).
+-define(DEFAULT_ALG_HANDLER, exsss).
 -define(SEED_DICT, rand_seed).
 
 %% =====================================================================
@@ -86,7 +86,7 @@
 
 %% This depends on the algorithm handler function
 -type alg_state() ::
-	exrop_state() | exs1024_state() |  exro928_state() |  exsplus_state() |
+	exsplus_state() | exro928_state() |  exrop_state() | exs1024_state() |
 	exs64_state() | term().
 
 %% This is the algorithm handling definition within this module,
@@ -131,7 +131,7 @@
 %% Algorithm state
 -type state() :: {alg_handler(), alg_state()}.
 -type builtin_alg() ::
-	exrop | exs1024s | exro928ss | exsp | exs64 | exsplus | exs1024.
+	exsss | exro928ss | exrop | exs1024s | exsp | exs64 | exsplus | exs1024.
 -type alg() :: builtin_alg() | atom().
 -type export_state() :: {alg(), alg_state()}.
 -type seed() :: [integer()] | integer() | {integer(), integer(), integer()}.
@@ -139,7 +139,7 @@
    [builtin_alg/0, alg/0, alg_handler/0, alg_state/0,
     state/0, export_state/0, seed/0]).
 -export_type(
-   [exrop_state/0, exs1024_state/0, exro928_state/0, exsplus_state/0,
+   [exsplus_state/0, exro928_state/0, exrop_state/0, exs1024_state/0,
     exs64_state/0]).
 
 %% =====================================================================
@@ -618,6 +618,11 @@ mk_alg(exsp) ->
        uniform=>fun exsp_uniform/1, uniform_n=>fun exsp_uniform/2,
        jump=>fun exsplus_jump/1},
      fun exsplus_seed/1};
+mk_alg(exsss) ->
+    {#{type=>exsss, bits=>58, next=>fun exsss_next/1,
+       uniform=>fun exsss_uniform/1, uniform_n=>fun exsss_uniform/2,
+       jump=>fun exsplus_jump/1},
+     fun exsss_seed/1};
 mk_alg(exs1024) ->
     {#{type=>exs1024, max=>?MASK(64), next=>fun exs1024_next/1,
        jump=>fun exs1024_jump/1},
@@ -675,6 +680,36 @@ exs64_next(R) ->
 %% 58 bits fits into an immediate on 64bits erlang and is thus much faster.
 %% Modification of the original Xorshift128+ algorithm to 116
 %% by Sebastiano Vigna, a lot of thanks for his help and work.
+%%
+%% Reference C code for Xorshift116+ and Xorshift116**
+%%
+%% #include <stdint.h>
+%%
+%% #define MASK(b, v) (((UINT64_C(1) << (b)) - 1) & (v))
+%% #define BSL(b, v, n) (MASK((b)-(n), (v)) << (n))
+%% #define ROTL(b, v, n) (BSL((b), (v), (n)) | ((v) >> ((b)-(n))))
+%%
+%% uint64_t s[2];
+%%
+%% uint64_t next(void) {
+%%     uint64_t s1 = s[0];
+%%     const uint64_t s0 = s[1];
+%%
+%%     s1 ^= BSL(58, s1, 24); // a
+%%     s1 ^= s0 ^ (s1 >> 11) ^ (s0 >> 41); // b, c
+%%     s[0] = s0;
+%%     s[1] = s1;
+%%
+%%     const uint64_t result_plus = MASK(58, s0 + s1);
+%%     uint64_t result_starstar = s0;
+%%     result_starstar = MASK(58, result_starstar * 5);
+%%     result_starstar = ROTL(58, result_starstar, 7);
+%%     result_starstar = MASK(58, result_starstar * 9);
+%%
+%%     return result_plus;
+%%     return result_starstar;
+%% }
+%%
 %% =====================================================================
 -opaque exsplus_state() :: nonempty_improper_list(uint58(), uint58()).
 
@@ -697,16 +732,58 @@ exsplus_seed({A1, A2, A3}) ->
                  tl(R1)]),
     R2.
 
+exsss_seed(L) when is_list(L) ->
+    [S0,S1] = seed58_nz(2, L),
+    [S0|S1];
+exsss_seed(X) when is_integer(X) ->
+    [S0,S1] = seed58(2, ?MASK(64, X)),
+    [S0|S1];
+%%
+%% Seed from traditional integer triple - mix into splitmix
+exsss_seed({A1, A2, A3}) ->
+    {_, X0} = seed58(?MASK(64, A1)),
+    {S0, X1} = seed58(?MASK(64, A2) bxor X0),
+    {S1, _} = seed58(?MASK(64, A3) bxor X1),
+    [S0|S1].
+
 -dialyzer({no_improper_lists, exsplus_next/1}).
 
-%% Advance xorshift116+ state for one step and generate 58bit unsigned integer
+%% Advance Xorshift116 state one step
+-define(
+   exs_next(S0, S1, S1_b),
+   begin
+       S1_b = S1 bxor ?BSL(58, S1, 24),
+       S1_b bxor S0 bxor (S1_b bsr 11) bxor (S0 bsr 41)
+   end).
+
+-define(
+   scramble_starstar(S, V_a, V_b),
+   begin
+       %% The multiply by add shifted trick avoids creating bignums
+       %% which improves performance significantly
+       %%
+       V_a = ?MASK(58, S + ?BSL(58, S, 2)), % V_a = S * 5
+       V_b = ?ROTL(58, V_a, 7),
+       ?MASK(58, V_b + ?BSL(58, V_b, 3)) % V_b * 9
+   end).
+
+%% Advance state and generate 58bit unsigned integer
 -spec exsplus_next(exsplus_state()) -> {uint58(), exsplus_state()}.
 exsplus_next([S1|S0]) ->
     %% Note: members s0 and s1 are swapped here
-    S11 = S1 bxor ?BSL(58, S1, 24),
-    S12 = S11 bxor S0 bxor (S11 bsr 11) bxor (S0 bsr 41),
-    {?MASK(58, S0 + S12), [S0|S12]}.
-
+    NewS1 = ?exs_next(S0, S1, S1_1),
+    {?MASK(58, S0 + NewS1), [S0|NewS1]}.
+%%    %% Note: members s0 and s1 are swapped here
+%%    S11 = S1 bxor ?BSL(58, S1, 24),
+%%    S12 = S11 bxor S0 bxor (S11 bsr 11) bxor (S0 bsr 41),
+%%    {?MASK(58, S0 + S12), [S0|S12]}.
+
+-spec exsss_next(exsplus_state()) -> {uint58(), exsplus_state()}.
+exsss_next([S1|S0]) ->
+    %% Note: members s0 and s1 are swapped here
+    NewS1 = ?exs_next(S0, S1, S1_1),
+    {?scramble_starstar(S0, V_0, V_1), [S0|NewS1]}.
+%%    {?MASK(58, S0 + NewS1), [S0|NewS1]}.
 
 exsp_uniform({Alg, R0}) ->
     {I, R1} = exsplus_next(R0),
@@ -714,18 +791,48 @@ exsp_uniform({Alg, R0}) ->
     %% randomness quality than the others
     {(I bsr (58-53)) * ?TWO_POW_MINUS53, {Alg, R1}}.
 
+exsss_uniform({Alg, R0}) ->
+    {I, R1} = exsss_next(R0),
+    {(I bsr (58-53)) * ?TWO_POW_MINUS53, {Alg, R1}}.
+
 exsp_uniform(Range, {Alg, R}) ->
     {V, R1} = exsplus_next(R),
     MaxMinusRange = ?BIT(58) - Range,
     ?uniform_range(Range, Alg, R1, V, MaxMinusRange, I).
 
+exsss_uniform(Range, {Alg, R}) ->
+    {V, R1} = exsss_next(R),
+    MaxMinusRange = ?BIT(58) - Range,
+    ?uniform_range(Range, Alg, R1, V, MaxMinusRange, I).
+
 
-%% This is the jump function for the exsplus generator, equivalent
+%% This is the jump function for the exs* generators,
+%% i.e the Xorshift116 generators,  equivalent
 %% to 2^64 calls to next/1; it can be used to generate 2^52
 %% non-overlapping subsequences for parallel computations.
 %% Note: the jump function takes 116 times of the execution time of
 %% next/1.
-
+%%
+%% #include <stdint.h>
+%%
+%% void jump(void) {
+%%   static const uint64_t JUMP[] = { 0x02f8ea6bc32c797,
+%% 				   0x345d2a0f85f788c };
+%%   int i, b;
+%%   uint64_t s0 = 0;
+%%   uint64_t s1 = 0;
+%%   for(i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
+%%     for(b = 0; b < 58; b++) {
+%%       if (JUMP[i] & 1ULL << b) {
+%% 	s0 ^= s[0];
+%% 	s1 ^= s[1];
+%%       }
+%%       next();
+%%     }
+%%   s[0] = s0;
+%%   s[1] = s1;
+%% }
+%%
 %% -define(JUMPCONST, 16#000d174a83e17de2302f8ea6bc32c797).
 %% split into 58-bit chunks
 %% and two iterative executions
@@ -973,13 +1080,14 @@ exro928ss_next({[S15,S0|Ss], Rs}) ->
     %% {S, R, T} = {5, 7, 9}
     %% const uint64_t result_starstar = rotl(s0 * S, R) * T;
     %%
-    %% The multiply by add shifted trick avoids creating bignums
-    %% which improves performance significantly
-    %%
-    V0 = ?MASK(58, S0 + ?BSL(58, S0, 2)), % V0 = S0 * 5
-    V1 = ?ROTL(58, V0, 7),
-    V = ?MASK(58, V1 + ?BSL(58, V1, 3)), % V = V1 * 9
-    {V, SR};
+    {?scramble_starstar(S0, V_0, V_1), SR};
+%%    %% The multiply by add shifted trick avoids creating bignums
+%%    %% which improves performance significantly
+%%    %%
+%%    V0 = ?MASK(58, S0 + ?BSL(58, S0, 2)), % V0 = S0 * 5
+%%    V1 = ?ROTL(58, V0, 7),
+%%    V = ?MASK(58, V1 + ?BSL(58, V1, 3)), % V = V1 * 9
+%%    {V, SR};
 exro928ss_next({[S15], Rs}) ->
     exro928ss_next({[S15|lists:reverse(Rs)], []}).