1 files changed, 1264 insertions, 121 deletions
diff --git a/lib/stdlib/src/rand.erl b/lib/stdlib/src/rand.erl
index 93409d95df..3a9a1e007b 100644
--- a/lib/stdlib/src/rand.erl
+++ b/lib/stdlib/src/rand.erl
@@ -1,7 +1,7 @@
 %%
 %% %CopyrightBegin%
 %%
-%% Copyright Ericsson AB 2015-2016. All Rights Reserved.
+%% Copyright Ericsson AB 2015-2018. All Rights Reserved.
 %%
 %% Licensed under the Apache License, Version 2.0 (the "License");
 %% you may not use this file except in compliance with the License.
@@ -19,7 +19,10 @@
 %%
 %% =====================================================================
 %% Multiple PRNG module for Erlang/OTP
-%% Copyright (c) 2015 Kenji Rikitake
+%% Copyright (c) 2015-2016 Kenji Rikitake
+%%
+%% exrop (xoroshiro116+) added, statistical distribution
+%% improvements and uniform_real added by the Erlang/OTP team 2017
 %% =====================================================================
 
 -module(rand).
@@ -27,34 +30,200 @@
 -export([seed_s/1, seed_s/2, seed/1, seed/2,
 	 export_seed/0, export_seed_s/1,
          uniform/0, uniform/1, uniform_s/1, uniform_s/2,
-	 normal/0, normal_s/1
+         uniform_real/0, uniform_real_s/1,
+         jump/0, jump/1,
+         normal/0, normal/2, normal_s/1, normal_s/3
 	]).
 
--compile({inline, [exs64_next/1, exsplus_next/1,
+%% Test, dev and internal
+-export([exro928_jump_2pow512/1, exro928_jump_2pow20/1,
+	 exro928_seed/1, exro928_next/1, exro928_next_state/1,
+	 format_jumpconst58/1, seed58/2]).
+
+%% Debug
+-export([make_float/3, float2str/1, bc64/1]).
+
+-compile({inline, [exs64_next/1, exsplus_next/1, exsss_next/1,
 		   exs1024_next/1, exs1024_calc/2,
+                   exro928_next_state/4,
+                   exrop_next/1, exrop_next_s/2,
 		   get_52/1, normal_kiwi/1]}).
 
--define(DEFAULT_ALG_HANDLER, exsplus).
+-define(DEFAULT_ALG_HANDLER, exsss).
 -define(SEED_DICT, rand_seed).
 
 %% =====================================================================
+%% Bit fiddling macros
+%% =====================================================================
+
+-define(BIT(Bits), (1 bsl (Bits))).
+-define(MASK(Bits), (?BIT(Bits) - 1)).
+-define(MASK(Bits, X), ((X) band ?MASK(Bits))).
+-define(
+   BSL(Bits, X, N),
+   %% N is evaluated 2 times
+   (?MASK((Bits)-(N), (X)) bsl (N))).
+-define(
+   ROTL(Bits, X, N),
+   %% Bits is evaluated 2 times
+   %% X is evaluated 2 times
+   %% N i evaluated 3 times
+   (?BSL((Bits), (X), (N)) bor ((X) bsr ((Bits)-(N))))).
+
+-define(
+   BC(V, N),
+   bc((V), ?BIT((N) - 1), N)).
+
+%%-define(TWO_POW_MINUS53, (math:pow(2, -53))).
+-define(TWO_POW_MINUS53, 1.11022302462515657e-16).
+
+%% =====================================================================
 %% Types
 %% =====================================================================
 
+-type uint64() :: 0..?MASK(64).
+-type uint58() :: 0..?MASK(58).
+
 %% This depends on the algorithm handler function
--type alg_seed() :: exs64_state() | exsplus_state() | exs1024_state().
-%% This is the algorithm handler function within this module
--type alg_handler() :: #{type      := alg(),
-			 max       := integer(),
-			 next      := fun(),
-			 uniform   := fun(),
-			 uniform_n := fun()}.
-
-%% Internal state
--opaque state() :: {alg_handler(), alg_seed()}.
--type alg() :: exs64 | exsplus | exs1024.
--opaque export_state() :: {alg(), alg_seed()}.
--export_type([alg/0, state/0, export_state/0]).
+-type alg_state() ::
+	exsplus_state() | exro928_state() |  exrop_state() | exs1024_state() |
+	exs64_state() | term().
+
+%% This is the algorithm handling definition within this module,
+%% and the type to use for plugins.
+%%
+%% The 'type' field must be recognized by the module that implements
+%% the algorithm, to interpret an exported state.
+%%
+%% The 'bits' field indicates how many bits the integer
+%% returned from 'next' has got, i.e 'next' shall return
+%% an random integer in the range 0..(2^Bits - 1).
+%% At least 55 bits is required for the floating point
+%% producing fallbacks, but 56 bits would be more future proof.
+%%
+%% The fields 'next', 'uniform' and 'uniform_n'
+%% implement the algorithm.  If 'uniform' or 'uniform_n'
+%% is not present there is a fallback using 'next' and either
+%% 'bits' or the deprecated 'max'.  The 'next' function
+%% must generate a word with at least 56 good random bits.
+%%
+%% The 'weak_low_bits' field indicate how many bits are of
+%% lesser quality and they will not be used by the floating point
+%% producing functions, nor by the range producing functions
+%% when more bits are needed, to avoid weak bits in the middle
+%% of the generated bits.  The lowest bits from the range
+%% functions still have the generator's quality.
+%%
+-type alg_handler() ::
+        #{type := alg(),
+          bits => non_neg_integer(),
+          weak_low_bits => non_neg_integer(),
+          max => non_neg_integer(), % Deprecated
+          next :=
+              fun ((alg_state()) -> {non_neg_integer(), alg_state()}),
+          uniform =>
+              fun ((state()) -> {float(), state()}),
+          uniform_n =>
+              fun ((pos_integer(), state()) -> {pos_integer(), state()}),
+          jump =>
+              fun ((state()) -> state())}.
+
+%% Algorithm state
+-type state() :: {alg_handler(), alg_state()}.
+-type builtin_alg() ::
+	exsss | exro928ss | exrop | exs1024s | exsp | exs64 | exsplus | exs1024.
+-type alg() :: builtin_alg() | atom().
+-type export_state() :: {alg(), alg_state()}.
+-type seed() :: [integer()] | integer() | {integer(), integer(), integer()}.
+-export_type(
+   [builtin_alg/0, alg/0, alg_handler/0, alg_state/0,
+    state/0, export_state/0, seed/0]).
+-export_type(
+   [exsplus_state/0, exro928_state/0, exrop_state/0, exs1024_state/0,
+    exs64_state/0]).
+
+%% =====================================================================
+%% Range macro and helper
+%% =====================================================================
+
+-define(
+   uniform_range(Range, Alg, R, V, MaxMinusRange, I),
+   if
+       0 =< (MaxMinusRange) ->
+           if
+               %% Really work saving in odd cases;
+               %% large ranges in particular
+               (V) < (Range) ->
+                   {(V) + 1, {(Alg), (R)}};
+               true ->
+                   (I) = (V) rem (Range),
+                   if
+                       (V) - (I) =< (MaxMinusRange) ->
+                           {(I) + 1, {(Alg), (R)}};
+                       true ->
+                           %% V in the truncated top range
+                           %% - try again
+                           ?FUNCTION_NAME((Range), {(Alg), (R)})
+                   end
+           end;
+       true ->
+           uniform_range((Range), (Alg), (R), (V))
+   end).
+
+%% For ranges larger than the algorithm bit size
+uniform_range(Range, #{next:=Next, bits:=Bits} = Alg, R, V) ->
+    WeakLowBits = maps:get(weak_low_bits, Alg, 0),
+    %% Maybe waste the lowest bit(s) when shifting in new bits
+    Shift = Bits - WeakLowBits,
+    ShiftMask = bnot ?MASK(WeakLowBits),
+    RangeMinus1 = Range - 1,
+    if
+        (Range band RangeMinus1) =:= 0 -> % Power of 2
+            %% Generate at least the number of bits for the range
+            {V1, R1, _} =
+                uniform_range(
+                  Range bsr Bits, Next, R, V, ShiftMask, Shift, Bits),
+            {(V1 band RangeMinus1) + 1, {Alg, R1}};
+        true ->
+            %% Generate a value with at least two bits more than the range
+            %% and try that for a fit, otherwise recurse
+            %%
+            %% Just one bit more should ensure that the generated
+            %% number range is at least twice the size of the requested
+            %% range, which would make the probability to draw a good
+            %% number better than 0.5.  And repeating that until
+            %% success i guess would take 2 times statistically amortized.
+            %% But since the probability for fairly many attemtpts
+            %% is not that low, use two bits more than the range which 
+            %% should make the probability to draw a bad number under 0.25,
+            %% which decreases the bad case probability a lot.
+            {V1, R1, B} =
+                uniform_range(
+                  Range bsr (Bits - 2), Next, R, V, ShiftMask, Shift, Bits),
+            I = V1 rem Range,
+            if
+                (V1 - I) =< (1 bsl B) - Range ->
+                    {I + 1, {Alg, R1}};
+                true ->
+                    %% V1 drawn from the truncated top range
+                    %% - try again
+                    {V2, R2} = Next(R1),
+                    uniform_range(Range, Alg, R2, V2)
+            end
+    end.
+%%
+uniform_range(Range, Next, R, V, ShiftMask, Shift, B) ->
+    if 
+        Range =< 1 ->
+            {V, R, B};
+        true ->
+            {V1, R1} = Next(R),
+            %% Waste the lowest bit(s) when shifting in new bits
+            uniform_range(
+              Range bsr Shift, Next, R1,
+              ((V band ShiftMask) bsl Shift) bor V1,
+              ShiftMask, Shift, B + Shift)
+    end.
 
 %% =====================================================================
 %% API
@@ -68,87 +237,316 @@ export_seed() ->
 	_ -> undefined
     end.
 
--spec export_seed_s(state()) -> export_state().
-export_seed_s({#{type:=Alg}, Seed}) -> {Alg, Seed}.
+-spec export_seed_s(State :: state()) -> export_state().
+export_seed_s({#{type:=Alg}, AlgState}) -> {Alg, AlgState}.
 
 %% seed(Alg) seeds RNG with runtime dependent values
 %% and return the NEW state
 
-%% seed({Alg,Seed}) setup RNG with a previously exported seed
+%% seed({Alg,AlgState}) setup RNG with a previously exported seed
 %% and return the NEW state
 
--spec seed(AlgOrExpState::alg() | export_state()) -> state().
+-spec seed(
+        AlgOrStateOrExpState :: builtin_alg() | state() | export_state()) ->
+                  state().
 seed(Alg) ->
-    R = seed_s(Alg),
-    _ = seed_put(R),
-    R.
+    seed_put(seed_s(Alg)).
 
--spec seed_s(AlgOrExpState::alg() | export_state()) -> state().
-seed_s(Alg) when is_atom(Alg) ->
+-spec seed_s(
+        AlgOrStateOrExpState :: builtin_alg() | state() | export_state()) ->
+                    state().
+seed_s({AlgHandler, _AlgState} = State) when is_map(AlgHandler) ->
+    State;
+seed_s({Alg, AlgState}) when is_atom(Alg) ->
+    {AlgHandler,_SeedFun} = mk_alg(Alg),
+    {AlgHandler,AlgState};
+seed_s(Alg) ->
     seed_s(Alg, {erlang:phash2([{node(),self()}]),
 		 erlang:system_time(),
-		 erlang:unique_integer()});
-seed_s({Alg0, Seed}) ->
-    {Alg,_SeedFun} = mk_alg(Alg0),
-    {Alg, Seed}.
+		 erlang:unique_integer()}).
 
 %% seed/2: seeds RNG with the algorithm and given values
 %% and returns the NEW state.
 
--spec seed(Alg :: alg(), {integer(), integer(), integer()}) -> state().
-seed(Alg0, S0) ->
-    State = seed_s(Alg0, S0),
-    _ = seed_put(State),
-    State.
+-spec seed(Alg :: builtin_alg(),  Seed :: seed()) -> state().
+seed(Alg, Seed) ->
+    seed_put(seed_s(Alg, Seed)).
 
--spec seed_s(Alg :: alg(), {integer(), integer(), integer()}) -> state().
-seed_s(Alg0, S0 = {_, _, _}) ->
-    {Alg, Seed} = mk_alg(Alg0),
-    AS = Seed(S0),
-    {Alg, AS}.
+-spec seed_s(Alg :: builtin_alg(), Seed :: seed()) -> state().
+seed_s(Alg, Seed) ->
+    {AlgHandler,SeedFun} = mk_alg(Alg),
+    AlgState = SeedFun(Seed),
+    {AlgHandler,AlgState}.
 
 %%% uniform/0, uniform/1, uniform_s/1, uniform_s/2 are all
 %%% uniformly distributed random numbers.
 
-%% uniform/0: returns a random float X where 0.0 < X < 1.0,
+%% uniform/0: returns a random float X where 0.0 =< X < 1.0,
 %% updating the state in the process dictionary.
 
--spec uniform() -> X::float().
+-spec uniform() -> X :: float().
 uniform() ->
-    {X, Seed} = uniform_s(seed_get()),
-    _ = seed_put(Seed),
+    {X, State} = uniform_s(seed_get()),
+    _ = seed_put(State),
     X.
 
 %% uniform/1: given an integer N >= 1,
 %% uniform/1 returns a random integer X where 1 =< X =< N,
 %% updating the state in the process dictionary.
 
--spec uniform(N :: pos_integer()) -> X::pos_integer().
+-spec uniform(N :: pos_integer()) -> X :: pos_integer().
 uniform(N) ->
-    {X, Seed} = uniform_s(N, seed_get()),
-    _ = seed_put(Seed),
+    {X, State} = uniform_s(N, seed_get()),
+    _ = seed_put(State),
     X.
 
 %% uniform_s/1: given a state, uniform_s/1
-%% returns a random float X where 0.0 < X < 1.0,
+%% returns a random float X where 0.0 =< X < 1.0,
 %% and a new state.
 
--spec uniform_s(state()) -> {X::float(), NewS :: state()}.
+-spec uniform_s(State :: state()) -> {X :: float(), NewState :: state()}.
 uniform_s(State = {#{uniform:=Uniform}, _}) ->
-    Uniform(State).
+    Uniform(State);
+uniform_s({#{bits:=Bits, next:=Next} = Alg, R0}) ->
+    {V, R1} = Next(R0),
+    %% Produce floats on the form N * 2^(-53)
+    {(V bsr (Bits - 53)) * ?TWO_POW_MINUS53, {Alg, R1}};
+uniform_s({#{max:=Max, next:=Next} = Alg, R0}) ->
+    {V, R1} = Next(R0),
+    %% Old algorithm with non-uniform density
+    {V / (Max + 1), {Alg, R1}}.
+
 
 %% uniform_s/2: given an integer N >= 1 and a state, uniform_s/2
 %% uniform_s/2 returns a random integer X where 1 =< X =< N,
 %% and a new state.
 
--spec uniform_s(N::pos_integer(), state()) -> {X::pos_integer(), NewS::state()}.
-uniform_s(N, State = {#{uniform_n:=Uniform, max:=Max}, _})
-  when 0 < N, N =< Max ->
-    Uniform(N, State);
-uniform_s(N, State0 = {#{uniform:=Uniform}, _})
-  when is_integer(N), 0 < N ->
-    {F, State} = Uniform(State0),
-    {trunc(F * N) + 1, State}.
+-spec uniform_s(N :: pos_integer(), State :: state()) ->
+                       {X :: pos_integer(), NewState :: state()}.
+uniform_s(N, State = {#{uniform_n:=UniformN}, _})
+  when is_integer(N), 1 =< N ->
+    UniformN(N, State);
+uniform_s(N, {#{bits:=Bits, next:=Next} = Alg, R0})
+  when is_integer(N), 1 =< N ->
+    {V, R1} = Next(R0),
+    MaxMinusN = ?BIT(Bits) - N,
+    ?uniform_range(N, Alg, R1, V, MaxMinusN, I);
+uniform_s(N, {#{max:=Max, next:=Next} = Alg, R0})
+  when is_integer(N), 1 =< N ->
+    %% Old algorithm with skewed probability
+    %% and gap in ranges > Max
+    {V, R1} = Next(R0),  
+    if
+        N =< Max ->
+            {(V rem N) + 1, {Alg, R1}};
+        true ->
+            F = V / (Max + 1),
+            {trunc(F * N) + 1, {Alg, R1}}
+    end.
+
+%% uniform_real/0: returns a random float X where 0.0 < X =< 1.0,
+%% updating the state in the process dictionary.
+
+-spec uniform_real() -> X :: float().
+uniform_real() ->
+    {X, Seed} = uniform_real_s(seed_get()),
+    _ = seed_put(Seed),
+    X.
+
+%% uniform_real_s/1: given a state, uniform_s/1
+%% returns a random float X where 0.0 < X =< 1.0,
+%% and a new state.
+%%
+%% This function does not use the same form of uniformity
+%% as the uniform_s/1 function.
+%%
+%% Instead, this function does not generate numbers with equal
+%% distance in the interval, but rather tries to keep all mantissa
+%% bits random also for small numbers, meaning that the distance
+%% between possible numbers decreases when the numbers
+%% approaches 0.0, as does the possibility for a particular
+%% number.  Hence uniformity is preserved.
+%%
+%% To generate 56 bits at the time instead of 53 is actually
+%% a speed optimization since the probability to have to
+%% generate a second word decreases by 1/2 for every extra bit.
+%%
+%% This function generates normalized numbers, so the smallest number
+%% that can be generated is 2^-1022 with the distance 2^-1074
+%% to the next to smallest number, compared to 2^-53 for uniform_s/1.
+%%
+%% This concept of uniformity should work better for applications
+%% where you need to calculate 1.0/X or math:log(X) since those
+%% operations benefits from larger precision approaching 0.0,
+%% and that this function does not return 0.0 nor denormalized
+%% numbers very close to 0.0.  The log() operation in The Box-Muller
+%% transformation for normal distribution is an example of this.
+%%
+%%-define(TWO_POW_MINUS55, (math:pow(2, -55))).
+%%-define(TWO_POW_MINUS110, (math:pow(2, -110))).
+%%-define(TWO_POW_MINUS55, 2.7755575615628914e-17).
+%%-define(TWO_POW_MINUS110, 7.7037197775489436e-34).
+%%
+-spec uniform_real_s(State :: state()) -> {X :: float(), NewState :: state()}.
+uniform_real_s({#{bits:=Bits, next:=Next} = Alg, R0}) ->
+    %% Generate a 56 bit number without using the weak low bits.
+    %%
+    %% Be sure to use only 53 bits when multiplying with
+    %% math:pow(2.0, -N) to avoid rounding which would make
+    %% "even" floats more probable than "odd".
+    %%
+    {V1, R1} = Next(R0),
+    M1 = V1 bsr (Bits - 56),
+    if
+        ?BIT(55) =< M1 ->
+            %% We have 56 bits - waste 3
+            {(M1 bsr 3) * math:pow(2.0, -53), {Alg, R1}};
+        ?BIT(54) =< M1 ->
+            %% We have 55 bits - waste 2
+            {(M1 bsr 2) * math:pow(2.0, -54), {Alg, R1}};
+        ?BIT(53) =< M1 ->
+            %% We have 54 bits - waste 1
+            {(M1 bsr 1) * math:pow(2.0, -55), {Alg, R1}};
+        ?BIT(52) =< M1 ->
+            %% We have 53 bits - use all
+            {M1 * math:pow(2.0, -56), {Alg, R1}};
+        true ->
+            %% Need more bits
+            {V2, R2} = Next(R1),
+            uniform_real_s(Alg, Next, M1, -56, R2, V2, Bits)
+    end;
+uniform_real_s({#{max:=_, next:=Next} = Alg, R0}) ->
+    %% Generate a 56 bit number.
+    %% Ignore the weak low bits for these old algorithms,
+    %% just produce something reasonable.
+    %%
+    %% Be sure to use only 53 bits when multiplying with
+    %% math:pow(2.0, -N) to avoid rounding which would make
+    %% "even" floats more probable than "odd".
+    %%
+    {V1, R1} = Next(R0),
+    M1 = ?MASK(56, V1),
+    if
+        ?BIT(55) =< M1 ->
+            %% We have 56 bits - waste 3
+            {(M1 bsr 3) * math:pow(2.0, -53), {Alg, R1}};
+        ?BIT(54) =< M1 ->
+            %% We have 55 bits - waste 2
+            {(M1 bsr 2) * math:pow(2.0, -54), {Alg, R1}};
+        ?BIT(53) =< M1 ->
+            %% We have 54 bits - waste 1
+            {(M1 bsr 1) * math:pow(2.0, -55), {Alg, R1}};
+        ?BIT(52) =< M1 ->
+            %% We have 53 bits - use all
+            {M1 * math:pow(2.0, -56), {Alg, R1}};
+        true ->
+            %% Need more bits
+            {V2, R2} = Next(R1),
+            uniform_real_s(Alg, Next, M1, -56, R2, V2, 56)
+    end.
+
+uniform_real_s(Alg, _Next, M0, -1064, R1, V1, Bits) -> % 19*56
+    %% This is a very theoretical bottom case.
+    %% The odds of getting here is about 2^-1008,
+    %% through a white box test case, or thanks to
+    %% a malfunctioning PRNG producing 18 56-bit zeros in a row.
+    %%
+    %% Fill up to 53 bits, we have at most 52
+    B0 = (53 - ?BC(M0, 52)), % Missing bits
+    {(((M0 bsl B0) bor (V1 bsr (Bits - B0))) * math:pow(2.0, -1064 - B0)),
+     {Alg, R1}};
+uniform_real_s(Alg, Next, M0, BitNo, R1, V1, Bits) ->
+    if
+        %% Optimize the most probable.
+        %% Fill up to 53 bits.
+        ?BIT(51) =< M0 ->
+            %% We have 52 bits in M0 - need 1
+            {(((M0 bsl 1) bor (V1 bsr (Bits - 1)))
+              * math:pow(2.0, BitNo - 1)),
+             {Alg, R1}};
+        ?BIT(50) =< M0 ->
+            %% We have 51 bits in M0 - need 2
+            {(((M0 bsl 2) bor (V1 bsr (Bits - 2)))
+              * math:pow(2.0, BitNo - 2)),
+             {Alg, R1}};
+        ?BIT(49) =< M0 ->
+            %% We have 50 bits in M0 - need 3
+            {(((M0 bsl 3) bor (V1 bsr (Bits - 3)))
+              * math:pow(2.0, BitNo - 3)),
+             {Alg, R1}};
+        M0 == 0 ->
+            M1 = V1 bsr (Bits - 56),
+            if
+                ?BIT(55) =< M1 ->
+                    %% We have 56 bits - waste 3
+                    {(M1 bsr 3) * math:pow(2.0, BitNo - 53), {Alg, R1}};
+                ?BIT(54) =< M1 ->
+                    %% We have 55 bits - waste 2
+                    {(M1 bsr 2) * math:pow(2.0, BitNo - 54), {Alg, R1}};
+                ?BIT(53) =< M1 ->
+                    %% We have 54 bits - waste 1
+                    {(M1 bsr 1) * math:pow(2.0, BitNo - 55), {Alg, R1}};
+                ?BIT(52) =< M1 ->
+                    %% We have 53 bits - use all
+                    {M1 * math:pow(2.0, BitNo - 56), {Alg, R1}};
+                BitNo =:= -1008 ->
+                    %% Endgame
+                    %% For the last round we cannot have 14 zeros or more
+                    %% at the top of M1 because then we will underflow,
+                    %% so we need at least 43 bits
+                    if
+                        ?BIT(42) =< M1 ->
+                            %% We have 43 bits - get the last bits
+                            uniform_real_s(Alg, Next, M1, BitNo - 56, R1);
+                        true ->
+                            %% Would underflow 2^-1022 - start all over
+                            %%
+                            %% We could just crash here since the odds for
+                            %% the PRNG being broken is much higher than
+                            %% for a good PRNG generating this many zeros
+                            %% in a row.  Maybe we should write an error
+                            %% report or call this a system limit...?
+                            uniform_real_s({Alg, R1})
+                    end;
+                true ->
+                    %% Need more bits
+                    uniform_real_s(Alg, Next, M1, BitNo - 56, R1)
+            end;
+        true ->
+            %% Fill up to 53 bits
+            B0 = 53 - ?BC(M0, 49), % Number of bits we need to append
+            {(((M0 bsl B0) bor (V1 bsr (Bits - B0)))
+              * math:pow(2.0, BitNo - B0)),
+             {Alg, R1}}
+    end.
+%%
+uniform_real_s(#{bits:=Bits} = Alg, Next, M0, BitNo, R0) ->
+    {V1, R1} = Next(R0),
+    uniform_real_s(Alg, Next, M0, BitNo, R1, V1, Bits);
+uniform_real_s(#{max:=_} = Alg, Next, M0, BitNo, R0) ->
+    {V1, R1} = Next(R0),
+    uniform_real_s(Alg, Next, M0, BitNo, R1, ?MASK(56, V1), 56).
+
+%% jump/1: given a state, jump/1
+%% returns a new state which is equivalent to that
+%% after a large number of call defined for each algorithm.
+%% The large number is algorithm dependent.
+
+-spec jump(state()) -> NewState :: state().
+jump(State = {#{jump:=Jump}, _}) ->
+    Jump(State);
+jump({#{}, _}) ->
+    erlang:error(not_implemented).
+
+
+%% jump/0: read the internal state and
+%% apply the jump function for the state as in jump/1
+%% and write back the new value to the internal state,
+%% then returns the new value.
+
+-spec jump() -> NewState :: state().
+jump() ->
+    seed_put(jump(seed_get())).
 
 %% normal/0: returns a random float with standard normal distribution
 %% updating the state in the process dictionary.
@@ -159,14 +557,21 @@ normal() ->
     _ = seed_put(Seed),
     X.
 
+%% normal/2: returns a random float with N(μ, σ²) normal distribution
+%% updating the state in the process dictionary.
+
+-spec normal(Mean :: number(), Variance :: number()) -> float().
+normal(Mean, Variance) ->
+    Mean + (math:sqrt(Variance) * normal()).
+
 %% normal_s/1: returns a random float with standard normal distribution
 %% The Ziggurat Method for generating random variables - Marsaglia and Tsang
 %% Paper and reference code: http://www.jstatsoft.org/v05/i08/
 
--spec normal_s(state()) -> {float(), NewS :: state()}.
+-spec normal_s(State :: state()) -> {float(), NewState :: state()}.
 normal_s(State0) ->
     {Sign, R, State} = get_52(State0),
-    Idx = R band 16#FF,
+    Idx = ?MASK(8, R),
     Idx1 = Idx+1,
     {Ki, Wi} = normal_kiwi(Idx1),
     X = R * Wi,
@@ -179,22 +584,20 @@ normal_s(State0) ->
 	false -> normal_s(Idx, Sign, -X, State)
     end.
 
-%% =====================================================================
-%% Internal functions
+%% normal_s/3: returns a random float with normal N(μ, σ²) distribution
 
--define(UINT21MASK, 16#00000000001fffff).
--define(UINT32MASK, 16#00000000ffffffff).
--define(UINT33MASK, 16#00000001ffffffff).
--define(UINT39MASK, 16#0000007fffffffff).
--define(UINT58MASK, 16#03ffffffffffffff).
--define(UINT64MASK, 16#ffffffffffffffff).
+-spec normal_s(Mean :: number(), Variance :: number(), state()) -> {float(), NewS :: state()}.
+normal_s(Mean, Variance, State0) when Variance > 0 ->
+    {X, State} = normal_s(State0),
+    {Mean + (math:sqrt(Variance) * X), State}.
 
--type uint64() :: 0..16#ffffffffffffffff.
--type uint58() :: 0..16#03ffffffffffffff.
+%% =====================================================================
+%% Internal functions
 
--spec seed_put(state()) -> undefined | state().
+-spec seed_put(state()) -> state().
 seed_put(Seed) ->
-    put(?SEED_DICT, Seed).
+    put(?SEED_DICT, Seed),
+    Seed.
 
 seed_get() ->
     case get(?SEED_DICT) of
@@ -204,17 +607,41 @@ seed_get() ->
 
 %% Setup alg record
 mk_alg(exs64) ->
-    {#{type=>exs64, max=>?UINT64MASK, next=>fun exs64_next/1,
-       uniform=>fun exs64_uniform/1, uniform_n=>fun exs64_uniform/2},
+    {#{type=>exs64, max=>?MASK(64), next=>fun exs64_next/1},
      fun exs64_seed/1};
 mk_alg(exsplus) ->
-    {#{type=>exsplus, max=>?UINT58MASK, next=>fun exsplus_next/1,
-       uniform=>fun exsplus_uniform/1, uniform_n=>fun exsplus_uniform/2},
+    {#{type=>exsplus, max=>?MASK(58), next=>fun exsplus_next/1,
+       jump=>fun exsplus_jump/1},
+     fun exsplus_seed/1};
+mk_alg(exsp) ->
+    {#{type=>exsp, bits=>58, weak_low_bits=>1, next=>fun exsplus_next/1,
+       uniform=>fun exsp_uniform/1, uniform_n=>fun exsp_uniform/2,
+       jump=>fun exsplus_jump/1},
      fun exsplus_seed/1};
+mk_alg(exsss) ->
+    {#{type=>exsss, bits=>58, next=>fun exsss_next/1,
+       uniform=>fun exsss_uniform/1, uniform_n=>fun exsss_uniform/2,
+       jump=>fun exsplus_jump/1},
+     fun exsss_seed/1};
 mk_alg(exs1024) ->
-    {#{type=>exs1024, max=>?UINT64MASK, next=>fun exs1024_next/1,
-       uniform=>fun exs1024_uniform/1, uniform_n=>fun exs1024_uniform/2},
-     fun exs1024_seed/1}.
+    {#{type=>exs1024, max=>?MASK(64), next=>fun exs1024_next/1,
+       jump=>fun exs1024_jump/1},
+     fun exs1024_seed/1};
+mk_alg(exs1024s) ->
+    {#{type=>exs1024s, bits=>64, weak_low_bits=>3, next=>fun exs1024_next/1,
+       jump=>fun exs1024_jump/1},
+     fun exs1024_seed/1};
+mk_alg(exrop) ->
+    {#{type=>exrop, bits=>58, weak_low_bits=>1, next=>fun exrop_next/1,
+       uniform=>fun exrop_uniform/1, uniform_n=>fun exrop_uniform/2,
+       jump=>fun exrop_jump/1},
+     fun exrop_seed/1};
+mk_alg(exro928ss) ->
+    {#{type=>exro928ss, bits=>58, next=>fun exro928ss_next/1,
+       uniform=>fun exro928ss_uniform/1,
+       uniform_n=>fun exro928ss_uniform/2,
+       jump=>fun exro928_jump/1},
+     fun exro928_seed/1}.
 
 %% =====================================================================
 %% exs64 PRNG: Xorshift64*
@@ -222,29 +649,29 @@ mk_alg(exs1024) ->
 %% Reference URL: http://xorshift.di.unimi.it/
 %% =====================================================================
 
--type exs64_state() :: uint64().
+-opaque exs64_state() :: uint64().
 
+exs64_seed(L) when is_list(L) ->
+    [R] = seed64_nz(1, L),
+    R;
+exs64_seed(A) when is_integer(A) ->
+    [R] = seed64(1, ?MASK(64, A)),
+    R;
+%%
+%% Traditional integer triplet seed
 exs64_seed({A1, A2, A3}) ->
-    {V1, _} = exs64_next(((A1 band ?UINT32MASK) * 4294967197 + 1)),
-    {V2, _} = exs64_next(((A2 band ?UINT32MASK) * 4294967231 + 1)),
-    {V3, _} = exs64_next(((A3 band ?UINT32MASK) * 4294967279 + 1)),
-    ((V1 * V2 * V3) rem (?UINT64MASK - 1)) + 1.
+    {V1, _} = exs64_next((?MASK(32, A1) * 4294967197 + 1)),
+    {V2, _} = exs64_next((?MASK(32, A2) * 4294967231 + 1)),
+    {V3, _} = exs64_next((?MASK(32, A3) * 4294967279 + 1)),
+    ((V1 * V2 * V3) rem (?MASK(64) - 1)) + 1.
 
 %% Advance xorshift64* state for one step and generate 64bit unsigned integer
 -spec exs64_next(exs64_state()) -> {uint64(), exs64_state()}.
 exs64_next(R) ->
     R1 = R bxor (R bsr 12),
-    R2 = R1 bxor ((R1 band ?UINT39MASK) bsl 25),
+    R2 = R1 bxor ?BSL(64, R1, 25),
     R3 = R2 bxor (R2 bsr 27),
-    {(R3 * 2685821657736338717) band ?UINT64MASK, R3}.
-
-exs64_uniform({Alg, R0}) ->
-    {V, R1} = exs64_next(R0),
-    {V / 18446744073709551616, {Alg, R1}}.
-
-exs64_uniform(Max, {Alg, R}) ->
-    {V, R1} = exs64_next(R),
-    {(V rem Max) + 1, {Alg, R1}}.
+    {?MASK(64, R3 * 2685821657736338717), R3}.
 
 %% =====================================================================
 %% exsplus PRNG: Xorshift116+
@@ -253,35 +680,191 @@ exs64_uniform(Max, {Alg, R}) ->
 %% 58 bits fits into an immediate on 64bits erlang and is thus much faster.
 %% Modification of the original Xorshift128+ algorithm to 116
 %% by Sebastiano Vigna, a lot of thanks for his help and work.
+%%
+%% Reference C code for Xorshift116+ and Xorshift116**
+%%
+%% #include <stdint.h>
+%%
+%% #define MASK(b, v) (((UINT64_C(1) << (b)) - 1) & (v))
+%% #define BSL(b, v, n) (MASK((b)-(n), (v)) << (n))
+%% #define ROTL(b, v, n) (BSL((b), (v), (n)) | ((v) >> ((b)-(n))))
+%%
+%% uint64_t s[2];
+%%
+%% uint64_t next(void) {
+%%     uint64_t s1 = s[0];
+%%     const uint64_t s0 = s[1];
+%%
+%%     s1 ^= BSL(58, s1, 24); // a
+%%     s1 ^= s0 ^ (s1 >> 11) ^ (s0 >> 41); // b, c
+%%     s[0] = s0;
+%%     s[1] = s1;
+%%
+%%     const uint64_t result_plus = MASK(58, s0 + s1);
+%%     uint64_t result_starstar = s0;
+%%     result_starstar = MASK(58, result_starstar * 5);
+%%     result_starstar = ROTL(58, result_starstar, 7);
+%%     result_starstar = MASK(58, result_starstar * 9);
+%%
+%%     return result_plus;
+%%     return result_starstar;
+%% }
+%%
 %% =====================================================================
--type exsplus_state() :: nonempty_improper_list(uint58(), uint58()).
+-opaque exsplus_state() :: nonempty_improper_list(uint58(), uint58()).
 
 -dialyzer({no_improper_lists, exsplus_seed/1}).
 
+exsplus_seed(L) when is_list(L) ->
+    [S0,S1] = seed58_nz(2, L),
+    [S0|S1];
+exsplus_seed(X) when is_integer(X) ->
+    [S0,S1] = seed58(2, ?MASK(64, X)),
+    [S0|S1];
+%%
+%% Traditional integer triplet seed
 exsplus_seed({A1, A2, A3}) ->
-    {_, R1} = exsplus_next([(((A1 * 4294967197) + 1) band ?UINT58MASK)|
-			    (((A2 * 4294967231) + 1) band ?UINT58MASK)]),
-    {_, R2} = exsplus_next([(((A3 * 4294967279) + 1) band ?UINT58MASK)|
-			    tl(R1)]),
+    {_, R1} = exsplus_next(
+                [?MASK(58, (A1 * 4294967197) + 1)|
+                 ?MASK(58, (A2 * 4294967231) + 1)]),
+    {_, R2} = exsplus_next(
+                [?MASK(58, (A3 * 4294967279) + 1)|
+                 tl(R1)]),
     R2.
 
+-dialyzer({no_improper_lists, exsss_seed/1}).
+
+exsss_seed(L) when is_list(L) ->
+    [S0,S1] = seed58_nz(2, L),
+    [S0|S1];
+exsss_seed(X) when is_integer(X) ->
+    [S0,S1] = seed58(2, ?MASK(64, X)),
+    [S0|S1];
+%%
+%% Seed from traditional integer triple - mix into splitmix
+exsss_seed({A1, A2, A3}) ->
+    {_, X0} = seed58(?MASK(64, A1)),
+    {S0, X1} = seed58(?MASK(64, A2) bxor X0),
+    {S1, _} = seed58(?MASK(64, A3) bxor X1),
+    [S0|S1].
+
+%% Advance Xorshift116 state one step
+-define(
+   exs_next(S0, S1, S1_b),
+   begin
+       S1_b = S1 bxor ?BSL(58, S1, 24),
+       S1_b bxor S0 bxor (S1_b bsr 11) bxor (S0 bsr 41)
+   end).
+
+-define(
+   scramble_starstar(S, V_a, V_b),
+   begin
+       %% The multiply by add shifted trick avoids creating bignums
+       %% which improves performance significantly
+       %%
+       V_a = ?MASK(58, S + ?BSL(58, S, 2)), % V_a = S * 5
+       V_b = ?ROTL(58, V_a, 7),
+       ?MASK(58, V_b + ?BSL(58, V_b, 3)) % V_b * 9
+   end).
+
 -dialyzer({no_improper_lists, exsplus_next/1}).
 
-%% Advance xorshift116+ state for one step and generate 58bit unsigned integer
+%% Advance state and generate 58bit unsigned integer
 -spec exsplus_next(exsplus_state()) -> {uint58(), exsplus_state()}.
 exsplus_next([S1|S0]) ->
     %% Note: members s0 and s1 are swapped here
-    S11 = (S1 bxor (S1 bsl 24)) band ?UINT58MASK,
-    S12 = S11 bxor S0 bxor (S11 bsr 11) bxor (S0 bsr 41),
-    {(S0 + S12) band ?UINT58MASK, [S0|S12]}.
+    NewS1 = ?exs_next(S0, S1, S1_1),
+    {?MASK(58, S0 + NewS1), [S0|NewS1]}.
+%%    %% Note: members s0 and s1 are swapped here
+%%    S11 = S1 bxor ?BSL(58, S1, 24),
+%%    S12 = S11 bxor S0 bxor (S11 bsr 11) bxor (S0 bsr 41),
+%%    {?MASK(58, S0 + S12), [S0|S12]}.
+
+-dialyzer({no_improper_lists, exsss_next/1}).
+
+-spec exsss_next(exsplus_state()) -> {uint58(), exsplus_state()}.
+exsss_next([S1|S0]) ->
+    %% Note: members s0 and s1 are swapped here
+    NewS1 = ?exs_next(S0, S1, S1_1),
+    {?scramble_starstar(S0, V_0, V_1), [S0|NewS1]}.
+%%    {?MASK(58, S0 + NewS1), [S0|NewS1]}.
 
-exsplus_uniform({Alg, R0}) ->
+exsp_uniform({Alg, R0}) ->
     {I, R1} = exsplus_next(R0),
-    {I / (?UINT58MASK+1), {Alg, R1}}.
+    %% Waste the lowest bit since it is of lower
+    %% randomness quality than the others
+    {(I bsr (58-53)) * ?TWO_POW_MINUS53, {Alg, R1}}.
+
+exsss_uniform({Alg, R0}) ->
+    {I, R1} = exsss_next(R0),
+    {(I bsr (58-53)) * ?TWO_POW_MINUS53, {Alg, R1}}.
 
-exsplus_uniform(Max, {Alg, R}) ->
+exsp_uniform(Range, {Alg, R}) ->
     {V, R1} = exsplus_next(R),
-    {(V rem Max) + 1, {Alg, R1}}.
+    MaxMinusRange = ?BIT(58) - Range,
+    ?uniform_range(Range, Alg, R1, V, MaxMinusRange, I).
+
+exsss_uniform(Range, {Alg, R}) ->
+    {V, R1} = exsss_next(R),
+    MaxMinusRange = ?BIT(58) - Range,
+    ?uniform_range(Range, Alg, R1, V, MaxMinusRange, I).
+
+
+%% This is the jump function for the exs* generators,
+%% i.e the Xorshift116 generators,  equivalent
+%% to 2^64 calls to next/1; it can be used to generate 2^52
+%% non-overlapping subsequences for parallel computations.
+%% Note: the jump function takes 116 times of the execution time of
+%% next/1.
+%%
+%% #include <stdint.h>
+%%
+%% void jump(void) {
+%%   static const uint64_t JUMP[] = { 0x02f8ea6bc32c797,
+%% 				   0x345d2a0f85f788c };
+%%   int i, b;
+%%   uint64_t s0 = 0;
+%%   uint64_t s1 = 0;
+%%   for(i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
+%%     for(b = 0; b < 58; b++) {
+%%       if (JUMP[i] & 1ULL << b) {
+%% 	s0 ^= s[0];
+%% 	s1 ^= s[1];
+%%       }
+%%       next();
+%%     }
+%%   s[0] = s0;
+%%   s[1] = s1;
+%% }
+%%
+%% -define(JUMPCONST, 16#000d174a83e17de2302f8ea6bc32c797).
+%% split into 58-bit chunks
+%% and two iterative executions
+
+-define(JUMPCONST1, 16#02f8ea6bc32c797).
+-define(JUMPCONST2, 16#345d2a0f85f788c).
+-define(JUMPELEMLEN, 58).
+
+-dialyzer({no_improper_lists, exsplus_jump/1}).
+-spec exsplus_jump({alg_handler(), exsplus_state()}) ->
+                          {alg_handler(), exsplus_state()}.
+exsplus_jump({Alg, S}) ->
+    {S1, AS1} = exsplus_jump(S, [0|0], ?JUMPCONST1, ?JUMPELEMLEN),
+    {_,  AS2} = exsplus_jump(S1, AS1,  ?JUMPCONST2, ?JUMPELEMLEN),
+    {Alg, AS2}.
+
+-dialyzer({no_improper_lists, exsplus_jump/4}).
+exsplus_jump(S, AS, _, 0) ->
+    {S, AS};
+exsplus_jump(S, [AS0|AS1], J, N) ->
+    {_, NS} = exsplus_next(S),
+    case ?MASK(1, J) of
+        1 ->
+            [S0|S1] = S,
+            exsplus_jump(NS, [(AS0 bxor S0)|(AS1 bxor S1)], J bsr 1, N-1);
+        0 ->
+            exsplus_jump(NS, [AS0|AS1], J bsr 1, N-1)
+    end.
 
 %% =====================================================================
 %% exs1024 PRNG: Xorshift1024*
@@ -289,12 +872,18 @@ exsplus_uniform(Max, {Alg, R}) ->
 %% Reference URL: http://xorshift.di.unimi.it/
 %% =====================================================================
 
--type exs1024_state() :: {list(uint64()), list(uint64())}.
+-opaque exs1024_state() :: {list(uint64()), list(uint64())}.
 
+exs1024_seed(L) when is_list(L) ->
+    {seed64_nz(16, L), []};
+exs1024_seed(X) when is_integer(X) ->
+    {seed64(16, ?MASK(64, X)), []};
+%%
+%% Seed from traditional triple, remain backwards compatible
 exs1024_seed({A1, A2, A3}) ->
-    B1 = (((A1 band ?UINT21MASK) + 1) * 2097131) band ?UINT21MASK,
-    B2 = (((A2 band ?UINT21MASK) + 1) * 2097133) band ?UINT21MASK,
-    B3 = (((A3 band ?UINT21MASK) + 1) * 2097143) band ?UINT21MASK,
+    B1 = ?MASK(21, (?MASK(21, A1) + 1) * 2097131),
+    B2 = ?MASK(21, (?MASK(21, A2) + 1) * 2097133),
+    B3 = ?MASK(21, (?MASK(21, A3) + 1) * 2097143),
     {exs1024_gen1024((B1 bsl 43) bor (B2 bsl 22) bor (B3 bsl 1) bor 1),
      []}.
 
@@ -317,11 +906,11 @@ exs1024_gen1024(N, R, L) ->
 %% X: random number output
 -spec exs1024_calc(uint64(), uint64()) -> {uint64(), uint64()}.
 exs1024_calc(S0, S1) ->
-    S11 = S1 bxor ((S1 band ?UINT33MASK) bsl 31),
+    S11 = S1 bxor ?BSL(64, S1, 31),
     S12 = S11 bxor (S11 bsr 11),
     S01 = S0 bxor (S0 bsr 30),
     NS1 = S01 bxor S12,
-    {(NS1 * 1181783497276652981) band ?UINT64MASK, NS1}.
+    {?MASK(64, NS1 * 1181783497276652981), NS1}.
 
 %% Advance xorshift1024* state for one step and generate 64bit unsigned integer
 -spec exs1024_next(exs1024_state()) -> {uint64(), exs1024_state()}.
@@ -332,13 +921,524 @@ exs1024_next({[H], RL}) ->
     NL = [H|lists:reverse(RL)],
     exs1024_next({NL, []}).
 
-exs1024_uniform({Alg, R0}) ->
-    {V, R1} = exs1024_next(R0),
-    {V / 18446744073709551616, {Alg, R1}}.
 
-exs1024_uniform(Max, {Alg, R}) ->
-    {V, R1} = exs1024_next(R),
-    {(V rem Max) + 1, {Alg, R1}}.
+%% This is the jump function for the exs1024 generator, equivalent
+%% to 2^512 calls to next(); it can be used to generate 2^512
+%% non-overlapping subsequences for parallel computations.
+%% Note: the jump function takes ~2000 times of the execution time of
+%% next/1.
+
+%% Jump constant here split into 58 bits for speed
+-define(JUMPCONSTHEAD, 16#00242f96eca9c41d).
+-define(JUMPCONSTTAIL,
+        [16#0196e1ddbe5a1561,
+         16#0239f070b5837a3c,
+         16#03f393cc68796cd2,
+         16#0248316f404489af,
+         16#039a30088bffbac2,
+         16#02fea70dc2d9891f,
+         16#032ae0d9644caec4,
+         16#0313aac17d8efa43,
+         16#02f132e055642626,
+         16#01ee975283d71c93,
+         16#00552321b06f5501,
+         16#00c41d10a1e6a569,
+         16#019158ecf8aa1e44,
+         16#004e9fc949d0b5fc,
+         16#0363da172811fdda,
+         16#030e38c3b99181f2,
+         16#0000000a118038fc]).
+-define(JUMPTOTALLEN, 1024).
+-define(RINGLEN, 16).
+
+-spec exs1024_jump({alg_handler(), exs1024_state()}) ->
+                          {alg_handler(), exs1024_state()}.
+exs1024_jump({Alg, {L, RL}}) ->
+    P = length(RL),
+    AS = exs1024_jump({L, RL},
+         [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+         ?JUMPCONSTTAIL, ?JUMPCONSTHEAD, ?JUMPELEMLEN, ?JUMPTOTALLEN),
+    {ASL, ASR} = lists:split(?RINGLEN - P, AS),
+    {Alg, {ASL, lists:reverse(ASR)}}.
+
+exs1024_jump(_, AS, _, _, _, 0) ->
+    AS;
+exs1024_jump(S, AS, [H|T], _, 0, TN) ->
+    exs1024_jump(S, AS, T, H, ?JUMPELEMLEN, TN);
+exs1024_jump({L, RL}, AS, JL, J, N, TN) ->
+    {_, NS} = exs1024_next({L, RL}),
+    case ?MASK(1, J) of
+        1 ->
+            AS2 = lists:zipwith(fun(X, Y) -> X bxor Y end,
+                        AS, L ++ lists:reverse(RL)),
+            exs1024_jump(NS, AS2, JL, J bsr 1, N-1, TN-1);
+        0 ->
+            exs1024_jump(NS, AS, JL, J bsr 1, N-1, TN-1)
+    end.
+
+%% =====================================================================
+%% exro928ss PRNG: Xoroshiro928**
+%%
+%% Reference URL: http://vigna.di.unimi.it/ftp/papers/ScrambledLinear.pdf
+%% i.e the Xoroshiro1024 generator with ** scrambler
+%% with {S, R, T} = {5, 7, 9} as recommended in the paper.
+%%
+%% {A, B, C} were tried out and selected as {44, 9, 45}
+%% and the jump coefficients calculated.
+%%
+%% Standard jump function pseudocode:
+%% 
+%%     Jump constant j = 0xb10773cb...44085302f77130ca
+%%     Generator state: s
+%%     New generator state: t = 0
+%%     foreach bit in j, low to high:
+%%         if the bit is one:
+%%             t ^= s
+%%         next s
+%%     s = t
+%%
+%% Generator used for reference value calculation:
+%%
+%%     #include <stdint.h>
+%%     #include <stdio.h>
+%%     
+%%     int p = 0;
+%%     uint64_t s[16];
+%%     
+%%     #define MASK(x) ((x) & ((UINT64_C(1) << 58) - 1))
+%%     static __inline uint64_t rotl(uint64_t x, int n) {
+%%         return MASK(x << n) | (x >> (58 - n));
+%%     }
+%%     
+%%     uint64_t next() {
+%%         const int q = p;
+%%         const uint64_t s0 = s[p = (p + 1) & 15];
+%%         uint64_t s15 = s[q];
+%%     
+%%         const uint64_t result_starstar = MASK(rotl(MASK(s0 * 5), 7) * 9);
+%%     
+%%         s15 ^= s0;
+%%         s[q] = rotl(s0, 44) ^ s15 ^ MASK(s15 << 9);
+%%         s[p] = rotl(s15, 45);
+%%     
+%%         return result_starstar;
+%%     }
+%%
+%%     static const uint64_t jump_2pow512[15] =
+%%         { 0x44085302f77130ca, 0xba05381fdfd14902, 0x10a1de1d7d6813d2,
+%%           0xb83fe51a1eb3be19, 0xa81b0090567fd9f0, 0x5ac26d5d20f9b49f,
+%%           0x4ddd98ee4be41e01, 0x0657e19f00d4b358, 0xf02f778573cf0f0a,
+%%           0xb45a3a8a3cef3cc0, 0x6e62a33cc2323831, 0xbcb3b7c4cc049c53,
+%%           0x83f240c6007e76ce, 0xe19f5fc1a1504acd, 0x00000000b10773cb };
+%%
+%%     static const uint64_t jump_2pow20[15] =
+%%         { 0xbdb966a3daf905e6, 0x644807a56270cf78, 0xda90f4a806c17e9e,
+%%           0x4a426866bfad3c77, 0xaf699c306d8e7566, 0x8ebc73c700b8b091,
+%%           0xc081a7bf148531fb, 0xdc4d3af15f8a4dfd, 0x90627c014098f4b6,
+%%           0x06df2eb1feaf0fb6, 0x5bdeb1a5a90f2e6b, 0xa480c5878c3549bd,
+%%           0xff45ef33c82f3d48, 0xa30bebc15fefcc78, 0x00000000cb3d181c };
+%%
+%%     void jump(const uint64_t *jump) {
+%%         uint64_t j, t[16] = {0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
+%%         int m, n, k;
+%%         for (m = 0;  m < 15;  m++, jump++) {
+%%             for (n = 0, j = *jump;  n < 64;  n++, j >>= 1) {
+%%                 if ((j & 1) != 0) {
+%%                     for (k = 0;  k < 16;  k++) {
+%%                         t[k] ^= s[(p + k) & 15];
+%%                     }
+%%                 }
+%%                 next();
+%%             }
+%%         }
+%%         for (k = 0;  k < 16;  k++) {
+%%             s[(p + k) & 15] = t[k];
+%%         }
+%%     }
+%%
+%% =====================================================================
+
+-opaque exro928_state() :: {list(uint58()), list(uint58())}.
+
+-spec exro928_seed(
+        list(uint58()) | integer() | {integer(), integer(), integer()}) ->
+                          exro928_state().
+exro928_seed(L) when is_list(L) ->
+    {seed58_nz(16, L), []};
+exro928_seed(X) when is_integer(X) ->
+    {seed58(16, ?MASK(64, X)), []};
+%%
+%% Seed from traditional integer triple - mix into splitmix
+exro928_seed({A1, A2, A3}) ->
+    {S0, X0} = seed58(?MASK(64, A1)),
+    {S1, X1} = seed58(?MASK(64, A2) bxor X0),
+    {S2, X2} = seed58(?MASK(64, A3) bxor X1),
+    {[S0,S1,S2|seed58(13, X2)], []}.
+
+
+%% Update the state and calculate output word
+-spec exro928ss_next(exro928_state()) -> {uint58(), exro928_state()}.
+exro928ss_next({[S15,S0|Ss], Rs}) ->
+    SR = exro928_next_state(Ss, Rs, S15, S0),
+    %%
+    %% {S, R, T} = {5, 7, 9}
+    %% const uint64_t result_starstar = rotl(s0 * S, R) * T;
+    %%
+    {?scramble_starstar(S0, V_0, V_1), SR};
+%%    %% The multiply by add shifted trick avoids creating bignums
+%%    %% which improves performance significantly
+%%    %%
+%%    V0 = ?MASK(58, S0 + ?BSL(58, S0, 2)), % V0 = S0 * 5
+%%    V1 = ?ROTL(58, V0, 7),
+%%    V = ?MASK(58, V1 + ?BSL(58, V1, 3)), % V = V1 * 9
+%%    {V, SR};
+exro928ss_next({[S15], Rs}) ->
+    exro928ss_next({[S15|lists:reverse(Rs)], []}).
+
+-spec exro928_next(exro928_state()) -> {{uint58(),uint58()}, exro928_state()}.
+exro928_next({[S15,S0|Ss], Rs}) ->
+    SR = exro928_next_state(Ss, Rs, S15, S0),
+    {{S15,S0}, SR};
+exro928_next({[S15], Rs}) ->
+    exro928_next({[S15|lists:reverse(Rs)], []}).
+
+%% Just update the state
+-spec exro928_next_state(exro928_state()) -> exro928_state().
+exro928_next_state({[S15,S0|Ss], Rs}) ->
+    exro928_next_state(Ss, Rs, S15, S0);
+exro928_next_state({[S15], Rs}) ->
+    [S0|Ss] = lists:reverse(Rs),
+    exro928_next_state(Ss, [], S15, S0).
+
+exro928_next_state(Ss, Rs, S15, S0) ->
+    %% {A, B, C} = {44, 9, 45},
+    %% s15 ^= s0;
+    %% NewS15: s[q] = rotl(s0, A) ^ s15 ^ (s15 << B);
+    %% NewS0: s[p] = rotl(s15, C);
+    %%
+    Q = S15 bxor S0,
+    NewS15 = ?ROTL(58, S0, 44) bxor Q bxor ?BSL(58, Q, 9),
+    NewS0 = ?ROTL(58, Q, 45),
+    {[NewS0|Ss], [NewS15|Rs]}.
+
+
+exro928ss_uniform({Alg, SR}) ->
+    {V, NewSR} = exro928ss_next(SR),
+    {(V bsr (58-53)) * ?TWO_POW_MINUS53, {Alg, NewSR}}.
+
+exro928ss_uniform(Range, {Alg, SR}) ->
+    {V, NewSR} = exro928ss_next(SR),
+    MaxMinusRange = ?BIT(58) - Range,
+    ?uniform_range(Range, Alg, NewSR, V, MaxMinusRange, I).
+
+
+-spec exro928_jump({alg_handler(), exro928_state()}) ->
+                          {alg_handler(), exro928_state()}.
+exro928_jump({Alg, SR}) ->
+    {Alg,exro928_jump_2pow512(SR)}.
+
+-spec exro928_jump_2pow512(exro928_state()) -> exro928_state().
+exro928_jump_2pow512(SR) ->
+    polyjump(
+      SR, fun exro928_next_state/1,
+      %% 2^512
+      [16#4085302F77130CA, 16#54E07F7F4524091,
+       16#5E1D7D6813D2BA0, 16#4687ACEF8644287,
+       16#4567FD9F0B83FE5, 16#43E6D27EA06C024,
+       16#641E015AC26D5D2, 16#6CD61377663B92F,
+       16#70A0657E19F00D4, 16#43C0BDDE15CF3C3,
+       16#745A3A8A3CEF3CC, 16#58A8CF308C8E0C6,
+       16#7B7C4CC049C536E, 16#431801F9DB3AF2C,
+       16#41A1504ACD83F24, 16#6C41DCF2F867D7F]).
+
+-spec exro928_jump_2pow20(exro928_state()) -> exro928_state().
+exro928_jump_2pow20(SR) ->
+    polyjump(
+      SR, fun exro928_next_state/1,
+      %% 2^20
+      [16#5B966A3DAF905E6, 16#601E9589C33DE2F,
+       16#74A806C17E9E644, 16#59AFEB4F1DF6A43,
+       16#46D8E75664A4268, 16#42E2C246BDA670C,
+       16#4531FB8EBC73C70, 16#537F702069EFC52,
+       16#4B6DC4D3AF15F8A, 16#5A4189F0050263D,
+       16#46DF2EB1FEAF0FB, 16#77AC696A43CB9AC,
+       16#4C5878C3549BD5B, 16#7CCF20BCF522920,
+       16#415FEFCC78FF45E, 16#72CF460728C2FAF]).
+
+%% =====================================================================
+%% exrop PRNG: Xoroshiro116+
+%%
+%% Reference URL: http://xorshift.di.unimi.it/
+%%
+%% 58 bits fits into an immediate on 64bits Erlang and is thus much faster.
+%% In fact, an immediate number is 60 bits signed in Erlang so you can
+%% add two positive 58 bit numbers and get a 59 bit number that still is
+%% a positive immediate, which is a property we utilize here...
+%%
+%% Modification of the original Xororhiro128+ algorithm to 116 bits
+%% by Sebastiano Vigna.  A lot of thanks for his help and work.
+%% =====================================================================
+%% (a, b, c) = (24, 2, 35)
+%% JUMP Polynomial = 0x9863200f83fcd4a11293241fcb12a (116 bit)
+%%
+%% From http://xoroshiro.di.unimi.it/xoroshiro116plus.c:
+%% ---------------------------------------------------------------------
+%% /* Written in 2017 by Sebastiano Vigna ([email protected]).
+%%
+%% To the extent possible under law, the author has dedicated all copyright
+%% and related and neighboring rights to this software to the public domain
+%% worldwide. This software is distributed without any warranty.
+%%
+%% See <http://creativecommons.org/publicdomain/zero/1.0/>. */
+%%
+%% #include <stdint.h>
+%%
+%% #define UINT58MASK (uint64_t)((UINT64_C(1) << 58) - 1)
+%%
+%% uint64_t s[2];
+%%
+%% static inline uint64_t rotl58(const uint64_t x, int k) {
+%%     return (x << k) & UINT58MASK | (x >> (58 - k));
+%% }
+%% 
+%% uint64_t next(void) {
+%%     uint64_t s1 = s[1];
+%%     const uint64_t s0 = s[0];
+%%     const uint64_t result = (s0 + s1) & UINT58MASK;
+%%
+%%     s1 ^= s0;
+%%     s[0] = rotl58(s0, 24) ^ s1 ^ ((s1 << 2) & UINT58MASK); // a, b
+%%     s[1] = rotl58(s1, 35); // c
+%%     return result;
+%% }
+%%
+%% void jump(void) {
+%%     static const uint64_t JUMP[] =
+%%         { 0x4a11293241fcb12a, 0x0009863200f83fcd };
+%%
+%%     uint64_t s0 = 0;
+%%     uint64_t s1 = 0;
+%%     for(int i = 0; i < sizeof JUMP / sizeof *JUMP; i++)
+%%         for(int b = 0; b < 64; b++) {
+%%             if (JUMP[i] & UINT64_C(1) << b) {
+%%                 s0 ^= s[0];
+%% 	           s1 ^= s[1];
+%% 	       }
+%% 	       next();
+%% 	   }
+%%     s[0] = s0;
+%%     s[1] = s1;
+%% }
+
+-opaque exrop_state() :: nonempty_improper_list(uint58(), uint58()).
+
+-dialyzer({no_improper_lists, exrop_seed/1}).
+
+exrop_seed(L) when is_list(L) ->
+    [S0,S1] = seed58_nz(2, L),
+    [S0|S1];
+exrop_seed(X) when is_integer(X) ->
+    [S0,S1] = seed58(2, ?MASK(64, X)),
+    [S0|S1];
+%%
+%% Traditional integer triplet seed
+exrop_seed({A1, A2, A3}) ->
+    [_|S1] =
+        exrop_next_s(
+          ?MASK(58, (A1 * 4294967197) + 1),
+          ?MASK(58, (A2 * 4294967231) + 1)),
+    exrop_next_s(?MASK(58, (A3 * 4294967279) + 1), S1).
+
+-dialyzer({no_improper_lists, exrop_next_s/2}).
+%% Advance xoroshiro116+ state one step
+%% [a, b, c] = [24, 2, 35]
+-define(
+   exrop_next_s(S0, S1, S1_a),
+   begin
+       S1_a = S1 bxor S0,
+       [?ROTL(58, S0, 24) bxor S1_a bxor ?BSL(58, S1_a, 2)| % a, b
+        ?ROTL(58, S1_a, 35)] % c
+   end).
+exrop_next_s(S0, S1) ->
+    ?exrop_next_s(S0, S1, S1_a).
+
+-dialyzer({no_improper_lists, exrop_next/1}).
+%% Advance xoroshiro116+ state one step, generate 58 bit unsigned integer,
+%% and waste the lowest bit since it is of lower randomness quality
+exrop_next([S0|S1]) ->
+    {?MASK(58, S0 + S1), ?exrop_next_s(S0, S1, S1_a)}.
+
+exrop_uniform({Alg, R}) ->
+    {V, R1} = exrop_next(R),
+    %% Waste the lowest bit since it is of lower
+    %% randomness quality than the others
+    {(V bsr (58-53)) * ?TWO_POW_MINUS53, {Alg, R1}}.
+
+exrop_uniform(Range, {Alg, R}) ->
+    {V, R1} = exrop_next(R),
+    MaxMinusRange = ?BIT(58) - Range,
+    ?uniform_range(Range, Alg, R1, V, MaxMinusRange, I).
+
+%% Split a 116 bit constant into two 58 bit words,
+%% a top '1' marks the end of the low word.
+-define(
+   JUMP_116(Jump),
+   [?BIT(58) bor ?MASK(58, (Jump)),(Jump) bsr 58]).
+%%
+exrop_jump({Alg,S}) ->
+    [J|Js] = ?JUMP_116(16#9863200f83fcd4a11293241fcb12a),
+    {Alg, exrop_jump(S, 0, 0, J, Js)}.
+%%
+-dialyzer({no_improper_lists, exrop_jump/5}).
+exrop_jump(_S, S0, S1, 0, []) -> % End of jump constant
+    [S0|S1];
+exrop_jump(S, S0, S1, 1, [J|Js]) -> % End of word
+    exrop_jump(S, S0, S1, J, Js);
+exrop_jump([S__0|S__1] = _S, S0, S1, J, Js) ->
+    case ?MASK(1, J) of
+        1 ->
+            NewS = exrop_next_s(S__0, S__1),
+            exrop_jump(NewS, S0 bxor S__0, S1 bxor S__1, J bsr 1, Js);
+        0 ->
+            NewS = exrop_next_s(S__0, S__1),
+            exrop_jump(NewS, S0, S1, J bsr 1, Js)
+    end.
+
+%% =====================================================================
+%% Mask and fill state list, ensure not all zeros
+%% =====================================================================
+
+seed58_nz(N, Ss) ->
+    seed_nz(N, Ss, 58, false).
+
+seed64_nz(N, Ss) ->
+    seed_nz(N, Ss, 64, false).
+
+seed_nz(_N, [], _M, false) ->
+    erlang:error(zero_seed);
+seed_nz(0, [_|_], _M, _NZ) ->
+    erlang:error(too_many_seed_integers);
+seed_nz(0, [], _M, _NZ) ->
+    [];
+seed_nz(N, [], M, true) ->
+    [0|seed_nz(N - 1, [], M, true)];
+seed_nz(N, [S|Ss], M, NZ) ->
+    if
+	is_integer(S) ->
+	    R = ?MASK(M, S),
+	    [R|seed_nz(N - 1, Ss, M, NZ orelse R =/= 0)];
+	true ->
+	    erlang:error(non_integer_seed)
+    end.
+
+%% =====================================================================
+%% Splitmix seeders, lowest bits of SplitMix64, zeros skipped
+%% =====================================================================
+
+-spec seed58(non_neg_integer(), uint64()) -> list(uint58()).
+seed58(0, _X) ->
+    [];
+seed58(N, X) ->
+    {Z,NewX} = seed58(X),
+    [Z|seed58(N - 1, NewX)].
+%%
+seed58(X_0) ->
+    {Z0,X} = splitmix64_next(X_0),
+    case ?MASK(58, Z0) of
+	0 ->
+	    seed58(X);
+	Z ->
+	    {Z,X}
+    end.
+
+-spec seed64(non_neg_integer(), uint64()) -> list(uint64()).
+seed64(0, _X) ->
+    [];
+seed64(N, X) ->
+    {Z,NewX} = seed64(X),
+    [Z|seed64(N - 1, NewX)].
+%%
+seed64(X_0) ->
+    {Z,X} = ZX = splitmix64_next(X_0),
+    if
+	Z =:= 0 ->
+	    seed64(X);
+	true ->
+	    ZX
+    end.
+
+%% The SplitMix64 generator:
+%%
+%% uint64_t splitmix64_next() {
+%% 	uint64_t z = (x += 0x9e3779b97f4a7c15);
+%% 	z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
+%% 	z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
+%% 	return z ^ (z >> 31);
+%% }
+%%
+splitmix64_next(X_0) ->
+    X = ?MASK(64, X_0 + 16#9e3779b97f4a7c15),
+    Z_0 = ?MASK(64, (X bxor (X bsr 30)) * 16#bf58476d1ce4e5b9),
+    Z_1 = ?MASK(64, (Z_0 bxor (Z_0 bsr 27)) * 16#94d049bb133111eb),
+    {?MASK(64, Z_1 bxor (Z_1 bsr 31)),X}.
+
+%% =====================================================================
+%% Polynomial jump with a jump constant word list,
+%% high bit in each word marking top of word,
+%% SR is a {Forward, Reverse} queue tuple with Forward never empty
+%% =====================================================================
+
+polyjump({Ss, Rs} = SR, NextState, JumpConst) ->
+    %% Create new state accumulator T
+    Ts = lists:duplicate(length(Ss) + length(Rs), 0),
+    polyjump(SR, NextState, JumpConst, Ts).
+%%
+%% Foreach jump word
+polyjump(_SR, _NextState, [], Ts) ->
+    %% Return new calculated state
+    {Ts, []};
+polyjump(SR, NextState, [J|Js], Ts) ->
+    polyjump(SR, NextState, Js, Ts, J).
+%%
+%% Foreach bit in jump word until top bit
+polyjump(SR, NextState, Js, Ts, 1) ->
+    polyjump(SR, NextState, Js, Ts);
+polyjump({Ss, Rs} = SR, NextState, Js, Ts, J) when J =/= 0 ->
+    NewSR = NextState(SR),
+    NewJ = J bsr 1,
+    case ?MASK(1, J) of
+        0 ->
+            polyjump(NewSR, NextState, Js, Ts, NewJ);
+        1 ->
+            %% Xor this state onto T
+            polyjump(NewSR, NextState, Js, xorzip_sr(Ts, Ss, Rs), NewJ)
+    end.
+
+xorzip_sr([], [], undefined) ->
+    [];
+xorzip_sr(Ts, [], Rs) ->
+    xorzip_sr(Ts, lists:reverse(Rs), undefined);
+xorzip_sr([T|Ts], [S|Ss], Rs) ->
+    [T bxor S|xorzip_sr(Ts, Ss, Rs)].
+
+%% =====================================================================
+
+format_jumpconst58(String) ->
+    ReOpts = [{newline,any},{capture,all_but_first,binary},global],
+    {match,Matches} = re:run(String, "0x([a-zA-Z0-9]+)", ReOpts),
+    format_jumcons58_matches(lists:reverse(Matches), 0).
+
+format_jumcons58_matches([], J) ->
+    format_jumpconst58_value(J);
+format_jumcons58_matches([[Bin]|Matches], J) ->
+    NewJ = (J bsl 64) bor binary_to_integer(Bin, 16),
+    format_jumcons58_matches(Matches, NewJ).
+
+format_jumpconst58_value(0) ->
+    ok;
+format_jumpconst58_value(J) ->
+    io:format("16#~s,~n", [integer_to_list(?MASK(58, J) bor ?BIT(58), 16)]),
+    format_jumpconst58_value(J bsr 58).
 
 %% =====================================================================
 %% Ziggurat cont
@@ -347,9 +1447,13 @@ exs1024_uniform(Max, {Alg, R}) ->
 -define(NOR_INV_R, 1/?NOR_R).
 
 %% return a {sign, Random51bits, State}
+get_52({Alg=#{bits:=Bits, next:=Next}, S0}) ->
+    %% Use the high bits
+    {Int,S1} = Next(S0),
+    {?BIT(Bits - 51 - 1) band Int, Int bsr (Bits - 51), {Alg, S1}};
 get_52({Alg=#{next:=Next}, S0}) ->
     {Int,S1} = Next(S0),
-    {((1 bsl 51) band Int), Int band ((1 bsl 51)-1), {Alg, S1}}.
+    {?BIT(51) band Int, ?MASK(51, Int), {Alg, S1}}.
 
 %% Slow path
 normal_s(0, Sign, X0, State0) ->
@@ -594,3 +1698,42 @@ normal_fi(Indx) ->
 	     1.0214971439701471e-02,8.6165827693987316e-03,7.0508754713732268e-03,
 	     5.5224032992509968e-03,4.0379725933630305e-03,2.6090727461021627e-03,
 	     1.2602859304985975e-03}).
+
+%%%bitcount64(0) -> 0;
+%%%bitcount64(V) -> 1 + bitcount(V, 64).
+%%%
+%%%-define(
+%%%   BITCOUNT(V, N),
+%%%   bitcount(V, N) ->
+%%%       if
+%%%           (1 bsl ((N) bsr 1)) =< (V) ->
+%%%               ((N) bsr 1) + bitcount((V) bsr ((N) bsr 1), ((N) bsr 1));
+%%%           true ->
+%%%               bitcount((V), ((N) bsr 1))
+%%%       end).
+%%%?BITCOUNT(V, 64);
+%%%?BITCOUNT(V, 32);
+%%%?BITCOUNT(V, 16);
+%%%?BITCOUNT(V, 8);
+%%%?BITCOUNT(V, 4);
+%%%?BITCOUNT(V, 2);
+%%%bitcount(_, 1) -> 0.
+
+bc64(V) -> ?BC(V, 64).
+
+%% Linear from high bit - higher probability first gives faster execution
+bc(V, B, N) when B =< V -> N;
+bc(V, B, N) -> bc(V, B bsr 1, N - 1).
+    
+make_float(S, E, M) ->
+    <<F/float>> = <<S:1, E:11, M:52>>,
+    F.
+
+float2str(N) ->
+    <<S:1, E:11, M:52>> = <<(float(N))/float>>,
+    lists:flatten(
+      io_lib:format(
+      "~c~c.~13.16.0bE~b",
+      [case S of 1 -> $-; 0 -> $+ end,
+       case E of 0 -> $0; _ -> $1 end,
+       M, E - 16#3ff])).