aboutsummaryrefslogtreecommitdiffstats
path: root/erts
diff options
context:
space:
mode:
authorSverker Eriksson <[email protected]>2015-03-20 16:36:34 +0100
committerSverker Eriksson <[email protected]>2015-03-20 17:32:12 +0100
commit6071cbf00680b94d107906e1c7f1e13f79676479 (patch)
treee2b357cd519f5f00dd687c62532d551d6a653946 /erts
parentf958a56e0c7c672cd19f386b58f9abd54e128a6d (diff)
downloadotp-6071cbf00680b94d107906e1c7f1e13f79676479.tar.gz
otp-6071cbf00680b94d107906e1c7f1e13f79676479.tar.bz2
otp-6071cbf00680b94d107906e1c7f1e13f79676479.zip
erts: Fix hashmap overestimation
Old overestimation assumed an average of k/3 nodes. This can be bad for large maps (with tight standard deviations) as the average vary between 0.3*k and up to almost 0.4*k.
Diffstat (limited to 'erts')
-rw-r--r--erts/emulator/beam/erl_map.c27
1 files changed, 16 insertions, 11 deletions
diff --git a/erts/emulator/beam/erl_map.c b/erts/emulator/beam/erl_map.c
index bd6da0a6f1..bcbda65da0 100644
--- a/erts/emulator/beam/erl_map.c
+++ b/erts/emulator/beam/erl_map.c
@@ -2549,19 +2549,24 @@ static int int_sqrt_ceiling(Uint x)
}
}
-Uint hashmap_over_estimated_heap_size(Uint n)
+Uint hashmap_over_estimated_heap_size(Uint k)
{
- /* n is nr of key-value pairs.
- Average nr of nodes is about n/3.
- Standard deviation is about sqrt(n)/3.
- Assuming normal probability distribution,
- we overestimate nr of nodes by 14 std.devs, which gives a probability
- for overrun of 1.0e-49 (same magnitude as a git SHA1 collision).
+ /* k is nr of key-value pairs.
+ N(k) is expected nr of nodes in hamt.
+
+ Observation:
+ For uniformly distributed hash values, average of N varies between
+ 0.3*k and 0.4*k (with a beautiful sine curve)
+ and standard deviation of N is about sqrt(k)/3.
+
+ Assuming normal probability distribution, we overestimate nr of nodes
+ by 15 std.devs above the average, which gives a probability for overrun
+ less than 1.0e-49 (same magnitude as a git SHA1 collision).
*/
- Uint nodes = (n + int_sqrt_ceiling(n)*14) / 3;
- return (n*2 + /* leaf cons cells */
- n + /* leaf list terms */
- nodes*2); /* headers + parent refs */
+ Uint max_nodes = 2*k/5 + (15/3)*int_sqrt_ceiling(k);
+ return (k*2 + /* leaf cons cells */
+ k + /* leaf list terms */
+ max_nodes*2); /* headers + parent boxed terms */
}