4 files changed, 27 insertions, 13 deletions
diff --git a/erts/doc/src/atomics.xml b/erts/doc/src/atomics.xml
index 46231d9234..f552c11e18 100644
--- a/erts/doc/src/atomics.xml
+++ b/erts/doc/src/atomics.xml
@@ -30,7 +30,7 @@
     mutable atomic variables. The implementation utilizes only
     atomic hardware instructions without any software level locking, which makes
     it very efficient for concurrent access. The atomics are organized into
-    arrays with the follwing semantics:</p>
+    arrays with the following semantics:</p>
     <list type="bulleted">
       <item>
 	<p>Atomics are 64 bit integers.</p>
diff --git a/erts/doc/src/counters.xml b/erts/doc/src/counters.xml
index c3b0bfcf85..3d26093a59 100644
--- a/erts/doc/src/counters.xml
+++ b/erts/doc/src/counters.xml
@@ -29,7 +29,7 @@
     <p>This module provides a set of functions to do operations towards
     shared mutable counter variables. The implementation does not utilize any
     software level locking, which makes it very efficient for concurrent
-    access. The counters are organized into arrays with the follwing
+    access. The counters are organized into arrays with the following
     semantics:</p>
     <list type="bulleted">
       <item>
@@ -80,7 +80,7 @@
 	<taglist>
 	  <tag><c>atomics</c> (Default)</tag>
 	  <item><p>Counters will be sequentially consistent. If write
-	  operation A is done sequencially before write operation B, then a concurrent reader
+	  operation A is done sequentially before write operation B, then a concurrent reader
 	  may see none of them, only A, or both A and B. It cannot see only B.</p>
 	  </item>
 	  <tag><c>write_concurrency</c></tag>
@@ -90,7 +90,7 @@
 	  inconsistency and memory consumption per counter.</p>
 	  <p>Read operations may see sequentially inconsistent results with
 	  regard to concurrent write operations. Even if write operation A is done
-	  sequencially before write operation B, a concurrent reader may see any
+	  sequentially before write operation B, a concurrent reader may see any
 	  combination of A and B, including only B. A read operation is only
 	  guaranteed to see all writes done sequentially before the read. No writes
 	  are ever lost, but will eventually all be seen.</p>
@@ -140,7 +140,7 @@
 	<c><anno>Ix</anno></c>.</p>
 	<note>
 	  <p>Despite its name, the <c>write_concurrency</c> optimization does not
-	  improve <c>put</c>. A call to <c>put</c> is a relative heavy
+	  improve <c>put</c>. A call to <c>put</c> is a relatively heavy
 	  operation compared to the very lightweight and scalable <seealso
 	  marker="#add/3"><c>add</c></seealso> and <seealso marker="#sub/3">
 	  <c>sub</c></seealso>. The cost for a <c>put</c> with
diff --git a/erts/emulator/beam/big.c b/erts/emulator/beam/big.c
index 84338769e0..dac9574fa5 100644
--- a/erts/emulator/beam/big.c
+++ b/erts/emulator/beam/big.c
@@ -668,27 +668,25 @@ static dsize_t I_mul(ErtsDigit* x, dsize_t xl, ErtsDigit* y, dsize_t yl, ErtsDig
 
 static dsize_t I_sqr(ErtsDigit* x, dsize_t xl, ErtsDigit* r)
 {
-    ErtsDigit d_next = *x;
     ErtsDigit d;
     ErtsDigit* r0 = r;
     ErtsDigit* s = r;
 
     if ((r + xl) == x)	/* "Inline" operation */
 	*x = 0;
-    x++;
 	
     while(xl--) {
-	ErtsDigit* y = x;
+	ErtsDigit* y;
 	ErtsDigit y_0 = 0, y_1 = 0, y_2 = 0, y_3 = 0;
 	ErtsDigit b0, b1;
 	ErtsDigit z0, z1, z2;
 	ErtsDigit t;
 	dsize_t y_l = xl;
-		
+
+        d = *x;
+        x++;
+        y = x;
 	s = r;
-	d = d_next;
-	d_next = *x; 
-	x++;
 
 	DMUL(d, d, b1, b0);
 	DSUMc(*s, b0, y_3, t);
diff --git a/erts/emulator/test/big_SUITE.erl b/erts/emulator/test/big_SUITE.erl
index 0a42b09903..5b602dd4dc 100644
--- a/erts/emulator/test/big_SUITE.erl
+++ b/erts/emulator/test/big_SUITE.erl
@@ -168,7 +168,11 @@ eval({op,_,Op,A0,B0}, LFH) ->
     Res = eval_op(Op, A, B),
     erlang:garbage_collect(),
     Res;
-eval({integer,_,I}, _) -> I;
+eval({integer,_,I}, _) ->
+    %% "Parasitic" ("symbiotic"?) test of squaring all numbers
+    %% found in the test data.
+    test_squaring(I),
+    I;
 eval({call,_,{atom,_,Local},Args0}, LFH) ->
     Args = eval_list(Args0, LFH),
     LFH(Local, Args).
@@ -192,6 +196,18 @@ eval_op('bxor', A, B) -> A bxor B;
 eval_op('bsl', A, B) -> A bsl B;
 eval_op('bsr', A, B) -> A bsr B.
 
+test_squaring(I) ->
+    %% Multiplying an integer by itself is specially optimized, so we
+    %% should take special care to test squaring.  The optimization
+    %% will kick in when the two operands have the same address.
+    Sqr = I * I,
+
+    %% This expression will be multiplied in the usual way, because
+    %% the two operands for '*' are stored at different addresses.
+    Sqr = I * ((I + id(1)) - id(1)),
+
+    ok.
+
 %% Built in test functions
 
 fac(0) -> 1;