ngspice/src/frontend/trannoise/FastNorm3.c

/*	This is file FastNorm3.c   */
/*	SUPERCEDES FastNorm.c, FastNorm2.c.  Use with FastNorm3.h  */
/*		24  June  2003   */

/*	A package containing a very fast generator of pseudo-random
	Unit NORMAL variates, and some fairly high-quality UNIFORM
	generators. It also contains a straightforward implementation of
	a ChiSquared and Gamma generator copied from Ahrens and Dieter.
	*/

/*	Version 3 with double transformations and controllable extension
	to repeat the double transformations for higher quality at lower
	speed.
		Dated 17 May 20003.
		Copyright Christopher Stewart Wallace.
	*/
/*
%A C. S. Wallace
%T Fast Pseudo-Random Generators for Normal and Exponential Variates.
%J ACM Trans. Math. Software
%V 22
%N 1
%P 119-127
%M MAR
%D 1996
%O TR 94/197, May 1994, Dept. Computer Science, Monash University
%K CSW, CSWallace, Monash, pseudo random number generator, algorithm,
   jrnl, TOMS, numbers, normal, probability, distribution, PRNG, RNG, Gaussian,
   distribution, jrnl, ACM, TOMS, TR 94 197, TR197, c1996, c199x, c19xx
*/
/*	Use of this package requires the file "FastNorm3.h" which must be
	#include-ed in any C files using this package.

	The main purpose of this package is to provide a very fast source
of pseudo-random variates from the Unit Normal N(0,1) distribution, having
the density function

	f(x) = (1/sqrt(2*PI)) * exp (-0.5 * x^2)

Variates are obtained not by calling a function, but by use of a macro
"FastNorm" defined in FastNorm3.h. In a C program, this macro may appear
anywhere a (double) expression could appear, e.g in statements like
	z += FastNorm;
	if (FastNorm < 1.1) .....
	q = fabs (FastNorm);   etc.

The revision history, and a reference to the method description, is given
later in this file under the heading "Revision history Fastnorm".

	Major sections of this file, such as the revision history and the
major subroutines, are all headed by a line containing a row of minus signs (-)
and the name of the section or subroutine.

	The generators included are:
a Uniform source of integers, unsigned integers and doubles.
Chi-sq(N)  (based on Ahrens and Dieter)
Gamma(N)	(= 0.5 * Chi-sq(2N))
Normal	(a very fast routine)
	*/

/*	----------------- inclusions and some definitions ------------  */
#include <math.h>
#ifndef NOSPICE
#include "ngspice.h"
#endif
#include "FastNorm3.h"


/*	--------------- (Uniform) c7rand, irandm, urandm ----------  */
/*
c	A random number generator called as a function by
c	c7rand (iseed)	or	irandm (iseed)  or urandm (iseed)

c	The parameter should be a pointer to a 2-element Sw vector.
c	The first call gives a double uniform in 0 .. 1.
c	The second gives an Sw integer uniform in 0 .. 2**31-1
c	The third gives an Sw integer with 32 bits, so unif in
c	-2**31 .. 2**31-1 if used in 32-bit signed arithmetic.
c	All update iseed[] in exactly the same way.
c	iseed[] must be a 2-element Sw vector.
c	The initial value of iseed[1] may be any 32-bit integer.
c	The initial value of iseed[0] may be any 32-bit integer except -1.
c
c	The period of the random sequence is 2**32 * (2**32-1)
c	Its quality is quite good. It is based on the mixed multiplicative
c	congruential (Lehmer) generator
		x[n+1] = (69069 * x[n] + odd constant) MOD 2^32
c	but avoids most of the well-known defects of this type of generator
c	by, in effect, generating x[n+k] from x[n] as defined by the
c	sequence above, where k is chosen randomly in 1 ... 128 with the
c	help of a subsidiary Tauseworth-type generator.
c		For the positve integer generator irandm, the less
c	significant digits are more random than is usual for a Lehmer
c	generator. The last n<31 digits do not repeat with a period of 2^n.
c	This is also true of the unsigned integer generator urandm, but less
c	so.

c	This is an implementation in C of the algorithm described in
c	Technical Report "A Long-Period Pseudo-Random Generator"
c	TR89/123, Computer Science, Monash University,
c		Clayton, Vic 3168 AUSTRALIA
c			by
c
c		C.S.Wallace	csw@cs.monash.edu.au

c	The table mt[0:127] is defined by mt[i] = 69069 ** (128-i)
	*/

#define MASK ((Sw) 0x12DD4922)
/*	or in decimal, 316492066	*/
#define SCALE ((double) 1.0 / (1024.0 * 1024.0 * 1024.0 * 2.0))
/*	i.e. 2 to power -31	*/

static Sw mt [128] =	{
	902906369,
	2030498053,
	-473499623,
	1640834941,
	723406961,
	1993558325,
	-257162999,
	-1627724755,
	913952737,
	278845029,
	1327502073,
	-1261253155,
	981676113,
	-1785280363,
	1700077033,
	366908557,
	-1514479167,
	-682799163,
	141955545,
	-830150595,
	317871153,
	1542036469,
	-946413879,
	-1950779155,
	985397153,
	626515237,
	530871481,
	783087261,
	-1512358895,
	1031357269,
	-2007710807,
	-1652747955,
	-1867214463,
	928251525,
	1243003801,
	-2132510467,
	1874683889,
	-717013323,
	218254473,
	-1628774995,
	-2064896159,
	69678053,
	281568889,
	-2104168611,
	-165128239,
	1536495125,
	-39650967,
	546594317,
	-725987007,
	1392966981,
	1044706649,
	687331773,
	-2051306575,
	1544302965,
	-758494647,
	-1243934099,
	-75073759,
	293132965,
	-1935153095,
	118929437,
	807830417,
	-1416222507,
	-1550074071,
	-84903219,
	1355292929,
	-380482555,
	-1818444007,
	-204797315,
	170442609,
	-1636797387,
	868931593,
	-623503571,
	1711722209,
	381210981,
	-161547783,
	-272740131,
	-1450066095,
	2116588437,
	1100682473,
	358442893,
	-1529216831,
	2116152005,
	-776333095,
	1265240893,
	-482278607,
	1067190005,
	333444553,
	86502381,
	753481377,
	39000101,
	1779014585,
	219658653,
	-920253679,
	2029538901,
	1207761577,
	-1515772851,
	-236195711,
	442620293,
	423166617,
	-1763648515,
	-398436623,
	-1749358155,
	-538598519,
	-652439379,
	430550625,
	-1481396507,
	2093206905,
	-1934691747,
	-962631983,
	1454463253,
	-1877118871,
	-291917555,
	-1711673279,
	201201733,
	-474645415,
	-96764739,
	-1587365199,
	1945705589,
	1303896393,
	1744831853,
	381957665,
	2135332261,
	-55996615,
	-1190135011,
	1790562961,
	-1493191723,
	475559465,
	69069
		};

double c7rand (Sw *is)
{
	Sw it, leh;

	it = is [0];
	leh = is [1];
/*	Do a 7-place right cyclic shift of it  */
	it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25);
	if (!(it & 0x80000000)) it = it ^ MASK;
	leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF;
	is [0] = it;	is [1] = leh;
	if (leh & 0x80000000) leh = leh ^ 0xFFFFFFFF;
	return (SCALE * leh);
}


Sw irandm (Sw *is)
{
	Sw it, leh;

	it = is [0];
	leh = is [1];
/*	Do a 7-place right cyclic shift of it  */
	it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25);
	if (!(it & 0x80000000)) it = it ^ MASK;
	leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF;
	is [0] = it;	is [1] = leh;
	if (leh & 0x80000000) leh = leh ^ 0xFFFFFFFF;
	return (leh);
}


unsigned int urandm (Sw *is)
{
	Sw it, leh;

	it = is [0];
	leh = is [1];
/*	Do a 7-place right cyclic shift of it  */
	it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25);
	if (!(it & 0x80000000)) it = it ^ MASK;
	leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF;
	is [0] = it;	is [1] = leh;
	return (leh);
}


/*	---------------  (Chi-squared) adchi -----------------------  */
/*	Simple implementation of Ahrens and Dieter method for a chi-sq
random variate of order a >> 1.  Uses c7rand, maths library  */
/*	13 July 1998  */
/*	Slightly faster if 'a' is the same as on previous call  */
/*	This routine is no longer used in the fastnorm code, but is included
because it may be useful  */


static double gorder, gm, rt2gm, aold;

double adchi (double a, int *is)
{
	double x, y, z, sq;

	if (a != aold)	{
		aold = a;  gorder = 0.5 * a;
		gm = gorder - 1.0;
		rt2gm = sqrt (aold - 1.0);
		}

polar:
	x = 2.0 * c7rand(is) - 1.0;  z = c7rand(is);
	sq = x*x + z*z;
	if ((sq > 1.0) || (sq < 0.25)) goto polar;
	y = x / z;
	x = rt2gm * y + gm;
	if (x < 0.0) goto polar;

	z = (1.0 + y*y) * exp (gm * log(x/gm) - rt2gm * y);
	if (c7rand(is) > z) goto polar;

	return (2.0 * x);
}

/*	-------------------- (Gamma) rgamma (g, is) -----------	*/

double rgamma (double g, int *is)
{
	double x, y, z, sq;

	if (g != gorder)	{
		gorder = g;
		gm = gorder - 1.0;	aold = 2.0 * gorder;
		rt2gm = sqrt (aold - 1.0);
		}

polar:
	x = 2.0 * c7rand(is) - 1.0;  z = c7rand(is);
	sq = x*x + z*z;
	if ((sq > 1.0) || (sq < 0.25)) goto polar;
	y = x / z;
	x = rt2gm * y + gm;
	if (x < 0.0) goto polar;

	z = (1.0 + y*y) * exp (gm * log(x/gm) - rt2gm * y);
	if (c7rand(is) > z) goto polar;

	return (x);
}


/*	------------------  Revision history Fastnorm  -------------  */
/*	Items in this revision history appear in chronological order,
so the most recent revsion appears last.
	Revision items are separated by a line of '+' characters.

	++++++++++++++++++++++++++++++++++++++++++++++++++++++
	This is a revised version of the algorithm decribed in

	ACM Transactions on Mathematical Software, Vol 22, No 1
		March 1996, pp 119-127.

	A fast generator of pseudo-random variates from the unit Normal
distribution. It keeps a pool of about 1000 variates, and generates new
ones by picking 4 from the pool, rotating the 4-vector with these as its
components, and replacing the old variates with the components of the
rotated vector.

	The program should initialize the generator by calling initnorm(seed)
with seed a Sw integer seed value. Different seed values will give
different sequences of Normals. Seed may be any 32-bit integer.
	BUT SEE REVISION of 17 May 2003 for initnorm() parameters.
	The revised initnorm requires two integer parameters, iseed and
		quoll, the latter specifying a tradeoff between speed and
		quality.
	Then, wherever the program needs a new Normal variate, it should
use the macro FastNorm, e.g. in statements like:
	x = FastNorm;  (Sets x to a random Normal value)
or
	x += a + FastNorm * b;  (Adds a normal with mean a, SD b, to x)
	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	Changed basic formula, which was:
		t = (p+q+r+s)*0.5; p = p-t; q = t-q; r = t-r; s = t-s;
	This gives sum of new p+q+r+s = 2p(old) which may not be a great
choice. The new version is:
		t = (p+q+r+s)*0.5; p = p-t; q = q-t; r = t-r; s = t-s;
	which gives new p+q+r+s = p+q-r-s (old) which may be better.
	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	Revision 14 November 1998
	The older version "FastNorm" which was available via ftp was found
to have a defect which could affect some applications.

	Dr Christine Rueb, (Max Planck Institut fur Infomatik,
		Im Stadtwald W 66123 Saabrucken, F.G.R.,
			(rueb@mpi-sb.mpg.de)

found that if a large number N of consecutive variates were summed to give
a variate S with nominally N(0,N) distribution, the variance of S was in some
cases too small. The effect was noticed with N=400, and was particularly strong
for N=1023 if the first several (about 128) variates from FastNorm were
discarded. Dr. Rueb traced the effect to an unexpected tendency of FastNorm
to concentrate values with an anomolous correlation into the first 128
elements of the variate pool.
	With the help of her analysis, the algorithm has been revised in a
way which appears to overcome the problem, at the cost of about a 19%
reduction in speed (which still leaves the method very fast.)

	IT  MUST  BE  RECOGNISED  THAT  THIS  ALGORITHM  IS  NOVEL
AND  WHILE  IT  PASSES  A  NUMBER  OF  STANDARD  TESTS  FOR  DISTRIBUTIONAL
FORM,  LACK  OF  SERIAL  CORRELATION  ETC.,  IT  MAY  STILL  HAVE  DEFECTS.

RECALL  THE  NUMBER  OF  YEARS  WHICH  IT  TOOK  FOR  THE  LIMITATIONS  OF
THE  LEHMER  GENERATOR  FOR  UNIFORM  VARIATES  TO  BECOME  APPARENT !!!

UNTIL  MORE  EXPERIENCE  IS  GAINED  WITH  THIS  TYPE  OF  GENERATOR,  IT
WOULD  BE  WISE  IN  ANY  CRITICAL  APPLICATION  TO  COMPARE  RESULTS
OBTAINED  USING  IT  WITH  RESULTS  OBTAINED  USING  A  "STANDARD"  FORM
OF  GENERATOR  OF  NORMAL  VARIATES  COUPLED  WITH  A  WELL-DOCUMENTED
GENERATOR  OF  UNIFORM  VARIATES.
	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	Revision 1 April 2003.
	Trying a scanning process proposed by R.P.Brent. It needs 2 pool
vectors, as it cannot update in-situ, but may be more robust.
	It is a bit slower on a 133 Mhz PC but just as fast on a newer PC
(moggie) at about 16 ns per call in the 'speed.c' test.
	The extreme-value defect is the same on old and new versions.
If one finds a value 'A' such that a batch of B genuine Normal variates has
probability 0.2 of containing a variate with abolute value greater than A,
then the probability that both of two consecive batches of B will contain
such a value should be 0.2 times 0.2, or 0.04. Instead, both versions give
the extreme value prob. as 0.200 (over a million batches) but give the
consective-pair prob as 0.050 for batch size B = 1024.
	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

	Revision 17 May 2003.
	The fundamental defect of the method, namely an inadequate 'mixing'
of squared value ('energy') between one generation of the pool and the next,
cannot readily be removed. In going from one pool to the next, the energy
in an old variate is shared among just 4 variates in the new pool. Hence it
takes many generations before the energy of some original variate can be
distributed across the whole pool. The number of generations needed cannot
be less than the log to base 4 of the pool size, or 5 for a pool size of
1024. In fact, the pseudo-random indexing of the pool means that rather
more generations are needed on average.
	The defect is readily revealed by the following test. One picks a
"batch size" comparable to the pool size, say 500 or 1000. One then
computes a value A such that a batch will with probability 0.2 contain one
or more variates with absolute value exceeding A.
One then draws batches from FastNorm,
and tests each batch to see if it contains such an extreme value.
Over many batches, one counts the frequency of such 'extreme' batches,
and finds (with FastNorm2) that it is indeed about 0.2. However, when one counts
the frequency with which succesive batches are both extreme, one finds it to
be higher than the proper value (0.2)^2 = 0.04. For batch sizes round the pool
size, it can be as high as 0.05. That is, although the frequncy of extreme
values is about right, their occurrence in the stream is correlated over a
scale of the order of the pool size.
	The same correlation effect is seen in the average 4th moment of
successive batches.
	Since this inter-generational correlation cannot be avoided, the
this revision seeks to reduce it by performing at least two simple
rotations of the pool at each generation. Obviously, some speed is lost,
but the correlations are reduced.
	To allow the user to trade off speed and quality, the initialization
function initnorm() now provides a QUALITY parameter 'quoll' which controls
how many double-rotations are done for each generation.
	See the comments in initnorm() for more detail.
	++++++++++  End of revision notes  +++++++++  */


/*	-----------------  Some test results  ------------------------  */
/*
General form:
	Some simple tests were conducted by transforming FastNorm variates
in several ways to yield a variable nominally uniformly distributed in 0 ... 1.
Uniformity of the derived variate was then tested by a ChiSquared test on a
100-cell histogram with cell counts around 10000. These tests are crude, but
showed no untoward results on the present version.
	Transformations included:
	y = 0.5 * (1.0 + erf (n1 / sqrt(2))

	y = 0.5 * (n1 / (n1^2 + n2^2 + n3^2) - 1)

	y = exp (-0.5 * (n1^2 + n2^2))

	y = (n1^2 + n2^2) / (n1^2 + n2^2 + n3^2 + n4^2)

		where n1, n2 etc are successive Normal variates.
It may be noted that some of these are sensitive to serial correlation if
present.

Fourth moment of batches:
	Extensive tests for correlation among the fourth moments of successive
batches of variates were made, with batch sizes comparabe to or (worst case)
equal to the size of the variate pool (4096 in this revision).
	With 'quality' 1, significant correlation appears after 10^6 batches
of worst-case size.
	With quality 2, no significant correlation is evident after 10^7
batches. A just-significant correlation appears after 3.6*10^7 batches.
As this requires some 1.4*10^11 deviates to be drawn, it may be irrelevent
for many applications. The observed correlation coefficent was 0.0008.
	With quality 3, results are OK after 10^8 batches, or more than
4*10^11 variates.
	No tests have been done with quality 4 as yet.

Speed:
	Speed tests were done on a PC running RedHat Linux, using "-O"
compiler optimization. The test loop was
	for (i = 0; i < 500000000; i++)  {
		a += FastNorm;  a -= FastNorm;
		}
	Thus the test makes 10^9 uses of FastNorm. The time taken, (which
includes time for a call in 'initnorm' and the loop overhead) depends on
the 'quality' set by initnorm.
	Quality 1:	21.5 sec
	Quality 2:	32.1 sec
	Quality 3:	42.5 sec
	Quality 4:	53.1 sec

By way of comparison, the same 10^9 call loop was timed with the Unix library
"random()" routine substituted for FastNorm, and the variable 'a' defined as
integer rather than double.  Also, since most use of a Uniform generator such
as "random()" requires that the returned integer be scaled into a floating-
point number in 0 ... 1, the timing was repeated with
	"a += random" ('a' integer) replaced by "a += Scale*random()" where
'a' is double and Scale = 2^(-31). The times obtained were:
	Random (integer): 44.1 sec
	Random (double) : 47.7 sec

	It can be seen that FastNorm (even at quality 3) is faster than a
commonly-used Uniform generator. To some extent, this result may vary on
different computers and compilers. Since FastNorm (at least for qualities
above 1) no doubt does more arithmetic per variate than "random()", much of
its speed advantage must come from its replacement of a function call per
variate by a macro which makes only one function call every 4095 variates.
Computers with lower 'call' overheads than the PC used here might show
differnt results.
	Incidently, the Uniform generator 'c7rand()' included in this
package, which returns a double uniform in 0 ... 1, and is of fairly high
quality, gives a time in the same test of 36.8 sec, a little faster than
'random()'.
	*/


/*	-----------------  globals  -------------------------  */
/*	A pool must have a length which is a multiple of 4.
 *	During regeneration of a new pool, the pool is treated as 4
 *	consecutive vectors, each of length VL.
	*/

#define VE 10
#define VL (1 << VE)
#define VM (VL-1)
#define WL (4*VL)
#define WM (WL-1)

Sw gaussfaze;
Sf *gausssave;
Sf GScale;
/*	GScale,fastnorm,gaussfaze, -save must be visible to callers*/
static Sf chic1, chic2;   /* Constants used in getting ChiSq_WL */
Sw gslew;  /*  Counts generations  */
static Sw qual;    /*  Sets number of double transforms per generation.  */
static Sw c7g [2];	/*  seed values for c7rand  */

Sf wk1 [WL], wk2 [WL];   /*  Pools of variates.  */


/*	------------------  regen  ----------------------    */
/*	Takes variates from wk1[], transforms to wk[2], then back to wk1[].
	*/
static void regen (void)
{
	Sw i, j, k, m;
	Sf p, q, r, s, t;
	Sw topv[6], ord[4], *top;
	Sf *ppt[4], *ptn;

/*      Choose 4 random start points in the wk1[] vector
	I want them all different.  */

	top = topv + 1;
/*	Set limiting values in top[-1], top[4]  */
	top[-1] = VL;  top[4] = 0;
reran1:
	m = irandm (c7g);   /* positive 32-bit random */
/*	Extract two VE-sized randoms from m, which has 31 useable digits */
	m  = m >> (31 - 2*VE);
	top[0] = m & VM;  m = m >> VE;  top[1] = m & VM;
	m = irandm (c7g);   /* positive 32-bit random */
/*	Extract two VE-sized randoms from m, which has 31 useable digits */
	m  = m >> (31 - 2*VE);
	top[2] = m & VM;  m = m >> VE;  top[3] = m & VM;
	for (i = 0; i < 4; i++) ord[i] = i;
/*	Sort in decreasing size   */
	for (i = 2; i >= 0; i--)  {
		for (j = 0; j <= i; j++)	{
			if (top[j] < top[j+1])	{
				k = top[j];  top[j] = top[j+1];
				top[j+1] = k;
				k = ord[j];  ord[j] = ord[j+1];
				ord[j+1] = k;
				}
			}
		}
/*	Ensure all different  */
	for (i = 0; i < 3; i++) { if (top[i] == top[i+1]) goto reran1; }

/*	Set pt pointers to their start values for the first chunk.  */
	for (i = 0; i < 4; i++)	{
		j = ord[i];
		ppt[j] = wk2 + j * VL + top[i];
		}

/*	Set ptn to point into wk1  */
	ptn = wk1;

/*	Now ready to do five chunks. The length of chunk i is
		top[i-1] - top[i]  (I hope)
	At the end of chunk i, pointer ord[i] should have reached the end
	of its part, and need to be wrapped down to the start of its part.
		*/
	i = 0;

chunk:
	j = top[i] - top[i-1];   /* Minus the chunk length */
	for (;  j < 0;  j++)	{
		p = *ptn++;  s = *ptn++;  q = *ptn++;  r = *ptn++;
		t = (p + q + r + s) * 0.5;
		*ppt[0]++ = t - p;
		*ppt[1]++ = t - q;
		*ppt[2]++ = r - t;
		*ppt[3]++ = s - t;
		}
/*	This should end the chunk.  See if all done  */
	if (i == 4) goto passdone;

/*	The pointer for part ord[i] should have passed its end  */
	j = ord[i];
#ifdef dddd
printf ("Chunk %1d done. Ptr %1d now %4d\n", i, j, ppt[j]-wk2);
#endif
	ppt[j] -= VL;
	i++;
	goto chunk;

passdone:
/*	wk1[] values have been transformed and placed in wk2[]
	Transform from wk2 to wk1 with a simple shuffle  */
	m = (irandm (c7g) >> (29 - VE)) & WM;
	j = 0;
	for (i = 0; i < 4; i++) ppt[i] = wk1 + i * VL;
	for (i = 0; i < VL; i++)  {
		p = wk2[j^m];  j++;
		s = wk2[j^m];  j++;
		q = wk2[j^m];  j++;
		r = wk2[j^m];  j++;
		t = (p + q + r + s) * 0.5;
		*ppt[0]++ = t - p;
		*ppt[1]++ = q - t;
		*ppt[2]++ = t - r;
		*ppt[3]++ = s - t;
		}

/*	We have a new lot of variates in wk1  */
	return;
}


/*	-------------------  renormalize  ---------------------------   */
/*	Rescales wk1[] so sum of squares = WL  */
/*	Returns the original sum-of-squares  */
Sf renormalize (void)
{
	Sf ts, vv;
	Sw i;

	ts = 0.0;
	for (i = 0; i < WL; i++)	{
		ts += wk1[i] * wk1[i];
		}
	vv = sqrt (WL / ts);
	for (i = 0; i < WL; i++) wk1[i] *= vv;
	return (ts);
}


/*	------------------------  BoxMuller  ----------------------   */
/*	Fills block gvec of length ll with proper normals  */
static void boxmuller (Sf *gvec, Sw ll)
{
	Sw i;
	Sf tx, ty, tr, tz;

/*	Here, replace the whole pool with conventional Normal variates  */
	i = 0;
nextpair:
	tx = 2.0 * c7rand(c7g) - 1.0;  /* Uniform in -1..1 */
	ty = 2.0 * c7rand(c7g) - 1.0;  /* Uniform in -1..1 */
	tr = tx * tx + ty * ty;
	if ((tr > 1.0) || (tr < 0.25)) goto nextpair;
	tz = -2.0 * log (c7rand(c7g));	/* Sum of squares */
	tz = sqrt ( tz / tr );
	gvec [i++] = tx * tz;	gvec [i++] = ty * tz;
	if (i < ll) goto nextpair;
/*	Horrid, but good enough	*/
	return;
}


/*	-------------------------  initnorm  ----------------------  */
/*	To initialize, given a seed integer and a quality level.
	The seed can be any integer. The quality level quoll should be
	between 1 and 4. Quoll = 1 gives high speed, but leaves some
	correlation between the 4th moments of successive batches of values.
	Higher values of quoll give lower speed but less correlation.

	If called with quoll = 0, initnorm performs a check that the
	most crucial routine (regen) is performing correctly. In this
	case, the value of 'iseed' is ignored. Initnorm will report the
	results of the test, which compares pool values with check17 and
	check98, which are defined below.
	When a check call is made, a proper call on initnorm must then
	be made before using the FastNorm macro. A check call does not
	properly initialize the routines even if it succeeds.
		*/
static Sf check17 = 0.1255789;
static Sf check98 = -0.7113293;

void initnorm (Sw seed, Sw quoll)
{
	Sw i;

/*      At one stage, we need to generate a random variable Z such that
        (WL * Z*Z) has a Chi-squared-WL density. Now, a var with
        an approximate Chi-sq-K distn can be got as
                (A + B*n)**2 where n has unit Normal distn,
        A**2 = K * sqrt (1 - 1/K),  A**2 + B**2 = K.  (For large K)
                So we form Z as (1/sqrt(WL)) * (A + B*n)
        or   chic1 + chic2 * n   where
        chic1 = A / sqrt(WL), chic2 = B / sqrt(WL).
		Hence
        chic1 = sqrt (A*A / WL) = sqrt ( sqrt (1 - 1/WL)),
        chic2 = sqrt (1 - chic1*chic1)
        */

        chic1 = sqrt ( sqrt (1.0 - 1.0 / WL));
        chic2 = sqrt (1.0 - chic1 * chic1);

/*	Set regen counter "gslew" which will affect renormalizations.
	Since pools are OK already, we wont't need to renorm for a
	while  */
	gslew = 1;
/*	Finally, set "gaussfaze" to return all of wk1
 *	except the last entry at WL-1  */
	gaussfaze = WL-1;
	gausssave = wk1;

/*	If quoll = 0, do a check on installation   */
	if (quoll == 0) goto docheck;
	qual = quoll;
/*	Check sensible values for quoll, say 1 to 4  */
	if ((quoll < 0) || (quoll > 4))  {
		printf ("From initnorm(): quoll parameter %d out of\
 range 1 to 4\n", quoll);
		return;
		}
	c7g[0] = seed;  c7g[1] = -3337792;

/*	Fill wk1[] with good normals  */
	boxmuller (wk1,  WL);
/*	Scale so sum-of-squares = WL  */
	GScale = sqrt (renormalize () / WL);
/*	We have set
	GScale to restore the original ChiSq_WL sum-of-squares  */
	return;

docheck:
/*	Set a simple pattern in wk1[] and test results of regen  */
	for (i = 0; i < WL; i++) wk1[i] = wk2[i] = 0.0;
	wk1[0] = sqrt ((double) WL);
	c7g[0] = 1234567;  c7g[1] = 9876543;
	for (i = 0; i < 60; i++) regen();
/*	Check a couple of values  */
	if ((fabs (wk1[17] - check17) > 0.00001) ||
	    (fabs (wk1[98] - check98) > 0.00001))  {
		printf ("\nInitnorm check failed.\n");
		printf ("Expected %8.5f got %10.7f\n", check17, wk1[17]);
		printf ("Expected %8.5f got %10.7f\n", check98, wk1[98]);
		}
	else printf ("\nInitnorm check OK\n");
	return;
}


/*	----------------------  fastnorm  --------------------------  */
/*	If gslew shows time is ripe, renormalizes the pool
		fastnorm() returns the value GScale*gausssave[0].
		*/

Sf fastnorm (void)
{
	Sf sos;
	Sw n1;

	if (! (gslew & 0xFFFF))  {
		sos = renormalize ();
		}

/*	The last entry of gausssave, at WL-1, will not have been used.
	Use it to get an approx. to sqrt (ChiSq_WL / WL).
	See initnorm() code for details  */
	GScale = chic1 + chic2 * GScale * gausssave [WL-1];
	for (n1 = 0; n1 < qual; n1++) regen ();
	gslew++;

	gaussfaze = WL - 1;

	return (GScale * gausssave [0]);
}


/*	---------------------  (test) main  -------------------------  */
#ifdef Main
#include "FastNorm3.h"
int main()
{
	Sf x;  Sw i;
	initnorm (0, 0);
	initnorm (77, 2);
	printf ("SoS %20.6f\n", renormalize());
//	for (i = 0; i < 2000000; i++) x = FastNorm;
        for (i = 0; i < 200; i++) {
        	x = FastNorm;
        	printf("%d\t%f\n", i, x);
        }
	printf ("SoS %20.6f\n", renormalize());
	exit (1);
}
#endif