From 5e1ed023c60777969dd31ca8359eee3c9f1b8a05 Mon Sep 17 00:00:00 2001 From: h_vogt Date: Sat, 27 Nov 2010 16:36:03 +0000 Subject: [PATCH] transient noise simulation --- configure.ac | 2 + examples/transient-noise/noi-ring51-demo.cir | 59 + examples/transient-noise/noi-sc-tr.cir | 53 + examples/transient-noise/noilib-demo.h | 56 + examples/transient-noise/shot_ng.cir | 27 + src/Makefile.am | 5 + src/frontend/Makefile.am | 4 +- src/frontend/com_fft.c | 298 +- src/frontend/com_fft.h | 1 + src/frontend/commands.c | 12 +- src/frontend/trannoise/1-f-code.c | 62 + src/frontend/trannoise/1-f-code_d.c | 61 + src/frontend/trannoise/FastNorm3.c | 846 + src/frontend/trannoise/Makefile.am | 10 + src/frontend/trannoise/wallace.c | 532 + src/include/1-f-code.h | 7 + src/include/FastNorm3.h | 32 + src/include/bool.h | 4 +- src/include/fftext.h | 108 + src/include/ngspice.h | 13 +- src/include/wallace.h | 22 + src/main.c | 25 +- src/maths/Makefile.am | 4 +- src/maths/fft/Makefile.am | 13 + src/maths/fft/NOTE | 37 + src/maths/fft/Read Me | 70 + src/maths/fft/fftext.c | 156 + src/maths/fft/fftext.h | 106 + src/maths/fft/fftlib.c | 3175 +++ src/maths/fft/fftlib.h | 76 + src/maths/fft/matlib.c | 297 + src/maths/fft/matlib.h | 33 + src/maths/misc/randnumb.c | 24 + src/spicelib/devices/vsrc/vsrc.c | 3 +- src/spicelib/devices/vsrc/vsrcacct.c | 51 + src/spicelib/devices/vsrc/vsrcask.c | 1 + src/spicelib/devices/vsrc/vsrcdefs.h | 13 +- src/spicelib/devices/vsrc/vsrcload.c | 440 +- src/spicelib/devices/vsrc/vsrcpar.c | 7 + visualc/vngspice.vcproj | 17882 +++++++++-------- 40 files changed, 15572 insertions(+), 9055 deletions(-) create mode 100644 examples/transient-noise/noi-ring51-demo.cir create mode 100644 examples/transient-noise/noi-sc-tr.cir create mode 100644 examples/transient-noise/noilib-demo.h create mode 100644 examples/transient-noise/shot_ng.cir create mode 100644 src/frontend/trannoise/1-f-code.c create mode 100644 src/frontend/trannoise/1-f-code_d.c create mode 100644 
src/frontend/trannoise/FastNorm3.c create mode 100644 src/frontend/trannoise/Makefile.am create mode 100644 src/frontend/trannoise/wallace.c create mode 100644 src/include/1-f-code.h create mode 100644 src/include/FastNorm3.h create mode 100644 src/include/fftext.h create mode 100644 src/include/wallace.h create mode 100644 src/maths/fft/Makefile.am create mode 100644 src/maths/fft/NOTE create mode 100644 src/maths/fft/Read Me create mode 100644 src/maths/fft/fftext.c create mode 100644 src/maths/fft/fftext.h create mode 100644 src/maths/fft/fftlib.c create mode 100644 src/maths/fft/fftlib.h create mode 100644 src/maths/fft/matlib.c create mode 100644 src/maths/fft/matlib.h diff --git a/configure.ac b/configure.ac index 8f70b1ce7..e5b95cf94 100644 --- a/configure.ac +++ b/configure.ac @@ -1073,10 +1073,12 @@ AC_CONFIG_FILES([Makefile src/frontend/help/Makefile src/frontend/parser/Makefile src/frontend/plotting/Makefile + src/frontend/trannoise/Makefile src/frontend/wdisp/Makefile src/include/Makefile src/maths/Makefile src/maths/cmaths/Makefile + src/maths/fft/Makefile src/maths/misc/Makefile src/maths/ni/Makefile src/maths/deriv/Makefile diff --git a/examples/transient-noise/noi-ring51-demo.cir b/examples/transient-noise/noi-ring51-demo.cir new file mode 100644 index 000000000..0e4aa69c3 --- /dev/null +++ b/examples/transient-noise/noi-ring51-demo.cir @@ -0,0 +1,59 @@ +* 51 stage Ring-Osc. 
BSIM3, transient noise +* will need 45 min on a i7 860 with 4 threads + +* closes the loop between inverters xinv1 and xinv5 +vin in out dc 0.5 pulse 0.5 0 0.1n 5n 1 1 1 + +vdd dd 0 dc 0 pulse 0 2.2 0 1n 1 1 1 + +vss ss 0 dc 0 +ve sub 0 dc 0 + +vpe well 0 2.2 + +* noisy inverters +xiinv2 dd ss sub well out25 out50 inv253 +xiinv1 dd ss sub well in out25 inv253 + +*very noisy inverter +xiinv5 dd ss sub well out50 out inv1_2 +*output amplifier +xiinv11 dd ss sub well out25 bufout inv1 +cout bufout ss 0.2pF + +.option itl1=500 gmin=1e-15 itl4=10 noacct + +* .dc vdd 0 2 0.01 +.tran 0.01n 500n + +.save in bufout v(t1) + +.include D:\Spice_Win\Exam_BSIM3\Modelcards\modelcard.nmos +.include D:\Spice_Win\Exam_BSIM3\Modelcards\modelcard.pmos + +.include noilib-demo.h + +.control +unset ngdebug +* first run +save bufout $ needed for restricting memory usage +rusage +tran 8p 10000n +rusage +plot bufout xlimit 90n 95n +linearize +fft bufout +* next run +reset +save bufout +alter @v.xiinv5.vn1[trnoise] = [ 0 0 0 0 ] $ no noise +tran 8p 10000n +rusage +plot bufout xlimit 90n 95n +linearize +fft bufout +plot mag(bufout) mag(sp2.bufout) xlimit 0 2G ylimit 1e-11 0.1 ylog +.endc + + +.end diff --git a/examples/transient-noise/noi-sc-tr.cir b/examples/transient-noise/noi-sc-tr.cir new file mode 100644 index 000000000..19c738b7e --- /dev/null +++ b/examples/transient-noise/noi-sc-tr.cir @@ -0,0 +1,53 @@ +* simple sample & hold, transient noise + +* switch control +* PULSE(V1 V2 TD TR TF PW PER) +vgate1 ga1 0 dc 0 pulse (0 1 0 10n 10n 90n 200n) + +Switch1 1 2 ga1 0 smodel1 + +* noisy input +* rms value white, time step, exponent < 2, rms value 1/f +vin 1 0 dc 0 trnoise 0.1m 0.2n 1 0.1m +*vin 1 0 dc 0 trnoise 0.1m 0.2n 0 0.1m + +* output +c2 2 0 10p + +* second S&H +vgate2 ga2 0 dc 0 pulse (0 1 140n 10n 10n 30n 200n) +*Buffer EXXXXXXX N+ N- NC+ NC- VALUE +e1 4 0 2 0 1 +Switch2 4 3 ga2 0 smodel2 +c3 3 0 10p + +.option itl1=500 gmin=1e-15 itl4=10 acct + +.model smodel1 sw vt=0.5 ron=100 
+.model smodel2 sw vt=0.5 ron=100 + +.tran 0.4n 100u + + +.control +unset ngdebug +set filetype=ascii +rusage +run +rusage all +write noi_test.out v(1) +plot v(2) v(3) xlimit 4u 5u +plot v(ga1) v(ga2) xlimit 4u 5u +linearize +*rms v(1) +fft v(3) +plot mag(v(3)) loglog xlimit 1e4 1e8 ylimit 1e-10 1e-4 +setplot tran1 +linearize +psd 101 v(3) +plot mag(v(3)) xlimit 0 3e7 ylimit 0 10u + +.endc + + +.end diff --git a/examples/transient-noise/noilib-demo.h b/examples/transient-noise/noilib-demo.h new file mode 100644 index 000000000..84e119d37 --- /dev/null +++ b/examples/transient-noise/noilib-demo.h @@ -0,0 +1,56 @@ + +* standard inverter made noisy +*.subckt inv1 dd ss sub well in out +*vn1 out outi dc 0 noise 0.1 0.3n 1.0 0.1 +*mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u +*mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u +*.ends inv1 + +* standard inverter +.subckt inv1 dd ss sub well in out +mn1 out in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u +mp1 out in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u +.ends inv1 + +* very noisy inverter (noise on vdd and well) +.subckt inv1_1 dd ss sub well in out +vn1 dd idd dc 0 trnoise 0.05 0.05n 1 0.05 +vn2 well iwell dc 0 trnoise 0.05 0.05n 1 0.05 +mn1 out in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u +mp1 out in idd iwell p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u +*Cout out 0 0.1p +.ends inv1_1 + + +* another very noisy inverter +.subckt inv1_2 dd ss sub well in out +vn1 out outi dc 0 trnoise 0.05 8p 1.0 0.001 +mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u +mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u +*Cout out 0 0.1p +.ends inv1_2 + +* another very noisy inverter with current souces parallel to transistor +.subckt inv13 dd ss sub well in outi +in1 ss outi dc 0 noise 200u 0.05n 1.0 50u +mn1 outi in ss sub n1 w=2u l=0.25u AS=3p AD=3p PS=4u PD=4u +in2 dd outi dc 0 noise 200u 0.05n 1.0 50u +mp1 outi in dd well p1 w=4u l=0.25u AS=7p AD=7p PS=6u PD=6u +*Cout out 0 0.1p 
+.ends inv13 + +.subckt inv53 dd ss sub well in out +xinv1 dd ss sub well in 1 inv1 +xinv2 dd ss sub well 1 2 inv1 +xinv3 dd ss sub well 2 3 inv1 +xinv4 dd ss sub well 3 4 inv1 +xinv5 dd ss sub well 4 out inv1 +.ends inv53 + +.subckt inv253 dd ss sub well in out +xinv1 dd ss sub well in 1 inv53 +xinv2 dd ss sub well 1 2 inv53 +xinv3 dd ss sub well 2 3 inv53 +xinv4 dd ss sub well 3 4 inv53 +xinv5 dd ss sub well 4 out inv53 +.ends inv253 diff --git a/examples/transient-noise/shot_ng.cir b/examples/transient-noise/shot_ng.cir new file mode 100644 index 000000000..a8c2f5e14 --- /dev/null +++ b/examples/transient-noise/shot_ng.cir @@ -0,0 +1,27 @@ +* Shot noise test with B source, diode +* voltage on device (diode, forward) +Vdev out 0 DC 0 PULSE(0.4 0.45 10u) +* diode, forward direction, to be modeled with noise +D1 mess 0 DMOD +.model DMOD D IS=1e-14 N=1 +X1 0 mess out ishot +* device between 1 and 2 +* new output terminals of device including noise: 1 and 3 +.subckt ishot 1 2 3 +* white noise source with rms 1V +VNG 0 11 DC 0 TRNOISE(1 1n 0 0) +*measure the current i(v1) +V1 2 3 DC 0 +* calculate the shot noise +* sqrt(2*current*q*bandwidth) +BI 1 3 I=sqrt(2*abs(i(v1))*1.6e-19*1e7)*v(11) +.ends ishot +* 20000 sample points +.tran 1n 20u +.control +run +plot (-1)*i(vdev) +meas tran vdev_rms avg i(vdev) from=0u to=9.9u +meas tran vdev_rms avg i(vdev) from=10.1u to=20u +.endc +.end \ No newline at end of file diff --git a/src/Makefile.am b/src/Makefile.am index a3abdd1f4..299d56e7f 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -156,6 +156,7 @@ endif ngspice_LDADD += \ frontend/parser/libparser.la \ frontend/numparam/libnumparam.la \ + frontend/trannoise/libtrannoise.la \ spicelib/parser/libinp.la if CIDER_WANTED @@ -170,6 +171,7 @@ ngspice_LDADD += \ maths/deriv/libderiv.la \ maths/cmaths/libcmaths.la \ maths/misc/libmathmisc.la \ + maths/fft/libmathfft.la \ maths/poly/libpoly.la \ maths/ni/libni.la \ maths/sparse/libsparse.la \ @@ -208,8 +210,10 @@ 
ngnutmeg_LDADD += \ frontend/plotting/libplotting.la \ frontend/parser/libparser.la \ frontend/numparam/libnumparam.la \ + frontend/trannoise/libtrannoise.la \ maths/cmaths/libcmaths.la \ maths/misc/libmathmisc.la \ + maths/fft/libmathfft.la \ maths/poly/libpoly.la \ misc/libmisc.la \ spicelib/parser/libinp.la @@ -384,6 +388,7 @@ libspice_la_LIBADD += \ maths/deriv/libderiv.la \ maths/cmaths/libcmaths.la \ maths/misc/libmathmisc.la \ + maths/fft/libmathfft.la \ maths/poly/libpoly.la \ maths/ni/libni.la \ maths/sparse/libsparse.la \ diff --git a/src/frontend/Makefile.am b/src/frontend/Makefile.am index b8de0faa1..1eced1c55 100644 --- a/src/frontend/Makefile.am +++ b/src/frontend/Makefile.am @@ -1,8 +1,8 @@ ## Process this file with automake to produce Makefile.in ## $Id$ -SUBDIRS = plotting help parser wdisp numparam -DIST_SUBDIRS = plotting help parser wdisp numparam +SUBDIRS = plotting help parser wdisp numparam trannoise +DIST_SUBDIRS = plotting help parser wdisp numparam trannoise EXTRA_DIST = testcommands.c parse-bison.y ## For Windows with Visual Studio EXTRA_DIST += parse-bison.c parse-bison.h diff --git a/src/frontend/com_fft.c b/src/frontend/com_fft.c index 4708ec727..b34475924 100644 --- a/src/frontend/com_fft.c +++ b/src/frontend/com_fft.c @@ -16,10 +16,10 @@ Author: 2008 Holger Vogt #include "variable.h" #include "parse.h" #include "../misc/misc_time.h" +#include "../maths/fft/fftext.h" static void fftext(double*, double*, long int, long int, int); - void com_fft(wordlist *wl) { @@ -250,6 +250,302 @@ com_fft(wordlist *wl) tfree(win); } +void +com_psd(wordlist *wl) +{ + ngcomplex_t **fdvec; + double **tdvec; + double *freq, *win, *time, *ave; + double delta_t, span, noipower; + int ngood, mm; + unsigned long fpts, i, j, tlen, jj, smooth, hsmooth; + char *s; + struct dvec *f, *vlist, *lv, *vec; + struct pnode *names, *first_name; + + float *reald, *imagd; + int size, sign, isreal; + double scaling, sum; + int order; + double scale, sigma; + + if (!plot_cur 
|| !plot_cur->pl_scale) { + fprintf(cp_err, "Error: no vectors loaded.\n"); + return; + } + if (!isreal(plot_cur->pl_scale) || + ((plot_cur->pl_scale)->v_type != SV_TIME)) { + fprintf(cp_err, "Error: fft needs real time scale\n"); + return; + } + + tlen = (plot_cur->pl_scale)->v_length; + time = (plot_cur->pl_scale)->v_realdata; + span = time[tlen-1] - time[0]; + delta_t = span/(tlen - 1); + + // get filter length from parameter input + s = wl->wl_word; + if (!(ave = ft_numparse(&s, FALSE)) || (*ave < 1.0)) { + fprintf(cp_out, "Number of averaged data points: %d\n", 1); + smooth = 1; + } + else smooth = (int)(*ave); + wl = wl->wl_next; + + // size of input vector is power of two and larger than spice vector + size = 1; + mm = 0; + while (size < tlen) { + size <<= 1; + mm++; + } + + // output vector has length of size/2 + fpts = size>>1; + + // window function + win = TMALLOC(double, tlen); + { + char window[BSIZE_SP]; + double maxt = time[tlen-1]; + if (!cp_getvar("specwindow", CP_STRING, window)) + strcpy(window,"blackman"); + if (eq(window, "none")) + for(i=0; i span) { + win[i] = 0; + } else { + win[i] = 1; + } + } + else if (eq(window, "hanning") || eq(window, "cosine")) + for(i=0; i span) { + win[i] = 0; + } else { + win[i] = 1 - cos(2*M_PI*(time[i]-maxt)/span); + } + } + else if (eq(window, "hamming")) + for(i=0; i span) { + win[i] = 0; + } else { + win[i] = 1 - 0.92/1.08*cos(2*M_PI*(time[i]-maxt)/span); + } + } + else if (eq(window, "triangle") || eq(window, "bartlet")) + for(i=0; i span) { + win[i] = 0; + } else { + win[i] = 2 - fabs(2+4*(time[i]-maxt)/span); + } + } + else if (eq(window, "blackman")) { + int order; + if (!cp_getvar("specwindoworder", CP_NUM, &order)) order = 2; + if (order < 2) order = 2; /* only order 2 supported here */ + for(i=0; i span) { + win[i] = 0; + } else { + win[i] = 1; + win[i] -= 0.50/0.42*cos(2*M_PI*(time[i]-maxt)/span); + win[i] += 0.08/0.42*cos(4*M_PI*(time[i]-maxt)/span); + } + } + } else if (eq(window, "gaussian")) { + if 
(!cp_getvar("specwindoworder", CP_NUM, &order)) order = 2; + if (order < 2) order = 2; + sigma=1.0/order; + scale=0.83/sigma; + for(i=0; i span) { + win[i] = 0; + } else { + win[i] = scale*exp(-0.5*pow((time[i]-maxt/2)/(sigma*maxt/2),2)); + } + } +/* int order; + double scale; + extern double erfc(double); + if (!cp_getvar("specwindoworder", CP_NUM, &order)) order = 2; + if (order < 2) order = 2; + scale = pow(2*M_PI/order,0.5)*(0.5-erfc(pow(order,0.5))); + for(i=0; i span) { + win[i] = 0; + } else { + win[i] = exp(-0.5*order*(1-2*(maxt-time[i])/span) + *(1-2*(maxt-time[i])/span))/scale; + } + } +*/ + } else { + fprintf(cp_err, "Warning: unknown window type %s\n", window); + tfree(win); + return; + } + } + + names = ft_getpnames(wl, TRUE); + first_name = names; + vlist = NULL; + ngood = 0; + while (names) { + vec = ft_evaluate(names); + names = names->pn_next; + while (vec) { + if (vec->v_length != tlen) { + fprintf(cp_err, "Error: lengths of %s vectors don't match: %d, %d\n", + vec->v_name, vec->v_length, tlen); + vec = vec->v_link2; + continue; + } + if (!isreal(vec)) { + fprintf(cp_err, "Error: %s isn't real!\n", + vec->v_name); + vec = vec->v_link2; + continue; + } + if (vec->v_type == SV_TIME) { + vec = vec->v_link2; + continue; + } + if (!vlist) + vlist = vec; + else + lv->v_link2 = vec; + lv = vec; + vec = vec->v_link2; + ngood++; + } + } + free_pnode(first_name); + if (!ngood) { + return; + } + + plot_cur = plot_alloc("spectrum"); + plot_cur->pl_next = plot_list; + plot_list = plot_cur; + plot_cur->pl_title = copy((plot_cur->pl_next)->pl_title); + plot_cur->pl_name = copy("PSD"); + plot_cur->pl_date = copy(datestring( )); + + freq = (double *) tmalloc(fpts * sizeof(double)); + f = alloc(struct dvec); + ZERO(f, struct dvec); + f->v_name = copy("frequency"); + f->v_type = SV_FREQUENCY; + f->v_flags = (VF_REAL | VF_PERMANENT | VF_PRINT); + f->v_length = fpts; + f->v_realdata = freq; + vec_new(f); + + for (i = 0; iv_realdata; /* real input data */ + fdvec[i] = 
TMALLOC(ngcomplex_t, fpts); /* complex output data */ + f = alloc(struct dvec); + ZERO(f, struct dvec); + f->v_name = vec_basename(vec); + f->v_type = SV_NOTYPE; //vec->v_type; + f->v_flags = (VF_COMPLEX | VF_PERMANENT); + f->v_length = fpts; + f->v_compdata = fdvec[i]; + vec_new(f); + vec = vec->v_link2; + } + + printf("PSD: Time span: %g s, input length: %d, zero padding: %d\n", span, size, size-tlen); + printf("PSD: Freq. resolution: %g Hz, output length: %d\n", 1.0/span*tlen/size, fpts); + + sign = 1; + isreal = 1; + + reald = TMALLOC(float, size); + imagd = TMALLOC(float, size); + +// scale = 0.66; + + for (i = 0; i>1; + for (j=0; j +#include +#include +#include +#include // var. argumente +#include "1-f-code.h" +#include "ngspice.h" + +#include "fftext.h" +#include "wallace.h" + + +void f_alpha(int n_pts, int n_exp, float X[], float Q_d, +float alpha) +{ + int i; + float *hfa, *wfa; + float ha; + + ha = alpha/2.0f ; +// Q_d = sqrt(Q_d); /* find the deviation of the noise */ + hfa = TMALLOC(float,n_pts); + wfa = TMALLOC(float,n_pts); + hfa[0] = 1.0f; + wfa[0] = Q_d * (float)GaussWa; + /* generate the coefficients hk */ + for (i=1 ; i < n_pts; i++) { + /* generate the coefficients hk */ + hfa[i] = hfa[i-1] * (ha + (float)(i-1)) / ( (float)(i) ); + /* fill the sequence wk with white noise */ + wfa[i] = Q_d * (float)GaussWa; + } + +// for (i=0 ; i < n_pts; i++) +// printf("rnd %e, hk %e\n", wfa[i], hfa[i]); + + /* perform the discrete Fourier transform */ + fftInit(n_exp); + rffts(hfa, n_exp, 1); + rffts(wfa, n_exp, 1) ; + + /* multiply the two complex vectors */ + rspectprod(hfa, wfa, X, n_pts); + /* inverse transform */ + riffts(X, n_exp, 1); + + free(hfa) ; + free(wfa); + /* fft tables will be freed in vsrcaccept.c and isrcaccept.c + fftFree(); */ + fprintf(stdout,"%d (2e%d) one over f values created\n", n_pts, n_exp); +} + diff --git a/src/frontend/trannoise/1-f-code_d.c b/src/frontend/trannoise/1-f-code_d.c new file mode 100644 index 000000000..e9289ab94 --- 
/dev/null +++ b/src/frontend/trannoise/1-f-code_d.c @@ -0,0 +1,61 @@ +/* Copyright: Holger Vogt, 2008 + Discrete simulation of colored noise and stochastic + processes and 1/fa power law noise generation + Kasdin, N.J.; + Proceedings of the IEEE + Volume 83, Issue 5, May 1995 Page(s):802 - 827 +*/ + +#include +#include +#include +#include +#include // var. argumente +#include "1-f-code.h" +#include "ngspice.h" + +#include "fftext.h" +#include "wallace.h" + + +void f_alpha(int n_pts, int n_exp, double X[], double Q_d, +double alpha) +{ + unsigned int i; + double *hfa, *wfa; + double ha; + + ha = alpha/2.0f ; +// Q_d = sqrt(Q_d); /* find the deviation of the noise */ + hfa = TMALLOC(double,n_pts); + wfa = TMALLOC(double,n_pts); + hfa[0] = 1.0f; + wfa[0] = Q_d * GaussWa; + /* generate the coefficients hk */ + for (i=1 ; i < n_pts; i++) { + /* generate the coefficients hk */ + hfa[i] = hfa[i-1] * (ha + (double)(i-1)) / ( (double)(i) ); + /* fill the sequence wk with white noise */ + wfa[i] = Q_d * GaussWa; + } + +// for (i=0 ; i < n_pts; i++) +// printf("rnd %e, hk %e\n", wfa[i], hfa[i]); + + /* perform the discrete Fourier transform */ + fftInit(n_exp); + rffts(hfa, n_exp, 1); + rffts(wfa, n_exp, 1) ; + + /* multiply the two complex vectors */ + rspectprod(hfa, wfa, X, n_pts); + /* inverse transform */ + riffts(X, n_exp, 1); + + free(hfa) ; + free(wfa); + /* fft tables will be freed in vsrcaccept.c and isrcaccept.c + fftFree(); */ + fprintf(stdout,"%d (2e%d) one over f values created\n", n_pts, n_exp); +} + diff --git a/src/frontend/trannoise/FastNorm3.c b/src/frontend/trannoise/FastNorm3.c new file mode 100644 index 000000000..c5b1804a0 --- /dev/null +++ b/src/frontend/trannoise/FastNorm3.c @@ -0,0 +1,846 @@ +/* This is file FastNorm3.c */ +/* SUPERCEDES FastNorm.c, FastNorm2.c. Use with FastNorm3.h */ +/* 24 June 2003 */ + +/* A package containing a very fast generator of pseudo-random + Unit NORMAL variates, and some fairly high-quality UNIFORM + generators. 
It also contains a straightforward implementation of + a ChiSquared and Gamma generator copied from Ahrens and Dieter. + */ + +/* Version 3 with double transformations and controllable extension + to repeat the double transformations for higher quality at lower + speed. + Dated 17 May 20003. + Copyright Christopher Stewart Wallace. + */ +/* +%A C. S. Wallace +%T Fast Pseudo-Random Generators for Normal and Exponential Variates. +%J ACM Trans. Math. Software +%V 22 +%N 1 +%P 119-127 +%M MAR +%D 1996 +%O TR 94/197, May 1994, Dept. Computer Science, Monash University +%K CSW, CSWallace, Monash, pseudo random number generator, algorithm, + jrnl, TOMS, numbers, normal, probability, distribution, PRNG, RNG, Gaussian, + distribution, jrnl, ACM, TOMS, TR 94 197, TR197, c1996, c199x, c19xx +*/ +/* Use of this package requires the file "FastNorm3.h" which must be + #include-ed in any C files using this package. + + The main purpose of this package is to provide a very fast source +of pseudo-random variates from the Unit Normal N(0,1) distribution, having +the density function + + f(x) = (1/sqrt(2*PI)) * exp (-0.5 * x^2) + +Variates are obtained not by calling a function, but by use of a macro +"FastNorm" defined in FastNorm3.h. In a C program, this macro may appear +anywhere a (double) expression could appear, e.g in statements like + z += FastNorm; + if (FastNorm < 1.1) ..... + q = fabs (FastNorm); etc. + +The revision history, and a reference to the method description, is given +later in this file under the heading "Revision history Fastnorm". + + Major sections of this file, such as the revision history and the +major subroutines, are all headed by a line containing a row of minus signs (-) +and the name of the section or subroutine. + + The generators included are: +a Uniform source of integers, unsigned integers and doubles. 
+Chi-sq(N) (based on Ahrens and Dieter) +Gamma(N) (= 0.5 * Chi-sq(2N)) +Normal (a very fast routine) + */ + +/* ----------------- inclusions and some definitions ------------ */ +#include +#ifndef NOSPICE +#include "ngspice.h" +#endif +#include "FastNorm3.h" + + +/* --------------- (Uniform) c7rand, irandm, urandm ---------- */ +/* +c A random number generator called as a function by +c c7rand (iseed) or irandm (iseed) or urandm (iseed) + +c The parameter should be a pointer to a 2-element Sw vector. +c The first call gives a double uniform in 0 .. 1. +c The second gives an Sw integer uniform in 0 .. 2**31-1 +c The third gives an Sw integer with 32 bits, so unif in +c -2**31 .. 2**31-1 if used in 32-bit signed arithmetic. +c All update iseed[] in exactly the same way. +c iseed[] must be a 2-element Sw vector. +c The initial value of iseed[1] may be any 32-bit integer. +c The initial value of iseed[0] may be any 32-bit integer except -1. +c +c The period of the random sequence is 2**32 * (2**32-1) +c Its quality is quite good. It is based on the mixed multiplicative +c congruential (Lehmer) generator + x[n+1] = (69069 * x[n] + odd constant) MOD 2^32 +c but avoids most of the well-known defects of this type of generator +c by, in effect, generating x[n+k] from x[n] as defined by the +c sequence above, where k is chosen randomly in 1 ... 128 with the +c help of a subsidiary Tauseworth-type generator. +c For the positve integer generator irandm, the less +c significant digits are more random than is usual for a Lehmer +c generator. The last n<31 digits do not repeat with a period of 2^n. +c This is also true of the unsigned integer generator urandm, but less +c so. 
+ +c This is an implementation in C of the algorithm described in +c Technical Report "A Long-Period Pseudo-Random Generator" +c TR89/123, Computer Science, Monash University, +c Clayton, Vic 3168 AUSTRALIA +c by +c +c C.S.Wallace csw@cs.monash.edu.au + +c The table mt[0:127] is defined by mt[i] = 69069 ** (128-i) + */ + +#define MASK ((Sw) 0x12DD4922) +/* or in decimal, 316492066 */ +#define SCALE ((double) 1.0 / (1024.0 * 1024.0 * 1024.0 * 2.0)) +/* i.e. 2 to power -31 */ + +static Sw mt [128] = { + 902906369, + 2030498053, + -473499623, + 1640834941, + 723406961, + 1993558325, + -257162999, + -1627724755, + 913952737, + 278845029, + 1327502073, + -1261253155, + 981676113, + -1785280363, + 1700077033, + 366908557, + -1514479167, + -682799163, + 141955545, + -830150595, + 317871153, + 1542036469, + -946413879, + -1950779155, + 985397153, + 626515237, + 530871481, + 783087261, + -1512358895, + 1031357269, + -2007710807, + -1652747955, + -1867214463, + 928251525, + 1243003801, + -2132510467, + 1874683889, + -717013323, + 218254473, + -1628774995, + -2064896159, + 69678053, + 281568889, + -2104168611, + -165128239, + 1536495125, + -39650967, + 546594317, + -725987007, + 1392966981, + 1044706649, + 687331773, + -2051306575, + 1544302965, + -758494647, + -1243934099, + -75073759, + 293132965, + -1935153095, + 118929437, + 807830417, + -1416222507, + -1550074071, + -84903219, + 1355292929, + -380482555, + -1818444007, + -204797315, + 170442609, + -1636797387, + 868931593, + -623503571, + 1711722209, + 381210981, + -161547783, + -272740131, + -1450066095, + 2116588437, + 1100682473, + 358442893, + -1529216831, + 2116152005, + -776333095, + 1265240893, + -482278607, + 1067190005, + 333444553, + 86502381, + 753481377, + 39000101, + 1779014585, + 219658653, + -920253679, + 2029538901, + 1207761577, + -1515772851, + -236195711, + 442620293, + 423166617, + -1763648515, + -398436623, + -1749358155, + -538598519, + -652439379, + 430550625, + -1481396507, + 2093206905, + 
-1934691747, + -962631983, + 1454463253, + -1877118871, + -291917555, + -1711673279, + 201201733, + -474645415, + -96764739, + -1587365199, + 1945705589, + 1303896393, + 1744831853, + 381957665, + 2135332261, + -55996615, + -1190135011, + 1790562961, + -1493191723, + 475559465, + 69069 + }; + +double c7rand (Sw *is) +{ + Sw it, leh; + + it = is [0]; + leh = is [1]; +/* Do a 7-place right cyclic shift of it */ + it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); + if (!(it & 0x80000000)) it = it ^ MASK; + leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; + is [0] = it; is [1] = leh; + if (leh & 0x80000000) leh = leh ^ 0xFFFFFFFF; + return (SCALE * leh); +} + + + +Sw irandm (Sw *is) +{ + Sw it, leh; + + it = is [0]; + leh = is [1]; +/* Do a 7-place right cyclic shift of it */ + it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); + if (!(it & 0x80000000)) it = it ^ MASK; + leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; + is [0] = it; is [1] = leh; + if (leh & 0x80000000) leh = leh ^ 0xFFFFFFFF; + return (leh); +} + + +unsigned int urandm (Sw *is) +{ + Sw it, leh; + + it = is [0]; + leh = is [1]; +/* Do a 7-place right cyclic shift of it */ + it = ((it >> 7) & 0x01FFFFFF) + ((it & 0x7F) << 25); + if (!(it & 0x80000000)) it = it ^ MASK; + leh = (leh * mt[it & 127] + it) & 0xFFFFFFFF; + is [0] = it; is [1] = leh; + return (leh); +} + + +/* --------------- (Chi-squared) adchi ----------------------- */ +/* Simple implementation of Ahrens and Dieter method for a chi-sq +random variate of order a >> 1. 
Uses c7rand, maths library */ +/* 13 July 1998 */ +/* Slightly faster if 'a' is the same as on previous call */ +/* This routine is no longer used in the fastnorm code, but is included +because it may be useful */ + + +static double gorder, gm, rt2gm, aold; + +double adchi (double a, int *is) +{ + double x, y, z, sq; + + if (a != aold) { + aold = a; gorder = 0.5 * a; + gm = gorder - 1.0; + rt2gm = sqrt (aold - 1.0); + } + +polar: + x = 2.0 * c7rand(is) - 1.0; z = c7rand(is); + sq = x*x + z*z; + if ((sq > 1.0) || (sq < 0.25)) goto polar; + y = x / z; + x = rt2gm * y + gm; + if (x < 0.0) goto polar; + + z = (1.0 + y*y) * exp (gm * log(x/gm) - rt2gm * y); + if (c7rand(is) > z) goto polar; + + return (2.0 * x); +} + +/* -------------------- (Gamma) rgamma (g, is) ----------- */ + +double rgamma (double g, int *is) +{ + double x, y, z, sq; + + if (g != gorder) { + gorder = g; + gm = gorder - 1.0; aold = 2.0 * gorder; + rt2gm = sqrt (aold - 1.0); + } + +polar: + x = 2.0 * c7rand(is) - 1.0; z = c7rand(is); + sq = x*x + z*z; + if ((sq > 1.0) || (sq < 0.25)) goto polar; + y = x / z; + x = rt2gm * y + gm; + if (x < 0.0) goto polar; + + z = (1.0 + y*y) * exp (gm * log(x/gm) - rt2gm * y); + if (c7rand(is) > z) goto polar; + + return (x); +} + + +/* ------------------ Revision history Fastnorm ------------- */ +/* Items in this revision history appear in chronological order, +so the most recent revsion appears last. + Revision items are separated by a line of '+' characters. + + ++++++++++++++++++++++++++++++++++++++++++++++++++++++ + This is a revised version of the algorithm decribed in + + ACM Transactions on Mathematical Software, Vol 22, No 1 + March 1996, pp 119-127. + + A fast generator of pseudo-random variates from the unit Normal +distribution. It keeps a pool of about 1000 variates, and generates new +ones by picking 4 from the pool, rotating the 4-vector with these as its +components, and replacing the old variates with the components of the +rotated vector. 
+ + The program should initialize the generator by calling initnorm(seed) +with seed a Sw integer seed value. Different seed values will give +different sequences of Normals. Seed may be any 32-bit integer. + BUT SEE REVISION of 17 May 2003 for initnorm() parameters. + The revised initnorm requires two integer parameters, iseed and + quoll, the latter specifying a tradeoff between speed and + quality. + Then, wherever the program needs a new Normal variate, it should +use the macro FastNorm, e.g. in statements like: + x = FastNorm; (Sets x to a random Normal value) +or + x += a + FastNorm * b; (Adds a normal with mean a, SD b, to x) + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + Changed basic formula, which was: + t = (p+q+r+s)*0.5; p = p-t; q = t-q; r = t-r; s = t-s; + This gives sum of new p+q+r+s = 2p(old) which may not be a great +choice. The new version is: + t = (p+q+r+s)*0.5; p = p-t; q = q-t; r = t-r; s = t-s; + which gives new p+q+r+s = p+q-r-s (old) which may be better. + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + Revision 14 November 1998 + The older version "FastNorm" which was available via ftp was found +to have a defect which could affect some applications. + + Dr Christine Rueb, (Max Planck Institut fur Infomatik, + Im Stadtwald W 66123 Saabrucken, F.G.R., + (rueb@mpi-sb.mpg.de) + +found that if a large number N of consecutive variates were summed to give +a variate S with nominally N(0,N) distribution, the variance of S was in some +cases too small. The effect was noticed with N=400, and was particularly strong +for N=1023 if the first several (about 128) variates from FastNorm were +discarded. Dr. Rueb traced the effect to an unexpected tendency of FastNorm +to concentrate values with an anomolous correlation into the first 128 +elements of the variate pool. 
+ With the help of her analysis, the algorithm has been revised in a +way which appears to overcome the problem, at the cost of about a 19% +reduction in speed (which still leaves the method very fast.) + + IT MUST BE RECOGNISED THAT THIS ALGORITHM IS NOVEL +AND WHILE IT PASSES A NUMBER OF STANDARD TESTS FOR DISTRIBUTIONAL +FORM, LACK OF SERIAL CORRELATION ETC., IT MAY STILL HAVE DEFECTS. + +RECALL THE NUMBER OF YEARS WHICH IT TOOK FOR THE LIMITATIONS OF +THE LEHMER GENERATOR FOR UNIFORM VARIATES TO BECOME APPARENT !!! + +UNTIL MORE EXPERIENCE IS GAINED WITH THIS TYPE OF GENERATOR, IT +WOULD BE WISE IN ANY CRITICAL APPLICATION TO COMPARE RESULTS +OBTAINED USING IT WITH RESULTS OBTAINED USING A "STANDARD" FORM +OF GENERATOR OF NORMAL VARIATES COUPLED WITH A WELL-DOCUMENTED +GENERATOR OF UNIFORM VARIATES. + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + Revision 1 April 2003. + Trying a scanning process proposed by R.P.Brent. It needs 2 pool +vectors, as it cannot update in-situ, but may be more robust. + It is a bit slower on a 133 Mhz PC but just as fast on a newer PC +(moggie) at about 16 ns per call in the 'speed.c' test. + The extreme-value defect is the same on old and new versions. +If one finds a value 'A' such that a batch of B genuine Normal variates has +probability 0.2 of containing a variate with abolute value greater than A, +then the probability that both of two consecive batches of B will contain +such a value should be 0.2 times 0.2, or 0.04. Instead, both versions give +the extreme value prob. as 0.200 (over a million batches) but give the +consective-pair prob as 0.050 for batch size B = 1024. + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + + Revision 17 May 2003. + The fundamental defect of the method, namely an inadequate 'mixing' +of squared value ('energy') between one generation of the pool and the next, +cannot readily be removed. 
In going from one pool to the next, the energy +in an old variate is shared among just 4 variates in the new pool. Hence it +takes many generations before the energy of some original variate can be +distributed across the whole pool. The number of generations needed cannot +be less than the log to base 4 of the pool size, or 5 for a pool size of +1024. In fact, the pseudo-random indexing of the pool means that rather +more generations are needed on average. + The defect is readily revealed by the following test. One picks a +"batch size" comparable to the pool size, say 500 or 1000. One then +computes a value A such that a batch will with probability 0.2 contain one +or more variates with absolute value exceeding A. +One then draws batches from FastNorm, +and tests each batch to see if it contains such an extreme value. +Over many batches, one counts the frequency of such 'extreme' batches, +and finds (with FastNorm2) that it is indeed about 0.2. However, when one counts +the frequency with which successive batches are both extreme, one finds it to +be higher than the proper value (0.2)^2 = 0.04. For batch sizes round the pool +size, it can be as high as 0.05. That is, although the frequency of extreme +values is about right, their occurrence in the stream is correlated over a +scale of the order of the pool size. + The same correlation effect is seen in the average 4th moment of +successive batches. + Since this inter-generational correlation cannot be avoided, +this revision seeks to reduce it by performing at least two simple +rotations of the pool at each generation. Obviously, some speed is lost, +but the correlations are reduced. + To allow the user to trade off speed and quality, the initialization +function initnorm() now provides a QUALITY parameter 'quoll' which controls +how many double-rotations are done for each generation. + See the comments in initnorm() for more detail. 
+ ++++++++++ End of revision notes +++++++++ */ + + + +/* ----------------- Some test results ------------------------ */ +/* +General form: + Some simple tests were conducted by transforming FastNorm variates +in several ways to yield a variable nominally uniformly distributed in 0 ... 1. +Uniformity of the derived variate was then tested by a ChiSquared test on a +100-cell histogram with cell counts around 10000. These tests are crude, but +showed no untoward results on the present version. + Transformations included: + y = 0.5 * (1.0 + erf (n1 / sqrt(2))) + + y = 0.5 * (n1 / (n1^2 + n2^2 + n3^2) - 1) + + y = exp (-0.5 * (n1^2 + n2^2)) + + y = (n1^2 + n2^2) / (n1^2 + n2^2 + n3^2 + n4^2) + + where n1, n2 etc are successive Normal variates. +It may be noted that some of these are sensitive to serial correlation if +present. + +Fourth moment of batches: + Extensive tests for correlation among the fourth moments of successive +batches of variates were made, with batch sizes comparable to or (worst case) +equal to the size of the variate pool (4096 in this revision). + With 'quality' 1, significant correlation appears after 10^6 batches +of worst-case size. + With quality 2, no significant correlation is evident after 10^7 +batches. A just-significant correlation appears after 3.6*10^7 batches. +As this requires some 1.4*10^11 deviates to be drawn, it may be irrelevant +for many applications. The observed correlation coefficient was 0.0008. + With quality 3, results are OK after 10^8 batches, or more than +4*10^11 variates. + No tests have been done with quality 4 as yet. + +Speed: + Speed tests were done on a PC running RedHat Linux, using "-O" +compiler optimization. The test loop was + for (i = 0; i < 500000000; i++) { + a += FastNorm; a -= FastNorm; + } + Thus the test makes 10^9 uses of FastNorm. The time taken (which +includes time for a call in 'initnorm' and the loop overhead) depends on +the 'quality' set by initnorm. 
+ Quality 1: 21.5 sec + Quality 2: 32.1 sec + Quality 3: 42.5 sec + Quality 4: 53.1 sec + +By way of comparison, the same 10^9 call loop was timed with the Unix library +"random()" routine substituted for FastNorm, and the variable 'a' defined as +integer rather than double. Also, since most use of a Uniform generator such +as "random()" requires that the returned integer be scaled into a floating- +point number in 0 ... 1, the timing was repeated with + "a += random" ('a' integer) replaced by "a += Scale*random()" where +'a' is double and Scale = 2^(-31). The times obtained were: + Random (integer): 44.1 sec + Random (double) : 47.7 sec + + It can be seen that FastNorm (even at quality 3) is faster than a +commonly-used Uniform generator. To some extent, this result may vary on +different computers and compilers. Since FastNorm (at least for qualities +above 1) no doubt does more arithmetic per variate than "random()", much of +its speed advantage must come from its replacement of a function call per +variate by a macro which makes only one function call every 4095 variates. +Computers with lower 'call' overheads than the PC used here might show +different results. + Incidentally, the Uniform generator 'c7rand()' included in this +package, which returns a double uniform in 0 ... 1, and is of fairly high +quality, gives a time in the same test of 36.8 sec, a little faster than +'random()'. + */ + + +/* ----------------- globals ------------------------- */ +/* A pool must have a length which is a multiple of 4. + * During regeneration of a new pool, the pool is treated as 4 + * consecutive vectors, each of length VL. 
+ */ + +#define VE 10 +#define VL (1 << VE) +#define VM (VL-1) +#define WL (4*VL) +#define WM (WL-1) + +Sw gaussfaze; +Sf *gausssave; +Sf GScale; +/* GScale,fastnorm,gaussfaze, -save must be visible to callers*/ +static Sf chic1, chic2; /* Constants used in getting ChiSq_WL */ +Sw gslew; /* Counts generations */ +static Sw qual; /* Sets number of double transforms per generation. */ +static Sw c7g [2]; /* seed values for c7rand */ + +Sf wk1 [WL], wk2 [WL]; /* Pools of variates. */ + + +/* ------------------ regen ---------------------- */ +/* Takes variates from wk1[], transforms to wk[2], then back to wk1[]. + */ +void regen () +{ + Sw i, j, k, m; + Sf p, q, r, s, t; + Sw topv[6], ord[4], *top; + Sf *ppt[4], *ptn; + +/* Choose 4 random start points in the wk1[] vector + I want them all different. */ + + top = topv + 1; +/* Set limiting values in top[-1], top[4] */ + top[-1] = VL; top[4] = 0; +reran1: + m = irandm (c7g); /* positive 32-bit random */ +/* Extract two VE-sized randoms from m, which has 31 useable digits */ + m = m >> (31 - 2*VE); + top[0] = m & VM; m = m >> VE; top[1] = m & VM; + m = irandm (c7g); /* positive 32-bit random */ +/* Extract two VE-sized randoms from m, which has 31 useable digits */ + m = m >> (31 - 2*VE); + top[2] = m & VM; m = m >> VE; top[3] = m & VM; + for (i = 0; i < 4; i++) ord[i] = i; +/* Sort in decreasing size */ + for (i = 2; i >= 0; i--) { + for (j = 0; j <= i; j++) { + if (top[j] < top[j+1]) { + k = top[j]; top[j] = top[j+1]; + top[j+1] = k; + k = ord[j]; ord[j] = ord[j+1]; + ord[j+1] = k; + } + } + } +/* Ensure all different */ + for (i = 0; i < 3; i++) { if (top[i] == top[i+1]) goto reran1; } + +/* Set pt pointers to their start values for the first chunk. */ + for (i = 0; i < 4; i++) { + j = ord[i]; + ppt[j] = wk2 + j * VL + top[i]; + } + +/* Set ptn to point into wk1 */ + ptn = wk1; + +/* Now ready to do five chunks. 
The length of chunk i is + top[i-1] - top[i] (I hope) + At the end of chunk i, pointer ord[i] should have reached the end + of its part, and need to be wrapped down to the start of its part. + */ + i = 0; + +chunk: + j = top[i] - top[i-1]; /* Minus the chunk length */ + for (; j < 0; j++) { + p = *ptn++; s = *ptn++; q = *ptn++; r = *ptn++; + t = (p + q + r + s) * 0.5; + *ppt[0]++ = t - p; + *ppt[1]++ = t - q; + *ppt[2]++ = r - t; + *ppt[3]++ = s - t; + } +/* This should end the chunk. See if all done */ + if (i == 4) goto passdone; + +/* The pointer for part ord[i] should have passed its end */ + j = ord[i]; +#ifdef dddd +printf ("Chunk %1d done. Ptr %1d now %4d\n", i, j, ppt[j]-wk2); +#endif + ppt[j] -= VL; + i++; + goto chunk; + +passdone: +/* wk1[] values have been transformed and placed in wk2[] + Transform from wk2 to wk1 with a simple shuffle */ + m = (irandm (c7g) >> (29 - VE)) & WM; + j = 0; + for (i = 0; i < 4; i++) ppt[i] = wk1 + i * VL; + for (i = 0; i < VL; i++) { + p = wk2[j^m]; j++; + s = wk2[j^m]; j++; + q = wk2[j^m]; j++; + r = wk2[j^m]; j++; + t = (p + q + r + s) * 0.5; + *ppt[0]++ = t - p; + *ppt[1]++ = q - t; + *ppt[2]++ = t - r; + *ppt[3]++ = s - t; + } + +/* We have a new lot of variates in wk1 */ + return; +} + + +/* ------------------- renormalize --------------------------- */ +/* Rescales wk1[] so sum of squares = WL */ +/* Returns the original sum-of-squares */ +Sf renormalize (void) +{ + Sf ts, vv; + Sw i; + + ts = 0.0; + for (i = 0; i < WL; i++) { + ts += wk1[i] * wk1[i]; + } + vv = sqrt (WL / ts); + for (i = 0; i < WL; i++) wk1[i] *= vv; + return (ts); +} + + +/* ------------------------ BoxMuller ---------------------- */ +/* Fills block gvec of length ll with proper normals */ +void boxmuller (Sf *gvec, Sw ll) +{ + Sw i; + Sf tx, ty, tr, tz; + +/* Here, replace the whole pool with conventional Normal variates */ + i = 0; +nextpair: + tx = 2.0 * c7rand(c7g) - 1.0; /* Uniform in -1..1 */ + ty = 2.0 * c7rand(c7g) - 1.0; /* Uniform in 
-1..1 */ + tr = tx * tx + ty * ty; + if ((tr > 1.0) || (tr < 0.25)) goto nextpair; + tz = -2.0 * log (c7rand(c7g)); /* Sum of squares */ + tz = sqrt ( tz / tr ); + gvec [i++] = tx * tz; gvec [i++] = ty * tz; + if (i < ll) goto nextpair; +/* Horrid, but good enough */ + return; +} + + +/* ------------------------- initnorm ---------------------- */ +/* To initialize, given a seed integer and a quality level. + The seed can be any integer. The quality level quoll should be + between 1 and 4. Quoll = 1 gives high speed, but leaves some + correlation between the 4th moments of successive batches of values. + Higher values of quoll give lower speed but less correlation. + + If called with quoll = 0, initnorm performs a check that the + most crucial routine (regen) is performing correctly. In this + case, the value of 'iseed' is ignored. Initnorm will report the + results of the test, which compares pool values with check17 and + check98, which are defined below. + When a check call is made, a proper call on initnorm must then + be made before using the FastNorm macro. A check call does not + properly initialize the routines even if it succeeds. + */ +static Sf check17 = 0.1255789; +static Sf check98 = -0.7113293; + +void initnorm (Sw seed, Sw quoll) +{ + Sw i; + +/* At one stage, we need to generate a random variable Z such that + (WL * Z*Z) has a Chi-squared-WL density. Now, a var with + an approximate Chi-sq-K distn can be got as + (A + B*n)**2 where n has unit Normal distn, + A**2 = K * sqrt (1 - 1/K), A**2 + B**2 = K. (For large K) + So we form Z as (1/sqrt(WL)) * (A + B*n) + or chic1 + chic2 * n where + chic1 = A / sqrt(WL), chic2 = B / sqrt(WL). + Hence + chic1 = sqrt (A*A / WL) = sqrt ( sqrt (1 - 1/WL)), + chic2 = sqrt (1 - chic1*chic1) + */ + + chic1 = sqrt ( sqrt (1.0 - 1.0 / WL)); + chic2 = sqrt (1.0 - chic1 * chic1); + +/* Set regen counter "gslew" which will affect renormalizations. 
+ Since pools are OK already, we wont't need to renorm for a + while */ + gslew = 1; +/* Finally, set "gaussfaze" to return all of wk1 + * except the last entry at WL-1 */ + gaussfaze = WL-1; + gausssave = wk1; + +/* If quoll = 0, do a check on installation */ + if (quoll == 0) goto docheck; + qual = quoll; +/* Check sensible values for quoll, say 1 to 4 */ + if ((quoll < 0) || (quoll > 4)) { + printf ("From initnorm(): quoll parameter %d out of\ + range 1 to 4\n", quoll); + return; + } + c7g[0] = seed; c7g[1] = -3337792; + +/* Fill wk1[] with good normals */ + boxmuller (wk1, WL); +/* Scale so sum-of-squares = WL */ + GScale = sqrt (renormalize () / WL); +/* We have set + GScale to restore the original ChiSq_WL sum-of-squares */ + return; + +docheck: +/* Set a simple pattern in wk1[] and test results of regen */ + for (i = 0; i < WL; i++) wk1[i] = wk2[i] = 0.0; + wk1[0] = sqrt ((double) WL); + c7g[0] = 1234567; c7g[1] = 9876543; + for (i = 0; i < 60; i++) regen(); +/* Check a couple of values */ + if ((fabs (wk1[17] - check17) > 0.00001) || + (fabs (wk1[98] - check98) > 0.00001)) { + printf ("\nInitnorm check failed.\n"); + printf ("Expected %8.5f got %10.7f\n", check17, wk1[17]); + printf ("Expected %8.5f got %10.7f\n", check98, wk1[98]); + } + else printf ("\nInitnorm check OK\n"); + return; +} + + +/* ---------------------- fastnorm -------------------------- */ +/* If gslew shows time is ripe, renormalizes the pool + fastnorm() returns the value GScale*gausssave[0]. + */ + +Sf fastnorm () +{ + Sf sos; + Sw n1; + + if (! (gslew & 0xFFFF)) { + sos = renormalize (); + } + +/* The last entry of gausssave, at WL-1, will not have been used. + Use it to get an approx. to sqrt (ChiSq_WL / WL). 
+ See initnorm() code for details */ + GScale = chic1 + chic2 * GScale * gausssave [WL-1]; + for (n1 = 0; n1 < qual; n1++) regen (); + gslew++; + + gaussfaze = WL - 1; + + return (GScale * gausssave [0]); +} + + +/* --------------------- (test) main ------------------------- */ +#ifdef Main +#include "FastNorm3.h" +int main() +{ + Sf x; Sw i; + initnorm (0, 0); + initnorm (77, 2); + printf ("SoS %20.6f\n", renormalize()); +// for (i = 0; i < 2000000; i++) x = FastNorm; + for (i = 0; i < 200; i++) { + x = FastNorm; + printf("%d\t%f\n", i, x); + } + printf ("SoS %20.6f\n", renormalize()); + exit (1); +} +#endif diff --git a/src/frontend/trannoise/Makefile.am b/src/frontend/trannoise/Makefile.am new file mode 100644 index 000000000..6d200ef5b --- /dev/null +++ b/src/frontend/trannoise/Makefile.am @@ -0,0 +1,10 @@ +noinst_LTLIBRARIES = libtrannoise.la + +libtrannoise_la_SOURCES = \ + FastNorm3.c \ + 1-f-code.c \ + wallace.c + +AM_CPPFLAGS = -I$(top_srcdir)/src/include -I$(top_srcdir)/src/frontend + +MAINTAINERCLEANFILES = Makefile.in diff --git a/src/frontend/trannoise/wallace.c b/src/frontend/trannoise/wallace.c new file mode 100644 index 000000000..ea8337597 --- /dev/null +++ b/src/frontend/trannoise/wallace.c @@ -0,0 +1,532 @@ +/* Wallace generator for normally distributed random variates + Copyright: Holger Vogt, 2008 + +*/ + +//#define FASTNORM_ORIG + +#include +#include +#ifdef _MSC_VER +#include +#define getpid _getpid +#else +#include +#endif +#include +#include "wallace.h" +#include "FastNorm3.h" + +#ifdef HasMain +#include +#else +#ifndef NOSPICE +#include "ngspice.h" +#endif +#endif + +#define POOLSIZE 4096 +#define LPOOLSIZE 12 +#define NOTRANS 3 /* number of (dual) transformations */ + +#define VE 10 +#define VL (1 << VE) +#define VM (VL-1) +#define WL (4*VL) +#define WM (WL-1) + +double *outgauss; /* output vector for user access */ +unsigned int variate_used; /* actual index of variate called by user */ +double ScaleGauss; + +static double *pool1; 
+static double *pool2; +static unsigned int *addrif, *addrib; +static unsigned n = POOLSIZE; +static double chi1, chi2; /* chi^2 correction values */ +static unsigned int newpools; + +extern double drand(void); +extern unsigned int CombLCGTausInt(void); +extern void TausSeed(void); +extern unsigned int CombLCGTausInt2(void); + + +void PolarGauss(double* py1, double* py2) +{ + double x1, x2, w; + + do { + x1 = drand(); + x2 = drand(); + w = x1 * x1 + x2 * x2; + } while (( w > 1.0 ) || ( w < 0.25)); + + w = sqrt( (-2.0 * log( w ) ) / w ); + + *py1 = (double)(x1 * w); + *py2 = (double)(x2 * w); +} + + + + + +void initw(void) +{ + unsigned i; + double totsqr, nomsqr; + unsigned long int coa, cob, s; + + /* initialize the uniform generator */ + srand(getpid()); +// srand(17); + TausSeed(); + + ScaleGauss = 1.; + newpools = 1; + + /* set up the two pools */ + pool1 = TMALLOC(double, n); //(double*)malloc(n * sizeof(double)); + pool2 = TMALLOC(double, n); //(double*)malloc(n * sizeof(double)); + addrif = TMALLOC(unsigned int, (n + NOTRANS)); //(unsigned int*)malloc((n + NOTRANS) * sizeof(unsigned int)); + addrib = TMALLOC(unsigned int, (n + NOTRANS)); //(unsigned int*)malloc((n + NOTRANS) * sizeof(unsigned int)); + + /* fill the first pool with normally distributed values */ + PolarGauss(&pool1[0], &pool1[1]); + for (i = 1; i < n>>1; i++) { + PolarGauss(&pool1[i<<1], &pool1[(i<<1) + 1]); + } + /* normalize pool content */ +/* totsqr = totsum = 0.0; + for (i = 0; i < n; i++) { + totsqr += pool1[i] * pool1[i]; + totsum += pool1[i]; + } + totsum = totsum/n; + for (i = 0; i < n; i++) { + totsqr += (pool1[i] - totsum) * (pool1[i] - totsum); + } + nomsqr = sqrt(n / totsqr); + for (i = 0; i < n; i++) + pool1[i] = (pool1[i] - totsum) * nomsqr; +*/ + totsqr = 0.0; + for (i = 0; i < n; i++) + totsqr += pool1[i] * pool1[i]; + nomsqr = sqrt(n / totsqr); + for (i = 0; i < n; i++) + pool1[i] *= nomsqr; + + /* calculate ch^2 value */ + chi1 = sqrt ( sqrt (1.0 - 1.0/n)); + chi2 = sqrt ( 
1.0 - chi1*chi1); + + /* first scaling, based on unused pool1[n-2] */ + ScaleGauss = chi1 + chi2 * ScaleGauss * pool1[n-2]; + /* access to first pool */ + outgauss = pool1; + /* set data counter, we return n-2 values here */ + variate_used = n - 2; + + /* generate random reading addresses using a LCG */ + s = 0; + coa = 241; + cob = 59; + for (i=0; i < (n + NOTRANS); i++) { +// addrif[i] = s = (s * coa + cob) % ( n ); + coa = CombLCGTausInt(); + addrif[i] = coa >> (32 - LPOOLSIZE); +// printf ("Random add:\t%ld\n" , s); + } + s = 0; + coa = 193; + cob = 15; + for (i=0; i < (n + NOTRANS); i++) { +// addrib[i] = s = (s * coa + cob) % ( n ); + coa = CombLCGTausInt(); + addrib[i] = coa >> (32 - LPOOLSIZE); +// printf ("Random add:\t%ld\n" , addrib[i]); + } + +// printf("norm for orig. Gauss: %e, chi^2 scale: %e\n", nomsqr, ScaleGauss); +// NewWa(); +} + +/* original FastNorm3.c code */ +#ifdef FASTNORM_ORIG +float NewWa () +{ + int i, j, k, m; + float p, q, r, s, t; + int topv[6], ord[4], *top; + float *ppt[4], *ptn; + + float nulval, endval; + float totsqr, nomsqr; + nulval = ScaleGauss * pool1[0]; + endval = pool1[n-1]; + +/* Choose 4 random start points in the wk1[] vector + I want them all different. 
*/ + + top = topv + 1; +/* Set limiting values in top[-1], top[4] */ + top[-1] = VL; top[4] = 0; +reran1: + m = CombLCGTausInt(); /* positive 32-bit random */ +/* Extract two VE-sized randoms from m, which has 31 useable digits */ + m = m >> (31 - 2*VE); + top[0] = m & VM; m = m >> VE; top[1] = m & VM; + m = CombLCGTausInt(); /* positive 32-bit random */ +/* Extract two VE-sized randoms from m, which has 31 useable digits */ + m = m >> (31 - 2*VE); + top[2] = m & VM; m = m >> VE; top[3] = m & VM; + for (i = 0; i < 4; i++) ord[i] = i; +/* Sort in decreasing size */ + for (i = 2; i >= 0; i--) { + for (j = 0; j <= i; j++) { + if (top[j] < top[j+1]) { + k = top[j]; top[j] = top[j+1]; + top[j+1] = k; + k = ord[j]; ord[j] = ord[j+1]; + ord[j+1] = k; + } + } + } +/* Ensure all different */ + for (i = 0; i < 3; i++) { if (top[i] == top[i+1]) goto reran1; } + +/* Set pt pointers to their start values for the first chunk. */ + for (i = 0; i < 4; i++) { + j = ord[i]; + ppt[j] = pool2 + j * VL + top[i]; + } + +/* Set ptn to point into wk1 */ + ptn = pool1; + +/* Now ready to do five chunks. The length of chunk i is + top[i-1] - top[i] (I hope) + At the end of chunk i, pointer ord[i] should have reached the end + of its part, and need to be wrapped down to the start of its part. + */ + i = 0; + +chunk: + j = top[i] - top[i-1]; /* Minus the chunk length */ + for (; j < 0; j++) { + p = *ptn++; s = *ptn++; q = *ptn++; r = *ptn++; + t = (p + q + r + s) * 0.5; + *ppt[0]++ = t - p; + *ppt[1]++ = t - q; + *ppt[2]++ = r - t; + *ppt[3]++ = s - t; + } +/* This should end the chunk. See if all done */ + if (i == 4) goto passdone; + +/* The pointer for part ord[i] should have passed its end */ + j = ord[i]; +#ifdef dddd +printf ("Chunk %1d done. 
Ptr %1d now %4d\n", i, j, ppt[j]-pool2); +#endif + ppt[j] -= VL; + i++; + goto chunk; + +passdone: +/* wk1[] values have been transformed and placed in wk2[] + Transform from wk2 to wk1 with a simple shuffle */ + m = (CombLCGTausInt2() >> (29 - VE)) & WM; + j = 0; + for (i = 0; i < 4; i++) ppt[i] = pool1 + i * VL; + for (i = 0; i < VL; i++) { + p = pool2[j^m]; j++; + s = pool2[j^m]; j++; + q = pool2[j^m]; j++; + r = pool2[j^m]; j++; + t = (p + q + r + s) * 0.5; + *ppt[0]++ = t - p; + *ppt[1]++ = q - t; + *ppt[2]++ = t - r; + *ppt[3]++ = s - t; + } + + /* renormalize again if number of pools beyond limit */ + if (!(newpools & 0xFFFF)) { + totsqr = 0.0; + for (i = 0; i < n; i++) + totsqr += pool1[i] * pool1[i]; + nomsqr = sqrt(n / totsqr); + for (i = 0; i < n; i++) + pool1[i] *= nomsqr; + } + + outgauss = pool1; + /* reset data counter */ + variate_used = n - 1; + + /* set counter counting nomber of pools made */ + newpools++; + + /* new scale factor using ch^2 correction, + using pool1[n-1] from last pool */ + ScaleGauss = chi1 + chi2 * ScaleGauss * endval; + +// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); + + return nulval; /* use old scale */ + +} + +#else + +/* Simplified code according to an algorithm published by C. S. Wallace: + "Fast Pseudorandom Generators for Normal and Exponential Variates", + ACM Transactions on Mathmatical Software, Vol. 22, No. 1, March 1996, pp. 119-127. + Transform pool1 to pool2 and back to pool1 NOTRANS times + by orthogonal 4 x 4 Hadamard-Matrix. + Mixing of values is very important: Any value in the pool should contribute to + every value in the new pools, at least after several passes (number of passes + is set by NOTRANS to 2 or 3). + 4 values are read in a continuous sequence from the total of POOLSIZE values. + Values are stored in steps modulo POOLSIZE/4. + During backward transformation the values are shuffled by a random number jj. 
+*/ + +double NewWa(void) +{ + double nulval, endval; + double bl1, bl2, bl3, bl4; /* the four values to be transformed */ + double bsum; + double totsqr, nomsqr; + unsigned int i, j, jj, m, mm, mmm; + + nulval = ScaleGauss * pool1[0]; + endval = pool1[n-1]; + m = n >> 2; +// printf("New pool after next value\n"); + + /* generate new pool by transformation + Transformation is repeated NOTRANS times */ + for (i=0; i < NOTRANS; i++) { + mm = m << 1; + mmm = mm + m; + /* forward transformation */ +// for (j=0; j < n; j += 4) { + for (j=0; j < m; j++) { + bl1 = pool1[j]; + bl2 = pool1[j+m]; + bl3 = pool1[j+mm]; + bl4 = pool1[j+mmm]; + /* Hadamard-Matrix */ + bsum = (bl1 + bl2 + bl3 + bl4) * 0.5f; + jj = j<<2; + pool2[jj] = bl1 - bsum; + pool2[jj+1] = bl2 - bsum; + pool2[jj+2] = bsum - bl3; + pool2[jj+3] = bsum - bl4; + } + /* backward transformation */ + jj = (CombLCGTausInt2() >> (31 - LPOOLSIZE)) & (n - 1); + for (j=0; j < m; j++) { + bl1 = pool2[j^jj]; + bl2 = pool2[(j+m)^jj]; + bl3 = pool2[(j+mm)^jj]; + bl4 = pool2[(j+mmm)^jj]; + /* Hadamard-Matrix */ + bsum = (bl1 + bl2 + bl3 + bl4) * 0.5f; + jj = j<<2; + pool1[jj] = bl1 - bsum; + pool1[jj+1] = bl2 - bsum; + pool1[jj+2] = bsum - bl3; + pool1[jj+3] = bsum - bl4; + } + } + + /* renormalize again if number of pools beyond limit */ + if (!(newpools & 0xFFFF)) { + totsqr = 0.0; + for (i = 0; i < n; i++) + totsqr += pool1[i] * pool1[i]; + nomsqr = sqrt(n / totsqr); + for (i = 0; i < n; i++) + pool1[i] *= nomsqr; + } + + outgauss = pool1; + /* reset data counter */ + variate_used = n - 1; + + /* set counter counting nomber of pools made */ + newpools++; + + /* new scale factor using ch^2 correction, + using pool1[n-1] from previous pool */ + ScaleGauss = chi1 + chi2 * ScaleGauss * endval; + +// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); + + return nulval; /* use old scale */ +// return pool1[0]; /* use new scale */ +} + +#endif + + +#ifdef FASTNORMTEST +float NewWa_not(void) +{ + float nulval, 
endval; + float bl1, bl2, bl3, bl4; /* the four values to be transformed */ + float bsum; + float totsqr, nomsqr; + unsigned int i, j, jj; + nulval = ScaleGauss * pool1[0]; + endval = pool1[n-1]; + +// printf("New pool after next value\n"); + + /* generate new pool by transformation + Transformation is repeated NOTRANS times */ + for (i=0; i < NOTRANS; i++) { + + /* forward transformation */ + for (j=0; j < n; j += 4) { + jj = j + i; + bl1 = pool1[addrif[jj]]; + bl2 = pool1[addrif[jj+1]]; + bl3 = pool1[addrif[jj+2]]; + bl4 = pool1[addrif[jj+3]]; +/* s = (s*coa + cob) & (n - 1); + bl1 = pool1[s]; + s = (s*coa + cob) & (n - 1); + bl2 = pool1[s + 1]; + s = (s*coa + cob) & (n - 1); + bl3 = pool1[s + 2]; + s = (s*coa + cob) & (n - 1); + bl4 = pool1[s + 3]; */ +/* jj = j + i; + bl1 = pool1[addrif[jj]]; + bl2 = pool1[addrif[jj+1]]; + bl3 = pool1[addrif[jj+2]]; + bl4 = pool1[addrif[jj+3]]; */ +/* bl1 = pool1[j]; + bl2 = pool1[j+1]; + bl3 = pool1[j+2]; + bl4 = pool1[j+3]; */ + /* Hadamard-Matrix */ + bsum = (bl1 + bl2 + bl3 + bl4) * 0.5; +/* pool2[j] = bl1 - bsum; + pool2[j+1] = bl2 - bsum; + pool2[j+2] = bsum - bl3; + pool2[j+3] = bsum - bl4; */ + pool2[addrib[jj]] = bl1 - bsum; + pool2[addrib[jj+1]] = bl2 - bsum; + pool2[addrib[jj+2]] = bsum - bl3; + pool2[addrib[jj+3]] = bsum - bl4; + } + /* backward transformation */ + for (j=0; j < n; j += 4) { + bl1 = pool2[j]; + bl2 = pool2[j+1]; + bl3 = pool2[j+2]; + bl4 = pool2[j+3]; +/* bl1 = pool2[addrib[j]]; + bl2 = pool2[addrib[j+1]]; + bl3 = pool2[addrib[j+2]]; + bl4 = pool2[addrib[j+3]]; */ + /* Hadamard-Matrix */ + bsum = (bl1 + bl2 + bl3 + bl4) * 0.5; + pool1[j] = bl1 - bsum; + pool1[j+1] = bl2 - bsum; + pool1[j+2] = bsum - bl3; + pool1[j+3] = bsum - bl4; + } + } + + + /* renormalize again if number of pools beyond limit */ + if (!(newpools & 0xFFFF)) { + totsqr = 0.0; + for (i = 0; i < n; i++) + totsqr += pool1[i] * pool1[i]; + nomsqr = sqrt(n / totsqr); + for (i = 0; i < n; i++) + pool1[i] *= nomsqr; + } + + outgauss = 
pool1; + /* reset data counter */ + variate_used = n - 1; + + /* set counter counting nomber of pools made */ + newpools++; + + /* new scale factor using ch^2 correction, + using pool1[n-1] from last pool */ + ScaleGauss = chi1 + chi2 * ScaleGauss * endval; + +// printf("Pool number: %d, chi^2 scale: %e\n", newpools, ScaleGauss); + + return nulval; /* use old scale */ +// return pool1[0]; /* use new scale */ +} +#endif + +/* --------------------- (test) main ------------------------- */ +/* gcc -Wall -g -DHasMain -I../../include wallace.c CombTaus.o -o watest.exe */ +#ifdef HasMain +#include "wallace.h" + +struct timeb timenow; +struct timeb timebegin; +int sec, msec; + +void timediff(struct timeb *now, struct timeb *begin, int *sec, int *msec) +{ + + *msec = now->millitm - begin->millitm; + *sec = now->time - begin->time; + if (*msec < 0) { + *msec += 1000; + (*sec)--; + } + return; + +} + + +int main() +{ + float x; + unsigned int i; + long int count; + + initw(); + ftime(&timebegin); + count = 100000000; + for (i = 0; i < count; i++) { + x = GaussWa; +// printf("%d\t%f\n", i, x); + } + ftime(&timenow); + timediff(&timenow, &timebegin, &sec, &msec); + printf("WallaceHV: %ld normal variates: %f s\n", count, sec + (float) msec / 1000.0); + + initnorm(0, 0); + initnorm(77, 3); + ftime(&timebegin); + count = 100000000; + for (i = 0; i < count; i++) { + x = FastNorm; +// printf("%d\t%f\n", i, x); + } + ftime(&timenow); + timediff(&timenow, &timebegin, &sec, &msec); + printf("FastNorm3: %ld normal variates: %f s\n", count, sec + (float) msec / 1000.0); + + return (1); +} +#endif diff --git a/src/include/1-f-code.h b/src/include/1-f-code.h new file mode 100644 index 000000000..c7de952d5 --- /dev/null +++ b/src/include/1-f-code.h @@ -0,0 +1,7 @@ + + + +void f_alpha(int n_pts, int n_exp, float X[], float Q_d, +float alpha); + +void rvfft(float X[], unsigned long int n); diff --git a/src/include/FastNorm3.h b/src/include/FastNorm3.h new file mode 100644 index 
000000000..bd6bf8e82 --- /dev/null +++ b/src/include/FastNorm3.h @@ -0,0 +1,32 @@ +/* Last revised 28-1-1999 */ +/* This is the header file FastNorm3.h to be included in code files + using FastNorm3.c */ +/* I M P O R T A N T ! ! ! ! ! + + The definition below should be altered to ensure that integer +arithmetic is done on 32-bit words. It may need to be changed from int to +long on some platforms. The 32-bit requirement arises from the use of +a Uniform pseudo-random generator in part of the code, which assumes 32-bit +twos-complement arithmetic. In dire need, replace this generator with +another more suitable for the platform. The rest of the code assumes only +that signed integers up to a bit less than 2^31 can be handled. + */ + +#define Sw int /* MUST define Sw as a 32-bit integer or longer */ +#define Sf double + +extern int gaussfaze; +extern int gaussmask; +extern double *gausssave; +extern double GScale; + +#define FastNorm ((--gaussfaze)?GScale*gausssave[gaussfaze]:fastnorm()) + +void initnorm(Sw seed, Sw quoll); +Sf fastnorm (void); +Sf c7rand(Sw*); +Sw irandm(Sw*); +unsigned Sw urandm(Sw*); +double adchi (double a, int *is); +double rgamma (double g, int *is); +Sf renormalize(void); diff --git a/src/include/bool.h b/src/include/bool.h index a3869a7d9..ad09130e8 100644 --- a/src/include/bool.h +++ b/src/include/bool.h @@ -1,7 +1,9 @@ #ifndef _BOOL_H #define _BOOL_H -typedef unsigned char bool; +//typedef unsigned char bool; +typedef int bool; + typedef int BOOL ; #define BOOLEAN int diff --git a/src/include/fftext.h b/src/include/fftext.h new file mode 100644 index 000000000..25dfd7386 --- /dev/null +++ b/src/include/fftext.h @@ -0,0 +1,108 @@ +/******************************************************************* + This file extends the fftlib with calls to maintain the cosine and bit reversed tables + for you (including mallocs and free's). Call the init routine for each fft size you will + be using. 
Then you can call the fft routines below which will make the fftlib library + call with the appropriate tables passed. When you are done with all fft's you can call + fftfree to release the storage for the tables. Note that you can call fftinit repeatedly + with the same size, the extra calls will be ignored. So, you could make a macro to call + fftInit every time you call ffts. For example you could have someting like: + #define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); +*******************************************************************/ + +int fftInit(long M); +// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft +/* INPUTS */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* OUTPUTS */ +/* private cosine and bit reversed tables */ + +void fftFree(); +// release storage for all private cosine and bit reversed tables + +void ffts(float *data, long M, long Rows); +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void iffts(float *data, long M, long Rows); +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void rffts(float *data, long M, long Rows); +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* See rspectprod for multiplying two of these spectra together- ex. 
for fast convolution */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ + +void riffts(float *data, long M, long Rows); +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = real output data array */ + +void rspectprod(float *data1, float *data2, float *outdata, long N); +// When multiplying a pair of spectra from rfft care must be taken to multiply the +// two real values seperately from the complex ones. This routine does it correctly. +// the result can be stored in-place over one of the inputs +/* INPUTS */ +/* *data1 = input data array first spectra */ +/* *data2 = input data array second spectra */ +/* N = fft input size for both data1 and data2 */ +/* OUTPUTS */ +/* *outdata = output data array spectra */ + + +/* The following is FYI + + +Note that most of the fft routines require full matrices, ie Rsiz==Ncols +This is how I like to define a real matrix: +struct matrix { // real matrix + float *d; // pointer to data + long Nrows; // number of rows in the matrix + long Ncols; // number of columns in the matrix (can be less than Rsiz) + long Rsiz; // number of floats from one row to the next +}; +typedef struct matrix matrix; + + + + CACHEFILLMALLOC and CEILCACHELINE can be used instead of malloc to make + arrays that start exactly on a cache line start. 
+ First we CACHEFILLMALLOC a void * (use this void * when free'ing), + then we set our array pointer equal to the properly cast CEILCACHELINE of this void * + example: + aInit = CACHEFILLMALLOC( NUMFLOATS*sizeof(float) ); + a = (float *) CEILCACHELINE(ainit); + ... main body of code ... + free(aInit); + + To disable this alignment, set CACHELINESIZE to 1 +#define CACHELINESIZE 32 // Bytes per cache line +#define CACHELINEFILL (CACHELINESIZE-1) +#define CEILCACHELINE(p) ((((unsigned long)p+CACHELINEFILL)/CACHELINESIZE)*CACHELINESIZE) +#define CACHEFILLMALLOC(n) malloc((n)+CACHELINEFILL) +*/ + diff --git a/src/include/ngspice.h b/src/include/ngspice.h index 2c770280f..7c13ef56c 100644 --- a/src/include/ngspice.h +++ b/src/include/ngspice.h @@ -174,15 +174,10 @@ #define inline _inline #endif -/* -#ifndef HAVE_RANDOM -#define srandom(a) srand(a) -#define random rand -#define RR_MAX RAND_MAX -#else -#define RR_MAX LONG_MAX -#endif -*/ + +/* Fast random number generator */ +//#define FastRand +#define WaGauss #define RR_MAX RAND_MAX #ifdef HAVE_INDEX diff --git a/src/include/wallace.h b/src/include/wallace.h new file mode 100644 index 000000000..2c81bcfc7 --- /dev/null +++ b/src/include/wallace.h @@ -0,0 +1,22 @@ +/* Wallace generator for normally distributed random variates + Copyright Holger Vogt, 2008 + + Calling sequence: + initw(void); initialize using srand(seed) + double x = GaussWa; returns normally distributed random variate + +*/ + + + +extern double *outgauss; /*output vector for user access */ +extern unsigned int variate_used; /* actual index of variate called by user */ +extern double ScaleGauss; /* scale factor, including chi square correction */ + +double NewWa(void); /* generate new pool, return outgauss[0] */ + +#define GaussWa ((--variate_used)?(outgauss[variate_used]*ScaleGauss):NewWa()) + +void initw(void); /* initialization of Wallace generator */ + +void PolarGauss(double* py1, double* py2); diff --git a/src/main.c b/src/main.c index 
744c2a772..a096cc877 100644 --- a/src/main.c +++ b/src/main.c @@ -208,6 +208,8 @@ extern int OUTbeginDomain(void *,IFuid,int,IFvalue *); extern int OUTendDomain(void *), OUTstopnow(void), OUTerror(int,char *,IFuid *); extern int OUTattributes(void *,IFuid,int,IFvalue *); +extern void initw(void); + IFfrontEnd nutmeginfo = { IFnewUid, IFdelUid, @@ -757,8 +759,9 @@ xmain(int argc, char **argv) main(int argc, char **argv) #endif /* HAS_WINDOWS */ { - int c; - int err; + int c, err; + unsigned int rseed; + time_t acttime; bool gotone = FALSE; char* copystring; bool addctrlsect = TRUE; /* PN: for autorun */ @@ -1106,6 +1109,24 @@ bot: err = 0; #ifdef SIMULATOR +#ifdef FastRand +// initialization and seed for FastNorm Gaussian random generator + initnorm (0, 0); + rseed = 66; + if (!cp_getvar("rndseed", CP_NUM, (char *) &rseed)) { + acttime = time(NULL); + rseed = (int)acttime; + } + initnorm (rseed, 2); + fprintf (cp_out, "SoS %f, seed value: %ld\n", renormalize(), rseed); +#elif defined (WaGauss) + if (!cp_getvar("rndseed", CP_NUM, (char *) &rseed)) { + acttime = time(NULL); + rseed = (int)acttime; + } + srand(rseed); + initw(); +#endif if (!ft_servermode && !ft_nutmeg) { /* Concatenate all non-option arguments into a temporary file and load that file into the spice core. 
diff --git a/src/maths/Makefile.am b/src/maths/Makefile.am index 86b4017fe..9bc84a2cf 100644 --- a/src/maths/Makefile.am +++ b/src/maths/Makefile.am @@ -1,6 +1,6 @@ ## Process this file with automake -SUBDIRS = cmaths ni sparse poly deriv misc -DIST_SUBDIRS = cmaths ni sparse poly deriv misc +SUBDIRS = cmaths ni sparse poly deriv misc fft +DIST_SUBDIRS = cmaths ni sparse poly deriv misc fft MAINTAINERCLEANFILES = Makefile.in diff --git a/src/maths/fft/Makefile.am b/src/maths/fft/Makefile.am new file mode 100644 index 000000000..5f362c403 --- /dev/null +++ b/src/maths/fft/Makefile.am @@ -0,0 +1,13 @@ +## Process this file with automake to produce Makefile.in + +noinst_LTLIBRARIES = libmathfft.la + +libmathfft_la_SOURCES = \ + fftext.c \ + fftlib.c \ + matlib.c + + + +AM_CPPFLAGS = -I$(top_srcdir)/src/include +MAINTAINERCLEANFILES = Makefile.in diff --git a/src/maths/fft/NOTE b/src/maths/fft/NOTE new file mode 100644 index 000000000..45f3aa846 --- /dev/null +++ b/src/maths/fft/NOTE @@ -0,0 +1,37 @@ +Subject: FFT for RISC 2.0 +To: macgifts@sumex-aim.stanford.edu +Enclosure: FFTs-for-RISC-2.sit + +Enclosed is a stuffit archive of version 2.0 of my 'C' source code fft library. + + Very-Fast Fourier Transform routines. Routines are provided for real and complex +forward and inverse 1d and 2d fourier transforms and 3d complex forward and inverse ffts. +I coded these to optimize execution speed on Risc processors like the PowerPC. +All fft sizes must still be a power of two. +Test programs based on the Numerical Recipes in C routines are provided. +Also included are some simple applications with source code which time the FFTs. +See the enclosed read me file for more information. + +Revision version 2.0: + Rewrote code to rely more on compiler optimization (and be less ugly.) + Removed restrictions on too small or too large ffts. + Provided a library extension that manages memory for cosine and bit +reversed counter tables. + Added 2d and 3d complex and 2d real ffts. 
+ Speeded routines for data too large to fit in primary cache. + Changed most testing from Matlab to Numerical Recipes based (because its cheaper.) + Changed call parameters (watch out.) +Revision version 1.21: + line 126 of rfftTest.c corrected. +Revisions version 1.2: + I now store the Nyquest point of the real transform where the 0 for the DC term's +imaginary part used to be. !! WATCH OUT FOR THIS IF YOU USE rfft !! + Added the real inverse Fourier transform. + +Revisions version 1.1: + Re-arranged to put fft routines in a shared library and changed source file name to fftlib.c. + Removed some ugly optimizations that are no longer needed for CodeWarrier. + +This code is public domain, do anything you want to with it. + +[Moderators- This file should replace ffts-for-risc-121-c.hqx and can be included on any CD] diff --git a/src/maths/fft/Read Me b/src/maths/fft/Read Me new file mode 100644 index 000000000..d3d81922c --- /dev/null +++ b/src/maths/fft/Read Me @@ -0,0 +1,70 @@ +This directory contains a public domain FFT library which was optimized +for speed on RISC processors such as the PowerPC. All ffts +use single precision floats, for double precision just use a +global search and replace to change float to double in all +source files. +Codewarrier Pro 1.0 project files are also supplied. + +** Warning ** Perform rigorous testing to +your own standards before using this code. + + (John Green) green_jt@vsdec.npt.nuwc.navy.mil + +files: + fftTiming +Application to time complex ffts + + rfftTiming +Application to time real ffts + +// Directory: fft libraries + +files: + + fftext.c +Library of in-place fast fourier transforms. Contains forward +and inverse complex and real transforms. The real fft's expect the +frequency domain data to have the real part of the fsamp/2 bin (which +has a 0 imaginary part) to be stored in the location for the imaginary +part of the DC bin (the DC bin of real data is also strictly real.) 
+You must first call an initialization routine fftInit before calling +the fft computation routines ffts, iffts, rffts and riffts. +The init routines malloc the memory to store the cosine and +bit reversed counter tables as well as initializing their values. + + fftlib.c +Lower level library of in-place fast fourier transforms. Same as fftext.c but you +need to manage the mallocs for the cosine and bit reversed tables yourself. + + + fft2d.c +Library of 2d and 3d complex and 2d real in-place fast fourier transforms. +The init routine fft2dInit must be called before using the 2d routines and +fft3dInit must be called before using the 3d routines. These init routines +will also call the appropriate 1d init routines in fftext.c + + matlib.c +Matrix transpose routines used by fft2d.c and complex vector multiply +for forming the product of two spectra. + + dxpose.c +Double precision matrix transpose for quick single precision complex transposing + +// Directory: timing code +This directory contains the source to fftTiming and rfftTiming + +// Directory: Numerical Recipes testing +This directory contains files used to test the various fft routines using +the Numerical Recipes in C routines as a baseline. These routines can be purchased +in PeeCee (after expanding you can move them to a Mac) format from: +http://cfata2.harvard.edu/numerical-recipes/ +Unfortunately Numerical Recipes defines its forward and inverse fft's backwards. +For complex fft's I just use their inverse fft as a forward one, but for real ffts +their forward fft followed by my inverse fft reverses the data. They also have ugly matrix +and tensor data types and start their indices with one, Fortran style, but these are +minor annoyances. + +// Directory: Matlab testing +This directory contains files to test fast 1d and 2d convolution with Matlab used to +verify the results. An example of using Matlab to test the fft library routines is +also given for the 2d real fft. 
diff --git a/src/maths/fft/fftext.c b/src/maths/fft/fftext.c new file mode 100644 index 000000000..b0e612d97 --- /dev/null +++ b/src/maths/fft/fftext.c @@ -0,0 +1,156 @@ +/******************************************************************* + This file extends the fftlib with calls to maintain the cosine and bit reversed tables + for you (including mallocs and free's). Call the init routine for each fft size you will + be using. Then you can call the fft routines below which will make the fftlib library + call with the appropriate tables passed. When you are done with all fft's you can call + fftfree to release the storage for the tables. Note that you can call fftinit repeatedly + with the same size, the extra calls will be ignored. So, you could make a macro to call + fftInit every time you call ffts. For example you could have someting like: + #define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); +*******************************************************************/ +#include +#include "fftlib.h" +#include "matlib.h" +#include "fftext.h" + +// pointers to storage of Utbl's and BRLow's +static float *UtblArray[8*sizeof(long)] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; +static short *BRLowArray[8*sizeof(long)/2] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; + +int fftInit(long M){ +// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft +/* INPUTS */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* OUTPUTS */ +/* private cosine and bit reversed tables */ + +int theError = 1; +/*** I did NOT test cases with M>27 ***/ +if ((M >= 0) && (M < 8*sizeof(long))){ + theError = 0; + if (UtblArray[M] == 0){ // have we not inited this size fft yet? 
+ // init cos table + UtblArray[M] = (float *) malloc( (POW2(M)/4+1)*sizeof(float) ); + if (UtblArray[M] == 0) + theError = 2; + else{ + fftCosInit(M, UtblArray[M]); + } + if (M > 1){ + if (BRLowArray[M/2] == 0){ // init bit reversed table for cmplx fft + BRLowArray[M/2] = (short *) malloc( POW2(M/2-1)*sizeof(short) ); + if (BRLowArray[M/2] == 0) + theError = 2; + else{ + fftBRInit(M, BRLowArray[M/2]); + } + } + } + if (M > 2){ + if (BRLowArray[(M-1)/2] == 0){ // init bit reversed table for real fft + BRLowArray[(M-1)/2] = (short *) malloc( POW2((M-1)/2-1)*sizeof(short) ); + if (BRLowArray[(M-1)/2] == 0) + theError = 2; + else{ + fftBRInit(M-1, BRLowArray[(M-1)/2]); + } + } + } + } +}; +return theError; +} + +void fftFree(void){ +// release storage for all private cosine and bit reversed tables +long i1; +for (i1=8*sizeof(long)/2-1; i1>=0; i1--){ + if (BRLowArray[i1] != 0){ + free(BRLowArray[i1]); + BRLowArray[i1] = 0; + }; +}; +for (i1=8*sizeof(long)-1; i1>=0; i1--){ + if (UtblArray[i1] != 0){ + free(UtblArray[i1]); + UtblArray[i1] = 0; + }; +}; +} + +/************************************************* + The following calls are easier than calling to fftlib directly. + Just make sure fftlib has been called for each M first. 
+**************************************************/ + +void ffts(float *data, long M, long Rows){ +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + ffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]); +} + +void iffts(float *data, long M, long Rows){ +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + iffts1(data, M, Rows, UtblArray[M], BRLowArray[M/2]); +} + +void rffts(float *data, long M, long Rows){ +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* See rspectprod for multiplying two of these spectra together- ex. for fast convolution */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). 
*/ + rffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]); +} + +void riffts(float *data, long M, long Rows){ +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = real output data array */ + riffts1(data, M, Rows, UtblArray[M], BRLowArray[(M-1)/2]); +} + +void rspectprod(float *data1, float *data2, float *outdata, long N){ +// When multiplying a pair of spectra from rfft care must be taken to multiply the +// two real values seperately from the complex ones. This routine does it correctly. +// the result can be stored in-place over one of the inputs +/* INPUTS */ +/* *data1 = input data array first spectra */ +/* *data2 = input data array second spectra */ +/* N = fft input size for both data1 and data2 */ +/* OUTPUTS */ +/* *outdata = output data array spectra */ +if(N>1){ + outdata[0] = data1[0] * data2[0]; // multiply the zero freq values + outdata[1] = data1[1] * data2[1]; // multiply the nyquest freq values + cvprod(data1 + 2, data2 + 2, outdata + 2, N/2-1); // multiply the other positive freq values +} +else{ + outdata[0] = data1[0] * data2[0]; +} +} diff --git a/src/maths/fft/fftext.h b/src/maths/fft/fftext.h new file mode 100644 index 000000000..1db007464 --- /dev/null +++ b/src/maths/fft/fftext.h @@ -0,0 +1,106 @@ +/******************************************************************* + This file extends the fftlib with calls to maintain the cosine and bit reversed tables + for you (including mallocs and free's). Call the init routine for each fft size you will + be using. 
Then you can call the fft routines below which will make the fftlib library + call with the appropriate tables passed. When you are done with all fft's you can call + fftfree to release the storage for the tables. Note that you can call fftinit repeatedly + with the same size, the extra calls will be ignored. So, you could make a macro to call + fftInit every time you call ffts. For example you could have someting like: + #define FFT(a,n) if(!fftInit(roundtol(LOG2(n)))) ffts(a,roundtol(LOG2(n)),1);else printf("fft error\n"); +*******************************************************************/ + +int fftInit(long M); +// malloc and init cosine and bit reversed tables for a given size fft, ifft, rfft, rifft +/* INPUTS */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* OUTPUTS */ +/* private cosine and bit reversed tables */ + +void fftFree(void); +// release storage for all private cosine and bit reversed tables + +void ffts(float *data, long M, long Rows); +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void iffts(float *data, long M, long Rows); +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void rffts(float *data, long M, long Rows); +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* See rspectprod for multiplying two of these spectra together- ex. 
for fast convolution */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ + +void riffts(float *data, long M, long Rows); +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ +/* Rows = number of rows in ioptr array (use 1 for Rows for a single fft) */ +/* OUTPUTS */ +/* *ioptr = real output data array */ + +void rspectprod(float *data1, float *data2, float *outdata, long N); +// When multiplying a pair of spectra from rfft care must be taken to multiply the +// two real values seperately from the complex ones. This routine does it correctly. +// the result can be stored in-place over one of the inputs +/* INPUTS */ +/* *data1 = input data array first spectra */ +/* *data2 = input data array second spectra */ +/* N = fft input size for both data1 and data2 */ +/* OUTPUTS */ +/* *outdata = output data array spectra */ + + +// The following is FYI + + +//Note that most of the fft routines require full matrices, ie Rsiz==Ncols +//This is how I like to define a real matrix: +//struct matrix { // real matrix +// float *d; // pointer to data +// long Nrows; // number of rows in the matrix +// long Ncols; // number of columns in the matrix (can be less than Rsiz) +// long Rsiz; // number of floats from one row to the next +//}; +//typedef struct matrix matrix; + + + +// CACHEFILLMALLOC and CEILCACHELINE can be used instead of malloc to make +// arrays that start exactly on a cache line start. 
+// First we CACHEFILLMALLOC a void * (use this void * when free'ing), +// then we set our array pointer equal to the properly cast CEILCACHELINE of this void * +// example: +// aInit = CACHEFILLMALLOC( NUMFLOATS*sizeof(float) ); +// a = (float *) CEILCACHELINE(ainit); +// ... main body of code ... +// free(aInit); +// +// To disable this alignment, set CACHELINESIZE to 1 +//#define CACHELINESIZE 32 // Bytes per cache line +//#define CACHELINEFILL (CACHELINESIZE-1) +//#define CEILCACHELINE(p) ((((unsigned long)p+CACHELINEFILL)/CACHELINESIZE)*CACHELINESIZE) +//#define CACHEFILLMALLOC(n) malloc((n)+CACHELINEFILL) diff --git a/src/maths/fft/fftlib.c b/src/maths/fft/fftlib.c new file mode 100644 index 000000000..d605f2114 --- /dev/null +++ b/src/maths/fft/fftlib.c @@ -0,0 +1,3175 @@ +/******************************************************************* +lower level fft stuff including routines called in fftext.c and fft2d.c +*******************************************************************/ +#include "fftlib.h" +#include +#define MCACHE (11-(sizeof(float)/8)) // fft's with M bigger than this bust primary cache + +// some math constants to 40 decimal places +#define MYPI 3.141592653589793238462643383279502884197 // pi +#define MYROOT2 1.414213562373095048801688724209698078569 // sqrt(2) +#define MYCOSPID8 0.9238795325112867561281831893967882868224 // cos(pi/8) +#define MYSINPID8 0.3826834323650897717284599840303988667613 // sin(pi/8) + +#ifdef _MSC_VER +#define inline __inline +#endif + + +/************************************************* +routines to initialize tables used by fft routines +**************************************************/ + +void fftCosInit(long M, float *Utbl){ +/* Compute Utbl, the cosine table for ffts */ +/* of size (pow(2,M)/4 +1) */ +/* INPUTS */ +/* M = log2 of fft size */ +/* OUTPUTS */ +/* *Utbl = cosine table */ +unsigned long fftN = POW2(M); +unsigned long i1; + Utbl[0] = 1.0; + for (i1 = 1; i1 < fftN/4; i1++) + Utbl[i1] = cos( (2.0 * 
MYPI * i1) / fftN ); + Utbl[fftN/4] = 0.0; +} + +void fftBRInit(long M, short *BRLow){ +/* Compute BRLow, the bit reversed table for ffts */ +/* of size pow(2,M/2 -1) */ +/* INPUTS */ +/* M = log2 of fft size */ +/* OUTPUTS */ +/* *BRLow = bit reversed counter table */ +long Mroot_1 = M / 2 - 1; +long Nroot_1 = POW2(Mroot_1); +long i1; +long bitsum; +long bitmask; +long bit; +for (i1 = 0; i1 < Nroot_1; i1++){ + bitsum = 0; + bitmask = 1; + for (bit=1; bit <= Mroot_1; bitmask<<=1, bit++) + if (i1 & bitmask) + bitsum = bitsum + (Nroot_1 >> bit); + BRLow[i1] = bitsum; +}; +} + +/************************************************ +parts of ffts1 +*************************************************/ + +inline void bitrevR2(float *ioptr, long M, short *BRLow); +inline void bitrevR2(float *ioptr, long M, short *BRLow){ +/*** bit reverse and first radix 2 stage of forward or inverse fft ***/ +float f0r; +float f0i; +float f1r; +float f1i; +float f2r; +float f2i; +float f3r; +float f3i; +float f4r; +float f4i; +float f5r; +float f5i; +float f6r; +float f6i; +float f7r; +float f7i; +float t0r; +float t0i; +float t1r; +float t1i; +float *p0r; +float *p1r; +float *IOP; +float *iolimit; +long Colstart; +long iCol; +unsigned long posA; +unsigned long posAi; +unsigned long posB; +unsigned long posBi; + +const unsigned long Nrems2 = POW2((M+3)/2); +const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2; +const unsigned long Nroot_1 = POW2(M/2-1)-1; +const unsigned long ColstartShift = (M+1)/2 +1; +posA = POW2(M); // 1/2 of POW2(M) complexes +posAi = posA + 1; +posB = posA + 2; +posBi = posB + 1; + +iolimit = ioptr + Nrems2; +for (; ioptr < iolimit; ioptr += POW2(M/2+1)){ + for (Colstart = Nroot_1; Colstart >= 0; Colstart--){ + iCol = Nroot_1; + p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4; + IOP = ioptr + (Colstart << ColstartShift); + p1r = IOP + BRLow[iCol]*4; + f0r = *(p0r); + f0i = *(p0r+1); + f1r = *(p0r+posA); + f1i = *(p0r+posAi); + for (; iCol > Colstart;){ + f2r = *(p0r+2); + 
f2i = *(p0r+(2+1)); + f3r = *(p0r+posB); + f3i = *(p0r+posBi); + f4r = *(p1r); + f4i = *(p1r+1); + f5r = *(p1r+posA); + f5i = *(p1r+posAi); + f6r = *(p1r+2); + f6i = *(p1r+(2+1)); + f7r = *(p1r+posB); + f7i = *(p1r+posBi); + + t0r = f0r + f1r; + t0i = f0i + f1i; + f1r = f0r - f1r; + f1i = f0i - f1i; + t1r = f2r + f3r; + t1i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + f0r = f4r + f5r; + f0i = f4i + f5i; + f5r = f4r - f5r; + f5i = f4i - f5i; + f2r = f6r + f7r; + f2i = f6i + f7i; + f7r = f6r - f7r; + f7i = f6i - f7i; + + *(p1r) = t0r; + *(p1r+1) = t0i; + *(p1r+2) = f1r; + *(p1r+(2+1)) = f1i; + *(p1r+posA) = t1r; + *(p1r+posAi) = t1i; + *(p1r+posB) = f3r; + *(p1r+posBi) = f3i; + *(p0r) = f0r; + *(p0r+1) = f0i; + *(p0r+2) = f5r; + *(p0r+(2+1)) = f5i; + *(p0r+posA) = f2r; + *(p0r+posAi) = f2i; + *(p0r+posB) = f7r; + *(p0r+posBi) = f7i; + + p0r -= Nrems2; + f0r = *(p0r); + f0i = *(p0r+1); + f1r = *(p0r+posA); + f1i = *(p0r+posAi); + iCol -= 1; + p1r = IOP + BRLow[iCol]*4; + }; + f2r = *(p0r+2); + f2i = *(p0r+(2+1)); + f3r = *(p0r+posB); + f3i = *(p0r+posBi); + + t0r = f0r + f1r; + t0i = f0i + f1i; + f1r = f0r - f1r; + f1i = f0i - f1i; + t1r = f2r + f3r; + t1i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + + *(p0r) = t0r; + *(p0r+1) = t0i; + *(p0r+2) = f1r; + *(p0r+(2+1)) = f1i; + *(p0r+posA) = t1r; + *(p0r+posAi) = t1i; + *(p0r+posB) = f3r; + *(p0r+posBi) = f3i; + + }; +}; +} + +inline void fft2pt(float *ioptr); +inline void fft2pt(float *ioptr){ +/*** RADIX 2 fft ***/ +float f0r, f0i, f1r, f1i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[2]; +f1i = ioptr[3]; + + /* Butterflys */ + /* + f0 - - t0 + f1 - 1 - f1 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + + /* store result */ +ioptr[0] = t0r; +ioptr[1] = t0i; +ioptr[2] = f1r; +ioptr[3] = f1i; +} + + +inline void fft4pt(float *ioptr); +inline void fft4pt(float *ioptr){ +/*** RADIX 4 fft ***/ +float f0r, f0i, f1r, f1i, f2r, 
f2i, f3r, f3i; +float t0r, t0i, t1r, t1i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[4]; +f1i = ioptr[5]; +f2r = ioptr[2]; +f2i = ioptr[3]; +f3r = ioptr[6]; +f3i = ioptr[7]; + + /* Butterflys */ + /* + f0 - - t0 - - f0 + f1 - 1 - f1 - - f1 + f2 - - f2 - 1 - f2 + f3 - 1 - t1 - -i - f3 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r - t1i; +f3i = f1i + t1r; +f1r = f1r + t1i; +f1i = f1i - t1r; + + /* store result */ +ioptr[0] = f0r; +ioptr[1] = f0i; +ioptr[2] = f1r; +ioptr[3] = f1i; +ioptr[4] = f2r; +ioptr[5] = f2i; +ioptr[6] = f3r; +ioptr[7] = f3i; +} + +inline void fft8pt(float *ioptr); +inline void fft8pt(float *ioptr){ +/*** RADIX 8 fft ***/ +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[8]; +f1i = ioptr[9]; +f2r = ioptr[4]; +f2i = ioptr[5]; +f3r = ioptr[12]; +f3i = ioptr[13]; +f4r = ioptr[2]; +f4i = ioptr[3]; +f5r = ioptr[10]; +f5i = ioptr[11]; +f6r = ioptr[6]; +f6i = ioptr[7]; +f7r = ioptr[14]; +f7i = ioptr[15]; + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - 1 - f1 - - f1 - - f1 + f2 - - f2 - 1 - f2 - - f2 + f3 - 1 - t1 - -i - f3 - - f3 + f4 - - t0 - - f4 - 1 - t0 + f5 - 1 - f5 - - f5 - w3 - f4 + f6 - - f6 - 1 - f6 - -i - t1 + f7 - 1 - t1 - -i - f7 - iw3- f6 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r - t1i; +f3i = f1i + t1r; +f1r = f1r + t1i; +f1i = f1i - t1r; + + +t0r = f4r + f5r; +t0i = f4i + f5i; +f5r = f4r - f5r; +f5i = f4i 
- f5i; + +t1r = f6r - f7r; +t1i = f6i - f7i; +f6r = f6r + f7r; +f6i = f6i + f7i; + +f4r = t0r + f6r; +f4i = t0i + f6i; +f6r = t0r - f6r; +f6i = t0i - f6i; + +f7r = f5r - t1i; +f7i = f5i + t1r; +f5r = f5r + t1i; +f5i = f5i - t1r; + + +t0r = f0r - f4r; +t0i = f0i - f4i; +f0r = f0r + f4r; +f0i = f0i + f4i; + +t1r = f2r - f6i; +t1i = f2i + f6r; +f2r = f2r + f6i; +f2i = f2i - f6r; + +f4r = f1r - f5r * w0r - f5i * w0r; +f4i = f1i + f5r * w0r - f5i * w0r; +f1r = f1r * Two - f4r; +f1i = f1i * Two - f4i; + +f6r = f3r + f7r * w0r - f7i * w0r; +f6i = f3i + f7r * w0r + f7i * w0r; +f3r = f3r * Two - f6r; +f3i = f3i * Two - f6i; + + /* store result */ +ioptr[0] = f0r; +ioptr[1] = f0i; +ioptr[2] = f1r; +ioptr[3] = f1i; +ioptr[4] = f2r; +ioptr[5] = f2i; +ioptr[6] = f3r; +ioptr[7] = f3i; +ioptr[8] = t0r; +ioptr[9] = t0i; +ioptr[10] = f4r; +ioptr[11] = f4i; +ioptr[12] = t1r; +ioptr[13] = t1i; +ioptr[14] = f6r; +ioptr[15] = f6i; +} + +inline void bfR2(float *ioptr, long M, long NDiffU); +inline void bfR2(float *ioptr, long M, long NDiffU){ +/*** 2nd radix 2 stage ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long NSameU; +unsigned long SameUCnt; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; + +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 4; +pos = 2; +posi = pos+1; +NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time +pstrt = ioptr; +p0r = pstrt; +p1r = pstrt+pinc; +p2r = p1r+pinc; +p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - f4 + f1 - 1 - f5 + f2 - - f6 + f3 - 1 - f7 + */ + /* Butterflys */ + /* + f0 - - f4 + f1 - 1 - f5 + f2 - - f6 + f3 - 1 - f7 + */ + +for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){ + + f0r = *p0r; + f1r = *p1r; + f0i = *(p0r + 1); + f1i = *(p1r + 1); + f2r = *p2r; + f3r = *p3r; + f2i = *(p2r + 1); + f3i = *(p3r + 1); + + f4r = f0r + f1r; + f4i = f0i + f1i; + f5r = f0r - 
f1r; + f5i = f0i - f1i; + + f6r = f2r + f3r; + f6i = f2i + f3i; + f7r = f2r - f3r; + f7i = f2i - f3i; + + *p0r = f4r; + *(p0r + 1) = f4i; + *p1r = f5r; + *(p1r + 1) = f5i; + *p2r = f6r; + *(p2r + 1) = f6i; + *p3r = f7r; + *(p3r + 1) = f7i; + + f0r = *(p0r + pos); + f1i = *(p1r + posi); + f0i = *(p0r + posi); + f1r = *(p1r + pos); + f2r = *(p2r + pos); + f3i = *(p3r + posi); + f2i = *(p2r + posi); + f3r = *(p3r + pos); + + f4r = f0r + f1i; + f4i = f0i - f1r; + f5r = f0r - f1i; + f5i = f0i + f1r; + + f6r = f2r + f3i; + f6i = f2i - f3r; + f7r = f2r - f3i; + f7i = f2i + f3r; + + *(p0r + pos) = f4r; + *(p0r + posi) = f4i; + *(p1r + pos) = f5r; + *(p1r + posi) = f5i; + *(p2r + pos) = f6r; + *(p2r + posi) = f6i; + *(p3r + pos) = f7r; + *(p3r + posi) = f7i; + + p0r += pnext; + p1r += pnext; + p2r += pnext; + p3r += pnext; + +} +} + +inline void bfR4(float *ioptr, long M, long NDiffU); +inline void bfR4(float *ioptr, long M, long NDiffU){ +/*** 1 radix 4 stage ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long pnexti; +unsigned long NSameU; +unsigned long SameUCnt; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; + +float w1r = 1.0/MYROOT2; /* cos(pi/4) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t1r, t1i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 4; +pnexti = pnext + 1; +pos = 2; +posi = pos+1; +NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly +pstrt = ioptr; +p0r = pstrt; +p1r = pstrt+pinc; +p2r = p1r+pinc; +p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - f0 - - f4 + f1 - 1 - f5 - - f5 + f2 - - f6 - 1 - f6 + f3 - 1 - f3 - -i - f7 + */ + /* Butterflys */ + /* + f0 - - f4 - - f4 + f1 - -i - t1 - - f5 + f2 - - f2 - w1 - f6 + f3 - -i - f7 - iw1- f7 + */ + +f0r = *p0r; +f1r = *p1r; +f2r = *p2r; +f3r = *p3r; +f0i = *(p0r + 1); +f1i = *(p1r + 1); +f2i = *(p2r + 1); +f3i = *(p3r + 1); + +f5r = f0r - f1r; +f5i = f0i - 
f1i; +f0r = f0r + f1r; +f0i = f0i + f1i; + +f6r = f2r + f3r; +f6i = f2i + f3i; +f3r = f2r - f3r; +f3i = f2i - f3i; + +for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ + + f7r = f5r - f3i; + f7i = f5i + f3r; + f5r = f5r + f3i; + f5i = f5i - f3r; + + f4r = f0r + f6r; + f4i = f0i + f6i; + f6r = f0r - f6r; + f6i = f0i - f6i; + + f2r = *(p2r + pos); + f2i = *(p2r + posi); + f1r = *(p1r + pos); + f1i = *(p1r + posi); + f3i = *(p3r + posi); + f0r = *(p0r + pos); + f3r = *(p3r + pos); + f0i = *(p0r + posi); + + *p3r = f7r; + *p0r = f4r; + *(p3r + 1) = f7i; + *(p0r + 1) = f4i; + *p1r = f5r; + *p2r = f6r; + *(p1r + 1) = f5i; + *(p2r + 1) = f6i; + + f7r = f2r - f3i; + f7i = f2i + f3r; + f2r = f2r + f3i; + f2i = f2i - f3r; + + f4r = f0r + f1i; + f4i = f0i - f1r; + t1r = f0r - f1i; + t1i = f0i + f1r; + + f5r = t1r - f7r * w1r + f7i * w1r; + f5i = t1i - f7r * w1r - f7i * w1r; + f7r = t1r * Two - f5r; + f7i = t1i * Two - f5i; + + f6r = f4r - f2r * w1r - f2i * w1r; + f6i = f4i + f2r * w1r - f2i * w1r; + f4r = f4r * Two - f6r; + f4i = f4i * Two - f6i; + + f3r = *(p3r + pnext); + f0r = *(p0r + pnext); + f3i = *(p3r + pnexti); + f0i = *(p0r + pnexti); + f2r = *(p2r + pnext); + f2i = *(p2r + pnexti); + f1r = *(p1r + pnext); + f1i = *(p1r + pnexti); + + *(p2r + pos) = f6r; + *(p1r + pos) = f5r; + *(p2r + posi) = f6i; + *(p1r + posi) = f5i; + *(p3r + pos) = f7r; + *(p0r + pos) = f4r; + *(p3r + posi) = f7i; + *(p0r + posi) = f4i; + + f6r = f2r + f3r; + f6i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + + f5r = f0r - f1r; + f5i = f0i - f1i; + f0r = f0r + f1r; + f0i = f0i + f1i; + + p3r += pnext; + p0r += pnext; + p1r += pnext; + p2r += pnext; + +} +f7r = f5r - f3i; +f7i = f5i + f3r; +f5r = f5r + f3i; +f5i = f5i - f3r; + +f4r = f0r + f6r; +f4i = f0i + f6i; +f6r = f0r - f6r; +f6i = f0i - f6i; + +f2r = *(p2r + pos); +f2i = *(p2r + posi); +f1r = *(p1r + pos); +f1i = *(p1r + posi); +f3i = *(p3r + posi); +f0r = *(p0r + pos); +f3r = *(p3r + pos); +f0i = *(p0r + posi); + +*p3r = f7r; 
+*p0r = f4r; +*(p3r + 1) = f7i; +*(p0r + 1) = f4i; +*p1r = f5r; +*p2r = f6r; +*(p1r + 1) = f5i; +*(p2r + 1) = f6i; + +f7r = f2r - f3i; +f7i = f2i + f3r; +f2r = f2r + f3i; +f2i = f2i - f3r; + +f4r = f0r + f1i; +f4i = f0i - f1r; +t1r = f0r - f1i; +t1i = f0i + f1r; + +f5r = t1r - f7r * w1r + f7i * w1r; +f5i = t1i - f7r * w1r - f7i * w1r; +f7r = t1r * Two - f5r; +f7i = t1i * Two - f5i; + +f6r = f4r - f2r * w1r - f2i * w1r; +f6i = f4i + f2r * w1r - f2i * w1r; +f4r = f4r * Two - f6r; +f4i = f4i * Two - f6i; + +*(p2r + pos) = f6r; +*(p1r + pos) = f5r; +*(p2r + posi) = f6i; +*(p1r + posi) = f5i; +*(p3r + pos) = f7r; +*(p0r + pos) = f4r; +*(p3r + posi) = f7i; +*(p0r + posi) = f4i; + +} + +inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); +inline void bfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ +/*** RADIX 8 Stages ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long NSameU; +unsigned long Uinc; +unsigned long Uinc2; +unsigned long Uinc4; +unsigned long DiffUCnt; +unsigned long SameUCnt; +unsigned long U2toU3; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; +float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i; + +float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 8; +pos = pinc * 4; +posi = pos + 1; +NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly +Uinc = NSameU * Ustride; +Uinc2 = Uinc * 2; +Uinc4 = Uinc * 4; +U2toU3 = (POW2(M) / 8)*Ustride; +for (; StageCnt > 0 ; StageCnt--){ + + u0r = &Utbl[0]; + u0i = &Utbl[POW2(M-2)*Ustride]; + u1r = u0r; + u1i = u0i; + u2r = u0r; + u2i = u0i; + + w0r = *u0r; + w0i = *u0i; + w1r = *u1r; + w1i = *u1i; + w2r = *u2r; + w2i = *u2i; + w3r = *(u2r+U2toU3); + w3i = *(u2i-U2toU3); + + pstrt = ioptr; + + p0r = 
pstrt; + p1r = pstrt+pinc; + p2r = p1r+pinc; + p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - w0- f1 - - f1 - - f1 + f2 - - f2 - w1- f2 - - f4 + f3 - w0- t1 - iw1- f3 - - f5 + + f4 - - t0 - - f4 - w2- t0 + f5 - w0- f5 - - f5 - w3- t1 + f6 - - f6 - w1- f6 - iw2- f6 + f7 - w0- t1 - iw1- f7 - iw3- f7 + */ + + for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){ + f0r = *p0r; + f0i = *(p0r + 1); + f1r = *p1r; + f1i = *(p1r + 1); + for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ + f2r = *p2r; + f2i = *(p2r + 1); + f3r = *p3r; + f3i = *(p3r + 1); + + t0r = f0r + f1r * w0r + f1i * w0i; + t0i = f0i - f1r * w0i + f1i * w0r; + f1r = f0r * Two - t0r; + f1i = f0i * Two - t0i; + + f4r = *(p0r + pos); + f4i = *(p0r + posi); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + + f6r = *(p2r + pos); + f6i = *(p2r + posi); + f7r = *(p3r + pos); + f7i = *(p3r + posi); + + t1r = f2r - f3r * w0r - f3i * w0i; + t1i = f2i + f3r * w0i - f3i * w0r; + f2r = f2r * Two - t1r; + f2i = f2i * Two - t1i; + + f0r = t0r + f2r * w1r + f2i * w1i; + f0i = t0i - f2r * w1i + f2i * w1r; + f2r = t0r * Two - f0r; + f2i = t0i * Two - f0i; + + f3r = f1r + t1r * w1i - t1i * w1r; + f3i = f1i + t1r * w1r + t1i * w1i; + f1r = f1r * Two - f3r; + f1i = f1i * Two - f3i; + + + t0r = f4r + f5r * w0r + f5i * w0i; + t0i = f4i - f5r * w0i + f5i * w0r; + f5r = f4r * Two - t0r; + f5i = f4i * Two - t0i; + + t1r = f6r - f7r * w0r - f7i * w0i; + t1i = f6i + f7r * w0i - f7i * w0r; + f6r = f6r * Two - t1r; + f6i = f6i * Two - t1i; + + f4r = t0r + f6r * w1r + f6i * w1i; + f4i = t0i - f6r * w1i + f6i * w1r; + f6r = t0r * Two - f4r; + f6i = t0i * Two - f4i; + + f7r = f5r + t1r * w1i - t1i * w1r; + f7i = f5i + t1r * w1r + t1i * w1i; + f5r = f5r * Two - f7r; + f5i = f5i * Two - f7i; + + t0r = f0r - f4r * w2r - f4i * w2i; + t0i = f0i + f4r * w2i - f4i * w2r; + f0r = f0r * Two - t0r; + f0i = f0i * Two - t0i; + + t1r = f1r - f5r * w3r - f5i * w3i; + t1i = f1i + f5r * w3i - f5i * w3r; + f1r = f1r * Two - 
t1r; + f1i = f1i * Two - t1i; + + *(p0r + pos) = t0r; + *(p1r + pos) = t1r; + *(p0r + posi) = t0i; + *(p1r + posi) = t1i; + *p0r = f0r; + *p1r = f1r; + *(p0r + 1) = f0i; + *(p1r + 1) = f1i; + + p0r += pnext; + f0r = *p0r; + f0i = *(p0r + 1); + + p1r += pnext; + + f1r = *p1r; + f1i = *(p1r + 1); + + f4r = f2r - f6r * w2i + f6i * w2r; + f4i = f2i - f6r * w2r - f6i * w2i; + f6r = f2r * Two - f4r; + f6i = f2i * Two - f4i; + + f5r = f3r - f7r * w3i + f7i * w3r; + f5i = f3i - f7r * w3r - f7i * w3i; + f7r = f3r * Two - f5r; + f7i = f3i * Two - f5i; + + *p2r = f4r; + *p3r = f5r; + *(p2r + 1) = f4i; + *(p3r + 1) = f5i; + *(p2r + pos) = f6r; + *(p3r + pos) = f7r; + *(p2r + posi) = f6i; + *(p3r + posi) = f7i; + + p2r += pnext; + p3r += pnext; + + } + + f2r = *p2r; + f2i = *(p2r + 1); + f3r = *p3r; + f3i = *(p3r + 1); + + t0r = f0r + f1r * w0r + f1i * w0i; + t0i = f0i - f1r * w0i + f1i * w0r; + f1r = f0r * Two - t0r; + f1i = f0i * Two - t0i; + + f4r = *(p0r + pos); + f4i = *(p0r + posi); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + + f6r = *(p2r + pos); + f6i = *(p2r + posi); + f7r = *(p3r + pos); + f7i = *(p3r + posi); + + t1r = f2r - f3r * w0r - f3i * w0i; + t1i = f2i + f3r * w0i - f3i * w0r; + f2r = f2r * Two - t1r; + f2i = f2i * Two - t1i; + + f0r = t0r + f2r * w1r + f2i * w1i; + f0i = t0i - f2r * w1i + f2i * w1r; + f2r = t0r * Two - f0r; + f2i = t0i * Two - f0i; + + f3r = f1r + t1r * w1i - t1i * w1r; + f3i = f1i + t1r * w1r + t1i * w1i; + f1r = f1r * Two - f3r; + f1i = f1i * Two - f3i; + + if (DiffUCnt == NDiffU/2) + Uinc4 = -Uinc4; + + u0r += Uinc4; + u0i -= Uinc4; + u1r += Uinc2; + u1i -= Uinc2; + u2r += Uinc; + u2i -= Uinc; + + pstrt += 2; + + t0r = f4r + f5r * w0r + f5i * w0i; + t0i = f4i - f5r * w0i + f5i * w0r; + f5r = f4r * Two - t0r; + f5i = f4i * Two - t0i; + + t1r = f6r - f7r * w0r - f7i * w0i; + t1i = f6i + f7r * w0i - f7i * w0r; + f6r = f6r * Two - t1r; + f6i = f6i * Two - t1i; + + f4r = t0r + f6r * w1r + f6i * w1i; + f4i = t0i - f6r * w1i + f6i * w1r; + f6r 
= t0r * Two - f4r; + f6i = t0i * Two - f4i; + + f7r = f5r + t1r * w1i - t1i * w1r; + f7i = f5i + t1r * w1r + t1i * w1i; + f5r = f5r * Two - f7r; + f5i = f5i * Two - f7i; + + w0r = *u0r; + w0i = *u0i; + w1r = *u1r; + w1i = *u1i; + + if (DiffUCnt <= NDiffU/2) + w0r = -w0r; + + t0r = f0r - f4r * w2r - f4i * w2i; + t0i = f0i + f4r * w2i - f4i * w2r; + f0r = f0r * Two - t0r; + f0i = f0i * Two - t0i; + + f4r = f2r - f6r * w2i + f6i * w2r; + f4i = f2i - f6r * w2r - f6i * w2i; + f6r = f2r * Two - f4r; + f6i = f2i * Two - f4i; + + *(p0r + pos) = t0r; + *p2r = f4r; + *(p0r + posi) = t0i; + *(p2r + 1) = f4i; + w2r = *u2r; + w2i = *u2i; + *p0r = f0r; + *(p2r + pos) = f6r; + *(p0r + 1) = f0i; + *(p2r + posi) = f6i; + + p0r = pstrt; + p2r = pstrt + pinc + pinc; + + t1r = f1r - f5r * w3r - f5i * w3i; + t1i = f1i + f5r * w3i - f5i * w3r; + f1r = f1r * Two - t1r; + f1i = f1i * Two - t1i; + + f5r = f3r - f7r * w3i + f7i * w3r; + f5i = f3i - f7r * w3r - f7i * w3i; + f7r = f3r * Two - f5r; + f7i = f3i * Two - f5i; + + *(p1r + pos) = t1r; + *p3r = f5r; + *(p1r + posi) = t1i; + *(p3r + 1) = f5i; + w3r = *(u2r+U2toU3); + w3i = *(u2i-U2toU3); + *p1r = f1r; + *(p3r + pos) = f7r; + *(p1r + 1) = f1i; + *(p3r + posi) = f7i; + + p1r = pstrt + pinc; + p3r = p2r + pinc; + } + NSameU /= 8; + Uinc /= 8; + Uinc2 /= 8; + Uinc4 = Uinc * 4; + NDiffU *= 8; + pinc *= 8; + pnext *= 8; + pos *= 8; + posi = pos + 1; +} +} + +void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); +void fftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ +/* recursive bfstages calls to maximize on chip cache efficiency */ +long i1; +if (M <= MCACHE) // fits on chip ? 
+ bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */ +else{ + for (i1=0; i1<8; i1++){ + fftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */ + } + bfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */ +} +} + +void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +long StageCnt; +long NDiffU; + +switch (M){ +case 0: + break; +case 1: + for (;Rows>0;Rows--){ + fft2pt(ioptr); /* a 2 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 2: + for (;Rows>0;Rows--){ + fft4pt(ioptr); /* a 4 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 3: + for (;Rows>0;Rows--){ + fft8pt(ioptr); /* an 8 pt fft */ + ioptr += 2*POW2(M); + } + break; +default: + for (;Rows>0;Rows--){ + + bitrevR2(ioptr, M, BRLow); /* bit reverse and first radix 2 stage */ + + StageCnt = (M-1) / 3; // number of radix 8 stages + NDiffU = 2; // one radix 2 stage already complete + + if ( (M-1-(StageCnt * 3)) == 1 ){ + bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ + NDiffU *= 2; + } + + if ( (M-1-(StageCnt * 3)) == 2 ){ + bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ + NDiffU *= 4; + } + + if (M <= MCACHE) + bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ + + else{ + fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ + } + + ioptr += 2*POW2(M); + } +} +} + +/************************************************ +parts of iffts1 +*************************************************/ + +inline void scbitrevR2(float *ioptr, long M, short *BRLow, float scale); +inline void scbitrevR2(float *ioptr, long M, 
short *BRLow, float scale){ +/*** scaled bit reverse and first radix 2 stage forward or inverse fft ***/ +float f0r; +float f0i; +float f1r; +float f1i; +float f2r; +float f2i; +float f3r; +float f3i; +float f4r; +float f4i; +float f5r; +float f5i; +float f6r; +float f6i; +float f7r; +float f7i; +float t0r; +float t0i; +float t1r; +float t1i; +float *p0r; +float *p1r; +float *IOP; +float *iolimit; +long Colstart; +long iCol; +unsigned long posA; +unsigned long posAi; +unsigned long posB; +unsigned long posBi; + +const unsigned long Nrems2 = POW2((M+3)/2); +const unsigned long Nroot_1_ColInc = POW2(M)-Nrems2; +const unsigned long Nroot_1 = POW2(M/2-1)-1; +const unsigned long ColstartShift = (M+1)/2 +1; +posA = POW2(M); // 1/2 of POW2(M) complexes +posAi = posA + 1; +posB = posA + 2; +posBi = posB + 1; + +iolimit = ioptr + Nrems2; +for (; ioptr < iolimit; ioptr += POW2(M/2+1)){ + for (Colstart = Nroot_1; Colstart >= 0; Colstart--){ + iCol = Nroot_1; + p0r = ioptr+ Nroot_1_ColInc + BRLow[Colstart]*4; + IOP = ioptr + (Colstart << ColstartShift); + p1r = IOP + BRLow[iCol]*4; + f0r = *(p0r); + f0i = *(p0r+1); + f1r = *(p0r+posA); + f1i = *(p0r+posAi); + for (; iCol > Colstart;){ + f2r = *(p0r+2); + f2i = *(p0r+(2+1)); + f3r = *(p0r+posB); + f3i = *(p0r+posBi); + f4r = *(p1r); + f4i = *(p1r+1); + f5r = *(p1r+posA); + f5i = *(p1r+posAi); + f6r = *(p1r+2); + f6i = *(p1r+(2+1)); + f7r = *(p1r+posB); + f7i = *(p1r+posBi); + + t0r = f0r + f1r; + t0i = f0i + f1i; + f1r = f0r - f1r; + f1i = f0i - f1i; + t1r = f2r + f3r; + t1i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + f0r = f4r + f5r; + f0i = f4i + f5i; + f5r = f4r - f5r; + f5i = f4i - f5i; + f2r = f6r + f7r; + f2i = f6i + f7i; + f7r = f6r - f7r; + f7i = f6i - f7i; + + *(p1r) = scale*t0r; + *(p1r+1) = scale*t0i; + *(p1r+2) = scale*f1r; + *(p1r+(2+1)) = scale*f1i; + *(p1r+posA) = scale*t1r; + *(p1r+posAi) = scale*t1i; + *(p1r+posB) = scale*f3r; + *(p1r+posBi) = scale*f3i; + *(p0r) = scale*f0r; + *(p0r+1) = scale*f0i; + 
*(p0r+2) = scale*f5r; + *(p0r+(2+1)) = scale*f5i; + *(p0r+posA) = scale*f2r; + *(p0r+posAi) = scale*f2i; + *(p0r+posB) = scale*f7r; + *(p0r+posBi) = scale*f7i; + + p0r -= Nrems2; + f0r = *(p0r); + f0i = *(p0r+1); + f1r = *(p0r+posA); + f1i = *(p0r+posAi); + iCol -= 1; + p1r = IOP + BRLow[iCol]*4; + }; + f2r = *(p0r+2); + f2i = *(p0r+(2+1)); + f3r = *(p0r+posB); + f3i = *(p0r+posBi); + + t0r = f0r + f1r; + t0i = f0i + f1i; + f1r = f0r - f1r; + f1i = f0i - f1i; + t1r = f2r + f3r; + t1i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + + *(p0r) = scale*t0r; + *(p0r+1) = scale*t0i; + *(p0r+2) = scale*f1r; + *(p0r+(2+1)) = scale*f1i; + *(p0r+posA) = scale*t1r; + *(p0r+posAi) = scale*t1i; + *(p0r+posB) = scale*f3r; + *(p0r+posBi) = scale*f3i; + + }; +}; +} + +inline void ifft2pt(float *ioptr, float scale); +inline void ifft2pt(float *ioptr, float scale){ +/*** RADIX 2 ifft ***/ +float f0r, f0i, f1r, f1i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[2]; +f1i = ioptr[3]; + + /* Butterflys */ + /* + f0 - - t0 + f1 - 1 - f1 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + + /* store result */ +ioptr[0] = scale*t0r; +ioptr[1] = scale*t0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +} + +inline void ifft4pt(float *ioptr, float scale); +inline void ifft4pt(float *ioptr, float scale){ +/*** RADIX 4 ifft ***/ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float t0r, t0i, t1r, t1i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[4]; +f1i = ioptr[5]; +f2r = ioptr[2]; +f2i = ioptr[3]; +f3r = ioptr[6]; +f3i = ioptr[7]; + + /* Butterflys */ + /* + f0 - - t0 - - f0 + f1 - 1 - f1 - - f1 + f2 - - f2 - 1 - f2 + f3 - 1 - t1 - i - f3 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r + t1i; 
+f3i = f1i - t1r; +f1r = f1r - t1i; +f1i = f1i + t1r; + + /* store result */ +ioptr[0] = scale*f0r; +ioptr[1] = scale*f0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +} + +inline void ifft8pt(float *ioptr, float scale); +inline void ifft8pt(float *ioptr, float scale){ +/*** RADIX 8 ifft ***/ +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[8]; +f1i = ioptr[9]; +f2r = ioptr[4]; +f2i = ioptr[5]; +f3r = ioptr[12]; +f3i = ioptr[13]; +f4r = ioptr[2]; +f4i = ioptr[3]; +f5r = ioptr[10]; +f5i = ioptr[11]; +f6r = ioptr[6]; +f6i = ioptr[7]; +f7r = ioptr[14]; +f7i = ioptr[15]; + + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - 1 - f1 - - f1 - - f1 + f2 - - f2 - 1 - f2 - - f2 + f3 - 1 - t1 - i - f3 - - f3 + f4 - - t0 - - f4 - 1 - t0 + f5 - 1 - f5 - - f5 - w3 - f4 + f6 - - f6 - 1 - f6 - i - t1 + f7 - 1 - t1 - i - f7 - iw3- f6 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r + t1i; +f3i = f1i - t1r; +f1r = f1r - t1i; +f1i = f1i + t1r; + + +t0r = f4r + f5r; +t0i = f4i + f5i; +f5r = f4r - f5r; +f5i = f4i - f5i; + +t1r = f6r - f7r; +t1i = f6i - f7i; +f6r = f6r + f7r; +f6i = f6i + f7i; + +f4r = t0r + f6r; +f4i = t0i + f6i; +f6r = t0r - f6r; +f6i = t0i - f6i; + +f7r = f5r + t1i; +f7i = f5i - t1r; +f5r = f5r - t1i; +f5i = f5i + t1r; + + +t0r = f0r - f4r; +t0i = f0i - f4i; +f0r = f0r + f4r; +f0i = f0i + f4i; + +t1r = f2r + f6i; +t1i = f2i - f6r; +f2r = f2r - f6i; +f2i = f2i + f6r; + +f4r = f1r - f5r * w0r + f5i * w0r; +f4i = f1i - f5r * w0r - f5i * w0r; +f1r = f1r * Two - f4r; +f1i = f1i 
* Two - f4i; + +f6r = f3r + f7r * w0r + f7i * w0r; +f6i = f3i - f7r * w0r + f7i * w0r; +f3r = f3r * Two - f6r; +f3i = f3i * Two - f6i; + + /* store result */ +ioptr[0] = scale*f0r; +ioptr[1] = scale*f0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +ioptr[8] = scale*t0r; +ioptr[9] = scale*t0i; +ioptr[10] = scale*f4r; +ioptr[11] = scale*f4i; +ioptr[12] = scale*t1r; +ioptr[13] = scale*t1i; +ioptr[14] = scale*f6r; +ioptr[15] = scale*f6i; +} + +inline void ibfR2(float *ioptr, long M, long NDiffU); +inline void ibfR2(float *ioptr, long M, long NDiffU){ +/*** 2nd radix 2 stage ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long NSameU; +unsigned long SameUCnt; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; + +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 4; +pos = 2; +posi = pos+1; +NSameU = POW2(M) / 4 /NDiffU; // 4 Us at a time +pstrt = ioptr; +p0r = pstrt; +p1r = pstrt+pinc; +p2r = p1r+pinc; +p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - f4 + f1 - 1 - f5 + f2 - - f6 + f3 - 1 - f7 + */ + /* Butterflys */ + /* + f0 - - f4 + f1 - 1 - f5 + f2 - - f6 + f3 - 1 - f7 + */ + +for (SameUCnt = NSameU; SameUCnt > 0 ; SameUCnt--){ + + f0r = *p0r; + f1r = *p1r; + f0i = *(p0r + 1); + f1i = *(p1r + 1); + f2r = *p2r; + f3r = *p3r; + f2i = *(p2r + 1); + f3i = *(p3r + 1); + + f4r = f0r + f1r; + f4i = f0i + f1i; + f5r = f0r - f1r; + f5i = f0i - f1i; + + f6r = f2r + f3r; + f6i = f2i + f3i; + f7r = f2r - f3r; + f7i = f2i - f3i; + + *p0r = f4r; + *(p0r + 1) = f4i; + *p1r = f5r; + *(p1r + 1) = f5i; + *p2r = f6r; + *(p2r + 1) = f6i; + *p3r = f7r; + *(p3r + 1) = f7i; + + f0r = *(p0r + pos); + f1i = *(p1r + posi); + f0i = *(p0r + posi); + f1r = *(p1r + pos); + f2r = *(p2r + pos); + f3i = *(p3r + posi); + f2i = *(p2r 
+ posi); + f3r = *(p3r + pos); + + f4r = f0r - f1i; + f4i = f0i + f1r; + f5r = f0r + f1i; + f5i = f0i - f1r; + + f6r = f2r - f3i; + f6i = f2i + f3r; + f7r = f2r + f3i; + f7i = f2i - f3r; + + *(p0r + pos) = f4r; + *(p0r + posi) = f4i; + *(p1r + pos) = f5r; + *(p1r + posi) = f5i; + *(p2r + pos) = f6r; + *(p2r + posi) = f6i; + *(p3r + pos) = f7r; + *(p3r + posi) = f7i; + + p0r += pnext; + p1r += pnext; + p2r += pnext; + p3r += pnext; + +} +} + +inline void ibfR4(float *ioptr, long M, long NDiffU); +inline void ibfR4(float *ioptr, long M, long NDiffU){ +/*** 1 radix 4 stage ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long pnexti; +unsigned long NSameU; +unsigned long SameUCnt; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; + +float w1r = 1.0/MYROOT2; /* cos(pi/4) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t1r, t1i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 4; +pnexti = pnext + 1; +pos = 2; +posi = pos+1; +NSameU = POW2(M) / 4 /NDiffU; // 4 pts per butterfly +pstrt = ioptr; +p0r = pstrt; +p1r = pstrt+pinc; +p2r = p1r+pinc; +p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - f0 - - f4 + f1 - 1 - f5 - - f5 + f2 - - f6 - 1 - f6 + f3 - 1 - f3 - -i - f7 + */ + /* Butterflys */ + /* + f0 - - f4 - - f4 + f1 - -i - t1 - - f5 + f2 - - f2 - w1 - f6 + f3 - -i - f7 - iw1- f7 + */ + +f0r = *p0r; +f1r = *p1r; +f2r = *p2r; +f3r = *p3r; +f0i = *(p0r + 1); +f1i = *(p1r + 1); +f2i = *(p2r + 1); +f3i = *(p3r + 1); + +f5r = f0r - f1r; +f5i = f0i - f1i; +f0r = f0r + f1r; +f0i = f0i + f1i; + +f6r = f2r + f3r; +f6i = f2i + f3i; +f3r = f2r - f3r; +f3i = f2i - f3i; + +for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ + + f7r = f5r + f3i; + f7i = f5i - f3r; + f5r = f5r - f3i; + f5i = f5i + f3r; + + f4r = f0r + f6r; + f4i = f0i + f6i; + f6r = f0r - f6r; + f6i = f0i - f6i; + + f2r = *(p2r + pos); + f2i = *(p2r + posi); + f1r = *(p1r + 
pos); + f1i = *(p1r + posi); + f3i = *(p3r + posi); + f0r = *(p0r + pos); + f3r = *(p3r + pos); + f0i = *(p0r + posi); + + *p3r = f7r; + *p0r = f4r; + *(p3r + 1) = f7i; + *(p0r + 1) = f4i; + *p1r = f5r; + *p2r = f6r; + *(p1r + 1) = f5i; + *(p2r + 1) = f6i; + + f7r = f2r + f3i; + f7i = f2i - f3r; + f2r = f2r - f3i; + f2i = f2i + f3r; + + f4r = f0r - f1i; + f4i = f0i + f1r; + t1r = f0r + f1i; + t1i = f0i - f1r; + + f5r = t1r - f7r * w1r - f7i * w1r; + f5i = t1i + f7r * w1r - f7i * w1r; + f7r = t1r * Two - f5r; + f7i = t1i * Two - f5i; + + f6r = f4r - f2r * w1r + f2i * w1r; + f6i = f4i - f2r * w1r - f2i * w1r; + f4r = f4r * Two - f6r; + f4i = f4i * Two - f6i; + + f3r = *(p3r + pnext); + f0r = *(p0r + pnext); + f3i = *(p3r + pnexti); + f0i = *(p0r + pnexti); + f2r = *(p2r + pnext); + f2i = *(p2r + pnexti); + f1r = *(p1r + pnext); + f1i = *(p1r + pnexti); + + *(p2r + pos) = f6r; + *(p1r + pos) = f5r; + *(p2r + posi) = f6i; + *(p1r + posi) = f5i; + *(p3r + pos) = f7r; + *(p0r + pos) = f4r; + *(p3r + posi) = f7i; + *(p0r + posi) = f4i; + + f6r = f2r + f3r; + f6i = f2i + f3i; + f3r = f2r - f3r; + f3i = f2i - f3i; + + f5r = f0r - f1r; + f5i = f0i - f1i; + f0r = f0r + f1r; + f0i = f0i + f1i; + + p3r += pnext; + p0r += pnext; + p1r += pnext; + p2r += pnext; + +} +f7r = f5r + f3i; +f7i = f5i - f3r; +f5r = f5r - f3i; +f5i = f5i + f3r; + +f4r = f0r + f6r; +f4i = f0i + f6i; +f6r = f0r - f6r; +f6i = f0i - f6i; + +f2r = *(p2r + pos); +f2i = *(p2r + posi); +f1r = *(p1r + pos); +f1i = *(p1r + posi); +f3i = *(p3r + posi); +f0r = *(p0r + pos); +f3r = *(p3r + pos); +f0i = *(p0r + posi); + +*p3r = f7r; +*p0r = f4r; +*(p3r + 1) = f7i; +*(p0r + 1) = f4i; +*p1r = f5r; +*p2r = f6r; +*(p1r + 1) = f5i; +*(p2r + 1) = f6i; + +f7r = f2r + f3i; +f7i = f2i - f3r; +f2r = f2r - f3i; +f2i = f2i + f3r; + +f4r = f0r - f1i; +f4i = f0i + f1r; +t1r = f0r + f1i; +t1i = f0i - f1r; + +f5r = t1r - f7r * w1r - f7i * w1r; +f5i = t1i + f7r * w1r - f7i * w1r; +f7r = t1r * Two - f5r; +f7i = t1i * Two - f5i; + +f6r 
= f4r - f2r * w1r + f2i * w1r; +f6i = f4i - f2r * w1r - f2i * w1r; +f4r = f4r * Two - f6r; +f4i = f4i * Two - f6i; + +*(p2r + pos) = f6r; +*(p1r + pos) = f5r; +*(p2r + posi) = f6i; +*(p1r + posi) = f5i; +*(p3r + pos) = f7r; +*(p0r + pos) = f4r; +*(p3r + posi) = f7i; +*(p0r + posi) = f4i; + +} + +inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); +inline void ibfstages(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ +/*** RADIX 8 Stages ***/ +unsigned long pos; +unsigned long posi; +unsigned long pinc; +unsigned long pnext; +unsigned long NSameU; +unsigned long Uinc; +unsigned long Uinc2; +unsigned long Uinc4; +unsigned long DiffUCnt; +unsigned long SameUCnt; +unsigned long U2toU3; + +float *pstrt; +float *p0r, *p1r, *p2r, *p3r; +float *u0r, *u0i, *u1r, *u1i, *u2r, *u2i; + +float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + +pinc = NDiffU * 2; // 2 floats per complex +pnext = pinc * 8; +pos = pinc * 4; +posi = pos + 1; +NSameU = POW2(M) / 8 /NDiffU; // 8 pts per butterfly +Uinc = NSameU * Ustride; +Uinc2 = Uinc * 2; +Uinc4 = Uinc * 4; +U2toU3 = (POW2(M) / 8)*Ustride; +for (; StageCnt > 0 ; StageCnt--){ + + u0r = &Utbl[0]; + u0i = &Utbl[POW2(M-2)*Ustride]; + u1r = u0r; + u1i = u0i; + u2r = u0r; + u2i = u0i; + + w0r = *u0r; + w0i = *u0i; + w1r = *u1r; + w1i = *u1i; + w2r = *u2r; + w2i = *u2i; + w3r = *(u2r+U2toU3); + w3i = *(u2i-U2toU3); + + pstrt = ioptr; + + p0r = pstrt; + p1r = pstrt+pinc; + p2r = p1r+pinc; + p3r = p2r+pinc; + + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - w0- f1 - - f1 - - f1 + f2 - - f2 - w1- f2 - - f4 + f3 - w0- t1 - iw1- f3 - - f5 + + f4 - - t0 - - f4 - w2- t0 + f5 - w0- f5 - - f5 - w3- t1 + f6 - - f6 - w1- f6 - iw2- f6 + f7 - w0- t1 - iw1- f7 - iw3- f7 + */ + + for (DiffUCnt = NDiffU; DiffUCnt > 0 ; DiffUCnt--){ + 
f0r = *p0r; + f0i = *(p0r + 1); + f1r = *p1r; + f1i = *(p1r + 1); + for (SameUCnt = NSameU-1; SameUCnt > 0 ; SameUCnt--){ + f2r = *p2r; + f2i = *(p2r + 1); + f3r = *p3r; + f3i = *(p3r + 1); + + t0r = f0r + f1r * w0r - f1i * w0i; + t0i = f0i + f1r * w0i + f1i * w0r; + f1r = f0r * Two - t0r; + f1i = f0i * Two - t0i; + + f4r = *(p0r + pos); + f4i = *(p0r + posi); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + + f6r = *(p2r + pos); + f6i = *(p2r + posi); + f7r = *(p3r + pos); + f7i = *(p3r + posi); + + t1r = f2r - f3r * w0r + f3i * w0i; + t1i = f2i - f3r * w0i - f3i * w0r; + f2r = f2r * Two - t1r; + f2i = f2i * Two - t1i; + + f0r = t0r + f2r * w1r - f2i * w1i; + f0i = t0i + f2r * w1i + f2i * w1r; + f2r = t0r * Two - f0r; + f2i = t0i * Two - f0i; + + f3r = f1r + t1r * w1i + t1i * w1r; + f3i = f1i - t1r * w1r + t1i * w1i; + f1r = f1r * Two - f3r; + f1i = f1i * Two - f3i; + + + t0r = f4r + f5r * w0r - f5i * w0i; + t0i = f4i + f5r * w0i + f5i * w0r; + f5r = f4r * Two - t0r; + f5i = f4i * Two - t0i; + + t1r = f6r - f7r * w0r + f7i * w0i; + t1i = f6i - f7r * w0i - f7i * w0r; + f6r = f6r * Two - t1r; + f6i = f6i * Two - t1i; + + f4r = t0r + f6r * w1r - f6i * w1i; + f4i = t0i + f6r * w1i + f6i * w1r; + f6r = t0r * Two - f4r; + f6i = t0i * Two - f4i; + + f7r = f5r + t1r * w1i + t1i * w1r; + f7i = f5i - t1r * w1r + t1i * w1i; + f5r = f5r * Two - f7r; + f5i = f5i * Two - f7i; + + t0r = f0r - f4r * w2r + f4i * w2i; + t0i = f0i - f4r * w2i - f4i * w2r; + f0r = f0r * Two - t0r; + f0i = f0i * Two - t0i; + + t1r = f1r - f5r * w3r + f5i * w3i; + t1i = f1i - f5r * w3i - f5i * w3r; + f1r = f1r * Two - t1r; + f1i = f1i * Two - t1i; + + *(p0r + pos) = t0r; + *(p0r + posi) = t0i; + *p0r = f0r; + *(p0r + 1) = f0i; + + p0r += pnext; + f0r = *p0r; + f0i = *(p0r + 1); + + *(p1r + pos) = t1r; + *(p1r + posi) = t1i; + *p1r = f1r; + *(p1r + 1) = f1i; + + p1r += pnext; + + f1r = *p1r; + f1i = *(p1r + 1); + + f4r = f2r - f6r * w2i - f6i * w2r; + f4i = f2i + f6r * w2r - f6i * w2i; + f6r = f2r * Two - 
f4r; + f6i = f2i * Two - f4i; + + f5r = f3r - f7r * w3i - f7i * w3r; + f5i = f3i + f7r * w3r - f7i * w3i; + f7r = f3r * Two - f5r; + f7i = f3i * Two - f5i; + + *p2r = f4r; + *(p2r + 1) = f4i; + *(p2r + pos) = f6r; + *(p2r + posi) = f6i; + + p2r += pnext; + + *p3r = f5r; + *(p3r + 1) = f5i; + *(p3r + pos) = f7r; + *(p3r + posi) = f7i; + + p3r += pnext; + + } + + f2r = *p2r; + f2i = *(p2r + 1); + f3r = *p3r; + f3i = *(p3r + 1); + + t0r = f0r + f1r * w0r - f1i * w0i; + t0i = f0i + f1r * w0i + f1i * w0r; + f1r = f0r * Two - t0r; + f1i = f0i * Two - t0i; + + f4r = *(p0r + pos); + f4i = *(p0r + posi); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + + f6r = *(p2r + pos); + f6i = *(p2r + posi); + f7r = *(p3r + pos); + f7i = *(p3r + posi); + + t1r = f2r - f3r * w0r + f3i * w0i; + t1i = f2i - f3r * w0i - f3i * w0r; + f2r = f2r * Two - t1r; + f2i = f2i * Two - t1i; + + f0r = t0r + f2r * w1r - f2i * w1i; + f0i = t0i + f2r * w1i + f2i * w1r; + f2r = t0r * Two - f0r; + f2i = t0i * Two - f0i; + + f3r = f1r + t1r * w1i + t1i * w1r; + f3i = f1i - t1r * w1r + t1i * w1i; + f1r = f1r * Two - f3r; + f1i = f1i * Two - f3i; + + if (DiffUCnt == NDiffU/2) + Uinc4 = -Uinc4; + + u0r += Uinc4; + u0i -= Uinc4; + u1r += Uinc2; + u1i -= Uinc2; + u2r += Uinc; + u2i -= Uinc; + + pstrt += 2; + + t0r = f4r + f5r * w0r - f5i * w0i; + t0i = f4i + f5r * w0i + f5i * w0r; + f5r = f4r * Two - t0r; + f5i = f4i * Two - t0i; + + t1r = f6r - f7r * w0r + f7i * w0i; + t1i = f6i - f7r * w0i - f7i * w0r; + f6r = f6r * Two - t1r; + f6i = f6i * Two - t1i; + + f4r = t0r + f6r * w1r - f6i * w1i; + f4i = t0i + f6r * w1i + f6i * w1r; + f6r = t0r * Two - f4r; + f6i = t0i * Two - f4i; + + f7r = f5r + t1r * w1i + t1i * w1r; + f7i = f5i - t1r * w1r + t1i * w1i; + f5r = f5r * Two - f7r; + f5i = f5i * Two - f7i; + + w0r = *u0r; + w0i = *u0i; + w1r = *u1r; + w1i = *u1i; + + if (DiffUCnt <= NDiffU/2) + w0r = -w0r; + + t0r = f0r - f4r * w2r + f4i * w2i; + t0i = f0i - f4r * w2i - f4i * w2r; + f0r = f0r * Two - t0r; + f0i = f0i * 
Two - t0i; + + f4r = f2r - f6r * w2i - f6i * w2r; + f4i = f2i + f6r * w2r - f6i * w2i; + f6r = f2r * Two - f4r; + f6i = f2i * Two - f4i; + + *(p0r + pos) = t0r; + *p2r = f4r; + *(p0r + posi) = t0i; + *(p2r + 1) = f4i; + w2r = *u2r; + w2i = *u2i; + *p0r = f0r; + *(p2r + pos) = f6r; + *(p0r + 1) = f0i; + *(p2r + posi) = f6i; + + p0r = pstrt; + p2r = pstrt + pinc + pinc; + + t1r = f1r - f5r * w3r + f5i * w3i; + t1i = f1i - f5r * w3i - f5i * w3r; + f1r = f1r * Two - t1r; + f1i = f1i * Two - t1i; + + f5r = f3r - f7r * w3i - f7i * w3r; + f5i = f3i + f7r * w3r - f7i * w3i; + f7r = f3r * Two - f5r; + f7i = f3i * Two - f5i; + + *(p1r + pos) = t1r; + *p3r = f5r; + *(p1r + posi) = t1i; + *(p3r + 1) = f5i; + w3r = *(u2r+U2toU3); + w3i = *(u2i-U2toU3); + *p1r = f1r; + *(p3r + pos) = f7r; + *(p1r + 1) = f1i; + *(p3r + posi) = f7i; + + p1r = pstrt + pinc; + p3r = p2r + pinc; + } + NSameU /= 8; + Uinc /= 8; + Uinc2 /= 8; + Uinc4 = Uinc * 4; + NDiffU *= 8; + pinc *= 8; + pnext *= 8; + pos *= 8; + posi = pos + 1; +} +} + +void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt); +void ifftrecurs(float *ioptr, long M, float *Utbl, long Ustride, long NDiffU, long StageCnt){ +/* recursive bfstages calls to maximize on chip cache efficiency */ +long i1; +if (M <= MCACHE) + ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); /* RADIX 8 Stages */ +else{ + for (i1=0; i1<8; i1++){ + ifftrecurs(&ioptr[i1*POW2(M-3)*2], M-3, Utbl, 8*Ustride, NDiffU, StageCnt-1); /* RADIX 8 Stages */ + } + ibfstages(ioptr, M, Utbl, Ustride, POW2(M-3), 1); /* RADIX 8 Stage */ +} +} + +void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* 
OUTPUTS */ +/* *ioptr = output data array */ + +long StageCnt; +long NDiffU; +const float scale = 1.0/POW2(M); + +switch (M){ +case 0: + break; +case 1: + for (;Rows>0;Rows--){ + ifft2pt(ioptr, scale); /* a 2 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 2: + for (;Rows>0;Rows--){ + ifft4pt(ioptr, scale); /* a 4 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 3: + for (;Rows>0;Rows--){ + ifft8pt(ioptr, scale); /* an 8 pt fft */ + ioptr += 2*POW2(M); + } + break; +default: + for (;Rows>0;Rows--){ + + scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ + + StageCnt = (M-1) / 3; // number of radix 8 stages + NDiffU = 2; // one radix 2 stage already complete + + if ( (M-1-(StageCnt * 3)) == 1 ){ + ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ + NDiffU *= 2; + } + + if ( (M-1-(StageCnt * 3)) == 2 ){ + ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ + NDiffU *= 4; + } + + if (M <= MCACHE) + ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ + + else{ + ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); /* RADIX 8 Stages */ + } + + ioptr += 2*POW2(M); + } +} +} + +/************************************************ +parts of rffts1 +*************************************************/ + +inline void rfft1pt(float *ioptr); +inline void rfft1pt(float *ioptr){ +/*** RADIX 2 rfft ***/ +float f0r, f0i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; + + /* finish rfft */ +t0r = f0r + f0i; +t0i = f0r - f0i; + + /* store result */ +ioptr[0] = t0r; +ioptr[1] = t0i; +} + +inline void rfft2pt(float *ioptr); +inline void rfft2pt(float *ioptr){ +/*** RADIX 4 rfft ***/ +float f0r, f0i, f1r, f1i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[2]; +f1i = ioptr[3]; + + /* Butterflys */ + /* + f0 - - t0 + f1 - 1 - f1 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f1i - f0i; + /* finish rfft */ +f0r = t0r + t0i; +f0i = t0r - t0i; + + /* store result 
*/ +ioptr[0] = f0r; +ioptr[1] = f0i; +ioptr[2] = f1r; +ioptr[3] = f1i; +} + +inline void rfft4pt(float *ioptr); +inline void rfft4pt(float *ioptr){ +/*** RADIX 8 rfft ***/ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float t0r, t0i, t1r, t1i; +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +const float Two = 2.0; +const float scale = 0.5; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[4]; +f1i = ioptr[5]; +f2r = ioptr[2]; +f2i = ioptr[3]; +f3r = ioptr[6]; +f3i = ioptr[7]; + + /* Butterflys */ + /* + f0 - - t0 - - f0 + f1 - 1 - f1 - - f1 + f2 - - f2 - 1 - f2 + f3 - 1 - t1 - -i - f3 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = f2i - t0i; // neg for rfft + +f3r = f1r - t1i; +f3i = f1i + t1r; +f1r = f1r + t1i; +f1i = f1i - t1r; + + /* finish rfft */ +t0r = f0r + f0i; /* compute Re(x[0]) */ +t0i = f0r - f0i; /* compute Re(x[N/2]) */ + +t1r = f1r + f3r; +t1i = f1i - f3i; +f0r = f1i + f3i; +f0i = f3r - f1r; + +f1r = t1r + w0r * f0r + w0r * f0i; +f1i = t1i - w0r * f0r + w0r * f0i; +f3r = Two * t1r - f1r; +f3i = f1i - Two * t1i; + + /* store result */ +ioptr[4] = f2r; +ioptr[5] = f2i; +ioptr[0] = t0r; +ioptr[1] = t0i; + +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +} + +inline void rfft8pt(float *ioptr); +inline void rfft8pt(float *ioptr){ +/*** RADIX 16 rfft ***/ +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +float w1r = MYCOSPID8; /* cos(pi/8) */ +float w1i = MYSINPID8; /* sin(pi/8) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; +const float scale = 0.5; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; +f1r = ioptr[8]; +f1i = ioptr[9]; +f2r = ioptr[4]; +f2i = ioptr[5]; +f3r = ioptr[12]; +f3i = ioptr[13]; +f4r = ioptr[2]; +f4i 
= ioptr[3]; +f5r = ioptr[10]; +f5i = ioptr[11]; +f6r = ioptr[6]; +f6i = ioptr[7]; +f7r = ioptr[14]; +f7i = ioptr[15]; + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1 - 1 - f1 - - f1 - - f1 + f2 - - f2 - 1 - f2 - - f2 + f3 - 1 - t1 - -i - f3 - - f3 + f4 - - t0 - - f4 - 1 - t0 + f5 - 1 - f5 - - f5 - w3 - f4 + f6 - - f6 - 1 - f6 - -i - t1 + f7 - 1 - t1 - -i - f7 - iw3- f6 + */ + +t0r = f0r + f1r; +t0i = f0i + f1i; +f1r = f0r - f1r; +f1i = f0i - f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r - t1i; +f3i = f1i + t1r; +f1r = f1r + t1i; +f1i = f1i - t1r; + + +t0r = f4r + f5r; +t0i = f4i + f5i; +f5r = f4r - f5r; +f5i = f4i - f5i; + +t1r = f6r - f7r; +t1i = f6i - f7i; +f6r = f6r + f7r; +f6i = f6i + f7i; + +f4r = t0r + f6r; +f4i = t0i + f6i; +f6r = t0r - f6r; +f6i = t0i - f6i; + +f7r = f5r - t1i; +f7i = f5i + t1r; +f5r = f5r + t1i; +f5i = f5i - t1r; + + +t0r = f0r - f4r; +t0i = f4i - f0i; // neg for rfft +f0r = f0r + f4r; +f0i = f0i + f4i; + +t1r = f2r - f6i; +t1i = f2i + f6r; +f2r = f2r + f6i; +f2i = f2i - f6r; + +f4r = f1r - f5r * w0r - f5i * w0r; +f4i = f1i + f5r * w0r - f5i * w0r; +f1r = f1r * Two - f4r; +f1i = f1i * Two - f4i; + +f6r = f3r + f7r * w0r - f7i * w0r; +f6i = f3i + f7r * w0r + f7i * w0r; +f3r = f3r * Two - f6r; +f3i = f3i * Two - f6i; + + /* finish rfft */ +f5r = f0r + f0i; /* compute Re(x[0]) */ +f5i = f0r - f0i; /* compute Re(x[N/2]) */ + +f0r = f2r + t1r; +f0i = f2i - t1i; +f7r = f2i + t1i; +f7i = t1r - f2r; + +f2r = f0r + w0r * f7r + w0r * f7i; +f2i = f0i - w0r * f7r + w0r * f7i; +t1r = Two * f0r - f2r; +t1i = f2i - Two * f0i; + + +f0r = f1r + f6r; +f0i = f1i - f6i; +f7r = f1i + f6i; +f7i = f6r - f1r; + +f1r = f0r + w1r * f7r + w1i * f7i; +f1i = f0i - w1i * f7r + w1r * f7i; +f6r = Two * f0r - f1r; +f6i = f1i - Two * f0i; + +f0r = f3r + f4r; +f0i = f3i - f4i; +f7r = f3i + f4i; +f7i = f4r - f3r; + +f3r = f0r + w1i * f7r + w1r * f7i; 
+f3i = f0i - w1r * f7r + w1i * f7i; +f4r = Two * f0r - f3r; +f4i = f3i - Two * f0i; + + /* store result */ +ioptr[8] = t0r; +ioptr[9] = t0i; +ioptr[0] = f5r; +ioptr[1] = f5i; + +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[12] = scale*t1r; +ioptr[13] = scale*t1i; + +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +ioptr[10] = scale*f4r; +ioptr[11] = scale*f4i; +ioptr[14] = scale*f6r; +ioptr[15] = scale*f6i; +} + +inline void frstage(float *ioptr, long M, float *Utbl); +inline void frstage(float *ioptr, long M, float *Utbl){ +/* Finish RFFT */ + +unsigned long pos; +unsigned long posi; +unsigned long diffUcnt; + +float *p0r, *p1r; +float *u0r, *u0i; + +float w0r, w0i; +float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + +pos = POW2(M-1); +posi = pos + 1; + +p0r = ioptr; +p1r = ioptr + pos/2; + +u0r = Utbl + POW2(M-3); + +w0r = *u0r, + +f0r = *(p0r); +f0i = *(p0r + 1); +f4r = *(p0r + pos); +f4i = *(p0r + posi); +f1r = *(p1r); +f1i = *(p1r + 1); +f5r = *(p1r + pos); +f5i = *(p1r + posi); + + t0r = Two * f0r + Two * f0i; /* compute Re(x[0]) */ + t0i = Two * f0r - Two * f0i; /* compute Re(x[N/2]) */ + t1r = f4r + f4r; + t1i = -f4i - f4i; + + f0r = f1r + f5r; + f0i = f1i - f5i; + f4r = f1i + f5i; + f4i = f5r - f1r; + + f1r = f0r + w0r * f4r + w0r * f4i; + f1i = f0i - w0r * f4r + w0r * f4i; + f5r = Two * f0r - f1r; + f5i = f1i - Two * f0i; + +*(p0r) = t0r; +*(p0r + 1) = t0i; +*(p0r + pos) = t1r; +*(p0r + posi) = t1i; +*(p1r) = f1r; +*(p1r + 1) = f1i; +*(p1r + pos) = f5r; +*(p1r + posi) = f5i; + +u0r = Utbl + 1; +u0i = Utbl + (POW2(M-2)-1); + +w0r = *u0r, +w0i = *u0i; + +p0r = (ioptr + 2); +p1r = (ioptr + (POW2(M-2)-1)*2); + + /* Butterflys */ + /* + f0 - t0 - - f0 + f5 - t1 - w0 - f5 + + f1 - t0 - - f1 + f4 - t1 -iw0 - f4 + */ + +for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){ + + f0r = *(p0r); + f0i = *(p0r + 1); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + f1r = 
*(p1r); + f1i = *(p1r + 1); + f4r = *(p0r + pos); + f4i = *(p0r + posi); + + t0r = f0r + f5r; + t0i = f0i - f5i; + t1r = f0i + f5i; + t1i = f5r - f0r; + + f0r = t0r + w0r * t1r + w0i * t1i; + f0i = t0i - w0i * t1r + w0r * t1i; + f5r = Two * t0r - f0r; + f5i = f0i - Two * t0i; + + t0r = f1r + f4r; + t0i = f1i - f4i; + t1r = f1i + f4i; + t1i = f4r - f1r; + + f1r = t0r + w0i * t1r + w0r * t1i; + f1i = t0i - w0r * t1r + w0i * t1i; + f4r = Two * t0r - f1r; + f4i = f1i - Two * t0i; + + *(p0r) = f0r; + *(p0r + 1) = f0i; + *(p1r + pos) = f5r; + *(p1r + posi) = f5i; + + w0r = *++u0r; + w0i = *--u0i; + + *(p1r) = f1r; + *(p1r + 1) = f1i; + *(p0r + pos) = f4r; + *(p0r + posi) = f4i; + + p0r += 2; + p1r -= 2; +}; +} + +void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). 
*/ + +float scale; +long StageCnt; +long NDiffU; + +M=M-1; +switch (M){ +case -1: + break; +case 0: + for (;Rows>0;Rows--){ + rfft1pt(ioptr); /* a 2 pt fft */ + ioptr += 2*POW2(M); + } +case 1: + for (;Rows>0;Rows--){ + rfft2pt(ioptr); /* a 4 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 2: + for (;Rows>0;Rows--){ + rfft4pt(ioptr); /* an 8 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 3: + for (;Rows>0;Rows--){ + rfft8pt(ioptr); /* a 16 pt fft */ + ioptr += 2*POW2(M); + } + break; +default: + scale = 0.5; + for (;Rows>0;Rows--){ + + scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ + + StageCnt = (M-1) / 3; // number of radix 8 stages + NDiffU = 2; // one radix 2 stage already complete + + if ( (M-1-(StageCnt * 3)) == 1 ){ + bfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ + NDiffU *= 2; + } + + if ( (M-1-(StageCnt * 3)) == 2 ){ + bfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ + NDiffU *= 4; + } + + if (M <= MCACHE){ + bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ + frstage(ioptr, M+1, Utbl); + } + + else{ + fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ + frstage(ioptr, M+1, Utbl); + } + + ioptr += 2*POW2(M); + } +} +} + +/************************************************ +parts of riffts1 +*************************************************/ + +inline void rifft1pt(float *ioptr, float scale); +inline void rifft1pt(float *ioptr, float scale){ +/*** RADIX 2 rifft ***/ +float f0r, f0i; +float t0r, t0i; + + /* bit reversed load */ +f0r = ioptr[0]; +f0i = ioptr[1]; + + /* finish rfft */ +t0r = f0r + f0i; +t0i = f0r - f0i; + + /* store result */ +ioptr[0] = scale*t0r; +ioptr[1] = scale*t0i; +} + +inline void rifft2pt(float *ioptr, float scale); +inline void rifft2pt(float *ioptr, float scale){ +/*** RADIX 4 rifft ***/ +float f0r, f0i, f1r, f1i; +float t0r, t0i; +const float Two = 2.0; + + /* bit reversed load */ +t0r = ioptr[0]; +t0i = ioptr[1]; +f1r = Two * ioptr[2]; +f1i = Two * ioptr[3]; + + 
/* start rifft */ +f0r = t0r + t0i; +f0i = t0r - t0i; + /* Butterflys */ + /* + f0 - - t0 + f1 - 1 - f1 + */ + +t0r = f0r + f1r; +t0i = f0i - f1i; +f1r = f0r - f1r; +f1i = f0i + f1i; + + /* store result */ +ioptr[0] = scale*t0r; +ioptr[1] = scale*t0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +} + +inline void rifft4pt(float *ioptr, float scale); +inline void rifft4pt(float *ioptr, float scale){ +/*** RADIX 8 rifft ***/ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float t0r, t0i, t1r, t1i; +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +const float Two = 2.0; + + /* bit reversed load */ +t0r = ioptr[0]; +t0i = ioptr[1]; +f2r = ioptr[2]; +f2i = ioptr[3]; +f1r = Two * ioptr[4]; +f1i = Two * ioptr[5]; +f3r = ioptr[6]; +f3i = ioptr[7]; + + /* start rfft */ +f0r = t0r + t0i; /* compute Re(x[0]) */ +f0i = t0r - t0i; /* compute Re(x[N/2]) */ + +t1r = f2r + f3r; +t1i = f2i - f3i; +t0r = f2r - f3r; +t0i = f2i + f3i; + +f2r = t1r - w0r * t0r - w0r * t0i; +f2i = t1i + w0r * t0r - w0r * t0i; +f3r = Two * t1r - f2r; +f3i = f2i - Two * t1i; + + /* Butterflys */ + /* + f0 - - t0 - - f0 + f1 - 1 - f1 - - f1 + f2 - - f2 - 1 - f2 + f3 - 1 - t1 - i - f3 + */ + +t0r = f0r + f1r; +t0i = f0i - f1i; +f1r = f0r - f1r; +f1i = f0i + f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r + t1i; +f3i = f1i - t1r; +f1r = f1r - t1i; +f1i = f1i + t1r; + + /* store result */ +ioptr[0] = scale*f0r; +ioptr[1] = scale*f0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +} + +inline void rifft8pt(float *ioptr, float scale); +inline void rifft8pt(float *ioptr, float scale){ +/*** RADIX 16 rifft ***/ +float w0r = 1.0/MYROOT2; /* cos(pi/4) */ +float w1r = MYCOSPID8; /* cos(pi/8) */ +float w1i = MYSINPID8; /* sin(pi/8) */ +float f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i; +float f4r, f4i, f5r, f5i, f6r, f6i, 
f7r, f7i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + + /* bit reversed load */ +t0r = ioptr[0]; +t0i = ioptr[1]; +f4r = ioptr[2]; +f4i = ioptr[3]; +f2r = ioptr[4]; +f2i = ioptr[5]; +f6r = ioptr[6]; +f6i = ioptr[7]; +f1r = Two * ioptr[8]; +f1i = Two * ioptr[9]; +f5r = ioptr[10]; +f5i = ioptr[11]; +f3r = ioptr[12]; +f3i = ioptr[13]; +f7r = ioptr[14]; +f7i = ioptr[15]; + + + /* start rfft */ +f0r = t0r + t0i; /* compute Re(x[0]) */ +f0i = t0r - t0i; /* compute Re(x[N/2]) */ + +t0r = f2r + f3r; +t0i = f2i - f3i; +t1r = f2r - f3r; +t1i = f2i + f3i; + +f2r = t0r - w0r * t1r - w0r * t1i; +f2i = t0i + w0r * t1r - w0r * t1i; +f3r = Two * t0r - f2r; +f3i = f2i - Two * t0i; + +t0r = f4r + f7r; +t0i = f4i - f7i; +t1r = f4r - f7r; +t1i = f4i + f7i; + +f4r = t0r - w1i * t1r - w1r * t1i; +f4i = t0i + w1r * t1r - w1i * t1i; +f7r = Two * t0r - f4r; +f7i = f4i - Two * t0i; + +t0r = f6r + f5r; +t0i = f6i - f5i; +t1r = f6r - f5r; +t1i = f6i + f5i; + +f6r = t0r - w1r * t1r - w1i * t1i; +f6i = t0i + w1i * t1r - w1r * t1i; +f5r = Two * t0r - f6r; +f5i = f6i - Two * t0i; + + /* Butterflys */ + /* + f0 - - t0 - - f0 - - f0 + f1* - 1 - f1 - - f1 - - f1 + f2 - - f2 - 1 - f2 - - f2 + f3 - 1 - t1 - i - f3 - - f3 + f4 - - t0 - - f4 - 1 - t0 + f5 - 1 - f5 - - f5 - w3 - f4 + f6 - - f6 - 1 - f6 - i - t1 + f7 - 1 - t1 - i - f7 - iw3- f6 + */ + +t0r = f0r + f1r; +t0i = f0i - f1i; +f1r = f0r - f1r; +f1i = f0i + f1i; + +t1r = f2r - f3r; +t1i = f2i - f3i; +f2r = f2r + f3r; +f2i = f2i + f3i; + +f0r = t0r + f2r; +f0i = t0i + f2i; +f2r = t0r - f2r; +f2i = t0i - f2i; + +f3r = f1r + t1i; +f3i = f1i - t1r; +f1r = f1r - t1i; +f1i = f1i + t1r; + + +t0r = f4r + f5r; +t0i = f4i + f5i; +f5r = f4r - f5r; +f5i = f4i - f5i; + +t1r = f6r - f7r; +t1i = f6i - f7i; +f6r = f6r + f7r; +f6i = f6i + f7i; + +f4r = t0r + f6r; +f4i = t0i + f6i; +f6r = t0r - f6r; +f6i = t0i - f6i; + +f7r = f5r + t1i; +f7i = f5i - t1r; +f5r = f5r - t1i; +f5i = f5i + t1r; + + +t0r = f0r - f4r; +t0i = f0i - f4i; +f0r = f0r + f4r; +f0i = 
f0i + f4i; + +t1r = f2r + f6i; +t1i = f2i - f6r; +f2r = f2r - f6i; +f2i = f2i + f6r; + +f4r = f1r - f5r * w0r + f5i * w0r; +f4i = f1i - f5r * w0r - f5i * w0r; +f1r = f1r * Two - f4r; +f1i = f1i * Two - f4i; + +f6r = f3r + f7r * w0r + f7i * w0r; +f6i = f3i - f7r * w0r + f7i * w0r; +f3r = f3r * Two - f6r; +f3i = f3i * Two - f6i; + + /* store result */ +ioptr[0] = scale*f0r; +ioptr[1] = scale*f0i; +ioptr[2] = scale*f1r; +ioptr[3] = scale*f1i; +ioptr[4] = scale*f2r; +ioptr[5] = scale*f2i; +ioptr[6] = scale*f3r; +ioptr[7] = scale*f3i; +ioptr[8] = scale*t0r; +ioptr[9] = scale*t0i; +ioptr[10] = scale*f4r; +ioptr[11] = scale*f4i; +ioptr[12] = scale*t1r; +ioptr[13] = scale*t1i; +ioptr[14] = scale*f6r; +ioptr[15] = scale*f6i; +} + +inline void ifrstage(float *ioptr, long M, float *Utbl); +inline void ifrstage(float *ioptr, long M, float *Utbl){ +/* Start RIFFT */ + +unsigned long pos; +unsigned long posi; +unsigned long diffUcnt; + +float *p0r, *p1r; +float *u0r, *u0i; + +float w0r, w0i; +float f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i; +float t0r, t0i, t1r, t1i; +const float Two = 2.0; + +pos = POW2(M-1); +posi = pos + 1; + +p0r = ioptr; +p1r = ioptr + pos/2; + +u0r = Utbl + POW2(M-3); + +w0r = *u0r, + +f0r = *(p0r); +f0i = *(p0r + 1); +f4r = *(p0r + pos); +f4i = *(p0r + posi); +f1r = *(p1r); +f1i = *(p1r + 1); +f5r = *(p1r + pos); +f5i = *(p1r + posi); + + t0r = f0r + f0i; + t0i = f0r - f0i; + t1r = f4r + f4r; + t1i = -f4i - f4i; + + f0r = f1r + f5r; + f0i = f1i - f5i; + f4r = f1r - f5r; + f4i = f1i + f5i; + + f1r = f0r - w0r * f4r - w0r * f4i; + f1i = f0i + w0r * f4r - w0r * f4i; + f5r = Two * f0r - f1r; + f5i = f1i - Two * f0i; + +*(p0r) = t0r; +*(p0r + 1) = t0i; +*(p0r + pos) = t1r; +*(p0r + posi) = t1i; +*(p1r) = f1r; +*(p1r + 1) = f1i; +*(p1r + pos) = f5r; +*(p1r + posi) = f5i; + +u0r = Utbl + 1; +u0i = Utbl + (POW2(M-2)-1); + +w0r = *u0r, +w0i = *u0i; + +p0r = (ioptr + 2); +p1r = (ioptr + (POW2(M-2)-1)*2); + + /* Butterflys */ + /* + f0 - t0 - f0 + f1 - t1 -w0- f1 + + 
f2 - t0 - f2 + f3 - t1 -iw0- f3 + */ + +for (diffUcnt = POW2(M-3)-1; diffUcnt > 0 ; diffUcnt--){ + + f0r = *(p0r); + f0i = *(p0r + 1); + f5r = *(p1r + pos); + f5i = *(p1r + posi); + f1r = *(p1r); + f1i = *(p1r + 1); + f4r = *(p0r + pos); + f4i = *(p0r + posi); + + t0r = f0r + f5r; + t0i = f0i - f5i; + t1r = f0r - f5r; + t1i = f0i + f5i; + + f0r = t0r - w0i * t1r - w0r * t1i; + f0i = t0i + w0r * t1r - w0i * t1i; + f5r = Two * t0r - f0r; + f5i = f0i - Two * t0i; + + t0r = f1r + f4r; + t0i = f1i - f4i; + t1r = f1r - f4r; + t1i = f1i + f4i; + + f1r = t0r - w0r * t1r - w0i * t1i; + f1i = t0i + w0i * t1r - w0r * t1i; + f4r = Two * t0r - f1r; + f4i = f1i - Two * t0i; + + *(p0r) = f0r; + *(p0r + 1) = f0i; + *(p1r + pos) = f5r; + *(p1r + posi) = f5i; + + w0r = *++u0r; + w0i = *--u0i; + + *(p1r) = f1r; + *(p1r + 1) = f1i; + *(p0r + pos) = f4r; + *(p0r + posi) = f4i; + + p0r += 2; + p1r -= 2; +}; +} + +void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow){ +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts1 */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). 
*/ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = real output data array */ + +float scale; +long StageCnt; +long NDiffU; + +scale = 1.0/POW2(M); +M=M-1; +switch (M){ +case -1: + break; +case 0: + for (;Rows>0;Rows--){ + rifft1pt(ioptr, scale); /* a 2 pt fft */ + ioptr += 2*POW2(M); + } +case 1: + for (;Rows>0;Rows--){ + rifft2pt(ioptr, scale); /* a 4 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 2: + for (;Rows>0;Rows--){ + rifft4pt(ioptr, scale); /* an 8 pt fft */ + ioptr += 2*POW2(M); + } + break; +case 3: + for (;Rows>0;Rows--){ + rifft8pt(ioptr, scale); /* a 16 pt fft */ + ioptr += 2*POW2(M); + } + break; +default: + for (;Rows>0;Rows--){ + + ifrstage(ioptr, M+1, Utbl); + + scbitrevR2(ioptr, M, BRLow, scale); /* bit reverse and first radix 2 stage */ + + StageCnt = (M-1) / 3; // number of radix 8 stages + NDiffU = 2; // one radix 2 stage already complete + + if ( (M-1-(StageCnt * 3)) == 1 ){ + ibfR2(ioptr, M, NDiffU); /* 1 radix 2 stage */ + NDiffU *= 2; + } + + if ( (M-1-(StageCnt * 3)) == 2 ){ + ibfR4(ioptr, M, NDiffU); /* 1 radix 4 stage */ + NDiffU *= 4; + } + + if (M <= MCACHE){ + ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ + } + + else{ + ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); /* RADIX 8 Stages */ + } + + ioptr += 2*POW2(M); + } +} +} + diff --git a/src/maths/fft/fftlib.h b/src/maths/fft/fftlib.h new file mode 100644 index 000000000..86de4acfa --- /dev/null +++ b/src/maths/fft/fftlib.h @@ -0,0 +1,76 @@ +#define MYRECIPLN2 1.442695040888963407359924681001892137426 // 1.0/log(2) + +/* some useful conversions between a number and its power of 2 */ +#define LOG2(a) (MYRECIPLN2*log(a)) // floating point logarithm base 2 +#define POW2(m) ((unsigned long) 1 << (m)) // integer power of 2 for m<32 + +/******************************************************************* +lower level 
fft stuff called by routines in fftext.c and fft2d.c +*******************************************************************/ + +void fftCosInit(long M, float *Utbl); +/* Compute Utbl, the cosine table for ffts */ +/* of size (pow(2,M)/4 +1) */ +/* INPUTS */ +/* M = log2 of fft size */ +/* OUTPUTS */ +/* *Utbl = cosine table */ + +void fftBRInit(long M, short *BRLow); +/* Compute BRLow, the bit reversed table for ffts */ +/* of size pow(2,M/2 -1) */ +/* INPUTS */ +/* M = log2 of fft size */ +/* OUTPUTS */ +/* *BRLow = bit reversed counter table */ + +void ffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); +/* Compute in-place complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size (ex M=10 for 1024 point fft) */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void iffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); +/* Compute in-place inverse complex fft on the rows of the input array */ +/* INPUTS */ +/* *ioptr = input data array */ +/* M = log2 of fft size */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = output data array */ + +void rffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); +/* Compute in-place real fft on the rows of the input array */ +/* The result is the complex spectra of the positive frequencies */ +/* except the location for the first complex number contains the real */ +/* values for DC and Nyquest */ +/* INPUTS */ +/* *ioptr = real input data array */ +/* M = log2 of fft size */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed 
counter table */ +/* OUTPUTS */ +/* *ioptr = output data array in the following order */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ + + +void riffts1(float *ioptr, long M, long Rows, float *Utbl, short *BRLow); +/* Compute in-place real ifft on the rows of the input array */ +/* data order as from rffts1 */ +/* INPUTS */ +/* *ioptr = input data array in the following order */ +/* M = log2 of fft size */ +/* Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]). */ +/* Rows = number of rows in ioptr array (use Rows of 1 if ioptr is a 1 dimensional array) */ +/* *Utbl = cosine table */ +/* *BRLow = bit reversed counter table */ +/* OUTPUTS */ +/* *ioptr = real output data array */ diff --git a/src/maths/fft/matlib.c b/src/maths/fft/matlib.c new file mode 100644 index 000000000..4e8b682e1 --- /dev/null +++ b/src/maths/fft/matlib.c @@ -0,0 +1,297 @@ +/* a few routines from a vector/matrix library */ +#include "matlib.h" + +void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){ +/* not in-place matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +float *irow; /* pointer to input row start */ +float *ocol; /* pointer to output col start */ +float *idata; /* pointer to input data */ +float *odata; /* pointer to output data */ +long RowCnt; /* row counter */ +long ColCnt; /* col counter */ +float T0; /* data storage */ +float T1; /* data storage */ +float T2; /* data storage */ +float T3; /* data storage */ +float T4; /* data storage */ +float T5; /* data storage */ +float T6; /* data storage */ +float T7; /* data storage */ +const long inRsizd1 = iRsiz; 
+const long inRsizd2 = 2*iRsiz; +const long inRsizd3 = inRsizd2+iRsiz; +const long inRsizd4 = 4*iRsiz; +const long inRsizd5 = inRsizd3+inRsizd2; +const long inRsizd6 = inRsizd4+inRsizd2; +const long inRsizd7 = inRsizd4+inRsizd3; +const long inRsizd8 = 8*iRsiz; + +ocol = outdata; +irow = indata; +for (RowCnt=Nrows/8; RowCnt>0; RowCnt--){ + idata = irow; + odata = ocol; + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + T0 = *idata; + T1 = *(idata+inRsizd1); + T2 = *(idata+inRsizd2); + T3 = *(idata+inRsizd3); + T4 = *(idata+inRsizd4); + T5 = *(idata+inRsizd5); + T6 = *(idata+inRsizd6); + T7 = *(idata+inRsizd7); + *odata = T0; + *(odata+1) = T1; + *(odata+2) = T2; + *(odata+3) = T3; + *(odata+4) = T4; + *(odata+5) = T5; + *(odata+6) = T6; + *(odata+7) = T7; + idata++; + odata += oRsiz; + } + irow += inRsizd8; + ocol += 8; +} +if (Nrows%8 != 0){ + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + idata = irow++; + odata = ocol; + ocol += oRsiz; + for (RowCnt=Nrows%8; RowCnt>0; RowCnt--){ + T0 = *idata; + *odata++ = T0; + idata += iRsiz; + } + } +} +} + +void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols){ +/* not in-place complex float matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +float *irow; /* pointer to input row start */ +float *ocol; /* pointer to output col start */ +float *idata; /* pointer to input data */ +float *odata; /* pointer to output data */ +long RowCnt; /* row counter */ +long ColCnt; /* col counter */ +float T0r; /* data storage */ +float T0i; /* data storage */ +float T1r; /* data storage */ +float T1i; /* data storage */ +float T2r; /* data storage */ +float T2i; /* data storage */ +float T3r; /* data storage */ +float T3i; /* data 
storage */ +const long inRsizd1 = 2*iRsiz; +const long inRsizd1i = 2*iRsiz + 1; +const long inRsizd2 = 4*iRsiz; +const long inRsizd2i = 4*iRsiz + 1; +const long inRsizd3 = inRsizd2+inRsizd1; +const long inRsizd3i = inRsizd2+inRsizd1 + 1; +const long inRsizd4 = 8*iRsiz; + +ocol = outdata; +irow = indata; +for (RowCnt=Nrows/4; RowCnt>0; RowCnt--){ + idata = irow; + odata = ocol; + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + T0r = *idata; + T0i = *(idata +1); + T1r = *(idata+inRsizd1); + T1i = *(idata+inRsizd1i); + T2r = *(idata+inRsizd2); + T2i = *(idata+inRsizd2i); + T3r = *(idata+inRsizd3); + T3i = *(idata+inRsizd3i); + *odata = T0r; + *(odata+1) = T0i; + *(odata+2) = T1r; + *(odata+3) = T1i; + *(odata+4) = T2r; + *(odata+5) = T2i; + *(odata+6) = T3r; + *(odata+7) = T3i; + idata+=2; + odata += 2*oRsiz; + } + irow += inRsizd4; + ocol += 8; +} +if (Nrows%4 != 0){ + for (ColCnt=Ncols; ColCnt>0; ColCnt--){ + idata = irow; + odata = ocol; + for (RowCnt=Nrows%4; RowCnt>0; RowCnt--){ + T0r = *idata; + T0i = *(idata+1); + *odata = T0r; + *(odata+1) = T0i; + odata+=2; + idata += 2*iRsiz; + } + irow+=2; + ocol += 2*oRsiz; + } +} +} + +void cvprod(float *a, float *b, float *out, long N){ +/* complex vector product, can be in-place */ +/* product of complex vector *a times complex vector *b */ +/* INPUTS */ +/* N vector length */ +/* *a complex vector length N complex numbers */ +/* *b complex vector length N complex numbers */ +/* OUTPUTS */ +/* *out complex vector length N */ + +long OutCnt; /* output counter */ +float A0R; /* A storage */ +float A0I; /* A storage */ +float A1R; /* A storage */ +float A1I; /* A storage */ +float A2R; /* A storage */ +float A2I; /* A storage */ +float A3R; /* A storage */ +float A3I; /* A storage */ +float B0R; /* B storage */ +float B0I; /* B storage */ +float B1R; /* B storage */ +float B1I; /* B storage */ +float B2R; /* B storage */ +float B2I; /* B storage */ +float B3R; /* B storage */ +float B3I; /* B storage */ +float T0R; /* TMP 
storage */ +float T0I; /* TMP storage */ +float T1R; /* TMP storage */ +float T1I; /* TMP storage */ +float T2R; /* TMP storage */ +float T2I; /* TMP storage */ +float T3R; /* TMP storage */ +float T3I; /* TMP storage */ + +if (N>=4){ + A0R = *a; + B0R = *b; + A0I = *(a +1); + B0I = *(b +1); + A1R = *(a +2); + B1R = *(b +2); + A1I = *(a +3); + B1I = *(b +3); + A2R = *(a +4); + B2R = *(b +4); + A2I = *(a +5); + B2I = *(b +5); + A3R = *(a +6); + B3R = *(b +6); + A3I = *(a +7); + B3I = *(b +7); + T0R = A0R * B0R; + T0I = (A0R * B0I); + T1R = A1R * B1R; + T1I = (A1R * B1I); + T2R = A2R * B2R; + T2I = (A2R * B2I); + T3R = A3R * B3R; + T3I = (A3R * B3I); + T0R -= (A0I * B0I); + T0I = A0I * B0R + T0I; + T1R -= (A1I * B1I); + T1I = A1I * B1R + T1I; + T2R -= (A2I * B2I); + T2I = A2I * B2R + T2I; + T3R -= (A3I * B3I); + T3I = A3I * B3R + T3I; + for (OutCnt=N/4-1; OutCnt > 0; OutCnt--){ + a += 8; + b += 8; + A0R = *a; + B0R = *b; + A0I = *(a +1); + B0I = *(b +1); + A1R = *(a +2); + B1R = *(b +2); + A1I = *(a +3); + B1I = *(b +3); + A2R = *(a +4); + B2R = *(b +4); + A2I = *(a +5); + B2I = *(b +5); + A3R = *(a +6); + B3R = *(b +6); + A3I = *(a +7); + B3I = *(b +7); + *out = T0R; + *(out +1) = T0I; + *(out +2) = T1R; + *(out +3) = T1I; + *(out +4) = T2R; + *(out +5) = T2I; + *(out +6) = T3R; + *(out +7) = T3I; + T0R = A0R * B0R; + T0I = (A0R * B0I); + T1R = A1R * B1R; + T1I = (A1R * B1I); + T2R = A2R * B2R; + T2I = (A2R * B2I); + T3R = A3R * B3R; + T3I = (A3R * B3I); + T0R -= (A0I * B0I); + T0I = A0I * B0R + T0I; + T1R -= (A1I * B1I); + T1I = A1I * B1R + T1I; + T2R -= (A2I * B2I); + T2I = A2I * B2R + T2I; + T3R -= (A3I * B3I); + T3I = A3I * B3R + T3I; + out += 8; + } + a += 8; + b += 8; + *out = T0R; + *(out +1) = T0I; + *(out +2) = T1R; + *(out +3) = T1I; + *(out +4) = T2R; + *(out +5) = T2I; + *(out +6) = T3R; + *(out +7) = T3I; + out += 8; +} +for (OutCnt=N%4; OutCnt > 0; OutCnt--){ + A0R = *a++; + B0R = *b++; + A0I = *a++; + B0I = *b++; + T0R = A0R * B0R; + T0I = (A0R * 
B0I); + T0R -= (A0I * B0I); + T0I = A0I * B0R + T0I; + *out++ = T0R; + *out++ = T0I; +} +} diff --git a/src/maths/fft/matlib.h b/src/maths/fft/matlib.h new file mode 100644 index 000000000..baf9e6e7f --- /dev/null +++ b/src/maths/fft/matlib.h @@ -0,0 +1,33 @@ +/* a few routines from a vector/matrix library */ + +void xpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols); +/* not in-place matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +void cxpose(float *indata, long iRsiz, float *outdata, long oRsiz, long Nrows, long Ncols); +/* not in-place complex matrix transpose */ +/* INPUTS */ +/* *indata = input data array */ +/* iRsiz = offset to between rows of input data array */ +/* oRsiz = offset to between rows of output data array */ +/* Nrows = number of rows in input data array */ +/* Ncols = number of columns in input data array */ +/* OUTPUTS */ +/* *outdata = output data array */ + +void cvprod(float *a, float *b, float *out, long N); +/* complex vector product, can be in-place */ +/* product of complex vector *a times complex vector *b */ +/* INPUTS */ +/* N vector length */ +/* *a complex vector length N complex numbers */ +/* *b complex vector length N complex numbers */ +/* OUTPUTS */ +/* *out complex vector length N */ diff --git a/src/maths/misc/randnumb.c b/src/maths/misc/randnumb.c index b6e5d79b3..6b4dd62c1 100644 --- a/src/maths/misc/randnumb.c +++ b/src/maths/misc/randnumb.c @@ -216,3 +216,27 @@ double gauss(void) return glgset; } } + +/* Polar form of the Box-Muller generator for Gaussian distributed + random variates. + Generator will be fed with two uniformly distributed random variates. 
+ Delivers two values per call +*/ + +void rgauss(double* py1, double* py2) +{ + double x1, x2, w; + + do { + x1 = 2.0 * CombLCGTaus() - 1.0; + x2 = 2.0 * CombLCGTaus() - 1.0; + w = x1 * x1 + x2 * x2; + } while ( w >= 1.0 ); + + w = sqrt( (-2.0 * log( w ) ) / w ); + + *py1 = x1 * w; + *py2 = x2 * w; +} + + diff --git a/src/spicelib/devices/vsrc/vsrc.c b/src/spicelib/devices/vsrc/vsrc.c index e498a4d84..c0d627372 100644 --- a/src/spicelib/devices/vsrc/vsrc.c +++ b/src/spicelib/devices/vsrc/vsrc.c @@ -21,8 +21,9 @@ IFparm VSRCpTable[] = { /* parameters */ IOP ("pwl", VSRC_PWL, IF_REALVEC,"Piecewise linear description"), IOP ("sffm", VSRC_SFFM, IF_REALVEC,"Single freq. FM descripton"), IOP ("am", VSRC_AM, IF_REALVEC,"Amplitude modulation descripton"), + IOP ("trnoise", VSRC_TRNOISE, IF_REALVEC,"Transient noise descripton"), -OPU ("pos_node",VSRC_POS_NODE, IF_INTEGER,"Positive node of source"), + OPU ("pos_node",VSRC_POS_NODE, IF_INTEGER,"Positive node of source"), OPU ("neg_node",VSRC_NEG_NODE, IF_INTEGER,"Negative node of source"), OPU ("function",VSRC_FCN_TYPE, IF_INTEGER,"Function of the source"), OPU ("order", VSRC_FCN_ORDER, IF_INTEGER,"Order of the source function"), diff --git a/src/spicelib/devices/vsrc/vsrcacct.c b/src/spicelib/devices/vsrc/vsrcacct.c index 60a56c18f..7d6243398 100644 --- a/src/spicelib/devices/vsrc/vsrcacct.c +++ b/src/spicelib/devices/vsrc/vsrcacct.c @@ -11,6 +11,10 @@ Author: 1985 Thomas L. 
Quarles #include "suffix.h" #include "missing_math.h" +extern int fftInit(long M); +extern void fftFree(void); +extern void rffts(float *data, long M, long Rows); + #define SAMETIME(a,b) (fabs((a)-(b))<= TIMETOL * PW) #define TIMETOL 1e-7 @@ -74,6 +78,7 @@ VSRCaccept(CKTcircuit *ckt, GENmodel *inModel) /* offset time by delay */ time = ckt->CKTtime - TD; tshift = TD; + #ifdef XSPICE /* normalize phase to 0 - 360° */ /* normalize phase to cycles */ @@ -180,6 +185,52 @@ VSRCaccept(CKTcircuit *ckt, GENmodel *inModel) } break; } + +/**** tansient noise routines: +VNoi2 2 0 DC 0 TRNOISE(10n 0.5n 0 0n) : generate gaussian distributed noise + rms value, time step, 0 0 +VNoi1 1 0 DC 0 TRNOISE(0n 0.5n 1 10n) : generate 1/f noise + 0, time step, exponent < 2, rms value +*/ + case TRNOISE: { + double NA, NT, TS, time, basetime = 0.; + +#define NSAMETIME(a,b) (fabs((a)-(b))<= NTIMETOL * TS) +#define NTIMETOL 1e-7 + + NA = here->VSRCcoeffs[0]; // input is rms value + NT = here->VSRCcoeffs[1]; // time step + if (NT == 0.) // no further breakpoint if value not given + break; +// TS = NT > ckt->CKTstep ? NT : ckt->CKTstep; + TS = NT; + time = ckt->CKTtime; + + if(time >= TS) { + /* repeating signal - figure out where we are + in period */ + basetime = TS * floor(time*1.000000000001/TS); +// basetime = TS * floor(time/TS); +// basetime = TS * here->VSRCncount; + time -= basetime; + } + if(ckt->CKTbreak && NSAMETIME(time,0)) { + /* set next breakpoint */ +// error = CKTsetBreak(ckt, TS * ((double)here->VSRCncount + 1.)); + error = CKTsetBreak(ckt, basetime + TS); + if(error) return(error); + } + /* else if (ckt->CKTbreak && NSAMETIME(time,TS)) { + // set next breakpoint + error = CKTsetBreak(ckt, basetime + TS + TS); + if(error) return(error); + } */ + if (ckt->CKTtime == 0.) 
{ +// printf("VSRC: free fft tables\n"); + fftFree(); + } + } + break; } } bkptset: ; diff --git a/src/spicelib/devices/vsrc/vsrcask.c b/src/spicelib/devices/vsrc/vsrcask.c index 6d22ae283..5754e8c5c 100644 --- a/src/spicelib/devices/vsrc/vsrcask.c +++ b/src/spicelib/devices/vsrc/vsrcask.c @@ -46,6 +46,7 @@ VSRCask(CKTcircuit *ckt, GENinstance *inst, int which, IFvalue *value, IFvalue * case VSRC_PWL: case VSRC_SFFM: case VSRC_AM: + case VSRC_TRNOISE: case VSRC_FCN_COEFFS: temp = value->v.numValue = here->VSRCfunctionOrder; v = value->v.vec.rVec = TMALLOC(double, here->VSRCfunctionOrder); diff --git a/src/spicelib/devices/vsrc/vsrcdefs.h b/src/spicelib/devices/vsrc/vsrcdefs.h index 05faa673c..c6ea14abb 100644 --- a/src/spicelib/devices/vsrc/vsrcdefs.h +++ b/src/spicelib/devices/vsrc/vsrcdefs.h @@ -48,7 +48,16 @@ typedef struct sVSRCinstance { double VSRCdF2mag; /* distortion f2 magnitude */ double VSRCdF1phase; /* distortion f1 phase */ double VSRCdF2phase; /* distortion f2 phase */ - + + /*transient noise*/ + double VSRCprevTime; /*last time a new random value was issued*/ + double VSRCprevVal; /*last value issued at prevTime*/ + double VSRCnewVal; /*new value issued at prevTime*/ + double VSRCsecRand; /*second random value not yet used*/ + float *VSRConeof; /*pointer to array of 1 over f noise values */ + long int VSRCncount; /* counter to retrieve noise values */ + /*end of noise*/ + double VSRCr; /* pwl repeat */ double VSRCrdelay; /* pwl delay period */ double *VSRCposIbrptr; /* pointer to sparse matrix element at @@ -93,6 +102,7 @@ typedef struct sVSRCmodel { #define SFFM 4 #define PWL 5 #define AM 6 +#define TRNOISE 7 #endif /*PULSE*/ /* device parameters */ @@ -121,6 +131,7 @@ typedef struct sVSRCmodel { #define VSRC_AM 22 #define VSRC_R 23 #define VSRC_TD 24 +#define VSRC_TRNOISE 25 /* model parameters */ diff --git a/src/spicelib/devices/vsrc/vsrcload.c b/src/spicelib/devices/vsrc/vsrcload.c index f730a71e4..ee1455992 100644 --- 
a/src/spicelib/devices/vsrc/vsrcload.c +++ b/src/spicelib/devices/vsrc/vsrcload.c @@ -11,6 +11,15 @@ $Id$ #include "trandefs.h" #include "sperror.h" #include "suffix.h" +#undef WaGauss +#ifdef FastRand +#include "FastNorm3.h" +#elif defined (WaGauss) +#include "wallace.h" +#else +extern void rgauss(double* py1, double* py2); +#endif +#include "1-f-code.h" #ifdef XSPICE_EXP /* gtri - begin - wbk - modify for supply ramping option */ @@ -27,7 +36,7 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) VSRCmodel *model = (VSRCmodel *)inModel; VSRCinstance *here; double time; - double value; + double value = 0.0; /* loop through all the voltage source models */ for( ; model != NULL; model = model->VSRCnextModel ) { @@ -35,7 +44,7 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) /* loop through all the instances of the model */ for (here = model->VSRCinstances; here != NULL ; here=here->VSRCnextInstance) { - if (here->VSRCowner != ARCHme) continue; + if (here->VSRCowner != ARCHme) continue; *(here->VSRCposIbrptr) += 1.0 ; *(here->VSRCnegIbrptr) -= 1.0 ; @@ -63,29 +72,29 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) } case PULSE: { - double V1, V2, TD, TR, TF, PW, PER; - double basetime = 0; + double V1, V2, TD, TR, TF, PW, PER; + double basetime = 0; #ifdef XSPICE double PHASE; double phase; double deltat; #endif - V1 = here->VSRCcoeffs[0]; - V2 = here->VSRCcoeffs[1]; - TD = here->VSRCfunctionOrder > 2 - ? here->VSRCcoeffs[2] : 0.0; - TR = here->VSRCfunctionOrder > 3 - && here->VSRCcoeffs[3] != 0.0 - ? here->VSRCcoeffs[3] : ckt->CKTstep; - TF = here->VSRCfunctionOrder > 4 - && here->VSRCcoeffs[4] != 0.0 - ? here->VSRCcoeffs[4] : ckt->CKTstep; - PW = here->VSRCfunctionOrder > 5 - && here->VSRCcoeffs[5] != 0.0 - ? here->VSRCcoeffs[5] : ckt->CKTfinalTime; - PER = here->VSRCfunctionOrder > 6 - && here->VSRCcoeffs[6] != 0.0 - ? here->VSRCcoeffs[6] : ckt->CKTfinalTime; + V1 = here->VSRCcoeffs[0]; + V2 = here->VSRCcoeffs[1]; + TD = here->VSRCfunctionOrder > 2 + ? 
here->VSRCcoeffs[2] : 0.0; + TR = here->VSRCfunctionOrder > 3 + && here->VSRCcoeffs[3] != 0.0 + ? here->VSRCcoeffs[3] : ckt->CKTstep; + TF = here->VSRCfunctionOrder > 4 + && here->VSRCcoeffs[4] != 0.0 + ? here->VSRCcoeffs[4] : ckt->CKTstep; + PW = here->VSRCfunctionOrder > 5 + && here->VSRCcoeffs[5] != 0.0 + ? here->VSRCcoeffs[5] : ckt->CKTfinalTime; + PER = here->VSRCfunctionOrder > 6 + && here->VSRCcoeffs[6] != 0.0 + ? here->VSRCcoeffs[6] : ckt->CKTfinalTime; /* shift time by delay time TD */ time -= TD; @@ -126,25 +135,25 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) case SINE: { - double VO, VA, FREQ, TD, THETA; -/* gtri - begin - wbk - add PHASE parameter */ + double VO, VA, FREQ, TD, THETA; + /* gtri - begin - wbk - add PHASE parameter */ #ifdef XSPICE double PHASE; - double phase; + double phase; PHASE = here->VSRCfunctionOrder > 5 - ? here->VSRCcoeffs[5] : 0.0; + ? here->VSRCcoeffs[5] : 0.0; - /* compute phase in radians */ + /* compute phase in radians */ phase = PHASE * M_PI / 180.0; #endif VO = here->VSRCcoeffs[0]; - VA = here->VSRCcoeffs[1]; + VA = here->VSRCcoeffs[1]; FREQ = here->VSRCfunctionOrder > 2 - && here->VSRCcoeffs[2] != 0.0 - ? here->VSRCcoeffs[2] : (1/ckt->CKTfinalTime); - TD = here->VSRCfunctionOrder > 3 - ? here->VSRCcoeffs[3] : 0.0; + && here->VSRCcoeffs[2] != 0.0 + ? here->VSRCcoeffs[2] : (1/ckt->CKTfinalTime); + TD = here->VSRCfunctionOrder > 3 + ? here->VSRCcoeffs[3] : 0.0; THETA = here->VSRCfunctionOrder > 4 ? 
here->VSRCcoeffs[4] : 0.0; @@ -155,12 +164,12 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) } else { value = VO + VA * sin(FREQ*time * 2.0 * M_PI + phase) * - exp(-time*THETA); + exp(-time*THETA); #else value = VO; } else { value = VO + VA * sin(FREQ * time * 2.0 * M_PI) * - exp(-(time*THETA)); + exp(-(time*THETA)); #endif /* gtri - end - wbk - add PHASE parameter */ } @@ -168,24 +177,23 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) break; case EXP: { - double V1, V2, TD1, TD2, TAU1, TAU2; + double V1, V2, TD1, TD2, TAU1, TAU2; V1 = here->VSRCcoeffs[0]; - V2 = here->VSRCcoeffs[1]; - TD1 = here->VSRCfunctionOrder > 2 - && here->VSRCcoeffs[2] != 0.0 - ? here->VSRCcoeffs[2] : ckt->CKTstep; - TAU1 = here->VSRCfunctionOrder > 3 - && here->VSRCcoeffs[3] != 0.0 - ? here->VSRCcoeffs[3] : ckt->CKTstep; + V2 = here->VSRCcoeffs[1]; + TD1 = here->VSRCfunctionOrder > 2 + && here->VSRCcoeffs[2] != 0.0 + ? here->VSRCcoeffs[2] : ckt->CKTstep; + TAU1 = here->VSRCfunctionOrder > 3 + && here->VSRCcoeffs[3] != 0.0 + ? here->VSRCcoeffs[3] : ckt->CKTstep; TD2 = here->VSRCfunctionOrder > 4 - && here->VSRCcoeffs[4] != 0.0 - ? here->VSRCcoeffs[4] : TD1 + ckt->CKTstep; + && here->VSRCcoeffs[4] != 0.0 + ? here->VSRCcoeffs[4] : TD1 + ckt->CKTstep; TAU2 = here->VSRCfunctionOrder > 5 - && here->VSRCcoeffs[5] - ? here->VSRCcoeffs[5] : ckt->CKTstep; - - + && here->VSRCcoeffs[5] + ? here->VSRCcoeffs[5] : ckt->CKTstep; + if(time <= TD1) { value = V1; } else if (time <= TD2) { @@ -199,7 +207,7 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) case SFFM:{ - double VO, VA, FC, MDI, FS; + double VO, VA, FC, MDI, FS; /* gtri - begin - wbk - add PHASE parameters */ #ifdef XSPICE @@ -208,25 +216,24 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) double phases; PHASEC = here->VSRCfunctionOrder > 5 - ? here->VSRCcoeffs[5] : 0.0; + ? here->VSRCcoeffs[5] : 0.0; PHASES = here->VSRCfunctionOrder > 6 - ? here->VSRCcoeffs[6] : 0.0; + ? 
here->VSRCcoeffs[6] : 0.0; /* compute phases in radians */ phasec = PHASEC * M_PI / 180.0; phases = PHASES * M_PI / 180.0; - #endif - VO = here->VSRCcoeffs[0]; - VA = here->VSRCcoeffs[1]; - FC = here->VSRCfunctionOrder > 2 - && here->VSRCcoeffs[2] - ? here->VSRCcoeffs[2] : (1/ckt->CKTfinalTime); - MDI = here->VSRCfunctionOrder > 3 - ? here->VSRCcoeffs[3] : 0.0; - FS = here->VSRCfunctionOrder > 4 - && here->VSRCcoeffs[4] - ? here->VSRCcoeffs[4] : (1/ckt->CKTfinalTime); + VO = here->VSRCcoeffs[0]; + VA = here->VSRCcoeffs[1]; + FC = here->VSRCfunctionOrder > 2 + && here->VSRCcoeffs[2] + ? here->VSRCcoeffs[2] : (1/ckt->CKTfinalTime); + MDI = here->VSRCfunctionOrder > 3 + ? here->VSRCcoeffs[3] : 0.0; + FS = here->VSRCfunctionOrder > 4 + && here->VSRCcoeffs[4] + ? here->VSRCcoeffs[4] : (1/ckt->CKTfinalTime); #ifdef XSPICE /* compute waveform value */ value = VO + VA * @@ -242,10 +249,9 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) break; case AM:{ - double VA, FC, MF, VO, TD; + double VA, FC, MF, VO, TD; /* gtri - begin - wbk - add PHASE parameters */ #ifdef XSPICE - double PHASEC, PHASES; double phasec; double phases; @@ -260,49 +266,40 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) phases = PHASES * M_PI / 180.0; #endif - - VA = here->VSRCcoeffs[0]; - VO = here->VSRCcoeffs[1]; - MF = here->VSRCfunctionOrder > 2 - && here->VSRCcoeffs[2] - ? here->VSRCcoeffs[2] : (1/ckt->CKTfinalTime); - FC = here->VSRCfunctionOrder > 3 - ? here->VSRCcoeffs[3] : 0.0; - TD = here->VSRCfunctionOrder > 4 - && here->VSRCcoeffs[4] - ? here->VSRCcoeffs[4] : 0.0; + VA = here->VSRCcoeffs[0]; + VO = here->VSRCcoeffs[1]; + MF = here->VSRCfunctionOrder > 2 + && here->VSRCcoeffs[2] + ? here->VSRCcoeffs[2] : (1/ckt->CKTfinalTime); + FC = here->VSRCfunctionOrder > 3 + ? here->VSRCcoeffs[3] : 0.0; + TD = here->VSRCfunctionOrder > 4 + && here->VSRCcoeffs[4] + ? 
here->VSRCcoeffs[4] : 0.0; time -= TD; if (time <= 0) { value = 0; } else { #ifdef XSPICE - /* compute waveform value */ - value = VA * (VO + sin(2.0 * M_PI * MF * time + phases )) * - sin(2 * M_PI * FC * time + phases); + /* compute waveform value */ + value = VA * (VO + sin(2.0 * M_PI * MF * time + phases )) * + sin(2 * M_PI * FC * time + phases); #else /* XSPICE */ - value = VA * (VO + sin(2.0 * M_PI * MF * time)) * - sin(2 * M_PI * FC * time); + value = VA * (VO + sin(2.0 * M_PI * MF * time)) * + sin(2 * M_PI * FC * time); #endif - } + } /* gtri - end - wbk - add PHASE parameters */ - } - break; + } + break; case PWL: { int i = 0, num_repeat = 0, ii = 0; double foo, repeat_time = 0, end_time, breakpt_time, itime; time -= here->VSRCrdelay; -// if(time > PER) { - /* repeating signal - figure out where we are */ - /* in period */ -// basetime = PER * floor(time/PER); -// time -= basetime; -// } - - if(time < *(here->VSRCcoeffs)) { foo = *(here->VSRCcoeffs + 1) ; @@ -310,35 +307,243 @@ VSRCload(GENmodel *inModel, CKTcircuit *ckt) goto loadDone; } - do { - for(i=ii ; i<(here->VSRCfunctionOrder/2)-1; i++ ) { - itime = *(here->VSRCcoeffs+2*i); - if ( AlmostEqualUlps(itime+repeat_time, time, 3 )) { -// if ( fabs( (*(here->VSRCcoeffs+2*i)+repeat_time) - time ) < 1e-20 ) { - foo = *(here->VSRCcoeffs+2*i+1); - value = foo; - goto loadDone; - } else if ( (*(here->VSRCcoeffs+2*i)+repeat_time < time) && (*(here->VSRCcoeffs+2*(i+1))+repeat_time > time) ) { - foo = *(here->VSRCcoeffs+2*i+1) + (((time-(*(here->VSRCcoeffs+2*i)+repeat_time))/ - (*(here->VSRCcoeffs+2*(i+1)) - *(here->VSRCcoeffs+2*i))) * + do { + for(i=ii ; i<(here->VSRCfunctionOrder/2)-1; i++ ) { + itime = *(here->VSRCcoeffs+2*i); + if ( AlmostEqualUlps(itime+repeat_time, time, 3 )) { + foo = *(here->VSRCcoeffs+2*i+1); + value = foo; + goto loadDone; + } else if ( (*(here->VSRCcoeffs+2*i)+repeat_time < time) + && (*(here->VSRCcoeffs+2*(i+1))+repeat_time > time) ) { + foo = *(here->VSRCcoeffs+2*i+1) + 
(((time-(*(here->VSRCcoeffs+2*i)+repeat_time))/ + (*(here->VSRCcoeffs+2*(i+1)) - *(here->VSRCcoeffs+2*i))) * (*(here->VSRCcoeffs+2*i+3) - *(here->VSRCcoeffs+2*i+1))); - value = foo; - goto loadDone; - } - } - foo = *(here->VSRCcoeffs+ here->VSRCfunctionOrder-1) ; - value = foo; + value = foo; + goto loadDone; + } + } + foo = *(here->VSRCcoeffs+ here->VSRCfunctionOrder-1) ; + value = foo; - if ( !here->VSRCrGiven ) goto loadDone; + if ( !here->VSRCrGiven ) goto loadDone; - end_time = *(here->VSRCcoeffs + here->VSRCfunctionOrder-2); - breakpt_time = *(here->VSRCcoeffs + here->VSRCrBreakpt); - repeat_time = end_time + (end_time - breakpt_time)*num_repeat++ - breakpt_time; - ii = here->VSRCrBreakpt/2; - } while ( here->VSRCrGiven ); + end_time = *(here->VSRCcoeffs + here->VSRCfunctionOrder-2); + breakpt_time = *(here->VSRCcoeffs + here->VSRCrBreakpt); + repeat_time = end_time + (end_time - breakpt_time)*num_repeat++ - breakpt_time; + ii = here->VSRCrBreakpt/2; + } while ( here->VSRCrGiven ); break; } - } + +/**** tansient noise routines: +VNoi2 2 0 DC 0 TRNOISE(10n 0.5n 0 0n) : generate gaussian distributed noise + rms value, time step, 0 0 +VNoi1 1 0 DC 0 TRNOISE(0n 0.5n 1 10n) : generate 1/f noise + 0, time step, exponent < 2, rms value +*/ + case TRNOISE: { + /* Generate voltage point every TS with amplitude NA * ra, + where ra is drawn from a random number generator with + gaussian distribution with mean 0 and standard deviation 1 + */ + +//#define PRVAL +// typedef int bool; + + double newval=0.0, lastval=0.0, lasttime=0.0; + double NA, NT, TS; + double V1, V2, basetime = 0.; + double scalef, ra1, ra2; + float NALPHA, NAMP; + + long int nosteps, newsteps = 1, newexp = 0; + + bool aof = FALSE; + + NA = here->VSRCcoeffs[0]; // input is rms value + NT = here->VSRCcoeffs[1]; // time step + + scalef = NA; +// scalef = NA*1.32; + + NALPHA = here->VSRCfunctionOrder > 2 + ? 
(float)here->VSRCcoeffs[2] : 0.0f; + NAMP = here->VSRCfunctionOrder > 3 + && here->VSRCcoeffs[3] != 0.0 + && here->VSRCcoeffs[2] != 0.0 + ? (float)here->VSRCcoeffs[3] : 0.0f; + + if ((NT == 0.) || ((NA == 0.) && (NAMP == 0.))) { + value = here->VSRCdcValue; + goto noiDone; + } + else + TS = NT; /* time step for noise */ + + if ((NALPHA > 0.0) && (NAMP > 0.0)) aof = TRUE; + + lasttime = here->VSRCprevTime; + lastval = here->VSRCprevVal; + newval = here->VSRCnewVal; + /* set all data: DC, white, 1of */ + if (time <= 0 /*ckt->CKTstep*/) { + /* data are already set */ + if ((here->VSRCprevVal != 0) || (here->VSRCnewVal != 0)) { + value = here->VSRCprevVal; + goto noiDone; + } + lasttime = 0.0; + here->VSRCsecRand = 2.; /* > 1, invalid number out of the random number range */ + /* get two random samples */ +#ifdef FastRand + // use FastNorm3 + here->VSRCprevVal = scalef * GaussWa; + here->VSRCnewVal = scalef * GaussWa; +#elif defined (WaGauss) + // use WallaceHV + here->VSRCprevVal = scalef * GaussWa; + here->VSRCnewVal = scalef * GaussWa; +#else + // make use of two random variables per call to rgauss() + rgauss(&ra1, &ra2); + here->VSRCprevVal = scalef * ra1; + // choose to set start value to 0 + here->VSRCprevVal = 0; + here->VSRCnewVal = scalef * ra2; +#endif + /* generate 1 over f noise at time 0 */ + if (aof) { + if (here->VSRCncount==0) { + // add 10 steps for start up sequence + nosteps = (long)((ckt->CKTfinalTime)/TS) + 10; + // generate number of steps as power of 2 + while(newsteps < nosteps) { + newsteps <<= 1; + newexp++; + } + here->VSRConeof = TMALLOC(float, newsteps); //(float *)tmalloc(sizeof(float) * newsteps); +#ifdef PRVAL + printf("ALPHA: %f, GAIN: %e\n", NALPHA, NAMP); +#endif + f_alpha(newsteps, newexp, here->VSRConeof, NAMP, NALPHA); +#ifdef PRVAL + printf("Noi1: %e, Noi2: %e\n", here->VSRConeof[10], here->VSRConeof[100]); +#endif + here->VSRCprevVal += here->VSRConeof[here->VSRCncount]; + here->VSRCncount++; + here->VSRCnewVal += 
here->VSRConeof[here->VSRCncount]; + here->VSRCncount++; + value = newval; + // add DC + here->VSRCprevVal += here->VSRCdcValue; + here->VSRCnewVal += here->VSRCdcValue; + value = here->VSRCprevVal; +#ifdef PRVAL + printf("start1, time: %e, outp: %e, rnd: %e\n", time, newval, testval); +#endif + } else { // here->VSRCncount > 0 + // add DC + here->VSRCprevVal += here->VSRCdcValue; + here->VSRCnewVal += here->VSRCdcValue; + value = here->VSRCprevVal; +#ifdef PRVAL + printf("start2, time: %e, outp: %e, rnd: %e\n", time, here->VSRCprevVal, testval); +#endif + } +#ifdef PRVAL + printf("time 0 value: %e for %s\n", here->VSRCprevVal, here->VSRCname); +#endif + goto loadDone; + } //aof + // add DC + here->VSRCprevVal += here->VSRCdcValue; + here->VSRCnewVal += here->VSRCdcValue; + value = here->VSRCprevVal; + here->VSRCprevTime = 0.; + goto loadDone; + } // time < 0 + + V1 = here->VSRCprevVal; + V2 = here->VSRCnewVal; + if (here->VSRCprevTime == ckt->CKTtime) { + value = here->VSRCprevVal; + goto noiDone; + } + + if (time > 0 && time < TS) { + value = V1 + (V2 - V1) * (time) / TS; + } + else if (time >= TS) { + /* repeating signal - figure out where we are in period */ + /* numerical correction to avoid basetime less than + next step, e.g. 4.99999999999999995 delivers a floor + of 4 instead of 5 */ + basetime = TS * floor(time*1.000000000001/TS); + time -= basetime; + +#define NSAMETIME(a,b) (fabs((a)-(b))<= NTIMETOL * TS) +#define NTIMETOL 1e-7 + + if NSAMETIME(time,0.) 
{ + + /* get new random number */ +#ifdef FastRand + // use FastNorm3 + newval = scalef * FastNorm; +#elif defined (WaGauss) + // use WallaceHV + newval = scalef * GaussWa; +#else + // make use of two random variables per call to rgauss() + if (here->VSRCsecRand == 2.0) { + rgauss(&ra1, &ra2); + newval = scalef * ra1; + here->VSRCsecRand = scalef * ra2; + } + else { + newval = here->VSRCsecRand; + here->VSRCsecRand = 2.0; + } +#endif + V1 = here->VSRCprevVal = here->VSRCnewVal; + V2 = newval; // scale factor t.b.d. + if(here->VSRCdcGiven) V2 += here->VSRCdcValue; + if (aof) { + V2 += here->VSRConeof[here->VSRCncount]; +#ifdef PRVAL + printf("aof: %d\n", here->VSRCncount); +#endif + } + here->VSRCncount++; + value = V1; + here->VSRCnewVal = V2; + } else if (time > 0 && time < TS) { + value = V1 + (V2 - V1) * (time) / TS; +#ifdef PRVAL + printf("if1, time: %e, outp: %e, rnd: %e\n", ckt->CKTtime, + V1 + (V2 - V1) * (time) / TS, V2); +#endif + } else { /* time > TS should be never reached */ + value = V1 + (V2 - V1) * (time-TS) / TS; +#ifdef PRVAL + printf("if2, time: %e, outp: %e, rnd: %e\n", ckt->CKTtime, + V1 + (V2 - V1) * (time-TS) / TS, V2); +#endif + } + here->VSRCprevTime = ckt->CKTtime; + } +noiDone: + if (time >=ckt->CKTfinalTime) { + /* free the 1of memory */ + if (here->VSRConeof) tfree(here->VSRConeof); + /* reset the 1of counter */ + here->VSRCncount = 0; + } + goto loadDone; + } // case + break; + } // switch } loadDone: /* gtri - begin - wbk - modify for supply ramping option */ @@ -346,11 +551,12 @@ loadDone: value *= ckt->CKTsrcFact; value *= cm_analog_ramp_factor(); #else -if (ckt->CKTmode & MODETRANOP) value *= ckt->CKTsrcFact; - *(ckt->CKTrhs + (here->VSRCbranch)) += value; + if (ckt->CKTmode & MODETRANOP) value *= ckt->CKTsrcFact; + /* load the new voltage value into the matrix */ + *(ckt->CKTrhs + (here->VSRCbranch)) += value; #endif /* gtri - end - wbk - modify to process srcFact, etc. 
for all sources */ - } - } + } // for loop instances + } // for loop models return(OK); } diff --git a/src/spicelib/devices/vsrc/vsrcpar.c b/src/spicelib/devices/vsrc/vsrcpar.c index 8f3b75d45..ea37d2db1 100644 --- a/src/spicelib/devices/vsrc/vsrcpar.c +++ b/src/spicelib/devices/vsrc/vsrcpar.c @@ -169,6 +169,13 @@ VSRCparam(int param, IFvalue *value, GENinstance *inst, IFvalue *select) return(E_BADPARM); } break; + case VSRC_TRNOISE: + here->VSRCfunctionType = TRNOISE; + here->VSRCfuncTGiven = TRUE; + here->VSRCcoeffs = value->v.vec.rVec; + here->VSRCfunctionOrder = value->v.numValue; + here->VSRCcoeffsGiven = TRUE; + break; default: return(E_BADPARM); } diff --git a/visualc/vngspice.vcproj b/visualc/vngspice.vcproj index 8aad3b0a1..479b20f2d 100644 --- a/visualc/vngspice.vcproj +++ b/visualc/vngspice.vcproj @@ -1,8917 +1,8965 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +