diff --git a/3rdparty/oourafft/fftsg.cc b/3rdparty/oourafft/fftsg.cc new file mode 100644 index 00000000..d74fbb15 --- /dev/null +++ b/3rdparty/oourafft/fftsg.cc @@ -0,0 +1,3305 @@ +/////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2018-2020, The Regents of the University of California +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/////////////////////////////////////////////////////////////////////////////// + +/* +Fast Fourier/Cosine/Sine Transform + dimension :one + data length :power of 2 + decimation :frequency + radix :split-radix + data :inplace + table :use +functions + cdft: Complex Discrete Fourier Transform + rdft: Real Discrete Fourier Transform + ddct: Discrete Cosine Transform + ddst: Discrete Sine Transform + dfct: Cosine Transform of RDFT (Real Symmetric DFT) + dfst: Sine Transform of RDFT (Real Anti-symmetric DFT) +function prototypes + void cdft(int, int, float *, int *, float *); + void rdft(int, int, float *, int *, float *); + void ddct(int, int, float *, int *, float *); + void ddst(int, int, float *, int *, float *); + void dfct(int, float *, float *, int *, float *); + void dfst(int, float *, float *, int *, float *); +macro definitions + USE_CDFT_PTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=8192 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536 + USE_CDFT_WINTHREADS : default=not defined + CDFT_THREADS_BEGIN_N : must be >= 512, default=32768 + CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288 + + +-------- Complex DFT (Discrete Fourier Transform) -------- + [definition] + + X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k + X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k + ip[0] = 0; // first time only + cdft(2*n, 1, a, ip, w); + + ip[0] = 0; // first time only + cdft(2*n, -1, a, ip, w); + [parameters] + 2*n :data length (int) + n >= 1, n = power of 2 + a[0...2*n-1] :input/output data (float *) + input data + a[2*j] = Re(x[j]), + a[2*j+1] = Im(x[j]), 0<=j= 2+sqrt(n) + strictly, + length of ip >= + 2+(1<<(int)(log(n+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (float *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + cdft(2*n, -1, a, ip, w); + is + cdft(2*n, 1, a, ip, w); + for (j = 0; j <= 2 * n - 1; j++) { + a[j] *= 1.0 / n; + } + . + + +-------- Real DFT / Inverse of Real DFT -------- + [definition] + RDFT + R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2 + I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0 IRDFT (excluding scale) + a[k] = (R[0] + R[n/2]*cos(pi*k))/2 + + sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) + + sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k + ip[0] = 0; // first time only + rdft(n, 1, a, ip, w); + + ip[0] = 0; // first time only + rdft(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (float *) + + output data + a[2*k] = R[k], 0<=k + input data + a[2*j] = R[j], 0<=j= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n/2-1] :cos/sin table (float *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + rdft(n, 1, a, ip, w); + is + rdft(n, -1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DCT (Discrete Cosine Transform) / Inverse of DCT -------- + [definition] + IDCT (excluding scale) + C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k DCT + C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k + ip[0] = 0; // first time only + ddct(n, 1, a, ip, w); + + ip[0] = 0; // first time only + ddct(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (float *) + output data + a[k] = C[k], 0<=k= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (float *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddct(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddct(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- DST (Discrete Sine Transform) / Inverse of DST -------- + [definition] + IDST (excluding scale) + S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k DST + S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0 + ip[0] = 0; // first time only + ddst(n, 1, a, ip, w); + + ip[0] = 0; // first time only + ddst(n, -1, a, ip, w); + [parameters] + n :data length (int) + n >= 2, n = power of 2 + a[0...n-1] :input/output data (float *) + + input data + a[j] = A[j], 0 + output data + a[k] = S[k], 0= 2+sqrt(n/2) + strictly, + length of ip >= + 2+(1<<(int)(log(n/2+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/4-1] :cos/sin table (float *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddst(n, -1, a, ip, w); + is + a[0] *= 0.5; + ddst(n, 1, a, ip, w); + for (j = 0; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Cosine Transform of RDFT (Real Symmetric DFT) -------- + [definition] + C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n + [usage] + ip[0] = 0; // first time only + dfct(n, a, t, ip, w); + [parameters] + n :data length - 1 (int) + n >= 2, n = power of 2 + a[0...n] :input/output data (float *) + output data + a[k] = C[k], 0<=k<=n + t[0...n/2] :work area (float *) + ip[0...*] :work area for bit reversal (int *) + length of ip >= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (float *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + is + a[0] *= 0.5; + a[n] *= 0.5; + dfct(n, a, t, ip, w); + for (j = 0; j <= n; j++) { + a[j] *= 2.0 / n; + } + . + + +-------- Sine Transform of RDFT (Real Anti-symmetric DFT) -------- + [definition] + S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0= 2, n = power of 2 + a[0...n-1] :input/output data (float *) + output data + a[k] = S[k], 0= 2+sqrt(n/4) + strictly, + length of ip >= + 2+(1<<(int)(log(n/4+0.5)/log(2))/2). + ip[0],ip[1] are pointers of the cos/sin table. + w[0...n*5/8-1] :cos/sin table (float *) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + dfst(n, a, t, ip, w); + is + dfst(n, a, t, ip, w); + for (j = 1; j <= n - 1; j++) { + a[j] *= 2.0 / n; + } + . + + +Appendix : + The cos/sin table is recalculated when the larger table required. + w[] and ip[] are compatible with all routines. +*/ + +#include +#include + +#include "nextpnr_namespaces.h" + +NEXTPNR_NAMESPACE_BEGIN + +void cdft(int n, int isgn, float* a, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void cftfsub(int n, float* a, int* ip, int nw, float* w); + void cftbsub(int n, float* a, int* ip, int nw, float* w); + int nw; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + if (isgn >= 0) { + cftfsub(n, a, ip, nw, w); + } else { + cftbsub(n, a, ip, nw, w); + } +} + +void rdft(int n, int isgn, float* a, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void cftfsub(int n, float* a, int* ip, int nw, float* w); + void cftbsub(int n, float* a, int* ip, int nw, float* w); + void rftfsub(int n, float* a, int nc, float* c); + void rftbsub(int n, float* a, int nc, float* c); + int nw, nc; + float xi; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 2)) { + nc = n >> 2; + makect(nc, ip, w + nw); + } + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xi = a[0] - a[1]; + a[0] += a[1]; + a[1] = xi; + } else { + a[1] = 0.5 * (a[0] - a[1]); + a[0] -= a[1]; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } +} + +void ddct(int n, int isgn, float* a, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void cftfsub(int n, float* a, int* ip, int nw, float* w); + void cftbsub(int n, float* a, int* ip, int nw, float* w); + void rftfsub(int n, float* a, int nc, float* c); + void rftbsub(int n, float* a, int nc, float* c); + void dctsub(int n, float* a, int nc, float* c); + int j, nw, nc; + float xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn < 0) { + xr = a[n - 1]; + + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = a[j] - a[j - 1]; + a[j] += a[j - 1]; + } + a[1] = a[0] - xr; + a[0] += xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dctsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = a[j] - a[j + 1]; + a[j] += a[j + 1]; + } + a[n - 1] = xr; + } +} + +void ddst(int n, int isgn, float* a, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void cftfsub(int n, float* a, int* ip, int nw, float* w); + void cftbsub(int n, float* a, int* ip, int nw, float* w); + void rftfsub(int n, float* a, int nc, float* c); + void rftbsub(int n, float* a, int nc, float* c); + void dstsub(int n, float* a, int nc, float* c); + int j, nw, nc; + float xr; + + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + if (isgn < 0) { + xr = a[n - 1]; + for (j = n - 2; j >= 2; j -= 2) { + a[j + 1] = -a[j] - a[j - 1]; + a[j] -= a[j - 1]; + } + a[1] = a[0] + xr; + a[0] -= xr; + if (n > 4) { + rftbsub(n, a, nc, w + nw); + cftbsub(n, a, ip, nw, w); + } else if (n == 4) { + cftbsub(n, a, ip, nw, w); + } + } + dstsub(n, a, nc, w + nw); + if (isgn >= 0) { + if (n > 4) { + cftfsub(n, a, ip, nw, w); + rftfsub(n, a, nc, w + nw); + } else if (n == 4) { + cftfsub(n, a, ip, nw, w); + } + xr = a[0] - a[1]; + a[0] += a[1]; + for (j = 2; j < n; j += 2) { + a[j - 1] = -a[j] - a[j + 1]; + a[j] -= a[j + 1]; + } + a[n - 1] = -xr; + } +} + +void dfct(int n, float* a, float* t, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void cftfsub(int n, float* a, int* ip, int nw, float* w); + void rftfsub(int n, float* a, int nc, float* c); + void dctsub(int n, float* a, int nc, float* c); + int j, k, l, m, mh, nw, nc; + float xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + m = n >> 1; + yi = a[m]; + xi = a[0] + a[n]; + a[0] -= a[n]; + t[0] = xi - yi; + t[m] = xi + yi; + if (n > 2) { + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] - a[n - j]; + xi = a[j] + a[n - j]; + yr = a[k] - a[n - k]; + yi = a[k] + a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi - yi; + t[k] = xi + yi; + } + t[mh] = a[mh] + a[n - mh]; + a[mh] -= a[n - mh]; + dctsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[0] - a[1]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] + a[j + 1]; + a[2 * j - 1] = a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dctsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[0] - t[1]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = t[j] - t[j + 1]; + a[k + l] = t[j] + t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 0; j < mh; j++) { + k = m - j; + t[j] = t[m + k] - t[m + j]; + t[k] = t[m + k] + t[m + j]; + } + t[mh] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + a[n] = t[2] - t[1]; + a[0] = t[2] + t[1]; + } else { + a[1] = a[0]; + a[2] = t[0]; + a[0] = t[1]; + } +} + +void dfst(int n, float* a, float* t, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void cftfsub(int n, float* a, int* ip, int nw, float* w); + void rftfsub(int n, float* a, int nc, float* c); + void dstsub(int n, float* a, int nc, float* c); + int j, k, l, m, mh, nw, nc; + float xr, xi, yr, yi; + + nw = ip[0]; + if (n > (nw << 3)) { + nw = n >> 3; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > (nc << 1)) { + nc = n >> 1; + makect(nc, ip, w + nw); + } + if (n > 2) { + m = n >> 1; + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + xr = a[j] + a[n - j]; + xi = a[j] - a[n - j]; + yr = a[k] + a[n - k]; + yi = a[k] - a[n - k]; + a[j] = xr; + a[k] = yr; + t[j] = xi + yi; + t[k] = xi - yi; + } + t[0] = a[mh] - a[n - mh]; + a[mh] += a[n - mh]; + a[0] = a[m]; + dstsub(m, a, nc, w + nw); + if (m > 4) { + cftfsub(m, a, ip, nw, w); + rftfsub(m, a, nc, w + nw); + } else if (m == 4) { + cftfsub(m, a, ip, nw, w); + } + a[n - 1] = a[1] - a[0]; + a[1] = a[0] + a[1]; + for (j = m - 2; j >= 2; j -= 2) { + a[2 * j + 1] = a[j] - a[j + 1]; + a[2 * j - 1] = -a[j] - a[j + 1]; + } + l = 2; + m = mh; + while (m >= 2) { + dstsub(m, t, nc, w + nw); + if (m > 4) { + cftfsub(m, t, ip, nw, w); + rftfsub(m, t, nc, w + nw); + } else if (m == 4) { + cftfsub(m, t, ip, nw, w); + } + a[n - l] = t[1] - t[0]; + a[l] = t[0] + t[1]; + k = 0; + for (j = 2; j < m; j += 2) { + k += l << 2; + a[k - l] = -t[j] - t[j + 1]; + a[k + l] = t[j] - t[j + 1]; + } + l <<= 1; + mh = m >> 1; + for (j = 1; j < mh; j++) { + k = m - j; + t[j] = t[m + k] + t[m + j]; + t[k] = t[m + k] - t[m + j]; + } + t[0] = t[m + mh]; + m = mh; + } + a[l] = t[0]; + } + a[0] = 0; +} + +/* -------- initializing routines -------- */ + +void makewt(int nw, int* ip, float* w) +{ + void makeipt(int nw, int* ip); + int j, nwh, nw0, nw1; + float delta, wn4r, wk1r, wk1i, wk3r, wk3i; + + ip[0] = nw; + ip[1] = 1; + if (nw > 2) { + nwh = nw >> 1; + delta = atan(1.0) / nwh; + wn4r = cos(delta * nwh); + w[0] = 1; + w[1] = wn4r; + if (nwh == 4) { + w[2] = cos(delta * 2); + w[3] = sin(delta * 2); + } else if (nwh > 4) { + makeipt(nw, ip); + w[2] = 0.5 / cos(delta * 2); + w[3] = 0.5 / cos(delta * 6); + for (j = 4; j < nwh; j += 4) { + w[j] = cos(delta * j); + w[j + 1] = sin(delta * j); + w[j + 2] = cos(3 * delta * j); + w[j + 3] = -sin(3 * delta * j); + } + } + nw0 = 0; + while (nwh > 2) { + nw1 = nw0 + nwh; + nwh >>= 1; + w[nw1] = 1; + w[nw1 + 1] = wn4r; + if (nwh == 4) { + wk1r = w[nw0 + 4]; + wk1i = w[nw0 + 5]; + w[nw1 + 2] = wk1r; + w[nw1 + 3] = wk1i; + } else if (nwh > 4) { + wk1r = w[nw0 + 4]; + wk3r = w[nw0 + 6]; + w[nw1 + 2] = 0.5 / wk1r; + w[nw1 + 3] = 0.5 / wk3r; + for (j = 4; j < nwh; j += 4) { + wk1r = w[nw0 + 2 * j]; + wk1i = w[nw0 + 2 * j + 1]; + wk3r = w[nw0 + 2 * j + 2]; + wk3i = w[nw0 + 2 * j + 3]; + w[nw1 + j] = wk1r; + w[nw1 + j + 1] = wk1i; + w[nw1 + j + 2] = wk3r; + w[nw1 + j + 3] = wk3i; + } + } + nw0 = nw1; + } + } +} + +void makeipt(int nw, int* ip) +{ + int j, l, m, m2, p, q; + + ip[2] = 0; + ip[3] = 16; + m = 2; + for (l = nw; l > 32; l >>= 2) { + m2 = m << 1; + q = m2 << 3; + for (j = m; j < m2; j++) { + p = ip[j] << 2; + ip[m + j] = p; + ip[m2 + j] = p + q; + } + m = m2; + } +} + +void makect(int nc, int* ip, float* c) +{ + int j, nch; + float delta; + + ip[1] = nc; + if (nc > 1) { + nch = nc >> 1; + delta = atan(1.0) / nch; + c[0] = cos(delta * nch); + c[nch] = 0.5 * c[0]; + for (j = 1; j < nch; j++) { + c[j] = 0.5 * cos(delta * j); + c[nc - j] = 0.5 * sin(delta * j); + } + } +} + +/* -------- child routines -------- */ + +#ifdef USE_CDFT_PTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 8192 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 65536 +#endif +#include +#include +#include +#define cdft_thread_t pthread_t +#define cdft_thread_create(thp, func, argp) \ + { \ + if (pthread_create(thp, NULL, func, (void*) argp) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#define cdft_thread_wait(th) \ + { \ + if (pthread_join(th, NULL) != 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#endif /* USE_CDFT_PTHREADS */ + +#ifdef USE_CDFT_WINTHREADS +#define USE_CDFT_THREADS +#ifndef CDFT_THREADS_BEGIN_N +#define CDFT_THREADS_BEGIN_N 32768 +#endif +#ifndef CDFT_4THREADS_BEGIN_N +#define CDFT_4THREADS_BEGIN_N 524288 +#endif +#include +#include +#include +#define cdft_thread_t HANDLE +#define cdft_thread_create(thp, func, argp) \ + { \ + DWORD thid; \ + *(thp) = CreateThread( \ + NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \ + if (*(thp) == 0) { \ + fprintf(stderr, "cdft thread error\n"); \ + exit(1); \ + } \ + } +#define cdft_thread_wait(th) \ + { \ + WaitForSingleObject(th, INFINITE); \ + CloseHandle(th); \ + } +#endif /* USE_CDFT_WINTHREADS */ + +void cftfsub(int n, float* a, int* ip, int nw, float* w) +{ + void bitrv2(int n, int* ip, float* a); + void bitrv216(float* a); + void bitrv208(float* a); + void cftf1st(int n, float* a, float* w); + void cftrec4(int n, float* a, int nw, float* w); + void cftleaf(int n, int isplt, float* a, int nw, float* w); + void cftfx41(int n, float* a, int nw, float* w); + void cftf161(float* a, float* w); + void cftf081(float* a, float* w); + void cftf040(float* a); + void cftx020(float* a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, float* a, int nw, float* w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftf1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216(a); + } else { + cftf081(a, w); + bitrv208(a); + } + } else if (n == 8) { + cftf040(a); + } else if (n == 4) { + cftx020(a); + } +} + +void cftbsub(int n, float* a, int* ip, int nw, float* w) +{ + void bitrv2conj(int n, int* ip, float* a); + void bitrv216neg(float* a); + void bitrv208neg(float* a); + void cftb1st(int n, float* a, float* w); + void cftrec4(int n, float* a, int nw, float* w); + void cftleaf(int n, int isplt, float* a, int nw, float* w); + void cftfx41(int n, float* a, int nw, float* w); + void cftf161(float* a, float* w); + void cftf081(float* a, float* w); + void cftb040(float* a); + void cftx020(float* a); +#ifdef USE_CDFT_THREADS + void cftrec4_th(int n, float* a, int nw, float* w); +#endif /* USE_CDFT_THREADS */ + + if (n > 8) { + if (n > 32) { + cftb1st(n, a, &w[nw - (n >> 2)]); +#ifdef USE_CDFT_THREADS + if (n > CDFT_THREADS_BEGIN_N) { + cftrec4_th(n, a, nw, w); + } else +#endif /* USE_CDFT_THREADS */ + if (n > 512) { + cftrec4(n, a, nw, w); + } else if (n > 128) { + cftleaf(n, 1, a, nw, w); + } else { + cftfx41(n, a, nw, w); + } + bitrv2conj(n, ip, a); + } else if (n == 32) { + cftf161(a, &w[nw - 8]); + bitrv216neg(a); + } else { + cftf081(a, w); + bitrv208neg(a); + } + } else if (n == 8) { + cftb040(a); + } else if (n == 4) { + cftx020(a); + } +} + +void bitrv2(int n, int* ip, float* a) +{ + int j, j1, k, k1, l, m, nh, nm; + float xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = a[j1 + 1]; + yr = a[k1]; + yi = a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + } +} + +void bitrv2conj(int n, int* ip, float* a) +{ + int j, j1, k, k1, l, m, nh, nm; + float xr, xi, yr, yi; + + m = 1; + for (l = n >> 2; l > 8; l >>= 2) { + m <<= 1; + } + nh = n >> 1; + nm = 4 * m; + if (l == 8) { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + 2 * ip[m + k]; + k1 = 4 * k + 2 * ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + 2 * ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += 2 * nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= 2; + k1 -= nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh + 2; + k1 += nh + 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh - nm; + k1 += 2 * nm - 2; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } else { + for (k = 0; k < m; k++) { + for (j = 0; j < k; j++) { + j1 = 4 * j + ip[m + k]; + k1 = 4 * k + ip[m + j]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nh; + k1 += 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += 2; + k1 += nh; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 += nm; + k1 += nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nh; + k1 -= 2; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + j1 -= nm; + k1 -= nm; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + } + k1 = 4 * k + ip[m + k]; + j1 = k1 + 2; + k1 += nh; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + j1 += nm; + k1 += nm; + a[j1 - 1] = -a[j1 - 1]; + xr = a[j1]; + xi = -a[j1 + 1]; + yr = a[k1]; + yi = -a[k1 + 1]; + a[j1] = yr; + a[j1 + 1] = yi; + a[k1] = xr; + a[k1 + 1] = xi; + a[k1 + 3] = -a[k1 + 3]; + } + } +} + +void bitrv216(float* a) +{ + float x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x7r, x7i, x8r, x8i, + x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + a[2] = x8r; + a[3] = x8i; + a[4] = x4r; + a[5] = x4i; + a[6] = x12r; + a[7] = x12i; + a[8] = x2r; + a[9] = x2i; + a[10] = x10r; + a[11] = x10i; + a[14] = x14r; + a[15] = x14i; + a[16] = x1r; + a[17] = x1i; + a[20] = x5r; + a[21] = x5i; + a[22] = x13r; + a[23] = x13i; + a[24] = x3r; + a[25] = x3i; + a[26] = x11r; + a[27] = x11i; + a[28] = x7r; + a[29] = x7i; +} + +void bitrv216neg(float* a) +{ + float x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i, + x8r, x8i, x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, x14r, + x14i, x15r, x15i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + x8r = a[16]; + x8i = a[17]; + x9r = a[18]; + x9i = a[19]; + x10r = a[20]; + x10i = a[21]; + x11r = a[22]; + x11i = a[23]; + x12r = a[24]; + x12i = a[25]; + x13r = a[26]; + x13i = a[27]; + x14r = a[28]; + x14i = a[29]; + x15r = a[30]; + x15i = a[31]; + a[2] = x15r; + a[3] = x15i; + a[4] = x7r; + a[5] = x7i; + a[6] = x11r; + a[7] = x11i; + a[8] = x3r; + a[9] = x3i; + a[10] = x13r; + a[11] = x13i; + a[12] = x5r; + a[13] = x5i; + a[14] = x9r; + a[15] = x9i; + a[16] = x1r; + a[17] = x1i; + a[18] = x14r; + a[19] = x14i; + a[20] = x6r; + a[21] = x6i; + a[22] = x10r; + a[23] = x10i; + a[24] = x2r; + a[25] = x2i; + a[26] = x12r; + a[27] = x12i; + a[28] = x4r; + a[29] = x4i; + a[30] = x8r; + a[31] = x8i; +} + +void bitrv208(float* a) +{ + float x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i; + + x1r = a[2]; + x1i = a[3]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x6r = a[12]; + x6i = a[13]; + a[2] = x4r; + a[3] = x4i; + a[6] = x6r; + a[7] = x6i; + a[8] = x1r; + a[9] = x1i; + a[12] = x3r; + a[13] = x3i; +} + +void bitrv208neg(float* a) +{ + float x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i; + + x1r = a[2]; + x1i = a[3]; + x2r = a[4]; + x2i = a[5]; + x3r = a[6]; + x3i = a[7]; + x4r = a[8]; + x4i = a[9]; + x5r = a[10]; + x5i = a[11]; + x6r = a[12]; + x6i = a[13]; + x7r = a[14]; + x7i = a[15]; + a[2] = x7r; + a[3] = x7i; + a[4] = x3r; + a[5] = x3i; + a[6] = x5r; + a[7] = x5i; + a[8] = x1r; + a[9] = x1i; + a[10] = x6r; + a[11] = x6i; + a[12] = x2r; + a[13] = x2i; + a[14] = x4r; + a[15] = x4i; +} + +void cftf1st(int n, float* a, float* w) +{ + int j, j0, j1, j2, j3, k, m, mh; + float wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, + y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = a[j + 3] + a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = a[j + 3] - a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = a[j0 - 1] + a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i + y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i - y2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r - y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r + y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = a[j0 - 1] + a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = a[j0 - 1] - a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i + x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = a[j0 + 3] + a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = a[j0 + 3] - a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i + x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + +void cftb1st(int n, float* a, float* w) +{ + int j, j0, j1, j2, j3, k, m, mh; + float wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i, + y3r, y3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = -a[1] - a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = -a[1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j2] = x1r + x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r - x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + csc1 = w[2]; + csc3 = w[3]; + wd1r = 1; + wd1i = 0; + wd3r = 1; + wd3i = 0; + k = 0; + for (j = 2; j < mh - 2; j += 4) { + k += 4; + wk1r = csc1 * (wd1r + w[k]); + wk1i = csc1 * (wd1i + w[k + 1]); + wk3r = csc3 * (wd3r + w[k + 2]); + wk3i = csc3 * (wd3i + w[k + 3]); + wd1r = w[k]; + wd1i = w[k + 1]; + wd3r = w[k + 2]; + wd3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = -a[j + 1] - a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = -a[j + 1] + a[j2 + 1]; + y0r = a[j + 2] + a[j2 + 2]; + y0i = -a[j + 3] - a[j2 + 3]; + y1r = a[j + 2] - a[j2 + 2]; + y1i = -a[j + 3] + a[j2 + 3]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 + 2] + a[j3 + 2]; + y2i = a[j1 + 3] + a[j3 + 3]; + y3r = a[j1 + 2] - a[j3 + 2]; + y3i = a[j1 + 3] - a[j3 + 3]; + a[j] = x0r + x2r; + a[j + 1] = x0i - x2i; + a[j + 2] = y0r + y2r; + a[j + 3] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 + 2] = y0r - y2r; + a[j1 + 3] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 + 2] = wd1r * x0r - wd1i * x0i; + a[j2 + 3] = wd1r * x0i + wd1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 + 2] = wd3r * x0r + wd3i * x0i; + a[j3 + 3] = wd3r * x0i - wd3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + y0r = a[j0 - 2] + a[j2 - 2]; + y0i = -a[j0 - 1] - a[j2 - 1]; + y1r = a[j0 - 2] - a[j2 - 2]; + y1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + y2r = a[j1 - 2] + a[j3 - 2]; + y2i = a[j1 - 1] + a[j3 - 1]; + y3r = a[j1 - 2] - a[j3 - 2]; + y3i = a[j1 - 1] - a[j3 - 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j0 - 2] = y0r + y2r; + a[j0 - 1] = y0i - y2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + a[j1 - 2] = y0r - y2r; + a[j1 - 1] = y0i + y2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = y1r + y3i; + x0i = y1i + y3r; + a[j2 - 2] = wd1i * x0r - wd1r * x0i; + a[j2 - 1] = wd1i * x0i + wd1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + x0r = y1r - y3i; + x0i = y1i - y3r; + a[j3 - 2] = wd3i * x0r + wd3r * x0i; + a[j3 - 1] = wd3i * x0i - wd3r * x0r; + } + wk1r = csc1 * (wd1r + wn4r); + wk1i = csc1 * (wd1i + wn4r); + wk3r = csc3 * (wd3r - wn4r); + wk3i = csc3 * (wd3i - wn4r); + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0 - 2] + a[j2 - 2]; + x0i = -a[j0 - 1] - a[j2 - 1]; + x1r = a[j0 - 2] - a[j2 - 2]; + x1i = -a[j0 - 1] + a[j2 - 1]; + x2r = a[j1 - 2] + a[j3 - 2]; + x2i = a[j1 - 1] + a[j3 - 1]; + x3r = a[j1 - 2] - a[j3 - 2]; + x3i = a[j1 - 1] - a[j3 - 1]; + a[j0 - 2] = x0r + x2r; + a[j0 - 1] = x0i - x2i; + a[j1 - 2] = x0r - x2r; + a[j1 - 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 - 2] = wk1r * x0r - wk1i * x0i; + a[j2 - 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 - 2] = wk3r * x0r + wk3i * x0i; + a[j3 - 1] = wk3r * x0i - wk3i * x0r; + x0r = a[j0] + a[j2]; + x0i = -a[j0 + 1] - a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = -a[j0 + 1] + a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i - x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); + x0r = a[j0 + 2] + a[j2 + 2]; + x0i = -a[j0 + 3] - a[j2 + 3]; + x1r = a[j0 + 2] - a[j2 + 2]; + x1i = -a[j0 + 3] + a[j2 + 3]; + x2r = a[j1 + 2] + a[j3 + 2]; + x2i = a[j1 + 3] + a[j3 + 3]; + x3r = a[j1 + 2] - a[j3 + 2]; + x3i = a[j1 + 3] - a[j3 + 3]; + a[j0 + 2] = x0r + x2r; + a[j0 + 3] = x0i - x2i; + a[j1 + 2] = x0r - x2r; + a[j1 + 3] = x0i + x2i; + x0r = x1r + x3i; + x0i = x1i + x3r; + a[j2 + 2] = wk1i * x0r - wk1r * x0i; + a[j2 + 3] = wk1i * x0i + wk1r * x0r; + x0r = x1r - x3i; + x0i = x1i - x3r; + a[j3 + 2] = wk3i * x0r + wk3r * x0i; + a[j3 + 3] = wk3i * x0i - wk3r * x0r; +} + +#ifdef USE_CDFT_THREADS +struct cdft_arg_st +{ + int n0; + int n; + float* a; + int nw; + float* w; +}; +typedef struct cdft_arg_st cdft_arg_t; + +void cftrec4_th(int n, float* a, int nw, float* w) +{ + void* cftrec1_th(void* p); + void* cftrec2_th(void* p); + int i, idiv4, m, nthread; + cdft_thread_t th[4]; + cdft_arg_t ag[4]; + + nthread = 2; + idiv4 = 0; + m = n >> 1; + if (n > CDFT_4THREADS_BEGIN_N) { + nthread = 4; + idiv4 = 1; + m >>= 1; + } + for (i = 0; i < nthread; i++) { + ag[i].n0 = n; + ag[i].n = m; + ag[i].a = &a[i * m]; + ag[i].nw = nw; + ag[i].w = w; + if (i != idiv4) { + cdft_thread_create(&th[i], cftrec1_th, &ag[i]); + } else { + cdft_thread_create(&th[i], cftrec2_th, &ag[i]); + } + } + for (i = 0; i < nthread; i++) { + cdft_thread_wait(th[i]); + } +} + +void* cftrec1_th(void* p) +{ + int cfttree(int n, int j, int k, float* a, int nw, float* w); + void cftleaf(int n, int isplt, float* a, int nw, float* w); + void cftmdl1(int n, float* a, float* w); + int isplt, j, k, m, n, n0, nw; + float *a, *w; + + n0 = ((cdft_arg_t*) p)->n0; + n = ((cdft_arg_t*) p)->n; + a = ((cdft_arg_t*) p)->a; + nw = ((cdft_arg_t*) p)->nw; + w = ((cdft_arg_t*) p)->w; + m = n0; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void*) 0; +} + +void* cftrec2_th(void* p) +{ + int cfttree(int n, int j, int k, float* a, int nw, float* w); + void cftleaf(int n, int isplt, float* a, int nw, float* w); + void cftmdl2(int n, float* a, float* w); + int isplt, j, k, m, n, n0, nw; + float *a, *w; + + n0 = ((cdft_arg_t*) p)->n0; + n = ((cdft_arg_t*) p)->n; + a = ((cdft_arg_t*) p)->a; + nw = ((cdft_arg_t*) p)->nw; + w = ((cdft_arg_t*) p)->w; + k = 1; + m = n0; + while (m > 512) { + m >>= 2; + k <<= 2; + cftmdl2(m, &a[n - m], &w[nw - m]); + } + cftleaf(m, 0, &a[n - m], nw, w); + k >>= 1; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } + return (void*) 0; +} +#endif /* USE_CDFT_THREADS */ + +void cftrec4(int n, float* a, int nw, float* w) +{ + int cfttree(int n, int j, int k, float* a, int nw, float* w); + void cftleaf(int n, int isplt, float* a, int nw, float* w); + void cftmdl1(int n, float* a, float* w); + int isplt, j, k, m; + + m = n; + while (m > 512) { + m >>= 2; + cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]); + } + cftleaf(m, 1, &a[n - m], nw, w); + k = 0; + for (j = n - m; j > 0; j -= m) { + k++; + isplt = cfttree(m, j, k, a, nw, w); + cftleaf(m, isplt, &a[j - m], nw, w); + } +} + +int cfttree(int n, int j, int k, float* a, int nw, float* w) +{ + void cftmdl1(int n, float* a, float* w); + void cftmdl2(int n, float* a, float* w); + int i, isplt, m; + + if ((k & 3) != 0) { + isplt = k & 1; + if (isplt != 0) { + cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]); + } else { + cftmdl2(n, &a[j - n], &w[nw - n]); + } + } else { + m = n; + for (i = k; (i & 3) == 0; i >>= 2) { + m <<= 2; + } + isplt = i & 1; + if (isplt != 0) { + while (m > 128) { + cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]); + m >>= 2; + } + } else { + while (m > 128) { + cftmdl2(m, &a[j - m], &w[nw - m]); + m >>= 2; + } + } + } + return isplt; +} + +void cftleaf(int n, int isplt, float* a, int nw, float* w) +{ + void cftmdl1(int n, float* a, float* w); + void cftmdl2(int n, float* a, float* w); + void cftf161(float* a, float* w); + void cftf162(float* a, float* w); + void cftf081(float* a, float* w); + void cftf082(float* a, float* w); + + if (n == 512) { + cftmdl1(128, a, &w[nw - 64]); + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + cftmdl2(128, &a[128], &w[nw - 128]); + cftf161(&a[128], &w[nw - 8]); + cftf162(&a[160], &w[nw - 32]); + cftf161(&a[192], &w[nw - 8]); + cftf162(&a[224], &w[nw - 32]); + cftmdl1(128, &a[256], &w[nw - 64]); + cftf161(&a[256], &w[nw - 8]); + cftf162(&a[288], &w[nw - 32]); + cftf161(&a[320], &w[nw - 8]); + cftf161(&a[352], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(128, &a[384], &w[nw - 64]); + cftf161(&a[480], &w[nw - 8]); + } else { + cftmdl2(128, &a[384], &w[nw - 128]); + cftf162(&a[480], &w[nw - 32]); + } + cftf161(&a[384], &w[nw - 8]); + cftf162(&a[416], &w[nw - 32]); + cftf161(&a[448], &w[nw - 8]); + } else { + cftmdl1(64, a, &w[nw - 32]); + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + cftmdl2(64, &a[64], &w[nw - 64]); + cftf081(&a[64], &w[nw - 8]); + cftf082(&a[80], &w[nw - 8]); + cftf081(&a[96], &w[nw - 8]); + cftf082(&a[112], &w[nw - 8]); + cftmdl1(64, &a[128], &w[nw - 32]); + cftf081(&a[128], &w[nw - 8]); + cftf082(&a[144], &w[nw - 8]); + cftf081(&a[160], &w[nw - 8]); + cftf081(&a[176], &w[nw - 8]); + if (isplt != 0) { + cftmdl1(64, &a[192], &w[nw - 32]); + cftf081(&a[240], &w[nw - 8]); + } else { + cftmdl2(64, &a[192], &w[nw - 64]); + cftf082(&a[240], &w[nw - 8]); + } + cftf081(&a[192], &w[nw - 8]); + cftf082(&a[208], &w[nw - 8]); + cftf081(&a[224], &w[nw - 8]); + } +} + +void cftmdl1(int n, float* a, float* w) +{ + int j, j0, j1, j2, j3, k, m, mh; + float wn4r, wk1r, wk1i, wk3r, wk3i; + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + mh = n >> 3; + m = 2 * mh; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] + a[j2]; + x0i = a[1] + a[j2 + 1]; + x1r = a[0] - a[j2]; + x1i = a[1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + a[j2] = x1r - x3i; + a[j2 + 1] = x1i + x3r; + a[j3] = x1r + x3i; + a[j3 + 1] = x1i - x3r; + wn4r = w[1]; + k = 0; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] + a[j2]; + x0i = a[j + 1] + a[j2 + 1]; + x1r = a[j] - a[j2]; + x1i = a[j + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j] = x0r + x2r; + a[j + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1r * x0r - wk1i * x0i; + a[j2 + 1] = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3r * x0r + wk3i * x0i; + a[j3 + 1] = wk3r * x0i - wk3i * x0r; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wk1i * x0r - wk1r * x0i; + a[j2 + 1] = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = wk3i * x0r + wk3r * x0i; + a[j3 + 1] = wk3i * x0i - wk3r * x0r; + } + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] + a[j2]; + x0i = a[j0 + 1] + a[j2 + 1]; + x1r = a[j0] - a[j2]; + x1i = a[j0 + 1] - a[j2 + 1]; + x2r = a[j1] + a[j3]; + x2i = a[j1 + 1] + a[j3 + 1]; + x3r = a[j1] - a[j3]; + x3i = a[j1 + 1] - a[j3 + 1]; + a[j0] = x0r + x2r; + a[j0 + 1] = x0i + x2i; + a[j1] = x0r - x2r; + a[j1 + 1] = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + a[j2] = wn4r * (x0r - x0i); + a[j2 + 1] = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + a[j3] = -wn4r * (x0r + x0i); + a[j3 + 1] = -wn4r * (x0i - x0r); +} + +void cftmdl2(int n, float* a, float* w) +{ + int j, j0, j1, j2, j3, k, kr, m, mh; + float wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i; + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i; + + mh = n >> 3; + m = 2 * mh; + wn4r = w[1]; + j1 = m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[0] - a[j2 + 1]; + x0i = a[1] + a[j2]; + x1r = a[0] + a[j2 + 1]; + x1i = a[1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wn4r * (x2r - x2i); + y0i = wn4r * (x2i + x2r); + a[0] = x0r + y0r; + a[1] = x0i + y0i; + a[j1] = x0r - y0r; + a[j1 + 1] = x0i - y0i; + y0r = wn4r * (x3r - x3i); + y0i = wn4r * (x3i + x3r); + a[j2] = x1r - y0i; + a[j2 + 1] = x1i + y0r; + a[j3] = x1r + y0i; + a[j3 + 1] = x1i - y0r; + k = 0; + kr = 2 * m; + for (j = 2; j < mh; j += 2) { + k += 4; + wk1r = w[k]; + wk1i = w[k + 1]; + wk3r = w[k + 2]; + wk3i = w[k + 3]; + kr -= 4; + wd1i = w[kr]; + wd1r = w[kr + 1]; + wd3i = w[kr + 2]; + wd3r = w[kr + 3]; + j1 = j + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j] - a[j2 + 1]; + x0i = a[j + 1] + a[j2]; + x1r = a[j] + a[j2 + 1]; + x1i = a[j + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wd1r * x2r - wd1i * x2i; + y2i = wd1r * x2i + wd1i * x2r; + a[j] = y0r + y2r; + a[j + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk3r * x1r + wk3i * x1i; + y0i = wk3r * x1i - wk3i * x1r; + y2r = wd3r * x3r + wd3i * x3i; + y2i = wd3r * x3i - wd3i * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + j0 = m - j; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wd1i * x0r - wd1r * x0i; + y0i = wd1i * x0i + wd1r * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wd3i * x1r + wd3r * x1i; + y0i = wd3i * x1i - wd3r * x1r; + y2r = wk3i * x3r + wk3r * x3i; + y2i = wk3i * x3i - wk3r * x3r; + a[j2] = y0r + y2r; + a[j2 + 1] = y0i + y2i; + a[j3] = y0r - y2r; + a[j3 + 1] = y0i - y2i; + } + wk1r = w[m]; + wk1i = w[m + 1]; + j0 = mh; + j1 = j0 + m; + j2 = j1 + m; + j3 = j2 + m; + x0r = a[j0] - a[j2 + 1]; + x0i = a[j0 + 1] + a[j2]; + x1r = a[j0] + a[j2 + 1]; + x1i = a[j0 + 1] - a[j2]; + x2r = a[j1] - a[j3 + 1]; + x2i = a[j1 + 1] + a[j3]; + x3r = a[j1] + a[j3 + 1]; + x3i = a[j1 + 1] - a[j3]; + y0r = wk1r * x0r - wk1i * x0i; + y0i = wk1r * x0i + wk1i * x0r; + y2r = wk1i * x2r - wk1r * x2i; + y2i = wk1i * x2i + wk1r * x2r; + a[j0] = y0r + y2r; + a[j0 + 1] = y0i + y2i; + a[j1] = y0r - y2r; + a[j1 + 1] = y0i - y2i; + y0r = wk1i * x1r - wk1r * x1i; + y0i = wk1i * x1i + wk1r * x1r; + y2r = wk1r * x3r - wk1i * x3i; + y2i = wk1r * x3i + wk1i * x3r; + a[j2] = y0r - y2r; + a[j2 + 1] = y0i - y2i; + a[j3] = y0r + y2r; + a[j3 + 1] = y0i + y2i; +} + +void cftfx41(int n, float* a, int nw, float* w) +{ + void cftf161(float* a, float* w); + void cftf162(float* a, float* w); + void cftf081(float* a, float* w); + void cftf082(float* a, float* w); + + if (n == 128) { + cftf161(a, &w[nw - 8]); + cftf162(&a[32], &w[nw - 32]); + cftf161(&a[64], &w[nw - 8]); + cftf161(&a[96], &w[nw - 8]); + } else { + cftf081(a, &w[nw - 8]); + cftf082(&a[16], &w[nw - 8]); + cftf081(&a[32], &w[nw - 8]); + cftf081(&a[48], &w[nw - 8]); + } +} + +void cftf161(float* a, float* w) +{ + float wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, + y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, y8r, y8i, + y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, y14r, y14i, + y15r, y15i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + x0r = a[0] + a[16]; + x0i = a[1] + a[17]; + x1r = a[0] - a[16]; + x1i = a[1] - a[17]; + x2r = a[8] + a[24]; + x2i = a[9] + a[25]; + x3r = a[8] - a[24]; + x3i = a[9] - a[25]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y4r = x0r - x2r; + y4i = x0i - x2i; + y8r = x1r - x3i; + y8i = x1i + x3r; + y12r = x1r + x3i; + y12i = x1i - x3r; + x0r = a[2] + a[18]; + x0i = a[3] + a[19]; + x1r = a[2] - a[18]; + x1i = a[3] - a[19]; + x2r = a[10] + a[26]; + x2i = a[11] + a[27]; + x3r = a[10] - a[26]; + x3i = a[11] - a[27]; + y1r = x0r + x2r; + y1i = x0i + x2i; + y5r = x0r - x2r; + y5i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y9r = wk1r * x0r - wk1i * x0i; + y9i = wk1r * x0i + wk1i * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y13r = wk1i * x0r - wk1r * x0i; + y13i = wk1i * x0i + wk1r * x0r; + x0r = a[4] + a[20]; + x0i = a[5] + a[21]; + x1r = a[4] - a[20]; + x1i = a[5] - a[21]; + x2r = a[12] + a[28]; + x2i = a[13] + a[29]; + x3r = a[12] - a[28]; + x3i = a[13] - a[29]; + y2r = x0r + x2r; + y2i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y10r = wn4r * (x0r - x0i); + y10i = wn4r * (x0i + x0r); + x0r = x1r + x3i; + x0i = x1i - x3r; + y14r = wn4r * (x0r + x0i); + y14i = wn4r * (x0i - x0r); + x0r = a[6] + a[22]; + x0i = a[7] + a[23]; + x1r = a[6] - a[22]; + x1i = a[7] - a[23]; + x2r = a[14] + a[30]; + x2i = a[15] + a[31]; + x3r = a[14] - a[30]; + x3i = a[15] - a[31]; + y3r = x0r + x2r; + y3i = x0i + x2i; + y7r = x0r - x2r; + y7i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + y11r = wk1i * x0r - wk1r * x0i; + y11i = wk1i * x0i + wk1r * x0r; + x0r = x1r + x3i; + x0i = x1i - x3r; + y15r = wk1r * x0r - wk1i * x0i; + y15i = wk1r * x0i + wk1i * x0r; + x0r = y12r - y14r; + x0i = y12i - y14i; + x1r = y12r + y14r; + x1i = y12i + y14i; + x2r = y13r - y15r; + x2i = y13i - y15i; + x3r = y13r + y15r; + x3i = y13i + y15i; + a[24] = x0r + x2r; + a[25] = x0i + x2i; + a[26] = x0r - x2r; + a[27] = x0i - x2i; + a[28] = x1r - x3i; + a[29] = x1i + x3r; + a[30] = x1r + x3i; + a[31] = x1i - x3r; + x0r = y8r + y10r; + x0i = y8i + y10i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + x3r = y9r - y11r; + x3i = y9i - y11i; + a[16] = x0r + x2r; + a[17] = x0i + x2i; + a[18] = x0r - x2r; + a[19] = x0i - x2i; + a[20] = x1r - x3i; + a[21] = x1i + x3r; + a[22] = x1r + x3i; + a[23] = x1i - x3r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + x0r = y5r + y7i; + x0i = y5i - y7r; + x3r = wn4r * (x0r - x0i); + x3i = wn4r * (x0i + x0r); + x0r = y4r - y6i; + x0i = y4i + y6r; + x1r = y4r + y6i; + x1i = y4i - y6r; + a[8] = x0r + x2r; + a[9] = x0i + x2i; + a[10] = x0r - x2r; + a[11] = x0i - x2i; + a[12] = x1r - x3i; + a[13] = x1i + x3r; + a[14] = x1r + x3i; + a[15] = x1i - x3r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + x3r = y1r - y3r; + x3i = y1i - y3i; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x0r - x2r; + a[3] = x0i - x2i; + a[4] = x1r - x3i; + a[5] = x1i + x3r; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + +void cftf162(float* a, float* w) +{ + float wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, x0r, x0i, x1r, x1i, x2r, x2i, + y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, + y7i, y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, + y14r, y14i, y15r, y15i; + + wn4r = w[1]; + wk1r = w[4]; + wk1i = w[5]; + wk3r = w[6]; + wk3i = -w[7]; + wk2r = w[8]; + wk2i = w[9]; + x1r = a[0] - a[17]; + x1i = a[1] + a[16]; + x0r = a[8] - a[25]; + x0i = a[9] + a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y0r = x1r + x2r; + y0i = x1i + x2i; + y4r = x1r - x2r; + y4i = x1i - x2i; + x1r = a[0] + a[17]; + x1i = a[1] - a[16]; + x0r = a[8] + a[25]; + x0i = a[9] - a[24]; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + y8r = x1r - x2i; + y8i = x1i + x2r; + y12r = x1r + x2i; + y12i = x1i - x2r; + x0r = a[2] - a[19]; + x0i = a[3] + a[18]; + x1r = wk1r * x0r - wk1i * x0i; + x1i = wk1r * x0i + wk1i * x0r; + x0r = a[10] - a[27]; + x0i = a[11] + a[26]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y1r = x1r + x2r; + y1i = x1i + x2i; + y5r = x1r - x2r; + y5i = x1i - x2i; + x0r = a[2] + a[19]; + x0i = a[3] - a[18]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[10] + a[27]; + x0i = a[11] - a[26]; + x2r = wk1r * x0r + wk1i * x0i; + x2i = wk1r * x0i - wk1i * x0r; + y9r = x1r - x2r; + y9i = x1i - x2i; + y13r = x1r + x2r; + y13i = x1i + x2i; + x0r = a[4] - a[21]; + x0i = a[5] + a[20]; + x1r = wk2r * x0r - wk2i * x0i; + x1i = wk2r * x0i + wk2i * x0r; + x0r = a[12] - a[29]; + x0i = a[13] + a[28]; + x2r = wk2i * x0r - wk2r * x0i; + x2i = wk2i * x0i + wk2r * x0r; + y2r = x1r + x2r; + y2i = x1i + x2i; + y6r = x1r - x2r; + y6i = x1i - x2i; + x0r = a[4] + a[21]; + x0i = a[5] - a[20]; + x1r = wk2i * x0r - wk2r * x0i; + x1i = wk2i * x0i + wk2r * x0r; + x0r = a[12] + a[29]; + x0i = a[13] - a[28]; + x2r = wk2r * x0r - wk2i * x0i; + x2i = wk2r * x0i + wk2i * x0r; + y10r = x1r - x2r; + y10i = x1i - x2i; + y14r = x1r + x2r; + y14i = x1i + x2i; + x0r = a[6] - a[23]; + x0i = a[7] + a[22]; + x1r = wk3r * x0r - wk3i * x0i; + x1i = wk3r * x0i + wk3i * x0r; + x0r = a[14] - a[31]; + x0i = a[15] + a[30]; + x2r = wk1i * x0r - wk1r * x0i; + x2i = wk1i * x0i + wk1r * x0r; + y3r = x1r + x2r; + y3i = x1i + x2i; + y7r = x1r - x2r; + y7i = x1i - x2i; + x0r = a[6] + a[23]; + x0i = a[7] - a[22]; + x1r = wk1i * x0r + wk1r * x0i; + x1i = wk1i * x0i - wk1r * x0r; + x0r = a[14] + a[31]; + x0i = a[15] - a[30]; + x2r = wk3i * x0r - wk3r * x0i; + x2i = wk3i * x0i + wk3r * x0r; + y11r = x1r + x2r; + y11i = x1i + x2i; + y15r = x1r - x2r; + y15i = x1i - x2i; + x1r = y0r + y2r; + x1i = y0i + y2i; + x2r = y1r + y3r; + x2i = y1i + y3i; + a[0] = x1r + x2r; + a[1] = x1i + x2i; + a[2] = x1r - x2r; + a[3] = x1i - x2i; + x1r = y0r - y2r; + x1i = y0i - y2i; + x2r = y1r - y3r; + x2i = y1i - y3i; + a[4] = x1r - x2i; + a[5] = x1i + x2r; + a[6] = x1r + x2i; + a[7] = x1i - x2r; + x1r = y4r - y6i; + x1i = y4i + y6r; + x0r = y5r - y7i; + x0i = y5i + y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[8] = x1r + x2r; + a[9] = x1i + x2i; + a[10] = x1r - x2r; + a[11] = x1i - x2i; + x1r = y4r + y6i; + x1i = y4i - y6r; + x0r = y5r + y7i; + x0i = y5i - y7r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[12] = x1r - x2i; + a[13] = x1i + x2r; + a[14] = x1r + x2i; + a[15] = x1i - x2r; + x1r = y8r + y10r; + x1i = y8i + y10i; + x2r = y9r - y11r; + x2i = y9i - y11i; + a[16] = x1r + x2r; + a[17] = x1i + x2i; + a[18] = x1r - x2r; + a[19] = x1i - x2i; + x1r = y8r - y10r; + x1i = y8i - y10i; + x2r = y9r + y11r; + x2i = y9i + y11i; + a[20] = x1r - x2i; + a[21] = x1i + x2r; + a[22] = x1r + x2i; + a[23] = x1i - x2r; + x1r = y12r - y14i; + x1i = y12i + y14r; + x0r = y13r + y15i; + x0i = y13i - y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[24] = x1r + x2r; + a[25] = x1i + x2i; + a[26] = x1r - x2r; + a[27] = x1i - x2i; + x1r = y12r + y14i; + x1i = y12i - y14r; + x0r = y13r - y15i; + x0i = y13i + y15r; + x2r = wn4r * (x0r - x0i); + x2i = wn4r * (x0i + x0r); + a[28] = x1r - x2i; + a[29] = x1i + x2r; + a[30] = x1r + x2i; + a[31] = x1i - x2r; +} + +void cftf081(float* a, float* w) +{ + float wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, + y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + x0r = a[0] + a[8]; + x0i = a[1] + a[9]; + x1r = a[0] - a[8]; + x1i = a[1] - a[9]; + x2r = a[4] + a[12]; + x2i = a[5] + a[13]; + x3r = a[4] - a[12]; + x3i = a[5] - a[13]; + y0r = x0r + x2r; + y0i = x0i + x2i; + y2r = x0r - x2r; + y2i = x0i - x2i; + y1r = x1r - x3i; + y1i = x1i + x3r; + y3r = x1r + x3i; + y3i = x1i - x3r; + x0r = a[2] + a[10]; + x0i = a[3] + a[11]; + x1r = a[2] - a[10]; + x1i = a[3] - a[11]; + x2r = a[6] + a[14]; + x2i = a[7] + a[15]; + x3r = a[6] - a[14]; + x3i = a[7] - a[15]; + y4r = x0r + x2r; + y4i = x0i + x2i; + y6r = x0r - x2r; + y6i = x0i - x2i; + x0r = x1r - x3i; + x0i = x1i + x3r; + x2r = x1r + x3i; + x2i = x1i - x3r; + y5r = wn4r * (x0r - x0i); + y5i = wn4r * (x0r + x0i); + y7r = wn4r * (x2r - x2i); + y7i = wn4r * (x2r + x2i); + a[8] = y1r + y5r; + a[9] = y1i + y5i; + a[10] = y1r - y5r; + a[11] = y1i - y5i; + a[12] = y3r - y7i; + a[13] = y3i + y7r; + a[14] = y3r + y7i; + a[15] = y3i - y7r; + a[0] = y0r + y4r; + a[1] = y0i + y4i; + a[2] = y0r - y4r; + a[3] = y0i - y4i; + a[4] = y2r - y6i; + a[5] = y2i + y6r; + a[6] = y2r + y6i; + a[7] = y2i - y6r; +} + +void cftf082(float* a, float* w) +{ + float wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, y0r, y0i, y1r, y1i, y2r, y2i, y3r, + y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i; + + wn4r = w[1]; + wk1r = w[2]; + wk1i = w[3]; + y0r = a[0] - a[9]; + y0i = a[1] + a[8]; + y1r = a[0] + a[9]; + y1i = a[1] - a[8]; + x0r = a[4] - a[13]; + x0i = a[5] + a[12]; + y2r = wn4r * (x0r - x0i); + y2i = wn4r * (x0i + x0r); + x0r = a[4] + a[13]; + x0i = a[5] - a[12]; + y3r = wn4r * (x0r - x0i); + y3i = wn4r * (x0i + x0r); + x0r = a[2] - a[11]; + x0i = a[3] + a[10]; + y4r = wk1r * x0r - wk1i * x0i; + y4i = wk1r * x0i + wk1i * x0r; + x0r = a[2] + a[11]; + x0i = a[3] - a[10]; + y5r = wk1i * x0r - wk1r * x0i; + y5i = wk1i * x0i + wk1r * x0r; + x0r = a[6] - a[15]; + x0i = a[7] + a[14]; + y6r = wk1i * x0r - wk1r * x0i; + y6i = wk1i * x0i + wk1r * x0r; + x0r = a[6] + a[15]; + x0i = a[7] - a[14]; + y7r = wk1r * x0r - wk1i * x0i; + y7i = wk1r * x0i + wk1i * x0r; + x0r = y0r + y2r; + x0i = y0i + y2i; + x1r = y4r + y6r; + x1i = y4i + y6i; + a[0] = x0r + x1r; + a[1] = x0i + x1i; + a[2] = x0r - x1r; + a[3] = x0i - x1i; + x0r = y0r - y2r; + x0i = y0i - y2i; + x1r = y4r - y6r; + x1i = y4i - y6i; + a[4] = x0r - x1i; + a[5] = x0i + x1r; + a[6] = x0r + x1i; + a[7] = x0i - x1r; + x0r = y1r - y3i; + x0i = y1i + y3r; + x1r = y5r - y7r; + x1i = y5i - y7i; + a[8] = x0r + x1r; + a[9] = x0i + x1i; + a[10] = x0r - x1r; + a[11] = x0i - x1i; + x0r = y1r + y3i; + x0i = y1i - y3r; + x1r = y5r + y7r; + x1i = y5i + y7i; + a[12] = x0r - x1i; + a[13] = x0i + x1r; + a[14] = x0r + x1i; + a[15] = x0i - x1r; +} + +void cftf040(float* a) +{ + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r - x3i; + a[3] = x1i + x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r + x3i; + a[7] = x1i - x3r; +} + +void cftb040(float* a) +{ + float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i; + + x0r = a[0] + a[4]; + x0i = a[1] + a[5]; + x1r = a[0] - a[4]; + x1i = a[1] - a[5]; + x2r = a[2] + a[6]; + x2i = a[3] + a[7]; + x3r = a[2] - a[6]; + x3i = a[3] - a[7]; + a[0] = x0r + x2r; + a[1] = x0i + x2i; + a[2] = x1r + x3i; + a[3] = x1i - x3r; + a[4] = x0r - x2r; + a[5] = x0i - x2i; + a[6] = x1r - x3i; + a[7] = x1i + x3r; +} + +void cftx020(float* a) +{ + float x0r, x0i; + + x0r = a[0] - a[2]; + x0i = a[1] - a[3]; + a[0] += a[2]; + a[1] += a[3]; + a[2] = x0r; + a[3] = x0i; +} + +void rftfsub(int n, float* a, int nc, float* c) +{ + int j, k, kk, ks, m; + float wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr - wki * xi; + yi = wkr * xi + wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + +void rftbsub(int n, float* a, int nc, float* c) +{ + int j, k, kk, ks, m; + float wkr, wki, xr, xi, yr, yi; + + m = n >> 1; + ks = 2 * nc / m; + kk = 0; + for (j = 2; j < m; j += 2) { + k = n - j; + kk += ks; + wkr = 0.5 - c[nc - kk]; + wki = c[kk]; + xr = a[j] - a[k]; + xi = a[j + 1] + a[k + 1]; + yr = wkr * xr + wki * xi; + yi = wkr * xi - wki * xr; + a[j] -= yr; + a[j + 1] -= yi; + a[k] += yr; + a[k + 1] -= yi; + } +} + +void dctsub(int n, float* a, int nc, float* c) +{ + int j, k, kk, ks, m; + float wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[j] - wkr * a[k]; + a[j] = wkr * a[j] + wki * a[k]; + a[k] = xr; + } + a[m] *= c[0]; +} + +void dstsub(int n, float* a, int nc, float* c) +{ + int j, k, kk, ks, m; + float wkr, wki, xr; + + m = n >> 1; + ks = nc / n; + kk = 0; + for (j = 1; j < m; j++) { + k = n - j; + kk += ks; + wkr = c[kk] - c[nc - kk]; + wki = c[kk] + c[nc - kk]; + xr = wki * a[k] - wkr * a[j]; + a[k] = wkr * a[k] + wki * a[j]; + a[j] = xr; + } + a[m] *= c[0]; +} + +NEXTPNR_NAMESPACE_END diff --git a/3rdparty/oourafft/fftsg.h b/3rdparty/oourafft/fftsg.h new file mode 100644 index 00000000..4add9311 --- /dev/null +++ b/3rdparty/oourafft/fftsg.h @@ -0,0 +1,24 @@ +#ifndef FFTSG_H +#define FFTSG_H +#include "nextpnr_namespaces.h" + +NEXTPNR_NAMESPACE_BEGIN + +// +// The following FFT library came from +// http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html +// +// +/// 1D FFT //////////////////////////////////////////////////////////////// +void ddct(int n, int isgn, float *a, int *ip, float *w); +void ddst(int n, int isgn, float *a, int *ip, float *w); + +/// 2D FFT //////////////////////////////////////////////////////////////// +void ddct2d(int n1, int n2, int isgn, float **a, float *t, int *ip, float *w); +void ddsct2d(int n1, int n2, int isgn, float **a, float *t, int *ip, float *w); +void ddcst2d(int n1, int n2, int isgn, float **a, float *t, int *ip, float *w); + + +NEXTPNR_NAMESPACE_END + +#endif diff --git a/3rdparty/oourafft/fftsg2d.cc b/3rdparty/oourafft/fftsg2d.cc new file mode 100644 index 00000000..2bbc2428 --- /dev/null +++ b/3rdparty/oourafft/fftsg2d.cc @@ -0,0 +1,1419 @@ +/////////////////////////////////////////////////////////////////////////////// +// BSD 3-Clause License +// +// Copyright (c) 2018-2020, The Regents of the University of California +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// * Redistributions of source code must retain the above copyright notice, this +// list of conditions and the following disclaimer. +// +// * Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * Neither the name of the copyright holder nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE +// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +/////////////////////////////////////////////////////////////////////////////// + +/* +Fast Fourier/Cosine/Sine Transform + dimension :two + data length :power of 2 + decimation :frequency + radix :split-radix, row-column + data :inplace + table :use +functions + cdft2d: Complex Discrete Fourier Transform + rdft2d: Real Discrete Fourier Transform + ddct2d: Discrete Cosine Transform + ddst2d: Discrete Sine Transform +function prototypes + void cdft2d(int, int, int, float **, float *, int *, float *); + void rdft2d(int, int, int, float **, float *, int *, float *); + void rdft2dsort(int, int, int, float **); + void ddct2d(int, int, int, float **, float *, int *, float *); + void ddst2d(int, int, int, float **, float *, int *, float *); +necessary package + fftsg.c : 1D-FFT package +macro definitions + USE_FFT2D_PTHREADS : default=not defined + FFT2D_MAX_THREADS : must be 2^N, default=4 + FFT2D_THREADS_BEGIN_N : default=65536 + USE_FFT2D_WINTHREADS : default=not defined + FFT2D_MAX_THREADS : must be 2^N, default=4 + FFT2D_THREADS_BEGIN_N : default=131072 + + +-------- Complex DFT (Discrete Fourier Transform) -------- + [definition] + + X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * + exp(2*pi*i*j1*k1/n1) * + exp(2*pi*i*j2*k2/n2), 0<=k1 + X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * + exp(-2*pi*i*j1*k1/n1) * + exp(-2*pi*i*j2*k2/n2), 0<=k1 + ip[0] = 0; // first time only + cdft2d(n1, 2*n2, 1, a, t, ip, w); + + ip[0] = 0; // first time only + cdft2d(n1, 2*n2, -1, a, t, ip, w); + [parameters] + n1 :data length (int) + n1 >= 1, n1 = power of 2 + 2*n2 :data length (int) + n2 >= 1, n2 = power of 2 + a[0...n1-1][0...2*n2-1] + :input/output data (float **) + input data + a[j1][2*j2] = Re(x[j1][j2]), + a[j1][2*j2+1] = Im(x[j1][j2]), + 0<=j1= 8*n1, if single thread, + length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads, + t is dynamically allocated, if t == NULL. + ip[0...*] + :work area for bit reversal (int *) + length of ip >= 2+sqrt(n) + (n = max(n1, n2)) + ip[0],ip[1] are pointers of the cos/sin table. + w[0...*] + :cos/sin table (float *) + length of w >= max(n1/2, n2/2) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + cdft2d(n1, 2*n2, -1, a, t, ip, w); + is + cdft2d(n1, 2*n2, 1, a, t, ip, w); + for (j1 = 0; j1 <= n1 - 1; j1++) { + for (j2 = 0; j2 <= 2 * n2 - 1; j2++) { + a[j1][j2] *= 1.0 / n1 / n2; + } + } + . + + +-------- Real DFT / Inverse of Real DFT -------- + [definition] + RDFT + R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * + cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), + 0<=k1 IRDFT (excluding scale) + a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 + (R[j1][j2] * + cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + + I[j1][j2] * + sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), + 0<=k1 + ip[0] = 0; // first time only + rdft2d(n1, n2, 1, a, t, ip, w); + + ip[0] = 0; // first time only + rdft2d(n1, n2, -1, a, t, ip, w); + [parameters] + n1 :data length (int) + n1 >= 2, n1 = power of 2 + n2 :data length (int) + n2 >= 2, n2 = power of 2 + a[0...n1-1][0...n2-1] + :input/output data (float **) + + output data + a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2], + a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2], + 0 + input data + a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2], + a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2], + 0= 8*n1, if single thread, + length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads, + t is dynamically allocated, if t == NULL. + ip[0...*] + :work area for bit reversal (int *) + length of ip >= 2+sqrt(n) + (n = max(n1, n2/2)) + ip[0],ip[1] are pointers of the cos/sin table. + w[0...*] + :cos/sin table (float *) + length of w >= max(n1/2, n2/4) + n2/4 + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + rdft2d(n1, n2, 1, a, t, ip, w); + is + rdft2d(n1, n2, -1, a, t, ip, w); + for (j1 = 0; j1 <= n1 - 1; j1++) { + for (j2 = 0; j2 <= n2 - 1; j2++) { + a[j1][j2] *= 2.0 / n1 / n2; + } + } + . + + +-------- DCT (Discrete Cosine Transform) / Inverse of DCT -------- + [definition] + IDCT (excluding scale) + C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * + cos(pi*j1*(k1+1/2)/n1) * + cos(pi*j2*(k2+1/2)/n2), + 0<=k1 DCT + C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * + cos(pi*(j1+1/2)*k1/n1) * + cos(pi*(j2+1/2)*k2/n2), + 0<=k1 + ip[0] = 0; // first time only + ddct2d(n1, n2, 1, a, t, ip, w); + + ip[0] = 0; // first time only + ddct2d(n1, n2, -1, a, t, ip, w); + [parameters] + n1 :data length (int) + n1 >= 2, n1 = power of 2 + n2 :data length (int) + n2 >= 2, n2 = power of 2 + a[0...n1-1][0...n2-1] + :input/output data (float **) + output data + a[k1][k2] = C[k1][k2], 0<=k1= 4*n1, if single thread, + length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads, + t is dynamically allocated, if t == NULL. + ip[0...*] + :work area for bit reversal (int *) + length of ip >= 2+sqrt(n) + (n = max(n1/2, n2/2)) + ip[0],ip[1] are pointers of the cos/sin table. + w[0...*] + :cos/sin table (float *) + length of w >= max(n1*3/2, n2*3/2) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddct2d(n1, n2, -1, a, t, ip, w); + is + for (j1 = 0; j1 <= n1 - 1; j1++) { + a[j1][0] *= 0.5; + } + for (j2 = 0; j2 <= n2 - 1; j2++) { + a[0][j2] *= 0.5; + } + ddct2d(n1, n2, 1, a, t, ip, w); + for (j1 = 0; j1 <= n1 - 1; j1++) { + for (j2 = 0; j2 <= n2 - 1; j2++) { + a[j1][j2] *= 4.0 / n1 / n2; + } + } + . + + +-------- DST (Discrete Sine Transform) / Inverse of DST -------- + [definition] + IDST (excluding scale) + S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] * + sin(pi*j1*(k1+1/2)/n1) * + sin(pi*j2*(k2+1/2)/n2), + 0<=k1 DST + S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * + sin(pi*(j1+1/2)*k1/n1) * + sin(pi*(j2+1/2)*k2/n2), + 0 + ip[0] = 0; // first time only + ddst2d(n1, n2, 1, a, t, ip, w); + + ip[0] = 0; // first time only + ddst2d(n1, n2, -1, a, t, ip, w); + [parameters] + n1 :data length (int) + n1 >= 2, n1 = power of 2 + n2 :data length (int) + n2 >= 2, n2 = power of 2 + a[0...n1-1][0...n2-1] + :input/output data (float **) + + input data + a[j1][j2] = A[j1][j2], 0 + output data + a[k1][k2] = S[k1][k2], 0= 4*n1, if single thread, + length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads, + t is dynamically allocated, if t == NULL. + ip[0...*] + :work area for bit reversal (int *) + length of ip >= 2+sqrt(n) + (n = max(n1/2, n2/2)) + ip[0],ip[1] are pointers of the cos/sin table. + w[0...*] + :cos/sin table (float *) + length of w >= max(n1*3/2, n2*3/2) + w[],ip[] are initialized if ip[0] == 0. + [remark] + Inverse of + ddst2d(n1, n2, -1, a, t, ip, w); + is + for (j1 = 0; j1 <= n1 - 1; j1++) { + a[j1][0] *= 0.5; + } + for (j2 = 0; j2 <= n2 - 1; j2++) { + a[0][j2] *= 0.5; + } + ddst2d(n1, n2, 1, a, t, ip, w); + for (j1 = 0; j1 <= n1 - 1; j1++) { + for (j2 = 0; j2 <= n2 - 1; j2++) { + a[j1][j2] *= 4.0 / n1 / n2; + } + } + . +*/ + + + +#include +#include +#define fft2d_alloc_error_check(p) \ + { \ + if((p) == NULL) { \ + fprintf(stderr, "fft2d memory allocation error\n"); \ + exit(1); \ + } \ + } + +#ifdef USE_FFT2D_PTHREADS +#define USE_FFT2D_THREADS +#ifndef FFT2D_MAX_THREADS +#define FFT2D_MAX_THREADS 4 +#endif +#ifndef FFT2D_THREADS_BEGIN_N +#define FFT2D_THREADS_BEGIN_N 65536 +#endif +#include +#define fft2d_thread_t pthread_t +#define fft2d_thread_create(thp, func, argp) \ + { \ + if(pthread_create(thp, NULL, func, (void *)(argp)) != 0) { \ + fprintf(stderr, "fft2d thread error\n"); \ + exit(1); \ + } \ + } +#define fft2d_thread_wait(th) \ + { \ + if(pthread_join(th, NULL) != 0) { \ + fprintf(stderr, "fft2d thread error\n"); \ + exit(1); \ + } \ + } +#endif /* USE_FFT2D_PTHREADS */ + +#ifdef USE_FFT2D_WINTHREADS +#define USE_FFT2D_THREADS +#ifndef FFT2D_MAX_THREADS +#define FFT2D_MAX_THREADS 4 +#endif +#ifndef FFT2D_THREADS_BEGIN_N +#define FFT2D_THREADS_BEGIN_N 131072 +#endif +#include +#define fft2d_thread_t HANDLE +#define fft2d_thread_create(thp, func, argp) \ + { \ + DWORD thid; \ + *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)(func), \ + (LPVOID)(argp), 0, &thid); \ + if(*(thp) == 0) { \ + fprintf(stderr, "fft2d thread error\n"); \ + exit(1); \ + } \ + } +#define fft2d_thread_wait(th) \ + { \ + WaitForSingleObject(th, INFINITE); \ + CloseHandle(th); \ + } +#endif /* USE_FFT2D_WINTHREADS */ + +#include "nextpnr_namespaces.h" + +NEXTPNR_NAMESPACE_BEGIN + + +void cdft2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void cdft(int n, int isgn, float* a, int* ip, float* w); + void cdft2d_sub( + int n1, int n2, int isgn, float** a, float* t, int* ip, float* w); +#ifdef USE_FFT2D_THREADS + void xdft2d0_subth( + int n1, int n2, int icr, int isgn, float** a, int* ip, float* w); + void cdft2d_subth( + int n1, int n2, int isgn, float** a, float* t, int* ip, float* w); +#endif /* USE_FFT2D_THREADS */ + int n, itnull, nthread, nt, i; + + n = n1 << 1; + if (n < n2) { + n = n2; + } + if (n > (ip[0] << 2)) { + makewt(n >> 2, ip, w); + } + itnull = 0; + if (t == NULL) { + itnull = 1; + nthread = 1; +#ifdef USE_FFT2D_THREADS + nthread = FFT2D_MAX_THREADS; +#endif /* USE_FFT2D_THREADS */ + nt = 8 * nthread * n1; + if (n2 == 4 * nthread) { + nt >>= 1; + } else if (n2 < 4 * nthread) { + nt >>= 2; + } + t = (float*) malloc(sizeof(float) * nt); + fft2d_alloc_error_check(t); + } +#ifdef USE_FFT2D_THREADS + if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { + xdft2d0_subth(n1, n2, 0, isgn, a, ip, w); + cdft2d_subth(n1, n2, isgn, a, t, ip, w); + } else +#endif /* USE_FFT2D_THREADS */ + { + for (i = 0; i < n1; i++) { + cdft(n2, isgn, a[i], ip, w); + } + cdft2d_sub(n1, n2, isgn, a, t, ip, w); + } + if (itnull != 0) { + free(t); + } +} + +void rdft2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void rdft(int n, int isgn, float* a, int* ip, float* w); + void cdft2d_sub( + int n1, int n2, int isgn, float** a, float* t, int* ip, float* w); + void rdft2d_sub(int n1, int isgn, float** a); +#ifdef USE_FFT2D_THREADS + void xdft2d0_subth( + int n1, int n2, int icr, int isgn, float** a, int* ip, float* w); + void cdft2d_subth( + int n1, int n2, int isgn, float** a, float* t, int* ip, float* w); +#endif /* USE_FFT2D_THREADS */ + int n, nw, nc, itnull, nthread, nt, i; + + n = n1 << 1; + if (n < n2) { + n = n2; + } + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n2 > (nc << 2)) { + nc = n2 >> 2; + makect(nc, ip, w + nw); + } + itnull = 0; + if (t == NULL) { + itnull = 1; + nthread = 1; +#ifdef USE_FFT2D_THREADS + nthread = FFT2D_MAX_THREADS; +#endif /* USE_FFT2D_THREADS */ + nt = 8 * nthread * n1; + if (n2 == 4 * nthread) { + nt >>= 1; + } else if (n2 < 4 * nthread) { + nt >>= 2; + } + t = (float*) malloc(sizeof(float) * nt); + fft2d_alloc_error_check(t); + } +#ifdef USE_FFT2D_THREADS + if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { + if (isgn < 0) { + rdft2d_sub(n1, isgn, a); + cdft2d_subth(n1, n2, isgn, a, t, ip, w); + } + xdft2d0_subth(n1, n2, 1, isgn, a, ip, w); + if (isgn >= 0) { + cdft2d_subth(n1, n2, isgn, a, t, ip, w); + rdft2d_sub(n1, isgn, a); + } + } else +#endif /* USE_FFT2D_THREADS */ + { + if (isgn < 0) { + rdft2d_sub(n1, isgn, a); + cdft2d_sub(n1, n2, isgn, a, t, ip, w); + } + for (i = 0; i < n1; i++) { + rdft(n2, isgn, a[i], ip, w); + } + if (isgn >= 0) { + cdft2d_sub(n1, n2, isgn, a, t, ip, w); + rdft2d_sub(n1, isgn, a); + } + } + if (itnull != 0) { + free(t); + } +} + +void rdft2dsort(int n1, int n2, int isgn, float** a) +{ + int n1h, i; + float x, y; + + n1h = n1 >> 1; + if (isgn < 0) { + for (i = n1h + 1; i < n1; i++) { + a[i][0] = a[i][n2 + 1]; + a[i][1] = a[i][n2]; + } + a[0][1] = a[0][n2]; + a[n1h][1] = a[n1h][n2]; + } else { + for (i = n1h + 1; i < n1; i++) { + y = a[i][0]; + x = a[i][1]; + a[i][n2] = x; + a[i][n2 + 1] = y; + a[n1 - i][n2] = x; + a[n1 - i][n2 + 1] = -y; + a[i][0] = a[n1 - i][0]; + a[i][1] = -a[n1 - i][1]; + } + a[0][n2] = a[0][1]; + a[0][n2 + 1] = 0; + a[0][1] = 0; + a[n1h][n2] = a[n1h][1]; + a[n1h][n2 + 1] = 0; + a[n1h][1] = 0; + } +} + +void ddcst2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void ddct(int n, int isgn, float* a, int* ip, float* w); + void ddst(int n, int isgn, float* a, int* ip, float* w); + void ddxt2d_sub(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w); +#ifdef USE_FFT2D_THREADS + void ddxt2d0_subth( + int n1, int n2, int ics, int isgn, float** a, int* ip, float* w); + void ddxt2d_subth(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w); +#endif /* USE_FFT2D_THREADS */ + int n, nw, nc, itnull, nthread, nt, i; + + n = n1; + if (n < n2) { + n = n2; + } + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + itnull = 0; + if (t == NULL) { + itnull = 1; + nthread = 1; +#ifdef USE_FFT2D_THREADS + nthread = FFT2D_MAX_THREADS; +#endif /* USE_FFT2D_THREADS */ + nt = 4 * nthread * n1; + if (n2 == 2 * nthread) { + nt >>= 1; + } else if (n2 < 2 * nthread) { + nt >>= 2; + } + t = (float*) malloc(sizeof(float) * nt); + fft2d_alloc_error_check(t); + } +#ifdef USE_FFT2D_THREADS + if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { + ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w); + ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w); + } else +#endif /* USE_FFT2D_THREADS */ + { + for (i = 0; i < n1; i++) { + ddst(n2, isgn, a[i], ip, w); + } + ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w); + } + if (itnull != 0) { + free(t); + } +} + +void ddsct2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void ddct(int n, int isgn, float* a, int* ip, float* w); + void ddst(int n, int isgn, float* a, int* ip, float* w); + void ddxt2d_sub(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w); +#ifdef USE_FFT2D_THREADS + void ddxt2d0_subth( + int n1, int n2, int ics, int isgn, float** a, int* ip, float* w); + void ddxt2d_subth(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w); +#endif /* USE_FFT2D_THREADS */ + int n, nw, nc, itnull, nthread, nt, i; + + n = n1; + if (n < n2) { + n = n2; + } + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + itnull = 0; + if (t == NULL) { + itnull = 1; + nthread = 1; +#ifdef USE_FFT2D_THREADS + nthread = FFT2D_MAX_THREADS; +#endif /* USE_FFT2D_THREADS */ + nt = 4 * nthread * n1; + if (n2 == 2 * nthread) { + nt >>= 1; + } else if (n2 < 2 * nthread) { + nt >>= 2; + } + t = (float*) malloc(sizeof(float) * nt); + fft2d_alloc_error_check(t); + } +#ifdef USE_FFT2D_THREADS + if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { + ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w); + ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w); + } else +#endif /* USE_FFT2D_THREADS */ + { + for (i = 0; i < n1; i++) { + ddct(n2, isgn, a[i], ip, w); + } + ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w); + } + if (itnull != 0) { + free(t); + } +} + +void ddct2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void ddct(int n, int isgn, float* a, int* ip, float* w); + void ddxt2d_sub(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w); +#ifdef USE_FFT2D_THREADS + void ddxt2d0_subth( + int n1, int n2, int ics, int isgn, float** a, int* ip, float* w); + void ddxt2d_subth(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w); +#endif /* USE_FFT2D_THREADS */ + int n, nw, nc, itnull, nthread, nt, i; + + n = n1; + if (n < n2) { + n = n2; + } + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + itnull = 0; + if (t == NULL) { + itnull = 1; + nthread = 1; +#ifdef USE_FFT2D_THREADS + nthread = FFT2D_MAX_THREADS; +#endif /* USE_FFT2D_THREADS */ + nt = 4 * nthread * n1; + if (n2 == 2 * nthread) { + nt >>= 1; + } else if (n2 < 2 * nthread) { + nt >>= 2; + } + t = (float*) malloc(sizeof(float) * nt); + fft2d_alloc_error_check(t); + } +#ifdef USE_FFT2D_THREADS + if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { + ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w); + ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w); + } else +#endif /* USE_FFT2D_THREADS */ + { + for (i = 0; i < n1; i++) { + ddct(n2, isgn, a[i], ip, w); + } + ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w); + } + if (itnull != 0) { + free(t); + } +} + +void ddst2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) +{ + void makewt(int nw, int* ip, float* w); + void makect(int nc, int* ip, float* c); + void ddst(int n, int isgn, float* a, int* ip, float* w); + void ddxt2d_sub(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w); +#ifdef USE_FFT2D_THREADS + void ddxt2d0_subth( + int n1, int n2, int ics, int isgn, float** a, int* ip, float* w); + void ddxt2d_subth(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w); +#endif /* USE_FFT2D_THREADS */ + int n, nw, nc, itnull, nthread, nt, i; + + n = n1; + if (n < n2) { + n = n2; + } + nw = ip[0]; + if (n > (nw << 2)) { + nw = n >> 2; + makewt(nw, ip, w); + } + nc = ip[1]; + if (n > nc) { + nc = n; + makect(nc, ip, w + nw); + } + itnull = 0; + if (t == NULL) { + itnull = 1; + nthread = 1; +#ifdef USE_FFT2D_THREADS + nthread = FFT2D_MAX_THREADS; +#endif /* USE_FFT2D_THREADS */ + nt = 4 * nthread * n1; + if (n2 == 2 * nthread) { + nt >>= 1; + } else if (n2 < 2 * nthread) { + nt >>= 2; + } + t = (float*) malloc(sizeof(float) * nt); + fft2d_alloc_error_check(t); + } +#ifdef USE_FFT2D_THREADS + if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { + ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w); + ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w); + } else +#endif /* USE_FFT2D_THREADS */ + { + for (i = 0; i < n1; i++) { + ddst(n2, isgn, a[i], ip, w); + } + ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w); + } + if (itnull != 0) { + free(t); + } +} + +/* -------- child routines -------- */ + +void cdft2d_sub(int n1, + int n2, + int isgn, + float** a, + float* t, + int* ip, + float* w) +{ + void cdft(int n, int isgn, float* a, int* ip, float* w); + int i, j; + + if (n2 > 4) { + for (j = 0; j < n2; j += 8) { + for (i = 0; i < n1; i++) { + t[2 * i] = a[i][j]; + t[2 * i + 1] = a[i][j + 1]; + t[2 * n1 + 2 * i] = a[i][j + 2]; + t[2 * n1 + 2 * i + 1] = a[i][j + 3]; + t[4 * n1 + 2 * i] = a[i][j + 4]; + t[4 * n1 + 2 * i + 1] = a[i][j + 5]; + t[6 * n1 + 2 * i] = a[i][j + 6]; + t[6 * n1 + 2 * i + 1] = a[i][j + 7]; + } + cdft(2 * n1, isgn, t, ip, w); + cdft(2 * n1, isgn, &t[2 * n1], ip, w); + cdft(2 * n1, isgn, &t[4 * n1], ip, w); + cdft(2 * n1, isgn, &t[6 * n1], ip, w); + for (i = 0; i < n1; i++) { + a[i][j] = t[2 * i]; + a[i][j + 1] = t[2 * i + 1]; + a[i][j + 2] = t[2 * n1 + 2 * i]; + a[i][j + 3] = t[2 * n1 + 2 * i + 1]; + a[i][j + 4] = t[4 * n1 + 2 * i]; + a[i][j + 5] = t[4 * n1 + 2 * i + 1]; + a[i][j + 6] = t[6 * n1 + 2 * i]; + a[i][j + 7] = t[6 * n1 + 2 * i + 1]; + } + } + } else if (n2 == 4) { + for (i = 0; i < n1; i++) { + t[2 * i] = a[i][0]; + t[2 * i + 1] = a[i][1]; + t[2 * n1 + 2 * i] = a[i][2]; + t[2 * n1 + 2 * i + 1] = a[i][3]; + } + cdft(2 * n1, isgn, t, ip, w); + cdft(2 * n1, isgn, &t[2 * n1], ip, w); + for (i = 0; i < n1; i++) { + a[i][0] = t[2 * i]; + a[i][1] = t[2 * i + 1]; + a[i][2] = t[2 * n1 + 2 * i]; + a[i][3] = t[2 * n1 + 2 * i + 1]; + } + } else if (n2 == 2) { + for (i = 0; i < n1; i++) { + t[2 * i] = a[i][0]; + t[2 * i + 1] = a[i][1]; + } + cdft(2 * n1, isgn, t, ip, w); + for (i = 0; i < n1; i++) { + a[i][0] = t[2 * i]; + a[i][1] = t[2 * i + 1]; + } + } +} + +void rdft2d_sub(int n1, int isgn, float** a) +{ + int n1h, i, j; + float xi; + + n1h = n1 >> 1; + if (isgn < 0) { + for (i = 1; i < n1h; i++) { + j = n1 - i; + xi = a[i][0] - a[j][0]; + a[i][0] += a[j][0]; + a[j][0] = xi; + xi = a[j][1] - a[i][1]; + a[i][1] += a[j][1]; + a[j][1] = xi; + } + } else { + for (i = 1; i < n1h; i++) { + j = n1 - i; + a[j][0] = 0.5 * (a[i][0] - a[j][0]); + a[i][0] -= a[j][0]; + a[j][1] = 0.5 * (a[i][1] + a[j][1]); + a[i][1] -= a[j][1]; + } + } +} + +void ddxt2d_sub(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w) +{ + void ddct(int n, int isgn, float* a, int* ip, float* w); + void ddst(int n, int isgn, float* a, int* ip, float* w); + int i, j; + + if (n2 > 2) { + for (j = 0; j < n2; j += 4) { + for (i = 0; i < n1; i++) { + t[i] = a[i][j]; + t[n1 + i] = a[i][j + 1]; + t[2 * n1 + i] = a[i][j + 2]; + t[3 * n1 + i] = a[i][j + 3]; + } + if (ics == 0) { + ddct(n1, isgn, t, ip, w); + ddct(n1, isgn, &t[n1], ip, w); + ddct(n1, isgn, &t[2 * n1], ip, w); + ddct(n1, isgn, &t[3 * n1], ip, w); + } else { + ddst(n1, isgn, t, ip, w); + ddst(n1, isgn, &t[n1], ip, w); + ddst(n1, isgn, &t[2 * n1], ip, w); + ddst(n1, isgn, &t[3 * n1], ip, w); + } + for (i = 0; i < n1; i++) { + a[i][j] = t[i]; + a[i][j + 1] = t[n1 + i]; + a[i][j + 2] = t[2 * n1 + i]; + a[i][j + 3] = t[3 * n1 + i]; + } + } + } else if (n2 == 2) { + for (i = 0; i < n1; i++) { + t[i] = a[i][0]; + t[n1 + i] = a[i][1]; + } + if (ics == 0) { + ddct(n1, isgn, t, ip, w); + ddct(n1, isgn, &t[n1], ip, w); + } else { + ddst(n1, isgn, t, ip, w); + ddst(n1, isgn, &t[n1], ip, w); + } + for (i = 0; i < n1; i++) { + a[i][0] = t[i]; + a[i][1] = t[n1 + i]; + } + } +} + +#ifdef USE_FFT2D_THREADS +struct fft2d_arg_st +{ + int nthread; + int n0; + int n1; + int n2; + int ic; + int isgn; + float** a; + float* t; + int* ip; + float* w; +}; +typedef struct fft2d_arg_st fft2d_arg_t; + +void xdft2d0_subth(int n1, + int n2, + int icr, + int isgn, + float** a, + int* ip, + float* w) +{ + void* xdft2d0_th(void* p); + fft2d_thread_t th[FFT2D_MAX_THREADS]; + fft2d_arg_t ag[FFT2D_MAX_THREADS]; + int nthread, i; + + nthread = FFT2D_MAX_THREADS; + if (nthread > n1) { + nthread = n1; + } + for (i = 0; i < nthread; i++) { + ag[i].nthread = nthread; + ag[i].n0 = i; + ag[i].n1 = n1; + ag[i].n2 = n2; + ag[i].ic = icr; + ag[i].isgn = isgn; + ag[i].a = a; + ag[i].ip = ip; + ag[i].w = w; + fft2d_thread_create(&th[i], xdft2d0_th, &ag[i]); + } + for (i = 0; i < nthread; i++) { + fft2d_thread_wait(th[i]); + } +} + +void cdft2d_subth(int n1, + int n2, + int isgn, + float** a, + float* t, + int* ip, + float* w) +{ + void* cdft2d_th(void* p); + fft2d_thread_t th[FFT2D_MAX_THREADS]; + fft2d_arg_t ag[FFT2D_MAX_THREADS]; + int nthread, nt, i; + + nthread = FFT2D_MAX_THREADS; + nt = 8 * n1; + if (n2 == 4 * FFT2D_MAX_THREADS) { + nt >>= 1; + } else if (n2 < 4 * FFT2D_MAX_THREADS) { + nthread = n2 >> 1; + nt >>= 2; + } + for (i = 0; i < nthread; i++) { + ag[i].nthread = nthread; + ag[i].n0 = i; + ag[i].n1 = n1; + ag[i].n2 = n2; + ag[i].isgn = isgn; + ag[i].a = a; + ag[i].t = &t[nt * i]; + ag[i].ip = ip; + ag[i].w = w; + fft2d_thread_create(&th[i], cdft2d_th, &ag[i]); + } + for (i = 0; i < nthread; i++) { + fft2d_thread_wait(th[i]); + } +} + +void ddxt2d0_subth(int n1, + int n2, + int ics, + int isgn, + float** a, + int* ip, + float* w) +{ + void* ddxt2d0_th(void* p); + fft2d_thread_t th[FFT2D_MAX_THREADS]; + fft2d_arg_t ag[FFT2D_MAX_THREADS]; + int nthread, i; + + nthread = FFT2D_MAX_THREADS; + if (nthread > n1) { + nthread = n1; + } + for (i = 0; i < nthread; i++) { + ag[i].nthread = nthread; + ag[i].n0 = i; + ag[i].n1 = n1; + ag[i].n2 = n2; + ag[i].ic = ics; + ag[i].isgn = isgn; + ag[i].a = a; + ag[i].ip = ip; + ag[i].w = w; + fft2d_thread_create(&th[i], ddxt2d0_th, &ag[i]); + } + for (i = 0; i < nthread; i++) { + fft2d_thread_wait(th[i]); + } +} + +void ddxt2d_subth(int n1, + int n2, + int ics, + int isgn, + float** a, + float* t, + int* ip, + float* w) +{ + void* ddxt2d_th(void* p); + fft2d_thread_t th[FFT2D_MAX_THREADS]; + fft2d_arg_t ag[FFT2D_MAX_THREADS]; + int nthread, nt, i; + + nthread = FFT2D_MAX_THREADS; + nt = 4 * n1; + if (n2 == 2 * FFT2D_MAX_THREADS) { + nt >>= 1; + } else if (n2 < 2 * FFT2D_MAX_THREADS) { + nthread = n2; + nt >>= 2; + } + for (i = 0; i < nthread; i++) { + ag[i].nthread = nthread; + ag[i].n0 = i; + ag[i].n1 = n1; + ag[i].n2 = n2; + ag[i].ic = ics; + ag[i].isgn = isgn; + ag[i].a = a; + ag[i].t = &t[nt * i]; + ag[i].ip = ip; + ag[i].w = w; + fft2d_thread_create(&th[i], ddxt2d_th, &ag[i]); + } + for (i = 0; i < nthread; i++) { + fft2d_thread_wait(th[i]); + } +} + +void* xdft2d0_th(void* p) +{ + void cdft(int n, int isgn, float* a, int* ip, float* w); + void rdft(int n, int isgn, float* a, int* ip, float* w); + int nthread, n0, n1, n2, icr, isgn, *ip, i; + float **a, *w; + + nthread = ((fft2d_arg_t*) p)->nthread; + n0 = ((fft2d_arg_t*) p)->n0; + n1 = ((fft2d_arg_t*) p)->n1; + n2 = ((fft2d_arg_t*) p)->n2; + icr = ((fft2d_arg_t*) p)->ic; + isgn = ((fft2d_arg_t*) p)->isgn; + a = ((fft2d_arg_t*) p)->a; + ip = ((fft2d_arg_t*) p)->ip; + w = ((fft2d_arg_t*) p)->w; + if (icr == 0) { + for (i = n0; i < n1; i += nthread) { + cdft(n2, isgn, a[i], ip, w); + } + } else { + for (i = n0; i < n1; i += nthread) { + rdft(n2, isgn, a[i], ip, w); + } + } + return (void*) 0; +} + +void* cdft2d_th(void* p) +{ + void cdft(int n, int isgn, float* a, int* ip, float* w); + int nthread, n0, n1, n2, isgn, *ip, i, j; + float **a, *t, *w; + + nthread = ((fft2d_arg_t*) p)->nthread; + n0 = ((fft2d_arg_t*) p)->n0; + n1 = ((fft2d_arg_t*) p)->n1; + n2 = ((fft2d_arg_t*) p)->n2; + isgn = ((fft2d_arg_t*) p)->isgn; + a = ((fft2d_arg_t*) p)->a; + t = ((fft2d_arg_t*) p)->t; + ip = ((fft2d_arg_t*) p)->ip; + w = ((fft2d_arg_t*) p)->w; + if (n2 > 4 * nthread) { + for (j = 8 * n0; j < n2; j += 8 * nthread) { + for (i = 0; i < n1; i++) { + t[2 * i] = a[i][j]; + t[2 * i + 1] = a[i][j + 1]; + t[2 * n1 + 2 * i] = a[i][j + 2]; + t[2 * n1 + 2 * i + 1] = a[i][j + 3]; + t[4 * n1 + 2 * i] = a[i][j + 4]; + t[4 * n1 + 2 * i + 1] = a[i][j + 5]; + t[6 * n1 + 2 * i] = a[i][j + 6]; + t[6 * n1 + 2 * i + 1] = a[i][j + 7]; + } + cdft(2 * n1, isgn, t, ip, w); + cdft(2 * n1, isgn, &t[2 * n1], ip, w); + cdft(2 * n1, isgn, &t[4 * n1], ip, w); + cdft(2 * n1, isgn, &t[6 * n1], ip, w); + for (i = 0; i < n1; i++) { + a[i][j] = t[2 * i]; + a[i][j + 1] = t[2 * i + 1]; + a[i][j + 2] = t[2 * n1 + 2 * i]; + a[i][j + 3] = t[2 * n1 + 2 * i + 1]; + a[i][j + 4] = t[4 * n1 + 2 * i]; + a[i][j + 5] = t[4 * n1 + 2 * i + 1]; + a[i][j + 6] = t[6 * n1 + 2 * i]; + a[i][j + 7] = t[6 * n1 + 2 * i + 1]; + } + } + } else if (n2 == 4 * nthread) { + for (i = 0; i < n1; i++) { + t[2 * i] = a[i][4 * n0]; + t[2 * i + 1] = a[i][4 * n0 + 1]; + t[2 * n1 + 2 * i] = a[i][4 * n0 + 2]; + t[2 * n1 + 2 * i + 1] = a[i][4 * n0 + 3]; + } + cdft(2 * n1, isgn, t, ip, w); + cdft(2 * n1, isgn, &t[2 * n1], ip, w); + for (i = 0; i < n1; i++) { + a[i][4 * n0] = t[2 * i]; + a[i][4 * n0 + 1] = t[2 * i + 1]; + a[i][4 * n0 + 2] = t[2 * n1 + 2 * i]; + a[i][4 * n0 + 3] = t[2 * n1 + 2 * i + 1]; + } + } else if (n2 == 2 * nthread) { + for (i = 0; i < n1; i++) { + t[2 * i] = a[i][2 * n0]; + t[2 * i + 1] = a[i][2 * n0 + 1]; + } + cdft(2 * n1, isgn, t, ip, w); + for (i = 0; i < n1; i++) { + a[i][2 * n0] = t[2 * i]; + a[i][2 * n0 + 1] = t[2 * i + 1]; + } + } + return (void*) 0; +} + +void* ddxt2d0_th(void* p) +{ + void ddct(int n, int isgn, float* a, int* ip, float* w); + void ddst(int n, int isgn, float* a, int* ip, float* w); + int nthread, n0, n1, n2, ics, isgn, *ip, i; + float **a, *w; + + nthread = ((fft2d_arg_t*) p)->nthread; + n0 = ((fft2d_arg_t*) p)->n0; + n1 = ((fft2d_arg_t*) p)->n1; + n2 = ((fft2d_arg_t*) p)->n2; + ics = ((fft2d_arg_t*) p)->ic; + isgn = ((fft2d_arg_t*) p)->isgn; + a = ((fft2d_arg_t*) p)->a; + ip = ((fft2d_arg_t*) p)->ip; + w = ((fft2d_arg_t*) p)->w; + if (ics == 0) { + for (i = n0; i < n1; i += nthread) { + ddct(n2, isgn, a[i], ip, w); + } + } else { + for (i = n0; i < n1; i += nthread) { + ddst(n2, isgn, a[i], ip, w); + } + } + return (void*) 0; +} + +void* ddxt2d_th(void* p) +{ + void ddct(int n, int isgn, float* a, int* ip, float* w); + void ddst(int n, int isgn, float* a, int* ip, float* w); + int nthread, n0, n1, n2, ics, isgn, *ip, i, j; + float **a, *t, *w; + + nthread = ((fft2d_arg_t*) p)->nthread; + n0 = ((fft2d_arg_t*) p)->n0; + n1 = ((fft2d_arg_t*) p)->n1; + n2 = ((fft2d_arg_t*) p)->n2; + ics = ((fft2d_arg_t*) p)->ic; + isgn = ((fft2d_arg_t*) p)->isgn; + a = ((fft2d_arg_t*) p)->a; + t = ((fft2d_arg_t*) p)->t; + ip = ((fft2d_arg_t*) p)->ip; + w = ((fft2d_arg_t*) p)->w; + if (n2 > 2 * nthread) { + for (j = 4 * n0; j < n2; j += 4 * nthread) { + for (i = 0; i < n1; i++) { + t[i] = a[i][j]; + t[n1 + i] = a[i][j + 1]; + t[2 * n1 + i] = a[i][j + 2]; + t[3 * n1 + i] = a[i][j + 3]; + } + if (ics == 0) { + ddct(n1, isgn, t, ip, w); + ddct(n1, isgn, &t[n1], ip, w); + ddct(n1, isgn, &t[2 * n1], ip, w); + ddct(n1, isgn, &t[3 * n1], ip, w); + } else { + ddst(n1, isgn, t, ip, w); + ddst(n1, isgn, &t[n1], ip, w); + ddst(n1, isgn, &t[2 * n1], ip, w); + ddst(n1, isgn, &t[3 * n1], ip, w); + } + for (i = 0; i < n1; i++) { + a[i][j] = t[i]; + a[i][j + 1] = t[n1 + i]; + a[i][j + 2] = t[2 * n1 + i]; + a[i][j + 3] = t[3 * n1 + i]; + } + } + } else if (n2 == 2 * nthread) { + for (i = 0; i < n1; i++) { + t[i] = a[i][2 * n0]; + t[n1 + i] = a[i][2 * n0 + 1]; + } + if (ics == 0) { + ddct(n1, isgn, t, ip, w); + ddct(n1, isgn, &t[n1], ip, w); + } else { + ddst(n1, isgn, t, ip, w); + ddst(n1, isgn, &t[n1], ip, w); + } + for (i = 0; i < n1; i++) { + a[i][2 * n0] = t[i]; + a[i][2 * n0 + 1] = t[n1 + i]; + } + } else if (n2 == nthread) { + for (i = 0; i < n1; i++) { + t[i] = a[i][n0]; + } + if (ics == 0) { + ddct(n1, isgn, t, ip, w); + } else { + ddst(n1, isgn, t, ip, w); + } + for (i = 0; i < n1; i++) { + a[i][n0] = t[i]; + } + } + return (void*) 0; +} +#endif /* USE_FFT2D_THREADS */ + +NEXTPNR_NAMESPACE_END diff --git a/3rdparty/oourafft/readme.txt b/3rdparty/oourafft/readme.txt new file mode 100644 index 00000000..145a081b --- /dev/null +++ b/3rdparty/oourafft/readme.txt @@ -0,0 +1,167 @@ +General Purpose FFT (Fast Fourier/Cosine/Sine Transform) Package + +Description: + A package to calculate Discrete Fourier/Cosine/Sine Transforms of + 1-dimensional sequences of length 2^N. + +Files: + fft4g.c : FFT Package in C - Fast Version I (radix 4,2) + fft4g.f : FFT Package in Fortran - Fast Version I (radix 4,2) + fft4g_h.c : FFT Package in C - Simple Version I (radix 4,2) + fft8g.c : FFT Package in C - Fast Version II (radix 8,4,2) + fft8g.f : FFT Package in Fortran - Fast Version II (radix 8,4,2) + fft8g_h.c : FFT Package in C - Simple Version II (radix 8,4,2) + fftsg.c : FFT Package in C - Fast Version III (Split-Radix) + fftsg.f : FFT Package in Fortran - Fast Version III (Split-Radix) + fftsg_h.c : FFT Package in C - Simple Version III (Split-Radix) + readme.txt : Readme File + sample1/ : Test Directory + Makefile : for gcc, cc + Makefile.f77: for Fortran + testxg.c : Test Program for "fft*g.c" + testxg.f : Test Program for "fft*g.f" + testxg_h.c : Test Program for "fft*g_h.c" + sample2/ : Benchmark Directory + Makefile : for gcc, cc + Makefile.pth: POSIX Thread version + pi_fft.c : PI(= 3.1415926535897932384626...) Calculation Program + for a Benchmark Test for "fft*g.c" + +Difference of the Files: + C and Fortran versions are equal and + the same routines are in each version. + "fft4g*.*" are optimized for most machines. + "fft8g*.*" are fast on the UltraSPARC. + "fftsg*.*" are optimized for the machines that + have the multi-level (L1,L2,etc) cache. + The simple versions "fft*g_h.c" use no work area, but + the fast versions "fft*g.*" use work areas. + The fast versions "fft*g.*" have the same specification. + +Routines in the Package: + cdft: Complex Discrete Fourier Transform + rdft: Real Discrete Fourier Transform + ddct: Discrete Cosine Transform + ddst: Discrete Sine Transform + dfct: Cosine Transform of RDFT (Real Symmetric DFT) + dfst: Sine Transform of RDFT (Real Anti-symmetric DFT) + +Usage: + Please refer to the comments in the "fft**.*" file which + you want to use. Brief explanations are in the block + comments of each package. The examples are also given in + the test programs. + +Method: + -------- cdft -------- + fft4g*.*, fft8g*.*: + A method of in-place, radix 2^M, Sande-Tukey (decimation in + frequency). Index of the butterfly loop is in bit + reverse order to keep continuous memory access. + fftsg*.*: + A method of in-place, Split-Radix, recursive fast + algorithm. + -------- rdft -------- + A method with a following butterfly operation appended to "cdft". + In forward transform : + A[k] = sum_j=0^n-1 a[j]*W(n)^(j*k), 0<=k<=n/2, + W(n) = exp(2*pi*i/n), + this routine makes an array x[] : + x[j] = a[2*j] + i*a[2*j+1], 0<=j + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#ifndef ARRAY2D_H +#define ARRAY2D_H + +#include +#include +#include +#include +#include +#include + +#include "nextpnr_assertions.h" +#include "nextpnr_namespaces.h" + +NEXTPNR_NAMESPACE_BEGIN + +template class array2d +{ + public: + array2d() : m_width(0), m_height(0), m_size(0), data(nullptr){}; + array2d(int width, int height) : m_width(width), m_height(height), m_size(width * height) + { + data = new T[m_width * m_height](); + } + array2d(int width, int height, const T &init) : m_width(width), m_height(height), m_size(width * height) + { + data = new T[m_width * m_height]; + std::fill(data, data + (m_width * m_height), init); + } + array2d(const array2d &other) : m_width(other.m_width), m_height(other.m_height), m_size(other.m_size) + { + data = new T[m_size](); + if (m_size > 0) + std::copy(other.data, other.data + (m_width * m_height), data); + } + array2d(array2d &&other) noexcept : m_width(other.m_width), m_height(other.m_height), m_size(other.m_size) + { + data = other.data; + other.data = nullptr; + other.m_size = 0; + } + void reset(int new_width, int new_height, const T &init = {}) + { + if ((new_width * new_height) > m_size) { + delete[] data; + m_size = new_width * new_height; + data = new T[m_size]; + } + m_width = new_width; + m_height = new_height; + std::fill(data, data + (m_width * m_height), init); + } + + int width() const { return m_width; } + int height() const { return m_height; } + T &at(int x, int y) + { + NPNR_ASSERT(x >= 0 && x < m_width); + NPNR_ASSERT(y >= 0 && y < m_height); + return data[y * m_width + x]; + } + T &at(const Loc &l) { return at(l.x, l.y); } + const T &at(int x, int y) const + { + NPNR_ASSERT(x >= 0 && x < m_width); + NPNR_ASSERT(y >= 0 && y < m_height); + return data[y * m_width + x]; + } + const T &at(const Loc &l) const { return at(l.x, l.y); } + ~array2d() { delete[] data; } + struct entry + { + entry(int x, int y, T &value) : x(x), y(y), value(value){}; + int x, y; + T &value; + }; + struct iterator + { + public: + entry operator*() { return {x, y, base->at(x, y)}; } + inline iterator operator++() + { + ++x; + if (x >= base->width()) { + x = 0; + ++y; + } + return *this; + } + inline iterator operator++(int) + { + iterator prior(x, y, base); + ++x; + if (x >= base->width()) { + x = 0; + ++y; + } + return prior; + } + inline bool operator!=(const iterator &other) const { return other.x != x || other.y != y; } + inline bool operator==(const iterator &other) const { return other.x == x && other.y == y; } + + private: + iterator(int x, int y, array2d &base) : x(x), y(y), base(&base){}; + int x, y; + array2d *base; + friend class array2d; + }; + iterator begin() { return {0, 0, *this}; } + iterator end() { return {0, m_height, *this}; } + + void write_csv(const std::string &filename) const + { + std::ofstream out(filename); + NPNR_ASSERT(out); + for (int y = 0; y < m_height; y++) { + for (int x = 0; x < m_width; x++) { + out << at(x, y) << ","; + } + out << std::endl; + } + } + + private: + int m_width, m_height, m_size; + T *data; +}; +NEXTPNR_NAMESPACE_END + +#endif diff --git a/common/kernel/deterministic_rng.h b/common/kernel/deterministic_rng.h index 3aab5a49..73a9ef78 100644 --- a/common/kernel/deterministic_rng.h +++ b/common/kernel/deterministic_rng.h @@ -72,6 +72,8 @@ struct DeterministicRNG } } + float rngf(float n) { return float(rng64() & 0x3fffffff) * (n / float(0x3fffffff)); } + void rngseed(uint64_t seed) { rngstate = seed ? seed : 0x3141592653589793; diff --git a/common/place/placer_static.cc b/common/place/placer_static.cc new file mode 100644 index 00000000..3e105e47 --- /dev/null +++ b/common/place/placer_static.cc @@ -0,0 +1,1296 @@ +/* + * nextpnr -- Next Generation Place and Route + * + * Copyright (C) 2022 gatecat + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#include "placer_static.h" +#include "static_util.h" + +#include +#include +#include +#include +#include +#include +#include +#include "array2d.h" +#include "fast_bels.h" +#include "log.h" +#include "nextpnr.h" +#include "parallel_refine.h" +#include "place_common.h" +#include "placer1.h" +#include "scope_lock.h" +#include "timing.h" +#include "util.h" + +#include "fftsg.h" + +NEXTPNR_NAMESPACE_BEGIN + +using namespace StaticUtil; + +namespace { + +struct PlacerGroup +{ + int total_bels = 0; + double concrete_area = 0; + double dark_area = 0; + double total_area = 0; + array2d loc_area; + + float overlap = 0; + + array2d conc_density; // excludes fillers and dark nodes + array2d density; + // FFT related data (TODO: should these be per group?) + FFTArray density_fft; + FFTArray electro_phi; + FFTArray electro_fx, electro_fy; +}; + +// Could be an actual concrete netlist cell; or just a spacer +struct MoveCell +{ + StaticRect rect; + // TODO: multiple contiguous vectors is probably faster than an array of structs, but also messier + RealPair pos, ref_pos, last_pos, last_ref_pos; + RealPair ref_wl_grad, wl_grad, last_wl_grad; + RealPair ref_dens_grad, dens_grad, last_dens_grad; + RealPair ref_total_grad, total_grad, last_total_grad; + int32_t pin_count; + int16_t group; + int16_t bx, by; // bins + bool is_fixed : 1; + bool is_spacer : 1; + bool is_dark : 1; +}; + +// Extra data for cells that aren't spacers +struct ConcreteCell +{ + CellInfo *base_cell; + // When cells are macros; we split them up into chunks + // based on dx/dy location + int32_t macro_idx = -1; + int16_t chunk_dx = 0, chunk_dy = 0; +}; + +struct ClusterGroupKey +{ + ClusterGroupKey(int dx = 0, int dy = 0, int group = -1) : dx(dx), dy(dy), group(group){}; + bool operator==(const ClusterGroupKey &other) const + { + return dx == other.dx && dy == other.dy && group == other.group; + } + unsigned hash() const { return mkhash(mkhash(dx, dy), group); } + int16_t dx, dy, group; +}; + +struct PlacerMacro +{ + CellInfo *root; + std::vector conc_cells; + dict> cells; +}; + +struct PlacerBin +{ + float density; + // ... +}; + +struct PlacerPort +{ + // for wirelength data + static constexpr float invalid = std::numeric_limits::lowest(); + + PortRef ref; + RealPair max_exp{invalid, invalid}; + RealPair min_exp{invalid, invalid}; + bool has_max_exp(Axis axis) const { return max_exp.at(axis) != invalid; } + bool has_min_exp(Axis axis) const { return min_exp.at(axis) != invalid; } +}; + +struct PlacerNet +{ + NetInfo *ni; + bool skip = false; + RealPair b1, b0; // real bounding box + RealPair min_exp, x_min_exp; + RealPair max_exp, x_max_exp; + RealPair wa_wl; + // lines up with user indexes; plus one for driver + std::vector ports; + int hpwl() { return (b1.x - b0.x) + (b1.y - b0.y); } +}; + +class StaticPlacer +{ + Context *ctx; + PlacerStaticCfg cfg; + + std::vector mcells; + std::vector ccells; + std::vector macros; + std::vector groups; + std::vector nets; + idict cluster2idx; + + FastBels fast_bels; + TimingAnalyser tmg; + + int width, height; + int iter = 0; + bool fft_debug = false; + + // legalisation queue + std::priority_queue> to_legalise; + + void prepare_cells() + { + for (auto &cell : ctx->cells) { + CellInfo *ci = cell.second.get(); + ci->udata = -1; + // process legacy-ish bel attributes + if (ci->attrs.count(ctx->id("BEL")) && ci->bel == BelId()) { + std::string loc_name = ci->attrs.at(ctx->id("BEL")).as_string(); + BelId bel = ctx->getBelByNameStr(loc_name); + NPNR_ASSERT(ctx->isValidBelForCellType(ci->type, bel)); + NPNR_ASSERT(ctx->checkBelAvail(bel)); + ctx->bindBel(bel, ci, STRENGTH_USER); + } + } + } + + bool lookup_group(IdString type, int &group, StaticRect &rect) + { + for (size_t i = 0; i < cfg.cell_groups.size(); i++) { + const auto &g = cfg.cell_groups.at(i); + if (g.cell_area.count(type)) { + group = i; + rect = g.cell_area.at(type); + return true; + } + } + return false; + } + + void init_bels() + { + log_info("⌁ initialising bels...\n"); + width = 0; + height = 0; + for (auto bel : ctx->getBels()) { + Loc loc = ctx->getBelLocation(bel); + width = std::max(width, loc.x + 1); + height = std::max(height, loc.y + 1); + } + dict beltype2group; + for (int i = 0; i < int(groups.size()); i++) { + groups.at(i).loc_area.reset(width, height); + for (const auto &bel_type : cfg.cell_groups.at(i).cell_area) + beltype2group[bel_type.first] = i; + } + for (auto bel : ctx->getBels()) { + Loc loc = ctx->getBelLocation(bel); + IdString type = ctx->getBelType(bel); + auto fnd = beltype2group.find(type); + if (fnd == beltype2group.end()) + continue; + auto size = cfg.cell_groups.at(fnd->second).bel_area.at(type); // TODO: do we care about dimensions too + auto &group = groups.at(fnd->second); + for (int dy = 0; dy <= int(size.h); dy++) { + for (int dx = 0; dx <= int(size.w); dx++) { + float h = (dy == int(size.h)) ? (size.h - int(size.h)) : 1; + float w = (dx == int(size.w)) ? (size.w - int(size.w)) : 1; + group.loc_area.at(loc.x + dx, loc.y + dy) += w * h; + } + } + group.total_area += size.area(); + group.total_bels += 1; + } + } + + void init_nets() + { + nets.reserve(ctx->nets.size()); + for (auto &net : ctx->nets) { + NetInfo *ni = net.second.get(); + ni->udata = nets.size(); + nets.emplace_back(); + + auto &nd = nets.back(); + nd.ni = ni; + nd.skip = (ni->driver.cell == nullptr); // (or global buffer?) + nd.ports.resize(ni->users.capacity() + 1); // +1 for the driver + nd.ports.back().ref = ni->driver; + for (auto usr : ni->users.enumerate()) { + nd.ports.at(usr.index.idx()).ref = usr.value; + } + } + } + + int add_cell(StaticRect rect, int group, RealPair pos, CellInfo *ci = nullptr) + { + int idx = mcells.size(); + mcells.emplace_back(); + auto &m = mcells.back(); + m.rect = rect; + m.group = group; + m.pos = pos; + if (ci) { + // Is a concrete cell (might be a macro, in which case ci is just one of them...) + // Can't add concrete cells once we have spacers (we define it such that indices line up between mcells and + // ccells; spacer cells only exist in mcells) + NPNR_ASSERT(idx == int(ccells.size())); + ccells.emplace_back(); + auto &c = ccells.back(); + c.base_cell = ci; + groups.at(group).concrete_area += rect.area(); + } else { + // Is a spacer cell + m.is_spacer = true; + } + return idx; + } + + const float pi = 3.141592653589793f; + + RealPair rand_loc() + { + // Box-muller + float u1 = ctx->rngf(1.0f); + while (u1 < 1e-5) + u1 = ctx->rngf(1.0f); + float u2 = ctx->rngf(1.0f); + float m = std::sqrt(-2.f * std::log(u1)); + float z0 = m * std::cos(2.f * pi * u2); + float z1 = m * std::sin(2.f * pi * u2); + float x = (width / 2.f) + (width / 50.f) * z0; + float y = (height / 2.f) + (height / 50.f) * z1; + x = std::min(width - 1.f, std::max(x, 0)); + y = std::min(height - 1.f, std::max(y, 0)); + return RealPair(x, y); + } + + RealPair cell_loc(CellInfo *ci, bool ref) + { + if (ci->udata == -1) { + // not handled? + NPNR_ASSERT(ci->bel != BelId()); // already fixed + return RealPair(ctx->getBelLocation(ci->bel), 0.5f); + } else { + return ref ? mcells.at(ci->udata).ref_pos : mcells.at(ci->udata).pos; + } + } + + void init_cells() + { + log_info("⌁ initialising cells...\n"); + // Process non-clustered cells and find clusters + for (auto &cell : ctx->cells) { + CellInfo *ci = cell.second.get(); + int cell_group; + StaticRect rect; + // Mismatched group case + if (!lookup_group(ci->type, cell_group, rect)) { + // TODO: what is the best thing to do here? singletons/odd cells we can probably mostly randomly place + continue; + } + if (ci->cluster != ClusterId()) { + // Defer processing of macro clusters + int c_idx = cluster2idx(ci->cluster); + if (c_idx >= int(macros.size())) { + macros.emplace_back(); + macros.back().root = ctx->getClusterRootCell(ci->cluster); + } + auto &m = macros.at(c_idx); + Loc delta = ctx->getClusterOffset(ci); + m.cells[ClusterGroupKey(delta.x, delta.y, cell_group)].push_back(ci); + } else { + // Non-clustered cells can be processed already + int idx = add_cell(rect, cell_group, rand_loc(), ci); + ci->udata = idx; + auto &mc = mcells.at(idx); + mc.pin_count += int(ci->ports.size()); + if (ci->bel != BelId()) { + // Currently; treat all ready-placed cells as fixed (eventually we might do incremental ripups + // here...) + Loc loc = ctx->getBelLocation(ci->bel); + mc.pos.x = loc.x + 0.5; + mc.pos.y = loc.y + 0.5; + mc.is_fixed = true; + } + } + } + // Process clustered cells + for (int i = 0; i < int(macros.size()); i++) { + auto &m = macros.at(i); + for (auto &kv : m.cells) { + const auto &g = cfg.cell_groups.at(kv.first.group); + // Only treat zero-area cells as zero-area; if this cluster also contains non-zero area cells + bool has_nonzero = std::any_of(kv.second.begin(), kv.second.end(), + [&](const CellInfo *ci) { return !g.zero_area_cells.count(ci->type); }); + StaticRect cluster_size; + for (auto ci : kv.second) { + if (has_nonzero && g.zero_area_cells.count(ci->type)) + continue; + // Compute an equivalent-area stacked rectange for cells in this cluster group. + // There are probably some ugly cases this handles badly. + StaticRect r = g.cell_area.at(ci->type); + if (r.w > r.h) { + // Long and thin, "stack" vertically + // Compute height we add to stack + if (cluster_size.w < r.w) { + cluster_size.h *= (cluster_size.w / r.w); + cluster_size.w = r.w; + } + cluster_size.h += ((r.w * r.h) / cluster_size.w); + } else { + // "stack" horizontally + if (cluster_size.h < r.h) { + cluster_size.w *= (cluster_size.h / r.h); + cluster_size.h = r.h; + } + cluster_size.w += ((r.w * r.h) / cluster_size.h); + } + } + // Now add the moveable cell + if (cluster_size.area() > 0) { + int idx = add_cell(cluster_size, kv.first.group, rand_loc(), kv.second.front()); + auto &mc = mcells.at(idx); + if (kv.second.front()->bel != BelId()) { + // Currently; treat all ready-placed cells as fixed (eventually we might do incremental ripups + // here...) + Loc loc = ctx->getBelLocation(kv.second.front()->bel); + mc.pos.x = loc.x + 0.5; + mc.pos.y = loc.y + 0.5; + mc.is_fixed = true; + } + for (auto ci : kv.second) { + ci->udata = idx; + mc.pin_count += int(ci->ports.size()); + } + auto &cc = ccells.at(idx); + cc.macro_idx = i; + cc.chunk_dx = kv.first.dx; + cc.chunk_dy = kv.first.dy; + m.conc_cells.push_back(idx); + } + } + } + } + + const double target_util = 0.8; + + void insert_dark() + { + log_info("⌁ inserting dark nodes...\n"); + for (int group = 0; group < int(groups.size()); group++) { + auto &g = groups.at(group); + for (auto tile : g.loc_area) { + if (tile.value > 0.5f) + continue; + StaticRect dark_area(1.0f, 1.0f - tile.value); + int cell_idx = add_cell(dark_area, group, RealPair(tile.x + 0.5f, tile.y + 0.5f), nullptr /*spacer*/); + mcells.at(cell_idx).is_dark = true; + } + } + } + + void insert_spacer() + { + log_info("⌁ inserting spacers...\n"); + int inserted_spacers = 0; + for (int group = 0; group < int(groups.size()); group++) { + const auto &cg = cfg.cell_groups.at(group); + const auto &g = groups.at(group); + double util = g.concrete_area / g.total_area; + log_info("⌁ group %s pre-spacer utilisation %.02f%% (target %.02f%%)\n", ctx->nameOf(cg.name), + (util * 100.0), (target_util * 100.0)); + // TODO: better computation of spacer size and placement? + int spacer_count = (g.total_area * target_util - g.concrete_area) / cg.spacer_rect.area(); + if (spacer_count <= 0) + continue; + for (int i = 0; i < spacer_count; i++) { + add_cell(cg.spacer_rect, group, RealPair(ctx->rngf(width), ctx->rngf(height)), nullptr /*spacer*/); + ++inserted_spacers; + } + } + log_info("⌁ inserted a total of %d spacers\n", inserted_spacers); + } + + // TODO: dark node insertion when we have obstructions or non-rectangular placement regions + int m; + double bin_w, bin_h; + + std::vector cs_table_fft; + std::vector work_area_fft; + + void prepare_density_bins() + { + // TODO: a m x m grid follows the paper and makes the DCTs easier, but is it actually ideal for non-square + // FPGAs? + m = 1 << int(std::ceil(std::log2(std::sqrt(mcells.size() / groups.size())))); + bin_w = double(width) / m; + bin_h = double(height) / m; + + for (auto &g : groups) { + g.density.reset(m, m, 0); + g.density_fft.reset(m, m, 0); + g.electro_phi.reset(m, m, 0); + g.electro_fx.reset(m, m, 0); + g.electro_fy.reset(m, m, 0); + } + cs_table_fft.resize(m * 3 / 2, 0); + work_area_fft.resize(std::round(std::sqrt(m)) + 2, 0); + work_area_fft.at(0) = 0; + } + + template void iter_slithers(RealPair pos, StaticRect rect, TFunc func) + { + // compute the stamp over bins (this could probably be more efficient?) + + double width = rect.w, height = rect.h; + double scaled_density = 1.0; + if (width < bin_w) { + scaled_density *= (width / bin_w); + width = bin_w; + } + if (height < bin_h) { + scaled_density *= (height / bin_h); + height = bin_h; + } + + double x0 = pos.x - (width / 2), x1 = pos.x + (width / 2); + double y0 = pos.y - (height / 2), y1 = pos.y + (height / 2); + for (int y = int(y0 / bin_h); y <= int(y1 / bin_h); y++) { + for (int x = int(x0 / bin_w); x <= int(x1 / bin_w); x++) { + int xb = std::max(0, std::min(x, m - 1)); + int yb = std::max(0, std::min(y, m - 1)); + double slither_w = 1.0, slither_h = 1.0; + if (yb == int(y0 / bin_h)) // y slithers + slither_h = ((yb + 1) * bin_h) - y0; + else if (yb == int(y1 / bin_h)) + slither_h = (y1 - yb * bin_h); + if (xb == int(x0 / bin_w)) // x slithers + slither_w = ((xb + 1) * bin_w) - x0; + else if (xb == int(x1 / bin_w)) + slither_w = (x1 - xb * bin_w); + func(xb, yb, scaled_density * slither_w * slither_h); + } + } + }; + + void compute_density(int group, bool ref) + { + auto &g = groups.at(group); + // reset + for (auto entry : g.density) + entry.value = 0; + // populate + for (int idx = 0; idx < int(mcells.size()); idx++) { + auto &mc = mcells.at(idx); + if (mc.group != group) + continue; + // scale width and height to be at least one bin (local density smoothing from the eplace paper) + // TODO: should we really do this every iteration? + + auto pos = ref ? mc.ref_pos : mc.pos; + iter_slithers(pos, mc.rect, [&](int x, int y, float area) { g.density.at(x, y) += area; }); + } + } + + void compute_overlap() + { + // populate for concrete cells only + for (auto &g : groups) + g.conc_density.reset(m, m, 0); + for (int idx = 0; idx < int(ccells.size()); idx++) { + auto &mc = mcells.at(idx); + auto &g = groups.at(mc.group); + // scale width and height to be at least one bin (local density smoothing from the eplace paper) + // TODO: should we really do this every iteration? + auto loc = mc.pos; + auto size = mc.rect; + + for (int dy = 0; dy <= int(size.h); dy++) { + for (int dx = 0; dx <= int(size.w); dx++) { + float h = (dy == int(size.h)) ? (size.h - int(size.h)) : 1; + float w = (dx == int(size.w)) ? (size.w - int(size.w)) : 1; + g.conc_density.at(loc.x + dx, loc.y + dy) += w * h; + } + } + } + std::string overlap_str = ""; + for (int idx = 0; idx < int(groups.size()); idx++) { + auto &g = groups.at(idx); + g.overlap = 0; + float total_area = 0; + for (auto tile : g.loc_area) { + // g.overlap += std::max(0, g.conc_density.at(tile.x, tile.y) - tile.value); + g.overlap += std::max(0, g.conc_density.at(tile.x, tile.y) - 1); + total_area += g.conc_density.at(tile.x, tile.y); + } + g.overlap /= std::max(1.0f, total_area); + if (!overlap_str.empty()) + overlap_str += ", "; + overlap_str += stringf("%s=%.1f%%", cfg.cell_groups.at(idx).name.c_str(ctx), g.overlap*100); + g.conc_density.write_csv(stringf("out_conc_density_%d_%d.csv", iter, idx)); + } + log_info("overlap: %s\n", overlap_str.c_str()); + } + + void run_fft(int group) + { + // get data into form that fft wants + auto &g = groups.at(group); + for (auto entry : g.density) + g.density_fft.at(entry.x, entry.y) = entry.value; + if (fft_debug) + g.density_fft.write_csv(stringf("out_bin_density_%d_%d.csv", iter, group)); + // Based on + // https://github.com/ALIGN-analoglayout/ALIGN-public/blob/master/PlaceRouteHierFlow/EA_placer/FFT/fft.cpp + // initial DCT for coefficients + ddct2d(m, m, -1, g.density_fft.data(), nullptr, work_area_fft.data(), cs_table_fft.data()); + // postprocess coefficients + for (int x = 0; x < m; x++) + g.density_fft.at(x, 0) *= 0.5f; + for (int y = 0; y < m; y++) + g.density_fft.at(0, y) *= 0.5f; + for (int x = 0; x < m; x++) + for (int y = 0; y < m; y++) + g.density_fft.at(x, y) *= (4.0f / (m * m)); + // scale inputs to IDCT for potentials and field + for (int x = 0; x < m; x++) { + float wx = pi * (x / float(m)); + float wx2 = wx * wx; + for (int y = 0; y < m; y++) { + float wy = pi * (y / float(m)); + float wy2 = wy * wy; + + float dens = g.density_fft.at(x, y); + float phi = 0, ex = 0, ey = 0; + + if (x != 0 || y != 0) { // avoid divide by zero... + phi = dens / (wx2 + wy2); + ex = phi * wx; + ey = phi * wy; + } + + g.electro_phi.at(x, y) = phi; + g.electro_fx.at(x, y) = ex; + g.electro_fy.at(x, y) = ey; + } + } + // IDCT for potential; 2D derivatives for field + ddct2d(m, m, 1, g.electro_phi.data(), nullptr, work_area_fft.data(), cs_table_fft.data()); + ddsct2d(m, m, 1, g.electro_fx.data(), nullptr, work_area_fft.data(), cs_table_fft.data()); + ddcst2d(m, m, 1, g.electro_fy.data(), nullptr, work_area_fft.data(), cs_table_fft.data()); + if (fft_debug) { + g.electro_phi.write_csv(stringf("out_bin_phi_%d_%d.csv", iter, group)); + g.electro_fx.write_csv(stringf("out_bin_ex_%d_%d.csv", iter, group)); + g.electro_fy.write_csv(stringf("out_bin_ey_%d_%d.csv", iter, group)); + } + } + + void compute_bounds(PlacerNet &net, Axis axis, bool ref) + { + NetInfo *ni = net.ni; + auto drv_loc = cell_loc(ni->driver.cell, ref); + // seed with driver location + net.b1.at(axis) = net.b0.at(axis) = drv_loc.at(axis); + // iterate over users + for (auto &usr : ni->users) { + auto usr_loc = cell_loc(usr.cell, ref); + net.b1.at(axis) = std::max(net.b1.at(axis), usr_loc.at(axis)); + net.b0.at(axis) = std::min(net.b0.at(axis), usr_loc.at(axis)); + } + } + + RealPair wl_coeff{0.5f, 0.5f}; + + void update_nets(Axis axis, bool ref) + { + static constexpr float min_wirelen_force = -300.f; + for (auto &net : nets) { + if (net.skip) + continue; + net.min_exp.at(axis) = 0; + net.x_min_exp.at(axis) = 0; + net.max_exp.at(axis) = 0; + net.x_max_exp.at(axis) = 0; + // update bounding box + compute_bounds(net, axis, ref); + // compute rough center to subtract from exponents to avoid FP issues (from replace) + float c = (net.b1.at(axis) + net.b0.at(axis)) / 2.f; + for (auto &port : net.ports) { + if (!port.ref.cell) + continue; + RealPair loc = cell_loc(port.ref.cell, ref); + // update weighted-average model exponents + float emin = (c - loc.at(axis)) * wl_coeff.at(axis); + float emax = (loc.at(axis) - c) * wl_coeff.at(axis); + + if (emin > min_wirelen_force) { + port.min_exp.at(axis) = std::exp(emin); + net.min_exp.at(axis) += port.min_exp.at(axis); + net.x_min_exp.at(axis) += loc.at(axis) * port.min_exp.at(axis); + } else { + port.min_exp.at(axis) = PlacerPort::invalid; + } + if (emax > min_wirelen_force) { + port.max_exp.at(axis) = std::exp(emax); + net.max_exp.at(axis) += port.max_exp.at(axis); + net.x_max_exp.at(axis) += loc.at(axis) * port.max_exp.at(axis); + } else { + port.max_exp.at(axis) = PlacerPort::invalid; + } + } + net.wa_wl.at(axis) = + (net.x_max_exp.at(axis) / net.max_exp.at(axis)) - (net.x_min_exp.at(axis) / net.min_exp.at(axis)); + } + } + + float wirelen_grad(CellInfo *cell, Axis axis, bool ref) + { + float gradient = 0; + if (cell->udata == -1) + return 0; + RealPair loc = cell_loc(cell, ref); + for (auto &port : cell->ports) { + NetInfo *ni = port.second.net; + if (!ni) + continue; + auto &nd = nets.at(ni->udata); + if (nd.skip) + continue; + auto &pd = nd.ports.at(port.second.type == PORT_OUT ? (nd.ports.size() - 1) : port.second.user_idx.idx()); + // From Replace + // TODO: check these derivatives on paper + float d_min = 0, d_max = 0; + if (pd.has_min_exp(axis)) { + float min_sum = nd.min_exp.at(axis), x_min_sum = nd.x_min_exp.at(axis); + d_min = (min_sum * (pd.min_exp.at(axis) * (1.0f - wl_coeff.at(axis) * loc.at(axis))) + + wl_coeff.at(axis) * pd.min_exp.at(axis) * x_min_sum) / + (min_sum * min_sum); + } + if (pd.has_max_exp(axis)) { + float max_sum = nd.max_exp.at(axis), x_max_sum = nd.x_max_exp.at(axis); + d_max = (max_sum * (pd.max_exp.at(axis) * (1.0f + wl_coeff.at(axis) * loc.at(axis))) - + wl_coeff.at(axis) * pd.max_exp.at(axis) * x_max_sum) / + (max_sum * max_sum); + } + gradient += (d_min - d_max); + } + + return gradient; + } + + float dens_penalty = 0.025f; + float nesterov_a = 1.0f; + + void update_gradients(bool ref = true, bool set_prev = true, bool init_penalty = false) + { + // TODO: skip non-group cells more efficiently? + for (int group = 0; group < int(groups.size()); group++) { + compute_density(group, ref); + run_fft(group); + } + for (auto axis : {Axis::X, Axis::Y}) { + update_nets(axis, ref); + } + // First loop: back up gradients if required; set to zero; and compute density gradient + for (auto &cell : mcells) { + auto &g = groups.at(cell.group); + if (set_prev && ref) { + cell.last_wl_grad = cell.ref_wl_grad; + cell.last_dens_grad = cell.ref_dens_grad; + cell.last_total_grad = cell.ref_total_grad; + } + // wirelength gradient updated based on cell instances in next loop + (ref ? cell.ref_wl_grad : cell.wl_grad) = RealPair(0, 0); + // density grad based on bins - do we need to interpolate? + auto &grad = (ref ? cell.ref_dens_grad : cell.dens_grad); + grad = RealPair(0, 0); + iter_slithers((ref ? cell.ref_pos : cell.pos), cell.rect, [&](int x, int y, float area) { + grad += RealPair(g.electro_fx.at(x, y) * area, g.electro_fy.at(x, y) * area); + }); + // total gradient computed at the end + (ref ? cell.ref_total_grad : cell.total_grad) = RealPair(0, 0); + } + // Second loop: sum up wirelength gradients across concrete cell instances + for (auto &cell : ctx->cells) { + CellInfo *ci = cell.second.get(); + if (ci->udata == -1) + continue; + auto &mc = mcells.at(ci->udata); + // TODO: exploit parallelism across axes + float wl_gx = wirelen_grad(ci, Axis::X, ref); + float wl_gy = wirelen_grad(ci, Axis::Y, ref); + (ref ? mc.ref_wl_grad : mc.wl_grad) += RealPair(wl_gx, wl_gy); + } + if (init_penalty) { + // set initial density penalty + float wirelen_sum = 0; + float force_sum = 0; + for (auto &cell : ctx->cells) { + CellInfo *ci = cell.second.get(); + if (ci->udata == -1) + continue; + auto &mc = mcells.at(ci->udata); + wirelen_sum += std::abs(mc.ref_wl_grad.x) + std::abs(mc.ref_wl_grad.y); + force_sum += std::abs(mc.ref_dens_grad.x) + std::abs(mc.ref_dens_grad.y); + } + dens_penalty = wirelen_sum / force_sum; + log_info(" initial density penalty: %f\n", dens_penalty); + } + // Third loop: compute total gradient, and precondition + // TODO: ALM as well as simple penalty + for (auto &cell : mcells) { +#if 0 + if (!cell.is_spacer) { + printf("%d (%f, %f) wirelen_grad: (%f,%f) density_grad: (%f,%f)\n", iter, cell.ref_pos.x, + cell.ref_pos.y, cell.ref_wl_grad.x, cell.ref_wl_grad.y, cell.ref_dens_grad.x, + cell.ref_dens_grad.y); + } +#endif + // Preconditioner from replace for now + float precond = std::max(1.0f, float(cell.pin_count) + dens_penalty * cell.rect.area()); + (ref ? cell.ref_total_grad : cell.total_grad) = + (((ref ? cell.ref_wl_grad : cell.wl_grad) * -1) - + (ref ? cell.ref_dens_grad : cell.dens_grad) * dens_penalty) / + precond; + } + } + + float steplen = 0.01; + + float get_steplen() + { + float coord_dist = 0; + float grad_dist = 0; + int n = 0; + for (auto &cell : mcells) { + coord_dist += (cell.ref_pos.x - cell.last_ref_pos.x) * (cell.ref_pos.x - cell.last_ref_pos.x); + coord_dist += (cell.ref_pos.y - cell.last_ref_pos.y) * (cell.ref_pos.y - cell.last_ref_pos.y); + grad_dist += + (cell.ref_total_grad.x - cell.last_total_grad.x) * (cell.ref_total_grad.x - cell.last_total_grad.x); + grad_dist += + (cell.ref_total_grad.y - cell.last_total_grad.y) * (cell.ref_total_grad.y - cell.last_total_grad.y); + // keep track of N separately, because we might exclude some cells + ++n; + } + coord_dist = std::sqrt(coord_dist / (2 * float(mcells.size()))); + grad_dist = std::sqrt(grad_dist / (2 * float(mcells.size()))); + log_info("coord_dist: %f grad_dist: %f\n", coord_dist, grad_dist); + return coord_dist / grad_dist; + // return 0.1; + } + + float system_hpwl() + { + float hpwl = 0; + for (auto &net : nets) { + if (net.skip) + continue; + // update bounding box + compute_bounds(net, Axis::X, false); + compute_bounds(net, Axis::Y, false); + hpwl += net.b1.x - net.b0.x; + hpwl += net.b1.y - net.b0.y; + } + return hpwl; + } + + float system_potential() + { + float pot = 0; + for (auto &cell : mcells) { + auto &g = groups.at(cell.group); + iter_slithers(cell.ref_pos, cell.rect, + [&](int x, int y, float area) { pot += g.electro_phi.at(x, y) * area; }); + } + return pot; + } + + void initialise() + { + float initial_steplength = 0.01f; + // Update current and previous gradients with initial solution + for (auto &cell : mcells) { + cell.ref_pos = cell.pos; + } + while (true) { + update_gradients(true, true, /* init_penalty */ true); + // compute a "fake" previous position based on an arbitrary steplength and said gradients for nesterov + for (auto &cell : mcells) { + if (cell.is_fixed) + continue; + // save current position in last_pos + cell.last_pos = cell.pos; + cell.last_ref_pos = cell.ref_pos; + // compute previous position; but store it in current for gradient computation + cell.ref_pos = cell.pos - cell.ref_total_grad * initial_steplength; + } + // Compute the previous gradients (albeit into the current state fields) + update_gradients(true); + // Now we have the fake previous state in the current state + for (auto &cell : mcells) { + std::swap(cell.last_ref_pos, cell.ref_pos); + std::swap(cell.ref_total_grad, cell.last_total_grad); + std::swap(cell.ref_wl_grad, cell.last_wl_grad); + std::swap(cell.ref_dens_grad, cell.last_dens_grad); + } + float next_steplen = get_steplen(); + log_info("initial steplen=%f next steplen = %f\n", initial_steplength, next_steplen); + if (next_steplen != 0 && std::isfinite(next_steplen) && std::abs(next_steplen) < 1e10) { + break; + } else { + initial_steplength *= 10; + } + } + } + + RealPair clamp_loc(RealPair loc) + { + return RealPair(std::max(0, std::min(width - 1, loc.x)), + std::max(0, std::min(height - 1, loc.y))); + } + + void update_chains() + { + // Move the post-solve position of a chain towards be the weighted average of its constituents + // The strength increases with iterations + float alpha = std::min(std::pow(1.002f, iter) - 1, 1.0f); + for (int i = 0; i < int(macros.size()); i++) { + auto ¯o = macros.at(i); + float total_area = 0; + const float area_epsilon = 0.05; + RealPair pos(0, 0), ref_pos(0, 0); + for (int c : macro.conc_cells) { + auto &mc = mcells.at(c); + float a = std::max(mc.rect.area(), area_epsilon); + pos += mc.pos * a; + ref_pos += mc.ref_pos * a; + total_area += a; + } + pos /= total_area; + ref_pos /= total_area; + for (int c : macro.conc_cells) { + auto &cc = ccells.at(c); + auto &mc = mcells.at(c); + mc.pos = mc.pos * (1-alpha) + (pos + RealPair(cc.chunk_dx, cc.chunk_dy)) * alpha; + mc.ref_pos = mc.ref_pos * (1 - alpha) + (ref_pos + RealPair(cc.chunk_dx, cc.chunk_dy)) * alpha; + } + } + } + + void step() + { + // TODO: update penalties; wirelength factor; etc + steplen = get_steplen(); + log_info("iter=%d steplen=%f a=%f\n", iter, steplen, nesterov_a); + float a_next = (1.0f + std::sqrt(4.0f * nesterov_a * nesterov_a + 1)) / 2.0f; + // Update positions according to trivial gradient descent (let's leave Nesterov's for later...) + for (auto &cell : mcells) { + if (cell.is_fixed || cell.is_dark) + continue; + // save current position in last_pos + cell.last_ref_pos = cell.ref_pos; + cell.last_pos = cell.pos; + // compute new position + cell.pos = clamp_loc(cell.ref_pos - cell.ref_total_grad * steplen); + // compute reference position + cell.ref_pos = clamp_loc(cell.pos + (cell.pos - cell.last_pos) * ((nesterov_a - 1) / a_next)); + } + nesterov_a = a_next; + update_chains(); + update_gradients(true); + log_info(" system potential: %f hpwl: %f\n", system_potential(), system_hpwl()); + compute_overlap(); + } + + void legalise_step(bool dsp_bram) { + // assume DSP and BRAM are all groups 2+ for now + for (int i = 0; i < int(ccells.size()); i++) { + auto &mc = mcells.at(i); + auto &cc = ccells.at(i); + if (dsp_bram && mc.group < 2) + continue; + if (!dsp_bram && mc.group >= 2) + continue; + if (cc.macro_idx != -1 && i != macros.at(cc.macro_idx).root->udata) + continue; // not macro root + if (mc.is_fixed) { // already placed + NPNR_ASSERT(cc.base_cell->bel != BelId()); + continue; + } + enqueue_legalise(i); + } + // in the DSP/BRAM step, also merge any cells outside of a group (global buffers, other random crud/singletons) + for (auto &cell : ctx->cells) { + if (cell.second->udata == -1) + enqueue_legalise(cell.second.get()); + } + log_info("Strict legalising %d cells...\n", int(to_legalise.size())); + float pre_hpwl = system_hpwl(); + legalise_placement_strict(true); + for (auto axis : {Axis::X, Axis::Y}) { + update_nets(axis, true); + } + float post_hpwl = system_hpwl(); + log_info("HPWL after legalise: %f (delta: %f)\n", post_hpwl, post_hpwl - pre_hpwl); + } + + void enqueue_legalise(int cell_idx) { + NPNR_ASSERT(cell_idx < int(ccells.size())); // we should never be legalising spacers or dark nodes + auto &ccell = ccells.at(cell_idx); + if (ccell.macro_idx != -1) { + // is a macro + auto ¯o = macros.at(ccell.macro_idx); + to_legalise.emplace(int(macro.cells.size()), macro.root->name); + } else { + to_legalise.emplace(1, ccell.base_cell->name); + } + } + + void enqueue_legalise(CellInfo *ci) { + if (ci->udata != -1) { + // managed by static + enqueue_legalise(ci->udata); + } else { + // special case + to_legalise.emplace(1, ci->name); + } + } + // Strict placement legalisation, performed after the initial HeAP spreading + void legalise_placement_strict(bool require_validity = true) + { + // At the moment we don't follow the full HeAP algorithm using cuts for legalisation, instead using + // the simple greedy largest-macro-first approach. + int ripup_radius = 2; + int total_iters = 0; + int total_iters_noreset = 0; + while (!to_legalise.empty()) { + auto top = to_legalise.top(); + to_legalise.pop(); + + CellInfo *ci = ctx->cells.at(top.second).get(); + // Was now placed, ignore + if (ci->bel != BelId()) + continue; + // log_info(" Legalising %s (%s) %d\n", top.second.c_str(ctx), ci->type.c_str(ctx), top.first); + FastBels::FastBelsData *fb; + fast_bels.getBelsForCellType(ci->type, &fb); + int radius = 0; + int iter = 0; + int iter_at_radius = 0; + int total_iters_for_cell = 0; + bool placed = false; + BelId bestBel; + int best_inp_len = std::numeric_limits::max(); + + total_iters++; + total_iters_noreset++; + if (total_iters > int(ccells.size())) { + total_iters = 0; + ripup_radius = std::max(std::max(width+1, height+1), ripup_radius * 2); + } + + if (total_iters_noreset > std::max(5000, 8 * int(ctx->cells.size()))) { + log_error("Unable to find legal placement for all cells, design is probably at utilisation limit.\n"); + } + + while (!placed) { + // Determine a search radius around the solver location (which increases over time) that is clamped to + // the region constraint for the cell (if applicable) + int rx = radius, ry = radius; + + // Pick a random X and Y location within our search radius + int cx, cy; + if (ci->udata == -1) { + cx = width / 2; + cy = height / 2; + } else { + cx = int(mcells.at(ci->udata).pos.x); + cy = int(mcells.at(ci->udata).pos.y); + } + int nx = ctx->rng(2 * rx + 1) + std::max(cx - rx, 0); + int ny = ctx->rng(2 * ry + 1) + std::max(cy - ry, 0); + + iter++; + iter_at_radius++; + if (iter >= (10 * (radius + 1))) { + // No luck yet, increase radius + radius = std::min(std::max(width+1, height+1), radius + 1); + while (radius < std::max(width+1, height+1)) { + // Keep increasing the radius until it will actually increase the number of cells we are + // checking (e.g. BRAM and DSP will not be in all cols/rows), so we don't waste effort + for (int x = std::max(0, cx - radius); + x <= std::min(width+1, cx + radius); x++) { + if (x >= int(fb->size())) + break; + for (int y = std::max(0, cy - radius); + y <= std::min(height+1, cy + radius); y++) { + if (y >= int(fb->at(x).size())) + break; + if (fb->at(x).at(y).size() > 0) + goto notempty; + } + } + radius = std::min(std::max(width+1, height+1), radius + 1); + } + notempty: + iter_at_radius = 0; + iter = 0; + } + // If our randomly chosen cooridnate is out of bounds; or points to a tile with no relevant bels; ignore + // it + if (nx < 0 || nx > width+1) + continue; + if (ny < 0 || ny > height+1) + continue; + + if (nx >= int(fb->size())) + continue; + if (ny >= int(fb->at(nx).size())) + continue; + if (fb->at(nx).at(ny).empty()) + continue; + + // The number of attempts to find a location to try + int need_to_explore = 2 * radius; + + // If we have found at least one legal location; and made enough attempts; assume it's good enough and + // finish + if (iter_at_radius >= need_to_explore && bestBel != BelId()) { + CellInfo *bound = ctx->getBoundBelCell(bestBel); + if (bound != nullptr) { + ctx->unbindBel(bound->bel); + enqueue_legalise(bound); + } + ctx->bindBel(bestBel, ci, STRENGTH_WEAK); + placed = true; + Loc loc = ctx->getBelLocation(bestBel); + if (ci->udata != -1) { + auto &mc = mcells.at(ci->udata); + mc.pos = mc.ref_pos = RealPair(loc, 0.5); + mc.is_fixed = true; + } + break; + } + + if (ci->cluster == ClusterId()) { + // The case where we have no relative constraints + for (auto sz : fb->at(nx).at(ny)) { + // Look through all bels in this tile; checking region constraint if applicable + if (!ci->testRegion(sz)) + continue; + // Prefer available bels; unless we are dealing with a wide radius (e.g. difficult control sets) + // or occasionally trigger a tiebreaker + if (ctx->checkBelAvail(sz) || (radius > ripup_radius || ctx->rng(20000) < 10)) { + CellInfo *bound = ctx->getBoundBelCell(sz); + if (bound != nullptr) { + // Only rip up cells without constraints + if (bound->cluster != ClusterId()) + continue; + ctx->unbindBel(bound->bel); + } + // Provisionally bind the bel + ctx->bindBel(sz, ci, STRENGTH_WEAK); + if (require_validity && !ctx->isBelLocationValid(sz)) { + // New location is not legal; unbind the cell (and rebind the cell we ripped up if + // applicable) + ctx->unbindBel(sz); + if (bound != nullptr) + ctx->bindBel(sz, bound, STRENGTH_WEAK); + } else if (iter_at_radius < need_to_explore) { + // It's legal, but we haven't tried enough locations yet + ctx->unbindBel(sz); + if (bound != nullptr) + ctx->bindBel(sz, bound, STRENGTH_WEAK); + int input_len = 0; + // Compute a fast input wirelength metric at this bel; and save if better than our last + // try + for (auto &port : ci->ports) { + auto &p = port.second; + if (p.type != PORT_IN || p.net == nullptr || p.net->driver.cell == nullptr) + continue; + CellInfo *drv = p.net->driver.cell; + if (drv->udata == -1) + continue; + auto drv_loc = mcells.at(drv->udata); + input_len += std::abs(int(drv_loc.pos.x) - nx) + std::abs(int(drv_loc.pos.y) - ny); + } + if (input_len < best_inp_len) { + best_inp_len = input_len; + bestBel = sz; + } + break; + } else { + // It's legal, and we've tried enough. Finish. + if (bound != nullptr) + enqueue_legalise(bound); + Loc loc = ctx->getBelLocation(sz); + if (ci->udata != -1) { + auto &mc = mcells.at(ci->udata); + mc.pos = mc.ref_pos = RealPair(loc, 0.5); + mc.is_fixed = true; + } + placed = true; + break; + } + } + } + } else { + // We do have relative constraints + for (auto sz : fb->at(nx).at(ny)) { + // List of cells and their destination + std::vector> targets; + // List of bels we placed things at; and the cell that was there before if applicable + std::vector> swaps_made; + + if (!ctx->getClusterPlacement(ci->cluster, sz, targets)) + continue; + + for (auto &target : targets) { + // Check it satisfies the region constraint if applicable + if (!target.first->testRegion(target.second)) + goto fail; + CellInfo *bound = ctx->getBoundBelCell(target.second); + // Chains cannot overlap; so if we have to ripup a cell make sure it isn't part of a chain + if (bound != nullptr) + if (bound->cluster != ClusterId() || bound->belStrength > STRENGTH_WEAK) + goto fail; + } + // Actually perform the move; keeping track of the moves we make so we can revert them if needed + for (auto &target : targets) { + CellInfo *bound = ctx->getBoundBelCell(target.second); + if (bound != nullptr) + ctx->unbindBel(target.second); + ctx->bindBel(target.second, target.first, STRENGTH_STRONG); + swaps_made.emplace_back(target.second, bound); + } + // Check that the move we have made is legal + for (auto &sm : swaps_made) { + if (!ctx->isBelLocationValid(sm.first)) + goto fail; + } + + if (false) { + fail: + // If the move turned out to be illegal; revert all the moves we made + for (auto &swap : swaps_made) { + ctx->unbindBel(swap.first); + if (swap.second != nullptr) + ctx->bindBel(swap.first, swap.second, STRENGTH_WEAK); + } + continue; + } + for (auto &target : targets) { + Loc loc = ctx->getBelLocation(target.second); + if (ci->udata != -1) { + auto &mc = mcells.at(target.first->udata); + mc.pos = mc.ref_pos = RealPair(loc, 0.5); + mc.is_fixed = true; + } + // log_info("%s %d %d %d\n", target.first->name.c_str(ctx), loc.x, loc.y, loc.z); + } + for (auto &swap : swaps_made) { + // Where we have ripped up cells; add them to the queue + if (swap.second != nullptr) + enqueue_legalise(swap.second); + } + + placed = true; + break; + } + } + + total_iters_for_cell++; + } + } + } + + public: + StaticPlacer(Context *ctx, PlacerStaticCfg cfg) : ctx(ctx), cfg(cfg), fast_bels(ctx, true, 8), tmg(ctx) + { + groups.resize(cfg.cell_groups.size()); + }; + void place() + { + log_info("Running Static placer...\n"); + init_bels(); + prepare_cells(); + init_cells(); + init_nets(); + insert_dark(); + insert_spacer(); + + prepare_density_bins(); + initialise(); + bool legalised_ip = false; + while (true) { + step(); + dens_penalty *= 1.025; + if (!legalised_ip) { + float ip_overlap = 0; + for (int i = 2; i < int(groups.size()); i++) + ip_overlap = std::max(ip_overlap, groups.at(i).overlap); + if (ip_overlap < 0.15) { + legalise_step(true); + legalised_ip = true; + } + } else { + float logic_overlap = 0; + for (int i = 0; i < 2; i++) + logic_overlap = std::max(logic_overlap, groups.at(i).overlap); + if (logic_overlap < 0.1) { + legalise_step(false); + break; + } + } + ++iter; + } + { + auto placer1_cfg = Placer1Cfg(ctx); + placer1_cfg.hpwl_scale_x = cfg.hpwl_scale_x; + placer1_cfg.hpwl_scale_y = cfg.hpwl_scale_y; + placer1_refine(ctx, placer1_cfg); + } + } +}; +}; // namespace + +bool placer_static(Context *ctx, PlacerStaticCfg cfg) +{ + StaticPlacer(ctx, cfg).place(); + return true; +} + +PlacerStaticCfg::PlacerStaticCfg(Context *ctx) +{ + timing_driven = ctx->setting("timing_driven"); + + hpwl_scale_x = 1; + hpwl_scale_y = 1; +} + +NEXTPNR_NAMESPACE_END diff --git a/common/place/placer_static.h b/common/place/placer_static.h new file mode 100644 index 00000000..adce1f41 --- /dev/null +++ b/common/place/placer_static.h @@ -0,0 +1,70 @@ +/* + * nextpnr -- Next Generation Place and Route + * + * Copyright (C) 2022-23 gatecat + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * + */ + +#ifndef PLACER_STATIC_H +#define PLACER_STATIC_H +#include "log.h" +#include "nextpnr.h" + +NEXTPNR_NAMESPACE_BEGIN + +struct StaticRect +{ + StaticRect() : w(0), h(0){}; + StaticRect(float w, float h) : w(w), h(h){}; + float w, h; + float area() const { return w * h; } +}; + +struct StaticCellGroupCfg +{ + // name of the group for debugging purposes + IdString name; + // bel buckets in this group + pool bel_buckets; + // cell & bel types in this group and the (normalised) area of that cell/bel type + dict cell_area, bel_area; + // these cells (generally auxilliary CLB cells like RAMW; carry; MUX) are considered to have zero area when part of + // a macro with other non-zero-area cells + pool zero_area_cells; + // size of spacers to insert + StaticRect spacer_rect{0.5, 0.5}; +}; + +struct PlacerStaticCfg +{ + PlacerStaticCfg(Context *ctx); + + // These cell types will be randomly locked to prevent singular matrices + pool ioBufTypes; + int hpwl_scale_x = 1; + int hpwl_scale_y = 1; + bool timing_driven = false; + // for calculating timing estimates based on distance + // estimate = c + mx*dx + my * dy + delay_t timing_c = 100, timing_mx = 100, timing_my = 100; + // groups of cells that should be placed together. + // group 0 must be LUTs and group 1 must be FFs, further groups for BRAM/DSP/misc + std::vector cell_groups; +}; + +extern bool placer_static(Context *ctx, PlacerStaticCfg cfg); +NEXTPNR_NAMESPACE_END +#endif \ No newline at end of file diff --git a/common/place/static_util.h b/common/place/static_util.h new file mode 100644 index 00000000..a3dc5c93 --- /dev/null +++ b/common/place/static_util.h @@ -0,0 +1,142 @@ +/* + * nextpnr -- Next Generation Place and Route + * + * Copyright (C) 2023 gatecat + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + */ + +#ifndef STATIC_UTIL_H +#define STATIC_UTIL_H + +#include +#include "nextpnr_assertions.h" +#include "nextpnr_namespaces.h" + +NEXTPNR_NAMESPACE_BEGIN + +namespace StaticUtil { + +enum class Axis +{ + X, + Y +}; + +struct RealPair +{ + RealPair() : x(0), y(0){}; + RealPair(float x, float y) : x(x), y(y){}; + explicit RealPair(Loc l, float bias = 0.0f) : x(l.x + bias), y(l.y + bias){}; + float x, y; + RealPair &operator+=(const RealPair &other) + { + x += other.x; + y += other.y; + return *this; + } + RealPair &operator/=(float factor) + { + x /= factor; + y /= factor; + return *this; + } + friend RealPair operator+(RealPair a, RealPair b); + friend RealPair operator-(RealPair a, RealPair b); + RealPair operator*(float factor) const { return RealPair(x * factor, y * factor); } + RealPair operator/(float factor) const { return RealPair(x / factor, y / factor); } + // to simplify axis-generic code + inline float &at(Axis axis) { return (axis == Axis::Y) ? y : x; } + inline const float &at(Axis axis) const { return (axis == Axis::Y) ? y : x; } +}; +inline RealPair operator+(RealPair a, RealPair b) { return RealPair(a.x + b.x, a.y + b.y); } +inline RealPair operator-(RealPair a, RealPair b) { return RealPair(a.x - b.x, a.y - b.y); } + +// array2d; but as ourafft wants it +struct FFTArray +{ + FFTArray(int width = 0, int height = 0) : m_width(width), m_height(height) + { + alloc(); + fill(0); + } + + void fill(float value) + { + for (int x = 0; x < m_width; x++) + for (int y = 0; y < m_height; y++) + m_data[x][y] = value; + } + + void reset(int width, int height, float value = 0) + { + if (width != m_height || height != m_height) { + destroy(); + m_width = width; + m_height = height; + alloc(); + } + fill(value); + } + + float &at(int x, int y) + { + NPNR_ASSERT(x >= 0 && x < m_width && y >= 0 && y < m_height); + return m_data[x][y]; + } + float at(int x, int y) const + { + NPNR_ASSERT(x >= 0 && x < m_width && y >= 0 && y < m_height); + return m_data[x][y]; + } + float **data() { return m_data; } + + void write_csv(const std::string &filename) const + { + std::ofstream out(filename); + NPNR_ASSERT(out); + for (int y = 0; y < m_height; y++) { + for (int x = 0; x < m_width; x++) { + out << at(x, y) << ","; + } + out << std::endl; + } + } + + ~FFTArray() { destroy(); } + + private: + int m_width, m_height; + float **m_data = nullptr; + void alloc() + { + if (m_width == 0) + return; + m_data = new float *[m_width]; + for (int x = 0; x < m_width; x++) + m_data[x] = (m_height > 0) ? (new float[m_height]) : nullptr; + } + void destroy() + { + for (int x = 0; x < m_width; x++) + delete[] m_data[x]; + delete[] m_data; + } +}; + +}; // namespace StaticUtil + +NEXTPNR_NAMESPACE_END + +#endif diff --git a/ecp5/arch.cc b/ecp5/arch.cc index a0b2644c..5d46f881 100644 --- a/ecp5/arch.cc +++ b/ecp5/arch.cc @@ -30,6 +30,7 @@ #include "nextpnr.h" #include "placer1.h" #include "placer_heap.h" +#include "placer_static.h" #include "router1.h" #include "router2.h" #include "timing.h" @@ -586,6 +587,49 @@ delay_t Arch::getRipupDelayPenalty() const { return 400; } // ----------------------------------------------------------------------- +namespace { +void configure_static(Arch *arch, PlacerStaticCfg &cfg) +{ + { + cfg.cell_groups.emplace_back(); + auto &comb = cfg.cell_groups.back(); + comb.name = arch->id("COMB"); + comb.bel_area[id_TRELLIS_COMB] = StaticRect(1.0f, 0.125f); + comb.bel_area[id_TRELLIS_RAMW] = StaticRect(1.0f, 0.125f); + comb.cell_area[id_TRELLIS_COMB] = StaticRect(1.0f, 0.125f); + comb.cell_area[id_TRELLIS_RAMW] = StaticRect(1.0f, 0.125f); + comb.zero_area_cells.insert(id_TRELLIS_RAMW); + comb.spacer_rect = StaticRect(1.0f, 0.125f); + } + + { + cfg.cell_groups.emplace_back(); + auto &comb = cfg.cell_groups.back(); + comb.name = arch->id("FF"); + comb.cell_area[id_TRELLIS_FF] = StaticRect(1.0f, 0.125f); + comb.bel_area[id_TRELLIS_FF] = StaticRect(1.0f, 0.125f); + comb.spacer_rect = StaticRect(1.0f, 0.125f); + } + + { + cfg.cell_groups.emplace_back(); + auto &comb = cfg.cell_groups.back(); + comb.name = arch->id("RAM"); + comb.cell_area[id_DP16KD] = StaticRect(2.0f, 1.0f); + comb.bel_area[id_DP16KD] = StaticRect(2.0f, 1.0f); + comb.spacer_rect = StaticRect(2.0f, 1.0f); + } + { + cfg.cell_groups.emplace_back(); + auto &comb = cfg.cell_groups.back(); + comb.name = arch->id("MUL"); + comb.cell_area[id_MULT18X18D] = StaticRect(2.0f, 1.0f); + comb.bel_area[id_MULT18X18D] = StaticRect(2.0f, 1.0f); + comb.spacer_rect = StaticRect(2.0f, 1.0f); + } +} +} // namespace + bool Arch::place() { std::string placer = str_or_default(settings, id_placer, defaultPlacer); @@ -609,6 +653,11 @@ bool Arch::place() if (!placer_heap(getCtx(), cfg)) return false; + } else if (placer == "static") { + PlacerStaticCfg cfg(getCtx()); + configure_static(this, cfg); + if (!placer_static(getCtx(), cfg)) + return false; } else if (placer == "sa") { if (!placer1(getCtx(), Placer1Cfg(getCtx()))) return false; @@ -1226,7 +1275,7 @@ WireId Arch::get_bank_eclk(int bank, int eclk) const std::string Arch::defaultPlacer = "heap"; -const std::vector Arch::availablePlacers = {"sa", "heap"}; +const std::vector Arch::availablePlacers = {"sa", "heap", "static"}; const std::string Arch::defaultRouter = "router1"; const std::vector Arch::availableRouters = {"router1", "router2"};