diff --git a/3rdparty/oourafft/fftsg.cc b/3rdparty/oourafft/fftsg.cc
new file mode 100644
index 00000000..d74fbb15
--- /dev/null
+++ b/3rdparty/oourafft/fftsg.cc
@@ -0,0 +1,3305 @@
+///////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2018-2020, The Regents of the University of California
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+//   list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// * Neither the name of the copyright holder nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+///////////////////////////////////////////////////////////////////////////////
+
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, float   *, int *, float   *);
+    void rdft(int, int, float   *, int *, float   *);
+    void ddct(int, int, float   *, int *, float   *);
+    void ddst(int, int, float   *, int *, float   *);
+    void dfct(int, float   *, float   *, int *, float   *);
+    void dfst(int, float   *, float   *, int *, float   *);
+macro definitions
+    USE_CDFT_PTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=8192
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=65536
+    USE_CDFT_WINTHREADS : default=not defined
+        CDFT_THREADS_BEGIN_N  : must be >= 512, default=32768
+        CDFT_4THREADS_BEGIN_N : must be >= 512, default=524288
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (float   *)
+                        input data
+                            a[2*j] = Re(x[j]),
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]),
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (float   *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            cdft(2*n, -1, a, ip, w);
+        is
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (float   *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (float   *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            rdft(n, 1, a, ip, w);
+        is
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (float   *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (float   *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddct(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (float   *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (float   *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddst(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (float   *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (float   *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (float   *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (float   *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (float   *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (float   *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            dfst(n, a, t, ip, w);
+        is
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when the larger table required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+#include <cmath>
+#include <iostream>
+
+#include "nextpnr_namespaces.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+void cdft(int n, int isgn, float* a, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void cftfsub(int n, float* a, int* ip, int nw, float* w);
+  void cftbsub(int n, float* a, int* ip, int nw, float* w);
+  int nw;
+
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  if (isgn >= 0) {
+    cftfsub(n, a, ip, nw, w);
+  } else {
+    cftbsub(n, a, ip, nw, w);
+  }
+}
+
+void rdft(int n, int isgn, float* a, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void cftfsub(int n, float* a, int* ip, int nw, float* w);
+  void cftbsub(int n, float* a, int* ip, int nw, float* w);
+  void rftfsub(int n, float* a, int nc, float* c);
+  void rftbsub(int n, float* a, int nc, float* c);
+  int nw, nc;
+  float xi;
+
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > (nc << 2)) {
+    nc = n >> 2;
+    makect(nc, ip, w + nw);
+  }
+  if (isgn >= 0) {
+    if (n > 4) {
+      cftfsub(n, a, ip, nw, w);
+      rftfsub(n, a, nc, w + nw);
+    } else if (n == 4) {
+      cftfsub(n, a, ip, nw, w);
+    }
+    xi = a[0] - a[1];
+    a[0] += a[1];
+    a[1] = xi;
+  } else {
+    a[1] = 0.5 * (a[0] - a[1]);
+    a[0] -= a[1];
+    if (n > 4) {
+      rftbsub(n, a, nc, w + nw);
+      cftbsub(n, a, ip, nw, w);
+    } else if (n == 4) {
+      cftbsub(n, a, ip, nw, w);
+    }
+  }
+}
+
+void ddct(int n, int isgn, float* a, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void cftfsub(int n, float* a, int* ip, int nw, float* w);
+  void cftbsub(int n, float* a, int* ip, int nw, float* w);
+  void rftfsub(int n, float* a, int nc, float* c);
+  void rftbsub(int n, float* a, int nc, float* c);
+  void dctsub(int n, float* a, int nc, float* c);
+  int j, nw, nc;
+  float xr;
+
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > nc) {
+    nc = n;
+    makect(nc, ip, w + nw);
+  }
+  if (isgn < 0) {
+    xr = a[n - 1];
+
+    for (j = n - 2; j >= 2; j -= 2) {
+      a[j + 1] = a[j] - a[j - 1];
+      a[j] += a[j - 1];
+    }
+    a[1] = a[0] - xr;
+    a[0] += xr;
+    if (n > 4) {
+      rftbsub(n, a, nc, w + nw);
+      cftbsub(n, a, ip, nw, w);
+    } else if (n == 4) {
+      cftbsub(n, a, ip, nw, w);
+    }
+  }
+  dctsub(n, a, nc, w + nw);
+  if (isgn >= 0) {
+    if (n > 4) {
+      cftfsub(n, a, ip, nw, w);
+      rftfsub(n, a, nc, w + nw);
+    } else if (n == 4) {
+      cftfsub(n, a, ip, nw, w);
+    }
+    xr = a[0] - a[1];
+    a[0] += a[1];
+    for (j = 2; j < n; j += 2) {
+      a[j - 1] = a[j] - a[j + 1];
+      a[j] += a[j + 1];
+    }
+    a[n - 1] = xr;
+  }
+}
+
+void ddst(int n, int isgn, float* a, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void cftfsub(int n, float* a, int* ip, int nw, float* w);
+  void cftbsub(int n, float* a, int* ip, int nw, float* w);
+  void rftfsub(int n, float* a, int nc, float* c);
+  void rftbsub(int n, float* a, int nc, float* c);
+  void dstsub(int n, float* a, int nc, float* c);
+  int j, nw, nc;
+  float xr;
+
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > nc) {
+    nc = n;
+    makect(nc, ip, w + nw);
+  }
+  if (isgn < 0) {
+    xr = a[n - 1];
+    for (j = n - 2; j >= 2; j -= 2) {
+      a[j + 1] = -a[j] - a[j - 1];
+      a[j] -= a[j - 1];
+    }
+    a[1] = a[0] + xr;
+    a[0] -= xr;
+    if (n > 4) {
+      rftbsub(n, a, nc, w + nw);
+      cftbsub(n, a, ip, nw, w);
+    } else if (n == 4) {
+      cftbsub(n, a, ip, nw, w);
+    }
+  }
+  dstsub(n, a, nc, w + nw);
+  if (isgn >= 0) {
+    if (n > 4) {
+      cftfsub(n, a, ip, nw, w);
+      rftfsub(n, a, nc, w + nw);
+    } else if (n == 4) {
+      cftfsub(n, a, ip, nw, w);
+    }
+    xr = a[0] - a[1];
+    a[0] += a[1];
+    for (j = 2; j < n; j += 2) {
+      a[j - 1] = -a[j] - a[j + 1];
+      a[j] -= a[j + 1];
+    }
+    a[n - 1] = -xr;
+  }
+}
+
+void dfct(int n, float* a, float* t, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void cftfsub(int n, float* a, int* ip, int nw, float* w);
+  void rftfsub(int n, float* a, int nc, float* c);
+  void dctsub(int n, float* a, int nc, float* c);
+  int j, k, l, m, mh, nw, nc;
+  float xr, xi, yr, yi;
+
+  nw = ip[0];
+  if (n > (nw << 3)) {
+    nw = n >> 3;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > (nc << 1)) {
+    nc = n >> 1;
+    makect(nc, ip, w + nw);
+  }
+  m = n >> 1;
+  yi = a[m];
+  xi = a[0] + a[n];
+  a[0] -= a[n];
+  t[0] = xi - yi;
+  t[m] = xi + yi;
+  if (n > 2) {
+    mh = m >> 1;
+    for (j = 1; j < mh; j++) {
+      k = m - j;
+      xr = a[j] - a[n - j];
+      xi = a[j] + a[n - j];
+      yr = a[k] - a[n - k];
+      yi = a[k] + a[n - k];
+      a[j] = xr;
+      a[k] = yr;
+      t[j] = xi - yi;
+      t[k] = xi + yi;
+    }
+    t[mh] = a[mh] + a[n - mh];
+    a[mh] -= a[n - mh];
+    dctsub(m, a, nc, w + nw);
+    if (m > 4) {
+      cftfsub(m, a, ip, nw, w);
+      rftfsub(m, a, nc, w + nw);
+    } else if (m == 4) {
+      cftfsub(m, a, ip, nw, w);
+    }
+    a[n - 1] = a[0] - a[1];
+    a[1] = a[0] + a[1];
+    for (j = m - 2; j >= 2; j -= 2) {
+      a[2 * j + 1] = a[j] + a[j + 1];
+      a[2 * j - 1] = a[j] - a[j + 1];
+    }
+    l = 2;
+    m = mh;
+    while (m >= 2) {
+      dctsub(m, t, nc, w + nw);
+      if (m > 4) {
+        cftfsub(m, t, ip, nw, w);
+        rftfsub(m, t, nc, w + nw);
+      } else if (m == 4) {
+        cftfsub(m, t, ip, nw, w);
+      }
+      a[n - l] = t[0] - t[1];
+      a[l] = t[0] + t[1];
+      k = 0;
+      for (j = 2; j < m; j += 2) {
+        k += l << 2;
+        a[k - l] = t[j] - t[j + 1];
+        a[k + l] = t[j] + t[j + 1];
+      }
+      l <<= 1;
+      mh = m >> 1;
+      for (j = 0; j < mh; j++) {
+        k = m - j;
+        t[j] = t[m + k] - t[m + j];
+        t[k] = t[m + k] + t[m + j];
+      }
+      t[mh] = t[m + mh];
+      m = mh;
+    }
+    a[l] = t[0];
+    a[n] = t[2] - t[1];
+    a[0] = t[2] + t[1];
+  } else {
+    a[1] = a[0];
+    a[2] = t[0];
+    a[0] = t[1];
+  }
+}
+
+void dfst(int n, float* a, float* t, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void cftfsub(int n, float* a, int* ip, int nw, float* w);
+  void rftfsub(int n, float* a, int nc, float* c);
+  void dstsub(int n, float* a, int nc, float* c);
+  int j, k, l, m, mh, nw, nc;
+  float xr, xi, yr, yi;
+
+  nw = ip[0];
+  if (n > (nw << 3)) {
+    nw = n >> 3;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > (nc << 1)) {
+    nc = n >> 1;
+    makect(nc, ip, w + nw);
+  }
+  if (n > 2) {
+    m = n >> 1;
+    mh = m >> 1;
+    for (j = 1; j < mh; j++) {
+      k = m - j;
+      xr = a[j] + a[n - j];
+      xi = a[j] - a[n - j];
+      yr = a[k] + a[n - k];
+      yi = a[k] - a[n - k];
+      a[j] = xr;
+      a[k] = yr;
+      t[j] = xi + yi;
+      t[k] = xi - yi;
+    }
+    t[0] = a[mh] - a[n - mh];
+    a[mh] += a[n - mh];
+    a[0] = a[m];
+    dstsub(m, a, nc, w + nw);
+    if (m > 4) {
+      cftfsub(m, a, ip, nw, w);
+      rftfsub(m, a, nc, w + nw);
+    } else if (m == 4) {
+      cftfsub(m, a, ip, nw, w);
+    }
+    a[n - 1] = a[1] - a[0];
+    a[1] = a[0] + a[1];
+    for (j = m - 2; j >= 2; j -= 2) {
+      a[2 * j + 1] = a[j] - a[j + 1];
+      a[2 * j - 1] = -a[j] - a[j + 1];
+    }
+    l = 2;
+    m = mh;
+    while (m >= 2) {
+      dstsub(m, t, nc, w + nw);
+      if (m > 4) {
+        cftfsub(m, t, ip, nw, w);
+        rftfsub(m, t, nc, w + nw);
+      } else if (m == 4) {
+        cftfsub(m, t, ip, nw, w);
+      }
+      a[n - l] = t[1] - t[0];
+      a[l] = t[0] + t[1];
+      k = 0;
+      for (j = 2; j < m; j += 2) {
+        k += l << 2;
+        a[k - l] = -t[j] - t[j + 1];
+        a[k + l] = t[j] - t[j + 1];
+      }
+      l <<= 1;
+      mh = m >> 1;
+      for (j = 1; j < mh; j++) {
+        k = m - j;
+        t[j] = t[m + k] + t[m + j];
+        t[k] = t[m + k] - t[m + j];
+      }
+      t[0] = t[m + mh];
+      m = mh;
+    }
+    a[l] = t[0];
+  }
+  a[0] = 0;
+}
+
+/* -------- initializing routines -------- */
+
+void makewt(int nw, int* ip, float* w)
+{
+  void makeipt(int nw, int* ip);
+  int j, nwh, nw0, nw1;
+  float delta, wn4r, wk1r, wk1i, wk3r, wk3i;
+
+  ip[0] = nw;
+  ip[1] = 1;
+  if (nw > 2) {
+    nwh = nw >> 1;
+    delta = atan(1.0) / nwh;
+    wn4r = cos(delta * nwh);
+    w[0] = 1;
+    w[1] = wn4r;
+    if (nwh == 4) {
+      w[2] = cos(delta * 2);
+      w[3] = sin(delta * 2);
+    } else if (nwh > 4) {
+      makeipt(nw, ip);
+      w[2] = 0.5 / cos(delta * 2);
+      w[3] = 0.5 / cos(delta * 6);
+      for (j = 4; j < nwh; j += 4) {
+        w[j] = cos(delta * j);
+        w[j + 1] = sin(delta * j);
+        w[j + 2] = cos(3 * delta * j);
+        w[j + 3] = -sin(3 * delta * j);
+      }
+    }
+    nw0 = 0;
+    while (nwh > 2) {
+      nw1 = nw0 + nwh;
+      nwh >>= 1;
+      w[nw1] = 1;
+      w[nw1 + 1] = wn4r;
+      if (nwh == 4) {
+        wk1r = w[nw0 + 4];
+        wk1i = w[nw0 + 5];
+        w[nw1 + 2] = wk1r;
+        w[nw1 + 3] = wk1i;
+      } else if (nwh > 4) {
+        wk1r = w[nw0 + 4];
+        wk3r = w[nw0 + 6];
+        w[nw1 + 2] = 0.5 / wk1r;
+        w[nw1 + 3] = 0.5 / wk3r;
+        for (j = 4; j < nwh; j += 4) {
+          wk1r = w[nw0 + 2 * j];
+          wk1i = w[nw0 + 2 * j + 1];
+          wk3r = w[nw0 + 2 * j + 2];
+          wk3i = w[nw0 + 2 * j + 3];
+          w[nw1 + j] = wk1r;
+          w[nw1 + j + 1] = wk1i;
+          w[nw1 + j + 2] = wk3r;
+          w[nw1 + j + 3] = wk3i;
+        }
+      }
+      nw0 = nw1;
+    }
+  }
+}
+
+void makeipt(int nw, int* ip)
+{
+  int j, l, m, m2, p, q;
+
+  ip[2] = 0;
+  ip[3] = 16;
+  m = 2;
+  for (l = nw; l > 32; l >>= 2) {
+    m2 = m << 1;
+    q = m2 << 3;
+    for (j = m; j < m2; j++) {
+      p = ip[j] << 2;
+      ip[m + j] = p;
+      ip[m2 + j] = p + q;
+    }
+    m = m2;
+  }
+}
+
+void makect(int nc, int* ip, float* c)
+{
+  int j, nch;
+  float delta;
+
+  ip[1] = nc;
+  if (nc > 1) {
+    nch = nc >> 1;
+    delta = atan(1.0) / nch;
+    c[0] = cos(delta * nch);
+    c[nch] = 0.5 * c[0];
+    for (j = 1; j < nch; j++) {
+      c[j] = 0.5 * cos(delta * j);
+      c[nc - j] = 0.5 * sin(delta * j);
+    }
+  }
+}
+
+/* -------- child routines -------- */
+
+#ifdef USE_CDFT_PTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 8192
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define cdft_thread_t pthread_t
+#define cdft_thread_create(thp, func, argp)                   \
+  {                                                           \
+    if (pthread_create(thp, NULL, func, (void*) argp) != 0) { \
+      fprintf(stderr, "cdft thread error\n");                 \
+      exit(1);                                                \
+    }                                                         \
+  }
+#define cdft_thread_wait(th)                  \
+  {                                           \
+    if (pthread_join(th, NULL) != 0) {        \
+      fprintf(stderr, "cdft thread error\n"); \
+      exit(1);                                \
+    }                                         \
+  }
+#endif /* USE_CDFT_PTHREADS */
+
+#ifdef USE_CDFT_WINTHREADS
+#define USE_CDFT_THREADS
+#ifndef CDFT_THREADS_BEGIN_N
+#define CDFT_THREADS_BEGIN_N 32768
+#endif
+#ifndef CDFT_4THREADS_BEGIN_N
+#define CDFT_4THREADS_BEGIN_N 524288
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <windows.h>
+#define cdft_thread_t HANDLE
+#define cdft_thread_create(thp, func, argp)                               \
+  {                                                                       \
+    DWORD thid;                                                           \
+    *(thp) = CreateThread(                                                \
+        NULL, 0, (LPTHREAD_START_ROUTINE) func, (LPVOID) argp, 0, &thid); \
+    if (*(thp) == 0) {                                                    \
+      fprintf(stderr, "cdft thread error\n");                             \
+      exit(1);                                                            \
+    }                                                                     \
+  }
+#define cdft_thread_wait(th)           \
+  {                                    \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th);                   \
+  }
+#endif /* USE_CDFT_WINTHREADS */
+
+void cftfsub(int n, float* a, int* ip, int nw, float* w)
+{
+  void bitrv2(int n, int* ip, float* a);
+  void bitrv216(float* a);
+  void bitrv208(float* a);
+  void cftf1st(int n, float* a, float* w);
+  void cftrec4(int n, float* a, int nw, float* w);
+  void cftleaf(int n, int isplt, float* a, int nw, float* w);
+  void cftfx41(int n, float* a, int nw, float* w);
+  void cftf161(float* a, float* w);
+  void cftf081(float* a, float* w);
+  void cftf040(float* a);
+  void cftx020(float* a);
+#ifdef USE_CDFT_THREADS
+  void cftrec4_th(int n, float* a, int nw, float* w);
+#endif /* USE_CDFT_THREADS */
+
+  if (n > 8) {
+    if (n > 32) {
+      cftf1st(n, a, &w[nw - (n >> 2)]);
+#ifdef USE_CDFT_THREADS
+      if (n > CDFT_THREADS_BEGIN_N) {
+        cftrec4_th(n, a, nw, w);
+      } else
+#endif /* USE_CDFT_THREADS */
+        if (n > 512) {
+          cftrec4(n, a, nw, w);
+        } else if (n > 128) {
+          cftleaf(n, 1, a, nw, w);
+        } else {
+          cftfx41(n, a, nw, w);
+        }
+      bitrv2(n, ip, a);
+    } else if (n == 32) {
+      cftf161(a, &w[nw - 8]);
+      bitrv216(a);
+    } else {
+      cftf081(a, w);
+      bitrv208(a);
+    }
+  } else if (n == 8) {
+    cftf040(a);
+  } else if (n == 4) {
+    cftx020(a);
+  }
+}
+
+void cftbsub(int n, float* a, int* ip, int nw, float* w)
+{
+  void bitrv2conj(int n, int* ip, float* a);
+  void bitrv216neg(float* a);
+  void bitrv208neg(float* a);
+  void cftb1st(int n, float* a, float* w);
+  void cftrec4(int n, float* a, int nw, float* w);
+  void cftleaf(int n, int isplt, float* a, int nw, float* w);
+  void cftfx41(int n, float* a, int nw, float* w);
+  void cftf161(float* a, float* w);
+  void cftf081(float* a, float* w);
+  void cftb040(float* a);
+  void cftx020(float* a);
+#ifdef USE_CDFT_THREADS
+  void cftrec4_th(int n, float* a, int nw, float* w);
+#endif /* USE_CDFT_THREADS */
+
+  if (n > 8) {
+    if (n > 32) {
+      cftb1st(n, a, &w[nw - (n >> 2)]);
+#ifdef USE_CDFT_THREADS
+      if (n > CDFT_THREADS_BEGIN_N) {
+        cftrec4_th(n, a, nw, w);
+      } else
+#endif /* USE_CDFT_THREADS */
+        if (n > 512) {
+          cftrec4(n, a, nw, w);
+        } else if (n > 128) {
+          cftleaf(n, 1, a, nw, w);
+        } else {
+          cftfx41(n, a, nw, w);
+        }
+      bitrv2conj(n, ip, a);
+    } else if (n == 32) {
+      cftf161(a, &w[nw - 8]);
+      bitrv216neg(a);
+    } else {
+      cftf081(a, w);
+      bitrv208neg(a);
+    }
+  } else if (n == 8) {
+    cftb040(a);
+  } else if (n == 4) {
+    cftx020(a);
+  }
+}
+
+void bitrv2(int n, int* ip, float* a)
+{
+  int j, j1, k, k1, l, m, nh, nm;
+  float xr, xi, yr, yi;
+
+  m = 1;
+  for (l = n >> 2; l > 8; l >>= 2) {
+    m <<= 1;
+  }
+  nh = n >> 1;
+  nm = 4 * m;
+  if (l == 8) {
+    for (k = 0; k < m; k++) {
+      for (j = 0; j < k; j++) {
+        j1 = 4 * j + 2 * ip[m + k];
+        k1 = 4 * k + 2 * ip[m + j];
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += 2 * nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 -= nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += 2 * nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nh;
+        k1 += 2;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= 2 * nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 += nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= 2 * nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += 2;
+        k1 += nh;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += 2 * nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 -= nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += 2 * nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nh;
+        k1 -= 2;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= 2 * nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 += nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= 2 * nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+      }
+      k1 = 4 * k + 2 * ip[m + k];
+      j1 = k1 + 2;
+      k1 += nh;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += nm;
+      k1 += 2 * nm;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += nm;
+      k1 -= nm;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 -= 2;
+      k1 -= nh;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += nh + 2;
+      k1 += nh + 2;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 -= nh - nm;
+      k1 += 2 * nm - 2;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+    }
+  } else {
+    for (k = 0; k < m; k++) {
+      for (j = 0; j < k; j++) {
+        j1 = 4 * j + ip[m + k];
+        k1 = 4 * k + ip[m + j];
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nh;
+        k1 += 2;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += 2;
+        k1 += nh;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nh;
+        k1 -= 2;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= nm;
+        xr = a[j1];
+        xi = a[j1 + 1];
+        yr = a[k1];
+        yi = a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+      }
+      k1 = 4 * k + ip[m + k];
+      j1 = k1 + 2;
+      k1 += nh;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += nm;
+      k1 += nm;
+      xr = a[j1];
+      xi = a[j1 + 1];
+      yr = a[k1];
+      yi = a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+    }
+  }
+}
+
+void bitrv2conj(int n, int* ip, float* a)
+{
+  int j, j1, k, k1, l, m, nh, nm;
+  float xr, xi, yr, yi;
+
+  m = 1;
+  for (l = n >> 2; l > 8; l >>= 2) {
+    m <<= 1;
+  }
+  nh = n >> 1;
+  nm = 4 * m;
+  if (l == 8) {
+    for (k = 0; k < m; k++) {
+      for (j = 0; j < k; j++) {
+        j1 = 4 * j + 2 * ip[m + k];
+        k1 = 4 * k + 2 * ip[m + j];
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += 2 * nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 -= nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += 2 * nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nh;
+        k1 += 2;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= 2 * nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 += nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= 2 * nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += 2;
+        k1 += nh;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += 2 * nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 -= nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += 2 * nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nh;
+        k1 -= 2;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= 2 * nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 += nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= 2 * nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+      }
+      k1 = 4 * k + 2 * ip[m + k];
+      j1 = k1 + 2;
+      k1 += nh;
+      a[j1 - 1] = -a[j1 - 1];
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      a[k1 + 3] = -a[k1 + 3];
+      j1 += nm;
+      k1 += 2 * nm;
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += nm;
+      k1 -= nm;
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 -= 2;
+      k1 -= nh;
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 += nh + 2;
+      k1 += nh + 2;
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      j1 -= nh - nm;
+      k1 += 2 * nm - 2;
+      a[j1 - 1] = -a[j1 - 1];
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      a[k1 + 3] = -a[k1 + 3];
+    }
+  } else {
+    for (k = 0; k < m; k++) {
+      for (j = 0; j < k; j++) {
+        j1 = 4 * j + ip[m + k];
+        k1 = 4 * k + ip[m + j];
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nh;
+        k1 += 2;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += 2;
+        k1 += nh;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 += nm;
+        k1 += nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nh;
+        k1 -= 2;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+        j1 -= nm;
+        k1 -= nm;
+        xr = a[j1];
+        xi = -a[j1 + 1];
+        yr = a[k1];
+        yi = -a[k1 + 1];
+        a[j1] = yr;
+        a[j1 + 1] = yi;
+        a[k1] = xr;
+        a[k1 + 1] = xi;
+      }
+      k1 = 4 * k + ip[m + k];
+      j1 = k1 + 2;
+      k1 += nh;
+      a[j1 - 1] = -a[j1 - 1];
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      a[k1 + 3] = -a[k1 + 3];
+      j1 += nm;
+      k1 += nm;
+      a[j1 - 1] = -a[j1 - 1];
+      xr = a[j1];
+      xi = -a[j1 + 1];
+      yr = a[k1];
+      yi = -a[k1 + 1];
+      a[j1] = yr;
+      a[j1 + 1] = yi;
+      a[k1] = xr;
+      a[k1 + 1] = xi;
+      a[k1 + 3] = -a[k1 + 3];
+    }
+  }
+}
+
+void bitrv216(float* a)
+{
+  float x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x7r, x7i, x8r, x8i,
+      x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, x14r, x14i;
+
+  x1r = a[2];
+  x1i = a[3];
+  x2r = a[4];
+  x2i = a[5];
+  x3r = a[6];
+  x3i = a[7];
+  x4r = a[8];
+  x4i = a[9];
+  x5r = a[10];
+  x5i = a[11];
+  x7r = a[14];
+  x7i = a[15];
+  x8r = a[16];
+  x8i = a[17];
+  x10r = a[20];
+  x10i = a[21];
+  x11r = a[22];
+  x11i = a[23];
+  x12r = a[24];
+  x12i = a[25];
+  x13r = a[26];
+  x13i = a[27];
+  x14r = a[28];
+  x14i = a[29];
+  a[2] = x8r;
+  a[3] = x8i;
+  a[4] = x4r;
+  a[5] = x4i;
+  a[6] = x12r;
+  a[7] = x12i;
+  a[8] = x2r;
+  a[9] = x2i;
+  a[10] = x10r;
+  a[11] = x10i;
+  a[14] = x14r;
+  a[15] = x14i;
+  a[16] = x1r;
+  a[17] = x1i;
+  a[20] = x5r;
+  a[21] = x5i;
+  a[22] = x13r;
+  a[23] = x13i;
+  a[24] = x3r;
+  a[25] = x3i;
+  a[26] = x11r;
+  a[27] = x11i;
+  a[28] = x7r;
+  a[29] = x7i;
+}
+
+void bitrv216neg(float* a)
+{
+  float x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i,
+      x8r, x8i, x9r, x9i, x10r, x10i, x11r, x11i, x12r, x12i, x13r, x13i, x14r,
+      x14i, x15r, x15i;
+
+  x1r = a[2];
+  x1i = a[3];
+  x2r = a[4];
+  x2i = a[5];
+  x3r = a[6];
+  x3i = a[7];
+  x4r = a[8];
+  x4i = a[9];
+  x5r = a[10];
+  x5i = a[11];
+  x6r = a[12];
+  x6i = a[13];
+  x7r = a[14];
+  x7i = a[15];
+  x8r = a[16];
+  x8i = a[17];
+  x9r = a[18];
+  x9i = a[19];
+  x10r = a[20];
+  x10i = a[21];
+  x11r = a[22];
+  x11i = a[23];
+  x12r = a[24];
+  x12i = a[25];
+  x13r = a[26];
+  x13i = a[27];
+  x14r = a[28];
+  x14i = a[29];
+  x15r = a[30];
+  x15i = a[31];
+  a[2] = x15r;
+  a[3] = x15i;
+  a[4] = x7r;
+  a[5] = x7i;
+  a[6] = x11r;
+  a[7] = x11i;
+  a[8] = x3r;
+  a[9] = x3i;
+  a[10] = x13r;
+  a[11] = x13i;
+  a[12] = x5r;
+  a[13] = x5i;
+  a[14] = x9r;
+  a[15] = x9i;
+  a[16] = x1r;
+  a[17] = x1i;
+  a[18] = x14r;
+  a[19] = x14i;
+  a[20] = x6r;
+  a[21] = x6i;
+  a[22] = x10r;
+  a[23] = x10i;
+  a[24] = x2r;
+  a[25] = x2i;
+  a[26] = x12r;
+  a[27] = x12i;
+  a[28] = x4r;
+  a[29] = x4i;
+  a[30] = x8r;
+  a[31] = x8i;
+}
+
+void bitrv208(float* a)
+{
+  float x1r, x1i, x3r, x3i, x4r, x4i, x6r, x6i;
+
+  x1r = a[2];
+  x1i = a[3];
+  x3r = a[6];
+  x3i = a[7];
+  x4r = a[8];
+  x4i = a[9];
+  x6r = a[12];
+  x6i = a[13];
+  a[2] = x4r;
+  a[3] = x4i;
+  a[6] = x6r;
+  a[7] = x6i;
+  a[8] = x1r;
+  a[9] = x1i;
+  a[12] = x3r;
+  a[13] = x3i;
+}
+
+void bitrv208neg(float* a)
+{
+  float x1r, x1i, x2r, x2i, x3r, x3i, x4r, x4i, x5r, x5i, x6r, x6i, x7r, x7i;
+
+  x1r = a[2];
+  x1i = a[3];
+  x2r = a[4];
+  x2i = a[5];
+  x3r = a[6];
+  x3i = a[7];
+  x4r = a[8];
+  x4i = a[9];
+  x5r = a[10];
+  x5i = a[11];
+  x6r = a[12];
+  x6i = a[13];
+  x7r = a[14];
+  x7i = a[15];
+  a[2] = x7r;
+  a[3] = x7i;
+  a[4] = x3r;
+  a[5] = x3i;
+  a[6] = x5r;
+  a[7] = x5i;
+  a[8] = x1r;
+  a[9] = x1i;
+  a[10] = x6r;
+  a[11] = x6i;
+  a[12] = x2r;
+  a[13] = x2i;
+  a[14] = x4r;
+  a[15] = x4i;
+}
+
+void cftf1st(int n, float* a, float* w)
+{
+  int j, j0, j1, j2, j3, k, m, mh;
+  float wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i,
+      y3r, y3i;
+
+  mh = n >> 3;
+  m = 2 * mh;
+  j1 = m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[0] + a[j2];
+  x0i = a[1] + a[j2 + 1];
+  x1r = a[0] - a[j2];
+  x1i = a[1] - a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i - x2i;
+  a[j2] = x1r - x3i;
+  a[j2 + 1] = x1i + x3r;
+  a[j3] = x1r + x3i;
+  a[j3 + 1] = x1i - x3r;
+  wn4r = w[1];
+  csc1 = w[2];
+  csc3 = w[3];
+  wd1r = 1;
+  wd1i = 0;
+  wd3r = 1;
+  wd3i = 0;
+  k = 0;
+  for (j = 2; j < mh - 2; j += 4) {
+    k += 4;
+    wk1r = csc1 * (wd1r + w[k]);
+    wk1i = csc1 * (wd1i + w[k + 1]);
+    wk3r = csc3 * (wd3r + w[k + 2]);
+    wk3i = csc3 * (wd3i + w[k + 3]);
+    wd1r = w[k];
+    wd1i = w[k + 1];
+    wd3r = w[k + 2];
+    wd3i = w[k + 3];
+    j1 = j + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j] + a[j2];
+    x0i = a[j + 1] + a[j2 + 1];
+    x1r = a[j] - a[j2];
+    x1i = a[j + 1] - a[j2 + 1];
+    y0r = a[j + 2] + a[j2 + 2];
+    y0i = a[j + 3] + a[j2 + 3];
+    y1r = a[j + 2] - a[j2 + 2];
+    y1i = a[j + 3] - a[j2 + 3];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    y2r = a[j1 + 2] + a[j3 + 2];
+    y2i = a[j1 + 3] + a[j3 + 3];
+    y3r = a[j1 + 2] - a[j3 + 2];
+    y3i = a[j1 + 3] - a[j3 + 3];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    a[j + 2] = y0r + y2r;
+    a[j + 3] = y0i + y2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j1 + 2] = y0r - y2r;
+    a[j1 + 3] = y0i - y2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wk1r * x0r - wk1i * x0i;
+    a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+    x0r = y1r - y3i;
+    x0i = y1i + y3r;
+    a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+    a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = wk3r * x0r + wk3i * x0i;
+    a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+    x0r = y1r + y3i;
+    x0i = y1i - y3r;
+    a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+    a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+    j0 = m - j;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    y0r = a[j0 - 2] + a[j2 - 2];
+    y0i = a[j0 - 1] + a[j2 - 1];
+    y1r = a[j0 - 2] - a[j2 - 2];
+    y1i = a[j0 - 1] - a[j2 - 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    y2r = a[j1 - 2] + a[j3 - 2];
+    y2i = a[j1 - 1] + a[j3 - 1];
+    y3r = a[j1 - 2] - a[j3 - 2];
+    y3i = a[j1 - 1] - a[j3 - 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j0 - 2] = y0r + y2r;
+    a[j0 - 1] = y0i + y2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    a[j1 - 2] = y0r - y2r;
+    a[j1 - 1] = y0i - y2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+    x0r = y1r - y3i;
+    x0i = y1i + y3r;
+    a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+    a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+    x0r = y1r + y3i;
+    x0i = y1i - y3r;
+    a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+    a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+  }
+  wk1r = csc1 * (wd1r + wn4r);
+  wk1i = csc1 * (wd1i + wn4r);
+  wk3r = csc3 * (wd3r - wn4r);
+  wk3i = csc3 * (wd3i - wn4r);
+  j0 = mh;
+  j1 = j0 + m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[j0 - 2] + a[j2 - 2];
+  x0i = a[j0 - 1] + a[j2 - 1];
+  x1r = a[j0 - 2] - a[j2 - 2];
+  x1i = a[j0 - 1] - a[j2 - 1];
+  x2r = a[j1 - 2] + a[j3 - 2];
+  x2i = a[j1 - 1] + a[j3 - 1];
+  x3r = a[j1 - 2] - a[j3 - 2];
+  x3i = a[j1 - 1] - a[j3 - 1];
+  a[j0 - 2] = x0r + x2r;
+  a[j0 - 1] = x0i + x2i;
+  a[j1 - 2] = x0r - x2r;
+  a[j1 - 1] = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+  a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+  a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+  x0r = a[j0] + a[j2];
+  x0i = a[j0 + 1] + a[j2 + 1];
+  x1r = a[j0] - a[j2];
+  x1i = a[j0 + 1] - a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[j0] = x0r + x2r;
+  a[j0 + 1] = x0i + x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[j2] = wn4r * (x0r - x0i);
+  a[j2 + 1] = wn4r * (x0i + x0r);
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  a[j3] = -wn4r * (x0r + x0i);
+  a[j3 + 1] = -wn4r * (x0i - x0r);
+  x0r = a[j0 + 2] + a[j2 + 2];
+  x0i = a[j0 + 3] + a[j2 + 3];
+  x1r = a[j0 + 2] - a[j2 + 2];
+  x1i = a[j0 + 3] - a[j2 + 3];
+  x2r = a[j1 + 2] + a[j3 + 2];
+  x2i = a[j1 + 3] + a[j3 + 3];
+  x3r = a[j1 + 2] - a[j3 + 2];
+  x3i = a[j1 + 3] - a[j3 + 3];
+  a[j0 + 2] = x0r + x2r;
+  a[j0 + 3] = x0i + x2i;
+  a[j1 + 2] = x0r - x2r;
+  a[j1 + 3] = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+  a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+  a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+void cftb1st(int n, float* a, float* w)
+{
+  int j, j0, j1, j2, j3, k, m, mh;
+  float wn4r, csc1, csc3, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r, y2i,
+      y3r, y3i;
+
+  mh = n >> 3;
+  m = 2 * mh;
+  j1 = m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[0] + a[j2];
+  x0i = -a[1] - a[j2 + 1];
+  x1r = a[0] - a[j2];
+  x1i = -a[1] + a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[0] = x0r + x2r;
+  a[1] = x0i - x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i + x2i;
+  a[j2] = x1r + x3i;
+  a[j2 + 1] = x1i + x3r;
+  a[j3] = x1r - x3i;
+  a[j3 + 1] = x1i - x3r;
+  wn4r = w[1];
+  csc1 = w[2];
+  csc3 = w[3];
+  wd1r = 1;
+  wd1i = 0;
+  wd3r = 1;
+  wd3i = 0;
+  k = 0;
+  for (j = 2; j < mh - 2; j += 4) {
+    k += 4;
+    wk1r = csc1 * (wd1r + w[k]);
+    wk1i = csc1 * (wd1i + w[k + 1]);
+    wk3r = csc3 * (wd3r + w[k + 2]);
+    wk3i = csc3 * (wd3i + w[k + 3]);
+    wd1r = w[k];
+    wd1i = w[k + 1];
+    wd3r = w[k + 2];
+    wd3i = w[k + 3];
+    j1 = j + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j] + a[j2];
+    x0i = -a[j + 1] - a[j2 + 1];
+    x1r = a[j] - a[j2];
+    x1i = -a[j + 1] + a[j2 + 1];
+    y0r = a[j + 2] + a[j2 + 2];
+    y0i = -a[j + 3] - a[j2 + 3];
+    y1r = a[j + 2] - a[j2 + 2];
+    y1i = -a[j + 3] + a[j2 + 3];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    y2r = a[j1 + 2] + a[j3 + 2];
+    y2i = a[j1 + 3] + a[j3 + 3];
+    y3r = a[j1 + 2] - a[j3 + 2];
+    y3i = a[j1 + 3] - a[j3 + 3];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i - x2i;
+    a[j + 2] = y0r + y2r;
+    a[j + 3] = y0i - y2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    a[j1 + 2] = y0r - y2r;
+    a[j1 + 3] = y0i + y2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2] = wk1r * x0r - wk1i * x0i;
+    a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+    x0r = y1r + y3i;
+    x0i = y1i + y3r;
+    a[j2 + 2] = wd1r * x0r - wd1i * x0i;
+    a[j2 + 3] = wd1r * x0i + wd1i * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3] = wk3r * x0r + wk3i * x0i;
+    a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+    x0r = y1r - y3i;
+    x0i = y1i - y3r;
+    a[j3 + 2] = wd3r * x0r + wd3i * x0i;
+    a[j3 + 3] = wd3r * x0i - wd3i * x0r;
+    j0 = m - j;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] + a[j2];
+    x0i = -a[j0 + 1] - a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = -a[j0 + 1] + a[j2 + 1];
+    y0r = a[j0 - 2] + a[j2 - 2];
+    y0i = -a[j0 - 1] - a[j2 - 1];
+    y1r = a[j0 - 2] - a[j2 - 2];
+    y1i = -a[j0 - 1] + a[j2 - 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    y2r = a[j1 - 2] + a[j3 - 2];
+    y2i = a[j1 - 1] + a[j3 - 1];
+    y3r = a[j1 - 2] - a[j3 - 2];
+    y3i = a[j1 - 1] - a[j3 - 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i - x2i;
+    a[j0 - 2] = y0r + y2r;
+    a[j0 - 1] = y0i - y2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i + x2i;
+    a[j1 - 2] = y0r - y2r;
+    a[j1 - 1] = y0i + y2i;
+    x0r = x1r + x3i;
+    x0i = x1i + x3r;
+    a[j2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+    x0r = y1r + y3i;
+    x0i = y1i + y3r;
+    a[j2 - 2] = wd1i * x0r - wd1r * x0i;
+    a[j2 - 1] = wd1i * x0i + wd1r * x0r;
+    x0r = x1r - x3i;
+    x0i = x1i - x3r;
+    a[j3] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+    x0r = y1r - y3i;
+    x0i = y1i - y3r;
+    a[j3 - 2] = wd3i * x0r + wd3r * x0i;
+    a[j3 - 1] = wd3i * x0i - wd3r * x0r;
+  }
+  wk1r = csc1 * (wd1r + wn4r);
+  wk1i = csc1 * (wd1i + wn4r);
+  wk3r = csc3 * (wd3r - wn4r);
+  wk3i = csc3 * (wd3i - wn4r);
+  j0 = mh;
+  j1 = j0 + m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[j0 - 2] + a[j2 - 2];
+  x0i = -a[j0 - 1] - a[j2 - 1];
+  x1r = a[j0 - 2] - a[j2 - 2];
+  x1i = -a[j0 - 1] + a[j2 - 1];
+  x2r = a[j1 - 2] + a[j3 - 2];
+  x2i = a[j1 - 1] + a[j3 - 1];
+  x3r = a[j1 - 2] - a[j3 - 2];
+  x3i = a[j1 - 1] - a[j3 - 1];
+  a[j0 - 2] = x0r + x2r;
+  a[j0 - 1] = x0i - x2i;
+  a[j1 - 2] = x0r - x2r;
+  a[j1 - 1] = x0i + x2i;
+  x0r = x1r + x3i;
+  x0i = x1i + x3r;
+  a[j2 - 2] = wk1r * x0r - wk1i * x0i;
+  a[j2 - 1] = wk1r * x0i + wk1i * x0r;
+  x0r = x1r - x3i;
+  x0i = x1i - x3r;
+  a[j3 - 2] = wk3r * x0r + wk3i * x0i;
+  a[j3 - 1] = wk3r * x0i - wk3i * x0r;
+  x0r = a[j0] + a[j2];
+  x0i = -a[j0 + 1] - a[j2 + 1];
+  x1r = a[j0] - a[j2];
+  x1i = -a[j0 + 1] + a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[j0] = x0r + x2r;
+  a[j0 + 1] = x0i - x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i + x2i;
+  x0r = x1r + x3i;
+  x0i = x1i + x3r;
+  a[j2] = wn4r * (x0r - x0i);
+  a[j2 + 1] = wn4r * (x0i + x0r);
+  x0r = x1r - x3i;
+  x0i = x1i - x3r;
+  a[j3] = -wn4r * (x0r + x0i);
+  a[j3 + 1] = -wn4r * (x0i - x0r);
+  x0r = a[j0 + 2] + a[j2 + 2];
+  x0i = -a[j0 + 3] - a[j2 + 3];
+  x1r = a[j0 + 2] - a[j2 + 2];
+  x1i = -a[j0 + 3] + a[j2 + 3];
+  x2r = a[j1 + 2] + a[j3 + 2];
+  x2i = a[j1 + 3] + a[j3 + 3];
+  x3r = a[j1 + 2] - a[j3 + 2];
+  x3i = a[j1 + 3] - a[j3 + 3];
+  a[j0 + 2] = x0r + x2r;
+  a[j0 + 3] = x0i - x2i;
+  a[j1 + 2] = x0r - x2r;
+  a[j1 + 3] = x0i + x2i;
+  x0r = x1r + x3i;
+  x0i = x1i + x3r;
+  a[j2 + 2] = wk1i * x0r - wk1r * x0i;
+  a[j2 + 3] = wk1i * x0i + wk1r * x0r;
+  x0r = x1r - x3i;
+  x0i = x1i - x3r;
+  a[j3 + 2] = wk3i * x0r + wk3r * x0i;
+  a[j3 + 3] = wk3i * x0i - wk3r * x0r;
+}
+
+#ifdef USE_CDFT_THREADS
+struct cdft_arg_st
+{
+  int n0;
+  int n;
+  float* a;
+  int nw;
+  float* w;
+};
+typedef struct cdft_arg_st cdft_arg_t;
+
+void cftrec4_th(int n, float* a, int nw, float* w)
+{
+  void* cftrec1_th(void* p);
+  void* cftrec2_th(void* p);
+  int i, idiv4, m, nthread;
+  cdft_thread_t th[4];
+  cdft_arg_t ag[4];
+
+  nthread = 2;
+  idiv4 = 0;
+  m = n >> 1;
+  if (n > CDFT_4THREADS_BEGIN_N) {
+    nthread = 4;
+    idiv4 = 1;
+    m >>= 1;
+  }
+  for (i = 0; i < nthread; i++) {
+    ag[i].n0 = n;
+    ag[i].n = m;
+    ag[i].a = &a[i * m];
+    ag[i].nw = nw;
+    ag[i].w = w;
+    if (i != idiv4) {
+      cdft_thread_create(&th[i], cftrec1_th, &ag[i]);
+    } else {
+      cdft_thread_create(&th[i], cftrec2_th, &ag[i]);
+    }
+  }
+  for (i = 0; i < nthread; i++) {
+    cdft_thread_wait(th[i]);
+  }
+}
+
+void* cftrec1_th(void* p)
+{
+  int cfttree(int n, int j, int k, float* a, int nw, float* w);
+  void cftleaf(int n, int isplt, float* a, int nw, float* w);
+  void cftmdl1(int n, float* a, float* w);
+  int isplt, j, k, m, n, n0, nw;
+  float *a, *w;
+
+  n0 = ((cdft_arg_t*) p)->n0;
+  n = ((cdft_arg_t*) p)->n;
+  a = ((cdft_arg_t*) p)->a;
+  nw = ((cdft_arg_t*) p)->nw;
+  w = ((cdft_arg_t*) p)->w;
+  m = n0;
+  while (m > 512) {
+    m >>= 2;
+    cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+  }
+  cftleaf(m, 1, &a[n - m], nw, w);
+  k = 0;
+  for (j = n - m; j > 0; j -= m) {
+    k++;
+    isplt = cfttree(m, j, k, a, nw, w);
+    cftleaf(m, isplt, &a[j - m], nw, w);
+  }
+  return (void*) 0;
+}
+
+void* cftrec2_th(void* p)
+{
+  int cfttree(int n, int j, int k, float* a, int nw, float* w);
+  void cftleaf(int n, int isplt, float* a, int nw, float* w);
+  void cftmdl2(int n, float* a, float* w);
+  int isplt, j, k, m, n, n0, nw;
+  float *a, *w;
+
+  n0 = ((cdft_arg_t*) p)->n0;
+  n = ((cdft_arg_t*) p)->n;
+  a = ((cdft_arg_t*) p)->a;
+  nw = ((cdft_arg_t*) p)->nw;
+  w = ((cdft_arg_t*) p)->w;
+  k = 1;
+  m = n0;
+  while (m > 512) {
+    m >>= 2;
+    k <<= 2;
+    cftmdl2(m, &a[n - m], &w[nw - m]);
+  }
+  cftleaf(m, 0, &a[n - m], nw, w);
+  k >>= 1;
+  for (j = n - m; j > 0; j -= m) {
+    k++;
+    isplt = cfttree(m, j, k, a, nw, w);
+    cftleaf(m, isplt, &a[j - m], nw, w);
+  }
+  return (void*) 0;
+}
+#endif /* USE_CDFT_THREADS */
+
+void cftrec4(int n, float* a, int nw, float* w)
+{
+  int cfttree(int n, int j, int k, float* a, int nw, float* w);
+  void cftleaf(int n, int isplt, float* a, int nw, float* w);
+  void cftmdl1(int n, float* a, float* w);
+  int isplt, j, k, m;
+
+  m = n;
+  while (m > 512) {
+    m >>= 2;
+    cftmdl1(m, &a[n - m], &w[nw - (m >> 1)]);
+  }
+  cftleaf(m, 1, &a[n - m], nw, w);
+  k = 0;
+  for (j = n - m; j > 0; j -= m) {
+    k++;
+    isplt = cfttree(m, j, k, a, nw, w);
+    cftleaf(m, isplt, &a[j - m], nw, w);
+  }
+}
+
+int cfttree(int n, int j, int k, float* a, int nw, float* w)
+{
+  void cftmdl1(int n, float* a, float* w);
+  void cftmdl2(int n, float* a, float* w);
+  int i, isplt, m;
+
+  if ((k & 3) != 0) {
+    isplt = k & 1;
+    if (isplt != 0) {
+      cftmdl1(n, &a[j - n], &w[nw - (n >> 1)]);
+    } else {
+      cftmdl2(n, &a[j - n], &w[nw - n]);
+    }
+  } else {
+    m = n;
+    for (i = k; (i & 3) == 0; i >>= 2) {
+      m <<= 2;
+    }
+    isplt = i & 1;
+    if (isplt != 0) {
+      while (m > 128) {
+        cftmdl1(m, &a[j - m], &w[nw - (m >> 1)]);
+        m >>= 2;
+      }
+    } else {
+      while (m > 128) {
+        cftmdl2(m, &a[j - m], &w[nw - m]);
+        m >>= 2;
+      }
+    }
+  }
+  return isplt;
+}
+
+void cftleaf(int n, int isplt, float* a, int nw, float* w)
+{
+  void cftmdl1(int n, float* a, float* w);
+  void cftmdl2(int n, float* a, float* w);
+  void cftf161(float* a, float* w);
+  void cftf162(float* a, float* w);
+  void cftf081(float* a, float* w);
+  void cftf082(float* a, float* w);
+
+  if (n == 512) {
+    cftmdl1(128, a, &w[nw - 64]);
+    cftf161(a, &w[nw - 8]);
+    cftf162(&a[32], &w[nw - 32]);
+    cftf161(&a[64], &w[nw - 8]);
+    cftf161(&a[96], &w[nw - 8]);
+    cftmdl2(128, &a[128], &w[nw - 128]);
+    cftf161(&a[128], &w[nw - 8]);
+    cftf162(&a[160], &w[nw - 32]);
+    cftf161(&a[192], &w[nw - 8]);
+    cftf162(&a[224], &w[nw - 32]);
+    cftmdl1(128, &a[256], &w[nw - 64]);
+    cftf161(&a[256], &w[nw - 8]);
+    cftf162(&a[288], &w[nw - 32]);
+    cftf161(&a[320], &w[nw - 8]);
+    cftf161(&a[352], &w[nw - 8]);
+    if (isplt != 0) {
+      cftmdl1(128, &a[384], &w[nw - 64]);
+      cftf161(&a[480], &w[nw - 8]);
+    } else {
+      cftmdl2(128, &a[384], &w[nw - 128]);
+      cftf162(&a[480], &w[nw - 32]);
+    }
+    cftf161(&a[384], &w[nw - 8]);
+    cftf162(&a[416], &w[nw - 32]);
+    cftf161(&a[448], &w[nw - 8]);
+  } else {
+    cftmdl1(64, a, &w[nw - 32]);
+    cftf081(a, &w[nw - 8]);
+    cftf082(&a[16], &w[nw - 8]);
+    cftf081(&a[32], &w[nw - 8]);
+    cftf081(&a[48], &w[nw - 8]);
+    cftmdl2(64, &a[64], &w[nw - 64]);
+    cftf081(&a[64], &w[nw - 8]);
+    cftf082(&a[80], &w[nw - 8]);
+    cftf081(&a[96], &w[nw - 8]);
+    cftf082(&a[112], &w[nw - 8]);
+    cftmdl1(64, &a[128], &w[nw - 32]);
+    cftf081(&a[128], &w[nw - 8]);
+    cftf082(&a[144], &w[nw - 8]);
+    cftf081(&a[160], &w[nw - 8]);
+    cftf081(&a[176], &w[nw - 8]);
+    if (isplt != 0) {
+      cftmdl1(64, &a[192], &w[nw - 32]);
+      cftf081(&a[240], &w[nw - 8]);
+    } else {
+      cftmdl2(64, &a[192], &w[nw - 64]);
+      cftf082(&a[240], &w[nw - 8]);
+    }
+    cftf081(&a[192], &w[nw - 8]);
+    cftf082(&a[208], &w[nw - 8]);
+    cftf081(&a[224], &w[nw - 8]);
+  }
+}
+
+void cftmdl1(int n, float* a, float* w)
+{
+  int j, j0, j1, j2, j3, k, m, mh;
+  float wn4r, wk1r, wk1i, wk3r, wk3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  mh = n >> 3;
+  m = 2 * mh;
+  j1 = m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[0] + a[j2];
+  x0i = a[1] + a[j2 + 1];
+  x1r = a[0] - a[j2];
+  x1i = a[1] - a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i - x2i;
+  a[j2] = x1r - x3i;
+  a[j2 + 1] = x1i + x3r;
+  a[j3] = x1r + x3i;
+  a[j3 + 1] = x1i - x3r;
+  wn4r = w[1];
+  k = 0;
+  for (j = 2; j < mh; j += 2) {
+    k += 4;
+    wk1r = w[k];
+    wk1i = w[k + 1];
+    wk3r = w[k + 2];
+    wk3i = w[k + 3];
+    j1 = j + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j] + a[j2];
+    x0i = a[j + 1] + a[j2 + 1];
+    x1r = a[j] - a[j2];
+    x1i = a[j + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j] = x0r + x2r;
+    a[j + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wk1r * x0r - wk1i * x0i;
+    a[j2 + 1] = wk1r * x0i + wk1i * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = wk3r * x0r + wk3i * x0i;
+    a[j3 + 1] = wk3r * x0i - wk3i * x0r;
+    j0 = m - j;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] + a[j2];
+    x0i = a[j0 + 1] + a[j2 + 1];
+    x1r = a[j0] - a[j2];
+    x1i = a[j0 + 1] - a[j2 + 1];
+    x2r = a[j1] + a[j3];
+    x2i = a[j1 + 1] + a[j3 + 1];
+    x3r = a[j1] - a[j3];
+    x3i = a[j1 + 1] - a[j3 + 1];
+    a[j0] = x0r + x2r;
+    a[j0 + 1] = x0i + x2i;
+    a[j1] = x0r - x2r;
+    a[j1 + 1] = x0i - x2i;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[j2] = wk1i * x0r - wk1r * x0i;
+    a[j2 + 1] = wk1i * x0i + wk1r * x0r;
+    x0r = x1r + x3i;
+    x0i = x1i - x3r;
+    a[j3] = wk3i * x0r + wk3r * x0i;
+    a[j3 + 1] = wk3i * x0i - wk3r * x0r;
+  }
+  j0 = mh;
+  j1 = j0 + m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[j0] + a[j2];
+  x0i = a[j0 + 1] + a[j2 + 1];
+  x1r = a[j0] - a[j2];
+  x1i = a[j0 + 1] - a[j2 + 1];
+  x2r = a[j1] + a[j3];
+  x2i = a[j1 + 1] + a[j3 + 1];
+  x3r = a[j1] - a[j3];
+  x3i = a[j1 + 1] - a[j3 + 1];
+  a[j0] = x0r + x2r;
+  a[j0 + 1] = x0i + x2i;
+  a[j1] = x0r - x2r;
+  a[j1 + 1] = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  a[j2] = wn4r * (x0r - x0i);
+  a[j2 + 1] = wn4r * (x0i + x0r);
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  a[j3] = -wn4r * (x0r + x0i);
+  a[j3 + 1] = -wn4r * (x0i - x0r);
+}
+
+void cftmdl2(int n, float* a, float* w)
+{
+  int j, j0, j1, j2, j3, k, kr, m, mh;
+  float wn4r, wk1r, wk1i, wk3r, wk3i, wd1r, wd1i, wd3r, wd3i;
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y2r, y2i;
+
+  mh = n >> 3;
+  m = 2 * mh;
+  wn4r = w[1];
+  j1 = m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[0] - a[j2 + 1];
+  x0i = a[1] + a[j2];
+  x1r = a[0] + a[j2 + 1];
+  x1i = a[1] - a[j2];
+  x2r = a[j1] - a[j3 + 1];
+  x2i = a[j1 + 1] + a[j3];
+  x3r = a[j1] + a[j3 + 1];
+  x3i = a[j1 + 1] - a[j3];
+  y0r = wn4r * (x2r - x2i);
+  y0i = wn4r * (x2i + x2r);
+  a[0] = x0r + y0r;
+  a[1] = x0i + y0i;
+  a[j1] = x0r - y0r;
+  a[j1 + 1] = x0i - y0i;
+  y0r = wn4r * (x3r - x3i);
+  y0i = wn4r * (x3i + x3r);
+  a[j2] = x1r - y0i;
+  a[j2 + 1] = x1i + y0r;
+  a[j3] = x1r + y0i;
+  a[j3 + 1] = x1i - y0r;
+  k = 0;
+  kr = 2 * m;
+  for (j = 2; j < mh; j += 2) {
+    k += 4;
+    wk1r = w[k];
+    wk1i = w[k + 1];
+    wk3r = w[k + 2];
+    wk3i = w[k + 3];
+    kr -= 4;
+    wd1i = w[kr];
+    wd1r = w[kr + 1];
+    wd3i = w[kr + 2];
+    wd3r = w[kr + 3];
+    j1 = j + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j] - a[j2 + 1];
+    x0i = a[j + 1] + a[j2];
+    x1r = a[j] + a[j2 + 1];
+    x1i = a[j + 1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wk1r * x0r - wk1i * x0i;
+    y0i = wk1r * x0i + wk1i * x0r;
+    y2r = wd1r * x2r - wd1i * x2i;
+    y2i = wd1r * x2i + wd1i * x2r;
+    a[j] = y0r + y2r;
+    a[j + 1] = y0i + y2i;
+    a[j1] = y0r - y2r;
+    a[j1 + 1] = y0i - y2i;
+    y0r = wk3r * x1r + wk3i * x1i;
+    y0i = wk3r * x1i - wk3i * x1r;
+    y2r = wd3r * x3r + wd3i * x3i;
+    y2i = wd3r * x3i - wd3i * x3r;
+    a[j2] = y0r + y2r;
+    a[j2 + 1] = y0i + y2i;
+    a[j3] = y0r - y2r;
+    a[j3 + 1] = y0i - y2i;
+    j0 = m - j;
+    j1 = j0 + m;
+    j2 = j1 + m;
+    j3 = j2 + m;
+    x0r = a[j0] - a[j2 + 1];
+    x0i = a[j0 + 1] + a[j2];
+    x1r = a[j0] + a[j2 + 1];
+    x1i = a[j0 + 1] - a[j2];
+    x2r = a[j1] - a[j3 + 1];
+    x2i = a[j1 + 1] + a[j3];
+    x3r = a[j1] + a[j3 + 1];
+    x3i = a[j1 + 1] - a[j3];
+    y0r = wd1i * x0r - wd1r * x0i;
+    y0i = wd1i * x0i + wd1r * x0r;
+    y2r = wk1i * x2r - wk1r * x2i;
+    y2i = wk1i * x2i + wk1r * x2r;
+    a[j0] = y0r + y2r;
+    a[j0 + 1] = y0i + y2i;
+    a[j1] = y0r - y2r;
+    a[j1 + 1] = y0i - y2i;
+    y0r = wd3i * x1r + wd3r * x1i;
+    y0i = wd3i * x1i - wd3r * x1r;
+    y2r = wk3i * x3r + wk3r * x3i;
+    y2i = wk3i * x3i - wk3r * x3r;
+    a[j2] = y0r + y2r;
+    a[j2 + 1] = y0i + y2i;
+    a[j3] = y0r - y2r;
+    a[j3 + 1] = y0i - y2i;
+  }
+  wk1r = w[m];
+  wk1i = w[m + 1];
+  j0 = mh;
+  j1 = j0 + m;
+  j2 = j1 + m;
+  j3 = j2 + m;
+  x0r = a[j0] - a[j2 + 1];
+  x0i = a[j0 + 1] + a[j2];
+  x1r = a[j0] + a[j2 + 1];
+  x1i = a[j0 + 1] - a[j2];
+  x2r = a[j1] - a[j3 + 1];
+  x2i = a[j1 + 1] + a[j3];
+  x3r = a[j1] + a[j3 + 1];
+  x3i = a[j1 + 1] - a[j3];
+  y0r = wk1r * x0r - wk1i * x0i;
+  y0i = wk1r * x0i + wk1i * x0r;
+  y2r = wk1i * x2r - wk1r * x2i;
+  y2i = wk1i * x2i + wk1r * x2r;
+  a[j0] = y0r + y2r;
+  a[j0 + 1] = y0i + y2i;
+  a[j1] = y0r - y2r;
+  a[j1 + 1] = y0i - y2i;
+  y0r = wk1i * x1r - wk1r * x1i;
+  y0i = wk1i * x1i + wk1r * x1r;
+  y2r = wk1r * x3r - wk1i * x3i;
+  y2i = wk1r * x3i + wk1i * x3r;
+  a[j2] = y0r - y2r;
+  a[j2 + 1] = y0i - y2i;
+  a[j3] = y0r + y2r;
+  a[j3 + 1] = y0i + y2i;
+}
+
+void cftfx41(int n, float* a, int nw, float* w)
+{
+  void cftf161(float* a, float* w);
+  void cftf162(float* a, float* w);
+  void cftf081(float* a, float* w);
+  void cftf082(float* a, float* w);
+
+  if (n == 128) {
+    cftf161(a, &w[nw - 8]);
+    cftf162(&a[32], &w[nw - 32]);
+    cftf161(&a[64], &w[nw - 8]);
+    cftf161(&a[96], &w[nw - 8]);
+  } else {
+    cftf081(a, &w[nw - 8]);
+    cftf082(&a[16], &w[nw - 8]);
+    cftf081(&a[32], &w[nw - 8]);
+    cftf081(&a[48], &w[nw - 8]);
+  }
+}
+
+void cftf161(float* a, float* w)
+{
+  float wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r,
+      y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i, y8r, y8i,
+      y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i, y14r, y14i,
+      y15r, y15i;
+
+  wn4r = w[1];
+  wk1r = w[2];
+  wk1i = w[3];
+  x0r = a[0] + a[16];
+  x0i = a[1] + a[17];
+  x1r = a[0] - a[16];
+  x1i = a[1] - a[17];
+  x2r = a[8] + a[24];
+  x2i = a[9] + a[25];
+  x3r = a[8] - a[24];
+  x3i = a[9] - a[25];
+  y0r = x0r + x2r;
+  y0i = x0i + x2i;
+  y4r = x0r - x2r;
+  y4i = x0i - x2i;
+  y8r = x1r - x3i;
+  y8i = x1i + x3r;
+  y12r = x1r + x3i;
+  y12i = x1i - x3r;
+  x0r = a[2] + a[18];
+  x0i = a[3] + a[19];
+  x1r = a[2] - a[18];
+  x1i = a[3] - a[19];
+  x2r = a[10] + a[26];
+  x2i = a[11] + a[27];
+  x3r = a[10] - a[26];
+  x3i = a[11] - a[27];
+  y1r = x0r + x2r;
+  y1i = x0i + x2i;
+  y5r = x0r - x2r;
+  y5i = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  y9r = wk1r * x0r - wk1i * x0i;
+  y9i = wk1r * x0i + wk1i * x0r;
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  y13r = wk1i * x0r - wk1r * x0i;
+  y13i = wk1i * x0i + wk1r * x0r;
+  x0r = a[4] + a[20];
+  x0i = a[5] + a[21];
+  x1r = a[4] - a[20];
+  x1i = a[5] - a[21];
+  x2r = a[12] + a[28];
+  x2i = a[13] + a[29];
+  x3r = a[12] - a[28];
+  x3i = a[13] - a[29];
+  y2r = x0r + x2r;
+  y2i = x0i + x2i;
+  y6r = x0r - x2r;
+  y6i = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  y10r = wn4r * (x0r - x0i);
+  y10i = wn4r * (x0i + x0r);
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  y14r = wn4r * (x0r + x0i);
+  y14i = wn4r * (x0i - x0r);
+  x0r = a[6] + a[22];
+  x0i = a[7] + a[23];
+  x1r = a[6] - a[22];
+  x1i = a[7] - a[23];
+  x2r = a[14] + a[30];
+  x2i = a[15] + a[31];
+  x3r = a[14] - a[30];
+  x3i = a[15] - a[31];
+  y3r = x0r + x2r;
+  y3i = x0i + x2i;
+  y7r = x0r - x2r;
+  y7i = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  y11r = wk1i * x0r - wk1r * x0i;
+  y11i = wk1i * x0i + wk1r * x0r;
+  x0r = x1r + x3i;
+  x0i = x1i - x3r;
+  y15r = wk1r * x0r - wk1i * x0i;
+  y15i = wk1r * x0i + wk1i * x0r;
+  x0r = y12r - y14r;
+  x0i = y12i - y14i;
+  x1r = y12r + y14r;
+  x1i = y12i + y14i;
+  x2r = y13r - y15r;
+  x2i = y13i - y15i;
+  x3r = y13r + y15r;
+  x3i = y13i + y15i;
+  a[24] = x0r + x2r;
+  a[25] = x0i + x2i;
+  a[26] = x0r - x2r;
+  a[27] = x0i - x2i;
+  a[28] = x1r - x3i;
+  a[29] = x1i + x3r;
+  a[30] = x1r + x3i;
+  a[31] = x1i - x3r;
+  x0r = y8r + y10r;
+  x0i = y8i + y10i;
+  x1r = y8r - y10r;
+  x1i = y8i - y10i;
+  x2r = y9r + y11r;
+  x2i = y9i + y11i;
+  x3r = y9r - y11r;
+  x3i = y9i - y11i;
+  a[16] = x0r + x2r;
+  a[17] = x0i + x2i;
+  a[18] = x0r - x2r;
+  a[19] = x0i - x2i;
+  a[20] = x1r - x3i;
+  a[21] = x1i + x3r;
+  a[22] = x1r + x3i;
+  a[23] = x1i - x3r;
+  x0r = y5r - y7i;
+  x0i = y5i + y7r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  x0r = y5r + y7i;
+  x0i = y5i - y7r;
+  x3r = wn4r * (x0r - x0i);
+  x3i = wn4r * (x0i + x0r);
+  x0r = y4r - y6i;
+  x0i = y4i + y6r;
+  x1r = y4r + y6i;
+  x1i = y4i - y6r;
+  a[8] = x0r + x2r;
+  a[9] = x0i + x2i;
+  a[10] = x0r - x2r;
+  a[11] = x0i - x2i;
+  a[12] = x1r - x3i;
+  a[13] = x1i + x3r;
+  a[14] = x1r + x3i;
+  a[15] = x1i - x3r;
+  x0r = y0r + y2r;
+  x0i = y0i + y2i;
+  x1r = y0r - y2r;
+  x1i = y0i - y2i;
+  x2r = y1r + y3r;
+  x2i = y1i + y3i;
+  x3r = y1r - y3r;
+  x3i = y1i - y3i;
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[2] = x0r - x2r;
+  a[3] = x0i - x2i;
+  a[4] = x1r - x3i;
+  a[5] = x1i + x3r;
+  a[6] = x1r + x3i;
+  a[7] = x1i - x3r;
+}
+
+void cftf162(float* a, float* w)
+{
+  float wn4r, wk1r, wk1i, wk2r, wk2i, wk3r, wk3i, x0r, x0i, x1r, x1i, x2r, x2i,
+      y0r, y0i, y1r, y1i, y2r, y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r,
+      y7i, y8r, y8i, y9r, y9i, y10r, y10i, y11r, y11i, y12r, y12i, y13r, y13i,
+      y14r, y14i, y15r, y15i;
+
+  wn4r = w[1];
+  wk1r = w[4];
+  wk1i = w[5];
+  wk3r = w[6];
+  wk3i = -w[7];
+  wk2r = w[8];
+  wk2i = w[9];
+  x1r = a[0] - a[17];
+  x1i = a[1] + a[16];
+  x0r = a[8] - a[25];
+  x0i = a[9] + a[24];
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  y0r = x1r + x2r;
+  y0i = x1i + x2i;
+  y4r = x1r - x2r;
+  y4i = x1i - x2i;
+  x1r = a[0] + a[17];
+  x1i = a[1] - a[16];
+  x0r = a[8] + a[25];
+  x0i = a[9] - a[24];
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  y8r = x1r - x2i;
+  y8i = x1i + x2r;
+  y12r = x1r + x2i;
+  y12i = x1i - x2r;
+  x0r = a[2] - a[19];
+  x0i = a[3] + a[18];
+  x1r = wk1r * x0r - wk1i * x0i;
+  x1i = wk1r * x0i + wk1i * x0r;
+  x0r = a[10] - a[27];
+  x0i = a[11] + a[26];
+  x2r = wk3i * x0r - wk3r * x0i;
+  x2i = wk3i * x0i + wk3r * x0r;
+  y1r = x1r + x2r;
+  y1i = x1i + x2i;
+  y5r = x1r - x2r;
+  y5i = x1i - x2i;
+  x0r = a[2] + a[19];
+  x0i = a[3] - a[18];
+  x1r = wk3r * x0r - wk3i * x0i;
+  x1i = wk3r * x0i + wk3i * x0r;
+  x0r = a[10] + a[27];
+  x0i = a[11] - a[26];
+  x2r = wk1r * x0r + wk1i * x0i;
+  x2i = wk1r * x0i - wk1i * x0r;
+  y9r = x1r - x2r;
+  y9i = x1i - x2i;
+  y13r = x1r + x2r;
+  y13i = x1i + x2i;
+  x0r = a[4] - a[21];
+  x0i = a[5] + a[20];
+  x1r = wk2r * x0r - wk2i * x0i;
+  x1i = wk2r * x0i + wk2i * x0r;
+  x0r = a[12] - a[29];
+  x0i = a[13] + a[28];
+  x2r = wk2i * x0r - wk2r * x0i;
+  x2i = wk2i * x0i + wk2r * x0r;
+  y2r = x1r + x2r;
+  y2i = x1i + x2i;
+  y6r = x1r - x2r;
+  y6i = x1i - x2i;
+  x0r = a[4] + a[21];
+  x0i = a[5] - a[20];
+  x1r = wk2i * x0r - wk2r * x0i;
+  x1i = wk2i * x0i + wk2r * x0r;
+  x0r = a[12] + a[29];
+  x0i = a[13] - a[28];
+  x2r = wk2r * x0r - wk2i * x0i;
+  x2i = wk2r * x0i + wk2i * x0r;
+  y10r = x1r - x2r;
+  y10i = x1i - x2i;
+  y14r = x1r + x2r;
+  y14i = x1i + x2i;
+  x0r = a[6] - a[23];
+  x0i = a[7] + a[22];
+  x1r = wk3r * x0r - wk3i * x0i;
+  x1i = wk3r * x0i + wk3i * x0r;
+  x0r = a[14] - a[31];
+  x0i = a[15] + a[30];
+  x2r = wk1i * x0r - wk1r * x0i;
+  x2i = wk1i * x0i + wk1r * x0r;
+  y3r = x1r + x2r;
+  y3i = x1i + x2i;
+  y7r = x1r - x2r;
+  y7i = x1i - x2i;
+  x0r = a[6] + a[23];
+  x0i = a[7] - a[22];
+  x1r = wk1i * x0r + wk1r * x0i;
+  x1i = wk1i * x0i - wk1r * x0r;
+  x0r = a[14] + a[31];
+  x0i = a[15] - a[30];
+  x2r = wk3i * x0r - wk3r * x0i;
+  x2i = wk3i * x0i + wk3r * x0r;
+  y11r = x1r + x2r;
+  y11i = x1i + x2i;
+  y15r = x1r - x2r;
+  y15i = x1i - x2i;
+  x1r = y0r + y2r;
+  x1i = y0i + y2i;
+  x2r = y1r + y3r;
+  x2i = y1i + y3i;
+  a[0] = x1r + x2r;
+  a[1] = x1i + x2i;
+  a[2] = x1r - x2r;
+  a[3] = x1i - x2i;
+  x1r = y0r - y2r;
+  x1i = y0i - y2i;
+  x2r = y1r - y3r;
+  x2i = y1i - y3i;
+  a[4] = x1r - x2i;
+  a[5] = x1i + x2r;
+  a[6] = x1r + x2i;
+  a[7] = x1i - x2r;
+  x1r = y4r - y6i;
+  x1i = y4i + y6r;
+  x0r = y5r - y7i;
+  x0i = y5i + y7r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  a[8] = x1r + x2r;
+  a[9] = x1i + x2i;
+  a[10] = x1r - x2r;
+  a[11] = x1i - x2i;
+  x1r = y4r + y6i;
+  x1i = y4i - y6r;
+  x0r = y5r + y7i;
+  x0i = y5i - y7r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  a[12] = x1r - x2i;
+  a[13] = x1i + x2r;
+  a[14] = x1r + x2i;
+  a[15] = x1i - x2r;
+  x1r = y8r + y10r;
+  x1i = y8i + y10i;
+  x2r = y9r - y11r;
+  x2i = y9i - y11i;
+  a[16] = x1r + x2r;
+  a[17] = x1i + x2i;
+  a[18] = x1r - x2r;
+  a[19] = x1i - x2i;
+  x1r = y8r - y10r;
+  x1i = y8i - y10i;
+  x2r = y9r + y11r;
+  x2i = y9i + y11i;
+  a[20] = x1r - x2i;
+  a[21] = x1i + x2r;
+  a[22] = x1r + x2i;
+  a[23] = x1i - x2r;
+  x1r = y12r - y14i;
+  x1i = y12i + y14r;
+  x0r = y13r + y15i;
+  x0i = y13i - y15r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  a[24] = x1r + x2r;
+  a[25] = x1i + x2i;
+  a[26] = x1r - x2r;
+  a[27] = x1i - x2i;
+  x1r = y12r + y14i;
+  x1i = y12i - y14r;
+  x0r = y13r - y15i;
+  x0i = y13i + y15r;
+  x2r = wn4r * (x0r - x0i);
+  x2i = wn4r * (x0i + x0r);
+  a[28] = x1r - x2i;
+  a[29] = x1i + x2r;
+  a[30] = x1r + x2i;
+  a[31] = x1i - x2r;
+}
+
+void cftf081(float* a, float* w)
+{
+  float wn4r, x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i, y0r, y0i, y1r, y1i, y2r,
+      y2i, y3r, y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+
+  wn4r = w[1];
+  x0r = a[0] + a[8];
+  x0i = a[1] + a[9];
+  x1r = a[0] - a[8];
+  x1i = a[1] - a[9];
+  x2r = a[4] + a[12];
+  x2i = a[5] + a[13];
+  x3r = a[4] - a[12];
+  x3i = a[5] - a[13];
+  y0r = x0r + x2r;
+  y0i = x0i + x2i;
+  y2r = x0r - x2r;
+  y2i = x0i - x2i;
+  y1r = x1r - x3i;
+  y1i = x1i + x3r;
+  y3r = x1r + x3i;
+  y3i = x1i - x3r;
+  x0r = a[2] + a[10];
+  x0i = a[3] + a[11];
+  x1r = a[2] - a[10];
+  x1i = a[3] - a[11];
+  x2r = a[6] + a[14];
+  x2i = a[7] + a[15];
+  x3r = a[6] - a[14];
+  x3i = a[7] - a[15];
+  y4r = x0r + x2r;
+  y4i = x0i + x2i;
+  y6r = x0r - x2r;
+  y6i = x0i - x2i;
+  x0r = x1r - x3i;
+  x0i = x1i + x3r;
+  x2r = x1r + x3i;
+  x2i = x1i - x3r;
+  y5r = wn4r * (x0r - x0i);
+  y5i = wn4r * (x0r + x0i);
+  y7r = wn4r * (x2r - x2i);
+  y7i = wn4r * (x2r + x2i);
+  a[8] = y1r + y5r;
+  a[9] = y1i + y5i;
+  a[10] = y1r - y5r;
+  a[11] = y1i - y5i;
+  a[12] = y3r - y7i;
+  a[13] = y3i + y7r;
+  a[14] = y3r + y7i;
+  a[15] = y3i - y7r;
+  a[0] = y0r + y4r;
+  a[1] = y0i + y4i;
+  a[2] = y0r - y4r;
+  a[3] = y0i - y4i;
+  a[4] = y2r - y6i;
+  a[5] = y2i + y6r;
+  a[6] = y2r + y6i;
+  a[7] = y2i - y6r;
+}
+
+void cftf082(float* a, float* w)
+{
+  float wn4r, wk1r, wk1i, x0r, x0i, x1r, x1i, y0r, y0i, y1r, y1i, y2r, y2i, y3r,
+      y3i, y4r, y4i, y5r, y5i, y6r, y6i, y7r, y7i;
+
+  wn4r = w[1];
+  wk1r = w[2];
+  wk1i = w[3];
+  y0r = a[0] - a[9];
+  y0i = a[1] + a[8];
+  y1r = a[0] + a[9];
+  y1i = a[1] - a[8];
+  x0r = a[4] - a[13];
+  x0i = a[5] + a[12];
+  y2r = wn4r * (x0r - x0i);
+  y2i = wn4r * (x0i + x0r);
+  x0r = a[4] + a[13];
+  x0i = a[5] - a[12];
+  y3r = wn4r * (x0r - x0i);
+  y3i = wn4r * (x0i + x0r);
+  x0r = a[2] - a[11];
+  x0i = a[3] + a[10];
+  y4r = wk1r * x0r - wk1i * x0i;
+  y4i = wk1r * x0i + wk1i * x0r;
+  x0r = a[2] + a[11];
+  x0i = a[3] - a[10];
+  y5r = wk1i * x0r - wk1r * x0i;
+  y5i = wk1i * x0i + wk1r * x0r;
+  x0r = a[6] - a[15];
+  x0i = a[7] + a[14];
+  y6r = wk1i * x0r - wk1r * x0i;
+  y6i = wk1i * x0i + wk1r * x0r;
+  x0r = a[6] + a[15];
+  x0i = a[7] - a[14];
+  y7r = wk1r * x0r - wk1i * x0i;
+  y7i = wk1r * x0i + wk1i * x0r;
+  x0r = y0r + y2r;
+  x0i = y0i + y2i;
+  x1r = y4r + y6r;
+  x1i = y4i + y6i;
+  a[0] = x0r + x1r;
+  a[1] = x0i + x1i;
+  a[2] = x0r - x1r;
+  a[3] = x0i - x1i;
+  x0r = y0r - y2r;
+  x0i = y0i - y2i;
+  x1r = y4r - y6r;
+  x1i = y4i - y6i;
+  a[4] = x0r - x1i;
+  a[5] = x0i + x1r;
+  a[6] = x0r + x1i;
+  a[7] = x0i - x1r;
+  x0r = y1r - y3i;
+  x0i = y1i + y3r;
+  x1r = y5r - y7r;
+  x1i = y5i - y7i;
+  a[8] = x0r + x1r;
+  a[9] = x0i + x1i;
+  a[10] = x0r - x1r;
+  a[11] = x0i - x1i;
+  x0r = y1r + y3i;
+  x0i = y1i - y3r;
+  x1r = y5r + y7r;
+  x1i = y5i + y7i;
+  a[12] = x0r - x1i;
+  a[13] = x0i + x1r;
+  a[14] = x0r + x1i;
+  a[15] = x0i - x1r;
+}
+
+void cftf040(float* a)
+{
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  x0r = a[0] + a[4];
+  x0i = a[1] + a[5];
+  x1r = a[0] - a[4];
+  x1i = a[1] - a[5];
+  x2r = a[2] + a[6];
+  x2i = a[3] + a[7];
+  x3r = a[2] - a[6];
+  x3i = a[3] - a[7];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[2] = x1r - x3i;
+  a[3] = x1i + x3r;
+  a[4] = x0r - x2r;
+  a[5] = x0i - x2i;
+  a[6] = x1r + x3i;
+  a[7] = x1i - x3r;
+}
+
+void cftb040(float* a)
+{
+  float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+
+  x0r = a[0] + a[4];
+  x0i = a[1] + a[5];
+  x1r = a[0] - a[4];
+  x1i = a[1] - a[5];
+  x2r = a[2] + a[6];
+  x2i = a[3] + a[7];
+  x3r = a[2] - a[6];
+  x3i = a[3] - a[7];
+  a[0] = x0r + x2r;
+  a[1] = x0i + x2i;
+  a[2] = x1r + x3i;
+  a[3] = x1i - x3r;
+  a[4] = x0r - x2r;
+  a[5] = x0i - x2i;
+  a[6] = x1r - x3i;
+  a[7] = x1i + x3r;
+}
+
+void cftx020(float* a)
+{
+  float x0r, x0i;
+
+  x0r = a[0] - a[2];
+  x0i = a[1] - a[3];
+  a[0] += a[2];
+  a[1] += a[3];
+  a[2] = x0r;
+  a[3] = x0i;
+}
+
+void rftfsub(int n, float* a, int nc, float* c)
+{
+  int j, k, kk, ks, m;
+  float wkr, wki, xr, xi, yr, yi;
+
+  m = n >> 1;
+  ks = 2 * nc / m;
+  kk = 0;
+  for (j = 2; j < m; j += 2) {
+    k = n - j;
+    kk += ks;
+    wkr = 0.5 - c[nc - kk];
+    wki = c[kk];
+    xr = a[j] - a[k];
+    xi = a[j + 1] + a[k + 1];
+    yr = wkr * xr - wki * xi;
+    yi = wkr * xi + wki * xr;
+    a[j] -= yr;
+    a[j + 1] -= yi;
+    a[k] += yr;
+    a[k + 1] -= yi;
+  }
+}
+
+void rftbsub(int n, float* a, int nc, float* c)
+{
+  int j, k, kk, ks, m;
+  float wkr, wki, xr, xi, yr, yi;
+
+  m = n >> 1;
+  ks = 2 * nc / m;
+  kk = 0;
+  for (j = 2; j < m; j += 2) {
+    k = n - j;
+    kk += ks;
+    wkr = 0.5 - c[nc - kk];
+    wki = c[kk];
+    xr = a[j] - a[k];
+    xi = a[j + 1] + a[k + 1];
+    yr = wkr * xr + wki * xi;
+    yi = wkr * xi - wki * xr;
+    a[j] -= yr;
+    a[j + 1] -= yi;
+    a[k] += yr;
+    a[k + 1] -= yi;
+  }
+}
+
+void dctsub(int n, float* a, int nc, float* c)
+{
+  int j, k, kk, ks, m;
+  float wkr, wki, xr;
+
+  m = n >> 1;
+  ks = nc / n;
+  kk = 0;
+  for (j = 1; j < m; j++) {
+    k = n - j;
+    kk += ks;
+    wkr = c[kk] - c[nc - kk];
+    wki = c[kk] + c[nc - kk];
+    xr = wki * a[j] - wkr * a[k];
+    a[j] = wkr * a[j] + wki * a[k];
+    a[k] = xr;
+  }
+  a[m] *= c[0];
+}
+
+void dstsub(int n, float* a, int nc, float* c)
+{
+  int j, k, kk, ks, m;
+  float wkr, wki, xr;
+
+  m = n >> 1;
+  ks = nc / n;
+  kk = 0;
+  for (j = 1; j < m; j++) {
+    k = n - j;
+    kk += ks;
+    wkr = c[kk] - c[nc - kk];
+    wki = c[kk] + c[nc - kk];
+    xr = wki * a[k] - wkr * a[j];
+    a[k] = wkr * a[k] + wki * a[j];
+    a[j] = xr;
+  }
+  a[m] *= c[0];
+}
+
+NEXTPNR_NAMESPACE_END
diff --git a/3rdparty/oourafft/fftsg.h b/3rdparty/oourafft/fftsg.h
new file mode 100644
index 00000000..4add9311
--- /dev/null
+++ b/3rdparty/oourafft/fftsg.h
@@ -0,0 +1,24 @@
+#ifndef FFTSG_H
+#define FFTSG_H
+#include "nextpnr_namespaces.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+//
+// The following FFT library came from
+// http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+//
+//
+/// 1D FFT ////////////////////////////////////////////////////////////////
+void ddct(int n, int isgn, float *a, int *ip, float *w);
+void ddst(int n, int isgn, float *a, int *ip, float *w);
+
+/// 2D FFT ////////////////////////////////////////////////////////////////
+void ddct2d(int n1, int n2, int isgn, float **a, float *t, int *ip, float *w);
+void ddsct2d(int n1, int n2, int isgn, float **a, float *t, int *ip, float *w);
+void ddcst2d(int n1, int n2, int isgn, float **a, float *t, int *ip, float *w);
+
+
+NEXTPNR_NAMESPACE_END
+
+#endif
diff --git a/3rdparty/oourafft/fftsg2d.cc b/3rdparty/oourafft/fftsg2d.cc
new file mode 100644
index 00000000..2bbc2428
--- /dev/null
+++ b/3rdparty/oourafft/fftsg2d.cc
@@ -0,0 +1,1419 @@
+///////////////////////////////////////////////////////////////////////////////
+// BSD 3-Clause License
+//
+// Copyright (c) 2018-2020, The Regents of the University of California
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+//
+// * Redistributions of source code must retain the above copyright notice, this
+//   list of conditions and the following disclaimer.
+//
+// * Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// * Neither the name of the copyright holder nor the names of its
+//   contributors may be used to endorse or promote products derived from
+//   this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE
+// DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+// FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+// SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+// CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+// OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+///////////////////////////////////////////////////////////////////////////////
+
+/*
+Fast Fourier/Cosine/Sine Transform
+    dimension   :two
+    data length :power of 2
+    decimation  :frequency
+    radix       :split-radix, row-column
+    data        :inplace
+    table       :use
+functions
+    cdft2d: Complex Discrete Fourier Transform
+    rdft2d: Real Discrete Fourier Transform
+    ddct2d: Discrete Cosine Transform
+    ddst2d: Discrete Sine Transform
+function prototypes
+    void cdft2d(int, int, int, float   **, float   *, int *, float   *);
+    void rdft2d(int, int, int, float   **, float   *, int *, float   *);
+    void rdft2dsort(int, int, int, float   **);
+    void ddct2d(int, int, int, float   **, float   *, int *, float   *);
+    void ddst2d(int, int, int, float   **, float   *, int *, float   *);
+necessary package
+    fftsg.c  : 1D-FFT package
+macro definitions
+    USE_FFT2D_PTHREADS : default=not defined
+        FFT2D_MAX_THREADS     : must be 2^N, default=4
+        FFT2D_THREADS_BEGIN_N : default=65536
+    USE_FFT2D_WINTHREADS : default=not defined
+        FFT2D_MAX_THREADS     : must be 2^N, default=4
+        FFT2D_THREADS_BEGIN_N : default=131072
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] *
+                            exp(2*pi*i*j1*k1/n1) *
+                            exp(2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+        <case2>
+            X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] *
+                            exp(-2*pi*i*j1*k1/n1) *
+                            exp(-2*pi*i*j2*k2/n2), 0<=k1<n1, 0<=k2<n2
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft2d(n1, 2*n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft2d(n1, 2*n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 1, n1 = power of 2
+        2*n2   :data length (int)
+                n2 >= 1, n2 = power of 2
+        a[0...n1-1][0...2*n2-1]
+               :input/output data (float   **)
+                input data
+                    a[j1][2*j2] = Re(x[j1][j2]),
+                    a[j1][2*j2+1] = Im(x[j1][j2]),
+                    0<=j1<n1, 0<=j2<n2
+                output data
+                    a[k1][2*k2] = Re(X[k1][k2]),
+                    a[k1][2*k2+1] = Im(X[k1][k2]),
+                    0<=k1<n1, 0<=k2<n2
+        t[0...*]
+               :work area (float   *)
+                length of t >= 8*n1,                   if single thread,
+                length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads,
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (float   *)
+                length of w >= max(n1/2, n2/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            cdft2d(n1, 2*n2, -1, a, t, ip, w);
+        is
+            cdft2d(n1, 2*n2, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= 2 * n2 - 1; j2++) {
+                    a[j1][j2] *= 1.0 / n1 / n2;
+                }
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+                            0<=k1<n1, 0<=k2<n2
+            I[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2),
+                            0<=k1<n1, 0<=k2<n2
+        <case2> IRDFT (excluding scale)
+            a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1
+                            (R[j1][j2] *
+                            cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) +
+                            I[j1][j2] *
+                            sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)),
+                            0<=k1<n1, 0<=k2<n2
+        (notes: R[n1-k1][n2-k2] = R[k1][k2],
+                I[n1-k1][n2-k2] = -I[k1][k2],
+                R[n1-k1][0] = R[k1][0],
+                I[n1-k1][0] = -I[k1][0],
+                R[0][n2-k2] = R[0][k2],
+                I[0][n2-k2] = -I[0][k2],
+                0<k1<n1, 0<k2<n2)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft2d(n1, n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft2d(n1, n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (float   **)
+                <case1>
+                    output data
+                        a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2],
+                        a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2],
+                            0<k1<n1, 0<k2<n2/2,
+                        a[0][2*k2] = R[0][k2] = R[0][n2-k2],
+                        a[0][2*k2+1] = I[0][k2] = -I[0][n2-k2],
+                            0<k2<n2/2,
+                        a[k1][0] = R[k1][0] = R[n1-k1][0],
+                        a[k1][1] = I[k1][0] = -I[n1-k1][0],
+                        a[n1-k1][1] = R[k1][n2/2] = R[n1-k1][n2/2],
+                        a[n1-k1][0] = -I[k1][n2/2] = I[n1-k1][n2/2],
+                            0<k1<n1/2,
+                        a[0][0] = R[0][0],
+                        a[0][1] = R[0][n2/2],
+                        a[n1/2][0] = R[n1/2][0],
+                        a[n1/2][1] = R[n1/2][n2/2]
+                <case2>
+                    input data
+                        a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2],
+                        a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2],
+                            0<j1<n1, 0<j2<n2/2,
+                        a[0][2*j2] = R[0][j2] = R[0][n2-j2],
+                        a[0][2*j2+1] = I[0][j2] = -I[0][n2-j2],
+                            0<j2<n2/2,
+                        a[j1][0] = R[j1][0] = R[n1-j1][0],
+                        a[j1][1] = I[j1][0] = -I[n1-j1][0],
+                        a[n1-j1][1] = R[j1][n2/2] = R[n1-j1][n2/2],
+                        a[n1-j1][0] = -I[j1][n2/2] = I[n1-j1][n2/2],
+                            0<j1<n1/2,
+                        a[0][0] = R[0][0],
+                        a[0][1] = R[0][n2/2],
+                        a[n1/2][0] = R[n1/2][0],
+                        a[n1/2][1] = R[n1/2][n2/2]
+                ---- output ordering ----
+                    rdft2d(n1, n2, 1, a, t, ip, w);
+                    rdft2dsort(n1, n2, 1, a);
+                    // stored data is a[0...n1-1][0...n2+1]:
+                    // a[k1][2*k2] = R[k1][k2],
+                    // a[k1][2*k2+1] = I[k1][k2],
+                    // 0<=k1<n1, 0<=k2<=n2/2.
+                    // the stored data is larger than the input data!
+                ---- input ordering ----
+                    rdft2dsort(n1, n2, -1, a);
+                    rdft2d(n1, n2, -1, a, t, ip, w);
+        t[0...*]
+               :work area (float   *)
+                length of t >= 8*n1,                   if single thread,
+                length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads,
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (float   *)
+                length of w >= max(n1/2, n2/4) + n2/4
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            rdft2d(n1, n2, 1, a, t, ip, w);
+        is
+            rdft2d(n1, n2, -1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 2.0 / n1 / n2;
+                }
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+                            cos(pi*j1*(k1+1/2)/n1) *
+                            cos(pi*j2*(k2+1/2)/n2),
+                            0<=k1<n1, 0<=k2<n2
+        <case2> DCT
+            C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+                            cos(pi*(j1+1/2)*k1/n1) *
+                            cos(pi*(j2+1/2)*k2/n2),
+                            0<=k1<n1, 0<=k2<n2
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct2d(n1, n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct2d(n1, n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (float   **)
+                output data
+                    a[k1][k2] = C[k1][k2], 0<=k1<n1, 0<=k2<n2
+        t[0...*]
+               :work area (float   *)
+                length of t >= 4*n1,                   if single thread,
+                length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads,
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1/2, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (float   *)
+                length of w >= max(n1*3/2, n2*3/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddct2d(n1, n2, -1, a, t, ip, w);
+        is
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                a[j1][0] *= 0.5;
+            }
+            for (j2 = 0; j2 <= n2 - 1; j2++) {
+                a[0][j2] *= 0.5;
+            }
+            ddct2d(n1, n2, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 4.0 / n1 / n2;
+                }
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] *
+                            sin(pi*j1*(k1+1/2)/n1) *
+                            sin(pi*j2*(k2+1/2)/n2),
+                            0<=k1<n1, 0<=k2<n2
+        <case2> DST
+            S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] *
+                            sin(pi*(j1+1/2)*k1/n1) *
+                            sin(pi*(j2+1/2)*k2/n2),
+                            0<k1<=n1, 0<k2<=n2
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst2d(n1, n2, 1, a, t, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst2d(n1, n2, -1, a, t, ip, w);
+    [parameters]
+        n1     :data length (int)
+                n1 >= 2, n1 = power of 2
+        n2     :data length (int)
+                n2 >= 2, n2 = power of 2
+        a[0...n1-1][0...n2-1]
+               :input/output data (float   **)
+                <case1>
+                    input data
+                        a[j1][j2] = A[j1][j2], 0<j1<n1, 0<j2<n2,
+                        a[j1][0] = A[j1][n2], 0<j1<n1,
+                        a[0][j2] = A[n1][j2], 0<j2<n2,
+                        a[0][0] = A[n1][n2]
+                        (i.e. A[j1][j2] = a[j1 % n1][j2 % n2])
+                    output data
+                        a[k1][k2] = S[k1][k2], 0<=k1<n1, 0<=k2<n2
+                <case2>
+                    output data
+                        a[k1][k2] = S[k1][k2], 0<k1<n1, 0<k2<n2,
+                        a[k1][0] = S[k1][n2], 0<k1<n1,
+                        a[0][k2] = S[n1][k2], 0<k2<n2,
+                        a[0][0] = S[n1][n2]
+                        (i.e. S[k1][k2] = a[k1 % n1][k2 % n2])
+        t[0...*]
+               :work area (float   *)
+                length of t >= 4*n1,                   if single thread,
+                length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads,
+                t is dynamically allocated, if t == NULL.
+        ip[0...*]
+               :work area for bit reversal (int *)
+                length of ip >= 2+sqrt(n)
+                (n = max(n1/2, n2/2))
+                ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...*]
+               :cos/sin table (float   *)
+                length of w >= max(n1*3/2, n2*3/2)
+                w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddst2d(n1, n2, -1, a, t, ip, w);
+        is
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                a[j1][0] *= 0.5;
+            }
+            for (j2 = 0; j2 <= n2 - 1; j2++) {
+                a[0][j2] *= 0.5;
+            }
+            ddst2d(n1, n2, 1, a, t, ip, w);
+            for (j1 = 0; j1 <= n1 - 1; j1++) {
+                for (j2 = 0; j2 <= n2 - 1; j2++) {
+                    a[j1][j2] *= 4.0 / n1 / n2;
+                }
+            }
+        .
+*/
+
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#define fft2d_alloc_error_check(p)                        \
+  {                                                       \
+    if((p) == NULL) {                                     \
+      fprintf(stderr, "fft2d memory allocation error\n"); \
+      exit(1);                                            \
+    }                                                     \
+  }
+
+#ifdef USE_FFT2D_PTHREADS
+#define USE_FFT2D_THREADS
+#ifndef FFT2D_MAX_THREADS
+#define FFT2D_MAX_THREADS 4
+#endif
+#ifndef FFT2D_THREADS_BEGIN_N
+#define FFT2D_THREADS_BEGIN_N 65536
+#endif
+#include <pthread.h>
+#define fft2d_thread_t pthread_t
+#define fft2d_thread_create(thp, func, argp)                   \
+  {                                                            \
+    if(pthread_create(thp, NULL, func, (void *)(argp)) != 0) { \
+      fprintf(stderr, "fft2d thread error\n");                 \
+      exit(1);                                                 \
+    }                                                          \
+  }
+#define fft2d_thread_wait(th)                  \
+  {                                            \
+    if(pthread_join(th, NULL) != 0) {          \
+      fprintf(stderr, "fft2d thread error\n"); \
+      exit(1);                                 \
+    }                                          \
+  }
+#endif /* USE_FFT2D_PTHREADS */
+
+#ifdef USE_FFT2D_WINTHREADS
+#define USE_FFT2D_THREADS
+#ifndef FFT2D_MAX_THREADS
+#define FFT2D_MAX_THREADS 4
+#endif
+#ifndef FFT2D_THREADS_BEGIN_N
+#define FFT2D_THREADS_BEGIN_N 131072
+#endif
+#include <windows.h>
+#define fft2d_thread_t HANDLE
+#define fft2d_thread_create(thp, func, argp)                       \
+  {                                                                \
+    DWORD thid;                                                    \
+    *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)(func), \
+                          (LPVOID)(argp), 0, &thid);               \
+    if(*(thp) == 0) {                                              \
+      fprintf(stderr, "fft2d thread error\n");                     \
+      exit(1);                                                     \
+    }                                                              \
+  }
+#define fft2d_thread_wait(th)          \
+  {                                    \
+    WaitForSingleObject(th, INFINITE); \
+    CloseHandle(th);                   \
+  }
+#endif /* USE_FFT2D_WINTHREADS */
+
+#include "nextpnr_namespaces.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+
+void cdft2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void cdft(int n, int isgn, float* a, int* ip, float* w);
+  void cdft2d_sub(
+      int n1, int n2, int isgn, float** a, float* t, int* ip, float* w);
+#ifdef USE_FFT2D_THREADS
+  void xdft2d0_subth(
+      int n1, int n2, int icr, int isgn, float** a, int* ip, float* w);
+  void cdft2d_subth(
+      int n1, int n2, int isgn, float** a, float* t, int* ip, float* w);
+#endif /* USE_FFT2D_THREADS */
+  int n, itnull, nthread, nt, i;
+
+  n = n1 << 1;
+  if (n < n2) {
+    n = n2;
+  }
+  if (n > (ip[0] << 2)) {
+    makewt(n >> 2, ip, w);
+  }
+  itnull = 0;
+  if (t == NULL) {
+    itnull = 1;
+    nthread = 1;
+#ifdef USE_FFT2D_THREADS
+    nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+    nt = 8 * nthread * n1;
+    if (n2 == 4 * nthread) {
+      nt >>= 1;
+    } else if (n2 < 4 * nthread) {
+      nt >>= 2;
+    }
+    t = (float*) malloc(sizeof(float) * nt);
+    fft2d_alloc_error_check(t);
+  }
+#ifdef USE_FFT2D_THREADS
+  if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) {
+    xdft2d0_subth(n1, n2, 0, isgn, a, ip, w);
+    cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+  } else
+#endif /* USE_FFT2D_THREADS */
+  {
+    for (i = 0; i < n1; i++) {
+      cdft(n2, isgn, a[i], ip, w);
+    }
+    cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+  }
+  if (itnull != 0) {
+    free(t);
+  }
+}
+
+void rdft2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void rdft(int n, int isgn, float* a, int* ip, float* w);
+  void cdft2d_sub(
+      int n1, int n2, int isgn, float** a, float* t, int* ip, float* w);
+  void rdft2d_sub(int n1, int isgn, float** a);
+#ifdef USE_FFT2D_THREADS
+  void xdft2d0_subth(
+      int n1, int n2, int icr, int isgn, float** a, int* ip, float* w);
+  void cdft2d_subth(
+      int n1, int n2, int isgn, float** a, float* t, int* ip, float* w);
+#endif /* USE_FFT2D_THREADS */
+  int n, nw, nc, itnull, nthread, nt, i;
+
+  n = n1 << 1;
+  if (n < n2) {
+    n = n2;
+  }
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n2 > (nc << 2)) {
+    nc = n2 >> 2;
+    makect(nc, ip, w + nw);
+  }
+  itnull = 0;
+  if (t == NULL) {
+    itnull = 1;
+    nthread = 1;
+#ifdef USE_FFT2D_THREADS
+    nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+    nt = 8 * nthread * n1;
+    if (n2 == 4 * nthread) {
+      nt >>= 1;
+    } else if (n2 < 4 * nthread) {
+      nt >>= 2;
+    }
+    t = (float*) malloc(sizeof(float) * nt);
+    fft2d_alloc_error_check(t);
+  }
+#ifdef USE_FFT2D_THREADS
+  if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) {
+    if (isgn < 0) {
+      rdft2d_sub(n1, isgn, a);
+      cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+    }
+    xdft2d0_subth(n1, n2, 1, isgn, a, ip, w);
+    if (isgn >= 0) {
+      cdft2d_subth(n1, n2, isgn, a, t, ip, w);
+      rdft2d_sub(n1, isgn, a);
+    }
+  } else
+#endif /* USE_FFT2D_THREADS */
+  {
+    if (isgn < 0) {
+      rdft2d_sub(n1, isgn, a);
+      cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+    }
+    for (i = 0; i < n1; i++) {
+      rdft(n2, isgn, a[i], ip, w);
+    }
+    if (isgn >= 0) {
+      cdft2d_sub(n1, n2, isgn, a, t, ip, w);
+      rdft2d_sub(n1, isgn, a);
+    }
+  }
+  if (itnull != 0) {
+    free(t);
+  }
+}
+
+void rdft2dsort(int n1, int n2, int isgn, float** a)
+{
+  int n1h, i;
+  float x, y;
+
+  n1h = n1 >> 1;
+  if (isgn < 0) {
+    for (i = n1h + 1; i < n1; i++) {
+      a[i][0] = a[i][n2 + 1];
+      a[i][1] = a[i][n2];
+    }
+    a[0][1] = a[0][n2];
+    a[n1h][1] = a[n1h][n2];
+  } else {
+    for (i = n1h + 1; i < n1; i++) {
+      y = a[i][0];
+      x = a[i][1];
+      a[i][n2] = x;
+      a[i][n2 + 1] = y;
+      a[n1 - i][n2] = x;
+      a[n1 - i][n2 + 1] = -y;
+      a[i][0] = a[n1 - i][0];
+      a[i][1] = -a[n1 - i][1];
+    }
+    a[0][n2] = a[0][1];
+    a[0][n2 + 1] = 0;
+    a[0][1] = 0;
+    a[n1h][n2] = a[n1h][1];
+    a[n1h][n2 + 1] = 0;
+    a[n1h][1] = 0;
+  }
+}
+
+void ddcst2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void ddct(int n, int isgn, float* a, int* ip, float* w);
+  void ddst(int n, int isgn, float* a, int* ip, float* w);
+  void ddxt2d_sub(int n1,
+                  int n2,
+                  int ics,
+                  int isgn,
+                  float** a,
+                  float* t,
+                  int* ip,
+                  float* w);
+#ifdef USE_FFT2D_THREADS
+  void ddxt2d0_subth(
+      int n1, int n2, int ics, int isgn, float** a, int* ip, float* w);
+  void ddxt2d_subth(int n1,
+                    int n2,
+                    int ics,
+                    int isgn,
+                    float** a,
+                    float* t,
+                    int* ip,
+                    float* w);
+#endif /* USE_FFT2D_THREADS */
+  int n, nw, nc, itnull, nthread, nt, i;
+
+  n = n1;
+  if (n < n2) {
+    n = n2;
+  }
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > nc) {
+    nc = n;
+    makect(nc, ip, w + nw);
+  }
+  itnull = 0;
+  if (t == NULL) {
+    itnull = 1;
+    nthread = 1;
+#ifdef USE_FFT2D_THREADS
+    nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+    nt = 4 * nthread * n1;
+    if (n2 == 2 * nthread) {
+      nt >>= 1;
+    } else if (n2 < 2 * nthread) {
+      nt >>= 2;
+    }
+    t = (float*) malloc(sizeof(float) * nt);
+    fft2d_alloc_error_check(t);
+  }
+#ifdef USE_FFT2D_THREADS
+  if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) {
+    ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w);
+    ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w);
+  } else
+#endif /* USE_FFT2D_THREADS */
+  {
+    for (i = 0; i < n1; i++) {
+      ddst(n2, isgn, a[i], ip, w);
+    }
+    ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w);
+  }
+  if (itnull != 0) {
+    free(t);
+  }
+}
+
+void ddsct2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void ddct(int n, int isgn, float* a, int* ip, float* w);
+  void ddst(int n, int isgn, float* a, int* ip, float* w);
+  void ddxt2d_sub(int n1,
+                  int n2,
+                  int ics,
+                  int isgn,
+                  float** a,
+                  float* t,
+                  int* ip,
+                  float* w);
+#ifdef USE_FFT2D_THREADS
+  void ddxt2d0_subth(
+      int n1, int n2, int ics, int isgn, float** a, int* ip, float* w);
+  void ddxt2d_subth(int n1,
+                    int n2,
+                    int ics,
+                    int isgn,
+                    float** a,
+                    float* t,
+                    int* ip,
+                    float* w);
+#endif /* USE_FFT2D_THREADS */
+  int n, nw, nc, itnull, nthread, nt, i;
+
+  n = n1;
+  if (n < n2) {
+    n = n2;
+  }
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > nc) {
+    nc = n;
+    makect(nc, ip, w + nw);
+  }
+  itnull = 0;
+  if (t == NULL) {
+    itnull = 1;
+    nthread = 1;
+#ifdef USE_FFT2D_THREADS
+    nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+    nt = 4 * nthread * n1;
+    if (n2 == 2 * nthread) {
+      nt >>= 1;
+    } else if (n2 < 2 * nthread) {
+      nt >>= 2;
+    }
+    t = (float*) malloc(sizeof(float) * nt);
+    fft2d_alloc_error_check(t);
+  }
+#ifdef USE_FFT2D_THREADS
+  if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) {
+    ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w);
+    ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w);
+  } else
+#endif /* USE_FFT2D_THREADS */
+  {
+    for (i = 0; i < n1; i++) {
+      ddct(n2, isgn, a[i], ip, w);
+    }
+    ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w);
+  }
+  if (itnull != 0) {
+    free(t);
+  }
+}
+
+void ddct2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void ddct(int n, int isgn, float* a, int* ip, float* w);
+  void ddxt2d_sub(int n1,
+                  int n2,
+                  int ics,
+                  int isgn,
+                  float** a,
+                  float* t,
+                  int* ip,
+                  float* w);
+#ifdef USE_FFT2D_THREADS
+  void ddxt2d0_subth(
+      int n1, int n2, int ics, int isgn, float** a, int* ip, float* w);
+  void ddxt2d_subth(int n1,
+                    int n2,
+                    int ics,
+                    int isgn,
+                    float** a,
+                    float* t,
+                    int* ip,
+                    float* w);
+#endif /* USE_FFT2D_THREADS */
+  int n, nw, nc, itnull, nthread, nt, i;
+
+  n = n1;
+  if (n < n2) {
+    n = n2;
+  }
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > nc) {
+    nc = n;
+    makect(nc, ip, w + nw);
+  }
+  itnull = 0;
+  if (t == NULL) {
+    itnull = 1;
+    nthread = 1;
+#ifdef USE_FFT2D_THREADS
+    nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+    nt = 4 * nthread * n1;
+    if (n2 == 2 * nthread) {
+      nt >>= 1;
+    } else if (n2 < 2 * nthread) {
+      nt >>= 2;
+    }
+    t = (float*) malloc(sizeof(float) * nt);
+    fft2d_alloc_error_check(t);
+  }
+#ifdef USE_FFT2D_THREADS
+  if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) {
+    ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w);
+    ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w);
+  } else
+#endif /* USE_FFT2D_THREADS */
+  {
+    for (i = 0; i < n1; i++) {
+      ddct(n2, isgn, a[i], ip, w);
+    }
+    ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w);
+  }
+  if (itnull != 0) {
+    free(t);
+  }
+}
+
+void ddst2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w)
+{
+  void makewt(int nw, int* ip, float* w);
+  void makect(int nc, int* ip, float* c);
+  void ddst(int n, int isgn, float* a, int* ip, float* w);
+  void ddxt2d_sub(int n1,
+                  int n2,
+                  int ics,
+                  int isgn,
+                  float** a,
+                  float* t,
+                  int* ip,
+                  float* w);
+#ifdef USE_FFT2D_THREADS
+  void ddxt2d0_subth(
+      int n1, int n2, int ics, int isgn, float** a, int* ip, float* w);
+  void ddxt2d_subth(int n1,
+                    int n2,
+                    int ics,
+                    int isgn,
+                    float** a,
+                    float* t,
+                    int* ip,
+                    float* w);
+#endif /* USE_FFT2D_THREADS */
+  int n, nw, nc, itnull, nthread, nt, i;
+
+  n = n1;
+  if (n < n2) {
+    n = n2;
+  }
+  nw = ip[0];
+  if (n > (nw << 2)) {
+    nw = n >> 2;
+    makewt(nw, ip, w);
+  }
+  nc = ip[1];
+  if (n > nc) {
+    nc = n;
+    makect(nc, ip, w + nw);
+  }
+  itnull = 0;
+  if (t == NULL) {
+    itnull = 1;
+    nthread = 1;
+#ifdef USE_FFT2D_THREADS
+    nthread = FFT2D_MAX_THREADS;
+#endif /* USE_FFT2D_THREADS */
+    nt = 4 * nthread * n1;
+    if (n2 == 2 * nthread) {
+      nt >>= 1;
+    } else if (n2 < 2 * nthread) {
+      nt >>= 2;
+    }
+    t = (float*) malloc(sizeof(float) * nt);
+    fft2d_alloc_error_check(t);
+  }
+#ifdef USE_FFT2D_THREADS
+  if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) {
+    ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w);
+    ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w);
+  } else
+#endif /* USE_FFT2D_THREADS */
+  {
+    for (i = 0; i < n1; i++) {
+      ddst(n2, isgn, a[i], ip, w);
+    }
+    ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w);
+  }
+  if (itnull != 0) {
+    free(t);
+  }
+}
+
+/* -------- child routines -------- */
+
+void cdft2d_sub(int n1,
+                int n2,
+                int isgn,
+                float** a,
+                float* t,
+                int* ip,
+                float* w)
+{
+  void cdft(int n, int isgn, float* a, int* ip, float* w);
+  int i, j;
+
+  if (n2 > 4) {
+    for (j = 0; j < n2; j += 8) {
+      for (i = 0; i < n1; i++) {
+        t[2 * i] = a[i][j];
+        t[2 * i + 1] = a[i][j + 1];
+        t[2 * n1 + 2 * i] = a[i][j + 2];
+        t[2 * n1 + 2 * i + 1] = a[i][j + 3];
+        t[4 * n1 + 2 * i] = a[i][j + 4];
+        t[4 * n1 + 2 * i + 1] = a[i][j + 5];
+        t[6 * n1 + 2 * i] = a[i][j + 6];
+        t[6 * n1 + 2 * i + 1] = a[i][j + 7];
+      }
+      cdft(2 * n1, isgn, t, ip, w);
+      cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+      cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+      cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+      for (i = 0; i < n1; i++) {
+        a[i][j] = t[2 * i];
+        a[i][j + 1] = t[2 * i + 1];
+        a[i][j + 2] = t[2 * n1 + 2 * i];
+        a[i][j + 3] = t[2 * n1 + 2 * i + 1];
+        a[i][j + 4] = t[4 * n1 + 2 * i];
+        a[i][j + 5] = t[4 * n1 + 2 * i + 1];
+        a[i][j + 6] = t[6 * n1 + 2 * i];
+        a[i][j + 7] = t[6 * n1 + 2 * i + 1];
+      }
+    }
+  } else if (n2 == 4) {
+    for (i = 0; i < n1; i++) {
+      t[2 * i] = a[i][0];
+      t[2 * i + 1] = a[i][1];
+      t[2 * n1 + 2 * i] = a[i][2];
+      t[2 * n1 + 2 * i + 1] = a[i][3];
+    }
+    cdft(2 * n1, isgn, t, ip, w);
+    cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+    for (i = 0; i < n1; i++) {
+      a[i][0] = t[2 * i];
+      a[i][1] = t[2 * i + 1];
+      a[i][2] = t[2 * n1 + 2 * i];
+      a[i][3] = t[2 * n1 + 2 * i + 1];
+    }
+  } else if (n2 == 2) {
+    for (i = 0; i < n1; i++) {
+      t[2 * i] = a[i][0];
+      t[2 * i + 1] = a[i][1];
+    }
+    cdft(2 * n1, isgn, t, ip, w);
+    for (i = 0; i < n1; i++) {
+      a[i][0] = t[2 * i];
+      a[i][1] = t[2 * i + 1];
+    }
+  }
+}
+
+void rdft2d_sub(int n1, int isgn, float** a)
+{
+  int n1h, i, j;
+  float xi;
+
+  n1h = n1 >> 1;
+  if (isgn < 0) {
+    for (i = 1; i < n1h; i++) {
+      j = n1 - i;
+      xi = a[i][0] - a[j][0];
+      a[i][0] += a[j][0];
+      a[j][0] = xi;
+      xi = a[j][1] - a[i][1];
+      a[i][1] += a[j][1];
+      a[j][1] = xi;
+    }
+  } else {
+    for (i = 1; i < n1h; i++) {
+      j = n1 - i;
+      a[j][0] = 0.5 * (a[i][0] - a[j][0]);
+      a[i][0] -= a[j][0];
+      a[j][1] = 0.5 * (a[i][1] + a[j][1]);
+      a[i][1] -= a[j][1];
+    }
+  }
+}
+
+void ddxt2d_sub(int n1,
+                int n2,
+                int ics,
+                int isgn,
+                float** a,
+                float* t,
+                int* ip,
+                float* w)
+{
+  void ddct(int n, int isgn, float* a, int* ip, float* w);
+  void ddst(int n, int isgn, float* a, int* ip, float* w);
+  int i, j;
+
+  if (n2 > 2) {
+    for (j = 0; j < n2; j += 4) {
+      for (i = 0; i < n1; i++) {
+        t[i] = a[i][j];
+        t[n1 + i] = a[i][j + 1];
+        t[2 * n1 + i] = a[i][j + 2];
+        t[3 * n1 + i] = a[i][j + 3];
+      }
+      if (ics == 0) {
+        ddct(n1, isgn, t, ip, w);
+        ddct(n1, isgn, &t[n1], ip, w);
+        ddct(n1, isgn, &t[2 * n1], ip, w);
+        ddct(n1, isgn, &t[3 * n1], ip, w);
+      } else {
+        ddst(n1, isgn, t, ip, w);
+        ddst(n1, isgn, &t[n1], ip, w);
+        ddst(n1, isgn, &t[2 * n1], ip, w);
+        ddst(n1, isgn, &t[3 * n1], ip, w);
+      }
+      for (i = 0; i < n1; i++) {
+        a[i][j] = t[i];
+        a[i][j + 1] = t[n1 + i];
+        a[i][j + 2] = t[2 * n1 + i];
+        a[i][j + 3] = t[3 * n1 + i];
+      }
+    }
+  } else if (n2 == 2) {
+    for (i = 0; i < n1; i++) {
+      t[i] = a[i][0];
+      t[n1 + i] = a[i][1];
+    }
+    if (ics == 0) {
+      ddct(n1, isgn, t, ip, w);
+      ddct(n1, isgn, &t[n1], ip, w);
+    } else {
+      ddst(n1, isgn, t, ip, w);
+      ddst(n1, isgn, &t[n1], ip, w);
+    }
+    for (i = 0; i < n1; i++) {
+      a[i][0] = t[i];
+      a[i][1] = t[n1 + i];
+    }
+  }
+}
+
+#ifdef USE_FFT2D_THREADS
+struct fft2d_arg_st
+{
+  int nthread;
+  int n0;
+  int n1;
+  int n2;
+  int ic;
+  int isgn;
+  float** a;
+  float* t;
+  int* ip;
+  float* w;
+};
+typedef struct fft2d_arg_st fft2d_arg_t;
+
+void xdft2d0_subth(int n1,
+                   int n2,
+                   int icr,
+                   int isgn,
+                   float** a,
+                   int* ip,
+                   float* w)
+{
+  void* xdft2d0_th(void* p);
+  fft2d_thread_t th[FFT2D_MAX_THREADS];
+  fft2d_arg_t ag[FFT2D_MAX_THREADS];
+  int nthread, i;
+
+  nthread = FFT2D_MAX_THREADS;
+  if (nthread > n1) {
+    nthread = n1;
+  }
+  for (i = 0; i < nthread; i++) {
+    ag[i].nthread = nthread;
+    ag[i].n0 = i;
+    ag[i].n1 = n1;
+    ag[i].n2 = n2;
+    ag[i].ic = icr;
+    ag[i].isgn = isgn;
+    ag[i].a = a;
+    ag[i].ip = ip;
+    ag[i].w = w;
+    fft2d_thread_create(&th[i], xdft2d0_th, &ag[i]);
+  }
+  for (i = 0; i < nthread; i++) {
+    fft2d_thread_wait(th[i]);
+  }
+}
+
+void cdft2d_subth(int n1,
+                  int n2,
+                  int isgn,
+                  float** a,
+                  float* t,
+                  int* ip,
+                  float* w)
+{
+  void* cdft2d_th(void* p);
+  fft2d_thread_t th[FFT2D_MAX_THREADS];
+  fft2d_arg_t ag[FFT2D_MAX_THREADS];
+  int nthread, nt, i;
+
+  nthread = FFT2D_MAX_THREADS;
+  nt = 8 * n1;
+  if (n2 == 4 * FFT2D_MAX_THREADS) {
+    nt >>= 1;
+  } else if (n2 < 4 * FFT2D_MAX_THREADS) {
+    nthread = n2 >> 1;
+    nt >>= 2;
+  }
+  for (i = 0; i < nthread; i++) {
+    ag[i].nthread = nthread;
+    ag[i].n0 = i;
+    ag[i].n1 = n1;
+    ag[i].n2 = n2;
+    ag[i].isgn = isgn;
+    ag[i].a = a;
+    ag[i].t = &t[nt * i];
+    ag[i].ip = ip;
+    ag[i].w = w;
+    fft2d_thread_create(&th[i], cdft2d_th, &ag[i]);
+  }
+  for (i = 0; i < nthread; i++) {
+    fft2d_thread_wait(th[i]);
+  }
+}
+
+void ddxt2d0_subth(int n1,
+                   int n2,
+                   int ics,
+                   int isgn,
+                   float** a,
+                   int* ip,
+                   float* w)
+{
+  void* ddxt2d0_th(void* p);
+  fft2d_thread_t th[FFT2D_MAX_THREADS];
+  fft2d_arg_t ag[FFT2D_MAX_THREADS];
+  int nthread, i;
+
+  nthread = FFT2D_MAX_THREADS;
+  if (nthread > n1) {
+    nthread = n1;
+  }
+  for (i = 0; i < nthread; i++) {
+    ag[i].nthread = nthread;
+    ag[i].n0 = i;
+    ag[i].n1 = n1;
+    ag[i].n2 = n2;
+    ag[i].ic = ics;
+    ag[i].isgn = isgn;
+    ag[i].a = a;
+    ag[i].ip = ip;
+    ag[i].w = w;
+    fft2d_thread_create(&th[i], ddxt2d0_th, &ag[i]);
+  }
+  for (i = 0; i < nthread; i++) {
+    fft2d_thread_wait(th[i]);
+  }
+}
+
+void ddxt2d_subth(int n1,
+                  int n2,
+                  int ics,
+                  int isgn,
+                  float** a,
+                  float* t,
+                  int* ip,
+                  float* w)
+{
+  void* ddxt2d_th(void* p);
+  fft2d_thread_t th[FFT2D_MAX_THREADS];
+  fft2d_arg_t ag[FFT2D_MAX_THREADS];
+  int nthread, nt, i;
+
+  nthread = FFT2D_MAX_THREADS;
+  nt = 4 * n1;
+  if (n2 == 2 * FFT2D_MAX_THREADS) {
+    nt >>= 1;
+  } else if (n2 < 2 * FFT2D_MAX_THREADS) {
+    nthread = n2;
+    nt >>= 2;
+  }
+  for (i = 0; i < nthread; i++) {
+    ag[i].nthread = nthread;
+    ag[i].n0 = i;
+    ag[i].n1 = n1;
+    ag[i].n2 = n2;
+    ag[i].ic = ics;
+    ag[i].isgn = isgn;
+    ag[i].a = a;
+    ag[i].t = &t[nt * i];
+    ag[i].ip = ip;
+    ag[i].w = w;
+    fft2d_thread_create(&th[i], ddxt2d_th, &ag[i]);
+  }
+  for (i = 0; i < nthread; i++) {
+    fft2d_thread_wait(th[i]);
+  }
+}
+
+void* xdft2d0_th(void* p)
+{
+  void cdft(int n, int isgn, float* a, int* ip, float* w);
+  void rdft(int n, int isgn, float* a, int* ip, float* w);
+  int nthread, n0, n1, n2, icr, isgn, *ip, i;
+  float **a, *w;
+
+  nthread = ((fft2d_arg_t*) p)->nthread;
+  n0 = ((fft2d_arg_t*) p)->n0;
+  n1 = ((fft2d_arg_t*) p)->n1;
+  n2 = ((fft2d_arg_t*) p)->n2;
+  icr = ((fft2d_arg_t*) p)->ic;
+  isgn = ((fft2d_arg_t*) p)->isgn;
+  a = ((fft2d_arg_t*) p)->a;
+  ip = ((fft2d_arg_t*) p)->ip;
+  w = ((fft2d_arg_t*) p)->w;
+  if (icr == 0) {
+    for (i = n0; i < n1; i += nthread) {
+      cdft(n2, isgn, a[i], ip, w);
+    }
+  } else {
+    for (i = n0; i < n1; i += nthread) {
+      rdft(n2, isgn, a[i], ip, w);
+    }
+  }
+  return (void*) 0;
+}
+
+void* cdft2d_th(void* p)
+{
+  void cdft(int n, int isgn, float* a, int* ip, float* w);
+  int nthread, n0, n1, n2, isgn, *ip, i, j;
+  float **a, *t, *w;
+
+  nthread = ((fft2d_arg_t*) p)->nthread;
+  n0 = ((fft2d_arg_t*) p)->n0;
+  n1 = ((fft2d_arg_t*) p)->n1;
+  n2 = ((fft2d_arg_t*) p)->n2;
+  isgn = ((fft2d_arg_t*) p)->isgn;
+  a = ((fft2d_arg_t*) p)->a;
+  t = ((fft2d_arg_t*) p)->t;
+  ip = ((fft2d_arg_t*) p)->ip;
+  w = ((fft2d_arg_t*) p)->w;
+  if (n2 > 4 * nthread) {
+    for (j = 8 * n0; j < n2; j += 8 * nthread) {
+      for (i = 0; i < n1; i++) {
+        t[2 * i] = a[i][j];
+        t[2 * i + 1] = a[i][j + 1];
+        t[2 * n1 + 2 * i] = a[i][j + 2];
+        t[2 * n1 + 2 * i + 1] = a[i][j + 3];
+        t[4 * n1 + 2 * i] = a[i][j + 4];
+        t[4 * n1 + 2 * i + 1] = a[i][j + 5];
+        t[6 * n1 + 2 * i] = a[i][j + 6];
+        t[6 * n1 + 2 * i + 1] = a[i][j + 7];
+      }
+      cdft(2 * n1, isgn, t, ip, w);
+      cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+      cdft(2 * n1, isgn, &t[4 * n1], ip, w);
+      cdft(2 * n1, isgn, &t[6 * n1], ip, w);
+      for (i = 0; i < n1; i++) {
+        a[i][j] = t[2 * i];
+        a[i][j + 1] = t[2 * i + 1];
+        a[i][j + 2] = t[2 * n1 + 2 * i];
+        a[i][j + 3] = t[2 * n1 + 2 * i + 1];
+        a[i][j + 4] = t[4 * n1 + 2 * i];
+        a[i][j + 5] = t[4 * n1 + 2 * i + 1];
+        a[i][j + 6] = t[6 * n1 + 2 * i];
+        a[i][j + 7] = t[6 * n1 + 2 * i + 1];
+      }
+    }
+  } else if (n2 == 4 * nthread) {
+    for (i = 0; i < n1; i++) {
+      t[2 * i] = a[i][4 * n0];
+      t[2 * i + 1] = a[i][4 * n0 + 1];
+      t[2 * n1 + 2 * i] = a[i][4 * n0 + 2];
+      t[2 * n1 + 2 * i + 1] = a[i][4 * n0 + 3];
+    }
+    cdft(2 * n1, isgn, t, ip, w);
+    cdft(2 * n1, isgn, &t[2 * n1], ip, w);
+    for (i = 0; i < n1; i++) {
+      a[i][4 * n0] = t[2 * i];
+      a[i][4 * n0 + 1] = t[2 * i + 1];
+      a[i][4 * n0 + 2] = t[2 * n1 + 2 * i];
+      a[i][4 * n0 + 3] = t[2 * n1 + 2 * i + 1];
+    }
+  } else if (n2 == 2 * nthread) {
+    for (i = 0; i < n1; i++) {
+      t[2 * i] = a[i][2 * n0];
+      t[2 * i + 1] = a[i][2 * n0 + 1];
+    }
+    cdft(2 * n1, isgn, t, ip, w);
+    for (i = 0; i < n1; i++) {
+      a[i][2 * n0] = t[2 * i];
+      a[i][2 * n0 + 1] = t[2 * i + 1];
+    }
+  }
+  return (void*) 0;
+}
+
+void* ddxt2d0_th(void* p)
+{
+  void ddct(int n, int isgn, float* a, int* ip, float* w);
+  void ddst(int n, int isgn, float* a, int* ip, float* w);
+  int nthread, n0, n1, n2, ics, isgn, *ip, i;
+  float **a, *w;
+
+  nthread = ((fft2d_arg_t*) p)->nthread;
+  n0 = ((fft2d_arg_t*) p)->n0;
+  n1 = ((fft2d_arg_t*) p)->n1;
+  n2 = ((fft2d_arg_t*) p)->n2;
+  ics = ((fft2d_arg_t*) p)->ic;
+  isgn = ((fft2d_arg_t*) p)->isgn;
+  a = ((fft2d_arg_t*) p)->a;
+  ip = ((fft2d_arg_t*) p)->ip;
+  w = ((fft2d_arg_t*) p)->w;
+  if (ics == 0) {
+    for (i = n0; i < n1; i += nthread) {
+      ddct(n2, isgn, a[i], ip, w);
+    }
+  } else {
+    for (i = n0; i < n1; i += nthread) {
+      ddst(n2, isgn, a[i], ip, w);
+    }
+  }
+  return (void*) 0;
+}
+
+void* ddxt2d_th(void* p)
+{
+  void ddct(int n, int isgn, float* a, int* ip, float* w);
+  void ddst(int n, int isgn, float* a, int* ip, float* w);
+  int nthread, n0, n1, n2, ics, isgn, *ip, i, j;
+  float **a, *t, *w;
+
+  nthread = ((fft2d_arg_t*) p)->nthread;
+  n0 = ((fft2d_arg_t*) p)->n0;
+  n1 = ((fft2d_arg_t*) p)->n1;
+  n2 = ((fft2d_arg_t*) p)->n2;
+  ics = ((fft2d_arg_t*) p)->ic;
+  isgn = ((fft2d_arg_t*) p)->isgn;
+  a = ((fft2d_arg_t*) p)->a;
+  t = ((fft2d_arg_t*) p)->t;
+  ip = ((fft2d_arg_t*) p)->ip;
+  w = ((fft2d_arg_t*) p)->w;
+  if (n2 > 2 * nthread) {
+    for (j = 4 * n0; j < n2; j += 4 * nthread) {
+      for (i = 0; i < n1; i++) {
+        t[i] = a[i][j];
+        t[n1 + i] = a[i][j + 1];
+        t[2 * n1 + i] = a[i][j + 2];
+        t[3 * n1 + i] = a[i][j + 3];
+      }
+      if (ics == 0) {
+        ddct(n1, isgn, t, ip, w);
+        ddct(n1, isgn, &t[n1], ip, w);
+        ddct(n1, isgn, &t[2 * n1], ip, w);
+        ddct(n1, isgn, &t[3 * n1], ip, w);
+      } else {
+        ddst(n1, isgn, t, ip, w);
+        ddst(n1, isgn, &t[n1], ip, w);
+        ddst(n1, isgn, &t[2 * n1], ip, w);
+        ddst(n1, isgn, &t[3 * n1], ip, w);
+      }
+      for (i = 0; i < n1; i++) {
+        a[i][j] = t[i];
+        a[i][j + 1] = t[n1 + i];
+        a[i][j + 2] = t[2 * n1 + i];
+        a[i][j + 3] = t[3 * n1 + i];
+      }
+    }
+  } else if (n2 == 2 * nthread) {
+    for (i = 0; i < n1; i++) {
+      t[i] = a[i][2 * n0];
+      t[n1 + i] = a[i][2 * n0 + 1];
+    }
+    if (ics == 0) {
+      ddct(n1, isgn, t, ip, w);
+      ddct(n1, isgn, &t[n1], ip, w);
+    } else {
+      ddst(n1, isgn, t, ip, w);
+      ddst(n1, isgn, &t[n1], ip, w);
+    }
+    for (i = 0; i < n1; i++) {
+      a[i][2 * n0] = t[i];
+      a[i][2 * n0 + 1] = t[n1 + i];
+    }
+  } else if (n2 == nthread) {
+    for (i = 0; i < n1; i++) {
+      t[i] = a[i][n0];
+    }
+    if (ics == 0) {
+      ddct(n1, isgn, t, ip, w);
+    } else {
+      ddst(n1, isgn, t, ip, w);
+    }
+    for (i = 0; i < n1; i++) {
+      a[i][n0] = t[i];
+    }
+  }
+  return (void*) 0;
+}
+#endif /* USE_FFT2D_THREADS */
+
+NEXTPNR_NAMESPACE_END
diff --git a/3rdparty/oourafft/readme.txt b/3rdparty/oourafft/readme.txt
new file mode 100644
index 00000000..145a081b
--- /dev/null
+++ b/3rdparty/oourafft/readme.txt
@@ -0,0 +1,167 @@
+General Purpose FFT (Fast Fourier/Cosine/Sine Transform) Package
+
+Description:
+    A package to calculate Discrete Fourier/Cosine/Sine Transforms of 
+    1-dimensional sequences of length 2^N.
+
+Files:
+    fft4g.c    : FFT Package in C       - Fast Version   I   (radix 4,2)
+    fft4g.f    : FFT Package in Fortran - Fast Version   I   (radix 4,2)
+    fft4g_h.c  : FFT Package in C       - Simple Version I   (radix 4,2)
+    fft8g.c    : FFT Package in C       - Fast Version   II  (radix 8,4,2)
+    fft8g.f    : FFT Package in Fortran - Fast Version   II  (radix 8,4,2)
+    fft8g_h.c  : FFT Package in C       - Simple Version II  (radix 8,4,2)
+    fftsg.c    : FFT Package in C       - Fast Version   III (Split-Radix)
+    fftsg.f    : FFT Package in Fortran - Fast Version   III (Split-Radix)
+    fftsg_h.c  : FFT Package in C       - Simple Version III (Split-Radix)
+    readme.txt : Readme File
+    sample1/   : Test Directory
+        Makefile    : for gcc, cc
+        Makefile.f77: for Fortran
+        testxg.c    : Test Program for "fft*g.c"
+        testxg.f    : Test Program for "fft*g.f"
+        testxg_h.c  : Test Program for "fft*g_h.c"
+    sample2/   : Benchmark Directory
+        Makefile    : for gcc, cc
+        Makefile.pth: POSIX Thread version
+        pi_fft.c    : PI(= 3.1415926535897932384626...) Calculation Program
+                      for a Benchmark Test for "fft*g.c"
+
+Difference of the Files:
+    C and Fortran versions are equal and 
+    the same routines are in each version.
+    "fft4g*.*" are optimized for most machines.
+    "fft8g*.*" are fast on the UltraSPARC.
+    "fftsg*.*" are optimized for the machines that 
+    have the multi-level (L1,L2,etc) cache.
+    The simple versions "fft*g_h.c" use no work area, but 
+    the fast versions "fft*g.*" use work areas.
+    The fast versions "fft*g.*" have the same specification.
+
+Routines in the Package:
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+
+Usage:
+    Please refer to the comments in the "fft**.*" file which 
+    you want to use. Brief explanations are in the block 
+    comments of each package. The examples are also given in 
+    the test programs.
+
+Method:
+    -------- cdft --------
+    fft4g*.*, fft8g*.*:
+        A method of in-place, radix 2^M, Sande-Tukey (decimation in 
+        frequency). Index of the butterfly loop is in bit 
+        reverse order to keep continuous memory access.
+    fftsg*.*:
+        A method of in-place, Split-Radix, recursive fast 
+        algorithm.
+    -------- rdft --------
+    A method with a following butterfly operation appended to "cdft".
+    In forward transform :
+        A[k] = sum_j=0^n-1 a[j]*W(n)^(j*k), 0<=k<=n/2, 
+            W(n) = exp(2*pi*i/n), 
+    this routine makes an array x[] :
+        x[j] = a[2*j] + i*a[2*j+1], 0<=j<n/2
+    and calls "cdft" of length n/2 :
+        X[k] = sum_j=0^n/2-1 x[j] * W(n/2)^(j*k), 0<=k<n.
+    The result A[k] are :
+        A[k]     = X[k]     - (1+i*W(n)^k)/2 * (X[k]-conjg(X[n/2-k])), 
+        A[n/2-k] = X[n/2-k] + 
+                        conjg((1+i*W(n)^k)/2 * (X[k]-conjg(X[n/2-k]))), 
+            0<=k<=n/2
+        (notes: conjg() is a complex conjugate, X[n/2]=X[0]).
+    -------- ddct --------
+    A method with a following butterfly operation appended to "rdft".
+    In backward transform :
+        C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n, 
+    this routine makes an array r[] :
+        r[0] = a[0], 
+        r[j]   = Re((a[j] - i*a[n-j]) * W(4*n)^j*(1+i)/2), 
+        r[n-j] = Im((a[j] - i*a[n-j]) * W(4*n)^j*(1+i)/2), 
+            0<j<=n/2
+    and calls "rdft" of length n :
+        A[k] = sum_j=0^n-1 r[j]*W(n)^(j*k), 0<=k<=n/2, 
+            W(n) = exp(2*pi*i/n).
+    The result C[k] are :
+        C[2*k]   =  Re(A[k] * (1-i)), 
+        C[2*k-1] = -Im(A[k] * (1-i)).
+    -------- ddst --------
+    A method with a following butterfly operation appended to "rdft".
+    In backward transform :
+        S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n, 
+    this routine makes an array r[] :
+        r[0] = a[0], 
+        r[j]   = Im((a[n-j] - i*a[j]) * W(4*n)^j*(1+i)/2), 
+        r[n-j] = Re((a[n-j] - i*a[j]) * W(4*n)^j*(1+i)/2), 
+            0<j<=n/2
+    and calls "rdft" of length n :
+        A[k] = sum_j=0^n-1 r[j]*W(n)^(j*k), 0<=k<=n/2, 
+            W(n) = exp(2*pi*i/n).
+    The result S[k] are :
+        S[2*k]   =  Re(A[k] * (1+i)), 
+        S[2*k-1] = -Im(A[k] * (1+i)).
+    -------- dfct --------
+    A method to split into "dfct" and "ddct" of half length.
+    The transform :
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    is divided into :
+        C[2*k]   = sum'_j=0^n/2  (a[j]+a[n-j])*cos(pi*j*k/(n/2)), 
+        C[2*k+1] = sum_j=0^n/2-1 (a[j]-a[n-j])*cos(pi*j*(k+1/2)/(n/2))
+        (sum' is a summation whose last term multiplies 1/2).
+    This routine uses "ddct" recursively.
+    To keep the in-place operation, the data in fft*g_h.*
+    are sorted in bit reversal order.
+    -------- dfst --------
+    A method to split into "dfst" and "ddst" of half length.
+    The transform :
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    is divided into :
+        S[2*k]   = sum_j=1^n/2-1 (a[j]-a[n-j])*sin(pi*j*k/(n/2)), 
+        S[2*k+1] = sum'_j=1^n/2  (a[j]+a[n-j])*sin(pi*j*(k+1/2)/(n/2))
+        (sum' is a summation whose last term multiplies 1/2).
+    This routine uses "ddst" recursively.
+    To keep the in-place operation, the data in fft*g_h.*
+    are sorted in bit reversal order.
+
+Reference:
+    * Masatake MORI, Makoto NATORI, Tatuo TORII: Suchikeisan, 
+      Iwanamikouzajyouhoukagaku18, Iwanami, 1982 (Japanese)
+    * Henri J. Nussbaumer: Fast Fourier Transform and Convolution 
+      Algorithms, Springer Verlag, 1982
+    * C. S. Burrus, Notes on the FFT (with large FFT paper list)
+      http://www-dsp.rice.edu/research/fft/fftnote.asc
+
+Copyright:
+    Copyright(C) 1996-2001 Takuya OOURA
+    email: ooura@mmm.t.u-tokyo.ac.jp
+    download: http://momonga.t.u-tokyo.ac.jp/~ooura/fft.html
+    You may use, copy, modify this code for any purpose and 
+    without fee. You may distribute this ORIGINAL package.
+
+History:
+    ...
+    Dec. 1995  : Edit the General Purpose FFT
+    Mar. 1996  : Change the specification
+    Jun. 1996  : Change the method of trigonometric function table
+    Sep. 1996  : Modify the documents
+    Feb. 1997  : Change the butterfly loops
+    Dec. 1997  : Modify the documents
+    Dec. 1997  : Add "fft4g.*"
+    Jul. 1998  : Fix some bugs in the documents
+    Jul. 1998  : Add "fft8g.*" and delete "fft4f.*"
+    Jul. 1998  : Add a benchmark program "pi_fft.c"
+    Jul. 1999  : Add a simple version "fft*g_h.c"
+    Jul. 1999  : Add a Split-Radix FFT package "fftsg*.c"
+    Sep. 1999  : Reduce the memory operation (minor optimization)
+    Oct. 1999  : Change the butterfly structure of "fftsg*.c"
+    Oct. 1999  : Save the code size
+    Sep. 2001  : Add "fftsg.f"
+    Sep. 2001  : Add Pthread & Win32thread routines to "fftsg*.c"
+    Dec. 2006  : Fix a minor bug in "fftsg.f"
+
diff --git a/3rdparty/oourafft/readme2d.txt b/3rdparty/oourafft/readme2d.txt
new file mode 100644
index 00000000..7cc4dfa0
--- /dev/null
+++ b/3rdparty/oourafft/readme2d.txt
@@ -0,0 +1,77 @@
+General Purpose 2D,3D FFT (Fast Fourier Transform) Package
+
+Files
+    alloc.c    : 2D-array Allocation
+    alloc.h    : 2D-array Allocation
+    fft4f2d.c  : 2D FFT Package in C       - Version I (radix 4, 2)
+    fft4f2d.f  : 2D FFT Package in Fortran - Version I (radix 4, 2)
+    fftsg.c    : 1D FFT Package in C       - Fast Version (Split-Radix)
+    fftsg.f    : 1D FFT Package in Fortran - Fast Version (Split-Radix)
+    fftsg2d.c  : 2D FFT Package in C       - Version II (Split-Radix)
+    fftsg2d.f  : 2D FFT Package in Fortran - Version II (Split-Radix)
+    fftsg3d.c  : 3D FFT Package in C       - Version II (Split-Radix)
+    fftsg3d.f  : 3D FFT Package in Fortran - Version II (Split-Radix)
+    shrtdct.c  : 8x8, 16x16 DCT Package
+    sample2d/
+        Makefile    : for gcc, cc
+        Makefile.f77: for Fortran
+        Makefile.pth: Pthread version
+        fft4f2dt.c  : Test Program for "fft4f2d.c"
+        fft4f2dt.f  : Test Program for "fft4f2d.f"
+        fftsg2dt.c  : Test Program for "fftsg2d.c"
+        fftsg2dt.f  : Test Program for "fftsg2d.f"
+        fftsg3dt.c  : Test Program for "fftsg3d.c"
+        fftsg3dt.f  : Test Program for "fftsg3d.f"
+        shrtdctt.c  : Test Program for "shrtdct.c"
+
+Difference of Files
+    C and Fortran versions are equal and 
+    the same routines are in each version.
+    ---- Difference between "fft4f2d.*" and "fftsg2d.*" ----
+    "fft4f2d.*" are optimized for the old machines that 
+    don't have the large size CPU cache.
+    "fftsg2d.*", "fftsg3d.*" use 1D FFT routines in "fftsg.*".
+    "fftsg2d.*", "fftsg3d.*" are optimized for the machines that 
+    have the multi-level (L1,L2,etc) cache.
+
+Routines in the Package
+    in fft4f2d.*, fftsg2d.*
+        cdft2d: 2-dim Complex Discrete Fourier Transform
+        rdft2d: 2-dim Real Discrete Fourier Transform
+        ddct2d: 2-dim Discrete Cosine Transform
+        ddst2d: 2-dim Discrete Sine Transform
+        rdft2dsort: rdft2d input/output ordering (fftsg2d.*)
+    in fftsg3d.*
+        cdft3d: 3-dim Complex Discrete Fourier Transform
+        rdft3d: 3-dim Real Discrete Fourier Transform
+        ddct3d: 3-dim Discrete Cosine Transform
+        ddst3d: 3-dim Discrete Sine Transform
+        rdft3dsort: rdft3d input/output ordering
+    in fftsg.*
+        cdft: 1-dim Complex Discrete Fourier Transform
+        rdft: 1-dim Real Discrete Fourier Transform
+        ddct: 1-dim Discrete Cosine Transform
+        ddst: 1-dim Discrete Sine Transform
+        dfct: 1-dim Real Symmetric DFT
+        dfst: 1-dim Real Anti-symmetric DFT
+        (these routines are called by fftsg2d.*, fftsg3d.*)
+    in shrtdct.c
+        ddct8x8s  : Normalized 8x8 DCT
+        ddct16x16s: Normalized 16x16 DCT
+        (faster than ddct2d())
+
+Usage
+    Brief explanations are in block comments of each packages.
+    The examples are given in the test programs.
+
+Copyright
+    Copyright(C) 1997,2001 Takuya OOURA (email: ooura@kurims.kyoto-u.ac.jp).
+    You may use, copy, modify this code for any purpose and 
+    without fee. You may distribute this ORIGINAL package.
+
+History
+    ...
+    Nov. 2001  : Add 3D-FFT routines
+    Dec. 2006  : Fix a documentation bug in "fftsg3d.*"
+    Dec. 2006  : Fix a minor bug in "fftsg.f"
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 09eb8187..adef332f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -235,7 +235,7 @@ if (NOT DEFINED PYBIND11_INCLUDE_DIR)
     set(PYBIND11_INCLUDE_DIR "3rdparty/pybind11/include")
 endif()
 
-include_directories(common/kernel/ common/place/ common/route/ json/ frontend/ 3rdparty/json11/ ${PYBIND11_INCLUDE_DIR} ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS})
+include_directories(common/kernel/ common/place/ common/route/ json/ frontend/ 3rdparty/json11/  3rdparty/oourafft ${PYBIND11_INCLUDE_DIR} ${Boost_INCLUDE_DIRS} ${Python3_INCLUDE_DIRS})
 
 find_package (Eigen3 REQUIRED NO_MODULE)
 link_libraries(Eigen3::Eigen)
@@ -246,9 +246,10 @@ aux_source_directory(common/route/ ROUTE_SRC_FILES)
 
 aux_source_directory(json/ JSON_PARSER_FILES)
 aux_source_directory(3rdparty/json11 EXT_JSON11_FILES)
+aux_source_directory(3rdparty/oourafft EXT_OOURAFFT_FILES)
 aux_source_directory(frontend/ FRONTEND_FILES)
 
-set(COMMON_FILES ${KERNEL_SRC_FILES} ${PLACE_SRC_FILES} ${ROUTE_SRC_FILES} ${EXT_JSON11_FILES} ${JSON_PARSER_FILES} ${FRONTEND_FILES})
+set(COMMON_FILES ${KERNEL_SRC_FILES} ${PLACE_SRC_FILES} ${ROUTE_SRC_FILES} ${EXT_JSON11_FILES} ${EXT_OOURAFFT_FILES} ${JSON_PARSER_FILES} ${FRONTEND_FILES})
 if( NOT CMAKE_BUILD_TYPE )
     set(CMAKE_BUILD_TYPE Release)
 endif()
diff --git a/common/kernel/array2d.h b/common/kernel/array2d.h
new file mode 100644
index 00000000..bb1a150d
--- /dev/null
+++ b/common/kernel/array2d.h
@@ -0,0 +1,148 @@
+/*
+ *  nextpnr -- Next Generation Place and Route
+ *
+ *  Copyright (C) 2021-22  gatecat <gatecat@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef ARRAY2D_H
+#define ARRAY2D_H
+
+#include <algorithm>
+#include <fstream>
+#include <limits>
+#include <string>
+#include <type_traits>
+#include <vector>
+
+#include "nextpnr_assertions.h"
+#include "nextpnr_namespaces.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+template <typename T> class array2d
+{
+  public:
+    array2d() : m_width(0), m_height(0), m_size(0), data(nullptr){};
+    array2d(int width, int height) : m_width(width), m_height(height), m_size(width * height)
+    {
+        data = new T[m_width * m_height]();
+    }
+    array2d(int width, int height, const T &init) : m_width(width), m_height(height), m_size(width * height)
+    {
+        data = new T[m_width * m_height];
+        std::fill(data, data + (m_width * m_height), init);
+    }
+    array2d(const array2d &other) : m_width(other.m_width), m_height(other.m_height), m_size(other.m_size)
+    {
+        data = new T[m_size]();
+        if (m_size > 0)
+            std::copy(other.data, other.data + (m_width * m_height), data);
+    }
+    array2d(array2d &&other) noexcept : m_width(other.m_width), m_height(other.m_height), m_size(other.m_size)
+    {
+        data = other.data;
+        other.data = nullptr;
+        other.m_size = 0;
+    }
+    void reset(int new_width, int new_height, const T &init = {})
+    {
+        if ((new_width * new_height) > m_size) {
+            delete[] data;
+            m_size = new_width * new_height;
+            data = new T[m_size];
+        }
+        m_width = new_width;
+        m_height = new_height;
+        std::fill(data, data + (m_width * m_height), init);
+    }
+
+    int width() const { return m_width; }
+    int height() const { return m_height; }
+    T &at(int x, int y)
+    {
+        NPNR_ASSERT(x >= 0 && x < m_width);
+        NPNR_ASSERT(y >= 0 && y < m_height);
+        return data[y * m_width + x];
+    }
+    T &at(const Loc &l) { return at(l.x, l.y); }
+    const T &at(int x, int y) const
+    {
+        NPNR_ASSERT(x >= 0 && x < m_width);
+        NPNR_ASSERT(y >= 0 && y < m_height);
+        return data[y * m_width + x];
+    }
+    const T &at(const Loc &l) const { return at(l.x, l.y); }
+    ~array2d() { delete[] data; }
+    struct entry
+    {
+        entry(int x, int y, T &value) : x(x), y(y), value(value){};
+        int x, y;
+        T &value;
+    };
+    struct iterator
+    {
+      public:
+        entry operator*() { return {x, y, base->at(x, y)}; }
+        inline iterator operator++()
+        {
+            ++x;
+            if (x >= base->width()) {
+                x = 0;
+                ++y;
+            }
+            return *this;
+        }
+        inline iterator operator++(int)
+        {
+            iterator prior(x, y, base);
+            ++x;
+            if (x >= base->width()) {
+                x = 0;
+                ++y;
+            }
+            return prior;
+        }
+        inline bool operator!=(const iterator &other) const { return other.x != x || other.y != y; }
+        inline bool operator==(const iterator &other) const { return other.x == x && other.y == y; }
+
+      private:
+        iterator(int x, int y, array2d<T> &base) : x(x), y(y), base(&base){};
+        int x, y;
+        array2d<T> *base;
+        friend class array2d;
+    };
+    iterator begin() { return {0, 0, *this}; }
+    iterator end() { return {0, m_height, *this}; }
+
+    void write_csv(const std::string &filename) const
+    {
+        std::ofstream out(filename);
+        NPNR_ASSERT(out);
+        for (int y = 0; y < m_height; y++) {
+            for (int x = 0; x < m_width; x++) {
+                out << at(x, y) << ",";
+            }
+            out << std::endl;
+        }
+    }
+
+  private:
+    int m_width, m_height, m_size;
+    T *data;
+};
+NEXTPNR_NAMESPACE_END
+
+#endif
diff --git a/common/kernel/deterministic_rng.h b/common/kernel/deterministic_rng.h
index 3aab5a49..73a9ef78 100644
--- a/common/kernel/deterministic_rng.h
+++ b/common/kernel/deterministic_rng.h
@@ -72,6 +72,8 @@ struct DeterministicRNG
         }
     }
 
+    float rngf(float n) { return float(rng64() & 0x3fffffff) * (n / float(0x3fffffff)); }
+
     void rngseed(uint64_t seed)
     {
         rngstate = seed ? seed : 0x3141592653589793;
diff --git a/common/place/placer_static.cc b/common/place/placer_static.cc
new file mode 100644
index 00000000..3e105e47
--- /dev/null
+++ b/common/place/placer_static.cc
@@ -0,0 +1,1296 @@
+/*
+ *  nextpnr -- Next Generation Place and Route
+ *
+ *  Copyright (C) 2022  gatecat <gatecat@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#include "placer_static.h"
+#include "static_util.h"
+
+#include <boost/optional.hpp>
+#include <chrono>
+#include <deque>
+#include <fstream>
+#include <numeric>
+#include <queue>
+#include <tuple>
+#include "array2d.h"
+#include "fast_bels.h"
+#include "log.h"
+#include "nextpnr.h"
+#include "parallel_refine.h"
+#include "place_common.h"
+#include "placer1.h"
+#include "scope_lock.h"
+#include "timing.h"
+#include "util.h"
+
+#include "fftsg.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+using namespace StaticUtil;
+
+namespace {
+
+struct PlacerGroup
+{
+    int total_bels = 0;
+    double concrete_area = 0;
+    double dark_area = 0;
+    double total_area = 0;
+    array2d<float> loc_area;
+
+    float overlap = 0;
+
+    array2d<double> conc_density; // excludes fillers and dark nodes
+    array2d<double> density;
+    // FFT related data (TODO: should these be per group?)
+    FFTArray density_fft;
+    FFTArray electro_phi;
+    FFTArray electro_fx, electro_fy;
+};
+
+// Could be an actual concrete netlist cell; or just a spacer
+struct MoveCell
+{
+    StaticRect rect;
+    // TODO: multiple contiguous vectors is probably faster than an array of structs, but also messier
+    RealPair pos, ref_pos, last_pos, last_ref_pos;
+    RealPair ref_wl_grad, wl_grad, last_wl_grad;
+    RealPair ref_dens_grad, dens_grad, last_dens_grad;
+    RealPair ref_total_grad, total_grad, last_total_grad;
+    int32_t pin_count;
+    int16_t group;
+    int16_t bx, by; // bins
+    bool is_fixed : 1;
+    bool is_spacer : 1;
+    bool is_dark : 1;
+};
+
+// Extra data for cells that aren't spacers
+struct ConcreteCell
+{
+    CellInfo *base_cell;
+    // When cells are macros; we split them up into chunks
+    // based on dx/dy location
+    int32_t macro_idx = -1;
+    int16_t chunk_dx = 0, chunk_dy = 0;
+};
+
+struct ClusterGroupKey
+{
+    ClusterGroupKey(int dx = 0, int dy = 0, int group = -1) : dx(dx), dy(dy), group(group){};
+    bool operator==(const ClusterGroupKey &other) const
+    {
+        return dx == other.dx && dy == other.dy && group == other.group;
+    }
+    unsigned hash() const { return mkhash(mkhash(dx, dy), group); }
+    int16_t dx, dy, group;
+};
+
+struct PlacerMacro
+{
+    CellInfo *root;
+    std::vector<int32_t> conc_cells;
+    dict<ClusterGroupKey, std::vector<CellInfo *>> cells;
+};
+
+struct PlacerBin
+{
+    float density;
+    // ...
+};
+
+struct PlacerPort
+{
+    // for wirelength data
+    static constexpr float invalid = std::numeric_limits<float>::lowest();
+
+    PortRef ref;
+    RealPair max_exp{invalid, invalid};
+    RealPair min_exp{invalid, invalid};
+    bool has_max_exp(Axis axis) const { return max_exp.at(axis) != invalid; }
+    bool has_min_exp(Axis axis) const { return min_exp.at(axis) != invalid; }
+};
+
+struct PlacerNet
+{
+    NetInfo *ni;
+    bool skip = false;
+    RealPair b1, b0; // real bounding box
+    RealPair min_exp, x_min_exp;
+    RealPair max_exp, x_max_exp;
+    RealPair wa_wl;
+    // lines up with user indexes; plus one for driver
+    std::vector<PlacerPort> ports;
+    int hpwl() { return (b1.x - b0.x) + (b1.y - b0.y); }
+};
+
+class StaticPlacer
+{
+    Context *ctx;
+    PlacerStaticCfg cfg;
+
+    std::vector<MoveCell> mcells;
+    std::vector<ConcreteCell> ccells;
+    std::vector<PlacerMacro> macros;
+    std::vector<PlacerGroup> groups;
+    std::vector<PlacerNet> nets;
+    idict<ClusterId> cluster2idx;
+
+    FastBels fast_bels;
+    TimingAnalyser tmg;
+
+    int width, height;
+    int iter = 0;
+    bool fft_debug = false;
+
+    // legalisation queue
+    std::priority_queue<std::pair<int, IdString>> to_legalise;
+
+    void prepare_cells()
+    {
+        for (auto &cell : ctx->cells) {
+            CellInfo *ci = cell.second.get();
+            ci->udata = -1;
+            // process legacy-ish bel attributes
+            if (ci->attrs.count(ctx->id("BEL")) && ci->bel == BelId()) {
+                std::string loc_name = ci->attrs.at(ctx->id("BEL")).as_string();
+                BelId bel = ctx->getBelByNameStr(loc_name);
+                NPNR_ASSERT(ctx->isValidBelForCellType(ci->type, bel));
+                NPNR_ASSERT(ctx->checkBelAvail(bel));
+                ctx->bindBel(bel, ci, STRENGTH_USER);
+            }
+        }
+    }
+
+    bool lookup_group(IdString type, int &group, StaticRect &rect)
+    {
+        for (size_t i = 0; i < cfg.cell_groups.size(); i++) {
+            const auto &g = cfg.cell_groups.at(i);
+            if (g.cell_area.count(type)) {
+                group = i;
+                rect = g.cell_area.at(type);
+                return true;
+            }
+        }
+        return false;
+    }
+
+    void init_bels()
+    {
+        log_info("⌁ initialising bels...\n");
+        width = 0;
+        height = 0;
+        for (auto bel : ctx->getBels()) {
+            Loc loc = ctx->getBelLocation(bel);
+            width = std::max(width, loc.x + 1);
+            height = std::max(height, loc.y + 1);
+        }
+        dict<IdString, int> beltype2group;
+        for (int i = 0; i < int(groups.size()); i++) {
+            groups.at(i).loc_area.reset(width, height);
+            for (const auto &bel_type : cfg.cell_groups.at(i).cell_area)
+                beltype2group[bel_type.first] = i;
+        }
+        for (auto bel : ctx->getBels()) {
+            Loc loc = ctx->getBelLocation(bel);
+            IdString type = ctx->getBelType(bel);
+            auto fnd = beltype2group.find(type);
+            if (fnd == beltype2group.end())
+                continue;
+            auto size = cfg.cell_groups.at(fnd->second).bel_area.at(type); // TODO: do we care about dimensions too
+            auto &group = groups.at(fnd->second);
+            for (int dy = 0; dy <= int(size.h); dy++) {
+                for (int dx = 0; dx <= int(size.w); dx++) {
+                    float h = (dy == int(size.h)) ? (size.h - int(size.h)) : 1;
+                    float w = (dx == int(size.w)) ? (size.w - int(size.w)) : 1;
+                    group.loc_area.at(loc.x + dx, loc.y + dy) += w * h;
+                }
+            }
+            group.total_area += size.area();
+            group.total_bels += 1;
+        }
+    }
+
+    void init_nets()
+    {
+        nets.reserve(ctx->nets.size());
+        for (auto &net : ctx->nets) {
+            NetInfo *ni = net.second.get();
+            ni->udata = nets.size();
+            nets.emplace_back();
+
+            auto &nd = nets.back();
+            nd.ni = ni;
+            nd.skip = (ni->driver.cell == nullptr);    // (or global buffer?)
+            nd.ports.resize(ni->users.capacity() + 1); // +1 for the driver
+            nd.ports.back().ref = ni->driver;
+            for (auto usr : ni->users.enumerate()) {
+                nd.ports.at(usr.index.idx()).ref = usr.value;
+            }
+        }
+    }
+
+    int add_cell(StaticRect rect, int group, RealPair pos, CellInfo *ci = nullptr)
+    {
+        int idx = mcells.size();
+        mcells.emplace_back();
+        auto &m = mcells.back();
+        m.rect = rect;
+        m.group = group;
+        m.pos = pos;
+        if (ci) {
+            // Is a concrete cell (might be a macro, in which case ci is just one of them...)
+            // Can't add concrete cells once we have spacers (we define it such that indices line up between mcells and
+            // ccells; spacer cells only exist in mcells)
+            NPNR_ASSERT(idx == int(ccells.size()));
+            ccells.emplace_back();
+            auto &c = ccells.back();
+            c.base_cell = ci;
+            groups.at(group).concrete_area += rect.area();
+        } else {
+            // Is a spacer cell
+            m.is_spacer = true;
+        }
+        return idx;
+    }
+
+    const float pi = 3.141592653589793f;
+
+    RealPair rand_loc()
+    {
+        // Box-muller
+        float u1 = ctx->rngf(1.0f);
+        while (u1 < 1e-5)
+            u1 = ctx->rngf(1.0f);
+        float u2 = ctx->rngf(1.0f);
+        float m = std::sqrt(-2.f * std::log(u1));
+        float z0 = m * std::cos(2.f * pi * u2);
+        float z1 = m * std::sin(2.f * pi * u2);
+        float x = (width / 2.f) + (width / 50.f) * z0;
+        float y = (height / 2.f) + (height / 50.f) * z1;
+        x = std::min<float>(width - 1.f, std::max<float>(x, 0));
+        y = std::min<float>(height - 1.f, std::max<float>(y, 0));
+        return RealPair(x, y);
+    }
+
+    RealPair cell_loc(CellInfo *ci, bool ref)
+    {
+        if (ci->udata == -1) {
+            // not handled?
+            NPNR_ASSERT(ci->bel != BelId()); // already fixed
+            return RealPair(ctx->getBelLocation(ci->bel), 0.5f);
+        } else {
+            return ref ? mcells.at(ci->udata).ref_pos : mcells.at(ci->udata).pos;
+        }
+    }
+
+    void init_cells()
+    {
+        log_info("⌁ initialising cells...\n");
+        // Process non-clustered cells and find clusters
+        for (auto &cell : ctx->cells) {
+            CellInfo *ci = cell.second.get();
+            int cell_group;
+            StaticRect rect;
+            // Mismatched group case
+            if (!lookup_group(ci->type, cell_group, rect)) {
+                // TODO: what is the best thing to do here? singletons/odd cells we can probably mostly randomly place
+                continue;
+            }
+            if (ci->cluster != ClusterId()) {
+                // Defer processing of macro clusters
+                int c_idx = cluster2idx(ci->cluster);
+                if (c_idx >= int(macros.size())) {
+                    macros.emplace_back();
+                    macros.back().root = ctx->getClusterRootCell(ci->cluster);
+                }
+                auto &m = macros.at(c_idx);
+                Loc delta = ctx->getClusterOffset(ci);
+                m.cells[ClusterGroupKey(delta.x, delta.y, cell_group)].push_back(ci);
+            } else {
+                // Non-clustered cells can be processed already
+                int idx = add_cell(rect, cell_group, rand_loc(), ci);
+                ci->udata = idx;
+                auto &mc = mcells.at(idx);
+                mc.pin_count += int(ci->ports.size());
+                if (ci->bel != BelId()) {
+                    // Currently; treat all ready-placed cells as fixed (eventually we might do incremental ripups
+                    // here...)
+                    Loc loc = ctx->getBelLocation(ci->bel);
+                    mc.pos.x = loc.x + 0.5;
+                    mc.pos.y = loc.y + 0.5;
+                    mc.is_fixed = true;
+                }
+            }
+        }
+        // Process clustered cells
+        for (int i = 0; i < int(macros.size()); i++) {
+            auto &m = macros.at(i);
+            for (auto &kv : m.cells) {
+                const auto &g = cfg.cell_groups.at(kv.first.group);
+                // Only treat zero-area cells as zero-area; if this cluster also contains non-zero area cells
+                bool has_nonzero = std::any_of(kv.second.begin(), kv.second.end(),
+                                               [&](const CellInfo *ci) { return !g.zero_area_cells.count(ci->type); });
+                StaticRect cluster_size;
+                for (auto ci : kv.second) {
+                    if (has_nonzero && g.zero_area_cells.count(ci->type))
+                        continue;
+                    // Compute an equivalent-area stacked rectange for cells in this cluster group.
+                    // There are probably some ugly cases this handles badly.
+                    StaticRect r = g.cell_area.at(ci->type);
+                    if (r.w > r.h) {
+                        // Long and thin, "stack" vertically
+                        // Compute height we add to stack
+                        if (cluster_size.w < r.w) {
+                            cluster_size.h *= (cluster_size.w / r.w);
+                            cluster_size.w = r.w;
+                        }
+                        cluster_size.h += ((r.w * r.h) / cluster_size.w);
+                    } else {
+                        // "stack" horizontally
+                        if (cluster_size.h < r.h) {
+                            cluster_size.w *= (cluster_size.h / r.h);
+                            cluster_size.h = r.h;
+                        }
+                        cluster_size.w += ((r.w * r.h) / cluster_size.h);
+                    }
+                }
+                // Now add the moveable cell
+                if (cluster_size.area() > 0) {
+                    int idx = add_cell(cluster_size, kv.first.group, rand_loc(), kv.second.front());
+                    auto &mc = mcells.at(idx);
+                    if (kv.second.front()->bel != BelId()) {
+                        // Currently; treat all ready-placed cells as fixed (eventually we might do incremental ripups
+                        // here...)
+                        Loc loc = ctx->getBelLocation(kv.second.front()->bel);
+                        mc.pos.x = loc.x + 0.5;
+                        mc.pos.y = loc.y + 0.5;
+                        mc.is_fixed = true;
+                    }
+                    for (auto ci : kv.second) {
+                        ci->udata = idx;
+                        mc.pin_count += int(ci->ports.size());
+                    }
+                    auto &cc = ccells.at(idx);
+                    cc.macro_idx = i;
+                    cc.chunk_dx = kv.first.dx;
+                    cc.chunk_dy = kv.first.dy;
+                    m.conc_cells.push_back(idx);
+                }
+            }
+        }
+    }
+
+    const double target_util = 0.8;
+
+    void insert_dark()
+    {
+        log_info("⌁ inserting dark nodes...\n");
+        for (int group = 0; group < int(groups.size()); group++) {
+            auto &g = groups.at(group);
+            for (auto tile : g.loc_area) {
+                if (tile.value > 0.5f)
+                    continue;
+                StaticRect dark_area(1.0f, 1.0f - tile.value);
+                int cell_idx = add_cell(dark_area, group, RealPair(tile.x + 0.5f, tile.y + 0.5f), nullptr /*spacer*/);
+                mcells.at(cell_idx).is_dark = true;
+            }
+        }
+    }
+
+    void insert_spacer()
+    {
+        log_info("⌁ inserting spacers...\n");
+        int inserted_spacers = 0;
+        for (int group = 0; group < int(groups.size()); group++) {
+            const auto &cg = cfg.cell_groups.at(group);
+            const auto &g = groups.at(group);
+            double util = g.concrete_area / g.total_area;
+            log_info("⌁   group %s pre-spacer utilisation %.02f%% (target %.02f%%)\n", ctx->nameOf(cg.name),
+                     (util * 100.0), (target_util * 100.0));
+            // TODO: better computation of spacer size and placement?
+            int spacer_count = (g.total_area * target_util - g.concrete_area) / cg.spacer_rect.area();
+            if (spacer_count <= 0)
+                continue;
+            for (int i = 0; i < spacer_count; i++) {
+                add_cell(cg.spacer_rect, group, RealPair(ctx->rngf(width), ctx->rngf(height)), nullptr /*spacer*/);
+                ++inserted_spacers;
+            }
+        }
+        log_info("⌁   inserted a total of %d spacers\n", inserted_spacers);
+    }
+
+    // TODO: dark node insertion when we have obstructions or non-rectangular placement regions
+    int m;
+    double bin_w, bin_h;
+
+    std::vector<float> cs_table_fft;
+    std::vector<int> work_area_fft;
+
+    void prepare_density_bins()
+    {
+        // TODO: a m x m grid follows the paper and makes the DCTs easier, but is it actually ideal for non-square
+        // FPGAs?
+        m = 1 << int(std::ceil(std::log2(std::sqrt(mcells.size() / groups.size()))));
+        bin_w = double(width) / m;
+        bin_h = double(height) / m;
+
+        for (auto &g : groups) {
+            g.density.reset(m, m, 0);
+            g.density_fft.reset(m, m, 0);
+            g.electro_phi.reset(m, m, 0);
+            g.electro_fx.reset(m, m, 0);
+            g.electro_fy.reset(m, m, 0);
+        }
+        cs_table_fft.resize(m * 3 / 2, 0);
+        work_area_fft.resize(std::round(std::sqrt(m)) + 2, 0);
+        work_area_fft.at(0) = 0;
+    }
+
+    template <typename TFunc> void iter_slithers(RealPair pos, StaticRect rect, TFunc func)
+    {
+        // compute the stamp over bins (this could probably be more efficient?)
+
+        double width = rect.w, height = rect.h;
+        double scaled_density = 1.0;
+        if (width < bin_w) {
+            scaled_density *= (width / bin_w);
+            width = bin_w;
+        }
+        if (height < bin_h) {
+            scaled_density *= (height / bin_h);
+            height = bin_h;
+        }
+
+        double x0 = pos.x - (width / 2), x1 = pos.x + (width / 2);
+        double y0 = pos.y - (height / 2), y1 = pos.y + (height / 2);
+        for (int y = int(y0 / bin_h); y <= int(y1 / bin_h); y++) {
+            for (int x = int(x0 / bin_w); x <= int(x1 / bin_w); x++) {
+                int xb = std::max(0, std::min(x, m - 1));
+                int yb = std::max(0, std::min(y, m - 1));
+                double slither_w = 1.0, slither_h = 1.0;
+                if (yb == int(y0 / bin_h)) // y slithers
+                    slither_h = ((yb + 1) * bin_h) - y0;
+                else if (yb == int(y1 / bin_h))
+                    slither_h = (y1 - yb * bin_h);
+                if (xb == int(x0 / bin_w)) // x slithers
+                    slither_w = ((xb + 1) * bin_w) - x0;
+                else if (xb == int(x1 / bin_w))
+                    slither_w = (x1 - xb * bin_w);
+                func(xb, yb, scaled_density * slither_w * slither_h);
+            }
+        }
+    };
+
+    void compute_density(int group, bool ref)
+    {
+        auto &g = groups.at(group);
+        // reset
+        for (auto entry : g.density)
+            entry.value = 0;
+        // populate
+        for (int idx = 0; idx < int(mcells.size()); idx++) {
+            auto &mc = mcells.at(idx);
+            if (mc.group != group)
+                continue;
+            // scale width and height to be at least one bin (local density smoothing from the eplace paper)
+            // TODO: should we really do this every iteration?
+
+            auto pos = ref ? mc.ref_pos : mc.pos;
+            iter_slithers(pos, mc.rect, [&](int x, int y, float area) { g.density.at(x, y) += area; });
+        }
+    }
+
+    void compute_overlap()
+    {
+        // populate for concrete cells only
+        for (auto &g : groups)
+            g.conc_density.reset(m, m, 0);
+        for (int idx = 0; idx < int(ccells.size()); idx++) {
+            auto &mc = mcells.at(idx);
+            auto &g = groups.at(mc.group);
+            // scale width and height to be at least one bin (local density smoothing from the eplace paper)
+            // TODO: should we really do this every iteration?
+            auto loc = mc.pos;
+            auto size = mc.rect;
+
+            for (int dy = 0; dy <= int(size.h); dy++) {
+                for (int dx = 0; dx <= int(size.w); dx++) {
+                    float h = (dy == int(size.h)) ? (size.h - int(size.h)) : 1;
+                    float w = (dx == int(size.w)) ? (size.w - int(size.w)) : 1;
+                    g.conc_density.at(loc.x + dx, loc.y + dy) += w * h;
+                }
+            }
+        }
+        std::string overlap_str = "";
+        for (int idx = 0; idx < int(groups.size()); idx++) {
+            auto &g = groups.at(idx);
+            g.overlap = 0;
+            float total_area = 0;
+            for (auto tile : g.loc_area) {
+                // g.overlap += std::max<float>(0, g.conc_density.at(tile.x, tile.y) - tile.value);
+                g.overlap += std::max<float>(0, g.conc_density.at(tile.x, tile.y) - 1);
+                total_area += g.conc_density.at(tile.x, tile.y);
+            }
+            g.overlap /= std::max(1.0f, total_area);
+            if (!overlap_str.empty())
+                overlap_str += ", ";
+            overlap_str += stringf("%s=%.1f%%", cfg.cell_groups.at(idx).name.c_str(ctx), g.overlap*100);
+            g.conc_density.write_csv(stringf("out_conc_density_%d_%d.csv", iter, idx));
+        }
+        log_info("overlap: %s\n", overlap_str.c_str());
+    }
+
+    void run_fft(int group)
+    {
+        // get data into form that fft wants
+        auto &g = groups.at(group);
+        for (auto entry : g.density)
+            g.density_fft.at(entry.x, entry.y) = entry.value;
+        if (fft_debug)
+            g.density_fft.write_csv(stringf("out_bin_density_%d_%d.csv", iter, group));
+        // Based on
+        // https://github.com/ALIGN-analoglayout/ALIGN-public/blob/master/PlaceRouteHierFlow/EA_placer/FFT/fft.cpp
+        // initial DCT for coefficients
+        ddct2d(m, m, -1, g.density_fft.data(), nullptr, work_area_fft.data(), cs_table_fft.data());
+        // postprocess coefficients
+        for (int x = 0; x < m; x++)
+            g.density_fft.at(x, 0) *= 0.5f;
+        for (int y = 0; y < m; y++)
+            g.density_fft.at(0, y) *= 0.5f;
+        for (int x = 0; x < m; x++)
+            for (int y = 0; y < m; y++)
+                g.density_fft.at(x, y) *= (4.0f / (m * m));
+        // scale inputs to IDCT for potentials and field
+        for (int x = 0; x < m; x++) {
+            float wx = pi * (x / float(m));
+            float wx2 = wx * wx;
+            for (int y = 0; y < m; y++) {
+                float wy = pi * (y / float(m));
+                float wy2 = wy * wy;
+
+                float dens = g.density_fft.at(x, y);
+                float phi = 0, ex = 0, ey = 0;
+
+                if (x != 0 || y != 0) { // avoid divide by zero...
+                    phi = dens / (wx2 + wy2);
+                    ex = phi * wx;
+                    ey = phi * wy;
+                }
+
+                g.electro_phi.at(x, y) = phi;
+                g.electro_fx.at(x, y) = ex;
+                g.electro_fy.at(x, y) = ey;
+            }
+        }
+        // IDCT for potential; 2D derivatives for field
+        ddct2d(m, m, 1, g.electro_phi.data(), nullptr, work_area_fft.data(), cs_table_fft.data());
+        ddsct2d(m, m, 1, g.electro_fx.data(), nullptr, work_area_fft.data(), cs_table_fft.data());
+        ddcst2d(m, m, 1, g.electro_fy.data(), nullptr, work_area_fft.data(), cs_table_fft.data());
+        if (fft_debug) {
+            g.electro_phi.write_csv(stringf("out_bin_phi_%d_%d.csv", iter, group));
+            g.electro_fx.write_csv(stringf("out_bin_ex_%d_%d.csv", iter, group));
+            g.electro_fy.write_csv(stringf("out_bin_ey_%d_%d.csv", iter, group));
+        }
+    }
+
+    void compute_bounds(PlacerNet &net, Axis axis, bool ref)
+    {
+        NetInfo *ni = net.ni;
+        auto drv_loc = cell_loc(ni->driver.cell, ref);
+        // seed with driver location
+        net.b1.at(axis) = net.b0.at(axis) = drv_loc.at(axis);
+        // iterate over users
+        for (auto &usr : ni->users) {
+            auto usr_loc = cell_loc(usr.cell, ref);
+            net.b1.at(axis) = std::max(net.b1.at(axis), usr_loc.at(axis));
+            net.b0.at(axis) = std::min(net.b0.at(axis), usr_loc.at(axis));
+        }
+    }
+
+    RealPair wl_coeff{0.5f, 0.5f};
+
+    void update_nets(Axis axis, bool ref)
+    {
+        static constexpr float min_wirelen_force = -300.f;
+        for (auto &net : nets) {
+            if (net.skip)
+                continue;
+            net.min_exp.at(axis) = 0;
+            net.x_min_exp.at(axis) = 0;
+            net.max_exp.at(axis) = 0;
+            net.x_max_exp.at(axis) = 0;
+            // update bounding box
+            compute_bounds(net, axis, ref);
+            // compute rough center to subtract from exponents to avoid FP issues (from replace)
+            float c = (net.b1.at(axis) + net.b0.at(axis)) / 2.f;
+            for (auto &port : net.ports) {
+                if (!port.ref.cell)
+                    continue;
+                RealPair loc = cell_loc(port.ref.cell, ref);
+                // update weighted-average model exponents
+                float emin = (c - loc.at(axis)) * wl_coeff.at(axis);
+                float emax = (loc.at(axis) - c) * wl_coeff.at(axis);
+
+                if (emin > min_wirelen_force) {
+                    port.min_exp.at(axis) = std::exp(emin);
+                    net.min_exp.at(axis) += port.min_exp.at(axis);
+                    net.x_min_exp.at(axis) += loc.at(axis) * port.min_exp.at(axis);
+                } else {
+                    port.min_exp.at(axis) = PlacerPort::invalid;
+                }
+                if (emax > min_wirelen_force) {
+                    port.max_exp.at(axis) = std::exp(emax);
+                    net.max_exp.at(axis) += port.max_exp.at(axis);
+                    net.x_max_exp.at(axis) += loc.at(axis) * port.max_exp.at(axis);
+                } else {
+                    port.max_exp.at(axis) = PlacerPort::invalid;
+                }
+            }
+            net.wa_wl.at(axis) =
+                    (net.x_max_exp.at(axis) / net.max_exp.at(axis)) - (net.x_min_exp.at(axis) / net.min_exp.at(axis));
+        }
+    }
+
+    float wirelen_grad(CellInfo *cell, Axis axis, bool ref)
+    {
+        float gradient = 0;
+        if (cell->udata == -1)
+            return 0;
+        RealPair loc = cell_loc(cell, ref);
+        for (auto &port : cell->ports) {
+            NetInfo *ni = port.second.net;
+            if (!ni)
+                continue;
+            auto &nd = nets.at(ni->udata);
+            if (nd.skip)
+                continue;
+            auto &pd = nd.ports.at(port.second.type == PORT_OUT ? (nd.ports.size() - 1) : port.second.user_idx.idx());
+            // From Replace
+            // TODO: check these derivatives on paper
+            float d_min = 0, d_max = 0;
+            if (pd.has_min_exp(axis)) {
+                float min_sum = nd.min_exp.at(axis), x_min_sum = nd.x_min_exp.at(axis);
+                d_min = (min_sum * (pd.min_exp.at(axis) * (1.0f - wl_coeff.at(axis) * loc.at(axis))) +
+                         wl_coeff.at(axis) * pd.min_exp.at(axis) * x_min_sum) /
+                        (min_sum * min_sum);
+            }
+            if (pd.has_max_exp(axis)) {
+                float max_sum = nd.max_exp.at(axis), x_max_sum = nd.x_max_exp.at(axis);
+                d_max = (max_sum * (pd.max_exp.at(axis) * (1.0f + wl_coeff.at(axis) * loc.at(axis))) -
+                         wl_coeff.at(axis) * pd.max_exp.at(axis) * x_max_sum) /
+                        (max_sum * max_sum);
+            }
+            gradient += (d_min - d_max);
+        }
+
+        return gradient;
+    }
+
+    float dens_penalty = 0.025f;
+    float nesterov_a = 1.0f;
+
+    void update_gradients(bool ref = true, bool set_prev = true, bool init_penalty = false)
+    {
+        // TODO: skip non-group cells more efficiently?
+        for (int group = 0; group < int(groups.size()); group++) {
+            compute_density(group, ref);
+            run_fft(group);
+        }
+        for (auto axis : {Axis::X, Axis::Y}) {
+            update_nets(axis, ref);
+        }
+        // First loop: back up gradients if required; set to zero; and compute density gradient
+        for (auto &cell : mcells) {
+            auto &g = groups.at(cell.group);
+            if (set_prev && ref) {
+                cell.last_wl_grad = cell.ref_wl_grad;
+                cell.last_dens_grad = cell.ref_dens_grad;
+                cell.last_total_grad = cell.ref_total_grad;
+            }
+            // wirelength gradient updated based on cell instances in next loop
+            (ref ? cell.ref_wl_grad : cell.wl_grad) = RealPair(0, 0);
+            // density grad based on bins - do we need to interpolate?
+            auto &grad = (ref ? cell.ref_dens_grad : cell.dens_grad);
+            grad = RealPair(0, 0);
+            iter_slithers((ref ? cell.ref_pos : cell.pos), cell.rect, [&](int x, int y, float area) {
+                grad += RealPair(g.electro_fx.at(x, y) * area, g.electro_fy.at(x, y) * area);
+            });
+            // total gradient computed at the end
+            (ref ? cell.ref_total_grad : cell.total_grad) = RealPair(0, 0);
+        }
+        // Second loop: sum up wirelength gradients across concrete cell instances
+        for (auto &cell : ctx->cells) {
+            CellInfo *ci = cell.second.get();
+            if (ci->udata == -1)
+                continue;
+            auto &mc = mcells.at(ci->udata);
+            // TODO: exploit parallelism across axes
+            float wl_gx = wirelen_grad(ci, Axis::X, ref);
+            float wl_gy = wirelen_grad(ci, Axis::Y, ref);
+            (ref ? mc.ref_wl_grad : mc.wl_grad) += RealPair(wl_gx, wl_gy);
+        }
+        if (init_penalty) {
+            // set initial density penalty
+            float wirelen_sum = 0;
+            float force_sum = 0;
+            for (auto &cell : ctx->cells) {
+                CellInfo *ci = cell.second.get();
+                if (ci->udata == -1)
+                    continue;
+                auto &mc = mcells.at(ci->udata);
+                wirelen_sum += std::abs(mc.ref_wl_grad.x) + std::abs(mc.ref_wl_grad.y);
+                force_sum += std::abs(mc.ref_dens_grad.x) + std::abs(mc.ref_dens_grad.y);
+            }
+            dens_penalty = wirelen_sum / force_sum;
+            log_info(" initial density penalty: %f\n", dens_penalty);
+        }
+        // Third loop: compute total gradient, and precondition
+        // TODO: ALM as well as simple penalty
+        for (auto &cell : mcells) {
+#if 0
+            if (!cell.is_spacer) {
+                printf("%d (%f, %f) wirelen_grad: (%f,%f) density_grad: (%f,%f)\n", iter, cell.ref_pos.x,
+                       cell.ref_pos.y, cell.ref_wl_grad.x, cell.ref_wl_grad.y, cell.ref_dens_grad.x,
+                       cell.ref_dens_grad.y);
+            }
+#endif
+            // Preconditioner from replace for now
+            float precond = std::max(1.0f, float(cell.pin_count) + dens_penalty * cell.rect.area());
+            (ref ? cell.ref_total_grad : cell.total_grad) =
+                    (((ref ? cell.ref_wl_grad : cell.wl_grad) * -1) -
+                     (ref ? cell.ref_dens_grad : cell.dens_grad) * dens_penalty) /
+                    precond;
+        }
+    }
+
+    float steplen = 0.01;
+
+    float get_steplen()
+    {
+        float coord_dist = 0;
+        float grad_dist = 0;
+        int n = 0;
+        for (auto &cell : mcells) {
+            coord_dist += (cell.ref_pos.x - cell.last_ref_pos.x) * (cell.ref_pos.x - cell.last_ref_pos.x);
+            coord_dist += (cell.ref_pos.y - cell.last_ref_pos.y) * (cell.ref_pos.y - cell.last_ref_pos.y);
+            grad_dist +=
+                    (cell.ref_total_grad.x - cell.last_total_grad.x) * (cell.ref_total_grad.x - cell.last_total_grad.x);
+            grad_dist +=
+                    (cell.ref_total_grad.y - cell.last_total_grad.y) * (cell.ref_total_grad.y - cell.last_total_grad.y);
+            // keep track of N separately, because we might exclude some cells
+            ++n;
+        }
+        coord_dist = std::sqrt(coord_dist / (2 * float(mcells.size())));
+        grad_dist = std::sqrt(grad_dist / (2 * float(mcells.size())));
+        log_info("coord_dist: %f grad_dist: %f\n", coord_dist, grad_dist);
+        return coord_dist / grad_dist;
+        // return 0.1;
+    }
+
+    float system_hpwl()
+    {
+        float hpwl = 0;
+        for (auto &net : nets) {
+            if (net.skip)
+                continue;
+            // update bounding box
+            compute_bounds(net, Axis::X, false);
+            compute_bounds(net, Axis::Y, false);
+            hpwl += net.b1.x - net.b0.x;
+            hpwl += net.b1.y - net.b0.y;
+        }
+        return hpwl;
+    }
+
+    float system_potential()
+    {
+        float pot = 0;
+        for (auto &cell : mcells) {
+            auto &g = groups.at(cell.group);
+            iter_slithers(cell.ref_pos, cell.rect,
+                          [&](int x, int y, float area) { pot += g.electro_phi.at(x, y) * area; });
+        }
+        return pot;
+    }
+
+    void initialise()
+    {
+        float initial_steplength = 0.01f;
+        // Update current and previous gradients with initial solution
+        for (auto &cell : mcells) {
+            cell.ref_pos = cell.pos;
+        }
+        while (true) {
+            update_gradients(true, true, /* init_penalty */ true);
+            // compute a "fake" previous position based on an arbitrary steplength and said gradients for nesterov
+            for (auto &cell : mcells) {
+                if (cell.is_fixed)
+                    continue;
+                // save current position in last_pos
+                cell.last_pos = cell.pos;
+                cell.last_ref_pos = cell.ref_pos;
+                // compute previous position; but store it in current for gradient computation
+                cell.ref_pos = cell.pos - cell.ref_total_grad * initial_steplength;
+            }
+            // Compute the previous gradients (albeit into the current state fields)
+            update_gradients(true);
+            // Now we have the fake previous state in the current state
+            for (auto &cell : mcells) {
+                std::swap(cell.last_ref_pos, cell.ref_pos);
+                std::swap(cell.ref_total_grad, cell.last_total_grad);
+                std::swap(cell.ref_wl_grad, cell.last_wl_grad);
+                std::swap(cell.ref_dens_grad, cell.last_dens_grad);
+            }
+            float next_steplen = get_steplen();
+            log_info("initial steplen=%f next steplen = %f\n", initial_steplength, next_steplen);
+            if (next_steplen != 0 && std::isfinite(next_steplen) && std::abs(next_steplen) < 1e10) {
+                break;
+            } else {
+                initial_steplength *= 10;
+            }
+        }
+    }
+
+    RealPair clamp_loc(RealPair loc)
+    {
+        return RealPair(std::max<float>(0, std::min<float>(width - 1, loc.x)),
+                        std::max<float>(0, std::min<float>(height - 1, loc.y)));
+    }
+
+    void update_chains()
+    {
+        // Move the post-solve position of a chain towards be the weighted average of its constituents
+        // The strength increases with iterations
+        float alpha = std::min<float>(std::pow(1.002f, iter) - 1, 1.0f);
+        for (int i = 0; i < int(macros.size()); i++) {
+            auto &macro = macros.at(i);
+            float total_area = 0;
+            const float area_epsilon = 0.05;
+            RealPair pos(0, 0), ref_pos(0, 0);
+            for (int c : macro.conc_cells) {
+                auto &mc = mcells.at(c);
+                float a = std::max(mc.rect.area(), area_epsilon);
+                pos += mc.pos * a;
+                ref_pos += mc.ref_pos * a;
+                total_area += a;
+            }
+            pos /= total_area;
+            ref_pos /= total_area;
+            for (int c : macro.conc_cells) {
+                auto &cc = ccells.at(c);
+                auto &mc = mcells.at(c);
+                mc.pos = mc.pos * (1-alpha) + (pos + RealPair(cc.chunk_dx, cc.chunk_dy)) * alpha;
+                mc.ref_pos = mc.ref_pos * (1 - alpha) + (ref_pos + RealPair(cc.chunk_dx, cc.chunk_dy)) * alpha;
+            }
+        }
+    }
+
+    void step()
+    {
+        // TODO: update penalties; wirelength factor; etc
+        steplen = get_steplen();
+        log_info("iter=%d steplen=%f a=%f\n", iter, steplen, nesterov_a);
+        float a_next = (1.0f + std::sqrt(4.0f * nesterov_a * nesterov_a + 1)) / 2.0f;
+        // Update positions according to trivial gradient descent (let's leave Nesterov's for later...)
+        for (auto &cell : mcells) {
+            if (cell.is_fixed || cell.is_dark)
+                continue;
+            // save current position in last_pos
+            cell.last_ref_pos = cell.ref_pos;
+            cell.last_pos = cell.pos;
+            // compute new position
+            cell.pos = clamp_loc(cell.ref_pos - cell.ref_total_grad * steplen);
+            // compute reference position
+            cell.ref_pos = clamp_loc(cell.pos + (cell.pos - cell.last_pos) * ((nesterov_a - 1) / a_next));
+        }
+        nesterov_a = a_next;
+        update_chains();
+        update_gradients(true);
+        log_info("   system potential: %f hpwl: %f\n", system_potential(), system_hpwl());
+        compute_overlap();
+    }
+
+    void legalise_step(bool dsp_bram) {
+        // assume DSP and BRAM are all groups 2+ for now
+        for (int i = 0; i < int(ccells.size()); i++) {
+            auto &mc = mcells.at(i);
+            auto &cc = ccells.at(i);
+            if (dsp_bram && mc.group < 2)
+                continue;
+            if (!dsp_bram && mc.group >= 2)
+                continue;
+            if (cc.macro_idx != -1 && i != macros.at(cc.macro_idx).root->udata)
+                continue; // not macro root
+            if (mc.is_fixed) { // already placed
+                NPNR_ASSERT(cc.base_cell->bel != BelId());
+                continue;
+            }
+            enqueue_legalise(i);
+        }
+        // in the DSP/BRAM step, also merge any cells outside of a group (global buffers, other random crud/singletons)
+        for (auto &cell : ctx->cells) {
+            if (cell.second->udata == -1)
+                enqueue_legalise(cell.second.get());
+        }
+        log_info("Strict legalising %d cells...\n", int(to_legalise.size()));
+        float pre_hpwl = system_hpwl();
+        legalise_placement_strict(true);
+        for (auto axis : {Axis::X, Axis::Y}) {
+            update_nets(axis, true);
+        }
+        float post_hpwl = system_hpwl();
+        log_info("HPWL after legalise: %f (delta: %f)\n", post_hpwl, post_hpwl - pre_hpwl);
+    }
+
+    void enqueue_legalise(int cell_idx) {
+        NPNR_ASSERT(cell_idx < int(ccells.size())); // we should never be legalising spacers or dark nodes
+        auto &ccell = ccells.at(cell_idx);
+        if (ccell.macro_idx != -1) {
+            // is a macro
+            auto &macro = macros.at(ccell.macro_idx);
+            to_legalise.emplace(int(macro.cells.size()), macro.root->name);
+        } else {
+            to_legalise.emplace(1, ccell.base_cell->name);
+        }
+    }
+
+    void enqueue_legalise(CellInfo *ci) {
+        if (ci->udata != -1) {
+            // managed by static
+            enqueue_legalise(ci->udata);
+        } else {
+            // special case
+            to_legalise.emplace(1, ci->name);
+        }
+    }
+    // Strict placement legalisation, performed after the initial HeAP spreading
+    void legalise_placement_strict(bool require_validity = true)
+    {
+        // At the moment we don't follow the full HeAP algorithm using cuts for legalisation, instead using
+        // the simple greedy largest-macro-first approach.
+        int ripup_radius = 2;
+        int total_iters = 0;
+        int total_iters_noreset = 0;
+        while (!to_legalise.empty()) {
+            auto top = to_legalise.top();
+            to_legalise.pop();
+
+            CellInfo *ci = ctx->cells.at(top.second).get();
+            // Was now placed, ignore
+            if (ci->bel != BelId())
+                continue;
+            // log_info("   Legalising %s (%s) %d\n", top.second.c_str(ctx), ci->type.c_str(ctx), top.first);
+            FastBels::FastBelsData *fb;
+            fast_bels.getBelsForCellType(ci->type, &fb);
+            int radius = 0;
+            int iter = 0;
+            int iter_at_radius = 0;
+            int total_iters_for_cell = 0;
+            bool placed = false;
+            BelId bestBel;
+            int best_inp_len = std::numeric_limits<int>::max();
+
+            total_iters++;
+            total_iters_noreset++;
+            if (total_iters > int(ccells.size())) {
+                total_iters = 0;
+                ripup_radius = std::max(std::max(width+1, height+1), ripup_radius * 2);
+            }
+
+            if (total_iters_noreset > std::max(5000, 8 * int(ctx->cells.size()))) {
+                log_error("Unable to find legal placement for all cells, design is probably at utilisation limit.\n");
+            }
+
+            while (!placed) {
+                // Determine a search radius around the solver location (which increases over time) that is clamped to
+                // the region constraint for the cell (if applicable)
+                int rx = radius, ry = radius;
+
+                // Pick a random X and Y location within our search radius
+                int cx, cy;
+                if (ci->udata == -1) {
+                    cx = width / 2;
+                    cy = height / 2;
+                } else {
+                    cx = int(mcells.at(ci->udata).pos.x);
+                    cy = int(mcells.at(ci->udata).pos.y);
+                }
+                int nx = ctx->rng(2 * rx + 1) + std::max(cx - rx, 0);
+                int ny = ctx->rng(2 * ry + 1) + std::max(cy - ry, 0);
+
+                iter++;
+                iter_at_radius++;
+                if (iter >= (10 * (radius + 1))) {
+                    // No luck yet, increase radius
+                    radius = std::min(std::max(width+1, height+1), radius + 1);
+                    while (radius < std::max(width+1, height+1)) {
+                        // Keep increasing the radius until it will actually increase the number of cells we are
+                        // checking (e.g. BRAM and DSP will not be in all cols/rows), so we don't waste effort
+                        for (int x = std::max(0, cx - radius);
+                             x <= std::min(width+1, cx + radius); x++) {
+                            if (x >= int(fb->size()))
+                                break;
+                            for (int y = std::max(0, cy - radius);
+                                 y <= std::min(height+1, cy + radius); y++) {
+                                if (y >= int(fb->at(x).size()))
+                                    break;
+                                if (fb->at(x).at(y).size() > 0)
+                                    goto notempty;
+                            }
+                        }
+                        radius = std::min(std::max(width+1, height+1), radius + 1);
+                    }
+                notempty:
+                    iter_at_radius = 0;
+                    iter = 0;
+                }
+                // If our randomly chosen cooridnate is out of bounds; or points to a tile with no relevant bels; ignore
+                // it
+                if (nx < 0 || nx > width+1)
+                    continue;
+                if (ny < 0 || ny > height+1)
+                    continue;
+
+                if (nx >= int(fb->size()))
+                    continue;
+                if (ny >= int(fb->at(nx).size()))
+                    continue;
+                if (fb->at(nx).at(ny).empty())
+                    continue;
+
+                // The number of attempts to find a location to try
+                int need_to_explore = 2 * radius;
+
+                // If we have found at least one legal location; and made enough attempts; assume it's good enough and
+                // finish
+                if (iter_at_radius >= need_to_explore && bestBel != BelId()) {
+                    CellInfo *bound = ctx->getBoundBelCell(bestBel);
+                    if (bound != nullptr) {
+                        ctx->unbindBel(bound->bel);
+                        enqueue_legalise(bound);
+                    }
+                    ctx->bindBel(bestBel, ci, STRENGTH_WEAK);
+                    placed = true;
+                    Loc loc = ctx->getBelLocation(bestBel);
+                    if (ci->udata != -1) {
+                        auto &mc = mcells.at(ci->udata);
+                        mc.pos = mc.ref_pos = RealPair(loc, 0.5);
+                        mc.is_fixed = true;
+                    }
+                    break;
+                }
+
+                if (ci->cluster == ClusterId()) {
+                    // The case where we have no relative constraints
+                    for (auto sz : fb->at(nx).at(ny)) {
+                        // Look through all bels in this tile; checking region constraint if applicable
+                        if (!ci->testRegion(sz))
+                            continue;
+                        // Prefer available bels; unless we are dealing with a wide radius (e.g. difficult control sets)
+                        // or occasionally trigger a tiebreaker
+                        if (ctx->checkBelAvail(sz) || (radius > ripup_radius || ctx->rng(20000) < 10)) {
+                            CellInfo *bound = ctx->getBoundBelCell(sz);
+                            if (bound != nullptr) {
+                                // Only rip up cells without constraints
+                                if (bound->cluster != ClusterId())
+                                    continue;
+                                ctx->unbindBel(bound->bel);
+                            }
+                            // Provisionally bind the bel
+                            ctx->bindBel(sz, ci, STRENGTH_WEAK);
+                            if (require_validity && !ctx->isBelLocationValid(sz)) {
+                                // New location is not legal; unbind the cell (and rebind the cell we ripped up if
+                                // applicable)
+                                ctx->unbindBel(sz);
+                                if (bound != nullptr)
+                                    ctx->bindBel(sz, bound, STRENGTH_WEAK);
+                            } else if (iter_at_radius < need_to_explore) {
+                                // It's legal, but we haven't tried enough locations yet
+                                ctx->unbindBel(sz);
+                                if (bound != nullptr)
+                                    ctx->bindBel(sz, bound, STRENGTH_WEAK);
+                                int input_len = 0;
+                                // Compute a fast input wirelength metric at this bel; and save if better than our last
+                                // try
+                                for (auto &port : ci->ports) {
+                                    auto &p = port.second;
+                                    if (p.type != PORT_IN || p.net == nullptr || p.net->driver.cell == nullptr)
+                                        continue;
+                                    CellInfo *drv = p.net->driver.cell;
+                                    if (drv->udata == -1)
+                                        continue;
+                                    auto drv_loc = mcells.at(drv->udata);
+                                    input_len += std::abs(int(drv_loc.pos.x) - nx) + std::abs(int(drv_loc.pos.y) - ny);
+                                }
+                                if (input_len < best_inp_len) {
+                                    best_inp_len = input_len;
+                                    bestBel = sz;
+                                }
+                                break;
+                            } else {
+                                // It's legal, and we've tried enough. Finish.
+                                if (bound != nullptr)
+                                    enqueue_legalise(bound);
+                                Loc loc = ctx->getBelLocation(sz);
+                                if (ci->udata != -1) {
+                                    auto &mc = mcells.at(ci->udata);
+                                    mc.pos = mc.ref_pos = RealPair(loc, 0.5);
+                                    mc.is_fixed = true;
+                                }
+                                placed = true;
+                                break;
+                            }
+                        }
+                    }
+                } else {
+                    // We do have relative constraints
+                    for (auto sz : fb->at(nx).at(ny)) {
+                        // List of cells and their destination
+                        std::vector<std::pair<CellInfo *, BelId>> targets;
+                        // List of bels we placed things at; and the cell that was there before if applicable
+                        std::vector<std::pair<BelId, CellInfo *>> swaps_made;
+
+                        if (!ctx->getClusterPlacement(ci->cluster, sz, targets))
+                            continue;
+
+                        for (auto &target : targets) {
+                            // Check it satisfies the region constraint if applicable
+                            if (!target.first->testRegion(target.second))
+                                goto fail;
+                            CellInfo *bound = ctx->getBoundBelCell(target.second);
+                            // Chains cannot overlap; so if we have to ripup a cell make sure it isn't part of a chain
+                            if (bound != nullptr)
+                                if (bound->cluster != ClusterId() || bound->belStrength > STRENGTH_WEAK)
+                                    goto fail;
+                        }
+                        // Actually perform the move; keeping track of the moves we make so we can revert them if needed
+                        for (auto &target : targets) {
+                            CellInfo *bound = ctx->getBoundBelCell(target.second);
+                            if (bound != nullptr)
+                                ctx->unbindBel(target.second);
+                            ctx->bindBel(target.second, target.first, STRENGTH_STRONG);
+                            swaps_made.emplace_back(target.second, bound);
+                        }
+                        // Check that the move we have made is legal
+                        for (auto &sm : swaps_made) {
+                            if (!ctx->isBelLocationValid(sm.first))
+                                goto fail;
+                        }
+
+                        if (false) {
+                        fail:
+                            // If the move turned out to be illegal; revert all the moves we made
+                            for (auto &swap : swaps_made) {
+                                ctx->unbindBel(swap.first);
+                                if (swap.second != nullptr)
+                                    ctx->bindBel(swap.first, swap.second, STRENGTH_WEAK);
+                            }
+                            continue;
+                        }
+                        for (auto &target : targets) {
+                            Loc loc = ctx->getBelLocation(target.second);
+                            if (ci->udata != -1) {
+                                auto &mc = mcells.at(target.first->udata);
+                                mc.pos = mc.ref_pos = RealPair(loc, 0.5);
+                                mc.is_fixed = true;
+                            }
+                            // log_info("%s %d %d %d\n", target.first->name.c_str(ctx), loc.x, loc.y, loc.z);
+                        }
+                        for (auto &swap : swaps_made) {
+                            // Where we have ripped up cells; add them to the queue
+                            if (swap.second != nullptr)
+                                enqueue_legalise(swap.second);
+                        }
+
+                        placed = true;
+                        break;
+                    }
+                }
+
+                total_iters_for_cell++;
+            }
+        }
+    }
+
+  public:
+    StaticPlacer(Context *ctx, PlacerStaticCfg cfg) : ctx(ctx), cfg(cfg), fast_bels(ctx, true, 8), tmg(ctx)
+    {
+        groups.resize(cfg.cell_groups.size());
+    };
+    void place()
+    {
+        log_info("Running Static placer...\n");
+        init_bels();
+        prepare_cells();
+        init_cells();
+        init_nets();
+        insert_dark();
+        insert_spacer();
+
+        prepare_density_bins();
+        initialise();
+        bool legalised_ip = false;
+        while (true) {
+            step();
+            dens_penalty *= 1.025;
+            if (!legalised_ip) {
+                float ip_overlap = 0;
+                for (int i = 2; i < int(groups.size()); i++)
+                    ip_overlap = std::max(ip_overlap, groups.at(i).overlap);
+                if (ip_overlap < 0.15) {
+                    legalise_step(true);
+                    legalised_ip = true;
+                }
+            } else {
+                float logic_overlap = 0;
+                for (int i = 0; i < 2; i++)
+                    logic_overlap = std::max(logic_overlap, groups.at(i).overlap);
+                if (logic_overlap < 0.1) {
+                    legalise_step(false);
+                    break;
+                }
+            }
+            ++iter;
+        }
+        {
+            auto placer1_cfg = Placer1Cfg(ctx);
+            placer1_cfg.hpwl_scale_x = cfg.hpwl_scale_x;
+            placer1_cfg.hpwl_scale_y = cfg.hpwl_scale_y;
+            placer1_refine(ctx, placer1_cfg);
+        }
+    }
+};
+}; // namespace
+
+bool placer_static(Context *ctx, PlacerStaticCfg cfg)
+{
+    StaticPlacer(ctx, cfg).place();
+    return true;
+}
+
+PlacerStaticCfg::PlacerStaticCfg(Context *ctx)
+{
+    timing_driven = ctx->setting<bool>("timing_driven");
+
+    hpwl_scale_x = 1;
+    hpwl_scale_y = 1;
+}
+
+NEXTPNR_NAMESPACE_END
diff --git a/common/place/placer_static.h b/common/place/placer_static.h
new file mode 100644
index 00000000..adce1f41
--- /dev/null
+++ b/common/place/placer_static.h
@@ -0,0 +1,70 @@
+/*
+ *  nextpnr -- Next Generation Place and Route
+ *
+ *  Copyright (C) 2022-23  gatecat <gatecat@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ *
+ */
+
+#ifndef PLACER_STATIC_H
+#define PLACER_STATIC_H
+#include "log.h"
+#include "nextpnr.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+struct StaticRect
+{
+    StaticRect() : w(0), h(0){};
+    StaticRect(float w, float h) : w(w), h(h){};
+    float w, h;
+    float area() const { return w * h; }
+};
+
+struct StaticCellGroupCfg
+{
+    // name of the group for debugging purposes
+    IdString name;
+    // bel buckets in this group
+    pool<BelBucketId> bel_buckets;
+    // cell & bel types in this group and the (normalised) area of that cell/bel type
+    dict<IdString, StaticRect> cell_area, bel_area;
+    // these cells (generally auxilliary CLB cells like RAMW; carry; MUX) are considered to have zero area when part of
+    // a macro with other non-zero-area cells
+    pool<IdString> zero_area_cells;
+    // size of spacers to insert
+    StaticRect spacer_rect{0.5, 0.5};
+};
+
+struct PlacerStaticCfg
+{
+    PlacerStaticCfg(Context *ctx);
+
+    // These cell types will be randomly locked to prevent singular matrices
+    pool<IdString> ioBufTypes;
+    int hpwl_scale_x = 1;
+    int hpwl_scale_y = 1;
+    bool timing_driven = false;
+    // for calculating timing estimates based on distance
+    // estimate = c + mx*dx + my * dy
+    delay_t timing_c = 100, timing_mx = 100, timing_my = 100;
+    // groups of cells that should be placed together.
+    // group 0 must be LUTs and group 1 must be FFs, further groups for BRAM/DSP/misc
+    std::vector<StaticCellGroupCfg> cell_groups;
+};
+
+extern bool placer_static(Context *ctx, PlacerStaticCfg cfg);
+NEXTPNR_NAMESPACE_END
+#endif
\ No newline at end of file
diff --git a/common/place/static_util.h b/common/place/static_util.h
new file mode 100644
index 00000000..a3dc5c93
--- /dev/null
+++ b/common/place/static_util.h
@@ -0,0 +1,142 @@
+/*
+ *  nextpnr -- Next Generation Place and Route
+ *
+ *  Copyright (C) 2023  gatecat <gatecat@ds0.me>
+ *
+ *  Permission to use, copy, modify, and/or distribute this software for any
+ *  purpose with or without fee is hereby granted, provided that the above
+ *  copyright notice and this permission notice appear in all copies.
+ *
+ *  THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ *  WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ *  MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ *  ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ *  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ *  ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ *  OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ */
+
+#ifndef STATIC_UTIL_H
+#define STATIC_UTIL_H
+
+#include <fstream>
+#include "nextpnr_assertions.h"
+#include "nextpnr_namespaces.h"
+
+NEXTPNR_NAMESPACE_BEGIN
+
+namespace StaticUtil {
+
+enum class Axis
+{
+    X,
+    Y
+};
+
+struct RealPair
+{
+    RealPair() : x(0), y(0){};
+    RealPair(float x, float y) : x(x), y(y){};
+    explicit RealPair(Loc l, float bias = 0.0f) : x(l.x + bias), y(l.y + bias){};
+    float x, y;
+    RealPair &operator+=(const RealPair &other)
+    {
+        x += other.x;
+        y += other.y;
+        return *this;
+    }
+    RealPair &operator/=(float factor)
+    {
+        x /= factor;
+        y /= factor;
+        return *this;
+    }
+    friend RealPair operator+(RealPair a, RealPair b);
+    friend RealPair operator-(RealPair a, RealPair b);
+    RealPair operator*(float factor) const { return RealPair(x * factor, y * factor); }
+    RealPair operator/(float factor) const { return RealPair(x / factor, y / factor); }
+    // to simplify axis-generic code
+    inline float &at(Axis axis) { return (axis == Axis::Y) ? y : x; }
+    inline const float &at(Axis axis) const { return (axis == Axis::Y) ? y : x; }
+};
+inline RealPair operator+(RealPair a, RealPair b) { return RealPair(a.x + b.x, a.y + b.y); }
+inline RealPair operator-(RealPair a, RealPair b) { return RealPair(a.x - b.x, a.y - b.y); }
+
+// array2d; but as ourafft wants it
+struct FFTArray
+{
+    FFTArray(int width = 0, int height = 0) : m_width(width), m_height(height)
+    {
+        alloc();
+        fill(0);
+    }
+
+    void fill(float value)
+    {
+        for (int x = 0; x < m_width; x++)
+            for (int y = 0; y < m_height; y++)
+                m_data[x][y] = value;
+    }
+
+    void reset(int width, int height, float value = 0)
+    {
+        if (width != m_height || height != m_height) {
+            destroy();
+            m_width = width;
+            m_height = height;
+            alloc();
+        }
+        fill(value);
+    }
+
+    float &at(int x, int y)
+    {
+        NPNR_ASSERT(x >= 0 && x < m_width && y >= 0 && y < m_height);
+        return m_data[x][y];
+    }
+    float at(int x, int y) const
+    {
+        NPNR_ASSERT(x >= 0 && x < m_width && y >= 0 && y < m_height);
+        return m_data[x][y];
+    }
+    float **data() { return m_data; }
+
+    void write_csv(const std::string &filename) const
+    {
+        std::ofstream out(filename);
+        NPNR_ASSERT(out);
+        for (int y = 0; y < m_height; y++) {
+            for (int x = 0; x < m_width; x++) {
+                out << at(x, y) << ",";
+            }
+            out << std::endl;
+        }
+    }
+
+    ~FFTArray() { destroy(); }
+
+  private:
+    int m_width, m_height;
+    float **m_data = nullptr;
+    void alloc()
+    {
+        if (m_width == 0)
+            return;
+        m_data = new float *[m_width];
+        for (int x = 0; x < m_width; x++)
+            m_data[x] = (m_height > 0) ? (new float[m_height]) : nullptr;
+    }
+    void destroy()
+    {
+        for (int x = 0; x < m_width; x++)
+            delete[] m_data[x];
+        delete[] m_data;
+    }
+};
+
+}; // namespace StaticUtil
+
+NEXTPNR_NAMESPACE_END
+
+#endif
diff --git a/ecp5/arch.cc b/ecp5/arch.cc
index a0b2644c..5d46f881 100644
--- a/ecp5/arch.cc
+++ b/ecp5/arch.cc
@@ -30,6 +30,7 @@
 #include "nextpnr.h"
 #include "placer1.h"
 #include "placer_heap.h"
+#include "placer_static.h"
 #include "router1.h"
 #include "router2.h"
 #include "timing.h"
@@ -586,6 +587,49 @@ delay_t Arch::getRipupDelayPenalty() const { return 400; }
 
 // -----------------------------------------------------------------------
 
+namespace {
+void configure_static(Arch *arch, PlacerStaticCfg &cfg)
+{
+    {
+        cfg.cell_groups.emplace_back();
+        auto &comb = cfg.cell_groups.back();
+        comb.name = arch->id("COMB");
+        comb.bel_area[id_TRELLIS_COMB] = StaticRect(1.0f, 0.125f);
+        comb.bel_area[id_TRELLIS_RAMW] = StaticRect(1.0f, 0.125f);
+        comb.cell_area[id_TRELLIS_COMB] = StaticRect(1.0f, 0.125f);
+        comb.cell_area[id_TRELLIS_RAMW] = StaticRect(1.0f, 0.125f);
+        comb.zero_area_cells.insert(id_TRELLIS_RAMW);
+        comb.spacer_rect = StaticRect(1.0f, 0.125f);
+    }
+
+    {
+        cfg.cell_groups.emplace_back();
+        auto &comb = cfg.cell_groups.back();
+        comb.name = arch->id("FF");
+        comb.cell_area[id_TRELLIS_FF] = StaticRect(1.0f, 0.125f);
+        comb.bel_area[id_TRELLIS_FF] = StaticRect(1.0f, 0.125f);
+        comb.spacer_rect = StaticRect(1.0f, 0.125f);
+    }
+
+    {
+        cfg.cell_groups.emplace_back();
+        auto &comb = cfg.cell_groups.back();
+        comb.name = arch->id("RAM");
+        comb.cell_area[id_DP16KD] = StaticRect(2.0f, 1.0f);
+        comb.bel_area[id_DP16KD] = StaticRect(2.0f, 1.0f);
+        comb.spacer_rect = StaticRect(2.0f, 1.0f);
+    }
+    {
+        cfg.cell_groups.emplace_back();
+        auto &comb = cfg.cell_groups.back();
+        comb.name = arch->id("MUL");
+        comb.cell_area[id_MULT18X18D] = StaticRect(2.0f, 1.0f);
+        comb.bel_area[id_MULT18X18D] = StaticRect(2.0f, 1.0f);
+        comb.spacer_rect = StaticRect(2.0f, 1.0f);
+    }
+}
+} // namespace
+
 bool Arch::place()
 {
     std::string placer = str_or_default(settings, id_placer, defaultPlacer);
@@ -609,6 +653,11 @@ bool Arch::place()
 
         if (!placer_heap(getCtx(), cfg))
             return false;
+    } else if (placer == "static") {
+        PlacerStaticCfg cfg(getCtx());
+        configure_static(this, cfg);
+        if (!placer_static(getCtx(), cfg))
+            return false;
     } else if (placer == "sa") {
         if (!placer1(getCtx(), Placer1Cfg(getCtx())))
             return false;
@@ -1226,7 +1275,7 @@ WireId Arch::get_bank_eclk(int bank, int eclk)
 
 const std::string Arch::defaultPlacer = "heap";
 
-const std::vector<std::string> Arch::availablePlacers = {"sa", "heap"};
+const std::vector<std::string> Arch::availablePlacers = {"sa", "heap", "static"};
 
 const std::string Arch::defaultRouter = "router1";
 const std::vector<std::string> Arch::availableRouters = {"router1", "router2"};