FloatFFT_2D.java example

Explorer
GranolaSDR-master
- src
  - edu
    - emory
      - mathcs
        jtransforms
        fft
        AccuracyCheckDoubleFFT.java
        AccuracyCheckFloatFFT.java
        BenchmarkDoubleFFT.java
        BenchmarkFloatFFT.java
        DoubleFFT_1D.java
        DoubleFFT_2D.java
        DoubleFFT_3D.java
        FloatFFT_1D.java
        FloatFFT_2D.java
        FloatFFT_3D.java
        utils
        ConcurrencyUtils.java
        IOUtils.java
  - granolasdr
/* ***** BEGIN LICENSE BLOCK *****
 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
 *
 * The contents of this file are subject to the Mozilla Public License Version
 * 1.1 (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is JTransforms.
 *
 * The Initial Developer of the Original Code is
 * Piotr Wendykier, Emory University.
 * Portions created by the Initial Developer are Copyright (C) 2007
 * the Initial Developer. All Rights Reserved.
 *
 * Alternatively, the contents of this file may be used under the terms of
 * either the GNU General Public License Version 2 or later (the "GPL"), or
 * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
 * in which case the provisions of the GPL or the LGPL are applicable instead
 * of those above. If you wish to allow use of your version of this file only
 * under the terms of either the GPL or the LGPL, and not to allow others to
 * use your version of this file under the terms of the MPL, indicate your
 * decision by deleting the provisions above and replace them with the notice
 * and other provisions required by the GPL or the LGPL. If you do not delete
 * the provisions above, a recipient may use your version of this file under
 * the terms of any one of the MPL, the GPL or the LGPL.
 *
 * ***** END LICENSE BLOCK ***** */

package edu.emory.mathcs.jtransforms.fft;

import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;

import edu.emory.mathcs.utils.ConcurrencyUtils;

/**
 * Computes 2D Discrete Fourier Transform (DFT) of complex and real, single
 * precision data. The sizes of both dimensions must be power-of-two numbers.
 * This is a parallel implementation of split-radix algorithm optimized for SMP
 * systems. <br>
 * <br>
 * This code is derived from General Purpose FFT Package written by Takuya Ooura
 * (http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html)
 * 
 * @author Piotr Wendykier (piotr.wendykier@gmail.com)
 * 
 */
public class FloatFFT_2D {

    private int n1;

    private int n2;

    private int[] ip;

    private float[] w;

    private float[] t;

    private FloatFFT_1D fftn2, fftn1;

    private int oldNthread;

    private int nt;

    /**
     * Creates new instance of FloatFFT_2D.
     * 
     * @param n1
     *            number of rows - must be a power-of-two number
     * @param n2
     *            number of columns - must be a power-of-two number
     */
    public FloatFFT_2D(int n1, int n2) {
        if (!ConcurrencyUtils.isPowerOf2(n1) || !ConcurrencyUtils.isPowerOf2(n2))
            throw new IllegalArgumentException("n1, n2 must be power of two numbers");
        if (n1 <= 1 || n2 <= 1) {
            throw new IllegalArgumentException("n1, n2 must be greater than 1");
        }
        this.n1 = n1;
        this.n2 = n2;
        ip = new int[2 + (int) Math.ceil(Math.sqrt(Math.max(n1, n2)))];
        this.w = new float[(int) Math.ceil(Math.max(Math.max(n1 / 2, n2 / 2), Math.max(n1 / 2, n2 / 4) + n2 / 4))];
        fftn2 = new FloatFFT_1D(n2, ip, w);
        fftn1 = new FloatFFT_1D(n1, ip, w);
        oldNthread = ConcurrencyUtils.getNumberOfProcessors();
        nt = 8 * oldNthread * n1;
        if (2 * n2 == 4 * oldNthread) {
            nt >>= 1;
        } else if (2 * n2 < 4 * oldNthread) {
            nt >>= 2;
        }
        t = new float[nt];
    }

    /**
     * Computes 2D forward DFT of complex data leaving the result in
     * <code>a</code>. The data is stored in 1D array in row-major order.
     * Complex number is stored as two float values in sequence: the real and
     * imaginary part, i.e. the input array must be of size n1*2*n2. The
     * physical layout of the input data is as follows:
     * 
     * <pre>
     * a[k1*2*n2+2*k2] = Re[k1][k2], 
     * a[k1*2*n2+2*k2+1] = Im[k1][k2], 0<=k1<n1, 0<=k2<n2,
     * </pre>
     * 
     * @param a
     *            data to transform
     */
    public void complexForward(float[] a) {
        int nthread, n, i;
        int oldn2 = n2;
        n2 = 2 * n2;

        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        if (n > (ip[0] << 2)) {
            makewt(n >> 2);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * oldn2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth1(0, -1, a, true);
            cdft2d_subth(-1, a, true);
        } else {
            for (i = 0; i < n1; i++) {
                fftn2.complexForward(a, i * n2);
            }
            cdft2d_sub(-1, a, true);
        }
        n2 = oldn2;
    }

    /**
     * Computes 2D forward DFT of complex data leaving the result in
     * <code>a</code>. The data is stored in 2D array. Complex data is
     * represented by 2 float values in sequence: the real and imaginary part,
     * i.e. the input array must be of size n1 by 2*n2. The physical layout of
     * the input data is as follows:
     * 
     * <pre>
     * a[k1][2*k2] = Re[k1][k2], 
     * a[k1][2*k2+1] = Im[k1][k2], 0<=k1<n1, 0<=k2<n2,
     * </pre>
     * 
     * @param a
     *            data to transform
     */
    public void complexForward(float[][] a) {
        int nthread, n, i;
        int oldn2 = n2;
        n2 = 2 * n2;

        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        if (n > (ip[0] << 2)) {
            makewt(n >> 2);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * oldn2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth1(0, -1, a, true);
            cdft2d_subth(-1, a, true);
        } else {
            for (i = 0; i < n1; i++) {
                fftn2.complexForward(a[i]);
            }
            cdft2d_sub(-1, a, true);
        }
        n2 = oldn2;
    }

    /**
     * Computes 2D inverse DFT of complex data leaving the result in
     * <code>a</code>. The data is stored in 1D array in row-major order.
     * Complex number is stored as two float values in sequence: the real and
     * imaginary part, i.e. the input array must be of size n1*2*n2. The
     * physical layout of the input data is as follows:
     * 
     * <pre>
     * a[k1*2*n2+2*k2] = Re[k1][k2], 
     * a[k1*2*n2+2*k2+1] = Im[k1][k2], 0<=k1<n1, 0<=k2<n2,
     * </pre>
     * 
     * @param a
     *            data to transform
     * @param scale
     *            if true then scaling is performed
     * 
     */
    public void complexInverse(float[] a, boolean scale) {
        int nthread, n, i;
        int oldn2 = n2;
        n2 = 2 * n2;
        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        if (n > (ip[0] << 2)) {
            makewt(n >> 2);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * oldn2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth1(0, 1, a, scale);
            cdft2d_subth(1, a, scale);
        } else {

            for (i = 0; i < n1; i++) {
                fftn2.complexInverse(a, i * n2, scale);
            }
            cdft2d_sub(1, a, scale);
        }
        n2 = oldn2;
    }

    /**
     * Computes 2D inverse DFT of complex data leaving the result in
     * <code>a</code>. The data is stored in 2D array. Complex data is
     * represented by 2 float values in sequence: the real and imaginary part,
     * i.e. the input array must be of size n1 by 2*n2. The physical layout of
     * the input data is as follows:
     * 
     * <pre>
     * a[k1][2*k2] = Re[k1][k2], 
     * a[k1][2*k2+1] = Im[k1][k2], 0<=k1<n1, 0<=k2<n2,
     * </pre>
     * 
     * @param a
     *            data to transform
     * @param scale
     *            if true then scaling is performed
     * 
     */
    public void complexInverse(float[][] a, boolean scale) {
        int nthread, n, i;
        int oldn2 = n2;
        n2 = 2 * n2;
        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        if (n > (ip[0] << 2)) {
            makewt(n >> 2);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * oldn2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth1(0, 1, a, scale);
            cdft2d_subth(1, a, scale);
        } else {

            for (i = 0; i < n1; i++) {
                fftn2.complexInverse(a[i], scale);
            }
            cdft2d_sub(1, a, scale);
        }
        n2 = oldn2;
    }

    /**
     * Computes 2D forward DFT of real data leaving the result in <code>a</code>
     * . The physical layout of the output data is as follows:
     * 
     * <pre>
     * a[k1*n2+2*k2] = Re[k1][k2] = Re[n1-k1][n2-k2], 
     * a[k1*n2+2*k2+1] = Im[k1][k2] = -Im[n1-k1][n2-k2], 
     *       0<k1<n1, 0<k2<n2/2, 
     * a[2*k2] = Re[0][k2] = Re[0][n2-k2], 
     * a[2*k2+1] = Im[0][k2] = -Im[0][n2-k2], 
     *       0<k2<n2/2, 
     * a[k1*n2] = Re[k1][0] = Re[n1-k1][0], 
     * a[k1*n2+1] = Im[k1][0] = -Im[n1-k1][0], 
     * a[(n1-k1)*n2+1] = Re[k1][n2/2] = Re[n1-k1][n2/2], 
     * a[(n1-k1)*n2] = -Im[k1][n2/2] = Im[n1-k1][n2/2], 
     *       0<k1<n1/2, 
     * a[0] = Re[0][0], 
     * a[1] = Re[0][n2/2], 
     * a[(n1/2)*n2] = Re[n1/2][0], 
     * a[(n1/2)*n2+1] = Re[n1/2][n2/2]
     * </pre>
     * 
     * This method computes only half of the elements of the real transform. The
     * other half satisfies the symmetry condition. If you want the full real
     * forward transform, use <code>realForwardFull</code>. To get back the
     * original data, use <code>realInverse</code> on the output of this method.
     * 
     * @param a
     *            data to transform
     */
    public void realForward(float[] a) {
        int nthread, n, nw, nc, i;

        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        nw = ip[0];
        if (n > (nw << 2)) {
            nw = n >> 2;
            makewt(nw);
        }
        nc = ip[1];
        if (n2 > (nc << 2)) {
            nc = n2 >> 2;
            makect(nc, w, nw);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth1(1, 1, a, true);
            cdft2d_subth(-1, a, true);
            rdft2d_sub(1, a);
        } else {
            for (i = 0; i < n1; i++) {
                fftn2.realForward(a, i * n2);
            }
            cdft2d_sub(-1, a, true);
            rdft2d_sub(1, a);
        }
    }

    /**
     * Computes 2D forward DFT of real data leaving the result in <code>a</code>
     * . The physical layout of the output data is as follows:
     * 
     * <pre>
     * a[k1][2*k2] = Re[k1][k2] = Re[n1-k1][n2-k2], 
     * a[k1][2*k2+1] = Im[k1][k2] = -Im[n1-k1][n2-k2], 
     *       0<k1<n1, 0<k2<n2/2, 
     * a[0][2*k2] = Re[0][k2] = Re[0][n2-k2], 
     * a[0][2*k2+1] = Im[0][k2] = -Im[0][n2-k2], 
     *       0<k2<n2/2, 
     * a[k1][0] = Re[k1][0] = Re[n1-k1][0], 
     * a[k1][1] = Im[k1][0] = -Im[n1-k1][0], 
     * a[n1-k1][1] = Re[k1][n2/2] = Re[n1-k1][n2/2], 
     * a[n1-k1][0] = -Im[k1][n2/2] = Im[n1-k1][n2/2], 
     *       0<k1<n1/2, 
     * a[0][0] = Re[0][0], 
     * a[0][1] = Re[0][n2/2], 
     * a[n1/2][0] = Re[n1/2][0], 
     * a[n1/2][1] = Re[n1/2][n2/2]
     * </pre>
     * 
     * This method computes only half of the elements of the real transform. The
     * other half satisfies the symmetry condition. If you want the full real
     * forward transform, use <code>realForwardFull</code>. To get back the
     * original data, use <code>realInverse</code> on the output of this method.
     * 
     * @param a
     *            data to transform
     */
    public void realForward(float[][] a) {
        int nthread, n, nw, nc, i;

        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        nw = ip[0];
        if (n > (nw << 2)) {
            nw = n >> 2;
            makewt(nw);
        }
        nc = ip[1];
        if (n2 > (nc << 2)) {
            nc = n2 >> 2;
            makect(nc, w, nw);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth1(1, 1, a, true);
            cdft2d_subth(-1, a, true);
            rdft2d_sub(1, a);
        } else {
            for (i = 0; i < n1; i++) {
                fftn2.realForward(a[i]);
            }
            cdft2d_sub(-1, a, true);
            rdft2d_sub(1, a);
        }
    }

    /**
     * Computes 2D forward DFT of real data leaving the result in <code>a</code>
     * . This method computes the full real forward transform, i.e. you will get
     * the same result as from <code>complexForward</code> called with all
     * imaginary part equal 0. Because the result is stored in <code>a</code>,
     * the input array must be of size n1*2*n2, with only the first n1*n2
     * elements filled with real data. To get back the original data, use
     * <code>complexInverse</code> on the output of this method.
     * 
     * @param a
     *            data to transform
     */
    public void realForwardFull(float[] a) {
        int nthread, n, nw, nc, i, newn2;
        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        nw = ip[0];
        if (n > (nw << 2)) {
            nw = n >> 2;
            makewt(nw);
        }
        nc = ip[1];
        if (n2 > (nc << 2)) {
            nc = n2 >> 2;
            makect(nc, w, nw);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth1(1, 1, a, true);
            cdft2d_subth(-1, a, true);
            rdft2d_sub(1, a);
        } else {
            for (i = 0; i < n1; i++) {
                fftn2.realForward(a, i * n2);
            }
            cdft2d_sub(-1, a, true);
            rdft2d_sub(1, a);
        }
        newn2 = 2 * n2;
        int idx1, idx2, idx3;
        int n1d2 = n1 / 2;

        for (int k1 = (n1 - 1); k1 >= 1; k1--) {
            idx1 = k1 * n2;
            idx2 = 2 * idx1;
            for (int k2 = 0; k2 < n2; k2 += 2) {
                a[idx2 + k2] = a[idx1 + k2];
                a[idx1 + k2] = 0;
                a[idx2 + k2 + 1] = a[idx1 + k2 + 1];
                a[idx1 + k2 + 1] = 0;
            }
        }

        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            fillSymmetric(a);
        } else {
            for (int k1 = 1; k1 < n1d2; k1++) {
                idx2 = k1 * newn2;
                idx3 = (n1 - k1) * newn2;
                a[idx2 + n2] = a[idx3 + 1];
                a[idx2 + n2 + 1] = -a[idx3];
            }
            for (int k1 = 1; k1 < n1d2; k1++) {
                for (int k2 = n2 + 2; k2 < newn2; k2 += 2) {
                    idx2 = k1 * newn2;
                    idx3 = (n1 - k1) * newn2;
                    a[idx2 + k2] = a[idx3 + newn2 - k2];
                    a[idx2 + k2 + 1] = -a[idx3 + newn2 - k2 + 1];

                }
            }
            for (int k1 = 0; k1 <= n1 / 2; k1++) {
                for (int k2 = 0; k2 < newn2; k2 += 2) {
                    idx2 = k1 * newn2 + k2;
                    idx3 = ((n1 - k1) % n1) * newn2 + (newn2 - k2) % newn2;
                    a[idx3] = a[idx2];
                    a[idx3 + 1] = -a[idx2 + 1];
                }
            }
        }
        a[n2] = -a[1];
        a[1] = 0;
        a[n1d2 * newn2 + n2] = -a[n1d2 * newn2 + 1];
        a[n1d2 * newn2 + 1] = 0;
        a[n1d2 * newn2 + n2 + 1] = 0;
    }

    /**
     * Computes 2D forward DFT of real data leaving the result in <code>a</code>
     * . This method computes the full real forward transform, i.e. you will get
     * the same result as from <code>complexForward</code> called with all
     * imaginary part equal 0. Because the result is stored in <code>a</code>,
     * the input array must be of size n1 by 2*n2, with only the first n1 by n2
     * elements filled with real data. To get back the original data, use
     * <code>complexInverse</code> on the output of this method.
     * 
     * @param a
     *            data to transform
     */
    public void realForwardFull(float[][] a) {
        int nthread, n, nw, nc, i, newn2;
        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        nw = ip[0];
        if (n > (nw << 2)) {
            nw = n >> 2;
            makewt(nw);
        }
        nc = ip[1];
        if (n2 > (nc << 2)) {
            nc = n2 >> 2;
            makect(nc, w, nw);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth1(1, 1, a, true);
            cdft2d_subth(-1, a, true);
            rdft2d_sub(1, a);
        } else {
            for (i = 0; i < n1; i++) {
                fftn2.realForward(a[i]);
            }
            cdft2d_sub(-1, a, true);
            rdft2d_sub(1, a);
        }
        newn2 = 2 * n2;
        int n1d2 = n1 / 2;

        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            fillSymmetric(a);
        } else {
            for (int k1 = 1; k1 < n1d2; k1++) {
                a[k1][n2] = a[n1 - k1][1];
                a[k1][n2 + 1] = -a[n1 - k1][0];
            }
            for (int k1 = 1; k1 < n1d2; k1++) {
                for (int k2 = n2 + 2; k2 < newn2; k2 += 2) {
                    a[k1][k2] = a[n1 - k1][newn2 - k2];
                    a[k1][k2 + 1] = -a[n1 - k1][newn2 - k2 + 1];

                }
            }

            for (int k1 = 0; k1 <= n1 / 2; k1++) {
                for (int k2 = 0; k2 < newn2; k2 += 2) {
                    a[(n1 - k1) % n1][(newn2 - k2) % newn2] = a[k1][k2];
                    a[(n1 - k1) % n1][(newn2 - k2) % newn2 + 1] = -a[k1][k2 + 1];
                }
            }
        }

        a[0][n2] = -a[0][1];
        a[0][1] = 0;
        a[n1d2][n2] = -a[n1d2][1];
        a[n1d2][1] = 0;
        a[n1d2][n2 + 1] = 0;
    }

    /**
     * Computes 2D inverse DFT of real data leaving the result in <code>a</code>
     * . The physical layout of the input data has to be as follows:
     * 
     * <pre>
     * a[k1*n2+2*k2] = Re[k1][k2] = Re[n1-k1][n2-k2], 
     * a[k1*n2+2*k2+1] = Im[k1][k2] = -Im[n1-k1][n2-k2], 
     *       0<k1<n1, 0<k2<n2/2, 
     * a[2*k2] = Re[0][k2] = Re[0][n2-k2], 
     * a[2*k2+1] = Im[0][k2] = -Im[0][n2-k2], 
     *       0<k2<n2/2, 
     * a[k1*n2] = Re[k1][0] = Re[n1-k1][0], 
     * a[k1*n2+1] = Im[k1][0] = -Im[n1-k1][0], 
     * a[(n1-k1)*n2+1] = Re[k1][n2/2] = Re[n1-k1][n2/2], 
     * a[(n1-k1)*n2] = -Im[k1][n2/2] = Im[n1-k1][n2/2], 
     *       0<k1<n1/2, 
     * a[0] = Re[0][0], 
     * a[1] = Re[0][n2/2], 
     * a[(n1/2)*n2] = Re[n1/2][0], 
     * a[(n1/2)*n2+1] = Re[n1/2][n2/2]
     * </pre>
     * 
     * This method computes only half of the elements of the real transform. The
     * other half satisfies the symmetry condition. If you want the full real
     * inverse transform, use <code>realInverseFull</code>.
     * 
     * @param a
     *            data to transform
     * 
     * @param scale
     *            if true then scaling is performed
     */
    public void realInverse(float[] a, boolean scale) {
        int nthread, n, nw, nc, i;

        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        nw = ip[0];
        if (n > (nw << 2)) {
            nw = n >> 2;
            makewt(nw);
        }
        nc = ip[1];
        if (n2 > (nc << 2)) {
            nc = n2 >> 2;
            makect(nc, w, nw);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            rdft2d_sub(-1, a);
            cdft2d_subth(1, a, scale);
            xdft2d0_subth1(1, -1, a, scale);
        } else {
            rdft2d_sub(-1, a);
            cdft2d_sub(1, a, scale);
            for (i = 0; i < n1; i++) {
                fftn2.realInverse(a, i * n2, scale);
            }
        }
    }

    /**
     * Computes 2D inverse DFT of real data leaving the result in <code>a</code>
     * . The physical layout of the input data has to be as follows:
     * 
     * <pre>
     * a[k1][2*k2] = Re[k1][k2] = Re[n1-k1][n2-k2], 
     * a[k1][2*k2+1] = Im[k1][k2] = -Im[n1-k1][n2-k2], 
     *       0<k1<n1, 0<k2<n2/2, 
     * a[0][2*k2] = Re[0][k2] = Re[0][n2-k2], 
     * a[0][2*k2+1] = Im[0][k2] = -Im[0][n2-k2], 
     *       0<k2<n2/2, 
     * a[k1][0] = Re[k1][0] = Re[n1-k1][0], 
     * a[k1][1] = Im[k1][0] = -Im[n1-k1][0], 
     * a[n1-k1][1] = Re[k1][n2/2] = Re[n1-k1][n2/2], 
     * a[n1-k1][0] = -Im[k1][n2/2] = Im[n1-k1][n2/2], 
     *       0<k1<n1/2, 
     * a[0][0] = Re[0][0], 
     * a[0][1] = Re[0][n2/2], 
     * a[n1/2][0] = Re[n1/2][0], 
     * a[n1/2][1] = Re[n1/2][n2/2]
     * </pre>
     * 
     * This method computes only half of the elements of the real transform. The
     * other half satisfies the symmetry condition. If you want the full real
     * inverse transform, use <code>realInverseFull</code>.
     * 
     * @param a
     *            data to transform
     * 
     * @param scale
     *            if true then scaling is performed
     */
    public void realInverse(float[][] a, boolean scale) {
        int nthread, n, nw, nc, i;

        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        nw = ip[0];
        if (n > (nw << 2)) {
            nw = n >> 2;
            makewt(nw);
        }
        nc = ip[1];
        if (n2 > (nc << 2)) {
            nc = n2 >> 2;
            makect(nc, w, nw);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            rdft2d_sub(-1, a);
            cdft2d_subth(1, a, scale);
            xdft2d0_subth1(1, -1, a, scale);
        } else {
            rdft2d_sub(-1, a);
            cdft2d_sub(1, a, scale);
            for (i = 0; i < n1; i++) {
                fftn2.realInverse(a[i], scale);
            }
        }
    }

    /**
     * Computes 2D inverse DFT of real data leaving the result in <code>a</code>
     * . This method computes the full real inverse transform, i.e. you will get
     * the same result as from <code>complexInverse</code> called with all
     * imaginary part equal 0. Because the result is stored in <code>a</code>,
     * the input array must be of size n1*2*n2, with only the first n1*n2
     * elements filled with real data.
     * 
     * @param a
     *            data to transform
     * 
     * @param scale
     *            if true then scaling is performed
     */
    public void realInverseFull(float[] a, boolean scale) {
        int nthread, n, nw, nc, i, newn2;
        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        nw = ip[0];
        if (n > (nw << 2)) {
            nw = n >> 2;
            makewt(nw);
        }
        nc = ip[1];
        if (n2 > (nc << 2)) {
            nc = n2 >> 2;
            makect(nc, w, nw);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth2(1, -1, a, scale);
            cdft2d_subth(1, a, scale);
            rdft2d_sub(1, a);
        } else {
            for (i = 0; i < n1; i++) {
                fftn2.realInverse2(a, i * n2, scale);
            }
            cdft2d_sub(1, a, scale);
            rdft2d_sub(1, a);
        }
        newn2 = 2 * n2;
        int idx1, idx2, idx3;
        int n1d2 = n1 / 2;

        for (int k1 = (n1 - 1); k1 >= 1; k1--) {
            idx1 = k1 * n2;
            idx2 = 2 * idx1;
            for (int k2 = 0; k2 < n2; k2 += 2) {
                a[idx2 + k2] = a[idx1 + k2];
                a[idx1 + k2] = 0;
                a[idx2 + k2 + 1] = a[idx1 + k2 + 1];
                a[idx1 + k2 + 1] = 0;
            }
        }

        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            fillSymmetric(a);
        } else {
            for (int k1 = 1; k1 < n1d2; k1++) {
                idx2 = k1 * newn2;
                idx3 = (n1 - k1) * newn2;
                a[idx2 + n2] = a[idx3 + 1];
                a[idx2 + n2 + 1] = -a[idx3];
            }
            for (int k1 = 1; k1 < n1d2; k1++) {
                for (int k2 = n2 + 2; k2 < newn2; k2 += 2) {
                    idx2 = k1 * newn2;
                    idx3 = (n1 - k1) * newn2;
                    a[idx2 + k2] = a[idx3 + newn2 - k2];
                    a[idx2 + k2 + 1] = -a[idx3 + newn2 - k2 + 1];

                }
            }
            for (int k1 = 0; k1 <= n1 / 2; k1++) {
                for (int k2 = 0; k2 < newn2; k2 += 2) {
                    idx2 = k1 * newn2 + k2;
                    idx3 = ((n1 - k1) % n1) * newn2 + (newn2 - k2) % newn2;
                    a[idx3] = a[idx2];
                    a[idx3 + 1] = -a[idx2 + 1];
                }
            }
        }
        a[n2] = -a[1];
        a[1] = 0;
        a[n1d2 * newn2 + n2] = -a[n1d2 * newn2 + 1];
        a[n1d2 * newn2 + 1] = 0;
        a[n1d2 * newn2 + n2 + 1] = 0;
    }

    /**
     * Computes 2D inverse DFT of real data leaving the result in <code>a</code>
     * . This method computes the full real inverse transform, i.e. you will get
     * the same result as from <code>complexInverse</code> called with all
     * imaginary part equal 0. Because the result is stored in <code>a</code>,
     * the input array must be of size n1 by 2*n2, with only the first n1 by n2
     * elements filled with real data.
     * 
     * @param a
     *            data to transform
     * 
     * @param scale
     *            if true then scaling is performed
     */
    public void realInverseFull(float[][] a, boolean scale) {
        int nthread, n, nw, nc, i, newn2;
        n = n1 << 1;
        if (n < n2) {
            n = n2;
        }
        nw = ip[0];
        if (n > (nw << 2)) {
            nw = n >> 2;
            makewt(nw);
        }
        nc = ip[1];
        if (n2 > (nc << 2)) {
            nc = n2 >> 2;
            makect(nc, w, nw);
        }
        nthread = ConcurrencyUtils.getNumberOfProcessors();
        if (nthread != oldNthread) {
            nt = 8 * nthread * n1;
            if (n2 == 4 * nthread) {
                nt >>= 1;
            } else if (n2 < 4 * nthread) {
                nt >>= 2;
            }
            t = new float[nt];
            oldNthread = nthread;
        }
        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            xdft2d0_subth2(1, -1, a, scale);
            cdft2d_subth(1, a, scale);
            rdft2d_sub(1, a);
        } else {
            for (i = 0; i < n1; i++) {
                fftn2.realInverse2(a[i], 0, scale);
            }
            cdft2d_sub(1, a, scale);
            rdft2d_sub(1, a);
        }
        newn2 = 2 * n2;
        int n1d2 = n1 / 2;

        if ((nthread > 1) && (n1 * n2 >= ConcurrencyUtils.getThreadsBeginN_2D())) {
            fillSymmetric(a);
        } else {
            for (int k1 = 1; k1 < n1d2; k1++) {
                a[k1][n2] = a[n1 - k1][1];
                a[k1][n2 + 1] = -a[n1 - k1][0];
            }
            for (int k1 = 1; k1 < n1d2; k1++) {
                for (int k2 = n2 + 2; k2 < newn2; k2 += 2) {
                    a[k1][k2] = a[n1 - k1][newn2 - k2];
                    a[k1][k2 + 1] = -a[n1 - k1][newn2 - k2 + 1];

                }
            }

            for (int k1 = 0; k1 <= n1 / 2; k1++) {
                for (int k2 = 0; k2 < newn2; k2 += 2) {
                    a[(n1 - k1) % n1][(newn2 - k2) % newn2] = a[k1][k2];
                    a[(n1 - k1) % n1][(newn2 - k2) % newn2 + 1] = -a[k1][k2 + 1];
                }
            }
        }

        a[0][n2] = -a[0][1];
        a[0][1] = 0;
        a[n1d2][n2] = -a[n1d2][1];
        a[n1d2][1] = 0;
        a[n1d2][n2 + 1] = 0;
    }

    /* -------- initializing routines -------- */

    private void makewt(int nw) {
        int j, nwh, nw0, nw1;
        float delta, wn4r, wk1r, wk1i, wk3r, wk3i;

        ip[0] = nw;
        ip[1] = 1;
        if (nw > 2) {
            nwh = nw >> 1;
            delta = (float) (Math.atan(1.0) / nwh);
            wn4r = (float) Math.cos(delta * nwh);
            w[0] = 1;
            w[1] = wn4r;
            if (nwh == 4) {
                w[2] = (float) Math.cos(delta * 2);
                w[3] = (float) Math.sin(delta * 2);
            } else if (nwh > 4) {
                makeipt(nw);
                w[2] = (float) (0.5 / Math.cos(delta * 2));
                w[3] = (float) (0.5 / Math.cos(delta * 6));
                for (j = 4; j < nwh; j += 4) {
                    w[j] = (float) Math.cos(delta * j);
                    w[j + 1] = (float) Math.sin(delta * j);
                    w[j + 2] = (float) Math.cos(3 * delta * j);
                    w[j + 3] = (float) -Math.sin(3 * delta * j);
                }
            }
            nw0 = 0;
            while (nwh > 2) {
                nw1 = nw0 + nwh;
                nwh >>= 1;
                w[nw1] = 1;
                w[nw1 + 1] = wn4r;
                if (nwh == 4) {
                    wk1r = w[nw0 + 4];
                    wk1i = w[nw0 + 5];
                    w[nw1 + 2] = wk1r;
                    w[nw1 + 3] = wk1i;
                } else if (nwh > 4) {
                    wk1r = w[nw0 + 4];
                    wk3r = w[nw0 + 6];
                    w[nw1 + 2] = (float) (0.5 / wk1r);
                    w[nw1 + 3] = (float) (0.5 / wk3r);
                    for (j = 4; j < nwh; j += 4) {
                        int idx1 = nw0 + 2 * j;
                        int idx2 = nw1 + j;
                        wk1r = w[idx1];
                        wk1i = w[idx1 + 1];
                        wk3r = w[idx1 + 2];
                        wk3i = w[idx1 + 3];
                        w[idx2] = wk1r;
                        w[idx2 + 1] = wk1i;
                        w[idx2 + 2] = wk3r;
                        w[idx2 + 3] = wk3i;
                    }
                }
                nw0 = nw1;
            }
        }
    }

    private void makeipt(int nw) {
        int j, l, m, m2, p, q;

        ip[2] = 0;
        ip[3] = 16;
        m = 2;
        for (l = nw; l > 32; l >>= 2) {
            m2 = m << 1;
            q = m2 << 3;
            for (j = m; j < m2; j++) {
                p = ip[j] << 2;
                ip[m + j] = p;
                ip[m2 + j] = p + q;
            }
            m = m2;
        }
    }

    private void makect(int nc, float[] c, int startc) {
        int j, nch;
        float delta;

        ip[1] = nc;
        if (nc > 1) {
            nch = nc >> 1;
            delta = (float) Math.atan(1.0) / nch;
            c[startc] = (float) Math.cos(delta * nch);
            c[startc + nch] = 0.5f * c[startc];
            for (j = 1; j < nch; j++) {
                c[startc + j] = (float) (0.5 * Math.cos(delta * j));
                c[startc + nc - j] = (float) (0.5 * Math.sin(delta * j));
            }
        }
    }

    /* -------- child routines -------- */

    private void rdft2d_sub(int isgn, float[] a) {
        int n1h, i, j;
        float xi;
        int idx1, idx2;

        n1h = n1 >> 1;
        if (isgn < 0) {
            for (i = 1; i < n1h; i++) {
                j = n1 - i;
                idx1 = i * n2;
                idx2 = j * n2;
                xi = a[idx1] - a[idx2];
                a[idx1] += a[idx2];
                a[idx2] = xi;
                xi = a[idx2 + 1] - a[idx1 + 1];
                a[idx1 + 1] += a[idx2 + 1];
                a[idx2 + 1] = xi;
            }
        } else {
            for (i = 1; i < n1h; i++) {
                j = n1 - i;
                idx1 = i * n2;
                idx2 = j * n2;
                a[idx2] = 0.5f * (a[idx1] - a[idx2]);
                a[idx1] -= a[idx2];
                a[idx2 + 1] = 0.5f * (a[idx1 + 1] + a[idx2 + 1]);
                a[idx1 + 1] -= a[idx2 + 1];
            }
        }
    }

    private void rdft2d_sub(int isgn, float[][] a) {
        int n1h, i, j;
        float xi;

        n1h = n1 >> 1;
        if (isgn < 0) {
            for (i = 1; i < n1h; i++) {
                j = n1 - i;
                xi = a[i][0] - a[j][0];
                a[i][0] += a[j][0];
                a[j][0] = xi;
                xi = a[j][1] - a[i][1];
                a[i][1] += a[j][1];
                a[j][1] = xi;
            }
        } else {
            for (i = 1; i < n1h; i++) {
                j = n1 - i;
                a[j][0] = 0.5f * (a[i][0] - a[j][0]);
                a[i][0] -= a[j][0];
                a[j][1] = 0.5f * (a[i][1] + a[j][1]);
                a[i][1] -= a[j][1];
            }
        }
    }

    private void cdft2d_sub(int isgn, float[] a, boolean scale) {
        int i, j;
        int idx1, idx2, idx3, idx4, idx5;
        if (isgn == -1) {
            if (n2 > 4) {
                for (j = 0; j < n2; j += 8) {
                    for (i = 0; i < n1; i++) {
                        idx1 = i * n2 + j;
                        idx2 = 2 * i;
                        idx3 = 2 * n1 + 2 * i;
                        idx4 = idx3 + 2 * n1;
                        idx5 = idx4 + 2 * n1;
                        t[idx2] = a[idx1];
                        t[idx2 + 1] = a[idx1 + 1];
                        t[idx3] = a[idx1 + 2];
                        t[idx3 + 1] = a[idx1 + 3];
                        t[idx4] = a[idx1 + 4];
                        t[idx4 + 1] = a[idx1 + 5];
                        t[idx5] = a[idx1 + 6];
                        t[idx5 + 1] = a[idx1 + 7];
                    }
                    fftn1.complexForward(t, 0);
                    fftn1.complexForward(t, 2 * n1);
                    fftn1.complexForward(t, 4 * n1);
                    fftn1.complexForward(t, 6 * n1);
                    for (i = 0; i < n1; i++) {
                        idx1 = i * n2 + j;
                        idx2 = 2 * i;
                        idx3 = 2 * n1 + 2 * i;
                        idx4 = idx3 + 2 * n1;
                        idx5 = idx4 + 2 * n1;
                        a[idx1] = t[idx2];
                        a[idx1 + 1] = t[idx2 + 1];
                        a[idx1 + 2] = t[idx3];
                        a[idx1 + 3] = t[idx3 + 1];
                        a[idx1 + 4] = t[idx4];
                        a[idx1 + 5] = t[idx4 + 1];
                        a[idx1 + 6] = t[idx5];
                        a[idx1 + 7] = t[idx5 + 1];
                    }
                }
            } else if (n2 == 4) {
                for (i = 0; i < n1; i++) {
                    idx1 = i * n2;
                    idx2 = 2 * i;
                    idx3 = 2 * n1 + 2 * i;
                    t[idx2] = a[idx1];
                    t[idx2 + 1] = a[idx1 + 1];
                    t[idx3] = a[idx1 + 2];
                    t[idx3 + 1] = a[idx1 + 3];
                }
                fftn1.complexForward(t, 0);
                fftn1.complexForward(t, 2 * n1);
                for (i = 0; i < n1; i++) {
                    idx1 = i * n2;
                    idx2 = 2 * i;
                    idx3 = 2 * n1 + 2 * i;
                    a[idx1] = t[idx2];
                    a[idx1 + 1] = t[idx2 + 1];
                    a[idx1 + 2] = t[idx3];
                    a[idx1 + 3] = t[idx3 + 1];
                }
            } else if (n2 == 2) {
                for (i = 0; i < n1; i++) {
                    idx1 = i * n2;
                    idx2 = 2 * i;
                    t[idx2] = a[idx1];
                    t[idx2 + 1] = a[idx1 + 1];
                }
                fftn1.complexForward(t, 0);
                for (i = 0; i < n1; i++) {
                    idx1 = i * n2;
                    idx2 = 2 * i;
                    a[idx1] = t[idx2];
                    a[idx1 + 1] = t[idx2 + 1];
                }
            }
        } else {
            if (n2 > 4) {
                for (j = 0; j < n2; j += 8) {
                    for (i = 0; i < n1; i++) {
                        idx1 = i * n2 + j;
                        idx2 = 2 * i;
                        idx3 = 2 * n1 + 2 * i;
                        idx4 = idx3 + 2 * n1;
                        idx5 = idx4 + 2 * n1;
                        t[idx2] = a[idx1];
                        t[idx2 + 1] = a[idx1 + 1];
                        t[idx3] = a[idx1 + 2];
                        t[idx3 + 1] = a[idx1 + 3];
                        t[idx4] = a[idx1 + 4];
                        t[idx4 + 1] = a[idx1 + 5];
                        t[idx5] = a[idx1 + 6];
                        t[idx5 + 1] = a[idx1 + 7];
                    }
                    fftn1.complexInverse(t, 0, scale);
                    fftn1.complexInverse(t, 2 * n1, scale);
                    fftn1.complexInverse(t, 4 * n1, scale);
                    fftn1.complexInverse(t, 6 * n1, scale);
                    for (i = 0; i < n1; i++) {
                        idx1 = i * n2 + j;
                        idx2 = 2 * i;
                        idx3 = 2 * n1 + 2 * i;
                        idx4 = idx3 + 2 * n1;
                        idx5 = idx4 + 2 * n1;
                        a[idx1] = t[idx2];
                        a[idx1 + 1] = t[idx2 + 1];
                        a[idx1 + 2] = t[idx3];
                        a[idx1 + 3] = t[idx3 + 1];
                        a[idx1 + 4] = t[idx4];
                        a[idx1 + 5] = t[idx4 + 1];
                        a[idx1 + 6] = t[idx5];
                        a[idx1 + 7] = t[idx5 + 1];
                    }
                }
            } else if (n2 == 4) {
                for (i = 0; i < n1; i++) {
                    idx1 = i * n2;
                    idx2 = 2 * i;
                    idx3 = 2 * n1 + 2 * i;
                    t[idx2] = a[idx1];
                    t[idx2 + 1] = a[idx1 + 1];
                    t[idx3] = a[idx1 + 2];
                    t[idx3 + 1] = a[idx1 + 3];
                }
                fftn1.complexInverse(t, 0, scale);
                fftn1.complexInverse(t, 2 * n1, scale);
                for (i = 0; i < n1; i++) {
                    idx1 = i * n2;
                    idx2 = 2 * i;
                    idx3 = 2 * n1 + 2 * i;
                    a[idx1] = t[idx2];
                    a[idx1 + 1] = t[idx2 + 1];
                    a[idx1 + 2] = t[idx3];
                    a[idx1 + 3] = t[idx3 + 1];
                }
            } else if (n2 == 2) {
                for (i = 0; i < n1; i++) {
                    idx1 = i * n2;
                    idx2 = 2 * i;
                    t[idx2] = a[idx1];
                    t[idx2 + 1] = a[idx1 + 1];
                }
                fftn1.complexInverse(t, 0, scale);
                for (i = 0; i < n1; i++) {
                    idx1 = i * n2;
                    idx2 = 2 * i;
                    a[idx1] = t[idx2];
                    a[idx1 + 1] = t[idx2 + 1];
                }
            }
        }
    }

    private void cdft2d_sub(int isgn, float[][] a, boolean scale) {
        int i, j;
        int idx2, idx3, idx4, idx5;
        if (isgn == -1) {
            if (n2 > 4) {
                for (j = 0; j < n2; j += 8) {
                    for (i = 0; i < n1; i++) {
                        idx2 = 2 * i;
                        idx3 = 2 * n1 + 2 * i;
                        idx4 = idx3 + 2 * n1;
                        idx5 = idx4 + 2 * n1;
                        t[idx2] = a[i][j];
                        t[idx2 + 1] = a[i][j + 1];
                        t[idx3] = a[i][j + 2];
                        t[idx3 + 1] = a[i][j + 3];
                        t[idx4] = a[i][j + 4];
                        t[idx4 + 1] = a[i][j + 5];
                        t[idx5] = a[i][j + 6];
                        t[idx5 + 1] = a[i][j + 7];
                    }
                    fftn1.complexForward(t, 0);
                    fftn1.complexForward(t, 2 * n1);
                    fftn1.complexForward(t, 4 * n1);
                    fftn1.complexForward(t, 6 * n1);
                    for (i = 0; i < n1; i++) {
                        idx2 = 2 * i;
                        idx3 = 2 * n1 + 2 * i;
                        idx4 = idx3 + 2 * n1;
                        idx5 = idx4 + 2 * n1;
                        a[i][j] = t[idx2];
                        a[i][j + 1] = t[idx2 + 1];
                        a[i][j + 2] = t[idx3];
                        a[i][j + 3] = t[idx3 + 1];
                        a[i][j + 4] = t[idx4];
                        a[i][j + 5] = t[idx4 + 1];
                        a[i][j + 6] = t[idx5];
                        a[i][j + 7] = t[idx5 + 1];
                    }
                }
            } else if (n2 == 4) {
                for (i = 0; i < n1; i++) {
                    idx2 = 2 * i;
                    idx3 = 2 * n1 + 2 * i;
                    t[idx2] = a[i][0];
                    t[idx2 + 1] = a[i][1];
                    t[idx3] = a[i][2];
                    t[idx3 + 1] = a[i][3];
                }
                fftn1.complexForward(t, 0);
                fftn1.complexForward(t, 2 * n1);
                for (i = 0; i < n1; i++) {
                    idx2 = 2 * i;
                    idx3 = 2 * n1 + 2 * i;
                    a[i][0] = t[idx2];
                    a[i][1] = t[idx2 + 1];
                    a[i][2] = t[idx3];
                    a[i][3] = t[idx3 + 1];
                }
            } else if (n2 == 2) {
                for (i = 0; i < n1; i++) {
                    idx2 = 2 * i;
                    t[idx2] = a[i][0];
                    t[idx2 + 1] = a[i][1];
                }
                fftn1.complexForward(t, 0);
                for (i = 0; i < n1; i++) {
                    idx2 = 2 * i;
                    a[i][0] = t[idx2];
                    a[i][1] = t[idx2 + 1];
                }
            }
        } else {
            if (n2 > 4) {
                for (j = 0; j < n2; j += 8) {
                    for (i = 0; i < n1; i++) {
                        idx2 = 2 * i;
                        idx3 = 2 * n1 + 2 * i;
                        idx4 = idx3 + 2 * n1;
                        idx5 = idx4 + 2 * n1;
                        t[idx2] = a[i][j];
                        t[idx2 + 1] = a[i][j + 1];
                        t[idx3] = a[i][j + 2];
                        t[idx3 + 1] = a[i][j + 3];
                        t[idx4] = a[i][j + 4];
                        t[idx4 + 1] = a[i][j + 5];
                        t[idx5] = a[i][j + 6];
                        t[idx5 + 1] = a[i][j + 7];
                    }
                    fftn1.complexInverse(t, 0, scale);
                    fftn1.complexInverse(t, 2 * n1, scale);
                    fftn1.complexInverse(t, 4 * n1, scale);
                    fftn1.complexInverse(t, 6 * n1, scale);
                    for (i = 0; i < n1; i++) {
                        idx2 = 2 * i;
                        idx3 = 2 * n1 + 2 * i;
                        idx4 = idx3 + 2 * n1;
                        idx5 = idx4 + 2 * n1;
                        a[i][j] = t[idx2];
                        a[i][j + 1] = t[idx2 + 1];
                        a[i][j + 2] = t[idx3];
                        a[i][j + 3] = t[idx3 + 1];
                        a[i][j + 4] = t[idx4];
                        a[i][j + 5] = t[idx4 + 1];
                        a[i][j + 6] = t[idx5];
                        a[i][j + 7] = t[idx5 + 1];
                    }
                }
            } else if (n2 == 4) {
                for (i = 0; i < n1; i++) {
                    idx2 = 2 * i;
                    idx3 = 2 * n1 + 2 * i;
                    t[idx2] = a[i][0];
                    t[idx2 + 1] = a[i][1];
                    t[idx3] = a[i][2];
                    t[idx3 + 1] = a[i][3];
                }
                fftn1.complexInverse(t, 0, scale);
                fftn1.complexInverse(t, 2 * n1, scale);
                for (i = 0; i < n1; i++) {
                    idx2 = 2 * i;
                    idx3 = 2 * n1 + 2 * i;
                    a[i][0] = t[idx2];
                    a[i][1] = t[idx2 + 1];
                    a[i][2] = t[idx3];
                    a[i][3] = t[idx3 + 1];
                }
            } else if (n2 == 2) {
                for (i = 0; i < n1; i++) {
                    idx2 = 2 * i;
                    t[idx2] = a[i][0];
                    t[idx2 + 1] = a[i][1];
                }
                fftn1.complexInverse(t, 0, scale);
                for (i = 0; i < n1; i++) {
                    idx2 = 2 * i;
                    a[i][0] = t[idx2];
                    a[i][1] = t[idx2 + 1];
                }
            }
        }
    }

    private void xdft2d0_subth1(final int icr, final int isgn, final float[] a, final boolean scale) {
        final int nthread;
        int i;
        int np = ConcurrencyUtils.getNumberOfProcessors();
        if (np > n1) {
            nthread = n1;
        } else {
            nthread = np;
        }
        Future[] futures = new Future[nthread];
        for (i = 0; i < nthread; i++) {
            final int n0 = i;
            futures[i] = ConcurrencyUtils.threadPool.submit(new Runnable() {
                public void run() {
                    int i;
                    if (icr == 0) {
                        if (isgn == -1) {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.complexForward(a, i * n2);
                            }
                        } else {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.complexInverse(a, i * n2, scale);
                            }
                        }
                    } else {
                        if (isgn == 1) {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.realForward(a, i * n2);
                            }
                        } else {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.realInverse(a, i * n2, scale);
                            }
                        }
                    }
                }
            });
        }
        try {
            for (int j = 0; j < nthread; j++) {
                futures[j].get();
            }
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    private void xdft2d0_subth2(final int icr, final int isgn, final float[] a, final boolean scale) {
        final int nthread;
        int i;
        int np = ConcurrencyUtils.getNumberOfProcessors();
        if (np > n1) {
            nthread = n1;
        } else {
            nthread = np;
        }
        Future[] futures = new Future[nthread];
        for (i = 0; i < nthread; i++) {
            final int n0 = i;
            futures[i] = ConcurrencyUtils.threadPool.submit(new Runnable() {
                public void run() {
                    int i;
                    if (icr == 0) {
                        if (isgn == -1) {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.complexForward(a, i * n2);
                            }
                        } else {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.complexInverse(a, i * n2, scale);
                            }
                        }
                    } else {
                        if (isgn == 1) {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.realForward(a, i * n2);
                            }
                        } else {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.realInverse2(a, i * n2, scale);
                            }
                        }
                    }
                }
            });
        }
        try {
            for (int j = 0; j < nthread; j++) {
                futures[j].get();
            }
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    private void xdft2d0_subth1(final int icr, final int isgn, final float[][] a, final boolean scale) {
        final int nthread;
        int i;
        int np = ConcurrencyUtils.getNumberOfProcessors();
        if (np > n1) {
            nthread = n1;
        } else {
            nthread = np;
        }
        Future[] futures = new Future[nthread];
        for (i = 0; i < nthread; i++) {
            final int n0 = i;
            futures[i] = ConcurrencyUtils.threadPool.submit(new Runnable() {
                public void run() {
                    int i;
                    if (icr == 0) {
                        if (isgn == -1) {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.complexForward(a[i]);
                            }
                        } else {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.complexInverse(a[i], scale);
                            }
                        }
                    } else {
                        if (isgn == 1) {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.realForward(a[i]);
                            }
                        } else {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.realInverse(a[i], scale);
                            }
                        }
                    }
                }
            });
        }
        try {
            for (int j = 0; j < nthread; j++) {
                futures[j].get();
            }
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    private void xdft2d0_subth2(final int icr, final int isgn, final float[][] a, final boolean scale) {
        final int nthread;
        int i;
        int np = ConcurrencyUtils.getNumberOfProcessors();
        if (np > n1) {
            nthread = n1;
        } else {
            nthread = np;
        }
        Future[] futures = new Future[nthread];
        for (i = 0; i < nthread; i++) {
            final int n0 = i;
            futures[i] = ConcurrencyUtils.threadPool.submit(new Runnable() {
                public void run() {
                    int i;
                    if (icr == 0) {
                        if (isgn == -1) {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.complexForward(a[i]);
                            }
                        } else {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.complexInverse(a[i], scale);
                            }
                        }
                    } else {
                        if (isgn == 1) {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.realForward(a[i]);
                            }
                        } else {
                            for (i = n0; i < n1; i += nthread) {
                                fftn2.realInverse2(a[i], 0, scale);
                            }
                        }
                    }
                }
            });
        }
        try {
            for (int j = 0; j < nthread; j++) {
                futures[j].get();
            }
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    private void cdft2d_subth(final int isgn, final float[] a, final boolean scale) {
        int nthread;
        int nt, i;
        int np = ConcurrencyUtils.getNumberOfProcessors();
        nthread = np;
        nt = 8 * n1;
        if (n2 == 4 * np) {
            nt >>= 1;
        } else if (n2 < 4 * np) {
            nthread = n2 >> 1;
            nt >>= 2;
        }
        Future[] futures = new Future[nthread];
        final int nthread_f = nthread;
        for (i = 0; i < nthread; i++) {
            final int n0 = i;
            final int startt = nt * i;
            futures[i] = ConcurrencyUtils.threadPool.submit(new Runnable() {
                public void run() {
                    int i, j;
                    int idx1, idx2, idx3, idx4, idx5;
                    if (isgn == -1) {
                        if (n2 > 4 * nthread_f) {
                            for (j = 8 * n0; j < n2; j += 8 * nthread_f) {
                                for (i = 0; i < n1; i++) {
                                    idx1 = i * n2 + j;
                                    idx2 = startt + 2 * i;
                                    idx3 = startt + 2 * n1 + 2 * i;
                                    idx4 = idx3 + 2 * n1;
                                    idx5 = idx4 + 2 * n1;
                                    t[idx2] = a[idx1];
                                    t[idx2 + 1] = a[idx1 + 1];
                                    t[idx3] = a[idx1 + 2];
                                    t[idx3 + 1] = a[idx1 + 3];
                                    t[idx4] = a[idx1 + 4];
                                    t[idx4 + 1] = a[idx1 + 5];
                                    t[idx5] = a[idx1 + 6];
                                    t[idx5 + 1] = a[idx1 + 7];
                                }
                                fftn1.complexForward(t, startt);
                                fftn1.complexForward(t, startt + 2 * n1);
                                fftn1.complexForward(t, startt + 4 * n1);
                                fftn1.complexForward(t, startt + 6 * n1);
                                for (i = 0; i < n1; i++) {
                                    idx1 = i * n2 + j;
                                    idx2 = startt + 2 * i;
                                    idx3 = startt + 2 * n1 + 2 * i;
                                    idx4 = idx3 + 2 * n1;
                                    idx5 = idx4 + 2 * n1;
                                    a[idx1] = t[idx2];
                                    a[idx1 + 1] = t[idx2 + 1];
                                    a[idx1 + 2] = t[idx3];
                                    a[idx1 + 3] = t[idx3 + 1];
                                    a[idx1 + 4] = t[idx4];
                                    a[idx1 + 5] = t[idx4 + 1];
                                    a[idx1 + 6] = t[idx5];
                                    a[idx1 + 7] = t[idx5 + 1];
                                }
                            }
                        } else if (n2 == 4 * nthread_f) {
                            for (i = 0; i < n1; i++) {
                                idx1 = i * n2 + 4 * n0;
                                idx2 = startt + 2 * i;
                                idx3 = startt + 2 * n1 + 2 * i;
                                t[idx2] = a[idx1];
                                t[idx2 + 1] = a[idx1 + 1];
                                t[idx3] = a[idx1 + 2];
                                t[idx3 + 1] = a[idx1 + 3];
                            }
                            fftn1.complexForward(t, startt);
                            fftn1.complexForward(t, startt + 2 * n1);
                            for (i = 0; i < n1; i++) {
                                idx1 = i * n2 + 4 * n0;
                                idx2 = startt + 2 * i;
                                idx3 = startt + 2 * n1 + 2 * i;
                                a[idx1] = t[idx2];
                                a[idx1 + 1] = t[idx2 + 1];
                                a[idx1 + 2] = t[idx3];
                                a[idx1 + 3] = t[idx3 + 1];
                            }
                        } else if (n2 == 2 * nthread_f) {
                            for (i = 0; i < n1; i++) {
                                idx1 = i * n2 + 2 * n0;
                                idx2 = startt + 2 * i;
                                t[idx2] = a[idx1];
                                t[idx2 + 1] = a[idx1 + 1];
                            }
                            fftn1.complexForward(t, startt);
                            for (i = 0; i < n1; i++) {
                                idx1 = i * n2 + 2 * n0;
                                idx2 = startt + 2 * i;
                                a[idx1] = t[idx2];
                                a[idx1 + 1] = t[idx2 + 1];
                            }
                        }
                    } else {
                        if (n2 > 4 * nthread_f) {
                            for (j = 8 * n0; j < n2; j += 8 * nthread_f) {
                                for (i = 0; i < n1; i++) {
                                    idx1 = i * n2 + j;
                                    idx2 = startt + 2 * i;
                                    idx3 = startt + 2 * n1 + 2 * i;
                                    idx4 = idx3 + 2 * n1;
                                    idx5 = idx4 + 2 * n1;
                                    t[idx2] = a[idx1];
                                    t[idx2 + 1] = a[idx1 + 1];
                                    t[idx3] = a[idx1 + 2];
                                    t[idx3 + 1] = a[idx1 + 3];
                                    t[idx4] = a[idx1 + 4];
                                    t[idx4 + 1] = a[idx1 + 5];
                                    t[idx5] = a[idx1 + 6];
                                    t[idx5 + 1] = a[idx1 + 7];
                                }
                                fftn1.complexInverse(t, startt, scale);
                                fftn1.complexInverse(t, startt + 2 * n1, scale);
                                fftn1.complexInverse(t, startt + 4 * n1, scale);
                                fftn1.complexInverse(t, startt + 6 * n1, scale);
                                for (i = 0; i < n1; i++) {
                                    idx1 = i * n2 + j;
                                    idx2 = startt + 2 * i;
                                    idx3 = startt + 2 * n1 + 2 * i;
                                    idx4 = idx3 + 2 * n1;
                                    idx5 = idx4 + 2 * n1;
                                    a[idx1] = t[idx2];
                                    a[idx1 + 1] = t[idx2 + 1];
                                    a[idx1 + 2] = t[idx3];
                                    a[idx1 + 3] = t[idx3 + 1];
                                    a[idx1 + 4] = t[idx4];
                                    a[idx1 + 5] = t[idx4 + 1];
                                    a[idx1 + 6] = t[idx5];
                                    a[idx1 + 7] = t[idx5 + 1];
                                }
                            }
                        } else if (n2 == 4 * nthread_f) {
                            for (i = 0; i < n1; i++) {
                                idx1 = i * n2 + 4 * n0;
                                idx2 = startt + 2 * i;
                                idx3 = startt + 2 * n1 + 2 * i;
                                t[idx2] = a[idx1];
                                t[idx2 + 1] = a[idx1 + 1];
                                t[idx3] = a[idx1 + 2];
                                t[idx3 + 1] = a[idx1 + 3];
                            }
                            fftn1.complexInverse(t, startt, scale);
                            fftn1.complexInverse(t, startt + 2 * n1, scale);
                            for (i = 0; i < n1; i++) {
                                idx1 = i * n2 + 4 * n0;
                                idx2 = startt + 2 * i;
                                idx3 = startt + 2 * n1 + 2 * i;
                                a[idx1] = t[idx2];
                                a[idx1 + 1] = t[idx2 + 1];
                                a[idx1 + 2] = t[idx3];
                                a[idx1 + 3] = t[idx3 + 1];
                            }
                        } else if (n2 == 2 * nthread_f) {
                            for (i = 0; i < n1; i++) {
                                idx1 = i * n2 + 2 * n0;
                                idx2 = startt + 2 * i;
                                t[idx2] = a[idx1];
                                t[idx2 + 1] = a[idx1 + 1];
                            }
                            fftn1.complexInverse(t, startt, scale);
                            for (i = 0; i < n1; i++) {
                                idx1 = i * n2 + 2 * n0;
                                idx2 = startt + 2 * i;
                                a[idx1] = t[idx2];
                                a[idx1 + 1] = t[idx2 + 1];
                            }
                        }
                    }
                }
            });
        }
        try {
            for (int j = 0; j < nthread; j++) {
                futures[j].get();
            }
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    private void cdft2d_subth(final int isgn, final float[][] a, final boolean scale) {
        int nthread;
        int nt, i;
        int np = ConcurrencyUtils.getNumberOfProcessors();
        nthread = np;
        nt = 8 * n1;
        if (n2 == 4 * np) {
            nt >>= 1;
        } else if (n2 < 4 * np) {
            nthread = n2 >> 1;
            nt >>= 2;
        }
        Future[] futures = new Future[nthread];
        final int nthread_f = nthread;
        for (i = 0; i < nthread; i++) {
            final int n0 = i;
            final int startt = nt * i;
            futures[i] = ConcurrencyUtils.threadPool.submit(new Runnable() {
                public void run() {
                    int i, j;
                    int idx2, idx3, idx4, idx5;
                    if (isgn == -1) {
                        if (n2 > 4 * nthread_f) {
                            for (j = 8 * n0; j < n2; j += 8 * nthread_f) {
                                for (i = 0; i < n1; i++) {
                                    idx2 = startt + 2 * i;
                                    idx3 = startt + 2 * n1 + 2 * i;
                                    idx4 = idx3 + 2 * n1;
                                    idx5 = idx4 + 2 * n1;
                                    t[idx2] = a[i][j];
                                    t[idx2 + 1] = a[i][j + 1];
                                    t[idx3] = a[i][j + 2];
                                    t[idx3 + 1] = a[i][j + 3];
                                    t[idx4] = a[i][j + 4];
                                    t[idx4 + 1] = a[i][j + 5];
                                    t[idx5] = a[i][j + 6];
                                    t[idx5 + 1] = a[i][j + 7];
                                }
                                fftn1.complexForward(t, startt);
                                fftn1.complexForward(t, startt + 2 * n1);
                                fftn1.complexForward(t, startt + 4 * n1);
                                fftn1.complexForward(t, startt + 6 * n1);
                                for (i = 0; i < n1; i++) {
                                    idx2 = startt + 2 * i;
                                    idx3 = startt + 2 * n1 + 2 * i;
                                    idx4 = idx3 + 2 * n1;
                                    idx5 = idx4 + 2 * n1;
                                    a[i][j] = t[idx2];
                                    a[i][j + 1] = t[idx2 + 1];
                                    a[i][j + 2] = t[idx3];
                                    a[i][j + 3] = t[idx3 + 1];
                                    a[i][j + 4] = t[idx4];
                                    a[i][j + 5] = t[idx4 + 1];
                                    a[i][j + 6] = t[idx5];
                                    a[i][j + 7] = t[idx5 + 1];
                                }
                            }
                        } else if (n2 == 4 * nthread_f) {
                            for (i = 0; i < n1; i++) {
                                idx2 = startt + 2 * i;
                                idx3 = startt + 2 * n1 + 2 * i;
                                t[idx2] = a[i][4 * n0];
                                t[idx2 + 1] = a[i][4 * n0 + 1];
                                t[idx3] = a[i][4 * n0 + 2];
                                t[idx3 + 1] = a[i][4 * n0 + 3];
                            }
                            fftn1.complexForward(t, startt);
                            fftn1.complexForward(t, startt + 2 * n1);
                            for (i = 0; i < n1; i++) {
                                idx2 = startt + 2 * i;
                                idx3 = startt + 2 * n1 + 2 * i;
                                a[i][4 * n0] = t[idx2];
                                a[i][4 * n0 + 1] = t[idx2 + 1];
                                a[i][4 * n0 + 2] = t[idx3];
                                a[i][4 * n0 + 3] = t[idx3 + 1];
                            }
                        } else if (n2 == 2 * nthread_f) {
                            for (i = 0; i < n1; i++) {
                                idx2 = startt + 2 * i;
                                t[idx2] = a[i][2 * n0];
                                t[idx2 + 1] = a[i][2 * n0 + 1];
                            }
                            fftn1.complexForward(t, startt);
                            for (i = 0; i < n1; i++) {
                                idx2 = startt + 2 * i;
                                a[i][2 * n0] = t[idx2];
                                a[i][2 * n0 + 1] = t[idx2 + 1];
                            }
                        }
                    } else {
                        if (n2 > 4 * nthread_f) {
                            for (j = 8 * n0; j < n2; j += 8 * nthread_f) {
                                for (i = 0; i < n1; i++) {
                                    idx2 = startt + 2 * i;
                                    idx3 = startt + 2 * n1 + 2 * i;
                                    idx4 = idx3 + 2 * n1;
                                    idx5 = idx4 + 2 * n1;
                                    t[idx2] = a[i][j];
                                    t[idx2 + 1] = a[i][j + 1];
                                    t[idx3] = a[i][j + 2];
                                    t[idx3 + 1] = a[i][j + 3];
                                    t[idx4] = a[i][j + 4];
                                    t[idx4 + 1] = a[i][j + 5];
                                    t[idx5] = a[i][j + 6];
                                    t[idx5 + 1] = a[i][j + 7];
                                }
                                fftn1.complexInverse(t, startt, scale);
                                fftn1.complexInverse(t, startt + 2 * n1, scale);
                                fftn1.complexInverse(t, startt + 4 * n1, scale);
                                fftn1.complexInverse(t, startt + 6 * n1, scale);
                                for (i = 0; i < n1; i++) {
                                    idx2 = startt + 2 * i;
                                    idx3 = startt + 2 * n1 + 2 * i;
                                    idx4 = idx3 + 2 * n1;
                                    idx5 = idx4 + 2 * n1;
                                    a[i][j] = t[idx2];
                                    a[i][j + 1] = t[idx2 + 1];
                                    a[i][j + 2] = t[idx3];
                                    a[i][j + 3] = t[idx3 + 1];
                                    a[i][j + 4] = t[idx4];
                                    a[i][j + 5] = t[idx4 + 1];
                                    a[i][j + 6] = t[idx5];
                                    a[i][j + 7] = t[idx5 + 1];
                                }
                            }
                        } else if (n2 == 4 * nthread_f) {
                            for (i = 0; i < n1; i++) {
                                idx2 = startt + 2 * i;
                                idx3 = startt + 2 * n1 + 2 * i;
                                t[idx2] = a[i][4 * n0];
                                t[idx2 + 1] = a[i][4 * n0 + 1];
                                t[idx3] = a[i][4 * n0 + 2];
                                t[idx3 + 1] = a[i][4 * n0 + 3];
                            }
                            fftn1.complexInverse(t, startt, scale);
                            fftn1.complexInverse(t, startt + 2 * n1, scale);
                            for (i = 0; i < n1; i++) {
                                idx2 = startt + 2 * i;
                                idx3 = startt + 2 * n1 + 2 * i;
                                a[i][4 * n0] = t[idx2];
                                a[i][4 * n0 + 1] = t[idx2 + 1];
                                a[i][4 * n0 + 2] = t[idx3];
                                a[i][4 * n0 + 3] = t[idx3 + 1];
                            }
                        } else if (n2 == 2 * nthread_f) {
                            for (i = 0; i < n1; i++) {
                                idx2 = startt + 2 * i;
                                t[idx2] = a[i][2 * n0];
                                t[idx2 + 1] = a[i][2 * n0 + 1];
                            }
                            fftn1.complexInverse(t, startt, scale);
                            for (i = 0; i < n1; i++) {
                                idx2 = startt + 2 * i;
                                a[i][2 * n0] = t[idx2];
                                a[i][2 * n0 + 1] = t[idx2 + 1];
                            }
                        }
                    }
                }
            });
        }
        try {
            for (int j = 0; j < nthread; j++) {
                futures[j].get();
            }
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    private void fillSymmetric(final float[] a) {
        int np = ConcurrencyUtils.getNumberOfProcessors();
        Future[] futures = new Future[np];
        int n1d2 = n1 / 2;
        int l1k = n1d2 / np;
        final int newn2 = 2 * n2;
        for (int i = 0; i < np; i++) {
            final int l1offa, l1stopa, l2offa, l2stopa;
            if (i == 0)
                l1offa = i * l1k + 1;
            else {
                l1offa = i * l1k;
            }
            l1stopa = i * l1k + l1k;
            l2offa = i * l1k;
            if (i == np - 1) {
                l2stopa = i * l1k + l1k + 1;
            } else {
                l2stopa = i * l1k + l1k;
            }
            futures[i] = ConcurrencyUtils.threadPool.submit(new Runnable() {
                public void run() {
                    int idx1, idx2;

                    for (int k1 = l1offa; k1 < l1stopa; k1++) {
                        idx1 = k1 * newn2;
                        idx2 = (n1 - k1) * newn2;
                        a[idx1 + n2] = a[idx2 + 1];
                        a[idx1 + n2 + 1] = -a[idx2];
                    }
                    for (int k1 = l1offa; k1 < l1stopa; k1++) {
                        for (int k2 = n2 + 2; k2 < newn2; k2 = k2 + 2) {
                            idx1 = k1 * newn2;
                            idx2 = (n1 - k1) * newn2 + newn2 - k2;
                            a[idx1 + k2] = a[idx2];
                            a[idx1 + k2 + 1] = -a[idx2 + 1];

                        }
                    }
                    for (int k1 = l2offa; k1 < l2stopa; k1++) {
                        for (int k2 = 0; k2 < newn2; k2 = k2 + 2) {
                            idx1 = ((n1 - k1) % n1) * newn2 + (newn2 - k2) % newn2;
                            idx2 = k1 * newn2 + k2;
                            a[idx1] = a[idx2];
                            a[idx1 + 1] = -a[idx2 + 1];
                        }
                    }
                }
            });
        }
        try {
            for (int j = 0; j < np; j++) {
                futures[j].get();
            }
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    private void fillSymmetric(final float[][] a) {
        int np = ConcurrencyUtils.getNumberOfProcessors();
        Future[] futures = new Future[np];
        int n1d2 = n1 / 2;
        int l1k = n1d2 / np;
        final int newn2 = 2 * n2;
        for (int i = 0; i < np; i++) {
            final int l1offa, l1stopa, l2offa, l2stopa;
            if (i == 0)
                l1offa = i * l1k + 1;
            else {
                l1offa = i * l1k;
            }
            l1stopa = i * l1k + l1k;
            l2offa = i * l1k;
            if (i == np - 1) {
                l2stopa = i * l1k + l1k + 1;
            } else {
                l2stopa = i * l1k + l1k;
            }
            futures[i] = ConcurrencyUtils.threadPool.submit(new Runnable() {
                public void run() {

                    for (int k1 = l1offa; k1 < l1stopa; k1++) {
                        a[k1][n2] = a[n1 - k1][1];
                        a[k1][n2 + 1] = -a[n1 - k1][0];
                    }
                    for (int k1 = l1offa; k1 < l1stopa; k1++) {
                        for (int k2 = n2 + 2; k2 < newn2; k2 = k2 + 2) {
                            a[k1][k2] = a[n1 - k1][newn2 - k2];
                            a[k1][k2 + 1] = -a[n1 - k1][newn2 - k2 + 1];

                        }
                    }
                    for (int k1 = l2offa; k1 < l2stopa; k1++) {
                        for (int k2 = 0; k2 < newn2; k2 = k2 + 2) {
                            a[(n1 - k1) % n1][(newn2 - k2) % newn2] = a[k1][k2];
                            a[(n1 - k1) % n1][(newn2 - k2) % newn2 + 1] = -a[k1][k2 + 1];
                        }
                    }
                }
            });
        }
        try {
            for (int j = 0; j < np; j++) {
                futures[j].get();
            }
        } catch (ExecutionException ex) {
            ex.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }
}