This commit is contained in:
2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions

View File

@@ -0,0 +1,109 @@
# This Makefile.am specifies a set of codelets, efficient transforms
# of small sizes, that are used as building blocks (kernels) by FFTW
# to build up large transforms, as well as the options for generating
# and compiling them.
# You can customize FFTW for special needs, e.g. to handle certain
# sizes more efficiently, by adding new codelets to the lists of those
# included by default. If you change the list of codelets, any new
# ones you added will be automatically generated when you run the
# bootstrap script (see "Generating your own code" in the FFTW
# manual).
###########################################################################
# Put the top-level source directory on the preprocessor include path.
AM_CPPFLAGS = -I $(top_srcdir)
# Libtool library assembled from the codelets listed below.  The noinst_
# prefix means Automake builds it but does not install it.
noinst_LTLIBRARIES = librdft_scalar_r2cf.la
###########################################################################
# r2cf_<n> is a hard-coded real-to-complex FFT of size <n> (base cases
# of real-input FFT recursion)
R2CF = r2cf_2.c r2cf_3.c r2cf_4.c r2cf_5.c r2cf_6.c r2cf_7.c r2cf_8.c \
r2cf_9.c r2cf_10.c r2cf_11.c r2cf_12.c r2cf_13.c r2cf_14.c r2cf_15.c \
r2cf_16.c r2cf_32.c r2cf_64.c r2cf_128.c \
r2cf_20.c r2cf_25.c # r2cf_30.c r2cf_40.c r2cf_50.c
###########################################################################
# hf_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT
# step for a real-input FFT. Every hf codelet must have a
# corresponding r2cfII codelet (see below)!
HF = hf_2.c hf_3.c hf_4.c hf_5.c hf_6.c hf_7.c hf_8.c hf_9.c \
hf_10.c hf_12.c hf_15.c hf_16.c hf_32.c hf_64.c \
hf_20.c hf_25.c # hf_30.c hf_40.c hf_50.c
# like hf, but generates part of its trig table on the fly (good for large n)
HF2 = hf2_4.c hf2_8.c hf2_16.c hf2_32.c \
hf2_5.c hf2_20.c hf2_25.c
# an r2cf transform where the input is shifted by half a sample (output
# is multiplied by a phase). This is needed as part of the DIT recursion;
# every hf_<r> or hf2_<r> codelet should have a corresponding r2cfII_<r>
R2CFII = r2cfII_2.c r2cfII_3.c r2cfII_4.c r2cfII_5.c r2cfII_6.c \
r2cfII_7.c r2cfII_8.c r2cfII_9.c r2cfII_10.c r2cfII_12.c r2cfII_15.c \
r2cfII_16.c r2cfII_32.c r2cfII_64.c \
r2cfII_20.c r2cfII_25.c # r2cfII_30.c r2cfII_40.c r2cfII_50.c
###########################################################################
# hc2cf_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT
# step for a real-input FFT with rdft2-style output. <r> must be even.
HC2CF = hc2cf_2.c hc2cf_4.c hc2cf_6.c hc2cf_8.c hc2cf_10.c hc2cf_12.c \
hc2cf_16.c hc2cf_32.c \
hc2cf_20.c # hc2cf_30.c
# hc2cfdft_<r> covers the same sizes as hc2cf_<r> but is produced by a
# separate generator, $(GEN_HC2CDFT), with the same flags and header (see
# the pattern rules at the end of this file).
# NOTE(review): the mathematical difference from hc2cf is not described in
# this file -- confirm against the FFTW sources before relying on it.
HC2CFDFT = hc2cfdft_2.c hc2cfdft_4.c hc2cfdft_6.c hc2cfdft_8.c \
hc2cfdft_10.c hc2cfdft_12.c hc2cfdft_16.c hc2cfdft_32.c \
hc2cfdft_20.c # hc2cfdft_30.c
# like hc2cf, but generates part of its trig table on the fly (good
# for large n)
HC2CF2 = hc2cf2_4.c hc2cf2_8.c hc2cf2_16.c hc2cf2_32.c \
hc2cf2_20.c # hc2cf2_30.c
# hc2cfdft2_<r>: generated like hc2cfdft_<r>, but with $(FLAGS_HC2CF2).
HC2CFDFT2 = hc2cfdft2_4.c hc2cfdft2_8.c hc2cfdft2_16.c hc2cfdft2_32.c \
hc2cfdft2_20.c # hc2cfdft2_30.c
###########################################################################
# Every codelet source belonging to this directory.
ALL_CODELETS = $(R2CF) $(HF) $(HF2) $(R2CFII) $(HC2CF) $(HC2CF2) \
$(HC2CFDFT) $(HC2CFDFT2)
# The codelets and $(CODLIST) are generated files and are the library's only
# sources.  $(CODLIST) is not set in this file; presumably it comes from
# support/Makefile.codelets, included below.
BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
librdft_scalar_r2cf_la_SOURCES = $(BUILT_SOURCES)
# Inputs for the included support/Makefile.codelets (not shown here):
# presumably the solver-table symbol and the name-wrapping macro used when
# $(CODLIST) is generated -- confirm there.
SOLVTAB_NAME = X(solvtab_rdft_r2cf)
XRENAME=X
# special rules for regenerating codelets.
include $(top_srcdir)/support/Makefile.codelets
if MAINTAINER_MODE
# Generator flags, one variable per codelet family.  The "2" families add
# -twiddle-log3 -precompute-twiddles on top of the common RDFT flags.
FLAGS_R2CF=$(RDFT_FLAGS_COMMON)
FLAGS_HF=$(RDFT_FLAGS_COMMON)
FLAGS_HF2=$(RDFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_HC2CF=$(RDFT_FLAGS_COMMON)
FLAGS_HC2CF2=$(RDFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_R2CFII=$(RDFT_FLAGS_COMMON)

# Pattern rules for regenerating codelets.  In every rule the stem $* is the
# transform size passed to the generator with -n; the generator output is
# piped through $(ADD_DATE) and $(INDENT) into the target file.

r2cf_%.c: $(CODELET_DEPS) $(GEN_R2CF)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CF) $(FLAGS_R2CF) -n $* -name r2cf_$* -include "rdft/scalar/r2cf.h") | $(ADD_DATE) | $(INDENT) >$@

hf_%.c: $(CODELET_DEPS) $(GEN_HC2HC)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HF) -n $* -dit -name hf_$* -include "rdft/scalar/hf.h") | $(ADD_DATE) | $(INDENT) >$@

hf2_%.c: $(CODELET_DEPS) $(GEN_HC2HC)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HF2) -n $* -dit -name hf2_$* -include "rdft/scalar/hf.h") | $(ADD_DATE) | $(INDENT) >$@

# r2cfII shares the r2cf generator but takes its flags from FLAGS_R2CFII, so
# that overriding that variable actually affects these codelets.  The value
# is currently identical to FLAGS_R2CF, so the generated output is unchanged.
r2cfII_%.c: $(CODELET_DEPS) $(GEN_R2CF)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CF) $(FLAGS_R2CFII) -n $* -name r2cfII_$* -dft-II -include "rdft/scalar/r2cfII.h") | $(ADD_DATE) | $(INDENT) >$@

hc2cf_%.c: $(CODELET_DEPS) $(GEN_HC2C)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CF) -n $* -dit -name hc2cf_$* -include "rdft/scalar/hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@

hc2cf2_%.c: $(CODELET_DEPS) $(GEN_HC2C)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CF2) -n $* -dit -name hc2cf2_$* -include "rdft/scalar/hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@

# The hc2cfdft families reuse the hc2cf flag variables and header; only the
# generator differs.
hc2cfdft_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CF) -n $* -dit -name hc2cfdft_$* -include "rdft/scalar/hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@

hc2cfdft2_%.c: $(CODELET_DEPS) $(GEN_HC2CDFT)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CF2) -n $* -dit -name hc2cfdft2_$* -include "rdft/scalar/hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@
endif # MAINTAINER_MODE

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,183 @@
#include "kernel/ifftw.h"
/*
 * Registration entry points, one per codelet.  Each function takes the
 * planner and is referenced by the solver table that follows these
 * declarations.  The definitions are expected to live in the codelet source
 * of the same name (for example X(codelet_hc2cf2_16) in hc2cf2_16.c).
 */
/* r2cf family */
extern void X(codelet_r2cf_2)(planner *);
extern void X(codelet_r2cf_3)(planner *);
extern void X(codelet_r2cf_4)(planner *);
extern void X(codelet_r2cf_5)(planner *);
extern void X(codelet_r2cf_6)(planner *);
extern void X(codelet_r2cf_7)(planner *);
extern void X(codelet_r2cf_8)(planner *);
extern void X(codelet_r2cf_9)(planner *);
extern void X(codelet_r2cf_10)(planner *);
extern void X(codelet_r2cf_11)(planner *);
extern void X(codelet_r2cf_12)(planner *);
extern void X(codelet_r2cf_13)(planner *);
extern void X(codelet_r2cf_14)(planner *);
extern void X(codelet_r2cf_15)(planner *);
extern void X(codelet_r2cf_16)(planner *);
extern void X(codelet_r2cf_32)(planner *);
extern void X(codelet_r2cf_64)(planner *);
extern void X(codelet_r2cf_128)(planner *);
extern void X(codelet_r2cf_20)(planner *);
extern void X(codelet_r2cf_25)(planner *);
/* hf family */
extern void X(codelet_hf_2)(planner *);
extern void X(codelet_hf_3)(planner *);
extern void X(codelet_hf_4)(planner *);
extern void X(codelet_hf_5)(planner *);
extern void X(codelet_hf_6)(planner *);
extern void X(codelet_hf_7)(planner *);
extern void X(codelet_hf_8)(planner *);
extern void X(codelet_hf_9)(planner *);
extern void X(codelet_hf_10)(planner *);
extern void X(codelet_hf_12)(planner *);
extern void X(codelet_hf_15)(planner *);
extern void X(codelet_hf_16)(planner *);
extern void X(codelet_hf_32)(planner *);
extern void X(codelet_hf_64)(planner *);
extern void X(codelet_hf_20)(planner *);
extern void X(codelet_hf_25)(planner *);
/* hf2 family */
extern void X(codelet_hf2_4)(planner *);
extern void X(codelet_hf2_8)(planner *);
extern void X(codelet_hf2_16)(planner *);
extern void X(codelet_hf2_32)(planner *);
extern void X(codelet_hf2_5)(planner *);
extern void X(codelet_hf2_20)(planner *);
extern void X(codelet_hf2_25)(planner *);
/* r2cfII family */
extern void X(codelet_r2cfII_2)(planner *);
extern void X(codelet_r2cfII_3)(planner *);
extern void X(codelet_r2cfII_4)(planner *);
extern void X(codelet_r2cfII_5)(planner *);
extern void X(codelet_r2cfII_6)(planner *);
extern void X(codelet_r2cfII_7)(planner *);
extern void X(codelet_r2cfII_8)(planner *);
extern void X(codelet_r2cfII_9)(planner *);
extern void X(codelet_r2cfII_10)(planner *);
extern void X(codelet_r2cfII_12)(planner *);
extern void X(codelet_r2cfII_15)(planner *);
extern void X(codelet_r2cfII_16)(planner *);
extern void X(codelet_r2cfII_32)(planner *);
extern void X(codelet_r2cfII_64)(planner *);
extern void X(codelet_r2cfII_20)(planner *);
extern void X(codelet_r2cfII_25)(planner *);
/* hc2cf family */
extern void X(codelet_hc2cf_2)(planner *);
extern void X(codelet_hc2cf_4)(planner *);
extern void X(codelet_hc2cf_6)(planner *);
extern void X(codelet_hc2cf_8)(planner *);
extern void X(codelet_hc2cf_10)(planner *);
extern void X(codelet_hc2cf_12)(planner *);
extern void X(codelet_hc2cf_16)(planner *);
extern void X(codelet_hc2cf_32)(planner *);
extern void X(codelet_hc2cf_20)(planner *);
/* hc2cf2 family */
extern void X(codelet_hc2cf2_4)(planner *);
extern void X(codelet_hc2cf2_8)(planner *);
extern void X(codelet_hc2cf2_16)(planner *);
extern void X(codelet_hc2cf2_32)(planner *);
extern void X(codelet_hc2cf2_20)(planner *);
/* hc2cfdft family */
extern void X(codelet_hc2cfdft_2)(planner *);
extern void X(codelet_hc2cfdft_4)(planner *);
extern void X(codelet_hc2cfdft_6)(planner *);
extern void X(codelet_hc2cfdft_8)(planner *);
extern void X(codelet_hc2cfdft_10)(planner *);
extern void X(codelet_hc2cfdft_12)(planner *);
extern void X(codelet_hc2cfdft_16)(planner *);
extern void X(codelet_hc2cfdft_32)(planner *);
extern void X(codelet_hc2cfdft_20)(planner *);
/* hc2cfdft2 family */
extern void X(codelet_hc2cfdft2_4)(planner *);
extern void X(codelet_hc2cfdft2_8)(planner *);
extern void X(codelet_hc2cfdft2_16)(planner *);
extern void X(codelet_hc2cfdft2_32)(planner *);
extern void X(codelet_hc2cfdft2_20)(planner *);
/*
 * Solver table for this directory: one SOLVTAB() entry per codelet
 * registration function declared above, terminated by SOLVTAB_END.  The
 * entries appear in the same family order as the declarations (r2cf, hf,
 * hf2, r2cfII, hc2cf, hc2cf2, hc2cfdft, hc2cfdft2).
 * NOTE(review): presumably the planner walks this table and calls each entry
 * to register its codelet -- confirm against the solvtab handling in kernel/.
 */
extern const solvtab X(solvtab_rdft_r2cf);
const solvtab X(solvtab_rdft_r2cf) = {
/* r2cf family */
SOLVTAB(X(codelet_r2cf_2)),
SOLVTAB(X(codelet_r2cf_3)),
SOLVTAB(X(codelet_r2cf_4)),
SOLVTAB(X(codelet_r2cf_5)),
SOLVTAB(X(codelet_r2cf_6)),
SOLVTAB(X(codelet_r2cf_7)),
SOLVTAB(X(codelet_r2cf_8)),
SOLVTAB(X(codelet_r2cf_9)),
SOLVTAB(X(codelet_r2cf_10)),
SOLVTAB(X(codelet_r2cf_11)),
SOLVTAB(X(codelet_r2cf_12)),
SOLVTAB(X(codelet_r2cf_13)),
SOLVTAB(X(codelet_r2cf_14)),
SOLVTAB(X(codelet_r2cf_15)),
SOLVTAB(X(codelet_r2cf_16)),
SOLVTAB(X(codelet_r2cf_32)),
SOLVTAB(X(codelet_r2cf_64)),
SOLVTAB(X(codelet_r2cf_128)),
SOLVTAB(X(codelet_r2cf_20)),
SOLVTAB(X(codelet_r2cf_25)),
/* hf family */
SOLVTAB(X(codelet_hf_2)),
SOLVTAB(X(codelet_hf_3)),
SOLVTAB(X(codelet_hf_4)),
SOLVTAB(X(codelet_hf_5)),
SOLVTAB(X(codelet_hf_6)),
SOLVTAB(X(codelet_hf_7)),
SOLVTAB(X(codelet_hf_8)),
SOLVTAB(X(codelet_hf_9)),
SOLVTAB(X(codelet_hf_10)),
SOLVTAB(X(codelet_hf_12)),
SOLVTAB(X(codelet_hf_15)),
SOLVTAB(X(codelet_hf_16)),
SOLVTAB(X(codelet_hf_32)),
SOLVTAB(X(codelet_hf_64)),
SOLVTAB(X(codelet_hf_20)),
SOLVTAB(X(codelet_hf_25)),
/* hf2 family */
SOLVTAB(X(codelet_hf2_4)),
SOLVTAB(X(codelet_hf2_8)),
SOLVTAB(X(codelet_hf2_16)),
SOLVTAB(X(codelet_hf2_32)),
SOLVTAB(X(codelet_hf2_5)),
SOLVTAB(X(codelet_hf2_20)),
SOLVTAB(X(codelet_hf2_25)),
/* r2cfII family */
SOLVTAB(X(codelet_r2cfII_2)),
SOLVTAB(X(codelet_r2cfII_3)),
SOLVTAB(X(codelet_r2cfII_4)),
SOLVTAB(X(codelet_r2cfII_5)),
SOLVTAB(X(codelet_r2cfII_6)),
SOLVTAB(X(codelet_r2cfII_7)),
SOLVTAB(X(codelet_r2cfII_8)),
SOLVTAB(X(codelet_r2cfII_9)),
SOLVTAB(X(codelet_r2cfII_10)),
SOLVTAB(X(codelet_r2cfII_12)),
SOLVTAB(X(codelet_r2cfII_15)),
SOLVTAB(X(codelet_r2cfII_16)),
SOLVTAB(X(codelet_r2cfII_32)),
SOLVTAB(X(codelet_r2cfII_64)),
SOLVTAB(X(codelet_r2cfII_20)),
SOLVTAB(X(codelet_r2cfII_25)),
/* hc2cf family */
SOLVTAB(X(codelet_hc2cf_2)),
SOLVTAB(X(codelet_hc2cf_4)),
SOLVTAB(X(codelet_hc2cf_6)),
SOLVTAB(X(codelet_hc2cf_8)),
SOLVTAB(X(codelet_hc2cf_10)),
SOLVTAB(X(codelet_hc2cf_12)),
SOLVTAB(X(codelet_hc2cf_16)),
SOLVTAB(X(codelet_hc2cf_32)),
SOLVTAB(X(codelet_hc2cf_20)),
/* hc2cf2 family */
SOLVTAB(X(codelet_hc2cf2_4)),
SOLVTAB(X(codelet_hc2cf2_8)),
SOLVTAB(X(codelet_hc2cf2_16)),
SOLVTAB(X(codelet_hc2cf2_32)),
SOLVTAB(X(codelet_hc2cf2_20)),
/* hc2cfdft family */
SOLVTAB(X(codelet_hc2cfdft_2)),
SOLVTAB(X(codelet_hc2cfdft_4)),
SOLVTAB(X(codelet_hc2cfdft_6)),
SOLVTAB(X(codelet_hc2cfdft_8)),
SOLVTAB(X(codelet_hc2cfdft_10)),
SOLVTAB(X(codelet_hc2cfdft_12)),
SOLVTAB(X(codelet_hc2cfdft_16)),
SOLVTAB(X(codelet_hc2cfdft_32)),
SOLVTAB(X(codelet_hc2cfdft_20)),
/* hc2cfdft2 family */
SOLVTAB(X(codelet_hc2cfdft2_4)),
SOLVTAB(X(codelet_hc2cfdft2_8)),
SOLVTAB(X(codelet_hc2cfdft2_16)),
SOLVTAB(X(codelet_hc2cfdft2_32)),
SOLVTAB(X(codelet_hc2cfdft2_20)),
SOLVTAB_END
};

View File

@@ -0,0 +1,836 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:35 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include rdft/scalar/hc2cf.h */
/*
* This function contains 196 FP additions, 134 FP multiplications,
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
* 90 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf2_16, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel loads eight elements (indices
 * WS(rs, 0..7)) from each of Rp, Ip, Rm and Im, combines them with
 * coefficients derived from W[0..7], and stores 32 results back into the
 * same four arrays.  Between iterations Rp and Ip advance by ms, Rm and Im
 * step back by ms, and W advances by 8.
 *
 * Within one iteration all 32 loads are issued before the first store.
 */
static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: cos(pi/8), tan(pi/8) = sqrt(2) - 1, and 1/sqrt(2). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W is first offset by (mb - 1) * 8 so that it points at the coefficients
 * for iteration mb; each iteration consumes 8 values. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
/* Coefficients: read the eight stored values W[0..7] and build all other
 * multipliers used below from products of them. */
{
E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
T2 = W[0];
Tf = W[2];
Tg = T2 * Tf;
TM = W[6];
TN = T2 * TM;
TO = W[7];
TS = T2 * TO;
T3 = W[4];
T4 = T2 * T3;
Tp = Tf * T3;
T6 = W[5];
Ta = T2 * T6;
Tt = Tf * T6;
T5 = W[1];
Th = W[3];
Tl = T2 * Th;
Tz = FMA(T5, Th, Tg);
Ti = FNMS(T5, Th, Tg);
T7 = FMA(T5, T6, T4);
TZ = FNMS(Th, T3, Tt);
TT = FNMS(T5, TM, TS);
Tq = FNMS(Th, T6, Tp);
TW = FMA(Th, T6, Tp);
Tb = FNMS(T5, T3, Ta);
Tu = FMA(Th, T3, Tt);
TP = FMA(T5, TO, TN);
TI = FMA(T5, T3, Ta);
TF = FNMS(T5, T6, T4);
{
E T1y, T1C, T1e, T1i;
T1y = Tz * T3;
T1C = Tz * T6;
TC = FNMS(T5, Tf, Tl);
T1z = FMA(TC, T6, T1y);
T1O = FMA(TC, T3, T1C);
T1D = FNMS(TC, T3, T1C);
T1L = FNMS(TC, T6, T1y);
T1e = Ti * T3;
T1i = Ti * T6;
Tm = FMA(T5, Tf, Tl);
T1f = FMA(Tm, T6, T1e);
T1p = FMA(Tm, T3, T1i);
T1j = FNMS(Tm, T3, T1i);
T1m = FNMS(Tm, T6, T1e);
}
}
{
E Te, T1U, T3A, T3L, T1G, T2D, T2B, T3h, T1R, T2w, T2I, T3i, Tx, T3M, T1Z;
E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
E T2d, T38;
/* Input stage: the next eight sub-blocks load all 32 inputs, apply the
 * coefficients (Rp[0] and Rm[0] are used unscaled) and form pairwise
 * sums and differences. */
{
E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
T1 = Rp[0];
T3z = Rm[0];
T8 = Rp[WS(rs, 4)];
T9 = T7 * T8;
Tc = Rm[WS(rs, 4)];
T3x = T7 * Tc;
Td = FMA(Tb, Tc, T9);
Te = T1 + Td;
T1U = T1 - Td;
T3y = FNMS(Tb, T8, T3x);
T3A = T3y + T3z;
T3L = T3z - T3y;
}
{
E T1u, T1v, T1w, T2x, T1A, T1B, T1E, T2z;
T1u = Ip[WS(rs, 7)];
T1v = TM * T1u;
T1w = Im[WS(rs, 7)];
T2x = TM * T1w;
T1A = Ip[WS(rs, 3)];
T1B = T1z * T1A;
T1E = Im[WS(rs, 3)];
T2z = T1z * T1E;
{
E T1x, T1F, T2y, T2A;
T1x = FMA(TO, T1w, T1v);
T1F = FMA(T1D, T1E, T1B);
T1G = T1x + T1F;
T2D = T1x - T1F;
T2y = FNMS(TO, T1u, T2x);
T2A = FNMS(T1D, T1A, T2z);
T2B = T2y - T2A;
T3h = T2y + T2A;
}
}
{
E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
T1H = Ip[WS(rs, 1)];
T1I = Tf * T1H;
T1J = Im[WS(rs, 1)];
T2E = Tf * T1J;
T1M = Ip[WS(rs, 5)];
T1N = T1L * T1M;
T1P = Im[WS(rs, 5)];
T2G = T1L * T1P;
{
E T1K, T1Q, T2F, T2H;
T1K = FMA(Th, T1J, T1I);
T1Q = FMA(T1O, T1P, T1N);
T1R = T1K + T1Q;
T2w = T1Q - T1K;
T2F = FNMS(Th, T1H, T2E);
T2H = FNMS(T1O, T1M, T2G);
T2I = T2F - T2H;
T3i = T2F + T2H;
}
}
{
E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
Tj = Rp[WS(rs, 2)];
Tk = Ti * Tj;
Tn = Rm[WS(rs, 2)];
T1V = Ti * Tn;
Tr = Rp[WS(rs, 6)];
Ts = Tq * Tr;
Tv = Rm[WS(rs, 6)];
T1X = Tq * Tv;
{
E To, Tw, T1W, T1Y;
To = FMA(Tm, Tn, Tk);
Tw = FMA(Tu, Tv, Ts);
Tx = To + Tw;
T3M = To - Tw;
T1W = FNMS(Tm, Tj, T1V);
T1Y = FNMS(Tu, Tr, T1X);
T1Z = T1W - T1Y;
T3w = T1W + T1Y;
}
}
{
E TA, TB, TD, T21, TG, TH, TJ, T23;
TA = Rp[WS(rs, 1)];
TB = Tz * TA;
TD = Rm[WS(rs, 1)];
T21 = Tz * TD;
TG = Rp[WS(rs, 5)];
TH = TF * TG;
TJ = Rm[WS(rs, 5)];
T23 = TF * TJ;
{
E TE, TK, T22, T24;
TE = FMA(TC, TD, TB);
TK = FMA(TI, TJ, TH);
TL = TE + TK;
T26 = TE - TK;
T22 = FNMS(TC, TA, T21);
T24 = FNMS(TI, TG, T23);
T25 = T22 - T24;
T37 = T22 + T24;
}
}
{
E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
T15 = Ip[0];
T16 = T2 * T15;
T17 = Im[0];
T2h = T2 * T17;
T19 = Ip[WS(rs, 4)];
T1a = T3 * T19;
T1b = Im[WS(rs, 4)];
T2j = T3 * T1b;
{
E T18, T1c, T2i, T2k;
T18 = FMA(T5, T17, T16);
T1c = FMA(T6, T1b, T1a);
T1d = T18 + T1c;
T2o = T18 - T1c;
T2i = FNMS(T5, T15, T2h);
T2k = FNMS(T6, T19, T2j);
T2l = T2i - T2k;
T3c = T2i + T2k;
}
}
{
E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
T1g = Ip[WS(rs, 2)];
T1h = T1f * T1g;
T1k = Im[WS(rs, 2)];
T2p = T1f * T1k;
T1n = Ip[WS(rs, 6)];
T1o = T1m * T1n;
T1q = Im[WS(rs, 6)];
T2r = T1m * T1q;
{
E T1l, T1r, T2q, T2s;
T1l = FMA(T1j, T1k, T1h);
T1r = FMA(T1p, T1q, T1o);
T1s = T1l + T1r;
T2m = T1l - T1r;
T2q = FNMS(T1j, T1g, T2p);
T2s = FNMS(T1p, T1n, T2r);
T2t = T2q - T2s;
T3d = T2q + T2s;
}
}
{
E TQ, TR, TU, T29, TX, TY, T10, T2b;
TQ = Rp[WS(rs, 7)];
TR = TP * TQ;
TU = Rm[WS(rs, 7)];
T29 = TP * TU;
TX = Rp[WS(rs, 3)];
TY = TW * TX;
T10 = Rm[WS(rs, 3)];
T2b = TW * T10;
{
E TV, T11, T2a, T2c;
TV = FMA(TT, TU, TR);
T11 = FMA(TZ, T10, TY);
T12 = TV + T11;
T28 = TV - T11;
T2a = FNMS(TT, TQ, T29);
T2c = FNMS(TZ, TX, T2b);
T2d = T2a - T2c;
T38 = T2a + T2c;
}
}
/* Output group 1: Rp/Ip indices 0 and 4, Rm/Im indices 7 and 3
 * (sums and differences only, no constant multipliers). */
{
E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
{
E Ty, T13, T3v, T3B;
Ty = Te + Tx;
T13 = TL + T12;
T14 = Ty + T13;
T3q = Ty - T13;
T3v = T37 + T38;
T3B = T3w + T3A;
T3C = T3v + T3B;
T3E = T3B - T3v;
}
{
E T1t, T1S, T3r, T3s;
T1t = T1d + T1s;
T1S = T1G + T1R;
T1T = T1t + T1S;
T3D = T1S - T1t;
T3r = T3c + T3d;
T3s = T3h + T3i;
T3t = T3r - T3s;
T3u = T3r + T3s;
}
Rm[WS(rs, 7)] = T14 - T1T;
Im[WS(rs, 7)] = T3u - T3C;
Rp[0] = T14 + T1T;
Ip[0] = T3u + T3C;
Rm[WS(rs, 3)] = T3q - T3t;
Im[WS(rs, 3)] = T3D - T3E;
Rp[WS(rs, 4)] = T3q + T3t;
Ip[WS(rs, 4)] = T3D + T3E;
}
/* Output group 2: Rp/Ip indices 2 and 6, Rm/Im indices 5 and 1
 * (scaled by KP707106781). */
{
E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
{
E T36, T39, T3F, T3G;
T36 = Te - Tx;
T39 = T37 - T38;
T3a = T36 + T39;
T3m = T36 - T39;
T3F = T12 - TL;
T3G = T3A - T3w;
T3H = T3F + T3G;
T3J = T3G - T3F;
}
{
E T3b, T3e, T3g, T3j;
T3b = T1d - T1s;
T3e = T3c - T3d;
T3f = T3b + T3e;
T3n = T3e - T3b;
T3g = T1G - T1R;
T3j = T3h - T3i;
T3k = T3g - T3j;
T3o = T3g + T3j;
}
{
E T3l, T3I, T3p, T3K;
T3l = T3f + T3k;
Rm[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
Rp[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
T3I = T3n + T3o;
Im[WS(rs, 5)] = FMS(KP707106781, T3I, T3H);
Ip[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
T3p = T3n - T3o;
Rm[WS(rs, 1)] = FNMS(KP707106781, T3p, T3m);
Rp[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
T3K = T3k - T3f;
Im[WS(rs, 1)] = FMS(KP707106781, T3K, T3J);
Ip[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
}
}
/* Output group 3: the remaining 16 results (Rp/Ip indices 1, 3, 5, 7 and
 * Rm/Im indices 0, 2, 4, 6), using all three constants. */
{
E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
E T2O;
{
E T27, T2e, T2n, T2u;
T20 = T1U - T1Z;
T3N = T3L - T3M;
T3T = T3M + T3L;
T2Q = T1U + T1Z;
T27 = T25 - T26;
T2e = T28 + T2d;
T2f = T27 - T2e;
T3O = T27 + T2e;
{
E T2Y, T2Z, T2R, T2S;
T2Y = T2D + T2I;
T2Z = T2B + T2w;
T30 = FNMS(KP414213562, T2Z, T2Y);
T34 = FMA(KP414213562, T2Y, T2Z);
T2R = T26 + T25;
T2S = T28 - T2d;
T2T = T2R + T2S;
T3U = T2S - T2R;
}
T2n = T2l + T2m;
T2u = T2o - T2t;
T2v = FMA(KP414213562, T2u, T2n);
T2N = FNMS(KP414213562, T2n, T2u);
{
E T2V, T2W, T2C, T2J;
T2V = T2o + T2t;
T2W = T2l - T2m;
T2X = FMA(KP414213562, T2W, T2V);
T33 = FNMS(KP414213562, T2V, T2W);
T2C = T2w - T2B;
T2J = T2D - T2I;
T2K = FMA(KP414213562, T2J, T2C);
T2O = FNMS(KP414213562, T2C, T2J);
}
}
{
E T2g, T2L, T3V, T3W;
T2g = FMA(KP707106781, T2f, T20);
T2L = T2v + T2K;
Rm[WS(rs, 4)] = FNMS(KP923879532, T2L, T2g);
Rp[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
T3V = FMA(KP707106781, T3U, T3T);
T3W = T2O - T2N;
Im[WS(rs, 4)] = FMS(KP923879532, T3W, T3V);
Ip[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
}
{
E T2M, T2P, T3X, T3Y;
T2M = FNMS(KP707106781, T2f, T20);
T2P = T2N + T2O;
Rp[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
Rm[0] = FMA(KP923879532, T2P, T2M);
T3X = FNMS(KP707106781, T3U, T3T);
T3Y = T2K - T2v;
Im[0] = FMS(KP923879532, T3Y, T3X);
Ip[WS(rs, 7)] = FMA(KP923879532, T3Y, T3X);
}
{
E T2U, T31, T3P, T3Q;
T2U = FMA(KP707106781, T2T, T2Q);
T31 = T2X + T30;
Rm[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
Rp[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
T3P = FMA(KP707106781, T3O, T3N);
T3Q = T33 + T34;
Im[WS(rs, 6)] = FMS(KP923879532, T3Q, T3P);
Ip[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
}
{
E T32, T35, T3R, T3S;
T32 = FNMS(KP707106781, T2T, T2Q);
T35 = T33 - T34;
Rm[WS(rs, 2)] = FNMS(KP923879532, T35, T32);
Rp[WS(rs, 5)] = FMA(KP923879532, T35, T32);
T3R = FNMS(KP707106781, T3O, T3N);
T3S = T30 - T2X;
Im[WS(rs, 2)] = FMS(KP923879532, T3S, T3R);
Ip[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
}
}
}
}
}
}
/*
 * Twiddle layout for hc2cf2_16: four TW_CEXP entries followed by the TW_NEXT
 * terminator.  The kernel reads W[0]..W[7] per iteration, i.e. two values
 * per entry.
 * NOTE(review): the third field (1, 3, 9, 15) presumably selects which
 * twiddle power is stored -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: 16 and the name match the generator's -n/-name
 * arguments; { 104, 42, 92, 0 } repeats the addition / multiplication /
 * fused multiply-add counts quoted in the generator comment for this
 * variant. */
static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, { 104, 42, 92, 0 } };
/* Registers the kernel and its descriptor with planner p through
 * X(khc2c_register), using the HC2C_VIA_RDFT kind. */
void X(codelet_hc2cf2_16) (planner *p) {
X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include rdft/scalar/hc2cf.h */
/*
* This function contains 196 FP additions, 108 FP multiplications,
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
* 82 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf2_16, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel loads eight elements (indices
 * WS(rs, 0..7)) from each of Rp, Ip, Rm and Im, combines them with
 * coefficients derived from W[0..7], and stores 32 results back into the
 * same four arrays.  Between iterations Rp and Ip advance by ms, Rm and Im
 * step back by ms, and W advances by 8.
 *
 * Within one iteration all 32 loads are issued before the first store.
 */
static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: sin(pi/8), cos(pi/8), and 1/sqrt(2). */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W is first offset by (mb - 1) * 8 so that it points at the coefficients
 * for iteration mb; each iteration consumes 8 values. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
/* Coefficients: read the eight stored values W[0..7] and build all other
 * multipliers used below from sums and differences of their products. */
{
E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
{
E Th, Tn, Tj, Tm;
T2 = W[0];
T5 = W[1];
Tg = W[2];
Ti = W[3];
Th = T2 * Tg;
Tn = T5 * Tg;
Tj = T5 * Ti;
Tm = T2 * Ti;
Tk = Th - Tj;
To = Tm + Tn;
TE = Tm - Tn;
TC = Th + Tj;
T6 = W[5];
T7 = T5 * T6;
Tv = Tg * T6;
Ta = T2 * T6;
Ts = Ti * T6;
T3 = W[4];
T4 = T2 * T3;
Tw = Ti * T3;
Tb = T5 * T3;
Tr = Tg * T3;
}
T8 = T4 + T7;
TW = Tv - Tw;
TJ = Ta + Tb;
Tt = Tr - Ts;
TU = Tr + Ts;
Tc = Ta - Tb;
Tx = Tv + Tw;
TH = T4 - T7;
TN = W[6];
TO = W[7];
TP = FMA(T2, TN, T5 * TO);
TR = FNMS(T5, TN, T2 * TO);
{
E T1d, T1e, T19, T1a;
T1d = Tk * T6;
T1e = To * T3;
T1f = T1d - T1e;
T1k = T1d + T1e;
T19 = Tk * T3;
T1a = To * T6;
T1b = T19 + T1a;
T1i = T19 - T1a;
}
{
E T1w, T1x, T1s, T1t;
T1w = TC * T6;
T1x = TE * T3;
T1y = T1w - T1x;
T1H = T1w + T1x;
T1s = TC * T3;
T1t = TE * T6;
T1u = T1s + T1t;
T1F = T1s - T1t;
}
}
{
E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
E T2S, T2T, T28, T2A, T2d, T2B;
/* Input stage: the next six sub-blocks load all 32 inputs, apply the
 * coefficients (Rp[0] and Rm[0] are used unscaled) and form pairwise
 * sums and differences. */
{
E T1, T3d, Te, T3c, T9, Td;
T1 = Rp[0];
T3d = Rm[0];
T9 = Rp[WS(rs, 4)];
Td = Rm[WS(rs, 4)];
Te = FMA(T8, T9, Tc * Td);
T3c = FNMS(Tc, T9, T8 * Td);
Tf = T1 + Te;
T3r = T3d - T3c;
T1N = T1 - Te;
T3e = T3c + T3d;
}
{
E Tq, T1O, Tz, T1P;
{
E Tl, Tp, Tu, Ty;
Tl = Rp[WS(rs, 2)];
Tp = Rm[WS(rs, 2)];
Tq = FMA(Tk, Tl, To * Tp);
T1O = FNMS(To, Tl, Tk * Tp);
Tu = Rp[WS(rs, 6)];
Ty = Rm[WS(rs, 6)];
Tz = FMA(Tt, Tu, Tx * Ty);
T1P = FNMS(Tx, Tu, Tt * Ty);
}
TA = Tq + Tz;
T3s = Tq - Tz;
T1Q = T1O - T1P;
T3b = T1O + T1P;
}
{
E TG, T1S, TL, T1T, T1U, T1V;
{
E TD, TF, TI, TK;
TD = Rp[WS(rs, 1)];
TF = Rm[WS(rs, 1)];
TG = FMA(TC, TD, TE * TF);
T1S = FNMS(TE, TD, TC * TF);
TI = Rp[WS(rs, 5)];
TK = Rm[WS(rs, 5)];
TL = FMA(TH, TI, TJ * TK);
T1T = FNMS(TJ, TI, TH * TK);
}
TM = TG + TL;
T2M = T1S + T1T;
T1U = T1S - T1T;
T1V = TG - TL;
T1W = T1U - T1V;
T2w = T1V + T1U;
}
{
E TT, T1Y, TY, T1Z, T1X, T20;
{
E TQ, TS, TV, TX;
TQ = Rp[WS(rs, 7)];
TS = Rm[WS(rs, 7)];
TT = FMA(TP, TQ, TR * TS);
T1Y = FNMS(TR, TQ, TP * TS);
TV = Rp[WS(rs, 3)];
TX = Rm[WS(rs, 3)];
TY = FMA(TU, TV, TW * TX);
T1Z = FNMS(TW, TV, TU * TX);
}
TZ = TT + TY;
T2N = T1Y + T1Z;
T1X = TT - TY;
T20 = T1Y - T1Z;
T21 = T1X + T20;
T2x = T1X - T20;
}
{
E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
{
E T1p, T1q, T1G, T1I;
T1p = Ip[WS(rs, 7)];
T1q = Im[WS(rs, 7)];
T1r = FMA(TN, T1p, TO * T1q);
T2k = FNMS(TO, T1p, TN * T1q);
T1G = Ip[WS(rs, 5)];
T1I = Im[WS(rs, 5)];
T1J = FMA(T1F, T1G, T1H * T1I);
T2h = FNMS(T1H, T1G, T1F * T1I);
}
{
E T1v, T1z, T1C, T1D;
T1v = Ip[WS(rs, 3)];
T1z = Im[WS(rs, 3)];
T1A = FMA(T1u, T1v, T1y * T1z);
T2l = FNMS(T1y, T1v, T1u * T1z);
T1C = Ip[WS(rs, 1)];
T1D = Im[WS(rs, 1)];
T1E = FMA(Tg, T1C, Ti * T1D);
T2g = FNMS(Ti, T1C, Tg * T1D);
}
T1B = T1r + T1A;
T1K = T1E + T1J;
T2V = T1B - T1K;
T2W = T2k + T2l;
T2X = T2g + T2h;
T2Y = T2W - T2X;
{
E T2f, T2i, T2m, T2n;
T2f = T1r - T1A;
T2i = T2g - T2h;
T2j = T2f - T2i;
T2D = T2f + T2i;
T2m = T2k - T2l;
T2n = T1E - T1J;
T2o = T2m + T2n;
T2E = T2m - T2n;
}
}
{
E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
{
E T12, T13, T1j, T1l;
T12 = Ip[0];
T13 = Im[0];
T14 = FMA(T2, T12, T5 * T13);
T24 = FNMS(T5, T12, T2 * T13);
T1j = Ip[WS(rs, 6)];
T1l = Im[WS(rs, 6)];
T1m = FMA(T1i, T1j, T1k * T1l);
T2b = FNMS(T1k, T1j, T1i * T1l);
}
{
E T15, T16, T1c, T1g;
T15 = Ip[WS(rs, 4)];
T16 = Im[WS(rs, 4)];
T17 = FMA(T3, T15, T6 * T16);
T25 = FNMS(T6, T15, T3 * T16);
T1c = Ip[WS(rs, 2)];
T1g = Im[WS(rs, 2)];
T1h = FMA(T1b, T1c, T1f * T1g);
T2a = FNMS(T1f, T1c, T1b * T1g);
}
T18 = T14 + T17;
T1n = T1h + T1m;
T2Q = T18 - T1n;
T2R = T24 + T25;
T2S = T2a + T2b;
T2T = T2R - T2S;
{
E T26, T27, T29, T2c;
T26 = T24 - T25;
T27 = T1h - T1m;
T28 = T26 + T27;
T2A = T26 - T27;
T29 = T14 - T17;
T2c = T2a - T2b;
T2d = T29 - T2c;
T2B = T29 + T2c;
}
}
/* Output group 1: Rp/Ip indices 3 and 7, Rm/Im indices 4 and 0. */
{
E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
{
E T1R, T22, T3y, T3z;
T1R = T1N - T1Q;
T22 = KP707106781 * (T1W - T21);
T23 = T1R + T22;
T2r = T1R - T22;
T3y = KP707106781 * (T2x - T2w);
T3z = T3s + T3r;
T3A = T3y + T3z;
T3C = T3z - T3y;
}
{
E T2e, T2p, T2s, T2t;
T2e = FMA(KP923879532, T28, KP382683432 * T2d);
T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
T2q = T2e + T2p;
T3B = T2p - T2e;
T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
T2u = T2s - T2t;
T3x = T2s + T2t;
}
Rm[WS(rs, 4)] = T23 - T2q;
Im[WS(rs, 4)] = T3x - T3A;
Rp[WS(rs, 3)] = T23 + T2q;
Ip[WS(rs, 3)] = T3x + T3A;
Rm[0] = T2r - T2u;
Im[0] = T3B - T3C;
Rp[WS(rs, 7)] = T2r + T2u;
Ip[WS(rs, 7)] = T3B + T3C;
}
/* Output group 2: Rp/Ip indices 2 and 6, Rm/Im indices 5 and 1. */
{
E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
{
E T2L, T2O, T3k, T3l;
T2L = Tf - TA;
T2O = T2M - T2N;
T2P = T2L + T2O;
T31 = T2L - T2O;
T3k = TZ - TM;
T3l = T3e - T3b;
T3m = T3k + T3l;
T3o = T3l - T3k;
}
{
E T2U, T2Z, T32, T33;
T2U = T2Q + T2T;
T2Z = T2V - T2Y;
T30 = KP707106781 * (T2U + T2Z);
T3n = KP707106781 * (T2Z - T2U);
T32 = T2T - T2Q;
T33 = T2V + T2Y;
T34 = KP707106781 * (T32 - T33);
T3j = KP707106781 * (T32 + T33);
}
Rm[WS(rs, 5)] = T2P - T30;
Im[WS(rs, 5)] = T3j - T3m;
Rp[WS(rs, 2)] = T2P + T30;
Ip[WS(rs, 2)] = T3j + T3m;
Rm[WS(rs, 1)] = T31 - T34;
Im[WS(rs, 1)] = T3n - T3o;
Rp[WS(rs, 6)] = T31 + T34;
Ip[WS(rs, 6)] = T3n + T3o;
}
/* Output group 3: Rp/Ip indices 1 and 5, Rm/Im indices 6 and 2. */
{
E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
{
E T2v, T2y, T3q, T3t;
T2v = T1N + T1Q;
T2y = KP707106781 * (T2w + T2x);
T2z = T2v + T2y;
T2H = T2v - T2y;
T3q = KP707106781 * (T1W + T21);
T3t = T3r - T3s;
T3u = T3q + T3t;
T3w = T3t - T3q;
}
{
E T2C, T2F, T2I, T2J;
T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
T2G = T2C + T2F;
T3v = T2F - T2C;
T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
T2K = T2I - T2J;
T3p = T2I + T2J;
}
Rm[WS(rs, 6)] = T2z - T2G;
Im[WS(rs, 6)] = T3p - T3u;
Rp[WS(rs, 1)] = T2z + T2G;
Ip[WS(rs, 1)] = T3p + T3u;
Rm[WS(rs, 2)] = T2H - T2K;
Im[WS(rs, 2)] = T3v - T3w;
Rp[WS(rs, 5)] = T2H + T2K;
Ip[WS(rs, 5)] = T3v + T3w;
}
/* Output group 4: Rp/Ip indices 0 and 4, Rm/Im indices 7 and 3
 * (sums and differences only, no constant multipliers). */
{
E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
{
E TB, T10, T3a, T3f;
TB = Tf + TA;
T10 = TM + TZ;
T11 = TB + T10;
T35 = TB - T10;
T3a = T2M + T2N;
T3f = T3b + T3e;
T3g = T3a + T3f;
T3i = T3f - T3a;
}
{
E T1o, T1L, T36, T37;
T1o = T18 + T1n;
T1L = T1B + T1K;
T1M = T1o + T1L;
T3h = T1L - T1o;
T36 = T2R + T2S;
T37 = T2W + T2X;
T38 = T36 - T37;
T39 = T36 + T37;
}
Rm[WS(rs, 7)] = T11 - T1M;
Im[WS(rs, 7)] = T39 - T3g;
Rp[0] = T11 + T1M;
Ip[0] = T39 + T3g;
Rm[WS(rs, 3)] = T35 - T38;
Im[WS(rs, 3)] = T3h - T3i;
Rp[WS(rs, 4)] = T35 + T38;
Ip[WS(rs, 4)] = T3h + T3i;
}
}
}
}
}
/*
 * Twiddle layout for hc2cf2_16: four TW_CEXP entries followed by the TW_NEXT
 * terminator.  The kernel reads W[0]..W[7] per iteration, i.e. two values
 * per entry.
 * NOTE(review): the third field (1, 3, 9, 15) presumably selects which
 * twiddle power is stored -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: 16 and the name match the generator's -n/-name
 * arguments; { 156, 68, 40, 0 } repeats the addition / multiplication /
 * fused multiply-add counts quoted in the generator comment for this
 * variant. */
static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, { 156, 68, 40, 0 } };
/* Registers the kernel and its descriptor with planner p through
 * X(khc2c_register), using the HC2C_VIA_RDFT kind. */
void X(codelet_hc2cf2_16) (planner *p) {
X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,200 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:34 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cf2_4 -include rdft/scalar/hc2cf.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf2_4, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel loads two elements (index 0 and
 * WS(rs, 1)) from each of Rp, Ip, Rm and Im, scales them with coefficients
 * taken or derived from W[0..3], and writes eight results back into the same
 * arrays.  Between iterations Rp and Ip advance by ms, Rm and Im step back
 * by ms, and W advances by 4.  All loads of an iteration happen before its
 * first store.
 */
static void hc2cf2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Point W at the coefficients for iteration mb (4 values per iteration). */
     W = W + ((mb - 1) * 4);
     for (m = mb; m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
          /* Stored coefficients and the pair derived from them. */
          E tw0 = W[0];
          E tw1 = W[1];
          E tw2 = W[2];
          E tw3 = W[3];
          E p02 = tw0 * tw2;
          E p03 = tw0 * tw3;
          E der0 = FMA(tw1, tw3, p02);
          E der1 = FNMS(tw1, tw2, p03);

          /* All eight inputs. */
          E rp0 = Rp[0];
          E rm0 = Rm[0];
          E rp1 = Rp[WS(rs, 1)];
          E rm1 = Rm[WS(rs, 1)];
          E ip0 = Ip[0];
          E im0 = Im[0];
          E ip1 = Ip[WS(rs, 1)];
          E im1 = Im[WS(rs, 1)];

          /* (Rp[1], Rm[1]) combined with the derived coefficients. */
          E a_lo = der0 * rp1;
          E a_hi = der0 * rm1;
          E a0 = FMA(der1, rm1, a_lo);
          E a1 = FNMS(der1, rp1, a_hi);

          /* (Ip[0], Im[0]) combined with W[0], W[1]. */
          E b_lo = tw0 * ip0;
          E b_hi = tw0 * im0;
          E b0 = FMA(tw1, im0, b_lo);
          E b1 = FNMS(tw1, ip0, b_hi);

          /* (Ip[1], Im[1]) combined with W[2], W[3]. */
          E c_lo = tw2 * ip1;
          E c_hi = tw2 * im1;
          E c0 = FMA(tw3, im1, c_lo);
          E c1 = FNMS(tw3, ip1, c_hi);

          /* Pairwise sums and differences feeding the stores. */
          E sum_a = rp0 + a0;
          E sum_bc0 = b0 + c0;
          E sum_bc1 = b1 + c1;
          E sum_am = a1 + rm0;
          E dif_a = rp0 - a0;
          E dif_bc1 = b1 - c1;
          E dif_cb0 = c0 - b0;
          E dif_ma = rm0 - a1;

          Rm[WS(rs, 1)] = sum_a - sum_bc0;
          Rp[0] = sum_a + sum_bc0;
          Im[WS(rs, 1)] = sum_bc1 - sum_am;
          Ip[0] = sum_bc1 + sum_am;
          Rm[0] = dif_a - dif_bc1;
          Rp[WS(rs, 1)] = dif_a + dif_bc1;
          Im[0] = dif_cb0 - dif_ma;
          Ip[WS(rs, 1)] = dif_cb0 + dif_ma;
     }
}
/*
 * Twiddle instruction list for hc2cf2_4; passed to the planner through
 * `desc`.  Two TW_CEXP entries followed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry yields one complex factor
 * (two reals), which would match the four W entries the kernel consumes
 * per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for hc2cf2_4: the value 4 (the generator's -n
 * argument), the codelet name, its twiddle instruction list, the genus, and
 * the counts { 16, 8, 8, 0 }, whose first three entries repeat the
 * "16 additions, 8 multiplications, 8 fused multiply/add" figures from the
 * generated header comment.
 */
static const hc2c_desc desc = { 4, "hc2cf2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
/*
 * Registration entry point: hands the hc2cf2_4 kernel and its descriptor to
 * planner `p` via X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf2_4) (planner *p) {
X(khc2c_register) (p, hc2cf2_4, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cf2_4 -include rdft/scalar/hc2cf.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf2_4 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * In-place kernel over the arrays Rp, Ip, Rm and Im.  For every m in
 * [mb, me) it loads the eight values at offsets 0 and WS(rs, 1) of each
 * array, scales six of them by factor pairs built from W[0..3] (Rp[0] and
 * Rm[0] are used unscaled), and stores eight sums/differences back to the
 * same locations.
 *
 * Rp, Ip  step forward by ms after each iteration.
 * Rm, Im  step backward by ms after each iteration.
 * W       factor table; starts at W + (mb - 1) * 4, advances 4 per iteration.
 * rs      stride handle, used only through WS() and MAKE_VOLATILE_STRIDE().
 * mb, me  half-open range of the loop counter.
 *
 * NOTE(review): the formulas in the comments assume the project's usual
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm
 * against the header that defines them.
 */
static void hc2cf2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W += (mb - 1) * 4;
          m < me;
          ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 4, MAKE_VOLATILE_STRIDE(16, rs)) {
          /* two stored factor pairs: (W[0], W[1]) and (W[2], W[3]) */
          const E wa_re = W[0];
          const E wa_im = W[1];
          const E wb_re = W[2];
          const E wb_im = W[3];
          /* third pair, derived from the two stored ones */
          const E wc_re = FMA(wa_re, wb_re, wa_im * wb_im);
          const E wc_im = FNMS(wa_im, wb_re, wa_re * wb_im);

          /* unscaled inputs */
          const E rp0 = Rp[0];
          const E rm0 = Rm[0];

          /* (Rp[1], Rm[1]) scaled by the derived pair */
          const E rp1 = Rp[WS(rs, 1)];
          const E rm1 = Rm[WS(rs, 1)];
          const E c_re = FMA(wc_re, rp1, wc_im * rm1);
          const E c_im = FNMS(wc_im, rp1, wc_re * rm1);

          /* (Ip[0], Im[0]) scaled by (W[0], W[1]) */
          const E ip0 = Ip[0];
          const E im0 = Im[0];
          const E a_re = FMA(wa_re, ip0, wa_im * im0);
          const E a_im = FNMS(wa_im, ip0, wa_re * im0);

          /* (Ip[1], Im[1]) scaled by (W[2], W[3]) */
          const E ip1 = Ip[WS(rs, 1)];
          const E im1 = Im[WS(rs, 1)];
          const E b_re = FMA(wb_re, ip1, wb_im * im1);
          const E b_im = FNMS(wb_im, ip1, wb_re * im1);

          {
               /* first output group */
               const E even_re = rp0 + c_re;
               const E odd_re = a_re + b_re;
               const E odd_im = a_im + b_im;
               const E even_im = c_im + rm0;

               Rm[WS(rs, 1)] = even_re - odd_re;
               Rp[0] = even_re + odd_re;
               Im[WS(rs, 1)] = odd_im - even_im;
               Ip[0] = odd_im + even_im;
          }
          {
               /* second output group */
               const E diff_re = rp0 - c_re;
               const E cross_im = a_im - b_im;
               const E cross_re = b_re - a_re;
               const E diff_im = rm0 - c_im;

               Rm[0] = diff_re - cross_im;
               Rp[WS(rs, 1)] = diff_re + cross_im;
               Im[0] = cross_re - diff_im;
               Ip[WS(rs, 1)] = cross_re + diff_im;
          }
     }
}
/*
 * Twiddle instruction list for hc2cf2_4 (non-FMA build); passed to the
 * planner through `desc`.  Two TW_CEXP entries followed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry yields one complex factor
 * (two reals), which would match the four W entries the kernel consumes
 * per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for hc2cf2_4 (non-FMA build): the value 4 (the
 * generator's -n argument), the codelet name, its twiddle instruction list,
 * the genus, and the counts { 16, 8, 8, 0 }, whose first three entries
 * repeat the "16 additions, 8 multiplications, 8 fused multiply/add"
 * figures from the generated header comment.
 */
static const hc2c_desc desc = { 4, "hc2cf2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
/*
 * Registration entry point (non-FMA build): hands the hc2cf2_4 kernel and
 * its descriptor to planner `p` via X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf2_4) (planner *p) {
X(khc2c_register) (p, hc2cf2_4, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,390 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:34 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include rdft/scalar/hc2cf.h */
/*
* This function contains 74 FP additions, 50 FP multiplications,
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
* 48 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf2_8 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * In-place kernel over the arrays Rp, Ip, Rm and Im.  For every m in
 * [mb, me) it loads the 16 values at offsets 0 and WS(rs, 1..3) of each
 * array, scales 14 of them by factor pairs built from W[0..5] (Rp[0] and
 * Rm[0] are used unscaled), combines the results with additions,
 * subtractions and multiplications by KP707106781, and stores 16 results
 * back to the same locations.
 *
 * Rp, Ip  step forward by ms after each iteration.
 * Rm, Im  step backward by ms after each iteration.
 * W       factor table; starts at W + (mb - 1) * 6, advances 6 per iteration.
 * rs      stride handle, used only through WS() and MAKE_VOLATILE_STRIDE().
 * mb, me  half-open range of the loop counter m.
 *
 * NOTE(review): comments below assume the project's usual
 * FMA(a, b, c) = a * b + c, FNMS(a, b, c) = c - a * b and
 * FMS(a, b, c) = a * b - c -- confirm against the defining header.
 */
static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* numerically sqrt(1/2) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
{
/*
 * Three factor pairs are loaded -- (T2, T5) = W[0..1], (T3, T6) = W[2..3],
 * (Tl, Tn) = W[4..5] -- and four more pairs are derived from them as
 * products: (Tf, Ti), (T7, Tb), (To, Ts) and (TC, TG).
 */
E T4, Tm, Tr, Ta, TB, TF;
T2 = W[0];
T3 = W[2];
T4 = T2 * T3;
Tl = W[4];
Tm = T2 * Tl;
Tn = W[5];
Tr = T2 * Tn;
T5 = W[1];
T6 = W[3];
Ta = T2 * T6;
Tf = FMA(T5, T6, T4);
T7 = FNMS(T5, T6, T4);
Ts = FNMS(T5, Tl, Tr);
Tb = FMA(T5, T3, Ta);
To = FMA(T5, Tn, Tm);
TB = Tf * Tl;
TF = Tf * Tn;
Ti = FNMS(T5, T3, Ta);
TC = FMA(Ti, Tn, TB);
TG = FNMS(Ti, Tl, TF);
}
{
E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
E TI, T11, T13, T15, T16;
/* unscaled inputs */
T1 = Rp[0];
T1s = Rm[0];
{
/* (Rp[2], Rm[2]) scaled by (T7, Tb) */
E T8, T9, Tc, T1q;
T8 = Rp[WS(rs, 2)];
T9 = T7 * T8;
Tc = Rm[WS(rs, 2)];
T1q = T7 * Tc;
Td = FMA(Tb, Tc, T9);
T1r = FNMS(Tb, T8, T1q);
}
{
/* (Rp[3], Rm[3]) scaled by (To, Ts) */
E Tp, Tq, Tt, TX;
Tp = Rp[WS(rs, 3)];
Tq = To * Tp;
Tt = Rm[WS(rs, 3)];
TX = To * Tt;
Tu = FMA(Ts, Tt, Tq);
TY = FNMS(Ts, Tp, TX);
}
{
/* (Rp[1], Rm[1]) scaled by (Tf, Ti) */
E Tg, Th, Tj, TV;
Tg = Rp[WS(rs, 1)];
Th = Tf * Tg;
Tj = Rm[WS(rs, 1)];
TV = Tf * Tj;
Tk = FMA(Ti, Tj, Th);
TW = FNMS(Ti, Tg, TV);
}
{
/* (Ip[3], Im[3]) scaled by (Tl, Tn); (Ip[1], Im[1]) scaled by (T3, T6) */
E TK, TL, TM, T19, TO, TP, TQ, T1b;
TK = Ip[WS(rs, 3)];
TL = Tl * TK;
TM = Im[WS(rs, 3)];
T19 = Tl * TM;
TO = Ip[WS(rs, 1)];
TP = T3 * TO;
TQ = Im[WS(rs, 1)];
T1b = T3 * TQ;
TN = FMA(Tn, TM, TL);
TR = FMA(T6, TQ, TP);
T18 = TN - TR;
T1a = FNMS(Tn, TK, T19);
T1c = FNMS(T6, TO, T1b);
T1d = T1a - T1c;
}
{
/* (Ip[0], Im[0]) scaled by (T2, T5); (Ip[2], Im[2]) scaled by (TC, TG) */
E Tx, Ty, Tz, T12, TD, TE, TH, T14;
Tx = Ip[0];
Ty = T2 * Tx;
Tz = Im[0];
T12 = T2 * Tz;
TD = Ip[WS(rs, 2)];
TE = TC * TD;
TH = Im[WS(rs, 2)];
T14 = TC * TH;
TA = FMA(T5, Tz, Ty);
TI = FMA(TG, TH, TE);
T11 = TA - TI;
T13 = FNMS(T5, Tx, T12);
T15 = FNMS(TG, TD, T14);
T16 = T13 - T15;
}
{
/* eight outputs that involve the KP707106781 factor */
E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
{
E TU, TZ, T1x, T1y;
TU = T1 - Td;
TZ = TW - TY;
T10 = TU + TZ;
T1g = TU - TZ;
T1x = T1s - T1r;
T1y = Tk - Tu;
T1z = T1x - T1y;
T1B = T1y + T1x;
}
{
E T17, T1e, T1h, T1i;
T17 = T11 + T16;
T1e = T18 - T1d;
T1f = T17 + T1e;
T1C = T1e - T17;
T1h = T16 - T11;
T1i = T18 + T1d;
T1j = T1h - T1i;
T1A = T1h + T1i;
}
Rm[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
Im[WS(rs, 2)] = FMS(KP707106781, T1A, T1z);
Rp[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
Ip[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
Rm[0] = FNMS(KP707106781, T1j, T1g);
Im[0] = FMS(KP707106781, T1C, T1B);
Rp[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
Ip[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
}
{
/* eight outputs formed from plain sums and differences */
E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
{
E Te, Tv, T1p, T1t;
Te = T1 + Td;
Tv = Tk + Tu;
Tw = Te + Tv;
T1k = Te - Tv;
T1p = TW + TY;
T1t = T1r + T1s;
T1u = T1p + T1t;
T1w = T1t - T1p;
}
{
E TJ, TS, T1l, T1m;
TJ = TA + TI;
TS = TN + TR;
TT = TJ + TS;
T1v = TS - TJ;
T1l = T13 + T15;
T1m = T1a + T1c;
T1n = T1l - T1m;
T1o = T1l + T1m;
}
Rm[WS(rs, 3)] = Tw - TT;
Im[WS(rs, 3)] = T1o - T1u;
Rp[0] = Tw + TT;
Ip[0] = T1o + T1u;
Rm[WS(rs, 1)] = T1k - T1n;
Im[WS(rs, 1)] = T1v - T1w;
Rp[WS(rs, 2)] = T1k + T1n;
Ip[WS(rs, 2)] = T1v + T1w;
}
}
}
}
}
/*
 * Twiddle instruction list for hc2cf2_8; passed to the planner through
 * `desc`.  Three TW_CEXP entries followed by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry yields one complex factor
 * (two reals), which would match the six W entries the kernel consumes
 * per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for hc2cf2_8: the value 8 (the generator's -n
 * argument), the codelet name, its twiddle instruction list, the genus, and
 * the counts { 44, 20, 30, 0 }, whose first three entries repeat the
 * "44 additions, 20 multiplications, 30 fused multiply/add" figures from
 * the generated header comment.
 */
static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, { 44, 20, 30, 0 } };
/*
 * Registration entry point: hands the hc2cf2_8 kernel and its descriptor to
 * planner `p` via X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf2_8) (planner *p) {
X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include rdft/scalar/hc2cf.h */
/*
* This function contains 74 FP additions, 44 FP multiplications,
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
* 42 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf2_8 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * In-place kernel over the arrays Rp, Ip, Rm and Im.  For every m in
 * [mb, me) it loads the 16 values at offsets 0 and WS(rs, 1..3) of each
 * array, scales 14 of them by factor pairs built from W[0..5] (Rp[0] and
 * Rm[0] are used unscaled), combines the results with additions,
 * subtractions and multiplications by KP707106781, and stores 16 results
 * back to the same locations.
 *
 * Rp, Ip  step forward by ms after each iteration.
 * Rm, Im  step backward by ms after each iteration.
 * W       factor table; starts at W + (mb - 1) * 6, advances 6 per iteration.
 * rs      stride handle, used only through WS() and MAKE_VOLATILE_STRIDE().
 * mb, me  half-open range of the loop counter m.
 *
 * NOTE(review): comments below assume the project's usual
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm
 * against the defining header.
 */
static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* numerically sqrt(1/2) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
{
/*
 * Three factor pairs are loaded -- (T2, T5) = W[0..1], (T3, T6) = W[2..3],
 * (Tl, Tm) = W[4..5] -- and four more pairs are derived from them as
 * products: (T8, Tc), (Tg, Ti), (Tn, Tp) and (Tx, Tz).
 */
E T4, Tb, T7, Ta;
T2 = W[0];
T5 = W[1];
T3 = W[2];
T6 = W[3];
T4 = T2 * T3;
Tb = T5 * T3;
T7 = T5 * T6;
Ta = T2 * T6;
T8 = T4 - T7;
Tc = Ta + Tb;
Tg = T4 + T7;
Ti = Ta - Tb;
Tl = W[4];
Tm = W[5];
Tn = FMA(T2, Tl, T5 * Tm);
Tz = FNMS(Ti, Tl, Tg * Tm);
Tp = FNMS(T5, Tl, T2 * Tm);
Tx = FMA(Tg, Tl, Ti * Tm);
}
{
E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
E TT;
{
/* Rp[0], Rm[0] unscaled; (Rp[2], Rm[2]) scaled by (T8, Tc) */
E T1, T1c, Te, T1b, T9, Td;
T1 = Rp[0];
T1c = Rm[0];
T9 = Rp[WS(rs, 2)];
Td = Rm[WS(rs, 2)];
Te = FMA(T8, T9, Tc * Td);
T1b = FNMS(Tc, T9, T8 * Td);
Tf = T1 + Te;
T1i = T1c - T1b;
TL = T1 - Te;
T1d = T1b + T1c;
}
{
E TF, TW, TI, TX;
{
/* (Ip[3], Im[3]) scaled by (Tl, Tm); (Ip[1], Im[1]) scaled by (T3, T6) */
E TD, TE, TG, TH;
TD = Ip[WS(rs, 3)];
TE = Im[WS(rs, 3)];
TF = FMA(Tl, TD, Tm * TE);
TW = FNMS(Tm, TD, Tl * TE);
TG = Ip[WS(rs, 1)];
TH = Im[WS(rs, 1)];
TI = FMA(T3, TG, T6 * TH);
TX = FNMS(T6, TG, T3 * TH);
}
TJ = TF + TI;
T17 = TW + TX;
TV = TF - TI;
TY = TW - TX;
}
{
E Tk, TM, Tr, TN;
{
/* (Rp[1], Rm[1]) scaled by (Tg, Ti); (Rp[3], Rm[3]) scaled by (Tn, Tp) */
E Th, Tj, To, Tq;
Th = Rp[WS(rs, 1)];
Tj = Rm[WS(rs, 1)];
Tk = FMA(Tg, Th, Ti * Tj);
TM = FNMS(Ti, Th, Tg * Tj);
To = Rp[WS(rs, 3)];
Tq = Rm[WS(rs, 3)];
Tr = FMA(Tn, To, Tp * Tq);
TN = FNMS(Tp, To, Tn * Tq);
}
Ts = Tk + Tr;
T1j = Tk - Tr;
TO = TM - TN;
T1a = TM + TN;
}
{
E Tw, TR, TB, TS;
{
/* (Ip[0], Im[0]) scaled by (T2, T5); (Ip[2], Im[2]) scaled by (Tx, Tz) */
E Tu, Tv, Ty, TA;
Tu = Ip[0];
Tv = Im[0];
Tw = FMA(T2, Tu, T5 * Tv);
TR = FNMS(T5, Tu, T2 * Tv);
Ty = Ip[WS(rs, 2)];
TA = Im[WS(rs, 2)];
TB = FMA(Tx, Ty, Tz * TA);
TS = FNMS(Tz, Ty, Tx * TA);
}
TC = Tw + TB;
T16 = TR + TS;
TQ = Tw - TB;
TT = TR - TS;
}
{
/* all loads are done; everything below only combines locals and stores */
E Tt, TK, T1f, T1g;
Tt = Tf + Ts;
TK = TC + TJ;
Rm[WS(rs, 3)] = Tt - TK;
Rp[0] = Tt + TK;
{
E T19, T1e, T15, T18;
T19 = T16 + T17;
T1e = T1a + T1d;
Im[WS(rs, 3)] = T19 - T1e;
Ip[0] = T19 + T1e;
T15 = Tf - Ts;
T18 = T16 - T17;
Rm[WS(rs, 1)] = T15 - T18;
Rp[WS(rs, 2)] = T15 + T18;
}
T1f = TJ - TC;
T1g = T1d - T1a;
Im[WS(rs, 1)] = T1f - T1g;
Ip[WS(rs, 2)] = T1f + T1g;
{
/* outputs involving the KP707106781 factor */
E T11, T1k, T14, T1h, T12, T13;
T11 = TL - TO;
T1k = T1i - T1j;
T12 = TT - TQ;
T13 = TV + TY;
T14 = KP707106781 * (T12 - T13);
T1h = KP707106781 * (T12 + T13);
Rm[0] = T11 - T14;
Ip[WS(rs, 1)] = T1h + T1k;
Rp[WS(rs, 3)] = T11 + T14;
Im[WS(rs, 2)] = T1h - T1k;
}
{
E TP, T1m, T10, T1l, TU, TZ;
TP = TL + TO;
T1m = T1j + T1i;
TU = TQ + TT;
TZ = TV - TY;
T10 = KP707106781 * (TU + TZ);
T1l = KP707106781 * (TZ - TU);
Rm[WS(rs, 2)] = TP - T10;
Ip[WS(rs, 3)] = T1l + T1m;
Rp[WS(rs, 1)] = TP + T10;
Im[0] = T1l - T1m;
}
}
}
}
}
}
/*
 * Twiddle instruction list for hc2cf2_8 (non-FMA build); passed to the
 * planner through `desc`.  Three TW_CEXP entries followed by a TW_NEXT
 * entry.
 * NOTE(review): presumably each TW_CEXP entry yields one complex factor
 * (two reals), which would match the six W entries the kernel consumes
 * per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for hc2cf2_8 (non-FMA build): the value 8 (the
 * generator's -n argument), the codelet name, its twiddle instruction list,
 * the genus, and the counts { 56, 26, 18, 0 }, whose first three entries
 * repeat the "56 additions, 26 multiplications, 18 fused multiply/add"
 * figures from the generated header comment.
 */
static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, { 56, 26, 18, 0 } };
/*
 * Registration entry point (non-FMA build): hands the hc2cf2_8 kernel and
 * its descriptor to planner `p` via X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf2_8) (planner *p) {
X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,489 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cf_10 -include rdft/scalar/hc2cf.h */
/*
* This function contains 102 FP additions, 72 FP multiplications,
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
* 47 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_10 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * In-place kernel over the arrays Rp, Ip, Rm and Im.  For every m in
 * [mb, me) it loads the 20 values at offsets 0 and WS(rs, 1..4) of each
 * array, scales 18 of them by the nine factor pairs W[0..17] (Rp[0] and
 * Rm[0] are used unscaled), combines the results using the four KP*
 * constants, and stores 20 results back to the same locations.
 *
 * Factor pairing, as read from the loads below: (Ip[k], Im[k]) uses
 * (W[4k], W[4k+1]) for k = 0..4 and (Rp[k], Rm[k]) uses
 * (W[4k-2], W[4k-1]) for k = 1..4.
 *
 * Rp, Ip  step forward by ms after each iteration.
 * Rm, Im  step backward by ms after each iteration.
 * W       factor table; starts at W + (mb - 1) * 18, advances 18 per
 *         iteration.
 * rs      stride handle, used only through WS() and MAKE_VOLATILE_STRIDE().
 * mb, me  half-open range of the loop counter m.
 *
 * NOTE(review): the scaling described above assumes the project's usual
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm
 * against the defining header.
 */
static void hc2cf_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
E T8, T26, T12, T1U, TM, TZ, T10, T1I, T1J, T24, T16, T17, T18, T1h, T1m;
E T1P, Tl, Ty, Tz, T1F, T1G, T23, T13, T14, T15, T1s, T1x, T1O;
{
/* Rp[0], Rm[0] unscaled, combined with the scaled (Ip[2], Im[2]) pair */
E T1, T1T, T3, T6, T4, T1R, T2, T7, T1S, T5;
T1 = Rp[0];
T1T = Rm[0];
T3 = Ip[WS(rs, 2)];
T6 = Im[WS(rs, 2)];
T2 = W[8];
T4 = T2 * T3;
T1R = T2 * T6;
T5 = W[9];
T7 = FMA(T5, T6, T4);
T1S = FNMS(T5, T3, T1R);
T8 = T1 - T7;
T26 = T1T - T1S;
T12 = T1 + T7;
T1U = T1S + T1T;
}
{
/* first group of four scaled pairs: Rp/Rm[2], Ip/Im[0], Ip/Im[4], Rp/Rm[3] */
E TF, T1e, TY, T1l, TL, T1g, TS, T1j;
{
E TB, TE, TC, T1d, TA, TD;
TB = Rp[WS(rs, 2)];
TE = Rm[WS(rs, 2)];
TA = W[6];
TC = TA * TB;
T1d = TA * TE;
TD = W[7];
TF = FMA(TD, TE, TC);
T1e = FNMS(TD, TB, T1d);
}
{
E TU, TX, TV, T1k, TT, TW;
TU = Ip[0];
TX = Im[0];
TT = W[0];
TV = TT * TU;
T1k = TT * TX;
TW = W[1];
TY = FMA(TW, TX, TV);
T1l = FNMS(TW, TU, T1k);
}
{
E TH, TK, TI, T1f, TG, TJ;
TH = Ip[WS(rs, 4)];
TK = Im[WS(rs, 4)];
TG = W[16];
TI = TG * TH;
T1f = TG * TK;
TJ = W[17];
TL = FMA(TJ, TK, TI);
T1g = FNMS(TJ, TH, T1f);
}
{
E TO, TR, TP, T1i, TN, TQ;
TO = Rp[WS(rs, 3)];
TR = Rm[WS(rs, 3)];
TN = W[10];
TP = TN * TO;
T1i = TN * TR;
TQ = W[11];
TS = FMA(TQ, TR, TP);
T1j = FNMS(TQ, TO, T1i);
}
TM = TF - TL;
TZ = TS - TY;
T10 = TM + TZ;
T1I = T1l - T1j;
T1J = T1g - T1e;
T24 = T1J + T1I;
T16 = TF + TL;
T17 = TS + TY;
T18 = T16 + T17;
T1h = T1e + T1g;
T1m = T1j + T1l;
T1P = T1h + T1m;
}
{
/* second group of four scaled pairs: Rp/Rm[1], Ip/Im[1], Ip/Im[3], Rp/Rm[4] */
E Te, T1p, Tx, T1w, Tk, T1r, Tr, T1u;
{
E Ta, Td, Tb, T1o, T9, Tc;
Ta = Rp[WS(rs, 1)];
Td = Rm[WS(rs, 1)];
T9 = W[2];
Tb = T9 * Ta;
T1o = T9 * Td;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
T1p = FNMS(Tc, Ta, T1o);
}
{
E Tt, Tw, Tu, T1v, Ts, Tv;
Tt = Ip[WS(rs, 1)];
Tw = Im[WS(rs, 1)];
Ts = W[4];
Tu = Ts * Tt;
T1v = Ts * Tw;
Tv = W[5];
Tx = FMA(Tv, Tw, Tu);
T1w = FNMS(Tv, Tt, T1v);
}
{
E Tg, Tj, Th, T1q, Tf, Ti;
Tg = Ip[WS(rs, 3)];
Tj = Im[WS(rs, 3)];
Tf = W[12];
Th = Tf * Tg;
T1q = Tf * Tj;
Ti = W[13];
Tk = FMA(Ti, Tj, Th);
T1r = FNMS(Ti, Tg, T1q);
}
{
E Tn, Tq, To, T1t, Tm, Tp;
Tn = Rp[WS(rs, 4)];
Tq = Rm[WS(rs, 4)];
Tm = W[14];
To = Tm * Tn;
T1t = Tm * Tq;
Tp = W[15];
Tr = FMA(Tp, Tq, To);
T1u = FNMS(Tp, Tn, T1t);
}
Tl = Te - Tk;
Ty = Tr - Tx;
Tz = Tl + Ty;
T1F = T1w - T1u;
T1G = T1r - T1p;
T23 = T1G + T1F;
T13 = Te + Tk;
T14 = Tr + Tx;
T15 = T13 + T14;
T1s = T1p + T1r;
T1x = T1u + T1w;
T1O = T1s + T1x;
}
/* all loads are done; the four blocks below each store five outputs */
{
E T1D, T11, T1C, T1L, T1N, T1H, T1K, T1M, T1E;
T1D = Tz - T10;
T11 = Tz + T10;
T1C = FNMS(KP250000000, T11, T8);
T1H = T1F - T1G;
T1K = T1I - T1J;
T1L = FMA(KP618033988, T1K, T1H);
T1N = FNMS(KP618033988, T1H, T1K);
Rm[WS(rs, 4)] = T8 + T11;
T1M = FNMS(KP559016994, T1D, T1C);
Rm[WS(rs, 2)] = FNMS(KP951056516, T1N, T1M);
Rp[WS(rs, 3)] = FMA(KP951056516, T1N, T1M);
T1E = FMA(KP559016994, T1D, T1C);
Rm[0] = FNMS(KP951056516, T1L, T1E);
Rp[WS(rs, 1)] = FMA(KP951056516, T1L, T1E);
}
{
E T28, T25, T27, T2c, T2e, T2a, T2b, T2d, T29;
T28 = T24 - T23;
T25 = T23 + T24;
T27 = FMA(KP250000000, T25, T26);
T2a = Ty - Tl;
T2b = TZ - TM;
T2c = FMA(KP618033988, T2b, T2a);
T2e = FNMS(KP618033988, T2a, T2b);
Im[WS(rs, 4)] = T25 - T26;
T2d = FNMS(KP559016994, T28, T27);
Im[WS(rs, 2)] = FMS(KP951056516, T2e, T2d);
Ip[WS(rs, 3)] = FMA(KP951056516, T2e, T2d);
T29 = FMA(KP559016994, T28, T27);
Im[0] = FMS(KP951056516, T2c, T29);
Ip[WS(rs, 1)] = FMA(KP951056516, T2c, T29);
}
{
E T1b, T19, T1a, T1z, T1B, T1n, T1y, T1A, T1c;
T1b = T15 - T18;
T19 = T15 + T18;
T1a = FNMS(KP250000000, T19, T12);
T1n = T1h - T1m;
T1y = T1s - T1x;
T1z = FNMS(KP618033988, T1y, T1n);
T1B = FMA(KP618033988, T1n, T1y);
Rp[0] = T12 + T19;
T1A = FMA(KP559016994, T1b, T1a);
Rp[WS(rs, 4)] = FNMS(KP951056516, T1B, T1A);
Rm[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
T1c = FNMS(KP559016994, T1b, T1a);
Rp[WS(rs, 2)] = FNMS(KP951056516, T1z, T1c);
Rm[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
}
{
E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X;
T1W = T1O - T1P;
T1Q = T1O + T1P;
T1V = FNMS(KP250000000, T1Q, T1U);
T1Y = T16 - T17;
T1Z = T13 - T14;
T20 = FNMS(KP618033988, T1Z, T1Y);
T22 = FMA(KP618033988, T1Y, T1Z);
Ip[0] = T1Q + T1U;
T21 = FMA(KP559016994, T1W, T1V);
Im[WS(rs, 3)] = FMS(KP951056516, T22, T21);
Ip[WS(rs, 4)] = FMA(KP951056516, T22, T21);
T1X = FNMS(KP559016994, T1W, T1V);
Im[WS(rs, 1)] = FMS(KP951056516, T20, T1X);
Ip[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
}
}
}
}
/*
 * Twiddle instruction list for hc2cf_10; passed to the planner through
 * `desc`.  One TW_FULL entry followed by a TW_NEXT entry.
 * NOTE(review): presumably { TW_FULL, 1, 10 } requests the full set of
 * 10 - 1 complex factors, which would match the 18 W entries the kernel
 * consumes per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for hc2cf_10: the value 10 (the generator's -n
 * argument), the codelet name, its twiddle instruction list, the genus, and
 * the counts { 48, 18, 54, 0 }, whose first three entries repeat the
 * "48 additions, 18 multiplications, 54 fused multiply/add" figures from
 * the generated header comment.
 */
static const hc2c_desc desc = { 10, "hc2cf_10", twinstr, &GENUS, { 48, 18, 54, 0 } };
/*
 * Registration entry point: hands the hc2cf_10 kernel and its descriptor to
 * planner `p` via X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf_10) (planner *p) {
X(khc2c_register) (p, hc2cf_10, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cf_10 -include rdft/scalar/hc2cf.h */
/*
* This function contains 102 FP additions, 60 FP multiplications,
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
* 45 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_10 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * In-place kernel over the arrays Rp, Ip, Rm and Im.  For every m in
 * [mb, me) it loads the 20 values at offsets 0 and WS(rs, 1..4) of each
 * array, scales 18 of them by the nine factor pairs W[0..17] (Rp[0] and
 * Rm[0] are used unscaled), combines the results using the four KP*
 * constants, and stores 20 results back to the same locations.
 *
 * Factor pairing, as read from the loads below: (Ip[k], Im[k]) uses
 * (W[4k], W[4k+1]) for k = 0..4 and (Rp[k], Rm[k]) uses
 * (W[4k-2], W[4k-1]) for k = 1..4.
 *
 * Rp, Ip  step forward by ms after each iteration.
 * Rm, Im  step backward by ms after each iteration.
 * W       factor table; starts at W + (mb - 1) * 18, advances 18 per
 *         iteration.
 * rs      stride handle, used only through WS() and MAKE_VOLATILE_STRIDE().
 * mb, me  half-open range of the loop counter m.
 *
 * NOTE(review): the scaling described above assumes the project's usual
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm
 * against the defining header.
 */
static void hc2cf_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
E T7, T1O, TT, T1C, TF, TQ, TR, T1r, T1s, T1L, TX, TY, TZ, T16, T19;
E T1y, Ti, Tt, Tu, T1o, T1p, T1M, TU, TV, TW, T1d, T1g, T1x;
{
/* Rp[0], Rm[0] unscaled, combined with the scaled (Ip[2], Im[2]) pair */
E T1, T1B, T6, T1A;
T1 = Rp[0];
T1B = Rm[0];
{
E T3, T5, T2, T4;
T3 = Ip[WS(rs, 2)];
T5 = Im[WS(rs, 2)];
T2 = W[8];
T4 = W[9];
T6 = FMA(T2, T3, T4 * T5);
T1A = FNMS(T4, T3, T2 * T5);
}
T7 = T1 - T6;
T1O = T1B - T1A;
TT = T1 + T6;
T1C = T1A + T1B;
}
{
/* first group of four scaled pairs: Rp/Rm[2], Ip/Im[0], Ip/Im[4], Rp/Rm[3] */
E Tz, T14, TP, T18, TE, T15, TK, T17;
{
E Tw, Ty, Tv, Tx;
Tw = Rp[WS(rs, 2)];
Ty = Rm[WS(rs, 2)];
Tv = W[6];
Tx = W[7];
Tz = FMA(Tv, Tw, Tx * Ty);
T14 = FNMS(Tx, Tw, Tv * Ty);
}
{
E TM, TO, TL, TN;
TM = Ip[0];
TO = Im[0];
TL = W[0];
TN = W[1];
TP = FMA(TL, TM, TN * TO);
T18 = FNMS(TN, TM, TL * TO);
}
{
E TB, TD, TA, TC;
TB = Ip[WS(rs, 4)];
TD = Im[WS(rs, 4)];
TA = W[16];
TC = W[17];
TE = FMA(TA, TB, TC * TD);
T15 = FNMS(TC, TB, TA * TD);
}
{
E TH, TJ, TG, TI;
TH = Rp[WS(rs, 3)];
TJ = Rm[WS(rs, 3)];
TG = W[10];
TI = W[11];
TK = FMA(TG, TH, TI * TJ);
T17 = FNMS(TI, TH, TG * TJ);
}
TF = Tz - TE;
TQ = TK - TP;
TR = TF + TQ;
T1r = T14 - T15;
T1s = T18 - T17;
T1L = T1s - T1r;
TX = Tz + TE;
TY = TK + TP;
TZ = TX + TY;
T16 = T14 + T15;
T19 = T17 + T18;
T1y = T16 + T19;
}
{
/* second group of four scaled pairs: Rp/Rm[1], Ip/Im[1], Ip/Im[3], Rp/Rm[4] */
E Tc, T1b, Ts, T1f, Th, T1c, Tn, T1e;
{
E T9, Tb, T8, Ta;
T9 = Rp[WS(rs, 1)];
Tb = Rm[WS(rs, 1)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
T1b = FNMS(Ta, T9, T8 * Tb);
}
{
E Tp, Tr, To, Tq;
Tp = Ip[WS(rs, 1)];
Tr = Im[WS(rs, 1)];
To = W[4];
Tq = W[5];
Ts = FMA(To, Tp, Tq * Tr);
T1f = FNMS(Tq, Tp, To * Tr);
}
{
E Te, Tg, Td, Tf;
Te = Ip[WS(rs, 3)];
Tg = Im[WS(rs, 3)];
Td = W[12];
Tf = W[13];
Th = FMA(Td, Te, Tf * Tg);
T1c = FNMS(Tf, Te, Td * Tg);
}
{
E Tk, Tm, Tj, Tl;
Tk = Rp[WS(rs, 4)];
Tm = Rm[WS(rs, 4)];
Tj = W[14];
Tl = W[15];
Tn = FMA(Tj, Tk, Tl * Tm);
T1e = FNMS(Tl, Tk, Tj * Tm);
}
Ti = Tc - Th;
Tt = Tn - Ts;
Tu = Ti + Tt;
T1o = T1b - T1c;
T1p = T1e - T1f;
T1M = T1o + T1p;
TU = Tc + Th;
TV = Tn + Ts;
TW = TU + TV;
T1d = T1b + T1c;
T1g = T1e + T1f;
T1x = T1d + T1g;
}
/* all loads are done; the four blocks below each store five outputs */
{
E T1l, TS, T1m, T1u, T1w, T1q, T1t, T1v, T1n;
T1l = KP559016994 * (Tu - TR);
TS = Tu + TR;
T1m = FNMS(KP250000000, TS, T7);
T1q = T1o - T1p;
T1t = T1r + T1s;
T1u = FMA(KP951056516, T1q, KP587785252 * T1t);
T1w = FNMS(KP587785252, T1q, KP951056516 * T1t);
Rm[WS(rs, 4)] = T7 + TS;
T1v = T1m - T1l;
Rm[WS(rs, 2)] = T1v - T1w;
Rp[WS(rs, 3)] = T1v + T1w;
T1n = T1l + T1m;
Rm[0] = T1n - T1u;
Rp[WS(rs, 1)] = T1n + T1u;
}
{
E T1S, T1N, T1T, T1R, T1V, T1P, T1Q, T1W, T1U;
T1S = KP559016994 * (T1M + T1L);
T1N = T1L - T1M;
T1T = FMA(KP250000000, T1N, T1O);
T1P = TQ - TF;
T1Q = Ti - Tt;
T1R = FNMS(KP951056516, T1Q, KP587785252 * T1P);
T1V = FMA(KP587785252, T1Q, KP951056516 * T1P);
Im[WS(rs, 4)] = T1N - T1O;
T1W = T1T - T1S;
Im[WS(rs, 2)] = T1V - T1W;
Ip[WS(rs, 3)] = T1V + T1W;
T1U = T1S + T1T;
Im[0] = T1R - T1U;
Ip[WS(rs, 1)] = T1R + T1U;
}
{
E T12, T10, T11, T1i, T1k, T1a, T1h, T1j, T13;
T12 = KP559016994 * (TW - TZ);
T10 = TW + TZ;
T11 = FNMS(KP250000000, T10, TT);
T1a = T16 - T19;
T1h = T1d - T1g;
T1i = FNMS(KP587785252, T1h, KP951056516 * T1a);
T1k = FMA(KP951056516, T1h, KP587785252 * T1a);
Rp[0] = TT + T10;
T1j = T12 + T11;
Rp[WS(rs, 4)] = T1j - T1k;
Rm[WS(rs, 3)] = T1j + T1k;
T13 = T11 - T12;
Rp[WS(rs, 2)] = T13 - T1i;
Rm[WS(rs, 1)] = T13 + T1i;
}
{
E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
T1H = KP559016994 * (T1x - T1y);
T1z = T1x + T1y;
T1G = FNMS(KP250000000, T1z, T1C);
T1D = TX - TY;
T1E = TU - TV;
T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
Ip[0] = T1z + T1C;
T1K = T1H + T1G;
Im[WS(rs, 3)] = T1J - T1K;
Ip[WS(rs, 4)] = T1J + T1K;
T1I = T1G - T1H;
Im[WS(rs, 1)] = T1F - T1I;
Ip[WS(rs, 2)] = T1F + T1I;
}
}
}
}
/*
 * Twiddle instruction list for hc2cf_10 (non-FMA build); passed to the
 * planner through `desc`.  One TW_FULL entry followed by a TW_NEXT entry.
 * NOTE(review): presumably { TW_FULL, 1, 10 } requests the full set of
 * 10 - 1 complex factors, which would match the 18 W entries the kernel
 * consumes per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for hc2cf_10 (non-FMA build): the value 10 (the
 * generator's -n argument), the codelet name, its twiddle instruction list,
 * the genus, and the counts { 72, 30, 30, 0 }, whose first three entries
 * repeat the "72 additions, 30 multiplications, 30 fused multiply/add"
 * figures from the generated header comment.
 */
static const hc2c_desc desc = { 10, "hc2cf_10", twinstr, &GENUS, { 72, 30, 30, 0 } };
/*
 * Registration entry point (non-FMA build): hands the hc2cf_10 kernel and
 * its descriptor to planner `p` via X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf_10) (planner *p) {
X(khc2c_register) (p, hc2cf_10, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,581 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cf_12 -include rdft/scalar/hc2cf.h */
/*
* This function contains 118 FP additions, 68 FP multiplications,
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_12 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * In-place kernel over the arrays Rp, Ip, Rm and Im.  For every m in
 * [mb, me) it loads the 24 values at offsets 0 and WS(rs, 1..5) of each
 * array, scales 22 of them by the eleven factor pairs W[0..21] (Rp[0] and
 * Rm[0] are used unscaled), combines the results using KP500000000 and
 * KP866025403, and stores 24 results back to the same locations.
 *
 * Factor pairing, as read from the loads below: (Ip[k], Im[k]) uses
 * (W[4k], W[4k+1]) for k = 0..5 and (Rp[k], Rm[k]) uses
 * (W[4k-2], W[4k-1]) for k = 1..5.
 *
 * Rp, Ip  step forward by ms after each iteration.
 * Rm, Im  step backward by ms after each iteration.
 * W       factor table; starts at W + (mb - 1) * 22, advances 22 per
 *         iteration.
 * rs      stride handle, used only through WS() and MAKE_VOLATILE_STRIDE().
 * mb, me  half-open range of the loop counter m.
 *
 * NOTE(review): the scaling described above assumes the project's usual
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm
 * against the defining header.
 */
static void hc2cf_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* numerically sqrt(3)/2 and 1/2 */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2s, T1s, T2f, T1d, T21, T1H;
E T1Z, Te, T2p, T1l, T2h, TT, T1V, T1A, T1T;
/* unscaled inputs */
T1 = Rp[0];
T2i = Rm[0];
{
/* (Rp[3], Rm[3]) scaled by (W[10], W[11]) */
E Th, Tk, Ti, T2d, Tg, Tj;
Th = Rp[WS(rs, 3)];
Tk = Rm[WS(rs, 3)];
Tg = W[10];
Ti = Tg * Th;
T2d = Tg * Tk;
Tj = W[11];
Tl = FMA(Tj, Tk, Ti);
T2e = FNMS(Tj, Th, T2d);
}
{
/* (Ip[4], Im[4]) scaled by (W[16], W[17]) */
E TW, TZ, TX, T1X, TV, TY;
TW = Ip[WS(rs, 4)];
TZ = Im[WS(rs, 4)];
TV = W[16];
TX = TV * TW;
T1X = TV * TZ;
TY = W[17];
T10 = FMA(TY, TZ, TX);
T1Y = FNMS(TY, TW, T1X);
}
{
/* (Ip[1], Im[1]) scaled by (W[4], W[5]) */
E TC, TF, TD, T1R, TB, TE;
TC = Ip[WS(rs, 1)];
TF = Im[WS(rs, 1)];
TB = W[4];
TD = TB * TC;
T1R = TB * TF;
TE = W[5];
TG = FMA(TE, TF, TD);
T1S = FNMS(TE, TC, T1R);
}
{
/* scaled (Rp[5], Rm[5]) and (Rp[1], Rm[1]); their sums and differences */
E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
Tn = Rp[WS(rs, 5)];
Tq = Rm[WS(rs, 5)];
Tm = W[18];
To = Tm * Tn;
T1o = Tm * Tq;
Tt = Rp[WS(rs, 1)];
Tw = Rm[WS(rs, 1)];
Ts = W[2];
Tu = Ts * Tt;
T1q = Ts * Tw;
{
E Tr, T1p, Tx, T1r, Tp, Tv;
Tp = W[19];
Tr = FMA(Tp, Tq, To);
T1p = FNMS(Tp, Tn, T1o);
Tv = W[3];
Tx = FMA(Tv, Tw, Tu);
T1r = FNMS(Tv, Tt, T1q);
Ty = Tr + Tx;
T2s = Tx - Tr;
T1s = T1p - T1r;
T2f = T1p + T1r;
}
}
{
/* scaled (Ip[0], Im[0]) and (Ip[2], Im[2]); their sums and differences */
E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
T12 = Ip[0];
T15 = Im[0];
T11 = W[0];
T13 = T11 * T12;
T1D = T11 * T15;
T18 = Ip[WS(rs, 2)];
T1b = Im[WS(rs, 2)];
T17 = W[8];
T19 = T17 * T18;
T1F = T17 * T1b;
{
E T16, T1E, T1c, T1G, T14, T1a;
T14 = W[1];
T16 = FMA(T14, T15, T13);
T1E = FNMS(T14, T12, T1D);
T1a = W[9];
T1c = FMA(T1a, T1b, T19);
T1G = FNMS(T1a, T18, T1F);
T1d = T16 + T1c;
T21 = T1c - T16;
T1H = T1E - T1G;
T1Z = T1E + T1G;
}
}
{
/* scaled (Rp[2], Rm[2]) and (Rp[4], Rm[4]); their sums and differences */
E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
T3 = Rp[WS(rs, 2)];
T6 = Rm[WS(rs, 2)];
T2 = W[6];
T4 = T2 * T3;
T1h = T2 * T6;
T9 = Rp[WS(rs, 4)];
Tc = Rm[WS(rs, 4)];
T8 = W[14];
Ta = T8 * T9;
T1j = T8 * Tc;
{
E T7, T1i, Td, T1k, T5, Tb;
T5 = W[7];
T7 = FMA(T5, T6, T4);
T1i = FNMS(T5, T3, T1h);
Tb = W[15];
Td = FMA(Tb, Tc, Ta);
T1k = FNMS(Tb, T9, T1j);
Te = T7 + Td;
T2p = Td - T7;
T1l = T1i - T1k;
T2h = T1i + T1k;
}
}
{
/* scaled (Ip[3], Im[3]) and (Ip[5], Im[5]); their sums and differences */
E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
TI = Ip[WS(rs, 3)];
TL = Im[WS(rs, 3)];
TH = W[12];
TJ = TH * TI;
T1w = TH * TL;
TO = Ip[WS(rs, 5)];
TR = Im[WS(rs, 5)];
TN = W[20];
TP = TN * TO;
T1y = TN * TR;
{
E TM, T1x, TS, T1z, TK, TQ;
TK = W[13];
TM = FMA(TK, TL, TJ);
T1x = FNMS(TK, TI, T1w);
TQ = W[21];
TS = FMA(TQ, TR, TP);
T1z = FNMS(TQ, TO, T1y);
TT = TM + TS;
T1V = TS - TM;
T1A = T1x - T1z;
T1T = T1x + T1z;
}
}
{
/* all loads are done; eight outputs from plain sums and differences */
E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
{
E Tf, Tz, T2g, T2j;
Tf = T1 + Te;
Tz = Tl + Ty;
TA = Tf + Tz;
T28 = Tf - Tz;
T2g = T2e + T2f;
T2j = T2h + T2i;
T2k = T2g + T2j;
T2m = T2j - T2g;
}
{
E TU, T1e, T29, T2a;
TU = TG + TT;
T1e = T10 + T1d;
T1f = TU + T1e;
T2l = TU - T1e;
T29 = T1S + T1T;
T2a = T1Y + T1Z;
T2b = T29 - T2a;
T2c = T29 + T2a;
}
Rm[WS(rs, 5)] = TA - T1f;
Im[WS(rs, 5)] = T2c - T2k;
Rp[0] = TA + T1f;
Ip[0] = T2c + T2k;
Rp[WS(rs, 3)] = T28 - T2b;
Ip[WS(rs, 3)] = T2l + T2m;
Rm[WS(rs, 2)] = T28 + T2b;
Im[WS(rs, 2)] = T2l - T2m;
}
{
/* remaining sixteen outputs, which involve KP500000000 and KP866025403 */
E T1m, T1K, T2q, T2z, T2t, T2y, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
E T1O;
{
E T1g, T2o, T2r, T1n;
T1g = FNMS(KP500000000, Te, T1);
T1m = FNMS(KP866025403, T1l, T1g);
T1K = FMA(KP866025403, T1l, T1g);
T2o = FNMS(KP500000000, T2h, T2i);
T2q = FMA(KP866025403, T2p, T2o);
T2z = FNMS(KP866025403, T2p, T2o);
T2r = FNMS(KP500000000, T2f, T2e);
T2t = FMA(KP866025403, T2s, T2r);
T2y = FNMS(KP866025403, T2s, T2r);
T1n = FNMS(KP500000000, Ty, Tl);
T1t = FNMS(KP866025403, T1s, T1n);
T1L = FMA(KP866025403, T1s, T1n);
}
{
E T1v, T1U, T20, T1C;
T1v = FNMS(KP500000000, TT, TG);
T1B = FNMS(KP866025403, T1A, T1v);
T1N = FMA(KP866025403, T1A, T1v);
T1U = FNMS(KP500000000, T1T, T1S);
T1W = FNMS(KP866025403, T1V, T1U);
T25 = FMA(KP866025403, T1V, T1U);
T20 = FNMS(KP500000000, T1Z, T1Y);
T22 = FNMS(KP866025403, T21, T20);
T26 = FMA(KP866025403, T21, T20);
T1C = FNMS(KP500000000, T1d, T10);
T1I = FNMS(KP866025403, T1H, T1C);
T1O = FMA(KP866025403, T1H, T1C);
}
{
E T1u, T1J, T2x, T2A;
T1u = T1m + T1t;
T1J = T1B + T1I;
Rp[WS(rs, 2)] = T1u - T1J;
Rm[WS(rs, 3)] = T1u + T1J;
T2x = T1W + T22;
T2A = T2y + T2z;
Im[WS(rs, 3)] = -(T2x + T2A);
Ip[WS(rs, 2)] = T2A - T2x;
}
{
E T1M, T1P, T2v, T2w;
T1M = T1K + T1L;
T1P = T1N + T1O;
Rm[WS(rs, 1)] = T1M - T1P;
Rp[WS(rs, 4)] = T1M + T1P;
T2v = T25 + T26;
T2w = T2t + T2q;
Im[WS(rs, 1)] = T2v - T2w;
Ip[WS(rs, 4)] = T2v + T2w;
}
{
E T1Q, T23, T2B, T2C;
T1Q = T1m - T1t;
T23 = T1W - T22;
Rm[0] = T1Q - T23;
Rp[WS(rs, 5)] = T1Q + T23;
T2B = T1I - T1B;
T2C = T2z - T2y;
Im[0] = T2B - T2C;
Ip[WS(rs, 5)] = T2B + T2C;
}
{
E T24, T27, T2n, T2u;
T24 = T1K - T1L;
T27 = T25 - T26;
Rm[WS(rs, 4)] = T24 - T27;
Rp[WS(rs, 1)] = T24 + T27;
T2n = T1O - T1N;
T2u = T2q - T2t;
Im[WS(rs, 4)] = T2n - T2u;
Ip[WS(rs, 1)] = T2n + T2u;
}
}
}
}
}
/*
 * Twiddle instruction list for hc2cf_12; passed to the planner through
 * `desc`.  One TW_FULL entry followed by a TW_NEXT entry.
 * NOTE(review): presumably { TW_FULL, 1, 12 } requests the full set of
 * 12 - 1 complex factors, which would match the 22 W entries the kernel
 * consumes per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor for hc2cf_12: the value 12 (the generator's -n
 * argument), the codelet name, its twiddle instruction list, the genus, and
 * the counts { 72, 22, 46, 0 }, whose first three entries repeat the
 * "72 additions, 22 multiplications, 46 fused multiply/add" figures from
 * the generated header comment.
 */
static const hc2c_desc desc = { 12, "hc2cf_12", twinstr, &GENUS, { 72, 22, 46, 0 } };
/*
 * Registration entry point: hands the hc2cf_12 kernel and its descriptor to
 * planner `p` via X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf_12) (planner *p) {
X(khc2c_register) (p, hc2cf_12, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cf_12 -include rdft/scalar/hc2cf.h */
/*
* This function contains 118 FP additions, 60 FP multiplications,
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_12 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * In-place kernel over the arrays Rp, Ip, Rm and Im.  For every m in
 * [mb, me) it loads the 24 values at offsets 0 and WS(rs, 1..5) of each
 * array, scales 22 of them by the eleven factor pairs W[0..21] (Rp[0] and
 * Rm[0] are used unscaled), combines the results using KP500000000 and
 * KP866025403, and stores 24 results back to the same locations.
 *
 * Factor pairing, as read from the loads below: (Ip[k], Im[k]) uses
 * (W[4k], W[4k+1]) for k = 0..5 and (Rp[k], Rm[k]) uses
 * (W[4k-2], W[4k-1]) for k = 1..5.
 *
 * Rp, Ip  step forward by ms after each iteration.
 * Rm, Im  step backward by ms after each iteration.
 * W       factor table; starts at W + (mb - 1) * 22, advances 22 per
 *         iteration.
 * rs      stride handle, used only through WS() and MAKE_VOLATILE_STRIDE().
 * mb, me  half-open range of the loop counter m.
 *
 * NOTE(review): the scaling described above assumes the project's usual
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm
 * against the defining header.
 */
static void hc2cf_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* numerically 1/2 and sqrt(3)/2 */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
E T1, T1W, T18, T22, Tc, T15, T1V, T23, TR, T1E, T1o, T1D, T12, T1l, T1F;
E T1G, Ti, T1S, T1d, T25, Tt, T1a, T1T, T26, TA, T1y, T1j, T1B, TL, T1g;
E T1z, T1A;
{
/* Rp[0], Rm[0] unscaled, plus scaled (Rp[2], Rm[2]) and (Rp[4], Rm[4]) */
E T6, T16, Tb, T17;
T1 = Rp[0];
T1W = Rm[0];
{
E T3, T5, T2, T4;
T3 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 2)];
T2 = W[6];
T4 = W[7];
T6 = FMA(T2, T3, T4 * T5);
T16 = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = Rp[WS(rs, 4)];
Ta = Rm[WS(rs, 4)];
T7 = W[14];
T9 = W[15];
Tb = FMA(T7, T8, T9 * Ta);
T17 = FNMS(T9, T8, T7 * Ta);
}
T18 = KP866025403 * (T16 - T17);
T22 = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
T15 = FNMS(KP500000000, Tc, T1);
T1V = T16 + T17;
T23 = FNMS(KP500000000, T1V, T1W);
}
{
/* scaled (Ip[4], Im[4]), (Ip[2], Im[2]) and (Ip[0], Im[0]) */
E T11, T1n, TW, T1m;
{
E TO, TQ, TN, TP;
TO = Ip[WS(rs, 4)];
TQ = Im[WS(rs, 4)];
TN = W[16];
TP = W[17];
TR = FMA(TN, TO, TP * TQ);
T1E = FNMS(TP, TO, TN * TQ);
}
{
E TY, T10, TX, TZ;
TY = Ip[WS(rs, 2)];
T10 = Im[WS(rs, 2)];
TX = W[8];
TZ = W[9];
T11 = FMA(TX, TY, TZ * T10);
T1n = FNMS(TZ, TY, TX * T10);
}
{
E TT, TV, TS, TU;
TT = Ip[0];
TV = Im[0];
TS = W[0];
TU = W[1];
TW = FMA(TS, TT, TU * TV);
T1m = FNMS(TU, TT, TS * TV);
}
T1o = KP866025403 * (T1m - T1n);
T1D = KP866025403 * (T11 - TW);
T12 = TW + T11;
T1l = FNMS(KP500000000, T12, TR);
T1F = T1m + T1n;
T1G = FNMS(KP500000000, T1F, T1E);
}
{
/* scaled (Rp[3], Rm[3]), (Rp[1], Rm[1]) and (Rp[5], Rm[5]) */
E Ts, T1c, Tn, T1b;
{
E Tf, Th, Te, Tg;
Tf = Rp[WS(rs, 3)];
Th = Rm[WS(rs, 3)];
Te = W[10];
Tg = W[11];
Ti = FMA(Te, Tf, Tg * Th);
T1S = FNMS(Tg, Tf, Te * Th);
}
{
E Tp, Tr, To, Tq;
Tp = Rp[WS(rs, 1)];
Tr = Rm[WS(rs, 1)];
To = W[2];
Tq = W[3];
Ts = FMA(To, Tp, Tq * Tr);
T1c = FNMS(Tq, Tp, To * Tr);
}
{
E Tk, Tm, Tj, Tl;
Tk = Rp[WS(rs, 5)];
Tm = Rm[WS(rs, 5)];
Tj = W[18];
Tl = W[19];
Tn = FMA(Tj, Tk, Tl * Tm);
T1b = FNMS(Tl, Tk, Tj * Tm);
}
T1d = KP866025403 * (T1b - T1c);
T25 = KP866025403 * (Ts - Tn);
Tt = Tn + Ts;
T1a = FNMS(KP500000000, Tt, Ti);
T1T = T1b + T1c;
T26 = FNMS(KP500000000, T1T, T1S);
}
{
/* scaled (Ip[1], Im[1]), (Ip[5], Im[5]) and (Ip[3], Im[3]) */
E TK, T1i, TF, T1h;
{
E Tx, Tz, Tw, Ty;
Tx = Ip[WS(rs, 1)];
Tz = Im[WS(rs, 1)];
Tw = W[4];
Ty = W[5];
TA = FMA(Tw, Tx, Ty * Tz);
T1y = FNMS(Ty, Tx, Tw * Tz);
}
{
E TH, TJ, TG, TI;
TH = Ip[WS(rs, 5)];
TJ = Im[WS(rs, 5)];
TG = W[20];
TI = W[21];
TK = FMA(TG, TH, TI * TJ);
T1i = FNMS(TI, TH, TG * TJ);
}
{
E TC, TE, TB, TD;
TC = Ip[WS(rs, 3)];
TE = Im[WS(rs, 3)];
TB = W[12];
TD = W[13];
TF = FMA(TB, TC, TD * TE);
T1h = FNMS(TD, TC, TB * TE);
}
T1j = KP866025403 * (T1h - T1i);
T1B = KP866025403 * (TK - TF);
TL = TF + TK;
T1g = FNMS(KP500000000, TL, TA);
T1z = T1h + T1i;
T1A = FNMS(KP500000000, T1z, T1y);
}
/* all loads are done; the three blocks below each store eight outputs */
{
E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
{
E Td, Tu, T1U, T1X;
Td = T1 + Tc;
Tu = Ti + Tt;
Tv = Td + Tu;
T1N = Td - Tu;
T1U = T1S + T1T;
T1X = T1V + T1W;
T1Y = T1U + T1X;
T20 = T1X - T1U;
}
{
E TM, T13, T1O, T1P;
TM = TA + TL;
T13 = TR + T12;
T14 = TM + T13;
T1Z = TM - T13;
T1O = T1y + T1z;
T1P = T1E + T1F;
T1Q = T1O - T1P;
T1R = T1O + T1P;
}
Rm[WS(rs, 5)] = Tv - T14;
Im[WS(rs, 5)] = T1R - T1Y;
Rp[0] = Tv + T14;
Ip[0] = T1R + T1Y;
Rp[WS(rs, 3)] = T1N - T1Q;
Ip[WS(rs, 3)] = T1Z + T20;
Rm[WS(rs, 2)] = T1N + T1Q;
Im[WS(rs, 2)] = T1Z - T20;
}
{
E T1t, T1J, T28, T2a, T1w, T21, T1M, T29;
{
E T1r, T1s, T24, T27;
T1r = T15 + T18;
T1s = T1a + T1d;
T1t = T1r + T1s;
T1J = T1r - T1s;
T24 = T22 + T23;
T27 = T25 + T26;
T28 = T24 - T27;
T2a = T27 + T24;
}
{
E T1u, T1v, T1K, T1L;
T1u = T1g + T1j;
T1v = T1l + T1o;
T1w = T1u + T1v;
T21 = T1v - T1u;
T1K = T1B + T1A;
T1L = T1D + T1G;
T1M = T1K - T1L;
T29 = T1K + T1L;
}
Rm[WS(rs, 1)] = T1t - T1w;
Im[WS(rs, 1)] = T29 - T2a;
Rp[WS(rs, 4)] = T1t + T1w;
Ip[WS(rs, 4)] = T29 + T2a;
Rm[WS(rs, 4)] = T1J - T1M;
Im[WS(rs, 4)] = T21 - T28;
Rp[WS(rs, 1)] = T1J + T1M;
Ip[WS(rs, 1)] = T21 + T28;
}
{
E T1f, T1x, T2e, T2g, T1q, T2f, T1I, T2b;
{
E T19, T1e, T2c, T2d;
T19 = T15 - T18;
T1e = T1a - T1d;
T1f = T19 + T1e;
T1x = T19 - T1e;
T2c = T26 - T25;
T2d = T23 - T22;
T2e = T2c + T2d;
T2g = T2d - T2c;
}
{
E T1k, T1p, T1C, T1H;
T1k = T1g - T1j;
T1p = T1l - T1o;
T1q = T1k + T1p;
T2f = T1p - T1k;
T1C = T1A - T1B;
T1H = T1D - T1G;
T1I = T1C + T1H;
T2b = T1H - T1C;
}
Rp[WS(rs, 2)] = T1f - T1q;
Ip[WS(rs, 2)] = T2b + T2e;
Rm[WS(rs, 3)] = T1f + T1q;
Im[WS(rs, 3)] = T2b - T2e;
Rm[0] = T1x - T1I;
Im[0] = T2f - T2g;
Rp[WS(rs, 5)] = T1x + T1I;
Ip[WS(rs, 5)] = T2f + T2g;
}
}
}
}
/* Twiddle-instruction table referenced by the hc2cf_12 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the codelet: the value 12, the name string "hc2cf_12", the
 * twiddle table above, the address of GENUS, and a four-element tuple
 * (presumably operation counts -- confirm against hc2c_desc). */
static const hc2c_desc desc = { 12, "hc2cf_12", twinstr, &GENUS, { 88, 30, 30, 0 } };
/* Registration entry point: hands hc2cf_12 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_12) (planner *p) {
X(khc2c_register) (p, hc2cf_12, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,796 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cf_16 -include rdft/scalar/hc2cf.h */
/*
* This function contains 174 FP additions, 100 FP multiplications,
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
* 60 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_16 (FMA-preferring variant): generated size-16 codelet.
 *
 * For every m in [mb, me) the loop body reads Rp, Ip, Rm and Im at offsets
 * WS(rs, 0) .. WS(rs, 7) (32 loads), combines each (Rp[k], Rm[k]) pair for
 * k = 1..7 and each (Ip[k], Im[k]) pair for k = 0..7 with two consecutive
 * entries of W (30 entries in total), and then stores 32 results back into
 * the same four arrays.  All loads happen before the first store.
 *
 * Parameters:
 *   Rp, Ip  - in/out arrays, advanced by +ms after each iteration
 *   Rm, Im  - in/out arrays, advanced by -ms after each iteration
 *   W       - read-only table; starts at W + (mb - 1) * 30 and advances by
 *             30 entries per iteration
 *   rs      - stride passed to WS(rs, k) to form the element offsets
 *   mb, me  - half-open iteration range [mb, me)
 *   ms      - per-iteration step applied to Rp/Ip/Rm/Im
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS, FMS and MAKE_VOLATILE_STRIDE are
 * defined outside this file; presumably FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c -- confirm in the headers.
 */
static void hc2cf_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: cos(pi/8), sqrt(2) - 1 and 1/sqrt(2). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
E T8, T3z, T1I, T3o, T1s, T35, T2p, T2r, T1F, T36, T2k, T2w, Tl, T3A, T1N;
E T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W;
E T1W, T21;
/* Load phase.  Each block below reads input pairs, combines every pair
 * (x, y) with its two W entries (w0, w1) as FMA(w1, y, w0 * x) and
 * FNMS(w1, x, w0 * y), and keeps sums and differences of the results. */
/* (Rp[0], Rm[0]) used as-is; (Rp[4], Rm[4]) with W[14], W[15]. */
{
E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
T1 = Rp[0];
T3n = Rm[0];
T3 = Rp[WS(rs, 4)];
T6 = Rm[WS(rs, 4)];
T2 = W[14];
T4 = T2 * T3;
T3l = T2 * T6;
T5 = W[15];
T7 = FMA(T5, T6, T4);
T3m = FNMS(T5, T3, T3l);
T8 = T1 + T7;
T3z = T3n - T3m;
T1I = T1 - T7;
T3o = T3m + T3n;
}
/* (Ip[7], Im[7]) with W[28], W[29]; (Ip[3], Im[3]) with W[12], W[13]. */
{
E T1h, T1k, T1i, T2l, T1n, T1q, T1o, T2n, T1g, T1m;
T1h = Ip[WS(rs, 7)];
T1k = Im[WS(rs, 7)];
T1g = W[28];
T1i = T1g * T1h;
T2l = T1g * T1k;
T1n = Ip[WS(rs, 3)];
T1q = Im[WS(rs, 3)];
T1m = W[12];
T1o = T1m * T1n;
T2n = T1m * T1q;
{
E T1l, T2m, T1r, T2o, T1j, T1p;
T1j = W[29];
T1l = FMA(T1j, T1k, T1i);
T2m = FNMS(T1j, T1h, T2l);
T1p = W[13];
T1r = FMA(T1p, T1q, T1o);
T2o = FNMS(T1p, T1n, T2n);
T1s = T1l + T1r;
T35 = T2m + T2o;
T2p = T2m - T2o;
T2r = T1l - T1r;
}
}
/* (Ip[1], Im[1]) with W[4], W[5]; (Ip[5], Im[5]) with W[20], W[21]. */
{
E T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z;
T1u = Ip[WS(rs, 1)];
T1x = Im[WS(rs, 1)];
T1t = W[4];
T1v = T1t * T1u;
T2s = T1t * T1x;
T1A = Ip[WS(rs, 5)];
T1D = Im[WS(rs, 5)];
T1z = W[20];
T1B = T1z * T1A;
T2u = T1z * T1D;
{
E T1y, T2t, T1E, T2v, T1w, T1C;
T1w = W[5];
T1y = FMA(T1w, T1x, T1v);
T2t = FNMS(T1w, T1u, T2s);
T1C = W[21];
T1E = FMA(T1C, T1D, T1B);
T2v = FNMS(T1C, T1A, T2u);
T1F = T1y + T1E;
T36 = T2t + T2v;
T2k = T1E - T1y;
T2w = T2t - T2v;
}
}
/* (Rp[2], Rm[2]) with W[6], W[7]; (Rp[6], Rm[6]) with W[22], W[23]. */
{
E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
Ta = Rp[WS(rs, 2)];
Td = Rm[WS(rs, 2)];
T9 = W[6];
Tb = T9 * Ta;
T1J = T9 * Td;
Tg = Rp[WS(rs, 6)];
Tj = Rm[WS(rs, 6)];
Tf = W[22];
Th = Tf * Tg;
T1L = Tf * Tj;
{
E Te, T1K, Tk, T1M, Tc, Ti;
Tc = W[7];
Te = FMA(Tc, Td, Tb);
T1K = FNMS(Tc, Ta, T1J);
Ti = W[23];
Tk = FMA(Ti, Tj, Th);
T1M = FNMS(Ti, Tg, T1L);
Tl = Te + Tk;
T3A = Te - Tk;
T1N = T1K - T1M;
T3k = T1K + T1M;
}
}
/* (Rp[1], Rm[1]) with W[2], W[3]; (Rp[5], Rm[5]) with W[18], W[19]. */
{
E To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt;
To = Rp[WS(rs, 1)];
Tr = Rm[WS(rs, 1)];
Tn = W[2];
Tp = Tn * To;
T1P = Tn * Tr;
Tu = Rp[WS(rs, 5)];
Tx = Rm[WS(rs, 5)];
Tt = W[18];
Tv = Tt * Tu;
T1R = Tt * Tx;
{
E Ts, T1Q, Ty, T1S, Tq, Tw;
Tq = W[3];
Ts = FMA(Tq, Tr, Tp);
T1Q = FNMS(Tq, To, T1P);
Tw = W[19];
Ty = FMA(Tw, Tx, Tv);
T1S = FNMS(Tw, Tu, T1R);
Tz = Ts + Ty;
T2V = T1Q + T1S;
T1T = T1Q - T1S;
T1U = Ts - Ty;
}
}
/* (Ip[0], Im[0]) with W[0], W[1]; (Ip[4], Im[4]) with W[16], W[17]. */
{
E TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV;
TQ = Ip[0];
TT = Im[0];
TP = W[0];
TR = TP * TQ;
T25 = TP * TT;
TW = Ip[WS(rs, 4)];
TZ = Im[WS(rs, 4)];
TV = W[16];
TX = TV * TW;
T27 = TV * TZ;
{
E TU, T26, T10, T28, TS, TY;
TS = W[1];
TU = FMA(TS, TT, TR);
T26 = FNMS(TS, TQ, T25);
TY = W[17];
T10 = FMA(TY, TZ, TX);
T28 = FNMS(TY, TW, T27);
T11 = TU + T10;
T30 = T26 + T28;
T29 = T26 - T28;
T2c = TU - T10;
}
}
/* (Ip[2], Im[2]) with W[8], W[9]; (Ip[6], Im[6]) with W[24], W[25]. */
{
E T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18;
T13 = Ip[WS(rs, 2)];
T16 = Im[WS(rs, 2)];
T12 = W[8];
T14 = T12 * T13;
T2d = T12 * T16;
T19 = Ip[WS(rs, 6)];
T1c = Im[WS(rs, 6)];
T18 = W[24];
T1a = T18 * T19;
T2f = T18 * T1c;
{
E T17, T2e, T1d, T2g, T15, T1b;
T15 = W[9];
T17 = FMA(T15, T16, T14);
T2e = FNMS(T15, T13, T2d);
T1b = W[25];
T1d = FMA(T1b, T1c, T1a);
T2g = FNMS(T1b, T19, T2f);
T1e = T17 + T1d;
T31 = T2e + T2g;
T2a = T17 - T1d;
T2h = T2e - T2g;
}
}
/* (Rp[7], Rm[7]) with W[26], W[27]; (Rp[3], Rm[3]) with W[10], W[11]. */
{
E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
TB = Rp[WS(rs, 7)];
TE = Rm[WS(rs, 7)];
TA = W[26];
TC = TA * TB;
T1X = TA * TE;
TH = Rp[WS(rs, 3)];
TK = Rm[WS(rs, 3)];
TG = W[10];
TI = TG * TH;
T1Z = TG * TK;
{
E TF, T1Y, TL, T20, TD, TJ;
TD = W[27];
TF = FMA(TD, TE, TC);
T1Y = FNMS(TD, TB, T1X);
TJ = W[11];
TL = FMA(TJ, TK, TI);
T20 = FNMS(TJ, TH, T1Z);
TM = TF + TL;
T2W = T1Y + T20;
T1W = TF - TL;
T21 = T1Y - T20;
}
}
/* Store phase 1: Rp/Ip[0], Rp/Ip[4], Rm/Im[3], Rm/Im[7] -- additions and
 * subtractions only, no constant multiplies. */
{
E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
{
E Tm, TN, T3j, T3p;
Tm = T8 + Tl;
TN = Tz + TM;
TO = Tm + TN;
T3e = Tm - TN;
T3j = T2V + T2W;
T3p = T3k + T3o;
T3q = T3j + T3p;
T3s = T3p - T3j;
}
{
E T1f, T1G, T3f, T3g;
T1f = T11 + T1e;
T1G = T1s + T1F;
T1H = T1f + T1G;
T3r = T1G - T1f;
T3f = T30 + T31;
T3g = T35 + T36;
T3h = T3f - T3g;
T3i = T3f + T3g;
}
Rm[WS(rs, 7)] = TO - T1H;
Im[WS(rs, 7)] = T3i - T3q;
Rp[0] = TO + T1H;
Ip[0] = T3i + T3q;
Rm[WS(rs, 3)] = T3e - T3h;
Im[WS(rs, 3)] = T3r - T3s;
Rp[WS(rs, 4)] = T3e + T3h;
Ip[WS(rs, 4)] = T3r + T3s;
}
/* Store phase 2: Rp/Ip[2], Rp/Ip[6], Rm/Im[1], Rm/Im[5] -- each output is
 * a base term plus or minus KP707106781 times a combined term. */
{
E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
{
E T2U, T2X, T3t, T3u;
T2U = T8 - Tl;
T2X = T2V - T2W;
T2Y = T2U + T2X;
T3a = T2U - T2X;
T3t = TM - Tz;
T3u = T3o - T3k;
T3v = T3t + T3u;
T3x = T3u - T3t;
}
{
E T2Z, T32, T34, T37;
T2Z = T11 - T1e;
T32 = T30 - T31;
T33 = T2Z + T32;
T3b = T32 - T2Z;
T34 = T1s - T1F;
T37 = T35 - T36;
T38 = T34 - T37;
T3c = T34 + T37;
}
{
E T39, T3w, T3d, T3y;
T39 = T33 + T38;
Rm[WS(rs, 5)] = FNMS(KP707106781, T39, T2Y);
Rp[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
T3w = T3b + T3c;
Im[WS(rs, 5)] = FMS(KP707106781, T3w, T3v);
Ip[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
T3d = T3b - T3c;
Rm[WS(rs, 1)] = FNMS(KP707106781, T3d, T3a);
Rp[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
T3y = T38 - T33;
Im[WS(rs, 1)] = FMS(KP707106781, T3y, T3x);
Ip[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
}
}
/* Store phase 3: the remaining outputs Rp/Ip[1,3,5,7] and Rm/Im[0,2,4,6].
 * Intermediate terms are built with KP414213562 and KP707106781; every
 * final store applies KP923879532. */
{
E T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y;
E T2C;
{
E T1V, T22, T2b, T2i;
T1O = T1I - T1N;
T3B = T3z - T3A;
T3H = T3A + T3z;
T2E = T1I + T1N;
T1V = T1T - T1U;
T22 = T1W + T21;
T23 = T1V - T22;
T3C = T1V + T22;
{
E T2M, T2N, T2F, T2G;
T2M = T2r + T2w;
T2N = T2p + T2k;
T2O = FNMS(KP414213562, T2N, T2M);
T2S = FMA(KP414213562, T2M, T2N);
T2F = T1U + T1T;
T2G = T1W - T21;
T2H = T2F + T2G;
T3I = T2G - T2F;
}
T2b = T29 + T2a;
T2i = T2c - T2h;
T2j = FMA(KP414213562, T2i, T2b);
T2B = FNMS(KP414213562, T2b, T2i);
{
E T2J, T2K, T2q, T2x;
T2J = T2c + T2h;
T2K = T29 - T2a;
T2L = FMA(KP414213562, T2K, T2J);
T2R = FNMS(KP414213562, T2J, T2K);
T2q = T2k - T2p;
T2x = T2r - T2w;
T2y = FMA(KP414213562, T2x, T2q);
T2C = FNMS(KP414213562, T2q, T2x);
}
}
{
E T24, T2z, T3J, T3K;
T24 = FMA(KP707106781, T23, T1O);
T2z = T2j + T2y;
Rm[WS(rs, 4)] = FNMS(KP923879532, T2z, T24);
Rp[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
T3J = FMA(KP707106781, T3I, T3H);
T3K = T2C - T2B;
Im[WS(rs, 4)] = FMS(KP923879532, T3K, T3J);
Ip[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
}
{
E T2A, T2D, T3L, T3M;
T2A = FNMS(KP707106781, T23, T1O);
T2D = T2B + T2C;
Rp[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
Rm[0] = FMA(KP923879532, T2D, T2A);
T3L = FNMS(KP707106781, T3I, T3H);
T3M = T2y - T2j;
Im[0] = FMS(KP923879532, T3M, T3L);
Ip[WS(rs, 7)] = FMA(KP923879532, T3M, T3L);
}
{
E T2I, T2P, T3D, T3E;
T2I = FMA(KP707106781, T2H, T2E);
T2P = T2L + T2O;
Rm[WS(rs, 6)] = FNMS(KP923879532, T2P, T2I);
Rp[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
T3D = FMA(KP707106781, T3C, T3B);
T3E = T2R + T2S;
Im[WS(rs, 6)] = FMS(KP923879532, T3E, T3D);
Ip[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
}
{
E T2Q, T2T, T3F, T3G;
T2Q = FNMS(KP707106781, T2H, T2E);
T2T = T2R - T2S;
Rm[WS(rs, 2)] = FNMS(KP923879532, T2T, T2Q);
Rp[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
T3F = FNMS(KP707106781, T3C, T3B);
T3G = T2O - T2L;
Im[WS(rs, 2)] = FMS(KP923879532, T3G, T3F);
Ip[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
}
}
}
}
}
/* Twiddle-instruction table referenced by the hc2cf_16 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the FMA variant: the value 16, the name string, the twiddle
 * table above, the address of GENUS, and { 104, 30, 70, 0 }, whose first three
 * values equal the additions / multiplications / fused multiply-adds quoted
 * in the generator comment for this variant. */
static const hc2c_desc desc = { 16, "hc2cf_16", twinstr, &GENUS, { 104, 30, 70, 0 } };
/* Registration entry point: hands hc2cf_16 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_16) (planner *p) {
X(khc2c_register) (p, hc2cf_16, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cf_16 -include rdft/scalar/hc2cf.h */
/*
* This function contains 174 FP additions, 84 FP multiplications,
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
* 52 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_16 (variant for targets that do not prefer FMA): generated size-16
 * codelet.
 *
 * For every m in [mb, me) the loop body reads Rp, Ip, Rm and Im at offsets
 * WS(rs, 0) .. WS(rs, 7) (32 loads), combines each (Rp[k], Rm[k]) pair for
 * k = 1..7 and each (Ip[k], Im[k]) pair for k = 0..7 with two consecutive
 * entries of W (30 entries in total), and then stores 32 results back into
 * the same four arrays.  All loads happen before the first store.
 *
 * Parameters:
 *   Rp, Ip  - in/out arrays, advanced by +ms after each iteration
 *   Rm, Im  - in/out arrays, advanced by -ms after each iteration
 *   W       - read-only table; starts at W + (mb - 1) * 30 and advances by
 *             30 entries per iteration
 *   rs      - stride passed to WS(rs, k) to form the element offsets
 *   mb, me  - half-open iteration range [mb, me)
 *   ms      - per-iteration step applied to Rp/Ip/Rm/Im
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file; presumably FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- confirm in the headers.
 */
static void hc2cf_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: sin(pi/8), cos(pi/8) and 1/sqrt(2). */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
E T2y, T2z, T1O, T2g, T1T, T2h;
/* Load phase.  Each innermost block reads one (x, y) pair and its two W
 * entries (w0, w1) and forms FMA(w0, x, w1 * y) and FNMS(w1, x, w0 * y);
 * the enclosing block keeps sums and differences of those results. */
/* (Rp[0], Rm[0]) used as-is; (Rp[4], Rm[4]) with W[14], W[15]. */
{
E T1, T2T, T6, T2S;
T1 = Rp[0];
T2T = Rm[0];
{
E T3, T5, T2, T4;
T3 = Rp[WS(rs, 4)];
T5 = Rm[WS(rs, 4)];
T2 = W[14];
T4 = W[15];
T6 = FMA(T2, T3, T4 * T5);
T2S = FNMS(T4, T3, T2 * T5);
}
T7 = T1 + T6;
T37 = T2T - T2S;
T1t = T1 - T6;
T2U = T2S + T2T;
}
/* (Rp[2], Rm[2]) with W[6], W[7]; (Rp[6], Rm[6]) with W[22], W[23]. */
{
E Tc, T1u, Th, T1v;
{
E T9, Tb, T8, Ta;
T9 = Rp[WS(rs, 2)];
Tb = Rm[WS(rs, 2)];
T8 = W[6];
Ta = W[7];
Tc = FMA(T8, T9, Ta * Tb);
T1u = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = Rp[WS(rs, 6)];
Tg = Rm[WS(rs, 6)];
Td = W[22];
Tf = W[23];
Th = FMA(Td, Te, Tf * Tg);
T1v = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc + Th;
T38 = Tc - Th;
T1w = T1u - T1v;
T2R = T1u + T1v;
}
/* (Rp[1], Rm[1]) with W[2], W[3]; (Rp[5], Rm[5]) with W[18], W[19]. */
{
E To, T1y, Tt, T1z, T1A, T1B;
{
E Tl, Tn, Tk, Tm;
Tl = Rp[WS(rs, 1)];
Tn = Rm[WS(rs, 1)];
Tk = W[2];
Tm = W[3];
To = FMA(Tk, Tl, Tm * Tn);
T1y = FNMS(Tm, Tl, Tk * Tn);
}
{
E Tq, Ts, Tp, Tr;
Tq = Rp[WS(rs, 5)];
Ts = Rm[WS(rs, 5)];
Tp = W[18];
Tr = W[19];
Tt = FMA(Tp, Tq, Tr * Ts);
T1z = FNMS(Tr, Tq, Tp * Ts);
}
Tu = To + Tt;
T2s = T1y + T1z;
T1A = T1y - T1z;
T1B = To - Tt;
T1C = T1A - T1B;
T2c = T1B + T1A;
}
/* (Rp[7], Rm[7]) with W[26], W[27]; (Rp[3], Rm[3]) with W[10], W[11]. */
{
E Tz, T1E, TE, T1F, T1D, T1G;
{
E Tw, Ty, Tv, Tx;
Tw = Rp[WS(rs, 7)];
Ty = Rm[WS(rs, 7)];
Tv = W[26];
Tx = W[27];
Tz = FMA(Tv, Tw, Tx * Ty);
T1E = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = Rp[WS(rs, 3)];
TD = Rm[WS(rs, 3)];
TA = W[10];
TC = W[11];
TE = FMA(TA, TB, TC * TD);
T1F = FNMS(TC, TB, TA * TD);
}
TF = Tz + TE;
T2t = T1E + T1F;
T1D = Tz - TE;
T1G = T1E - T1F;
T1H = T1D + T1G;
T2d = T1D - T1G;
}
/* Ip/Im pairs at offsets 7, 5, 3 and 1 with W[28..29], W[20..21],
 * W[12..13] and W[4..5] respectively. */
{
E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
{
E T16, T18, T15, T17;
T16 = Ip[WS(rs, 7)];
T18 = Im[WS(rs, 7)];
T15 = W[28];
T17 = W[29];
T19 = FMA(T15, T16, T17 * T18);
T20 = FNMS(T17, T16, T15 * T18);
}
{
E T1m, T1o, T1l, T1n;
T1m = Ip[WS(rs, 5)];
T1o = Im[WS(rs, 5)];
T1l = W[20];
T1n = W[21];
T1p = FMA(T1l, T1m, T1n * T1o);
T1X = FNMS(T1n, T1m, T1l * T1o);
}
{
E T1b, T1d, T1a, T1c;
T1b = Ip[WS(rs, 3)];
T1d = Im[WS(rs, 3)];
T1a = W[12];
T1c = W[13];
T1e = FMA(T1a, T1b, T1c * T1d);
T21 = FNMS(T1c, T1b, T1a * T1d);
}
{
E T1h, T1j, T1g, T1i;
T1h = Ip[WS(rs, 1)];
T1j = Im[WS(rs, 1)];
T1g = W[4];
T1i = W[5];
T1k = FMA(T1g, T1h, T1i * T1j);
T1W = FNMS(T1i, T1h, T1g * T1j);
}
T1f = T19 + T1e;
T1q = T1k + T1p;
T2B = T1f - T1q;
T2C = T20 + T21;
T2D = T1W + T1X;
T2E = T2C - T2D;
{
E T1V, T1Y, T22, T23;
T1V = T19 - T1e;
T1Y = T1W - T1X;
T1Z = T1V - T1Y;
T2j = T1V + T1Y;
T22 = T20 - T21;
T23 = T1k - T1p;
T24 = T22 + T23;
T2k = T22 - T23;
}
}
/* Ip/Im pairs at offsets 0, 6, 4 and 2 with W[0..1], W[24..25],
 * W[16..17] and W[8..9] respectively. */
{
E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
{
E TJ, TL, TI, TK;
TJ = Ip[0];
TL = Im[0];
TI = W[0];
TK = W[1];
TM = FMA(TI, TJ, TK * TL);
T1K = FNMS(TK, TJ, TI * TL);
}
{
E TZ, T11, TY, T10;
TZ = Ip[WS(rs, 6)];
T11 = Im[WS(rs, 6)];
TY = W[24];
T10 = W[25];
T12 = FMA(TY, TZ, T10 * T11);
T1R = FNMS(T10, TZ, TY * T11);
}
{
E TO, TQ, TN, TP;
TO = Ip[WS(rs, 4)];
TQ = Im[WS(rs, 4)];
TN = W[16];
TP = W[17];
TR = FMA(TN, TO, TP * TQ);
T1L = FNMS(TP, TO, TN * TQ);
}
{
E TU, TW, TT, TV;
TU = Ip[WS(rs, 2)];
TW = Im[WS(rs, 2)];
TT = W[8];
TV = W[9];
TX = FMA(TT, TU, TV * TW);
T1Q = FNMS(TV, TU, TT * TW);
}
TS = TM + TR;
T13 = TX + T12;
T2w = TS - T13;
T2x = T1K + T1L;
T2y = T1Q + T1R;
T2z = T2x - T2y;
{
E T1M, T1N, T1P, T1S;
T1M = T1K - T1L;
T1N = TX - T12;
T1O = T1M + T1N;
T2g = T1M - T1N;
T1P = TM - TR;
T1S = T1Q - T1R;
T1T = T1P - T1S;
T2h = T1P + T1S;
}
}
/* Store phase 1: Rp/Ip[3], Rp/Ip[7], Rm/Im[0], Rm/Im[4], built from terms
 * scaled by KP707106781, KP923879532 and KP382683432. */
{
E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
{
E T1x, T1I, T3e, T3f;
T1x = T1t - T1w;
T1I = KP707106781 * (T1C - T1H);
T1J = T1x + T1I;
T27 = T1x - T1I;
T3e = KP707106781 * (T2d - T2c);
T3f = T38 + T37;
T3g = T3e + T3f;
T3i = T3f - T3e;
}
{
E T1U, T25, T28, T29;
T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
T26 = T1U + T25;
T3h = T25 - T1U;
T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
T2a = T28 - T29;
T3d = T28 + T29;
}
Rm[WS(rs, 4)] = T1J - T26;
Im[WS(rs, 4)] = T3d - T3g;
Rp[WS(rs, 3)] = T1J + T26;
Ip[WS(rs, 3)] = T3d + T3g;
Rm[0] = T27 - T2a;
Im[0] = T3h - T3i;
Rp[WS(rs, 7)] = T27 + T2a;
Ip[WS(rs, 7)] = T3h + T3i;
}
/* Store phase 2: Rp/Ip[2], Rp/Ip[6], Rm/Im[1], Rm/Im[5]; only
 * KP707106781 is used here. */
{
E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
{
E T2r, T2u, T30, T31;
T2r = T7 - Ti;
T2u = T2s - T2t;
T2v = T2r + T2u;
T2H = T2r - T2u;
T30 = TF - Tu;
T31 = T2U - T2R;
T32 = T30 + T31;
T34 = T31 - T30;
}
{
E T2A, T2F, T2I, T2J;
T2A = T2w + T2z;
T2F = T2B - T2E;
T2G = KP707106781 * (T2A + T2F);
T33 = KP707106781 * (T2F - T2A);
T2I = T2z - T2w;
T2J = T2B + T2E;
T2K = KP707106781 * (T2I - T2J);
T2Z = KP707106781 * (T2I + T2J);
}
Rm[WS(rs, 5)] = T2v - T2G;
Im[WS(rs, 5)] = T2Z - T32;
Rp[WS(rs, 2)] = T2v + T2G;
Ip[WS(rs, 2)] = T2Z + T32;
Rm[WS(rs, 1)] = T2H - T2K;
Im[WS(rs, 1)] = T33 - T34;
Rp[WS(rs, 6)] = T2H + T2K;
Ip[WS(rs, 6)] = T33 + T34;
}
/* Store phase 3: Rp/Ip[1], Rp/Ip[5], Rm/Im[2], Rm/Im[6], built from terms
 * scaled by KP707106781, KP382683432 and KP923879532. */
{
E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
{
E T2b, T2e, T36, T39;
T2b = T1t + T1w;
T2e = KP707106781 * (T2c + T2d);
T2f = T2b + T2e;
T2n = T2b - T2e;
T36 = KP707106781 * (T1C + T1H);
T39 = T37 - T38;
T3a = T36 + T39;
T3c = T39 - T36;
}
{
E T2i, T2l, T2o, T2p;
T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
T2m = T2i + T2l;
T3b = T2l - T2i;
T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
T2q = T2o - T2p;
T35 = T2o + T2p;
}
Rm[WS(rs, 6)] = T2f - T2m;
Im[WS(rs, 6)] = T35 - T3a;
Rp[WS(rs, 1)] = T2f + T2m;
Ip[WS(rs, 1)] = T35 + T3a;
Rm[WS(rs, 2)] = T2n - T2q;
Im[WS(rs, 2)] = T3b - T3c;
Rp[WS(rs, 5)] = T2n + T2q;
Ip[WS(rs, 5)] = T3b + T3c;
}
/* Store phase 4: Rp/Ip[0], Rp/Ip[4], Rm/Im[3], Rm/Im[7] -- additions and
 * subtractions only, no constant multiplies. */
{
E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
{
E Tj, TG, T2Q, T2V;
Tj = T7 + Ti;
TG = Tu + TF;
TH = Tj + TG;
T2L = Tj - TG;
T2Q = T2s + T2t;
T2V = T2R + T2U;
T2W = T2Q + T2V;
T2Y = T2V - T2Q;
}
{
E T14, T1r, T2M, T2N;
T14 = TS + T13;
T1r = T1f + T1q;
T1s = T14 + T1r;
T2X = T1r - T14;
T2M = T2x + T2y;
T2N = T2C + T2D;
T2O = T2M - T2N;
T2P = T2M + T2N;
}
Rm[WS(rs, 7)] = TH - T1s;
Im[WS(rs, 7)] = T2P - T2W;
Rp[0] = TH + T1s;
Ip[0] = T2P + T2W;
Rm[WS(rs, 3)] = T2L - T2O;
Im[WS(rs, 3)] = T2X - T2Y;
Rp[WS(rs, 4)] = T2L + T2O;
Ip[WS(rs, 4)] = T2X + T2Y;
}
}
}
}
/* Twiddle-instruction table referenced by the hc2cf_16 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the non-FMA variant: the value 16, the name string, the
 * twiddle table above, the address of GENUS, and { 136, 46, 38, 0 }, whose
 * first three values equal the additions / multiplications / fused
 * multiply-adds quoted in the generator comment for this variant. */
static const hc2c_desc desc = { 16, "hc2cf_16", twinstr, &GENUS, { 136, 46, 38, 0 } };
/* Registration entry point: hands hc2cf_16 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_16) (planner *p) {
X(khc2c_register) (p, hc2cf_16, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cf_2 -include rdft/scalar/hc2cf.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 11 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_2 (FMA-preferring variant): size-2 codelet.
 *
 * For each m in [mb, me): reads Rp[0], Rm[0], Ip[0] and Im[0], combines the
 * (Ip[0], Im[0]) pair with W[0] and W[1], and writes the sums/differences
 * back to the same four locations.  Rp/Ip step forward by ms, Rm/Im step
 * backward by ms, and W advances by 2 entries per iteration.
 *
 * NOTE(review): FMA/FNMS come from project headers; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm there.
 */
static void hc2cf_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    /* Position W at the entries belonging to iteration mb. */
    W += (mb - 1) * 2;

    for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
        E rp0, rm0, ip0, im0;
        E w0, w1;
        E w0_ip, w0_im;
        E tw_p, tw_m;

        /* Every input is read before the first store (in-place update). */
        rp0 = Rp[0];
        rm0 = Rm[0];
        ip0 = Ip[0];
        im0 = Im[0];

        w0 = W[0];
        w0_ip = w0 * ip0;
        w0_im = w0 * im0;
        w1 = W[1];
        tw_p = FMA(w1, im0, w0_ip);
        tw_m = FNMS(w1, ip0, w0_im);

        Rm[0] = rp0 - tw_p;
        Im[0] = tw_m - rm0;
        Rp[0] = rp0 + tw_p;
        Ip[0] = tw_m + rm0;
    }
}
/* Twiddle-instruction table referenced by the hc2cf_2 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the FMA variant: the value 2, the name string, the twiddle
 * table above, the address of GENUS, and { 4, 2, 2, 0 }, whose first three
 * values equal the additions / multiplications / fused multiply-adds quoted
 * in the generator comment for this variant. */
static const hc2c_desc desc = { 2, "hc2cf_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
/* Registration entry point: hands hc2cf_2 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_2) (planner *p) {
X(khc2c_register) (p, hc2cf_2, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cf_2 -include rdft/scalar/hc2cf.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 9 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_2 (variant for targets that do not prefer FMA): size-2 codelet.
 *
 * For each m in [mb, me): reads Rp[0], Rm[0], Ip[0] and Im[0], combines the
 * (Ip[0], Im[0]) pair with W[0] and W[1], and writes the sums/differences
 * back to the same four locations.  Rp/Ip step forward by ms, Rm/Im step
 * backward by ms, and W advances by 2 entries per iteration.
 *
 * NOTE(review): FMA/FNMS come from project headers; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm there.
 */
static void hc2cf_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    /* Position W at the entries belonging to iteration mb. */
    W += (mb - 1) * 2;

    for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
        E rp0, rm0, ip0, im0;
        E w0, w1;
        E tw_p, tw_m;

        /* Every input is read before the first store (in-place update). */
        rp0 = Rp[0];
        rm0 = Rm[0];
        ip0 = Ip[0];
        im0 = Im[0];

        w0 = W[0];
        w1 = W[1];
        tw_p = FMA(w0, ip0, w1 * im0);
        tw_m = FNMS(w1, ip0, w0 * im0);

        Rm[0] = rp0 - tw_p;
        Im[0] = tw_m - rm0;
        Rp[0] = rp0 + tw_p;
        Ip[0] = tw_m + rm0;
    }
}
/* Twiddle-instruction table referenced by the hc2cf_2 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the non-FMA variant: the value 2, the name string, the
 * twiddle table above, the address of GENUS, and { 4, 2, 2, 0 }, whose first
 * three values equal the additions / multiplications / fused multiply-adds
 * quoted in the generator comment for this variant. */
static const hc2c_desc desc = { 2, "hc2cf_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
/* Registration entry point: hands hc2cf_2 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_2) (planner *p) {
X(khc2c_register) (p, hc2cf_2, &desc, HC2C_VIA_RDFT);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cf_4 -include rdft/scalar/hc2cf.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 15 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_4 (FMA-preferring variant): size-4 codelet.
 *
 * For each m in [mb, me): reads Rp, Ip, Rm and Im at offsets 0 and
 * WS(rs, 1), combines the pairs (Rp[1], Rm[1]), (Ip[0], Im[0]) and
 * (Ip[1], Im[1]) with W[2..3], W[0..1] and W[4..5] respectively, and stores
 * eight sums/differences back into the same arrays.  Rp/Ip step forward by
 * ms, Rm/Im step backward by ms, and W advances by 6 entries per iteration.
 *
 * NOTE(review): FMA/FNMS/WS come from project headers; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm there.
 */
static void hc2cf_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    /* Position W at the entries belonging to iteration mb. */
    W += (mb - 1) * 6;

    for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
        E rp0, rm0;     /* Rp[0], Rm[0], used without a W factor */
        E a_p, a_m;     /* from (Rp[1], Rm[1]) and W[2], W[3] */
        E b_p, b_m;     /* from (Ip[0], Im[0]) and W[0], W[1] */
        E c_p, c_m;     /* from (Ip[1], Im[1]) and W[4], W[5] */

        /* ---- load phase: every input is read before the first store ---- */
        rp0 = Rp[0];
        rm0 = Rm[0];
        {
            E x, y, wa, wb, wa_x, wa_y;
            x = Rp[WS(rs, 1)];
            y = Rm[WS(rs, 1)];
            wa = W[2];
            wa_x = wa * x;
            wa_y = wa * y;
            wb = W[3];
            a_p = FMA(wb, y, wa_x);
            a_m = FNMS(wb, x, wa_y);
        }
        {
            E x, y, wa, wb, wa_x, wa_y;
            x = Ip[0];
            y = Im[0];
            wa = W[0];
            wa_x = wa * x;
            wa_y = wa * y;
            wb = W[1];
            b_p = FMA(wb, y, wa_x);
            b_m = FNMS(wb, x, wa_y);
        }
        {
            E x, y, wa, wb, wa_x, wa_y;
            x = Ip[WS(rs, 1)];
            y = Im[WS(rs, 1)];
            wa = W[4];
            wa_x = wa * x;
            wa_y = wa * y;
            wb = W[5];
            c_p = FMA(wb, y, wa_x);
            c_m = FNMS(wb, x, wa_y);
        }

        /* ---- store phase: same store order as the generated code ---- */
        {
            E s0, s1, s2, s3;
            s0 = rp0 + a_p;
            s1 = b_p + c_p;
            s2 = b_m + c_m;
            s3 = a_m + rm0;
            Rm[WS(rs, 1)] = s0 - s1;
            Rp[0] = s0 + s1;
            Im[WS(rs, 1)] = s2 - s3;
            Ip[0] = s2 + s3;
        }
        {
            E d0, d1, d2, d3;
            d0 = rp0 - a_p;
            d1 = b_m - c_m;
            d2 = c_p - b_p;
            d3 = rm0 - a_m;
            Rm[0] = d0 - d1;
            Rp[WS(rs, 1)] = d0 + d1;
            Im[0] = d2 - d3;
            Ip[WS(rs, 1)] = d2 + d3;
        }
    }
}
/* Twiddle-instruction table referenced by the hc2cf_4 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the FMA variant: the value 4, the name string, the twiddle
 * table above, the address of GENUS, and { 16, 6, 6, 0 }, whose first three
 * values equal the additions / multiplications / fused multiply-adds quoted
 * in the generator comment for this variant. */
static const hc2c_desc desc = { 4, "hc2cf_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
/* Registration entry point: hands hc2cf_4 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_4) (planner *p) {
X(khc2c_register) (p, hc2cf_4, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cf_4 -include rdft/scalar/hc2cf.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_4 (variant for targets that do not prefer FMA): size-4 codelet.
 *
 * For each m in [mb, me): reads Rp, Ip, Rm and Im at offsets 0 and
 * WS(rs, 1), combines the pairs (Rp[1], Rm[1]), (Ip[0], Im[0]) and
 * (Ip[1], Im[1]) with W[2..3], W[0..1] and W[4..5] respectively, and stores
 * eight sums/differences back into the same arrays.  Rp/Ip step forward by
 * ms, Rm/Im step backward by ms, and W advances by 6 entries per iteration.
 *
 * NOTE(review): FMA/FNMS/WS come from project headers; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm there.
 */
static void hc2cf_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    /* Position W at the entries belonging to iteration mb. */
    W += (mb - 1) * 6;

    for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
        E rp0, rm0;     /* Rp[0], Rm[0], used without a W factor */
        E a_p, a_m;     /* from (Rp[1], Rm[1]) and W[2], W[3] */
        E b_p, b_m;     /* from (Ip[0], Im[0]) and W[0], W[1] */
        E c_p, c_m;     /* from (Ip[1], Im[1]) and W[4], W[5] */

        /* ---- load phase: every input is read before the first store ---- */
        rp0 = Rp[0];
        rm0 = Rm[0];
        {
            E x, y, wa, wb;
            x = Rp[WS(rs, 1)];
            y = Rm[WS(rs, 1)];
            wa = W[2];
            wb = W[3];
            a_p = FMA(wa, x, wb * y);
            a_m = FNMS(wb, x, wa * y);
        }
        {
            E x, y, wa, wb;
            x = Ip[0];
            y = Im[0];
            wa = W[0];
            wb = W[1];
            b_p = FMA(wa, x, wb * y);
            b_m = FNMS(wb, x, wa * y);
        }
        {
            E x, y, wa, wb;
            x = Ip[WS(rs, 1)];
            y = Im[WS(rs, 1)];
            wa = W[4];
            wb = W[5];
            c_p = FMA(wa, x, wb * y);
            c_m = FNMS(wb, x, wa * y);
        }

        /* ---- store phase: same store order as the generated code ---- */
        {
            E s0, s1, s2, s3;
            s0 = rp0 + a_p;
            s1 = b_p + c_p;
            s2 = b_m + c_m;
            s3 = a_m + rm0;
            Rm[WS(rs, 1)] = s0 - s1;
            Rp[0] = s0 + s1;
            Im[WS(rs, 1)] = s2 - s3;
            Ip[0] = s2 + s3;
        }
        {
            E d0, d1, d2, d3;
            d0 = rp0 - a_p;
            d1 = b_m - c_m;
            d2 = c_p - b_p;
            d3 = rm0 - a_m;
            Rm[0] = d0 - d1;
            Rp[WS(rs, 1)] = d0 + d1;
            Im[0] = d2 - d3;
            Ip[WS(rs, 1)] = d2 + d3;
        }
    }
}
/* Twiddle-instruction table referenced by the hc2cf_4 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the non-FMA variant: the value 4, the name string, the
 * twiddle table above, the address of GENUS, and { 16, 6, 6, 0 }, whose first
 * three values equal the additions / multiplications / fused multiply-adds
 * quoted in the generator comment for this variant. */
static const hc2c_desc desc = { 4, "hc2cf_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
/* Registration entry point: hands hc2cf_4 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_4) (planner *p) {
X(khc2c_register) (p, hc2cf_4, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,295 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cf_6 -include rdft/scalar/hc2cf.h */
/*
* This function contains 46 FP additions, 32 FP multiplications,
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
* 31 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_6 (FMA-preferring variant): generated size-6 codelet.
 *
 * For every m in [mb, me) the loop body reads Rp, Ip, Rm and Im at offsets
 * 0, WS(rs, 1) and WS(rs, 2) (12 loads), combines every pair except
 * (Rp[0], Rm[0]) with two consecutive entries of W (10 entries in total),
 * and stores 12 results back into the same four arrays.  All loads happen
 * before the first store.
 *
 * Parameters:
 *   Rp, Ip  - in/out arrays, advanced by +ms after each iteration
 *   Rm, Im  - in/out arrays, advanced by -ms after each iteration
 *   W       - read-only table; starts at W + (mb - 1) * 10 and advances by
 *             10 entries per iteration
 *   rs      - stride passed to WS(rs, k) to form the element offsets
 *   mb, me  - half-open iteration range [mb, me)
 *   ms      - per-iteration step applied to Rp/Ip/Rm/Im
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS, FMS and MAKE_VOLATILE_STRIDE are
 * defined outside this file; presumably FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c -- confirm in the headers.
 */
static void hc2cf_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: sqrt(3)/2 and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
E T1, TX, T7, TW, Tl, TS, TB, TJ, Ty, TR, TC, TO;
/* (Rp[0], Rm[0]) is used without a W factor. */
T1 = Rp[0];
TX = Rm[0];
/* (Ip[1], Im[1]) with W[4], W[5]. */
{
E T3, T6, T4, TV, T2, T5;
T3 = Ip[WS(rs, 1)];
T6 = Im[WS(rs, 1)];
T2 = W[4];
T4 = T2 * T3;
TV = T2 * T6;
T5 = W[5];
T7 = FMA(T5, T6, T4);
TW = FNMS(T5, T3, TV);
}
/* (Rp[1], Rm[1]) with W[2], W[3]; (Ip[2], Im[2]) with W[8], W[9]. */
{
E Ta, Td, Tb, TF, Tg, Tj, Th, TH, T9, Tf;
Ta = Rp[WS(rs, 1)];
Td = Rm[WS(rs, 1)];
T9 = W[2];
Tb = T9 * Ta;
TF = T9 * Td;
Tg = Ip[WS(rs, 2)];
Tj = Im[WS(rs, 2)];
Tf = W[8];
Th = Tf * Tg;
TH = Tf * Tj;
{
E Te, TG, Tk, TI, Tc, Ti;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
TG = FNMS(Tc, Ta, TF);
Ti = W[9];
Tk = FMA(Ti, Tj, Th);
TI = FNMS(Ti, Tg, TH);
Tl = Te - Tk;
TS = TI - TG;
TB = Te + Tk;
TJ = TG + TI;
}
}
/* (Rp[2], Rm[2]) with W[6], W[7]; (Ip[0], Im[0]) with W[0], W[1]. */
{
E Tn, Tq, To, TK, Tt, Tw, Tu, TM, Tm, Ts;
Tn = Rp[WS(rs, 2)];
Tq = Rm[WS(rs, 2)];
Tm = W[6];
To = Tm * Tn;
TK = Tm * Tq;
Tt = Ip[0];
Tw = Im[0];
Ts = W[0];
Tu = Ts * Tt;
TM = Ts * Tw;
{
E Tr, TL, Tx, TN, Tp, Tv;
Tp = W[7];
Tr = FMA(Tp, Tq, To);
TL = FNMS(Tp, Tn, TK);
Tv = W[1];
Tx = FMA(Tv, Tw, Tu);
TN = FNMS(Tv, Tt, TM);
Ty = Tr - Tx;
TR = TN - TL;
TC = Tr + Tx;
TO = TL + TN;
}
}
/* Stores Rm[2], Rp[1], Rm[0]. */
{
E TT, T8, Tz, TQ;
TT = TR - TS;
T8 = T1 - T7;
Tz = Tl + Ty;
TQ = FNMS(KP500000000, Tz, T8);
Rm[WS(rs, 2)] = T8 + Tz;
Rp[WS(rs, 1)] = FMA(KP866025403, TT, TQ);
Rm[0] = FNMS(KP866025403, TT, TQ);
}
/* Stores Im[2], Ip[1], Im[0]. */
{
E T14, T11, T12, T13;
T14 = Ty - Tl;
T11 = TS + TR;
T12 = TX - TW;
T13 = FMA(KP500000000, T11, T12);
Im[WS(rs, 2)] = T11 - T12;
Ip[WS(rs, 1)] = FMA(KP866025403, T14, T13);
Im[0] = FMS(KP866025403, T14, T13);
}
/* Stores Rp[0], Rm[1], Rp[2]. */
{
E TP, TA, TD, TE;
TP = TJ - TO;
TA = T1 + T7;
TD = TB + TC;
TE = FNMS(KP500000000, TD, TA);
Rp[0] = TA + TD;
Rm[WS(rs, 1)] = FMA(KP866025403, TP, TE);
Rp[WS(rs, 2)] = FNMS(KP866025403, TP, TE);
}
/* Stores Ip[0], Ip[2], Im[1]. */
{
E T10, TU, TY, TZ;
T10 = TB - TC;
TU = TJ + TO;
TY = TW + TX;
TZ = FNMS(KP500000000, TU, TY);
Ip[0] = TU + TY;
Ip[WS(rs, 2)] = FMA(KP866025403, T10, TZ);
Im[WS(rs, 1)] = FMS(KP866025403, T10, TZ);
}
}
}
}
/* Twiddle-instruction table referenced by the hc2cf_6 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 6 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the FMA variant: the value 6, the name string, the twiddle
 * table above, the address of GENUS, and { 24, 10, 22, 0 }, whose first three
 * values equal the additions / multiplications / fused multiply-adds quoted
 * in the generator comment for this variant. */
static const hc2c_desc desc = { 6, "hc2cf_6", twinstr, &GENUS, { 24, 10, 22, 0 } };
/* Registration entry point: hands hc2cf_6 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_6) (planner *p) {
X(khc2c_register) (p, hc2cf_6, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cf_6 -include rdft/scalar/hc2cf.h */
/*
* This function contains 46 FP additions, 28 FP multiplications,
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
* 23 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_6 (variant for targets that do not prefer FMA): generated size-6
 * codelet.
 *
 * For every m in [mb, me) the loop body reads Rp, Ip, Rm and Im at offsets
 * 0, WS(rs, 1) and WS(rs, 2) (12 loads), combines every pair except
 * (Rp[0], Rm[0]) with two consecutive entries of W (10 entries in total),
 * and stores 12 results back into the same four arrays.  All loads happen
 * before the first store.
 *
 * Parameters:
 *   Rp, Ip  - in/out arrays, advanced by +ms after each iteration
 *   Rm, Im  - in/out arrays, advanced by -ms after each iteration
 *   W       - read-only table; starts at W + (mb - 1) * 10 and advances by
 *             10 entries per iteration
 *   rs      - stride passed to WS(rs, k) to form the element offsets
 *   mb, me  - half-open iteration range [mb, me)
 *   ms      - per-iteration step applied to Rp/Ip/Rm/Im
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are defined
 * outside this file; presumably FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- confirm in the headers.
 */
static void hc2cf_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: 1/2 and sqrt(3)/2. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
E T7, TS, Tv, TO, Tt, TJ, Tx, TF, Ti, TI, Tw, TC;
/* (Rp[0], Rm[0]) used as-is; (Ip[1], Im[1]) with W[4], W[5]. */
{
E T1, TN, T6, TM;
T1 = Rp[0];
TN = Rm[0];
{
E T3, T5, T2, T4;
T3 = Ip[WS(rs, 1)];
T5 = Im[WS(rs, 1)];
T2 = W[4];
T4 = W[5];
T6 = FMA(T2, T3, T4 * T5);
TM = FNMS(T4, T3, T2 * T5);
}
T7 = T1 - T6;
TS = TN - TM;
Tv = T1 + T6;
TO = TM + TN;
}
/* (Rp[2], Rm[2]) with W[6], W[7]; (Ip[0], Im[0]) with W[0], W[1]. */
{
E Tn, TD, Ts, TE;
{
E Tk, Tm, Tj, Tl;
Tk = Rp[WS(rs, 2)];
Tm = Rm[WS(rs, 2)];
Tj = W[6];
Tl = W[7];
Tn = FMA(Tj, Tk, Tl * Tm);
TD = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = Ip[0];
Tr = Im[0];
To = W[0];
Tq = W[1];
Ts = FMA(To, Tp, Tq * Tr);
TE = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn - Ts;
TJ = TE - TD;
Tx = Tn + Ts;
TF = TD + TE;
}
/* (Rp[1], Rm[1]) with W[2], W[3]; (Ip[2], Im[2]) with W[8], W[9]. */
{
E Tc, TA, Th, TB;
{
E T9, Tb, T8, Ta;
T9 = Rp[WS(rs, 1)];
Tb = Rm[WS(rs, 1)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
TA = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = Ip[WS(rs, 2)];
Tg = Im[WS(rs, 2)];
Td = W[8];
Tf = W[9];
Th = FMA(Td, Te, Tf * Tg);
TB = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc - Th;
TI = TA - TB;
Tw = Tc + Th;
TC = TA + TB;
}
/* Stores Rm[2], Rp[1], Rm[0], then Im[2], Ip[1], Im[0]. */
{
E TK, Tu, TH, TT, TR, TU;
TK = KP866025403 * (TI + TJ);
Tu = Ti + Tt;
TH = FNMS(KP500000000, Tu, T7);
Rm[WS(rs, 2)] = T7 + Tu;
Rp[WS(rs, 1)] = TH + TK;
Rm[0] = TH - TK;
TT = KP866025403 * (Tt - Ti);
TR = TJ - TI;
TU = FMA(KP500000000, TR, TS);
Im[WS(rs, 2)] = TR - TS;
Ip[WS(rs, 1)] = TT + TU;
Im[0] = TT - TU;
}
/* Stores Rp[0], Rm[1], Rp[2], then Ip[0], Ip[2], Im[1]. */
{
E TG, Ty, Tz, TP, TL, TQ;
TG = KP866025403 * (TC - TF);
Ty = Tw + Tx;
Tz = FNMS(KP500000000, Ty, Tv);
Rp[0] = Tv + Ty;
Rm[WS(rs, 1)] = Tz + TG;
Rp[WS(rs, 2)] = Tz - TG;
TP = KP866025403 * (Tw - Tx);
TL = TC + TF;
TQ = FNMS(KP500000000, TL, TO);
Ip[0] = TL + TO;
Ip[WS(rs, 2)] = TP + TQ;
Im[WS(rs, 1)] = TP - TQ;
}
}
}
}
/* Twiddle-instruction table referenced by the hc2cf_6 descriptor below.
 * It holds one TW_FULL entry followed by a TW_NEXT entry as its last element.
 * NOTE(review): the meaning of the tw_instr fields is defined outside this
 * file -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 6 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor for the non-FMA variant: the value 6, the name string, the
 * twiddle table above, the address of GENUS, and { 32, 14, 14, 0 }, whose
 * first three values equal the additions / multiplications / fused
 * multiply-adds quoted in the generator comment for this variant. */
static const hc2c_desc desc = { 6, "hc2cf_6", twinstr, &GENUS, { 32, 14, 14, 0 } };
/* Registration entry point: hands hc2cf_6 and its descriptor to planner p
 * through X(khc2c_register), passing HC2C_VIA_RDFT. */
void X(codelet_hc2cf_6) (planner *p) {
X(khc2c_register) (p, hc2cf_6, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,376 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include rdft/scalar/hc2cf.h */
/*
* This function contains 66 FP additions, 36 FP multiplications,
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
* 34 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_8 (FMA-preferring variant): generated size-8 twiddle kernel
 * (genfft options "-n 8 -dit", see the "Generated by" comment above).
 *
 * For every m in [mb, me) the kernel loads four elements (indices 0..3,
 * addressed through WS(rs, k)) from each of Rp, Ip, Rm and Im, combines
 * them with 14 twiddle values W[0..13], and stores 16 results back into
 * the same four arrays.  All inputs are loaded into locals before the
 * first store, so the update is done in place.
 *
 * Parameters:
 *   Rp, Ip  - arrays read and overwritten; advanced by +ms per iteration.
 *   Rm, Im  - arrays read and overwritten; stepped by -ms per iteration.
 *   W       - twiddle table; starts at W + (mb - 1) * 14 and advances by
 *             14 entries per iteration.
 *   rs      - stride object passed to WS(rs, k) to locate element k.
 *   mb, me  - half-open iteration range.
 *   ms      - per-iteration pointer step.
 *
 * NOTE(review): comments below read FMA(a,b,c) as a*b+c, FNMS(a,b,c) as
 * c-a*b and FMS(a,b,c) as a*b-c; these macros are defined outside this
 * view -- confirm.
 */
static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.7071... = sqrt(2)/2, the only non-trivial constant of this kernel. */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
E TX, Ty, TZ, TV, T10;
/* The (Rp[0], Rm[0]) pair is used as-is, without a twiddle. */
T1 = Rp[0];
T1m = Rm[0];
{
/* (Rp, Rm) element 2 combined with twiddle pair (W[6], W[7]):
   T7 = W6*Rp + W7*Rm, T1l = W6*Rm - W7*Rp. */
E T3, T6, T4, T1k, T2, T5;
T3 = Rp[WS(rs, 2)];
T6 = Rm[WS(rs, 2)];
T2 = W[6];
T4 = T2 * T3;
T1k = T2 * T6;
T5 = W[7];
T7 = FMA(T5, T6, T4);
T1l = FNMS(T5, T3, T1k);
}
{
/* (Rp, Rm) element 3 combined with twiddle pair (W[10], W[11]). */
E Tg, Tj, Th, TR, Tf, Ti;
Tg = Rp[WS(rs, 3)];
Tj = Rm[WS(rs, 3)];
Tf = W[10];
Th = Tf * Tg;
TR = Tf * Tj;
Ti = W[11];
Tk = FMA(Ti, Tj, Th);
TS = FNMS(Ti, Tg, TR);
}
{
/* (Rp, Rm) element 1 combined with twiddle pair (W[2], W[3]). */
E Ta, Td, Tb, TP, T9, Tc;
Ta = Rp[WS(rs, 1)];
Td = Rm[WS(rs, 1)];
T9 = W[2];
Tb = T9 * Ta;
TP = T9 * Td;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
TQ = FNMS(Tc, Ta, TP);
}
{
/* (Ip, Im) elements 3 and 1 combined with (W[12], W[13]) and
   (W[4], W[5]); T12/T17 are the differences of the two results. */
E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
TB = Ip[WS(rs, 3)];
TE = Im[WS(rs, 3)];
TA = W[12];
TC = TA * TB;
T13 = TA * TE;
TH = Ip[WS(rs, 1)];
TK = Im[WS(rs, 1)];
TG = W[4];
TI = TG * TH;
T15 = TG * TK;
TD = W[13];
TF = FMA(TD, TE, TC);
T14 = FNMS(TD, TB, T13);
TJ = W[5];
TL = FMA(TJ, TK, TI);
T16 = FNMS(TJ, TH, T15);
T12 = TF - TL;
T17 = T14 - T16;
}
{
/* (Ip, Im) elements 0 and 2 combined with (W[0], W[1]) and
   (W[8], W[9]); TV/T10 are the differences of the two results. */
E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
To = Ip[0];
Tr = Im[0];
Tn = W[0];
Tp = Tn * To;
TW = Tn * Tr;
Tu = Ip[WS(rs, 2)];
Tx = Im[WS(rs, 2)];
Tt = W[8];
Tv = Tt * Tu;
TY = Tt * Tx;
Tq = W[1];
Ts = FMA(Tq, Tr, Tp);
TX = FNMS(Tq, To, TW);
Tw = W[9];
Ty = FMA(Tw, Tx, Tv);
TZ = FNMS(Tw, Tu, TY);
TV = Ts - Ty;
T10 = TX - TZ;
}
{
/* First output group: the eight results that involve the sqrt(2)/2
   factor (Rp/Ip elements 1 and 3, Rm/Im elements 0 and 2). */
E TU, T1a, T1t, T1v, T19, T1w, T1d, T1u;
{
E TO, TT, T1r, T1s;
TO = T1 - T7;
TT = TQ - TS;
TU = TO + TT;
T1a = TO - TT;
T1r = T1m - T1l;
T1s = Te - Tk;
T1t = T1r - T1s;
T1v = T1s + T1r;
}
{
E T11, T18, T1b, T1c;
T11 = TV + T10;
T18 = T12 - T17;
T19 = T11 + T18;
T1w = T18 - T11;
T1b = T10 - TV;
T1c = T12 + T17;
T1d = T1b - T1c;
T1u = T1b + T1c;
}
Rm[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
Im[WS(rs, 2)] = FMS(KP707106781, T1u, T1t);
Rp[WS(rs, 1)] = FMA(KP707106781, T19, TU);
Ip[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
Rm[0] = FNMS(KP707106781, T1d, T1a);
Im[0] = FMS(KP707106781, T1w, T1v);
Rp[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
Ip[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
}
{
/* Second output group: the eight results that are plain sums and
   differences (Rp/Ip elements 0 and 2, Rm/Im elements 1 and 3). */
E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
{
E T8, Tl, T1j, T1n;
T8 = T1 + T7;
Tl = Te + Tk;
Tm = T8 + Tl;
T1e = T8 - Tl;
T1j = TQ + TS;
T1n = T1l + T1m;
T1o = T1j + T1n;
T1q = T1n - T1j;
}
{
E Tz, TM, T1f, T1g;
Tz = Ts + Ty;
TM = TF + TL;
TN = Tz + TM;
T1p = TM - Tz;
T1f = TX + TZ;
T1g = T14 + T16;
T1h = T1f - T1g;
T1i = T1f + T1g;
}
Rm[WS(rs, 3)] = Tm - TN;
Im[WS(rs, 3)] = T1i - T1o;
Rp[0] = Tm + TN;
Ip[0] = T1i + T1o;
Rm[WS(rs, 1)] = T1e - T1h;
Im[WS(rs, 1)] = T1p - T1q;
Rp[WS(rs, 2)] = T1e + T1h;
Ip[WS(rs, 2)] = T1p + T1q;
}
}
}
}
/*
 * Registration data for the FMA variant of hc2cf_8.
 *
 * twinstr: twiddle-instruction list handed to the planner through desc;
 * it holds one TW_FULL entry with last field 8, closed by a TW_NEXT entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined
 * outside this view -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor.  The numbers { 44, 14, 22, 0 } repeat the
 * "44 additions, 14 multiplications, 22 fused multiply/add" counts from
 * the generator comment of this variant; the trailing 0 is presumably an
 * "other operations" count -- confirm.
 */
static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, { 44, 14, 22, 0 } };
/* Registers hc2cf_8 and its descriptor with planner p, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cf_8) (planner *p) {
X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include rdft/scalar/hc2cf.h */
/*
* This function contains 66 FP additions, 32 FP multiplications,
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
* 28 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cf_8 (non-FMA variant): generated size-8 twiddle kernel
 * (genfft options "-n 8 -dit", see the "Generated by" comment above).
 *
 * For every m in [mb, me) the kernel loads four elements (indices 0..3,
 * addressed through WS(rs, k)) from each of Rp, Ip, Rm and Im, combines
 * them with 14 twiddle values W[0..13], and stores 16 results back into
 * the same four arrays.  All inputs are loaded into locals before the
 * first store, so the update is done in place.
 *
 * Parameters:
 *   Rp, Ip  - arrays read and overwritten; advanced by +ms per iteration.
 *   Rm, Im  - arrays read and overwritten; stepped by -ms per iteration.
 *   W       - twiddle table; starts at W + (mb - 1) * 14 and advances by
 *             14 entries per iteration.
 *   rs      - stride object passed to WS(rs, k) to locate element k.
 *   mb, me  - half-open iteration range.
 *   ms      - per-iteration pointer step.
 *
 * NOTE(review): comments below read FMA(a,b,c) as a*b+c and FNMS(a,b,c)
 * as c-a*b; these macros are defined outside this view -- confirm.
 */
static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.7071... = sqrt(2)/2, the only non-trivial constant of this kernel. */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
E TP;
{
/* (Rp, Rm) element 0 (no twiddle) paired with element 2 combined
   with (W[6], W[7]); keeps both sums and differences. */
E T1, T18, T6, T17;
T1 = Rp[0];
T18 = Rm[0];
{
E T3, T5, T2, T4;
T3 = Rp[WS(rs, 2)];
T5 = Rm[WS(rs, 2)];
T2 = W[6];
T4 = W[7];
T6 = FMA(T2, T3, T4 * T5);
T17 = FNMS(T4, T3, T2 * T5);
}
T7 = T1 + T6;
T1e = T18 - T17;
TH = T1 - T6;
T19 = T17 + T18;
}
{
/* (Ip, Im) elements 3 and 1 combined with (W[12], W[13]) and
   (W[4], W[5]); keeps both sums and differences. */
E Tz, TS, TE, TT;
{
E Tw, Ty, Tv, Tx;
Tw = Ip[WS(rs, 3)];
Ty = Im[WS(rs, 3)];
Tv = W[12];
Tx = W[13];
Tz = FMA(Tv, Tw, Tx * Ty);
TS = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = Ip[WS(rs, 1)];
TD = Im[WS(rs, 1)];
TA = W[4];
TC = W[5];
TE = FMA(TA, TB, TC * TD);
TT = FNMS(TC, TB, TA * TD);
}
TF = Tz + TE;
T13 = TS + TT;
TR = Tz - TE;
TU = TS - TT;
}
{
/* (Rp, Rm) elements 1 and 3 combined with (W[2], W[3]) and
   (W[10], W[11]); keeps both sums and differences. */
E Tc, TI, Th, TJ;
{
E T9, Tb, T8, Ta;
T9 = Rp[WS(rs, 1)];
Tb = Rm[WS(rs, 1)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
TI = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = Rp[WS(rs, 3)];
Tg = Rm[WS(rs, 3)];
Td = W[10];
Tf = W[11];
Th = FMA(Td, Te, Tf * Tg);
TJ = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc + Th;
T1f = Tc - Th;
TK = TI - TJ;
T16 = TI + TJ;
}
{
/* (Ip, Im) elements 0 and 2 combined with (W[0], W[1]) and
   (W[8], W[9]); keeps both sums and differences. */
E To, TN, Tt, TO;
{
E Tl, Tn, Tk, Tm;
Tl = Ip[0];
Tn = Im[0];
Tk = W[0];
Tm = W[1];
To = FMA(Tk, Tl, Tm * Tn);
TN = FNMS(Tm, Tl, Tk * Tn);
}
{
E Tq, Ts, Tp, Tr;
Tq = Ip[WS(rs, 2)];
Ts = Im[WS(rs, 2)];
Tp = W[8];
Tr = W[9];
Tt = FMA(Tp, Tq, Tr * Ts);
TO = FNMS(Tr, Tq, Tp * Ts);
}
Tu = To + Tt;
T12 = TN + TO;
TM = To - Tt;
TP = TN - TO;
}
{
/* Output stage: all inputs are in locals by now, so the stores below
   overwrite the arrays in place. */
E Tj, TG, T1b, T1c;
Tj = T7 + Ti;
TG = Tu + TF;
Rm[WS(rs, 3)] = Tj - TG;
Rp[0] = Tj + TG;
{
E T15, T1a, T11, T14;
T15 = T12 + T13;
T1a = T16 + T19;
Im[WS(rs, 3)] = T15 - T1a;
Ip[0] = T15 + T1a;
T11 = T7 - Ti;
T14 = T12 - T13;
Rm[WS(rs, 1)] = T11 - T14;
Rp[WS(rs, 2)] = T11 + T14;
}
T1b = TF - Tu;
T1c = T19 - T16;
Im[WS(rs, 1)] = T1b - T1c;
Ip[WS(rs, 2)] = T1b + T1c;
{
/* Results that carry the sqrt(2)/2 factor, part 1. */
E TX, T1g, T10, T1d, TY, TZ;
TX = TH - TK;
T1g = T1e - T1f;
TY = TP - TM;
TZ = TR + TU;
T10 = KP707106781 * (TY - TZ);
T1d = KP707106781 * (TY + TZ);
Rm[0] = TX - T10;
Ip[WS(rs, 1)] = T1d + T1g;
Rp[WS(rs, 3)] = TX + T10;
Im[WS(rs, 2)] = T1d - T1g;
}
{
/* Results that carry the sqrt(2)/2 factor, part 2. */
E TL, T1i, TW, T1h, TQ, TV;
TL = TH + TK;
T1i = T1f + T1e;
TQ = TM + TP;
TV = TR - TU;
TW = KP707106781 * (TQ + TV);
T1h = KP707106781 * (TV - TQ);
Rm[WS(rs, 2)] = TL - TW;
Ip[WS(rs, 3)] = T1h + T1i;
Rp[WS(rs, 1)] = TL + TW;
Im[0] = T1h - T1i;
}
}
}
}
}
/*
 * Registration data for the non-FMA variant of hc2cf_8.
 *
 * twinstr: twiddle-instruction list handed to the planner through desc;
 * it holds one TW_FULL entry with last field 8, closed by a TW_NEXT entry.
 * NOTE(review): the exact meaning of the tw_instr fields is defined
 * outside this view -- confirm against the tw_instr declaration.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor.  The numbers { 52, 18, 14, 0 } repeat the
 * "52 additions, 18 multiplications, 14 fused multiply/add" counts from
 * the generator comment of this variant; the trailing 0 is presumably an
 * "other operations" count -- confirm.
 */
static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, { 52, 18, 14, 0 } };
/* Registers hc2cf_8 and its descriptor with planner p, tagged HC2C_VIA_RDFT. */
void X(codelet_hc2cf_8) (planner *p) {
X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
}
#endif

View File

@@ -0,0 +1,937 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:38 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include rdft/scalar/hc2cf.h */
/*
* This function contains 228 FP additions, 166 FP multiplications,
* (or, 136 additions, 74 multiplications, 92 fused multiply/add),
* 91 stack variables, 4 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft2_16 (FMA-preferring variant): generated size-16 kernel
 * (genfft options "-twiddle-log3 -precompute-twiddles -n 16 -dit", see
 * the "Generated by" comment above).
 *
 * For every m in [mb, me) the kernel loads eight elements (indices 0..7,
 * addressed through WS(rs, k)) from each of Rp, Ip, Rm and Im and stores
 * 32 results, each scaled by 0.5, back into the same arrays.  Only eight
 * twiddle values W[0..7] are read per iteration; every other multiplier
 * is derived from them by products at the top of the loop body.  All
 * inputs are loaded into locals before the first store.
 *
 * Parameters:
 *   Rp, Ip  - arrays read and overwritten; advanced by +ms per iteration.
 *   Rm, Im  - arrays read and overwritten; stepped by -ms per iteration.
 *   W       - twiddle table; starts at W + (mb - 1) * 8 and advances by
 *             8 entries per iteration.
 *   rs      - stride object passed to WS(rs, k) to locate element k.
 *   mb, me  - half-open iteration range.
 *   ms      - per-iteration pointer step.
 *
 * NOTE(review): comments below read FMA(a,b,c) as a*b+c and FNMS(a,b,c)
 * as c-a*b; these macros are defined outside this view -- confirm.
 */
static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
E T1, T2, Tw, Ty, Th, Tj, T4, T5, TY, T6, Tk, T1o, T1d, Tz, T1j;
E Tq, TF, T18, TR, TL, T13, T1A, T1K, T1E, T1H, Tc, T25, T2k, T29, T2h;
{
/* Load the eight stored twiddle values W[0..7] and derive the
   remaining multipliers from pairwise products of them. */
E Tx, TE, Ti, TK, Tp, TQ, Tb, T3;
T1 = W[0];
T2 = W[2];
T3 = T1 * T2;
Tw = W[6];
Tx = T1 * Tw;
Ty = W[7];
TE = T1 * Ty;
Th = W[4];
Ti = T1 * Th;
TK = T2 * Th;
Tj = W[5];
Tp = T1 * Tj;
TQ = T2 * Tj;
T4 = W[1];
T5 = W[3];
Tb = T1 * T5;
TY = FNMS(T4, T5, T3);
T6 = FMA(T4, T5, T3);
Tk = FNMS(T4, Tj, Ti);
T1o = FNMS(T4, Th, Tp);
T1d = FMA(T5, Th, TQ);
Tz = FMA(T4, Ty, Tx);
T1j = FMA(T4, Tj, Ti);
Tq = FMA(T4, Th, Tp);
TF = FNMS(T4, Tw, TE);
T18 = FNMS(T5, Tj, TK);
TR = FNMS(T5, Th, TQ);
TL = FMA(T5, Tj, TK);
{
/* Second-level products built from the derived pairs
   (TY, T13) and (T6, Tc) with (Th, Tj). */
E T1z, T1D, T24, T28;
T1z = TY * Th;
T1D = TY * Tj;
T13 = FMA(T4, T2, Tb);
T1A = FMA(T13, Tj, T1z);
T1K = FMA(T13, Th, T1D);
T1E = FNMS(T13, Th, T1D);
T1H = FNMS(T13, Tj, T1z);
T24 = T6 * Th;
T28 = T6 * Tj;
Tc = FNMS(T4, T2, Tb);
T25 = FNMS(Tc, Tj, T24);
T2k = FNMS(Tc, Th, T28);
T29 = FMA(Tc, Th, T28);
T2h = FMA(Tc, Tj, T24);
}
}
{
E T1v, T2q, T1s, T2s, T38, T3T, T1Y, T3P, T17, T1h, T2x, T2v, T33, T3Q, T1N;
E T3S, Tg, Tu, T3A, T2B, T2D, T3B, T2c, T3L, T2S, T3I, TJ, TV, T3E, T2G;
E T2I, T3D, T2n, T3J, T2X, T3M;
{
/* Elements 0 and 4: form Ip+-Im and Rp+-Rm sums/differences,
   then apply the twiddle multipliers. */
E T1t, T1u, T1W, T1m, T1Q, T1S, T1T, T1V, T36, T1r, T34, T1P, T1k, T1l, T1n;
E T2r;
T1t = Ip[0];
T1u = Im[0];
T1W = T1t + T1u;
T1k = Ip[WS(rs, 4)];
T1l = Im[WS(rs, 4)];
T1m = T1k - T1l;
T1Q = T1k + T1l;
{
E T1U, T1p, T1q, T1O;
T1S = Rm[0];
T1T = Rp[0];
T1U = T1S - T1T;
T1V = T1 * T1U;
T36 = T4 * T1U;
T1p = Rp[WS(rs, 4)];
T1q = Rm[WS(rs, 4)];
T1O = T1q - T1p;
T1r = T1p + T1q;
T34 = Tj * T1O;
T1P = Th * T1O;
}
T1v = T1t - T1u;
T2q = T1T + T1S;
T1n = T1j * T1m;
T1s = FNMS(T1o, T1r, T1n);
T2r = T1j * T1r;
T2s = FMA(T1o, T1m, T2r);
{
E T35, T37, T1R, T1X;
T35 = FMA(Th, T1Q, T34);
T37 = FMA(T1, T1W, T36);
T38 = T35 + T37;
T3T = T37 - T35;
T1R = FNMS(Tj, T1Q, T1P);
T1X = FNMS(T4, T1W, T1V);
T1Y = T1R + T1X;
T3P = T1X - T1R;
}
}
{
/* Elements 2 and 6. */
E T11, T1F, T16, T2Z, T1C, T1b, T1L, T1g, T31, T1J;
{
E TZ, T10, T14, T15, T1B;
TZ = Ip[WS(rs, 2)];
T10 = Im[WS(rs, 2)];
T11 = TZ - T10;
T1F = TZ + T10;
T14 = Rp[WS(rs, 2)];
T15 = Rm[WS(rs, 2)];
T1B = T15 - T14;
T16 = T14 + T15;
T2Z = T1E * T1B;
T1C = T1A * T1B;
}
{
E T19, T1a, T1e, T1f, T1I;
T19 = Ip[WS(rs, 6)];
T1a = Im[WS(rs, 6)];
T1b = T19 - T1a;
T1L = T19 + T1a;
T1e = Rp[WS(rs, 6)];
T1f = Rm[WS(rs, 6)];
T1I = T1f - T1e;
T1g = T1e + T1f;
T31 = T1K * T1I;
T1J = T1H * T1I;
}
{
E T12, T1c, T2w, T2u;
T12 = TY * T11;
T17 = FNMS(T13, T16, T12);
T1c = T18 * T1b;
T1h = FNMS(T1d, T1g, T1c);
T2w = T18 * T1g;
T2x = FMA(T1d, T1b, T2w);
T2u = TY * T16;
T2v = FMA(T13, T11, T2u);
{
E T30, T32, T1G, T1M;
T30 = FMA(T1A, T1F, T2Z);
T32 = FMA(T1H, T1L, T31);
T33 = T30 + T32;
T3Q = T30 - T32;
T1G = FNMS(T1E, T1F, T1C);
T1M = FNMS(T1K, T1L, T1J);
T1N = T1G + T1M;
T3S = T1G - T1M;
}
}
}
{
/* Elements 1 and 5. */
E T9, T22, Ta, T2O, Tf, T20, T21, T2A, Tn, T2a, To, T2Q, Tt, T26, T27;
E T2C;
{
E T7, T8, Td, Te;
T7 = Ip[WS(rs, 1)];
T8 = Im[WS(rs, 1)];
T9 = T7 - T8;
T22 = T7 + T8;
Ta = T6 * T9;
T2O = T2 * T22;
Td = Rp[WS(rs, 1)];
Te = Rm[WS(rs, 1)];
Tf = Td + Te;
T20 = Td - Te;
T21 = T2 * T20;
T2A = T6 * Tf;
}
{
E Tl, Tm, Tr, Ts;
Tl = Ip[WS(rs, 5)];
Tm = Im[WS(rs, 5)];
Tn = Tl - Tm;
T2a = Tl + Tm;
To = Tk * Tn;
T2Q = T25 * T2a;
Tr = Rp[WS(rs, 5)];
Ts = Rm[WS(rs, 5)];
Tt = Tr + Ts;
T26 = Tr - Ts;
T27 = T25 * T26;
T2C = Tk * Tt;
}
Tg = FNMS(Tc, Tf, Ta);
Tu = FNMS(Tq, Tt, To);
T3A = Tg - Tu;
T2B = FMA(Tc, T9, T2A);
T2D = FMA(Tq, Tn, T2C);
T3B = T2B - T2D;
{
E T23, T2b, T2P, T2R;
T23 = FMA(T5, T22, T21);
T2b = FMA(T29, T2a, T27);
T2c = T23 + T2b;
T3L = T2b - T23;
T2P = FNMS(T5, T20, T2O);
T2R = FNMS(T29, T26, T2Q);
T2S = T2P + T2R;
T3I = T2R - T2P;
}
}
{
/* Elements 7 and 3. */
E TC, T2f, TD, T2T, TI, T2d, T2e, T2F, TO, T2l, TP, T2V, TU, T2i, T2j;
E T2H;
{
E TA, TB, TG, TH;
TA = Ip[WS(rs, 7)];
TB = Im[WS(rs, 7)];
TC = TA - TB;
T2f = TA + TB;
TD = Tz * TC;
T2T = Tw * T2f;
TG = Rp[WS(rs, 7)];
TH = Rm[WS(rs, 7)];
TI = TG + TH;
T2d = TG - TH;
T2e = Tw * T2d;
T2F = Tz * TI;
}
{
E TM, TN, TS, TT;
TM = Ip[WS(rs, 3)];
TN = Im[WS(rs, 3)];
TO = TM - TN;
T2l = TM + TN;
TP = TL * TO;
T2V = T2h * T2l;
TS = Rp[WS(rs, 3)];
TT = Rm[WS(rs, 3)];
TU = TS + TT;
T2i = TS - TT;
T2j = T2h * T2i;
T2H = TL * TU;
}
TJ = FNMS(TF, TI, TD);
TV = FNMS(TR, TU, TP);
T3E = TJ - TV;
T2G = FMA(TF, TC, T2F);
T2I = FMA(TR, TO, T2H);
T3D = T2G - T2I;
{
E T2g, T2m, T2U, T2W;
T2g = FMA(Ty, T2f, T2e);
T2m = FMA(T2k, T2l, T2j);
T2n = T2g + T2m;
T3J = T2m - T2g;
T2U = FNMS(Ty, T2d, T2T);
T2W = FNMS(T2k, T2i, T2V);
T2X = T2U + T2W;
T3M = T2U - T2W;
}
}
{
/* First output half: stores to the even-index Rp/Ip elements
   (0, 2, 4, 6) and odd-index Rm/Im elements (1, 3, 5, 7). */
E TX, T3o, T3i, T3s, T3l, T3t, T1x, T3e, T2p, T2M, T2K, T3d, T3a, T3c, T2z;
E T3n;
{
E Tv, TW, T3g, T3h;
Tv = Tg + Tu;
TW = TJ + TV;
TX = Tv + TW;
T3o = Tv - TW;
T3g = T2X - T2S;
T3h = T2c - T2n;
T3i = T3g + T3h;
T3s = T3g - T3h;
}
{
E T3j, T3k, T1i, T1w;
T3j = T1Y - T1N;
T3k = T38 - T33;
T3l = T3j - T3k;
T3t = T3j + T3k;
T1i = T17 + T1h;
T1w = T1s + T1v;
T1x = T1i + T1w;
T3e = T1w - T1i;
}
{
E T1Z, T2o, T2E, T2J;
T1Z = T1N + T1Y;
T2o = T2c + T2n;
T2p = T1Z - T2o;
T2M = T2o + T1Z;
T2E = T2B + T2D;
T2J = T2G + T2I;
T2K = T2E + T2J;
T3d = T2J - T2E;
}
{
E T2Y, T39, T2t, T2y;
T2Y = T2S + T2X;
T39 = T33 + T38;
T3a = T2Y - T39;
T3c = T2Y + T39;
T2t = T2q + T2s;
T2y = T2v + T2x;
T2z = T2t + T2y;
T3n = T2t - T2y;
}
{
E T1y, T3b, T2L, T2N;
T1y = TX + T1x;
Ip[0] = KP500000000 * (T1y + T2p);
Im[WS(rs, 7)] = KP500000000 * (T2p - T1y);
T3b = T2z + T2K;
Rm[WS(rs, 7)] = KP500000000 * (T3b - T3c);
Rp[0] = KP500000000 * (T3b + T3c);
T2L = T2z - T2K;
Rm[WS(rs, 3)] = KP500000000 * (T2L - T2M);
Rp[WS(rs, 4)] = KP500000000 * (T2L + T2M);
T2N = T1x - TX;
Ip[WS(rs, 4)] = KP500000000 * (T2N + T3a);
Im[WS(rs, 3)] = KP500000000 * (T3a - T2N);
}
{
E T3f, T3m, T3v, T3w;
T3f = T3d + T3e;
T3m = T3i + T3l;
Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3m, T3f));
Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3m, T3f)));
T3v = T3n + T3o;
T3w = T3s + T3t;
Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3w, T3v));
Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3w, T3v));
}
{
E T3p, T3q, T3r, T3u;
T3p = T3n - T3o;
T3q = T3l - T3i;
Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3q, T3p));
Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3q, T3p));
T3r = T3e - T3d;
T3u = T3s - T3t;
Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3u, T3r));
Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3u, T3r)));
}
}
{
/* Second output half: stores to the odd-index Rp/Ip elements
   (1, 3, 5, 7) and even-index Rm/Im elements (0, 2, 4, 6); these
   use the KP414213562 and KP923879532 constants as well. */
E T3z, T4b, T4g, T4q, T4j, T4r, T3G, T4m, T3O, T46, T3Z, T4l, T42, T4c, T3V;
E T47;
{
E T3x, T3y, T4e, T4f;
T3x = T1v - T1s;
T3y = T2v - T2x;
T3z = T3x - T3y;
T4b = T3y + T3x;
T4e = T3I - T3J;
T4f = T3M - T3L;
T4g = FMA(KP414213562, T4f, T4e);
T4q = FNMS(KP414213562, T4e, T4f);
}
{
E T4h, T4i, T3C, T3F;
T4h = T3Q + T3P;
T4i = T3T - T3S;
T4j = FMA(KP414213562, T4i, T4h);
T4r = FNMS(KP414213562, T4h, T4i);
T3C = T3A - T3B;
T3F = T3D + T3E;
T3G = T3C + T3F;
T4m = T3C - T3F;
}
{
E T3K, T3N, T3X, T3Y;
T3K = T3I + T3J;
T3N = T3L + T3M;
T3O = FMA(KP414213562, T3N, T3K);
T46 = FNMS(KP414213562, T3K, T3N);
T3X = T2q - T2s;
T3Y = T17 - T1h;
T3Z = T3X + T3Y;
T4l = T3X - T3Y;
}
{
E T40, T41, T3R, T3U;
T40 = T3B + T3A;
T41 = T3D - T3E;
T42 = T40 + T41;
T4c = T41 - T40;
T3R = T3P - T3Q;
T3U = T3S + T3T;
T3V = FNMS(KP414213562, T3U, T3R);
T47 = FMA(KP414213562, T3R, T3U);
}
{
E T3H, T3W, T49, T4a;
T3H = FMA(KP707106781, T3G, T3z);
T3W = T3O + T3V;
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3W, T3H));
Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3H)));
T49 = FMA(KP707106781, T42, T3Z);
T4a = T46 + T47;
Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T4a, T49));
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T4a, T49));
}
{
E T43, T44, T45, T48;
T43 = FNMS(KP707106781, T42, T3Z);
T44 = T3V - T3O;
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T44, T43));
Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T44, T43));
T45 = FNMS(KP707106781, T3G, T3z);
T48 = T46 - T47;
Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T48, T45));
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T48, T45)));
}
{
E T4d, T4k, T4t, T4u;
T4d = FNMS(KP707106781, T4c, T4b);
T4k = T4g - T4j;
Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T4k, T4d));
Im[0] = -(KP500000000 * (FNMS(KP923879532, T4k, T4d)));
T4t = FNMS(KP707106781, T4m, T4l);
T4u = T4q + T4r;
Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4u, T4t));
Rm[0] = KP500000000 * (FMA(KP923879532, T4u, T4t));
}
{
E T4n, T4o, T4p, T4s;
T4n = FMA(KP707106781, T4m, T4l);
T4o = T4g + T4j;
Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4o, T4n));
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4o, T4n));
T4p = FMA(KP707106781, T4c, T4b);
T4s = T4q - T4r;
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4s, T4p));
Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4s, T4p)));
}
}
}
}
}
}
/*
 * Registration data for the FMA variant of hc2cfdft2_16.
 *
 * twinstr: twiddle-instruction list handed to the planner through desc;
 * it holds four TW_CEXP entries with last fields 1, 3, 9 and 15, closed
 * by a TW_NEXT entry.  Four entries match the eight W values the kernel
 * reads per iteration.
 * NOTE(review): the last field is presumably the twiddle exponent;
 * the tw_instr field meanings are defined outside this view -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor.  The numbers { 136, 74, 92, 0 } repeat the
 * "136 additions, 74 multiplications, 92 fused multiply/add" counts from
 * the generator comment of this variant; the trailing 0 is presumably an
 * "other operations" count -- confirm.
 */
static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, { 136, 74, 92, 0 } };
/* Registers hc2cfdft2_16 and its descriptor with planner p, tagged HC2C_VIA_DFT. */
void X(codelet_hc2cfdft2_16) (planner *p) {
X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include rdft/scalar/hc2cf.h */
/*
* This function contains 228 FP additions, 124 FP multiplications,
* (or, 188 additions, 84 multiplications, 40 fused multiply/add),
* 91 stack variables, 4 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft2_16 (non-FMA variant): generated size-16 kernel
 * (genfft options "-twiddle-log3 -precompute-twiddles -n 16 -dit", see
 * the "Generated by" comment above).
 *
 * For every m in [mb, me) the kernel loads eight elements (indices 0..7,
 * addressed through WS(rs, k)) from each of Rp, Ip, Rm and Im and stores
 * 32 results back into the same arrays.  Only eight twiddle values
 * W[0..7] are read per iteration; every other multiplier is derived from
 * them by products at the top of the loop body.  All inputs are loaded
 * into locals before the first store.
 *
 * Parameters:
 *   Rp, Ip  - arrays read and overwritten; advanced by +ms per iteration.
 *   Rm, Im  - arrays read and overwritten; stepped by -ms per iteration.
 *   W       - twiddle table; starts at W + (mb - 1) * 8 and advances by
 *             8 entries per iteration.
 *   rs      - stride object passed to WS(rs, k) to locate element k.
 *   mb, me  - half-open iteration range.
 *   ms      - per-iteration pointer step.
 *
 * NOTE(review): comments below read FMA(a,b,c) as a*b+c and FNMS(a,b,c)
 * as c-a*b; these macros are defined outside this view -- confirm.
 */
static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically, 0.3535... = 0.5 * 0.7071..., so the 0.5 output scale is
   already folded into KP353553390. */
DK(KP461939766, +0.461939766255643378064091594698394143411208313);
DK(KP191341716, +0.191341716182544885864229992015199433380672281);
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
E T1, T4, T2, T5, T7, Td, T12, TY, Tk, Ti, Tm, T1l, T1b, TL, T1h;
E Ts, TR, T17, Ty, Tz, TA, TE, T1L, T1Q, T1H, T1O, T24, T2d, T20, T2b;
{
/* Load the eight stored twiddle values W[0..7] and derive the
   remaining multipliers from pairwise products of them. */
E Tl, TP, Tq, TK, Tj, TQ, Tr, TJ;
{
E T3, Tc, T6, Tb;
T1 = W[0];
T4 = W[1];
T2 = W[2];
T5 = W[3];
T3 = T1 * T2;
Tc = T4 * T2;
T6 = T4 * T5;
Tb = T1 * T5;
T7 = T3 + T6;
Td = Tb - Tc;
T12 = Tb + Tc;
TY = T3 - T6;
Tk = W[5];
Tl = T4 * Tk;
TP = T2 * Tk;
Tq = T1 * Tk;
TK = T5 * Tk;
Ti = W[4];
Tj = T1 * Ti;
TQ = T5 * Ti;
Tr = T4 * Ti;
TJ = T2 * Ti;
}
Tm = Tj - Tl;
T1l = Tq - Tr;
T1b = TP + TQ;
TL = TJ + TK;
T1h = Tj + Tl;
Ts = Tq + Tr;
TR = TP - TQ;
T17 = TJ - TK;
Ty = W[6];
Tz = W[7];
TA = FMA(T1, Ty, T4 * Tz);
TE = FNMS(T4, Ty, T1 * Tz);
{
/* Second-level products of (TY, T12) with (Ti, Tk). */
E T1J, T1K, T1F, T1G;
T1J = TY * Tk;
T1K = T12 * Ti;
T1L = T1J - T1K;
T1Q = T1J + T1K;
T1F = TY * Ti;
T1G = T12 * Tk;
T1H = T1F + T1G;
T1O = T1F - T1G;
}
{
/* Second-level products of (T7, Td) with (Ti, Tk). */
E T22, T23, T1Y, T1Z;
T22 = T7 * Tk;
T23 = Td * Ti;
T24 = T22 + T23;
T2d = T22 - T23;
T1Y = T7 * Ti;
T1Z = Td * Tk;
T20 = T1Y - T1Z;
T2b = T1Y + T1Z;
}
}
{
E T1t, T3i, T2l, T3B, T1E, T3t, T2M, T3x, T1g, T3C, T2J, T3u, T1T, T3w, T2o;
E T3j, Tx, T3b, T2C, T3q, T27, T3m, T2s, T3c, TW, T3f, T2F, T3n, T2g, T3p;
E T2v, T3e;
{
/* Elements 4 and 0: form Ip+-Im and Rp+-Rm sums/differences,
   then apply the twiddle multipliers. */
E T1k, T1C, T1o, T1B, T1s, T1z, T1y, T2j, T1p, T2k;
{
E T1i, T1j, T1m, T1n;
T1i = Ip[WS(rs, 4)];
T1j = Im[WS(rs, 4)];
T1k = T1i - T1j;
T1C = T1i + T1j;
T1m = Rp[WS(rs, 4)];
T1n = Rm[WS(rs, 4)];
T1o = T1m + T1n;
T1B = T1m - T1n;
}
{
E T1q, T1r, T1w, T1x;
T1q = Ip[0];
T1r = Im[0];
T1s = T1q - T1r;
T1z = T1q + T1r;
T1w = Rm[0];
T1x = Rp[0];
T1y = T1w - T1x;
T2j = T1x + T1w;
}
T1p = FNMS(T1l, T1o, T1h * T1k);
T1t = T1p + T1s;
T3i = T1s - T1p;
T2k = FMA(T1h, T1o, T1l * T1k);
T2l = T2j + T2k;
T3B = T2j - T2k;
{
E T1A, T1D, T2K, T2L;
T1A = FNMS(T4, T1z, T1 * T1y);
T1D = FMA(Ti, T1B, Tk * T1C);
T1E = T1A - T1D;
T3t = T1D + T1A;
T2K = FNMS(Tk, T1B, Ti * T1C);
T2L = FMA(T4, T1y, T1 * T1z);
T2M = T2K + T2L;
T3x = T2L - T2K;
}
}
{
/* Elements 2 and 6. */
E T11, T1M, T15, T1I, T1a, T1R, T1e, T1P;
{
E TZ, T10, T13, T14;
TZ = Ip[WS(rs, 2)];
T10 = Im[WS(rs, 2)];
T11 = TZ - T10;
T1M = TZ + T10;
T13 = Rp[WS(rs, 2)];
T14 = Rm[WS(rs, 2)];
T15 = T13 + T14;
T1I = T13 - T14;
}
{
E T18, T19, T1c, T1d;
T18 = Ip[WS(rs, 6)];
T19 = Im[WS(rs, 6)];
T1a = T18 - T19;
T1R = T18 + T19;
T1c = Rp[WS(rs, 6)];
T1d = Rm[WS(rs, 6)];
T1e = T1c + T1d;
T1P = T1c - T1d;
}
{
E T16, T1f, T2H, T2I;
T16 = FNMS(T12, T15, TY * T11);
T1f = FNMS(T1b, T1e, T17 * T1a);
T1g = T16 + T1f;
T3C = T16 - T1f;
T2H = FNMS(T1L, T1I, T1H * T1M);
T2I = FNMS(T1Q, T1P, T1O * T1R);
T2J = T2H + T2I;
T3u = T2H - T2I;
}
{
E T1N, T1S, T2m, T2n;
T1N = FMA(T1H, T1I, T1L * T1M);
T1S = FMA(T1O, T1P, T1Q * T1R);
T1T = T1N + T1S;
T3w = T1S - T1N;
T2m = FMA(TY, T15, T12 * T11);
T2n = FMA(T17, T1e, T1b * T1a);
T2o = T2m + T2n;
T3j = T2m - T2n;
}
}
{
/* Elements 1 and 5. */
E Ta, T1W, Tg, T1V, Tp, T25, Tv, T21;
{
E T8, T9, Te, Tf;
T8 = Ip[WS(rs, 1)];
T9 = Im[WS(rs, 1)];
Ta = T8 - T9;
T1W = T8 + T9;
Te = Rp[WS(rs, 1)];
Tf = Rm[WS(rs, 1)];
Tg = Te + Tf;
T1V = Te - Tf;
}
{
E Tn, To, Tt, Tu;
Tn = Ip[WS(rs, 5)];
To = Im[WS(rs, 5)];
Tp = Tn - To;
T25 = Tn + To;
Tt = Rp[WS(rs, 5)];
Tu = Rm[WS(rs, 5)];
Tv = Tt + Tu;
T21 = Tt - Tu;
}
{
E Th, Tw, T2A, T2B;
Th = FNMS(Td, Tg, T7 * Ta);
Tw = FNMS(Ts, Tv, Tm * Tp);
Tx = Th + Tw;
T3b = Th - Tw;
T2A = FNMS(T5, T1V, T2 * T1W);
T2B = FNMS(T24, T21, T20 * T25);
T2C = T2A + T2B;
T3q = T2A - T2B;
}
{
E T1X, T26, T2q, T2r;
T1X = FMA(T2, T1V, T5 * T1W);
T26 = FMA(T20, T21, T24 * T25);
T27 = T1X + T26;
T3m = T26 - T1X;
T2q = FMA(T7, Tg, Td * Ta);
T2r = FMA(Tm, Tv, Ts * Tp);
T2s = T2q + T2r;
T3c = T2q - T2r;
}
}
{
/* Elements 7 and 3. */
E TD, T29, TH, T28, TO, T2e, TU, T2c;
{
E TB, TC, TF, TG;
TB = Ip[WS(rs, 7)];
TC = Im[WS(rs, 7)];
TD = TB - TC;
T29 = TB + TC;
TF = Rp[WS(rs, 7)];
TG = Rm[WS(rs, 7)];
TH = TF + TG;
T28 = TF - TG;
}
{
E TM, TN, TS, TT;
TM = Ip[WS(rs, 3)];
TN = Im[WS(rs, 3)];
TO = TM - TN;
T2e = TM + TN;
TS = Rp[WS(rs, 3)];
TT = Rm[WS(rs, 3)];
TU = TS + TT;
T2c = TS - TT;
}
{
E TI, TV, T2D, T2E;
TI = FNMS(TE, TH, TA * TD);
TV = FNMS(TR, TU, TL * TO);
TW = TI + TV;
T3f = TI - TV;
T2D = FNMS(Tz, T28, Ty * T29);
T2E = FNMS(T2d, T2c, T2b * T2e);
T2F = T2D + T2E;
T3n = T2D - T2E;
}
{
E T2a, T2f, T2t, T2u;
T2a = FMA(Ty, T28, Tz * T29);
T2f = FMA(T2b, T2c, T2d * T2e);
T2g = T2a + T2f;
T3p = T2f - T2a;
T2t = FMA(TA, TH, TE * TD);
T2u = FMA(TL, TU, TR * TO);
T2v = T2t + T2u;
T3e = T2t - T2u;
}
}
{
/* Outputs at Rp/Ip elements 0 and 4 and Rm/Im elements 3 and 7:
   plain sums/differences scaled by 0.5. */
E T1v, T2z, T2O, T2Q, T2i, T2y, T2x, T2P;
{
E TX, T1u, T2G, T2N;
TX = Tx + TW;
T1u = T1g + T1t;
T1v = TX + T1u;
T2z = T1u - TX;
T2G = T2C + T2F;
T2N = T2J + T2M;
T2O = T2G - T2N;
T2Q = T2G + T2N;
}
{
E T1U, T2h, T2p, T2w;
T1U = T1E - T1T;
T2h = T27 + T2g;
T2i = T1U - T2h;
T2y = T2h + T1U;
T2p = T2l + T2o;
T2w = T2s + T2v;
T2x = T2p - T2w;
T2P = T2p + T2w;
}
Ip[0] = KP500000000 * (T1v + T2i);
Rp[0] = KP500000000 * (T2P + T2Q);
Im[WS(rs, 7)] = KP500000000 * (T2i - T1v);
Rm[WS(rs, 7)] = KP500000000 * (T2P - T2Q);
Rm[WS(rs, 3)] = KP500000000 * (T2x - T2y);
Im[WS(rs, 3)] = KP500000000 * (T2O - T2z);
Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
Ip[WS(rs, 4)] = KP500000000 * (T2z + T2O);
}
{
/* Outputs at Rp/Ip elements 2 and 6 and Rm/Im elements 1 and 5:
   use the KP353553390 constant. */
E T2T, T35, T33, T39, T2W, T36, T2Z, T37;
{
E T2R, T2S, T31, T32;
T2R = T2v - T2s;
T2S = T1t - T1g;
T2T = KP500000000 * (T2R + T2S);
T35 = KP500000000 * (T2S - T2R);
T31 = T2l - T2o;
T32 = Tx - TW;
T33 = KP500000000 * (T31 - T32);
T39 = KP500000000 * (T31 + T32);
}
{
E T2U, T2V, T2X, T2Y;
T2U = T2F - T2C;
T2V = T27 - T2g;
T2W = T2U + T2V;
T36 = T2U - T2V;
T2X = T1T + T1E;
T2Y = T2M - T2J;
T2Z = T2X - T2Y;
T37 = T2X + T2Y;
}
{
E T30, T3a, T34, T38;
T30 = KP353553390 * (T2W + T2Z);
Ip[WS(rs, 2)] = T2T + T30;
Im[WS(rs, 5)] = T30 - T2T;
T3a = KP353553390 * (T36 + T37);
Rm[WS(rs, 5)] = T39 - T3a;
Rp[WS(rs, 2)] = T39 + T3a;
T34 = KP353553390 * (T2Z - T2W);
Rm[WS(rs, 1)] = T33 - T34;
Rp[WS(rs, 6)] = T33 + T34;
T38 = KP353553390 * (T36 - T37);
Ip[WS(rs, 6)] = T35 + T38;
Im[WS(rs, 1)] = T38 - T35;
}
}
{
/* Outputs at the odd-index Rp/Ip elements (1, 3, 5, 7) and the
   even-index Rm/Im elements (0, 2, 4, 6): use the KP191341716
   and KP461939766 constants as well. */
E T3k, T3Q, T3Z, T3D, T3h, T40, T3X, T45, T3G, T3P, T3s, T3K, T3U, T44, T3z;
E T3L;
{
E T3d, T3g, T3o, T3r;
T3k = KP500000000 * (T3i - T3j);
T3Q = KP500000000 * (T3j + T3i);
T3Z = KP500000000 * (T3B - T3C);
T3D = KP500000000 * (T3B + T3C);
T3d = T3b - T3c;
T3g = T3e + T3f;
T3h = KP353553390 * (T3d + T3g);
T40 = KP353553390 * (T3d - T3g);
{
E T3V, T3W, T3E, T3F;
T3V = T3u + T3t;
T3W = T3x - T3w;
T3X = FNMS(KP461939766, T3W, KP191341716 * T3V);
T45 = FMA(KP461939766, T3V, KP191341716 * T3W);
T3E = T3c + T3b;
T3F = T3e - T3f;
T3G = KP353553390 * (T3E + T3F);
T3P = KP353553390 * (T3F - T3E);
}
T3o = T3m + T3n;
T3r = T3p - T3q;
T3s = FMA(KP191341716, T3o, KP461939766 * T3r);
T3K = FNMS(KP191341716, T3r, KP461939766 * T3o);
{
E T3S, T3T, T3v, T3y;
T3S = T3n - T3m;
T3T = T3q + T3p;
T3U = FMA(KP461939766, T3S, KP191341716 * T3T);
T44 = FNMS(KP461939766, T3T, KP191341716 * T3S);
T3v = T3t - T3u;
T3y = T3w + T3x;
T3z = FNMS(KP191341716, T3y, KP461939766 * T3v);
T3L = FMA(KP191341716, T3v, KP461939766 * T3y);
}
}
{
E T3l, T3A, T3N, T3O;
T3l = T3h + T3k;
T3A = T3s + T3z;
Ip[WS(rs, 1)] = T3l + T3A;
Im[WS(rs, 6)] = T3A - T3l;
T3N = T3D + T3G;
T3O = T3K + T3L;
Rm[WS(rs, 6)] = T3N - T3O;
Rp[WS(rs, 1)] = T3N + T3O;
}
{
E T3H, T3I, T3J, T3M;
T3H = T3D - T3G;
T3I = T3z - T3s;
Rm[WS(rs, 2)] = T3H - T3I;
Rp[WS(rs, 5)] = T3H + T3I;
T3J = T3k - T3h;
T3M = T3K - T3L;
Ip[WS(rs, 5)] = T3J + T3M;
Im[WS(rs, 2)] = T3M - T3J;
}
{
E T3R, T3Y, T47, T48;
T3R = T3P + T3Q;
T3Y = T3U + T3X;
Ip[WS(rs, 3)] = T3R + T3Y;
Im[WS(rs, 4)] = T3Y - T3R;
T47 = T3Z + T40;
T48 = T44 + T45;
Rm[WS(rs, 4)] = T47 - T48;
Rp[WS(rs, 3)] = T47 + T48;
}
{
E T41, T42, T43, T46;
T41 = T3Z - T40;
T42 = T3X - T3U;
Rm[0] = T41 - T42;
Rp[WS(rs, 7)] = T41 + T42;
T43 = T3Q - T3P;
T46 = T44 - T45;
Ip[WS(rs, 7)] = T43 + T46;
Im[0] = T46 - T43;
}
}
}
}
}
}
/*
 * Registration data for the non-FMA variant of hc2cfdft2_16.
 *
 * twinstr: twiddle-instruction list handed to the planner through desc;
 * it holds four TW_CEXP entries with last fields 1, 3, 9 and 15, closed
 * by a TW_NEXT entry.  Four entries match the eight W values the kernel
 * reads per iteration.
 * NOTE(review): the last field is presumably the twiddle exponent;
 * the tw_instr field meanings are defined outside this view -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor.  The numbers { 188, 84, 40, 0 } repeat the
 * "188 additions, 84 multiplications, 40 fused multiply/add" counts from
 * the generator comment of this variant; the trailing 0 is presumably an
 * "other operations" count -- confirm.
 */
static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, { 188, 84, 40, 0 } };
/* Registers hc2cfdft2_16 and its descriptor with planner p, tagged HC2C_VIA_DFT. */
void X(codelet_hc2cfdft2_16) (planner *p) {
X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,221 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:38 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cfdft2_4 -include rdft/scalar/hc2cf.h */
/*
* This function contains 32 FP additions, 24 FP multiplications,
* (or, 24 additions, 16 multiplications, 8 fused multiply/add),
* 37 stack variables, 1 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft2_4 (FMA-preferring variant): generated size-4 kernel
 * (genfft options "-twiddle-log3 -precompute-twiddles -n 4 -dit", see
 * the "Generated by" comment above).
 *
 * For every m in [mb, me) the kernel loads two elements (indices 0 and 1,
 * the latter addressed through WS(rs, 1)) from each of Rp, Ip, Rm and Im,
 * combines them with four twiddle values W[0..3] plus two products
 * derived from them, and stores eight results, each scaled by 0.5, back
 * into the same arrays.  All inputs are loaded before the first store.
 *
 * Parameters:
 *   Rp, Ip  - arrays read and overwritten; advanced by +ms per iteration.
 *   Rm, Im  - arrays read and overwritten; stepped by -ms per iteration.
 *   W       - twiddle table; starts at W + (mb - 1) * 4 and advances by
 *             4 entries per iteration.
 *   rs      - stride object passed to WS(rs, k) to locate element k.
 *   mb, me  - half-open iteration range.
 *   ms      - per-iteration pointer step.
 *
 * NOTE(review): comments below read FMA(a,b,c) as a*b+c and FNMS(a,b,c)
 * as c-a*b; these macros are defined outside this view -- confirm.
 */
static void hc2cfdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
E T1, T5, T2, T4, T6, Tc, T3, Tb;
/* Load W[0..3] and derive T6 = W0*W2 + W1*W3, Tc = W0*W3 - W1*W2. */
T1 = W[0];
T5 = W[3];
T2 = W[2];
T3 = T1 * T2;
Tb = T1 * T5;
T4 = W[1];
T6 = FMA(T4, T5, T3);
Tc = FNMS(T4, T2, Tb);
{
E Tj, Tp, To, TE, Tw, T9, Tt, Ta, TC, Tf, Tr, Ts, Tx;
{
/* Element 0: Ip+-Im and Rm-Rp / Rp+Rm, with the first partial
   products by W[0] and W[1]. */
E Th, Ti, Tl, Tm, Tn;
Th = Ip[0];
Ti = Im[0];
Tj = Th - Ti;
Tp = Th + Ti;
Tl = Rm[0];
Tm = Rp[0];
Tn = Tl - Tm;
To = T1 * Tn;
TE = T4 * Tn;
Tw = Tm + Tl;
}
{
/* Element 1: Ip+-Im and Rp+-Rm, with the first partial products
   by W[2] and the derived multiplier T6. */
E T7, T8, Td, Te;
T7 = Ip[WS(rs, 1)];
T8 = Im[WS(rs, 1)];
T9 = T7 - T8;
Tt = T7 + T8;
Ta = T6 * T9;
TC = T2 * Tt;
Td = Rp[WS(rs, 1)];
Te = Rm[WS(rs, 1)];
Tf = Td + Te;
Tr = Td - Te;
Ts = T2 * Tr;
Tx = T6 * Tf;
}
{
/* Finish the twiddle products, combine, and store the eight
   outputs scaled by 0.5. */
E Tk, TB, Tz, TH, Tv, TA, TG, TI, Tg, Ty;
Tg = FNMS(Tc, Tf, Ta);
Tk = Tg + Tj;
TB = Tj - Tg;
Ty = FMA(Tc, T9, Tx);
Tz = Tw - Ty;
TH = Tw + Ty;
{
E Tq, Tu, TD, TF;
Tq = FNMS(T4, Tp, To);
Tu = FMA(T5, Tt, Ts);
Tv = Tq - Tu;
TA = Tu + Tq;
TD = FNMS(T5, Tr, TC);
TF = FMA(T1, Tp, TE);
TG = TD - TF;
TI = TD + TF;
}
Ip[0] = KP500000000 * (Tk + Tv);
Rp[0] = KP500000000 * (TH + TI);
Im[WS(rs, 1)] = KP500000000 * (Tv - Tk);
Rm[WS(rs, 1)] = KP500000000 * (TH - TI);
Rm[0] = KP500000000 * (Tz - TA);
Im[0] = KP500000000 * (TG - TB);
Rp[WS(rs, 1)] = KP500000000 * (Tz + TA);
Ip[WS(rs, 1)] = KP500000000 * (TB + TG);
}
}
}
}
}
/*
 * Registration data for the FMA variant of hc2cfdft2_4.
 *
 * twinstr: twiddle-instruction list handed to the planner through desc;
 * it holds two TW_CEXP entries with last fields 1 and 3, closed by a
 * TW_NEXT entry.  Two entries match the four W values the kernel reads
 * per iteration.
 * NOTE(review): the last field is presumably the twiddle exponent;
 * the tw_instr field meanings are defined outside this view -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor.  The numbers { 24, 16, 8, 0 } repeat the
 * "24 additions, 16 multiplications, 8 fused multiply/add" counts from
 * the generator comment of this variant; the trailing 0 is presumably an
 * "other operations" count -- confirm.
 */
static const hc2c_desc desc = { 4, "hc2cfdft2_4", twinstr, &GENUS, { 24, 16, 8, 0 } };
/* Registers hc2cfdft2_4 and its descriptor with planner p, tagged HC2C_VIA_DFT. */
void X(codelet_hc2cfdft2_4) (planner *p) {
X(khc2c_register) (p, hc2cfdft2_4, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cfdft2_4 -include rdft/scalar/hc2cf.h */
/*
* This function contains 32 FP additions, 24 FP multiplications,
* (or, 24 additions, 16 multiplications, 8 fused multiply/add),
* 24 stack variables, 1 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft2_4 (non-FMA variant): generated size-4 kernel
 * (genfft options "-twiddle-log3 -precompute-twiddles -n 4 -dit", see
 * the "Generated by" comment above).
 *
 * For every m in [mb, me) the kernel loads two elements (indices 0 and 1,
 * the latter addressed through WS(rs, 1)) from each of Rp, Ip, Rm and Im,
 * combines them with four twiddle values W[0..3] plus two products
 * derived from them, and stores eight results, each scaled by 0.5, back
 * into the same arrays.  All inputs are loaded before the first store.
 *
 * Parameters:
 *   Rp, Ip  - arrays read and overwritten; advanced by +ms per iteration.
 *   Rm, Im  - arrays read and overwritten; stepped by -ms per iteration.
 *   W       - twiddle table; starts at W + (mb - 1) * 4 and advances by
 *             4 entries per iteration.
 *   rs      - stride object passed to WS(rs, k) to locate element k.
 *   mb, me  - half-open iteration range.
 *   ms      - per-iteration pointer step.
 *
 * NOTE(review): comments below read FMA(a,b,c) as a*b+c and FNMS(a,b,c)
 * as c-a*b; these macros are defined outside this view -- confirm.
 */
static void hc2cfdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
E T1, T3, T2, T4, T5, T9;
/* Load W[0..3] and derive T5 = W0*W2 + W1*W3, T9 = W0*W3 - W1*W2. */
T1 = W[0];
T3 = W[1];
T2 = W[2];
T4 = W[3];
T5 = FMA(T1, T2, T3 * T4);
T9 = FNMS(T3, T2, T1 * T4);
{
E Tg, Tr, Tm, Tx, Td, Tw, Tp, Ts;
{
/* Element 0: Ip+-Im and Rm-Rp / Rp+Rm; the (Rm-Rp, Ip+Im) pair
   is combined with (W[0], W[1]). */
E Te, Tf, Tl, Ti, Tj, Tk;
Te = Ip[0];
Tf = Im[0];
Tl = Te + Tf;
Ti = Rm[0];
Tj = Rp[0];
Tk = Ti - Tj;
Tg = Te - Tf;
Tr = Tj + Ti;
Tm = FNMS(T3, Tl, T1 * Tk);
Tx = FMA(T3, Tk, T1 * Tl);
}
{
/* Element 1: Ip+-Im and Rp+-Rm; the sum/difference pairs are
   combined with (W[2], W[3]) and the derived (T5, T9). */
E T8, To, Tc, Tn;
{
E T6, T7, Ta, Tb;
T6 = Ip[WS(rs, 1)];
T7 = Im[WS(rs, 1)];
T8 = T6 - T7;
To = T6 + T7;
Ta = Rp[WS(rs, 1)];
Tb = Rm[WS(rs, 1)];
Tc = Ta + Tb;
Tn = Ta - Tb;
}
Td = FNMS(T9, Tc, T5 * T8);
Tw = FNMS(T4, Tn, T2 * To);
Tp = FMA(T2, Tn, T4 * To);
Ts = FMA(T5, Tc, T9 * T8);
}
{
/* Outputs Ip[0], Im[1], Rm[1], Rp[0], each scaled by 0.5. */
E Th, Tq, Tz, TA;
Th = Td + Tg;
Tq = Tm - Tp;
Ip[0] = KP500000000 * (Th + Tq);
Im[WS(rs, 1)] = KP500000000 * (Tq - Th);
Tz = Tr + Ts;
TA = Tw + Tx;
Rm[WS(rs, 1)] = KP500000000 * (Tz - TA);
Rp[0] = KP500000000 * (Tz + TA);
}
{
/* Outputs Rm[0], Rp[1], Ip[1], Im[0], each scaled by 0.5. */
E Tt, Tu, Tv, Ty;
Tt = Tr - Ts;
Tu = Tp + Tm;
Rm[0] = KP500000000 * (Tt - Tu);
Rp[WS(rs, 1)] = KP500000000 * (Tt + Tu);
Tv = Tg - Td;
Ty = Tw - Tx;
Ip[WS(rs, 1)] = KP500000000 * (Tv + Ty);
Im[0] = KP500000000 * (Ty - Tv);
}
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below: two TW_CEXP
 * entries (third field 1 and 3) terminated by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one complex twiddle
 * (two reals), which would match the kernel reading W[0..3] per iteration --
 * confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: 4 (matches the generator's -n value), the codelet
 * name, the twiddle table, the GENUS object, and a count tuple whose first
 * three values equal the additions / multiplications / fused multiply-adds
 * listed in the generated header comment (24, 16, 8).
 */
static const hc2c_desc desc = { 4, "hc2cfdft2_4", twinstr, &GENUS, { 24, 16, 8, 0 } };
/*
 * Registration entry point for the hc2cfdft2_4 codelet (non-FMA variant).
 * Hands the kernel function, its descriptor and the HC2C_VIA_DFT constant
 * to X(khc2c_register) for planner p. Returns nothing.
 */
void X(codelet_hc2cfdft2_4) (planner *p) {
X(khc2c_register) (p, hc2cfdft2_4, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,442 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:38 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cfdft2_8 -include rdft/scalar/hc2cf.h */
/*
* This function contains 90 FP additions, 66 FP multiplications,
* (or, 60 additions, 36 multiplications, 30 fused multiply/add),
* 45 stack variables, 2 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft2_8 -- generated size-8 kernel, variant compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop:
 *   - reads six twiddle reals W[0..5] and forms further products from them;
 *   - loads Rp, Ip, Rm, Im at indices 0, WS(rs, 1), WS(rs, 2), WS(rs, 3);
 *   - writes sixteen results back to those same locations (in place).
 *
 * Parameters:
 *   Rp, Ip  data pointers, advanced by +ms after each iteration
 *   Rm, Im  data pointers, moved by -ms after each iteration
 *   W       twiddle table; offset by (mb - 1) * 6 before the first
 *           iteration and by 6 after each iteration
 *   rs      stride handed to WS() to address elements 1..3
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step
 */
static void hc2cfdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
E T1, T2, Th, Tj, T4, T5, T6, Tk, TB, Tq, Tw, Tc, TM, TQ;
{
/*
 * Load W[0..5] and precompute the twiddle products (T6, Tk, TB, Tq, Tw,
 * Tc, TM, TQ) that the input stages below reuse.
 */
E T3, Ti, Tp, Tb, TL, TP;
T1 = W[0];
T2 = W[2];
T3 = T1 * T2;
Th = W[4];
Ti = T1 * Th;
Tj = W[5];
Tp = T1 * Tj;
T4 = W[1];
T5 = W[3];
Tb = T1 * T5;
T6 = FMA(T4, T5, T3);
Tk = FMA(T4, Tj, Ti);
TB = FMA(T4, T2, Tb);
Tq = FNMS(T4, Th, Tp);
Tw = FNMS(T4, T5, T3);
TL = T6 * Th;
TP = T6 * Tj;
Tc = FNMS(T4, T2, Tb);
TM = FMA(Tc, Tj, TL);
TQ = FNMS(Tc, Th, TP);
}
{
E TI, T1a, TY, T1u, TF, T1s, TS, T1c, Tg, T1n, T13, T1f, Tu, T1p, T17;
E T1h;
{
/* Index 0: input sums/differences, combined with T1 and T4. */
E TG, TH, TX, TT, TU, TV, TW, T1t;
TG = Ip[0];
TH = Im[0];
TX = TG + TH;
TT = Rm[0];
TU = Rp[0];
TV = TT - TU;
TI = TG - TH;
T1a = TU + TT;
TW = T1 * TV;
TY = FNMS(T4, TX, TW);
T1t = T4 * TV;
T1u = FMA(T1, TX, T1t);
}
{
/* Index WS(rs, 2): input sums/differences, combined with Tw, TB, TM, TQ. */
E Tz, TR, TE, TN;
{
E Tx, Ty, TC, TD;
Tx = Ip[WS(rs, 2)];
Ty = Im[WS(rs, 2)];
Tz = Tx - Ty;
TR = Tx + Ty;
TC = Rp[WS(rs, 2)];
TD = Rm[WS(rs, 2)];
TE = TC + TD;
TN = TD - TC;
}
{
E TA, T1r, TO, T1b;
TA = Tw * Tz;
TF = FNMS(TB, TE, TA);
T1r = TQ * TN;
T1s = FMA(TM, TR, T1r);
TO = TM * TN;
TS = FNMS(TQ, TR, TO);
T1b = Tw * TE;
T1c = FMA(TB, Tz, T1b);
}
}
{
/* Index WS(rs, 1): input sums/differences, combined with T6, Tc, T2, T5. */
E T9, T12, Tf, T10;
{
E T7, T8, Td, Te;
T7 = Ip[WS(rs, 1)];
T8 = Im[WS(rs, 1)];
T9 = T7 - T8;
T12 = T7 + T8;
Td = Rp[WS(rs, 1)];
Te = Rm[WS(rs, 1)];
Tf = Td + Te;
T10 = Td - Te;
}
{
E Ta, T1m, T11, T1e;
Ta = T6 * T9;
Tg = FNMS(Tc, Tf, Ta);
T1m = T2 * T12;
T1n = FNMS(T5, T10, T1m);
T11 = T2 * T10;
T13 = FMA(T5, T12, T11);
T1e = T6 * Tf;
T1f = FMA(Tc, T9, T1e);
}
}
{
/* Index WS(rs, 3): input sums/differences, combined with Tk, Tq, Th, Tj. */
E Tn, T16, Tt, T14;
{
E Tl, Tm, Tr, Ts;
Tl = Ip[WS(rs, 3)];
Tm = Im[WS(rs, 3)];
Tn = Tl - Tm;
T16 = Tl + Tm;
Tr = Rp[WS(rs, 3)];
Ts = Rm[WS(rs, 3)];
Tt = Tr + Ts;
T14 = Tr - Ts;
}
{
E To, T1o, T15, T1g;
To = Tk * Tn;
Tu = FNMS(Tq, Tt, To);
T1o = Th * T16;
T1p = FNMS(Tj, T14, T1o);
T15 = Th * T14;
T17 = FMA(Tj, T16, T15);
T1g = Tk * Tt;
T1h = FMA(Tq, Tn, T1g);
}
}
{
/*
 * First output group: Rp/Ip at indices 0 and 2, Rm/Im at indices 1 and 3.
 * Each store is a sum or difference scaled by 0.5.
 */
E TK, T1l, T1w, T1y, T19, T1k, T1j, T1x;
{
E Tv, TJ, T1q, T1v;
Tv = Tg + Tu;
TJ = TF + TI;
TK = Tv + TJ;
T1l = TJ - Tv;
T1q = T1n + T1p;
T1v = T1s + T1u;
T1w = T1q - T1v;
T1y = T1q + T1v;
}
{
E TZ, T18, T1d, T1i;
TZ = TS + TY;
T18 = T13 + T17;
T19 = TZ - T18;
T1k = T18 + TZ;
T1d = T1a + T1c;
T1i = T1f + T1h;
T1j = T1d - T1i;
T1x = T1d + T1i;
}
Ip[0] = KP500000000 * (TK + T19);
Rp[0] = KP500000000 * (T1x + T1y);
Im[WS(rs, 3)] = KP500000000 * (T19 - TK);
Rm[WS(rs, 3)] = KP500000000 * (T1x - T1y);
Rm[WS(rs, 1)] = KP500000000 * (T1j - T1k);
Im[WS(rs, 1)] = KP500000000 * (T1w - T1l);
Rp[WS(rs, 2)] = KP500000000 * (T1j + T1k);
Ip[WS(rs, 2)] = KP500000000 * (T1l + T1w);
}
{
/*
 * Second output group: Rp/Ip at indices 1 and 3, Rm/Im at indices 0 and 2.
 * These stores additionally involve the KP707106781 constant.
 */
E T1B, T1N, T1L, T1R, T1E, T1O, T1H, T1P;
{
E T1z, T1A, T1J, T1K;
T1z = TI - TF;
T1A = T1f - T1h;
T1B = T1z - T1A;
T1N = T1A + T1z;
T1J = T1a - T1c;
T1K = Tg - Tu;
T1L = T1J - T1K;
T1R = T1J + T1K;
}
{
E T1C, T1D, T1F, T1G;
T1C = T1p - T1n;
T1D = T13 - T17;
T1E = T1C + T1D;
T1O = T1C - T1D;
T1F = TY - TS;
T1G = T1u - T1s;
T1H = T1F - T1G;
T1P = T1F + T1G;
}
{
E T1I, T1S, T1M, T1Q;
T1I = T1E + T1H;
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1I, T1B));
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP707106781, T1I, T1B)));
T1S = T1O + T1P;
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP707106781, T1S, T1R));
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1S, T1R));
T1M = T1H - T1E;
Rm[0] = KP500000000 * (FNMS(KP707106781, T1M, T1L));
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1M, T1L));
T1Q = T1O - T1P;
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1Q, T1N));
Im[0] = -(KP500000000 * (FNMS(KP707106781, T1Q, T1N)));
}
}
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below: three
 * TW_CEXP entries (third field 1, 3 and 7) terminated by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one complex twiddle
 * (two reals), which would match the kernel reading W[0..5] per iteration --
 * confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: 8 (matches the generator's -n value), the codelet
 * name, the twiddle table, the GENUS object, and a count tuple whose first
 * three values equal the additions / multiplications / fused multiply-adds
 * listed in the generated header comment (60, 36, 30).
 */
static const hc2c_desc desc = { 8, "hc2cfdft2_8", twinstr, &GENUS, { 60, 36, 30, 0 } };
/*
 * Registration entry point for the hc2cfdft2_8 codelet (FMA variant).
 * Hands the kernel function, its descriptor and the HC2C_VIA_DFT constant
 * to X(khc2c_register) for planner p. Returns nothing.
 */
void X(codelet_hc2cfdft2_8) (planner *p) {
X(khc2c_register) (p, hc2cfdft2_8, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cfdft2_8 -include rdft/scalar/hc2cf.h */
/*
* This function contains 90 FP additions, 56 FP multiplications,
* (or, 72 additions, 38 multiplications, 18 fused multiply/add),
* 51 stack variables, 2 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft2_8 -- generated size-8 kernel, variant compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop:
 *   - reads six twiddle reals W[0..5] and forms further products from them;
 *   - loads Rp, Ip, Rm, Im at indices 0, WS(rs, 1), WS(rs, 2), WS(rs, 3);
 *   - writes sixteen results back to those same locations (in place).
 *
 * Parameters:
 *   Rp, Ip  data pointers, advanced by +ms after each iteration
 *   Rm, Im  data pointers, moved by -ms after each iteration
 *   W       twiddle table; offset by (mb - 1) * 6 before the first
 *           iteration and by 6 after each iteration
 *   rs      stride handed to WS() to address elements 1..3
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step
 */
static void hc2cfdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.353553390... is numerically 0.5 * 0.707106781... */
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
E T1, T4, T2, T5, Tu, Ty, T7, Td, Ti, Tj, Tk, TP, To, TN;
{
/*
 * Load W[0..5] and precompute the twiddle products (Tu, Ty, T7, Td, Tk,
 * TP, To, TN) that the input stages below reuse.
 */
E T3, Tc, T6, Tb;
T1 = W[0];
T4 = W[1];
T2 = W[2];
T5 = W[3];
T3 = T1 * T2;
Tc = T4 * T2;
T6 = T4 * T5;
Tb = T1 * T5;
Tu = T3 - T6;
Ty = Tb + Tc;
T7 = T3 + T6;
Td = Tb - Tc;
Ti = W[4];
Tj = W[5];
Tk = FMA(T1, Ti, T4 * Tj);
TP = FNMS(Td, Ti, T7 * Tj);
To = FNMS(T4, Ti, T1 * Tj);
TN = FMA(T7, Ti, Td * Tj);
}
{
E TF, T11, TC, T12, T1d, T1e, T1q, TM, TR, T1p, Th, Ts, T15, T14, T1a;
E T1b, T1m, TV, TY, T1n;
{
/* Indices 0 and WS(rs, 2): load, form sums/differences, apply twiddles. */
E TD, TE, TL, TI, TJ, TK, Tx, TQ, TB, TO;
TD = Ip[0];
TE = Im[0];
TL = TD + TE;
TI = Rm[0];
TJ = Rp[0];
TK = TI - TJ;
{
E Tv, Tw, Tz, TA;
Tv = Ip[WS(rs, 2)];
Tw = Im[WS(rs, 2)];
Tx = Tv - Tw;
TQ = Tv + Tw;
Tz = Rp[WS(rs, 2)];
TA = Rm[WS(rs, 2)];
TB = Tz + TA;
TO = Tz - TA;
}
TF = TD - TE;
T11 = TJ + TI;
TC = FNMS(Ty, TB, Tu * Tx);
T12 = FMA(Tu, TB, Ty * Tx);
T1d = FNMS(TP, TO, TN * TQ);
T1e = FMA(T4, TK, T1 * TL);
T1q = T1e - T1d;
TM = FNMS(T4, TL, T1 * TK);
TR = FMA(TN, TO, TP * TQ);
T1p = TR + TM;
}
{
/* Indices WS(rs, 1) and WS(rs, 3): load, form sums/differences, apply twiddles. */
E Ta, TU, Tg, TT, Tn, TX, Tr, TW;
{
E T8, T9, Te, Tf;
T8 = Ip[WS(rs, 1)];
T9 = Im[WS(rs, 1)];
Ta = T8 - T9;
TU = T8 + T9;
Te = Rp[WS(rs, 1)];
Tf = Rm[WS(rs, 1)];
Tg = Te + Tf;
TT = Te - Tf;
}
{
E Tl, Tm, Tp, Tq;
Tl = Ip[WS(rs, 3)];
Tm = Im[WS(rs, 3)];
Tn = Tl - Tm;
TX = Tl + Tm;
Tp = Rp[WS(rs, 3)];
Tq = Rm[WS(rs, 3)];
Tr = Tp + Tq;
TW = Tp - Tq;
}
Th = FNMS(Td, Tg, T7 * Ta);
Ts = FNMS(To, Tr, Tk * Tn);
T15 = FMA(Tk, Tr, To * Tn);
T14 = FMA(T7, Tg, Td * Ta);
T1a = FNMS(T5, TT, T2 * TU);
T1b = FNMS(Tj, TW, Ti * TX);
T1m = T1b - T1a;
TV = FMA(T2, TT, T5 * TU);
TY = FMA(Ti, TW, Tj * TX);
T1n = TV - TY;
}
{
/*
 * First output group: Rp/Ip at indices 1 and 3, Rm/Im at indices 0 and 2.
 * The scale factors are applied to the intermediates, so the stores are
 * plain sums and differences.
 */
E T1l, T1x, T1A, T1C, T1s, T1w, T1v, T1B;
{
E T1j, T1k, T1y, T1z;
T1j = TF - TC;
T1k = T14 - T15;
T1l = KP500000000 * (T1j - T1k);
T1x = KP500000000 * (T1k + T1j);
T1y = T1m - T1n;
T1z = T1p + T1q;
T1A = KP353553390 * (T1y - T1z);
T1C = KP353553390 * (T1y + T1z);
}
{
E T1o, T1r, T1t, T1u;
T1o = T1m + T1n;
T1r = T1p - T1q;
T1s = KP353553390 * (T1o + T1r);
T1w = KP353553390 * (T1r - T1o);
T1t = T11 - T12;
T1u = Th - Ts;
T1v = KP500000000 * (T1t - T1u);
T1B = KP500000000 * (T1t + T1u);
}
Ip[WS(rs, 1)] = T1l + T1s;
Rp[WS(rs, 1)] = T1B + T1C;
Im[WS(rs, 2)] = T1s - T1l;
Rm[WS(rs, 2)] = T1B - T1C;
Rm[0] = T1v - T1w;
Im[0] = T1A - T1x;
Rp[WS(rs, 3)] = T1v + T1w;
Ip[WS(rs, 3)] = T1x + T1A;
}
{
/*
 * Second output group: Rp/Ip at indices 0 and 2, Rm/Im at indices 1 and 3.
 * Each store is a sum or difference scaled by 0.5.
 */
E TH, T19, T1g, T1i, T10, T18, T17, T1h;
{
E Tt, TG, T1c, T1f;
Tt = Th + Ts;
TG = TC + TF;
TH = Tt + TG;
T19 = TG - Tt;
T1c = T1a + T1b;
T1f = T1d + T1e;
T1g = T1c - T1f;
T1i = T1c + T1f;
}
{
E TS, TZ, T13, T16;
TS = TM - TR;
TZ = TV + TY;
T10 = TS - TZ;
T18 = TZ + TS;
T13 = T11 + T12;
T16 = T14 + T15;
T17 = T13 - T16;
T1h = T13 + T16;
}
Ip[0] = KP500000000 * (TH + T10);
Rp[0] = KP500000000 * (T1h + T1i);
Im[WS(rs, 3)] = KP500000000 * (T10 - TH);
Rm[WS(rs, 3)] = KP500000000 * (T1h - T1i);
Rm[WS(rs, 1)] = KP500000000 * (T17 - T18);
Im[WS(rs, 1)] = KP500000000 * (T1g - T19);
Rp[WS(rs, 2)] = KP500000000 * (T17 + T18);
Ip[WS(rs, 2)] = KP500000000 * (T19 + T1g);
}
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below: three
 * TW_CEXP entries (third field 1, 3 and 7) terminated by a TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry supplies one complex twiddle
 * (two reals), which would match the kernel reading W[0..5] per iteration --
 * confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: 8 (matches the generator's -n value), the codelet
 * name, the twiddle table, the GENUS object, and a count tuple whose first
 * three values equal the additions / multiplications / fused multiply-adds
 * listed in the generated header comment (72, 38, 18).
 */
static const hc2c_desc desc = { 8, "hc2cfdft2_8", twinstr, &GENUS, { 72, 38, 18, 0 } };
/*
 * Registration entry point for the hc2cfdft2_8 codelet (non-FMA variant).
 * Hands the kernel function, its descriptor and the HC2C_VIA_DFT constant
 * to X(khc2c_register) for planner p. Returns nothing.
 */
void X(codelet_hc2cfdft2_8) (planner *p) {
X(khc2c_register) (p, hc2cfdft2_8, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,546 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cfdft_10 -include rdft/scalar/hc2cf.h */
/*
* This function contains 122 FP additions, 92 FP multiplications,
* (or, 68 additions, 38 multiplications, 54 fused multiply/add),
* 81 stack variables, 5 constants, and 40 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft_10 -- generated size-10 kernel, variant compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop:
 *   - reads eighteen twiddle reals W[0..17];
 *   - loads Rp, Ip, Rm, Im at indices 0 and WS(rs, 1) .. WS(rs, 4);
 *   - writes twenty results back to those same locations (in place).
 *
 * Parameters:
 *   Rp, Ip  data pointers, advanced by +ms after each iteration
 *   Rm, Im  data pointers, moved by -ms after each iteration
 *   W       twiddle table; offset by (mb - 1) * 18 before the first
 *           iteration and by 18 after each iteration
 *   rs      stride handed to WS() to address elements 1..4
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step
 */
static void hc2cfdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
E T3, T1u, Td, T1w, T1S, T2f, T14, T1p, T1j, T1q, T1N, T2e, TQ, T2i, T1n;
E T1H, Tz, T2h, T1m, T1C;
{
/*
 * Input stage: load all twenty inputs and all eighteen twiddles, form the
 * per-index sums/differences and the twiddle products.
 */
E T1, T2, T1h, Tc, TW, T1c, T1d, T1b, T1f, T1g, T1Q, T7, TV, T1J, TS;
E TU, Ts, Tx, T19, T18, T1O, T15, T17, Tt, T1A, Ti, Tn, TE, TD, T1F;
E TA, TC, Tj, T1y, TJ, TO, T12, T11, T1L, TY, T10, TK, T1D;
{
/* Indices 0 and WS(rs, 2), with first-half products by W[0], W[1], W[6], W[7]. */
E Ta, Tb, T1e, T5, T6, TT;
T1 = Ip[0];
T2 = Im[0];
T1h = T1 + T2;
Ta = Rp[WS(rs, 2)];
Tb = Rm[WS(rs, 2)];
Tc = Ta - Tb;
TW = Ta + Tb;
T1c = Rm[0];
T1d = Rp[0];
T1e = T1c - T1d;
T1b = W[0];
T1f = T1b * T1e;
T1g = W[1];
T1Q = T1g * T1e;
T5 = Ip[WS(rs, 2)];
T6 = Im[WS(rs, 2)];
TT = T5 - T6;
T7 = T5 + T6;
TV = W[7];
T1J = TV * TT;
TS = W[6];
TU = TS * TT;
{
/* Index WS(rs, 3), with products by W[10], W[11], W[12]. */
E Tq, Tr, T16, Tv, Tw, Tp;
Tq = Rm[WS(rs, 3)];
Tr = Rp[WS(rs, 3)];
Ts = Tq - Tr;
Tv = Ip[WS(rs, 3)];
Tw = Im[WS(rs, 3)];
Tx = Tv + Tw;
T16 = Tv - Tw;
T19 = Tr + Tq;
T18 = W[11];
T1O = T18 * T16;
T15 = W[10];
T17 = T15 * T16;
Tp = W[12];
Tt = Tp * Ts;
T1A = Tp * Tx;
}
{
/* Index WS(rs, 1), with products by W[2], W[4], W[5]. */
E Tg, Th, TB, Tl, Tm, Tf;
Tg = Ip[WS(rs, 1)];
Th = Im[WS(rs, 1)];
Ti = Tg - Th;
Tl = Rp[WS(rs, 1)];
Tm = Rm[WS(rs, 1)];
Tn = Tl + Tm;
TB = Tm - Tl;
TE = Tg + Th;
TD = W[5];
T1F = TD * TB;
TA = W[4];
TC = TA * TB;
Tf = W[2];
Tj = Tf * Ti;
T1y = Tf * Tn;
}
{
/* Index WS(rs, 4), with products by W[14], W[16], W[17]. */
E TH, TI, TZ, TM, TN, TG;
TH = Ip[WS(rs, 4)];
TI = Im[WS(rs, 4)];
TJ = TH - TI;
TM = Rp[WS(rs, 4)];
TN = Rm[WS(rs, 4)];
TO = TM + TN;
TZ = TN - TM;
T12 = TH + TI;
T11 = W[17];
T1L = T11 * TZ;
TY = W[16];
T10 = TY * TZ;
TG = W[14];
TK = TG * TJ;
T1D = TG * TO;
}
}
{
/*
 * Complete the twiddle products with FMA/FNMS (using W[3], W[8], W[9],
 * W[13], W[15] as well) and pair the results into sums and differences.
 */
E T1P, T1R, T1K, T1M;
T3 = T1 - T2;
T1u = T1d + T1c;
{
E T4, T8, T9, T1v;
T4 = W[9];
T8 = T4 * T7;
T9 = W[8];
T1v = T9 * T7;
Td = FMA(T9, Tc, T8);
T1w = FNMS(T4, Tc, T1v);
}
T1P = FMA(T15, T19, T1O);
T1R = FMA(T1b, T1h, T1Q);
T1S = T1P - T1R;
T2f = T1P + T1R;
{
E TX, T13, T1a, T1i;
TX = FNMS(TV, TW, TU);
T13 = FNMS(T11, T12, T10);
T14 = TX + T13;
T1p = T13 - TX;
T1a = FNMS(T18, T19, T17);
T1i = FNMS(T1g, T1h, T1f);
T1j = T1a + T1i;
T1q = T1i - T1a;
}
T1K = FMA(TS, TW, T1J);
T1M = FMA(TY, T12, T1L);
T1N = T1K - T1M;
T2e = T1K + T1M;
{
E TF, T1G, TP, T1E, TL;
TF = FNMS(TD, TE, TC);
T1G = FMA(TA, TE, T1F);
TL = W[15];
TP = FNMS(TL, TO, TK);
T1E = FMA(TL, TJ, T1D);
TQ = TF + TP;
T2i = T1G + T1E;
T1n = TF - TP;
T1H = T1E - T1G;
}
{
E To, T1z, Ty, T1B, Tk, Tu;
Tk = W[3];
To = FNMS(Tk, Tn, Tj);
T1z = FMA(Tk, Ti, T1y);
Tu = W[13];
Ty = FNMS(Tu, Tx, Tt);
T1B = FMA(Tu, Ts, T1A);
Tz = To + Ty;
T2h = T1z + T1B;
T1m = Ty - To;
T1C = T1z - T1B;
}
}
}
{
/* Output group 1: Ip at indices 0, 2, 4 and Im at indices 1, 3. */
E T2k, T2m, Te, T1l, T2b, T2c, T2l, T2d;
{
E T2g, T2j, TR, T1k;
T2g = T2e - T2f;
T2j = T2h - T2i;
T2k = FNMS(KP618033988, T2j, T2g);
T2m = FMA(KP618033988, T2g, T2j);
Te = T3 - Td;
TR = Tz + TQ;
T1k = T14 + T1j;
T1l = TR + T1k;
T2b = FNMS(KP250000000, T1l, Te);
T2c = TR - T1k;
}
Ip[0] = KP500000000 * (Te + T1l);
T2l = FMA(KP559016994, T2c, T2b);
Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T2m, T2l));
Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T2m, T2l)));
T2d = FNMS(KP559016994, T2c, T2b);
Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T2k, T2d));
Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T2k, T2d)));
}
{
/* Output group 2: Rp at indices 0, 2, 4 and Rm at indices 1, 3. */
E T2w, T2y, T2n, T2q, T2r, T2s, T2x, T2t;
{
E T2u, T2v, T2o, T2p;
T2u = T14 - T1j;
T2v = Tz - TQ;
T2w = FNMS(KP618033988, T2v, T2u);
T2y = FMA(KP618033988, T2u, T2v);
T2n = T1u + T1w;
T2o = T2h + T2i;
T2p = T2e + T2f;
T2q = T2o + T2p;
T2r = FNMS(KP250000000, T2q, T2n);
T2s = T2o - T2p;
}
Rp[0] = KP500000000 * (T2n + T2q);
T2x = FMA(KP559016994, T2s, T2r);
Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T2y, T2x));
Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T2y, T2x));
T2t = FNMS(KP559016994, T2s, T2r);
Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T2w, T2t));
Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T2w, T2t));
}
{
/* Output group 3: Im at indices 0, 2, 4 and Ip at indices 1, 3. */
E T28, T2a, T1t, T1s, T23, T24, T29, T25;
{
E T26, T27, T1o, T1r;
T26 = T1H - T1C;
T27 = T1S - T1N;
T28 = FMA(KP618033988, T27, T26);
T2a = FNMS(KP618033988, T26, T27);
T1t = Td + T3;
T1o = T1m + T1n;
T1r = T1p + T1q;
T1s = T1o + T1r;
T23 = FMA(KP250000000, T1s, T1t);
T24 = T1r - T1o;
}
Im[WS(rs, 4)] = KP500000000 * (T1s - T1t);
T29 = FNMS(KP559016994, T24, T23);
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T2a, T29));
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP951056516, T2a, T29)));
T25 = FMA(KP559016994, T24, T23);
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T28, T25));
Im[0] = -(KP500000000 * (FNMS(KP951056516, T28, T25)));
}
{
/* Output group 4: Rm at indices 0, 2, 4 and Rp at indices 1, 3. */
E T20, T22, T1x, T1U, T1V, T1W, T21, T1X;
{
E T1Y, T1Z, T1I, T1T;
T1Y = T1n - T1m;
T1Z = T1q - T1p;
T20 = FMA(KP618033988, T1Z, T1Y);
T22 = FNMS(KP618033988, T1Y, T1Z);
T1x = T1u - T1w;
T1I = T1C + T1H;
T1T = T1N + T1S;
T1U = T1I + T1T;
T1V = FNMS(KP250000000, T1U, T1x);
T1W = T1I - T1T;
}
Rm[WS(rs, 4)] = KP500000000 * (T1x + T1U);
T21 = FNMS(KP559016994, T1W, T1V);
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T22, T21));
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T22, T21));
T1X = FMA(KP559016994, T1W, T1V);
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T20, T1X));
Rm[0] = KP500000000 * (FNMS(KP951056516, T20, T1X));
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry (third field 10) terminated by a TW_NEXT entry.
 * NOTE(review): the kernel reads 18 reals W[0..17] per iteration, i.e.
 * 2 * (10 - 1); presumably TW_FULL requests all nine complex twiddles --
 * confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: 10 (matches the generator's -n value), the codelet
 * name, the twiddle table, the GENUS object, and a count tuple whose first
 * three values equal the additions / multiplications / fused multiply-adds
 * listed in the generated header comment (68, 38, 54).
 */
static const hc2c_desc desc = { 10, "hc2cfdft_10", twinstr, &GENUS, { 68, 38, 54, 0 } };
/*
 * Registration entry point for the hc2cfdft_10 codelet (FMA variant).
 * Hands the kernel function, its descriptor and the HC2C_VIA_DFT constant
 * to X(khc2c_register) for planner p. Returns nothing.
 */
void X(codelet_hc2cfdft_10) (planner *p) {
X(khc2c_register) (p, hc2cfdft_10, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cfdft_10 -include rdft/scalar/hc2cf.h */
/*
* This function contains 122 FP additions, 68 FP multiplications,
* (or, 92 additions, 38 multiplications, 30 fused multiply/add),
* 62 stack variables, 5 constants, and 40 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft_10 -- generated size-10 kernel, variant compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop:
 *   - reads eighteen twiddle reals W[0..17];
 *   - loads Rp, Ip, Rm, Im at indices 0 and WS(rs, 1) .. WS(rs, 4);
 *   - writes twenty results back to those same locations (in place).
 *
 * Parameters:
 *   Rp, Ip  data pointers, advanced by +ms after each iteration
 *   Rm, Im  data pointers, moved by -ms after each iteration
 *   W       twiddle table; offset by (mb - 1) * 18 before the first
 *           iteration and by 18 after each iteration
 *   rs      stride handed to WS() to address elements 1..4
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step
 */
static void hc2cfdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP293892626, +0.293892626146236564584352977319536384298826219);
DK(KP475528258, +0.475528258147576786058219666689691071702849317);
DK(KP125000000, +0.125000000000000000000000000000000000000000000);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP279508497, +0.279508497187473712051146708591409529430077295);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
E Tw, TL, TM, T1W, T1X, T27, T1Z, T20, T26, TX, T1a, T1b, T1d, T1e, T1f;
E T1q, T1t, T1u, T1x, T1A, T1B, T1g, T1h, T1i, Td, T25, T1k, T1F;
{
/*
 * Input stage: load all twenty inputs and all eighteen twiddles, apply the
 * twiddles per index, then pair the results into sums and differences.
 */
E T3, T1D, T19, T1z, T7, Tb, TR, T1v, Tm, T1o, TK, T1s, Tv, T1p, T12;
E T1y, TF, T1r, TW, T1w;
{
/* Index 0, twiddles W[0], W[1]. */
E T1, T2, T18, T14, T15, T16, T13, T17;
T1 = Ip[0];
T2 = Im[0];
T18 = T1 + T2;
T14 = Rm[0];
T15 = Rp[0];
T16 = T14 - T15;
T3 = T1 - T2;
T1D = T15 + T14;
T13 = W[0];
T17 = W[1];
T19 = FNMS(T17, T18, T13 * T16);
T1z = FMA(T17, T16, T13 * T18);
}
{
/* Index WS(rs, 2), twiddles W[6], W[7] (W[8], W[9] are applied further down). */
E T5, T6, TO, T9, Ta, TQ, TN, TP;
T5 = Ip[WS(rs, 2)];
T6 = Im[WS(rs, 2)];
TO = T5 - T6;
T9 = Rp[WS(rs, 2)];
Ta = Rm[WS(rs, 2)];
TQ = T9 + Ta;
T7 = T5 + T6;
Tb = T9 - Ta;
TN = W[6];
TP = W[7];
TR = FNMS(TP, TQ, TN * TO);
T1v = FMA(TP, TO, TN * TQ);
}
{
/* Index WS(rs, 1), twiddles W[2] .. W[5]. */
E Th, TJ, Tl, TH;
{
E Tf, Tg, Tj, Tk;
Tf = Ip[WS(rs, 1)];
Tg = Im[WS(rs, 1)];
Th = Tf - Tg;
TJ = Tf + Tg;
Tj = Rp[WS(rs, 1)];
Tk = Rm[WS(rs, 1)];
Tl = Tj + Tk;
TH = Tj - Tk;
}
{
E Te, Ti, TG, TI;
Te = W[2];
Ti = W[3];
Tm = FNMS(Ti, Tl, Te * Th);
T1o = FMA(Te, Tl, Ti * Th);
TG = W[4];
TI = W[5];
TK = FMA(TG, TH, TI * TJ);
T1s = FNMS(TI, TH, TG * TJ);
}
}
{
/* Index WS(rs, 3), twiddles W[10] .. W[13]. */
E Tq, TZ, Tu, T11;
{
E To, Tp, Ts, Tt;
To = Ip[WS(rs, 3)];
Tp = Im[WS(rs, 3)];
Tq = To + Tp;
TZ = To - Tp;
Ts = Rp[WS(rs, 3)];
Tt = Rm[WS(rs, 3)];
Tu = Ts - Tt;
T11 = Ts + Tt;
}
{
E Tn, Tr, TY, T10;
Tn = W[13];
Tr = W[12];
Tv = FMA(Tn, Tq, Tr * Tu);
T1p = FNMS(Tn, Tu, Tr * Tq);
TY = W[10];
T10 = W[11];
T12 = FNMS(T10, T11, TY * TZ);
T1y = FMA(T10, TZ, TY * T11);
}
}
{
/* Index WS(rs, 4), twiddles W[14] .. W[17]. */
E TA, TV, TE, TT;
{
E Ty, Tz, TC, TD;
Ty = Ip[WS(rs, 4)];
Tz = Im[WS(rs, 4)];
TA = Ty - Tz;
TV = Ty + Tz;
TC = Rp[WS(rs, 4)];
TD = Rm[WS(rs, 4)];
TE = TC + TD;
TT = TC - TD;
}
{
E Tx, TB, TS, TU;
Tx = W[14];
TB = W[15];
TF = FNMS(TB, TE, Tx * TA);
T1r = FMA(Tx, TE, TB * TA);
TS = W[16];
TU = W[17];
TW = FMA(TS, TT, TU * TV);
T1w = FNMS(TU, TT, TS * TV);
}
}
/* Pairwise sums and differences consumed by the four output groups. */
Tw = Tm - Tv;
TL = TF - TK;
TM = Tw + TL;
T1W = T1v + T1w;
T1X = T1y + T1z;
T27 = T1W + T1X;
T1Z = T1o + T1p;
T20 = T1s + T1r;
T26 = T1Z + T20;
TX = TR - TW;
T1a = T12 + T19;
T1b = TX + T1a;
T1d = T19 - T12;
T1e = TR + TW;
T1f = T1d - T1e;
T1q = T1o - T1p;
T1t = T1r - T1s;
T1u = T1q + T1t;
T1x = T1v - T1w;
T1A = T1y - T1z;
T1B = T1x + T1A;
T1g = Tm + Tv;
T1h = TK + TF;
T1i = T1g + T1h;
{
/* Remaining index-2 terms, using twiddles W[8], W[9]. */
E Tc, T1E, T4, T8;
T4 = W[9];
T8 = W[8];
Tc = FMA(T4, T7, T8 * Tb);
T1E = FNMS(T4, Tb, T8 * T7);
Td = T3 - Tc;
T25 = T1D + T1E;
T1k = Tc + T3;
T1F = T1D - T1E;
}
}
{
/* Output group 1: Ip at indices 0, 2, 4 and Im at indices 1, 3. */
E T1U, T1c, T1T, T22, T24, T1Y, T21, T23, T1V;
T1U = KP279508497 * (TM - T1b);
T1c = TM + T1b;
T1T = FNMS(KP125000000, T1c, KP500000000 * Td);
T1Y = T1W - T1X;
T21 = T1Z - T20;
T22 = FNMS(KP293892626, T21, KP475528258 * T1Y);
T24 = FMA(KP475528258, T21, KP293892626 * T1Y);
Ip[0] = KP500000000 * (Td + T1c);
T23 = T1U + T1T;
Ip[WS(rs, 4)] = T23 + T24;
Im[WS(rs, 3)] = T24 - T23;
T1V = T1T - T1U;
Ip[WS(rs, 2)] = T1V + T22;
Im[WS(rs, 1)] = T22 - T1V;
}
{
/* Output group 2: Rp at indices 0, 2, 4 and Rm at indices 1, 3. */
E T2a, T28, T29, T2e, T2g, T2c, T2d, T2f, T2b;
T2a = KP279508497 * (T26 - T27);
T28 = T26 + T27;
T29 = FNMS(KP125000000, T28, KP500000000 * T25);
T2c = TX - T1a;
T2d = Tw - TL;
T2e = FNMS(KP293892626, T2d, KP475528258 * T2c);
T2g = FMA(KP475528258, T2d, KP293892626 * T2c);
Rp[0] = KP500000000 * (T25 + T28);
T2f = T2a + T29;
Rp[WS(rs, 4)] = T2f - T2g;
Rm[WS(rs, 3)] = T2g + T2f;
T2b = T29 - T2a;
Rp[WS(rs, 2)] = T2b - T2e;
Rm[WS(rs, 1)] = T2e + T2b;
}
{
/* Output group 3: Im at indices 0, 2, 4 and Ip at indices 1, 3. */
E T1M, T1j, T1L, T1Q, T1S, T1O, T1P, T1R, T1N;
T1M = KP279508497 * (T1i + T1f);
T1j = T1f - T1i;
T1L = FMA(KP500000000, T1k, KP125000000 * T1j);
T1O = T1A - T1x;
T1P = T1q - T1t;
T1Q = FNMS(KP475528258, T1P, KP293892626 * T1O);
T1S = FMA(KP293892626, T1P, KP475528258 * T1O);
Im[WS(rs, 4)] = KP500000000 * (T1j - T1k);
T1R = T1L - T1M;
Ip[WS(rs, 3)] = T1R + T1S;
Im[WS(rs, 2)] = T1S - T1R;
T1N = T1L + T1M;
Ip[WS(rs, 1)] = T1N + T1Q;
Im[0] = T1Q - T1N;
}
{
/* Output group 4: Rm at indices 0, 2, 4 and Rp at indices 1, 3. */
E T1C, T1G, T1H, T1n, T1J, T1l, T1m, T1K, T1I;
T1C = KP279508497 * (T1u - T1B);
T1G = T1u + T1B;
T1H = FNMS(KP125000000, T1G, KP500000000 * T1F);
T1l = T1g - T1h;
T1m = T1e + T1d;
T1n = FMA(KP475528258, T1l, KP293892626 * T1m);
T1J = FNMS(KP293892626, T1l, KP475528258 * T1m);
Rm[WS(rs, 4)] = KP500000000 * (T1F + T1G);
T1K = T1H - T1C;
Rp[WS(rs, 3)] = T1J + T1K;
Rm[WS(rs, 2)] = T1K - T1J;
T1I = T1C + T1H;
Rp[WS(rs, 1)] = T1n + T1I;
Rm[0] = T1I - T1n;
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry (third field 10) terminated by a TW_NEXT entry.
 * NOTE(review): the kernel reads 18 reals W[0..17] per iteration, i.e.
 * 2 * (10 - 1); presumably TW_FULL requests all nine complex twiddles --
 * confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: 10 (matches the generator's -n value), the codelet
 * name, the twiddle table, the GENUS object, and a count tuple whose first
 * three values equal the additions / multiplications / fused multiply-adds
 * listed in the generated header comment (92, 38, 30).
 */
static const hc2c_desc desc = { 10, "hc2cfdft_10", twinstr, &GENUS, { 92, 38, 30, 0 } };
/*
 * Registration entry point for the hc2cfdft_10 codelet (non-FMA variant).
 * Hands the kernel function, its descriptor and the HC2C_VIA_DFT constant
 * to X(khc2c_register) for planner p. Returns nothing.
 */
void X(codelet_hc2cfdft_10) (planner *p) {
X(khc2c_register) (p, hc2cfdft_10, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,646 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:37 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include rdft/scalar/hc2cf.h */
/*
* This function contains 142 FP additions, 92 FP multiplications,
* (or, 96 additions, 46 multiplications, 46 fused multiply/add),
* 65 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft_12 -- generated size-12 kernel, variant compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop:
 *   - reads twenty-two twiddle reals W[0..21];
 *   - loads Rp, Ip, Rm, Im at indices 0 and WS(rs, 1) .. WS(rs, 5);
 *   - writes twenty-four results back to those same locations (in place),
 *     each scaled by KP500000000 (0.5).
 *
 * Parameters:
 *   Rp, Ip  data pointers, advanced by +ms after each iteration
 *   Rm, Im  data pointers, moved by -ms after each iteration
 *   W       twiddle table; offset by (mb - 1) * 22 before the first
 *           iteration and by 22 after each iteration
 *   rs      stride handed to WS() to address elements 1..5
 *   mb, me  half-open iteration range
 *   ms      per-iteration pointer step
 */
static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
E To, T1E, T1m, T2H, Ta, T1G, Tk, T1I, Tl, T1J, T1s, T2b, T1A, T2d, T1B;
E T2I, T12, T18, T19, T24, T26, T2C, Tz, T1M, T1f, T2B, TJ, T1O, TT, T1Q;
E TU, T1R;
{
/*
 * Input stage, even indices (0, 2, 4): load the inputs, form sums and
 * differences, and apply twiddles W[0], W[1], W[6] .. W[9], W[14] .. W[17].
 */
E Tm, Tn, T1u, T1x, T1y, T1z, T1v, T2c, Te, Tj, T1i, T1l, Tf, T1H, T4;
E T1o, T9, T1r, T5, T1F, T1p, T2a, T1t, T1, T1n;
Tm = Ip[0];
Tn = Im[0];
T1u = Tm + Tn;
T1x = Rp[0];
T1y = Rm[0];
T1z = T1x - T1y;
T1t = W[0];
T1v = T1t * T1u;
T2c = T1t * T1z;
{
E Tc, Td, Th, Ti, Tb;
Tc = Ip[WS(rs, 4)];
Td = Im[WS(rs, 4)];
Te = Tc - Td;
Th = Rp[WS(rs, 4)];
Ti = Rm[WS(rs, 4)];
Tj = Th + Ti;
T1i = Tc + Td;
T1l = Th - Ti;
Tb = W[14];
Tf = Tb * Te;
T1H = Tb * Tj;
}
{
E T2, T3, T7, T8;
T2 = Ip[WS(rs, 2)];
T3 = Im[WS(rs, 2)];
T4 = T2 - T3;
T1o = T2 + T3;
T7 = Rp[WS(rs, 2)];
T8 = Rm[WS(rs, 2)];
T9 = T7 + T8;
T1r = T7 - T8;
}
T1 = W[6];
T5 = T1 * T4;
T1F = T1 * T9;
T1n = W[8];
T1p = T1n * T1o;
T2a = T1n * T1r;
To = Tm - Tn;
T1E = T1x + T1y;
{
E T1j, T2G, T1h, T1k;
T1h = W[16];
T1j = T1h * T1i;
T2G = T1h * T1l;
T1k = W[17];
T1m = FNMS(T1k, T1l, T1j);
T2H = FMA(T1k, T1i, T2G);
}
{
E T6, Tg, T1q, T1w;
T6 = W[7];
Ta = FNMS(T6, T9, T5);
T1G = FMA(T6, T4, T1F);
Tg = W[15];
Tk = FNMS(Tg, Tj, Tf);
T1I = FMA(Tg, Te, T1H);
Tl = Ta + Tk;
T1J = T1G + T1I;
T1q = W[9];
T1s = FNMS(T1q, T1r, T1p);
T2b = FMA(T1q, T1o, T2a);
T1w = W[1];
T1A = FNMS(T1w, T1z, T1v);
T2d = FMA(T1w, T1u, T2c);
T1B = T1s + T1A;
T2I = T2b + T2d;
}
}
{
/*
 * Input stage, odd indices (1, 3, 5): load the inputs, form sums and
 * differences, and apply twiddles W[2] .. W[5], W[10] .. W[13],
 * W[18] .. W[21].
 */
E Tt, T11, Ty, T10, T23, TX, TZ, TN, TS, T1b, T1e, TO, T1P, TD, TI;
E T17, T16, T25, T13, T15, TE, T1N, TF, TP;
{
E Tr, Ts, Tw, Tx, TY;
Tr = Ip[WS(rs, 3)];
Ts = Im[WS(rs, 3)];
Tt = Tr - Ts;
T11 = Tr + Ts;
Tw = Rp[WS(rs, 3)];
Tx = Rm[WS(rs, 3)];
TY = Tx - Tw;
Ty = Tw + Tx;
T10 = W[12];
T23 = T10 * TY;
TX = W[13];
TZ = TX * TY;
}
{
E TL, TM, TQ, TR, TK;
TL = Ip[WS(rs, 1)];
TM = Im[WS(rs, 1)];
TN = TL - TM;
TQ = Rp[WS(rs, 1)];
TR = Rm[WS(rs, 1)];
TS = TQ + TR;
T1b = TL + TM;
T1e = TQ - TR;
TK = W[2];
TO = TK * TN;
T1P = TK * TS;
}
{
E TB, TC, T14, TG, TH, TA;
TB = Ip[WS(rs, 5)];
TC = Im[WS(rs, 5)];
TD = TB - TC;
TG = Rp[WS(rs, 5)];
TH = Rm[WS(rs, 5)];
TI = TG + TH;
T14 = TH - TG;
T17 = TB + TC;
T16 = W[20];
T25 = T16 * T14;
T13 = W[21];
T15 = T13 * T14;
TA = W[18];
TE = TA * TD;
T1N = TA * TI;
}
T12 = FMA(T10, T11, TZ);
T18 = FMA(T16, T17, T15);
T19 = T12 + T18;
T24 = FNMS(TX, T11, T23);
T26 = FNMS(T13, T17, T25);
T2C = T24 + T26;
{
E Tu, T1L, Tq, Tv;
Tq = W[10];
Tu = Tq * Tt;
T1L = Tq * Ty;
Tv = W[11];
Tz = FNMS(Tv, Ty, Tu);
T1M = FMA(Tv, Tt, T1L);
}
{
E T1c, T2A, T1a, T1d;
T1a = W[4];
T1c = T1a * T1b;
T2A = T1a * T1e;
T1d = W[5];
T1f = FNMS(T1d, T1e, T1c);
T2B = FMA(T1d, T1b, T2A);
}
TF = W[19];
TJ = FNMS(TF, TI, TE);
T1O = FMA(TF, TD, T1N);
TP = W[3];
TT = FNMS(TP, TS, TO);
T1Q = FMA(TP, TN, T1P);
TU = TJ + TT;
T1R = T1O + T1Q;
}
{
/*
 * Output group 1 (plain sums/differences scaled by 0.5): Rp/Ip at indices
 * 0 and 3, Rm/Im at indices 2 and 5.
 */
E TW, T2V, T2Y, T30, T1D, T1U, T1T, T2Z;
{
E Tp, TV, T2W, T2X;
Tp = Tl + To;
TV = Tz + TU;
TW = Tp - TV;
T2V = TV + Tp;
T2W = T2C - T2B;
T2X = T2H + T2I;
T2Y = T2W - T2X;
T30 = T2W + T2X;
}
{
E T1g, T1C, T1K, T1S;
T1g = T19 + T1f;
T1C = T1m + T1B;
T1D = T1g - T1C;
T1U = T1g + T1C;
T1K = T1E + T1J;
T1S = T1M + T1R;
T1T = T1K + T1S;
T2Z = T1K - T1S;
}
Ip[WS(rs, 3)] = KP500000000 * (TW + T1D);
Rp[WS(rs, 3)] = KP500000000 * (T2Z - T30);
Im[WS(rs, 2)] = KP500000000 * (T1D - TW);
Rm[WS(rs, 2)] = KP500000000 * (T2Z + T30);
Rm[WS(rs, 5)] = KP500000000 * (T1T - T1U);
Im[WS(rs, 5)] = KP500000000 * (T2Y - T2V);
Rp[0] = KP500000000 * (T1T + T1U);
Ip[0] = KP500000000 * (T2V + T2Y);
}
{
/*
 * Output group 2: the remaining sixteen stores (Rp/Ip at indices 1, 2, 4, 5
 * and Rm/Im at indices 0, 1, 3, 4). Intermediates here are built with the
 * KP500000000 and KP866025403 constants before the final 0.5 scaling.
 */
E T1X, T2v, T2F, T2Q, T2L, T2R, T20, T2w, T28, T2t, T2j, T2p, T2m, T2q, T2f;
E T2s;
{
E T1V, T1W, T2D, T2E;
T1V = FNMS(KP500000000, T1J, T1E);
T1W = Ta - Tk;
T1X = FNMS(KP866025403, T1W, T1V);
T2v = FMA(KP866025403, T1W, T1V);
T2D = FMA(KP500000000, T2C, T2B);
T2E = T18 - T12;
T2F = FNMS(KP866025403, T2E, T2D);
T2Q = FMA(KP866025403, T2E, T2D);
}
{
E T2J, T2K, T1Y, T1Z;
T2J = FNMS(KP500000000, T2I, T2H);
T2K = T1s - T1A;
T2L = FNMS(KP866025403, T2K, T2J);
T2R = FMA(KP866025403, T2K, T2J);
T1Y = FNMS(KP500000000, T1R, T1M);
T1Z = TJ - TT;
T20 = FNMS(KP866025403, T1Z, T1Y);
T2w = FMA(KP866025403, T1Z, T1Y);
}
{
E T22, T27, T2h, T2i;
T22 = FNMS(KP500000000, T19, T1f);
T27 = T24 - T26;
T28 = FNMS(KP866025403, T27, T22);
T2t = FMA(KP866025403, T27, T22);
T2h = FNMS(KP500000000, Tl, To);
T2i = T1I - T1G;
T2j = FNMS(KP866025403, T2i, T2h);
T2p = FMA(KP866025403, T2i, T2h);
}
{
E T2k, T2l, T29, T2e;
T2k = FNMS(KP500000000, TU, Tz);
T2l = T1Q - T1O;
T2m = FNMS(KP866025403, T2l, T2k);
T2q = FMA(KP866025403, T2l, T2k);
T29 = FNMS(KP500000000, T1B, T1m);
T2e = T2b - T2d;
T2f = FNMS(KP866025403, T2e, T29);
T2s = FMA(KP866025403, T2e, T29);
}
{
/* Stores: Rp[2], Rm[3], Ip[2], Im[3]. */
E T21, T2g, T2P, T2S;
T21 = T1X + T20;
T2g = T28 + T2f;
Rp[WS(rs, 2)] = KP500000000 * (T21 - T2g);
Rm[WS(rs, 3)] = KP500000000 * (T21 + T2g);
T2P = T2m + T2j;
T2S = T2Q + T2R;
Ip[WS(rs, 2)] = KP500000000 * (T2P + T2S);
Im[WS(rs, 3)] = KP500000000 * (T2S - T2P);
}
{
/* Stores: Ip[5], Im[0], Rm[0], Rp[5]. */
E T2n, T2o, T2T, T2U;
T2n = T2j - T2m;
T2o = T2f - T28;
Ip[WS(rs, 5)] = KP500000000 * (T2n + T2o);
Im[0] = KP500000000 * (T2o - T2n);
T2T = T1X - T20;
T2U = T2R - T2Q;
Rm[0] = KP500000000 * (T2T - T2U);
Rp[WS(rs, 5)] = KP500000000 * (T2T + T2U);
}
{
/* Stores: Ip[1], Im[4], Rm[4], Rp[1]. */
E T2r, T2u, T2N, T2O;
T2r = T2p - T2q;
T2u = T2s - T2t;
Ip[WS(rs, 1)] = KP500000000 * (T2r + T2u);
Im[WS(rs, 4)] = KP500000000 * (T2u - T2r);
T2N = T2v - T2w;
T2O = T2L - T2F;
Rm[WS(rs, 4)] = KP500000000 * (T2N - T2O);
Rp[WS(rs, 1)] = KP500000000 * (T2N + T2O);
}
{
/* Stores: Rm[1], Rp[4], Ip[4], Im[1]. */
E T2x, T2y, T2z, T2M;
T2x = T2v + T2w;
T2y = T2t + T2s;
Rm[WS(rs, 1)] = KP500000000 * (T2x - T2y);
Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
T2z = T2q + T2p;
T2M = T2F + T2L;
Ip[WS(rs, 4)] = KP500000000 * (T2z - T2M);
Im[WS(rs, 1)] = -(KP500000000 * (T2z + T2M));
}
}
}
}
}
/*
 * Twiddle instruction table referenced by the descriptor below: one TW_FULL
 * entry (third field 12) terminated by a TW_NEXT entry.
 * NOTE(review): the kernel reads 22 reals W[0..21] per iteration, i.e.
 * 2 * (12 - 1); presumably TW_FULL requests all eleven complex twiddles --
 * confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: 12 (matches the generator's -n value), the codelet
 * name, the twiddle table, the GENUS object, and a count tuple whose first
 * three values equal the additions / multiplications / fused multiply-adds
 * listed in the generated header comment (96, 46, 46).
 */
static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, { 96, 46, 46, 0 } };
/*
 * Registration entry point for the hc2cfdft_12 codelet (FMA variant).
 * Hands the kernel function, its descriptor and the HC2C_VIA_DFT constant
 * to X(khc2c_register) for planner p. Returns nothing.
 */
void X(codelet_hc2cfdft_12) (planner *p) {
X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include rdft/scalar/hc2cf.h */
/*
* This function contains 142 FP additions, 76 FP multiplications,
* (or, 112 additions, 46 multiplications, 30 fused multiply/add),
* 52 stack variables, 3 constants, and 48 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * Size-12 hc2cfdft kernel, build variant used when neither ARCH_PREFERS_FMA
 * nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel reads the six elements at offsets
 * 0 and WS(rs, 1..5) of each of Rp, Ip, Rm and Im, combines them with the 22
 * twiddle values W[0..21], and writes 24 results back to the same four
 * arrays.  After each iteration Rp and Ip advance by +ms, Rm and Im by -ms,
 * and W by 22.  W is first advanced by (mb - 1) * 22.
 *
 * All loads of an iteration are issued before its first store.
 *
 * NOTE(review): FMA/FNMS are project macros not defined here; presumably
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 */
static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* KP433012701 is numerically sqrt(3)/4. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP433012701, +0.433012701892219323381861585376468091735701313);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
E Tm, T1t, T1d, T2j, Tj, T1Y, T1w, T1G, T1q, T2q, T1U, T2k, Tw, T1y, T17;
E T2g, TP, T21, T1B, T1J, T12, T2u, T1P, T2h;
/* Stage 1: load offsets 0, 2 and 4 as sum/difference pairs and combine
 * them with the twiddle pairs W[0..1], W[6..9] and W[14..17]. */
{
E Tk, Tl, T1k, T1m, T1n, T1o, T4, T1f, T8, T1h, Th, T1c, Td, T1a, T19;
E T1b;
{
E T2, T3, T6, T7;
Tk = Ip[0];
Tl = Im[0];
T1k = Tk + Tl;
T1m = Rp[0];
T1n = Rm[0];
T1o = T1m - T1n;
T2 = Ip[WS(rs, 2)];
T3 = Im[WS(rs, 2)];
T4 = T2 - T3;
T1f = T2 + T3;
T6 = Rp[WS(rs, 2)];
T7 = Rm[WS(rs, 2)];
T8 = T6 + T7;
T1h = T6 - T7;
{
E Tf, Tg, Tb, Tc;
Tf = Rp[WS(rs, 4)];
Tg = Rm[WS(rs, 4)];
Th = Tf + Tg;
T1c = Tf - Tg;
Tb = Ip[WS(rs, 4)];
Tc = Im[WS(rs, 4)];
Td = Tb - Tc;
T1a = Tb + Tc;
}
}
Tm = Tk - Tl;
T1t = T1m + T1n;
T19 = W[16];
T1b = W[17];
T1d = FNMS(T1b, T1c, T19 * T1a);
T2j = FMA(T19, T1c, T1b * T1a);
{
E T9, T1u, Ti, T1v;
{
E T1, T5, Ta, Te;
T1 = W[6];
T5 = W[7];
T9 = FNMS(T5, T8, T1 * T4);
T1u = FMA(T1, T8, T5 * T4);
Ta = W[14];
Te = W[15];
Ti = FNMS(Te, Th, Ta * Td);
T1v = FMA(Ta, Th, Te * Td);
}
Tj = T9 + Ti;
T1Y = KP433012701 * (T1v - T1u);
T1w = T1u + T1v;
T1G = KP433012701 * (T9 - Ti);
}
{
E T1i, T1S, T1p, T1T;
{
E T1e, T1g, T1j, T1l;
T1e = W[8];
T1g = W[9];
T1i = FNMS(T1g, T1h, T1e * T1f);
T1S = FMA(T1e, T1h, T1g * T1f);
T1j = W[0];
T1l = W[1];
T1p = FNMS(T1l, T1o, T1j * T1k);
T1T = FMA(T1j, T1o, T1l * T1k);
}
T1q = T1i + T1p;
T2q = KP433012701 * (T1i - T1p);
T1U = KP433012701 * (T1S - T1T);
T2k = T1S + T1T;
}
}
/* Stage 2: same treatment for offsets 3, 5 and 1, using the twiddle pairs
 * W[2..5], W[10..13] and W[18..21]. */
{
E Tr, TT, Tv, TV, TA, TY, TE, T10, TN, T14, TJ, T16;
{
E Tp, Tq, TC, TD;
Tp = Ip[WS(rs, 3)];
Tq = Im[WS(rs, 3)];
Tr = Tp - Tq;
TT = Tp + Tq;
{
E Tt, Tu, Ty, Tz;
Tt = Rp[WS(rs, 3)];
Tu = Rm[WS(rs, 3)];
Tv = Tt + Tu;
TV = Tt - Tu;
Ty = Ip[WS(rs, 5)];
Tz = Im[WS(rs, 5)];
TA = Ty - Tz;
TY = Ty + Tz;
}
TC = Rp[WS(rs, 5)];
TD = Rm[WS(rs, 5)];
TE = TC + TD;
T10 = TC - TD;
{
E TL, TM, TH, TI;
TL = Rp[WS(rs, 1)];
TM = Rm[WS(rs, 1)];
TN = TL + TM;
T14 = TM - TL;
TH = Ip[WS(rs, 1)];
TI = Im[WS(rs, 1)];
TJ = TH - TI;
T16 = TH + TI;
}
}
{
E To, Ts, T13, T15;
To = W[10];
Ts = W[11];
Tw = FNMS(Ts, Tv, To * Tr);
T1y = FMA(To, Tv, Ts * Tr);
T13 = W[5];
T15 = W[4];
T17 = FMA(T13, T14, T15 * T16);
T2g = FNMS(T13, T16, T15 * T14);
}
{
E TF, T1z, TO, T1A;
{
E Tx, TB, TG, TK;
Tx = W[18];
TB = W[19];
TF = FNMS(TB, TE, Tx * TA);
T1z = FMA(Tx, TE, TB * TA);
TG = W[2];
TK = W[3];
TO = FNMS(TK, TN, TG * TJ);
T1A = FMA(TG, TN, TK * TJ);
}
TP = TF + TO;
T21 = KP433012701 * (T1A - T1z);
T1B = T1z + T1A;
T1J = KP433012701 * (TF - TO);
}
{
E TW, T1O, T11, T1N;
{
E TS, TU, TX, TZ;
TS = W[12];
TU = W[13];
TW = FNMS(TU, TV, TS * TT);
T1O = FMA(TS, TV, TU * TT);
TX = W[20];
TZ = W[21];
T11 = FNMS(TZ, T10, TX * TY);
T1N = FMA(TX, T10, TZ * TY);
}
T12 = TW + T11;
T2u = KP433012701 * (T11 - TW);
T1P = KP433012701 * (T1N - T1O);
T2h = T1O + T1N;
}
}
/* Stage 3: first eight outputs (Rp/Ip at offsets 0 and 3, Rm/Im at offsets
 * 2 and 5), each a plain sum/difference of stage results times 1/2. */
{
E TR, T2f, T2m, T2o, T1s, T1E, T1D, T2n;
{
E Tn, TQ, T2i, T2l;
Tn = Tj + Tm;
TQ = Tw + TP;
TR = Tn - TQ;
T2f = TQ + Tn;
T2i = T2g - T2h;
T2l = T2j + T2k;
T2m = T2i - T2l;
T2o = T2i + T2l;
}
{
E T18, T1r, T1x, T1C;
T18 = T12 + T17;
T1r = T1d + T1q;
T1s = T18 - T1r;
T1E = T18 + T1r;
T1x = T1t + T1w;
T1C = T1y + T1B;
T1D = T1x + T1C;
T2n = T1x - T1C;
}
Ip[WS(rs, 3)] = KP500000000 * (TR + T1s);
Rp[WS(rs, 3)] = KP500000000 * (T2n - T2o);
Im[WS(rs, 2)] = KP500000000 * (T1s - TR);
Rm[WS(rs, 2)] = KP500000000 * (T2n + T2o);
Rm[WS(rs, 5)] = KP500000000 * (T1D - T1E);
Im[WS(rs, 5)] = KP500000000 * (T2m - T2f);
Rp[0] = KP500000000 * (T1D + T1E);
Ip[0] = KP500000000 * (T2f + T2m);
}
/* Stage 4: remaining sixteen outputs.  The 1/2 and 1/4 scalings are folded
 * into the intermediate terms here, so the stores carry no extra factor. */
{
E T1H, T2b, T2s, T2B, T2v, T2A, T1K, T2c, T1Q, T29, T1Z, T25, T22, T26, T1V;
E T28;
{
E T1F, T2r, T2t, T1I;
T1F = FNMS(KP250000000, T1w, KP500000000 * T1t);
T1H = T1F - T1G;
T2b = T1F + T1G;
T2r = FNMS(KP500000000, T2j, KP250000000 * T2k);
T2s = T2q - T2r;
T2B = T2q + T2r;
T2t = FMA(KP250000000, T2h, KP500000000 * T2g);
T2v = T2t - T2u;
T2A = T2u + T2t;
T1I = FNMS(KP250000000, T1B, KP500000000 * T1y);
T1K = T1I - T1J;
T2c = T1I + T1J;
}
{
E T1M, T1X, T20, T1R;
T1M = FNMS(KP250000000, T12, KP500000000 * T17);
T1Q = T1M - T1P;
T29 = T1P + T1M;
T1X = FNMS(KP250000000, Tj, KP500000000 * Tm);
T1Z = T1X - T1Y;
T25 = T1Y + T1X;
T20 = FNMS(KP250000000, TP, KP500000000 * Tw);
T22 = T20 - T21;
T26 = T21 + T20;
T1R = FNMS(KP250000000, T1q, KP500000000 * T1d);
T1V = T1R - T1U;
T28 = T1R + T1U;
}
{
E T1L, T1W, T2p, T2w;
T1L = T1H + T1K;
T1W = T1Q + T1V;
Rp[WS(rs, 2)] = T1L - T1W;
Rm[WS(rs, 3)] = T1L + T1W;
T2p = T22 + T1Z;
T2w = T2s - T2v;
Ip[WS(rs, 2)] = T2p + T2w;
Im[WS(rs, 3)] = T2w - T2p;
}
{
E T23, T24, T2x, T2y;
T23 = T1Z - T22;
T24 = T1V - T1Q;
Ip[WS(rs, 5)] = T23 + T24;
Im[0] = T24 - T23;
T2x = T1H - T1K;
T2y = T2v + T2s;
Rm[0] = T2x - T2y;
Rp[WS(rs, 5)] = T2x + T2y;
}
{
E T27, T2a, T2z, T2C;
T27 = T25 - T26;
T2a = T28 - T29;
Ip[WS(rs, 1)] = T27 + T2a;
Im[WS(rs, 4)] = T2a - T27;
T2z = T2b - T2c;
T2C = T2A - T2B;
Rm[WS(rs, 4)] = T2z - T2C;
Rp[WS(rs, 1)] = T2z + T2C;
}
{
E T2d, T2e, T2D, T2E;
T2d = T2b + T2c;
T2e = T29 + T28;
Rm[WS(rs, 1)] = T2d - T2e;
Rp[WS(rs, 4)] = T2d + T2e;
T2D = T26 + T25;
T2E = T2A + T2B;
Ip[WS(rs, 4)] = T2D + T2E;
Im[WS(rs, 1)] = T2E - T2D;
}
}
}
}
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 12) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm against the
 * tw_instr definition, which is not visible in this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA build of the size-12 codelet: size, name string,
 * twiddle list, a pointer to GENUS (not defined in this file), and the counts
 * quoted in the generator comment above the kernel (112 additions,
 * 46 multiplications, 30 fused multiply/adds) followed by a trailing 0.
 */
static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, { 112, 46, 30, 0 } };
/*
 * Registration entry point: hands the kernel hc2cfdft_12 and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft_12) (planner *p) {
X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,909 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:37 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cfdft_16 -include rdft/scalar/hc2cf.h */
/*
* This function contains 206 FP additions, 132 FP multiplications,
* (or, 136 additions, 62 multiplications, 70 fused multiply/add),
* 67 stack variables, 4 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * Size-16 hc2cfdft kernel, build variant used when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel reads the eight elements at offsets
 * 0 and WS(rs, 1..7) of each of Rp, Ip, Rm and Im, combines them with the 30
 * twiddle values W[0..29], and writes 32 results back to the same four
 * arrays.  After each iteration Rp and Ip advance by +ms, Rm and Im by -ms,
 * and W by 30.  W is first advanced by (mb - 1) * 30.
 *
 * All loads of an iteration are issued before its first store.
 *
 * NOTE(review): FMA/FNMS are project macros not defined here; presumably
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 */
static void hc2cfdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: KP923879532 = cos(pi/8), KP414213562 = sqrt(2) - 1,
 * KP707106781 = sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
E T1f, T2e, T1c, T2g, T1K, T3D, T2W, T3H, TR, T2j, T2R, T3E, T11, T2l, T1v;
E T3G, Ta, T2p, Tk, T2r, T3o, T3p, T1Y, T3z, T2G, T3w, Tv, T2u, TF, T2w;
E T3r, T3s, T2b, T3A, T2L, T3x;
/* Stage 1: offsets 0 and 4, combined with W[0..1] and W[14..17]. */
{
E T1d, T1e, T1I, T16, T1A, T1D, T1E, T1C, T1G, T1H, T2U, T1b, T1z, T2S, T1w;
E T1y, T14, T15;
T1d = Ip[0];
T1e = Im[0];
T1I = T1d + T1e;
T14 = Ip[WS(rs, 4)];
T15 = Im[WS(rs, 4)];
T16 = T14 - T15;
T1A = T14 + T15;
{
E T1F, T19, T1a, T1x;
T1D = Rm[0];
T1E = Rp[0];
T1F = T1D - T1E;
T1C = W[0];
T1G = T1C * T1F;
T1H = W[1];
T2U = T1H * T1F;
T19 = Rp[WS(rs, 4)];
T1a = Rm[WS(rs, 4)];
T1x = T1a - T19;
T1b = T19 + T1a;
T1z = W[17];
T2S = T1z * T1x;
T1w = W[16];
T1y = T1w * T1x;
}
T1f = T1d - T1e;
T2e = T1E + T1D;
{
E T17, T2f, T13, T18;
T13 = W[14];
T17 = T13 * T16;
T2f = T13 * T1b;
T18 = W[15];
T1c = FNMS(T18, T1b, T17);
T2g = FMA(T18, T16, T2f);
}
{
E T1B, T1J, T2T, T2V;
T1B = FNMS(T1z, T1A, T1y);
T1J = FNMS(T1H, T1I, T1G);
T1K = T1B + T1J;
T3D = T1J - T1B;
T2T = FMA(T1w, T1A, T2S);
T2V = FMA(T1C, T1I, T2U);
T2W = T2T + T2V;
T3H = T2V - T2T;
}
}
/* Stage 2: offsets 2 and 6, combined with W[6..9] and W[22..25]. */
{
E TL, T1n, TQ, T1m, T2N, T1j, T1l, TV, T1t, T10, T1s, T2P, T1p, T1r;
{
E TJ, TK, TO, TP, T1k;
TJ = Ip[WS(rs, 2)];
TK = Im[WS(rs, 2)];
TL = TJ - TK;
T1n = TJ + TK;
TO = Rp[WS(rs, 2)];
TP = Rm[WS(rs, 2)];
T1k = TP - TO;
TQ = TO + TP;
T1m = W[9];
T2N = T1m * T1k;
T1j = W[8];
T1l = T1j * T1k;
}
{
E TT, TU, TY, TZ, T1q;
TT = Ip[WS(rs, 6)];
TU = Im[WS(rs, 6)];
TV = TT - TU;
T1t = TT + TU;
TY = Rp[WS(rs, 6)];
TZ = Rm[WS(rs, 6)];
T1q = TZ - TY;
T10 = TY + TZ;
T1s = W[25];
T2P = T1s * T1q;
T1p = W[24];
T1r = T1p * T1q;
}
{
E T2O, T2Q, T1o, T1u;
{
E TM, T2i, TI, TN;
TI = W[6];
TM = TI * TL;
T2i = TI * TQ;
TN = W[7];
TR = FNMS(TN, TQ, TM);
T2j = FMA(TN, TL, T2i);
}
T2O = FMA(T1j, T1n, T2N);
T2Q = FMA(T1p, T1t, T2P);
T2R = T2O + T2Q;
T3E = T2O - T2Q;
{
E TW, T2k, TS, TX;
TS = W[22];
TW = TS * TV;
T2k = TS * T10;
TX = W[23];
T11 = FNMS(TX, T10, TW);
T2l = FMA(TX, TV, T2k);
}
T1o = FNMS(T1m, T1n, T1l);
T1u = FNMS(T1s, T1t, T1r);
T1v = T1o + T1u;
T3G = T1o - T1u;
}
}
/* Stage 3: offsets 1 and 5, combined with W[2..5] and W[18..21]. */
{
E T4, T1Q, T9, T1N, T5, T2o, T1O, T2C, Te, T1W, Tj, T1T, Tf, T2q, T1U;
E T2E, T6, Tg;
{
E T1, T1M, Tb, T1S;
{
E T2, T3, T7, T8;
T2 = Ip[WS(rs, 1)];
T3 = Im[WS(rs, 1)];
T4 = T2 - T3;
T1Q = T2 + T3;
T7 = Rp[WS(rs, 1)];
T8 = Rm[WS(rs, 1)];
T9 = T7 + T8;
T1N = T7 - T8;
}
T1 = W[2];
T5 = T1 * T4;
T2o = T1 * T9;
T1M = W[4];
T1O = T1M * T1N;
T2C = T1M * T1Q;
{
E Tc, Td, Th, Ti;
Tc = Ip[WS(rs, 5)];
Td = Im[WS(rs, 5)];
Te = Tc - Td;
T1W = Tc + Td;
Th = Rp[WS(rs, 5)];
Ti = Rm[WS(rs, 5)];
Tj = Th + Ti;
T1T = Th - Ti;
}
Tb = W[18];
Tf = Tb * Te;
T2q = Tb * Tj;
T1S = W[20];
T1U = T1S * T1T;
T2E = T1S * T1W;
}
T6 = W[3];
Ta = FNMS(T6, T9, T5);
T2p = FMA(T6, T4, T2o);
Tg = W[19];
Tk = FNMS(Tg, Tj, Tf);
T2r = FMA(Tg, Te, T2q);
T3o = Ta - Tk;
T3p = T2p - T2r;
{
E T1R, T2D, T1X, T2F, T1P, T1V;
T1P = W[5];
T1R = FMA(T1P, T1Q, T1O);
T2D = FNMS(T1P, T1N, T2C);
T1V = W[21];
T1X = FMA(T1V, T1W, T1U);
T2F = FNMS(T1V, T1T, T2E);
T1Y = T1R + T1X;
T3z = T1X - T1R;
T2G = T2D + T2F;
T3w = T2F - T2D;
}
}
/* Stage 4: offsets 7 and 3, combined with W[26..29] and W[10..13]. */
{
E Tp, T23, Tu, T20, Tq, T2t, T21, T2H, Tz, T29, TE, T26, TA, T2v, T27;
E T2J, Tr, TB;
{
E Tm, T1Z, Tw, T25;
{
E Tn, To, Ts, Tt;
Tn = Ip[WS(rs, 7)];
To = Im[WS(rs, 7)];
Tp = Tn - To;
T23 = Tn + To;
Ts = Rp[WS(rs, 7)];
Tt = Rm[WS(rs, 7)];
Tu = Ts + Tt;
T20 = Ts - Tt;
}
Tm = W[26];
Tq = Tm * Tp;
T2t = Tm * Tu;
T1Z = W[28];
T21 = T1Z * T20;
T2H = T1Z * T23;
{
E Tx, Ty, TC, TD;
Tx = Ip[WS(rs, 3)];
Ty = Im[WS(rs, 3)];
Tz = Tx - Ty;
T29 = Tx + Ty;
TC = Rp[WS(rs, 3)];
TD = Rm[WS(rs, 3)];
TE = TC + TD;
T26 = TC - TD;
}
Tw = W[10];
TA = Tw * Tz;
T2v = Tw * TE;
T25 = W[12];
T27 = T25 * T26;
T2J = T25 * T29;
}
Tr = W[27];
Tv = FNMS(Tr, Tu, Tq);
T2u = FMA(Tr, Tp, T2t);
TB = W[11];
TF = FNMS(TB, TE, TA);
T2w = FMA(TB, Tz, T2v);
T3r = T2u - T2w;
T3s = Tv - TF;
{
E T24, T2I, T2a, T2K, T22, T28;
T22 = W[29];
T24 = FMA(T22, T23, T21);
T2I = FNMS(T22, T20, T2H);
T28 = W[13];
T2a = FMA(T28, T29, T27);
T2K = FNMS(T28, T26, T2J);
T2b = T24 + T2a;
T3A = T2I - T2K;
T2L = T2I + T2K;
T3x = T2a - T24;
}
}
/* Stage 5: first sixteen outputs -- Rp/Ip at offsets 0, 2, 4, 6 and Rm/Im
 * at offsets 1, 3, 5, 7. */
{
E TH, T3c, T36, T3g, T39, T3h, T1h, T32, T2d, T2A, T2y, T31, T2Y, T30, T2n;
E T3b;
{
E Tl, TG, T34, T35;
Tl = Ta + Tk;
TG = Tv + TF;
TH = Tl + TG;
T3c = Tl - TG;
T34 = T2L - T2G;
T35 = T1Y - T2b;
T36 = T34 + T35;
T3g = T34 - T35;
}
{
E T37, T38, T12, T1g;
T37 = T1K - T1v;
T38 = T2W - T2R;
T39 = T37 - T38;
T3h = T37 + T38;
T12 = TR + T11;
T1g = T1c + T1f;
T1h = T12 + T1g;
T32 = T1g - T12;
}
{
E T1L, T2c, T2s, T2x;
T1L = T1v + T1K;
T2c = T1Y + T2b;
T2d = T1L - T2c;
T2A = T2c + T1L;
T2s = T2p + T2r;
T2x = T2u + T2w;
T2y = T2s + T2x;
T31 = T2x - T2s;
}
{
E T2M, T2X, T2h, T2m;
T2M = T2G + T2L;
T2X = T2R + T2W;
T2Y = T2M - T2X;
T30 = T2M + T2X;
T2h = T2e + T2g;
T2m = T2j + T2l;
T2n = T2h + T2m;
T3b = T2h - T2m;
}
{
E T1i, T2Z, T2z, T2B;
T1i = TH + T1h;
Ip[0] = KP500000000 * (T1i + T2d);
Im[WS(rs, 7)] = KP500000000 * (T2d - T1i);
T2Z = T2n + T2y;
Rm[WS(rs, 7)] = KP500000000 * (T2Z - T30);
Rp[0] = KP500000000 * (T2Z + T30);
T2z = T2n - T2y;
Rm[WS(rs, 3)] = KP500000000 * (T2z - T2A);
Rp[WS(rs, 4)] = KP500000000 * (T2z + T2A);
T2B = T1h - TH;
Ip[WS(rs, 4)] = KP500000000 * (T2B + T2Y);
Im[WS(rs, 3)] = KP500000000 * (T2Y - T2B);
}
{
E T33, T3a, T3j, T3k;
T33 = T31 + T32;
T3a = T36 + T39;
Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3a, T33));
Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3a, T33)));
T3j = T3b + T3c;
T3k = T3g + T3h;
Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3k, T3j));
Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3k, T3j));
}
{
E T3d, T3e, T3f, T3i;
T3d = T3b - T3c;
T3e = T39 - T36;
Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3e, T3d));
Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3e, T3d));
T3f = T32 - T31;
T3i = T3g - T3h;
Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3i, T3f));
Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3i, T3f)));
}
}
/* Stage 6: remaining sixteen outputs -- Rp/Ip at offsets 1, 3, 5, 7 and
 * Rm/Im at offsets 0, 2, 4, 6. */
{
E T3n, T3Z, T44, T4e, T47, T4f, T3u, T4a, T3C, T3U, T3N, T49, T3Q, T40, T3J;
E T3V;
{
E T3l, T3m, T42, T43;
T3l = T1f - T1c;
T3m = T2j - T2l;
T3n = T3l - T3m;
T3Z = T3m + T3l;
T42 = T3w - T3x;
T43 = T3A - T3z;
T44 = FMA(KP414213562, T43, T42);
T4e = FNMS(KP414213562, T42, T43);
}
{
E T45, T46, T3q, T3t;
T45 = T3E + T3D;
T46 = T3H - T3G;
T47 = FMA(KP414213562, T46, T45);
T4f = FNMS(KP414213562, T45, T46);
T3q = T3o - T3p;
T3t = T3r + T3s;
T3u = T3q + T3t;
T4a = T3q - T3t;
}
{
E T3y, T3B, T3L, T3M;
T3y = T3w + T3x;
T3B = T3z + T3A;
T3C = FMA(KP414213562, T3B, T3y);
T3U = FNMS(KP414213562, T3y, T3B);
T3L = T2e - T2g;
T3M = TR - T11;
T3N = T3L + T3M;
T49 = T3L - T3M;
}
{
E T3O, T3P, T3F, T3I;
T3O = T3p + T3o;
T3P = T3r - T3s;
T3Q = T3O + T3P;
T40 = T3P - T3O;
T3F = T3D - T3E;
T3I = T3G + T3H;
T3J = FNMS(KP414213562, T3I, T3F);
T3V = FMA(KP414213562, T3F, T3I);
}
{
E T3v, T3K, T3X, T3Y;
T3v = FMA(KP707106781, T3u, T3n);
T3K = T3C + T3J;
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3K, T3v));
Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3K, T3v)));
T3X = FMA(KP707106781, T3Q, T3N);
T3Y = T3U + T3V;
Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T3Y, T3X));
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3Y, T3X));
}
{
E T3R, T3S, T3T, T3W;
T3R = FNMS(KP707106781, T3Q, T3N);
T3S = T3J - T3C;
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T3S, T3R));
Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T3S, T3R));
T3T = FNMS(KP707106781, T3u, T3n);
T3W = T3U - T3V;
Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T3W, T3T));
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3T)));
}
{
E T41, T48, T4h, T4i;
T41 = FNMS(KP707106781, T40, T3Z);
T48 = T44 - T47;
Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T48, T41));
Im[0] = -(KP500000000 * (FNMS(KP923879532, T48, T41)));
T4h = FNMS(KP707106781, T4a, T49);
T4i = T4e + T4f;
Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4i, T4h));
Rm[0] = KP500000000 * (FMA(KP923879532, T4i, T4h));
}
{
E T4b, T4c, T4d, T4g;
T4b = FMA(KP707106781, T4a, T49);
T4c = T44 + T47;
Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4c, T4b));
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4c, T4b));
T4d = FMA(KP707106781, T40, T3Z);
T4g = T4e - T4f;
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4g, T4d));
Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4g, T4d)));
}
}
}
}
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 16) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm against the
 * tw_instr definition, which is not visible in this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA build of the size-16 codelet: size, name string,
 * twiddle list, a pointer to GENUS (not defined in this file), and the counts
 * quoted in the generator comment above the kernel (136 additions,
 * 62 multiplications, 70 fused multiply/adds) followed by a trailing 0.
 */
static const hc2c_desc desc = { 16, "hc2cfdft_16", twinstr, &GENUS, { 136, 62, 70, 0 } };
/*
 * Registration entry point: hands the kernel hc2cfdft_16 and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft_16) (planner *p) {
X(khc2c_register) (p, hc2cfdft_16, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cfdft_16 -include rdft/scalar/hc2cf.h */
/*
* This function contains 206 FP additions, 100 FP multiplications,
* (or, 168 additions, 62 multiplications, 38 fused multiply/add),
* 61 stack variables, 4 constants, and 64 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * Size-16 hc2cfdft kernel, build variant used when neither ARCH_PREFERS_FMA
 * nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the kernel reads the eight elements at offsets
 * 0 and WS(rs, 1..7) of each of Rp, Ip, Rm and Im, combines them with the 30
 * twiddle values W[0..29], and writes 32 results back to the same four
 * arrays.  After each iteration Rp and Ip advance by +ms, Rm and Im by -ms,
 * and W by 30.  W is first advanced by (mb - 1) * 30.
 *
 * All loads of an iteration are issued before its first store.
 *
 * NOTE(review): FMA/FNMS are project macros not defined here; presumably
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm.
 */
static void hc2cfdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: KP461939766 = cos(pi/8)/2, KP191341716 = sin(pi/8)/2,
 * KP353553390 = sqrt(2)/4. */
DK(KP461939766, +0.461939766255643378064091594698394143411208313);
DK(KP191341716, +0.191341716182544885864229992015199433380672281);
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
E T19, T3h, T21, T2Y, T1o, T3d, T2s, T39, TW, T3i, T24, T2Z, T1z, T3c, T2p;
E T3a, Tj, T2S, T28, T2R, T1L, T36, T2i, T32, TC, T2V, T2b, T2U, T1W, T35;
E T2l, T33;
/* Stage 1: offsets 4 and 0, combined with W[14..17] and W[0..1]. */
{
E T10, T1m, T14, T1k, T18, T1h, T1f, T1Z;
{
E TY, TZ, T12, T13;
TY = Ip[WS(rs, 4)];
TZ = Im[WS(rs, 4)];
T10 = TY - TZ;
T1m = TY + TZ;
T12 = Rp[WS(rs, 4)];
T13 = Rm[WS(rs, 4)];
T14 = T12 + T13;
T1k = T12 - T13;
}
{
E T16, T17, T1d, T1e;
T16 = Ip[0];
T17 = Im[0];
T18 = T16 - T17;
T1h = T16 + T17;
T1d = Rm[0];
T1e = Rp[0];
T1f = T1d - T1e;
T1Z = T1e + T1d;
}
{
E T15, T20, TX, T11;
TX = W[14];
T11 = W[15];
T15 = FNMS(T11, T14, TX * T10);
T20 = FMA(TX, T14, T11 * T10);
T19 = T15 + T18;
T3h = T1Z - T20;
T21 = T1Z + T20;
T2Y = T18 - T15;
}
{
E T1i, T2r, T1n, T2q;
{
E T1c, T1g, T1j, T1l;
T1c = W[0];
T1g = W[1];
T1i = FNMS(T1g, T1h, T1c * T1f);
T2r = FMA(T1g, T1f, T1c * T1h);
T1j = W[16];
T1l = W[17];
T1n = FMA(T1j, T1k, T1l * T1m);
T2q = FNMS(T1l, T1k, T1j * T1m);
}
T1o = T1i - T1n;
T3d = T2r - T2q;
T2s = T2q + T2r;
T39 = T1n + T1i;
}
}
/* Stage 2: offsets 2 and 6, combined with W[6..9] and W[22..25]. */
{
E TH, T1s, TL, T1q, TQ, T1x, TU, T1v;
{
E TF, TG, TJ, TK;
TF = Ip[WS(rs, 2)];
TG = Im[WS(rs, 2)];
TH = TF - TG;
T1s = TF + TG;
TJ = Rp[WS(rs, 2)];
TK = Rm[WS(rs, 2)];
TL = TJ + TK;
T1q = TJ - TK;
}
{
E TO, TP, TS, TT;
TO = Ip[WS(rs, 6)];
TP = Im[WS(rs, 6)];
TQ = TO - TP;
T1x = TO + TP;
TS = Rp[WS(rs, 6)];
TT = Rm[WS(rs, 6)];
TU = TS + TT;
T1v = TS - TT;
}
{
E TM, T22, TV, T23;
{
E TE, TI, TN, TR;
TE = W[6];
TI = W[7];
TM = FNMS(TI, TL, TE * TH);
T22 = FMA(TE, TL, TI * TH);
TN = W[22];
TR = W[23];
TV = FNMS(TR, TU, TN * TQ);
T23 = FMA(TN, TU, TR * TQ);
}
TW = TM + TV;
T3i = TM - TV;
T24 = T22 + T23;
T2Z = T22 - T23;
}
{
E T1t, T2n, T1y, T2o;
{
E T1p, T1r, T1u, T1w;
T1p = W[8];
T1r = W[9];
T1t = FMA(T1p, T1q, T1r * T1s);
T2n = FNMS(T1r, T1q, T1p * T1s);
T1u = W[24];
T1w = W[25];
T1y = FMA(T1u, T1v, T1w * T1x);
T2o = FNMS(T1w, T1v, T1u * T1x);
}
T1z = T1t + T1y;
T3c = T1y - T1t;
T2p = T2n + T2o;
T3a = T2n - T2o;
}
}
/* Stage 3: offsets 1 and 5, combined with W[2..5] and W[18..21]. */
{
E T4, T1E, T8, T1C, Td, T1J, Th, T1H;
{
E T2, T3, T6, T7;
T2 = Ip[WS(rs, 1)];
T3 = Im[WS(rs, 1)];
T4 = T2 - T3;
T1E = T2 + T3;
T6 = Rp[WS(rs, 1)];
T7 = Rm[WS(rs, 1)];
T8 = T6 + T7;
T1C = T6 - T7;
}
{
E Tb, Tc, Tf, Tg;
Tb = Ip[WS(rs, 5)];
Tc = Im[WS(rs, 5)];
Td = Tb - Tc;
T1J = Tb + Tc;
Tf = Rp[WS(rs, 5)];
Tg = Rm[WS(rs, 5)];
Th = Tf + Tg;
T1H = Tf - Tg;
}
{
E T9, T26, Ti, T27;
{
E T1, T5, Ta, Te;
T1 = W[2];
T5 = W[3];
T9 = FNMS(T5, T8, T1 * T4);
T26 = FMA(T1, T8, T5 * T4);
Ta = W[18];
Te = W[19];
Ti = FNMS(Te, Th, Ta * Td);
T27 = FMA(Ta, Th, Te * Td);
}
Tj = T9 + Ti;
T2S = T26 - T27;
T28 = T26 + T27;
T2R = T9 - Ti;
}
{
E T1F, T2g, T1K, T2h;
{
E T1B, T1D, T1G, T1I;
T1B = W[4];
T1D = W[5];
T1F = FMA(T1B, T1C, T1D * T1E);
T2g = FNMS(T1D, T1C, T1B * T1E);
T1G = W[20];
T1I = W[21];
T1K = FMA(T1G, T1H, T1I * T1J);
T2h = FNMS(T1I, T1H, T1G * T1J);
}
T1L = T1F + T1K;
T36 = T2g - T2h;
T2i = T2g + T2h;
T32 = T1K - T1F;
}
}
/* Stage 4: offsets 7 and 3, combined with W[26..29] and W[10..13]. */
{
E Tn, T1P, Tr, T1N, Tw, T1U, TA, T1S;
{
E Tl, Tm, Tp, Tq;
Tl = Ip[WS(rs, 7)];
Tm = Im[WS(rs, 7)];
Tn = Tl - Tm;
T1P = Tl + Tm;
Tp = Rp[WS(rs, 7)];
Tq = Rm[WS(rs, 7)];
Tr = Tp + Tq;
T1N = Tp - Tq;
}
{
E Tu, Tv, Ty, Tz;
Tu = Ip[WS(rs, 3)];
Tv = Im[WS(rs, 3)];
Tw = Tu - Tv;
T1U = Tu + Tv;
Ty = Rp[WS(rs, 3)];
Tz = Rm[WS(rs, 3)];
TA = Ty + Tz;
T1S = Ty - Tz;
}
{
E Ts, T29, TB, T2a;
{
E Tk, To, Tt, Tx;
Tk = W[26];
To = W[27];
Ts = FNMS(To, Tr, Tk * Tn);
T29 = FMA(Tk, Tr, To * Tn);
Tt = W[10];
Tx = W[11];
TB = FNMS(Tx, TA, Tt * Tw);
T2a = FMA(Tt, TA, Tx * Tw);
}
TC = Ts + TB;
T2V = Ts - TB;
T2b = T29 + T2a;
T2U = T29 - T2a;
}
{
E T1Q, T2j, T1V, T2k;
{
E T1M, T1O, T1R, T1T;
T1M = W[28];
T1O = W[29];
T1Q = FMA(T1M, T1N, T1O * T1P);
T2j = FNMS(T1O, T1N, T1M * T1P);
T1R = W[12];
T1T = W[13];
T1V = FMA(T1R, T1S, T1T * T1U);
T2k = FNMS(T1T, T1S, T1R * T1U);
}
T1W = T1Q + T1V;
T35 = T1V - T1Q;
T2l = T2j + T2k;
T33 = T2j - T2k;
}
}
/* Stage 5: outputs Rp/Ip at offsets 0 and 4, Rm/Im at offsets 3 and 7. */
{
E T1b, T2f, T2u, T2w, T1Y, T2e, T2d, T2v;
{
E TD, T1a, T2m, T2t;
TD = Tj + TC;
T1a = TW + T19;
T1b = TD + T1a;
T2f = T1a - TD;
T2m = T2i + T2l;
T2t = T2p + T2s;
T2u = T2m - T2t;
T2w = T2m + T2t;
}
{
E T1A, T1X, T25, T2c;
T1A = T1o - T1z;
T1X = T1L + T1W;
T1Y = T1A - T1X;
T2e = T1X + T1A;
T25 = T21 + T24;
T2c = T28 + T2b;
T2d = T25 - T2c;
T2v = T25 + T2c;
}
Ip[0] = KP500000000 * (T1b + T1Y);
Rp[0] = KP500000000 * (T2v + T2w);
Im[WS(rs, 7)] = KP500000000 * (T1Y - T1b);
Rm[WS(rs, 7)] = KP500000000 * (T2v - T2w);
Rm[WS(rs, 3)] = KP500000000 * (T2d - T2e);
Im[WS(rs, 3)] = KP500000000 * (T2u - T2f);
Rp[WS(rs, 4)] = KP500000000 * (T2d + T2e);
Ip[WS(rs, 4)] = KP500000000 * (T2f + T2u);
}
/* Stage 6: outputs Rp/Ip at offsets 2 and 6, Rm/Im at offsets 1 and 5. */
{
E T2z, T2L, T2J, T2P, T2C, T2M, T2F, T2N;
{
E T2x, T2y, T2H, T2I;
T2x = T2b - T28;
T2y = T19 - TW;
T2z = KP500000000 * (T2x + T2y);
T2L = KP500000000 * (T2y - T2x);
T2H = T21 - T24;
T2I = Tj - TC;
T2J = KP500000000 * (T2H - T2I);
T2P = KP500000000 * (T2H + T2I);
}
{
E T2A, T2B, T2D, T2E;
T2A = T2l - T2i;
T2B = T1L - T1W;
T2C = T2A + T2B;
T2M = T2A - T2B;
T2D = T1z + T1o;
T2E = T2s - T2p;
T2F = T2D - T2E;
T2N = T2D + T2E;
}
{
E T2G, T2Q, T2K, T2O;
T2G = KP353553390 * (T2C + T2F);
Ip[WS(rs, 2)] = T2z + T2G;
Im[WS(rs, 5)] = T2G - T2z;
T2Q = KP353553390 * (T2M + T2N);
Rm[WS(rs, 5)] = T2P - T2Q;
Rp[WS(rs, 2)] = T2P + T2Q;
T2K = KP353553390 * (T2F - T2C);
Rm[WS(rs, 1)] = T2J - T2K;
Rp[WS(rs, 6)] = T2J + T2K;
T2O = KP353553390 * (T2M - T2N);
Ip[WS(rs, 6)] = T2L + T2O;
Im[WS(rs, 1)] = T2O - T2L;
}
}
/* Stage 7: remaining sixteen outputs -- Rp/Ip at offsets 1, 3, 5, 7 and
 * Rm/Im at offsets 0, 2, 4, 6. */
{
E T30, T3w, T3F, T3j, T2X, T3G, T3D, T3L, T3m, T3v, T38, T3q, T3A, T3K, T3f;
E T3r;
{
E T2T, T2W, T34, T37;
T30 = KP500000000 * (T2Y - T2Z);
T3w = KP500000000 * (T2Z + T2Y);
T3F = KP500000000 * (T3h - T3i);
T3j = KP500000000 * (T3h + T3i);
T2T = T2R - T2S;
T2W = T2U + T2V;
T2X = KP353553390 * (T2T + T2W);
T3G = KP353553390 * (T2T - T2W);
{
E T3B, T3C, T3k, T3l;
T3B = T3a + T39;
T3C = T3d - T3c;
T3D = FNMS(KP461939766, T3C, KP191341716 * T3B);
T3L = FMA(KP461939766, T3B, KP191341716 * T3C);
T3k = T2S + T2R;
T3l = T2U - T2V;
T3m = KP353553390 * (T3k + T3l);
T3v = KP353553390 * (T3l - T3k);
}
T34 = T32 + T33;
T37 = T35 - T36;
T38 = FMA(KP191341716, T34, KP461939766 * T37);
T3q = FNMS(KP191341716, T37, KP461939766 * T34);
{
E T3y, T3z, T3b, T3e;
T3y = T33 - T32;
T3z = T36 + T35;
T3A = FMA(KP461939766, T3y, KP191341716 * T3z);
T3K = FNMS(KP461939766, T3z, KP191341716 * T3y);
T3b = T39 - T3a;
T3e = T3c + T3d;
T3f = FNMS(KP191341716, T3e, KP461939766 * T3b);
T3r = FMA(KP191341716, T3b, KP461939766 * T3e);
}
}
{
E T31, T3g, T3t, T3u;
T31 = T2X + T30;
T3g = T38 + T3f;
Ip[WS(rs, 1)] = T31 + T3g;
Im[WS(rs, 6)] = T3g - T31;
T3t = T3j + T3m;
T3u = T3q + T3r;
Rm[WS(rs, 6)] = T3t - T3u;
Rp[WS(rs, 1)] = T3t + T3u;
}
{
E T3n, T3o, T3p, T3s;
T3n = T3j - T3m;
T3o = T3f - T38;
Rm[WS(rs, 2)] = T3n - T3o;
Rp[WS(rs, 5)] = T3n + T3o;
T3p = T30 - T2X;
T3s = T3q - T3r;
Ip[WS(rs, 5)] = T3p + T3s;
Im[WS(rs, 2)] = T3s - T3p;
}
{
E T3x, T3E, T3N, T3O;
T3x = T3v + T3w;
T3E = T3A + T3D;
Ip[WS(rs, 3)] = T3x + T3E;
Im[WS(rs, 4)] = T3E - T3x;
T3N = T3F + T3G;
T3O = T3K + T3L;
Rm[WS(rs, 4)] = T3N - T3O;
Rp[WS(rs, 3)] = T3N + T3O;
}
{
E T3H, T3I, T3J, T3M;
T3H = T3F - T3G;
T3I = T3D - T3A;
Rm[0] = T3H - T3I;
Rp[WS(rs, 7)] = T3H + T3I;
T3J = T3w - T3v;
T3M = T3K - T3L;
Ip[WS(rs, 7)] = T3J + T3M;
Im[0] = T3M - T3J;
}
}
}
}
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 16) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm against the
 * tw_instr definition, which is not visible in this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA build of the size-16 codelet: size, name string,
 * twiddle list, a pointer to GENUS (not defined in this file), and the counts
 * quoted in the generator comment above the kernel (168 additions,
 * 62 multiplications, 38 fused multiply/adds) followed by a trailing 0.
 */
static const hc2c_desc desc = { 16, "hc2cfdft_16", twinstr, &GENUS, { 168, 62, 38, 0 } };
/*
 * Registration entry point: hands the kernel hc2cfdft_16 and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft_16) (planner *p) {
X(khc2c_register) (p, hc2cfdft_16, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,133 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cfdft_2 -include rdft/scalar/hc2cf.h */
/*
* This function contains 10 FP additions, 8 FP multiplications,
* (or, 8 additions, 6 multiplications, 2 fused multiply/add),
* 16 stack variables, 1 constants, and 8 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * Size-2 hc2cfdft kernel, build variant used when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) it reads Rp[0], Ip[0], Rm[0], Im[0] and the twiddle
 * pair W[0], W[1], and overwrites the same four array elements.  Rp and Ip
 * advance by +ms per iteration, Rm and Im by -ms, W by 2.
 *
 * The two products with the Rm - Rp difference are kept in their own
 * variables before being handed to FNMS/FMA, exactly as the generator
 * emitted them.
 */
static void hc2cfdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Advance W by (mb - 1) twiddle pairs before the first iteration. */
          W += (mb - 1) * 2;
          for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
               /* Every load precedes the first store below. */
               E ip0 = Ip[0];
               E im0 = Im[0];
               E i_diff = ip0 - im0;
               E i_sum = ip0 + im0;
               E rm0 = Rm[0];
               E rp0 = Rp[0];
               E r_diff = rm0 - rp0;
               E r_sum = rp0 + rm0;
               E w1 = W[1];
               E w1_r_diff = w1 * r_diff;
               E w0 = W[0];
               E w0_r_diff = w0 * r_diff;
               E tw_a = FNMS(w1, i_sum, w0_r_diff);
               E tw_b = FMA(w0, i_sum, w1_r_diff);

               Ip[0] = KP500000000 * (i_diff + tw_a);
               Im[0] = KP500000000 * (tw_a - i_diff);
               Rm[0] = KP500000000 * (r_sum - tw_b);
               Rp[0] = KP500000000 * (r_sum + tw_b);
          }
     }
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 2) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm against the
 * tw_instr definition, which is not visible in this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA build of the size-2 codelet: size, name string,
 * twiddle list, a pointer to GENUS (not defined in this file), and the counts
 * quoted in the generator comment above the kernel (8 additions,
 * 6 multiplications, 2 fused multiply/adds) followed by a trailing 0.
 */
static const hc2c_desc desc = { 2, "hc2cfdft_2", twinstr, &GENUS, { 8, 6, 2, 0 } };
/*
 * Registration entry point: hands the kernel hc2cfdft_2 and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft_2) (planner *p) {
X(khc2c_register) (p, hc2cfdft_2, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cfdft_2 -include rdft/scalar/hc2cf.h */
/*
* This function contains 10 FP additions, 8 FP multiplications,
* (or, 8 additions, 6 multiplications, 2 fused multiply/add),
* 10 stack variables, 1 constants, and 8 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * Size-2 hc2cfdft kernel, build variant used when neither ARCH_PREFERS_FMA
 * nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) it reads Rp[0], Ip[0], Rm[0], Im[0] and the twiddle
 * pair W[0], W[1], and overwrites the same four array elements.  Rp and Ip
 * advance by +ms per iteration, Rm and Im by -ms, W by 2.
 */
static void hc2cfdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Advance W by (mb - 1) twiddle pairs before the first iteration. */
          W += (mb - 1) * 2;
          for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
               /* Every load precedes the first store below. */
               E ip0 = Ip[0];
               E im0 = Im[0];
               E i_diff = ip0 - im0;
               E i_sum = ip0 + im0;
               E rm0 = Rm[0];
               E rp0 = Rp[0];
               E r_diff = rm0 - rp0;
               E r_sum = rp0 + rm0;
               E w0 = W[0];
               E w1 = W[1];
               E tw_a = FNMS(w1, i_sum, w0 * r_diff);
               E tw_b = FMA(w1, r_diff, w0 * i_sum);

               Ip[0] = KP500000000 * (i_diff + tw_a);
               Rp[0] = KP500000000 * (r_sum + tw_b);
               Im[0] = KP500000000 * (tw_a - i_diff);
               Rm[0] = KP500000000 * (r_sum - tw_b);
          }
     }
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 2) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm against the
 * tw_instr definition, which is not visible in this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA build of the size-2 codelet: size, name string,
 * twiddle list, a pointer to GENUS (not defined in this file), and the counts
 * quoted in the generator comment above the kernel (8 additions,
 * 6 multiplications, 2 fused multiply/adds) followed by a trailing 0.
 */
static const hc2c_desc desc = { 2, "hc2cfdft_2", twinstr, &GENUS, { 8, 6, 2, 0 } };
/*
 * Registration entry point: hands the kernel hc2cfdft_2 and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft_2) (planner *p) {
X(khc2c_register) (p, hc2cfdft_2, &desc, HC2C_VIA_DFT);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,218 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cfdft_4 -include rdft/scalar/hc2cf.h */
/*
* This function contains 30 FP additions, 20 FP multiplications,
* (or, 24 additions, 14 multiplications, 6 fused multiply/add),
* 31 stack variables, 1 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * Size-4 hc2cfdft kernel, build variant used when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) it reads the elements at offsets 0 and WS(rs, 1) of
 * Rp, Ip, Rm and Im together with the six twiddle values W[0..5], and writes
 * eight results back to the same locations.  Rp and Ip advance by +ms per
 * iteration, Rm and Im by -ms, W by 6.
 *
 * Each product that feeds an FMA/FNMS is kept in its own variable, exactly
 * as the generator emitted it.
 */
static void hc2cfdft_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Advance W by (mb - 1) groups of six twiddle values. */
          W += (mb - 1) * 6;
          for (m = mb; m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               /* Offset 0: sum/difference pairs and their products with W[0], W[1]. */
               E ip0 = Ip[0];
               E im0 = Im[0];
               E i0_diff = ip0 - im0;
               E i0_sum = ip0 + im0;
               E rm0 = Rm[0];
               E rp0 = Rp[0];
               E r0_diff = rm0 - rp0;
               E r0_sum = rp0 + rm0;
               E w1 = W[1];
               E w1_r0 = w1 * r0_diff;
               E w0 = W[0];
               E w0_r0 = w0 * r0_diff;

               /* Offset 1: sum/difference pairs and their products with W[2], W[4]. */
               E ip1 = Ip[WS(rs, 1)];
               E im1 = Im[WS(rs, 1)];
               E i1_diff = ip1 - im1;
               E i1_sum = ip1 + im1;
               E rp1 = Rp[WS(rs, 1)];
               E rm1 = Rm[WS(rs, 1)];
               E r1_sum = rp1 + rm1;
               E r1_diff = rp1 - rm1;
               E w2 = W[2];
               E w2_i1 = w2 * i1_diff;
               E w2_r1 = w2 * r1_sum;
               E w4 = W[4];
               E w4_r1 = w4 * r1_diff;
               E w4_i1 = w4 * i1_sum;

               /* Finish the six twiddled terms. */
               E a0 = FNMS(w1, i0_sum, w0_r0);
               E b0 = FMA(w0, i0_sum, w1_r0);
               E w3 = W[3];
               E a1 = FNMS(w3, r1_sum, w2_i1);
               E b1 = FMA(w3, i1_diff, w2_r1);
               E w5 = W[5];
               E a2 = FMA(w5, i1_sum, w4_r1);
               E b2 = FNMS(w5, r1_diff, w4_i1);

               /* Pairwise combinations written out below. */
               E s_i = a1 + i0_diff;
               E d_a = a0 - a2;
               E s_r = r0_sum + b1;
               E s_b = b2 + b0;
               E d_r = r0_sum - b1;
               E s_a = a2 + a0;
               E d_i = i0_diff - a1;
               E d_b = b2 - b0;

               Ip[0] = KP500000000 * (s_i + d_a);
               Im[WS(rs, 1)] = KP500000000 * (d_a - s_i);
               Rm[WS(rs, 1)] = KP500000000 * (s_r - s_b);
               Rp[0] = KP500000000 * (s_r + s_b);
               Rm[0] = KP500000000 * (d_r - s_a);
               Rp[WS(rs, 1)] = KP500000000 * (d_r + s_a);
               Ip[WS(rs, 1)] = KP500000000 * (d_i + d_b);
               Im[0] = KP500000000 * (d_b - d_i);
          }
     }
}
/*
 * Twiddle instruction list referenced by the descriptor below: one TW_FULL
 * entry with arguments (1, 4) followed by a TW_NEXT entry with (1, 0).
 * NOTE(review): TW_NEXT presumably terminates the list -- confirm against the
 * tw_instr definition, which is not visible in this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA build of the size-4 codelet: size, name string,
 * twiddle list, a pointer to GENUS (not defined in this file), and the counts
 * quoted in the generator comment above the kernel (24 additions,
 * 14 multiplications, 6 fused multiply/adds) followed by a trailing 0.
 */
static const hc2c_desc desc = { 4, "hc2cfdft_4", twinstr, &GENUS, { 24, 14, 6, 0 } };
/*
 * Registration entry point: hands the kernel hc2cfdft_4 and its descriptor
 * to X(khc2c_register) on planner p, tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft_4) (planner *p) {
X(khc2c_register) (p, hc2cfdft_4, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cfdft_4 -include rdft/scalar/hc2cf.h */
/*
* This function contains 30 FP additions, 20 FP multiplications,
* (or, 24 additions, 14 multiplications, 6 fused multiply/add),
* 18 stack variables, 1 constants, and 16 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft_4 -- generated size-4 kernel (gen_hc2cdft, -dit), non-FMA build.
 *
 * For every m in [mb, me) the loop reads two elements (offsets 0 and
 * WS(rs, 1)) from each of Rp, Ip, Rm and Im, combines them with the six
 * twiddle values W[0..5], and stores eight results back into the same
 * locations.  All loads of an iteration precede its stores.
 *
 * Per iteration Rp and Ip move by +ms, Rm and Im move by -ms, and W moves
 * by 6; W is pre-offset by (mb - 1) * 6 before the first iteration.
 *
 * E, R, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * outside this block (presumably the included headers).  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project's macro definitions.
 */
static void hc2cfdft_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E Tc, Tr, Tk, Tx, T9, Ts, Tp, Tw;
{
/* Offset-0 inputs: sums/differences, then the W[0], W[1] products. */
E Ta, Tb, Tj, Tf, Tg, Th, Te, Ti;
Ta = Ip[0];
Tb = Im[0];
Tj = Ta + Tb;
Tf = Rm[0];
Tg = Rp[0];
Th = Tf - Tg;
Tc = Ta - Tb;
Tr = Tg + Tf;
Te = W[0];
Ti = W[1];
Tk = FNMS(Ti, Tj, Te * Th);
Tx = FMA(Ti, Th, Te * Tj);
}
{
/* Offset-WS(rs, 1) inputs, combined with W[2..3] and W[4..5]. */
E T4, To, T8, Tm;
{
E T2, T3, T6, T7;
T2 = Ip[WS(rs, 1)];
T3 = Im[WS(rs, 1)];
T4 = T2 - T3;
To = T2 + T3;
T6 = Rp[WS(rs, 1)];
T7 = Rm[WS(rs, 1)];
T8 = T6 + T7;
Tm = T6 - T7;
}
{
E T1, T5, Tl, Tn;
T1 = W[2];
T5 = W[3];
T9 = FNMS(T5, T8, T1 * T4);
Ts = FMA(T1, T8, T5 * T4);
Tl = W[4];
Tn = W[5];
Tp = FMA(Tl, Tm, Tn * To);
Tw = FNMS(Tn, Tm, Tl * To);
}
}
{
/* First four outputs; every store is scaled by KP500000000 (0.5). */
E Td, Tq, Tz, TA;
Td = T9 + Tc;
Tq = Tk - Tp;
Ip[0] = KP500000000 * (Td + Tq);
Im[WS(rs, 1)] = KP500000000 * (Tq - Td);
Tz = Tr + Ts;
TA = Tw + Tx;
Rm[WS(rs, 1)] = KP500000000 * (Tz - TA);
Rp[0] = KP500000000 * (Tz + TA);
}
{
/* Remaining four outputs, same 0.5 scaling. */
E Tt, Tu, Tv, Ty;
Tt = Tr - Ts;
Tu = Tp + Tk;
Rm[0] = KP500000000 * (Tt - Tu);
Rp[WS(rs, 1)] = KP500000000 * (Tt + Tu);
Tv = Tc - T9;
Ty = Tw - Tx;
Ip[WS(rs, 1)] = KP500000000 * (Tv + Ty);
Im[0] = KP500000000 * (Ty - Tv);
}
}
}
}
/* Twiddle-instruction table referenced by desc below.  The meaning of the
   tw_instr fields and of TW_FULL / TW_NEXT is defined outside this view. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor.  The count table { 24, 14, 6, 0 } repeats the
   "24 additions, 14 multiplications, 6 fused multiply/add" figures from the
   generator comment of this (non-FMA) variant; the leading 4 is presumably
   the transform size -- confirm against hc2c_desc. */
static const hc2c_desc desc = { 4, "hc2cfdft_4", twinstr, &GENUS, { 24, 14, 6, 0 } };
/* Registers the hc2cfdft_4 kernel and its descriptor with planner p,
   passing HC2C_VIA_DFT to X(khc2c_register). */
void X(codelet_hc2cfdft_4) (planner *p) {
X(khc2c_register) (p, hc2cfdft_4, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,339 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cfdft_6 -include rdft/scalar/hc2cf.h */
/*
* This function contains 58 FP additions, 44 FP multiplications,
* (or, 36 additions, 22 multiplications, 22 fused multiply/add),
* 27 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft_6 -- generated size-6 kernel (gen_hc2cdft, -fma -dit), FMA build.
 *
 * For every m in [mb, me) the loop reads three elements (offsets 0,
 * WS(rs, 1) and WS(rs, 2)) from each of Rp, Ip, Rm and Im, combines them
 * with the ten twiddle values W[0..9], and stores twelve results back into
 * the same locations.  All loads of an iteration precede its stores.
 *
 * Per iteration Rp and Ip move by +ms, Rm and Im move by -ms, and W moves
 * by 10; W is pre-offset by (mb - 1) * 10 before the first iteration.
 *
 * E, R, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * outside this block (presumably the included headers).  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project's macro definitions.
 */
static void hc2cfdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
E T3, TQ, TJ, T12, Tu, TX, TB, T10, Td, TS, Tk, TV;
{
/* Offset-0 inputs and their W[0], W[1] products. */
E T1, T2, TI, TD, TE, TF;
T1 = Ip[0];
T2 = Im[0];
TI = T1 + T2;
TD = Rm[0];
TE = Rp[0];
TF = TD - TE;
T3 = T1 - T2;
TQ = TE + TD;
{
E TC, TG, TH, T11;
TC = W[0];
TG = TC * TF;
TH = W[1];
T11 = TH * TF;
TJ = FNMS(TH, TI, TG);
T12 = FMA(TC, TI, T11);
}
}
{
/* Offset-WS(rs, 2) inputs, combined with W[8..9] and W[6..7]. */
E To, TA, Tt, Tx;
{
E Tm, Tn, Tr, Ts;
Tm = Rm[WS(rs, 2)];
Tn = Rp[WS(rs, 2)];
To = Tm - Tn;
TA = Tn + Tm;
Tr = Ip[WS(rs, 2)];
Ts = Im[WS(rs, 2)];
Tt = Tr + Ts;
Tx = Tr - Ts;
}
{
E Tp, TW, Tl, Tq;
Tl = W[8];
Tp = Tl * To;
TW = Tl * Tt;
Tq = W[9];
Tu = FNMS(Tq, Tt, Tp);
TX = FMA(Tq, To, TW);
}
{
E Tw, Ty, Tz, TZ;
Tw = W[6];
Ty = Tw * Tx;
Tz = W[7];
TZ = Tz * Tx;
TB = FNMS(Tz, TA, Ty);
T10 = FMA(Tw, TA, TZ);
}
}
{
/* Offset-WS(rs, 1) inputs, combined with W[4..5] and W[2..3]. */
E T7, Tg, Tc, Tj;
{
E T5, T6, Ta, Tb;
T5 = Ip[WS(rs, 1)];
T6 = Im[WS(rs, 1)];
T7 = T5 + T6;
Tg = T5 - T6;
Ta = Rp[WS(rs, 1)];
Tb = Rm[WS(rs, 1)];
Tc = Ta - Tb;
Tj = Ta + Tb;
}
{
E T4, T8, T9, TR;
T4 = W[5];
T8 = T4 * T7;
T9 = W[4];
TR = T9 * T7;
Td = FMA(T9, Tc, T8);
TS = FNMS(T4, Tc, TR);
}
{
E Tf, Th, Ti, TU;
Tf = W[2];
Th = Tf * Tg;
Ti = W[3];
TU = Ti * Tg;
Tk = FNMS(Ti, Tj, Th);
TV = FMA(Tf, Tj, TU);
}
}
{
/* First six outputs; each store carries a KP500000000 (0.5) factor. */
E Te, T1d, TL, T1g, T1c, T1e, T19, T1f;
Te = T3 - Td;
T1d = TQ + TS;
{
E Tv, TK, T1a, T1b;
Tv = Tk + Tu;
TK = TB + TJ;
TL = Tv + TK;
T1g = Tv - TK;
T1a = TV + TX;
T1b = T10 + T12;
T1c = T1a - T1b;
T1e = T1a + T1b;
}
Ip[0] = KP500000000 * (Te + TL);
Rp[0] = KP500000000 * (T1d + T1e);
T19 = FNMS(KP500000000, TL, Te);
Ip[WS(rs, 2)] = KP500000000 * (FMA(KP866025403, T1c, T19));
Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP866025403, T1c, T19)));
T1f = FNMS(KP500000000, T1e, T1d);
Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP866025403, T1g, T1f));
Rm[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T1g, T1f));
}
{
/* Remaining six outputs, same 0.5 scaling. */
E TP, TT, TO, T16, T14, T18, T15, T17;
TP = Td + T3;
TT = TQ - TS;
{
E TM, TN, TY, T13;
TM = Tu - Tk;
TN = TJ - TB;
TO = TM + TN;
T16 = TN - TM;
TY = TV - TX;
T13 = T10 - T12;
T14 = TY + T13;
T18 = T13 - TY;
}
Im[WS(rs, 2)] = KP500000000 * (TO - TP);
Rm[WS(rs, 2)] = KP500000000 * (TT + T14);
T15 = FNMS(KP500000000, T14, TT);
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T16, T15));
Rm[0] = KP500000000 * (FNMS(KP866025403, T16, T15));
T17 = FMA(KP500000000, TO, TP);
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T18, T17));
Im[0] = -(KP500000000 * (FNMS(KP866025403, T18, T17)));
}
}
}
}
/* Twiddle-instruction table referenced by desc below.  The meaning of the
   tw_instr fields and of TW_FULL / TW_NEXT is defined outside this view. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 6 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor.  The count table { 36, 22, 22, 0 } repeats the
   "36 additions, 22 multiplications, 22 fused multiply/add" figures from the
   generator comment of this (FMA) variant; the leading 6 is presumably the
   transform size -- confirm against hc2c_desc. */
static const hc2c_desc desc = { 6, "hc2cfdft_6", twinstr, &GENUS, { 36, 22, 22, 0 } };
/* Registers the hc2cfdft_6 kernel and its descriptor with planner p,
   passing HC2C_VIA_DFT to X(khc2c_register). */
void X(codelet_hc2cfdft_6) (planner *p) {
X(khc2c_register) (p, hc2cfdft_6, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cfdft_6 -include rdft/scalar/hc2cf.h */
/*
* This function contains 58 FP additions, 36 FP multiplications,
* (or, 44 additions, 22 multiplications, 14 fused multiply/add),
* 40 stack variables, 3 constants, and 24 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft_6 -- generated size-6 kernel (gen_hc2cdft, -dit), non-FMA build.
 *
 * For every m in [mb, me) the loop reads three elements (offsets 0,
 * WS(rs, 1) and WS(rs, 2)) from each of Rp, Ip, Rm and Im, combines them
 * with the ten twiddle values W[0..9], and stores twelve results back into
 * the same locations.  All loads of an iteration precede its stores.
 *
 * Per iteration Rp and Ip move by +ms, Rm and Im move by -ms, and W moves
 * by 10; W is pre-offset by (mb - 1) * 10 before the first iteration.
 *
 * The constants are 0.25, 0.5 and 0.4330127..., the last being numerically
 * half of 0.8660254....
 *
 * E, R, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * outside this block (presumably the included headers).  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project's macro definitions.
 */
static void hc2cfdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP433012701, +0.433012701892219323381861585376468091735701313);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
E T3, TM, Tc, TN, Ts, T10, TI, TR, TF, T11, TH, TU;
{
E T1, T2, TD, Tz, TA, TB, T7, Tf, Tb, Th, Tq, Tw, Tm, Tu, T4;
E T8;
{
/* Load all twelve inputs and form pairwise sums and differences. */
E T5, T6, T9, Ta;
T1 = Ip[0];
T2 = Im[0];
TD = T1 + T2;
Tz = Rm[0];
TA = Rp[0];
TB = Tz - TA;
T5 = Ip[WS(rs, 1)];
T6 = Im[WS(rs, 1)];
T7 = T5 + T6;
Tf = T5 - T6;
T9 = Rp[WS(rs, 1)];
Ta = Rm[WS(rs, 1)];
Tb = T9 - Ta;
Th = T9 + Ta;
{
E To, Tp, Tk, Tl;
To = Rp[WS(rs, 2)];
Tp = Rm[WS(rs, 2)];
Tq = To - Tp;
Tw = To + Tp;
Tk = Ip[WS(rs, 2)];
Tl = Im[WS(rs, 2)];
Tm = Tk + Tl;
Tu = Tk - Tl;
}
}
T3 = T1 - T2;
TM = TA + Tz;
/* Twiddle products: W[4..5], then W[2..3] with W[8..9], then
   W[6..7] with W[0..1]. */
T4 = W[5];
T8 = W[4];
Tc = FMA(T4, T7, T8 * Tb);
TN = FNMS(T4, Tb, T8 * T7);
{
E Ti, TP, Tr, TQ;
{
E Te, Tg, Tj, Tn;
Te = W[2];
Tg = W[3];
Ti = FNMS(Tg, Th, Te * Tf);
TP = FMA(Tg, Tf, Te * Th);
Tj = W[9];
Tn = W[8];
Tr = FMA(Tj, Tm, Tn * Tq);
TQ = FNMS(Tj, Tq, Tn * Tm);
}
Ts = Ti - Tr;
T10 = TP + TQ;
TI = Ti + Tr;
TR = TP - TQ;
}
{
E Tx, TS, TE, TT;
{
E Tt, Tv, Ty, TC;
Tt = W[6];
Tv = W[7];
Tx = FNMS(Tv, Tw, Tt * Tu);
TS = FMA(Tv, Tu, Tt * Tw);
Ty = W[0];
TC = W[1];
TE = FNMS(TC, TD, Ty * TB);
TT = FMA(TC, TB, Ty * TD);
}
TF = Tx + TE;
T11 = TS + TT;
TH = TE - Tx;
TU = TS - TT;
}
}
/* Four output groups of three stores each follow. */
{
E T12, Td, TG, TZ;
T12 = KP433012701 * (T10 - T11);
Td = T3 - Tc;
TG = Ts + TF;
TZ = FNMS(KP250000000, TG, KP500000000 * Td);
Ip[0] = KP500000000 * (Td + TG);
Im[WS(rs, 1)] = T12 - TZ;
Ip[WS(rs, 2)] = TZ + T12;
}
{
E T16, T13, T14, T15;
T16 = KP433012701 * (Ts - TF);
T13 = TM + TN;
T14 = T10 + T11;
T15 = FNMS(KP250000000, T14, KP500000000 * T13);
Rp[WS(rs, 2)] = T15 - T16;
Rp[0] = KP500000000 * (T13 + T14);
Rm[WS(rs, 1)] = T16 + T15;
}
{
E TY, TJ, TK, TX;
TY = KP433012701 * (TU - TR);
TJ = TH - TI;
TK = Tc + T3;
TX = FMA(KP500000000, TK, KP250000000 * TJ);
Im[WS(rs, 2)] = KP500000000 * (TJ - TK);
Im[0] = TY - TX;
Ip[WS(rs, 1)] = TX + TY;
}
{
E TL, TO, TV, TW;
TL = KP433012701 * (TI + TH);
TO = TM - TN;
TV = TR + TU;
TW = FNMS(KP250000000, TV, KP500000000 * TO);
Rp[WS(rs, 1)] = TL + TW;
Rm[WS(rs, 2)] = KP500000000 * (TO + TV);
Rm[0] = TW - TL;
}
}
}
}
/* Twiddle-instruction table referenced by desc below.  The meaning of the
   tw_instr fields and of TW_FULL / TW_NEXT is defined outside this view. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 6 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor.  The count table { 44, 22, 14, 0 } repeats the
   "44 additions, 22 multiplications, 14 fused multiply/add" figures from the
   generator comment of this (non-FMA) variant; the leading 6 is presumably
   the transform size -- confirm against hc2c_desc. */
static const hc2c_desc desc = { 6, "hc2cfdft_6", twinstr, &GENUS, { 44, 22, 14, 0 } };
/* Registers the hc2cfdft_6 kernel and its descriptor with planner p,
   passing HC2C_VIA_DFT to X(khc2c_register). */
void X(codelet_hc2cfdft_6) (planner *p) {
X(khc2c_register) (p, hc2cfdft_6, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,437 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include rdft/scalar/hc2cf.h */
/*
* This function contains 82 FP additions, 52 FP multiplications,
* (or, 60 additions, 30 multiplications, 22 fused multiply/add),
* 31 stack variables, 2 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft_8 -- generated size-8 kernel (gen_hc2cdft, -fma -dit), FMA build.
 *
 * For every m in [mb, me) the loop reads four elements (offsets 0 and
 * WS(rs, 1..3)) from each of Rp, Ip, Rm and Im, combines them with the
 * fourteen twiddle values W[0..13], and stores sixteen results back into
 * the same locations.  All loads of an iteration precede its stores.
 *
 * Per iteration Rp and Ip move by +ms, Rm and Im move by -ms, and W moves
 * by 14; W is pre-offset by (mb - 1) * 14 before the first iteration.
 *
 * E, R, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * outside this block (presumably the included headers).  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project's macro definitions.
 */
static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
E Ty, T14, TO, T1o, Tv, T16, TG, T1m, Ta, T19, TV, T1h, Tk, T1b, T11;
E T1j;
{
/* Offset-0 inputs and their W[0], W[1] products. */
E Tw, Tx, TN, TI, TJ, TK;
Tw = Ip[0];
Tx = Im[0];
TN = Tw + Tx;
TI = Rm[0];
TJ = Rp[0];
TK = TI - TJ;
Ty = Tw - Tx;
T14 = TJ + TI;
{
E TH, TL, TM, T1n;
TH = W[0];
TL = TH * TK;
TM = W[1];
T1n = TM * TK;
TO = FNMS(TM, TN, TL);
T1o = FMA(TH, TN, T1n);
}
}
{
/* Offset-WS(rs, 2) inputs, combined with W[6..7] and W[8..9]. */
E Tp, TF, Tu, TC;
{
E Tn, To, Ts, Tt;
Tn = Ip[WS(rs, 2)];
To = Im[WS(rs, 2)];
Tp = Tn - To;
TF = Tn + To;
Ts = Rp[WS(rs, 2)];
Tt = Rm[WS(rs, 2)];
Tu = Ts + Tt;
TC = Tt - Ts;
}
{
E Tq, T15, Tm, Tr;
Tm = W[6];
Tq = Tm * Tp;
T15 = Tm * Tu;
Tr = W[7];
Tv = FNMS(Tr, Tu, Tq);
T16 = FMA(Tr, Tp, T15);
}
{
E TB, TD, TE, T1l;
TB = W[8];
TD = TB * TC;
TE = W[9];
T1l = TE * TC;
TG = FNMS(TE, TF, TD);
T1m = FMA(TB, TF, T1l);
}
}
{
/* Offset-WS(rs, 1) inputs, combined with W[2..3] and W[4..5]. */
E T4, TU, T9, TR;
{
E T2, T3, T7, T8;
T2 = Ip[WS(rs, 1)];
T3 = Im[WS(rs, 1)];
T4 = T2 - T3;
TU = T2 + T3;
T7 = Rp[WS(rs, 1)];
T8 = Rm[WS(rs, 1)];
T9 = T7 + T8;
TR = T7 - T8;
}
{
E T5, T18, T1, T6;
T1 = W[2];
T5 = T1 * T4;
T18 = T1 * T9;
T6 = W[3];
Ta = FNMS(T6, T9, T5);
T19 = FMA(T6, T4, T18);
}
{
E TS, T1g, TQ, TT;
TQ = W[4];
TS = TQ * TR;
T1g = TQ * TU;
TT = W[5];
TV = FMA(TT, TU, TS);
T1h = FNMS(TT, TR, T1g);
}
}
{
/* Offset-WS(rs, 3) inputs, combined with W[10..11] and W[12..13]. */
E Te, T10, Tj, TX;
{
E Tc, Td, Th, Ti;
Tc = Ip[WS(rs, 3)];
Td = Im[WS(rs, 3)];
Te = Tc - Td;
T10 = Tc + Td;
Th = Rp[WS(rs, 3)];
Ti = Rm[WS(rs, 3)];
Tj = Th + Ti;
TX = Th - Ti;
}
{
E Tf, T1a, Tb, Tg;
Tb = W[10];
Tf = Tb * Te;
T1a = Tb * Tj;
Tg = W[11];
Tk = FNMS(Tg, Tj, Tf);
T1b = FMA(Tg, Te, T1a);
}
{
E TY, T1i, TW, TZ;
TW = W[12];
TY = TW * TX;
T1i = TW * T10;
TZ = W[13];
T11 = FMA(TZ, T10, TY);
T1j = FNMS(TZ, TX, T1i);
}
}
{
/* First eight outputs: plain sums/differences scaled by 0.5. */
E TA, T1f, T1q, T1s, T13, T1e, T1d, T1r;
{
E Tl, Tz, T1k, T1p;
Tl = Ta + Tk;
Tz = Tv + Ty;
TA = Tl + Tz;
T1f = Tz - Tl;
T1k = T1h + T1j;
T1p = T1m + T1o;
T1q = T1k - T1p;
T1s = T1k + T1p;
}
{
E TP, T12, T17, T1c;
TP = TG + TO;
T12 = TV + T11;
T13 = TP - T12;
T1e = T12 + TP;
T17 = T14 + T16;
T1c = T19 + T1b;
T1d = T17 - T1c;
T1r = T17 + T1c;
}
Ip[0] = KP500000000 * (TA + T13);
Rp[0] = KP500000000 * (T1r + T1s);
Im[WS(rs, 3)] = KP500000000 * (T13 - TA);
Rm[WS(rs, 3)] = KP500000000 * (T1r - T1s);
Rm[WS(rs, 1)] = KP500000000 * (T1d - T1e);
Im[WS(rs, 1)] = KP500000000 * (T1q - T1f);
Rp[WS(rs, 2)] = KP500000000 * (T1d + T1e);
Ip[WS(rs, 2)] = KP500000000 * (T1f + T1q);
}
{
/* Remaining eight outputs: these also involve KP707106781. */
E T1v, T1H, T1F, T1L, T1y, T1I, T1B, T1J;
{
E T1t, T1u, T1D, T1E;
T1t = Ty - Tv;
T1u = T19 - T1b;
T1v = T1t - T1u;
T1H = T1u + T1t;
T1D = T14 - T16;
T1E = Ta - Tk;
T1F = T1D - T1E;
T1L = T1D + T1E;
}
{
E T1w, T1x, T1z, T1A;
T1w = T1j - T1h;
T1x = TV - T11;
T1y = T1w + T1x;
T1I = T1w - T1x;
T1z = TO - TG;
T1A = T1o - T1m;
T1B = T1z - T1A;
T1J = T1z + T1A;
}
{
E T1C, T1M, T1G, T1K;
T1C = T1y + T1B;
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1C, T1v));
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP707106781, T1C, T1v)));
T1M = T1I + T1J;
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP707106781, T1M, T1L));
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1M, T1L));
T1G = T1B - T1y;
Rm[0] = KP500000000 * (FNMS(KP707106781, T1G, T1F));
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1G, T1F));
T1K = T1I - T1J;
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1K, T1H));
Im[0] = -(KP500000000 * (FNMS(KP707106781, T1K, T1H)));
}
}
}
}
}
/* Twiddle-instruction table referenced by desc below.  The meaning of the
   tw_instr fields and of TW_FULL / TW_NEXT is defined outside this view. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor.  The count table { 60, 30, 22, 0 } repeats the
   "60 additions, 30 multiplications, 22 fused multiply/add" figures from the
   generator comment of this (FMA) variant; the leading 8 is presumably the
   transform size -- confirm against hc2c_desc. */
static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, { 60, 30, 22, 0 } };
/* Registers the hc2cfdft_8 kernel and its descriptor with planner p,
   passing HC2C_VIA_DFT to X(khc2c_register). */
void X(codelet_hc2cfdft_8) (planner *p) {
X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
}
#else
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include rdft/scalar/hc2cf.h */
/*
* This function contains 82 FP additions, 44 FP multiplications,
* (or, 68 additions, 30 multiplications, 14 fused multiply/add),
* 39 stack variables, 2 constants, and 32 memory accesses
*/
#include "rdft/scalar/hc2cf.h"
/*
 * hc2cfdft_8 -- generated size-8 kernel (gen_hc2cdft, -dit), non-FMA build.
 *
 * For every m in [mb, me) the loop reads four elements (offsets 0 and
 * WS(rs, 1..3)) from each of Rp, Ip, Rm and Im, combines them with the
 * fourteen twiddle values W[0..13], and stores sixteen results back into
 * the same locations.  All loads of an iteration precede its stores.
 *
 * Per iteration Rp and Ip move by +ms, Rm and Im move by -ms, and W moves
 * by 14; W is pre-offset by (mb - 1) * 14 before the first iteration.
 *
 * The constants are 0.5 and 0.3535533..., the latter being numerically
 * half of 0.7071067....
 *
 * E, R, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * outside this block (presumably the included headers).  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project's macro definitions.
 */
static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
E Tv, TX, Ts, TY, TE, T1a, TJ, T19, T1l, T1m, T9, T10, Ti, T11, TP;
E T16, TU, T17, T1i, T1j;
{
/* Inputs at offsets 0 and WS(rs, 2), combined with W[0..1] and W[6..9]. */
E Tt, Tu, TD, Tz, TA, TB, Tn, TI, Tr, TG, Tk, To;
Tt = Ip[0];
Tu = Im[0];
TD = Tt + Tu;
Tz = Rm[0];
TA = Rp[0];
TB = Tz - TA;
{
E Tl, Tm, Tp, Tq;
Tl = Ip[WS(rs, 2)];
Tm = Im[WS(rs, 2)];
Tn = Tl - Tm;
TI = Tl + Tm;
Tp = Rp[WS(rs, 2)];
Tq = Rm[WS(rs, 2)];
Tr = Tp + Tq;
TG = Tp - Tq;
}
Tv = Tt - Tu;
TX = TA + Tz;
Tk = W[6];
To = W[7];
Ts = FNMS(To, Tr, Tk * Tn);
TY = FMA(Tk, Tr, To * Tn);
{
E Ty, TC, TF, TH;
Ty = W[0];
TC = W[1];
TE = FNMS(TC, TD, Ty * TB);
T1a = FMA(TC, TB, Ty * TD);
TF = W[8];
TH = W[9];
TJ = FMA(TF, TG, TH * TI);
T19 = FNMS(TH, TG, TF * TI);
}
T1l = TJ + TE;
T1m = T1a - T19;
}
{
/* Inputs at offsets WS(rs, 1) and WS(rs, 3), combined with W[2..5]
   and W[10..13]. */
E T4, TO, T8, TM, Td, TT, Th, TR;
{
E T2, T3, T6, T7;
T2 = Ip[WS(rs, 1)];
T3 = Im[WS(rs, 1)];
T4 = T2 - T3;
TO = T2 + T3;
T6 = Rp[WS(rs, 1)];
T7 = Rm[WS(rs, 1)];
T8 = T6 + T7;
TM = T6 - T7;
}
{
E Tb, Tc, Tf, Tg;
Tb = Ip[WS(rs, 3)];
Tc = Im[WS(rs, 3)];
Td = Tb - Tc;
TT = Tb + Tc;
Tf = Rp[WS(rs, 3)];
Tg = Rm[WS(rs, 3)];
Th = Tf + Tg;
TR = Tf - Tg;
}
{
E T1, T5, Ta, Te;
T1 = W[2];
T5 = W[3];
T9 = FNMS(T5, T8, T1 * T4);
T10 = FMA(T1, T8, T5 * T4);
Ta = W[10];
Te = W[11];
Ti = FNMS(Te, Th, Ta * Td);
T11 = FMA(Ta, Th, Te * Td);
{
E TL, TN, TQ, TS;
TL = W[4];
TN = W[5];
TP = FMA(TL, TM, TN * TO);
T16 = FNMS(TN, TM, TL * TO);
TQ = W[12];
TS = W[13];
TU = FMA(TQ, TR, TS * TT);
T17 = FNMS(TS, TR, TQ * TT);
}
T1i = T17 - T16;
T1j = TP - TU;
}
}
{
/* Eight outputs whose terms are pre-scaled by KP500000000 or
   KP353553390 before the final add/subtract. */
E T1h, T1t, T1w, T1y, T1o, T1s, T1r, T1x;
{
E T1f, T1g, T1u, T1v;
T1f = Tv - Ts;
T1g = T10 - T11;
T1h = KP500000000 * (T1f - T1g);
T1t = KP500000000 * (T1g + T1f);
T1u = T1i - T1j;
T1v = T1l + T1m;
T1w = KP353553390 * (T1u - T1v);
T1y = KP353553390 * (T1u + T1v);
}
{
E T1k, T1n, T1p, T1q;
T1k = T1i + T1j;
T1n = T1l - T1m;
T1o = KP353553390 * (T1k + T1n);
T1s = KP353553390 * (T1n - T1k);
T1p = TX - TY;
T1q = T9 - Ti;
T1r = KP500000000 * (T1p - T1q);
T1x = KP500000000 * (T1p + T1q);
}
Ip[WS(rs, 1)] = T1h + T1o;
Rp[WS(rs, 1)] = T1x + T1y;
Im[WS(rs, 2)] = T1o - T1h;
Rm[WS(rs, 2)] = T1x - T1y;
Rm[0] = T1r - T1s;
Im[0] = T1w - T1t;
Rp[WS(rs, 3)] = T1r + T1s;
Ip[WS(rs, 3)] = T1t + T1w;
}
{
/* Remaining eight outputs: plain sums/differences scaled by 0.5. */
E Tx, T15, T1c, T1e, TW, T14, T13, T1d;
{
E Tj, Tw, T18, T1b;
Tj = T9 + Ti;
Tw = Ts + Tv;
Tx = Tj + Tw;
T15 = Tw - Tj;
T18 = T16 + T17;
T1b = T19 + T1a;
T1c = T18 - T1b;
T1e = T18 + T1b;
}
{
E TK, TV, TZ, T12;
TK = TE - TJ;
TV = TP + TU;
TW = TK - TV;
T14 = TV + TK;
TZ = TX + TY;
T12 = T10 + T11;
T13 = TZ - T12;
T1d = TZ + T12;
}
Ip[0] = KP500000000 * (Tx + TW);
Rp[0] = KP500000000 * (T1d + T1e);
Im[WS(rs, 3)] = KP500000000 * (TW - Tx);
Rm[WS(rs, 3)] = KP500000000 * (T1d - T1e);
Rm[WS(rs, 1)] = KP500000000 * (T13 - T14);
Im[WS(rs, 1)] = KP500000000 * (T1c - T15);
Rp[WS(rs, 2)] = KP500000000 * (T13 + T14);
Ip[WS(rs, 2)] = KP500000000 * (T15 + T1c);
}
}
}
}
/* Twiddle-instruction table referenced by desc below.  The meaning of the
   tw_instr fields and of TW_FULL / TW_NEXT is defined outside this view. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 8 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor.  The count table { 68, 30, 14, 0 } repeats the
   "68 additions, 30 multiplications, 14 fused multiply/add" figures from the
   generator comment of this (non-FMA) variant; the leading 8 is presumably
   the transform size -- confirm against hc2c_desc. */
static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, { 68, 30, 14, 0 } };
/* Registers the hc2cfdft_8 kernel and its descriptor with planner p,
   passing HC2C_VIA_DFT to X(khc2c_register). */
void X(codelet_hc2cfdft_8) (planner *p) {
X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
}
#endif

View File

@@ -0,0 +1,836 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:18 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include rdft/scalar/hf.h */
/*
* This function contains 196 FP additions, 134 FP multiplications,
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
* 90 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf2_16 -- generated size-16 twiddle kernel (gen_hc2hc, -fma -twiddle-log3
 * -precompute-twiddles -dit), FMA build.
 *
 * For every m in [mb, me) the loop loads only eight twiddle values
 * W[0..7], derives the remaining twiddle factors from them by products,
 * reads cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..15, and stores 32
 * results back into the same cr/ci locations.  All loads of an iteration
 * precede its stores.
 *
 * Per iteration cr moves by +ms, ci moves by -ms, and W moves by 8; W is
 * pre-offset by (mb - 1) * 8 before the first iteration.
 *
 * E, R, INT, stride, WS, DK, FMA, FNMS, FMS and MAKE_VOLATILE_STRIDE come
 * from outside this block (presumably the included headers).  FMA(a, b, c),
 * FNMS(a, b, c) and FMS(a, b, c) are presumably a*b + c, c - a*b and
 * a*b - c -- confirm against the project's macro definitions.
 */
static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
{
/* Load W[0..7] and build the derived twiddle factors used below. */
E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
T2 = W[0];
Tf = W[2];
Tg = T2 * Tf;
TM = W[6];
TN = T2 * TM;
TO = W[7];
TS = T2 * TO;
T3 = W[4];
T4 = T2 * T3;
Tp = Tf * T3;
T6 = W[5];
Ta = T2 * T6;
Tt = Tf * T6;
T5 = W[1];
Th = W[3];
Tl = T2 * Th;
Tz = FMA(T5, Th, Tg);
Ti = FNMS(T5, Th, Tg);
T7 = FMA(T5, T6, T4);
TZ = FNMS(Th, T3, Tt);
TT = FNMS(T5, TM, TS);
Tq = FNMS(Th, T6, Tp);
TW = FMA(Th, T6, Tp);
Tb = FNMS(T5, T3, Ta);
Tu = FMA(Th, T3, Tt);
TP = FMA(T5, TO, TN);
TI = FMA(T5, T3, Ta);
TF = FNMS(T5, T6, T4);
{
E T1y, T1C, T1e, T1i;
T1y = Tz * T3;
T1C = Tz * T6;
TC = FNMS(T5, Tf, Tl);
T1z = FMA(TC, T6, T1y);
T1O = FMA(TC, T3, T1C);
T1D = FNMS(TC, T3, T1C);
T1L = FNMS(TC, T6, T1y);
T1e = Ti * T3;
T1i = Ti * T6;
Tm = FMA(T5, Tf, Tl);
T1f = FMA(Tm, T6, T1e);
T1p = FMA(Tm, T3, T1i);
T1j = FNMS(Tm, T3, T1i);
T1m = FNMS(Tm, T6, T1e);
}
}
{
E Te, T1U, T3A, T3M, T1G, T2w, T2I, T3h, T1R, T2D, T2B, T3i, Tx, T3L, T1Z;
E T3w, TL, T21, T26, T38, T1d, T2h, T2s, T3c, T1s, T2t, T2m, T3d, T12, T28;
E T2d, T37;
/* Eight input stages follow: each loads two cr/ci pairs, multiplies
   them by a twiddle pair, and forms their sum and difference. */
{
/* Elements 0 (untwiddled) and 8. */
E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
T1 = cr[0];
T3z = ci[0];
T8 = cr[WS(rs, 8)];
T9 = T7 * T8;
Tc = ci[WS(rs, 8)];
T3x = T7 * Tc;
Td = FMA(Tb, Tc, T9);
Te = T1 + Td;
T1U = T1 - Td;
T3y = FNMS(Tb, T8, T3x);
T3A = T3y + T3z;
T3M = T3z - T3y;
}
{
/* Elements 15 and 7. */
E T1u, T1v, T1w, T2E, T1A, T1B, T1E, T2G;
T1u = cr[WS(rs, 15)];
T1v = TM * T1u;
T1w = ci[WS(rs, 15)];
T2E = TM * T1w;
T1A = cr[WS(rs, 7)];
T1B = T1z * T1A;
T1E = ci[WS(rs, 7)];
T2G = T1z * T1E;
{
E T1x, T1F, T2F, T2H;
T1x = FMA(TO, T1w, T1v);
T1F = FMA(T1D, T1E, T1B);
T1G = T1x + T1F;
T2w = T1x - T1F;
T2F = FNMS(TO, T1u, T2E);
T2H = FNMS(T1D, T1A, T2G);
T2I = T2F - T2H;
T3h = T2F + T2H;
}
}
{
/* Elements 3 and 11. */
E T1H, T1I, T1J, T2x, T1M, T1N, T1P, T2z;
T1H = cr[WS(rs, 3)];
T1I = Tf * T1H;
T1J = ci[WS(rs, 3)];
T2x = Tf * T1J;
T1M = cr[WS(rs, 11)];
T1N = T1L * T1M;
T1P = ci[WS(rs, 11)];
T2z = T1L * T1P;
{
E T1K, T1Q, T2y, T2A;
T1K = FMA(Th, T1J, T1I);
T1Q = FMA(T1O, T1P, T1N);
T1R = T1K + T1Q;
T2D = T1Q - T1K;
T2y = FNMS(Th, T1H, T2x);
T2A = FNMS(T1O, T1M, T2z);
T2B = T2y - T2A;
T3i = T2y + T2A;
}
}
{
/* Elements 4 and 12. */
E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
Tj = cr[WS(rs, 4)];
Tk = Ti * Tj;
Tn = ci[WS(rs, 4)];
T1V = Ti * Tn;
Tr = cr[WS(rs, 12)];
Ts = Tq * Tr;
Tv = ci[WS(rs, 12)];
T1X = Tq * Tv;
{
E To, Tw, T1W, T1Y;
To = FMA(Tm, Tn, Tk);
Tw = FMA(Tu, Tv, Ts);
Tx = To + Tw;
T3L = To - Tw;
T1W = FNMS(Tm, Tj, T1V);
T1Y = FNMS(Tu, Tr, T1X);
T1Z = T1W - T1Y;
T3w = T1W + T1Y;
}
}
{
/* Elements 2 and 10. */
E TA, TB, TD, T22, TG, TH, TJ, T24;
TA = cr[WS(rs, 2)];
TB = Tz * TA;
TD = ci[WS(rs, 2)];
T22 = Tz * TD;
TG = cr[WS(rs, 10)];
TH = TF * TG;
TJ = ci[WS(rs, 10)];
T24 = TF * TJ;
{
E TE, TK, T23, T25;
TE = FMA(TC, TD, TB);
TK = FMA(TI, TJ, TH);
TL = TE + TK;
T21 = TE - TK;
T23 = FNMS(TC, TA, T22);
T25 = FNMS(TI, TG, T24);
T26 = T23 - T25;
T38 = T23 + T25;
}
}
{
/* Elements 1 and 9. */
E T15, T16, T17, T2o, T19, T1a, T1b, T2q;
T15 = cr[WS(rs, 1)];
T16 = T2 * T15;
T17 = ci[WS(rs, 1)];
T2o = T2 * T17;
T19 = cr[WS(rs, 9)];
T1a = T3 * T19;
T1b = ci[WS(rs, 9)];
T2q = T3 * T1b;
{
E T18, T1c, T2p, T2r;
T18 = FMA(T5, T17, T16);
T1c = FMA(T6, T1b, T1a);
T1d = T18 + T1c;
T2h = T18 - T1c;
T2p = FNMS(T5, T15, T2o);
T2r = FNMS(T6, T19, T2q);
T2s = T2p - T2r;
T3c = T2p + T2r;
}
}
{
/* Elements 5 and 13. */
E T1g, T1h, T1k, T2i, T1n, T1o, T1q, T2k;
T1g = cr[WS(rs, 5)];
T1h = T1f * T1g;
T1k = ci[WS(rs, 5)];
T2i = T1f * T1k;
T1n = cr[WS(rs, 13)];
T1o = T1m * T1n;
T1q = ci[WS(rs, 13)];
T2k = T1m * T1q;
{
E T1l, T1r, T2j, T2l;
T1l = FMA(T1j, T1k, T1h);
T1r = FMA(T1p, T1q, T1o);
T1s = T1l + T1r;
T2t = T1l - T1r;
T2j = FNMS(T1j, T1g, T2i);
T2l = FNMS(T1p, T1n, T2k);
T2m = T2j - T2l;
T3d = T2j + T2l;
}
}
{
/* Elements 14 and 6. */
E TQ, TR, TU, T29, TX, TY, T10, T2b;
TQ = cr[WS(rs, 14)];
TR = TP * TQ;
TU = ci[WS(rs, 14)];
T29 = TP * TU;
TX = cr[WS(rs, 6)];
TY = TW * TX;
T10 = ci[WS(rs, 6)];
T2b = TW * T10;
{
E TV, T11, T2a, T2c;
TV = FMA(TT, TU, TR);
T11 = FMA(TZ, T10, TY);
T12 = TV + T11;
T28 = TV - T11;
T2a = FNMS(TT, TQ, T29);
T2c = FNMS(TZ, TX, T2b);
T2d = T2a - T2c;
T37 = T2a + T2c;
}
}
/* Output stages: three groups storing 8, 8 and 16 values. */
{
E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
{
E Ty, T13, T3v, T3B;
Ty = Te + Tx;
T13 = TL + T12;
T14 = Ty + T13;
T3q = Ty - T13;
T3v = T38 + T37;
T3B = T3w + T3A;
T3C = T3v + T3B;
T3E = T3B - T3v;
}
{
E T1t, T1S, T3r, T3s;
T1t = T1d + T1s;
T1S = T1G + T1R;
T1T = T1t + T1S;
T3D = T1S - T1t;
T3r = T3h + T3i;
T3s = T3c + T3d;
T3t = T3r - T3s;
T3u = T3s + T3r;
}
ci[WS(rs, 7)] = T14 - T1T;
cr[WS(rs, 12)] = T3D - T3E;
ci[WS(rs, 11)] = T3D + T3E;
cr[0] = T14 + T1T;
cr[WS(rs, 4)] = T3q - T3t;
cr[WS(rs, 8)] = T3u - T3C;
ci[WS(rs, 15)] = T3u + T3C;
ci[WS(rs, 3)] = T3q + T3t;
}
{
E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
{
E T36, T39, T3F, T3G;
T36 = Te - Tx;
T39 = T37 - T38;
T3a = T36 - T39;
T3m = T36 + T39;
T3F = TL - T12;
T3G = T3A - T3w;
T3H = T3F + T3G;
T3J = T3G - T3F;
}
{
E T3b, T3e, T3g, T3j;
T3b = T1d - T1s;
T3e = T3c - T3d;
T3f = T3b + T3e;
T3n = T3b - T3e;
T3g = T1G - T1R;
T3j = T3h - T3i;
T3k = T3g - T3j;
T3o = T3g + T3j;
}
{
E T3l, T3K, T3p, T3I;
T3l = T3f + T3k;
ci[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
cr[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
T3K = T3o - T3n;
cr[WS(rs, 10)] = FMS(KP707106781, T3K, T3J);
ci[WS(rs, 13)] = FMA(KP707106781, T3K, T3J);
T3p = T3n + T3o;
cr[WS(rs, 6)] = FNMS(KP707106781, T3p, T3m);
ci[WS(rs, 1)] = FMA(KP707106781, T3p, T3m);
T3I = T3k - T3f;
cr[WS(rs, 14)] = FMS(KP707106781, T3I, T3H);
ci[WS(rs, 9)] = FMA(KP707106781, T3I, T3H);
}
}
{
E T20, T3N, T3T, T2Q, T2f, T3U, T30, T33, T2T, T3O, T2v, T2N, T2X, T34, T2K;
E T2O;
{
E T27, T2e, T2n, T2u;
T20 = T1U - T1Z;
T3N = T3L + T3M;
T3T = T3M - T3L;
T2Q = T1U + T1Z;
T27 = T21 - T26;
T2e = T28 + T2d;
T2f = T27 + T2e;
T3U = T2e - T27;
{
E T2Y, T2Z, T2R, T2S;
T2Y = T2w + T2B;
T2Z = T2I + T2D;
T30 = FNMS(KP414213562, T2Z, T2Y);
T33 = FMA(KP414213562, T2Y, T2Z);
T2R = T21 + T26;
T2S = T28 - T2d;
T2T = T2R + T2S;
T3O = T2R - T2S;
}
T2n = T2h - T2m;
T2u = T2s + T2t;
T2v = FNMS(KP414213562, T2u, T2n);
T2N = FMA(KP414213562, T2n, T2u);
{
E T2V, T2W, T2C, T2J;
T2V = T2h + T2m;
T2W = T2s - T2t;
T2X = FMA(KP414213562, T2W, T2V);
T34 = FNMS(KP414213562, T2V, T2W);
T2C = T2w - T2B;
T2J = T2D - T2I;
T2K = FNMS(KP414213562, T2J, T2C);
T2O = FMA(KP414213562, T2C, T2J);
}
}
{
E T2g, T2L, T3V, T3W;
T2g = FMA(KP707106781, T2f, T20);
T2L = T2v + T2K;
cr[WS(rs, 7)] = FNMS(KP923879532, T2L, T2g);
ci[0] = FMA(KP923879532, T2L, T2g);
T3V = FMA(KP707106781, T3U, T3T);
T3W = T34 + T33;
cr[WS(rs, 9)] = FMS(KP923879532, T3W, T3V);
ci[WS(rs, 14)] = FMA(KP923879532, T3W, T3V);
}
{
E T3X, T3Y, T2M, T2P;
T3X = FNMS(KP707106781, T3U, T3T);
T3Y = T30 - T2X;
cr[WS(rs, 13)] = FMS(KP923879532, T3Y, T3X);
ci[WS(rs, 10)] = FMA(KP923879532, T3Y, T3X);
T2M = FNMS(KP707106781, T2f, T20);
T2P = T2N + T2O;
ci[WS(rs, 4)] = FNMS(KP923879532, T2P, T2M);
cr[WS(rs, 3)] = FMA(KP923879532, T2P, T2M);
}
{
E T2U, T31, T3P, T3Q;
T2U = FMA(KP707106781, T2T, T2Q);
T31 = T2X + T30;
ci[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
cr[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
T3P = FMA(KP707106781, T3O, T3N);
T3Q = T2O - T2N;
cr[WS(rs, 15)] = FMS(KP923879532, T3Q, T3P);
ci[WS(rs, 8)] = FMA(KP923879532, T3Q, T3P);
}
{
E T3R, T3S, T32, T35;
T3R = FNMS(KP707106781, T3O, T3N);
T3S = T2K - T2v;
cr[WS(rs, 11)] = FMS(KP923879532, T3S, T3R);
ci[WS(rs, 12)] = FMA(KP923879532, T3S, T3R);
T32 = FNMS(KP707106781, T2T, T2Q);
T35 = T33 - T34;
cr[WS(rs, 5)] = FNMS(KP923879532, T35, T32);
ci[WS(rs, 2)] = FMA(KP923879532, T35, T32);
}
}
}
}
}
}
/* Twiddle-instruction table referenced by desc below: four TW_CEXP entries
   with final fields 1, 3, 9 and 15, then a TW_NEXT terminator.  The kernel
   applies W[0..1], W[2..3], W[4..5] and W[6..7] directly to elements 1, 3,
   9 and 15 respectively.  The exact meaning of the tw_instr fields is
   defined outside this view. */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor.  The count table { 104, 42, 92, 0 } repeats the
   "104 additions, 42 multiplications, 92 fused multiply/add" figures from
   the generator comment of this (FMA) variant; the leading 16 is presumably
   the transform size -- confirm against hc2hc_desc. */
static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, { 104, 42, 92, 0 } };
/* Registers the hf2_16 kernel and its descriptor with planner p through
   X(khc2hc_register). */
void X(codelet_hf2_16) (planner *p) {
X(khc2hc_register) (p, hf2_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include rdft/scalar/hf.h */
/*
* This function contains 196 FP additions, 108 FP multiplications,
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
* 82 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf2_16 -- generated size-16 twiddle kernel (gen_hc2hc, -twiddle-log3
 * -precompute-twiddles -dit), non-FMA build.
 *
 * For every m in [mb, me) the loop loads only eight twiddle values
 * W[0..7], derives the remaining twiddle factors from them by products,
 * reads cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..15, and stores 32
 * results back into the same cr/ci locations.  All loads of an iteration
 * precede its stores.
 *
 * Per iteration cr moves by +ms, ci moves by -ms, and W moves by 8; W is
 * pre-offset by (mb - 1) * 8 before the first iteration.
 *
 * E, R, INT, stride, WS, DK, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * outside this block (presumably the included headers).  FMA(a, b, c) and
 * FNMS(a, b, c) are presumably a*b + c and c - a*b -- confirm against the
 * project's macro definitions.
 */
static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
{
/* Load W[0..7] and build the derived twiddle factors used below. */
E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
{
E Th, Tn, Tj, Tm;
T2 = W[0];
T5 = W[1];
Tg = W[2];
Ti = W[3];
Th = T2 * Tg;
Tn = T5 * Tg;
Tj = T5 * Ti;
Tm = T2 * Ti;
Tk = Th - Tj;
To = Tm + Tn;
TE = Tm - Tn;
TC = Th + Tj;
T6 = W[5];
T7 = T5 * T6;
Tv = Tg * T6;
Ta = T2 * T6;
Ts = Ti * T6;
T3 = W[4];
T4 = T2 * T3;
Tw = Ti * T3;
Tb = T5 * T3;
Tr = Tg * T3;
}
T8 = T4 + T7;
TW = Tv - Tw;
TJ = Ta + Tb;
Tt = Tr - Ts;
TU = Tr + Ts;
Tc = Ta - Tb;
Tx = Tv + Tw;
TH = T4 - T7;
TN = W[6];
TO = W[7];
TP = FMA(T2, TN, T5 * TO);
TR = FNMS(T5, TN, T2 * TO);
{
E T1d, T1e, T19, T1a;
T1d = Tk * T6;
T1e = To * T3;
T1f = T1d - T1e;
T1k = T1d + T1e;
T19 = Tk * T3;
T1a = To * T6;
T1b = T19 + T1a;
T1i = T19 - T1a;
}
{
E T1w, T1x, T1s, T1t;
T1w = TC * T6;
T1x = TE * T3;
T1y = T1w - T1x;
T1H = T1w + T1x;
T1s = TC * T3;
T1t = TE * T6;
T1u = T1s + T1t;
T1F = T1s - T1t;
}
}
{
E Tf, T3s, T1N, T3e, TA, T3r, T1Q, T3b, TM, T2N, T1W, T2w, TZ, T2M, T21;
E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2E, T2o, T2D, T18, T1n, T2Q, T2R;
E T2S, T2T, T28, T2B, T2d, T2A;
/* Input stages follow: each loads cr/ci pairs, multiplies them by a
   twiddle pair, and forms sums and differences. */
{
/* Elements 0 (untwiddled) and 8. */
E T1, T3d, Te, T3c, T9, Td;
T1 = cr[0];
T3d = ci[0];
T9 = cr[WS(rs, 8)];
Td = ci[WS(rs, 8)];
Te = FMA(T8, T9, Tc * Td);
T3c = FNMS(Tc, T9, T8 * Td);
Tf = T1 + Te;
T3s = T3d - T3c;
T1N = T1 - Te;
T3e = T3c + T3d;
}
{
/* Elements 4 and 12. */
E Tq, T1O, Tz, T1P;
{
E Tl, Tp, Tu, Ty;
Tl = cr[WS(rs, 4)];
Tp = ci[WS(rs, 4)];
Tq = FMA(Tk, Tl, To * Tp);
T1O = FNMS(To, Tl, Tk * Tp);
Tu = cr[WS(rs, 12)];
Ty = ci[WS(rs, 12)];
Tz = FMA(Tt, Tu, Tx * Ty);
T1P = FNMS(Tx, Tu, Tt * Ty);
}
TA = Tq + Tz;
T3r = Tq - Tz;
T1Q = T1O - T1P;
T3b = T1O + T1P;
}
{
/* Elements 2 and 10. */
E TG, T1T, TL, T1U, T1S, T1V;
{
E TD, TF, TI, TK;
TD = cr[WS(rs, 2)];
TF = ci[WS(rs, 2)];
TG = FMA(TC, TD, TE * TF);
T1T = FNMS(TE, TD, TC * TF);
TI = cr[WS(rs, 10)];
TK = ci[WS(rs, 10)];
TL = FMA(TH, TI, TJ * TK);
T1U = FNMS(TJ, TI, TH * TK);
}
TM = TG + TL;
T2N = T1T + T1U;
T1S = TG - TL;
T1V = T1T - T1U;
T1W = T1S - T1V;
T2w = T1S + T1V;
}
{
/* Elements 14 and 6. */
E TT, T1Y, TY, T1Z, T1X, T20;
{
E TQ, TS, TV, TX;
TQ = cr[WS(rs, 14)];
TS = ci[WS(rs, 14)];
TT = FMA(TP, TQ, TR * TS);
T1Y = FNMS(TR, TQ, TP * TS);
TV = cr[WS(rs, 6)];
TX = ci[WS(rs, 6)];
TY = FMA(TU, TV, TW * TX);
T1Z = FNMS(TW, TV, TU * TX);
}
TZ = TT + TY;
T2M = T1Y + T1Z;
T1X = TT - TY;
T20 = T1Y - T1Z;
T21 = T1X + T20;
T2x = T1X - T20;
}
{
/* Elements 15, 11, 7 and 3. */
E T1r, T2f, T1J, T2m, T1A, T2g, T1E, T2l;
{
E T1p, T1q, T1G, T1I;
T1p = cr[WS(rs, 15)];
T1q = ci[WS(rs, 15)];
T1r = FMA(TN, T1p, TO * T1q);
T2f = FNMS(TO, T1p, TN * T1q);
T1G = cr[WS(rs, 11)];
T1I = ci[WS(rs, 11)];
T1J = FMA(T1F, T1G, T1H * T1I);
T2m = FNMS(T1H, T1G, T1F * T1I);
}
{
E T1v, T1z, T1C, T1D;
T1v = cr[WS(rs, 7)];
T1z = ci[WS(rs, 7)];
T1A = FMA(T1u, T1v, T1y * T1z);
T2g = FNMS(T1y, T1v, T1u * T1z);
T1C = cr[WS(rs, 3)];
T1D = ci[WS(rs, 3)];
T1E = FMA(Tg, T1C, Ti * T1D);
T2l = FNMS(Ti, T1C, Tg * T1D);
}
T1B = T1r + T1A;
T1K = T1E + T1J;
T2V = T1B - T1K;
T2W = T2f + T2g;
T2X = T2l + T2m;
T2Y = T2W - T2X;
{
E T2h, T2i, T2k, T2n;
T2h = T2f - T2g;
T2i = T1E - T1J;
T2j = T2h + T2i;
T2E = T2h - T2i;
T2k = T1r - T1A;
T2n = T2l - T2m;
T2o = T2k - T2n;
T2D = T2k + T2n;
}
}
{
/* Elements 1, 13, 9 and 5. */
E T14, T29, T1m, T26, T17, T2a, T1h, T25;
{
E T12, T13, T1j, T1l;
T12 = cr[WS(rs, 1)];
T13 = ci[WS(rs, 1)];
T14 = FMA(T2, T12, T5 * T13);
T29 = FNMS(T5, T12, T2 * T13);
T1j = cr[WS(rs, 13)];
T1l = ci[WS(rs, 13)];
T1m = FMA(T1i, T1j, T1k * T1l);
T26 = FNMS(T1k, T1j, T1i * T1l);
}
{
E T15, T16, T1c, T1g;
T15 = cr[WS(rs, 9)];
T16 = ci[WS(rs, 9)];
T17 = FMA(T3, T15, T6 * T16);
T2a = FNMS(T6, T15, T3 * T16);
T1c = cr[WS(rs, 5)];
T1g = ci[WS(rs, 5)];
T1h = FMA(T1b, T1c, T1f * T1g);
T25 = FNMS(T1f, T1c, T1b * T1g);
}
T18 = T14 + T17;
T1n = T1h + T1m;
T2Q = T18 - T1n;
T2R = T29 + T2a;
T2S = T25 + T26;
T2T = T2R - T2S;
{
E T24, T27, T2b, T2c;
T24 = T14 - T17;
T27 = T25 - T26;
T28 = T24 - T27;
T2B = T24 + T27;
T2b = T29 - T2a;
T2c = T1h - T1m;
T2d = T2b + T2c;
T2A = T2b - T2c;
}
}
/* Output stages: four groups, each storing eight values. */
{
E T23, T2r, T3u, T3w, T2q, T3v, T2u, T3p;
{
E T1R, T22, T3q, T3t;
T1R = T1N - T1Q;
T22 = KP707106781 * (T1W + T21);
T23 = T1R + T22;
T2r = T1R - T22;
T3q = KP707106781 * (T2w - T2x);
T3t = T3r + T3s;
T3u = T3q + T3t;
T3w = T3t - T3q;
}
{
E T2e, T2p, T2s, T2t;
T2e = FNMS(KP382683432, T2d, KP923879532 * T28);
T2p = FMA(KP382683432, T2j, KP923879532 * T2o);
T2q = T2e + T2p;
T3v = T2p - T2e;
T2s = FMA(KP923879532, T2d, KP382683432 * T28);
T2t = FNMS(KP923879532, T2j, KP382683432 * T2o);
T2u = T2s + T2t;
T3p = T2t - T2s;
}
cr[WS(rs, 7)] = T23 - T2q;
cr[WS(rs, 11)] = T3v - T3w;
ci[WS(rs, 12)] = T3v + T3w;
ci[0] = T23 + T2q;
ci[WS(rs, 4)] = T2r - T2u;
cr[WS(rs, 15)] = T3p - T3u;
ci[WS(rs, 8)] = T3p + T3u;
cr[WS(rs, 3)] = T2r + T2u;
}
{
E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
{
E TB, T10, T3a, T3f;
TB = Tf + TA;
T10 = TM + TZ;
T11 = TB + T10;
T35 = TB - T10;
T3a = T2N + T2M;
T3f = T3b + T3e;
T3g = T3a + T3f;
T3i = T3f - T3a;
}
{
E T1o, T1L, T36, T37;
T1o = T18 + T1n;
T1L = T1B + T1K;
T1M = T1o + T1L;
T3h = T1L - T1o;
T36 = T2W + T2X;
T37 = T2R + T2S;
T38 = T36 - T37;
T39 = T37 + T36;
}
ci[WS(rs, 7)] = T11 - T1M;
cr[WS(rs, 12)] = T3h - T3i;
ci[WS(rs, 11)] = T3h + T3i;
cr[0] = T11 + T1M;
cr[WS(rs, 4)] = T35 - T38;
cr[WS(rs, 8)] = T39 - T3g;
ci[WS(rs, 15)] = T39 + T3g;
ci[WS(rs, 3)] = T35 + T38;
}
{
E T2z, T2H, T3A, T3C, T2G, T3B, T2K, T3x;
{
E T2v, T2y, T3y, T3z;
T2v = T1N + T1Q;
T2y = KP707106781 * (T2w + T2x);
T2z = T2v + T2y;
T2H = T2v - T2y;
T3y = KP707106781 * (T21 - T1W);
T3z = T3s - T3r;
T3A = T3y + T3z;
T3C = T3z - T3y;
}
{
E T2C, T2F, T2I, T2J;
T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
T2G = T2C + T2F;
T3B = T2F - T2C;
T2I = FNMS(KP923879532, T2A, KP382683432 * T2B);
T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
T2K = T2I + T2J;
T3x = T2J - T2I;
}
ci[WS(rs, 6)] = T2z - T2G;
cr[WS(rs, 13)] = T3B - T3C;
ci[WS(rs, 10)] = T3B + T3C;
cr[WS(rs, 1)] = T2z + T2G;
cr[WS(rs, 5)] = T2H - T2K;
cr[WS(rs, 9)] = T3x - T3A;
ci[WS(rs, 14)] = T3x + T3A;
ci[WS(rs, 2)] = T2H + T2K;
}
{
E T2P, T31, T3m, T3o, T30, T3j, T34, T3n;
{
E T2L, T2O, T3k, T3l;
T2L = Tf - TA;
T2O = T2M - T2N;
T2P = T2L - T2O;
T31 = T2L + T2O;
T3k = TM - TZ;
T3l = T3e - T3b;
T3m = T3k + T3l;
T3o = T3l - T3k;
}
{
E T2U, T2Z, T32, T33;
T2U = T2Q + T2T;
T2Z = T2V - T2Y;
T30 = KP707106781 * (T2U + T2Z);
T3j = KP707106781 * (T2Z - T2U);
T32 = T2Q - T2T;
T33 = T2V + T2Y;
T34 = KP707106781 * (T32 + T33);
T3n = KP707106781 * (T33 - T32);
}
ci[WS(rs, 5)] = T2P - T30;
cr[WS(rs, 10)] = T3n - T3o;
ci[WS(rs, 13)] = T3n + T3o;
cr[WS(rs, 2)] = T2P + T30;
cr[WS(rs, 6)] = T31 - T34;
cr[WS(rs, 14)] = T3j - T3m;
ci[WS(rs, 9)] = T3j + T3m;
ci[WS(rs, 1)] = T31 + T34;
}
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: four TW_CEXP rows
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably each TW_CEXP row requests one cos/sin pair for the
 * exponent in its last field (1, 3, 9, 15) -- confirm against the tw_instr
 * definition, which lives outside this file.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 9 },
{ TW_CEXP, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 16, name "hf2_16", the twiddle table above, the
 * GENUS object provided by the included headers, and a four-entry
 * operation-count record.
 * NOTE(review): the counts look like {additions, multiplications,
 * fused multiply/add, other} -- confirm against the hc2hc_desc declaration.
 */
static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, { 156, 68, 40, 0 } };
/* Registers the hf2_16 codelet together with its descriptor on planner `p`. */
void X(codelet_hf2_16) (planner *p) {
X(khc2hc_register) (p, hf2_16, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,200 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:18 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hf2_4 -include rdft/scalar/hf.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf2_4 (variant compiled when FMA is preferred): size-4 DIT twiddle codelet
 * emitted by gen_hc2hc (-n 4 -dit -twiddle-log3).
 *
 * cr, ci  - data pointers; every iteration loads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for k = 0..3 and overwrites the same slots.
 * W       - twiddle table; four values are read per iteration.
 * rs      - stride handed to WS() to locate element k.
 * mb, me  - the loop runs for m = mb .. me-1.
 * ms      - per-iteration step: cr moves forward by ms, ci backward by ms.
 */
static void hf2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
{
INT m;
/* W is first positioned at the 4-value group belonging to iteration mb. */
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
E T2, T6, T3, T5, T7, Tb, T4, Ta;
/*
 * Two twiddle pairs are loaded, (T2, T5) = (W[0], W[1]) and
 * (T3, T6) = (W[2], W[3]); the third pair (T7, Tb) is computed from
 * their products rather than read from the table.
 */
T2 = W[0];
T6 = W[3];
T3 = W[2];
T4 = T2 * T3;
Ta = T2 * T6;
T5 = W[1];
T7 = FMA(T5, T6, T4);
Tb = FNMS(T5, T3, Ta);
{
E T1, Tx, Td, Tw, Ti, Tq, Tm, Ts;
/* Element 0 is used unmodified. */
T1 = cr[0];
Tx = ci[0];
/*
 * Elements 1..3 are each rotated by a pair (c, s):
 * (re, im) -> (c*re + s*im, c*im - s*re).
 * Element 2 uses the derived pair (T7, Tb).
 */
{
E T8, T9, Tc, Tv;
T8 = cr[WS(rs, 2)];
T9 = T7 * T8;
Tc = ci[WS(rs, 2)];
Tv = T7 * Tc;
Td = FMA(Tb, Tc, T9);
Tw = FNMS(Tb, T8, Tv);
}
/* Element 1 uses the stored pair (T2, T5). */
{
E Tf, Tg, Th, Tp;
Tf = cr[WS(rs, 1)];
Tg = T2 * Tf;
Th = ci[WS(rs, 1)];
Tp = T2 * Th;
Ti = FMA(T5, Th, Tg);
Tq = FNMS(T5, Tf, Tp);
}
/* Element 3 uses the stored pair (T3, T6). */
{
E Tj, Tk, Tl, Tr;
Tj = cr[WS(rs, 3)];
Tk = T3 * Tj;
Tl = ci[WS(rs, 3)];
Tr = T3 * Tl;
Tm = FMA(T6, Tl, Tk);
Ts = FNMS(T6, Tj, Tr);
}
/* Sum/difference combinations stored to cr[0], cr[1], ci[0], ci[1]. */
{
E Te, Tn, To, Tt;
Te = T1 + Td;
Tn = Ti + Tm;
ci[WS(rs, 1)] = Te - Tn;
cr[0] = Te + Tn;
To = T1 - Td;
Tt = Tq - Ts;
ci[0] = To - Tt;
cr[WS(rs, 1)] = To + Tt;
}
/* Remaining combinations stored to cr[2], cr[3], ci[2], ci[3]. */
{
E Tu, Ty, Tz, TA;
Tu = Tq + Ts;
Ty = Tw + Tx;
cr[WS(rs, 2)] = Tu - Ty;
ci[WS(rs, 3)] = Tu + Ty;
Tz = Tm - Ti;
TA = Tx - Tw;
cr[WS(rs, 3)] = Tz - TA;
ci[WS(rs, 2)] = Tz + TA;
}
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: two TW_CEXP rows
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably each TW_CEXP row yields one cos/sin pair, which
 * would account for the four W values hf2_4 consumes per iteration --
 * confirm against the tw_instr definition outside this file.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 4, name "hf2_4", the twiddle table above, GENUS
 * from the included headers, and the operation counts {16, 8, 8, 0}; the
 * first three agree with the generator banner (16 additions,
 * 8 multiplications, 8 fused multiply/add).
 */
static const hc2hc_desc desc = { 4, "hf2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
/* Registers the hf2_4 codelet together with its descriptor on planner `p`. */
void X(codelet_hf2_4) (planner *p) {
X(khc2hc_register) (p, hf2_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hf2_4 -include rdft/scalar/hf.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf2_4 (variant compiled when FMA is not preferred): size-4 DIT twiddle
 * codelet emitted by gen_hc2hc (-n 4 -dit -twiddle-log3).
 *
 * cr, ci  - data pointers; every iteration loads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for k = 0..3 and overwrites the same slots.
 * W       - twiddle table; four values are read per iteration.
 * rs      - stride handed to WS() to locate element k.
 * mb, me  - the loop runs for m = mb .. me-1.
 * ms      - per-iteration step: cr moves forward by ms, ci backward by ms.
 */
static void hf2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
{
INT m;
/* W is first positioned at the 4-value group belonging to iteration mb. */
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
E T2, T4, T3, T5, T6, T8;
/*
 * Stored twiddle pairs: (T2, T4) = (W[0], W[1]) and (T3, T5) = (W[2], W[3]).
 * The pair (T6, T8) is computed from their products instead of being loaded.
 */
T2 = W[0];
T4 = W[1];
T3 = W[2];
T5 = W[3];
T6 = FMA(T2, T3, T4 * T5);
T8 = FNMS(T4, T3, T2 * T5);
{
E T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
/* Element 0 is used unmodified. */
T1 = cr[0];
Tp = ci[0];
/*
 * Each remaining element is rotated by a pair (c, s):
 * (re, im) -> (c*re + s*im, c*im - s*re).
 * Element 2 uses the derived pair (T6, T8).
 */
T7 = cr[WS(rs, 2)];
T9 = ci[WS(rs, 2)];
Ta = FMA(T6, T7, T8 * T9);
To = FNMS(T8, T7, T6 * T9);
/* Element 1 uses (T2, T4); element 3 uses (T3, T5). */
{
E Tc, Td, Tf, Tg;
Tc = cr[WS(rs, 1)];
Td = ci[WS(rs, 1)];
Te = FMA(T2, Tc, T4 * Td);
Tk = FNMS(T4, Tc, T2 * Td);
Tf = cr[WS(rs, 3)];
Tg = ci[WS(rs, 3)];
Th = FMA(T3, Tf, T5 * Tg);
Tl = FNMS(T5, Tf, T3 * Tg);
}
/* Sum/difference combinations stored to cr[0], cr[1], ci[0], ci[1]. */
{
E Tb, Ti, Tj, Tm;
Tb = T1 + Ta;
Ti = Te + Th;
ci[WS(rs, 1)] = Tb - Ti;
cr[0] = Tb + Ti;
Tj = T1 - Ta;
Tm = Tk - Tl;
ci[0] = Tj - Tm;
cr[WS(rs, 1)] = Tj + Tm;
}
/* Remaining combinations stored to cr[2], cr[3], ci[2], ci[3]. */
{
E Tn, Tq, Tr, Ts;
Tn = Tk + Tl;
Tq = To + Tp;
cr[WS(rs, 2)] = Tn - Tq;
ci[WS(rs, 3)] = Tn + Tq;
Tr = Th - Te;
Ts = Tp - To;
cr[WS(rs, 3)] = Tr - Ts;
ci[WS(rs, 2)] = Tr + Ts;
}
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: two TW_CEXP rows
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably each TW_CEXP row yields one cos/sin pair, which
 * would account for the four W values hf2_4 consumes per iteration --
 * confirm against the tw_instr definition outside this file.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 4, name "hf2_4", the twiddle table above, GENUS
 * from the included headers, and the operation counts {16, 8, 8, 0}; the
 * first three agree with the generator banner (16 additions,
 * 8 multiplications, 8 fused multiply/add).
 */
static const hc2hc_desc desc = { 4, "hf2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
/* Registers the hf2_4 codelet together with its descriptor on planner `p`. */
void X(codelet_hf2_4) (planner *p) {
X(khc2hc_register) (p, hf2_4, &desc);
}
#endif

View File

@@ -0,0 +1,264 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:20 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -dit -name hf2_5 -include rdft/scalar/hf.h */
/*
* This function contains 44 FP additions, 40 FP multiplications,
* (or, 14 additions, 10 multiplications, 30 fused multiply/add),
* 38 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf2_5 (variant compiled when FMA is preferred): size-5 DIT twiddle codelet
 * emitted by gen_hc2hc (-n 5 -dit -twiddle-log3).
 *
 * cr, ci  - data pointers; every iteration loads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for k = 0..4 and overwrites the same slots.
 * W       - twiddle table; four values are read per iteration.
 * rs      - stride handed to WS() to locate element k.
 * mb, me  - the loop runs for m = mb .. me-1.
 * ms      - per-iteration step: cr moves forward by ms, ci backward by ms.
 */
static void hf2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the size-5 combinations below. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
/* W is first positioned at the 4-value group belonging to iteration mb. */
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
E T2, Ta, T8, T5, Tb, Tm, Tf, Tj, T9, Te;
/*
 * Stored twiddle pairs: (T2, T5) = (W[0], W[1]) and (T8, Ta) = (W[2], W[3]).
 * Two further pairs, (Tb, Tf) and (Tj, Tm), are computed from their
 * products instead of being loaded.
 */
T2 = W[0];
Ta = W[3];
T8 = W[2];
T9 = T2 * T8;
Te = T2 * Ta;
T5 = W[1];
Tb = FNMS(T5, Ta, T9);
Tm = FNMS(T5, T8, Te);
Tf = FMA(T5, T8, Te);
Tj = FMA(T5, Ta, T9);
{
E T1, TL, T7, Th, Ti, Tz, TB, TM, To, Ts, Tt, TE, TG, TN;
/* Element 0 is used unmodified. */
T1 = cr[0];
TL = ci[0];
/*
 * Elements 1 and 4 rotated by (T2, T5) and (Tb, Tf) respectively:
 * (re, im) -> (c*re + s*im, c*im - s*re); Ti and TM hold their sums.
 */
{
E T3, T4, T6, Ty, Tc, Td, Tg, TA;
T3 = cr[WS(rs, 1)];
T4 = T2 * T3;
T6 = ci[WS(rs, 1)];
Ty = T2 * T6;
Tc = cr[WS(rs, 4)];
Td = Tb * Tc;
Tg = ci[WS(rs, 4)];
TA = Tb * Tg;
T7 = FMA(T5, T6, T4);
Th = FMA(Tf, Tg, Td);
Ti = T7 + Th;
Tz = FNMS(T5, T3, Ty);
TB = FNMS(Tf, Tc, TA);
TM = Tz + TB;
}
/* Elements 2 and 3 rotated by (Tj, Tm) and (T8, Ta); Tt and TN hold their sums. */
{
E Tk, Tl, Tn, TD, Tp, Tq, Tr, TF;
Tk = cr[WS(rs, 2)];
Tl = Tj * Tk;
Tn = ci[WS(rs, 2)];
TD = Tj * Tn;
Tp = cr[WS(rs, 3)];
Tq = T8 * Tp;
Tr = ci[WS(rs, 3)];
TF = T8 * Tr;
To = FMA(Tm, Tn, Tl);
Ts = FMA(Ta, Tr, Tq);
Tt = To + Ts;
TE = FNMS(Tm, Tk, TD);
TG = FNMS(Ta, Tp, TF);
TN = TE + TG;
}
/* Outputs cr[0], cr[1], cr[2], ci[0], ci[1]. */
{
E Tw, Tu, Tv, TI, TK, TC, TH, Tx, TJ;
Tw = Ti - Tt;
Tu = Ti + Tt;
Tv = FNMS(KP250000000, Tu, T1);
TC = Tz - TB;
TH = TE - TG;
TI = FMA(KP618033988, TH, TC);
TK = FNMS(KP618033988, TC, TH);
cr[0] = T1 + Tu;
Tx = FMA(KP559016994, Tw, Tv);
ci[0] = FNMS(KP951056516, TI, Tx);
cr[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
TJ = FNMS(KP559016994, Tw, Tv);
cr[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
ci[WS(rs, 1)] = FMA(KP951056516, TK, TJ);
}
/* Outputs cr[3], cr[4], ci[2], ci[3], ci[4]. */
{
E TQ, TO, TP, TU, TW, TS, TT, TV, TR;
TQ = TM - TN;
TO = TM + TN;
TP = FNMS(KP250000000, TO, TL);
TS = To - Ts;
TT = Th - T7;
TU = FMA(KP618033988, TT, TS);
TW = FNMS(KP618033988, TS, TT);
ci[WS(rs, 4)] = TO + TL;
TV = FMA(KP559016994, TQ, TP);
cr[WS(rs, 4)] = FMS(KP951056516, TW, TV);
ci[WS(rs, 3)] = FMA(KP951056516, TW, TV);
TR = FNMS(KP559016994, TQ, TP);
cr[WS(rs, 3)] = FMS(KP951056516, TU, TR);
ci[WS(rs, 2)] = FMA(KP951056516, TU, TR);
}
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: two TW_CEXP rows
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably each TW_CEXP row yields one cos/sin pair, which
 * would account for the four W values hf2_5 consumes per iteration --
 * confirm against the tw_instr definition outside this file.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 5, name "hf2_5", the twiddle table above, GENUS
 * from the included headers, and the operation counts {14, 10, 30, 0}; the
 * first three agree with the generator banner (14 additions,
 * 10 multiplications, 30 fused multiply/add).
 */
static const hc2hc_desc desc = { 5, "hf2_5", twinstr, &GENUS, { 14, 10, 30, 0 } };
/* Registers the hf2_5 codelet together with its descriptor on planner `p`. */
void X(codelet_hf2_5) (planner *p) {
X(khc2hc_register) (p, hf2_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -dit -name hf2_5 -include rdft/scalar/hf.h */
/*
* This function contains 44 FP additions, 32 FP multiplications,
* (or, 30 additions, 18 multiplications, 14 fused multiply/add),
* 37 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf2_5 (variant compiled when FMA is not preferred): size-5 DIT twiddle
 * codelet emitted by gen_hc2hc (-n 5 -dit -twiddle-log3).
 *
 * cr, ci  - data pointers; every iteration loads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for k = 0..4 and overwrites the same slots.
 * W       - twiddle table; four values are read per iteration.
 * rs      - stride handed to WS() to locate element k.
 * mb, me  - the loop runs for m = mb .. me-1.
 * ms      - per-iteration step: cr moves forward by ms, ci backward by ms.
 */
static void hf2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the size-5 combinations below. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT m;
/* W is first positioned at the 4-value group belonging to iteration mb. */
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
/*
 * Stored twiddle pairs: (T2, T4) = (W[0], W[1]) and (T7, T9) = (W[2], W[3]).
 * The pairs (Tb, Tf) and (Tj, Tl) are built from the four cross products.
 */
{
E T8, Te, Ta, Td;
T2 = W[0];
T4 = W[1];
T7 = W[2];
T9 = W[3];
T8 = T2 * T7;
Te = T4 * T7;
Ta = T4 * T9;
Td = T2 * T9;
Tb = T8 - Ta;
Tl = Td - Te;
Tf = Td + Te;
Tj = T8 + Ta;
}
{
E T1, TI, Ty, TB, TG, TF, TJ, TK, TL, Ti, Tr, Ts;
/* Element 0 is used unmodified. */
T1 = cr[0];
TI = ci[0];
{
E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
/*
 * Each element k = 1..4 is rotated by a pair (c, s):
 * (re, im) -> (c*re + s*im, c*im - s*re).
 * Element 1 uses (T2, T4); element 3 uses (T7, T9).
 */
{
E T3, T5, To, Tp;
T3 = cr[WS(rs, 1)];
T5 = ci[WS(rs, 1)];
T6 = FMA(T2, T3, T4 * T5);
Tw = FNMS(T4, T3, T2 * T5);
To = cr[WS(rs, 3)];
Tp = ci[WS(rs, 3)];
Tq = FMA(T7, To, T9 * Tp);
TA = FNMS(T9, To, T7 * Tp);
}
/* Element 4 uses (Tb, Tf); element 2 uses (Tj, Tl). */
{
E Tc, Tg, Tk, Tm;
Tc = cr[WS(rs, 4)];
Tg = ci[WS(rs, 4)];
Th = FMA(Tb, Tc, Tf * Tg);
Tx = FNMS(Tf, Tc, Tb * Tg);
Tk = cr[WS(rs, 2)];
Tm = ci[WS(rs, 2)];
Tn = FMA(Tj, Tk, Tl * Tm);
Tz = FNMS(Tl, Tk, Tj * Tm);
}
/* Pairwise sums and differences shared by the output stages. */
Ty = Tw - Tx;
TB = Tz - TA;
TG = Tn - Tq;
TF = Th - T6;
TJ = Tw + Tx;
TK = Tz + TA;
TL = TJ + TK;
Ti = T6 + Th;
Tr = Tn + Tq;
Ts = Ti + Tr;
}
cr[0] = T1 + Ts;
/* Outputs cr[1], cr[2], ci[0], ci[1]. */
{
E TC, TE, Tv, TD, Tt, Tu;
TC = FMA(KP951056516, Ty, KP587785252 * TB);
TE = FNMS(KP587785252, Ty, KP951056516 * TB);
Tt = KP559016994 * (Ti - Tr);
Tu = FNMS(KP250000000, Ts, T1);
Tv = Tt + Tu;
TD = Tu - Tt;
ci[0] = Tv - TC;
ci[WS(rs, 1)] = TD + TE;
cr[WS(rs, 1)] = Tv + TC;
cr[WS(rs, 2)] = TD - TE;
}
ci[WS(rs, 4)] = TL + TI;
/* Outputs cr[3], cr[4], ci[2], ci[3]. */
{
E TH, TP, TO, TQ, TM, TN;
TH = FMA(KP587785252, TF, KP951056516 * TG);
TP = FNMS(KP587785252, TG, KP951056516 * TF);
TM = FNMS(KP250000000, TL, TI);
TN = KP559016994 * (TJ - TK);
TO = TM - TN;
TQ = TN + TM;
cr[WS(rs, 3)] = TH - TO;
ci[WS(rs, 3)] = TP + TQ;
ci[WS(rs, 2)] = TH + TO;
cr[WS(rs, 4)] = TP - TQ;
}
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: two TW_CEXP rows
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably each TW_CEXP row yields one cos/sin pair, which
 * would account for the four W values hf2_5 consumes per iteration --
 * confirm against the tw_instr definition outside this file.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 5, name "hf2_5", the twiddle table above, GENUS
 * from the included headers, and the operation counts {30, 18, 14, 0}; the
 * first three agree with the generator banner (30 additions,
 * 18 multiplications, 14 fused multiply/add).
 */
static const hc2hc_desc desc = { 5, "hf2_5", twinstr, &GENUS, { 30, 18, 14, 0 } };
/* Registers the hf2_5 codelet together with its descriptor on planner `p`. */
void X(codelet_hf2_5) (planner *p) {
X(khc2hc_register) (p, hf2_5, &desc);
}
#endif

View File

@@ -0,0 +1,390 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:18 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include rdft/scalar/hf.h */
/*
* This function contains 74 FP additions, 50 FP multiplications,
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
* 48 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf2_8 (variant compiled when FMA is preferred): size-8 DIT twiddle codelet
 * emitted by gen_hc2hc (-n 8 -dit -twiddle-log3).
 *
 * cr, ci  - data pointers; every iteration loads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for k = 0..7 and overwrites the same slots.
 * W       - twiddle table; six values are read per iteration.
 * rs      - stride handed to WS() to locate element k.
 * mb, me  - the loop runs for m = mb .. me-1.
 * ms      - per-iteration step: cr moves forward by ms, ci backward by ms.
 */
static void hf2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constant: the value equals sqrt(1/2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W is first positioned at the 6-value group belonging to iteration mb. */
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
/*
 * Stored twiddle pairs: (T2, T5) = (W[0], W[1]), (T3, T6) = (W[2], W[3]),
 * (Tl, Tn) = (W[4], W[5]). Four more pairs -- (Tf, Ti), (T7, Tb),
 * (To, Ts), (TC, TG) -- are computed from products of those values
 * instead of being loaded.
 */
{
E T4, Tm, Tr, Ta, TB, TF;
T2 = W[0];
T3 = W[2];
T4 = T2 * T3;
Tl = W[4];
Tm = T2 * Tl;
Tn = W[5];
Tr = T2 * Tn;
T5 = W[1];
T6 = W[3];
Ta = T2 * T6;
Tf = FMA(T5, T6, T4);
T7 = FNMS(T5, T6, T4);
Ts = FNMS(T5, Tl, Tr);
Tb = FMA(T5, T3, Ta);
To = FMA(T5, Tn, Tm);
TB = Tf * Tl;
TF = Tf * Tn;
Ti = FNMS(T5, T3, Ta);
TC = FMA(Ti, Tn, TB);
TG = FNMS(Ti, Tl, TF);
}
{
E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
E TI, T11, T13, T15, T16;
/* Element 0 is used unmodified. */
T1 = cr[0];
T1s = ci[0];
/*
 * Each element k = 1..7 is rotated by a pair (c, s):
 * (re, im) -> (c*re + s*im, c*im - s*re).
 * Element 4 uses (T7, Tb).
 */
{
E T8, T9, Tc, T1q;
T8 = cr[WS(rs, 4)];
T9 = T7 * T8;
Tc = ci[WS(rs, 4)];
T1q = T7 * Tc;
Td = FMA(Tb, Tc, T9);
T1r = FNMS(Tb, T8, T1q);
}
/* Element 6 uses (To, Ts). */
{
E Tp, Tq, Tt, TX;
Tp = cr[WS(rs, 6)];
Tq = To * Tp;
Tt = ci[WS(rs, 6)];
TX = To * Tt;
Tu = FMA(Ts, Tt, Tq);
TY = FNMS(Ts, Tp, TX);
}
/* Element 2 uses (Tf, Ti). */
{
E Tg, Th, Tj, TV;
Tg = cr[WS(rs, 2)];
Th = Tf * Tg;
Tj = ci[WS(rs, 2)];
TV = Tf * Tj;
Tk = FMA(Ti, Tj, Th);
TW = FNMS(Ti, Tg, TV);
}
/* Element 7 uses (Tl, Tn); element 3 uses (T3, T6). */
{
E TK, TL, TM, T19, TO, TP, TQ, T1b;
TK = cr[WS(rs, 7)];
TL = Tl * TK;
TM = ci[WS(rs, 7)];
T19 = Tl * TM;
TO = cr[WS(rs, 3)];
TP = T3 * TO;
TQ = ci[WS(rs, 3)];
T1b = T3 * TQ;
TN = FMA(Tn, TM, TL);
TR = FMA(T6, TQ, TP);
T18 = TN - TR;
T1a = FNMS(Tn, TK, T19);
T1c = FNMS(T6, TO, T1b);
T1d = T1a - T1c;
}
/* Element 1 uses (T2, T5); element 5 uses (TC, TG). */
{
E Tx, Ty, Tz, T12, TD, TE, TH, T14;
Tx = cr[WS(rs, 1)];
Ty = T2 * Tx;
Tz = ci[WS(rs, 1)];
T12 = T2 * Tz;
TD = cr[WS(rs, 5)];
TE = TC * TD;
TH = ci[WS(rs, 5)];
T14 = TC * TH;
TA = FMA(T5, Tz, Ty);
TI = FMA(TG, TH, TE);
T11 = TA - TI;
T13 = FNMS(T5, Tx, T12);
T15 = FNMS(TG, TD, T14);
T16 = T13 - T15;
}
/*
 * Outputs built from the pairwise differences, scaled by KP707106781:
 * cr[1], cr[3], cr[5], cr[7], ci[0], ci[2], ci[4], ci[6].
 */
{
E T10, T1g, T1z, T1B, T1f, T1A, T1j, T1C;
{
E TU, TZ, T1x, T1y;
TU = T1 - Td;
TZ = TW - TY;
T10 = TU + TZ;
T1g = TU - TZ;
T1x = Tk - Tu;
T1y = T1s - T1r;
T1z = T1x + T1y;
T1B = T1y - T1x;
}
{
E T17, T1e, T1h, T1i;
T17 = T11 + T16;
T1e = T18 - T1d;
T1f = T17 + T1e;
T1A = T1e - T17;
T1h = T11 - T16;
T1i = T18 + T1d;
T1j = T1h + T1i;
T1C = T1i - T1h;
}
ci[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
cr[WS(rs, 5)] = FMS(KP707106781, T1C, T1B);
ci[WS(rs, 6)] = FMA(KP707106781, T1C, T1B);
cr[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
cr[WS(rs, 3)] = FNMS(KP707106781, T1j, T1g);
cr[WS(rs, 7)] = FMS(KP707106781, T1A, T1z);
ci[WS(rs, 4)] = FMA(KP707106781, T1A, T1z);
ci[0] = FMA(KP707106781, T1j, T1g);
}
/*
 * Outputs built from the pairwise sums (no constant multiply):
 * cr[0], cr[2], cr[4], cr[6], ci[1], ci[3], ci[5], ci[7].
 */
{
E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
{
E Te, Tv, T1p, T1t;
Te = T1 + Td;
Tv = Tk + Tu;
Tw = Te + Tv;
T1k = Te - Tv;
T1p = TW + TY;
T1t = T1r + T1s;
T1u = T1p + T1t;
T1w = T1t - T1p;
}
{
E TJ, TS, T1l, T1m;
TJ = TA + TI;
TS = TN + TR;
TT = TJ + TS;
T1v = TS - TJ;
T1l = T1a + T1c;
T1m = T13 + T15;
T1n = T1l - T1m;
T1o = T1m + T1l;
}
ci[WS(rs, 3)] = Tw - TT;
cr[WS(rs, 6)] = T1v - T1w;
ci[WS(rs, 5)] = T1v + T1w;
cr[0] = Tw + TT;
cr[WS(rs, 2)] = T1k - T1n;
cr[WS(rs, 4)] = T1o - T1u;
ci[WS(rs, 7)] = T1o + T1u;
ci[WS(rs, 1)] = T1k + T1n;
}
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: three TW_CEXP rows
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably each TW_CEXP row yields one cos/sin pair, which
 * would account for the six W values hf2_8 consumes per iteration --
 * confirm against the tw_instr definition outside this file.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 8, name "hf2_8", the twiddle table above, GENUS
 * from the included headers, and the operation counts {44, 20, 30, 0}; the
 * first three agree with the generator banner (44 additions,
 * 20 multiplications, 30 fused multiply/add).
 */
static const hc2hc_desc desc = { 8, "hf2_8", twinstr, &GENUS, { 44, 20, 30, 0 } };
/* Registers the hf2_8 codelet together with its descriptor on planner `p`. */
void X(codelet_hf2_8) (planner *p) {
X(khc2hc_register) (p, hf2_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include rdft/scalar/hf.h */
/*
* This function contains 74 FP additions, 44 FP multiplications,
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
* 42 stack variables, 1 constants, and 32 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf2_8 (variant compiled when FMA is not preferred): size-8 DIT twiddle
 * codelet emitted by gen_hc2hc (-n 8 -dit -twiddle-log3).
 *
 * cr, ci  - data pointers; every iteration loads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for k = 0..7 and overwrites the same slots.
 * W       - twiddle table; six values are read per iteration.
 * rs      - stride handed to WS() to locate element k.
 * mb, me  - the loop runs for m = mb .. me-1.
 * ms      - per-iteration step: cr moves forward by ms, ci backward by ms.
 */
static void hf2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constant: the value equals sqrt(1/2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W is first positioned at the 6-value group belonging to iteration mb. */
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
/*
 * Stored twiddle pairs: (T2, T5) = (W[0], W[1]), (T3, T6) = (W[2], W[3]),
 * (Tl, Tm) = (W[4], W[5]). The pairs (T8, Tc), (Tg, Ti), (Tn, Tp) and
 * (Tx, Tz) are computed from products of those values.
 */
{
E T4, Tb, T7, Ta;
T2 = W[0];
T5 = W[1];
T3 = W[2];
T6 = W[3];
T4 = T2 * T3;
Tb = T5 * T3;
T7 = T5 * T6;
Ta = T2 * T6;
T8 = T4 - T7;
Tc = Ta + Tb;
Tg = T4 + T7;
Ti = Ta - Tb;
Tl = W[4];
Tm = W[5];
Tn = FMA(T2, Tl, T5 * Tm);
Tz = FNMS(Ti, Tl, Tg * Tm);
Tp = FNMS(T5, Tl, T2 * Tm);
Tx = FMA(Tg, Tl, Ti * Tm);
}
{
E Tf, T1j, TL, T1d, TJ, T16, TV, TY, Ts, T1i, TO, T1a, TC, T17, TQ;
E TT;
/*
 * Each element k = 1..7 is rotated by a pair (c, s):
 * (re, im) -> (c*re + s*im, c*im - s*re); element 0 is used unmodified.
 * Elements 0 and 4 (pair (T8, Tc)) are combined here.
 */
{
E T1, T1c, Te, T1b, T9, Td;
T1 = cr[0];
T1c = ci[0];
T9 = cr[WS(rs, 4)];
Td = ci[WS(rs, 4)];
Te = FMA(T8, T9, Tc * Td);
T1b = FNMS(Tc, T9, T8 * Td);
Tf = T1 + Te;
T1j = T1c - T1b;
TL = T1 - Te;
T1d = T1b + T1c;
}
/* Elements 7 (pair (Tl, Tm)) and 3 (pair (T3, T6)). */
{
E TF, TW, TI, TX;
{
E TD, TE, TG, TH;
TD = cr[WS(rs, 7)];
TE = ci[WS(rs, 7)];
TF = FMA(Tl, TD, Tm * TE);
TW = FNMS(Tm, TD, Tl * TE);
TG = cr[WS(rs, 3)];
TH = ci[WS(rs, 3)];
TI = FMA(T3, TG, T6 * TH);
TX = FNMS(T6, TG, T3 * TH);
}
TJ = TF + TI;
T16 = TW + TX;
TV = TF - TI;
TY = TW - TX;
}
/* Elements 2 (pair (Tg, Ti)) and 6 (pair (Tn, Tp)). */
{
E Tk, TM, Tr, TN;
{
E Th, Tj, To, Tq;
Th = cr[WS(rs, 2)];
Tj = ci[WS(rs, 2)];
Tk = FMA(Tg, Th, Ti * Tj);
TM = FNMS(Ti, Th, Tg * Tj);
To = cr[WS(rs, 6)];
Tq = ci[WS(rs, 6)];
Tr = FMA(Tn, To, Tp * Tq);
TN = FNMS(Tp, To, Tn * Tq);
}
Ts = Tk + Tr;
T1i = Tk - Tr;
TO = TM - TN;
T1a = TM + TN;
}
/* Elements 1 (pair (T2, T5)) and 5 (pair (Tx, Tz)). */
{
E Tw, TR, TB, TS;
{
E Tu, Tv, Ty, TA;
Tu = cr[WS(rs, 1)];
Tv = ci[WS(rs, 1)];
Tw = FMA(T2, Tu, T5 * Tv);
TR = FNMS(T5, Tu, T2 * Tv);
Ty = cr[WS(rs, 5)];
TA = ci[WS(rs, 5)];
TB = FMA(Tx, Ty, Tz * TA);
TS = FNMS(Tz, Ty, Tx * TA);
}
TC = Tw + TB;
T17 = TR + TS;
TQ = Tw - TB;
TT = TR - TS;
}
/* Outputs cr[0], cr[3], cr[5], cr[6], ci[0], ci[3], ci[5], ci[6]. */
{
E Tt, TK, T1f, T1g;
Tt = Tf + Ts;
TK = TC + TJ;
ci[WS(rs, 3)] = Tt - TK;
cr[0] = Tt + TK;
T1f = TJ - TC;
T1g = T1d - T1a;
cr[WS(rs, 6)] = T1f - T1g;
ci[WS(rs, 5)] = T1f + T1g;
{
E T11, T1m, T14, T1l, T12, T13;
T11 = TL - TO;
T1m = T1j - T1i;
T12 = TQ - TT;
T13 = TV + TY;
T14 = KP707106781 * (T12 + T13);
T1l = KP707106781 * (T13 - T12);
cr[WS(rs, 3)] = T11 - T14;
ci[WS(rs, 6)] = T1l + T1m;
ci[0] = T11 + T14;
cr[WS(rs, 5)] = T1l - T1m;
}
}
/* Outputs cr[1], cr[2], cr[4], cr[7], ci[1], ci[2], ci[4], ci[7]. */
{
E T19, T1e, T15, T18;
T19 = T17 + T16;
T1e = T1a + T1d;
cr[WS(rs, 4)] = T19 - T1e;
ci[WS(rs, 7)] = T19 + T1e;
T15 = Tf - Ts;
T18 = T16 - T17;
cr[WS(rs, 2)] = T15 - T18;
ci[WS(rs, 1)] = T15 + T18;
{
E TP, T1k, T10, T1h, TU, TZ;
TP = TL + TO;
T1k = T1i + T1j;
TU = TQ + TT;
TZ = TV - TY;
T10 = KP707106781 * (TU + TZ);
T1h = KP707106781 * (TZ - TU);
ci[WS(rs, 2)] = TP - T10;
ci[WS(rs, 4)] = T1h + T1k;
cr[WS(rs, 1)] = TP + T10;
cr[WS(rs, 7)] = T1h - T1k;
}
}
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: three TW_CEXP rows
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably each TW_CEXP row yields one cos/sin pair, which
 * would account for the six W values hf2_8 consumes per iteration --
 * confirm against the tw_instr definition outside this file.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 1, 1 },
{ TW_CEXP, 1, 3 },
{ TW_CEXP, 1, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 8, name "hf2_8", the twiddle table above, GENUS
 * from the included headers, and the operation counts {56, 26, 18, 0}; the
 * first three agree with the generator banner (56 additions,
 * 26 multiplications, 18 fused multiply/add).
 */
static const hc2hc_desc desc = { 8, "hf2_8", twinstr, &GENUS, { 56, 26, 18, 0 } };
/* Registers the hf2_8 codelet together with its descriptor on planner `p`. */
void X(codelet_hf2_8) (planner *p) {
X(khc2hc_register) (p, hf2_8, &desc);
}
#endif

View File

@@ -0,0 +1,489 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:13 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hf_10 -include rdft/scalar/hf.h */
/*
* This function contains 102 FP additions, 72 FP multiplications,
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
* 47 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_10 (variant compiled when FMA is preferred): size-10 DIT twiddle codelet
 * emitted by gen_hc2hc (-n 10 -dit).
 *
 * cr, ci  - data pointers; every iteration loads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for k = 0..9 and overwrites the same slots.
 * W       - twiddle table; 18 values are read per iteration, with element
 *           k (k = 1..9) using the pair (W[2k-2], W[2k-1]).
 * rs      - stride handed to WS() to locate element k.
 * mb, me  - the loop runs for m = mb .. me-1.
 * ms      - per-iteration step: cr moves forward by ms, ci backward by ms.
 */
static void hf_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the size-5 style combinations below. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
/* W is first positioned at the 18-value group belonging to iteration mb. */
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
E T8, T23, T12, T1U, TM, TZ, T10, T1F, T1G, T25, T16, T17, T18, T1s, T1x;
E T1P, Tl, Ty, Tz, T1I, T1J, T24, T13, T14, T15, T1h, T1m, T1O;
/*
 * Every element k >= 1 is rotated by its twiddle pair (c, s):
 * (re, im) -> (c*re + s*im, c*im - s*re).
 * This block combines element 0 (unmodified) with rotated element 5.
 */
{
E T1, T1R, T3, T6, T4, T1S, T2, T7, T1T, T5;
T1 = cr[0];
T1R = ci[0];
T3 = cr[WS(rs, 5)];
T6 = ci[WS(rs, 5)];
T2 = W[8];
T4 = T2 * T3;
T1S = T2 * T6;
T5 = W[9];
T7 = FMA(T5, T6, T4);
T1T = FNMS(T5, T3, T1S);
T8 = T1 - T7;
T23 = T1T + T1R;
T12 = T1 + T7;
T1U = T1R - T1T;
}
/* Rotated elements 4, 1, 9 and 6 and their pairwise sums/differences. */
{
E TF, T1w, TY, T1p, TL, T1u, TS, T1r;
{
E TB, TE, TC, T1v, TA, TD;
TB = cr[WS(rs, 4)];
TE = ci[WS(rs, 4)];
TA = W[6];
TC = TA * TB;
T1v = TA * TE;
TD = W[7];
TF = FMA(TD, TE, TC);
T1w = FNMS(TD, TB, T1v);
}
{
E TU, TX, TV, T1o, TT, TW;
TU = cr[WS(rs, 1)];
TX = ci[WS(rs, 1)];
TT = W[0];
TV = TT * TU;
T1o = TT * TX;
TW = W[1];
TY = FMA(TW, TX, TV);
T1p = FNMS(TW, TU, T1o);
}
{
E TH, TK, TI, T1t, TG, TJ;
TH = cr[WS(rs, 9)];
TK = ci[WS(rs, 9)];
TG = W[16];
TI = TG * TH;
T1t = TG * TK;
TJ = W[17];
TL = FMA(TJ, TK, TI);
T1u = FNMS(TJ, TH, T1t);
}
{
E TO, TR, TP, T1q, TN, TQ;
TO = cr[WS(rs, 6)];
TR = ci[WS(rs, 6)];
TN = W[10];
TP = TN * TO;
T1q = TN * TR;
TQ = W[11];
TS = FMA(TQ, TR, TP);
T1r = FNMS(TQ, TO, T1q);
}
TM = TF - TL;
TZ = TS - TY;
T10 = TM + TZ;
T1F = T1w + T1u;
T1G = T1r + T1p;
T25 = T1F + T1G;
T16 = TF + TL;
T17 = TS + TY;
T18 = T16 + T17;
T1s = T1p - T1r;
T1x = T1u - T1w;
T1P = T1x + T1s;
}
/* Rotated elements 2, 3, 7 and 8 and their pairwise sums/differences. */
{
E Te, T1l, Tx, T1e, Tk, T1j, Tr, T1g;
{
E Ta, Td, Tb, T1k, T9, Tc;
Ta = cr[WS(rs, 2)];
Td = ci[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
T1k = T9 * Td;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
T1l = FNMS(Tc, Ta, T1k);
}
{
E Tt, Tw, Tu, T1d, Ts, Tv;
Tt = cr[WS(rs, 3)];
Tw = ci[WS(rs, 3)];
Ts = W[4];
Tu = Ts * Tt;
T1d = Ts * Tw;
Tv = W[5];
Tx = FMA(Tv, Tw, Tu);
T1e = FNMS(Tv, Tt, T1d);
}
{
E Tg, Tj, Th, T1i, Tf, Ti;
Tg = cr[WS(rs, 7)];
Tj = ci[WS(rs, 7)];
Tf = W[12];
Th = Tf * Tg;
T1i = Tf * Tj;
Ti = W[13];
Tk = FMA(Ti, Tj, Th);
T1j = FNMS(Ti, Tg, T1i);
}
{
E Tn, Tq, To, T1f, Tm, Tp;
Tn = cr[WS(rs, 8)];
Tq = ci[WS(rs, 8)];
Tm = W[14];
To = Tm * Tn;
T1f = Tm * Tq;
Tp = W[15];
Tr = FMA(Tp, Tq, To);
T1g = FNMS(Tp, Tn, T1f);
}
Tl = Te - Tk;
Ty = Tr - Tx;
Tz = Tl + Ty;
T1I = T1l + T1j;
T1J = T1g + T1e;
T24 = T1I + T1J;
T13 = Te + Tk;
T14 = Tr + Tx;
T15 = T13 + T14;
T1h = T1e - T1g;
T1m = T1j - T1l;
T1O = T1m + T1h;
}
/* Outputs cr[1], cr[3], ci[0], ci[2], ci[4]. */
{
E T1b, T11, T1a, T1z, T1B, T1n, T1y, T1A, T1c;
T1b = Tz - T10;
T11 = Tz + T10;
T1a = FNMS(KP250000000, T11, T8);
T1n = T1h - T1m;
T1y = T1s - T1x;
T1z = FMA(KP618033988, T1y, T1n);
T1B = FNMS(KP618033988, T1n, T1y);
ci[WS(rs, 4)] = T8 + T11;
T1A = FNMS(KP559016994, T1b, T1a);
ci[WS(rs, 2)] = FNMS(KP951056516, T1B, T1A);
cr[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
T1c = FMA(KP559016994, T1b, T1a);
ci[0] = FNMS(KP951056516, T1z, T1c);
cr[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
}
/* Outputs cr[0], cr[2], cr[4], ci[1], ci[3]. */
{
E T1D, T19, T1C, T1L, T1N, T1H, T1K, T1M, T1E;
T1D = T15 - T18;
T19 = T15 + T18;
T1C = FNMS(KP250000000, T19, T12);
T1H = T1F - T1G;
T1K = T1I - T1J;
T1L = FNMS(KP618033988, T1K, T1H);
T1N = FMA(KP618033988, T1H, T1K);
cr[0] = T12 + T19;
T1M = FMA(KP559016994, T1D, T1C);
cr[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
ci[WS(rs, 3)] = FMA(KP951056516, T1N, T1M);
T1E = FNMS(KP559016994, T1D, T1C);
cr[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
ci[WS(rs, 1)] = FMA(KP951056516, T1L, T1E);
}
/* Outputs cr[5], cr[7], cr[9], ci[6], ci[8]. */
{
E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X;
T1W = T1P - T1O;
T1Q = T1O + T1P;
T1V = FMA(KP250000000, T1Q, T1U);
T1Y = TZ - TM;
T1Z = Ty - Tl;
T20 = FNMS(KP618033988, T1Z, T1Y);
T22 = FMA(KP618033988, T1Y, T1Z);
cr[WS(rs, 5)] = T1Q - T1U;
T21 = FMA(KP559016994, T1W, T1V);
cr[WS(rs, 9)] = FMS(KP951056516, T22, T21);
ci[WS(rs, 8)] = FMA(KP951056516, T22, T21);
T1X = FNMS(KP559016994, T1W, T1V);
cr[WS(rs, 7)] = FMS(KP951056516, T20, T1X);
ci[WS(rs, 6)] = FMA(KP951056516, T20, T1X);
}
/* Outputs cr[6], cr[8], ci[5], ci[7], ci[9]. */
{
E T28, T26, T27, T2c, T2e, T2a, T2b, T2d, T29;
T28 = T24 - T25;
T26 = T24 + T25;
T27 = FNMS(KP250000000, T26, T23);
T2a = T13 - T14;
T2b = T16 - T17;
T2c = FMA(KP618033988, T2b, T2a);
T2e = FNMS(KP618033988, T2a, T2b);
ci[WS(rs, 9)] = T26 + T23;
T2d = FNMS(KP559016994, T28, T27);
cr[WS(rs, 8)] = FMS(KP951056516, T2e, T2d);
ci[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
T29 = FMA(KP559016994, T28, T27);
cr[WS(rs, 6)] = FMS(KP951056516, T2c, T29);
ci[WS(rs, 5)] = FMA(KP951056516, T2c, T29);
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: one TW_FULL row
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably TW_FULL with last field 10 requests the complete
 * set of 9 cos/sin pairs, which would account for the 18 W values hf_10
 * consumes per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 10, name "hf_10", the twiddle table above, GENUS
 * from the included headers, and the operation counts {48, 18, 54, 0}; the
 * first three agree with the generator banner (48 additions,
 * 18 multiplications, 54 fused multiply/add).
 */
static const hc2hc_desc desc = { 10, "hf_10", twinstr, &GENUS, { 48, 18, 54, 0 } };
/* Registers the hf_10 codelet together with its descriptor on planner `p`. */
void X(codelet_hf_10) (planner *p) {
X(khc2hc_register) (p, hf_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hf_10 -include rdft/scalar/hf.h */
/*
* This function contains 102 FP additions, 60 FP multiplications,
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
* 45 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_10 (variant compiled when FMA is not preferred): size-10 DIT twiddle
 * codelet emitted by gen_hc2hc (-n 10 -dit).
 *
 * cr, ci  - data pointers; every iteration loads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for k = 0..9 and overwrites the same slots.
 * W       - twiddle table; 18 values are read per iteration, with element
 *           k (k = 1..9) using the pair (W[2k-2], W[2k-1]).
 * rs      - stride handed to WS() to locate element k.
 * mb, me  - the loop runs for m = mb .. me-1.
 * ms      - per-iteration step: cr moves forward by ms, ci backward by ms.
 */
static void hf_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants used by the size-5 style combinations below. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
/* W is first positioned at the 18-value group belonging to iteration mb. */
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
E T7, T1R, TT, T1C, TF, TQ, TR, T1o, T1p, T1P, TX, TY, TZ, T1d, T1g;
E T1x, Ti, Tt, Tu, T1r, T1s, T1O, TU, TV, TW, T16, T19, T1y;
/*
 * Every element k >= 1 is rotated by its twiddle pair (c, s):
 * (re, im) -> (c*re + s*im, c*im - s*re).
 * This block combines element 0 (unmodified) with rotated element 5.
 */
{
E T1, T1A, T6, T1B;
T1 = cr[0];
T1A = ci[0];
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 5)];
T5 = ci[WS(rs, 5)];
T2 = W[8];
T4 = W[9];
T6 = FMA(T2, T3, T4 * T5);
T1B = FNMS(T4, T3, T2 * T5);
}
T7 = T1 - T6;
T1R = T1B + T1A;
TT = T1 + T6;
T1C = T1A - T1B;
}
/* Rotated elements 4, 1, 9 and 6 and their pairwise sums/differences. */
{
E Tz, T1b, TP, T1e, TE, T1c, TK, T1f;
{
E Tw, Ty, Tv, Tx;
Tw = cr[WS(rs, 4)];
Ty = ci[WS(rs, 4)];
Tv = W[6];
Tx = W[7];
Tz = FMA(Tv, Tw, Tx * Ty);
T1b = FNMS(Tx, Tw, Tv * Ty);
}
{
E TM, TO, TL, TN;
TM = cr[WS(rs, 1)];
TO = ci[WS(rs, 1)];
TL = W[0];
TN = W[1];
TP = FMA(TL, TM, TN * TO);
T1e = FNMS(TN, TM, TL * TO);
}
{
E TB, TD, TA, TC;
TB = cr[WS(rs, 9)];
TD = ci[WS(rs, 9)];
TA = W[16];
TC = W[17];
TE = FMA(TA, TB, TC * TD);
T1c = FNMS(TC, TB, TA * TD);
}
{
E TH, TJ, TG, TI;
TH = cr[WS(rs, 6)];
TJ = ci[WS(rs, 6)];
TG = W[10];
TI = W[11];
TK = FMA(TG, TH, TI * TJ);
T1f = FNMS(TI, TH, TG * TJ);
}
TF = Tz - TE;
TQ = TK - TP;
TR = TF + TQ;
T1o = T1b + T1c;
T1p = T1f + T1e;
T1P = T1o + T1p;
TX = Tz + TE;
TY = TK + TP;
TZ = TX + TY;
T1d = T1b - T1c;
T1g = T1e - T1f;
T1x = T1g - T1d;
}
/* Rotated elements 2, 3, 7 and 8 and their pairwise sums/differences. */
{
E Tc, T14, Ts, T18, Th, T15, Tn, T17;
{
E T9, Tb, T8, Ta;
T9 = cr[WS(rs, 2)];
Tb = ci[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
T14 = FNMS(Ta, T9, T8 * Tb);
}
{
E Tp, Tr, To, Tq;
Tp = cr[WS(rs, 3)];
Tr = ci[WS(rs, 3)];
To = W[4];
Tq = W[5];
Ts = FMA(To, Tp, Tq * Tr);
T18 = FNMS(Tq, Tp, To * Tr);
}
{
E Te, Tg, Td, Tf;
Te = cr[WS(rs, 7)];
Tg = ci[WS(rs, 7)];
Td = W[12];
Tf = W[13];
Th = FMA(Td, Te, Tf * Tg);
T15 = FNMS(Tf, Te, Td * Tg);
}
{
E Tk, Tm, Tj, Tl;
Tk = cr[WS(rs, 8)];
Tm = ci[WS(rs, 8)];
Tj = W[14];
Tl = W[15];
Tn = FMA(Tj, Tk, Tl * Tm);
T17 = FNMS(Tl, Tk, Tj * Tm);
}
Ti = Tc - Th;
Tt = Tn - Ts;
Tu = Ti + Tt;
T1r = T14 + T15;
T1s = T17 + T18;
T1O = T1r + T1s;
TU = Tc + Th;
TV = Tn + Ts;
TW = TU + TV;
T16 = T14 - T15;
T19 = T17 - T18;
T1y = T16 + T19;
}
/* Outputs cr[1], cr[3], ci[0], ci[2], ci[4]. */
{
E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
T11 = KP559016994 * (Tu - TR);
TS = Tu + TR;
T12 = FNMS(KP250000000, TS, T7);
T1a = T16 - T19;
T1h = T1d + T1g;
T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
ci[WS(rs, 4)] = T7 + TS;
T1j = T12 - T11;
ci[WS(rs, 2)] = T1j - T1k;
cr[WS(rs, 3)] = T1j + T1k;
T13 = T11 + T12;
ci[0] = T13 - T1i;
cr[WS(rs, 1)] = T13 + T1i;
}
/* Outputs cr[0], cr[2], cr[4], ci[1], ci[3]. */
{
E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
T1m = KP559016994 * (TW - TZ);
T10 = TW + TZ;
T1l = FNMS(KP250000000, T10, TT);
T1q = T1o - T1p;
T1t = T1r - T1s;
T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
cr[0] = TT + T10;
T1v = T1m + T1l;
cr[WS(rs, 4)] = T1v - T1w;
ci[WS(rs, 3)] = T1v + T1w;
T1n = T1l - T1m;
cr[WS(rs, 2)] = T1n - T1u;
ci[WS(rs, 1)] = T1n + T1u;
}
/* Outputs cr[5], cr[7], cr[9], ci[6], ci[8]. */
{
E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
T1H = KP559016994 * (T1y + T1x);
T1z = T1x - T1y;
T1G = FMA(KP250000000, T1z, T1C);
T1D = Ti - Tt;
T1E = TQ - TF;
T1F = FMA(KP587785252, T1D, KP951056516 * T1E);
T1J = FNMS(KP951056516, T1D, KP587785252 * T1E);
cr[WS(rs, 5)] = T1z - T1C;
T1K = T1H + T1G;
cr[WS(rs, 9)] = T1J - T1K;
ci[WS(rs, 8)] = T1J + T1K;
T1I = T1G - T1H;
cr[WS(rs, 7)] = T1F - T1I;
ci[WS(rs, 6)] = T1F + T1I;
}
/* Outputs cr[6], cr[8], ci[5], ci[7], ci[9]. */
{
E T1Q, T1S, T1T, T1N, T1V, T1L, T1M, T1W, T1U;
T1Q = KP559016994 * (T1O - T1P);
T1S = T1O + T1P;
T1T = FNMS(KP250000000, T1S, T1R);
T1L = TU - TV;
T1M = TX - TY;
T1N = FMA(KP951056516, T1L, KP587785252 * T1M);
T1V = FNMS(KP587785252, T1L, KP951056516 * T1M);
ci[WS(rs, 9)] = T1S + T1R;
T1W = T1T - T1Q;
cr[WS(rs, 8)] = T1V - T1W;
ci[WS(rs, 7)] = T1V + T1W;
T1U = T1Q + T1T;
cr[WS(rs, 6)] = T1N - T1U;
ci[WS(rs, 5)] = T1N + T1U;
}
}
}
}
/*
 * Twiddle instruction table referenced by `desc` below: one TW_FULL row
 * followed by a TW_NEXT terminator row.
 * NOTE(review): presumably TW_FULL with last field 10 requests the complete
 * set of 9 cos/sin pairs, which would account for the 18 W values hf_10
 * consumes per iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 10, name "hf_10", the twiddle table above, GENUS
 * from the included headers, and the operation counts {72, 30, 30, 0}; the
 * first three agree with the generator banner (72 additions,
 * 30 multiplications, 30 fused multiply/add).
 */
static const hc2hc_desc desc = { 10, "hf_10", twinstr, &GENUS, { 72, 30, 30, 0 } };
/* Registers the hf_10 codelet together with its descriptor on planner `p`. */
void X(codelet_hf_10) (planner *p) {
X(khc2hc_register) (p, hf_10, &desc);
}
#endif

View File

@@ -0,0 +1,581 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:13 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hf_12 -include rdft/scalar/hf.h */
/*
* This function contains 118 FP additions, 68 FP multiplications,
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_12, FMA-preferring variant: generated size-12 decimation-in-time
 * twiddle codelet for the real-input (hc2hc) FFT (genfft was invoked with
 * -fma -n 12 -dit).
 *
 * cr, ci: in/out arrays.  Every iteration reads cr[WS(rs, k)] and
 *         ci[WS(rs, k)] for k = 0..11 and overwrites those same 24 slots.
 * W:      twiddle table; 22 values are consumed per iteration, input k
 *         (k = 1..11) being combined with W[2k-2] and W[2k-1].
 * rs:     stride argument forwarded to WS() to locate element k.
 * mb, me: the loop runs for m in [mb, me).
 * ms:     per-iteration step; cr advances by ms while ci retreats by ms.
 */
static void hf_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* sqrt(3)/2 and 1/2.  NOTE(review): DK is defined elsewhere; presumably it declares a named constant. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* W is first moved to the twiddles of iteration mb ((mb - 1) * 22 values in),
 * then advances by 22 per iteration.  MAKE_VOLATILE_STRIDE is an opaque
 * macro defined elsewhere (NOTE(review): confirm its purpose). */
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2s, T1s, T2f, T1d, T21, T1H;
E T1Z, Te, T2p, T1l, T2h, TT, T1V, T1A, T1T;
/* Input 0 is taken as-is, without a twiddle. */
T1 = cr[0];
T2i = ci[0];
{
/* Input 6 with W[10], W[11]: Tl = W[10]*cr + W[11]*ci, T2e = W[10]*ci - W[11]*cr.
 * Every twiddled input below follows this same pattern. */
E Th, Tk, Ti, T2d, Tg, Tj;
Th = cr[WS(rs, 6)];
Tk = ci[WS(rs, 6)];
Tg = W[10];
Ti = Tg * Th;
T2d = Tg * Tk;
Tj = W[11];
Tl = FMA(Tj, Tk, Ti);
T2e = FNMS(Tj, Th, T2d);
}
{
/* Input 9 with W[16], W[17] -> T10, T1Y. */
E TW, TZ, TX, T1X, TV, TY;
TW = cr[WS(rs, 9)];
TZ = ci[WS(rs, 9)];
TV = W[16];
TX = TV * TW;
T1X = TV * TZ;
TY = W[17];
T10 = FMA(TY, TZ, TX);
T1Y = FNMS(TY, TW, T1X);
}
{
/* Input 3 with W[4], W[5] -> TG, T1S. */
E TC, TF, TD, T1R, TB, TE;
TC = cr[WS(rs, 3)];
TF = ci[WS(rs, 3)];
TB = W[4];
TD = TB * TC;
T1R = TB * TF;
TE = W[5];
TG = FMA(TE, TF, TD);
T1S = FNMS(TE, TC, T1R);
}
{
/* Inputs 10 (W[18], W[19]) and 2 (W[2], W[3]); only their sums and
 * differences (Ty, T2s, T1s, T2f) leave this block. */
E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
Tn = cr[WS(rs, 10)];
Tq = ci[WS(rs, 10)];
Tm = W[18];
To = Tm * Tn;
T1o = Tm * Tq;
Tt = cr[WS(rs, 2)];
Tw = ci[WS(rs, 2)];
Ts = W[2];
Tu = Ts * Tt;
T1q = Ts * Tw;
{
E Tr, T1p, Tx, T1r, Tp, Tv;
Tp = W[19];
Tr = FMA(Tp, Tq, To);
T1p = FNMS(Tp, Tn, T1o);
Tv = W[3];
Tx = FMA(Tv, Tw, Tu);
T1r = FNMS(Tv, Tt, T1q);
Ty = Tr + Tx;
T2s = Tx - Tr;
T1s = T1p - T1r;
T2f = T1p + T1r;
}
}
{
/* Inputs 1 (W[0], W[1]) and 5 (W[8], W[9]) -> T1d, T21, T1H, T1Z. */
E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
T12 = cr[WS(rs, 1)];
T15 = ci[WS(rs, 1)];
T11 = W[0];
T13 = T11 * T12;
T1D = T11 * T15;
T18 = cr[WS(rs, 5)];
T1b = ci[WS(rs, 5)];
T17 = W[8];
T19 = T17 * T18;
T1F = T17 * T1b;
{
E T16, T1E, T1c, T1G, T14, T1a;
T14 = W[1];
T16 = FMA(T14, T15, T13);
T1E = FNMS(T14, T12, T1D);
T1a = W[9];
T1c = FMA(T1a, T1b, T19);
T1G = FNMS(T1a, T18, T1F);
T1d = T16 + T1c;
T21 = T1c - T16;
T1H = T1E - T1G;
T1Z = T1E + T1G;
}
}
{
/* Inputs 4 (W[6], W[7]) and 8 (W[14], W[15]) -> Te, T2p, T1l, T2h. */
E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
T3 = cr[WS(rs, 4)];
T6 = ci[WS(rs, 4)];
T2 = W[6];
T4 = T2 * T3;
T1h = T2 * T6;
T9 = cr[WS(rs, 8)];
Tc = ci[WS(rs, 8)];
T8 = W[14];
Ta = T8 * T9;
T1j = T8 * Tc;
{
E T7, T1i, Td, T1k, T5, Tb;
T5 = W[7];
T7 = FMA(T5, T6, T4);
T1i = FNMS(T5, T3, T1h);
Tb = W[15];
Td = FMA(Tb, Tc, Ta);
T1k = FNMS(Tb, T9, T1j);
Te = T7 + Td;
T2p = Td - T7;
T1l = T1i - T1k;
T2h = T1i + T1k;
}
}
{
/* Inputs 7 (W[12], W[13]) and 11 (W[20], W[21]) -> TT, T1V, T1A, T1T. */
E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
TI = cr[WS(rs, 7)];
TL = ci[WS(rs, 7)];
TH = W[12];
TJ = TH * TI;
T1w = TH * TL;
TO = cr[WS(rs, 11)];
TR = ci[WS(rs, 11)];
TN = W[20];
TP = TN * TO;
T1y = TN * TR;
{
E TM, T1x, TS, T1z, TK, TQ;
TK = W[13];
TM = FMA(TK, TL, TJ);
T1x = FNMS(TK, TI, T1w);
TQ = W[21];
TS = FMA(TQ, TR, TP);
T1z = FNMS(TQ, TO, T1y);
TT = TM + TS;
T1V = TS - TM;
T1A = T1x - T1z;
T1T = T1x + T1z;
}
}
{
/* First output group: built only from sums and differences of the group
 * totals; writes cr[0], cr[3], cr[6], cr[9], ci[2], ci[5], ci[8], ci[11]. */
E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
{
E Tf, Tz, T2g, T2j;
Tf = T1 + Te;
Tz = Tl + Ty;
TA = Tf + Tz;
T28 = Tf - Tz;
T2g = T2e + T2f;
T2j = T2h + T2i;
T2k = T2g + T2j;
T2m = T2j - T2g;
}
{
E TU, T1e, T29, T2a;
TU = TG + TT;
T1e = T10 + T1d;
T1f = TU + T1e;
T2l = TU - T1e;
T29 = T1S + T1T;
T2a = T1Y + T1Z;
T2b = T29 - T2a;
T2c = T29 + T2a;
}
ci[WS(rs, 5)] = TA - T1f;
cr[WS(rs, 9)] = T2l - T2m;
ci[WS(rs, 8)] = T2l + T2m;
cr[0] = TA + T1f;
cr[WS(rs, 3)] = T28 - T2b;
cr[WS(rs, 6)] = T2c - T2k;
ci[WS(rs, 11)] = T2c + T2k;
ci[WS(rs, 2)] = T28 + T2b;
}
{
/* Remaining 16 outputs.  Each intermediate here is a group base minus half
 * of a pair sum (KP500000000), then plus or minus sqrt(3)/2 times the
 * matching pair difference (KP866025403). */
E T1m, T1K, T2q, T2y, T2t, T2z, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
E T1O;
{
E T1g, T2o, T2r, T1n;
T1g = FNMS(KP500000000, Te, T1);
T1m = FNMS(KP866025403, T1l, T1g);
T1K = FMA(KP866025403, T1l, T1g);
T2o = FNMS(KP500000000, T2h, T2i);
T2q = FNMS(KP866025403, T2p, T2o);
T2y = FMA(KP866025403, T2p, T2o);
T2r = FNMS(KP500000000, T2f, T2e);
T2t = FNMS(KP866025403, T2s, T2r);
T2z = FMA(KP866025403, T2s, T2r);
T1n = FNMS(KP500000000, Ty, Tl);
T1t = FNMS(KP866025403, T1s, T1n);
T1L = FMA(KP866025403, T1s, T1n);
}
{
E T1v, T1U, T20, T1C;
T1v = FNMS(KP500000000, TT, TG);
T1B = FNMS(KP866025403, T1A, T1v);
T1N = FMA(KP866025403, T1A, T1v);
T1U = FNMS(KP500000000, T1T, T1S);
T1W = FNMS(KP866025403, T1V, T1U);
T25 = FMA(KP866025403, T1V, T1U);
T20 = FNMS(KP500000000, T1Z, T1Y);
T22 = FNMS(KP866025403, T21, T20);
T26 = FMA(KP866025403, T21, T20);
T1C = FNMS(KP500000000, T1d, T10);
T1I = FNMS(KP866025403, T1H, T1C);
T1O = FMA(KP866025403, T1H, T1C);
}
{
/* Writes cr[2], ci[3], cr[8], ci[9]. */
E T1u, T1J, T2v, T2w;
T1u = T1m + T1t;
T1J = T1B + T1I;
cr[WS(rs, 2)] = T1u - T1J;
ci[WS(rs, 3)] = T1u + T1J;
T2v = T1W + T22;
T2w = T2t + T2q;
cr[WS(rs, 8)] = -(T2v + T2w);
ci[WS(rs, 9)] = T2w - T2v;
}
{
/* Writes cr[10], ci[7], cr[7], ci[10]. */
E T2B, T2C, T2x, T2A;
T2B = T25 + T26;
T2C = T2z + T2y;
cr[WS(rs, 10)] = T2B - T2C;
ci[WS(rs, 7)] = T2B + T2C;
T2x = T1O - T1N;
T2A = T2y - T2z;
cr[WS(rs, 7)] = T2x - T2A;
ci[WS(rs, 10)] = T2x + T2A;
}
{
/* Writes ci[1], cr[4], ci[4], cr[1]. */
E T1M, T1P, T24, T27;
T1M = T1K + T1L;
T1P = T1N + T1O;
ci[WS(rs, 1)] = T1M - T1P;
cr[WS(rs, 4)] = T1M + T1P;
T24 = T1K - T1L;
T27 = T25 - T26;
ci[WS(rs, 4)] = T24 - T27;
cr[WS(rs, 1)] = T24 + T27;
}
{
/* Writes ci[0], cr[5], cr[11], ci[6]. */
E T1Q, T23, T2n, T2u;
T1Q = T1m - T1t;
T23 = T1W - T22;
ci[0] = T1Q - T23;
cr[WS(rs, 5)] = T1Q + T23;
T2n = T1I - T1B;
T2u = T2q - T2t;
cr[WS(rs, 11)] = T2n - T2u;
ci[WS(rs, 6)] = T2n + T2u;
}
}
}
}
}
/*
 * Twiddle description handed to the planner through desc: one TW_FULL entry
 * with arguments (1, 12) followed by a TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file; TW_NEXT presumably closes the list -- confirm against their header.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA variant of the size-12 codelet: 12, the name
 * "hf_12", the twiddle table above, the GENUS object, and a four-number
 * tuple whose first three entries (72, 22, 46) equal the addition,
 * multiplication and fused multiply/add counts quoted in the generator
 * comment at the top of this #if branch.
 */
static const hc2hc_desc desc = { 12, "hf_12", twinstr, &GENUS, { 72, 22, 46, 0 } };
/* Registers hf_12 together with its descriptor on planner p. */
void X(codelet_hf_12) (planner *p) {
X(khc2hc_register) (p, hf_12, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hf_12 -include rdft/scalar/hf.h */
/*
* This function contains 118 FP additions, 60 FP multiplications,
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_12, non-FMA variant: generated size-12 decimation-in-time twiddle
 * codelet for the real-input (hc2hc) FFT (genfft was invoked with
 * -n 12 -dit and without -fma).
 *
 * cr, ci: in/out arrays.  Every iteration reads cr[WS(rs, k)] and
 *         ci[WS(rs, k)] for k = 0..11 and overwrites those same 24 slots.
 * W:      twiddle table; 22 values are consumed per iteration, input k
 *         (k = 1..11) being combined with W[2k-2] and W[2k-1].
 * rs:     stride argument forwarded to WS() to locate element k.
 * mb, me: the loop runs for m in [mb, me).
 * ms:     per-iteration step; cr advances by ms while ci retreats by ms.
 */
static void hf_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 1/2 and sqrt(3)/2.  NOTE(review): DK is defined elsewhere; presumably it declares a named constant. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* W is first moved to the twiddles of iteration mb ((mb - 1) * 22 values in),
 * then advances by 22 per iteration.  MAKE_VOLATILE_STRIDE is an opaque
 * macro defined elsewhere (NOTE(review): confirm its purpose). */
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
E T1, T1W, T18, T23, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
E T1G, Ti, T1S, T1d, T26, Tt, T1a, T1T, T25, TA, T1y, T1j, T1B, TL, T1g;
E T1z, T1A;
{
/* Group {0, 4, 8}: input 0 is untwiddled; inputs 4 and 8 are twiddled as
 * x = W[2k-2]*cr + W[2k-1]*ci, y = W[2k-2]*ci - W[2k-1]*cr.  The block
 * leaves the pair sum (Tc, T1V), the base minus half the sum (T15, T22)
 * and sqrt(3)/2 times the pair differences (T18, T23). */
E T6, T16, Tb, T17;
T1 = cr[0];
T1W = ci[0];
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 4)];
T5 = ci[WS(rs, 4)];
T2 = W[6];
T4 = W[7];
T6 = FMA(T2, T3, T4 * T5);
T16 = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = cr[WS(rs, 8)];
Ta = ci[WS(rs, 8)];
T7 = W[14];
T9 = W[15];
Tb = FMA(T7, T8, T9 * Ta);
T17 = FNMS(T9, T8, T7 * Ta);
}
T18 = KP866025403 * (T16 - T17);
T23 = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
T15 = FNMS(KP500000000, Tc, T1);
T1V = T16 + T17;
T22 = FNMS(KP500000000, T1V, T1W);
}
{
/* Group {9, 1, 5}: input 9 is the base (TR, T1E); inputs 1 and 5 form the pair. */
E T11, T1n, TW, T1m;
{
E TO, TQ, TN, TP;
TO = cr[WS(rs, 9)];
TQ = ci[WS(rs, 9)];
TN = W[16];
TP = W[17];
TR = FMA(TN, TO, TP * TQ);
T1E = FNMS(TP, TO, TN * TQ);
}
{
E TY, T10, TX, TZ;
TY = cr[WS(rs, 5)];
T10 = ci[WS(rs, 5)];
TX = W[8];
TZ = W[9];
T11 = FMA(TX, TY, TZ * T10);
T1n = FNMS(TZ, TY, TX * T10);
}
{
E TT, TV, TS, TU;
TT = cr[WS(rs, 1)];
TV = ci[WS(rs, 1)];
TS = W[0];
TU = W[1];
TW = FMA(TS, TT, TU * TV);
T1m = FNMS(TU, TT, TS * TV);
}
T1o = KP866025403 * (T1m - T1n);
T1D = KP866025403 * (T11 - TW);
T12 = TW + T11;
T1l = FNMS(KP500000000, T12, TR);
T1F = T1m + T1n;
T1G = FNMS(KP500000000, T1F, T1E);
}
{
/* Group {6, 10, 2}: input 6 is the base (Ti, T1S); inputs 10 and 2 form the pair. */
E Ts, T1c, Tn, T1b;
{
E Tf, Th, Te, Tg;
Tf = cr[WS(rs, 6)];
Th = ci[WS(rs, 6)];
Te = W[10];
Tg = W[11];
Ti = FMA(Te, Tf, Tg * Th);
T1S = FNMS(Tg, Tf, Te * Th);
}
{
E Tp, Tr, To, Tq;
Tp = cr[WS(rs, 2)];
Tr = ci[WS(rs, 2)];
To = W[2];
Tq = W[3];
Ts = FMA(To, Tp, Tq * Tr);
T1c = FNMS(Tq, Tp, To * Tr);
}
{
E Tk, Tm, Tj, Tl;
Tk = cr[WS(rs, 10)];
Tm = ci[WS(rs, 10)];
Tj = W[18];
Tl = W[19];
Tn = FMA(Tj, Tk, Tl * Tm);
T1b = FNMS(Tl, Tk, Tj * Tm);
}
T1d = KP866025403 * (T1b - T1c);
T26 = KP866025403 * (Ts - Tn);
Tt = Tn + Ts;
T1a = FNMS(KP500000000, Tt, Ti);
T1T = T1b + T1c;
T25 = FNMS(KP500000000, T1T, T1S);
}
{
/* Group {3, 7, 11}: input 3 is the base (TA, T1y); inputs 7 and 11 form the pair. */
E TK, T1i, TF, T1h;
{
E Tx, Tz, Tw, Ty;
Tx = cr[WS(rs, 3)];
Tz = ci[WS(rs, 3)];
Tw = W[4];
Ty = W[5];
TA = FMA(Tw, Tx, Ty * Tz);
T1y = FNMS(Ty, Tx, Tw * Tz);
}
{
E TH, TJ, TG, TI;
TH = cr[WS(rs, 11)];
TJ = ci[WS(rs, 11)];
TG = W[20];
TI = W[21];
TK = FMA(TG, TH, TI * TJ);
T1i = FNMS(TI, TH, TG * TJ);
}
{
E TC, TE, TB, TD;
TC = cr[WS(rs, 7)];
TE = ci[WS(rs, 7)];
TB = W[12];
TD = W[13];
TF = FMA(TB, TC, TD * TE);
T1h = FNMS(TD, TC, TB * TE);
}
T1j = KP866025403 * (T1h - T1i);
T1B = KP866025403 * (TK - TF);
TL = TF + TK;
T1g = FNMS(KP500000000, TL, TA);
T1z = T1h + T1i;
T1A = FNMS(KP500000000, T1z, T1y);
}
{
/* First output group: built only from sums and differences of the group
 * totals; writes cr[0], cr[3], cr[6], cr[9], ci[2], ci[5], ci[8], ci[11]. */
E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
{
E Td, Tu, T1U, T1X;
Td = T1 + Tc;
Tu = Ti + Tt;
Tv = Td + Tu;
T1N = Td - Tu;
T1U = T1S + T1T;
T1X = T1V + T1W;
T1Y = T1U + T1X;
T20 = T1X - T1U;
}
{
E TM, T13, T1O, T1P;
TM = TA + TL;
T13 = TR + T12;
T14 = TM + T13;
T1Z = TM - T13;
T1O = T1y + T1z;
T1P = T1E + T1F;
T1Q = T1O - T1P;
T1R = T1O + T1P;
}
ci[WS(rs, 5)] = Tv - T14;
cr[WS(rs, 9)] = T1Z - T20;
ci[WS(rs, 8)] = T1Z + T20;
cr[0] = Tv + T14;
cr[WS(rs, 3)] = T1N - T1Q;
cr[WS(rs, 6)] = T1R - T1Y;
ci[WS(rs, 11)] = T1R + T1Y;
ci[WS(rs, 2)] = T1N + T1Q;
}
{
/* Second output group: uses the "half-sum minus scaled difference" terms;
 * writes cr[2], ci[3], cr[8], ci[9], ci[0], cr[5], cr[11], ci[6]. */
E T1f, T1x, T28, T2a, T1q, T21, T1I, T29;
{
E T19, T1e, T24, T27;
T19 = T15 - T18;
T1e = T1a - T1d;
T1f = T19 + T1e;
T1x = T19 - T1e;
T24 = T22 - T23;
T27 = T25 - T26;
T28 = T24 - T27;
T2a = T27 + T24;
}
{
E T1k, T1p, T1C, T1H;
T1k = T1g - T1j;
T1p = T1l - T1o;
T1q = T1k + T1p;
T21 = T1p - T1k;
T1C = T1A - T1B;
T1H = T1D - T1G;
T1I = T1C + T1H;
T29 = T1H - T1C;
}
cr[WS(rs, 2)] = T1f - T1q;
cr[WS(rs, 8)] = T29 - T2a;
ci[WS(rs, 9)] = T29 + T2a;
ci[WS(rs, 3)] = T1f + T1q;
ci[0] = T1x - T1I;
cr[WS(rs, 11)] = T21 - T28;
ci[WS(rs, 6)] = T21 + T28;
cr[WS(rs, 5)] = T1x + T1I;
}
{
/* Third output group: uses the "half-sum plus scaled difference" terms;
 * writes ci[1], cr[1], cr[4], ci[4], cr[7], ci[7], ci[10], cr[10]. */
E T1t, T1J, T2e, T2g, T1w, T2b, T1M, T2f;
{
E T1r, T1s, T2c, T2d;
T1r = T15 + T18;
T1s = T1a + T1d;
T1t = T1r + T1s;
T1J = T1r - T1s;
T2c = T23 + T22;
T2d = T26 + T25;
T2e = T2c - T2d;
T2g = T2d + T2c;
}
{
E T1u, T1v, T1K, T1L;
T1u = T1g + T1j;
T1v = T1l + T1o;
T1w = T1u + T1v;
T2b = T1v - T1u;
T1K = T1B + T1A;
T1L = T1D + T1G;
T1M = T1K - T1L;
T2f = T1K + T1L;
}
ci[WS(rs, 1)] = T1t - T1w;
cr[WS(rs, 1)] = T1J + T1M;
cr[WS(rs, 4)] = T1t + T1w;
ci[WS(rs, 4)] = T1J - T1M;
cr[WS(rs, 7)] = T2b - T2e;
ci[WS(rs, 7)] = T2f + T2g;
ci[WS(rs, 10)] = T2b + T2e;
cr[WS(rs, 10)] = T2f - T2g;
}
}
}
}
/*
 * Twiddle description handed to the planner through desc: one TW_FULL entry
 * with arguments (1, 12) followed by a TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file; TW_NEXT presumably closes the list -- confirm against their header.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 12 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA variant of the size-12 codelet: 12, the name
 * "hf_12", the twiddle table above, the GENUS object, and a four-number
 * tuple whose first three entries (88, 30, 30) equal the addition,
 * multiplication and fused multiply/add counts quoted in the generator
 * comment at the top of this #else branch.
 */
static const hc2hc_desc desc = { 12, "hf_12", twinstr, &GENUS, { 88, 30, 30, 0 } };
/* Registers hf_12 together with its descriptor on planner p. */
void X(codelet_hf_12) (planner *p) {
X(khc2hc_register) (p, hf_12, &desc);
}
#endif

View File

@@ -0,0 +1,816 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:13 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include rdft/scalar/hf.h */
/*
* This function contains 184 FP additions, 140 FP multiplications,
* (or, 72 additions, 28 multiplications, 112 fused multiply/add),
* 51 stack variables, 6 constants, and 60 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_15, FMA-preferring variant: generated size-15 decimation-in-time
 * twiddle codelet for the real-input (hc2hc) FFT (genfft was invoked with
 * -fma -n 15 -dit).
 *
 * cr, ci: in/out arrays.  Every iteration reads cr[WS(rs, k)] and
 *         ci[WS(rs, k)] for k = 0..14 and overwrites those same 30 slots.
 * W:      twiddle table; 28 values are consumed per iteration, input k
 *         (k = 1..14) being combined with W[2k-2] and W[2k-1].
 * rs:     stride argument forwarded to WS() to locate element k.
 * mb, me: the loop runs for m in [mb, me).
 * ms:     per-iteration step; cr advances by ms while ci retreats by ms.
 */
static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: sin(72 deg), sqrt(5)/4, 1/4, (sqrt(5)-1)/2, sqrt(3)/2, 1/2.
 * NOTE(review): DK is defined elsewhere; presumably it declares a named constant. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* W is first moved to the twiddles of iteration mb ((mb - 1) * 28 values in),
 * then advances by 28 per iteration.  MAKE_VOLATILE_STRIDE is an opaque
 * macro defined elsewhere (NOTE(review): confirm its purpose). */
for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
E T1, T3i, T1G, T3l, Te, T1B, T3j, T3k, T1y, T2i, T2a, T2M, T37, T2Y, Tz;
E T2e, T1O, T2t, T39, T2U, TT, T2f, T1V, T2z, T3a, T2V, T1e, T2h, T23, T2G;
E T36, T2X;
{
/* Group {0, 5, 10}: input 0 is untwiddled; inputs 5 and 10 are twiddled as
 * x = W[2k-2]*cr + W[2k-1]*ci, y = W[2k-2]*ci - W[2k-1]*cr (the same
 * pattern is used for every twiddled input below).  The block leaves the
 * pair sums/differences and the base minus half the pair sum. */
E T7, T1D, Td, T1F;
T1 = cr[0];
T3i = ci[0];
{
E T3, T6, T4, T1C, T2, T5;
T3 = cr[WS(rs, 5)];
T6 = ci[WS(rs, 5)];
T2 = W[8];
T4 = T2 * T3;
T1C = T2 * T6;
T5 = W[9];
T7 = FMA(T5, T6, T4);
T1D = FNMS(T5, T3, T1C);
}
{
E T9, Tc, Ta, T1E, T8, Tb;
T9 = cr[WS(rs, 10)];
Tc = ci[WS(rs, 10)];
T8 = W[18];
Ta = T8 * T9;
T1E = T8 * Tc;
Tb = W[19];
Td = FMA(Tb, Tc, Ta);
T1F = FNMS(Tb, T9, T1E);
}
T1G = T1D - T1F;
T3l = Td - T7;
Te = T7 + Td;
T1B = FNMS(KP500000000, Te, T1);
T3j = T1D + T1F;
T3k = FNMS(KP500000000, T3j, T3i);
}
{
/* Group {9, 14, 4}: input 9 is the base; inputs 14 and 4 form the pair.
 * Leaves the group total (T1y, T37) and base - pair_sum/2 -/+ sqrt(3)/2 *
 * pair_difference (T2a, T2i, T2M, T2Y). */
E T1k, T2I, T1w, T28, T1q, T26;
{
E T1g, T1j, T1h, T2H, T1f, T1i;
T1g = cr[WS(rs, 9)];
T1j = ci[WS(rs, 9)];
T1f = W[16];
T1h = T1f * T1g;
T2H = T1f * T1j;
T1i = W[17];
T1k = FMA(T1i, T1j, T1h);
T2I = FNMS(T1i, T1g, T2H);
}
{
E T1s, T1v, T1t, T27, T1r, T1u;
T1s = cr[WS(rs, 4)];
T1v = ci[WS(rs, 4)];
T1r = W[6];
T1t = T1r * T1s;
T27 = T1r * T1v;
T1u = W[7];
T1w = FMA(T1u, T1v, T1t);
T28 = FNMS(T1u, T1s, T27);
}
{
E T1m, T1p, T1n, T25, T1l, T1o;
T1m = cr[WS(rs, 14)];
T1p = ci[WS(rs, 14)];
T1l = W[26];
T1n = T1l * T1m;
T25 = T1l * T1p;
T1o = W[27];
T1q = FMA(T1o, T1p, T1n);
T26 = FNMS(T1o, T1m, T25);
}
{
E T29, T1x, T24, T2L, T2J, T2K;
T29 = T26 - T28;
T1x = T1q + T1w;
T24 = FNMS(KP500000000, T1x, T1k);
T1y = T1k + T1x;
T2i = FMA(KP866025403, T29, T24);
T2a = FNMS(KP866025403, T29, T24);
T2L = T1q - T1w;
T2J = T26 + T28;
T2K = FNMS(KP500000000, T2J, T2I);
T2M = FNMS(KP866025403, T2L, T2K);
T37 = T2I + T2J;
T2Y = FMA(KP866025403, T2L, T2K);
}
}
{
/* Group {3, 8, 13}: input 3 is the base; inputs 8 and 13 form the pair.
 * Leaves Tz, T39, T1O, T2e, T2t, T2U. */
E Tl, T2p, Tx, T1M, Tr, T1K;
{
E Th, Tk, Ti, T2o, Tg, Tj;
Th = cr[WS(rs, 3)];
Tk = ci[WS(rs, 3)];
Tg = W[4];
Ti = Tg * Th;
T2o = Tg * Tk;
Tj = W[5];
Tl = FMA(Tj, Tk, Ti);
T2p = FNMS(Tj, Th, T2o);
}
{
E Tt, Tw, Tu, T1L, Ts, Tv;
Tt = cr[WS(rs, 13)];
Tw = ci[WS(rs, 13)];
Ts = W[24];
Tu = Ts * Tt;
T1L = Ts * Tw;
Tv = W[25];
Tx = FMA(Tv, Tw, Tu);
T1M = FNMS(Tv, Tt, T1L);
}
{
E Tn, Tq, To, T1J, Tm, Tp;
Tn = cr[WS(rs, 8)];
Tq = ci[WS(rs, 8)];
Tm = W[14];
To = Tm * Tn;
T1J = Tm * Tq;
Tp = W[15];
Tr = FMA(Tp, Tq, To);
T1K = FNMS(Tp, Tn, T1J);
}
{
E T1N, Ty, T1I, T2s, T2q, T2r;
T1N = T1K - T1M;
Ty = Tr + Tx;
T1I = FNMS(KP500000000, Ty, Tl);
Tz = Tl + Ty;
T2e = FMA(KP866025403, T1N, T1I);
T1O = FNMS(KP866025403, T1N, T1I);
T2s = Tr - Tx;
T2q = T1K + T1M;
T2r = FNMS(KP500000000, T2q, T2p);
T2t = FNMS(KP866025403, T2s, T2r);
T39 = T2p + T2q;
T2U = FMA(KP866025403, T2s, T2r);
}
}
{
/* Group {12, 2, 7}: input 12 is the base; inputs 2 and 7 form the pair.
 * Leaves TT, T3a, T1V, T2f, T2z, T2V. */
E TF, T2v, TR, T1T, TL, T1R;
{
E TB, TE, TC, T2u, TA, TD;
TB = cr[WS(rs, 12)];
TE = ci[WS(rs, 12)];
TA = W[22];
TC = TA * TB;
T2u = TA * TE;
TD = W[23];
TF = FMA(TD, TE, TC);
T2v = FNMS(TD, TB, T2u);
}
{
E TN, TQ, TO, T1S, TM, TP;
TN = cr[WS(rs, 7)];
TQ = ci[WS(rs, 7)];
TM = W[12];
TO = TM * TN;
T1S = TM * TQ;
TP = W[13];
TR = FMA(TP, TQ, TO);
T1T = FNMS(TP, TN, T1S);
}
{
E TH, TK, TI, T1Q, TG, TJ;
TH = cr[WS(rs, 2)];
TK = ci[WS(rs, 2)];
TG = W[2];
TI = TG * TH;
T1Q = TG * TK;
TJ = W[3];
TL = FMA(TJ, TK, TI);
T1R = FNMS(TJ, TH, T1Q);
}
{
E T1U, TS, T1P, T2y, T2w, T2x;
T1U = T1R - T1T;
TS = TL + TR;
T1P = FNMS(KP500000000, TS, TF);
TT = TF + TS;
T2f = FMA(KP866025403, T1U, T1P);
T1V = FNMS(KP866025403, T1U, T1P);
T2y = TL - TR;
T2w = T1R + T1T;
T2x = FNMS(KP500000000, T2w, T2v);
T2z = FNMS(KP866025403, T2y, T2x);
T3a = T2v + T2w;
T2V = FMA(KP866025403, T2y, T2x);
}
}
{
/* Group {6, 11, 1}: input 6 is the base; inputs 11 and 1 form the pair.
 * Leaves T1e, T36, T23, T2h, T2G, T2X. */
E T10, T2C, T1c, T21, T16, T1Z;
{
E TW, TZ, TX, T2B, TV, TY;
TW = cr[WS(rs, 6)];
TZ = ci[WS(rs, 6)];
TV = W[10];
TX = TV * TW;
T2B = TV * TZ;
TY = W[11];
T10 = FMA(TY, TZ, TX);
T2C = FNMS(TY, TW, T2B);
}
{
E T18, T1b, T19, T20, T17, T1a;
T18 = cr[WS(rs, 1)];
T1b = ci[WS(rs, 1)];
T17 = W[0];
T19 = T17 * T18;
T20 = T17 * T1b;
T1a = W[1];
T1c = FMA(T1a, T1b, T19);
T21 = FNMS(T1a, T18, T20);
}
{
E T12, T15, T13, T1Y, T11, T14;
T12 = cr[WS(rs, 11)];
T15 = ci[WS(rs, 11)];
T11 = W[20];
T13 = T11 * T12;
T1Y = T11 * T15;
T14 = W[21];
T16 = FMA(T14, T15, T13);
T1Z = FNMS(T14, T12, T1Y);
}
{
E T22, T1d, T1X, T2F, T2D, T2E;
T22 = T1Z - T21;
T1d = T16 + T1c;
T1X = FNMS(KP500000000, T1d, T10);
T1e = T10 + T1d;
T2h = FMA(KP866025403, T22, T1X);
T23 = FNMS(KP866025403, T22, T1X);
T2F = T16 - T1c;
T2D = T1Z + T21;
T2E = FNMS(KP500000000, T2D, T2C);
T2G = FNMS(KP866025403, T2F, T2E);
T36 = T2C + T2D;
T2X = FMA(KP866025403, T2F, T2E);
}
}
/* Output stage: six blocks, each combining one value from the {0, 5, 10}
 * group with four values from the other groups and writing five outputs.
 * In each block the total is scaled by 1/4 (KP250000000), a difference by
 * KP559016994, and a cross term by KP618033988 and KP951056516. */
{
/* Writes cr[0], ci[5], cr[6], ci[2], cr[3]. */
E T3c, T3e, Tf, T1A, T33, T34, T3d, T35;
{
E T38, T3b, TU, T1z;
T38 = T36 - T37;
T3b = T39 - T3a;
T3c = FNMS(KP618033988, T3b, T38);
T3e = FMA(KP618033988, T38, T3b);
Tf = T1 + Te;
TU = Tz + TT;
T1z = T1e + T1y;
T1A = TU + T1z;
T33 = FNMS(KP250000000, T1A, Tf);
T34 = TU - T1z;
}
cr[0] = Tf + T1A;
T3d = FMA(KP559016994, T34, T33);
ci[WS(rs, 5)] = FNMS(KP951056516, T3e, T3d);
cr[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
T35 = FNMS(KP559016994, T34, T33);
ci[WS(rs, 2)] = FNMS(KP951056516, T3c, T35);
cr[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
}
{
/* Writes cr[5], cr[2], ci[6], ci[0], ci[3]. */
E T30, T32, T1H, T2c, T2R, T2S, T31, T2T;
{
E T2W, T2Z, T1W, T2b;
T2W = T2U - T2V;
T2Z = T2X - T2Y;
T30 = FMA(KP618033988, T2Z, T2W);
T32 = FNMS(KP618033988, T2W, T2Z);
T1H = FNMS(KP866025403, T1G, T1B);
T1W = T1O + T1V;
T2b = T23 + T2a;
T2c = T1W + T2b;
T2R = FNMS(KP250000000, T2c, T1H);
T2S = T1W - T2b;
}
cr[WS(rs, 5)] = T1H + T2c;
T31 = FNMS(KP559016994, T2S, T2R);
cr[WS(rs, 2)] = FNMS(KP951056516, T32, T31);
ci[WS(rs, 6)] = FMA(KP951056516, T32, T31);
T2T = FMA(KP559016994, T2S, T2R);
ci[0] = FNMS(KP951056516, T30, T2T);
ci[WS(rs, 3)] = FMA(KP951056516, T30, T2T);
}
{
/* Writes ci[4], cr[4], cr[1], cr[7], ci[1]. */
E T2O, T2Q, T2d, T2k, T2l, T2m, T2n, T2P;
{
E T2A, T2N, T2g, T2j;
T2A = T2t - T2z;
T2N = T2G - T2M;
T2O = FMA(KP618033988, T2N, T2A);
T2Q = FNMS(KP618033988, T2A, T2N);
T2d = FMA(KP866025403, T1G, T1B);
T2g = T2e + T2f;
T2j = T2h + T2i;
T2k = T2g + T2j;
T2l = FNMS(KP250000000, T2k, T2d);
T2m = T2g - T2j;
}
ci[WS(rs, 4)] = T2d + T2k;
T2n = FMA(KP559016994, T2m, T2l);
cr[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
cr[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
T2P = FNMS(KP559016994, T2m, T2l);
cr[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
ci[WS(rs, 1)] = FMA(KP951056516, T2Q, T2P);
}
{
/* Writes cr[10], ci[10], ci[13], cr[13], ci[7]. */
E T3s, T3u, T3m, T3h, T3n, T3o, T3t, T3p;
{
E T3q, T3r, T3f, T3g;
T3q = T2h - T2i;
T3r = T2e - T2f;
T3s = FNMS(KP618033988, T3r, T3q);
T3u = FMA(KP618033988, T3q, T3r);
T3m = FMA(KP866025403, T3l, T3k);
T3f = T2t + T2z;
T3g = T2G + T2M;
T3h = T3f + T3g;
T3n = FNMS(KP250000000, T3h, T3m);
T3o = T3f - T3g;
}
cr[WS(rs, 10)] = -(T3h + T3m);
T3t = FMA(KP559016994, T3o, T3n);
ci[WS(rs, 10)] = FMA(KP951056516, T3u, T3t);
ci[WS(rs, 13)] = FNMS(KP951056516, T3u, T3t);
T3p = FNMS(KP559016994, T3o, T3n);
cr[WS(rs, 13)] = FMS(KP951056516, T3s, T3p);
ci[WS(rs, 7)] = FMA(KP951056516, T3s, T3p);
}
{
/* Writes ci[14], cr[12], ci[11], cr[9], ci[8]. */
E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N;
{
E T3O, T3P, T3I, T3J;
T3O = TT - Tz;
T3P = T1y - T1e;
T3Q = FMA(KP618033988, T3P, T3O);
T3S = FNMS(KP618033988, T3O, T3P);
T3H = T3j + T3i;
T3I = T39 + T3a;
T3J = T36 + T37;
T3K = T3I + T3J;
T3L = FNMS(KP250000000, T3K, T3H);
T3M = T3I - T3J;
}
ci[WS(rs, 14)] = T3K + T3H;
T3R = FNMS(KP559016994, T3M, T3L);
cr[WS(rs, 12)] = FMS(KP951056516, T3S, T3R);
ci[WS(rs, 11)] = FMA(KP951056516, T3S, T3R);
T3N = FMA(KP559016994, T3M, T3L);
cr[WS(rs, 9)] = FMS(KP951056516, T3Q, T3N);
ci[WS(rs, 8)] = FMA(KP951056516, T3Q, T3N);
}
{
/* Writes ci[9], cr[8], ci[12], cr[11], cr[14]. */
E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B;
{
E T3C, T3D, T3w, T3x;
T3C = T1O - T1V;
T3D = T23 - T2a;
T3E = FMA(KP618033988, T3D, T3C);
T3G = FNMS(KP618033988, T3C, T3D);
T3v = FNMS(KP866025403, T3l, T3k);
T3w = T2U + T2V;
T3x = T2X + T2Y;
T3y = T3w + T3x;
T3z = FNMS(KP250000000, T3y, T3v);
T3A = T3x - T3w;
}
ci[WS(rs, 9)] = T3y + T3v;
T3F = FMA(KP559016994, T3A, T3z);
cr[WS(rs, 8)] = FMS(KP951056516, T3G, T3F);
ci[WS(rs, 12)] = FMA(KP951056516, T3G, T3F);
T3B = FNMS(KP559016994, T3A, T3z);
cr[WS(rs, 11)] = FMS(KP951056516, T3E, T3B);
cr[WS(rs, 14)] = -(FMA(KP951056516, T3E, T3B));
}
}
}
}
/*
 * Twiddle description handed to the planner through desc: one TW_FULL entry
 * with arguments (1, 15) followed by a TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file; TW_NEXT presumably closes the list -- confirm against their header.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA variant of the size-15 codelet: 15, the name
 * "hf_15", the twiddle table above, the GENUS object, and a four-number
 * tuple whose first three entries (72, 28, 112) equal the addition,
 * multiplication and fused multiply/add counts quoted in the generator
 * comment at the top of this #if branch.
 */
static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, { 72, 28, 112, 0 } };
/* Registers hf_15 together with its descriptor on planner p. */
void X(codelet_hf_15) (planner *p) {
X(khc2hc_register) (p, hf_15, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include rdft/scalar/hf.h */
/*
* This function contains 184 FP additions, 112 FP multiplications,
* (or, 128 additions, 56 multiplications, 56 fused multiply/add),
* 65 stack variables, 6 constants, and 60 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_15, non-FMA variant: generated size-15 decimation-in-time twiddle
 * codelet for the real-input (hc2hc) FFT (genfft was invoked with
 * -n 15 -dit and without -fma).
 *
 * cr, ci: in/out arrays.  Every iteration reads cr[WS(rs, k)] and
 *         ci[WS(rs, k)] for k = 0..14 and overwrites those same 30 slots.
 * W:      twiddle table; 28 values are consumed per iteration, input k
 *         (k = 1..14) being combined with W[2k-2] and W[2k-1].
 * rs:     stride argument forwarded to WS() to locate element k.
 * mb, me: the loop runs for m in [mb, me).
 * ms:     per-iteration step; cr advances by ms while ci retreats by ms.
 */
static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: sin(36 deg), sin(72 deg), 1/4, sqrt(5)/4, 1/2, sqrt(3)/2.
 * NOTE(review): DK is defined elsewhere; presumably it declares a named constant. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* W is first moved to the twiddles of iteration mb ((mb - 1) * 28 values in),
 * then advances by 28 per iteration.  MAKE_VOLATILE_STRIDE is an opaque
 * macro defined elsewhere (NOTE(review): confirm its purpose). */
for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
E T1q, T2Q, Td, T1n, T2T, T3l, T13, T1k, T1l, T2E, T2F, T3j, T1H, T1T, T2k;
E T2w, T2f, T2v, T1M, T1U, Tu, TL, TM, T2H, T2I, T3i, T1w, T1Q, T29, T2t;
E T24, T2s, T1B, T1R;
{
/* Group {0, 5, 10}: input 0 is untwiddled; inputs 5 and 10 are twiddled as
 * x = W[2k-2]*cr + W[2k-1]*ci, y = W[2k-2]*ci - W[2k-1]*cr (the same
 * pattern is used for every twiddled input below).  Leaves the group
 * totals (Td, T3l), base minus half the pair sum (T1n, T2T) and sqrt(3)/2
 * times the pair differences (T1q, T2Q). */
E T1, T2R, T6, T1o, Tb, T1p, Tc, T2S;
T1 = cr[0];
T2R = ci[0];
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 5)];
T5 = ci[WS(rs, 5)];
T2 = W[8];
T4 = W[9];
T6 = FMA(T2, T3, T4 * T5);
T1o = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = cr[WS(rs, 10)];
Ta = ci[WS(rs, 10)];
T7 = W[18];
T9 = W[19];
Tb = FMA(T7, T8, T9 * Ta);
T1p = FNMS(T9, T8, T7 * Ta);
}
T1q = KP866025403 * (T1o - T1p);
T2Q = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
Td = T1 + Tc;
T1n = FNMS(KP500000000, Tc, T1);
T2S = T1o + T1p;
T2T = FNMS(KP500000000, T2S, T2R);
T3l = T2S + T2R;
}
{
/* Groups {6, 11, 1} and {9, 14, 4}: inputs 6 and 9 are the bases, the
 * other two inputs of each group form the pair. */
E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
E T2i;
{
E TO, TQ, TN, TP;
TO = cr[WS(rs, 6)];
TQ = ci[WS(rs, 6)];
TN = W[10];
TP = W[11];
TR = FMA(TN, TO, TP * TQ);
T2c = FNMS(TP, TO, TN * TQ);
}
{
E T15, T17, T14, T16;
T15 = cr[WS(rs, 9)];
T17 = ci[WS(rs, 9)];
T14 = W[16];
T16 = W[17];
T18 = FMA(T14, T15, T16 * T17);
T2h = FNMS(T16, T15, T14 * T17);
}
{
E TT, TV, TS, TU;
TT = cr[WS(rs, 11)];
TV = ci[WS(rs, 11)];
TS = W[20];
TU = W[21];
TW = FMA(TS, TT, TU * TV);
T1E = FNMS(TU, TT, TS * TV);
}
{
E TY, T10, TX, TZ;
TY = cr[WS(rs, 1)];
T10 = ci[WS(rs, 1)];
TX = W[0];
TZ = W[1];
T11 = FMA(TX, TY, TZ * T10);
T1F = FNMS(TZ, TY, TX * T10);
}
T12 = TW + T11;
T2d = T1E + T1F;
{
E T1a, T1c, T19, T1b;
T1a = cr[WS(rs, 14)];
T1c = ci[WS(rs, 14)];
T19 = W[26];
T1b = W[27];
T1d = FMA(T19, T1a, T1b * T1c);
T1J = FNMS(T1b, T1a, T19 * T1c);
}
{
E T1f, T1h, T1e, T1g;
T1f = cr[WS(rs, 4)];
T1h = ci[WS(rs, 4)];
T1e = W[6];
T1g = W[7];
T1i = FMA(T1e, T1f, T1g * T1h);
T1K = FNMS(T1g, T1f, T1e * T1h);
}
T1j = T1d + T1i;
T2i = T1J + T1K;
{
/* Group totals (T13, T1k, T2E, T2F and their sums T1l, T3j), then the
 * half-sum and sqrt(3)/2-scaled difference combinations. */
E T1D, T1G, T2g, T2j;
T13 = TR + T12;
T1k = T18 + T1j;
T1l = T13 + T1k;
T2E = T2c + T2d;
T2F = T2h + T2i;
T3j = T2E + T2F;
T1D = FNMS(KP500000000, T12, TR);
T1G = KP866025403 * (T1E - T1F);
T1H = T1D - T1G;
T1T = T1D + T1G;
T2g = KP866025403 * (T1d - T1i);
T2j = FNMS(KP500000000, T2i, T2h);
T2k = T2g - T2j;
T2w = T2g + T2j;
{
E T2b, T2e, T1I, T1L;
T2b = KP866025403 * (T11 - TW);
T2e = FNMS(KP500000000, T2d, T2c);
T2f = T2b + T2e;
T2v = T2e - T2b;
T1I = FNMS(KP500000000, T1j, T18);
T1L = KP866025403 * (T1J - T1K);
T1M = T1I - T1L;
T1U = T1I + T1L;
}
}
}
{
/* Groups {3, 8, 13} and {12, 2, 7}: inputs 3 and 12 are the bases, the
 * other two inputs of each group form the pair. */
E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
E T27;
{
E Tf, Th, Te, Tg;
Tf = cr[WS(rs, 3)];
Th = ci[WS(rs, 3)];
Te = W[4];
Tg = W[5];
Ti = FMA(Te, Tf, Tg * Th);
T21 = FNMS(Tg, Tf, Te * Th);
}
{
E Tw, Ty, Tv, Tx;
Tw = cr[WS(rs, 12)];
Ty = ci[WS(rs, 12)];
Tv = W[22];
Tx = W[23];
Tz = FMA(Tv, Tw, Tx * Ty);
T26 = FNMS(Tx, Tw, Tv * Ty);
}
{
E Tk, Tm, Tj, Tl;
Tk = cr[WS(rs, 8)];
Tm = ci[WS(rs, 8)];
Tj = W[14];
Tl = W[15];
Tn = FMA(Tj, Tk, Tl * Tm);
T1t = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = cr[WS(rs, 13)];
Tr = ci[WS(rs, 13)];
To = W[24];
Tq = W[25];
Ts = FMA(To, Tp, Tq * Tr);
T1u = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn + Ts;
T22 = T1t + T1u;
{
E TB, TD, TA, TC;
TB = cr[WS(rs, 2)];
TD = ci[WS(rs, 2)];
TA = W[2];
TC = W[3];
TE = FMA(TA, TB, TC * TD);
T1y = FNMS(TC, TB, TA * TD);
}
{
E TG, TI, TF, TH;
TG = cr[WS(rs, 7)];
TI = ci[WS(rs, 7)];
TF = W[12];
TH = W[13];
TJ = FMA(TF, TG, TH * TI);
T1z = FNMS(TH, TG, TF * TI);
}
TK = TE + TJ;
T27 = T1y + T1z;
{
/* Group totals (Tu, TL, T2H, T2I and their sums TM, T3i), then the
 * half-sum and sqrt(3)/2-scaled difference combinations. */
E T1s, T1v, T25, T28;
Tu = Ti + Tt;
TL = Tz + TK;
TM = Tu + TL;
T2H = T21 + T22;
T2I = T26 + T27;
T3i = T2H + T2I;
T1s = FNMS(KP500000000, Tt, Ti);
T1v = KP866025403 * (T1t - T1u);
T1w = T1s - T1v;
T1Q = T1s + T1v;
T25 = KP866025403 * (TJ - TE);
T28 = FNMS(KP500000000, T27, T26);
T29 = T25 + T28;
T2t = T28 - T25;
{
E T20, T23, T1x, T1A;
T20 = KP866025403 * (Ts - Tn);
T23 = FNMS(KP500000000, T22, T21);
T24 = T20 + T23;
T2s = T23 - T20;
T1x = FNMS(KP500000000, TK, Tz);
T1A = KP866025403 * (T1y - T1z);
T1B = T1x - T1A;
T1R = T1x + T1A;
}
}
}
/* Output stage: six blocks, each writing five outputs.  Each block scales
 * a total by 1/4 (KP250000000) and a difference by KP559016994, and mixes
 * two further differences with KP951056516 and KP587785252. */
{
/* Writes cr[0], ci[5], cr[6], ci[2], cr[3]. */
E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
T2C = KP559016994 * (TM - T1l);
T1m = TM + T1l;
T2B = FNMS(KP250000000, T1m, Td);
T2G = T2E - T2F;
T2J = T2H - T2I;
T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
cr[0] = Td + T1m;
T2L = T2C + T2B;
ci[WS(rs, 5)] = T2L - T2M;
cr[WS(rs, 6)] = T2L + T2M;
T2D = T2B - T2C;
ci[WS(rs, 2)] = T2D - T2K;
cr[WS(rs, 3)] = T2D + T2K;
}
{
/* Writes ci[14], cr[12], ci[11], cr[9], ci[8]. */
E T3k, T3m, T3n, T3h, T3p, T3f, T3g, T3q, T3o;
T3k = KP559016994 * (T3i - T3j);
T3m = T3i + T3j;
T3n = FNMS(KP250000000, T3m, T3l);
T3f = T1k - T13;
T3g = Tu - TL;
T3h = FNMS(KP951056516, T3g, KP587785252 * T3f);
T3p = FMA(KP587785252, T3g, KP951056516 * T3f);
ci[WS(rs, 14)] = T3m + T3l;
T3q = T3n - T3k;
cr[WS(rs, 12)] = T3p - T3q;
ci[WS(rs, 11)] = T3p + T3q;
T3o = T3k + T3n;
cr[WS(rs, 9)] = T3h - T3o;
ci[WS(rs, 8)] = T3h + T3o;
}
{
/* Writes cr[5], cr[2], ci[6], ci[0], ci[3]. */
E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
{
E T2u, T2x, T1C, T1N;
T2u = T2s - T2t;
T2x = T2v - T2w;
T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
T2A = FNMS(KP587785252, T2u, KP951056516 * T2x);
T1r = T1n - T1q;
T1C = T1w + T1B;
T1N = T1H + T1M;
T1O = T1C + T1N;
T2p = KP559016994 * (T1C - T1N);
T2q = FNMS(KP250000000, T1O, T1r);
}
cr[WS(rs, 5)] = T1r + T1O;
T2z = T2q - T2p;
cr[WS(rs, 2)] = T2z - T2A;
ci[WS(rs, 6)] = T2z + T2A;
T2r = T2p + T2q;
ci[0] = T2r - T2y;
ci[WS(rs, 3)] = T2r + T2y;
}
{
/* Writes ci[9], cr[8], ci[12], cr[11], cr[14]. */
E T35, T3d, T39, T3a, T38, T3b, T3e, T3c;
{
E T33, T34, T36, T37;
T33 = T1w - T1B;
T34 = T1H - T1M;
T35 = FMA(KP951056516, T33, KP587785252 * T34);
T3d = FNMS(KP587785252, T33, KP951056516 * T34);
T39 = T2T - T2Q;
T36 = T2v + T2w;
T37 = T2s + T2t;
T3a = T37 + T36;
T38 = KP559016994 * (T36 - T37);
T3b = FNMS(KP250000000, T3a, T39);
}
ci[WS(rs, 9)] = T3a + T39;
T3e = T38 + T3b;
cr[WS(rs, 8)] = T3d - T3e;
ci[WS(rs, 12)] = T3d + T3e;
T3c = T38 - T3b;
cr[WS(rs, 11)] = T35 + T3c;
cr[WS(rs, 14)] = T3c - T35;
}
{
/* Writes cr[10], ci[10], ci[13], cr[13], ci[7]. */
E T2X, T31, T2U, T2P, T2Y, T2Z, T32, T30;
{
E T2V, T2W, T2N, T2O;
T2V = T1T - T1U;
T2W = T1Q - T1R;
T2X = FNMS(KP587785252, T2W, KP951056516 * T2V);
T31 = FMA(KP951056516, T2W, KP587785252 * T2V);
T2U = T2Q + T2T;
T2N = T2k - T2f;
T2O = T24 + T29;
T2P = T2N - T2O;
T2Y = FMA(KP250000000, T2P, T2U);
T2Z = KP559016994 * (T2O + T2N);
}
cr[WS(rs, 10)] = T2P - T2U;
T32 = T2Z + T2Y;
ci[WS(rs, 10)] = T31 + T32;
ci[WS(rs, 13)] = T32 - T31;
T30 = T2Y - T2Z;
cr[WS(rs, 13)] = T2X - T30;
ci[WS(rs, 7)] = T2X + T30;
}
{
/* Writes ci[4], cr[4], cr[1], cr[7], ci[1]. */
E T2m, T2o, T1P, T1W, T1X, T1Y, T1Z, T2n;
{
E T2a, T2l, T1S, T1V;
T2a = T24 - T29;
T2l = T2f + T2k;
T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
T1P = T1n + T1q;
T1S = T1Q + T1R;
T1V = T1T + T1U;
T1W = T1S + T1V;
T1X = KP559016994 * (T1S - T1V);
T1Y = FNMS(KP250000000, T1W, T1P);
}
ci[WS(rs, 4)] = T1P + T1W;
T1Z = T1X + T1Y;
cr[WS(rs, 4)] = T1Z - T2m;
cr[WS(rs, 1)] = T1Z + T2m;
T2n = T1Y - T1X;
cr[WS(rs, 7)] = T2n - T2o;
ci[WS(rs, 1)] = T2n + T2o;
}
}
}
}
/*
 * Twiddle description handed to the planner through desc: one TW_FULL entry
 * with arguments (1, 15) followed by a TW_NEXT entry with arguments (1, 0).
 * NOTE(review): tw_instr, TW_FULL and TW_NEXT are declared outside this
 * file; TW_NEXT presumably closes the list -- confirm against their header.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA variant of the size-15 codelet: 15, the name
 * "hf_15", the twiddle table above, the GENUS object, and a four-number
 * tuple whose first three entries (128, 56, 56) equal the addition,
 * multiplication and fused multiply/add counts quoted in the generator
 * comment at the top of this #else branch.
 */
static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, { 128, 56, 56, 0 } };
/* Registers hf_15 together with its descriptor on planner p. */
void X(codelet_hf_15) (planner *p) {
X(khc2hc_register) (p, hf_15, &desc);
}
#endif

View File

@@ -0,0 +1,796 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:14 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hf_16 -include rdft/scalar/hf.h */
/*
* This function contains 174 FP additions, 100 FP multiplications,
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
* 60 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_16 (FMA variant): in-place radix-16 twiddle pass over cr/ci.
 *
 * cr, ci  - data arrays; each iteration reads and then overwrites the 16
 *           entries cr[WS(rs, 0..15)] and ci[WS(rs, 0..15)].  After every
 *           iteration cr advances by ms and ci moves back by ms.
 * W       - twiddle table; offset once by (mb - 1) * 30, then 30 values
 *           (W[0]..W[29]) are consumed per iteration.
 * rs      - stride handed to WS() to address the 16 entries.
 * mb, me  - the loop runs for m = mb while m < me.
 * ms      - per-iteration step of cr (+ms) and ci (-ms).
 *
 * All loads of an iteration are completed before the first store.
 *
 * NOTE(review): E, DK, WS, FMA/FNMS/FMS and MAKE_VOLATILE_STRIDE come from the
 * FFTW headers and are not visible here; the comments below assume
 * FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c.
 */
static void hf_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
E T8, T3A, T1I, T3o, T1s, T35, T2k, T2w, T1F, T36, T2p, T2r, Tl, T3z, T1N;
E T3k, Tz, T2W, T1P, T1U, T11, T30, T25, T2g, T1e, T31, T2a, T2h, TM, T2V;
E T1W, T21;
/* Entry 0 is used as loaded; entry 8 is combined with (W[14], W[15]). */
{
E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
T1 = cr[0];
T3n = ci[0];
T3 = cr[WS(rs, 8)];
T6 = ci[WS(rs, 8)];
T2 = W[14];
T4 = T2 * T3;
T3l = T2 * T6;
T5 = W[15];
T7 = FMA(T5, T6, T4);
T3m = FNMS(T5, T3, T3l);
T8 = T1 + T7;
T3A = T3n - T3m;
T1I = T1 - T7;
T3o = T3m + T3n;
}
/* Entries 15 and 7 with (W[28], W[29]) and (W[12], W[13]). */
{
E T1h, T1k, T1i, T2s, T1n, T1q, T1o, T2u, T1g, T1m;
T1h = cr[WS(rs, 15)];
T1k = ci[WS(rs, 15)];
T1g = W[28];
T1i = T1g * T1h;
T2s = T1g * T1k;
T1n = cr[WS(rs, 7)];
T1q = ci[WS(rs, 7)];
T1m = W[12];
T1o = T1m * T1n;
T2u = T1m * T1q;
{
E T1l, T2t, T1r, T2v, T1j, T1p;
T1j = W[29];
T1l = FMA(T1j, T1k, T1i);
T2t = FNMS(T1j, T1h, T2s);
T1p = W[13];
T1r = FMA(T1p, T1q, T1o);
T2v = FNMS(T1p, T1n, T2u);
T1s = T1l + T1r;
T35 = T2t + T2v;
T2k = T1l - T1r;
T2w = T2t - T2v;
}
}
/* Entries 3 and 11 with (W[4], W[5]) and (W[20], W[21]). */
{
E T1u, T1x, T1v, T2l, T1A, T1D, T1B, T2n, T1t, T1z;
T1u = cr[WS(rs, 3)];
T1x = ci[WS(rs, 3)];
T1t = W[4];
T1v = T1t * T1u;
T2l = T1t * T1x;
T1A = cr[WS(rs, 11)];
T1D = ci[WS(rs, 11)];
T1z = W[20];
T1B = T1z * T1A;
T2n = T1z * T1D;
{
E T1y, T2m, T1E, T2o, T1w, T1C;
T1w = W[5];
T1y = FMA(T1w, T1x, T1v);
T2m = FNMS(T1w, T1u, T2l);
T1C = W[21];
T1E = FMA(T1C, T1D, T1B);
T2o = FNMS(T1C, T1A, T2n);
T1F = T1y + T1E;
T36 = T2m + T2o;
T2p = T2m - T2o;
T2r = T1E - T1y;
}
}
/* Entries 4 and 12 with (W[6], W[7]) and (W[22], W[23]). */
{
E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
Ta = cr[WS(rs, 4)];
Td = ci[WS(rs, 4)];
T9 = W[6];
Tb = T9 * Ta;
T1J = T9 * Td;
Tg = cr[WS(rs, 12)];
Tj = ci[WS(rs, 12)];
Tf = W[22];
Th = Tf * Tg;
T1L = Tf * Tj;
{
E Te, T1K, Tk, T1M, Tc, Ti;
Tc = W[7];
Te = FMA(Tc, Td, Tb);
T1K = FNMS(Tc, Ta, T1J);
Ti = W[23];
Tk = FMA(Ti, Tj, Th);
T1M = FNMS(Ti, Tg, T1L);
Tl = Te + Tk;
T3z = Te - Tk;
T1N = T1K - T1M;
T3k = T1K + T1M;
}
}
/* Entries 2 and 10 with (W[2], W[3]) and (W[18], W[19]). */
{
E To, Tr, Tp, T1Q, Tu, Tx, Tv, T1S, Tn, Tt;
To = cr[WS(rs, 2)];
Tr = ci[WS(rs, 2)];
Tn = W[2];
Tp = Tn * To;
T1Q = Tn * Tr;
Tu = cr[WS(rs, 10)];
Tx = ci[WS(rs, 10)];
Tt = W[18];
Tv = Tt * Tu;
T1S = Tt * Tx;
{
E Ts, T1R, Ty, T1T, Tq, Tw;
Tq = W[3];
Ts = FMA(Tq, Tr, Tp);
T1R = FNMS(Tq, To, T1Q);
Tw = W[19];
Ty = FMA(Tw, Tx, Tv);
T1T = FNMS(Tw, Tu, T1S);
Tz = Ts + Ty;
T2W = T1R + T1T;
T1P = Ts - Ty;
T1U = T1R - T1T;
}
}
/* Entries 1 and 9 with (W[0], W[1]) and (W[16], W[17]). */
{
E TQ, TT, TR, T2c, TW, TZ, TX, T2e, TP, TV;
TQ = cr[WS(rs, 1)];
TT = ci[WS(rs, 1)];
TP = W[0];
TR = TP * TQ;
T2c = TP * TT;
TW = cr[WS(rs, 9)];
TZ = ci[WS(rs, 9)];
TV = W[16];
TX = TV * TW;
T2e = TV * TZ;
{
E TU, T2d, T10, T2f, TS, TY;
TS = W[1];
TU = FMA(TS, TT, TR);
T2d = FNMS(TS, TQ, T2c);
TY = W[17];
T10 = FMA(TY, TZ, TX);
T2f = FNMS(TY, TW, T2e);
T11 = TU + T10;
T30 = T2d + T2f;
T25 = TU - T10;
T2g = T2d - T2f;
}
}
/* Entries 5 and 13 with (W[8], W[9]) and (W[24], W[25]). */
{
E T13, T16, T14, T26, T19, T1c, T1a, T28, T12, T18;
T13 = cr[WS(rs, 5)];
T16 = ci[WS(rs, 5)];
T12 = W[8];
T14 = T12 * T13;
T26 = T12 * T16;
T19 = cr[WS(rs, 13)];
T1c = ci[WS(rs, 13)];
T18 = W[24];
T1a = T18 * T19;
T28 = T18 * T1c;
{
E T17, T27, T1d, T29, T15, T1b;
T15 = W[9];
T17 = FMA(T15, T16, T14);
T27 = FNMS(T15, T13, T26);
T1b = W[25];
T1d = FMA(T1b, T1c, T1a);
T29 = FNMS(T1b, T19, T28);
T1e = T17 + T1d;
T31 = T27 + T29;
T2a = T27 - T29;
T2h = T17 - T1d;
}
}
/* Entries 14 and 6 with (W[26], W[27]) and (W[10], W[11]). */
{
E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
TB = cr[WS(rs, 14)];
TE = ci[WS(rs, 14)];
TA = W[26];
TC = TA * TB;
T1X = TA * TE;
TH = cr[WS(rs, 6)];
TK = ci[WS(rs, 6)];
TG = W[10];
TI = TG * TH;
T1Z = TG * TK;
{
E TF, T1Y, TL, T20, TD, TJ;
TD = W[27];
TF = FMA(TD, TE, TC);
T1Y = FNMS(TD, TB, T1X);
TJ = W[11];
TL = FMA(TJ, TK, TI);
T20 = FNMS(TJ, TH, T1Z);
TM = TF + TL;
T2V = T1Y + T20;
T1W = TF - TL;
T21 = T1Y - T20;
}
}
/* First output group, built from sums and differences only:
 * ci[7], cr[12], ci[11], cr[0], cr[4], cr[8], ci[15], ci[3]. */
{
E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
{
E Tm, TN, T3j, T3p;
Tm = T8 + Tl;
TN = Tz + TM;
TO = Tm + TN;
T3e = Tm - TN;
T3j = T2W + T2V;
T3p = T3k + T3o;
T3q = T3j + T3p;
T3s = T3p - T3j;
}
{
E T1f, T1G, T3f, T3g;
T1f = T11 + T1e;
T1G = T1s + T1F;
T1H = T1f + T1G;
T3r = T1G - T1f;
T3f = T35 + T36;
T3g = T30 + T31;
T3h = T3f - T3g;
T3i = T3g + T3f;
}
ci[WS(rs, 7)] = TO - T1H;
cr[WS(rs, 12)] = T3r - T3s;
ci[WS(rs, 11)] = T3r + T3s;
cr[0] = TO + T1H;
cr[WS(rs, 4)] = T3e - T3h;
cr[WS(rs, 8)] = T3i - T3q;
ci[WS(rs, 15)] = T3i + T3q;
ci[WS(rs, 3)] = T3e + T3h;
}
/* Second output group, scaled by KP707106781 only:
 * ci[5], cr[2], cr[10], ci[13], cr[6], ci[1], cr[14], ci[9]. */
{
E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
{
E T2U, T2X, T3t, T3u;
T2U = T8 - Tl;
T2X = T2V - T2W;
T2Y = T2U - T2X;
T3a = T2U + T2X;
T3t = Tz - TM;
T3u = T3o - T3k;
T3v = T3t + T3u;
T3x = T3u - T3t;
}
{
E T2Z, T32, T34, T37;
T2Z = T11 - T1e;
T32 = T30 - T31;
T33 = T2Z + T32;
T3b = T2Z - T32;
T34 = T1s - T1F;
T37 = T35 - T36;
T38 = T34 - T37;
T3c = T34 + T37;
}
{
E T39, T3y, T3d, T3w;
T39 = T33 + T38;
ci[WS(rs, 5)] = FNMS(KP707106781, T39, T2Y);
cr[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
T3y = T3c - T3b;
cr[WS(rs, 10)] = FMS(KP707106781, T3y, T3x);
ci[WS(rs, 13)] = FMA(KP707106781, T3y, T3x);
T3d = T3b + T3c;
cr[WS(rs, 6)] = FNMS(KP707106781, T3d, T3a);
ci[WS(rs, 1)] = FMA(KP707106781, T3d, T3a);
T3w = T38 - T33;
cr[WS(rs, 14)] = FMS(KP707106781, T3w, T3v);
ci[WS(rs, 9)] = FMA(KP707106781, T3w, T3v);
}
}
/* Remaining 16 outputs, which use all three constants: cr[7], ci[0], cr[9],
 * ci[14], cr[13], ci[10], ci[4], cr[3], ci[6], cr[1], cr[15], ci[8], cr[11],
 * ci[12], cr[5], ci[2]. */
{
E T1O, T3B, T3H, T2E, T23, T3I, T2O, T2R, T2H, T3C, T2j, T2B, T2L, T2S, T2y;
E T2C;
{
E T1V, T22, T2b, T2i;
T1O = T1I - T1N;
T3B = T3z + T3A;
T3H = T3A - T3z;
T2E = T1I + T1N;
T1V = T1P - T1U;
T22 = T1W + T21;
T23 = T1V + T22;
T3I = T22 - T1V;
{
E T2M, T2N, T2F, T2G;
T2M = T2k + T2p;
T2N = T2w + T2r;
T2O = FNMS(KP414213562, T2N, T2M);
T2R = FMA(KP414213562, T2M, T2N);
T2F = T1P + T1U;
T2G = T1W - T21;
T2H = T2F + T2G;
T3C = T2F - T2G;
}
T2b = T25 - T2a;
T2i = T2g + T2h;
T2j = FNMS(KP414213562, T2i, T2b);
T2B = FMA(KP414213562, T2b, T2i);
{
E T2J, T2K, T2q, T2x;
T2J = T25 + T2a;
T2K = T2g - T2h;
T2L = FMA(KP414213562, T2K, T2J);
T2S = FNMS(KP414213562, T2J, T2K);
T2q = T2k - T2p;
T2x = T2r - T2w;
T2y = FNMS(KP414213562, T2x, T2q);
T2C = FMA(KP414213562, T2q, T2x);
}
}
{
E T24, T2z, T3J, T3K;
T24 = FMA(KP707106781, T23, T1O);
T2z = T2j + T2y;
cr[WS(rs, 7)] = FNMS(KP923879532, T2z, T24);
ci[0] = FMA(KP923879532, T2z, T24);
T3J = FMA(KP707106781, T3I, T3H);
T3K = T2S + T2R;
cr[WS(rs, 9)] = FMS(KP923879532, T3K, T3J);
ci[WS(rs, 14)] = FMA(KP923879532, T3K, T3J);
}
{
E T3L, T3M, T2A, T2D;
T3L = FNMS(KP707106781, T3I, T3H);
T3M = T2O - T2L;
cr[WS(rs, 13)] = FMS(KP923879532, T3M, T3L);
ci[WS(rs, 10)] = FMA(KP923879532, T3M, T3L);
T2A = FNMS(KP707106781, T23, T1O);
T2D = T2B + T2C;
ci[WS(rs, 4)] = FNMS(KP923879532, T2D, T2A);
cr[WS(rs, 3)] = FMA(KP923879532, T2D, T2A);
}
{
E T2I, T2P, T3D, T3E;
T2I = FMA(KP707106781, T2H, T2E);
T2P = T2L + T2O;
ci[WS(rs, 6)] = FNMS(KP923879532, T2P, T2I);
cr[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
T3D = FMA(KP707106781, T3C, T3B);
T3E = T2C - T2B;
cr[WS(rs, 15)] = FMS(KP923879532, T3E, T3D);
ci[WS(rs, 8)] = FMA(KP923879532, T3E, T3D);
}
{
E T3F, T3G, T2Q, T2T;
T3F = FNMS(KP707106781, T3C, T3B);
T3G = T2y - T2j;
cr[WS(rs, 11)] = FMS(KP923879532, T3G, T3F);
ci[WS(rs, 12)] = FMA(KP923879532, T3G, T3F);
T2Q = FNMS(KP707106781, T2H, T2E);
T2T = T2R - T2S;
cr[WS(rs, 5)] = FNMS(KP923879532, T2T, T2Q);
ci[WS(rs, 2)] = FMA(KP923879532, T2T, T2Q);
}
}
}
}
}
/* Twiddle instruction list for hf_16: a TW_FULL entry for size 16 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 16, name "hf_16", the twiddle list, GENUS and the
 * record { 104, 30, 70, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this FMA variant. */
static const hc2hc_desc desc = { 16, "hf_16", twinstr, &GENUS, { 104, 30, 70, 0 } };
/* Registers the hf_16 kernel together with its descriptor on planner p. */
void X(codelet_hf_16) (planner *p) {
X(khc2hc_register) (p, hf_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hf_16 -include rdft/scalar/hf.h */
/*
* This function contains 174 FP additions, 84 FP multiplications,
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
* 52 stack variables, 3 constants, and 64 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_16 (non-FMA variant): in-place radix-16 twiddle pass over cr/ci.
 *
 * cr, ci  - data arrays; each iteration reads and then overwrites the 16
 *           entries cr[WS(rs, 0..15)] and ci[WS(rs, 0..15)].  After every
 *           iteration cr advances by ms and ci moves back by ms.
 * W       - twiddle table; offset once by (mb - 1) * 30, then 30 values
 *           (W[0]..W[29]) are consumed per iteration.
 * rs      - stride handed to WS() to address the 16 entries.
 * mb, me  - the loop runs for m = mb while m < me.
 * ms      - per-iteration step of cr (+ms) and ci (-ms).
 *
 * All loads of an iteration are completed before the first store.
 *
 * NOTE(review): E, DK, WS, FMA/FNMS and MAKE_VOLATILE_STRIDE come from the
 * FFTW headers and are not visible here; the comments below assume
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b.
 */
static void hf_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
E T7, T38, T1t, T2U, Ti, T37, T1w, T2R, Tu, T2t, T1C, T2c, TF, T2s, T1H;
E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2k, T24, T2j, TS, T13, T2w, T2x;
E T2y, T2z, T1O, T2h, T1T, T2g;
/* Entry 0 is used as loaded; entry 8 is combined with (W[14], W[15]). */
{
E T1, T2T, T6, T2S;
T1 = cr[0];
T2T = ci[0];
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 8)];
T5 = ci[WS(rs, 8)];
T2 = W[14];
T4 = W[15];
T6 = FMA(T2, T3, T4 * T5);
T2S = FNMS(T4, T3, T2 * T5);
}
T7 = T1 + T6;
T38 = T2T - T2S;
T1t = T1 - T6;
T2U = T2S + T2T;
}
/* Entries 4 and 12 with (W[6], W[7]) and (W[22], W[23]). */
{
E Tc, T1u, Th, T1v;
{
E T9, Tb, T8, Ta;
T9 = cr[WS(rs, 4)];
Tb = ci[WS(rs, 4)];
T8 = W[6];
Ta = W[7];
Tc = FMA(T8, T9, Ta * Tb);
T1u = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = cr[WS(rs, 12)];
Tg = ci[WS(rs, 12)];
Td = W[22];
Tf = W[23];
Th = FMA(Td, Te, Tf * Tg);
T1v = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc + Th;
T37 = Tc - Th;
T1w = T1u - T1v;
T2R = T1u + T1v;
}
/* Entries 2 and 10 with (W[2], W[3]) and (W[18], W[19]). */
{
E To, T1z, Tt, T1A, T1y, T1B;
{
E Tl, Tn, Tk, Tm;
Tl = cr[WS(rs, 2)];
Tn = ci[WS(rs, 2)];
Tk = W[2];
Tm = W[3];
To = FMA(Tk, Tl, Tm * Tn);
T1z = FNMS(Tm, Tl, Tk * Tn);
}
{
E Tq, Ts, Tp, Tr;
Tq = cr[WS(rs, 10)];
Ts = ci[WS(rs, 10)];
Tp = W[18];
Tr = W[19];
Tt = FMA(Tp, Tq, Tr * Ts);
T1A = FNMS(Tr, Tq, Tp * Ts);
}
Tu = To + Tt;
T2t = T1z + T1A;
T1y = To - Tt;
T1B = T1z - T1A;
T1C = T1y - T1B;
T2c = T1y + T1B;
}
/* Entries 14 and 6 with (W[26], W[27]) and (W[10], W[11]). */
{
E Tz, T1E, TE, T1F, T1D, T1G;
{
E Tw, Ty, Tv, Tx;
Tw = cr[WS(rs, 14)];
Ty = ci[WS(rs, 14)];
Tv = W[26];
Tx = W[27];
Tz = FMA(Tv, Tw, Tx * Ty);
T1E = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = cr[WS(rs, 6)];
TD = ci[WS(rs, 6)];
TA = W[10];
TC = W[11];
TE = FMA(TA, TB, TC * TD);
T1F = FNMS(TC, TB, TA * TD);
}
TF = Tz + TE;
T2s = T1E + T1F;
T1D = Tz - TE;
T1G = T1E - T1F;
T1H = T1D + T1G;
T2d = T1D - T1G;
}
/* Entries 15, 11, 7 and 3 with (W[28], W[29]), (W[20], W[21]),
 * (W[12], W[13]) and (W[4], W[5]). */
{
E T19, T1V, T1p, T22, T1e, T1W, T1k, T21;
{
E T16, T18, T15, T17;
T16 = cr[WS(rs, 15)];
T18 = ci[WS(rs, 15)];
T15 = W[28];
T17 = W[29];
T19 = FMA(T15, T16, T17 * T18);
T1V = FNMS(T17, T16, T15 * T18);
}
{
E T1m, T1o, T1l, T1n;
T1m = cr[WS(rs, 11)];
T1o = ci[WS(rs, 11)];
T1l = W[20];
T1n = W[21];
T1p = FMA(T1l, T1m, T1n * T1o);
T22 = FNMS(T1n, T1m, T1l * T1o);
}
{
E T1b, T1d, T1a, T1c;
T1b = cr[WS(rs, 7)];
T1d = ci[WS(rs, 7)];
T1a = W[12];
T1c = W[13];
T1e = FMA(T1a, T1b, T1c * T1d);
T1W = FNMS(T1c, T1b, T1a * T1d);
}
{
E T1h, T1j, T1g, T1i;
T1h = cr[WS(rs, 3)];
T1j = ci[WS(rs, 3)];
T1g = W[4];
T1i = W[5];
T1k = FMA(T1g, T1h, T1i * T1j);
T21 = FNMS(T1i, T1h, T1g * T1j);
}
T1f = T19 + T1e;
T1q = T1k + T1p;
T2B = T1f - T1q;
T2C = T1V + T1W;
T2D = T21 + T22;
T2E = T2C - T2D;
{
E T1X, T1Y, T20, T23;
T1X = T1V - T1W;
T1Y = T1k - T1p;
T1Z = T1X + T1Y;
T2k = T1X - T1Y;
T20 = T19 - T1e;
T23 = T21 - T22;
T24 = T20 - T23;
T2j = T20 + T23;
}
}
/* Entries 1, 13, 9 and 5 with (W[0], W[1]), (W[24], W[25]),
 * (W[16], W[17]) and (W[8], W[9]). */
{
E TM, T1P, T12, T1M, TR, T1Q, TX, T1L;
{
E TJ, TL, TI, TK;
TJ = cr[WS(rs, 1)];
TL = ci[WS(rs, 1)];
TI = W[0];
TK = W[1];
TM = FMA(TI, TJ, TK * TL);
T1P = FNMS(TK, TJ, TI * TL);
}
{
E TZ, T11, TY, T10;
TZ = cr[WS(rs, 13)];
T11 = ci[WS(rs, 13)];
TY = W[24];
T10 = W[25];
T12 = FMA(TY, TZ, T10 * T11);
T1M = FNMS(T10, TZ, TY * T11);
}
{
E TO, TQ, TN, TP;
TO = cr[WS(rs, 9)];
TQ = ci[WS(rs, 9)];
TN = W[16];
TP = W[17];
TR = FMA(TN, TO, TP * TQ);
T1Q = FNMS(TP, TO, TN * TQ);
}
{
E TU, TW, TT, TV;
TU = cr[WS(rs, 5)];
TW = ci[WS(rs, 5)];
TT = W[8];
TV = W[9];
TX = FMA(TT, TU, TV * TW);
T1L = FNMS(TV, TU, TT * TW);
}
TS = TM + TR;
T13 = TX + T12;
T2w = TS - T13;
T2x = T1P + T1Q;
T2y = T1L + T1M;
T2z = T2x - T2y;
{
E T1K, T1N, T1R, T1S;
T1K = TM - TR;
T1N = T1L - T1M;
T1O = T1K - T1N;
T2h = T1K + T1N;
T1R = T1P - T1Q;
T1S = TX - T12;
T1T = T1R + T1S;
T2g = T1R - T1S;
}
}
/* Output group using all three constants:
 * cr[7], cr[11], ci[12], ci[0], ci[4], cr[15], ci[8], cr[3]. */
{
E T1J, T27, T3a, T3c, T26, T3b, T2a, T35;
{
E T1x, T1I, T36, T39;
T1x = T1t - T1w;
T1I = KP707106781 * (T1C + T1H);
T1J = T1x + T1I;
T27 = T1x - T1I;
T36 = KP707106781 * (T2c - T2d);
T39 = T37 + T38;
T3a = T36 + T39;
T3c = T39 - T36;
}
{
E T1U, T25, T28, T29;
T1U = FNMS(KP382683432, T1T, KP923879532 * T1O);
T25 = FMA(KP382683432, T1Z, KP923879532 * T24);
T26 = T1U + T25;
T3b = T25 - T1U;
T28 = FMA(KP923879532, T1T, KP382683432 * T1O);
T29 = FNMS(KP923879532, T1Z, KP382683432 * T24);
T2a = T28 + T29;
T35 = T29 - T28;
}
cr[WS(rs, 7)] = T1J - T26;
cr[WS(rs, 11)] = T3b - T3c;
ci[WS(rs, 12)] = T3b + T3c;
ci[0] = T1J + T26;
ci[WS(rs, 4)] = T27 - T2a;
cr[WS(rs, 15)] = T35 - T3a;
ci[WS(rs, 8)] = T35 + T3a;
cr[WS(rs, 3)] = T27 + T2a;
}
/* Output group built from sums and differences only:
 * ci[7], cr[12], ci[11], cr[0], cr[4], cr[8], ci[15], ci[3]. */
{
E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
{
E Tj, TG, T2Q, T2V;
Tj = T7 + Ti;
TG = Tu + TF;
TH = Tj + TG;
T2L = Tj - TG;
T2Q = T2t + T2s;
T2V = T2R + T2U;
T2W = T2Q + T2V;
T2Y = T2V - T2Q;
}
{
E T14, T1r, T2M, T2N;
T14 = TS + T13;
T1r = T1f + T1q;
T1s = T14 + T1r;
T2X = T1r - T14;
T2M = T2C + T2D;
T2N = T2x + T2y;
T2O = T2M - T2N;
T2P = T2N + T2M;
}
ci[WS(rs, 7)] = TH - T1s;
cr[WS(rs, 12)] = T2X - T2Y;
ci[WS(rs, 11)] = T2X + T2Y;
cr[0] = TH + T1s;
cr[WS(rs, 4)] = T2L - T2O;
cr[WS(rs, 8)] = T2P - T2W;
ci[WS(rs, 15)] = T2P + T2W;
ci[WS(rs, 3)] = T2L + T2O;
}
/* Output group using all three constants:
 * ci[6], cr[13], ci[10], cr[1], cr[5], cr[9], ci[14], ci[2]. */
{
E T2f, T2n, T3g, T3i, T2m, T3h, T2q, T3d;
{
E T2b, T2e, T3e, T3f;
T2b = T1t + T1w;
T2e = KP707106781 * (T2c + T2d);
T2f = T2b + T2e;
T2n = T2b - T2e;
T3e = KP707106781 * (T1H - T1C);
T3f = T38 - T37;
T3g = T3e + T3f;
T3i = T3f - T3e;
}
{
E T2i, T2l, T2o, T2p;
T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
T2m = T2i + T2l;
T3h = T2l - T2i;
T2o = FNMS(KP923879532, T2g, KP382683432 * T2h);
T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
T2q = T2o + T2p;
T3d = T2p - T2o;
}
ci[WS(rs, 6)] = T2f - T2m;
cr[WS(rs, 13)] = T3h - T3i;
ci[WS(rs, 10)] = T3h + T3i;
cr[WS(rs, 1)] = T2f + T2m;
cr[WS(rs, 5)] = T2n - T2q;
cr[WS(rs, 9)] = T3d - T3g;
ci[WS(rs, 14)] = T3d + T3g;
ci[WS(rs, 2)] = T2n + T2q;
}
/* Output group scaled by KP707106781 only:
 * ci[5], cr[10], ci[13], cr[2], cr[6], cr[14], ci[9], ci[1]. */
{
E T2v, T2H, T32, T34, T2G, T2Z, T2K, T33;
{
E T2r, T2u, T30, T31;
T2r = T7 - Ti;
T2u = T2s - T2t;
T2v = T2r - T2u;
T2H = T2r + T2u;
T30 = Tu - TF;
T31 = T2U - T2R;
T32 = T30 + T31;
T34 = T31 - T30;
}
{
E T2A, T2F, T2I, T2J;
T2A = T2w + T2z;
T2F = T2B - T2E;
T2G = KP707106781 * (T2A + T2F);
T2Z = KP707106781 * (T2F - T2A);
T2I = T2w - T2z;
T2J = T2B + T2E;
T2K = KP707106781 * (T2I + T2J);
T33 = KP707106781 * (T2J - T2I);
}
ci[WS(rs, 5)] = T2v - T2G;
cr[WS(rs, 10)] = T33 - T34;
ci[WS(rs, 13)] = T33 + T34;
cr[WS(rs, 2)] = T2v + T2G;
cr[WS(rs, 6)] = T2H - T2K;
cr[WS(rs, 14)] = T2Z - T32;
ci[WS(rs, 9)] = T2Z + T32;
ci[WS(rs, 1)] = T2H + T2K;
}
}
}
}
/* Twiddle instruction list for hf_16: a TW_FULL entry for size 16 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 16 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 16, name "hf_16", the twiddle list, GENUS and the
 * record { 136, 46, 38, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this non-FMA variant. */
static const hc2hc_desc desc = { 16, "hf_16", twinstr, &GENUS, { 136, 46, 38, 0 } };
/* Registers the hf_16 kernel together with its descriptor on planner p. */
void X(codelet_hf_16) (planner *p) {
X(khc2hc_register) (p, hf_16, &desc);
}
#endif

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hf_2 -include rdft/scalar/hf.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 11 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_2 (FMA variant): in-place radix-2 twiddle pass over cr/ci.
 *
 * For every m in [mb, me) the entries cr[0], ci[0], cr[WS(rs, 1)] and
 * ci[WS(rs, 1)] are read, entry 1 is combined with the pair (W[0], W[1]),
 * and the four results are written back.  Per iteration cr advances by ms,
 * ci moves back by ms and W advances by 2; W is first offset by (mb - 1) * 2.
 *
 * NOTE(review): E, WS, FMA/FNMS and MAKE_VOLATILE_STRIDE come from the FFTW
 * headers (not visible here); FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
 * are assumed.
 */
static void hf_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the twiddle pair of the first requested m. */
     W += (mb - 1) * 2;

     for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
          /* Everything is loaded before the first store. */
          const E x0r = cr[0];
          const E x0i = ci[0];
          const E x1r = cr[WS(rs, 1)];
          const E x1i = ci[WS(rs, 1)];
          const E wa = W[0];
          const E wb = W[1];

          /* Partial products are kept in their own variables, as generated. */
          const E par = wa * x1r;
          const E pai = wa * x1i;
          const E y1r = FMA(wb, x1i, par);
          const E y1i = FNMS(wb, x1r, pai);

          ci[0] = x0r - y1r;
          cr[0] = x0r + y1r;
          cr[WS(rs, 1)] = y1i - x0i;
          ci[WS(rs, 1)] = y1i + x0i;
     }
}
/* Twiddle instruction list for hf_2: a TW_FULL entry for size 2 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 2, name "hf_2", the twiddle list, GENUS and the
 * record { 4, 2, 2, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this FMA variant. */
static const hc2hc_desc desc = { 2, "hf_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
/* Registers the hf_2 kernel together with its descriptor on planner p. */
void X(codelet_hf_2) (planner *p) {
X(khc2hc_register) (p, hf_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hf_2 -include rdft/scalar/hf.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 9 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_2 (non-FMA variant): in-place radix-2 twiddle pass over cr/ci.
 *
 * For every m in [mb, me) the entries cr[0], ci[0], cr[WS(rs, 1)] and
 * ci[WS(rs, 1)] are read, entry 1 is combined with the pair (W[0], W[1]),
 * and the four results are written back.  Per iteration cr advances by ms,
 * ci moves back by ms and W advances by 2; W is first offset by (mb - 1) * 2.
 *
 * NOTE(review): E, WS, FMA/FNMS and MAKE_VOLATILE_STRIDE come from the FFTW
 * headers (not visible here); FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
 * are assumed.
 */
static void hf_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the twiddle pair of the first requested m. */
     W += (mb - 1) * 2;

     for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
          /* Everything is loaded before the first store. */
          const E x0r = cr[0];
          const E x0i = ci[0];
          const E x1r = cr[WS(rs, 1)];
          const E x1i = ci[WS(rs, 1)];
          const E wa = W[0];
          const E wb = W[1];

          const E y1r = FMA(wa, x1r, wb * x1i);
          const E y1i = FNMS(wb, x1r, wa * x1i);

          ci[0] = x0r - y1r;
          cr[0] = x0r + y1r;
          cr[WS(rs, 1)] = y1i - x0i;
          ci[WS(rs, 1)] = y1i + x0i;
     }
}
/* Twiddle instruction list for hf_2: a TW_FULL entry for size 2 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 2 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 2, name "hf_2", the twiddle list, GENUS and the
 * record { 4, 2, 2, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this non-FMA variant. */
static const hc2hc_desc desc = { 2, "hf_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
/* Registers the hf_2 kernel together with its descriptor on planner p. */
void X(codelet_hf_2) (planner *p) {
X(khc2hc_register) (p, hf_2, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,166 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -dit -name hf_3 -include rdft/scalar/hf.h */
/*
* This function contains 16 FP additions, 14 FP multiplications,
* (or, 6 additions, 4 multiplications, 10 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_3 (FMA variant): in-place radix-3 twiddle pass over cr/ci.
 *
 * For every m in [mb, me) the entries at offsets WS(rs, 0..2) of cr and ci
 * are read, entries 1 and 2 are combined with (W[0], W[1]) and (W[2], W[3]),
 * and six results are written back.  Per iteration cr advances by ms, ci
 * moves back by ms and W advances by 4; W is first offset by (mb - 1) * 4.
 *
 * NOTE(review): E, DK, WS, FMA/FNMS/FMS and MAKE_VOLATILE_STRIDE come from
 * the FFTW headers (not visible here); FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c are assumed.
 */
static void hf_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Position W on the twiddles of the first requested m. */
          W += (mb - 1) * 4;

          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
               /* Loads: everything is read before the first store. */
               const E x0r = cr[0];
               const E x0i = ci[0];
               const E x1r = cr[WS(rs, 1)];
               const E x1i = ci[WS(rs, 1)];
               const E x2r = cr[WS(rs, 2)];
               const E x2i = ci[WS(rs, 2)];
               const E w1a = W[0];
               const E w1b = W[1];
               const E w2a = W[2];
               const E w2b = W[3];

               /* Twiddled entry 1 (partial products kept separate). */
               const E p1r = w1a * x1r;
               const E p1i = w1a * x1i;
               const E y1r = FMA(w1b, x1i, p1r);
               const E y1i = FNMS(w1b, x1r, p1i);

               /* Twiddled entry 2. */
               const E p2r = w2a * x2r;
               const E p2i = w2a * x2i;
               const E y2r = FMA(w2b, x2i, p2r);
               const E y2i = FNMS(w2b, x2r, p2i);

               /* Butterfly intermediates. */
               const E dif_i = y1i - y2i;
               const E sum_r = y1r + y2r;
               const E mid_r = FNMS(KP500000000, sum_r, x0r);
               const E dif_r = y2r - y1r;
               const E sum_i = y1i + y2i;
               const E mid_i = FNMS(KP500000000, sum_i, x0i);

               /* Stores, in the generated order. */
               cr[0] = x0r + sum_r;
               ci[0] = FNMS(KP866025403, dif_i, mid_r);
               cr[WS(rs, 1)] = FMA(KP866025403, dif_i, mid_r);
               cr[WS(rs, 2)] = FMS(KP866025403, dif_r, mid_i);
               ci[WS(rs, 2)] = sum_i + x0i;
               ci[WS(rs, 1)] = FMA(KP866025403, dif_r, mid_i);
          }
     }
}
/* Twiddle instruction list for hf_3: a TW_FULL entry for size 3 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 3, name "hf_3", the twiddle list, GENUS and the
 * record { 6, 4, 10, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this FMA variant. */
static const hc2hc_desc desc = { 3, "hf_3", twinstr, &GENUS, { 6, 4, 10, 0 } };
/* Registers the hf_3 kernel together with its descriptor on planner p. */
void X(codelet_hf_3) (planner *p) {
X(khc2hc_register) (p, hf_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 3 -dit -name hf_3 -include rdft/scalar/hf.h */
/*
* This function contains 16 FP additions, 12 FP multiplications,
* (or, 10 additions, 6 multiplications, 6 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_3 (non-FMA variant): in-place radix-3 twiddle pass over cr/ci.
 *
 * For every m in [mb, me) the entries at offsets WS(rs, 0..2) of cr and ci
 * are read, entries 1 and 2 are combined with (W[0], W[1]) and (W[2], W[3]),
 * and six results are written back.  Per iteration cr advances by ms, ci
 * moves back by ms and W advances by 4; W is first offset by (mb - 1) * 4.
 *
 * NOTE(review): E, DK, WS, FMA/FNMS and MAKE_VOLATILE_STRIDE come from the
 * FFTW headers (not visible here); FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b are assumed.
 */
static void hf_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Position W on the twiddles of the first requested m. */
          W += (mb - 1) * 4;

          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
               /* Loads: everything is read before the first store. */
               const E x0r = cr[0];
               const E x0i = ci[0];
               const E x1r = cr[WS(rs, 1)];
               const E x1i = ci[WS(rs, 1)];
               const E x2r = cr[WS(rs, 2)];
               const E x2i = ci[WS(rs, 2)];
               const E w1a = W[0];
               const E w1b = W[1];
               const E w2a = W[2];
               const E w2b = W[3];

               /* Twiddled entries 1 and 2. */
               const E y1r = FMA(w1a, x1r, w1b * x1i);
               const E y1i = FNMS(w1b, x1r, w1a * x1i);
               const E y2r = FMA(w2a, x2r, w2b * x2i);
               const E y2i = FNMS(w2b, x2r, w2a * x2i);

               /* Butterfly intermediates. */
               const E sum_r = y1r + y2r;
               const E sum_i = y1i + y2i;
               const E mid_r = FNMS(KP500000000, sum_r, x0r);
               const E rot_i = KP866025403 * (y1i - y2i);
               const E rot_r = KP866025403 * (y2r - y1r);
               const E mid_i = FNMS(KP500000000, sum_i, x0i);

               /* Stores, in the generated order. */
               cr[0] = x0r + sum_r;
               ci[0] = mid_r - rot_i;
               cr[WS(rs, 1)] = mid_r + rot_i;
               ci[WS(rs, 2)] = sum_i + x0i;
               cr[WS(rs, 2)] = rot_r - mid_i;
               ci[WS(rs, 1)] = rot_r + mid_i;
          }
     }
}
/* Twiddle instruction list for hf_3: a TW_FULL entry for size 3 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 3 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 3, name "hf_3", the twiddle list, GENUS and the
 * record { 10, 6, 6, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this non-FMA variant. */
static const hc2hc_desc desc = { 3, "hf_3", twinstr, &GENUS, { 10, 6, 6, 0 } };
/* Registers the hf_3 kernel together with its descriptor on planner p. */
void X(codelet_hf_3) (planner *p) {
X(khc2hc_register) (p, hf_3, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hf_4 -include rdft/scalar/hf.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 15 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_4 (FMA variant): in-place radix-4 twiddle pass over cr/ci.
 *
 * For every m in [mb, me) the entries at offsets WS(rs, 0..3) of cr and ci
 * are read, entries 1, 2 and 3 are combined with (W[0], W[1]), (W[2], W[3])
 * and (W[4], W[5]), and eight results are written back.  Per iteration cr
 * advances by ms, ci moves back by ms and W advances by 6; W is first offset
 * by (mb - 1) * 6.
 *
 * NOTE(review): E, WS, FMA/FNMS and MAKE_VOLATILE_STRIDE come from the FFTW
 * headers (not visible here); FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
 * are assumed.
 */
static void hf_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the twiddles of the first requested m. */
     W += (mb - 1) * 6;

     for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Loads: everything is read before the first store. */
          const E x0r = cr[0];
          const E x0i = ci[0];
          const E x1r = cr[WS(rs, 1)];
          const E x1i = ci[WS(rs, 1)];
          const E x2r = cr[WS(rs, 2)];
          const E x2i = ci[WS(rs, 2)];
          const E x3r = cr[WS(rs, 3)];
          const E x3i = ci[WS(rs, 3)];
          const E w1a = W[0];
          const E w1b = W[1];
          const E w2a = W[2];
          const E w2b = W[3];
          const E w3a = W[4];
          const E w3b = W[5];

          /* Twiddled entry 2 (partial products kept separate). */
          const E p2r = w2a * x2r;
          const E p2i = w2a * x2i;
          const E y2r = FMA(w2b, x2i, p2r);
          const E y2i = FNMS(w2b, x2r, p2i);

          /* Twiddled entry 1. */
          const E p1r = w1a * x1r;
          const E p1i = w1a * x1i;
          const E y1r = FMA(w1b, x1i, p1r);
          const E y1i = FNMS(w1b, x1r, p1i);

          /* Twiddled entry 3. */
          const E p3r = w3a * x3r;
          const E p3i = w3a * x3i;
          const E y3r = FMA(w3b, x3i, p3r);
          const E y3i = FNMS(w3b, x3r, p3i);

          /* Butterfly intermediates. */
          const E s02r = x0r + y2r;
          const E s13r = y1r + y3r;
          const E d02r = x0r - y2r;
          const E d13i = y1i - y3i;
          const E s13i = y1i + y3i;
          const E s20i = y2i + x0i;
          const E d31r = y3r - y1r;
          const E d02i = x0i - y2i;

          /* Stores, in the generated order. */
          ci[WS(rs, 1)] = s02r - s13r;
          cr[0] = s02r + s13r;
          ci[0] = d02r - d13i;
          cr[WS(rs, 1)] = d02r + d13i;
          cr[WS(rs, 2)] = s13i - s20i;
          ci[WS(rs, 3)] = s13i + s20i;
          cr[WS(rs, 3)] = d31r - d02i;
          ci[WS(rs, 2)] = d31r + d02i;
     }
}
/* Twiddle instruction list for hf_4: a TW_FULL entry for size 4 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 4, name "hf_4", the twiddle list, GENUS and the
 * record { 16, 6, 6, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this FMA variant. */
static const hc2hc_desc desc = { 4, "hf_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
/* Registers the hf_4 kernel together with its descriptor on planner p. */
void X(codelet_hf_4) (planner *p) {
X(khc2hc_register) (p, hf_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hf_4 -include rdft/scalar/hf.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_4 (non-FMA variant): in-place radix-4 twiddle pass over cr/ci.
 *
 * For every m in [mb, me) the entries at offsets WS(rs, 0..3) of cr and ci
 * are read, entries 1, 2 and 3 are combined with (W[0], W[1]), (W[2], W[3])
 * and (W[4], W[5]), and eight results are written back.  Per iteration cr
 * advances by ms, ci moves back by ms and W advances by 6; W is first offset
 * by (mb - 1) * 6.
 *
 * NOTE(review): E, WS, FMA/FNMS and MAKE_VOLATILE_STRIDE come from the FFTW
 * headers (not visible here); FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
 * are assumed.
 */
static void hf_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Position W on the twiddles of the first requested m. */
     W += (mb - 1) * 6;

     for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Loads: everything is read before the first store. */
          const E x0r = cr[0];
          const E x0i = ci[0];
          const E x1r = cr[WS(rs, 1)];
          const E x1i = ci[WS(rs, 1)];
          const E x2r = cr[WS(rs, 2)];
          const E x2i = ci[WS(rs, 2)];
          const E x3r = cr[WS(rs, 3)];
          const E x3i = ci[WS(rs, 3)];
          const E w1a = W[0];
          const E w1b = W[1];
          const E w2a = W[2];
          const E w2b = W[3];
          const E w3a = W[4];
          const E w3b = W[5];

          /* Twiddled entries 2, 1 and 3. */
          const E y2r = FMA(w2a, x2r, w2b * x2i);
          const E y2i = FNMS(w2b, x2r, w2a * x2i);
          const E y1r = FMA(w1a, x1r, w1b * x1i);
          const E y1i = FNMS(w1b, x1r, w1a * x1i);
          const E y3r = FMA(w3a, x3r, w3b * x3i);
          const E y3i = FNMS(w3b, x3r, w3a * x3i);

          /* Butterfly intermediates. */
          const E s02r = x0r + y2r;
          const E s13r = y1r + y3r;
          const E d02r = x0r - y2r;
          const E d13i = y1i - y3i;
          const E s13i = y1i + y3i;
          const E s20i = y2i + x0i;
          const E d31r = y3r - y1r;
          const E d02i = x0i - y2i;

          /* Stores, in the generated order. */
          ci[WS(rs, 1)] = s02r - s13r;
          cr[0] = s02r + s13r;
          ci[0] = d02r - d13i;
          cr[WS(rs, 1)] = d02r + d13i;
          cr[WS(rs, 2)] = s13i - s20i;
          ci[WS(rs, 3)] = s13i + s20i;
          cr[WS(rs, 3)] = d31r - d02i;
          ci[WS(rs, 2)] = d31r + d02i;
     }
}
/* Twiddle instruction list for hf_4: a TW_FULL entry for size 4 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 4 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 4, name "hf_4", the twiddle list, GENUS and the
 * record { 16, 6, 6, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this non-FMA variant. */
static const hc2hc_desc desc = { 4, "hf_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
/* Registers the hf_4 kernel together with its descriptor on planner p. */
void X(codelet_hf_4) (planner *p) {
X(khc2hc_register) (p, hf_4, &desc);
}
#endif

View File

@@ -0,0 +1,253 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -dit -name hf_5 -include rdft/scalar/hf.h */
/*
* This function contains 40 FP additions, 34 FP multiplications,
* (or, 14 additions, 8 multiplications, 26 fused multiply/add),
* 31 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_5 (FMA variant): in-place radix-5 twiddle pass over cr/ci.
 *
 * cr, ci  - data arrays; each iteration reads and then overwrites the entries
 *           at offsets WS(rs, 0..4).  After every iteration cr advances by ms
 *           and ci moves back by ms.
 * W       - twiddle table; offset once by (mb - 1) * 8, then 8 values
 *           (W[0]..W[7]) are consumed per iteration.
 * rs      - stride handed to WS() to address the 5 entries.
 * mb, me  - the loop runs for m = mb while m < me.
 * ms      - per-iteration step of cr (+ms) and ci (-ms).
 *
 * NOTE(review): E, DK, WS, FMA/FNMS/FMS and MAKE_VOLATILE_STRIDE come from the
 * FFTW headers and are not visible here; the comments below assume
 * FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c.
 */
static void hf_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
E T1, TJ, T7, Tx, Td, Tz, Te, TK, Tk, TC, Tq, TE, Tr, TL;
/* Entry 0 is used as loaded. */
T1 = cr[0];
TJ = ci[0];
/* Entries 1 and 4 with (W[0], W[1]) and (W[6], W[7]), plus their sums. */
{
E T3, T6, T4, Tw, T9, Tc, Ta, Ty, T2, T8, T5, Tb;
T3 = cr[WS(rs, 1)];
T6 = ci[WS(rs, 1)];
T2 = W[0];
T4 = T2 * T3;
Tw = T2 * T6;
T9 = cr[WS(rs, 4)];
Tc = ci[WS(rs, 4)];
T8 = W[6];
Ta = T8 * T9;
Ty = T8 * Tc;
T5 = W[1];
T7 = FMA(T5, T6, T4);
Tx = FNMS(T5, T3, Tw);
Tb = W[7];
Td = FMA(Tb, Tc, Ta);
Tz = FNMS(Tb, T9, Ty);
Te = T7 + Td;
TK = Tx + Tz;
}
/* Entries 2 and 3 with (W[2], W[3]) and (W[4], W[5]), plus their sums. */
{
E Tg, Tj, Th, TB, Tm, Tp, Tn, TD, Tf, Tl, Ti, To;
Tg = cr[WS(rs, 2)];
Tj = ci[WS(rs, 2)];
Tf = W[2];
Th = Tf * Tg;
TB = Tf * Tj;
Tm = cr[WS(rs, 3)];
Tp = ci[WS(rs, 3)];
Tl = W[4];
Tn = Tl * Tm;
TD = Tl * Tp;
Ti = W[3];
Tk = FMA(Ti, Tj, Th);
TC = FNMS(Ti, Tg, TB);
To = W[5];
Tq = FMA(To, Tp, Tn);
TE = FNMS(To, Tm, TD);
Tr = Tk + Tq;
TL = TC + TE;
}
/* First output group: cr[0], ci[0], cr[1], cr[2], ci[1]. */
{
E Tu, Ts, Tt, TG, TI, TA, TF, Tv, TH;
Tu = Te - Tr;
Ts = Te + Tr;
Tt = FNMS(KP250000000, Ts, T1);
TA = Tx - Tz;
TF = TC - TE;
TG = FMA(KP618033988, TF, TA);
TI = FNMS(KP618033988, TA, TF);
cr[0] = T1 + Ts;
Tv = FMA(KP559016994, Tu, Tt);
ci[0] = FNMS(KP951056516, TG, Tv);
cr[WS(rs, 1)] = FMA(KP951056516, TG, Tv);
TH = FNMS(KP559016994, Tu, Tt);
cr[WS(rs, 2)] = FNMS(KP951056516, TI, TH);
ci[WS(rs, 1)] = FMA(KP951056516, TI, TH);
}
/* Second output group: ci[4], cr[4], ci[3], cr[3], ci[2]. */
{
E TO, TM, TN, TS, TU, TQ, TR, TT, TP;
TO = TK - TL;
TM = TK + TL;
TN = FNMS(KP250000000, TM, TJ);
TQ = Tk - Tq;
TR = Td - T7;
TS = FMA(KP618033988, TR, TQ);
TU = FNMS(KP618033988, TQ, TR);
ci[WS(rs, 4)] = TM + TJ;
TT = FMA(KP559016994, TO, TN);
cr[WS(rs, 4)] = FMS(KP951056516, TU, TT);
ci[WS(rs, 3)] = FMA(KP951056516, TU, TT);
TP = FNMS(KP559016994, TO, TN);
cr[WS(rs, 3)] = FMS(KP951056516, TS, TP);
ci[WS(rs, 2)] = FMA(KP951056516, TS, TP);
}
}
}
}
/* Twiddle instruction list for hf_5: a TW_FULL entry for size 5 followed by
 * a TW_NEXT entry.  NOTE(review): the TW_* opcode semantics are defined in the
 * FFTW headers, not in this file; presumably TW_NEXT closes the list --
 * confirm before editing these rows. */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 5 },
{ TW_NEXT, 1, 0 }
};
/* Codelet descriptor: size 5, name "hf_5", the twiddle list, GENUS and the
 * record { 14, 8, 26, 0 }, whose first three values equal the additions /
 * multiplications / fused multiply-adds quoted in the generated header comment
 * of this FMA variant. */
static const hc2hc_desc desc = { 5, "hf_5", twinstr, &GENUS, { 14, 8, 26, 0 } };
/* Registers the hf_5 kernel together with its descriptor on planner p. */
void X(codelet_hf_5) (planner *p) {
X(khc2hc_register) (p, hf_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 5 -dit -name hf_5 -include rdft/scalar/hf.h */
/*
* This function contains 40 FP additions, 28 FP multiplications,
* (or, 26 additions, 14 multiplications, 14 fused multiply/add),
* 29 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_5 -- size-5 "twiddle" codelet (hf_<r> is described in the build notes
 * as a radix-r DIT step of a real-input FFT).  This copy sits in the #else
 * arm of the file's preprocessor conditional and was generated without the
 * -fma flag.
 *
 * cr, ci  data arrays, read and overwritten in place; element k is
 *         addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..4.
 * W       twiddle table; 8 values (4 pairs) are consumed per iteration.
 * rs      stride handed to WS().
 * mb, me  iteration range: the loop runs for m = mb .. me - 1.
 * ms      per-iteration step: cr moves by +ms, ci moves by -ms.
 *
 * FMA/FNMS/DK/WS/MAKE_VOLATILE_STRIDE are macros defined outside this
 * chunk; the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- TODO confirm against the codelet headers.
 */
static void hf_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT m;
/* W starts at group (mb - 1) of 8 twiddles and advances by 8 per pass. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
E T1, TE, Tu, Tx, TC, TB, TF, TG, TH, Tc, Tn, To;
/* Element 0 is taken as-is; no twiddle is applied to it. */
T1 = cr[0];
TE = ci[0];
{
E T6, Ts, Tm, Tw, Tb, Tt, Th, Tv;
/* Element 1 combined with twiddle pair W[0], W[1]. */
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 1)];
T5 = ci[WS(rs, 1)];
T2 = W[0];
T4 = W[1];
T6 = FMA(T2, T3, T4 * T5);
Ts = FNMS(T4, T3, T2 * T5);
}
/* Element 3 combined with twiddle pair W[4], W[5]. */
{
E Tj, Tl, Ti, Tk;
Tj = cr[WS(rs, 3)];
Tl = ci[WS(rs, 3)];
Ti = W[4];
Tk = W[5];
Tm = FMA(Ti, Tj, Tk * Tl);
Tw = FNMS(Tk, Tj, Ti * Tl);
}
/* Element 4 combined with twiddle pair W[6], W[7]. */
{
E T8, Ta, T7, T9;
T8 = cr[WS(rs, 4)];
Ta = ci[WS(rs, 4)];
T7 = W[6];
T9 = W[7];
Tb = FMA(T7, T8, T9 * Ta);
Tt = FNMS(T9, T8, T7 * Ta);
}
/* Element 2 combined with twiddle pair W[2], W[3]. */
{
E Te, Tg, Td, Tf;
Te = cr[WS(rs, 2)];
Tg = ci[WS(rs, 2)];
Td = W[2];
Tf = W[3];
Th = FMA(Td, Te, Tf * Tg);
Tv = FNMS(Tf, Te, Td * Tg);
}
/* Sums and differences of the pairs (1,4) and (2,3). */
Tu = Ts - Tt;
Tx = Tv - Tw;
TC = Th - Tm;
TB = Tb - T6;
TF = Ts + Tt;
TG = Tv + Tw;
TH = TF + TG;
Tc = T6 + Tb;
Tn = Th + Tm;
To = Tc + Tn;
}
/* cr[0] receives element 0 plus the sum of all four first components. */
cr[0] = T1 + To;
/* Outputs derived from T1 and the first-component sums. */
{
E Ty, TA, Tr, Tz, Tp, Tq;
Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
TA = FNMS(KP587785252, Tu, KP951056516 * Tx);
Tp = KP559016994 * (Tc - Tn);
Tq = FNMS(KP250000000, To, T1);
Tr = Tp + Tq;
Tz = Tq - Tp;
ci[0] = Tr - Ty;
ci[WS(rs, 1)] = Tz + TA;
cr[WS(rs, 1)] = Tr + Ty;
cr[WS(rs, 2)] = Tz - TA;
}
/* ci[4] receives ci[0]'s old value plus the sum of all second components. */
ci[WS(rs, 4)] = TH + TE;
/* Outputs derived from TE and the second-component sums. */
{
E TD, TL, TK, TM, TI, TJ;
TD = FMA(KP587785252, TB, KP951056516 * TC);
TL = FNMS(KP587785252, TC, KP951056516 * TB);
TI = FNMS(KP250000000, TH, TE);
TJ = KP559016994 * (TF - TG);
TK = TI - TJ;
TM = TJ + TI;
cr[WS(rs, 3)] = TD - TK;
ci[WS(rs, 3)] = TL + TM;
ci[WS(rs, 2)] = TD + TK;
cr[WS(rs, 4)] = TL - TM;
}
}
}
}
/* Twiddle instruction list for hf_5: one TW_FULL entry followed by a
 * TW_NEXT entry. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 5 }, { TW_NEXT, 1, 0 } };

/* Descriptor for the non-FMA hf_5: size 5, codelet name, twiddle list,
 * genus and four cost fields.  The first three equal the 26 additions,
 * 14 multiplications and 14 fused multiply/adds quoted in the generated
 * header comment; the fourth is 0. */
static const hc2hc_desc desc = {
     5, "hf_5", twinstr, &GENUS,
     { 26, 14, 14, 0 }
};

/* Hands hf_5 and its descriptor to the planner via khc2hc_register. */
void X(codelet_hf_5) (planner *p)
{
     X(khc2hc_register) (p, hf_5, &desc);
}
#endif

View File

@@ -0,0 +1,295 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hf_6 -include rdft/scalar/hf.h */
/*
* This function contains 46 FP additions, 32 FP multiplications,
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
* 31 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_6 -- size-6 "twiddle" codelet (hf_<r> is described in the build notes
 * as a radix-r DIT step of a real-input FFT).  This is the arm compiled
 * when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined (generated
 * with -fma).
 *
 * cr, ci  data arrays, read and overwritten in place; element k is
 *         addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..5.
 * W       twiddle table; 10 values (5 pairs) are consumed per iteration.
 * rs      stride handed to WS().
 * mb, me  iteration range: the loop runs for m = mb .. me - 1.
 * ms      per-iteration step: cr moves by +ms, ci moves by -ms.
 *
 * FMA/FNMS/FMS/DK/WS/MAKE_VOLATILE_STRIDE are macros defined outside this
 * chunk; presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FMS(a,b,c) = a*b - c -- TODO confirm against the codelet headers.
 */
static void hf_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* W starts at group (mb - 1) of 10 twiddles and advances by 10 per pass. */
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
E T1, TV, T7, TX, Tl, TR, TB, TO, Ty, TS, TC, TJ;
/* Element 0 is taken as-is; no twiddle is applied to it. */
T1 = cr[0];
TV = ci[0];
/* Element 3 combined with twiddle pair W[4], W[5]. */
{
E T3, T6, T4, TW, T2, T5;
T3 = cr[WS(rs, 3)];
T6 = ci[WS(rs, 3)];
T2 = W[4];
T4 = T2 * T3;
TW = T2 * T6;
T5 = W[5];
T7 = FMA(T5, T6, T4);
TX = FNMS(T5, T3, TW);
}
/* Elements 2 and 5 (twiddles W[2..3] and W[8..9]), then their sums and
 * differences. */
{
E Ta, Td, Tb, TM, Tg, Tj, Th, TK, T9, Tf;
Ta = cr[WS(rs, 2)];
Td = ci[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
TM = T9 * Td;
Tg = cr[WS(rs, 5)];
Tj = ci[WS(rs, 5)];
Tf = W[8];
Th = Tf * Tg;
TK = Tf * Tj;
{
E Te, TN, Tk, TL, Tc, Ti;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
TN = FNMS(Tc, Ta, TM);
Ti = W[9];
Tk = FMA(Ti, Tj, Th);
TL = FNMS(Ti, Tg, TK);
Tl = Te - Tk;
TR = TN + TL;
TB = Te + Tk;
TO = TL - TN;
}
}
/* Elements 4 and 1 (twiddles W[6..7] and W[0..1]), then their sums and
 * differences. */
{
E Tn, Tq, To, TH, Tt, Tw, Tu, TF, Tm, Ts;
Tn = cr[WS(rs, 4)];
Tq = ci[WS(rs, 4)];
Tm = W[6];
To = Tm * Tn;
TH = Tm * Tq;
Tt = cr[WS(rs, 1)];
Tw = ci[WS(rs, 1)];
Ts = W[0];
Tu = Ts * Tt;
TF = Ts * Tw;
{
E Tr, TI, Tx, TG, Tp, Tv;
Tp = W[7];
Tr = FMA(Tp, Tq, To);
TI = FNMS(Tp, Tn, TH);
Tv = W[1];
Tx = FMA(Tv, Tw, Tu);
TG = FNMS(Tv, Tt, TF);
Ty = Tr - Tx;
TS = TI + TG;
TC = Tr + Tx;
TJ = TG - TI;
}
}
/* Outputs ci[2], cr[1], ci[0], built on T1 - T7. */
{
E TP, T8, Tz, TE;
TP = TJ - TO;
T8 = T1 - T7;
Tz = Tl + Ty;
TE = FNMS(KP500000000, Tz, T8);
ci[WS(rs, 2)] = T8 + Tz;
cr[WS(rs, 1)] = FMA(KP866025403, TP, TE);
ci[0] = FNMS(KP866025403, TP, TE);
}
/* Outputs cr[0], ci[1], cr[2], built on T1 + T7. */
{
E TT, TA, TD, TQ;
TT = TR - TS;
TA = T1 + T7;
TD = TB + TC;
TQ = FNMS(KP500000000, TD, TA);
cr[0] = TA + TD;
ci[WS(rs, 1)] = FMA(KP866025403, TT, TQ);
cr[WS(rs, 2)] = FNMS(KP866025403, TT, TQ);
}
/* Outputs cr[3], ci[4], cr[5], built on TV - TX. */
{
E T10, TU, TY, TZ;
T10 = Ty - Tl;
TU = TO + TJ;
TY = TV - TX;
TZ = FMA(KP500000000, TU, TY);
cr[WS(rs, 3)] = TU - TY;
ci[WS(rs, 4)] = FMA(KP866025403, T10, TZ);
cr[WS(rs, 5)] = FMS(KP866025403, T10, TZ);
}
/* Outputs cr[4], ci[5], ci[3], built on TX + TV. */
{
E T14, T11, T12, T13;
T14 = TB - TC;
T11 = TX + TV;
T12 = TR + TS;
T13 = FNMS(KP500000000, T12, T11);
cr[WS(rs, 4)] = FMS(KP866025403, T14, T13);
ci[WS(rs, 5)] = T12 + T11;
ci[WS(rs, 3)] = FMA(KP866025403, T14, T13);
}
}
}
}
/* Twiddle instruction list for hf_6: one TW_FULL entry followed by a
 * TW_NEXT entry. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 6 }, { TW_NEXT, 1, 0 } };

/* Descriptor for the FMA hf_6: size 6, codelet name, twiddle list, genus
 * and four cost fields.  The first three equal the 24 additions,
 * 10 multiplications and 22 fused multiply/adds quoted in the generated
 * header comment; the fourth is 0. */
static const hc2hc_desc desc = {
     6, "hf_6", twinstr, &GENUS,
     { 24, 10, 22, 0 }
};

/* Hands hf_6 and its descriptor to the planner via khc2hc_register. */
void X(codelet_hf_6) (planner *p)
{
     X(khc2hc_register) (p, hf_6, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hf_6 -include rdft/scalar/hf.h */
/*
* This function contains 46 FP additions, 28 FP multiplications,
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
* 23 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_6 -- size-6 "twiddle" codelet (hf_<r> is described in the build notes
 * as a radix-r DIT step of a real-input FFT).  This is the #else arm, used
 * when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined
 * (generated without -fma).
 *
 * cr, ci  data arrays, read and overwritten in place; element k is
 *         addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..5.
 * W       twiddle table; 10 values (5 pairs) are consumed per iteration.
 * rs      stride handed to WS().
 * mb, me  iteration range: the loop runs for m = mb .. me - 1.
 * ms      per-iteration step: cr moves by +ms, ci moves by -ms.
 *
 * FMA/FNMS/DK/WS/MAKE_VOLATILE_STRIDE are macros defined outside this
 * chunk; presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b --
 * TODO confirm against the codelet headers.
 */
static void hf_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* W starts at group (mb - 1) of 10 twiddles and advances by 10 per pass. */
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
E T7, TS, Tv, TO, Tt, TJ, Tx, TF, Ti, TI, Tw, TC;
/* Element 0 (untwiddled) paired with element 3 (twiddles W[4], W[5]). */
{
E T1, TM, T6, TN;
T1 = cr[0];
TM = ci[0];
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 3)];
T5 = ci[WS(rs, 3)];
T2 = W[4];
T4 = W[5];
T6 = FMA(T2, T3, T4 * T5);
TN = FNMS(T4, T3, T2 * T5);
}
T7 = T1 - T6;
TS = TN + TM;
Tv = T1 + T6;
TO = TM - TN;
}
/* Element 4 (W[6], W[7]) paired with element 1 (W[0], W[1]). */
{
E Tn, TE, Ts, TD;
{
E Tk, Tm, Tj, Tl;
Tk = cr[WS(rs, 4)];
Tm = ci[WS(rs, 4)];
Tj = W[6];
Tl = W[7];
Tn = FMA(Tj, Tk, Tl * Tm);
TE = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = cr[WS(rs, 1)];
Tr = ci[WS(rs, 1)];
To = W[0];
Tq = W[1];
Ts = FMA(To, Tp, Tq * Tr);
TD = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn - Ts;
TJ = TE + TD;
Tx = Tn + Ts;
TF = TD - TE;
}
/* Element 2 (W[2], W[3]) paired with element 5 (W[8], W[9]). */
{
E Tc, TA, Th, TB;
{
E T9, Tb, T8, Ta;
T9 = cr[WS(rs, 2)];
Tb = ci[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
TA = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = cr[WS(rs, 5)];
Tg = ci[WS(rs, 5)];
Td = W[8];
Tf = W[9];
Th = FMA(Td, Te, Tf * Tg);
TB = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc - Th;
TI = TA + TB;
Tw = Tc + Th;
TC = TA - TB;
}
/* Outputs ci[2], cr[1], ci[0] (from T7) and cr[0], ci[1], cr[2] (from Tv). */
{
E TG, Tu, Tz, TK, Ty, TH;
TG = KP866025403 * (TC + TF);
Tu = Ti + Tt;
Tz = FNMS(KP500000000, Tu, T7);
ci[WS(rs, 2)] = T7 + Tu;
cr[WS(rs, 1)] = Tz + TG;
ci[0] = Tz - TG;
TK = KP866025403 * (TI - TJ);
Ty = Tw + Tx;
TH = FNMS(KP500000000, Ty, Tv);
cr[0] = Tv + Ty;
ci[WS(rs, 1)] = TH + TK;
cr[WS(rs, 2)] = TH - TK;
}
/* Outputs cr[3], ci[4], cr[5] (from TO) and cr[4], ci[5], ci[3] (from TS). */
{
E TP, TL, TQ, TR, TT, TU;
TP = KP866025403 * (Tt - Ti);
TL = TF - TC;
TQ = FMA(KP500000000, TL, TO);
cr[WS(rs, 3)] = TL - TO;
ci[WS(rs, 4)] = TP + TQ;
cr[WS(rs, 5)] = TP - TQ;
TR = KP866025403 * (Tw - Tx);
TT = TI + TJ;
TU = FNMS(KP500000000, TT, TS);
cr[WS(rs, 4)] = TR - TU;
ci[WS(rs, 5)] = TT + TS;
ci[WS(rs, 3)] = TR + TU;
}
}
}
}
/* Twiddle instruction list for hf_6: one TW_FULL entry followed by a
 * TW_NEXT entry. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 6 }, { TW_NEXT, 1, 0 } };

/* Descriptor for the non-FMA hf_6: size 6, codelet name, twiddle list,
 * genus and four cost fields.  The first three equal the 32 additions,
 * 14 multiplications and 14 fused multiply/adds quoted in the generated
 * header comment; the fourth is 0. */
static const hc2hc_desc desc = {
     6, "hf_6", twinstr, &GENUS,
     { 32, 14, 14, 0 }
};

/* Hands hf_6 and its descriptor to the planner via khc2hc_register. */
void X(codelet_hf_6) (planner *p)
{
     X(khc2hc_register) (p, hf_6, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,354 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include rdft/scalar/hf.h */
/*
* This function contains 72 FP additions, 66 FP multiplications,
* (or, 18 additions, 12 multiplications, 54 fused multiply/add),
* 37 stack variables, 6 constants, and 28 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_7 -- size-7 "twiddle" codelet (hf_<r> is described in the build notes
 * as a radix-r DIT step of a real-input FFT).  This is the arm compiled
 * when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined (generated
 * with -fma).
 *
 * cr, ci  data arrays, read and overwritten in place; element k is
 *         addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..6.
 * W       twiddle table; 12 values (6 pairs) are consumed per iteration.
 * rs      stride handed to WS().
 * mb, me  iteration range: the loop runs for m = mb .. me - 1.
 * ms      per-iteration step: cr moves by +ms, ci moves by -ms.
 *
 * FMA/FNMS/FMS/DK/WS/MAKE_VOLATILE_STRIDE are macros defined outside this
 * chunk; presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FMS(a,b,c) = a*b - c -- TODO confirm against the codelet headers.
 */
static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
{
INT m;
/* W starts at group (mb - 1) of 12 twiddles and advances by 12 per pass. */
for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
E T1, T19, Te, T1i, TR, T1a, Tr, T1h, TM, T1b, TE, T1g, TW, T1c;
/* Element 0 is taken as-is; no twiddle is applied to it. */
T1 = cr[0];
T19 = ci[0];
/* Elements 1 and 6 (twiddles W[0..1] and W[10..11]), then their sums and
 * differences. */
{
E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8;
T3 = cr[WS(rs, 1)];
T6 = ci[WS(rs, 1)];
T2 = W[0];
T4 = T2 * T3;
TN = T2 * T6;
T9 = cr[WS(rs, 6)];
Tc = ci[WS(rs, 6)];
T8 = W[10];
Ta = T8 * T9;
TP = T8 * Tc;
{
E T7, TO, Td, TQ, T5, Tb;
T5 = W[1];
T7 = FMA(T5, T6, T4);
TO = FNMS(T5, T3, TN);
Tb = W[11];
Td = FMA(Tb, Tc, Ta);
TQ = FNMS(Tb, T9, TP);
Te = T7 + Td;
T1i = Td - T7;
TR = TO - TQ;
T1a = TO + TQ;
}
}
/* Elements 2 and 5 (twiddles W[2..3] and W[8..9]), then their sums and
 * differences. */
{
E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl;
Tg = cr[WS(rs, 2)];
Tj = ci[WS(rs, 2)];
Tf = W[2];
Th = Tf * Tg;
TI = Tf * Tj;
Tm = cr[WS(rs, 5)];
Tp = ci[WS(rs, 5)];
Tl = W[8];
Tn = Tl * Tm;
TK = Tl * Tp;
{
E Tk, TJ, Tq, TL, Ti, To;
Ti = W[3];
Tk = FMA(Ti, Tj, Th);
TJ = FNMS(Ti, Tg, TI);
To = W[9];
Tq = FMA(To, Tp, Tn);
TL = FNMS(To, Tm, TK);
Tr = Tk + Tq;
T1h = Tq - Tk;
TM = TJ - TL;
T1b = TJ + TL;
}
}
/* Elements 3 and 4 (twiddles W[4..5] and W[6..7]), then their sums and
 * differences. */
{
E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty;
Tt = cr[WS(rs, 3)];
Tw = ci[WS(rs, 3)];
Ts = W[4];
Tu = Ts * Tt;
TS = Ts * Tw;
Tz = cr[WS(rs, 4)];
TC = ci[WS(rs, 4)];
Ty = W[6];
TA = Ty * Tz;
TU = Ty * TC;
{
E Tx, TT, TD, TV, Tv, TB;
Tv = W[5];
Tx = FMA(Tv, Tw, Tu);
TT = FNMS(Tv, Tt, TS);
TB = W[7];
TD = FMA(TB, TC, TA);
TV = FNMS(TB, Tz, TU);
TE = Tx + TD;
T1g = TD - Tx;
TW = TT - TV;
T1c = TT + TV;
}
}
/* cr[0] receives element 0 plus the three first-component pair sums. */
cr[0] = T1 + Te + Tr + TE;
/* Each block below writes one output pair from a nested FNMS/FMA chain
 * over the pair sums and pair differences computed above. */
{
E TG, TY, TF, TX, TH;
TF = FNMS(KP356895867, Tr, Te);
TG = FNMS(KP692021471, TF, TE);
TX = FMA(KP554958132, TW, TR);
TY = FMA(KP801937735, TX, TM);
TH = FNMS(KP900968867, TG, T1);
ci[0] = FNMS(KP974927912, TY, TH);
cr[WS(rs, 1)] = FMA(KP974927912, TY, TH);
}
/* ci[6] receives ci[0]'s old value plus the three second-component sums. */
ci[WS(rs, 6)] = T1a + T1b + T1c + T19;
{
E T1r, T1u, T1q, T1t, T1s;
T1q = FNMS(KP356895867, T1b, T1a);
T1r = FNMS(KP692021471, T1q, T1c);
T1t = FMA(KP554958132, T1g, T1i);
T1u = FMA(KP801937735, T1t, T1h);
T1s = FNMS(KP900968867, T1r, T19);
cr[WS(rs, 6)] = FMS(KP974927912, T1u, T1s);
ci[WS(rs, 5)] = FMA(KP974927912, T1u, T1s);
}
{
E T1m, T1p, T1l, T1o, T1n;
T1l = FNMS(KP356895867, T1a, T1c);
T1m = FNMS(KP692021471, T1l, T1b);
T1o = FMA(KP554958132, T1h, T1g);
T1p = FNMS(KP801937735, T1o, T1i);
T1n = FNMS(KP900968867, T1m, T19);
cr[WS(rs, 5)] = FMS(KP974927912, T1p, T1n);
ci[WS(rs, 4)] = FMA(KP974927912, T1p, T1n);
}
{
E T1e, T1k, T1d, T1j, T1f;
T1d = FNMS(KP356895867, T1c, T1b);
T1e = FNMS(KP692021471, T1d, T1a);
T1j = FNMS(KP554958132, T1i, T1h);
T1k = FNMS(KP801937735, T1j, T1g);
T1f = FNMS(KP900968867, T1e, T19);
cr[WS(rs, 4)] = FMS(KP974927912, T1k, T1f);
ci[WS(rs, 3)] = FMA(KP974927912, T1k, T1f);
}
{
E T15, T18, T14, T17, T16;
T14 = FNMS(KP356895867, TE, Tr);
T15 = FNMS(KP692021471, T14, Te);
T17 = FNMS(KP554958132, TR, TM);
T18 = FNMS(KP801937735, T17, TW);
T16 = FNMS(KP900968867, T15, T1);
ci[WS(rs, 2)] = FNMS(KP974927912, T18, T16);
cr[WS(rs, 3)] = FMA(KP974927912, T18, T16);
}
{
E T10, T13, TZ, T12, T11;
TZ = FNMS(KP356895867, Te, TE);
T10 = FNMS(KP692021471, TZ, Tr);
T12 = FMA(KP554958132, TM, TW);
T13 = FNMS(KP801937735, T12, TR);
T11 = FNMS(KP900968867, T10, T1);
ci[WS(rs, 1)] = FNMS(KP974927912, T13, T11);
cr[WS(rs, 2)] = FMA(KP974927912, T13, T11);
}
}
}
}
/* Twiddle instruction list for hf_7: one TW_FULL entry followed by a
 * TW_NEXT entry. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 7 }, { TW_NEXT, 1, 0 } };

/* Descriptor for the FMA hf_7: size 7, codelet name, twiddle list, genus
 * and four cost fields.  The first three equal the 18 additions,
 * 12 multiplications and 54 fused multiply/adds quoted in the generated
 * header comment; the fourth is 0. */
static const hc2hc_desc desc = {
     7, "hf_7", twinstr, &GENUS,
     { 18, 12, 54, 0 }
};

/* Hands hf_7 and its descriptor to the planner via khc2hc_register. */
void X(codelet_hf_7) (planner *p)
{
     X(khc2hc_register) (p, hf_7, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include rdft/scalar/hf.h */
/*
* This function contains 72 FP additions, 60 FP multiplications,
* (or, 36 additions, 24 multiplications, 36 fused multiply/add),
* 29 stack variables, 6 constants, and 28 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_7 -- size-7 "twiddle" codelet (hf_<r> is described in the build notes
 * as a radix-r DIT step of a real-input FFT).  This is the #else arm, used
 * when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined
 * (generated without -fma).
 *
 * cr, ci  data arrays, read and overwritten in place; element k is
 *         addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..6.
 * W       twiddle table; 12 values (6 pairs) are consumed per iteration.
 * rs      stride handed to WS().
 * mb, me  iteration range: the loop runs for m = mb .. me - 1.
 * ms      per-iteration step: cr moves by +ms, ci moves by -ms.
 *
 * FMA/FNMS/FNMA/DK/WS/MAKE_VOLATILE_STRIDE are macros defined outside this
 * chunk; presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FNMA(a,b,c) = -(a*b) - c -- TODO confirm against the codelet headers.
 */
static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
{
INT m;
/* W starts at group (mb - 1) of 12 twiddles and advances by 12 per pass. */
for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
E T1, TT, Tc, TV, TC, TO, Tn, TS, TI, TP, Ty, TU, TF, TQ;
/* Element 0 is taken as-is; no twiddle is applied to it. */
T1 = cr[0];
TT = ci[0];
/* Element 1 (W[0], W[1]) paired with element 6 (W[10], W[11]). */
{
E T6, TA, Tb, TB;
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 1)];
T5 = ci[WS(rs, 1)];
T2 = W[0];
T4 = W[1];
T6 = FMA(T2, T3, T4 * T5);
TA = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = cr[WS(rs, 6)];
Ta = ci[WS(rs, 6)];
T7 = W[10];
T9 = W[11];
Tb = FMA(T7, T8, T9 * Ta);
TB = FNMS(T9, T8, T7 * Ta);
}
Tc = T6 + Tb;
TV = TA + TB;
TC = TA - TB;
TO = Tb - T6;
}
/* Element 2 (W[2], W[3]) paired with element 5 (W[8], W[9]). */
{
E Th, TG, Tm, TH;
{
E Te, Tg, Td, Tf;
Te = cr[WS(rs, 2)];
Tg = ci[WS(rs, 2)];
Td = W[2];
Tf = W[3];
Th = FMA(Td, Te, Tf * Tg);
TG = FNMS(Tf, Te, Td * Tg);
}
{
E Tj, Tl, Ti, Tk;
Tj = cr[WS(rs, 5)];
Tl = ci[WS(rs, 5)];
Ti = W[8];
Tk = W[9];
Tm = FMA(Ti, Tj, Tk * Tl);
TH = FNMS(Tk, Tj, Ti * Tl);
}
Tn = Th + Tm;
TS = TG + TH;
TI = TG - TH;
TP = Th - Tm;
}
/* Element 3 (W[4], W[5]) paired with element 4 (W[6], W[7]). */
{
E Ts, TD, Tx, TE;
{
E Tp, Tr, To, Tq;
Tp = cr[WS(rs, 3)];
Tr = ci[WS(rs, 3)];
To = W[4];
Tq = W[5];
Ts = FMA(To, Tp, Tq * Tr);
TD = FNMS(Tq, Tp, To * Tr);
}
{
E Tu, Tw, Tt, Tv;
Tu = cr[WS(rs, 4)];
Tw = ci[WS(rs, 4)];
Tt = W[6];
Tv = W[7];
Tx = FMA(Tt, Tu, Tv * Tw);
TE = FNMS(Tv, Tu, Tt * Tw);
}
Ty = Ts + Tx;
TU = TD + TE;
TF = TD - TE;
TQ = Tx - Ts;
}
/* Each output pair is written as (a - b, a + b) or (b - a, b + a), where
 * a and b are weighted combinations of the pair sums and differences. */
{
E TL, TK, TZ, T10;
cr[0] = T1 + Tc + Tn + Ty;
TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
ci[0] = TK - TL;
cr[WS(rs, 1)] = TK + TL;
ci[WS(rs, 6)] = TV + TS + TU + TT;
TZ = FMA(KP781831482, TO, KP433883739 * TQ) - (KP974927912 * TP);
T10 = FMA(KP623489801, TV, TT) + FNMA(KP900968867, TU, KP222520933 * TS);
cr[WS(rs, 6)] = TZ - T10;
ci[WS(rs, 5)] = TZ + T10;
}
{
E TX, TY, TR, TW;
TX = FMA(KP974927912, TO, KP433883739 * TP) - (KP781831482 * TQ);
TY = FMA(KP623489801, TU, TT) + FNMA(KP900968867, TS, KP222520933 * TV);
cr[WS(rs, 5)] = TX - TY;
ci[WS(rs, 4)] = TX + TY;
TR = FMA(KP433883739, TO, KP781831482 * TP) + (KP974927912 * TQ);
TW = FMA(KP623489801, TS, TT) + FNMA(KP222520933, TU, KP900968867 * TV);
cr[WS(rs, 4)] = TR - TW;
ci[WS(rs, 3)] = TR + TW;
}
{
E TN, TM, TJ, Tz;
TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
ci[WS(rs, 2)] = TM - TN;
cr[WS(rs, 3)] = TM + TN;
TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
ci[WS(rs, 1)] = Tz - TJ;
cr[WS(rs, 2)] = Tz + TJ;
}
}
}
}
/* Twiddle instruction list for hf_7: one TW_FULL entry followed by a
 * TW_NEXT entry. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 7 }, { TW_NEXT, 1, 0 } };

/* Descriptor for the non-FMA hf_7: size 7, codelet name, twiddle list,
 * genus and four cost fields.  The first three equal the 36 additions,
 * 24 multiplications and 36 fused multiply/adds quoted in the generated
 * header comment; the fourth is 0. */
static const hc2hc_desc desc = {
     7, "hf_7", twinstr, &GENUS,
     { 36, 24, 36, 0 }
};

/* Hands hf_7 and its descriptor to the planner via khc2hc_register. */
void X(codelet_hf_7) (planner *p)
{
     X(khc2hc_register) (p, hf_7, &desc);
}
#endif

View File

@@ -0,0 +1,376 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include rdft/scalar/hf.h */
/*
* This function contains 66 FP additions, 36 FP multiplications,
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
* 34 stack variables, 1 constant, and 32 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_8 -- size-8 "twiddle" codelet (hf_<r> is described in the build notes
 * as a radix-r DIT step of a real-input FFT).  This is the arm compiled
 * when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined (generated
 * with -fma).
 *
 * cr, ci  data arrays, read and overwritten in place; element k is
 *         addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..7.
 * W       twiddle table; 14 values (7 pairs) are consumed per iteration.
 * rs      stride handed to WS().
 * mb, me  iteration range: the loop runs for m = mb .. me - 1.
 * ms      per-iteration step: cr moves by +ms, ci moves by -ms.
 *
 * FMA/FNMS/FMS/DK/WS/MAKE_VOLATILE_STRIDE are macros defined outside this
 * chunk; presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FMS(a,b,c) = a*b - c -- TODO confirm against the codelet headers.
 */
static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W starts at group (mb - 1) of 14 twiddles and advances by 14 per pass. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
E TX, Ty, TZ, TV, T10;
/* Element 0 is taken as-is; no twiddle is applied to it. */
T1 = cr[0];
T1m = ci[0];
/* Element 4 combined with twiddle pair W[6], W[7]. */
{
E T3, T6, T4, T1k, T2, T5;
T3 = cr[WS(rs, 4)];
T6 = ci[WS(rs, 4)];
T2 = W[6];
T4 = T2 * T3;
T1k = T2 * T6;
T5 = W[7];
T7 = FMA(T5, T6, T4);
T1l = FNMS(T5, T3, T1k);
}
/* Element 6 combined with twiddle pair W[10], W[11]. */
{
E Tg, Tj, Th, TR, Tf, Ti;
Tg = cr[WS(rs, 6)];
Tj = ci[WS(rs, 6)];
Tf = W[10];
Th = Tf * Tg;
TR = Tf * Tj;
Ti = W[11];
Tk = FMA(Ti, Tj, Th);
TS = FNMS(Ti, Tg, TR);
}
/* Element 2 combined with twiddle pair W[2], W[3]. */
{
E Ta, Td, Tb, TP, T9, Tc;
Ta = cr[WS(rs, 2)];
Td = ci[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
TP = T9 * Td;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
TQ = FNMS(Tc, Ta, TP);
}
/* Elements 7 and 3 (twiddles W[12..13] and W[4..5]) and their differences. */
{
E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
TB = cr[WS(rs, 7)];
TE = ci[WS(rs, 7)];
TA = W[12];
TC = TA * TB;
T13 = TA * TE;
TH = cr[WS(rs, 3)];
TK = ci[WS(rs, 3)];
TG = W[4];
TI = TG * TH;
T15 = TG * TK;
TD = W[13];
TF = FMA(TD, TE, TC);
T14 = FNMS(TD, TB, T13);
TJ = W[5];
TL = FMA(TJ, TK, TI);
T16 = FNMS(TJ, TH, T15);
T12 = TF - TL;
T17 = T14 - T16;
}
/* Elements 1 and 5 (twiddles W[0..1] and W[8..9]) and their differences. */
{
E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
To = cr[WS(rs, 1)];
Tr = ci[WS(rs, 1)];
Tn = W[0];
Tp = Tn * To;
TW = Tn * Tr;
Tu = cr[WS(rs, 5)];
Tx = ci[WS(rs, 5)];
Tt = W[8];
Tv = Tt * Tu;
TY = Tt * Tx;
Tq = W[1];
Ts = FMA(Tq, Tr, Tp);
TX = FNMS(Tq, To, TW);
Tw = W[9];
Ty = FMA(Tw, Tx, Tv);
TZ = FNMS(Tw, Tu, TY);
TV = Ts - Ty;
T10 = TX - TZ;
}
/* Eight outputs built from the pair differences; these are the ones
 * scaled by KP707106781. */
{
E TU, T1a, T1t, T1v, T19, T1u, T1d, T1w;
{
E TO, TT, T1r, T1s;
TO = T1 - T7;
TT = TQ - TS;
TU = TO + TT;
T1a = TO - TT;
T1r = Te - Tk;
T1s = T1m - T1l;
T1t = T1r + T1s;
T1v = T1s - T1r;
}
{
E T11, T18, T1b, T1c;
T11 = TV + T10;
T18 = T12 - T17;
T19 = T11 + T18;
T1u = T18 - T11;
T1b = TV - T10;
T1c = T12 + T17;
T1d = T1b + T1c;
T1w = T1c - T1b;
}
ci[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
cr[WS(rs, 5)] = FMS(KP707106781, T1w, T1v);
ci[WS(rs, 6)] = FMA(KP707106781, T1w, T1v);
cr[WS(rs, 1)] = FMA(KP707106781, T19, TU);
cr[WS(rs, 3)] = FNMS(KP707106781, T1d, T1a);
cr[WS(rs, 7)] = FMS(KP707106781, T1u, T1t);
ci[WS(rs, 4)] = FMA(KP707106781, T1u, T1t);
ci[0] = FMA(KP707106781, T1d, T1a);
}
/* Eight outputs built from the pair sums; only additions and
 * subtractions are needed here. */
{
E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
{
E T8, Tl, T1j, T1n;
T8 = T1 + T7;
Tl = Te + Tk;
Tm = T8 + Tl;
T1e = T8 - Tl;
T1j = TQ + TS;
T1n = T1l + T1m;
T1o = T1j + T1n;
T1q = T1n - T1j;
}
{
E Tz, TM, T1f, T1g;
Tz = Ts + Ty;
TM = TF + TL;
TN = Tz + TM;
T1p = TM - Tz;
T1f = T14 + T16;
T1g = TX + TZ;
T1h = T1f - T1g;
T1i = T1g + T1f;
}
ci[WS(rs, 3)] = Tm - TN;
cr[WS(rs, 6)] = T1p - T1q;
ci[WS(rs, 5)] = T1p + T1q;
cr[0] = Tm + TN;
cr[WS(rs, 2)] = T1e - T1h;
cr[WS(rs, 4)] = T1i - T1o;
ci[WS(rs, 7)] = T1i + T1o;
ci[WS(rs, 1)] = T1e + T1h;
}
}
}
}
/* Twiddle instruction list for hf_8: one TW_FULL entry followed by a
 * TW_NEXT entry. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 8 }, { TW_NEXT, 1, 0 } };

/* Descriptor for the FMA hf_8: size 8, codelet name, twiddle list, genus
 * and four cost fields.  The first three equal the 44 additions,
 * 14 multiplications and 22 fused multiply/adds quoted in the generated
 * header comment; the fourth is 0. */
static const hc2hc_desc desc = {
     8, "hf_8", twinstr, &GENUS,
     { 44, 14, 22, 0 }
};

/* Hands hf_8 and its descriptor to the planner via khc2hc_register. */
void X(codelet_hf_8) (planner *p)
{
     X(khc2hc_register) (p, hf_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include rdft/scalar/hf.h */
/*
* This function contains 66 FP additions, 32 FP multiplications,
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
* 28 stack variables, 1 constant, and 32 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_8 -- size-8 "twiddle" codelet (hf_<r> is described in the build notes
 * as a radix-r DIT step of a real-input FFT).  This is the #else arm, used
 * when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined
 * (generated without -fma).
 *
 * cr, ci  data arrays, read and overwritten in place; element k is
 *         addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..7.
 * W       twiddle table; 14 values (7 pairs) are consumed per iteration.
 * rs      stride handed to WS().
 * mb, me  iteration range: the loop runs for m = mb .. me - 1.
 * ms      per-iteration step: cr moves by +ms, ci moves by -ms.
 *
 * FMA/FNMS/DK/WS/MAKE_VOLATILE_STRIDE are macros defined outside this
 * chunk; presumably FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b --
 * TODO confirm against the codelet headers.
 */
static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W starts at group (mb - 1) of 14 twiddles and advances by 14 per pass. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
E T7, T1f, TH, T19, TF, T12, TR, TU, Ti, T1e, TK, T16, Tu, T13, TM;
E TP;
/* Element 0 (untwiddled) paired with element 4 (twiddles W[6], W[7]). */
{
E T1, T18, T6, T17;
T1 = cr[0];
T18 = ci[0];
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 4)];
T5 = ci[WS(rs, 4)];
T2 = W[6];
T4 = W[7];
T6 = FMA(T2, T3, T4 * T5);
T17 = FNMS(T4, T3, T2 * T5);
}
T7 = T1 + T6;
T1f = T18 - T17;
TH = T1 - T6;
T19 = T17 + T18;
}
/* Element 7 (W[12], W[13]) paired with element 3 (W[4], W[5]). */
{
E Tz, TS, TE, TT;
{
E Tw, Ty, Tv, Tx;
Tw = cr[WS(rs, 7)];
Ty = ci[WS(rs, 7)];
Tv = W[12];
Tx = W[13];
Tz = FMA(Tv, Tw, Tx * Ty);
TS = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = cr[WS(rs, 3)];
TD = ci[WS(rs, 3)];
TA = W[4];
TC = W[5];
TE = FMA(TA, TB, TC * TD);
TT = FNMS(TC, TB, TA * TD);
}
TF = Tz + TE;
T12 = TS + TT;
TR = Tz - TE;
TU = TS - TT;
}
/* Element 2 (W[2], W[3]) paired with element 6 (W[10], W[11]). */
{
E Tc, TI, Th, TJ;
{
E T9, Tb, T8, Ta;
T9 = cr[WS(rs, 2)];
Tb = ci[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
TI = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = cr[WS(rs, 6)];
Tg = ci[WS(rs, 6)];
Td = W[10];
Tf = W[11];
Th = FMA(Td, Te, Tf * Tg);
TJ = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc + Th;
T1e = Tc - Th;
TK = TI - TJ;
T16 = TI + TJ;
}
/* Element 1 (W[0], W[1]) paired with element 5 (W[8], W[9]). */
{
E To, TN, Tt, TO;
{
E Tl, Tn, Tk, Tm;
Tl = cr[WS(rs, 1)];
Tn = ci[WS(rs, 1)];
Tk = W[0];
Tm = W[1];
To = FMA(Tk, Tl, Tm * Tn);
TN = FNMS(Tm, Tl, Tk * Tn);
}
{
E Tq, Ts, Tp, Tr;
Tq = cr[WS(rs, 5)];
Ts = ci[WS(rs, 5)];
Tp = W[8];
Tr = W[9];
Tt = FMA(Tp, Tq, Tr * Ts);
TO = FNMS(Tr, Tq, Tp * Ts);
}
Tu = To + Tt;
T13 = TN + TO;
TM = To - Tt;
TP = TN - TO;
}
/* First half of the outputs: ci[3], cr[0], cr[6], ci[5] from the pair
 * sums, then cr[3], ci[6], ci[0], cr[5] using KP707106781. */
{
E Tj, TG, T1b, T1c;
Tj = T7 + Ti;
TG = Tu + TF;
ci[WS(rs, 3)] = Tj - TG;
cr[0] = Tj + TG;
T1b = TF - Tu;
T1c = T19 - T16;
cr[WS(rs, 6)] = T1b - T1c;
ci[WS(rs, 5)] = T1b + T1c;
{
E TX, T1i, T10, T1h, TY, TZ;
TX = TH - TK;
T1i = T1f - T1e;
TY = TM - TP;
TZ = TR + TU;
T10 = KP707106781 * (TY + TZ);
T1h = KP707106781 * (TZ - TY);
cr[WS(rs, 3)] = TX - T10;
ci[WS(rs, 6)] = T1h + T1i;
ci[0] = TX + T10;
cr[WS(rs, 5)] = T1h - T1i;
}
}
/* Second half of the outputs: cr[4], ci[7], cr[2], ci[1] from the pair
 * sums, then ci[2], ci[4], cr[1], cr[7] using KP707106781. */
{
E T15, T1a, T11, T14;
T15 = T13 + T12;
T1a = T16 + T19;
cr[WS(rs, 4)] = T15 - T1a;
ci[WS(rs, 7)] = T15 + T1a;
T11 = T7 - Ti;
T14 = T12 - T13;
cr[WS(rs, 2)] = T11 - T14;
ci[WS(rs, 1)] = T11 + T14;
{
E TL, T1g, TW, T1d, TQ, TV;
TL = TH + TK;
T1g = T1e + T1f;
TQ = TM + TP;
TV = TR - TU;
TW = KP707106781 * (TQ + TV);
T1d = KP707106781 * (TV - TQ);
ci[WS(rs, 2)] = TL - TW;
ci[WS(rs, 4)] = T1d + T1g;
cr[WS(rs, 1)] = TL + TW;
cr[WS(rs, 7)] = T1d - T1g;
}
}
}
}
}
/* Twiddle instruction list for hf_8: one TW_FULL entry followed by a
 * TW_NEXT entry. */
static const tw_instr twinstr[] = { { TW_FULL, 1, 8 }, { TW_NEXT, 1, 0 } };

/* Descriptor for the non-FMA hf_8: size 8, codelet name, twiddle list,
 * genus and four cost fields.  The first three equal the 52 additions,
 * 18 multiplications and 14 fused multiply/adds quoted in the generated
 * header comment; the fourth is 0. */
static const hc2hc_desc desc = {
     8, "hf_8", twinstr, &GENUS,
     { 52, 18, 14, 0 }
};

/* Hands hf_8 and its descriptor to the planner via khc2hc_register. */
void X(codelet_hf_8) (planner *p)
{
     X(khc2hc_register) (p, hf_8, &desc);
}
#endif

View File

@@ -0,0 +1,487 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:13 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -dit -name hf_9 -include rdft/scalar/hf.h */
/*
* This function contains 96 FP additions, 88 FP multiplications,
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
* 55 stack variables, 10 constants, and 36 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_9 -- size-9 "twiddle" codelet (hf_<r> is described in the build notes
 * as a radix-r DIT step of a real-input FFT).  This is the arm compiled
 * when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined (generated
 * with -fma).
 *
 * cr, ci  data arrays, read and overwritten in place; element k is
 *         addressed as cr[WS(rs, k)] / ci[WS(rs, k)] for k = 0..8.
 * W       twiddle table; 16 values (8 pairs) are consumed per iteration.
 * rs      stride handed to WS().
 * mb, me  iteration range: the loop runs for m = mb .. me - 1.
 * ms      per-iteration step: cr moves by +ms, ci moves by -ms.
 *
 * FMA/FNMS/FMS/DK/WS/MAKE_VOLATILE_STRIDE are macros defined outside this
 * chunk; presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FMS(a,b,c) = a*b - c -- TODO confirm against the codelet headers.
 */
static void hf_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* W starts at group (mb - 1) of 16 twiddles and advances by 16 per pass. */
for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
E T1, T1P, Te, T1S, T10, T1Q, T1a, T1d, Ty, T18, Tl, T13, T19, T1c, T1l;
E T1r, TS, T1p, TF, T1o, T1g, T1q;
/* Element 0 is taken as-is; no twiddle is applied to it. */
T1 = cr[0];
T1P = ci[0];
/* Elements 3 and 6 (twiddles W[4..5] and W[10..11]), then their sums and
 * differences. */
{
E T3, T6, T4, TW, T9, Tc, Ta, TY, T2, T8;
T3 = cr[WS(rs, 3)];
T6 = ci[WS(rs, 3)];
T2 = W[4];
T4 = T2 * T3;
TW = T2 * T6;
T9 = cr[WS(rs, 6)];
Tc = ci[WS(rs, 6)];
T8 = W[10];
Ta = T8 * T9;
TY = T8 * Tc;
{
E T7, TX, Td, TZ, T5, Tb;
T5 = W[5];
T7 = FMA(T5, T6, T4);
TX = FNMS(T5, T3, TW);
Tb = W[11];
Td = FMA(Tb, Tc, Ta);
TZ = FNMS(Tb, T9, TY);
Te = T7 + Td;
T1S = Td - T7;
T10 = TX - TZ;
T1Q = TX + TZ;
}
}
/* Elements 1, 7 and 4 (twiddles W[0..1], W[12..13], W[6..7]) and the
 * combinations of them used below. */
{
E Th, Tk, Ti, T12, Tx, T17, Tr, T15, Tg, Tj;
Th = cr[WS(rs, 1)];
Tk = ci[WS(rs, 1)];
Tg = W[0];
Ti = Tg * Th;
T12 = Tg * Tk;
{
E Tt, Tw, Tu, T16, Ts, Tv;
Tt = cr[WS(rs, 7)];
Tw = ci[WS(rs, 7)];
Ts = W[12];
Tu = Ts * Tt;
T16 = Ts * Tw;
Tv = W[13];
Tx = FMA(Tv, Tw, Tu);
T17 = FNMS(Tv, Tt, T16);
}
{
E Tn, Tq, To, T14, Tm, Tp;
Tn = cr[WS(rs, 4)];
Tq = ci[WS(rs, 4)];
Tm = W[6];
To = Tm * Tn;
T14 = Tm * Tq;
Tp = W[7];
Tr = FMA(Tp, Tq, To);
T15 = FNMS(Tp, Tn, T14);
}
T1a = Tr - Tx;
T1d = T15 - T17;
Ty = Tr + Tx;
T18 = T15 + T17;
Tj = W[1];
Tl = FMA(Tj, Tk, Ti);
T13 = FNMS(Tj, Th, T12);
T19 = FNMS(KP500000000, T18, T13);
T1c = FNMS(KP500000000, Ty, Tl);
}
/* Elements 2, 8 and 5 (twiddles W[2..3], W[14..15], W[8..9]) and the
 * combinations of them used below. */
{
E TB, TE, TC, T1n, TR, T1k, TL, T1i, TA, TD;
TB = cr[WS(rs, 2)];
TE = ci[WS(rs, 2)];
TA = W[2];
TC = TA * TB;
T1n = TA * TE;
{
E TN, TQ, TO, T1j, TM, TP;
TN = cr[WS(rs, 8)];
TQ = ci[WS(rs, 8)];
TM = W[14];
TO = TM * TN;
T1j = TM * TQ;
TP = W[15];
TR = FMA(TP, TQ, TO);
T1k = FNMS(TP, TN, T1j);
}
{
E TH, TK, TI, T1h, TG, TJ;
TH = cr[WS(rs, 5)];
TK = ci[WS(rs, 5)];
TG = W[8];
TI = TG * TH;
T1h = TG * TK;
TJ = W[9];
TL = FMA(TJ, TK, TI);
T1i = FNMS(TJ, TH, T1h);
}
T1l = T1i - T1k;
T1r = TR - TL;
TS = TL + TR;
T1p = T1i + T1k;
TD = W[3];
TF = FMA(TD, TE, TC);
T1o = FNMS(TD, TB, T1n);
T1g = FNMS(KP500000000, TS, TF);
T1q = FNMS(KP500000000, T1p, T1o);
}
/* Outputs cr[0], ci[8], ci[2], cr[3], cr[6], ci[5], built from the plain
 * sums of the three groups. */
{
E Tf, T21, TU, T24, T1O, T22, T1L, T23;
Tf = T1 + Te;
T21 = T1Q + T1P;
{
E Tz, TT, T1M, T1N;
Tz = Tl + Ty;
TT = TF + TS;
TU = Tz + TT;
T24 = TT - Tz;
T1M = T13 + T18;
T1N = T1o + T1p;
T1O = T1M - T1N;
T22 = T1M + T1N;
}
cr[0] = Tf + TU;
ci[WS(rs, 8)] = T22 + T21;
T1L = FNMS(KP500000000, TU, Tf);
ci[WS(rs, 2)] = FNMS(KP866025403, T1O, T1L);
cr[WS(rs, 3)] = FMA(KP866025403, T1O, T1L);
T23 = FNMS(KP500000000, T22, T21);
cr[WS(rs, 6)] = FMS(KP866025403, T24, T23);
ci[WS(rs, 5)] = FMA(KP866025403, T24, T23);
}
/* Remaining twelve outputs, built from the KP866025403-weighted
 * combinations of each group. */
{
E T11, T1z, T1T, T1X, T1f, T1w, T1t, T1x, T1u, T1Y, T1C, T1I, T1F, T1J, T1G;
E T1U, TV, T1R;
TV = FNMS(KP500000000, Te, T1);
T11 = FNMS(KP866025403, T10, TV);
T1z = FMA(KP866025403, T10, TV);
T1R = FNMS(KP500000000, T1Q, T1P);
T1T = FMA(KP866025403, T1S, T1R);
T1X = FNMS(KP866025403, T1S, T1R);
{
E T1b, T1e, T1m, T1s;
T1b = FMA(KP866025403, T1a, T19);
T1e = FNMS(KP866025403, T1d, T1c);
T1f = FMA(KP176326980, T1e, T1b);
T1w = FNMS(KP176326980, T1b, T1e);
T1m = FNMS(KP866025403, T1l, T1g);
T1s = FNMS(KP866025403, T1r, T1q);
T1t = FNMS(KP363970234, T1s, T1m);
T1x = FMA(KP363970234, T1m, T1s);
}
T1u = FNMS(KP954188894, T1t, T1f);
T1Y = FMA(KP954188894, T1x, T1w);
{
E T1A, T1B, T1D, T1E;
T1A = FMA(KP866025403, T1r, T1q);
T1B = FMA(KP866025403, T1l, T1g);
T1C = FMA(KP176326980, T1B, T1A);
T1I = FNMS(KP176326980, T1A, T1B);
T1D = FMA(KP866025403, T1d, T1c);
T1E = FNMS(KP866025403, T1a, T19);
T1F = FMA(KP839099631, T1E, T1D);
T1J = FNMS(KP839099631, T1D, T1E);
}
T1G = FMA(KP777861913, T1F, T1C);
T1U = FNMS(KP777861913, T1J, T1I);
cr[WS(rs, 2)] = FMA(KP984807753, T1u, T11);
ci[WS(rs, 7)] = FNMS(KP984807753, T1U, T1T);
ci[WS(rs, 6)] = FNMS(KP984807753, T1Y, T1X);
cr[WS(rs, 1)] = FMA(KP984807753, T1G, T1z);
{
E T1V, T1W, T1H, T1K;
T1V = FMA(KP492403876, T1U, T1T);
T1W = FNMS(KP777861913, T1F, T1C);
cr[WS(rs, 7)] = FMS(KP852868531, T1W, T1V);
ci[WS(rs, 4)] = FMA(KP852868531, T1W, T1V);
T1H = FNMS(KP492403876, T1G, T1z);
T1K = FMA(KP777861913, T1J, T1I);
ci[WS(rs, 1)] = FNMS(KP852868531, T1K, T1H);
cr[WS(rs, 4)] = FMA(KP852868531, T1K, T1H);
}
{
E T1v, T1y, T1Z, T20;
T1v = FNMS(KP492403876, T1u, T11);
T1y = FNMS(KP954188894, T1x, T1w);
ci[WS(rs, 3)] = FNMS(KP852868531, T1y, T1v);
ci[0] = FMA(KP852868531, T1y, T1v);
T1Z = FMA(KP492403876, T1Y, T1X);
T20 = FMA(KP954188894, T1t, T1f);
cr[WS(rs, 5)] = FMS(KP852868531, T20, T1Z);
/* Written as an explicit negation of an FMA result. */
cr[WS(rs, 8)] = -(FMA(KP852868531, T20, T1Z));
}
}
}
}
}
/*
 * Twiddle-table instructions referenced by the hf_9 descriptor: one TW_FULL
 * entry with arguments (1, 9) followed by one TW_NEXT entry with (1, 0).
 * NOTE(review): presumably TW_FULL requests the complete set of radix-9
 * twiddle factors and TW_NEXT closes the list -- confirm against the
 * tw_instr definition, which is not visible in this file.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA build of hf_9: radix 9, name "hf_9", the twinstr
 * table, the GENUS object, and a four-entry count tuple { 24, 16, 72, 0 }.
 * NOTE(review): the tuple is presumably (additions, multiplications, fused
 * multiply/adds, other) as in the generator's operation summaries -- confirm
 * against the hc2hc_desc definition.
 */
static const hc2hc_desc desc = { 9, "hf_9", twinstr, &GENUS, { 24, 16, 72, 0 } };
/*
 * Registration entry point: hands the hf_9 kernel and its descriptor to the
 * planner `p` through X(khc2hc_register).  X() is the project's name-wrapping
 * macro, so the exported symbol name depends on how X is defined.
 */
void X(codelet_hf_9) (planner *p) {
X(khc2hc_register) (p, hf_9, &desc);
}
#else
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 9 -dit -name hf_9 -include rdft/scalar/hf.h */
/*
* This function contains 96 FP additions, 72 FP multiplications,
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
* 41 stack variables, 8 constants, and 36 memory accesses
*/
#include "rdft/scalar/hf.h"
/*
 * hf_9: radix-9 twiddle codelet (generator flags: -n 9 -dit), non-FMA build.
 *
 * For each m in [mb, me) the kernel works in place on the nine complex
 * values (cr[WS(rs, k)], ci[WS(rs, k)]), k = 0..8:
 *   1. inputs k = 1..8 are multiplied by the twiddle pair
 *      (W[2k-2], W[2k-1]) as
 *          re' = W[2k-2] * re + W[2k-1] * im
 *          im' = W[2k-2] * im - W[2k-1] * re
 *      while input 0 is used unchanged;
 *   2. a 9-point DFT X[0..8] of the twiddled values is formed from 3-point
 *      butterflies over the index groups {0,3,6}, {1,4,7}, {2,5,8}, with
 *      rotations by 40, 80 and 160 degrees applied between the two stages;
 *   3. results are stored back as
 *          cr[WS(rs, k)] =  Re X[k],  ci[WS(rs, 8-k)] = Im X[k]   for k = 0..4
 *          ci[WS(rs, 8-k)] = Re X[k], cr[WS(rs, k)] = -Im X[k]    for k = 5..8
 *
 * Parameters:
 *   cr, ci  base pointers of the two data arrays; after every iteration cr
 *           is advanced by +ms and ci by -ms.
 *   W       twiddle table; the loop starts at W + (mb - 1) * 16 and consumes
 *           16 values per iteration.
 *   rs      stride object passed to WS() to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration pointer step.
 *
 * NOTE(review): DK, E, WS and MAKE_VOLATILE_STRIDE come from project headers
 * not visible here; DK presumably declares a named constant and
 * MAKE_VOLATILE_STRIDE is presumably a compiler-workaround hook -- confirm.
 */
static void hf_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
E T1, T1B, TQ, T1A, Tc, TN, T1C, T1D, TL, T1x, T19, T1o, T1c, T1n, Tu;
E T1w, TW, T1k, T11, T1l;
/* Index group {0, 3, 6}: twiddle inputs 3 and 6, then form the pieces of
 * the 3-point butterfly (sums Tc/T1C, half-differences TN/T1D, and the
 * sqrt(3)/2-scaled cross terms TQ/T1A). */
{
E T6, TO, Tb, TP;
T1 = cr[0];
T1B = ci[0];
{
E T3, T5, T2, T4;
T3 = cr[WS(rs, 3)];
T5 = ci[WS(rs, 3)];
T2 = W[4];
T4 = W[5];
T6 = FMA(T2, T3, T4 * T5);
TO = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = cr[WS(rs, 6)];
Ta = ci[WS(rs, 6)];
T7 = W[10];
T9 = W[11];
Tb = FMA(T7, T8, T9 * Ta);
TP = FNMS(T9, T8, T7 * Ta);
}
TQ = KP866025403 * (TO - TP);
T1A = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
TN = FNMS(KP500000000, Tc, T1);
T1C = TO + TP;
T1D = FNMS(KP500000000, T1C, T1B);
}
/* Index group {2, 5, 8}: twiddle, then 3-point butterfly.  (TL, T1x) is the
 * plain sum; (T1n, T1o) and (T1c, T19) are the two rotated combinations. */
{
E Tz, T13, TE, T14, TJ, T15, TK, T16;
{
E Tw, Ty, Tv, Tx;
Tw = cr[WS(rs, 2)];
Ty = ci[WS(rs, 2)];
Tv = W[2];
Tx = W[3];
Tz = FMA(Tv, Tw, Tx * Ty);
T13 = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = cr[WS(rs, 5)];
TD = ci[WS(rs, 5)];
TA = W[8];
TC = W[9];
TE = FMA(TA, TB, TC * TD);
T14 = FNMS(TC, TB, TA * TD);
}
{
E TG, TI, TF, TH;
TG = cr[WS(rs, 8)];
TI = ci[WS(rs, 8)];
TF = W[14];
TH = W[15];
TJ = FMA(TF, TG, TH * TI);
T15 = FNMS(TH, TG, TF * TI);
}
TK = TE + TJ;
T16 = T14 + T15;
TL = Tz + TK;
T1x = T13 + T16;
{
E T17, T18, T1a, T1b;
T17 = FNMS(KP500000000, T16, T13);
T18 = KP866025403 * (TJ - TE);
T19 = T17 - T18;
T1o = T18 + T17;
T1a = FNMS(KP500000000, TK, Tz);
T1b = KP866025403 * (T14 - T15);
T1c = T1a - T1b;
T1n = T1a + T1b;
}
}
/* Index group {1, 4, 7}: twiddle, then 3-point butterfly.  (Tu, T1w) is the
 * plain sum; (T1k, T1l) and (TW, T11) are the two rotated combinations. */
{
E Ti, TX, Tn, TT, Ts, TU, Tt, TY;
{
E Tf, Th, Te, Tg;
Tf = cr[WS(rs, 1)];
Th = ci[WS(rs, 1)];
Te = W[0];
Tg = W[1];
Ti = FMA(Te, Tf, Tg * Th);
TX = FNMS(Tg, Tf, Te * Th);
}
{
E Tk, Tm, Tj, Tl;
Tk = cr[WS(rs, 4)];
Tm = ci[WS(rs, 4)];
Tj = W[6];
Tl = W[7];
Tn = FMA(Tj, Tk, Tl * Tm);
TT = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = cr[WS(rs, 7)];
Tr = ci[WS(rs, 7)];
To = W[12];
Tq = W[13];
Ts = FMA(To, Tp, Tq * Tr);
TU = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn + Ts;
TY = TT + TU;
Tu = Ti + Tt;
T1w = TX + TY;
{
E TS, TV, TZ, T10;
TS = FNMS(KP500000000, Tt, Ti);
TV = KP866025403 * (TT - TU);
TW = TS - TV;
T1k = TS + TV;
TZ = FNMS(KP500000000, TY, TX);
T10 = KP866025403 * (Ts - Tn);
T11 = TZ - T10;
T1l = T10 + TZ;
}
}
/* Real parts of X[0], X[3] and X[6] (no rotation needed for this row). */
{
E T1y, Td, TM, T1v;
T1y = KP866025403 * (T1w - T1x);
Td = T1 + Tc;
TM = Tu + TL;
T1v = FNMS(KP500000000, TM, Td);
cr[0] = Td + TM;
cr[WS(rs, 3)] = T1v + T1y;
ci[WS(rs, 2)] = T1v - T1y;
}
/* X[2], X[5], X[8]: group {1,4,7} is rotated by 80 degrees
 * (0.1736, 0.9848) and group {2,5,8} by 160 degrees (-0.9397, 0.3420)
 * before the final 3-point combination. */
{
E TR, T1I, T1e, T1K, T1i, T1H, T1f, T1J;
TR = TN - TQ;
T1I = T1D - T1A;
{
E T12, T1d, T1g, T1h;
T12 = FMA(KP173648177, TW, KP984807753 * T11);
T1d = FNMS(KP939692620, T1c, KP342020143 * T19);
T1e = T12 + T1d;
T1K = KP866025403 * (T1d - T12);
T1g = FNMS(KP984807753, TW, KP173648177 * T11);
T1h = FMA(KP342020143, T1c, KP939692620 * T19);
T1i = KP866025403 * (T1g + T1h);
T1H = T1g - T1h;
}
cr[WS(rs, 2)] = TR + T1e;
ci[WS(rs, 6)] = T1H + T1I;
T1f = FNMS(KP500000000, T1e, TR);
ci[0] = T1f - T1i;
ci[WS(rs, 3)] = T1f + T1i;
T1J = FMS(KP500000000, T1H, T1I);
cr[WS(rs, 5)] = T1J - T1K;
cr[WS(rs, 8)] = T1K + T1J;
}
/* Imaginary parts of X[0], X[3] and X[6] (X[6] is stored negated in cr). */
{
E T1L, T1M, T1N, T1O;
T1L = KP866025403 * (TL - Tu);
T1M = T1C + T1B;
T1N = T1w + T1x;
T1O = FNMS(KP500000000, T1N, T1M);
cr[WS(rs, 6)] = T1L - T1O;
ci[WS(rs, 8)] = T1N + T1M;
ci[WS(rs, 5)] = T1L + T1O;
}
/* X[1], X[4], X[7]: group {1,4,7} is rotated by 40 degrees
 * (0.7660, 0.6428) and group {2,5,8} by 80 degrees (0.1736, 0.9848)
 * before the final 3-point combination. */
{
E T1j, T1E, T1q, T1z, T1u, T1F, T1r, T1G;
T1j = TN + TQ;
T1E = T1A + T1D;
{
E T1m, T1p, T1s, T1t;
T1m = FMA(KP766044443, T1k, KP642787609 * T1l);
T1p = FMA(KP173648177, T1n, KP984807753 * T1o);
T1q = T1m + T1p;
T1z = KP866025403 * (T1p - T1m);
T1s = FNMS(KP642787609, T1k, KP766044443 * T1l);
T1t = FNMS(KP984807753, T1n, KP173648177 * T1o);
T1u = KP866025403 * (T1s - T1t);
T1F = T1s + T1t;
}
cr[WS(rs, 1)] = T1j + T1q;
T1r = FNMS(KP500000000, T1q, T1j);
ci[WS(rs, 1)] = T1r - T1u;
cr[WS(rs, 4)] = T1r + T1u;
ci[WS(rs, 7)] = T1F + T1E;
T1G = FNMS(KP500000000, T1F, T1E);
cr[WS(rs, 7)] = T1z - T1G;
ci[WS(rs, 4)] = T1z + T1G;
}
}
}
}
/*
 * Twiddle-table instructions referenced by the hf_9 descriptor: one TW_FULL
 * entry with arguments (1, 9) followed by one TW_NEXT entry with (1, 0).
 * The hf_9 kernel in this branch reads W[0]..W[15] per iteration, i.e. eight
 * (W[2k-2], W[2k-1]) pairs for inputs k = 1..8.
 * NOTE(review): the exact meaning of TW_FULL / TW_NEXT is defined elsewhere --
 * confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 1, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA build of hf_9: radix 9, name "hf_9", the twinstr
 * table, the GENUS object, and the count tuple { 60, 36, 36, 0 }.  The first
 * three counts equal the generator summary for this variant (60 additions,
 * 36 multiplications, 36 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count is not shown in this file.
 */
static const hc2hc_desc desc = { 9, "hf_9", twinstr, &GENUS, { 60, 36, 36, 0 } };
/*
 * Registration entry point: hands the hf_9 kernel and its descriptor to the
 * planner `p` through X(khc2hc_register).  X() is the project's name-wrapping
 * macro, so the exported symbol name depends on how X is defined.
 */
void X(codelet_hf_9) (planner *p) {
X(khc2hc_register) (p, hf_9, &desc);
}
#endif

View File

@@ -0,0 +1,194 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cfII_10 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 32 FP additions, 18 FP multiplications,
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
* 21 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_10: size-10 real-input kernel generated with -dft-II (FMA build).
 *
 * With x[2j] = R0[WS(rs, j)] and x[2j+1] = R1[WS(rs, j)] for j = 0..4, one
 * iteration writes, for k = 0..4,
 *     Cr[WS(csr, k)] + i * Ci[WS(csi, k)]
 *         = sum over n = 0..9 of x[n] * exp(-2*pi*i * n * (k + 1/2) / 10)
 *
 * Parameters:
 *   R0, R1    even- and odd-indexed real inputs, element stride rs.
 *   Cr, Ci    real and imaginary outputs, element strides csr and csi.
 *   v         number of transforms; the loop counts i from v down to 1.
 *   ivs, ovs  per-transform step applied to (R0, R1) and (Cr, Ci).
 *
 * NOTE(review): DK, E, WS and MAKE_VOLATILE_STRIDE come from project headers
 * not visible here -- their exact definitions are assumed, not shown.
 */
static void r2cfII_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
E T1, To, T8, Tt, Ta, Ts, Te, Tq, Th, Tn;
/* x[0] and x[5] are the two samples that are not paired below. */
T1 = R0[0];
To = R1[WS(rs, 2)];
/* Sums and differences of the remaining even-indexed samples. */
{
E T2, T3, T4, T5, T6, T7;
T2 = R0[WS(rs, 2)];
T3 = R0[WS(rs, 3)];
T4 = T2 - T3;
T5 = R0[WS(rs, 4)];
T6 = R0[WS(rs, 1)];
T7 = T5 - T6;
T8 = T4 + T7;
Tt = T5 + T6;
Ta = T4 - T7;
Ts = T2 + T3;
}
/* Sums and differences of the remaining odd-indexed samples. */
{
E Tc, Td, Tm, Tf, Tg, Tl;
Tc = R1[0];
Td = R1[WS(rs, 4)];
Tm = Tc + Td;
Tf = R1[WS(rs, 1)];
Tg = R1[WS(rs, 3)];
Tl = Tf + Tg;
Te = Tc - Td;
Tq = Tm + Tl;
Th = Tf - Tg;
Tn = Tl - Tm;
}
/* Output k = 2 needs only additions and subtractions. */
Cr[WS(csr, 2)] = T1 + T8;
Ci[WS(csi, 2)] = Tn - To;
/* Real outputs k = 0, 1, 3, 4. */
{
E Ti, Tk, Tb, Tj, T9;
Ti = FMA(KP618033988, Th, Te);
Tk = FNMS(KP618033988, Te, Th);
T9 = FNMS(KP250000000, T8, T1);
Tb = FMA(KP559016994, Ta, T9);
Tj = FNMS(KP559016994, Ta, T9);
Cr[WS(csr, 4)] = FNMS(KP951056516, Ti, Tb);
Cr[WS(csr, 3)] = FMA(KP951056516, Tk, Tj);
Cr[0] = FMA(KP951056516, Ti, Tb);
Cr[WS(csr, 1)] = FNMS(KP951056516, Tk, Tj);
}
/* Imaginary outputs k = 0, 1, 3, 4. */
{
E Tu, Tw, Tr, Tv, Tp;
Tu = FMA(KP618033988, Tt, Ts);
Tw = FNMS(KP618033988, Ts, Tt);
Tp = FMA(KP250000000, Tn, To);
Tr = FMA(KP559016994, Tq, Tp);
Tv = FNMS(KP559016994, Tq, Tp);
Ci[0] = -(FMA(KP951056516, Tu, Tr));
Ci[WS(csi, 3)] = FMA(KP951056516, Tw, Tv);
Ci[WS(csi, 4)] = FMS(KP951056516, Tu, Tr);
Ci[WS(csi, 1)] = FNMS(KP951056516, Tw, Tv);
}
}
}
}
/*
 * Descriptor for the FMA build of r2cfII_10: size 10, name "r2cfII_10", the
 * count tuple { 14, 0, 18, 0 } and the GENUS object.  The first three counts
 * equal the generator summary for this variant (14 additions,
 * 0 multiplications, 18 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count is not shown in this file.
 */
static const kr2c_desc desc = { 10, "r2cfII_10", { 14, 0, 18, 0 }, &GENUS };
/*
 * Registration entry point: hands the r2cfII_10 kernel and its descriptor to
 * the planner `p` through X(kr2c_register).
 */
void X(codelet_r2cfII_10) (planner *p) { X(kr2c_register) (p, r2cfII_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cfII_10 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 32 FP additions, 12 FP multiplications,
* (or, 26 additions, 6 multiplications, 6 fused multiply/add),
* 21 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_10: size-10 real-input kernel generated with -dft-II (non-FMA build).
 *
 * With x[2j] = R0[WS(rs, j)] and x[2j+1] = R1[WS(rs, j)] for j = 0..4, one
 * iteration writes, for k = 0..4,
 *     Cr[WS(csr, k)] + i * Ci[WS(csi, k)]
 *         = sum over n = 0..9 of x[n] * exp(-2*pi*i * n * (k + 1/2) / 10)
 *
 * Parameters:
 *   R0, R1    even- and odd-indexed real inputs, element stride rs.
 *   Cr, Ci    real and imaginary outputs, element strides csr and csi.
 *   v         number of transforms; the loop counts i from v down to 1.
 *   ivs, ovs  per-transform step applied to (R0, R1) and (Cr, Ci).
 *
 * NOTE(review): DK, E, WS and MAKE_VOLATILE_STRIDE come from project headers
 * not visible here -- their exact definitions are assumed, not shown.
 */
static void r2cfII_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
E T1, To, T8, Tq, T9, Tp, Te, Ts, Th, Tn;
/* x[0] and x[5] are the two samples that are not paired below. */
T1 = R0[0];
To = R1[WS(rs, 2)];
/* Sums and differences of the remaining even-indexed samples; T9 is
 * already scaled by 0.559. */
{
E T2, T3, T4, T5, T6, T7;
T2 = R0[WS(rs, 2)];
T3 = R0[WS(rs, 3)];
T4 = T2 - T3;
T5 = R0[WS(rs, 4)];
T6 = R0[WS(rs, 1)];
T7 = T5 - T6;
T8 = T4 + T7;
Tq = T5 + T6;
T9 = KP559016994 * (T4 - T7);
Tp = T2 + T3;
}
/* Sums and differences of the remaining odd-indexed samples; Ts is
 * already scaled by 0.559. */
{
E Tc, Td, Tm, Tf, Tg, Tl;
Tc = R1[0];
Td = R1[WS(rs, 4)];
Tm = Tc + Td;
Tf = R1[WS(rs, 1)];
Tg = R1[WS(rs, 3)];
Tl = Tf + Tg;
Te = Tc - Td;
Ts = KP559016994 * (Tm + Tl);
Th = Tf - Tg;
Tn = Tl - Tm;
}
/* Output k = 2 needs only additions and subtractions. */
Cr[WS(csr, 2)] = T1 + T8;
Ci[WS(csi, 2)] = Tn - To;
/* Real outputs k = 0, 1, 3, 4. */
{
E Ti, Tk, Tb, Tj, Ta;
Ti = FMA(KP951056516, Te, KP587785252 * Th);
Tk = FNMS(KP587785252, Te, KP951056516 * Th);
Ta = FNMS(KP250000000, T8, T1);
Tb = T9 + Ta;
Tj = Ta - T9;
Cr[WS(csr, 4)] = Tb - Ti;
Cr[WS(csr, 3)] = Tj + Tk;
Cr[0] = Tb + Ti;
Cr[WS(csr, 1)] = Tj - Tk;
}
/* Imaginary outputs k = 0, 1, 3, 4. */
{
E Tr, Tw, Tu, Tv, Tt;
Tr = FMA(KP951056516, Tp, KP587785252 * Tq);
Tw = FNMS(KP587785252, Tp, KP951056516 * Tq);
Tt = FMA(KP250000000, Tn, To);
Tu = Ts + Tt;
Tv = Tt - Ts;
Ci[0] = -(Tr + Tu);
Ci[WS(csi, 3)] = Tw + Tv;
Ci[WS(csi, 4)] = Tr - Tu;
Ci[WS(csi, 1)] = Tv - Tw;
}
}
}
}
/*
 * Descriptor for the non-FMA build of r2cfII_10: size 10, name "r2cfII_10",
 * the count tuple { 26, 6, 6, 0 } and the GENUS object.  The first three
 * counts equal the generator summary for this variant (26 additions,
 * 6 multiplications, 6 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count is not shown in this file.
 */
static const kr2c_desc desc = { 10, "r2cfII_10", { 26, 6, 6, 0 }, &GENUS };
/*
 * Registration entry point: hands the r2cfII_10 kernel and its descriptor to
 * the planner `p` through X(kr2c_register).
 */
void X(codelet_r2cfII_10) (planner *p) { X(kr2c_register) (p, r2cfII_10, &desc);
}
#endif

View File

@@ -0,0 +1,225 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cfII_12 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 45 FP additions, 24 FP multiplications,
* (or, 21 additions, 0 multiplications, 24 fused multiply/add),
* 28 stack variables, 3 constants, and 24 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_12: size-12 real-input kernel generated with -dft-II (FMA build).
 *
 * With x[2j] = R0[WS(rs, j)] and x[2j+1] = R1[WS(rs, j)] for j = 0..5, one
 * iteration writes, for k = 0..5,
 *     Cr[WS(csr, k)] + i * Ci[WS(csi, k)]
 *         = sum over n = 0..11 of x[n] * exp(-2*pi*i * n * (k + 1/2) / 12)
 *
 * Parameters:
 *   R0, R1    even- and odd-indexed real inputs, element stride rs.
 *   Cr, Ci    real and imaginary outputs, element strides csr and csi.
 *   v         number of transforms; the loop counts i from v down to 1.
 *   ivs, ovs  per-transform step applied to (R0, R1) and (Cr, Ci).
 *
 * NOTE(review): DK, E, WS and MAKE_VOLATILE_STRIDE come from project headers
 * not visible here -- their exact definitions are assumed, not shown.
 */
static void r2cfII_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
E Tx, Ty, T8, Tz, Tl, Tm, Tv, T5, TA, Tt, Te, Tf, Tu, T6, T7;
E Tw, TF, TG;
/* Samples x[6], x[10], x[2]. */
Tx = R0[WS(rs, 3)];
T6 = R0[WS(rs, 5)];
T7 = R0[WS(rs, 1)];
Ty = T6 + T7;
T8 = T6 - T7;
Tz = FMA(KP500000000, Ty, Tx);
/* Samples x[9], x[5], x[1]. */
{
E Th, Ti, Tj, Tk;
Th = R1[WS(rs, 4)];
Ti = R1[WS(rs, 2)];
Tj = R1[0];
Tk = Ti - Tj;
Tl = FMA(KP500000000, Tk, Th);
Tm = Ti + Tj;
Tv = Ti - Tj - Th;
}
/* Samples x[0], x[4], x[8]. */
{
E T1, T2, T3, T4;
T1 = R0[0];
T2 = R0[WS(rs, 2)];
T3 = R0[WS(rs, 4)];
T4 = T2 - T3;
T5 = FMA(KP500000000, T4, T1);
TA = T3 + T2;
Tt = T1 + T3 - T2;
}
/* Samples x[3], x[7], x[11]. */
{
E Ta, Tb, Tc, Td;
Ta = R1[WS(rs, 1)];
Tb = R1[WS(rs, 3)];
Tc = R1[WS(rs, 5)];
Td = Tb - Tc;
Te = FMA(KP500000000, Td, Ta);
Tf = Tc + Tb;
Tu = Ta + Tc - Tb;
}
/* Outputs k = 1 and k = 4 use only the +/-1 and sqrt(1/2) combinations. */
Tw = Tu + Tv;
Cr[WS(csr, 1)] = FNMS(KP707106781, Tw, Tt);
Cr[WS(csr, 4)] = FMA(KP707106781, Tw, Tt);
TF = Tx - Ty;
TG = Tv - Tu;
Ci[WS(csi, 4)] = FMS(KP707106781, TG, TF);
Ci[WS(csi, 1)] = FMA(KP707106781, TG, TF);
/* Outputs Cr[5], Cr[0], Ci[3], Ci[2]: the "minus sqrt(3)/2" branch. */
{
E T9, TD, To, TE, Tg, Tn;
T9 = FNMS(KP866025403, T8, T5);
TD = FNMS(KP866025403, TA, Tz);
Tg = FNMS(KP866025403, Tf, Te);
Tn = FNMS(KP866025403, Tm, Tl);
To = Tg - Tn;
TE = Tg + Tn;
Cr[WS(csr, 5)] = FNMS(KP707106781, To, T9);
Ci[WS(csi, 3)] = FMA(KP707106781, TE, TD);
Cr[0] = FMA(KP707106781, To, T9);
Ci[WS(csi, 2)] = FMS(KP707106781, TE, TD);
}
/* Outputs Cr[3], Cr[2], Ci[5], Ci[0]: the "plus sqrt(3)/2" branch. */
{
E Tp, TB, Ts, TC, Tq, Tr;
Tp = FMA(KP866025403, T8, T5);
TB = FMA(KP866025403, TA, Tz);
Tq = FMA(KP866025403, Tm, Tl);
Tr = FMA(KP866025403, Tf, Te);
Ts = Tq - Tr;
TC = Tr + Tq;
Cr[WS(csr, 3)] = FNMS(KP707106781, Ts, Tp);
Ci[WS(csi, 5)] = FNMS(KP707106781, TC, TB);
Cr[WS(csr, 2)] = FMA(KP707106781, Ts, Tp);
Ci[0] = -(FMA(KP707106781, TC, TB));
}
}
}
}
/*
 * Descriptor for the FMA build of r2cfII_12: size 12, name "r2cfII_12", the
 * count tuple { 21, 0, 24, 0 } and the GENUS object.  The first three counts
 * equal the generator summary for this variant (21 additions,
 * 0 multiplications, 24 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count is not shown in this file.
 */
static const kr2c_desc desc = { 12, "r2cfII_12", { 21, 0, 24, 0 }, &GENUS };
/*
 * Registration entry point: hands the r2cfII_12 kernel and its descriptor to
 * the planner `p` through X(kr2c_register).
 */
void X(codelet_r2cfII_12) (planner *p) { X(kr2c_register) (p, r2cfII_12, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cfII_12 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 43 FP additions, 12 FP multiplications,
* (or, 39 additions, 8 multiplications, 4 fused multiply/add),
* 28 stack variables, 5 constants, and 24 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_12: size-12 real-input kernel generated with -dft-II (non-FMA build).
 *
 * With x[2j] = R0[WS(rs, j)] and x[2j+1] = R1[WS(rs, j)] for j = 0..5, one
 * iteration writes, for k = 0..5,
 *     Cr[WS(csr, k)] + i * Ci[WS(csi, k)]
 *         = sum over n = 0..11 of x[n] * exp(-2*pi*i * n * (k + 1/2) / 12)
 *
 * Compared with the FMA build, the sqrt(1/2) factor is folded into the
 * constants up front: 0.6124 = 0.8660 * 0.7071 and 0.3536 = 0.5 * 0.7071.
 *
 * Parameters:
 *   R0, R1    even- and odd-indexed real inputs, element stride rs.
 *   Cr, Ci    real and imaginary outputs, element strides csr and csi.
 *   v         number of transforms; the loop counts i from v down to 1.
 *   ivs, ovs  per-transform step applied to (R0, R1) and (Cr, Ci).
 *
 * NOTE(review): DK, E, WS and MAKE_VOLATILE_STRIDE come from project headers
 * not visible here -- their exact definitions are assumed, not shown.
 */
static void r2cfII_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP612372435, +0.612372435695794524549321018676472847991486870);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
E Tx, Tg, T4, Tz, Ty, Tj, TA, T9, Tm, Tl, Te, Tp, To, Tf, TE;
E TF;
/* Even-indexed samples: x[0], x[4], x[8], then x[6], x[10], x[2]. */
{
E T1, T3, T2, Th, Ti;
T1 = R0[0];
T3 = R0[WS(rs, 2)];
T2 = R0[WS(rs, 4)];
Tx = KP866025403 * (T2 + T3);
Tg = FMA(KP500000000, T3 - T2, T1);
T4 = T1 + T2 - T3;
Tz = R0[WS(rs, 3)];
Th = R0[WS(rs, 5)];
Ti = R0[WS(rs, 1)];
Ty = Th + Ti;
Tj = KP866025403 * (Th - Ti);
TA = FMA(KP500000000, Ty, Tz);
}
/* Samples x[3], x[11], x[7]. */
{
E T5, T6, T7, T8;
T5 = R1[WS(rs, 1)];
T6 = R1[WS(rs, 5)];
T7 = R1[WS(rs, 3)];
T8 = T6 - T7;
T9 = T5 + T8;
Tm = KP612372435 * (T6 + T7);
Tl = FNMS(KP353553390, T8, KP707106781 * T5);
}
/* Samples x[9], x[5], x[1]. */
{
E Td, Ta, Tb, Tc;
Td = R1[WS(rs, 4)];
Ta = R1[WS(rs, 2)];
Tb = R1[0];
Tc = Ta - Tb;
Te = Tc - Td;
Tp = FMA(KP353553390, Tc, KP707106781 * Td);
To = KP612372435 * (Ta + Tb);
}
/* Outputs k = 1 and k = 4. */
Tf = KP707106781 * (T9 + Te);
Cr[WS(csr, 1)] = T4 - Tf;
Cr[WS(csr, 4)] = T4 + Tf;
TE = KP707106781 * (Te - T9);
TF = Tz - Ty;
Ci[WS(csi, 4)] = TE - TF;
Ci[WS(csi, 1)] = TE + TF;
/* Outputs Cr[5], Cr[0], Ci[2], Ci[3]. */
{
E Tk, TB, Tr, Tw, Tn, Tq;
Tk = Tg - Tj;
TB = Tx - TA;
Tn = Tl - Tm;
Tq = To - Tp;
Tr = Tn + Tq;
Tw = Tn - Tq;
Cr[WS(csr, 5)] = Tk - Tr;
Ci[WS(csi, 2)] = Tw + TB;
Cr[0] = Tk + Tr;
Ci[WS(csi, 3)] = Tw - TB;
}
/* Outputs Cr[3], Cr[2], Ci[5], Ci[0]. */
{
E Ts, TD, Tv, TC, Tt, Tu;
Ts = Tg + Tj;
TD = Tx + TA;
Tt = To + Tp;
Tu = Tm + Tl;
Tv = Tt - Tu;
TC = Tu + Tt;
Cr[WS(csr, 3)] = Ts - Tv;
Ci[WS(csi, 5)] = TD - TC;
Cr[WS(csr, 2)] = Ts + Tv;
Ci[0] = -(TC + TD);
}
}
}
}
/*
 * Descriptor for the non-FMA build of r2cfII_12: size 12, name "r2cfII_12",
 * the count tuple { 39, 8, 4, 0 } and the GENUS object.  The first three
 * counts equal the generator summary for this variant (39 additions,
 * 8 multiplications, 4 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count is not shown in this file.
 */
static const kr2c_desc desc = { 12, "r2cfII_12", { 39, 8, 4, 0 }, &GENUS };
/*
 * Registration entry point: hands the r2cfII_12 kernel and its descriptor to
 * the planner `p` through X(kr2c_register).
 */
void X(codelet_r2cfII_12) (planner *p) { X(kr2c_register) (p, r2cfII_12, &desc);
}
#endif

View File

@@ -0,0 +1,297 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cfII_15 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 72 FP additions, 41 FP multiplications,
* (or, 38 additions, 7 multiplications, 34 fused multiply/add),
* 42 stack variables, 12 constants, and 30 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_15: size-15 real-input kernel generated with -dft-II (FMA build).
 *
 * With x[2j] = R0[WS(rs, j)] for j = 0..7 and x[2j+1] = R1[WS(rs, j)] for
 * j = 0..6, one iteration writes
 *     Cr[WS(csr, k)] + i * Ci[WS(csi, k)]
 *         = sum over n = 0..14 of x[n] * exp(-2*pi*i * n * (k + 1/2) / 15)
 * for k = 0..6, plus the purely real output Cr[WS(csr, 7)] (the alternating
 * sum of the inputs); no Ci element with index 7 is written.
 *
 * Parameters:
 *   R0, R1    even- and odd-indexed real inputs, element stride rs.
 *   Cr, Ci    real and imaginary outputs, element strides csr and csi.
 *   v         number of transforms; the loop counts i from v down to 1.
 *   ivs, ovs  per-transform step applied to (R0, R1) and (Cr, Ci).
 *
 * NOTE(review): DK, E, WS and MAKE_VOLATILE_STRIDE come from project headers
 * not visible here -- their exact definitions are assumed, not shown.
 */
static void r2cfII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP823639103, +0.823639103546331925877420039278190003029660514);
DK(KP910592997, +0.910592997310029334643087372129977886038870291);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP690983005, +0.690983005625052575897706582817180941139845410);
DK(KP447213595, +0.447213595499957939281834733746255247088123672);
DK(KP552786404, +0.552786404500042060718165266253744752911876328);
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
E Ta, Tl, T1, T6, T7, TX, TT, T8, Tg, Th, TM, TZ, Tj, Tz, Tr;
E Ts, TP, TY, Tu, TC;
/* x[10] and x[5]. */
Ta = R0[WS(rs, 5)];
Tl = R1[WS(rs, 2)];
/* Samples whose index is a multiple of 3: x[0], x[6], x[9], x[12], x[3]. */
{
E T2, T5, T3, T4, TR, TS;
T1 = R0[0];
T2 = R0[WS(rs, 3)];
T5 = R1[WS(rs, 4)];
T3 = R0[WS(rs, 6)];
T4 = R1[WS(rs, 1)];
TR = T2 + T5;
TS = T3 + T4;
T6 = T2 + T3 - T4 - T5;
T7 = FNMS(KP250000000, T6, T1);
TX = FNMS(KP618033988, TR, TS);
TT = FMA(KP618033988, TS, TR);
T8 = (T3 + T5 - T2) - T4;
}
/* Samples with index = 1 (mod 3): x[1], x[4], x[7], x[13]. */
{
E Tf, TL, TK, Ti, Ty;
{
E Tb, Tc, Td, Te;
Tb = R1[0];
Tg = R0[WS(rs, 2)];
Tc = R1[WS(rs, 3)];
Td = R1[WS(rs, 6)];
Te = Tc + Td;
Tf = Tb - Te;
TL = Tc - Td;
Th = Tb + Te;
TK = Tg + Tb;
}
TM = FMA(KP618033988, TL, TK);
TZ = FNMS(KP618033988, TK, TL);
Ti = FMA(KP809016994, Th, Tg);
Tj = FNMS(KP552786404, Ti, Tf);
Ty = FMA(KP447213595, Th, Tf);
Tz = FNMS(KP690983005, Ty, Tg);
}
/* Samples with index = 2 (mod 3): x[14], x[11], x[2], x[8]. */
{
E Tq, TO, TN, Tt, TB;
{
E Tm, Tn, To, Tp;
Tm = R0[WS(rs, 7)];
Tr = R1[WS(rs, 5)];
Tn = R0[WS(rs, 1)];
To = R0[WS(rs, 4)];
Tp = Tn + To;
Tq = Tm - Tp;
TO = To - Tn;
Ts = Tm + Tp;
TN = Tr + Tm;
}
TP = FMA(KP618033988, TO, TN);
TY = FNMS(KP618033988, TN, TO);
Tt = FMA(KP809016994, Ts, Tr);
Tu = FNMS(KP552786404, Tt, Tq);
TB = FMA(KP447213595, Ts, Tq);
TC = FNMS(KP690983005, TB, Tr);
}
/* Outputs k = 2 and k = 7. */
{
E TF, TG, TH, TI;
TF = T1 + T6;
TG = Ts - Tr - Tl;
TH = Ta + Tg - Th;
TI = TG + TH;
Cr[WS(csr, 2)] = FNMS(KP500000000, TI, TF);
Ci[WS(csi, 2)] = KP866025403 * (TH - TG);
Cr[WS(csr, 7)] = TF + TI;
}
/* Outputs Cr[1], Cr[3], Cr[6] and Ci[1], Ci[3], Ci[6]. */
{
E Tx, T14, T10, T11, TE, T12, TA, TD, T13;
Tx = FMA(KP559016994, T8, T7);
T14 = TZ - TY;
T10 = TY + TZ;
T11 = FMA(KP500000000, T10, TX);
TA = FNMS(KP809016994, Tz, Ta);
TD = FNMS(KP809016994, TC, Tl);
TE = TA - TD;
T12 = TD + TA;
Cr[WS(csr, 1)] = Tx + TE;
Ci[WS(csi, 1)] = KP951056516 * (T10 - TX);
Ci[WS(csi, 3)] = KP951056516 * (FNMS(KP910592997, T12, T11));
Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP910592997, T12, T11)));
T13 = FNMS(KP500000000, TE, Tx);
Cr[WS(csr, 3)] = FNMS(KP823639103, T14, T13);
Cr[WS(csr, 6)] = FMA(KP823639103, T14, T13);
}
/* Outputs Cr[4], Cr[5], Cr[0] and Ci[4], Ci[0], Ci[5]. */
{
E T9, TQ, TU, TV, Tw, TW, Tk, Tv, TJ;
T9 = FNMS(KP559016994, T8, T7);
TQ = TM - TP;
TU = TP + TM;
TV = FMA(KP500000000, TU, TT);
Tk = FNMS(KP559016994, Tj, Ta);
Tv = FNMS(KP559016994, Tu, Tl);
Tw = Tk - Tv;
TW = Tv + Tk;
Cr[WS(csr, 4)] = T9 + Tw;
Ci[WS(csi, 4)] = KP951056516 * (TT - TU);
Ci[0] = -(KP951056516 * (FMA(KP910592997, TW, TV)));
Ci[WS(csi, 5)] = -(KP951056516 * (FNMS(KP910592997, TW, TV)));
TJ = FNMS(KP500000000, Tw, T9);
Cr[WS(csr, 5)] = FNMS(KP823639103, TQ, TJ);
Cr[0] = FMA(KP823639103, TQ, TJ);
}
}
}
}
/*
 * Descriptor for the FMA build of r2cfII_15: size 15, name "r2cfII_15", the
 * count tuple { 38, 7, 34, 0 } and the GENUS object.  The first three counts
 * equal the generator summary for this variant (38 additions,
 * 7 multiplications, 34 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count is not shown in this file.
 */
static const kr2c_desc desc = { 15, "r2cfII_15", { 38, 7, 34, 0 }, &GENUS };
/*
 * Registration entry point: hands the r2cfII_15 kernel and its descriptor to
 * the planner `p` through X(kr2c_register).
 */
void X(codelet_r2cfII_15) (planner *p) { X(kr2c_register) (p, r2cfII_15, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cfII_15 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 72 FP additions, 33 FP multiplications,
* (or, 54 additions, 15 multiplications, 18 fused multiply/add),
* 37 stack variables, 8 constants, and 30 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_15: size-15 real-input kernel generated with -dft-II (non-FMA build).
 *
 * With x[2j] = R0[WS(rs, j)] for j = 0..7 and x[2j+1] = R1[WS(rs, j)] for
 * j = 0..6, one iteration writes
 *     Cr[WS(csr, k)] + i * Ci[WS(csi, k)]
 *         = sum over n = 0..14 of x[n] * exp(-2*pi*i * n * (k + 1/2) / 15)
 * for k = 0..6, plus the purely real output Cr[WS(csr, 7)] (the alternating
 * sum of the inputs); no Ci element with index 7 is written.
 *
 * Parameters:
 *   R0, R1    even- and odd-indexed real inputs, element stride rs.
 *   Cr, Ci    real and imaginary outputs, element strides csr and csi.
 *   v         number of transforms; the loop counts i from v down to 1.
 *   ivs, ovs  per-transform step applied to (R0, R1) and (Cr, Ci).
 *
 * NOTE(review): DK, E, WS, FNMA and MAKE_VOLATILE_STRIDE come from project
 * headers not visible here.  The algebra below only matches the FMA build if
 * FNMA(a, b, c) evaluates to -(a * b + c) -- confirm against the macro
 * definition.
 */
static void r2cfII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
DK(KP309016994, +0.309016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
E T1, T2, Tx, TR, TE, T7, TD, Th, Tm, Tr, TQ, TA, TB, Tf, Te;
E Tu, TS, Td, TH, TO;
/* x[10]. */
T1 = R0[WS(rs, 5)];
/* Samples with index = 1 (mod 3): x[4], x[1], x[7], x[13]. */
{
E T3, Tv, T6, Tw, T4, T5;
T2 = R0[WS(rs, 2)];
T3 = R1[0];
Tv = T2 + T3;
T4 = R1[WS(rs, 3)];
T5 = R1[WS(rs, 6)];
T6 = T4 + T5;
Tw = T4 - T5;
Tx = FMA(KP951056516, Tv, KP587785252 * Tw);
TR = FNMS(KP587785252, Tv, KP951056516 * Tw);
TE = KP559016994 * (T3 - T6);
T7 = T3 + T6;
TD = KP250000000 * T7;
}
/* Samples whose index is a multiple of 3: x[0], x[9], x[12], x[3], x[6]. */
{
E Ti, Tl, Tj, Tk, Tp, Tq;
Th = R0[0];
Ti = R1[WS(rs, 4)];
Tl = R0[WS(rs, 6)];
Tj = R1[WS(rs, 1)];
Tk = R0[WS(rs, 3)];
Tp = Tk + Ti;
Tq = Tl + Tj;
Tm = Ti + Tj - (Tk + Tl);
Tr = FMA(KP951056516, Tp, KP587785252 * Tq);
TQ = FNMS(KP951056516, Tq, KP587785252 * Tp);
TA = FMA(KP250000000, Tm, Th);
TB = KP559016994 * (Tl + Ti - (Tk + Tj));
}
/* Samples with index = 2 (mod 3): x[5], x[14], x[11], x[2], x[8]. */
{
E T9, Tt, Tc, Ts, Ta, Tb, TG;
Tf = R1[WS(rs, 2)];
T9 = R0[WS(rs, 7)];
Te = R1[WS(rs, 5)];
Tt = T9 + Te;
Ta = R0[WS(rs, 1)];
Tb = R0[WS(rs, 4)];
Tc = Ta + Tb;
Ts = Ta - Tb;
Tu = FNMS(KP951056516, Tt, KP587785252 * Ts);
TS = FMA(KP951056516, Ts, KP587785252 * Tt);
Td = T9 + Tc;
TG = KP559016994 * (T9 - Tc);
TH = FNMS(KP309016994, Te, TG) + FNMA(KP250000000, Td, Tf);
TO = FMS(KP809016994, Te, Tf) + FNMA(KP250000000, Td, TG);
}
/* Outputs k = 2 and k = 7. */
{
E Tn, T8, Tg, To;
Tn = Th - Tm;
T8 = T1 + T2 - T7;
Tg = Td - Te - Tf;
To = T8 + Tg;
Ci[WS(csi, 2)] = KP866025403 * (T8 - Tg);
Cr[WS(csr, 2)] = FNMS(KP500000000, To, Tn);
Cr[WS(csr, 7)] = Tn + To;
}
/* Outputs Cr[1], Cr[3], Cr[6] and Ci[1], Ci[3], Ci[6]. */
{
E TM, TX, TT, TV, TP, TU, TN, TW;
TM = TB + TA;
TX = KP866025403 * (TR + TS);
TT = TR - TS;
TV = FMS(KP500000000, TT, TQ);
TN = T1 + TE + FNMS(KP809016994, T2, TD);
TP = TN + TO;
TU = KP866025403 * (TO - TN);
Cr[WS(csr, 1)] = TM + TP;
Ci[WS(csi, 1)] = TQ + TT;
Ci[WS(csi, 6)] = TU - TV;
Ci[WS(csi, 3)] = TU + TV;
TW = FNMS(KP500000000, TP, TM);
Cr[WS(csr, 3)] = TW - TX;
Cr[WS(csr, 6)] = TW + TX;
}
/* Outputs Cr[4], Cr[0], Cr[5] and Ci[4], Ci[5], Ci[0]. */
{
E Tz, TC, Ty, TK, TI, TL, TF, TJ;
Tz = KP866025403 * (Tx + Tu);
TC = TA - TB;
Ty = Tu - Tx;
TK = FMS(KP500000000, Ty, Tr);
TF = FMA(KP309016994, T2, T1) + TD - TE;
TI = TF + TH;
TL = KP866025403 * (TH - TF);
Ci[WS(csi, 4)] = Tr + Ty;
Cr[WS(csr, 4)] = TC + TI;
Ci[WS(csi, 5)] = TK - TL;
Ci[0] = TK + TL;
TJ = FNMS(KP500000000, TI, TC);
Cr[0] = Tz + TJ;
Cr[WS(csr, 5)] = TJ - Tz;
}
}
}
}
/*
 * Descriptor for the non-FMA build of r2cfII_15: size 15, name "r2cfII_15",
 * the count tuple { 54, 15, 18, 0 } and the GENUS object.  The first three
 * counts equal the generator summary for this variant (54 additions,
 * 15 multiplications, 18 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count is not shown in this file.
 */
static const kr2c_desc desc = { 15, "r2cfII_15", { 54, 15, 18, 0 }, &GENUS };
/*
 * Registration entry point: hands the r2cfII_15 kernel and its descriptor to
 * the planner `p` through X(kr2c_register).
 */
void X(codelet_r2cfII_15) (planner *p) { X(kr2c_register) (p, r2cfII_15, &desc);
}
#endif

View File

@@ -0,0 +1,312 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 66 FP additions, 48 FP multiplications,
* (or, 18 additions, 0 multiplications, 48 fused multiply/add),
* 32 stack variables, 7 constants, and 32 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_16: size-16 real-input kernel generated with -dft-II (FMA build).
 *
 * With x[2j] = R0[WS(rs, j)] and x[2j+1] = R1[WS(rs, j)] for j = 0..7, one
 * iteration writes, for k = 0..7,
 *     Cr[WS(csr, k)] + i * Ci[WS(csi, k)]
 *         = sum over n = 0..15 of x[n] * exp(-2*pi*i * n * (k + 1/2) / 16)
 *
 * The constants 0.4142, 0.6682 and 0.1989 are tangents (of 22.5, 33.75 and
 * 11.25 degrees); each is paired with a later multiplication by the matching
 * cosine (0.9239, 0.8315, 0.9808) so that every rotation is two FMAs.
 *
 * Parameters:
 *   R0, R1    even- and odd-indexed real inputs, element stride rs.
 *   Cr, Ci    real and imaginary outputs, element strides csr and csi.
 *   v         number of transforms; the loop counts i from v down to 1.
 *   ivs, ovs  per-transform step applied to (R0, R1) and (Cr, Ci).
 *
 * NOTE(review): DK, E, WS and MAKE_VOLATILE_STRIDE come from project headers
 * not visible here -- their exact definitions are assumed, not shown.
 */
static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
DK(KP198912367, +0.198912367379658006911597622644676228597850501);
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
DK(KP668178637, +0.668178637919298919997757686523080761552472251);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
E T5, TZ, TB, TT, Tr, TK, Tu, TJ, Ti, TH, Tl, TG, Tc, T10, TE;
E TU;
/* Samples x[0], x[8], x[4], x[12]. */
{
E T1, TR, T4, TS, T2, T3;
T1 = R0[0];
TR = R0[WS(rs, 4)];
T2 = R0[WS(rs, 2)];
T3 = R0[WS(rs, 6)];
T4 = T2 - T3;
TS = T2 + T3;
T5 = FNMS(KP707106781, T4, T1);
TZ = FNMS(KP707106781, TS, TR);
TB = FMA(KP707106781, T4, T1);
TT = FMA(KP707106781, TS, TR);
}
/* Samples x[15], x[7], x[3], x[11]. */
{
E Tn, Ts, Tq, Tt, To, Tp;
Tn = R1[WS(rs, 7)];
Ts = R1[WS(rs, 3)];
To = R1[WS(rs, 1)];
Tp = R1[WS(rs, 5)];
Tq = To - Tp;
Tt = To + Tp;
Tr = FMA(KP707106781, Tq, Tn);
TK = FMA(KP707106781, Tt, Ts);
Tu = FNMS(KP707106781, Tt, Ts);
TJ = FMS(KP707106781, Tq, Tn);
}
/* Samples x[1], x[9], x[5], x[13]. */
{
E Te, Tj, Th, Tk, Tf, Tg;
Te = R1[0];
Tj = R1[WS(rs, 4)];
Tf = R1[WS(rs, 2)];
Tg = R1[WS(rs, 6)];
Th = Tf - Tg;
Tk = Tf + Tg;
Ti = FNMS(KP707106781, Th, Te);
TH = FMA(KP707106781, Tk, Tj);
Tl = FNMS(KP707106781, Tk, Tj);
TG = FMA(KP707106781, Th, Te);
}
/* Samples x[10], x[2], x[6], x[14]. */
{
E T8, TC, Tb, TD;
{
E T6, T7, T9, Ta;
T6 = R0[WS(rs, 5)];
T7 = R0[WS(rs, 1)];
T8 = FMA(KP414213562, T7, T6);
TC = FNMS(KP414213562, T6, T7);
T9 = R0[WS(rs, 3)];
Ta = R0[WS(rs, 7)];
Tb = FMA(KP414213562, Ta, T9);
TD = FMS(KP414213562, T9, Ta);
}
Tc = T8 - Tb;
T10 = TD - TC;
TE = TC + TD;
TU = T8 + Tb;
}
/* Outputs Cr[6], Cr[1], Ci[5], Ci[2]. */
{
E Td, T13, Tw, T14, Tm, Tv;
Td = FMA(KP923879532, Tc, T5);
T13 = FNMS(KP923879532, T10, TZ);
Tm = FMA(KP668178637, Tl, Ti);
Tv = FMA(KP668178637, Tu, Tr);
Tw = Tm - Tv;
T14 = Tm + Tv;
Cr[WS(csr, 6)] = FNMS(KP831469612, Tw, Td);
Ci[WS(csi, 5)] = FNMS(KP831469612, T14, T13);
Cr[WS(csr, 1)] = FMA(KP831469612, Tw, Td);
Ci[WS(csi, 2)] = -(FMA(KP831469612, T14, T13));
}
/* Outputs Cr[5], Cr[2], Ci[1], Ci[6]. */
{
E Tx, T11, TA, T12, Ty, Tz;
Tx = FNMS(KP923879532, Tc, T5);
T11 = FMA(KP923879532, T10, TZ);
Ty = FNMS(KP668178637, Tr, Tu);
Tz = FNMS(KP668178637, Ti, Tl);
TA = Ty - Tz;
T12 = Tz + Ty;
Cr[WS(csr, 5)] = FNMS(KP831469612, TA, Tx);
Ci[WS(csi, 1)] = FMA(KP831469612, T12, T11);
Cr[WS(csr, 2)] = FMA(KP831469612, TA, Tx);
Ci[WS(csi, 6)] = FMS(KP831469612, T12, T11);
}
/* Outputs Cr[7], Cr[0], Ci[3], Ci[4]. */
{
E TF, TX, TM, TY, TI, TL;
TF = FMA(KP923879532, TE, TB);
TX = FNMS(KP923879532, TU, TT);
TI = FNMS(KP198912367, TH, TG);
TL = FMA(KP198912367, TK, TJ);
TM = TI + TL;
TY = TL - TI;
Cr[WS(csr, 7)] = FNMS(KP980785280, TM, TF);
Ci[WS(csi, 3)] = FMA(KP980785280, TY, TX);
Cr[0] = FMA(KP980785280, TM, TF);
Ci[WS(csi, 4)] = FMS(KP980785280, TY, TX);
}
/* Outputs Cr[4], Cr[3], Ci[7], Ci[0]. */
{
E TN, TV, TQ, TW, TO, TP;
TN = FNMS(KP923879532, TE, TB);
TV = FMA(KP923879532, TU, TT);
TO = FMA(KP198912367, TG, TH);
TP = FNMS(KP198912367, TJ, TK);
TQ = TO - TP;
TW = TO + TP;
Cr[WS(csr, 4)] = FNMS(KP980785280, TQ, TN);
Ci[WS(csi, 7)] = FNMS(KP980785280, TW, TV);
Cr[WS(csr, 3)] = FMA(KP980785280, TQ, TN);
Ci[0] = -(FMA(KP980785280, TW, TV));
}
}
}
}
/*
 * Descriptor for the FMA build of r2cfII_16: size 16, name "r2cfII_16", the
 * count tuple { 18, 0, 48, 0 } and the GENUS object.  The first three counts
 * equal the generator summary for this variant (18 additions,
 * 0 multiplications, 48 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count is not shown in this file.
 */
static const kr2c_desc desc = { 16, "r2cfII_16", { 18, 0, 48, 0 }, &GENUS };
/*
 * Registration entry point: hands the r2cfII_16 kernel and its descriptor to
 * the planner `p` through X(kr2c_register).
 */
void X(codelet_r2cfII_16) (planner *p) { X(kr2c_register) (p, r2cfII_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 66 FP additions, 30 FP multiplications,
* (or, 54 additions, 18 multiplications, 12 fused multiply/add),
* 32 stack variables, 7 constants, and 32 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_16: size-16 codelet emitted by genfft (-n 16 -dft-II, non-FMA
 * variant).
 *
 * Every pass of the loop reads eight values from R0 and eight from R1
 * (element 0 and WS(rs, k) for k = 1..7) and stores eight values into Cr
 * (element 0 and WS(csr, k), k = 1..7) and eight into Ci (element 0 and
 * WS(csi, k), k = 1..7).
 *
 *   R0, R1    input arrays (presumably the even- and odd-indexed real
 *             samples -- TODO confirm against rdft/codelet-rdft.h)
 *   Cr, Ci    output arrays (presumably real and imaginary parts)
 *   rs        stride handed to WS() for the inputs
 *   csr, csi  strides handed to WS() for Cr and Ci
 *   v         loop count
 *   ivs, ovs  amounts added to the input / output pointers per iteration
 */
static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Named constants: sin/cos of 3*pi/16, pi/16 and pi/8, plus sqrt(1/2). */
DK(KP555570233, +0.555570233019602224742830813948532874374937191);
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
DK(KP195090322, +0.195090322016128267848284868477022240927691618);
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* Runs v times; the increment clause steps all four data pointers and
   applies the project macro MAKE_VOLATILE_STRIDE to each stride (its
   effect is defined outside this file). */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
E T5, T11, TB, TV, Tr, TK, Tu, TJ, Ti, TH, Tl, TG, Tc, T10, TE;
E TS;
/* Inputs R0[0], R0[4], R0[2], R0[6]. */
{
E T1, TU, T4, TT, T2, T3;
T1 = R0[0];
TU = R0[WS(rs, 4)];
T2 = R0[WS(rs, 2)];
T3 = R0[WS(rs, 6)];
T4 = KP707106781 * (T2 - T3);
TT = KP707106781 * (T2 + T3);
T5 = T1 + T4;
T11 = TU - TT;
TB = T1 - T4;
TV = TT + TU;
}
/* Inputs R1[7], R1[3], R1[1], R1[5]. */
{
E Tq, Tt, Tp, Ts, Tn, To;
Tq = R1[WS(rs, 7)];
Tt = R1[WS(rs, 3)];
Tn = R1[WS(rs, 1)];
To = R1[WS(rs, 5)];
Tp = KP707106781 * (Tn - To);
Ts = KP707106781 * (Tn + To);
Tr = Tp - Tq;
TK = Tt - Ts;
Tu = Ts + Tt;
TJ = Tp + Tq;
}
/* Inputs R1[0], R1[4], R1[2], R1[6]. */
{
E Te, Tk, Th, Tj, Tf, Tg;
Te = R1[0];
Tk = R1[WS(rs, 4)];
Tf = R1[WS(rs, 2)];
Tg = R1[WS(rs, 6)];
Th = KP707106781 * (Tf - Tg);
Tj = KP707106781 * (Tf + Tg);
Ti = Te + Th;
TH = Tk - Tj;
Tl = Tj + Tk;
TG = Te - Th;
}
/* Inputs R0[1], R0[5], R0[3], R0[7]. */
{
E T8, TC, Tb, TD;
{
E T6, T7, T9, Ta;
T6 = R0[WS(rs, 1)];
T7 = R0[WS(rs, 5)];
T8 = FNMS(KP382683432, T7, KP923879532 * T6);
TC = FMA(KP382683432, T6, KP923879532 * T7);
T9 = R0[WS(rs, 3)];
Ta = R0[WS(rs, 7)];
Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
TD = FMA(KP923879532, T9, KP382683432 * Ta);
}
Tc = T8 + Tb;
T10 = Tb - T8;
TE = TC - TD;
TS = TC + TD;
}
/* Outputs Cr[4], Cr[3], Ci[7], Ci[0]. */
{
E Td, TW, Tw, TR, Tm, Tv;
Td = T5 - Tc;
TW = TS + TV;
Tm = FMA(KP195090322, Ti, KP980785280 * Tl);
Tv = FNMS(KP980785280, Tu, KP195090322 * Tr);
Tw = Tm + Tv;
TR = Tv - Tm;
Cr[WS(csr, 4)] = Td - Tw;
Ci[WS(csi, 7)] = TR + TW;
Cr[WS(csr, 3)] = Td + Tw;
Ci[0] = TR - TW;
}
/* Outputs Cr[7], Cr[0], Ci[3], Ci[4]. */
{
E Tx, TY, TA, TX, Ty, Tz;
Tx = T5 + Tc;
TY = TV - TS;
Ty = FNMS(KP195090322, Tl, KP980785280 * Ti);
Tz = FMA(KP980785280, Tr, KP195090322 * Tu);
TA = Ty + Tz;
TX = Tz - Ty;
Cr[WS(csr, 7)] = Tx - TA;
Ci[WS(csi, 3)] = TX + TY;
Cr[0] = Tx + TA;
Ci[WS(csi, 4)] = TX - TY;
}
/* Outputs Cr[6], Cr[1], Ci[2], Ci[5]. */
{
E TF, T12, TM, TZ, TI, TL;
TF = TB + TE;
T12 = T10 - T11;
TI = FMA(KP831469612, TG, KP555570233 * TH);
TL = FMA(KP831469612, TJ, KP555570233 * TK);
TM = TI - TL;
TZ = TI + TL;
Cr[WS(csr, 6)] = TF - TM;
Ci[WS(csi, 2)] = T12 - TZ;
Cr[WS(csr, 1)] = TF + TM;
Ci[WS(csi, 5)] = -(TZ + T12);
}
/* Outputs Cr[5], Cr[2], Ci[1], Ci[6]. */
{
E TN, T14, TQ, T13, TO, TP;
TN = TB - TE;
T14 = T10 + T11;
TO = FNMS(KP555570233, TJ, KP831469612 * TK);
TP = FNMS(KP555570233, TG, KP831469612 * TH);
TQ = TO - TP;
T13 = TP + TO;
Cr[WS(csr, 5)] = TN - TQ;
Ci[WS(csi, 1)] = T13 + T14;
Cr[WS(csr, 2)] = TN + TQ;
Ci[WS(csi, 6)] = T13 - T14;
}
}
}
}
/*
 * Descriptor for the non-FMA build of r2cfII_16: the size (16), the codelet
 * name, and a count record whose first three entries repeat the generator's
 * tally for this variant (54 additions, 18 multiplications, 12 fused
 * multiply/adds).  GENUS is not defined in this file.
 */
static const kr2c_desc desc = {
     16,
     "r2cfII_16",
     { 54, 18, 12, 0 },
     &GENUS
};

/* Pass the r2cfII_16 codelet and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_16) (planner *p)
{
     X(kr2c_register) (p, r2cfII_16, &desc);
}
#endif

View File

@@ -0,0 +1,86 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cfII_2 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 0 FP additions, 0 FP multiplications,
* (or, 0 additions, 0 multiplications, 0 fused multiply/add),
* 3 stack variables, 0 constants, and 4 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_2: size-2 codelet emitted by genfft (-n 2 -dft-II).
 *
 * For each of the v iterations, R0[0] is copied to Cr[0] and the negation
 * of R1[0] is stored in Ci[0].  Between iterations the input pointers move
 * by ivs and the output pointers by ovs.  The strides rs, csr and csi are
 * only passed to MAKE_VOLATILE_STRIDE, a project macro defined elsewhere.
 */
static void r2cfII_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT left;

     for (left = v; left > 0; --left, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
          /* Both inputs are fetched before either output is stored. */
          E in0 = R0[0];
          E in1 = R1[0];

          Cr[0] = in0;
          Ci[0] = -in1;
     }
}
/*
 * Descriptor for r2cfII_2: the size (2), the codelet name, and an all-zero
 * count record, in line with the generator's tally of 0 additions,
 * 0 multiplications and 0 fused multiply/adds.  GENUS is not defined in
 * this file.
 */
static const kr2c_desc desc = {
     2,
     "r2cfII_2",
     { 0, 0, 0, 0 },
     &GENUS
};

/* Pass the r2cfII_2 codelet and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_2) (planner *p)
{
     X(kr2c_register) (p, r2cfII_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cfII_2 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 0 FP additions, 0 FP multiplications,
* (or, 0 additions, 0 multiplications, 0 fused multiply/add),
* 3 stack variables, 0 constants, and 4 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_2: size-2 codelet emitted by genfft (-n 2 -dft-II), non-FMA
 * branch.
 *
 * For each of the v iterations, R0[0] is copied to Cr[0] and the negation
 * of R1[0] is stored in Ci[0].  Between iterations the input pointers move
 * by ivs and the output pointers by ovs.  The strides rs, csr and csi are
 * only passed to MAKE_VOLATILE_STRIDE, a project macro defined elsewhere.
 */
static void r2cfII_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT left;

     for (left = v; left > 0; --left, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
          /* Both inputs are fetched before either output is stored. */
          E in0 = R0[0];
          E in1 = R1[0];

          Cr[0] = in0;
          Ci[0] = -in1;
     }
}
/*
 * Descriptor for the non-FMA build of r2cfII_2: the size (2), the codelet
 * name, and an all-zero count record, in line with the generator's tally of
 * 0 additions, 0 multiplications and 0 fused multiply/adds.  GENUS is not
 * defined in this file.
 */
static const kr2c_desc desc = {
     2,
     "r2cfII_2",
     { 0, 0, 0, 0 },
     &GENUS
};

/* Pass the r2cfII_2 codelet and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_2) (planner *p)
{
     X(kr2c_register) (p, r2cfII_2, &desc);
}
#endif

View File

@@ -0,0 +1,394 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:28 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cfII_20 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 102 FP additions, 63 FP multiplications,
* (or, 39 additions, 0 multiplications, 63 fused multiply/add),
* 53 stack variables, 10 constants, and 40 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_20: size-20 codelet emitted by genfft (-fma -n 20 -dft-II).
 *
 * Every pass of the loop reads ten values from R0 and ten from R1
 * (element 0 and WS(rs, k) for k = 1..9) and stores ten values into Cr
 * (element 0 and WS(csr, k), k = 1..9) and ten into Ci (element 0 and
 * WS(csi, k), k = 1..9).
 *
 *   R0, R1    input arrays (presumably the even- and odd-indexed real
 *             samples -- TODO confirm against rdft/codelet-rdft.h)
 *   Cr, Ci    output arrays (presumably real and imaginary parts)
 *   rs        stride handed to WS() for the inputs
 *   csr, csi  strides handed to WS() for Cr and Ci
 *   v         loop count
 *   ivs, ovs  amounts added to the input / output pointers per iteration
 */
static void r2cfII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Named numeric constants used by the arithmetic below. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP690983005, +0.690983005625052575897706582817180941139845410);
DK(KP447213595, +0.447213595499957939281834733746255247088123672);
DK(KP552786404, +0.552786404500042060718165266253744752911876328);
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP381966011, +0.381966011250105151795413165634361882279690820);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT i;
/* Runs v times; the increment clause steps all four data pointers and
   applies the project macro MAKE_VOLATILE_STRIDE to each stride (its
   effect is defined outside this file). */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
E Ti, T1d, T1f, T1e, Tg, T1p, TS, T1g, T1, T6, T7, T1r, T1k, T8, To;
E Tp, Tv, TX, Tr, TV, Tx, TF, TC, TD, T12, TG, TK, T10, Tc, Tf;
Ti = R1[WS(rs, 2)];
T1d = R0[WS(rs, 5)];
/* Inputs R0[9], R0[1], R0[3], R0[7]. */
{
E Ta, Tb, Td, Te;
Ta = R0[WS(rs, 9)];
Tb = R0[WS(rs, 1)];
Tc = Ta - Tb;
T1f = Ta + Tb;
Td = R0[WS(rs, 3)];
Te = R0[WS(rs, 7)];
Tf = Td - Te;
T1e = Td + Te;
}
Tg = FNMS(KP618033988, Tf, Tc);
T1p = FMA(KP381966011, T1e, T1f);
TS = FMA(KP618033988, Tc, Tf);
T1g = FMA(KP381966011, T1f, T1e);
/* Inputs R0[0], R0[4], R0[6], R0[8], R0[2]. */
{
E T2, T5, T3, T4, T1i, T1j;
T1 = R0[0];
T2 = R0[WS(rs, 4)];
T5 = R0[WS(rs, 6)];
T3 = R0[WS(rs, 8)];
T4 = R0[WS(rs, 2)];
T1i = T2 + T5;
T1j = T3 + T4;
T6 = T2 + T3 - T4 - T5;
T7 = FNMS(KP250000000, T6, T1);
T1r = FNMS(KP618033988, T1i, T1j);
T1k = FMA(KP618033988, T1j, T1i);
T8 = (T3 + T5 - T2) - T4;
}
/* Inputs R1[8], R1[6], R1[0], R1[4]. */
{
E Tn, Tu, Tt, Tq, TU;
{
E Tj, Tk, Tl, Tm;
Tj = R1[WS(rs, 8)];
To = R1[WS(rs, 6)];
Tk = R1[0];
Tl = R1[WS(rs, 4)];
Tm = Tk + Tl;
Tn = Tj - Tm;
Tu = Tk - Tl;
Tp = Tj + Tm;
Tt = To + Tj;
}
Tv = FNMS(KP618033988, Tu, Tt);
TX = FMA(KP618033988, Tt, Tu);
Tq = FMA(KP809016994, Tp, To);
Tr = FNMS(KP552786404, Tq, Tn);
TU = FMA(KP447213595, Tp, Tn);
TV = FNMS(KP690983005, TU, To);
}
/* Inputs R1[7], R1[1], R1[3], R1[5], R1[9]. */
{
E TJ, TE, TI, TZ;
Tx = R1[WS(rs, 7)];
{
E Ty, Tz, TA, TB;
Ty = R1[WS(rs, 1)];
TF = R1[WS(rs, 3)];
Tz = R1[WS(rs, 5)];
TA = R1[WS(rs, 9)];
TB = Tz + TA;
TC = Ty + TB;
TJ = Tz - TA;
TE = Ty - TB;
TI = TF + Ty;
}
TD = FMA(KP250000000, TC, Tx);
T12 = FNMS(KP618033988, TI, TJ);
TG = FNMS(KP552786404, TF, TE);
TK = FMA(KP618033988, TJ, TI);
TZ = FMA(KP447213595, TC, TE);
T10 = FNMS(KP690983005, TZ, TF);
}
/* Outputs Cr[2], Cr[7], Ci[2], Ci[7]. */
{
E T19, T1w, T1c, T1x, T1a, T1b;
T19 = T1 + T6;
T1w = T1f + T1d - T1e;
T1a = Ti + To - Tp;
T1b = TC - TF - Tx;
T1c = T1a + T1b;
T1x = T1a - T1b;
Cr[WS(csr, 2)] = FNMS(KP707106781, T1c, T19);
Ci[WS(csi, 2)] = FMS(KP707106781, T1x, T1w);
Cr[WS(csr, 7)] = FMA(KP707106781, T1c, T19);
Ci[WS(csi, 7)] = FMA(KP707106781, T1x, T1w);
}
/* Outputs Cr[6], Cr[3], Cr[8], Cr[1] and Ci[6], Ci[3], Ci[8], Ci[1]. */
{
E TT, T15, T1s, T1u, TY, T17, T13, T16;
{
E TR, T1q, TW, T11;
TR = FMA(KP559016994, T8, T7);
TT = FMA(KP951056516, TS, TR);
T15 = FNMS(KP951056516, TS, TR);
T1q = FNMS(KP809016994, T1p, T1d);
T1s = FNMS(KP951056516, T1r, T1q);
T1u = FMA(KP951056516, T1r, T1q);
TW = FNMS(KP809016994, TV, Ti);
TY = FMA(KP951056516, TX, TW);
T17 = FNMS(KP951056516, TX, TW);
T11 = FNMS(KP809016994, T10, Tx);
T13 = FNMS(KP951056516, T12, T11);
T16 = FMA(KP951056516, T12, T11);
}
{
E T14, T1v, T18, T1t;
T14 = TY - T13;
Cr[WS(csr, 6)] = FNMS(KP707106781, T14, TT);
Cr[WS(csr, 3)] = FMA(KP707106781, T14, TT);
T1v = T17 + T16;
Ci[WS(csi, 6)] = FMS(KP707106781, T1v, T1u);
Ci[WS(csi, 3)] = FMA(KP707106781, T1v, T1u);
T18 = T16 - T17;
Cr[WS(csr, 8)] = FNMS(KP707106781, T18, T15);
Cr[WS(csr, 1)] = FMA(KP707106781, T18, T15);
T1t = TY + T13;
Ci[WS(csi, 8)] = -(FMA(KP707106781, T1t, T1s));
Ci[WS(csi, 1)] = FNMS(KP707106781, T1t, T1s);
}
}
/* Outputs Cr[9], Cr[0], Cr[5], Cr[4] and Ci[0], Ci[9], Ci[4], Ci[5]. */
{
E Th, TN, T1l, T1n, Tw, TO, TL, TP;
{
E T9, T1h, Ts, TH;
T9 = FNMS(KP559016994, T8, T7);
Th = FNMS(KP951056516, Tg, T9);
TN = FMA(KP951056516, Tg, T9);
T1h = FMA(KP809016994, T1g, T1d);
T1l = FMA(KP951056516, T1k, T1h);
T1n = FNMS(KP951056516, T1k, T1h);
Ts = FNMS(KP559016994, Tr, Ti);
Tw = FNMS(KP951056516, Tv, Ts);
TO = FMA(KP951056516, Tv, Ts);
TH = FNMS(KP559016994, TG, TD);
TL = FNMS(KP951056516, TK, TH);
TP = FMA(KP951056516, TK, TH);
}
{
E TM, T1m, TQ, T1o;
TM = Tw - TL;
Cr[WS(csr, 9)] = FNMS(KP707106781, TM, Th);
Cr[0] = FMA(KP707106781, TM, Th);
T1m = TO + TP;
Ci[0] = -(FMA(KP707106781, T1m, T1l));
Ci[WS(csi, 9)] = FNMS(KP707106781, T1m, T1l);
TQ = TO - TP;
Cr[WS(csr, 5)] = FNMS(KP707106781, TQ, TN);
Cr[WS(csr, 4)] = FMA(KP707106781, TQ, TN);
T1o = Tw + TL;
Ci[WS(csi, 4)] = -(FMA(KP707106781, T1o, T1n));
Ci[WS(csi, 5)] = FNMS(KP707106781, T1o, T1n);
}
}
}
}
}
/*
 * Descriptor for the FMA build of r2cfII_20: the size (20), the codelet
 * name, and a count record whose first three entries repeat the generator's
 * tally for this variant (39 additions, 0 multiplications, 63 fused
 * multiply/adds).  GENUS is not defined in this file.
 */
static const kr2c_desc desc = {
     20,
     "r2cfII_20",
     { 39, 0, 63, 0 },
     &GENUS
};

/* Pass the r2cfII_20 codelet and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_20) (planner *p)
{
     X(kr2c_register) (p, r2cfII_20, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cfII_20 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 102 FP additions, 34 FP multiplications,
* (or, 86 additions, 18 multiplications, 16 fused multiply/add),
* 60 stack variables, 13 constants, and 40 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_20: size-20 codelet emitted by genfft (-n 20 -dft-II, non-FMA
 * variant).
 *
 * Every pass of the loop reads ten values from R0 and ten from R1
 * (element 0 and WS(rs, k) for k = 1..9) and stores ten values into Cr
 * (element 0 and WS(csr, k), k = 1..9) and ten into Ci (element 0 and
 * WS(csi, k), k = 1..9).
 *
 *   R0, R1    input arrays (presumably the even- and odd-indexed real
 *             samples -- TODO confirm against rdft/codelet-rdft.h)
 *   Cr, Ci    output arrays (presumably real and imaginary parts)
 *   rs        stride handed to WS() for the inputs
 *   csr, csi  strides handed to WS() for Cr and Ci
 *   v         loop count
 *   ivs, ovs  amounts added to the input / output pointers per iteration
 */
static void r2cfII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Named numeric constants used by the arithmetic below. */
DK(KP572061402, +0.572061402817684297600072783580302076536153377);
DK(KP218508012, +0.218508012224410535399650602527877556893735408);
DK(KP309016994, +0.309016994374947424102293417182819058860154590);
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP176776695, +0.176776695296636881100211090526212259821208984);
DK(KP395284707, +0.395284707521047416499861693054089816714944392);
DK(KP672498511, +0.672498511963957326960058968885748755876783111);
DK(KP415626937, +0.415626937777453428589967464113135184222253485);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* Runs v times; the increment clause steps all four data pointers and
   applies the project macro MAKE_VOLATILE_STRIDE to each stride (its
   effect is defined outside this file). */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
E T8, TD, Tm, TN, T9, TC, TY, TE, Te, TF, Tl, TK, T12, TL, Tk;
E TM, T1, T6, Tq, T1l, T1c, Tp, T1f, T1e, T1d, Ty, TW, T1g, T1m, Tx;
E Tu;
T8 = R1[WS(rs, 2)];
TD = KP707106781 * T8;
Tm = R1[WS(rs, 7)];
TN = KP707106781 * Tm;
/* Inputs R1[6], R1[8], R1[0], R1[4]. */
{
E Ta, TA, Td, TB, Tb, Tc;
T9 = R1[WS(rs, 6)];
Ta = R1[WS(rs, 8)];
TA = T9 + Ta;
Tb = R1[0];
Tc = R1[WS(rs, 4)];
Td = Tb + Tc;
TB = Tb - Tc;
TC = FMA(KP415626937, TA, KP672498511 * TB);
TY = FNMS(KP415626937, TB, KP672498511 * TA);
TE = KP395284707 * (Ta - Td);
Te = Ta + Td;
TF = KP176776695 * Te;
}
/* Inputs R1[1], R1[3], R1[5], R1[9]. */
{
E Tg, TJ, Tj, TI, Th, Ti;
Tg = R1[WS(rs, 1)];
Tl = R1[WS(rs, 3)];
TJ = Tg + Tl;
Th = R1[WS(rs, 5)];
Ti = R1[WS(rs, 9)];
Tj = Th + Ti;
TI = Th - Ti;
TK = FNMS(KP415626937, TJ, KP672498511 * TI);
T12 = FMA(KP415626937, TI, KP672498511 * TJ);
TL = KP395284707 * (Tg - Tj);
Tk = Tg + Tj;
TM = KP176776695 * Tk;
}
/* Inputs R0[0], R0[6], R0[8], R0[2], R0[4]. */
{
E T2, T5, T3, T4, T1a, T1b;
T1 = R0[0];
T2 = R0[WS(rs, 6)];
T5 = R0[WS(rs, 8)];
T3 = R0[WS(rs, 2)];
T4 = R0[WS(rs, 4)];
T1a = T4 + T2;
T1b = T5 + T3;
T6 = T2 + T3 - (T4 + T5);
Tq = FMA(KP250000000, T6, T1);
T1l = FNMS(KP951056516, T1b, KP587785252 * T1a);
T1c = FMA(KP951056516, T1a, KP587785252 * T1b);
Tp = KP559016994 * (T5 + T2 - (T4 + T3));
}
T1f = R0[WS(rs, 5)];
/* Inputs R0[9], R0[1], R0[3], R0[7]. */
{
E Tv, Tw, Ts, Tt;
Tv = R0[WS(rs, 9)];
Tw = R0[WS(rs, 1)];
Tx = Tv - Tw;
T1e = Tv + Tw;
Ts = R0[WS(rs, 3)];
Tt = R0[WS(rs, 7)];
Tu = Ts - Tt;
T1d = Ts + Tt;
}
Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
TW = FNMS(KP951056516, Tx, KP587785252 * Tu);
T1g = FMA(KP809016994, T1d, KP309016994 * T1e) + T1f;
T1m = FNMS(KP809016994, T1e, T1f) - (KP309016994 * T1d);
/* Outputs Cr[2], Cr[7], Ci[2], Ci[7]. */
{
E T7, T1r, To, T1q, Tf, Tn;
T7 = T1 - T6;
T1r = T1e + T1f - T1d;
Tf = T8 + (T9 - Te);
Tn = (Tk - Tl) - Tm;
To = KP707106781 * (Tf + Tn);
T1q = KP707106781 * (Tf - Tn);
Cr[WS(csr, 2)] = T7 - To;
Ci[WS(csi, 2)] = T1q - T1r;
Cr[WS(csr, 7)] = T7 + To;
Ci[WS(csi, 7)] = T1q + T1r;
}
/* Outputs Cr[5], Cr[4], Cr[9], Cr[0] and Ci[5], Ci[4], Ci[0], Ci[9]. */
{
E T1h, T1j, TX, T15, T10, T16, T13, T17, TV, TZ, T11;
T1h = T1c - T1g;
T1j = T1c + T1g;
TV = Tq - Tp;
TX = TV - TW;
T15 = TV + TW;
TZ = FMA(KP218508012, T9, TD) + TF - TE;
T10 = TY + TZ;
T16 = TZ - TY;
T11 = FNMS(KP218508012, Tl, TL) - (TM + TN);
T13 = T11 - T12;
T17 = T11 + T12;
{
E T14, T19, T18, T1i;
T14 = T10 + T13;
Cr[WS(csr, 5)] = TX - T14;
Cr[WS(csr, 4)] = TX + T14;
T19 = T17 - T16;
Ci[WS(csi, 5)] = T19 - T1h;
Ci[WS(csi, 4)] = T19 + T1h;
T18 = T16 + T17;
Cr[WS(csr, 9)] = T15 - T18;
Cr[0] = T15 + T18;
T1i = T13 - T10;
Ci[0] = T1i - T1j;
Ci[WS(csi, 9)] = T1i + T1j;
}
}
/* Outputs Cr[6], Cr[3], Cr[8], Cr[1] and Ci[6], Ci[3], Ci[8], Ci[1]. */
{
E T1n, T1p, Tz, TR, TH, TS, TP, TT, Tr, TG, TO;
T1n = T1l + T1m;
T1p = T1m - T1l;
Tr = Tp + Tq;
Tz = Tr + Ty;
TR = Tr - Ty;
TG = TD + TE + FNMS(KP572061402, T9, TF);
TH = TC + TG;
TS = TC - TG;
TO = TL + TM + FNMS(KP572061402, Tl, TN);
TP = TK - TO;
TT = TK + TO;
{
E TQ, T1o, TU, T1k;
TQ = TH + TP;
Cr[WS(csr, 6)] = Tz - TQ;
Cr[WS(csr, 3)] = Tz + TQ;
T1o = TT - TS;
Ci[WS(csi, 6)] = T1o - T1p;
Ci[WS(csi, 3)] = T1o + T1p;
TU = TS + TT;
Cr[WS(csr, 8)] = TR - TU;
Cr[WS(csr, 1)] = TR + TU;
T1k = TP - TH;
Ci[WS(csi, 8)] = T1k - T1n;
Ci[WS(csi, 1)] = T1k + T1n;
}
}
}
}
}
/*
 * Descriptor for the non-FMA build of r2cfII_20: the size (20), the codelet
 * name, and a count record whose first three entries repeat the generator's
 * tally for this variant (86 additions, 18 multiplications, 16 fused
 * multiply/adds).  GENUS is not defined in this file.
 */
static const kr2c_desc desc = {
     20,
     "r2cfII_20",
     { 86, 18, 16, 0 },
     &GENUS
};

/* Pass the r2cfII_20 codelet and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_20) (planner *p)
{
     X(kr2c_register) (p, r2cfII_20, &desc);
}
#endif

View File

@@ -0,0 +1,776 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:28 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cfII_25 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 212 FP additions, 177 FP multiplications,
* (or, 47 additions, 12 multiplications, 165 fused multiply/add),
* 131 stack variables, 67 constants, and 50 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_25: size-25 codelet emitted by genfft (-fma -n 25 -dft-II).
 *
 * Every pass of the loop reads thirteen values from R0 (element 0 and
 * WS(rs, k) for k = 1..12) and twelve from R1 (element 0 and WS(rs, k) for
 * k = 1..11).  It stores thirteen values into Cr (element 0 and
 * WS(csr, k), k = 1..12) and twelve into Ci (element 0 and WS(csi, k),
 * k = 1..11).
 *
 *   R0, R1    input arrays (presumably the even- and odd-indexed real
 *             samples -- TODO confirm against rdft/codelet-rdft.h)
 *   Cr, Ci    output arrays (presumably real and imaginary parts)
 *   rs        stride handed to WS() for the inputs
 *   csr, csi  strides handed to WS() for Cr and Ci
 *   v         loop count
 *   ivs, ovs  amounts added to the input / output pointers per iteration
 */
static void r2cfII_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Named numeric constants used by the arithmetic below. */
DK(KP876091699, +0.876091699473550838204498029706869638173524346);
DK(KP792626838, +0.792626838241819413632131824093538848057784557);
DK(KP690668130, +0.690668130712929053565177988380887884042527623);
DK(KP809385824, +0.809385824416008241660603814668679683846476688);
DK(KP860541664, +0.860541664367944677098261680920518816412804187);
DK(KP681693190, +0.681693190061530575150324149145440022633095390);
DK(KP560319534, +0.560319534973832390111614715371676131169633784);
DK(KP237294955, +0.237294955877110315393888866460840817927895961);
DK(KP897376177, +0.897376177523557693138608077137219684419427330);
DK(KP584303379, +0.584303379262766050358567120694562180043261496);
DK(KP653711795, +0.653711795629256296299985401753308353544378892);
DK(KP997675361, +0.997675361079556513670859573984492383596555031);
DK(KP645989928, +0.645989928319777763844272876603899665178054552);
DK(KP591287873, +0.591287873858343558732323717242372865934480959);
DK(KP952936919, +0.952936919628306576880750665357914584765951388);
DK(KP998026728, +0.998026728428271561952336806863450553336905220);
DK(KP956723877, +0.956723877038460305821989399535483155872969262);
DK(KP945422727, +0.945422727388575946270360266328811958657216298);
DK(KP734762448, +0.734762448793050413546343770063151342619912334);
DK(KP772036680, +0.772036680810363904029489473607579825330539880);
DK(KP683113946, +0.683113946453479238701949862233725244439656928);
DK(KP559154169, +0.559154169276087864842202529084232643714075927);
DK(KP242145790, +0.242145790282157779872542093866183953459003101);
DK(KP968583161, +0.968583161128631119490168375464735813836012403);
DK(KP999754674, +0.999754674276473633366203429228112409535557487);
DK(KP904730450, +0.904730450839922351881287709692877908104763647);
DK(KP916574801, +0.916574801383451584742370439148878693530976769);
DK(KP829049696, +0.829049696159252993975487806364305442437946767);
DK(KP831864738, +0.831864738706457140726048799369896829771167132);
DK(KP876306680, +0.876306680043863587308115903922062583399064238);
DK(KP949179823, +0.949179823508441261575555465843363271711583843);
DK(KP669429328, +0.669429328479476605641803240971985825917022098);
DK(KP262346850, +0.262346850930607871785420028382979691334784273);
DK(KP923225144, +0.923225144846402650453449441572664695995209956);
DK(KP906616052, +0.906616052148196230441134447086066874408359177);
DK(KP921078979, +0.921078979742360627699756128143719920817673854);
DK(KP982009705, +0.982009705009746369461829878184175962711969869);
DK(KP845997307, +0.845997307939530944175097360758058292389769300);
DK(KP992114701, +0.992114701314477831049793042785778521453036709);
DK(KP803003575, +0.803003575438660414833440593570376004635464850);
DK(KP763583905, +0.763583905359130246362948588764067237776594106);
DK(KP248028675, +0.248028675328619457762448260696444630363259177);
DK(KP904508497, +0.904508497187473712051146708591409529430077295);
DK(KP894834959, +0.894834959464455102997960030820114611498661386);
DK(KP958953096, +0.958953096729998668045963838399037225970891871);
DK(KP867381224, +0.867381224396525206773171885031575671309956167);
DK(KP912575812, +0.912575812670962425556968549836277086778922727);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP869845200, +0.869845200362138853122720822420327157933056305);
DK(KP120146378, +0.120146378570687701782758537356596213647956445);
DK(KP132830569, +0.132830569247582714407653942074819768844536507);
DK(KP786782374, +0.786782374965295178365099601674911834788448471);
DK(KP893101515, +0.893101515366181661711202267938416198338079437);
DK(KP987388751, +0.987388751065621252324603216482382109400433949);
DK(KP244189809, +0.244189809627953270309879511234821255780225091);
DK(KP269969613, +0.269969613759572083574752974412347470060951301);
DK(KP494780565, +0.494780565770515410344588413655324772219443730);
DK(KP066152395, +0.066152395967733048213034281011006031460903353);
DK(KP059835404, +0.059835404262124915169548397419498386427871950);
DK(KP447533225, +0.447533225982656890041886979663652563063114397);
DK(KP522847744, +0.522847744331509716623755382187077770911012542);
DK(KP667278218, +0.667278218140296670899089292254759909713898805);
DK(KP603558818, +0.603558818296015001454675132653458027918768137);
DK(KP578046249, +0.578046249379945007321754579646815604023525655);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT i;
/* Runs v times; the increment clause steps all four data pointers and
   applies the project macro MAKE_VOLATILE_STRIDE to each stride (its
   effect is defined outside this file). */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
E T2v, TJ, T2A, T1K, T2y, T2z, TB, T15, T2d, T2l, T1g, T1s, T1N, T21, T1D;
E T9, TQ, T2g, T2o, T1j, T1u, T1X, T25, T1z, Ti, TX, T2f, T2p, T1k, T1v;
E T1U, T24, T1A, Ts, T1c, T2c, T2k, T1h, T1r, T1Q, T22, T1C, Tj, TC;
/* Inputs R0[0], R0[10], R1[2], R0[5], R1[7]. */
{
E TI, T2x, TF, T2w;
T2v = R0[0];
{
E TG, TH, TD, TE;
TG = R0[WS(rs, 10)];
TH = R1[WS(rs, 2)];
TI = TG + TH;
T2x = TG - TH;
TD = R0[WS(rs, 5)];
TE = R1[WS(rs, 7)];
TF = TD + TE;
T2w = TD - TE;
}
TJ = FMA(KP618033988, TI, TF);
T2A = T2w - T2x;
T1K = FNMS(KP618033988, TF, TI);
T2y = T2w + T2x;
T2z = FNMS(KP250000000, T2y, T2v);
}
/* Inputs R0[2], R0[7], R1[9], R0[12], R1[4]. */
{
E Tt, TA, T13, TZ, T10;
Tt = R0[WS(rs, 2)];
{
E Tu, Tv, Tw, Tx, Ty, Tz;
Tu = R0[WS(rs, 7)];
Tv = R1[WS(rs, 9)];
Tw = Tu - Tv;
Tx = R0[WS(rs, 12)];
Ty = R1[WS(rs, 4)];
Tz = Tx - Ty;
TA = Tw + Tz;
T13 = Tz - Tw;
TZ = Tu + Tv;
T10 = Tx + Ty;
}
TB = Tt + TA;
{
E T11, T1M, T14, T1L, T12;
T11 = FMA(KP618033988, T10, TZ);
T1M = FNMS(KP618033988, TZ, T10);
T12 = FNMS(KP250000000, TA, Tt);
T14 = FNMS(KP559016994, T13, T12);
T1L = FMA(KP559016994, T13, T12);
T15 = FMA(KP578046249, T14, T11);
T2d = FNMS(KP603558818, T1M, T1L);
T2l = FMA(KP667278218, T1L, T1M);
T1g = FNMS(KP522847744, T11, T14);
T1s = FMA(KP447533225, T11, T14);
T1N = FMA(KP059835404, T1M, T1L);
T21 = FNMS(KP066152395, T1L, T1M);
T1D = FNMS(KP494780565, T14, T11);
}
}
/* Inputs R0[1], R0[6], R1[8], R0[11], R1[3]. */
{
E T1, T8, TO, TK, TL;
T1 = R0[WS(rs, 1)];
{
E T2, T3, T4, T5, T6, T7;
T2 = R0[WS(rs, 6)];
T3 = R1[WS(rs, 8)];
T4 = T2 - T3;
T5 = R0[WS(rs, 11)];
T6 = R1[WS(rs, 3)];
T7 = T5 - T6;
T8 = T4 + T7;
TO = T4 - T7;
TK = T2 + T3;
TL = T5 + T6;
}
T9 = T1 + T8;
{
E TM, T1V, TP, T1W, TN;
TM = FMA(KP618033988, TL, TK);
T1V = FNMS(KP618033988, TK, TL);
TN = FNMS(KP250000000, T8, T1);
TP = FMA(KP559016994, TO, TN);
T1W = FNMS(KP559016994, TO, TN);
TQ = FMA(KP269969613, TP, TM);
T2g = FNMS(KP578046249, T1W, T1V);
T2o = FMA(KP522847744, T1V, T1W);
T1j = FNMS(KP244189809, TM, TP);
T1u = FNMS(KP603558818, TM, TP);
T1X = FMA(KP987388751, T1W, T1V);
T25 = FNMS(KP893101515, T1V, T1W);
T1z = FMA(KP667278218, TP, TM);
}
}
/* Inputs R0[4], R0[9], R1[11], R1[6], R1[1]. */
{
E Th, Tg, TV, TS, TU;
Th = R0[WS(rs, 4)];
{
E Ta, Tb, Tc, Td, Te, Tf;
Ta = R0[WS(rs, 9)];
Tb = R1[WS(rs, 11)];
Tc = Ta - Tb;
Td = R1[WS(rs, 6)];
Te = R1[WS(rs, 1)];
Tf = Td + Te;
Tg = Tc - Tf;
TV = Te - Td;
TS = Tc + Tf;
TU = Ta + Tb;
}
Ti = Tg + Th;
{
E TW, T1S, TT, T1T, TR;
TW = FNMS(KP618033988, TV, TU);
T1S = FMA(KP618033988, TU, TV);
TR = FNMS(KP250000000, Tg, Th);
TT = FMA(KP559016994, TS, TR);
T1T = FNMS(KP559016994, TS, TR);
TX = FMA(KP603558818, TW, TT);
T2f = FNMS(KP447533225, T1S, T1T);
T2p = FMA(KP494780565, T1T, T1S);
T1k = FNMS(KP667278218, TT, TW);
T1v = FNMS(KP786782374, TW, TT);
T1U = FMA(KP132830569, T1T, T1S);
T24 = FNMS(KP120146378, T1S, T1T);
T1A = FMA(KP869845200, TT, TW);
}
}
/* Inputs R0[3], R0[8], R1[10], R1[0], R1[5]. */
{
E Tk, Tr, T1a, T16, T17;
Tk = R0[WS(rs, 3)];
{
E Tl, Tm, Tn, To, Tp, Tq;
Tl = R0[WS(rs, 8)];
Tm = R1[WS(rs, 10)];
Tn = Tl - Tm;
To = R1[0];
Tp = R1[WS(rs, 5)];
Tq = To + Tp;
Tr = Tn - Tq;
T1a = Tn + Tq;
T16 = Tl + Tm;
T17 = Tp - To;
}
Ts = Tk + Tr;
{
E T18, T1P, T1b, T1O, T19;
T18 = FMA(KP618033988, T17, T16);
T1P = FNMS(KP618033988, T16, T17);
T19 = FNMS(KP250000000, Tr, Tk);
T1b = FMA(KP559016994, T1a, T19);
T1O = FNMS(KP559016994, T1a, T19);
T1c = FMA(KP987388751, T1b, T18);
T2c = FNMS(KP059835404, T1P, T1O);
T2k = FMA(KP066152395, T1O, T1P);
T1h = FNMS(KP893101515, T18, T1b);
T1r = FMA(KP132830569, T1b, T18);
T1Q = FNMS(KP786782374, T1P, T1O);
T22 = FMA(KP869845200, T1O, T1P);
T1C = FNMS(KP120146378, T18, T1b);
}
}
/* Outputs Ci[2] and Ci[7]. */
Tj = T9 - Ti;
TC = Ts - TB;
Ci[WS(csi, 2)] = -(KP951056516 * (FNMS(KP618033988, TC, Tj)));
Ci[WS(csi, 7)] = KP951056516 * (FMA(KP618033988, Tj, TC));
/* Outputs Cr[12], Cr[2], Cr[7]. */
{
E T3l, T3o, T3q, T3m, T3n, T3p;
T3l = T2v + T2y;
T3m = T9 + Ti;
T3n = TB + Ts;
T3o = T3m + T3n;
T3q = T3m - T3n;
Cr[WS(csr, 12)] = T3o + T3l;
T3p = FNMS(KP250000000, T3o, T3l);
Cr[WS(csr, 2)] = FMA(KP559016994, T3q, T3p);
Cr[WS(csr, 7)] = FNMS(KP559016994, T3q, T3p);
}
/* Outputs Ci[4] and Ci[9]. */
{
E T1B, T1E, T1x, T1I, T1G, T1t, T1w, T1F, T1y, T1J, T1H;
T1B = FMA(KP912575812, T1A, T1z);
T1E = FMA(KP867381224, T1D, T1C);
T1t = FMA(KP958953096, T1s, T1r);
T1w = FNMS(KP912575812, T1v, T1u);
T1F = FNMS(KP894834959, T1w, T1t);
T1x = FMA(KP894834959, T1w, T1t);
T1I = FNMS(KP894834959, T1B, T1F);
T1G = FNMS(KP904508497, T1F, T1E);
T1y = FMA(KP248028675, T1x, TJ);
T1J = FMA(KP559016994, T1I, T1E);
T1H = FMA(KP763583905, T1G, T1B);
Ci[WS(csi, 4)] = KP951056516 * (FNMS(KP803003575, T1H, T1y));
Ci[WS(csi, 9)] = KP951056516 * (FNMS(KP992114701, T1J, T1y));
}
/* Outputs Ci[8] and Ci[3]. */
{
E T2m, T2q, T2i, T2t, T2r, T2e, T2h, T2n, T2j, T2u, T2s;
T2m = FNMS(KP845997307, T2l, T2k);
T2q = FMA(KP982009705, T2p, T2o);
T2e = FMA(KP845997307, T2d, T2c);
T2h = FNMS(KP921078979, T2g, T2f);
T2n = FNMS(KP906616052, T2h, T2e);
T2i = FMA(KP906616052, T2h, T2e);
T2t = T2m + T2n;
T2r = FNMS(KP923225144, T2q, T2n);
T2j = FMA(KP262346850, T2i, T1K);
T2u = FNMS(KP669429328, T2t, T2q);
T2s = FNMS(KP618033988, T2r, T2m);
Ci[WS(csi, 8)] = KP951056516 * (FMA(KP949179823, T2s, T2j));
Ci[WS(csi, 3)] = KP951056516 * (FNMS(KP876306680, T2u, T2j));
}
/* Outputs Ci[0], Ci[5], Ci[10]. */
{
E T1i, T1l, T1e, T1p, T1n, TY, T1d, T1m, T1f, T1q, T1o;
T1i = FNMS(KP831864738, T1h, T1g);
T1l = FMA(KP829049696, T1k, T1j);
TY = FMA(KP916574801, TX, TQ);
T1d = FMA(KP831864738, T1c, T15);
T1m = FNMS(KP904730450, T1d, TY);
T1e = FMA(KP904730450, T1d, TY);
T1p = FNMS(KP999754674, T1m, T1i);
T1n = FNMS(KP904508497, T1m, T1l);
Ci[0] = -(KP951056516 * (FMA(KP968583161, T1e, TJ)));
T1f = FNMS(KP242145790, T1e, TJ);
T1q = FMA(KP559154169, T1p, T1l);
T1o = FNMS(KP683113946, T1n, T1i);
Ci[WS(csi, 5)] = -(KP951056516 * (FNMS(KP876306680, T1o, T1f)));
Ci[WS(csi, 10)] = -(KP951056516 * (FNMS(KP968583161, T1q, T1f)));
}
/* Outputs Ci[1], Ci[6], Ci[11]. */
{
E T23, T26, T1Z, T2a, T28, T1R, T1Y, T27, T20, T2b, T29;
T23 = FNMS(KP772036680, T22, T21);
T26 = FMA(KP734762448, T25, T24);
T1R = FMA(KP772036680, T1Q, T1N);
T1Y = FMA(KP734762448, T1X, T1U);
T27 = FNMS(KP945422727, T1Y, T1R);
T1Z = FMA(KP945422727, T1Y, T1R);
T2a = T27 - T23;
T28 = FMA(KP956723877, T27, T26);
Ci[WS(csi, 1)] = -(KP998026728 * (FMA(KP952936919, T1K, T1Z)));
T20 = FNMS(KP262346850, T1Z, T1K);
T2b = FMA(KP591287873, T2a, T26);
T29 = FMA(KP645989928, T28, T23);
Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP949179823, T29, T20)));
Ci[WS(csi, 11)] = -(KP951056516 * (FNMS(KP992114701, T2b, T20)));
}
/* Outputs Cr[1], Cr[6], Cr[11], Cr[3], Cr[8]. */
{
E T2Y, T33, T31, T38, T36, T3e, T3f, T3c, T3j, T3h, T3a, T3b, T3g;
T2Y = FNMS(KP559016994, T2A, T2z);
T33 = FNMS(KP772036680, T1Q, T1N);
{
E T34, T2Z, T30, T35;
T34 = FNMS(KP734762448, T1X, T1U);
T2Z = FNMS(KP734762448, T25, T24);
T30 = FMA(KP772036680, T22, T21);
T35 = FNMS(KP956723877, T30, T2Z);
T31 = FMA(KP956723877, T30, T2Z);
T38 = FMA(KP618033988, T35, T34);
T36 = T34 + T35;
}
T3e = FMA(KP921078979, T2g, T2f);
T3f = FNMS(KP845997307, T2d, T2c);
T3a = FMA(KP845997307, T2l, T2k);
T3b = FNMS(KP982009705, T2p, T2o);
T3g = FNMS(KP923225144, T3b, T3a);
T3c = FMA(KP923225144, T3b, T3a);
T3j = FNMS(KP997675361, T3g, T3e);
T3h = FNMS(KP904508497, T3g, T3f);
Cr[WS(csr, 1)] = FNMS(KP992114701, T31, T2Y);
{
E T32, T39, T37, T3d, T3k, T3i;
T32 = FMA(KP248028675, T31, T2Y);
T39 = FNMS(KP653711795, T33, T38);
T37 = FMA(KP584303379, T36, T33);
Cr[WS(csr, 6)] = FMA(KP949179823, T37, T32);
Cr[WS(csr, 11)] = FNMS(KP897376177, T39, T32);
T3d = FNMS(KP237294955, T3c, T2Y);
T3k = FNMS(KP560319534, T3j, T3f);
T3i = FMA(KP681693190, T3h, T3e);
Cr[WS(csr, 3)] = FMA(KP860541664, T3i, T3d);
Cr[WS(csr, 8)] = FMA(KP949179823, T3k, T3d);
}
}
/* Outputs Cr[0], Cr[9], Cr[4], Cr[5], Cr[10]. */
{
E T2B, T2R, T2T, T2P, T2W, T2U, T2G, T2H, T2E, T2L, T2J;
T2B = FMA(KP559016994, T2A, T2z);
{
E T2N, T2O, T2S, T2C, T2D, T2I;
T2R = FNMS(KP958953096, T1s, T1r);
T2T = FMA(KP912575812, T1v, T1u);
T2N = FNMS(KP867381224, T1D, T1C);
T2O = FNMS(KP912575812, T1A, T1z);
T2S = FMA(KP809385824, T2O, T2N);
T2P = FNMS(KP809385824, T2O, T2N);
T2W = T2R + T2S;
T2U = FNMS(KP894834959, T2T, T2S);
T2G = FNMS(KP831864738, T1c, T15);
T2H = FNMS(KP916574801, TX, TQ);
T2C = FNMS(KP829049696, T1k, T1j);
T2D = FMA(KP831864738, T1h, T1g);
T2I = FNMS(KP904730450, T2D, T2C);
T2E = FMA(KP904730450, T2D, T2C);
T2L = FMA(KP904730450, T2G, T2I);
T2J = T2H + T2I;
}
Cr[0] = FMA(KP968583161, T2E, T2B);
{
E T2Q, T2X, T2V, T2F, T2M, T2K;
T2Q = FMA(KP248028675, T2P, T2B);
T2X = FNMS(KP690668130, T2W, T2T);
T2V = FNMS(KP618033988, T2U, T2R);
Cr[WS(csr, 9)] = FMA(KP897376177, T2V, T2Q);
Cr[WS(csr, 4)] = FNMS(KP803003575, T2X, T2Q);
T2F = FNMS(KP242145790, T2E, T2B);
T2M = FMA(KP618033988, T2L, T2H);
T2K = FNMS(KP683113946, T2J, T2G);
Cr[WS(csr, 5)] = FMA(KP792626838, T2K, T2F);
Cr[WS(csr, 10)] = FMA(KP876091699, T2M, T2F);
}
}
}
}
}
/*
 * Descriptor for the FMA build of r2cfII_25: the size (25), the codelet
 * name, and a count record whose first three entries repeat the generator's
 * tally for this variant (47 additions, 12 multiplications, 165 fused
 * multiply/adds).  GENUS is not defined in this file.
 */
static const kr2c_desc desc = {
     25,
     "r2cfII_25",
     { 47, 12, 165, 0 },
     &GENUS
};

/* Pass the r2cfII_25 codelet and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_25) (planner *p)
{
     X(kr2c_register) (p, r2cfII_25, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cfII_25 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 213 FP additions, 148 FP multiplications,
* (or, 126 additions, 61 multiplications, 87 fused multiply/add),
* 94 stack variables, 38 constants, and 50 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_25 -- size-25 straight-line kernel, variant generated without the
 * -fma flag.
 *
 * Each of the v iterations reads 25 real values, R0[WS(rs, 0..12)] and
 * R1[WS(rs, 0..11)], and writes 25 results, Cr[WS(csr, 0..12)] and
 * Ci[WS(csi, 0..11)].  All loads of an iteration precede its stores.
 *
 *   R0, R1    input arrays; both advance by ivs after every iteration
 *   Cr, Ci    output arrays; both advance by ovs after every iteration
 *   rs        stride handed to WS() when indexing R0 and R1
 *   csr, csi  strides handed to WS() when indexing Cr and Ci
 *   v         iteration count (the loop body never runs when v <= 0)
 *
 * NOTE(review): the name and the generator's "-dft-II" flag suggest a
 * real-to-complex transform of the shifted (type-II) kind, with R0/R1
 * holding even/odd input samples -- confirm against rdft/scalar/r2cfII.h.
 */
static void r2cfII_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Named numeric constants, introduced through the project's DK macro. */
DK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
DK(KP062790519, +0.062790519529313376076178224565631133122484832);
DK(KP125581039, +0.125581039058626752152356449131262266244969664);
DK(KP998026728, +0.998026728428271561952336806863450553336905220);
DK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
DK(KP728968627, +0.728968627421411523146730319055259111372571664);
DK(KP963507348, +0.963507348203430549974383005744259307057084020);
DK(KP876306680, +0.876306680043863587308115903922062583399064238);
DK(KP497379774, +0.497379774329709576484567492012895936835134813);
DK(KP968583161, +0.968583161128631119490168375464735813836012403);
DK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
DK(KP684547105, +0.684547105928688673732283357621209269889519233);
DK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
DK(KP481753674, +0.481753674101715274987191502872129653528542010);
DK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
DK(KP248689887, +0.248689887164854788242283746006447968417567406);
DK(KP992114701, +0.992114701314477831049793042785778521453036709);
DK(KP250666467, +0.250666467128608490746237519633017587885836494);
DK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
DK(KP425779291, +0.425779291565072648862502445744251703979973042);
DK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
DK(KP637423989, +0.637423989748689710176712811676016195434917298);
DK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
DK(KP535826794, +0.535826794978996618271308767867639978063575346);
DK(KP851558583, +0.851558583130145297725004891488503407959946084);
DK(KP904827052, +0.904827052466019527713668647932697593970413911);
DK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
DK(KP125333233, +0.125333233564304245373118759816508793942918247);
DK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
DK(KP770513242, +0.770513242775789230803009636396177847271667672);
DK(KP844327925, +0.844327925502015078548558063966681505381659241);
DK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
DK(KP293892626, +0.293892626146236564584352977319536384298826219);
DK(KP475528258, +0.475528258147576786058219666689691071702849317);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT i;
/* i counts down from v; the increment clause also steps the four data
 * pointers and applies MAKE_VOLATILE_STRIDE (project macro, semantics not
 * visible here) to each stride. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
/* Intermediates shared between the load groups and the output stages. */
E TE, TR, T2i, T1z, TL, TS, TB, T2d, T1l, T1i, T2c, T9, T23, TZ, TW;
E T22, Ti, T26, T16, T13, T25, Ts, T2a, T1e, T1b, T29, TP, TQ;
/* Load group 1: R0[0], R0[10], R1[2], R0[5], R1[7]
 * -> TE, TR, T2i, T1z, TL, TS. */
{
E TK, T1y, TH, T1x;
TE = R0[0];
{
E TI, TJ, TF, TG;
TI = R0[WS(rs, 10)];
TJ = R1[WS(rs, 2)];
TK = TI - TJ;
T1y = TI + TJ;
TF = R0[WS(rs, 5)];
TG = R1[WS(rs, 7)];
TH = TF - TG;
T1x = TF + TG;
}
TR = KP559016994 * (TH - TK);
T2i = FNMS(KP587785252, T1x, KP951056516 * T1y);
T1z = FMA(KP951056516, T1x, KP587785252 * T1y);
TL = TH + TK;
TS = FNMS(KP250000000, TL, TE);
}
/* Load group 2: R0[3], R0[8], R1[10], R1[0], R1[5]
 * -> TB, T2d, T1l, T1i, T2c. */
{
E Tt, Tw, Tz, TA, T1k, T1j, T1g, T1h;
Tt = R0[WS(rs, 3)];
{
E Tu, Tv, Tx, Ty;
Tu = R0[WS(rs, 8)];
Tv = R1[WS(rs, 10)];
Tw = Tu - Tv;
Tx = R1[0];
Ty = R1[WS(rs, 5)];
Tz = Tx + Ty;
TA = Tw - Tz;
T1k = Ty - Tx;
T1j = Tu + Tv;
}
TB = Tt + TA;
T2d = FNMS(KP293892626, T1j, KP475528258 * T1k);
T1l = FMA(KP475528258, T1j, KP293892626 * T1k);
T1g = FNMS(KP250000000, TA, Tt);
T1h = KP559016994 * (Tw + Tz);
T1i = T1g + T1h;
T2c = T1g - T1h;
}
/* Load group 3: R0[1], R0[6], R1[8], R0[11], R1[3]
 * -> T9, T23, TZ, TW, T22. */
{
E T1, T4, T7, T8, TY, TX, TU, TV;
T1 = R0[WS(rs, 1)];
{
E T2, T3, T5, T6;
T2 = R0[WS(rs, 6)];
T3 = R1[WS(rs, 8)];
T4 = T2 - T3;
T5 = R0[WS(rs, 11)];
T6 = R1[WS(rs, 3)];
T7 = T5 - T6;
T8 = T4 + T7;
TY = T5 + T6;
TX = T2 + T3;
}
T9 = T1 + T8;
T23 = FNMS(KP293892626, TX, KP475528258 * TY);
TZ = FMA(KP475528258, TX, KP293892626 * TY);
TU = KP559016994 * (T4 - T7);
TV = FNMS(KP250000000, T8, T1);
TW = TU + TV;
T22 = TV - TU;
}
/* Load group 4: R0[4], R0[9], R1[11], R1[1], R1[6]
 * -> Ti, T26, T16, T13, T25 (same arithmetic shape as group 2). */
{
E Ta, Td, Tg, Th, T15, T14, T11, T12;
Ta = R0[WS(rs, 4)];
{
E Tb, Tc, Te, Tf;
Tb = R0[WS(rs, 9)];
Tc = R1[WS(rs, 11)];
Td = Tb - Tc;
Te = R1[WS(rs, 1)];
Tf = R1[WS(rs, 6)];
Tg = Te + Tf;
Th = Td - Tg;
T15 = Tf - Te;
T14 = Tb + Tc;
}
Ti = Ta + Th;
T26 = FNMS(KP293892626, T14, KP475528258 * T15);
T16 = FMA(KP475528258, T14, KP293892626 * T15);
T11 = FNMS(KP250000000, Th, Ta);
T12 = KP559016994 * (Td + Tg);
T13 = T11 + T12;
T25 = T11 - T12;
}
/* Load group 5: R0[2], R0[7], R1[9], R0[12], R1[4]
 * -> Ts, T2a, T1e, T1b, T29 (same arithmetic shape as group 3). */
{
E Tk, Tn, Tq, Tr, T1d, T1c, T19, T1a;
Tk = R0[WS(rs, 2)];
{
E Tl, Tm, To, Tp;
Tl = R0[WS(rs, 7)];
Tm = R1[WS(rs, 9)];
Tn = Tl - Tm;
To = R0[WS(rs, 12)];
Tp = R1[WS(rs, 4)];
Tq = To - Tp;
Tr = Tn + Tq;
T1d = To + Tp;
T1c = Tl + Tm;
}
Ts = Tk + Tr;
T2a = FNMS(KP293892626, T1c, KP475528258 * T1d);
T1e = FMA(KP475528258, T1c, KP293892626 * T1d);
T19 = KP559016994 * (Tn - Tq);
T1a = FNMS(KP250000000, Tr, Tk);
T1b = T19 + T1a;
T29 = T1a - T19;
}
/* Outputs built only from the group totals T9, Ti, Ts, TB and TE + TL:
 * Ci[2], Ci[7], then Cr[12], Cr[2], Cr[7]. */
TP = TB - Ts;
TQ = T9 - Ti;
Ci[WS(csi, 2)] = FNMS(KP951056516, TQ, KP587785252 * TP);
Ci[WS(csi, 7)] = FMA(KP587785252, TQ, KP951056516 * TP);
{
E TM, TD, TN, Tj, TC, TO;
TM = TE + TL;
Tj = T9 + Ti;
TC = Ts + TB;
TD = KP559016994 * (Tj - TC);
TN = Tj + TC;
Cr[WS(csr, 12)] = TM + TN;
TO = FNMS(KP250000000, TN, TM);
Cr[WS(csr, 2)] = TD + TO;
Cr[WS(csr, 7)] = TO - TD;
}
/* Output stage fed by TT = TR + TS, T1z and the pairs (TW, TZ),
 * (T13, T16), (T1b, T1e), (T1i, T1l):
 * writes Cr[0], Cr[4], Cr[5], Cr[9], Cr[10] and Ci[0], Ci[4], Ci[5],
 * Ci[9], Ci[10]. */
{
E TT, T1J, T1Y, T1U, T1X, T1P, T1V, T1M, T1W, T1A, T1B, T1r, T1C, T1v, T18;
E T1n, T1o, T1G, T1D;
TT = TR + TS;
{
E T1H, T1I, T1S, T1T;
T1H = FNMS(KP844327925, TW, KP1_071653589 * TZ);
T1I = FNMS(KP1_274847979, T16, KP770513242 * T13);
T1J = T1H - T1I;
T1Y = T1H + T1I;
T1S = FMA(KP125333233, T1i, KP1_984229402 * T1l);
T1T = FMA(KP904827052, T1b, KP851558583 * T1e);
T1U = T1S - T1T;
T1X = T1T + T1S;
}
{
E T1N, T1O, T1K, T1L;
T1N = FMA(KP535826794, TW, KP1_688655851 * TZ);
T1O = FMA(KP637423989, T13, KP1_541026485 * T16);
T1P = T1N - T1O;
T1V = T1N + T1O;
T1K = FNMS(KP1_809654104, T1e, KP425779291 * T1b);
T1L = FNMS(KP992114701, T1i, KP250666467 * T1l);
T1M = T1K - T1L;
T1W = T1K + T1L;
}
{
E T1p, T1q, T1t, T1u;
T1p = FMA(KP844327925, T13, KP1_071653589 * T16);
T1q = FMA(KP248689887, TW, KP1_937166322 * TZ);
T1A = T1q + T1p;
T1t = FMA(KP481753674, T1b, KP1_752613360 * T1e);
T1u = FMA(KP684547105, T1i, KP1_457937254 * T1l);
T1B = T1t + T1u;
T1r = T1p - T1q;
T1C = T1A + T1B;
T1v = T1t - T1u;
}
{
E T10, T17, T1f, T1m;
T10 = FNMS(KP497379774, TZ, KP968583161 * TW);
T17 = FNMS(KP1_688655851, T16, KP535826794 * T13);
T18 = T10 + T17;
T1f = FNMS(KP963507348, T1e, KP876306680 * T1b);
T1m = FNMS(KP1_369094211, T1l, KP728968627 * T1i);
T1n = T1f + T1m;
T1o = T18 + T1n;
T1G = T10 - T17;
T1D = T1f - T1m;
}
{
E T1R, T1Q, T20, T1Z;
Cr[0] = TT + T1o;
Ci[0] = -(T1z + T1C);
T1R = KP559016994 * (T1P + T1M);
T1Q = FMA(KP250000000, T1M - T1P, TT);
Cr[WS(csr, 4)] = FMA(KP951056516, T1J, T1Q) + FMA(KP587785252, T1U, T1R);
Cr[WS(csr, 9)] = FMA(KP951056516, T1U, T1Q) + FNMA(KP587785252, T1J, T1R);
T20 = KP559016994 * (T1Y + T1X);
T1Z = FMA(KP250000000, T1X - T1Y, T1z);
Ci[WS(csi, 9)] = FMA(KP587785252, T1V, KP951056516 * T1W) + T1Z - T20;
Ci[WS(csi, 4)] = FMA(KP587785252, T1W, T1Z) + FNMS(KP951056516, T1V, T20);
{
E T1E, T1F, T1s, T1w;
T1E = FMS(KP250000000, T1C, T1z);
T1F = KP559016994 * (T1B - T1A);
Ci[WS(csi, 5)] = FMA(KP951056516, T1D, T1E) + FNMA(KP587785252, T1G, T1F);
Ci[WS(csi, 10)] = FMA(KP951056516, T1G, KP587785252 * T1D) + T1E + T1F;
T1s = FNMS(KP250000000, T1o, TT);
T1w = KP559016994 * (T18 - T1n);
Cr[WS(csr, 5)] = FMA(KP587785252, T1r, T1s) + FMS(KP951056516, T1v, T1w);
Cr[WS(csr, 10)] = T1w + FMA(KP587785252, T1v, T1s) - (KP951056516 * T1r);
}
}
}
/* Output stage fed by T21 = TS - TR, T2i and the pairs (T22, T23),
 * (T25, T26), (T29, T2a), (T2c, T2d):
 * writes Cr[1], Cr[3], Cr[6], Cr[8], Cr[11] and Ci[1], Ci[3], Ci[6],
 * Ci[8], Ci[11]. */
{
E T21, T2z, T2L, T2K, T2M, T2F, T2P, T2C, T2Q, T2l, T2o, T2p, T2w, T2u, T28;
E T2f, T2g, T2s, T2h;
T21 = TS - TR;
{
E T2x, T2y, T2I, T2J;
T2x = FNMS(KP844327925, T29, KP1_071653589 * T2a);
T2y = FNMS(KP125581039, T2d, KP998026728 * T2c);
T2z = T2x + T2y;
T2L = T2y - T2x;
T2I = FNMS(KP481753674, T22, KP1_752613360 * T23);
T2J = FMA(KP904827052, T25, KP851558583 * T26);
T2K = T2I + T2J;
T2M = T2I - T2J;
}
{
E T2D, T2E, T2A, T2B;
T2D = FMA(KP535826794, T29, KP1_688655851 * T2a);
T2E = FMA(KP062790519, T2c, KP1_996053456 * T2d);
T2F = T2D + T2E;
T2P = T2E - T2D;
T2A = FMA(KP876306680, T22, KP963507348 * T23);
T2B = FNMS(KP425779291, T25, KP1_809654104 * T26);
T2C = T2A + T2B;
T2Q = T2A - T2B;
}
{
E T2j, T2k, T2m, T2n;
T2j = FNMS(KP125333233, T25, KP1_984229402 * T26);
T2k = FMA(KP684547105, T22, KP1_457937254 * T23);
T2l = T2j - T2k;
T2m = FNMS(KP770513242, T2c, KP1_274847979 * T2d);
T2n = FMA(KP998026728, T29, KP125581039 * T2a);
T2o = T2m - T2n;
T2p = T2l + T2o;
T2w = T2k + T2j;
T2u = T2n + T2m;
}
{
E T24, T27, T2b, T2e;
T24 = FNMS(KP1_369094211, T23, KP728968627 * T22);
T27 = FMA(KP992114701, T25, KP250666467 * T26);
T28 = T24 - T27;
T2b = FNMS(KP1_996053456, T2a, KP062790519 * T29);
T2e = FMA(KP637423989, T2c, KP1_541026485 * T2d);
T2f = T2b - T2e;
T2g = T28 + T2f;
T2s = T24 + T27;
T2h = T2b + T2e;
}
{
E T2H, T2G, T2O, T2N;
Cr[WS(csr, 1)] = T21 + T2g;
Ci[WS(csi, 1)] = T2p - T2i;
T2H = KP559016994 * (T2C - T2F);
T2G = FNMS(KP250000000, T2C + T2F, T21);
Cr[WS(csr, 8)] = FMA(KP951056516, T2z, T2G) + FNMA(KP587785252, T2K, T2H);
Cr[WS(csr, 3)] = FMA(KP951056516, T2K, KP587785252 * T2z) + T2G + T2H;
T2O = KP559016994 * (T2M + T2L);
T2N = FMA(KP250000000, T2L - T2M, T2i);
Ci[WS(csi, 3)] = T2N + FMA(KP587785252, T2P, T2O) - (KP951056516 * T2Q);
Ci[WS(csi, 8)] = FMA(KP587785252, T2Q, T2N) + FMS(KP951056516, T2P, T2O);
{
E T2t, T2v, T2q, T2r;
T2t = FNMS(KP250000000, T2g, T21);
T2v = KP559016994 * (T28 - T2f);
Cr[WS(csr, 6)] = FMA(KP951056516, T2u, T2t) + FNMA(KP587785252, T2w, T2v);
Cr[WS(csr, 11)] = FMA(KP951056516, T2w, T2v) + FMA(KP587785252, T2u, T2t);
T2q = KP250000000 * T2p;
T2r = KP559016994 * (T2l - T2o);
Ci[WS(csi, 6)] = FMS(KP951056516, T2h, T2i + T2q) + FNMA(KP587785252, T2s, T2r);
Ci[WS(csi, 11)] = FMA(KP951056516, T2s, KP587785252 * T2h) + T2r - (T2i + T2q);
}
}
}
}
}
}
/*
 * Descriptor handed to the planner for this variant of the codelet:
 * size 25, the name "r2cfII_25", the count group { 126, 61, 87, 0 } and a
 * pointer to GENUS.  The first three counts equal the additions /
 * multiplications / fused multiply-add figures quoted in the generator
 * comment that precedes this variant.
 */
static const kr2c_desc desc = {
     25,
     "r2cfII_25",
     { 126, 61, 87, 0 },
     &GENUS
};

/* Passes r2cfII_25 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_25)(planner *p)
{
     X(kr2c_register)(p, r2cfII_25, &desc);
}
#endif

View File

@@ -0,0 +1,96 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cfII_3 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 4 FP additions, 2 FP multiplications,
* (or, 3 additions, 1 multiplication, 1 fused multiply/add),
* 7 stack variables, 2 constants, and 6 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_3 -- size-3 kernel, variant generated with the -fma flag.
 *
 * Every iteration reads R0[0], R1[0] and R0[WS(rs, 1)], then writes Ci[0],
 * Cr[0] and Cr[WS(csr, 1)].  The loop runs v times (not at all when
 * v <= 0); R0/R1 advance by ivs and Cr/Ci by ovs between iterations.
 * csi is only passed to MAKE_VOLATILE_STRIDE.
 */
static void r2cfII_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
               /* All three loads happen before any store. */
               E first = R0[0];
               E middle = R1[0];
               E last = R0[WS(rs, 1)];
               E delta = last - middle;

               Ci[0] = -(KP866025403 * (middle + last));
               Cr[0] = FNMS(KP500000000, delta, first);
               Cr[WS(csr, 1)] = first + delta;
          }
     }
}
/*
 * Descriptor handed to the planner for this variant of the codelet:
 * size 3, the name "r2cfII_3", the count group { 3, 1, 1, 0 } and a pointer
 * to GENUS.  The first three counts equal the additions / multiplications /
 * fused multiply-add figures quoted in the generator comment that precedes
 * this variant.
 */
static const kr2c_desc desc = {
     3,
     "r2cfII_3",
     { 3, 1, 1, 0 },
     &GENUS
};

/* Passes r2cfII_3 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_3)(planner *p)
{
     X(kr2c_register)(p, r2cfII_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cfII_3 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 4 FP additions, 2 FP multiplications,
* (or, 3 additions, 1 multiplication, 1 fused multiply/add),
* 7 stack variables, 2 constants, and 6 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_3 -- size-3 kernel, variant generated without the -fma flag.
 *
 * Every iteration reads R0[0], R1[0] and R0[WS(rs, 1)], then writes
 * Cr[WS(csr, 1)], Ci[0] and Cr[0].  The loop runs v times (not at all when
 * v <= 0); R0/R1 advance by ivs and Cr/Ci by ovs between iterations.
 * csi is only passed to MAKE_VOLATILE_STRIDE.
 */
static void r2cfII_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
               /* All three loads happen before any store. */
               E first = R0[0];
               E middle = R1[0];
               E last = R0[WS(rs, 1)];
               E delta = middle - last;

               Cr[WS(csr, 1)] = first - delta;
               Ci[0] = -(KP866025403 * (middle + last));
               Cr[0] = FMA(KP500000000, delta, first);
          }
     }
}
/*
 * Descriptor handed to the planner for this variant of the codelet:
 * size 3, the name "r2cfII_3", the count group { 3, 1, 1, 0 } and a pointer
 * to GENUS.  The first three counts equal the additions / multiplications /
 * fused multiply-add figures quoted in the generator comment that precedes
 * this variant.
 */
static const kr2c_desc desc = {
     3,
     "r2cfII_3",
     { 3, 1, 1, 0 },
     &GENUS
};

/* Passes r2cfII_3 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_3)(planner *p)
{
     X(kr2c_register)(p, r2cfII_3, &desc);
}
#endif

View File

@@ -0,0 +1,686 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:25 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cfII_32 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 174 FP additions, 128 FP multiplications,
* (or, 46 additions, 0 multiplications, 128 fused multiply/add),
* 62 stack variables, 15 constants, and 64 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_32 -- size-32 straight-line kernel, variant generated with the
 * -fma flag (every multiplication appears inside an FMA/FMS/FNMS form).
 *
 * Each of the v iterations reads 32 real values, R0[WS(rs, 0..15)] and
 * R1[WS(rs, 0..15)], and writes 32 results, Cr[WS(csr, 0..15)] and
 * Ci[WS(csi, 0..15)].  All loads of an iteration precede its stores.
 *
 *   R0, R1    input arrays; both advance by ivs after every iteration
 *   Cr, Ci    output arrays; both advance by ovs after every iteration
 *   rs        stride handed to WS() when indexing R0 and R1
 *   csr, csi  strides handed to WS() when indexing Cr and Ci
 *   v         iteration count (the loop body never runs when v <= 0)
 *
 * NOTE(review): the name and the generator's "-dft-II" flag suggest a
 * real-to-complex transform of the shifted (type-II) kind, with R0/R1
 * holding even/odd input samples -- confirm against rdft/scalar/r2cfII.h.
 */
static void r2cfII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Named numeric constants, introduced through the project's DK macro. */
DK(KP773010453, +0.773010453362736960810906609758469800971041293);
DK(KP820678790, +0.820678790828660330972281985331011598767386482);
DK(KP956940335, +0.956940335732208864935797886980269969482849206);
DK(KP303346683, +0.303346683607342391675883946941299872384187453);
DK(KP995184726, +0.995184726672196886244836953109479921575474869);
DK(KP098491403, +0.098491403357164253077197521291327432293052451);
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
DK(KP881921264, +0.881921264348355029712756863660388349508442621);
DK(KP534511135, +0.534511135950791641089685961295362908582039528);
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP198912367, +0.198912367379658006911597622644676228597850501);
DK(KP668178637, +0.668178637919298919997757686523080761552472251);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* i counts down from v; the increment clause also steps the four data
 * pointers and applies MAKE_VOLATILE_STRIDE (project macro, semantics not
 * visible here) to each stride. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
/* Intermediates shared between the load groups and the output stages. */
E T5, T2B, T1z, T2n, Tc, T2C, T1C, T2o, Tm, T1l, T1J, T27, Tv, T1k, T1G;
E T26, T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d, TK, T1o, T1R, T2b, TR, T1p;
E T1O, T2a;
/* Load group: R0[0], R0[8], R0[4], R0[12] -> T5, T2B, T1z, T2n. */
{
E T1, T2l, T4, T2m, T2, T3;
T1 = R0[0];
T2l = R0[WS(rs, 8)];
T2 = R0[WS(rs, 4)];
T3 = R0[WS(rs, 12)];
T4 = T2 - T3;
T2m = T2 + T3;
T5 = FNMS(KP707106781, T4, T1);
T2B = FNMS(KP707106781, T2m, T2l);
T1z = FMA(KP707106781, T4, T1);
T2n = FMA(KP707106781, T2m, T2l);
}
/* Load group: R0[10], R0[2], R0[6], R0[14] -> Tc, T2C, T1C, T2o. */
{
E T8, T1A, Tb, T1B;
{
E T6, T7, T9, Ta;
T6 = R0[WS(rs, 10)];
T7 = R0[WS(rs, 2)];
T8 = FMA(KP414213562, T7, T6);
T1A = FNMS(KP414213562, T6, T7);
T9 = R0[WS(rs, 6)];
Ta = R0[WS(rs, 14)];
Tb = FMA(KP414213562, Ta, T9);
T1B = FMS(KP414213562, T9, Ta);
}
Tc = T8 - Tb;
T2C = T1B - T1A;
T1C = T1A + T1B;
T2o = T8 + Tb;
}
/* Load group: R0[7], R0[15], R0[3], R0[11] -> Tm, T1l, T1J, T27. */
{
E Te, Tj, Th, Tk, Tf, Tg;
Te = R0[WS(rs, 7)];
Tj = R0[WS(rs, 15)];
Tf = R0[WS(rs, 3)];
Tg = R0[WS(rs, 11)];
Th = Tf + Tg;
Tk = Tg - Tf;
{
E Ti, Tl, T1H, T1I;
Ti = FNMS(KP707106781, Th, Te);
Tl = FNMS(KP707106781, Tk, Tj);
Tm = FNMS(KP668178637, Tl, Ti);
T1l = FMA(KP668178637, Ti, Tl);
T1H = FMA(KP707106781, Th, Te);
T1I = FMA(KP707106781, Tk, Tj);
T1J = FMA(KP198912367, T1I, T1H);
T27 = FNMS(KP198912367, T1H, T1I);
}
}
/* Load group: R0[9], R0[1], R0[5], R0[13] -> Tv, T1k, T1G, T26. */
{
E Tn, Ts, Tq, Tt, To, Tp;
Tn = R0[WS(rs, 9)];
Ts = R0[WS(rs, 1)];
To = R0[WS(rs, 5)];
Tp = R0[WS(rs, 13)];
Tq = To + Tp;
Tt = To - Tp;
{
E Tr, Tu, T1E, T1F;
Tr = FNMS(KP707106781, Tq, Tn);
Tu = FNMS(KP707106781, Tt, Ts);
Tv = FNMS(KP668178637, Tu, Tr);
T1k = FMA(KP668178637, Tr, Tu);
T1E = FMA(KP707106781, Tq, Tn);
T1F = FMA(KP707106781, Tt, Ts);
T1G = FMA(KP198912367, T1F, T1E);
T26 = FNMS(KP198912367, T1E, T1F);
}
}
/* Load group: R1[15], R1[7], R1[3], R1[11], R1[9], R1[1], R1[5], R1[13]
 * -> T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d. */
{
E TT, T16, TW, T17, T10, T1a, T13, T19, TU, TV;
TT = R1[WS(rs, 15)];
T16 = R1[WS(rs, 7)];
TU = R1[WS(rs, 3)];
TV = R1[WS(rs, 11)];
TW = TU - TV;
T17 = TU + TV;
{
E TY, TZ, T11, T12;
TY = R1[WS(rs, 9)];
TZ = R1[WS(rs, 1)];
T10 = FMA(KP414213562, TZ, TY);
T1a = FNMS(KP414213562, TY, TZ);
T11 = R1[WS(rs, 5)];
T12 = R1[WS(rs, 13)];
T13 = FMA(KP414213562, T12, T11);
T19 = FMS(KP414213562, T11, T12);
}
{
E TX, T14, T1W, T1X;
TX = FMA(KP707106781, TW, TT);
T14 = T10 - T13;
T15 = FMA(KP923879532, T14, TX);
T1r = FNMS(KP923879532, T14, TX);
T1W = FMA(KP707106781, T17, T16);
T1X = T10 + T13;
T1Y = FNMS(KP923879532, T1X, T1W);
T2e = FMA(KP923879532, T1X, T1W);
}
{
E T18, T1b, T1T, T1U;
T18 = FNMS(KP707106781, T17, T16);
T1b = T19 - T1a;
T1c = FNMS(KP923879532, T1b, T18);
T1s = FMA(KP923879532, T1b, T18);
T1T = FMS(KP707106781, TW, TT);
T1U = T1a + T19;
T1V = FNMS(KP923879532, T1U, T1T);
T2d = FMA(KP923879532, T1U, T1T);
}
}
/* Load group: R1[0], R1[8], R1[4], R1[12], R1[10], R1[2], R1[6], R1[14]
 * -> TK, T1o, T1R, T2b, TR, T1p, T1O, T2a. */
{
E Ty, TL, TB, TM, TF, TP, TI, TO, Tz, TA;
Ty = R1[0];
TL = R1[WS(rs, 8)];
Tz = R1[WS(rs, 4)];
TA = R1[WS(rs, 12)];
TB = Tz - TA;
TM = Tz + TA;
{
E TD, TE, TG, TH;
TD = R1[WS(rs, 10)];
TE = R1[WS(rs, 2)];
TF = FMA(KP414213562, TE, TD);
TP = FNMS(KP414213562, TD, TE);
TG = R1[WS(rs, 6)];
TH = R1[WS(rs, 14)];
TI = FMA(KP414213562, TH, TG);
TO = FMS(KP414213562, TG, TH);
}
{
E TC, TJ, T1P, T1Q;
TC = FNMS(KP707106781, TB, Ty);
TJ = TF - TI;
TK = FNMS(KP923879532, TJ, TC);
T1o = FMA(KP923879532, TJ, TC);
T1P = FMA(KP707106781, TM, TL);
T1Q = TF + TI;
T1R = FNMS(KP923879532, T1Q, T1P);
T2b = FMA(KP923879532, T1Q, T1P);
}
{
E TN, TQ, T1M, T1N;
TN = FNMS(KP707106781, TM, TL);
TQ = TO - TP;
TR = FNMS(KP923879532, TQ, TN);
T1p = FMA(KP923879532, TQ, TN);
T1M = FMA(KP707106781, TB, Ty);
T1N = TP + TO;
T1O = FNMS(KP923879532, T1N, T1M);
T2a = FMA(KP923879532, T1N, T1M);
}
}
/* Output stage: Cr/Ci at indices 13, 2, 10 and 5. */
{
E Tx, T1f, T2L, T2N, T1e, T2O, T1i, T2M;
{
E Td, Tw, T2J, T2K;
Td = FNMS(KP923879532, Tc, T5);
Tw = Tm - Tv;
Tx = FMA(KP831469612, Tw, Td);
T1f = FNMS(KP831469612, Tw, Td);
T2J = FNMS(KP923879532, T2C, T2B);
T2K = T1k + T1l;
T2L = FMA(KP831469612, T2K, T2J);
T2N = FNMS(KP831469612, T2K, T2J);
}
{
E TS, T1d, T1g, T1h;
TS = FNMS(KP534511135, TR, TK);
T1d = FNMS(KP534511135, T1c, T15);
T1e = TS - T1d;
T2O = TS + T1d;
T1g = FMA(KP534511135, TK, TR);
T1h = FMA(KP534511135, T15, T1c);
T1i = T1g - T1h;
T2M = T1g + T1h;
}
Cr[WS(csr, 13)] = FNMS(KP881921264, T1e, Tx);
Ci[WS(csi, 13)] = FNMS(KP881921264, T2M, T2L);
Cr[WS(csr, 2)] = FMA(KP881921264, T1e, Tx);
Ci[WS(csi, 2)] = -(FMA(KP881921264, T2M, T2L));
Cr[WS(csr, 10)] = FNMS(KP881921264, T1i, T1f);
Ci[WS(csi, 10)] = -(FMA(KP881921264, T2O, T2N));
Cr[WS(csr, 5)] = FMA(KP881921264, T1i, T1f);
Ci[WS(csi, 5)] = FNMS(KP881921264, T2O, T2N);
}
/* Output stage: Cr/Ci at indices 15, 0, 8 and 7. */
{
E T29, T2h, T2r, T2t, T2g, T2u, T2k, T2s;
{
E T25, T28, T2p, T2q;
T25 = FMA(KP923879532, T1C, T1z);
T28 = T26 - T27;
T29 = FMA(KP980785280, T28, T25);
T2h = FNMS(KP980785280, T28, T25);
T2p = FMA(KP923879532, T2o, T2n);
T2q = T1G + T1J;
T2r = FMA(KP980785280, T2q, T2p);
T2t = FNMS(KP980785280, T2q, T2p);
}
{
E T2c, T2f, T2i, T2j;
T2c = FNMS(KP098491403, T2b, T2a);
T2f = FMA(KP098491403, T2e, T2d);
T2g = T2c + T2f;
T2u = T2f - T2c;
T2i = FMA(KP098491403, T2a, T2b);
T2j = FNMS(KP098491403, T2d, T2e);
T2k = T2i - T2j;
T2s = T2i + T2j;
}
Cr[WS(csr, 15)] = FNMS(KP995184726, T2g, T29);
Ci[WS(csi, 15)] = FNMS(KP995184726, T2s, T2r);
Cr[0] = FMA(KP995184726, T2g, T29);
Ci[0] = -(FMA(KP995184726, T2s, T2r));
Cr[WS(csr, 8)] = FNMS(KP995184726, T2k, T2h);
Ci[WS(csi, 8)] = FMS(KP995184726, T2u, T2t);
Cr[WS(csr, 7)] = FMA(KP995184726, T2k, T2h);
Ci[WS(csi, 7)] = FMA(KP995184726, T2u, T2t);
}
/* Output stage: Cr/Ci at indices 14, 1, 9 and 6. */
{
E T1n, T1v, T2F, T2H, T1u, T2I, T1y, T2G;
{
E T1j, T1m, T2D, T2E;
T1j = FMA(KP923879532, Tc, T5);
T1m = T1k - T1l;
T1n = FMA(KP831469612, T1m, T1j);
T1v = FNMS(KP831469612, T1m, T1j);
T2D = FMA(KP923879532, T2C, T2B);
T2E = Tv + Tm;
T2F = FMA(KP831469612, T2E, T2D);
T2H = FNMS(KP831469612, T2E, T2D);
}
{
E T1q, T1t, T1w, T1x;
T1q = FMA(KP303346683, T1p, T1o);
T1t = FMA(KP303346683, T1s, T1r);
T1u = T1q - T1t;
T2I = T1q + T1t;
T1w = FNMS(KP303346683, T1r, T1s);
T1x = FNMS(KP303346683, T1o, T1p);
T1y = T1w - T1x;
T2G = T1x + T1w;
}
Cr[WS(csr, 14)] = FNMS(KP956940335, T1u, T1n);
Ci[WS(csi, 14)] = FMS(KP956940335, T2G, T2F);
Cr[WS(csr, 1)] = FMA(KP956940335, T1u, T1n);
Ci[WS(csi, 1)] = FMA(KP956940335, T2G, T2F);
Cr[WS(csr, 9)] = FNMS(KP956940335, T1y, T1v);
Ci[WS(csi, 9)] = FNMS(KP956940335, T2I, T2H);
Cr[WS(csr, 6)] = FMA(KP956940335, T1y, T1v);
Ci[WS(csi, 6)] = -(FMA(KP956940335, T2I, T2H));
}
/* Output stage: Cr/Ci at indices 12, 3, 11 and 4. */
{
E T1L, T21, T2x, T2z, T20, T2A, T24, T2y;
{
E T1D, T1K, T2v, T2w;
T1D = FNMS(KP923879532, T1C, T1z);
T1K = T1G - T1J;
T1L = FMA(KP980785280, T1K, T1D);
T21 = FNMS(KP980785280, T1K, T1D);
T2v = FNMS(KP923879532, T2o, T2n);
T2w = T26 + T27;
T2x = FNMS(KP980785280, T2w, T2v);
T2z = FMA(KP980785280, T2w, T2v);
}
{
E T1S, T1Z, T22, T23;
T1S = FMA(KP820678790, T1R, T1O);
T1Z = FNMS(KP820678790, T1Y, T1V);
T20 = T1S + T1Z;
T2A = T1Z - T1S;
T22 = FMA(KP820678790, T1V, T1Y);
T23 = FNMS(KP820678790, T1O, T1R);
T24 = T22 - T23;
T2y = T23 + T22;
}
Cr[WS(csr, 12)] = FNMS(KP773010453, T20, T1L);
Ci[WS(csi, 12)] = FMS(KP773010453, T2y, T2x);
Cr[WS(csr, 3)] = FMA(KP773010453, T20, T1L);
Ci[WS(csi, 3)] = FMA(KP773010453, T2y, T2x);
Cr[WS(csr, 11)] = FNMS(KP773010453, T24, T21);
Ci[WS(csi, 11)] = FMA(KP773010453, T2A, T2z);
Cr[WS(csr, 4)] = FMA(KP773010453, T24, T21);
Ci[WS(csi, 4)] = FMS(KP773010453, T2A, T2z);
}
}
}
}
/*
 * Descriptor handed to the planner for this variant of the codelet:
 * size 32, the name "r2cfII_32", the count group { 46, 0, 128, 0 } and a
 * pointer to GENUS.  The first three counts equal the additions /
 * multiplications / fused multiply-add figures quoted in the generator
 * comment that precedes this variant.
 */
static const kr2c_desc desc = {
     32,
     "r2cfII_32",
     { 46, 0, 128, 0 },
     &GENUS
};

/* Passes r2cfII_32 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_32)(planner *p)
{
     X(kr2c_register)(p, r2cfII_32, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cfII_32 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 174 FP additions, 82 FP multiplications,
* (or, 138 additions, 46 multiplications, 36 fused multiply/add),
* 62 stack variables, 15 constants, and 64 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_32 -- size-32 straight-line kernel, variant generated without the
 * -fma flag.
 *
 * Each of the v iterations reads 32 real values, R0[WS(rs, 0..15)] and
 * R1[WS(rs, 0..15)], and writes 32 results, Cr[WS(csr, 0..15)] and
 * Ci[WS(csi, 0..15)].  All loads of an iteration precede its stores.
 *
 *   R0, R1    input arrays; both advance by ivs after every iteration
 *   Cr, Ci    output arrays; both advance by ovs after every iteration
 *   rs        stride handed to WS() when indexing R0 and R1
 *   csr, csi  strides handed to WS() when indexing Cr and Ci
 *   v         iteration count (the loop body never runs when v <= 0)
 *
 * NOTE(review): the name and the generator's "-dft-II" flag suggest a
 * real-to-complex transform of the shifted (type-II) kind, with R0/R1
 * holding even/odd input samples -- confirm against rdft/scalar/r2cfII.h.
 */
static void r2cfII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Named numeric constants, introduced through the project's DK macro. */
DK(KP471396736, +0.471396736825997648556387625905254377657460319);
DK(KP881921264, +0.881921264348355029712756863660388349508442621);
DK(KP634393284, +0.634393284163645498215171613225493370675687095);
DK(KP773010453, +0.773010453362736960810906609758469800971041293);
DK(KP290284677, +0.290284677254462367636192375817395274691476278);
DK(KP956940335, +0.956940335732208864935797886980269969482849206);
DK(KP995184726, +0.995184726672196886244836953109479921575474869);
DK(KP098017140, +0.098017140329560601994195563888641845861136673);
DK(KP555570233, +0.555570233019602224742830813948532874374937191);
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
DK(KP195090322, +0.195090322016128267848284868477022240927691618);
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* i counts down from v; the increment clause also steps the four data
 * pointers and applies MAKE_VOLATILE_STRIDE (project macro, semantics not
 * visible here) to each stride. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
/* Intermediates shared between the load groups and the output stages. */
E T5, T2D, T1z, T2q, Tc, T2C, T1C, T2n, Tm, T1k, T1J, T26, Tv, T1l, T1G;
E T27, T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d, TK, T1o, T1R, T2b, TR, T1p;
E T1O, T2a;
/* Load group: R0[0], R0[8], R0[4], R0[12] -> T5, T2D, T1z, T2q. */
{
E T1, T2p, T4, T2o, T2, T3;
T1 = R0[0];
T2p = R0[WS(rs, 8)];
T2 = R0[WS(rs, 4)];
T3 = R0[WS(rs, 12)];
T4 = KP707106781 * (T2 - T3);
T2o = KP707106781 * (T2 + T3);
T5 = T1 + T4;
T2D = T2p - T2o;
T1z = T1 - T4;
T2q = T2o + T2p;
}
/* Load group: R0[2], R0[10], R0[6], R0[14] -> Tc, T2C, T1C, T2n. */
{
E T8, T1A, Tb, T1B;
{
E T6, T7, T9, Ta;
T6 = R0[WS(rs, 2)];
T7 = R0[WS(rs, 10)];
T8 = FNMS(KP382683432, T7, KP923879532 * T6);
T1A = FMA(KP382683432, T6, KP923879532 * T7);
T9 = R0[WS(rs, 6)];
Ta = R0[WS(rs, 14)];
Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
T1B = FMA(KP923879532, T9, KP382683432 * Ta);
}
Tc = T8 + Tb;
T2C = Tb - T8;
T1C = T1A - T1B;
T2n = T1A + T1B;
}
/* Load group: R0[1], R0[9], R0[5], R0[13] -> Tm, T1k, T1J, T26. */
{
E Te, Tk, Th, Tj, Tf, Tg;
Te = R0[WS(rs, 1)];
Tk = R0[WS(rs, 9)];
Tf = R0[WS(rs, 5)];
Tg = R0[WS(rs, 13)];
Th = KP707106781 * (Tf - Tg);
Tj = KP707106781 * (Tf + Tg);
{
E Ti, Tl, T1H, T1I;
Ti = Te + Th;
Tl = Tj + Tk;
Tm = FNMS(KP195090322, Tl, KP980785280 * Ti);
T1k = FMA(KP195090322, Ti, KP980785280 * Tl);
T1H = Tk - Tj;
T1I = Te - Th;
T1J = FNMS(KP555570233, T1I, KP831469612 * T1H);
T26 = FMA(KP831469612, T1I, KP555570233 * T1H);
}
}
/* Load group: R0[15], R0[7], R0[3], R0[11] -> Tv, T1l, T1G, T27. */
{
E Tq, Tt, Tp, Ts, Tn, To;
Tq = R0[WS(rs, 15)];
Tt = R0[WS(rs, 7)];
Tn = R0[WS(rs, 3)];
To = R0[WS(rs, 11)];
Tp = KP707106781 * (Tn - To);
Ts = KP707106781 * (Tn + To);
{
E Tr, Tu, T1E, T1F;
Tr = Tp - Tq;
Tu = Ts + Tt;
Tv = FMA(KP980785280, Tr, KP195090322 * Tu);
T1l = FNMS(KP980785280, Tu, KP195090322 * Tr);
T1E = Tt - Ts;
T1F = Tp + Tq;
T1G = FNMS(KP555570233, T1F, KP831469612 * T1E);
T27 = FMA(KP831469612, T1F, KP555570233 * T1E);
}
}
/* Load group: R1[15], R1[7], R1[3], R1[11], R1[1], R1[9], R1[5], R1[13]
 * -> T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d. */
{
E TW, T1a, TV, T19, T10, T16, T13, T17, TT, TU;
TW = R1[WS(rs, 15)];
T1a = R1[WS(rs, 7)];
TT = R1[WS(rs, 3)];
TU = R1[WS(rs, 11)];
TV = KP707106781 * (TT - TU);
T19 = KP707106781 * (TT + TU);
{
E TY, TZ, T11, T12;
TY = R1[WS(rs, 1)];
TZ = R1[WS(rs, 9)];
T10 = FNMS(KP382683432, TZ, KP923879532 * TY);
T16 = FMA(KP382683432, TY, KP923879532 * TZ);
T11 = R1[WS(rs, 5)];
T12 = R1[WS(rs, 13)];
T13 = FNMS(KP923879532, T12, KP382683432 * T11);
T17 = FMA(KP923879532, T11, KP382683432 * T12);
}
{
E TX, T14, T1W, T1X;
TX = TV - TW;
T14 = T10 + T13;
T15 = TX + T14;
T1r = TX - T14;
T1W = T13 - T10;
T1X = T1a - T19;
T1Y = T1W - T1X;
T2e = T1W + T1X;
}
{
E T18, T1b, T1T, T1U;
T18 = T16 + T17;
T1b = T19 + T1a;
T1c = T18 + T1b;
T1s = T1b - T18;
T1T = TV + TW;
T1U = T16 - T17;
T1V = T1T + T1U;
T2d = T1U - T1T;
}
}
/* Load group: R1[0], R1[8], R1[4], R1[12], R1[2], R1[10], R1[6], R1[14]
 * -> TK, T1o, T1R, T2b, TR, T1p, T1O, T2a. */
{
E Ty, TP, TB, TO, TF, TL, TI, TM, Tz, TA;
Ty = R1[0];
TP = R1[WS(rs, 8)];
Tz = R1[WS(rs, 4)];
TA = R1[WS(rs, 12)];
TB = KP707106781 * (Tz - TA);
TO = KP707106781 * (Tz + TA);
{
E TD, TE, TG, TH;
TD = R1[WS(rs, 2)];
TE = R1[WS(rs, 10)];
TF = FNMS(KP382683432, TE, KP923879532 * TD);
TL = FMA(KP382683432, TD, KP923879532 * TE);
TG = R1[WS(rs, 6)];
TH = R1[WS(rs, 14)];
TI = FNMS(KP923879532, TH, KP382683432 * TG);
TM = FMA(KP923879532, TG, KP382683432 * TH);
}
{
E TC, TJ, T1P, T1Q;
TC = Ty + TB;
TJ = TF + TI;
TK = TC + TJ;
T1o = TC - TJ;
T1P = TI - TF;
T1Q = TP - TO;
T1R = T1P - T1Q;
T2b = T1P + T1Q;
}
{
E TN, TQ, T1M, T1N;
TN = TL + TM;
TQ = TO + TP;
TR = TN + TQ;
T1p = TQ - TN;
T1M = Ty - TB;
T1N = TL - TM;
T1O = T1M - T1N;
T2a = T1M + T1N;
}
}
/* Output stage: Cr/Ci at indices 8, 7, 15 and 0. */
{
E Tx, T1f, T2s, T2u, T1e, T2l, T1i, T2t;
{
E Td, Tw, T2m, T2r;
Td = T5 + Tc;
Tw = Tm + Tv;
Tx = Td - Tw;
T1f = Td + Tw;
T2m = T1l - T1k;
T2r = T2n + T2q;
T2s = T2m - T2r;
T2u = T2m + T2r;
}
{
E TS, T1d, T1g, T1h;
TS = FMA(KP098017140, TK, KP995184726 * TR);
T1d = FNMS(KP995184726, T1c, KP098017140 * T15);
T1e = TS + T1d;
T2l = T1d - TS;
T1g = FNMS(KP098017140, TR, KP995184726 * TK);
T1h = FMA(KP995184726, T15, KP098017140 * T1c);
T1i = T1g + T1h;
T2t = T1h - T1g;
}
Cr[WS(csr, 8)] = Tx - T1e;
Ci[WS(csi, 8)] = T2t - T2u;
Cr[WS(csr, 7)] = Tx + T1e;
Ci[WS(csi, 7)] = T2t + T2u;
Cr[WS(csr, 15)] = T1f - T1i;
Ci[WS(csi, 15)] = T2l - T2s;
Cr[0] = T1f + T1i;
Ci[0] = T2l + T2s;
}
/* Output stage: Cr/Ci at indices 14, 1, 9 and 6. */
{
E T29, T2h, T2M, T2O, T2g, T2J, T2k, T2N;
{
E T25, T28, T2K, T2L;
T25 = T1z + T1C;
T28 = T26 - T27;
T29 = T25 + T28;
T2h = T25 - T28;
T2K = T1J + T1G;
T2L = T2C + T2D;
T2M = T2K - T2L;
T2O = T2K + T2L;
}
{
E T2c, T2f, T2i, T2j;
T2c = FMA(KP956940335, T2a, KP290284677 * T2b);
T2f = FNMS(KP290284677, T2e, KP956940335 * T2d);
T2g = T2c + T2f;
T2J = T2f - T2c;
T2i = FMA(KP290284677, T2d, KP956940335 * T2e);
T2j = FNMS(KP290284677, T2a, KP956940335 * T2b);
T2k = T2i - T2j;
T2N = T2j + T2i;
}
Cr[WS(csr, 14)] = T29 - T2g;
Ci[WS(csi, 14)] = T2N - T2O;
Cr[WS(csr, 1)] = T29 + T2g;
Ci[WS(csi, 1)] = T2N + T2O;
Cr[WS(csr, 9)] = T2h - T2k;
Ci[WS(csi, 9)] = T2J - T2M;
Cr[WS(csr, 6)] = T2h + T2k;
Ci[WS(csi, 6)] = T2J + T2M;
}
/* Output stage: Cr/Ci at indices 12, 3, 11 and 4. */
{
E T1n, T1v, T2y, T2A, T1u, T2v, T1y, T2z;
{
E T1j, T1m, T2w, T2x;
T1j = T5 - Tc;
T1m = T1k + T1l;
T1n = T1j + T1m;
T1v = T1j - T1m;
T2w = Tv - Tm;
T2x = T2q - T2n;
T2y = T2w - T2x;
T2A = T2w + T2x;
}
{
E T1q, T1t, T1w, T1x;
T1q = FMA(KP773010453, T1o, KP634393284 * T1p);
T1t = FNMS(KP634393284, T1s, KP773010453 * T1r);
T1u = T1q + T1t;
T2v = T1t - T1q;
T1w = FMA(KP634393284, T1r, KP773010453 * T1s);
T1x = FNMS(KP634393284, T1o, KP773010453 * T1p);
T1y = T1w - T1x;
T2z = T1x + T1w;
}
Cr[WS(csr, 12)] = T1n - T1u;
Ci[WS(csi, 12)] = T2z - T2A;
Cr[WS(csr, 3)] = T1n + T1u;
Ci[WS(csi, 3)] = T2z + T2A;
Cr[WS(csr, 11)] = T1v - T1y;
Ci[WS(csi, 11)] = T2v - T2y;
Cr[WS(csr, 4)] = T1v + T1y;
Ci[WS(csi, 4)] = T2v + T2y;
}
/* Output stage: Cr/Ci at indices 13, 2, 10 and 5. */
{
E T1L, T21, T2G, T2I, T20, T2H, T24, T2B;
{
E T1D, T1K, T2E, T2F;
T1D = T1z - T1C;
T1K = T1G - T1J;
T1L = T1D + T1K;
T21 = T1D - T1K;
T2E = T2C - T2D;
T2F = T26 + T27;
T2G = T2E - T2F;
T2I = T2F + T2E;
}
{
E T1S, T1Z, T22, T23;
T1S = FMA(KP881921264, T1O, KP471396736 * T1R);
T1Z = FMA(KP881921264, T1V, KP471396736 * T1Y);
T20 = T1S - T1Z;
T2H = T1S + T1Z;
T22 = FNMS(KP471396736, T1V, KP881921264 * T1Y);
T23 = FNMS(KP471396736, T1O, KP881921264 * T1R);
T24 = T22 - T23;
T2B = T23 + T22;
}
Cr[WS(csr, 13)] = T1L - T20;
Ci[WS(csi, 13)] = T2B - T2G;
Cr[WS(csr, 2)] = T1L + T20;
Ci[WS(csi, 2)] = T2B + T2G;
Cr[WS(csr, 10)] = T21 - T24;
Ci[WS(csi, 10)] = T2I - T2H;
Cr[WS(csr, 5)] = T21 + T24;
Ci[WS(csi, 5)] = -(T2H + T2I);
}
}
}
}
/*
 * Descriptor handed to the planner for this variant of the codelet:
 * size 32, the name "r2cfII_32", the count group { 138, 46, 36, 0 } and a
 * pointer to GENUS.  The first three counts equal the additions /
 * multiplications / fused multiply-add figures quoted in the generator
 * comment that precedes this variant.
 */
static const kr2c_desc desc = {
     32,
     "r2cfII_32",
     { 138, 46, 36, 0 },
     &GENUS
};

/* Passes r2cfII_32 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_32)(planner *p)
{
     X(kr2c_register)(p, r2cfII_32, &desc);
}
#endif

View File

@@ -0,0 +1,100 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cfII_4 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 2 additions, 0 multiplications, 4 fused multiply/add),
* 8 stack variables, 1 constants, and 8 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_4 -- FMA-preferring flavour of the size-4 codelet that genfft
 * emitted with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1)], R1[0], R1[WS(rs, 1)].
 * Writes Cr[0], Cr[WS(csr, 1)], Ci[0], Ci[WS(csi, 1)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(16, rs),
               MAKE_VOLATILE_STRIDE(16, csr),
               MAKE_VOLATILE_STRIDE(16, csi)) {
               E a0, a1, b0, b1, b_diff, b_sum;

               /* gather the four inputs */
               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];

               /* only the R1 pair is combined before scaling */
               b_diff = b0 - b1;
               b_sum = b0 + b1;

               Cr[WS(csr, 1)] = FNMS(KP707106781, b_diff, a0);
               Ci[WS(csi, 1)] = FNMS(KP707106781, b_sum, a1);
               Cr[0] = FMA(KP707106781, b_diff, a0);
               Ci[0] = -(FMA(KP707106781, b_sum, a1));
          }
     }
}
/*
 * Registration record for the FMA variant of r2cfII_4.  The leading 4
 * matches the generator's "-n 4"; { 2, 0, 4, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 4, "r2cfII_4", { 2, 0, 4, 0 }, &GENUS };
/* Passes r2cfII_4 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_4) (planner *p) { X(kr2c_register) (p, r2cfII_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cfII_4 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 6 FP additions, 2 FP multiplications,
* (or, 6 additions, 2 multiplications, 0 fused multiply/add),
* 8 stack variables, 1 constants, and 8 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_4 -- flavour compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined; size-4 codelet emitted by
 * genfft with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1)], R1[0], R1[WS(rs, 1)].
 * Writes Cr[0], Cr[WS(csr, 1)], Ci[0], Ci[WS(csi, 1)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.
 */
static void r2cfII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(16, rs),
               MAKE_VOLATILE_STRIDE(16, csr),
               MAKE_VOLATILE_STRIDE(16, csi)) {
               E a0, a1, b0, b1, scaled_diff, scaled_sum;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];

               /* scale the R1 difference and sum once, reuse for both outputs */
               scaled_diff = KP707106781 * (b0 - b1);
               scaled_sum = KP707106781 * (b0 + b1);

               Cr[WS(csr, 1)] = a0 - scaled_diff;
               Ci[WS(csi, 1)] = a1 - scaled_sum;
               Cr[0] = a0 + scaled_diff;
               Ci[0] = -(scaled_sum + a1);
          }
     }
}
/*
 * Registration record for the non-FMA variant of r2cfII_4.  The leading
 * 4 matches the generator's "-n 4"; { 6, 2, 0, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 4, "r2cfII_4", { 6, 2, 0, 0 }, &GENUS };
/* Passes r2cfII_4 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_4) (planner *p) { X(kr2c_register) (p, r2cfII_4, &desc);
}
#endif

View File

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cfII_5 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 12 FP additions, 7 FP multiplications,
* (or, 7 additions, 2 multiplications, 5 fused multiply/add),
* 17 stack variables, 4 constants, and 10 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_5 -- FMA-preferring flavour of the size-5 codelet that genfft
 * emitted with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1)], R0[WS(rs, 2)], R1[0], R1[WS(rs, 1)].
 * Writes Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)], Ci[0], Ci[WS(csi, 1)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(20, rs),
               MAKE_VOLATILE_STRIDE(20, csr),
               MAKE_VOLATILE_STRIDE(20, csi)) {
               E a0, a1, a2, b0, b1;
               E d1, d2, d_sum, d_diff, s1, s2, base;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               b1 = R1[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];

               /* pair (a1, b1) and pair (a2, b0): differences and sums */
               d1 = a1 - b1;
               d2 = a2 - b0;
               d_sum = d1 + d2;
               s2 = a2 + b0;
               s1 = a1 + b1;

               Cr[WS(csr, 2)] = a0 + d_sum;
               Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP618033988, s1, s2)));
               Ci[0] = -(KP951056516 * (FMA(KP618033988, s2, s1)));

               /* the two remaining real outputs share one base term */
               base = FNMS(KP250000000, d_sum, a0);
               d_diff = d1 - d2;
               Cr[0] = FMA(KP559016994, d_diff, base);
               Cr[WS(csr, 1)] = FNMS(KP559016994, d_diff, base);
          }
     }
}
/*
 * Registration record for the FMA variant of r2cfII_5.  The leading 5
 * matches the generator's "-n 5"; { 7, 2, 5, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 5, "r2cfII_5", { 7, 2, 5, 0 }, &GENUS };
/* Passes r2cfII_5 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_5) (planner *p) { X(kr2c_register) (p, r2cfII_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cfII_5 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 12 FP additions, 6 FP multiplications,
* (or, 9 additions, 3 multiplications, 3 fused multiply/add),
* 17 stack variables, 4 constants, and 10 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_5 -- flavour compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined; size-5 codelet emitted by
 * genfft with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1)], R0[WS(rs, 2)], R1[0], R1[WS(rs, 1)].
 * Writes Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)], Ci[0], Ci[WS(csi, 1)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(20, rs),
               MAKE_VOLATILE_STRIDE(20, csr),
               MAKE_VOLATILE_STRIDE(20, csi)) {
               E a0, a1, a2, b0, b1;
               E d1, d2, d_sum, s1, s2, spread, base;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               b1 = R1[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];

               /* pair (a1, b1) and pair (a2, b0): differences and sums */
               d1 = a1 - b1;
               d2 = a2 - b0;
               d_sum = d1 + d2;
               s2 = a2 + b0;
               s1 = a1 + b1;

               Cr[WS(csr, 2)] = a0 + d_sum;
               Ci[WS(csi, 1)] = FNMS(KP951056516, s2, KP587785252 * s1);
               Ci[0] = -(FMA(KP951056516, s1, KP587785252 * s2));

               /* the two remaining real outputs are base +/- spread */
               spread = KP559016994 * (d1 - d2);
               base = FNMS(KP250000000, d_sum, a0);
               Cr[0] = spread + base;
               Cr[WS(csr, 1)] = base - spread;
          }
     }
}
/*
 * Registration record for the non-FMA variant of r2cfII_5.  The leading
 * 5 matches the generator's "-n 5"; { 9, 3, 3, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 5, "r2cfII_5", { 9, 3, 3, 0 }, &GENUS };
/* Passes r2cfII_5 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_5) (planner *p) { X(kr2c_register) (p, r2cfII_5, &desc);
}
#endif

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cfII_6 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 13 FP additions, 6 FP multiplications,
* (or, 7 additions, 0 multiplications, 6 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_6 -- FMA-preferring flavour of the size-6 codelet that genfft
 * emitted with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1)], R0[WS(rs, 2)] and
 *        R1[0], R1[WS(rs, 1)], R1[WS(rs, 2)].
 * Writes Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)] and
 *        Ci[0], Ci[WS(csi, 1)], Ci[WS(csi, 2)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(24, rs),
               MAKE_VOLATILE_STRIDE(24, csr),
               MAKE_VOLATILE_STRIDE(24, csi)) {
               E a0, a1, a2, b0, b1, b2;
               E a_diff, a_sum, b_diff, b_sum, re_base, im_base;

               a0 = R0[0];
               b1 = R1[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               a1 = R0[WS(rs, 1)];
               a_diff = a1 - a2;
               a_sum = a2 + a1;
               b2 = R1[WS(rs, 2)];
               b0 = R1[0];
               b_diff = b2 - b0;
               b_sum = b2 + b0;

               /* middle outputs need no constants */
               Ci[WS(csi, 1)] = b1 - b_sum;
               Cr[WS(csr, 1)] = a0 + a2 - a1;

               /* outer real outputs: shared base -/+ scaled R1 difference */
               re_base = FMA(KP500000000, a_diff, a0);
               Cr[0] = FNMS(KP866025403, b_diff, re_base);
               Cr[WS(csr, 2)] = FMA(KP866025403, b_diff, re_base);

               /* outer imaginary outputs: shared base combined with scaled R0 sum */
               im_base = FMA(KP500000000, b_sum, b1);
               Ci[0] = -(FMA(KP866025403, a_sum, im_base));
               Ci[WS(csi, 2)] = FMS(KP866025403, a_sum, im_base);
          }
     }
}
/*
 * Registration record for the FMA variant of r2cfII_6.  The leading 6
 * matches the generator's "-n 6"; { 7, 0, 6, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 6, "r2cfII_6", { 7, 0, 6, 0 }, &GENUS };
/* Passes r2cfII_6 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_6) (planner *p) { X(kr2c_register) (p, r2cfII_6, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cfII_6 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 13 FP additions, 4 FP multiplications,
* (or, 11 additions, 2 multiplications, 2 fused multiply/add),
* 14 stack variables, 2 constants, and 12 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_6 -- flavour compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined; size-6 codelet emitted by
 * genfft with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1)], R0[WS(rs, 2)] and
 *        R1[0], R1[WS(rs, 1)], R1[WS(rs, 2)].
 * Writes Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)] and
 *        Ci[0], Ci[WS(csi, 1)], Ci[WS(csi, 2)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(24, rs),
               MAKE_VOLATILE_STRIDE(24, csr),
               MAKE_VOLATILE_STRIDE(24, csi)) {
               E a0, a1, a2, b0, b1, b2;
               E b_term, b_sum, a_term, re_base, im_base;

               b1 = R1[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               b0 = R1[0];
               b_term = KP866025403 * (b2 - b0);
               b_sum = b2 + b0;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               a_term = KP866025403 * (a2 + a1);

               re_base = FMA(KP500000000, a1 - a2, a0);
               Cr[0] = re_base - b_term;

               im_base = FMA(KP500000000, b_sum, b1);
               Ci[0] = -(a_term + im_base);
               Ci[WS(csi, 2)] = a_term - im_base;
               Cr[WS(csr, 2)] = re_base + b_term;

               /* middle outputs need no constants */
               Ci[WS(csi, 1)] = b1 - b_sum;
               Cr[WS(csr, 1)] = a0 + a2 - a1;
          }
     }
}
/*
 * Registration record for the non-FMA variant of r2cfII_6.  The leading
 * 6 matches the generator's "-n 6"; { 11, 2, 2, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 6, "r2cfII_6", { 11, 2, 2, 0 }, &GENUS };
/* Passes r2cfII_6 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_6) (planner *p) { X(kr2c_register) (p, r2cfII_6, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,148 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cfII_7 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 24 FP additions, 18 FP multiplications,
* (or, 9 additions, 3 multiplications, 15 fused multiply/add),
* 23 stack variables, 6 constants, and 14 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_7 -- FMA-preferring flavour of the size-7 codelet that genfft
 * emitted with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1..3)] and R1[0], R1[WS(rs, 1..2)].
 * Writes Cr[0], Cr[WS(csr, 1..3)] and Ci[0], Ci[WS(csi, 1..2)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(28, rs),
               MAKE_VOLATILE_STRIDE(28, csr),
               MAKE_VOLATILE_STRIDE(28, csi)) {
               E a0, a1, a2, a3, b0, b1, b2;
               E d1, d2, d3, s1, s2, s3;
               E pre_r0, pre_r1, pre_r2;
               E acc_r0, acc_r1, acc_r2, acc_i0, acc_i1, acc_i2;

               a0 = R0[0];

               /* three input pairs, each reduced to a difference and a sum */
               a1 = R0[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               d1 = a1 - b2;
               b1 = R1[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               d2 = a2 - b1;
               b0 = R1[0];
               a3 = R0[WS(rs, 3)];
               d3 = a3 - b0;
               s1 = a1 + b2;
               s3 = b0 + a3;

               /* innermost step of each nested real-output chain */
               pre_r2 = FNMS(KP356895867, d1, d3);
               pre_r0 = FNMS(KP356895867, d2, d1);
               pre_r1 = FNMS(KP356895867, d3, d2);
               s2 = b1 + a2;

               acc_i2 = FMA(KP554958132, s2, s3);
               Ci[WS(csi, 2)] = KP974927912 * (FNMS(KP801937735, acc_i2, s1));
               acc_r2 = FNMS(KP692021471, pre_r2, d2);
               Cr[WS(csr, 2)] = FNMS(KP900968867, acc_r2, a0);

               acc_i1 = FNMS(KP554958132, s1, s2);
               Ci[WS(csi, 1)] = -(KP974927912 * (FNMS(KP801937735, acc_i1, s3)));
               acc_r1 = FNMS(KP692021471, pre_r1, d1);
               Cr[WS(csr, 1)] = FNMS(KP900968867, acc_r1, a0);

               /* the last real output is a plain sum */
               Cr[WS(csr, 3)] = d1 + d3 + d2 + a0;

               acc_i0 = FMA(KP554958132, s3, s1);
               Ci[0] = -(KP974927912 * (FMA(KP801937735, acc_i0, s2)));
               acc_r0 = FNMS(KP692021471, pre_r0, d3);
               Cr[0] = FNMS(KP900968867, acc_r0, a0);
          }
     }
}
/*
 * Registration record for the FMA variant of r2cfII_7.  The leading 7
 * matches the generator's "-n 7"; { 9, 3, 15, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 7, "r2cfII_7", { 9, 3, 15, 0 }, &GENUS };
/* Passes r2cfII_7 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_7) (planner *p) { X(kr2c_register) (p, r2cfII_7, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cfII_7 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 24 FP additions, 18 FP multiplications,
* (or, 12 additions, 6 multiplications, 12 fused multiply/add),
* 20 stack variables, 6 constants, and 14 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_7 -- flavour compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined; size-7 codelet emitted by
 * genfft with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1..3)] and R1[0], R1[WS(rs, 1..2)].
 * Writes Cr[0], Cr[WS(csr, 1..3)] and Ci[0], Ci[WS(csi, 1..2)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(28, rs),
               MAKE_VOLATILE_STRIDE(28, csr),
               MAKE_VOLATILE_STRIDE(28, csi)) {
               E a0, a1, a2, a3, b0, b1, b2;
               E d1, d2, d3, s1, s2, s3;

               a0 = R0[0];

               /* three input pairs, each reduced to a difference and a sum */
               b0 = R1[0];
               a3 = R0[WS(rs, 3)];
               d3 = b0 - a3;
               s3 = b0 + a3;

               a1 = R0[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               d1 = a1 - b2;
               s1 = a1 + b2;

               b1 = R1[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               d2 = b1 - a2;
               s2 = b1 + a2;

               /* imaginary outputs use the sums, real outputs the differences */
               Ci[0] = -(FMA(KP781831482, s1, KP974927912 * s2) + (KP433883739 * s3));
               Ci[WS(csi, 1)] = FNMS(KP974927912, s3, KP781831482 * s2) - (KP433883739 * s1);
               Cr[0] = FMA(KP623489801, d1, a0) + FMA(KP222520933, d2, KP900968867 * d3);
               Ci[WS(csi, 2)] = FNMS(KP781831482, s3, KP974927912 * s1) - (KP433883739 * s2);
               Cr[WS(csr, 2)] = FMA(KP900968867, d2, a0) + FNMA(KP623489801, d3, KP222520933 * d1);
               Cr[WS(csr, 1)] = FMA(KP222520933, d3, a0) + FNMA(KP623489801, d2, KP900968867 * d1);
               Cr[WS(csr, 3)] = a0 + d1 - (d2 + d3);
          }
     }
}
/*
 * Registration record for the non-FMA variant of r2cfII_7.  The leading
 * 7 matches the generator's "-n 7"; { 12, 6, 12, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 7, "r2cfII_7", { 12, 6, 12, 0 }, &GENUS };
/* Passes r2cfII_7 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_7) (planner *p) { X(kr2c_register) (p, r2cfII_7, &desc);
}
#endif

View File

@@ -0,0 +1,162 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cfII_8 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 22 FP additions, 16 FP multiplications,
* (or, 6 additions, 0 multiplications, 16 fused multiply/add),
* 18 stack variables, 3 constants, and 16 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_8 -- FMA-preferring flavour of the size-8 codelet that genfft
 * emitted with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1..3)] and R1[0], R1[WS(rs, 1..3)].
 * Writes Cr[0], Cr[WS(csr, 1..3)] and Ci[0], Ci[WS(csi, 1..3)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     DK(KP414213562, +0.414213562373095048801688724209698078569671875);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(32, rs),
               MAKE_VOLATILE_STRIDE(32, csr),
               MAKE_VOLATILE_STRIDE(32, csi)) {
               E a0, a1, a2, a3, b0, b1, b2, b3;
               E a_diff, a_sum;
               E mix_p, mix_q, mix_r, mix_s;
               E re_hi, re_lo, im_hi, im_lo;
               E pr_sum, qs_sum, qs_diff, rp_diff;

               /* R0 side */
               a0 = R0[0];
               a2 = R0[WS(rs, 2)];
               a1 = R0[WS(rs, 1)];
               a3 = R0[WS(rs, 3)];
               a_diff = a1 - a3;
               a_sum = a1 + a3;

               /* R1 side: (b0, b2) and (b3, b1) are each mixed two ways */
               b0 = R1[0];
               b2 = R1[WS(rs, 2)];
               mix_p = FNMS(KP414213562, b2, b0);
               mix_q = FMA(KP414213562, b0, b2);
               b3 = R1[WS(rs, 3)];
               b1 = R1[WS(rs, 1)];
               mix_r = FMS(KP414213562, b1, b3);
               mix_s = FMA(KP414213562, b3, b1);

               /* outputs 0 and 3 */
               re_hi = FMA(KP707106781, a_diff, a0);
               pr_sum = mix_p + mix_r;
               Cr[WS(csr, 3)] = FNMS(KP923879532, pr_sum, re_hi);
               Cr[0] = FMA(KP923879532, pr_sum, re_hi);
               im_hi = FMA(KP707106781, a_sum, a2);
               qs_sum = mix_q + mix_s;
               Ci[0] = -(FMA(KP923879532, qs_sum, im_hi));
               Ci[WS(csi, 3)] = FNMS(KP923879532, qs_sum, im_hi);

               /* outputs 1 and 2 */
               re_lo = FNMS(KP707106781, a_diff, a0);
               qs_diff = mix_q - mix_s;
               Cr[WS(csr, 2)] = FNMS(KP923879532, qs_diff, re_lo);
               Cr[WS(csr, 1)] = FMA(KP923879532, qs_diff, re_lo);
               im_lo = FNMS(KP707106781, a_sum, a2);
               rp_diff = mix_r - mix_p;
               Ci[WS(csi, 2)] = FMS(KP923879532, rp_diff, im_lo);
               Ci[WS(csi, 1)] = FMA(KP923879532, rp_diff, im_lo);
          }
     }
}
/*
 * Registration record for the FMA variant of r2cfII_8.  The leading 8
 * matches the generator's "-n 8"; { 6, 0, 16, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 8, "r2cfII_8", { 6, 0, 16, 0 }, &GENUS };
/* Passes r2cfII_8 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_8) (planner *p) { X(kr2c_register) (p, r2cfII_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cfII_8 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 22 FP additions, 10 FP multiplications,
* (or, 18 additions, 6 multiplications, 4 fused multiply/add),
* 18 stack variables, 3 constants, and 16 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_8 -- flavour compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined; size-8 codelet emitted by
 * genfft with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1..3)] and R1[0], R1[WS(rs, 1..3)].
 * Writes Cr[0], Cr[WS(csr, 1..3)] and Ci[0], Ci[WS(csi, 1..3)].
 *
 * The body runs v times; after each pass R0/R1 advance by ivs and
 * Cr/Ci by ovs.  Every load precedes the first store of a pass.
 */
static void r2cfII_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(32, rs),
               MAKE_VOLATILE_STRIDE(32, csr),
               MAKE_VOLATILE_STRIDE(32, csi)) {
               E a0, a1, a2, a3, b0, b1, b2, b3;
               E a_diff, a_sum;
               E mix_p, mix_q, mix_r, mix_s;
               E re_hi, re_lo, im_hi, im_lo;
               E pr_sum, qs_sum, qs_diff, rp_diff;

               /* R0 side; the difference and sum are pre-scaled here */
               a0 = R0[0];
               a2 = R0[WS(rs, 2)];
               a1 = R0[WS(rs, 1)];
               a3 = R0[WS(rs, 3)];
               a_diff = KP707106781 * (a1 - a3);
               a_sum = KP707106781 * (a1 + a3);

               /* R1 side: (b0, b2) and (b1, b3) are each mixed two ways */
               b0 = R1[0];
               b2 = R1[WS(rs, 2)];
               mix_p = FNMS(KP382683432, b2, KP923879532 * b0);
               mix_q = FMA(KP382683432, b0, KP923879532 * b2);
               b1 = R1[WS(rs, 1)];
               b3 = R1[WS(rs, 3)];
               mix_r = FNMS(KP923879532, b3, KP382683432 * b1);
               mix_s = FMA(KP923879532, b1, KP382683432 * b3);

               /* outputs 0 and 3 */
               re_hi = a0 + a_diff;
               pr_sum = mix_p + mix_r;
               Cr[WS(csr, 3)] = re_hi - pr_sum;
               Cr[0] = re_hi + pr_sum;
               qs_sum = mix_q + mix_s;
               im_hi = a_sum + a2;
               Ci[0] = -(qs_sum + im_hi);
               Ci[WS(csi, 3)] = im_hi - qs_sum;

               /* outputs 1 and 2 */
               re_lo = a0 - a_diff;
               qs_diff = mix_q - mix_s;
               Cr[WS(csr, 2)] = re_lo - qs_diff;
               Cr[WS(csr, 1)] = re_lo + qs_diff;
               rp_diff = mix_r - mix_p;
               im_lo = a2 - a_sum;
               Ci[WS(csi, 2)] = rp_diff - im_lo;
               Ci[WS(csi, 1)] = rp_diff + im_lo;
          }
     }
}
/*
 * Registration record for the non-FMA variant of r2cfII_8.  The leading
 * 8 matches the generator's "-n 8"; { 18, 6, 4, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 8, "r2cfII_8", { 18, 6, 4, 0 }, &GENUS };
/* Passes r2cfII_8 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_8) (planner *p) { X(kr2c_register) (p, r2cfII_8, &desc);
}
#endif

View File

@@ -0,0 +1,223 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cfII_9 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 42 FP additions, 34 FP multiplications,
* (or, 12 additions, 4 multiplications, 30 fused multiply/add),
* 48 stack variables, 17 constants, and 18 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_9 -- FMA-preferring flavour of the size-9 codelet that genfft
 * emitted with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1..4)] and R1[0], R1[WS(rs, 1..3)] (nine values).
 * Writes Cr[0], Cr[WS(csr, 1..4)] and Ci[0], Ci[WS(csi, 1..3)] (nine values).
 *
 * The loop body executes v times; after every pass R0/R1 advance by ivs
 * and Cr/Ci by ovs.  Within a pass every load precedes the first store.
 */
static void r2cfII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP879385241, +0.879385241571816768108218554649462939872416269);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP898197570, +0.898197570222573798468955502359086394667167570);
DK(KP673648177, +0.673648177666930348851716626769314796000375677);
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP907603734, +0.907603734547952313649323976213898122064543220);
DK(KP666666666, +0.666666666666666666666666666666666666666666667);
DK(KP826351822, +0.826351822333069651148283373230685203999624323);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP315207469, +0.315207469095904627298647952427796244129086440);
DK(KP420276625, +0.420276625461206169731530603237061658838781920);
DK(KP203604859, +0.203604859554852403062088995281827210665664861);
DK(KP152703644, +0.152703644666139302296566746461370407999248646);
DK(KP726681596, +0.726681596905677465811651808188092531873167623);
DK(KP968908795, +0.968908795874236621082202410917456709164223497);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
E T1, T4, To, Ta, Tm, TB, Tq, Tt, Tf, Tj, TA, Tr, Ts, T2, T3;
E T5, Tg;
/* First group: R0[0] on its own, plus the pair R0[3] / R1[1]. */
T1 = R0[0];
T2 = R0[WS(rs, 3)];
T3 = R1[WS(rs, 1)];
T4 = T2 - T3;
To = T2 + T3;
{
/* Second group: R0[1], R0[4], R1[2].  Tk and Tl are the two
 * combinations of these inputs that every later term is built from. */
E T6, T9, Tk, T7, T8, Tl;
T6 = R0[WS(rs, 1)];
T7 = R0[WS(rs, 4)];
T8 = R1[WS(rs, 2)];
T9 = T7 - T8;
Tk = T7 + T8;
Ta = T6 + T9;
Tl = FNMS(KP500000000, T9, T6);
Tm = FMA(KP968908795, Tl, Tk);
TB = FNMS(KP726681596, Tk, Tl);
Tq = FNMS(KP152703644, Tk, Tl);
Tt = FMA(KP203604859, Tl, Tk);
}
{
/* Third group: R0[2], R1[0], R1[3], reduced to Th and Ti the same way. */
E Tb, Te, Ti, Tc, Td, Th;
Tb = R0[WS(rs, 2)];
Tc = R1[0];
Td = R1[WS(rs, 3)];
Te = Tc + Td;
Ti = Tc - Td;
Tf = Tb - Te;
Th = FMA(KP500000000, Te, Tb);
Tj = FNMS(KP152703644, Ti, Th);
TA = FMA(KP203604859, Th, Ti);
Tr = FNMS(KP420276625, Th, Ti);
Ts = FMA(KP315207469, Ti, Th);
}
/* Ci[1], Cr[1] and Cr[4] depend only on the plain group totals T4, Ta, Tf. */
Ci[WS(csi, 1)] = KP866025403 * (Tf - Ta);
T5 = T1 + T4;
Tg = Ta + Tf;
Cr[WS(csr, 1)] = FNMS(KP500000000, Tg, T5);
Cr[WS(csr, 4)] = T5 + Tg;
{
/* Remaining imaginary outputs (Ci[2], Ci[0], Ci[3]) all involve To. */
E Ty, Tx, Tz, Tn, TD, TC;
Tx = FNMS(KP826351822, Tr, Tq);
Ty = FNMS(KP666666666, Tx, Tt);
Tz = FMA(KP907603734, Ty, Ts);
Ci[WS(csi, 2)] = KP866025403 * (FNMS(KP939692620, Tz, To));
Tn = FMA(KP673648177, Tm, Tj);
TC = FNMS(KP898197570, TB, TA);
TD = FNMS(KP666666666, Tn, TC);
Ci[0] = -(KP984807753 * (FMA(KP879385241, To, Tn)));
Ci[WS(csi, 3)] = -(KP866025403 * (FMA(KP852868531, TD, To)));
{
/* Remaining real outputs (Cr[3], Cr[0], Cr[2]) share the base term Tp. */
E Tp, Tv, TF, TG, Tu, TE, Tw;
Tp = FNMS(KP500000000, T4, T1);
Tu = FNMS(KP907603734, Tt, Ts);
Tv = FNMS(KP666666666, Tu, Tr);
TE = FNMS(KP673648177, Tm, Tj);
TF = FMA(KP898197570, TB, TA);
TG = FMA(KP500000000, TF, TE);
Cr[WS(csr, 3)] = FNMS(KP852868531, TG, Tp);
Cr[0] = FMA(KP852868531, TF, Tp);
Tw = FMA(KP826351822, Tv, Tq);
Cr[WS(csr, 2)] = FNMS(KP852868531, Tw, Tp);
}
}
}
}
}
/*
 * Registration record for the FMA variant of r2cfII_9.  The leading 9
 * matches the generator's "-n 9"; { 12, 4, 30, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 9, "r2cfII_9", { 12, 4, 30, 0 }, &GENUS };
/* Passes r2cfII_9 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_9) (planner *p) { X(kr2c_register) (p, r2cfII_9, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cfII_9 -dft-II -include rdft/scalar/r2cfII.h */
/*
* This function contains 42 FP additions, 30 FP multiplications,
* (or, 25 additions, 13 multiplications, 17 fused multiply/add),
* 39 stack variables, 14 constants, and 18 memory accesses
*/
#include "rdft/scalar/r2cfII.h"
/*
 * r2cfII_9 -- flavour compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined; size-9 codelet emitted by
 * genfft with -dft-II.
 *
 * Reads  R0[0], R0[WS(rs, 1..4)] and R1[0], R1[WS(rs, 1..3)] (nine values).
 * Writes Cr[0], Cr[WS(csr, 1..4)] and Ci[0], Ci[WS(csi, 1..3)] (nine values).
 *
 * The loop body executes v times; after every pass R0/R1 advance by ivs
 * and Cr/Ci by ovs.  Within a pass every load precedes the first store.
 */
static void r2cfII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
DK(KP663413948, +0.663413948168938396205421319635891297216863310);
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
DK(KP556670399, +0.556670399226419366452912952047023132968291906);
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP150383733, +0.150383733180435296639271897612501926072238258);
DK(KP813797681, +0.813797681349373692844693217248393223289101568);
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP296198132, +0.296198132726023843175338011893050938967728390);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
E T1, T4, To, Ta, Tl, Tk, Tf, Ti, Th, T2, T3, T5, Tg;
/* First group: R0[0] on its own, plus the pair R1[1] / R0[3].
 * Note T4 here is R1[1] - R0[3]. */
T1 = R0[0];
T2 = R1[WS(rs, 1)];
T3 = R0[WS(rs, 3)];
T4 = T2 - T3;
To = T2 + T3;
{
/* Second group: R0[1], R1[2], R0[4], reduced to Ta, Tl and Tk. */
E T6, T7, T8, T9;
T6 = R0[WS(rs, 1)];
T7 = R1[WS(rs, 2)];
T8 = R0[WS(rs, 4)];
T9 = T7 - T8;
Ta = T6 - T9;
Tl = T7 + T8;
Tk = FMA(KP500000000, T9, T6);
}
{
/* Third group: R0[2], R1[0], R1[3], reduced to Tf, Ti and Th. */
E Tb, Tc, Td, Te;
Tb = R0[WS(rs, 2)];
Tc = R1[0];
Td = R1[WS(rs, 3)];
Te = Tc + Td;
Tf = Tb - Te;
Ti = FMA(KP500000000, Te, Tb);
Th = Tc - Td;
}
/* Ci[1], Cr[1] and Cr[4] depend only on the plain group totals T4, Ta, Tf. */
Ci[WS(csi, 1)] = KP866025403 * (Tf - Ta);
T5 = T1 - T4;
Tg = Ta + Tf;
Cr[WS(csr, 1)] = FNMS(KP500000000, Tg, T5);
Cr[WS(csr, 4)] = T5 + Tg;
{
/* Remaining six outputs: each intermediate below is a two-term
 * weighted combination of (Th, Ti) or of (Tk, Tl); Tr is the base
 * term shared by Cr[0], Cr[3] and Cr[2]. */
E Tr, Tt, Tw, Tv, Tu, Tp, Tq, Ts, Tj, Tm, Tn;
Tr = FMA(KP500000000, T4, T1);
Tt = FMA(KP296198132, Th, KP939692620 * Ti);
Tw = FNMS(KP813797681, Th, KP342020143 * Ti);
Tv = FNMS(KP984807753, Tk, KP150383733 * Tl);
Tu = FMA(KP173648177, Tk, KP852868531 * Tl);
Tp = FNMS(KP556670399, Tl, KP766044443 * Tk);
Tq = FMA(KP852868531, Th, KP173648177 * Ti);
Ts = Tp + Tq;
Tj = FNMS(KP984807753, Ti, KP150383733 * Th);
Tm = FMA(KP642787609, Tk, KP663413948 * Tl);
Tn = Tj - Tm;
Ci[0] = FNMS(KP866025403, To, Tn);
Cr[0] = Tr + Ts;
Ci[WS(csi, 3)] = FNMS(KP500000000, Tn, KP866025403 * ((Tp - Tq) - To));
Cr[WS(csr, 3)] = FMA(KP866025403, Tm + Tj, Tr) - (KP500000000 * Ts);
Ci[WS(csi, 2)] = FMA(KP866025403, To - (Tu + Tt), KP500000000 * (Tw - Tv));
Cr[WS(csr, 2)] = FMA(KP500000000, Tt - Tu, Tr) + (KP866025403 * (Tv + Tw));
}
}
}
}
/*
 * Registration record for the non-FMA variant of r2cfII_9.  The leading
 * 9 matches the generator's "-n 9"; { 25, 13, 17, 0 } starts with the
 * additions / multiplications / fused multiply-adds quoted in this
 * variant's operation-count comment.  The meaning of the fourth counter
 * and of GENUS is not visible in this file -- see kr2c_desc.
 */
static const kr2c_desc desc = { 9, "r2cfII_9", { 25, 13, 17, 0 }, &GENUS };
/* Passes r2cfII_9 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_9) (planner *p) { X(kr2c_register) (p, r2cfII_9, &desc);
}
#endif

View File

@@ -0,0 +1,200 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cf_10 -include rdft/scalar/r2cf.h */
/*
* This function contains 34 FP additions, 14 FP multiplications,
* (or, 24 additions, 4 multiplications, 10 fused multiply/add),
* 26 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_10 (FMA variant): hard-coded size-10 real-to-complex FFT kernel.
 *
 * Each pass reads five values from R0 and five from R1 (offsets 0 and
 * WS(rs, 1..4)) and writes six values to Cr (offsets 0 and WS(csr, 1..5))
 * and four values to Ci (offsets WS(csi, 1..4)). The pass is repeated v
 * times; R0/R1 advance by ivs and Cr/Ci advance by ovs after each pass.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numerically: sqrt(5)/4, 1/4, (sqrt(5)-1)/2 and sin(2*pi/5). */
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
E T3, Tt, Td, Tn, Tg, To, Th, Tv, T6, Tq, T9, Tr, Ta, Tu, T1;
E T2;
/* Each of the five R0/R1 pairs below yields a difference and a sum. */
T1 = R0[0];
T2 = R1[WS(rs, 2)];
T3 = T1 - T2;
Tt = T1 + T2;
{
E Tb, Tc, Te, Tf;
Tb = R0[WS(rs, 2)];
Tc = R1[WS(rs, 4)];
Td = Tb - Tc;
Tn = Tb + Tc;
Te = R0[WS(rs, 3)];
Tf = R1[0];
Tg = Te - Tf;
To = Te + Tf;
}
Th = Td + Tg;
Tv = Tn + To;
{
E T4, T5, T7, T8;
T4 = R0[WS(rs, 1)];
T5 = R1[WS(rs, 3)];
T6 = T4 - T5;
Tq = T4 + T5;
T7 = R0[WS(rs, 4)];
T8 = R1[WS(rs, 1)];
T9 = T7 - T8;
Tr = T7 + T8;
}
Ta = T6 + T9;
Tu = Tq + Tr;
/* Outputs derived from the pairwise differences: Ci 1 and 3, Cr 1, 3 and 5. */
{
E Tl, Tm, Tk, Ti, Tj;
Tl = T6 - T9;
Tm = Tg - Td;
Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP618033988, Tm, Tl)));
Ci[WS(csi, 3)] = KP951056516 * (FMA(KP618033988, Tl, Tm));
Tk = Ta - Th;
Ti = Ta + Th;
Tj = FNMS(KP250000000, Ti, T3);
Cr[WS(csr, 1)] = FMA(KP559016994, Tk, Tj);
Cr[WS(csr, 5)] = T3 + Ti;
Cr[WS(csr, 3)] = FNMS(KP559016994, Tk, Tj);
}
/* Outputs derived from the pairwise sums: Ci 2 and 4, Cr 0, 2 and 4. */
{
E Tp, Ts, Ty, Tw, Tx;
Tp = Tn - To;
Ts = Tq - Tr;
Ci[WS(csi, 2)] = KP951056516 * (FNMS(KP618033988, Ts, Tp));
Ci[WS(csi, 4)] = KP951056516 * (FMA(KP618033988, Tp, Ts));
Ty = Tu - Tv;
Tw = Tu + Tv;
Tx = FNMS(KP250000000, Tw, Tt);
Cr[WS(csr, 2)] = FNMS(KP559016994, Ty, Tx);
Cr[0] = Tt + Tw;
Cr[WS(csr, 4)] = FMA(KP559016994, Ty, Tx);
}
}
}
}
/* Registration record for the FMA build of r2cf_10. The first three entries
 * of the inner list equal the add/mul/fma counts quoted in the generator
 * comment for this variant; the kr2c_desc layout is declared elsewhere. */
static const kr2c_desc desc = {
     10,
     "r2cf_10",
     { 24, 4, 10, 0 },
     &GENUS
};

/* Hands the r2cf_10 kernel and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_10)(planner *p)
{
     X(kr2c_register)(p, r2cf_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cf_10 -include rdft/scalar/r2cf.h */
/*
* This function contains 34 FP additions, 12 FP multiplications,
* (or, 28 additions, 6 multiplications, 6 fused multiply/add),
* 26 stack variables, 4 constants, and 20 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_10 (non-FMA variant): hard-coded size-10 real-to-complex FFT kernel.
 *
 * Each pass reads five values from R0 and five from R1 (offsets 0 and
 * WS(rs, 1..4)) and writes six values to Cr (offsets 0 and WS(csr, 1..5))
 * and four values to Ci (offsets WS(csi, 1..4)). The pass is repeated v
 * times; R0/R1 advance by ivs and Cr/Ci advance by ovs after each pass.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numerically: 1/4, sqrt(5)/4, sin(2*pi/5) and sin(pi/5). */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
E Ti, Tt, Ta, Tn, Td, To, Te, Tv, T3, Tq, T6, Tr, T7, Tu, Tg;
E Th;
/* Each of the five R0/R1 pairs below yields a difference and a sum. */
Tg = R0[0];
Th = R1[WS(rs, 2)];
Ti = Tg - Th;
Tt = Tg + Th;
{
E T8, T9, Tb, Tc;
T8 = R0[WS(rs, 2)];
T9 = R1[WS(rs, 4)];
Ta = T8 - T9;
Tn = T8 + T9;
Tb = R0[WS(rs, 3)];
Tc = R1[0];
Td = Tb - Tc;
To = Tb + Tc;
}
Te = Ta + Td;
Tv = Tn + To;
{
E T1, T2, T4, T5;
T1 = R0[WS(rs, 1)];
T2 = R1[WS(rs, 3)];
T3 = T1 - T2;
Tq = T1 + T2;
T4 = R0[WS(rs, 4)];
T5 = R1[WS(rs, 1)];
T6 = T4 - T5;
Tr = T4 + T5;
}
T7 = T3 + T6;
Tu = Tq + Tr;
/* Outputs derived from the pairwise differences: Ci 1 and 3, Cr 1, 3 and 5. */
{
E Tl, Tm, Tf, Tj, Tk;
Tl = Td - Ta;
Tm = T3 - T6;
Ci[WS(csi, 1)] = FNMS(KP951056516, Tm, KP587785252 * Tl);
Ci[WS(csi, 3)] = FMA(KP587785252, Tm, KP951056516 * Tl);
Tf = KP559016994 * (T7 - Te);
Tj = T7 + Te;
Tk = FNMS(KP250000000, Tj, Ti);
Cr[WS(csr, 1)] = Tf + Tk;
Cr[WS(csr, 5)] = Ti + Tj;
Cr[WS(csr, 3)] = Tk - Tf;
}
/* Outputs derived from the pairwise sums: Ci 2 and 4, Cr 0, 2 and 4. */
{
E Tp, Ts, Ty, Tw, Tx;
Tp = Tn - To;
Ts = Tq - Tr;
Ci[WS(csi, 2)] = FNMS(KP587785252, Ts, KP951056516 * Tp);
Ci[WS(csi, 4)] = FMA(KP951056516, Ts, KP587785252 * Tp);
Ty = KP559016994 * (Tu - Tv);
Tw = Tu + Tv;
Tx = FNMS(KP250000000, Tw, Tt);
Cr[WS(csr, 2)] = Tx - Ty;
Cr[0] = Tt + Tw;
Cr[WS(csr, 4)] = Ty + Tx;
}
}
}
}
/* Registration record for the non-FMA build of r2cf_10. The first three
 * entries of the inner list equal the add/mul/fma counts quoted in the
 * generator comment for this variant; the kr2c_desc layout is declared
 * elsewhere. */
static const kr2c_desc desc = {
     10,
     "r2cf_10",
     { 28, 6, 6, 0 },
     &GENUS
};

/* Hands the r2cf_10 kernel and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_10)(planner *p)
{
     X(kr2c_register)(p, r2cf_10, &desc);
}
#endif

View File

@@ -0,0 +1,228 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 11 -name r2cf_11 -include rdft/scalar/r2cf.h */
/*
* This function contains 60 FP additions, 50 FP multiplications,
* (or, 15 additions, 5 multiplications, 45 fused multiply/add),
* 42 stack variables, 10 constants, and 22 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_11 (FMA variant): hard-coded size-11 real-to-complex FFT kernel.
 *
 * Each pass reads six values from R0 (offsets 0 and WS(rs, 1..5)) and five
 * from R1 (offsets 0 and WS(rs, 1..4)), and writes six values to Cr
 * (offsets 0 and WS(csr, 1..5)) and five to Ci (offsets WS(csi, 1..5)).
 * The pass is repeated v times; R0/R1 advance by ivs and Cr/Ci advance by
 * ovs after each pass.
 *
 * Every output other than Cr[0] is produced by a nested chain of
 * multiply-add macros, one multiplier constant per nesting level.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_11(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* The first five constants feed the Ci chains, the last five the Cr chains,
 * except KP521108558 (declared last) which is used in the Ci chains. */
DK(KP918985947, +0.918985947228994779780736114132655398124909697);
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
DK(KP830830026, +0.830830026003772851058548298459246407048009821);
DK(KP715370323, +0.715370323453429719112414662767260662417897278);
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
DK(KP876768831, +0.876768831002589333891339807079336796764054852);
DK(KP778434453, +0.778434453334651800608337670740821884709317477);
DK(KP634356270, +0.634356270682424498893150776899916060542806975);
DK(KP342584725, +0.342584725681637509502641509861112333758894680);
DK(KP521108558, +0.521108558113202722944698153526659300680427422);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(44, rs), MAKE_VOLATILE_STRIDE(44, csr), MAKE_VOLATILE_STRIDE(44, csi)) {
E T1, T4, TC, Tg, TE, T7, TD, Ta, TF, Td, TB, TG, TM, TS, TJ;
E TP, Ty, Tq, Ti, Tu, Tm, T5, T6;
T1 = R0[0];
/* Five input pairs: sums land in T4, Tg, T7, Ta, Td and differences in
 * TC, TE, TD, TF, TB. */
{
E T2, T3, Te, Tf;
T2 = R1[0];
T3 = R0[WS(rs, 5)];
T4 = T2 + T3;
TC = T3 - T2;
Te = R1[WS(rs, 2)];
Tf = R0[WS(rs, 3)];
Tg = Te + Tf;
TE = Tf - Te;
}
T5 = R0[WS(rs, 1)];
T6 = R1[WS(rs, 4)];
T7 = T5 + T6;
TD = T5 - T6;
{
E T8, T9, Tb, Tc;
T8 = R1[WS(rs, 1)];
T9 = R0[WS(rs, 4)];
Ta = T8 + T9;
TF = T9 - T8;
Tb = R0[WS(rs, 2)];
Tc = R1[WS(rs, 3)];
Td = Tb + Tc;
TB = Tb - Tc;
}
/* Innermost level of the five Ci chains (built on the differences). */
TG = FMA(KP521108558, TF, TE);
TM = FNMS(KP521108558, TD, TB);
TS = FMA(KP521108558, TC, TD);
TJ = FMA(KP521108558, TE, TC);
TP = FNMS(KP521108558, TB, TF);
/* Two innermost levels of the five Cr chains (built on the sums). */
{
E Tx, Tp, Th, Tt, Tl;
Tx = FNMS(KP342584725, Ta, T7);
Ty = FNMS(KP634356270, Tx, Td);
Tp = FNMS(KP342584725, T4, Ta);
Tq = FNMS(KP634356270, Tp, Tg);
Th = FNMS(KP342584725, Tg, Td);
Ti = FNMS(KP634356270, Th, Ta);
Tt = FNMS(KP342584725, Td, T4);
Tu = FNMS(KP634356270, Tt, T7);
Tl = FNMS(KP342584725, T7, Tg);
Tm = FNMS(KP634356270, Tl, T4);
}
/* Finish every chain and store the outputs. */
{
E To, Tn, TI, TH;
{
E Tk, Tj, TU, TT;
Tj = FNMS(KP778434453, Ti, T7);
Tk = FNMS(KP876768831, Tj, T4);
Cr[WS(csr, 5)] = FNMS(KP959492973, Tk, T1);
TT = FMA(KP715370323, TS, TF);
TU = FMA(KP830830026, TT, TB);
Ci[WS(csi, 5)] = KP989821441 * (FMA(KP918985947, TU, TE));
}
Tn = FNMS(KP778434453, Tm, Ta);
To = FNMS(KP876768831, Tn, Td);
Cr[WS(csr, 4)] = FNMS(KP959492973, To, T1);
{
E TR, TQ, Ts, Tr;
TQ = FMA(KP715370323, TP, TC);
TR = FNMS(KP830830026, TQ, TE);
Ci[WS(csi, 4)] = KP989821441 * (FNMS(KP918985947, TR, TD));
Tr = FNMS(KP778434453, Tq, Td);
Ts = FNMS(KP876768831, Tr, T7);
Cr[WS(csr, 3)] = FNMS(KP959492973, Ts, T1);
}
{
E TO, TN, Tw, Tv;
TN = FNMS(KP715370323, TM, TE);
TO = FNMS(KP830830026, TN, TF);
Ci[WS(csi, 3)] = KP989821441 * (FNMS(KP918985947, TO, TC));
Tv = FNMS(KP778434453, Tu, Tg);
Tw = FNMS(KP876768831, Tv, Ta);
Cr[WS(csr, 2)] = FNMS(KP959492973, Tw, T1);
/* Cr[0] is the plain sum of R0[0] and the five pair sums. */
Cr[0] = T1 + T4 + T7 + Ta + Td + Tg;
}
TH = FMA(KP715370323, TG, TD);
TI = FNMS(KP830830026, TH, TC);
Ci[WS(csi, 2)] = KP989821441 * (FMA(KP918985947, TI, TB));
{
E TL, TK, TA, Tz;
TK = FNMS(KP715370323, TJ, TB);
TL = FMA(KP830830026, TK, TD);
Ci[WS(csi, 1)] = KP989821441 * (FNMS(KP918985947, TL, TF));
Tz = FNMS(KP778434453, Ty, T4);
TA = FNMS(KP876768831, Tz, Tg);
Cr[WS(csr, 1)] = FNMS(KP959492973, TA, T1);
}
}
}
}
}
/* Registration record for the FMA build of r2cf_11. The first three entries
 * of the inner list equal the add/mul/fma counts quoted in the generator
 * comment for this variant; the kr2c_desc layout is declared elsewhere. */
static const kr2c_desc desc = {
     11,
     "r2cf_11",
     { 15, 5, 45, 0 },
     &GENUS
};

/* Hands the r2cf_11 kernel and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_11)(planner *p)
{
     X(kr2c_register)(p, r2cf_11, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 11 -name r2cf_11 -include rdft/scalar/r2cf.h */
/*
* This function contains 60 FP additions, 50 FP multiplications,
* (or, 20 additions, 10 multiplications, 40 fused multiply/add),
* 28 stack variables, 10 constants, and 22 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_11 (non-FMA variant): hard-coded size-11 real-to-complex FFT kernel.
 *
 * Each pass reads six values from R0 (offsets 0 and WS(rs, 1..5)) and five
 * from R1 (offsets 0 and WS(rs, 1..4)), and writes six values to Cr
 * (offsets 0 and WS(csr, 1..5)) and five to Ci (offsets WS(csi, 1..5)).
 * The pass is repeated v times; R0/R1 advance by ivs and Cr/Ci advance by
 * ovs after each pass.
 *
 * Every output is written as one flat weighted sum: Cr entries combine
 * R0[0] with the five pair sums, Ci entries combine the five pair
 * differences.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_11(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numerically these are the magnitudes of cos(2*pi*k/11) and
 * sin(2*pi*k/11) for k = 1..5. */
DK(KP654860733, +0.654860733945285064056925072466293553183791199);
DK(KP142314838, +0.142314838273285140443792668616369668791051361);
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
DK(KP415415013, +0.415415013001886425529274149229623203524004910);
DK(KP841253532, +0.841253532831181168861811648919367717513292498);
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
DK(KP909631995, +0.909631995354518371411715383079028460060241051);
DK(KP281732556, +0.281732556841429697711417915346616899035777899);
DK(KP540640817, +0.540640817455597582107635954318691695431770608);
DK(KP755749574, +0.755749574354258283774035843972344420179717445);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(44, rs), MAKE_VOLATILE_STRIDE(44, csr), MAKE_VOLATILE_STRIDE(44, csi)) {
E T1, T4, Tl, Tg, Th, Td, Ti, Ta, Tk, T7, Tj, Tb, Tc;
T1 = R0[0];
/* Five input pairs: sums land in T4, Tg, Td, Ta, T7 and differences in
 * Tl, Th, Ti, Tk, Tj. */
{
E T2, T3, Te, Tf;
T2 = R0[WS(rs, 1)];
T3 = R1[WS(rs, 4)];
T4 = T2 + T3;
Tl = T3 - T2;
Te = R1[0];
Tf = R0[WS(rs, 5)];
Tg = Te + Tf;
Th = Tf - Te;
}
Tb = R1[WS(rs, 1)];
Tc = R0[WS(rs, 4)];
Td = Tb + Tc;
Ti = Tc - Tb;
{
E T8, T9, T5, T6;
T8 = R1[WS(rs, 2)];
T9 = R0[WS(rs, 3)];
Ta = T8 + T9;
Tk = T9 - T8;
T5 = R0[WS(rs, 2)];
T6 = R1[WS(rs, 3)];
T7 = T5 + T6;
Tj = T6 - T5;
}
/* Ci entries use only the differences; Cr entries use T1 and the sums. */
Ci[WS(csi, 4)] = FMA(KP755749574, Th, KP540640817 * Ti) + FNMS(KP909631995, Tk, KP281732556 * Tj) - (KP989821441 * Tl);
Cr[WS(csr, 4)] = FMA(KP841253532, Td, T1) + FNMS(KP959492973, T7, KP415415013 * Ta) + FNMA(KP142314838, T4, KP654860733 * Tg);
Ci[WS(csi, 2)] = FMA(KP909631995, Th, KP755749574 * Tl) + FNMA(KP540640817, Tk, KP989821441 * Tj) - (KP281732556 * Ti);
Ci[WS(csi, 5)] = FMA(KP281732556, Th, KP755749574 * Ti) + FNMS(KP909631995, Tj, KP989821441 * Tk) - (KP540640817 * Tl);
Ci[WS(csi, 1)] = FMA(KP540640817, Th, KP909631995 * Tl) + FMA(KP989821441, Ti, KP755749574 * Tj) + (KP281732556 * Tk);
Ci[WS(csi, 3)] = FMA(KP989821441, Th, KP540640817 * Tj) + FNMS(KP909631995, Ti, KP755749574 * Tk) - (KP281732556 * Tl);
Cr[WS(csr, 3)] = FMA(KP415415013, Td, T1) + FNMS(KP654860733, Ta, KP841253532 * T7) + FNMA(KP959492973, T4, KP142314838 * Tg);
Cr[WS(csr, 1)] = FMA(KP841253532, Tg, T1) + FNMS(KP959492973, Ta, KP415415013 * T4) + FNMA(KP654860733, T7, KP142314838 * Td);
/* Cr[0] is the plain sum of R0[0] and the five pair sums. */
Cr[0] = T1 + Tg + T4 + Td + T7 + Ta;
Cr[WS(csr, 2)] = FMA(KP415415013, Tg, T1) + FNMS(KP142314838, T7, KP841253532 * Ta) + FNMA(KP959492973, Td, KP654860733 * T4);
Cr[WS(csr, 5)] = FMA(KP841253532, T4, T1) + FNMS(KP142314838, Ta, KP415415013 * T7) + FNMA(KP654860733, Td, KP959492973 * Tg);
}
}
}
/* Registration record for the non-FMA build of r2cf_11. The first three
 * entries of the inner list equal the add/mul/fma counts quoted in the
 * generator comment for this variant; the kr2c_desc layout is declared
 * elsewhere. */
static const kr2c_desc desc = {
     11,
     "r2cf_11",
     { 20, 10, 40, 0 },
     &GENUS
};

/* Hands the r2cf_11 kernel and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_11)(planner *p)
{
     X(kr2c_register)(p, r2cf_11, &desc);
}
#endif

View File

@@ -0,0 +1,218 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cf_12 -include rdft/scalar/r2cf.h */
/*
* This function contains 38 FP additions, 10 FP multiplications,
* (or, 30 additions, 2 multiplications, 8 fused multiply/add),
* 21 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_12 (FMA variant): hard-coded size-12 real-to-complex FFT kernel.
 *
 * Each pass reads six values from R0 and six from R1 (offsets 0 and
 * WS(rs, 1..5)) and writes seven values to Cr (offsets 0 and
 * WS(csr, 1..6)) and five to Ci (offsets WS(csi, 1..5)). The pass is
 * repeated v times; R0/R1 advance by ivs and Cr/Ci advance by ovs after
 * each pass.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numerically: sqrt(3)/2 and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
E T5, Tp, Tm, Tk, Ty, Tt, Ta, Tq, Tn, Tf, Tz, Tu, Tl, To;
/* The inputs are handled as four groups of three. Each group produces
 * its total, a term combining the first element with the sum of the
 * other two via FNMS and KP500000000, and the difference of those
 * other two. */
{
E T1, T2, T3, T4;
T1 = R0[0];
T2 = R0[WS(rs, 2)];
T3 = R0[WS(rs, 4)];
T4 = T2 + T3;
T5 = T1 + T4;
Tp = FNMS(KP500000000, T4, T1);
Tm = T3 - T2;
}
{
E Tg, Th, Ti, Tj;
Tg = R1[WS(rs, 1)];
Th = R1[WS(rs, 3)];
Ti = R1[WS(rs, 5)];
Tj = Th + Ti;
Tk = FNMS(KP500000000, Tj, Tg);
Ty = Ti - Th;
Tt = Tg + Tj;
}
{
E T6, T7, T8, T9;
T6 = R0[WS(rs, 3)];
T7 = R0[WS(rs, 5)];
T8 = R0[WS(rs, 1)];
T9 = T7 + T8;
Ta = T6 + T9;
Tq = FNMS(KP500000000, T9, T6);
Tn = T8 - T7;
}
{
E Tb, Tc, Td, Te;
Tb = R1[WS(rs, 4)];
Tc = R1[0];
Td = R1[WS(rs, 2)];
Te = Tc + Td;
Tf = FNMS(KP500000000, Te, Tb);
Tz = Td - Tc;
Tu = Tb + Te;
}
/* Outputs at offset 3 need only the group totals. */
Cr[WS(csr, 3)] = T5 - Ta;
Ci[WS(csi, 3)] = Tt - Tu;
Tl = Tf - Tk;
To = Tm - Tn;
Ci[WS(csi, 1)] = FMA(KP866025403, To, Tl);
Ci[WS(csi, 5)] = FNMS(KP866025403, To, Tl);
{
E Tx, TA, Tv, Tw;
Tx = Tp - Tq;
TA = Ty - Tz;
Cr[WS(csr, 5)] = FNMS(KP866025403, TA, Tx);
Cr[WS(csr, 1)] = FMA(KP866025403, TA, Tx);
/* Tv is the total of the R0 inputs and Tw the total of the R1 inputs. */
Tv = T5 + Ta;
Tw = Tt + Tu;
Cr[WS(csr, 6)] = Tv - Tw;
Cr[0] = Tv + Tw;
}
{
E Tr, Ts, TB, TC;
Tr = Tp + Tq;
Ts = Tk + Tf;
Cr[WS(csr, 2)] = Tr - Ts;
Cr[WS(csr, 4)] = Tr + Ts;
TB = Ty + Tz;
TC = Tm + Tn;
Ci[WS(csi, 2)] = KP866025403 * (TB - TC);
Ci[WS(csi, 4)] = KP866025403 * (TC + TB);
}
}
}
}
/* Registration record for the FMA build of r2cf_12. The first three entries
 * of the inner list equal the add/mul/fma counts quoted in the generator
 * comment for this variant; the kr2c_desc layout is declared elsewhere. */
static const kr2c_desc desc = {
     12,
     "r2cf_12",
     { 30, 2, 8, 0 },
     &GENUS
};

/* Hands the r2cf_12 kernel and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_12)(planner *p)
{
     X(kr2c_register)(p, r2cf_12, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cf_12 -include rdft/scalar/r2cf.h */
/*
* This function contains 38 FP additions, 8 FP multiplications,
* (or, 34 additions, 4 multiplications, 4 fused multiply/add),
* 21 stack variables, 2 constants, and 24 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_12 (non-FMA variant): hard-coded size-12 real-to-complex FFT kernel.
 *
 * Each pass reads six values from R0 and six from R1 (offsets 0 and
 * WS(rs, 1..5)) and writes seven values to Cr (offsets 0 and
 * WS(csr, 1..6)) and five to Ci (offsets WS(csi, 1..5)). The pass is
 * repeated v times; R0/R1 advance by ivs and Cr/Ci advance by ovs after
 * each pass.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numerically: sqrt(3)/2 and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
E T5, Tp, Tb, Tn, Ty, Tt, Ta, Tq, Tc, Ti, Tz, Tu, Td, To;
/* The inputs are handled as four groups of three. Each group produces
 * its total, a term combining the first element with the sum of the
 * other two via FNMS and KP500000000, and the difference of those
 * other two. */
{
E T1, T2, T3, T4;
T1 = R0[0];
T2 = R0[WS(rs, 2)];
T3 = R0[WS(rs, 4)];
T4 = T2 + T3;
T5 = T1 + T4;
Tp = FNMS(KP500000000, T4, T1);
Tb = T3 - T2;
}
{
E Tj, Tk, Tl, Tm;
Tj = R1[WS(rs, 1)];
Tk = R1[WS(rs, 3)];
Tl = R1[WS(rs, 5)];
Tm = Tk + Tl;
Tn = FNMS(KP500000000, Tm, Tj);
Ty = Tl - Tk;
Tt = Tj + Tm;
}
{
E T6, T7, T8, T9;
T6 = R0[WS(rs, 3)];
T7 = R0[WS(rs, 5)];
T8 = R0[WS(rs, 1)];
T9 = T7 + T8;
Ta = T6 + T9;
Tq = FNMS(KP500000000, T9, T6);
Tc = T8 - T7;
}
{
E Te, Tf, Tg, Th;
Te = R1[WS(rs, 4)];
Tf = R1[0];
Tg = R1[WS(rs, 2)];
Th = Tf + Tg;
Ti = FNMS(KP500000000, Th, Te);
Tz = Tg - Tf;
Tu = Te + Th;
}
/* Outputs at offset 3 need only the group totals. */
Cr[WS(csr, 3)] = T5 - Ta;
Ci[WS(csi, 3)] = Tt - Tu;
Td = KP866025403 * (Tb - Tc);
To = Ti - Tn;
Ci[WS(csi, 1)] = Td + To;
Ci[WS(csi, 5)] = To - Td;
{
E Tx, TA, Tv, Tw;
Tx = Tp - Tq;
TA = KP866025403 * (Ty - Tz);
Cr[WS(csr, 5)] = Tx - TA;
Cr[WS(csr, 1)] = Tx + TA;
/* Tv is the total of the R0 inputs and Tw the total of the R1 inputs. */
Tv = T5 + Ta;
Tw = Tt + Tu;
Cr[WS(csr, 6)] = Tv - Tw;
Cr[0] = Tv + Tw;
}
{
E Tr, Ts, TB, TC;
Tr = Tp + Tq;
Ts = Tn + Ti;
Cr[WS(csr, 2)] = Tr - Ts;
Cr[WS(csr, 4)] = Tr + Ts;
TB = Ty + Tz;
TC = Tb + Tc;
Ci[WS(csi, 2)] = KP866025403 * (TB - TC);
Ci[WS(csi, 4)] = KP866025403 * (TC + TB);
}
}
}
}
/* Registration record for the non-FMA build of r2cf_12. The first three
 * entries of the inner list equal the add/mul/fma counts quoted in the
 * generator comment for this variant; the kr2c_desc layout is declared
 * elsewhere. */
static const kr2c_desc desc = {
     12,
     "r2cf_12",
     { 34, 4, 4, 0 },
     &GENUS
};

/* Hands the r2cf_12 kernel and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_12)(planner *p)
{
     X(kr2c_register)(p, r2cf_12, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,361 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 13 -name r2cf_13 -include rdft/scalar/r2cf.h */
/*
* This function contains 76 FP additions, 51 FP multiplications,
* (or, 31 additions, 6 multiplications, 45 fused multiply/add),
* 58 stack variables, 23 constants, and 26 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_13 (FMA variant): hard-coded size-13 real-to-complex FFT kernel.
 *
 * Each pass reads seven values from R0 (offsets 0 and WS(rs, 1..6)) and
 * six from R1 (offsets 0 and WS(rs, 1..5)), and writes seven values to Cr
 * (offsets 0 and WS(csr, 1..6)) and six to Ci (offsets WS(csi, 1..6)).
 * The pass is repeated v times; R0/R1 advance by ivs and Cr/Ci advance by
 * ovs after each pass.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_13(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Multiplier constants for the arithmetic below. Among them
 * KP083333333 is 1/12, KP866025403 is sqrt(3)/2 and KP500000000 is 1/2. */
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
DK(KP516520780, +0.516520780623489722840901288569017135705033622);
DK(KP859542535, +0.859542535098774820163672132761689612766401925);
DK(KP581704778, +0.581704778510515730456870384989698884939833902);
DK(KP514918778, +0.514918778086315755491789696138117261566051239);
DK(KP769338817, +0.769338817572980603471413688209101117038278899);
DK(KP686558370, +0.686558370781754340655719594850823015421401653);
DK(KP226109445, +0.226109445035782405468510155372505010481906348);
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
DK(KP301479260, +0.301479260047709873958013540496673347309208464);
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
DK(KP904176221, +0.904176221990848204433795481776887926501523162);
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
DK(KP522026385, +0.522026385161275033714027226654165028300441940);
DK(KP957805992, +0.957805992594665126462521754605754580515587217);
DK(KP600477271, +0.600477271932665282925769253334763009352012849);
DK(KP853480001, +0.853480001859823990758994934970528322872359049);
DK(KP612264650, +0.612264650376756543746494474777125408779395514);
DK(KP038632954, +0.038632954644348171955506895830342264440241080);
DK(KP302775637, +0.302775637731994646559610633735247973125648287);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(52, rs), MAKE_VOLATILE_STRIDE(52, csr), MAKE_VOLATILE_STRIDE(52, csi)) {
E TN, TA, TD, TO, TR, TS, TZ, T12, Tu, Tx, Tj, Tw, TW, T13;
TN = R0[0];
/* Load the remaining twelve inputs and reduce them to the intermediate
 * values declared above (TA, TD, TO, TR, TS, TZ, T12, Tu, Tx, Tj, Tw,
 * TW, T13). */
{
E T3, TP, Th, TB, Tp, Te, TC, Tm, T6, Tr, T9, Ts, Ta, TQ, T1;
E T2;
T1 = R0[WS(rs, 4)];
T2 = R1[WS(rs, 2)];
T3 = T1 - T2;
TP = T1 + T2;
{
E Tn, Tf, Tg, To;
Tn = R0[WS(rs, 6)];
Tf = R0[WS(rs, 5)];
Tg = R0[WS(rs, 2)];
To = Tf + Tg;
Th = Tf - Tg;
TB = Tn + To;
Tp = FMS(KP500000000, To, Tn);
}
{
E Tk, Tc, Td, Tl;
Tk = R1[0];
Tc = R1[WS(rs, 4)];
Td = R1[WS(rs, 1)];
Tl = Td + Tc;
Te = Tc - Td;
TC = Tk + Tl;
Tm = FNMS(KP500000000, Tl, Tk);
}
{
E T4, T5, T7, T8;
T4 = R1[WS(rs, 5)];
T5 = R0[WS(rs, 3)];
T6 = T4 - T5;
Tr = T4 + T5;
T7 = R1[WS(rs, 3)];
T8 = R0[WS(rs, 1)];
T9 = T7 - T8;
Ts = T7 + T8;
}
Ta = T6 + T9;
TQ = Tr + Ts;
TA = T3 + Ta;
TD = TB - TC;
TO = TC + TB;
TR = TP + TQ;
/* TS is the sum of the twelve inputs other than R0[0]. */
TS = TO + TR;
{
E TX, TY, Tq, Tt;
TX = Tm - Tp;
TY = FNMS(KP500000000, TQ, TP);
TZ = TX + TY;
T12 = TX - TY;
Tq = Tm + Tp;
Tt = Tr - Ts;
Tu = FMA(KP866025403, Tt, Tq);
Tx = FNMS(KP866025403, Tt, Tq);
}
{
E Tb, Ti, TU, TV;
Tb = FNMS(KP500000000, Ta, T3);
Ti = Te + Th;
Tj = FMA(KP866025403, Ti, Tb);
Tw = FNMS(KP866025403, Ti, Tb);
TU = Th - Te;
TV = T6 - T9;
TW = TU + TV;
T13 = TU - TV;
}
}
Cr[0] = TN + TS;
/* All six Ci outputs, computed from TA, TD, Tj, Tu, Tw and Tx. */
{
E TE, TI, Tz, TK, TH, TM, TJ, TL;
TE = FMA(KP302775637, TD, TA);
TI = FNMS(KP302775637, TA, TD);
{
E Tv, Ty, TF, TG;
Tv = FMA(KP038632954, Tu, Tj);
Ty = FMA(KP612264650, Tx, Tw);
Tz = FNMS(KP853480001, Ty, Tv);
TK = FMA(KP853480001, Ty, Tv);
TF = FNMS(KP038632954, Tj, Tu);
TG = FNMS(KP612264650, Tw, Tx);
TH = FNMS(KP853480001, TG, TF);
TM = FMA(KP853480001, TG, TF);
}
Ci[WS(csi, 1)] = KP600477271 * (FMA(KP957805992, TE, Tz));
Ci[WS(csi, 5)] = -(KP600477271 * (FNMS(KP957805992, TI, TH)));
TJ = FMA(KP522026385, TH, TI);
Ci[WS(csi, 2)] = KP575140729 * (FNMS(KP904176221, TK, TJ));
Ci[WS(csi, 6)] = KP575140729 * (FMA(KP904176221, TK, TJ));
TL = FNMS(KP522026385, Tz, TE);
Ci[WS(csi, 3)] = KP575140729 * (FNMS(KP904176221, TM, TL));
Ci[WS(csi, 4)] = -(KP575140729 * (FMA(KP904176221, TM, TL)));
}
/* Cr outputs 1..6, computed from TN, TS, TO, TR, TZ, TW, T12 and T13. */
{
E T11, T17, T1c, T1e, T16, T18, TT, T10, T19, T1d;
TT = FNMS(KP083333333, TS, TN);
T10 = FMA(KP301479260, TZ, TW);
T11 = FMA(KP503537032, T10, TT);
T17 = FNMS(KP251768516, T10, TT);
{
E T1a, T1b, T14, T15;
T1a = FNMS(KP226109445, TW, TZ);
T1b = FMA(KP686558370, T12, T13);
T1c = FNMS(KP769338817, T1b, T1a);
T1e = FMA(KP769338817, T1b, T1a);
T14 = FNMS(KP514918778, T13, T12);
T15 = TO - TR;
T16 = FMA(KP581704778, T15, T14);
T18 = FNMS(KP859542535, T14, T15);
}
Cr[WS(csr, 5)] = FNMS(KP516520780, T16, T11);
Cr[WS(csr, 1)] = FMA(KP516520780, T16, T11);
T19 = FMA(KP300462606, T18, T17);
Cr[WS(csr, 4)] = FNMS(KP503537032, T1c, T19);
Cr[WS(csr, 3)] = FMA(KP503537032, T1c, T19);
T1d = FNMS(KP300462606, T18, T17);
Cr[WS(csr, 6)] = FNMS(KP503537032, T1e, T1d);
Cr[WS(csr, 2)] = FMA(KP503537032, T1e, T1d);
}
}
}
}
/* Registration record for the FMA build of r2cf_13. The first three entries
 * of the inner list equal the add/mul/fma counts quoted in the generator
 * comment for this variant; the kr2c_desc layout is declared elsewhere. */
static const kr2c_desc desc = {
     13,
     "r2cf_13",
     { 31, 6, 45, 0 },
     &GENUS
};

/* Hands the r2cf_13 kernel and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_13)(planner *p)
{
     X(kr2c_register)(p, r2cf_13, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 13 -name r2cf_13 -include rdft/scalar/r2cf.h */
/*
* This function contains 76 FP additions, 34 FP multiplications,
* (or, 57 additions, 15 multiplications, 19 fused multiply/add),
* 55 stack variables, 20 constants, and 26 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_13 (non-FMA variant): hard-coded size-13 real-to-complex FFT kernel.
 *
 * Each pass reads seven values from R0 (offsets 0 and WS(rs, 1..6)) and
 * six from R1 (offsets 0 and WS(rs, 1..5)), and writes seven values to Cr
 * (offsets 0 and WS(csr, 1..6)) and six to Ci (offsets WS(csi, 1..6)).
 * The pass is repeated v times; R0/R1 advance by ivs and Cr/Ci advance by
 * ovs after each pass.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_13(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Multiplier constants for the arithmetic below. Among them
 * KP083333333 is 1/12, KP2_000000000 is 2, KP1_732050807 is sqrt(3),
 * KP866025403 is sqrt(3)/2 and KP500000000 is 1/2. */
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
DK(KP075902986, +0.075902986037193865983102897245103540356428373);
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
DK(KP113854479, +0.113854479055790798974654345867655310534642560);
DK(KP265966249, +0.265966249214837287587521063842185948798330267);
DK(KP387390585, +0.387390585467617292130675966426762851778775217);
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
DK(KP132983124, +0.132983124607418643793760531921092974399165133);
DK(KP258260390, +0.258260390311744861420450644284508567852516811);
DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
DK(KP300238635, +0.300238635966332641462884626667381504676006424);
DK(KP011599105, +0.011599105605768290721655456654083252189827041);
DK(KP156891391, +0.156891391051584611046832726756003269660212636);
DK(KP256247671, +0.256247671582936600958684654061725059144125175);
DK(KP174138601, +0.174138601152135905005660794929264742616964676);
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(52, rs), MAKE_VOLATILE_STRIDE(52, csr), MAKE_VOLATILE_STRIDE(52, csi)) {
E T13, Tb, Tm, TW, TX, T14, TU, T10, Tz, TB, Tu, TC, TR, T11;
T13 = R0[0];
/* Load the remaining twelve inputs and reduce them to the intermediate
 * values declared above (Tb, Tm, TW, TX, T14, TU, T10, Tz, TB, Tu, TC,
 * TR, T11). */
{
E Te, TO, Ta, Tv, To, T5, Tw, Tp, Th, Tr, Tk, Ts, Tl, TP, Tc;
E Td;
Tc = R0[WS(rs, 4)];
Td = R1[WS(rs, 2)];
Te = Tc - Td;
TO = Tc + Td;
{
E T6, T7, T8, T9;
T6 = R1[0];
T7 = R1[WS(rs, 1)];
T8 = R1[WS(rs, 4)];
T9 = T7 + T8;
Ta = T6 + T9;
Tv = T7 - T8;
To = FNMS(KP500000000, T9, T6);
}
{
E T1, T2, T3, T4;
T1 = R0[WS(rs, 6)];
T2 = R0[WS(rs, 5)];
T3 = R0[WS(rs, 2)];
T4 = T2 + T3;
T5 = T1 + T4;
Tw = T2 - T3;
Tp = FNMS(KP500000000, T4, T1);
}
{
E Tf, Tg, Ti, Tj;
Tf = R1[WS(rs, 5)];
Tg = R0[WS(rs, 3)];
Th = Tf - Tg;
Tr = Tf + Tg;
Ti = R1[WS(rs, 3)];
Tj = R0[WS(rs, 1)];
Tk = Ti - Tj;
Ts = Ti + Tj;
}
Tl = Th + Tk;
TP = Tr + Ts;
Tb = T5 - Ta;
Tm = Te + Tl;
TW = Ta + T5;
TX = TO + TP;
/* T14 is the sum of the twelve inputs other than R0[0]. */
T14 = TW + TX;
{
E TS, TT, Tx, Ty;
TS = Tv + Tw;
TT = Th - Tk;
TU = TS - TT;
T10 = TS + TT;
Tx = KP866025403 * (Tv - Tw);
Ty = FNMS(KP500000000, Tl, Te);
Tz = Tx + Ty;
TB = Ty - Tx;
}
{
E Tq, Tt, TN, TQ;
Tq = To - Tp;
Tt = KP866025403 * (Tr - Ts);
Tu = Tq - Tt;
TC = Tq + Tt;
TN = To + Tp;
TQ = FNMS(KP500000000, TP, TO);
TR = TN - TQ;
T11 = TN + TQ;
}
}
Cr[0] = T13 + T14;
/* All six Ci outputs, computed from Tb, Tm, Tz, Tu, TB and TC. */
{
E Tn, TG, TE, TF, TJ, TM, TK, TL;
Tn = FNMS(KP174138601, Tm, KP575140729 * Tb);
TG = FMA(KP174138601, Tb, KP575140729 * Tm);
{
E TA, TD, TH, TI;
TA = FNMS(KP156891391, Tz, KP256247671 * Tu);
TD = FNMS(KP300238635, TC, KP011599105 * TB);
TE = TA + TD;
TF = KP1_732050807 * (TD - TA);
TH = FMA(KP300238635, TB, KP011599105 * TC);
TI = FMA(KP256247671, Tz, KP156891391 * Tu);
TJ = TH - TI;
TM = KP1_732050807 * (TI + TH);
}
Ci[WS(csi, 5)] = FMA(KP2_000000000, TE, Tn);
Ci[WS(csi, 1)] = FMA(KP2_000000000, TJ, TG);
TK = TG - TJ;
Ci[WS(csi, 4)] = TF - TK;
Ci[WS(csi, 3)] = TF + TK;
TL = Tn - TE;
Ci[WS(csi, 2)] = TL - TM;
Ci[WS(csi, 6)] = TL + TM;
}
/* Cr outputs 1..6, computed from T13, T14, TW, TX, TU, TR, T10 and T11. */
{
E TZ, T1b, T19, T1e, T16, T1a, TV, TY, T1c, T1d;
TV = FNMS(KP132983124, TU, KP258260390 * TR);
TY = KP300462606 * (TW - TX);
TZ = FMA(KP2_000000000, TV, TY);
T1b = TY - TV;
{
E T17, T18, T12, T15;
T17 = FMA(KP387390585, TU, KP265966249 * TR);
T18 = FNMS(KP503537032, T11, KP113854479 * T10);
T19 = T17 - T18;
T1e = T17 + T18;
T12 = FMA(KP251768516, T10, KP075902986 * T11);
T15 = FNMS(KP083333333, T14, T13);
T16 = FMA(KP2_000000000, T12, T15);
T1a = T15 - T12;
}
Cr[WS(csr, 1)] = TZ + T16;
Cr[WS(csr, 5)] = T16 - TZ;
T1c = T1a - T1b;
Cr[WS(csr, 2)] = T19 + T1c;
Cr[WS(csr, 6)] = T1c - T19;
T1d = T1b + T1a;
Cr[WS(csr, 3)] = T1d - T1e;
Cr[WS(csr, 4)] = T1e + T1d;
}
}
}
}
/* Registration record for the non-FMA build of r2cf_13. The first three
 * entries of the inner list equal the add/mul/fma counts quoted in the
 * generator comment for this variant; the kr2c_desc layout is declared
 * elsewhere. */
static const kr2c_desc desc = {
     13,
     "r2cf_13",
     { 57, 15, 19, 0 },
     &GENUS
};

/* Hands the r2cf_13 kernel and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_13)(planner *p)
{
     X(kr2c_register)(p, r2cf_13, &desc);
}
#endif

View File

@@ -0,0 +1,263 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 14 -name r2cf_14 -include rdft/scalar/r2cf.h */
/*
* This function contains 62 FP additions, 36 FP multiplications,
* (or, 32 additions, 6 multiplications, 30 fused multiply/add),
* 33 stack variables, 6 constants, and 28 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_14 (FMA variant): hard-coded size-14 real-to-complex FFT kernel.
 *
 * Each pass reads seven values from R0 and seven from R1 (offsets 0 and
 * WS(rs, 1..6)) and writes eight values to Cr (offsets 0 and
 * WS(csr, 1..7)) and six to Ci (offsets WS(csi, 1..6)). The pass is
 * repeated v times; R0/R1 advance by ivs and Cr/Ci advance by ovs after
 * each pass.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input samples
 * and Cr/Ci the real/imaginary parts of the output -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* KP900968867, KP692021471 and KP356895867 are used for the Cr outputs;
 * KP974927912, KP801937735 and KP554958132 for the Ci outputs. */
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
{
INT i;
/* One transform per iteration. MAKE_VOLATILE_STRIDE is an opaque macro
 * defined elsewhere; its effect is not visible from this file. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) {
E T3, TN, To, TQ, Tx, TG, Ta, TO, Tw, TD, Th, TP, Tv, TJ, T1;
E T2, TA, TK;
/* Seven R0/R1 pairs each yield a difference and a sum. The first pair
 * is kept as T3 / TN; the other six are combined two at a time below. */
T1 = R0[0];
T2 = R1[WS(rs, 3)];
T3 = T1 - T2;
TN = T1 + T2;
{
E Tk, TE, Tn, TF;
{
E Ti, Tj, Tl, Tm;
Ti = R0[WS(rs, 3)];
Tj = R1[WS(rs, 6)];
Tk = Ti - Tj;
TE = Ti + Tj;
Tl = R0[WS(rs, 4)];
Tm = R1[0];
Tn = Tl - Tm;
TF = Tl + Tm;
}
To = Tk + Tn;
TQ = TE + TF;
Tx = Tn - Tk;
TG = TE - TF;
}
{
E T6, TC, T9, TB;
{
E T4, T5, T7, T8;
T4 = R0[WS(rs, 1)];
T5 = R1[WS(rs, 4)];
T6 = T4 - T5;
TC = T4 + T5;
T7 = R0[WS(rs, 6)];
T8 = R1[WS(rs, 2)];
T9 = T7 - T8;
TB = T7 + T8;
}
Ta = T6 + T9;
TO = TC + TB;
Tw = T6 - T9;
TD = TB - TC;
}
{
E Td, TH, Tg, TI;
{
E Tb, Tc, Te, Tf;
Tb = R0[WS(rs, 2)];
Tc = R1[WS(rs, 5)];
Td = Tb - Tc;
TH = Tb + Tc;
Te = R0[WS(rs, 5)];
Tf = R1[WS(rs, 1)];
Tg = Te - Tf;
TI = Te + Tf;
}
Th = Td + Tg;
TP = TH + TI;
Tv = Tg - Td;
TJ = TH - TI;
}
/* Cr at offset 7 totals the seven pair differences; Cr[0] totals the
 * seven pair sums. */
Cr[WS(csr, 7)] = T3 + Ta + Th + To;
Cr[0] = TN + TO + TP + TQ;
/* Ci outputs: offsets 1, 3 and 5 use Tv, Tw, Tx; offsets 2, 4 and 6 use
 * TD, TG, TJ. */
TA = FMA(KP554958132, Tw, Tv);
Ci[WS(csi, 3)] = KP974927912 * (FNMS(KP801937735, TA, Tx));
{
E TL, TM, Ty, Tz;
TL = FNMS(KP554958132, TG, TD);
Ci[WS(csi, 6)] = KP974927912 * (FNMS(KP801937735, TL, TJ));
TM = FMA(KP554958132, TD, TJ);
Ci[WS(csi, 4)] = KP974927912 * (FNMS(KP801937735, TM, TG));
Ty = FNMS(KP554958132, Tx, Tw);
Ci[WS(csi, 1)] = KP974927912 * (FNMS(KP801937735, Ty, Tv));
Tz = FMA(KP554958132, Tv, Tx);
Ci[WS(csi, 5)] = KP974927912 * (FMA(KP801937735, Tz, Tw));
}
TK = FMA(KP554958132, TJ, TG);
Ci[WS(csi, 2)] = KP974927912 * (FMA(KP801937735, TK, TD));
/* Remaining Cr outputs: offsets 1, 3 and 5 use T3 with Ta, Th, To;
 * offsets 2, 4 and 6 use TN with TO, TP, TQ. */
{
E TU, TT, Tq, Tp;
TT = FNMS(KP356895867, TO, TQ);
TU = FNMS(KP692021471, TT, TP);
Cr[WS(csr, 2)] = FNMS(KP900968867, TU, TN);
Tp = FNMS(KP356895867, To, Th);
Tq = FNMS(KP692021471, Tp, Ta);
Cr[WS(csr, 3)] = FNMS(KP900968867, Tq, T3);
}
{
E Tu, Tt, Ts, Tr;
Tt = FNMS(KP356895867, Th, Ta);
Tu = FNMS(KP692021471, Tt, To);
Cr[WS(csr, 1)] = FNMS(KP900968867, Tu, T3);
Tr = FNMS(KP356895867, Ta, To);
Ts = FNMS(KP692021471, Tr, Th);
Cr[WS(csr, 5)] = FNMS(KP900968867, Ts, T3);
}
{
E TW, TV, TS, TR;
TV = FNMS(KP356895867, TP, TO);
TW = FNMS(KP692021471, TV, TQ);
Cr[WS(csr, 6)] = FNMS(KP900968867, TW, TN);
TR = FNMS(KP356895867, TQ, TP);
TS = FNMS(KP692021471, TR, TO);
Cr[WS(csr, 4)] = FNMS(KP900968867, TS, TN);
}
}
}
}
/*
 * Descriptor for this variant of the size-14 codelet: the transform size,
 * the codelet name, a four-entry count block, and a pointer to GENUS.
 * NOTE(review): the count block presumably lists additions, multiplications,
 * fused multiply/adds and "other" operations -- confirm against kr2c_desc.
 */
static const kr2c_desc desc = {
     14,
     "r2cf_14",
     { 32, 6, 30, 0 },
     &GENUS
};

/* Hands r2cf_14 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_14)(planner *p)
{
     X(kr2c_register)(p, r2cf_14, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 14 -name r2cf_14 -include rdft/scalar/r2cf.h */
/*
* This function contains 62 FP additions, 36 FP multiplications,
* (or, 38 additions, 12 multiplications, 24 fused multiply/add),
* 29 stack variables, 6 constants, and 28 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_14, variant generated without -fma: hard-coded real-to-complex
 * transform of size 14.
 *
 * Per loop pass it reads 14 real values, R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..6, and writes Cr[WS(csr, k)] for k = 0..7 and Ci[WS(csi, k)] for
 * k = 1..6.  Ci[0] and Ci[7] are never written here.
 *
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against rdft/scalar/r2cf.h.
 */
static void r2cf_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* By their literal values: cos(pi/7), cos(3pi/7), cos(2pi/7), sin(pi/7), sin(3pi/7), sin(2pi/7). */
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) {
E T3, TB, T6, Tv, Tn, Ts, Tk, Tt, Td, Ty, T9, Tw, Tg, Tz, T1;
E T2;
/*
 * Stage 1: each R0 entry is paired with one R1 entry.  The pair's difference
 * (T3, T6, Tn, Tk, Td, T9, Tg) feeds only the odd-numbered outputs; the
 * pair's sum (TB, Tv, Ts, Tt, Ty, Tw, Tz) feeds only the even-numbered ones.
 */
T1 = R0[0];
T2 = R1[WS(rs, 3)];
T3 = T1 - T2;
TB = T1 + T2;
{
E T4, T5, Tl, Tm;
T4 = R0[WS(rs, 2)];
T5 = R1[WS(rs, 5)];
T6 = T4 - T5;
Tv = T4 + T5;
Tl = R0[WS(rs, 6)];
Tm = R1[WS(rs, 2)];
Tn = Tl - Tm;
Ts = Tl + Tm;
}
{
E Ti, Tj, Tb, Tc;
Ti = R0[WS(rs, 1)];
Tj = R1[WS(rs, 4)];
Tk = Ti - Tj;
Tt = Ti + Tj;
Tb = R0[WS(rs, 3)];
Tc = R1[WS(rs, 6)];
Td = Tb - Tc;
Ty = Tb + Tc;
}
{
E T7, T8, Te, Tf;
T7 = R0[WS(rs, 5)];
T8 = R1[WS(rs, 1)];
T9 = T7 - T8;
Tw = T7 + T8;
Te = R0[WS(rs, 4)];
Tf = R1[0];
Tg = Te - Tf;
Tz = Te + Tf;
}
{
/* Odd-numbered outputs (Ci[1], Ci[3], Ci[5]; Cr[1], Cr[3], Cr[5], Cr[7]), built from the pair differences. */
E Tp, Tr, Tq, Ta, To, Th;
Tp = Tn - Tk;
Tr = Tg - Td;
Tq = T9 - T6;
Ci[WS(csi, 1)] = FMA(KP781831482, Tp, KP974927912 * Tq) + (KP433883739 * Tr);
Ci[WS(csi, 5)] = FMA(KP433883739, Tq, KP781831482 * Tr) - (KP974927912 * Tp);
Ci[WS(csi, 3)] = FMA(KP433883739, Tp, KP974927912 * Tr) - (KP781831482 * Tq);
Ta = T6 + T9;
To = Tk + Tn;
Th = Td + Tg;
Cr[WS(csr, 3)] = FMA(KP623489801, Ta, T3) + FNMA(KP222520933, Th, KP900968867 * To);
/* Cr[7] is the plain sum of all seven pair differences. */
Cr[WS(csr, 7)] = T3 + To + Ta + Th;
Cr[WS(csr, 1)] = FMA(KP623489801, To, T3) + FNMA(KP900968867, Th, KP222520933 * Ta);
Cr[WS(csr, 5)] = FMA(KP623489801, Th, T3) + FNMA(KP900968867, Ta, KP222520933 * To);
}
{
/* Even-numbered outputs (Ci[2], Ci[4], Ci[6]; Cr[0], Cr[2], Cr[4], Cr[6]), built from the pair sums. */
E Tu, TA, Tx, TC, TE, TD;
Tu = Ts - Tt;
TA = Ty - Tz;
Tx = Tv - Tw;
Ci[WS(csi, 2)] = FMA(KP974927912, Tu, KP433883739 * Tx) + (KP781831482 * TA);
Ci[WS(csi, 6)] = FMA(KP974927912, Tx, KP433883739 * TA) - (KP781831482 * Tu);
Ci[WS(csi, 4)] = FNMS(KP781831482, Tx, KP974927912 * TA) - (KP433883739 * Tu);
TC = Tt + Ts;
TE = Tv + Tw;
TD = Ty + Tz;
Cr[WS(csr, 6)] = FMA(KP623489801, TC, TB) + FNMA(KP900968867, TD, KP222520933 * TE);
Cr[WS(csr, 2)] = FMA(KP623489801, TD, TB) + FNMA(KP900968867, TE, KP222520933 * TC);
Cr[WS(csr, 4)] = FMA(KP623489801, TE, TB) + FNMA(KP222520933, TD, KP900968867 * TC);
/* Cr[0] is the plain sum of all 14 inputs. */
Cr[0] = TB + TC + TE + TD;
}
}
}
}
/*
 * Descriptor for the non-FMA size-14 codelet: transform size, codelet name,
 * a four-entry count block, and a pointer to GENUS.  The first three counts
 * match the generated header comment (38 additions, 12 multiplications,
 * 24 fused multiply/add).
 */
static const kr2c_desc desc = {
     14,
     "r2cf_14",
     { 38, 12, 24, 0 },
     &GENUS
};

/* Hands r2cf_14 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_14)(planner *p)
{
     X(kr2c_register)(p, r2cf_14, &desc);
}
#endif

View File

@@ -0,0 +1,304 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cf_15 -include rdft/scalar/r2cf.h */
/*
* This function contains 64 FP additions, 35 FP multiplications,
* (or, 36 additions, 7 multiplications, 28 fused multiply/add),
* 45 stack variables, 8 constants, and 30 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_15, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): hard-coded real-to-complex
 * transform of size 15.
 *
 * Per loop pass it reads 15 real values, R0[WS(rs, k)] for k = 0..7 and
 * R1[WS(rs, k)] for k = 0..6, and writes Cr[WS(csr, k)] for k = 0..7 and
 * Ci[WS(csi, k)] for k = 1..7.  Ci[0] is never written here.
 *
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against rdft/scalar/r2cf.h.
 */
static void r2cf_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Among these, by literal value: KP951056516 = sin(2pi/5), KP559016994 = sqrt(5)/4, KP618033988 = (sqrt(5)-1)/2, KP866025403 = sqrt(3)/2. */
DK(KP910592997, +0.910592997310029334643087372129977886038870291);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP823639103, +0.823639103546331925877420039278190003029660514);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
E Ti, TR, TF, TM, TN, T7, Te, Tf, TV, TW, TX, Ts, Tv, TH, Tl;
E To, TG, TS, TT, TU;
{
/* First input triple: R0[0] with the pair R0[5], R1[2]; TR is the triple's sum, Ti the pair's difference. */
E TD, Tg, Th, TE;
TD = R0[0];
Tg = R0[WS(rs, 5)];
Th = R1[WS(rs, 2)];
TE = Th + Tg;
Ti = Tg - Th;
TR = TD + TE;
TF = FNMS(KP500000000, TE, TD);
}
{
/* Remaining four triples: a lone input (Tj, Tq, Tt, Tm) plus a pair whose sum and difference are both kept. */
E Tj, Tq, Tt, Tm, T3, Tk, Ta, Tr, Td, Tu, T6, Tn;
Tj = R1[WS(rs, 1)];
Tq = R0[WS(rs, 3)];
Tt = R1[WS(rs, 4)];
Tm = R0[WS(rs, 6)];
{
E T1, T2, T8, T9;
T1 = R0[WS(rs, 4)];
T2 = R1[WS(rs, 6)];
T3 = T1 - T2;
Tk = T1 + T2;
T8 = R1[WS(rs, 5)];
T9 = R1[0];
Ta = T8 - T9;
Tr = T8 + T9;
}
{
E Tb, Tc, T4, T5;
Tb = R0[WS(rs, 7)];
Tc = R0[WS(rs, 2)];
Td = Tb - Tc;
Tu = Tb + Tc;
T4 = R0[WS(rs, 1)];
T5 = R1[WS(rs, 3)];
T6 = T4 - T5;
Tn = T4 + T5;
}
TM = T6 - T3;
TN = Td - Ta;
T7 = T3 + T6;
Te = Ta + Td;
Tf = T7 + Te;
/* TS, TT, TV, TW are the full sums of the four triples. */
TV = Tq + Tr;
TW = Tt + Tu;
TX = TV + TW;
Ts = FNMS(KP500000000, Tr, Tq);
Tv = FNMS(KP500000000, Tu, Tt);
TH = Ts + Tv;
Tl = FNMS(KP500000000, Tk, Tj);
To = FNMS(KP500000000, Tn, Tm);
TG = Tl + To;
TS = Tj + Tk;
TT = Tm + Tn;
TU = TS + TT;
}
/* Ci[5] uses only the pair differences (Tf accumulates four of them, Ti is the fifth). */
Ci[WS(csi, 5)] = KP866025403 * (Tf - Ti);
{
E TK, TQ, TO, TI, TJ, TP, TL;
TK = TG - TH;
TQ = FNMS(KP618033988, TM, TN);
TO = FMA(KP618033988, TN, TM);
TI = TG + TH;
TJ = FNMS(KP250000000, TI, TF);
Cr[WS(csr, 5)] = TF + TI;
TP = FNMS(KP559016994, TK, TJ);
Cr[WS(csr, 2)] = FMA(KP823639103, TQ, TP);
Cr[WS(csr, 7)] = FNMS(KP823639103, TQ, TP);
TL = FMA(KP559016994, TK, TJ);
Cr[WS(csr, 1)] = FMA(KP823639103, TO, TL);
Cr[WS(csr, 4)] = FNMS(KP823639103, TO, TL);
}
{
/* Outputs 0, 3 and 6 are built from the five triple sums only. */
E T11, T12, T10, TY, TZ;
T11 = TW - TV;
T12 = TS - TT;
Ci[WS(csi, 3)] = KP951056516 * (FMA(KP618033988, T12, T11));
Ci[WS(csi, 6)] = -(KP951056516 * (FNMS(KP618033988, T11, T12)));
T10 = TU - TX;
TY = TU + TX;
TZ = FNMS(KP250000000, TY, TR);
Cr[WS(csr, 3)] = FNMS(KP559016994, T10, TZ);
/* Cr[0] is the plain sum of all 15 inputs. */
Cr[0] = TR + TY;
Cr[WS(csr, 6)] = FMA(KP559016994, T10, TZ);
{
E Tx, TB, TA, TC;
{
E Tp, Tw, Ty, Tz;
Tp = Tl - To;
Tw = Ts - Tv;
Tx = FMA(KP618033988, Tw, Tp);
TB = FNMS(KP618033988, Tp, Tw);
Ty = FMA(KP250000000, Tf, Ti);
Tz = Te - T7;
TA = FMA(KP559016994, Tz, Ty);
TC = FNMS(KP559016994, Tz, Ty);
}
Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP910592997, TA, Tx)));
Ci[WS(csi, 7)] = KP951056516 * (FMA(KP910592997, TC, TB));
Ci[WS(csi, 4)] = KP951056516 * (FMA(KP910592997, TA, Tx));
Ci[WS(csi, 2)] = KP951056516 * (FNMS(KP910592997, TC, TB));
}
}
}
}
}
/*
 * Descriptor for the FMA size-15 codelet: transform size, codelet name,
 * a four-entry count block, and a pointer to GENUS.  The first three counts
 * match the generated header comment (36 additions, 7 multiplications,
 * 28 fused multiply/add).
 */
static const kr2c_desc desc = {
     15,
     "r2cf_15",
     { 36, 7, 28, 0 },
     &GENUS
};

/* Hands r2cf_15 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_15)(planner *p)
{
     X(kr2c_register)(p, r2cf_15, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cf_15 -include rdft/scalar/r2cf.h */
/*
* This function contains 64 FP additions, 25 FP multiplications,
* (or, 50 additions, 11 multiplications, 14 fused multiply/add),
* 47 stack variables, 10 constants, and 30 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_15, variant generated without -fma (the #else branch): hard-coded
 * real-to-complex transform of size 15.
 *
 * Per loop pass it reads 15 real values, R0[WS(rs, k)] for k = 0..7 and
 * R1[WS(rs, k)] for k = 0..6, and writes Cr[WS(csr, k)] for k = 0..7 and
 * Ci[WS(csi, k)] for k = 1..7.  Ci[0] is never written here.
 *
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against rdft/scalar/r2cf.h.
 */
static void r2cf_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Among these, by literal value: KP484122918 = sqrt(15)/8, KP216506350 = sqrt(3)/8, KP951056516 = sin(2pi/5), KP587785252 = sin(pi/5), KP559016994 = sqrt(5)/4, KP866025403 = sqrt(3)/2. */
DK(KP484122918, +0.484122918275927110647408174972799951354115213);
DK(KP216506350, +0.216506350946109661690930792688234045867850657);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP509036960, +0.509036960455127183450980863393907648510733164);
DK(KP823639103, +0.823639103546331925877420039278190003029660514);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
E Ti, TR, TL, TD, TE, T7, Te, Tf, TV, TW, TX, Tv, Ty, TH, To;
E Tr, TG, TS, TT, TU;
{
/* First input triple: R0[0] with the pair R0[5], R1[2]; TR is the triple's sum, Ti the pair's difference. */
E TJ, Tg, Th, TK;
TJ = R0[0];
Tg = R0[WS(rs, 5)];
Th = R1[WS(rs, 2)];
TK = Th + Tg;
Ti = Tg - Th;
TR = TJ + TK;
TL = FNMS(KP500000000, TK, TJ);
}
{
/* Remaining four triples: a lone input (Tm, Tt, Tw, Tp) plus a pair whose sum and difference are both kept. */
E Tm, Tt, Tw, Tp, T3, Tx, Ta, Tn, Td, Tq, T6, Tu;
Tm = R1[WS(rs, 1)];
Tt = R0[WS(rs, 3)];
Tw = R1[WS(rs, 4)];
Tp = R0[WS(rs, 6)];
{
E T1, T2, T8, T9;
T1 = R0[WS(rs, 7)];
T2 = R0[WS(rs, 2)];
T3 = T1 - T2;
Tx = T1 + T2;
T8 = R1[WS(rs, 6)];
T9 = R0[WS(rs, 4)];
Ta = T8 - T9;
Tn = T9 + T8;
}
{
E Tb, Tc, T4, T5;
Tb = R1[WS(rs, 3)];
Tc = R0[WS(rs, 1)];
Td = Tb - Tc;
Tq = Tc + Tb;
T4 = R1[0];
T5 = R1[WS(rs, 5)];
T6 = T4 - T5;
Tu = T5 + T4;
}
TD = Ta - Td;
TE = T6 + T3;
T7 = T3 - T6;
Te = Ta + Td;
Tf = T7 - Te;
/* TS, TT, TV, TW are the full sums of the four triples. */
TV = Tt + Tu;
TW = Tw + Tx;
TX = TV + TW;
Tv = FNMS(KP500000000, Tu, Tt);
Ty = FNMS(KP500000000, Tx, Tw);
TH = Tv + Ty;
To = FNMS(KP500000000, Tn, Tm);
Tr = FNMS(KP500000000, Tq, Tp);
TG = To + Tr;
TS = Tm + Tn;
TT = Tp + Tq;
TU = TS + TT;
}
/* Ci[5] uses only the pair differences (Tf accumulates four of them, Ti is the fifth). */
Ci[WS(csi, 5)] = KP866025403 * (Tf - Ti);
{
E TF, TP, TI, TM, TN, TQ, TO;
TF = FMA(KP823639103, TD, KP509036960 * TE);
TP = FNMS(KP509036960, TD, KP823639103 * TE);
TI = KP559016994 * (TG - TH);
TM = TG + TH;
TN = FNMS(KP250000000, TM, TL);
Cr[WS(csr, 5)] = TL + TM;
TQ = TN - TI;
Cr[WS(csr, 2)] = TP + TQ;
Cr[WS(csr, 7)] = TQ - TP;
TO = TI + TN;
Cr[WS(csr, 1)] = TF + TO;
Cr[WS(csr, 4)] = TO - TF;
}
{
/* Outputs 0, 3 and 6 are built from the five triple sums only. */
E T11, T12, T10, TY, TZ;
T11 = TS - TT;
T12 = TW - TV;
Ci[WS(csi, 3)] = FMA(KP587785252, T11, KP951056516 * T12);
Ci[WS(csi, 6)] = FNMS(KP951056516, T11, KP587785252 * T12);
T10 = KP559016994 * (TU - TX);
TY = TU + TX;
TZ = FNMS(KP250000000, TY, TR);
Cr[WS(csr, 3)] = TZ - T10;
/* Cr[0] is the plain sum of all 15 inputs. */
Cr[0] = TR + TY;
Cr[WS(csr, 6)] = T10 + TZ;
{
E Tl, TB, TA, TC;
{
E Tj, Tk, Ts, Tz;
Tj = FMA(KP866025403, Ti, KP216506350 * Tf);
Tk = KP484122918 * (Te + T7);
Tl = Tj + Tk;
TB = Tk - Tj;
Ts = To - Tr;
Tz = Tv - Ty;
TA = FMA(KP951056516, Ts, KP587785252 * Tz);
TC = FNMS(KP587785252, Ts, KP951056516 * Tz);
}
Ci[WS(csi, 1)] = Tl - TA;
Ci[WS(csi, 7)] = TC - TB;
Ci[WS(csi, 4)] = Tl + TA;
Ci[WS(csi, 2)] = TB + TC;
}
}
}
}
}
/*
 * Descriptor for the non-FMA size-15 codelet: transform size, codelet name,
 * a four-entry count block, and a pointer to GENUS.  The first three counts
 * match the generated header comment (50 additions, 11 multiplications,
 * 14 fused multiply/add).
 */
static const kr2c_desc desc = {
     15,
     "r2cf_15",
     { 50, 11, 14, 0 },
     &GENUS
};

/* Hands r2cf_15 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_15)(planner *p)
{
     X(kr2c_register)(p, r2cf_15, &desc);
}
#endif

View File

@@ -0,0 +1,288 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cf_16 -include rdft/scalar/r2cf.h */
/*
* This function contains 58 FP additions, 20 FP multiplications,
* (or, 38 additions, 0 multiplications, 20 fused multiply/add),
* 34 stack variables, 3 constants, and 32 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_16, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): hard-coded real-to-complex
 * transform of size 16.
 *
 * Per loop pass it reads 16 real values, R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..7, and writes Cr[WS(csr, k)] for k = 0..8 and Ci[WS(csi, k)] for
 * k = 1..7.  Ci[0] and Ci[8] are never written here.
 *
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against rdft/scalar/r2cf.h.
 */
static void r2cf_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* By their literal values: cos(pi/8), sqrt(2)/2, and tan(pi/8) = sqrt(2) - 1. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
E T3, T6, T7, TN, TB, Ta, Td, Te, TO, TE, Tm, TT, Ty, TI, Tt;
E TS, Tz, TL, TC, TD, TR, TU;
{
/* R0 entries 0, 4, 2, 6: pair sums/differences; T7 is the sum of all four. */
E T1, T2, T4, T5;
T1 = R0[0];
T2 = R0[WS(rs, 4)];
T3 = T1 + T2;
T4 = R0[WS(rs, 2)];
T5 = R0[WS(rs, 6)];
T6 = T4 + T5;
T7 = T3 + T6;
TN = T4 - T5;
TB = T1 - T2;
}
{
/* R0 entries 1, 5, 7, 3. */
E T8, T9, Tb, Tc;
T8 = R0[WS(rs, 1)];
T9 = R0[WS(rs, 5)];
Ta = T8 + T9;
TC = T8 - T9;
Tb = R0[WS(rs, 7)];
Tc = R0[WS(rs, 3)];
Td = Tb + Tc;
TD = Tb - Tc;
}
/* Te is the sum of R0 entries 1, 3, 5, 7. */
Te = Ta + Td;
TO = TD - TC;
TE = TC + TD;
{
/* R1 entries 0, 4, 2, 6; Ty is their sum. */
E Ti, TG, Tl, TH;
{
E Tg, Th, Tj, Tk;
Tg = R1[0];
Th = R1[WS(rs, 4)];
Ti = Tg + Th;
TG = Tg - Th;
Tj = R1[WS(rs, 2)];
Tk = R1[WS(rs, 6)];
Tl = Tj + Tk;
TH = Tj - Tk;
}
Tm = Ti - Tl;
TT = FMA(KP414213562, TG, TH);
Ty = Ti + Tl;
TI = FNMS(KP414213562, TH, TG);
}
{
/* R1 entries 7, 3, 1, 5; Tz is their sum.  Note TK is R1[5] - R1[1], the reverse of the other pair differences. */
E Tp, TJ, Ts, TK;
{
E Tn, To, Tq, Tr;
Tn = R1[WS(rs, 7)];
To = R1[WS(rs, 3)];
Tp = Tn + To;
TJ = Tn - To;
Tq = R1[WS(rs, 1)];
Tr = R1[WS(rs, 5)];
Ts = Tq + Tr;
TK = Tr - Tq;
}
Tt = Tp - Ts;
TS = FMA(KP414213562, TJ, TK);
Tz = Tp + Ts;
TL = FNMS(KP414213562, TK, TJ);
}
/* Cr[4] uses only R0 (entries 0,2,4,6 minus entries 1,3,5,7); Ci[4] uses only R1 (entries 1,3,5,7 minus entries 0,2,4,6). */
Cr[WS(csr, 4)] = T7 - Te;
Ci[WS(csi, 4)] = Tz - Ty;
{
E Tf, Tu, Tv, Tw;
Tf = T3 - T6;
Tu = Tm + Tt;
Cr[WS(csr, 6)] = FNMS(KP707106781, Tu, Tf);
Cr[WS(csr, 2)] = FMA(KP707106781, Tu, Tf);
Tv = Td - Ta;
Tw = Tt - Tm;
Ci[WS(csi, 2)] = FMA(KP707106781, Tw, Tv);
Ci[WS(csi, 6)] = FMS(KP707106781, Tw, Tv);
}
{
E Tx, TA, TF, TM;
/* Tx sums all of R0, TA sums all of R1: Cr[0] is the total, Cr[8] is the R0 sum minus the R1 sum. */
Tx = T7 + Te;
TA = Ty + Tz;
Cr[WS(csr, 8)] = Tx - TA;
Cr[0] = Tx + TA;
TF = FMA(KP707106781, TE, TB);
TM = TI + TL;
Cr[WS(csr, 7)] = FNMS(KP923879532, TM, TF);
Cr[WS(csr, 1)] = FMA(KP923879532, TM, TF);
}
TR = FNMS(KP707106781, TO, TN);
TU = TS - TT;
Ci[WS(csi, 1)] = FMS(KP923879532, TU, TR);
Ci[WS(csi, 7)] = FMA(KP923879532, TU, TR);
{
E TV, TW, TP, TQ;
TV = FNMS(KP707106781, TE, TB);
TW = TT + TS;
Cr[WS(csr, 5)] = FNMS(KP923879532, TW, TV);
Cr[WS(csr, 3)] = FMA(KP923879532, TW, TV);
TP = FMA(KP707106781, TO, TN);
TQ = TL - TI;
Ci[WS(csi, 3)] = FMA(KP923879532, TQ, TP);
Ci[WS(csi, 5)] = FMS(KP923879532, TQ, TP);
}
}
}
}
/*
 * Descriptor for the FMA size-16 codelet: transform size, codelet name,
 * a four-entry count block, and a pointer to GENUS.  The first three counts
 * match the generated header comment (38 additions, 0 multiplications,
 * 20 fused multiply/add).
 */
static const kr2c_desc desc = {
     16,
     "r2cf_16",
     { 38, 0, 20, 0 },
     &GENUS
};

/* Hands r2cf_16 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_16)(planner *p)
{
     X(kr2c_register)(p, r2cf_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cf_16 -include rdft/scalar/r2cf.h */
/*
* This function contains 58 FP additions, 12 FP multiplications,
* (or, 54 additions, 8 multiplications, 4 fused multiply/add),
* 34 stack variables, 3 constants, and 32 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_16, variant generated without -fma (the #else branch): hard-coded
 * real-to-complex transform of size 16.
 *
 * Per loop pass it reads 16 real values, R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..7, and writes Cr[WS(csr, k)] for k = 0..8 and Ci[WS(csi, k)] for
 * k = 1..7.  Ci[0] and Ci[8] are never written here.
 *
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against rdft/scalar/r2cf.h.
 */
static void r2cf_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* By their literal values: cos(pi/8), sin(pi/8), and sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
E T3, T6, T7, Tz, Ti, Ta, Td, Te, TA, Th, Tq, TV, TF, TP, Tx;
E TU, TE, TM, Tg, Tf, TJ, TQ;
{
/* R0 entries 0, 4, 2, 6: pair sums/differences; T7 is the sum of all four. */
E T1, T2, T4, T5;
T1 = R0[0];
T2 = R0[WS(rs, 4)];
T3 = T1 + T2;
T4 = R0[WS(rs, 2)];
T5 = R0[WS(rs, 6)];
T6 = T4 + T5;
T7 = T3 + T6;
Tz = T1 - T2;
Ti = T4 - T5;
}
{
/* R0 entries 1, 5, 7, 3. */
E T8, T9, Tb, Tc;
T8 = R0[WS(rs, 1)];
T9 = R0[WS(rs, 5)];
Ta = T8 + T9;
Tg = T8 - T9;
Tb = R0[WS(rs, 7)];
Tc = R0[WS(rs, 3)];
Td = Tb + Tc;
Tf = Tb - Tc;
}
/* Te is the sum of R0 entries 1, 3, 5, 7. */
Te = Ta + Td;
TA = KP707106781 * (Tg + Tf);
Th = KP707106781 * (Tf - Tg);
{
/* R1 entries 7, 3, 1, 5; TV is their sum. */
E Tm, TN, Tp, TO;
{
E Tk, Tl, Tn, To;
Tk = R1[WS(rs, 7)];
Tl = R1[WS(rs, 3)];
Tm = Tk - Tl;
TN = Tk + Tl;
Tn = R1[WS(rs, 1)];
To = R1[WS(rs, 5)];
Tp = Tn - To;
TO = Tn + To;
}
Tq = FNMS(KP923879532, Tp, KP382683432 * Tm);
TV = TN + TO;
TF = FMA(KP923879532, Tm, KP382683432 * Tp);
TP = TN - TO;
}
{
/* R1 entries 0, 4, 2, 6; TU is their sum. */
E Tt, TK, Tw, TL;
{
E Tr, Ts, Tu, Tv;
Tr = R1[0];
Ts = R1[WS(rs, 4)];
Tt = Tr - Ts;
TK = Tr + Ts;
Tu = R1[WS(rs, 2)];
Tv = R1[WS(rs, 6)];
Tw = Tu - Tv;
TL = Tu + Tv;
}
Tx = FMA(KP382683432, Tt, KP923879532 * Tw);
TU = TK + TL;
TE = FNMS(KP382683432, Tw, KP923879532 * Tt);
TM = TK - TL;
}
/* Cr[4] uses only R0 (entries 0,2,4,6 minus entries 1,3,5,7); Ci[4] uses only R1 (entries 1,3,5,7 minus entries 0,2,4,6). */
Cr[WS(csr, 4)] = T7 - Te;
Ci[WS(csi, 4)] = TV - TU;
{
E Tj, Ty, TD, TG;
Tj = Th - Ti;
Ty = Tq - Tx;
Ci[WS(csi, 1)] = Tj + Ty;
Ci[WS(csi, 7)] = Ty - Tj;
TD = Tz + TA;
TG = TE + TF;
Cr[WS(csr, 7)] = TD - TG;
Cr[WS(csr, 1)] = TD + TG;
}
{
E TB, TC, TH, TI;
TB = Tz - TA;
TC = Tx + Tq;
Cr[WS(csr, 5)] = TB - TC;
Cr[WS(csr, 3)] = TB + TC;
TH = Ti + Th;
TI = TF - TE;
Ci[WS(csi, 3)] = TH + TI;
Ci[WS(csi, 5)] = TI - TH;
}
TJ = T3 - T6;
TQ = KP707106781 * (TM + TP);
Cr[WS(csr, 6)] = TJ - TQ;
Cr[WS(csr, 2)] = TJ + TQ;
{
E TR, TS, TT, TW;
TR = Td - Ta;
TS = KP707106781 * (TP - TM);
Ci[WS(csi, 2)] = TR + TS;
Ci[WS(csi, 6)] = TS - TR;
/* TT sums all of R0, TW sums all of R1: Cr[0] is the total, Cr[8] is the R0 sum minus the R1 sum. */
TT = T7 + Te;
TW = TU + TV;
Cr[WS(csr, 8)] = TT - TW;
Cr[0] = TT + TW;
}
}
}
}
/*
 * Descriptor for the non-FMA size-16 codelet: transform size, codelet name,
 * a four-entry count block, and a pointer to GENUS.  The first three counts
 * match the generated header comment (54 additions, 8 multiplications,
 * 4 fused multiply/add).
 */
static const kr2c_desc desc = {
     16,
     "r2cf_16",
     { 54, 8, 4, 0 },
     &GENUS
};

/* Hands r2cf_16 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_16)(planner *p)
{
     X(kr2c_register)(p, r2cf_16, &desc);
}
#endif

View File

@@ -0,0 +1,86 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cf_2 -include rdft/scalar/r2cf.h */
/*
* This function contains 2 FP additions, 0 FP multiplications,
* (or, 2 additions, 0 multiplications, 0 fused multiply/add),
* 3 stack variables, 0 constants, and 4 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_2 (branch compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined): hard-coded real-to-complex transform of size 2.
 *
 * Each pass reads R0[0] and R1[0] and stores their difference in
 * Cr[WS(csr, 1)] and their sum in Cr[0].  Ci is advanced but never written.
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 */
static void r2cf_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT pass;

     for (pass = v; pass > 0;
          --pass, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
          MAKE_VOLATILE_STRIDE(8, rs),
          MAKE_VOLATILE_STRIDE(8, csr),
          MAKE_VOLATILE_STRIDE(8, csi)) {
          const E first = R0[0];
          const E second = R1[0];

          /* Store order is kept: difference first, then sum. */
          Cr[WS(csr, 1)] = first - second;
          Cr[0] = first + second;
     }
}
/*
 * Descriptor for the size-2 codelet (FMA branch): transform size, codelet
 * name, a four-entry count block, and a pointer to GENUS.  The first three
 * counts match the generated header comment (2 additions, 0 multiplications,
 * 0 fused multiply/add).
 */
static const kr2c_desc desc = {
     2,
     "r2cf_2",
     { 2, 0, 0, 0 },
     &GENUS
};

/* Hands r2cf_2 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_2)(planner *p)
{
     X(kr2c_register)(p, r2cf_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cf_2 -include rdft/scalar/r2cf.h */
/*
* This function contains 2 FP additions, 0 FP multiplications,
* (or, 2 additions, 0 multiplications, 0 fused multiply/add),
* 3 stack variables, 0 constants, and 4 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_2 (the #else branch, generated without -fma): hard-coded
 * real-to-complex transform of size 2.
 *
 * Each pass reads R0[0] and R1[0] and stores their difference in
 * Cr[WS(csr, 1)] and their sum in Cr[0].  Ci is advanced but never written.
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 */
static void r2cf_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT pass;

     for (pass = v; pass > 0;
          --pass, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
          MAKE_VOLATILE_STRIDE(8, rs),
          MAKE_VOLATILE_STRIDE(8, csr),
          MAKE_VOLATILE_STRIDE(8, csi)) {
          const E first = R0[0];
          const E second = R1[0];

          /* Store order is kept: difference first, then sum. */
          Cr[WS(csr, 1)] = first - second;
          Cr[0] = first + second;
     }
}
/*
 * Descriptor for the size-2 codelet (non-FMA branch): transform size, codelet
 * name, a four-entry count block, and a pointer to GENUS.  The first three
 * counts match the generated header comment (2 additions, 0 multiplications,
 * 0 fused multiply/add).
 */
static const kr2c_desc desc = {
     2,
     "r2cf_2",
     { 2, 0, 0, 0 },
     &GENUS
};

/* Hands r2cf_2 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_2)(planner *p)
{
     X(kr2c_register)(p, r2cf_2, &desc);
}
#endif

View File

@@ -0,0 +1,362 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:11 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include rdft/scalar/r2cf.h */
/*
* This function contains 86 FP additions, 32 FP multiplications,
* (or, 58 additions, 4 multiplications, 28 fused multiply/add),
* 51 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_20, FMA variant (compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): hard-coded real-to-complex
 * transform of size 20.
 *
 * Per loop pass it reads 20 real values, R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..9, and writes Cr[WS(csr, k)] for k = 0..10 and Ci[WS(csi, k)] for
 * k = 1..9.  Ci[0] and Ci[10] are never written here.
 *
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against rdft/scalar/r2cf.h.
 */
static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* By their literal values: sqrt(5)/4, 1/4, (sqrt(5)-1)/2, and sin(2pi/5). */
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
E T3, T1d, TJ, TV, T16, T1k, T1l, T19, Ta, Th, Ti, T1e, T1f, T1g, TP;
E TQ, TX, Tn, Ts, TK, TS, TT, TW, Ty, TD, TL;
{
/* First group of four inputs: R0[0], R0[5], R1[2], R1[7]. */
E T1, T2, TF, TG, TH, TI;
T1 = R0[0];
T2 = R0[WS(rs, 5)];
TF = T1 + T2;
TG = R1[WS(rs, 2)];
TH = R1[WS(rs, 7)];
TI = TG + TH;
T3 = T1 - T2;
T1d = TG - TH;
TJ = TF - TI;
TV = TF + TI;
}
{
/* Remaining 16 inputs, loaded as eight pairs; each pair yields a sum and a difference. */
E T6, To, Tx, T17, TC, T18, T9, Tj, Td, Tu, Tm, T15, Tr, T14, Tg;
E Tz;
{
E T4, T5, Tv, Tw;
T4 = R0[WS(rs, 2)];
T5 = R0[WS(rs, 7)];
T6 = T4 - T5;
To = T4 + T5;
Tv = R1[WS(rs, 6)];
Tw = R1[WS(rs, 1)];
Tx = Tv + Tw;
T17 = Tw - Tv;
}
{
E TA, TB, T7, T8;
TA = R1[WS(rs, 8)];
TB = R1[WS(rs, 3)];
TC = TA + TB;
T18 = TB - TA;
T7 = R0[WS(rs, 8)];
T8 = R0[WS(rs, 3)];
T9 = T7 - T8;
Tj = T7 + T8;
}
{
E Tb, Tc, Tk, Tl;
Tb = R0[WS(rs, 4)];
Tc = R0[WS(rs, 9)];
Td = Tb - Tc;
Tu = Tb + Tc;
Tk = R1[0];
Tl = R1[WS(rs, 5)];
Tm = Tk + Tl;
T15 = Tl - Tk;
}
{
E Tp, Tq, Te, Tf;
Tp = R1[WS(rs, 4)];
Tq = R1[WS(rs, 9)];
Tr = Tp + Tq;
T14 = Tq - Tp;
Te = R0[WS(rs, 6)];
Tf = R0[WS(rs, 1)];
Tg = Te - Tf;
Tz = Te + Tf;
}
/* Second-level combinations of the pair sums and differences, shared by the output sections below. */
T16 = T14 - T15;
T1k = T6 - T9;
T1l = Td - Tg;
T19 = T17 - T18;
Ta = T6 + T9;
Th = Td + Tg;
Ti = Ta + Th;
T1e = T14 + T15;
T1f = T17 + T18;
T1g = T1e + T1f;
TP = Tu + Tx;
TQ = Tz + TC;
TX = TP + TQ;
Tn = Tj - Tm;
Ts = To - Tr;
TK = Ts + Tn;
TS = To + Tr;
TT = Tj + Tm;
TW = TS + TT;
Ty = Tu - Tx;
TD = Tz - TC;
TL = Ty + TD;
}
/* Cr[5] is built from R0 pair differences only; Ci[5] from R1 pair differences only. */
Cr[WS(csr, 5)] = T3 + Ti;
Ci[WS(csi, 5)] = T1g - T1d;
{
E Tt, TE, TR, TU;
Tt = Tn - Ts;
TE = Ty - TD;
Ci[WS(csi, 6)] = KP951056516 * (FNMS(KP618033988, TE, Tt));
Ci[WS(csi, 2)] = KP951056516 * (FMA(KP618033988, Tt, TE));
TR = TP - TQ;
TU = TS - TT;
Ci[WS(csi, 8)] = -(KP951056516 * (FNMS(KP618033988, TU, TR)));
Ci[WS(csi, 4)] = KP951056516 * (FMA(KP618033988, TR, TU));
}
{
E T10, TY, TZ, TO, TM, TN;
T10 = TW - TX;
TY = TW + TX;
TZ = FNMS(KP250000000, TY, TV);
Cr[WS(csr, 4)] = FMA(KP559016994, T10, TZ);
/* Cr[0] is the plain sum of all 20 inputs. */
Cr[0] = TV + TY;
Cr[WS(csr, 8)] = FNMS(KP559016994, T10, TZ);
TO = TK - TL;
TM = TK + TL;
TN = FNMS(KP250000000, TM, TJ);
Cr[WS(csr, 2)] = FNMS(KP559016994, TO, TN);
/* Cr[10] is the sum of all R0 inputs minus the sum of all R1 inputs. */
Cr[WS(csr, 10)] = TJ + TM;
Cr[WS(csr, 6)] = FMA(KP559016994, TO, TN);
}
{
/* Odd-numbered real outputs Cr[1], Cr[3], Cr[7], Cr[9]. */
E T1a, T1c, T13, T1b, T11, T12;
T1a = FMA(KP618033988, T19, T16);
T1c = FNMS(KP618033988, T16, T19);
T11 = FNMS(KP250000000, Ti, T3);
T12 = Ta - Th;
T13 = FMA(KP559016994, T12, T11);
T1b = FNMS(KP559016994, T12, T11);
Cr[WS(csr, 9)] = FNMS(KP951056516, T1a, T13);
Cr[WS(csr, 7)] = FMA(KP951056516, T1c, T1b);
Cr[WS(csr, 1)] = FMA(KP951056516, T1a, T13);
Cr[WS(csr, 3)] = FNMS(KP951056516, T1c, T1b);
}
{
/* Odd-numbered imaginary outputs Ci[1], Ci[3], Ci[7], Ci[9]. */
E T1m, T1o, T1j, T1n, T1h, T1i;
T1m = FMA(KP618033988, T1l, T1k);
T1o = FNMS(KP618033988, T1k, T1l);
T1h = FMA(KP250000000, T1g, T1d);
T1i = T1e - T1f;
T1j = FNMS(KP559016994, T1i, T1h);
T1n = FMA(KP559016994, T1i, T1h);
Ci[WS(csi, 1)] = -(FMA(KP951056516, T1m, T1j));
Ci[WS(csi, 7)] = FMA(KP951056516, T1o, T1n);
Ci[WS(csi, 9)] = FMS(KP951056516, T1m, T1j);
Ci[WS(csi, 3)] = FNMS(KP951056516, T1o, T1n);
}
}
}
}
/*
 * Descriptor for the FMA size-20 codelet: transform size, codelet name,
 * a four-entry count block, and a pointer to GENUS.  The first three counts
 * match the generated header comment (58 additions, 4 multiplications,
 * 28 fused multiply/add).
 */
static const kr2c_desc desc = {
     20,
     "r2cf_20",
     { 58, 4, 28, 0 },
     &GENUS
};

/* Hands r2cf_20 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_20)(planner *p)
{
     X(kr2c_register)(p, r2cf_20, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include rdft/scalar/r2cf.h */
/*
* This function contains 86 FP additions, 24 FP multiplications,
* (or, 74 additions, 12 multiplications, 12 fused multiply/add),
* 51 stack variables, 4 constants, and 40 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_20, variant generated without -fma (the #else branch): hard-coded
 * real-to-complex transform of size 20.
 *
 * Per loop pass it reads 20 real values, R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..9, and writes Cr[WS(csr, k)] for k = 0..10 and Ci[WS(csi, k)] for
 * k = 1..9.  Ci[0] and Ci[10] are never written here.
 *
 * The loop runs v times; after each pass R0 and R1 advance by ivs and
 * Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples and
 * Cr/Ci the real/imaginary output parts -- confirm against rdft/scalar/r2cf.h.
 */
static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* By their literal values: 1/4, sqrt(5)/4, sin(pi/5), and sin(2pi/5). */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT i;
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
E T3, T1m, TF, T17, Ts, TM, TN, Tz, Ta, Th, Ti, T1g, T1h, T1k, T10;
E T13, T19, TG, TH, TI, T1d, T1e, T1j, TT, TW, T18;
{
/* First group of four inputs: R0[0], R0[5], R1[7], R1[2]. */
E T1, T2, T15, TD, TE, T16;
T1 = R0[0];
T2 = R0[WS(rs, 5)];
T15 = T1 + T2;
TD = R1[WS(rs, 7)];
TE = R1[WS(rs, 2)];
T16 = TE + TD;
T3 = T1 - T2;
T1m = T15 + T16;
TF = TD - TE;
T17 = T15 - T16;
}
{
/* Remaining 16 inputs, loaded as eight pairs; each pair yields a sum and a difference. */
E T6, TU, Tv, T12, Ty, TZ, T9, TR, Td, TY, To, TS, Tr, TV, Tg;
E T11;
{
E T4, T5, Tt, Tu;
T4 = R0[WS(rs, 2)];
T5 = R0[WS(rs, 7)];
T6 = T4 - T5;
TU = T4 + T5;
Tt = R1[WS(rs, 8)];
Tu = R1[WS(rs, 3)];
Tv = Tt - Tu;
T12 = Tt + Tu;
}
{
E Tw, Tx, T7, T8;
Tw = R1[WS(rs, 6)];
Tx = R1[WS(rs, 1)];
Ty = Tw - Tx;
TZ = Tw + Tx;
T7 = R0[WS(rs, 8)];
T8 = R0[WS(rs, 3)];
T9 = T7 - T8;
TR = T7 + T8;
}
{
E Tb, Tc, Tm, Tn;
Tb = R0[WS(rs, 4)];
Tc = R0[WS(rs, 9)];
Td = Tb - Tc;
TY = Tb + Tc;
Tm = R1[0];
Tn = R1[WS(rs, 5)];
To = Tm - Tn;
TS = Tm + Tn;
}
{
E Tp, Tq, Te, Tf;
Tp = R1[WS(rs, 4)];
Tq = R1[WS(rs, 9)];
Tr = Tp - Tq;
TV = Tp + Tq;
Te = R0[WS(rs, 6)];
Tf = R0[WS(rs, 1)];
Tg = Te - Tf;
T11 = Te + Tf;
}
/* Second-level combinations of the pair sums and differences, shared by the output sections below. */
Ts = To - Tr;
TM = T6 - T9;
TN = Td - Tg;
Tz = Tv - Ty;
Ta = T6 + T9;
Th = Td + Tg;
Ti = Ta + Th;
T1g = TY + TZ;
T1h = T11 + T12;
T1k = T1g + T1h;
T10 = TY - TZ;
T13 = T11 - T12;
T19 = T10 + T13;
TG = Tr + To;
TH = Ty + Tv;
TI = TG + TH;
T1d = TU + TV;
T1e = TR + TS;
T1j = T1d + T1e;
TT = TR - TS;
TW = TU - TV;
T18 = TW + TT;
}
/* Cr[5] is built from R0 pair differences only; Ci[5] from R1 pair differences only. */
Cr[WS(csr, 5)] = T3 + Ti;
Ci[WS(csi, 5)] = TF - TI;
{
E TX, T14, T1f, T1i;
TX = TT - TW;
T14 = T10 - T13;
Ci[WS(csi, 6)] = FNMS(KP587785252, T14, KP951056516 * TX);
Ci[WS(csi, 2)] = FMA(KP587785252, TX, KP951056516 * T14);
T1f = T1d - T1e;
T1i = T1g - T1h;
Ci[WS(csi, 8)] = FNMS(KP951056516, T1i, KP587785252 * T1f);
Ci[WS(csi, 4)] = FMA(KP951056516, T1f, KP587785252 * T1i);
}
{
E T1l, T1n, T1o, T1c, T1a, T1b;
T1l = KP559016994 * (T1j - T1k);
T1n = T1j + T1k;
T1o = FNMS(KP250000000, T1n, T1m);
Cr[WS(csr, 4)] = T1l + T1o;
/* Cr[0] is the plain sum of all 20 inputs. */
Cr[0] = T1m + T1n;
Cr[WS(csr, 8)] = T1o - T1l;
T1c = KP559016994 * (T18 - T19);
T1a = T18 + T19;
T1b = FNMS(KP250000000, T1a, T17);
Cr[WS(csr, 2)] = T1b - T1c;
/* Cr[10] is the sum of all R0 inputs minus the sum of all R1 inputs. */
Cr[WS(csr, 10)] = T17 + T1a;
Cr[WS(csr, 6)] = T1c + T1b;
}
{
/* Odd-numbered real outputs Cr[1], Cr[3], Cr[7], Cr[9]. */
E TA, TC, Tl, TB, Tj, Tk;
TA = FMA(KP951056516, Ts, KP587785252 * Tz);
TC = FNMS(KP587785252, Ts, KP951056516 * Tz);
Tj = KP559016994 * (Ta - Th);
Tk = FNMS(KP250000000, Ti, T3);
Tl = Tj + Tk;
TB = Tk - Tj;
Cr[WS(csr, 9)] = Tl - TA;
Cr[WS(csr, 7)] = TB + TC;
Cr[WS(csr, 1)] = Tl + TA;
Cr[WS(csr, 3)] = TB - TC;
}
{
/* Odd-numbered imaginary outputs Ci[1], Ci[3], Ci[7], Ci[9]. */
E TO, TQ, TL, TP, TJ, TK;
TO = FMA(KP951056516, TM, KP587785252 * TN);
TQ = FNMS(KP587785252, TM, KP951056516 * TN);
TJ = FMA(KP250000000, TI, TF);
TK = KP559016994 * (TH - TG);
TL = TJ + TK;
TP = TK - TJ;
Ci[WS(csi, 1)] = TL - TO;
Ci[WS(csi, 7)] = TQ + TP;
Ci[WS(csi, 9)] = TO + TL;
Ci[WS(csi, 3)] = TP - TQ;
}
}
}
}
/*
 * Descriptor for the non-FMA size-20 codelet: transform size, codelet name,
 * a four-entry count block, and a pointer to GENUS.  The first three counts
 * match the generated header comment (74 additions, 12 multiplications,
 * 12 fused multiply/add).
 */
static const kr2c_desc desc = {
     20,
     "r2cf_20",
     { 74, 12, 12, 0 },
     &GENUS
};

/* Hands r2cf_20 and its descriptor to X(kr2c_register) for planner p. */
void X(codelet_r2cf_20)(planner *p)
{
     X(kr2c_register)(p, r2cf_20, &desc);
}
#endif

View File

@@ -0,0 +1,736 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:11 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cf_25 -include rdft/scalar/r2cf.h */
/*
* This function contains 200 FP additions, 168 FP multiplications,
* (or, 44 additions, 12 multiplications, 156 fused multiply/add),
* 127 stack variables, 66 constants, and 50 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_25 (variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): fully unrolled size-25 kernel,
 * generated by gen_r2cf with -fma.
 *
 * Each of the v loop iterations
 *   - loads 25 inputs: R0[0], R0[WS(rs, 1..12)], R1[0], R1[WS(rs, 1..11)];
 *   - stores 25 outputs: Cr[0], Cr[WS(csr, 1..12)] and Ci[WS(csi, 1..12)]
 *     (Ci[0] is never written);
 *   - then advances R0/R1 by ivs and Cr/Ci by ovs.
 * Within one iteration every load precedes the first store.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed real samples and
 * Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numeric constants; each name encodes the leading digits of its value. */
DK(KP792626838, +0.792626838241819413632131824093538848057784557);
DK(KP876091699, +0.876091699473550838204498029706869638173524346);
DK(KP809385824, +0.809385824416008241660603814668679683846476688);
DK(KP860541664, +0.860541664367944677098261680920518816412804187);
DK(KP560319534, +0.560319534973832390111614715371676131169633784);
DK(KP681693190, +0.681693190061530575150324149145440022633095390);
DK(KP237294955, +0.237294955877110315393888866460840817927895961);
DK(KP897376177, +0.897376177523557693138608077137219684419427330);
DK(KP997675361, +0.997675361079556513670859573984492383596555031);
DK(KP923225144, +0.923225144846402650453449441572664695995209956);
DK(KP956723877, +0.956723877038460305821989399535483155872969262);
DK(KP949179823, +0.949179823508441261575555465843363271711583843);
DK(KP570584518, +0.570584518783621657366766175430996792655723863);
DK(KP669429328, +0.669429328479476605641803240971985825917022098);
DK(KP262346850, +0.262346850930607871785420028382979691334784273);
DK(KP906616052, +0.906616052148196230441134447086066874408359177);
DK(KP921078979, +0.921078979742360627699756128143719920817673854);
DK(KP845997307, +0.845997307939530944175097360758058292389769300);
DK(KP982009705, +0.982009705009746369461829878184175962711969869);
DK(KP876306680, +0.876306680043863587308115903922062583399064238);
DK(KP559154169, +0.559154169276087864842202529084232643714075927);
DK(KP683113946, +0.683113946453479238701949862233725244439656928);
DK(KP242145790, +0.242145790282157779872542093866183953459003101);
DK(KP968583161, +0.968583161128631119490168375464735813836012403);
DK(KP999754674, +0.999754674276473633366203429228112409535557487);
DK(KP904508497, +0.904508497187473712051146708591409529430077295);
DK(KP904730450, +0.904730450839922351881287709692877908104763647);
DK(KP916574801, +0.916574801383451584742370439148878693530976769);
DK(KP831864738, +0.831864738706457140726048799369896829771167132);
DK(KP829049696, +0.829049696159252993975487806364305442437946767);
DK(KP855719849, +0.855719849902058969314654733608091555096772472);
DK(KP952936919, +0.952936919628306576880750665357914584765951388);
DK(KP998026728, +0.998026728428271561952336806863450553336905220);
DK(KP690983005, +0.690983005625052575897706582817180941139845410);
DK(KP522616830, +0.522616830205754336872861364785224694908468440);
DK(KP772036680, +0.772036680810363904029489473607579825330539880);
DK(KP734762448, +0.734762448793050413546343770063151342619912334);
DK(KP803003575, +0.803003575438660414833440593570376004635464850);
DK(KP999544308, +0.999544308746292983948881682379742149196758193);
DK(KP992114701, +0.992114701314477831049793042785778521453036709);
DK(KP763932022, +0.763932022500210303590826331268723764559381640);
DK(KP894834959, +0.894834959464455102997960030820114611498661386);
DK(KP447417479, +0.447417479732227551498980015410057305749330693);
DK(KP867381224, +0.867381224396525206773171885031575671309956167);
DK(KP958953096, +0.958953096729998668045963838399037225970891871);
DK(KP912575812, +0.912575812670962425556968549836277086778922727);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP244189809, +0.244189809627953270309879511234821255780225091);
DK(KP522847744, +0.522847744331509716623755382187077770911012542);
DK(KP578046249, +0.578046249379945007321754579646815604023525655);
DK(KP269969613, +0.269969613759572083574752974412347470060951301);
DK(KP667278218, +0.667278218140296670899089292254759909713898805);
DK(KP494780565, +0.494780565770515410344588413655324772219443730);
DK(KP447533225, +0.447533225982656890041886979663652563063114397);
DK(KP603558818, +0.603558818296015001454675132653458027918768137);
DK(KP120146378, +0.120146378570687701782758537356596213647956445);
DK(KP869845200, +0.869845200362138853122720822420327157933056305);
DK(KP786782374, +0.786782374965295178365099601674911834788448471);
DK(KP132830569, +0.132830569247582714407653942074819768844536507);
DK(KP893101515, +0.893101515366181661711202267938416198338079437);
DK(KP066152395, +0.066152395967733048213034281011006031460903353);
DK(KP059835404, +0.059835404262124915169548397419498386427871950);
DK(KP987388751, +0.987388751065621252324603216482382109400433949);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT i;
/* One pass per transform: i counts down from v; pointers step by ivs/ovs. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
/* Intermediates that outlive the five input-gathering blocks below. */
E T2p, TJ, T2u, T1O, T2s, T2t, TB, T1c, T26, T2e, T1k, T1r, T1M, T21, T1B;
E T9, TX, T29, T2k, T1h, T1v, T1R, T1X, T1z, Ti, TQ, T2a, T2j, T1g, T1u;
E T1U, T1Y, T1y, Ts, T15, T27, T2f, T1j, T1s, T1J, T20, T1C, Tj, TC;
/* Input group 1: R0[0], R0[5], R1[7], R1[2], R0[10]. */
{
E TI, T2r, TF, T2q;
T2p = R0[0];
{
E TG, TH, TD, TE;
TG = R0[WS(rs, 5)];
TH = R1[WS(rs, 7)];
TI = TG - TH;
T2r = TG + TH;
TD = R1[WS(rs, 2)];
TE = R0[WS(rs, 10)];
TF = TD - TE;
T2q = TD + TE;
}
TJ = FMA(KP618033988, TI, TF);
T2u = T2q - T2r;
T1O = FNMS(KP618033988, TF, TI);
T2s = T2q + T2r;
T2t = FNMS(KP250000000, T2s, T2p);
}
/* Input group 2: R1[1], R0[4], R1[11], R1[6], R0[9]. */
{
E Tt, TA, T1a, T16, T17;
Tt = R1[WS(rs, 1)];
{
E Tu, Tv, Tw, Tx, Ty, Tz;
Tu = R0[WS(rs, 4)];
Tv = R1[WS(rs, 11)];
Tw = Tu + Tv;
Tx = R1[WS(rs, 6)];
Ty = R0[WS(rs, 9)];
Tz = Tx + Ty;
TA = Tw + Tz;
T1a = Tz - Tw;
T16 = Tv - Tu;
T17 = Tx - Ty;
}
TB = Tt + TA;
{
E T18, T1L, T1b, T1K, T19;
T18 = FNMS(KP618033988, T17, T16);
T1L = FMA(KP618033988, T16, T17);
T19 = FNMS(KP250000000, TA, Tt);
T1b = FNMS(KP559016994, T1a, T19);
T1K = FMA(KP559016994, T1a, T19);
T1c = FNMS(KP987388751, T1b, T18);
T26 = FNMS(KP059835404, T1L, T1K);
T2e = FMA(KP066152395, T1K, T1L);
T1k = FMA(KP893101515, T18, T1b);
T1r = FNMS(KP132830569, T1b, T18);
T1M = FNMS(KP786782374, T1L, T1K);
T21 = FMA(KP869845200, T1K, T1L);
T1B = FMA(KP120146378, T18, T1b);
}
}
/* Input group 3: R0[2], R1[4], R0[12], R0[7], R1[9]. */
{
E T1, T8, TV, TS, TU;
T1 = R0[WS(rs, 2)];
{
E T2, T3, T4, T5, T6, T7;
T2 = R1[WS(rs, 4)];
T3 = R0[WS(rs, 12)];
T4 = T2 + T3;
T5 = R0[WS(rs, 7)];
T6 = R1[WS(rs, 9)];
T7 = T5 + T6;
T8 = T4 + T7;
TV = T5 - T6;
TS = T4 - T7;
TU = T3 - T2;
}
T9 = T1 + T8;
{
E TW, T1P, TT, T1Q, TR;
TW = FNMS(KP618033988, TV, TU);
T1P = FMA(KP618033988, TU, TV);
TR = FMS(KP250000000, T8, T1);
TT = FNMS(KP559016994, TS, TR);
T1Q = FMA(KP559016994, TS, TR);
TX = FMA(KP603558818, TW, TT);
T29 = FNMS(KP447533225, T1P, T1Q);
T2k = FMA(KP494780565, T1Q, T1P);
T1h = FNMS(KP667278218, TT, TW);
T1v = FNMS(KP786782374, TW, TT);
T1R = FMA(KP132830569, T1Q, T1P);
T1X = FNMS(KP120146378, T1P, T1Q);
T1z = FMA(KP869845200, TT, TW);
}
}
/* Input group 4: R1[0], R0[3], R1[10], R1[5], R0[8]. */
{
E Ta, Th, TO, TK, TL;
Ta = R1[0];
{
E Tb, Tc, Td, Te, Tf, Tg;
Tb = R0[WS(rs, 3)];
Tc = R1[WS(rs, 10)];
Td = Tb + Tc;
Te = R1[WS(rs, 5)];
Tf = R0[WS(rs, 8)];
Tg = Te + Tf;
Th = Td + Tg;
TO = Td - Tg;
TK = Tb - Tc;
TL = Tf - Te;
}
Ti = Ta + Th;
{
E TM, T1S, TP, T1T, TN;
TM = FNMS(KP618033988, TL, TK);
T1S = FMA(KP618033988, TK, TL);
TN = FNMS(KP250000000, Th, Ta);
TP = FMA(KP559016994, TO, TN);
T1T = FNMS(KP559016994, TO, TN);
TQ = FMA(KP269969613, TP, TM);
T2a = FMA(KP578046249, T1T, T1S);
T2j = FNMS(KP522847744, T1S, T1T);
T1g = FNMS(KP244189809, TM, TP);
T1u = FNMS(KP603558818, TM, TP);
T1U = FNMS(KP987388751, T1T, T1S);
T1Y = FMA(KP893101515, T1S, T1T);
T1y = FMA(KP667278218, TP, TM);
}
}
/* Input group 5: R0[1], R1[3], R0[11], R0[6], R1[8]. */
{
E Tk, Tr, T13, TZ, T10;
Tk = R0[WS(rs, 1)];
{
E Tl, Tm, Tn, To, Tp, Tq;
Tl = R1[WS(rs, 3)];
Tm = R0[WS(rs, 11)];
Tn = Tl + Tm;
To = R0[WS(rs, 6)];
Tp = R1[WS(rs, 8)];
Tq = To + Tp;
Tr = Tn + Tq;
T13 = Tn - Tq;
TZ = Tm - Tl;
T10 = Tp - To;
}
Ts = Tk + Tr;
{
E T11, T1I, T14, T1H, T12;
T11 = FMA(KP618033988, T10, TZ);
T1I = FNMS(KP618033988, TZ, T10);
T12 = FMS(KP250000000, Tr, Tk);
T14 = FNMS(KP559016994, T13, T12);
T1H = FMA(KP559016994, T13, T12);
T15 = FMA(KP578046249, T14, T11);
T27 = FNMS(KP603558818, T1I, T1H);
T2f = FMA(KP667278218, T1H, T1I);
T1j = FNMS(KP522847744, T11, T14);
T1s = FMA(KP447533225, T11, T14);
T1J = FMA(KP059835404, T1I, T1H);
T20 = FNMS(KP066152395, T1H, T1I);
T1C = FNMS(KP494780565, T14, T11);
}
}
/* All inputs are loaded; everything below only combines and stores. */
/* Ci[5] and Ci[10] come from the per-group totals T9, Ti, Ts, TB. */
Tj = T9 - Ti;
TC = Ts - TB;
Ci[WS(csi, 5)] = KP951056516 * (FNMS(KP618033988, TC, Tj));
Ci[WS(csi, 10)] = KP951056516 * (FMA(KP618033988, Tj, TC));
/* Cr[0], Cr[5], Cr[10]; Cr[0] is the plain sum of all 25 inputs. */
{
E T39, T3c, T3e, T3a, T3b, T3d;
T39 = T2p + T2s;
T3a = T9 + Ti;
T3b = Ts + TB;
T3c = T3a + T3b;
T3e = T3a - T3b;
Cr[0] = T3c + T39;
T3d = FNMS(KP250000000, T3c, T39);
Cr[WS(csr, 5)] = FMA(KP559016994, T3e, T3d);
Cr[WS(csr, 10)] = FNMS(KP559016994, T3e, T3d);
}
/* Ci[4], Ci[9]. */
{
E T1A, T1x, T1F, T1G;
T1A = FNMS(KP912575812, T1z, T1y);
{
E T1t, T1w, T1E, T1D;
T1t = FMA(KP958953096, T1s, T1r);
T1w = FMA(KP912575812, T1v, T1u);
T1D = FNMS(KP867381224, T1C, T1B);
T1E = FMA(KP447417479, T1w, T1D);
T1x = FNMS(KP894834959, T1w, T1t);
T1F = FMA(KP763932022, T1E, T1t);
}
Ci[WS(csi, 4)] = KP951056516 * (FMA(KP992114701, T1x, TJ));
T1G = FMA(KP999544308, T1F, T1A);
Ci[WS(csi, 9)] = KP951056516 * (FNMS(KP803003575, T1G, TJ));
}
/* Ci[3], Ci[8]. */
{
E T1Z, T1N, T1W, T24, T1V, T23, T22, T25;
T1Z = FNMS(KP734762448, T1Y, T1X);
T1N = FNMS(KP772036680, T1M, T1J);
T1V = FMA(KP734762448, T1U, T1R);
T22 = FMA(KP772036680, T21, T20);
T23 = FNMS(KP522616830, T1V, T22);
T1W = FNMS(KP992114701, T1V, T1O);
T24 = FMA(KP690983005, T23, T1N);
Ci[WS(csi, 3)] = KP998026728 * (FNMS(KP952936919, T1W, T1N));
T25 = FNMS(KP855719849, T24, T1Z);
Ci[WS(csi, 8)] = -(KP951056516 * (FNMS(KP992114701, T25, T1O)));
}
/* Ci[1], Ci[6], Ci[11]. */
{
E T1i, T1l, T1e, T1p, T1n, TY, T1d, T1m, T1f, T1q, T1o;
T1i = FNMS(KP829049696, T1h, T1g);
T1l = FMA(KP831864738, T1k, T1j);
TY = FNMS(KP916574801, TX, TQ);
T1d = FMA(KP831864738, T1c, T15);
T1m = FMA(KP904730450, T1d, TY);
T1e = FNMS(KP904730450, T1d, TY);
T1p = FNMS(KP904508497, T1m, T1i);
T1n = FNMS(KP999754674, T1m, T1l);
Ci[WS(csi, 1)] = -(KP951056516 * (FMA(KP968583161, T1e, TJ)));
T1f = FNMS(KP242145790, T1e, TJ);
T1q = FMA(KP683113946, T1p, T1l);
T1o = FNMS(KP559154169, T1n, T1i);
Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP968583161, T1o, T1f)));
Ci[WS(csi, 11)] = -(KP951056516 * (FMA(KP876306680, T1q, T1f)));
}
/* Ci[2], Ci[12], Ci[7]. */
{
E T2l, T2c, T2n, T2i, T2d, T2o, T2m;
T2l = FNMS(KP982009705, T2k, T2j);
{
E T2g, T28, T2b, T2h;
T2g = FMA(KP845997307, T2f, T2e);
T28 = FNMS(KP845997307, T27, T26);
T2b = FNMS(KP921078979, T2a, T29);
T2h = FMA(KP906616052, T2b, T28);
T2c = FNMS(KP906616052, T2b, T28);
T2n = T2g + T2h;
T2i = FMA(KP618033988, T2h, T2g);
}
Ci[WS(csi, 2)] = -(KP998026728 * (FNMS(KP952936919, T1O, T2c)));
T2d = FMA(KP262346850, T2c, T1O);
T2o = FNMS(KP669429328, T2n, T2l);
T2m = FMA(KP570584518, T2l, T2i);
Ci[WS(csi, 12)] = KP951056516 * (FNMS(KP949179823, T2m, T2d));
Ci[WS(csi, 7)] = KP951056516 * (FNMS(KP876306680, T2o, T2d));
}
/* Cr[2], Cr[3], Cr[8], Cr[12], Cr[7]. */
{
E T2P, T2W, T2V, T2Z, T32, T33, T2S, T37, T35, T2Q, T2R, T34;
T2P = FNMS(KP559016994, T2u, T2t);
T2W = FNMS(KP734762448, T1U, T1R);
{
E T2U, T2T, T2Y, T2X;
T2U = FNMS(KP772036680, T21, T20);
T2T = FMA(KP734762448, T1Y, T1X);
T2X = FMA(KP772036680, T1M, T1J);
T2Y = FMA(KP522616830, T2T, T2X);
T2V = FMA(KP956723877, T2U, T2T);
T2Z = FNMS(KP763932022, T2Y, T2U);
}
T32 = FMA(KP845997307, T27, T26);
T33 = FMA(KP921078979, T2a, T29);
T2Q = FNMS(KP845997307, T2f, T2e);
T2R = FMA(KP982009705, T2k, T2j);
T34 = FNMS(KP923225144, T2R, T2Q);
T2S = FMA(KP923225144, T2R, T2Q);
T37 = FNMS(KP904508497, T34, T32);
T35 = FNMS(KP997675361, T34, T33);
Cr[WS(csr, 2)] = FMA(KP949179823, T2S, T2P);
Cr[WS(csr, 3)] = FMA(KP992114701, T2V, T2P);
{
E T30, T31, T38, T36;
T30 = FMA(KP855719849, T2Z, T2W);
Cr[WS(csr, 8)] = FNMS(KP897376177, T30, T2P);
T31 = FNMS(KP237294955, T2S, T2P);
T38 = FNMS(KP681693190, T37, T33);
T36 = FMA(KP560319534, T35, T32);
Cr[WS(csr, 12)] = FNMS(KP949179823, T36, T31);
Cr[WS(csr, 7)] = FNMS(KP860541664, T38, T31);
}
}
/* Cr[1], Cr[4], Cr[9], Cr[6], Cr[11]. */
{
E T2v, T2H, T2M, T2O, T2A, T2C, T2y, T2F, T2D, T2w, T2x, T2B;
T2v = FMA(KP559016994, T2u, T2t);
T2H = FNMS(KP912575812, T1v, T1u);
{
E T2I, T2K, T2L, T2J;
T2I = FMA(KP867381224, T1C, T1B);
T2J = FNMS(KP958953096, T1s, T1r);
T2K = FMA(KP912575812, T1z, T1y);
T2L = FNMS(KP447417479, T2K, T2J);
T2M = FNMS(KP690983005, T2L, T2I);
T2O = FNMS(KP809385824, T2K, T2I);
}
T2A = FMA(KP916574801, TX, TQ);
T2C = FNMS(KP831864738, T1c, T15);
T2w = FMA(KP829049696, T1h, T1g);
T2x = FNMS(KP831864738, T1k, T1j);
T2B = FMA(KP904730450, T2x, T2w);
T2y = FNMS(KP904730450, T2x, T2w);
T2F = T2A + T2B;
T2D = FMA(KP904730450, T2C, T2B);
Cr[WS(csr, 1)] = FMA(KP968583161, T2y, T2v);
Cr[WS(csr, 4)] = FNMS(KP992114701, T2O, T2v);
{
E T2N, T2z, T2G, T2E;
T2N = FNMS(KP999544308, T2M, T2H);
Cr[WS(csr, 9)] = FNMS(KP803003575, T2N, T2v);
T2z = FNMS(KP242145790, T2y, T2v);
T2G = FMA(KP683113946, T2F, T2C);
T2E = FNMS(KP618033988, T2D, T2A);
Cr[WS(csr, 6)] = FNMS(KP876091699, T2E, T2z);
Cr[WS(csr, 11)] = FNMS(KP792626838, T2G, T2z);
}
}
}
}
}
/*
 * Descriptor for the FMA-oriented r2cf_25 kernel.  The first three numbers
 * in the inner list (44, 12, 156) equal the additions, multiplications and
 * fused multiply/adds reported in the generator summary above the kernel.
 */
static const kr2c_desc desc = {
     25,
     "r2cf_25",
     { 44, 12, 156, 0 },
     &GENUS
};

/*
 * Registration hook: passes the r2cf_25 kernel and its descriptor to
 * X(kr2c_register) for planner p.
 */
void X(codelet_r2cf_25)(planner *p)
{
     X(kr2c_register)(p, r2cf_25, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cf_25 -include rdft/scalar/r2cf.h */
/*
* This function contains 200 FP additions, 140 FP multiplications,
* (or, 117 additions, 57 multiplications, 83 fused multiply/add),
* 101 stack variables, 40 constants, and 50 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_25 (variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined): fully unrolled size-25 kernel,
 * generated by gen_r2cf without -fma.
 *
 * Each of the v loop iterations
 *   - loads 25 inputs: R0[0], R0[WS(rs, 1..12)], R1[0], R1[WS(rs, 1..11)];
 *   - stores 25 outputs: Cr[0], Cr[WS(csr, 1..12)] and Ci[WS(csi, 1..12)]
 *     (Ci[0] is never written);
 *   - then advances R0/R1 by ivs and Cr/Ci by ovs.
 * Within one iteration every load precedes the first store.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed real samples and
 * Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numeric constants; a name like KP1_996053456 encodes the value 1.996053456.... */
DK(KP998026728, +0.998026728428271561952336806863450553336905220);
DK(KP125581039, +0.125581039058626752152356449131262266244969664);
DK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
DK(KP062790519, +0.062790519529313376076178224565631133122484832);
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
DK(KP309016994, +0.309016994374947424102293417182819058860154590);
DK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
DK(KP728968627, +0.728968627421411523146730319055259111372571664);
DK(KP963507348, +0.963507348203430549974383005744259307057084020);
DK(KP876306680, +0.876306680043863587308115903922062583399064238);
DK(KP497379774, +0.497379774329709576484567492012895936835134813);
DK(KP968583161, +0.968583161128631119490168375464735813836012403);
DK(KP684547105, +0.684547105928688673732283357621209269889519233);
DK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
DK(KP481753674, +0.481753674101715274987191502872129653528542010);
DK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
DK(KP248689887, +0.248689887164854788242283746006447968417567406);
DK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
DK(KP992114701, +0.992114701314477831049793042785778521453036709);
DK(KP250666467, +0.250666467128608490746237519633017587885836494);
DK(KP425779291, +0.425779291565072648862502445744251703979973042);
DK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
DK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
DK(KP770513242, +0.770513242775789230803009636396177847271667672);
DK(KP844327925, +0.844327925502015078548558063966681505381659241);
DK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
DK(KP125333233, +0.125333233564304245373118759816508793942918247);
DK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
DK(KP904827052, +0.904827052466019527713668647932697593970413911);
DK(KP851558583, +0.851558583130145297725004891488503407959946084);
DK(KP637423989, +0.637423989748689710176712811676016195434917298);
DK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
DK(KP535826794, +0.535826794978996618271308767867639978063575346);
DK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
DK(KP293892626, +0.293892626146236564584352977319536384298826219);
DK(KP475528258, +0.475528258147576786058219666689691071702849317);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT i;
/* One pass per transform: i counts down from v; pointers step by ivs/ovs. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
/* Intermediates shared between the input-gathering and output blocks. */
E T8, T1j, T1V, T1l, T7, T9, Ta, T12, T2u, T1O, T19, T1P, Ti, T2r, T1K;
E Tp, T1L, Tx, T2q, T1H, TE, T1I, TN, T2t, T1R, TU, T1S, T6, T1k, T3;
E T2s, T2v;
/* Input group 1: R0[0], R0[5], R1[7], R1[2], R0[10]. */
T8 = R0[0];
{
E T4, T5, T1, T2;
T4 = R0[WS(rs, 5)];
T5 = R1[WS(rs, 7)];
T6 = T4 + T5;
T1k = T4 - T5;
T1 = R1[WS(rs, 2)];
T2 = R0[WS(rs, 10)];
T3 = T1 + T2;
T1j = T1 - T2;
}
T1V = KP951056516 * T1k;
T1l = FMA(KP951056516, T1j, KP587785252 * T1k);
T7 = KP559016994 * (T3 - T6);
T9 = T3 + T6;
Ta = FNMS(KP250000000, T9, T8);
/* Input group 2: R1[1], R0[4], R1[11], R1[6], R0[9]. */
{
E T16, T13, T14, TY, T17, T11, T15, T18;
T16 = R1[WS(rs, 1)];
{
E TW, TX, TZ, T10;
TW = R0[WS(rs, 4)];
TX = R1[WS(rs, 11)];
T13 = TW + TX;
TZ = R1[WS(rs, 6)];
T10 = R0[WS(rs, 9)];
T14 = TZ + T10;
TY = TW - TX;
T17 = T13 + T14;
T11 = TZ - T10;
}
T12 = FMA(KP475528258, TY, KP293892626 * T11);
T2u = T16 + T17;
T1O = FNMS(KP293892626, TY, KP475528258 * T11);
T15 = KP559016994 * (T13 - T14);
T18 = FNMS(KP250000000, T17, T16);
T19 = T15 + T18;
T1P = T18 - T15;
}
/* Input group 3: R1[0], R0[3], R1[10], R1[5], R0[8]. */
{
E Tm, Tj, Tk, Te, Tn, Th, Tl, To;
Tm = R1[0];
{
E Tc, Td, Tf, Tg;
Tc = R0[WS(rs, 3)];
Td = R1[WS(rs, 10)];
Tj = Tc + Td;
Tf = R1[WS(rs, 5)];
Tg = R0[WS(rs, 8)];
Tk = Tf + Tg;
Te = Tc - Td;
Tn = Tj + Tk;
Th = Tf - Tg;
}
Ti = FMA(KP475528258, Te, KP293892626 * Th);
T2r = Tm + Tn;
T1K = FNMS(KP293892626, Te, KP475528258 * Th);
Tl = KP559016994 * (Tj - Tk);
To = FNMS(KP250000000, Tn, Tm);
Tp = Tl + To;
T1L = To - Tl;
}
/* Input group 4: R0[2], R1[4], R0[12], R0[7], R1[9]. */
{
E TB, Ty, Tz, Tt, TC, Tw, TA, TD;
TB = R0[WS(rs, 2)];
{
E Tr, Ts, Tu, Tv;
Tr = R1[WS(rs, 4)];
Ts = R0[WS(rs, 12)];
Ty = Tr + Ts;
Tu = R0[WS(rs, 7)];
Tv = R1[WS(rs, 9)];
Tz = Tu + Tv;
Tt = Tr - Ts;
TC = Ty + Tz;
Tw = Tu - Tv;
}
Tx = FMA(KP475528258, Tt, KP293892626 * Tw);
T2q = TB + TC;
T1H = FNMS(KP293892626, Tt, KP475528258 * Tw);
TA = KP559016994 * (Ty - Tz);
TD = FNMS(KP250000000, TC, TB);
TE = TA + TD;
T1I = TD - TA;
}
/* Input group 5: R0[1], R1[3], R0[11], R0[6], R1[8]. */
{
E TR, TO, TP, TJ, TS, TM, TQ, TT;
TR = R0[WS(rs, 1)];
{
E TH, TI, TK, TL;
TH = R1[WS(rs, 3)];
TI = R0[WS(rs, 11)];
TO = TH + TI;
TK = R0[WS(rs, 6)];
TL = R1[WS(rs, 8)];
TP = TK + TL;
TJ = TH - TI;
TS = TO + TP;
TM = TK - TL;
}
TN = FMA(KP475528258, TJ, KP293892626 * TM);
T2t = TR + TS;
T1R = FNMS(KP293892626, TJ, KP475528258 * TM);
TQ = KP559016994 * (TO - TP);
TT = FNMS(KP250000000, TS, TR);
TU = TQ + TT;
T1S = TT - TQ;
}
/* All inputs are loaded; everything below only combines and stores. */
/* Ci[5] and Ci[10] come from the per-group totals T2q, T2r, T2t, T2u. */
T2s = T2q - T2r;
T2v = T2t - T2u;
Ci[WS(csi, 5)] = FNMS(KP587785252, T2v, KP951056516 * T2s);
Ci[WS(csi, 10)] = FMA(KP587785252, T2s, KP951056516 * T2v);
/* Cr[0], Cr[5], Cr[10]; Cr[0] is the plain sum of all 25 inputs. */
{
E T2z, T2y, T2A, T2w, T2x, T2B;
T2z = T8 + T9;
T2w = T2r + T2q;
T2x = T2t + T2u;
T2y = KP559016994 * (T2w - T2x);
T2A = T2w + T2x;
Cr[0] = T2z + T2A;
T2B = FNMS(KP250000000, T2A, T2z);
Cr[WS(csr, 5)] = T2y + T2B;
Cr[WS(csr, 10)] = T2B - T2y;
}
/* Outputs 1, 4, 9, 11 and 6 of both Cr and Ci. */
{
E Tb, Tq, TF, TG, T1E, T1F, T1G, T1B, T1C, T1D, TV, T1a, T1b, T1o, T1r;
E T1s, T1z, T1x, T1e, T1h, T1i, T1u, T1t;
Tb = T7 + Ta;
Tq = FMA(KP1_688655851, Ti, KP535826794 * Tp);
TF = FMA(KP1_541026485, Tx, KP637423989 * TE);
TG = Tq - TF;
T1E = FMA(KP851558583, TN, KP904827052 * TU);
T1F = FMA(KP1_984229402, T12, KP125333233 * T19);
T1G = T1E + T1F;
T1B = FNMS(KP844327925, Tp, KP1_071653589 * Ti);
T1C = FNMS(KP1_274847979, Tx, KP770513242 * TE);
T1D = T1B + T1C;
TV = FNMS(KP425779291, TU, KP1_809654104 * TN);
T1a = FNMS(KP992114701, T19, KP250666467 * T12);
T1b = TV + T1a;
{
E T1m, T1n, T1p, T1q;
T1m = FMA(KP1_937166322, Ti, KP248689887 * Tp);
T1n = FMA(KP1_071653589, Tx, KP844327925 * TE);
T1o = T1m + T1n;
T1p = FMA(KP1_752613360, TN, KP481753674 * TU);
T1q = FMA(KP1_457937254, T12, KP684547105 * T19);
T1r = T1p + T1q;
T1s = T1o + T1r;
T1z = T1q - T1p;
T1x = T1n - T1m;
}
{
E T1c, T1d, T1f, T1g;
T1c = FNMS(KP497379774, Ti, KP968583161 * Tp);
T1d = FNMS(KP1_688655851, Tx, KP535826794 * TE);
T1e = T1c + T1d;
T1f = FNMS(KP963507348, TN, KP876306680 * TU);
T1g = FNMS(KP1_369094211, T12, KP728968627 * T19);
T1h = T1f + T1g;
T1i = T1e + T1h;
T1u = T1f - T1g;
T1t = T1d - T1c;
}
Cr[WS(csr, 1)] = Tb + T1i;
Ci[WS(csi, 1)] = -(T1l + T1s);
Cr[WS(csr, 4)] = Tb + TG + T1b;
Ci[WS(csi, 4)] = T1l + T1D - T1G;
Ci[WS(csi, 9)] = FMA(KP309016994, T1D, T1l) + FMA(KP587785252, T1a - TV, KP809016994 * T1G) - (KP951056516 * (Tq + TF));
Cr[WS(csr, 9)] = FMA(KP309016994, TG, Tb) + FMA(KP951056516, T1B - T1C, KP587785252 * (T1F - T1E)) - (KP809016994 * T1b);
{
E T1v, T1w, T1y, T1A;
T1v = FMS(KP250000000, T1s, T1l);
T1w = KP559016994 * (T1r - T1o);
Ci[WS(csi, 11)] = FMA(KP587785252, T1t, KP951056516 * T1u) + T1v - T1w;
Ci[WS(csi, 6)] = FMA(KP951056516, T1t, T1v) + FNMS(KP587785252, T1u, T1w);
T1y = FNMS(KP250000000, T1i, Tb);
T1A = KP559016994 * (T1e - T1h);
Cr[WS(csr, 11)] = FMA(KP587785252, T1x, T1y) + FNMA(KP951056516, T1z, T1A);
Cr[WS(csr, 6)] = FMA(KP951056516, T1x, T1A) + FMA(KP587785252, T1z, T1y);
}
}
/* Outputs 2, 3, 8, 7 and 12 of both Cr and Ci. */
{
E T1W, T1X, T1J, T1M, T1N, T21, T22, T23, T1Q, T1T, T1U, T1Y, T1Z, T20, T26;
E T29, T2a, T2k, T2j, T2l, T2m, T2d, T2o, T2i;
T1W = FNMS(KP587785252, T1j, T1V);
T1X = Ta - T7;
T1J = FNMS(KP125333233, T1I, KP1_984229402 * T1H);
T1M = FMA(KP1_457937254, T1K, KP684547105 * T1L);
T1N = T1J - T1M;
T21 = FNMS(KP1_996053456, T1R, KP062790519 * T1S);
T22 = FMA(KP1_541026485, T1O, KP637423989 * T1P);
T23 = T21 - T22;
T1Q = FNMS(KP770513242, T1P, KP1_274847979 * T1O);
T1T = FMA(KP125581039, T1R, KP998026728 * T1S);
T1U = T1Q - T1T;
T1Y = FNMS(KP1_369094211, T1K, KP728968627 * T1L);
T1Z = FMA(KP250666467, T1H, KP992114701 * T1I);
T20 = T1Y - T1Z;
{
E T24, T25, T27, T28;
T24 = FNMS(KP481753674, T1L, KP1_752613360 * T1K);
T25 = FMA(KP851558583, T1H, KP904827052 * T1I);
T26 = T24 - T25;
T27 = FNMS(KP844327925, T1S, KP1_071653589 * T1R);
T28 = FNMS(KP998026728, T1P, KP125581039 * T1O);
T29 = T27 + T28;
T2a = T26 + T29;
T2k = T27 - T28;
T2j = T24 + T25;
}
{
E T2b, T2c, T2g, T2h;
T2b = FNMS(KP425779291, T1I, KP1_809654104 * T1H);
T2c = FMA(KP963507348, T1K, KP876306680 * T1L);
T2l = T2c + T2b;
T2g = FMA(KP1_688655851, T1R, KP535826794 * T1S);
T2h = FMA(KP1_996053456, T1O, KP062790519 * T1P);
T2m = T2g + T2h;
T2d = T2b - T2c;
T2o = T2l + T2m;
T2i = T2g - T2h;
}
Ci[WS(csi, 2)] = T1W + T2a;
Cr[WS(csr, 2)] = T1X + T2o;
Ci[WS(csi, 3)] = T1N + T1U - T1W;
Cr[WS(csr, 3)] = T1X + T20 + T23;
Cr[WS(csr, 8)] = FMA(KP309016994, T20, T1X) + FNMA(KP809016994, T23, KP587785252 * (T1T + T1Q)) - (KP951056516 * (T1M + T1J));
Ci[WS(csi, 8)] = FNMS(KP587785252, T21 + T22, KP309016994 * T1N) + FNMA(KP809016994, T1U, KP951056516 * (T1Y + T1Z)) - T1W;
{
E T2e, T2f, T2n, T2p;
T2e = KP559016994 * (T26 - T29);
T2f = FNMS(KP250000000, T2a, T1W);
Ci[WS(csi, 7)] = FMA(KP951056516, T2d, T2e) + FNMS(KP587785252, T2i, T2f);
Ci[WS(csi, 12)] = FMA(KP587785252, T2d, T2f) + FMS(KP951056516, T2i, T2e);
T2n = KP559016994 * (T2l - T2m);
T2p = FNMS(KP250000000, T2o, T1X);
Cr[WS(csr, 7)] = FMA(KP951056516, T2j, KP587785252 * T2k) + T2n + T2p;
Cr[WS(csr, 12)] = FMA(KP587785252, T2j, T2p) + FNMA(KP951056516, T2k, T2n);
}
}
}
}
}
/*
 * Descriptor for the non-FMA r2cf_25 kernel.  The first three numbers in
 * the inner list (117, 57, 83) equal the additions, multiplications and
 * fused multiply/adds reported in the generator summary above the kernel.
 */
static const kr2c_desc desc = {
     25,
     "r2cf_25",
     { 117, 57, 83, 0 },
     &GENUS
};

/*
 * Registration hook: passes the r2cf_25 kernel and its descriptor to
 * X(kr2c_register) for planner p.
 */
void X(codelet_r2cf_25)(planner *p)
{
     X(kr2c_register)(p, r2cf_25, &desc);
}
#endif

View File

@@ -0,0 +1,96 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cf_3 -include rdft/scalar/r2cf.h */
/*
* This function contains 4 FP additions, 2 FP multiplications,
* (or, 3 additions, 1 multiplications, 1 fused multiply/add),
* 7 stack variables, 2 constants, and 6 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_3: unrolled size-3 kernel (branch compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Per iteration it reads R0[0], R1[0] and R0[WS(rs, 1)], then writes
 * Cr[WS(csr, 1)], Ci[WS(csi, 1)] and Cr[0], in that order.  The loop runs v
 * times; R0/R1 step by ivs and Cr/Ci step by ovs between iterations.
 *
 * NOTE(review): FNMS(a, b, c) is defined elsewhere; presumably c - a * b.
 */
static void r2cf_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT remaining;
          for (remaining = v; remaining > 0; --remaining, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
               /* Load all three inputs before storing anything. */
               E first = R0[0];
               E second = R1[0];
               E third = R0[WS(rs, 1)];
               E pair_sum = second + third;

               Cr[WS(csr, 1)] = FNMS(KP500000000, pair_sum, first);
               Ci[WS(csi, 1)] = KP866025403 * (third - second);
               Cr[0] = first + pair_sum;
          }
     }
}
/*
 * Descriptor for the r2cf_3 kernel.  The first three numbers in the inner
 * list (3, 1, 1) equal the additions, multiplications and fused
 * multiply/adds reported in the generator summary above the kernel.
 */
static const kr2c_desc desc = {
     3,
     "r2cf_3",
     { 3, 1, 1, 0 },
     &GENUS
};

/*
 * Registration hook: passes the r2cf_3 kernel and its descriptor to
 * X(kr2c_register) for planner p.
 */
void X(codelet_r2cf_3)(planner *p)
{
     X(kr2c_register)(p, r2cf_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cf_3 -include rdft/scalar/r2cf.h */
/*
* This function contains 4 FP additions, 2 FP multiplications,
* (or, 3 additions, 1 multiplications, 1 fused multiply/add),
* 7 stack variables, 2 constants, and 6 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_3: unrolled size-3 kernel (branch compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * Per iteration it reads R0[0], R1[0] and R0[WS(rs, 1)], then writes
 * Cr[WS(csr, 1)], Ci[WS(csi, 1)] and Cr[0], in that order.  The loop runs v
 * times; R0/R1 step by ivs and Cr/Ci step by ovs between iterations.
 *
 * NOTE(review): FNMS(a, b, c) is defined elsewhere; presumably c - a * b.
 */
static void r2cf_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT count;
          for (count = v; count > 0; --count, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
               /* Load all three inputs before storing anything. */
               E in0 = R0[0];
               E in1 = R1[0];
               E in2 = R0[WS(rs, 1)];
               E tail = in1 + in2;

               Cr[WS(csr, 1)] = FNMS(KP500000000, tail, in0);
               Ci[WS(csi, 1)] = KP866025403 * (in2 - in1);
               Cr[0] = in0 + tail;
          }
     }
}
/*
 * Descriptor for the r2cf_3 kernel.  The first three numbers in the inner
 * list (3, 1, 1) equal the additions, multiplications and fused
 * multiply/adds reported in the generator summary above the kernel.
 */
static const kr2c_desc desc = {
     3,
     "r2cf_3",
     { 3, 1, 1, 0 },
     &GENUS
};

/*
 * Registration hook: passes the r2cf_3 kernel and its descriptor to
 * X(kr2c_register) for planner p.
 */
void X(codelet_r2cf_3)(planner *p)
{
     X(kr2c_register)(p, r2cf_3, &desc);
}
#endif

View File

@@ -0,0 +1,610 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:11 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cf_32 -include rdft/scalar/r2cf.h */
/*
* This function contains 156 FP additions, 68 FP multiplications,
* (or, 88 additions, 0 multiplications, 68 fused multiply/add),
* 54 stack variables, 7 constants, and 64 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_32 (variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): fully unrolled size-32 kernel,
 * generated by gen_r2cf with -fma.
 *
 * Each of the v loop iterations
 *   - loads 32 inputs: R0[0], R0[WS(rs, 1..15)], R1[0], R1[WS(rs, 1..15)];
 *   - stores 32 outputs: Cr[0], Cr[WS(csr, 1..16)] and Ci[WS(csi, 1..15)]
 *     (Ci[0] and Ci[WS(csi, 16)] are never written);
 *   - then advances R0/R1 by ivs and Cr/Ci by ovs.
 * Within one iteration every load precedes the first store.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed real samples and
 * Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numeric constants; each name encodes the leading digits of its value. */
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
DK(KP668178637, +0.668178637919298919997757686523080761552472251);
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
DK(KP198912367, +0.198912367379658006911597622644676228597850501);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
{
INT i;
/* One pass per transform: i counts down from v; pointers step by ivs/ovs. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
/* Intermediates that outlive the input-gathering blocks below. */
E T7, T2b, Tv, T1h, Te, T2n, Ty, T1i, Tt, T2d, TF, T1l, Tm, T2c, TC;
E T1k, T1Z, T22, T2k, T2j, T1e, T1C, T19, T1B, T1S, T1V, T2h, T2g, TX, T1z;
E TS, T1y;
/* R0 inputs 0, 8, 4, 12. */
{
E T1, T2, T3, T4, T5, T6;
T1 = R0[0];
T2 = R0[WS(rs, 8)];
T3 = T1 + T2;
T4 = R0[WS(rs, 4)];
T5 = R0[WS(rs, 12)];
T6 = T4 + T5;
T7 = T3 + T6;
T2b = T3 - T6;
Tv = T1 - T2;
T1h = T4 - T5;
}
/* R0 inputs 2, 10, 14, 6. */
{
E Ta, Tw, Td, Tx;
{
E T8, T9, Tb, Tc;
T8 = R0[WS(rs, 2)];
T9 = R0[WS(rs, 10)];
Ta = T8 + T9;
Tw = T8 - T9;
Tb = R0[WS(rs, 14)];
Tc = R0[WS(rs, 6)];
Td = Tb + Tc;
Tx = Tb - Tc;
}
Te = Ta + Td;
T2n = Td - Ta;
Ty = Tw + Tx;
T1i = Tx - Tw;
}
/* R0 inputs 15, 7, 3, 11. */
{
E Tp, TD, Ts, TE;
{
E Tn, To, Tq, Tr;
Tn = R0[WS(rs, 15)];
To = R0[WS(rs, 7)];
Tp = Tn + To;
TD = Tn - To;
Tq = R0[WS(rs, 3)];
Tr = R0[WS(rs, 11)];
Ts = Tq + Tr;
TE = Tq - Tr;
}
Tt = Tp + Ts;
T2d = Tp - Ts;
TF = FMA(KP414213562, TE, TD);
T1l = FNMS(KP414213562, TD, TE);
}
/* R0 inputs 1, 9, 5, 13. */
{
E Ti, TA, Tl, TB;
{
E Tg, Th, Tj, Tk;
Tg = R0[WS(rs, 1)];
Th = R0[WS(rs, 9)];
Ti = Tg + Th;
TA = Tg - Th;
Tj = R0[WS(rs, 5)];
Tk = R0[WS(rs, 13)];
Tl = Tj + Tk;
TB = Tj - Tk;
}
Tm = Ti + Tl;
T2c = Ti - Tl;
TC = FNMS(KP414213562, TB, TA);
T1k = FMA(KP414213562, TA, TB);
}
/* R1 inputs 15, 7, 11, 3, 1, 9, 13, 5. */
{
E T11, T1X, T1c, T1Y, T14, T20, T17, T21, T1d, T18;
{
E TZ, T10, T1a, T1b;
TZ = R1[WS(rs, 15)];
T10 = R1[WS(rs, 7)];
T11 = TZ - T10;
T1X = TZ + T10;
T1a = R1[WS(rs, 11)];
T1b = R1[WS(rs, 3)];
T1c = T1a - T1b;
T1Y = T1b + T1a;
}
{
E T12, T13, T15, T16;
T12 = R1[WS(rs, 1)];
T13 = R1[WS(rs, 9)];
T14 = T12 - T13;
T20 = T12 + T13;
T15 = R1[WS(rs, 13)];
T16 = R1[WS(rs, 5)];
T17 = T15 - T16;
T21 = T15 + T16;
}
T1Z = T1X + T1Y;
T22 = T20 + T21;
T2k = T21 - T20;
T2j = T1X - T1Y;
T1d = T17 - T14;
T1e = FMA(KP707106781, T1d, T1c);
T1C = FNMS(KP707106781, T1d, T1c);
T18 = T14 + T17;
T19 = FMA(KP707106781, T18, T11);
T1B = FNMS(KP707106781, T18, T11);
}
/* R1 inputs 0, 8, 4, 12, 2, 10, 14, 6. */
{
E TK, T1Q, TV, T1R, TN, T1T, TQ, T1U, TW, TR;
{
E TI, TJ, TT, TU;
TI = R1[0];
TJ = R1[WS(rs, 8)];
TK = TI - TJ;
T1Q = TI + TJ;
TT = R1[WS(rs, 4)];
TU = R1[WS(rs, 12)];
TV = TT - TU;
T1R = TT + TU;
}
{
E TL, TM, TO, TP;
TL = R1[WS(rs, 2)];
TM = R1[WS(rs, 10)];
TN = TL - TM;
T1T = TL + TM;
TO = R1[WS(rs, 14)];
TP = R1[WS(rs, 6)];
TQ = TO - TP;
T1U = TO + TP;
}
T1S = T1Q + T1R;
T1V = T1T + T1U;
T2h = T1U - T1T;
T2g = T1Q - T1R;
TW = TN - TQ;
TX = FMA(KP707106781, TW, TV);
T1z = FNMS(KP707106781, TW, TV);
TR = TN + TQ;
TS = FMA(KP707106781, TR, TK);
T1y = FNMS(KP707106781, TR, TK);
}
/* All inputs are loaded; everything below only combines and stores. */
/* Cr[8], Ci[8], Cr[16], Cr[0]; Cr[0] is the plain sum of all 32 inputs. */
{
E Tf, Tu, T27, T28, T29, T2a;
Tf = T7 + Te;
Tu = Tm + Tt;
T27 = Tf + Tu;
T28 = T1S + T1V;
T29 = T1Z + T22;
T2a = T28 + T29;
Cr[WS(csr, 8)] = Tf - Tu;
Ci[WS(csi, 8)] = T29 - T28;
Cr[WS(csr, 16)] = T27 - T2a;
Cr[0] = T27 + T2a;
}
/* Outputs 4 and 12. */
{
E T1P, T25, T24, T26, T1W, T23;
T1P = T7 - Te;
T25 = Tt - Tm;
T1W = T1S - T1V;
T23 = T1Z - T22;
T24 = T1W + T23;
T26 = T23 - T1W;
Cr[WS(csr, 12)] = FNMS(KP707106781, T24, T1P);
Ci[WS(csi, 12)] = FMS(KP707106781, T26, T25);
Cr[WS(csr, 4)] = FMA(KP707106781, T24, T1P);
Ci[WS(csi, 4)] = FMA(KP707106781, T26, T25);
}
/* Outputs 2, 6, 10 and 14. */
{
E T2f, T2v, T2p, T2r, T2m, T2q, T2u, T2w, T2e, T2o;
T2e = T2c + T2d;
T2f = FMA(KP707106781, T2e, T2b);
T2v = FNMS(KP707106781, T2e, T2b);
T2o = T2d - T2c;
T2p = FNMS(KP707106781, T2o, T2n);
T2r = FMA(KP707106781, T2o, T2n);
{
E T2i, T2l, T2s, T2t;
T2i = FMA(KP414213562, T2h, T2g);
T2l = FNMS(KP414213562, T2k, T2j);
T2m = T2i + T2l;
T2q = T2l - T2i;
T2s = FNMS(KP414213562, T2g, T2h);
T2t = FMA(KP414213562, T2j, T2k);
T2u = T2s + T2t;
T2w = T2t - T2s;
}
Cr[WS(csr, 14)] = FNMS(KP923879532, T2m, T2f);
Ci[WS(csi, 14)] = FMS(KP923879532, T2u, T2r);
Cr[WS(csr, 2)] = FMA(KP923879532, T2m, T2f);
Ci[WS(csi, 2)] = FMA(KP923879532, T2u, T2r);
Ci[WS(csi, 6)] = FMS(KP923879532, T2q, T2p);
Cr[WS(csr, 6)] = FMA(KP923879532, T2w, T2v);
Ci[WS(csi, 10)] = FMA(KP923879532, T2q, T2p);
Cr[WS(csr, 10)] = FNMS(KP923879532, T2w, T2v);
}
/* Outputs 1, 7, 9 and 15. */
{
E TH, T1t, T1s, T1u, T1g, T1o, T1n, T1p;
{
E Tz, TG, T1q, T1r;
Tz = FMA(KP707106781, Ty, Tv);
TG = TC + TF;
TH = FMA(KP923879532, TG, Tz);
T1t = FNMS(KP923879532, TG, Tz);
T1q = FMA(KP198912367, T19, T1e);
T1r = FMA(KP198912367, TS, TX);
T1s = T1q - T1r;
T1u = T1r + T1q;
}
{
E TY, T1f, T1j, T1m;
TY = FNMS(KP198912367, TX, TS);
T1f = FNMS(KP198912367, T1e, T19);
T1g = TY + T1f;
T1o = T1f - TY;
T1j = FNMS(KP707106781, T1i, T1h);
T1m = T1k + T1l;
T1n = FNMS(KP923879532, T1m, T1j);
T1p = FMA(KP923879532, T1m, T1j);
}
Cr[WS(csr, 15)] = FNMS(KP980785280, T1g, TH);
Ci[WS(csi, 15)] = FMA(KP980785280, T1s, T1p);
Cr[WS(csr, 1)] = FMA(KP980785280, T1g, TH);
Ci[WS(csi, 1)] = FMS(KP980785280, T1s, T1p);
Ci[WS(csi, 7)] = FMA(KP980785280, T1o, T1n);
Cr[WS(csr, 7)] = FMA(KP980785280, T1u, T1t);
Ci[WS(csi, 9)] = FMS(KP980785280, T1o, T1n);
Cr[WS(csr, 9)] = FNMS(KP980785280, T1u, T1t);
}
/* Outputs 3, 5, 11 and 13. */
{
E T1x, T1N, T1M, T1O, T1E, T1I, T1H, T1J;
{
E T1v, T1w, T1K, T1L;
T1v = FNMS(KP707106781, Ty, Tv);
T1w = T1k - T1l;
T1x = FMA(KP923879532, T1w, T1v);
T1N = FNMS(KP923879532, T1w, T1v);
T1K = FNMS(KP668178637, T1y, T1z);
T1L = FNMS(KP668178637, T1B, T1C);
T1M = T1K - T1L;
T1O = T1K + T1L;
}
{
E T1A, T1D, T1F, T1G;
T1A = FMA(KP668178637, T1z, T1y);
T1D = FMA(KP668178637, T1C, T1B);
T1E = T1A + T1D;
T1I = T1D - T1A;
T1F = FMA(KP707106781, T1i, T1h);
T1G = TF - TC;
T1H = FNMS(KP923879532, T1G, T1F);
T1J = FMA(KP923879532, T1G, T1F);
}
Cr[WS(csr, 13)] = FNMS(KP831469612, T1E, T1x);
Ci[WS(csi, 13)] = FMS(KP831469612, T1M, T1J);
Cr[WS(csr, 3)] = FMA(KP831469612, T1E, T1x);
Ci[WS(csi, 3)] = FMA(KP831469612, T1M, T1J);
Ci[WS(csi, 5)] = FMS(KP831469612, T1I, T1H);
Cr[WS(csr, 5)] = FNMS(KP831469612, T1O, T1N);
Ci[WS(csi, 11)] = FMA(KP831469612, T1I, T1H);
Cr[WS(csr, 11)] = FMA(KP831469612, T1O, T1N);
}
}
}
}
/*
 * Descriptor for the size-32 kernel defined above in this branch: the
 * transform size, the kernel name, an operation-count tuple and a pointer
 * to GENUS.
 */
static const kr2c_desc desc = {
     32,
     "r2cf_32",
     { 88, 0, 68, 0 },
     &GENUS
};

/* Hands r2cf_32 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_32)(planner *p)
{
     X(kr2c_register)(p, r2cf_32, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cf_32 -include rdft/scalar/r2cf.h */
/*
* This function contains 156 FP additions, 42 FP multiplications,
* (or, 140 additions, 26 multiplications, 16 fused multiply/add),
* 54 stack variables, 7 constants, and 64 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_32 -- hard-coded size-32 real-to-complex codelet.  This is the
 * definition in the #else branch; its generator command line (see the
 * "Generated by" comment above) does not pass -fma.
 *
 * Each loop pass loads 16 values from R0 and 16 values from R1 (element k
 * is addressed as WS(rs, k), k = 0..15) and stores Cr[0..16] (addressed
 * through csr) and Ci[1..15] (addressed through csi).  Every load is
 * performed before the first store.
 *
 * Parameters:
 *   R0, R1        input arrays.  NOTE(review): presumably the even- and
 *                 odd-indexed real samples -- confirm against the r2c
 *                 codelet contract in the project headers.
 *   Cr, Ci        output arrays.  NOTE(review): presumably real and
 *                 imaginary parts of the spectrum -- confirm.
 *   rs, csr, csi  strides handed to WS() for the inputs, Cr and Ci.
 *   v             number of loop passes.
 *   ivs           amount added to R0 and R1 after each pass.
 *   ovs           amount added to Cr and Ci after each pass.
 *
 * FMA and FNMS are macros defined outside this view.  NOTE(review): they
 * are expected to mean FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b;
 * confirm in the project headers.
 */
static void r2cf_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* Numerically: sin and cos of 3*pi/16, of pi/16, of pi/8, and cos(pi/4). */
DK(KP555570233, +0.555570233019602224742830813948532874374937191);
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
DK(KP195090322, +0.195090322016128267848284868477022240927691618);
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* One pass per i; the MAKE_VOLATILE_STRIDE calls are part of the generated loop header. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
/* Temporaries that survive from the load stage into the output stages. */
E T7, T2b, Tv, T1l, Te, T2o, Ty, T1k, Tt, T2d, TF, T1h, Tm, T2c, TC;
E T1i, T1Z, T22, T2k, T2j, T1e, T1C, T19, T1B, T1S, T1V, T2h, T2g, TX, T1z;
E TS, T1y;
/* R0[0], R0[8], R0[4], R0[12]: pairwise sums and differences. */
{
E T1, T2, T3, T4, T5, T6;
T1 = R0[0];
T2 = R0[WS(rs, 8)];
T3 = T1 + T2;
T4 = R0[WS(rs, 4)];
T5 = R0[WS(rs, 12)];
T6 = T4 + T5;
T7 = T3 + T6;
T2b = T3 - T6;
Tv = T1 - T2;
T1l = T4 - T5;
}
/* R0[2], R0[10], R0[14], R0[6]: the difference terms are scaled by KP707106781. */
{
E Ta, Tw, Td, Tx;
{
E T8, T9, Tb, Tc;
T8 = R0[WS(rs, 2)];
T9 = R0[WS(rs, 10)];
Ta = T8 + T9;
Tw = T8 - T9;
Tb = R0[WS(rs, 14)];
Tc = R0[WS(rs, 6)];
Td = Tb + Tc;
Tx = Tb - Tc;
}
Te = Ta + Td;
T2o = Td - Ta;
Ty = KP707106781 * (Tw + Tx);
T1k = KP707106781 * (Tx - Tw);
}
/* R0[15], R0[7], R0[3], R0[11]: differences are combined with the KP923879532 / KP382683432 pair. */
{
E Tp, TD, Ts, TE;
{
E Tn, To, Tq, Tr;
Tn = R0[WS(rs, 15)];
To = R0[WS(rs, 7)];
Tp = Tn + To;
TD = Tn - To;
Tq = R0[WS(rs, 3)];
Tr = R0[WS(rs, 11)];
Ts = Tq + Tr;
TE = Tq - Tr;
}
Tt = Tp + Ts;
T2d = Tp - Ts;
TF = FMA(KP923879532, TD, KP382683432 * TE);
T1h = FNMS(KP923879532, TE, KP382683432 * TD);
}
/* R0[1], R0[9], R0[5], R0[13]: same pattern as the previous group. */
{
E Ti, TA, Tl, TB;
{
E Tg, Th, Tj, Tk;
Tg = R0[WS(rs, 1)];
Th = R0[WS(rs, 9)];
Ti = Tg + Th;
TA = Tg - Th;
Tj = R0[WS(rs, 5)];
Tk = R0[WS(rs, 13)];
Tl = Tj + Tk;
TB = Tj - Tk;
}
Tm = Ti + Tl;
T2c = Ti - Tl;
TC = FNMS(KP382683432, TB, KP923879532 * TA);
T1i = FMA(KP382683432, TA, KP923879532 * TB);
}
/* R1[15], R1[7], R1[3], R1[11], R1[1], R1[9], R1[13], R1[5]. */
{
E T11, T1X, T1d, T1Y, T14, T20, T17, T21, T1a, T18;
{
E TZ, T10, T1b, T1c;
TZ = R1[WS(rs, 15)];
T10 = R1[WS(rs, 7)];
T11 = TZ - T10;
T1X = TZ + T10;
T1b = R1[WS(rs, 3)];
T1c = R1[WS(rs, 11)];
T1d = T1b - T1c;
T1Y = T1b + T1c;
}
{
E T12, T13, T15, T16;
T12 = R1[WS(rs, 1)];
T13 = R1[WS(rs, 9)];
T14 = T12 - T13;
T20 = T12 + T13;
T15 = R1[WS(rs, 13)];
T16 = R1[WS(rs, 5)];
T17 = T15 - T16;
T21 = T15 + T16;
}
T1Z = T1X + T1Y;
T22 = T20 + T21;
T2k = T21 - T20;
T2j = T1X - T1Y;
T1a = KP707106781 * (T17 - T14);
T1e = T1a - T1d;
T1C = T1d + T1a;
T18 = KP707106781 * (T14 + T17);
T19 = T11 + T18;
T1B = T11 - T18;
}
/* R1[0], R1[8], R1[4], R1[12], R1[2], R1[10], R1[14], R1[6]; last loads of the pass. */
{
E TK, T1Q, TW, T1R, TN, T1T, TQ, T1U, TT, TR;
{
E TI, TJ, TU, TV;
TI = R1[0];
TJ = R1[WS(rs, 8)];
TK = TI - TJ;
T1Q = TI + TJ;
TU = R1[WS(rs, 4)];
TV = R1[WS(rs, 12)];
TW = TU - TV;
T1R = TU + TV;
}
{
E TL, TM, TO, TP;
TL = R1[WS(rs, 2)];
TM = R1[WS(rs, 10)];
TN = TL - TM;
T1T = TL + TM;
TO = R1[WS(rs, 14)];
TP = R1[WS(rs, 6)];
TQ = TO - TP;
T1U = TO + TP;
}
T1S = T1Q + T1R;
T1V = T1T + T1U;
T2h = T1U - T1T;
T2g = T1Q - T1R;
TT = KP707106781 * (TQ - TN);
TX = TT - TW;
T1z = TW + TT;
TR = KP707106781 * (TN + TQ);
TS = TK + TR;
T1y = TK - TR;
}
/* Outputs 0, 8 and 16: built from sums and differences only, no multiplications. */
{
E Tf, Tu, T27, T28, T29, T2a;
Tf = T7 + Te;
Tu = Tm + Tt;
T27 = Tf + Tu;
T28 = T1S + T1V;
T29 = T1Z + T22;
T2a = T28 + T29;
Cr[WS(csr, 8)] = Tf - Tu;
Ci[WS(csi, 8)] = T29 - T28;
Cr[WS(csr, 16)] = T27 - T2a;
Cr[0] = T27 + T2a;
}
/* Outputs 4 and 12. */
{
E T1P, T25, T24, T26, T1W, T23;
T1P = T7 - Te;
T25 = Tt - Tm;
T1W = T1S - T1V;
T23 = T1Z - T22;
T24 = KP707106781 * (T1W + T23);
T26 = KP707106781 * (T23 - T1W);
Cr[WS(csr, 12)] = T1P - T24;
Ci[WS(csi, 12)] = T26 - T25;
Cr[WS(csr, 4)] = T1P + T24;
Ci[WS(csi, 4)] = T25 + T26;
}
/* Outputs 2, 6, 10 and 14. */
{
E T2f, T2v, T2p, T2r, T2m, T2q, T2u, T2w, T2e, T2n;
T2e = KP707106781 * (T2c + T2d);
T2f = T2b + T2e;
T2v = T2b - T2e;
T2n = KP707106781 * (T2d - T2c);
T2p = T2n - T2o;
T2r = T2o + T2n;
{
E T2i, T2l, T2s, T2t;
T2i = FMA(KP923879532, T2g, KP382683432 * T2h);
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
T2m = T2i + T2l;
T2q = T2l - T2i;
T2s = FNMS(KP382683432, T2g, KP923879532 * T2h);
T2t = FMA(KP382683432, T2j, KP923879532 * T2k);
T2u = T2s + T2t;
T2w = T2t - T2s;
}
Cr[WS(csr, 14)] = T2f - T2m;
Ci[WS(csi, 14)] = T2u - T2r;
Cr[WS(csr, 2)] = T2f + T2m;
Ci[WS(csi, 2)] = T2r + T2u;
Ci[WS(csi, 6)] = T2p + T2q;
Cr[WS(csr, 6)] = T2v + T2w;
Ci[WS(csi, 10)] = T2q - T2p;
Cr[WS(csr, 10)] = T2v - T2w;
}
/* Outputs 1, 7, 9 and 15 (use the KP980785280 / KP195090322 pair). */
{
E TH, T1t, T1s, T1u, T1g, T1o, T1n, T1p;
{
E Tz, TG, T1q, T1r;
Tz = Tv + Ty;
TG = TC + TF;
TH = Tz + TG;
T1t = Tz - TG;
T1q = FNMS(KP195090322, TS, KP980785280 * TX);
T1r = FMA(KP195090322, T19, KP980785280 * T1e);
T1s = T1q + T1r;
T1u = T1r - T1q;
}
{
E TY, T1f, T1j, T1m;
TY = FMA(KP980785280, TS, KP195090322 * TX);
T1f = FNMS(KP195090322, T1e, KP980785280 * T19);
T1g = TY + T1f;
T1o = T1f - TY;
T1j = T1h - T1i;
T1m = T1k - T1l;
T1n = T1j - T1m;
T1p = T1m + T1j;
}
Cr[WS(csr, 15)] = TH - T1g;
Ci[WS(csi, 15)] = T1s - T1p;
Cr[WS(csr, 1)] = TH + T1g;
Ci[WS(csi, 1)] = T1p + T1s;
Ci[WS(csi, 7)] = T1n + T1o;
Cr[WS(csr, 7)] = T1t + T1u;
Ci[WS(csi, 9)] = T1o - T1n;
Cr[WS(csr, 9)] = T1t - T1u;
}
/* Outputs 3, 5, 11 and 13 (use the KP831469612 / KP555570233 pair). */
{
E T1x, T1N, T1M, T1O, T1E, T1I, T1H, T1J;
{
E T1v, T1w, T1K, T1L;
T1v = Tv - Ty;
T1w = T1i + T1h;
T1x = T1v + T1w;
T1N = T1v - T1w;
T1K = FNMS(KP555570233, T1y, KP831469612 * T1z);
T1L = FMA(KP555570233, T1B, KP831469612 * T1C);
T1M = T1K + T1L;
T1O = T1L - T1K;
}
{
E T1A, T1D, T1F, T1G;
T1A = FMA(KP831469612, T1y, KP555570233 * T1z);
T1D = FNMS(KP555570233, T1C, KP831469612 * T1B);
T1E = T1A + T1D;
T1I = T1D - T1A;
T1F = TF - TC;
T1G = T1l + T1k;
T1H = T1F - T1G;
T1J = T1G + T1F;
}
Cr[WS(csr, 13)] = T1x - T1E;
Ci[WS(csi, 13)] = T1M - T1J;
Cr[WS(csr, 3)] = T1x + T1E;
Ci[WS(csi, 3)] = T1J + T1M;
Ci[WS(csi, 5)] = T1H + T1I;
Cr[WS(csr, 5)] = T1N + T1O;
Ci[WS(csi, 11)] = T1I - T1H;
Cr[WS(csr, 11)] = T1N - T1O;
}
}
}
}
/*
 * Descriptor for the non-FMA size-32 kernel above.  The first three
 * entries of the inner tuple (140, 26, 16) repeat the additions,
 * multiplications and fused multiply/adds quoted in this variant's
 * statistics comment.
 */
static const kr2c_desc desc = {
     32,
     "r2cf_32",
     { 140, 26, 16, 0 },
     &GENUS
};

/* Hands r2cf_32 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_32)(planner *p)
{
     X(kr2c_register)(p, r2cf_32, &desc);
}
#endif

View File

@@ -0,0 +1,98 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cf_4 -include rdft/scalar/r2cf.h */
/*
* This function contains 6 FP additions, 0 FP multiplications,
* (or, 6 additions, 0 multiplications, 0 fused multiply/add),
* 7 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_4 -- hard-coded size-4 real-to-complex codelet; this definition is
 * the one compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * Each of the v passes reads R0[0], R0[WS(rs, 1)], R1[0] and
 * R1[WS(rs, 1)], then writes Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)] and
 * Ci[WS(csi, 1)] using additions and subtractions only.  After a pass
 * R0/R1 advance by ivs and Cr/Ci by ovs.
 */
static void r2cf_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT i;

     for (i = v; i > 0;
          --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
          MAKE_VOLATILE_STRIDE(16, rs),
          MAKE_VOLATILE_STRIDE(16, csr),
          MAKE_VOLATILE_STRIDE(16, csi)) {
          E a0, a1, b0, b1, sum_a, sum_b;

          /* Read every input before the first store. */
          a0 = R0[0];
          a1 = R0[WS(rs, 1)];
          b0 = R1[0];
          b1 = R1[WS(rs, 1)];

          sum_a = a0 + a1;
          sum_b = b0 + b1;

          Cr[WS(csr, 1)] = a0 - a1;
          Ci[WS(csi, 1)] = b1 - b0;
          Cr[WS(csr, 2)] = sum_a - sum_b;
          Cr[0] = sum_a + sum_b;
     }
}
/*
 * Descriptor for the size-4 kernel above.  The inner tuple starts with
 * the 6 additions, 0 multiplications and 0 fused multiply/adds quoted in
 * this variant's statistics comment.
 */
static const kr2c_desc desc = {
     4,
     "r2cf_4",
     { 6, 0, 0, 0 },
     &GENUS
};

/* Hands r2cf_4 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_4)(planner *p)
{
     X(kr2c_register)(p, r2cf_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cf_4 -include rdft/scalar/r2cf.h */
/*
* This function contains 6 FP additions, 0 FP multiplications,
* (or, 6 additions, 0 multiplications, 0 fused multiply/add),
* 7 stack variables, 0 constants, and 8 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_4 -- hard-coded size-4 real-to-complex codelet; this definition
 * sits in the #else branch, i.e. it is compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Each of the v passes reads two values from R0 and two from R1 (offsets
 * 0 and WS(rs, 1)), then stores Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)] and
 * Ci[WS(csi, 1)].  Only additions and subtractions are involved.  R0/R1
 * advance by ivs and Cr/Ci by ovs between passes.
 */
static void r2cf_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT i;

     for (i = v; i > 0;
          --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
          MAKE_VOLATILE_STRIDE(16, rs),
          MAKE_VOLATILE_STRIDE(16, csr),
          MAKE_VOLATILE_STRIDE(16, csi)) {
          E a0, a1, b0, b1, sum_a, sum_b;

          /* All loads come first, then the four stores in generator order. */
          a0 = R0[0];
          a1 = R0[WS(rs, 1)];
          b0 = R1[0];
          b1 = R1[WS(rs, 1)];

          sum_a = a0 + a1;
          sum_b = b0 + b1;

          Cr[WS(csr, 1)] = a0 - a1;
          Ci[WS(csi, 1)] = b1 - b0;
          Cr[WS(csr, 2)] = sum_a - sum_b;
          Cr[0] = sum_a + sum_b;
     }
}
/*
 * Descriptor for the non-FMA size-4 kernel above; the inner tuple starts
 * with the 6 additions, 0 multiplications and 0 fused multiply/adds from
 * this variant's statistics comment.
 */
static const kr2c_desc desc = {
     4,
     "r2cf_4",
     { 6, 0, 0, 0 },
     &GENUS
};

/* Hands r2cf_4 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_4)(planner *p)
{
     X(kr2c_register)(p, r2cf_4, &desc);
}
#endif

View File

@@ -0,0 +1,126 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cf_5 -include rdft/scalar/r2cf.h */
/*
* This function contains 12 FP additions, 7 FP multiplications,
* (or, 7 additions, 2 multiplications, 5 fused multiply/add),
* 17 stack variables, 4 constants, and 10 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_5 -- hard-coded size-5 real-to-complex codelet, FMA-oriented
 * variant (compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined).
 *
 * Each of the v passes reads R0[0..2] and R1[0..1] (offsets via
 * WS(rs, k)) and stores Cr[0..2] and Ci[1..2].  All loads precede the
 * first store.  R0/R1 advance by ivs and Cr/Ci by ovs between passes.
 *
 * Constants, numerically: sqrt(5)/4, 1/4, (sqrt(5) - 1)/2, sin(2*pi/5).
 */
static void r2cf_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(20, rs),
               MAKE_VOLATILE_STRIDE(20, csr),
               MAKE_VOLATILE_STRIDE(20, csi)) {
               E a0, a1, a2, b0, b1;
               E sum_p, sum_q, dif_p, dif_q, total, base, spread;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];

               /* Pair (b0, a2) and pair (a1, b1): sums and differences. */
               sum_p = b0 + a2;
               dif_p = a2 - b0;
               sum_q = a1 + b1;
               dif_q = a1 - b1;

               total = sum_p + sum_q;
               base = FNMS(KP250000000, total, a0);
               spread = sum_p - sum_q;

               Ci[WS(csi, 1)] = KP951056516 * (FNMS(KP618033988, dif_q, dif_p));
               Ci[WS(csi, 2)] = KP951056516 * (FMA(KP618033988, dif_p, dif_q));
               Cr[0] = a0 + total;
               Cr[WS(csr, 1)] = FMA(KP559016994, spread, base);
               Cr[WS(csr, 2)] = FNMS(KP559016994, spread, base);
          }
     }
}
/*
 * Descriptor for the FMA-oriented size-5 kernel above; the inner tuple
 * starts with the 7 additions, 2 multiplications and 5 fused
 * multiply/adds from this variant's statistics comment.
 */
static const kr2c_desc desc = {
     5,
     "r2cf_5",
     { 7, 2, 5, 0 },
     &GENUS
};

/* Hands r2cf_5 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_5)(planner *p)
{
     X(kr2c_register)(p, r2cf_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cf_5 -include rdft/scalar/r2cf.h */
/*
* This function contains 12 FP additions, 6 FP multiplications,
* (or, 9 additions, 3 multiplications, 3 fused multiply/add),
* 17 stack variables, 4 constants, and 10 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_5 -- hard-coded size-5 real-to-complex codelet, variant generated
 * without -fma (the #else branch).
 *
 * Each of the v passes reads R0[0..2] and R1[0..1] (offsets via
 * WS(rs, k)) and stores Cr[0..2] and Ci[1..2]; every load precedes the
 * first store.  R0/R1 advance by ivs and Cr/Ci by ovs between passes.
 *
 * Constants, numerically: 1/4, sqrt(5)/4, sin(pi/5), sin(2*pi/5).
 */
static void r2cf_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(20, rs),
               MAKE_VOLATILE_STRIDE(20, csr),
               MAKE_VOLATILE_STRIDE(20, csi)) {
               E a0, a1, a2, b0, b1;
               E sum_p, sum_q, dif_p, dif_q, total, spread, base;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];

               /* Pair (b0, a2) and pair (a1, b1): sums and differences. */
               sum_p = b0 + a2;
               dif_p = a2 - b0;
               sum_q = a1 + b1;
               dif_q = a1 - b1;

               total = sum_p + sum_q;
               spread = KP559016994 * (sum_p - sum_q);
               base = FNMS(KP250000000, total, a0);

               Ci[WS(csi, 1)] = FNMS(KP587785252, dif_q, KP951056516 * dif_p);
               Ci[WS(csi, 2)] = FMA(KP587785252, dif_p, KP951056516 * dif_q);
               Cr[0] = a0 + total;
               Cr[WS(csr, 1)] = spread + base;
               Cr[WS(csr, 2)] = base - spread;
          }
     }
}
/*
 * Descriptor for the non-FMA size-5 kernel above; the inner tuple starts
 * with the 9 additions, 3 multiplications and 3 fused multiply/adds from
 * this variant's statistics comment.
 */
static const kr2c_desc desc = {
     5,
     "r2cf_5",
     { 9, 3, 3, 0 },
     &GENUS
};

/* Hands r2cf_5 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_5)(planner *p)
{
     X(kr2c_register)(p, r2cf_5, &desc);
}
#endif

View File

@@ -0,0 +1,128 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cf_6 -include rdft/scalar/r2cf.h */
/*
* This function contains 14 FP additions, 4 FP multiplications,
* (or, 12 additions, 2 multiplications, 2 fused multiply/add),
* 17 stack variables, 2 constants, and 12 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_6 -- hard-coded size-6 real-to-complex codelet; definition used
 * when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Each of the v passes reads R0[0..2] and R1[0..2] (offsets via
 * WS(rs, k)) and stores Cr[0..3] and Ci[1..2]; every load precedes the
 * first store.  R0/R1 advance by ivs and Cr/Ci by ovs between passes.
 *
 * Constants, numerically: 1/2 and sqrt(3)/2.
 */
static void r2cf_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(24, rs),
               MAKE_VOLATILE_STRIDE(24, csr),
               MAKE_VOLATILE_STRIDE(24, csi)) {
               E a0, a1, a2, b0, b1, b2;
               E dif0, dif1, dif2, sum0, sum1, sum2, dif_acc, sum_acc;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];

               /* Pairs (a0, b1), (a1, b2), (a2, b0). */
               dif0 = a0 - b1;
               sum0 = a0 + b1;
               dif1 = a1 - b2;
               sum1 = a1 + b2;
               dif2 = a2 - b0;
               sum2 = a2 + b0;

               dif_acc = dif1 + dif2;
               sum_acc = sum1 + sum2;

               Ci[WS(csi, 1)] = KP866025403 * (dif2 - dif1);
               Cr[WS(csr, 1)] = FNMS(KP500000000, dif_acc, dif0);
               Cr[WS(csr, 3)] = dif0 + dif_acc;
               Ci[WS(csi, 2)] = KP866025403 * (sum1 - sum2);
               Cr[WS(csr, 2)] = FNMS(KP500000000, sum_acc, sum0);
               Cr[0] = sum0 + sum_acc;
          }
     }
}
/*
 * Descriptor for the size-6 kernel above; the inner tuple starts with the
 * 12 additions, 2 multiplications and 2 fused multiply/adds from this
 * variant's statistics comment.
 */
static const kr2c_desc desc = {
     6,
     "r2cf_6",
     { 12, 2, 2, 0 },
     &GENUS
};

/* Hands r2cf_6 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_6)(planner *p)
{
     X(kr2c_register)(p, r2cf_6, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cf_6 -include rdft/scalar/r2cf.h */
/*
* This function contains 14 FP additions, 4 FP multiplications,
* (or, 12 additions, 2 multiplications, 2 fused multiply/add),
* 17 stack variables, 2 constants, and 12 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_6 -- hard-coded size-6 real-to-complex codelet; definition in the
 * #else branch (generator run without -fma).
 *
 * Each of the v passes reads three values from R0 and three from R1
 * (offsets 0, WS(rs, 1), WS(rs, 2)) and stores Cr[0..3] and Ci[1..2].
 * Loads all happen before the first store.  R0/R1 advance by ivs and
 * Cr/Ci by ovs between passes.
 *
 * Constants, numerically: 1/2 and sqrt(3)/2.
 */
static void r2cf_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(24, rs),
               MAKE_VOLATILE_STRIDE(24, csr),
               MAKE_VOLATILE_STRIDE(24, csi)) {
               E a0, a1, a2, b0, b1, b2;
               E dif0, dif1, dif2, sum0, sum1, sum2, dif_acc, sum_acc;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];

               /* Pairs (a0, b1), (a1, b2), (a2, b0). */
               dif0 = a0 - b1;
               sum0 = a0 + b1;
               dif1 = a1 - b2;
               sum1 = a1 + b2;
               dif2 = a2 - b0;
               sum2 = a2 + b0;

               dif_acc = dif1 + dif2;
               sum_acc = sum1 + sum2;

               Ci[WS(csi, 1)] = KP866025403 * (dif2 - dif1);
               Cr[WS(csr, 1)] = FNMS(KP500000000, dif_acc, dif0);
               Cr[WS(csr, 3)] = dif0 + dif_acc;
               Ci[WS(csi, 2)] = KP866025403 * (sum1 - sum2);
               Cr[WS(csr, 2)] = FNMS(KP500000000, sum_acc, sum0);
               Cr[0] = sum0 + sum_acc;
          }
     }
}
/*
 * Descriptor for the non-FMA size-6 kernel above; the inner tuple starts
 * with the 12 additions, 2 multiplications and 2 fused multiply/adds from
 * this variant's statistics comment.
 */
static const kr2c_desc desc = {
     6,
     "r2cf_6",
     { 12, 2, 2, 0 },
     &GENUS
};

/* Hands r2cf_6 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_6)(planner *p)
{
     X(kr2c_register)(p, r2cf_6, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,148 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cf_7 -include rdft/scalar/r2cf.h */
/*
* This function contains 24 FP additions, 18 FP multiplications,
* (or, 9 additions, 3 multiplications, 15 fused multiply/add),
* 23 stack variables, 6 constants, and 14 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_7 -- hard-coded size-7 real-to-complex codelet, FMA-oriented
 * variant (compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined).
 *
 * Each of the v passes reads R0[0..3] and R1[0..2] (offsets via
 * WS(rs, k)) and stores Cr[0..3] and Ci[1..3]; every load precedes the
 * first store.  R0/R1 advance by ivs and Cr/Ci by ovs between passes.
 *
 * The three non-zero real outputs share one two-step FNMS chain with the
 * pair sums rotated; the imaginary outputs do the same with the pair
 * differences.
 */
static void r2cf_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(28, rs),
               MAKE_VOLATILE_STRIDE(28, csr),
               MAKE_VOLATILE_STRIDE(28, csi)) {
               E a0, a1, a2, a3, b0, b1, b2;
               E sum_a, sum_b, sum_c, dif_a, dif_b, dif_c;
               E re1_seed, re2_seed, re3_seed, re1_mix, re2_mix, re3_mix;
               E im1_mix, im2_mix, im3_mix;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               a3 = R0[WS(rs, 3)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];

               /* Pairs (b0, a3), (a1, b2), (b1, a2). */
               sum_a = b0 + a3;
               dif_a = a3 - b0;
               sum_b = a1 + b2;
               dif_b = b2 - a1;
               sum_c = b1 + a2;
               dif_c = a2 - b1;

               re3_seed = FNMS(KP356895867, sum_c, sum_b);
               re2_seed = FNMS(KP356895867, sum_a, sum_c);
               re1_seed = FNMS(KP356895867, sum_b, sum_a);

               re3_mix = FNMS(KP692021471, re3_seed, sum_a);
               re2_mix = FNMS(KP692021471, re2_seed, sum_b);
               re1_mix = FNMS(KP692021471, re1_seed, sum_c);

               im3_mix = FNMS(KP554958132, dif_a, dif_b);
               im2_mix = FMA(KP554958132, dif_b, dif_c);
               im1_mix = FMA(KP554958132, dif_c, dif_a);

               /* Stores stay in the generator's order. */
               Cr[WS(csr, 3)] = FNMS(KP900968867, re3_mix, a0);
               Ci[WS(csi, 3)] = KP974927912 * (FNMS(KP801937735, im3_mix, dif_c));
               Cr[WS(csr, 2)] = FNMS(KP900968867, re2_mix, a0);
               Ci[WS(csi, 2)] = KP974927912 * (FNMS(KP801937735, im2_mix, dif_a));
               Cr[0] = a0 + sum_a + sum_b + sum_c;
               Cr[WS(csr, 1)] = FNMS(KP900968867, re1_mix, a0);
               Ci[WS(csi, 1)] = KP974927912 * (FMA(KP801937735, im1_mix, dif_b));
          }
     }
}
/*
 * Descriptor for the FMA-oriented size-7 kernel above; the inner tuple
 * starts with the 9 additions, 3 multiplications and 15 fused
 * multiply/adds from this variant's statistics comment.
 */
static const kr2c_desc desc = {
     7,
     "r2cf_7",
     { 9, 3, 15, 0 },
     &GENUS
};

/* Hands r2cf_7 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_7)(planner *p)
{
     X(kr2c_register)(p, r2cf_7, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cf_7 -include rdft/scalar/r2cf.h */
/*
* This function contains 24 FP additions, 18 FP multiplications,
* (or, 12 additions, 6 multiplications, 12 fused multiply/add),
* 20 stack variables, 6 constants, and 14 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_7 -- hard-coded size-7 real-to-complex codelet, variant generated
 * without -fma (the #else branch).
 *
 * Each of the v passes reads R0[0..3] and R1[0..2] (offsets via
 * WS(rs, k)) and stores Cr[0..3] and Ci[1..3]; every load precedes the
 * first store.  R0/R1 advance by ivs and Cr/Ci by ovs between passes.
 *
 * Constants, numerically: cos(3*pi/7), cos(pi/7), cos(2*pi/7),
 * sin(pi/7), sin(2*pi/7), sin(3*pi/7).
 */
static void r2cf_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(28, rs),
               MAKE_VOLATILE_STRIDE(28, csr),
               MAKE_VOLATILE_STRIDE(28, csi)) {
               E a0, a1, a2, a3, b0, b1, b2;
               E sum_a, sum_b, sum_c, dif_a, dif_b, dif_c;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               a3 = R0[WS(rs, 3)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];

               /* Pairs (b0, a3), (a1, b2), (b1, a2). */
               sum_a = b0 + a3;
               dif_a = a3 - b0;
               sum_b = a1 + b2;
               dif_b = b2 - a1;
               sum_c = b1 + a2;
               dif_c = a2 - b1;

               /* Stores stay in the generator's order. */
               Ci[WS(csi, 2)] = FNMS(KP781831482, dif_c, KP974927912 * dif_a) - (KP433883739 * dif_b);
               Ci[WS(csi, 1)] = FMA(KP781831482, dif_a, KP974927912 * dif_b) + (KP433883739 * dif_c);
               Cr[WS(csr, 2)] = FMA(KP623489801, sum_c, a0) + FNMA(KP900968867, sum_b, KP222520933 * sum_a);
               Ci[WS(csi, 3)] = FMA(KP433883739, dif_a, KP974927912 * dif_c) - (KP781831482 * dif_b);
               Cr[WS(csr, 3)] = FMA(KP623489801, sum_b, a0) + FNMA(KP222520933, sum_c, KP900968867 * sum_a);
               Cr[WS(csr, 1)] = FMA(KP623489801, sum_a, a0) + FNMA(KP900968867, sum_c, KP222520933 * sum_b);
               Cr[0] = a0 + sum_a + sum_b + sum_c;
          }
     }
}
/*
 * Descriptor for the non-FMA size-7 kernel above; the inner tuple starts
 * with the 12 additions, 6 multiplications and 12 fused multiply/adds
 * from this variant's statistics comment.
 */
static const kr2c_desc desc = {
     7,
     "r2cf_7",
     { 12, 6, 12, 0 },
     &GENUS
};

/* Hands r2cf_7 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_7)(planner *p)
{
     X(kr2c_register)(p, r2cf_7, &desc);
}
#endif

View File

@@ -0,0 +1,154 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cf_8 -include rdft/scalar/r2cf.h */
/*
* This function contains 20 FP additions, 4 FP multiplications,
* (or, 16 additions, 0 multiplications, 4 fused multiply/add),
* 14 stack variables, 1 constant, and 16 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_8 -- hard-coded size-8 real-to-complex codelet, FMA-oriented
 * variant (compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined).
 *
 * Each of the v passes reads R0[0..3] and R1[0..3] (offsets via
 * WS(rs, k)) and stores Cr[0..4] and Ci[1..3]; every load precedes the
 * first store.  R0/R1 advance by ivs and Cr/Ci by ovs between passes.
 *
 * The single constant is numerically sqrt(1/2).
 */
static void r2cf_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(32, rs),
               MAKE_VOLATILE_STRIDE(32, csr),
               MAKE_VOLATILE_STRIDE(32, csi)) {
               E a0, a1, a2, a3, b0, b1, b2, b3;
               E a_sum02, a_dif02, a_sum13, a_dif13;
               E b_sum02, b_dif02, b_sum31, b_dif31;
               E b_dif_plus, b_dif_minus, a_total, b_total;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               a3 = R0[WS(rs, 3)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               b3 = R1[WS(rs, 3)];

               a_sum02 = a0 + a2;
               a_dif02 = a0 - a2;
               a_sum13 = a1 + a3;
               a_dif13 = a1 - a3;
               b_sum02 = b0 + b2;
               b_dif02 = b0 - b2;
               b_sum31 = b3 + b1;
               b_dif31 = b3 - b1;

               b_dif_plus = b_dif02 + b_dif31;
               b_dif_minus = b_dif31 - b_dif02;
               a_total = a_sum02 + a_sum13;
               b_total = b_sum02 + b_sum31;

               /* Stores stay in the generator's order. */
               Cr[WS(csr, 2)] = a_sum02 - a_sum13;
               Ci[WS(csi, 2)] = b_sum31 - b_sum02;
               Cr[WS(csr, 3)] = FNMS(KP707106781, b_dif_plus, a_dif02);
               Cr[WS(csr, 1)] = FMA(KP707106781, b_dif_plus, a_dif02);
               Ci[WS(csi, 1)] = FMS(KP707106781, b_dif_minus, a_dif13);
               Ci[WS(csi, 3)] = FMA(KP707106781, b_dif_minus, a_dif13);
               Cr[WS(csr, 4)] = a_total - b_total;
               Cr[0] = a_total + b_total;
          }
     }
}
/*
 * Descriptor for the FMA-oriented size-8 kernel above; the inner tuple
 * starts with the 16 additions, 0 multiplications and 4 fused
 * multiply/adds from this variant's statistics comment.
 */
static const kr2c_desc desc = {
     8,
     "r2cf_8",
     { 16, 0, 4, 0 },
     &GENUS
};

/* Hands r2cf_8 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_8)(planner *p)
{
     X(kr2c_register)(p, r2cf_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cf_8 -include rdft/scalar/r2cf.h */
/*
* This function contains 20 FP additions, 2 FP multiplications,
* (or, 20 additions, 2 multiplications, 0 fused multiply/add),
* 14 stack variables, 1 constant, and 16 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_8 -- hard-coded size-8 real-to-complex codelet, variant generated
 * without -fma (the #else branch): the two KP707106781 products are
 * formed explicitly and then added or subtracted.
 *
 * Each of the v passes reads R0[0..3] and R1[0..3] (offsets via
 * WS(rs, k)) and stores Cr[0..4] and Ci[1..3]; every load precedes the
 * first store.  R0/R1 advance by ivs and Cr/Ci by ovs between passes.
 */
static void r2cf_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(32, rs),
               MAKE_VOLATILE_STRIDE(32, csr),
               MAKE_VOLATILE_STRIDE(32, csi)) {
               E a0, a1, a2, a3, b0, b1, b2, b3;
               E a_sum02, a_dif02, a_sum13, a_dif13;
               E b_sum02, b_dif02, b_sum31, b_dif31;
               E scaled_plus, scaled_minus, a_total, b_total;

               a0 = R0[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               a3 = R0[WS(rs, 3)];
               b0 = R1[0];
               b1 = R1[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               b3 = R1[WS(rs, 3)];

               a_sum02 = a0 + a2;
               a_dif02 = a0 - a2;
               a_sum13 = a1 + a3;
               a_dif13 = a1 - a3;
               b_sum02 = b0 + b2;
               b_dif02 = b0 - b2;
               b_sum31 = b3 + b1;
               b_dif31 = b3 - b1;

               scaled_plus = KP707106781 * (b_dif02 + b_dif31);
               scaled_minus = KP707106781 * (b_dif31 - b_dif02);
               a_total = a_sum02 + a_sum13;
               b_total = b_sum02 + b_sum31;

               /* Stores stay in the generator's order. */
               Cr[WS(csr, 2)] = a_sum02 - a_sum13;
               Ci[WS(csi, 2)] = b_sum31 - b_sum02;
               Cr[WS(csr, 3)] = a_dif02 - scaled_plus;
               Cr[WS(csr, 1)] = a_dif02 + scaled_plus;
               Ci[WS(csi, 1)] = scaled_minus - a_dif13;
               Ci[WS(csi, 3)] = a_dif13 + scaled_minus;
               Cr[WS(csr, 4)] = a_total - b_total;
               Cr[0] = a_total + b_total;
          }
     }
}
/*
 * Descriptor for the non-FMA size-8 kernel above; the inner tuple starts
 * with the 20 additions, 2 multiplications and 0 fused multiply/adds from
 * this variant's statistics comment.
 */
static const kr2c_desc desc = {
     8,
     "r2cf_8",
     { 20, 2, 0, 0 },
     &GENUS
};

/* Hands r2cf_8 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_8)(planner *p)
{
     X(kr2c_register)(p, r2cf_8, &desc);
}
#endif

View File

@@ -0,0 +1,217 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
#include "rdft/codelet-rdft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cf_9 -include rdft/scalar/r2cf.h */
/*
* This function contains 38 FP additions, 30 FP multiplications,
* (or, 12 additions, 4 multiplications, 26 fused multiply/add),
* 48 stack variables, 18 constants, and 18 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_9 -- hard-coded size-9 real-to-complex codelet, FMA-oriented
 * variant (compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined; generated with -fma).
 *
 * Each loop pass loads R0[0..4] and R1[0..3] (element k addressed as
 * WS(rs, k)) and stores Cr[0..4] (through csr) and Ci[1..4] (through
 * csi).  All nine loads are done before the first store.
 *
 * Parameters:
 *   R0, R1        input arrays.  NOTE(review): presumably the even- and
 *                 odd-indexed real samples -- confirm against the r2c
 *                 codelet contract in the project headers.
 *   Cr, Ci        output arrays.  NOTE(review): presumably real and
 *                 imaginary parts of the spectrum -- confirm.
 *   rs, csr, csi  strides handed to WS() for the inputs, Cr and Ci.
 *   v             number of loop passes.
 *   ivs           amount added to R0 and R1 after each pass.
 *   ovs           amount added to Cr and Ci after each pass.
 *
 * FMA, FMS and FNMS are macros defined outside this view.
 * NOTE(review): expected meanings are a*b + c, a*b - c and c - a*b for
 * arguments (a, b, c) -- confirm in the project headers.
 */
static void r2cf_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
/* KP500000000, KP666666666 and KP866025403 are numerically 1/2, 2/3 and sqrt(3)/2. */
DK(KP907603734, +0.907603734547952313649323976213898122064543220);
DK(KP347296355, +0.347296355333860697703433253538629592000751354);
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP666666666, +0.666666666666666666666666666666666666666666667);
DK(KP898197570, +0.898197570222573798468955502359086394667167570);
DK(KP673648177, +0.673648177666930348851716626769314796000375677);
DK(KP879385241, +0.879385241571816768108218554649462939872416269);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP394930843, +0.394930843634698457567117349190734585290304520);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP586256827, +0.586256827714544512072145703099641959914944179);
DK(KP726681596, +0.726681596905677465811651808188092531873167623);
DK(KP968908795, +0.968908795874236621082202410917456709164223497);
DK(KP203604859, +0.203604859554852403062088995281827210665664861);
DK(KP152703644, +0.152703644666139302296566746461370407999248646);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP184792530, +0.184792530904095372701352047572203755870913560);
{
INT i;
/* One pass per i; the MAKE_VOLATILE_STRIDE calls are part of the generated loop header. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
E T1, T4, To, Tk, Ta, Tu, Tf, Th, Tj, Tx, Tl, Tm, Ty, Tq, T2;
E T3, T5, Tg;
/* First group: R0[0] plus the pair R1[1], R0[3] (sum T4, difference To). */
T1 = R0[0];
T2 = R1[WS(rs, 1)];
T3 = R0[WS(rs, 3)];
T4 = T2 + T3;
To = T3 - T2;
/* Remaining six loads and the intermediate terms derived from them. */
{
E T6, Tb, T9, Te, Ti;
T6 = R1[0];
Tb = R0[WS(rs, 1)];
{
E T7, T8, Tc, Td;
T7 = R0[WS(rs, 2)];
T8 = R1[WS(rs, 3)];
T9 = T7 + T8;
Tk = T7 - T8;
Tc = R1[WS(rs, 2)];
Td = R0[WS(rs, 4)];
Te = Tc + Td;
Ti = Td - Tc;
}
Ta = T6 + T9;
Tu = FMA(KP184792530, Tk, Ti);
Tf = Tb + Te;
Th = FNMS(KP500000000, Te, Tb);
Tj = FNMS(KP152703644, Ti, Th);
Tx = FMA(KP203604859, Th, Ti);
Tl = FMS(KP500000000, T9, T6);
Tm = FNMS(KP968908795, Tl, Tk);
Ty = FMA(KP726681596, Tk, Tl);
Tq = FMA(KP586256827, Tl, Ti);
}
/* Outputs 0 and 3 need only the three group sums T5 (via T1, T4), Ta and Tf. */
Ci[WS(csi, 3)] = KP866025403 * (Tf - Ta);
T5 = T1 + T4;
Tg = Ta + Tf;
Cr[WS(csr, 3)] = FNMS(KP500000000, Tg, T5);
Cr[0] = T5 + Tg;
/* Outputs 1, 2 and 4: imaginary parts first, then the real parts. */
{
E Tv, Tt, Tn, TC, TB;
Tt = FMA(KP394930843, Th, To);
Tv = FNMS(KP939692620, Tu, Tt);
Ci[WS(csi, 2)] = KP984807753 * (FNMS(KP879385241, Tv, Tl));
Tn = FMA(KP673648177, Tm, Tj);
TB = FMA(KP898197570, Ty, Tx);
TC = FMA(KP666666666, Tn, TB);
/* The only output formed with an explicit negation. */
Ci[WS(csi, 1)] = -(KP984807753 * (FNMS(KP879385241, To, Tn)));
Ci[WS(csi, 4)] = KP866025403 * (FMA(KP852868531, TC, To));
{
E Tp, Ts, Tz, TA, Tr, Tw;
/* Tp is the common term of Cr[1], Cr[2] and Cr[4]. */
Tp = FNMS(KP500000000, T4, T1);
Tr = FNMS(KP347296355, Tq, Tk);
Ts = FNMS(KP907603734, Tr, Th);
Tw = FNMS(KP673648177, Tm, Tj);
Tz = FNMS(KP898197570, Ty, Tx);
TA = FNMS(KP500000000, Tz, Tw);
Cr[WS(csr, 2)] = FNMS(KP939692620, Ts, Tp);
Cr[WS(csr, 1)] = FMA(KP852868531, Tz, Tp);
Cr[WS(csr, 4)] = FMA(KP852868531, TA, Tp);
}
}
}
}
}
/*
 * Descriptor for the FMA-oriented size-9 kernel above; the inner tuple
 * starts with the 12 additions, 4 multiplications and 26 fused
 * multiply/adds from this variant's statistics comment.
 */
static const kr2c_desc desc = {
     9,
     "r2cf_9",
     { 12, 4, 26, 0 },
     &GENUS
};

/* Hands r2cf_9 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cf_9)(planner *p)
{
     X(kr2c_register)(p, r2cf_9, &desc);
}
#else
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cf_9 -include rdft/scalar/r2cf.h */
/*
* This function contains 38 FP additions, 26 FP multiplications,
* (or, 21 additions, 9 multiplications, 17 fused multiply/add),
* 36 stack variables, 14 constants, and 18 memory accesses
*/
#include "rdft/scalar/r2cf.h"
/*
 * r2cf_9 -- hard-coded size-9 real-to-complex transform kernel.  This is the
 * variant compiled in the #else branch; the guarding #if condition lies
 * outside this part of the file.
 *
 * Per iteration the kernel
 *   - reads nine real inputs: R0[WS(rs, 0..4)] and R1[WS(rs, 0..3)];
 *   - writes five real-part outputs Cr[0], Cr[WS(csr, 1..4)];
 *   - writes four imaginary-part outputs Ci[WS(csi, 1..4)].
 * Ci[0] is never written here.  All nine inputs are loaded before the first
 * output is stored.
 *
 * Parameters:
 *   R0, R1        input arrays, indexed through stride rs
 *   Cr, Ci        output arrays, indexed through strides csr and csi
 *   v             number of iterations (the loop counts v down to zero)
 *   ivs           amount added to R0 and R1 after each iteration
 *   ovs           amount added to Cr and Ci after each iteration
 */
static void r2cf_9(R *R0, R *R1, R *Cr, R *Ci,
                   stride rs, stride csr, stride csi,
                   INT v, INT ivs, INT ovs)
{
     /* Constants are named KP<leading decimals>; angles below are in degrees. */
     DK(KP939692620, +0.939692620785908384054109277324731469936208134); /* ~ cos 20 */
     DK(KP296198132, +0.296198132726023843175338011893050938967728390); /* ~ sin 20 * sin 60 */
     DK(KP342020143, +0.342020143325668733044099614682259580763083368); /* ~ sin 20 */
     DK(KP813797681, +0.813797681349373692844693217248393223289101568); /* ~ cos 20 * sin 60 */
     DK(KP984807753, +0.984807753012208059366743024589523013670643252); /* ~ cos 10 */
     DK(KP150383733, +0.150383733180435296639271897612501926072238258); /* ~ sin 10 * sin 60 */
     DK(KP642787609, +0.642787609686539326322643409907263432907559884); /* ~ sin 40 */
     DK(KP663413948, +0.663413948168938396205421319635891297216863310); /* ~ cos 40 * sin 60 */
     DK(KP852868531, +0.852868531952443209628250963940074071936020296); /* ~ cos 10 * sin 60 */
     DK(KP173648177, +0.173648177666930348851716626769314796000375677); /* ~ sin 10 */
     DK(KP556670399, +0.556670399226419366452912952047023132968291906); /* ~ sin 40 * sin 60 */
     DK(KP766044443, +0.766044443118978035202392650555416673935832457); /* ~ cos 40 */
     DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* ~ sin 60 */
     DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
     {
          INT i;
          for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
               /* Triple 0: R0[0] together with the pair R1[1], R0[3]. */
               E r0_0 = R0[0];
               E r1_1 = R1[WS(rs, 1)];
               E r0_3 = R0[WS(rs, 3)];
               E pair0_sum = r1_1 + r0_3;
               E pair0_diff = r0_3 - r1_1;
               /* Results of triples 1 and 2 that outlive their load scopes. */
               E g1_sum, pair1_diff, g1_half;
               E g2_sum, g2_half, pair2_diff;
               E g0_sum, g12_sum;

               /* Triple 1: R1[0] together with the pair R0[2], R1[3]. */
               {
                    E r1_0 = R1[0];
                    E r0_2 = R0[WS(rs, 2)];
                    E r1_3 = R1[WS(rs, 3)];
                    E pair1_sum = r0_2 + r1_3;
                    g1_sum = r1_0 + pair1_sum;
                    pair1_diff = r1_3 - r0_2;
                    /* lead element minus half of the pair sum */
                    g1_half = FNMS(KP500000000, pair1_sum, r1_0);
               }

               /* Triple 2: R0[1] together with the pair R1[2], R0[4]. */
               {
                    E r0_1 = R0[WS(rs, 1)];
                    E r1_2 = R1[WS(rs, 2)];
                    E r0_4 = R0[WS(rs, 4)];
                    E pair2_sum = r1_2 + r0_4;
                    g2_sum = r0_1 + pair2_sum;
                    g2_half = FNMS(KP500000000, pair2_sum, r0_1);
                    pair2_diff = r0_4 - r1_2;
               }

               /* Outputs 0 and 3 need only the three triple sums. */
               Ci[WS(csi, 3)] = KP866025403 * (g2_sum - g1_sum);
               g0_sum = r0_0 + pair0_sum;
               g12_sum = g1_sum + g2_sum;
               Cr[WS(csr, 3)] = FNMS(KP500000000, g12_sum, g0_sum);
               Cr[0] = g0_sum + g12_sum;

               /* Outputs 1, 2 and 4 combine the half-differences and pair differences. */
               {
                    E rot0 = KP866025403 * pair0_diff;
                    E g0_half = FNMS(KP500000000, pair0_sum, r0_0);
                    E u1 = FMA(KP766044443, g1_half, KP556670399 * pair1_diff);
                    E u2 = FMA(KP173648177, g2_half, KP852868531 * pair2_diff);
                    E u_sum = u1 + u2;
                    E w1 = FNMS(KP642787609, g1_half, KP663413948 * pair1_diff);
                    E w2 = FNMS(KP984807753, g2_half, KP150383733 * pair2_diff);
                    E w_sum = w1 + w2;

                    Cr[WS(csr, 1)] = g0_half + u_sum;
                    Ci[WS(csi, 1)] = rot0 + w_sum;
                    Cr[WS(csr, 4)] = FMA(KP866025403, w1 - w2, g0_half) - (KP500000000 * u_sum);
                    Ci[WS(csi, 4)] = FNMS(KP500000000, w_sum, KP866025403 * (pair0_diff + (u2 - u1)));
                    Ci[WS(csi, 2)] = FNMS(KP342020143, g2_half, KP813797681 * pair2_diff) + FNMA(KP150383733, pair1_diff, KP984807753 * g1_half) - rot0;
                    Cr[WS(csr, 2)] = FMA(KP173648177, g1_half, g0_half) + FNMA(KP296198132, pair2_diff, KP939692620 * g2_half) - (KP852868531 * pair1_diff);
               }
          }
     }
}
/*
 * Descriptor handed to the planner together with the r2cf_9 kernel defined
 * above: transform size 9, codelet name "r2cf_9", a four-entry count list
 * and the address of GENUS (declared outside this block).  The first three
 * counts (21, 9, 17) equal the additions, multiplications and fused
 * multiply/adds reported in the generator's header comment for this variant.
 */
static const kr2c_desc desc = {
     9,
     "r2cf_9",
     { 21, 9, 17, 0 },
     &GENUS
};

/*
 * Registration entry point: passes the r2cf_9 kernel and its descriptor to
 * X(kr2c_register) for planner p.  X() is a macro defined elsewhere in the
 * project.
 */
void X(codelet_r2cf_9) (planner *p)
{
     X(kr2c_register) (p, r2cf_9, &desc);
}
#endif