Updates
This commit is contained in:
109
fftw-3.3.10/rdft/scalar/r2cf/Makefile.am
Normal file
109
fftw-3.3.10/rdft/scalar/r2cf/Makefile.am
Normal file
@@ -0,0 +1,109 @@
|
||||
# This Makefile.am specifies a set of codelets, efficient transforms
|
||||
# of small sizes, that are used as building blocks (kernels) by FFTW
|
||||
# to build up large transforms, as well as the options for generating
|
||||
# and compiling them.
|
||||
|
||||
# You can customize FFTW for special needs, e.g. to handle certain
|
||||
# sizes more efficiently, by adding new codelets to the lists of those
|
||||
# included by default. If you change the list of codelets, any new
|
||||
# ones you added will be automatically generated when you run the
|
||||
# bootstrap script (see "Generating your own code" in the FFTW
|
||||
# manual).
|
||||
|
||||
###########################################################################
|
||||
# Put the top-level source directory on the C preprocessor include path.
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
# Libtool library built in this directory; the noinst_ prefix keeps it out of the installed files.
noinst_LTLIBRARIES = librdft_scalar_r2cf.la
|
||||
|
||||
###########################################################################
|
||||
# r2cf_<n> is a hard-coded real-to-complex FFT of size <n> (base cases
|
||||
# of real-input FFT recursion)
|
||||
# Sizes 30, 40 and 50 sit behind the '#' on the last line, so they are
# commented out and not part of the list.
R2CF = \
  r2cf_2.c r2cf_3.c r2cf_4.c r2cf_5.c r2cf_6.c r2cf_7.c r2cf_8.c r2cf_9.c \
  r2cf_10.c r2cf_11.c r2cf_12.c r2cf_13.c r2cf_14.c r2cf_15.c r2cf_16.c \
  r2cf_32.c r2cf_64.c r2cf_128.c \
  r2cf_20.c r2cf_25.c # r2cf_30.c r2cf_40.c r2cf_50.c
|
||||
|
||||
###########################################################################
|
||||
# hf_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT
|
||||
# step for a real-input FFT. Every hf codelet must have a
|
||||
# corresponding r2cfII codelet (see below)!
|
||||
# Sizes 30, 40 and 50 are commented out on the last line.
HF = \
  hf_2.c hf_3.c hf_4.c hf_5.c hf_6.c hf_7.c hf_8.c hf_9.c hf_10.c \
  hf_12.c hf_15.c hf_16.c hf_32.c hf_64.c \
  hf_20.c hf_25.c # hf_30.c hf_40.c hf_50.c
|
||||
|
||||
# like hf, but generates part of its trig table on the fly (good for large n)
|
||||
HF2 = \
  hf2_4.c hf2_8.c hf2_16.c hf2_32.c \
  hf2_5.c hf2_20.c hf2_25.c
|
||||
|
||||
# an r2cf transform where the input is shifted by half a sample (output
|
||||
# is multiplied by a phase). This is needed as part of the DIT recursion;
|
||||
# every hf_<r> or hf2_<r> codelet should have a corresponding r2cfII_<r>
|
||||
# Same size set as HF; sizes 30, 40 and 50 are commented out on the last line.
R2CFII = \
  r2cfII_2.c r2cfII_3.c r2cfII_4.c r2cfII_5.c r2cfII_6.c r2cfII_7.c \
  r2cfII_8.c r2cfII_9.c r2cfII_10.c r2cfII_12.c r2cfII_15.c \
  r2cfII_16.c r2cfII_32.c r2cfII_64.c \
  r2cfII_20.c r2cfII_25.c # r2cfII_30.c r2cfII_40.c r2cfII_50.c
|
||||
|
||||
###########################################################################
|
||||
# hc2cf_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT
|
||||
# step for a real-input FFT with rdft2-style output. <r> must be even.
|
||||
# Size 30 is commented out on the last line.
HC2CF = \
  hc2cf_2.c hc2cf_4.c hc2cf_6.c hc2cf_8.c hc2cf_10.c hc2cf_12.c \
  hc2cf_16.c hc2cf_32.c \
  hc2cf_20.c # hc2cf_30.c
|
||||
|
||||
# Same size set as HC2CF; size 30 is commented out on the last line.
HC2CFDFT = \
  hc2cfdft_2.c hc2cfdft_4.c hc2cfdft_6.c hc2cfdft_8.c hc2cfdft_10.c \
  hc2cfdft_12.c hc2cfdft_16.c hc2cfdft_32.c \
  hc2cfdft_20.c # hc2cfdft_30.c
|
||||
|
||||
# like hc2cf, but generates part of its trig table on the fly (good
|
||||
# for large n)
|
||||
HC2CF2 = \
  hc2cf2_4.c hc2cf2_8.c hc2cf2_16.c hc2cf2_32.c \
  hc2cf2_20.c # hc2cf2_30.c
HC2CFDFT2 = \
  hc2cfdft2_4.c hc2cfdft2_8.c hc2cfdft2_16.c hc2cfdft2_32.c \
  hc2cfdft2_20.c # hc2cfdft2_30.c
|
||||
|
||||
###########################################################################
|
||||
# All codelet source lists of this directory, concatenated family by family.
ALL_CODELETS = \
  $(R2CF) $(HF) $(HF2) $(R2CFII) \
  $(HC2CF) $(HC2CF2) $(HC2CFDFT) $(HC2CFDFT2)

# $(CODLIST) is not assigned in this file.
BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)

librdft_scalar_r2cf_la_SOURCES = $(BUILT_SOURCES)

SOLVTAB_NAME = X(solvtab_rdft_r2cf)
XRENAME=X

# Shared rules for regenerating codelets.
include $(top_srcdir)/support/Makefile.codelets
|
||||
|
||||
if MAINTAINER_MODE
# Generator option sets, one per codelet family.  The hf2 and hc2cf2
# families add -twiddle-log3 -precompute-twiddles to the common flags.
# The hc2cfdft and hc2cfdft2 rules reuse FLAGS_HC2CF and FLAGS_HC2CF2.
FLAGS_R2CF=$(RDFT_FLAGS_COMMON)
FLAGS_HF=$(RDFT_FLAGS_COMMON)
FLAGS_HF2=$(RDFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_HC2CF=$(RDFT_FLAGS_COMMON)
FLAGS_HC2CF2=$(RDFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_R2CFII=$(RDFT_FLAGS_COMMON)

# In every rule below the stem $* (the text matched by %) is the transform
# size: it is passed to the generator as -n and appended to the -name value.
# The generator output is piped through $(ADD_DATE) and $(INDENT) into $@.

r2cf_%.c:  $(CODELET_DEPS) $(GEN_R2CF)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CF) $(FLAGS_R2CF) -n $* -name r2cf_$* -include "rdft/scalar/r2cf.h") | $(ADD_DATE) | $(INDENT) >$@

hf_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HF) -n $* -dit -name hf_$* -include "rdft/scalar/hf.h") | $(ADD_DATE) | $(INDENT) >$@

hf2_%.c:  $(CODELET_DEPS) $(GEN_HC2HC)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2HC) $(FLAGS_HF2) -n $* -dit -name hf2_$* -include "rdft/scalar/hf.h") | $(ADD_DATE) | $(INDENT) >$@

# Uses FLAGS_R2CFII (previously FLAGS_R2CF, which has the same value) so that
# the r2cfII-specific variable defined above actually controls this rule.
r2cfII_%.c:  $(CODELET_DEPS) $(GEN_R2CF)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_R2CF) $(FLAGS_R2CFII) -n $* -name r2cfII_$* -dft-II -include "rdft/scalar/r2cfII.h") | $(ADD_DATE) | $(INDENT) >$@

hc2cf_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CF) -n $* -dit -name hc2cf_$* -include "rdft/scalar/hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@

hc2cf2_%.c:  $(CODELET_DEPS) $(GEN_HC2C)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2C) $(FLAGS_HC2CF2) -n $* -dit -name hc2cf2_$* -include "rdft/scalar/hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@

hc2cfdft_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CF) -n $* -dit -name hc2cfdft_$* -include "rdft/scalar/hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@

hc2cfdft2_%.c:  $(CODELET_DEPS) $(GEN_HC2CDFT)
	($(PRELUDE_COMMANDS_RDFT); $(TWOVERS) $(GEN_HC2CDFT) $(FLAGS_HC2CF2) -n $* -dit -name hc2cfdft2_$* -include "rdft/scalar/hc2cf.h") | $(ADD_DATE) | $(INDENT) >$@

endif # MAINTAINER_MODE
|
||||
1153
fftw-3.3.10/rdft/scalar/r2cf/Makefile.in
Normal file
1153
fftw-3.3.10/rdft/scalar/r2cf/Makefile.in
Normal file
File diff suppressed because it is too large
Load Diff
183
fftw-3.3.10/rdft/scalar/r2cf/codlist.c
Normal file
183
fftw-3.3.10/rdft/scalar/r2cf/codlist.c
Normal file
@@ -0,0 +1,183 @@
|
||||
#include "kernel/ifftw.h"
|
||||
|
||||
|
||||
extern void X(codelet_r2cf_2)(planner *);
|
||||
extern void X(codelet_r2cf_3)(planner *);
|
||||
extern void X(codelet_r2cf_4)(planner *);
|
||||
extern void X(codelet_r2cf_5)(planner *);
|
||||
extern void X(codelet_r2cf_6)(planner *);
|
||||
extern void X(codelet_r2cf_7)(planner *);
|
||||
extern void X(codelet_r2cf_8)(planner *);
|
||||
extern void X(codelet_r2cf_9)(planner *);
|
||||
extern void X(codelet_r2cf_10)(planner *);
|
||||
extern void X(codelet_r2cf_11)(planner *);
|
||||
extern void X(codelet_r2cf_12)(planner *);
|
||||
extern void X(codelet_r2cf_13)(planner *);
|
||||
extern void X(codelet_r2cf_14)(planner *);
|
||||
extern void X(codelet_r2cf_15)(planner *);
|
||||
extern void X(codelet_r2cf_16)(planner *);
|
||||
extern void X(codelet_r2cf_32)(planner *);
|
||||
extern void X(codelet_r2cf_64)(planner *);
|
||||
extern void X(codelet_r2cf_128)(planner *);
|
||||
extern void X(codelet_r2cf_20)(planner *);
|
||||
extern void X(codelet_r2cf_25)(planner *);
|
||||
extern void X(codelet_hf_2)(planner *);
|
||||
extern void X(codelet_hf_3)(planner *);
|
||||
extern void X(codelet_hf_4)(planner *);
|
||||
extern void X(codelet_hf_5)(planner *);
|
||||
extern void X(codelet_hf_6)(planner *);
|
||||
extern void X(codelet_hf_7)(planner *);
|
||||
extern void X(codelet_hf_8)(planner *);
|
||||
extern void X(codelet_hf_9)(planner *);
|
||||
extern void X(codelet_hf_10)(planner *);
|
||||
extern void X(codelet_hf_12)(planner *);
|
||||
extern void X(codelet_hf_15)(planner *);
|
||||
extern void X(codelet_hf_16)(planner *);
|
||||
extern void X(codelet_hf_32)(planner *);
|
||||
extern void X(codelet_hf_64)(planner *);
|
||||
extern void X(codelet_hf_20)(planner *);
|
||||
extern void X(codelet_hf_25)(planner *);
|
||||
extern void X(codelet_hf2_4)(planner *);
|
||||
extern void X(codelet_hf2_8)(planner *);
|
||||
extern void X(codelet_hf2_16)(planner *);
|
||||
extern void X(codelet_hf2_32)(planner *);
|
||||
extern void X(codelet_hf2_5)(planner *);
|
||||
extern void X(codelet_hf2_20)(planner *);
|
||||
extern void X(codelet_hf2_25)(planner *);
|
||||
extern void X(codelet_r2cfII_2)(planner *);
|
||||
extern void X(codelet_r2cfII_3)(planner *);
|
||||
extern void X(codelet_r2cfII_4)(planner *);
|
||||
extern void X(codelet_r2cfII_5)(planner *);
|
||||
extern void X(codelet_r2cfII_6)(planner *);
|
||||
extern void X(codelet_r2cfII_7)(planner *);
|
||||
extern void X(codelet_r2cfII_8)(planner *);
|
||||
extern void X(codelet_r2cfII_9)(planner *);
|
||||
extern void X(codelet_r2cfII_10)(planner *);
|
||||
extern void X(codelet_r2cfII_12)(planner *);
|
||||
extern void X(codelet_r2cfII_15)(planner *);
|
||||
extern void X(codelet_r2cfII_16)(planner *);
|
||||
extern void X(codelet_r2cfII_32)(planner *);
|
||||
extern void X(codelet_r2cfII_64)(planner *);
|
||||
extern void X(codelet_r2cfII_20)(planner *);
|
||||
extern void X(codelet_r2cfII_25)(planner *);
|
||||
extern void X(codelet_hc2cf_2)(planner *);
|
||||
extern void X(codelet_hc2cf_4)(planner *);
|
||||
extern void X(codelet_hc2cf_6)(planner *);
|
||||
extern void X(codelet_hc2cf_8)(planner *);
|
||||
extern void X(codelet_hc2cf_10)(planner *);
|
||||
extern void X(codelet_hc2cf_12)(planner *);
|
||||
extern void X(codelet_hc2cf_16)(planner *);
|
||||
extern void X(codelet_hc2cf_32)(planner *);
|
||||
extern void X(codelet_hc2cf_20)(planner *);
|
||||
extern void X(codelet_hc2cf2_4)(planner *);
|
||||
extern void X(codelet_hc2cf2_8)(planner *);
|
||||
extern void X(codelet_hc2cf2_16)(planner *);
|
||||
extern void X(codelet_hc2cf2_32)(planner *);
|
||||
extern void X(codelet_hc2cf2_20)(planner *);
|
||||
extern void X(codelet_hc2cfdft_2)(planner *);
|
||||
extern void X(codelet_hc2cfdft_4)(planner *);
|
||||
extern void X(codelet_hc2cfdft_6)(planner *);
|
||||
extern void X(codelet_hc2cfdft_8)(planner *);
|
||||
extern void X(codelet_hc2cfdft_10)(planner *);
|
||||
extern void X(codelet_hc2cfdft_12)(planner *);
|
||||
extern void X(codelet_hc2cfdft_16)(planner *);
|
||||
extern void X(codelet_hc2cfdft_32)(planner *);
|
||||
extern void X(codelet_hc2cfdft_20)(planner *);
|
||||
extern void X(codelet_hc2cfdft2_4)(planner *);
|
||||
extern void X(codelet_hc2cfdft2_8)(planner *);
|
||||
extern void X(codelet_hc2cfdft2_16)(planner *);
|
||||
extern void X(codelet_hc2cfdft2_32)(planner *);
|
||||
extern void X(codelet_hc2cfdft2_20)(planner *);
|
||||
|
||||
|
||||
extern const solvtab X(solvtab_rdft_r2cf);
|
||||
const solvtab X(solvtab_rdft_r2cf) = {
|
||||
SOLVTAB(X(codelet_r2cf_2)),
|
||||
SOLVTAB(X(codelet_r2cf_3)),
|
||||
SOLVTAB(X(codelet_r2cf_4)),
|
||||
SOLVTAB(X(codelet_r2cf_5)),
|
||||
SOLVTAB(X(codelet_r2cf_6)),
|
||||
SOLVTAB(X(codelet_r2cf_7)),
|
||||
SOLVTAB(X(codelet_r2cf_8)),
|
||||
SOLVTAB(X(codelet_r2cf_9)),
|
||||
SOLVTAB(X(codelet_r2cf_10)),
|
||||
SOLVTAB(X(codelet_r2cf_11)),
|
||||
SOLVTAB(X(codelet_r2cf_12)),
|
||||
SOLVTAB(X(codelet_r2cf_13)),
|
||||
SOLVTAB(X(codelet_r2cf_14)),
|
||||
SOLVTAB(X(codelet_r2cf_15)),
|
||||
SOLVTAB(X(codelet_r2cf_16)),
|
||||
SOLVTAB(X(codelet_r2cf_32)),
|
||||
SOLVTAB(X(codelet_r2cf_64)),
|
||||
SOLVTAB(X(codelet_r2cf_128)),
|
||||
SOLVTAB(X(codelet_r2cf_20)),
|
||||
SOLVTAB(X(codelet_r2cf_25)),
|
||||
SOLVTAB(X(codelet_hf_2)),
|
||||
SOLVTAB(X(codelet_hf_3)),
|
||||
SOLVTAB(X(codelet_hf_4)),
|
||||
SOLVTAB(X(codelet_hf_5)),
|
||||
SOLVTAB(X(codelet_hf_6)),
|
||||
SOLVTAB(X(codelet_hf_7)),
|
||||
SOLVTAB(X(codelet_hf_8)),
|
||||
SOLVTAB(X(codelet_hf_9)),
|
||||
SOLVTAB(X(codelet_hf_10)),
|
||||
SOLVTAB(X(codelet_hf_12)),
|
||||
SOLVTAB(X(codelet_hf_15)),
|
||||
SOLVTAB(X(codelet_hf_16)),
|
||||
SOLVTAB(X(codelet_hf_32)),
|
||||
SOLVTAB(X(codelet_hf_64)),
|
||||
SOLVTAB(X(codelet_hf_20)),
|
||||
SOLVTAB(X(codelet_hf_25)),
|
||||
SOLVTAB(X(codelet_hf2_4)),
|
||||
SOLVTAB(X(codelet_hf2_8)),
|
||||
SOLVTAB(X(codelet_hf2_16)),
|
||||
SOLVTAB(X(codelet_hf2_32)),
|
||||
SOLVTAB(X(codelet_hf2_5)),
|
||||
SOLVTAB(X(codelet_hf2_20)),
|
||||
SOLVTAB(X(codelet_hf2_25)),
|
||||
SOLVTAB(X(codelet_r2cfII_2)),
|
||||
SOLVTAB(X(codelet_r2cfII_3)),
|
||||
SOLVTAB(X(codelet_r2cfII_4)),
|
||||
SOLVTAB(X(codelet_r2cfII_5)),
|
||||
SOLVTAB(X(codelet_r2cfII_6)),
|
||||
SOLVTAB(X(codelet_r2cfII_7)),
|
||||
SOLVTAB(X(codelet_r2cfII_8)),
|
||||
SOLVTAB(X(codelet_r2cfII_9)),
|
||||
SOLVTAB(X(codelet_r2cfII_10)),
|
||||
SOLVTAB(X(codelet_r2cfII_12)),
|
||||
SOLVTAB(X(codelet_r2cfII_15)),
|
||||
SOLVTAB(X(codelet_r2cfII_16)),
|
||||
SOLVTAB(X(codelet_r2cfII_32)),
|
||||
SOLVTAB(X(codelet_r2cfII_64)),
|
||||
SOLVTAB(X(codelet_r2cfII_20)),
|
||||
SOLVTAB(X(codelet_r2cfII_25)),
|
||||
SOLVTAB(X(codelet_hc2cf_2)),
|
||||
SOLVTAB(X(codelet_hc2cf_4)),
|
||||
SOLVTAB(X(codelet_hc2cf_6)),
|
||||
SOLVTAB(X(codelet_hc2cf_8)),
|
||||
SOLVTAB(X(codelet_hc2cf_10)),
|
||||
SOLVTAB(X(codelet_hc2cf_12)),
|
||||
SOLVTAB(X(codelet_hc2cf_16)),
|
||||
SOLVTAB(X(codelet_hc2cf_32)),
|
||||
SOLVTAB(X(codelet_hc2cf_20)),
|
||||
SOLVTAB(X(codelet_hc2cf2_4)),
|
||||
SOLVTAB(X(codelet_hc2cf2_8)),
|
||||
SOLVTAB(X(codelet_hc2cf2_16)),
|
||||
SOLVTAB(X(codelet_hc2cf2_32)),
|
||||
SOLVTAB(X(codelet_hc2cf2_20)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_2)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_4)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_6)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_8)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_10)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_12)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_16)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_32)),
|
||||
SOLVTAB(X(codelet_hc2cfdft_20)),
|
||||
SOLVTAB(X(codelet_hc2cfdft2_4)),
|
||||
SOLVTAB(X(codelet_hc2cfdft2_8)),
|
||||
SOLVTAB(X(codelet_hc2cfdft2_16)),
|
||||
SOLVTAB(X(codelet_hc2cfdft2_32)),
|
||||
SOLVTAB(X(codelet_hc2cfdft2_20)),
|
||||
SOLVTAB_END
|
||||
};
|
||||
836
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_16.c
Normal file
836
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_16.c
Normal file
@@ -0,0 +1,836 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:35 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 134 FP multiplications,
|
||||
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
|
||||
* 90 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf2_16 (FMA variant): machine-generated kernel; the "Generated by"
 * comment above records the generator options (-n 16 -dit -twiddle-log3
 * -precompute-twiddles).
 *
 * Rp, Ip, Rm, Im: data arrays; each is read and then overwritten at index 0
 *                 and at WS(rs, 1) .. WS(rs, 7).
 * W:              twiddle values; W[0] .. W[7] are read on every iteration.
 * rs:             stride passed to WS() to form the element offsets.
 * mb, me:         the loop runs for m = mb, mb + 1, ..., me - 1.
 * ms:             per-iteration step: Rp and Ip move by +ms, Rm and Im by -ms.
 */
static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Named constants: approximately cos(pi/8), tan(pi/8) and sqrt(1/2). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced by (mb - 1) * 8 values; each pass then steps W by 8 and the data pointers by +/- ms. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
|
||||
E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
|
||||
{
|
||||
E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
|
||||
/* Twiddle stage: load W[0..7] and build the product / FMA combinations used as multipliers below. */
T2 = W[0];
|
||||
Tf = W[2];
|
||||
Tg = T2 * Tf;
|
||||
TM = W[6];
|
||||
TN = T2 * TM;
|
||||
TO = W[7];
|
||||
TS = T2 * TO;
|
||||
T3 = W[4];
|
||||
T4 = T2 * T3;
|
||||
Tp = Tf * T3;
|
||||
T6 = W[5];
|
||||
Ta = T2 * T6;
|
||||
Tt = Tf * T6;
|
||||
T5 = W[1];
|
||||
Th = W[3];
|
||||
Tl = T2 * Th;
|
||||
Tz = FMA(T5, Th, Tg);
|
||||
Ti = FNMS(T5, Th, Tg);
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TZ = FNMS(Th, T3, Tt);
|
||||
TT = FNMS(T5, TM, TS);
|
||||
Tq = FNMS(Th, T6, Tp);
|
||||
TW = FMA(Th, T6, Tp);
|
||||
Tb = FNMS(T5, T3, Ta);
|
||||
Tu = FMA(Th, T3, Tt);
|
||||
TP = FMA(T5, TO, TN);
|
||||
TI = FMA(T5, T3, Ta);
|
||||
TF = FNMS(T5, T6, T4);
|
||||
{
|
||||
E T1y, T1C, T1e, T1i;
|
||||
T1y = Tz * T3;
|
||||
T1C = Tz * T6;
|
||||
TC = FNMS(T5, Tf, Tl);
|
||||
T1z = FMA(TC, T6, T1y);
|
||||
T1O = FMA(TC, T3, T1C);
|
||||
T1D = FNMS(TC, T3, T1C);
|
||||
T1L = FNMS(TC, T6, T1y);
|
||||
T1e = Ti * T3;
|
||||
T1i = Ti * T6;
|
||||
Tm = FMA(T5, Tf, Tl);
|
||||
T1f = FMA(Tm, T6, T1e);
|
||||
T1p = FMA(Tm, T3, T1i);
|
||||
T1j = FNMS(Tm, T3, T1i);
|
||||
T1m = FNMS(Tm, T6, T1e);
|
||||
}
|
||||
}
|
||||
{
|
||||
/* Input stage: each of the next eight braced groups reads four inputs, applies twiddle-derived factors (Rp[0] and Rm[0] are used as is) and keeps sums and differences. */
E Te, T1U, T3A, T3L, T1G, T2D, T2B, T3h, T1R, T2w, T2I, T3i, Tx, T3M, T1Z;
|
||||
E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
|
||||
E T2d, T38;
|
||||
{
|
||||
E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
|
||||
T1 = Rp[0];
|
||||
T3z = Rm[0];
|
||||
T8 = Rp[WS(rs, 4)];
|
||||
T9 = T7 * T8;
|
||||
Tc = Rm[WS(rs, 4)];
|
||||
T3x = T7 * Tc;
|
||||
Td = FMA(Tb, Tc, T9);
|
||||
Te = T1 + Td;
|
||||
T1U = T1 - Td;
|
||||
T3y = FNMS(Tb, T8, T3x);
|
||||
T3A = T3y + T3z;
|
||||
T3L = T3z - T3y;
|
||||
}
|
||||
{
|
||||
E T1u, T1v, T1w, T2x, T1A, T1B, T1E, T2z;
|
||||
T1u = Ip[WS(rs, 7)];
|
||||
T1v = TM * T1u;
|
||||
T1w = Im[WS(rs, 7)];
|
||||
T2x = TM * T1w;
|
||||
T1A = Ip[WS(rs, 3)];
|
||||
T1B = T1z * T1A;
|
||||
T1E = Im[WS(rs, 3)];
|
||||
T2z = T1z * T1E;
|
||||
{
|
||||
E T1x, T1F, T2y, T2A;
|
||||
T1x = FMA(TO, T1w, T1v);
|
||||
T1F = FMA(T1D, T1E, T1B);
|
||||
T1G = T1x + T1F;
|
||||
T2D = T1x - T1F;
|
||||
T2y = FNMS(TO, T1u, T2x);
|
||||
T2A = FNMS(T1D, T1A, T2z);
|
||||
T2B = T2y - T2A;
|
||||
T3h = T2y + T2A;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
|
||||
T1H = Ip[WS(rs, 1)];
|
||||
T1I = Tf * T1H;
|
||||
T1J = Im[WS(rs, 1)];
|
||||
T2E = Tf * T1J;
|
||||
T1M = Ip[WS(rs, 5)];
|
||||
T1N = T1L * T1M;
|
||||
T1P = Im[WS(rs, 5)];
|
||||
T2G = T1L * T1P;
|
||||
{
|
||||
E T1K, T1Q, T2F, T2H;
|
||||
T1K = FMA(Th, T1J, T1I);
|
||||
T1Q = FMA(T1O, T1P, T1N);
|
||||
T1R = T1K + T1Q;
|
||||
T2w = T1Q - T1K;
|
||||
T2F = FNMS(Th, T1H, T2E);
|
||||
T2H = FNMS(T1O, T1M, T2G);
|
||||
T2I = T2F - T2H;
|
||||
T3i = T2F + T2H;
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
|
||||
Tj = Rp[WS(rs, 2)];
|
||||
Tk = Ti * Tj;
|
||||
Tn = Rm[WS(rs, 2)];
|
||||
T1V = Ti * Tn;
|
||||
Tr = Rp[WS(rs, 6)];
|
||||
Ts = Tq * Tr;
|
||||
Tv = Rm[WS(rs, 6)];
|
||||
T1X = Tq * Tv;
|
||||
{
|
||||
E To, Tw, T1W, T1Y;
|
||||
To = FMA(Tm, Tn, Tk);
|
||||
Tw = FMA(Tu, Tv, Ts);
|
||||
Tx = To + Tw;
|
||||
T3M = To - Tw;
|
||||
T1W = FNMS(Tm, Tj, T1V);
|
||||
T1Y = FNMS(Tu, Tr, T1X);
|
||||
T1Z = T1W - T1Y;
|
||||
T3w = T1W + T1Y;
|
||||
}
|
||||
}
|
||||
{
|
||||
E TA, TB, TD, T21, TG, TH, TJ, T23;
|
||||
TA = Rp[WS(rs, 1)];
|
||||
TB = Tz * TA;
|
||||
TD = Rm[WS(rs, 1)];
|
||||
T21 = Tz * TD;
|
||||
TG = Rp[WS(rs, 5)];
|
||||
TH = TF * TG;
|
||||
TJ = Rm[WS(rs, 5)];
|
||||
T23 = TF * TJ;
|
||||
{
|
||||
E TE, TK, T22, T24;
|
||||
TE = FMA(TC, TD, TB);
|
||||
TK = FMA(TI, TJ, TH);
|
||||
TL = TE + TK;
|
||||
T26 = TE - TK;
|
||||
T22 = FNMS(TC, TA, T21);
|
||||
T24 = FNMS(TI, TG, T23);
|
||||
T25 = T22 - T24;
|
||||
T37 = T22 + T24;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
|
||||
T15 = Ip[0];
|
||||
T16 = T2 * T15;
|
||||
T17 = Im[0];
|
||||
T2h = T2 * T17;
|
||||
T19 = Ip[WS(rs, 4)];
|
||||
T1a = T3 * T19;
|
||||
T1b = Im[WS(rs, 4)];
|
||||
T2j = T3 * T1b;
|
||||
{
|
||||
E T18, T1c, T2i, T2k;
|
||||
T18 = FMA(T5, T17, T16);
|
||||
T1c = FMA(T6, T1b, T1a);
|
||||
T1d = T18 + T1c;
|
||||
T2o = T18 - T1c;
|
||||
T2i = FNMS(T5, T15, T2h);
|
||||
T2k = FNMS(T6, T19, T2j);
|
||||
T2l = T2i - T2k;
|
||||
T3c = T2i + T2k;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
|
||||
T1g = Ip[WS(rs, 2)];
|
||||
T1h = T1f * T1g;
|
||||
T1k = Im[WS(rs, 2)];
|
||||
T2p = T1f * T1k;
|
||||
T1n = Ip[WS(rs, 6)];
|
||||
T1o = T1m * T1n;
|
||||
T1q = Im[WS(rs, 6)];
|
||||
T2r = T1m * T1q;
|
||||
{
|
||||
E T1l, T1r, T2q, T2s;
|
||||
T1l = FMA(T1j, T1k, T1h);
|
||||
T1r = FMA(T1p, T1q, T1o);
|
||||
T1s = T1l + T1r;
|
||||
T2m = T1l - T1r;
|
||||
T2q = FNMS(T1j, T1g, T2p);
|
||||
T2s = FNMS(T1p, T1n, T2r);
|
||||
T2t = T2q - T2s;
|
||||
T3d = T2q + T2s;
|
||||
}
|
||||
}
|
||||
{
|
||||
E TQ, TR, TU, T29, TX, TY, T10, T2b;
|
||||
TQ = Rp[WS(rs, 7)];
|
||||
TR = TP * TQ;
|
||||
TU = Rm[WS(rs, 7)];
|
||||
T29 = TP * TU;
|
||||
TX = Rp[WS(rs, 3)];
|
||||
TY = TW * TX;
|
||||
T10 = Rm[WS(rs, 3)];
|
||||
T2b = TW * T10;
|
||||
{
|
||||
E TV, T11, T2a, T2c;
|
||||
TV = FMA(TT, TU, TR);
|
||||
T11 = FMA(TZ, T10, TY);
|
||||
T12 = TV + T11;
|
||||
T28 = TV - T11;
|
||||
T2a = FNMS(TT, TQ, T29);
|
||||
T2c = FNMS(TZ, TX, T2b);
|
||||
T2d = T2a - T2c;
|
||||
T38 = T2a + T2c;
|
||||
}
|
||||
}
|
||||
{
|
||||
/* Output stage: the remaining groups combine the values computed above and store 32 results back into Rp, Ip, Rm and Im. */
E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
|
||||
{
|
||||
E Ty, T13, T3v, T3B;
|
||||
Ty = Te + Tx;
|
||||
T13 = TL + T12;
|
||||
T14 = Ty + T13;
|
||||
T3q = Ty - T13;
|
||||
T3v = T37 + T38;
|
||||
T3B = T3w + T3A;
|
||||
T3C = T3v + T3B;
|
||||
T3E = T3B - T3v;
|
||||
}
|
||||
{
|
||||
E T1t, T1S, T3r, T3s;
|
||||
T1t = T1d + T1s;
|
||||
T1S = T1G + T1R;
|
||||
T1T = T1t + T1S;
|
||||
T3D = T1S - T1t;
|
||||
T3r = T3c + T3d;
|
||||
T3s = T3h + T3i;
|
||||
T3t = T3r - T3s;
|
||||
T3u = T3r + T3s;
|
||||
}
|
||||
Rm[WS(rs, 7)] = T14 - T1T;
|
||||
Im[WS(rs, 7)] = T3u - T3C;
|
||||
Rp[0] = T14 + T1T;
|
||||
Ip[0] = T3u + T3C;
|
||||
Rm[WS(rs, 3)] = T3q - T3t;
|
||||
Im[WS(rs, 3)] = T3D - T3E;
|
||||
Rp[WS(rs, 4)] = T3q + T3t;
|
||||
Ip[WS(rs, 4)] = T3D + T3E;
|
||||
}
|
||||
{
|
||||
E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
|
||||
{
|
||||
E T36, T39, T3F, T3G;
|
||||
T36 = Te - Tx;
|
||||
T39 = T37 - T38;
|
||||
T3a = T36 + T39;
|
||||
T3m = T36 - T39;
|
||||
T3F = T12 - TL;
|
||||
T3G = T3A - T3w;
|
||||
T3H = T3F + T3G;
|
||||
T3J = T3G - T3F;
|
||||
}
|
||||
{
|
||||
E T3b, T3e, T3g, T3j;
|
||||
T3b = T1d - T1s;
|
||||
T3e = T3c - T3d;
|
||||
T3f = T3b + T3e;
|
||||
T3n = T3e - T3b;
|
||||
T3g = T1G - T1R;
|
||||
T3j = T3h - T3i;
|
||||
T3k = T3g - T3j;
|
||||
T3o = T3g + T3j;
|
||||
}
|
||||
{
|
||||
E T3l, T3I, T3p, T3K;
|
||||
T3l = T3f + T3k;
|
||||
Rm[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
|
||||
Rp[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
|
||||
T3I = T3n + T3o;
|
||||
Im[WS(rs, 5)] = FMS(KP707106781, T3I, T3H);
|
||||
Ip[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
|
||||
T3p = T3n - T3o;
|
||||
Rm[WS(rs, 1)] = FNMS(KP707106781, T3p, T3m);
|
||||
Rp[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
|
||||
T3K = T3k - T3f;
|
||||
Im[WS(rs, 1)] = FMS(KP707106781, T3K, T3J);
|
||||
Ip[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
|
||||
E T2O;
|
||||
{
|
||||
E T27, T2e, T2n, T2u;
|
||||
T20 = T1U - T1Z;
|
||||
T3N = T3L - T3M;
|
||||
T3T = T3M + T3L;
|
||||
T2Q = T1U + T1Z;
|
||||
T27 = T25 - T26;
|
||||
T2e = T28 + T2d;
|
||||
T2f = T27 - T2e;
|
||||
T3O = T27 + T2e;
|
||||
{
|
||||
E T2Y, T2Z, T2R, T2S;
|
||||
T2Y = T2D + T2I;
|
||||
T2Z = T2B + T2w;
|
||||
T30 = FNMS(KP414213562, T2Z, T2Y);
|
||||
T34 = FMA(KP414213562, T2Y, T2Z);
|
||||
T2R = T26 + T25;
|
||||
T2S = T28 - T2d;
|
||||
T2T = T2R + T2S;
|
||||
T3U = T2S - T2R;
|
||||
}
|
||||
T2n = T2l + T2m;
|
||||
T2u = T2o - T2t;
|
||||
T2v = FMA(KP414213562, T2u, T2n);
|
||||
T2N = FNMS(KP414213562, T2n, T2u);
|
||||
{
|
||||
E T2V, T2W, T2C, T2J;
|
||||
T2V = T2o + T2t;
|
||||
T2W = T2l - T2m;
|
||||
T2X = FMA(KP414213562, T2W, T2V);
|
||||
T33 = FNMS(KP414213562, T2V, T2W);
|
||||
T2C = T2w - T2B;
|
||||
T2J = T2D - T2I;
|
||||
T2K = FMA(KP414213562, T2J, T2C);
|
||||
T2O = FNMS(KP414213562, T2C, T2J);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2g, T2L, T3V, T3W;
|
||||
T2g = FMA(KP707106781, T2f, T20);
|
||||
T2L = T2v + T2K;
|
||||
Rm[WS(rs, 4)] = FNMS(KP923879532, T2L, T2g);
|
||||
Rp[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
|
||||
T3V = FMA(KP707106781, T3U, T3T);
|
||||
T3W = T2O - T2N;
|
||||
Im[WS(rs, 4)] = FMS(KP923879532, T3W, T3V);
|
||||
Ip[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
|
||||
}
|
||||
{
|
||||
E T2M, T2P, T3X, T3Y;
|
||||
T2M = FNMS(KP707106781, T2f, T20);
|
||||
T2P = T2N + T2O;
|
||||
Rp[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
|
||||
Rm[0] = FMA(KP923879532, T2P, T2M);
|
||||
T3X = FNMS(KP707106781, T3U, T3T);
|
||||
T3Y = T2K - T2v;
|
||||
Im[0] = FMS(KP923879532, T3Y, T3X);
|
||||
Ip[WS(rs, 7)] = FMA(KP923879532, T3Y, T3X);
|
||||
}
|
||||
{
|
||||
E T2U, T31, T3P, T3Q;
|
||||
T2U = FMA(KP707106781, T2T, T2Q);
|
||||
T31 = T2X + T30;
|
||||
Rm[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
|
||||
Rp[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
|
||||
T3P = FMA(KP707106781, T3O, T3N);
|
||||
T3Q = T33 + T34;
|
||||
Im[WS(rs, 6)] = FMS(KP923879532, T3Q, T3P);
|
||||
Ip[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
|
||||
}
|
||||
{
|
||||
E T32, T35, T3R, T3S;
|
||||
T32 = FNMS(KP707106781, T2T, T2Q);
|
||||
T35 = T33 - T34;
|
||||
Rm[WS(rs, 2)] = FNMS(KP923879532, T35, T32);
|
||||
Rp[WS(rs, 5)] = FMA(KP923879532, T35, T32);
|
||||
T3R = FNMS(KP707106781, T3O, T3N);
|
||||
T3S = T30 - T2X;
|
||||
Im[WS(rs, 2)] = FMS(KP923879532, T3S, T3R);
|
||||
Ip[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by desc below: four TW_CEXP entries
 * followed by one TW_NEXT entry.
 * NOTE(review): presumably each TW_CEXP entry yields two values, which would
 * match the 8 W values read per loop iteration -- confirm against the
 * tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed at registration: 16, the name string, the twiddle list, &GENUS, and the counts { 104, 42, 92, 0 }, whose first three values match the additions / multiplications / fused multiply-add figures in this variant's header comment. */
static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, { 104, 42, 92, 0 } };
|
||||
|
||||
/* Entry point named in codlist.c: hands the kernel function, its descriptor and HC2C_VIA_RDFT to X(khc2c_register) for planner p. */
void X(codelet_hc2cf2_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cf2_16 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 108 FP multiplications,
|
||||
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
|
||||
* 82 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/* hc2cf2_16 kernel, #else branch of the preprocessor conditional (generated
* without the -fma option, per the "Generated by" comment above).
*
* For every m in [mb, me) the loop body reads eight values W[0]..W[7],
* combines them by products and sums into further multipliers, reads
* Rp/Rm/Ip/Im at offsets WS(rs, 0)..WS(rs, 7), and stores the results back
* into the same four arrays.
*
* Rp, Ip : advanced by +ms after each iteration
* Rm, Im : advanced by -ms after each iteration
* W      : offset by (mb - 1) * 8 before the loop, then +8 per iteration
* rs     : stride argument of WS()
* NOTE(review): the generator options (-n 16 -dit -twiddle-log3) suggest a
* size-16 decimation-in-time twiddle step -- confirm against the genfft docs. */
static void hc2cf2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all pointer updates happen in the for-increment clause. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
|
||||
E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
|
||||
{
|
||||
E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
|
||||
{
|
||||
E Th, Tn, Tj, Tm;
|
||||
/* Read W[0]..W[5] and form pairwise products/sums of them. */
T2 = W[0];
|
||||
T5 = W[1];
|
||||
Tg = W[2];
|
||||
Ti = W[3];
|
||||
Th = T2 * Tg;
|
||||
Tn = T5 * Tg;
|
||||
Tj = T5 * Ti;
|
||||
Tm = T2 * Ti;
|
||||
Tk = Th - Tj;
|
||||
To = Tm + Tn;
|
||||
TE = Tm - Tn;
|
||||
TC = Th + Tj;
|
||||
T6 = W[5];
|
||||
T7 = T5 * T6;
|
||||
Tv = Tg * T6;
|
||||
Ta = T2 * T6;
|
||||
Ts = Ti * T6;
|
||||
T3 = W[4];
|
||||
T4 = T2 * T3;
|
||||
Tw = Ti * T3;
|
||||
Tb = T5 * T3;
|
||||
Tr = Tg * T3;
|
||||
}
|
||||
T8 = T4 + T7;
|
||||
TW = Tv - Tw;
|
||||
TJ = Ta + Tb;
|
||||
Tt = Tr - Ts;
|
||||
TU = Tr + Ts;
|
||||
Tc = Ta - Tb;
|
||||
Tx = Tv + Tw;
|
||||
TH = T4 - T7;
|
||||
/* Last two stored values, W[6] and W[7], combined with W[0]/W[1]. */
TN = W[6];
|
||||
TO = W[7];
|
||||
TP = FMA(T2, TN, T5 * TO);
|
||||
TR = FNMS(T5, TN, T2 * TO);
|
||||
{
|
||||
E T1d, T1e, T19, T1a;
|
||||
T1d = Tk * T6;
|
||||
T1e = To * T3;
|
||||
T1f = T1d - T1e;
|
||||
T1k = T1d + T1e;
|
||||
T19 = Tk * T3;
|
||||
T1a = To * T6;
|
||||
T1b = T19 + T1a;
|
||||
T1i = T19 - T1a;
|
||||
}
|
||||
{
|
||||
E T1w, T1x, T1s, T1t;
|
||||
T1w = TC * T6;
|
||||
T1x = TE * T3;
|
||||
T1y = T1w - T1x;
|
||||
T1H = T1w + T1x;
|
||||
T1s = TC * T3;
|
||||
T1t = TE * T6;
|
||||
T1u = T1s + T1t;
|
||||
T1F = T1s - T1t;
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
|
||||
E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
|
||||
E T2S, T2T, T28, T2A, T2d, T2B;
|
||||
{
|
||||
E T1, T3d, Te, T3c, T9, Td;
|
||||
/* Read the inputs pairwise (Rp/Rm, then Ip/Im) and multiply each pair by
* the multipliers computed above; index-0 Rp/Rm values are used as read. */
T1 = Rp[0];
|
||||
T3d = Rm[0];
|
||||
T9 = Rp[WS(rs, 4)];
|
||||
Td = Rm[WS(rs, 4)];
|
||||
Te = FMA(T8, T9, Tc * Td);
|
||||
T3c = FNMS(Tc, T9, T8 * Td);
|
||||
Tf = T1 + Te;
|
||||
T3r = T3d - T3c;
|
||||
T1N = T1 - Te;
|
||||
T3e = T3c + T3d;
|
||||
}
|
||||
{
|
||||
E Tq, T1O, Tz, T1P;
|
||||
{
|
||||
E Tl, Tp, Tu, Ty;
|
||||
Tl = Rp[WS(rs, 2)];
|
||||
Tp = Rm[WS(rs, 2)];
|
||||
Tq = FMA(Tk, Tl, To * Tp);
|
||||
T1O = FNMS(To, Tl, Tk * Tp);
|
||||
Tu = Rp[WS(rs, 6)];
|
||||
Ty = Rm[WS(rs, 6)];
|
||||
Tz = FMA(Tt, Tu, Tx * Ty);
|
||||
T1P = FNMS(Tx, Tu, Tt * Ty);
|
||||
}
|
||||
TA = Tq + Tz;
|
||||
T3s = Tq - Tz;
|
||||
T1Q = T1O - T1P;
|
||||
T3b = T1O + T1P;
|
||||
}
|
||||
{
|
||||
E TG, T1S, TL, T1T, T1U, T1V;
|
||||
{
|
||||
E TD, TF, TI, TK;
|
||||
TD = Rp[WS(rs, 1)];
|
||||
TF = Rm[WS(rs, 1)];
|
||||
TG = FMA(TC, TD, TE * TF);
|
||||
T1S = FNMS(TE, TD, TC * TF);
|
||||
TI = Rp[WS(rs, 5)];
|
||||
TK = Rm[WS(rs, 5)];
|
||||
TL = FMA(TH, TI, TJ * TK);
|
||||
T1T = FNMS(TJ, TI, TH * TK);
|
||||
}
|
||||
TM = TG + TL;
|
||||
T2M = T1S + T1T;
|
||||
T1U = T1S - T1T;
|
||||
T1V = TG - TL;
|
||||
T1W = T1U - T1V;
|
||||
T2w = T1V + T1U;
|
||||
}
|
||||
{
|
||||
E TT, T1Y, TY, T1Z, T1X, T20;
|
||||
{
|
||||
E TQ, TS, TV, TX;
|
||||
TQ = Rp[WS(rs, 7)];
|
||||
TS = Rm[WS(rs, 7)];
|
||||
TT = FMA(TP, TQ, TR * TS);
|
||||
T1Y = FNMS(TR, TQ, TP * TS);
|
||||
TV = Rp[WS(rs, 3)];
|
||||
TX = Rm[WS(rs, 3)];
|
||||
TY = FMA(TU, TV, TW * TX);
|
||||
T1Z = FNMS(TW, TV, TU * TX);
|
||||
}
|
||||
TZ = TT + TY;
|
||||
T2N = T1Y + T1Z;
|
||||
T1X = TT - TY;
|
||||
T20 = T1Y - T1Z;
|
||||
T21 = T1X + T20;
|
||||
T2x = T1X - T20;
|
||||
}
|
||||
{
|
||||
E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
|
||||
{
|
||||
E T1p, T1q, T1G, T1I;
|
||||
T1p = Ip[WS(rs, 7)];
|
||||
T1q = Im[WS(rs, 7)];
|
||||
T1r = FMA(TN, T1p, TO * T1q);
|
||||
T2k = FNMS(TO, T1p, TN * T1q);
|
||||
T1G = Ip[WS(rs, 5)];
|
||||
T1I = Im[WS(rs, 5)];
|
||||
T1J = FMA(T1F, T1G, T1H * T1I);
|
||||
T2h = FNMS(T1H, T1G, T1F * T1I);
|
||||
}
|
||||
{
|
||||
E T1v, T1z, T1C, T1D;
|
||||
T1v = Ip[WS(rs, 3)];
|
||||
T1z = Im[WS(rs, 3)];
|
||||
T1A = FMA(T1u, T1v, T1y * T1z);
|
||||
T2l = FNMS(T1y, T1v, T1u * T1z);
|
||||
T1C = Ip[WS(rs, 1)];
|
||||
T1D = Im[WS(rs, 1)];
|
||||
T1E = FMA(Tg, T1C, Ti * T1D);
|
||||
T2g = FNMS(Ti, T1C, Tg * T1D);
|
||||
}
|
||||
T1B = T1r + T1A;
|
||||
T1K = T1E + T1J;
|
||||
T2V = T1B - T1K;
|
||||
T2W = T2k + T2l;
|
||||
T2X = T2g + T2h;
|
||||
T2Y = T2W - T2X;
|
||||
{
|
||||
E T2f, T2i, T2m, T2n;
|
||||
T2f = T1r - T1A;
|
||||
T2i = T2g - T2h;
|
||||
T2j = T2f - T2i;
|
||||
T2D = T2f + T2i;
|
||||
T2m = T2k - T2l;
|
||||
T2n = T1E - T1J;
|
||||
T2o = T2m + T2n;
|
||||
T2E = T2m - T2n;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
|
||||
{
|
||||
E T12, T13, T1j, T1l;
|
||||
T12 = Ip[0];
|
||||
T13 = Im[0];
|
||||
T14 = FMA(T2, T12, T5 * T13);
|
||||
T24 = FNMS(T5, T12, T2 * T13);
|
||||
T1j = Ip[WS(rs, 6)];
|
||||
T1l = Im[WS(rs, 6)];
|
||||
T1m = FMA(T1i, T1j, T1k * T1l);
|
||||
T2b = FNMS(T1k, T1j, T1i * T1l);
|
||||
}
|
||||
{
|
||||
E T15, T16, T1c, T1g;
|
||||
T15 = Ip[WS(rs, 4)];
|
||||
T16 = Im[WS(rs, 4)];
|
||||
T17 = FMA(T3, T15, T6 * T16);
|
||||
T25 = FNMS(T6, T15, T3 * T16);
|
||||
T1c = Ip[WS(rs, 2)];
|
||||
T1g = Im[WS(rs, 2)];
|
||||
T1h = FMA(T1b, T1c, T1f * T1g);
|
||||
T2a = FNMS(T1f, T1c, T1b * T1g);
|
||||
}
|
||||
T18 = T14 + T17;
|
||||
T1n = T1h + T1m;
|
||||
T2Q = T18 - T1n;
|
||||
T2R = T24 + T25;
|
||||
T2S = T2a + T2b;
|
||||
T2T = T2R - T2S;
|
||||
{
|
||||
E T26, T27, T29, T2c;
|
||||
T26 = T24 - T25;
|
||||
T27 = T1h - T1m;
|
||||
T28 = T26 + T27;
|
||||
T2A = T26 - T27;
|
||||
T29 = T14 - T17;
|
||||
T2c = T2a - T2b;
|
||||
T2d = T29 - T2c;
|
||||
T2B = T29 + T2c;
|
||||
}
|
||||
}
|
||||
/* The four blocks below each combine the partial sums and store eight
* results (two each into Rm, Im, Rp and Ip). */
{
|
||||
E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
|
||||
{
|
||||
E T1R, T22, T3y, T3z;
|
||||
T1R = T1N - T1Q;
|
||||
T22 = KP707106781 * (T1W - T21);
|
||||
T23 = T1R + T22;
|
||||
T2r = T1R - T22;
|
||||
T3y = KP707106781 * (T2x - T2w);
|
||||
T3z = T3s + T3r;
|
||||
T3A = T3y + T3z;
|
||||
T3C = T3z - T3y;
|
||||
}
|
||||
{
|
||||
E T2e, T2p, T2s, T2t;
|
||||
T2e = FMA(KP923879532, T28, KP382683432 * T2d);
|
||||
T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
|
||||
T2q = T2e + T2p;
|
||||
T3B = T2p - T2e;
|
||||
T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
|
||||
T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
|
||||
T2u = T2s - T2t;
|
||||
T3x = T2s + T2t;
|
||||
}
|
||||
Rm[WS(rs, 4)] = T23 - T2q;
|
||||
Im[WS(rs, 4)] = T3x - T3A;
|
||||
Rp[WS(rs, 3)] = T23 + T2q;
|
||||
Ip[WS(rs, 3)] = T3x + T3A;
|
||||
Rm[0] = T2r - T2u;
|
||||
Im[0] = T3B - T3C;
|
||||
Rp[WS(rs, 7)] = T2r + T2u;
|
||||
Ip[WS(rs, 7)] = T3B + T3C;
|
||||
}
|
||||
{
|
||||
E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
|
||||
{
|
||||
E T2L, T2O, T3k, T3l;
|
||||
T2L = Tf - TA;
|
||||
T2O = T2M - T2N;
|
||||
T2P = T2L + T2O;
|
||||
T31 = T2L - T2O;
|
||||
T3k = TZ - TM;
|
||||
T3l = T3e - T3b;
|
||||
T3m = T3k + T3l;
|
||||
T3o = T3l - T3k;
|
||||
}
|
||||
{
|
||||
E T2U, T2Z, T32, T33;
|
||||
T2U = T2Q + T2T;
|
||||
T2Z = T2V - T2Y;
|
||||
T30 = KP707106781 * (T2U + T2Z);
|
||||
T3n = KP707106781 * (T2Z - T2U);
|
||||
T32 = T2T - T2Q;
|
||||
T33 = T2V + T2Y;
|
||||
T34 = KP707106781 * (T32 - T33);
|
||||
T3j = KP707106781 * (T32 + T33);
|
||||
}
|
||||
Rm[WS(rs, 5)] = T2P - T30;
|
||||
Im[WS(rs, 5)] = T3j - T3m;
|
||||
Rp[WS(rs, 2)] = T2P + T30;
|
||||
Ip[WS(rs, 2)] = T3j + T3m;
|
||||
Rm[WS(rs, 1)] = T31 - T34;
|
||||
Im[WS(rs, 1)] = T3n - T3o;
|
||||
Rp[WS(rs, 6)] = T31 + T34;
|
||||
Ip[WS(rs, 6)] = T3n + T3o;
|
||||
}
|
||||
{
|
||||
E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
|
||||
{
|
||||
E T2v, T2y, T3q, T3t;
|
||||
T2v = T1N + T1Q;
|
||||
T2y = KP707106781 * (T2w + T2x);
|
||||
T2z = T2v + T2y;
|
||||
T2H = T2v - T2y;
|
||||
T3q = KP707106781 * (T1W + T21);
|
||||
T3t = T3r - T3s;
|
||||
T3u = T3q + T3t;
|
||||
T3w = T3t - T3q;
|
||||
}
|
||||
{
|
||||
E T2C, T2F, T2I, T2J;
|
||||
T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
|
||||
T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
|
||||
T2G = T2C + T2F;
|
||||
T3v = T2F - T2C;
|
||||
T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
|
||||
T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
|
||||
T2K = T2I - T2J;
|
||||
T3p = T2I + T2J;
|
||||
}
|
||||
Rm[WS(rs, 6)] = T2z - T2G;
|
||||
Im[WS(rs, 6)] = T3p - T3u;
|
||||
Rp[WS(rs, 1)] = T2z + T2G;
|
||||
Ip[WS(rs, 1)] = T3p + T3u;
|
||||
Rm[WS(rs, 2)] = T2H - T2K;
|
||||
Im[WS(rs, 2)] = T3v - T3w;
|
||||
Rp[WS(rs, 5)] = T2H + T2K;
|
||||
Ip[WS(rs, 5)] = T3v + T3w;
|
||||
}
|
||||
{
|
||||
E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
|
||||
{
|
||||
E TB, T10, T3a, T3f;
|
||||
TB = Tf + TA;
|
||||
T10 = TM + TZ;
|
||||
T11 = TB + T10;
|
||||
T35 = TB - T10;
|
||||
T3a = T2M + T2N;
|
||||
T3f = T3b + T3e;
|
||||
T3g = T3a + T3f;
|
||||
T3i = T3f - T3a;
|
||||
}
|
||||
{
|
||||
E T1o, T1L, T36, T37;
|
||||
T1o = T18 + T1n;
|
||||
T1L = T1B + T1K;
|
||||
T1M = T1o + T1L;
|
||||
T3h = T1L - T1o;
|
||||
T36 = T2R + T2S;
|
||||
T37 = T2W + T2X;
|
||||
T38 = T36 - T37;
|
||||
T39 = T36 + T37;
|
||||
}
|
||||
Rm[WS(rs, 7)] = T11 - T1M;
|
||||
Im[WS(rs, 7)] = T39 - T3g;
|
||||
Rp[0] = T11 + T1M;
|
||||
Ip[0] = T39 + T3g;
|
||||
Rm[WS(rs, 3)] = T35 - T38;
|
||||
Im[WS(rs, 3)] = T3h - T3i;
|
||||
Rp[WS(rs, 4)] = T35 + T38;
|
||||
Ip[WS(rs, 4)] = T3h + T3i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hc2cf2_16 descriptor `desc`:
* four TW_CEXP entries (last field 1, 3, 9, 15) followed by one TW_NEXT entry.
* The kernel above reads W[0]..W[7] per iteration.
* NOTE(review): that would be two stored values per TW_CEXP entry, with
* TW_NEXT ending the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for this codelet; its address is passed to X(khc2c_register).
* Fields as written: 16, the name string "hc2cf2_16", the twiddle table,
* &GENUS and the tuple { 156, 68, 40, 0 }, whose first three numbers equal
* the "156 additions, 68 multiplications, 40 fused multiply/add" figures in
* the generator comment of this branch. */
static const hc2c_desc desc = { 16, "hc2cf2_16", twinstr, &GENUS, { 156, 68, 40, 0 } };
|
||||
|
||||
/* Registers the hc2cf2_16 kernel with planner `p` by forwarding the kernel
* function, the address of `desc` and the HC2C_VIA_RDFT constant to
* X(khc2c_register). */
void X(codelet_hc2cf2_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf2_16, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
1097
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_20.c
Normal file
1097
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1893
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_32.c
Normal file
1893
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_32.c
Normal file
File diff suppressed because it is too large
Load Diff
200
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_4.c
Normal file
200
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_4.c
Normal file
@@ -0,0 +1,200 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:34 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cf2_4 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/* hc2cf2_4 kernel, variant compiled when ARCH_PREFERS_FMA or
* ISA_EXTENSION_PREFERS_FMA is defined.
*
* For every m in [mb, me) the loop body reads W[0]..W[3], derives one extra
* multiplier pair (T7, Tb) from them, reads Rp/Rm/Ip/Im at offsets 0 and
* WS(rs, 1), and stores eight results back into the same four arrays.
*
* Rp, Ip : advanced by +ms after each iteration
* Rm, Im : advanced by -ms after each iteration
* W      : offset by (mb - 1) * 4 before the loop, then +4 per iteration
* rs     : stride argument of WS()
* NOTE(review): the generator options (-n 4 -dit -twiddle-log3) suggest a
* size-4 decimation-in-time twiddle step -- confirm against the genfft docs. */
static void hc2cf2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T2, T6, T3, T5, T7, Tb, T4, Ta;
|
||||
/* Read the four stored values and combine them into T7 and Tb. */
T2 = W[0];
|
||||
T6 = W[3];
|
||||
T3 = W[2];
|
||||
T4 = T2 * T3;
|
||||
Ta = T2 * T6;
|
||||
T5 = W[1];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
Tb = FNMS(T5, T3, Ta);
|
||||
{
|
||||
E T1, Tx, Td, Tw, Ti, Tq, Tm, Ts;
|
||||
/* Index-0 Rp/Rm values are used as read; the other three input pairs are
* multiplied by (T7, Tb), (T2, T5) and (T3, T6) respectively. */
T1 = Rp[0];
|
||||
Tx = Rm[0];
|
||||
{
|
||||
E T8, T9, Tc, Tv;
|
||||
T8 = Rp[WS(rs, 1)];
|
||||
T9 = T7 * T8;
|
||||
Tc = Rm[WS(rs, 1)];
|
||||
Tv = T7 * Tc;
|
||||
Td = FMA(Tb, Tc, T9);
|
||||
Tw = FNMS(Tb, T8, Tv);
|
||||
}
|
||||
{
|
||||
E Tf, Tg, Th, Tp;
|
||||
Tf = Ip[0];
|
||||
Tg = T2 * Tf;
|
||||
Th = Im[0];
|
||||
Tp = T2 * Th;
|
||||
Ti = FMA(T5, Th, Tg);
|
||||
Tq = FNMS(T5, Tf, Tp);
|
||||
}
|
||||
{
|
||||
E Tj, Tk, Tl, Tr;
|
||||
Tj = Ip[WS(rs, 1)];
|
||||
Tk = T3 * Tj;
|
||||
Tl = Im[WS(rs, 1)];
|
||||
Tr = T3 * Tl;
|
||||
Tm = FMA(T6, Tl, Tk);
|
||||
Ts = FNMS(T6, Tj, Tr);
|
||||
}
|
||||
/* Sums and differences of the four products, stored back in place. */
{
|
||||
E Te, Tn, Tu, Ty;
|
||||
Te = T1 + Td;
|
||||
Tn = Ti + Tm;
|
||||
Rm[WS(rs, 1)] = Te - Tn;
|
||||
Rp[0] = Te + Tn;
|
||||
Tu = Tq + Ts;
|
||||
Ty = Tw + Tx;
|
||||
Im[WS(rs, 1)] = Tu - Ty;
|
||||
Ip[0] = Tu + Ty;
|
||||
}
|
||||
{
|
||||
E To, Tt, Tz, TA;
|
||||
To = T1 - Td;
|
||||
Tt = Tq - Ts;
|
||||
Rm[0] = To - Tt;
|
||||
Rp[WS(rs, 1)] = To + Tt;
|
||||
Tz = Tm - Ti;
|
||||
TA = Tx - Tw;
|
||||
Im[0] = Tz - TA;
|
||||
Ip[WS(rs, 1)] = Tz + TA;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hc2cf2_4 descriptor `desc`:
* two TW_CEXP entries (last field 1 and 3) followed by one TW_NEXT entry.
* The kernel above reads W[0]..W[3] per iteration.
* NOTE(review): that would be two stored values per TW_CEXP entry, with
* TW_NEXT ending the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for this codelet; its address is passed to X(khc2c_register).
* Fields as written: 4, the name string "hc2cf2_4", the twiddle table,
* &GENUS and the tuple { 16, 8, 8, 0 }, whose first three numbers equal the
* "16 additions, 8 multiplications, 8 fused multiply/add" figures in the
* generator comment of this branch. */
static const hc2c_desc desc = { 4, "hc2cf2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
|
||||
|
||||
/* Registers the hc2cf2_4 kernel with planner `p` by forwarding the kernel
* function, the address of `desc` and the HC2C_VIA_RDFT constant to
* X(khc2c_register). */
void X(codelet_hc2cf2_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf2_4, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cf2_4 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/* hc2cf2_4 kernel, #else branch (compiled when neither ARCH_PREFERS_FMA nor
* ISA_EXTENSION_PREFERS_FMA is defined).
*
* For every m in [mb, me) the loop body reads W[0]..W[3], derives one extra
* multiplier pair (T6, T8) from them, reads Rp/Rm/Ip/Im at offsets 0 and
* WS(rs, 1), and stores eight results back into the same four arrays.
*
* Rp, Ip : advanced by +ms after each iteration
* Rm, Im : advanced by -ms after each iteration
* W      : offset by (mb - 1) * 4 before the loop, then +4 per iteration
* rs     : stride argument of WS() */
static void hc2cf2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T2, T4, T3, T5, T6, T8;
|
||||
/* Read the four stored values and combine them into T6 and T8. */
T2 = W[0];
|
||||
T4 = W[1];
|
||||
T3 = W[2];
|
||||
T5 = W[3];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T8 = FNMS(T4, T3, T2 * T5);
|
||||
{
|
||||
E T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
|
||||
/* Index-0 Rp/Rm values are used as read; the other three input pairs are
* multiplied by (T6, T8), (T2, T4) and (T3, T5) respectively. */
T1 = Rp[0];
|
||||
Tp = Rm[0];
|
||||
T7 = Rp[WS(rs, 1)];
|
||||
T9 = Rm[WS(rs, 1)];
|
||||
Ta = FMA(T6, T7, T8 * T9);
|
||||
To = FNMS(T8, T7, T6 * T9);
|
||||
{
|
||||
E Tc, Td, Tf, Tg;
|
||||
Tc = Ip[0];
|
||||
Td = Im[0];
|
||||
Te = FMA(T2, Tc, T4 * Td);
|
||||
Tk = FNMS(T4, Tc, T2 * Td);
|
||||
Tf = Ip[WS(rs, 1)];
|
||||
Tg = Im[WS(rs, 1)];
|
||||
Th = FMA(T3, Tf, T5 * Tg);
|
||||
Tl = FNMS(T5, Tf, T3 * Tg);
|
||||
}
|
||||
/* Sums and differences of the four products, stored back in place. */
{
|
||||
E Tb, Ti, Tn, Tq;
|
||||
Tb = T1 + Ta;
|
||||
Ti = Te + Th;
|
||||
Rm[WS(rs, 1)] = Tb - Ti;
|
||||
Rp[0] = Tb + Ti;
|
||||
Tn = Tk + Tl;
|
||||
Tq = To + Tp;
|
||||
Im[WS(rs, 1)] = Tn - Tq;
|
||||
Ip[0] = Tn + Tq;
|
||||
}
|
||||
{
|
||||
E Tj, Tm, Tr, Ts;
|
||||
Tj = T1 - Ta;
|
||||
Tm = Tk - Tl;
|
||||
Rm[0] = Tj - Tm;
|
||||
Rp[WS(rs, 1)] = Tj + Tm;
|
||||
Tr = Th - Te;
|
||||
Ts = Tp - To;
|
||||
Im[0] = Tr - Ts;
|
||||
Ip[WS(rs, 1)] = Tr + Ts;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hc2cf2_4 descriptor `desc`:
* two TW_CEXP entries (last field 1 and 3) followed by one TW_NEXT entry.
* The kernel above reads W[0]..W[3] per iteration.
* NOTE(review): that would be two stored values per TW_CEXP entry, with
* TW_NEXT ending the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for this codelet; its address is passed to X(khc2c_register).
* Fields as written: 4, the name string "hc2cf2_4", the twiddle table,
* &GENUS and the tuple { 16, 8, 8, 0 }, whose first three numbers equal the
* "16 additions, 8 multiplications, 8 fused multiply/add" figures in the
* generator comment of this branch. */
static const hc2c_desc desc = { 4, "hc2cf2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
|
||||
|
||||
/* Registers the hc2cf2_4 kernel with planner `p` by forwarding the kernel
* function, the address of `desc` and the HC2C_VIA_RDFT constant to
* X(khc2c_register). */
void X(codelet_hc2cf2_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf2_4, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
390
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_8.c
Normal file
390
fftw-3.3.10/rdft/scalar/r2cf/hc2cf2_8.c
Normal file
@@ -0,0 +1,390 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:34 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 50 FP multiplications,
|
||||
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
|
||||
* 48 stack variables, 1 constant, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/* hc2cf2_8 kernel, variant compiled when ARCH_PREFERS_FMA or
* ISA_EXTENSION_PREFERS_FMA is defined.
*
* For every m in [mb, me) the loop body reads W[0]..W[5], derives further
* multipliers from them with FMA/FNMS, reads Rp/Rm/Ip/Im at offsets
* WS(rs, 0)..WS(rs, 3), and stores sixteen results back into the same four
* arrays.
*
* Rp, Ip : advanced by +ms after each iteration
* Rm, Im : advanced by -ms after each iteration
* W      : offset by (mb - 1) * 6 before the loop, then +6 per iteration
* rs     : stride argument of WS()
* NOTE(review): the generator options (-n 8 -dit -twiddle-log3) suggest a
* size-8 decimation-in-time twiddle step -- confirm against the genfft docs. */
static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
|
||||
{
|
||||
E T4, Tm, Tr, Ta, TB, TF;
|
||||
/* Read the six stored values and combine them into the derived
* multipliers Tf, T7, Ts, Tb, To, Ti, TC and TG. */
T2 = W[0];
|
||||
T3 = W[2];
|
||||
T4 = T2 * T3;
|
||||
Tl = W[4];
|
||||
Tm = T2 * Tl;
|
||||
Tn = W[5];
|
||||
Tr = T2 * Tn;
|
||||
T5 = W[1];
|
||||
T6 = W[3];
|
||||
Ta = T2 * T6;
|
||||
Tf = FMA(T5, T6, T4);
|
||||
T7 = FNMS(T5, T6, T4);
|
||||
Ts = FNMS(T5, Tl, Tr);
|
||||
Tb = FMA(T5, T3, Ta);
|
||||
To = FMA(T5, Tn, Tm);
|
||||
TB = Tf * Tl;
|
||||
TF = Tf * Tn;
|
||||
Ti = FNMS(T5, T3, Ta);
|
||||
TC = FMA(Ti, Tn, TB);
|
||||
TG = FNMS(Ti, Tl, TF);
|
||||
}
|
||||
{
|
||||
E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
|
||||
E TI, T11, T13, T15, T16;
|
||||
/* Index-0 Rp/Rm values are used as read; every other input pair is
* multiplied by one of the multiplier pairs above. */
T1 = Rp[0];
|
||||
T1s = Rm[0];
|
||||
{
|
||||
E T8, T9, Tc, T1q;
|
||||
T8 = Rp[WS(rs, 2)];
|
||||
T9 = T7 * T8;
|
||||
Tc = Rm[WS(rs, 2)];
|
||||
T1q = T7 * Tc;
|
||||
Td = FMA(Tb, Tc, T9);
|
||||
T1r = FNMS(Tb, T8, T1q);
|
||||
}
|
||||
{
|
||||
E Tp, Tq, Tt, TX;
|
||||
Tp = Rp[WS(rs, 3)];
|
||||
Tq = To * Tp;
|
||||
Tt = Rm[WS(rs, 3)];
|
||||
TX = To * Tt;
|
||||
Tu = FMA(Ts, Tt, Tq);
|
||||
TY = FNMS(Ts, Tp, TX);
|
||||
}
|
||||
{
|
||||
E Tg, Th, Tj, TV;
|
||||
Tg = Rp[WS(rs, 1)];
|
||||
Th = Tf * Tg;
|
||||
Tj = Rm[WS(rs, 1)];
|
||||
TV = Tf * Tj;
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TW = FNMS(Ti, Tg, TV);
|
||||
}
|
||||
{
|
||||
E TK, TL, TM, T19, TO, TP, TQ, T1b;
|
||||
TK = Ip[WS(rs, 3)];
|
||||
TL = Tl * TK;
|
||||
TM = Im[WS(rs, 3)];
|
||||
T19 = Tl * TM;
|
||||
TO = Ip[WS(rs, 1)];
|
||||
TP = T3 * TO;
|
||||
TQ = Im[WS(rs, 1)];
|
||||
T1b = T3 * TQ;
|
||||
TN = FMA(Tn, TM, TL);
|
||||
TR = FMA(T6, TQ, TP);
|
||||
T18 = TN - TR;
|
||||
T1a = FNMS(Tn, TK, T19);
|
||||
T1c = FNMS(T6, TO, T1b);
|
||||
T1d = T1a - T1c;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, Tz, T12, TD, TE, TH, T14;
|
||||
Tx = Ip[0];
|
||||
Ty = T2 * Tx;
|
||||
Tz = Im[0];
|
||||
T12 = T2 * Tz;
|
||||
TD = Ip[WS(rs, 2)];
|
||||
TE = TC * TD;
|
||||
TH = Im[WS(rs, 2)];
|
||||
T14 = TC * TH;
|
||||
TA = FMA(T5, Tz, Ty);
|
||||
TI = FMA(TG, TH, TE);
|
||||
T11 = TA - TI;
|
||||
T13 = FNMS(T5, Tx, T12);
|
||||
T15 = FNMS(TG, TD, T14);
|
||||
T16 = T13 - T15;
|
||||
}
|
||||
/* First group of eight stores: the results scaled by KP707106781. */
{
|
||||
E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
|
||||
{
|
||||
E TU, TZ, T1x, T1y;
|
||||
TU = T1 - Td;
|
||||
TZ = TW - TY;
|
||||
T10 = TU + TZ;
|
||||
T1g = TU - TZ;
|
||||
T1x = T1s - T1r;
|
||||
T1y = Tk - Tu;
|
||||
T1z = T1x - T1y;
|
||||
T1B = T1y + T1x;
|
||||
}
|
||||
{
|
||||
E T17, T1e, T1h, T1i;
|
||||
T17 = T11 + T16;
|
||||
T1e = T18 - T1d;
|
||||
T1f = T17 + T1e;
|
||||
T1C = T1e - T17;
|
||||
T1h = T16 - T11;
|
||||
T1i = T18 + T1d;
|
||||
T1j = T1h - T1i;
|
||||
T1A = T1h + T1i;
|
||||
}
|
||||
Rm[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
|
||||
Im[WS(rs, 2)] = FMS(KP707106781, T1A, T1z);
|
||||
Rp[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
|
||||
Ip[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
|
||||
Rm[0] = FNMS(KP707106781, T1j, T1g);
|
||||
Im[0] = FMS(KP707106781, T1C, T1B);
|
||||
Rp[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
|
||||
Ip[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
|
||||
}
|
||||
/* Second group of eight stores: plain sums and differences. */
{
|
||||
E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
|
||||
{
|
||||
E Te, Tv, T1p, T1t;
|
||||
Te = T1 + Td;
|
||||
Tv = Tk + Tu;
|
||||
Tw = Te + Tv;
|
||||
T1k = Te - Tv;
|
||||
T1p = TW + TY;
|
||||
T1t = T1r + T1s;
|
||||
T1u = T1p + T1t;
|
||||
T1w = T1t - T1p;
|
||||
}
|
||||
{
|
||||
E TJ, TS, T1l, T1m;
|
||||
TJ = TA + TI;
|
||||
TS = TN + TR;
|
||||
TT = TJ + TS;
|
||||
T1v = TS - TJ;
|
||||
T1l = T13 + T15;
|
||||
T1m = T1a + T1c;
|
||||
T1n = T1l - T1m;
|
||||
T1o = T1l + T1m;
|
||||
}
|
||||
Rm[WS(rs, 3)] = Tw - TT;
|
||||
Im[WS(rs, 3)] = T1o - T1u;
|
||||
Rp[0] = Tw + TT;
|
||||
Ip[0] = T1o + T1u;
|
||||
Rm[WS(rs, 1)] = T1k - T1n;
|
||||
Im[WS(rs, 1)] = T1v - T1w;
|
||||
Rp[WS(rs, 2)] = T1k + T1n;
|
||||
Ip[WS(rs, 2)] = T1v + T1w;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hc2cf2_8 descriptor `desc`:
* three TW_CEXP entries (last field 1, 3, 7) followed by one TW_NEXT entry.
* The kernel above reads W[0]..W[5] per iteration.
* NOTE(review): that would be two stored values per TW_CEXP entry, with
* TW_NEXT ending the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for this codelet; its address is passed to X(khc2c_register).
* Fields as written: 8, the name string "hc2cf2_8", the twiddle table,
* &GENUS and the tuple { 44, 20, 30, 0 }, whose first three numbers equal
* the "44 additions, 20 multiplications, 30 fused multiply/add" figures in
* the generator comment of this branch. */
static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, { 44, 20, 30, 0 } };
|
||||
|
||||
/* Registers the hc2cf2_8 kernel with planner `p` by forwarding the kernel
* function, the address of `desc` and the HC2C_VIA_RDFT constant to
* X(khc2c_register). */
void X(codelet_hc2cf2_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cf2_8 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 44 FP multiplications,
|
||||
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
|
||||
* 42 stack variables, 1 constant, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/* hc2cf2_8 kernel, #else branch (compiled when neither ARCH_PREFERS_FMA nor
* ISA_EXTENSION_PREFERS_FMA is defined).
*
* For every m in [mb, me) the loop body reads W[0]..W[5], derives further
* multipliers from them, reads Rp/Rm/Ip/Im at offsets WS(rs, 0)..WS(rs, 3),
* and stores sixteen results back into the same four arrays.
*
* Rp, Ip : advanced by +ms after each iteration
* Rm, Im : advanced by -ms after each iteration
* W      : offset by (mb - 1) * 6 before the loop, then +6 per iteration
* rs     : stride argument of WS() */
static void hc2cf2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
|
||||
{
|
||||
E T4, Tb, T7, Ta;
|
||||
/* Read the six stored values and combine them into the derived
* multipliers T8, Tc, Tg, Ti, Tn, Tz, Tp and Tx. */
T2 = W[0];
|
||||
T5 = W[1];
|
||||
T3 = W[2];
|
||||
T6 = W[3];
|
||||
T4 = T2 * T3;
|
||||
Tb = T5 * T3;
|
||||
T7 = T5 * T6;
|
||||
Ta = T2 * T6;
|
||||
T8 = T4 - T7;
|
||||
Tc = Ta + Tb;
|
||||
Tg = T4 + T7;
|
||||
Ti = Ta - Tb;
|
||||
Tl = W[4];
|
||||
Tm = W[5];
|
||||
Tn = FMA(T2, Tl, T5 * Tm);
|
||||
Tz = FNMS(Ti, Tl, Tg * Tm);
|
||||
Tp = FNMS(T5, Tl, T2 * Tm);
|
||||
Tx = FMA(Tg, Tl, Ti * Tm);
|
||||
}
|
||||
{
|
||||
E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
|
||||
E TT;
|
||||
{
|
||||
E T1, T1c, Te, T1b, T9, Td;
|
||||
/* Index-0 Rp/Rm values are used as read; every other input pair is
* multiplied by one of the multiplier pairs above. */
T1 = Rp[0];
|
||||
T1c = Rm[0];
|
||||
T9 = Rp[WS(rs, 2)];
|
||||
Td = Rm[WS(rs, 2)];
|
||||
Te = FMA(T8, T9, Tc * Td);
|
||||
T1b = FNMS(Tc, T9, T8 * Td);
|
||||
Tf = T1 + Te;
|
||||
T1i = T1c - T1b;
|
||||
TL = T1 - Te;
|
||||
T1d = T1b + T1c;
|
||||
}
|
||||
{
|
||||
E TF, TW, TI, TX;
|
||||
{
|
||||
E TD, TE, TG, TH;
|
||||
TD = Ip[WS(rs, 3)];
|
||||
TE = Im[WS(rs, 3)];
|
||||
TF = FMA(Tl, TD, Tm * TE);
|
||||
TW = FNMS(Tm, TD, Tl * TE);
|
||||
TG = Ip[WS(rs, 1)];
|
||||
TH = Im[WS(rs, 1)];
|
||||
TI = FMA(T3, TG, T6 * TH);
|
||||
TX = FNMS(T6, TG, T3 * TH);
|
||||
}
|
||||
TJ = TF + TI;
|
||||
T17 = TW + TX;
|
||||
TV = TF - TI;
|
||||
TY = TW - TX;
|
||||
}
|
||||
{
|
||||
E Tk, TM, Tr, TN;
|
||||
{
|
||||
E Th, Tj, To, Tq;
|
||||
Th = Rp[WS(rs, 1)];
|
||||
Tj = Rm[WS(rs, 1)];
|
||||
Tk = FMA(Tg, Th, Ti * Tj);
|
||||
TM = FNMS(Ti, Th, Tg * Tj);
|
||||
To = Rp[WS(rs, 3)];
|
||||
Tq = Rm[WS(rs, 3)];
|
||||
Tr = FMA(Tn, To, Tp * Tq);
|
||||
TN = FNMS(Tp, To, Tn * Tq);
|
||||
}
|
||||
Ts = Tk + Tr;
|
||||
T1j = Tk - Tr;
|
||||
TO = TM - TN;
|
||||
T1a = TM + TN;
|
||||
}
|
||||
{
|
||||
E Tw, TR, TB, TS;
|
||||
{
|
||||
E Tu, Tv, Ty, TA;
|
||||
Tu = Ip[0];
|
||||
Tv = Im[0];
|
||||
Tw = FMA(T2, Tu, T5 * Tv);
|
||||
TR = FNMS(T5, Tu, T2 * Tv);
|
||||
Ty = Ip[WS(rs, 2)];
|
||||
TA = Im[WS(rs, 2)];
|
||||
TB = FMA(Tx, Ty, Tz * TA);
|
||||
TS = FNMS(Tz, Ty, Tx * TA);
|
||||
}
|
||||
TC = Tw + TB;
|
||||
T16 = TR + TS;
|
||||
TQ = Tw - TB;
|
||||
TT = TR - TS;
|
||||
}
|
||||
/* Combine the partial sums and store all sixteen results in place; the
* last two inner blocks scale by KP707106781 before storing. */
{
|
||||
E Tt, TK, T1f, T1g;
|
||||
Tt = Tf + Ts;
|
||||
TK = TC + TJ;
|
||||
Rm[WS(rs, 3)] = Tt - TK;
|
||||
Rp[0] = Tt + TK;
|
||||
{
|
||||
E T19, T1e, T15, T18;
|
||||
T19 = T16 + T17;
|
||||
T1e = T1a + T1d;
|
||||
Im[WS(rs, 3)] = T19 - T1e;
|
||||
Ip[0] = T19 + T1e;
|
||||
T15 = Tf - Ts;
|
||||
T18 = T16 - T17;
|
||||
Rm[WS(rs, 1)] = T15 - T18;
|
||||
Rp[WS(rs, 2)] = T15 + T18;
|
||||
}
|
||||
T1f = TJ - TC;
|
||||
T1g = T1d - T1a;
|
||||
Im[WS(rs, 1)] = T1f - T1g;
|
||||
Ip[WS(rs, 2)] = T1f + T1g;
|
||||
{
|
||||
E T11, T1k, T14, T1h, T12, T13;
|
||||
T11 = TL - TO;
|
||||
T1k = T1i - T1j;
|
||||
T12 = TT - TQ;
|
||||
T13 = TV + TY;
|
||||
T14 = KP707106781 * (T12 - T13);
|
||||
T1h = KP707106781 * (T12 + T13);
|
||||
Rm[0] = T11 - T14;
|
||||
Ip[WS(rs, 1)] = T1h + T1k;
|
||||
Rp[WS(rs, 3)] = T11 + T14;
|
||||
Im[WS(rs, 2)] = T1h - T1k;
|
||||
}
|
||||
{
|
||||
E TP, T1m, T10, T1l, TU, TZ;
|
||||
TP = TL + TO;
|
||||
T1m = T1j + T1i;
|
||||
TU = TQ + TT;
|
||||
TZ = TV - TY;
|
||||
T10 = KP707106781 * (TU + TZ);
|
||||
T1l = KP707106781 * (TZ - TU);
|
||||
Rm[WS(rs, 2)] = TP - T10;
|
||||
Ip[WS(rs, 3)] = T1l + T1m;
|
||||
Rp[WS(rs, 1)] = TP + T10;
|
||||
Im[0] = T1l - T1m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hc2cf2_8 descriptor `desc`:
* three TW_CEXP entries (last field 1, 3, 7) followed by one TW_NEXT entry.
* The kernel above reads W[0]..W[5] per iteration.
* NOTE(review): that would be two stored values per TW_CEXP entry, with
* TW_NEXT ending the list -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for this codelet; its address is passed to X(khc2c_register).
* Fields as written: 8, the name string "hc2cf2_8", the twiddle table,
* &GENUS and the tuple { 56, 26, 18, 0 }, whose first three numbers equal
* the "56 additions, 26 multiplications, 18 fused multiply/add" figures in
* the generator comment of this branch. */
static const hc2c_desc desc = { 8, "hc2cf2_8", twinstr, &GENUS, { 56, 26, 18, 0 } };
|
||||
|
||||
/* Registers the hc2cf2_8 kernel with planner `p` by forwarding the kernel
* function, the address of `desc` and the HC2C_VIA_RDFT constant to
* X(khc2c_register). */
void X(codelet_hc2cf2_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf2_8, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
489
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_10.c
Normal file
489
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_10.c
Normal file
@@ -0,0 +1,489 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cf_10 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 72 FP multiplications,
|
||||
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
|
||||
* 47 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_10 -- variant selected when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (see the enclosing #if).
 *
 * For every m in [mb, me) the loop body reads Rp[k], Ip[k], Rm[k], Im[k] at
 * offsets WS(rs, k) for k = 0..4 (20 loads), combines every pair except
 * (Rp[0], Rm[0]) with two consecutive entries of W, and stores 20 results
 * back into the same four arrays (in place).
 *
 * Rp, Ip : advanced by +ms after each iteration
 * Rm, Im : advanced by -ms after each iteration
 * W      : table of factors; W[0]..W[17] are read per iteration, starting at
 *          W + (mb - 1) * 18 and advancing by 18 each iteration
 * rs     : stride handed to WS() when indexing the four arrays
 * mb, me : half-open range of the loop counter m
 * ms     : per-iteration pointer step
 *
 * NOTE(review): FMA / FNMS / FMS / DK / WS / MAKE_VOLATILE_STRIDE are macros
 * defined outside this file; presumably FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c -- confirm in the headers.
 */
static void hc2cf_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; the literal values equal sin(2*pi/5), sqrt(5)/4,
 * (sqrt(5)-1)/2 and 1/4 respectively. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first rebased to the block for iteration mb, then all five pointers
 * are stepped in the increment clause. */
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
E T8, T26, T12, T1U, TM, TZ, T10, T1I, T1J, T24, T16, T17, T18, T1h, T1m;
|
||||
E T1P, Tl, Ty, Tz, T1F, T1G, T23, T13, T14, T15, T1s, T1x, T1O;
|
||||
/* (Rp[0], Rm[0]) are used unscaled; (Ip[2], Im[2]) are combined with
 * (W[8], W[9]). T8/T12 and T26/T1U are the differences and sums. */
{
|
||||
E T1, T1T, T3, T6, T4, T1R, T2, T7, T1S, T5;
|
||||
T1 = Rp[0];
|
||||
T1T = Rm[0];
|
||||
T3 = Ip[WS(rs, 2)];
|
||||
T6 = Im[WS(rs, 2)];
|
||||
T2 = W[8];
|
||||
T4 = T2 * T3;
|
||||
T1R = T2 * T6;
|
||||
T5 = W[9];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1S = FNMS(T5, T3, T1R);
|
||||
T8 = T1 - T7;
|
||||
T26 = T1T - T1S;
|
||||
T12 = T1 + T7;
|
||||
T1U = T1S + T1T;
|
||||
}
|
||||
/* Loads (Rp[2], Rm[2]) with W[6..7], (Ip[0], Im[0]) with W[0..1],
 * (Ip[4], Im[4]) with W[16..17], (Rp[3], Rm[3]) with W[10..11], then forms
 * their pairwise sums and differences. */
{
|
||||
E TF, T1e, TY, T1l, TL, T1g, TS, T1j;
|
||||
{
|
||||
E TB, TE, TC, T1d, TA, TD;
|
||||
TB = Rp[WS(rs, 2)];
|
||||
TE = Rm[WS(rs, 2)];
|
||||
TA = W[6];
|
||||
TC = TA * TB;
|
||||
T1d = TA * TE;
|
||||
TD = W[7];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T1e = FNMS(TD, TB, T1d);
|
||||
}
|
||||
{
|
||||
E TU, TX, TV, T1k, TT, TW;
|
||||
TU = Ip[0];
|
||||
TX = Im[0];
|
||||
TT = W[0];
|
||||
TV = TT * TU;
|
||||
T1k = TT * TX;
|
||||
TW = W[1];
|
||||
TY = FMA(TW, TX, TV);
|
||||
T1l = FNMS(TW, TU, T1k);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T1f, TG, TJ;
|
||||
TH = Ip[WS(rs, 4)];
|
||||
TK = Im[WS(rs, 4)];
|
||||
TG = W[16];
|
||||
TI = TG * TH;
|
||||
T1f = TG * TK;
|
||||
TJ = W[17];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T1g = FNMS(TJ, TH, T1f);
|
||||
}
|
||||
{
|
||||
E TO, TR, TP, T1i, TN, TQ;
|
||||
TO = Rp[WS(rs, 3)];
|
||||
TR = Rm[WS(rs, 3)];
|
||||
TN = W[10];
|
||||
TP = TN * TO;
|
||||
T1i = TN * TR;
|
||||
TQ = W[11];
|
||||
TS = FMA(TQ, TR, TP);
|
||||
T1j = FNMS(TQ, TO, T1i);
|
||||
}
|
||||
TM = TF - TL;
|
||||
TZ = TS - TY;
|
||||
T10 = TM + TZ;
|
||||
T1I = T1l - T1j;
|
||||
T1J = T1g - T1e;
|
||||
T24 = T1J + T1I;
|
||||
T16 = TF + TL;
|
||||
T17 = TS + TY;
|
||||
T18 = T16 + T17;
|
||||
T1h = T1e + T1g;
|
||||
T1m = T1j + T1l;
|
||||
T1P = T1h + T1m;
|
||||
}
|
||||
/* Loads (Rp[1], Rm[1]) with W[2..3], (Ip[1], Im[1]) with W[4..5],
 * (Ip[3], Im[3]) with W[12..13], (Rp[4], Rm[4]) with W[14..15], then forms
 * their pairwise sums and differences. */
{
|
||||
E Te, T1p, Tx, T1w, Tk, T1r, Tr, T1u;
|
||||
{
|
||||
E Ta, Td, Tb, T1o, T9, Tc;
|
||||
Ta = Rp[WS(rs, 1)];
|
||||
Td = Rm[WS(rs, 1)];
|
||||
T9 = W[2];
|
||||
Tb = T9 * Ta;
|
||||
T1o = T9 * Td;
|
||||
Tc = W[3];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
T1p = FNMS(Tc, Ta, T1o);
|
||||
}
|
||||
{
|
||||
E Tt, Tw, Tu, T1v, Ts, Tv;
|
||||
Tt = Ip[WS(rs, 1)];
|
||||
Tw = Im[WS(rs, 1)];
|
||||
Ts = W[4];
|
||||
Tu = Ts * Tt;
|
||||
T1v = Ts * Tw;
|
||||
Tv = W[5];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1w = FNMS(Tv, Tt, T1v);
|
||||
}
|
||||
{
|
||||
E Tg, Tj, Th, T1q, Tf, Ti;
|
||||
Tg = Ip[WS(rs, 3)];
|
||||
Tj = Im[WS(rs, 3)];
|
||||
Tf = W[12];
|
||||
Th = Tf * Tg;
|
||||
T1q = Tf * Tj;
|
||||
Ti = W[13];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
T1r = FNMS(Ti, Tg, T1q);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1t, Tm, Tp;
|
||||
Tn = Rp[WS(rs, 4)];
|
||||
Tq = Rm[WS(rs, 4)];
|
||||
Tm = W[14];
|
||||
To = Tm * Tn;
|
||||
T1t = Tm * Tq;
|
||||
Tp = W[15];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1u = FNMS(Tp, Tn, T1t);
|
||||
}
|
||||
Tl = Te - Tk;
|
||||
Ty = Tr - Tx;
|
||||
Tz = Tl + Ty;
|
||||
T1F = T1w - T1u;
|
||||
T1G = T1r - T1p;
|
||||
T23 = T1G + T1F;
|
||||
T13 = Te + Tk;
|
||||
T14 = Tr + Tx;
|
||||
T15 = T13 + T14;
|
||||
T1s = T1p + T1r;
|
||||
T1x = T1u + T1w;
|
||||
T1O = T1s + T1x;
|
||||
}
|
||||
/* Stores Rm[4], Rm[2], Rp[3], Rm[0], Rp[1]. */
{
|
||||
E T1D, T11, T1C, T1L, T1N, T1H, T1K, T1M, T1E;
|
||||
T1D = Tz - T10;
|
||||
T11 = Tz + T10;
|
||||
T1C = FNMS(KP250000000, T11, T8);
|
||||
T1H = T1F - T1G;
|
||||
T1K = T1I - T1J;
|
||||
T1L = FMA(KP618033988, T1K, T1H);
|
||||
T1N = FNMS(KP618033988, T1H, T1K);
|
||||
Rm[WS(rs, 4)] = T8 + T11;
|
||||
T1M = FNMS(KP559016994, T1D, T1C);
|
||||
Rm[WS(rs, 2)] = FNMS(KP951056516, T1N, T1M);
|
||||
Rp[WS(rs, 3)] = FMA(KP951056516, T1N, T1M);
|
||||
T1E = FMA(KP559016994, T1D, T1C);
|
||||
Rm[0] = FNMS(KP951056516, T1L, T1E);
|
||||
Rp[WS(rs, 1)] = FMA(KP951056516, T1L, T1E);
|
||||
}
|
||||
/* Stores Im[4], Im[2], Ip[3], Im[0], Ip[1]. */
{
|
||||
E T28, T25, T27, T2c, T2e, T2a, T2b, T2d, T29;
|
||||
T28 = T24 - T23;
|
||||
T25 = T23 + T24;
|
||||
T27 = FMA(KP250000000, T25, T26);
|
||||
T2a = Ty - Tl;
|
||||
T2b = TZ - TM;
|
||||
T2c = FMA(KP618033988, T2b, T2a);
|
||||
T2e = FNMS(KP618033988, T2a, T2b);
|
||||
Im[WS(rs, 4)] = T25 - T26;
|
||||
T2d = FNMS(KP559016994, T28, T27);
|
||||
Im[WS(rs, 2)] = FMS(KP951056516, T2e, T2d);
|
||||
Ip[WS(rs, 3)] = FMA(KP951056516, T2e, T2d);
|
||||
T29 = FMA(KP559016994, T28, T27);
|
||||
Im[0] = FMS(KP951056516, T2c, T29);
|
||||
Ip[WS(rs, 1)] = FMA(KP951056516, T2c, T29);
|
||||
}
|
||||
/* Stores Rp[0], Rp[4], Rm[3], Rp[2], Rm[1]. */
{
|
||||
E T1b, T19, T1a, T1z, T1B, T1n, T1y, T1A, T1c;
|
||||
T1b = T15 - T18;
|
||||
T19 = T15 + T18;
|
||||
T1a = FNMS(KP250000000, T19, T12);
|
||||
T1n = T1h - T1m;
|
||||
T1y = T1s - T1x;
|
||||
T1z = FNMS(KP618033988, T1y, T1n);
|
||||
T1B = FMA(KP618033988, T1n, T1y);
|
||||
Rp[0] = T12 + T19;
|
||||
T1A = FMA(KP559016994, T1b, T1a);
|
||||
Rp[WS(rs, 4)] = FNMS(KP951056516, T1B, T1A);
|
||||
Rm[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
|
||||
T1c = FNMS(KP559016994, T1b, T1a);
|
||||
Rp[WS(rs, 2)] = FNMS(KP951056516, T1z, T1c);
|
||||
Rm[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
|
||||
}
|
||||
/* Stores Ip[0], Im[3], Ip[4], Im[1], Ip[2]. */
{
|
||||
E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X;
|
||||
T1W = T1O - T1P;
|
||||
T1Q = T1O + T1P;
|
||||
T1V = FNMS(KP250000000, T1Q, T1U);
|
||||
T1Y = T16 - T17;
|
||||
T1Z = T13 - T14;
|
||||
T20 = FNMS(KP618033988, T1Z, T1Y);
|
||||
T22 = FMA(KP618033988, T1Y, T1Z);
|
||||
Ip[0] = T1Q + T1U;
|
||||
T21 = FMA(KP559016994, T1W, T1V);
|
||||
Im[WS(rs, 3)] = FMS(KP951056516, T22, T21);
|
||||
Ip[WS(rs, 4)] = FMA(KP951056516, T22, T21);
|
||||
T1X = FNMS(KP559016994, T1W, T1V);
|
||||
Im[WS(rs, 1)] = FMS(KP951056516, T20, T1X);
|
||||
Ip[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced from desc. The hc2cf_10 kernel reads
 * 18 W entries per iteration (W[0]..W[17]).
 * NOTE(review): presumably TW_FULL with 10 requests the complete factor set
 * for size 10 and the TW_NEXT entry ends the list -- confirm against the
 * tw_instr definition, which is not visible here. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to X(khc2c_register). The trailing {48, 18, 54, 0}
 * repeats the 48 additions / 18 multiplications / 54 fused multiply-adds
 * quoted in this variant's generated header comment.
 * NOTE(review): the meaning of the fourth value (0) is not visible here. */
static const hc2c_desc desc = { 10, "hc2cf_10", twinstr, &GENUS, { 48, 18, 54, 0 } };
|
||||
|
||||
/* Registration entry point (FMA variant): hands the hc2cf_10 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register) for planner p. */
void X(codelet_hc2cf_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_10, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cf_10 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 60 FP multiplications,
|
||||
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 45 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_10 -- variant selected when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * For every m in [mb, me) the loop body reads Rp[k], Ip[k], Rm[k], Im[k] at
 * offsets WS(rs, k) for k = 0..4 (20 loads), combines every pair except
 * (Rp[0], Rm[0]) with two consecutive entries of W, and stores 20 results
 * back into the same four arrays (in place).
 *
 * Rp, Ip : advanced by +ms after each iteration
 * Rm, Im : advanced by -ms after each iteration
 * W      : table of factors; W[0]..W[17] are read per iteration, starting at
 *          W + (mb - 1) * 18 and advancing by 18 each iteration
 * rs     : stride handed to WS() when indexing the four arrays
 * mb, me : half-open range of the loop counter m
 * ms     : per-iteration pointer step
 *
 * NOTE(review): FMA / FNMS / DK / WS / MAKE_VOLATILE_STRIDE are macros
 * defined outside this file; presumably FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- confirm in the headers.
 */
static void hc2cf_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; the literal values equal sin(pi/5), sin(2*pi/5), 1/4
 * and sqrt(5)/4 respectively. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
/* W is first rebased to the block for iteration mb, then all five pointers
 * are stepped in the increment clause. */
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
E T7, T1O, TT, T1C, TF, TQ, TR, T1r, T1s, T1L, TX, TY, TZ, T16, T19;
|
||||
E T1y, Ti, Tt, Tu, T1o, T1p, T1M, TU, TV, TW, T1d, T1g, T1x;
|
||||
/* (Rp[0], Rm[0]) are used unscaled; (Ip[2], Im[2]) are combined with
 * (W[8], W[9]). T7/TT and T1O/T1C are the differences and sums. */
{
|
||||
E T1, T1B, T6, T1A;
|
||||
T1 = Rp[0];
|
||||
T1B = Rm[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = Ip[WS(rs, 2)];
|
||||
T5 = Im[WS(rs, 2)];
|
||||
T2 = W[8];
|
||||
T4 = W[9];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T1A = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 - T6;
|
||||
T1O = T1B - T1A;
|
||||
TT = T1 + T6;
|
||||
T1C = T1A + T1B;
|
||||
}
|
||||
/* Loads (Rp[2], Rm[2]) with W[6..7], (Ip[0], Im[0]) with W[0..1],
 * (Ip[4], Im[4]) with W[16..17], (Rp[3], Rm[3]) with W[10..11], then forms
 * their pairwise sums and differences. */
{
|
||||
E Tz, T14, TP, T18, TE, T15, TK, T17;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = Rp[WS(rs, 2)];
|
||||
Ty = Rm[WS(rs, 2)];
|
||||
Tv = W[6];
|
||||
Tx = W[7];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T14 = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TM, TO, TL, TN;
|
||||
TM = Ip[0];
|
||||
TO = Im[0];
|
||||
TL = W[0];
|
||||
TN = W[1];
|
||||
TP = FMA(TL, TM, TN * TO);
|
||||
T18 = FNMS(TN, TM, TL * TO);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = Ip[WS(rs, 4)];
|
||||
TD = Im[WS(rs, 4)];
|
||||
TA = W[16];
|
||||
TC = W[17];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T15 = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
{
|
||||
E TH, TJ, TG, TI;
|
||||
TH = Rp[WS(rs, 3)];
|
||||
TJ = Rm[WS(rs, 3)];
|
||||
TG = W[10];
|
||||
TI = W[11];
|
||||
TK = FMA(TG, TH, TI * TJ);
|
||||
T17 = FNMS(TI, TH, TG * TJ);
|
||||
}
|
||||
TF = Tz - TE;
|
||||
TQ = TK - TP;
|
||||
TR = TF + TQ;
|
||||
T1r = T14 - T15;
|
||||
T1s = T18 - T17;
|
||||
T1L = T1s - T1r;
|
||||
TX = Tz + TE;
|
||||
TY = TK + TP;
|
||||
TZ = TX + TY;
|
||||
T16 = T14 + T15;
|
||||
T19 = T17 + T18;
|
||||
T1y = T16 + T19;
|
||||
}
|
||||
/* Loads (Rp[1], Rm[1]) with W[2..3], (Ip[1], Im[1]) with W[4..5],
 * (Ip[3], Im[3]) with W[12..13], (Rp[4], Rm[4]) with W[14..15], then forms
 * their pairwise sums and differences. */
{
|
||||
E Tc, T1b, Ts, T1f, Th, T1c, Tn, T1e;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = Rp[WS(rs, 1)];
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
T1b = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = Ip[WS(rs, 1)];
|
||||
Tr = Im[WS(rs, 1)];
|
||||
To = W[4];
|
||||
Tq = W[5];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T1f = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = Ip[WS(rs, 3)];
|
||||
Tg = Im[WS(rs, 3)];
|
||||
Td = W[12];
|
||||
Tf = W[13];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
T1c = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = Rp[WS(rs, 4)];
|
||||
Tm = Rm[WS(rs, 4)];
|
||||
Tj = W[14];
|
||||
Tl = W[15];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T1e = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
Ti = Tc - Th;
|
||||
Tt = Tn - Ts;
|
||||
Tu = Ti + Tt;
|
||||
T1o = T1b - T1c;
|
||||
T1p = T1e - T1f;
|
||||
T1M = T1o + T1p;
|
||||
TU = Tc + Th;
|
||||
TV = Tn + Ts;
|
||||
TW = TU + TV;
|
||||
T1d = T1b + T1c;
|
||||
T1g = T1e + T1f;
|
||||
T1x = T1d + T1g;
|
||||
}
|
||||
/* Stores Rm[4], Rm[2], Rp[3], Rm[0], Rp[1]. */
{
|
||||
E T1l, TS, T1m, T1u, T1w, T1q, T1t, T1v, T1n;
|
||||
T1l = KP559016994 * (Tu - TR);
|
||||
TS = Tu + TR;
|
||||
T1m = FNMS(KP250000000, TS, T7);
|
||||
T1q = T1o - T1p;
|
||||
T1t = T1r + T1s;
|
||||
T1u = FMA(KP951056516, T1q, KP587785252 * T1t);
|
||||
T1w = FNMS(KP587785252, T1q, KP951056516 * T1t);
|
||||
Rm[WS(rs, 4)] = T7 + TS;
|
||||
T1v = T1m - T1l;
|
||||
Rm[WS(rs, 2)] = T1v - T1w;
|
||||
Rp[WS(rs, 3)] = T1v + T1w;
|
||||
T1n = T1l + T1m;
|
||||
Rm[0] = T1n - T1u;
|
||||
Rp[WS(rs, 1)] = T1n + T1u;
|
||||
}
|
||||
/* Stores Im[4], Im[2], Ip[3], Im[0], Ip[1]. */
{
|
||||
E T1S, T1N, T1T, T1R, T1V, T1P, T1Q, T1W, T1U;
|
||||
T1S = KP559016994 * (T1M + T1L);
|
||||
T1N = T1L - T1M;
|
||||
T1T = FMA(KP250000000, T1N, T1O);
|
||||
T1P = TQ - TF;
|
||||
T1Q = Ti - Tt;
|
||||
T1R = FNMS(KP951056516, T1Q, KP587785252 * T1P);
|
||||
T1V = FMA(KP587785252, T1Q, KP951056516 * T1P);
|
||||
Im[WS(rs, 4)] = T1N - T1O;
|
||||
T1W = T1T - T1S;
|
||||
Im[WS(rs, 2)] = T1V - T1W;
|
||||
Ip[WS(rs, 3)] = T1V + T1W;
|
||||
T1U = T1S + T1T;
|
||||
Im[0] = T1R - T1U;
|
||||
Ip[WS(rs, 1)] = T1R + T1U;
|
||||
}
|
||||
/* Stores Rp[0], Rp[4], Rm[3], Rp[2], Rm[1]. */
{
|
||||
E T12, T10, T11, T1i, T1k, T1a, T1h, T1j, T13;
|
||||
T12 = KP559016994 * (TW - TZ);
|
||||
T10 = TW + TZ;
|
||||
T11 = FNMS(KP250000000, T10, TT);
|
||||
T1a = T16 - T19;
|
||||
T1h = T1d - T1g;
|
||||
T1i = FNMS(KP587785252, T1h, KP951056516 * T1a);
|
||||
T1k = FMA(KP951056516, T1h, KP587785252 * T1a);
|
||||
Rp[0] = TT + T10;
|
||||
T1j = T12 + T11;
|
||||
Rp[WS(rs, 4)] = T1j - T1k;
|
||||
Rm[WS(rs, 3)] = T1j + T1k;
|
||||
T13 = T11 - T12;
|
||||
Rp[WS(rs, 2)] = T13 - T1i;
|
||||
Rm[WS(rs, 1)] = T13 + T1i;
|
||||
}
|
||||
/* Stores Ip[0], Im[3], Ip[4], Im[1], Ip[2]. */
{
|
||||
E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
|
||||
T1H = KP559016994 * (T1x - T1y);
|
||||
T1z = T1x + T1y;
|
||||
T1G = FNMS(KP250000000, T1z, T1C);
|
||||
T1D = TX - TY;
|
||||
T1E = TU - TV;
|
||||
T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
|
||||
T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
|
||||
Ip[0] = T1z + T1C;
|
||||
T1K = T1H + T1G;
|
||||
Im[WS(rs, 3)] = T1J - T1K;
|
||||
Ip[WS(rs, 4)] = T1J + T1K;
|
||||
T1I = T1G - T1H;
|
||||
Im[WS(rs, 1)] = T1F - T1I;
|
||||
Ip[WS(rs, 2)] = T1F + T1I;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced from desc. The hc2cf_10 kernel reads
 * 18 W entries per iteration (W[0]..W[17]).
 * NOTE(review): presumably TW_FULL with 10 requests the complete factor set
 * for size 10 and the TW_NEXT entry ends the list -- confirm against the
 * tw_instr definition, which is not visible here. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to X(khc2c_register). The trailing {72, 30, 30, 0}
 * repeats the 72 additions / 30 multiplications / 30 fused multiply-adds
 * quoted in this variant's generated header comment.
 * NOTE(review): the meaning of the fourth value (0) is not visible here. */
static const hc2c_desc desc = { 10, "hc2cf_10", twinstr, &GENUS, { 72, 30, 30, 0 } };
|
||||
|
||||
/* Registration entry point (non-FMA variant): hands the hc2cf_10 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register) for planner p. */
void X(codelet_hc2cf_10) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_10, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
581
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_12.c
Normal file
581
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_12.c
Normal file
@@ -0,0 +1,581 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cf_12 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 68 FP multiplications,
|
||||
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_12 -- variant selected when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (see the enclosing #if).
 *
 * For every m in [mb, me) the loop body reads Rp[k], Ip[k], Rm[k], Im[k] at
 * offsets WS(rs, k) for k = 0..5 (24 loads), combines every pair except
 * (Rp[0], Rm[0]) with two consecutive entries of W, and stores 24 results
 * back into the same four arrays (in place).
 *
 * Rp, Ip : advanced by +ms after each iteration
 * Rm, Im : advanced by -ms after each iteration
 * W      : table of factors; W[0]..W[21] are read per iteration, starting at
 *          W + (mb - 1) * 22 and advancing by 22 each iteration
 * rs     : stride handed to WS() when indexing the four arrays
 * mb, me : half-open range of the loop counter m
 * ms     : per-iteration pointer step
 *
 * NOTE(review): FMA / FNMS / DK / WS / MAKE_VOLATILE_STRIDE are macros
 * defined outside this file; presumably FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- confirm in the headers.
 */
static void hc2cf_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; the literal values equal sqrt(3)/2 and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first rebased to the block for iteration mb, then all five pointers
 * are stepped in the increment clause. */
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2s, T1s, T2f, T1d, T21, T1H;
|
||||
E T1Z, Te, T2p, T1l, T2h, TT, T1V, T1A, T1T;
|
||||
/* (Rp[0], Rm[0]) are used unscaled. */
T1 = Rp[0];
|
||||
T2i = Rm[0];
|
||||
/* (Rp[3], Rm[3]) combined with W[10..11]. */
{
|
||||
E Th, Tk, Ti, T2d, Tg, Tj;
|
||||
Th = Rp[WS(rs, 3)];
|
||||
Tk = Rm[WS(rs, 3)];
|
||||
Tg = W[10];
|
||||
Ti = Tg * Th;
|
||||
T2d = Tg * Tk;
|
||||
Tj = W[11];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T2e = FNMS(Tj, Th, T2d);
|
||||
}
|
||||
/* (Ip[4], Im[4]) combined with W[16..17]. */
{
|
||||
E TW, TZ, TX, T1X, TV, TY;
|
||||
TW = Ip[WS(rs, 4)];
|
||||
TZ = Im[WS(rs, 4)];
|
||||
TV = W[16];
|
||||
TX = TV * TW;
|
||||
T1X = TV * TZ;
|
||||
TY = W[17];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T1Y = FNMS(TY, TW, T1X);
|
||||
}
|
||||
/* (Ip[1], Im[1]) combined with W[4..5]. */
{
|
||||
E TC, TF, TD, T1R, TB, TE;
|
||||
TC = Ip[WS(rs, 1)];
|
||||
TF = Im[WS(rs, 1)];
|
||||
TB = W[4];
|
||||
TD = TB * TC;
|
||||
T1R = TB * TF;
|
||||
TE = W[5];
|
||||
TG = FMA(TE, TF, TD);
|
||||
T1S = FNMS(TE, TC, T1R);
|
||||
}
|
||||
/* (Rp[5], Rm[5]) with W[18..19] and (Rp[1], Rm[1]) with W[2..3], then
 * their sums and differences. */
{
|
||||
E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
|
||||
Tn = Rp[WS(rs, 5)];
|
||||
Tq = Rm[WS(rs, 5)];
|
||||
Tm = W[18];
|
||||
To = Tm * Tn;
|
||||
T1o = Tm * Tq;
|
||||
Tt = Rp[WS(rs, 1)];
|
||||
Tw = Rm[WS(rs, 1)];
|
||||
Ts = W[2];
|
||||
Tu = Ts * Tt;
|
||||
T1q = Ts * Tw;
|
||||
{
|
||||
E Tr, T1p, Tx, T1r, Tp, Tv;
|
||||
Tp = W[19];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1p = FNMS(Tp, Tn, T1o);
|
||||
Tv = W[3];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1r = FNMS(Tv, Tt, T1q);
|
||||
Ty = Tr + Tx;
|
||||
T2s = Tx - Tr;
|
||||
T1s = T1p - T1r;
|
||||
T2f = T1p + T1r;
|
||||
}
|
||||
}
|
||||
/* (Ip[0], Im[0]) with W[0..1] and (Ip[2], Im[2]) with W[8..9], then their
 * sums and differences. */
{
|
||||
E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
|
||||
T12 = Ip[0];
|
||||
T15 = Im[0];
|
||||
T11 = W[0];
|
||||
T13 = T11 * T12;
|
||||
T1D = T11 * T15;
|
||||
T18 = Ip[WS(rs, 2)];
|
||||
T1b = Im[WS(rs, 2)];
|
||||
T17 = W[8];
|
||||
T19 = T17 * T18;
|
||||
T1F = T17 * T1b;
|
||||
{
|
||||
E T16, T1E, T1c, T1G, T14, T1a;
|
||||
T14 = W[1];
|
||||
T16 = FMA(T14, T15, T13);
|
||||
T1E = FNMS(T14, T12, T1D);
|
||||
T1a = W[9];
|
||||
T1c = FMA(T1a, T1b, T19);
|
||||
T1G = FNMS(T1a, T18, T1F);
|
||||
T1d = T16 + T1c;
|
||||
T21 = T1c - T16;
|
||||
T1H = T1E - T1G;
|
||||
T1Z = T1E + T1G;
|
||||
}
|
||||
}
|
||||
/* (Rp[2], Rm[2]) with W[6..7] and (Rp[4], Rm[4]) with W[14..15], then
 * their sums and differences. */
{
|
||||
E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
|
||||
T3 = Rp[WS(rs, 2)];
|
||||
T6 = Rm[WS(rs, 2)];
|
||||
T2 = W[6];
|
||||
T4 = T2 * T3;
|
||||
T1h = T2 * T6;
|
||||
T9 = Rp[WS(rs, 4)];
|
||||
Tc = Rm[WS(rs, 4)];
|
||||
T8 = W[14];
|
||||
Ta = T8 * T9;
|
||||
T1j = T8 * Tc;
|
||||
{
|
||||
E T7, T1i, Td, T1k, T5, Tb;
|
||||
T5 = W[7];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1i = FNMS(T5, T3, T1h);
|
||||
Tb = W[15];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
T1k = FNMS(Tb, T9, T1j);
|
||||
Te = T7 + Td;
|
||||
T2p = Td - T7;
|
||||
T1l = T1i - T1k;
|
||||
T2h = T1i + T1k;
|
||||
}
|
||||
}
|
||||
/* (Ip[3], Im[3]) with W[12..13] and (Ip[5], Im[5]) with W[20..21], then
 * their sums and differences. */
{
|
||||
E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
|
||||
TI = Ip[WS(rs, 3)];
|
||||
TL = Im[WS(rs, 3)];
|
||||
TH = W[12];
|
||||
TJ = TH * TI;
|
||||
T1w = TH * TL;
|
||||
TO = Ip[WS(rs, 5)];
|
||||
TR = Im[WS(rs, 5)];
|
||||
TN = W[20];
|
||||
TP = TN * TO;
|
||||
T1y = TN * TR;
|
||||
{
|
||||
E TM, T1x, TS, T1z, TK, TQ;
|
||||
TK = W[13];
|
||||
TM = FMA(TK, TL, TJ);
|
||||
T1x = FNMS(TK, TI, T1w);
|
||||
TQ = W[21];
|
||||
TS = FMA(TQ, TR, TP);
|
||||
T1z = FNMS(TQ, TO, T1y);
|
||||
TT = TM + TS;
|
||||
T1V = TS - TM;
|
||||
T1A = T1x - T1z;
|
||||
T1T = T1x + T1z;
|
||||
}
|
||||
}
|
||||
/* Stores Rm[5], Im[5], Rp[0], Ip[0], Rp[3], Ip[3], Rm[2], Im[2]. */
{
|
||||
E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
|
||||
{
|
||||
E Tf, Tz, T2g, T2j;
|
||||
Tf = T1 + Te;
|
||||
Tz = Tl + Ty;
|
||||
TA = Tf + Tz;
|
||||
T28 = Tf - Tz;
|
||||
T2g = T2e + T2f;
|
||||
T2j = T2h + T2i;
|
||||
T2k = T2g + T2j;
|
||||
T2m = T2j - T2g;
|
||||
}
|
||||
{
|
||||
E TU, T1e, T29, T2a;
|
||||
TU = TG + TT;
|
||||
T1e = T10 + T1d;
|
||||
T1f = TU + T1e;
|
||||
T2l = TU - T1e;
|
||||
T29 = T1S + T1T;
|
||||
T2a = T1Y + T1Z;
|
||||
T2b = T29 - T2a;
|
||||
T2c = T29 + T2a;
|
||||
}
|
||||
Rm[WS(rs, 5)] = TA - T1f;
|
||||
Im[WS(rs, 5)] = T2c - T2k;
|
||||
Rp[0] = TA + T1f;
|
||||
Ip[0] = T2c + T2k;
|
||||
Rp[WS(rs, 3)] = T28 - T2b;
|
||||
Ip[WS(rs, 3)] = T2l + T2m;
|
||||
Rm[WS(rs, 2)] = T28 + T2b;
|
||||
Im[WS(rs, 2)] = T2l - T2m;
|
||||
}
|
||||
/* Remaining 16 stores: indices 2/3, 1/4, 0/5 and 4/1 of the four arrays. */
{
|
||||
E T1m, T1K, T2q, T2z, T2t, T2y, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
|
||||
E T1O;
|
||||
{
|
||||
E T1g, T2o, T2r, T1n;
|
||||
T1g = FNMS(KP500000000, Te, T1);
|
||||
T1m = FNMS(KP866025403, T1l, T1g);
|
||||
T1K = FMA(KP866025403, T1l, T1g);
|
||||
T2o = FNMS(KP500000000, T2h, T2i);
|
||||
T2q = FMA(KP866025403, T2p, T2o);
|
||||
T2z = FNMS(KP866025403, T2p, T2o);
|
||||
T2r = FNMS(KP500000000, T2f, T2e);
|
||||
T2t = FMA(KP866025403, T2s, T2r);
|
||||
T2y = FNMS(KP866025403, T2s, T2r);
|
||||
T1n = FNMS(KP500000000, Ty, Tl);
|
||||
T1t = FNMS(KP866025403, T1s, T1n);
|
||||
T1L = FMA(KP866025403, T1s, T1n);
|
||||
}
|
||||
{
|
||||
E T1v, T1U, T20, T1C;
|
||||
T1v = FNMS(KP500000000, TT, TG);
|
||||
T1B = FNMS(KP866025403, T1A, T1v);
|
||||
T1N = FMA(KP866025403, T1A, T1v);
|
||||
T1U = FNMS(KP500000000, T1T, T1S);
|
||||
T1W = FNMS(KP866025403, T1V, T1U);
|
||||
T25 = FMA(KP866025403, T1V, T1U);
|
||||
T20 = FNMS(KP500000000, T1Z, T1Y);
|
||||
T22 = FNMS(KP866025403, T21, T20);
|
||||
T26 = FMA(KP866025403, T21, T20);
|
||||
T1C = FNMS(KP500000000, T1d, T10);
|
||||
T1I = FNMS(KP866025403, T1H, T1C);
|
||||
T1O = FMA(KP866025403, T1H, T1C);
|
||||
}
|
||||
{
|
||||
E T1u, T1J, T2x, T2A;
|
||||
T1u = T1m + T1t;
|
||||
T1J = T1B + T1I;
|
||||
Rp[WS(rs, 2)] = T1u - T1J;
|
||||
Rm[WS(rs, 3)] = T1u + T1J;
|
||||
T2x = T1W + T22;
|
||||
T2A = T2y + T2z;
|
||||
Im[WS(rs, 3)] = -(T2x + T2A);
|
||||
Ip[WS(rs, 2)] = T2A - T2x;
|
||||
}
|
||||
{
|
||||
E T1M, T1P, T2v, T2w;
|
||||
T1M = T1K + T1L;
|
||||
T1P = T1N + T1O;
|
||||
Rm[WS(rs, 1)] = T1M - T1P;
|
||||
Rp[WS(rs, 4)] = T1M + T1P;
|
||||
T2v = T25 + T26;
|
||||
T2w = T2t + T2q;
|
||||
Im[WS(rs, 1)] = T2v - T2w;
|
||||
Ip[WS(rs, 4)] = T2v + T2w;
|
||||
}
|
||||
{
|
||||
E T1Q, T23, T2B, T2C;
|
||||
T1Q = T1m - T1t;
|
||||
T23 = T1W - T22;
|
||||
Rm[0] = T1Q - T23;
|
||||
Rp[WS(rs, 5)] = T1Q + T23;
|
||||
T2B = T1I - T1B;
|
||||
T2C = T2z - T2y;
|
||||
Im[0] = T2B - T2C;
|
||||
Ip[WS(rs, 5)] = T2B + T2C;
|
||||
}
|
||||
{
|
||||
E T24, T27, T2n, T2u;
|
||||
T24 = T1K - T1L;
|
||||
T27 = T25 - T26;
|
||||
Rm[WS(rs, 4)] = T24 - T27;
|
||||
Rp[WS(rs, 1)] = T24 + T27;
|
||||
T2n = T1O - T1N;
|
||||
T2u = T2q - T2t;
|
||||
Im[WS(rs, 4)] = T2n - T2u;
|
||||
Ip[WS(rs, 1)] = T2n + T2u;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced from desc. The hc2cf_12 kernel reads
 * 22 W entries per iteration (W[0]..W[21]).
 * NOTE(review): presumably TW_FULL with 12 requests the complete factor set
 * for size 12 and the TW_NEXT entry ends the list -- confirm against the
 * tw_instr definition, which is not visible here. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to X(khc2c_register). The trailing {72, 22, 46, 0}
 * repeats the 72 additions / 22 multiplications / 46 fused multiply-adds
 * quoted in this variant's generated header comment.
 * NOTE(review): the meaning of the fourth value (0) is not visible here. */
static const hc2c_desc desc = { 12, "hc2cf_12", twinstr, &GENUS, { 72, 22, 46, 0 } };
|
||||
|
||||
/* Registration entry point (FMA variant): hands the hc2cf_12 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register) for planner p. */
void X(codelet_hc2cf_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_12, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cf_12 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 60 FP multiplications,
|
||||
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_12 -- variant selected when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * For every m in [mb, me) the loop body reads Rp[k], Ip[k], Rm[k], Im[k] at
 * offsets WS(rs, k) for k = 0..5 (24 loads), combines every pair except
 * (Rp[0], Rm[0]) with two consecutive entries of W, and stores 24 results
 * back into the same four arrays (in place).
 *
 * Rp, Ip : advanced by +ms after each iteration
 * Rm, Im : advanced by -ms after each iteration
 * W      : table of factors; W[0]..W[21] are read per iteration, starting at
 *          W + (mb - 1) * 22 and advancing by 22 each iteration
 * rs     : stride handed to WS() when indexing the four arrays
 * mb, me : half-open range of the loop counter m
 * ms     : per-iteration pointer step
 *
 * NOTE(review): FMA / FNMS / DK / WS / MAKE_VOLATILE_STRIDE are macros
 * defined outside this file; presumably FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- confirm in the headers.
 */
static void hc2cf_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; the literal values equal 1/2 and sqrt(3)/2. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* W is first rebased to the block for iteration mb, then all five pointers
 * are stepped in the increment clause. */
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
E T1, T1W, T18, T22, Tc, T15, T1V, T23, TR, T1E, T1o, T1D, T12, T1l, T1F;
|
||||
E T1G, Ti, T1S, T1d, T25, Tt, T1a, T1T, T26, TA, T1y, T1j, T1B, TL, T1g;
|
||||
E T1z, T1A;
|
||||
/* (Rp[0], Rm[0]) unscaled, (Rp[2], Rm[2]) with W[6..7] and (Rp[4], Rm[4])
 * with W[14..15]. */
{
|
||||
E T6, T16, Tb, T17;
|
||||
T1 = Rp[0];
|
||||
T1W = Rm[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 2)];
|
||||
T2 = W[6];
|
||||
T4 = W[7];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T16 = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = Rp[WS(rs, 4)];
|
||||
Ta = Rm[WS(rs, 4)];
|
||||
T7 = W[14];
|
||||
T9 = W[15];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
T17 = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
T18 = KP866025403 * (T16 - T17);
|
||||
T22 = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
T15 = FNMS(KP500000000, Tc, T1);
|
||||
T1V = T16 + T17;
|
||||
T23 = FNMS(KP500000000, T1V, T1W);
|
||||
}
|
||||
/* (Ip[4], Im[4]) with W[16..17], (Ip[2], Im[2]) with W[8..9] and
 * (Ip[0], Im[0]) with W[0..1]. */
{
|
||||
E T11, T1n, TW, T1m;
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = Ip[WS(rs, 4)];
|
||||
TQ = Im[WS(rs, 4)];
|
||||
TN = W[16];
|
||||
TP = W[17];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T1E = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E TY, T10, TX, TZ;
|
||||
TY = Ip[WS(rs, 2)];
|
||||
T10 = Im[WS(rs, 2)];
|
||||
TX = W[8];
|
||||
TZ = W[9];
|
||||
T11 = FMA(TX, TY, TZ * T10);
|
||||
T1n = FNMS(TZ, TY, TX * T10);
|
||||
}
|
||||
{
|
||||
E TT, TV, TS, TU;
|
||||
TT = Ip[0];
|
||||
TV = Im[0];
|
||||
TS = W[0];
|
||||
TU = W[1];
|
||||
TW = FMA(TS, TT, TU * TV);
|
||||
T1m = FNMS(TU, TT, TS * TV);
|
||||
}
|
||||
T1o = KP866025403 * (T1m - T1n);
|
||||
T1D = KP866025403 * (T11 - TW);
|
||||
T12 = TW + T11;
|
||||
T1l = FNMS(KP500000000, T12, TR);
|
||||
T1F = T1m + T1n;
|
||||
T1G = FNMS(KP500000000, T1F, T1E);
|
||||
}
|
||||
/* (Rp[3], Rm[3]) with W[10..11], (Rp[1], Rm[1]) with W[2..3] and
 * (Rp[5], Rm[5]) with W[18..19]. */
{
|
||||
E Ts, T1c, Tn, T1b;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = Rp[WS(rs, 3)];
|
||||
Th = Rm[WS(rs, 3)];
|
||||
Te = W[10];
|
||||
Tg = W[11];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
T1S = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = Rp[WS(rs, 1)];
|
||||
Tr = Rm[WS(rs, 1)];
|
||||
To = W[2];
|
||||
Tq = W[3];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T1c = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = Rp[WS(rs, 5)];
|
||||
Tm = Rm[WS(rs, 5)];
|
||||
Tj = W[18];
|
||||
Tl = W[19];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T1b = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
T1d = KP866025403 * (T1b - T1c);
|
||||
T25 = KP866025403 * (Ts - Tn);
|
||||
Tt = Tn + Ts;
|
||||
T1a = FNMS(KP500000000, Tt, Ti);
|
||||
T1T = T1b + T1c;
|
||||
T26 = FNMS(KP500000000, T1T, T1S);
|
||||
}
|
||||
/* (Ip[1], Im[1]) with W[4..5], (Ip[5], Im[5]) with W[20..21] and
 * (Ip[3], Im[3]) with W[12..13]. */
{
|
||||
E TK, T1i, TF, T1h;
|
||||
{
|
||||
E Tx, Tz, Tw, Ty;
|
||||
Tx = Ip[WS(rs, 1)];
|
||||
Tz = Im[WS(rs, 1)];
|
||||
Tw = W[4];
|
||||
Ty = W[5];
|
||||
TA = FMA(Tw, Tx, Ty * Tz);
|
||||
T1y = FNMS(Ty, Tx, Tw * Tz);
|
||||
}
|
||||
{
|
||||
E TH, TJ, TG, TI;
|
||||
TH = Ip[WS(rs, 5)];
|
||||
TJ = Im[WS(rs, 5)];
|
||||
TG = W[20];
|
||||
TI = W[21];
|
||||
TK = FMA(TG, TH, TI * TJ);
|
||||
T1i = FNMS(TI, TH, TG * TJ);
|
||||
}
|
||||
{
|
||||
E TC, TE, TB, TD;
|
||||
TC = Ip[WS(rs, 3)];
|
||||
TE = Im[WS(rs, 3)];
|
||||
TB = W[12];
|
||||
TD = W[13];
|
||||
TF = FMA(TB, TC, TD * TE);
|
||||
T1h = FNMS(TD, TC, TB * TE);
|
||||
}
|
||||
T1j = KP866025403 * (T1h - T1i);
|
||||
T1B = KP866025403 * (TK - TF);
|
||||
TL = TF + TK;
|
||||
T1g = FNMS(KP500000000, TL, TA);
|
||||
T1z = T1h + T1i;
|
||||
T1A = FNMS(KP500000000, T1z, T1y);
|
||||
}
|
||||
/* Stores Rm[5], Im[5], Rp[0], Ip[0], Rp[3], Ip[3], Rm[2], Im[2]. */
{
|
||||
E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
|
||||
{
|
||||
E Td, Tu, T1U, T1X;
|
||||
Td = T1 + Tc;
|
||||
Tu = Ti + Tt;
|
||||
Tv = Td + Tu;
|
||||
T1N = Td - Tu;
|
||||
T1U = T1S + T1T;
|
||||
T1X = T1V + T1W;
|
||||
T1Y = T1U + T1X;
|
||||
T20 = T1X - T1U;
|
||||
}
|
||||
{
|
||||
E TM, T13, T1O, T1P;
|
||||
TM = TA + TL;
|
||||
T13 = TR + T12;
|
||||
T14 = TM + T13;
|
||||
T1Z = TM - T13;
|
||||
T1O = T1y + T1z;
|
||||
T1P = T1E + T1F;
|
||||
T1Q = T1O - T1P;
|
||||
T1R = T1O + T1P;
|
||||
}
|
||||
Rm[WS(rs, 5)] = Tv - T14;
|
||||
Im[WS(rs, 5)] = T1R - T1Y;
|
||||
Rp[0] = Tv + T14;
|
||||
Ip[0] = T1R + T1Y;
|
||||
Rp[WS(rs, 3)] = T1N - T1Q;
|
||||
Ip[WS(rs, 3)] = T1Z + T20;
|
||||
Rm[WS(rs, 2)] = T1N + T1Q;
|
||||
Im[WS(rs, 2)] = T1Z - T20;
|
||||
}
|
||||
/* Stores Rm[1], Im[1], Rp[4], Ip[4], Rm[4], Im[4], Rp[1], Ip[1]. */
{
|
||||
E T1t, T1J, T28, T2a, T1w, T21, T1M, T29;
|
||||
{
|
||||
E T1r, T1s, T24, T27;
|
||||
T1r = T15 + T18;
|
||||
T1s = T1a + T1d;
|
||||
T1t = T1r + T1s;
|
||||
T1J = T1r - T1s;
|
||||
T24 = T22 + T23;
|
||||
T27 = T25 + T26;
|
||||
T28 = T24 - T27;
|
||||
T2a = T27 + T24;
|
||||
}
|
||||
{
|
||||
E T1u, T1v, T1K, T1L;
|
||||
T1u = T1g + T1j;
|
||||
T1v = T1l + T1o;
|
||||
T1w = T1u + T1v;
|
||||
T21 = T1v - T1u;
|
||||
T1K = T1B + T1A;
|
||||
T1L = T1D + T1G;
|
||||
T1M = T1K - T1L;
|
||||
T29 = T1K + T1L;
|
||||
}
|
||||
Rm[WS(rs, 1)] = T1t - T1w;
|
||||
Im[WS(rs, 1)] = T29 - T2a;
|
||||
Rp[WS(rs, 4)] = T1t + T1w;
|
||||
Ip[WS(rs, 4)] = T29 + T2a;
|
||||
Rm[WS(rs, 4)] = T1J - T1M;
|
||||
Im[WS(rs, 4)] = T21 - T28;
|
||||
Rp[WS(rs, 1)] = T1J + T1M;
|
||||
Ip[WS(rs, 1)] = T21 + T28;
|
||||
}
|
||||
/* Stores Rp[2], Ip[2], Rm[3], Im[3], Rm[0], Im[0], Rp[5], Ip[5]. */
{
|
||||
E T1f, T1x, T2e, T2g, T1q, T2f, T1I, T2b;
|
||||
{
|
||||
E T19, T1e, T2c, T2d;
|
||||
T19 = T15 - T18;
|
||||
T1e = T1a - T1d;
|
||||
T1f = T19 + T1e;
|
||||
T1x = T19 - T1e;
|
||||
T2c = T26 - T25;
|
||||
T2d = T23 - T22;
|
||||
T2e = T2c + T2d;
|
||||
T2g = T2d - T2c;
|
||||
}
|
||||
{
|
||||
E T1k, T1p, T1C, T1H;
|
||||
T1k = T1g - T1j;
|
||||
T1p = T1l - T1o;
|
||||
T1q = T1k + T1p;
|
||||
T2f = T1p - T1k;
|
||||
T1C = T1A - T1B;
|
||||
T1H = T1D - T1G;
|
||||
T1I = T1C + T1H;
|
||||
T2b = T1H - T1C;
|
||||
}
|
||||
Rp[WS(rs, 2)] = T1f - T1q;
|
||||
Ip[WS(rs, 2)] = T2b + T2e;
|
||||
Rm[WS(rs, 3)] = T1f + T1q;
|
||||
Im[WS(rs, 3)] = T2b - T2e;
|
||||
Rm[0] = T1x - T1I;
|
||||
Im[0] = T2f - T2g;
|
||||
Rp[WS(rs, 5)] = T1x + T1I;
|
||||
Ip[WS(rs, 5)] = T2f + T2g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced from desc. The hc2cf_12 kernel reads
 * 22 W entries per iteration (W[0]..W[21]).
 * NOTE(review): presumably TW_FULL with 12 requests the complete factor set
 * for size 12 and the TW_NEXT entry ends the list -- confirm against the
 * tw_instr definition, which is not visible here. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to X(khc2c_register). The trailing {88, 30, 30, 0}
 * repeats the 88 additions / 30 multiplications / 30 fused multiply-adds
 * quoted in this variant's generated header comment.
 * NOTE(review): the meaning of the fourth value (0) is not visible here. */
static const hc2c_desc desc = { 12, "hc2cf_12", twinstr, &GENUS, { 88, 30, 30, 0 } };
|
||||
|
||||
/* Registration entry point (non-FMA variant): hands the hc2cf_12 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register) for planner p. */
void X(codelet_hc2cf_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_12, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
796
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_16.c
Normal file
796
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_16.c
Normal file
@@ -0,0 +1,796 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cf_16 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 100 FP multiplications,
|
||||
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
|
||||
* 60 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_16, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop body
 *   - loads slots 0..7 (addressed as WS(rs, k)) of each of Rp, Rm, Ip, Im
 *     and the 30 values W[0..29];
 *   - combines every loaded pair except (Rp[0], Rm[0]) with one pair of W
 *     entries: Rp/Rm slot k uses W[4k-2], W[4k-1]; Ip/Im slot k uses
 *     W[4k], W[4k+1];
 *   - writes 32 results back to the same slots of Rp, Rm, Ip, Im.
 * All loads are issued before the first store.
 *
 * Between iterations Rp and Ip advance by +ms, Rm and Im by -ms and W by 30;
 * before the first iteration W is offset by (mb - 1) * 30.
 *
 * NOTE(review): FMA, FNMS, FMS, DK, WS, E and MAKE_VOLATILE_STRIDE are
 * defined outside this file.  Judging by the non-FMA twin of this function,
 * FMA(a, b, c) is presumably a*b + c, FNMS(a, b, c) is c - a*b and
 * FMS(a, b, c) is a*b - c -- confirm in the FFTW codelet headers.
 */
static void hc2cf_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Named constants used by the output stages below. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
/* Intermediate sums/differences shared between the input and output stages. */
E T8, T3z, T1I, T3o, T1s, T35, T2p, T2r, T1F, T36, T2k, T2w, Tl, T3A, T1N;
|
||||
E T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W;
|
||||
E T1W, T21;
|
||||
/* Inputs: Rp/Rm slot 0 (used as loaded) and Rp/Rm slot 4 with W[14], W[15]. */
{
|
||||
E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
|
||||
T1 = Rp[0];
|
||||
T3n = Rm[0];
|
||||
T3 = Rp[WS(rs, 4)];
|
||||
T6 = Rm[WS(rs, 4)];
|
||||
T2 = W[14];
|
||||
T4 = T2 * T3;
|
||||
T3l = T2 * T6;
|
||||
T5 = W[15];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T3m = FNMS(T5, T3, T3l);
|
||||
T8 = T1 + T7;
|
||||
T3z = T3n - T3m;
|
||||
T1I = T1 - T7;
|
||||
T3o = T3m + T3n;
|
||||
}
|
||||
/* Inputs: Ip/Im slot 7 with W[28], W[29] and Ip/Im slot 3 with W[12], W[13]. */
{
|
||||
E T1h, T1k, T1i, T2l, T1n, T1q, T1o, T2n, T1g, T1m;
|
||||
T1h = Ip[WS(rs, 7)];
|
||||
T1k = Im[WS(rs, 7)];
|
||||
T1g = W[28];
|
||||
T1i = T1g * T1h;
|
||||
T2l = T1g * T1k;
|
||||
T1n = Ip[WS(rs, 3)];
|
||||
T1q = Im[WS(rs, 3)];
|
||||
T1m = W[12];
|
||||
T1o = T1m * T1n;
|
||||
T2n = T1m * T1q;
|
||||
{
|
||||
E T1l, T2m, T1r, T2o, T1j, T1p;
|
||||
T1j = W[29];
|
||||
T1l = FMA(T1j, T1k, T1i);
|
||||
T2m = FNMS(T1j, T1h, T2l);
|
||||
T1p = W[13];
|
||||
T1r = FMA(T1p, T1q, T1o);
|
||||
T2o = FNMS(T1p, T1n, T2n);
|
||||
T1s = T1l + T1r;
|
||||
T35 = T2m + T2o;
|
||||
T2p = T2m - T2o;
|
||||
T2r = T1l - T1r;
|
||||
}
|
||||
}
|
||||
/* Inputs: Ip/Im slot 1 with W[4], W[5] and Ip/Im slot 5 with W[20], W[21]. */
{
|
||||
E T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z;
|
||||
T1u = Ip[WS(rs, 1)];
|
||||
T1x = Im[WS(rs, 1)];
|
||||
T1t = W[4];
|
||||
T1v = T1t * T1u;
|
||||
T2s = T1t * T1x;
|
||||
T1A = Ip[WS(rs, 5)];
|
||||
T1D = Im[WS(rs, 5)];
|
||||
T1z = W[20];
|
||||
T1B = T1z * T1A;
|
||||
T2u = T1z * T1D;
|
||||
{
|
||||
E T1y, T2t, T1E, T2v, T1w, T1C;
|
||||
T1w = W[5];
|
||||
T1y = FMA(T1w, T1x, T1v);
|
||||
T2t = FNMS(T1w, T1u, T2s);
|
||||
T1C = W[21];
|
||||
T1E = FMA(T1C, T1D, T1B);
|
||||
T2v = FNMS(T1C, T1A, T2u);
|
||||
T1F = T1y + T1E;
|
||||
T36 = T2t + T2v;
|
||||
T2k = T1E - T1y;
|
||||
T2w = T2t - T2v;
|
||||
}
|
||||
}
|
||||
/* Inputs: Rp/Rm slot 2 with W[6], W[7] and Rp/Rm slot 6 with W[22], W[23]. */
{
|
||||
E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
|
||||
Ta = Rp[WS(rs, 2)];
|
||||
Td = Rm[WS(rs, 2)];
|
||||
T9 = W[6];
|
||||
Tb = T9 * Ta;
|
||||
T1J = T9 * Td;
|
||||
Tg = Rp[WS(rs, 6)];
|
||||
Tj = Rm[WS(rs, 6)];
|
||||
Tf = W[22];
|
||||
Th = Tf * Tg;
|
||||
T1L = Tf * Tj;
|
||||
{
|
||||
E Te, T1K, Tk, T1M, Tc, Ti;
|
||||
Tc = W[7];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
T1K = FNMS(Tc, Ta, T1J);
|
||||
Ti = W[23];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
T1M = FNMS(Ti, Tg, T1L);
|
||||
Tl = Te + Tk;
|
||||
T3A = Te - Tk;
|
||||
T1N = T1K - T1M;
|
||||
T3k = T1K + T1M;
|
||||
}
|
||||
}
|
||||
/* Inputs: Rp/Rm slot 1 with W[2], W[3] and Rp/Rm slot 5 with W[18], W[19]. */
{
|
||||
E To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt;
|
||||
To = Rp[WS(rs, 1)];
|
||||
Tr = Rm[WS(rs, 1)];
|
||||
Tn = W[2];
|
||||
Tp = Tn * To;
|
||||
T1P = Tn * Tr;
|
||||
Tu = Rp[WS(rs, 5)];
|
||||
Tx = Rm[WS(rs, 5)];
|
||||
Tt = W[18];
|
||||
Tv = Tt * Tu;
|
||||
T1R = Tt * Tx;
|
||||
{
|
||||
E Ts, T1Q, Ty, T1S, Tq, Tw;
|
||||
Tq = W[3];
|
||||
Ts = FMA(Tq, Tr, Tp);
|
||||
T1Q = FNMS(Tq, To, T1P);
|
||||
Tw = W[19];
|
||||
Ty = FMA(Tw, Tx, Tv);
|
||||
T1S = FNMS(Tw, Tu, T1R);
|
||||
Tz = Ts + Ty;
|
||||
T2V = T1Q + T1S;
|
||||
T1T = T1Q - T1S;
|
||||
T1U = Ts - Ty;
|
||||
}
|
||||
}
|
||||
/* Inputs: Ip/Im slot 0 with W[0], W[1] and Ip/Im slot 4 with W[16], W[17]. */
{
|
||||
E TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV;
|
||||
TQ = Ip[0];
|
||||
TT = Im[0];
|
||||
TP = W[0];
|
||||
TR = TP * TQ;
|
||||
T25 = TP * TT;
|
||||
TW = Ip[WS(rs, 4)];
|
||||
TZ = Im[WS(rs, 4)];
|
||||
TV = W[16];
|
||||
TX = TV * TW;
|
||||
T27 = TV * TZ;
|
||||
{
|
||||
E TU, T26, T10, T28, TS, TY;
|
||||
TS = W[1];
|
||||
TU = FMA(TS, TT, TR);
|
||||
T26 = FNMS(TS, TQ, T25);
|
||||
TY = W[17];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T28 = FNMS(TY, TW, T27);
|
||||
T11 = TU + T10;
|
||||
T30 = T26 + T28;
|
||||
T29 = T26 - T28;
|
||||
T2c = TU - T10;
|
||||
}
|
||||
}
|
||||
/* Inputs: Ip/Im slot 2 with W[8], W[9] and Ip/Im slot 6 with W[24], W[25]. */
{
|
||||
E T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18;
|
||||
T13 = Ip[WS(rs, 2)];
|
||||
T16 = Im[WS(rs, 2)];
|
||||
T12 = W[8];
|
||||
T14 = T12 * T13;
|
||||
T2d = T12 * T16;
|
||||
T19 = Ip[WS(rs, 6)];
|
||||
T1c = Im[WS(rs, 6)];
|
||||
T18 = W[24];
|
||||
T1a = T18 * T19;
|
||||
T2f = T18 * T1c;
|
||||
{
|
||||
E T17, T2e, T1d, T2g, T15, T1b;
|
||||
T15 = W[9];
|
||||
T17 = FMA(T15, T16, T14);
|
||||
T2e = FNMS(T15, T13, T2d);
|
||||
T1b = W[25];
|
||||
T1d = FMA(T1b, T1c, T1a);
|
||||
T2g = FNMS(T1b, T19, T2f);
|
||||
T1e = T17 + T1d;
|
||||
T31 = T2e + T2g;
|
||||
T2a = T17 - T1d;
|
||||
T2h = T2e - T2g;
|
||||
}
|
||||
}
|
||||
/* Inputs: Rp/Rm slot 7 with W[26], W[27] and Rp/Rm slot 3 with W[10], W[11]. */
{
|
||||
E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
|
||||
TB = Rp[WS(rs, 7)];
|
||||
TE = Rm[WS(rs, 7)];
|
||||
TA = W[26];
|
||||
TC = TA * TB;
|
||||
T1X = TA * TE;
|
||||
TH = Rp[WS(rs, 3)];
|
||||
TK = Rm[WS(rs, 3)];
|
||||
TG = W[10];
|
||||
TI = TG * TH;
|
||||
T1Z = TG * TK;
|
||||
{
|
||||
E TF, T1Y, TL, T20, TD, TJ;
|
||||
TD = W[27];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T1Y = FNMS(TD, TB, T1X);
|
||||
TJ = W[11];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T20 = FNMS(TJ, TH, T1Z);
|
||||
TM = TF + TL;
|
||||
T2W = T1Y + T20;
|
||||
T1W = TF - TL;
|
||||
T21 = T1Y - T20;
|
||||
}
|
||||
}
|
||||
/* Outputs (no constant multiplies): Rm/Im slots 7 and 3, Rp/Ip slots 0 and 4. */
{
|
||||
E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
|
||||
{
|
||||
E Tm, TN, T3j, T3p;
|
||||
Tm = T8 + Tl;
|
||||
TN = Tz + TM;
|
||||
TO = Tm + TN;
|
||||
T3e = Tm - TN;
|
||||
T3j = T2V + T2W;
|
||||
T3p = T3k + T3o;
|
||||
T3q = T3j + T3p;
|
||||
T3s = T3p - T3j;
|
||||
}
|
||||
{
|
||||
E T1f, T1G, T3f, T3g;
|
||||
T1f = T11 + T1e;
|
||||
T1G = T1s + T1F;
|
||||
T1H = T1f + T1G;
|
||||
T3r = T1G - T1f;
|
||||
T3f = T30 + T31;
|
||||
T3g = T35 + T36;
|
||||
T3h = T3f - T3g;
|
||||
T3i = T3f + T3g;
|
||||
}
|
||||
Rm[WS(rs, 7)] = TO - T1H;
|
||||
Im[WS(rs, 7)] = T3i - T3q;
|
||||
Rp[0] = TO + T1H;
|
||||
Ip[0] = T3i + T3q;
|
||||
Rm[WS(rs, 3)] = T3e - T3h;
|
||||
Im[WS(rs, 3)] = T3r - T3s;
|
||||
Rp[WS(rs, 4)] = T3e + T3h;
|
||||
Ip[WS(rs, 4)] = T3r + T3s;
|
||||
}
|
||||
/* Outputs scaled by KP707106781: Rm/Im slots 5 and 1, Rp/Ip slots 2 and 6. */
{
|
||||
E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
|
||||
{
|
||||
E T2U, T2X, T3t, T3u;
|
||||
T2U = T8 - Tl;
|
||||
T2X = T2V - T2W;
|
||||
T2Y = T2U + T2X;
|
||||
T3a = T2U - T2X;
|
||||
T3t = TM - Tz;
|
||||
T3u = T3o - T3k;
|
||||
T3v = T3t + T3u;
|
||||
T3x = T3u - T3t;
|
||||
}
|
||||
{
|
||||
E T2Z, T32, T34, T37;
|
||||
T2Z = T11 - T1e;
|
||||
T32 = T30 - T31;
|
||||
T33 = T2Z + T32;
|
||||
T3b = T32 - T2Z;
|
||||
T34 = T1s - T1F;
|
||||
T37 = T35 - T36;
|
||||
T38 = T34 - T37;
|
||||
T3c = T34 + T37;
|
||||
}
|
||||
{
|
||||
E T39, T3w, T3d, T3y;
|
||||
T39 = T33 + T38;
|
||||
Rm[WS(rs, 5)] = FNMS(KP707106781, T39, T2Y);
|
||||
Rp[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
|
||||
T3w = T3b + T3c;
|
||||
Im[WS(rs, 5)] = FMS(KP707106781, T3w, T3v);
|
||||
Ip[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
|
||||
T3d = T3b - T3c;
|
||||
Rm[WS(rs, 1)] = FNMS(KP707106781, T3d, T3a);
|
||||
Rp[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
|
||||
T3y = T38 - T33;
|
||||
Im[WS(rs, 1)] = FMS(KP707106781, T3y, T3x);
|
||||
Ip[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
|
||||
}
|
||||
}
|
||||
/*
 * Remaining 16 outputs, built with KP414213562, KP707106781 and
 * KP923879532: Rm/Im slots 4, 0, 6, 2 and Rp/Ip slots 3, 7, 1, 5.
 */
{
|
||||
E T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y;
|
||||
E T2C;
|
||||
{
|
||||
E T1V, T22, T2b, T2i;
|
||||
T1O = T1I - T1N;
|
||||
T3B = T3z - T3A;
|
||||
T3H = T3A + T3z;
|
||||
T2E = T1I + T1N;
|
||||
T1V = T1T - T1U;
|
||||
T22 = T1W + T21;
|
||||
T23 = T1V - T22;
|
||||
T3C = T1V + T22;
|
||||
{
|
||||
E T2M, T2N, T2F, T2G;
|
||||
T2M = T2r + T2w;
|
||||
T2N = T2p + T2k;
|
||||
T2O = FNMS(KP414213562, T2N, T2M);
|
||||
T2S = FMA(KP414213562, T2M, T2N);
|
||||
T2F = T1U + T1T;
|
||||
T2G = T1W - T21;
|
||||
T2H = T2F + T2G;
|
||||
T3I = T2G - T2F;
|
||||
}
|
||||
T2b = T29 + T2a;
|
||||
T2i = T2c - T2h;
|
||||
T2j = FMA(KP414213562, T2i, T2b);
|
||||
T2B = FNMS(KP414213562, T2b, T2i);
|
||||
{
|
||||
E T2J, T2K, T2q, T2x;
|
||||
T2J = T2c + T2h;
|
||||
T2K = T29 - T2a;
|
||||
T2L = FMA(KP414213562, T2K, T2J);
|
||||
T2R = FNMS(KP414213562, T2J, T2K);
|
||||
T2q = T2k - T2p;
|
||||
T2x = T2r - T2w;
|
||||
T2y = FMA(KP414213562, T2x, T2q);
|
||||
T2C = FNMS(KP414213562, T2q, T2x);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T24, T2z, T3J, T3K;
|
||||
T24 = FMA(KP707106781, T23, T1O);
|
||||
T2z = T2j + T2y;
|
||||
Rm[WS(rs, 4)] = FNMS(KP923879532, T2z, T24);
|
||||
Rp[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
|
||||
T3J = FMA(KP707106781, T3I, T3H);
|
||||
T3K = T2C - T2B;
|
||||
Im[WS(rs, 4)] = FMS(KP923879532, T3K, T3J);
|
||||
Ip[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
|
||||
}
|
||||
{
|
||||
E T2A, T2D, T3L, T3M;
|
||||
T2A = FNMS(KP707106781, T23, T1O);
|
||||
T2D = T2B + T2C;
|
||||
Rp[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
|
||||
Rm[0] = FMA(KP923879532, T2D, T2A);
|
||||
T3L = FNMS(KP707106781, T3I, T3H);
|
||||
T3M = T2y - T2j;
|
||||
Im[0] = FMS(KP923879532, T3M, T3L);
|
||||
Ip[WS(rs, 7)] = FMA(KP923879532, T3M, T3L);
|
||||
}
|
||||
{
|
||||
E T2I, T2P, T3D, T3E;
|
||||
T2I = FMA(KP707106781, T2H, T2E);
|
||||
T2P = T2L + T2O;
|
||||
Rm[WS(rs, 6)] = FNMS(KP923879532, T2P, T2I);
|
||||
Rp[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
|
||||
T3D = FMA(KP707106781, T3C, T3B);
|
||||
T3E = T2R + T2S;
|
||||
Im[WS(rs, 6)] = FMS(KP923879532, T3E, T3D);
|
||||
Ip[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
|
||||
}
|
||||
{
|
||||
E T2Q, T2T, T3F, T3G;
|
||||
T2Q = FNMS(KP707106781, T2H, T2E);
|
||||
T2T = T2R - T2S;
|
||||
Rm[WS(rs, 2)] = FNMS(KP923879532, T2T, T2Q);
|
||||
Rp[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
|
||||
T3F = FNMS(KP707106781, T3C, T3B);
|
||||
T3G = T2O - T2L;
|
||||
Im[WS(rs, 2)] = FMS(KP923879532, T3G, T3F);
|
||||
Ip[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file.
 * Presumably { TW_FULL, 1, 16 } asks for the complete twiddle set of a
 * radix-16 step (the kernel above consumes W[0..29] per iteration) and the
 * TW_NEXT entry terminates the list -- confirm against the twiddle code.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(khc2c_register): 16 and "hc2cf_16" repeat the
 * generator's -n and -name arguments, and { 104, 30, 70, 0 } repeats the
 * operation counts printed in the generated header comment of this variant
 * (104 additions, 30 multiplications, 70 fused multiply/add).
 * GENUS is not defined in this file (presumably it comes from
 * rdft/scalar/hc2cf.h -- TODO confirm).
 */
static const hc2c_desc desc = { 16, "hc2cf_16", twinstr, &GENUS, { 104, 30, 70, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes planner p, the hc2cf_16 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register).
 */
void X(codelet_hc2cf_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_16, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cf_16 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 84 FP multiplications,
|
||||
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
|
||||
* 52 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_16, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) the loop body
 *   - loads slots 0..7 (addressed as WS(rs, k)) of each of Rp, Rm, Ip, Im
 *     and the 30 values W[0..29];
 *   - combines every loaded pair except (Rp[0], Rm[0]) with one pair of W
 *     entries: Rp/Rm slot k uses W[4k-2], W[4k-1]; Ip/Im slot k uses
 *     W[4k], W[4k+1];
 *   - writes 32 results back to the same slots of Rp, Rm, Ip, Im.
 * All loads are issued before the first store.
 *
 * Between iterations Rp and Ip advance by +ms, Rm and Im by -ms and W by 30;
 * before the first iteration W is offset by (mb - 1) * 30.
 *
 * NOTE(review): FMA, FNMS, DK, WS, E and MAKE_VOLATILE_STRIDE are defined
 * outside this file.  FMA(a, b, c) is presumably a*b + c and FNMS(a, b, c)
 * presumably c - a*b -- confirm in the FFTW codelet headers.
 */
static void hc2cf_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Named constants used by the output stages below. */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
/* Intermediate sums/differences shared between the input and output stages. */
E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
|
||||
E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
|
||||
E T2y, T2z, T1O, T2g, T1T, T2h;
|
||||
/* Inputs: Rp/Rm slot 0 (used as loaded) and Rp/Rm slot 4 with W[14], W[15]. */
{
|
||||
E T1, T2T, T6, T2S;
|
||||
T1 = Rp[0];
|
||||
T2T = Rm[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = Rp[WS(rs, 4)];
|
||||
T5 = Rm[WS(rs, 4)];
|
||||
T2 = W[14];
|
||||
T4 = W[15];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T2S = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 + T6;
|
||||
T37 = T2T - T2S;
|
||||
T1t = T1 - T6;
|
||||
T2U = T2S + T2T;
|
||||
}
|
||||
/* Inputs: Rp/Rm slot 2 with W[6], W[7] and Rp/Rm slot 6 with W[22], W[23]. */
{
|
||||
E Tc, T1u, Th, T1v;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = Rp[WS(rs, 2)];
|
||||
Tb = Rm[WS(rs, 2)];
|
||||
T8 = W[6];
|
||||
Ta = W[7];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
T1u = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = Rp[WS(rs, 6)];
|
||||
Tg = Rm[WS(rs, 6)];
|
||||
Td = W[22];
|
||||
Tf = W[23];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
T1v = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
Ti = Tc + Th;
|
||||
T38 = Tc - Th;
|
||||
T1w = T1u - T1v;
|
||||
T2R = T1u + T1v;
|
||||
}
|
||||
/* Inputs: Rp/Rm slot 1 with W[2], W[3] and Rp/Rm slot 5 with W[18], W[19]. */
{
|
||||
E To, T1y, Tt, T1z, T1A, T1B;
|
||||
{
|
||||
E Tl, Tn, Tk, Tm;
|
||||
Tl = Rp[WS(rs, 1)];
|
||||
Tn = Rm[WS(rs, 1)];
|
||||
Tk = W[2];
|
||||
Tm = W[3];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
T1y = FNMS(Tm, Tl, Tk * Tn);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tp, Tr;
|
||||
Tq = Rp[WS(rs, 5)];
|
||||
Ts = Rm[WS(rs, 5)];
|
||||
Tp = W[18];
|
||||
Tr = W[19];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
T1z = FNMS(Tr, Tq, Tp * Ts);
|
||||
}
|
||||
Tu = To + Tt;
|
||||
T2s = T1y + T1z;
|
||||
T1A = T1y - T1z;
|
||||
T1B = To - Tt;
|
||||
T1C = T1A - T1B;
|
||||
T2c = T1B + T1A;
|
||||
}
|
||||
/* Inputs: Rp/Rm slot 7 with W[26], W[27] and Rp/Rm slot 3 with W[10], W[11]. */
{
|
||||
E Tz, T1E, TE, T1F, T1D, T1G;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = Rp[WS(rs, 7)];
|
||||
Ty = Rm[WS(rs, 7)];
|
||||
Tv = W[26];
|
||||
Tx = W[27];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T1E = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = Rp[WS(rs, 3)];
|
||||
TD = Rm[WS(rs, 3)];
|
||||
TA = W[10];
|
||||
TC = W[11];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T1F = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
TF = Tz + TE;
|
||||
T2t = T1E + T1F;
|
||||
T1D = Tz - TE;
|
||||
T1G = T1E - T1F;
|
||||
T1H = T1D + T1G;
|
||||
T2d = T1D - T1G;
|
||||
}
|
||||
/*
 * Inputs: Ip/Im slots 7, 5, 3, 1 with W[28..29], W[20..21], W[12..13],
 * W[4..5] respectively.
 */
{
|
||||
E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
|
||||
{
|
||||
E T16, T18, T15, T17;
|
||||
T16 = Ip[WS(rs, 7)];
|
||||
T18 = Im[WS(rs, 7)];
|
||||
T15 = W[28];
|
||||
T17 = W[29];
|
||||
T19 = FMA(T15, T16, T17 * T18);
|
||||
T20 = FNMS(T17, T16, T15 * T18);
|
||||
}
|
||||
{
|
||||
E T1m, T1o, T1l, T1n;
|
||||
T1m = Ip[WS(rs, 5)];
|
||||
T1o = Im[WS(rs, 5)];
|
||||
T1l = W[20];
|
||||
T1n = W[21];
|
||||
T1p = FMA(T1l, T1m, T1n * T1o);
|
||||
T1X = FNMS(T1n, T1m, T1l * T1o);
|
||||
}
|
||||
{
|
||||
E T1b, T1d, T1a, T1c;
|
||||
T1b = Ip[WS(rs, 3)];
|
||||
T1d = Im[WS(rs, 3)];
|
||||
T1a = W[12];
|
||||
T1c = W[13];
|
||||
T1e = FMA(T1a, T1b, T1c * T1d);
|
||||
T21 = FNMS(T1c, T1b, T1a * T1d);
|
||||
}
|
||||
{
|
||||
E T1h, T1j, T1g, T1i;
|
||||
T1h = Ip[WS(rs, 1)];
|
||||
T1j = Im[WS(rs, 1)];
|
||||
T1g = W[4];
|
||||
T1i = W[5];
|
||||
T1k = FMA(T1g, T1h, T1i * T1j);
|
||||
T1W = FNMS(T1i, T1h, T1g * T1j);
|
||||
}
|
||||
T1f = T19 + T1e;
|
||||
T1q = T1k + T1p;
|
||||
T2B = T1f - T1q;
|
||||
T2C = T20 + T21;
|
||||
T2D = T1W + T1X;
|
||||
T2E = T2C - T2D;
|
||||
{
|
||||
E T1V, T1Y, T22, T23;
|
||||
T1V = T19 - T1e;
|
||||
T1Y = T1W - T1X;
|
||||
T1Z = T1V - T1Y;
|
||||
T2j = T1V + T1Y;
|
||||
T22 = T20 - T21;
|
||||
T23 = T1k - T1p;
|
||||
T24 = T22 + T23;
|
||||
T2k = T22 - T23;
|
||||
}
|
||||
}
|
||||
/*
 * Inputs: Ip/Im slots 0, 6, 4, 2 with W[0..1], W[24..25], W[16..17],
 * W[8..9] respectively.
 */
{
|
||||
E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
|
||||
{
|
||||
E TJ, TL, TI, TK;
|
||||
TJ = Ip[0];
|
||||
TL = Im[0];
|
||||
TI = W[0];
|
||||
TK = W[1];
|
||||
TM = FMA(TI, TJ, TK * TL);
|
||||
T1K = FNMS(TK, TJ, TI * TL);
|
||||
}
|
||||
{
|
||||
E TZ, T11, TY, T10;
|
||||
TZ = Ip[WS(rs, 6)];
|
||||
T11 = Im[WS(rs, 6)];
|
||||
TY = W[24];
|
||||
T10 = W[25];
|
||||
T12 = FMA(TY, TZ, T10 * T11);
|
||||
T1R = FNMS(T10, TZ, TY * T11);
|
||||
}
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = Ip[WS(rs, 4)];
|
||||
TQ = Im[WS(rs, 4)];
|
||||
TN = W[16];
|
||||
TP = W[17];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T1L = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E TU, TW, TT, TV;
|
||||
TU = Ip[WS(rs, 2)];
|
||||
TW = Im[WS(rs, 2)];
|
||||
TT = W[8];
|
||||
TV = W[9];
|
||||
TX = FMA(TT, TU, TV * TW);
|
||||
T1Q = FNMS(TV, TU, TT * TW);
|
||||
}
|
||||
TS = TM + TR;
|
||||
T13 = TX + T12;
|
||||
T2w = TS - T13;
|
||||
T2x = T1K + T1L;
|
||||
T2y = T1Q + T1R;
|
||||
T2z = T2x - T2y;
|
||||
{
|
||||
E T1M, T1N, T1P, T1S;
|
||||
T1M = T1K - T1L;
|
||||
T1N = TX - T12;
|
||||
T1O = T1M + T1N;
|
||||
T2g = T1M - T1N;
|
||||
T1P = TM - TR;
|
||||
T1S = T1Q - T1R;
|
||||
T1T = T1P - T1S;
|
||||
T2h = T1P + T1S;
|
||||
}
|
||||
}
|
||||
/* Outputs: Rm/Im slots 4 and 0, Rp/Ip slots 3 and 7. */
{
|
||||
E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
|
||||
{
|
||||
E T1x, T1I, T3e, T3f;
|
||||
T1x = T1t - T1w;
|
||||
T1I = KP707106781 * (T1C - T1H);
|
||||
T1J = T1x + T1I;
|
||||
T27 = T1x - T1I;
|
||||
T3e = KP707106781 * (T2d - T2c);
|
||||
T3f = T38 + T37;
|
||||
T3g = T3e + T3f;
|
||||
T3i = T3f - T3e;
|
||||
}
|
||||
{
|
||||
E T1U, T25, T28, T29;
|
||||
T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
|
||||
T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
|
||||
T26 = T1U + T25;
|
||||
T3h = T25 - T1U;
|
||||
T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
|
||||
T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
|
||||
T2a = T28 - T29;
|
||||
T3d = T28 + T29;
|
||||
}
|
||||
Rm[WS(rs, 4)] = T1J - T26;
|
||||
Im[WS(rs, 4)] = T3d - T3g;
|
||||
Rp[WS(rs, 3)] = T1J + T26;
|
||||
Ip[WS(rs, 3)] = T3d + T3g;
|
||||
Rm[0] = T27 - T2a;
|
||||
Im[0] = T3h - T3i;
|
||||
Rp[WS(rs, 7)] = T27 + T2a;
|
||||
Ip[WS(rs, 7)] = T3h + T3i;
|
||||
}
|
||||
/* Outputs: Rm/Im slots 5 and 1, Rp/Ip slots 2 and 6. */
{
|
||||
E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
|
||||
{
|
||||
E T2r, T2u, T30, T31;
|
||||
T2r = T7 - Ti;
|
||||
T2u = T2s - T2t;
|
||||
T2v = T2r + T2u;
|
||||
T2H = T2r - T2u;
|
||||
T30 = TF - Tu;
|
||||
T31 = T2U - T2R;
|
||||
T32 = T30 + T31;
|
||||
T34 = T31 - T30;
|
||||
}
|
||||
{
|
||||
E T2A, T2F, T2I, T2J;
|
||||
T2A = T2w + T2z;
|
||||
T2F = T2B - T2E;
|
||||
T2G = KP707106781 * (T2A + T2F);
|
||||
T33 = KP707106781 * (T2F - T2A);
|
||||
T2I = T2z - T2w;
|
||||
T2J = T2B + T2E;
|
||||
T2K = KP707106781 * (T2I - T2J);
|
||||
T2Z = KP707106781 * (T2I + T2J);
|
||||
}
|
||||
Rm[WS(rs, 5)] = T2v - T2G;
|
||||
Im[WS(rs, 5)] = T2Z - T32;
|
||||
Rp[WS(rs, 2)] = T2v + T2G;
|
||||
Ip[WS(rs, 2)] = T2Z + T32;
|
||||
Rm[WS(rs, 1)] = T2H - T2K;
|
||||
Im[WS(rs, 1)] = T33 - T34;
|
||||
Rp[WS(rs, 6)] = T2H + T2K;
|
||||
Ip[WS(rs, 6)] = T33 + T34;
|
||||
}
|
||||
/* Outputs: Rm/Im slots 6 and 2, Rp/Ip slots 1 and 5. */
{
|
||||
E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
|
||||
{
|
||||
E T2b, T2e, T36, T39;
|
||||
T2b = T1t + T1w;
|
||||
T2e = KP707106781 * (T2c + T2d);
|
||||
T2f = T2b + T2e;
|
||||
T2n = T2b - T2e;
|
||||
T36 = KP707106781 * (T1C + T1H);
|
||||
T39 = T37 - T38;
|
||||
T3a = T36 + T39;
|
||||
T3c = T39 - T36;
|
||||
}
|
||||
{
|
||||
E T2i, T2l, T2o, T2p;
|
||||
T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
|
||||
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
|
||||
T2m = T2i + T2l;
|
||||
T3b = T2l - T2i;
|
||||
T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
|
||||
T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
|
||||
T2q = T2o - T2p;
|
||||
T35 = T2o + T2p;
|
||||
}
|
||||
Rm[WS(rs, 6)] = T2f - T2m;
|
||||
Im[WS(rs, 6)] = T35 - T3a;
|
||||
Rp[WS(rs, 1)] = T2f + T2m;
|
||||
Ip[WS(rs, 1)] = T35 + T3a;
|
||||
Rm[WS(rs, 2)] = T2n - T2q;
|
||||
Im[WS(rs, 2)] = T3b - T3c;
|
||||
Rp[WS(rs, 5)] = T2n + T2q;
|
||||
Ip[WS(rs, 5)] = T3b + T3c;
|
||||
}
|
||||
/* Outputs (no constant multiplies): Rm/Im slots 7 and 3, Rp/Ip slots 0 and 4. */
{
|
||||
E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
|
||||
{
|
||||
E Tj, TG, T2Q, T2V;
|
||||
Tj = T7 + Ti;
|
||||
TG = Tu + TF;
|
||||
TH = Tj + TG;
|
||||
T2L = Tj - TG;
|
||||
T2Q = T2s + T2t;
|
||||
T2V = T2R + T2U;
|
||||
T2W = T2Q + T2V;
|
||||
T2Y = T2V - T2Q;
|
||||
}
|
||||
{
|
||||
E T14, T1r, T2M, T2N;
|
||||
T14 = TS + T13;
|
||||
T1r = T1f + T1q;
|
||||
T1s = T14 + T1r;
|
||||
T2X = T1r - T14;
|
||||
T2M = T2x + T2y;
|
||||
T2N = T2C + T2D;
|
||||
T2O = T2M - T2N;
|
||||
T2P = T2M + T2N;
|
||||
}
|
||||
Rm[WS(rs, 7)] = TH - T1s;
|
||||
Im[WS(rs, 7)] = T2P - T2W;
|
||||
Rp[0] = TH + T1s;
|
||||
Ip[0] = T2P + T2W;
|
||||
Rm[WS(rs, 3)] = T2L - T2O;
|
||||
Im[WS(rs, 3)] = T2X - T2Y;
|
||||
Rp[WS(rs, 4)] = T2L + T2O;
|
||||
Ip[WS(rs, 4)] = T2X + T2Y;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file.
 * Presumably { TW_FULL, 1, 16 } asks for the complete twiddle set of a
 * radix-16 step (the kernel above consumes W[0..29] per iteration) and the
 * TW_NEXT entry terminates the list -- confirm against the twiddle code.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(khc2c_register): 16 and "hc2cf_16" repeat the
 * generator's -n and -name arguments, and { 136, 46, 38, 0 } repeats the
 * operation counts printed in the generated header comment of this variant
 * (136 additions, 46 multiplications, 38 fused multiply/add).
 * GENUS is not defined in this file (presumably it comes from
 * rdft/scalar/hc2cf.h -- TODO confirm).
 */
static const hc2c_desc desc = { 16, "hc2cf_16", twinstr, &GENUS, { 136, 46, 38, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes planner p, the hc2cf_16 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register).
 */
void X(codelet_hc2cf_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_16, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
117
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_2.c
Normal file
117
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_2.c
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cf_2 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 11 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_2, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me): reads Rp[0], Rm[0], Ip[0], Im[0] and the pair
 * W[0], W[1], combines (Ip[0], Im[0]) with the W pair, and overwrites the
 * four inputs with sums/differences.  All loads precede the first store.
 * Rp/Ip advance by +ms, Rm/Im by -ms and W by 2 per iteration; W is first
 * offset by (mb - 1) * 2.
 *
 * NOTE(review): FMA/FNMS are defined outside this file; presumably
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void hc2cf_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
	  /* untouched pair */
	  E keep_p = Rp[0];
	  E keep_m = Rm[0];

	  /* pair that gets combined with W[0], W[1] */
	  E in_p = Ip[0];
	  E in_m = Im[0];
	  E w_lo = W[0];
	  E prod_p = w_lo * in_p;
	  E prod_m = w_lo * in_m;
	  E w_hi = W[1];
	  E mix_p = FMA(w_hi, in_m, prod_p);
	  E mix_m = FNMS(w_hi, in_p, prod_m);

	  /* stores, in the generator's original order */
	  Rm[0] = keep_p - mix_p;
	  Im[0] = mix_m - keep_m;
	  Rp[0] = keep_p + mix_p;
	  Ip[0] = mix_m + keep_m;
     }
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file.
 * Presumably { TW_FULL, 1, 2 } asks for the complete twiddle set of a
 * radix-2 step (the kernel above consumes W[0..1] per iteration) and the
 * TW_NEXT entry terminates the list -- confirm against the twiddle code.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(khc2c_register): 2 and "hc2cf_2" repeat the
 * generator's -n and -name arguments, and { 4, 2, 2, 0 } repeats the
 * operation counts printed in the generated header comment of this variant
 * (4 additions, 2 multiplications, 2 fused multiply/add).
 * GENUS is not defined in this file (presumably it comes from
 * rdft/scalar/hc2cf.h -- TODO confirm).
 */
static const hc2c_desc desc = { 2, "hc2cf_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes planner p, the hc2cf_2 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register).
 */
void X(codelet_hc2cf_2) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_2, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cf_2 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 9 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_2, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me): reads Rp[0], Rm[0], Ip[0], Im[0] and the pair
 * W[0], W[1], combines (Ip[0], Im[0]) with the W pair, and overwrites the
 * four inputs with sums/differences.  All loads precede the first store.
 * Rp/Ip advance by +ms, Rm/Im by -ms and W by 2 per iteration; W is first
 * offset by (mb - 1) * 2.
 *
 * NOTE(review): FMA/FNMS are defined outside this file; presumably
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void hc2cf_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
	  /* untouched pair */
	  E keep_p = Rp[0];
	  E keep_m = Rm[0];

	  /* pair that gets combined with W[0], W[1] */
	  E in_p = Ip[0];
	  E in_m = Im[0];
	  E w_lo = W[0];
	  E w_hi = W[1];
	  E mix_p = FMA(w_lo, in_p, w_hi * in_m);
	  E mix_m = FNMS(w_hi, in_p, w_lo * in_m);

	  /* stores, in the generator's original order */
	  Rm[0] = keep_p - mix_p;
	  Im[0] = mix_m - keep_m;
	  Rp[0] = keep_p + mix_p;
	  Ip[0] = mix_m + keep_m;
     }
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file.
 * Presumably { TW_FULL, 1, 2 } asks for the complete twiddle set of a
 * radix-2 step (the kernel above consumes W[0..1] per iteration) and the
 * TW_NEXT entry terminates the list -- confirm against the twiddle code.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(khc2c_register): 2 and "hc2cf_2" repeat the
 * generator's -n and -name arguments, and { 4, 2, 2, 0 } repeats the
 * operation counts printed in the generated header comment of this variant
 * (4 additions, 2 multiplications, 2 fused multiply/add).
 * GENUS is not defined in this file (presumably it comes from
 * rdft/scalar/hc2cf.h -- TODO confirm).
 */
static const hc2c_desc desc = { 2, "hc2cf_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes planner p, the hc2cf_2 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register).
 */
void X(codelet_hc2cf_2) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_2, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
1050
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_20.c
Normal file
1050
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1809
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_32.c
Normal file
1809
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_32.c
Normal file
File diff suppressed because it is too large
Load Diff
196
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_4.c
Normal file
196
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_4.c
Normal file
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cf_4 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_4, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me): reads slots 0 and WS(rs, 1) of Rp, Rm, Ip and Im
 * plus W[0..5]; every pair except (Rp[0], Rm[0]) is combined with one pair
 * of W entries; the eight results overwrite the eight inputs.  All loads
 * precede the first store.  Rp/Ip advance by +ms, Rm/Im by -ms and W by 6
 * per iteration; W is first offset by (mb - 1) * 6.
 *
 * NOTE(review): FMA/FNMS are defined outside this file; presumably
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void hc2cf_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
	  /* Rp/Rm slot 0: used exactly as loaded */
	  E p0 = Rp[0];
	  E q0 = Rm[0];

	  /* Rp/Rm slot 1 combined with W[2], W[3] */
	  E p1 = Rp[WS(rs, 1)];
	  E q1 = Rm[WS(rs, 1)];
	  E wa = W[2];
	  E p1_wa = wa * p1;
	  E q1_wa = wa * q1;
	  E wb = W[3];
	  E u1 = FMA(wb, q1, p1_wa);
	  E v1 = FNMS(wb, p1, q1_wa);

	  /* Ip/Im slot 0 combined with W[0], W[1] */
	  E i0 = Ip[0];
	  E j0 = Im[0];
	  E wc = W[0];
	  E i0_wc = wc * i0;
	  E j0_wc = wc * j0;
	  E wd = W[1];
	  E u2 = FMA(wd, j0, i0_wc);
	  E v2 = FNMS(wd, i0, j0_wc);

	  /* Ip/Im slot 1 combined with W[4], W[5] */
	  E i1 = Ip[WS(rs, 1)];
	  E j1 = Im[WS(rs, 1)];
	  E we = W[4];
	  E i1_we = we * i1;
	  E j1_we = we * j1;
	  E wf = W[5];
	  E u3 = FMA(wf, j1, i1_we);
	  E v3 = FNMS(wf, i1, j1_we);

	  /* first output group: Rm[1], Rp[0], Im[1], Ip[0] */
	  {
	       E sum_u = p0 + u1;
	       E pair_u = u2 + u3;
	       E pair_v = v2 + v3;
	       E sum_v = v1 + q0;

	       Rm[WS(rs, 1)] = sum_u - pair_u;
	       Rp[0] = sum_u + pair_u;
	       Im[WS(rs, 1)] = pair_v - sum_v;
	       Ip[0] = pair_v + sum_v;
	  }

	  /* second output group: Rm[0], Rp[1], Im[0], Ip[1] */
	  {
	       E dif_u = p0 - u1;
	       E gap_v = v2 - v3;
	       E gap_u = u3 - u2;
	       E dif_v = q0 - v1;

	       Rm[0] = dif_u - gap_v;
	       Rp[WS(rs, 1)] = dif_u + gap_v;
	       Im[0] = gap_u - dif_v;
	       Ip[WS(rs, 1)] = gap_u + dif_v;
	  }
     }
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file.
 * Presumably { TW_FULL, 1, 4 } asks for the complete twiddle set of a
 * radix-4 step (the kernel above consumes W[0..5] per iteration) and the
 * TW_NEXT entry terminates the list -- confirm against the twiddle code.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(khc2c_register): 4 and "hc2cf_4" repeat the
 * generator's -n and -name arguments, and { 16, 6, 6, 0 } repeats the
 * operation counts printed in the generated header comment of this variant
 * (16 additions, 6 multiplications, 6 fused multiply/add).
 * GENUS is not defined in this file (presumably it comes from
 * rdft/scalar/hc2cf.h -- TODO confirm).
 */
static const hc2c_desc desc = { 4, "hc2cf_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes planner p, the hc2cf_4 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register).
 */
void X(codelet_hc2cf_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_4, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cf_4 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_4, variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me): reads slots 0 and WS(rs, 1) of Rp, Rm, Ip and Im
 * plus W[0..5]; every pair except (Rp[0], Rm[0]) is combined with one pair
 * of W entries; the eight results overwrite the eight inputs.  All loads
 * precede the first store.  Rp/Ip advance by +ms, Rm/Im by -ms and W by 6
 * per iteration; W is first offset by (mb - 1) * 6.
 *
 * NOTE(review): FMA/FNMS are defined outside this file; presumably
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm.
 */
static void hc2cf_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
	  /* Rp/Rm slot 0: used exactly as loaded */
	  E p0 = Rp[0];
	  E q0 = Rm[0];

	  /* Rp/Rm slot 1 combined with W[2], W[3] */
	  E p1 = Rp[WS(rs, 1)];
	  E q1 = Rm[WS(rs, 1)];
	  E wa = W[2];
	  E wb = W[3];
	  E u1 = FMA(wa, p1, wb * q1);
	  E v1 = FNMS(wb, p1, wa * q1);

	  /* Ip/Im slot 0 combined with W[0], W[1] */
	  E i0 = Ip[0];
	  E j0 = Im[0];
	  E wc = W[0];
	  E wd = W[1];
	  E u2 = FMA(wc, i0, wd * j0);
	  E v2 = FNMS(wd, i0, wc * j0);

	  /* Ip/Im slot 1 combined with W[4], W[5] */
	  E i1 = Ip[WS(rs, 1)];
	  E j1 = Im[WS(rs, 1)];
	  E we = W[4];
	  E wf = W[5];
	  E u3 = FMA(we, i1, wf * j1);
	  E v3 = FNMS(wf, i1, we * j1);

	  /* first output group: Rm[1], Rp[0], Im[1], Ip[0] */
	  {
	       E sum_u = p0 + u1;
	       E pair_u = u2 + u3;
	       E pair_v = v2 + v3;
	       E sum_v = v1 + q0;

	       Rm[WS(rs, 1)] = sum_u - pair_u;
	       Rp[0] = sum_u + pair_u;
	       Im[WS(rs, 1)] = pair_v - sum_v;
	       Ip[0] = pair_v + sum_v;
	  }

	  /* second output group: Rm[0], Rp[1], Im[0], Ip[1] */
	  {
	       E dif_u = p0 - u1;
	       E gap_v = v2 - v3;
	       E gap_u = u3 - u2;
	       E dif_v = q0 - v1;

	       Rm[0] = dif_u - gap_v;
	       Rp[WS(rs, 1)] = dif_u + gap_v;
	       Im[0] = gap_u - dif_v;
	       Ip[WS(rs, 1)] = gap_u + dif_v;
	  }
     }
}
|
||||
|
||||
/*
 * Twiddle instruction list referenced by `desc` below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file.
 * Presumably { TW_FULL, 1, 4 } asks for the complete twiddle set of a
 * radix-4 step (the kernel above consumes W[0..5] per iteration) and the
 * TW_NEXT entry terminates the list -- confirm against the twiddle code.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(khc2c_register): 4 and "hc2cf_4" repeat the
 * generator's -n and -name arguments, and { 16, 6, 6, 0 } repeats the
 * operation counts printed in the generated header comment of this variant
 * (16 additions, 6 multiplications, 6 fused multiply/add).
 * GENUS is not defined in this file (presumably it comes from
 * rdft/scalar/hc2cf.h -- TODO confirm).
 */
static const hc2c_desc desc = { 4, "hc2cf_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes planner p, the hc2cf_4 kernel, its
 * descriptor and the HC2C_VIA_RDFT flag to X(khc2c_register).
 */
void X(codelet_hc2cf_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_4, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
295
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_6.c
Normal file
295
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_6.c
Normal file
@@ -0,0 +1,295 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cf_6 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 32 FP multiplications,
|
||||
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
|
||||
* 31 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_6, variant compiled when the FMA-preference macros are defined.
 * Machine-generated size-6 kernel (see the "Generated by" line above for
 * the generator flags).
 *
 * Each loop pass reads six value pairs in place from Rp/Ip/Rm/Im at offsets
 * WS(rs, 0..2), combines five of them with consecutive (W[2k], W[2k+1])
 * coefficient pairs, and stores twelve results back into the same four
 * arrays.  The pair (Rp[0], Rm[0]) is used without a W multiply.
 *
 * Rp, Ip  - stepped forward by ms after every pass
 * Rm, Im  - stepped backward by ms after every pass
 * W       - coefficient table; 10 entries are consumed per pass and the
 *           pointer is pre-offset by (mb - 1) * 10 before the first pass
 * rs      - stride handed to WS() for the offsets inside one pass
 * mb, me  - half-open pass range [mb, me)
 * ms      - per-pass step of the four data pointers
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS, FMS and MAKE_VOLATILE_STRIDE come
 * from the included headers and are not visible here.
 */
static void hc2cf_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T1, TX, T7, TW, Tl, TS, TB, TJ, Ty, TR, TC, TO;
|
||||
/* First pair is taken as-is: no W coefficient is applied to it. */
T1 = Rp[0];
|
||||
TX = Rm[0];
|
||||
{
|
||||
/* (Ip[1], Im[1]) combined with (W[4], W[5]) -> T7, TW */
E T3, T6, T4, TV, T2, T5;
|
||||
T3 = Ip[WS(rs, 1)];
|
||||
T6 = Im[WS(rs, 1)];
|
||||
T2 = W[4];
|
||||
T4 = T2 * T3;
|
||||
TV = T2 * T6;
|
||||
T5 = W[5];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TW = FNMS(T5, T3, TV);
|
||||
}
|
||||
{
|
||||
/* (Rp[1], Rm[1]) with (W[2], W[3]) and (Ip[2], Im[2]) with (W[8], W[9]);
   only their sums and differences (Tl, TS, TB, TJ) leave this scope. */
E Ta, Td, Tb, TF, Tg, Tj, Th, TH, T9, Tf;
|
||||
Ta = Rp[WS(rs, 1)];
|
||||
Td = Rm[WS(rs, 1)];
|
||||
T9 = W[2];
|
||||
Tb = T9 * Ta;
|
||||
TF = T9 * Td;
|
||||
Tg = Ip[WS(rs, 2)];
|
||||
Tj = Im[WS(rs, 2)];
|
||||
Tf = W[8];
|
||||
Th = Tf * Tg;
|
||||
TH = Tf * Tj;
|
||||
{
|
||||
E Te, TG, Tk, TI, Tc, Ti;
|
||||
Tc = W[3];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
TG = FNMS(Tc, Ta, TF);
|
||||
Ti = W[9];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TI = FNMS(Ti, Tg, TH);
|
||||
Tl = Te - Tk;
|
||||
TS = TI - TG;
|
||||
TB = Te + Tk;
|
||||
TJ = TG + TI;
|
||||
}
|
||||
}
|
||||
{
|
||||
/* (Rp[2], Rm[2]) with (W[6], W[7]) and (Ip[0], Im[0]) with (W[0], W[1]);
   only their sums and differences (Ty, TR, TC, TO) leave this scope. */
E Tn, Tq, To, TK, Tt, Tw, Tu, TM, Tm, Ts;
|
||||
Tn = Rp[WS(rs, 2)];
|
||||
Tq = Rm[WS(rs, 2)];
|
||||
Tm = W[6];
|
||||
To = Tm * Tn;
|
||||
TK = Tm * Tq;
|
||||
Tt = Ip[0];
|
||||
Tw = Im[0];
|
||||
Ts = W[0];
|
||||
Tu = Ts * Tt;
|
||||
TM = Ts * Tw;
|
||||
{
|
||||
E Tr, TL, Tx, TN, Tp, Tv;
|
||||
Tp = W[7];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
TL = FNMS(Tp, Tn, TK);
|
||||
Tv = W[1];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
TN = FNMS(Tv, Tt, TM);
|
||||
Ty = Tr - Tx;
|
||||
TR = TN - TL;
|
||||
TC = Tr + Tx;
|
||||
TO = TL + TN;
|
||||
}
|
||||
}
|
||||
{
|
||||
/* Stores Rm[2], Rp[1], Rm[0]. */
E TT, T8, Tz, TQ;
|
||||
TT = TR - TS;
|
||||
T8 = T1 - T7;
|
||||
Tz = Tl + Ty;
|
||||
TQ = FNMS(KP500000000, Tz, T8);
|
||||
Rm[WS(rs, 2)] = T8 + Tz;
|
||||
Rp[WS(rs, 1)] = FMA(KP866025403, TT, TQ);
|
||||
Rm[0] = FNMS(KP866025403, TT, TQ);
|
||||
}
|
||||
{
|
||||
/* Stores Im[2], Ip[1], Im[0]. */
E T14, T11, T12, T13;
|
||||
T14 = Ty - Tl;
|
||||
T11 = TS + TR;
|
||||
T12 = TX - TW;
|
||||
T13 = FMA(KP500000000, T11, T12);
|
||||
Im[WS(rs, 2)] = T11 - T12;
|
||||
Ip[WS(rs, 1)] = FMA(KP866025403, T14, T13);
|
||||
Im[0] = FMS(KP866025403, T14, T13);
|
||||
}
|
||||
{
|
||||
/* Stores Rp[0], Rm[1], Rp[2]. */
E TP, TA, TD, TE;
|
||||
TP = TJ - TO;
|
||||
TA = T1 + T7;
|
||||
TD = TB + TC;
|
||||
TE = FNMS(KP500000000, TD, TA);
|
||||
Rp[0] = TA + TD;
|
||||
Rm[WS(rs, 1)] = FMA(KP866025403, TP, TE);
|
||||
Rp[WS(rs, 2)] = FNMS(KP866025403, TP, TE);
|
||||
}
|
||||
{
|
||||
/* Stores Ip[0], Ip[2], Im[1]. */
E T10, TU, TY, TZ;
|
||||
T10 = TB - TC;
|
||||
TU = TJ + TO;
|
||||
TY = TW + TX;
|
||||
TZ = FNMS(KP500000000, TU, TY);
|
||||
Ip[0] = TU + TY;
|
||||
Ip[WS(rs, 2)] = FMA(KP866025403, T10, TZ);
|
||||
Im[WS(rs, 1)] = FMS(KP866025403, T10, TZ);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for the size-6 hc2cf codelet (FMA branch); desc
 * below points at it.  It holds one TW_FULL entry with arguments (1, 6) and
 * a closing TW_NEXT entry.
 * NOTE(review): the meaning of the two numeric fields is defined by
 * tw_instr in the project headers, which are not visible here.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor for the FMA branch: size 6, name string "hc2cf_6", the
 * twiddle table, &GENUS, and { 24, 10, 22, 0 }.  The first three numbers
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * generator's operation-count comment for this variant.
 */
static const hc2c_desc desc = { 6, "hc2cf_6", twinstr, &GENUS, { 24, 10, 22, 0 } };
|
||||
|
||||
/*
 * Registration entry point (FMA branch): hands the hc2cf_6 kernel and its
 * descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf_6) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_6, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cf_6 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 28 FP multiplications,
|
||||
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 23 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_6, variant compiled when neither FMA-preference macro is defined.
 * Machine-generated size-6 kernel (see the "Generated by" line above for
 * the generator flags).
 *
 * Each loop pass reads six value pairs in place from Rp/Ip/Rm/Im at offsets
 * WS(rs, 0..2), combines five of them with consecutive (W[2k], W[2k+1])
 * coefficient pairs, and stores twelve results back into the same four
 * arrays.  The pair (Rp[0], Rm[0]) is used without a W multiply.
 *
 * Rp, Ip  - stepped forward by ms after every pass
 * Rm, Im  - stepped backward by ms after every pass
 * W       - coefficient table; 10 entries are consumed per pass and the
 *           pointer is pre-offset by (mb - 1) * 10 before the first pass
 * rs      - stride handed to WS() for the offsets inside one pass
 * mb, me  - half-open pass range [mb, me)
 * ms      - per-pass step of the four data pointers
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * the included headers and are not visible here.
 */
static void hc2cf_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T7, TS, Tv, TO, Tt, TJ, Tx, TF, Ti, TI, Tw, TC;
|
||||
{
|
||||
/* Untouched pair (Rp[0], Rm[0]) combined with (Ip[1], Im[1]) after the
   latter is multiplied by (W[4], W[5]); yields T7, TS, Tv, TO. */
E T1, TN, T6, TM;
|
||||
T1 = Rp[0];
|
||||
TN = Rm[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = Ip[WS(rs, 1)];
|
||||
T5 = Im[WS(rs, 1)];
|
||||
T2 = W[4];
|
||||
T4 = W[5];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
TM = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 - T6;
|
||||
TS = TN - TM;
|
||||
Tv = T1 + T6;
|
||||
TO = TM + TN;
|
||||
}
|
||||
{
|
||||
/* (Rp[2], Rm[2]) with (W[6], W[7]) and (Ip[0], Im[0]) with (W[0], W[1]);
   yields Tt, TJ, Tx, TF. */
E Tn, TD, Ts, TE;
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = Rp[WS(rs, 2)];
|
||||
Tm = Rm[WS(rs, 2)];
|
||||
Tj = W[6];
|
||||
Tl = W[7];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
TD = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = Ip[0];
|
||||
Tr = Im[0];
|
||||
To = W[0];
|
||||
Tq = W[1];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
TE = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
Tt = Tn - Ts;
|
||||
TJ = TE - TD;
|
||||
Tx = Tn + Ts;
|
||||
TF = TD + TE;
|
||||
}
|
||||
{
|
||||
/* (Rp[1], Rm[1]) with (W[2], W[3]) and (Ip[2], Im[2]) with (W[8], W[9]);
   yields Ti, TI, Tw, TC. */
E Tc, TA, Th, TB;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = Rp[WS(rs, 1)];
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
TA = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = Ip[WS(rs, 2)];
|
||||
Tg = Im[WS(rs, 2)];
|
||||
Td = W[8];
|
||||
Tf = W[9];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
TB = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
Ti = Tc - Th;
|
||||
TI = TA - TB;
|
||||
Tw = Tc + Th;
|
||||
TC = TA + TB;
|
||||
}
|
||||
{
|
||||
/* Stores Rm[2], Rp[1], Rm[0], Im[2], Ip[1], Im[0] (built from the
   difference terms T7, TS, Ti, Tt, TI, TJ). */
E TK, Tu, TH, TT, TR, TU;
|
||||
TK = KP866025403 * (TI + TJ);
|
||||
Tu = Ti + Tt;
|
||||
TH = FNMS(KP500000000, Tu, T7);
|
||||
Rm[WS(rs, 2)] = T7 + Tu;
|
||||
Rp[WS(rs, 1)] = TH + TK;
|
||||
Rm[0] = TH - TK;
|
||||
TT = KP866025403 * (Tt - Ti);
|
||||
TR = TJ - TI;
|
||||
TU = FMA(KP500000000, TR, TS);
|
||||
Im[WS(rs, 2)] = TR - TS;
|
||||
Ip[WS(rs, 1)] = TT + TU;
|
||||
Im[0] = TT - TU;
|
||||
}
|
||||
{
|
||||
/* Stores Rp[0], Rm[1], Rp[2], Ip[0], Ip[2], Im[1] (built from the sum
   terms Tv, TO, Tw, Tx, TC, TF). */
E TG, Ty, Tz, TP, TL, TQ;
|
||||
TG = KP866025403 * (TC - TF);
|
||||
Ty = Tw + Tx;
|
||||
Tz = FNMS(KP500000000, Ty, Tv);
|
||||
Rp[0] = Tv + Ty;
|
||||
Rm[WS(rs, 1)] = Tz + TG;
|
||||
Rp[WS(rs, 2)] = Tz - TG;
|
||||
TP = KP866025403 * (Tw - Tx);
|
||||
TL = TC + TF;
|
||||
TQ = FNMS(KP500000000, TL, TO);
|
||||
Ip[0] = TL + TO;
|
||||
Ip[WS(rs, 2)] = TP + TQ;
|
||||
Im[WS(rs, 1)] = TP - TQ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for the size-6 hc2cf codelet (non-FMA branch);
 * desc below points at it.  It holds one TW_FULL entry with arguments
 * (1, 6) and a closing TW_NEXT entry.
 * NOTE(review): the meaning of the two numeric fields is defined by
 * tw_instr in the project headers, which are not visible here.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor for the non-FMA branch: size 6, name string "hc2cf_6",
 * the twiddle table, &GENUS, and { 32, 14, 14, 0 }.  The first three
 * numbers equal the additions / multiplications / fused multiply-adds
 * quoted in the generator's operation-count comment for this variant.
 */
static const hc2c_desc desc = { 6, "hc2cf_6", twinstr, &GENUS, { 32, 14, 14, 0 } };
|
||||
|
||||
/*
 * Registration entry point (non-FMA branch): hands the hc2cf_6 kernel and
 * its descriptor to planner p through X(khc2c_register), tagged
 * HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf_6) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_6, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
376
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_8.c
Normal file
376
fftw-3.3.10/rdft/scalar/r2cf/hc2cf_8.c
Normal file
@@ -0,0 +1,376 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:31 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 36 FP multiplications,
|
||||
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
|
||||
* 34 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_8, variant compiled when the FMA-preference macros are defined.
 * Machine-generated size-8 kernel (see the "Generated by" line above for
 * the generator flags).
 *
 * Each loop pass reads eight value pairs in place from Rp/Ip/Rm/Im at
 * offsets WS(rs, 0..3), combines seven of them with consecutive (W[2k],
 * W[2k+1]) coefficient pairs, and stores sixteen results back into the same
 * four arrays.  The pair (Rp[0], Rm[0]) is used without a W multiply.
 *
 * Rp, Ip  - stepped forward by ms after every pass
 * Rm, Im  - stepped backward by ms after every pass
 * W       - coefficient table; 14 entries are consumed per pass and the
 *           pointer is pre-offset by (mb - 1) * 14 before the first pass
 * rs      - stride handed to WS() for the offsets inside one pass
 * mb, me  - half-open pass range [mb, me)
 * ms      - per-pass step of the four data pointers
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS, FMS and MAKE_VOLATILE_STRIDE come
 * from the included headers and are not visible here.
 */
static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
|
||||
E TX, Ty, TZ, TV, T10;
|
||||
/* First pair is taken as-is: no W coefficient is applied to it. */
T1 = Rp[0];
|
||||
T1m = Rm[0];
|
||||
{
|
||||
/* (Rp[2], Rm[2]) combined with (W[6], W[7]) -> T7, T1l */
E T3, T6, T4, T1k, T2, T5;
|
||||
T3 = Rp[WS(rs, 2)];
|
||||
T6 = Rm[WS(rs, 2)];
|
||||
T2 = W[6];
|
||||
T4 = T2 * T3;
|
||||
T1k = T2 * T6;
|
||||
T5 = W[7];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1l = FNMS(T5, T3, T1k);
|
||||
}
|
||||
{
|
||||
/* (Rp[3], Rm[3]) combined with (W[10], W[11]) -> Tk, TS */
E Tg, Tj, Th, TR, Tf, Ti;
|
||||
Tg = Rp[WS(rs, 3)];
|
||||
Tj = Rm[WS(rs, 3)];
|
||||
Tf = W[10];
|
||||
Th = Tf * Tg;
|
||||
TR = Tf * Tj;
|
||||
Ti = W[11];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TS = FNMS(Ti, Tg, TR);
|
||||
}
|
||||
{
|
||||
/* (Rp[1], Rm[1]) combined with (W[2], W[3]) -> Te, TQ */
E Ta, Td, Tb, TP, T9, Tc;
|
||||
Ta = Rp[WS(rs, 1)];
|
||||
Td = Rm[WS(rs, 1)];
|
||||
T9 = W[2];
|
||||
Tb = T9 * Ta;
|
||||
TP = T9 * Td;
|
||||
Tc = W[3];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
TQ = FNMS(Tc, Ta, TP);
|
||||
}
|
||||
{
|
||||
/* (Ip[3], Im[3]) with (W[12], W[13]) -> TF, T14 and (Ip[1], Im[1]) with
   (W[4], W[5]) -> TL, T16; their differences are kept as T12, T17. */
E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
|
||||
TB = Ip[WS(rs, 3)];
|
||||
TE = Im[WS(rs, 3)];
|
||||
TA = W[12];
|
||||
TC = TA * TB;
|
||||
T13 = TA * TE;
|
||||
TH = Ip[WS(rs, 1)];
|
||||
TK = Im[WS(rs, 1)];
|
||||
TG = W[4];
|
||||
TI = TG * TH;
|
||||
T15 = TG * TK;
|
||||
TD = W[13];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T14 = FNMS(TD, TB, T13);
|
||||
TJ = W[5];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T16 = FNMS(TJ, TH, T15);
|
||||
T12 = TF - TL;
|
||||
T17 = T14 - T16;
|
||||
}
|
||||
{
|
||||
/* (Ip[0], Im[0]) with (W[0], W[1]) -> Ts, TX and (Ip[2], Im[2]) with
   (W[8], W[9]) -> Ty, TZ; their differences are kept as TV, T10. */
E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
|
||||
To = Ip[0];
|
||||
Tr = Im[0];
|
||||
Tn = W[0];
|
||||
Tp = Tn * To;
|
||||
TW = Tn * Tr;
|
||||
Tu = Ip[WS(rs, 2)];
|
||||
Tx = Im[WS(rs, 2)];
|
||||
Tt = W[8];
|
||||
Tv = Tt * Tu;
|
||||
TY = Tt * Tx;
|
||||
Tq = W[1];
|
||||
Ts = FMA(Tq, Tr, Tp);
|
||||
TX = FNMS(Tq, To, TW);
|
||||
Tw = W[9];
|
||||
Ty = FMA(Tw, Tx, Tv);
|
||||
TZ = FNMS(Tw, Tu, TY);
|
||||
TV = Ts - Ty;
|
||||
T10 = TX - TZ;
|
||||
}
|
||||
{
|
||||
/* Eight stores that involve KP707106781: Rm[2], Im[2], Rp[1], Ip[1],
   Rm[0], Im[0], Rp[3], Ip[3]. */
E TU, T1a, T1t, T1v, T19, T1w, T1d, T1u;
|
||||
{
|
||||
E TO, TT, T1r, T1s;
|
||||
TO = T1 - T7;
|
||||
TT = TQ - TS;
|
||||
TU = TO + TT;
|
||||
T1a = TO - TT;
|
||||
T1r = T1m - T1l;
|
||||
T1s = Te - Tk;
|
||||
T1t = T1r - T1s;
|
||||
T1v = T1s + T1r;
|
||||
}
|
||||
{
|
||||
E T11, T18, T1b, T1c;
|
||||
T11 = TV + T10;
|
||||
T18 = T12 - T17;
|
||||
T19 = T11 + T18;
|
||||
T1w = T18 - T11;
|
||||
T1b = T10 - TV;
|
||||
T1c = T12 + T17;
|
||||
T1d = T1b - T1c;
|
||||
T1u = T1b + T1c;
|
||||
}
|
||||
Rm[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
|
||||
Im[WS(rs, 2)] = FMS(KP707106781, T1u, T1t);
|
||||
Rp[WS(rs, 1)] = FMA(KP707106781, T19, TU);
|
||||
Ip[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
|
||||
Rm[0] = FNMS(KP707106781, T1d, T1a);
|
||||
Im[0] = FMS(KP707106781, T1w, T1v);
|
||||
Rp[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
|
||||
Ip[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
|
||||
}
|
||||
{
|
||||
/* Eight stores formed from plain sums and differences: Rm[3], Im[3],
   Rp[0], Ip[0], Rm[1], Im[1], Rp[2], Ip[2]. */
E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
|
||||
{
|
||||
E T8, Tl, T1j, T1n;
|
||||
T8 = T1 + T7;
|
||||
Tl = Te + Tk;
|
||||
Tm = T8 + Tl;
|
||||
T1e = T8 - Tl;
|
||||
T1j = TQ + TS;
|
||||
T1n = T1l + T1m;
|
||||
T1o = T1j + T1n;
|
||||
T1q = T1n - T1j;
|
||||
}
|
||||
{
|
||||
E Tz, TM, T1f, T1g;
|
||||
Tz = Ts + Ty;
|
||||
TM = TF + TL;
|
||||
TN = Tz + TM;
|
||||
T1p = TM - Tz;
|
||||
T1f = TX + TZ;
|
||||
T1g = T14 + T16;
|
||||
T1h = T1f - T1g;
|
||||
T1i = T1f + T1g;
|
||||
}
|
||||
Rm[WS(rs, 3)] = Tm - TN;
|
||||
Im[WS(rs, 3)] = T1i - T1o;
|
||||
Rp[0] = Tm + TN;
|
||||
Ip[0] = T1i + T1o;
|
||||
Rm[WS(rs, 1)] = T1e - T1h;
|
||||
Im[WS(rs, 1)] = T1p - T1q;
|
||||
Rp[WS(rs, 2)] = T1e + T1h;
|
||||
Ip[WS(rs, 2)] = T1p + T1q;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for the size-8 hc2cf codelet (FMA branch); desc
 * below points at it.  It holds one TW_FULL entry with arguments (1, 8) and
 * a closing TW_NEXT entry.
 * NOTE(review): the meaning of the two numeric fields is defined by
 * tw_instr in the project headers, which are not visible here.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor for the FMA branch: size 8, name string "hc2cf_8", the
 * twiddle table, &GENUS, and { 44, 14, 22, 0 }.  The first three numbers
 * equal the additions / multiplications / fused multiply-adds quoted in the
 * generator's operation-count comment for this variant.
 */
static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, { 44, 14, 22, 0 } };
|
||||
|
||||
/*
 * Registration entry point (FMA branch): hands the hc2cf_8 kernel and its
 * descriptor to planner p through X(khc2c_register), tagged HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2c.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cf_8 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 32 FP multiplications,
|
||||
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 28 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cf_8, variant compiled when neither FMA-preference macro is defined.
 * Machine-generated size-8 kernel (see the "Generated by" line above for
 * the generator flags).
 *
 * Each loop pass reads eight value pairs in place from Rp/Ip/Rm/Im at
 * offsets WS(rs, 0..3), combines seven of them with consecutive (W[2k],
 * W[2k+1]) coefficient pairs, and stores sixteen results back into the same
 * four arrays.  The pair (Rp[0], Rm[0]) is used without a W multiply.
 *
 * Rp, Ip  - stepped forward by ms after every pass
 * Rm, Im  - stepped backward by ms after every pass
 * W       - coefficient table; 14 entries are consumed per pass and the
 *           pointer is pre-offset by (mb - 1) * 14 before the first pass
 * rs      - stride handed to WS() for the offsets inside one pass
 * mb, me  - half-open pass range [mb, me)
 * ms      - per-pass step of the four data pointers
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * the included headers and are not visible here.
 */
static void hc2cf_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
|
||||
E TP;
|
||||
{
|
||||
/* Untouched pair (Rp[0], Rm[0]) combined with (Rp[2], Rm[2]) after the
   latter is multiplied by (W[6], W[7]); yields T7, T1e, TH, T19. */
E T1, T18, T6, T17;
|
||||
T1 = Rp[0];
|
||||
T18 = Rm[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = Rp[WS(rs, 2)];
|
||||
T5 = Rm[WS(rs, 2)];
|
||||
T2 = W[6];
|
||||
T4 = W[7];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T17 = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 + T6;
|
||||
T1e = T18 - T17;
|
||||
TH = T1 - T6;
|
||||
T19 = T17 + T18;
|
||||
}
|
||||
{
|
||||
/* (Ip[3], Im[3]) with (W[12], W[13]) and (Ip[1], Im[1]) with
   (W[4], W[5]); yields TF, T13, TR, TU. */
E Tz, TS, TE, TT;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = Ip[WS(rs, 3)];
|
||||
Ty = Im[WS(rs, 3)];
|
||||
Tv = W[12];
|
||||
Tx = W[13];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
TS = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = Ip[WS(rs, 1)];
|
||||
TD = Im[WS(rs, 1)];
|
||||
TA = W[4];
|
||||
TC = W[5];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
TT = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
TF = Tz + TE;
|
||||
T13 = TS + TT;
|
||||
TR = Tz - TE;
|
||||
TU = TS - TT;
|
||||
}
|
||||
{
|
||||
/* (Rp[1], Rm[1]) with (W[2], W[3]) and (Rp[3], Rm[3]) with
   (W[10], W[11]); yields Ti, T1f, TK, T16. */
E Tc, TI, Th, TJ;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = Rp[WS(rs, 1)];
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
TI = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = Rp[WS(rs, 3)];
|
||||
Tg = Rm[WS(rs, 3)];
|
||||
Td = W[10];
|
||||
Tf = W[11];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
TJ = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
Ti = Tc + Th;
|
||||
T1f = Tc - Th;
|
||||
TK = TI - TJ;
|
||||
T16 = TI + TJ;
|
||||
}
|
||||
{
|
||||
/* (Ip[0], Im[0]) with (W[0], W[1]) and (Ip[2], Im[2]) with
   (W[8], W[9]); yields Tu, T12, TM, TP. */
E To, TN, Tt, TO;
|
||||
{
|
||||
E Tl, Tn, Tk, Tm;
|
||||
Tl = Ip[0];
|
||||
Tn = Im[0];
|
||||
Tk = W[0];
|
||||
Tm = W[1];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
TN = FNMS(Tm, Tl, Tk * Tn);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tp, Tr;
|
||||
Tq = Ip[WS(rs, 2)];
|
||||
Ts = Im[WS(rs, 2)];
|
||||
Tp = W[8];
|
||||
Tr = W[9];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
TO = FNMS(Tr, Tq, Tp * Ts);
|
||||
}
|
||||
Tu = To + Tt;
|
||||
T12 = TN + TO;
|
||||
TM = To - Tt;
|
||||
TP = TN - TO;
|
||||
}
|
||||
{
|
||||
/* Output stage.  The first eight stores (Rm[3], Rp[0], Im[3], Ip[0],
   Rm[1], Rp[2], Im[1], Ip[2]) are plain sums and differences; the last
   two scopes produce the eight stores scaled through KP707106781. */
E Tj, TG, T1b, T1c;
|
||||
Tj = T7 + Ti;
|
||||
TG = Tu + TF;
|
||||
Rm[WS(rs, 3)] = Tj - TG;
|
||||
Rp[0] = Tj + TG;
|
||||
{
|
||||
E T15, T1a, T11, T14;
|
||||
T15 = T12 + T13;
|
||||
T1a = T16 + T19;
|
||||
Im[WS(rs, 3)] = T15 - T1a;
|
||||
Ip[0] = T15 + T1a;
|
||||
T11 = T7 - Ti;
|
||||
T14 = T12 - T13;
|
||||
Rm[WS(rs, 1)] = T11 - T14;
|
||||
Rp[WS(rs, 2)] = T11 + T14;
|
||||
}
|
||||
T1b = TF - Tu;
|
||||
T1c = T19 - T16;
|
||||
Im[WS(rs, 1)] = T1b - T1c;
|
||||
Ip[WS(rs, 2)] = T1b + T1c;
|
||||
{
|
||||
E TX, T1g, T10, T1d, TY, TZ;
|
||||
TX = TH - TK;
|
||||
T1g = T1e - T1f;
|
||||
TY = TP - TM;
|
||||
TZ = TR + TU;
|
||||
T10 = KP707106781 * (TY - TZ);
|
||||
T1d = KP707106781 * (TY + TZ);
|
||||
Rm[0] = TX - T10;
|
||||
Ip[WS(rs, 1)] = T1d + T1g;
|
||||
Rp[WS(rs, 3)] = TX + T10;
|
||||
Im[WS(rs, 2)] = T1d - T1g;
|
||||
}
|
||||
{
|
||||
E TL, T1i, TW, T1h, TQ, TV;
|
||||
TL = TH + TK;
|
||||
T1i = T1f + T1e;
|
||||
TQ = TM + TP;
|
||||
TV = TR - TU;
|
||||
TW = KP707106781 * (TQ + TV);
|
||||
T1h = KP707106781 * (TV - TQ);
|
||||
Rm[WS(rs, 2)] = TL - TW;
|
||||
Ip[WS(rs, 3)] = T1h + T1i;
|
||||
Rp[WS(rs, 1)] = TL + TW;
|
||||
Im[0] = T1h - T1i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table for the size-8 hc2cf codelet (non-FMA branch);
 * desc below points at it.  It holds one TW_FULL entry with arguments
 * (1, 8) and a closing TW_NEXT entry.
 * NOTE(review): the meaning of the two numeric fields is defined by
 * tw_instr in the project headers, which are not visible here.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor for the non-FMA branch: size 8, name string "hc2cf_8",
 * the twiddle table, &GENUS, and { 52, 18, 14, 0 }.  The first three
 * numbers equal the additions / multiplications / fused multiply-adds
 * quoted in the generator's operation-count comment for this variant.
 */
static const hc2c_desc desc = { 8, "hc2cf_8", twinstr, &GENUS, { 52, 18, 14, 0 } };
|
||||
|
||||
/*
 * Registration entry point (non-FMA branch): hands the hc2cf_8 kernel and
 * its descriptor to planner p through X(khc2c_register), tagged
 * HC2C_VIA_RDFT.
 */
void X(codelet_hc2cf_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cf_8, &desc, HC2C_VIA_RDFT);
|
||||
}
|
||||
#endif
|
||||
937
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_16.c
Normal file
937
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_16.c
Normal file
@@ -0,0 +1,937 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:38 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 228 FP additions, 166 FP multiplications,
|
||||
* (or, 136 additions, 74 multiplications, 92 fused multiply/add),
|
||||
* 91 stack variables, 4 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft2_16, variant compiled when the FMA-preference macros are
 * defined.  Machine-generated size-16 kernel (see the "Generated by" line
 * above; it was produced with -twiddle-log3 -precompute-twiddles).
 *
 * Each loop pass:
 *   1. loads the eight stored coefficients W[0..7] and derives every other
 *      coefficient it needs as products of those eight;
 *   2. reads Rp/Ip/Rm/Im at offsets WS(rs, 0..7), forming the sum and
 *      difference of each Ip/Im and Rp/Rm pair before any coefficient is
 *      applied;
 *   3. stores 32 results back into the same four arrays, every one of them
 *      scaled by KP500000000.
 *
 * Rp, Ip  - stepped forward by ms after every pass
 * Rm, Im  - stepped backward by ms after every pass
 * W       - coefficient table; 8 entries are consumed per pass and the
 *           pointer is pre-offset by (mb - 1) * 8 before the first pass
 * rs      - stride handed to WS() for the offsets inside one pass
 * mb, me  - half-open pass range [mb, me)
 * ms      - per-pass step of the four data pointers
 *
 * NOTE(review): E, R, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from
 * the included headers and are not visible here.
 */
static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E T1, T2, Tw, Ty, Th, Tj, T4, T5, TY, T6, Tk, T1o, T1d, Tz, T1j;
|
||||
E Tq, TF, T18, TR, TL, T13, T1A, T1K, T1E, T1H, Tc, T25, T2k, T29, T2h;
|
||||
{
|
||||
/* Coefficient setup: load W[0..7], then build the derived coefficients
   (TY, T6, Tk, ... T2h) from products of the loaded values. */
E Tx, TE, Ti, TK, Tp, TQ, Tb, T3;
|
||||
T1 = W[0];
|
||||
T2 = W[2];
|
||||
T3 = T1 * T2;
|
||||
Tw = W[6];
|
||||
Tx = T1 * Tw;
|
||||
Ty = W[7];
|
||||
TE = T1 * Ty;
|
||||
Th = W[4];
|
||||
Ti = T1 * Th;
|
||||
TK = T2 * Th;
|
||||
Tj = W[5];
|
||||
Tp = T1 * Tj;
|
||||
TQ = T2 * Tj;
|
||||
T4 = W[1];
|
||||
T5 = W[3];
|
||||
Tb = T1 * T5;
|
||||
TY = FNMS(T4, T5, T3);
|
||||
T6 = FMA(T4, T5, T3);
|
||||
Tk = FNMS(T4, Tj, Ti);
|
||||
T1o = FNMS(T4, Th, Tp);
|
||||
T1d = FMA(T5, Th, TQ);
|
||||
Tz = FMA(T4, Ty, Tx);
|
||||
T1j = FMA(T4, Tj, Ti);
|
||||
Tq = FMA(T4, Th, Tp);
|
||||
TF = FNMS(T4, Tw, TE);
|
||||
T18 = FNMS(T5, Tj, TK);
|
||||
TR = FNMS(T5, Th, TQ);
|
||||
TL = FMA(T5, Tj, TK);
|
||||
{
|
||||
E T1z, T1D, T24, T28;
|
||||
T1z = TY * Th;
|
||||
T1D = TY * Tj;
|
||||
T13 = FMA(T4, T2, Tb);
|
||||
T1A = FMA(T13, Tj, T1z);
|
||||
T1K = FMA(T13, Th, T1D);
|
||||
T1E = FNMS(T13, Th, T1D);
|
||||
T1H = FNMS(T13, Tj, T1z);
|
||||
T24 = T6 * Th;
|
||||
T28 = T6 * Tj;
|
||||
Tc = FNMS(T4, T2, Tb);
|
||||
T25 = FNMS(Tc, Tj, T24);
|
||||
T2k = FNMS(Tc, Th, T28);
|
||||
T29 = FMA(Tc, Th, T28);
|
||||
T2h = FMA(Tc, Tj, T24);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1v, T2q, T1s, T2s, T38, T3T, T1Y, T3P, T17, T1h, T2x, T2v, T33, T3Q, T1N;
|
||||
E T3S, Tg, Tu, T3A, T2B, T2D, T3B, T2c, T3L, T2S, T3I, TJ, TV, T3E, T2G;
|
||||
E T2I, T3D, T2n, T3J, T2X, T3M;
|
||||
{
|
||||
/* Inputs at offsets 0 and 4. */
E T1t, T1u, T1W, T1m, T1Q, T1S, T1T, T1V, T36, T1r, T34, T1P, T1k, T1l, T1n;
|
||||
E T2r;
|
||||
T1t = Ip[0];
|
||||
T1u = Im[0];
|
||||
T1W = T1t + T1u;
|
||||
T1k = Ip[WS(rs, 4)];
|
||||
T1l = Im[WS(rs, 4)];
|
||||
T1m = T1k - T1l;
|
||||
T1Q = T1k + T1l;
|
||||
{
|
||||
E T1U, T1p, T1q, T1O;
|
||||
T1S = Rm[0];
|
||||
T1T = Rp[0];
|
||||
T1U = T1S - T1T;
|
||||
T1V = T1 * T1U;
|
||||
T36 = T4 * T1U;
|
||||
T1p = Rp[WS(rs, 4)];
|
||||
T1q = Rm[WS(rs, 4)];
|
||||
T1O = T1q - T1p;
|
||||
T1r = T1p + T1q;
|
||||
T34 = Tj * T1O;
|
||||
T1P = Th * T1O;
|
||||
}
|
||||
T1v = T1t - T1u;
|
||||
T2q = T1T + T1S;
|
||||
T1n = T1j * T1m;
|
||||
T1s = FNMS(T1o, T1r, T1n);
|
||||
T2r = T1j * T1r;
|
||||
T2s = FMA(T1o, T1m, T2r);
|
||||
{
|
||||
E T35, T37, T1R, T1X;
|
||||
T35 = FMA(Th, T1Q, T34);
|
||||
T37 = FMA(T1, T1W, T36);
|
||||
T38 = T35 + T37;
|
||||
T3T = T37 - T35;
|
||||
T1R = FNMS(Tj, T1Q, T1P);
|
||||
T1X = FNMS(T4, T1W, T1V);
|
||||
T1Y = T1R + T1X;
|
||||
T3P = T1X - T1R;
|
||||
}
|
||||
}
|
||||
{
|
||||
/* Inputs at offsets 2 and 6. */
E T11, T1F, T16, T2Z, T1C, T1b, T1L, T1g, T31, T1J;
|
||||
{
|
||||
E TZ, T10, T14, T15, T1B;
|
||||
TZ = Ip[WS(rs, 2)];
|
||||
T10 = Im[WS(rs, 2)];
|
||||
T11 = TZ - T10;
|
||||
T1F = TZ + T10;
|
||||
T14 = Rp[WS(rs, 2)];
|
||||
T15 = Rm[WS(rs, 2)];
|
||||
T1B = T15 - T14;
|
||||
T16 = T14 + T15;
|
||||
T2Z = T1E * T1B;
|
||||
T1C = T1A * T1B;
|
||||
}
|
||||
{
|
||||
E T19, T1a, T1e, T1f, T1I;
|
||||
T19 = Ip[WS(rs, 6)];
|
||||
T1a = Im[WS(rs, 6)];
|
||||
T1b = T19 - T1a;
|
||||
T1L = T19 + T1a;
|
||||
T1e = Rp[WS(rs, 6)];
|
||||
T1f = Rm[WS(rs, 6)];
|
||||
T1I = T1f - T1e;
|
||||
T1g = T1e + T1f;
|
||||
T31 = T1K * T1I;
|
||||
T1J = T1H * T1I;
|
||||
}
|
||||
{
|
||||
E T12, T1c, T2w, T2u;
|
||||
T12 = TY * T11;
|
||||
T17 = FNMS(T13, T16, T12);
|
||||
T1c = T18 * T1b;
|
||||
T1h = FNMS(T1d, T1g, T1c);
|
||||
T2w = T18 * T1g;
|
||||
T2x = FMA(T1d, T1b, T2w);
|
||||
T2u = TY * T16;
|
||||
T2v = FMA(T13, T11, T2u);
|
||||
{
|
||||
E T30, T32, T1G, T1M;
|
||||
T30 = FMA(T1A, T1F, T2Z);
|
||||
T32 = FMA(T1H, T1L, T31);
|
||||
T33 = T30 + T32;
|
||||
T3Q = T30 - T32;
|
||||
T1G = FNMS(T1E, T1F, T1C);
|
||||
T1M = FNMS(T1K, T1L, T1J);
|
||||
T1N = T1G + T1M;
|
||||
T3S = T1G - T1M;
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
/* Inputs at offsets 1 and 5. */
E T9, T22, Ta, T2O, Tf, T20, T21, T2A, Tn, T2a, To, T2Q, Tt, T26, T27;
|
||||
E T2C;
|
||||
{
|
||||
E T7, T8, Td, Te;
|
||||
T7 = Ip[WS(rs, 1)];
|
||||
T8 = Im[WS(rs, 1)];
|
||||
T9 = T7 - T8;
|
||||
T22 = T7 + T8;
|
||||
Ta = T6 * T9;
|
||||
T2O = T2 * T22;
|
||||
Td = Rp[WS(rs, 1)];
|
||||
Te = Rm[WS(rs, 1)];
|
||||
Tf = Td + Te;
|
||||
T20 = Td - Te;
|
||||
T21 = T2 * T20;
|
||||
T2A = T6 * Tf;
|
||||
}
|
||||
{
|
||||
E Tl, Tm, Tr, Ts;
|
||||
Tl = Ip[WS(rs, 5)];
|
||||
Tm = Im[WS(rs, 5)];
|
||||
Tn = Tl - Tm;
|
||||
T2a = Tl + Tm;
|
||||
To = Tk * Tn;
|
||||
T2Q = T25 * T2a;
|
||||
Tr = Rp[WS(rs, 5)];
|
||||
Ts = Rm[WS(rs, 5)];
|
||||
Tt = Tr + Ts;
|
||||
T26 = Tr - Ts;
|
||||
T27 = T25 * T26;
|
||||
T2C = Tk * Tt;
|
||||
}
|
||||
Tg = FNMS(Tc, Tf, Ta);
|
||||
Tu = FNMS(Tq, Tt, To);
|
||||
T3A = Tg - Tu;
|
||||
T2B = FMA(Tc, T9, T2A);
|
||||
T2D = FMA(Tq, Tn, T2C);
|
||||
T3B = T2B - T2D;
|
||||
{
|
||||
E T23, T2b, T2P, T2R;
|
||||
T23 = FMA(T5, T22, T21);
|
||||
T2b = FMA(T29, T2a, T27);
|
||||
T2c = T23 + T2b;
|
||||
T3L = T2b - T23;
|
||||
T2P = FNMS(T5, T20, T2O);
|
||||
T2R = FNMS(T29, T26, T2Q);
|
||||
T2S = T2P + T2R;
|
||||
T3I = T2R - T2P;
|
||||
}
|
||||
}
|
||||
{
|
||||
/* Inputs at offsets 7 and 3. */
E TC, T2f, TD, T2T, TI, T2d, T2e, T2F, TO, T2l, TP, T2V, TU, T2i, T2j;
|
||||
E T2H;
|
||||
{
|
||||
E TA, TB, TG, TH;
|
||||
TA = Ip[WS(rs, 7)];
|
||||
TB = Im[WS(rs, 7)];
|
||||
TC = TA - TB;
|
||||
T2f = TA + TB;
|
||||
TD = Tz * TC;
|
||||
T2T = Tw * T2f;
|
||||
TG = Rp[WS(rs, 7)];
|
||||
TH = Rm[WS(rs, 7)];
|
||||
TI = TG + TH;
|
||||
T2d = TG - TH;
|
||||
T2e = Tw * T2d;
|
||||
T2F = Tz * TI;
|
||||
}
|
||||
{
|
||||
E TM, TN, TS, TT;
|
||||
TM = Ip[WS(rs, 3)];
|
||||
TN = Im[WS(rs, 3)];
|
||||
TO = TM - TN;
|
||||
T2l = TM + TN;
|
||||
TP = TL * TO;
|
||||
T2V = T2h * T2l;
|
||||
TS = Rp[WS(rs, 3)];
|
||||
TT = Rm[WS(rs, 3)];
|
||||
TU = TS + TT;
|
||||
T2i = TS - TT;
|
||||
T2j = T2h * T2i;
|
||||
T2H = TL * TU;
|
||||
}
|
||||
TJ = FNMS(TF, TI, TD);
|
||||
TV = FNMS(TR, TU, TP);
|
||||
T3E = TJ - TV;
|
||||
T2G = FMA(TF, TC, T2F);
|
||||
T2I = FMA(TR, TO, T2H);
|
||||
T3D = T2G - T2I;
|
||||
{
|
||||
E T2g, T2m, T2U, T2W;
|
||||
T2g = FMA(Ty, T2f, T2e);
|
||||
T2m = FMA(T2k, T2l, T2j);
|
||||
T2n = T2g + T2m;
|
||||
T3J = T2m - T2g;
|
||||
T2U = FNMS(Ty, T2d, T2T);
|
||||
T2W = FNMS(T2k, T2i, T2V);
|
||||
T2X = T2U + T2W;
|
||||
T3M = T2U - T2W;
|
||||
}
|
||||
}
|
||||
{
|
||||
/* First output group: the sixteen stores at even offsets
   (0, 2, 4, 6 on Rp/Ip and 1, 3, 5, 7 on Rm/Im). */
E TX, T3o, T3i, T3s, T3l, T3t, T1x, T3e, T2p, T2M, T2K, T3d, T3a, T3c, T2z;
|
||||
E T3n;
|
||||
{
|
||||
E Tv, TW, T3g, T3h;
|
||||
Tv = Tg + Tu;
|
||||
TW = TJ + TV;
|
||||
TX = Tv + TW;
|
||||
T3o = Tv - TW;
|
||||
T3g = T2X - T2S;
|
||||
T3h = T2c - T2n;
|
||||
T3i = T3g + T3h;
|
||||
T3s = T3g - T3h;
|
||||
}
|
||||
{
|
||||
E T3j, T3k, T1i, T1w;
|
||||
T3j = T1Y - T1N;
|
||||
T3k = T38 - T33;
|
||||
T3l = T3j - T3k;
|
||||
T3t = T3j + T3k;
|
||||
T1i = T17 + T1h;
|
||||
T1w = T1s + T1v;
|
||||
T1x = T1i + T1w;
|
||||
T3e = T1w - T1i;
|
||||
}
|
||||
{
|
||||
E T1Z, T2o, T2E, T2J;
|
||||
T1Z = T1N + T1Y;
|
||||
T2o = T2c + T2n;
|
||||
T2p = T1Z - T2o;
|
||||
T2M = T2o + T1Z;
|
||||
T2E = T2B + T2D;
|
||||
T2J = T2G + T2I;
|
||||
T2K = T2E + T2J;
|
||||
T3d = T2J - T2E;
|
||||
}
|
||||
{
|
||||
E T2Y, T39, T2t, T2y;
|
||||
T2Y = T2S + T2X;
|
||||
T39 = T33 + T38;
|
||||
T3a = T2Y - T39;
|
||||
T3c = T2Y + T39;
|
||||
T2t = T2q + T2s;
|
||||
T2y = T2v + T2x;
|
||||
T2z = T2t + T2y;
|
||||
T3n = T2t - T2y;
|
||||
}
|
||||
{
|
||||
E T1y, T3b, T2L, T2N;
|
||||
T1y = TX + T1x;
|
||||
Ip[0] = KP500000000 * (T1y + T2p);
|
||||
Im[WS(rs, 7)] = KP500000000 * (T2p - T1y);
|
||||
T3b = T2z + T2K;
|
||||
Rm[WS(rs, 7)] = KP500000000 * (T3b - T3c);
|
||||
Rp[0] = KP500000000 * (T3b + T3c);
|
||||
T2L = T2z - T2K;
|
||||
Rm[WS(rs, 3)] = KP500000000 * (T2L - T2M);
|
||||
Rp[WS(rs, 4)] = KP500000000 * (T2L + T2M);
|
||||
T2N = T1x - TX;
|
||||
Ip[WS(rs, 4)] = KP500000000 * (T2N + T3a);
|
||||
Im[WS(rs, 3)] = KP500000000 * (T3a - T2N);
|
||||
}
|
||||
{
|
||||
E T3f, T3m, T3v, T3w;
|
||||
T3f = T3d + T3e;
|
||||
T3m = T3i + T3l;
|
||||
Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3m, T3f));
|
||||
Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3m, T3f)));
|
||||
T3v = T3n + T3o;
|
||||
T3w = T3s + T3t;
|
||||
Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3w, T3v));
|
||||
Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3w, T3v));
|
||||
}
|
||||
{
|
||||
E T3p, T3q, T3r, T3u;
|
||||
T3p = T3n - T3o;
|
||||
T3q = T3l - T3i;
|
||||
Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3q, T3p));
|
||||
Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3q, T3p));
|
||||
T3r = T3e - T3d;
|
||||
T3u = T3s - T3t;
|
||||
Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3u, T3r));
|
||||
Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3u, T3r)));
|
||||
}
|
||||
}
|
||||
{
|
||||
/* Second output group: the sixteen stores at odd offsets
   (1, 3, 5, 7 on Rp/Ip and 0, 2, 4, 6 on Rm/Im), which additionally use
   KP414213562 and KP923879532. */
E T3z, T4b, T4g, T4q, T4j, T4r, T3G, T4m, T3O, T46, T3Z, T4l, T42, T4c, T3V;
|
||||
E T47;
|
||||
{
|
||||
E T3x, T3y, T4e, T4f;
|
||||
T3x = T1v - T1s;
|
||||
T3y = T2v - T2x;
|
||||
T3z = T3x - T3y;
|
||||
T4b = T3y + T3x;
|
||||
T4e = T3I - T3J;
|
||||
T4f = T3M - T3L;
|
||||
T4g = FMA(KP414213562, T4f, T4e);
|
||||
T4q = FNMS(KP414213562, T4e, T4f);
|
||||
}
|
||||
{
|
||||
E T4h, T4i, T3C, T3F;
|
||||
T4h = T3Q + T3P;
|
||||
T4i = T3T - T3S;
|
||||
T4j = FMA(KP414213562, T4i, T4h);
|
||||
T4r = FNMS(KP414213562, T4h, T4i);
|
||||
T3C = T3A - T3B;
|
||||
T3F = T3D + T3E;
|
||||
T3G = T3C + T3F;
|
||||
T4m = T3C - T3F;
|
||||
}
|
||||
{
|
||||
E T3K, T3N, T3X, T3Y;
|
||||
T3K = T3I + T3J;
|
||||
T3N = T3L + T3M;
|
||||
T3O = FMA(KP414213562, T3N, T3K);
|
||||
T46 = FNMS(KP414213562, T3K, T3N);
|
||||
T3X = T2q - T2s;
|
||||
T3Y = T17 - T1h;
|
||||
T3Z = T3X + T3Y;
|
||||
T4l = T3X - T3Y;
|
||||
}
|
||||
{
|
||||
E T40, T41, T3R, T3U;
|
||||
T40 = T3B + T3A;
|
||||
T41 = T3D - T3E;
|
||||
T42 = T40 + T41;
|
||||
T4c = T41 - T40;
|
||||
T3R = T3P - T3Q;
|
||||
T3U = T3S + T3T;
|
||||
T3V = FNMS(KP414213562, T3U, T3R);
|
||||
T47 = FMA(KP414213562, T3R, T3U);
|
||||
}
|
||||
{
|
||||
E T3H, T3W, T49, T4a;
|
||||
T3H = FMA(KP707106781, T3G, T3z);
|
||||
T3W = T3O + T3V;
|
||||
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3W, T3H));
|
||||
Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3H)));
|
||||
T49 = FMA(KP707106781, T42, T3Z);
|
||||
T4a = T46 + T47;
|
||||
Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T4a, T49));
|
||||
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T4a, T49));
|
||||
}
|
||||
{
|
||||
E T43, T44, T45, T48;
|
||||
T43 = FNMS(KP707106781, T42, T3Z);
|
||||
T44 = T3V - T3O;
|
||||
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T44, T43));
|
||||
Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T44, T43));
|
||||
T45 = FNMS(KP707106781, T3G, T3z);
|
||||
T48 = T46 - T47;
|
||||
Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T48, T45));
|
||||
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T48, T45)));
|
||||
}
|
||||
{
|
||||
/* NOTE(review): in this scope Rp receives the FNMS form and Rm the FMA
   form, the reverse of the other Rp/Rm store pairs in this function --
   appears to be what the generator emitted; confirm against upstream. */
E T4d, T4k, T4t, T4u;
|
||||
T4d = FNMS(KP707106781, T4c, T4b);
|
||||
T4k = T4g - T4j;
|
||||
Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T4k, T4d));
|
||||
Im[0] = -(KP500000000 * (FNMS(KP923879532, T4k, T4d)));
|
||||
T4t = FNMS(KP707106781, T4m, T4l);
|
||||
T4u = T4q + T4r;
|
||||
Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4u, T4t));
|
||||
Rm[0] = KP500000000 * (FMA(KP923879532, T4u, T4t));
|
||||
}
|
||||
{
|
||||
E T4n, T4o, T4p, T4s;
|
||||
T4n = FMA(KP707106781, T4m, T4l);
|
||||
T4o = T4g + T4j;
|
||||
Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4o, T4n));
|
||||
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4o, T4n));
|
||||
T4p = FMA(KP707106781, T4c, T4b);
|
||||
T4s = T4q - T4r;
|
||||
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4s, T4p));
|
||||
Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4s, T4p)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by the descriptor that follows:
 * four TW_CEXP entries (last field 1, 3, 9, 15) and a TW_NEXT terminator.
 * NOTE(review): presumably each TW_CEXP entry describes one stored
 * twiddle factor -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for this kernel variant: leading field 16, the name string,
 * the twiddle table, the GENUS object and a four-number tuple.
 * NOTE(review): the tuple is presumably the (add, mul, fma, other)
 * operation counts for the FMA variant; the header comment that would
 * confirm this is outside the visible range.
 */
static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, { 136, 74, 92, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the kernel function and its descriptor
 * to the planner `p` through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft2_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hc2cfdft2_16 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 228 FP additions, 124 FP multiplications,
|
||||
* (or, 188 additions, 84 multiplications, 40 fused multiply/add),
|
||||
* 91 stack variables, 4 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft2_16 -- generated kernel body, variant produced without the
 * generator's -fma flag (see the "Generated by" comment above).
 *
 * For each m in [mb, me) it loads Rp/Ip/Rm/Im at offsets WS(rs, 0..7),
 * combines them with coefficients built from W[0..7], and writes the
 * results back into the same four arrays (in place).
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp and Ip advance by
 *                 +ms per iteration, Rm and Im by -ms.
 * W:              coefficient table; first offset by (mb - 1) * 8, then
 *                 advanced by 8 per iteration.
 * rs:             stride argument passed to WS() to form element offsets.
 * mb, me:         half-open iteration range.
 * ms:             per-iteration pointer step (see above).
 *
 * NOTE(review): by name this is presumably a half-complex-to-complex
 * forward DFT step; the transform definition is not visible in this file.
 */
static void hc2cfdft2_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Named numeric constants used by the output stages below. */
DK(KP461939766, +0.461939766255643378064091594698394143411208313);
|
||||
DK(KP191341716, +0.191341716182544885864229992015199433380672281);
|
||||
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all pointer stepping happens in the for-header. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 8, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E T1, T4, T2, T5, T7, Td, T12, TY, Tk, Ti, Tm, T1l, T1b, TL, T1h;
|
||||
E Ts, TR, T17, Ty, Tz, TA, TE, T1L, T1Q, T1H, T1O, T24, T2d, T20, T2b;
|
||||
{
|
||||
E Tl, TP, Tq, TK, Tj, TQ, Tr, TJ;
|
||||
{
|
||||
E T3, Tc, T6, Tb;
|
||||
/* Read W[0]..W[5] and form pairwise products of those values. */
T1 = W[0];
|
||||
T4 = W[1];
|
||||
T2 = W[2];
|
||||
T5 = W[3];
|
||||
T3 = T1 * T2;
|
||||
Tc = T4 * T2;
|
||||
T6 = T4 * T5;
|
||||
Tb = T1 * T5;
|
||||
T7 = T3 + T6;
|
||||
Td = Tb - Tc;
|
||||
T12 = Tb + Tc;
|
||||
TY = T3 - T6;
|
||||
Tk = W[5];
|
||||
Tl = T4 * Tk;
|
||||
TP = T2 * Tk;
|
||||
Tq = T1 * Tk;
|
||||
TK = T5 * Tk;
|
||||
Ti = W[4];
|
||||
Tj = T1 * Ti;
|
||||
TQ = T5 * Ti;
|
||||
Tr = T4 * Ti;
|
||||
TJ = T2 * Ti;
|
||||
}
|
||||
/* Sums and differences of the products: derived coefficients that scale the inputs below. */
Tm = Tj - Tl;
|
||||
T1l = Tq - Tr;
|
||||
T1b = TP + TQ;
|
||||
TL = TJ + TK;
|
||||
T1h = Tj + Tl;
|
||||
Ts = Tq + Tr;
|
||||
TR = TP - TQ;
|
||||
T17 = TJ - TK;
|
||||
Ty = W[6];
|
||||
Tz = W[7];
|
||||
TA = FMA(T1, Ty, T4 * Tz);
|
||||
TE = FNMS(T4, Ty, T1 * Tz);
|
||||
{
|
||||
E T1J, T1K, T1F, T1G;
|
||||
T1J = TY * Tk;
|
||||
T1K = T12 * Ti;
|
||||
T1L = T1J - T1K;
|
||||
T1Q = T1J + T1K;
|
||||
T1F = TY * Ti;
|
||||
T1G = T12 * Tk;
|
||||
T1H = T1F + T1G;
|
||||
T1O = T1F - T1G;
|
||||
}
|
||||
{
|
||||
E T22, T23, T1Y, T1Z;
|
||||
T22 = T7 * Tk;
|
||||
T23 = Td * Ti;
|
||||
T24 = T22 + T23;
|
||||
T2d = T22 - T23;
|
||||
T1Y = T7 * Ti;
|
||||
T1Z = Td * Tk;
|
||||
T20 = T1Y - T1Z;
|
||||
T2b = T1Y + T1Z;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1t, T3i, T2l, T3B, T1E, T3t, T2M, T3x, T1g, T3C, T2J, T3u, T1T, T3w, T2o;
|
||||
E T3j, Tx, T3b, T2C, T3q, T27, T3m, T2s, T3c, TW, T3f, T2F, T3n, T2g, T3p;
|
||||
E T2v, T3e;
|
||||
{
|
||||
E T1k, T1C, T1o, T1B, T1s, T1z, T1y, T2j, T1p, T2k;
|
||||
{
|
||||
E T1i, T1j, T1m, T1n;
|
||||
/* Inputs at offsets 4 and 0: sums and differences of the Ip/Im and Rp/Rm pairs. */
T1i = Ip[WS(rs, 4)];
|
||||
T1j = Im[WS(rs, 4)];
|
||||
T1k = T1i - T1j;
|
||||
T1C = T1i + T1j;
|
||||
T1m = Rp[WS(rs, 4)];
|
||||
T1n = Rm[WS(rs, 4)];
|
||||
T1o = T1m + T1n;
|
||||
T1B = T1m - T1n;
|
||||
}
|
||||
{
|
||||
E T1q, T1r, T1w, T1x;
|
||||
T1q = Ip[0];
|
||||
T1r = Im[0];
|
||||
T1s = T1q - T1r;
|
||||
T1z = T1q + T1r;
|
||||
T1w = Rm[0];
|
||||
T1x = Rp[0];
|
||||
T1y = T1w - T1x;
|
||||
T2j = T1x + T1w;
|
||||
}
|
||||
T1p = FNMS(T1l, T1o, T1h * T1k);
|
||||
T1t = T1p + T1s;
|
||||
T3i = T1s - T1p;
|
||||
T2k = FMA(T1h, T1o, T1l * T1k);
|
||||
T2l = T2j + T2k;
|
||||
T3B = T2j - T2k;
|
||||
{
|
||||
E T1A, T1D, T2K, T2L;
|
||||
T1A = FNMS(T4, T1z, T1 * T1y);
|
||||
T1D = FMA(Ti, T1B, Tk * T1C);
|
||||
T1E = T1A - T1D;
|
||||
T3t = T1D + T1A;
|
||||
T2K = FNMS(Tk, T1B, Ti * T1C);
|
||||
T2L = FMA(T4, T1y, T1 * T1z);
|
||||
T2M = T2K + T2L;
|
||||
T3x = T2L - T2K;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T11, T1M, T15, T1I, T1a, T1R, T1e, T1P;
|
||||
{
|
||||
E TZ, T10, T13, T14;
|
||||
/* Inputs at offsets 2 and 6. */
TZ = Ip[WS(rs, 2)];
|
||||
T10 = Im[WS(rs, 2)];
|
||||
T11 = TZ - T10;
|
||||
T1M = TZ + T10;
|
||||
T13 = Rp[WS(rs, 2)];
|
||||
T14 = Rm[WS(rs, 2)];
|
||||
T15 = T13 + T14;
|
||||
T1I = T13 - T14;
|
||||
}
|
||||
{
|
||||
E T18, T19, T1c, T1d;
|
||||
T18 = Ip[WS(rs, 6)];
|
||||
T19 = Im[WS(rs, 6)];
|
||||
T1a = T18 - T19;
|
||||
T1R = T18 + T19;
|
||||
T1c = Rp[WS(rs, 6)];
|
||||
T1d = Rm[WS(rs, 6)];
|
||||
T1e = T1c + T1d;
|
||||
T1P = T1c - T1d;
|
||||
}
|
||||
{
|
||||
E T16, T1f, T2H, T2I;
|
||||
T16 = FNMS(T12, T15, TY * T11);
|
||||
T1f = FNMS(T1b, T1e, T17 * T1a);
|
||||
T1g = T16 + T1f;
|
||||
T3C = T16 - T1f;
|
||||
T2H = FNMS(T1L, T1I, T1H * T1M);
|
||||
T2I = FNMS(T1Q, T1P, T1O * T1R);
|
||||
T2J = T2H + T2I;
|
||||
T3u = T2H - T2I;
|
||||
}
|
||||
{
|
||||
E T1N, T1S, T2m, T2n;
|
||||
T1N = FMA(T1H, T1I, T1L * T1M);
|
||||
T1S = FMA(T1O, T1P, T1Q * T1R);
|
||||
T1T = T1N + T1S;
|
||||
T3w = T1S - T1N;
|
||||
T2m = FMA(TY, T15, T12 * T11);
|
||||
T2n = FMA(T17, T1e, T1b * T1a);
|
||||
T2o = T2m + T2n;
|
||||
T3j = T2m - T2n;
|
||||
}
|
||||
}
|
||||
{
|
||||
E Ta, T1W, Tg, T1V, Tp, T25, Tv, T21;
|
||||
{
|
||||
E T8, T9, Te, Tf;
|
||||
/* Inputs at offsets 1 and 5. */
T8 = Ip[WS(rs, 1)];
|
||||
T9 = Im[WS(rs, 1)];
|
||||
Ta = T8 - T9;
|
||||
T1W = T8 + T9;
|
||||
Te = Rp[WS(rs, 1)];
|
||||
Tf = Rm[WS(rs, 1)];
|
||||
Tg = Te + Tf;
|
||||
T1V = Te - Tf;
|
||||
}
|
||||
{
|
||||
E Tn, To, Tt, Tu;
|
||||
Tn = Ip[WS(rs, 5)];
|
||||
To = Im[WS(rs, 5)];
|
||||
Tp = Tn - To;
|
||||
T25 = Tn + To;
|
||||
Tt = Rp[WS(rs, 5)];
|
||||
Tu = Rm[WS(rs, 5)];
|
||||
Tv = Tt + Tu;
|
||||
T21 = Tt - Tu;
|
||||
}
|
||||
{
|
||||
E Th, Tw, T2A, T2B;
|
||||
Th = FNMS(Td, Tg, T7 * Ta);
|
||||
Tw = FNMS(Ts, Tv, Tm * Tp);
|
||||
Tx = Th + Tw;
|
||||
T3b = Th - Tw;
|
||||
T2A = FNMS(T5, T1V, T2 * T1W);
|
||||
T2B = FNMS(T24, T21, T20 * T25);
|
||||
T2C = T2A + T2B;
|
||||
T3q = T2A - T2B;
|
||||
}
|
||||
{
|
||||
E T1X, T26, T2q, T2r;
|
||||
T1X = FMA(T2, T1V, T5 * T1W);
|
||||
T26 = FMA(T20, T21, T24 * T25);
|
||||
T27 = T1X + T26;
|
||||
T3m = T26 - T1X;
|
||||
T2q = FMA(T7, Tg, Td * Ta);
|
||||
T2r = FMA(Tm, Tv, Ts * Tp);
|
||||
T2s = T2q + T2r;
|
||||
T3c = T2q - T2r;
|
||||
}
|
||||
}
|
||||
{
|
||||
E TD, T29, TH, T28, TO, T2e, TU, T2c;
|
||||
{
|
||||
E TB, TC, TF, TG;
|
||||
/* Inputs at offsets 7 and 3. */
TB = Ip[WS(rs, 7)];
|
||||
TC = Im[WS(rs, 7)];
|
||||
TD = TB - TC;
|
||||
T29 = TB + TC;
|
||||
TF = Rp[WS(rs, 7)];
|
||||
TG = Rm[WS(rs, 7)];
|
||||
TH = TF + TG;
|
||||
T28 = TF - TG;
|
||||
}
|
||||
{
|
||||
E TM, TN, TS, TT;
|
||||
TM = Ip[WS(rs, 3)];
|
||||
TN = Im[WS(rs, 3)];
|
||||
TO = TM - TN;
|
||||
T2e = TM + TN;
|
||||
TS = Rp[WS(rs, 3)];
|
||||
TT = Rm[WS(rs, 3)];
|
||||
TU = TS + TT;
|
||||
T2c = TS - TT;
|
||||
}
|
||||
{
|
||||
E TI, TV, T2D, T2E;
|
||||
TI = FNMS(TE, TH, TA * TD);
|
||||
TV = FNMS(TR, TU, TL * TO);
|
||||
TW = TI + TV;
|
||||
T3f = TI - TV;
|
||||
T2D = FNMS(Tz, T28, Ty * T29);
|
||||
T2E = FNMS(T2d, T2c, T2b * T2e);
|
||||
T2F = T2D + T2E;
|
||||
T3n = T2D - T2E;
|
||||
}
|
||||
{
|
||||
E T2a, T2f, T2t, T2u;
|
||||
T2a = FMA(Ty, T28, Tz * T29);
|
||||
T2f = FMA(T2b, T2c, T2d * T2e);
|
||||
T2g = T2a + T2f;
|
||||
T3p = T2f - T2a;
|
||||
T2t = FMA(TA, TH, TE * TD);
|
||||
T2u = FMA(TL, TU, TR * TO);
|
||||
T2v = T2t + T2u;
|
||||
T3e = T2t - T2u;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1v, T2z, T2O, T2Q, T2i, T2y, T2x, T2P;
|
||||
{
|
||||
E TX, T1u, T2G, T2N;
|
||||
TX = Tx + TW;
|
||||
T1u = T1g + T1t;
|
||||
T1v = TX + T1u;
|
||||
T2z = T1u - TX;
|
||||
T2G = T2C + T2F;
|
||||
T2N = T2J + T2M;
|
||||
T2O = T2G - T2N;
|
||||
T2Q = T2G + T2N;
|
||||
}
|
||||
{
|
||||
E T1U, T2h, T2p, T2w;
|
||||
T1U = T1E - T1T;
|
||||
T2h = T27 + T2g;
|
||||
T2i = T1U - T2h;
|
||||
T2y = T2h + T1U;
|
||||
T2p = T2l + T2o;
|
||||
T2w = T2s + T2v;
|
||||
T2x = T2p - T2w;
|
||||
T2P = T2p + T2w;
|
||||
}
|
||||
/* First output group: offsets 0, 7, 3 and 4, each scaled by KP500000000. */
Ip[0] = KP500000000 * (T1v + T2i);
|
||||
Rp[0] = KP500000000 * (T2P + T2Q);
|
||||
Im[WS(rs, 7)] = KP500000000 * (T2i - T1v);
|
||||
Rm[WS(rs, 7)] = KP500000000 * (T2P - T2Q);
|
||||
Rm[WS(rs, 3)] = KP500000000 * (T2x - T2y);
|
||||
Im[WS(rs, 3)] = KP500000000 * (T2O - T2z);
|
||||
Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
|
||||
Ip[WS(rs, 4)] = KP500000000 * (T2z + T2O);
|
||||
}
|
||||
{
|
||||
E T2T, T35, T33, T39, T2W, T36, T2Z, T37;
|
||||
{
|
||||
E T2R, T2S, T31, T32;
|
||||
T2R = T2v - T2s;
|
||||
T2S = T1t - T1g;
|
||||
T2T = KP500000000 * (T2R + T2S);
|
||||
T35 = KP500000000 * (T2S - T2R);
|
||||
T31 = T2l - T2o;
|
||||
T32 = Tx - TW;
|
||||
T33 = KP500000000 * (T31 - T32);
|
||||
T39 = KP500000000 * (T31 + T32);
|
||||
}
|
||||
{
|
||||
E T2U, T2V, T2X, T2Y;
|
||||
T2U = T2F - T2C;
|
||||
T2V = T27 - T2g;
|
||||
T2W = T2U + T2V;
|
||||
T36 = T2U - T2V;
|
||||
T2X = T1T + T1E;
|
||||
T2Y = T2M - T2J;
|
||||
T2Z = T2X - T2Y;
|
||||
T37 = T2X + T2Y;
|
||||
}
|
||||
{
|
||||
E T30, T3a, T34, T38;
|
||||
/* Second output group: offsets 2, 5, 1 and 6. */
T30 = KP353553390 * (T2W + T2Z);
|
||||
Ip[WS(rs, 2)] = T2T + T30;
|
||||
Im[WS(rs, 5)] = T30 - T2T;
|
||||
T3a = KP353553390 * (T36 + T37);
|
||||
Rm[WS(rs, 5)] = T39 - T3a;
|
||||
Rp[WS(rs, 2)] = T39 + T3a;
|
||||
T34 = KP353553390 * (T2Z - T2W);
|
||||
Rm[WS(rs, 1)] = T33 - T34;
|
||||
Rp[WS(rs, 6)] = T33 + T34;
|
||||
T38 = KP353553390 * (T36 - T37);
|
||||
Ip[WS(rs, 6)] = T35 + T38;
|
||||
Im[WS(rs, 1)] = T38 - T35;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T3k, T3Q, T3Z, T3D, T3h, T40, T3X, T45, T3G, T3P, T3s, T3K, T3U, T44, T3z;
|
||||
E T3L;
|
||||
{
|
||||
E T3d, T3g, T3o, T3r;
|
||||
T3k = KP500000000 * (T3i - T3j);
|
||||
T3Q = KP500000000 * (T3j + T3i);
|
||||
T3Z = KP500000000 * (T3B - T3C);
|
||||
T3D = KP500000000 * (T3B + T3C);
|
||||
T3d = T3b - T3c;
|
||||
T3g = T3e + T3f;
|
||||
T3h = KP353553390 * (T3d + T3g);
|
||||
T40 = KP353553390 * (T3d - T3g);
|
||||
{
|
||||
E T3V, T3W, T3E, T3F;
|
||||
T3V = T3u + T3t;
|
||||
T3W = T3x - T3w;
|
||||
T3X = FNMS(KP461939766, T3W, KP191341716 * T3V);
|
||||
T45 = FMA(KP461939766, T3V, KP191341716 * T3W);
|
||||
T3E = T3c + T3b;
|
||||
T3F = T3e - T3f;
|
||||
T3G = KP353553390 * (T3E + T3F);
|
||||
T3P = KP353553390 * (T3F - T3E);
|
||||
}
|
||||
T3o = T3m + T3n;
|
||||
T3r = T3p - T3q;
|
||||
T3s = FMA(KP191341716, T3o, KP461939766 * T3r);
|
||||
T3K = FNMS(KP191341716, T3r, KP461939766 * T3o);
|
||||
{
|
||||
E T3S, T3T, T3v, T3y;
|
||||
T3S = T3n - T3m;
|
||||
T3T = T3q + T3p;
|
||||
T3U = FMA(KP461939766, T3S, KP191341716 * T3T);
|
||||
T44 = FNMS(KP461939766, T3T, KP191341716 * T3S);
|
||||
T3v = T3t - T3u;
|
||||
T3y = T3w + T3x;
|
||||
T3z = FNMS(KP191341716, T3y, KP461939766 * T3v);
|
||||
T3L = FMA(KP191341716, T3v, KP461939766 * T3y);
|
||||
}
|
||||
}
|
||||
/* Remaining outputs, written in four groups of four stores each. */
{
|
||||
E T3l, T3A, T3N, T3O;
|
||||
T3l = T3h + T3k;
|
||||
T3A = T3s + T3z;
|
||||
Ip[WS(rs, 1)] = T3l + T3A;
|
||||
Im[WS(rs, 6)] = T3A - T3l;
|
||||
T3N = T3D + T3G;
|
||||
T3O = T3K + T3L;
|
||||
Rm[WS(rs, 6)] = T3N - T3O;
|
||||
Rp[WS(rs, 1)] = T3N + T3O;
|
||||
}
|
||||
{
|
||||
E T3H, T3I, T3J, T3M;
|
||||
T3H = T3D - T3G;
|
||||
T3I = T3z - T3s;
|
||||
Rm[WS(rs, 2)] = T3H - T3I;
|
||||
Rp[WS(rs, 5)] = T3H + T3I;
|
||||
T3J = T3k - T3h;
|
||||
T3M = T3K - T3L;
|
||||
Ip[WS(rs, 5)] = T3J + T3M;
|
||||
Im[WS(rs, 2)] = T3M - T3J;
|
||||
}
|
||||
{
|
||||
E T3R, T3Y, T47, T48;
|
||||
T3R = T3P + T3Q;
|
||||
T3Y = T3U + T3X;
|
||||
Ip[WS(rs, 3)] = T3R + T3Y;
|
||||
Im[WS(rs, 4)] = T3Y - T3R;
|
||||
T47 = T3Z + T40;
|
||||
T48 = T44 + T45;
|
||||
Rm[WS(rs, 4)] = T47 - T48;
|
||||
Rp[WS(rs, 3)] = T47 + T48;
|
||||
}
|
||||
{
|
||||
E T41, T42, T43, T46;
|
||||
T41 = T3Z - T40;
|
||||
T42 = T3X - T3U;
|
||||
Rm[0] = T41 - T42;
|
||||
Rp[WS(rs, 7)] = T41 + T42;
|
||||
T43 = T3Q - T3P;
|
||||
T46 = T44 - T45;
|
||||
Ip[WS(rs, 7)] = T43 + T46;
|
||||
Im[0] = T46 - T43;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by the descriptor that follows:
 * four TW_CEXP entries (last field 1, 3, 9, 15) and a TW_NEXT terminator.
 * NOTE(review): presumably each TW_CEXP entry supplies two values of W,
 * which would match the kernel reading W[0..7] and stepping W by 8 per
 * iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for this kernel variant: leading field 16 (the generator was
 * run with -n 16), the name string, the twiddle table, the GENUS object
 * and a four-number tuple.
 * NOTE(review): 188 / 84 / 40 equal the addition / multiplication /
 * fused multiply-add counts quoted in this variant's header comment; the
 * meaning of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 16, "hc2cfdft2_16", twinstr, &GENUS, { 188, 84, 40, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the kernel function and its descriptor
 * to the planner `p` through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft2_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft2_16, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
1203
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_20.c
Normal file
1203
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_20.c
Normal file
File diff suppressed because it is too large
Load Diff
2057
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_32.c
Normal file
2057
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_32.c
Normal file
File diff suppressed because it is too large
Load Diff
221
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_4.c
Normal file
221
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_4.c
Normal file
@@ -0,0 +1,221 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:38 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cfdft2_4 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 32 FP additions, 24 FP multiplications,
|
||||
* (or, 24 additions, 16 multiplications, 8 fused multiply/add),
|
||||
* 37 stack variables, 1 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft2_4 -- generated kernel body, variant compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) it loads Rp/Ip/Rm/Im at offsets 0 and WS(rs, 1),
 * combines them with coefficients built from W[0..3], and writes the
 * results back into the same four arrays (in place).
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp and Ip advance by
 *                 +ms per iteration, Rm and Im by -ms.
 * W:              coefficient table; first offset by (mb - 1) * 4, then
 *                 advanced by 4 per iteration.
 * rs:             stride argument passed to WS() to form element offsets.
 * mb, me:         half-open iteration range.
 * ms:             per-iteration pointer step (see above).
 *
 * NOTE(review): by name this is presumably a half-complex-to-complex
 * forward DFT step; the transform definition is not visible in this file.
 */
static void hc2cfdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all pointer stepping happens in the for-header. */
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T1, T5, T2, T4, T6, Tc, T3, Tb;
|
||||
/* Read W[0..3]; T6 and Tc are derived from products of those four values. */
T1 = W[0];
|
||||
T5 = W[3];
|
||||
T2 = W[2];
|
||||
T3 = T1 * T2;
|
||||
Tb = T1 * T5;
|
||||
T4 = W[1];
|
||||
T6 = FMA(T4, T5, T3);
|
||||
Tc = FNMS(T4, T2, Tb);
|
||||
{
|
||||
E Tj, Tp, To, TE, Tw, T9, Tt, Ta, TC, Tf, Tr, Ts, Tx;
|
||||
{
|
||||
E Th, Ti, Tl, Tm, Tn;
|
||||
/* Inputs at offset 0. */
Th = Ip[0];
|
||||
Ti = Im[0];
|
||||
Tj = Th - Ti;
|
||||
Tp = Th + Ti;
|
||||
Tl = Rm[0];
|
||||
Tm = Rp[0];
|
||||
Tn = Tl - Tm;
|
||||
To = T1 * Tn;
|
||||
TE = T4 * Tn;
|
||||
Tw = Tm + Tl;
|
||||
}
|
||||
{
|
||||
E T7, T8, Td, Te;
|
||||
/* Inputs at offset 1. */
T7 = Ip[WS(rs, 1)];
|
||||
T8 = Im[WS(rs, 1)];
|
||||
T9 = T7 - T8;
|
||||
Tt = T7 + T8;
|
||||
Ta = T6 * T9;
|
||||
TC = T2 * Tt;
|
||||
Td = Rp[WS(rs, 1)];
|
||||
Te = Rm[WS(rs, 1)];
|
||||
Tf = Td + Te;
|
||||
Tr = Td - Te;
|
||||
Ts = T2 * Tr;
|
||||
Tx = T6 * Tf;
|
||||
}
|
||||
{
|
||||
E Tk, TB, Tz, TH, Tv, TA, TG, TI, Tg, Ty;
|
||||
Tg = FNMS(Tc, Tf, Ta);
|
||||
Tk = Tg + Tj;
|
||||
TB = Tj - Tg;
|
||||
Ty = FMA(Tc, T9, Tx);
|
||||
Tz = Tw - Ty;
|
||||
TH = Tw + Ty;
|
||||
{
|
||||
E Tq, Tu, TD, TF;
|
||||
Tq = FNMS(T4, Tp, To);
|
||||
Tu = FMA(T5, Tt, Ts);
|
||||
Tv = Tq - Tu;
|
||||
TA = Tu + Tq;
|
||||
TD = FNMS(T5, Tr, TC);
|
||||
TF = FMA(T1, Tp, TE);
|
||||
TG = TD - TF;
|
||||
TI = TD + TF;
|
||||
}
|
||||
/* Store all eight outputs, each scaled by KP500000000. */
Ip[0] = KP500000000 * (Tk + Tv);
|
||||
Rp[0] = KP500000000 * (TH + TI);
|
||||
Im[WS(rs, 1)] = KP500000000 * (Tv - Tk);
|
||||
Rm[WS(rs, 1)] = KP500000000 * (TH - TI);
|
||||
Rm[0] = KP500000000 * (Tz - TA);
|
||||
Im[0] = KP500000000 * (TG - TB);
|
||||
Rp[WS(rs, 1)] = KP500000000 * (Tz + TA);
|
||||
Ip[WS(rs, 1)] = KP500000000 * (TB + TG);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by the descriptor that follows:
 * two TW_CEXP entries (last field 1 and 3) and a TW_NEXT terminator.
 * NOTE(review): presumably each TW_CEXP entry supplies two values of W,
 * which would match the kernel reading W[0..3] and stepping W by 4 per
 * iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for this kernel variant: leading field 4 (the generator was
 * run with -n 4), the name string, the twiddle table, the GENUS object
 * and a four-number tuple.
 * NOTE(review): 24 / 16 / 8 equal the addition / multiplication /
 * fused multiply-add counts quoted in this variant's header comment; the
 * meaning of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 4, "hc2cfdft2_4", twinstr, &GENUS, { 24, 16, 8, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the kernel function and its descriptor
 * to the planner `p` through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft2_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft2_4, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hc2cfdft2_4 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 32 FP additions, 24 FP multiplications,
|
||||
* (or, 24 additions, 16 multiplications, 8 fused multiply/add),
|
||||
* 24 stack variables, 1 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft2_4 -- generated kernel body, variant in the #else branch of the
 * ARCH_PREFERS_FMA / ISA_EXTENSION_PREFERS_FMA test (generated without
 * the -fma flag).
 *
 * For each m in [mb, me) it loads Rp/Ip/Rm/Im at offsets 0 and WS(rs, 1),
 * combines them with coefficients built from W[0..3], and writes the
 * results back into the same four arrays (in place).
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp and Ip advance by
 *                 +ms per iteration, Rm and Im by -ms.
 * W:              coefficient table; first offset by (mb - 1) * 4, then
 *                 advanced by 4 per iteration.
 * rs:             stride argument passed to WS() to form element offsets.
 * mb, me:         half-open iteration range.
 * ms:             per-iteration pointer step (see above).
 *
 * NOTE(review): by name this is presumably a half-complex-to-complex
 * forward DFT step; the transform definition is not visible in this file.
 */
static void hc2cfdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all pointer stepping happens in the for-header. */
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T1, T3, T2, T4, T5, T9;
|
||||
/* Read W[0..3]; T5 and T9 are derived from those four values. */
T1 = W[0];
|
||||
T3 = W[1];
|
||||
T2 = W[2];
|
||||
T4 = W[3];
|
||||
T5 = FMA(T1, T2, T3 * T4);
|
||||
T9 = FNMS(T3, T2, T1 * T4);
|
||||
{
|
||||
E Tg, Tr, Tm, Tx, Td, Tw, Tp, Ts;
|
||||
{
|
||||
E Te, Tf, Tl, Ti, Tj, Tk;
|
||||
/* Inputs at offset 0. */
Te = Ip[0];
|
||||
Tf = Im[0];
|
||||
Tl = Te + Tf;
|
||||
Ti = Rm[0];
|
||||
Tj = Rp[0];
|
||||
Tk = Ti - Tj;
|
||||
Tg = Te - Tf;
|
||||
Tr = Tj + Ti;
|
||||
Tm = FNMS(T3, Tl, T1 * Tk);
|
||||
Tx = FMA(T3, Tk, T1 * Tl);
|
||||
}
|
||||
{
|
||||
E T8, To, Tc, Tn;
|
||||
{
|
||||
E T6, T7, Ta, Tb;
|
||||
/* Inputs at offset 1. */
T6 = Ip[WS(rs, 1)];
|
||||
T7 = Im[WS(rs, 1)];
|
||||
T8 = T6 - T7;
|
||||
To = T6 + T7;
|
||||
Ta = Rp[WS(rs, 1)];
|
||||
Tb = Rm[WS(rs, 1)];
|
||||
Tc = Ta + Tb;
|
||||
Tn = Ta - Tb;
|
||||
}
|
||||
Td = FNMS(T9, Tc, T5 * T8);
|
||||
Tw = FNMS(T4, Tn, T2 * To);
|
||||
Tp = FMA(T2, Tn, T4 * To);
|
||||
Ts = FMA(T5, Tc, T9 * T8);
|
||||
}
|
||||
/* Store all eight outputs, each scaled by KP500000000. */
{
|
||||
E Th, Tq, Tz, TA;
|
||||
Th = Td + Tg;
|
||||
Tq = Tm - Tp;
|
||||
Ip[0] = KP500000000 * (Th + Tq);
|
||||
Im[WS(rs, 1)] = KP500000000 * (Tq - Th);
|
||||
Tz = Tr + Ts;
|
||||
TA = Tw + Tx;
|
||||
Rm[WS(rs, 1)] = KP500000000 * (Tz - TA);
|
||||
Rp[0] = KP500000000 * (Tz + TA);
|
||||
}
|
||||
{
|
||||
E Tt, Tu, Tv, Ty;
|
||||
Tt = Tr - Ts;
|
||||
Tu = Tp + Tm;
|
||||
Rm[0] = KP500000000 * (Tt - Tu);
|
||||
Rp[WS(rs, 1)] = KP500000000 * (Tt + Tu);
|
||||
Tv = Tg - Td;
|
||||
Ty = Tw - Tx;
|
||||
Ip[WS(rs, 1)] = KP500000000 * (Tv + Ty);
|
||||
Im[0] = KP500000000 * (Ty - Tv);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by the descriptor that follows:
 * two TW_CEXP entries (last field 1 and 3) and a TW_NEXT terminator.
 * NOTE(review): presumably each TW_CEXP entry supplies two values of W,
 * which would match the kernel reading W[0..3] and stepping W by 4 per
 * iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for this kernel variant: leading field 4 (the generator was
 * run with -n 4), the name string, the twiddle table, the GENUS object
 * and a four-number tuple.
 * NOTE(review): 24 / 16 / 8 equal the addition / multiplication /
 * fused multiply-add counts quoted in this variant's header comment; the
 * meaning of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 4, "hc2cfdft2_4", twinstr, &GENUS, { 24, 16, 8, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the kernel function and its descriptor
 * to the planner `p` through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft2_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft2_4, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
442
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_8.c
Normal file
442
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft2_8.c
Normal file
@@ -0,0 +1,442 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:38 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cfdft2_8 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 90 FP additions, 66 FP multiplications,
|
||||
* (or, 60 additions, 36 multiplications, 30 fused multiply/add),
|
||||
* 45 stack variables, 2 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft2_8 -- generated kernel body, variant compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For each m in [mb, me) it loads Rp/Ip/Rm/Im at offsets WS(rs, 0..3),
 * combines them with coefficients built from W[0..5], and writes the
 * results back into the same four arrays (in place).
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp and Ip advance by
 *                 +ms per iteration, Rm and Im by -ms.
 * W:              coefficient table; first offset by (mb - 1) * 6, then
 *                 advanced by 6 per iteration.
 * rs:             stride argument passed to WS() to form element offsets.
 * mb, me:         half-open iteration range.
 * ms:             per-iteration pointer step (see above).
 *
 * NOTE(review): by name this is presumably a half-complex-to-complex
 * forward DFT step; the transform definition is not visible in this file.
 */
static void hc2cfdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all pointer stepping happens in the for-header. */
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T1, T2, Th, Tj, T4, T5, T6, Tk, TB, Tq, Tw, Tc, TM, TQ;
|
||||
{
|
||||
E T3, Ti, Tp, Tb, TL, TP;
|
||||
/* Read W[0..5] and derive further coefficients from products of those values. */
T1 = W[0];
|
||||
T2 = W[2];
|
||||
T3 = T1 * T2;
|
||||
Th = W[4];
|
||||
Ti = T1 * Th;
|
||||
Tj = W[5];
|
||||
Tp = T1 * Tj;
|
||||
T4 = W[1];
|
||||
T5 = W[3];
|
||||
Tb = T1 * T5;
|
||||
T6 = FMA(T4, T5, T3);
|
||||
Tk = FMA(T4, Tj, Ti);
|
||||
TB = FMA(T4, T2, Tb);
|
||||
Tq = FNMS(T4, Th, Tp);
|
||||
Tw = FNMS(T4, T5, T3);
|
||||
TL = T6 * Th;
|
||||
TP = T6 * Tj;
|
||||
Tc = FNMS(T4, T2, Tb);
|
||||
TM = FMA(Tc, Tj, TL);
|
||||
TQ = FNMS(Tc, Th, TP);
|
||||
}
|
||||
{
|
||||
E TI, T1a, TY, T1u, TF, T1s, TS, T1c, Tg, T1n, T13, T1f, Tu, T1p, T17;
|
||||
E T1h;
|
||||
{
|
||||
E TG, TH, TX, TT, TU, TV, TW, T1t;
|
||||
/* Inputs at offset 0. */
TG = Ip[0];
|
||||
TH = Im[0];
|
||||
TX = TG + TH;
|
||||
TT = Rm[0];
|
||||
TU = Rp[0];
|
||||
TV = TT - TU;
|
||||
TI = TG - TH;
|
||||
T1a = TU + TT;
|
||||
TW = T1 * TV;
|
||||
TY = FNMS(T4, TX, TW);
|
||||
T1t = T4 * TV;
|
||||
T1u = FMA(T1, TX, T1t);
|
||||
}
|
||||
{
|
||||
E Tz, TR, TE, TN;
|
||||
{
|
||||
E Tx, Ty, TC, TD;
|
||||
/* Inputs at offset 2. */
Tx = Ip[WS(rs, 2)];
|
||||
Ty = Im[WS(rs, 2)];
|
||||
Tz = Tx - Ty;
|
||||
TR = Tx + Ty;
|
||||
TC = Rp[WS(rs, 2)];
|
||||
TD = Rm[WS(rs, 2)];
|
||||
TE = TC + TD;
|
||||
TN = TD - TC;
|
||||
}
|
||||
{
|
||||
E TA, T1r, TO, T1b;
|
||||
TA = Tw * Tz;
|
||||
TF = FNMS(TB, TE, TA);
|
||||
T1r = TQ * TN;
|
||||
T1s = FMA(TM, TR, T1r);
|
||||
TO = TM * TN;
|
||||
TS = FNMS(TQ, TR, TO);
|
||||
T1b = Tw * TE;
|
||||
T1c = FMA(TB, Tz, T1b);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T9, T12, Tf, T10;
|
||||
{
|
||||
E T7, T8, Td, Te;
|
||||
/* Inputs at offset 1. */
T7 = Ip[WS(rs, 1)];
|
||||
T8 = Im[WS(rs, 1)];
|
||||
T9 = T7 - T8;
|
||||
T12 = T7 + T8;
|
||||
Td = Rp[WS(rs, 1)];
|
||||
Te = Rm[WS(rs, 1)];
|
||||
Tf = Td + Te;
|
||||
T10 = Td - Te;
|
||||
}
|
||||
{
|
||||
E Ta, T1m, T11, T1e;
|
||||
Ta = T6 * T9;
|
||||
Tg = FNMS(Tc, Tf, Ta);
|
||||
T1m = T2 * T12;
|
||||
T1n = FNMS(T5, T10, T1m);
|
||||
T11 = T2 * T10;
|
||||
T13 = FMA(T5, T12, T11);
|
||||
T1e = T6 * Tf;
|
||||
T1f = FMA(Tc, T9, T1e);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tn, T16, Tt, T14;
|
||||
{
|
||||
E Tl, Tm, Tr, Ts;
|
||||
/* Inputs at offset 3. */
Tl = Ip[WS(rs, 3)];
|
||||
Tm = Im[WS(rs, 3)];
|
||||
Tn = Tl - Tm;
|
||||
T16 = Tl + Tm;
|
||||
Tr = Rp[WS(rs, 3)];
|
||||
Ts = Rm[WS(rs, 3)];
|
||||
Tt = Tr + Ts;
|
||||
T14 = Tr - Ts;
|
||||
}
|
||||
{
|
||||
E To, T1o, T15, T1g;
|
||||
To = Tk * Tn;
|
||||
Tu = FNMS(Tq, Tt, To);
|
||||
T1o = Th * T16;
|
||||
T1p = FNMS(Tj, T14, T1o);
|
||||
T15 = Th * T14;
|
||||
T17 = FMA(Tj, T16, T15);
|
||||
T1g = Tk * Tt;
|
||||
T1h = FMA(Tq, Tn, T1g);
|
||||
}
|
||||
}
|
||||
{
|
||||
E TK, T1l, T1w, T1y, T19, T1k, T1j, T1x;
|
||||
{
|
||||
E Tv, TJ, T1q, T1v;
|
||||
Tv = Tg + Tu;
|
||||
TJ = TF + TI;
|
||||
TK = Tv + TJ;
|
||||
T1l = TJ - Tv;
|
||||
T1q = T1n + T1p;
|
||||
T1v = T1s + T1u;
|
||||
T1w = T1q - T1v;
|
||||
T1y = T1q + T1v;
|
||||
}
|
||||
{
|
||||
E TZ, T18, T1d, T1i;
|
||||
TZ = TS + TY;
|
||||
T18 = T13 + T17;
|
||||
T19 = TZ - T18;
|
||||
T1k = T18 + TZ;
|
||||
T1d = T1a + T1c;
|
||||
T1i = T1f + T1h;
|
||||
T1j = T1d - T1i;
|
||||
T1x = T1d + T1i;
|
||||
}
|
||||
/* First output group: eight stores, each scaled by KP500000000. */
Ip[0] = KP500000000 * (TK + T19);
|
||||
Rp[0] = KP500000000 * (T1x + T1y);
|
||||
Im[WS(rs, 3)] = KP500000000 * (T19 - TK);
|
||||
Rm[WS(rs, 3)] = KP500000000 * (T1x - T1y);
|
||||
Rm[WS(rs, 1)] = KP500000000 * (T1j - T1k);
|
||||
Im[WS(rs, 1)] = KP500000000 * (T1w - T1l);
|
||||
Rp[WS(rs, 2)] = KP500000000 * (T1j + T1k);
|
||||
Ip[WS(rs, 2)] = KP500000000 * (T1l + T1w);
|
||||
}
|
||||
{
|
||||
E T1B, T1N, T1L, T1R, T1E, T1O, T1H, T1P;
|
||||
{
|
||||
E T1z, T1A, T1J, T1K;
|
||||
T1z = TI - TF;
|
||||
T1A = T1f - T1h;
|
||||
T1B = T1z - T1A;
|
||||
T1N = T1A + T1z;
|
||||
T1J = T1a - T1c;
|
||||
T1K = Tg - Tu;
|
||||
T1L = T1J - T1K;
|
||||
T1R = T1J + T1K;
|
||||
}
|
||||
{
|
||||
E T1C, T1D, T1F, T1G;
|
||||
T1C = T1p - T1n;
|
||||
T1D = T13 - T17;
|
||||
T1E = T1C + T1D;
|
||||
T1O = T1C - T1D;
|
||||
T1F = TY - TS;
|
||||
T1G = T1u - T1s;
|
||||
T1H = T1F - T1G;
|
||||
T1P = T1F + T1G;
|
||||
}
|
||||
{
|
||||
E T1I, T1S, T1M, T1Q;
|
||||
/* Second output group: the remaining eight stores, which also use KP707106781. */
T1I = T1E + T1H;
|
||||
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1I, T1B));
|
||||
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP707106781, T1I, T1B)));
|
||||
T1S = T1O + T1P;
|
||||
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP707106781, T1S, T1R));
|
||||
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1S, T1R));
|
||||
T1M = T1H - T1E;
|
||||
Rm[0] = KP500000000 * (FNMS(KP707106781, T1M, T1L));
|
||||
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1M, T1L));
|
||||
T1Q = T1O - T1P;
|
||||
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1Q, T1N));
|
||||
Im[0] = -(KP500000000 * (FNMS(KP707106781, T1Q, T1N)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle instruction table referenced by the descriptor that follows:
 * three TW_CEXP entries (last field 1, 3, 7) and a TW_NEXT terminator.
 * NOTE(review): presumably each TW_CEXP entry supplies two values of W,
 * which would match the kernel reading W[0..5] and stepping W by 6 per
 * iteration -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for this kernel variant: leading field 8 (the generator was
 * run with -n 8), the name string, the twiddle table, the GENUS object
 * and a four-number tuple.
 * NOTE(review): 60 / 36 / 30 equal the addition / multiplication /
 * fused multiply-add counts quoted in this variant's header comment; the
 * meaning of the trailing 0 is not visible here.
 */
static const hc2c_desc desc = { 8, "hc2cfdft2_8", twinstr, &GENUS, { 60, 36, 30, 0 } };
|
||||
|
||||
/*
 * Registration entry point: passes the kernel function and its descriptor
 * to the planner `p` through X(khc2c_register), tagged HC2C_VIA_DFT.
 */
void X(codelet_hc2cfdft2_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft2_8, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hc2cfdft2_8 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 90 FP additions, 56 FP multiplications,
|
||||
* (or, 72 additions, 38 multiplications, 18 fused multiply/add),
|
||||
* 51 stack variables, 2 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft2_8 -- generated kernel body, variant in the #else branch of the
 * ARCH_PREFERS_FMA / ISA_EXTENSION_PREFERS_FMA test (generated without
 * the -fma flag).
 *
 * For each m in [mb, me) it loads Rp/Ip/Rm/Im at offsets WS(rs, 0..3),
 * combines them with coefficients built from W[0..5], and writes the
 * results back into the same four arrays (in place).
 *
 * Rp, Ip, Rm, Im: data arrays, read and overwritten; Rp and Ip advance by
 *                 +ms per iteration, Rm and Im by -ms.
 * W:              coefficient table; first offset by (mb - 1) * 6, then
 *                 advanced by 6 per iteration.
 * rs:             stride argument passed to WS() to form element offsets.
 * mb, me:         half-open iteration range.
 * ms:             per-iteration pointer step (see above).
 *
 * NOTE(review): by name this is presumably a half-complex-to-complex
 * forward DFT step; the transform definition is not visible in this file.
 */
static void hc2cfdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; all pointer stepping happens in the for-header. */
for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T1, T4, T2, T5, Tu, Ty, T7, Td, Ti, Tj, Tk, TP, To, TN;
|
||||
{
|
||||
E T3, Tc, T6, Tb;
|
||||
/* Read W[0..5] and derive further coefficients from products of those values. */
T1 = W[0];
|
||||
T4 = W[1];
|
||||
T2 = W[2];
|
||||
T5 = W[3];
|
||||
T3 = T1 * T2;
|
||||
Tc = T4 * T2;
|
||||
T6 = T4 * T5;
|
||||
Tb = T1 * T5;
|
||||
Tu = T3 - T6;
|
||||
Ty = Tb + Tc;
|
||||
T7 = T3 + T6;
|
||||
Td = Tb - Tc;
|
||||
Ti = W[4];
|
||||
Tj = W[5];
|
||||
Tk = FMA(T1, Ti, T4 * Tj);
|
||||
TP = FNMS(Td, Ti, T7 * Tj);
|
||||
To = FNMS(T4, Ti, T1 * Tj);
|
||||
TN = FMA(T7, Ti, Td * Tj);
|
||||
}
|
||||
{
|
||||
E TF, T11, TC, T12, T1d, T1e, T1q, TM, TR, T1p, Th, Ts, T15, T14, T1a;
|
||||
E T1b, T1m, TV, TY, T1n;
|
||||
{
|
||||
E TD, TE, TL, TI, TJ, TK, Tx, TQ, TB, TO;
|
||||
/* Inputs at offsets 0 and 2. */
TD = Ip[0];
|
||||
TE = Im[0];
|
||||
TL = TD + TE;
|
||||
TI = Rm[0];
|
||||
TJ = Rp[0];
|
||||
TK = TI - TJ;
|
||||
{
|
||||
E Tv, Tw, Tz, TA;
|
||||
Tv = Ip[WS(rs, 2)];
|
||||
Tw = Im[WS(rs, 2)];
|
||||
Tx = Tv - Tw;
|
||||
TQ = Tv + Tw;
|
||||
Tz = Rp[WS(rs, 2)];
|
||||
TA = Rm[WS(rs, 2)];
|
||||
TB = Tz + TA;
|
||||
TO = Tz - TA;
|
||||
}
|
||||
TF = TD - TE;
|
||||
T11 = TJ + TI;
|
||||
TC = FNMS(Ty, TB, Tu * Tx);
|
||||
T12 = FMA(Tu, TB, Ty * Tx);
|
||||
T1d = FNMS(TP, TO, TN * TQ);
|
||||
T1e = FMA(T4, TK, T1 * TL);
|
||||
T1q = T1e - T1d;
|
||||
TM = FNMS(T4, TL, T1 * TK);
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T1p = TR + TM;
|
||||
}
|
||||
{
|
||||
E Ta, TU, Tg, TT, Tn, TX, Tr, TW;
|
||||
{
|
||||
E T8, T9, Te, Tf;
|
||||
/* Inputs at offsets 1 and 3. */
T8 = Ip[WS(rs, 1)];
|
||||
T9 = Im[WS(rs, 1)];
|
||||
Ta = T8 - T9;
|
||||
TU = T8 + T9;
|
||||
Te = Rp[WS(rs, 1)];
|
||||
Tf = Rm[WS(rs, 1)];
|
||||
Tg = Te + Tf;
|
||||
TT = Te - Tf;
|
||||
}
|
||||
{
|
||||
E Tl, Tm, Tp, Tq;
|
||||
Tl = Ip[WS(rs, 3)];
|
||||
Tm = Im[WS(rs, 3)];
|
||||
Tn = Tl - Tm;
|
||||
TX = Tl + Tm;
|
||||
Tp = Rp[WS(rs, 3)];
|
||||
Tq = Rm[WS(rs, 3)];
|
||||
Tr = Tp + Tq;
|
||||
TW = Tp - Tq;
|
||||
}
|
||||
Th = FNMS(Td, Tg, T7 * Ta);
|
||||
Ts = FNMS(To, Tr, Tk * Tn);
|
||||
T15 = FMA(Tk, Tr, To * Tn);
|
||||
T14 = FMA(T7, Tg, Td * Ta);
|
||||
T1a = FNMS(T5, TT, T2 * TU);
|
||||
T1b = FNMS(Tj, TW, Ti * TX);
|
||||
T1m = T1b - T1a;
|
||||
TV = FMA(T2, TT, T5 * TU);
|
||||
TY = FMA(Ti, TW, Tj * TX);
|
||||
T1n = TV - TY;
|
||||
}
|
||||
{
|
||||
E T1l, T1x, T1A, T1C, T1s, T1w, T1v, T1B;
|
||||
{
|
||||
E T1j, T1k, T1y, T1z;
|
||||
T1j = TF - TC;
|
||||
T1k = T14 - T15;
|
||||
T1l = KP500000000 * (T1j - T1k);
|
||||
T1x = KP500000000 * (T1k + T1j);
|
||||
T1y = T1m - T1n;
|
||||
T1z = T1p + T1q;
|
||||
T1A = KP353553390 * (T1y - T1z);
|
||||
T1C = KP353553390 * (T1y + T1z);
|
||||
}
|
||||
{
|
||||
E T1o, T1r, T1t, T1u;
|
||||
T1o = T1m + T1n;
|
||||
T1r = T1p - T1q;
|
||||
T1s = KP353553390 * (T1o + T1r);
|
||||
T1w = KP353553390 * (T1r - T1o);
|
||||
T1t = T11 - T12;
|
||||
T1u = Th - Ts;
|
||||
T1v = KP500000000 * (T1t - T1u);
|
||||
T1B = KP500000000 * (T1t + T1u);
|
||||
}
|
||||
/* First output group: eight stores of already-scaled sums and differences. */
Ip[WS(rs, 1)] = T1l + T1s;
|
||||
Rp[WS(rs, 1)] = T1B + T1C;
|
||||
Im[WS(rs, 2)] = T1s - T1l;
|
||||
Rm[WS(rs, 2)] = T1B - T1C;
|
||||
Rm[0] = T1v - T1w;
|
||||
Im[0] = T1A - T1x;
|
||||
Rp[WS(rs, 3)] = T1v + T1w;
|
||||
Ip[WS(rs, 3)] = T1x + T1A;
|
||||
}
|
||||
{
|
||||
E TH, T19, T1g, T1i, T10, T18, T17, T1h;
|
||||
{
|
||||
E Tt, TG, T1c, T1f;
|
||||
Tt = Th + Ts;
|
||||
TG = TC + TF;
|
||||
TH = Tt + TG;
|
||||
T19 = TG - Tt;
|
||||
T1c = T1a + T1b;
|
||||
T1f = T1d + T1e;
|
||||
T1g = T1c - T1f;
|
||||
T1i = T1c + T1f;
|
||||
}
|
||||
{
|
||||
E TS, TZ, T13, T16;
|
||||
TS = TM - TR;
|
||||
TZ = TV + TY;
|
||||
T10 = TS - TZ;
|
||||
T18 = TZ + TS;
|
||||
T13 = T11 + T12;
|
||||
T16 = T14 + T15;
|
||||
T17 = T13 - T16;
|
||||
T1h = T13 + T16;
|
||||
}
|
||||
/* Second output group: the remaining eight stores, each scaled by KP500000000. */
Ip[0] = KP500000000 * (TH + T10);
|
||||
Rp[0] = KP500000000 * (T1h + T1i);
|
||||
Im[WS(rs, 3)] = KP500000000 * (T10 - TH);
|
||||
Rm[WS(rs, 3)] = KP500000000 * (T1h - T1i);
|
||||
Rm[WS(rs, 1)] = KP500000000 * (T17 - T18);
|
||||
Im[WS(rs, 1)] = KP500000000 * (T1g - T19);
|
||||
Rp[WS(rs, 2)] = KP500000000 * (T17 + T18);
|
||||
Ip[WS(rs, 2)] = KP500000000 * (T19 + T1g);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by desc below: three TW_CEXP
   entries followed by a TW_NEXT entry.  NOTE(review): the meaning of the
   operands is defined with the tw_instr type elsewhere -- confirm there. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor handed to X(khc2c_register): the value 8, the kernel name
   string, the twiddle table above, &GENUS, and four integers
   (presumably operation counts -- TODO confirm against hc2c_desc). */
static const hc2c_desc desc = { 8, "hc2cfdft2_8", twinstr, &GENUS, { 72, 38, 18, 0 } };
|
||||
|
||||
/* Registration hook: passes the hc2cfdft2_8 kernel and its descriptor to
   X(khc2c_register) for the given planner, tagged HC2C_VIA_DFT. */
void X(codelet_hc2cfdft2_8) (planner *plnr)
{
     X(khc2c_register) (plnr, hc2cfdft2_8, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
546
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_10.c
Normal file
546
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_10.c
Normal file
@@ -0,0 +1,546 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cfdft_10 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 122 FP additions, 92 FP multiplications,
|
||||
* (or, 68 additions, 38 multiplications, 54 fused multiply/add),
|
||||
* 81 stack variables, 5 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_10, FMA variant (generated by gen_hc2cdft with -fma -n 10 -dit).
 *
 * For every m in [mb, me) the loop body reads Rp/Ip/Rm/Im at
 * WS(rs, 0..4), combines them with the 18 twiddle values W[0..17], and
 * stores the 20 results back into the same four arrays.
 *
 * Per iteration Rp and Ip advance by +ms, Rm and Im by -ms, and W by 18;
 * W is first offset by (mb - 1) * 18.
 *
 * The statement order comes from the generator's scheduler; do not
 * reorder by hand.
 */
static void hc2cfdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
E T3, T1u, Td, T1w, T1S, T2f, T14, T1p, T1j, T1q, T1N, T2e, TQ, T2i, T1n;
|
||||
E T1H, Tz, T2h, T1m, T1C;
|
||||
/* Stage 1: load all inputs and W[0..17]; form the sums, differences
   and twiddle products consumed by the four output groups below. */
{
|
||||
E T1, T2, T1h, Tc, TW, T1c, T1d, T1b, T1f, T1g, T1Q, T7, TV, T1J, TS;
|
||||
E TU, Ts, Tx, T19, T18, T1O, T15, T17, Tt, T1A, Ti, Tn, TE, TD, T1F;
|
||||
E TA, TC, Tj, T1y, TJ, TO, T12, T11, T1L, TY, T10, TK, T1D;
|
||||
{
|
||||
E Ta, Tb, T1e, T5, T6, TT;
|
||||
T1 = Ip[0];
|
||||
T2 = Im[0];
|
||||
T1h = T1 + T2;
|
||||
Ta = Rp[WS(rs, 2)];
|
||||
Tb = Rm[WS(rs, 2)];
|
||||
Tc = Ta - Tb;
|
||||
TW = Ta + Tb;
|
||||
T1c = Rm[0];
|
||||
T1d = Rp[0];
|
||||
T1e = T1c - T1d;
|
||||
T1b = W[0];
|
||||
T1f = T1b * T1e;
|
||||
T1g = W[1];
|
||||
T1Q = T1g * T1e;
|
||||
T5 = Ip[WS(rs, 2)];
|
||||
T6 = Im[WS(rs, 2)];
|
||||
TT = T5 - T6;
|
||||
T7 = T5 + T6;
|
||||
TV = W[7];
|
||||
T1J = TV * TT;
|
||||
TS = W[6];
|
||||
TU = TS * TT;
|
||||
{
|
||||
E Tq, Tr, T16, Tv, Tw, Tp;
|
||||
Tq = Rm[WS(rs, 3)];
|
||||
Tr = Rp[WS(rs, 3)];
|
||||
Ts = Tq - Tr;
|
||||
Tv = Ip[WS(rs, 3)];
|
||||
Tw = Im[WS(rs, 3)];
|
||||
Tx = Tv + Tw;
|
||||
T16 = Tv - Tw;
|
||||
T19 = Tr + Tq;
|
||||
T18 = W[11];
|
||||
T1O = T18 * T16;
|
||||
T15 = W[10];
|
||||
T17 = T15 * T16;
|
||||
Tp = W[12];
|
||||
Tt = Tp * Ts;
|
||||
T1A = Tp * Tx;
|
||||
}
|
||||
{
|
||||
E Tg, Th, TB, Tl, Tm, Tf;
|
||||
Tg = Ip[WS(rs, 1)];
|
||||
Th = Im[WS(rs, 1)];
|
||||
Ti = Tg - Th;
|
||||
Tl = Rp[WS(rs, 1)];
|
||||
Tm = Rm[WS(rs, 1)];
|
||||
Tn = Tl + Tm;
|
||||
TB = Tm - Tl;
|
||||
TE = Tg + Th;
|
||||
TD = W[5];
|
||||
T1F = TD * TB;
|
||||
TA = W[4];
|
||||
TC = TA * TB;
|
||||
Tf = W[2];
|
||||
Tj = Tf * Ti;
|
||||
T1y = Tf * Tn;
|
||||
}
|
||||
{
|
||||
E TH, TI, TZ, TM, TN, TG;
|
||||
TH = Ip[WS(rs, 4)];
|
||||
TI = Im[WS(rs, 4)];
|
||||
TJ = TH - TI;
|
||||
TM = Rp[WS(rs, 4)];
|
||||
TN = Rm[WS(rs, 4)];
|
||||
TO = TM + TN;
|
||||
TZ = TN - TM;
|
||||
T12 = TH + TI;
|
||||
T11 = W[17];
|
||||
T1L = T11 * TZ;
|
||||
TY = W[16];
|
||||
T10 = TY * TZ;
|
||||
TG = W[14];
|
||||
TK = TG * TJ;
|
||||
T1D = TG * TO;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1P, T1R, T1K, T1M;
|
||||
T3 = T1 - T2;
|
||||
T1u = T1d + T1c;
|
||||
{
|
||||
E T4, T8, T9, T1v;
|
||||
T4 = W[9];
|
||||
T8 = T4 * T7;
|
||||
T9 = W[8];
|
||||
T1v = T9 * T7;
|
||||
Td = FMA(T9, Tc, T8);
|
||||
T1w = FNMS(T4, Tc, T1v);
|
||||
}
|
||||
T1P = FMA(T15, T19, T1O);
|
||||
T1R = FMA(T1b, T1h, T1Q);
|
||||
T1S = T1P - T1R;
|
||||
T2f = T1P + T1R;
|
||||
{
|
||||
E TX, T13, T1a, T1i;
|
||||
TX = FNMS(TV, TW, TU);
|
||||
T13 = FNMS(T11, T12, T10);
|
||||
T14 = TX + T13;
|
||||
T1p = T13 - TX;
|
||||
T1a = FNMS(T18, T19, T17);
|
||||
T1i = FNMS(T1g, T1h, T1f);
|
||||
T1j = T1a + T1i;
|
||||
T1q = T1i - T1a;
|
||||
}
|
||||
T1K = FMA(TS, TW, T1J);
|
||||
T1M = FMA(TY, T12, T1L);
|
||||
T1N = T1K - T1M;
|
||||
T2e = T1K + T1M;
|
||||
{
|
||||
E TF, T1G, TP, T1E, TL;
|
||||
TF = FNMS(TD, TE, TC);
|
||||
T1G = FMA(TA, TE, T1F);
|
||||
TL = W[15];
|
||||
TP = FNMS(TL, TO, TK);
|
||||
T1E = FMA(TL, TJ, T1D);
|
||||
TQ = TF + TP;
|
||||
T2i = T1G + T1E;
|
||||
T1n = TF - TP;
|
||||
T1H = T1E - T1G;
|
||||
}
|
||||
{
|
||||
E To, T1z, Ty, T1B, Tk, Tu;
|
||||
Tk = W[3];
|
||||
To = FNMS(Tk, Tn, Tj);
|
||||
T1z = FMA(Tk, Ti, T1y);
|
||||
Tu = W[13];
|
||||
Ty = FNMS(Tu, Tx, Tt);
|
||||
T1B = FMA(Tu, Ts, T1A);
|
||||
Tz = To + Ty;
|
||||
T2h = T1z + T1B;
|
||||
T1m = Ty - To;
|
||||
T1C = T1z - T1B;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Output group 1: stores Ip[0], Ip[WS(rs, 4)], Im[WS(rs, 3)],
   Ip[WS(rs, 2)] and Im[WS(rs, 1)]. */
{
|
||||
E T2k, T2m, Te, T1l, T2b, T2c, T2l, T2d;
|
||||
{
|
||||
E T2g, T2j, TR, T1k;
|
||||
T2g = T2e - T2f;
|
||||
T2j = T2h - T2i;
|
||||
T2k = FNMS(KP618033988, T2j, T2g);
|
||||
T2m = FMA(KP618033988, T2g, T2j);
|
||||
Te = T3 - Td;
|
||||
TR = Tz + TQ;
|
||||
T1k = T14 + T1j;
|
||||
T1l = TR + T1k;
|
||||
T2b = FNMS(KP250000000, T1l, Te);
|
||||
T2c = TR - T1k;
|
||||
}
|
||||
Ip[0] = KP500000000 * (Te + T1l);
|
||||
T2l = FMA(KP559016994, T2c, T2b);
|
||||
Ip[WS(rs, 4)] = KP500000000 * (FMA(KP951056516, T2m, T2l));
|
||||
Im[WS(rs, 3)] = -(KP500000000 * (FNMS(KP951056516, T2m, T2l)));
|
||||
T2d = FNMS(KP559016994, T2c, T2b);
|
||||
Ip[WS(rs, 2)] = KP500000000 * (FMA(KP951056516, T2k, T2d));
|
||||
Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP951056516, T2k, T2d)));
|
||||
}
|
||||
/* Output group 2: stores Rp[0], Rp[WS(rs, 4)], Rm[WS(rs, 3)],
   Rp[WS(rs, 2)] and Rm[WS(rs, 1)]. */
{
|
||||
E T2w, T2y, T2n, T2q, T2r, T2s, T2x, T2t;
|
||||
{
|
||||
E T2u, T2v, T2o, T2p;
|
||||
T2u = T14 - T1j;
|
||||
T2v = Tz - TQ;
|
||||
T2w = FNMS(KP618033988, T2v, T2u);
|
||||
T2y = FMA(KP618033988, T2u, T2v);
|
||||
T2n = T1u + T1w;
|
||||
T2o = T2h + T2i;
|
||||
T2p = T2e + T2f;
|
||||
T2q = T2o + T2p;
|
||||
T2r = FNMS(KP250000000, T2q, T2n);
|
||||
T2s = T2o - T2p;
|
||||
}
|
||||
Rp[0] = KP500000000 * (T2n + T2q);
|
||||
T2x = FMA(KP559016994, T2s, T2r);
|
||||
Rp[WS(rs, 4)] = KP500000000 * (FNMS(KP951056516, T2y, T2x));
|
||||
Rm[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T2y, T2x));
|
||||
T2t = FNMS(KP559016994, T2s, T2r);
|
||||
Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T2w, T2t));
|
||||
Rm[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T2w, T2t));
|
||||
}
|
||||
/* Output group 3: stores Im[WS(rs, 4)], Ip[WS(rs, 3)], Im[WS(rs, 2)],
   Ip[WS(rs, 1)] and Im[0]. */
{
|
||||
E T28, T2a, T1t, T1s, T23, T24, T29, T25;
|
||||
{
|
||||
E T26, T27, T1o, T1r;
|
||||
T26 = T1H - T1C;
|
||||
T27 = T1S - T1N;
|
||||
T28 = FMA(KP618033988, T27, T26);
|
||||
T2a = FNMS(KP618033988, T26, T27);
|
||||
T1t = Td + T3;
|
||||
T1o = T1m + T1n;
|
||||
T1r = T1p + T1q;
|
||||
T1s = T1o + T1r;
|
||||
T23 = FMA(KP250000000, T1s, T1t);
|
||||
T24 = T1r - T1o;
|
||||
}
|
||||
Im[WS(rs, 4)] = KP500000000 * (T1s - T1t);
|
||||
T29 = FNMS(KP559016994, T24, T23);
|
||||
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T2a, T29));
|
||||
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP951056516, T2a, T29)));
|
||||
T25 = FMA(KP559016994, T24, T23);
|
||||
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T28, T25));
|
||||
Im[0] = -(KP500000000 * (FNMS(KP951056516, T28, T25)));
|
||||
}
|
||||
/* Output group 4: stores Rm[WS(rs, 4)], Rp[WS(rs, 3)], Rm[WS(rs, 2)],
   Rp[WS(rs, 1)] and Rm[0]. */
{
|
||||
E T20, T22, T1x, T1U, T1V, T1W, T21, T1X;
|
||||
{
|
||||
E T1Y, T1Z, T1I, T1T;
|
||||
T1Y = T1n - T1m;
|
||||
T1Z = T1q - T1p;
|
||||
T20 = FMA(KP618033988, T1Z, T1Y);
|
||||
T22 = FNMS(KP618033988, T1Y, T1Z);
|
||||
T1x = T1u - T1w;
|
||||
T1I = T1C + T1H;
|
||||
T1T = T1N + T1S;
|
||||
T1U = T1I + T1T;
|
||||
T1V = FNMS(KP250000000, T1U, T1x);
|
||||
T1W = T1I - T1T;
|
||||
}
|
||||
Rm[WS(rs, 4)] = KP500000000 * (T1x + T1U);
|
||||
T21 = FNMS(KP559016994, T1W, T1V);
|
||||
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP951056516, T22, T21));
|
||||
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP951056516, T22, T21));
|
||||
T1X = FMA(KP559016994, T1W, T1V);
|
||||
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP951056516, T20, T1X));
|
||||
Rm[0] = KP500000000 * (FNMS(KP951056516, T20, T1X));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by desc below: one TW_FULL entry
   with operands (1, 10) followed by a TW_NEXT entry.  NOTE(review): the
   operand meaning is defined with the tw_instr type elsewhere. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of hc2cfdft_10.  The counts 68, 38, 54
   equal the additions, multiplications and fused multiply/adds reported
   in this variant's generator comment. */
static const hc2c_desc desc = { 10, "hc2cfdft_10", twinstr, &GENUS, { 68, 38, 54, 0 } };
|
||||
|
||||
/* Registration hook (FMA build): passes the hc2cfdft_10 kernel and its
   descriptor to X(khc2c_register) for the given planner, tagged
   HC2C_VIA_DFT. */
void X(codelet_hc2cfdft_10) (planner *plnr)
{
     X(khc2c_register) (plnr, hc2cfdft_10, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hc2cfdft_10 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 122 FP additions, 68 FP multiplications,
|
||||
* (or, 92 additions, 38 multiplications, 30 fused multiply/add),
|
||||
* 62 stack variables, 5 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_10, non-FMA variant (generated by gen_hc2cdft with -n 10 -dit;
 * compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA
 * is defined).
 *
 * For every m in [mb, me) the loop body reads Rp/Ip/Rm/Im at
 * WS(rs, 0..4), combines them with the 18 twiddle values W[0..17], and
 * stores the 20 results back into the same four arrays.
 *
 * Per iteration Rp and Ip advance by +ms, Rm and Im by -ms, and W by 18;
 * W is first offset by (mb - 1) * 18.
 *
 * The statement order comes from the generator's scheduler; do not
 * reorder by hand.
 */
static void hc2cfdft_10(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP293892626, +0.293892626146236564584352977319536384298826219);
|
||||
DK(KP475528258, +0.475528258147576786058219666689691071702849317);
|
||||
DK(KP125000000, +0.125000000000000000000000000000000000000000000);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP279508497, +0.279508497187473712051146708591409529430077295);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 18); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 18, MAKE_VOLATILE_STRIDE(40, rs)) {
|
||||
E Tw, TL, TM, T1W, T1X, T27, T1Z, T20, T26, TX, T1a, T1b, T1d, T1e, T1f;
|
||||
E T1q, T1t, T1u, T1x, T1A, T1B, T1g, T1h, T1i, Td, T25, T1k, T1F;
|
||||
/* Stage 1: load all inputs and W[0..17]; form the twiddled terms and
   the sums/differences consumed by the four output groups below. */
{
|
||||
E T3, T1D, T19, T1z, T7, Tb, TR, T1v, Tm, T1o, TK, T1s, Tv, T1p, T12;
|
||||
E T1y, TF, T1r, TW, T1w;
|
||||
{
|
||||
E T1, T2, T18, T14, T15, T16, T13, T17;
|
||||
T1 = Ip[0];
|
||||
T2 = Im[0];
|
||||
T18 = T1 + T2;
|
||||
T14 = Rm[0];
|
||||
T15 = Rp[0];
|
||||
T16 = T14 - T15;
|
||||
T3 = T1 - T2;
|
||||
T1D = T15 + T14;
|
||||
T13 = W[0];
|
||||
T17 = W[1];
|
||||
T19 = FNMS(T17, T18, T13 * T16);
|
||||
T1z = FMA(T17, T16, T13 * T18);
|
||||
}
|
||||
{
|
||||
E T5, T6, TO, T9, Ta, TQ, TN, TP;
|
||||
T5 = Ip[WS(rs, 2)];
|
||||
T6 = Im[WS(rs, 2)];
|
||||
TO = T5 - T6;
|
||||
T9 = Rp[WS(rs, 2)];
|
||||
Ta = Rm[WS(rs, 2)];
|
||||
TQ = T9 + Ta;
|
||||
T7 = T5 + T6;
|
||||
Tb = T9 - Ta;
|
||||
TN = W[6];
|
||||
TP = W[7];
|
||||
TR = FNMS(TP, TQ, TN * TO);
|
||||
T1v = FMA(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E Th, TJ, Tl, TH;
|
||||
{
|
||||
E Tf, Tg, Tj, Tk;
|
||||
Tf = Ip[WS(rs, 1)];
|
||||
Tg = Im[WS(rs, 1)];
|
||||
Th = Tf - Tg;
|
||||
TJ = Tf + Tg;
|
||||
Tj = Rp[WS(rs, 1)];
|
||||
Tk = Rm[WS(rs, 1)];
|
||||
Tl = Tj + Tk;
|
||||
TH = Tj - Tk;
|
||||
}
|
||||
{
|
||||
E Te, Ti, TG, TI;
|
||||
Te = W[2];
|
||||
Ti = W[3];
|
||||
Tm = FNMS(Ti, Tl, Te * Th);
|
||||
T1o = FMA(Te, Tl, Ti * Th);
|
||||
TG = W[4];
|
||||
TI = W[5];
|
||||
TK = FMA(TG, TH, TI * TJ);
|
||||
T1s = FNMS(TI, TH, TG * TJ);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tq, TZ, Tu, T11;
|
||||
{
|
||||
E To, Tp, Ts, Tt;
|
||||
To = Ip[WS(rs, 3)];
|
||||
Tp = Im[WS(rs, 3)];
|
||||
Tq = To + Tp;
|
||||
TZ = To - Tp;
|
||||
Ts = Rp[WS(rs, 3)];
|
||||
Tt = Rm[WS(rs, 3)];
|
||||
Tu = Ts - Tt;
|
||||
T11 = Ts + Tt;
|
||||
}
|
||||
{
|
||||
E Tn, Tr, TY, T10;
|
||||
Tn = W[13];
|
||||
Tr = W[12];
|
||||
Tv = FMA(Tn, Tq, Tr * Tu);
|
||||
T1p = FNMS(Tn, Tu, Tr * Tq);
|
||||
TY = W[10];
|
||||
T10 = W[11];
|
||||
T12 = FNMS(T10, T11, TY * TZ);
|
||||
T1y = FMA(T10, TZ, TY * T11);
|
||||
}
|
||||
}
|
||||
{
|
||||
E TA, TV, TE, TT;
|
||||
{
|
||||
E Ty, Tz, TC, TD;
|
||||
Ty = Ip[WS(rs, 4)];
|
||||
Tz = Im[WS(rs, 4)];
|
||||
TA = Ty - Tz;
|
||||
TV = Ty + Tz;
|
||||
TC = Rp[WS(rs, 4)];
|
||||
TD = Rm[WS(rs, 4)];
|
||||
TE = TC + TD;
|
||||
TT = TC - TD;
|
||||
}
|
||||
{
|
||||
E Tx, TB, TS, TU;
|
||||
Tx = W[14];
|
||||
TB = W[15];
|
||||
TF = FNMS(TB, TE, Tx * TA);
|
||||
T1r = FMA(Tx, TE, TB * TA);
|
||||
TS = W[16];
|
||||
TU = W[17];
|
||||
TW = FMA(TS, TT, TU * TV);
|
||||
T1w = FNMS(TU, TT, TS * TV);
|
||||
}
|
||||
}
|
||||
Tw = Tm - Tv;
|
||||
TL = TF - TK;
|
||||
TM = Tw + TL;
|
||||
T1W = T1v + T1w;
|
||||
T1X = T1y + T1z;
|
||||
T27 = T1W + T1X;
|
||||
T1Z = T1o + T1p;
|
||||
T20 = T1s + T1r;
|
||||
T26 = T1Z + T20;
|
||||
TX = TR - TW;
|
||||
T1a = T12 + T19;
|
||||
T1b = TX + T1a;
|
||||
T1d = T19 - T12;
|
||||
T1e = TR + TW;
|
||||
T1f = T1d - T1e;
|
||||
T1q = T1o - T1p;
|
||||
T1t = T1r - T1s;
|
||||
T1u = T1q + T1t;
|
||||
T1x = T1v - T1w;
|
||||
T1A = T1y - T1z;
|
||||
T1B = T1x + T1A;
|
||||
T1g = Tm + Tv;
|
||||
T1h = TK + TF;
|
||||
T1i = T1g + T1h;
|
||||
{
|
||||
E Tc, T1E, T4, T8;
|
||||
T4 = W[9];
|
||||
T8 = W[8];
|
||||
Tc = FMA(T4, T7, T8 * Tb);
|
||||
T1E = FNMS(T4, Tb, T8 * T7);
|
||||
Td = T3 - Tc;
|
||||
T25 = T1D + T1E;
|
||||
T1k = Tc + T3;
|
||||
T1F = T1D - T1E;
|
||||
}
|
||||
}
|
||||
/* Output group 1: stores Ip[0], Ip[WS(rs, 4)], Im[WS(rs, 3)],
   Ip[WS(rs, 2)] and Im[WS(rs, 1)]. */
{
|
||||
E T1U, T1c, T1T, T22, T24, T1Y, T21, T23, T1V;
|
||||
T1U = KP279508497 * (TM - T1b);
|
||||
T1c = TM + T1b;
|
||||
T1T = FNMS(KP125000000, T1c, KP500000000 * Td);
|
||||
T1Y = T1W - T1X;
|
||||
T21 = T1Z - T20;
|
||||
T22 = FNMS(KP293892626, T21, KP475528258 * T1Y);
|
||||
T24 = FMA(KP475528258, T21, KP293892626 * T1Y);
|
||||
Ip[0] = KP500000000 * (Td + T1c);
|
||||
T23 = T1U + T1T;
|
||||
Ip[WS(rs, 4)] = T23 + T24;
|
||||
Im[WS(rs, 3)] = T24 - T23;
|
||||
T1V = T1T - T1U;
|
||||
Ip[WS(rs, 2)] = T1V + T22;
|
||||
Im[WS(rs, 1)] = T22 - T1V;
|
||||
}
|
||||
/* Output group 2: stores Rp[0], Rp[WS(rs, 4)], Rm[WS(rs, 3)],
   Rp[WS(rs, 2)] and Rm[WS(rs, 1)]. */
{
|
||||
E T2a, T28, T29, T2e, T2g, T2c, T2d, T2f, T2b;
|
||||
T2a = KP279508497 * (T26 - T27);
|
||||
T28 = T26 + T27;
|
||||
T29 = FNMS(KP125000000, T28, KP500000000 * T25);
|
||||
T2c = TX - T1a;
|
||||
T2d = Tw - TL;
|
||||
T2e = FNMS(KP293892626, T2d, KP475528258 * T2c);
|
||||
T2g = FMA(KP475528258, T2d, KP293892626 * T2c);
|
||||
Rp[0] = KP500000000 * (T25 + T28);
|
||||
T2f = T2a + T29;
|
||||
Rp[WS(rs, 4)] = T2f - T2g;
|
||||
Rm[WS(rs, 3)] = T2g + T2f;
|
||||
T2b = T29 - T2a;
|
||||
Rp[WS(rs, 2)] = T2b - T2e;
|
||||
Rm[WS(rs, 1)] = T2e + T2b;
|
||||
}
|
||||
/* Output group 3: stores Im[WS(rs, 4)], Ip[WS(rs, 3)], Im[WS(rs, 2)],
   Ip[WS(rs, 1)] and Im[0]. */
{
|
||||
E T1M, T1j, T1L, T1Q, T1S, T1O, T1P, T1R, T1N;
|
||||
T1M = KP279508497 * (T1i + T1f);
|
||||
T1j = T1f - T1i;
|
||||
T1L = FMA(KP500000000, T1k, KP125000000 * T1j);
|
||||
T1O = T1A - T1x;
|
||||
T1P = T1q - T1t;
|
||||
T1Q = FNMS(KP475528258, T1P, KP293892626 * T1O);
|
||||
T1S = FMA(KP293892626, T1P, KP475528258 * T1O);
|
||||
Im[WS(rs, 4)] = KP500000000 * (T1j - T1k);
|
||||
T1R = T1L - T1M;
|
||||
Ip[WS(rs, 3)] = T1R + T1S;
|
||||
Im[WS(rs, 2)] = T1S - T1R;
|
||||
T1N = T1L + T1M;
|
||||
Ip[WS(rs, 1)] = T1N + T1Q;
|
||||
Im[0] = T1Q - T1N;
|
||||
}
|
||||
/* Output group 4: stores Rm[WS(rs, 4)], Rp[WS(rs, 3)], Rm[WS(rs, 2)],
   Rp[WS(rs, 1)] and Rm[0]. */
{
|
||||
E T1C, T1G, T1H, T1n, T1J, T1l, T1m, T1K, T1I;
|
||||
T1C = KP279508497 * (T1u - T1B);
|
||||
T1G = T1u + T1B;
|
||||
T1H = FNMS(KP125000000, T1G, KP500000000 * T1F);
|
||||
T1l = T1g - T1h;
|
||||
T1m = T1e + T1d;
|
||||
T1n = FMA(KP475528258, T1l, KP293892626 * T1m);
|
||||
T1J = FNMS(KP293892626, T1l, KP475528258 * T1m);
|
||||
Rm[WS(rs, 4)] = KP500000000 * (T1F + T1G);
|
||||
T1K = T1H - T1C;
|
||||
Rp[WS(rs, 3)] = T1J + T1K;
|
||||
Rm[WS(rs, 2)] = T1K - T1J;
|
||||
T1I = T1C + T1H;
|
||||
Rp[WS(rs, 1)] = T1n + T1I;
|
||||
Rm[0] = T1I - T1n;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by desc below: one TW_FULL entry
   with operands (1, 10) followed by a TW_NEXT entry.  NOTE(review): the
   operand meaning is defined with the tw_instr type elsewhere. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA build of hc2cfdft_10.  The counts 92, 38,
   30 equal the additions, multiplications and fused multiply/adds
   reported in this variant's generator comment. */
static const hc2c_desc desc = { 10, "hc2cfdft_10", twinstr, &GENUS, { 92, 38, 30, 0 } };
|
||||
|
||||
/* Registration hook (non-FMA build): passes the hc2cfdft_10 kernel and
   its descriptor to X(khc2c_register) for the given planner, tagged
   HC2C_VIA_DFT. */
void X(codelet_hc2cfdft_10) (planner *plnr)
{
     X(khc2c_register) (plnr, hc2cfdft_10, &desc, HC2C_VIA_DFT);
}
|
||||
#endif
|
||||
646
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_12.c
Normal file
646
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_12.c
Normal file
@@ -0,0 +1,646 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:37 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 142 FP additions, 92 FP multiplications,
|
||||
* (or, 96 additions, 46 multiplications, 46 fused multiply/add),
|
||||
* 65 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_12, FMA variant (generated by gen_hc2cdft with -fma -n 12 -dit).
 *
 * For every m in [mb, me) the loop body reads Rp/Ip/Rm/Im at
 * WS(rs, 0..5), combines them with the 22 twiddle values W[0..21], and
 * stores the 24 results back into the same four arrays.
 *
 * Per iteration Rp and Ip advance by +ms, Rm and Im by -ms, and W by 22;
 * W is first offset by (mb - 1) * 22.
 *
 * The statement order comes from the generator's scheduler; do not
 * reorder by hand.
 */
static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
E To, T1E, T1m, T2H, Ta, T1G, Tk, T1I, Tl, T1J, T1s, T2b, T1A, T2d, T1B;
|
||||
E T2I, T12, T18, T19, T24, T26, T2C, Tz, T1M, T1f, T2B, TJ, T1O, TT, T1Q;
|
||||
E TU, T1R;
|
||||
/* Stage 1a: load the inputs at indices 0, 4 and 2 and apply their
   twiddles (W[0], W[1], W[6..9], W[14..17]). */
{
|
||||
E Tm, Tn, T1u, T1x, T1y, T1z, T1v, T2c, Te, Tj, T1i, T1l, Tf, T1H, T4;
|
||||
E T1o, T9, T1r, T5, T1F, T1p, T2a, T1t, T1, T1n;
|
||||
Tm = Ip[0];
|
||||
Tn = Im[0];
|
||||
T1u = Tm + Tn;
|
||||
T1x = Rp[0];
|
||||
T1y = Rm[0];
|
||||
T1z = T1x - T1y;
|
||||
T1t = W[0];
|
||||
T1v = T1t * T1u;
|
||||
T2c = T1t * T1z;
|
||||
{
|
||||
E Tc, Td, Th, Ti, Tb;
|
||||
Tc = Ip[WS(rs, 4)];
|
||||
Td = Im[WS(rs, 4)];
|
||||
Te = Tc - Td;
|
||||
Th = Rp[WS(rs, 4)];
|
||||
Ti = Rm[WS(rs, 4)];
|
||||
Tj = Th + Ti;
|
||||
T1i = Tc + Td;
|
||||
T1l = Th - Ti;
|
||||
Tb = W[14];
|
||||
Tf = Tb * Te;
|
||||
T1H = Tb * Tj;
|
||||
}
|
||||
{
|
||||
E T2, T3, T7, T8;
|
||||
T2 = Ip[WS(rs, 2)];
|
||||
T3 = Im[WS(rs, 2)];
|
||||
T4 = T2 - T3;
|
||||
T1o = T2 + T3;
|
||||
T7 = Rp[WS(rs, 2)];
|
||||
T8 = Rm[WS(rs, 2)];
|
||||
T9 = T7 + T8;
|
||||
T1r = T7 - T8;
|
||||
}
|
||||
T1 = W[6];
|
||||
T5 = T1 * T4;
|
||||
T1F = T1 * T9;
|
||||
T1n = W[8];
|
||||
T1p = T1n * T1o;
|
||||
T2a = T1n * T1r;
|
||||
To = Tm - Tn;
|
||||
T1E = T1x + T1y;
|
||||
{
|
||||
E T1j, T2G, T1h, T1k;
|
||||
T1h = W[16];
|
||||
T1j = T1h * T1i;
|
||||
T2G = T1h * T1l;
|
||||
T1k = W[17];
|
||||
T1m = FNMS(T1k, T1l, T1j);
|
||||
T2H = FMA(T1k, T1i, T2G);
|
||||
}
|
||||
{
|
||||
E T6, Tg, T1q, T1w;
|
||||
T6 = W[7];
|
||||
Ta = FNMS(T6, T9, T5);
|
||||
T1G = FMA(T6, T4, T1F);
|
||||
Tg = W[15];
|
||||
Tk = FNMS(Tg, Tj, Tf);
|
||||
T1I = FMA(Tg, Te, T1H);
|
||||
Tl = Ta + Tk;
|
||||
T1J = T1G + T1I;
|
||||
T1q = W[9];
|
||||
T1s = FNMS(T1q, T1r, T1p);
|
||||
T2b = FMA(T1q, T1o, T2a);
|
||||
T1w = W[1];
|
||||
T1A = FNMS(T1w, T1z, T1v);
|
||||
T2d = FMA(T1w, T1u, T2c);
|
||||
T1B = T1s + T1A;
|
||||
T2I = T2b + T2d;
|
||||
}
|
||||
}
|
||||
/* Stage 1b: load the inputs at indices 3, 1 and 5 and apply their
   twiddles (W[2..5], W[10..13], W[18..21]). */
{
|
||||
E Tt, T11, Ty, T10, T23, TX, TZ, TN, TS, T1b, T1e, TO, T1P, TD, TI;
|
||||
E T17, T16, T25, T13, T15, TE, T1N, TF, TP;
|
||||
{
|
||||
E Tr, Ts, Tw, Tx, TY;
|
||||
Tr = Ip[WS(rs, 3)];
|
||||
Ts = Im[WS(rs, 3)];
|
||||
Tt = Tr - Ts;
|
||||
T11 = Tr + Ts;
|
||||
Tw = Rp[WS(rs, 3)];
|
||||
Tx = Rm[WS(rs, 3)];
|
||||
TY = Tx - Tw;
|
||||
Ty = Tw + Tx;
|
||||
T10 = W[12];
|
||||
T23 = T10 * TY;
|
||||
TX = W[13];
|
||||
TZ = TX * TY;
|
||||
}
|
||||
{
|
||||
E TL, TM, TQ, TR, TK;
|
||||
TL = Ip[WS(rs, 1)];
|
||||
TM = Im[WS(rs, 1)];
|
||||
TN = TL - TM;
|
||||
TQ = Rp[WS(rs, 1)];
|
||||
TR = Rm[WS(rs, 1)];
|
||||
TS = TQ + TR;
|
||||
T1b = TL + TM;
|
||||
T1e = TQ - TR;
|
||||
TK = W[2];
|
||||
TO = TK * TN;
|
||||
T1P = TK * TS;
|
||||
}
|
||||
{
|
||||
E TB, TC, T14, TG, TH, TA;
|
||||
TB = Ip[WS(rs, 5)];
|
||||
TC = Im[WS(rs, 5)];
|
||||
TD = TB - TC;
|
||||
TG = Rp[WS(rs, 5)];
|
||||
TH = Rm[WS(rs, 5)];
|
||||
TI = TG + TH;
|
||||
T14 = TH - TG;
|
||||
T17 = TB + TC;
|
||||
T16 = W[20];
|
||||
T25 = T16 * T14;
|
||||
T13 = W[21];
|
||||
T15 = T13 * T14;
|
||||
TA = W[18];
|
||||
TE = TA * TD;
|
||||
T1N = TA * TI;
|
||||
}
|
||||
T12 = FMA(T10, T11, TZ);
|
||||
T18 = FMA(T16, T17, T15);
|
||||
T19 = T12 + T18;
|
||||
T24 = FNMS(TX, T11, T23);
|
||||
T26 = FNMS(T13, T17, T25);
|
||||
T2C = T24 + T26;
|
||||
{
|
||||
E Tu, T1L, Tq, Tv;
|
||||
Tq = W[10];
|
||||
Tu = Tq * Tt;
|
||||
T1L = Tq * Ty;
|
||||
Tv = W[11];
|
||||
Tz = FNMS(Tv, Ty, Tu);
|
||||
T1M = FMA(Tv, Tt, T1L);
|
||||
}
|
||||
{
|
||||
E T1c, T2A, T1a, T1d;
|
||||
T1a = W[4];
|
||||
T1c = T1a * T1b;
|
||||
T2A = T1a * T1e;
|
||||
T1d = W[5];
|
||||
T1f = FNMS(T1d, T1e, T1c);
|
||||
T2B = FMA(T1d, T1b, T2A);
|
||||
}
|
||||
TF = W[19];
|
||||
TJ = FNMS(TF, TI, TE);
|
||||
T1O = FMA(TF, TD, T1N);
|
||||
TP = W[3];
|
||||
TT = FNMS(TP, TS, TO);
|
||||
T1Q = FMA(TP, TN, T1P);
|
||||
TU = TJ + TT;
|
||||
T1R = T1O + T1Q;
|
||||
}
|
||||
/* Output group 1: stores Ip/Rp at WS(rs, 3), Im/Rm at WS(rs, 2),
   Rm/Im at WS(rs, 5), and Rp[0], Ip[0]. */
{
|
||||
E TW, T2V, T2Y, T30, T1D, T1U, T1T, T2Z;
|
||||
{
|
||||
E Tp, TV, T2W, T2X;
|
||||
Tp = Tl + To;
|
||||
TV = Tz + TU;
|
||||
TW = Tp - TV;
|
||||
T2V = TV + Tp;
|
||||
T2W = T2C - T2B;
|
||||
T2X = T2H + T2I;
|
||||
T2Y = T2W - T2X;
|
||||
T30 = T2W + T2X;
|
||||
}
|
||||
{
|
||||
E T1g, T1C, T1K, T1S;
|
||||
T1g = T19 + T1f;
|
||||
T1C = T1m + T1B;
|
||||
T1D = T1g - T1C;
|
||||
T1U = T1g + T1C;
|
||||
T1K = T1E + T1J;
|
||||
T1S = T1M + T1R;
|
||||
T1T = T1K + T1S;
|
||||
T2Z = T1K - T1S;
|
||||
}
|
||||
Ip[WS(rs, 3)] = KP500000000 * (TW + T1D);
|
||||
Rp[WS(rs, 3)] = KP500000000 * (T2Z - T30);
|
||||
Im[WS(rs, 2)] = KP500000000 * (T1D - TW);
|
||||
Rm[WS(rs, 2)] = KP500000000 * (T2Z + T30);
|
||||
Rm[WS(rs, 5)] = KP500000000 * (T1T - T1U);
|
||||
Im[WS(rs, 5)] = KP500000000 * (T2Y - T2V);
|
||||
Rp[0] = KP500000000 * (T1T + T1U);
|
||||
Ip[0] = KP500000000 * (T2V + T2Y);
|
||||
}
|
||||
/* Output group 2: stores the remaining sixteen outputs, using the
   KP866025403 / KP500000000 combinations formed first. */
{
|
||||
E T1X, T2v, T2F, T2Q, T2L, T2R, T20, T2w, T28, T2t, T2j, T2p, T2m, T2q, T2f;
|
||||
E T2s;
|
||||
{
|
||||
E T1V, T1W, T2D, T2E;
|
||||
T1V = FNMS(KP500000000, T1J, T1E);
|
||||
T1W = Ta - Tk;
|
||||
T1X = FNMS(KP866025403, T1W, T1V);
|
||||
T2v = FMA(KP866025403, T1W, T1V);
|
||||
T2D = FMA(KP500000000, T2C, T2B);
|
||||
T2E = T18 - T12;
|
||||
T2F = FNMS(KP866025403, T2E, T2D);
|
||||
T2Q = FMA(KP866025403, T2E, T2D);
|
||||
}
|
||||
{
|
||||
E T2J, T2K, T1Y, T1Z;
|
||||
T2J = FNMS(KP500000000, T2I, T2H);
|
||||
T2K = T1s - T1A;
|
||||
T2L = FNMS(KP866025403, T2K, T2J);
|
||||
T2R = FMA(KP866025403, T2K, T2J);
|
||||
T1Y = FNMS(KP500000000, T1R, T1M);
|
||||
T1Z = TJ - TT;
|
||||
T20 = FNMS(KP866025403, T1Z, T1Y);
|
||||
T2w = FMA(KP866025403, T1Z, T1Y);
|
||||
}
|
||||
{
|
||||
E T22, T27, T2h, T2i;
|
||||
T22 = FNMS(KP500000000, T19, T1f);
|
||||
T27 = T24 - T26;
|
||||
T28 = FNMS(KP866025403, T27, T22);
|
||||
T2t = FMA(KP866025403, T27, T22);
|
||||
T2h = FNMS(KP500000000, Tl, To);
|
||||
T2i = T1I - T1G;
|
||||
T2j = FNMS(KP866025403, T2i, T2h);
|
||||
T2p = FMA(KP866025403, T2i, T2h);
|
||||
}
|
||||
{
|
||||
E T2k, T2l, T29, T2e;
|
||||
T2k = FNMS(KP500000000, TU, Tz);
|
||||
T2l = T1Q - T1O;
|
||||
T2m = FNMS(KP866025403, T2l, T2k);
|
||||
T2q = FMA(KP866025403, T2l, T2k);
|
||||
T29 = FNMS(KP500000000, T1B, T1m);
|
||||
T2e = T2b - T2d;
|
||||
T2f = FNMS(KP866025403, T2e, T29);
|
||||
T2s = FMA(KP866025403, T2e, T29);
|
||||
}
|
||||
{
|
||||
E T21, T2g, T2P, T2S;
|
||||
T21 = T1X + T20;
|
||||
T2g = T28 + T2f;
|
||||
Rp[WS(rs, 2)] = KP500000000 * (T21 - T2g);
|
||||
Rm[WS(rs, 3)] = KP500000000 * (T21 + T2g);
|
||||
T2P = T2m + T2j;
|
||||
T2S = T2Q + T2R;
|
||||
Ip[WS(rs, 2)] = KP500000000 * (T2P + T2S);
|
||||
Im[WS(rs, 3)] = KP500000000 * (T2S - T2P);
|
||||
}
|
||||
{
|
||||
E T2n, T2o, T2T, T2U;
|
||||
T2n = T2j - T2m;
|
||||
T2o = T2f - T28;
|
||||
Ip[WS(rs, 5)] = KP500000000 * (T2n + T2o);
|
||||
Im[0] = KP500000000 * (T2o - T2n);
|
||||
T2T = T1X - T20;
|
||||
T2U = T2R - T2Q;
|
||||
Rm[0] = KP500000000 * (T2T - T2U);
|
||||
Rp[WS(rs, 5)] = KP500000000 * (T2T + T2U);
|
||||
}
|
||||
{
|
||||
E T2r, T2u, T2N, T2O;
|
||||
T2r = T2p - T2q;
|
||||
T2u = T2s - T2t;
|
||||
Ip[WS(rs, 1)] = KP500000000 * (T2r + T2u);
|
||||
Im[WS(rs, 4)] = KP500000000 * (T2u - T2r);
|
||||
T2N = T2v - T2w;
|
||||
T2O = T2L - T2F;
|
||||
Rm[WS(rs, 4)] = KP500000000 * (T2N - T2O);
|
||||
Rp[WS(rs, 1)] = KP500000000 * (T2N + T2O);
|
||||
}
|
||||
{
|
||||
E T2x, T2y, T2z, T2M;
|
||||
T2x = T2v + T2w;
|
||||
T2y = T2t + T2s;
|
||||
Rm[WS(rs, 1)] = KP500000000 * (T2x - T2y);
|
||||
Rp[WS(rs, 4)] = KP500000000 * (T2x + T2y);
|
||||
T2z = T2q + T2p;
|
||||
T2M = T2F + T2L;
|
||||
Ip[WS(rs, 4)] = KP500000000 * (T2z - T2M);
|
||||
Im[WS(rs, 1)] = -(KP500000000 * (T2z + T2M));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by desc below: one TW_FULL entry
   with operands (1, 12) followed by a TW_NEXT entry.  NOTE(review): the
   operand meaning is defined with the tw_instr type elsewhere. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of hc2cfdft_12.  The counts 96, 46, 46
   equal the additions, multiplications and fused multiply/adds reported
   in this variant's generator comment. */
static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, { 96, 46, 46, 0 } };
|
||||
|
||||
/* Registration hook (FMA build): passes the hc2cfdft_12 kernel and its
   descriptor to X(khc2c_register) for the given planner, tagged
   HC2C_VIA_DFT. */
void X(codelet_hc2cfdft_12) (planner *plnr)
{
     X(khc2c_register) (plnr, hc2cfdft_12, &desc, HC2C_VIA_DFT);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hc2cfdft_12 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 142 FP additions, 76 FP multiplications,
|
||||
* (or, 112 additions, 46 multiplications, 30 fused multiply/add),
|
||||
* 52 stack variables, 3 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_12, non-FMA variant (generated by gen_hc2cdft with -n 12 -dit;
 * compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA
 * is defined).
 *
 * For every m in [mb, me) the loop body reads Rp/Ip/Rm/Im at
 * WS(rs, 0..5), combines them with the 22 twiddle values W[0..21], and
 * stores the 24 results back into the same four arrays.
 *
 * Per iteration Rp and Ip advance by +ms, Rm and Im by -ms, and W by 22;
 * W is first offset by (mb - 1) * 22.
 *
 * The statement order comes from the generator's scheduler; do not
 * reorder by hand.
 */
static void hc2cfdft_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP433012701, +0.433012701892219323381861585376468091735701313);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 22, MAKE_VOLATILE_STRIDE(48, rs)) {
|
||||
E Tm, T1t, T1d, T2j, Tj, T1Y, T1w, T1G, T1q, T2q, T1U, T2k, Tw, T1y, T17;
|
||||
E T2g, TP, T21, T1B, T1J, T12, T2u, T1P, T2h;
|
||||
/* Stage 1a: load the inputs at indices 0, 2 and 4 and apply their
   twiddles (W[0], W[1], W[6..9], W[14..17]). */
{
|
||||
E Tk, Tl, T1k, T1m, T1n, T1o, T4, T1f, T8, T1h, Th, T1c, Td, T1a, T19;
|
||||
E T1b;
|
||||
{
|
||||
E T2, T3, T6, T7;
|
||||
Tk = Ip[0];
|
||||
Tl = Im[0];
|
||||
T1k = Tk + Tl;
|
||||
T1m = Rp[0];
|
||||
T1n = Rm[0];
|
||||
T1o = T1m - T1n;
|
||||
T2 = Ip[WS(rs, 2)];
|
||||
T3 = Im[WS(rs, 2)];
|
||||
T4 = T2 - T3;
|
||||
T1f = T2 + T3;
|
||||
T6 = Rp[WS(rs, 2)];
|
||||
T7 = Rm[WS(rs, 2)];
|
||||
T8 = T6 + T7;
|
||||
T1h = T6 - T7;
|
||||
{
|
||||
E Tf, Tg, Tb, Tc;
|
||||
Tf = Rp[WS(rs, 4)];
|
||||
Tg = Rm[WS(rs, 4)];
|
||||
Th = Tf + Tg;
|
||||
T1c = Tf - Tg;
|
||||
Tb = Ip[WS(rs, 4)];
|
||||
Tc = Im[WS(rs, 4)];
|
||||
Td = Tb - Tc;
|
||||
T1a = Tb + Tc;
|
||||
}
|
||||
}
|
||||
Tm = Tk - Tl;
|
||||
T1t = T1m + T1n;
|
||||
T19 = W[16];
|
||||
T1b = W[17];
|
||||
T1d = FNMS(T1b, T1c, T19 * T1a);
|
||||
T2j = FMA(T19, T1c, T1b * T1a);
|
||||
{
|
||||
E T9, T1u, Ti, T1v;
|
||||
{
|
||||
E T1, T5, Ta, Te;
|
||||
T1 = W[6];
|
||||
T5 = W[7];
|
||||
T9 = FNMS(T5, T8, T1 * T4);
|
||||
T1u = FMA(T1, T8, T5 * T4);
|
||||
Ta = W[14];
|
||||
Te = W[15];
|
||||
Ti = FNMS(Te, Th, Ta * Td);
|
||||
T1v = FMA(Ta, Th, Te * Td);
|
||||
}
|
||||
Tj = T9 + Ti;
|
||||
T1Y = KP433012701 * (T1v - T1u);
|
||||
T1w = T1u + T1v;
|
||||
T1G = KP433012701 * (T9 - Ti);
|
||||
}
|
||||
{
|
||||
E T1i, T1S, T1p, T1T;
|
||||
{
|
||||
E T1e, T1g, T1j, T1l;
|
||||
T1e = W[8];
|
||||
T1g = W[9];
|
||||
T1i = FNMS(T1g, T1h, T1e * T1f);
|
||||
T1S = FMA(T1e, T1h, T1g * T1f);
|
||||
T1j = W[0];
|
||||
T1l = W[1];
|
||||
T1p = FNMS(T1l, T1o, T1j * T1k);
|
||||
T1T = FMA(T1j, T1o, T1l * T1k);
|
||||
}
|
||||
T1q = T1i + T1p;
|
||||
T2q = KP433012701 * (T1i - T1p);
|
||||
T1U = KP433012701 * (T1S - T1T);
|
||||
T2k = T1S + T1T;
|
||||
}
|
||||
}
|
||||
/* Stage 1b: load the inputs at indices 3, 5 and 1 and apply their
   twiddles (W[2..5], W[10..13], W[18..21]). */
{
|
||||
E Tr, TT, Tv, TV, TA, TY, TE, T10, TN, T14, TJ, T16;
|
||||
{
|
||||
E Tp, Tq, TC, TD;
|
||||
Tp = Ip[WS(rs, 3)];
|
||||
Tq = Im[WS(rs, 3)];
|
||||
Tr = Tp - Tq;
|
||||
TT = Tp + Tq;
|
||||
{
|
||||
E Tt, Tu, Ty, Tz;
|
||||
Tt = Rp[WS(rs, 3)];
|
||||
Tu = Rm[WS(rs, 3)];
|
||||
Tv = Tt + Tu;
|
||||
TV = Tt - Tu;
|
||||
Ty = Ip[WS(rs, 5)];
|
||||
Tz = Im[WS(rs, 5)];
|
||||
TA = Ty - Tz;
|
||||
TY = Ty + Tz;
|
||||
}
|
||||
TC = Rp[WS(rs, 5)];
|
||||
TD = Rm[WS(rs, 5)];
|
||||
TE = TC + TD;
|
||||
T10 = TC - TD;
|
||||
{
|
||||
E TL, TM, TH, TI;
|
||||
TL = Rp[WS(rs, 1)];
|
||||
TM = Rm[WS(rs, 1)];
|
||||
TN = TL + TM;
|
||||
T14 = TM - TL;
|
||||
TH = Ip[WS(rs, 1)];
|
||||
TI = Im[WS(rs, 1)];
|
||||
TJ = TH - TI;
|
||||
T16 = TH + TI;
|
||||
}
|
||||
}
|
||||
{
|
||||
E To, Ts, T13, T15;
|
||||
To = W[10];
|
||||
Ts = W[11];
|
||||
Tw = FNMS(Ts, Tv, To * Tr);
|
||||
T1y = FMA(To, Tv, Ts * Tr);
|
||||
T13 = W[5];
|
||||
T15 = W[4];
|
||||
T17 = FMA(T13, T14, T15 * T16);
|
||||
T2g = FNMS(T13, T16, T15 * T14);
|
||||
}
|
||||
{
|
||||
E TF, T1z, TO, T1A;
|
||||
{
|
||||
E Tx, TB, TG, TK;
|
||||
Tx = W[18];
|
||||
TB = W[19];
|
||||
TF = FNMS(TB, TE, Tx * TA);
|
||||
T1z = FMA(Tx, TE, TB * TA);
|
||||
TG = W[2];
|
||||
TK = W[3];
|
||||
TO = FNMS(TK, TN, TG * TJ);
|
||||
T1A = FMA(TG, TN, TK * TJ);
|
||||
}
|
||||
TP = TF + TO;
|
||||
T21 = KP433012701 * (T1A - T1z);
|
||||
T1B = T1z + T1A;
|
||||
T1J = KP433012701 * (TF - TO);
|
||||
}
|
||||
{
|
||||
E TW, T1O, T11, T1N;
|
||||
{
|
||||
E TS, TU, TX, TZ;
|
||||
TS = W[12];
|
||||
TU = W[13];
|
||||
TW = FNMS(TU, TV, TS * TT);
|
||||
T1O = FMA(TS, TV, TU * TT);
|
||||
TX = W[20];
|
||||
TZ = W[21];
|
||||
T11 = FNMS(TZ, T10, TX * TY);
|
||||
T1N = FMA(TX, T10, TZ * TY);
|
||||
}
|
||||
T12 = TW + T11;
|
||||
T2u = KP433012701 * (T11 - TW);
|
||||
T1P = KP433012701 * (T1N - T1O);
|
||||
T2h = T1O + T1N;
|
||||
}
|
||||
}
|
||||
/* Output group 1: stores Ip/Rp at WS(rs, 3), Im/Rm at WS(rs, 2),
   Rm/Im at WS(rs, 5), and Rp[0], Ip[0]. */
{
|
||||
E TR, T2f, T2m, T2o, T1s, T1E, T1D, T2n;
|
||||
{
|
||||
E Tn, TQ, T2i, T2l;
|
||||
Tn = Tj + Tm;
|
||||
TQ = Tw + TP;
|
||||
TR = Tn - TQ;
|
||||
T2f = TQ + Tn;
|
||||
T2i = T2g - T2h;
|
||||
T2l = T2j + T2k;
|
||||
T2m = T2i - T2l;
|
||||
T2o = T2i + T2l;
|
||||
}
|
||||
{
|
||||
E T18, T1r, T1x, T1C;
|
||||
T18 = T12 + T17;
|
||||
T1r = T1d + T1q;
|
||||
T1s = T18 - T1r;
|
||||
T1E = T18 + T1r;
|
||||
T1x = T1t + T1w;
|
||||
T1C = T1y + T1B;
|
||||
T1D = T1x + T1C;
|
||||
T2n = T1x - T1C;
|
||||
}
|
||||
Ip[WS(rs, 3)] = KP500000000 * (TR + T1s);
|
||||
Rp[WS(rs, 3)] = KP500000000 * (T2n - T2o);
|
||||
Im[WS(rs, 2)] = KP500000000 * (T1s - TR);
|
||||
Rm[WS(rs, 2)] = KP500000000 * (T2n + T2o);
|
||||
Rm[WS(rs, 5)] = KP500000000 * (T1D - T1E);
|
||||
Im[WS(rs, 5)] = KP500000000 * (T2m - T2f);
|
||||
Rp[0] = KP500000000 * (T1D + T1E);
|
||||
Ip[0] = KP500000000 * (T2f + T2m);
|
||||
}
|
||||
/* Output group 2: stores the remaining sixteen outputs, using the
   KP250000000 / KP500000000 combinations formed first. */
{
|
||||
E T1H, T2b, T2s, T2B, T2v, T2A, T1K, T2c, T1Q, T29, T1Z, T25, T22, T26, T1V;
|
||||
E T28;
|
||||
{
|
||||
E T1F, T2r, T2t, T1I;
|
||||
T1F = FNMS(KP250000000, T1w, KP500000000 * T1t);
|
||||
T1H = T1F - T1G;
|
||||
T2b = T1F + T1G;
|
||||
T2r = FNMS(KP500000000, T2j, KP250000000 * T2k);
|
||||
T2s = T2q - T2r;
|
||||
T2B = T2q + T2r;
|
||||
T2t = FMA(KP250000000, T2h, KP500000000 * T2g);
|
||||
T2v = T2t - T2u;
|
||||
T2A = T2u + T2t;
|
||||
T1I = FNMS(KP250000000, T1B, KP500000000 * T1y);
|
||||
T1K = T1I - T1J;
|
||||
T2c = T1I + T1J;
|
||||
}
|
||||
{
|
||||
E T1M, T1X, T20, T1R;
|
||||
T1M = FNMS(KP250000000, T12, KP500000000 * T17);
|
||||
T1Q = T1M - T1P;
|
||||
T29 = T1P + T1M;
|
||||
T1X = FNMS(KP250000000, Tj, KP500000000 * Tm);
|
||||
T1Z = T1X - T1Y;
|
||||
T25 = T1Y + T1X;
|
||||
T20 = FNMS(KP250000000, TP, KP500000000 * Tw);
|
||||
T22 = T20 - T21;
|
||||
T26 = T21 + T20;
|
||||
T1R = FNMS(KP250000000, T1q, KP500000000 * T1d);
|
||||
T1V = T1R - T1U;
|
||||
T28 = T1R + T1U;
|
||||
}
|
||||
{
|
||||
E T1L, T1W, T2p, T2w;
|
||||
T1L = T1H + T1K;
|
||||
T1W = T1Q + T1V;
|
||||
Rp[WS(rs, 2)] = T1L - T1W;
|
||||
Rm[WS(rs, 3)] = T1L + T1W;
|
||||
T2p = T22 + T1Z;
|
||||
T2w = T2s - T2v;
|
||||
Ip[WS(rs, 2)] = T2p + T2w;
|
||||
Im[WS(rs, 3)] = T2w - T2p;
|
||||
}
|
||||
{
|
||||
E T23, T24, T2x, T2y;
|
||||
T23 = T1Z - T22;
|
||||
T24 = T1V - T1Q;
|
||||
Ip[WS(rs, 5)] = T23 + T24;
|
||||
Im[0] = T24 - T23;
|
||||
T2x = T1H - T1K;
|
||||
T2y = T2v + T2s;
|
||||
Rm[0] = T2x - T2y;
|
||||
Rp[WS(rs, 5)] = T2x + T2y;
|
||||
}
|
||||
{
|
||||
E T27, T2a, T2z, T2C;
|
||||
T27 = T25 - T26;
|
||||
T2a = T28 - T29;
|
||||
Ip[WS(rs, 1)] = T27 + T2a;
|
||||
Im[WS(rs, 4)] = T2a - T27;
|
||||
T2z = T2b - T2c;
|
||||
T2C = T2A - T2B;
|
||||
Rm[WS(rs, 4)] = T2z - T2C;
|
||||
Rp[WS(rs, 1)] = T2z + T2C;
|
||||
}
|
||||
{
|
||||
E T2d, T2e, T2D, T2E;
|
||||
T2d = T2b + T2c;
|
||||
T2e = T29 + T28;
|
||||
Rm[WS(rs, 1)] = T2d - T2e;
|
||||
Rp[WS(rs, 4)] = T2d + T2e;
|
||||
T2D = T26 + T25;
|
||||
T2E = T2A + T2B;
|
||||
Ip[WS(rs, 4)] = T2D + T2E;
|
||||
Im[WS(rs, 1)] = T2E - T2D;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by `desc` below: one TW_FULL entry
 * carrying the size 12, followed by one TW_NEXT entry.
 * NOTE(review): the TW_* opcodes are defined outside this file; presumably
 * TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: size 12, the name string, the twiddle table above,
 * the GENUS object (declared elsewhere) and four counters.
 * NOTE(review): { 112, 46, 30, 0 } are presumably the add / mul / fma / other
 * operation counts of the kernel -- confirm against hc2c_desc.
 */
static const hc2c_desc desc = { 12, "hc2cfdft_12", twinstr, &GENUS, { 112, 46, 30, 0 } };
|
||||
|
||||
/* Registers the hc2cfdft_12 kernel and its descriptor with planner `p`,
 * passing the HC2C_VIA_DFT flag to khc2c_register. */
void X(codelet_hc2cfdft_12) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_12, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
909
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_16.c
Normal file
909
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_16.c
Normal file
@@ -0,0 +1,909 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:37 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cfdft_16 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 206 FP additions, 132 FP multiplications,
|
||||
* (or, 136 additions, 62 multiplications, 70 fused multiply/add),
|
||||
* 67 stack variables, 4 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_16 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (see the enclosing #if).
 *
 * Generated size-16 kernel. For every m in [mb, me) it reads elements 0..7
 * (addressed through WS(rs, k)) of each of Rp, Ip, Rm and Im, combines them
 * with the 30 twiddle values W[0..29], and writes the 32 results back to the
 * same locations. All loads happen before the first store of an iteration.
 *
 * Parameters:
 *   Rp, Ip - arrays stepped by +ms after each iteration
 *   Rm, Im - arrays stepped by -ms after each iteration
 *   W      - twiddle table; offset by (mb - 1) * 30 on entry, then advanced
 *            by 30 per iteration
 *   rs     - stride handed to WS() to address elements 0..7
 *   mb, me - half-open iteration range
 *   ms     - per-iteration pointer step
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * come from the included headers. Presumably FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the header definitions.
 */
static void hc2cfdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: ~cos(pi/8), ~sqrt(2) - 1, ~sqrt(1/2) and 1/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m: Rp/Ip move forward and Rm/Im move backward by ms,
 * W moves forward by 30 twiddle values. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E T1f, T2e, T1c, T2g, T1K, T3D, T2W, T3H, TR, T2j, T2R, T3E, T11, T2l, T1v;
|
||||
E T3G, Ta, T2p, Tk, T2r, T3o, T3p, T1Y, T3z, T2G, T3w, Tv, T2u, TF, T2w;
|
||||
E T3r, T3s, T2b, T3A, T2L, T3x;
|
||||
/* Load elements 0 and 4; uses twiddles W[0], W[1] and W[14..17]. */
{
|
||||
E T1d, T1e, T1I, T16, T1A, T1D, T1E, T1C, T1G, T1H, T2U, T1b, T1z, T2S, T1w;
|
||||
E T1y, T14, T15;
|
||||
T1d = Ip[0];
|
||||
T1e = Im[0];
|
||||
T1I = T1d + T1e;
|
||||
T14 = Ip[WS(rs, 4)];
|
||||
T15 = Im[WS(rs, 4)];
|
||||
T16 = T14 - T15;
|
||||
T1A = T14 + T15;
|
||||
{
|
||||
E T1F, T19, T1a, T1x;
|
||||
T1D = Rm[0];
|
||||
T1E = Rp[0];
|
||||
T1F = T1D - T1E;
|
||||
T1C = W[0];
|
||||
T1G = T1C * T1F;
|
||||
T1H = W[1];
|
||||
T2U = T1H * T1F;
|
||||
T19 = Rp[WS(rs, 4)];
|
||||
T1a = Rm[WS(rs, 4)];
|
||||
T1x = T1a - T19;
|
||||
T1b = T19 + T1a;
|
||||
T1z = W[17];
|
||||
T2S = T1z * T1x;
|
||||
T1w = W[16];
|
||||
T1y = T1w * T1x;
|
||||
}
|
||||
T1f = T1d - T1e;
|
||||
T2e = T1E + T1D;
|
||||
{
|
||||
E T17, T2f, T13, T18;
|
||||
T13 = W[14];
|
||||
T17 = T13 * T16;
|
||||
T2f = T13 * T1b;
|
||||
T18 = W[15];
|
||||
T1c = FNMS(T18, T1b, T17);
|
||||
T2g = FMA(T18, T16, T2f);
|
||||
}
|
||||
{
|
||||
E T1B, T1J, T2T, T2V;
|
||||
T1B = FNMS(T1z, T1A, T1y);
|
||||
T1J = FNMS(T1H, T1I, T1G);
|
||||
T1K = T1B + T1J;
|
||||
T3D = T1J - T1B;
|
||||
T2T = FMA(T1w, T1A, T2S);
|
||||
T2V = FMA(T1C, T1I, T2U);
|
||||
T2W = T2T + T2V;
|
||||
T3H = T2V - T2T;
|
||||
}
|
||||
}
|
||||
/* Load elements 2 and 6; uses twiddles W[6..9] and W[22..25]. */
{
|
||||
E TL, T1n, TQ, T1m, T2N, T1j, T1l, TV, T1t, T10, T1s, T2P, T1p, T1r;
|
||||
{
|
||||
E TJ, TK, TO, TP, T1k;
|
||||
TJ = Ip[WS(rs, 2)];
|
||||
TK = Im[WS(rs, 2)];
|
||||
TL = TJ - TK;
|
||||
T1n = TJ + TK;
|
||||
TO = Rp[WS(rs, 2)];
|
||||
TP = Rm[WS(rs, 2)];
|
||||
T1k = TP - TO;
|
||||
TQ = TO + TP;
|
||||
T1m = W[9];
|
||||
T2N = T1m * T1k;
|
||||
T1j = W[8];
|
||||
T1l = T1j * T1k;
|
||||
}
|
||||
{
|
||||
E TT, TU, TY, TZ, T1q;
|
||||
TT = Ip[WS(rs, 6)];
|
||||
TU = Im[WS(rs, 6)];
|
||||
TV = TT - TU;
|
||||
T1t = TT + TU;
|
||||
TY = Rp[WS(rs, 6)];
|
||||
TZ = Rm[WS(rs, 6)];
|
||||
T1q = TZ - TY;
|
||||
T10 = TY + TZ;
|
||||
T1s = W[25];
|
||||
T2P = T1s * T1q;
|
||||
T1p = W[24];
|
||||
T1r = T1p * T1q;
|
||||
}
|
||||
{
|
||||
E T2O, T2Q, T1o, T1u;
|
||||
{
|
||||
E TM, T2i, TI, TN;
|
||||
TI = W[6];
|
||||
TM = TI * TL;
|
||||
T2i = TI * TQ;
|
||||
TN = W[7];
|
||||
TR = FNMS(TN, TQ, TM);
|
||||
T2j = FMA(TN, TL, T2i);
|
||||
}
|
||||
T2O = FMA(T1j, T1n, T2N);
|
||||
T2Q = FMA(T1p, T1t, T2P);
|
||||
T2R = T2O + T2Q;
|
||||
T3E = T2O - T2Q;
|
||||
{
|
||||
E TW, T2k, TS, TX;
|
||||
TS = W[22];
|
||||
TW = TS * TV;
|
||||
T2k = TS * T10;
|
||||
TX = W[23];
|
||||
T11 = FNMS(TX, T10, TW);
|
||||
T2l = FMA(TX, TV, T2k);
|
||||
}
|
||||
T1o = FNMS(T1m, T1n, T1l);
|
||||
T1u = FNMS(T1s, T1t, T1r);
|
||||
T1v = T1o + T1u;
|
||||
T3G = T1o - T1u;
|
||||
}
|
||||
}
|
||||
/* Load elements 1 and 5; uses twiddles W[2..5] and W[18..21]. */
{
|
||||
E T4, T1Q, T9, T1N, T5, T2o, T1O, T2C, Te, T1W, Tj, T1T, Tf, T2q, T1U;
|
||||
E T2E, T6, Tg;
|
||||
{
|
||||
E T1, T1M, Tb, T1S;
|
||||
{
|
||||
E T2, T3, T7, T8;
|
||||
T2 = Ip[WS(rs, 1)];
|
||||
T3 = Im[WS(rs, 1)];
|
||||
T4 = T2 - T3;
|
||||
T1Q = T2 + T3;
|
||||
T7 = Rp[WS(rs, 1)];
|
||||
T8 = Rm[WS(rs, 1)];
|
||||
T9 = T7 + T8;
|
||||
T1N = T7 - T8;
|
||||
}
|
||||
T1 = W[2];
|
||||
T5 = T1 * T4;
|
||||
T2o = T1 * T9;
|
||||
T1M = W[4];
|
||||
T1O = T1M * T1N;
|
||||
T2C = T1M * T1Q;
|
||||
{
|
||||
E Tc, Td, Th, Ti;
|
||||
Tc = Ip[WS(rs, 5)];
|
||||
Td = Im[WS(rs, 5)];
|
||||
Te = Tc - Td;
|
||||
T1W = Tc + Td;
|
||||
Th = Rp[WS(rs, 5)];
|
||||
Ti = Rm[WS(rs, 5)];
|
||||
Tj = Th + Ti;
|
||||
T1T = Th - Ti;
|
||||
}
|
||||
Tb = W[18];
|
||||
Tf = Tb * Te;
|
||||
T2q = Tb * Tj;
|
||||
T1S = W[20];
|
||||
T1U = T1S * T1T;
|
||||
T2E = T1S * T1W;
|
||||
}
|
||||
T6 = W[3];
|
||||
Ta = FNMS(T6, T9, T5);
|
||||
T2p = FMA(T6, T4, T2o);
|
||||
Tg = W[19];
|
||||
Tk = FNMS(Tg, Tj, Tf);
|
||||
T2r = FMA(Tg, Te, T2q);
|
||||
T3o = Ta - Tk;
|
||||
T3p = T2p - T2r;
|
||||
{
|
||||
E T1R, T2D, T1X, T2F, T1P, T1V;
|
||||
T1P = W[5];
|
||||
T1R = FMA(T1P, T1Q, T1O);
|
||||
T2D = FNMS(T1P, T1N, T2C);
|
||||
T1V = W[21];
|
||||
T1X = FMA(T1V, T1W, T1U);
|
||||
T2F = FNMS(T1V, T1T, T2E);
|
||||
T1Y = T1R + T1X;
|
||||
T3z = T1X - T1R;
|
||||
T2G = T2D + T2F;
|
||||
T3w = T2F - T2D;
|
||||
}
|
||||
}
|
||||
/* Load elements 7 and 3; uses twiddles W[26..29] and W[10..13]. */
{
|
||||
E Tp, T23, Tu, T20, Tq, T2t, T21, T2H, Tz, T29, TE, T26, TA, T2v, T27;
|
||||
E T2J, Tr, TB;
|
||||
{
|
||||
E Tm, T1Z, Tw, T25;
|
||||
{
|
||||
E Tn, To, Ts, Tt;
|
||||
Tn = Ip[WS(rs, 7)];
|
||||
To = Im[WS(rs, 7)];
|
||||
Tp = Tn - To;
|
||||
T23 = Tn + To;
|
||||
Ts = Rp[WS(rs, 7)];
|
||||
Tt = Rm[WS(rs, 7)];
|
||||
Tu = Ts + Tt;
|
||||
T20 = Ts - Tt;
|
||||
}
|
||||
Tm = W[26];
|
||||
Tq = Tm * Tp;
|
||||
T2t = Tm * Tu;
|
||||
T1Z = W[28];
|
||||
T21 = T1Z * T20;
|
||||
T2H = T1Z * T23;
|
||||
{
|
||||
E Tx, Ty, TC, TD;
|
||||
Tx = Ip[WS(rs, 3)];
|
||||
Ty = Im[WS(rs, 3)];
|
||||
Tz = Tx - Ty;
|
||||
T29 = Tx + Ty;
|
||||
TC = Rp[WS(rs, 3)];
|
||||
TD = Rm[WS(rs, 3)];
|
||||
TE = TC + TD;
|
||||
T26 = TC - TD;
|
||||
}
|
||||
Tw = W[10];
|
||||
TA = Tw * Tz;
|
||||
T2v = Tw * TE;
|
||||
T25 = W[12];
|
||||
T27 = T25 * T26;
|
||||
T2J = T25 * T29;
|
||||
}
|
||||
Tr = W[27];
|
||||
Tv = FNMS(Tr, Tu, Tq);
|
||||
T2u = FMA(Tr, Tp, T2t);
|
||||
TB = W[11];
|
||||
TF = FNMS(TB, TE, TA);
|
||||
T2w = FMA(TB, Tz, T2v);
|
||||
T3r = T2u - T2w;
|
||||
T3s = Tv - TF;
|
||||
{
|
||||
E T24, T2I, T2a, T2K, T22, T28;
|
||||
T22 = W[29];
|
||||
T24 = FMA(T22, T23, T21);
|
||||
T2I = FNMS(T22, T20, T2H);
|
||||
T28 = W[13];
|
||||
T2a = FMA(T28, T29, T27);
|
||||
T2K = FNMS(T28, T26, T2J);
|
||||
T2b = T24 + T2a;
|
||||
T3A = T2I - T2K;
|
||||
T2L = T2I + T2K;
|
||||
T3x = T2a - T24;
|
||||
}
|
||||
}
|
||||
/* First half of the stores: Rp/Ip at elements 0, 2, 4, 6 and
 * Rm/Im at elements 1, 3, 5, 7. */
{
|
||||
E TH, T3c, T36, T3g, T39, T3h, T1h, T32, T2d, T2A, T2y, T31, T2Y, T30, T2n;
|
||||
E T3b;
|
||||
{
|
||||
E Tl, TG, T34, T35;
|
||||
Tl = Ta + Tk;
|
||||
TG = Tv + TF;
|
||||
TH = Tl + TG;
|
||||
T3c = Tl - TG;
|
||||
T34 = T2L - T2G;
|
||||
T35 = T1Y - T2b;
|
||||
T36 = T34 + T35;
|
||||
T3g = T34 - T35;
|
||||
}
|
||||
{
|
||||
E T37, T38, T12, T1g;
|
||||
T37 = T1K - T1v;
|
||||
T38 = T2W - T2R;
|
||||
T39 = T37 - T38;
|
||||
T3h = T37 + T38;
|
||||
T12 = TR + T11;
|
||||
T1g = T1c + T1f;
|
||||
T1h = T12 + T1g;
|
||||
T32 = T1g - T12;
|
||||
}
|
||||
{
|
||||
E T1L, T2c, T2s, T2x;
|
||||
T1L = T1v + T1K;
|
||||
T2c = T1Y + T2b;
|
||||
T2d = T1L - T2c;
|
||||
T2A = T2c + T1L;
|
||||
T2s = T2p + T2r;
|
||||
T2x = T2u + T2w;
|
||||
T2y = T2s + T2x;
|
||||
T31 = T2x - T2s;
|
||||
}
|
||||
{
|
||||
E T2M, T2X, T2h, T2m;
|
||||
T2M = T2G + T2L;
|
||||
T2X = T2R + T2W;
|
||||
T2Y = T2M - T2X;
|
||||
T30 = T2M + T2X;
|
||||
T2h = T2e + T2g;
|
||||
T2m = T2j + T2l;
|
||||
T2n = T2h + T2m;
|
||||
T3b = T2h - T2m;
|
||||
}
|
||||
{
|
||||
E T1i, T2Z, T2z, T2B;
|
||||
T1i = TH + T1h;
|
||||
Ip[0] = KP500000000 * (T1i + T2d);
|
||||
Im[WS(rs, 7)] = KP500000000 * (T2d - T1i);
|
||||
T2Z = T2n + T2y;
|
||||
Rm[WS(rs, 7)] = KP500000000 * (T2Z - T30);
|
||||
Rp[0] = KP500000000 * (T2Z + T30);
|
||||
T2z = T2n - T2y;
|
||||
Rm[WS(rs, 3)] = KP500000000 * (T2z - T2A);
|
||||
Rp[WS(rs, 4)] = KP500000000 * (T2z + T2A);
|
||||
T2B = T1h - TH;
|
||||
Ip[WS(rs, 4)] = KP500000000 * (T2B + T2Y);
|
||||
Im[WS(rs, 3)] = KP500000000 * (T2Y - T2B);
|
||||
}
|
||||
{
|
||||
E T33, T3a, T3j, T3k;
|
||||
T33 = T31 + T32;
|
||||
T3a = T36 + T39;
|
||||
Ip[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3a, T33));
|
||||
Im[WS(rs, 5)] = -(KP500000000 * (FNMS(KP707106781, T3a, T33)));
|
||||
T3j = T3b + T3c;
|
||||
T3k = T3g + T3h;
|
||||
Rm[WS(rs, 5)] = KP500000000 * (FNMS(KP707106781, T3k, T3j));
|
||||
Rp[WS(rs, 2)] = KP500000000 * (FMA(KP707106781, T3k, T3j));
|
||||
}
|
||||
{
|
||||
E T3d, T3e, T3f, T3i;
|
||||
T3d = T3b - T3c;
|
||||
T3e = T39 - T36;
|
||||
Rm[WS(rs, 1)] = KP500000000 * (FNMS(KP707106781, T3e, T3d));
|
||||
Rp[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3e, T3d));
|
||||
T3f = T32 - T31;
|
||||
T3i = T3g - T3h;
|
||||
Ip[WS(rs, 6)] = KP500000000 * (FMA(KP707106781, T3i, T3f));
|
||||
Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP707106781, T3i, T3f)));
|
||||
}
|
||||
}
|
||||
/* Second half of the stores: Rp/Ip at elements 1, 3, 5, 7 and
 * Rm/Im at elements 0, 2, 4, 6. */
{
|
||||
E T3n, T3Z, T44, T4e, T47, T4f, T3u, T4a, T3C, T3U, T3N, T49, T3Q, T40, T3J;
|
||||
E T3V;
|
||||
{
|
||||
E T3l, T3m, T42, T43;
|
||||
T3l = T1f - T1c;
|
||||
T3m = T2j - T2l;
|
||||
T3n = T3l - T3m;
|
||||
T3Z = T3m + T3l;
|
||||
T42 = T3w - T3x;
|
||||
T43 = T3A - T3z;
|
||||
T44 = FMA(KP414213562, T43, T42);
|
||||
T4e = FNMS(KP414213562, T42, T43);
|
||||
}
|
||||
{
|
||||
E T45, T46, T3q, T3t;
|
||||
T45 = T3E + T3D;
|
||||
T46 = T3H - T3G;
|
||||
T47 = FMA(KP414213562, T46, T45);
|
||||
T4f = FNMS(KP414213562, T45, T46);
|
||||
T3q = T3o - T3p;
|
||||
T3t = T3r + T3s;
|
||||
T3u = T3q + T3t;
|
||||
T4a = T3q - T3t;
|
||||
}
|
||||
{
|
||||
E T3y, T3B, T3L, T3M;
|
||||
T3y = T3w + T3x;
|
||||
T3B = T3z + T3A;
|
||||
T3C = FMA(KP414213562, T3B, T3y);
|
||||
T3U = FNMS(KP414213562, T3y, T3B);
|
||||
T3L = T2e - T2g;
|
||||
T3M = TR - T11;
|
||||
T3N = T3L + T3M;
|
||||
T49 = T3L - T3M;
|
||||
}
|
||||
{
|
||||
E T3O, T3P, T3F, T3I;
|
||||
T3O = T3p + T3o;
|
||||
T3P = T3r - T3s;
|
||||
T3Q = T3O + T3P;
|
||||
T40 = T3P - T3O;
|
||||
T3F = T3D - T3E;
|
||||
T3I = T3G + T3H;
|
||||
T3J = FNMS(KP414213562, T3I, T3F);
|
||||
T3V = FMA(KP414213562, T3F, T3I);
|
||||
}
|
||||
{
|
||||
E T3v, T3K, T3X, T3Y;
|
||||
T3v = FMA(KP707106781, T3u, T3n);
|
||||
T3K = T3C + T3J;
|
||||
Ip[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3K, T3v));
|
||||
Im[WS(rs, 6)] = -(KP500000000 * (FNMS(KP923879532, T3K, T3v)));
|
||||
T3X = FMA(KP707106781, T3Q, T3N);
|
||||
T3Y = T3U + T3V;
|
||||
Rm[WS(rs, 6)] = KP500000000 * (FNMS(KP923879532, T3Y, T3X));
|
||||
Rp[WS(rs, 1)] = KP500000000 * (FMA(KP923879532, T3Y, T3X));
|
||||
}
|
||||
{
|
||||
E T3R, T3S, T3T, T3W;
|
||||
T3R = FNMS(KP707106781, T3Q, T3N);
|
||||
T3S = T3J - T3C;
|
||||
Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP923879532, T3S, T3R));
|
||||
Rp[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T3S, T3R));
|
||||
T3T = FNMS(KP707106781, T3u, T3n);
|
||||
T3W = T3U - T3V;
|
||||
Ip[WS(rs, 5)] = KP500000000 * (FMA(KP923879532, T3W, T3T));
|
||||
Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP923879532, T3W, T3T)));
|
||||
}
|
||||
{
|
||||
E T41, T48, T4h, T4i;
|
||||
T41 = FNMS(KP707106781, T40, T3Z);
|
||||
T48 = T44 - T47;
|
||||
Ip[WS(rs, 7)] = KP500000000 * (FMA(KP923879532, T48, T41));
|
||||
Im[0] = -(KP500000000 * (FNMS(KP923879532, T48, T41)));
|
||||
T4h = FNMS(KP707106781, T4a, T49);
|
||||
T4i = T4e + T4f;
|
||||
Rp[WS(rs, 7)] = KP500000000 * (FNMS(KP923879532, T4i, T4h));
|
||||
Rm[0] = KP500000000 * (FMA(KP923879532, T4i, T4h));
|
||||
}
|
||||
{
|
||||
E T4b, T4c, T4d, T4g;
|
||||
T4b = FMA(KP707106781, T4a, T49);
|
||||
T4c = T44 + T47;
|
||||
Rm[WS(rs, 4)] = KP500000000 * (FNMS(KP923879532, T4c, T4b));
|
||||
Rp[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4c, T4b));
|
||||
T4d = FMA(KP707106781, T40, T3Z);
|
||||
T4g = T4e - T4f;
|
||||
Ip[WS(rs, 3)] = KP500000000 * (FMA(KP923879532, T4g, T4d));
|
||||
Im[WS(rs, 4)] = -(KP500000000 * (FNMS(KP923879532, T4g, T4d)));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by `desc` below: one TW_FULL entry
 * carrying the size 16, followed by one TW_NEXT entry.
 * NOTE(review): the TW_* opcodes are defined outside this file; presumably
 * TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: size 16, the name string, the twiddle table above,
 * the GENUS object (declared elsewhere) and four counters. The first three
 * counters (136, 62, 70) equal the additions / multiplications / fused
 * multiply-add figures quoted in the generated statistics comment of this
 * FMA variant.
 */
static const hc2c_desc desc = { 16, "hc2cfdft_16", twinstr, &GENUS, { 136, 62, 70, 0 } };
|
||||
|
||||
/* Registers the hc2cfdft_16 kernel and its descriptor with planner `p`,
 * passing the HC2C_VIA_DFT flag to khc2c_register. */
void X(codelet_hc2cfdft_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_16, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hc2cfdft_16 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 206 FP additions, 100 FP multiplications,
|
||||
* (or, 168 additions, 62 multiplications, 38 fused multiply/add),
|
||||
* 61 stack variables, 4 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_16 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * Generated size-16 kernel. For every m in [mb, me) it reads elements 0..7
 * (addressed through WS(rs, k)) of each of Rp, Ip, Rm and Im, combines them
 * with the 30 twiddle values W[0..29], and writes the 32 results back to the
 * same locations. All loads happen before the first store of an iteration.
 *
 * Parameters:
 *   Rp, Ip - arrays stepped by +ms after each iteration
 *   Rm, Im - arrays stepped by -ms after each iteration
 *   W      - twiddle table; offset by (mb - 1) * 30 on entry, then advanced
 *            by 30 per iteration
 *   rs     - stride handed to WS() to address elements 0..7
 *   mb, me - half-open iteration range
 *   ms     - per-iteration pointer step
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * come from the included headers. Presumably FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the header definitions.
 */
static void hc2cfdft_16(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: ~cos(pi/8)/2, ~sin(pi/8)/2, ~sqrt(2)/4 and 1/2. */
DK(KP461939766, +0.461939766255643378064091594698394143411208313);
|
||||
DK(KP191341716, +0.191341716182544885864229992015199433380672281);
|
||||
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m: Rp/Ip move forward and Rm/Im move backward by ms,
 * W moves forward by 30 twiddle values. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 30, MAKE_VOLATILE_STRIDE(64, rs)) {
|
||||
E T19, T3h, T21, T2Y, T1o, T3d, T2s, T39, TW, T3i, T24, T2Z, T1z, T3c, T2p;
|
||||
E T3a, Tj, T2S, T28, T2R, T1L, T36, T2i, T32, TC, T2V, T2b, T2U, T1W, T35;
|
||||
E T2l, T33;
|
||||
/* Load elements 4 and 0; uses twiddles W[0], W[1] and W[14..17]. */
{
|
||||
E T10, T1m, T14, T1k, T18, T1h, T1f, T1Z;
|
||||
{
|
||||
E TY, TZ, T12, T13;
|
||||
TY = Ip[WS(rs, 4)];
|
||||
TZ = Im[WS(rs, 4)];
|
||||
T10 = TY - TZ;
|
||||
T1m = TY + TZ;
|
||||
T12 = Rp[WS(rs, 4)];
|
||||
T13 = Rm[WS(rs, 4)];
|
||||
T14 = T12 + T13;
|
||||
T1k = T12 - T13;
|
||||
}
|
||||
{
|
||||
E T16, T17, T1d, T1e;
|
||||
T16 = Ip[0];
|
||||
T17 = Im[0];
|
||||
T18 = T16 - T17;
|
||||
T1h = T16 + T17;
|
||||
T1d = Rm[0];
|
||||
T1e = Rp[0];
|
||||
T1f = T1d - T1e;
|
||||
T1Z = T1e + T1d;
|
||||
}
|
||||
{
|
||||
E T15, T20, TX, T11;
|
||||
TX = W[14];
|
||||
T11 = W[15];
|
||||
T15 = FNMS(T11, T14, TX * T10);
|
||||
T20 = FMA(TX, T14, T11 * T10);
|
||||
T19 = T15 + T18;
|
||||
T3h = T1Z - T20;
|
||||
T21 = T1Z + T20;
|
||||
T2Y = T18 - T15;
|
||||
}
|
||||
{
|
||||
E T1i, T2r, T1n, T2q;
|
||||
{
|
||||
E T1c, T1g, T1j, T1l;
|
||||
T1c = W[0];
|
||||
T1g = W[1];
|
||||
T1i = FNMS(T1g, T1h, T1c * T1f);
|
||||
T2r = FMA(T1g, T1f, T1c * T1h);
|
||||
T1j = W[16];
|
||||
T1l = W[17];
|
||||
T1n = FMA(T1j, T1k, T1l * T1m);
|
||||
T2q = FNMS(T1l, T1k, T1j * T1m);
|
||||
}
|
||||
T1o = T1i - T1n;
|
||||
T3d = T2r - T2q;
|
||||
T2s = T2q + T2r;
|
||||
T39 = T1n + T1i;
|
||||
}
|
||||
}
|
||||
/* Load elements 2 and 6; uses twiddles W[6..9] and W[22..25]. */
{
|
||||
E TH, T1s, TL, T1q, TQ, T1x, TU, T1v;
|
||||
{
|
||||
E TF, TG, TJ, TK;
|
||||
TF = Ip[WS(rs, 2)];
|
||||
TG = Im[WS(rs, 2)];
|
||||
TH = TF - TG;
|
||||
T1s = TF + TG;
|
||||
TJ = Rp[WS(rs, 2)];
|
||||
TK = Rm[WS(rs, 2)];
|
||||
TL = TJ + TK;
|
||||
T1q = TJ - TK;
|
||||
}
|
||||
{
|
||||
E TO, TP, TS, TT;
|
||||
TO = Ip[WS(rs, 6)];
|
||||
TP = Im[WS(rs, 6)];
|
||||
TQ = TO - TP;
|
||||
T1x = TO + TP;
|
||||
TS = Rp[WS(rs, 6)];
|
||||
TT = Rm[WS(rs, 6)];
|
||||
TU = TS + TT;
|
||||
T1v = TS - TT;
|
||||
}
|
||||
{
|
||||
E TM, T22, TV, T23;
|
||||
{
|
||||
E TE, TI, TN, TR;
|
||||
TE = W[6];
|
||||
TI = W[7];
|
||||
TM = FNMS(TI, TL, TE * TH);
|
||||
T22 = FMA(TE, TL, TI * TH);
|
||||
TN = W[22];
|
||||
TR = W[23];
|
||||
TV = FNMS(TR, TU, TN * TQ);
|
||||
T23 = FMA(TN, TU, TR * TQ);
|
||||
}
|
||||
TW = TM + TV;
|
||||
T3i = TM - TV;
|
||||
T24 = T22 + T23;
|
||||
T2Z = T22 - T23;
|
||||
}
|
||||
{
|
||||
E T1t, T2n, T1y, T2o;
|
||||
{
|
||||
E T1p, T1r, T1u, T1w;
|
||||
T1p = W[8];
|
||||
T1r = W[9];
|
||||
T1t = FMA(T1p, T1q, T1r * T1s);
|
||||
T2n = FNMS(T1r, T1q, T1p * T1s);
|
||||
T1u = W[24];
|
||||
T1w = W[25];
|
||||
T1y = FMA(T1u, T1v, T1w * T1x);
|
||||
T2o = FNMS(T1w, T1v, T1u * T1x);
|
||||
}
|
||||
T1z = T1t + T1y;
|
||||
T3c = T1y - T1t;
|
||||
T2p = T2n + T2o;
|
||||
T3a = T2n - T2o;
|
||||
}
|
||||
}
|
||||
/* Load elements 1 and 5; uses twiddles W[2..5] and W[18..21]. */
{
|
||||
E T4, T1E, T8, T1C, Td, T1J, Th, T1H;
|
||||
{
|
||||
E T2, T3, T6, T7;
|
||||
T2 = Ip[WS(rs, 1)];
|
||||
T3 = Im[WS(rs, 1)];
|
||||
T4 = T2 - T3;
|
||||
T1E = T2 + T3;
|
||||
T6 = Rp[WS(rs, 1)];
|
||||
T7 = Rm[WS(rs, 1)];
|
||||
T8 = T6 + T7;
|
||||
T1C = T6 - T7;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Tf, Tg;
|
||||
Tb = Ip[WS(rs, 5)];
|
||||
Tc = Im[WS(rs, 5)];
|
||||
Td = Tb - Tc;
|
||||
T1J = Tb + Tc;
|
||||
Tf = Rp[WS(rs, 5)];
|
||||
Tg = Rm[WS(rs, 5)];
|
||||
Th = Tf + Tg;
|
||||
T1H = Tf - Tg;
|
||||
}
|
||||
{
|
||||
E T9, T26, Ti, T27;
|
||||
{
|
||||
E T1, T5, Ta, Te;
|
||||
T1 = W[2];
|
||||
T5 = W[3];
|
||||
T9 = FNMS(T5, T8, T1 * T4);
|
||||
T26 = FMA(T1, T8, T5 * T4);
|
||||
Ta = W[18];
|
||||
Te = W[19];
|
||||
Ti = FNMS(Te, Th, Ta * Td);
|
||||
T27 = FMA(Ta, Th, Te * Td);
|
||||
}
|
||||
Tj = T9 + Ti;
|
||||
T2S = T26 - T27;
|
||||
T28 = T26 + T27;
|
||||
T2R = T9 - Ti;
|
||||
}
|
||||
{
|
||||
E T1F, T2g, T1K, T2h;
|
||||
{
|
||||
E T1B, T1D, T1G, T1I;
|
||||
T1B = W[4];
|
||||
T1D = W[5];
|
||||
T1F = FMA(T1B, T1C, T1D * T1E);
|
||||
T2g = FNMS(T1D, T1C, T1B * T1E);
|
||||
T1G = W[20];
|
||||
T1I = W[21];
|
||||
T1K = FMA(T1G, T1H, T1I * T1J);
|
||||
T2h = FNMS(T1I, T1H, T1G * T1J);
|
||||
}
|
||||
T1L = T1F + T1K;
|
||||
T36 = T2g - T2h;
|
||||
T2i = T2g + T2h;
|
||||
T32 = T1K - T1F;
|
||||
}
|
||||
}
|
||||
/* Load elements 7 and 3; uses twiddles W[26..29] and W[10..13]. */
{
|
||||
E Tn, T1P, Tr, T1N, Tw, T1U, TA, T1S;
|
||||
{
|
||||
E Tl, Tm, Tp, Tq;
|
||||
Tl = Ip[WS(rs, 7)];
|
||||
Tm = Im[WS(rs, 7)];
|
||||
Tn = Tl - Tm;
|
||||
T1P = Tl + Tm;
|
||||
Tp = Rp[WS(rs, 7)];
|
||||
Tq = Rm[WS(rs, 7)];
|
||||
Tr = Tp + Tq;
|
||||
T1N = Tp - Tq;
|
||||
}
|
||||
{
|
||||
E Tu, Tv, Ty, Tz;
|
||||
Tu = Ip[WS(rs, 3)];
|
||||
Tv = Im[WS(rs, 3)];
|
||||
Tw = Tu - Tv;
|
||||
T1U = Tu + Tv;
|
||||
Ty = Rp[WS(rs, 3)];
|
||||
Tz = Rm[WS(rs, 3)];
|
||||
TA = Ty + Tz;
|
||||
T1S = Ty - Tz;
|
||||
}
|
||||
{
|
||||
E Ts, T29, TB, T2a;
|
||||
{
|
||||
E Tk, To, Tt, Tx;
|
||||
Tk = W[26];
|
||||
To = W[27];
|
||||
Ts = FNMS(To, Tr, Tk * Tn);
|
||||
T29 = FMA(Tk, Tr, To * Tn);
|
||||
Tt = W[10];
|
||||
Tx = W[11];
|
||||
TB = FNMS(Tx, TA, Tt * Tw);
|
||||
T2a = FMA(Tt, TA, Tx * Tw);
|
||||
}
|
||||
TC = Ts + TB;
|
||||
T2V = Ts - TB;
|
||||
T2b = T29 + T2a;
|
||||
T2U = T29 - T2a;
|
||||
}
|
||||
{
|
||||
E T1Q, T2j, T1V, T2k;
|
||||
{
|
||||
E T1M, T1O, T1R, T1T;
|
||||
T1M = W[28];
|
||||
T1O = W[29];
|
||||
T1Q = FMA(T1M, T1N, T1O * T1P);
|
||||
T2j = FNMS(T1O, T1N, T1M * T1P);
|
||||
T1R = W[12];
|
||||
T1T = W[13];
|
||||
T1V = FMA(T1R, T1S, T1T * T1U);
|
||||
T2k = FNMS(T1T, T1S, T1R * T1U);
|
||||
}
|
||||
T1W = T1Q + T1V;
|
||||
T35 = T1V - T1Q;
|
||||
T2l = T2j + T2k;
|
||||
T33 = T2j - T2k;
|
||||
}
|
||||
}
|
||||
/* Stores to Rp/Ip elements 0 and 4 and Rm/Im elements 3 and 7. */
{
|
||||
E T1b, T2f, T2u, T2w, T1Y, T2e, T2d, T2v;
|
||||
{
|
||||
E TD, T1a, T2m, T2t;
|
||||
TD = Tj + TC;
|
||||
T1a = TW + T19;
|
||||
T1b = TD + T1a;
|
||||
T2f = T1a - TD;
|
||||
T2m = T2i + T2l;
|
||||
T2t = T2p + T2s;
|
||||
T2u = T2m - T2t;
|
||||
T2w = T2m + T2t;
|
||||
}
|
||||
{
|
||||
E T1A, T1X, T25, T2c;
|
||||
T1A = T1o - T1z;
|
||||
T1X = T1L + T1W;
|
||||
T1Y = T1A - T1X;
|
||||
T2e = T1X + T1A;
|
||||
T25 = T21 + T24;
|
||||
T2c = T28 + T2b;
|
||||
T2d = T25 - T2c;
|
||||
T2v = T25 + T2c;
|
||||
}
|
||||
Ip[0] = KP500000000 * (T1b + T1Y);
|
||||
Rp[0] = KP500000000 * (T2v + T2w);
|
||||
Im[WS(rs, 7)] = KP500000000 * (T1Y - T1b);
|
||||
Rm[WS(rs, 7)] = KP500000000 * (T2v - T2w);
|
||||
Rm[WS(rs, 3)] = KP500000000 * (T2d - T2e);
|
||||
Im[WS(rs, 3)] = KP500000000 * (T2u - T2f);
|
||||
Rp[WS(rs, 4)] = KP500000000 * (T2d + T2e);
|
||||
Ip[WS(rs, 4)] = KP500000000 * (T2f + T2u);
|
||||
}
|
||||
/* Stores to Rp/Ip elements 2 and 6 and Rm/Im elements 1 and 5. */
{
|
||||
E T2z, T2L, T2J, T2P, T2C, T2M, T2F, T2N;
|
||||
{
|
||||
E T2x, T2y, T2H, T2I;
|
||||
T2x = T2b - T28;
|
||||
T2y = T19 - TW;
|
||||
T2z = KP500000000 * (T2x + T2y);
|
||||
T2L = KP500000000 * (T2y - T2x);
|
||||
T2H = T21 - T24;
|
||||
T2I = Tj - TC;
|
||||
T2J = KP500000000 * (T2H - T2I);
|
||||
T2P = KP500000000 * (T2H + T2I);
|
||||
}
|
||||
{
|
||||
E T2A, T2B, T2D, T2E;
|
||||
T2A = T2l - T2i;
|
||||
T2B = T1L - T1W;
|
||||
T2C = T2A + T2B;
|
||||
T2M = T2A - T2B;
|
||||
T2D = T1z + T1o;
|
||||
T2E = T2s - T2p;
|
||||
T2F = T2D - T2E;
|
||||
T2N = T2D + T2E;
|
||||
}
|
||||
{
|
||||
E T2G, T2Q, T2K, T2O;
|
||||
T2G = KP353553390 * (T2C + T2F);
|
||||
Ip[WS(rs, 2)] = T2z + T2G;
|
||||
Im[WS(rs, 5)] = T2G - T2z;
|
||||
T2Q = KP353553390 * (T2M + T2N);
|
||||
Rm[WS(rs, 5)] = T2P - T2Q;
|
||||
Rp[WS(rs, 2)] = T2P + T2Q;
|
||||
T2K = KP353553390 * (T2F - T2C);
|
||||
Rm[WS(rs, 1)] = T2J - T2K;
|
||||
Rp[WS(rs, 6)] = T2J + T2K;
|
||||
T2O = KP353553390 * (T2M - T2N);
|
||||
Ip[WS(rs, 6)] = T2L + T2O;
|
||||
Im[WS(rs, 1)] = T2O - T2L;
|
||||
}
|
||||
}
|
||||
/* Remaining stores: Rp/Ip at elements 1, 3, 5, 7 and
 * Rm/Im at elements 0, 2, 4, 6. */
{
|
||||
E T30, T3w, T3F, T3j, T2X, T3G, T3D, T3L, T3m, T3v, T38, T3q, T3A, T3K, T3f;
|
||||
E T3r;
|
||||
{
|
||||
E T2T, T2W, T34, T37;
|
||||
T30 = KP500000000 * (T2Y - T2Z);
|
||||
T3w = KP500000000 * (T2Z + T2Y);
|
||||
T3F = KP500000000 * (T3h - T3i);
|
||||
T3j = KP500000000 * (T3h + T3i);
|
||||
T2T = T2R - T2S;
|
||||
T2W = T2U + T2V;
|
||||
T2X = KP353553390 * (T2T + T2W);
|
||||
T3G = KP353553390 * (T2T - T2W);
|
||||
{
|
||||
E T3B, T3C, T3k, T3l;
|
||||
T3B = T3a + T39;
|
||||
T3C = T3d - T3c;
|
||||
T3D = FNMS(KP461939766, T3C, KP191341716 * T3B);
|
||||
T3L = FMA(KP461939766, T3B, KP191341716 * T3C);
|
||||
T3k = T2S + T2R;
|
||||
T3l = T2U - T2V;
|
||||
T3m = KP353553390 * (T3k + T3l);
|
||||
T3v = KP353553390 * (T3l - T3k);
|
||||
}
|
||||
T34 = T32 + T33;
|
||||
T37 = T35 - T36;
|
||||
T38 = FMA(KP191341716, T34, KP461939766 * T37);
|
||||
T3q = FNMS(KP191341716, T37, KP461939766 * T34);
|
||||
{
|
||||
E T3y, T3z, T3b, T3e;
|
||||
T3y = T33 - T32;
|
||||
T3z = T36 + T35;
|
||||
T3A = FMA(KP461939766, T3y, KP191341716 * T3z);
|
||||
T3K = FNMS(KP461939766, T3z, KP191341716 * T3y);
|
||||
T3b = T39 - T3a;
|
||||
T3e = T3c + T3d;
|
||||
T3f = FNMS(KP191341716, T3e, KP461939766 * T3b);
|
||||
T3r = FMA(KP191341716, T3b, KP461939766 * T3e);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T31, T3g, T3t, T3u;
|
||||
T31 = T2X + T30;
|
||||
T3g = T38 + T3f;
|
||||
Ip[WS(rs, 1)] = T31 + T3g;
|
||||
Im[WS(rs, 6)] = T3g - T31;
|
||||
T3t = T3j + T3m;
|
||||
T3u = T3q + T3r;
|
||||
Rm[WS(rs, 6)] = T3t - T3u;
|
||||
Rp[WS(rs, 1)] = T3t + T3u;
|
||||
}
|
||||
{
|
||||
E T3n, T3o, T3p, T3s;
|
||||
T3n = T3j - T3m;
|
||||
T3o = T3f - T38;
|
||||
Rm[WS(rs, 2)] = T3n - T3o;
|
||||
Rp[WS(rs, 5)] = T3n + T3o;
|
||||
T3p = T30 - T2X;
|
||||
T3s = T3q - T3r;
|
||||
Ip[WS(rs, 5)] = T3p + T3s;
|
||||
Im[WS(rs, 2)] = T3s - T3p;
|
||||
}
|
||||
{
|
||||
E T3x, T3E, T3N, T3O;
|
||||
T3x = T3v + T3w;
|
||||
T3E = T3A + T3D;
|
||||
Ip[WS(rs, 3)] = T3x + T3E;
|
||||
Im[WS(rs, 4)] = T3E - T3x;
|
||||
T3N = T3F + T3G;
|
||||
T3O = T3K + T3L;
|
||||
Rm[WS(rs, 4)] = T3N - T3O;
|
||||
Rp[WS(rs, 3)] = T3N + T3O;
|
||||
}
|
||||
{
|
||||
E T3H, T3I, T3J, T3M;
|
||||
T3H = T3F - T3G;
|
||||
T3I = T3D - T3A;
|
||||
Rm[0] = T3H - T3I;
|
||||
Rp[WS(rs, 7)] = T3H + T3I;
|
||||
T3J = T3w - T3v;
|
||||
T3M = T3K - T3L;
|
||||
Ip[WS(rs, 7)] = T3J + T3M;
|
||||
Im[0] = T3M - T3J;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by `desc` below: one TW_FULL entry
 * carrying the size 16, followed by one TW_NEXT entry.
 * NOTE(review): the TW_* opcodes are defined outside this file; presumably
 * TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: size 16, the name string, the twiddle table above,
 * the GENUS object (declared elsewhere) and four counters. The first three
 * counters (168, 62, 38) equal the additions / multiplications / fused
 * multiply-add figures quoted in the generated statistics comment of this
 * non-FMA variant.
 */
static const hc2c_desc desc = { 16, "hc2cfdft_16", twinstr, &GENUS, { 168, 62, 38, 0 } };
|
||||
|
||||
/* Registers the hc2cfdft_16 kernel and its descriptor with planner `p`,
 * passing the HC2C_VIA_DFT flag to khc2c_register. */
void X(codelet_hc2cfdft_16) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_16, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
133
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_2.c
Normal file
133
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_2.c
Normal file
@@ -0,0 +1,133 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cfdft_2 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 10 FP additions, 8 FP multiplications,
|
||||
* (or, 8 additions, 6 multiplications, 2 fused multiply/add),
|
||||
* 16 stack variables, 1 constant, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_2 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (see the enclosing #if).
 *
 * Generated size-2 kernel. For every m in [mb, me) it reads Rp[0], Ip[0],
 * Rm[0] and Im[0], combines them with the twiddle pair W[0], W[1], and
 * overwrites the same four locations, each result scaled by 1/2.
 *
 * Parameters:
 *   Rp, Ip - arrays stepped by +ms after each iteration
 *   Rm, Im - arrays stepped by -ms after each iteration
 *   W      - twiddle table; offset by (mb - 1) * 2 on entry, then advanced
 *            by 2 per iteration
 *   rs     - only handed to MAKE_VOLATILE_STRIDE in this kernel
 *   mb, me - half-open iteration range
 *   ms     - per-iteration pointer step
 *
 * NOTE(review): presumably FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the header definitions.
 */
static void hc2cfdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
|
||||
E T3, Ta, Tc, T9, Td, T4, T8, Tb, Te;
|
||||
/* Load the four inputs, form their sums/differences, and start the
 * twiddle products with W[0] and W[1]. */
{
|
||||
E T1, T2, T5, T6, T7;
|
||||
T1 = Ip[0];
|
||||
T2 = Im[0];
|
||||
T3 = T1 - T2;
|
||||
Ta = T1 + T2;
|
||||
T5 = Rm[0];
|
||||
T6 = Rp[0];
|
||||
T7 = T5 - T6;
|
||||
Tc = T6 + T5;
|
||||
T9 = W[1];
|
||||
Td = T9 * T7;
|
||||
T4 = W[0];
|
||||
T8 = T4 * T7;
|
||||
}
|
||||
/* Finish the twiddle products and store the halved results in place. */
Tb = FNMS(T9, Ta, T8);
|
||||
Ip[0] = KP500000000 * (T3 + Tb);
|
||||
Im[0] = KP500000000 * (Tb - T3);
|
||||
Te = FMA(T4, Ta, Td);
|
||||
Rm[0] = KP500000000 * (Tc - Te);
|
||||
Rp[0] = KP500000000 * (Tc + Te);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by `desc` below: one TW_FULL entry
 * carrying the size 2, followed by one TW_NEXT entry.
 * NOTE(review): the TW_* opcodes are defined outside this file; presumably
 * TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: size 2, the name string, the twiddle table above,
 * the GENUS object (declared elsewhere) and four counters. The first three
 * counters (8, 6, 2) equal the additions / multiplications / fused
 * multiply-add figures quoted in the generated statistics comment of this
 * FMA variant.
 */
static const hc2c_desc desc = { 2, "hc2cfdft_2", twinstr, &GENUS, { 8, 6, 2, 0 } };
|
||||
|
||||
/* Registers the hc2cfdft_2 kernel and its descriptor with planner `p`,
 * passing the HC2C_VIA_DFT flag to khc2c_register. */
void X(codelet_hc2cfdft_2) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_2, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hc2cfdft_2 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 10 FP additions, 8 FP multiplications,
|
||||
* (or, 8 additions, 6 multiplications, 2 fused multiply/add),
|
||||
* 10 stack variables, 1 constant, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_2 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * Generated size-2 kernel. For every m in [mb, me) it reads Rp[0], Ip[0],
 * Rm[0] and Im[0], combines them with the twiddle pair W[0], W[1], and
 * overwrites the same four locations, each result scaled by 1/2.
 *
 * Parameters:
 *   Rp, Ip - arrays stepped by +ms after each iteration
 *   Rm, Im - arrays stepped by -ms after each iteration
 *   W      - twiddle table; offset by (mb - 1) * 2 on entry, then advanced
 *            by 2 per iteration
 *   rs     - only handed to MAKE_VOLATILE_STRIDE in this kernel
 *   mb, me - half-open iteration range
 *   ms     - per-iteration pointer step
 *
 * NOTE(review): presumably FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the header definitions.
 */
static void hc2cfdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + ((mb - 1) * 2); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 2, MAKE_VOLATILE_STRIDE(8, rs)) {
|
||||
E T3, T9, T7, Tb;
|
||||
/* Load the four inputs and form their sums/differences. */
{
|
||||
E T1, T2, T5, T6;
|
||||
T1 = Ip[0];
|
||||
T2 = Im[0];
|
||||
T3 = T1 - T2;
|
||||
T9 = T1 + T2;
|
||||
T5 = Rm[0];
|
||||
T6 = Rp[0];
|
||||
T7 = T5 - T6;
|
||||
Tb = T6 + T5;
|
||||
}
|
||||
/* Apply the twiddle pair W[0], W[1] and store the halved results in place. */
{
|
||||
E Ta, Tc, T4, T8;
|
||||
T4 = W[0];
|
||||
T8 = W[1];
|
||||
Ta = FNMS(T8, T9, T4 * T7);
|
||||
Tc = FMA(T8, T7, T4 * T9);
|
||||
Ip[0] = KP500000000 * (T3 + Ta);
|
||||
Rp[0] = KP500000000 * (Tb + Tc);
|
||||
Im[0] = KP500000000 * (Ta - T3);
|
||||
Rm[0] = KP500000000 * (Tb - Tc);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Twiddle-instruction table referenced by `desc` below: one TW_FULL entry
 * carrying the size 2, followed by one TW_NEXT entry.
 * NOTE(review): the TW_* opcodes are defined outside this file; presumably
 * TW_NEXT terminates the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Codelet descriptor: size 2, the name string, the twiddle table above,
 * the GENUS object (declared elsewhere) and four counters. The first three
 * counters (8, 6, 2) equal the additions / multiplications / fused
 * multiply-add figures quoted in the generated statistics comment of this
 * non-FMA variant.
 */
static const hc2c_desc desc = { 2, "hc2cfdft_2", twinstr, &GENUS, { 8, 6, 2, 0 } };
|
||||
|
||||
/* Registers the hc2cfdft_2 kernel and its descriptor with planner `p`,
 * passing the HC2C_VIA_DFT flag to khc2c_register. */
void X(codelet_hc2cfdft_2) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_2, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
1155
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_20.c
Normal file
1155
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1983
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_32.c
Normal file
1983
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_32.c
Normal file
File diff suppressed because it is too large
Load Diff
218
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_4.c
Normal file
218
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_4.c
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cfdft_4 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 30 FP additions, 20 FP multiplications,
|
||||
* (or, 24 additions, 14 multiplications, 6 fused multiply/add),
|
||||
* 31 stack variables, 1 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_4 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) one loop iteration
 *   - loads the eight values Rp/Ip/Rm/Im at [0] and [WS(rs, 1)],
 *   - combines them with the six twiddle values W[0] .. W[5],
 *   - stores eight results back into those same eight locations, each
 *     scaled by KP500000000 (0.5).
 * All data loads of an iteration happen before its first store.
 *
 * Rp, Ip : advanced by +ms after each iteration.
 * Rm, Im : advanced by -ms after each iteration.
 * W      : offset by (mb - 1) * 6 on entry, then +6 per iteration.
 * rs     : stride passed to WS() to address the second element.
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() come from the
 * included headers.  Statement order and expression shapes are generator
 * output and are reproduced unchanged.
 */
static void hc2cfdft_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               E Td, Tl, Tu, Tk, TC, Tf, Tj, T4, Tr, T9, To, T5, Tv, Tp, TA;
               E Tb, Tc;
               /* element [0] of Ip/Im: difference and sum */
               Tb = Ip[0];
               Tc = Im[0];
               Td = Tb - Tc;
               Tl = Tb + Tc;
               {
                    E Tg, Th, Ti, T1, Tn;
                    /* element [0] of Rm/Rp and first products with W[0], W[1] */
                    Tg = Rm[0];
                    Th = Rp[0];
                    Ti = Tg - Th;
                    Tu = Th + Tg;
                    Tk = W[1];
                    TC = Tk * Ti;
                    Tf = W[0];
                    Tj = Tf * Ti;
                    {
                         E T2, T3, T7, T8;
                         /* element [WS(rs, 1)] of all four arrays */
                         T2 = Ip[WS(rs, 1)];
                         T3 = Im[WS(rs, 1)];
                         T4 = T2 - T3;
                         Tr = T2 + T3;
                         T7 = Rp[WS(rs, 1)];
                         T8 = Rm[WS(rs, 1)];
                         T9 = T7 + T8;
                         To = T7 - T8;
                    }
                    T1 = W[2];
                    T5 = T1 * T4;
                    Tv = T1 * T9;
                    Tn = W[4];
                    Tp = Tn * To;
                    TA = Tn * Tr;
               }
               {
                    E Tm, TD, Ta, Tw, Ts, TB, T6, Tq;
                    /* finish the twiddle products started above */
                    Tm = FNMS(Tk, Tl, Tj);
                    TD = FMA(Tf, Tl, TC);
                    T6 = W[3];
                    Ta = FNMS(T6, T9, T5);
                    Tw = FMA(T6, T4, Tv);
                    Tq = W[5];
                    Ts = FMA(Tq, Tr, Tp);
                    TB = FNMS(Tq, To, TA);
                    {
                         E Te, Tt, TF, TG;
                         Te = Ta + Td;
                         Tt = Tm - Ts;
                         Ip[0] = KP500000000 * (Te + Tt);
                         Im[WS(rs, 1)] = KP500000000 * (Tt - Te);
                         TF = Tu + Tw;
                         TG = TB + TD;
                         Rm[WS(rs, 1)] = KP500000000 * (TF - TG);
                         Rp[0] = KP500000000 * (TF + TG);
                    }
                    {
                         E Tx, Ty, Tz, TE;
                         Tx = Tu - Tw;
                         Ty = Ts + Tm;
                         Rm[0] = KP500000000 * (Tx - Ty);
                         Rp[WS(rs, 1)] = KP500000000 * (Tx + Ty);
                         Tz = Td - Ta;
                         TE = TB - TD;
                         Ip[WS(rs, 1)] = KP500000000 * (Tz + TE);
                         Im[0] = KP500000000 * (TE - Tz);
                    }
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 4, "hc2cfdft_4", twinstr, &GENUS, { 24, 14, 6, 0 } };
|
||||
|
||||
void X(codelet_hc2cfdft_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_4, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hc2cfdft_4 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 30 FP additions, 20 FP multiplications,
|
||||
* (or, 24 additions, 14 multiplications, 6 fused multiply/add),
|
||||
* 18 stack variables, 1 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_4 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * For every m in [mb, me) one loop iteration
 *   - loads the eight values Rp/Ip/Rm/Im at [0] and [WS(rs, 1)],
 *   - combines them with the six twiddle values W[0] .. W[5],
 *   - stores eight results back into those same eight locations, each
 *     scaled by KP500000000 (0.5).
 * All data loads of an iteration happen before its first store.
 *
 * Rp, Ip : advanced by +ms after each iteration.
 * Rm, Im : advanced by -ms after each iteration.
 * W      : offset by (mb - 1) * 6 on entry, then +6 per iteration.
 * rs     : stride passed to WS() to address the second element.
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() come from the
 * included headers.  Statement order and expression shapes are generator
 * output and are reproduced unchanged.
 */
static void hc2cfdft_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               E Tc, Tr, Tk, Tx, T9, Ts, Tp, Tw;
               {
                    E Ta, Tb, Tj, Tf, Tg, Th, Te, Ti;
                    /* element [0] of all four arrays, combined with W[0], W[1] */
                    Ta = Ip[0];
                    Tb = Im[0];
                    Tj = Ta + Tb;
                    Tf = Rm[0];
                    Tg = Rp[0];
                    Th = Tf - Tg;
                    Tc = Ta - Tb;
                    Tr = Tg + Tf;
                    Te = W[0];
                    Ti = W[1];
                    Tk = FNMS(Ti, Tj, Te * Th);
                    Tx = FMA(Ti, Th, Te * Tj);
               }
               {
                    E T4, To, T8, Tm;
                    {
                         E T2, T3, T6, T7;
                         /* element [WS(rs, 1)] of all four arrays */
                         T2 = Ip[WS(rs, 1)];
                         T3 = Im[WS(rs, 1)];
                         T4 = T2 - T3;
                         To = T2 + T3;
                         T6 = Rp[WS(rs, 1)];
                         T7 = Rm[WS(rs, 1)];
                         T8 = T6 + T7;
                         Tm = T6 - T7;
                    }
                    {
                         E T1, T5, Tl, Tn;
                         /* combined with W[2], W[3] and W[4], W[5] */
                         T1 = W[2];
                         T5 = W[3];
                         T9 = FNMS(T5, T8, T1 * T4);
                         Ts = FMA(T1, T8, T5 * T4);
                         Tl = W[4];
                         Tn = W[5];
                         Tp = FMA(Tl, Tm, Tn * To);
                         Tw = FNMS(Tn, Tm, Tl * To);
                    }
               }
               {
                    E Td, Tq, Tz, TA;
                    Td = T9 + Tc;
                    Tq = Tk - Tp;
                    Ip[0] = KP500000000 * (Td + Tq);
                    Im[WS(rs, 1)] = KP500000000 * (Tq - Td);
                    Tz = Tr + Ts;
                    TA = Tw + Tx;
                    Rm[WS(rs, 1)] = KP500000000 * (Tz - TA);
                    Rp[0] = KP500000000 * (Tz + TA);
               }
               {
                    E Tt, Tu, Tv, Ty;
                    Tt = Tr - Ts;
                    Tu = Tp + Tk;
                    Rm[0] = KP500000000 * (Tt - Tu);
                    Rp[WS(rs, 1)] = KP500000000 * (Tt + Tu);
                    Tv = Tc - T9;
                    Ty = Tw - Tx;
                    Ip[WS(rs, 1)] = KP500000000 * (Tv + Ty);
                    Im[0] = KP500000000 * (Ty - Tv);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 4, "hc2cfdft_4", twinstr, &GENUS, { 24, 14, 6, 0 } };
|
||||
|
||||
void X(codelet_hc2cfdft_4) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_4, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
339
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_6.c
Normal file
339
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_6.c
Normal file
@@ -0,0 +1,339 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cfdft_6 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 58 FP additions, 44 FP multiplications,
|
||||
* (or, 36 additions, 22 multiplications, 22 fused multiply/add),
|
||||
* 27 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_6 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) one loop iteration
 *   - loads the twelve values Rp/Ip/Rm/Im at [0], [WS(rs, 1)] and
 *     [WS(rs, 2)],
 *   - combines them with the ten twiddle values W[0] .. W[9],
 *   - stores twelve results back into those same twelve locations.
 * All data loads of an iteration happen before its first store.
 *
 * Rp, Ip : advanced by +ms after each iteration.
 * Rm, Im : advanced by -ms after each iteration.
 * W      : offset by (mb - 1) * 10 on entry, then +10 per iteration.
 * rs     : stride passed to WS() to address elements 1 and 2.
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() come from the
 * included headers.  Statement order and expression shapes are generator
 * output and are reproduced unchanged.
 */
static void hc2cfdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
               E T3, TQ, TJ, T12, Tu, TX, TB, T10, Td, TS, Tk, TV;
               {
                    E T1, T2, TI, TD, TE, TF;
                    /* element [0], combined with W[0], W[1] */
                    T1 = Ip[0];
                    T2 = Im[0];
                    TI = T1 + T2;
                    TD = Rm[0];
                    TE = Rp[0];
                    TF = TD - TE;
                    T3 = T1 - T2;
                    TQ = TE + TD;
                    {
                         E TC, TG, TH, T11;
                         TC = W[0];
                         TG = TC * TF;
                         TH = W[1];
                         T11 = TH * TF;
                         TJ = FNMS(TH, TI, TG);
                         T12 = FMA(TC, TI, T11);
                    }
               }
               {
                    E To, TA, Tt, Tx;
                    {
                         E Tm, Tn, Tr, Ts;
                         /* element [WS(rs, 2)], combined with W[6] .. W[9] */
                         Tm = Rm[WS(rs, 2)];
                         Tn = Rp[WS(rs, 2)];
                         To = Tm - Tn;
                         TA = Tn + Tm;
                         Tr = Ip[WS(rs, 2)];
                         Ts = Im[WS(rs, 2)];
                         Tt = Tr + Ts;
                         Tx = Tr - Ts;
                    }
                    {
                         E Tp, TW, Tl, Tq;
                         Tl = W[8];
                         Tp = Tl * To;
                         TW = Tl * Tt;
                         Tq = W[9];
                         Tu = FNMS(Tq, Tt, Tp);
                         TX = FMA(Tq, To, TW);
                    }
                    {
                         E Tw, Ty, Tz, TZ;
                         Tw = W[6];
                         Ty = Tw * Tx;
                         Tz = W[7];
                         TZ = Tz * Tx;
                         TB = FNMS(Tz, TA, Ty);
                         T10 = FMA(Tw, TA, TZ);
                    }
               }
               {
                    E T7, Tg, Tc, Tj;
                    {
                         E T5, T6, Ta, Tb;
                         /* element [WS(rs, 1)], combined with W[2] .. W[5] */
                         T5 = Ip[WS(rs, 1)];
                         T6 = Im[WS(rs, 1)];
                         T7 = T5 + T6;
                         Tg = T5 - T6;
                         Ta = Rp[WS(rs, 1)];
                         Tb = Rm[WS(rs, 1)];
                         Tc = Ta - Tb;
                         Tj = Ta + Tb;
                    }
                    {
                         E T4, T8, T9, TR;
                         T4 = W[5];
                         T8 = T4 * T7;
                         T9 = W[4];
                         TR = T9 * T7;
                         Td = FMA(T9, Tc, T8);
                         TS = FNMS(T4, Tc, TR);
                    }
                    {
                         E Tf, Th, Ti, TU;
                         Tf = W[2];
                         Th = Tf * Tg;
                         Ti = W[3];
                         TU = Ti * Tg;
                         Tk = FNMS(Ti, Tj, Th);
                         TV = FMA(Tf, Tj, TU);
                    }
               }
               {
                    E Te, T1d, TL, T1g, T1c, T1e, T19, T1f;
                    /* first group of six stores */
                    Te = T3 - Td;
                    T1d = TQ + TS;
                    {
                         E Tv, TK, T1a, T1b;
                         Tv = Tk + Tu;
                         TK = TB + TJ;
                         TL = Tv + TK;
                         T1g = Tv - TK;
                         T1a = TV + TX;
                         T1b = T10 + T12;
                         T1c = T1a - T1b;
                         T1e = T1a + T1b;
                    }
                    Ip[0] = KP500000000 * (Te + TL);
                    Rp[0] = KP500000000 * (T1d + T1e);
                    T19 = FNMS(KP500000000, TL, Te);
                    Ip[WS(rs, 2)] = KP500000000 * (FMA(KP866025403, T1c, T19));
                    Im[WS(rs, 1)] = -(KP500000000 * (FNMS(KP866025403, T1c, T19)));
                    T1f = FNMS(KP500000000, T1e, T1d);
                    Rp[WS(rs, 2)] = KP500000000 * (FNMS(KP866025403, T1g, T1f));
                    Rm[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T1g, T1f));
               }
               {
                    E TP, TT, TO, T16, T14, T18, T15, T17;
                    /* second group of six stores */
                    TP = Td + T3;
                    TT = TQ - TS;
                    {
                         E TM, TN, TY, T13;
                         TM = Tu - Tk;
                         TN = TJ - TB;
                         TO = TM + TN;
                         T16 = TN - TM;
                         TY = TV - TX;
                         T13 = T10 - T12;
                         T14 = TY + T13;
                         T18 = T13 - TY;
                    }
                    Im[WS(rs, 2)] = KP500000000 * (TO - TP);
                    Rm[WS(rs, 2)] = KP500000000 * (TT + T14);
                    T15 = FNMS(KP500000000, T14, TT);
                    Rp[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T16, T15));
                    Rm[0] = KP500000000 * (FNMS(KP866025403, T16, T15));
                    T17 = FMA(KP500000000, TO, TP);
                    Ip[WS(rs, 1)] = KP500000000 * (FMA(KP866025403, T18, T17));
                    Im[0] = -(KP500000000 * (FNMS(KP866025403, T18, T17)));
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 6, "hc2cfdft_6", twinstr, &GENUS, { 36, 22, 22, 0 } };
|
||||
|
||||
void X(codelet_hc2cfdft_6) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_6, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hc2cfdft_6 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 58 FP additions, 36 FP multiplications,
|
||||
* (or, 44 additions, 22 multiplications, 14 fused multiply/add),
|
||||
* 40 stack variables, 3 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_6 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * For every m in [mb, me) one loop iteration
 *   - loads the twelve values Rp/Ip/Rm/Im at [0], [WS(rs, 1)] and
 *     [WS(rs, 2)],
 *   - combines them with the ten twiddle values W[0] .. W[9],
 *   - stores twelve results back into those same twelve locations.
 * All data loads of an iteration happen before its first store.
 *
 * Rp, Ip : advanced by +ms after each iteration.
 * Rm, Im : advanced by -ms after each iteration.
 * W      : offset by (mb - 1) * 10 on entry, then +10 per iteration.
 * rs     : stride passed to WS() to address elements 1 and 2.
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() come from the
 * included headers.  Statement order and expression shapes are generator
 * output and are reproduced unchanged.
 */
static void hc2cfdft_6(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP433012701, +0.433012701892219323381861585376468091735701313);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * 10); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 10, MAKE_VOLATILE_STRIDE(24, rs)) {
               E T3, TM, Tc, TN, Ts, T10, TI, TR, TF, T11, TH, TU;
               {
                    E T1, T2, TD, Tz, TA, TB, T7, Tf, Tb, Th, Tq, Tw, Tm, Tu, T4;
                    E T8;
                    {
                         E T5, T6, T9, Ta;
                         /* load all twelve inputs and form sums/differences */
                         T1 = Ip[0];
                         T2 = Im[0];
                         TD = T1 + T2;
                         Tz = Rm[0];
                         TA = Rp[0];
                         TB = Tz - TA;
                         T5 = Ip[WS(rs, 1)];
                         T6 = Im[WS(rs, 1)];
                         T7 = T5 + T6;
                         Tf = T5 - T6;
                         T9 = Rp[WS(rs, 1)];
                         Ta = Rm[WS(rs, 1)];
                         Tb = T9 - Ta;
                         Th = T9 + Ta;
                         {
                              E To, Tp, Tk, Tl;
                              To = Rp[WS(rs, 2)];
                              Tp = Rm[WS(rs, 2)];
                              Tq = To - Tp;
                              Tw = To + Tp;
                              Tk = Ip[WS(rs, 2)];
                              Tl = Im[WS(rs, 2)];
                              Tm = Tk + Tl;
                              Tu = Tk - Tl;
                         }
                    }
                    T3 = T1 - T2;
                    TM = TA + Tz;
                    /* twiddle products with W[4], W[5] */
                    T4 = W[5];
                    T8 = W[4];
                    Tc = FMA(T4, T7, T8 * Tb);
                    TN = FNMS(T4, Tb, T8 * T7);
                    {
                         E Ti, TP, Tr, TQ;
                         {
                              E Te, Tg, Tj, Tn;
                              /* twiddle products with W[2], W[3], W[8], W[9] */
                              Te = W[2];
                              Tg = W[3];
                              Ti = FNMS(Tg, Th, Te * Tf);
                              TP = FMA(Tg, Tf, Te * Th);
                              Tj = W[9];
                              Tn = W[8];
                              Tr = FMA(Tj, Tm, Tn * Tq);
                              TQ = FNMS(Tj, Tq, Tn * Tm);
                         }
                         Ts = Ti - Tr;
                         T10 = TP + TQ;
                         TI = Ti + Tr;
                         TR = TP - TQ;
                    }
                    {
                         E Tx, TS, TE, TT;
                         {
                              E Tt, Tv, Ty, TC;
                              /* twiddle products with W[6], W[7], W[0], W[1] */
                              Tt = W[6];
                              Tv = W[7];
                              Tx = FNMS(Tv, Tw, Tt * Tu);
                              TS = FMA(Tv, Tu, Tt * Tw);
                              Ty = W[0];
                              TC = W[1];
                              TE = FNMS(TC, TD, Ty * TB);
                              TT = FMA(TC, TB, Ty * TD);
                         }
                         TF = Tx + TE;
                         T11 = TS + TT;
                         TH = TE - Tx;
                         TU = TS - TT;
                    }
               }
               {
                    E T12, Td, TG, TZ;
                    T12 = KP433012701 * (T10 - T11);
                    Td = T3 - Tc;
                    TG = Ts + TF;
                    TZ = FNMS(KP250000000, TG, KP500000000 * Td);
                    Ip[0] = KP500000000 * (Td + TG);
                    Im[WS(rs, 1)] = T12 - TZ;
                    Ip[WS(rs, 2)] = TZ + T12;
               }
               {
                    E T16, T13, T14, T15;
                    T16 = KP433012701 * (Ts - TF);
                    T13 = TM + TN;
                    T14 = T10 + T11;
                    T15 = FNMS(KP250000000, T14, KP500000000 * T13);
                    Rp[WS(rs, 2)] = T15 - T16;
                    Rp[0] = KP500000000 * (T13 + T14);
                    Rm[WS(rs, 1)] = T16 + T15;
               }
               {
                    E TY, TJ, TK, TX;
                    TY = KP433012701 * (TU - TR);
                    TJ = TH - TI;
                    TK = Tc + T3;
                    TX = FMA(KP500000000, TK, KP250000000 * TJ);
                    Im[WS(rs, 2)] = KP500000000 * (TJ - TK);
                    Im[0] = TY - TX;
                    Ip[WS(rs, 1)] = TX + TY;
               }
               {
                    E TL, TO, TV, TW;
                    TL = KP433012701 * (TI + TH);
                    TO = TM - TN;
                    TV = TR + TU;
                    TW = FNMS(KP250000000, TV, KP500000000 * TO);
                    Rp[WS(rs, 1)] = TL + TW;
                    Rm[WS(rs, 2)] = KP500000000 * (TO + TV);
                    Rm[0] = TW - TL;
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 6, "hc2cfdft_6", twinstr, &GENUS, { 44, 22, 14, 0 } };
|
||||
|
||||
void X(codelet_hc2cfdft_6) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_6, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
437
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_8.c
Normal file
437
fftw-3.3.10/rdft/scalar/r2cf/hc2cfdft_8.c
Normal file
@@ -0,0 +1,437 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:36 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 82 FP additions, 52 FP multiplications,
|
||||
* (or, 60 additions, 30 multiplications, 22 fused multiply/add),
|
||||
* 31 stack variables, 2 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_8 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * For every m in [mb, me) one loop iteration
 *   - loads the sixteen values Rp/Ip/Rm/Im at [0] and [WS(rs, 1..3)],
 *   - combines them with the fourteen twiddle values W[0] .. W[13],
 *   - stores sixteen results back into those same sixteen locations.
 * All data loads of an iteration happen before its first store.
 *
 * Rp, Ip : advanced by +ms after each iteration.
 * Rm, Im : advanced by -ms after each iteration.
 * W      : offset by (mb - 1) * 14 on entry, then +14 per iteration.
 * rs     : stride passed to WS() to address elements 1, 2 and 3.
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() come from the
 * included headers.  Statement order and expression shapes are generator
 * output and are reproduced unchanged.
 */
static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
               E Ty, T14, TO, T1o, Tv, T16, TG, T1m, Ta, T19, TV, T1h, Tk, T1b, T11;
               E T1j;
               {
                    E Tw, Tx, TN, TI, TJ, TK;
                    /* element [0], combined with W[0], W[1] */
                    Tw = Ip[0];
                    Tx = Im[0];
                    TN = Tw + Tx;
                    TI = Rm[0];
                    TJ = Rp[0];
                    TK = TI - TJ;
                    Ty = Tw - Tx;
                    T14 = TJ + TI;
                    {
                         E TH, TL, TM, T1n;
                         TH = W[0];
                         TL = TH * TK;
                         TM = W[1];
                         T1n = TM * TK;
                         TO = FNMS(TM, TN, TL);
                         T1o = FMA(TH, TN, T1n);
                    }
               }
               {
                    E Tp, TF, Tu, TC;
                    {
                         E Tn, To, Ts, Tt;
                         /* element [WS(rs, 2)], combined with W[6] .. W[9] */
                         Tn = Ip[WS(rs, 2)];
                         To = Im[WS(rs, 2)];
                         Tp = Tn - To;
                         TF = Tn + To;
                         Ts = Rp[WS(rs, 2)];
                         Tt = Rm[WS(rs, 2)];
                         Tu = Ts + Tt;
                         TC = Tt - Ts;
                    }
                    {
                         E Tq, T15, Tm, Tr;
                         Tm = W[6];
                         Tq = Tm * Tp;
                         T15 = Tm * Tu;
                         Tr = W[7];
                         Tv = FNMS(Tr, Tu, Tq);
                         T16 = FMA(Tr, Tp, T15);
                    }
                    {
                         E TB, TD, TE, T1l;
                         TB = W[8];
                         TD = TB * TC;
                         TE = W[9];
                         T1l = TE * TC;
                         TG = FNMS(TE, TF, TD);
                         T1m = FMA(TB, TF, T1l);
                    }
               }
               {
                    E T4, TU, T9, TR;
                    {
                         E T2, T3, T7, T8;
                         /* element [WS(rs, 1)], combined with W[2] .. W[5] */
                         T2 = Ip[WS(rs, 1)];
                         T3 = Im[WS(rs, 1)];
                         T4 = T2 - T3;
                         TU = T2 + T3;
                         T7 = Rp[WS(rs, 1)];
                         T8 = Rm[WS(rs, 1)];
                         T9 = T7 + T8;
                         TR = T7 - T8;
                    }
                    {
                         E T5, T18, T1, T6;
                         T1 = W[2];
                         T5 = T1 * T4;
                         T18 = T1 * T9;
                         T6 = W[3];
                         Ta = FNMS(T6, T9, T5);
                         T19 = FMA(T6, T4, T18);
                    }
                    {
                         E TS, T1g, TQ, TT;
                         TQ = W[4];
                         TS = TQ * TR;
                         T1g = TQ * TU;
                         TT = W[5];
                         TV = FMA(TT, TU, TS);
                         T1h = FNMS(TT, TR, T1g);
                    }
               }
               {
                    E Te, T10, Tj, TX;
                    {
                         E Tc, Td, Th, Ti;
                         /* element [WS(rs, 3)], combined with W[10] .. W[13] */
                         Tc = Ip[WS(rs, 3)];
                         Td = Im[WS(rs, 3)];
                         Te = Tc - Td;
                         T10 = Tc + Td;
                         Th = Rp[WS(rs, 3)];
                         Ti = Rm[WS(rs, 3)];
                         Tj = Th + Ti;
                         TX = Th - Ti;
                    }
                    {
                         E Tf, T1a, Tb, Tg;
                         Tb = W[10];
                         Tf = Tb * Te;
                         T1a = Tb * Tj;
                         Tg = W[11];
                         Tk = FNMS(Tg, Tj, Tf);
                         T1b = FMA(Tg, Te, T1a);
                    }
                    {
                         E TY, T1i, TW, TZ;
                         TW = W[12];
                         TY = TW * TX;
                         T1i = TW * T10;
                         TZ = W[13];
                         T11 = FMA(TZ, T10, TY);
                         T1j = FNMS(TZ, TX, T1i);
                    }
               }
               {
                    E TA, T1f, T1q, T1s, T13, T1e, T1d, T1r;
                    /* first group of eight stores */
                    {
                         E Tl, Tz, T1k, T1p;
                         Tl = Ta + Tk;
                         Tz = Tv + Ty;
                         TA = Tl + Tz;
                         T1f = Tz - Tl;
                         T1k = T1h + T1j;
                         T1p = T1m + T1o;
                         T1q = T1k - T1p;
                         T1s = T1k + T1p;
                    }
                    {
                         E TP, T12, T17, T1c;
                         TP = TG + TO;
                         T12 = TV + T11;
                         T13 = TP - T12;
                         T1e = T12 + TP;
                         T17 = T14 + T16;
                         T1c = T19 + T1b;
                         T1d = T17 - T1c;
                         T1r = T17 + T1c;
                    }
                    Ip[0] = KP500000000 * (TA + T13);
                    Rp[0] = KP500000000 * (T1r + T1s);
                    Im[WS(rs, 3)] = KP500000000 * (T13 - TA);
                    Rm[WS(rs, 3)] = KP500000000 * (T1r - T1s);
                    Rm[WS(rs, 1)] = KP500000000 * (T1d - T1e);
                    Im[WS(rs, 1)] = KP500000000 * (T1q - T1f);
                    Rp[WS(rs, 2)] = KP500000000 * (T1d + T1e);
                    Ip[WS(rs, 2)] = KP500000000 * (T1f + T1q);
               }
               {
                    E T1v, T1H, T1F, T1L, T1y, T1I, T1B, T1J;
                    /* second group of eight stores */
                    {
                         E T1t, T1u, T1D, T1E;
                         T1t = Ty - Tv;
                         T1u = T19 - T1b;
                         T1v = T1t - T1u;
                         T1H = T1u + T1t;
                         T1D = T14 - T16;
                         T1E = Ta - Tk;
                         T1F = T1D - T1E;
                         T1L = T1D + T1E;
                    }
                    {
                         E T1w, T1x, T1z, T1A;
                         T1w = T1j - T1h;
                         T1x = TV - T11;
                         T1y = T1w + T1x;
                         T1I = T1w - T1x;
                         T1z = TO - TG;
                         T1A = T1o - T1m;
                         T1B = T1z - T1A;
                         T1J = T1z + T1A;
                    }
                    {
                         E T1C, T1M, T1G, T1K;
                         T1C = T1y + T1B;
                         Ip[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1C, T1v));
                         Im[WS(rs, 2)] = -(KP500000000 * (FNMS(KP707106781, T1C, T1v)));
                         T1M = T1I + T1J;
                         Rm[WS(rs, 2)] = KP500000000 * (FNMS(KP707106781, T1M, T1L));
                         Rp[WS(rs, 1)] = KP500000000 * (FMA(KP707106781, T1M, T1L));
                         T1G = T1B - T1y;
                         Rm[0] = KP500000000 * (FNMS(KP707106781, T1G, T1F));
                         Rp[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1G, T1F));
                         T1K = T1I - T1J;
                         Ip[WS(rs, 3)] = KP500000000 * (FMA(KP707106781, T1K, T1H));
                         Im[0] = -(KP500000000 * (FNMS(KP707106781, T1K, T1H)));
                    }
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, { 60, 30, 22, 0 } };
|
||||
|
||||
void X(codelet_hc2cfdft_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2cdft.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hc2cfdft_8 -include rdft/scalar/hc2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 82 FP additions, 44 FP multiplications,
|
||||
* (or, 68 additions, 30 multiplications, 14 fused multiply/add),
|
||||
* 39 stack variables, 2 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hc2cf.h"
|
||||
|
||||
/*
 * hc2cfdft_8 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (the #else branch).
 *
 * For every m in [mb, me) one loop iteration
 *   - loads the sixteen values Rp/Ip/Rm/Im at [0] and [WS(rs, 1..3)],
 *   - combines them with the fourteen twiddle values W[0] .. W[13],
 *   - stores sixteen results back into those same sixteen locations.
 * All data loads of an iteration happen before its first store.
 *
 * Rp, Ip : advanced by +ms after each iteration.
 * Rm, Im : advanced by -ms after each iteration.
 * W      : offset by (mb - 1) * 14 on entry, then +14 per iteration.
 * rs     : stride passed to WS() to address elements 1, 2 and 3.
 *
 * FMA(), FNMS(), DK(), WS() and MAKE_VOLATILE_STRIDE() come from the
 * included headers.  Statement order and expression shapes are generator
 * output and are reproduced unchanged.
 */
static void hc2cfdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP353553390, +0.353553390593273762200422181052424519642417969);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;
          for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(32, rs)) {
               E Tv, TX, Ts, TY, TE, T1a, TJ, T19, T1l, T1m, T9, T10, Ti, T11, TP;
               E T16, TU, T17, T1i, T1j;
               {
                    E Tt, Tu, TD, Tz, TA, TB, Tn, TI, Tr, TG, Tk, To;
                    /* elements [0] and [WS(rs, 2)], with W[0], W[1], W[6] .. W[9] */
                    Tt = Ip[0];
                    Tu = Im[0];
                    TD = Tt + Tu;
                    Tz = Rm[0];
                    TA = Rp[0];
                    TB = Tz - TA;
                    {
                         E Tl, Tm, Tp, Tq;
                         Tl = Ip[WS(rs, 2)];
                         Tm = Im[WS(rs, 2)];
                         Tn = Tl - Tm;
                         TI = Tl + Tm;
                         Tp = Rp[WS(rs, 2)];
                         Tq = Rm[WS(rs, 2)];
                         Tr = Tp + Tq;
                         TG = Tp - Tq;
                    }
                    Tv = Tt - Tu;
                    TX = TA + Tz;
                    Tk = W[6];
                    To = W[7];
                    Ts = FNMS(To, Tr, Tk * Tn);
                    TY = FMA(Tk, Tr, To * Tn);
                    {
                         E Ty, TC, TF, TH;
                         Ty = W[0];
                         TC = W[1];
                         TE = FNMS(TC, TD, Ty * TB);
                         T1a = FMA(TC, TB, Ty * TD);
                         TF = W[8];
                         TH = W[9];
                         TJ = FMA(TF, TG, TH * TI);
                         T19 = FNMS(TH, TG, TF * TI);
                    }
                    T1l = TJ + TE;
                    T1m = T1a - T19;
               }
               {
                    E T4, TO, T8, TM, Td, TT, Th, TR;
                    /* elements [WS(rs, 1)] and [WS(rs, 3)], with W[2] .. W[5], W[10] .. W[13] */
                    {
                         E T2, T3, T6, T7;
                         T2 = Ip[WS(rs, 1)];
                         T3 = Im[WS(rs, 1)];
                         T4 = T2 - T3;
                         TO = T2 + T3;
                         T6 = Rp[WS(rs, 1)];
                         T7 = Rm[WS(rs, 1)];
                         T8 = T6 + T7;
                         TM = T6 - T7;
                    }
                    {
                         E Tb, Tc, Tf, Tg;
                         Tb = Ip[WS(rs, 3)];
                         Tc = Im[WS(rs, 3)];
                         Td = Tb - Tc;
                         TT = Tb + Tc;
                         Tf = Rp[WS(rs, 3)];
                         Tg = Rm[WS(rs, 3)];
                         Th = Tf + Tg;
                         TR = Tf - Tg;
                    }
                    {
                         E T1, T5, Ta, Te;
                         T1 = W[2];
                         T5 = W[3];
                         T9 = FNMS(T5, T8, T1 * T4);
                         T10 = FMA(T1, T8, T5 * T4);
                         Ta = W[10];
                         Te = W[11];
                         Ti = FNMS(Te, Th, Ta * Td);
                         T11 = FMA(Ta, Th, Te * Td);
                         {
                              E TL, TN, TQ, TS;
                              TL = W[4];
                              TN = W[5];
                              TP = FMA(TL, TM, TN * TO);
                              T16 = FNMS(TN, TM, TL * TO);
                              TQ = W[12];
                              TS = W[13];
                              TU = FMA(TQ, TR, TS * TT);
                              T17 = FNMS(TS, TR, TQ * TT);
                         }
                         T1i = T17 - T16;
                         T1j = TP - TU;
                    }
               }
               {
                    E T1h, T1t, T1w, T1y, T1o, T1s, T1r, T1x;
                    /* first group of eight stores */
                    {
                         E T1f, T1g, T1u, T1v;
                         T1f = Tv - Ts;
                         T1g = T10 - T11;
                         T1h = KP500000000 * (T1f - T1g);
                         T1t = KP500000000 * (T1g + T1f);
                         T1u = T1i - T1j;
                         T1v = T1l + T1m;
                         T1w = KP353553390 * (T1u - T1v);
                         T1y = KP353553390 * (T1u + T1v);
                    }
                    {
                         E T1k, T1n, T1p, T1q;
                         T1k = T1i + T1j;
                         T1n = T1l - T1m;
                         T1o = KP353553390 * (T1k + T1n);
                         T1s = KP353553390 * (T1n - T1k);
                         T1p = TX - TY;
                         T1q = T9 - Ti;
                         T1r = KP500000000 * (T1p - T1q);
                         T1x = KP500000000 * (T1p + T1q);
                    }
                    Ip[WS(rs, 1)] = T1h + T1o;
                    Rp[WS(rs, 1)] = T1x + T1y;
                    Im[WS(rs, 2)] = T1o - T1h;
                    Rm[WS(rs, 2)] = T1x - T1y;
                    Rm[0] = T1r - T1s;
                    Im[0] = T1w - T1t;
                    Rp[WS(rs, 3)] = T1r + T1s;
                    Ip[WS(rs, 3)] = T1t + T1w;
               }
               {
                    E Tx, T15, T1c, T1e, TW, T14, T13, T1d;
                    /* second group of eight stores */
                    {
                         E Tj, Tw, T18, T1b;
                         Tj = T9 + Ti;
                         Tw = Ts + Tv;
                         Tx = Tj + Tw;
                         T15 = Tw - Tj;
                         T18 = T16 + T17;
                         T1b = T19 + T1a;
                         T1c = T18 - T1b;
                         T1e = T18 + T1b;
                    }
                    {
                         E TK, TV, TZ, T12;
                         TK = TE - TJ;
                         TV = TP + TU;
                         TW = TK - TV;
                         T14 = TV + TK;
                         TZ = TX + TY;
                         T12 = T10 + T11;
                         T13 = TZ - T12;
                         T1d = TZ + T12;
                    }
                    Ip[0] = KP500000000 * (Tx + TW);
                    Rp[0] = KP500000000 * (T1d + T1e);
                    Im[WS(rs, 3)] = KP500000000 * (TW - Tx);
                    Rm[WS(rs, 3)] = KP500000000 * (T1d - T1e);
                    Rm[WS(rs, 1)] = KP500000000 * (T13 - T14);
                    Im[WS(rs, 1)] = KP500000000 * (T1c - T15);
                    Rp[WS(rs, 2)] = KP500000000 * (T13 + T14);
                    Ip[WS(rs, 2)] = KP500000000 * (T15 + T1c);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2c_desc desc = { 8, "hc2cfdft_8", twinstr, &GENUS, { 68, 30, 14, 0 } };
|
||||
|
||||
void X(codelet_hc2cfdft_8) (planner *p) {
|
||||
X(khc2c_register) (p, hc2cfdft_8, &desc, HC2C_VIA_DFT);
|
||||
}
|
||||
#endif
|
||||
836
fftw-3.3.10/rdft/scalar/r2cf/hf2_16.c
Normal file
836
fftw-3.3.10/rdft/scalar/r2cf/hf2_16.c
Normal file
@@ -0,0 +1,836 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:18 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 134 FP multiplications,
|
||||
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
|
||||
* 90 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf2_16 (FMA variant): generated size-16 twiddle kernel (gen_hc2hc, -n 16
 * -dit -twiddle-log3).  The kernel works in place.
 *
 *   cr, ci : data pointers.  Every iteration reads and then overwrites
 *            cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..15.  After each
 *            iteration cr moves by +ms and ci moves by -ms.
 *   W      : twiddle table.  Eight values (W[0..7]) are consumed per
 *            iteration; the pointer is first advanced by (mb - 1) * 8.
 *   rs     : stride handed to WS() to address element k.
 *   mb, me : iterations run for m = mb, ..., me - 1.
 *   ms     : per-iteration pointer step for cr / ci.
 */
static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* cos(pi/8) */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
/* tan(pi/8) = sqrt(2) - 1 */
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
/* sqrt(1/2) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; cr walks forward, ci walks backward, W advances by one
 * 8-value twiddle record. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
/* Coefficients that stay live for the whole iteration. */
E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
|
||||
E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
|
||||
/* Read the eight stored twiddle values W[0..7] and build the remaining
 * coefficients from products of them. */
{
|
||||
E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
|
||||
T2 = W[0];
|
||||
Tf = W[2];
|
||||
Tg = T2 * Tf;
|
||||
TM = W[6];
|
||||
TN = T2 * TM;
|
||||
TO = W[7];
|
||||
TS = T2 * TO;
|
||||
T3 = W[4];
|
||||
T4 = T2 * T3;
|
||||
Tp = Tf * T3;
|
||||
T6 = W[5];
|
||||
Ta = T2 * T6;
|
||||
Tt = Tf * T6;
|
||||
T5 = W[1];
|
||||
Th = W[3];
|
||||
Tl = T2 * Th;
|
||||
Tz = FMA(T5, Th, Tg);
|
||||
Ti = FNMS(T5, Th, Tg);
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TZ = FNMS(Th, T3, Tt);
|
||||
TT = FNMS(T5, TM, TS);
|
||||
Tq = FNMS(Th, T6, Tp);
|
||||
TW = FMA(Th, T6, Tp);
|
||||
Tb = FNMS(T5, T3, Ta);
|
||||
Tu = FMA(Th, T3, Tt);
|
||||
TP = FMA(T5, TO, TN);
|
||||
TI = FMA(T5, T3, Ta);
|
||||
TF = FNMS(T5, T6, T4);
|
||||
{
|
||||
E T1y, T1C, T1e, T1i;
|
||||
T1y = Tz * T3;
|
||||
T1C = Tz * T6;
|
||||
TC = FNMS(T5, Tf, Tl);
|
||||
T1z = FMA(TC, T6, T1y);
|
||||
T1O = FMA(TC, T3, T1C);
|
||||
T1D = FNMS(TC, T3, T1C);
|
||||
T1L = FNMS(TC, T6, T1y);
|
||||
T1e = Ti * T3;
|
||||
T1i = Ti * T6;
|
||||
Tm = FMA(T5, Tf, Tl);
|
||||
T1f = FMA(Tm, T6, T1e);
|
||||
T1p = FMA(Tm, T3, T1i);
|
||||
T1j = FNMS(Tm, T3, T1i);
|
||||
T1m = FNMS(Tm, T6, T1e);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Te, T1U, T3A, T3M, T1G, T2w, T2I, T3h, T1R, T2D, T2B, T3i, Tx, T3L, T1Z;
|
||||
E T3w, TL, T21, T26, T38, T1d, T2h, T2s, T3c, T1s, T2t, T2m, T3d, T12, T28;
|
||||
E T2d, T37;
|
||||
/* Element 0 is used as loaded; element 8 is combined with (T7, Tb). */
{
|
||||
E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
|
||||
T1 = cr[0];
|
||||
T3z = ci[0];
|
||||
T8 = cr[WS(rs, 8)];
|
||||
T9 = T7 * T8;
|
||||
Tc = ci[WS(rs, 8)];
|
||||
T3x = T7 * Tc;
|
||||
Td = FMA(Tb, Tc, T9);
|
||||
Te = T1 + Td;
|
||||
T1U = T1 - Td;
|
||||
T3y = FNMS(Tb, T8, T3x);
|
||||
T3A = T3y + T3z;
|
||||
T3M = T3z - T3y;
|
||||
}
|
||||
/* Elements 15 and 7. */
{
|
||||
E T1u, T1v, T1w, T2E, T1A, T1B, T1E, T2G;
|
||||
T1u = cr[WS(rs, 15)];
|
||||
T1v = TM * T1u;
|
||||
T1w = ci[WS(rs, 15)];
|
||||
T2E = TM * T1w;
|
||||
T1A = cr[WS(rs, 7)];
|
||||
T1B = T1z * T1A;
|
||||
T1E = ci[WS(rs, 7)];
|
||||
T2G = T1z * T1E;
|
||||
{
|
||||
E T1x, T1F, T2F, T2H;
|
||||
T1x = FMA(TO, T1w, T1v);
|
||||
T1F = FMA(T1D, T1E, T1B);
|
||||
T1G = T1x + T1F;
|
||||
T2w = T1x - T1F;
|
||||
T2F = FNMS(TO, T1u, T2E);
|
||||
T2H = FNMS(T1D, T1A, T2G);
|
||||
T2I = T2F - T2H;
|
||||
T3h = T2F + T2H;
|
||||
}
|
||||
}
|
||||
/* Elements 3 and 11. */
{
|
||||
E T1H, T1I, T1J, T2x, T1M, T1N, T1P, T2z;
|
||||
T1H = cr[WS(rs, 3)];
|
||||
T1I = Tf * T1H;
|
||||
T1J = ci[WS(rs, 3)];
|
||||
T2x = Tf * T1J;
|
||||
T1M = cr[WS(rs, 11)];
|
||||
T1N = T1L * T1M;
|
||||
T1P = ci[WS(rs, 11)];
|
||||
T2z = T1L * T1P;
|
||||
{
|
||||
E T1K, T1Q, T2y, T2A;
|
||||
T1K = FMA(Th, T1J, T1I);
|
||||
T1Q = FMA(T1O, T1P, T1N);
|
||||
T1R = T1K + T1Q;
|
||||
T2D = T1Q - T1K;
|
||||
T2y = FNMS(Th, T1H, T2x);
|
||||
T2A = FNMS(T1O, T1M, T2z);
|
||||
T2B = T2y - T2A;
|
||||
T3i = T2y + T2A;
|
||||
}
|
||||
}
|
||||
/* Elements 4 and 12. */
{
|
||||
E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
|
||||
Tj = cr[WS(rs, 4)];
|
||||
Tk = Ti * Tj;
|
||||
Tn = ci[WS(rs, 4)];
|
||||
T1V = Ti * Tn;
|
||||
Tr = cr[WS(rs, 12)];
|
||||
Ts = Tq * Tr;
|
||||
Tv = ci[WS(rs, 12)];
|
||||
T1X = Tq * Tv;
|
||||
{
|
||||
E To, Tw, T1W, T1Y;
|
||||
To = FMA(Tm, Tn, Tk);
|
||||
Tw = FMA(Tu, Tv, Ts);
|
||||
Tx = To + Tw;
|
||||
T3L = To - Tw;
|
||||
T1W = FNMS(Tm, Tj, T1V);
|
||||
T1Y = FNMS(Tu, Tr, T1X);
|
||||
T1Z = T1W - T1Y;
|
||||
T3w = T1W + T1Y;
|
||||
}
|
||||
}
|
||||
/* Elements 2 and 10. */
{
|
||||
E TA, TB, TD, T22, TG, TH, TJ, T24;
|
||||
TA = cr[WS(rs, 2)];
|
||||
TB = Tz * TA;
|
||||
TD = ci[WS(rs, 2)];
|
||||
T22 = Tz * TD;
|
||||
TG = cr[WS(rs, 10)];
|
||||
TH = TF * TG;
|
||||
TJ = ci[WS(rs, 10)];
|
||||
T24 = TF * TJ;
|
||||
{
|
||||
E TE, TK, T23, T25;
|
||||
TE = FMA(TC, TD, TB);
|
||||
TK = FMA(TI, TJ, TH);
|
||||
TL = TE + TK;
|
||||
T21 = TE - TK;
|
||||
T23 = FNMS(TC, TA, T22);
|
||||
T25 = FNMS(TI, TG, T24);
|
||||
T26 = T23 - T25;
|
||||
T38 = T23 + T25;
|
||||
}
|
||||
}
|
||||
/* Elements 1 and 9. */
{
|
||||
E T15, T16, T17, T2o, T19, T1a, T1b, T2q;
|
||||
T15 = cr[WS(rs, 1)];
|
||||
T16 = T2 * T15;
|
||||
T17 = ci[WS(rs, 1)];
|
||||
T2o = T2 * T17;
|
||||
T19 = cr[WS(rs, 9)];
|
||||
T1a = T3 * T19;
|
||||
T1b = ci[WS(rs, 9)];
|
||||
T2q = T3 * T1b;
|
||||
{
|
||||
E T18, T1c, T2p, T2r;
|
||||
T18 = FMA(T5, T17, T16);
|
||||
T1c = FMA(T6, T1b, T1a);
|
||||
T1d = T18 + T1c;
|
||||
T2h = T18 - T1c;
|
||||
T2p = FNMS(T5, T15, T2o);
|
||||
T2r = FNMS(T6, T19, T2q);
|
||||
T2s = T2p - T2r;
|
||||
T3c = T2p + T2r;
|
||||
}
|
||||
}
|
||||
/* Elements 5 and 13. */
{
|
||||
E T1g, T1h, T1k, T2i, T1n, T1o, T1q, T2k;
|
||||
T1g = cr[WS(rs, 5)];
|
||||
T1h = T1f * T1g;
|
||||
T1k = ci[WS(rs, 5)];
|
||||
T2i = T1f * T1k;
|
||||
T1n = cr[WS(rs, 13)];
|
||||
T1o = T1m * T1n;
|
||||
T1q = ci[WS(rs, 13)];
|
||||
T2k = T1m * T1q;
|
||||
{
|
||||
E T1l, T1r, T2j, T2l;
|
||||
T1l = FMA(T1j, T1k, T1h);
|
||||
T1r = FMA(T1p, T1q, T1o);
|
||||
T1s = T1l + T1r;
|
||||
T2t = T1l - T1r;
|
||||
T2j = FNMS(T1j, T1g, T2i);
|
||||
T2l = FNMS(T1p, T1n, T2k);
|
||||
T2m = T2j - T2l;
|
||||
T3d = T2j + T2l;
|
||||
}
|
||||
}
|
||||
/* Elements 14 and 6. */
{
|
||||
E TQ, TR, TU, T29, TX, TY, T10, T2b;
|
||||
TQ = cr[WS(rs, 14)];
|
||||
TR = TP * TQ;
|
||||
TU = ci[WS(rs, 14)];
|
||||
T29 = TP * TU;
|
||||
TX = cr[WS(rs, 6)];
|
||||
TY = TW * TX;
|
||||
T10 = ci[WS(rs, 6)];
|
||||
T2b = TW * T10;
|
||||
{
|
||||
E TV, T11, T2a, T2c;
|
||||
TV = FMA(TT, TU, TR);
|
||||
T11 = FMA(TZ, T10, TY);
|
||||
T12 = TV + T11;
|
||||
T28 = TV - T11;
|
||||
T2a = FNMS(TT, TQ, T29);
|
||||
T2c = FNMS(TZ, TX, T2b);
|
||||
T2d = T2a - T2c;
|
||||
T37 = T2a + T2c;
|
||||
}
|
||||
}
|
||||
/* All loads from cr/ci are complete at this point; only stores follow, so
 * the in-place update never reads a slot it has already overwritten.
 * This group stores cr[0], cr[4], cr[8], cr[12] and ci[3], ci[7], ci[11],
 * ci[15]. */
{
|
||||
E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
|
||||
{
|
||||
E Ty, T13, T3v, T3B;
|
||||
Ty = Te + Tx;
|
||||
T13 = TL + T12;
|
||||
T14 = Ty + T13;
|
||||
T3q = Ty - T13;
|
||||
T3v = T38 + T37;
|
||||
T3B = T3w + T3A;
|
||||
T3C = T3v + T3B;
|
||||
T3E = T3B - T3v;
|
||||
}
|
||||
{
|
||||
E T1t, T1S, T3r, T3s;
|
||||
T1t = T1d + T1s;
|
||||
T1S = T1G + T1R;
|
||||
T1T = T1t + T1S;
|
||||
T3D = T1S - T1t;
|
||||
T3r = T3h + T3i;
|
||||
T3s = T3c + T3d;
|
||||
T3t = T3r - T3s;
|
||||
T3u = T3s + T3r;
|
||||
}
|
||||
ci[WS(rs, 7)] = T14 - T1T;
|
||||
cr[WS(rs, 12)] = T3D - T3E;
|
||||
ci[WS(rs, 11)] = T3D + T3E;
|
||||
cr[0] = T14 + T1T;
|
||||
cr[WS(rs, 4)] = T3q - T3t;
|
||||
cr[WS(rs, 8)] = T3u - T3C;
|
||||
ci[WS(rs, 15)] = T3u + T3C;
|
||||
ci[WS(rs, 3)] = T3q + T3t;
|
||||
}
|
||||
/* Stores cr[2], cr[6], cr[10], cr[14] and ci[1], ci[5], ci[9], ci[13]. */
{
|
||||
E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
|
||||
{
|
||||
E T36, T39, T3F, T3G;
|
||||
T36 = Te - Tx;
|
||||
T39 = T37 - T38;
|
||||
T3a = T36 - T39;
|
||||
T3m = T36 + T39;
|
||||
T3F = TL - T12;
|
||||
T3G = T3A - T3w;
|
||||
T3H = T3F + T3G;
|
||||
T3J = T3G - T3F;
|
||||
}
|
||||
{
|
||||
E T3b, T3e, T3g, T3j;
|
||||
T3b = T1d - T1s;
|
||||
T3e = T3c - T3d;
|
||||
T3f = T3b + T3e;
|
||||
T3n = T3b - T3e;
|
||||
T3g = T1G - T1R;
|
||||
T3j = T3h - T3i;
|
||||
T3k = T3g - T3j;
|
||||
T3o = T3g + T3j;
|
||||
}
|
||||
{
|
||||
E T3l, T3K, T3p, T3I;
|
||||
T3l = T3f + T3k;
|
||||
ci[WS(rs, 5)] = FNMS(KP707106781, T3l, T3a);
|
||||
cr[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
|
||||
T3K = T3o - T3n;
|
||||
cr[WS(rs, 10)] = FMS(KP707106781, T3K, T3J);
|
||||
ci[WS(rs, 13)] = FMA(KP707106781, T3K, T3J);
|
||||
T3p = T3n + T3o;
|
||||
cr[WS(rs, 6)] = FNMS(KP707106781, T3p, T3m);
|
||||
ci[WS(rs, 1)] = FMA(KP707106781, T3p, T3m);
|
||||
T3I = T3k - T3f;
|
||||
cr[WS(rs, 14)] = FMS(KP707106781, T3I, T3H);
|
||||
ci[WS(rs, 9)] = FMA(KP707106781, T3I, T3H);
|
||||
}
|
||||
}
|
||||
/* Stores the odd-indexed cr slots (1, 3, ..., 15) and the even-indexed ci
 * slots (0, 2, ..., 14). */
{
|
||||
E T20, T3N, T3T, T2Q, T2f, T3U, T30, T33, T2T, T3O, T2v, T2N, T2X, T34, T2K;
|
||||
E T2O;
|
||||
{
|
||||
E T27, T2e, T2n, T2u;
|
||||
T20 = T1U - T1Z;
|
||||
T3N = T3L + T3M;
|
||||
T3T = T3M - T3L;
|
||||
T2Q = T1U + T1Z;
|
||||
T27 = T21 - T26;
|
||||
T2e = T28 + T2d;
|
||||
T2f = T27 + T2e;
|
||||
T3U = T2e - T27;
|
||||
{
|
||||
E T2Y, T2Z, T2R, T2S;
|
||||
T2Y = T2w + T2B;
|
||||
T2Z = T2I + T2D;
|
||||
T30 = FNMS(KP414213562, T2Z, T2Y);
|
||||
T33 = FMA(KP414213562, T2Y, T2Z);
|
||||
T2R = T21 + T26;
|
||||
T2S = T28 - T2d;
|
||||
T2T = T2R + T2S;
|
||||
T3O = T2R - T2S;
|
||||
}
|
||||
T2n = T2h - T2m;
|
||||
T2u = T2s + T2t;
|
||||
T2v = FNMS(KP414213562, T2u, T2n);
|
||||
T2N = FMA(KP414213562, T2n, T2u);
|
||||
{
|
||||
E T2V, T2W, T2C, T2J;
|
||||
T2V = T2h + T2m;
|
||||
T2W = T2s - T2t;
|
||||
T2X = FMA(KP414213562, T2W, T2V);
|
||||
T34 = FNMS(KP414213562, T2V, T2W);
|
||||
T2C = T2w - T2B;
|
||||
T2J = T2D - T2I;
|
||||
T2K = FNMS(KP414213562, T2J, T2C);
|
||||
T2O = FMA(KP414213562, T2C, T2J);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2g, T2L, T3V, T3W;
|
||||
T2g = FMA(KP707106781, T2f, T20);
|
||||
T2L = T2v + T2K;
|
||||
cr[WS(rs, 7)] = FNMS(KP923879532, T2L, T2g);
|
||||
ci[0] = FMA(KP923879532, T2L, T2g);
|
||||
T3V = FMA(KP707106781, T3U, T3T);
|
||||
T3W = T34 + T33;
|
||||
cr[WS(rs, 9)] = FMS(KP923879532, T3W, T3V);
|
||||
ci[WS(rs, 14)] = FMA(KP923879532, T3W, T3V);
|
||||
}
|
||||
{
|
||||
E T3X, T3Y, T2M, T2P;
|
||||
T3X = FNMS(KP707106781, T3U, T3T);
|
||||
T3Y = T30 - T2X;
|
||||
cr[WS(rs, 13)] = FMS(KP923879532, T3Y, T3X);
|
||||
ci[WS(rs, 10)] = FMA(KP923879532, T3Y, T3X);
|
||||
T2M = FNMS(KP707106781, T2f, T20);
|
||||
T2P = T2N + T2O;
|
||||
ci[WS(rs, 4)] = FNMS(KP923879532, T2P, T2M);
|
||||
cr[WS(rs, 3)] = FMA(KP923879532, T2P, T2M);
|
||||
}
|
||||
{
|
||||
E T2U, T31, T3P, T3Q;
|
||||
T2U = FMA(KP707106781, T2T, T2Q);
|
||||
T31 = T2X + T30;
|
||||
ci[WS(rs, 6)] = FNMS(KP923879532, T31, T2U);
|
||||
cr[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
|
||||
T3P = FMA(KP707106781, T3O, T3N);
|
||||
T3Q = T2O - T2N;
|
||||
cr[WS(rs, 15)] = FMS(KP923879532, T3Q, T3P);
|
||||
ci[WS(rs, 8)] = FMA(KP923879532, T3Q, T3P);
|
||||
}
|
||||
{
|
||||
E T3R, T3S, T32, T35;
|
||||
T3R = FNMS(KP707106781, T3O, T3N);
|
||||
T3S = T2K - T2v;
|
||||
cr[WS(rs, 11)] = FMS(KP923879532, T3S, T3R);
|
||||
ci[WS(rs, 12)] = FMA(KP923879532, T3S, T3R);
|
||||
T32 = FNMS(KP707106781, T2T, T2Q);
|
||||
T35 = T33 - T34;
|
||||
cr[WS(rs, 5)] = FNMS(KP923879532, T35, T32);
|
||||
ci[WS(rs, 2)] = FMA(KP923879532, T35, T32);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle layout requested by hf2_16: four TW_CEXP entries (last field 1, 3,
 * 9, 15) followed by the TW_NEXT terminator.  The kernel reads W[0..7] per
 * iteration -- presumably one pair of values per TW_CEXP entry; confirm
 * against the tw_instr definition, which is declared outside this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to X(khc2hc_register): size 16, codelet name, twiddle
 * layout, genus, and the counts quoted in the generator header of this
 * variant (104 additions, 42 multiplications, 92 fused multiply/adds), with
 * a final 0. */
static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, { 104, 42, 92, 0 } };
|
||||
|
||||
/*
 * Planner hook for the FMA build of hf2_16: hands the kernel function and
 * its descriptor to the hc2hc registrar.
 */
void X(codelet_hf2_16)(planner *p)
{
     X(khc2hc_register)(p, hf2_16, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -dit -name hf2_16 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 108 FP multiplications,
|
||||
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
|
||||
* 82 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf2_16 (non-FMA variant): generated size-16 twiddle kernel (gen_hc2hc,
 * -n 16 -dit -twiddle-log3).  The kernel works in place.
 *
 *   cr, ci : data pointers.  Every iteration reads and then overwrites
 *            cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..15.  After each
 *            iteration cr moves by +ms and ci moves by -ms.
 *   W      : twiddle table.  Eight values (W[0..7]) are consumed per
 *            iteration; the pointer is first advanced by (mb - 1) * 8.
 *   rs     : stride handed to WS() to address element k.
 *   mb, me : iterations run for m = mb, ..., me - 1.
 *   ms     : per-iteration pointer step for cr / ci.
 */
static void hf2_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* sin(pi/8) */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
/* cos(pi/8) */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
/* sqrt(1/2) */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; cr walks forward, ci walks backward, W advances by one
 * 8-value twiddle record. */
for (m = mb, W = W + ((mb - 1) * 8); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
/* Coefficients that stay live for the whole iteration. */
E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
|
||||
E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
|
||||
/* Read the eight stored twiddle values W[0..7] and build the remaining
 * coefficients from sums and differences of their products. */
{
|
||||
E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
|
||||
{
|
||||
E Th, Tn, Tj, Tm;
|
||||
T2 = W[0];
|
||||
T5 = W[1];
|
||||
Tg = W[2];
|
||||
Ti = W[3];
|
||||
Th = T2 * Tg;
|
||||
Tn = T5 * Tg;
|
||||
Tj = T5 * Ti;
|
||||
Tm = T2 * Ti;
|
||||
Tk = Th - Tj;
|
||||
To = Tm + Tn;
|
||||
TE = Tm - Tn;
|
||||
TC = Th + Tj;
|
||||
T6 = W[5];
|
||||
T7 = T5 * T6;
|
||||
Tv = Tg * T6;
|
||||
Ta = T2 * T6;
|
||||
Ts = Ti * T6;
|
||||
T3 = W[4];
|
||||
T4 = T2 * T3;
|
||||
Tw = Ti * T3;
|
||||
Tb = T5 * T3;
|
||||
Tr = Tg * T3;
|
||||
}
|
||||
T8 = T4 + T7;
|
||||
TW = Tv - Tw;
|
||||
TJ = Ta + Tb;
|
||||
Tt = Tr - Ts;
|
||||
TU = Tr + Ts;
|
||||
Tc = Ta - Tb;
|
||||
Tx = Tv + Tw;
|
||||
TH = T4 - T7;
|
||||
TN = W[6];
|
||||
TO = W[7];
|
||||
TP = FMA(T2, TN, T5 * TO);
|
||||
TR = FNMS(T5, TN, T2 * TO);
|
||||
{
|
||||
E T1d, T1e, T19, T1a;
|
||||
T1d = Tk * T6;
|
||||
T1e = To * T3;
|
||||
T1f = T1d - T1e;
|
||||
T1k = T1d + T1e;
|
||||
T19 = Tk * T3;
|
||||
T1a = To * T6;
|
||||
T1b = T19 + T1a;
|
||||
T1i = T19 - T1a;
|
||||
}
|
||||
{
|
||||
E T1w, T1x, T1s, T1t;
|
||||
T1w = TC * T6;
|
||||
T1x = TE * T3;
|
||||
T1y = T1w - T1x;
|
||||
T1H = T1w + T1x;
|
||||
T1s = TC * T3;
|
||||
T1t = TE * T6;
|
||||
T1u = T1s + T1t;
|
||||
T1F = T1s - T1t;
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tf, T3s, T1N, T3e, TA, T3r, T1Q, T3b, TM, T2N, T1W, T2w, TZ, T2M, T21;
|
||||
E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2E, T2o, T2D, T18, T1n, T2Q, T2R;
|
||||
E T2S, T2T, T28, T2B, T2d, T2A;
|
||||
/* Element 0 is used as loaded; element 8 is combined with (T8, Tc). */
{
|
||||
E T1, T3d, Te, T3c, T9, Td;
|
||||
T1 = cr[0];
|
||||
T3d = ci[0];
|
||||
T9 = cr[WS(rs, 8)];
|
||||
Td = ci[WS(rs, 8)];
|
||||
Te = FMA(T8, T9, Tc * Td);
|
||||
T3c = FNMS(Tc, T9, T8 * Td);
|
||||
Tf = T1 + Te;
|
||||
T3s = T3d - T3c;
|
||||
T1N = T1 - Te;
|
||||
T3e = T3c + T3d;
|
||||
}
|
||||
/* Elements 4 and 12. */
{
|
||||
E Tq, T1O, Tz, T1P;
|
||||
{
|
||||
E Tl, Tp, Tu, Ty;
|
||||
Tl = cr[WS(rs, 4)];
|
||||
Tp = ci[WS(rs, 4)];
|
||||
Tq = FMA(Tk, Tl, To * Tp);
|
||||
T1O = FNMS(To, Tl, Tk * Tp);
|
||||
Tu = cr[WS(rs, 12)];
|
||||
Ty = ci[WS(rs, 12)];
|
||||
Tz = FMA(Tt, Tu, Tx * Ty);
|
||||
T1P = FNMS(Tx, Tu, Tt * Ty);
|
||||
}
|
||||
TA = Tq + Tz;
|
||||
T3r = Tq - Tz;
|
||||
T1Q = T1O - T1P;
|
||||
T3b = T1O + T1P;
|
||||
}
|
||||
/* Elements 2 and 10. */
{
|
||||
E TG, T1T, TL, T1U, T1S, T1V;
|
||||
{
|
||||
E TD, TF, TI, TK;
|
||||
TD = cr[WS(rs, 2)];
|
||||
TF = ci[WS(rs, 2)];
|
||||
TG = FMA(TC, TD, TE * TF);
|
||||
T1T = FNMS(TE, TD, TC * TF);
|
||||
TI = cr[WS(rs, 10)];
|
||||
TK = ci[WS(rs, 10)];
|
||||
TL = FMA(TH, TI, TJ * TK);
|
||||
T1U = FNMS(TJ, TI, TH * TK);
|
||||
}
|
||||
TM = TG + TL;
|
||||
T2N = T1T + T1U;
|
||||
T1S = TG - TL;
|
||||
T1V = T1T - T1U;
|
||||
T1W = T1S - T1V;
|
||||
T2w = T1S + T1V;
|
||||
}
|
||||
/* Elements 14 and 6. */
{
|
||||
E TT, T1Y, TY, T1Z, T1X, T20;
|
||||
{
|
||||
E TQ, TS, TV, TX;
|
||||
TQ = cr[WS(rs, 14)];
|
||||
TS = ci[WS(rs, 14)];
|
||||
TT = FMA(TP, TQ, TR * TS);
|
||||
T1Y = FNMS(TR, TQ, TP * TS);
|
||||
TV = cr[WS(rs, 6)];
|
||||
TX = ci[WS(rs, 6)];
|
||||
TY = FMA(TU, TV, TW * TX);
|
||||
T1Z = FNMS(TW, TV, TU * TX);
|
||||
}
|
||||
TZ = TT + TY;
|
||||
T2M = T1Y + T1Z;
|
||||
T1X = TT - TY;
|
||||
T20 = T1Y - T1Z;
|
||||
T21 = T1X + T20;
|
||||
T2x = T1X - T20;
|
||||
}
|
||||
/* Elements 15, 11, 7 and 3. */
{
|
||||
E T1r, T2f, T1J, T2m, T1A, T2g, T1E, T2l;
|
||||
{
|
||||
E T1p, T1q, T1G, T1I;
|
||||
T1p = cr[WS(rs, 15)];
|
||||
T1q = ci[WS(rs, 15)];
|
||||
T1r = FMA(TN, T1p, TO * T1q);
|
||||
T2f = FNMS(TO, T1p, TN * T1q);
|
||||
T1G = cr[WS(rs, 11)];
|
||||
T1I = ci[WS(rs, 11)];
|
||||
T1J = FMA(T1F, T1G, T1H * T1I);
|
||||
T2m = FNMS(T1H, T1G, T1F * T1I);
|
||||
}
|
||||
{
|
||||
E T1v, T1z, T1C, T1D;
|
||||
T1v = cr[WS(rs, 7)];
|
||||
T1z = ci[WS(rs, 7)];
|
||||
T1A = FMA(T1u, T1v, T1y * T1z);
|
||||
T2g = FNMS(T1y, T1v, T1u * T1z);
|
||||
T1C = cr[WS(rs, 3)];
|
||||
T1D = ci[WS(rs, 3)];
|
||||
T1E = FMA(Tg, T1C, Ti * T1D);
|
||||
T2l = FNMS(Ti, T1C, Tg * T1D);
|
||||
}
|
||||
T1B = T1r + T1A;
|
||||
T1K = T1E + T1J;
|
||||
T2V = T1B - T1K;
|
||||
T2W = T2f + T2g;
|
||||
T2X = T2l + T2m;
|
||||
T2Y = T2W - T2X;
|
||||
{
|
||||
E T2h, T2i, T2k, T2n;
|
||||
T2h = T2f - T2g;
|
||||
T2i = T1E - T1J;
|
||||
T2j = T2h + T2i;
|
||||
T2E = T2h - T2i;
|
||||
T2k = T1r - T1A;
|
||||
T2n = T2l - T2m;
|
||||
T2o = T2k - T2n;
|
||||
T2D = T2k + T2n;
|
||||
}
|
||||
}
|
||||
/* Elements 1, 13, 9 and 5. */
{
|
||||
E T14, T29, T1m, T26, T17, T2a, T1h, T25;
|
||||
{
|
||||
E T12, T13, T1j, T1l;
|
||||
T12 = cr[WS(rs, 1)];
|
||||
T13 = ci[WS(rs, 1)];
|
||||
T14 = FMA(T2, T12, T5 * T13);
|
||||
T29 = FNMS(T5, T12, T2 * T13);
|
||||
T1j = cr[WS(rs, 13)];
|
||||
T1l = ci[WS(rs, 13)];
|
||||
T1m = FMA(T1i, T1j, T1k * T1l);
|
||||
T26 = FNMS(T1k, T1j, T1i * T1l);
|
||||
}
|
||||
{
|
||||
E T15, T16, T1c, T1g;
|
||||
T15 = cr[WS(rs, 9)];
|
||||
T16 = ci[WS(rs, 9)];
|
||||
T17 = FMA(T3, T15, T6 * T16);
|
||||
T2a = FNMS(T6, T15, T3 * T16);
|
||||
T1c = cr[WS(rs, 5)];
|
||||
T1g = ci[WS(rs, 5)];
|
||||
T1h = FMA(T1b, T1c, T1f * T1g);
|
||||
T25 = FNMS(T1f, T1c, T1b * T1g);
|
||||
}
|
||||
T18 = T14 + T17;
|
||||
T1n = T1h + T1m;
|
||||
T2Q = T18 - T1n;
|
||||
T2R = T29 + T2a;
|
||||
T2S = T25 + T26;
|
||||
T2T = T2R - T2S;
|
||||
{
|
||||
E T24, T27, T2b, T2c;
|
||||
T24 = T14 - T17;
|
||||
T27 = T25 - T26;
|
||||
T28 = T24 - T27;
|
||||
T2B = T24 + T27;
|
||||
T2b = T29 - T2a;
|
||||
T2c = T1h - T1m;
|
||||
T2d = T2b + T2c;
|
||||
T2A = T2b - T2c;
|
||||
}
|
||||
}
|
||||
/* All loads from cr/ci are complete at this point; only stores follow, so
 * the in-place update never reads a slot it has already overwritten.
 * This group stores cr[3], cr[7], cr[11], cr[15] and ci[0], ci[4], ci[8],
 * ci[12]. */
{
|
||||
E T23, T2r, T3u, T3w, T2q, T3v, T2u, T3p;
|
||||
{
|
||||
E T1R, T22, T3q, T3t;
|
||||
T1R = T1N - T1Q;
|
||||
T22 = KP707106781 * (T1W + T21);
|
||||
T23 = T1R + T22;
|
||||
T2r = T1R - T22;
|
||||
T3q = KP707106781 * (T2w - T2x);
|
||||
T3t = T3r + T3s;
|
||||
T3u = T3q + T3t;
|
||||
T3w = T3t - T3q;
|
||||
}
|
||||
{
|
||||
E T2e, T2p, T2s, T2t;
|
||||
T2e = FNMS(KP382683432, T2d, KP923879532 * T28);
|
||||
T2p = FMA(KP382683432, T2j, KP923879532 * T2o);
|
||||
T2q = T2e + T2p;
|
||||
T3v = T2p - T2e;
|
||||
T2s = FMA(KP923879532, T2d, KP382683432 * T28);
|
||||
T2t = FNMS(KP923879532, T2j, KP382683432 * T2o);
|
||||
T2u = T2s + T2t;
|
||||
T3p = T2t - T2s;
|
||||
}
|
||||
cr[WS(rs, 7)] = T23 - T2q;
|
||||
cr[WS(rs, 11)] = T3v - T3w;
|
||||
ci[WS(rs, 12)] = T3v + T3w;
|
||||
ci[0] = T23 + T2q;
|
||||
ci[WS(rs, 4)] = T2r - T2u;
|
||||
cr[WS(rs, 15)] = T3p - T3u;
|
||||
ci[WS(rs, 8)] = T3p + T3u;
|
||||
cr[WS(rs, 3)] = T2r + T2u;
|
||||
}
|
||||
/* Stores cr[0], cr[4], cr[8], cr[12] and ci[3], ci[7], ci[11], ci[15]. */
{
|
||||
E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
|
||||
{
|
||||
E TB, T10, T3a, T3f;
|
||||
TB = Tf + TA;
|
||||
T10 = TM + TZ;
|
||||
T11 = TB + T10;
|
||||
T35 = TB - T10;
|
||||
T3a = T2N + T2M;
|
||||
T3f = T3b + T3e;
|
||||
T3g = T3a + T3f;
|
||||
T3i = T3f - T3a;
|
||||
}
|
||||
{
|
||||
E T1o, T1L, T36, T37;
|
||||
T1o = T18 + T1n;
|
||||
T1L = T1B + T1K;
|
||||
T1M = T1o + T1L;
|
||||
T3h = T1L - T1o;
|
||||
T36 = T2W + T2X;
|
||||
T37 = T2R + T2S;
|
||||
T38 = T36 - T37;
|
||||
T39 = T37 + T36;
|
||||
}
|
||||
ci[WS(rs, 7)] = T11 - T1M;
|
||||
cr[WS(rs, 12)] = T3h - T3i;
|
||||
ci[WS(rs, 11)] = T3h + T3i;
|
||||
cr[0] = T11 + T1M;
|
||||
cr[WS(rs, 4)] = T35 - T38;
|
||||
cr[WS(rs, 8)] = T39 - T3g;
|
||||
ci[WS(rs, 15)] = T39 + T3g;
|
||||
ci[WS(rs, 3)] = T35 + T38;
|
||||
}
|
||||
/* Stores cr[1], cr[5], cr[9], cr[13] and ci[2], ci[6], ci[10], ci[14]. */
{
|
||||
E T2z, T2H, T3A, T3C, T2G, T3B, T2K, T3x;
|
||||
{
|
||||
E T2v, T2y, T3y, T3z;
|
||||
T2v = T1N + T1Q;
|
||||
T2y = KP707106781 * (T2w + T2x);
|
||||
T2z = T2v + T2y;
|
||||
T2H = T2v - T2y;
|
||||
T3y = KP707106781 * (T21 - T1W);
|
||||
T3z = T3s - T3r;
|
||||
T3A = T3y + T3z;
|
||||
T3C = T3z - T3y;
|
||||
}
|
||||
{
|
||||
E T2C, T2F, T2I, T2J;
|
||||
T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
|
||||
T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
|
||||
T2G = T2C + T2F;
|
||||
T3B = T2F - T2C;
|
||||
T2I = FNMS(KP923879532, T2A, KP382683432 * T2B);
|
||||
T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
|
||||
T2K = T2I + T2J;
|
||||
T3x = T2J - T2I;
|
||||
}
|
||||
ci[WS(rs, 6)] = T2z - T2G;
|
||||
cr[WS(rs, 13)] = T3B - T3C;
|
||||
ci[WS(rs, 10)] = T3B + T3C;
|
||||
cr[WS(rs, 1)] = T2z + T2G;
|
||||
cr[WS(rs, 5)] = T2H - T2K;
|
||||
cr[WS(rs, 9)] = T3x - T3A;
|
||||
ci[WS(rs, 14)] = T3x + T3A;
|
||||
ci[WS(rs, 2)] = T2H + T2K;
|
||||
}
|
||||
/* Stores cr[2], cr[6], cr[10], cr[14] and ci[1], ci[5], ci[9], ci[13]. */
{
|
||||
E T2P, T31, T3m, T3o, T30, T3j, T34, T3n;
|
||||
{
|
||||
E T2L, T2O, T3k, T3l;
|
||||
T2L = Tf - TA;
|
||||
T2O = T2M - T2N;
|
||||
T2P = T2L - T2O;
|
||||
T31 = T2L + T2O;
|
||||
T3k = TM - TZ;
|
||||
T3l = T3e - T3b;
|
||||
T3m = T3k + T3l;
|
||||
T3o = T3l - T3k;
|
||||
}
|
||||
{
|
||||
E T2U, T2Z, T32, T33;
|
||||
T2U = T2Q + T2T;
|
||||
T2Z = T2V - T2Y;
|
||||
T30 = KP707106781 * (T2U + T2Z);
|
||||
T3j = KP707106781 * (T2Z - T2U);
|
||||
T32 = T2Q - T2T;
|
||||
T33 = T2V + T2Y;
|
||||
T34 = KP707106781 * (T32 + T33);
|
||||
T3n = KP707106781 * (T33 - T32);
|
||||
}
|
||||
ci[WS(rs, 5)] = T2P - T30;
|
||||
cr[WS(rs, 10)] = T3n - T3o;
|
||||
ci[WS(rs, 13)] = T3n + T3o;
|
||||
cr[WS(rs, 2)] = T2P + T30;
|
||||
cr[WS(rs, 6)] = T31 - T34;
|
||||
cr[WS(rs, 14)] = T3j - T3m;
|
||||
ci[WS(rs, 9)] = T3j + T3m;
|
||||
ci[WS(rs, 1)] = T31 + T34;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle layout requested by hf2_16: four TW_CEXP entries (last field 1, 3,
 * 9, 15) followed by the TW_NEXT terminator.  The kernel reads W[0..7] per
 * iteration -- presumably one pair of values per TW_CEXP entry; confirm
 * against the tw_instr definition, which is declared outside this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 9 },
|
||||
{ TW_CEXP, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to X(khc2hc_register): size 16, codelet name, twiddle
 * layout, genus, and the counts quoted in the generator header of this
 * variant (156 additions, 68 multiplications, 40 fused multiply/adds), with
 * a final 0. */
static const hc2hc_desc desc = { 16, "hf2_16", twinstr, &GENUS, { 156, 68, 40, 0 } };
|
||||
|
||||
/*
 * Planner hook for the non-FMA build of hf2_16: hands the kernel function
 * and its descriptor to the hc2hc registrar.
 */
void X(codelet_hf2_16)(planner *p)
{
     X(khc2hc_register)(p, hf2_16, &desc);
}
|
||||
#endif
|
||||
1097
fftw-3.3.10/rdft/scalar/r2cf/hf2_20.c
Normal file
1097
fftw-3.3.10/rdft/scalar/r2cf/hf2_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1620
fftw-3.3.10/rdft/scalar/r2cf/hf2_25.c
Normal file
1620
fftw-3.3.10/rdft/scalar/r2cf/hf2_25.c
Normal file
File diff suppressed because it is too large
Load Diff
1893
fftw-3.3.10/rdft/scalar/r2cf/hf2_32.c
Normal file
1893
fftw-3.3.10/rdft/scalar/r2cf/hf2_32.c
Normal file
File diff suppressed because it is too large
Load Diff
200
fftw-3.3.10/rdft/scalar/r2cf/hf2_4.c
Normal file
200
fftw-3.3.10/rdft/scalar/r2cf/hf2_4.c
Normal file
@@ -0,0 +1,200 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:18 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hf2_4 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf2_4 (FMA variant): size-4 twiddle kernel, working in place.
 *
 *   cr, ci : data pointers.  Each iteration reads and then overwrites
 *            cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..3; afterwards cr
 *            moves by +ms and ci by -ms.
 *   W      : twiddle table, four values per iteration, starting at record
 *            (mb - 1).
 *   rs     : stride handed to WS().
 *   mb, me : iterations run for m = mb, ..., me - 1.
 *   ms     : per-iteration pointer step.
 *
 * For k = 1..3 the pair (a_k, b_k) = (cr[k], ci[k]) is combined with a
 * coefficient pair (c, s) into p_k = c*a_k + s*b_k and q_k = c*b_k - s*a_k.
 * The pair for k = 2 is derived from the two stored pairs.
 */
static void hf2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Skip to the twiddle record that belongs to the first iteration. */
     W += (mb - 1) * 4;

     for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          E wa_c, wa_s, wb_c, wb_s, wc_c, wc_s;
          E a0, b0, a1, b1, a2, b2, a3, b3;
          E p1, q1, p2, q2, p3, q3;

          /* Stored coefficients: pair A multiplies element 1, pair B element 3. */
          wa_c = W[0];
          wa_s = W[1];
          wb_c = W[2];
          wb_s = W[3];

          /* Derived pair C (for element 2), built from A and B. */
          {
               E cc = wa_c * wb_c;
               E cs = wa_c * wb_s;
               wc_c = FMA(wa_s, wb_s, cc);
               wc_s = FNMS(wa_s, wb_c, cs);
          }

          /* Fetch every input before anything is written back. */
          a0 = cr[0];
          b0 = ci[0];
          a1 = cr[WS(rs, 1)];
          b1 = ci[WS(rs, 1)];
          a2 = cr[WS(rs, 2)];
          b2 = ci[WS(rs, 2)];
          a3 = cr[WS(rs, 3)];
          b3 = ci[WS(rs, 3)];

          /* Element 2 with pair C. */
          {
               E ca = wc_c * a2;
               E cb = wc_c * b2;
               p2 = FMA(wc_s, b2, ca);
               q2 = FNMS(wc_s, a2, cb);
          }
          /* Element 1 with pair A. */
          {
               E ca = wa_c * a1;
               E cb = wa_c * b1;
               p1 = FMA(wa_s, b1, ca);
               q1 = FNMS(wa_s, a1, cb);
          }
          /* Element 3 with pair B. */
          {
               E ca = wb_c * a3;
               E cb = wb_c * b3;
               p3 = FMA(wb_s, b3, ca);
               q3 = FNMS(wb_s, a3, cb);
          }

          /* First output group: ci[1], cr[0], ci[0], cr[1]. */
          {
               E sum02 = a0 + p2;
               E sum13 = p1 + p3;
               E dif02 = a0 - p2;
               E qdif13 = q1 - q3;
               ci[WS(rs, 1)] = sum02 - sum13;
               cr[0] = sum02 + sum13;
               ci[0] = dif02 - qdif13;
               cr[WS(rs, 1)] = dif02 + qdif13;
          }
          /* Second output group: cr[2], ci[3], cr[3], ci[2]. */
          {
               E qsum13 = q1 + q3;
               E qsum02 = q2 + b0;
               E pdif31 = p3 - p1;
               E qdif02 = b0 - q2;
               cr[WS(rs, 2)] = qsum13 - qsum02;
               ci[WS(rs, 3)] = qsum13 + qsum02;
               cr[WS(rs, 3)] = pdif31 - qdif02;
               ci[WS(rs, 2)] = pdif31 + qdif02;
          }
     }
}
|
||||
|
||||
/* Twiddle layout requested by hf2_4: two TW_CEXP entries (last field 1 and 3)
 * followed by the TW_NEXT terminator.  The kernel reads W[0..3] per
 * iteration -- presumably one pair of values per TW_CEXP entry; confirm
 * against the tw_instr definition, which is declared outside this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to X(khc2hc_register): size 4, codelet name, twiddle
 * layout, genus, and the counts quoted in the generator header of this
 * variant (16 additions, 8 multiplications, 8 fused multiply/adds), with a
 * final 0. */
static const hc2hc_desc desc = { 4, "hf2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
|
||||
|
||||
/*
 * Planner hook for the FMA build of hf2_4: hands the kernel function and its
 * descriptor to the hc2hc registrar.
 */
void X(codelet_hf2_4)(planner *p)
{
     X(khc2hc_register)(p, hf2_4, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -dit -name hf2_4 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf2_4 (non-FMA variant): size-4 twiddle kernel, working in place.
 *
 *   cr, ci : data pointers.  Each iteration reads and then overwrites
 *            cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..3; afterwards cr
 *            moves by +ms and ci by -ms.
 *   W      : twiddle table, four values per iteration, starting at record
 *            (mb - 1).
 *   rs     : stride handed to WS().
 *   mb, me : iterations run for m = mb, ..., me - 1.
 *   ms     : per-iteration pointer step.
 *
 * For k = 1..3 the pair (a_k, b_k) = (cr[k], ci[k]) is combined with a
 * coefficient pair (c, s) into p_k = c*a_k + s*b_k and q_k = c*b_k - s*a_k.
 * The pair for k = 2 is derived from the two stored pairs.
 */
static void hf2_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     /* Skip to the twiddle record that belongs to the first iteration. */
     W += (mb - 1) * 4;

     for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          E wa_c, wa_s, wb_c, wb_s, wc_c, wc_s;
          E a0, b0, a1, b1, a2, b2, a3, b3;
          E p1, q1, p2, q2, p3, q3;

          /* Stored coefficients: pair A multiplies element 1, pair B element 3. */
          wa_c = W[0];
          wa_s = W[1];
          wb_c = W[2];
          wb_s = W[3];

          /* Derived pair C (for element 2), built from A and B. */
          wc_c = FMA(wa_c, wb_c, wa_s * wb_s);
          wc_s = FNMS(wa_s, wb_c, wa_c * wb_s);

          /* Fetch every input before anything is written back. */
          a0 = cr[0];
          b0 = ci[0];
          a1 = cr[WS(rs, 1)];
          b1 = ci[WS(rs, 1)];
          a2 = cr[WS(rs, 2)];
          b2 = ci[WS(rs, 2)];
          a3 = cr[WS(rs, 3)];
          b3 = ci[WS(rs, 3)];

          /* Apply the coefficient pairs to elements 2, 1 and 3. */
          p2 = FMA(wc_c, a2, wc_s * b2);
          q2 = FNMS(wc_s, a2, wc_c * b2);
          p1 = FMA(wa_c, a1, wa_s * b1);
          q1 = FNMS(wa_s, a1, wa_c * b1);
          p3 = FMA(wb_c, a3, wb_s * b3);
          q3 = FNMS(wb_s, a3, wb_c * b3);

          /* First output group: ci[1], cr[0], ci[0], cr[1]. */
          {
               E sum02 = a0 + p2;
               E sum13 = p1 + p3;
               E dif02 = a0 - p2;
               E qdif13 = q1 - q3;
               ci[WS(rs, 1)] = sum02 - sum13;
               cr[0] = sum02 + sum13;
               ci[0] = dif02 - qdif13;
               cr[WS(rs, 1)] = dif02 + qdif13;
          }
          /* Second output group: cr[2], ci[3], cr[3], ci[2]. */
          {
               E qsum13 = q1 + q3;
               E qsum02 = q2 + b0;
               E pdif31 = p3 - p1;
               E qdif02 = b0 - q2;
               cr[WS(rs, 2)] = qsum13 - qsum02;
               ci[WS(rs, 3)] = qsum13 + qsum02;
               cr[WS(rs, 3)] = pdif31 - qdif02;
               ci[WS(rs, 2)] = pdif31 + qdif02;
          }
     }
}
|
||||
|
||||
/* Twiddle layout requested by hf2_4: two TW_CEXP entries (last field 1 and 3)
 * followed by the TW_NEXT terminator.  The kernel reads W[0..3] per
 * iteration -- presumably one pair of values per TW_CEXP entry; confirm
 * against the tw_instr definition, which is declared outside this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to X(khc2hc_register): size 4, codelet name, twiddle
 * layout, genus, and the counts quoted in the generator header of this
 * variant (16 additions, 8 multiplications, 8 fused multiply/adds), with a
 * final 0. */
static const hc2hc_desc desc = { 4, "hf2_4", twinstr, &GENUS, { 16, 8, 8, 0 } };
|
||||
|
||||
/*
 * Planner hook for the non-FMA build of hf2_4: hands the kernel function and
 * its descriptor to the hc2hc registrar.
 */
void X(codelet_hf2_4)(planner *p)
{
     X(khc2hc_register)(p, hf2_4, &desc);
}
|
||||
#endif
|
||||
264
fftw-3.3.10/rdft/scalar/r2cf/hf2_5.c
Normal file
264
fftw-3.3.10/rdft/scalar/r2cf/hf2_5.c
Normal file
@@ -0,0 +1,264 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:20 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -dit -name hf2_5 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 44 FP additions, 40 FP multiplications,
|
||||
* (or, 14 additions, 10 multiplications, 30 fused multiply/add),
|
||||
* 38 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf2_5 (FMA variant): generated size-5 twiddle kernel (gen_hc2hc, -n 5 -dit
 * -twiddle-log3).  The kernel works in place.
 *
 *   cr, ci : data pointers.  Every iteration reads and then overwrites
 *            cr[WS(rs, k)] and ci[WS(rs, k)] for k = 0..4.  After each
 *            iteration cr moves by +ms and ci moves by -ms.
 *   W      : twiddle table.  Four values (W[0..3]) are consumed per
 *            iteration; the pointer is first advanced by (mb - 1) * 4.
 *   rs     : stride handed to WS() to address element k.
 *   mb, me : iterations run for m = mb, ..., me - 1.
 *   ms     : per-iteration pointer step for cr / ci.
 */
static void hf2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* sin(2*pi/5) */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
/* sqrt(5)/4 */
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
/* (sqrt(5) - 1)/2 */
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
/* 1/4 */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; cr walks forward, ci walks backward, W advances by one
 * 4-value twiddle record. */
for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
|
||||
/* Read the four stored twiddle values and build two further coefficient
 * pairs, (Tb, Tf) and (Tj, Tm), from their products. */
E T2, Ta, T8, T5, Tb, Tm, Tf, Tj, T9, Te;
|
||||
T2 = W[0];
|
||||
Ta = W[3];
|
||||
T8 = W[2];
|
||||
T9 = T2 * T8;
|
||||
Te = T2 * Ta;
|
||||
T5 = W[1];
|
||||
Tb = FNMS(T5, Ta, T9);
|
||||
Tm = FNMS(T5, T8, Te);
|
||||
Tf = FMA(T5, T8, Te);
|
||||
Tj = FMA(T5, Ta, T9);
|
||||
{
|
||||
E T1, TL, T7, Th, Ti, Tz, TB, TM, To, Ts, Tt, TE, TG, TN;
|
||||
/* Element 0 is used as loaded. */
T1 = cr[0];
|
||||
TL = ci[0];
|
||||
/* Elements 1 and 4. */
{
|
||||
E T3, T4, T6, Ty, Tc, Td, Tg, TA;
|
||||
T3 = cr[WS(rs, 1)];
|
||||
T4 = T2 * T3;
|
||||
T6 = ci[WS(rs, 1)];
|
||||
Ty = T2 * T6;
|
||||
Tc = cr[WS(rs, 4)];
|
||||
Td = Tb * Tc;
|
||||
Tg = ci[WS(rs, 4)];
|
||||
TA = Tb * Tg;
|
||||
T7 = FMA(T5, T6, T4);
|
||||
Th = FMA(Tf, Tg, Td);
|
||||
Ti = T7 + Th;
|
||||
Tz = FNMS(T5, T3, Ty);
|
||||
TB = FNMS(Tf, Tc, TA);
|
||||
TM = Tz + TB;
|
||||
}
|
||||
/* Elements 2 and 3. */
{
|
||||
E Tk, Tl, Tn, TD, Tp, Tq, Tr, TF;
|
||||
Tk = cr[WS(rs, 2)];
|
||||
Tl = Tj * Tk;
|
||||
Tn = ci[WS(rs, 2)];
|
||||
TD = Tj * Tn;
|
||||
Tp = cr[WS(rs, 3)];
|
||||
Tq = T8 * Tp;
|
||||
Tr = ci[WS(rs, 3)];
|
||||
TF = T8 * Tr;
|
||||
To = FMA(Tm, Tn, Tl);
|
||||
Ts = FMA(Ta, Tr, Tq);
|
||||
Tt = To + Ts;
|
||||
TE = FNMS(Tm, Tk, TD);
|
||||
TG = FNMS(Ta, Tp, TF);
|
||||
TN = TE + TG;
|
||||
}
|
||||
/* All loads from cr/ci are complete at this point; only stores follow.
 * This group stores cr[0], cr[1], cr[2], ci[0] and ci[1]. */
{
|
||||
E Tw, Tu, Tv, TI, TK, TC, TH, Tx, TJ;
|
||||
Tw = Ti - Tt;
|
||||
Tu = Ti + Tt;
|
||||
Tv = FNMS(KP250000000, Tu, T1);
|
||||
TC = Tz - TB;
|
||||
TH = TE - TG;
|
||||
TI = FMA(KP618033988, TH, TC);
|
||||
TK = FNMS(KP618033988, TC, TH);
|
||||
cr[0] = T1 + Tu;
|
||||
Tx = FMA(KP559016994, Tw, Tv);
|
||||
ci[0] = FNMS(KP951056516, TI, Tx);
|
||||
cr[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
|
||||
TJ = FNMS(KP559016994, Tw, Tv);
|
||||
cr[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
|
||||
ci[WS(rs, 1)] = FMA(KP951056516, TK, TJ);
|
||||
}
|
||||
/* Stores cr[3], cr[4], ci[2], ci[3] and ci[4]. */
{
|
||||
E TQ, TO, TP, TU, TW, TS, TT, TV, TR;
|
||||
TQ = TM - TN;
|
||||
TO = TM + TN;
|
||||
TP = FNMS(KP250000000, TO, TL);
|
||||
TS = To - Ts;
|
||||
TT = Th - T7;
|
||||
TU = FMA(KP618033988, TT, TS);
|
||||
TW = FNMS(KP618033988, TS, TT);
|
||||
ci[WS(rs, 4)] = TO + TL;
|
||||
TV = FMA(KP559016994, TQ, TP);
|
||||
cr[WS(rs, 4)] = FMS(KP951056516, TW, TV);
|
||||
ci[WS(rs, 3)] = FMA(KP951056516, TW, TV);
|
||||
TR = FNMS(KP559016994, TQ, TP);
|
||||
cr[WS(rs, 3)] = FMS(KP951056516, TU, TR);
|
||||
ci[WS(rs, 2)] = FMA(KP951056516, TU, TR);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the hf2_5 codelet: size 5, its name,
 * the twiddle table above and the GENUS object.
 * NOTE(review): the trailing numbers look like the generator's operation
 * counts (additions, multiplications, fused multiply/adds, plus one more
 * field that is 0) -- confirm against the hc2hc_desc definition.
 */
static const hc2hc_desc desc = {
    5, "hf2_5", twinstr, &GENUS,
    { 14, 10, 30, 0 }
};
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hf2_5 kernel and its descriptor,
 * together with the planner p, to X(khc2hc_register).
 */
void X(codelet_hf2_5)(planner *p)
{
    X(khc2hc_register)(p, hf2_5, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -dit -name hf2_5 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 44 FP additions, 32 FP multiplications,
|
||||
* (or, 30 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 37 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf2_5 (variant without -fma; generated with -n 5 -dit -twiddle-log3).
 *
 * In-place kernel over the arrays cr and ci.  For every m in [mb, me) it
 * reads the five strided pairs (cr[WS(rs, k)], ci[WS(rs, k)]), k = 0..4,
 * combines pairs 1..4 with coefficients built from W[0..3], and stores ten
 * results back into the same slots.  Between iterations cr moves forward by
 * ms, ci moves backward by ms and W moves forward by 4.
 *
 * NOTE(review): the arithmetic remarks below assume FMA(a, b, c) = a*b + c
 * and FNMS(a, b, c) = c - a*b -- confirm against the macro definitions.
 */
static void hf2_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP587785252, +0.587785252292473129168705954639072768597652438);
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    {
        INT m;

        /* Skip the coefficients belonging to iterations before mb. */
        W += (mb - 1) * 4;
        for (m = mb; m < me;
             ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(10, rs)) {
            E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
            {
                /* Two stored pairs (T2, T4) and (T7, T9); two more pairs
                   (Tb, Tf) and (Tj, Tl) are built from their products. */
                E T8, Te, Ta, Td;
                T2 = W[0];
                T4 = W[1];
                T7 = W[2];
                T9 = W[3];
                T8 = T2 * T7;
                Te = T4 * T7;
                Ta = T4 * T9;
                Td = T2 * T9;
                Tb = T8 - Ta;
                Tl = Td - Te;
                Tf = Td + Te;
                Tj = T8 + Ta;
            }
            {
                E T1, TI, Ty, TB, TG, TF, TJ, TK, TL, Ti, Tr, Ts;
                T1 = cr[0];
                TI = ci[0];
                {
                    E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
                    {
                        /* Inputs 1 and 3 use the stored pairs directly. */
                        E T3, T5, To, Tp;
                        T3 = cr[WS(rs, 1)];
                        T5 = ci[WS(rs, 1)];
                        T6 = FMA(T2, T3, T4 * T5);
                        Tw = FNMS(T4, T3, T2 * T5);
                        To = cr[WS(rs, 3)];
                        Tp = ci[WS(rs, 3)];
                        Tq = FMA(T7, To, T9 * Tp);
                        TA = FNMS(T9, To, T7 * Tp);
                    }
                    {
                        /* Inputs 4 and 2 use the derived pairs. */
                        E Tc, Tg, Tk, Tm;
                        Tc = cr[WS(rs, 4)];
                        Tg = ci[WS(rs, 4)];
                        Th = FMA(Tb, Tc, Tf * Tg);
                        Tx = FNMS(Tf, Tc, Tb * Tg);
                        Tk = cr[WS(rs, 2)];
                        Tm = ci[WS(rs, 2)];
                        Tn = FMA(Tj, Tk, Tl * Tm);
                        Tz = FNMS(Tl, Tk, Tj * Tm);
                    }
                    /* Sums and differences shared by the output stages. */
                    Ty = Tw - Tx;
                    TB = Tz - TA;
                    TG = Tn - Tq;
                    TF = Th - T6;
                    TJ = Tw + Tx;
                    TK = Tz + TA;
                    TL = TJ + TK;
                    Ti = T6 + Th;
                    Tr = Tn + Tq;
                    Ts = Ti + Tr;
                }
                cr[0] = T1 + Ts;
                {
                    E TC, TE, Tv, TD, Tt, Tu;
                    TC = FMA(KP951056516, Ty, KP587785252 * TB);
                    TE = FNMS(KP587785252, Ty, KP951056516 * TB);
                    Tt = KP559016994 * (Ti - Tr);
                    Tu = FNMS(KP250000000, Ts, T1);
                    Tv = Tt + Tu;
                    TD = Tu - Tt;
                    ci[0] = Tv - TC;
                    ci[WS(rs, 1)] = TD + TE;
                    cr[WS(rs, 1)] = Tv + TC;
                    cr[WS(rs, 2)] = TD - TE;
                }
                ci[WS(rs, 4)] = TL + TI;
                {
                    E TH, TP, TO, TQ, TM, TN;
                    TH = FMA(KP587785252, TF, KP951056516 * TG);
                    TP = FNMS(KP587785252, TG, KP951056516 * TF);
                    TM = FNMS(KP250000000, TL, TI);
                    TN = KP559016994 * (TJ - TK);
                    TO = TM - TN;
                    TQ = TN + TM;
                    cr[WS(rs, 3)] = TH - TO;
                    ci[WS(rs, 3)] = TP + TQ;
                    ci[WS(rs, 2)] = TH + TO;
                    cr[WS(rs, 4)] = TP - TQ;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the hf2_5 codelet: size 5, its name,
 * the twiddle table above and the GENUS object.  The first three counts
 * repeat the generator summary for this variant (30 additions,
 * 18 multiplications, 14 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count (0) is not visible here.
 */
static const hc2hc_desc desc = {
    5, "hf2_5", twinstr, &GENUS,
    { 30, 18, 14, 0 }
};
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hf2_5 kernel and its descriptor,
 * together with the planner p, to X(khc2hc_register).
 */
void X(codelet_hf2_5)(planner *p)
{
    X(khc2hc_register)(p, hf2_5, &desc);
}
|
||||
#endif
|
||||
390
fftw-3.3.10/rdft/scalar/r2cf/hf2_8.c
Normal file
390
fftw-3.3.10/rdft/scalar/r2cf/hf2_8.c
Normal file
@@ -0,0 +1,390 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:18 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 50 FP multiplications,
|
||||
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
|
||||
* 48 stack variables, 1 constant, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf2_8 (variant generated with -fma; -n 8 -dit -twiddle-log3).
 *
 * In-place kernel over the arrays cr and ci.  For every m in [mb, me) it
 * reads the eight strided pairs (cr[WS(rs, k)], ci[WS(rs, k)]), k = 0..7,
 * combines pairs 1..7 with coefficients built from W[0..5], and stores
 * sixteen results back into the same slots.  Between iterations cr moves
 * forward by ms, ci moves backward by ms and W moves forward by 6.
 *
 * NOTE(review): FMA(a, b, c), FNMS(a, b, c) and FMS(a, b, c) are presumably
 * a*b + c, c - a*b and a*b - c -- confirm against the macro definitions.
 */
static void hf2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;

        /* Skip the coefficients belonging to iterations before mb. */
        W += (mb - 1) * 6;
        for (m = mb; m < me;
             ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
            E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
            {
                /* Three stored pairs (T2, T5), (T3, T6), (Tl, Tn); the pairs
                   (T7, Tb), (Tf, Ti), (To, Ts) and (TC, TG) are derived. */
                E T4, Tm, Tr, Ta, TB, TF;
                T2 = W[0];
                T3 = W[2];
                T4 = T2 * T3;
                Tl = W[4];
                Tm = T2 * Tl;
                Tn = W[5];
                Tr = T2 * Tn;
                T5 = W[1];
                T6 = W[3];
                Ta = T2 * T6;
                Tf = FMA(T5, T6, T4);
                T7 = FNMS(T5, T6, T4);
                Ts = FNMS(T5, Tl, Tr);
                Tb = FMA(T5, T3, Ta);
                To = FMA(T5, Tn, Tm);
                TB = Tf * Tl;
                TF = Tf * Tn;
                Ti = FNMS(T5, T3, Ta);
                TC = FMA(Ti, Tn, TB);
                TG = FNMS(Ti, Tl, TF);
            }
            {
                E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
                E TI, T11, T13, T15, T16;
                T1 = cr[0];
                T1s = ci[0];
                {
                    /* Input 4 with (T7, Tb). */
                    E T8, T9, Tc, T1q;
                    T8 = cr[WS(rs, 4)];
                    T9 = T7 * T8;
                    Tc = ci[WS(rs, 4)];
                    T1q = T7 * Tc;
                    Td = FMA(Tb, Tc, T9);
                    T1r = FNMS(Tb, T8, T1q);
                }
                {
                    /* Input 6 with (To, Ts). */
                    E Tp, Tq, Tt, TX;
                    Tp = cr[WS(rs, 6)];
                    Tq = To * Tp;
                    Tt = ci[WS(rs, 6)];
                    TX = To * Tt;
                    Tu = FMA(Ts, Tt, Tq);
                    TY = FNMS(Ts, Tp, TX);
                }
                {
                    /* Input 2 with (Tf, Ti). */
                    E Tg, Th, Tj, TV;
                    Tg = cr[WS(rs, 2)];
                    Th = Tf * Tg;
                    Tj = ci[WS(rs, 2)];
                    TV = Tf * Tj;
                    Tk = FMA(Ti, Tj, Th);
                    TW = FNMS(Ti, Tg, TV);
                }
                {
                    /* Inputs 7 and 3 with the stored pairs (Tl, Tn), (T3, T6). */
                    E TK, TL, TM, T19, TO, TP, TQ, T1b;
                    TK = cr[WS(rs, 7)];
                    TL = Tl * TK;
                    TM = ci[WS(rs, 7)];
                    T19 = Tl * TM;
                    TO = cr[WS(rs, 3)];
                    TP = T3 * TO;
                    TQ = ci[WS(rs, 3)];
                    T1b = T3 * TQ;
                    TN = FMA(Tn, TM, TL);
                    TR = FMA(T6, TQ, TP);
                    T18 = TN - TR;
                    T1a = FNMS(Tn, TK, T19);
                    T1c = FNMS(T6, TO, T1b);
                    T1d = T1a - T1c;
                }
                {
                    /* Inputs 1 and 5 with (T2, T5) and (TC, TG). */
                    E Tx, Ty, Tz, T12, TD, TE, TH, T14;
                    Tx = cr[WS(rs, 1)];
                    Ty = T2 * Tx;
                    Tz = ci[WS(rs, 1)];
                    T12 = T2 * Tz;
                    TD = cr[WS(rs, 5)];
                    TE = TC * TD;
                    TH = ci[WS(rs, 5)];
                    T14 = TC * TH;
                    TA = FMA(T5, Tz, Ty);
                    TI = FMA(TG, TH, TE);
                    T11 = TA - TI;
                    T13 = FNMS(T5, Tx, T12);
                    T15 = FNMS(TG, TD, T14);
                    T16 = T13 - T15;
                }
                {
                    /* Outputs that involve the KP707106781 factor. */
                    E T10, T1g, T1z, T1B, T1f, T1A, T1j, T1C;
                    {
                        E TU, TZ, T1x, T1y;
                        TU = T1 - Td;
                        TZ = TW - TY;
                        T10 = TU + TZ;
                        T1g = TU - TZ;
                        T1x = Tk - Tu;
                        T1y = T1s - T1r;
                        T1z = T1x + T1y;
                        T1B = T1y - T1x;
                    }
                    {
                        E T17, T1e, T1h, T1i;
                        T17 = T11 + T16;
                        T1e = T18 - T1d;
                        T1f = T17 + T1e;
                        T1A = T1e - T17;
                        T1h = T11 - T16;
                        T1i = T18 + T1d;
                        T1j = T1h + T1i;
                        T1C = T1i - T1h;
                    }
                    ci[WS(rs, 2)] = FNMS(KP707106781, T1f, T10);
                    cr[WS(rs, 5)] = FMS(KP707106781, T1C, T1B);
                    ci[WS(rs, 6)] = FMA(KP707106781, T1C, T1B);
                    cr[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
                    cr[WS(rs, 3)] = FNMS(KP707106781, T1j, T1g);
                    cr[WS(rs, 7)] = FMS(KP707106781, T1A, T1z);
                    ci[WS(rs, 4)] = FMA(KP707106781, T1A, T1z);
                    ci[0] = FMA(KP707106781, T1j, T1g);
                }
                {
                    /* Outputs built from plain sums and differences. */
                    E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
                    {
                        E Te, Tv, T1p, T1t;
                        Te = T1 + Td;
                        Tv = Tk + Tu;
                        Tw = Te + Tv;
                        T1k = Te - Tv;
                        T1p = TW + TY;
                        T1t = T1r + T1s;
                        T1u = T1p + T1t;
                        T1w = T1t - T1p;
                    }
                    {
                        E TJ, TS, T1l, T1m;
                        TJ = TA + TI;
                        TS = TN + TR;
                        TT = TJ + TS;
                        T1v = TS - TJ;
                        T1l = T1a + T1c;
                        T1m = T13 + T15;
                        T1n = T1l - T1m;
                        T1o = T1m + T1l;
                    }
                    ci[WS(rs, 3)] = Tw - TT;
                    cr[WS(rs, 6)] = T1v - T1w;
                    ci[WS(rs, 5)] = T1v + T1w;
                    cr[0] = Tw + TT;
                    cr[WS(rs, 2)] = T1k - T1n;
                    cr[WS(rs, 4)] = T1o - T1u;
                    ci[WS(rs, 7)] = T1o + T1u;
                    ci[WS(rs, 1)] = T1k + T1n;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the hf2_8 codelet: size 8, its name,
 * the twiddle table above and the GENUS object.  The first three counts
 * repeat the generator summary for this variant (44 additions,
 * 20 multiplications, 30 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count (0) is not visible here.
 */
static const hc2hc_desc desc = {
    8, "hf2_8", twinstr, &GENUS,
    { 44, 20, 30, 0 }
};
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hf2_8 kernel and its descriptor,
 * together with the planner p, to X(khc2hc_register).
 */
void X(codelet_hf2_8)(planner *p)
{
    X(khc2hc_register)(p, hf2_8, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -dit -name hf2_8 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 44 FP multiplications,
|
||||
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
|
||||
* 42 stack variables, 1 constant, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf2_8 (variant without -fma; generated with -n 8 -dit -twiddle-log3).
 *
 * In-place kernel over the arrays cr and ci.  For every m in [mb, me) it
 * reads the eight strided pairs (cr[WS(rs, k)], ci[WS(rs, k)]), k = 0..7,
 * combines pairs 1..7 with coefficients built from W[0..5], and stores
 * sixteen results back into the same slots.  Between iterations cr moves
 * forward by ms, ci moves backward by ms and W moves forward by 6.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are presumably a*b + c and
 * c - a*b -- confirm against the macro definitions.
 */
static void hf2_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;

        /* Skip the coefficients belonging to iterations before mb. */
        W += (mb - 1) * 6;
        for (m = mb; m < me;
             ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
            E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
            {
                /* Three stored pairs (T2, T5), (T3, T6), (Tl, Tm); the pairs
                   (T8, Tc), (Tg, Ti), (Tn, Tp) and (Tx, Tz) are derived. */
                E T4, Tb, T7, Ta;
                T2 = W[0];
                T5 = W[1];
                T3 = W[2];
                T6 = W[3];
                T4 = T2 * T3;
                Tb = T5 * T3;
                T7 = T5 * T6;
                Ta = T2 * T6;
                T8 = T4 - T7;
                Tc = Ta + Tb;
                Tg = T4 + T7;
                Ti = Ta - Tb;
                Tl = W[4];
                Tm = W[5];
                Tn = FMA(T2, Tl, T5 * Tm);
                Tz = FNMS(Ti, Tl, Tg * Tm);
                Tp = FNMS(T5, Tl, T2 * Tm);
                Tx = FMA(Tg, Tl, Ti * Tm);
            }
            {
                E Tf, T1j, TL, T1d, TJ, T16, TV, TY, Ts, T1i, TO, T1a, TC, T17, TQ;
                E TT;
                {
                    /* Inputs 0 and 4. */
                    E T1, T1c, Te, T1b, T9, Td;
                    T1 = cr[0];
                    T1c = ci[0];
                    T9 = cr[WS(rs, 4)];
                    Td = ci[WS(rs, 4)];
                    Te = FMA(T8, T9, Tc * Td);
                    T1b = FNMS(Tc, T9, T8 * Td);
                    Tf = T1 + Te;
                    T1j = T1c - T1b;
                    TL = T1 - Te;
                    T1d = T1b + T1c;
                }
                {
                    /* Inputs 7 and 3. */
                    E TF, TW, TI, TX;
                    {
                        E TD, TE, TG, TH;
                        TD = cr[WS(rs, 7)];
                        TE = ci[WS(rs, 7)];
                        TF = FMA(Tl, TD, Tm * TE);
                        TW = FNMS(Tm, TD, Tl * TE);
                        TG = cr[WS(rs, 3)];
                        TH = ci[WS(rs, 3)];
                        TI = FMA(T3, TG, T6 * TH);
                        TX = FNMS(T6, TG, T3 * TH);
                    }
                    TJ = TF + TI;
                    T16 = TW + TX;
                    TV = TF - TI;
                    TY = TW - TX;
                }
                {
                    /* Inputs 2 and 6. */
                    E Tk, TM, Tr, TN;
                    {
                        E Th, Tj, To, Tq;
                        Th = cr[WS(rs, 2)];
                        Tj = ci[WS(rs, 2)];
                        Tk = FMA(Tg, Th, Ti * Tj);
                        TM = FNMS(Ti, Th, Tg * Tj);
                        To = cr[WS(rs, 6)];
                        Tq = ci[WS(rs, 6)];
                        Tr = FMA(Tn, To, Tp * Tq);
                        TN = FNMS(Tp, To, Tn * Tq);
                    }
                    Ts = Tk + Tr;
                    T1i = Tk - Tr;
                    TO = TM - TN;
                    T1a = TM + TN;
                }
                {
                    /* Inputs 1 and 5. */
                    E Tw, TR, TB, TS;
                    {
                        E Tu, Tv, Ty, TA;
                        Tu = cr[WS(rs, 1)];
                        Tv = ci[WS(rs, 1)];
                        Tw = FMA(T2, Tu, T5 * Tv);
                        TR = FNMS(T5, Tu, T2 * Tv);
                        Ty = cr[WS(rs, 5)];
                        TA = ci[WS(rs, 5)];
                        TB = FMA(Tx, Ty, Tz * TA);
                        TS = FNMS(Tz, Ty, Tx * TA);
                    }
                    TC = Tw + TB;
                    T17 = TR + TS;
                    TQ = Tw - TB;
                    TT = TR - TS;
                }
                {
                    /* First half of the stores. */
                    E Tt, TK, T1f, T1g;
                    Tt = Tf + Ts;
                    TK = TC + TJ;
                    ci[WS(rs, 3)] = Tt - TK;
                    cr[0] = Tt + TK;
                    T1f = TJ - TC;
                    T1g = T1d - T1a;
                    cr[WS(rs, 6)] = T1f - T1g;
                    ci[WS(rs, 5)] = T1f + T1g;
                    {
                        E T11, T1m, T14, T1l, T12, T13;
                        T11 = TL - TO;
                        T1m = T1j - T1i;
                        T12 = TQ - TT;
                        T13 = TV + TY;
                        T14 = KP707106781 * (T12 + T13);
                        T1l = KP707106781 * (T13 - T12);
                        cr[WS(rs, 3)] = T11 - T14;
                        ci[WS(rs, 6)] = T1l + T1m;
                        ci[0] = T11 + T14;
                        cr[WS(rs, 5)] = T1l - T1m;
                    }
                }
                {
                    /* Second half of the stores. */
                    E T19, T1e, T15, T18;
                    T19 = T17 + T16;
                    T1e = T1a + T1d;
                    cr[WS(rs, 4)] = T19 - T1e;
                    ci[WS(rs, 7)] = T19 + T1e;
                    T15 = Tf - Ts;
                    T18 = T16 - T17;
                    cr[WS(rs, 2)] = T15 - T18;
                    ci[WS(rs, 1)] = T15 + T18;
                    {
                        E TP, T1k, T10, T1h, TU, TZ;
                        TP = TL + TO;
                        T1k = T1i + T1j;
                        TU = TQ + TT;
                        TZ = TV - TY;
                        T10 = KP707106781 * (TU + TZ);
                        T1h = KP707106781 * (TZ - TU);
                        ci[WS(rs, 2)] = TP - T10;
                        ci[WS(rs, 4)] = T1h + T1k;
                        cr[WS(rs, 1)] = TP + T10;
                        cr[WS(rs, 7)] = T1h - T1k;
                    }
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 1, 1 },
|
||||
{ TW_CEXP, 1, 3 },
|
||||
{ TW_CEXP, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the hf2_8 codelet: size 8, its name,
 * the twiddle table above and the GENUS object.  The first three counts
 * repeat the generator summary for this variant (56 additions,
 * 26 multiplications, 18 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count (0) is not visible here.
 */
static const hc2hc_desc desc = {
    8, "hf2_8", twinstr, &GENUS,
    { 56, 26, 18, 0 }
};
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hf2_8 kernel and its descriptor,
 * together with the planner p, to X(khc2hc_register).
 */
void X(codelet_hf2_8)(planner *p)
{
    X(khc2hc_register)(p, hf2_8, &desc);
}
|
||||
#endif
|
||||
489
fftw-3.3.10/rdft/scalar/r2cf/hf_10.c
Normal file
489
fftw-3.3.10/rdft/scalar/r2cf/hf_10.c
Normal file
@@ -0,0 +1,489 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:13 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hf_10 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 72 FP multiplications,
|
||||
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
|
||||
* 47 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_10 (variant generated with -fma; -n 10 -dit).
 *
 * In-place kernel over the arrays cr and ci.  For every m in [mb, me) it
 * reads the ten strided pairs (cr[WS(rs, k)], ci[WS(rs, k)]), k = 0..9,
 * combines pair k (k = 1..9) with the coefficient pair
 * (W[2k - 2], W[2k - 1]), and stores twenty results back into the same
 * slots.  Between iterations cr moves forward by ms, ci moves backward by
 * ms and W moves forward by 18.
 *
 * NOTE(review): FMA(a, b, c), FNMS(a, b, c) and FMS(a, b, c) are presumably
 * a*b + c, c - a*b and a*b - c -- confirm against the macro definitions.
 */
static void hf_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP618033988, +0.618033988749894848204586834365638117720309180);
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    {
        INT m;

        /* Skip the coefficients belonging to iterations before mb. */
        W += (mb - 1) * 18;
        for (m = mb; m < me;
             ++m, cr += ms, ci -= ms, W += 18, MAKE_VOLATILE_STRIDE(20, rs)) {
            E T8, T23, T12, T1U, TM, TZ, T10, T1F, T1G, T25, T16, T17, T18, T1s, T1x;
            E T1P, Tl, Ty, Tz, T1I, T1J, T24, T13, T14, T15, T1h, T1m, T1O;
            {
                /* Inputs 0 and 5. */
                E T1, T1R, T3, T6, T4, T1S, T2, T7, T1T, T5;
                T1 = cr[0];
                T1R = ci[0];
                T3 = cr[WS(rs, 5)];
                T6 = ci[WS(rs, 5)];
                T2 = W[8];
                T4 = T2 * T3;
                T1S = T2 * T6;
                T5 = W[9];
                T7 = FMA(T5, T6, T4);
                T1T = FNMS(T5, T3, T1S);
                T8 = T1 - T7;
                T23 = T1T + T1R;
                T12 = T1 + T7;
                T1U = T1R - T1T;
            }
            {
                /* Inputs 4, 1, 9 and 6. */
                E TF, T1w, TY, T1p, TL, T1u, TS, T1r;
                {
                    E TB, TE, TC, T1v, TA, TD;
                    TB = cr[WS(rs, 4)];
                    TE = ci[WS(rs, 4)];
                    TA = W[6];
                    TC = TA * TB;
                    T1v = TA * TE;
                    TD = W[7];
                    TF = FMA(TD, TE, TC);
                    T1w = FNMS(TD, TB, T1v);
                }
                {
                    E TU, TX, TV, T1o, TT, TW;
                    TU = cr[WS(rs, 1)];
                    TX = ci[WS(rs, 1)];
                    TT = W[0];
                    TV = TT * TU;
                    T1o = TT * TX;
                    TW = W[1];
                    TY = FMA(TW, TX, TV);
                    T1p = FNMS(TW, TU, T1o);
                }
                {
                    E TH, TK, TI, T1t, TG, TJ;
                    TH = cr[WS(rs, 9)];
                    TK = ci[WS(rs, 9)];
                    TG = W[16];
                    TI = TG * TH;
                    T1t = TG * TK;
                    TJ = W[17];
                    TL = FMA(TJ, TK, TI);
                    T1u = FNMS(TJ, TH, T1t);
                }
                {
                    E TO, TR, TP, T1q, TN, TQ;
                    TO = cr[WS(rs, 6)];
                    TR = ci[WS(rs, 6)];
                    TN = W[10];
                    TP = TN * TO;
                    T1q = TN * TR;
                    TQ = W[11];
                    TS = FMA(TQ, TR, TP);
                    T1r = FNMS(TQ, TO, T1q);
                }
                TM = TF - TL;
                TZ = TS - TY;
                T10 = TM + TZ;
                T1F = T1w + T1u;
                T1G = T1r + T1p;
                T25 = T1F + T1G;
                T16 = TF + TL;
                T17 = TS + TY;
                T18 = T16 + T17;
                T1s = T1p - T1r;
                T1x = T1u - T1w;
                T1P = T1x + T1s;
            }
            {
                /* Inputs 2, 3, 7 and 8. */
                E Te, T1l, Tx, T1e, Tk, T1j, Tr, T1g;
                {
                    E Ta, Td, Tb, T1k, T9, Tc;
                    Ta = cr[WS(rs, 2)];
                    Td = ci[WS(rs, 2)];
                    T9 = W[2];
                    Tb = T9 * Ta;
                    T1k = T9 * Td;
                    Tc = W[3];
                    Te = FMA(Tc, Td, Tb);
                    T1l = FNMS(Tc, Ta, T1k);
                }
                {
                    E Tt, Tw, Tu, T1d, Ts, Tv;
                    Tt = cr[WS(rs, 3)];
                    Tw = ci[WS(rs, 3)];
                    Ts = W[4];
                    Tu = Ts * Tt;
                    T1d = Ts * Tw;
                    Tv = W[5];
                    Tx = FMA(Tv, Tw, Tu);
                    T1e = FNMS(Tv, Tt, T1d);
                }
                {
                    E Tg, Tj, Th, T1i, Tf, Ti;
                    Tg = cr[WS(rs, 7)];
                    Tj = ci[WS(rs, 7)];
                    Tf = W[12];
                    Th = Tf * Tg;
                    T1i = Tf * Tj;
                    Ti = W[13];
                    Tk = FMA(Ti, Tj, Th);
                    T1j = FNMS(Ti, Tg, T1i);
                }
                {
                    E Tn, Tq, To, T1f, Tm, Tp;
                    Tn = cr[WS(rs, 8)];
                    Tq = ci[WS(rs, 8)];
                    Tm = W[14];
                    To = Tm * Tn;
                    T1f = Tm * Tq;
                    Tp = W[15];
                    Tr = FMA(Tp, Tq, To);
                    T1g = FNMS(Tp, Tn, T1f);
                }
                Tl = Te - Tk;
                Ty = Tr - Tx;
                Tz = Tl + Ty;
                T1I = T1l + T1j;
                T1J = T1g + T1e;
                T24 = T1I + T1J;
                T13 = Te + Tk;
                T14 = Tr + Tx;
                T15 = T13 + T14;
                T1h = T1e - T1g;
                T1m = T1j - T1l;
                T1O = T1m + T1h;
            }
            {
                /* Stores: ci[4], ci[2], cr[3], ci[0], cr[1]. */
                E T1b, T11, T1a, T1z, T1B, T1n, T1y, T1A, T1c;
                T1b = Tz - T10;
                T11 = Tz + T10;
                T1a = FNMS(KP250000000, T11, T8);
                T1n = T1h - T1m;
                T1y = T1s - T1x;
                T1z = FMA(KP618033988, T1y, T1n);
                T1B = FNMS(KP618033988, T1n, T1y);
                ci[WS(rs, 4)] = T8 + T11;
                T1A = FNMS(KP559016994, T1b, T1a);
                ci[WS(rs, 2)] = FNMS(KP951056516, T1B, T1A);
                cr[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
                T1c = FMA(KP559016994, T1b, T1a);
                ci[0] = FNMS(KP951056516, T1z, T1c);
                cr[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
            }
            {
                /* Stores: cr[0], cr[4], ci[3], cr[2], ci[1]. */
                E T1D, T19, T1C, T1L, T1N, T1H, T1K, T1M, T1E;
                T1D = T15 - T18;
                T19 = T15 + T18;
                T1C = FNMS(KP250000000, T19, T12);
                T1H = T1F - T1G;
                T1K = T1I - T1J;
                T1L = FNMS(KP618033988, T1K, T1H);
                T1N = FMA(KP618033988, T1H, T1K);
                cr[0] = T12 + T19;
                T1M = FMA(KP559016994, T1D, T1C);
                cr[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
                ci[WS(rs, 3)] = FMA(KP951056516, T1N, T1M);
                T1E = FNMS(KP559016994, T1D, T1C);
                cr[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
                ci[WS(rs, 1)] = FMA(KP951056516, T1L, T1E);
            }
            {
                /* Stores: cr[5], cr[9], ci[8], cr[7], ci[6]. */
                E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X;
                T1W = T1P - T1O;
                T1Q = T1O + T1P;
                T1V = FMA(KP250000000, T1Q, T1U);
                T1Y = TZ - TM;
                T1Z = Ty - Tl;
                T20 = FNMS(KP618033988, T1Z, T1Y);
                T22 = FMA(KP618033988, T1Y, T1Z);
                cr[WS(rs, 5)] = T1Q - T1U;
                T21 = FMA(KP559016994, T1W, T1V);
                cr[WS(rs, 9)] = FMS(KP951056516, T22, T21);
                ci[WS(rs, 8)] = FMA(KP951056516, T22, T21);
                T1X = FNMS(KP559016994, T1W, T1V);
                cr[WS(rs, 7)] = FMS(KP951056516, T20, T1X);
                ci[WS(rs, 6)] = FMA(KP951056516, T20, T1X);
            }
            {
                /* Stores: ci[9], cr[8], ci[7], cr[6], ci[5]. */
                E T28, T26, T27, T2c, T2e, T2a, T2b, T2d, T29;
                T28 = T24 - T25;
                T26 = T24 + T25;
                T27 = FNMS(KP250000000, T26, T23);
                T2a = T13 - T14;
                T2b = T16 - T17;
                T2c = FMA(KP618033988, T2b, T2a);
                T2e = FNMS(KP618033988, T2a, T2b);
                ci[WS(rs, 9)] = T26 + T23;
                T2d = FNMS(KP559016994, T28, T27);
                cr[WS(rs, 8)] = FMS(KP951056516, T2e, T2d);
                ci[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
                T29 = FMA(KP559016994, T28, T27);
                cr[WS(rs, 6)] = FMS(KP951056516, T2c, T29);
                ci[WS(rs, 5)] = FMA(KP951056516, T2c, T29);
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the hf_10 codelet: size 10, its name,
 * the twiddle table above and the GENUS object.  The first three counts
 * repeat the generator summary for this variant (48 additions,
 * 18 multiplications, 54 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count (0) is not visible here.
 */
static const hc2hc_desc desc = {
    10, "hf_10", twinstr, &GENUS,
    { 48, 18, 54, 0 }
};
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hf_10 kernel and its descriptor,
 * together with the planner p, to X(khc2hc_register).
 */
void X(codelet_hf_10)(planner *p)
{
    X(khc2hc_register)(p, hf_10, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 10 -dit -name hf_10 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 60 FP multiplications,
|
||||
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 45 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_10 (variant without -fma; generated with -n 10 -dit).
 *
 * In-place kernel over the arrays cr and ci.  For every m in [mb, me) it
 * reads the ten strided pairs (cr[WS(rs, k)], ci[WS(rs, k)]), k = 0..9,
 * combines pair k (k = 1..9) with the coefficient pair
 * (W[2k - 2], W[2k - 1]), and stores twenty results back into the same
 * slots.  Between iterations cr moves forward by ms, ci moves backward by
 * ms and W moves forward by 18.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are presumably a*b + c and
 * c - a*b -- confirm against the macro definitions.
 */
static void hf_10(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    DK(KP587785252, +0.587785252292473129168705954639072768597652438);
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    {
        INT m;

        /* Skip the coefficients belonging to iterations before mb. */
        W += (mb - 1) * 18;
        for (m = mb; m < me;
             ++m, cr += ms, ci -= ms, W += 18, MAKE_VOLATILE_STRIDE(20, rs)) {
            E T7, T1R, TT, T1C, TF, TQ, TR, T1o, T1p, T1P, TX, TY, TZ, T1d, T1g;
            E T1x, Ti, Tt, Tu, T1r, T1s, T1O, TU, TV, TW, T16, T19, T1y;
            {
                /* Inputs 0 and 5. */
                E T1, T1A, T6, T1B;
                T1 = cr[0];
                T1A = ci[0];
                {
                    E T3, T5, T2, T4;
                    T3 = cr[WS(rs, 5)];
                    T5 = ci[WS(rs, 5)];
                    T2 = W[8];
                    T4 = W[9];
                    T6 = FMA(T2, T3, T4 * T5);
                    T1B = FNMS(T4, T3, T2 * T5);
                }
                T7 = T1 - T6;
                T1R = T1B + T1A;
                TT = T1 + T6;
                T1C = T1A - T1B;
            }
            {
                /* Inputs 4, 1, 9 and 6. */
                E Tz, T1b, TP, T1e, TE, T1c, TK, T1f;
                {
                    E Tw, Ty, Tv, Tx;
                    Tw = cr[WS(rs, 4)];
                    Ty = ci[WS(rs, 4)];
                    Tv = W[6];
                    Tx = W[7];
                    Tz = FMA(Tv, Tw, Tx * Ty);
                    T1b = FNMS(Tx, Tw, Tv * Ty);
                }
                {
                    E TM, TO, TL, TN;
                    TM = cr[WS(rs, 1)];
                    TO = ci[WS(rs, 1)];
                    TL = W[0];
                    TN = W[1];
                    TP = FMA(TL, TM, TN * TO);
                    T1e = FNMS(TN, TM, TL * TO);
                }
                {
                    E TB, TD, TA, TC;
                    TB = cr[WS(rs, 9)];
                    TD = ci[WS(rs, 9)];
                    TA = W[16];
                    TC = W[17];
                    TE = FMA(TA, TB, TC * TD);
                    T1c = FNMS(TC, TB, TA * TD);
                }
                {
                    E TH, TJ, TG, TI;
                    TH = cr[WS(rs, 6)];
                    TJ = ci[WS(rs, 6)];
                    TG = W[10];
                    TI = W[11];
                    TK = FMA(TG, TH, TI * TJ);
                    T1f = FNMS(TI, TH, TG * TJ);
                }
                TF = Tz - TE;
                TQ = TK - TP;
                TR = TF + TQ;
                T1o = T1b + T1c;
                T1p = T1f + T1e;
                T1P = T1o + T1p;
                TX = Tz + TE;
                TY = TK + TP;
                TZ = TX + TY;
                T1d = T1b - T1c;
                T1g = T1e - T1f;
                T1x = T1g - T1d;
            }
            {
                /* Inputs 2, 3, 7 and 8. */
                E Tc, T14, Ts, T18, Th, T15, Tn, T17;
                {
                    E T9, Tb, T8, Ta;
                    T9 = cr[WS(rs, 2)];
                    Tb = ci[WS(rs, 2)];
                    T8 = W[2];
                    Ta = W[3];
                    Tc = FMA(T8, T9, Ta * Tb);
                    T14 = FNMS(Ta, T9, T8 * Tb);
                }
                {
                    E Tp, Tr, To, Tq;
                    Tp = cr[WS(rs, 3)];
                    Tr = ci[WS(rs, 3)];
                    To = W[4];
                    Tq = W[5];
                    Ts = FMA(To, Tp, Tq * Tr);
                    T18 = FNMS(Tq, Tp, To * Tr);
                }
                {
                    E Te, Tg, Td, Tf;
                    Te = cr[WS(rs, 7)];
                    Tg = ci[WS(rs, 7)];
                    Td = W[12];
                    Tf = W[13];
                    Th = FMA(Td, Te, Tf * Tg);
                    T15 = FNMS(Tf, Te, Td * Tg);
                }
                {
                    E Tk, Tm, Tj, Tl;
                    Tk = cr[WS(rs, 8)];
                    Tm = ci[WS(rs, 8)];
                    Tj = W[14];
                    Tl = W[15];
                    Tn = FMA(Tj, Tk, Tl * Tm);
                    T17 = FNMS(Tl, Tk, Tj * Tm);
                }
                Ti = Tc - Th;
                Tt = Tn - Ts;
                Tu = Ti + Tt;
                T1r = T14 + T15;
                T1s = T17 + T18;
                T1O = T1r + T1s;
                TU = Tc + Th;
                TV = Tn + Ts;
                TW = TU + TV;
                T16 = T14 - T15;
                T19 = T17 - T18;
                T1y = T16 + T19;
            }
            {
                /* Stores: ci[4], ci[2], cr[3], ci[0], cr[1]. */
                E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
                T11 = KP559016994 * (Tu - TR);
                TS = Tu + TR;
                T12 = FNMS(KP250000000, TS, T7);
                T1a = T16 - T19;
                T1h = T1d + T1g;
                T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
                T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
                ci[WS(rs, 4)] = T7 + TS;
                T1j = T12 - T11;
                ci[WS(rs, 2)] = T1j - T1k;
                cr[WS(rs, 3)] = T1j + T1k;
                T13 = T11 + T12;
                ci[0] = T13 - T1i;
                cr[WS(rs, 1)] = T13 + T1i;
            }
            {
                /* Stores: cr[0], cr[4], ci[3], cr[2], ci[1]. */
                E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
                T1m = KP559016994 * (TW - TZ);
                T10 = TW + TZ;
                T1l = FNMS(KP250000000, T10, TT);
                T1q = T1o - T1p;
                T1t = T1r - T1s;
                T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
                T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
                cr[0] = TT + T10;
                T1v = T1m + T1l;
                cr[WS(rs, 4)] = T1v - T1w;
                ci[WS(rs, 3)] = T1v + T1w;
                T1n = T1l - T1m;
                cr[WS(rs, 2)] = T1n - T1u;
                ci[WS(rs, 1)] = T1n + T1u;
            }
            {
                /* Stores: cr[5], cr[9], ci[8], cr[7], ci[6]. */
                E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
                T1H = KP559016994 * (T1y + T1x);
                T1z = T1x - T1y;
                T1G = FMA(KP250000000, T1z, T1C);
                T1D = Ti - Tt;
                T1E = TQ - TF;
                T1F = FMA(KP587785252, T1D, KP951056516 * T1E);
                T1J = FNMS(KP951056516, T1D, KP587785252 * T1E);
                cr[WS(rs, 5)] = T1z - T1C;
                T1K = T1H + T1G;
                cr[WS(rs, 9)] = T1J - T1K;
                ci[WS(rs, 8)] = T1J + T1K;
                T1I = T1G - T1H;
                cr[WS(rs, 7)] = T1F - T1I;
                ci[WS(rs, 6)] = T1F + T1I;
            }
            {
                /* Stores: ci[9], cr[8], ci[7], cr[6], ci[5]. */
                E T1Q, T1S, T1T, T1N, T1V, T1L, T1M, T1W, T1U;
                T1Q = KP559016994 * (T1O - T1P);
                T1S = T1O + T1P;
                T1T = FNMS(KP250000000, T1S, T1R);
                T1L = TU - TV;
                T1M = TX - TY;
                T1N = FMA(KP951056516, T1L, KP587785252 * T1M);
                T1V = FNMS(KP587785252, T1L, KP951056516 * T1M);
                ci[WS(rs, 9)] = T1S + T1R;
                T1W = T1T - T1Q;
                cr[WS(rs, 8)] = T1V - T1W;
                ci[WS(rs, 7)] = T1V + T1W;
                T1U = T1Q + T1T;
                cr[WS(rs, 6)] = T1N - T1U;
                ci[WS(rs, 5)] = T1N + T1U;
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner for the hf_10 codelet: size 10, its name,
 * the twiddle table above and the GENUS object.  The first three counts
 * repeat the generator summary for this variant (72 additions,
 * 30 multiplications, 30 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count (0) is not visible here.
 */
static const hc2hc_desc desc = {
    10, "hf_10", twinstr, &GENUS,
    { 72, 30, 30, 0 }
};
|
||||
|
||||
/*
 * Entry point for this codelet: passes the hf_10 kernel and its descriptor,
 * together with the planner p, to X(khc2hc_register).
 */
void X(codelet_hf_10)(planner *p)
{
    X(khc2hc_register)(p, hf_10, &desc);
}
|
||||
#endif
|
||||
581
fftw-3.3.10/rdft/scalar/r2cf/hf_12.c
Normal file
581
fftw-3.3.10/rdft/scalar/r2cf/hf_12.c
Normal file
@@ -0,0 +1,581 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:13 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hf_12 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 68 FP multiplications,
|
||||
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_12, FMA variant (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined; generated with -fma -dit -n 12).
 * Size-12 "twiddle" codelet: one radix-12 DIT step of a real-input FFT,
 * computed in place on cr/ci.
 *
 *   cr, ci  - data arrays; every pass loads the 12 elements cr[WS(rs, k)]
 *             and ci[WS(rs, k)], k = 0..11, and stores results back to the
 *             same 24 slots
 *   W       - twiddle table; a pass consumes the 22 values W[0]..W[21]
 *             (the pair W[2k-2], W[2k-1] multiplies input k, k = 1..11)
 *   rs      - stride handed to WS() to locate element k
 *   mb, me  - half-open range [mb, me) of passes to run
 *   ms      - per-pass step: cr moves forward by ms, ci moves backward by ms
 */
static void hf_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* DK declares the two numeric constants used below: sqrt(3)/2 and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 22 values; each pass then advances W by 22,
 * cr by +ms and ci by -ms.  MAKE_VOLATILE_STRIDE(24, rs) is a macro defined
 * outside this file; its effect cannot be seen from here. */
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2s, T1s, T2f, T1d, T21, T1H;
|
||||
E T1Z, Te, T2p, T1l, T2h, TT, T1V, T1A, T1T;
|
||||
T1 = cr[0];
|
||||
T2i = ci[0];
|
||||
{
|
||||
E Th, Tk, Ti, T2d, Tg, Tj;
|
||||
Th = cr[WS(rs, 6)];
|
||||
Tk = ci[WS(rs, 6)];
|
||||
Tg = W[10];
|
||||
Ti = Tg * Th;
|
||||
T2d = Tg * Tk;
|
||||
Tj = W[11];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T2e = FNMS(Tj, Th, T2d);
|
||||
}
|
||||
{
|
||||
E TW, TZ, TX, T1X, TV, TY;
|
||||
TW = cr[WS(rs, 9)];
|
||||
TZ = ci[WS(rs, 9)];
|
||||
TV = W[16];
|
||||
TX = TV * TW;
|
||||
T1X = TV * TZ;
|
||||
TY = W[17];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T1Y = FNMS(TY, TW, T1X);
|
||||
}
|
||||
{
|
||||
E TC, TF, TD, T1R, TB, TE;
|
||||
TC = cr[WS(rs, 3)];
|
||||
TF = ci[WS(rs, 3)];
|
||||
TB = W[4];
|
||||
TD = TB * TC;
|
||||
T1R = TB * TF;
|
||||
TE = W[5];
|
||||
TG = FMA(TE, TF, TD);
|
||||
T1S = FNMS(TE, TC, T1R);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
|
||||
Tn = cr[WS(rs, 10)];
|
||||
Tq = ci[WS(rs, 10)];
|
||||
Tm = W[18];
|
||||
To = Tm * Tn;
|
||||
T1o = Tm * Tq;
|
||||
Tt = cr[WS(rs, 2)];
|
||||
Tw = ci[WS(rs, 2)];
|
||||
Ts = W[2];
|
||||
Tu = Ts * Tt;
|
||||
T1q = Ts * Tw;
|
||||
{
|
||||
E Tr, T1p, Tx, T1r, Tp, Tv;
|
||||
Tp = W[19];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1p = FNMS(Tp, Tn, T1o);
|
||||
Tv = W[3];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1r = FNMS(Tv, Tt, T1q);
|
||||
Ty = Tr + Tx;
|
||||
T2s = Tx - Tr;
|
||||
T1s = T1p - T1r;
|
||||
T2f = T1p + T1r;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
|
||||
T12 = cr[WS(rs, 1)];
|
||||
T15 = ci[WS(rs, 1)];
|
||||
T11 = W[0];
|
||||
T13 = T11 * T12;
|
||||
T1D = T11 * T15;
|
||||
T18 = cr[WS(rs, 5)];
|
||||
T1b = ci[WS(rs, 5)];
|
||||
T17 = W[8];
|
||||
T19 = T17 * T18;
|
||||
T1F = T17 * T1b;
|
||||
{
|
||||
E T16, T1E, T1c, T1G, T14, T1a;
|
||||
T14 = W[1];
|
||||
T16 = FMA(T14, T15, T13);
|
||||
T1E = FNMS(T14, T12, T1D);
|
||||
T1a = W[9];
|
||||
T1c = FMA(T1a, T1b, T19);
|
||||
T1G = FNMS(T1a, T18, T1F);
|
||||
T1d = T16 + T1c;
|
||||
T21 = T1c - T16;
|
||||
T1H = T1E - T1G;
|
||||
T1Z = T1E + T1G;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
|
||||
T3 = cr[WS(rs, 4)];
|
||||
T6 = ci[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = T2 * T3;
|
||||
T1h = T2 * T6;
|
||||
T9 = cr[WS(rs, 8)];
|
||||
Tc = ci[WS(rs, 8)];
|
||||
T8 = W[14];
|
||||
Ta = T8 * T9;
|
||||
T1j = T8 * Tc;
|
||||
{
|
||||
E T7, T1i, Td, T1k, T5, Tb;
|
||||
T5 = W[7];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1i = FNMS(T5, T3, T1h);
|
||||
Tb = W[15];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
T1k = FNMS(Tb, T9, T1j);
|
||||
Te = T7 + Td;
|
||||
T2p = Td - T7;
|
||||
T1l = T1i - T1k;
|
||||
T2h = T1i + T1k;
|
||||
}
|
||||
}
|
||||
{
|
||||
E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
|
||||
TI = cr[WS(rs, 7)];
|
||||
TL = ci[WS(rs, 7)];
|
||||
TH = W[12];
|
||||
TJ = TH * TI;
|
||||
T1w = TH * TL;
|
||||
TO = cr[WS(rs, 11)];
|
||||
TR = ci[WS(rs, 11)];
|
||||
TN = W[20];
|
||||
TP = TN * TO;
|
||||
T1y = TN * TR;
|
||||
{
|
||||
E TM, T1x, TS, T1z, TK, TQ;
|
||||
TK = W[13];
|
||||
TM = FMA(TK, TL, TJ);
|
||||
T1x = FNMS(TK, TI, T1w);
|
||||
TQ = W[21];
|
||||
TS = FMA(TQ, TR, TP);
|
||||
T1z = FNMS(TQ, TO, T1y);
|
||||
TT = TM + TS;
|
||||
T1V = TS - TM;
|
||||
T1A = T1x - T1z;
|
||||
T1T = T1x + T1z;
|
||||
}
|
||||
}
|
||||
{
|
||||
E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
|
||||
{
|
||||
E Tf, Tz, T2g, T2j;
|
||||
Tf = T1 + Te;
|
||||
Tz = Tl + Ty;
|
||||
TA = Tf + Tz;
|
||||
T28 = Tf - Tz;
|
||||
T2g = T2e + T2f;
|
||||
T2j = T2h + T2i;
|
||||
T2k = T2g + T2j;
|
||||
T2m = T2j - T2g;
|
||||
}
|
||||
{
|
||||
E TU, T1e, T29, T2a;
|
||||
TU = TG + TT;
|
||||
T1e = T10 + T1d;
|
||||
T1f = TU + T1e;
|
||||
T2l = TU - T1e;
|
||||
T29 = T1S + T1T;
|
||||
T2a = T1Y + T1Z;
|
||||
T2b = T29 - T2a;
|
||||
T2c = T29 + T2a;
|
||||
}
|
||||
ci[WS(rs, 5)] = TA - T1f;
|
||||
cr[WS(rs, 9)] = T2l - T2m;
|
||||
ci[WS(rs, 8)] = T2l + T2m;
|
||||
cr[0] = TA + T1f;
|
||||
cr[WS(rs, 3)] = T28 - T2b;
|
||||
cr[WS(rs, 6)] = T2c - T2k;
|
||||
ci[WS(rs, 11)] = T2c + T2k;
|
||||
ci[WS(rs, 2)] = T28 + T2b;
|
||||
}
|
||||
{
|
||||
E T1m, T1K, T2q, T2y, T2t, T2z, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
|
||||
E T1O;
|
||||
{
|
||||
E T1g, T2o, T2r, T1n;
|
||||
T1g = FNMS(KP500000000, Te, T1);
|
||||
T1m = FNMS(KP866025403, T1l, T1g);
|
||||
T1K = FMA(KP866025403, T1l, T1g);
|
||||
T2o = FNMS(KP500000000, T2h, T2i);
|
||||
T2q = FNMS(KP866025403, T2p, T2o);
|
||||
T2y = FMA(KP866025403, T2p, T2o);
|
||||
T2r = FNMS(KP500000000, T2f, T2e);
|
||||
T2t = FNMS(KP866025403, T2s, T2r);
|
||||
T2z = FMA(KP866025403, T2s, T2r);
|
||||
T1n = FNMS(KP500000000, Ty, Tl);
|
||||
T1t = FNMS(KP866025403, T1s, T1n);
|
||||
T1L = FMA(KP866025403, T1s, T1n);
|
||||
}
|
||||
{
|
||||
E T1v, T1U, T20, T1C;
|
||||
T1v = FNMS(KP500000000, TT, TG);
|
||||
T1B = FNMS(KP866025403, T1A, T1v);
|
||||
T1N = FMA(KP866025403, T1A, T1v);
|
||||
T1U = FNMS(KP500000000, T1T, T1S);
|
||||
T1W = FNMS(KP866025403, T1V, T1U);
|
||||
T25 = FMA(KP866025403, T1V, T1U);
|
||||
T20 = FNMS(KP500000000, T1Z, T1Y);
|
||||
T22 = FNMS(KP866025403, T21, T20);
|
||||
T26 = FMA(KP866025403, T21, T20);
|
||||
T1C = FNMS(KP500000000, T1d, T10);
|
||||
T1I = FNMS(KP866025403, T1H, T1C);
|
||||
T1O = FMA(KP866025403, T1H, T1C);
|
||||
}
|
||||
{
|
||||
E T1u, T1J, T2v, T2w;
|
||||
T1u = T1m + T1t;
|
||||
T1J = T1B + T1I;
|
||||
cr[WS(rs, 2)] = T1u - T1J;
|
||||
ci[WS(rs, 3)] = T1u + T1J;
|
||||
T2v = T1W + T22;
|
||||
T2w = T2t + T2q;
|
||||
cr[WS(rs, 8)] = -(T2v + T2w);
|
||||
ci[WS(rs, 9)] = T2w - T2v;
|
||||
}
|
||||
{
|
||||
E T2B, T2C, T2x, T2A;
|
||||
T2B = T25 + T26;
|
||||
T2C = T2z + T2y;
|
||||
cr[WS(rs, 10)] = T2B - T2C;
|
||||
ci[WS(rs, 7)] = T2B + T2C;
|
||||
T2x = T1O - T1N;
|
||||
T2A = T2y - T2z;
|
||||
cr[WS(rs, 7)] = T2x - T2A;
|
||||
ci[WS(rs, 10)] = T2x + T2A;
|
||||
}
|
||||
{
|
||||
E T1M, T1P, T24, T27;
|
||||
T1M = T1K + T1L;
|
||||
T1P = T1N + T1O;
|
||||
ci[WS(rs, 1)] = T1M - T1P;
|
||||
cr[WS(rs, 4)] = T1M + T1P;
|
||||
T24 = T1K - T1L;
|
||||
T27 = T25 - T26;
|
||||
ci[WS(rs, 4)] = T24 - T27;
|
||||
cr[WS(rs, 1)] = T24 + T27;
|
||||
}
|
||||
{
|
||||
E T1Q, T23, T2n, T2u;
|
||||
T1Q = T1m - T1t;
|
||||
T23 = T1W - T22;
|
||||
ci[0] = T1Q - T23;
|
||||
cr[WS(rs, 5)] = T1Q + T23;
|
||||
T2n = T1I - T1B;
|
||||
T2u = T2q - T2t;
|
||||
cr[WS(rs, 11)] = T2n - T2u;
|
||||
ci[WS(rs, 6)] = T2n + T2u;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by the hf_12 descriptor.  It holds a
 * TW_FULL entry carrying the codelet size (12) followed by a TW_NEXT entry.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set (the
 * kernel reads 22 values per pass) and TW_NEXT ends the list -- confirm
 * against the tw_instr definitions, which are not part of this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA variant of hf_12: size 12, name "hf_12", the twinstr
 * table and &GENUS.  The first three numbers of { 72, 22, 46, 0 } equal the
 * additions / multiplications / fused multiply-add counts quoted in this
 * variant's header comment; the meaning of the fourth is not shown here. */
static const hc2hc_desc desc = { 12, "hf_12", twinstr, &GENUS, { 72, 22, 46, 0 } };
|
||||
|
||||
/* Registration hook (FMA build): passes the hf_12 kernel and its descriptor
 * `desc` to planner `p` via X(khc2hc_register). */
void X(codelet_hf_12) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_12, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 12 -dit -name hf_12 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 60 FP multiplications,
|
||||
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_12, non-FMA variant (the #else branch, generated without -fma).
 * Size-12 "twiddle" codelet: one radix-12 DIT step of a real-input FFT,
 * computed in place on cr/ci.  It fills the same 24 output slots as the FMA
 * variant but is arranged with separate multiplies and adds.
 *
 *   cr, ci  - data arrays; every pass loads the 12 elements cr[WS(rs, k)]
 *             and ci[WS(rs, k)], k = 0..11, and stores results back to the
 *             same 24 slots
 *   W       - twiddle table; a pass consumes the 22 values W[0]..W[21]
 *             (the pair W[2k-2], W[2k-1] multiplies input k, k = 1..11)
 *   rs      - stride handed to WS() to locate element k
 *   mb, me  - half-open range [mb, me) of passes to run
 *   ms      - per-pass step: cr moves forward by ms, ci moves backward by ms
 */
static void hf_12(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* DK declares the two numeric constants used below: 1/2 and sqrt(3)/2. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 22 values; each pass then advances W by 22,
 * cr by +ms and ci by -ms.  MAKE_VOLATILE_STRIDE(24, rs) is a macro defined
 * outside this file; its effect cannot be seen from here. */
for (m = mb, W = W + ((mb - 1) * 22); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T1, T1W, T18, T23, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
|
||||
E T1G, Ti, T1S, T1d, T26, Tt, T1a, T1T, T25, TA, T1y, T1j, T1B, TL, T1g;
|
||||
E T1z, T1A;
|
||||
{
|
||||
E T6, T16, Tb, T17;
|
||||
T1 = cr[0];
|
||||
T1W = ci[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = cr[WS(rs, 4)];
|
||||
T5 = ci[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = W[7];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T16 = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = cr[WS(rs, 8)];
|
||||
Ta = ci[WS(rs, 8)];
|
||||
T7 = W[14];
|
||||
T9 = W[15];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
T17 = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
T18 = KP866025403 * (T16 - T17);
|
||||
T23 = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
T15 = FNMS(KP500000000, Tc, T1);
|
||||
T1V = T16 + T17;
|
||||
T22 = FNMS(KP500000000, T1V, T1W);
|
||||
}
|
||||
{
|
||||
E T11, T1n, TW, T1m;
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = cr[WS(rs, 9)];
|
||||
TQ = ci[WS(rs, 9)];
|
||||
TN = W[16];
|
||||
TP = W[17];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T1E = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E TY, T10, TX, TZ;
|
||||
TY = cr[WS(rs, 5)];
|
||||
T10 = ci[WS(rs, 5)];
|
||||
TX = W[8];
|
||||
TZ = W[9];
|
||||
T11 = FMA(TX, TY, TZ * T10);
|
||||
T1n = FNMS(TZ, TY, TX * T10);
|
||||
}
|
||||
{
|
||||
E TT, TV, TS, TU;
|
||||
TT = cr[WS(rs, 1)];
|
||||
TV = ci[WS(rs, 1)];
|
||||
TS = W[0];
|
||||
TU = W[1];
|
||||
TW = FMA(TS, TT, TU * TV);
|
||||
T1m = FNMS(TU, TT, TS * TV);
|
||||
}
|
||||
T1o = KP866025403 * (T1m - T1n);
|
||||
T1D = KP866025403 * (T11 - TW);
|
||||
T12 = TW + T11;
|
||||
T1l = FNMS(KP500000000, T12, TR);
|
||||
T1F = T1m + T1n;
|
||||
T1G = FNMS(KP500000000, T1F, T1E);
|
||||
}
|
||||
{
|
||||
E Ts, T1c, Tn, T1b;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = cr[WS(rs, 6)];
|
||||
Th = ci[WS(rs, 6)];
|
||||
Te = W[10];
|
||||
Tg = W[11];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
T1S = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = cr[WS(rs, 2)];
|
||||
Tr = ci[WS(rs, 2)];
|
||||
To = W[2];
|
||||
Tq = W[3];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T1c = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = cr[WS(rs, 10)];
|
||||
Tm = ci[WS(rs, 10)];
|
||||
Tj = W[18];
|
||||
Tl = W[19];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T1b = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
T1d = KP866025403 * (T1b - T1c);
|
||||
T26 = KP866025403 * (Ts - Tn);
|
||||
Tt = Tn + Ts;
|
||||
T1a = FNMS(KP500000000, Tt, Ti);
|
||||
T1T = T1b + T1c;
|
||||
T25 = FNMS(KP500000000, T1T, T1S);
|
||||
}
|
||||
{
|
||||
E TK, T1i, TF, T1h;
|
||||
{
|
||||
E Tx, Tz, Tw, Ty;
|
||||
Tx = cr[WS(rs, 3)];
|
||||
Tz = ci[WS(rs, 3)];
|
||||
Tw = W[4];
|
||||
Ty = W[5];
|
||||
TA = FMA(Tw, Tx, Ty * Tz);
|
||||
T1y = FNMS(Ty, Tx, Tw * Tz);
|
||||
}
|
||||
{
|
||||
E TH, TJ, TG, TI;
|
||||
TH = cr[WS(rs, 11)];
|
||||
TJ = ci[WS(rs, 11)];
|
||||
TG = W[20];
|
||||
TI = W[21];
|
||||
TK = FMA(TG, TH, TI * TJ);
|
||||
T1i = FNMS(TI, TH, TG * TJ);
|
||||
}
|
||||
{
|
||||
E TC, TE, TB, TD;
|
||||
TC = cr[WS(rs, 7)];
|
||||
TE = ci[WS(rs, 7)];
|
||||
TB = W[12];
|
||||
TD = W[13];
|
||||
TF = FMA(TB, TC, TD * TE);
|
||||
T1h = FNMS(TD, TC, TB * TE);
|
||||
}
|
||||
T1j = KP866025403 * (T1h - T1i);
|
||||
T1B = KP866025403 * (TK - TF);
|
||||
TL = TF + TK;
|
||||
T1g = FNMS(KP500000000, TL, TA);
|
||||
T1z = T1h + T1i;
|
||||
T1A = FNMS(KP500000000, T1z, T1y);
|
||||
}
|
||||
{
|
||||
E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
|
||||
{
|
||||
E Td, Tu, T1U, T1X;
|
||||
Td = T1 + Tc;
|
||||
Tu = Ti + Tt;
|
||||
Tv = Td + Tu;
|
||||
T1N = Td - Tu;
|
||||
T1U = T1S + T1T;
|
||||
T1X = T1V + T1W;
|
||||
T1Y = T1U + T1X;
|
||||
T20 = T1X - T1U;
|
||||
}
|
||||
{
|
||||
E TM, T13, T1O, T1P;
|
||||
TM = TA + TL;
|
||||
T13 = TR + T12;
|
||||
T14 = TM + T13;
|
||||
T1Z = TM - T13;
|
||||
T1O = T1y + T1z;
|
||||
T1P = T1E + T1F;
|
||||
T1Q = T1O - T1P;
|
||||
T1R = T1O + T1P;
|
||||
}
|
||||
ci[WS(rs, 5)] = Tv - T14;
|
||||
cr[WS(rs, 9)] = T1Z - T20;
|
||||
ci[WS(rs, 8)] = T1Z + T20;
|
||||
cr[0] = Tv + T14;
|
||||
cr[WS(rs, 3)] = T1N - T1Q;
|
||||
cr[WS(rs, 6)] = T1R - T1Y;
|
||||
ci[WS(rs, 11)] = T1R + T1Y;
|
||||
ci[WS(rs, 2)] = T1N + T1Q;
|
||||
}
|
||||
{
|
||||
E T1f, T1x, T28, T2a, T1q, T21, T1I, T29;
|
||||
{
|
||||
E T19, T1e, T24, T27;
|
||||
T19 = T15 - T18;
|
||||
T1e = T1a - T1d;
|
||||
T1f = T19 + T1e;
|
||||
T1x = T19 - T1e;
|
||||
T24 = T22 - T23;
|
||||
T27 = T25 - T26;
|
||||
T28 = T24 - T27;
|
||||
T2a = T27 + T24;
|
||||
}
|
||||
{
|
||||
E T1k, T1p, T1C, T1H;
|
||||
T1k = T1g - T1j;
|
||||
T1p = T1l - T1o;
|
||||
T1q = T1k + T1p;
|
||||
T21 = T1p - T1k;
|
||||
T1C = T1A - T1B;
|
||||
T1H = T1D - T1G;
|
||||
T1I = T1C + T1H;
|
||||
T29 = T1H - T1C;
|
||||
}
|
||||
cr[WS(rs, 2)] = T1f - T1q;
|
||||
cr[WS(rs, 8)] = T29 - T2a;
|
||||
ci[WS(rs, 9)] = T29 + T2a;
|
||||
ci[WS(rs, 3)] = T1f + T1q;
|
||||
ci[0] = T1x - T1I;
|
||||
cr[WS(rs, 11)] = T21 - T28;
|
||||
ci[WS(rs, 6)] = T21 + T28;
|
||||
cr[WS(rs, 5)] = T1x + T1I;
|
||||
}
|
||||
{
|
||||
E T1t, T1J, T2e, T2g, T1w, T2b, T1M, T2f;
|
||||
{
|
||||
E T1r, T1s, T2c, T2d;
|
||||
T1r = T15 + T18;
|
||||
T1s = T1a + T1d;
|
||||
T1t = T1r + T1s;
|
||||
T1J = T1r - T1s;
|
||||
T2c = T23 + T22;
|
||||
T2d = T26 + T25;
|
||||
T2e = T2c - T2d;
|
||||
T2g = T2d + T2c;
|
||||
}
|
||||
{
|
||||
E T1u, T1v, T1K, T1L;
|
||||
T1u = T1g + T1j;
|
||||
T1v = T1l + T1o;
|
||||
T1w = T1u + T1v;
|
||||
T2b = T1v - T1u;
|
||||
T1K = T1B + T1A;
|
||||
T1L = T1D + T1G;
|
||||
T1M = T1K - T1L;
|
||||
T2f = T1K + T1L;
|
||||
}
|
||||
ci[WS(rs, 1)] = T1t - T1w;
|
||||
cr[WS(rs, 1)] = T1J + T1M;
|
||||
cr[WS(rs, 4)] = T1t + T1w;
|
||||
ci[WS(rs, 4)] = T1J - T1M;
|
||||
cr[WS(rs, 7)] = T2b - T2e;
|
||||
ci[WS(rs, 7)] = T2f + T2g;
|
||||
ci[WS(rs, 10)] = T2b + T2e;
|
||||
cr[WS(rs, 10)] = T2f - T2g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by the hf_12 descriptor (non-FMA
 * build).  It holds a TW_FULL entry carrying the codelet size (12) followed
 * by a TW_NEXT entry.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set (the
 * kernel reads 22 values per pass) and TW_NEXT ends the list -- confirm
 * against the tw_instr definitions, which are not part of this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA variant of hf_12: size 12, name "hf_12", the
 * twinstr table and &GENUS.  The first three numbers of { 88, 30, 30, 0 }
 * equal the additions / multiplications / fused multiply-add counts quoted in
 * this variant's header comment; the meaning of the fourth is not shown here. */
static const hc2hc_desc desc = { 12, "hf_12", twinstr, &GENUS, { 88, 30, 30, 0 } };
|
||||
|
||||
/* Registration hook (non-FMA build): passes the hf_12 kernel and its
 * descriptor `desc` to planner `p` via X(khc2hc_register). */
void X(codelet_hf_12) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_12, &desc);
|
||||
}
|
||||
#endif
|
||||
816
fftw-3.3.10/rdft/scalar/r2cf/hf_15.c
Normal file
816
fftw-3.3.10/rdft/scalar/r2cf/hf_15.c
Normal file
@@ -0,0 +1,816 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:13 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 184 FP additions, 140 FP multiplications,
|
||||
* (or, 72 additions, 28 multiplications, 112 fused multiply/add),
|
||||
* 51 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_15, FMA variant (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined; generated with -fma -dit -n 15).
 * Size-15 "twiddle" codelet: one radix-15 DIT step of a real-input FFT,
 * computed in place on cr/ci.
 *
 *   cr, ci  - data arrays; every pass loads the 15 elements cr[WS(rs, k)]
 *             and ci[WS(rs, k)], k = 0..14, and stores results back to the
 *             same 30 slots
 *   W       - twiddle table; a pass consumes the 28 values W[0]..W[27]
 *             (the pair W[2k-2], W[2k-1] multiplies input k, k = 1..14)
 *   rs      - stride handed to WS() to locate element k
 *   mb, me  - half-open range [mb, me) of passes to run
 *   ms      - per-pass step: cr moves forward by ms, ci moves backward by ms
 */
static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants declared through DK.  By value they are sin(2*pi/5),
 * sqrt(5)/4, 1/4, (sqrt(5)-1)/2, sqrt(3)/2 and 1/2 respectively. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by (mb - 1) * 28 values; each pass then advances W by 28,
 * cr by +ms and ci by -ms.  MAKE_VOLATILE_STRIDE(30, rs) is a macro defined
 * outside this file; its effect cannot be seen from here. */
for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
|
||||
E T1, T3i, T1G, T3l, Te, T1B, T3j, T3k, T1y, T2i, T2a, T2M, T37, T2Y, Tz;
|
||||
E T2e, T1O, T2t, T39, T2U, TT, T2f, T1V, T2z, T3a, T2V, T1e, T2h, T23, T2G;
|
||||
E T36, T2X;
|
||||
{
|
||||
E T7, T1D, Td, T1F;
|
||||
T1 = cr[0];
|
||||
T3i = ci[0];
|
||||
{
|
||||
E T3, T6, T4, T1C, T2, T5;
|
||||
T3 = cr[WS(rs, 5)];
|
||||
T6 = ci[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = T2 * T3;
|
||||
T1C = T2 * T6;
|
||||
T5 = W[9];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1D = FNMS(T5, T3, T1C);
|
||||
}
|
||||
{
|
||||
E T9, Tc, Ta, T1E, T8, Tb;
|
||||
T9 = cr[WS(rs, 10)];
|
||||
Tc = ci[WS(rs, 10)];
|
||||
T8 = W[18];
|
||||
Ta = T8 * T9;
|
||||
T1E = T8 * Tc;
|
||||
Tb = W[19];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
T1F = FNMS(Tb, T9, T1E);
|
||||
}
|
||||
T1G = T1D - T1F;
|
||||
T3l = Td - T7;
|
||||
Te = T7 + Td;
|
||||
T1B = FNMS(KP500000000, Te, T1);
|
||||
T3j = T1D + T1F;
|
||||
T3k = FNMS(KP500000000, T3j, T3i);
|
||||
}
|
||||
{
|
||||
E T1k, T2I, T1w, T28, T1q, T26;
|
||||
{
|
||||
E T1g, T1j, T1h, T2H, T1f, T1i;
|
||||
T1g = cr[WS(rs, 9)];
|
||||
T1j = ci[WS(rs, 9)];
|
||||
T1f = W[16];
|
||||
T1h = T1f * T1g;
|
||||
T2H = T1f * T1j;
|
||||
T1i = W[17];
|
||||
T1k = FMA(T1i, T1j, T1h);
|
||||
T2I = FNMS(T1i, T1g, T2H);
|
||||
}
|
||||
{
|
||||
E T1s, T1v, T1t, T27, T1r, T1u;
|
||||
T1s = cr[WS(rs, 4)];
|
||||
T1v = ci[WS(rs, 4)];
|
||||
T1r = W[6];
|
||||
T1t = T1r * T1s;
|
||||
T27 = T1r * T1v;
|
||||
T1u = W[7];
|
||||
T1w = FMA(T1u, T1v, T1t);
|
||||
T28 = FNMS(T1u, T1s, T27);
|
||||
}
|
||||
{
|
||||
E T1m, T1p, T1n, T25, T1l, T1o;
|
||||
T1m = cr[WS(rs, 14)];
|
||||
T1p = ci[WS(rs, 14)];
|
||||
T1l = W[26];
|
||||
T1n = T1l * T1m;
|
||||
T25 = T1l * T1p;
|
||||
T1o = W[27];
|
||||
T1q = FMA(T1o, T1p, T1n);
|
||||
T26 = FNMS(T1o, T1m, T25);
|
||||
}
|
||||
{
|
||||
E T29, T1x, T24, T2L, T2J, T2K;
|
||||
T29 = T26 - T28;
|
||||
T1x = T1q + T1w;
|
||||
T24 = FNMS(KP500000000, T1x, T1k);
|
||||
T1y = T1k + T1x;
|
||||
T2i = FMA(KP866025403, T29, T24);
|
||||
T2a = FNMS(KP866025403, T29, T24);
|
||||
T2L = T1q - T1w;
|
||||
T2J = T26 + T28;
|
||||
T2K = FNMS(KP500000000, T2J, T2I);
|
||||
T2M = FNMS(KP866025403, T2L, T2K);
|
||||
T37 = T2I + T2J;
|
||||
T2Y = FMA(KP866025403, T2L, T2K);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tl, T2p, Tx, T1M, Tr, T1K;
|
||||
{
|
||||
E Th, Tk, Ti, T2o, Tg, Tj;
|
||||
Th = cr[WS(rs, 3)];
|
||||
Tk = ci[WS(rs, 3)];
|
||||
Tg = W[4];
|
||||
Ti = Tg * Th;
|
||||
T2o = Tg * Tk;
|
||||
Tj = W[5];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T2p = FNMS(Tj, Th, T2o);
|
||||
}
|
||||
{
|
||||
E Tt, Tw, Tu, T1L, Ts, Tv;
|
||||
Tt = cr[WS(rs, 13)];
|
||||
Tw = ci[WS(rs, 13)];
|
||||
Ts = W[24];
|
||||
Tu = Ts * Tt;
|
||||
T1L = Ts * Tw;
|
||||
Tv = W[25];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1M = FNMS(Tv, Tt, T1L);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1J, Tm, Tp;
|
||||
Tn = cr[WS(rs, 8)];
|
||||
Tq = ci[WS(rs, 8)];
|
||||
Tm = W[14];
|
||||
To = Tm * Tn;
|
||||
T1J = Tm * Tq;
|
||||
Tp = W[15];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1K = FNMS(Tp, Tn, T1J);
|
||||
}
|
||||
{
|
||||
E T1N, Ty, T1I, T2s, T2q, T2r;
|
||||
T1N = T1K - T1M;
|
||||
Ty = Tr + Tx;
|
||||
T1I = FNMS(KP500000000, Ty, Tl);
|
||||
Tz = Tl + Ty;
|
||||
T2e = FMA(KP866025403, T1N, T1I);
|
||||
T1O = FNMS(KP866025403, T1N, T1I);
|
||||
T2s = Tr - Tx;
|
||||
T2q = T1K + T1M;
|
||||
T2r = FNMS(KP500000000, T2q, T2p);
|
||||
T2t = FNMS(KP866025403, T2s, T2r);
|
||||
T39 = T2p + T2q;
|
||||
T2U = FMA(KP866025403, T2s, T2r);
|
||||
}
|
||||
}
|
||||
{
|
||||
E TF, T2v, TR, T1T, TL, T1R;
|
||||
{
|
||||
E TB, TE, TC, T2u, TA, TD;
|
||||
TB = cr[WS(rs, 12)];
|
||||
TE = ci[WS(rs, 12)];
|
||||
TA = W[22];
|
||||
TC = TA * TB;
|
||||
T2u = TA * TE;
|
||||
TD = W[23];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T2v = FNMS(TD, TB, T2u);
|
||||
}
|
||||
{
|
||||
E TN, TQ, TO, T1S, TM, TP;
|
||||
TN = cr[WS(rs, 7)];
|
||||
TQ = ci[WS(rs, 7)];
|
||||
TM = W[12];
|
||||
TO = TM * TN;
|
||||
T1S = TM * TQ;
|
||||
TP = W[13];
|
||||
TR = FMA(TP, TQ, TO);
|
||||
T1T = FNMS(TP, TN, T1S);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T1Q, TG, TJ;
|
||||
TH = cr[WS(rs, 2)];
|
||||
TK = ci[WS(rs, 2)];
|
||||
TG = W[2];
|
||||
TI = TG * TH;
|
||||
T1Q = TG * TK;
|
||||
TJ = W[3];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T1R = FNMS(TJ, TH, T1Q);
|
||||
}
|
||||
{
|
||||
E T1U, TS, T1P, T2y, T2w, T2x;
|
||||
T1U = T1R - T1T;
|
||||
TS = TL + TR;
|
||||
T1P = FNMS(KP500000000, TS, TF);
|
||||
TT = TF + TS;
|
||||
T2f = FMA(KP866025403, T1U, T1P);
|
||||
T1V = FNMS(KP866025403, T1U, T1P);
|
||||
T2y = TL - TR;
|
||||
T2w = T1R + T1T;
|
||||
T2x = FNMS(KP500000000, T2w, T2v);
|
||||
T2z = FNMS(KP866025403, T2y, T2x);
|
||||
T3a = T2v + T2w;
|
||||
T2V = FMA(KP866025403, T2y, T2x);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T10, T2C, T1c, T21, T16, T1Z;
|
||||
{
|
||||
E TW, TZ, TX, T2B, TV, TY;
|
||||
TW = cr[WS(rs, 6)];
|
||||
TZ = ci[WS(rs, 6)];
|
||||
TV = W[10];
|
||||
TX = TV * TW;
|
||||
T2B = TV * TZ;
|
||||
TY = W[11];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T2C = FNMS(TY, TW, T2B);
|
||||
}
|
||||
{
|
||||
E T18, T1b, T19, T20, T17, T1a;
|
||||
T18 = cr[WS(rs, 1)];
|
||||
T1b = ci[WS(rs, 1)];
|
||||
T17 = W[0];
|
||||
T19 = T17 * T18;
|
||||
T20 = T17 * T1b;
|
||||
T1a = W[1];
|
||||
T1c = FMA(T1a, T1b, T19);
|
||||
T21 = FNMS(T1a, T18, T20);
|
||||
}
|
||||
{
|
||||
E T12, T15, T13, T1Y, T11, T14;
|
||||
T12 = cr[WS(rs, 11)];
|
||||
T15 = ci[WS(rs, 11)];
|
||||
T11 = W[20];
|
||||
T13 = T11 * T12;
|
||||
T1Y = T11 * T15;
|
||||
T14 = W[21];
|
||||
T16 = FMA(T14, T15, T13);
|
||||
T1Z = FNMS(T14, T12, T1Y);
|
||||
}
|
||||
{
|
||||
E T22, T1d, T1X, T2F, T2D, T2E;
|
||||
T22 = T1Z - T21;
|
||||
T1d = T16 + T1c;
|
||||
T1X = FNMS(KP500000000, T1d, T10);
|
||||
T1e = T10 + T1d;
|
||||
T2h = FMA(KP866025403, T22, T1X);
|
||||
T23 = FNMS(KP866025403, T22, T1X);
|
||||
T2F = T16 - T1c;
|
||||
T2D = T1Z + T21;
|
||||
T2E = FNMS(KP500000000, T2D, T2C);
|
||||
T2G = FNMS(KP866025403, T2F, T2E);
|
||||
T36 = T2C + T2D;
|
||||
T2X = FMA(KP866025403, T2F, T2E);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T3c, T3e, Tf, T1A, T33, T34, T3d, T35;
|
||||
{
|
||||
E T38, T3b, TU, T1z;
|
||||
T38 = T36 - T37;
|
||||
T3b = T39 - T3a;
|
||||
T3c = FNMS(KP618033988, T3b, T38);
|
||||
T3e = FMA(KP618033988, T38, T3b);
|
||||
Tf = T1 + Te;
|
||||
TU = Tz + TT;
|
||||
T1z = T1e + T1y;
|
||||
T1A = TU + T1z;
|
||||
T33 = FNMS(KP250000000, T1A, Tf);
|
||||
T34 = TU - T1z;
|
||||
}
|
||||
cr[0] = Tf + T1A;
|
||||
T3d = FMA(KP559016994, T34, T33);
|
||||
ci[WS(rs, 5)] = FNMS(KP951056516, T3e, T3d);
|
||||
cr[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
|
||||
T35 = FNMS(KP559016994, T34, T33);
|
||||
ci[WS(rs, 2)] = FNMS(KP951056516, T3c, T35);
|
||||
cr[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
|
||||
}
|
||||
{
|
||||
E T30, T32, T1H, T2c, T2R, T2S, T31, T2T;
|
||||
{
|
||||
E T2W, T2Z, T1W, T2b;
|
||||
T2W = T2U - T2V;
|
||||
T2Z = T2X - T2Y;
|
||||
T30 = FMA(KP618033988, T2Z, T2W);
|
||||
T32 = FNMS(KP618033988, T2W, T2Z);
|
||||
T1H = FNMS(KP866025403, T1G, T1B);
|
||||
T1W = T1O + T1V;
|
||||
T2b = T23 + T2a;
|
||||
T2c = T1W + T2b;
|
||||
T2R = FNMS(KP250000000, T2c, T1H);
|
||||
T2S = T1W - T2b;
|
||||
}
|
||||
cr[WS(rs, 5)] = T1H + T2c;
|
||||
T31 = FNMS(KP559016994, T2S, T2R);
|
||||
cr[WS(rs, 2)] = FNMS(KP951056516, T32, T31);
|
||||
ci[WS(rs, 6)] = FMA(KP951056516, T32, T31);
|
||||
T2T = FMA(KP559016994, T2S, T2R);
|
||||
ci[0] = FNMS(KP951056516, T30, T2T);
|
||||
ci[WS(rs, 3)] = FMA(KP951056516, T30, T2T);
|
||||
}
|
||||
{
|
||||
E T2O, T2Q, T2d, T2k, T2l, T2m, T2n, T2P;
|
||||
{
|
||||
E T2A, T2N, T2g, T2j;
|
||||
T2A = T2t - T2z;
|
||||
T2N = T2G - T2M;
|
||||
T2O = FMA(KP618033988, T2N, T2A);
|
||||
T2Q = FNMS(KP618033988, T2A, T2N);
|
||||
T2d = FMA(KP866025403, T1G, T1B);
|
||||
T2g = T2e + T2f;
|
||||
T2j = T2h + T2i;
|
||||
T2k = T2g + T2j;
|
||||
T2l = FNMS(KP250000000, T2k, T2d);
|
||||
T2m = T2g - T2j;
|
||||
}
|
||||
ci[WS(rs, 4)] = T2d + T2k;
|
||||
T2n = FMA(KP559016994, T2m, T2l);
|
||||
cr[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
|
||||
cr[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
|
||||
T2P = FNMS(KP559016994, T2m, T2l);
|
||||
cr[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
|
||||
ci[WS(rs, 1)] = FMA(KP951056516, T2Q, T2P);
|
||||
}
|
||||
{
|
||||
E T3s, T3u, T3m, T3h, T3n, T3o, T3t, T3p;
|
||||
{
|
||||
E T3q, T3r, T3f, T3g;
|
||||
T3q = T2h - T2i;
|
||||
T3r = T2e - T2f;
|
||||
T3s = FNMS(KP618033988, T3r, T3q);
|
||||
T3u = FMA(KP618033988, T3q, T3r);
|
||||
T3m = FMA(KP866025403, T3l, T3k);
|
||||
T3f = T2t + T2z;
|
||||
T3g = T2G + T2M;
|
||||
T3h = T3f + T3g;
|
||||
T3n = FNMS(KP250000000, T3h, T3m);
|
||||
T3o = T3f - T3g;
|
||||
}
|
||||
cr[WS(rs, 10)] = -(T3h + T3m);
|
||||
T3t = FMA(KP559016994, T3o, T3n);
|
||||
ci[WS(rs, 10)] = FMA(KP951056516, T3u, T3t);
|
||||
ci[WS(rs, 13)] = FNMS(KP951056516, T3u, T3t);
|
||||
T3p = FNMS(KP559016994, T3o, T3n);
|
||||
cr[WS(rs, 13)] = FMS(KP951056516, T3s, T3p);
|
||||
ci[WS(rs, 7)] = FMA(KP951056516, T3s, T3p);
|
||||
}
|
||||
{
|
||||
E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N;
|
||||
{
|
||||
E T3O, T3P, T3I, T3J;
|
||||
T3O = TT - Tz;
|
||||
T3P = T1y - T1e;
|
||||
T3Q = FMA(KP618033988, T3P, T3O);
|
||||
T3S = FNMS(KP618033988, T3O, T3P);
|
||||
T3H = T3j + T3i;
|
||||
T3I = T39 + T3a;
|
||||
T3J = T36 + T37;
|
||||
T3K = T3I + T3J;
|
||||
T3L = FNMS(KP250000000, T3K, T3H);
|
||||
T3M = T3I - T3J;
|
||||
}
|
||||
ci[WS(rs, 14)] = T3K + T3H;
|
||||
T3R = FNMS(KP559016994, T3M, T3L);
|
||||
cr[WS(rs, 12)] = FMS(KP951056516, T3S, T3R);
|
||||
ci[WS(rs, 11)] = FMA(KP951056516, T3S, T3R);
|
||||
T3N = FMA(KP559016994, T3M, T3L);
|
||||
cr[WS(rs, 9)] = FMS(KP951056516, T3Q, T3N);
|
||||
ci[WS(rs, 8)] = FMA(KP951056516, T3Q, T3N);
|
||||
}
|
||||
{
|
||||
E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B;
|
||||
{
|
||||
E T3C, T3D, T3w, T3x;
|
||||
T3C = T1O - T1V;
|
||||
T3D = T23 - T2a;
|
||||
T3E = FMA(KP618033988, T3D, T3C);
|
||||
T3G = FNMS(KP618033988, T3C, T3D);
|
||||
T3v = FNMS(KP866025403, T3l, T3k);
|
||||
T3w = T2U + T2V;
|
||||
T3x = T2X + T2Y;
|
||||
T3y = T3w + T3x;
|
||||
T3z = FNMS(KP250000000, T3y, T3v);
|
||||
T3A = T3x - T3w;
|
||||
}
|
||||
ci[WS(rs, 9)] = T3y + T3v;
|
||||
T3F = FMA(KP559016994, T3A, T3z);
|
||||
cr[WS(rs, 8)] = FMS(KP951056516, T3G, T3F);
|
||||
ci[WS(rs, 12)] = FMA(KP951056516, T3G, T3F);
|
||||
T3B = FNMS(KP559016994, T3A, T3z);
|
||||
cr[WS(rs, 11)] = FMS(KP951056516, T3E, T3B);
|
||||
cr[WS(rs, 14)] = -(FMA(KP951056516, T3E, T3B));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by the hf_15 descriptor.  It holds a
 * TW_FULL entry carrying the codelet size (15) followed by a TW_NEXT entry.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set (the
 * kernel reads 28 values per pass) and TW_NEXT ends the list -- confirm
 * against the tw_instr definitions, which are not part of this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA variant of hf_15: size 15, name "hf_15", the twinstr
 * table and &GENUS.  The first three numbers of { 72, 28, 112, 0 } equal the
 * additions / multiplications / fused multiply-add counts quoted in this
 * variant's header comment; the meaning of the fourth is not shown here. */
static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, { 72, 28, 112, 0 } };
|
||||
|
||||
/* Registration hook (FMA build): passes the hf_15 kernel and its descriptor
 * `desc` to planner `p` via X(khc2hc_register). */
void X(codelet_hf_15) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_15, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 15 -dit -name hf_15 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 184 FP additions, 112 FP multiplications,
|
||||
* (or, 128 additions, 56 multiplications, 56 fused multiply/add),
|
||||
* 65 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_15: size-15 twiddle codelet in its non -fma form (the generator command
 * in the comment above has no -fma flag; this copy sits in the #else branch).
 *
 * cr, ci : data arrays; elements at offsets WS(rs, 0) .. WS(rs, 14) are read
 *          and then overwritten in place.
 * W      : twiddle table; W[0] .. W[27] are read in every iteration.
 * rs     : stride handed to WS() to locate the 15 elements.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration step: cr advances by +ms, ci moves by -ms.
 */
static void hf_15(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* By value: sin(pi/5), sin(2*pi/5), 1/4, sqrt(5)/4, 1/2, sqrt(3)/2. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* W starts at offset (mb - 1) * 28 and advances by 28 values per iteration. */
for (m = mb, W = W + ((mb - 1) * 28); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
|
||||
E T1q, T2Q, Td, T1n, T2T, T3l, T13, T1k, T1l, T2E, T2F, T3j, T1H, T1T, T2k;
|
||||
E T2w, T2f, T2v, T1M, T1U, Tu, TL, TM, T2H, T2I, T3i, T1w, T1Q, T29, T2t;
|
||||
E T24, T2s, T1B, T1R;
|
||||
/* Inputs 0, 5 and 10.  Input 0 is used as is; every other input k is
   combined with the twiddle pair W[2k-2], W[2k-1].
   NOTE(review): assuming FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
   (macros defined outside this chunk), each pair of lines forms
   Wa*cr + Wb*ci and Wa*ci - Wb*cr -- confirm. */
{
|
||||
E T1, T2R, T6, T1o, Tb, T1p, Tc, T2S;
|
||||
T1 = cr[0];
|
||||
T2R = ci[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = cr[WS(rs, 5)];
|
||||
T5 = ci[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = W[9];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T1o = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = cr[WS(rs, 10)];
|
||||
Ta = ci[WS(rs, 10)];
|
||||
T7 = W[18];
|
||||
T9 = W[19];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
T1p = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
T1q = KP866025403 * (T1o - T1p);
|
||||
T2Q = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
Td = T1 + Tc;
|
||||
T1n = FNMS(KP500000000, Tc, T1);
|
||||
T2S = T1o + T1p;
|
||||
T2T = FNMS(KP500000000, T2S, T2R);
|
||||
T3l = T2S + T2R;
|
||||
}
|
||||
/* Inputs 6, 9, 11, 1, 14 and 4. */
{
|
||||
E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
|
||||
E T2i;
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = cr[WS(rs, 6)];
|
||||
TQ = ci[WS(rs, 6)];
|
||||
TN = W[10];
|
||||
TP = W[11];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T2c = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E T15, T17, T14, T16;
|
||||
T15 = cr[WS(rs, 9)];
|
||||
T17 = ci[WS(rs, 9)];
|
||||
T14 = W[16];
|
||||
T16 = W[17];
|
||||
T18 = FMA(T14, T15, T16 * T17);
|
||||
T2h = FNMS(T16, T15, T14 * T17);
|
||||
}
|
||||
{
|
||||
E TT, TV, TS, TU;
|
||||
TT = cr[WS(rs, 11)];
|
||||
TV = ci[WS(rs, 11)];
|
||||
TS = W[20];
|
||||
TU = W[21];
|
||||
TW = FMA(TS, TT, TU * TV);
|
||||
T1E = FNMS(TU, TT, TS * TV);
|
||||
}
|
||||
{
|
||||
E TY, T10, TX, TZ;
|
||||
TY = cr[WS(rs, 1)];
|
||||
T10 = ci[WS(rs, 1)];
|
||||
TX = W[0];
|
||||
TZ = W[1];
|
||||
T11 = FMA(TX, TY, TZ * T10);
|
||||
T1F = FNMS(TZ, TY, TX * T10);
|
||||
}
|
||||
T12 = TW + T11;
|
||||
T2d = T1E + T1F;
|
||||
{
|
||||
E T1a, T1c, T19, T1b;
|
||||
T1a = cr[WS(rs, 14)];
|
||||
T1c = ci[WS(rs, 14)];
|
||||
T19 = W[26];
|
||||
T1b = W[27];
|
||||
T1d = FMA(T19, T1a, T1b * T1c);
|
||||
T1J = FNMS(T1b, T1a, T19 * T1c);
|
||||
}
|
||||
{
|
||||
E T1f, T1h, T1e, T1g;
|
||||
T1f = cr[WS(rs, 4)];
|
||||
T1h = ci[WS(rs, 4)];
|
||||
T1e = W[6];
|
||||
T1g = W[7];
|
||||
T1i = FMA(T1e, T1f, T1g * T1h);
|
||||
T1K = FNMS(T1g, T1f, T1e * T1h);
|
||||
}
|
||||
T1j = T1d + T1i;
|
||||
T2i = T1J + T1K;
|
||||
{
|
||||
E T1D, T1G, T2g, T2j;
|
||||
T13 = TR + T12;
|
||||
T1k = T18 + T1j;
|
||||
T1l = T13 + T1k;
|
||||
T2E = T2c + T2d;
|
||||
T2F = T2h + T2i;
|
||||
T3j = T2E + T2F;
|
||||
T1D = FNMS(KP500000000, T12, TR);
|
||||
T1G = KP866025403 * (T1E - T1F);
|
||||
T1H = T1D - T1G;
|
||||
T1T = T1D + T1G;
|
||||
T2g = KP866025403 * (T1d - T1i);
|
||||
T2j = FNMS(KP500000000, T2i, T2h);
|
||||
T2k = T2g - T2j;
|
||||
T2w = T2g + T2j;
|
||||
{
|
||||
E T2b, T2e, T1I, T1L;
|
||||
T2b = KP866025403 * (T11 - TW);
|
||||
T2e = FNMS(KP500000000, T2d, T2c);
|
||||
T2f = T2b + T2e;
|
||||
T2v = T2e - T2b;
|
||||
T1I = FNMS(KP500000000, T1j, T18);
|
||||
T1L = KP866025403 * (T1J - T1K);
|
||||
T1M = T1I - T1L;
|
||||
T1U = T1I + T1L;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Inputs 3, 12, 8, 13, 2 and 7. */
{
|
||||
E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
|
||||
E T27;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = cr[WS(rs, 3)];
|
||||
Th = ci[WS(rs, 3)];
|
||||
Te = W[4];
|
||||
Tg = W[5];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
T21 = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = cr[WS(rs, 12)];
|
||||
Ty = ci[WS(rs, 12)];
|
||||
Tv = W[22];
|
||||
Tx = W[23];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T26 = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = cr[WS(rs, 8)];
|
||||
Tm = ci[WS(rs, 8)];
|
||||
Tj = W[14];
|
||||
Tl = W[15];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T1t = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = cr[WS(rs, 13)];
|
||||
Tr = ci[WS(rs, 13)];
|
||||
To = W[24];
|
||||
Tq = W[25];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T1u = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
Tt = Tn + Ts;
|
||||
T22 = T1t + T1u;
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = cr[WS(rs, 2)];
|
||||
TD = ci[WS(rs, 2)];
|
||||
TA = W[2];
|
||||
TC = W[3];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T1y = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
{
|
||||
E TG, TI, TF, TH;
|
||||
TG = cr[WS(rs, 7)];
|
||||
TI = ci[WS(rs, 7)];
|
||||
TF = W[12];
|
||||
TH = W[13];
|
||||
TJ = FMA(TF, TG, TH * TI);
|
||||
T1z = FNMS(TH, TG, TF * TI);
|
||||
}
|
||||
TK = TE + TJ;
|
||||
T27 = T1y + T1z;
|
||||
{
|
||||
E T1s, T1v, T25, T28;
|
||||
Tu = Ti + Tt;
|
||||
TL = Tz + TK;
|
||||
TM = Tu + TL;
|
||||
T2H = T21 + T22;
|
||||
T2I = T26 + T27;
|
||||
T3i = T2H + T2I;
|
||||
T1s = FNMS(KP500000000, Tt, Ti);
|
||||
T1v = KP866025403 * (T1t - T1u);
|
||||
T1w = T1s - T1v;
|
||||
T1Q = T1s + T1v;
|
||||
T25 = KP866025403 * (TJ - TE);
|
||||
T28 = FNMS(KP500000000, T27, T26);
|
||||
T29 = T25 + T28;
|
||||
T2t = T28 - T25;
|
||||
{
|
||||
E T20, T23, T1x, T1A;
|
||||
T20 = KP866025403 * (Ts - Tn);
|
||||
T23 = FNMS(KP500000000, T22, T21);
|
||||
T24 = T20 + T23;
|
||||
T2s = T23 - T20;
|
||||
T1x = FNMS(KP500000000, TK, Tz);
|
||||
T1A = KP866025403 * (T1y - T1z);
|
||||
T1B = T1x - T1A;
|
||||
T1R = T1x + T1A;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Writes cr[0], ci[5], cr[6], ci[2], cr[3]. */
{
|
||||
E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
|
||||
T2C = KP559016994 * (TM - T1l);
|
||||
T1m = TM + T1l;
|
||||
T2B = FNMS(KP250000000, T1m, Td);
|
||||
T2G = T2E - T2F;
|
||||
T2J = T2H - T2I;
|
||||
T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
|
||||
T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
|
||||
cr[0] = Td + T1m;
|
||||
T2L = T2C + T2B;
|
||||
ci[WS(rs, 5)] = T2L - T2M;
|
||||
cr[WS(rs, 6)] = T2L + T2M;
|
||||
T2D = T2B - T2C;
|
||||
ci[WS(rs, 2)] = T2D - T2K;
|
||||
cr[WS(rs, 3)] = T2D + T2K;
|
||||
}
|
||||
/* Writes ci[14], cr[12], ci[11], cr[9], ci[8]. */
{
|
||||
E T3k, T3m, T3n, T3h, T3p, T3f, T3g, T3q, T3o;
|
||||
T3k = KP559016994 * (T3i - T3j);
|
||||
T3m = T3i + T3j;
|
||||
T3n = FNMS(KP250000000, T3m, T3l);
|
||||
T3f = T1k - T13;
|
||||
T3g = Tu - TL;
|
||||
T3h = FNMS(KP951056516, T3g, KP587785252 * T3f);
|
||||
T3p = FMA(KP587785252, T3g, KP951056516 * T3f);
|
||||
ci[WS(rs, 14)] = T3m + T3l;
|
||||
T3q = T3n - T3k;
|
||||
cr[WS(rs, 12)] = T3p - T3q;
|
||||
ci[WS(rs, 11)] = T3p + T3q;
|
||||
T3o = T3k + T3n;
|
||||
cr[WS(rs, 9)] = T3h - T3o;
|
||||
ci[WS(rs, 8)] = T3h + T3o;
|
||||
}
|
||||
/* Writes cr[5], cr[2], ci[6], ci[0], ci[3]. */
{
|
||||
E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
|
||||
{
|
||||
E T2u, T2x, T1C, T1N;
|
||||
T2u = T2s - T2t;
|
||||
T2x = T2v - T2w;
|
||||
T2y = FMA(KP951056516, T2u, KP587785252 * T2x);
|
||||
T2A = FNMS(KP587785252, T2u, KP951056516 * T2x);
|
||||
T1r = T1n - T1q;
|
||||
T1C = T1w + T1B;
|
||||
T1N = T1H + T1M;
|
||||
T1O = T1C + T1N;
|
||||
T2p = KP559016994 * (T1C - T1N);
|
||||
T2q = FNMS(KP250000000, T1O, T1r);
|
||||
}
|
||||
cr[WS(rs, 5)] = T1r + T1O;
|
||||
T2z = T2q - T2p;
|
||||
cr[WS(rs, 2)] = T2z - T2A;
|
||||
ci[WS(rs, 6)] = T2z + T2A;
|
||||
T2r = T2p + T2q;
|
||||
ci[0] = T2r - T2y;
|
||||
ci[WS(rs, 3)] = T2r + T2y;
|
||||
}
|
||||
/* Writes ci[9], cr[8], ci[12], cr[11], cr[14]. */
{
|
||||
E T35, T3d, T39, T3a, T38, T3b, T3e, T3c;
|
||||
{
|
||||
E T33, T34, T36, T37;
|
||||
T33 = T1w - T1B;
|
||||
T34 = T1H - T1M;
|
||||
T35 = FMA(KP951056516, T33, KP587785252 * T34);
|
||||
T3d = FNMS(KP587785252, T33, KP951056516 * T34);
|
||||
T39 = T2T - T2Q;
|
||||
T36 = T2v + T2w;
|
||||
T37 = T2s + T2t;
|
||||
T3a = T37 + T36;
|
||||
T38 = KP559016994 * (T36 - T37);
|
||||
T3b = FNMS(KP250000000, T3a, T39);
|
||||
}
|
||||
ci[WS(rs, 9)] = T3a + T39;
|
||||
T3e = T38 + T3b;
|
||||
cr[WS(rs, 8)] = T3d - T3e;
|
||||
ci[WS(rs, 12)] = T3d + T3e;
|
||||
T3c = T38 - T3b;
|
||||
cr[WS(rs, 11)] = T35 + T3c;
|
||||
cr[WS(rs, 14)] = T3c - T35;
|
||||
}
|
||||
/* Writes cr[10], ci[10], ci[13], cr[13], ci[7]. */
{
|
||||
E T2X, T31, T2U, T2P, T2Y, T2Z, T32, T30;
|
||||
{
|
||||
E T2V, T2W, T2N, T2O;
|
||||
T2V = T1T - T1U;
|
||||
T2W = T1Q - T1R;
|
||||
T2X = FNMS(KP587785252, T2W, KP951056516 * T2V);
|
||||
T31 = FMA(KP951056516, T2W, KP587785252 * T2V);
|
||||
T2U = T2Q + T2T;
|
||||
T2N = T2k - T2f;
|
||||
T2O = T24 + T29;
|
||||
T2P = T2N - T2O;
|
||||
T2Y = FMA(KP250000000, T2P, T2U);
|
||||
T2Z = KP559016994 * (T2O + T2N);
|
||||
}
|
||||
cr[WS(rs, 10)] = T2P - T2U;
|
||||
T32 = T2Z + T2Y;
|
||||
ci[WS(rs, 10)] = T31 + T32;
|
||||
ci[WS(rs, 13)] = T32 - T31;
|
||||
T30 = T2Y - T2Z;
|
||||
cr[WS(rs, 13)] = T2X - T30;
|
||||
ci[WS(rs, 7)] = T2X + T30;
|
||||
}
|
||||
/* Writes ci[4], cr[4], cr[1], cr[7], ci[1]. */
{
|
||||
E T2m, T2o, T1P, T1W, T1X, T1Y, T1Z, T2n;
|
||||
{
|
||||
E T2a, T2l, T1S, T1V;
|
||||
T2a = T24 - T29;
|
||||
T2l = T2f + T2k;
|
||||
T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
|
||||
T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
|
||||
T1P = T1n + T1q;
|
||||
T1S = T1Q + T1R;
|
||||
T1V = T1T + T1U;
|
||||
T1W = T1S + T1V;
|
||||
T1X = KP559016994 * (T1S - T1V);
|
||||
T1Y = FNMS(KP250000000, T1W, T1P);
|
||||
}
|
||||
ci[WS(rs, 4)] = T1P + T1W;
|
||||
T1Z = T1X + T1Y;
|
||||
cr[WS(rs, 4)] = T1Z - T2m;
|
||||
cr[WS(rs, 1)] = T1Z + T2m;
|
||||
T2n = T1Y - T1X;
|
||||
cr[WS(rs, 7)] = T2n - T2o;
|
||||
ci[WS(rs, 1)] = T2n + T2o;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction list for the size-15 codelet; it is referenced by the
   `desc` initializer that follows.  The kernel in this branch reads W[0] ..
   W[27], i.e. one pair of values for each of the inputs 1 .. 14.
   NOTE(review): TW_FULL and TW_NEXT are defined outside this chunk; the
   TW_NEXT entry presumably terminates the list -- confirm. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non -fma build of hf_15: size 15, the name string
   "hf_15", the twiddle instruction list, the address of GENUS (defined outside
   this chunk) and four counts.  The first three counts equal the generator's
   summary for this variant: 128 additions, 56 multiplications, 56 fused
   multiply/adds.  NOTE(review): meaning of the trailing 0 not visible here. */
static const hc2hc_desc desc = { 15, "hf_15", twinstr, &GENUS, { 128, 56, 56, 0 } };
|
||||
|
||||
/* Registration entry point: passes the hf_15 kernel and its descriptor to the
   planner `p` through X(khc2hc_register).
   NOTE(review): X() is defined outside this chunk; presumably it decorates the
   identifier with the library's name prefix -- confirm. */
void X(codelet_hf_15) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_15, &desc);
|
||||
}
|
||||
#endif
|
||||
796
fftw-3.3.10/rdft/scalar/r2cf/hf_16.c
Normal file
796
fftw-3.3.10/rdft/scalar/r2cf/hf_16.c
Normal file
@@ -0,0 +1,796 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:14 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hf_16 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 100 FP multiplications,
|
||||
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
|
||||
* 60 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_16: size-16 twiddle codelet in its -fma form (see the generator command
 * in the comment above); compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * cr, ci : data arrays; elements at offsets WS(rs, 0) .. WS(rs, 15) are read
 *          and then overwritten in place.
 * W      : twiddle table; W[0] .. W[29] are read in every iteration.
 * rs     : stride handed to WS() to locate the 16 elements.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration step: cr advances by +ms, ci moves by -ms.
 */
static void hf_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* By value: cos(pi/8), tan(pi/8) = sqrt(2) - 1, sqrt(1/2). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W starts at offset (mb - 1) * 30 and advances by 30 values per iteration. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T8, T3A, T1I, T3o, T1s, T35, T2k, T2w, T1F, T36, T2p, T2r, Tl, T3z, T1N;
|
||||
E T3k, Tz, T2W, T1P, T1U, T11, T30, T25, T2g, T1e, T31, T2a, T2h, TM, T2V;
|
||||
E T1W, T21;
|
||||
/* Inputs 0 and 8.  Input 0 is used as is; every other input k is combined
   with the twiddle pair W[2k-2], W[2k-1].
   NOTE(review): assuming FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
   (macros defined outside this chunk), T7 = W[14]*cr + W[15]*ci and
   T3m = W[14]*ci - W[15]*cr -- confirm. */
{
|
||||
E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
|
||||
T1 = cr[0];
|
||||
T3n = ci[0];
|
||||
T3 = cr[WS(rs, 8)];
|
||||
T6 = ci[WS(rs, 8)];
|
||||
T2 = W[14];
|
||||
T4 = T2 * T3;
|
||||
T3l = T2 * T6;
|
||||
T5 = W[15];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T3m = FNMS(T5, T3, T3l);
|
||||
T8 = T1 + T7;
|
||||
T3A = T3n - T3m;
|
||||
T1I = T1 - T7;
|
||||
T3o = T3m + T3n;
|
||||
}
|
||||
/* Inputs 15 and 7. */
{
|
||||
E T1h, T1k, T1i, T2s, T1n, T1q, T1o, T2u, T1g, T1m;
|
||||
T1h = cr[WS(rs, 15)];
|
||||
T1k = ci[WS(rs, 15)];
|
||||
T1g = W[28];
|
||||
T1i = T1g * T1h;
|
||||
T2s = T1g * T1k;
|
||||
T1n = cr[WS(rs, 7)];
|
||||
T1q = ci[WS(rs, 7)];
|
||||
T1m = W[12];
|
||||
T1o = T1m * T1n;
|
||||
T2u = T1m * T1q;
|
||||
{
|
||||
E T1l, T2t, T1r, T2v, T1j, T1p;
|
||||
T1j = W[29];
|
||||
T1l = FMA(T1j, T1k, T1i);
|
||||
T2t = FNMS(T1j, T1h, T2s);
|
||||
T1p = W[13];
|
||||
T1r = FMA(T1p, T1q, T1o);
|
||||
T2v = FNMS(T1p, T1n, T2u);
|
||||
T1s = T1l + T1r;
|
||||
T35 = T2t + T2v;
|
||||
T2k = T1l - T1r;
|
||||
T2w = T2t - T2v;
|
||||
}
|
||||
}
|
||||
/* Inputs 3 and 11. */
{
|
||||
E T1u, T1x, T1v, T2l, T1A, T1D, T1B, T2n, T1t, T1z;
|
||||
T1u = cr[WS(rs, 3)];
|
||||
T1x = ci[WS(rs, 3)];
|
||||
T1t = W[4];
|
||||
T1v = T1t * T1u;
|
||||
T2l = T1t * T1x;
|
||||
T1A = cr[WS(rs, 11)];
|
||||
T1D = ci[WS(rs, 11)];
|
||||
T1z = W[20];
|
||||
T1B = T1z * T1A;
|
||||
T2n = T1z * T1D;
|
||||
{
|
||||
E T1y, T2m, T1E, T2o, T1w, T1C;
|
||||
T1w = W[5];
|
||||
T1y = FMA(T1w, T1x, T1v);
|
||||
T2m = FNMS(T1w, T1u, T2l);
|
||||
T1C = W[21];
|
||||
T1E = FMA(T1C, T1D, T1B);
|
||||
T2o = FNMS(T1C, T1A, T2n);
|
||||
T1F = T1y + T1E;
|
||||
T36 = T2m + T2o;
|
||||
T2p = T2m - T2o;
|
||||
T2r = T1E - T1y;
|
||||
}
|
||||
}
|
||||
/* Inputs 4 and 12. */
{
|
||||
E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
|
||||
Ta = cr[WS(rs, 4)];
|
||||
Td = ci[WS(rs, 4)];
|
||||
T9 = W[6];
|
||||
Tb = T9 * Ta;
|
||||
T1J = T9 * Td;
|
||||
Tg = cr[WS(rs, 12)];
|
||||
Tj = ci[WS(rs, 12)];
|
||||
Tf = W[22];
|
||||
Th = Tf * Tg;
|
||||
T1L = Tf * Tj;
|
||||
{
|
||||
E Te, T1K, Tk, T1M, Tc, Ti;
|
||||
Tc = W[7];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
T1K = FNMS(Tc, Ta, T1J);
|
||||
Ti = W[23];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
T1M = FNMS(Ti, Tg, T1L);
|
||||
Tl = Te + Tk;
|
||||
T3z = Te - Tk;
|
||||
T1N = T1K - T1M;
|
||||
T3k = T1K + T1M;
|
||||
}
|
||||
}
|
||||
/* Inputs 2 and 10. */
{
|
||||
E To, Tr, Tp, T1Q, Tu, Tx, Tv, T1S, Tn, Tt;
|
||||
To = cr[WS(rs, 2)];
|
||||
Tr = ci[WS(rs, 2)];
|
||||
Tn = W[2];
|
||||
Tp = Tn * To;
|
||||
T1Q = Tn * Tr;
|
||||
Tu = cr[WS(rs, 10)];
|
||||
Tx = ci[WS(rs, 10)];
|
||||
Tt = W[18];
|
||||
Tv = Tt * Tu;
|
||||
T1S = Tt * Tx;
|
||||
{
|
||||
E Ts, T1R, Ty, T1T, Tq, Tw;
|
||||
Tq = W[3];
|
||||
Ts = FMA(Tq, Tr, Tp);
|
||||
T1R = FNMS(Tq, To, T1Q);
|
||||
Tw = W[19];
|
||||
Ty = FMA(Tw, Tx, Tv);
|
||||
T1T = FNMS(Tw, Tu, T1S);
|
||||
Tz = Ts + Ty;
|
||||
T2W = T1R + T1T;
|
||||
T1P = Ts - Ty;
|
||||
T1U = T1R - T1T;
|
||||
}
|
||||
}
|
||||
/* Inputs 1 and 9. */
{
|
||||
E TQ, TT, TR, T2c, TW, TZ, TX, T2e, TP, TV;
|
||||
TQ = cr[WS(rs, 1)];
|
||||
TT = ci[WS(rs, 1)];
|
||||
TP = W[0];
|
||||
TR = TP * TQ;
|
||||
T2c = TP * TT;
|
||||
TW = cr[WS(rs, 9)];
|
||||
TZ = ci[WS(rs, 9)];
|
||||
TV = W[16];
|
||||
TX = TV * TW;
|
||||
T2e = TV * TZ;
|
||||
{
|
||||
E TU, T2d, T10, T2f, TS, TY;
|
||||
TS = W[1];
|
||||
TU = FMA(TS, TT, TR);
|
||||
T2d = FNMS(TS, TQ, T2c);
|
||||
TY = W[17];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T2f = FNMS(TY, TW, T2e);
|
||||
T11 = TU + T10;
|
||||
T30 = T2d + T2f;
|
||||
T25 = TU - T10;
|
||||
T2g = T2d - T2f;
|
||||
}
|
||||
}
|
||||
/* Inputs 5 and 13. */
{
|
||||
E T13, T16, T14, T26, T19, T1c, T1a, T28, T12, T18;
|
||||
T13 = cr[WS(rs, 5)];
|
||||
T16 = ci[WS(rs, 5)];
|
||||
T12 = W[8];
|
||||
T14 = T12 * T13;
|
||||
T26 = T12 * T16;
|
||||
T19 = cr[WS(rs, 13)];
|
||||
T1c = ci[WS(rs, 13)];
|
||||
T18 = W[24];
|
||||
T1a = T18 * T19;
|
||||
T28 = T18 * T1c;
|
||||
{
|
||||
E T17, T27, T1d, T29, T15, T1b;
|
||||
T15 = W[9];
|
||||
T17 = FMA(T15, T16, T14);
|
||||
T27 = FNMS(T15, T13, T26);
|
||||
T1b = W[25];
|
||||
T1d = FMA(T1b, T1c, T1a);
|
||||
T29 = FNMS(T1b, T19, T28);
|
||||
T1e = T17 + T1d;
|
||||
T31 = T27 + T29;
|
||||
T2a = T27 - T29;
|
||||
T2h = T17 - T1d;
|
||||
}
|
||||
}
|
||||
/* Inputs 14 and 6. */
{
|
||||
E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
|
||||
TB = cr[WS(rs, 14)];
|
||||
TE = ci[WS(rs, 14)];
|
||||
TA = W[26];
|
||||
TC = TA * TB;
|
||||
T1X = TA * TE;
|
||||
TH = cr[WS(rs, 6)];
|
||||
TK = ci[WS(rs, 6)];
|
||||
TG = W[10];
|
||||
TI = TG * TH;
|
||||
T1Z = TG * TK;
|
||||
{
|
||||
E TF, T1Y, TL, T20, TD, TJ;
|
||||
TD = W[27];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T1Y = FNMS(TD, TB, T1X);
|
||||
TJ = W[11];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T20 = FNMS(TJ, TH, T1Z);
|
||||
TM = TF + TL;
|
||||
T2V = T1Y + T20;
|
||||
T1W = TF - TL;
|
||||
T21 = T1Y - T20;
|
||||
}
|
||||
}
|
||||
/* Writes ci[7], cr[12], ci[11], cr[0], cr[4], cr[8], ci[15], ci[3]
   (sums and differences only, no constant multiplies). */
{
|
||||
E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
|
||||
{
|
||||
E Tm, TN, T3j, T3p;
|
||||
Tm = T8 + Tl;
|
||||
TN = Tz + TM;
|
||||
TO = Tm + TN;
|
||||
T3e = Tm - TN;
|
||||
T3j = T2W + T2V;
|
||||
T3p = T3k + T3o;
|
||||
T3q = T3j + T3p;
|
||||
T3s = T3p - T3j;
|
||||
}
|
||||
{
|
||||
E T1f, T1G, T3f, T3g;
|
||||
T1f = T11 + T1e;
|
||||
T1G = T1s + T1F;
|
||||
T1H = T1f + T1G;
|
||||
T3r = T1G - T1f;
|
||||
T3f = T35 + T36;
|
||||
T3g = T30 + T31;
|
||||
T3h = T3f - T3g;
|
||||
T3i = T3g + T3f;
|
||||
}
|
||||
ci[WS(rs, 7)] = TO - T1H;
|
||||
cr[WS(rs, 12)] = T3r - T3s;
|
||||
ci[WS(rs, 11)] = T3r + T3s;
|
||||
cr[0] = TO + T1H;
|
||||
cr[WS(rs, 4)] = T3e - T3h;
|
||||
cr[WS(rs, 8)] = T3i - T3q;
|
||||
ci[WS(rs, 15)] = T3i + T3q;
|
||||
ci[WS(rs, 3)] = T3e + T3h;
|
||||
}
|
||||
/* Writes ci[5], cr[2], cr[10], ci[13], cr[6], ci[1], cr[14], ci[9];
   each uses the KP707106781 constant. */
{
|
||||
E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
|
||||
{
|
||||
E T2U, T2X, T3t, T3u;
|
||||
T2U = T8 - Tl;
|
||||
T2X = T2V - T2W;
|
||||
T2Y = T2U - T2X;
|
||||
T3a = T2U + T2X;
|
||||
T3t = Tz - TM;
|
||||
T3u = T3o - T3k;
|
||||
T3v = T3t + T3u;
|
||||
T3x = T3u - T3t;
|
||||
}
|
||||
{
|
||||
E T2Z, T32, T34, T37;
|
||||
T2Z = T11 - T1e;
|
||||
T32 = T30 - T31;
|
||||
T33 = T2Z + T32;
|
||||
T3b = T2Z - T32;
|
||||
T34 = T1s - T1F;
|
||||
T37 = T35 - T36;
|
||||
T38 = T34 - T37;
|
||||
T3c = T34 + T37;
|
||||
}
|
||||
{
|
||||
E T39, T3y, T3d, T3w;
|
||||
T39 = T33 + T38;
|
||||
ci[WS(rs, 5)] = FNMS(KP707106781, T39, T2Y);
|
||||
cr[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
|
||||
T3y = T3c - T3b;
|
||||
cr[WS(rs, 10)] = FMS(KP707106781, T3y, T3x);
|
||||
ci[WS(rs, 13)] = FMA(KP707106781, T3y, T3x);
|
||||
T3d = T3b + T3c;
|
||||
cr[WS(rs, 6)] = FNMS(KP707106781, T3d, T3a);
|
||||
ci[WS(rs, 1)] = FMA(KP707106781, T3d, T3a);
|
||||
T3w = T38 - T33;
|
||||
cr[WS(rs, 14)] = FMS(KP707106781, T3w, T3v);
|
||||
ci[WS(rs, 9)] = FMA(KP707106781, T3w, T3v);
|
||||
}
|
||||
}
|
||||
/* Writes the remaining sixteen slots: cr[1], cr[3], cr[5], cr[7], cr[9],
   cr[11], cr[13], cr[15] and ci[0], ci[2], ci[4], ci[6], ci[8], ci[10],
   ci[12], ci[14]. */
{
|
||||
E T1O, T3B, T3H, T2E, T23, T3I, T2O, T2R, T2H, T3C, T2j, T2B, T2L, T2S, T2y;
|
||||
E T2C;
|
||||
{
|
||||
E T1V, T22, T2b, T2i;
|
||||
T1O = T1I - T1N;
|
||||
T3B = T3z + T3A;
|
||||
T3H = T3A - T3z;
|
||||
T2E = T1I + T1N;
|
||||
T1V = T1P - T1U;
|
||||
T22 = T1W + T21;
|
||||
T23 = T1V + T22;
|
||||
T3I = T22 - T1V;
|
||||
{
|
||||
E T2M, T2N, T2F, T2G;
|
||||
T2M = T2k + T2p;
|
||||
T2N = T2w + T2r;
|
||||
T2O = FNMS(KP414213562, T2N, T2M);
|
||||
T2R = FMA(KP414213562, T2M, T2N);
|
||||
T2F = T1P + T1U;
|
||||
T2G = T1W - T21;
|
||||
T2H = T2F + T2G;
|
||||
T3C = T2F - T2G;
|
||||
}
|
||||
T2b = T25 - T2a;
|
||||
T2i = T2g + T2h;
|
||||
T2j = FNMS(KP414213562, T2i, T2b);
|
||||
T2B = FMA(KP414213562, T2b, T2i);
|
||||
{
|
||||
E T2J, T2K, T2q, T2x;
|
||||
T2J = T25 + T2a;
|
||||
T2K = T2g - T2h;
|
||||
T2L = FMA(KP414213562, T2K, T2J);
|
||||
T2S = FNMS(KP414213562, T2J, T2K);
|
||||
T2q = T2k - T2p;
|
||||
T2x = T2r - T2w;
|
||||
T2y = FNMS(KP414213562, T2x, T2q);
|
||||
T2C = FMA(KP414213562, T2q, T2x);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T24, T2z, T3J, T3K;
|
||||
T24 = FMA(KP707106781, T23, T1O);
|
||||
T2z = T2j + T2y;
|
||||
cr[WS(rs, 7)] = FNMS(KP923879532, T2z, T24);
|
||||
ci[0] = FMA(KP923879532, T2z, T24);
|
||||
T3J = FMA(KP707106781, T3I, T3H);
|
||||
T3K = T2S + T2R;
|
||||
cr[WS(rs, 9)] = FMS(KP923879532, T3K, T3J);
|
||||
ci[WS(rs, 14)] = FMA(KP923879532, T3K, T3J);
|
||||
}
|
||||
{
|
||||
E T3L, T3M, T2A, T2D;
|
||||
T3L = FNMS(KP707106781, T3I, T3H);
|
||||
T3M = T2O - T2L;
|
||||
cr[WS(rs, 13)] = FMS(KP923879532, T3M, T3L);
|
||||
ci[WS(rs, 10)] = FMA(KP923879532, T3M, T3L);
|
||||
T2A = FNMS(KP707106781, T23, T1O);
|
||||
T2D = T2B + T2C;
|
||||
ci[WS(rs, 4)] = FNMS(KP923879532, T2D, T2A);
|
||||
cr[WS(rs, 3)] = FMA(KP923879532, T2D, T2A);
|
||||
}
|
||||
{
|
||||
E T2I, T2P, T3D, T3E;
|
||||
T2I = FMA(KP707106781, T2H, T2E);
|
||||
T2P = T2L + T2O;
|
||||
ci[WS(rs, 6)] = FNMS(KP923879532, T2P, T2I);
|
||||
cr[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
|
||||
T3D = FMA(KP707106781, T3C, T3B);
|
||||
T3E = T2C - T2B;
|
||||
cr[WS(rs, 15)] = FMS(KP923879532, T3E, T3D);
|
||||
ci[WS(rs, 8)] = FMA(KP923879532, T3E, T3D);
|
||||
}
|
||||
{
|
||||
E T3F, T3G, T2Q, T2T;
|
||||
T3F = FNMS(KP707106781, T3C, T3B);
|
||||
T3G = T2y - T2j;
|
||||
cr[WS(rs, 11)] = FMS(KP923879532, T3G, T3F);
|
||||
ci[WS(rs, 12)] = FMA(KP923879532, T3G, T3F);
|
||||
T2Q = FNMS(KP707106781, T2H, T2E);
|
||||
T2T = T2R - T2S;
|
||||
cr[WS(rs, 5)] = FNMS(KP923879532, T2T, T2Q);
|
||||
ci[WS(rs, 2)] = FMA(KP923879532, T2T, T2Q);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction list for the size-16 codelet; it is referenced by the
   `desc` initializer that follows.  The kernel in this branch reads W[0] ..
   W[29], i.e. one pair of values for each of the inputs 1 .. 15.
   NOTE(review): TW_FULL and TW_NEXT are defined outside this chunk; the
   TW_NEXT entry presumably terminates the list -- confirm. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the -fma build of hf_16: size 16, the name string "hf_16",
   the twiddle instruction list, the address of GENUS (defined outside this
   chunk) and four counts.  The first three counts equal the generator's
   summary for this variant: 104 additions, 30 multiplications, 70 fused
   multiply/adds.  NOTE(review): meaning of the trailing 0 not visible here. */
static const hc2hc_desc desc = { 16, "hf_16", twinstr, &GENUS, { 104, 30, 70, 0 } };
|
||||
|
||||
/* Registration entry point: passes the hf_16 kernel and its descriptor to the
   planner `p` through X(khc2hc_register).
   NOTE(review): X() is defined outside this chunk; presumably it decorates the
   identifier with the library's name prefix -- confirm. */
void X(codelet_hf_16) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_16, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 16 -dit -name hf_16 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 84 FP multiplications,
|
||||
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
|
||||
* 52 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_16: size-16 twiddle codelet in its non -fma form (the generator command
 * in the comment above has no -fma flag; this copy sits in the #else branch).
 *
 * cr, ci : data arrays; elements at offsets WS(rs, 0) .. WS(rs, 15) are read
 *          and then overwritten in place.
 * W      : twiddle table; W[0] .. W[29] are read in every iteration.
 * rs     : stride handed to WS() to locate the 16 elements.
 * mb, me : the loop runs for m = mb .. me - 1.
 * ms     : per-iteration step: cr advances by +ms, ci moves by -ms.
 */
static void hf_16(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* By value: sin(pi/8), cos(pi/8), sqrt(1/2). */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W starts at offset (mb - 1) * 30 and advances by 30 values per iteration. */
for (m = mb, W = W + ((mb - 1) * 30); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T7, T38, T1t, T2U, Ti, T37, T1w, T2R, Tu, T2t, T1C, T2c, TF, T2s, T1H;
|
||||
E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2k, T24, T2j, TS, T13, T2w, T2x;
|
||||
E T2y, T2z, T1O, T2h, T1T, T2g;
|
||||
/* Inputs 0 and 8.  Input 0 is used as is; every other input k is combined
   with the twiddle pair W[2k-2], W[2k-1].
   NOTE(review): assuming FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b
   (macros defined outside this chunk), each pair of lines forms
   Wa*cr + Wb*ci and Wa*ci - Wb*cr -- confirm. */
{
|
||||
E T1, T2T, T6, T2S;
|
||||
T1 = cr[0];
|
||||
T2T = ci[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = cr[WS(rs, 8)];
|
||||
T5 = ci[WS(rs, 8)];
|
||||
T2 = W[14];
|
||||
T4 = W[15];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T2S = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 + T6;
|
||||
T38 = T2T - T2S;
|
||||
T1t = T1 - T6;
|
||||
T2U = T2S + T2T;
|
||||
}
|
||||
/* Inputs 4 and 12. */
{
|
||||
E Tc, T1u, Th, T1v;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = cr[WS(rs, 4)];
|
||||
Tb = ci[WS(rs, 4)];
|
||||
T8 = W[6];
|
||||
Ta = W[7];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
T1u = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = cr[WS(rs, 12)];
|
||||
Tg = ci[WS(rs, 12)];
|
||||
Td = W[22];
|
||||
Tf = W[23];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
T1v = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
Ti = Tc + Th;
|
||||
T37 = Tc - Th;
|
||||
T1w = T1u - T1v;
|
||||
T2R = T1u + T1v;
|
||||
}
|
||||
/* Inputs 2 and 10. */
{
|
||||
E To, T1z, Tt, T1A, T1y, T1B;
|
||||
{
|
||||
E Tl, Tn, Tk, Tm;
|
||||
Tl = cr[WS(rs, 2)];
|
||||
Tn = ci[WS(rs, 2)];
|
||||
Tk = W[2];
|
||||
Tm = W[3];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
T1z = FNMS(Tm, Tl, Tk * Tn);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tp, Tr;
|
||||
Tq = cr[WS(rs, 10)];
|
||||
Ts = ci[WS(rs, 10)];
|
||||
Tp = W[18];
|
||||
Tr = W[19];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
T1A = FNMS(Tr, Tq, Tp * Ts);
|
||||
}
|
||||
Tu = To + Tt;
|
||||
T2t = T1z + T1A;
|
||||
T1y = To - Tt;
|
||||
T1B = T1z - T1A;
|
||||
T1C = T1y - T1B;
|
||||
T2c = T1y + T1B;
|
||||
}
|
||||
/* Inputs 14 and 6. */
{
|
||||
E Tz, T1E, TE, T1F, T1D, T1G;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = cr[WS(rs, 14)];
|
||||
Ty = ci[WS(rs, 14)];
|
||||
Tv = W[26];
|
||||
Tx = W[27];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T1E = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = cr[WS(rs, 6)];
|
||||
TD = ci[WS(rs, 6)];
|
||||
TA = W[10];
|
||||
TC = W[11];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T1F = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
TF = Tz + TE;
|
||||
T2s = T1E + T1F;
|
||||
T1D = Tz - TE;
|
||||
T1G = T1E - T1F;
|
||||
T1H = T1D + T1G;
|
||||
T2d = T1D - T1G;
|
||||
}
|
||||
/* Inputs 15, 11, 7 and 3. */
{
|
||||
E T19, T1V, T1p, T22, T1e, T1W, T1k, T21;
|
||||
{
|
||||
E T16, T18, T15, T17;
|
||||
T16 = cr[WS(rs, 15)];
|
||||
T18 = ci[WS(rs, 15)];
|
||||
T15 = W[28];
|
||||
T17 = W[29];
|
||||
T19 = FMA(T15, T16, T17 * T18);
|
||||
T1V = FNMS(T17, T16, T15 * T18);
|
||||
}
|
||||
{
|
||||
E T1m, T1o, T1l, T1n;
|
||||
T1m = cr[WS(rs, 11)];
|
||||
T1o = ci[WS(rs, 11)];
|
||||
T1l = W[20];
|
||||
T1n = W[21];
|
||||
T1p = FMA(T1l, T1m, T1n * T1o);
|
||||
T22 = FNMS(T1n, T1m, T1l * T1o);
|
||||
}
|
||||
{
|
||||
E T1b, T1d, T1a, T1c;
|
||||
T1b = cr[WS(rs, 7)];
|
||||
T1d = ci[WS(rs, 7)];
|
||||
T1a = W[12];
|
||||
T1c = W[13];
|
||||
T1e = FMA(T1a, T1b, T1c * T1d);
|
||||
T1W = FNMS(T1c, T1b, T1a * T1d);
|
||||
}
|
||||
{
|
||||
E T1h, T1j, T1g, T1i;
|
||||
T1h = cr[WS(rs, 3)];
|
||||
T1j = ci[WS(rs, 3)];
|
||||
T1g = W[4];
|
||||
T1i = W[5];
|
||||
T1k = FMA(T1g, T1h, T1i * T1j);
|
||||
T21 = FNMS(T1i, T1h, T1g * T1j);
|
||||
}
|
||||
T1f = T19 + T1e;
|
||||
T1q = T1k + T1p;
|
||||
T2B = T1f - T1q;
|
||||
T2C = T1V + T1W;
|
||||
T2D = T21 + T22;
|
||||
T2E = T2C - T2D;
|
||||
{
|
||||
E T1X, T1Y, T20, T23;
|
||||
T1X = T1V - T1W;
|
||||
T1Y = T1k - T1p;
|
||||
T1Z = T1X + T1Y;
|
||||
T2k = T1X - T1Y;
|
||||
T20 = T19 - T1e;
|
||||
T23 = T21 - T22;
|
||||
T24 = T20 - T23;
|
||||
T2j = T20 + T23;
|
||||
}
|
||||
}
|
||||
/* Inputs 1, 13, 9 and 5. */
{
|
||||
E TM, T1P, T12, T1M, TR, T1Q, TX, T1L;
|
||||
{
|
||||
E TJ, TL, TI, TK;
|
||||
TJ = cr[WS(rs, 1)];
|
||||
TL = ci[WS(rs, 1)];
|
||||
TI = W[0];
|
||||
TK = W[1];
|
||||
TM = FMA(TI, TJ, TK * TL);
|
||||
T1P = FNMS(TK, TJ, TI * TL);
|
||||
}
|
||||
{
|
||||
E TZ, T11, TY, T10;
|
||||
TZ = cr[WS(rs, 13)];
|
||||
T11 = ci[WS(rs, 13)];
|
||||
TY = W[24];
|
||||
T10 = W[25];
|
||||
T12 = FMA(TY, TZ, T10 * T11);
|
||||
T1M = FNMS(T10, TZ, TY * T11);
|
||||
}
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = cr[WS(rs, 9)];
|
||||
TQ = ci[WS(rs, 9)];
|
||||
TN = W[16];
|
||||
TP = W[17];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T1Q = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E TU, TW, TT, TV;
|
||||
TU = cr[WS(rs, 5)];
|
||||
TW = ci[WS(rs, 5)];
|
||||
TT = W[8];
|
||||
TV = W[9];
|
||||
TX = FMA(TT, TU, TV * TW);
|
||||
T1L = FNMS(TV, TU, TT * TW);
|
||||
}
|
||||
TS = TM + TR;
|
||||
T13 = TX + T12;
|
||||
T2w = TS - T13;
|
||||
T2x = T1P + T1Q;
|
||||
T2y = T1L + T1M;
|
||||
T2z = T2x - T2y;
|
||||
{
|
||||
E T1K, T1N, T1R, T1S;
|
||||
T1K = TM - TR;
|
||||
T1N = T1L - T1M;
|
||||
T1O = T1K - T1N;
|
||||
T2h = T1K + T1N;
|
||||
T1R = T1P - T1Q;
|
||||
T1S = TX - T12;
|
||||
T1T = T1R + T1S;
|
||||
T2g = T1R - T1S;
|
||||
}
|
||||
}
|
||||
/* Writes cr[7], cr[11], ci[12], ci[0], ci[4], cr[15], ci[8], cr[3]. */
{
|
||||
E T1J, T27, T3a, T3c, T26, T3b, T2a, T35;
|
||||
{
|
||||
E T1x, T1I, T36, T39;
|
||||
T1x = T1t - T1w;
|
||||
T1I = KP707106781 * (T1C + T1H);
|
||||
T1J = T1x + T1I;
|
||||
T27 = T1x - T1I;
|
||||
T36 = KP707106781 * (T2c - T2d);
|
||||
T39 = T37 + T38;
|
||||
T3a = T36 + T39;
|
||||
T3c = T39 - T36;
|
||||
}
|
||||
{
|
||||
E T1U, T25, T28, T29;
|
||||
T1U = FNMS(KP382683432, T1T, KP923879532 * T1O);
|
||||
T25 = FMA(KP382683432, T1Z, KP923879532 * T24);
|
||||
T26 = T1U + T25;
|
||||
T3b = T25 - T1U;
|
||||
T28 = FMA(KP923879532, T1T, KP382683432 * T1O);
|
||||
T29 = FNMS(KP923879532, T1Z, KP382683432 * T24);
|
||||
T2a = T28 + T29;
|
||||
T35 = T29 - T28;
|
||||
}
|
||||
cr[WS(rs, 7)] = T1J - T26;
|
||||
cr[WS(rs, 11)] = T3b - T3c;
|
||||
ci[WS(rs, 12)] = T3b + T3c;
|
||||
ci[0] = T1J + T26;
|
||||
ci[WS(rs, 4)] = T27 - T2a;
|
||||
cr[WS(rs, 15)] = T35 - T3a;
|
||||
ci[WS(rs, 8)] = T35 + T3a;
|
||||
cr[WS(rs, 3)] = T27 + T2a;
|
||||
}
|
||||
/* Writes ci[7], cr[12], ci[11], cr[0], cr[4], cr[8], ci[15], ci[3]
   (sums and differences only, no constant multiplies). */
{
|
||||
E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
|
||||
{
|
||||
E Tj, TG, T2Q, T2V;
|
||||
Tj = T7 + Ti;
|
||||
TG = Tu + TF;
|
||||
TH = Tj + TG;
|
||||
T2L = Tj - TG;
|
||||
T2Q = T2t + T2s;
|
||||
T2V = T2R + T2U;
|
||||
T2W = T2Q + T2V;
|
||||
T2Y = T2V - T2Q;
|
||||
}
|
||||
{
|
||||
E T14, T1r, T2M, T2N;
|
||||
T14 = TS + T13;
|
||||
T1r = T1f + T1q;
|
||||
T1s = T14 + T1r;
|
||||
T2X = T1r - T14;
|
||||
T2M = T2C + T2D;
|
||||
T2N = T2x + T2y;
|
||||
T2O = T2M - T2N;
|
||||
T2P = T2N + T2M;
|
||||
}
|
||||
ci[WS(rs, 7)] = TH - T1s;
|
||||
cr[WS(rs, 12)] = T2X - T2Y;
|
||||
ci[WS(rs, 11)] = T2X + T2Y;
|
||||
cr[0] = TH + T1s;
|
||||
cr[WS(rs, 4)] = T2L - T2O;
|
||||
cr[WS(rs, 8)] = T2P - T2W;
|
||||
ci[WS(rs, 15)] = T2P + T2W;
|
||||
ci[WS(rs, 3)] = T2L + T2O;
|
||||
}
|
||||
/* Writes ci[6], cr[13], ci[10], cr[1], cr[5], cr[9], ci[14], ci[2]. */
{
|
||||
E T2f, T2n, T3g, T3i, T2m, T3h, T2q, T3d;
|
||||
{
|
||||
E T2b, T2e, T3e, T3f;
|
||||
T2b = T1t + T1w;
|
||||
T2e = KP707106781 * (T2c + T2d);
|
||||
T2f = T2b + T2e;
|
||||
T2n = T2b - T2e;
|
||||
T3e = KP707106781 * (T1H - T1C);
|
||||
T3f = T38 - T37;
|
||||
T3g = T3e + T3f;
|
||||
T3i = T3f - T3e;
|
||||
}
|
||||
{
|
||||
E T2i, T2l, T2o, T2p;
|
||||
T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
|
||||
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
|
||||
T2m = T2i + T2l;
|
||||
T3h = T2l - T2i;
|
||||
T2o = FNMS(KP923879532, T2g, KP382683432 * T2h);
|
||||
T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
|
||||
T2q = T2o + T2p;
|
||||
T3d = T2p - T2o;
|
||||
}
|
||||
ci[WS(rs, 6)] = T2f - T2m;
|
||||
cr[WS(rs, 13)] = T3h - T3i;
|
||||
ci[WS(rs, 10)] = T3h + T3i;
|
||||
cr[WS(rs, 1)] = T2f + T2m;
|
||||
cr[WS(rs, 5)] = T2n - T2q;
|
||||
cr[WS(rs, 9)] = T3d - T3g;
|
||||
ci[WS(rs, 14)] = T3d + T3g;
|
||||
ci[WS(rs, 2)] = T2n + T2q;
|
||||
}
|
||||
/* Writes ci[5], cr[10], ci[13], cr[2], cr[6], cr[14], ci[9], ci[1]. */
{
|
||||
E T2v, T2H, T32, T34, T2G, T2Z, T2K, T33;
|
||||
{
|
||||
E T2r, T2u, T30, T31;
|
||||
T2r = T7 - Ti;
|
||||
T2u = T2s - T2t;
|
||||
T2v = T2r - T2u;
|
||||
T2H = T2r + T2u;
|
||||
T30 = Tu - TF;
|
||||
T31 = T2U - T2R;
|
||||
T32 = T30 + T31;
|
||||
T34 = T31 - T30;
|
||||
}
|
||||
{
|
||||
E T2A, T2F, T2I, T2J;
|
||||
T2A = T2w + T2z;
|
||||
T2F = T2B - T2E;
|
||||
T2G = KP707106781 * (T2A + T2F);
|
||||
T2Z = KP707106781 * (T2F - T2A);
|
||||
T2I = T2w - T2z;
|
||||
T2J = T2B + T2E;
|
||||
T2K = KP707106781 * (T2I + T2J);
|
||||
T33 = KP707106781 * (T2J - T2I);
|
||||
}
|
||||
ci[WS(rs, 5)] = T2v - T2G;
|
||||
cr[WS(rs, 10)] = T33 - T34;
|
||||
ci[WS(rs, 13)] = T33 + T34;
|
||||
cr[WS(rs, 2)] = T2v + T2G;
|
||||
cr[WS(rs, 6)] = T2H - T2K;
|
||||
cr[WS(rs, 14)] = T2Z - T32;
|
||||
ci[WS(rs, 9)] = T2Z + T32;
|
||||
ci[WS(rs, 1)] = T2H + T2K;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction list for the size-16 codelet; it is referenced by the
   `desc` initializer that follows.  The kernel in this branch reads W[0] ..
   W[29], i.e. one pair of values for each of the inputs 1 .. 15.
   NOTE(review): TW_FULL and TW_NEXT are defined outside this chunk; the
   TW_NEXT entry presumably terminates the list -- confirm. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non -fma build of hf_16: size 16, the name string
   "hf_16", the twiddle instruction list, the address of GENUS (defined outside
   this chunk) and four counts.  The first three counts equal the generator's
   summary for this variant: 136 additions, 46 multiplications, 38 fused
   multiply/adds.  NOTE(review): meaning of the trailing 0 not visible here. */
static const hc2hc_desc desc = { 16, "hf_16", twinstr, &GENUS, { 136, 46, 38, 0 } };
|
||||
|
||||
void X(codelet_hf_16) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_16, &desc);
|
||||
}
|
||||
#endif
|
||||
117
fftw-3.3.10/rdft/scalar/r2cf/hf_2.c
Normal file
117
fftw-3.3.10/rdft/scalar/r2cf/hf_2.c
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hf_2 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 11 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_2 (variant generated with -fma): radix-2 decimation-in-time twiddle
 * step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0 and WS(rs, 1) of each are read and
 *            then overwritten.  After every iteration cr advances by ms
 *            and ci retreats by ms.
 *   W      - twiddle table; iteration m reads W[2 * (m - 1)] and
 *            W[2 * (m - 1) + 1] relative to the pointer passed in.
 *   rs     - stride handed to WS() to locate element 1.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b; confirm in the
 * project headers.
 */
static void hf_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 2;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 1 and its twiddle (w1r, w1i). */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E p1r = w1r * x1r;
               E p1i = w1r * x1i;
               E w1i = W[1];
               /* z1r = w1r*x1r + w1i*x1i,  z1i = w1r*x1i - w1i*x1r */
               E z1r = FMA(w1i, x1i, p1r);
               E z1i = FNMS(w1i, x1r, p1i);

               /* All inputs are in locals; now overwrite the data. */
               ci[0] = x0r - z1r;
               cr[0] = x0r + z1r;
               cr[WS(rs, 1)] = z1i - x0i;
               ci[WS(rs, 1)] = z1i + x0i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 2, "hf_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
|
||||
|
||||
void X(codelet_hf_2) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_2, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 2 -dit -name hf_2 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 9 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_2 (variant generated without -fma): radix-2 decimation-in-time
 * twiddle step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0 and WS(rs, 1) of each are read and
 *            then overwritten.  After every iteration cr advances by ms
 *            and ci retreats by ms.
 *   W      - twiddle table; iteration m reads W[2 * (m - 1)] and
 *            W[2 * (m - 1) + 1] relative to the pointer passed in.
 *   rs     - stride handed to WS() to locate element 1.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b; confirm in the
 * project headers.
 */
static void hf_2(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 2;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 1 and its twiddle (w1r, w1i). */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E w1i = W[1];
               /* z1r = w1r*x1r + w1i*x1i,  z1i = w1r*x1i - w1i*x1r */
               E z1r = FMA(w1r, x1r, w1i * x1i);
               E z1i = FNMS(w1i, x1r, w1r * x1i);

               /* All inputs are in locals; now overwrite the data. */
               ci[0] = x0r - z1r;
               cr[0] = x0r + z1r;
               cr[WS(rs, 1)] = z1i - x0i;
               ci[WS(rs, 1)] = z1i + x0i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 2, "hf_2", twinstr, &GENUS, { 4, 2, 2, 0 } };
|
||||
|
||||
void X(codelet_hf_2) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_2, &desc);
|
||||
}
|
||||
#endif
|
||||
1050
fftw-3.3.10/rdft/scalar/r2cf/hf_20.c
Normal file
1050
fftw-3.3.10/rdft/scalar/r2cf/hf_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1572
fftw-3.3.10/rdft/scalar/r2cf/hf_25.c
Normal file
1572
fftw-3.3.10/rdft/scalar/r2cf/hf_25.c
Normal file
File diff suppressed because it is too large
Load Diff
166
fftw-3.3.10/rdft/scalar/r2cf/hf_3.c
Normal file
166
fftw-3.3.10/rdft/scalar/r2cf/hf_3.c
Normal file
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -dit -name hf_3 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 14 FP multiplications,
|
||||
* (or, 6 additions, 4 multiplications, 10 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_3 (variant generated with -fma): radix-3 decimation-in-time twiddle
 * step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0, WS(rs, 1) and WS(rs, 2) of each
 *            are read and then overwritten.  After every iteration cr
 *            advances by ms and ci retreats by ms.
 *   W      - twiddle table; each iteration consumes four values W[0..3],
 *            starting at offset 4 * (mb - 1) from the pointer passed in.
 *   rs     - stride handed to WS() to locate elements 1 and 2.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c, FMS(a, b, c) = a * b - c and
 * FNMS(a, b, c) = c - a * b; confirm in the project headers.
 */
static void hf_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 4;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 1 combined with twiddle W[0], W[1]. */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E p1r = w1r * x1r;
               E p1i = w1r * x1i;
               E w1i = W[1];
               E z1r = FMA(w1i, x1i, p1r);
               E z1i = FNMS(w1i, x1r, p1i);
               /* Input 2 combined with twiddle W[2], W[3]. */
               E x2r = cr[WS(rs, 2)];
               E x2i = ci[WS(rs, 2)];
               E w2r = W[2];
               E p2r = w2r * x2r;
               E p2i = w2r * x2i;
               E w2i = W[3];
               E z2r = FMA(w2i, x2i, p2r);
               E z2i = FNMS(w2i, x2r, p2i);
               /* Sums and differences feeding the size-3 butterfly. */
               E d12i = z1i - z2i;
               E s12r = z1r + z2r;
               E half_r = FNMS(KP500000000, s12r, x0r);
               E d21r = z2r - z1r;
               E s12i = z1i + z2i;
               E half_i = FNMS(KP500000000, s12i, x0i);

               /* All inputs are in locals; now overwrite the data. */
               cr[0] = x0r + s12r;
               ci[0] = FNMS(KP866025403, d12i, half_r);
               cr[WS(rs, 1)] = FMA(KP866025403, d12i, half_r);
               cr[WS(rs, 2)] = FMS(KP866025403, d21r, half_i);
               ci[WS(rs, 2)] = s12i + x0i;
               ci[WS(rs, 1)] = FMA(KP866025403, d21r, half_i);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 3, "hf_3", twinstr, &GENUS, { 6, 4, 10, 0 } };
|
||||
|
||||
void X(codelet_hf_3) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_3, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 3 -dit -name hf_3 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 12 FP multiplications,
|
||||
* (or, 10 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_3 (variant generated without -fma): radix-3 decimation-in-time
 * twiddle step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0, WS(rs, 1) and WS(rs, 2) of each
 *            are read and then overwritten.  After every iteration cr
 *            advances by ms and ci retreats by ms.
 *   W      - twiddle table; each iteration consumes four values W[0..3],
 *            starting at offset 4 * (mb - 1) from the pointer passed in.
 *   rs     - stride handed to WS() to locate elements 1 and 2.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b; confirm in the
 * project headers.
 */
static void hf_3(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 4;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 1 combined with twiddle W[0], W[1]. */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E w1i = W[1];
               E z1r = FMA(w1r, x1r, w1i * x1i);
               E z1i = FNMS(w1i, x1r, w1r * x1i);
               /* Input 2 combined with twiddle W[2], W[3]. */
               E x2r = cr[WS(rs, 2)];
               E x2i = ci[WS(rs, 2)];
               E w2r = W[2];
               E w2i = W[3];
               E z2r = FMA(w2r, x2r, w2i * x2i);
               E z2i = FNMS(w2i, x2r, w2r * x2i);
               /* Sums and scaled differences feeding the size-3 butterfly. */
               E s12r = z1r + z2r;
               E s12i = z1i + z2i;
               E half_r = FNMS(KP500000000, s12r, x0r);
               E rot_i = KP866025403 * (z1i - z2i);
               E rot_r = KP866025403 * (z2r - z1r);
               E half_i = FNMS(KP500000000, s12i, x0i);

               /* All inputs are in locals; now overwrite the data. */
               cr[0] = x0r + s12r;
               ci[0] = half_r - rot_i;
               cr[WS(rs, 1)] = half_r + rot_i;
               ci[WS(rs, 2)] = s12i + x0i;
               cr[WS(rs, 2)] = rot_r - half_i;
               ci[WS(rs, 1)] = rot_r + half_i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 3, "hf_3", twinstr, &GENUS, { 10, 6, 6, 0 } };
|
||||
|
||||
void X(codelet_hf_3) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_3, &desc);
|
||||
}
|
||||
#endif
|
||||
1809
fftw-3.3.10/rdft/scalar/r2cf/hf_32.c
Normal file
1809
fftw-3.3.10/rdft/scalar/r2cf/hf_32.c
Normal file
File diff suppressed because it is too large
Load Diff
196
fftw-3.3.10/rdft/scalar/r2cf/hf_4.c
Normal file
196
fftw-3.3.10/rdft/scalar/r2cf/hf_4.c
Normal file
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hf_4 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_4 (variant generated with -fma): radix-4 decimation-in-time twiddle
 * step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0 and WS(rs, 1..3) of each are read
 *            and then overwritten.  After every iteration cr advances by
 *            ms and ci retreats by ms.
 *   W      - twiddle table; each iteration consumes six values W[0..5]
 *            (input k uses W[2k-2], W[2k-1]), starting at offset
 *            6 * (mb - 1) from the pointer passed in.
 *   rs     - stride handed to WS() to locate elements 1..3.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b; confirm in the
 * project headers.
 */
static void hf_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 6;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 2 combined with twiddle W[2], W[3]. */
               E x2r = cr[WS(rs, 2)];
               E x2i = ci[WS(rs, 2)];
               E w2r = W[2];
               E p2r = w2r * x2r;
               E p2i = w2r * x2i;
               E w2i = W[3];
               E z2r = FMA(w2i, x2i, p2r);
               E z2i = FNMS(w2i, x2r, p2i);
               /* Input 1 combined with twiddle W[0], W[1]. */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E p1r = w1r * x1r;
               E p1i = w1r * x1i;
               E w1i = W[1];
               E z1r = FMA(w1i, x1i, p1r);
               E z1i = FNMS(w1i, x1r, p1i);
               /* Input 3 combined with twiddle W[4], W[5]. */
               E x3r = cr[WS(rs, 3)];
               E x3i = ci[WS(rs, 3)];
               E w3r = W[4];
               E p3r = w3r * x3r;
               E p3i = w3r * x3i;
               E w3i = W[5];
               E z3r = FMA(w3i, x3i, p3r);
               E z3i = FNMS(w3i, x3r, p3i);
               /* Pairwise sums/differences: (0,2) and (1,3). */
               E s02r = x0r + z2r;
               E s13r = z1r + z3r;
               E d02r = x0r - z2r;
               E d13i = z1i - z3i;
               E s13i = z1i + z3i;
               E s20i = z2i + x0i;
               E d31r = z3r - z1r;
               E d02i = x0i - z2i;

               /* All inputs are in locals; now overwrite the data. */
               ci[WS(rs, 1)] = s02r - s13r;
               cr[0] = s02r + s13r;
               ci[0] = d02r - d13i;
               cr[WS(rs, 1)] = d02r + d13i;
               cr[WS(rs, 2)] = s13i - s20i;
               ci[WS(rs, 3)] = s13i + s20i;
               cr[WS(rs, 3)] = d31r - d02i;
               ci[WS(rs, 2)] = d31r + d02i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 4, "hf_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
|
||||
|
||||
void X(codelet_hf_4) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_4, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 4 -dit -name hf_4 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_4 (variant generated without -fma): radix-4 decimation-in-time
 * twiddle step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0 and WS(rs, 1..3) of each are read
 *            and then overwritten.  After every iteration cr advances by
 *            ms and ci retreats by ms.
 *   W      - twiddle table; each iteration consumes six values W[0..5]
 *            (input k uses W[2k-2], W[2k-1]), starting at offset
 *            6 * (mb - 1) from the pointer passed in.
 *   rs     - stride handed to WS() to locate elements 1..3.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b; confirm in the
 * project headers.
 */
static void hf_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 6;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 2 combined with twiddle W[2], W[3]. */
               E x2r = cr[WS(rs, 2)];
               E x2i = ci[WS(rs, 2)];
               E w2r = W[2];
               E w2i = W[3];
               E z2r = FMA(w2r, x2r, w2i * x2i);
               E z2i = FNMS(w2i, x2r, w2r * x2i);
               /* Input 1 combined with twiddle W[0], W[1]. */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E w1i = W[1];
               E z1r = FMA(w1r, x1r, w1i * x1i);
               E z1i = FNMS(w1i, x1r, w1r * x1i);
               /* Input 3 combined with twiddle W[4], W[5]. */
               E x3r = cr[WS(rs, 3)];
               E x3i = ci[WS(rs, 3)];
               E w3r = W[4];
               E w3i = W[5];
               E z3r = FMA(w3r, x3r, w3i * x3i);
               E z3i = FNMS(w3i, x3r, w3r * x3i);
               /* Pairwise sums/differences: (0,2) and (1,3). */
               E s02r = x0r + z2r;
               E s13r = z1r + z3r;
               E d02r = x0r - z2r;
               E d13i = z1i - z3i;
               E s13i = z1i + z3i;
               E s20i = z2i + x0i;
               E d31r = z3r - z1r;
               E d02i = x0i - z2i;

               /* All inputs are in locals; now overwrite the data. */
               ci[WS(rs, 1)] = s02r - s13r;
               cr[0] = s02r + s13r;
               ci[0] = d02r - d13i;
               cr[WS(rs, 1)] = d02r + d13i;
               cr[WS(rs, 2)] = s13i - s20i;
               ci[WS(rs, 3)] = s13i + s20i;
               cr[WS(rs, 3)] = d31r - d02i;
               ci[WS(rs, 2)] = d31r + d02i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 4, "hf_4", twinstr, &GENUS, { 16, 6, 6, 0 } };
|
||||
|
||||
void X(codelet_hf_4) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_4, &desc);
|
||||
}
|
||||
#endif
|
||||
253
fftw-3.3.10/rdft/scalar/r2cf/hf_5.c
Normal file
253
fftw-3.3.10/rdft/scalar/r2cf/hf_5.c
Normal file
@@ -0,0 +1,253 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -dit -name hf_5 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 40 FP additions, 34 FP multiplications,
|
||||
* (or, 14 additions, 8 multiplications, 26 fused multiply/add),
|
||||
* 31 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_5 (variant generated with -fma): radix-5 decimation-in-time twiddle
 * step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0 and WS(rs, 1..4) of each are read
 *            and then overwritten.  After every iteration cr advances by
 *            ms and ci retreats by ms.
 *   W      - twiddle table; each iteration consumes eight values W[0..7]
 *            (input k uses W[2k-2], W[2k-1]), starting at offset
 *            8 * (mb - 1) from the pointer passed in.
 *   rs     - stride handed to WS() to locate elements 1..4.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c, FMS(a, b, c) = a * b - c and
 * FNMS(a, b, c) = c - a * b; confirm in the project headers.
 */
static void hf_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 8;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 8, MAKE_VOLATILE_STRIDE(10, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Inputs 1 and 4 with twiddles W[0..1] and W[6..7]. */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E p1r = w1r * x1r;
               E p1i = w1r * x1i;
               E x4r = cr[WS(rs, 4)];
               E x4i = ci[WS(rs, 4)];
               E w4r = W[6];
               E p4r = w4r * x4r;
               E p4i = w4r * x4i;
               E w1i = W[1];
               E z1r = FMA(w1i, x1i, p1r);
               E z1i = FNMS(w1i, x1r, p1i);
               E w4i = W[7];
               E z4r = FMA(w4i, x4i, p4r);
               E z4i = FNMS(w4i, x4r, p4i);
               E s14r = z1r + z4r;
               E s14i = z1i + z4i;
               /* Inputs 2 and 3 with twiddles W[2..3] and W[4..5]. */
               E x2r = cr[WS(rs, 2)];
               E x2i = ci[WS(rs, 2)];
               E w2r = W[2];
               E p2r = w2r * x2r;
               E p2i = w2r * x2i;
               E x3r = cr[WS(rs, 3)];
               E x3i = ci[WS(rs, 3)];
               E w3r = W[4];
               E p3r = w3r * x3r;
               E p3i = w3r * x3i;
               E w2i = W[3];
               E z2r = FMA(w2i, x2i, p2r);
               E z2i = FNMS(w2i, x2r, p2i);
               E w3i = W[5];
               E z3r = FMA(w3i, x3i, p3r);
               E z3i = FNMS(w3i, x3r, p3i);
               E s23r = z2r + z3r;
               E s23i = z2i + z3i;
               /* Terms built on the real sums and imaginary differences. */
               E gap_r = s14r - s23r;
               E tot_r = s14r + s23r;
               E base_r = FNMS(KP250000000, tot_r, x0r);
               E d14i = z1i - z4i;
               E d23i = z2i - z3i;
               E rotA_r = FMA(KP618033988, d23i, d14i);
               E rotB_r = FNMS(KP618033988, d14i, d23i);
               E midA_r = FMA(KP559016994, gap_r, base_r);
               E midB_r = FNMS(KP559016994, gap_r, base_r);
               /* Terms built on the imaginary sums and real differences. */
               E gap_i = s14i - s23i;
               E tot_i = s14i + s23i;
               E base_i = FNMS(KP250000000, tot_i, x0i);
               E d23r = z2r - z3r;
               E d41r = z4r - z1r;
               E rotA_i = FMA(KP618033988, d41r, d23r);
               E rotB_i = FNMS(KP618033988, d23r, d41r);
               E midA_i = FMA(KP559016994, gap_i, base_i);
               E midB_i = FNMS(KP559016994, gap_i, base_i);

               /* All inputs are in locals; now overwrite the data. */
               cr[0] = x0r + tot_r;
               ci[0] = FNMS(KP951056516, rotA_r, midA_r);
               cr[WS(rs, 1)] = FMA(KP951056516, rotA_r, midA_r);
               cr[WS(rs, 2)] = FNMS(KP951056516, rotB_r, midB_r);
               ci[WS(rs, 1)] = FMA(KP951056516, rotB_r, midB_r);
               ci[WS(rs, 4)] = tot_i + x0i;
               cr[WS(rs, 4)] = FMS(KP951056516, rotB_i, midA_i);
               ci[WS(rs, 3)] = FMA(KP951056516, rotB_i, midA_i);
               cr[WS(rs, 3)] = FMS(KP951056516, rotA_i, midB_i);
               ci[WS(rs, 2)] = FMA(KP951056516, rotA_i, midB_i);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 5, "hf_5", twinstr, &GENUS, { 14, 8, 26, 0 } };
|
||||
|
||||
void X(codelet_hf_5) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_5, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 5 -dit -name hf_5 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 40 FP additions, 28 FP multiplications,
|
||||
* (or, 26 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 29 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_5 (variant generated without -fma): radix-5 decimation-in-time
 * twiddle step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0 and WS(rs, 1..4) of each are read
 *            and then overwritten.  After every iteration cr advances by
 *            ms and ci retreats by ms.
 *   W      - twiddle table; each iteration consumes eight values W[0..7]
 *            (input k uses W[2k-2], W[2k-1]), starting at offset
 *            8 * (mb - 1) from the pointer passed in.
 *   rs     - stride handed to WS() to locate elements 1..4.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b; confirm in the
 * project headers.
 */
static void hf_5(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 8;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 8, MAKE_VOLATILE_STRIDE(10, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 1 combined with twiddle W[0], W[1]. */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E w1i = W[1];
               E z1r = FMA(w1r, x1r, w1i * x1i);
               E z1i = FNMS(w1i, x1r, w1r * x1i);
               /* Input 3 combined with twiddle W[4], W[5]. */
               E x3r = cr[WS(rs, 3)];
               E x3i = ci[WS(rs, 3)];
               E w3r = W[4];
               E w3i = W[5];
               E z3r = FMA(w3r, x3r, w3i * x3i);
               E z3i = FNMS(w3i, x3r, w3r * x3i);
               /* Input 4 combined with twiddle W[6], W[7]. */
               E x4r = cr[WS(rs, 4)];
               E x4i = ci[WS(rs, 4)];
               E w4r = W[6];
               E w4i = W[7];
               E z4r = FMA(w4r, x4r, w4i * x4i);
               E z4i = FNMS(w4i, x4r, w4r * x4i);
               /* Input 2 combined with twiddle W[2], W[3]. */
               E x2r = cr[WS(rs, 2)];
               E x2i = ci[WS(rs, 2)];
               E w2r = W[2];
               E w2i = W[3];
               E z2r = FMA(w2r, x2r, w2i * x2i);
               E z2i = FNMS(w2i, x2r, w2r * x2i);
               /* Pairwise differences and sums of (1,4) and (2,3). */
               E d14i = z1i - z4i;
               E d23i = z2i - z3i;
               E d23r = z2r - z3r;
               E d41r = z4r - z1r;
               E s14i = z1i + z4i;
               E s23i = z2i + z3i;
               E tot_i = s14i + s23i;
               E s14r = z1r + z4r;
               E s23r = z2r + z3r;
               E tot_r = s14r + s23r;
               /* Terms built on the real sums and imaginary differences. */
               E rotA_r = FMA(KP951056516, d14i, KP587785252 * d23i);
               E rotB_r = FNMS(KP587785252, d14i, KP951056516 * d23i);
               E gap_r = KP559016994 * (s14r - s23r);
               E base_r = FNMS(KP250000000, tot_r, x0r);
               E midA_r = gap_r + base_r;
               E midB_r = base_r - gap_r;
               /* Terms built on the imaginary sums and real differences. */
               E rotA_i = FMA(KP587785252, d41r, KP951056516 * d23r);
               E rotB_i = FNMS(KP587785252, d23r, KP951056516 * d41r);
               E base_i = FNMS(KP250000000, tot_i, x0i);
               E gap_i = KP559016994 * (s14i - s23i);
               E midB_i = base_i - gap_i;
               E midA_i = gap_i + base_i;

               /* All inputs are in locals; now overwrite the data. */
               cr[0] = x0r + tot_r;
               ci[0] = midA_r - rotA_r;
               ci[WS(rs, 1)] = midB_r + rotB_r;
               cr[WS(rs, 1)] = midA_r + rotA_r;
               cr[WS(rs, 2)] = midB_r - rotB_r;
               ci[WS(rs, 4)] = tot_i + x0i;
               cr[WS(rs, 3)] = rotA_i - midB_i;
               ci[WS(rs, 3)] = rotB_i + midA_i;
               ci[WS(rs, 2)] = rotA_i + midB_i;
               cr[WS(rs, 4)] = rotB_i - midA_i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 5, "hf_5", twinstr, &GENUS, { 26, 14, 14, 0 } };
|
||||
|
||||
void X(codelet_hf_5) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_5, &desc);
|
||||
}
|
||||
#endif
|
||||
295
fftw-3.3.10/rdft/scalar/r2cf/hf_6.c
Normal file
295
fftw-3.3.10/rdft/scalar/r2cf/hf_6.c
Normal file
@@ -0,0 +1,295 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hf_6 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 32 FP multiplications,
|
||||
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
|
||||
* 31 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_6 (variant generated with -fma): radix-6 decimation-in-time twiddle
 * step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0 and WS(rs, 1..5) of each are read
 *            and then overwritten.  After every iteration cr advances by
 *            ms and ci retreats by ms.
 *   W      - twiddle table; each iteration consumes ten values W[0..9]
 *            (input k uses W[2k-2], W[2k-1]), starting at offset
 *            10 * (mb - 1) from the pointer passed in.
 *   rs     - stride handed to WS() to locate elements 1..5.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * Local naming: sXY* / dXY* are the sum / difference of twiddled inputs
 * X and Y; the s- and d-prefixed groups further down are built from those
 * sums and differences respectively.
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c, FMS(a, b, c) = a * b - c and
 * FNMS(a, b, c) = c - a * b; confirm in the project headers.
 */
static void hf_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 10;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 10, MAKE_VOLATILE_STRIDE(12, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 3 combined with twiddle W[4], W[5]. */
               E x3r = cr[WS(rs, 3)];
               E x3i = ci[WS(rs, 3)];
               E w3r = W[4];
               E p3r = w3r * x3r;
               E p3i = w3r * x3i;
               E w3i = W[5];
               E z3r = FMA(w3i, x3i, p3r);
               E z3i = FNMS(w3i, x3r, p3i);
               /* Inputs 2 and 5 with twiddles W[2..3] and W[8..9]. */
               E x2r = cr[WS(rs, 2)];
               E x2i = ci[WS(rs, 2)];
               E w2r = W[2];
               E p2r = w2r * x2r;
               E p2i = w2r * x2i;
               E x5r = cr[WS(rs, 5)];
               E x5i = ci[WS(rs, 5)];
               E w5r = W[8];
               E p5r = w5r * x5r;
               E p5i = w5r * x5i;
               E w2i = W[3];
               E z2r = FMA(w2i, x2i, p2r);
               E z2i = FNMS(w2i, x2r, p2i);
               E w5i = W[9];
               E z5r = FMA(w5i, x5i, p5r);
               E z5i = FNMS(w5i, x5r, p5i);
               E d25r = z2r - z5r;
               E s25i = z2i + z5i;
               E s25r = z2r + z5r;
               E d52i = z5i - z2i;
               /* Inputs 4 and 1 with twiddles W[6..7] and W[0..1]. */
               E x4r = cr[WS(rs, 4)];
               E x4i = ci[WS(rs, 4)];
               E w4r = W[6];
               E p4r = w4r * x4r;
               E p4i = w4r * x4i;
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E p1r = w1r * x1r;
               E p1i = w1r * x1i;
               E w4i = W[7];
               E z4r = FMA(w4i, x4i, p4r);
               E z4i = FNMS(w4i, x4r, p4i);
               E w1i = W[1];
               E z1r = FMA(w1i, x1i, p1r);
               E z1i = FNMS(w1i, x1r, p1i);
               E d41r = z4r - z1r;
               E s41i = z4i + z1i;
               E s41r = z4r + z1r;
               E d14i = z1i - z4i;
               /* Group fed by the real differences. */
               E drot_i = d14i - d52i;
               E d03r = x0r - z3r;
               E dsum_r = d25r + d41r;
               E dhalf_r = FNMS(KP500000000, dsum_r, d03r);
               /* Group fed by the real sums. */
               E srot_i = s25i - s41i;
               E s03r = x0r + z3r;
               E ssum_r = s25r + s41r;
               E shalf_r = FNMS(KP500000000, ssum_r, s03r);
               /* Group fed by the imaginary differences. */
               E drot_r = d41r - d25r;
               E dsum_i = d52i + d14i;
               E d03i = x0i - z3i;
               E dhalf_i = FMA(KP500000000, dsum_i, d03i);
               /* Group fed by the imaginary sums. */
               E srot_r = s25r - s41r;
               E s03i = z3i + x0i;
               E ssum_i = s25i + s41i;
               E shalf_i = FNMS(KP500000000, ssum_i, s03i);

               /* All inputs are in locals; now overwrite the data. */
               ci[WS(rs, 2)] = d03r + dsum_r;
               cr[WS(rs, 1)] = FMA(KP866025403, drot_i, dhalf_r);
               ci[0] = FNMS(KP866025403, drot_i, dhalf_r);
               cr[0] = s03r + ssum_r;
               ci[WS(rs, 1)] = FMA(KP866025403, srot_i, shalf_r);
               cr[WS(rs, 2)] = FNMS(KP866025403, srot_i, shalf_r);
               cr[WS(rs, 3)] = dsum_i - d03i;
               ci[WS(rs, 4)] = FMA(KP866025403, drot_r, dhalf_i);
               cr[WS(rs, 5)] = FMS(KP866025403, drot_r, dhalf_i);
               cr[WS(rs, 4)] = FMS(KP866025403, srot_r, shalf_i);
               ci[WS(rs, 5)] = ssum_i + s03i;
               ci[WS(rs, 3)] = FMA(KP866025403, srot_r, shalf_i);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const hc2hc_desc desc = { 6, "hf_6", twinstr, &GENUS, { 24, 10, 22, 0 } };
|
||||
|
||||
void X(codelet_hf_6) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_6, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 6 -dit -name hf_6 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 28 FP multiplications,
|
||||
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 23 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_6 (variant generated without -fma): radix-6 decimation-in-time
 * twiddle step of the real-input FFT, computed in place on cr/ci.
 *
 *   cr, ci - data pointers; elements 0 and WS(rs, 1..5) of each are read
 *            and then overwritten.  After every iteration cr advances by
 *            ms and ci retreats by ms.
 *   W      - twiddle table; each iteration consumes ten values W[0..9]
 *            (input k uses W[2k-2], W[2k-1]), starting at offset
 *            10 * (mb - 1) from the pointer passed in.
 *   rs     - stride handed to WS() to locate elements 1..5.
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step applied to cr (+ms) and ci (-ms).
 *
 * Local naming: sXY* / dXY* are the sum / difference of twiddled inputs
 * X and Y; the s- and d-prefixed groups further down are built from those
 * sums and differences respectively.
 *
 * NOTE(review): comments assume the usual FFTW macro meanings
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b; confirm in the
 * project headers.
 */
static void hf_6(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT m;

          /* Skip the twiddles belonging to iterations before mb. */
          W += (mb - 1) * 10;
          for (m = mb; m < me; ++m, cr += ms, ci -= ms, W += 10, MAKE_VOLATILE_STRIDE(12, rs)) {
               /* Untwiddled input 0. */
               E x0r = cr[0];
               E x0i = ci[0];
               /* Input 3 combined with twiddle W[4], W[5]. */
               E x3r = cr[WS(rs, 3)];
               E x3i = ci[WS(rs, 3)];
               E w3r = W[4];
               E w3i = W[5];
               E z3r = FMA(w3r, x3r, w3i * x3i);
               E z3i = FNMS(w3i, x3r, w3r * x3i);
               E d03r = x0r - z3r;
               E s03i = z3i + x0i;
               E s03r = x0r + z3r;
               E d03i = x0i - z3i;
               /* Input 4 combined with twiddle W[6], W[7]. */
               E x4r = cr[WS(rs, 4)];
               E x4i = ci[WS(rs, 4)];
               E w4r = W[6];
               E w4i = W[7];
               E z4r = FMA(w4r, x4r, w4i * x4i);
               E z4i = FNMS(w4i, x4r, w4r * x4i);
               /* Input 1 combined with twiddle W[0], W[1]. */
               E x1r = cr[WS(rs, 1)];
               E x1i = ci[WS(rs, 1)];
               E w1r = W[0];
               E w1i = W[1];
               E z1r = FMA(w1r, x1r, w1i * x1i);
               E z1i = FNMS(w1i, x1r, w1r * x1i);
               E d41r = z4r - z1r;
               E s41i = z4i + z1i;
               E s41r = z4r + z1r;
               E d14i = z1i - z4i;
               /* Input 2 combined with twiddle W[2], W[3]. */
               E x2r = cr[WS(rs, 2)];
               E x2i = ci[WS(rs, 2)];
               E w2r = W[2];
               E w2i = W[3];
               E z2r = FMA(w2r, x2r, w2i * x2i);
               E z2i = FNMS(w2i, x2r, w2r * x2i);
               /* Input 5 combined with twiddle W[8], W[9]. */
               E x5r = cr[WS(rs, 5)];
               E x5i = ci[WS(rs, 5)];
               E w5r = W[8];
               E w5i = W[9];
               E z5r = FMA(w5r, x5r, w5i * x5i);
               E z5i = FNMS(w5i, x5r, w5r * x5i);
               E d25r = z2r - z5r;
               E s25i = z2i + z5i;
               E s25r = z2r + z5r;
               E d25i = z2i - z5i;
               /* Group fed by the real differences. */
               E drot_i = KP866025403 * (d25i + d14i);
               E dsum_r = d25r + d41r;
               E dhalf_r = FNMS(KP500000000, dsum_r, d03r);
               /* Group fed by the real sums. */
               E srot_i = KP866025403 * (s25i - s41i);
               E ssum_r = s25r + s41r;
               E shalf_r = FNMS(KP500000000, ssum_r, s03r);
               /* Group fed by the imaginary differences. */
               E drot_r = KP866025403 * (d41r - d25r);
               E dsum_i = d14i - d25i;
               E dhalf_i = FMA(KP500000000, dsum_i, d03i);
               /* Group fed by the imaginary sums. */
               E srot_r = KP866025403 * (s25r - s41r);
               E ssum_i = s25i + s41i;
               E shalf_i = FNMS(KP500000000, ssum_i, s03i);

               /* All inputs are in locals; now overwrite the data. */
               ci[WS(rs, 2)] = d03r + dsum_r;
               cr[WS(rs, 1)] = dhalf_r + drot_i;
               ci[0] = dhalf_r - drot_i;
               cr[0] = s03r + ssum_r;
               ci[WS(rs, 1)] = shalf_r + srot_i;
               cr[WS(rs, 2)] = shalf_r - srot_i;
               cr[WS(rs, 3)] = dsum_i - d03i;
               ci[WS(rs, 4)] = drot_r + dhalf_i;
               cr[WS(rs, 5)] = drot_r - dhalf_i;
               cr[WS(rs, 4)] = srot_r - shalf_i;
               ci[WS(rs, 5)] = ssum_i + s03i;
               ci[WS(rs, 3)] = srot_r + shalf_i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner together with the hf_6 kernel.
 * NOTE(review): field meanings are inferred from the sibling codelets
 * (size, name, twiddle list, genus, operation counts); hc2hc_desc is
 * declared elsewhere -- confirm.
 */
static const hc2hc_desc desc = {
     6,
     "hf_6",
     twinstr,
     &GENUS,
     { 32, 14, 14, 0 }
};
|
||||
|
||||
/*
 * Registration hook for the hf_6 codelet: passes the kernel function
 * hf_6 and its descriptor to X(khc2hc_register) for planner p.
 */
void X(codelet_hf_6)(planner *p)
{
     X(khc2hc_register)(p, hf_6, &desc);
}
|
||||
#endif
|
||||
4105
fftw-3.3.10/rdft/scalar/r2cf/hf_64.c
Normal file
4105
fftw-3.3.10/rdft/scalar/r2cf/hf_64.c
Normal file
File diff suppressed because it is too large
Load Diff
354
fftw-3.3.10/rdft/scalar/r2cf/hf_7.c
Normal file
354
fftw-3.3.10/rdft/scalar/r2cf/hf_7.c
Normal file
@@ -0,0 +1,354 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 66 FP multiplications,
|
||||
* (or, 18 additions, 12 multiplications, 54 fused multiply/add),
|
||||
* 37 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_7, FMA-oriented variant: size-7 "twiddle" codelet (generated with
 * -n 7 -dit) that updates cr/ci in place.
 *
 *   cr, ci - data arrays; element k is addressed as cr[WS(rs, k)] and
 *            ci[WS(rs, k)], element 0 as cr[0] / ci[0].
 *   W      - twiddle table; W[0]..W[11] are read on every iteration.
 *            Input k (k = 1..6) is combined with the pair W[2k-2], W[2k-1];
 *            input 0 is used unscaled.
 *   rs     - stride argument forwarded to WS().
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step: cr moves by +ms, ci by -ms.
 *
 * NOTE(review): the inline notes assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c; these macros are
 * defined outside this file -- confirm.
 */
static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
|
||||
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
|
||||
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
|
||||
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced by (mb - 1) * 12, then by 12 values per pass. */
for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
|
||||
E T1, T19, Te, T1i, TR, T1a, Tr, T1h, TM, T1b, TE, T1g, TW, T1c;
|
||||
T1 = cr[0];
|
||||
T19 = ci[0];
|
||||
/* Inputs 1 and 6 with twiddle pairs (W[0], W[1]) and (W[10], W[11]);
   their sums and differences are kept in Te, T1i, TR, T1a. */
{
|
||||
E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8;
|
||||
T3 = cr[WS(rs, 1)];
|
||||
T6 = ci[WS(rs, 1)];
|
||||
T2 = W[0];
|
||||
T4 = T2 * T3;
|
||||
TN = T2 * T6;
|
||||
T9 = cr[WS(rs, 6)];
|
||||
Tc = ci[WS(rs, 6)];
|
||||
T8 = W[10];
|
||||
Ta = T8 * T9;
|
||||
TP = T8 * Tc;
|
||||
{
|
||||
E T7, TO, Td, TQ, T5, Tb;
|
||||
T5 = W[1];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TO = FNMS(T5, T3, TN);
|
||||
Tb = W[11];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
TQ = FNMS(Tb, T9, TP);
|
||||
Te = T7 + Td;
|
||||
T1i = Td - T7;
|
||||
TR = TO - TQ;
|
||||
T1a = TO + TQ;
|
||||
}
|
||||
}
|
||||
/* Inputs 2 and 5 with twiddle pairs (W[2], W[3]) and (W[8], W[9]). */
{
|
||||
E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl;
|
||||
Tg = cr[WS(rs, 2)];
|
||||
Tj = ci[WS(rs, 2)];
|
||||
Tf = W[2];
|
||||
Th = Tf * Tg;
|
||||
TI = Tf * Tj;
|
||||
Tm = cr[WS(rs, 5)];
|
||||
Tp = ci[WS(rs, 5)];
|
||||
Tl = W[8];
|
||||
Tn = Tl * Tm;
|
||||
TK = Tl * Tp;
|
||||
{
|
||||
E Tk, TJ, Tq, TL, Ti, To;
|
||||
Ti = W[3];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TJ = FNMS(Ti, Tg, TI);
|
||||
To = W[9];
|
||||
Tq = FMA(To, Tp, Tn);
|
||||
TL = FNMS(To, Tm, TK);
|
||||
Tr = Tk + Tq;
|
||||
T1h = Tq - Tk;
|
||||
TM = TJ - TL;
|
||||
T1b = TJ + TL;
|
||||
}
|
||||
}
|
||||
/* Inputs 3 and 4 with twiddle pairs (W[4], W[5]) and (W[6], W[7]). */
{
|
||||
E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty;
|
||||
Tt = cr[WS(rs, 3)];
|
||||
Tw = ci[WS(rs, 3)];
|
||||
Ts = W[4];
|
||||
Tu = Ts * Tt;
|
||||
TS = Ts * Tw;
|
||||
Tz = cr[WS(rs, 4)];
|
||||
TC = ci[WS(rs, 4)];
|
||||
Ty = W[6];
|
||||
TA = Ty * Tz;
|
||||
TU = Ty * TC;
|
||||
{
|
||||
E Tx, TT, TD, TV, Tv, TB;
|
||||
Tv = W[5];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
TT = FNMS(Tv, Tt, TS);
|
||||
TB = W[7];
|
||||
TD = FMA(TB, TC, TA);
|
||||
TV = FNMS(TB, Tz, TU);
|
||||
TE = Tx + TD;
|
||||
T1g = TD - Tx;
|
||||
TW = TT - TV;
|
||||
T1c = TT + TV;
|
||||
}
|
||||
}
|
||||
/* cr[0] receives the plain sum of input 0 and the three pair sums. */
cr[0] = T1 + Te + Tr + TE;
|
||||
{
|
||||
E TG, TY, TF, TX, TH;
|
||||
TF = FNMS(KP356895867, Tr, Te);
|
||||
TG = FNMS(KP692021471, TF, TE);
|
||||
TX = FMA(KP554958132, TW, TR);
|
||||
TY = FMA(KP801937735, TX, TM);
|
||||
TH = FNMS(KP900968867, TG, T1);
|
||||
ci[0] = FNMS(KP974927912, TY, TH);
|
||||
cr[WS(rs, 1)] = FMA(KP974927912, TY, TH);
|
||||
}
|
||||
/* ci[6] receives the matching plain sum built from ci[0] (T19). */
ci[WS(rs, 6)] = T1a + T1b + T1c + T19;
|
||||
{
|
||||
E T1r, T1u, T1q, T1t, T1s;
|
||||
T1q = FNMS(KP356895867, T1b, T1a);
|
||||
T1r = FNMS(KP692021471, T1q, T1c);
|
||||
T1t = FMA(KP554958132, T1g, T1i);
|
||||
T1u = FMA(KP801937735, T1t, T1h);
|
||||
T1s = FNMS(KP900968867, T1r, T19);
|
||||
cr[WS(rs, 6)] = FMS(KP974927912, T1u, T1s);
|
||||
ci[WS(rs, 5)] = FMA(KP974927912, T1u, T1s);
|
||||
}
|
||||
{
|
||||
E T1m, T1p, T1l, T1o, T1n;
|
||||
T1l = FNMS(KP356895867, T1a, T1c);
|
||||
T1m = FNMS(KP692021471, T1l, T1b);
|
||||
T1o = FMA(KP554958132, T1h, T1g);
|
||||
T1p = FNMS(KP801937735, T1o, T1i);
|
||||
T1n = FNMS(KP900968867, T1m, T19);
|
||||
cr[WS(rs, 5)] = FMS(KP974927912, T1p, T1n);
|
||||
ci[WS(rs, 4)] = FMA(KP974927912, T1p, T1n);
|
||||
}
|
||||
{
|
||||
E T1e, T1k, T1d, T1j, T1f;
|
||||
T1d = FNMS(KP356895867, T1c, T1b);
|
||||
T1e = FNMS(KP692021471, T1d, T1a);
|
||||
T1j = FNMS(KP554958132, T1i, T1h);
|
||||
T1k = FNMS(KP801937735, T1j, T1g);
|
||||
T1f = FNMS(KP900968867, T1e, T19);
|
||||
cr[WS(rs, 4)] = FMS(KP974927912, T1k, T1f);
|
||||
ci[WS(rs, 3)] = FMA(KP974927912, T1k, T1f);
|
||||
}
|
||||
{
|
||||
E T15, T18, T14, T17, T16;
|
||||
T14 = FNMS(KP356895867, TE, Tr);
|
||||
T15 = FNMS(KP692021471, T14, Te);
|
||||
T17 = FNMS(KP554958132, TR, TM);
|
||||
T18 = FNMS(KP801937735, T17, TW);
|
||||
T16 = FNMS(KP900968867, T15, T1);
|
||||
ci[WS(rs, 2)] = FNMS(KP974927912, T18, T16);
|
||||
cr[WS(rs, 3)] = FMA(KP974927912, T18, T16);
|
||||
}
|
||||
{
|
||||
E T10, T13, TZ, T12, T11;
|
||||
TZ = FNMS(KP356895867, Te, TE);
|
||||
T10 = FNMS(KP692021471, TZ, Tr);
|
||||
T12 = FMA(KP554958132, TM, TW);
|
||||
T13 = FNMS(KP801937735, T12, TR);
|
||||
T11 = FNMS(KP900968867, T10, T1);
|
||||
ci[WS(rs, 1)] = FNMS(KP974927912, T13, T11);
|
||||
cr[WS(rs, 2)] = FMA(KP974927912, T13, T11);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the FMA variant of hf_7.  The first three counts in the
 * last field equal the "18 additions, 12 multiplications, 54 fused
 * multiply/add" tally in this variant's generated header comment.
 * NOTE(review): the remaining field meanings (size, name, twiddle list,
 * genus, trailing 0) are inferred; hc2hc_desc is declared elsewhere.
 */
static const hc2hc_desc desc = {
     7,
     "hf_7",
     twinstr,
     &GENUS,
     { 18, 12, 54, 0 }
};
|
||||
|
||||
/*
 * Registration hook for the hf_7 codelet: passes the kernel function
 * hf_7 and its descriptor to X(khc2hc_register) for planner p.
 */
void X(codelet_hf_7)(planner *p)
{
     X(khc2hc_register)(p, hf_7, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 7 -dit -name hf_7 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 60 FP multiplications,
|
||||
* (or, 36 additions, 24 multiplications, 36 fused multiply/add),
|
||||
* 29 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_7, non-FMA variant: size-7 "twiddle" codelet (generated with
 * -n 7 -dit, without -fma) that updates cr/ci in place.
 *
 *   cr, ci - data arrays; element k is addressed as cr[WS(rs, k)] and
 *            ci[WS(rs, k)], element 0 as cr[0] / ci[0].
 *   W      - twiddle table; W[0]..W[11] are read on every iteration.
 *            Input k (k = 1..6) is combined with the pair W[2k-2], W[2k-1];
 *            input 0 is used unscaled.
 *   rs     - stride argument forwarded to WS().
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step: cr moves by +ms, ci by -ms.
 *
 * The constant literals below are, to the digits shown, the magnitudes
 * of cos and sin at multiples of 2*pi/7:
 *   KP623489801 = cos(2*pi/7), KP222520933 = |cos(4*pi/7)|,
 *   KP900968867 = |cos(6*pi/7)|, KP781831482 = sin(2*pi/7),
 *   KP974927912 = sin(4*pi/7), KP433883739 = sin(6*pi/7).
 *
 * NOTE(review): FMA, FNMS and FNMA are macros defined outside this file;
 * presumably a*b + c, c - a*b and -(a*b) - c respectively -- confirm.
 */
static void hf_7(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
|
||||
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced by (mb - 1) * 12, then by 12 values per pass. */
for (m = mb, W = W + ((mb - 1) * 12); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
|
||||
E T1, TT, Tc, TV, TC, TO, Tn, TS, TI, TP, Ty, TU, TF, TQ;
|
||||
T1 = cr[0];
|
||||
TT = ci[0];
|
||||
/* Inputs 1 and 6 with twiddle pairs (W[0], W[1]) and (W[10], W[11]). */
{
|
||||
E T6, TA, Tb, TB;
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = cr[WS(rs, 1)];
|
||||
T5 = ci[WS(rs, 1)];
|
||||
T2 = W[0];
|
||||
T4 = W[1];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
TA = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = cr[WS(rs, 6)];
|
||||
Ta = ci[WS(rs, 6)];
|
||||
T7 = W[10];
|
||||
T9 = W[11];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
TB = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
Tc = T6 + Tb;
|
||||
TV = TA + TB;
|
||||
TC = TA - TB;
|
||||
TO = Tb - T6;
|
||||
}
|
||||
/* Inputs 2 and 5 with twiddle pairs (W[2], W[3]) and (W[8], W[9]). */
{
|
||||
E Th, TG, Tm, TH;
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = cr[WS(rs, 2)];
|
||||
Tg = ci[WS(rs, 2)];
|
||||
Td = W[2];
|
||||
Tf = W[3];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
TG = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
{
|
||||
E Tj, Tl, Ti, Tk;
|
||||
Tj = cr[WS(rs, 5)];
|
||||
Tl = ci[WS(rs, 5)];
|
||||
Ti = W[8];
|
||||
Tk = W[9];
|
||||
Tm = FMA(Ti, Tj, Tk * Tl);
|
||||
TH = FNMS(Tk, Tj, Ti * Tl);
|
||||
}
|
||||
Tn = Th + Tm;
|
||||
TS = TG + TH;
|
||||
TI = TG - TH;
|
||||
TP = Th - Tm;
|
||||
}
|
||||
/* Inputs 3 and 4 with twiddle pairs (W[4], W[5]) and (W[6], W[7]). */
{
|
||||
E Ts, TD, Tx, TE;
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = cr[WS(rs, 3)];
|
||||
Tr = ci[WS(rs, 3)];
|
||||
To = W[4];
|
||||
Tq = W[5];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
TD = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Tu, Tw, Tt, Tv;
|
||||
Tu = cr[WS(rs, 4)];
|
||||
Tw = ci[WS(rs, 4)];
|
||||
Tt = W[6];
|
||||
Tv = W[7];
|
||||
Tx = FMA(Tt, Tu, Tv * Tw);
|
||||
TE = FNMS(Tv, Tu, Tt * Tw);
|
||||
}
|
||||
Ty = Ts + Tx;
|
||||
TU = TD + TE;
|
||||
TF = TD - TE;
|
||||
TQ = Tx - Ts;
|
||||
}
|
||||
/* Outputs cr[0], ci[0], cr[1], ci[6], cr[6], ci[5]. */
{
|
||||
E TL, TK, TZ, T10;
|
||||
cr[0] = T1 + Tc + Tn + Ty;
|
||||
TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
|
||||
TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
|
||||
ci[0] = TK - TL;
|
||||
cr[WS(rs, 1)] = TK + TL;
|
||||
ci[WS(rs, 6)] = TV + TS + TU + TT;
|
||||
TZ = FMA(KP781831482, TO, KP433883739 * TQ) - (KP974927912 * TP);
|
||||
T10 = FMA(KP623489801, TV, TT) + FNMA(KP900968867, TU, KP222520933 * TS);
|
||||
cr[WS(rs, 6)] = TZ - T10;
|
||||
ci[WS(rs, 5)] = TZ + T10;
|
||||
}
|
||||
/* Outputs cr[5], ci[4], cr[4], ci[3]. */
{
|
||||
E TX, TY, TR, TW;
|
||||
TX = FMA(KP974927912, TO, KP433883739 * TP) - (KP781831482 * TQ);
|
||||
TY = FMA(KP623489801, TU, TT) + FNMA(KP900968867, TS, KP222520933 * TV);
|
||||
cr[WS(rs, 5)] = TX - TY;
|
||||
ci[WS(rs, 4)] = TX + TY;
|
||||
TR = FMA(KP433883739, TO, KP781831482 * TP) + (KP974927912 * TQ);
|
||||
TW = FMA(KP623489801, TS, TT) + FNMA(KP222520933, TU, KP900968867 * TV);
|
||||
cr[WS(rs, 4)] = TR - TW;
|
||||
ci[WS(rs, 3)] = TR + TW;
|
||||
}
|
||||
/* Outputs ci[2], cr[3], ci[1], cr[2]. */
{
|
||||
E TN, TM, TJ, Tz;
|
||||
TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
|
||||
TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
|
||||
ci[WS(rs, 2)] = TM - TN;
|
||||
cr[WS(rs, 3)] = TM + TN;
|
||||
TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
|
||||
Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
|
||||
ci[WS(rs, 1)] = Tz - TJ;
|
||||
cr[WS(rs, 2)] = Tz + TJ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the non-FMA variant of hf_7.  The first three counts in
 * the last field equal the "36 additions, 24 multiplications, 36 fused
 * multiply/add" tally in this variant's generated header comment.
 * NOTE(review): the remaining field meanings (size, name, twiddle list,
 * genus, trailing 0) are inferred; hc2hc_desc is declared elsewhere.
 */
static const hc2hc_desc desc = {
     7,
     "hf_7",
     twinstr,
     &GENUS,
     { 36, 24, 36, 0 }
};
|
||||
|
||||
/*
 * Registration hook for the hf_7 codelet: passes the kernel function
 * hf_7 and its descriptor to X(khc2hc_register) for planner p.
 */
void X(codelet_hf_7)(planner *p)
{
     X(khc2hc_register)(p, hf_7, &desc);
}
|
||||
#endif
|
||||
376
fftw-3.3.10/rdft/scalar/r2cf/hf_8.c
Normal file
376
fftw-3.3.10/rdft/scalar/r2cf/hf_8.c
Normal file
@@ -0,0 +1,376 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:12 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 36 FP multiplications,
|
||||
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
|
||||
* 34 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_8, FMA-oriented variant: size-8 "twiddle" codelet (generated with
 * -n 8 -dit) that updates cr/ci in place.
 *
 *   cr, ci - data arrays; element k is addressed as cr[WS(rs, k)] and
 *            ci[WS(rs, k)], element 0 as cr[0] / ci[0].
 *   W      - twiddle table; W[0]..W[13] are read on every iteration.
 *            Input k (k = 1..7) is combined with the pair W[2k-2], W[2k-1];
 *            input 0 is used unscaled.
 *   rs     - stride argument forwarded to WS().
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step: cr moves by +ms, ci by -ms.
 *
 * The literal of KP707106781 is sqrt(2)/2 to the digits shown.
 *
 * NOTE(review): the inline notes assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c; these macros are
 * defined outside this file -- confirm.
 */
static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced by (mb - 1) * 14, then by 14 values per pass. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
|
||||
E TX, Ty, TZ, TV, T10;
|
||||
T1 = cr[0];
|
||||
T1m = ci[0];
|
||||
/* Input 4 with twiddle pair (W[6], W[7]). */
{
|
||||
E T3, T6, T4, T1k, T2, T5;
|
||||
T3 = cr[WS(rs, 4)];
|
||||
T6 = ci[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = T2 * T3;
|
||||
T1k = T2 * T6;
|
||||
T5 = W[7];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1l = FNMS(T5, T3, T1k);
|
||||
}
|
||||
/* Input 6 with twiddle pair (W[10], W[11]). */
{
|
||||
E Tg, Tj, Th, TR, Tf, Ti;
|
||||
Tg = cr[WS(rs, 6)];
|
||||
Tj = ci[WS(rs, 6)];
|
||||
Tf = W[10];
|
||||
Th = Tf * Tg;
|
||||
TR = Tf * Tj;
|
||||
Ti = W[11];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TS = FNMS(Ti, Tg, TR);
|
||||
}
|
||||
/* Input 2 with twiddle pair (W[2], W[3]). */
{
|
||||
E Ta, Td, Tb, TP, T9, Tc;
|
||||
Ta = cr[WS(rs, 2)];
|
||||
Td = ci[WS(rs, 2)];
|
||||
T9 = W[2];
|
||||
Tb = T9 * Ta;
|
||||
TP = T9 * Td;
|
||||
Tc = W[3];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
TQ = FNMS(Tc, Ta, TP);
|
||||
}
|
||||
/* Inputs 7 and 3 with twiddle pairs (W[12], W[13]) and (W[4], W[5]). */
{
|
||||
E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
|
||||
TB = cr[WS(rs, 7)];
|
||||
TE = ci[WS(rs, 7)];
|
||||
TA = W[12];
|
||||
TC = TA * TB;
|
||||
T13 = TA * TE;
|
||||
TH = cr[WS(rs, 3)];
|
||||
TK = ci[WS(rs, 3)];
|
||||
TG = W[4];
|
||||
TI = TG * TH;
|
||||
T15 = TG * TK;
|
||||
TD = W[13];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T14 = FNMS(TD, TB, T13);
|
||||
TJ = W[5];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T16 = FNMS(TJ, TH, T15);
|
||||
T12 = TF - TL;
|
||||
T17 = T14 - T16;
|
||||
}
|
||||
/* Inputs 1 and 5 with twiddle pairs (W[0], W[1]) and (W[8], W[9]). */
{
|
||||
E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
|
||||
To = cr[WS(rs, 1)];
|
||||
Tr = ci[WS(rs, 1)];
|
||||
Tn = W[0];
|
||||
Tp = Tn * To;
|
||||
TW = Tn * Tr;
|
||||
Tu = cr[WS(rs, 5)];
|
||||
Tx = ci[WS(rs, 5)];
|
||||
Tt = W[8];
|
||||
Tv = Tt * Tu;
|
||||
TY = Tt * Tx;
|
||||
Tq = W[1];
|
||||
Ts = FMA(Tq, Tr, Tp);
|
||||
TX = FNMS(Tq, To, TW);
|
||||
Tw = W[9];
|
||||
Ty = FMA(Tw, Tx, Tv);
|
||||
TZ = FNMS(Tw, Tu, TY);
|
||||
TV = Ts - Ty;
|
||||
T10 = TX - TZ;
|
||||
}
|
||||
/* Outputs that involve the KP707106781 factor. */
{
|
||||
E TU, T1a, T1t, T1v, T19, T1u, T1d, T1w;
|
||||
{
|
||||
E TO, TT, T1r, T1s;
|
||||
TO = T1 - T7;
|
||||
TT = TQ - TS;
|
||||
TU = TO + TT;
|
||||
T1a = TO - TT;
|
||||
T1r = Te - Tk;
|
||||
T1s = T1m - T1l;
|
||||
T1t = T1r + T1s;
|
||||
T1v = T1s - T1r;
|
||||
}
|
||||
{
|
||||
E T11, T18, T1b, T1c;
|
||||
T11 = TV + T10;
|
||||
T18 = T12 - T17;
|
||||
T19 = T11 + T18;
|
||||
T1u = T18 - T11;
|
||||
T1b = TV - T10;
|
||||
T1c = T12 + T17;
|
||||
T1d = T1b + T1c;
|
||||
T1w = T1c - T1b;
|
||||
}
|
||||
ci[WS(rs, 2)] = FNMS(KP707106781, T19, TU);
|
||||
cr[WS(rs, 5)] = FMS(KP707106781, T1w, T1v);
|
||||
ci[WS(rs, 6)] = FMA(KP707106781, T1w, T1v);
|
||||
cr[WS(rs, 1)] = FMA(KP707106781, T19, TU);
|
||||
cr[WS(rs, 3)] = FNMS(KP707106781, T1d, T1a);
|
||||
cr[WS(rs, 7)] = FMS(KP707106781, T1u, T1t);
|
||||
ci[WS(rs, 4)] = FMA(KP707106781, T1u, T1t);
|
||||
ci[0] = FMA(KP707106781, T1d, T1a);
|
||||
}
|
||||
/* Outputs formed from sums and differences only. */
{
|
||||
E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
|
||||
{
|
||||
E T8, Tl, T1j, T1n;
|
||||
T8 = T1 + T7;
|
||||
Tl = Te + Tk;
|
||||
Tm = T8 + Tl;
|
||||
T1e = T8 - Tl;
|
||||
T1j = TQ + TS;
|
||||
T1n = T1l + T1m;
|
||||
T1o = T1j + T1n;
|
||||
T1q = T1n - T1j;
|
||||
}
|
||||
{
|
||||
E Tz, TM, T1f, T1g;
|
||||
Tz = Ts + Ty;
|
||||
TM = TF + TL;
|
||||
TN = Tz + TM;
|
||||
T1p = TM - Tz;
|
||||
T1f = T14 + T16;
|
||||
T1g = TX + TZ;
|
||||
T1h = T1f - T1g;
|
||||
T1i = T1g + T1f;
|
||||
}
|
||||
ci[WS(rs, 3)] = Tm - TN;
|
||||
cr[WS(rs, 6)] = T1p - T1q;
|
||||
ci[WS(rs, 5)] = T1p + T1q;
|
||||
cr[0] = Tm + TN;
|
||||
cr[WS(rs, 2)] = T1e - T1h;
|
||||
cr[WS(rs, 4)] = T1i - T1o;
|
||||
ci[WS(rs, 7)] = T1i + T1o;
|
||||
ci[WS(rs, 1)] = T1e + T1h;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the FMA variant of hf_8.  The first three counts in the
 * last field equal the "44 additions, 14 multiplications, 22 fused
 * multiply/add" tally in this variant's generated header comment.
 * NOTE(review): the remaining field meanings (size, name, twiddle list,
 * genus, trailing 0) are inferred; hc2hc_desc is declared elsewhere.
 */
static const hc2hc_desc desc = {
     8,
     "hf_8",
     twinstr,
     &GENUS,
     { 44, 14, 22, 0 }
};
|
||||
|
||||
/*
 * Registration hook for the hf_8 codelet: passes the kernel function
 * hf_8 and its descriptor to X(khc2hc_register) for planner p.
 */
void X(codelet_hf_8)(planner *p)
{
     X(khc2hc_register)(p, hf_8, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 8 -dit -name hf_8 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 32 FP multiplications,
|
||||
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 28 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_8, non-FMA variant: size-8 "twiddle" codelet (generated with
 * -n 8 -dit, without -fma) that updates cr/ci in place.
 *
 *   cr, ci - data arrays; element k is addressed as cr[WS(rs, k)] and
 *            ci[WS(rs, k)], element 0 as cr[0] / ci[0].
 *   W      - twiddle table; W[0]..W[13] are read on every iteration.
 *            Input k (k = 1..7) is combined with the pair W[2k-2], W[2k-1];
 *            input 0 is used unscaled.
 *   rs     - stride argument forwarded to WS().
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step: cr moves by +ms, ci by -ms.
 *
 * The literal of KP707106781 is sqrt(2)/2 to the digits shown.
 *
 * NOTE(review): FMA and FNMS are macros defined outside this file;
 * presumably a*b + c and c - a*b respectively -- confirm.
 */
static void hf_8(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced by (mb - 1) * 14, then by 14 values per pass. */
for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T7, T1f, TH, T19, TF, T12, TR, TU, Ti, T1e, TK, T16, Tu, T13, TM;
|
||||
E TP;
|
||||
/* Inputs 0 and 4; input 4 uses twiddle pair (W[6], W[7]). */
{
|
||||
E T1, T18, T6, T17;
|
||||
T1 = cr[0];
|
||||
T18 = ci[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = cr[WS(rs, 4)];
|
||||
T5 = ci[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = W[7];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T17 = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 + T6;
|
||||
T1f = T18 - T17;
|
||||
TH = T1 - T6;
|
||||
T19 = T17 + T18;
|
||||
}
|
||||
/* Inputs 7 and 3 with twiddle pairs (W[12], W[13]) and (W[4], W[5]). */
{
|
||||
E Tz, TS, TE, TT;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = cr[WS(rs, 7)];
|
||||
Ty = ci[WS(rs, 7)];
|
||||
Tv = W[12];
|
||||
Tx = W[13];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
TS = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = cr[WS(rs, 3)];
|
||||
TD = ci[WS(rs, 3)];
|
||||
TA = W[4];
|
||||
TC = W[5];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
TT = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
TF = Tz + TE;
|
||||
T12 = TS + TT;
|
||||
TR = Tz - TE;
|
||||
TU = TS - TT;
|
||||
}
|
||||
/* Inputs 2 and 6 with twiddle pairs (W[2], W[3]) and (W[10], W[11]). */
{
|
||||
E Tc, TI, Th, TJ;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = cr[WS(rs, 2)];
|
||||
Tb = ci[WS(rs, 2)];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
TI = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = cr[WS(rs, 6)];
|
||||
Tg = ci[WS(rs, 6)];
|
||||
Td = W[10];
|
||||
Tf = W[11];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
TJ = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
Ti = Tc + Th;
|
||||
T1e = Tc - Th;
|
||||
TK = TI - TJ;
|
||||
T16 = TI + TJ;
|
||||
}
|
||||
/* Inputs 1 and 5 with twiddle pairs (W[0], W[1]) and (W[8], W[9]). */
{
|
||||
E To, TN, Tt, TO;
|
||||
{
|
||||
E Tl, Tn, Tk, Tm;
|
||||
Tl = cr[WS(rs, 1)];
|
||||
Tn = ci[WS(rs, 1)];
|
||||
Tk = W[0];
|
||||
Tm = W[1];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
TN = FNMS(Tm, Tl, Tk * Tn);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tp, Tr;
|
||||
Tq = cr[WS(rs, 5)];
|
||||
Ts = ci[WS(rs, 5)];
|
||||
Tp = W[8];
|
||||
Tr = W[9];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
TO = FNMS(Tr, Tq, Tp * Ts);
|
||||
}
|
||||
Tu = To + Tt;
|
||||
T13 = TN + TO;
|
||||
TM = To - Tt;
|
||||
TP = TN - TO;
|
||||
}
|
||||
/* Outputs ci[3], cr[0], cr[6], ci[5], cr[3], ci[6], ci[0], cr[5]. */
{
|
||||
E Tj, TG, T1b, T1c;
|
||||
Tj = T7 + Ti;
|
||||
TG = Tu + TF;
|
||||
ci[WS(rs, 3)] = Tj - TG;
|
||||
cr[0] = Tj + TG;
|
||||
T1b = TF - Tu;
|
||||
T1c = T19 - T16;
|
||||
cr[WS(rs, 6)] = T1b - T1c;
|
||||
ci[WS(rs, 5)] = T1b + T1c;
|
||||
{
|
||||
E TX, T1i, T10, T1h, TY, TZ;
|
||||
TX = TH - TK;
|
||||
T1i = T1f - T1e;
|
||||
TY = TM - TP;
|
||||
TZ = TR + TU;
|
||||
T10 = KP707106781 * (TY + TZ);
|
||||
T1h = KP707106781 * (TZ - TY);
|
||||
cr[WS(rs, 3)] = TX - T10;
|
||||
ci[WS(rs, 6)] = T1h + T1i;
|
||||
ci[0] = TX + T10;
|
||||
cr[WS(rs, 5)] = T1h - T1i;
|
||||
}
|
||||
}
|
||||
/* Outputs cr[4], ci[7], cr[2], ci[1], ci[2], ci[4], cr[1], cr[7]. */
{
|
||||
E T15, T1a, T11, T14;
|
||||
T15 = T13 + T12;
|
||||
T1a = T16 + T19;
|
||||
cr[WS(rs, 4)] = T15 - T1a;
|
||||
ci[WS(rs, 7)] = T15 + T1a;
|
||||
T11 = T7 - Ti;
|
||||
T14 = T12 - T13;
|
||||
cr[WS(rs, 2)] = T11 - T14;
|
||||
ci[WS(rs, 1)] = T11 + T14;
|
||||
{
|
||||
E TL, T1g, TW, T1d, TQ, TV;
|
||||
TL = TH + TK;
|
||||
T1g = T1e + T1f;
|
||||
TQ = TM + TP;
|
||||
TV = TR - TU;
|
||||
TW = KP707106781 * (TQ + TV);
|
||||
T1d = KP707106781 * (TV - TQ);
|
||||
ci[WS(rs, 2)] = TL - TW;
|
||||
ci[WS(rs, 4)] = T1d + T1g;
|
||||
cr[WS(rs, 1)] = TL + TW;
|
||||
cr[WS(rs, 7)] = T1d - T1g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the non-FMA variant of hf_8.  The first three counts in
 * the last field equal the "52 additions, 18 multiplications, 14 fused
 * multiply/add" tally in this variant's generated header comment.
 * NOTE(review): the remaining field meanings (size, name, twiddle list,
 * genus, trailing 0) are inferred; hc2hc_desc is declared elsewhere.
 */
static const hc2hc_desc desc = {
     8,
     "hf_8",
     twinstr,
     &GENUS,
     { 52, 18, 14, 0 }
};
|
||||
|
||||
/*
 * Registration hook for the hf_8 codelet: passes the kernel function
 * hf_8 and its descriptor to X(khc2hc_register) for planner p.
 */
void X(codelet_hf_8)(planner *p)
{
     X(khc2hc_register)(p, hf_8, &desc);
}
|
||||
#endif
|
||||
487
fftw-3.3.10/rdft/scalar/r2cf/hf_9.c
Normal file
487
fftw-3.3.10/rdft/scalar/r2cf/hf_9.c
Normal file
@@ -0,0 +1,487 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:13 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -dit -name hf_9 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 88 FP multiplications,
|
||||
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
|
||||
* 55 stack variables, 10 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_9, FMA-oriented variant: size-9 "twiddle" codelet (generated with
 * -n 9 -dit) that updates cr/ci in place.
 *
 *   cr, ci - data arrays; element k is addressed as cr[WS(rs, k)] and
 *            ci[WS(rs, k)], element 0 as cr[0] / ci[0].
 *   W      - twiddle table; W[0]..W[15] are read on every iteration.
 *            Input k (k = 1..8) is combined with the pair W[2k-2], W[2k-1];
 *            input 0 is used unscaled.
 *   rs     - stride argument forwarded to WS().
 *   mb, me - the loop runs for m = mb, ..., me - 1.
 *   ms     - per-iteration step: cr moves by +ms, ci by -ms.
 *
 * The literals of KP866025403 and KP500000000 are sqrt(3)/2 and 1/2 to
 * the digits shown.
 *
 * NOTE(review): the inline notes assume FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c; these macros are
 * defined outside this file -- confirm.
 */
static void hf_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
|
||||
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
|
||||
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
|
||||
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
|
||||
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
|
||||
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced by (mb - 1) * 16, then by 16 values per pass. */
for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
|
||||
E T1, T1P, Te, T1S, T10, T1Q, T1a, T1d, Ty, T18, Tl, T13, T19, T1c, T1l;
|
||||
E T1r, TS, T1p, TF, T1o, T1g, T1q;
|
||||
T1 = cr[0];
|
||||
T1P = ci[0];
|
||||
/* Inputs 3 and 6 with twiddle pairs (W[4], W[5]) and (W[10], W[11]). */
{
|
||||
E T3, T6, T4, TW, T9, Tc, Ta, TY, T2, T8;
|
||||
T3 = cr[WS(rs, 3)];
|
||||
T6 = ci[WS(rs, 3)];
|
||||
T2 = W[4];
|
||||
T4 = T2 * T3;
|
||||
TW = T2 * T6;
|
||||
T9 = cr[WS(rs, 6)];
|
||||
Tc = ci[WS(rs, 6)];
|
||||
T8 = W[10];
|
||||
Ta = T8 * T9;
|
||||
TY = T8 * Tc;
|
||||
{
|
||||
E T7, TX, Td, TZ, T5, Tb;
|
||||
T5 = W[5];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TX = FNMS(T5, T3, TW);
|
||||
Tb = W[11];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
TZ = FNMS(Tb, T9, TY);
|
||||
Te = T7 + Td;
|
||||
T1S = Td - T7;
|
||||
T10 = TX - TZ;
|
||||
T1Q = TX + TZ;
|
||||
}
|
||||
}
|
||||
/* Inputs 1, 7 and 4 with twiddle pairs (W[0], W[1]), (W[12], W[13])
   and (W[6], W[7]). */
{
|
||||
E Th, Tk, Ti, T12, Tx, T17, Tr, T15, Tg, Tj;
|
||||
Th = cr[WS(rs, 1)];
|
||||
Tk = ci[WS(rs, 1)];
|
||||
Tg = W[0];
|
||||
Ti = Tg * Th;
|
||||
T12 = Tg * Tk;
|
||||
{
|
||||
E Tt, Tw, Tu, T16, Ts, Tv;
|
||||
Tt = cr[WS(rs, 7)];
|
||||
Tw = ci[WS(rs, 7)];
|
||||
Ts = W[12];
|
||||
Tu = Ts * Tt;
|
||||
T16 = Ts * Tw;
|
||||
Tv = W[13];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T17 = FNMS(Tv, Tt, T16);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T14, Tm, Tp;
|
||||
Tn = cr[WS(rs, 4)];
|
||||
Tq = ci[WS(rs, 4)];
|
||||
Tm = W[6];
|
||||
To = Tm * Tn;
|
||||
T14 = Tm * Tq;
|
||||
Tp = W[7];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T15 = FNMS(Tp, Tn, T14);
|
||||
}
|
||||
T1a = Tr - Tx;
|
||||
T1d = T15 - T17;
|
||||
Ty = Tr + Tx;
|
||||
T18 = T15 + T17;
|
||||
Tj = W[1];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T13 = FNMS(Tj, Th, T12);
|
||||
T19 = FNMS(KP500000000, T18, T13);
|
||||
T1c = FNMS(KP500000000, Ty, Tl);
|
||||
}
|
||||
/* Inputs 2, 8 and 5 with twiddle pairs (W[2], W[3]), (W[14], W[15])
   and (W[8], W[9]). */
{
|
||||
E TB, TE, TC, T1n, TR, T1k, TL, T1i, TA, TD;
|
||||
TB = cr[WS(rs, 2)];
|
||||
TE = ci[WS(rs, 2)];
|
||||
TA = W[2];
|
||||
TC = TA * TB;
|
||||
T1n = TA * TE;
|
||||
{
|
||||
E TN, TQ, TO, T1j, TM, TP;
|
||||
TN = cr[WS(rs, 8)];
|
||||
TQ = ci[WS(rs, 8)];
|
||||
TM = W[14];
|
||||
TO = TM * TN;
|
||||
T1j = TM * TQ;
|
||||
TP = W[15];
|
||||
TR = FMA(TP, TQ, TO);
|
||||
T1k = FNMS(TP, TN, T1j);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T1h, TG, TJ;
|
||||
TH = cr[WS(rs, 5)];
|
||||
TK = ci[WS(rs, 5)];
|
||||
TG = W[8];
|
||||
TI = TG * TH;
|
||||
T1h = TG * TK;
|
||||
TJ = W[9];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T1i = FNMS(TJ, TH, T1h);
|
||||
}
|
||||
T1l = T1i - T1k;
|
||||
T1r = TR - TL;
|
||||
TS = TL + TR;
|
||||
T1p = T1i + T1k;
|
||||
TD = W[3];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T1o = FNMS(TD, TB, T1n);
|
||||
T1g = FNMS(KP500000000, TS, TF);
|
||||
T1q = FNMS(KP500000000, T1p, T1o);
|
||||
}
|
||||
/* Outputs cr[0], ci[8], ci[2], cr[3], cr[6], ci[5]. */
{
|
||||
E Tf, T21, TU, T24, T1O, T22, T1L, T23;
|
||||
Tf = T1 + Te;
|
||||
T21 = T1Q + T1P;
|
||||
{
|
||||
E Tz, TT, T1M, T1N;
|
||||
Tz = Tl + Ty;
|
||||
TT = TF + TS;
|
||||
TU = Tz + TT;
|
||||
T24 = TT - Tz;
|
||||
T1M = T13 + T18;
|
||||
T1N = T1o + T1p;
|
||||
T1O = T1M - T1N;
|
||||
T22 = T1M + T1N;
|
||||
}
|
||||
cr[0] = Tf + TU;
|
||||
ci[WS(rs, 8)] = T22 + T21;
|
||||
T1L = FNMS(KP500000000, TU, Tf);
|
||||
ci[WS(rs, 2)] = FNMS(KP866025403, T1O, T1L);
|
||||
cr[WS(rs, 3)] = FMA(KP866025403, T1O, T1L);
|
||||
T23 = FNMS(KP500000000, T22, T21);
|
||||
cr[WS(rs, 6)] = FMS(KP866025403, T24, T23);
|
||||
ci[WS(rs, 5)] = FMA(KP866025403, T24, T23);
|
||||
}
|
||||
/* Remaining twelve outputs (cr 1, 2, 4, 5, 7, 8 and ci 0, 1, 3, 4, 6, 7). */
{
|
||||
E T11, T1z, T1T, T1X, T1f, T1w, T1t, T1x, T1u, T1Y, T1C, T1I, T1F, T1J, T1G;
|
||||
E T1U, TV, T1R;
|
||||
TV = FNMS(KP500000000, Te, T1);
|
||||
T11 = FNMS(KP866025403, T10, TV);
|
||||
T1z = FMA(KP866025403, T10, TV);
|
||||
T1R = FNMS(KP500000000, T1Q, T1P);
|
||||
T1T = FMA(KP866025403, T1S, T1R);
|
||||
T1X = FNMS(KP866025403, T1S, T1R);
|
||||
{
|
||||
E T1b, T1e, T1m, T1s;
|
||||
T1b = FMA(KP866025403, T1a, T19);
|
||||
T1e = FNMS(KP866025403, T1d, T1c);
|
||||
T1f = FMA(KP176326980, T1e, T1b);
|
||||
T1w = FNMS(KP176326980, T1b, T1e);
|
||||
T1m = FNMS(KP866025403, T1l, T1g);
|
||||
T1s = FNMS(KP866025403, T1r, T1q);
|
||||
T1t = FNMS(KP363970234, T1s, T1m);
|
||||
T1x = FMA(KP363970234, T1m, T1s);
|
||||
}
|
||||
T1u = FNMS(KP954188894, T1t, T1f);
|
||||
T1Y = FMA(KP954188894, T1x, T1w);
|
||||
{
|
||||
E T1A, T1B, T1D, T1E;
|
||||
T1A = FMA(KP866025403, T1r, T1q);
|
||||
T1B = FMA(KP866025403, T1l, T1g);
|
||||
T1C = FMA(KP176326980, T1B, T1A);
|
||||
T1I = FNMS(KP176326980, T1A, T1B);
|
||||
T1D = FMA(KP866025403, T1d, T1c);
|
||||
T1E = FNMS(KP866025403, T1a, T19);
|
||||
T1F = FMA(KP839099631, T1E, T1D);
|
||||
T1J = FNMS(KP839099631, T1D, T1E);
|
||||
}
|
||||
T1G = FMA(KP777861913, T1F, T1C);
|
||||
T1U = FNMS(KP777861913, T1J, T1I);
|
||||
cr[WS(rs, 2)] = FMA(KP984807753, T1u, T11);
|
||||
ci[WS(rs, 7)] = FNMS(KP984807753, T1U, T1T);
|
||||
ci[WS(rs, 6)] = FNMS(KP984807753, T1Y, T1X);
|
||||
cr[WS(rs, 1)] = FMA(KP984807753, T1G, T1z);
|
||||
{
|
||||
E T1V, T1W, T1H, T1K;
|
||||
T1V = FMA(KP492403876, T1U, T1T);
|
||||
T1W = FNMS(KP777861913, T1F, T1C);
|
||||
cr[WS(rs, 7)] = FMS(KP852868531, T1W, T1V);
|
||||
ci[WS(rs, 4)] = FMA(KP852868531, T1W, T1V);
|
||||
T1H = FNMS(KP492403876, T1G, T1z);
|
||||
T1K = FMA(KP777861913, T1J, T1I);
|
||||
ci[WS(rs, 1)] = FNMS(KP852868531, T1K, T1H);
|
||||
cr[WS(rs, 4)] = FMA(KP852868531, T1K, T1H);
|
||||
}
|
||||
{
|
||||
E T1v, T1y, T1Z, T20;
|
||||
T1v = FNMS(KP492403876, T1u, T11);
|
||||
T1y = FNMS(KP954188894, T1x, T1w);
|
||||
ci[WS(rs, 3)] = FNMS(KP852868531, T1y, T1v);
|
||||
ci[0] = FMA(KP852868531, T1y, T1v);
|
||||
T1Z = FMA(KP492403876, T1Y, T1X);
|
||||
T20 = FMA(KP954188894, T1t, T1f);
|
||||
cr[WS(rs, 5)] = FMS(KP852868531, T20, T1Z);
|
||||
/* Explicit negation of the whole FMA result. */
cr[WS(rs, 8)] = -(FMA(KP852868531, T20, T1Z));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor for the FMA variant of hf_9.  The first three counts in the
 * last field equal the "24 additions, 16 multiplications, 72 fused
 * multiply/add" tally in this variant's generated header comment.
 * NOTE(review): the remaining field meanings (size, name, twiddle list,
 * genus, trailing 0) are inferred; hc2hc_desc is declared elsewhere.
 */
static const hc2hc_desc desc = {
     9,
     "hf_9",
     twinstr,
     &GENUS,
     { 24, 16, 72, 0 }
};
|
||||
|
||||
/*
 * Registration hook for the hf_9 codelet: passes the kernel function
 * hf_9 and its descriptor to X(khc2hc_register) for planner p.
 */
void X(codelet_hf_9)(planner *p)
{
     X(khc2hc_register)(p, hf_9, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_hc2hc.native -compact -variables 4 -pipeline-latency 4 -n 9 -dit -name hf_9 -include rdft/scalar/hf.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 72 FP multiplications,
|
||||
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
|
||||
* 41 stack variables, 8 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/hf.h"
|
||||
|
||||
/*
 * hf_9 -- non-FMA build (the #else branch) of the size-9 "hf" codelet.
 *
 * Generated straight-line kernel (see the "Generated by" line above:
 * gen_hc2hc, -n 9, -dit).  Each loop iteration loads the nine
 * cr[WS(rs, k)] / ci[WS(rs, k)] pairs (k = 0..8) and the sixteen twiddle
 * values W[0..15] into temporaries, and only then overwrites the same
 * eighteen cr/ci slots in place.
 *
 *   cr, ci : data arrays, indexed through WS(rs, k); after every iteration
 *            cr steps by +ms and ci by -ms.
 *   W      : twiddle table; starts at W + (mb - 1) * 16 and advances by 16
 *            per iteration.
 *   rs     : stride handed to WS().
 *   mb, me : half-open iteration range [mb, me).
 *   ms     : per-iteration pointer step for cr / ci.
 *
 * NOTE(review): FMA, FNMS and FMS are macros from the included headers and
 * are not visible here; presumably FMA(a,b,c) = a*b + c,
 * FNMS(a,b,c) = c - a*b and FMS(a,b,c) = a*b - c -- confirm there.
 */
static void hf_9(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants; each name spells the leading digits of its value. */
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
|
||||
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
|
||||
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
|
||||
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* One iteration per m in [mb, me); cr walks forward and ci backward by ms. */
for (m = mb, W = W + ((mb - 1) * 16); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
|
||||
E T1, T1B, TQ, T1A, Tc, TN, T1C, T1D, TL, T1x, T19, T1o, T1c, T1n, Tu;
|
||||
E T1w, TW, T1k, T11, T1l;
|
||||
/* Slot 0 is loaded unmodified; slots 3 and 6 are combined with
 * W[4..5] and W[10..11] before being summed/differenced. */
{
|
||||
E T6, TO, Tb, TP;
|
||||
T1 = cr[0];
|
||||
T1B = ci[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = cr[WS(rs, 3)];
|
||||
T5 = ci[WS(rs, 3)];
|
||||
T2 = W[4];
|
||||
T4 = W[5];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
TO = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = cr[WS(rs, 6)];
|
||||
Ta = ci[WS(rs, 6)];
|
||||
T7 = W[10];
|
||||
T9 = W[11];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
TP = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
TQ = KP866025403 * (TO - TP);
|
||||
T1A = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
TN = FNMS(KP500000000, Tc, T1);
|
||||
T1C = TO + TP;
|
||||
T1D = FNMS(KP500000000, T1C, T1B);
|
||||
}
|
||||
/* Slots 2, 5 and 8, combined with W[2..3], W[8..9] and W[14..15]. */
{
|
||||
E Tz, T13, TE, T14, TJ, T15, TK, T16;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = cr[WS(rs, 2)];
|
||||
Ty = ci[WS(rs, 2)];
|
||||
Tv = W[2];
|
||||
Tx = W[3];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T13 = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = cr[WS(rs, 5)];
|
||||
TD = ci[WS(rs, 5)];
|
||||
TA = W[8];
|
||||
TC = W[9];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T14 = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
{
|
||||
E TG, TI, TF, TH;
|
||||
TG = cr[WS(rs, 8)];
|
||||
TI = ci[WS(rs, 8)];
|
||||
TF = W[14];
|
||||
TH = W[15];
|
||||
TJ = FMA(TF, TG, TH * TI);
|
||||
T15 = FNMS(TH, TG, TF * TI);
|
||||
}
|
||||
TK = TE + TJ;
|
||||
T16 = T14 + T15;
|
||||
TL = Tz + TK;
|
||||
T1x = T13 + T16;
|
||||
{
|
||||
E T17, T18, T1a, T1b;
|
||||
T17 = FNMS(KP500000000, T16, T13);
|
||||
T18 = KP866025403 * (TJ - TE);
|
||||
T19 = T17 - T18;
|
||||
T1o = T18 + T17;
|
||||
T1a = FNMS(KP500000000, TK, Tz);
|
||||
T1b = KP866025403 * (T14 - T15);
|
||||
T1c = T1a - T1b;
|
||||
T1n = T1a + T1b;
|
||||
}
|
||||
}
|
||||
/* Slots 1, 4 and 7, combined with W[0..1], W[6..7] and W[12..13]. */
{
|
||||
E Ti, TX, Tn, TT, Ts, TU, Tt, TY;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = cr[WS(rs, 1)];
|
||||
Th = ci[WS(rs, 1)];
|
||||
Te = W[0];
|
||||
Tg = W[1];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
TX = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = cr[WS(rs, 4)];
|
||||
Tm = ci[WS(rs, 4)];
|
||||
Tj = W[6];
|
||||
Tl = W[7];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
TT = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = cr[WS(rs, 7)];
|
||||
Tr = ci[WS(rs, 7)];
|
||||
To = W[12];
|
||||
Tq = W[13];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
TU = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
Tt = Tn + Ts;
|
||||
TY = TT + TU;
|
||||
Tu = Ti + Tt;
|
||||
T1w = TX + TY;
|
||||
{
|
||||
E TS, TV, TZ, T10;
|
||||
TS = FNMS(KP500000000, Tt, Ti);
|
||||
TV = KP866025403 * (TT - TU);
|
||||
TW = TS - TV;
|
||||
T1k = TS + TV;
|
||||
TZ = FNMS(KP500000000, TY, TX);
|
||||
T10 = KP866025403 * (Ts - Tn);
|
||||
T11 = TZ - T10;
|
||||
T1l = T10 + TZ;
|
||||
}
|
||||
}
|
||||
/* All inputs are now in temporaries; the stores start here.
 * This group writes cr[0], cr[WS(rs, 3)] and ci[WS(rs, 2)]. */
{
|
||||
E T1y, Td, TM, T1v;
|
||||
T1y = KP866025403 * (T1w - T1x);
|
||||
Td = T1 + Tc;
|
||||
TM = Tu + TL;
|
||||
T1v = FNMS(KP500000000, TM, Td);
|
||||
cr[0] = Td + TM;
|
||||
cr[WS(rs, 3)] = T1v + T1y;
|
||||
ci[WS(rs, 2)] = T1v - T1y;
|
||||
}
|
||||
/* Writes cr[2], ci[6], ci[0], ci[3], cr[5] and cr[8] (WS(rs, k) slots). */
{
|
||||
E TR, T1I, T1e, T1K, T1i, T1H, T1f, T1J;
|
||||
TR = TN - TQ;
|
||||
T1I = T1D - T1A;
|
||||
{
|
||||
E T12, T1d, T1g, T1h;
|
||||
T12 = FMA(KP173648177, TW, KP984807753 * T11);
|
||||
T1d = FNMS(KP939692620, T1c, KP342020143 * T19);
|
||||
T1e = T12 + T1d;
|
||||
T1K = KP866025403 * (T1d - T12);
|
||||
T1g = FNMS(KP984807753, TW, KP173648177 * T11);
|
||||
T1h = FMA(KP342020143, T1c, KP939692620 * T19);
|
||||
T1i = KP866025403 * (T1g + T1h);
|
||||
T1H = T1g - T1h;
|
||||
}
|
||||
cr[WS(rs, 2)] = TR + T1e;
|
||||
ci[WS(rs, 6)] = T1H + T1I;
|
||||
T1f = FNMS(KP500000000, T1e, TR);
|
||||
ci[0] = T1f - T1i;
|
||||
ci[WS(rs, 3)] = T1f + T1i;
|
||||
T1J = FMS(KP500000000, T1H, T1I);
|
||||
cr[WS(rs, 5)] = T1J - T1K;
|
||||
cr[WS(rs, 8)] = T1K + T1J;
|
||||
}
|
||||
/* Writes cr[WS(rs, 6)], ci[WS(rs, 8)] and ci[WS(rs, 5)]. */
{
|
||||
E T1L, T1M, T1N, T1O;
|
||||
T1L = KP866025403 * (TL - Tu);
|
||||
T1M = T1C + T1B;
|
||||
T1N = T1w + T1x;
|
||||
T1O = FNMS(KP500000000, T1N, T1M);
|
||||
cr[WS(rs, 6)] = T1L - T1O;
|
||||
ci[WS(rs, 8)] = T1N + T1M;
|
||||
ci[WS(rs, 5)] = T1L + T1O;
|
||||
}
|
||||
/* Writes cr[1], ci[1], cr[4], ci[7], cr[7] and ci[4] (WS(rs, k) slots). */
{
|
||||
E T1j, T1E, T1q, T1z, T1u, T1F, T1r, T1G;
|
||||
T1j = TN + TQ;
|
||||
T1E = T1A + T1D;
|
||||
{
|
||||
E T1m, T1p, T1s, T1t;
|
||||
T1m = FMA(KP766044443, T1k, KP642787609 * T1l);
|
||||
T1p = FMA(KP173648177, T1n, KP984807753 * T1o);
|
||||
T1q = T1m + T1p;
|
||||
T1z = KP866025403 * (T1p - T1m);
|
||||
T1s = FNMS(KP642787609, T1k, KP766044443 * T1l);
|
||||
T1t = FNMS(KP984807753, T1n, KP173648177 * T1o);
|
||||
T1u = KP866025403 * (T1s - T1t);
|
||||
T1F = T1s + T1t;
|
||||
}
|
||||
cr[WS(rs, 1)] = T1j + T1q;
|
||||
T1r = FNMS(KP500000000, T1q, T1j);
|
||||
ci[WS(rs, 1)] = T1r - T1u;
|
||||
cr[WS(rs, 4)] = T1r + T1u;
|
||||
ci[WS(rs, 7)] = T1F + T1E;
|
||||
T1G = FNMS(KP500000000, T1F, T1E);
|
||||
cr[WS(rs, 7)] = T1z - T1G;
|
||||
ci[WS(rs, 4)] = T1z + T1G;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by the hf_9 descriptor below: a single
 * TW_FULL entry with arguments (1, 9) followed by a TW_NEXT entry (1, 0).
 * NOTE(review): field meanings come from the tw_instr type, declared
 * elsewhere -- confirm there. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 1, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA hf_9, passed to khc2hc_register below.  The
 * first three numbers of the trailing block (60, 36, 36) equal the
 * additions / multiplications / fused multiply-add counts quoted in the
 * generated header comment of this build. */
static const hc2hc_desc desc = { 9, "hf_9", twinstr, &GENUS, { 60, 36, 36, 0 } };
|
||||
|
||||
/* Registration entry point: hands the hf_9 kernel of this build and its
 * descriptor to khc2hc_register for planner p. */
void X(codelet_hf_9) (planner *p) {
|
||||
X(khc2hc_register) (p, hf_9, &desc);
|
||||
}
|
||||
#endif
|
||||
194
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_10.c
Normal file
194
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_10.c
Normal file
@@ -0,0 +1,194 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cfII_10 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 32 FP additions, 18 FP multiplications,
|
||||
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
|
||||
* 21 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_10 -- FMA build of the generated size-10 r2cfII codelet
 * (gen_r2cf, -n 10, -dft-II; see the "Generated by" line above).
 *
 * Runs v iterations.  Each one reads R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..4 and writes Cr[WS(csr, k)] and Ci[WS(csi, k)] for k = 0..4.
 *
 *   R0, R1   : input arrays, advanced by ivs after every iteration.
 *   Cr, Ci   : output arrays, advanced by ovs after every iteration.
 *   rs       : input stride handed to WS().
 *   csr, csi : output strides handed to WS() for Cr and Ci.
 *   v        : iteration count.
 *
 * NOTE(review): FMA, FNMS and FMS come from the included headers;
 * presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FMS(a,b,c) = a*b - c -- confirm there.
 */
static void r2cfII_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name spells the leading digits of its value. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
|
||||
E T1, To, T8, Tt, Ta, Ts, Te, Tq, Th, Tn;
|
||||
T1 = R0[0];
|
||||
To = R1[WS(rs, 2)];
|
||||
/* Sums and differences of R0 slots 1..4. */
{
|
||||
E T2, T3, T4, T5, T6, T7;
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T3 = R0[WS(rs, 3)];
|
||||
T4 = T2 - T3;
|
||||
T5 = R0[WS(rs, 4)];
|
||||
T6 = R0[WS(rs, 1)];
|
||||
T7 = T5 - T6;
|
||||
T8 = T4 + T7;
|
||||
Tt = T5 + T6;
|
||||
Ta = T4 - T7;
|
||||
Ts = T2 + T3;
|
||||
}
|
||||
/* Sums and differences of R1 slots 0, 1, 3, 4. */
{
|
||||
E Tc, Td, Tm, Tf, Tg, Tl;
|
||||
Tc = R1[0];
|
||||
Td = R1[WS(rs, 4)];
|
||||
Tm = Tc + Td;
|
||||
Tf = R1[WS(rs, 1)];
|
||||
Tg = R1[WS(rs, 3)];
|
||||
Tl = Tf + Tg;
|
||||
Te = Tc - Td;
|
||||
Tq = Tm + Tl;
|
||||
Th = Tf - Tg;
|
||||
Tn = Tl - Tm;
|
||||
}
|
||||
Cr[WS(csr, 2)] = T1 + T8;
|
||||
Ci[WS(csi, 2)] = Tn - To;
|
||||
/* Remaining Cr outputs (slots 4, 3, 0, 1). */
{
|
||||
E Ti, Tk, Tb, Tj, T9;
|
||||
Ti = FMA(KP618033988, Th, Te);
|
||||
Tk = FNMS(KP618033988, Te, Th);
|
||||
T9 = FNMS(KP250000000, T8, T1);
|
||||
Tb = FMA(KP559016994, Ta, T9);
|
||||
Tj = FNMS(KP559016994, Ta, T9);
|
||||
Cr[WS(csr, 4)] = FNMS(KP951056516, Ti, Tb);
|
||||
Cr[WS(csr, 3)] = FMA(KP951056516, Tk, Tj);
|
||||
Cr[0] = FMA(KP951056516, Ti, Tb);
|
||||
Cr[WS(csr, 1)] = FNMS(KP951056516, Tk, Tj);
|
||||
}
|
||||
/* Remaining Ci outputs (slots 0, 3, 4, 1). */
{
|
||||
E Tu, Tw, Tr, Tv, Tp;
|
||||
Tu = FMA(KP618033988, Tt, Ts);
|
||||
Tw = FNMS(KP618033988, Ts, Tt);
|
||||
Tp = FMA(KP250000000, Tn, To);
|
||||
Tr = FMA(KP559016994, Tq, Tp);
|
||||
Tv = FNMS(KP559016994, Tq, Tp);
|
||||
Ci[0] = -(FMA(KP951056516, Tu, Tr));
|
||||
Ci[WS(csi, 3)] = FMA(KP951056516, Tw, Tv);
|
||||
Ci[WS(csi, 4)] = FMS(KP951056516, Tu, Tr);
|
||||
Ci[WS(csi, 1)] = FNMS(KP951056516, Tw, Tv);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA build of r2cfII_10, passed to kr2c_register below.
 * The first three numbers of the block (14, 0, 18) equal the additions /
 * multiplications / fused multiply-add counts in the generated header
 * comment of this build. */
static const kr2c_desc desc = { 10, "r2cfII_10", { 14, 0, 18, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: hands the r2cfII_10 kernel of this build and its
 * descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_10) (planner *p) { X(kr2c_register) (p, r2cfII_10, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cfII_10 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 32 FP additions, 12 FP multiplications,
|
||||
* (or, 26 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 21 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_10 -- non-FMA (#else) build of the generated size-10 r2cfII codelet
 * (gen_r2cf, -n 10, -dft-II; see the "Generated by" line above).
 *
 * Runs v iterations.  Each one reads R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..4 and writes Cr[WS(csr, k)] and Ci[WS(csi, k)] for k = 0..4.
 *
 *   R0, R1   : input arrays, advanced by ivs after every iteration.
 *   Cr, Ci   : output arrays, advanced by ovs after every iteration.
 *   rs       : input stride handed to WS().
 *   csr, csi : output strides handed to WS() for Cr and Ci.
 *   v        : iteration count.
 *
 * NOTE(review): FMA and FNMS come from the included headers; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm there.
 */
static void r2cfII_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name spells the leading digits of its value. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
|
||||
E T1, To, T8, Tq, T9, Tp, Te, Ts, Th, Tn;
|
||||
T1 = R0[0];
|
||||
To = R1[WS(rs, 2)];
|
||||
/* Sums and differences of R0 slots 1..4. */
{
|
||||
E T2, T3, T4, T5, T6, T7;
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T3 = R0[WS(rs, 3)];
|
||||
T4 = T2 - T3;
|
||||
T5 = R0[WS(rs, 4)];
|
||||
T6 = R0[WS(rs, 1)];
|
||||
T7 = T5 - T6;
|
||||
T8 = T4 + T7;
|
||||
Tq = T5 + T6;
|
||||
T9 = KP559016994 * (T4 - T7);
|
||||
Tp = T2 + T3;
|
||||
}
|
||||
/* Sums and differences of R1 slots 0, 1, 3, 4. */
{
|
||||
E Tc, Td, Tm, Tf, Tg, Tl;
|
||||
Tc = R1[0];
|
||||
Td = R1[WS(rs, 4)];
|
||||
Tm = Tc + Td;
|
||||
Tf = R1[WS(rs, 1)];
|
||||
Tg = R1[WS(rs, 3)];
|
||||
Tl = Tf + Tg;
|
||||
Te = Tc - Td;
|
||||
Ts = KP559016994 * (Tm + Tl);
|
||||
Th = Tf - Tg;
|
||||
Tn = Tl - Tm;
|
||||
}
|
||||
Cr[WS(csr, 2)] = T1 + T8;
|
||||
Ci[WS(csi, 2)] = Tn - To;
|
||||
/* Remaining Cr outputs (slots 4, 3, 0, 1). */
{
|
||||
E Ti, Tk, Tb, Tj, Ta;
|
||||
Ti = FMA(KP951056516, Te, KP587785252 * Th);
|
||||
Tk = FNMS(KP587785252, Te, KP951056516 * Th);
|
||||
Ta = FNMS(KP250000000, T8, T1);
|
||||
Tb = T9 + Ta;
|
||||
Tj = Ta - T9;
|
||||
Cr[WS(csr, 4)] = Tb - Ti;
|
||||
Cr[WS(csr, 3)] = Tj + Tk;
|
||||
Cr[0] = Tb + Ti;
|
||||
Cr[WS(csr, 1)] = Tj - Tk;
|
||||
}
|
||||
/* Remaining Ci outputs (slots 0, 3, 4, 1). */
{
|
||||
E Tr, Tw, Tu, Tv, Tt;
|
||||
Tr = FMA(KP951056516, Tp, KP587785252 * Tq);
|
||||
Tw = FNMS(KP587785252, Tp, KP951056516 * Tq);
|
||||
Tt = FMA(KP250000000, Tn, To);
|
||||
Tu = Ts + Tt;
|
||||
Tv = Tt - Ts;
|
||||
Ci[0] = -(Tr + Tu);
|
||||
Ci[WS(csi, 3)] = Tw + Tv;
|
||||
Ci[WS(csi, 4)] = Tr - Tu;
|
||||
Ci[WS(csi, 1)] = Tv - Tw;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA build of r2cfII_10, passed to kr2c_register
 * below.  The first three numbers of the block (26, 6, 6) equal the
 * additions / multiplications / fused multiply-add counts in the generated
 * header comment of this build. */
static const kr2c_desc desc = { 10, "r2cfII_10", { 26, 6, 6, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: hands the r2cfII_10 kernel of this build and its
 * descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_10) (planner *p) { X(kr2c_register) (p, r2cfII_10, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
225
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_12.c
Normal file
225
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_12.c
Normal file
@@ -0,0 +1,225 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cfII_12 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 45 FP additions, 24 FP multiplications,
|
||||
* (or, 21 additions, 0 multiplications, 24 fused multiply/add),
|
||||
* 28 stack variables, 3 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_12 -- FMA build of the generated size-12 r2cfII codelet
 * (gen_r2cf, -n 12, -dft-II; see the "Generated by" line above).
 *
 * Runs v iterations.  Each one reads R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..5 and writes Cr[WS(csr, k)] and Ci[WS(csi, k)] for k = 0..5.
 *
 *   R0, R1   : input arrays, advanced by ivs after every iteration.
 *   Cr, Ci   : output arrays, advanced by ovs after every iteration.
 *   rs       : input stride handed to WS().
 *   csr, csi : output strides handed to WS() for Cr and Ci.
 *   v        : iteration count.
 *
 * NOTE(review): FMA, FNMS and FMS come from the included headers;
 * presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b and
 * FMS(a,b,c) = a*b - c -- confirm there.
 */
static void r2cfII_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name spells the leading digits of its value. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
|
||||
E Tx, Ty, T8, Tz, Tl, Tm, Tv, T5, TA, Tt, Te, Tf, Tu, T6, T7;
|
||||
E Tw, TF, TG;
|
||||
/* R0 slots 3, 5, 1. */
Tx = R0[WS(rs, 3)];
|
||||
T6 = R0[WS(rs, 5)];
|
||||
T7 = R0[WS(rs, 1)];
|
||||
Ty = T6 + T7;
|
||||
T8 = T6 - T7;
|
||||
Tz = FMA(KP500000000, Ty, Tx);
|
||||
/* R1 slots 4, 2, 0. */
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = R1[WS(rs, 4)];
|
||||
Ti = R1[WS(rs, 2)];
|
||||
Tj = R1[0];
|
||||
Tk = Ti - Tj;
|
||||
Tl = FMA(KP500000000, Tk, Th);
|
||||
Tm = Ti + Tj;
|
||||
Tv = Ti - Tj - Th;
|
||||
}
|
||||
/* R0 slots 0, 2, 4. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T3 = R0[WS(rs, 4)];
|
||||
T4 = T2 - T3;
|
||||
T5 = FMA(KP500000000, T4, T1);
|
||||
TA = T3 + T2;
|
||||
Tt = T1 + T3 - T2;
|
||||
}
|
||||
/* R1 slots 1, 3, 5. */
{
|
||||
E Ta, Tb, Tc, Td;
|
||||
Ta = R1[WS(rs, 1)];
|
||||
Tb = R1[WS(rs, 3)];
|
||||
Tc = R1[WS(rs, 5)];
|
||||
Td = Tb - Tc;
|
||||
Te = FMA(KP500000000, Td, Ta);
|
||||
Tf = Tc + Tb;
|
||||
Tu = Ta + Tc - Tb;
|
||||
}
|
||||
/* Outputs Cr[1], Cr[4], Ci[4], Ci[1] (WS-strided slots). */
Tw = Tu + Tv;
|
||||
Cr[WS(csr, 1)] = FNMS(KP707106781, Tw, Tt);
|
||||
Cr[WS(csr, 4)] = FMA(KP707106781, Tw, Tt);
|
||||
TF = Tx - Ty;
|
||||
TG = Tv - Tu;
|
||||
Ci[WS(csi, 4)] = FMS(KP707106781, TG, TF);
|
||||
Ci[WS(csi, 1)] = FMA(KP707106781, TG, TF);
|
||||
/* Outputs Cr[5], Ci[3], Cr[0], Ci[2]. */
{
|
||||
E T9, TD, To, TE, Tg, Tn;
|
||||
T9 = FNMS(KP866025403, T8, T5);
|
||||
TD = FNMS(KP866025403, TA, Tz);
|
||||
Tg = FNMS(KP866025403, Tf, Te);
|
||||
Tn = FNMS(KP866025403, Tm, Tl);
|
||||
To = Tg - Tn;
|
||||
TE = Tg + Tn;
|
||||
Cr[WS(csr, 5)] = FNMS(KP707106781, To, T9);
|
||||
Ci[WS(csi, 3)] = FMA(KP707106781, TE, TD);
|
||||
Cr[0] = FMA(KP707106781, To, T9);
|
||||
Ci[WS(csi, 2)] = FMS(KP707106781, TE, TD);
|
||||
}
|
||||
/* Outputs Cr[3], Ci[5], Cr[2], Ci[0]. */
{
|
||||
E Tp, TB, Ts, TC, Tq, Tr;
|
||||
Tp = FMA(KP866025403, T8, T5);
|
||||
TB = FMA(KP866025403, TA, Tz);
|
||||
Tq = FMA(KP866025403, Tm, Tl);
|
||||
Tr = FMA(KP866025403, Tf, Te);
|
||||
Ts = Tq - Tr;
|
||||
TC = Tr + Tq;
|
||||
Cr[WS(csr, 3)] = FNMS(KP707106781, Ts, Tp);
|
||||
Ci[WS(csi, 5)] = FNMS(KP707106781, TC, TB);
|
||||
Cr[WS(csr, 2)] = FMA(KP707106781, Ts, Tp);
|
||||
Ci[0] = -(FMA(KP707106781, TC, TB));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA build of r2cfII_12, passed to kr2c_register below.
 * The first three numbers of the block (21, 0, 24) equal the additions /
 * multiplications / fused multiply-add counts in the generated header
 * comment of this build. */
static const kr2c_desc desc = { 12, "r2cfII_12", { 21, 0, 24, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: hands the r2cfII_12 kernel of this build and its
 * descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_12) (planner *p) { X(kr2c_register) (p, r2cfII_12, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cfII_12 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 43 FP additions, 12 FP multiplications,
|
||||
* (or, 39 additions, 8 multiplications, 4 fused multiply/add),
|
||||
* 28 stack variables, 5 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_12 -- non-FMA (#else) build of the generated size-12 r2cfII codelet
 * (gen_r2cf, -n 12, -dft-II; see the "Generated by" line above).
 *
 * Runs v iterations.  Each one reads R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..5 and writes Cr[WS(csr, k)] and Ci[WS(csi, k)] for k = 0..5.
 *
 *   R0, R1   : input arrays, advanced by ivs after every iteration.
 *   Cr, Ci   : output arrays, advanced by ovs after every iteration.
 *   rs       : input stride handed to WS().
 *   csr, csi : output strides handed to WS() for Cr and Ci.
 *   v        : iteration count.
 *
 * NOTE(review): FMA and FNMS come from the included headers; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm there.
 */
static void r2cfII_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name spells the leading digits of its value. */
DK(KP353553390, +0.353553390593273762200422181052424519642417969);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP612372435, +0.612372435695794524549321018676472847991486870);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(48, rs), MAKE_VOLATILE_STRIDE(48, csr), MAKE_VOLATILE_STRIDE(48, csi)) {
|
||||
E Tx, Tg, T4, Tz, Ty, Tj, TA, T9, Tm, Tl, Te, Tp, To, Tf, TE;
|
||||
E TF;
|
||||
/* All six R0 slots. */
{
|
||||
E T1, T3, T2, Th, Ti;
|
||||
T1 = R0[0];
|
||||
T3 = R0[WS(rs, 2)];
|
||||
T2 = R0[WS(rs, 4)];
|
||||
Tx = KP866025403 * (T2 + T3);
|
||||
Tg = FMA(KP500000000, T3 - T2, T1);
|
||||
T4 = T1 + T2 - T3;
|
||||
Tz = R0[WS(rs, 3)];
|
||||
Th = R0[WS(rs, 5)];
|
||||
Ti = R0[WS(rs, 1)];
|
||||
Ty = Th + Ti;
|
||||
Tj = KP866025403 * (Th - Ti);
|
||||
TA = FMA(KP500000000, Ty, Tz);
|
||||
}
|
||||
/* R1 slots 1, 5, 3. */
{
|
||||
E T5, T6, T7, T8;
|
||||
T5 = R1[WS(rs, 1)];
|
||||
T6 = R1[WS(rs, 5)];
|
||||
T7 = R1[WS(rs, 3)];
|
||||
T8 = T6 - T7;
|
||||
T9 = T5 + T8;
|
||||
Tm = KP612372435 * (T6 + T7);
|
||||
Tl = FNMS(KP353553390, T8, KP707106781 * T5);
|
||||
}
|
||||
/* R1 slots 4, 2, 0. */
{
|
||||
E Td, Ta, Tb, Tc;
|
||||
Td = R1[WS(rs, 4)];
|
||||
Ta = R1[WS(rs, 2)];
|
||||
Tb = R1[0];
|
||||
Tc = Ta - Tb;
|
||||
Te = Tc - Td;
|
||||
Tp = FMA(KP353553390, Tc, KP707106781 * Td);
|
||||
To = KP612372435 * (Ta + Tb);
|
||||
}
|
||||
/* Outputs Cr[1], Cr[4], Ci[4], Ci[1] (WS-strided slots). */
Tf = KP707106781 * (T9 + Te);
|
||||
Cr[WS(csr, 1)] = T4 - Tf;
|
||||
Cr[WS(csr, 4)] = T4 + Tf;
|
||||
TE = KP707106781 * (Te - T9);
|
||||
TF = Tz - Ty;
|
||||
Ci[WS(csi, 4)] = TE - TF;
|
||||
Ci[WS(csi, 1)] = TE + TF;
|
||||
/* Outputs Cr[5], Ci[2], Cr[0], Ci[3]. */
{
|
||||
E Tk, TB, Tr, Tw, Tn, Tq;
|
||||
Tk = Tg - Tj;
|
||||
TB = Tx - TA;
|
||||
Tn = Tl - Tm;
|
||||
Tq = To - Tp;
|
||||
Tr = Tn + Tq;
|
||||
Tw = Tn - Tq;
|
||||
Cr[WS(csr, 5)] = Tk - Tr;
|
||||
Ci[WS(csi, 2)] = Tw + TB;
|
||||
Cr[0] = Tk + Tr;
|
||||
Ci[WS(csi, 3)] = Tw - TB;
|
||||
}
|
||||
/* Outputs Cr[3], Ci[5], Cr[2], Ci[0]. */
{
|
||||
E Ts, TD, Tv, TC, Tt, Tu;
|
||||
Ts = Tg + Tj;
|
||||
TD = Tx + TA;
|
||||
Tt = To + Tp;
|
||||
Tu = Tm + Tl;
|
||||
Tv = Tt - Tu;
|
||||
TC = Tu + Tt;
|
||||
Cr[WS(csr, 3)] = Ts - Tv;
|
||||
Ci[WS(csi, 5)] = TD - TC;
|
||||
Cr[WS(csr, 2)] = Ts + Tv;
|
||||
Ci[0] = -(TC + TD);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA build of r2cfII_12, passed to kr2c_register
 * below.  The first three numbers of the block (39, 8, 4) equal the
 * additions / multiplications / fused multiply-add counts in the generated
 * header comment of this build. */
static const kr2c_desc desc = { 12, "r2cfII_12", { 39, 8, 4, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: hands the r2cfII_12 kernel of this build and its
 * descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_12) (planner *p) { X(kr2c_register) (p, r2cfII_12, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
297
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_15.c
Normal file
297
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_15.c
Normal file
@@ -0,0 +1,297 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cfII_15 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 41 FP multiplications,
|
||||
* (or, 38 additions, 7 multiplications, 34 fused multiply/add),
|
||||
* 42 stack variables, 12 constants, and 30 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_15 -- FMA build of the generated size-15 r2cfII codelet
 * (gen_r2cf, -n 15, -dft-II; see the "Generated by" line above).
 *
 * Runs v iterations.  Each one reads R0[WS(rs, k)] for k = 0..7 and
 * R1[WS(rs, k)] for k = 0..6, and writes Cr[WS(csr, k)] for k = 0..7 and
 * Ci[WS(csi, k)] for k = 0..6.
 *
 *   R0, R1   : input arrays, advanced by ivs after every iteration.
 *   Cr, Ci   : output arrays, advanced by ovs after every iteration.
 *   rs       : input stride handed to WS().
 *   csr, csi : output strides handed to WS() for Cr and Ci.
 *   v        : iteration count.
 *
 * NOTE(review): FMA and FNMS come from the included headers; presumably
 * FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b -- confirm there.
 */
static void r2cfII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name spells the leading digits of its value. */
DK(KP823639103, +0.823639103546331925877420039278190003029660514);
|
||||
DK(KP910592997, +0.910592997310029334643087372129977886038870291);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP690983005, +0.690983005625052575897706582817180941139845410);
|
||||
DK(KP447213595, +0.447213595499957939281834733746255247088123672);
|
||||
DK(KP552786404, +0.552786404500042060718165266253744752911876328);
|
||||
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
|
||||
E Ta, Tl, T1, T6, T7, TX, TT, T8, Tg, Th, TM, TZ, Tj, Tz, Tr;
|
||||
E Ts, TP, TY, Tu, TC;
|
||||
Ta = R0[WS(rs, 5)];
|
||||
Tl = R1[WS(rs, 2)];
|
||||
/* Group 1: R0 slots 0, 3, 6 and R1 slots 1, 4. */
{
|
||||
E T2, T5, T3, T4, TR, TS;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 3)];
|
||||
T5 = R1[WS(rs, 4)];
|
||||
T3 = R0[WS(rs, 6)];
|
||||
T4 = R1[WS(rs, 1)];
|
||||
TR = T2 + T5;
|
||||
TS = T3 + T4;
|
||||
T6 = T2 + T3 - T4 - T5;
|
||||
T7 = FNMS(KP250000000, T6, T1);
|
||||
TX = FNMS(KP618033988, TR, TS);
|
||||
TT = FMA(KP618033988, TS, TR);
|
||||
T8 = (T3 + T5 - T2) - T4;
|
||||
}
|
||||
/* Group 2: R0 slot 2 and R1 slots 0, 3, 6. */
{
|
||||
E Tf, TL, TK, Ti, Ty;
|
||||
{
|
||||
E Tb, Tc, Td, Te;
|
||||
Tb = R1[0];
|
||||
Tg = R0[WS(rs, 2)];
|
||||
Tc = R1[WS(rs, 3)];
|
||||
Td = R1[WS(rs, 6)];
|
||||
Te = Tc + Td;
|
||||
Tf = Tb - Te;
|
||||
TL = Tc - Td;
|
||||
Th = Tb + Te;
|
||||
TK = Tg + Tb;
|
||||
}
|
||||
TM = FMA(KP618033988, TL, TK);
|
||||
TZ = FNMS(KP618033988, TK, TL);
|
||||
Ti = FMA(KP809016994, Th, Tg);
|
||||
Tj = FNMS(KP552786404, Ti, Tf);
|
||||
Ty = FMA(KP447213595, Th, Tf);
|
||||
Tz = FNMS(KP690983005, Ty, Tg);
|
||||
}
|
||||
/* Group 3: R0 slots 1, 4, 7 and R1 slot 5. */
{
|
||||
E Tq, TO, TN, Tt, TB;
|
||||
{
|
||||
E Tm, Tn, To, Tp;
|
||||
Tm = R0[WS(rs, 7)];
|
||||
Tr = R1[WS(rs, 5)];
|
||||
Tn = R0[WS(rs, 1)];
|
||||
To = R0[WS(rs, 4)];
|
||||
Tp = Tn + To;
|
||||
Tq = Tm - Tp;
|
||||
TO = To - Tn;
|
||||
Ts = Tm + Tp;
|
||||
TN = Tr + Tm;
|
||||
}
|
||||
TP = FMA(KP618033988, TO, TN);
|
||||
TY = FNMS(KP618033988, TN, TO);
|
||||
Tt = FMA(KP809016994, Ts, Tr);
|
||||
Tu = FNMS(KP552786404, Tt, Tq);
|
||||
TB = FMA(KP447213595, Ts, Tq);
|
||||
TC = FNMS(KP690983005, TB, Tr);
|
||||
}
|
||||
/* Outputs Cr[2], Ci[2], Cr[7] (WS-strided slots). */
{
|
||||
E TF, TG, TH, TI;
|
||||
TF = T1 + T6;
|
||||
TG = Ts - Tr - Tl;
|
||||
TH = Ta + Tg - Th;
|
||||
TI = TG + TH;
|
||||
Cr[WS(csr, 2)] = FNMS(KP500000000, TI, TF);
|
||||
Ci[WS(csi, 2)] = KP866025403 * (TH - TG);
|
||||
Cr[WS(csr, 7)] = TF + TI;
|
||||
}
|
||||
/* Outputs Cr[1], Ci[1], Ci[3], Ci[6], Cr[3], Cr[6]. */
{
|
||||
E Tx, T14, T10, T11, TE, T12, TA, TD, T13;
|
||||
Tx = FMA(KP559016994, T8, T7);
|
||||
T14 = TZ - TY;
|
||||
T10 = TY + TZ;
|
||||
T11 = FMA(KP500000000, T10, TX);
|
||||
TA = FNMS(KP809016994, Tz, Ta);
|
||||
TD = FNMS(KP809016994, TC, Tl);
|
||||
TE = TA - TD;
|
||||
T12 = TD + TA;
|
||||
Cr[WS(csr, 1)] = Tx + TE;
|
||||
Ci[WS(csi, 1)] = KP951056516 * (T10 - TX);
|
||||
Ci[WS(csi, 3)] = KP951056516 * (FNMS(KP910592997, T12, T11));
|
||||
Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP910592997, T12, T11)));
|
||||
T13 = FNMS(KP500000000, TE, Tx);
|
||||
Cr[WS(csr, 3)] = FNMS(KP823639103, T14, T13);
|
||||
Cr[WS(csr, 6)] = FMA(KP823639103, T14, T13);
|
||||
}
|
||||
/* Outputs Cr[4], Ci[4], Ci[0], Ci[5], Cr[5], Cr[0]. */
{
|
||||
E T9, TQ, TU, TV, Tw, TW, Tk, Tv, TJ;
|
||||
T9 = FNMS(KP559016994, T8, T7);
|
||||
TQ = TM - TP;
|
||||
TU = TP + TM;
|
||||
TV = FMA(KP500000000, TU, TT);
|
||||
Tk = FNMS(KP559016994, Tj, Ta);
|
||||
Tv = FNMS(KP559016994, Tu, Tl);
|
||||
Tw = Tk - Tv;
|
||||
TW = Tv + Tk;
|
||||
Cr[WS(csr, 4)] = T9 + Tw;
|
||||
Ci[WS(csi, 4)] = KP951056516 * (TT - TU);
|
||||
Ci[0] = -(KP951056516 * (FMA(KP910592997, TW, TV)));
|
||||
Ci[WS(csi, 5)] = -(KP951056516 * (FNMS(KP910592997, TW, TV)));
|
||||
TJ = FNMS(KP500000000, Tw, T9);
|
||||
Cr[WS(csr, 5)] = FNMS(KP823639103, TQ, TJ);
|
||||
Cr[0] = FMA(KP823639103, TQ, TJ);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA build of r2cfII_15, passed to kr2c_register below.
 * The first three numbers of the block (38, 7, 34) equal the additions /
 * multiplications / fused multiply-add counts in the generated header
 * comment of this build. */
static const kr2c_desc desc = { 15, "r2cfII_15", { 38, 7, 34, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: hands the r2cfII_15 kernel of this build and its
 * descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_15) (planner *p) { X(kr2c_register) (p, r2cfII_15, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cfII_15 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 33 FP multiplications,
|
||||
* (or, 54 additions, 15 multiplications, 18 fused multiply/add),
|
||||
* 37 stack variables, 8 constants, and 30 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_15 -- non-FMA (#else) build of the generated size-15 r2cfII codelet
 * (gen_r2cf, -n 15, -dft-II; see the "Generated by" line above).
 *
 * Runs v iterations.  Each one reads R0[WS(rs, k)] for k = 0..7 and
 * R1[WS(rs, k)] for k = 0..6, and writes Cr[WS(csr, k)] for k = 0..7 and
 * Ci[WS(csi, k)] for k = 0..6.
 *
 *   R0, R1   : input arrays, advanced by ivs after every iteration.
 *   Cr, Ci   : output arrays, advanced by ovs after every iteration.
 *   rs       : input stride handed to WS().
 *   csr, csi : output strides handed to WS() for Cr and Ci.
 *   v        : iteration count.
 *
 * NOTE(review): FMA, FNMS, FMS and FNMA come from the included headers;
 * presumably FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b,
 * FMS(a,b,c) = a*b - c and FNMA(a,b,c) = -(a*b) - c -- confirm there.
 */
static void r2cfII_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name spells the leading digits of its value. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
|
||||
DK(KP309016994, +0.309016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
|
||||
E T1, T2, Tx, TR, TE, T7, TD, Th, Tm, Tr, TQ, TA, TB, Tf, Te;
|
||||
E Tu, TS, Td, TH, TO;
|
||||
T1 = R0[WS(rs, 5)];
|
||||
/* Group A: R0 slot 2 and R1 slots 0, 3, 6. */
{
|
||||
E T3, Tv, T6, Tw, T4, T5;
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T3 = R1[0];
|
||||
Tv = T2 + T3;
|
||||
T4 = R1[WS(rs, 3)];
|
||||
T5 = R1[WS(rs, 6)];
|
||||
T6 = T4 + T5;
|
||||
Tw = T4 - T5;
|
||||
Tx = FMA(KP951056516, Tv, KP587785252 * Tw);
|
||||
TR = FNMS(KP587785252, Tv, KP951056516 * Tw);
|
||||
TE = KP559016994 * (T3 - T6);
|
||||
T7 = T3 + T6;
|
||||
TD = KP250000000 * T7;
|
||||
}
|
||||
/* Group B: R0 slots 0, 3, 6 and R1 slots 1, 4. */
{
|
||||
E Ti, Tl, Tj, Tk, Tp, Tq;
|
||||
Th = R0[0];
|
||||
Ti = R1[WS(rs, 4)];
|
||||
Tl = R0[WS(rs, 6)];
|
||||
Tj = R1[WS(rs, 1)];
|
||||
Tk = R0[WS(rs, 3)];
|
||||
Tp = Tk + Ti;
|
||||
Tq = Tl + Tj;
|
||||
Tm = Ti + Tj - (Tk + Tl);
|
||||
Tr = FMA(KP951056516, Tp, KP587785252 * Tq);
|
||||
TQ = FNMS(KP951056516, Tq, KP587785252 * Tp);
|
||||
TA = FMA(KP250000000, Tm, Th);
|
||||
TB = KP559016994 * (Tl + Ti - (Tk + Tj));
|
||||
}
|
||||
/* Group C: R0 slots 1, 4, 7 and R1 slots 2, 5. */
{
|
||||
E T9, Tt, Tc, Ts, Ta, Tb, TG;
|
||||
Tf = R1[WS(rs, 2)];
|
||||
T9 = R0[WS(rs, 7)];
|
||||
Te = R1[WS(rs, 5)];
|
||||
Tt = T9 + Te;
|
||||
Ta = R0[WS(rs, 1)];
|
||||
Tb = R0[WS(rs, 4)];
|
||||
Tc = Ta + Tb;
|
||||
Ts = Ta - Tb;
|
||||
Tu = FNMS(KP951056516, Tt, KP587785252 * Ts);
|
||||
TS = FMA(KP951056516, Ts, KP587785252 * Tt);
|
||||
Td = T9 + Tc;
|
||||
TG = KP559016994 * (T9 - Tc);
|
||||
TH = FNMS(KP309016994, Te, TG) + FNMA(KP250000000, Td, Tf);
|
||||
TO = FMS(KP809016994, Te, Tf) + FNMA(KP250000000, Td, TG);
|
||||
}
|
||||
/* Outputs Ci[2], Cr[2], Cr[7] (WS-strided slots). */
{
|
||||
E Tn, T8, Tg, To;
|
||||
Tn = Th - Tm;
|
||||
T8 = T1 + T2 - T7;
|
||||
Tg = Td - Te - Tf;
|
||||
To = T8 + Tg;
|
||||
Ci[WS(csi, 2)] = KP866025403 * (T8 - Tg);
|
||||
Cr[WS(csr, 2)] = FNMS(KP500000000, To, Tn);
|
||||
Cr[WS(csr, 7)] = Tn + To;
|
||||
}
|
||||
/* Outputs Cr[1], Ci[1], Ci[6], Ci[3], Cr[3], Cr[6]. */
{
|
||||
E TM, TX, TT, TV, TP, TU, TN, TW;
|
||||
TM = TB + TA;
|
||||
TX = KP866025403 * (TR + TS);
|
||||
TT = TR - TS;
|
||||
TV = FMS(KP500000000, TT, TQ);
|
||||
TN = T1 + TE + FNMS(KP809016994, T2, TD);
|
||||
TP = TN + TO;
|
||||
TU = KP866025403 * (TO - TN);
|
||||
Cr[WS(csr, 1)] = TM + TP;
|
||||
Ci[WS(csi, 1)] = TQ + TT;
|
||||
Ci[WS(csi, 6)] = TU - TV;
|
||||
Ci[WS(csi, 3)] = TU + TV;
|
||||
TW = FNMS(KP500000000, TP, TM);
|
||||
Cr[WS(csr, 3)] = TW - TX;
|
||||
Cr[WS(csr, 6)] = TW + TX;
|
||||
}
|
||||
/* Outputs Ci[4], Cr[4], Ci[5], Ci[0], Cr[0], Cr[5]. */
{
|
||||
E Tz, TC, Ty, TK, TI, TL, TF, TJ;
|
||||
Tz = KP866025403 * (Tx + Tu);
|
||||
TC = TA - TB;
|
||||
Ty = Tu - Tx;
|
||||
TK = FMS(KP500000000, Ty, Tr);
|
||||
TF = FMA(KP309016994, T2, T1) + TD - TE;
|
||||
TI = TF + TH;
|
||||
TL = KP866025403 * (TH - TF);
|
||||
Ci[WS(csi, 4)] = Tr + Ty;
|
||||
Cr[WS(csr, 4)] = TC + TI;
|
||||
Ci[WS(csi, 5)] = TK - TL;
|
||||
Ci[0] = TK + TL;
|
||||
TJ = FNMS(KP500000000, TI, TC);
|
||||
Cr[0] = Tz + TJ;
|
||||
Cr[WS(csr, 5)] = TJ - Tz;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA build of r2cfII_15, passed to kr2c_register
 * below.  The first three numbers of the block (54, 15, 18) equal the
 * additions / multiplications / fused multiply-add counts in the generated
 * header comment of this build. */
static const kr2c_desc desc = { 15, "r2cfII_15", { 54, 15, 18, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_15) (planner *p) { X(kr2c_register) (p, r2cfII_15, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
312
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_16.c
Normal file
312
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_16.c
Normal file
@@ -0,0 +1,312 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 48 FP multiplications,
|
||||
* (or, 18 additions, 0 multiplications, 48 fused multiply/add),
|
||||
* 32 stack variables, 7 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_16 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Size-16 kernel emitted by gen_r2cf with "-n 16 -dft-II" (see the
 * "Generated by" comment above).  Each pass of the loop reads R0[0..7] and
 * R1[0..7] (element k addressed as WS(rs, k), element 0 directly) and
 * stores Cr[0..7] and Ci[0..7] (addressed through WS(csr, k) / WS(csi, k)).
 *
 *   R0, R1        input arrays
 *   Cr, Ci        output arrays
 *   rs, csr, csi  strides given to WS() for the inputs, Cr and Ci
 *   v             number of passes of the loop
 *   ivs, ovs      added to R0/R1 and to Cr/Ci after every pass
 *
 * NOTE(review): DK, E, WS, FMA, FMS and FNMS come from the included
 * headers and are not visible here; comparison with the non-FMA variant
 * suggests FMA(a,b,c) = a*b + c, FMS(a,b,c) = a*b - c and
 * FNMS(a,b,c) = c - a*b -- confirm in the header.
 */
static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name carries the leading digits of its value. */
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
DK(KP198912367, +0.198912367379658006911597622644676228597850501);
|
||||
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
DK(KP668178637, +0.668178637919298919997757686523080761552472251);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per pass; the update clause advances all four pointers. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
|
||||
/* Intermediates shared between the load stages and the output stages. */
E T5, TZ, TB, TT, Tr, TK, Tu, TJ, Ti, TH, Tl, TG, Tc, T10, TE;
|
||||
E TU;
|
||||
/* Stage 1: R0[0], R0[4], R0[2], R0[6] -> T5, TZ, TB, TT. */
{
|
||||
E T1, TR, T4, TS, T2, T3;
|
||||
T1 = R0[0];
|
||||
TR = R0[WS(rs, 4)];
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T3 = R0[WS(rs, 6)];
|
||||
T4 = T2 - T3;
|
||||
TS = T2 + T3;
|
||||
T5 = FNMS(KP707106781, T4, T1);
|
||||
TZ = FNMS(KP707106781, TS, TR);
|
||||
TB = FMA(KP707106781, T4, T1);
|
||||
TT = FMA(KP707106781, TS, TR);
|
||||
}
|
||||
/* Stage 2: R1[7], R1[3], R1[1], R1[5] -> Tr, TK, Tu, TJ. */
{
|
||||
E Tn, Ts, Tq, Tt, To, Tp;
|
||||
Tn = R1[WS(rs, 7)];
|
||||
Ts = R1[WS(rs, 3)];
|
||||
To = R1[WS(rs, 1)];
|
||||
Tp = R1[WS(rs, 5)];
|
||||
Tq = To - Tp;
|
||||
Tt = To + Tp;
|
||||
Tr = FMA(KP707106781, Tq, Tn);
|
||||
TK = FMA(KP707106781, Tt, Ts);
|
||||
Tu = FNMS(KP707106781, Tt, Ts);
|
||||
TJ = FMS(KP707106781, Tq, Tn);
|
||||
}
|
||||
/* Stage 3: R1[0], R1[4], R1[2], R1[6] -> Ti, TH, Tl, TG. */
{
|
||||
E Te, Tj, Th, Tk, Tf, Tg;
|
||||
Te = R1[0];
|
||||
Tj = R1[WS(rs, 4)];
|
||||
Tf = R1[WS(rs, 2)];
|
||||
Tg = R1[WS(rs, 6)];
|
||||
Th = Tf - Tg;
|
||||
Tk = Tf + Tg;
|
||||
Ti = FNMS(KP707106781, Th, Te);
|
||||
TH = FMA(KP707106781, Tk, Tj);
|
||||
Tl = FNMS(KP707106781, Tk, Tj);
|
||||
TG = FMA(KP707106781, Th, Te);
|
||||
}
|
||||
/* Stage 4: R0[5], R0[1], R0[3], R0[7] -> Tc, T10, TE, TU. */
{
|
||||
E T8, TC, Tb, TD;
|
||||
{
|
||||
E T6, T7, T9, Ta;
|
||||
T6 = R0[WS(rs, 5)];
|
||||
T7 = R0[WS(rs, 1)];
|
||||
T8 = FMA(KP414213562, T7, T6);
|
||||
TC = FNMS(KP414213562, T6, T7);
|
||||
T9 = R0[WS(rs, 3)];
|
||||
Ta = R0[WS(rs, 7)];
|
||||
Tb = FMA(KP414213562, Ta, T9);
|
||||
TD = FMS(KP414213562, T9, Ta);
|
||||
}
|
||||
Tc = T8 - Tb;
|
||||
T10 = TD - TC;
|
||||
TE = TC + TD;
|
||||
TU = T8 + Tb;
|
||||
}
|
||||
/* Outputs Cr[6], Ci[5], Cr[1], Ci[2]. */
{
|
||||
E Td, T13, Tw, T14, Tm, Tv;
|
||||
Td = FMA(KP923879532, Tc, T5);
|
||||
T13 = FNMS(KP923879532, T10, TZ);
|
||||
Tm = FMA(KP668178637, Tl, Ti);
|
||||
Tv = FMA(KP668178637, Tu, Tr);
|
||||
Tw = Tm - Tv;
|
||||
T14 = Tm + Tv;
|
||||
Cr[WS(csr, 6)] = FNMS(KP831469612, Tw, Td);
|
||||
Ci[WS(csi, 5)] = FNMS(KP831469612, T14, T13);
|
||||
Cr[WS(csr, 1)] = FMA(KP831469612, Tw, Td);
|
||||
Ci[WS(csi, 2)] = -(FMA(KP831469612, T14, T13));
|
||||
}
|
||||
/* Outputs Cr[5], Ci[1], Cr[2], Ci[6]. */
{
|
||||
E Tx, T11, TA, T12, Ty, Tz;
|
||||
Tx = FNMS(KP923879532, Tc, T5);
|
||||
T11 = FMA(KP923879532, T10, TZ);
|
||||
Ty = FNMS(KP668178637, Tr, Tu);
|
||||
Tz = FNMS(KP668178637, Ti, Tl);
|
||||
TA = Ty - Tz;
|
||||
T12 = Tz + Ty;
|
||||
Cr[WS(csr, 5)] = FNMS(KP831469612, TA, Tx);
|
||||
Ci[WS(csi, 1)] = FMA(KP831469612, T12, T11);
|
||||
Cr[WS(csr, 2)] = FMA(KP831469612, TA, Tx);
|
||||
Ci[WS(csi, 6)] = FMS(KP831469612, T12, T11);
|
||||
}
|
||||
/* Outputs Cr[7], Ci[3], Cr[0], Ci[4]. */
{
|
||||
E TF, TX, TM, TY, TI, TL;
|
||||
TF = FMA(KP923879532, TE, TB);
|
||||
TX = FNMS(KP923879532, TU, TT);
|
||||
TI = FNMS(KP198912367, TH, TG);
|
||||
TL = FMA(KP198912367, TK, TJ);
|
||||
TM = TI + TL;
|
||||
TY = TL - TI;
|
||||
Cr[WS(csr, 7)] = FNMS(KP980785280, TM, TF);
|
||||
Ci[WS(csi, 3)] = FMA(KP980785280, TY, TX);
|
||||
Cr[0] = FMA(KP980785280, TM, TF);
|
||||
Ci[WS(csi, 4)] = FMS(KP980785280, TY, TX);
|
||||
}
|
||||
/* Outputs Cr[4], Ci[7], Cr[3], Ci[0]. */
{
|
||||
E TN, TV, TQ, TW, TO, TP;
|
||||
TN = FNMS(KP923879532, TE, TB);
|
||||
TV = FMA(KP923879532, TU, TT);
|
||||
TO = FMA(KP198912367, TG, TH);
|
||||
TP = FNMS(KP198912367, TJ, TK);
|
||||
TQ = TO - TP;
|
||||
TW = TO + TP;
|
||||
Cr[WS(csr, 4)] = FNMS(KP980785280, TQ, TN);
|
||||
Ci[WS(csi, 7)] = FNMS(KP980785280, TW, TV);
|
||||
Cr[WS(csr, 3)] = FMA(KP980785280, TQ, TN);
|
||||
Ci[0] = -(FMA(KP980785280, TW, TV));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 16, "r2cfII_16", { 18, 0, 48, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_16) (planner *p) { X(kr2c_register) (p, r2cfII_16, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cfII_16 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 30 FP multiplications,
|
||||
* (or, 54 additions, 18 multiplications, 12 fused multiply/add),
|
||||
* 32 stack variables, 7 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_16 -- variant in the #else branch, i.e. compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Size-16 kernel emitted by gen_r2cf with "-n 16 -dft-II" and without
 * "-fma" (see the "Generated by" comment above).  Each pass of the loop
 * reads R0[0..7] and R1[0..7] (element k addressed as WS(rs, k), element 0
 * directly) and stores Cr[0..7] and Ci[0..7] (addressed through
 * WS(csr, k) / WS(csi, k)).
 *
 *   R0, R1        input arrays
 *   Cr, Ci        output arrays
 *   rs, csr, csi  strides given to WS() for the inputs, Cr and Ci
 *   v             number of passes of the loop
 *   ivs, ovs      added to R0/R1 and to Cr/Ci after every pass
 *
 * NOTE(review): DK, E, WS, FMA and FNMS come from the included headers and
 * are not visible here; presumably FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- confirm in the header.
 */
static void r2cfII_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name carries the leading digits of its value. */
DK(KP555570233, +0.555570233019602224742830813948532874374937191);
|
||||
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
DK(KP195090322, +0.195090322016128267848284868477022240927691618);
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per pass; the update clause advances all four pointers. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
|
||||
/* Intermediates shared between the load stages and the output stages. */
E T5, T11, TB, TV, Tr, TK, Tu, TJ, Ti, TH, Tl, TG, Tc, T10, TE;
|
||||
E TS;
|
||||
/* Stage 1: R0[0], R0[4], R0[2], R0[6] -> T5, T11, TB, TV. */
{
|
||||
E T1, TU, T4, TT, T2, T3;
|
||||
T1 = R0[0];
|
||||
TU = R0[WS(rs, 4)];
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T3 = R0[WS(rs, 6)];
|
||||
T4 = KP707106781 * (T2 - T3);
|
||||
TT = KP707106781 * (T2 + T3);
|
||||
T5 = T1 + T4;
|
||||
T11 = TU - TT;
|
||||
TB = T1 - T4;
|
||||
TV = TT + TU;
|
||||
}
|
||||
/* Stage 2: R1[7], R1[3], R1[1], R1[5] -> Tr, TK, Tu, TJ. */
{
|
||||
E Tq, Tt, Tp, Ts, Tn, To;
|
||||
Tq = R1[WS(rs, 7)];
|
||||
Tt = R1[WS(rs, 3)];
|
||||
Tn = R1[WS(rs, 1)];
|
||||
To = R1[WS(rs, 5)];
|
||||
Tp = KP707106781 * (Tn - To);
|
||||
Ts = KP707106781 * (Tn + To);
|
||||
Tr = Tp - Tq;
|
||||
TK = Tt - Ts;
|
||||
Tu = Ts + Tt;
|
||||
TJ = Tp + Tq;
|
||||
}
|
||||
/* Stage 3: R1[0], R1[4], R1[2], R1[6] -> Ti, TH, Tl, TG. */
{
|
||||
E Te, Tk, Th, Tj, Tf, Tg;
|
||||
Te = R1[0];
|
||||
Tk = R1[WS(rs, 4)];
|
||||
Tf = R1[WS(rs, 2)];
|
||||
Tg = R1[WS(rs, 6)];
|
||||
Th = KP707106781 * (Tf - Tg);
|
||||
Tj = KP707106781 * (Tf + Tg);
|
||||
Ti = Te + Th;
|
||||
TH = Tk - Tj;
|
||||
Tl = Tj + Tk;
|
||||
TG = Te - Th;
|
||||
}
|
||||
/* Stage 4: R0[1], R0[5], R0[3], R0[7] -> Tc, T10, TE, TS. */
{
|
||||
E T8, TC, Tb, TD;
|
||||
{
|
||||
E T6, T7, T9, Ta;
|
||||
T6 = R0[WS(rs, 1)];
|
||||
T7 = R0[WS(rs, 5)];
|
||||
T8 = FNMS(KP382683432, T7, KP923879532 * T6);
|
||||
TC = FMA(KP382683432, T6, KP923879532 * T7);
|
||||
T9 = R0[WS(rs, 3)];
|
||||
Ta = R0[WS(rs, 7)];
|
||||
Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
|
||||
TD = FMA(KP923879532, T9, KP382683432 * Ta);
|
||||
}
|
||||
Tc = T8 + Tb;
|
||||
T10 = Tb - T8;
|
||||
TE = TC - TD;
|
||||
TS = TC + TD;
|
||||
}
|
||||
/* Outputs Cr[4], Ci[7], Cr[3], Ci[0]. */
{
|
||||
E Td, TW, Tw, TR, Tm, Tv;
|
||||
Td = T5 - Tc;
|
||||
TW = TS + TV;
|
||||
Tm = FMA(KP195090322, Ti, KP980785280 * Tl);
|
||||
Tv = FNMS(KP980785280, Tu, KP195090322 * Tr);
|
||||
Tw = Tm + Tv;
|
||||
TR = Tv - Tm;
|
||||
Cr[WS(csr, 4)] = Td - Tw;
|
||||
Ci[WS(csi, 7)] = TR + TW;
|
||||
Cr[WS(csr, 3)] = Td + Tw;
|
||||
Ci[0] = TR - TW;
|
||||
}
|
||||
/* Outputs Cr[7], Ci[3], Cr[0], Ci[4]. */
{
|
||||
E Tx, TY, TA, TX, Ty, Tz;
|
||||
Tx = T5 + Tc;
|
||||
TY = TV - TS;
|
||||
Ty = FNMS(KP195090322, Tl, KP980785280 * Ti);
|
||||
Tz = FMA(KP980785280, Tr, KP195090322 * Tu);
|
||||
TA = Ty + Tz;
|
||||
TX = Tz - Ty;
|
||||
Cr[WS(csr, 7)] = Tx - TA;
|
||||
Ci[WS(csi, 3)] = TX + TY;
|
||||
Cr[0] = Tx + TA;
|
||||
Ci[WS(csi, 4)] = TX - TY;
|
||||
}
|
||||
/* Outputs Cr[6], Ci[2], Cr[1], Ci[5]. */
{
|
||||
E TF, T12, TM, TZ, TI, TL;
|
||||
TF = TB + TE;
|
||||
T12 = T10 - T11;
|
||||
TI = FMA(KP831469612, TG, KP555570233 * TH);
|
||||
TL = FMA(KP831469612, TJ, KP555570233 * TK);
|
||||
TM = TI - TL;
|
||||
TZ = TI + TL;
|
||||
Cr[WS(csr, 6)] = TF - TM;
|
||||
Ci[WS(csi, 2)] = T12 - TZ;
|
||||
Cr[WS(csr, 1)] = TF + TM;
|
||||
Ci[WS(csi, 5)] = -(TZ + T12);
|
||||
}
|
||||
/* Outputs Cr[5], Ci[1], Cr[2], Ci[6]. */
{
|
||||
E TN, T14, TQ, T13, TO, TP;
|
||||
TN = TB - TE;
|
||||
T14 = T10 + T11;
|
||||
TO = FNMS(KP555570233, TJ, KP831469612 * TK);
|
||||
TP = FNMS(KP555570233, TG, KP831469612 * TH);
|
||||
TQ = TO - TP;
|
||||
T13 = TP + TO;
|
||||
Cr[WS(csr, 5)] = TN - TQ;
|
||||
Ci[WS(csi, 1)] = T13 + T14;
|
||||
Cr[WS(csr, 2)] = TN + TQ;
|
||||
Ci[WS(csi, 6)] = T13 - T14;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 16, "r2cfII_16", { 54, 18, 12, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_16) (planner *p) { X(kr2c_register) (p, r2cfII_16, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
86
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_2.c
Normal file
86
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_2.c
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cfII_2 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 0 FP additions, 0 FP multiplications,
|
||||
* (or, 0 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 3 stack variables, 0 constants, and 4 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_2 -- two-point kernel (FMA branch of the #if).
 *
 * On each of the v passes it copies R0[0] into Cr[0] and stores the
 * negation of R1[0] in Ci[0]; the sign flip is the only arithmetic.
 *
 *   R0, R1        input arrays (only element 0 of each is read per pass)
 *   Cr, Ci        output arrays (only element 0 of each is written per pass)
 *   rs, csr, csi  strides; used here only as MAKE_VOLATILE_STRIDE arguments
 *   v             number of passes of the loop
 *   ivs, ovs      added to R0/R1 and to Cr/Ci after every pass
 */
static void r2cfII_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v;
          remaining > 0;
          --remaining, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
          MAKE_VOLATILE_STRIDE(8, rs),
          MAKE_VOLATILE_STRIDE(8, csr),
          MAKE_VOLATILE_STRIDE(8, csi)) {
          /* Both inputs are read before either output is written. */
          const E in0 = R0[0];
          const E in1 = R1[0];

          Cr[0] = in0;
          Ci[0] = -in1;
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 2, "r2cfII_2", { 0, 0, 0, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_2) (planner *p) { X(kr2c_register) (p, r2cfII_2, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cfII_2 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 0 FP additions, 0 FP multiplications,
|
||||
* (or, 0 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 3 stack variables, 0 constants, and 4 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_2 -- two-point kernel (#else branch, used when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * On each of the v passes it copies R0[0] into Cr[0] and stores the
 * negation of R1[0] in Ci[0]; the sign flip is the only arithmetic.
 *
 *   R0, R1        input arrays (only element 0 of each is read per pass)
 *   Cr, Ci        output arrays (only element 0 of each is written per pass)
 *   rs, csr, csi  strides; used here only as MAKE_VOLATILE_STRIDE arguments
 *   v             number of passes of the loop
 *   ivs, ovs      added to R0/R1 and to Cr/Ci after every pass
 */
static void r2cfII_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v;
          remaining > 0;
          --remaining, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
          MAKE_VOLATILE_STRIDE(8, rs),
          MAKE_VOLATILE_STRIDE(8, csr),
          MAKE_VOLATILE_STRIDE(8, csi)) {
          /* Both inputs are read before either output is written. */
          const E in0 = R0[0];
          const E in1 = R1[0];

          Cr[0] = in0;
          Ci[0] = -in1;
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 2, "r2cfII_2", { 0, 0, 0, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_2) (planner *p) { X(kr2c_register) (p, r2cfII_2, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
394
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_20.c
Normal file
394
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_20.c
Normal file
@@ -0,0 +1,394 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:28 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cfII_20 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 63 FP multiplications,
|
||||
* (or, 39 additions, 0 multiplications, 63 fused multiply/add),
|
||||
* 53 stack variables, 10 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_20 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Size-20 kernel emitted by gen_r2cf with "-n 20 -dft-II" (see the
 * "Generated by" comment above).  Each pass of the loop reads R0[0..9] and
 * R1[0..9] (element k addressed as WS(rs, k), element 0 directly) and
 * stores Cr[0..9] and Ci[0..9] (addressed through WS(csr, k) / WS(csi, k)).
 *
 *   R0, R1        input arrays
 *   Cr, Ci        output arrays
 *   rs, csr, csi  strides given to WS() for the inputs, Cr and Ci
 *   v             number of passes of the loop
 *   ivs, ovs      added to R0/R1 and to Cr/Ci after every pass
 *
 * NOTE(review): DK, E, WS, FMA, FMS and FNMS come from the included
 * headers and are not visible here; comparison with the non-FMA variant
 * suggests FMA(a,b,c) = a*b + c, FMS(a,b,c) = a*b - c and
 * FNMS(a,b,c) = c - a*b -- confirm in the header.
 */
static void r2cfII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name carries the leading digits of its value. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP690983005, +0.690983005625052575897706582817180941139845410);
|
||||
DK(KP447213595, +0.447213595499957939281834733746255247088123672);
|
||||
DK(KP552786404, +0.552786404500042060718165266253744752911876328);
|
||||
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP381966011, +0.381966011250105151795413165634361882279690820);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per pass; the update clause advances all four pointers. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
|
||||
/* Intermediates shared between the load stages and the output stages. */
E Ti, T1d, T1f, T1e, Tg, T1p, TS, T1g, T1, T6, T7, T1r, T1k, T8, To;
|
||||
E Tp, Tv, TX, Tr, TV, Tx, TF, TC, TD, T12, TG, TK, T10, Tc, Tf;
|
||||
/* R1[2] and R0[5] are used on their own, outside the groups below. */
Ti = R1[WS(rs, 2)];
|
||||
T1d = R0[WS(rs, 5)];
|
||||
/* Stage 1: R0[9], R0[1], R0[3], R0[7] -> Tc, T1f, Tf, T1e. */
{
|
||||
E Ta, Tb, Td, Te;
|
||||
Ta = R0[WS(rs, 9)];
|
||||
Tb = R0[WS(rs, 1)];
|
||||
Tc = Ta - Tb;
|
||||
T1f = Ta + Tb;
|
||||
Td = R0[WS(rs, 3)];
|
||||
Te = R0[WS(rs, 7)];
|
||||
Tf = Td - Te;
|
||||
T1e = Td + Te;
|
||||
}
|
||||
Tg = FNMS(KP618033988, Tf, Tc);
|
||||
T1p = FMA(KP381966011, T1e, T1f);
|
||||
TS = FMA(KP618033988, Tc, Tf);
|
||||
T1g = FMA(KP381966011, T1f, T1e);
|
||||
/* Stage 2: R0[0], R0[4], R0[6], R0[8], R0[2] -> T1, T6, T7, T1r, T1k, T8. */
{
|
||||
E T2, T5, T3, T4, T1i, T1j;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 4)];
|
||||
T5 = R0[WS(rs, 6)];
|
||||
T3 = R0[WS(rs, 8)];
|
||||
T4 = R0[WS(rs, 2)];
|
||||
T1i = T2 + T5;
|
||||
T1j = T3 + T4;
|
||||
T6 = T2 + T3 - T4 - T5;
|
||||
T7 = FNMS(KP250000000, T6, T1);
|
||||
T1r = FNMS(KP618033988, T1i, T1j);
|
||||
T1k = FMA(KP618033988, T1j, T1i);
|
||||
T8 = (T3 + T5 - T2) - T4;
|
||||
}
|
||||
/* Stage 3: R1[8], R1[6], R1[0], R1[4] -> To, Tp, Tv, TX, Tr, TV. */
{
|
||||
E Tn, Tu, Tt, Tq, TU;
|
||||
{
|
||||
E Tj, Tk, Tl, Tm;
|
||||
Tj = R1[WS(rs, 8)];
|
||||
To = R1[WS(rs, 6)];
|
||||
Tk = R1[0];
|
||||
Tl = R1[WS(rs, 4)];
|
||||
Tm = Tk + Tl;
|
||||
Tn = Tj - Tm;
|
||||
Tu = Tk - Tl;
|
||||
Tp = Tj + Tm;
|
||||
Tt = To + Tj;
|
||||
}
|
||||
Tv = FNMS(KP618033988, Tu, Tt);
|
||||
TX = FMA(KP618033988, Tt, Tu);
|
||||
Tq = FMA(KP809016994, Tp, To);
|
||||
Tr = FNMS(KP552786404, Tq, Tn);
|
||||
TU = FMA(KP447213595, Tp, Tn);
|
||||
TV = FNMS(KP690983005, TU, To);
|
||||
}
|
||||
/* Stage 4: R1[7], R1[1], R1[3], R1[5], R1[9] -> Tx, TF, TC, TD, T12, TG, TK, T10. */
{
|
||||
E TJ, TE, TI, TZ;
|
||||
Tx = R1[WS(rs, 7)];
|
||||
{
|
||||
E Ty, Tz, TA, TB;
|
||||
Ty = R1[WS(rs, 1)];
|
||||
TF = R1[WS(rs, 3)];
|
||||
Tz = R1[WS(rs, 5)];
|
||||
TA = R1[WS(rs, 9)];
|
||||
TB = Tz + TA;
|
||||
TC = Ty + TB;
|
||||
TJ = Tz - TA;
|
||||
TE = Ty - TB;
|
||||
TI = TF + Ty;
|
||||
}
|
||||
TD = FMA(KP250000000, TC, Tx);
|
||||
T12 = FNMS(KP618033988, TI, TJ);
|
||||
TG = FNMS(KP552786404, TF, TE);
|
||||
TK = FMA(KP618033988, TJ, TI);
|
||||
TZ = FMA(KP447213595, TC, TE);
|
||||
T10 = FNMS(KP690983005, TZ, TF);
|
||||
}
|
||||
/* Outputs Cr[2], Ci[2], Cr[7], Ci[7]. */
{
|
||||
E T19, T1w, T1c, T1x, T1a, T1b;
|
||||
T19 = T1 + T6;
|
||||
T1w = T1f + T1d - T1e;
|
||||
T1a = Ti + To - Tp;
|
||||
T1b = TC - TF - Tx;
|
||||
T1c = T1a + T1b;
|
||||
T1x = T1a - T1b;
|
||||
Cr[WS(csr, 2)] = FNMS(KP707106781, T1c, T19);
|
||||
Ci[WS(csi, 2)] = FMS(KP707106781, T1x, T1w);
|
||||
Cr[WS(csr, 7)] = FMA(KP707106781, T1c, T19);
|
||||
Ci[WS(csi, 7)] = FMA(KP707106781, T1x, T1w);
|
||||
}
|
||||
/* Outputs Cr[6], Cr[3], Ci[6], Ci[3], Cr[8], Cr[1], Ci[8], Ci[1]. */
{
|
||||
E TT, T15, T1s, T1u, TY, T17, T13, T16;
|
||||
{
|
||||
E TR, T1q, TW, T11;
|
||||
TR = FMA(KP559016994, T8, T7);
|
||||
TT = FMA(KP951056516, TS, TR);
|
||||
T15 = FNMS(KP951056516, TS, TR);
|
||||
T1q = FNMS(KP809016994, T1p, T1d);
|
||||
T1s = FNMS(KP951056516, T1r, T1q);
|
||||
T1u = FMA(KP951056516, T1r, T1q);
|
||||
TW = FNMS(KP809016994, TV, Ti);
|
||||
TY = FMA(KP951056516, TX, TW);
|
||||
T17 = FNMS(KP951056516, TX, TW);
|
||||
T11 = FNMS(KP809016994, T10, Tx);
|
||||
T13 = FNMS(KP951056516, T12, T11);
|
||||
T16 = FMA(KP951056516, T12, T11);
|
||||
}
|
||||
{
|
||||
E T14, T1v, T18, T1t;
|
||||
T14 = TY - T13;
|
||||
Cr[WS(csr, 6)] = FNMS(KP707106781, T14, TT);
|
||||
Cr[WS(csr, 3)] = FMA(KP707106781, T14, TT);
|
||||
T1v = T17 + T16;
|
||||
Ci[WS(csi, 6)] = FMS(KP707106781, T1v, T1u);
|
||||
Ci[WS(csi, 3)] = FMA(KP707106781, T1v, T1u);
|
||||
T18 = T16 - T17;
|
||||
Cr[WS(csr, 8)] = FNMS(KP707106781, T18, T15);
|
||||
Cr[WS(csr, 1)] = FMA(KP707106781, T18, T15);
|
||||
T1t = TY + T13;
|
||||
Ci[WS(csi, 8)] = -(FMA(KP707106781, T1t, T1s));
|
||||
Ci[WS(csi, 1)] = FNMS(KP707106781, T1t, T1s);
|
||||
}
|
||||
}
|
||||
/* Outputs Cr[9], Cr[0], Ci[0], Ci[9], Cr[5], Cr[4], Ci[4], Ci[5]. */
{
|
||||
E Th, TN, T1l, T1n, Tw, TO, TL, TP;
|
||||
{
|
||||
E T9, T1h, Ts, TH;
|
||||
T9 = FNMS(KP559016994, T8, T7);
|
||||
Th = FNMS(KP951056516, Tg, T9);
|
||||
TN = FMA(KP951056516, Tg, T9);
|
||||
T1h = FMA(KP809016994, T1g, T1d);
|
||||
T1l = FMA(KP951056516, T1k, T1h);
|
||||
T1n = FNMS(KP951056516, T1k, T1h);
|
||||
Ts = FNMS(KP559016994, Tr, Ti);
|
||||
Tw = FNMS(KP951056516, Tv, Ts);
|
||||
TO = FMA(KP951056516, Tv, Ts);
|
||||
TH = FNMS(KP559016994, TG, TD);
|
||||
TL = FNMS(KP951056516, TK, TH);
|
||||
TP = FMA(KP951056516, TK, TH);
|
||||
}
|
||||
{
|
||||
E TM, T1m, TQ, T1o;
|
||||
TM = Tw - TL;
|
||||
Cr[WS(csr, 9)] = FNMS(KP707106781, TM, Th);
|
||||
Cr[0] = FMA(KP707106781, TM, Th);
|
||||
T1m = TO + TP;
|
||||
Ci[0] = -(FMA(KP707106781, T1m, T1l));
|
||||
Ci[WS(csi, 9)] = FNMS(KP707106781, T1m, T1l);
|
||||
TQ = TO - TP;
|
||||
Cr[WS(csr, 5)] = FNMS(KP707106781, TQ, TN);
|
||||
Cr[WS(csr, 4)] = FMA(KP707106781, TQ, TN);
|
||||
T1o = Tw + TL;
|
||||
Ci[WS(csi, 4)] = -(FMA(KP707106781, T1o, T1n));
|
||||
Ci[WS(csi, 5)] = FNMS(KP707106781, T1o, T1n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 20, "r2cfII_20", { 39, 0, 63, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_20) (planner *p) { X(kr2c_register) (p, r2cfII_20, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cfII_20 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 34 FP multiplications,
|
||||
* (or, 86 additions, 18 multiplications, 16 fused multiply/add),
|
||||
* 60 stack variables, 13 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_20 -- variant in the #else branch, i.e. compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Size-20 kernel emitted by gen_r2cf with "-n 20 -dft-II" and without
 * "-fma" (see the "Generated by" comment above).  Each pass of the loop
 * reads R0[0..9] and R1[0..9] (element k addressed as WS(rs, k), element 0
 * directly) and stores Cr[0..9] and Ci[0..9] (addressed through
 * WS(csr, k) / WS(csi, k)).
 *
 *   R0, R1        input arrays
 *   Cr, Ci        output arrays
 *   rs, csr, csi  strides given to WS() for the inputs, Cr and Ci
 *   v             number of passes of the loop
 *   ivs, ovs      added to R0/R1 and to Cr/Ci after every pass
 *
 * NOTE(review): DK, E, WS, FMA and FNMS come from the included headers and
 * are not visible here; presumably FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b -- confirm in the header.
 */
static void r2cfII_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each name carries the leading digits of its value. */
DK(KP572061402, +0.572061402817684297600072783580302076536153377);
|
||||
DK(KP218508012, +0.218508012224410535399650602527877556893735408);
|
||||
DK(KP309016994, +0.309016994374947424102293417182819058860154590);
|
||||
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP176776695, +0.176776695296636881100211090526212259821208984);
|
||||
DK(KP395284707, +0.395284707521047416499861693054089816714944392);
|
||||
DK(KP672498511, +0.672498511963957326960058968885748755876783111);
|
||||
DK(KP415626937, +0.415626937777453428589967464113135184222253485);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per pass; the update clause advances all four pointers. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
|
||||
/* Intermediates shared between the load stages and the output stages. */
E T8, TD, Tm, TN, T9, TC, TY, TE, Te, TF, Tl, TK, T12, TL, Tk;
|
||||
E TM, T1, T6, Tq, T1l, T1c, Tp, T1f, T1e, T1d, Ty, TW, T1g, T1m, Tx;
|
||||
E Tu;
|
||||
/* R1[2] and R1[7] are each kept raw and also scaled by KP707106781. */
T8 = R1[WS(rs, 2)];
|
||||
TD = KP707106781 * T8;
|
||||
Tm = R1[WS(rs, 7)];
|
||||
TN = KP707106781 * Tm;
|
||||
/* Stage 1: R1[6], R1[8], R1[0], R1[4] -> T9, TC, TY, TE, Te, TF. */
{
|
||||
E Ta, TA, Td, TB, Tb, Tc;
|
||||
T9 = R1[WS(rs, 6)];
|
||||
Ta = R1[WS(rs, 8)];
|
||||
TA = T9 + Ta;
|
||||
Tb = R1[0];
|
||||
Tc = R1[WS(rs, 4)];
|
||||
Td = Tb + Tc;
|
||||
TB = Tb - Tc;
|
||||
TC = FMA(KP415626937, TA, KP672498511 * TB);
|
||||
TY = FNMS(KP415626937, TB, KP672498511 * TA);
|
||||
TE = KP395284707 * (Ta - Td);
|
||||
Te = Ta + Td;
|
||||
TF = KP176776695 * Te;
|
||||
}
|
||||
/* Stage 2: R1[1], R1[3], R1[5], R1[9] -> Tl, TK, T12, TL, Tk, TM. */
{
|
||||
E Tg, TJ, Tj, TI, Th, Ti;
|
||||
Tg = R1[WS(rs, 1)];
|
||||
Tl = R1[WS(rs, 3)];
|
||||
TJ = Tg + Tl;
|
||||
Th = R1[WS(rs, 5)];
|
||||
Ti = R1[WS(rs, 9)];
|
||||
Tj = Th + Ti;
|
||||
TI = Th - Ti;
|
||||
TK = FNMS(KP415626937, TJ, KP672498511 * TI);
|
||||
T12 = FMA(KP415626937, TI, KP672498511 * TJ);
|
||||
TL = KP395284707 * (Tg - Tj);
|
||||
Tk = Tg + Tj;
|
||||
TM = KP176776695 * Tk;
|
||||
}
|
||||
/* Stage 3: R0[0], R0[6], R0[8], R0[2], R0[4] -> T1, T6, Tq, T1l, T1c, Tp. */
{
|
||||
E T2, T5, T3, T4, T1a, T1b;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 6)];
|
||||
T5 = R0[WS(rs, 8)];
|
||||
T3 = R0[WS(rs, 2)];
|
||||
T4 = R0[WS(rs, 4)];
|
||||
T1a = T4 + T2;
|
||||
T1b = T5 + T3;
|
||||
T6 = T2 + T3 - (T4 + T5);
|
||||
Tq = FMA(KP250000000, T6, T1);
|
||||
T1l = FNMS(KP951056516, T1b, KP587785252 * T1a);
|
||||
T1c = FMA(KP951056516, T1a, KP587785252 * T1b);
|
||||
Tp = KP559016994 * (T5 + T2 - (T4 + T3));
|
||||
}
|
||||
T1f = R0[WS(rs, 5)];
|
||||
/* Stage 4: R0[9], R0[1], R0[3], R0[7] -> Tx, T1e, Tu, T1d. */
{
|
||||
E Tv, Tw, Ts, Tt;
|
||||
Tv = R0[WS(rs, 9)];
|
||||
Tw = R0[WS(rs, 1)];
|
||||
Tx = Tv - Tw;
|
||||
T1e = Tv + Tw;
|
||||
Ts = R0[WS(rs, 3)];
|
||||
Tt = R0[WS(rs, 7)];
|
||||
Tu = Ts - Tt;
|
||||
T1d = Ts + Tt;
|
||||
}
|
||||
Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
|
||||
TW = FNMS(KP951056516, Tx, KP587785252 * Tu);
|
||||
T1g = FMA(KP809016994, T1d, KP309016994 * T1e) + T1f;
|
||||
T1m = FNMS(KP809016994, T1e, T1f) - (KP309016994 * T1d);
|
||||
/* Outputs Cr[2], Ci[2], Cr[7], Ci[7]. */
{
|
||||
E T7, T1r, To, T1q, Tf, Tn;
|
||||
T7 = T1 - T6;
|
||||
T1r = T1e + T1f - T1d;
|
||||
Tf = T8 + (T9 - Te);
|
||||
Tn = (Tk - Tl) - Tm;
|
||||
To = KP707106781 * (Tf + Tn);
|
||||
T1q = KP707106781 * (Tf - Tn);
|
||||
Cr[WS(csr, 2)] = T7 - To;
|
||||
Ci[WS(csi, 2)] = T1q - T1r;
|
||||
Cr[WS(csr, 7)] = T7 + To;
|
||||
Ci[WS(csi, 7)] = T1q + T1r;
|
||||
}
|
||||
/* Outputs Cr[5], Cr[4], Ci[5], Ci[4], Cr[9], Cr[0], Ci[0], Ci[9]. */
{
|
||||
E T1h, T1j, TX, T15, T10, T16, T13, T17, TV, TZ, T11;
|
||||
T1h = T1c - T1g;
|
||||
T1j = T1c + T1g;
|
||||
TV = Tq - Tp;
|
||||
TX = TV - TW;
|
||||
T15 = TV + TW;
|
||||
TZ = FMA(KP218508012, T9, TD) + TF - TE;
|
||||
T10 = TY + TZ;
|
||||
T16 = TZ - TY;
|
||||
T11 = FNMS(KP218508012, Tl, TL) - (TM + TN);
|
||||
T13 = T11 - T12;
|
||||
T17 = T11 + T12;
|
||||
{
|
||||
E T14, T19, T18, T1i;
|
||||
T14 = T10 + T13;
|
||||
Cr[WS(csr, 5)] = TX - T14;
|
||||
Cr[WS(csr, 4)] = TX + T14;
|
||||
T19 = T17 - T16;
|
||||
Ci[WS(csi, 5)] = T19 - T1h;
|
||||
Ci[WS(csi, 4)] = T19 + T1h;
|
||||
T18 = T16 + T17;
|
||||
Cr[WS(csr, 9)] = T15 - T18;
|
||||
Cr[0] = T15 + T18;
|
||||
T1i = T13 - T10;
|
||||
Ci[0] = T1i - T1j;
|
||||
Ci[WS(csi, 9)] = T1i + T1j;
|
||||
}
|
||||
}
|
||||
/* Outputs Cr[6], Cr[3], Ci[6], Ci[3], Cr[8], Cr[1], Ci[8], Ci[1]. */
{
|
||||
E T1n, T1p, Tz, TR, TH, TS, TP, TT, Tr, TG, TO;
|
||||
T1n = T1l + T1m;
|
||||
T1p = T1m - T1l;
|
||||
Tr = Tp + Tq;
|
||||
Tz = Tr + Ty;
|
||||
TR = Tr - Ty;
|
||||
TG = TD + TE + FNMS(KP572061402, T9, TF);
|
||||
TH = TC + TG;
|
||||
TS = TC - TG;
|
||||
TO = TL + TM + FNMS(KP572061402, Tl, TN);
|
||||
TP = TK - TO;
|
||||
TT = TK + TO;
|
||||
{
|
||||
E TQ, T1o, TU, T1k;
|
||||
TQ = TH + TP;
|
||||
Cr[WS(csr, 6)] = Tz - TQ;
|
||||
Cr[WS(csr, 3)] = Tz + TQ;
|
||||
T1o = TT - TS;
|
||||
Ci[WS(csi, 6)] = T1o - T1p;
|
||||
Ci[WS(csi, 3)] = T1o + T1p;
|
||||
TU = TS + TT;
|
||||
Cr[WS(csr, 8)] = TR - TU;
|
||||
Cr[WS(csr, 1)] = TR + TU;
|
||||
T1k = TP - TH;
|
||||
Ci[WS(csi, 8)] = T1k - T1n;
|
||||
Ci[WS(csi, 1)] = T1k + T1n;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 20, "r2cfII_20", { 86, 18, 16, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_20) (planner *p) { X(kr2c_register) (p, r2cfII_20, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
776
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_25.c
Normal file
776
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_25.c
Normal file
@@ -0,0 +1,776 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:28 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cfII_25 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 212 FP additions, 177 FP multiplications,
|
||||
* (or, 47 additions, 12 multiplications, 165 fused multiply/add),
|
||||
* 131 stack variables, 67 constants, and 50 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_25 (FMA variant): machine-generated straight-line size-25 kernel
 * (genfft gen_r2cf, -n 25 -dft-II -fma, per the generator comment that
 * precedes this function).  Do not edit by hand; regenerate instead.
 *
 * Per iteration the kernel reads 25 inputs, R0[WS(rs, k)] for k = 0..12 and
 * R1[WS(rs, k)] for k = 0..11, and writes 25 outputs, Cr[WS(csr, k)] for
 * k = 0..12 and Ci[WS(csi, k)] for k = 0..11.  All loads are issued before
 * the first store.
 *
 * Parameters:
 *   R0, R1         input base pointers (presumably the even- and odd-indexed
 *                  real samples -- not established in this chunk; confirm
 *                  against rdft/codelet-rdft.h)
 *   Cr, Ci         output base pointers (presumably real and imaginary parts
 *                  -- confirm as above)
 *   rs, csr, csi   strides passed to WS() for input, Cr and Ci indexing
 *   v              number of iterations of the outer loop
 *   ivs, ovs       per-iteration pointer increments for R0/R1 and Cr/Ci
 */
static void r2cfII_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each KP<digits> name spells the leading digits of its value. */
DK(KP876091699, +0.876091699473550838204498029706869638173524346);
|
||||
DK(KP792626838, +0.792626838241819413632131824093538848057784557);
|
||||
DK(KP690668130, +0.690668130712929053565177988380887884042527623);
|
||||
DK(KP809385824, +0.809385824416008241660603814668679683846476688);
|
||||
DK(KP860541664, +0.860541664367944677098261680920518816412804187);
|
||||
DK(KP681693190, +0.681693190061530575150324149145440022633095390);
|
||||
DK(KP560319534, +0.560319534973832390111614715371676131169633784);
|
||||
DK(KP237294955, +0.237294955877110315393888866460840817927895961);
|
||||
DK(KP897376177, +0.897376177523557693138608077137219684419427330);
|
||||
DK(KP584303379, +0.584303379262766050358567120694562180043261496);
|
||||
DK(KP653711795, +0.653711795629256296299985401753308353544378892);
|
||||
DK(KP997675361, +0.997675361079556513670859573984492383596555031);
|
||||
DK(KP645989928, +0.645989928319777763844272876603899665178054552);
|
||||
DK(KP591287873, +0.591287873858343558732323717242372865934480959);
|
||||
DK(KP952936919, +0.952936919628306576880750665357914584765951388);
|
||||
DK(KP998026728, +0.998026728428271561952336806863450553336905220);
|
||||
DK(KP956723877, +0.956723877038460305821989399535483155872969262);
|
||||
DK(KP945422727, +0.945422727388575946270360266328811958657216298);
|
||||
DK(KP734762448, +0.734762448793050413546343770063151342619912334);
|
||||
DK(KP772036680, +0.772036680810363904029489473607579825330539880);
|
||||
DK(KP683113946, +0.683113946453479238701949862233725244439656928);
|
||||
DK(KP559154169, +0.559154169276087864842202529084232643714075927);
|
||||
DK(KP242145790, +0.242145790282157779872542093866183953459003101);
|
||||
DK(KP968583161, +0.968583161128631119490168375464735813836012403);
|
||||
DK(KP999754674, +0.999754674276473633366203429228112409535557487);
|
||||
DK(KP904730450, +0.904730450839922351881287709692877908104763647);
|
||||
DK(KP916574801, +0.916574801383451584742370439148878693530976769);
|
||||
DK(KP829049696, +0.829049696159252993975487806364305442437946767);
|
||||
DK(KP831864738, +0.831864738706457140726048799369896829771167132);
|
||||
DK(KP876306680, +0.876306680043863587308115903922062583399064238);
|
||||
DK(KP949179823, +0.949179823508441261575555465843363271711583843);
|
||||
DK(KP669429328, +0.669429328479476605641803240971985825917022098);
|
||||
DK(KP262346850, +0.262346850930607871785420028382979691334784273);
|
||||
DK(KP923225144, +0.923225144846402650453449441572664695995209956);
|
||||
DK(KP906616052, +0.906616052148196230441134447086066874408359177);
|
||||
DK(KP921078979, +0.921078979742360627699756128143719920817673854);
|
||||
DK(KP982009705, +0.982009705009746369461829878184175962711969869);
|
||||
DK(KP845997307, +0.845997307939530944175097360758058292389769300);
|
||||
DK(KP992114701, +0.992114701314477831049793042785778521453036709);
|
||||
DK(KP803003575, +0.803003575438660414833440593570376004635464850);
|
||||
DK(KP763583905, +0.763583905359130246362948588764067237776594106);
|
||||
DK(KP248028675, +0.248028675328619457762448260696444630363259177);
|
||||
DK(KP904508497, +0.904508497187473712051146708591409529430077295);
|
||||
DK(KP894834959, +0.894834959464455102997960030820114611498661386);
|
||||
DK(KP958953096, +0.958953096729998668045963838399037225970891871);
|
||||
DK(KP867381224, +0.867381224396525206773171885031575671309956167);
|
||||
DK(KP912575812, +0.912575812670962425556968549836277086778922727);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP869845200, +0.869845200362138853122720822420327157933056305);
|
||||
DK(KP120146378, +0.120146378570687701782758537356596213647956445);
|
||||
DK(KP132830569, +0.132830569247582714407653942074819768844536507);
|
||||
DK(KP786782374, +0.786782374965295178365099601674911834788448471);
|
||||
DK(KP893101515, +0.893101515366181661711202267938416198338079437);
|
||||
DK(KP987388751, +0.987388751065621252324603216482382109400433949);
|
||||
DK(KP244189809, +0.244189809627953270309879511234821255780225091);
|
||||
DK(KP269969613, +0.269969613759572083574752974412347470060951301);
|
||||
DK(KP494780565, +0.494780565770515410344588413655324772219443730);
|
||||
DK(KP066152395, +0.066152395967733048213034281011006031460903353);
|
||||
DK(KP059835404, +0.059835404262124915169548397419498386427871950);
|
||||
DK(KP447533225, +0.447533225982656890041886979663652563063114397);
|
||||
DK(KP522847744, +0.522847744331509716623755382187077770911012542);
|
||||
DK(KP667278218, +0.667278218140296670899089292254759909713898805);
|
||||
DK(KP603558818, +0.603558818296015001454675132653458027918768137);
|
||||
DK(KP578046249, +0.578046249379945007321754579646815604023525655);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
{
|
||||
INT i;
|
||||
/* v iterations; each one advances R0/R1 by ivs and Cr/Ci by ovs.
   MAKE_VOLATILE_STRIDE is a project macro not defined in this chunk. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
|
||||
E T2v, TJ, T2A, T1K, T2y, T2z, TB, T15, T2d, T2l, T1g, T1s, T1N, T21, T1D;
|
||||
E T9, TQ, T2g, T2o, T1j, T1u, T1X, T25, T1z, Ti, TX, T2f, T2p, T1k, T1v;
|
||||
E T1U, T24, T1A, Ts, T1c, T2c, T2k, T1h, T1r, T1Q, T22, T1C, Tj, TC;
|
||||
/* Load stage: five groups of five inputs, each reduced to intermediates
   that the output stage below combines. */
{
|
||||
E TI, T2x, TF, T2w;
|
||||
T2v = R0[0];
|
||||
{
|
||||
E TG, TH, TD, TE;
|
||||
TG = R0[WS(rs, 10)];
|
||||
TH = R1[WS(rs, 2)];
|
||||
TI = TG + TH;
|
||||
T2x = TG - TH;
|
||||
TD = R0[WS(rs, 5)];
|
||||
TE = R1[WS(rs, 7)];
|
||||
TF = TD + TE;
|
||||
T2w = TD - TE;
|
||||
}
|
||||
TJ = FMA(KP618033988, TI, TF);
|
||||
T2A = T2w - T2x;
|
||||
T1K = FNMS(KP618033988, TF, TI);
|
||||
T2y = T2w + T2x;
|
||||
T2z = FNMS(KP250000000, T2y, T2v);
|
||||
}
|
||||
{
|
||||
E Tt, TA, T13, TZ, T10;
|
||||
Tt = R0[WS(rs, 2)];
|
||||
{
|
||||
E Tu, Tv, Tw, Tx, Ty, Tz;
|
||||
Tu = R0[WS(rs, 7)];
|
||||
Tv = R1[WS(rs, 9)];
|
||||
Tw = Tu - Tv;
|
||||
Tx = R0[WS(rs, 12)];
|
||||
Ty = R1[WS(rs, 4)];
|
||||
Tz = Tx - Ty;
|
||||
TA = Tw + Tz;
|
||||
T13 = Tz - Tw;
|
||||
TZ = Tu + Tv;
|
||||
T10 = Tx + Ty;
|
||||
}
|
||||
TB = Tt + TA;
|
||||
{
|
||||
E T11, T1M, T14, T1L, T12;
|
||||
T11 = FMA(KP618033988, T10, TZ);
|
||||
T1M = FNMS(KP618033988, TZ, T10);
|
||||
T12 = FNMS(KP250000000, TA, Tt);
|
||||
T14 = FNMS(KP559016994, T13, T12);
|
||||
T1L = FMA(KP559016994, T13, T12);
|
||||
T15 = FMA(KP578046249, T14, T11);
|
||||
T2d = FNMS(KP603558818, T1M, T1L);
|
||||
T2l = FMA(KP667278218, T1L, T1M);
|
||||
T1g = FNMS(KP522847744, T11, T14);
|
||||
T1s = FMA(KP447533225, T11, T14);
|
||||
T1N = FMA(KP059835404, T1M, T1L);
|
||||
T21 = FNMS(KP066152395, T1L, T1M);
|
||||
T1D = FNMS(KP494780565, T14, T11);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T1, T8, TO, TK, TL;
|
||||
T1 = R0[WS(rs, 1)];
|
||||
{
|
||||
E T2, T3, T4, T5, T6, T7;
|
||||
T2 = R0[WS(rs, 6)];
|
||||
T3 = R1[WS(rs, 8)];
|
||||
T4 = T2 - T3;
|
||||
T5 = R0[WS(rs, 11)];
|
||||
T6 = R1[WS(rs, 3)];
|
||||
T7 = T5 - T6;
|
||||
T8 = T4 + T7;
|
||||
TO = T4 - T7;
|
||||
TK = T2 + T3;
|
||||
TL = T5 + T6;
|
||||
}
|
||||
T9 = T1 + T8;
|
||||
{
|
||||
E TM, T1V, TP, T1W, TN;
|
||||
TM = FMA(KP618033988, TL, TK);
|
||||
T1V = FNMS(KP618033988, TK, TL);
|
||||
TN = FNMS(KP250000000, T8, T1);
|
||||
TP = FMA(KP559016994, TO, TN);
|
||||
T1W = FNMS(KP559016994, TO, TN);
|
||||
TQ = FMA(KP269969613, TP, TM);
|
||||
T2g = FNMS(KP578046249, T1W, T1V);
|
||||
T2o = FMA(KP522847744, T1V, T1W);
|
||||
T1j = FNMS(KP244189809, TM, TP);
|
||||
T1u = FNMS(KP603558818, TM, TP);
|
||||
T1X = FMA(KP987388751, T1W, T1V);
|
||||
T25 = FNMS(KP893101515, T1V, T1W);
|
||||
T1z = FMA(KP667278218, TP, TM);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Th, Tg, TV, TS, TU;
|
||||
Th = R0[WS(rs, 4)];
|
||||
{
|
||||
E Ta, Tb, Tc, Td, Te, Tf;
|
||||
Ta = R0[WS(rs, 9)];
|
||||
Tb = R1[WS(rs, 11)];
|
||||
Tc = Ta - Tb;
|
||||
Td = R1[WS(rs, 6)];
|
||||
Te = R1[WS(rs, 1)];
|
||||
Tf = Td + Te;
|
||||
Tg = Tc - Tf;
|
||||
TV = Te - Td;
|
||||
TS = Tc + Tf;
|
||||
TU = Ta + Tb;
|
||||
}
|
||||
Ti = Tg + Th;
|
||||
{
|
||||
E TW, T1S, TT, T1T, TR;
|
||||
TW = FNMS(KP618033988, TV, TU);
|
||||
T1S = FMA(KP618033988, TU, TV);
|
||||
TR = FNMS(KP250000000, Tg, Th);
|
||||
TT = FMA(KP559016994, TS, TR);
|
||||
T1T = FNMS(KP559016994, TS, TR);
|
||||
TX = FMA(KP603558818, TW, TT);
|
||||
T2f = FNMS(KP447533225, T1S, T1T);
|
||||
T2p = FMA(KP494780565, T1T, T1S);
|
||||
T1k = FNMS(KP667278218, TT, TW);
|
||||
T1v = FNMS(KP786782374, TW, TT);
|
||||
T1U = FMA(KP132830569, T1T, T1S);
|
||||
T24 = FNMS(KP120146378, T1S, T1T);
|
||||
T1A = FMA(KP869845200, TT, TW);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tk, Tr, T1a, T16, T17;
|
||||
Tk = R0[WS(rs, 3)];
|
||||
{
|
||||
E Tl, Tm, Tn, To, Tp, Tq;
|
||||
Tl = R0[WS(rs, 8)];
|
||||
Tm = R1[WS(rs, 10)];
|
||||
Tn = Tl - Tm;
|
||||
To = R1[0];
|
||||
Tp = R1[WS(rs, 5)];
|
||||
Tq = To + Tp;
|
||||
Tr = Tn - Tq;
|
||||
T1a = Tn + Tq;
|
||||
T16 = Tl + Tm;
|
||||
T17 = Tp - To;
|
||||
}
|
||||
Ts = Tk + Tr;
|
||||
{
|
||||
E T18, T1P, T1b, T1O, T19;
|
||||
T18 = FMA(KP618033988, T17, T16);
|
||||
T1P = FNMS(KP618033988, T16, T17);
|
||||
T19 = FNMS(KP250000000, Tr, Tk);
|
||||
T1b = FMA(KP559016994, T1a, T19);
|
||||
T1O = FNMS(KP559016994, T1a, T19);
|
||||
T1c = FMA(KP987388751, T1b, T18);
|
||||
T2c = FNMS(KP059835404, T1P, T1O);
|
||||
T2k = FMA(KP066152395, T1O, T1P);
|
||||
T1h = FNMS(KP893101515, T18, T1b);
|
||||
T1r = FMA(KP132830569, T1b, T18);
|
||||
T1Q = FNMS(KP786782374, T1P, T1O);
|
||||
T22 = FMA(KP869845200, T1O, T1P);
|
||||
T1C = FNMS(KP120146378, T18, T1b);
|
||||
}
|
||||
}
|
||||
/* Output stage: every input has been loaded above; stores begin here. */
Tj = T9 - Ti;
|
||||
TC = Ts - TB;
|
||||
Ci[WS(csi, 2)] = -(KP951056516 * (FNMS(KP618033988, TC, Tj)));
|
||||
Ci[WS(csi, 7)] = KP951056516 * (FMA(KP618033988, Tj, TC));
|
||||
{
|
||||
E T3l, T3o, T3q, T3m, T3n, T3p;
|
||||
T3l = T2v + T2y;
|
||||
T3m = T9 + Ti;
|
||||
T3n = TB + Ts;
|
||||
T3o = T3m + T3n;
|
||||
T3q = T3m - T3n;
|
||||
Cr[WS(csr, 12)] = T3o + T3l;
|
||||
T3p = FNMS(KP250000000, T3o, T3l);
|
||||
Cr[WS(csr, 2)] = FMA(KP559016994, T3q, T3p);
|
||||
Cr[WS(csr, 7)] = FNMS(KP559016994, T3q, T3p);
|
||||
}
|
||||
{
|
||||
E T1B, T1E, T1x, T1I, T1G, T1t, T1w, T1F, T1y, T1J, T1H;
|
||||
T1B = FMA(KP912575812, T1A, T1z);
|
||||
T1E = FMA(KP867381224, T1D, T1C);
|
||||
T1t = FMA(KP958953096, T1s, T1r);
|
||||
T1w = FNMS(KP912575812, T1v, T1u);
|
||||
T1F = FNMS(KP894834959, T1w, T1t);
|
||||
T1x = FMA(KP894834959, T1w, T1t);
|
||||
T1I = FNMS(KP894834959, T1B, T1F);
|
||||
T1G = FNMS(KP904508497, T1F, T1E);
|
||||
T1y = FMA(KP248028675, T1x, TJ);
|
||||
T1J = FMA(KP559016994, T1I, T1E);
|
||||
T1H = FMA(KP763583905, T1G, T1B);
|
||||
Ci[WS(csi, 4)] = KP951056516 * (FNMS(KP803003575, T1H, T1y));
|
||||
Ci[WS(csi, 9)] = KP951056516 * (FNMS(KP992114701, T1J, T1y));
|
||||
}
|
||||
{
|
||||
E T2m, T2q, T2i, T2t, T2r, T2e, T2h, T2n, T2j, T2u, T2s;
|
||||
T2m = FNMS(KP845997307, T2l, T2k);
|
||||
T2q = FMA(KP982009705, T2p, T2o);
|
||||
T2e = FMA(KP845997307, T2d, T2c);
|
||||
T2h = FNMS(KP921078979, T2g, T2f);
|
||||
T2n = FNMS(KP906616052, T2h, T2e);
|
||||
T2i = FMA(KP906616052, T2h, T2e);
|
||||
T2t = T2m + T2n;
|
||||
T2r = FNMS(KP923225144, T2q, T2n);
|
||||
T2j = FMA(KP262346850, T2i, T1K);
|
||||
T2u = FNMS(KP669429328, T2t, T2q);
|
||||
T2s = FNMS(KP618033988, T2r, T2m);
|
||||
Ci[WS(csi, 8)] = KP951056516 * (FMA(KP949179823, T2s, T2j));
|
||||
Ci[WS(csi, 3)] = KP951056516 * (FNMS(KP876306680, T2u, T2j));
|
||||
}
|
||||
{
|
||||
E T1i, T1l, T1e, T1p, T1n, TY, T1d, T1m, T1f, T1q, T1o;
|
||||
T1i = FNMS(KP831864738, T1h, T1g);
|
||||
T1l = FMA(KP829049696, T1k, T1j);
|
||||
TY = FMA(KP916574801, TX, TQ);
|
||||
T1d = FMA(KP831864738, T1c, T15);
|
||||
T1m = FNMS(KP904730450, T1d, TY);
|
||||
T1e = FMA(KP904730450, T1d, TY);
|
||||
T1p = FNMS(KP999754674, T1m, T1i);
|
||||
T1n = FNMS(KP904508497, T1m, T1l);
|
||||
Ci[0] = -(KP951056516 * (FMA(KP968583161, T1e, TJ)));
|
||||
T1f = FNMS(KP242145790, T1e, TJ);
|
||||
T1q = FMA(KP559154169, T1p, T1l);
|
||||
T1o = FNMS(KP683113946, T1n, T1i);
|
||||
Ci[WS(csi, 5)] = -(KP951056516 * (FNMS(KP876306680, T1o, T1f)));
|
||||
Ci[WS(csi, 10)] = -(KP951056516 * (FNMS(KP968583161, T1q, T1f)));
|
||||
}
|
||||
{
|
||||
E T23, T26, T1Z, T2a, T28, T1R, T1Y, T27, T20, T2b, T29;
|
||||
T23 = FNMS(KP772036680, T22, T21);
|
||||
T26 = FMA(KP734762448, T25, T24);
|
||||
T1R = FMA(KP772036680, T1Q, T1N);
|
||||
T1Y = FMA(KP734762448, T1X, T1U);
|
||||
T27 = FNMS(KP945422727, T1Y, T1R);
|
||||
T1Z = FMA(KP945422727, T1Y, T1R);
|
||||
T2a = T27 - T23;
|
||||
T28 = FMA(KP956723877, T27, T26);
|
||||
Ci[WS(csi, 1)] = -(KP998026728 * (FMA(KP952936919, T1K, T1Z)));
|
||||
T20 = FNMS(KP262346850, T1Z, T1K);
|
||||
T2b = FMA(KP591287873, T2a, T26);
|
||||
T29 = FMA(KP645989928, T28, T23);
|
||||
Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP949179823, T29, T20)));
|
||||
Ci[WS(csi, 11)] = -(KP951056516 * (FNMS(KP992114701, T2b, T20)));
|
||||
}
|
||||
{
|
||||
E T2Y, T33, T31, T38, T36, T3e, T3f, T3c, T3j, T3h, T3a, T3b, T3g;
|
||||
T2Y = FNMS(KP559016994, T2A, T2z);
|
||||
T33 = FNMS(KP772036680, T1Q, T1N);
|
||||
{
|
||||
E T34, T2Z, T30, T35;
|
||||
T34 = FNMS(KP734762448, T1X, T1U);
|
||||
T2Z = FNMS(KP734762448, T25, T24);
|
||||
T30 = FMA(KP772036680, T22, T21);
|
||||
T35 = FNMS(KP956723877, T30, T2Z);
|
||||
T31 = FMA(KP956723877, T30, T2Z);
|
||||
T38 = FMA(KP618033988, T35, T34);
|
||||
T36 = T34 + T35;
|
||||
}
|
||||
T3e = FMA(KP921078979, T2g, T2f);
|
||||
T3f = FNMS(KP845997307, T2d, T2c);
|
||||
T3a = FMA(KP845997307, T2l, T2k);
|
||||
T3b = FNMS(KP982009705, T2p, T2o);
|
||||
T3g = FNMS(KP923225144, T3b, T3a);
|
||||
T3c = FMA(KP923225144, T3b, T3a);
|
||||
T3j = FNMS(KP997675361, T3g, T3e);
|
||||
T3h = FNMS(KP904508497, T3g, T3f);
|
||||
Cr[WS(csr, 1)] = FNMS(KP992114701, T31, T2Y);
|
||||
{
|
||||
E T32, T39, T37, T3d, T3k, T3i;
|
||||
T32 = FMA(KP248028675, T31, T2Y);
|
||||
T39 = FNMS(KP653711795, T33, T38);
|
||||
T37 = FMA(KP584303379, T36, T33);
|
||||
Cr[WS(csr, 6)] = FMA(KP949179823, T37, T32);
|
||||
Cr[WS(csr, 11)] = FNMS(KP897376177, T39, T32);
|
||||
T3d = FNMS(KP237294955, T3c, T2Y);
|
||||
T3k = FNMS(KP560319534, T3j, T3f);
|
||||
T3i = FMA(KP681693190, T3h, T3e);
|
||||
Cr[WS(csr, 3)] = FMA(KP860541664, T3i, T3d);
|
||||
Cr[WS(csr, 8)] = FMA(KP949179823, T3k, T3d);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2B, T2R, T2T, T2P, T2W, T2U, T2G, T2H, T2E, T2L, T2J;
|
||||
T2B = FMA(KP559016994, T2A, T2z);
|
||||
{
|
||||
E T2N, T2O, T2S, T2C, T2D, T2I;
|
||||
T2R = FNMS(KP958953096, T1s, T1r);
|
||||
T2T = FMA(KP912575812, T1v, T1u);
|
||||
T2N = FNMS(KP867381224, T1D, T1C);
|
||||
T2O = FNMS(KP912575812, T1A, T1z);
|
||||
T2S = FMA(KP809385824, T2O, T2N);
|
||||
T2P = FNMS(KP809385824, T2O, T2N);
|
||||
T2W = T2R + T2S;
|
||||
T2U = FNMS(KP894834959, T2T, T2S);
|
||||
T2G = FNMS(KP831864738, T1c, T15);
|
||||
T2H = FNMS(KP916574801, TX, TQ);
|
||||
T2C = FNMS(KP829049696, T1k, T1j);
|
||||
T2D = FMA(KP831864738, T1h, T1g);
|
||||
T2I = FNMS(KP904730450, T2D, T2C);
|
||||
T2E = FMA(KP904730450, T2D, T2C);
|
||||
T2L = FMA(KP904730450, T2G, T2I);
|
||||
T2J = T2H + T2I;
|
||||
}
|
||||
Cr[0] = FMA(KP968583161, T2E, T2B);
|
||||
{
|
||||
E T2Q, T2X, T2V, T2F, T2M, T2K;
|
||||
T2Q = FMA(KP248028675, T2P, T2B);
|
||||
T2X = FNMS(KP690668130, T2W, T2T);
|
||||
T2V = FNMS(KP618033988, T2U, T2R);
|
||||
Cr[WS(csr, 9)] = FMA(KP897376177, T2V, T2Q);
|
||||
Cr[WS(csr, 4)] = FNMS(KP803003575, T2X, T2Q);
|
||||
T2F = FNMS(KP242145790, T2E, T2B);
|
||||
T2M = FMA(KP618033988, T2L, T2H);
|
||||
T2K = FNMS(KP683113946, T2J, T2G);
|
||||
Cr[WS(csr, 5)] = FMA(KP792626838, T2K, T2F);
|
||||
Cr[WS(csr, 10)] = FMA(KP876091699, T2M, T2F);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 25, "r2cfII_25", { 47, 12, 165, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_25) (planner *p) { X(kr2c_register) (p, r2cfII_25, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cfII_25 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 213 FP additions, 148 FP multiplications,
|
||||
* (or, 126 additions, 61 multiplications, 87 fused multiply/add),
|
||||
* 94 stack variables, 38 constants, and 50 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_25 (non-FMA variant): machine-generated straight-line size-25
 * kernel (genfft gen_r2cf, -n 25 -dft-II, per the generator comment that
 * precedes this function).  Do not edit by hand; regenerate instead.
 *
 * Per iteration the kernel reads 25 inputs, R0[WS(rs, k)] for k = 0..12 and
 * R1[WS(rs, k)] for k = 0..11, and writes 25 outputs, Cr[WS(csr, k)] for
 * k = 0..12 and Ci[WS(csi, k)] for k = 0..11.  All loads are issued before
 * the first store.
 *
 * Parameters:
 *   R0, R1         input base pointers (presumably the even- and odd-indexed
 *                  real samples -- not established in this chunk; confirm
 *                  against rdft/codelet-rdft.h)
 *   Cr, Ci         output base pointers (presumably real and imaginary parts
 *                  -- confirm as above)
 *   rs, csr, csi   strides passed to WS() for input, Cr and Ci indexing
 *   v              number of iterations of the outer loop
 *   ivs, ovs       per-iteration pointer increments for R0/R1 and Cr/Ci
 */
static void r2cfII_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants; each KP name spells the leading digits of its value
   (KP1_xxx stands for 1.xxx). */
DK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
|
||||
DK(KP062790519, +0.062790519529313376076178224565631133122484832);
|
||||
DK(KP125581039, +0.125581039058626752152356449131262266244969664);
|
||||
DK(KP998026728, +0.998026728428271561952336806863450553336905220);
|
||||
DK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
|
||||
DK(KP728968627, +0.728968627421411523146730319055259111372571664);
|
||||
DK(KP963507348, +0.963507348203430549974383005744259307057084020);
|
||||
DK(KP876306680, +0.876306680043863587308115903922062583399064238);
|
||||
DK(KP497379774, +0.497379774329709576484567492012895936835134813);
|
||||
DK(KP968583161, +0.968583161128631119490168375464735813836012403);
|
||||
DK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
|
||||
DK(KP684547105, +0.684547105928688673732283357621209269889519233);
|
||||
DK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
|
||||
DK(KP481753674, +0.481753674101715274987191502872129653528542010);
|
||||
DK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
|
||||
DK(KP248689887, +0.248689887164854788242283746006447968417567406);
|
||||
DK(KP992114701, +0.992114701314477831049793042785778521453036709);
|
||||
DK(KP250666467, +0.250666467128608490746237519633017587885836494);
|
||||
DK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
|
||||
DK(KP425779291, +0.425779291565072648862502445744251703979973042);
|
||||
DK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
|
||||
DK(KP637423989, +0.637423989748689710176712811676016195434917298);
|
||||
DK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
|
||||
DK(KP535826794, +0.535826794978996618271308767867639978063575346);
|
||||
DK(KP851558583, +0.851558583130145297725004891488503407959946084);
|
||||
DK(KP904827052, +0.904827052466019527713668647932697593970413911);
|
||||
DK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
|
||||
DK(KP125333233, +0.125333233564304245373118759816508793942918247);
|
||||
DK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
|
||||
DK(KP770513242, +0.770513242775789230803009636396177847271667672);
|
||||
DK(KP844327925, +0.844327925502015078548558063966681505381659241);
|
||||
DK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
|
||||
DK(KP293892626, +0.293892626146236564584352977319536384298826219);
|
||||
DK(KP475528258, +0.475528258147576786058219666689691071702849317);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT i;
|
||||
/* v iterations; each one advances R0/R1 by ivs and Cr/Ci by ovs.
   MAKE_VOLATILE_STRIDE is a project macro not defined in this chunk. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
|
||||
E TE, TR, T2i, T1z, TL, TS, TB, T2d, T1l, T1i, T2c, T9, T23, TZ, TW;
|
||||
E T22, Ti, T26, T16, T13, T25, Ts, T2a, T1e, T1b, T29, TP, TQ;
|
||||
/* Load stage: five groups of five inputs, each reduced to intermediates
   that the output stage below combines. */
{
|
||||
E TK, T1y, TH, T1x;
|
||||
TE = R0[0];
|
||||
{
|
||||
E TI, TJ, TF, TG;
|
||||
TI = R0[WS(rs, 10)];
|
||||
TJ = R1[WS(rs, 2)];
|
||||
TK = TI - TJ;
|
||||
T1y = TI + TJ;
|
||||
TF = R0[WS(rs, 5)];
|
||||
TG = R1[WS(rs, 7)];
|
||||
TH = TF - TG;
|
||||
T1x = TF + TG;
|
||||
}
|
||||
TR = KP559016994 * (TH - TK);
|
||||
T2i = FNMS(KP587785252, T1x, KP951056516 * T1y);
|
||||
T1z = FMA(KP951056516, T1x, KP587785252 * T1y);
|
||||
TL = TH + TK;
|
||||
TS = FNMS(KP250000000, TL, TE);
|
||||
}
|
||||
{
|
||||
E Tt, Tw, Tz, TA, T1k, T1j, T1g, T1h;
|
||||
Tt = R0[WS(rs, 3)];
|
||||
{
|
||||
E Tu, Tv, Tx, Ty;
|
||||
Tu = R0[WS(rs, 8)];
|
||||
Tv = R1[WS(rs, 10)];
|
||||
Tw = Tu - Tv;
|
||||
Tx = R1[0];
|
||||
Ty = R1[WS(rs, 5)];
|
||||
Tz = Tx + Ty;
|
||||
TA = Tw - Tz;
|
||||
T1k = Ty - Tx;
|
||||
T1j = Tu + Tv;
|
||||
}
|
||||
TB = Tt + TA;
|
||||
T2d = FNMS(KP293892626, T1j, KP475528258 * T1k);
|
||||
T1l = FMA(KP475528258, T1j, KP293892626 * T1k);
|
||||
T1g = FNMS(KP250000000, TA, Tt);
|
||||
T1h = KP559016994 * (Tw + Tz);
|
||||
T1i = T1g + T1h;
|
||||
T2c = T1g - T1h;
|
||||
}
|
||||
{
|
||||
E T1, T4, T7, T8, TY, TX, TU, TV;
|
||||
T1 = R0[WS(rs, 1)];
|
||||
{
|
||||
E T2, T3, T5, T6;
|
||||
T2 = R0[WS(rs, 6)];
|
||||
T3 = R1[WS(rs, 8)];
|
||||
T4 = T2 - T3;
|
||||
T5 = R0[WS(rs, 11)];
|
||||
T6 = R1[WS(rs, 3)];
|
||||
T7 = T5 - T6;
|
||||
T8 = T4 + T7;
|
||||
TY = T5 + T6;
|
||||
TX = T2 + T3;
|
||||
}
|
||||
T9 = T1 + T8;
|
||||
T23 = FNMS(KP293892626, TX, KP475528258 * TY);
|
||||
TZ = FMA(KP475528258, TX, KP293892626 * TY);
|
||||
TU = KP559016994 * (T4 - T7);
|
||||
TV = FNMS(KP250000000, T8, T1);
|
||||
TW = TU + TV;
|
||||
T22 = TV - TU;
|
||||
}
|
||||
{
|
||||
E Ta, Td, Tg, Th, T15, T14, T11, T12;
|
||||
Ta = R0[WS(rs, 4)];
|
||||
{
|
||||
E Tb, Tc, Te, Tf;
|
||||
Tb = R0[WS(rs, 9)];
|
||||
Tc = R1[WS(rs, 11)];
|
||||
Td = Tb - Tc;
|
||||
Te = R1[WS(rs, 1)];
|
||||
Tf = R1[WS(rs, 6)];
|
||||
Tg = Te + Tf;
|
||||
Th = Td - Tg;
|
||||
T15 = Tf - Te;
|
||||
T14 = Tb + Tc;
|
||||
}
|
||||
Ti = Ta + Th;
|
||||
T26 = FNMS(KP293892626, T14, KP475528258 * T15);
|
||||
T16 = FMA(KP475528258, T14, KP293892626 * T15);
|
||||
T11 = FNMS(KP250000000, Th, Ta);
|
||||
T12 = KP559016994 * (Td + Tg);
|
||||
T13 = T11 + T12;
|
||||
T25 = T11 - T12;
|
||||
}
|
||||
{
|
||||
E Tk, Tn, Tq, Tr, T1d, T1c, T19, T1a;
|
||||
Tk = R0[WS(rs, 2)];
|
||||
{
|
||||
E Tl, Tm, To, Tp;
|
||||
Tl = R0[WS(rs, 7)];
|
||||
Tm = R1[WS(rs, 9)];
|
||||
Tn = Tl - Tm;
|
||||
To = R0[WS(rs, 12)];
|
||||
Tp = R1[WS(rs, 4)];
|
||||
Tq = To - Tp;
|
||||
Tr = Tn + Tq;
|
||||
T1d = To + Tp;
|
||||
T1c = Tl + Tm;
|
||||
}
|
||||
Ts = Tk + Tr;
|
||||
T2a = FNMS(KP293892626, T1c, KP475528258 * T1d);
|
||||
T1e = FMA(KP475528258, T1c, KP293892626 * T1d);
|
||||
T19 = KP559016994 * (Tn - Tq);
|
||||
T1a = FNMS(KP250000000, Tr, Tk);
|
||||
T1b = T19 + T1a;
|
||||
T29 = T1a - T19;
|
||||
}
|
||||
/* Output stage: every input has been loaded above; stores begin here. */
TP = TB - Ts;
|
||||
TQ = T9 - Ti;
|
||||
Ci[WS(csi, 2)] = FNMS(KP951056516, TQ, KP587785252 * TP);
|
||||
Ci[WS(csi, 7)] = FMA(KP587785252, TQ, KP951056516 * TP);
|
||||
{
|
||||
E TM, TD, TN, Tj, TC, TO;
|
||||
TM = TE + TL;
|
||||
Tj = T9 + Ti;
|
||||
TC = Ts + TB;
|
||||
TD = KP559016994 * (Tj - TC);
|
||||
TN = Tj + TC;
|
||||
Cr[WS(csr, 12)] = TM + TN;
|
||||
TO = FNMS(KP250000000, TN, TM);
|
||||
Cr[WS(csr, 2)] = TD + TO;
|
||||
Cr[WS(csr, 7)] = TO - TD;
|
||||
}
|
||||
{
|
||||
E TT, T1J, T1Y, T1U, T1X, T1P, T1V, T1M, T1W, T1A, T1B, T1r, T1C, T1v, T18;
|
||||
E T1n, T1o, T1G, T1D;
|
||||
TT = TR + TS;
|
||||
{
|
||||
E T1H, T1I, T1S, T1T;
|
||||
T1H = FNMS(KP844327925, TW, KP1_071653589 * TZ);
|
||||
T1I = FNMS(KP1_274847979, T16, KP770513242 * T13);
|
||||
T1J = T1H - T1I;
|
||||
T1Y = T1H + T1I;
|
||||
T1S = FMA(KP125333233, T1i, KP1_984229402 * T1l);
|
||||
T1T = FMA(KP904827052, T1b, KP851558583 * T1e);
|
||||
T1U = T1S - T1T;
|
||||
T1X = T1T + T1S;
|
||||
}
|
||||
{
|
||||
E T1N, T1O, T1K, T1L;
|
||||
T1N = FMA(KP535826794, TW, KP1_688655851 * TZ);
|
||||
T1O = FMA(KP637423989, T13, KP1_541026485 * T16);
|
||||
T1P = T1N - T1O;
|
||||
T1V = T1N + T1O;
|
||||
T1K = FNMS(KP1_809654104, T1e, KP425779291 * T1b);
|
||||
T1L = FNMS(KP992114701, T1i, KP250666467 * T1l);
|
||||
T1M = T1K - T1L;
|
||||
T1W = T1K + T1L;
|
||||
}
|
||||
{
|
||||
E T1p, T1q, T1t, T1u;
|
||||
T1p = FMA(KP844327925, T13, KP1_071653589 * T16);
|
||||
T1q = FMA(KP248689887, TW, KP1_937166322 * TZ);
|
||||
T1A = T1q + T1p;
|
||||
T1t = FMA(KP481753674, T1b, KP1_752613360 * T1e);
|
||||
T1u = FMA(KP684547105, T1i, KP1_457937254 * T1l);
|
||||
T1B = T1t + T1u;
|
||||
T1r = T1p - T1q;
|
||||
T1C = T1A + T1B;
|
||||
T1v = T1t - T1u;
|
||||
}
|
||||
{
|
||||
E T10, T17, T1f, T1m;
|
||||
T10 = FNMS(KP497379774, TZ, KP968583161 * TW);
|
||||
T17 = FNMS(KP1_688655851, T16, KP535826794 * T13);
|
||||
T18 = T10 + T17;
|
||||
T1f = FNMS(KP963507348, T1e, KP876306680 * T1b);
|
||||
T1m = FNMS(KP1_369094211, T1l, KP728968627 * T1i);
|
||||
T1n = T1f + T1m;
|
||||
T1o = T18 + T1n;
|
||||
T1G = T10 - T17;
|
||||
T1D = T1f - T1m;
|
||||
}
|
||||
{
|
||||
E T1R, T1Q, T20, T1Z;
|
||||
Cr[0] = TT + T1o;
|
||||
Ci[0] = -(T1z + T1C);
|
||||
T1R = KP559016994 * (T1P + T1M);
|
||||
T1Q = FMA(KP250000000, T1M - T1P, TT);
|
||||
Cr[WS(csr, 4)] = FMA(KP951056516, T1J, T1Q) + FMA(KP587785252, T1U, T1R);
|
||||
Cr[WS(csr, 9)] = FMA(KP951056516, T1U, T1Q) + FNMA(KP587785252, T1J, T1R);
|
||||
T20 = KP559016994 * (T1Y + T1X);
|
||||
T1Z = FMA(KP250000000, T1X - T1Y, T1z);
|
||||
Ci[WS(csi, 9)] = FMA(KP587785252, T1V, KP951056516 * T1W) + T1Z - T20;
|
||||
Ci[WS(csi, 4)] = FMA(KP587785252, T1W, T1Z) + FNMS(KP951056516, T1V, T20);
|
||||
{
|
||||
E T1E, T1F, T1s, T1w;
|
||||
T1E = FMS(KP250000000, T1C, T1z);
|
||||
T1F = KP559016994 * (T1B - T1A);
|
||||
Ci[WS(csi, 5)] = FMA(KP951056516, T1D, T1E) + FNMA(KP587785252, T1G, T1F);
|
||||
Ci[WS(csi, 10)] = FMA(KP951056516, T1G, KP587785252 * T1D) + T1E + T1F;
|
||||
T1s = FNMS(KP250000000, T1o, TT);
|
||||
T1w = KP559016994 * (T18 - T1n);
|
||||
Cr[WS(csr, 5)] = FMA(KP587785252, T1r, T1s) + FMS(KP951056516, T1v, T1w);
|
||||
Cr[WS(csr, 10)] = T1w + FMA(KP587785252, T1v, T1s) - (KP951056516 * T1r);
|
||||
}
|
||||
}
|
||||
}
|
||||
{
|
||||
E T21, T2z, T2L, T2K, T2M, T2F, T2P, T2C, T2Q, T2l, T2o, T2p, T2w, T2u, T28;
|
||||
E T2f, T2g, T2s, T2h;
|
||||
T21 = TS - TR;
|
||||
{
|
||||
E T2x, T2y, T2I, T2J;
|
||||
T2x = FNMS(KP844327925, T29, KP1_071653589 * T2a);
|
||||
T2y = FNMS(KP125581039, T2d, KP998026728 * T2c);
|
||||
T2z = T2x + T2y;
|
||||
T2L = T2y - T2x;
|
||||
T2I = FNMS(KP481753674, T22, KP1_752613360 * T23);
|
||||
T2J = FMA(KP904827052, T25, KP851558583 * T26);
|
||||
T2K = T2I + T2J;
|
||||
T2M = T2I - T2J;
|
||||
}
|
||||
{
|
||||
E T2D, T2E, T2A, T2B;
|
||||
T2D = FMA(KP535826794, T29, KP1_688655851 * T2a);
|
||||
T2E = FMA(KP062790519, T2c, KP1_996053456 * T2d);
|
||||
T2F = T2D + T2E;
|
||||
T2P = T2E - T2D;
|
||||
T2A = FMA(KP876306680, T22, KP963507348 * T23);
|
||||
T2B = FNMS(KP425779291, T25, KP1_809654104 * T26);
|
||||
T2C = T2A + T2B;
|
||||
T2Q = T2A - T2B;
|
||||
}
|
||||
{
|
||||
E T2j, T2k, T2m, T2n;
|
||||
T2j = FNMS(KP125333233, T25, KP1_984229402 * T26);
|
||||
T2k = FMA(KP684547105, T22, KP1_457937254 * T23);
|
||||
T2l = T2j - T2k;
|
||||
T2m = FNMS(KP770513242, T2c, KP1_274847979 * T2d);
|
||||
T2n = FMA(KP998026728, T29, KP125581039 * T2a);
|
||||
T2o = T2m - T2n;
|
||||
T2p = T2l + T2o;
|
||||
T2w = T2k + T2j;
|
||||
T2u = T2n + T2m;
|
||||
}
|
||||
{
|
||||
E T24, T27, T2b, T2e;
|
||||
T24 = FNMS(KP1_369094211, T23, KP728968627 * T22);
|
||||
T27 = FMA(KP992114701, T25, KP250666467 * T26);
|
||||
T28 = T24 - T27;
|
||||
T2b = FNMS(KP1_996053456, T2a, KP062790519 * T29);
|
||||
T2e = FMA(KP637423989, T2c, KP1_541026485 * T2d);
|
||||
T2f = T2b - T2e;
|
||||
T2g = T28 + T2f;
|
||||
T2s = T24 + T27;
|
||||
T2h = T2b + T2e;
|
||||
}
|
||||
{
|
||||
E T2H, T2G, T2O, T2N;
|
||||
Cr[WS(csr, 1)] = T21 + T2g;
|
||||
Ci[WS(csi, 1)] = T2p - T2i;
|
||||
T2H = KP559016994 * (T2C - T2F);
|
||||
T2G = FNMS(KP250000000, T2C + T2F, T21);
|
||||
Cr[WS(csr, 8)] = FMA(KP951056516, T2z, T2G) + FNMA(KP587785252, T2K, T2H);
|
||||
Cr[WS(csr, 3)] = FMA(KP951056516, T2K, KP587785252 * T2z) + T2G + T2H;
|
||||
T2O = KP559016994 * (T2M + T2L);
|
||||
T2N = FMA(KP250000000, T2L - T2M, T2i);
|
||||
Ci[WS(csi, 3)] = T2N + FMA(KP587785252, T2P, T2O) - (KP951056516 * T2Q);
|
||||
Ci[WS(csi, 8)] = FMA(KP587785252, T2Q, T2N) + FMS(KP951056516, T2P, T2O);
|
||||
{
|
||||
E T2t, T2v, T2q, T2r;
|
||||
T2t = FNMS(KP250000000, T2g, T21);
|
||||
T2v = KP559016994 * (T28 - T2f);
|
||||
Cr[WS(csr, 6)] = FMA(KP951056516, T2u, T2t) + FNMA(KP587785252, T2w, T2v);
|
||||
Cr[WS(csr, 11)] = FMA(KP951056516, T2w, T2v) + FMA(KP587785252, T2u, T2t);
|
||||
T2q = KP250000000 * T2p;
|
||||
T2r = KP559016994 * (T2l - T2o);
|
||||
Ci[WS(csi, 6)] = FMS(KP951056516, T2h, T2i + T2q) + FNMA(KP587785252, T2s, T2r);
|
||||
Ci[WS(csi, 11)] = FMA(KP951056516, T2s, KP587785252 * T2h) + T2r - (T2i + T2q);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 25, "r2cfII_25", { 126, 61, 87, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_25) (planner *p) { X(kr2c_register) (p, r2cfII_25, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
96
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_3.c
Normal file
96
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_3.c
Normal file
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cfII_3 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 4 FP additions, 2 FP multiplications,
|
||||
* (or, 3 additions, 1 multiplications, 1 fused multiply/add),
|
||||
* 7 stack variables, 2 constants, and 6 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_3 (FMA variant): machine-generated size-3 kernel (genfft gen_r2cf,
 * -n 3 -dft-II -fma, per the generator comment preceding this function).
 *
 * Each of the v iterations reads three inputs -- R0[0], R0[WS(rs, 1)] and
 * R1[0] -- and then writes three outputs -- Ci[0], Cr[0] and
 * Cr[WS(csr, 1)].  Afterwards R0/R1 advance by ivs and Cr/Ci by ovs.
 * csi is only passed to MAKE_VOLATILE_STRIDE (a project macro not defined
 * in this file chunk).
 */
static void r2cfII_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
	  INT i;
	  for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
	       /* All loads happen before the first store. */
	       E a0 = R0[0];
	       E b0 = R1[0];
	       E a1 = R0[WS(rs, 1)];
	       E delta = a1 - b0;

	       Ci[0] = -(KP866025403 * (b0 + a1));
	       /* NOTE(review): FNMS(a, b, c) presumably evaluates c - a * b;
		  the macro is defined outside this chunk. */
	       Cr[0] = FNMS(KP500000000, delta, a0);
	       Cr[WS(csr, 1)] = a0 + delta;
	  }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 3, "r2cfII_3", { 3, 1, 1, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_3) (planner *p) { X(kr2c_register) (p, r2cfII_3, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cfII_3 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 4 FP additions, 2 FP multiplications,
|
||||
* (or, 3 additions, 1 multiplications, 1 fused multiply/add),
|
||||
* 7 stack variables, 2 constants, and 6 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_3 -- variant built when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Each pass reads three reals (R0[0], R1[0], R0[WS(rs, 1)]) and writes
 * three results (Cr[WS(csr, 1)], Ci[0], Cr[0]).  The pass is repeated v
 * times; after every pass the input pointers move by ivs and the output
 * pointers by ovs.  csi is only touched by MAKE_VOLATILE_STRIDE here.
 */
static void r2cfII_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
               /* every input is fetched before the first store */
               E first = R0[0];
               E odd = R1[0];
               E second = R0[WS(rs, 1)];
               E delta = odd - second;

               Cr[WS(csr, 1)] = first - delta;
               Ci[0] = -(KP866025403 * (odd + second));
               Cr[0] = FMA(KP500000000, delta, first);
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 3, "r2cfII_3", { 3, 1, 1, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_3) (planner *p) { X(kr2c_register) (p, r2cfII_3, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
686
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_32.c
Normal file
686
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_32.c
Normal file
@@ -0,0 +1,686 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:25 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cfII_32 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 128 FP multiplications,
|
||||
* (or, 46 additions, 0 multiplications, 128 fused multiply/add),
|
||||
* 62 stack variables, 15 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_32 -- variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generator options: -n 32 -dft-II).
 *
 * R0, R1 : input arrays; 16 values are read from each, at R0[0]/R1[0]
 *          and at indices WS(rs, 1) .. WS(rs, 15).
 * Cr, Ci : output arrays; 16 values are written to each, at Cr[0]/Ci[0]
 *          and at indices WS(csr, k) / WS(csi, k) for k = 1 .. 15.
 * v      : number of passes of the loop below.
 * ivs    : amount added to R0 and R1 after every pass.
 * ovs    : amount added to Cr and Ci after every pass.
 *
 * Within one pass all 32 loads happen (first six inner blocks) before
 * any of the 32 stores (last four inner blocks).
 * NOTE(review): WS(s, k) is defined elsewhere; presumably it turns the
 * logical index k into an element offset for stride s -- confirm.
 */
static void r2cfII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP773010453, +0.773010453362736960810906609758469800971041293);
|
||||
DK(KP820678790, +0.820678790828660330972281985331011598767386482);
|
||||
DK(KP956940335, +0.956940335732208864935797886980269969482849206);
|
||||
DK(KP303346683, +0.303346683607342391675883946941299872384187453);
|
||||
DK(KP995184726, +0.995184726672196886244836953109479921575474869);
|
||||
DK(KP098491403, +0.098491403357164253077197521291327432293052451);
|
||||
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
DK(KP881921264, +0.881921264348355029712756863660388349508442621);
|
||||
DK(KP534511135, +0.534511135950791641089685961295362908582039528);
|
||||
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP198912367, +0.198912367379658006911597622644676228597850501);
|
||||
DK(KP668178637, +0.668178637919298919997757686523080761552472251);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One pass per i; the increment clause advances all four pointers. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
|
||||
/* Intermediates that survive from the load blocks to the store blocks. */
E T5, T2B, T1z, T2n, Tc, T2C, T1C, T2o, Tm, T1l, T1J, T27, Tv, T1k, T1G;
|
||||
E T26, T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d, TK, T1o, T1R, T2b, TR, T1p;
|
||||
E T1O, T2a;
|
||||
/* Loads R0 indices 0, 8, 4, 12 -> T5, T2B, T1z, T2n. */
{
|
||||
E T1, T2l, T4, T2m, T2, T3;
|
||||
T1 = R0[0];
|
||||
T2l = R0[WS(rs, 8)];
|
||||
T2 = R0[WS(rs, 4)];
|
||||
T3 = R0[WS(rs, 12)];
|
||||
T4 = T2 - T3;
|
||||
T2m = T2 + T3;
|
||||
T5 = FNMS(KP707106781, T4, T1);
|
||||
T2B = FNMS(KP707106781, T2m, T2l);
|
||||
T1z = FMA(KP707106781, T4, T1);
|
||||
T2n = FMA(KP707106781, T2m, T2l);
|
||||
}
|
||||
/* Loads R0 indices 10, 2, 6, 14 -> Tc, T2C, T1C, T2o. */
{
|
||||
E T8, T1A, Tb, T1B;
|
||||
{
|
||||
E T6, T7, T9, Ta;
|
||||
T6 = R0[WS(rs, 10)];
|
||||
T7 = R0[WS(rs, 2)];
|
||||
T8 = FMA(KP414213562, T7, T6);
|
||||
T1A = FNMS(KP414213562, T6, T7);
|
||||
T9 = R0[WS(rs, 6)];
|
||||
Ta = R0[WS(rs, 14)];
|
||||
Tb = FMA(KP414213562, Ta, T9);
|
||||
T1B = FMS(KP414213562, T9, Ta);
|
||||
}
|
||||
Tc = T8 - Tb;
|
||||
T2C = T1B - T1A;
|
||||
T1C = T1A + T1B;
|
||||
T2o = T8 + Tb;
|
||||
}
|
||||
/* Loads R0 indices 7, 15, 3, 11 -> Tm, T1l, T1J, T27. */
{
|
||||
E Te, Tj, Th, Tk, Tf, Tg;
|
||||
Te = R0[WS(rs, 7)];
|
||||
Tj = R0[WS(rs, 15)];
|
||||
Tf = R0[WS(rs, 3)];
|
||||
Tg = R0[WS(rs, 11)];
|
||||
Th = Tf + Tg;
|
||||
Tk = Tg - Tf;
|
||||
{
|
||||
E Ti, Tl, T1H, T1I;
|
||||
Ti = FNMS(KP707106781, Th, Te);
|
||||
Tl = FNMS(KP707106781, Tk, Tj);
|
||||
Tm = FNMS(KP668178637, Tl, Ti);
|
||||
T1l = FMA(KP668178637, Ti, Tl);
|
||||
T1H = FMA(KP707106781, Th, Te);
|
||||
T1I = FMA(KP707106781, Tk, Tj);
|
||||
T1J = FMA(KP198912367, T1I, T1H);
|
||||
T27 = FNMS(KP198912367, T1H, T1I);
|
||||
}
|
||||
}
|
||||
/* Loads R0 indices 9, 1, 5, 13 -> Tv, T1k, T1G, T26. */
{
|
||||
E Tn, Ts, Tq, Tt, To, Tp;
|
||||
Tn = R0[WS(rs, 9)];
|
||||
Ts = R0[WS(rs, 1)];
|
||||
To = R0[WS(rs, 5)];
|
||||
Tp = R0[WS(rs, 13)];
|
||||
Tq = To + Tp;
|
||||
Tt = To - Tp;
|
||||
{
|
||||
E Tr, Tu, T1E, T1F;
|
||||
Tr = FNMS(KP707106781, Tq, Tn);
|
||||
Tu = FNMS(KP707106781, Tt, Ts);
|
||||
Tv = FNMS(KP668178637, Tu, Tr);
|
||||
T1k = FMA(KP668178637, Tr, Tu);
|
||||
T1E = FMA(KP707106781, Tq, Tn);
|
||||
T1F = FMA(KP707106781, Tt, Ts);
|
||||
T1G = FMA(KP198912367, T1F, T1E);
|
||||
T26 = FNMS(KP198912367, T1E, T1F);
|
||||
}
|
||||
}
|
||||
/* Loads R1 indices 15, 7, 3, 11, 9, 1, 5, 13 -> T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d. */
{
|
||||
E TT, T16, TW, T17, T10, T1a, T13, T19, TU, TV;
|
||||
TT = R1[WS(rs, 15)];
|
||||
T16 = R1[WS(rs, 7)];
|
||||
TU = R1[WS(rs, 3)];
|
||||
TV = R1[WS(rs, 11)];
|
||||
TW = TU - TV;
|
||||
T17 = TU + TV;
|
||||
{
|
||||
E TY, TZ, T11, T12;
|
||||
TY = R1[WS(rs, 9)];
|
||||
TZ = R1[WS(rs, 1)];
|
||||
T10 = FMA(KP414213562, TZ, TY);
|
||||
T1a = FNMS(KP414213562, TY, TZ);
|
||||
T11 = R1[WS(rs, 5)];
|
||||
T12 = R1[WS(rs, 13)];
|
||||
T13 = FMA(KP414213562, T12, T11);
|
||||
T19 = FMS(KP414213562, T11, T12);
|
||||
}
|
||||
{
|
||||
E TX, T14, T1W, T1X;
|
||||
TX = FMA(KP707106781, TW, TT);
|
||||
T14 = T10 - T13;
|
||||
T15 = FMA(KP923879532, T14, TX);
|
||||
T1r = FNMS(KP923879532, T14, TX);
|
||||
T1W = FMA(KP707106781, T17, T16);
|
||||
T1X = T10 + T13;
|
||||
T1Y = FNMS(KP923879532, T1X, T1W);
|
||||
T2e = FMA(KP923879532, T1X, T1W);
|
||||
}
|
||||
{
|
||||
E T18, T1b, T1T, T1U;
|
||||
T18 = FNMS(KP707106781, T17, T16);
|
||||
T1b = T19 - T1a;
|
||||
T1c = FNMS(KP923879532, T1b, T18);
|
||||
T1s = FMA(KP923879532, T1b, T18);
|
||||
T1T = FMS(KP707106781, TW, TT);
|
||||
T1U = T1a + T19;
|
||||
T1V = FNMS(KP923879532, T1U, T1T);
|
||||
T2d = FMA(KP923879532, T1U, T1T);
|
||||
}
|
||||
}
|
||||
/* Loads R1 indices 0, 8, 4, 12, 10, 2, 6, 14 -> TK, T1o, T1R, T2b, TR, T1p, T1O, T2a. */
{
|
||||
E Ty, TL, TB, TM, TF, TP, TI, TO, Tz, TA;
|
||||
Ty = R1[0];
|
||||
TL = R1[WS(rs, 8)];
|
||||
Tz = R1[WS(rs, 4)];
|
||||
TA = R1[WS(rs, 12)];
|
||||
TB = Tz - TA;
|
||||
TM = Tz + TA;
|
||||
{
|
||||
E TD, TE, TG, TH;
|
||||
TD = R1[WS(rs, 10)];
|
||||
TE = R1[WS(rs, 2)];
|
||||
TF = FMA(KP414213562, TE, TD);
|
||||
TP = FNMS(KP414213562, TD, TE);
|
||||
TG = R1[WS(rs, 6)];
|
||||
TH = R1[WS(rs, 14)];
|
||||
TI = FMA(KP414213562, TH, TG);
|
||||
TO = FMS(KP414213562, TG, TH);
|
||||
}
|
||||
{
|
||||
E TC, TJ, T1P, T1Q;
|
||||
TC = FNMS(KP707106781, TB, Ty);
|
||||
TJ = TF - TI;
|
||||
TK = FNMS(KP923879532, TJ, TC);
|
||||
T1o = FMA(KP923879532, TJ, TC);
|
||||
T1P = FMA(KP707106781, TM, TL);
|
||||
T1Q = TF + TI;
|
||||
T1R = FNMS(KP923879532, T1Q, T1P);
|
||||
T2b = FMA(KP923879532, T1Q, T1P);
|
||||
}
|
||||
{
|
||||
E TN, TQ, T1M, T1N;
|
||||
TN = FNMS(KP707106781, TM, TL);
|
||||
TQ = TO - TP;
|
||||
TR = FNMS(KP923879532, TQ, TN);
|
||||
T1p = FMA(KP923879532, TQ, TN);
|
||||
T1M = FMA(KP707106781, TB, Ty);
|
||||
T1N = TP + TO;
|
||||
T1O = FNMS(KP923879532, T1N, T1M);
|
||||
T2a = FMA(KP923879532, T1N, T1M);
|
||||
}
|
||||
}
|
||||
/* Stores Cr/Ci at output indices 13, 2, 10, 5. */
{
|
||||
E Tx, T1f, T2L, T2N, T1e, T2O, T1i, T2M;
|
||||
{
|
||||
E Td, Tw, T2J, T2K;
|
||||
Td = FNMS(KP923879532, Tc, T5);
|
||||
Tw = Tm - Tv;
|
||||
Tx = FMA(KP831469612, Tw, Td);
|
||||
T1f = FNMS(KP831469612, Tw, Td);
|
||||
T2J = FNMS(KP923879532, T2C, T2B);
|
||||
T2K = T1k + T1l;
|
||||
T2L = FMA(KP831469612, T2K, T2J);
|
||||
T2N = FNMS(KP831469612, T2K, T2J);
|
||||
}
|
||||
{
|
||||
E TS, T1d, T1g, T1h;
|
||||
TS = FNMS(KP534511135, TR, TK);
|
||||
T1d = FNMS(KP534511135, T1c, T15);
|
||||
T1e = TS - T1d;
|
||||
T2O = TS + T1d;
|
||||
T1g = FMA(KP534511135, TK, TR);
|
||||
T1h = FMA(KP534511135, T15, T1c);
|
||||
T1i = T1g - T1h;
|
||||
T2M = T1g + T1h;
|
||||
}
|
||||
Cr[WS(csr, 13)] = FNMS(KP881921264, T1e, Tx);
|
||||
Ci[WS(csi, 13)] = FNMS(KP881921264, T2M, T2L);
|
||||
Cr[WS(csr, 2)] = FMA(KP881921264, T1e, Tx);
|
||||
Ci[WS(csi, 2)] = -(FMA(KP881921264, T2M, T2L));
|
||||
Cr[WS(csr, 10)] = FNMS(KP881921264, T1i, T1f);
|
||||
Ci[WS(csi, 10)] = -(FMA(KP881921264, T2O, T2N));
|
||||
Cr[WS(csr, 5)] = FMA(KP881921264, T1i, T1f);
|
||||
Ci[WS(csi, 5)] = FNMS(KP881921264, T2O, T2N);
|
||||
}
|
||||
/* Stores Cr/Ci at output indices 15, 0, 8, 7. */
{
|
||||
E T29, T2h, T2r, T2t, T2g, T2u, T2k, T2s;
|
||||
{
|
||||
E T25, T28, T2p, T2q;
|
||||
T25 = FMA(KP923879532, T1C, T1z);
|
||||
T28 = T26 - T27;
|
||||
T29 = FMA(KP980785280, T28, T25);
|
||||
T2h = FNMS(KP980785280, T28, T25);
|
||||
T2p = FMA(KP923879532, T2o, T2n);
|
||||
T2q = T1G + T1J;
|
||||
T2r = FMA(KP980785280, T2q, T2p);
|
||||
T2t = FNMS(KP980785280, T2q, T2p);
|
||||
}
|
||||
{
|
||||
E T2c, T2f, T2i, T2j;
|
||||
T2c = FNMS(KP098491403, T2b, T2a);
|
||||
T2f = FMA(KP098491403, T2e, T2d);
|
||||
T2g = T2c + T2f;
|
||||
T2u = T2f - T2c;
|
||||
T2i = FMA(KP098491403, T2a, T2b);
|
||||
T2j = FNMS(KP098491403, T2d, T2e);
|
||||
T2k = T2i - T2j;
|
||||
T2s = T2i + T2j;
|
||||
}
|
||||
Cr[WS(csr, 15)] = FNMS(KP995184726, T2g, T29);
|
||||
Ci[WS(csi, 15)] = FNMS(KP995184726, T2s, T2r);
|
||||
Cr[0] = FMA(KP995184726, T2g, T29);
|
||||
Ci[0] = -(FMA(KP995184726, T2s, T2r));
|
||||
Cr[WS(csr, 8)] = FNMS(KP995184726, T2k, T2h);
|
||||
Ci[WS(csi, 8)] = FMS(KP995184726, T2u, T2t);
|
||||
Cr[WS(csr, 7)] = FMA(KP995184726, T2k, T2h);
|
||||
Ci[WS(csi, 7)] = FMA(KP995184726, T2u, T2t);
|
||||
}
|
||||
/* Stores Cr/Ci at output indices 14, 1, 9, 6. */
{
|
||||
E T1n, T1v, T2F, T2H, T1u, T2I, T1y, T2G;
|
||||
{
|
||||
E T1j, T1m, T2D, T2E;
|
||||
T1j = FMA(KP923879532, Tc, T5);
|
||||
T1m = T1k - T1l;
|
||||
T1n = FMA(KP831469612, T1m, T1j);
|
||||
T1v = FNMS(KP831469612, T1m, T1j);
|
||||
T2D = FMA(KP923879532, T2C, T2B);
|
||||
T2E = Tv + Tm;
|
||||
T2F = FMA(KP831469612, T2E, T2D);
|
||||
T2H = FNMS(KP831469612, T2E, T2D);
|
||||
}
|
||||
{
|
||||
E T1q, T1t, T1w, T1x;
|
||||
T1q = FMA(KP303346683, T1p, T1o);
|
||||
T1t = FMA(KP303346683, T1s, T1r);
|
||||
T1u = T1q - T1t;
|
||||
T2I = T1q + T1t;
|
||||
T1w = FNMS(KP303346683, T1r, T1s);
|
||||
T1x = FNMS(KP303346683, T1o, T1p);
|
||||
T1y = T1w - T1x;
|
||||
T2G = T1x + T1w;
|
||||
}
|
||||
Cr[WS(csr, 14)] = FNMS(KP956940335, T1u, T1n);
|
||||
Ci[WS(csi, 14)] = FMS(KP956940335, T2G, T2F);
|
||||
Cr[WS(csr, 1)] = FMA(KP956940335, T1u, T1n);
|
||||
Ci[WS(csi, 1)] = FMA(KP956940335, T2G, T2F);
|
||||
Cr[WS(csr, 9)] = FNMS(KP956940335, T1y, T1v);
|
||||
Ci[WS(csi, 9)] = FNMS(KP956940335, T2I, T2H);
|
||||
Cr[WS(csr, 6)] = FMA(KP956940335, T1y, T1v);
|
||||
Ci[WS(csi, 6)] = -(FMA(KP956940335, T2I, T2H));
|
||||
}
|
||||
/* Stores Cr/Ci at output indices 12, 3, 11, 4. */
{
|
||||
E T1L, T21, T2x, T2z, T20, T2A, T24, T2y;
|
||||
{
|
||||
E T1D, T1K, T2v, T2w;
|
||||
T1D = FNMS(KP923879532, T1C, T1z);
|
||||
T1K = T1G - T1J;
|
||||
T1L = FMA(KP980785280, T1K, T1D);
|
||||
T21 = FNMS(KP980785280, T1K, T1D);
|
||||
T2v = FNMS(KP923879532, T2o, T2n);
|
||||
T2w = T26 + T27;
|
||||
T2x = FNMS(KP980785280, T2w, T2v);
|
||||
T2z = FMA(KP980785280, T2w, T2v);
|
||||
}
|
||||
{
|
||||
E T1S, T1Z, T22, T23;
|
||||
T1S = FMA(KP820678790, T1R, T1O);
|
||||
T1Z = FNMS(KP820678790, T1Y, T1V);
|
||||
T20 = T1S + T1Z;
|
||||
T2A = T1Z - T1S;
|
||||
T22 = FMA(KP820678790, T1V, T1Y);
|
||||
T23 = FNMS(KP820678790, T1O, T1R);
|
||||
T24 = T22 - T23;
|
||||
T2y = T23 + T22;
|
||||
}
|
||||
Cr[WS(csr, 12)] = FNMS(KP773010453, T20, T1L);
|
||||
Ci[WS(csi, 12)] = FMS(KP773010453, T2y, T2x);
|
||||
Cr[WS(csr, 3)] = FMA(KP773010453, T20, T1L);
|
||||
Ci[WS(csi, 3)] = FMA(KP773010453, T2y, T2x);
|
||||
Cr[WS(csr, 11)] = FNMS(KP773010453, T24, T21);
|
||||
Ci[WS(csi, 11)] = FMA(KP773010453, T2A, T2z);
|
||||
Cr[WS(csr, 4)] = FMA(KP773010453, T24, T21);
|
||||
Ci[WS(csi, 4)] = FMS(KP773010453, T2A, T2z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 32, "r2cfII_32", { 46, 0, 128, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_32) (planner *p) { X(kr2c_register) (p, r2cfII_32, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cfII_32 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 82 FP multiplications,
|
||||
* (or, 138 additions, 46 multiplications, 36 fused multiply/add),
|
||||
* 62 stack variables, 15 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_32 -- variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined (generator options: -n 32 -dft-II).
 *
 * R0, R1 : input arrays; 16 values are read from each, at R0[0]/R1[0]
 *          and at indices WS(rs, 1) .. WS(rs, 15).
 * Cr, Ci : output arrays; 16 values are written to each, at Cr[0]/Ci[0]
 *          and at indices WS(csr, k) / WS(csi, k) for k = 1 .. 15.
 * v      : number of passes of the loop below.
 * ivs    : amount added to R0 and R1 after every pass.
 * ovs    : amount added to Cr and Ci after every pass.
 *
 * Within one pass all 32 loads happen (first six inner blocks) before
 * any of the 32 stores (last four inner blocks).
 * NOTE(review): WS(s, k) is defined elsewhere; presumably it turns the
 * logical index k into an element offset for stride s -- confirm.
 */
static void r2cfII_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP471396736, +0.471396736825997648556387625905254377657460319);
|
||||
DK(KP881921264, +0.881921264348355029712756863660388349508442621);
|
||||
DK(KP634393284, +0.634393284163645498215171613225493370675687095);
|
||||
DK(KP773010453, +0.773010453362736960810906609758469800971041293);
|
||||
DK(KP290284677, +0.290284677254462367636192375817395274691476278);
|
||||
DK(KP956940335, +0.956940335732208864935797886980269969482849206);
|
||||
DK(KP995184726, +0.995184726672196886244836953109479921575474869);
|
||||
DK(KP098017140, +0.098017140329560601994195563888641845861136673);
|
||||
DK(KP555570233, +0.555570233019602224742830813948532874374937191);
|
||||
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
DK(KP195090322, +0.195090322016128267848284868477022240927691618);
|
||||
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One pass per i; the increment clause advances all four pointers. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
|
||||
/* Intermediates that survive from the load blocks to the store blocks. */
E T5, T2D, T1z, T2q, Tc, T2C, T1C, T2n, Tm, T1k, T1J, T26, Tv, T1l, T1G;
|
||||
E T27, T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d, TK, T1o, T1R, T2b, TR, T1p;
|
||||
E T1O, T2a;
|
||||
/* Loads R0 indices 0, 8, 4, 12 -> T5, T2D, T1z, T2q. */
{
|
||||
E T1, T2p, T4, T2o, T2, T3;
|
||||
T1 = R0[0];
|
||||
T2p = R0[WS(rs, 8)];
|
||||
T2 = R0[WS(rs, 4)];
|
||||
T3 = R0[WS(rs, 12)];
|
||||
T4 = KP707106781 * (T2 - T3);
|
||||
T2o = KP707106781 * (T2 + T3);
|
||||
T5 = T1 + T4;
|
||||
T2D = T2p - T2o;
|
||||
T1z = T1 - T4;
|
||||
T2q = T2o + T2p;
|
||||
}
|
||||
/* Loads R0 indices 2, 10, 6, 14 -> Tc, T2C, T1C, T2n. */
{
|
||||
E T8, T1A, Tb, T1B;
|
||||
{
|
||||
E T6, T7, T9, Ta;
|
||||
T6 = R0[WS(rs, 2)];
|
||||
T7 = R0[WS(rs, 10)];
|
||||
T8 = FNMS(KP382683432, T7, KP923879532 * T6);
|
||||
T1A = FMA(KP382683432, T6, KP923879532 * T7);
|
||||
T9 = R0[WS(rs, 6)];
|
||||
Ta = R0[WS(rs, 14)];
|
||||
Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
|
||||
T1B = FMA(KP923879532, T9, KP382683432 * Ta);
|
||||
}
|
||||
Tc = T8 + Tb;
|
||||
T2C = Tb - T8;
|
||||
T1C = T1A - T1B;
|
||||
T2n = T1A + T1B;
|
||||
}
|
||||
/* Loads R0 indices 1, 9, 5, 13 -> Tm, T1k, T1J, T26. */
{
|
||||
E Te, Tk, Th, Tj, Tf, Tg;
|
||||
Te = R0[WS(rs, 1)];
|
||||
Tk = R0[WS(rs, 9)];
|
||||
Tf = R0[WS(rs, 5)];
|
||||
Tg = R0[WS(rs, 13)];
|
||||
Th = KP707106781 * (Tf - Tg);
|
||||
Tj = KP707106781 * (Tf + Tg);
|
||||
{
|
||||
E Ti, Tl, T1H, T1I;
|
||||
Ti = Te + Th;
|
||||
Tl = Tj + Tk;
|
||||
Tm = FNMS(KP195090322, Tl, KP980785280 * Ti);
|
||||
T1k = FMA(KP195090322, Ti, KP980785280 * Tl);
|
||||
T1H = Tk - Tj;
|
||||
T1I = Te - Th;
|
||||
T1J = FNMS(KP555570233, T1I, KP831469612 * T1H);
|
||||
T26 = FMA(KP831469612, T1I, KP555570233 * T1H);
|
||||
}
|
||||
}
|
||||
/* Loads R0 indices 15, 7, 3, 11 -> Tv, T1l, T1G, T27. */
{
|
||||
E Tq, Tt, Tp, Ts, Tn, To;
|
||||
Tq = R0[WS(rs, 15)];
|
||||
Tt = R0[WS(rs, 7)];
|
||||
Tn = R0[WS(rs, 3)];
|
||||
To = R0[WS(rs, 11)];
|
||||
Tp = KP707106781 * (Tn - To);
|
||||
Ts = KP707106781 * (Tn + To);
|
||||
{
|
||||
E Tr, Tu, T1E, T1F;
|
||||
Tr = Tp - Tq;
|
||||
Tu = Ts + Tt;
|
||||
Tv = FMA(KP980785280, Tr, KP195090322 * Tu);
|
||||
T1l = FNMS(KP980785280, Tu, KP195090322 * Tr);
|
||||
T1E = Tt - Ts;
|
||||
T1F = Tp + Tq;
|
||||
T1G = FNMS(KP555570233, T1F, KP831469612 * T1E);
|
||||
T27 = FMA(KP831469612, T1F, KP555570233 * T1E);
|
||||
}
|
||||
}
|
||||
/* Loads R1 indices 15, 7, 3, 11, 1, 9, 5, 13 -> T15, T1r, T1Y, T2e, T1c, T1s, T1V, T2d. */
{
|
||||
E TW, T1a, TV, T19, T10, T16, T13, T17, TT, TU;
|
||||
TW = R1[WS(rs, 15)];
|
||||
T1a = R1[WS(rs, 7)];
|
||||
TT = R1[WS(rs, 3)];
|
||||
TU = R1[WS(rs, 11)];
|
||||
TV = KP707106781 * (TT - TU);
|
||||
T19 = KP707106781 * (TT + TU);
|
||||
{
|
||||
E TY, TZ, T11, T12;
|
||||
TY = R1[WS(rs, 1)];
|
||||
TZ = R1[WS(rs, 9)];
|
||||
T10 = FNMS(KP382683432, TZ, KP923879532 * TY);
|
||||
T16 = FMA(KP382683432, TY, KP923879532 * TZ);
|
||||
T11 = R1[WS(rs, 5)];
|
||||
T12 = R1[WS(rs, 13)];
|
||||
T13 = FNMS(KP923879532, T12, KP382683432 * T11);
|
||||
T17 = FMA(KP923879532, T11, KP382683432 * T12);
|
||||
}
|
||||
{
|
||||
E TX, T14, T1W, T1X;
|
||||
TX = TV - TW;
|
||||
T14 = T10 + T13;
|
||||
T15 = TX + T14;
|
||||
T1r = TX - T14;
|
||||
T1W = T13 - T10;
|
||||
T1X = T1a - T19;
|
||||
T1Y = T1W - T1X;
|
||||
T2e = T1W + T1X;
|
||||
}
|
||||
{
|
||||
E T18, T1b, T1T, T1U;
|
||||
T18 = T16 + T17;
|
||||
T1b = T19 + T1a;
|
||||
T1c = T18 + T1b;
|
||||
T1s = T1b - T18;
|
||||
T1T = TV + TW;
|
||||
T1U = T16 - T17;
|
||||
T1V = T1T + T1U;
|
||||
T2d = T1U - T1T;
|
||||
}
|
||||
}
|
||||
/* Loads R1 indices 0, 8, 4, 12, 2, 10, 6, 14 -> TK, T1o, T1R, T2b, TR, T1p, T1O, T2a. */
{
|
||||
E Ty, TP, TB, TO, TF, TL, TI, TM, Tz, TA;
|
||||
Ty = R1[0];
|
||||
TP = R1[WS(rs, 8)];
|
||||
Tz = R1[WS(rs, 4)];
|
||||
TA = R1[WS(rs, 12)];
|
||||
TB = KP707106781 * (Tz - TA);
|
||||
TO = KP707106781 * (Tz + TA);
|
||||
{
|
||||
E TD, TE, TG, TH;
|
||||
TD = R1[WS(rs, 2)];
|
||||
TE = R1[WS(rs, 10)];
|
||||
TF = FNMS(KP382683432, TE, KP923879532 * TD);
|
||||
TL = FMA(KP382683432, TD, KP923879532 * TE);
|
||||
TG = R1[WS(rs, 6)];
|
||||
TH = R1[WS(rs, 14)];
|
||||
TI = FNMS(KP923879532, TH, KP382683432 * TG);
|
||||
TM = FMA(KP923879532, TG, KP382683432 * TH);
|
||||
}
|
||||
{
|
||||
E TC, TJ, T1P, T1Q;
|
||||
TC = Ty + TB;
|
||||
TJ = TF + TI;
|
||||
TK = TC + TJ;
|
||||
T1o = TC - TJ;
|
||||
T1P = TI - TF;
|
||||
T1Q = TP - TO;
|
||||
T1R = T1P - T1Q;
|
||||
T2b = T1P + T1Q;
|
||||
}
|
||||
{
|
||||
E TN, TQ, T1M, T1N;
|
||||
TN = TL + TM;
|
||||
TQ = TO + TP;
|
||||
TR = TN + TQ;
|
||||
T1p = TQ - TN;
|
||||
T1M = Ty - TB;
|
||||
T1N = TL - TM;
|
||||
T1O = T1M - T1N;
|
||||
T2a = T1M + T1N;
|
||||
}
|
||||
}
|
||||
/* Stores Cr/Ci at output indices 8, 7, 15, 0. */
{
|
||||
E Tx, T1f, T2s, T2u, T1e, T2l, T1i, T2t;
|
||||
{
|
||||
E Td, Tw, T2m, T2r;
|
||||
Td = T5 + Tc;
|
||||
Tw = Tm + Tv;
|
||||
Tx = Td - Tw;
|
||||
T1f = Td + Tw;
|
||||
T2m = T1l - T1k;
|
||||
T2r = T2n + T2q;
|
||||
T2s = T2m - T2r;
|
||||
T2u = T2m + T2r;
|
||||
}
|
||||
{
|
||||
E TS, T1d, T1g, T1h;
|
||||
TS = FMA(KP098017140, TK, KP995184726 * TR);
|
||||
T1d = FNMS(KP995184726, T1c, KP098017140 * T15);
|
||||
T1e = TS + T1d;
|
||||
T2l = T1d - TS;
|
||||
T1g = FNMS(KP098017140, TR, KP995184726 * TK);
|
||||
T1h = FMA(KP995184726, T15, KP098017140 * T1c);
|
||||
T1i = T1g + T1h;
|
||||
T2t = T1h - T1g;
|
||||
}
|
||||
Cr[WS(csr, 8)] = Tx - T1e;
|
||||
Ci[WS(csi, 8)] = T2t - T2u;
|
||||
Cr[WS(csr, 7)] = Tx + T1e;
|
||||
Ci[WS(csi, 7)] = T2t + T2u;
|
||||
Cr[WS(csr, 15)] = T1f - T1i;
|
||||
Ci[WS(csi, 15)] = T2l - T2s;
|
||||
Cr[0] = T1f + T1i;
|
||||
Ci[0] = T2l + T2s;
|
||||
}
|
||||
/* Stores Cr/Ci at output indices 14, 1, 9, 6. */
{
|
||||
E T29, T2h, T2M, T2O, T2g, T2J, T2k, T2N;
|
||||
{
|
||||
E T25, T28, T2K, T2L;
|
||||
T25 = T1z + T1C;
|
||||
T28 = T26 - T27;
|
||||
T29 = T25 + T28;
|
||||
T2h = T25 - T28;
|
||||
T2K = T1J + T1G;
|
||||
T2L = T2C + T2D;
|
||||
T2M = T2K - T2L;
|
||||
T2O = T2K + T2L;
|
||||
}
|
||||
{
|
||||
E T2c, T2f, T2i, T2j;
|
||||
T2c = FMA(KP956940335, T2a, KP290284677 * T2b);
|
||||
T2f = FNMS(KP290284677, T2e, KP956940335 * T2d);
|
||||
T2g = T2c + T2f;
|
||||
T2J = T2f - T2c;
|
||||
T2i = FMA(KP290284677, T2d, KP956940335 * T2e);
|
||||
T2j = FNMS(KP290284677, T2a, KP956940335 * T2b);
|
||||
T2k = T2i - T2j;
|
||||
T2N = T2j + T2i;
|
||||
}
|
||||
Cr[WS(csr, 14)] = T29 - T2g;
|
||||
Ci[WS(csi, 14)] = T2N - T2O;
|
||||
Cr[WS(csr, 1)] = T29 + T2g;
|
||||
Ci[WS(csi, 1)] = T2N + T2O;
|
||||
Cr[WS(csr, 9)] = T2h - T2k;
|
||||
Ci[WS(csi, 9)] = T2J - T2M;
|
||||
Cr[WS(csr, 6)] = T2h + T2k;
|
||||
Ci[WS(csi, 6)] = T2J + T2M;
|
||||
}
|
||||
/* Stores Cr/Ci at output indices 12, 3, 11, 4. */
{
|
||||
E T1n, T1v, T2y, T2A, T1u, T2v, T1y, T2z;
|
||||
{
|
||||
E T1j, T1m, T2w, T2x;
|
||||
T1j = T5 - Tc;
|
||||
T1m = T1k + T1l;
|
||||
T1n = T1j + T1m;
|
||||
T1v = T1j - T1m;
|
||||
T2w = Tv - Tm;
|
||||
T2x = T2q - T2n;
|
||||
T2y = T2w - T2x;
|
||||
T2A = T2w + T2x;
|
||||
}
|
||||
{
|
||||
E T1q, T1t, T1w, T1x;
|
||||
T1q = FMA(KP773010453, T1o, KP634393284 * T1p);
|
||||
T1t = FNMS(KP634393284, T1s, KP773010453 * T1r);
|
||||
T1u = T1q + T1t;
|
||||
T2v = T1t - T1q;
|
||||
T1w = FMA(KP634393284, T1r, KP773010453 * T1s);
|
||||
T1x = FNMS(KP634393284, T1o, KP773010453 * T1p);
|
||||
T1y = T1w - T1x;
|
||||
T2z = T1x + T1w;
|
||||
}
|
||||
Cr[WS(csr, 12)] = T1n - T1u;
|
||||
Ci[WS(csi, 12)] = T2z - T2A;
|
||||
Cr[WS(csr, 3)] = T1n + T1u;
|
||||
Ci[WS(csi, 3)] = T2z + T2A;
|
||||
Cr[WS(csr, 11)] = T1v - T1y;
|
||||
Ci[WS(csi, 11)] = T2v - T2y;
|
||||
Cr[WS(csr, 4)] = T1v + T1y;
|
||||
Ci[WS(csi, 4)] = T2v + T2y;
|
||||
}
|
||||
/* Stores Cr/Ci at output indices 13, 2, 10, 5. */
{
|
||||
E T1L, T21, T2G, T2I, T20, T2H, T24, T2B;
|
||||
{
|
||||
E T1D, T1K, T2E, T2F;
|
||||
T1D = T1z - T1C;
|
||||
T1K = T1G - T1J;
|
||||
T1L = T1D + T1K;
|
||||
T21 = T1D - T1K;
|
||||
T2E = T2C - T2D;
|
||||
T2F = T26 + T27;
|
||||
T2G = T2E - T2F;
|
||||
T2I = T2F + T2E;
|
||||
}
|
||||
{
|
||||
E T1S, T1Z, T22, T23;
|
||||
T1S = FMA(KP881921264, T1O, KP471396736 * T1R);
|
||||
T1Z = FMA(KP881921264, T1V, KP471396736 * T1Y);
|
||||
T20 = T1S - T1Z;
|
||||
T2H = T1S + T1Z;
|
||||
T22 = FNMS(KP471396736, T1V, KP881921264 * T1Y);
|
||||
T23 = FNMS(KP471396736, T1O, KP881921264 * T1R);
|
||||
T24 = T22 - T23;
|
||||
T2B = T23 + T22;
|
||||
}
|
||||
Cr[WS(csr, 13)] = T1L - T20;
|
||||
Ci[WS(csi, 13)] = T2B - T2G;
|
||||
Cr[WS(csr, 2)] = T1L + T20;
|
||||
Ci[WS(csi, 2)] = T2B + T2G;
|
||||
Cr[WS(csr, 10)] = T21 - T24;
|
||||
Ci[WS(csi, 10)] = T2I - T2H;
|
||||
Cr[WS(csr, 5)] = T21 + T24;
|
||||
Ci[WS(csi, 5)] = -(T2H + T2I);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 32, "r2cfII_32", { 138, 46, 36, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_32) (planner *p) { X(kr2c_register) (p, r2cfII_32, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
100
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_4.c
Normal file
100
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_4.c
Normal file
@@ -0,0 +1,100 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cfII_4 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 2 additions, 0 multiplications, 4 fused multiply/add),
|
||||
* 8 stack variables, 1 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_4 -- variant built when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Each pass reads two reals from R0 and two from R1 (index 0 and
 * WS(rs, 1) of each) and writes two results to Cr and two to Ci (index 0
 * and WS(csr, 1) / WS(csi, 1)).  The pass is repeated v times; after
 * every pass the input pointers move by ivs and the output pointers by
 * ovs.
 */
static void r2cfII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
               /* every input is fetched before the first store */
               E even0 = R0[0];
               E even1 = R0[WS(rs, 1)];
               E odd0 = R1[0];
               E odd1 = R1[WS(rs, 1)];
               E oddDiff = odd0 - odd1;
               E oddSum = odd0 + odd1;

               Cr[WS(csr, 1)] = FNMS(KP707106781, oddDiff, even0);
               Ci[WS(csi, 1)] = FNMS(KP707106781, oddSum, even1);
               Cr[0] = FMA(KP707106781, oddDiff, even0);
               Ci[0] = -(FMA(KP707106781, oddSum, even1));
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 4, "r2cfII_4", { 2, 0, 4, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_4) (planner *p) { X(kr2c_register) (p, r2cfII_4, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cfII_4 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 2 FP multiplications,
|
||||
* (or, 6 additions, 2 multiplications, 0 fused multiply/add),
|
||||
* 8 stack variables, 1 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_4 -- variant built when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Each pass reads two reals from R0 and two from R1 (index 0 and
 * WS(rs, 1) of each) and writes two results to Cr and two to Ci (index 0
 * and WS(csr, 1) / WS(csi, 1)).  The pass is repeated v times; after
 * every pass the input pointers move by ivs and the output pointers by
 * ovs.
 */
static void r2cfII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
               /* every input is fetched before the first store */
               E even0 = R0[0];
               E even1 = R0[WS(rs, 1)];
               E odd0 = R1[0];
               E odd1 = R1[WS(rs, 1)];
               E scaledDiff = KP707106781 * (odd0 - odd1);
               E scaledSum = KP707106781 * (odd0 + odd1);

               Cr[WS(csr, 1)] = even0 - scaledDiff;
               Ci[WS(csi, 1)] = even1 - scaledSum;
               Cr[0] = even0 + scaledDiff;
               Ci[0] = -(scaledSum + even1);
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 4, "r2cfII_4", { 6, 2, 0, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_4) (planner *p) { X(kr2c_register) (p, r2cfII_4, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
126
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_5.c
Normal file
126
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_5.c
Normal file
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cfII_5 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 7 FP multiplications,
|
||||
* (or, 7 additions, 2 multiplications, 5 fused multiply/add),
|
||||
* 17 stack variables, 4 constants, and 10 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_5 -- variant built when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Each pass reads five reals (R0 at index 0, WS(rs, 1), WS(rs, 2); R1 at
 * index 0, WS(rs, 1)) and writes five results (Cr at index 0,
 * WS(csr, 1), WS(csr, 2); Ci at index 0, WS(csi, 1)).  The pass is
 * repeated v times; after every pass the input pointers move by ivs and
 * the output pointers by ovs.
 */
static void r2cfII_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
               /* every input is fetched before the first store */
               E even0 = R0[0];
               E even1 = R0[WS(rs, 1)];
               E odd1 = R1[WS(rs, 1)];
               E even2 = R0[WS(rs, 2)];
               E odd0 = R1[0];

               /* pairwise differences and sums of the four outer inputs */
               E diffA = even1 - odd1;
               E diffB = even2 - odd0;
               E diffTotal = diffA + diffB;
               E sumB = even2 + odd0;
               E sumA = even1 + odd1;

               /* shared by the last two real outputs */
               E centre = FNMS(KP250000000, diffTotal, even0);
               E spread = diffA - diffB;

               Cr[WS(csr, 2)] = even0 + diffTotal;
               Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP618033988, sumA, sumB)));
               Ci[0] = -(KP951056516 * (FMA(KP618033988, sumB, sumA)));
               Cr[0] = FMA(KP559016994, spread, centre);
               Cr[WS(csr, 1)] = FNMS(KP559016994, spread, centre);
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 5, "r2cfII_5", { 7, 2, 5, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cfII_5) (planner *p) { X(kr2c_register) (p, r2cfII_5, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cfII_5 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 6 FP multiplications,
|
||||
* (or, 9 additions, 3 multiplications, 3 fused multiply/add),
|
||||
* 17 stack variables, 4 constants, and 10 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_5 -- size-5 kernel, #else variant (generated without -fma, with
 * -dft-II; see the "Generated by" comment above).
 *
 * Per iteration it loads R0[0], R0[WS(rs, 1)], R0[WS(rs, 2)], R1[0] and
 * R1[WS(rs, 1)], then stores Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)], Ci[0]
 * and Ci[WS(csi, 1)].  The loop counts i down from v to 1 (no iterations
 * when v <= 0); after each pass R0/R1 advance by ivs and Cr/Ci by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cfII_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
|
||||
E T8, T3, T6, T9, Tc, Tb, T7, Ta;
|
||||
T8 = R0[0];
|
||||
/* Load the other four inputs and keep only their pairwise sums and differences. */
{
|
||||
E T1, T2, T4, T5;
|
||||
T1 = R0[WS(rs, 1)];
|
||||
T2 = R1[WS(rs, 1)];
|
||||
T3 = T1 - T2;
|
||||
T4 = R0[WS(rs, 2)];
|
||||
T5 = R1[0];
|
||||
T6 = T4 - T5;
|
||||
T9 = T3 + T6;
|
||||
Tc = T4 + T5;
|
||||
Tb = T1 + T2;
|
||||
}
|
||||
/* Ci outputs are built from the sums (Tb, Tc); Cr outputs from T8 and the differences (T3, T6). */
Cr[WS(csr, 2)] = T8 + T9;
|
||||
Ci[WS(csi, 1)] = FNMS(KP951056516, Tc, KP587785252 * Tb);
|
||||
Ci[0] = -(FMA(KP951056516, Tb, KP587785252 * Tc));
|
||||
T7 = KP559016994 * (T3 - T6);
|
||||
Ta = FNMS(KP250000000, T9, T8);
|
||||
Cr[0] = T7 + Ta;
|
||||
Cr[WS(csr, 1)] = Ta - T7;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the #else variant: 5, the name string, { 9, 3, 3, 0 } and a
 * pointer to GENUS.  The first three inner numbers equal the add / mul /
 * fused-multiply-add counts quoted in this variant's generator comment.
 * NOTE(review): the meaning of the trailing 0, and the definitions of
 * kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 5, "r2cfII_5", { 9, 3, 3, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_5 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_5) (planner *p) { X(kr2c_register) (p, r2cfII_5, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
117
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_6.c
Normal file
117
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_6.c
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cfII_6 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 13 FP additions, 6 FP multiplications,
|
||||
* (or, 7 additions, 0 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_6 -- size-6 kernel, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generated with -fma and -dft-II).
 *
 * Per iteration it loads R0[0], R0[WS(rs, 1)], R0[WS(rs, 2)], R1[0],
 * R1[WS(rs, 1)] and R1[WS(rs, 2)], then stores Cr and Ci at slots 0, 1 and 2
 * (addressed through csr and csi respectively).  The loop counts i down from
 * v to 1 (no iterations when v <= 0); after each pass R0/R1 advance by ivs
 * and Cr/Ci by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cfII_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
|
||||
E T1, T9, T2, T3, T4, Tc, T8, Ta, T6, T7, T5, Tb;
|
||||
/* Loads, interleaved with the sums/differences that consume them. */
T1 = R0[0];
|
||||
T9 = R1[WS(rs, 1)];
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T3 = R0[WS(rs, 1)];
|
||||
T4 = T3 - T2;
|
||||
Tc = T2 + T3;
|
||||
T6 = R1[WS(rs, 2)];
|
||||
T7 = R1[0];
|
||||
T8 = T6 - T7;
|
||||
Ta = T6 + T7;
|
||||
/* Slot 1 of both outputs needs no constant multiplications. */
Ci[WS(csi, 1)] = T9 - Ta;
|
||||
Cr[WS(csr, 1)] = T1 + T2 - T3;
|
||||
/* T5 is shared by Cr slots 0 and 2; Tb is shared by Ci slots 0 and 2. */
T5 = FMA(KP500000000, T4, T1);
|
||||
Cr[0] = FNMS(KP866025403, T8, T5);
|
||||
Cr[WS(csr, 2)] = FMA(KP866025403, T8, T5);
|
||||
Tb = FMA(KP500000000, Ta, T9);
|
||||
Ci[0] = -(FMA(KP866025403, Tc, Tb));
|
||||
Ci[WS(csi, 2)] = FMS(KP866025403, Tc, Tb);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the FMA variant: 6, the name string, { 7, 0, 6, 0 } and a
 * pointer to GENUS.  The first three inner numbers equal the add / mul /
 * fused-multiply-add counts quoted in this variant's generator comment.
 * NOTE(review): the meaning of the trailing 0, and the definitions of
 * kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 6, "r2cfII_6", { 7, 0, 6, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_6 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_6) (planner *p) { X(kr2c_register) (p, r2cfII_6, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cfII_6 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 13 FP additions, 4 FP multiplications,
|
||||
* (or, 11 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 14 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_6 -- size-6 kernel, #else variant (generated without -fma, with
 * -dft-II; see the "Generated by" comment above).
 *
 * Per iteration it loads R0[0], R0[WS(rs, 1)], R0[WS(rs, 2)], R1[0],
 * R1[WS(rs, 1)] and R1[WS(rs, 2)], then stores Cr and Ci at slots 0, 1 and 2
 * (addressed through csr and csi respectively).  The loop counts i down from
 * v to 1 (no iterations when v <= 0); after each pass R0/R1 advance by ivs
 * and Cr/Ci by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA and MAKE_VOLATILE_STRIDE are
 * defined outside this chunk; confirm their exact semantics in the included
 * headers.
 */
static void r2cfII_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
|
||||
E Ta, T7, T9, T1, T3, T2, T8, T4, T5, T6, Tb;
|
||||
/* R1 inputs first: T7 is their scaled difference, T9 their sum. */
Ta = R1[WS(rs, 1)];
|
||||
T5 = R1[WS(rs, 2)];
|
||||
T6 = R1[0];
|
||||
T7 = KP866025403 * (T5 - T6);
|
||||
T9 = T5 + T6;
|
||||
/* R0 inputs next: T8 is their scaled sum, T4 folds the halved difference into T1. */
T1 = R0[0];
|
||||
T3 = R0[WS(rs, 1)];
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T8 = KP866025403 * (T2 + T3);
|
||||
T4 = FMA(KP500000000, T3 - T2, T1);
|
||||
Cr[0] = T4 - T7;
|
||||
Tb = FMA(KP500000000, T9, Ta);
|
||||
Ci[0] = -(T8 + Tb);
|
||||
Ci[WS(csi, 2)] = T8 - Tb;
|
||||
Cr[WS(csr, 2)] = T4 + T7;
|
||||
/* Slot 1 of both outputs needs no constant multiplications. */
Ci[WS(csi, 1)] = Ta - T9;
|
||||
Cr[WS(csr, 1)] = T1 + T2 - T3;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the #else variant: 6, the name string, { 11, 2, 2, 0 } and
 * a pointer to GENUS.  The first three inner numbers equal the add / mul /
 * fused-multiply-add counts quoted in this variant's generator comment.
 * NOTE(review): the meaning of the trailing 0, and the definitions of
 * kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 6, "r2cfII_6", { 11, 2, 2, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_6 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_6) (planner *p) { X(kr2c_register) (p, r2cfII_6, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
1548
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_64.c
Normal file
1548
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_64.c
Normal file
File diff suppressed because it is too large
Load Diff
148
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_7.c
Normal file
148
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_7.c
Normal file
@@ -0,0 +1,148 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cfII_7 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 18 FP multiplications,
|
||||
* (or, 9 additions, 3 multiplications, 15 fused multiply/add),
|
||||
* 23 stack variables, 6 constants, and 14 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_7 -- size-7 kernel, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generated with -fma and -dft-II).
 *
 * Per iteration it loads R0 at slots 0..3 and R1 at slots 0..2 (slots
 * addressed through rs), then stores Cr at slots 0..3 (through csr) and Ci
 * at slots 0..2 (through csi).  The loop counts i down from v to 1 (no
 * iterations when v <= 0); after each pass R0/R1 advance by ivs and Cr/Ci
 * by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cfII_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
|
||||
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
|
||||
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
|
||||
E Td, Te, Tf, Tg, T3, T6, Tl, Tj, Th, T9;
|
||||
Td = R0[0];
|
||||
/* Load the remaining six inputs in pairs: Te/Tf/Tg are pair differences, T3/T6/T9 pair sums. */
{
|
||||
E T1, T2, T7, T8, T4, T5;
|
||||
T1 = R0[WS(rs, 1)];
|
||||
T2 = R1[WS(rs, 2)];
|
||||
Te = T1 - T2;
|
||||
T7 = R1[WS(rs, 1)];
|
||||
T8 = R0[WS(rs, 2)];
|
||||
Tf = T8 - T7;
|
||||
T4 = R1[0];
|
||||
T5 = R0[WS(rs, 3)];
|
||||
Tg = T5 - T4;
|
||||
T3 = T1 + T2;
|
||||
T6 = T4 + T5;
|
||||
Tl = FNMS(KP356895867, Te, Tg);
|
||||
Tj = FNMS(KP356895867, Tf, Te);
|
||||
Th = FNMS(KP356895867, Tg, Tf);
|
||||
T9 = T7 + T8;
|
||||
}
|
||||
/* Ci outputs use only the pair sums (T3, T6, T9); Cr outputs use Td and the pair differences. */
{
|
||||
E Ta, Tm, Tb, Ti, Tc, Tk;
|
||||
Ta = FMA(KP554958132, T9, T6);
|
||||
Ci[WS(csi, 2)] = KP974927912 * (FNMS(KP801937735, Ta, T3));
|
||||
Tm = FNMS(KP692021471, Tl, Tf);
|
||||
Cr[WS(csr, 2)] = FNMS(KP900968867, Tm, Td);
|
||||
Tb = FNMS(KP554958132, T3, T9);
|
||||
Ci[WS(csi, 1)] = -(KP974927912 * (FNMS(KP801937735, Tb, T6)));
|
||||
Ti = FNMS(KP692021471, Th, Te);
|
||||
Cr[WS(csr, 1)] = FNMS(KP900968867, Ti, Td);
|
||||
Cr[WS(csr, 3)] = Te + Tg + Tf + Td;
|
||||
Tc = FMA(KP554958132, T6, T3);
|
||||
Ci[0] = -(KP974927912 * (FMA(KP801937735, Tc, T9)));
|
||||
Tk = FNMS(KP692021471, Tj, Tg);
|
||||
Cr[0] = FNMS(KP900968867, Tk, Td);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the FMA variant: 7, the name string, { 9, 3, 15, 0 } and a
 * pointer to GENUS.  The first three inner numbers equal the add / mul /
 * fused-multiply-add counts quoted in this variant's generator comment.
 * NOTE(review): the meaning of the trailing 0, and the definitions of
 * kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 7, "r2cfII_7", { 9, 3, 15, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_7 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_7) (planner *p) { X(kr2c_register) (p, r2cfII_7, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cfII_7 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 18 FP multiplications,
|
||||
* (or, 12 additions, 6 multiplications, 12 fused multiply/add),
|
||||
* 20 stack variables, 6 constants, and 14 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_7 -- size-7 kernel, #else variant (generated without -fma, with
 * -dft-II; see the "Generated by" comment above).
 *
 * Per iteration it loads R0 at slots 0..3 and R1 at slots 0..2 (slots
 * addressed through rs), then stores Cr at slots 0..3 (through csr) and Ci
 * at slots 0..2 (through csi).  The loop counts i down from v to 1 (no
 * iterations when v <= 0); after each pass R0/R1 advance by ivs and Cr/Ci
 * by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS, FNMA and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cfII_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
|
||||
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
|
||||
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
|
||||
E T1, Ta, Td, T4, Tb, T7, Tc, T8, T9;
|
||||
T1 = R0[0];
|
||||
/* Inputs are consumed in pairs: T4/T7/Ta are pair differences, Tb/Tc/Td pair sums. */
T8 = R1[0];
|
||||
T9 = R0[WS(rs, 3)];
|
||||
Ta = T8 - T9;
|
||||
Td = T8 + T9;
|
||||
{
|
||||
E T2, T3, T5, T6;
|
||||
T2 = R0[WS(rs, 1)];
|
||||
T3 = R1[WS(rs, 2)];
|
||||
T4 = T2 - T3;
|
||||
Tb = T2 + T3;
|
||||
T5 = R1[WS(rs, 1)];
|
||||
T6 = R0[WS(rs, 2)];
|
||||
T7 = T5 - T6;
|
||||
Tc = T5 + T6;
|
||||
}
|
||||
/* Ci outputs use only the pair sums; Cr outputs use T1 and the pair differences. */
Ci[0] = -(FMA(KP781831482, Tb, KP974927912 * Tc) + (KP433883739 * Td));
|
||||
Ci[WS(csi, 1)] = FNMS(KP974927912, Td, KP781831482 * Tc) - (KP433883739 * Tb);
|
||||
Cr[0] = FMA(KP623489801, T4, T1) + FMA(KP222520933, T7, KP900968867 * Ta);
|
||||
Ci[WS(csi, 2)] = FNMS(KP781831482, Td, KP974927912 * Tb) - (KP433883739 * Tc);
|
||||
Cr[WS(csr, 2)] = FMA(KP900968867, T7, T1) + FNMA(KP623489801, Ta, KP222520933 * T4);
|
||||
Cr[WS(csr, 1)] = FMA(KP222520933, Ta, T1) + FNMA(KP623489801, T7, KP900968867 * T4);
|
||||
Cr[WS(csr, 3)] = T1 + T4 - (T7 + Ta);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the #else variant: 7, the name string, { 12, 6, 12, 0 } and
 * a pointer to GENUS.  The first three inner numbers equal the add / mul /
 * fused-multiply-add counts quoted in this variant's generator comment.
 * NOTE(review): the meaning of the trailing 0, and the definitions of
 * kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 7, "r2cfII_7", { 12, 6, 12, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_7 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_7) (planner *p) { X(kr2c_register) (p, r2cfII_7, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
162
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_8.c
Normal file
162
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_8.c
Normal file
@@ -0,0 +1,162 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cfII_8 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 16 FP multiplications,
|
||||
* (or, 6 additions, 0 multiplications, 16 fused multiply/add),
|
||||
* 18 stack variables, 3 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_8 -- size-8 kernel, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generated with -fma and -dft-II).
 *
 * Per iteration it loads R0 and R1 at slots 0..3 (addressed through rs),
 * then stores Cr and Ci at slots 0..3 (through csr and csi respectively).
 * The loop counts i down from v to 1 (no iterations when v <= 0); after each
 * pass R0/R1 advance by ivs and Cr/Ci by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cfII_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
|
||||
E T1, Th, T4, Ti, T8, Te, Tb, Tf, T2, T3;
|
||||
/* R0 inputs: T1 and Th are used directly, T4/Ti are the difference/sum of the other two. */
T1 = R0[0];
|
||||
Th = R0[WS(rs, 2)];
|
||||
T2 = R0[WS(rs, 1)];
|
||||
T3 = R0[WS(rs, 3)];
|
||||
T4 = T2 - T3;
|
||||
Ti = T2 + T3;
|
||||
/* R1 inputs are combined pairwise with KP414213562 into T8, Te, Tb and Tf. */
{
|
||||
E T6, T7, T9, Ta;
|
||||
T6 = R1[0];
|
||||
T7 = R1[WS(rs, 2)];
|
||||
T8 = FNMS(KP414213562, T7, T6);
|
||||
Te = FMA(KP414213562, T6, T7);
|
||||
T9 = R1[WS(rs, 3)];
|
||||
Ta = R1[WS(rs, 1)];
|
||||
Tb = FMS(KP414213562, Ta, T9);
|
||||
Tf = FMA(KP414213562, T9, Ta);
|
||||
}
|
||||
/* Outputs at slots 0 and 3. */
{
|
||||
E T5, Tc, Tj, Tk;
|
||||
T5 = FMA(KP707106781, T4, T1);
|
||||
Tc = T8 + Tb;
|
||||
Cr[WS(csr, 3)] = FNMS(KP923879532, Tc, T5);
|
||||
Cr[0] = FMA(KP923879532, Tc, T5);
|
||||
Tj = FMA(KP707106781, Ti, Th);
|
||||
Tk = Te + Tf;
|
||||
Ci[0] = -(FMA(KP923879532, Tk, Tj));
|
||||
Ci[WS(csi, 3)] = FNMS(KP923879532, Tk, Tj);
|
||||
}
|
||||
/* Outputs at slots 1 and 2. */
{
|
||||
E Td, Tg, Tl, Tm;
|
||||
Td = FNMS(KP707106781, T4, T1);
|
||||
Tg = Te - Tf;
|
||||
Cr[WS(csr, 2)] = FNMS(KP923879532, Tg, Td);
|
||||
Cr[WS(csr, 1)] = FMA(KP923879532, Tg, Td);
|
||||
Tl = FNMS(KP707106781, Ti, Th);
|
||||
Tm = Tb - T8;
|
||||
Ci[WS(csi, 2)] = FMS(KP923879532, Tm, Tl);
|
||||
Ci[WS(csi, 1)] = FMA(KP923879532, Tm, Tl);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the FMA variant: 8, the name string, { 6, 0, 16, 0 } and a
 * pointer to GENUS.  The first three inner numbers equal the add / mul /
 * fused-multiply-add counts quoted in this variant's generator comment.
 * NOTE(review): the meaning of the trailing 0, and the definitions of
 * kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 8, "r2cfII_8", { 6, 0, 16, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_8 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_8) (planner *p) { X(kr2c_register) (p, r2cfII_8, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cfII_8 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 10 FP multiplications,
|
||||
* (or, 18 additions, 6 multiplications, 4 fused multiply/add),
|
||||
* 18 stack variables, 3 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_8 -- size-8 kernel, #else variant (generated without -fma, with
 * -dft-II; see the "Generated by" comment above).
 *
 * Per iteration it loads R0 and R1 at slots 0..3 (addressed through rs),
 * then stores Cr and Ci at slots 0..3 (through csr and csi respectively).
 * The loop counts i down from v to 1 (no iterations when v <= 0); after each
 * pass R0/R1 advance by ivs and Cr/Ci by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cfII_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
|
||||
E T1, Tj, T4, Ti, T8, Te, Tb, Tf, T2, T3;
|
||||
/* R0 inputs: T1 and Tj are used directly, T4/Ti are the scaled difference/sum of the other two. */
T1 = R0[0];
|
||||
Tj = R0[WS(rs, 2)];
|
||||
T2 = R0[WS(rs, 1)];
|
||||
T3 = R0[WS(rs, 3)];
|
||||
T4 = KP707106781 * (T2 - T3);
|
||||
Ti = KP707106781 * (T2 + T3);
|
||||
/* R1 inputs are combined pairwise with KP923879532 / KP382683432 into T8, Te, Tb and Tf. */
{
|
||||
E T6, T7, T9, Ta;
|
||||
T6 = R1[0];
|
||||
T7 = R1[WS(rs, 2)];
|
||||
T8 = FNMS(KP382683432, T7, KP923879532 * T6);
|
||||
Te = FMA(KP382683432, T6, KP923879532 * T7);
|
||||
T9 = R1[WS(rs, 1)];
|
||||
Ta = R1[WS(rs, 3)];
|
||||
Tb = FNMS(KP923879532, Ta, KP382683432 * T9);
|
||||
Tf = FMA(KP923879532, T9, KP382683432 * Ta);
|
||||
}
|
||||
/* Outputs at slots 0 and 3. */
{
|
||||
E T5, Tc, Th, Tk;
|
||||
T5 = T1 + T4;
|
||||
Tc = T8 + Tb;
|
||||
Cr[WS(csr, 3)] = T5 - Tc;
|
||||
Cr[0] = T5 + Tc;
|
||||
Th = Te + Tf;
|
||||
Tk = Ti + Tj;
|
||||
Ci[0] = -(Th + Tk);
|
||||
Ci[WS(csi, 3)] = Tk - Th;
|
||||
}
|
||||
/* Outputs at slots 1 and 2. */
{
|
||||
E Td, Tg, Tl, Tm;
|
||||
Td = T1 - T4;
|
||||
Tg = Te - Tf;
|
||||
Cr[WS(csr, 2)] = Td - Tg;
|
||||
Cr[WS(csr, 1)] = Td + Tg;
|
||||
Tl = Tb - T8;
|
||||
Tm = Tj - Ti;
|
||||
Ci[WS(csi, 2)] = Tl - Tm;
|
||||
Ci[WS(csi, 1)] = Tl + Tm;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the #else variant: 8, the name string, { 18, 6, 4, 0 } and
 * a pointer to GENUS.  The first three inner numbers equal the add / mul /
 * fused-multiply-add counts quoted in this variant's generator comment.
 * NOTE(review): the meaning of the trailing 0, and the definitions of
 * kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 8, "r2cfII_8", { 18, 6, 4, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_8 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_8) (planner *p) { X(kr2c_register) (p, r2cfII_8, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
223
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_9.c
Normal file
223
fftw-3.3.10/rdft/scalar/r2cf/r2cfII_9.c
Normal file
@@ -0,0 +1,223 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:24 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cfII_9 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 42 FP additions, 34 FP multiplications,
|
||||
* (or, 12 additions, 4 multiplications, 30 fused multiply/add),
|
||||
* 48 stack variables, 17 constants, and 18 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_9 -- size-9 kernel, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generated with -fma and -dft-II).
 *
 * Per iteration it loads R0 at slots 0..4 and R1 at slots 0..3 (addressed
 * through rs), then stores Cr at slots 0..4 (through csr) and Ci at slots
 * 0..3 (through csi).  The loop counts i down from v to 1 (no iterations
 * when v <= 0); after each pass R0/R1 advance by ivs and Cr/Ci by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cfII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
|
||||
DK(KP879385241, +0.879385241571816768108218554649462939872416269);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP898197570, +0.898197570222573798468955502359086394667167570);
|
||||
DK(KP673648177, +0.673648177666930348851716626769314796000375677);
|
||||
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
|
||||
DK(KP907603734, +0.907603734547952313649323976213898122064543220);
|
||||
DK(KP666666666, +0.666666666666666666666666666666666666666666667);
|
||||
DK(KP826351822, +0.826351822333069651148283373230685203999624323);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP315207469, +0.315207469095904627298647952427796244129086440);
|
||||
DK(KP420276625, +0.420276625461206169731530603237061658838781920);
|
||||
DK(KP203604859, +0.203604859554852403062088995281827210665664861);
|
||||
DK(KP152703644, +0.152703644666139302296566746461370407999248646);
|
||||
DK(KP726681596, +0.726681596905677465811651808188092531873167623);
|
||||
DK(KP968908795, +0.968908795874236621082202410917456709164223497);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
|
||||
E T1, T4, To, Ta, Tm, TB, Tq, Tt, Tf, Tj, TA, Tr, Ts, T2, T3;
|
||||
E T5, Tg;
|
||||
/* First input group: R0[0] plus the difference/sum of R0 slot 3 and R1 slot 1. */
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 3)];
|
||||
T3 = R1[WS(rs, 1)];
|
||||
T4 = T2 - T3;
|
||||
To = T2 + T3;
|
||||
/* Second input group (R0 slots 1 and 4, R1 slot 2): Ta plus four scaled combinations of Tl and Tk. */
{
|
||||
E T6, T9, Tk, T7, T8, Tl;
|
||||
T6 = R0[WS(rs, 1)];
|
||||
T7 = R0[WS(rs, 4)];
|
||||
T8 = R1[WS(rs, 2)];
|
||||
T9 = T7 - T8;
|
||||
Tk = T7 + T8;
|
||||
Ta = T6 + T9;
|
||||
Tl = FNMS(KP500000000, T9, T6);
|
||||
Tm = FMA(KP968908795, Tl, Tk);
|
||||
TB = FNMS(KP726681596, Tk, Tl);
|
||||
Tq = FNMS(KP152703644, Tk, Tl);
|
||||
Tt = FMA(KP203604859, Tl, Tk);
|
||||
}
|
||||
/* Third input group (R0 slot 2, R1 slots 0 and 3): Tf plus four scaled combinations of Th and Ti. */
{
|
||||
E Tb, Te, Ti, Tc, Td, Th;
|
||||
Tb = R0[WS(rs, 2)];
|
||||
Tc = R1[0];
|
||||
Td = R1[WS(rs, 3)];
|
||||
Te = Tc + Td;
|
||||
Ti = Tc - Td;
|
||||
Tf = Tb - Te;
|
||||
Th = FMA(KP500000000, Te, Tb);
|
||||
Tj = FNMS(KP152703644, Ti, Th);
|
||||
TA = FMA(KP203604859, Th, Ti);
|
||||
Tr = FNMS(KP420276625, Th, Ti);
|
||||
Ts = FMA(KP315207469, Ti, Th);
|
||||
}
|
||||
/* Ci slot 1 and Cr slots 1 and 4 depend only on T1, T4, Ta and Tf. */
Ci[WS(csi, 1)] = KP866025403 * (Tf - Ta);
|
||||
T5 = T1 + T4;
|
||||
Tg = Ta + Tf;
|
||||
Cr[WS(csr, 1)] = FNMS(KP500000000, Tg, T5);
|
||||
Cr[WS(csr, 4)] = T5 + Tg;
|
||||
/* Remaining outputs: Ci slots 0, 2, 3 and Cr slots 0, 2, 3. */
{
|
||||
E Ty, Tx, Tz, Tn, TD, TC;
|
||||
Tx = FNMS(KP826351822, Tr, Tq);
|
||||
Ty = FNMS(KP666666666, Tx, Tt);
|
||||
Tz = FMA(KP907603734, Ty, Ts);
|
||||
Ci[WS(csi, 2)] = KP866025403 * (FNMS(KP939692620, Tz, To));
|
||||
Tn = FMA(KP673648177, Tm, Tj);
|
||||
TC = FNMS(KP898197570, TB, TA);
|
||||
TD = FNMS(KP666666666, Tn, TC);
|
||||
Ci[0] = -(KP984807753 * (FMA(KP879385241, To, Tn)));
|
||||
Ci[WS(csi, 3)] = -(KP866025403 * (FMA(KP852868531, TD, To)));
|
||||
{
|
||||
E Tp, Tv, TF, TG, Tu, TE, Tw;
|
||||
Tp = FNMS(KP500000000, T4, T1);
|
||||
Tu = FNMS(KP907603734, Tt, Ts);
|
||||
Tv = FNMS(KP666666666, Tu, Tr);
|
||||
TE = FNMS(KP673648177, Tm, Tj);
|
||||
TF = FMA(KP898197570, TB, TA);
|
||||
TG = FMA(KP500000000, TF, TE);
|
||||
Cr[WS(csr, 3)] = FNMS(KP852868531, TG, Tp);
|
||||
Cr[0] = FMA(KP852868531, TF, Tp);
|
||||
Tw = FMA(KP826351822, Tv, Tq);
|
||||
Cr[WS(csr, 2)] = FNMS(KP852868531, Tw, Tp);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the FMA variant: 9, the name string, { 12, 4, 30, 0 } and a
 * pointer to GENUS.  The first three inner numbers equal the add / mul /
 * fused-multiply-add counts quoted in this variant's generator comment.
 * NOTE(review): the meaning of the trailing 0, and the definitions of
 * kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 9, "r2cfII_9", { 12, 4, 30, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_9 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_9) (planner *p) { X(kr2c_register) (p, r2cfII_9, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cfII_9 -dft-II -include rdft/scalar/r2cfII.h */
|
||||
|
||||
/*
|
||||
* This function contains 42 FP additions, 30 FP multiplications,
|
||||
* (or, 25 additions, 13 multiplications, 17 fused multiply/add),
|
||||
* 39 stack variables, 14 constants, and 18 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cfII.h"
|
||||
|
||||
/*
 * r2cfII_9 -- size-9 kernel, #else variant (generated without -fma, with
 * -dft-II; see the "Generated by" comment above).
 *
 * Per iteration it loads R0 at slots 0..4 and R1 at slots 0..3 (addressed
 * through rs), then stores Cr at slots 0..4 (through csr) and Ci at slots
 * 0..3 (through csi).  The loop counts i down from v to 1 (no iterations
 * when v <= 0); after each pass R0/R1 advance by ivs and Cr/Ci by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cfII_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP663413948, +0.663413948168938396205421319635891297216863310);
|
||||
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
|
||||
DK(KP556670399, +0.556670399226419366452912952047023132968291906);
|
||||
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
|
||||
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
|
||||
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP150383733, +0.150383733180435296639271897612501926072238258);
|
||||
DK(KP813797681, +0.813797681349373692844693217248393223289101568);
|
||||
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
|
||||
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
|
||||
DK(KP296198132, +0.296198132726023843175338011893050938967728390);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
|
||||
E T1, T4, To, Ta, Tl, Tk, Tf, Ti, Th, T2, T3, T5, Tg;
|
||||
/* First input group: R0[0] plus the difference/sum of R1 slot 1 and R0 slot 3. */
T1 = R0[0];
|
||||
T2 = R1[WS(rs, 1)];
|
||||
T3 = R0[WS(rs, 3)];
|
||||
T4 = T2 - T3;
|
||||
To = T2 + T3;
|
||||
/* Second input group (R0 slots 1 and 4, R1 slot 2) reduced to Ta, Tl and Tk. */
{
|
||||
E T6, T7, T8, T9;
|
||||
T6 = R0[WS(rs, 1)];
|
||||
T7 = R1[WS(rs, 2)];
|
||||
T8 = R0[WS(rs, 4)];
|
||||
T9 = T7 - T8;
|
||||
Ta = T6 - T9;
|
||||
Tl = T7 + T8;
|
||||
Tk = FMA(KP500000000, T9, T6);
|
||||
}
|
||||
/* Third input group (R0 slot 2, R1 slots 0 and 3) reduced to Tf, Ti and Th. */
{
|
||||
E Tb, Tc, Td, Te;
|
||||
Tb = R0[WS(rs, 2)];
|
||||
Tc = R1[0];
|
||||
Td = R1[WS(rs, 3)];
|
||||
Te = Tc + Td;
|
||||
Tf = Tb - Te;
|
||||
Ti = FMA(KP500000000, Te, Tb);
|
||||
Th = Tc - Td;
|
||||
}
|
||||
/* Ci slot 1 and Cr slots 1 and 4 depend only on T1, T4, Ta and Tf. */
Ci[WS(csi, 1)] = KP866025403 * (Tf - Ta);
|
||||
T5 = T1 - T4;
|
||||
Tg = Ta + Tf;
|
||||
Cr[WS(csr, 1)] = FNMS(KP500000000, Tg, T5);
|
||||
Cr[WS(csr, 4)] = T5 + Tg;
|
||||
/* Remaining outputs: Ci slots 0, 2, 3 and Cr slots 0, 2, 3. */
{
|
||||
E Tr, Tt, Tw, Tv, Tu, Tp, Tq, Ts, Tj, Tm, Tn;
|
||||
Tr = FMA(KP500000000, T4, T1);
|
||||
Tt = FMA(KP296198132, Th, KP939692620 * Ti);
|
||||
Tw = FNMS(KP813797681, Th, KP342020143 * Ti);
|
||||
Tv = FNMS(KP984807753, Tk, KP150383733 * Tl);
|
||||
Tu = FMA(KP173648177, Tk, KP852868531 * Tl);
|
||||
Tp = FNMS(KP556670399, Tl, KP766044443 * Tk);
|
||||
Tq = FMA(KP852868531, Th, KP173648177 * Ti);
|
||||
Ts = Tp + Tq;
|
||||
Tj = FNMS(KP984807753, Ti, KP150383733 * Th);
|
||||
Tm = FMA(KP642787609, Tk, KP663413948 * Tl);
|
||||
Tn = Tj - Tm;
|
||||
Ci[0] = FNMS(KP866025403, To, Tn);
|
||||
Cr[0] = Tr + Ts;
|
||||
Ci[WS(csi, 3)] = FNMS(KP500000000, Tn, KP866025403 * ((Tp - Tq) - To));
|
||||
Cr[WS(csr, 3)] = FMA(KP866025403, Tm + Tj, Tr) - (KP500000000 * Ts);
|
||||
Ci[WS(csi, 2)] = FMA(KP866025403, To - (Tu + Tt), KP500000000 * (Tw - Tv));
|
||||
Cr[WS(csr, 2)] = FMA(KP500000000, Tt - Tu, Tr) + (KP866025403 * (Tv + Tw));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor for the #else variant: 9, the name string, { 25, 13, 17, 0 }
 * and a pointer to GENUS.  The first three inner numbers equal the add /
 * mul / fused-multiply-add counts quoted in this variant's generator
 * comment.  NOTE(review): the meaning of the trailing 0, and the
 * definitions of kr2c_desc and GENUS, are outside this chunk -- confirm.
 */
static const kr2c_desc desc = { 9, "r2cfII_9", { 25, 13, 17, 0 }, &GENUS };
|
||||
|
||||
/* Registration entry point: passes r2cfII_9 and its descriptor to kr2c_register for planner p. */
void X(codelet_r2cfII_9) (planner *p) { X(kr2c_register) (p, r2cfII_9, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
200
fftw-3.3.10/rdft/scalar/r2cf/r2cf_10.c
Normal file
200
fftw-3.3.10/rdft/scalar/r2cf/r2cf_10.c
Normal file
@@ -0,0 +1,200 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cf_10 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 34 FP additions, 14 FP multiplications,
|
||||
* (or, 24 additions, 4 multiplications, 10 fused multiply/add),
|
||||
* 26 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_10 -- size-10 kernel, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined (generated with -fma; unlike the
 * r2cfII files, its generator line carries no -dft-II option).
 *
 * Per iteration it loads R0 and R1 at slots 0..4 (addressed through rs),
 * then stores Cr at slots 0..5 (through csr) and Ci at slots 1..4 (through
 * csi); Ci[0] is not written by this routine.  The loop counts i down from
 * v to 1 (no iterations when v <= 0); after each pass R0/R1 advance by ivs
 * and Cr/Ci by ovs.
 *
 * NOTE(review): R, E, INT, stride, DK, WS, FMA, FNMS and
 * MAKE_VOLATILE_STRIDE are defined outside this chunk; confirm their exact
 * semantics in the included headers.
 */
static void r2cf_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used below; each name echoes the leading digits of its value. */
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(40, rs), MAKE_VOLATILE_STRIDE(40, csr), MAKE_VOLATILE_STRIDE(40, csi)) {
|
||||
E T3, Tt, Td, Tn, Tg, To, Th, Tv, T6, Tq, T9, Tr, Ta, Tu, T1;
|
||||
E T2;
|
||||
/* Each R0 input is paired with one R1 input; both the difference and the sum of every pair are kept. */
T1 = R0[0];
|
||||
T2 = R1[WS(rs, 2)];
|
||||
T3 = T1 - T2;
|
||||
Tt = T1 + T2;
|
||||
{
|
||||
E Tb, Tc, Te, Tf;
|
||||
Tb = R0[WS(rs, 2)];
|
||||
Tc = R1[WS(rs, 4)];
|
||||
Td = Tb - Tc;
|
||||
Tn = Tb + Tc;
|
||||
Te = R0[WS(rs, 3)];
|
||||
Tf = R1[0];
|
||||
Tg = Te - Tf;
|
||||
To = Te + Tf;
|
||||
}
|
||||
Th = Td + Tg;
|
||||
Tv = Tn + To;
|
||||
{
|
||||
E T4, T5, T7, T8;
|
||||
T4 = R0[WS(rs, 1)];
|
||||
T5 = R1[WS(rs, 3)];
|
||||
T6 = T4 - T5;
|
||||
Tq = T4 + T5;
|
||||
T7 = R0[WS(rs, 4)];
|
||||
T8 = R1[WS(rs, 1)];
|
||||
T9 = T7 - T8;
|
||||
Tr = T7 + T8;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
Tu = Tq + Tr;
|
||||
/* Odd-numbered output slots (Ci 1, 3 and Cr 1, 3, 5) come from the pair differences. */
{
|
||||
E Tl, Tm, Tk, Ti, Tj;
|
||||
Tl = T6 - T9;
|
||||
Tm = Tg - Td;
|
||||
Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP618033988, Tm, Tl)));
|
||||
Ci[WS(csi, 3)] = KP951056516 * (FMA(KP618033988, Tl, Tm));
|
||||
Tk = Ta - Th;
|
||||
Ti = Ta + Th;
|
||||
Tj = FNMS(KP250000000, Ti, T3);
|
||||
Cr[WS(csr, 1)] = FMA(KP559016994, Tk, Tj);
|
||||
Cr[WS(csr, 5)] = T3 + Ti;
|
||||
Cr[WS(csr, 3)] = FNMS(KP559016994, Tk, Tj);
|
||||
}
|
||||
/* Even-numbered output slots (Ci 2, 4 and Cr 0, 2, 4) come from the pair sums. */
{
|
||||
E Tp, Ts, Ty, Tw, Tx;
|
||||
Tp = Tn - To;
|
||||
Ts = Tq - Tr;
|
||||
Ci[WS(csi, 2)] = KP951056516 * (FNMS(KP618033988, Ts, Tp));
|
||||
Ci[WS(csi, 4)] = KP951056516 * (FMA(KP618033988, Tp, Ts));
|
||||
Ty = Tu - Tv;
|
||||
Tw = Tu + Tv;
|
||||
Tx = FNMS(KP250000000, Tw, Tt);
|
||||
Cr[WS(csr, 2)] = FNMS(KP559016994, Ty, Tx);
|
||||
Cr[0] = Tt + Tw;
|
||||
Cr[WS(csr, 4)] = FMA(KP559016994, Ty, Tx);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 10, "r2cf_10", { 24, 4, 10, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_10) (planner *p) { X(kr2c_register) (p, r2cf_10, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 10 -name r2cf_10 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 34 FP additions, 12 FP multiplications,
|
||||
* (or, 28 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 26 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_10: hard-coded size-10 real-to-complex forward transform kernel
 * (the variant genfft emits without -fma).
 *
 * Inputs:
 *   R0, R1    input arrays; each pass reads R0[WS(rs, k)] and
 *             R1[WS(rs, k)] for k = 0..4.
 *             NOTE(review): presumably the even- and odd-indexed real
 *             samples -- confirm against rdft/codelet-rdft.h.
 *   rs        stride applied to both input arrays through WS().
 * Outputs:
 *   Cr, Ci    each pass stores Cr[WS(csr, k)] for k = 0..5 and
 *             Ci[WS(csi, k)] for k = 1..4; Ci[0] and Ci[WS(csi, 5)] are
 *             never written here.
 *   csr, csi  strides applied to Cr and Ci through WS().
 * Batching:
 *   v         number of passes; after each pass R0/R1 advance by ivs and
 *             Cr/Ci advance by ovs.
 *
 * All ten inputs are loaded before the first store to Cr/Ci, and the order
 * of the floating-point operations is exactly the generator's; preserve
 * both when touching this code.
 */
static void r2cf_10(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000); /* 1/4 */
     DK(KP559016994, +0.559016994374947424102293417182819058860154590); /* sqrt(5)/4 */
     DK(KP951056516, +0.951056516295153572116439333379382143405698634); /* sin(2*pi/5) */
     DK(KP587785252, +0.587785252292473129168705954639072768597652438); /* sin(pi/5) */
     {
          INT i;
          /* MAKE_VOLATILE_STRIDE comes from the included headers and is
             evaluated once per pass on each stride. */
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(40, rs),
               MAKE_VOLATILE_STRIDE(40, csr),
               MAKE_VOLATILE_STRIDE(40, csi)) {
               E Ti, Tt, Ta, Tn, Td, To, Te, Tv, T3, Tq, T6, Tr, T7, Tu, Tg, Th;

               /* Pair every R0 sample with an R1 sample and keep both the
                  difference and the sum of each pair. */
               Tg = R0[0];
               Th = R1[WS(rs, 2)];
               Ti = Tg - Th;
               Tt = Tg + Th;
               {
                    E T8, T9, Tb, Tc;
                    T8 = R0[WS(rs, 2)];
                    T9 = R1[WS(rs, 4)];
                    Ta = T8 - T9;
                    Tn = T8 + T9;
                    Tb = R0[WS(rs, 3)];
                    Tc = R1[0];
                    Td = Tb - Tc;
                    To = Tb + Tc;
               }
               Te = Ta + Td;
               Tv = Tn + To;
               {
                    E T1, T2, T4, T5;
                    T1 = R0[WS(rs, 1)];
                    T2 = R1[WS(rs, 3)];
                    T3 = T1 - T2;
                    Tq = T1 + T2;
                    T4 = R0[WS(rs, 4)];
                    T5 = R1[WS(rs, 1)];
                    T6 = T4 - T5;
                    Tr = T4 + T5;
               }
               T7 = T3 + T6;
               Tu = Tq + Tr;

               /* Odd-numbered outputs, built from the pair differences. */
               {
                    E Tl, Tm, Tf, Tj, Tk;
                    Tl = Td - Ta;
                    Tm = T3 - T6;
                    Ci[WS(csi, 1)] = FNMS(KP951056516, Tm, KP587785252 * Tl);
                    Ci[WS(csi, 3)] = FMA(KP587785252, Tm, KP951056516 * Tl);
                    Tf = KP559016994 * (T7 - Te);
                    Tj = T7 + Te;
                    Tk = FNMS(KP250000000, Tj, Ti);
                    Cr[WS(csr, 1)] = Tf + Tk;
                    Cr[WS(csr, 5)] = Ti + Tj;
                    Cr[WS(csr, 3)] = Tk - Tf;
               }

               /* Even-numbered outputs, built from the pair sums. */
               {
                    E Tp, Ts, Ty, Tw, Tx;
                    Tp = Tn - To;
                    Ts = Tq - Tr;
                    Ci[WS(csi, 2)] = FNMS(KP587785252, Ts, KP951056516 * Tp);
                    Ci[WS(csi, 4)] = FMA(KP951056516, Ts, KP587785252 * Tp);
                    Ty = KP559016994 * (Tu - Tv);
                    Tw = Tu + Tv;
                    Tx = FNMS(KP250000000, Tw, Tt);
                    Cr[WS(csr, 2)] = Tx - Ty;
                    Cr[0] = Tt + Tw;
                    Cr[WS(csr, 4)] = Ty + Tx;
               }
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 10, "r2cf_10", { 28, 6, 6, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_10) (planner *p) { X(kr2c_register) (p, r2cf_10, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
228
fftw-3.3.10/rdft/scalar/r2cf/r2cf_11.c
Normal file
228
fftw-3.3.10/rdft/scalar/r2cf/r2cf_11.c
Normal file
@@ -0,0 +1,228 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 11 -name r2cf_11 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 60 FP additions, 50 FP multiplications,
|
||||
* (or, 15 additions, 5 multiplications, 45 fused multiply/add),
|
||||
* 42 stack variables, 10 constants, and 22 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_11: hard-coded size-11 real-to-complex forward transform kernel
 * (the variant genfft emits with -fma; almost every step is a chained
 * FMA/FNMS call).
 *
 * Inputs:
 *   R0, R1    input arrays; each pass reads R0[WS(rs, k)] for k = 0..5
 *             and R1[WS(rs, k)] for k = 0..4.
 *             NOTE(review): presumably the even- and odd-indexed real
 *             samples -- confirm against rdft/codelet-rdft.h.
 *   rs        stride applied to both input arrays through WS().
 * Outputs:
 *   Cr, Ci    each pass stores Cr[WS(csr, k)] for k = 0..5 and
 *             Ci[WS(csi, k)] for k = 1..5; Ci[0] is never written here.
 *   csr, csi  strides applied to Cr and Ci through WS().
 * Batching:
 *   v         number of passes; after each pass R0/R1 advance by ivs and
 *             Cr/Ci advance by ovs.
 *
 * FMA and FNMS are macros from the included headers.
 * NOTE(review): presumably FMA(a, b, c) = a*b + c and FNMS(a, b, c) =
 * c - a*b -- confirm there.  All inputs are loaded before the first store,
 * and the evaluation order is the generator's; keep both intact.
 */
static void r2cf_11(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP918985947, +0.918985947228994779780736114132655398124909697);
     DK(KP989821441, +0.989821441880932732376092037776718787376519372);
     DK(KP830830026, +0.830830026003772851058548298459246407048009821);
     DK(KP715370323, +0.715370323453429719112414662767260662417897278);
     DK(KP959492973, +0.959492973614497389890368057066327699062454848);
     DK(KP876768831, +0.876768831002589333891339807079336796764054852);
     DK(KP778434453, +0.778434453334651800608337670740821884709317477);
     DK(KP634356270, +0.634356270682424498893150776899916060542806975);
     DK(KP342584725, +0.342584725681637509502641509861112333758894680);
     DK(KP521108558, +0.521108558113202722944698153526659300680427422);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(44, rs),
               MAKE_VOLATILE_STRIDE(44, csr),
               MAKE_VOLATILE_STRIDE(44, csi)) {
               E T1, T4, TC, Tg, TE, T7, TD, Ta, TF, Td, TB, TG, TM, TS, TJ;
               E TP, Ty, Tq, Ti, Tu, Tm, T5, T6;

               /* R0[0] stands alone; the other ten samples are combined
                  into five sums (T4, Tg, T7, Ta, Td) and five differences
                  (TC, TE, TD, TF, TB). */
               T1 = R0[0];
               {
                    E T2, T3, Te, Tf;
                    T2 = R1[0];
                    T3 = R0[WS(rs, 5)];
                    T4 = T2 + T3;
                    TC = T3 - T2;
                    Te = R1[WS(rs, 2)];
                    Tf = R0[WS(rs, 3)];
                    Tg = Te + Tf;
                    TE = Tf - Te;
               }
               T5 = R0[WS(rs, 1)];
               T6 = R1[WS(rs, 4)];
               T7 = T5 + T6;
               TD = T5 - T6;
               {
                    E T8, T9, Tb, Tc;
                    T8 = R1[WS(rs, 1)];
                    T9 = R0[WS(rs, 4)];
                    Ta = T8 + T9;
                    TF = T9 - T8;
                    Tb = R0[WS(rs, 2)];
                    Tc = R1[WS(rs, 3)];
                    Td = Tb + Tc;
                    TB = Tb - Tc;
               }

               /* First link of each Ci chain (differences only). */
               TG = FMA(KP521108558, TF, TE);
               TM = FNMS(KP521108558, TD, TB);
               TS = FMA(KP521108558, TC, TD);
               TJ = FMA(KP521108558, TE, TC);
               TP = FNMS(KP521108558, TB, TF);

               /* First two links of each Cr chain (sums only). */
               {
                    E Tx, Tp, Th, Tt, Tl;
                    Tx = FNMS(KP342584725, Ta, T7);
                    Ty = FNMS(KP634356270, Tx, Td);
                    Tp = FNMS(KP342584725, T4, Ta);
                    Tq = FNMS(KP634356270, Tp, Tg);
                    Th = FNMS(KP342584725, Tg, Td);
                    Ti = FNMS(KP634356270, Th, Ta);
                    Tt = FNMS(KP342584725, Td, T4);
                    Tu = FNMS(KP634356270, Tt, T7);
                    Tl = FNMS(KP342584725, T7, Tg);
                    Tm = FNMS(KP634356270, Tl, T4);
               }

               /* Finish each chain and store the outputs. */
               {
                    E To, Tn, TI, TH;
                    {
                         E Tk, Tj, TU, TT;
                         Tj = FNMS(KP778434453, Ti, T7);
                         Tk = FNMS(KP876768831, Tj, T4);
                         Cr[WS(csr, 5)] = FNMS(KP959492973, Tk, T1);
                         TT = FMA(KP715370323, TS, TF);
                         TU = FMA(KP830830026, TT, TB);
                         Ci[WS(csi, 5)] = KP989821441 * (FMA(KP918985947, TU, TE));
                    }
                    Tn = FNMS(KP778434453, Tm, Ta);
                    To = FNMS(KP876768831, Tn, Td);
                    Cr[WS(csr, 4)] = FNMS(KP959492973, To, T1);
                    {
                         E TR, TQ, Ts, Tr;
                         TQ = FMA(KP715370323, TP, TC);
                         TR = FNMS(KP830830026, TQ, TE);
                         Ci[WS(csi, 4)] = KP989821441 * (FNMS(KP918985947, TR, TD));
                         Tr = FNMS(KP778434453, Tq, Td);
                         Ts = FNMS(KP876768831, Tr, T7);
                         Cr[WS(csr, 3)] = FNMS(KP959492973, Ts, T1);
                    }
                    {
                         E TO, TN, Tw, Tv;
                         TN = FNMS(KP715370323, TM, TE);
                         TO = FNMS(KP830830026, TN, TF);
                         Ci[WS(csi, 3)] = KP989821441 * (FNMS(KP918985947, TO, TC));
                         Tv = FNMS(KP778434453, Tu, Tg);
                         Tw = FNMS(KP876768831, Tv, Ta);
                         Cr[WS(csr, 2)] = FNMS(KP959492973, Tw, T1);
                         /* Output 0 is the plain sum of all inputs. */
                         Cr[0] = T1 + T4 + T7 + Ta + Td + Tg;
                    }
                    TH = FMA(KP715370323, TG, TD);
                    TI = FNMS(KP830830026, TH, TC);
                    Ci[WS(csi, 2)] = KP989821441 * (FMA(KP918985947, TI, TB));
                    {
                         E TL, TK, TA, Tz;
                         TK = FNMS(KP715370323, TJ, TB);
                         TL = FMA(KP830830026, TK, TD);
                         Ci[WS(csi, 1)] = KP989821441 * (FNMS(KP918985947, TL, TF));
                         Tz = FNMS(KP778434453, Ty, T4);
                         TA = FNMS(KP876768831, Tz, Tg);
                         Cr[WS(csr, 1)] = FNMS(KP959492973, TA, T1);
                    }
               }
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 11, "r2cf_11", { 15, 5, 45, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_11) (planner *p) { X(kr2c_register) (p, r2cf_11, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 11 -name r2cf_11 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 60 FP additions, 50 FP multiplications,
|
||||
* (or, 20 additions, 10 multiplications, 40 fused multiply/add),
|
||||
* 28 stack variables, 10 constants, and 22 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_11: hard-coded size-11 real-to-complex forward transform kernel
 * (the variant genfft emits without -fma).  Each output is written as one
 * explicit linear combination of the paired inputs.
 *
 * Inputs:
 *   R0, R1    input arrays; each pass reads R0[WS(rs, k)] for k = 0..5
 *             and R1[WS(rs, k)] for k = 0..4.
 *             NOTE(review): presumably the even- and odd-indexed real
 *             samples -- confirm against rdft/codelet-rdft.h.
 *   rs        stride applied to both input arrays through WS().
 * Outputs:
 *   Cr, Ci    each pass stores Cr[WS(csr, k)] for k = 0..5 and
 *             Ci[WS(csi, k)] for k = 1..5; Ci[0] is never written here.
 *   csr, csi  strides applied to Cr and Ci through WS().
 * Batching:
 *   v         number of passes; after each pass R0/R1 advance by ivs and
 *             Cr/Ci advance by ovs.
 *
 * The KP constants equal |cos(2*pi*k/11)| (841253532, 415415013, 142314838,
 * 654860733, 959492973) and sin(2*pi*k/11) (540640817, 909631995,
 * 989821441, 755749574, 281732556) for k = 1..5.  All inputs are loaded
 * before the first store; keep the statement and operand order as is.
 */
static void r2cf_11(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP654860733, +0.654860733945285064056925072466293553183791199);
     DK(KP142314838, +0.142314838273285140443792668616369668791051361);
     DK(KP959492973, +0.959492973614497389890368057066327699062454848);
     DK(KP415415013, +0.415415013001886425529274149229623203524004910);
     DK(KP841253532, +0.841253532831181168861811648919367717513292498);
     DK(KP989821441, +0.989821441880932732376092037776718787376519372);
     DK(KP909631995, +0.909631995354518371411715383079028460060241051);
     DK(KP281732556, +0.281732556841429697711417915346616899035777899);
     DK(KP540640817, +0.540640817455597582107635954318691695431770608);
     DK(KP755749574, +0.755749574354258283774035843972344420179717445);
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(44, rs),
               MAKE_VOLATILE_STRIDE(44, csr),
               MAKE_VOLATILE_STRIDE(44, csi)) {
               E T1, T4, Tl, Tg, Th, Td, Ti, Ta, Tk, T7, Tj, Tb, Tc;

               /* R0[0] stands alone; the other ten samples are combined
                  into five sums (T4, Tg, Td, Ta, T7) and five differences
                  (Tl, Th, Ti, Tk, Tj). */
               T1 = R0[0];
               {
                    E T2, T3, Te, Tf;
                    T2 = R0[WS(rs, 1)];
                    T3 = R1[WS(rs, 4)];
                    T4 = T2 + T3;
                    Tl = T3 - T2;
                    Te = R1[0];
                    Tf = R0[WS(rs, 5)];
                    Tg = Te + Tf;
                    Th = Tf - Te;
               }
               Tb = R1[WS(rs, 1)];
               Tc = R0[WS(rs, 4)];
               Td = Tb + Tc;
               Ti = Tc - Tb;
               {
                    E T8, T9, T5, T6;
                    T8 = R1[WS(rs, 2)];
                    T9 = R0[WS(rs, 3)];
                    Ta = T8 + T9;
                    Tk = T9 - T8;
                    T5 = R0[WS(rs, 2)];
                    T6 = R1[WS(rs, 3)];
                    T7 = T5 + T6;
                    Tj = T6 - T5;
               }

               /* Ci outputs use only the differences, Cr outputs only T1
                  and the sums. */
               Ci[WS(csi, 4)] = FMA(KP755749574, Th, KP540640817 * Ti)
                    + FNMS(KP909631995, Tk, KP281732556 * Tj)
                    - (KP989821441 * Tl);
               Cr[WS(csr, 4)] = FMA(KP841253532, Td, T1)
                    + FNMS(KP959492973, T7, KP415415013 * Ta)
                    + FNMA(KP142314838, T4, KP654860733 * Tg);
               Ci[WS(csi, 2)] = FMA(KP909631995, Th, KP755749574 * Tl)
                    + FNMA(KP540640817, Tk, KP989821441 * Tj)
                    - (KP281732556 * Ti);
               Ci[WS(csi, 5)] = FMA(KP281732556, Th, KP755749574 * Ti)
                    + FNMS(KP909631995, Tj, KP989821441 * Tk)
                    - (KP540640817 * Tl);
               Ci[WS(csi, 1)] = FMA(KP540640817, Th, KP909631995 * Tl)
                    + FMA(KP989821441, Ti, KP755749574 * Tj)
                    + (KP281732556 * Tk);
               Ci[WS(csi, 3)] = FMA(KP989821441, Th, KP540640817 * Tj)
                    + FNMS(KP909631995, Ti, KP755749574 * Tk)
                    - (KP281732556 * Tl);
               Cr[WS(csr, 3)] = FMA(KP415415013, Td, T1)
                    + FNMS(KP654860733, Ta, KP841253532 * T7)
                    + FNMA(KP959492973, T4, KP142314838 * Tg);
               Cr[WS(csr, 1)] = FMA(KP841253532, Tg, T1)
                    + FNMS(KP959492973, Ta, KP415415013 * T4)
                    + FNMA(KP654860733, T7, KP142314838 * Td);
               /* Output 0 is the plain sum of all inputs. */
               Cr[0] = T1 + Tg + T4 + Td + T7 + Ta;
               Cr[WS(csr, 2)] = FMA(KP415415013, Tg, T1)
                    + FNMS(KP142314838, T7, KP841253532 * Ta)
                    + FNMA(KP959492973, Td, KP654860733 * T4);
               Cr[WS(csr, 5)] = FMA(KP841253532, T4, T1)
                    + FNMS(KP142314838, Ta, KP415415013 * T7)
                    + FNMA(KP654860733, Td, KP959492973 * Tg);
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 11, "r2cf_11", { 20, 10, 40, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_11) (planner *p) { X(kr2c_register) (p, r2cf_11, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
218
fftw-3.3.10/rdft/scalar/r2cf/r2cf_12.c
Normal file
218
fftw-3.3.10/rdft/scalar/r2cf/r2cf_12.c
Normal file
@@ -0,0 +1,218 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cf_12 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 38 FP additions, 10 FP multiplications,
|
||||
* (or, 30 additions, 2 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_12: hard-coded size-12 real-to-complex forward transform kernel
 * (the variant genfft emits with -fma).
 *
 * Inputs:
 *   R0, R1    input arrays; each pass reads R0[WS(rs, k)] and
 *             R1[WS(rs, k)] for k = 0..5.
 *             NOTE(review): presumably the even- and odd-indexed real
 *             samples -- confirm against rdft/codelet-rdft.h.
 *   rs        stride applied to both input arrays through WS().
 * Outputs:
 *   Cr, Ci    each pass stores Cr[WS(csr, k)] for k = 0..6 and
 *             Ci[WS(csi, k)] for k = 1..5; Ci[0] and Ci[WS(csi, 6)] are
 *             never written here.
 *   csr, csi  strides applied to Cr and Ci through WS().
 * Batching:
 *   v         number of passes; after each pass R0/R1 advance by ivs and
 *             Cr/Ci advance by ovs.
 *
 * The twelve inputs are consumed as four groups of three; each group
 * yields a full sum, a "first minus half the other two" term and a
 * difference of the other two.  All loads precede the first store; keep
 * the statement order as generated.
 */
static void r2cf_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* sqrt(3)/2 */
     DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(48, rs),
               MAKE_VOLATILE_STRIDE(48, csr),
               MAKE_VOLATILE_STRIDE(48, csi)) {
               E T5, Tp, Tm, Tk, Ty, Tt, Ta, Tq, Tn, Tf, Tz, Tu, Tl, To;

               /* Group 1: R0[0], R0[2], R0[4]. */
               {
                    E T1, T2, T3, T4;
                    T1 = R0[0];
                    T2 = R0[WS(rs, 2)];
                    T3 = R0[WS(rs, 4)];
                    T4 = T2 + T3;
                    T5 = T1 + T4;
                    Tp = FNMS(KP500000000, T4, T1);
                    Tm = T3 - T2;
               }
               /* Group 2: R1[1], R1[3], R1[5]. */
               {
                    E Tg, Th, Ti, Tj;
                    Tg = R1[WS(rs, 1)];
                    Th = R1[WS(rs, 3)];
                    Ti = R1[WS(rs, 5)];
                    Tj = Th + Ti;
                    Tk = FNMS(KP500000000, Tj, Tg);
                    Ty = Ti - Th;
                    Tt = Tg + Tj;
               }
               /* Group 3: R0[3], R0[5], R0[1]. */
               {
                    E T6, T7, T8, T9;
                    T6 = R0[WS(rs, 3)];
                    T7 = R0[WS(rs, 5)];
                    T8 = R0[WS(rs, 1)];
                    T9 = T7 + T8;
                    Ta = T6 + T9;
                    Tq = FNMS(KP500000000, T9, T6);
                    Tn = T8 - T7;
               }
               /* Group 4: R1[4], R1[0], R1[2]. */
               {
                    E Tb, Tc, Td, Te;
                    Tb = R1[WS(rs, 4)];
                    Tc = R1[0];
                    Td = R1[WS(rs, 2)];
                    Te = Tc + Td;
                    Tf = FNMS(KP500000000, Te, Tb);
                    Tz = Td - Tc;
                    Tu = Tb + Te;
               }

               /* Combine the group results into the outputs. */
               Cr[WS(csr, 3)] = T5 - Ta;
               Ci[WS(csi, 3)] = Tt - Tu;
               Tl = Tf - Tk;
               To = Tm - Tn;
               Ci[WS(csi, 1)] = FMA(KP866025403, To, Tl);
               Ci[WS(csi, 5)] = FNMS(KP866025403, To, Tl);
               {
                    E Tx, TA, Tv, Tw;
                    Tx = Tp - Tq;
                    TA = Ty - Tz;
                    Cr[WS(csr, 5)] = FNMS(KP866025403, TA, Tx);
                    Cr[WS(csr, 1)] = FMA(KP866025403, TA, Tx);
                    Tv = T5 + Ta;
                    Tw = Tt + Tu;
                    Cr[WS(csr, 6)] = Tv - Tw;
                    Cr[0] = Tv + Tw;
               }
               {
                    E Tr, Ts, TB, TC;
                    Tr = Tp + Tq;
                    Ts = Tk + Tf;
                    Cr[WS(csr, 2)] = Tr - Ts;
                    Cr[WS(csr, 4)] = Tr + Ts;
                    TB = Ty + Tz;
                    TC = Tm + Tn;
                    Ci[WS(csi, 2)] = KP866025403 * (TB - TC);
                    Ci[WS(csi, 4)] = KP866025403 * (TC + TB);
               }
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 12, "r2cf_12", { 30, 2, 8, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_12) (planner *p) { X(kr2c_register) (p, r2cf_12, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 12 -name r2cf_12 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 38 FP additions, 8 FP multiplications,
|
||||
* (or, 34 additions, 4 multiplications, 4 fused multiply/add),
|
||||
* 21 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_12: hard-coded size-12 real-to-complex forward transform kernel
 * (the variant genfft emits without -fma; the sqrt(3)/2 scalings are
 * explicit multiplications followed by separate adds).
 *
 * Inputs:
 *   R0, R1    input arrays; each pass reads R0[WS(rs, k)] and
 *             R1[WS(rs, k)] for k = 0..5.
 *             NOTE(review): presumably the even- and odd-indexed real
 *             samples -- confirm against rdft/codelet-rdft.h.
 *   rs        stride applied to both input arrays through WS().
 * Outputs:
 *   Cr, Ci    each pass stores Cr[WS(csr, k)] for k = 0..6 and
 *             Ci[WS(csi, k)] for k = 1..5; Ci[0] and Ci[WS(csi, 6)] are
 *             never written here.
 *   csr, csi  strides applied to Cr and Ci through WS().
 * Batching:
 *   v         number of passes; after each pass R0/R1 advance by ivs and
 *             Cr/Ci advance by ovs.
 *
 * The twelve inputs are consumed as four groups of three; each group
 * yields a full sum, a "first minus half the other two" term and a
 * difference of the other two.  All loads precede the first store; keep
 * the statement order as generated.
 */
static void r2cf_12(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* sqrt(3)/2 */
     DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(48, rs),
               MAKE_VOLATILE_STRIDE(48, csr),
               MAKE_VOLATILE_STRIDE(48, csi)) {
               E T5, Tp, Tb, Tn, Ty, Tt, Ta, Tq, Tc, Ti, Tz, Tu, Td, To;

               /* Group 1: R0[0], R0[2], R0[4]. */
               {
                    E T1, T2, T3, T4;
                    T1 = R0[0];
                    T2 = R0[WS(rs, 2)];
                    T3 = R0[WS(rs, 4)];
                    T4 = T2 + T3;
                    T5 = T1 + T4;
                    Tp = FNMS(KP500000000, T4, T1);
                    Tb = T3 - T2;
               }
               /* Group 2: R1[1], R1[3], R1[5]. */
               {
                    E Tj, Tk, Tl, Tm;
                    Tj = R1[WS(rs, 1)];
                    Tk = R1[WS(rs, 3)];
                    Tl = R1[WS(rs, 5)];
                    Tm = Tk + Tl;
                    Tn = FNMS(KP500000000, Tm, Tj);
                    Ty = Tl - Tk;
                    Tt = Tj + Tm;
               }
               /* Group 3: R0[3], R0[5], R0[1]. */
               {
                    E T6, T7, T8, T9;
                    T6 = R0[WS(rs, 3)];
                    T7 = R0[WS(rs, 5)];
                    T8 = R0[WS(rs, 1)];
                    T9 = T7 + T8;
                    Ta = T6 + T9;
                    Tq = FNMS(KP500000000, T9, T6);
                    Tc = T8 - T7;
               }
               /* Group 4: R1[4], R1[0], R1[2]. */
               {
                    E Te, Tf, Tg, Th;
                    Te = R1[WS(rs, 4)];
                    Tf = R1[0];
                    Tg = R1[WS(rs, 2)];
                    Th = Tf + Tg;
                    Ti = FNMS(KP500000000, Th, Te);
                    Tz = Tg - Tf;
                    Tu = Te + Th;
               }

               /* Combine the group results into the outputs. */
               Cr[WS(csr, 3)] = T5 - Ta;
               Ci[WS(csi, 3)] = Tt - Tu;
               Td = KP866025403 * (Tb - Tc);
               To = Ti - Tn;
               Ci[WS(csi, 1)] = Td + To;
               Ci[WS(csi, 5)] = To - Td;
               {
                    E Tx, TA, Tv, Tw;
                    Tx = Tp - Tq;
                    TA = KP866025403 * (Ty - Tz);
                    Cr[WS(csr, 5)] = Tx - TA;
                    Cr[WS(csr, 1)] = Tx + TA;
                    Tv = T5 + Ta;
                    Tw = Tt + Tu;
                    Cr[WS(csr, 6)] = Tv - Tw;
                    Cr[0] = Tv + Tw;
               }
               {
                    E Tr, Ts, TB, TC;
                    Tr = Tp + Tq;
                    Ts = Tn + Ti;
                    Cr[WS(csr, 2)] = Tr - Ts;
                    Cr[WS(csr, 4)] = Tr + Ts;
                    TB = Ty + Tz;
                    TC = Tb + Tc;
                    Ci[WS(csi, 2)] = KP866025403 * (TB - TC);
                    Ci[WS(csi, 4)] = KP866025403 * (TC + TB);
               }
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 12, "r2cf_12", { 34, 4, 4, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_12) (planner *p) { X(kr2c_register) (p, r2cf_12, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
3244
fftw-3.3.10/rdft/scalar/r2cf/r2cf_128.c
Normal file
3244
fftw-3.3.10/rdft/scalar/r2cf/r2cf_128.c
Normal file
File diff suppressed because it is too large
Load Diff
361
fftw-3.3.10/rdft/scalar/r2cf/r2cf_13.c
Normal file
361
fftw-3.3.10/rdft/scalar/r2cf/r2cf_13.c
Normal file
@@ -0,0 +1,361 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 13 -name r2cf_13 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 76 FP additions, 51 FP multiplications,
|
||||
* (or, 31 additions, 6 multiplications, 45 fused multiply/add),
|
||||
* 58 stack variables, 23 constants, and 26 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_13: hard-coded size-13 real-to-complex forward transform kernel
 * (the variant genfft emits with -fma).
 *
 * Inputs:
 *   R0, R1    input arrays; each pass reads R0[WS(rs, k)] for k = 0..6
 *             and R1[WS(rs, k)] for k = 0..5.
 *             NOTE(review): presumably the even- and odd-indexed real
 *             samples -- confirm against rdft/codelet-rdft.h.
 *   rs        stride applied to both input arrays through WS().
 * Outputs:
 *   Cr, Ci    each pass stores Cr[WS(csr, k)] for k = 0..6 and
 *             Ci[WS(csi, k)] for k = 1..6; Ci[0] is never written here.
 *   csr, csi  strides applied to Cr and Ci through WS().
 * Batching:
 *   v         number of passes; after each pass R0/R1 advance by ivs and
 *             Cr/Ci advance by ovs.
 *
 * Structure: one block loads all thirteen inputs and reduces them to a
 * handful of intermediates; Cr[0] is then the sum of all inputs, one block
 * produces the six Ci outputs and a last block the remaining six Cr
 * outputs.  FMA, FMS and FNMS are macros from the included headers.  All
 * loads precede the first store, and the operation order is the
 * generator's; keep both intact.
 */
static void r2cf_13(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP300462606, +0.300462606288665774426601772289207995520941381);
     DK(KP516520780, +0.516520780623489722840901288569017135705033622);
     DK(KP859542535, +0.859542535098774820163672132761689612766401925);
     DK(KP581704778, +0.581704778510515730456870384989698884939833902);
     DK(KP514918778, +0.514918778086315755491789696138117261566051239);
     DK(KP769338817, +0.769338817572980603471413688209101117038278899);
     DK(KP686558370, +0.686558370781754340655719594850823015421401653);
     DK(KP226109445, +0.226109445035782405468510155372505010481906348);
     DK(KP251768516, +0.251768516431883313623436926934233488546674281);
     DK(KP503537032, +0.503537032863766627246873853868466977093348562);
     DK(KP301479260, +0.301479260047709873958013540496673347309208464);
     DK(KP083333333, +0.083333333333333333333333333333333333333333333); /* 1/12 */
     DK(KP904176221, +0.904176221990848204433795481776887926501523162);
     DK(KP575140729, +0.575140729474003121368385547455453388461001608);
     DK(KP522026385, +0.522026385161275033714027226654165028300441940);
     DK(KP957805992, +0.957805992594665126462521754605754580515587217);
     DK(KP600477271, +0.600477271932665282925769253334763009352012849);
     DK(KP853480001, +0.853480001859823990758994934970528322872359049);
     DK(KP612264650, +0.612264650376756543746494474777125408779395514);
     DK(KP038632954, +0.038632954644348171955506895830342264440241080);
     DK(KP302775637, +0.302775637731994646559610633735247973125648287);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* sqrt(3)/2 */
     DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(52, rs),
               MAKE_VOLATILE_STRIDE(52, csr),
               MAKE_VOLATILE_STRIDE(52, csi)) {
               E TN, TA, TD, TO, TR, TS, TZ, T12, Tu, Tx, Tj, Tw, TW, T13;

               TN = R0[0];

               /* Load the remaining twelve inputs and reduce them. */
               {
                    E T3, TP, Th, TB, Tp, Te, TC, Tm, T6, Tr, T9, Ts, Ta, TQ, T1, T2;
                    T1 = R0[WS(rs, 4)];
                    T2 = R1[WS(rs, 2)];
                    T3 = T1 - T2;
                    TP = T1 + T2;
                    {
                         E Tn, Tf, Tg, To;
                         Tn = R0[WS(rs, 6)];
                         Tf = R0[WS(rs, 5)];
                         Tg = R0[WS(rs, 2)];
                         To = Tf + Tg;
                         Th = Tf - Tg;
                         TB = Tn + To;
                         Tp = FMS(KP500000000, To, Tn);
                    }
                    {
                         E Tk, Tc, Td, Tl;
                         Tk = R1[0];
                         Tc = R1[WS(rs, 4)];
                         Td = R1[WS(rs, 1)];
                         Tl = Td + Tc;
                         Te = Tc - Td;
                         TC = Tk + Tl;
                         Tm = FNMS(KP500000000, Tl, Tk);
                    }
                    {
                         E T4, T5, T7, T8;
                         T4 = R1[WS(rs, 5)];
                         T5 = R0[WS(rs, 3)];
                         T6 = T4 - T5;
                         Tr = T4 + T5;
                         T7 = R1[WS(rs, 3)];
                         T8 = R0[WS(rs, 1)];
                         T9 = T7 - T8;
                         Ts = T7 + T8;
                    }
                    Ta = T6 + T9;
                    TQ = Tr + Ts;
                    TA = T3 + Ta;
                    TD = TB - TC;
                    TO = TC + TB;
                    TR = TP + TQ;
                    TS = TO + TR;
                    {
                         E TX, TY, Tq, Tt;
                         TX = Tm - Tp;
                         TY = FNMS(KP500000000, TQ, TP);
                         TZ = TX + TY;
                         T12 = TX - TY;
                         Tq = Tm + Tp;
                         Tt = Tr - Ts;
                         Tu = FMA(KP866025403, Tt, Tq);
                         Tx = FNMS(KP866025403, Tt, Tq);
                    }
                    {
                         E Tb, Ti, TU, TV;
                         Tb = FNMS(KP500000000, Ta, T3);
                         Ti = Te + Th;
                         Tj = FMA(KP866025403, Ti, Tb);
                         Tw = FNMS(KP866025403, Ti, Tb);
                         TU = Th - Te;
                         TV = T6 - T9;
                         TW = TU + TV;
                         T13 = TU - TV;
                    }
               }

               /* TS is the sum of the twelve non-zero-index inputs. */
               Cr[0] = TN + TS;

               /* Ci outputs 1..6. */
               {
                    E TE, TI, Tz, TK, TH, TM, TJ, TL;
                    TE = FMA(KP302775637, TD, TA);
                    TI = FNMS(KP302775637, TA, TD);
                    {
                         E Tv, Ty, TF, TG;
                         Tv = FMA(KP038632954, Tu, Tj);
                         Ty = FMA(KP612264650, Tx, Tw);
                         Tz = FNMS(KP853480001, Ty, Tv);
                         TK = FMA(KP853480001, Ty, Tv);
                         TF = FNMS(KP038632954, Tj, Tu);
                         TG = FNMS(KP612264650, Tw, Tx);
                         TH = FNMS(KP853480001, TG, TF);
                         TM = FMA(KP853480001, TG, TF);
                    }
                    Ci[WS(csi, 1)] = KP600477271 * (FMA(KP957805992, TE, Tz));
                    Ci[WS(csi, 5)] = -(KP600477271 * (FNMS(KP957805992, TI, TH)));
                    TJ = FMA(KP522026385, TH, TI);
                    Ci[WS(csi, 2)] = KP575140729 * (FNMS(KP904176221, TK, TJ));
                    Ci[WS(csi, 6)] = KP575140729 * (FMA(KP904176221, TK, TJ));
                    TL = FNMS(KP522026385, Tz, TE);
                    Ci[WS(csi, 3)] = KP575140729 * (FNMS(KP904176221, TM, TL));
                    Ci[WS(csi, 4)] = -(KP575140729 * (FMA(KP904176221, TM, TL)));
               }

               /* Cr outputs 1..6. */
               {
                    E T11, T17, T1c, T1e, T16, T18, TT, T10, T19, T1d;
                    TT = FNMS(KP083333333, TS, TN);
                    T10 = FMA(KP301479260, TZ, TW);
                    T11 = FMA(KP503537032, T10, TT);
                    T17 = FNMS(KP251768516, T10, TT);
                    {
                         E T1a, T1b, T14, T15;
                         T1a = FNMS(KP226109445, TW, TZ);
                         T1b = FMA(KP686558370, T12, T13);
                         T1c = FNMS(KP769338817, T1b, T1a);
                         T1e = FMA(KP769338817, T1b, T1a);
                         T14 = FNMS(KP514918778, T13, T12);
                         T15 = TO - TR;
                         T16 = FMA(KP581704778, T15, T14);
                         T18 = FNMS(KP859542535, T14, T15);
                    }
                    Cr[WS(csr, 5)] = FNMS(KP516520780, T16, T11);
                    Cr[WS(csr, 1)] = FMA(KP516520780, T16, T11);
                    T19 = FMA(KP300462606, T18, T17);
                    Cr[WS(csr, 4)] = FNMS(KP503537032, T1c, T19);
                    Cr[WS(csr, 3)] = FMA(KP503537032, T1c, T19);
                    T1d = FNMS(KP300462606, T18, T17);
                    Cr[WS(csr, 6)] = FNMS(KP503537032, T1e, T1d);
                    Cr[WS(csr, 2)] = FMA(KP503537032, T1e, T1d);
               }
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 13, "r2cf_13", { 31, 6, 45, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_13) (planner *p) { X(kr2c_register) (p, r2cf_13, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 13 -name r2cf_13 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 76 FP additions, 34 FP multiplications,
|
||||
* (or, 57 additions, 15 multiplications, 19 fused multiply/add),
|
||||
* 55 stack variables, 20 constants, and 26 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_13: hard-coded size-13 real-to-complex forward transform kernel
 * (the variant genfft emits without -fma).
 *
 * Inputs:
 *   R0, R1    input arrays; each pass reads R0[WS(rs, k)] for k = 0..6
 *             and R1[WS(rs, k)] for k = 0..5.
 *             NOTE(review): presumably the even- and odd-indexed real
 *             samples -- confirm against rdft/codelet-rdft.h.
 *   rs        stride applied to both input arrays through WS().
 * Outputs:
 *   Cr, Ci    each pass stores Cr[WS(csr, k)] for k = 0..6 and
 *             Ci[WS(csi, k)] for k = 1..6; Ci[0] is never written here.
 *   csr, csi  strides applied to Cr and Ci through WS().
 * Batching:
 *   v         number of passes; after each pass R0/R1 advance by ivs and
 *             Cr/Ci advance by ovs.
 *
 * Structure: one block loads all thirteen inputs and reduces them to a
 * handful of intermediates; Cr[0] is then the sum of all inputs, one block
 * produces the six Ci outputs and a last block the remaining six Cr
 * outputs.  FMA and FNMS are macros from the included headers.  All loads
 * precede the first store, and the operation order is the generator's;
 * keep both intact.
 */
static void r2cf_13(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP083333333, +0.083333333333333333333333333333333333333333333); /* 1/12 */
     DK(KP075902986, +0.075902986037193865983102897245103540356428373);
     DK(KP251768516, +0.251768516431883313623436926934233488546674281);
     DK(KP503537032, +0.503537032863766627246873853868466977093348562);
     DK(KP113854479, +0.113854479055790798974654345867655310534642560);
     DK(KP265966249, +0.265966249214837287587521063842185948798330267);
     DK(KP387390585, +0.387390585467617292130675966426762851778775217);
     DK(KP300462606, +0.300462606288665774426601772289207995520941381);
     DK(KP132983124, +0.132983124607418643793760531921092974399165133);
     DK(KP258260390, +0.258260390311744861420450644284508567852516811);
     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254); /* sqrt(3) */
     DK(KP300238635, +0.300238635966332641462884626667381504676006424);
     DK(KP011599105, +0.011599105605768290721655456654083252189827041);
     DK(KP156891391, +0.156891391051584611046832726756003269660212636);
     DK(KP256247671, +0.256247671582936600958684654061725059144125175);
     DK(KP174138601, +0.174138601152135905005660794929264742616964676);
     DK(KP575140729, +0.575140729474003121368385547455453388461001608);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627); /* sqrt(3)/2 */
     DK(KP500000000, +0.500000000000000000000000000000000000000000000); /* 1/2 */
     {
          INT i;
          for (i = v; i > 0;
               --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs,
               MAKE_VOLATILE_STRIDE(52, rs),
               MAKE_VOLATILE_STRIDE(52, csr),
               MAKE_VOLATILE_STRIDE(52, csi)) {
               E T13, Tb, Tm, TW, TX, T14, TU, T10, Tz, TB, Tu, TC, TR, T11;

               T13 = R0[0];

               /* Load the remaining twelve inputs and reduce them. */
               {
                    E Te, TO, Ta, Tv, To, T5, Tw, Tp, Th, Tr, Tk, Ts, Tl, TP, Tc, Td;
                    Tc = R0[WS(rs, 4)];
                    Td = R1[WS(rs, 2)];
                    Te = Tc - Td;
                    TO = Tc + Td;
                    {
                         E T6, T7, T8, T9;
                         T6 = R1[0];
                         T7 = R1[WS(rs, 1)];
                         T8 = R1[WS(rs, 4)];
                         T9 = T7 + T8;
                         Ta = T6 + T9;
                         Tv = T7 - T8;
                         To = FNMS(KP500000000, T9, T6);
                    }
                    {
                         E T1, T2, T3, T4;
                         T1 = R0[WS(rs, 6)];
                         T2 = R0[WS(rs, 5)];
                         T3 = R0[WS(rs, 2)];
                         T4 = T2 + T3;
                         T5 = T1 + T4;
                         Tw = T2 - T3;
                         Tp = FNMS(KP500000000, T4, T1);
                    }
                    {
                         E Tf, Tg, Ti, Tj;
                         Tf = R1[WS(rs, 5)];
                         Tg = R0[WS(rs, 3)];
                         Th = Tf - Tg;
                         Tr = Tf + Tg;
                         Ti = R1[WS(rs, 3)];
                         Tj = R0[WS(rs, 1)];
                         Tk = Ti - Tj;
                         Ts = Ti + Tj;
                    }
                    Tl = Th + Tk;
                    TP = Tr + Ts;
                    Tb = T5 - Ta;
                    Tm = Te + Tl;
                    TW = Ta + T5;
                    TX = TO + TP;
                    T14 = TW + TX;
                    {
                         E TS, TT, Tx, Ty;
                         TS = Tv + Tw;
                         TT = Th - Tk;
                         TU = TS - TT;
                         T10 = TS + TT;
                         Tx = KP866025403 * (Tv - Tw);
                         Ty = FNMS(KP500000000, Tl, Te);
                         Tz = Tx + Ty;
                         TB = Ty - Tx;
                    }
                    {
                         E Tq, Tt, TN, TQ;
                         Tq = To - Tp;
                         Tt = KP866025403 * (Tr - Ts);
                         Tu = Tq - Tt;
                         TC = Tq + Tt;
                         TN = To + Tp;
                         TQ = FNMS(KP500000000, TP, TO);
                         TR = TN - TQ;
                         T11 = TN + TQ;
                    }
               }

               /* T14 is the sum of the twelve non-zero-index inputs. */
               Cr[0] = T13 + T14;

               /* Ci outputs 1..6. */
               {
                    E Tn, TG, TE, TF, TJ, TM, TK, TL;
                    Tn = FNMS(KP174138601, Tm, KP575140729 * Tb);
                    TG = FMA(KP174138601, Tb, KP575140729 * Tm);
                    {
                         E TA, TD, TH, TI;
                         TA = FNMS(KP156891391, Tz, KP256247671 * Tu);
                         TD = FNMS(KP300238635, TC, KP011599105 * TB);
                         TE = TA + TD;
                         TF = KP1_732050807 * (TD - TA);
                         TH = FMA(KP300238635, TB, KP011599105 * TC);
                         TI = FMA(KP256247671, Tz, KP156891391 * Tu);
                         TJ = TH - TI;
                         TM = KP1_732050807 * (TI + TH);
                    }
                    Ci[WS(csi, 5)] = FMA(KP2_000000000, TE, Tn);
                    Ci[WS(csi, 1)] = FMA(KP2_000000000, TJ, TG);
                    TK = TG - TJ;
                    Ci[WS(csi, 4)] = TF - TK;
                    Ci[WS(csi, 3)] = TF + TK;
                    TL = Tn - TE;
                    Ci[WS(csi, 2)] = TL - TM;
                    Ci[WS(csi, 6)] = TL + TM;
               }

               /* Cr outputs 1..6. */
               {
                    E TZ, T1b, T19, T1e, T16, T1a, TV, TY, T1c, T1d;
                    TV = FNMS(KP132983124, TU, KP258260390 * TR);
                    TY = KP300462606 * (TW - TX);
                    TZ = FMA(KP2_000000000, TV, TY);
                    T1b = TY - TV;
                    {
                         E T17, T18, T12, T15;
                         T17 = FMA(KP387390585, TU, KP265966249 * TR);
                         T18 = FNMS(KP503537032, T11, KP113854479 * T10);
                         T19 = T17 - T18;
                         T1e = T17 + T18;
                         T12 = FMA(KP251768516, T10, KP075902986 * T11);
                         T15 = FNMS(KP083333333, T14, T13);
                         T16 = FMA(KP2_000000000, T12, T15);
                         T1a = T15 - T12;
                    }
                    Cr[WS(csr, 1)] = TZ + T16;
                    Cr[WS(csr, 5)] = T16 - TZ;
                    T1c = T1a - T1b;
                    Cr[WS(csr, 2)] = T19 + T1c;
                    Cr[WS(csr, 6)] = T1c - T19;
                    T1d = T1b + T1a;
                    Cr[WS(csr, 3)] = T1d - T1e;
                    Cr[WS(csr, 4)] = T1e + T1d;
               }
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 13, "r2cf_13", { 57, 15, 19, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_13) (planner *p) { X(kr2c_register) (p, r2cf_13, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
263
fftw-3.3.10/rdft/scalar/r2cf/r2cf_14.c
Normal file
263
fftw-3.3.10/rdft/scalar/r2cf/r2cf_14.c
Normal file
@@ -0,0 +1,263 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 14 -name r2cf_14 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 62 FP additions, 36 FP multiplications,
|
||||
* (or, 32 additions, 6 multiplications, 30 fused multiply/add),
|
||||
* 33 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_14: hard-coded real-to-complex FFT of size 14.  This is the variant
 * compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Runs v transforms back to back.  Each pass reads 14 real inputs,
 * R0[WS(rs, k)] and R1[WS(rs, k)] for k = 0..6, and stores 14 outputs:
 * Cr[0] and Cr[WS(csr, 1..7)], plus Ci[WS(csi, 1..6)].  No Ci slot at
 * offset 0 or 7 is written here.
 *
 * After every pass R0 and R1 advance by ivs, and Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  DK, E, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * declared outside this file.
 */
static void r2cf_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
|
||||
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
|
||||
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; the pointer bumps live in the loop's step expression. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) {
|
||||
E T3, TN, To, TQ, Tx, TG, Ta, TO, Tw, TD, Th, TP, Tv, TJ, T1;
|
||||
E T2, TA, TK;
|
||||
/* Each input pair below is loaded once and kept as both a difference and a sum. */
T1 = R0[0];
|
||||
T2 = R1[WS(rs, 3)];
|
||||
T3 = T1 - T2;
|
||||
TN = T1 + T2;
|
||||
{
|
||||
E Tk, TE, Tn, TF;
|
||||
{
|
||||
E Ti, Tj, Tl, Tm;
|
||||
Ti = R0[WS(rs, 3)];
|
||||
Tj = R1[WS(rs, 6)];
|
||||
Tk = Ti - Tj;
|
||||
TE = Ti + Tj;
|
||||
Tl = R0[WS(rs, 4)];
|
||||
Tm = R1[0];
|
||||
Tn = Tl - Tm;
|
||||
TF = Tl + Tm;
|
||||
}
|
||||
To = Tk + Tn;
|
||||
TQ = TE + TF;
|
||||
Tx = Tn - Tk;
|
||||
TG = TE - TF;
|
||||
}
|
||||
{
|
||||
E T6, TC, T9, TB;
|
||||
{
|
||||
E T4, T5, T7, T8;
|
||||
T4 = R0[WS(rs, 1)];
|
||||
T5 = R1[WS(rs, 4)];
|
||||
T6 = T4 - T5;
|
||||
TC = T4 + T5;
|
||||
T7 = R0[WS(rs, 6)];
|
||||
T8 = R1[WS(rs, 2)];
|
||||
T9 = T7 - T8;
|
||||
TB = T7 + T8;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
TO = TC + TB;
|
||||
Tw = T6 - T9;
|
||||
TD = TB - TC;
|
||||
}
|
||||
{
|
||||
E Td, TH, Tg, TI;
|
||||
{
|
||||
E Tb, Tc, Te, Tf;
|
||||
Tb = R0[WS(rs, 2)];
|
||||
Tc = R1[WS(rs, 5)];
|
||||
Td = Tb - Tc;
|
||||
TH = Tb + Tc;
|
||||
Te = R0[WS(rs, 5)];
|
||||
Tf = R1[WS(rs, 1)];
|
||||
Tg = Te - Tf;
|
||||
TI = Te + Tf;
|
||||
}
|
||||
Th = Td + Tg;
|
||||
TP = TH + TI;
|
||||
Tv = Tg - Td;
|
||||
TJ = TH - TI;
|
||||
}
|
||||
/* Sum of the seven pair differences: (sum of R0 inputs) - (sum of R1 inputs). */
Cr[WS(csr, 7)] = T3 + Ta + Th + To;
|
||||
/* Sum of the seven pair sums: the total of all 14 inputs. */
Cr[0] = TN + TO + TP + TQ;
|
||||
TA = FMA(KP554958132, Tw, Tv);
|
||||
Ci[WS(csi, 3)] = KP974927912 * (FNMS(KP801937735, TA, Tx));
|
||||
{
|
||||
E TL, TM, Ty, Tz;
|
||||
TL = FNMS(KP554958132, TG, TD);
|
||||
Ci[WS(csi, 6)] = KP974927912 * (FNMS(KP801937735, TL, TJ));
|
||||
TM = FMA(KP554958132, TD, TJ);
|
||||
Ci[WS(csi, 4)] = KP974927912 * (FNMS(KP801937735, TM, TG));
|
||||
Ty = FNMS(KP554958132, Tx, Tw);
|
||||
Ci[WS(csi, 1)] = KP974927912 * (FNMS(KP801937735, Ty, Tv));
|
||||
Tz = FMA(KP554958132, Tv, Tx);
|
||||
Ci[WS(csi, 5)] = KP974927912 * (FMA(KP801937735, Tz, Tw));
|
||||
}
|
||||
TK = FMA(KP554958132, TJ, TG);
|
||||
Ci[WS(csi, 2)] = KP974927912 * (FMA(KP801937735, TK, TD));
|
||||
{
|
||||
E TU, TT, Tq, Tp;
|
||||
TT = FNMS(KP356895867, TO, TQ);
|
||||
TU = FNMS(KP692021471, TT, TP);
|
||||
Cr[WS(csr, 2)] = FNMS(KP900968867, TU, TN);
|
||||
Tp = FNMS(KP356895867, To, Th);
|
||||
Tq = FNMS(KP692021471, Tp, Ta);
|
||||
Cr[WS(csr, 3)] = FNMS(KP900968867, Tq, T3);
|
||||
}
|
||||
{
|
||||
E Tu, Tt, Ts, Tr;
|
||||
Tt = FNMS(KP356895867, Th, Ta);
|
||||
Tu = FNMS(KP692021471, Tt, To);
|
||||
Cr[WS(csr, 1)] = FNMS(KP900968867, Tu, T3);
|
||||
Tr = FNMS(KP356895867, Ta, To);
|
||||
Ts = FNMS(KP692021471, Tr, Th);
|
||||
Cr[WS(csr, 5)] = FNMS(KP900968867, Ts, T3);
|
||||
}
|
||||
{
|
||||
E TW, TV, TS, TR;
|
||||
TV = FNMS(KP356895867, TP, TO);
|
||||
TW = FNMS(KP692021471, TV, TQ);
|
||||
Cr[WS(csr, 6)] = FNMS(KP900968867, TW, TN);
|
||||
TR = FNMS(KP356895867, TQ, TP);
|
||||
TS = FNMS(KP692021471, TR, TO);
|
||||
Cr[WS(csr, 4)] = FNMS(KP900968867, TS, TN);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 14, "r2cf_14", { 32, 6, 30, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_14) (planner *p) { X(kr2c_register) (p, r2cf_14, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 14 -name r2cf_14 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 62 FP additions, 36 FP multiplications,
|
||||
* (or, 38 additions, 12 multiplications, 24 fused multiply/add),
|
||||
* 29 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_14: hard-coded real-to-complex FFT of size 14.  This is the variant
 * in the #else branch, compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Runs v transforms back to back.  Each pass reads 14 real inputs,
 * R0[WS(rs, k)] and R1[WS(rs, k)] for k = 0..6, and stores 14 outputs:
 * Cr[0] and Cr[WS(csr, 1..7)], plus Ci[WS(csi, 1..6)].  No Ci slot at
 * offset 0 or 7 is written here.
 *
 * After every pass R0 and R1 advance by ivs, and Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  DK, E, WS, FMA, FNMS, FNMA and
 * MAKE_VOLATILE_STRIDE are declared outside this file.
 */
static void r2cf_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
|
||||
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
|
||||
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; the pointer bumps live in the loop's step expression. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) {
|
||||
E T3, TB, T6, Tv, Tn, Ts, Tk, Tt, Td, Ty, T9, Tw, Tg, Tz, T1;
|
||||
E T2;
|
||||
/* Each input pair below is loaded once and kept as both a difference and a sum. */
T1 = R0[0];
|
||||
T2 = R1[WS(rs, 3)];
|
||||
T3 = T1 - T2;
|
||||
TB = T1 + T2;
|
||||
{
|
||||
E T4, T5, Tl, Tm;
|
||||
T4 = R0[WS(rs, 2)];
|
||||
T5 = R1[WS(rs, 5)];
|
||||
T6 = T4 - T5;
|
||||
Tv = T4 + T5;
|
||||
Tl = R0[WS(rs, 6)];
|
||||
Tm = R1[WS(rs, 2)];
|
||||
Tn = Tl - Tm;
|
||||
Ts = Tl + Tm;
|
||||
}
|
||||
{
|
||||
E Ti, Tj, Tb, Tc;
|
||||
Ti = R0[WS(rs, 1)];
|
||||
Tj = R1[WS(rs, 4)];
|
||||
Tk = Ti - Tj;
|
||||
Tt = Ti + Tj;
|
||||
Tb = R0[WS(rs, 3)];
|
||||
Tc = R1[WS(rs, 6)];
|
||||
Td = Tb - Tc;
|
||||
Ty = Tb + Tc;
|
||||
}
|
||||
{
|
||||
E T7, T8, Te, Tf;
|
||||
T7 = R0[WS(rs, 5)];
|
||||
T8 = R1[WS(rs, 1)];
|
||||
T9 = T7 - T8;
|
||||
Tw = T7 + T8;
|
||||
Te = R0[WS(rs, 4)];
|
||||
Tf = R1[0];
|
||||
Tg = Te - Tf;
|
||||
Tz = Te + Tf;
|
||||
}
|
||||
/* Outputs built from the pair differences: odd Ci and odd Cr slots. */
{
|
||||
E Tp, Tr, Tq, Ta, To, Th;
|
||||
Tp = Tn - Tk;
|
||||
Tr = Tg - Td;
|
||||
Tq = T9 - T6;
|
||||
Ci[WS(csi, 1)] = FMA(KP781831482, Tp, KP974927912 * Tq) + (KP433883739 * Tr);
|
||||
Ci[WS(csi, 5)] = FMA(KP433883739, Tq, KP781831482 * Tr) - (KP974927912 * Tp);
|
||||
Ci[WS(csi, 3)] = FMA(KP433883739, Tp, KP974927912 * Tr) - (KP781831482 * Tq);
|
||||
Ta = T6 + T9;
|
||||
To = Tk + Tn;
|
||||
Th = Td + Tg;
|
||||
Cr[WS(csr, 3)] = FMA(KP623489801, Ta, T3) + FNMA(KP222520933, Th, KP900968867 * To);
|
||||
/* Sum of the seven pair differences: (sum of R0 inputs) - (sum of R1 inputs). */
Cr[WS(csr, 7)] = T3 + To + Ta + Th;
|
||||
Cr[WS(csr, 1)] = FMA(KP623489801, To, T3) + FNMA(KP900968867, Th, KP222520933 * Ta);
|
||||
Cr[WS(csr, 5)] = FMA(KP623489801, Th, T3) + FNMA(KP900968867, Ta, KP222520933 * To);
|
||||
}
|
||||
/* Outputs built from the pair sums: even Ci and even Cr slots. */
{
|
||||
E Tu, TA, Tx, TC, TE, TD;
|
||||
Tu = Ts - Tt;
|
||||
TA = Ty - Tz;
|
||||
Tx = Tv - Tw;
|
||||
Ci[WS(csi, 2)] = FMA(KP974927912, Tu, KP433883739 * Tx) + (KP781831482 * TA);
|
||||
Ci[WS(csi, 6)] = FMA(KP974927912, Tx, KP433883739 * TA) - (KP781831482 * Tu);
|
||||
Ci[WS(csi, 4)] = FNMS(KP781831482, Tx, KP974927912 * TA) - (KP433883739 * Tu);
|
||||
TC = Tt + Ts;
|
||||
TE = Tv + Tw;
|
||||
TD = Ty + Tz;
|
||||
Cr[WS(csr, 6)] = FMA(KP623489801, TC, TB) + FNMA(KP900968867, TD, KP222520933 * TE);
|
||||
Cr[WS(csr, 2)] = FMA(KP623489801, TD, TB) + FNMA(KP900968867, TE, KP222520933 * TC);
|
||||
Cr[WS(csr, 4)] = FMA(KP623489801, TE, TB) + FNMA(KP222520933, TD, KP900968867 * TC);
|
||||
/* Sum of the seven pair sums: the total of all 14 inputs. */
Cr[0] = TB + TC + TE + TD;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 14, "r2cf_14", { 38, 12, 24, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_14) (planner *p) { X(kr2c_register) (p, r2cf_14, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
304
fftw-3.3.10/rdft/scalar/r2cf/r2cf_15.c
Normal file
304
fftw-3.3.10/rdft/scalar/r2cf/r2cf_15.c
Normal file
@@ -0,0 +1,304 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cf_15 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 64 FP additions, 35 FP multiplications,
|
||||
* (or, 36 additions, 7 multiplications, 28 fused multiply/add),
|
||||
* 45 stack variables, 8 constants, and 30 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_15: hard-coded real-to-complex FFT of size 15.  This is the variant
 * compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Runs v transforms back to back.  Each pass reads 15 real inputs,
 * R0[WS(rs, k)] for k = 0..7 and R1[WS(rs, k)] for k = 0..6, and stores
 * 15 outputs: Cr[0] and Cr[WS(csr, 1..7)], plus Ci[WS(csi, 1..7)].  No Ci
 * slot at offset 0 is written here.
 *
 * After every pass R0 and R1 advance by ivs, and Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  DK, E, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * declared outside this file.
 */
static void r2cf_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP910592997, +0.910592997310029334643087372129977886038870291);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP823639103, +0.823639103546331925877420039278190003029660514);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; the pointer bumps live in the loop's step expression. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
|
||||
E Ti, TR, TF, TM, TN, T7, Te, Tf, TV, TW, TX, Ts, Tv, TH, Tl;
|
||||
E To, TG, TS, TT, TU;
|
||||
/* First input triple: R0[0], R0[WS(rs, 5)], R1[WS(rs, 2)]; TR is their plain sum. */
{
|
||||
E TD, Tg, Th, TE;
|
||||
TD = R0[0];
|
||||
Tg = R0[WS(rs, 5)];
|
||||
Th = R1[WS(rs, 2)];
|
||||
TE = Th + Tg;
|
||||
Ti = Tg - Th;
|
||||
TR = TD + TE;
|
||||
TF = FNMS(KP500000000, TE, TD);
|
||||
}
|
||||
/* Remaining twelve inputs, loaded and combined into shared intermediates. */
{
|
||||
E Tj, Tq, Tt, Tm, T3, Tk, Ta, Tr, Td, Tu, T6, Tn;
|
||||
Tj = R1[WS(rs, 1)];
|
||||
Tq = R0[WS(rs, 3)];
|
||||
Tt = R1[WS(rs, 4)];
|
||||
Tm = R0[WS(rs, 6)];
|
||||
{
|
||||
E T1, T2, T8, T9;
|
||||
T1 = R0[WS(rs, 4)];
|
||||
T2 = R1[WS(rs, 6)];
|
||||
T3 = T1 - T2;
|
||||
Tk = T1 + T2;
|
||||
T8 = R1[WS(rs, 5)];
|
||||
T9 = R1[0];
|
||||
Ta = T8 - T9;
|
||||
Tr = T8 + T9;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, T4, T5;
|
||||
Tb = R0[WS(rs, 7)];
|
||||
Tc = R0[WS(rs, 2)];
|
||||
Td = Tb - Tc;
|
||||
Tu = Tb + Tc;
|
||||
T4 = R0[WS(rs, 1)];
|
||||
T5 = R1[WS(rs, 3)];
|
||||
T6 = T4 - T5;
|
||||
Tn = T4 + T5;
|
||||
}
|
||||
TM = T6 - T3;
|
||||
TN = Td - Ta;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 + Te;
|
||||
TV = Tq + Tr;
|
||||
TW = Tt + Tu;
|
||||
TX = TV + TW;
|
||||
Ts = FNMS(KP500000000, Tr, Tq);
|
||||
Tv = FNMS(KP500000000, Tu, Tt);
|
||||
TH = Ts + Tv;
|
||||
Tl = FNMS(KP500000000, Tk, Tj);
|
||||
To = FNMS(KP500000000, Tn, Tm);
|
||||
TG = Tl + To;
|
||||
TS = Tj + Tk;
|
||||
TT = Tm + Tn;
|
||||
TU = TS + TT;
|
||||
}
|
||||
Ci[WS(csi, 5)] = KP866025403 * (Tf - Ti);
|
||||
{
|
||||
E TK, TQ, TO, TI, TJ, TP, TL;
|
||||
TK = TG - TH;
|
||||
TQ = FNMS(KP618033988, TM, TN);
|
||||
TO = FMA(KP618033988, TN, TM);
|
||||
TI = TG + TH;
|
||||
TJ = FNMS(KP250000000, TI, TF);
|
||||
Cr[WS(csr, 5)] = TF + TI;
|
||||
TP = FNMS(KP559016994, TK, TJ);
|
||||
Cr[WS(csr, 2)] = FMA(KP823639103, TQ, TP);
|
||||
Cr[WS(csr, 7)] = FNMS(KP823639103, TQ, TP);
|
||||
TL = FMA(KP559016994, TK, TJ);
|
||||
Cr[WS(csr, 1)] = FMA(KP823639103, TO, TL);
|
||||
Cr[WS(csr, 4)] = FNMS(KP823639103, TO, TL);
|
||||
}
|
||||
{
|
||||
E T11, T12, T10, TY, TZ;
|
||||
T11 = TW - TV;
|
||||
T12 = TS - TT;
|
||||
Ci[WS(csi, 3)] = KP951056516 * (FMA(KP618033988, T12, T11));
|
||||
Ci[WS(csi, 6)] = -(KP951056516 * (FNMS(KP618033988, T11, T12)));
|
||||
T10 = TU - TX;
|
||||
TY = TU + TX;
|
||||
TZ = FNMS(KP250000000, TY, TR);
|
||||
Cr[WS(csr, 3)] = FNMS(KP559016994, T10, TZ);
|
||||
/* TR covers three inputs and TY the other twelve: the total of all 15 inputs. */
Cr[0] = TR + TY;
|
||||
Cr[WS(csr, 6)] = FMA(KP559016994, T10, TZ);
|
||||
{
|
||||
E Tx, TB, TA, TC;
|
||||
{
|
||||
E Tp, Tw, Ty, Tz;
|
||||
Tp = Tl - To;
|
||||
Tw = Ts - Tv;
|
||||
Tx = FMA(KP618033988, Tw, Tp);
|
||||
TB = FNMS(KP618033988, Tp, Tw);
|
||||
Ty = FMA(KP250000000, Tf, Ti);
|
||||
Tz = Te - T7;
|
||||
TA = FMA(KP559016994, Tz, Ty);
|
||||
TC = FNMS(KP559016994, Tz, Ty);
|
||||
}
|
||||
Ci[WS(csi, 1)] = -(KP951056516 * (FNMS(KP910592997, TA, Tx)));
|
||||
Ci[WS(csi, 7)] = KP951056516 * (FMA(KP910592997, TC, TB));
|
||||
Ci[WS(csi, 4)] = KP951056516 * (FMA(KP910592997, TA, Tx));
|
||||
Ci[WS(csi, 2)] = KP951056516 * (FNMS(KP910592997, TC, TB));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 15, "r2cf_15", { 36, 7, 28, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_15) (planner *p) { X(kr2c_register) (p, r2cf_15, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 15 -name r2cf_15 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 64 FP additions, 25 FP multiplications,
|
||||
* (or, 50 additions, 11 multiplications, 14 fused multiply/add),
|
||||
* 47 stack variables, 10 constants, and 30 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_15: hard-coded real-to-complex FFT of size 15.  This is the variant
 * in the #else branch, compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Runs v transforms back to back.  Each pass reads 15 real inputs,
 * R0[WS(rs, k)] for k = 0..7 and R1[WS(rs, k)] for k = 0..6, and stores
 * 15 outputs: Cr[0] and Cr[WS(csr, 1..7)], plus Ci[WS(csi, 1..7)].  No Ci
 * slot at offset 0 is written here.
 *
 * After every pass R0 and R1 advance by ivs, and Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  DK, E, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * declared outside this file.
 */
static void r2cf_15(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP484122918, +0.484122918275927110647408174972799951354115213);
|
||||
DK(KP216506350, +0.216506350946109661690930792688234045867850657);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP509036960, +0.509036960455127183450980863393907648510733164);
|
||||
DK(KP823639103, +0.823639103546331925877420039278190003029660514);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; the pointer bumps live in the loop's step expression. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(60, rs), MAKE_VOLATILE_STRIDE(60, csr), MAKE_VOLATILE_STRIDE(60, csi)) {
|
||||
E Ti, TR, TL, TD, TE, T7, Te, Tf, TV, TW, TX, Tv, Ty, TH, To;
|
||||
E Tr, TG, TS, TT, TU;
|
||||
/* First input triple: R0[0], R0[WS(rs, 5)], R1[WS(rs, 2)]; TR is their plain sum. */
{
|
||||
E TJ, Tg, Th, TK;
|
||||
TJ = R0[0];
|
||||
Tg = R0[WS(rs, 5)];
|
||||
Th = R1[WS(rs, 2)];
|
||||
TK = Th + Tg;
|
||||
Ti = Tg - Th;
|
||||
TR = TJ + TK;
|
||||
TL = FNMS(KP500000000, TK, TJ);
|
||||
}
|
||||
/* Remaining twelve inputs, loaded and combined into shared intermediates. */
{
|
||||
E Tm, Tt, Tw, Tp, T3, Tx, Ta, Tn, Td, Tq, T6, Tu;
|
||||
Tm = R1[WS(rs, 1)];
|
||||
Tt = R0[WS(rs, 3)];
|
||||
Tw = R1[WS(rs, 4)];
|
||||
Tp = R0[WS(rs, 6)];
|
||||
{
|
||||
E T1, T2, T8, T9;
|
||||
T1 = R0[WS(rs, 7)];
|
||||
T2 = R0[WS(rs, 2)];
|
||||
T3 = T1 - T2;
|
||||
Tx = T1 + T2;
|
||||
T8 = R1[WS(rs, 6)];
|
||||
T9 = R0[WS(rs, 4)];
|
||||
Ta = T8 - T9;
|
||||
Tn = T9 + T8;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, T4, T5;
|
||||
Tb = R1[WS(rs, 3)];
|
||||
Tc = R0[WS(rs, 1)];
|
||||
Td = Tb - Tc;
|
||||
Tq = Tc + Tb;
|
||||
T4 = R1[0];
|
||||
T5 = R1[WS(rs, 5)];
|
||||
T6 = T4 - T5;
|
||||
Tu = T5 + T4;
|
||||
}
|
||||
TD = Ta - Td;
|
||||
TE = T6 + T3;
|
||||
T7 = T3 - T6;
|
||||
Te = Ta + Td;
|
||||
Tf = T7 - Te;
|
||||
TV = Tt + Tu;
|
||||
TW = Tw + Tx;
|
||||
TX = TV + TW;
|
||||
Tv = FNMS(KP500000000, Tu, Tt);
|
||||
Ty = FNMS(KP500000000, Tx, Tw);
|
||||
TH = Tv + Ty;
|
||||
To = FNMS(KP500000000, Tn, Tm);
|
||||
Tr = FNMS(KP500000000, Tq, Tp);
|
||||
TG = To + Tr;
|
||||
TS = Tm + Tn;
|
||||
TT = Tp + Tq;
|
||||
TU = TS + TT;
|
||||
}
|
||||
Ci[WS(csi, 5)] = KP866025403 * (Tf - Ti);
|
||||
{
|
||||
E TF, TP, TI, TM, TN, TQ, TO;
|
||||
TF = FMA(KP823639103, TD, KP509036960 * TE);
|
||||
TP = FNMS(KP509036960, TD, KP823639103 * TE);
|
||||
TI = KP559016994 * (TG - TH);
|
||||
TM = TG + TH;
|
||||
TN = FNMS(KP250000000, TM, TL);
|
||||
Cr[WS(csr, 5)] = TL + TM;
|
||||
TQ = TN - TI;
|
||||
Cr[WS(csr, 2)] = TP + TQ;
|
||||
Cr[WS(csr, 7)] = TQ - TP;
|
||||
TO = TI + TN;
|
||||
Cr[WS(csr, 1)] = TF + TO;
|
||||
Cr[WS(csr, 4)] = TO - TF;
|
||||
}
|
||||
{
|
||||
E T11, T12, T10, TY, TZ;
|
||||
T11 = TS - TT;
|
||||
T12 = TW - TV;
|
||||
Ci[WS(csi, 3)] = FMA(KP587785252, T11, KP951056516 * T12);
|
||||
Ci[WS(csi, 6)] = FNMS(KP951056516, T11, KP587785252 * T12);
|
||||
T10 = KP559016994 * (TU - TX);
|
||||
TY = TU + TX;
|
||||
TZ = FNMS(KP250000000, TY, TR);
|
||||
Cr[WS(csr, 3)] = TZ - T10;
|
||||
/* TR covers three inputs and TY the other twelve: the total of all 15 inputs. */
Cr[0] = TR + TY;
|
||||
Cr[WS(csr, 6)] = T10 + TZ;
|
||||
{
|
||||
E Tl, TB, TA, TC;
|
||||
{
|
||||
E Tj, Tk, Ts, Tz;
|
||||
Tj = FMA(KP866025403, Ti, KP216506350 * Tf);
|
||||
Tk = KP484122918 * (Te + T7);
|
||||
Tl = Tj + Tk;
|
||||
TB = Tk - Tj;
|
||||
Ts = To - Tr;
|
||||
Tz = Tv - Ty;
|
||||
TA = FMA(KP951056516, Ts, KP587785252 * Tz);
|
||||
TC = FNMS(KP587785252, Ts, KP951056516 * Tz);
|
||||
}
|
||||
Ci[WS(csi, 1)] = Tl - TA;
|
||||
Ci[WS(csi, 7)] = TC - TB;
|
||||
Ci[WS(csi, 4)] = Tl + TA;
|
||||
Ci[WS(csi, 2)] = TB + TC;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 15, "r2cf_15", { 50, 11, 14, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_15) (planner *p) { X(kr2c_register) (p, r2cf_15, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
288
fftw-3.3.10/rdft/scalar/r2cf/r2cf_16.c
Normal file
288
fftw-3.3.10/rdft/scalar/r2cf/r2cf_16.c
Normal file
@@ -0,0 +1,288 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cf_16 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 58 FP additions, 20 FP multiplications,
|
||||
* (or, 38 additions, 0 multiplications, 20 fused multiply/add),
|
||||
* 34 stack variables, 3 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_16: hard-coded real-to-complex FFT of size 16.  This is the variant
 * compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Runs v transforms back to back.  Each pass reads 16 real inputs,
 * R0[WS(rs, k)] and R1[WS(rs, k)] for k = 0..7, and stores 16 outputs:
 * Cr[0] and Cr[WS(csr, 1..8)], plus Ci[WS(csi, 1..7)].  No Ci slot at
 * offset 0 or 8 is written here.
 *
 * After every pass R0 and R1 advance by ivs, and Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  DK, E, WS, FMA, FNMS, FMS and
 * MAKE_VOLATILE_STRIDE are declared outside this file.
 */
static void r2cf_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; the pointer bumps live in the loop's step expression. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
|
||||
E T3, T6, T7, TN, TB, Ta, Td, Te, TO, TE, Tm, TT, Ty, TI, Tt;
|
||||
E TS, Tz, TL, TC, TD, TR, TU;
|
||||
/* R0 inputs at offsets 0, 4, 2, 6. */
{
|
||||
E T1, T2, T4, T5;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 4)];
|
||||
T3 = T1 + T2;
|
||||
T4 = R0[WS(rs, 2)];
|
||||
T5 = R0[WS(rs, 6)];
|
||||
T6 = T4 + T5;
|
||||
T7 = T3 + T6;
|
||||
TN = T4 - T5;
|
||||
TB = T1 - T2;
|
||||
}
|
||||
/* R0 inputs at offsets 1, 5, 7, 3. */
{
|
||||
E T8, T9, Tb, Tc;
|
||||
T8 = R0[WS(rs, 1)];
|
||||
T9 = R0[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
TC = T8 - T9;
|
||||
Tb = R0[WS(rs, 7)];
|
||||
Tc = R0[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
TD = Tb - Tc;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
TO = TD - TC;
|
||||
TE = TC + TD;
|
||||
/* R1 inputs at offsets 0, 4, 2, 6. */
{
|
||||
E Ti, TG, Tl, TH;
|
||||
{
|
||||
E Tg, Th, Tj, Tk;
|
||||
Tg = R1[0];
|
||||
Th = R1[WS(rs, 4)];
|
||||
Ti = Tg + Th;
|
||||
TG = Tg - Th;
|
||||
Tj = R1[WS(rs, 2)];
|
||||
Tk = R1[WS(rs, 6)];
|
||||
Tl = Tj + Tk;
|
||||
TH = Tj - Tk;
|
||||
}
|
||||
Tm = Ti - Tl;
|
||||
TT = FMA(KP414213562, TG, TH);
|
||||
Ty = Ti + Tl;
|
||||
TI = FNMS(KP414213562, TH, TG);
|
||||
}
|
||||
/* R1 inputs at offsets 7, 3, 1, 5. */
{
|
||||
E Tp, TJ, Ts, TK;
|
||||
{
|
||||
E Tn, To, Tq, Tr;
|
||||
Tn = R1[WS(rs, 7)];
|
||||
To = R1[WS(rs, 3)];
|
||||
Tp = Tn + To;
|
||||
TJ = Tn - To;
|
||||
Tq = R1[WS(rs, 1)];
|
||||
Tr = R1[WS(rs, 5)];
|
||||
Ts = Tq + Tr;
|
||||
TK = Tr - Tq;
|
||||
}
|
||||
Tt = Tp - Ts;
|
||||
TS = FMA(KP414213562, TJ, TK);
|
||||
Tz = Tp + Ts;
|
||||
TL = FNMS(KP414213562, TK, TJ);
|
||||
}
|
||||
Cr[WS(csr, 4)] = T7 - Te;
|
||||
Ci[WS(csi, 4)] = Tz - Ty;
|
||||
{
|
||||
E Tf, Tu, Tv, Tw;
|
||||
Tf = T3 - T6;
|
||||
Tu = Tm + Tt;
|
||||
Cr[WS(csr, 6)] = FNMS(KP707106781, Tu, Tf);
|
||||
Cr[WS(csr, 2)] = FMA(KP707106781, Tu, Tf);
|
||||
Tv = Td - Ta;
|
||||
Tw = Tt - Tm;
|
||||
Ci[WS(csi, 2)] = FMA(KP707106781, Tw, Tv);
|
||||
Ci[WS(csi, 6)] = FMS(KP707106781, Tw, Tv);
|
||||
}
|
||||
{
|
||||
E Tx, TA, TF, TM;
|
||||
/* Tx is the sum of the eight R0 inputs, TA the sum of the eight R1 inputs. */
Tx = T7 + Te;
|
||||
TA = Ty + Tz;
|
||||
Cr[WS(csr, 8)] = Tx - TA;
|
||||
/* Total of all 16 inputs. */
Cr[0] = Tx + TA;
|
||||
TF = FMA(KP707106781, TE, TB);
|
||||
TM = TI + TL;
|
||||
Cr[WS(csr, 7)] = FNMS(KP923879532, TM, TF);
|
||||
Cr[WS(csr, 1)] = FMA(KP923879532, TM, TF);
|
||||
}
|
||||
TR = FNMS(KP707106781, TO, TN);
|
||||
TU = TS - TT;
|
||||
Ci[WS(csi, 1)] = FMS(KP923879532, TU, TR);
|
||||
Ci[WS(csi, 7)] = FMA(KP923879532, TU, TR);
|
||||
{
|
||||
E TV, TW, TP, TQ;
|
||||
TV = FNMS(KP707106781, TE, TB);
|
||||
TW = TT + TS;
|
||||
Cr[WS(csr, 5)] = FNMS(KP923879532, TW, TV);
|
||||
Cr[WS(csr, 3)] = FMA(KP923879532, TW, TV);
|
||||
TP = FMA(KP707106781, TO, TN);
|
||||
TQ = TL - TI;
|
||||
Ci[WS(csi, 3)] = FMA(KP923879532, TQ, TP);
|
||||
Ci[WS(csi, 5)] = FMS(KP923879532, TQ, TP);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 16, "r2cf_16", { 38, 0, 20, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_16) (planner *p) { X(kr2c_register) (p, r2cf_16, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 16 -name r2cf_16 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 58 FP additions, 12 FP multiplications,
|
||||
* (or, 54 additions, 8 multiplications, 4 fused multiply/add),
|
||||
* 34 stack variables, 3 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_16: hard-coded real-to-complex FFT of size 16.  This is the variant
 * in the #else branch, compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Runs v transforms back to back.  Each pass reads 16 real inputs,
 * R0[WS(rs, k)] and R1[WS(rs, k)] for k = 0..7, and stores 16 outputs:
 * Cr[0] and Cr[WS(csr, 1..8)], plus Ci[WS(csi, 1..7)].  No Ci slot at
 * offset 0 or 8 is written here.
 *
 * After every pass R0 and R1 advance by ivs, and Cr and Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably hold the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  DK, E, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE are
 * declared outside this file.
 */
static void r2cf_16(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per iteration; the pointer bumps live in the loop's step expression. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(64, rs), MAKE_VOLATILE_STRIDE(64, csr), MAKE_VOLATILE_STRIDE(64, csi)) {
|
||||
E T3, T6, T7, Tz, Ti, Ta, Td, Te, TA, Th, Tq, TV, TF, TP, Tx;
|
||||
E TU, TE, TM, Tg, Tf, TJ, TQ;
|
||||
/* R0 inputs at offsets 0, 4, 2, 6. */
{
|
||||
E T1, T2, T4, T5;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 4)];
|
||||
T3 = T1 + T2;
|
||||
T4 = R0[WS(rs, 2)];
|
||||
T5 = R0[WS(rs, 6)];
|
||||
T6 = T4 + T5;
|
||||
T7 = T3 + T6;
|
||||
Tz = T1 - T2;
|
||||
Ti = T4 - T5;
|
||||
}
|
||||
/* R0 inputs at offsets 1, 5, 7, 3. */
{
|
||||
E T8, T9, Tb, Tc;
|
||||
T8 = R0[WS(rs, 1)];
|
||||
T9 = R0[WS(rs, 5)];
|
||||
Ta = T8 + T9;
|
||||
Tg = T8 - T9;
|
||||
Tb = R0[WS(rs, 7)];
|
||||
Tc = R0[WS(rs, 3)];
|
||||
Td = Tb + Tc;
|
||||
Tf = Tb - Tc;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
TA = KP707106781 * (Tg + Tf);
|
||||
Th = KP707106781 * (Tf - Tg);
|
||||
/* R1 inputs at offsets 7, 3, 1, 5. */
{
|
||||
E Tm, TN, Tp, TO;
|
||||
{
|
||||
E Tk, Tl, Tn, To;
|
||||
Tk = R1[WS(rs, 7)];
|
||||
Tl = R1[WS(rs, 3)];
|
||||
Tm = Tk - Tl;
|
||||
TN = Tk + Tl;
|
||||
Tn = R1[WS(rs, 1)];
|
||||
To = R1[WS(rs, 5)];
|
||||
Tp = Tn - To;
|
||||
TO = Tn + To;
|
||||
}
|
||||
Tq = FNMS(KP923879532, Tp, KP382683432 * Tm);
|
||||
TV = TN + TO;
|
||||
TF = FMA(KP923879532, Tm, KP382683432 * Tp);
|
||||
TP = TN - TO;
|
||||
}
|
||||
/* R1 inputs at offsets 0, 4, 2, 6. */
{
|
||||
E Tt, TK, Tw, TL;
|
||||
{
|
||||
E Tr, Ts, Tu, Tv;
|
||||
Tr = R1[0];
|
||||
Ts = R1[WS(rs, 4)];
|
||||
Tt = Tr - Ts;
|
||||
TK = Tr + Ts;
|
||||
Tu = R1[WS(rs, 2)];
|
||||
Tv = R1[WS(rs, 6)];
|
||||
Tw = Tu - Tv;
|
||||
TL = Tu + Tv;
|
||||
}
|
||||
Tx = FMA(KP382683432, Tt, KP923879532 * Tw);
|
||||
TU = TK + TL;
|
||||
TE = FNMS(KP382683432, Tw, KP923879532 * Tt);
|
||||
TM = TK - TL;
|
||||
}
|
||||
Cr[WS(csr, 4)] = T7 - Te;
|
||||
Ci[WS(csi, 4)] = TV - TU;
|
||||
{
|
||||
E Tj, Ty, TD, TG;
|
||||
Tj = Th - Ti;
|
||||
Ty = Tq - Tx;
|
||||
Ci[WS(csi, 1)] = Tj + Ty;
|
||||
Ci[WS(csi, 7)] = Ty - Tj;
|
||||
TD = Tz + TA;
|
||||
TG = TE + TF;
|
||||
Cr[WS(csr, 7)] = TD - TG;
|
||||
Cr[WS(csr, 1)] = TD + TG;
|
||||
}
|
||||
{
|
||||
E TB, TC, TH, TI;
|
||||
TB = Tz - TA;
|
||||
TC = Tx + Tq;
|
||||
Cr[WS(csr, 5)] = TB - TC;
|
||||
Cr[WS(csr, 3)] = TB + TC;
|
||||
TH = Ti + Th;
|
||||
TI = TF - TE;
|
||||
Ci[WS(csi, 3)] = TH + TI;
|
||||
Ci[WS(csi, 5)] = TI - TH;
|
||||
}
|
||||
TJ = T3 - T6;
|
||||
TQ = KP707106781 * (TM + TP);
|
||||
Cr[WS(csr, 6)] = TJ - TQ;
|
||||
Cr[WS(csr, 2)] = TJ + TQ;
|
||||
{
|
||||
E TR, TS, TT, TW;
|
||||
TR = Td - Ta;
|
||||
TS = KP707106781 * (TP - TM);
|
||||
Ci[WS(csi, 2)] = TR + TS;
|
||||
Ci[WS(csi, 6)] = TS - TR;
|
||||
/* TT is the sum of the eight R0 inputs, TW the sum of the eight R1 inputs. */
TT = T7 + Te;
|
||||
TW = TU + TV;
|
||||
Cr[WS(csr, 8)] = TT - TW;
|
||||
/* Total of all 16 inputs. */
Cr[0] = TT + TW;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 16, "r2cf_16", { 54, 8, 4, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_16) (planner *p) { X(kr2c_register) (p, r2cf_16, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
86
fftw-3.3.10/rdft/scalar/r2cf/r2cf_2.c
Normal file
86
fftw-3.3.10/rdft/scalar/r2cf/r2cf_2.c
Normal file
@@ -0,0 +1,86 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cf_2 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 2 FP additions, 0 FP multiplications,
|
||||
* (or, 2 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 3 stack variables, 0 constants, and 4 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_2: hard-coded real-to-complex FFT of size 2 (variant compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined; the body has
 * no multiplications, so it matches the other variant).
 *
 * Runs v transforms.  Each pass loads R0[0] and R1[0], then stores their
 * difference to Cr[WS(csr, 1)] and their sum to Cr[0].  Ci is stepped
 * along with the other pointers but is never stored to.
 *
 * After every pass R0 and R1 advance by ivs, and Cr and Ci advance by ovs.
 */
static void r2cf_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0; --remaining, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
          /* Load both inputs before storing anything. */
          const E first = R0[0];
          const E second = R1[0];

          Cr[WS(csr, 1)] = first - second;
          Cr[0] = first + second;
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 2, "r2cf_2", { 2, 0, 0, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_2) (planner *p) { X(kr2c_register) (p, r2cf_2, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 2 -name r2cf_2 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 2 FP additions, 0 FP multiplications,
|
||||
* (or, 2 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 3 stack variables, 0 constants, and 4 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_2 (non-FMA build): hard-coded size-2 real-to-complex forward
 * kernel.
 *
 * Each of the v transforms reads one value from R0 and one from R1,
 * writes their sum to Cr[0] and their difference to Cr[WS(csr, 1)],
 * then steps the inputs by ivs and the outputs by ovs.  Ci is stepped
 * but never stored to.
 */
static void r2cf_2(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT count;

     for (count = 0; count < v; count = count + 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(8, csr), MAKE_VOLATILE_STRIDE(8, csi)) {
          /* Load first, store afterwards: keeps the generated code's
             read-before-write order. */
          E a = R0[0];
          E b = R1[0];

          Cr[WS(csr, 1)] = a - b;
          Cr[0] = a + b;
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 2, "r2cf_2", { 2, 0, 0, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_2) (planner *p) { X(kr2c_register) (p, r2cf_2, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
362
fftw-3.3.10/rdft/scalar/r2cf/r2cf_20.c
Normal file
362
fftw-3.3.10/rdft/scalar/r2cf/r2cf_20.c
Normal file
@@ -0,0 +1,362 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:11 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 86 FP additions, 32 FP multiplications,
|
||||
* (or, 58 additions, 4 multiplications, 28 fused multiply/add),
|
||||
* 51 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_20 (FMA variant): hard-coded size-20 real-to-complex forward
 * kernel emitted by genfft (gen_r2cf.native -fma -n 20).
 *
 * Per transform it reads 20 reals -- R0[0], R0[WS(rs, 1..9)], R1[0],
 * R1[WS(rs, 1..9)] -- and stores 20 reals: Cr[0], Cr[WS(csr, 1..10)]
 * and Ci[WS(csi, 1..9)].  Ci[0] and Ci[WS(csi, 10)] are never written
 * by this function.
 *
 * v is the number of transforms; after each one R0/R1 advance by ivs
 * and Cr/Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably carry the even-/odd-indexed input
 * samples and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  FMA/FNMS/FMS are macros defined outside this
 * file (presumably a*b+c, c-a*b and a*b-c) -- confirm.
 */
static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants: 0.559016994... = sqrt(5)/4, 0.618033988... =
   (sqrt(5)-1)/2, 0.951056516... = sin(2*pi/5). */
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
{
|
||||
INT i;
|
||||
/* One pass per transform; the increment clause advances all four
   data pointers.  MAKE_VOLATILE_STRIDE is defined elsewhere --
   presumably an optimizer hint that does not alter results; confirm. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
|
||||
E T3, T1d, TJ, TV, T16, T1k, T1l, T19, Ta, Th, Ti, T1e, T1f, T1g, TP;
|
||||
E TQ, TX, Tn, Ts, TK, TS, TT, TW, Ty, TD, TL;
|
||||
/* Stage 1a: combine R0[0], R0[5], R1[2], R1[7] into sums and
   differences (T3, T1d, TJ, TV). */
{
|
||||
E T1, T2, TF, TG, TH, TI;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 5)];
|
||||
TF = T1 + T2;
|
||||
TG = R1[WS(rs, 2)];
|
||||
TH = R1[WS(rs, 7)];
|
||||
TI = TG + TH;
|
||||
T3 = T1 - T2;
|
||||
T1d = TG - TH;
|
||||
TJ = TF - TI;
|
||||
TV = TF + TI;
|
||||
}
|
||||
/* Stage 1b: load the remaining 16 inputs pairwise, form the sum and
   difference of every pair, then combine those into the intermediates
   consumed by the output stages below. */
{
|
||||
E T6, To, Tx, T17, TC, T18, T9, Tj, Td, Tu, Tm, T15, Tr, T14, Tg;
|
||||
E Tz;
|
||||
{
|
||||
E T4, T5, Tv, Tw;
|
||||
T4 = R0[WS(rs, 2)];
|
||||
T5 = R0[WS(rs, 7)];
|
||||
T6 = T4 - T5;
|
||||
To = T4 + T5;
|
||||
Tv = R1[WS(rs, 6)];
|
||||
Tw = R1[WS(rs, 1)];
|
||||
Tx = Tv + Tw;
|
||||
T17 = Tw - Tv;
|
||||
}
|
||||
{
|
||||
E TA, TB, T7, T8;
|
||||
TA = R1[WS(rs, 8)];
|
||||
TB = R1[WS(rs, 3)];
|
||||
TC = TA + TB;
|
||||
T18 = TB - TA;
|
||||
T7 = R0[WS(rs, 8)];
|
||||
T8 = R0[WS(rs, 3)];
|
||||
T9 = T7 - T8;
|
||||
Tj = T7 + T8;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Tk, Tl;
|
||||
Tb = R0[WS(rs, 4)];
|
||||
Tc = R0[WS(rs, 9)];
|
||||
Td = Tb - Tc;
|
||||
Tu = Tb + Tc;
|
||||
Tk = R1[0];
|
||||
Tl = R1[WS(rs, 5)];
|
||||
Tm = Tk + Tl;
|
||||
T15 = Tl - Tk;
|
||||
}
|
||||
{
|
||||
E Tp, Tq, Te, Tf;
|
||||
Tp = R1[WS(rs, 4)];
|
||||
Tq = R1[WS(rs, 9)];
|
||||
Tr = Tp + Tq;
|
||||
T14 = Tq - Tp;
|
||||
Te = R0[WS(rs, 6)];
|
||||
Tf = R0[WS(rs, 1)];
|
||||
Tg = Te - Tf;
|
||||
Tz = Te + Tf;
|
||||
}
|
||||
T16 = T14 - T15;
|
||||
T1k = T6 - T9;
|
||||
T1l = Td - Tg;
|
||||
T19 = T17 - T18;
|
||||
Ta = T6 + T9;
|
||||
Th = Td + Tg;
|
||||
Ti = Ta + Th;
|
||||
T1e = T14 + T15;
|
||||
T1f = T17 + T18;
|
||||
T1g = T1e + T1f;
|
||||
TP = Tu + Tx;
|
||||
TQ = Tz + TC;
|
||||
TX = TP + TQ;
|
||||
Tn = Tj - Tm;
|
||||
Ts = To - Tr;
|
||||
TK = Ts + Tn;
|
||||
TS = To + Tr;
|
||||
TT = Tj + Tm;
|
||||
TW = TS + TT;
|
||||
Ty = Tu - Tx;
|
||||
TD = Tz - TC;
|
||||
TL = Ty + TD;
|
||||
}
|
||||
/* Outputs with index 5 need only additions. */
Cr[WS(csr, 5)] = T3 + Ti;
|
||||
Ci[WS(csi, 5)] = T1g - T1d;
|
||||
/* Stores Ci[6], Ci[2], Ci[8], Ci[4]. */
{
|
||||
E Tt, TE, TR, TU;
|
||||
Tt = Tn - Ts;
|
||||
TE = Ty - TD;
|
||||
Ci[WS(csi, 6)] = KP951056516 * (FNMS(KP618033988, TE, Tt));
|
||||
Ci[WS(csi, 2)] = KP951056516 * (FMA(KP618033988, Tt, TE));
|
||||
TR = TP - TQ;
|
||||
TU = TS - TT;
|
||||
Ci[WS(csi, 8)] = -(KP951056516 * (FNMS(KP618033988, TU, TR)));
|
||||
Ci[WS(csi, 4)] = KP951056516 * (FMA(KP618033988, TR, TU));
|
||||
}
|
||||
/* Stores Cr[4], Cr[0], Cr[8], then Cr[2], Cr[10], Cr[6]. */
{
|
||||
E T10, TY, TZ, TO, TM, TN;
|
||||
T10 = TW - TX;
|
||||
TY = TW + TX;
|
||||
TZ = FNMS(KP250000000, TY, TV);
|
||||
Cr[WS(csr, 4)] = FMA(KP559016994, T10, TZ);
|
||||
Cr[0] = TV + TY;
|
||||
Cr[WS(csr, 8)] = FNMS(KP559016994, T10, TZ);
|
||||
TO = TK - TL;
|
||||
TM = TK + TL;
|
||||
TN = FNMS(KP250000000, TM, TJ);
|
||||
Cr[WS(csr, 2)] = FNMS(KP559016994, TO, TN);
|
||||
Cr[WS(csr, 10)] = TJ + TM;
|
||||
Cr[WS(csr, 6)] = FMA(KP559016994, TO, TN);
|
||||
}
|
||||
/* Stores Cr[9], Cr[7], Cr[1], Cr[3]. */
{
|
||||
E T1a, T1c, T13, T1b, T11, T12;
|
||||
T1a = FMA(KP618033988, T19, T16);
|
||||
T1c = FNMS(KP618033988, T16, T19);
|
||||
T11 = FNMS(KP250000000, Ti, T3);
|
||||
T12 = Ta - Th;
|
||||
T13 = FMA(KP559016994, T12, T11);
|
||||
T1b = FNMS(KP559016994, T12, T11);
|
||||
Cr[WS(csr, 9)] = FNMS(KP951056516, T1a, T13);
|
||||
Cr[WS(csr, 7)] = FMA(KP951056516, T1c, T1b);
|
||||
Cr[WS(csr, 1)] = FMA(KP951056516, T1a, T13);
|
||||
Cr[WS(csr, 3)] = FNMS(KP951056516, T1c, T1b);
|
||||
}
|
||||
/* Stores Ci[1], Ci[7], Ci[9], Ci[3]. */
{
|
||||
E T1m, T1o, T1j, T1n, T1h, T1i;
|
||||
T1m = FMA(KP618033988, T1l, T1k);
|
||||
T1o = FNMS(KP618033988, T1k, T1l);
|
||||
T1h = FMA(KP250000000, T1g, T1d);
|
||||
T1i = T1e - T1f;
|
||||
T1j = FNMS(KP559016994, T1i, T1h);
|
||||
T1n = FMA(KP559016994, T1i, T1h);
|
||||
Ci[WS(csi, 1)] = -(FMA(KP951056516, T1m, T1j));
|
||||
Ci[WS(csi, 7)] = FMA(KP951056516, T1o, T1n);
|
||||
Ci[WS(csi, 9)] = FMS(KP951056516, T1m, T1j);
|
||||
Ci[WS(csi, 3)] = FNMS(KP951056516, T1o, T1n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 20, "r2cf_20", { 58, 4, 28, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_20) (planner *p) { X(kr2c_register) (p, r2cf_20, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 20 -name r2cf_20 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 86 FP additions, 24 FP multiplications,
|
||||
* (or, 74 additions, 12 multiplications, 12 fused multiply/add),
|
||||
* 51 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_20 (non-FMA variant): hard-coded size-20 real-to-complex
 * forward kernel emitted by genfft (gen_r2cf.native -n 20, no -fma).
 *
 * Per transform it reads 20 reals -- R0[0], R0[WS(rs, 1..9)], R1[0],
 * R1[WS(rs, 1..9)] -- and stores 20 reals: Cr[0], Cr[WS(csr, 1..10)]
 * and Ci[WS(csi, 1..9)].  Ci[0] and Ci[WS(csi, 10)] are never written
 * by this function.
 *
 * v is the number of transforms; after each one R0/R1 advance by ivs
 * and Cr/Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably carry the even-/odd-indexed input
 * samples and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  FMA/FNMS are macros defined outside this file
 * (presumably a*b+c and c-a*b) -- confirm.
 */
static void r2cf_20(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants: 0.559016994... = sqrt(5)/4, 0.587785252... =
   sin(pi/5), 0.951056516... = sin(2*pi/5). */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
{
|
||||
INT i;
|
||||
/* One pass per transform; the increment clause advances all four
   data pointers.  MAKE_VOLATILE_STRIDE is defined elsewhere --
   presumably an optimizer hint that does not alter results; confirm. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(80, rs), MAKE_VOLATILE_STRIDE(80, csr), MAKE_VOLATILE_STRIDE(80, csi)) {
|
||||
E T3, T1m, TF, T17, Ts, TM, TN, Tz, Ta, Th, Ti, T1g, T1h, T1k, T10;
|
||||
E T13, T19, TG, TH, TI, T1d, T1e, T1j, TT, TW, T18;
|
||||
/* Stage 1a: combine R0[0], R0[5], R1[7], R1[2] into sums and
   differences (T3, T1m, TF, T17). */
{
|
||||
E T1, T2, T15, TD, TE, T16;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 5)];
|
||||
T15 = T1 + T2;
|
||||
TD = R1[WS(rs, 7)];
|
||||
TE = R1[WS(rs, 2)];
|
||||
T16 = TE + TD;
|
||||
T3 = T1 - T2;
|
||||
T1m = T15 + T16;
|
||||
TF = TD - TE;
|
||||
T17 = T15 - T16;
|
||||
}
|
||||
/* Stage 1b: load the remaining 16 inputs pairwise, form the sum and
   difference of every pair, then combine those into the intermediates
   consumed by the output stages below. */
{
|
||||
E T6, TU, Tv, T12, Ty, TZ, T9, TR, Td, TY, To, TS, Tr, TV, Tg;
|
||||
E T11;
|
||||
{
|
||||
E T4, T5, Tt, Tu;
|
||||
T4 = R0[WS(rs, 2)];
|
||||
T5 = R0[WS(rs, 7)];
|
||||
T6 = T4 - T5;
|
||||
TU = T4 + T5;
|
||||
Tt = R1[WS(rs, 8)];
|
||||
Tu = R1[WS(rs, 3)];
|
||||
Tv = Tt - Tu;
|
||||
T12 = Tt + Tu;
|
||||
}
|
||||
{
|
||||
E Tw, Tx, T7, T8;
|
||||
Tw = R1[WS(rs, 6)];
|
||||
Tx = R1[WS(rs, 1)];
|
||||
Ty = Tw - Tx;
|
||||
TZ = Tw + Tx;
|
||||
T7 = R0[WS(rs, 8)];
|
||||
T8 = R0[WS(rs, 3)];
|
||||
T9 = T7 - T8;
|
||||
TR = T7 + T8;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, Tm, Tn;
|
||||
Tb = R0[WS(rs, 4)];
|
||||
Tc = R0[WS(rs, 9)];
|
||||
Td = Tb - Tc;
|
||||
TY = Tb + Tc;
|
||||
Tm = R1[0];
|
||||
Tn = R1[WS(rs, 5)];
|
||||
To = Tm - Tn;
|
||||
TS = Tm + Tn;
|
||||
}
|
||||
{
|
||||
E Tp, Tq, Te, Tf;
|
||||
Tp = R1[WS(rs, 4)];
|
||||
Tq = R1[WS(rs, 9)];
|
||||
Tr = Tp - Tq;
|
||||
TV = Tp + Tq;
|
||||
Te = R0[WS(rs, 6)];
|
||||
Tf = R0[WS(rs, 1)];
|
||||
Tg = Te - Tf;
|
||||
T11 = Te + Tf;
|
||||
}
|
||||
Ts = To - Tr;
|
||||
TM = T6 - T9;
|
||||
TN = Td - Tg;
|
||||
Tz = Tv - Ty;
|
||||
Ta = T6 + T9;
|
||||
Th = Td + Tg;
|
||||
Ti = Ta + Th;
|
||||
T1g = TY + TZ;
|
||||
T1h = T11 + T12;
|
||||
T1k = T1g + T1h;
|
||||
T10 = TY - TZ;
|
||||
T13 = T11 - T12;
|
||||
T19 = T10 + T13;
|
||||
TG = Tr + To;
|
||||
TH = Ty + Tv;
|
||||
TI = TG + TH;
|
||||
T1d = TU + TV;
|
||||
T1e = TR + TS;
|
||||
T1j = T1d + T1e;
|
||||
TT = TR - TS;
|
||||
TW = TU - TV;
|
||||
T18 = TW + TT;
|
||||
}
|
||||
/* Outputs with index 5 need only additions. */
Cr[WS(csr, 5)] = T3 + Ti;
|
||||
Ci[WS(csi, 5)] = TF - TI;
|
||||
/* Stores Ci[6], Ci[2], Ci[8], Ci[4]. */
{
|
||||
E TX, T14, T1f, T1i;
|
||||
TX = TT - TW;
|
||||
T14 = T10 - T13;
|
||||
Ci[WS(csi, 6)] = FNMS(KP587785252, T14, KP951056516 * TX);
|
||||
Ci[WS(csi, 2)] = FMA(KP587785252, TX, KP951056516 * T14);
|
||||
T1f = T1d - T1e;
|
||||
T1i = T1g - T1h;
|
||||
Ci[WS(csi, 8)] = FNMS(KP951056516, T1i, KP587785252 * T1f);
|
||||
Ci[WS(csi, 4)] = FMA(KP951056516, T1f, KP587785252 * T1i);
|
||||
}
|
||||
/* Stores Cr[4], Cr[0], Cr[8], then Cr[2], Cr[10], Cr[6]. */
{
|
||||
E T1l, T1n, T1o, T1c, T1a, T1b;
|
||||
T1l = KP559016994 * (T1j - T1k);
|
||||
T1n = T1j + T1k;
|
||||
T1o = FNMS(KP250000000, T1n, T1m);
|
||||
Cr[WS(csr, 4)] = T1l + T1o;
|
||||
Cr[0] = T1m + T1n;
|
||||
Cr[WS(csr, 8)] = T1o - T1l;
|
||||
T1c = KP559016994 * (T18 - T19);
|
||||
T1a = T18 + T19;
|
||||
T1b = FNMS(KP250000000, T1a, T17);
|
||||
Cr[WS(csr, 2)] = T1b - T1c;
|
||||
Cr[WS(csr, 10)] = T17 + T1a;
|
||||
Cr[WS(csr, 6)] = T1c + T1b;
|
||||
}
|
||||
/* Stores Cr[9], Cr[7], Cr[1], Cr[3]. */
{
|
||||
E TA, TC, Tl, TB, Tj, Tk;
|
||||
TA = FMA(KP951056516, Ts, KP587785252 * Tz);
|
||||
TC = FNMS(KP587785252, Ts, KP951056516 * Tz);
|
||||
Tj = KP559016994 * (Ta - Th);
|
||||
Tk = FNMS(KP250000000, Ti, T3);
|
||||
Tl = Tj + Tk;
|
||||
TB = Tk - Tj;
|
||||
Cr[WS(csr, 9)] = Tl - TA;
|
||||
Cr[WS(csr, 7)] = TB + TC;
|
||||
Cr[WS(csr, 1)] = Tl + TA;
|
||||
Cr[WS(csr, 3)] = TB - TC;
|
||||
}
|
||||
/* Stores Ci[1], Ci[7], Ci[9], Ci[3]. */
{
|
||||
E TO, TQ, TL, TP, TJ, TK;
|
||||
TO = FMA(KP951056516, TM, KP587785252 * TN);
|
||||
TQ = FNMS(KP587785252, TM, KP951056516 * TN);
|
||||
TJ = FMA(KP250000000, TI, TF);
|
||||
TK = KP559016994 * (TH - TG);
|
||||
TL = TJ + TK;
|
||||
TP = TK - TJ;
|
||||
Ci[WS(csi, 1)] = TL - TO;
|
||||
Ci[WS(csi, 7)] = TQ + TP;
|
||||
Ci[WS(csi, 9)] = TO + TL;
|
||||
Ci[WS(csi, 3)] = TP - TQ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 20, "r2cf_20", { 74, 12, 12, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_20) (planner *p) { X(kr2c_register) (p, r2cf_20, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
736
fftw-3.3.10/rdft/scalar/r2cf/r2cf_25.c
Normal file
736
fftw-3.3.10/rdft/scalar/r2cf/r2cf_25.c
Normal file
@@ -0,0 +1,736 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:11 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cf_25 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 200 FP additions, 168 FP multiplications,
|
||||
* (or, 44 additions, 12 multiplications, 156 fused multiply/add),
|
||||
* 127 stack variables, 66 constants, and 50 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_25 (FMA variant): hard-coded size-25 real-to-complex forward
 * kernel emitted by genfft (gen_r2cf.native -fma -n 25).
 *
 * Per transform it reads 25 reals -- R0[0], R0[WS(rs, 1..12)], R1[0],
 * R1[WS(rs, 1..11)] -- and stores 25 reals: Cr[0], Cr[WS(csr, 1..12)]
 * and Ci[WS(csi, 1..12)].  Ci[0] is never written by this function.
 *
 * v is the number of transforms; after each one R0/R1 advance by ivs
 * and Cr/Ci advance by ovs.
 *
 * NOTE(review): R0/R1 presumably carry the even-/odd-indexed input
 * samples and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.  FMA/FNMS/FMS are macros defined outside this
 * file (presumably a*b+c, c-a*b and a*b-c) -- confirm.
 */
static void r2cf_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Generator-emitted numeric constants; each name encodes the leading
   digits of its value. */
DK(KP792626838, +0.792626838241819413632131824093538848057784557);
|
||||
DK(KP876091699, +0.876091699473550838204498029706869638173524346);
|
||||
DK(KP809385824, +0.809385824416008241660603814668679683846476688);
|
||||
DK(KP860541664, +0.860541664367944677098261680920518816412804187);
|
||||
DK(KP560319534, +0.560319534973832390111614715371676131169633784);
|
||||
DK(KP681693190, +0.681693190061530575150324149145440022633095390);
|
||||
DK(KP237294955, +0.237294955877110315393888866460840817927895961);
|
||||
DK(KP897376177, +0.897376177523557693138608077137219684419427330);
|
||||
DK(KP997675361, +0.997675361079556513670859573984492383596555031);
|
||||
DK(KP923225144, +0.923225144846402650453449441572664695995209956);
|
||||
DK(KP956723877, +0.956723877038460305821989399535483155872969262);
|
||||
DK(KP949179823, +0.949179823508441261575555465843363271711583843);
|
||||
DK(KP570584518, +0.570584518783621657366766175430996792655723863);
|
||||
DK(KP669429328, +0.669429328479476605641803240971985825917022098);
|
||||
DK(KP262346850, +0.262346850930607871785420028382979691334784273);
|
||||
DK(KP906616052, +0.906616052148196230441134447086066874408359177);
|
||||
DK(KP921078979, +0.921078979742360627699756128143719920817673854);
|
||||
DK(KP845997307, +0.845997307939530944175097360758058292389769300);
|
||||
DK(KP982009705, +0.982009705009746369461829878184175962711969869);
|
||||
DK(KP876306680, +0.876306680043863587308115903922062583399064238);
|
||||
DK(KP559154169, +0.559154169276087864842202529084232643714075927);
|
||||
DK(KP683113946, +0.683113946453479238701949862233725244439656928);
|
||||
DK(KP242145790, +0.242145790282157779872542093866183953459003101);
|
||||
DK(KP968583161, +0.968583161128631119490168375464735813836012403);
|
||||
DK(KP999754674, +0.999754674276473633366203429228112409535557487);
|
||||
DK(KP904508497, +0.904508497187473712051146708591409529430077295);
|
||||
DK(KP904730450, +0.904730450839922351881287709692877908104763647);
|
||||
DK(KP916574801, +0.916574801383451584742370439148878693530976769);
|
||||
DK(KP831864738, +0.831864738706457140726048799369896829771167132);
|
||||
DK(KP829049696, +0.829049696159252993975487806364305442437946767);
|
||||
DK(KP855719849, +0.855719849902058969314654733608091555096772472);
|
||||
DK(KP952936919, +0.952936919628306576880750665357914584765951388);
|
||||
DK(KP998026728, +0.998026728428271561952336806863450553336905220);
|
||||
DK(KP690983005, +0.690983005625052575897706582817180941139845410);
|
||||
DK(KP522616830, +0.522616830205754336872861364785224694908468440);
|
||||
DK(KP772036680, +0.772036680810363904029489473607579825330539880);
|
||||
DK(KP734762448, +0.734762448793050413546343770063151342619912334);
|
||||
DK(KP803003575, +0.803003575438660414833440593570376004635464850);
|
||||
DK(KP999544308, +0.999544308746292983948881682379742149196758193);
|
||||
DK(KP992114701, +0.992114701314477831049793042785778521453036709);
|
||||
DK(KP763932022, +0.763932022500210303590826331268723764559381640);
|
||||
DK(KP894834959, +0.894834959464455102997960030820114611498661386);
|
||||
DK(KP447417479, +0.447417479732227551498980015410057305749330693);
|
||||
DK(KP867381224, +0.867381224396525206773171885031575671309956167);
|
||||
DK(KP958953096, +0.958953096729998668045963838399037225970891871);
|
||||
DK(KP912575812, +0.912575812670962425556968549836277086778922727);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP244189809, +0.244189809627953270309879511234821255780225091);
|
||||
DK(KP522847744, +0.522847744331509716623755382187077770911012542);
|
||||
DK(KP578046249, +0.578046249379945007321754579646815604023525655);
|
||||
DK(KP269969613, +0.269969613759572083574752974412347470060951301);
|
||||
DK(KP667278218, +0.667278218140296670899089292254759909713898805);
|
||||
DK(KP494780565, +0.494780565770515410344588413655324772219443730);
|
||||
DK(KP447533225, +0.447533225982656890041886979663652563063114397);
|
||||
DK(KP603558818, +0.603558818296015001454675132653458027918768137);
|
||||
DK(KP120146378, +0.120146378570687701782758537356596213647956445);
|
||||
DK(KP869845200, +0.869845200362138853122720822420327157933056305);
|
||||
DK(KP786782374, +0.786782374965295178365099601674911834788448471);
|
||||
DK(KP132830569, +0.132830569247582714407653942074819768844536507);
|
||||
DK(KP893101515, +0.893101515366181661711202267938416198338079437);
|
||||
DK(KP066152395, +0.066152395967733048213034281011006031460903353);
|
||||
DK(KP059835404, +0.059835404262124915169548397419498386427871950);
|
||||
DK(KP987388751, +0.987388751065621252324603216482382109400433949);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
{
|
||||
INT i;
|
||||
/* One pass per transform; the increment clause advances all four
   data pointers.  MAKE_VOLATILE_STRIDE is defined elsewhere --
   presumably an optimizer hint that does not alter results; confirm. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
|
||||
E T2p, TJ, T2u, T1O, T2s, T2t, TB, T1c, T26, T2e, T1k, T1r, T1M, T21, T1B;
|
||||
E T9, TX, T29, T2k, T1h, T1v, T1R, T1X, T1z, Ti, TQ, T2a, T2j, T1g, T1u;
|
||||
E T1U, T1Y, T1y, Ts, T15, T27, T2f, T1j, T1s, T1J, T20, T1C, Tj, TC;
|
||||
/* Input group 1: R0[0], R0[5], R1[7], R1[2], R0[10]. */
{
|
||||
E TI, T2r, TF, T2q;
|
||||
T2p = R0[0];
|
||||
{
|
||||
E TG, TH, TD, TE;
|
||||
TG = R0[WS(rs, 5)];
|
||||
TH = R1[WS(rs, 7)];
|
||||
TI = TG - TH;
|
||||
T2r = TG + TH;
|
||||
TD = R1[WS(rs, 2)];
|
||||
TE = R0[WS(rs, 10)];
|
||||
TF = TD - TE;
|
||||
T2q = TD + TE;
|
||||
}
|
||||
TJ = FMA(KP618033988, TI, TF);
|
||||
T2u = T2q - T2r;
|
||||
T1O = FNMS(KP618033988, TF, TI);
|
||||
T2s = T2q + T2r;
|
||||
T2t = FNMS(KP250000000, T2s, T2p);
|
||||
}
|
||||
/* Input group 2: R1[1], R0[4], R1[11], R1[6], R0[9]; yields TB and
   eight scaled combinations of the pair T18/T1L with T1b/T1K. */
{
|
||||
E Tt, TA, T1a, T16, T17;
|
||||
Tt = R1[WS(rs, 1)];
|
||||
{
|
||||
E Tu, Tv, Tw, Tx, Ty, Tz;
|
||||
Tu = R0[WS(rs, 4)];
|
||||
Tv = R1[WS(rs, 11)];
|
||||
Tw = Tu + Tv;
|
||||
Tx = R1[WS(rs, 6)];
|
||||
Ty = R0[WS(rs, 9)];
|
||||
Tz = Tx + Ty;
|
||||
TA = Tw + Tz;
|
||||
T1a = Tz - Tw;
|
||||
T16 = Tv - Tu;
|
||||
T17 = Tx - Ty;
|
||||
}
|
||||
TB = Tt + TA;
|
||||
{
|
||||
E T18, T1L, T1b, T1K, T19;
|
||||
T18 = FNMS(KP618033988, T17, T16);
|
||||
T1L = FMA(KP618033988, T16, T17);
|
||||
T19 = FNMS(KP250000000, TA, Tt);
|
||||
T1b = FNMS(KP559016994, T1a, T19);
|
||||
T1K = FMA(KP559016994, T1a, T19);
|
||||
T1c = FNMS(KP987388751, T1b, T18);
|
||||
T26 = FNMS(KP059835404, T1L, T1K);
|
||||
T2e = FMA(KP066152395, T1K, T1L);
|
||||
T1k = FMA(KP893101515, T18, T1b);
|
||||
T1r = FNMS(KP132830569, T1b, T18);
|
||||
T1M = FNMS(KP786782374, T1L, T1K);
|
||||
T21 = FMA(KP869845200, T1K, T1L);
|
||||
T1B = FMA(KP120146378, T18, T1b);
|
||||
}
|
||||
}
|
||||
/* Input group 3: R0[2], R1[4], R0[12], R0[7], R1[9]; yields T9 and
   eight scaled combinations. */
{
|
||||
E T1, T8, TV, TS, TU;
|
||||
T1 = R0[WS(rs, 2)];
|
||||
{
|
||||
E T2, T3, T4, T5, T6, T7;
|
||||
T2 = R1[WS(rs, 4)];
|
||||
T3 = R0[WS(rs, 12)];
|
||||
T4 = T2 + T3;
|
||||
T5 = R0[WS(rs, 7)];
|
||||
T6 = R1[WS(rs, 9)];
|
||||
T7 = T5 + T6;
|
||||
T8 = T4 + T7;
|
||||
TV = T5 - T6;
|
||||
TS = T4 - T7;
|
||||
TU = T3 - T2;
|
||||
}
|
||||
T9 = T1 + T8;
|
||||
{
|
||||
E TW, T1P, TT, T1Q, TR;
|
||||
TW = FNMS(KP618033988, TV, TU);
|
||||
T1P = FMA(KP618033988, TU, TV);
|
||||
TR = FMS(KP250000000, T8, T1);
|
||||
TT = FNMS(KP559016994, TS, TR);
|
||||
T1Q = FMA(KP559016994, TS, TR);
|
||||
TX = FMA(KP603558818, TW, TT);
|
||||
T29 = FNMS(KP447533225, T1P, T1Q);
|
||||
T2k = FMA(KP494780565, T1Q, T1P);
|
||||
T1h = FNMS(KP667278218, TT, TW);
|
||||
T1v = FNMS(KP786782374, TW, TT);
|
||||
T1R = FMA(KP132830569, T1Q, T1P);
|
||||
T1X = FNMS(KP120146378, T1P, T1Q);
|
||||
T1z = FMA(KP869845200, TT, TW);
|
||||
}
|
||||
}
|
||||
/* Input group 4: R1[0], R0[3], R1[10], R1[5], R0[8]; yields Ti and
   eight scaled combinations. */
{
|
||||
E Ta, Th, TO, TK, TL;
|
||||
Ta = R1[0];
|
||||
{
|
||||
E Tb, Tc, Td, Te, Tf, Tg;
|
||||
Tb = R0[WS(rs, 3)];
|
||||
Tc = R1[WS(rs, 10)];
|
||||
Td = Tb + Tc;
|
||||
Te = R1[WS(rs, 5)];
|
||||
Tf = R0[WS(rs, 8)];
|
||||
Tg = Te + Tf;
|
||||
Th = Td + Tg;
|
||||
TO = Td - Tg;
|
||||
TK = Tb - Tc;
|
||||
TL = Tf - Te;
|
||||
}
|
||||
Ti = Ta + Th;
|
||||
{
|
||||
E TM, T1S, TP, T1T, TN;
|
||||
TM = FNMS(KP618033988, TL, TK);
|
||||
T1S = FMA(KP618033988, TK, TL);
|
||||
TN = FNMS(KP250000000, Th, Ta);
|
||||
TP = FMA(KP559016994, TO, TN);
|
||||
T1T = FNMS(KP559016994, TO, TN);
|
||||
TQ = FMA(KP269969613, TP, TM);
|
||||
T2a = FMA(KP578046249, T1T, T1S);
|
||||
T2j = FNMS(KP522847744, T1S, T1T);
|
||||
T1g = FNMS(KP244189809, TM, TP);
|
||||
T1u = FNMS(KP603558818, TM, TP);
|
||||
T1U = FNMS(KP987388751, T1T, T1S);
|
||||
T1Y = FMA(KP893101515, T1S, T1T);
|
||||
T1y = FMA(KP667278218, TP, TM);
|
||||
}
|
||||
}
|
||||
/* Input group 5: R0[1], R1[3], R0[11], R0[6], R1[8]; yields Ts and
   eight scaled combinations. */
{
|
||||
E Tk, Tr, T13, TZ, T10;
|
||||
Tk = R0[WS(rs, 1)];
|
||||
{
|
||||
E Tl, Tm, Tn, To, Tp, Tq;
|
||||
Tl = R1[WS(rs, 3)];
|
||||
Tm = R0[WS(rs, 11)];
|
||||
Tn = Tl + Tm;
|
||||
To = R0[WS(rs, 6)];
|
||||
Tp = R1[WS(rs, 8)];
|
||||
Tq = To + Tp;
|
||||
Tr = Tn + Tq;
|
||||
T13 = Tn - Tq;
|
||||
TZ = Tm - Tl;
|
||||
T10 = Tp - To;
|
||||
}
|
||||
Ts = Tk + Tr;
|
||||
{
|
||||
E T11, T1I, T14, T1H, T12;
|
||||
T11 = FMA(KP618033988, T10, TZ);
|
||||
T1I = FNMS(KP618033988, TZ, T10);
|
||||
T12 = FMS(KP250000000, Tr, Tk);
|
||||
T14 = FNMS(KP559016994, T13, T12);
|
||||
T1H = FMA(KP559016994, T13, T12);
|
||||
T15 = FMA(KP578046249, T14, T11);
|
||||
T27 = FNMS(KP603558818, T1I, T1H);
|
||||
T2f = FMA(KP667278218, T1H, T1I);
|
||||
T1j = FNMS(KP522847744, T11, T14);
|
||||
T1s = FMA(KP447533225, T11, T14);
|
||||
T1J = FMA(KP059835404, T1I, T1H);
|
||||
T20 = FNMS(KP066152395, T1H, T1I);
|
||||
T1C = FNMS(KP494780565, T14, T11);
|
||||
}
|
||||
}
|
||||
/* Outputs with index 5 and 10 depend only on the four group totals
   T9, Ti, Ts, TB (and T2p + T2s for the real parts). */
Tj = T9 - Ti;
|
||||
TC = Ts - TB;
|
||||
Ci[WS(csi, 5)] = KP951056516 * (FNMS(KP618033988, TC, Tj));
|
||||
Ci[WS(csi, 10)] = KP951056516 * (FMA(KP618033988, Tj, TC));
|
||||
/* Stores Cr[0], Cr[5], Cr[10]. */
{
|
||||
E T39, T3c, T3e, T3a, T3b, T3d;
|
||||
T39 = T2p + T2s;
|
||||
T3a = T9 + Ti;
|
||||
T3b = Ts + TB;
|
||||
T3c = T3a + T3b;
|
||||
T3e = T3a - T3b;
|
||||
Cr[0] = T3c + T39;
|
||||
T3d = FNMS(KP250000000, T3c, T39);
|
||||
Cr[WS(csr, 5)] = FMA(KP559016994, T3e, T3d);
|
||||
Cr[WS(csr, 10)] = FNMS(KP559016994, T3e, T3d);
|
||||
}
|
||||
/* Stores Ci[4], Ci[9]. */
{
|
||||
E T1A, T1x, T1F, T1G;
|
||||
T1A = FNMS(KP912575812, T1z, T1y);
|
||||
{
|
||||
E T1t, T1w, T1E, T1D;
|
||||
T1t = FMA(KP958953096, T1s, T1r);
|
||||
T1w = FMA(KP912575812, T1v, T1u);
|
||||
T1D = FNMS(KP867381224, T1C, T1B);
|
||||
T1E = FMA(KP447417479, T1w, T1D);
|
||||
T1x = FNMS(KP894834959, T1w, T1t);
|
||||
T1F = FMA(KP763932022, T1E, T1t);
|
||||
}
|
||||
Ci[WS(csi, 4)] = KP951056516 * (FMA(KP992114701, T1x, TJ));
|
||||
T1G = FMA(KP999544308, T1F, T1A);
|
||||
Ci[WS(csi, 9)] = KP951056516 * (FNMS(KP803003575, T1G, TJ));
|
||||
}
|
||||
/* Stores Ci[3], Ci[8]. */
{
|
||||
E T1Z, T1N, T1W, T24, T1V, T23, T22, T25;
|
||||
T1Z = FNMS(KP734762448, T1Y, T1X);
|
||||
T1N = FNMS(KP772036680, T1M, T1J);
|
||||
T1V = FMA(KP734762448, T1U, T1R);
|
||||
T22 = FMA(KP772036680, T21, T20);
|
||||
T23 = FNMS(KP522616830, T1V, T22);
|
||||
T1W = FNMS(KP992114701, T1V, T1O);
|
||||
T24 = FMA(KP690983005, T23, T1N);
|
||||
Ci[WS(csi, 3)] = KP998026728 * (FNMS(KP952936919, T1W, T1N));
|
||||
T25 = FNMS(KP855719849, T24, T1Z);
|
||||
Ci[WS(csi, 8)] = -(KP951056516 * (FNMS(KP992114701, T25, T1O)));
|
||||
}
|
||||
/* Stores Ci[1], Ci[6], Ci[11]. */
{
|
||||
E T1i, T1l, T1e, T1p, T1n, TY, T1d, T1m, T1f, T1q, T1o;
|
||||
T1i = FNMS(KP829049696, T1h, T1g);
|
||||
T1l = FMA(KP831864738, T1k, T1j);
|
||||
TY = FNMS(KP916574801, TX, TQ);
|
||||
T1d = FMA(KP831864738, T1c, T15);
|
||||
T1m = FMA(KP904730450, T1d, TY);
|
||||
T1e = FNMS(KP904730450, T1d, TY);
|
||||
T1p = FNMS(KP904508497, T1m, T1i);
|
||||
T1n = FNMS(KP999754674, T1m, T1l);
|
||||
Ci[WS(csi, 1)] = -(KP951056516 * (FMA(KP968583161, T1e, TJ)));
|
||||
T1f = FNMS(KP242145790, T1e, TJ);
|
||||
T1q = FMA(KP683113946, T1p, T1l);
|
||||
T1o = FNMS(KP559154169, T1n, T1i);
|
||||
Ci[WS(csi, 6)] = -(KP951056516 * (FMA(KP968583161, T1o, T1f)));
|
||||
Ci[WS(csi, 11)] = -(KP951056516 * (FMA(KP876306680, T1q, T1f)));
|
||||
}
|
||||
/* Stores Ci[2], Ci[12], Ci[7]. */
{
|
||||
E T2l, T2c, T2n, T2i, T2d, T2o, T2m;
|
||||
T2l = FNMS(KP982009705, T2k, T2j);
|
||||
{
|
||||
E T2g, T28, T2b, T2h;
|
||||
T2g = FMA(KP845997307, T2f, T2e);
|
||||
T28 = FNMS(KP845997307, T27, T26);
|
||||
T2b = FNMS(KP921078979, T2a, T29);
|
||||
T2h = FMA(KP906616052, T2b, T28);
|
||||
T2c = FNMS(KP906616052, T2b, T28);
|
||||
T2n = T2g + T2h;
|
||||
T2i = FMA(KP618033988, T2h, T2g);
|
||||
}
|
||||
Ci[WS(csi, 2)] = -(KP998026728 * (FNMS(KP952936919, T1O, T2c)));
|
||||
T2d = FMA(KP262346850, T2c, T1O);
|
||||
T2o = FNMS(KP669429328, T2n, T2l);
|
||||
T2m = FMA(KP570584518, T2l, T2i);
|
||||
Ci[WS(csi, 12)] = KP951056516 * (FNMS(KP949179823, T2m, T2d));
|
||||
Ci[WS(csi, 7)] = KP951056516 * (FNMS(KP876306680, T2o, T2d));
|
||||
}
|
||||
/* Stores Cr[2], Cr[3], Cr[8], Cr[12], Cr[7]; all are offsets from
   T2P = T2t - KP559016994 * T2u. */
{
|
||||
E T2P, T2W, T2V, T2Z, T32, T33, T2S, T37, T35, T2Q, T2R, T34;
|
||||
T2P = FNMS(KP559016994, T2u, T2t);
|
||||
T2W = FNMS(KP734762448, T1U, T1R);
|
||||
{
|
||||
E T2U, T2T, T2Y, T2X;
|
||||
T2U = FNMS(KP772036680, T21, T20);
|
||||
T2T = FMA(KP734762448, T1Y, T1X);
|
||||
T2X = FMA(KP772036680, T1M, T1J);
|
||||
T2Y = FMA(KP522616830, T2T, T2X);
|
||||
T2V = FMA(KP956723877, T2U, T2T);
|
||||
T2Z = FNMS(KP763932022, T2Y, T2U);
|
||||
}
|
||||
T32 = FMA(KP845997307, T27, T26);
|
||||
T33 = FMA(KP921078979, T2a, T29);
|
||||
T2Q = FNMS(KP845997307, T2f, T2e);
|
||||
T2R = FMA(KP982009705, T2k, T2j);
|
||||
T34 = FNMS(KP923225144, T2R, T2Q);
|
||||
T2S = FMA(KP923225144, T2R, T2Q);
|
||||
T37 = FNMS(KP904508497, T34, T32);
|
||||
T35 = FNMS(KP997675361, T34, T33);
|
||||
Cr[WS(csr, 2)] = FMA(KP949179823, T2S, T2P);
|
||||
Cr[WS(csr, 3)] = FMA(KP992114701, T2V, T2P);
|
||||
{
|
||||
E T30, T31, T38, T36;
|
||||
T30 = FMA(KP855719849, T2Z, T2W);
|
||||
Cr[WS(csr, 8)] = FNMS(KP897376177, T30, T2P);
|
||||
T31 = FNMS(KP237294955, T2S, T2P);
|
||||
T38 = FNMS(KP681693190, T37, T33);
|
||||
T36 = FMA(KP560319534, T35, T32);
|
||||
Cr[WS(csr, 12)] = FNMS(KP949179823, T36, T31);
|
||||
Cr[WS(csr, 7)] = FNMS(KP860541664, T38, T31);
|
||||
}
|
||||
}
|
||||
/* Stores Cr[1], Cr[4], Cr[9], Cr[6], Cr[11]; all are offsets from
   T2v = T2t + KP559016994 * T2u. */
{
|
||||
E T2v, T2H, T2M, T2O, T2A, T2C, T2y, T2F, T2D, T2w, T2x, T2B;
|
||||
T2v = FMA(KP559016994, T2u, T2t);
|
||||
T2H = FNMS(KP912575812, T1v, T1u);
|
||||
{
|
||||
E T2I, T2K, T2L, T2J;
|
||||
T2I = FMA(KP867381224, T1C, T1B);
|
||||
T2J = FNMS(KP958953096, T1s, T1r);
|
||||
T2K = FMA(KP912575812, T1z, T1y);
|
||||
T2L = FNMS(KP447417479, T2K, T2J);
|
||||
T2M = FNMS(KP690983005, T2L, T2I);
|
||||
T2O = FNMS(KP809385824, T2K, T2I);
|
||||
}
|
||||
T2A = FMA(KP916574801, TX, TQ);
|
||||
T2C = FNMS(KP831864738, T1c, T15);
|
||||
T2w = FMA(KP829049696, T1h, T1g);
|
||||
T2x = FNMS(KP831864738, T1k, T1j);
|
||||
T2B = FMA(KP904730450, T2x, T2w);
|
||||
T2y = FNMS(KP904730450, T2x, T2w);
|
||||
T2F = T2A + T2B;
|
||||
T2D = FMA(KP904730450, T2C, T2B);
|
||||
Cr[WS(csr, 1)] = FMA(KP968583161, T2y, T2v);
|
||||
Cr[WS(csr, 4)] = FNMS(KP992114701, T2O, T2v);
|
||||
{
|
||||
E T2N, T2z, T2G, T2E;
|
||||
T2N = FNMS(KP999544308, T2M, T2H);
|
||||
Cr[WS(csr, 9)] = FNMS(KP803003575, T2N, T2v);
|
||||
T2z = FNMS(KP242145790, T2y, T2v);
|
||||
T2G = FMA(KP683113946, T2F, T2C);
|
||||
T2E = FNMS(KP618033988, T2D, T2A);
|
||||
Cr[WS(csr, 6)] = FNMS(KP876091699, T2E, T2z);
|
||||
Cr[WS(csr, 11)] = FNMS(KP792626838, T2G, T2z);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 25, "r2cf_25", { 44, 12, 156, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_25) (planner *p) { X(kr2c_register) (p, r2cf_25, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 25 -name r2cf_25 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 200 FP additions, 140 FP multiplications,
|
||||
* (or, 117 additions, 57 multiplications, 83 fused multiply/add),
|
||||
* 101 stack variables, 40 constants, and 50 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_25 (variant generated without -fma): straight-line 25-point
 * real-input codelet.  Each pass of the loop reads 25 reals, R0[0..12] and
 * R1[0..11] (indices scaled by rs through WS), and stores Cr[0..12]
 * (scaled by csr) and Ci[1..12] (scaled by csi).  The pass is repeated v
 * times, stepping R0/R1 by ivs and Cr/Ci by ovs.
 * NOTE(review): presumably R0/R1 carry the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_25(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Constants: each name KPd_ddd spells out the leading digits of its value. */
DK(KP998026728, +0.998026728428271561952336806863450553336905220);
|
||||
DK(KP125581039, +0.125581039058626752152356449131262266244969664);
|
||||
DK(KP1_996053456, +1.996053456856543123904673613726901106673810439);
|
||||
DK(KP062790519, +0.062790519529313376076178224565631133122484832);
|
||||
DK(KP809016994, +0.809016994374947424102293417182819058860154590);
|
||||
DK(KP309016994, +0.309016994374947424102293417182819058860154590);
|
||||
DK(KP1_369094211, +1.369094211857377347464566715242418539779038465);
|
||||
DK(KP728968627, +0.728968627421411523146730319055259111372571664);
|
||||
DK(KP963507348, +0.963507348203430549974383005744259307057084020);
|
||||
DK(KP876306680, +0.876306680043863587308115903922062583399064238);
|
||||
DK(KP497379774, +0.497379774329709576484567492012895936835134813);
|
||||
DK(KP968583161, +0.968583161128631119490168375464735813836012403);
|
||||
DK(KP684547105, +0.684547105928688673732283357621209269889519233);
|
||||
DK(KP1_457937254, +1.457937254842823046293460638110518222745143328);
|
||||
DK(KP481753674, +0.481753674101715274987191502872129653528542010);
|
||||
DK(KP1_752613360, +1.752613360087727174616231807844125166798128477);
|
||||
DK(KP248689887, +0.248689887164854788242283746006447968417567406);
|
||||
DK(KP1_937166322, +1.937166322257262238980336750929471627672024806);
|
||||
DK(KP992114701, +0.992114701314477831049793042785778521453036709);
|
||||
DK(KP250666467, +0.250666467128608490746237519633017587885836494);
|
||||
DK(KP425779291, +0.425779291565072648862502445744251703979973042);
|
||||
DK(KP1_809654104, +1.809654104932039055427337295865395187940827822);
|
||||
DK(KP1_274847979, +1.274847979497379420353425623352032390869834596);
|
||||
DK(KP770513242, +0.770513242775789230803009636396177847271667672);
|
||||
DK(KP844327925, +0.844327925502015078548558063966681505381659241);
|
||||
DK(KP1_071653589, +1.071653589957993236542617535735279956127150691);
|
||||
DK(KP125333233, +0.125333233564304245373118759816508793942918247);
|
||||
DK(KP1_984229402, +1.984229402628955662099586085571557042906073418);
|
||||
DK(KP904827052, +0.904827052466019527713668647932697593970413911);
|
||||
DK(KP851558583, +0.851558583130145297725004891488503407959946084);
|
||||
DK(KP637423989, +0.637423989748689710176712811676016195434917298);
|
||||
DK(KP1_541026485, +1.541026485551578461606019272792355694543335344);
|
||||
DK(KP535826794, +0.535826794978996618271308767867639978063575346);
|
||||
DK(KP1_688655851, +1.688655851004030157097116127933363010763318483);
|
||||
DK(KP293892626, +0.293892626146236564584352977319536384298826219);
|
||||
DK(KP475528258, +0.475528258147576786058219666689691071702849317);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per pass; v passes in total. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(100, rs), MAKE_VOLATILE_STRIDE(100, csr), MAKE_VOLATILE_STRIDE(100, csi)) {
|
||||
E T8, T1j, T1V, T1l, T7, T9, Ta, T12, T2u, T1O, T19, T1P, Ti, T2r, T1K;
|
||||
E Tp, T1L, Tx, T2q, T1H, TE, T1I, TN, T2t, T1R, TU, T1S, T6, T1k, T3;
|
||||
E T2s, T2v;
|
||||
/* Input group 0: R0[0], R1[2], R0[5], R1[7], R0[10]. */
T8 = R0[0];
|
||||
{
|
||||
E T4, T5, T1, T2;
|
||||
T4 = R0[WS(rs, 5)];
|
||||
T5 = R1[WS(rs, 7)];
|
||||
T6 = T4 + T5;
|
||||
T1k = T4 - T5;
|
||||
T1 = R1[WS(rs, 2)];
|
||||
T2 = R0[WS(rs, 10)];
|
||||
T3 = T1 + T2;
|
||||
T1j = T1 - T2;
|
||||
}
|
||||
/* KP559016994 = sqrt(5)/4; KP951056516 and KP587785252 are sin(72 deg) and sin(36 deg). */
T1V = KP951056516 * T1k;
|
||||
T1l = FMA(KP951056516, T1j, KP587785252 * T1k);
|
||||
T7 = KP559016994 * (T3 - T6);
|
||||
T9 = T3 + T6;
|
||||
Ta = FNMS(KP250000000, T9, T8);
|
||||
/* Input group 1: R1[1], R0[4], R1[6], R0[9], R1[11]. */
{
|
||||
E T16, T13, T14, TY, T17, T11, T15, T18;
|
||||
T16 = R1[WS(rs, 1)];
|
||||
{
|
||||
E TW, TX, TZ, T10;
|
||||
TW = R0[WS(rs, 4)];
|
||||
TX = R1[WS(rs, 11)];
|
||||
T13 = TW + TX;
|
||||
TZ = R1[WS(rs, 6)];
|
||||
T10 = R0[WS(rs, 9)];
|
||||
T14 = TZ + T10;
|
||||
TY = TW - TX;
|
||||
T17 = T13 + T14;
|
||||
T11 = TZ - T10;
|
||||
}
|
||||
T12 = FMA(KP475528258, TY, KP293892626 * T11);
|
||||
T2u = T16 + T17;
|
||||
T1O = FNMS(KP293892626, TY, KP475528258 * T11);
|
||||
T15 = KP559016994 * (T13 - T14);
|
||||
T18 = FNMS(KP250000000, T17, T16);
|
||||
T19 = T15 + T18;
|
||||
T1P = T18 - T15;
|
||||
}
|
||||
/* Input group 2: R1[0], R0[3], R1[5], R0[8], R1[10]. */
{
|
||||
E Tm, Tj, Tk, Te, Tn, Th, Tl, To;
|
||||
Tm = R1[0];
|
||||
{
|
||||
E Tc, Td, Tf, Tg;
|
||||
Tc = R0[WS(rs, 3)];
|
||||
Td = R1[WS(rs, 10)];
|
||||
Tj = Tc + Td;
|
||||
Tf = R1[WS(rs, 5)];
|
||||
Tg = R0[WS(rs, 8)];
|
||||
Tk = Tf + Tg;
|
||||
Te = Tc - Td;
|
||||
Tn = Tj + Tk;
|
||||
Th = Tf - Tg;
|
||||
}
|
||||
Ti = FMA(KP475528258, Te, KP293892626 * Th);
|
||||
T2r = Tm + Tn;
|
||||
T1K = FNMS(KP293892626, Te, KP475528258 * Th);
|
||||
Tl = KP559016994 * (Tj - Tk);
|
||||
To = FNMS(KP250000000, Tn, Tm);
|
||||
Tp = Tl + To;
|
||||
T1L = To - Tl;
|
||||
}
|
||||
/* Input group 3: R0[2], R1[4], R0[7], R1[9], R0[12]. */
{
|
||||
E TB, Ty, Tz, Tt, TC, Tw, TA, TD;
|
||||
TB = R0[WS(rs, 2)];
|
||||
{
|
||||
E Tr, Ts, Tu, Tv;
|
||||
Tr = R1[WS(rs, 4)];
|
||||
Ts = R0[WS(rs, 12)];
|
||||
Ty = Tr + Ts;
|
||||
Tu = R0[WS(rs, 7)];
|
||||
Tv = R1[WS(rs, 9)];
|
||||
Tz = Tu + Tv;
|
||||
Tt = Tr - Ts;
|
||||
TC = Ty + Tz;
|
||||
Tw = Tu - Tv;
|
||||
}
|
||||
Tx = FMA(KP475528258, Tt, KP293892626 * Tw);
|
||||
T2q = TB + TC;
|
||||
T1H = FNMS(KP293892626, Tt, KP475528258 * Tw);
|
||||
TA = KP559016994 * (Ty - Tz);
|
||||
TD = FNMS(KP250000000, TC, TB);
|
||||
TE = TA + TD;
|
||||
T1I = TD - TA;
|
||||
}
|
||||
/* Input group 4: R0[1], R1[3], R0[6], R1[8], R0[11]. */
{
|
||||
E TR, TO, TP, TJ, TS, TM, TQ, TT;
|
||||
TR = R0[WS(rs, 1)];
|
||||
{
|
||||
E TH, TI, TK, TL;
|
||||
TH = R1[WS(rs, 3)];
|
||||
TI = R0[WS(rs, 11)];
|
||||
TO = TH + TI;
|
||||
TK = R0[WS(rs, 6)];
|
||||
TL = R1[WS(rs, 8)];
|
||||
TP = TK + TL;
|
||||
TJ = TH - TI;
|
||||
TS = TO + TP;
|
||||
TM = TK - TL;
|
||||
}
|
||||
TN = FMA(KP475528258, TJ, KP293892626 * TM);
|
||||
T2t = TR + TS;
|
||||
T1R = FNMS(KP293892626, TJ, KP475528258 * TM);
|
||||
TQ = KP559016994 * (TO - TP);
|
||||
TT = FNMS(KP250000000, TS, TR);
|
||||
TU = TQ + TT;
|
||||
T1S = TT - TQ;
|
||||
}
|
||||
/* Outputs 0, 5 and 10 are built only from the five group sums (T8 + T9, T2r, T2q, T2t, T2u). */
T2s = T2q - T2r;
|
||||
T2v = T2t - T2u;
|
||||
Ci[WS(csi, 5)] = FNMS(KP587785252, T2v, KP951056516 * T2s);
|
||||
Ci[WS(csi, 10)] = FMA(KP587785252, T2s, KP951056516 * T2v);
|
||||
{
|
||||
E T2z, T2y, T2A, T2w, T2x, T2B;
|
||||
T2z = T8 + T9;
|
||||
T2w = T2r + T2q;
|
||||
T2x = T2t + T2u;
|
||||
T2y = KP559016994 * (T2w - T2x);
|
||||
T2A = T2w + T2x;
|
||||
Cr[0] = T2z + T2A;
|
||||
T2B = FNMS(KP250000000, T2A, T2z);
|
||||
Cr[WS(csr, 5)] = T2y + T2B;
|
||||
Cr[WS(csr, 10)] = T2B - T2y;
|
||||
}
|
||||
/* Outputs 1, 4, 6, 9 and 11. */
{
|
||||
E Tb, Tq, TF, TG, T1E, T1F, T1G, T1B, T1C, T1D, TV, T1a, T1b, T1o, T1r;
|
||||
E T1s, T1z, T1x, T1e, T1h, T1i, T1u, T1t;
|
||||
Tb = T7 + Ta;
|
||||
Tq = FMA(KP1_688655851, Ti, KP535826794 * Tp);
|
||||
TF = FMA(KP1_541026485, Tx, KP637423989 * TE);
|
||||
TG = Tq - TF;
|
||||
T1E = FMA(KP851558583, TN, KP904827052 * TU);
|
||||
T1F = FMA(KP1_984229402, T12, KP125333233 * T19);
|
||||
T1G = T1E + T1F;
|
||||
T1B = FNMS(KP844327925, Tp, KP1_071653589 * Ti);
|
||||
T1C = FNMS(KP1_274847979, Tx, KP770513242 * TE);
|
||||
T1D = T1B + T1C;
|
||||
TV = FNMS(KP425779291, TU, KP1_809654104 * TN);
|
||||
T1a = FNMS(KP992114701, T19, KP250666467 * T12);
|
||||
T1b = TV + T1a;
|
||||
{
|
||||
E T1m, T1n, T1p, T1q;
|
||||
T1m = FMA(KP1_937166322, Ti, KP248689887 * Tp);
|
||||
T1n = FMA(KP1_071653589, Tx, KP844327925 * TE);
|
||||
T1o = T1m + T1n;
|
||||
T1p = FMA(KP1_752613360, TN, KP481753674 * TU);
|
||||
T1q = FMA(KP1_457937254, T12, KP684547105 * T19);
|
||||
T1r = T1p + T1q;
|
||||
T1s = T1o + T1r;
|
||||
T1z = T1q - T1p;
|
||||
T1x = T1n - T1m;
|
||||
}
|
||||
{
|
||||
E T1c, T1d, T1f, T1g;
|
||||
T1c = FNMS(KP497379774, Ti, KP968583161 * Tp);
|
||||
T1d = FNMS(KP1_688655851, Tx, KP535826794 * TE);
|
||||
T1e = T1c + T1d;
|
||||
T1f = FNMS(KP963507348, TN, KP876306680 * TU);
|
||||
T1g = FNMS(KP1_369094211, T12, KP728968627 * T19);
|
||||
T1h = T1f + T1g;
|
||||
T1i = T1e + T1h;
|
||||
T1u = T1f - T1g;
|
||||
T1t = T1d - T1c;
|
||||
}
|
||||
Cr[WS(csr, 1)] = Tb + T1i;
|
||||
Ci[WS(csi, 1)] = -(T1l + T1s);
|
||||
Cr[WS(csr, 4)] = Tb + TG + T1b;
|
||||
Ci[WS(csi, 4)] = T1l + T1D - T1G;
|
||||
Ci[WS(csi, 9)] = FMA(KP309016994, T1D, T1l) + FMA(KP587785252, T1a - TV, KP809016994 * T1G) - (KP951056516 * (Tq + TF));
|
||||
Cr[WS(csr, 9)] = FMA(KP309016994, TG, Tb) + FMA(KP951056516, T1B - T1C, KP587785252 * (T1F - T1E)) - (KP809016994 * T1b);
|
||||
{
|
||||
E T1v, T1w, T1y, T1A;
|
||||
T1v = FMS(KP250000000, T1s, T1l);
|
||||
T1w = KP559016994 * (T1r - T1o);
|
||||
Ci[WS(csi, 11)] = FMA(KP587785252, T1t, KP951056516 * T1u) + T1v - T1w;
|
||||
Ci[WS(csi, 6)] = FMA(KP951056516, T1t, T1v) + FNMS(KP587785252, T1u, T1w);
|
||||
T1y = FNMS(KP250000000, T1i, Tb);
|
||||
T1A = KP559016994 * (T1e - T1h);
|
||||
Cr[WS(csr, 11)] = FMA(KP587785252, T1x, T1y) + FNMA(KP951056516, T1z, T1A);
|
||||
Cr[WS(csr, 6)] = FMA(KP951056516, T1x, T1A) + FMA(KP587785252, T1z, T1y);
|
||||
}
|
||||
}
|
||||
/* Outputs 2, 3, 7, 8 and 12. */
{
|
||||
E T1W, T1X, T1J, T1M, T1N, T21, T22, T23, T1Q, T1T, T1U, T1Y, T1Z, T20, T26;
|
||||
E T29, T2a, T2k, T2j, T2l, T2m, T2d, T2o, T2i;
|
||||
T1W = FNMS(KP587785252, T1j, T1V);
|
||||
T1X = Ta - T7;
|
||||
T1J = FNMS(KP125333233, T1I, KP1_984229402 * T1H);
|
||||
T1M = FMA(KP1_457937254, T1K, KP684547105 * T1L);
|
||||
T1N = T1J - T1M;
|
||||
T21 = FNMS(KP1_996053456, T1R, KP062790519 * T1S);
|
||||
T22 = FMA(KP1_541026485, T1O, KP637423989 * T1P);
|
||||
T23 = T21 - T22;
|
||||
T1Q = FNMS(KP770513242, T1P, KP1_274847979 * T1O);
|
||||
T1T = FMA(KP125581039, T1R, KP998026728 * T1S);
|
||||
T1U = T1Q - T1T;
|
||||
T1Y = FNMS(KP1_369094211, T1K, KP728968627 * T1L);
|
||||
T1Z = FMA(KP250666467, T1H, KP992114701 * T1I);
|
||||
T20 = T1Y - T1Z;
|
||||
{
|
||||
E T24, T25, T27, T28;
|
||||
T24 = FNMS(KP481753674, T1L, KP1_752613360 * T1K);
|
||||
T25 = FMA(KP851558583, T1H, KP904827052 * T1I);
|
||||
T26 = T24 - T25;
|
||||
T27 = FNMS(KP844327925, T1S, KP1_071653589 * T1R);
|
||||
T28 = FNMS(KP998026728, T1P, KP125581039 * T1O);
|
||||
T29 = T27 + T28;
|
||||
T2a = T26 + T29;
|
||||
T2k = T27 - T28;
|
||||
T2j = T24 + T25;
|
||||
}
|
||||
{
|
||||
E T2b, T2c, T2g, T2h;
|
||||
T2b = FNMS(KP425779291, T1I, KP1_809654104 * T1H);
|
||||
T2c = FMA(KP963507348, T1K, KP876306680 * T1L);
|
||||
T2l = T2c + T2b;
|
||||
T2g = FMA(KP1_688655851, T1R, KP535826794 * T1S);
|
||||
T2h = FMA(KP1_996053456, T1O, KP062790519 * T1P);
|
||||
T2m = T2g + T2h;
|
||||
T2d = T2b - T2c;
|
||||
T2o = T2l + T2m;
|
||||
T2i = T2g - T2h;
|
||||
}
|
||||
Ci[WS(csi, 2)] = T1W + T2a;
|
||||
Cr[WS(csr, 2)] = T1X + T2o;
|
||||
Ci[WS(csi, 3)] = T1N + T1U - T1W;
|
||||
Cr[WS(csr, 3)] = T1X + T20 + T23;
|
||||
Cr[WS(csr, 8)] = FMA(KP309016994, T20, T1X) + FNMA(KP809016994, T23, KP587785252 * (T1T + T1Q)) - (KP951056516 * (T1M + T1J));
|
||||
Ci[WS(csi, 8)] = FNMS(KP587785252, T21 + T22, KP309016994 * T1N) + FNMA(KP809016994, T1U, KP951056516 * (T1Y + T1Z)) - T1W;
|
||||
{
|
||||
E T2e, T2f, T2n, T2p;
|
||||
T2e = KP559016994 * (T26 - T29);
|
||||
T2f = FNMS(KP250000000, T2a, T1W);
|
||||
Ci[WS(csi, 7)] = FMA(KP951056516, T2d, T2e) + FNMS(KP587785252, T2i, T2f);
|
||||
Ci[WS(csi, 12)] = FMA(KP587785252, T2d, T2f) + FMS(KP951056516, T2i, T2e);
|
||||
T2n = KP559016994 * (T2l - T2m);
|
||||
T2p = FNMS(KP250000000, T2o, T1X);
|
||||
Cr[WS(csr, 7)] = FMA(KP951056516, T2j, KP587785252 * T2k) + T2n + T2p;
|
||||
Cr[WS(csr, 12)] = FMA(KP587785252, T2j, T2p) + FNMA(KP951056516, T2k, T2n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 25, "r2cf_25", { 117, 57, 83, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_25) (planner *p) { X(kr2c_register) (p, r2cf_25, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
96
fftw-3.3.10/rdft/scalar/r2cf/r2cf_3.c
Normal file
96
fftw-3.3.10/rdft/scalar/r2cf/r2cf_3.c
Normal file
@@ -0,0 +1,96 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cf_3 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 4 FP additions, 2 FP multiplications,
|
||||
* (or, 3 additions, 1 multiplications, 1 fused multiply/add),
|
||||
* 7 stack variables, 2 constants, and 6 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_3: three-point real-input codelet.
 *
 * Each pass loads a = R0[0], b = R1[0], c = R0[WS(rs, 1)] and stores
 *   Cr[0]          = a + (b + c)
 *   Cr[WS(csr, 1)] = a - (b + c) / 2
 *   Ci[WS(csi, 1)] = (sqrt(3) / 2) * (c - b)
 * which are bins 0 and 1 of the forward DFT of the sequence (a, b, c).
 * The pass runs v times; inputs advance by ivs and outputs by ovs.
 */
static void r2cf_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
               /* Every input is read before the first output is written. */
               E a = R0[0];
               E b = R1[0];
               E c = R0[WS(rs, 1)];
               E bc = b + c;

               Cr[WS(csr, 1)] = FNMS(KP500000000, bc, a);
               Ci[WS(csi, 1)] = KP866025403 * (c - b);
               Cr[0] = a + bc;
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 3, "r2cf_3", { 3, 1, 1, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_3) (planner *p) { X(kr2c_register) (p, r2cf_3, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 3 -name r2cf_3 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 4 FP additions, 2 FP multiplications,
|
||||
* (or, 3 additions, 1 multiplications, 1 fused multiply/add),
|
||||
* 7 stack variables, 2 constants, and 6 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_3: three-point real-input codelet.
 *
 * Each pass loads a = R0[0], b = R1[0], c = R0[WS(rs, 1)] and stores
 *   Cr[0]          = a + (b + c)
 *   Cr[WS(csr, 1)] = a - (b + c) / 2
 *   Ci[WS(csi, 1)] = (sqrt(3) / 2) * (c - b)
 * which are bins 0 and 1 of the forward DFT of the sequence (a, b, c).
 * The pass runs v times; inputs advance by ivs and outputs by ovs.
 */
static void r2cf_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(12, rs), MAKE_VOLATILE_STRIDE(12, csr), MAKE_VOLATILE_STRIDE(12, csi)) {
               /* Every input is read before the first output is written. */
               E a = R0[0];
               E b = R1[0];
               E c = R0[WS(rs, 1)];
               E bc = b + c;

               Cr[WS(csr, 1)] = FNMS(KP500000000, bc, a);
               Ci[WS(csi, 1)] = KP866025403 * (c - b);
               Cr[0] = a + bc;
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 3, "r2cf_3", { 3, 1, 1, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_3) (planner *p) { X(kr2c_register) (p, r2cf_3, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
610
fftw-3.3.10/rdft/scalar/r2cf/r2cf_32.c
Normal file
610
fftw-3.3.10/rdft/scalar/r2cf/r2cf_32.c
Normal file
@@ -0,0 +1,610 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:11 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cf_32 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 156 FP additions, 68 FP multiplications,
|
||||
* (or, 88 additions, 0 multiplications, 68 fused multiply/add),
|
||||
* 54 stack variables, 7 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_32 (variant generated with -fma): straight-line 32-point real-input
 * codelet.  Each pass of the loop reads 32 reals, R0[0..15] and R1[0..15]
 * (indices scaled by rs through WS), and stores Cr[0..16] (scaled by csr)
 * and Ci[1..15] (scaled by csi).  The pass is repeated v times, stepping
 * R0/R1 by ivs and Cr/Ci by ovs.
 * NOTE(review): presumably R0/R1 carry the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Constants: each name KPddd spells out the leading digits of its value. */
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
DK(KP668178637, +0.668178637919298919997757686523080761552472251);
|
||||
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
DK(KP198912367, +0.198912367379658006911597622644676228597850501);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per pass; v passes in total. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
|
||||
E T7, T2b, Tv, T1h, Te, T2n, Ty, T1i, Tt, T2d, TF, T1l, Tm, T2c, TC;
|
||||
E T1k, T1Z, T22, T2k, T2j, T1e, T1C, T19, T1B, T1S, T1V, T2h, T2g, TX, T1z;
|
||||
E TS, T1y;
|
||||
/* Inputs R0[0], R0[8], R0[4], R0[12]. */
{
|
||||
E T1, T2, T3, T4, T5, T6;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 8)];
|
||||
T3 = T1 + T2;
|
||||
T4 = R0[WS(rs, 4)];
|
||||
T5 = R0[WS(rs, 12)];
|
||||
T6 = T4 + T5;
|
||||
T7 = T3 + T6;
|
||||
T2b = T3 - T6;
|
||||
Tv = T1 - T2;
|
||||
T1h = T4 - T5;
|
||||
}
|
||||
/* Inputs R0[2], R0[10], R0[14], R0[6]. */
{
|
||||
E Ta, Tw, Td, Tx;
|
||||
{
|
||||
E T8, T9, Tb, Tc;
|
||||
T8 = R0[WS(rs, 2)];
|
||||
T9 = R0[WS(rs, 10)];
|
||||
Ta = T8 + T9;
|
||||
Tw = T8 - T9;
|
||||
Tb = R0[WS(rs, 14)];
|
||||
Tc = R0[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
Tx = Tb - Tc;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T2n = Td - Ta;
|
||||
Ty = Tw + Tx;
|
||||
T1i = Tx - Tw;
|
||||
}
|
||||
/* Inputs R0[15], R0[7], R0[3], R0[11]. */
{
|
||||
E Tp, TD, Ts, TE;
|
||||
{
|
||||
E Tn, To, Tq, Tr;
|
||||
Tn = R0[WS(rs, 15)];
|
||||
To = R0[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
TD = Tn - To;
|
||||
Tq = R0[WS(rs, 3)];
|
||||
Tr = R0[WS(rs, 11)];
|
||||
Ts = Tq + Tr;
|
||||
TE = Tq - Tr;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T2d = Tp - Ts;
|
||||
TF = FMA(KP414213562, TE, TD);
|
||||
T1l = FNMS(KP414213562, TD, TE);
|
||||
}
|
||||
/* Inputs R0[1], R0[9], R0[5], R0[13]. */
{
|
||||
E Ti, TA, Tl, TB;
|
||||
{
|
||||
E Tg, Th, Tj, Tk;
|
||||
Tg = R0[WS(rs, 1)];
|
||||
Th = R0[WS(rs, 9)];
|
||||
Ti = Tg + Th;
|
||||
TA = Tg - Th;
|
||||
Tj = R0[WS(rs, 5)];
|
||||
Tk = R0[WS(rs, 13)];
|
||||
Tl = Tj + Tk;
|
||||
TB = Tj - Tk;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T2c = Ti - Tl;
|
||||
TC = FNMS(KP414213562, TB, TA);
|
||||
T1k = FMA(KP414213562, TA, TB);
|
||||
}
|
||||
/* Inputs R1[15], R1[7], R1[11], R1[3], R1[1], R1[9], R1[13], R1[5]. */
{
|
||||
E T11, T1X, T1c, T1Y, T14, T20, T17, T21, T1d, T18;
|
||||
{
|
||||
E TZ, T10, T1a, T1b;
|
||||
TZ = R1[WS(rs, 15)];
|
||||
T10 = R1[WS(rs, 7)];
|
||||
T11 = TZ - T10;
|
||||
T1X = TZ + T10;
|
||||
T1a = R1[WS(rs, 11)];
|
||||
T1b = R1[WS(rs, 3)];
|
||||
T1c = T1a - T1b;
|
||||
T1Y = T1b + T1a;
|
||||
}
|
||||
{
|
||||
E T12, T13, T15, T16;
|
||||
T12 = R1[WS(rs, 1)];
|
||||
T13 = R1[WS(rs, 9)];
|
||||
T14 = T12 - T13;
|
||||
T20 = T12 + T13;
|
||||
T15 = R1[WS(rs, 13)];
|
||||
T16 = R1[WS(rs, 5)];
|
||||
T17 = T15 - T16;
|
||||
T21 = T15 + T16;
|
||||
}
|
||||
T1Z = T1X + T1Y;
|
||||
T22 = T20 + T21;
|
||||
T2k = T21 - T20;
|
||||
T2j = T1X - T1Y;
|
||||
T1d = T17 - T14;
|
||||
T1e = FMA(KP707106781, T1d, T1c);
|
||||
T1C = FNMS(KP707106781, T1d, T1c);
|
||||
T18 = T14 + T17;
|
||||
T19 = FMA(KP707106781, T18, T11);
|
||||
T1B = FNMS(KP707106781, T18, T11);
|
||||
}
|
||||
/* Inputs R1[0], R1[8], R1[4], R1[12], R1[2], R1[10], R1[14], R1[6]. */
{
|
||||
E TK, T1Q, TV, T1R, TN, T1T, TQ, T1U, TW, TR;
|
||||
{
|
||||
E TI, TJ, TT, TU;
|
||||
TI = R1[0];
|
||||
TJ = R1[WS(rs, 8)];
|
||||
TK = TI - TJ;
|
||||
T1Q = TI + TJ;
|
||||
TT = R1[WS(rs, 4)];
|
||||
TU = R1[WS(rs, 12)];
|
||||
TV = TT - TU;
|
||||
T1R = TT + TU;
|
||||
}
|
||||
{
|
||||
E TL, TM, TO, TP;
|
||||
TL = R1[WS(rs, 2)];
|
||||
TM = R1[WS(rs, 10)];
|
||||
TN = TL - TM;
|
||||
T1T = TL + TM;
|
||||
TO = R1[WS(rs, 14)];
|
||||
TP = R1[WS(rs, 6)];
|
||||
TQ = TO - TP;
|
||||
T1U = TO + TP;
|
||||
}
|
||||
T1S = T1Q + T1R;
|
||||
T1V = T1T + T1U;
|
||||
T2h = T1U - T1T;
|
||||
T2g = T1Q - T1R;
|
||||
TW = TN - TQ;
|
||||
TX = FMA(KP707106781, TW, TV);
|
||||
T1z = FNMS(KP707106781, TW, TV);
|
||||
TR = TN + TQ;
|
||||
TS = FMA(KP707106781, TR, TK);
|
||||
T1y = FNMS(KP707106781, TR, TK);
|
||||
}
|
||||
/* Outputs 0, 8 and 16: sums and differences only. */
{
|
||||
E Tf, Tu, T27, T28, T29, T2a;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
T27 = Tf + Tu;
|
||||
T28 = T1S + T1V;
|
||||
T29 = T1Z + T22;
|
||||
T2a = T28 + T29;
|
||||
Cr[WS(csr, 8)] = Tf - Tu;
|
||||
Ci[WS(csi, 8)] = T29 - T28;
|
||||
Cr[WS(csr, 16)] = T27 - T2a;
|
||||
Cr[0] = T27 + T2a;
|
||||
}
|
||||
/* Outputs 4 and 12. */
{
|
||||
E T1P, T25, T24, T26, T1W, T23;
|
||||
T1P = T7 - Te;
|
||||
T25 = Tt - Tm;
|
||||
T1W = T1S - T1V;
|
||||
T23 = T1Z - T22;
|
||||
T24 = T1W + T23;
|
||||
T26 = T23 - T1W;
|
||||
Cr[WS(csr, 12)] = FNMS(KP707106781, T24, T1P);
|
||||
Ci[WS(csi, 12)] = FMS(KP707106781, T26, T25);
|
||||
Cr[WS(csr, 4)] = FMA(KP707106781, T24, T1P);
|
||||
Ci[WS(csi, 4)] = FMA(KP707106781, T26, T25);
|
||||
}
|
||||
/* Outputs 2, 6, 10 and 14. */
{
|
||||
E T2f, T2v, T2p, T2r, T2m, T2q, T2u, T2w, T2e, T2o;
|
||||
T2e = T2c + T2d;
|
||||
T2f = FMA(KP707106781, T2e, T2b);
|
||||
T2v = FNMS(KP707106781, T2e, T2b);
|
||||
T2o = T2d - T2c;
|
||||
T2p = FNMS(KP707106781, T2o, T2n);
|
||||
T2r = FMA(KP707106781, T2o, T2n);
|
||||
{
|
||||
E T2i, T2l, T2s, T2t;
|
||||
T2i = FMA(KP414213562, T2h, T2g);
|
||||
T2l = FNMS(KP414213562, T2k, T2j);
|
||||
T2m = T2i + T2l;
|
||||
T2q = T2l - T2i;
|
||||
T2s = FNMS(KP414213562, T2g, T2h);
|
||||
T2t = FMA(KP414213562, T2j, T2k);
|
||||
T2u = T2s + T2t;
|
||||
T2w = T2t - T2s;
|
||||
}
|
||||
Cr[WS(csr, 14)] = FNMS(KP923879532, T2m, T2f);
|
||||
Ci[WS(csi, 14)] = FMS(KP923879532, T2u, T2r);
|
||||
Cr[WS(csr, 2)] = FMA(KP923879532, T2m, T2f);
|
||||
Ci[WS(csi, 2)] = FMA(KP923879532, T2u, T2r);
|
||||
Ci[WS(csi, 6)] = FMS(KP923879532, T2q, T2p);
|
||||
Cr[WS(csr, 6)] = FMA(KP923879532, T2w, T2v);
|
||||
Ci[WS(csi, 10)] = FMA(KP923879532, T2q, T2p);
|
||||
Cr[WS(csr, 10)] = FNMS(KP923879532, T2w, T2v);
|
||||
}
|
||||
/* Outputs 1, 7, 9 and 15. */
{
|
||||
E TH, T1t, T1s, T1u, T1g, T1o, T1n, T1p;
|
||||
{
|
||||
E Tz, TG, T1q, T1r;
|
||||
Tz = FMA(KP707106781, Ty, Tv);
|
||||
TG = TC + TF;
|
||||
TH = FMA(KP923879532, TG, Tz);
|
||||
T1t = FNMS(KP923879532, TG, Tz);
|
||||
T1q = FMA(KP198912367, T19, T1e);
|
||||
T1r = FMA(KP198912367, TS, TX);
|
||||
T1s = T1q - T1r;
|
||||
T1u = T1r + T1q;
|
||||
}
|
||||
{
|
||||
E TY, T1f, T1j, T1m;
|
||||
TY = FNMS(KP198912367, TX, TS);
|
||||
T1f = FNMS(KP198912367, T1e, T19);
|
||||
T1g = TY + T1f;
|
||||
T1o = T1f - TY;
|
||||
T1j = FNMS(KP707106781, T1i, T1h);
|
||||
T1m = T1k + T1l;
|
||||
T1n = FNMS(KP923879532, T1m, T1j);
|
||||
T1p = FMA(KP923879532, T1m, T1j);
|
||||
}
|
||||
Cr[WS(csr, 15)] = FNMS(KP980785280, T1g, TH);
|
||||
Ci[WS(csi, 15)] = FMA(KP980785280, T1s, T1p);
|
||||
Cr[WS(csr, 1)] = FMA(KP980785280, T1g, TH);
|
||||
Ci[WS(csi, 1)] = FMS(KP980785280, T1s, T1p);
|
||||
Ci[WS(csi, 7)] = FMA(KP980785280, T1o, T1n);
|
||||
Cr[WS(csr, 7)] = FMA(KP980785280, T1u, T1t);
|
||||
Ci[WS(csi, 9)] = FMS(KP980785280, T1o, T1n);
|
||||
Cr[WS(csr, 9)] = FNMS(KP980785280, T1u, T1t);
|
||||
}
|
||||
/* Outputs 3, 5, 11 and 13. */
{
|
||||
E T1x, T1N, T1M, T1O, T1E, T1I, T1H, T1J;
|
||||
{
|
||||
E T1v, T1w, T1K, T1L;
|
||||
T1v = FNMS(KP707106781, Ty, Tv);
|
||||
T1w = T1k - T1l;
|
||||
T1x = FMA(KP923879532, T1w, T1v);
|
||||
T1N = FNMS(KP923879532, T1w, T1v);
|
||||
T1K = FNMS(KP668178637, T1y, T1z);
|
||||
T1L = FNMS(KP668178637, T1B, T1C);
|
||||
T1M = T1K - T1L;
|
||||
T1O = T1K + T1L;
|
||||
}
|
||||
{
|
||||
E T1A, T1D, T1F, T1G;
|
||||
T1A = FMA(KP668178637, T1z, T1y);
|
||||
T1D = FMA(KP668178637, T1C, T1B);
|
||||
T1E = T1A + T1D;
|
||||
T1I = T1D - T1A;
|
||||
T1F = FMA(KP707106781, T1i, T1h);
|
||||
T1G = TF - TC;
|
||||
T1H = FNMS(KP923879532, T1G, T1F);
|
||||
T1J = FMA(KP923879532, T1G, T1F);
|
||||
}
|
||||
Cr[WS(csr, 13)] = FNMS(KP831469612, T1E, T1x);
|
||||
Ci[WS(csi, 13)] = FMS(KP831469612, T1M, T1J);
|
||||
Cr[WS(csr, 3)] = FMA(KP831469612, T1E, T1x);
|
||||
Ci[WS(csi, 3)] = FMA(KP831469612, T1M, T1J);
|
||||
Ci[WS(csi, 5)] = FMS(KP831469612, T1I, T1H);
|
||||
Cr[WS(csr, 5)] = FNMS(KP831469612, T1O, T1N);
|
||||
Ci[WS(csi, 11)] = FMA(KP831469612, T1I, T1H);
|
||||
Cr[WS(csr, 11)] = FMA(KP831469612, T1O, T1N);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 32, "r2cf_32", { 88, 0, 68, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_32) (planner *p) { X(kr2c_register) (p, r2cf_32, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 32 -name r2cf_32 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 156 FP additions, 42 FP multiplications,
|
||||
* (or, 140 additions, 26 multiplications, 16 fused multiply/add),
|
||||
* 54 stack variables, 7 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_32 (variant generated without -fma): straight-line 32-point
 * real-input codelet.  Each pass of the loop reads 32 reals, R0[0..15] and
 * R1[0..15] (indices scaled by rs through WS), and stores Cr[0..16]
 * (scaled by csr) and Ci[1..15] (scaled by csi).  The pass is repeated v
 * times, stepping R0/R1 by ivs and Cr/Ci by ovs.
 * NOTE(review): presumably R0/R1 carry the even-/odd-indexed input samples
 * and Cr/Ci the real/imaginary output parts -- confirm against
 * rdft/codelet-rdft.h.
 */
static void r2cf_32(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Constants: each name KPddd spells out the leading digits of its value. */
DK(KP555570233, +0.555570233019602224742830813948532874374937191);
|
||||
DK(KP831469612, +0.831469612302545237078788377617905756738560812);
|
||||
DK(KP195090322, +0.195090322016128267848284868477022240927691618);
|
||||
DK(KP980785280, +0.980785280403230449126182236134239036973933731);
|
||||
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One transform per pass; v passes in total. */
for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(128, rs), MAKE_VOLATILE_STRIDE(128, csr), MAKE_VOLATILE_STRIDE(128, csi)) {
|
||||
E T7, T2b, Tv, T1l, Te, T2o, Ty, T1k, Tt, T2d, TF, T1h, Tm, T2c, TC;
|
||||
E T1i, T1Z, T22, T2k, T2j, T1e, T1C, T19, T1B, T1S, T1V, T2h, T2g, TX, T1z;
|
||||
E TS, T1y;
|
||||
/* Inputs R0[0], R0[8], R0[4], R0[12]. */
{
|
||||
E T1, T2, T3, T4, T5, T6;
|
||||
T1 = R0[0];
|
||||
T2 = R0[WS(rs, 8)];
|
||||
T3 = T1 + T2;
|
||||
T4 = R0[WS(rs, 4)];
|
||||
T5 = R0[WS(rs, 12)];
|
||||
T6 = T4 + T5;
|
||||
T7 = T3 + T6;
|
||||
T2b = T3 - T6;
|
||||
Tv = T1 - T2;
|
||||
T1l = T4 - T5;
|
||||
}
|
||||
/* Inputs R0[2], R0[10], R0[14], R0[6]. */
{
|
||||
E Ta, Tw, Td, Tx;
|
||||
{
|
||||
E T8, T9, Tb, Tc;
|
||||
T8 = R0[WS(rs, 2)];
|
||||
T9 = R0[WS(rs, 10)];
|
||||
Ta = T8 + T9;
|
||||
Tw = T8 - T9;
|
||||
Tb = R0[WS(rs, 14)];
|
||||
Tc = R0[WS(rs, 6)];
|
||||
Td = Tb + Tc;
|
||||
Tx = Tb - Tc;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T2o = Td - Ta;
|
||||
Ty = KP707106781 * (Tw + Tx);
|
||||
T1k = KP707106781 * (Tx - Tw);
|
||||
}
|
||||
/* Inputs R0[15], R0[7], R0[3], R0[11]. */
{
|
||||
E Tp, TD, Ts, TE;
|
||||
{
|
||||
E Tn, To, Tq, Tr;
|
||||
Tn = R0[WS(rs, 15)];
|
||||
To = R0[WS(rs, 7)];
|
||||
Tp = Tn + To;
|
||||
TD = Tn - To;
|
||||
Tq = R0[WS(rs, 3)];
|
||||
Tr = R0[WS(rs, 11)];
|
||||
Ts = Tq + Tr;
|
||||
TE = Tq - Tr;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T2d = Tp - Ts;
|
||||
TF = FMA(KP923879532, TD, KP382683432 * TE);
|
||||
T1h = FNMS(KP923879532, TE, KP382683432 * TD);
|
||||
}
|
||||
/* Inputs R0[1], R0[9], R0[5], R0[13]. */
{
|
||||
E Ti, TA, Tl, TB;
|
||||
{
|
||||
E Tg, Th, Tj, Tk;
|
||||
Tg = R0[WS(rs, 1)];
|
||||
Th = R0[WS(rs, 9)];
|
||||
Ti = Tg + Th;
|
||||
TA = Tg - Th;
|
||||
Tj = R0[WS(rs, 5)];
|
||||
Tk = R0[WS(rs, 13)];
|
||||
Tl = Tj + Tk;
|
||||
TB = Tj - Tk;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T2c = Ti - Tl;
|
||||
TC = FNMS(KP382683432, TB, KP923879532 * TA);
|
||||
T1i = FMA(KP382683432, TA, KP923879532 * TB);
|
||||
}
|
||||
/* Inputs R1[15], R1[7], R1[3], R1[11], R1[1], R1[9], R1[13], R1[5]. */
{
|
||||
E T11, T1X, T1d, T1Y, T14, T20, T17, T21, T1a, T18;
|
||||
{
|
||||
E TZ, T10, T1b, T1c;
|
||||
TZ = R1[WS(rs, 15)];
|
||||
T10 = R1[WS(rs, 7)];
|
||||
T11 = TZ - T10;
|
||||
T1X = TZ + T10;
|
||||
T1b = R1[WS(rs, 3)];
|
||||
T1c = R1[WS(rs, 11)];
|
||||
T1d = T1b - T1c;
|
||||
T1Y = T1b + T1c;
|
||||
}
|
||||
{
|
||||
E T12, T13, T15, T16;
|
||||
T12 = R1[WS(rs, 1)];
|
||||
T13 = R1[WS(rs, 9)];
|
||||
T14 = T12 - T13;
|
||||
T20 = T12 + T13;
|
||||
T15 = R1[WS(rs, 13)];
|
||||
T16 = R1[WS(rs, 5)];
|
||||
T17 = T15 - T16;
|
||||
T21 = T15 + T16;
|
||||
}
|
||||
T1Z = T1X + T1Y;
|
||||
T22 = T20 + T21;
|
||||
T2k = T21 - T20;
|
||||
T2j = T1X - T1Y;
|
||||
T1a = KP707106781 * (T17 - T14);
|
||||
T1e = T1a - T1d;
|
||||
T1C = T1d + T1a;
|
||||
T18 = KP707106781 * (T14 + T17);
|
||||
T19 = T11 + T18;
|
||||
T1B = T11 - T18;
|
||||
}
|
||||
/* Inputs R1[0], R1[8], R1[4], R1[12], R1[2], R1[10], R1[14], R1[6]. */
{
|
||||
E TK, T1Q, TW, T1R, TN, T1T, TQ, T1U, TT, TR;
|
||||
{
|
||||
E TI, TJ, TU, TV;
|
||||
TI = R1[0];
|
||||
TJ = R1[WS(rs, 8)];
|
||||
TK = TI - TJ;
|
||||
T1Q = TI + TJ;
|
||||
TU = R1[WS(rs, 4)];
|
||||
TV = R1[WS(rs, 12)];
|
||||
TW = TU - TV;
|
||||
T1R = TU + TV;
|
||||
}
|
||||
{
|
||||
E TL, TM, TO, TP;
|
||||
TL = R1[WS(rs, 2)];
|
||||
TM = R1[WS(rs, 10)];
|
||||
TN = TL - TM;
|
||||
T1T = TL + TM;
|
||||
TO = R1[WS(rs, 14)];
|
||||
TP = R1[WS(rs, 6)];
|
||||
TQ = TO - TP;
|
||||
T1U = TO + TP;
|
||||
}
|
||||
T1S = T1Q + T1R;
|
||||
T1V = T1T + T1U;
|
||||
T2h = T1U - T1T;
|
||||
T2g = T1Q - T1R;
|
||||
TT = KP707106781 * (TQ - TN);
|
||||
TX = TT - TW;
|
||||
T1z = TW + TT;
|
||||
TR = KP707106781 * (TN + TQ);
|
||||
TS = TK + TR;
|
||||
T1y = TK - TR;
|
||||
}
|
||||
/* Outputs 0, 8 and 16: sums and differences only. */
{
|
||||
E Tf, Tu, T27, T28, T29, T2a;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
T27 = Tf + Tu;
|
||||
T28 = T1S + T1V;
|
||||
T29 = T1Z + T22;
|
||||
T2a = T28 + T29;
|
||||
Cr[WS(csr, 8)] = Tf - Tu;
|
||||
Ci[WS(csi, 8)] = T29 - T28;
|
||||
Cr[WS(csr, 16)] = T27 - T2a;
|
||||
Cr[0] = T27 + T2a;
|
||||
}
|
||||
/* Outputs 4 and 12. */
{
|
||||
E T1P, T25, T24, T26, T1W, T23;
|
||||
T1P = T7 - Te;
|
||||
T25 = Tt - Tm;
|
||||
T1W = T1S - T1V;
|
||||
T23 = T1Z - T22;
|
||||
T24 = KP707106781 * (T1W + T23);
|
||||
T26 = KP707106781 * (T23 - T1W);
|
||||
Cr[WS(csr, 12)] = T1P - T24;
|
||||
Ci[WS(csi, 12)] = T26 - T25;
|
||||
Cr[WS(csr, 4)] = T1P + T24;
|
||||
Ci[WS(csi, 4)] = T25 + T26;
|
||||
}
|
||||
/* Outputs 2, 6, 10 and 14. */
{
|
||||
E T2f, T2v, T2p, T2r, T2m, T2q, T2u, T2w, T2e, T2n;
|
||||
T2e = KP707106781 * (T2c + T2d);
|
||||
T2f = T2b + T2e;
|
||||
T2v = T2b - T2e;
|
||||
T2n = KP707106781 * (T2d - T2c);
|
||||
T2p = T2n - T2o;
|
||||
T2r = T2o + T2n;
|
||||
{
|
||||
E T2i, T2l, T2s, T2t;
|
||||
T2i = FMA(KP923879532, T2g, KP382683432 * T2h);
|
||||
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
|
||||
T2m = T2i + T2l;
|
||||
T2q = T2l - T2i;
|
||||
T2s = FNMS(KP382683432, T2g, KP923879532 * T2h);
|
||||
T2t = FMA(KP382683432, T2j, KP923879532 * T2k);
|
||||
T2u = T2s + T2t;
|
||||
T2w = T2t - T2s;
|
||||
}
|
||||
Cr[WS(csr, 14)] = T2f - T2m;
|
||||
Ci[WS(csi, 14)] = T2u - T2r;
|
||||
Cr[WS(csr, 2)] = T2f + T2m;
|
||||
Ci[WS(csi, 2)] = T2r + T2u;
|
||||
Ci[WS(csi, 6)] = T2p + T2q;
|
||||
Cr[WS(csr, 6)] = T2v + T2w;
|
||||
Ci[WS(csi, 10)] = T2q - T2p;
|
||||
Cr[WS(csr, 10)] = T2v - T2w;
|
||||
}
|
||||
/* Outputs 1, 7, 9 and 15. */
{
|
||||
E TH, T1t, T1s, T1u, T1g, T1o, T1n, T1p;
|
||||
{
|
||||
E Tz, TG, T1q, T1r;
|
||||
Tz = Tv + Ty;
|
||||
TG = TC + TF;
|
||||
TH = Tz + TG;
|
||||
T1t = Tz - TG;
|
||||
T1q = FNMS(KP195090322, TS, KP980785280 * TX);
|
||||
T1r = FMA(KP195090322, T19, KP980785280 * T1e);
|
||||
T1s = T1q + T1r;
|
||||
T1u = T1r - T1q;
|
||||
}
|
||||
{
|
||||
E TY, T1f, T1j, T1m;
|
||||
TY = FMA(KP980785280, TS, KP195090322 * TX);
|
||||
T1f = FNMS(KP195090322, T1e, KP980785280 * T19);
|
||||
T1g = TY + T1f;
|
||||
T1o = T1f - TY;
|
||||
T1j = T1h - T1i;
|
||||
T1m = T1k - T1l;
|
||||
T1n = T1j - T1m;
|
||||
T1p = T1m + T1j;
|
||||
}
|
||||
Cr[WS(csr, 15)] = TH - T1g;
|
||||
Ci[WS(csi, 15)] = T1s - T1p;
|
||||
Cr[WS(csr, 1)] = TH + T1g;
|
||||
Ci[WS(csi, 1)] = T1p + T1s;
|
||||
Ci[WS(csi, 7)] = T1n + T1o;
|
||||
Cr[WS(csr, 7)] = T1t + T1u;
|
||||
Ci[WS(csi, 9)] = T1o - T1n;
|
||||
Cr[WS(csr, 9)] = T1t - T1u;
|
||||
}
|
||||
/* Outputs 3, 5, 11 and 13. */
{
|
||||
E T1x, T1N, T1M, T1O, T1E, T1I, T1H, T1J;
|
||||
{
|
||||
E T1v, T1w, T1K, T1L;
|
||||
T1v = Tv - Ty;
|
||||
T1w = T1i + T1h;
|
||||
T1x = T1v + T1w;
|
||||
T1N = T1v - T1w;
|
||||
T1K = FNMS(KP555570233, T1y, KP831469612 * T1z);
|
||||
T1L = FMA(KP555570233, T1B, KP831469612 * T1C);
|
||||
T1M = T1K + T1L;
|
||||
T1O = T1L - T1K;
|
||||
}
|
||||
{
|
||||
E T1A, T1D, T1F, T1G;
|
||||
T1A = FMA(KP831469612, T1y, KP555570233 * T1z);
|
||||
T1D = FNMS(KP555570233, T1C, KP831469612 * T1B);
|
||||
T1E = T1A + T1D;
|
||||
T1I = T1D - T1A;
|
||||
T1F = TF - TC;
|
||||
T1G = T1l + T1k;
|
||||
T1H = T1F - T1G;
|
||||
T1J = T1G + T1F;
|
||||
}
|
||||
Cr[WS(csr, 13)] = T1x - T1E;
|
||||
Ci[WS(csi, 13)] = T1M - T1J;
|
||||
Cr[WS(csr, 3)] = T1x + T1E;
|
||||
Ci[WS(csi, 3)] = T1J + T1M;
|
||||
Ci[WS(csi, 5)] = T1H + T1I;
|
||||
Cr[WS(csr, 5)] = T1N + T1O;
|
||||
Ci[WS(csi, 11)] = T1I - T1H;
|
||||
Cr[WS(csr, 11)] = T1N - T1O;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kr2c_desc desc = { 32, "r2cf_32", { 140, 26, 16, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_32) (planner *p) { X(kr2c_register) (p, r2cf_32, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
98
fftw-3.3.10/rdft/scalar/r2cf/r2cf_4.c
Normal file
98
fftw-3.3.10/rdft/scalar/r2cf/r2cf_4.c
Normal file
@@ -0,0 +1,98 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cf_4 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 0 FP multiplications,
|
||||
* (or, 6 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 7 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-4 real-to-complex codelet (branch compiled when the FMA-preferring
 * macros tested by the enclosing #if are defined).
 *
 * Each pass reads R0[0], R0[WS(rs, 1)], R1[0], R1[WS(rs, 1)] and writes
 * Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)] and Ci[WS(csi, 1)].  The pass is
 * repeated v times; after each one R0/R1 advance by ivs and Cr/Ci by ovs.
 * NOTE(review): R0/R1 presumably hold the two interleaved halves of the
 * real input -- confirm against the caller.
 */
static void r2cf_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT i;

     for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
          E a0, a1, b0, b1, a_sum, b_sum;

          /* all four loads happen before the first store */
          a0 = R0[0];
          a1 = R0[WS(rs, 1)];
          a_sum = a0 + a1;
          b0 = R1[0];
          b1 = R1[WS(rs, 1)];
          b_sum = b0 + b1;

          Cr[WS(csr, 1)] = a0 - a1;
          Ci[WS(csi, 1)] = b1 - b0;
          Cr[WS(csr, 2)] = a_sum - b_sum;
          Cr[0] = a_sum + b_sum;
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 4, "r2cf_4", { 6, 0, 0, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_4) (planner *p) { X(kr2c_register) (p, r2cf_4, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 4 -name r2cf_4 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 0 FP multiplications,
|
||||
* (or, 6 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 7 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-4 real-to-complex codelet (#else branch, used when neither
 * FMA-preferring macro of the enclosing #if is defined).
 *
 * Per pass: four loads (two through R0, two through R1), six additions,
 * four stores (Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)], Ci[WS(csi, 1)]).
 * v passes are made; R0/R1 step by ivs and Cr/Ci step by ovs in between.
 */
static void r2cf_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     INT i;

     for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
          E a0, a1, b0, b1, a_sum, b_sum;

          /* inputs are fully read before any output is written */
          a0 = R0[0];
          a1 = R0[WS(rs, 1)];
          a_sum = a0 + a1;
          b0 = R1[0];
          b1 = R1[WS(rs, 1)];
          b_sum = b0 + b1;

          Cr[WS(csr, 1)] = a0 - a1;
          Ci[WS(csi, 1)] = b1 - b0;
          Cr[WS(csr, 2)] = a_sum - b_sum;
          Cr[0] = a_sum + b_sum;
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 4, "r2cf_4", { 6, 0, 0, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_4) (planner *p) { X(kr2c_register) (p, r2cf_4, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
126
fftw-3.3.10/rdft/scalar/r2cf/r2cf_5.c
Normal file
126
fftw-3.3.10/rdft/scalar/r2cf/r2cf_5.c
Normal file
@@ -0,0 +1,126 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cf_5 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 7 FP multiplications,
|
||||
* (or, 7 additions, 2 multiplications, 5 fused multiply/add),
|
||||
* 17 stack variables, 4 constants, and 10 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-5 real-to-complex codelet, FMA-preferring branch.
 *
 * Each pass loads three values through R0 (offsets 0, WS(rs, 1),
 * WS(rs, 2)) and two through R1 (offsets 0, WS(rs, 1)), then stores
 * Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)], Ci[WS(csi, 1)] and
 * Ci[WS(csi, 2)].  v passes are made, stepping R0/R1 by ivs and Cr/Ci by
 * ovs after each.
 */
static void r2cf_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
               E a0, a1, a2, b0, b1;
               E sum_a, sum_b, dif_a, dif_b, sum_all, base, spread;

               /* gather the five inputs and their pairwise sums/differences */
               a0 = R0[0];
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];
               sum_a = b0 + a2;
               a1 = R0[WS(rs, 1)];
               b1 = R1[WS(rs, 1)];
               sum_b = a1 + b1;
               dif_a = a2 - b0;
               sum_all = sum_a + sum_b;
               dif_b = a1 - b1;

               /* Ci outputs are built from the differences only */
               Ci[WS(csi, 1)] = KP951056516 * (FNMS(KP618033988, dif_b, dif_a));
               Ci[WS(csi, 2)] = KP951056516 * (FMA(KP618033988, dif_a, dif_b));

               /* Cr outputs are built from a0 and the sums only */
               Cr[0] = a0 + sum_all;
               base = FNMS(KP250000000, sum_all, a0);
               spread = sum_a - sum_b;
               Cr[WS(csr, 1)] = FMA(KP559016994, spread, base);
               Cr[WS(csr, 2)] = FNMS(KP559016994, spread, base);
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 5, "r2cf_5", { 7, 2, 5, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_5) (planner *p) { X(kr2c_register) (p, r2cf_5, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 5 -name r2cf_5 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 6 FP multiplications,
|
||||
* (or, 9 additions, 3 multiplications, 3 fused multiply/add),
|
||||
* 17 stack variables, 4 constants, and 10 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-5 real-to-complex codelet, #else (non-FMA-preferring) branch.
 *
 * Same load and store footprint as the FMA branch: three loads through
 * R0, two through R1, stores to Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)],
 * Ci[WS(csi, 1)], Ci[WS(csi, 2)].  Runs v passes, advancing R0/R1 by ivs
 * and Cr/Ci by ovs after each pass.
 */
static void r2cf_5(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(20, rs), MAKE_VOLATILE_STRIDE(20, csr), MAKE_VOLATILE_STRIDE(20, csi)) {
               E a0, a1, a2, b0, b1;
               E sum_a, sum_b, dif_a, dif_b, sum_all, spread, base;

               /* gather the five inputs and their pairwise sums/differences */
               a0 = R0[0];
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];
               sum_a = b0 + a2;
               a1 = R0[WS(rs, 1)];
               b1 = R1[WS(rs, 1)];
               sum_b = a1 + b1;
               dif_a = a2 - b0;
               sum_all = sum_a + sum_b;
               dif_b = a1 - b1;

               /* Ci outputs: combinations of the two differences */
               Ci[WS(csi, 1)] = FNMS(KP587785252, dif_b, KP951056516 * dif_a);
               Ci[WS(csi, 2)] = FMA(KP587785252, dif_a, KP951056516 * dif_b);

               /* Cr outputs: a0 combined with the sums */
               Cr[0] = a0 + sum_all;
               spread = KP559016994 * (sum_a - sum_b);
               base = FNMS(KP250000000, sum_all, a0);
               Cr[WS(csr, 1)] = spread + base;
               Cr[WS(csr, 2)] = base - spread;
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 5, "r2cf_5", { 9, 3, 3, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_5) (planner *p) { X(kr2c_register) (p, r2cf_5, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
128
fftw-3.3.10/rdft/scalar/r2cf/r2cf_6.c
Normal file
128
fftw-3.3.10/rdft/scalar/r2cf/r2cf_6.c
Normal file
@@ -0,0 +1,128 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cf_6 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 14 FP additions, 4 FP multiplications,
|
||||
* (or, 12 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 17 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-6 real-to-complex codelet, FMA-preferring branch.
 *
 * Each pass loads three values through R0 and three through R1 (offsets
 * 0, WS(rs, 1), WS(rs, 2) on each), then stores Cr[0] and
 * Cr[WS(csr, 1..3)] plus Ci[WS(csi, 1)] and Ci[WS(csi, 2)].  v passes are
 * made; R0/R1 advance by ivs and Cr/Ci by ovs between passes.
 */
static void r2cf_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
               E a0, a1, a2, b0, b1, b2;
               E dif0, sum0, dif1, sum1, dif2, sum2, dif_pair, sum_pair;

               /* three input pairs, each reduced to a difference and a sum */
               a0 = R0[0];
               b1 = R1[WS(rs, 1)];
               dif0 = a0 - b1;
               sum0 = a0 + b1;
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];
               dif2 = a2 - b0;
               sum2 = a2 + b0;
               a1 = R0[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               dif1 = a1 - b2;
               sum1 = a1 + b2;

               /* outputs derived from the differences */
               Ci[WS(csi, 1)] = KP866025403 * (dif2 - dif1);
               dif_pair = dif1 + dif2;
               Cr[WS(csr, 1)] = FNMS(KP500000000, dif_pair, dif0);
               Cr[WS(csr, 3)] = dif0 + dif_pair;

               /* outputs derived from the sums */
               Ci[WS(csi, 2)] = KP866025403 * (sum1 - sum2);
               sum_pair = sum1 + sum2;
               Cr[WS(csr, 2)] = FNMS(KP500000000, sum_pair, sum0);
               Cr[0] = sum0 + sum_pair;
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 6, "r2cf_6", { 12, 2, 2, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_6) (planner *p) { X(kr2c_register) (p, r2cf_6, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 6 -name r2cf_6 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 14 FP additions, 4 FP multiplications,
|
||||
* (or, 12 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 17 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-6 real-to-complex codelet, #else (non-FMA-preferring) branch.
 *
 * Per pass: six loads (three via R0, three via R1), then six stores
 * (Cr[0], Cr[WS(csr, 1)], Cr[WS(csr, 2)], Cr[WS(csr, 3)], Ci[WS(csi, 1)],
 * Ci[WS(csi, 2)]).  The loop makes v passes, moving R0/R1 by ivs and
 * Cr/Ci by ovs after each one.
 */
static void r2cf_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
               E a0, a1, a2, b0, b1, b2;
               E dif0, sum0, dif1, sum1, dif2, sum2, dif_pair, sum_pair;

               /* pair up the inputs; keep both difference and sum of each pair */
               a0 = R0[0];
               b1 = R1[WS(rs, 1)];
               dif0 = a0 - b1;
               sum0 = a0 + b1;
               a2 = R0[WS(rs, 2)];
               b0 = R1[0];
               dif2 = a2 - b0;
               sum2 = a2 + b0;
               a1 = R0[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               dif1 = a1 - b2;
               sum1 = a1 + b2;

               /* difference-based outputs */
               Ci[WS(csi, 1)] = KP866025403 * (dif2 - dif1);
               dif_pair = dif1 + dif2;
               Cr[WS(csr, 1)] = FNMS(KP500000000, dif_pair, dif0);
               Cr[WS(csr, 3)] = dif0 + dif_pair;

               /* sum-based outputs */
               Ci[WS(csi, 2)] = KP866025403 * (sum1 - sum2);
               sum_pair = sum1 + sum2;
               Cr[WS(csr, 2)] = FNMS(KP500000000, sum_pair, sum0);
               Cr[0] = sum0 + sum_pair;
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 6, "r2cf_6", { 12, 2, 2, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_6) (planner *p) { X(kr2c_register) (p, r2cf_6, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
1408
fftw-3.3.10/rdft/scalar/r2cf/r2cf_64.c
Normal file
1408
fftw-3.3.10/rdft/scalar/r2cf/r2cf_64.c
Normal file
File diff suppressed because it is too large
Load Diff
148
fftw-3.3.10/rdft/scalar/r2cf/r2cf_7.c
Normal file
148
fftw-3.3.10/rdft/scalar/r2cf/r2cf_7.c
Normal file
@@ -0,0 +1,148 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cf_7 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 18 FP multiplications,
|
||||
* (or, 9 additions, 3 multiplications, 15 fused multiply/add),
|
||||
* 23 stack variables, 6 constants, and 14 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-7 real-to-complex codelet, FMA-preferring branch.
 *
 * Each pass loads four values through R0 (offsets 0 and WS(rs, 1..3)) and
 * three through R1 (offsets 0 and WS(rs, 1..2)), then stores Cr[0],
 * Cr[WS(csr, 1..3)] and Ci[WS(csi, 1..3)].  v passes are made, advancing
 * R0/R1 by ivs and Cr/Ci by ovs after each.
 */
static void r2cf_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
               E a0, a1, a2, a3, b0, b1, b2;
               E sum_x, sum_y, sum_z, dif_x, dif_y, dif_z;
               E mix_yz, mix_zx, mix_xy;
               E re1, re2, re3, im1, im2, im3;

               /* load phase: three input pairs plus the lone a0 */
               a0 = R0[0];
               b0 = R1[0];
               a3 = R0[WS(rs, 3)];
               sum_x = b0 + a3;
               b1 = R1[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               sum_z = b1 + a2;
               a1 = R0[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               sum_y = a1 + b2;
               mix_yz = FNMS(KP356895867, sum_z, sum_y);
               mix_zx = FNMS(KP356895867, sum_x, sum_z);
               dif_y = b2 - a1;
               dif_z = a2 - b1;
               dif_x = a3 - b0;
               mix_xy = FNMS(KP356895867, sum_y, sum_x);

               /* store phase: Cr from the sums, Ci from the differences */
               re3 = FNMS(KP692021471, mix_yz, sum_x);
               Cr[WS(csr, 3)] = FNMS(KP900968867, re3, a0);
               im3 = FNMS(KP554958132, dif_x, dif_y);
               Ci[WS(csi, 3)] = KP974927912 * (FNMS(KP801937735, im3, dif_z));
               re2 = FNMS(KP692021471, mix_zx, sum_y);
               Cr[WS(csr, 2)] = FNMS(KP900968867, re2, a0);
               im2 = FMA(KP554958132, dif_y, dif_z);
               Ci[WS(csi, 2)] = KP974927912 * (FNMS(KP801937735, im2, dif_x));
               Cr[0] = a0 + sum_x + sum_y + sum_z;
               re1 = FNMS(KP692021471, mix_xy, sum_z);
               Cr[WS(csr, 1)] = FNMS(KP900968867, re1, a0);
               im1 = FMA(KP554958132, dif_z, dif_x);
               Ci[WS(csi, 1)] = KP974927912 * (FMA(KP801937735, im1, dif_y));
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 7, "r2cf_7", { 9, 3, 15, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_7) (planner *p) { X(kr2c_register) (p, r2cf_7, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 7 -name r2cf_7 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 18 FP multiplications,
|
||||
* (or, 12 additions, 6 multiplications, 12 fused multiply/add),
|
||||
* 20 stack variables, 6 constants, and 14 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-7 real-to-complex codelet, #else (non-FMA-preferring) branch.
 *
 * Per pass: seven loads (four via R0, three via R1) followed by seven
 * stores (Cr[0], Cr[WS(csr, 1..3)], Ci[WS(csi, 1..3)]).  The loop makes v
 * passes, moving R0/R1 by ivs and Cr/Ci by ovs after each one.
 */
static void r2cf_7(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(28, rs), MAKE_VOLATILE_STRIDE(28, csr), MAKE_VOLATILE_STRIDE(28, csi)) {
               E a0, a1, a2, a3, b0, b1, b2;
               E sum_x, sum_y, sum_z, dif_x, dif_y, dif_z;

               /* load phase: a0 alone, then three pairs (sum and difference) */
               a0 = R0[0];
               b0 = R1[0];
               a3 = R0[WS(rs, 3)];
               sum_x = b0 + a3;
               dif_x = a3 - b0;
               a1 = R0[WS(rs, 1)];
               b2 = R1[WS(rs, 2)];
               sum_y = a1 + b2;
               dif_y = b2 - a1;
               b1 = R1[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               sum_z = b1 + a2;
               dif_z = a2 - b1;

               /* store phase: Ci uses only differences, Cr only a0 and sums */
               Ci[WS(csi, 2)] = FNMS(KP781831482, dif_z, KP974927912 * dif_x) - (KP433883739 * dif_y);
               Ci[WS(csi, 1)] = FMA(KP781831482, dif_x, KP974927912 * dif_y) + (KP433883739 * dif_z);
               Cr[WS(csr, 2)] = FMA(KP623489801, sum_z, a0) + FNMA(KP900968867, sum_y, KP222520933 * sum_x);
               Ci[WS(csi, 3)] = FMA(KP433883739, dif_x, KP974927912 * dif_z) - (KP781831482 * dif_y);
               Cr[WS(csr, 3)] = FMA(KP623489801, sum_y, a0) + FNMA(KP222520933, sum_z, KP900968867 * sum_x);
               Cr[WS(csr, 1)] = FMA(KP623489801, sum_x, a0) + FNMA(KP900968867, sum_z, KP222520933 * sum_y);
               Cr[0] = a0 + sum_x + sum_y + sum_z;
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 7, "r2cf_7", { 12, 6, 12, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_7) (planner *p) { X(kr2c_register) (p, r2cf_7, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
154
fftw-3.3.10/rdft/scalar/r2cf/r2cf_8.c
Normal file
154
fftw-3.3.10/rdft/scalar/r2cf/r2cf_8.c
Normal file
@@ -0,0 +1,154 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cf_8 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 20 FP additions, 4 FP multiplications,
|
||||
* (or, 16 additions, 0 multiplications, 4 fused multiply/add),
|
||||
* 14 stack variables, 1 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-8 real-to-complex codelet, FMA-preferring branch.
 *
 * Each pass loads four values through R0 and four through R1 (offsets 0
 * and WS(rs, 1..3) on each), then stores Cr[0], Cr[WS(csr, 1..4)] and
 * Ci[WS(csi, 1..3)].  v passes are made, advancing R0/R1 by ivs and
 * Cr/Ci by ovs after each.
 */
static void r2cf_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
               E a0, a1, a2, a3, b0, b1, b2, b3;
               E a_sum02, a_dif02, a_sum13, a_dif13;
               E b_sum31, b_dif31, b_sum02, b_dif02;
               E b_mix, b_cross, a_total, b_total;

               /* load phase: sums and differences of index pairs (0,2), (3,1), (1,3), (0,2) */
               a0 = R0[0];
               a2 = R0[WS(rs, 2)];
               a_sum02 = a0 + a2;
               a_dif02 = a0 - a2;
               b3 = R1[WS(rs, 3)];
               b1 = R1[WS(rs, 1)];
               b_dif31 = b3 - b1;
               b_sum31 = b3 + b1;
               a1 = R0[WS(rs, 1)];
               a3 = R0[WS(rs, 3)];
               a_sum13 = a1 + a3;
               a_dif13 = a1 - a3;
               b0 = R1[0];
               b2 = R1[WS(rs, 2)];
               b_dif02 = b0 - b2;
               b_sum02 = b0 + b2;

               /* outputs that need no multiplication */
               Cr[WS(csr, 2)] = a_sum02 - a_sum13;
               Ci[WS(csi, 2)] = b_sum31 - b_sum02;

               /* outputs scaled by KP707106781 */
               b_mix = b_dif02 + b_dif31;
               Cr[WS(csr, 3)] = FNMS(KP707106781, b_mix, a_dif02);
               Cr[WS(csr, 1)] = FMA(KP707106781, b_mix, a_dif02);
               b_cross = b_dif31 - b_dif02;
               Ci[WS(csi, 1)] = FMS(KP707106781, b_cross, a_dif13);
               Ci[WS(csi, 3)] = FMA(KP707106781, b_cross, a_dif13);

               /* outputs built from the totals of the R0 and R1 loads */
               a_total = a_sum02 + a_sum13;
               b_total = b_sum02 + b_sum31;
               Cr[WS(csr, 4)] = a_total - b_total;
               Cr[0] = a_total + b_total;
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 8, "r2cf_8", { 16, 0, 4, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_8) (planner *p) { X(kr2c_register) (p, r2cf_8, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 8 -name r2cf_8 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 20 FP additions, 2 FP multiplications,
|
||||
* (or, 20 additions, 2 multiplications, 0 fused multiply/add),
|
||||
* 14 stack variables, 1 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-8 real-to-complex codelet, #else (non-FMA-preferring) branch.
 *
 * Per pass: eight loads (four via R0, four via R1) followed by eight
 * stores (Cr[0], Cr[WS(csr, 1..4)], Ci[WS(csi, 1..3)]).  The loop makes v
 * passes, moving R0/R1 by ivs and Cr/Ci by ovs after each one.
 */
static void r2cf_8(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(32, rs), MAKE_VOLATILE_STRIDE(32, csr), MAKE_VOLATILE_STRIDE(32, csi)) {
               E a0, a1, a2, a3, b0, b1, b2, b3;
               E a_sum02, a_dif02, a_sum13, a_dif13;
               E b_sum31, b_dif31, b_sum02, b_dif02;
               E b_mix, b_cross, a_total, b_total;

               /* load phase: every input is read before the first store */
               a0 = R0[0];
               a2 = R0[WS(rs, 2)];
               a_sum02 = a0 + a2;
               a_dif02 = a0 - a2;
               b3 = R1[WS(rs, 3)];
               b1 = R1[WS(rs, 1)];
               b_dif31 = b3 - b1;
               b_sum31 = b3 + b1;
               a1 = R0[WS(rs, 1)];
               a3 = R0[WS(rs, 3)];
               a_sum13 = a1 + a3;
               a_dif13 = a1 - a3;
               b0 = R1[0];
               b2 = R1[WS(rs, 2)];
               b_dif02 = b0 - b2;
               b_sum02 = b0 + b2;

               /* outputs that need no multiplication */
               Cr[WS(csr, 2)] = a_sum02 - a_sum13;
               Ci[WS(csi, 2)] = b_sum31 - b_sum02;

               /* the only two multiplications: both by KP707106781 */
               b_mix = KP707106781 * (b_dif02 + b_dif31);
               Cr[WS(csr, 3)] = a_dif02 - b_mix;
               Cr[WS(csr, 1)] = a_dif02 + b_mix;
               b_cross = KP707106781 * (b_dif31 - b_dif02);
               Ci[WS(csi, 1)] = b_cross - a_dif13;
               Ci[WS(csi, 3)] = a_dif13 + b_cross;

               /* outputs built from the totals of the R0 and R1 loads */
               a_total = a_sum02 + a_sum13;
               b_total = b_sum02 + b_sum31;
               Cr[WS(csr, 4)] = a_total - b_total;
               Cr[0] = a_total + b_total;
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 8, "r2cf_8", { 20, 2, 0, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_8) (planner *p) { X(kr2c_register) (p, r2cf_8, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
217
fftw-3.3.10/rdft/scalar/r2cf/r2cf_9.c
Normal file
217
fftw-3.3.10/rdft/scalar/r2cf/r2cf_9.c
Normal file
@@ -0,0 +1,217 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:46:10 EDT 2021 */
|
||||
|
||||
#include "rdft/codelet-rdft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cf_9 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 38 FP additions, 30 FP multiplications,
|
||||
* (or, 12 additions, 4 multiplications, 26 fused multiply/add),
|
||||
* 48 stack variables, 18 constants, and 18 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * Size-9 real-to-complex codelet, FMA-preferring branch.
 *
 * Each pass loads five values through R0 (offsets 0 and WS(rs, 1..4)) and
 * four through R1 (offsets 0 and WS(rs, 1..3)), then stores Cr[0],
 * Cr[WS(csr, 1..4)] and Ci[WS(csi, 1..4)].  v passes are made, advancing
 * R0/R1 by ivs and Cr/Ci by ovs after each.
 *
 * The temporaries below carry neutral names (h*, w*, u*, s*, m*); they are
 * intermediate products of the generated expression graph and have no
 * individual meaning documented in this file.
 */
static void r2cf_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP907603734, +0.907603734547952313649323976213898122064543220);
     DK(KP347296355, +0.347296355333860697703433253538629592000751354);
     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
     DK(KP666666666, +0.666666666666666666666666666666666666666666667);
     DK(KP898197570, +0.898197570222573798468955502359086394667167570);
     DK(KP673648177, +0.673648177666930348851716626769314796000375677);
     DK(KP879385241, +0.879385241571816768108218554649462939872416269);
     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
     DK(KP394930843, +0.394930843634698457567117349190734585290304520);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP586256827, +0.586256827714544512072145703099641959914944179);
     DK(KP726681596, +0.726681596905677465811651808188092531873167623);
     DK(KP968908795, +0.968908795874236621082202410917456709164223497);
     DK(KP203604859, +0.203604859554852403062088995281827210665664861);
     DK(KP152703644, +0.152703644666139302296566746461370407999248646);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP184792530, +0.184792530904095372701352047572203755870913560);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
               E a0, a1, a2, a3, a4, b0, b1, b2, b3;
               E sum0, dif0, sum1, dif1, sum2, dif2;
               E grp0, grp1, grp2, grp12;
               E h0, h1, h2;
               E w1, w2, w3, w4, w5;
               E u1, u2, u3;
               E s1, s2, s3;
               E m1, m2, m3, m4, m5;

               /* load phase: a0 and b0/a1 alone, plus three sum/difference pairs */
               a0 = R0[0];
               b1 = R1[WS(rs, 1)];
               a3 = R0[WS(rs, 3)];
               sum0 = b1 + a3;
               dif0 = a3 - b1;
               b0 = R1[0];
               a1 = R0[WS(rs, 1)];
               a2 = R0[WS(rs, 2)];
               b3 = R1[WS(rs, 3)];
               sum1 = a2 + b3;
               dif1 = a2 - b3;
               b2 = R1[WS(rs, 2)];
               a4 = R0[WS(rs, 4)];
               sum2 = b2 + a4;
               dif2 = a4 - b2;

               /* intermediates shared by several outputs */
               grp1 = b0 + sum1;
               u1 = FMA(KP184792530, dif1, dif2);
               grp2 = a1 + sum2;
               h2 = FNMS(KP500000000, sum2, a1);
               w1 = FNMS(KP152703644, dif2, h2);
               w2 = FMA(KP203604859, h2, dif2);
               h1 = FMS(KP500000000, sum1, b0);
               w3 = FNMS(KP968908795, h1, dif1);
               w4 = FMA(KP726681596, dif1, h1);
               w5 = FMA(KP586256827, h1, dif2);

               /* outputs 0 and 3 depend only on the three group sums */
               Ci[WS(csi, 3)] = KP866025403 * (grp2 - grp1);
               grp0 = a0 + sum0;
               grp12 = grp1 + grp2;
               Cr[WS(csr, 3)] = FNMS(KP500000000, grp12, grp0);
               Cr[0] = grp0 + grp12;

               /* remaining Ci outputs */
               u2 = FMA(KP394930843, h2, dif0);
               u3 = FNMS(KP939692620, u1, u2);
               Ci[WS(csi, 2)] = KP984807753 * (FNMS(KP879385241, u3, h1));
               s1 = FMA(KP673648177, w3, w1);
               s2 = FMA(KP898197570, w4, w2);
               s3 = FMA(KP666666666, s1, s2);
               Ci[WS(csi, 1)] = -(KP984807753 * (FNMS(KP879385241, dif0, s1)));
               Ci[WS(csi, 4)] = KP866025403 * (FMA(KP852868531, s3, dif0));

               /* remaining Cr outputs, all offset from h0 */
               h0 = FNMS(KP500000000, sum0, a0);
               m1 = FNMS(KP347296355, w5, dif1);
               m2 = FNMS(KP907603734, m1, h2);
               m3 = FNMS(KP673648177, w3, w1);
               m4 = FNMS(KP898197570, w4, w2);
               m5 = FNMS(KP500000000, m4, m3);
               Cr[WS(csr, 2)] = FNMS(KP939692620, m2, h0);
               Cr[WS(csr, 1)] = FMA(KP852868531, m4, h0);
               Cr[WS(csr, 4)] = FMA(KP852868531, m5, h0);
          }
     }
}
|
||||
|
||||
static const kr2c_desc desc = { 9, "r2cf_9", { 12, 4, 26, 0 }, &GENUS };
|
||||
|
||||
void X(codelet_r2cf_9) (planner *p) { X(kr2c_register) (p, r2cf_9, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_r2cf.native -compact -variables 4 -pipeline-latency 4 -n 9 -name r2cf_9 -include rdft/scalar/r2cf.h */
|
||||
|
||||
/*
|
||||
* This function contains 38 FP additions, 26 FP multiplications,
|
||||
* (or, 21 additions, 9 multiplications, 17 fused multiply/add),
|
||||
* 36 stack variables, 14 constants, and 18 memory accesses
|
||||
*/
|
||||
#include "rdft/scalar/r2cf.h"
|
||||
|
||||
/*
 * r2cf_9 -- hard-coded real-to-complex codelet of size 9 (the variant
 * produced by genfft with -compact).
 *
 * The loop runs `v` times.  Each pass reads R0[0], R0[WS(rs, 1..4)],
 * R1[0] and R1[WS(rs, 1..3)] (nine loads), then writes Cr[0],
 * Cr[WS(csr, 1..4)] and Ci[WS(csi, 1..4)] (nine stores); Ci[0] is never
 * written.  After every pass R0 and R1 advance by `ivs` and Cr and Ci
 * advance by `ovs`.
 *
 * NOTE(review): R0/R1 presumably carry the even/odd-indexed input
 * samples -- confirm against the rdft codelet conventions.
 */
static void r2cf_9(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP939692620, +0.939692620785908384054109277324731469936208134);
     DK(KP296198132, +0.296198132726023843175338011893050938967728390);
     DK(KP342020143, +0.342020143325668733044099614682259580763083368);
     DK(KP813797681, +0.813797681349373692844693217248393223289101568);
     DK(KP984807753, +0.984807753012208059366743024589523013670643252);
     DK(KP150383733, +0.150383733180435296639271897612501926072238258);
     DK(KP642787609, +0.642787609686539326322643409907263432907559884);
     DK(KP663413948, +0.663413948168938396205421319635891297216863310);
     DK(KP852868531, +0.852868531952443209628250963940074071936020296);
     DK(KP173648177, +0.173648177666930348851716626769314796000375677);
     DK(KP556670399, +0.556670399226419366452912952047023132968291906);
     DK(KP766044443, +0.766044443118978035202392650555416673935832457);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          for (i = v; i > 0; i -= 1, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(36, rs), MAKE_VOLATILE_STRIDE(36, csr), MAKE_VOLATILE_STRIDE(36, csi)) {
               /* raw inputs, named after the array and index they come from */
               E r0_0, r0_1, r0_2, r0_3, r0_4;
               E r1_0, r1_1, r1_2, r1_3;
               /* group A: R1[WS(rs, 1)] and R0[WS(rs, 3)], combined with R0[0] */
               E sumA, difA, baseA, scaledDifA;
               /* group B: R0[WS(rs, 2)] and R1[WS(rs, 3)], combined with R1[0] */
               E pairB, sumB, difB, baseB;
               /* group C: R1[WS(rs, 2)] and R0[WS(rs, 4)], combined with R0[WS(rs, 1)] */
               E pairC, sumC, difC, baseC;
               /* cross-group intermediates feeding the outputs */
               E sum0A, sumBC;
               E mixB, mixC, mixSum;
               E crossB, crossC, crossSum;

               /*
                * All nine loads are issued before the first store, as in
                * the generated code.  NOTE(review): keep it that way in
                * case callers alias input and output buffers -- confirm.
                */
               r0_0 = R0[0];
               r1_1 = R1[WS(rs, 1)];
               r0_3 = R0[WS(rs, 3)];
               sumA = r1_1 + r0_3;
               difA = r0_3 - r1_1;

               r1_0 = R1[0];
               r0_2 = R0[WS(rs, 2)];
               r1_3 = R1[WS(rs, 3)];
               pairB = r0_2 + r1_3;
               sumB = r1_0 + pairB;
               difB = r1_3 - r0_2;
               baseB = FNMS(KP500000000, pairB, r1_0);

               r0_1 = R0[WS(rs, 1)];
               r1_2 = R1[WS(rs, 2)];
               r0_4 = R0[WS(rs, 4)];
               pairC = r1_2 + r0_4;
               sumC = r0_1 + pairC;
               baseC = FNMS(KP500000000, pairC, r0_1);
               difC = r0_4 - r1_2;

               /* outputs 0 and 3 depend only on the three group sums */
               Ci[WS(csi, 3)] = KP866025403 * (sumC - sumB);
               sum0A = r0_0 + sumA;
               sumBC = sumB + sumC;
               Cr[WS(csr, 3)] = FNMS(KP500000000, sumBC, sum0A);
               Cr[0] = sum0A + sumBC;

               /* outputs 1, 4 and 2; expression shapes are kept exactly
                  as generated so floating-point results do not change */
               scaledDifA = KP866025403 * difA;
               baseA = FNMS(KP500000000, sumA, r0_0);
               mixB = FMA(KP766044443, baseB, KP556670399 * difB);
               mixC = FMA(KP173648177, baseC, KP852868531 * difC);
               mixSum = mixB + mixC;
               crossB = FNMS(KP642787609, baseB, KP663413948 * difB);
               crossC = FNMS(KP984807753, baseC, KP150383733 * difC);
               crossSum = crossB + crossC;
               Cr[WS(csr, 1)] = baseA + mixSum;
               Ci[WS(csi, 1)] = scaledDifA + crossSum;
               Cr[WS(csr, 4)] = FMA(KP866025403, crossB - crossC, baseA) - (KP500000000 * mixSum);
               Ci[WS(csi, 4)] = FNMS(KP500000000, crossSum, KP866025403 * (difA + (mixC - mixB)));
               Ci[WS(csi, 2)] = FNMS(KP342020143, baseC, KP813797681 * difC) + FNMA(KP150383733, difB, KP984807753 * baseB) - scaledDifA;
               Cr[WS(csr, 2)] = FMA(KP173648177, baseB, baseA) + FNMA(KP296198132, difC, KP939692620 * baseC) - (KP852868531 * difB);
          }
     }
}
|
||||
|
||||
/*
 * Descriptor handed to the planner together with r2cf_9 (see the
 * registration function below): the leading 9 matches the codelet's
 * size, followed by its name, a count tuple and the genus.  The first
 * three counts (21, 9, 17) equal the additions, multiplications and
 * fused multiply/adds quoted in this branch's header comment.
 * NOTE(review): meaning of the trailing 0 is not visible here --
 * confirm against the kr2c_desc declaration.
 */
static const kr2c_desc desc = {
     9,
     "r2cf_9",
     { 21, 9, 17, 0 },
     &GENUS
};
|
||||
|
||||
/*
 * Registration entry point for this codelet: passes the r2cf_9 kernel
 * and its descriptor to X(kr2c_register) for planner `p`.
 */
void X(codelet_r2cf_9)(planner *p)
{
     X(kr2c_register)(p, r2cf_9, &desc);
}
|
||||
|
||||
#endif
|
||||
Reference in New Issue
Block a user