Updates
This commit is contained in:
96
fftw-3.3.10/dft/scalar/codelets/Makefile.am
Normal file
96
fftw-3.3.10/dft/scalar/codelets/Makefile.am
Normal file
@@ -0,0 +1,96 @@
|
||||
# This Makefile.am specifies a set of codelets, efficient transforms
|
||||
# of small sizes, that are used as building blocks (kernels) by FFTW
|
||||
# to build up large transforms, as well as the options for generating
|
||||
# and compiling them.
|
||||
|
||||
# You can customize FFTW for special needs, e.g. to handle certain
|
||||
# sizes more efficiently, by adding new codelets to the lists of those
|
||||
# included by default. If you change the list of codelets, any new
|
||||
# ones you added will be automatically generated when you run the
|
||||
# bootstrap script (see "Generating your own code" in the FFTW
|
||||
# manual).
|
||||
|
||||
###########################################################################
|
||||
# Add the top of the source tree to the preprocessor include path for every
# compilation in this directory.
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
# The codelets are collected into a single libtool library; the noinst_
# prefix means it is built but not installed on its own.
noinst_LTLIBRARIES = libdft_scalar_codelets.la
|
||||
|
||||
###########################################################################
|
||||
# n1_<n> is a hard-coded FFT of size <n> (base cases of FFT recursion)
|
||||
# Left out of the default build (commented out): n1_30.c n1_40.c n1_50.c
N1 = n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c \
     n1_9.c n1_10.c n1_11.c n1_12.c n1_13.c n1_14.c \
     n1_15.c n1_16.c n1_32.c n1_64.c \
     n1_20.c n1_25.c
|
||||
|
||||
###########################################################################
|
||||
# t1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
|
||||
# Left out of the default build (commented out): t1_30.c t1_40.c t1_50.c
T1 = t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c \
     t1_7.c t1_8.c t1_9.c t1_10.c t1_12.c \
     t1_15.c t1_16.c t1_32.c t1_64.c \
     t1_20.c t1_25.c
|
||||
|
||||
# t2_<r> is also a twiddle FFT, but instead of using a complete lookup table
|
||||
# of trig. functions, it partially generates the trig. values on the fly
|
||||
# (this is faster for large sizes).
|
||||
# Power-of-two radices first, then the radices that are multiples of five.
T2 = t2_4.c t2_8.c t2_16.c \
     t2_32.c t2_64.c \
     t2_5.c t2_10.c \
     t2_20.c t2_25.c
|
||||
|
||||
###########################################################################
|
||||
# The F (DIF) codelets are used for a kind of in-place transform algorithm,
|
||||
# but the planner seems to never (or hardly ever) use them on the machines
|
||||
# we have access to, preferring the Q codelets and the use of buffers
|
||||
# for sub-transforms. So, we comment them out, at least for now.
|
||||
|
||||
# f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step
|
||||
# Intentionally empty.  The disabled f1 codelets are:
#   f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c
#   f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
F1 =
|
||||
|
||||
# like f1, but partially generates its trig. table on the fly
|
||||
# Intentionally empty.  The disabled f2 codelets are:
#   f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
F2 =
|
||||
|
||||
###########################################################################
|
||||
# q1_<r> is <r> twiddle FFTs of size <r> (DIF step), where the output is
|
||||
# transposed. This is used for in-place transposes in sizes that are
|
||||
# divisible by <r>^2. These codelets have size ~ <r>^2, so you should
|
||||
# probably not use <r> bigger than 8 or so.
|
||||
Q1 = q1_2.c q1_4.c q1_8.c \
     q1_3.c q1_5.c q1_6.c
|
||||
|
||||
###########################################################################
|
||||
# Every codelet source, family by family, in the order the lists are declared.
ALL_CODELETS = \
    $(N1) \
    $(T1) $(T2) \
    $(F1) $(F2) \
    $(Q1)

# All codelets plus the codelet list file are treated as built sources, and
# together they are exactly the sources of the library.  $(CODLIST) is not set
# in this file; it is expected to come from the makefile fragment included
# below.
BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)

# Settings for the included fragment -- presumably the name of the solver
# table emitted into $(CODLIST) and the symbol-renaming macro to wrap it in;
# confirm against support/Makefile.codelets.
SOLVTAB_NAME = X(solvtab_dft_standard)
XRENAME = X

# Shared definitions and special rules for regenerating codelets.
include $(top_srcdir)/support/Makefile.codelets
|
||||
|
||||
if MAINTAINER_MODE
# Codelet regeneration rules; only active when configured in maintainer mode.
#
# Each rule pipes three things into the target file:
#   1. the prelude ($(PRELUDE_COMMANDS_DFT)) followed by the generator output,
#      with the generator run through $(TWOVERS);
#   2. $(ADD_DATE), which fills in the date placeholder;
#   3. $(INDENT), which reformats the result.
# The pattern stem ($*) is the transform size and is passed to the generator
# as "-n"; the codelet name passed as "-name" is the target name without ".c".
#
# $(PRELUDE_DFT) is listed as an explicit prerequisite of every rule:
# $(CODELET_DEPS) comes from support/Makefile.codelets and names $(PRELUDE),
# which does not appear to be defined there, while the recipes actually read
# $(PRELUDE_DFT).  Without the explicit prerequisite, editing the prelude
# would not cause the codelets to be regenerated.

# Generator options for each codelet family.
FLAGS_N1=$(DFT_FLAGS_COMMON)
FLAGS_T1=$(DFT_FLAGS_COMMON)
FLAGS_T2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_F1=$(DFT_FLAGS_COMMON)
FLAGS_F2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_Q1=$(DFT_FLAGS_COMMON) -reload-twiddle
FLAGS_Q2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles

# No-twiddle codelets.
n1_%.c: $(CODELET_DEPS) $(PRELUDE_DFT) $(GEN_NOTW)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "dft/scalar/n.h") | $(ADD_DATE) | $(INDENT) >$@

# Twiddle codelets; t2 differs from t1 only in its flags.
t1_%.c: $(CODELET_DEPS) $(PRELUDE_DFT) $(GEN_TWIDDLE)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@

t2_%.c: $(CODELET_DEPS) $(PRELUDE_DFT) $(GEN_TWIDDLE)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@

# Twiddle codelets generated with -dif; f2 differs from f1 only in its flags.
f1_%.c: $(CODELET_DEPS) $(PRELUDE_DFT) $(GEN_TWIDDLE)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@

f2_%.c: $(CODELET_DEPS) $(PRELUDE_DFT) $(GEN_TWIDDLE)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@

# Codelets produced by the twidsq generator with -dif; q2 differs from q1
# only in its flags.
q1_%.c: $(CODELET_DEPS) $(PRELUDE_DFT) $(GEN_TWIDSQ)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@

q2_%.c: $(CODELET_DEPS) $(PRELUDE_DFT) $(GEN_TWIDSQ)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@

endif # MAINTAINER_MODE
|
||||
994
fftw-3.3.10/dft/scalar/codelets/Makefile.in
Normal file
994
fftw-3.3.10/dft/scalar/codelets/Makefile.in
Normal file
@@ -0,0 +1,994 @@
|
||||
# Makefile.in generated by automake 1.16.3 from Makefile.am.
|
||||
# @configure_input@
|
||||
|
||||
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
|
||||
|
||||
# This Makefile.in is free software; the Free Software Foundation
|
||||
# gives unlimited permission to copy and/or distribute it,
|
||||
# with or without modifications, as long as this notice is preserved.
|
||||
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
|
||||
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
|
||||
# PARTICULAR PURPOSE.
|
||||
|
||||
@SET_MAKE@
|
||||
|
||||
# This Makefile.am specifies a set of codelets, efficient transforms
|
||||
# of small sizes, that are used as building blocks (kernels) by FFTW
|
||||
# to build up large transforms, as well as the options for generating
|
||||
# and compiling them.
|
||||
|
||||
# You can customize FFTW for special needs, e.g. to handle certain
|
||||
# sizes more efficiently, by adding new codelets to the lists of those
|
||||
# included by default. If you change the list of codelets, any new
|
||||
# ones you added will be automatically generated when you run the
|
||||
# bootstrap script (see "Generating your own code" in the FFTW
|
||||
# manual).
|
||||
|
||||
# -*- makefile -*-
|
||||
# This file contains special make rules to generate codelets.
|
||||
# Most of this file requires GNU make.
|
||||
|
||||
VPATH = @srcdir@
|
||||
# Shell fragment whose exit status reports whether GNU make is in use: it
# fails when $(MAKELEVEL) is empty, and otherwise succeeds if $(MAKE_HOST) is
# set, or if both $(MAKE_VERSION) and $(CURDIR) are set.
am__is_gnu_make = { \
|
||||
if test -z '$(MAKELEVEL)'; then \
|
||||
false; \
|
||||
elif test -n '$(MAKE_HOST)'; then \
|
||||
true; \
|
||||
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
|
||||
true; \
|
||||
else \
|
||||
false; \
|
||||
fi; \
|
||||
}
|
||||
am__make_running_with_option = \
|
||||
case $${target_option-} in \
|
||||
?) ;; \
|
||||
*) echo "am__make_running_with_option: internal error: invalid" \
|
||||
"target option '$${target_option-}' specified" >&2; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
has_opt=no; \
|
||||
sane_makeflags=$$MAKEFLAGS; \
|
||||
if $(am__is_gnu_make); then \
|
||||
sane_makeflags=$$MFLAGS; \
|
||||
else \
|
||||
case $$MAKEFLAGS in \
|
||||
*\\[\ \ ]*) \
|
||||
bs=\\; \
|
||||
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
|
||||
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
|
||||
esac; \
|
||||
fi; \
|
||||
skip_next=no; \
|
||||
strip_trailopt () \
|
||||
{ \
|
||||
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
|
||||
}; \
|
||||
for flg in $$sane_makeflags; do \
|
||||
test $$skip_next = yes && { skip_next=no; continue; }; \
|
||||
case $$flg in \
|
||||
*=*|--*) continue;; \
|
||||
-*I) strip_trailopt 'I'; skip_next=yes;; \
|
||||
-*I?*) strip_trailopt 'I';; \
|
||||
-*O) strip_trailopt 'O'; skip_next=yes;; \
|
||||
-*O?*) strip_trailopt 'O';; \
|
||||
-*l) strip_trailopt 'l'; skip_next=yes;; \
|
||||
-*l?*) strip_trailopt 'l';; \
|
||||
-[dEDm]) skip_next=yes;; \
|
||||
-[JT]) skip_next=yes;; \
|
||||
esac; \
|
||||
case $$flg in \
|
||||
*$$target_option*) has_opt=yes; break;; \
|
||||
esac; \
|
||||
done; \
|
||||
test $$has_opt = yes
|
||||
# Subshell test for the single-letter make option "n": sets target_option and
# delegates the check to am__make_running_with_option.
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
|
||||
# Same test for the single-letter make option "k".
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
|
||||
pkgdatadir = $(datadir)/@PACKAGE@
|
||||
pkgincludedir = $(includedir)/@PACKAGE@
|
||||
pkglibdir = $(libdir)/@PACKAGE@
|
||||
pkglibexecdir = $(libexecdir)/@PACKAGE@
|
||||
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
|
||||
install_sh_DATA = $(install_sh) -c -m 644
|
||||
install_sh_PROGRAM = $(install_sh) -c
|
||||
install_sh_SCRIPT = $(install_sh) -c
|
||||
INSTALL_HEADER = $(INSTALL_DATA)
|
||||
transform = $(program_transform_name)
|
||||
NORMAL_INSTALL = :
|
||||
PRE_INSTALL = :
|
||||
POST_INSTALL = :
|
||||
NORMAL_UNINSTALL = :
|
||||
PRE_UNINSTALL = :
|
||||
POST_UNINSTALL = :
|
||||
build_triplet = @build@
|
||||
host_triplet = @host@
|
||||
subdir = dft/scalar/codelets
|
||||
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
|
||||
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
|
||||
$(top_srcdir)/m4/acx_pthread.m4 \
|
||||
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
|
||||
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
|
||||
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
|
||||
$(top_srcdir)/m4/ax_gcc_version.m4 \
|
||||
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
|
||||
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
|
||||
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
|
||||
$(top_srcdir)/configure.ac
|
||||
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
|
||||
$(ACLOCAL_M4)
|
||||
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
|
||||
mkinstalldirs = $(install_sh) -d
|
||||
CONFIG_HEADER = $(top_builddir)/config.h
|
||||
CONFIG_CLEAN_FILES =
|
||||
CONFIG_CLEAN_VPATH_FILES =
|
||||
LTLIBRARIES = $(noinst_LTLIBRARIES)
|
||||
libdft_scalar_codelets_la_LIBADD =
|
||||
# Libtool object lists, one per source list: the .lo names below mirror the
# .c names in N1, T1, T2 and Q1.
am__objects_1 = n1_2.lo n1_3.lo n1_4.lo n1_5.lo n1_6.lo n1_7.lo \
|
||||
n1_8.lo n1_9.lo n1_10.lo n1_11.lo n1_12.lo n1_13.lo n1_14.lo \
|
||||
n1_15.lo n1_16.lo n1_32.lo n1_64.lo n1_20.lo n1_25.lo
|
||||
am__objects_2 = t1_2.lo t1_3.lo t1_4.lo t1_5.lo t1_6.lo t1_7.lo \
|
||||
t1_8.lo t1_9.lo t1_10.lo t1_12.lo t1_15.lo t1_16.lo t1_32.lo \
|
||||
t1_64.lo t1_20.lo t1_25.lo
|
||||
am__objects_3 = t2_4.lo t2_8.lo t2_16.lo t2_32.lo t2_64.lo t2_5.lo \
|
||||
t2_10.lo t2_20.lo t2_25.lo
|
||||
# Empty list; it appears twice in am__objects_6, in the positions where
# ALL_CODELETS has the (empty) F1 and F2 lists.
am__objects_4 =
|
||||
am__objects_5 = q1_2.lo q1_4.lo q1_8.lo q1_3.lo q1_5.lo q1_6.lo
|
||||
# Objects for ALL_CODELETS, in the same order as that variable.
am__objects_6 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
|
||||
$(am__objects_4) $(am__objects_4) $(am__objects_5)
|
||||
# Object for the codelet list source (CODLIST = codlist.c).
am__objects_7 = codlist.lo
|
||||
# Everything that goes into libdft_scalar_codelets.la.
am__objects_8 = $(am__objects_6) $(am__objects_7)
|
||||
am_libdft_scalar_codelets_la_OBJECTS = $(am__objects_8)
|
||||
libdft_scalar_codelets_la_OBJECTS = \
|
||||
$(am_libdft_scalar_codelets_la_OBJECTS)
|
||||
AM_V_lt = $(am__v_lt_@AM_V@)
|
||||
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
|
||||
am__v_lt_0 = --silent
|
||||
am__v_lt_1 =
|
||||
AM_V_P = $(am__v_P_@AM_V@)
|
||||
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
|
||||
am__v_P_0 = false
|
||||
am__v_P_1 = :
|
||||
AM_V_GEN = $(am__v_GEN_@AM_V@)
|
||||
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
|
||||
am__v_GEN_0 = @echo " GEN " $@;
|
||||
am__v_GEN_1 =
|
||||
AM_V_at = $(am__v_at_@AM_V@)
|
||||
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
|
||||
am__v_at_0 = @
|
||||
am__v_at_1 =
|
||||
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
|
||||
depcomp = $(SHELL) $(top_srcdir)/depcomp
|
||||
am__maybe_remake_depfiles = depfiles
|
||||
am__depfiles_remade = ./$(DEPDIR)/codlist.Plo ./$(DEPDIR)/n1_10.Plo \
|
||||
./$(DEPDIR)/n1_11.Plo ./$(DEPDIR)/n1_12.Plo \
|
||||
./$(DEPDIR)/n1_13.Plo ./$(DEPDIR)/n1_14.Plo \
|
||||
./$(DEPDIR)/n1_15.Plo ./$(DEPDIR)/n1_16.Plo \
|
||||
./$(DEPDIR)/n1_2.Plo ./$(DEPDIR)/n1_20.Plo \
|
||||
./$(DEPDIR)/n1_25.Plo ./$(DEPDIR)/n1_3.Plo \
|
||||
./$(DEPDIR)/n1_32.Plo ./$(DEPDIR)/n1_4.Plo \
|
||||
./$(DEPDIR)/n1_5.Plo ./$(DEPDIR)/n1_6.Plo \
|
||||
./$(DEPDIR)/n1_64.Plo ./$(DEPDIR)/n1_7.Plo \
|
||||
./$(DEPDIR)/n1_8.Plo ./$(DEPDIR)/n1_9.Plo ./$(DEPDIR)/q1_2.Plo \
|
||||
./$(DEPDIR)/q1_3.Plo ./$(DEPDIR)/q1_4.Plo ./$(DEPDIR)/q1_5.Plo \
|
||||
./$(DEPDIR)/q1_6.Plo ./$(DEPDIR)/q1_8.Plo \
|
||||
./$(DEPDIR)/t1_10.Plo ./$(DEPDIR)/t1_12.Plo \
|
||||
./$(DEPDIR)/t1_15.Plo ./$(DEPDIR)/t1_16.Plo \
|
||||
./$(DEPDIR)/t1_2.Plo ./$(DEPDIR)/t1_20.Plo \
|
||||
./$(DEPDIR)/t1_25.Plo ./$(DEPDIR)/t1_3.Plo \
|
||||
./$(DEPDIR)/t1_32.Plo ./$(DEPDIR)/t1_4.Plo \
|
||||
./$(DEPDIR)/t1_5.Plo ./$(DEPDIR)/t1_6.Plo \
|
||||
./$(DEPDIR)/t1_64.Plo ./$(DEPDIR)/t1_7.Plo \
|
||||
./$(DEPDIR)/t1_8.Plo ./$(DEPDIR)/t1_9.Plo \
|
||||
./$(DEPDIR)/t2_10.Plo ./$(DEPDIR)/t2_16.Plo \
|
||||
./$(DEPDIR)/t2_20.Plo ./$(DEPDIR)/t2_25.Plo \
|
||||
./$(DEPDIR)/t2_32.Plo ./$(DEPDIR)/t2_4.Plo \
|
||||
./$(DEPDIR)/t2_5.Plo ./$(DEPDIR)/t2_64.Plo \
|
||||
./$(DEPDIR)/t2_8.Plo
|
||||
am__mv = mv -f
|
||||
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
|
||||
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
|
||||
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
|
||||
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
|
||||
$(AM_CFLAGS) $(CFLAGS)
|
||||
AM_V_CC = $(am__v_CC_@AM_V@)
|
||||
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
|
||||
am__v_CC_0 = @echo " CC " $@;
|
||||
am__v_CC_1 =
|
||||
CCLD = $(CC)
|
||||
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
|
||||
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
|
||||
$(AM_LDFLAGS) $(LDFLAGS) -o $@
|
||||
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
|
||||
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
|
||||
am__v_CCLD_0 = @echo " CCLD " $@;
|
||||
am__v_CCLD_1 =
|
||||
SOURCES = $(libdft_scalar_codelets_la_SOURCES)
|
||||
DIST_SOURCES = $(libdft_scalar_codelets_la_SOURCES)
|
||||
am__can_run_installinfo = \
|
||||
case $$AM_UPDATE_INFO_DIR in \
|
||||
n|no|NO) false;; \
|
||||
*) (install-info --version) >/dev/null 2>&1;; \
|
||||
esac
|
||||
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
|
||||
# Read a list of newline-separated strings from the standard input,
|
||||
# and print each of them once, without duplicates. Input order is
|
||||
# *not* preserved.
|
||||
am__uniquify_input = $(AWK) '\
|
||||
BEGIN { nonempty = 0; } \
|
||||
{ items[$$0] = 1; nonempty = 1; } \
|
||||
END { if (nonempty) { for (i in items) print i; }; } \
|
||||
'
|
||||
# Make sure the list of sources is unique. This is necessary because,
|
||||
# e.g., the same source file might be shared among _SOURCES variables
|
||||
# for different programs/libraries.
|
||||
am__define_uniq_tagged_files = \
|
||||
list='$(am__tagged_files)'; \
|
||||
unique=`for i in $$list; do \
|
||||
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
|
||||
done | $(am__uniquify_input)`
|
||||
ETAGS = etags
|
||||
CTAGS = ctags
|
||||
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp \
|
||||
$(top_srcdir)/support/Makefile.codelets
|
||||
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
|
||||
ACLOCAL = @ACLOCAL@
|
||||
ALLOCA = @ALLOCA@
|
||||
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
|
||||
AMTAR = @AMTAR@
|
||||
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
|
||||
AR = @AR@
|
||||
AS = @AS@
|
||||
AUTOCONF = @AUTOCONF@
|
||||
AUTOHEADER = @AUTOHEADER@
|
||||
AUTOMAKE = @AUTOMAKE@
|
||||
AVX2_CFLAGS = @AVX2_CFLAGS@
|
||||
AVX512_CFLAGS = @AVX512_CFLAGS@
|
||||
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
|
||||
AVX_CFLAGS = @AVX_CFLAGS@
|
||||
AWK = @AWK@
|
||||
CC = @CC@
|
||||
CCDEPMODE = @CCDEPMODE@
|
||||
CFLAGS = @CFLAGS@
|
||||
CHECK_PL_OPTS = @CHECK_PL_OPTS@
|
||||
CPP = @CPP@
|
||||
CPPFLAGS = @CPPFLAGS@
|
||||
CYGPATH_W = @CYGPATH_W@
|
||||
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
|
||||
C_MPI_FINT = @C_MPI_FINT@
|
||||
DEFS = @DEFS@
|
||||
DEPDIR = @DEPDIR@
|
||||
DLLTOOL = @DLLTOOL@
|
||||
DSYMUTIL = @DSYMUTIL@
|
||||
DUMPBIN = @DUMPBIN@
|
||||
ECHO_C = @ECHO_C@
|
||||
ECHO_N = @ECHO_N@
|
||||
ECHO_T = @ECHO_T@
|
||||
EGREP = @EGREP@
|
||||
EXEEXT = @EXEEXT@
|
||||
F77 = @F77@
|
||||
FFLAGS = @FFLAGS@
|
||||
FGREP = @FGREP@
|
||||
FLIBS = @FLIBS@
|
||||
GREP = @GREP@
|
||||
INDENT = @INDENT@
|
||||
INSTALL = @INSTALL@
|
||||
INSTALL_DATA = @INSTALL_DATA@
|
||||
INSTALL_PROGRAM = @INSTALL_PROGRAM@
|
||||
INSTALL_SCRIPT = @INSTALL_SCRIPT@
|
||||
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
|
||||
KCVI_CFLAGS = @KCVI_CFLAGS@
|
||||
LD = @LD@
|
||||
LDFLAGS = @LDFLAGS@
|
||||
LIBOBJS = @LIBOBJS@
|
||||
LIBQUADMATH = @LIBQUADMATH@
|
||||
LIBS = @LIBS@
|
||||
LIBTOOL = @LIBTOOL@
|
||||
LIPO = @LIPO@
|
||||
LN_S = @LN_S@
|
||||
LTLIBOBJS = @LTLIBOBJS@
|
||||
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
|
||||
MAINT = @MAINT@
|
||||
MAKEINFO = @MAKEINFO@
|
||||
MANIFEST_TOOL = @MANIFEST_TOOL@
|
||||
MKDIR_P = @MKDIR_P@
|
||||
MPICC = @MPICC@
|
||||
MPILIBS = @MPILIBS@
|
||||
MPIRUN = @MPIRUN@
|
||||
NEON_CFLAGS = @NEON_CFLAGS@
|
||||
NM = @NM@
|
||||
NMEDIT = @NMEDIT@
|
||||
OBJDUMP = @OBJDUMP@
|
||||
OBJEXT = @OBJEXT@
|
||||
OCAMLBUILD = @OCAMLBUILD@
|
||||
OPENMP_CFLAGS = @OPENMP_CFLAGS@
|
||||
OTOOL = @OTOOL@
|
||||
OTOOL64 = @OTOOL64@
|
||||
PACKAGE = @PACKAGE@
|
||||
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
|
||||
PACKAGE_NAME = @PACKAGE_NAME@
|
||||
PACKAGE_STRING = @PACKAGE_STRING@
|
||||
PACKAGE_TARNAME = @PACKAGE_TARNAME@
|
||||
PACKAGE_URL = @PACKAGE_URL@
|
||||
PACKAGE_VERSION = @PACKAGE_VERSION@
|
||||
PATH_SEPARATOR = @PATH_SEPARATOR@
|
||||
POW_LIB = @POW_LIB@
|
||||
PRECISION = @PRECISION@
|
||||
PREC_SUFFIX = @PREC_SUFFIX@
|
||||
PTHREAD_CC = @PTHREAD_CC@
|
||||
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
|
||||
PTHREAD_LIBS = @PTHREAD_LIBS@
|
||||
RANLIB = @RANLIB@
|
||||
SED = @SED@
|
||||
SET_MAKE = @SET_MAKE@
|
||||
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
|
||||
SHELL = @SHELL@
|
||||
SSE2_CFLAGS = @SSE2_CFLAGS@
|
||||
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
|
||||
STRIP = @STRIP@
|
||||
THREADLIBS = @THREADLIBS@
|
||||
VERSION = @VERSION@
|
||||
VSX_CFLAGS = @VSX_CFLAGS@
|
||||
abs_builddir = @abs_builddir@
|
||||
abs_srcdir = @abs_srcdir@
|
||||
abs_top_builddir = @abs_top_builddir@
|
||||
abs_top_srcdir = @abs_top_srcdir@
|
||||
ac_ct_AR = @ac_ct_AR@
|
||||
ac_ct_CC = @ac_ct_CC@
|
||||
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
|
||||
ac_ct_F77 = @ac_ct_F77@
|
||||
acx_pthread_config = @acx_pthread_config@
|
||||
am__include = @am__include@
|
||||
am__leading_dot = @am__leading_dot@
|
||||
am__quote = @am__quote@
|
||||
am__tar = @am__tar@
|
||||
am__untar = @am__untar@
|
||||
bindir = @bindir@
|
||||
build = @build@
|
||||
build_alias = @build_alias@
|
||||
build_cpu = @build_cpu@
|
||||
build_os = @build_os@
|
||||
build_vendor = @build_vendor@
|
||||
builddir = @builddir@
|
||||
datadir = @datadir@
|
||||
datarootdir = @datarootdir@
|
||||
docdir = @docdir@
|
||||
dvidir = @dvidir@
|
||||
exec_prefix = @exec_prefix@
|
||||
host = @host@
|
||||
host_alias = @host_alias@
|
||||
host_cpu = @host_cpu@
|
||||
host_os = @host_os@
|
||||
host_vendor = @host_vendor@
|
||||
htmldir = @htmldir@
|
||||
includedir = @includedir@
|
||||
infodir = @infodir@
|
||||
install_sh = @install_sh@
|
||||
libdir = @libdir@
|
||||
libexecdir = @libexecdir@
|
||||
localedir = @localedir@
|
||||
localstatedir = @localstatedir@
|
||||
mandir = @mandir@
|
||||
mkdir_p = @mkdir_p@
|
||||
oldincludedir = @oldincludedir@
|
||||
pdfdir = @pdfdir@
|
||||
prefix = @prefix@
|
||||
program_transform_name = @program_transform_name@
|
||||
psdir = @psdir@
|
||||
runstatedir = @runstatedir@
|
||||
sbindir = @sbindir@
|
||||
sharedstatedir = @sharedstatedir@
|
||||
srcdir = @srcdir@
|
||||
sysconfdir = @sysconfdir@
|
||||
target_alias = @target_alias@
|
||||
top_build_prefix = @top_build_prefix@
|
||||
top_builddir = @top_builddir@
|
||||
top_srcdir = @top_srcdir@
|
||||
|
||||
###########################################################################
|
||||
AM_CPPFLAGS = -I $(top_srcdir)
|
||||
noinst_LTLIBRARIES = libdft_scalar_codelets.la
|
||||
|
||||
###########################################################################
|
||||
# n1_<n> is a hard-coded FFT of size <n> (base cases of FFT recursion)
|
||||
N1 = n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c n1_9.c n1_10.c \
|
||||
n1_11.c n1_12.c n1_13.c n1_14.c n1_15.c n1_16.c n1_32.c n1_64.c \
|
||||
n1_20.c n1_25.c # n1_30.c n1_40.c n1_50.c
|
||||
|
||||
|
||||
###########################################################################
|
||||
# t1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
|
||||
T1 = t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c t1_7.c t1_8.c t1_9.c \
|
||||
t1_10.c t1_12.c t1_15.c t1_16.c t1_32.c t1_64.c \
|
||||
t1_20.c t1_25.c # t1_30.c t1_40.c t1_50.c
|
||||
|
||||
|
||||
# t2_<r> is also a twiddle FFT, but instead of using a complete lookup table
|
||||
# of trig. functions, it partially generates the trig. values on the fly
|
||||
# (this is faster for large sizes).
|
||||
T2 = t2_4.c t2_8.c t2_16.c t2_32.c t2_64.c \
|
||||
t2_5.c t2_10.c t2_20.c t2_25.c
|
||||
|
||||
|
||||
###########################################################################
|
||||
# The F (DIF) codelets are used for a kind of in-place transform algorithm,
|
||||
# but the planner seems to never (or hardly ever) use them on the machines
|
||||
# we have access to, preferring the Q codelets and the use of buffers
|
||||
# for sub-transforms. So, we comment them out, at least for now.
|
||||
|
||||
# f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step
|
||||
F1 = # f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
|
||||
|
||||
# like f1, but partially generates its trig. table on the fly
|
||||
F2 = # f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
|
||||
|
||||
###########################################################################
|
||||
# q1_<r> is <r> twiddle FFTs of size <r> (DIF step), where the output is
|
||||
# transposed. This is used for in-place transposes in sizes that are
|
||||
# divisible by <r>^2. These codelets have size ~ <r>^2, so you should
|
||||
# probably not use <r> bigger than 8 or so.
|
||||
Q1 = q1_2.c q1_4.c q1_8.c q1_3.c q1_5.c q1_6.c
|
||||
|
||||
###########################################################################
|
||||
ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
|
||||
BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
|
||||
libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
|
||||
SOLVTAB_NAME = X(solvtab_dft_standard)
|
||||
XRENAME = X
|
||||
CODLIST = codlist.c
|
||||
CODELET_NAME = codelet_
|
||||
|
||||
#INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
|
||||
# Settings for the codelet regeneration rules; every line in this group is
# guarded by the maintainer-mode conditional.
# Wrapper script that the regeneration recipes run the generators through.
@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
|
||||
# Code generator executables, all located in the genfft build directory.
@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
|
||||
@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
|
||||
@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
|
||||
# Prelude files; PRELUDE_COMMANDS_DFT / PRELUDE_COMMANDS_RDFT below cat them
# after the copyright file.
@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
|
||||
# sed filter that substitutes the output of `date` for the date placeholder
# in the text piped through it.
@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
|
||||
@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
|
||||
# NOTE(review): $(PRELUDE) is not defined in the part of this file that was
# reviewed (only PRELUDE_DFT and PRELUDE_RDFT are), so this presumably expands
# to just $(COPYRIGHT) -- confirm against support/Makefile.codelets.
@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE)
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
|
||||
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
|
||||
# Generator flags shared by all codelets; the DFT and RDFT variants add the
# same -pipeline-latency setting on top.
@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
|
||||
@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
|
||||
@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
|
||||
|
||||
# special rules for regenerating codelets.
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_N1 = $(DFT_FLAGS_COMMON)
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_T1 = $(DFT_FLAGS_COMMON)
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_T2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_F1 = $(DFT_FLAGS_COMMON)
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_F2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_Q1 = $(DFT_FLAGS_COMMON) -reload-twiddle
|
||||
@MAINTAINER_MODE_TRUE@FLAGS_Q2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
|
||||
# Entry point for a plain "make": the BUILT_SOURCES files are prerequisites,
# so they are brought up to date first; the actual build is then done by
# re-invoking make on the all-am target.
all: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) all-am
|
||||
|
||||
.SUFFIXES:
|
||||
.SUFFIXES: .c .lo .o .obj
|
||||
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
|
||||
@for dep in $?; do \
|
||||
case '$(am__configure_deps)' in \
|
||||
*$$dep*) \
|
||||
( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
|
||||
&& { if test -f $@; then exit 0; else break; fi; }; \
|
||||
exit 1;; \
|
||||
esac; \
|
||||
done; \
|
||||
echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/scalar/codelets/Makefile'; \
|
||||
$(am__cd) $(top_srcdir) && \
|
||||
$(AUTOMAKE) --gnu dft/scalar/codelets/Makefile
|
||||
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
|
||||
@case '$?' in \
|
||||
*config.status*) \
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
|
||||
*) \
|
||||
echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
|
||||
cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
|
||||
esac;
|
||||
$(top_srcdir)/support/Makefile.codelets $(am__empty):
|
||||
|
||||
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
|
||||
$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
|
||||
cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
|
||||
$(am__aclocal_m4_deps):
|
||||
|
||||
# Remove the libraries listed in noinst_LTLIBRARIES, then remove a file named
# so_locations from each directory that holds one of them ("." for a library
# name without a directory part).  The first command is prefixed with "-", so
# its failure is ignored.
clean-noinstLTLIBRARIES:
|
||||
-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
|
||||
@list='$(noinst_LTLIBRARIES)'; \
|
||||
locs=`for p in $$list; do echo $$p; done | \
|
||||
sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
|
||||
sort -u`; \
|
||||
test -z "$$locs" || { \
|
||||
echo rm -f $${locs}; \
|
||||
rm -f $${locs}; \
|
||||
}
|
||||
|
||||
libdft_scalar_codelets.la: $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_DEPENDENCIES) $(EXTRA_libdft_scalar_codelets_la_DEPENDENCIES)
|
||||
$(AM_V_CCLD)$(LINK) $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_LIBADD) $(LIBS)
|
||||
|
||||
mostlyclean-compile:
|
||||
-rm -f *.$(OBJEXT)
|
||||
|
||||
distclean-compile:
|
||||
-rm -f *.tab.c
|
||||
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_10.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_11.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_12.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_13.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_14.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_15.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_16.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_20.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_25.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_3.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_32.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_4.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_5.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_6.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_64.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_7.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_8.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_9.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_3.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_4.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_5.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_6.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_8.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_10.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_12.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_15.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_16.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_2.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_20.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_25.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_3.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_32.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_4.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_5.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_6.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_64.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_7.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_8.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_9.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_10.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_16.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_20.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_25.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_32.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_4.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_5.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_64.Plo@am__quote@ # am--include-marker
|
||||
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_8.Plo@am__quote@ # am--include-marker
|
||||
|
||||
# Create a placeholder for every dependency file named in
# $(am__depfiles_remade): make sure the target's directory exists, then write
# a one-line '# dummy' stub to the temporary file "$@-t" and move it onto the
# target with $(am__mv).
$(am__depfiles_remade):
|
||||
@$(MKDIR_P) $(@D)
|
||||
@echo '# dummy' >$@-t && $(am__mv) $@-t $@
|
||||
|
||||
am--depfiles: $(am__depfiles_remade)
|
||||
|
||||
# Suffix rule: compile a C source ($<) into a plain object file ($@).
# The @...@ prefixes are substitution markers (presumably filled in by
# configure) that select which recipe lines are active:
#  * am__fastdepCC_TRUE lines run $(COMPILE) with -MT/-MD/-MP/-MF so that
#    dependency information is written to $(DEPDIR)/$*.Tpo, and the following
#    line renames that file to $(DEPDIR)/$*.Po.
#  * am__fastdepCC_FALSE lines run a plain "$(COMPILE) -c -o $@ $<"; when
#    AMDEP_TRUE is active as well, that command is preceded by the
#    source/object/libtool=no settings and the $(CCDEPMODE) $(depcomp) wrapper.
.c.o:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<
|
||||
|
||||
# Suffix rule: compile a C source into a .obj object file.
# Same three recipe variants as the .c.o rule (fast dependency tracking,
# $(depcomp)-based tracking, or a plain compile), except that the source path
# is passed to the compiler through `$(CYGPATH_W) '$<'`.
.c.obj:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
|
||||
|
||||
# Suffix rule: compile a C source into a libtool object (.lo).
# Mirrors the .c.o rule but compiles with $(LTCOMPILE), stores the dependency
# file as $(DEPDIR)/$*.Plo (the files removed again by distclean and
# maintainer-clean), and passes libtool=yes on the $(depcomp) path.
.c.lo:
|
||||
@am__fastdepCC_TRUE@ $(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
|
||||
@am__fastdepCC_TRUE@ $(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ $(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
|
||||
@AMDEP_TRUE@@am__fastdepCC_FALSE@ DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
|
||||
@am__fastdepCC_FALSE@ $(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<
|
||||
|
||||
# Remove the libtool object files (*.lo) from the current directory.
# The leading '-' tells make to carry on even if rm reports an error.
mostlyclean-libtool:
|
||||
-rm -f *.lo
|
||||
|
||||
# Remove the .libs and _libs directories from the current directory.
# The leading '-' tells make to carry on even if rm reports an error.
clean-libtool:
|
||||
-rm -rf .libs _libs
|
||||
|
||||
ID: $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); mkid -fID $$unique
|
||||
tags: tags-am
|
||||
TAGS: tags
|
||||
|
||||
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
set x; \
|
||||
here=`pwd`; \
|
||||
$(am__define_uniq_tagged_files); \
|
||||
shift; \
|
||||
if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
|
||||
test -n "$$unique" || unique=$$empty_fix; \
|
||||
if test $$# -gt 0; then \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
"$$@" $$unique; \
|
||||
else \
|
||||
$(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
|
||||
$$unique; \
|
||||
fi; \
|
||||
fi
|
||||
ctags: ctags-am
|
||||
|
||||
CTAGS: ctags
|
||||
# Run $(CTAGS) over the file list that $(am__define_uniq_tagged_files) leaves
# in the shell variable "unique". Nothing is run when both $(CTAGS_ARGS) and
# that list are empty (the `test -z ... ||` guard).
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
|
||||
$(am__define_uniq_tagged_files); \
|
||||
test -z "$(CTAGS_ARGS)$$unique" \
|
||||
|| $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
|
||||
$$unique
|
||||
|
||||
# Run gtags from $(top_srcdir), passing it the absolute path of
# $(top_builddir) (captured in the shell variable "here") as its final
# argument. The steps are chained with && so a failing cd stops the recipe.
GTAGS:
|
||||
here=`$(am__cd) $(top_builddir) && pwd` \
|
||||
&& $(am__cd) $(top_srcdir) \
|
||||
&& gtags -i $(GTAGS_ARGS) "$$here"
|
||||
cscopelist: cscopelist-am
|
||||
|
||||
# Append one path per entry of $(am__tagged_files) to
# $(top_builddir)/cscope.files.
# A file that exists relative to the current directory is listed as
# $(subdir)/<file>; any other file is listed as $$sdir/<file>, where sdir is
# $(srcdir) itself when that looks absolute (leading / or \, or a
# drive-letter prefix such as C:/) and $(subdir)/$(srcdir) otherwise.
cscopelist-am: $(am__tagged_files)
|
||||
list='$(am__tagged_files)'; \
|
||||
case "$(srcdir)" in \
|
||||
[\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
|
||||
*) sdir=$(subdir)/$(srcdir) ;; \
|
||||
esac; \
|
||||
for i in $$list; do \
|
||||
if test -f "$$i"; then \
|
||||
echo "$(subdir)/$$i"; \
|
||||
else \
|
||||
echo "$$sdir/$$i"; \
|
||||
fi; \
|
||||
done >> $(top_builddir)/cscope.files
|
||||
|
||||
distclean-tags:
|
||||
-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
|
||||
|
||||
distdir: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) distdir-am
|
||||
|
||||
distdir-am: $(DISTFILES)
|
||||
@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
|
||||
list='$(DISTFILES)'; \
|
||||
dist_files=`for file in $$list; do echo $$file; done | \
|
||||
sed -e "s|^$$srcdirstrip/||;t" \
|
||||
-e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
|
||||
case $$dist_files in \
|
||||
*/*) $(MKDIR_P) `echo "$$dist_files" | \
|
||||
sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
|
||||
sort -u` ;; \
|
||||
esac; \
|
||||
for file in $$dist_files; do \
|
||||
if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
|
||||
if test -d $$d/$$file; then \
|
||||
dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
|
||||
if test -d "$(distdir)/$$file"; then \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
|
||||
cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
|
||||
find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
|
||||
fi; \
|
||||
cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
|
||||
else \
|
||||
test -f "$(distdir)/$$file" \
|
||||
|| cp -p $$d/$$file "$(distdir)/$$file" \
|
||||
|| exit 1; \
|
||||
fi; \
|
||||
done
|
||||
check-am: all-am
|
||||
check: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) check-am
|
||||
all-am: Makefile $(LTLIBRARIES)
|
||||
installdirs:
|
||||
install: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) install-am
|
||||
install-exec: $(BUILT_SOURCES)
|
||||
$(MAKE) $(AM_MAKEFLAGS) install-exec-am
|
||||
install-data: install-data-am
|
||||
uninstall: uninstall-am
|
||||
|
||||
install-am: all-am
|
||||
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
|
||||
|
||||
installcheck: installcheck-am
|
||||
# Re-invoke "$(MAKE) install" with INSTALL_PROGRAM and install_sh_PROGRAM both
# set to $(INSTALL_STRIP_PROGRAM) and INSTALL_STRIP_FLAG=-s.
# When $(STRIP) is non-empty, its value is additionally handed down as
# STRIPPROG through INSTALL_PROGRAM_ENV; when it is empty that extra setting
# is omitted.
install-strip:
|
||||
if test -z '$(STRIP)'; then \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
install; \
|
||||
else \
|
||||
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
|
||||
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
|
||||
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
|
||||
fi
|
||||
mostlyclean-generic:
|
||||
|
||||
clean-generic:
|
||||
|
||||
# Remove $(CONFIG_CLEAN_FILES) when that list is non-empty.
# $(CONFIG_CLEAN_VPATH_FILES) is removed only when the list is non-empty and
# $(srcdir) is not "." (i.e. the build directory differs from the source
# directory). Both commands are '-'-prefixed, so failures are ignored.
distclean-generic:
|
||||
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
|
||||
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
|
||||
|
||||
# Print a two-line warning that this target is meant for maintainers, then
# delete $(BUILT_SOURCES) when that list is non-empty. The removal is
# '-'-prefixed, so a failure does not abort the clean.
maintainer-clean-generic:
|
||||
@echo "This command is intended for maintainers to use"
|
||||
@echo "it deletes files that may require special tools to rebuild."
|
||||
-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
|
||||
clean: clean-am
|
||||
|
||||
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
|
||||
mostlyclean-am
|
||||
|
||||
distclean: distclean-am
|
||||
-rm -f ./$(DEPDIR)/codlist.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_10.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_11.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_12.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_13.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_14.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_15.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_16.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_20.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_25.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_32.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_64.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_7.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_9.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_10.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_12.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_15.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_16.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_20.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_25.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_32.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_64.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_7.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_9.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_10.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_16.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_20.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_25.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_32.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_4.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_5.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_64.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_8.Plo
|
||||
-rm -f Makefile
|
||||
distclean-am: clean-am distclean-compile distclean-generic \
|
||||
distclean-tags
|
||||
|
||||
dvi: dvi-am
|
||||
|
||||
dvi-am:
|
||||
|
||||
html: html-am
|
||||
|
||||
html-am:
|
||||
|
||||
info: info-am
|
||||
|
||||
info-am:
|
||||
|
||||
install-data-am:
|
||||
|
||||
install-dvi: install-dvi-am
|
||||
|
||||
install-dvi-am:
|
||||
|
||||
install-exec-am:
|
||||
|
||||
install-html: install-html-am
|
||||
|
||||
install-html-am:
|
||||
|
||||
install-info: install-info-am
|
||||
|
||||
install-info-am:
|
||||
|
||||
install-man:
|
||||
|
||||
install-pdf: install-pdf-am
|
||||
|
||||
install-pdf-am:
|
||||
|
||||
install-ps: install-ps-am
|
||||
|
||||
install-ps-am:
|
||||
|
||||
installcheck-am:
|
||||
|
||||
maintainer-clean: maintainer-clean-am
|
||||
-rm -f ./$(DEPDIR)/codlist.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_10.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_11.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_12.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_13.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_14.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_15.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_16.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_20.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_25.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_32.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_64.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_7.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/n1_9.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/q1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_10.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_12.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_15.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_16.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_2.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_20.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_25.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_3.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_32.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_4.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_5.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_6.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_64.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_7.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_8.Plo
|
||||
-rm -f ./$(DEPDIR)/t1_9.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_10.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_16.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_20.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_25.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_32.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_4.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_5.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_64.Plo
|
||||
-rm -f ./$(DEPDIR)/t2_8.Plo
|
||||
-rm -f Makefile
|
||||
maintainer-clean-am: distclean-am maintainer-clean-generic \
|
||||
maintainer-clean-local
|
||||
|
||||
mostlyclean: mostlyclean-am
|
||||
|
||||
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
|
||||
mostlyclean-libtool
|
||||
|
||||
pdf: pdf-am
|
||||
|
||||
pdf-am:
|
||||
|
||||
ps: ps-am
|
||||
|
||||
ps-am:
|
||||
|
||||
uninstall-am:
|
||||
|
||||
.MAKE: all check install install-am install-exec install-strip
|
||||
|
||||
.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
|
||||
clean-generic clean-libtool clean-noinstLTLIBRARIES \
|
||||
cscopelist-am ctags ctags-am distclean distclean-compile \
|
||||
distclean-generic distclean-libtool distclean-tags distdir dvi \
|
||||
dvi-am html html-am info info-am install install-am \
|
||||
install-data install-data-am install-dvi install-dvi-am \
|
||||
install-exec install-exec-am install-html install-html-am \
|
||||
install-info install-info-am install-man install-pdf \
|
||||
install-pdf-am install-ps install-ps-am install-strip \
|
||||
installcheck installcheck-am installdirs maintainer-clean \
|
||||
maintainer-clean-generic maintainer-clean-local mostlyclean \
|
||||
mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
|
||||
pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am
|
||||
|
||||
.PRECIOUS: Makefile
|
||||
|
||||
|
||||
# only delete codlist.c in 'make maintainer-clean', since it is included in the dist
|
||||
# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
|
||||
maintainer-clean-local:
|
||||
rm -f $(CODLIST)
|
||||
|
||||
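# Rule to build $(CODLIST) (maintainer mode only). The generated file contains
# the kernel/ifftw.h include, the $(INCLUDE_SIMD_HEADER) line, one extern
# declaration per entry of $(ALL_CODELETS), and the $(SOLVTAB_NAME) table that
# lists those same codelets in the same order, terminated by SOLVTAB_END.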
|
||||
@MAINTAINER_MODE_TRUE@$(CODLIST): Makefile
|
||||
@MAINTAINER_MODE_TRUE@ ( \
|
||||
@MAINTAINER_MODE_TRUE@ echo "#include \"kernel/ifftw.h\""; \
|
||||
@MAINTAINER_MODE_TRUE@ echo $(INCLUDE_SIMD_HEADER); \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
|
||||
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
|
||||
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);"; \
|
||||
@MAINTAINER_MODE_TRUE@ fi \
|
||||
@MAINTAINER_MODE_TRUE@ done; \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ echo; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "extern const solvtab $(SOLVTAB_NAME);"; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "const solvtab $(SOLVTAB_NAME) = {"; \
|
||||
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
|
||||
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
|
||||
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
|
||||
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),"; \
|
||||
@MAINTAINER_MODE_TRUE@ fi \
|
||||
@MAINTAINER_MODE_TRUE@ done; \
|
||||
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB_END"; \
|
||||
@MAINTAINER_MODE_TRUE@ echo "};"; \
|
||||
@MAINTAINER_MODE_TRUE@ ) >$@
|
||||
|
||||
# cancel the hideous builtin rules that cause an infinite loop
|
||||
@MAINTAINER_MODE_TRUE@%: %.o
|
||||
@MAINTAINER_MODE_TRUE@%: %.s
|
||||
@MAINTAINER_MODE_TRUE@%: %.c
|
||||
@MAINTAINER_MODE_TRUE@%: %.S
|
||||
|
||||
# (maintainer mode) Generate n1_<n>.c, the hard-coded FFT of size <n>:
# run $(GEN_NOTW) with $(FLAGS_N1), the pattern stem as -n and n1_<n> as
# -name, then pipe the output through $(ADD_DATE) and $(INDENT) into the
# target. $(TWOVERS) wraps the generator call; presumably it is what yields
# the FMA and non-FMA variants seen in the generated files -- confirm against
# its definition.
@MAINTAINER_MODE_TRUE@n1_%.c: $(CODELET_DEPS) $(GEN_NOTW)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "dft/scalar/n.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# (maintainer mode) Generate t1_<r>.c, the size-<r> twiddle codelet:
# run $(GEN_TWIDDLE) with $(FLAGS_T1), the pattern stem as -n and t1_<r> as
# -name, including dft/scalar/t.h, then pipe the output through $(ADD_DATE)
# and $(INDENT) into the target.
@MAINTAINER_MODE_TRUE@t1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# (maintainer mode) Generate t2_<r>.c. Uses the same generator
# ($(GEN_TWIDDLE)) and include file (dft/scalar/t.h) as the t1 rule; the only
# differences are $(FLAGS_T2) and the t2_ name prefix.
@MAINTAINER_MODE_TRUE@t2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# (maintainer mode) Generate f1_<r>.c: run $(GEN_TWIDDLE) with $(FLAGS_F1)
# and the extra -dif option, the pattern stem as -n and f1_<r> as -name,
# including dft/scalar/f.h, then pipe through $(ADD_DATE) and $(INDENT).
@MAINTAINER_MODE_TRUE@f1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# (maintainer mode) Generate f2_<r>.c. Identical to the f1 rule
# ($(GEN_TWIDDLE) with -dif, dft/scalar/f.h) apart from $(FLAGS_F2) and the
# f2_ name prefix.
@MAINTAINER_MODE_TRUE@f2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# (maintainer mode) Generate q1_<r>.c: run $(GEN_TWIDSQ) with $(FLAGS_Q1)
# and -dif, the pattern stem as -n and q1_<r> as -name, including
# dft/scalar/q.h, then pipe through $(ADD_DATE) and $(INDENT).
@MAINTAINER_MODE_TRUE@q1_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# (maintainer mode) Generate q2_<r>.c. Identical to the q1 rule
# ($(GEN_TWIDSQ) with -dif, dft/scalar/q.h) apart from $(FLAGS_Q2) and the
# q2_ name prefix.
@MAINTAINER_MODE_TRUE@q2_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
|
||||
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
.NOEXPORT:
|
||||
109
fftw-3.3.10/dft/scalar/codelets/codlist.c
Normal file
109
fftw-3.3.10/dft/scalar/codelets/codlist.c
Normal file
@@ -0,0 +1,109 @@
|
||||
#include "kernel/ifftw.h"
|
||||
|
||||
|
||||
extern void X(codelet_n1_2)(planner *);
|
||||
extern void X(codelet_n1_3)(planner *);
|
||||
extern void X(codelet_n1_4)(planner *);
|
||||
extern void X(codelet_n1_5)(planner *);
|
||||
extern void X(codelet_n1_6)(planner *);
|
||||
extern void X(codelet_n1_7)(planner *);
|
||||
extern void X(codelet_n1_8)(planner *);
|
||||
extern void X(codelet_n1_9)(planner *);
|
||||
extern void X(codelet_n1_10)(planner *);
|
||||
extern void X(codelet_n1_11)(planner *);
|
||||
extern void X(codelet_n1_12)(planner *);
|
||||
extern void X(codelet_n1_13)(planner *);
|
||||
extern void X(codelet_n1_14)(planner *);
|
||||
extern void X(codelet_n1_15)(planner *);
|
||||
extern void X(codelet_n1_16)(planner *);
|
||||
extern void X(codelet_n1_32)(planner *);
|
||||
extern void X(codelet_n1_64)(planner *);
|
||||
extern void X(codelet_n1_20)(planner *);
|
||||
extern void X(codelet_n1_25)(planner *);
|
||||
extern void X(codelet_t1_2)(planner *);
|
||||
extern void X(codelet_t1_3)(planner *);
|
||||
extern void X(codelet_t1_4)(planner *);
|
||||
extern void X(codelet_t1_5)(planner *);
|
||||
extern void X(codelet_t1_6)(planner *);
|
||||
extern void X(codelet_t1_7)(planner *);
|
||||
extern void X(codelet_t1_8)(planner *);
|
||||
extern void X(codelet_t1_9)(planner *);
|
||||
extern void X(codelet_t1_10)(planner *);
|
||||
extern void X(codelet_t1_12)(planner *);
|
||||
extern void X(codelet_t1_15)(planner *);
|
||||
extern void X(codelet_t1_16)(planner *);
|
||||
extern void X(codelet_t1_32)(planner *);
|
||||
extern void X(codelet_t1_64)(planner *);
|
||||
extern void X(codelet_t1_20)(planner *);
|
||||
extern void X(codelet_t1_25)(planner *);
|
||||
extern void X(codelet_t2_4)(planner *);
|
||||
extern void X(codelet_t2_8)(planner *);
|
||||
extern void X(codelet_t2_16)(planner *);
|
||||
extern void X(codelet_t2_32)(planner *);
|
||||
extern void X(codelet_t2_64)(planner *);
|
||||
extern void X(codelet_t2_5)(planner *);
|
||||
extern void X(codelet_t2_10)(planner *);
|
||||
extern void X(codelet_t2_20)(planner *);
|
||||
extern void X(codelet_t2_25)(planner *);
|
||||
extern void X(codelet_q1_2)(planner *);
|
||||
extern void X(codelet_q1_4)(planner *);
|
||||
extern void X(codelet_q1_8)(planner *);
|
||||
extern void X(codelet_q1_3)(planner *);
|
||||
extern void X(codelet_q1_5)(planner *);
|
||||
extern void X(codelet_q1_6)(planner *);
|
||||
|
||||
|
||||
extern const solvtab X(solvtab_dft_standard);
|
||||
const solvtab X(solvtab_dft_standard) = {
|
||||
SOLVTAB(X(codelet_n1_2)),
|
||||
SOLVTAB(X(codelet_n1_3)),
|
||||
SOLVTAB(X(codelet_n1_4)),
|
||||
SOLVTAB(X(codelet_n1_5)),
|
||||
SOLVTAB(X(codelet_n1_6)),
|
||||
SOLVTAB(X(codelet_n1_7)),
|
||||
SOLVTAB(X(codelet_n1_8)),
|
||||
SOLVTAB(X(codelet_n1_9)),
|
||||
SOLVTAB(X(codelet_n1_10)),
|
||||
SOLVTAB(X(codelet_n1_11)),
|
||||
SOLVTAB(X(codelet_n1_12)),
|
||||
SOLVTAB(X(codelet_n1_13)),
|
||||
SOLVTAB(X(codelet_n1_14)),
|
||||
SOLVTAB(X(codelet_n1_15)),
|
||||
SOLVTAB(X(codelet_n1_16)),
|
||||
SOLVTAB(X(codelet_n1_32)),
|
||||
SOLVTAB(X(codelet_n1_64)),
|
||||
SOLVTAB(X(codelet_n1_20)),
|
||||
SOLVTAB(X(codelet_n1_25)),
|
||||
SOLVTAB(X(codelet_t1_2)),
|
||||
SOLVTAB(X(codelet_t1_3)),
|
||||
SOLVTAB(X(codelet_t1_4)),
|
||||
SOLVTAB(X(codelet_t1_5)),
|
||||
SOLVTAB(X(codelet_t1_6)),
|
||||
SOLVTAB(X(codelet_t1_7)),
|
||||
SOLVTAB(X(codelet_t1_8)),
|
||||
SOLVTAB(X(codelet_t1_9)),
|
||||
SOLVTAB(X(codelet_t1_10)),
|
||||
SOLVTAB(X(codelet_t1_12)),
|
||||
SOLVTAB(X(codelet_t1_15)),
|
||||
SOLVTAB(X(codelet_t1_16)),
|
||||
SOLVTAB(X(codelet_t1_32)),
|
||||
SOLVTAB(X(codelet_t1_64)),
|
||||
SOLVTAB(X(codelet_t1_20)),
|
||||
SOLVTAB(X(codelet_t1_25)),
|
||||
SOLVTAB(X(codelet_t2_4)),
|
||||
SOLVTAB(X(codelet_t2_8)),
|
||||
SOLVTAB(X(codelet_t2_16)),
|
||||
SOLVTAB(X(codelet_t2_32)),
|
||||
SOLVTAB(X(codelet_t2_64)),
|
||||
SOLVTAB(X(codelet_t2_5)),
|
||||
SOLVTAB(X(codelet_t2_10)),
|
||||
SOLVTAB(X(codelet_t2_20)),
|
||||
SOLVTAB(X(codelet_t2_25)),
|
||||
SOLVTAB(X(codelet_q1_2)),
|
||||
SOLVTAB(X(codelet_q1_4)),
|
||||
SOLVTAB(X(codelet_q1_8)),
|
||||
SOLVTAB(X(codelet_q1_3)),
|
||||
SOLVTAB(X(codelet_q1_5)),
|
||||
SOLVTAB(X(codelet_q1_6)),
|
||||
SOLVTAB_END
|
||||
};
|
||||
362
fftw-3.3.10/dft/scalar/codelets/n1_10.c
Normal file
362
fftw-3.3.10/dft/scalar/codelets/n1_10.c
Normal file
@@ -0,0 +1,362 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 84 FP additions, 36 FP multiplications,
|
||||
* (or, 48 additions, 0 multiplications, 36 fused multiply/add),
|
||||
* 41 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
|
||||
E T3, Tj, TN, T1b, TU, TV, T1j, T1i, Tm, Tp, Tq, Ta, Th, Ti, TA;
|
||||
E TH, T17, T14, T1c, T1d, T1e, TO, TP, TQ;
|
||||
{
|
||||
E T1, T2, TL, TM;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 5)];
|
||||
T3 = T1 - T2;
|
||||
Tj = T1 + T2;
|
||||
TL = ii[0];
|
||||
TM = ii[WS(is, 5)];
|
||||
TN = TL - TM;
|
||||
T1b = TL + TM;
|
||||
}
|
||||
{
|
||||
E T6, Tk, Tg, To, T9, Tl, Td, Tn;
|
||||
{
|
||||
E T4, T5, Te, Tf;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 7)];
|
||||
T6 = T4 - T5;
|
||||
Tk = T4 + T5;
|
||||
Te = ri[WS(is, 6)];
|
||||
Tf = ri[WS(is, 1)];
|
||||
Tg = Te - Tf;
|
||||
To = Te + Tf;
|
||||
}
|
||||
{
|
||||
E T7, T8, Tb, Tc;
|
||||
T7 = ri[WS(is, 8)];
|
||||
T8 = ri[WS(is, 3)];
|
||||
T9 = T7 - T8;
|
||||
Tl = T7 + T8;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 9)];
|
||||
Td = Tb - Tc;
|
||||
Tn = Tb + Tc;
|
||||
}
|
||||
TU = T6 - T9;
|
||||
TV = Td - Tg;
|
||||
T1j = Tk - Tl;
|
||||
T1i = Tn - To;
|
||||
Tm = Tk + Tl;
|
||||
Tp = Tn + To;
|
||||
Tq = Tm + Tp;
|
||||
Ta = T6 + T9;
|
||||
Th = Td + Tg;
|
||||
Ti = Ta + Th;
|
||||
}
|
||||
{
|
||||
E Tw, T15, TG, T13, Tz, T16, TD, T12;
|
||||
{
|
||||
E Tu, Tv, TE, TF;
|
||||
Tu = ii[WS(is, 2)];
|
||||
Tv = ii[WS(is, 7)];
|
||||
Tw = Tu - Tv;
|
||||
T15 = Tu + Tv;
|
||||
TE = ii[WS(is, 6)];
|
||||
TF = ii[WS(is, 1)];
|
||||
TG = TE - TF;
|
||||
T13 = TE + TF;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, TB, TC;
|
||||
Tx = ii[WS(is, 8)];
|
||||
Ty = ii[WS(is, 3)];
|
||||
Tz = Tx - Ty;
|
||||
T16 = Tx + Ty;
|
||||
TB = ii[WS(is, 4)];
|
||||
TC = ii[WS(is, 9)];
|
||||
TD = TB - TC;
|
||||
T12 = TB + TC;
|
||||
}
|
||||
TA = Tw - Tz;
|
||||
TH = TD - TG;
|
||||
T17 = T15 - T16;
|
||||
T14 = T12 - T13;
|
||||
T1c = T15 + T16;
|
||||
T1d = T12 + T13;
|
||||
T1e = T1c + T1d;
|
||||
TO = Tw + Tz;
|
||||
TP = TD + TG;
|
||||
TQ = TO + TP;
|
||||
}
|
||||
ro[WS(os, 5)] = T3 + Ti;
|
||||
io[WS(os, 5)] = TN + TQ;
|
||||
ro[0] = Tj + Tq;
|
||||
io[0] = T1b + T1e;
|
||||
{
|
||||
E TI, TK, Tt, TJ, Tr, Ts;
|
||||
TI = FMA(KP618033988, TH, TA);
|
||||
TK = FNMS(KP618033988, TA, TH);
|
||||
Tr = FNMS(KP250000000, Ti, T3);
|
||||
Ts = Ta - Th;
|
||||
Tt = FMA(KP559016994, Ts, Tr);
|
||||
TJ = FNMS(KP559016994, Ts, Tr);
|
||||
ro[WS(os, 9)] = FNMS(KP951056516, TI, Tt);
|
||||
ro[WS(os, 3)] = FMA(KP951056516, TK, TJ);
|
||||
ro[WS(os, 1)] = FMA(KP951056516, TI, Tt);
|
||||
ro[WS(os, 7)] = FNMS(KP951056516, TK, TJ);
|
||||
}
|
||||
{
|
||||
E TW, TY, TT, TX, TR, TS;
|
||||
TW = FMA(KP618033988, TV, TU);
|
||||
TY = FNMS(KP618033988, TU, TV);
|
||||
TR = FNMS(KP250000000, TQ, TN);
|
||||
TS = TO - TP;
|
||||
TT = FMA(KP559016994, TS, TR);
|
||||
TX = FNMS(KP559016994, TS, TR);
|
||||
io[WS(os, 1)] = FNMS(KP951056516, TW, TT);
|
||||
io[WS(os, 7)] = FMA(KP951056516, TY, TX);
|
||||
io[WS(os, 9)] = FMA(KP951056516, TW, TT);
|
||||
io[WS(os, 3)] = FNMS(KP951056516, TY, TX);
|
||||
}
|
||||
{
|
||||
E T18, T1a, T11, T19, TZ, T10;
|
||||
T18 = FNMS(KP618033988, T17, T14);
|
||||
T1a = FMA(KP618033988, T14, T17);
|
||||
TZ = FNMS(KP250000000, Tq, Tj);
|
||||
T10 = Tm - Tp;
|
||||
T11 = FNMS(KP559016994, T10, TZ);
|
||||
T19 = FMA(KP559016994, T10, TZ);
|
||||
ro[WS(os, 2)] = FNMS(KP951056516, T18, T11);
|
||||
ro[WS(os, 6)] = FMA(KP951056516, T1a, T19);
|
||||
ro[WS(os, 8)] = FMA(KP951056516, T18, T11);
|
||||
ro[WS(os, 4)] = FNMS(KP951056516, T1a, T19);
|
||||
}
|
||||
{
|
||||
E T1k, T1m, T1h, T1l, T1f, T1g;
|
||||
T1k = FNMS(KP618033988, T1j, T1i);
|
||||
T1m = FMA(KP618033988, T1i, T1j);
|
||||
T1f = FNMS(KP250000000, T1e, T1b);
|
||||
T1g = T1c - T1d;
|
||||
T1h = FNMS(KP559016994, T1g, T1f);
|
||||
T1l = FMA(KP559016994, T1g, T1f);
|
||||
io[WS(os, 2)] = FMA(KP951056516, T1k, T1h);
|
||||
io[WS(os, 6)] = FNMS(KP951056516, T1m, T1l);
|
||||
io[WS(os, 8)] = FNMS(KP951056516, T1k, T1h);
|
||||
io[WS(os, 4)] = FMA(KP951056516, T1m, T1l);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA variant of this codelet: size 10, name "n1_10".
 * The { 48, 0, 36, 0 } group matches the operation counts quoted in this
 * variant's header comment (48 additions, 0 multiplications, 36 fused
 * multiply/adds). The layout of the remaining fields is defined by
 * kdft_desc, which is declared outside this file. */
static const kdft_desc desc = { 10, "n1_10", { 48, 0, 36, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Externally visible entry point for this codelet (FMA variant): hands the
 * n1_10 kernel and its descriptor to X(kdft_register) for planner p. */
void X(codelet_n1_10) (planner *p) { X(kdft_register) (p, n1_10, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 84 FP additions, 24 FP multiplications,
|
||||
* (or, 72 additions, 12 multiplications, 12 fused multiply/add),
|
||||
* 41 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
{
|
||||
INT i;
|
||||
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
|
||||
E T3, Tj, TQ, T1e, TU, TV, T1c, T1b, Tm, Tp, Tq, Ta, Th, Ti, TA;
|
||||
E TH, T17, T14, T1f, T1g, T1h, TL, TM, TR;
|
||||
{
|
||||
E T1, T2, TO, TP;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 5)];
|
||||
T3 = T1 - T2;
|
||||
Tj = T1 + T2;
|
||||
TO = ii[0];
|
||||
TP = ii[WS(is, 5)];
|
||||
TQ = TO - TP;
|
||||
T1e = TO + TP;
|
||||
}
|
||||
{
|
||||
E T6, Tk, Tg, To, T9, Tl, Td, Tn;
|
||||
{
|
||||
E T4, T5, Te, Tf;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 7)];
|
||||
T6 = T4 - T5;
|
||||
Tk = T4 + T5;
|
||||
Te = ri[WS(is, 6)];
|
||||
Tf = ri[WS(is, 1)];
|
||||
Tg = Te - Tf;
|
||||
To = Te + Tf;
|
||||
}
|
||||
{
|
||||
E T7, T8, Tb, Tc;
|
||||
T7 = ri[WS(is, 8)];
|
||||
T8 = ri[WS(is, 3)];
|
||||
T9 = T7 - T8;
|
||||
Tl = T7 + T8;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 9)];
|
||||
Td = Tb - Tc;
|
||||
Tn = Tb + Tc;
|
||||
}
|
||||
TU = T6 - T9;
|
||||
TV = Td - Tg;
|
||||
T1c = Tk - Tl;
|
||||
T1b = Tn - To;
|
||||
Tm = Tk + Tl;
|
||||
Tp = Tn + To;
|
||||
Tq = Tm + Tp;
|
||||
Ta = T6 + T9;
|
||||
Th = Td + Tg;
|
||||
Ti = Ta + Th;
|
||||
}
|
||||
{
|
||||
E Tw, T15, TG, T13, Tz, T16, TD, T12;
|
||||
{
|
||||
E Tu, Tv, TE, TF;
|
||||
Tu = ii[WS(is, 2)];
|
||||
Tv = ii[WS(is, 7)];
|
||||
Tw = Tu - Tv;
|
||||
T15 = Tu + Tv;
|
||||
TE = ii[WS(is, 6)];
|
||||
TF = ii[WS(is, 1)];
|
||||
TG = TE - TF;
|
||||
T13 = TE + TF;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, TB, TC;
|
||||
Tx = ii[WS(is, 8)];
|
||||
Ty = ii[WS(is, 3)];
|
||||
Tz = Tx - Ty;
|
||||
T16 = Tx + Ty;
|
||||
TB = ii[WS(is, 4)];
|
||||
TC = ii[WS(is, 9)];
|
||||
TD = TB - TC;
|
||||
T12 = TB + TC;
|
||||
}
|
||||
TA = Tw - Tz;
|
||||
TH = TD - TG;
|
||||
T17 = T15 - T16;
|
||||
T14 = T12 - T13;
|
||||
T1f = T15 + T16;
|
||||
T1g = T12 + T13;
|
||||
T1h = T1f + T1g;
|
||||
TL = Tw + Tz;
|
||||
TM = TD + TG;
|
||||
TR = TL + TM;
|
||||
}
|
||||
ro[WS(os, 5)] = T3 + Ti;
|
||||
io[WS(os, 5)] = TQ + TR;
|
||||
ro[0] = Tj + Tq;
|
||||
io[0] = T1e + T1h;
|
||||
{
|
||||
E TI, TK, Tt, TJ, Tr, Ts;
|
||||
TI = FMA(KP951056516, TA, KP587785252 * TH);
|
||||
TK = FNMS(KP587785252, TA, KP951056516 * TH);
|
||||
Tr = KP559016994 * (Ta - Th);
|
||||
Ts = FNMS(KP250000000, Ti, T3);
|
||||
Tt = Tr + Ts;
|
||||
TJ = Ts - Tr;
|
||||
ro[WS(os, 9)] = Tt - TI;
|
||||
ro[WS(os, 3)] = TJ + TK;
|
||||
ro[WS(os, 1)] = Tt + TI;
|
||||
ro[WS(os, 7)] = TJ - TK;
|
||||
}
|
||||
{
|
||||
E TW, TY, TT, TX, TN, TS;
|
||||
TW = FMA(KP951056516, TU, KP587785252 * TV);
|
||||
TY = FNMS(KP587785252, TU, KP951056516 * TV);
|
||||
TN = KP559016994 * (TL - TM);
|
||||
TS = FNMS(KP250000000, TR, TQ);
|
||||
TT = TN + TS;
|
||||
TX = TS - TN;
|
||||
io[WS(os, 1)] = TT - TW;
|
||||
io[WS(os, 7)] = TY + TX;
|
||||
io[WS(os, 9)] = TW + TT;
|
||||
io[WS(os, 3)] = TX - TY;
|
||||
}
|
||||
{
|
||||
E T18, T1a, T11, T19, TZ, T10;
|
||||
T18 = FNMS(KP587785252, T17, KP951056516 * T14);
|
||||
T1a = FMA(KP951056516, T17, KP587785252 * T14);
|
||||
TZ = FNMS(KP250000000, Tq, Tj);
|
||||
T10 = KP559016994 * (Tm - Tp);
|
||||
T11 = TZ - T10;
|
||||
T19 = T10 + TZ;
|
||||
ro[WS(os, 2)] = T11 - T18;
|
||||
ro[WS(os, 6)] = T19 + T1a;
|
||||
ro[WS(os, 8)] = T11 + T18;
|
||||
ro[WS(os, 4)] = T19 - T1a;
|
||||
}
|
||||
{
|
||||
E T1d, T1l, T1k, T1m, T1i, T1j;
|
||||
T1d = FNMS(KP587785252, T1c, KP951056516 * T1b);
|
||||
T1l = FMA(KP951056516, T1c, KP587785252 * T1b);
|
||||
T1i = FNMS(KP250000000, T1h, T1e);
|
||||
T1j = KP559016994 * (T1f - T1g);
|
||||
T1k = T1i - T1j;
|
||||
T1m = T1j + T1i;
|
||||
io[WS(os, 2)] = T1d + T1k;
|
||||
io[WS(os, 6)] = T1m - T1l;
|
||||
io[WS(os, 8)] = T1k - T1d;
|
||||
io[WS(os, 4)] = T1l + T1m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor handed to the planner for the n1_10 codelet: first field 10
 * (presumably the transform size), the codelet name, then the list
 * {72, 12, 12, 0} (presumably operation counts: additions, multiplications,
 * fused multiply/adds, other -- confirm against the kdft_desc declaration),
 * followed by &GENUS and four zero fields whose meaning is not visible here. */
static const kdft_desc desc = { 10, "n1_10", { 72, 12, 12, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration hook: passes the n1_10 kernel and its descriptor to the
 * planner p through X(kdft_register). */
void X(codelet_n1_10) (planner *p) { X(kdft_register) (p, n1_10, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
426
fftw-3.3.10/dft/scalar/codelets/n1_11.c
Normal file
426
fftw-3.3.10/dft/scalar/codelets/n1_11.c
Normal file
@@ -0,0 +1,426 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 140 FP additions, 110 FP multiplications,
|
||||
* (or, 30 additions, 0 multiplications, 110 fused multiply/add),
|
||||
* 62 stack variables, 10 constants, and 44 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_11, FMA variant (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): hard-coded size-11 transform.
 *
 * ri, ii   - input arrays, read at [0] and at [WS(is, k)] for k = 1..10
 *            (presumably real and imaginary parts -- confirm in the headers)
 * ro, io   - output arrays, written at [0] and at [WS(os, k)] for k = 1..10
 * is, os   - input / output strides handed to WS()
 * v        - number of transforms; the loop body runs once per transform
 * ivs, ovs - added to the input / output pointers after every transform
 *
 * Machine-generated code: the statement order and temporaries come from
 * genfft. Regenerate rather than editing by hand.
 */
static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used by the FMA/FNMS chains below. */
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
|
||||
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
|
||||
DK(KP918985947, +0.918985947228994779780736114132655398124909697);
|
||||
DK(KP830830026, +0.830830026003772851058548298459246407048009821);
|
||||
DK(KP876768831, +0.876768831002589333891339807079336796764054852);
|
||||
DK(KP778434453, +0.778434453334651800608337670740821884709317477);
|
||||
DK(KP715370323, +0.715370323453429719112414662767260662417897278);
|
||||
DK(KP521108558, +0.521108558113202722944698153526659300680427422);
|
||||
DK(KP634356270, +0.634356270682424498893150776899916060542806975);
|
||||
DK(KP342584725, +0.342584725681637509502641509861112333758894680);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; all four pointers advance by ivs / ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
|
||||
E T1, T1f, T4, T1u, Tg, T1q, T7, T1t, Ta, T1s, Td, T1r, Ti, TP, T26;
|
||||
E TG, T1X, T1O, T1w, TY, T1F, T17, To, T1i, TA, T1k, Tr, T1h, Tu, T1j;
|
||||
E Tx, T1g, TC, TU, T21, TL, T1S, T1J, T1m, T13, T1A, T1c;
|
||||
/* Load the inputs. Element k is always combined with element 11-k into
 * one sum and one difference, for both ri and ii. */
T1 = ri[0];
|
||||
T1f = ii[0];
|
||||
{
|
||||
E T5, T6, Tp, Tq;
|
||||
{
|
||||
E T2, T3, Te, Tf;
|
||||
T2 = ri[WS(is, 1)];
|
||||
T3 = ri[WS(is, 10)];
|
||||
T4 = T2 + T3;
|
||||
T1u = T3 - T2;
|
||||
Te = ri[WS(is, 5)];
|
||||
Tf = ri[WS(is, 6)];
|
||||
Tg = Te + Tf;
|
||||
T1q = Tf - Te;
|
||||
}
|
||||
T5 = ri[WS(is, 2)];
|
||||
T6 = ri[WS(is, 9)];
|
||||
T7 = T5 + T6;
|
||||
T1t = T6 - T5;
|
||||
{
|
||||
E T8, T9, Tb, Tc;
|
||||
T8 = ri[WS(is, 3)];
|
||||
T9 = ri[WS(is, 8)];
|
||||
Ta = T8 + T9;
|
||||
T1s = T9 - T8;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 7)];
|
||||
Td = Tb + Tc;
|
||||
T1r = Tc - Tb;
|
||||
}
|
||||
/* First links of the nested FMA chains built from the ri sums and differences. */
{
|
||||
E Th, TO, T25, TF, T1W;
|
||||
Th = FNMS(KP342584725, Ta, T7);
|
||||
Ti = FNMS(KP634356270, Th, Td);
|
||||
TO = FNMS(KP342584725, T4, Ta);
|
||||
TP = FNMS(KP634356270, TO, Tg);
|
||||
T25 = FMA(KP521108558, T1q, T1u);
|
||||
T26 = FMA(KP715370323, T25, T1r);
|
||||
TF = FNMS(KP342584725, Td, T4);
|
||||
TG = FNMS(KP634356270, TF, T7);
|
||||
T1W = FMA(KP521108558, T1s, T1q);
|
||||
T1X = FNMS(KP715370323, T1W, T1t);
|
||||
}
|
||||
{
|
||||
E T1N, T1v, TX, T1E, T16;
|
||||
T1N = FNMS(KP521108558, T1t, T1r);
|
||||
T1O = FMA(KP715370323, T1N, T1q);
|
||||
T1v = FNMS(KP521108558, T1u, T1t);
|
||||
T1w = FNMS(KP715370323, T1v, T1s);
|
||||
TX = FNMS(KP342584725, T7, Tg);
|
||||
TY = FNMS(KP634356270, TX, T4);
|
||||
T1E = FMA(KP521108558, T1r, T1s);
|
||||
T1F = FMA(KP715370323, T1E, T1u);
|
||||
T16 = FNMS(KP342584725, Tg, Td);
|
||||
T17 = FNMS(KP634356270, T16, Ta);
|
||||
}
|
||||
{
|
||||
E Tm, Tn, Ty, Tz;
|
||||
Tm = ii[WS(is, 3)];
|
||||
Tn = ii[WS(is, 8)];
|
||||
To = Tm - Tn;
|
||||
T1i = Tm + Tn;
|
||||
Ty = ii[WS(is, 5)];
|
||||
Tz = ii[WS(is, 6)];
|
||||
TA = Ty - Tz;
|
||||
T1k = Ty + Tz;
|
||||
}
|
||||
Tp = ii[WS(is, 2)];
|
||||
Tq = ii[WS(is, 9)];
|
||||
Tr = Tp - Tq;
|
||||
T1h = Tp + Tq;
|
||||
{
|
||||
E Ts, Tt, Tv, Tw;
|
||||
Ts = ii[WS(is, 4)];
|
||||
Tt = ii[WS(is, 7)];
|
||||
Tu = Ts - Tt;
|
||||
T1j = Ts + Tt;
|
||||
Tv = ii[WS(is, 1)];
|
||||
Tw = ii[WS(is, 10)];
|
||||
Tx = Tv - Tw;
|
||||
T1g = Tv + Tw;
|
||||
}
|
||||
/* Same chain starts, now built from the ii sums and differences. */
{
|
||||
E TB, TT, T20, TK, T1R;
|
||||
TB = FMA(KP521108558, TA, Tx);
|
||||
TC = FMA(KP715370323, TB, Tu);
|
||||
TT = FNMS(KP521108558, Tr, Tu);
|
||||
TU = FMA(KP715370323, TT, TA);
|
||||
T20 = FNMS(KP342584725, T1i, T1h);
|
||||
T21 = FNMS(KP634356270, T20, T1j);
|
||||
TK = FMA(KP521108558, To, TA);
|
||||
TL = FNMS(KP715370323, TK, Tr);
|
||||
T1R = FNMS(KP342584725, T1j, T1g);
|
||||
T1S = FNMS(KP634356270, T1R, T1h);
|
||||
}
|
||||
{
|
||||
E T1I, T1l, T12, T1z, T1b;
|
||||
T1I = FNMS(KP342584725, T1g, T1i);
|
||||
T1J = FNMS(KP634356270, T1I, T1k);
|
||||
T1l = FNMS(KP342584725, T1k, T1j);
|
||||
T1m = FNMS(KP634356270, T1l, T1i);
|
||||
T12 = FMA(KP521108558, Tu, To);
|
||||
T13 = FMA(KP715370323, T12, Tx);
|
||||
T1z = FNMS(KP342584725, T1h, T1k);
|
||||
T1A = FNMS(KP634356270, T1z, T1g);
|
||||
T1b = FNMS(KP521108558, Tx, Tr);
|
||||
T1c = FNMS(KP715370323, T1b, To);
|
||||
}
|
||||
}
|
||||
/* Output 0 is the plain sum of all eleven inputs. */
ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
|
||||
io[0] = T1f + T1g + T1h + T1i + T1j + T1k;
|
||||
/* Each block below finishes two chains and writes one output pair
 * (k and 11-k): the same two operands go through FNMS for one output
 * and through FMA for the other. */
{
|
||||
E Tk, TE, Tj, TD, Tl;
|
||||
Tj = FNMS(KP778434453, Ti, T4);
|
||||
Tk = FNMS(KP876768831, Tj, Tg);
|
||||
TD = FMA(KP830830026, TC, Tr);
|
||||
TE = FMA(KP918985947, TD, To);
|
||||
Tl = FNMS(KP959492973, Tk, T1);
|
||||
ro[WS(os, 10)] = FNMS(KP989821441, TE, Tl);
|
||||
ro[WS(os, 1)] = FMA(KP989821441, TE, Tl);
|
||||
}
|
||||
{
|
||||
E T23, T28, T22, T27, T24;
|
||||
T22 = FNMS(KP778434453, T21, T1g);
|
||||
T23 = FNMS(KP876768831, T22, T1k);
|
||||
T27 = FMA(KP830830026, T26, T1t);
|
||||
T28 = FMA(KP918985947, T27, T1s);
|
||||
T24 = FNMS(KP959492973, T23, T1f);
|
||||
io[WS(os, 1)] = FMA(KP989821441, T28, T24);
|
||||
io[WS(os, 10)] = FNMS(KP989821441, T28, T24);
|
||||
}
|
||||
{
|
||||
E T1U, T1Z, T1T, T1Y, T1V;
|
||||
T1T = FNMS(KP778434453, T1S, T1k);
|
||||
T1U = FNMS(KP876768831, T1T, T1i);
|
||||
T1Y = FMA(KP830830026, T1X, T1u);
|
||||
T1Z = FNMS(KP918985947, T1Y, T1r);
|
||||
T1V = FNMS(KP959492973, T1U, T1f);
|
||||
io[WS(os, 2)] = FNMS(KP989821441, T1Z, T1V);
|
||||
io[WS(os, 9)] = FMA(KP989821441, T1Z, T1V);
|
||||
}
|
||||
{
|
||||
E TI, TN, TH, TM, TJ;
|
||||
TH = FNMS(KP778434453, TG, Tg);
|
||||
TI = FNMS(KP876768831, TH, Ta);
|
||||
TM = FMA(KP830830026, TL, Tx);
|
||||
TN = FNMS(KP918985947, TM, Tu);
|
||||
TJ = FNMS(KP959492973, TI, T1);
|
||||
ro[WS(os, 2)] = FNMS(KP989821441, TN, TJ);
|
||||
ro[WS(os, 9)] = FMA(KP989821441, TN, TJ);
|
||||
}
|
||||
{
|
||||
E TR, TW, TQ, TV, TS;
|
||||
TQ = FNMS(KP778434453, TP, Td);
|
||||
TR = FNMS(KP876768831, TQ, T7);
|
||||
TV = FNMS(KP830830026, TU, To);
|
||||
TW = FNMS(KP918985947, TV, Tx);
|
||||
TS = FNMS(KP959492973, TR, T1);
|
||||
ro[WS(os, 8)] = FNMS(KP989821441, TW, TS);
|
||||
ro[WS(os, 3)] = FMA(KP989821441, TW, TS);
|
||||
}
|
||||
{
|
||||
E T1L, T1Q, T1K, T1P, T1M;
|
||||
T1K = FNMS(KP778434453, T1J, T1j);
|
||||
T1L = FNMS(KP876768831, T1K, T1h);
|
||||
T1P = FNMS(KP830830026, T1O, T1s);
|
||||
T1Q = FNMS(KP918985947, T1P, T1u);
|
||||
T1M = FNMS(KP959492973, T1L, T1f);
|
||||
io[WS(os, 3)] = FMA(KP989821441, T1Q, T1M);
|
||||
io[WS(os, 8)] = FNMS(KP989821441, T1Q, T1M);
|
||||
}
|
||||
{
|
||||
E T10, T15, TZ, T14, T11;
|
||||
TZ = FNMS(KP778434453, TY, Ta);
|
||||
T10 = FNMS(KP876768831, TZ, Td);
|
||||
T14 = FNMS(KP830830026, T13, TA);
|
||||
T15 = FMA(KP918985947, T14, Tr);
|
||||
T11 = FNMS(KP959492973, T10, T1);
|
||||
ro[WS(os, 4)] = FNMS(KP989821441, T15, T11);
|
||||
ro[WS(os, 7)] = FMA(KP989821441, T15, T11);
|
||||
}
|
||||
{
|
||||
E T1C, T1H, T1B, T1G, T1D;
|
||||
T1B = FNMS(KP778434453, T1A, T1i);
|
||||
T1C = FNMS(KP876768831, T1B, T1j);
|
||||
T1G = FNMS(KP830830026, T1F, T1q);
|
||||
T1H = FMA(KP918985947, T1G, T1t);
|
||||
T1D = FNMS(KP959492973, T1C, T1f);
|
||||
io[WS(os, 4)] = FNMS(KP989821441, T1H, T1D);
|
||||
io[WS(os, 7)] = FMA(KP989821441, T1H, T1D);
|
||||
}
|
||||
{
|
||||
E T1o, T1y, T1n, T1x, T1p;
|
||||
T1n = FNMS(KP778434453, T1m, T1h);
|
||||
T1o = FNMS(KP876768831, T1n, T1g);
|
||||
T1x = FNMS(KP830830026, T1w, T1r);
|
||||
T1y = FNMS(KP918985947, T1x, T1q);
|
||||
T1p = FNMS(KP959492973, T1o, T1f);
|
||||
io[WS(os, 5)] = FMA(KP989821441, T1y, T1p);
|
||||
io[WS(os, 6)] = FNMS(KP989821441, T1y, T1p);
|
||||
}
|
||||
{
|
||||
E T19, T1e, T18, T1d, T1a;
|
||||
T18 = FNMS(KP778434453, T17, T7);
|
||||
T19 = FNMS(KP876768831, T18, T4);
|
||||
T1d = FNMS(KP830830026, T1c, Tu);
|
||||
T1e = FNMS(KP918985947, T1d, TA);
|
||||
T1a = FNMS(KP959492973, T19, T1);
|
||||
ro[WS(os, 6)] = FNMS(KP989821441, T1e, T1a);
|
||||
ro[WS(os, 5)] = FMA(KP989821441, T1e, T1a);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA variant of n1_11: first field 11 (presumably the
 * transform size), the codelet name, then {30, 0, 110, 0}, whose first three
 * entries equal the "30 additions, 0 multiplications, 110 fused multiply/add"
 * reported in the generator summary comment of this branch. The fourth entry,
 * &GENUS and the trailing zeros are not explained by anything visible here. */
static const kdft_desc desc = { 11, "n1_11", { 30, 0, 110, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration hook (FMA branch): passes the n1_11 kernel and its descriptor
 * to the planner p through X(kdft_register). */
void X(codelet_n1_11) (planner *p) { X(kdft_register) (p, n1_11, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 140 FP additions, 100 FP multiplications,
|
||||
* (or, 60 additions, 20 multiplications, 80 fused multiply/add),
|
||||
* 41 stack variables, 10 constants, and 44 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_11, non-FMA variant (the #else branch, used when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined): hard-coded
 * size-11 transform.
 *
 * ri, ii   - input arrays, read at [0] and at [WS(is, k)] for k = 1..10
 *            (presumably real and imaginary parts -- confirm in the headers)
 * ro, io   - output arrays, written at [0] and at [WS(os, k)] for k = 1..10
 * is, os   - input / output strides handed to WS()
 * v        - number of transforms; the loop body runs once per transform
 * ivs, ovs - added to the input / output pointers after every transform
 *
 * Machine-generated code: regenerate with genfft rather than editing by hand.
 */
static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Named constants used as coefficients in the linear combinations below. */
DK(KP654860733, +0.654860733945285064056925072466293553183791199);
|
||||
DK(KP142314838, +0.142314838273285140443792668616369668791051361);
|
||||
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
|
||||
DK(KP415415013, +0.415415013001886425529274149229623203524004910);
|
||||
DK(KP841253532, +0.841253532831181168861811648919367717513292498);
|
||||
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
|
||||
DK(KP909631995, +0.909631995354518371411715383079028460060241051);
|
||||
DK(KP281732556, +0.281732556841429697711417915346616899035777899);
|
||||
DK(KP540640817, +0.540640817455597582107635954318691695431770608);
|
||||
DK(KP755749574, +0.755749574354258283774035843972344420179717445);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; all four pointers advance by ivs / ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
|
||||
E T1, TM, T4, TG, Tk, TR, Tw, TN, T7, TK, Ta, TH, Tn, TQ, Td;
|
||||
E TJ, Tq, TO, Tt, TP, Tg, TI;
|
||||
/* Load the inputs. Element k is always combined with element 11-k into
 * one sum and one difference, for both ri and ii. */
{
|
||||
E T2, T3, Ti, Tj;
|
||||
T1 = ri[0];
|
||||
TM = ii[0];
|
||||
T2 = ri[WS(is, 1)];
|
||||
T3 = ri[WS(is, 10)];
|
||||
T4 = T2 + T3;
|
||||
TG = T3 - T2;
|
||||
Ti = ii[WS(is, 1)];
|
||||
Tj = ii[WS(is, 10)];
|
||||
Tk = Ti - Tj;
|
||||
TR = Ti + Tj;
|
||||
{
|
||||
E Tu, Tv, T5, T6;
|
||||
Tu = ii[WS(is, 2)];
|
||||
Tv = ii[WS(is, 9)];
|
||||
Tw = Tu - Tv;
|
||||
TN = Tu + Tv;
|
||||
T5 = ri[WS(is, 2)];
|
||||
T6 = ri[WS(is, 9)];
|
||||
T7 = T5 + T6;
|
||||
TK = T6 - T5;
|
||||
}
|
||||
}
|
||||
{
|
||||
E T8, T9, To, Tp;
|
||||
T8 = ri[WS(is, 3)];
|
||||
T9 = ri[WS(is, 8)];
|
||||
Ta = T8 + T9;
|
||||
TH = T9 - T8;
|
||||
{
|
||||
E Tl, Tm, Tb, Tc;
|
||||
Tl = ii[WS(is, 3)];
|
||||
Tm = ii[WS(is, 8)];
|
||||
Tn = Tl - Tm;
|
||||
TQ = Tl + Tm;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 7)];
|
||||
Td = Tb + Tc;
|
||||
TJ = Tc - Tb;
|
||||
}
|
||||
To = ii[WS(is, 4)];
|
||||
Tp = ii[WS(is, 7)];
|
||||
Tq = To - Tp;
|
||||
TO = To + Tp;
|
||||
{
|
||||
E Tr, Ts, Te, Tf;
|
||||
Tr = ii[WS(is, 5)];
|
||||
Ts = ii[WS(is, 6)];
|
||||
Tt = Tr - Ts;
|
||||
TP = Tr + Ts;
|
||||
Te = ri[WS(is, 5)];
|
||||
Tf = ri[WS(is, 6)];
|
||||
Tg = Te + Tf;
|
||||
TI = Tf - Te;
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tx, Th, TZ, T10;
|
||||
/* Output 0 is the plain sum of all eleven inputs. */
ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
|
||||
io[0] = TM + TR + TN + TQ + TO + TP;
|
||||
/* From here on every output pair (k and 11-k) is written as the sum and
 * the difference of the two linear combinations computed just before it. */
Tx = FMA(KP755749574, Tk, KP540640817 * Tn) + FNMS(KP909631995, Tt, KP281732556 * Tq) - (KP989821441 * Tw);
|
||||
Th = FMA(KP841253532, Ta, T1) + FNMS(KP959492973, Td, KP415415013 * Tg) + FNMA(KP142314838, T7, KP654860733 * T4);
|
||||
ro[WS(os, 7)] = Th - Tx;
|
||||
ro[WS(os, 4)] = Th + Tx;
|
||||
TZ = FMA(KP755749574, TG, KP540640817 * TH) + FNMS(KP909631995, TI, KP281732556 * TJ) - (KP989821441 * TK);
|
||||
T10 = FMA(KP841253532, TQ, TM) + FNMS(KP959492973, TO, KP415415013 * TP) + FNMA(KP142314838, TN, KP654860733 * TR);
|
||||
io[WS(os, 4)] = TZ + T10;
|
||||
io[WS(os, 7)] = T10 - TZ;
|
||||
{
|
||||
E TX, TY, Tz, Ty;
|
||||
TX = FMA(KP909631995, TG, KP755749574 * TK) + FNMA(KP540640817, TI, KP989821441 * TJ) - (KP281732556 * TH);
|
||||
TY = FMA(KP415415013, TR, TM) + FNMS(KP142314838, TO, KP841253532 * TP) + FNMA(KP959492973, TQ, KP654860733 * TN);
|
||||
io[WS(os, 2)] = TX + TY;
|
||||
io[WS(os, 9)] = TY - TX;
|
||||
Tz = FMA(KP909631995, Tk, KP755749574 * Tw) + FNMA(KP540640817, Tt, KP989821441 * Tq) - (KP281732556 * Tn);
|
||||
Ty = FMA(KP415415013, T4, T1) + FNMS(KP142314838, Td, KP841253532 * Tg) + FNMA(KP959492973, Ta, KP654860733 * T7);
|
||||
ro[WS(os, 9)] = Ty - Tz;
|
||||
ro[WS(os, 2)] = Ty + Tz;
|
||||
}
|
||||
}
|
||||
{
|
||||
E TB, TA, TT, TU;
|
||||
TB = FMA(KP540640817, Tk, KP909631995 * Tw) + FMA(KP989821441, Tn, KP755749574 * Tq) + (KP281732556 * Tt);
|
||||
TA = FMA(KP841253532, T4, T1) + FNMS(KP959492973, Tg, KP415415013 * T7) + FNMA(KP654860733, Td, KP142314838 * Ta);
|
||||
ro[WS(os, 10)] = TA - TB;
|
||||
ro[WS(os, 1)] = TA + TB;
|
||||
{
|
||||
E TV, TW, TD, TC;
|
||||
TV = FMA(KP540640817, TG, KP909631995 * TK) + FMA(KP989821441, TH, KP755749574 * TJ) + (KP281732556 * TI);
|
||||
TW = FMA(KP841253532, TR, TM) + FNMS(KP959492973, TP, KP415415013 * TN) + FNMA(KP654860733, TO, KP142314838 * TQ);
|
||||
io[WS(os, 1)] = TV + TW;
|
||||
io[WS(os, 10)] = TW - TV;
|
||||
TD = FMA(KP989821441, Tk, KP540640817 * Tq) + FNMS(KP909631995, Tn, KP755749574 * Tt) - (KP281732556 * Tw);
|
||||
TC = FMA(KP415415013, Ta, T1) + FNMS(KP654860733, Tg, KP841253532 * Td) + FNMA(KP959492973, T7, KP142314838 * T4);
|
||||
ro[WS(os, 8)] = TC - TD;
|
||||
ro[WS(os, 3)] = TC + TD;
|
||||
}
|
||||
TT = FMA(KP989821441, TG, KP540640817 * TJ) + FNMS(KP909631995, TH, KP755749574 * TI) - (KP281732556 * TK);
|
||||
TU = FMA(KP415415013, TQ, TM) + FNMS(KP654860733, TP, KP841253532 * TO) + FNMA(KP959492973, TN, KP142314838 * TR);
|
||||
io[WS(os, 3)] = TT + TU;
|
||||
io[WS(os, 8)] = TU - TT;
|
||||
{
|
||||
E TL, TS, TF, TE;
|
||||
TL = FMA(KP281732556, TG, KP755749574 * TH) + FNMS(KP909631995, TJ, KP989821441 * TI) - (KP540640817 * TK);
|
||||
TS = FMA(KP841253532, TN, TM) + FNMS(KP142314838, TP, KP415415013 * TO) + FNMA(KP654860733, TQ, KP959492973 * TR);
|
||||
io[WS(os, 5)] = TL + TS;
|
||||
io[WS(os, 6)] = TS - TL;
|
||||
TF = FMA(KP281732556, Tk, KP755749574 * Tn) + FNMS(KP909631995, Tq, KP989821441 * Tt) - (KP540640817 * Tw);
|
||||
TE = FMA(KP841253532, T7, T1) + FNMS(KP142314838, Tg, KP415415013 * Td) + FNMA(KP654860733, Ta, KP959492973 * T4);
|
||||
ro[WS(os, 6)] = TE - TF;
|
||||
ro[WS(os, 5)] = TE + TF;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA variant of n1_11: first field 11 (presumably the
 * transform size), the codelet name, then {60, 20, 80, 0}, whose first three
 * entries equal the "60 additions, 20 multiplications, 80 fused multiply/add"
 * reported in the generator summary comment of this branch. The fourth entry,
 * &GENUS and the trailing zeros are not explained by anything visible here. */
static const kdft_desc desc = { 11, "n1_11", { 60, 20, 80, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration hook (non-FMA branch): passes the n1_11 kernel and its
 * descriptor to the planner p through X(kdft_register). */
void X(codelet_n1_11) (planner *p) { X(kdft_register) (p, n1_11, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
420
fftw-3.3.10/dft/scalar/codelets/n1_12.c
Normal file
420
fftw-3.3.10/dft/scalar/codelets/n1_12.c
Normal file
@@ -0,0 +1,420 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 24 FP multiplications,
|
||||
* (or, 72 additions, 0 multiplications, 24 fused multiply/add),
|
||||
* 43 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_12, FMA variant (this branch is compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined): hard-coded size-12 transform.
 *
 * ri, ii   - input arrays, read at [0] and at [WS(is, k)] for k = 1..11
 *            (presumably real and imaginary parts -- confirm in the headers)
 * ro, io   - output arrays, written at [0] and at [WS(os, k)] for k = 1..11
 * is, os   - input / output strides handed to WS()
 * v        - number of transforms; the loop body runs once per transform
 * ivs, ovs - added to the input / output pointers after every transform
 *
 * Machine-generated code: regenerate with genfft rather than editing by hand.
 */
static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; all four pointers advance by ivs / ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
|
||||
E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1d, TG;
|
||||
E TJ, T1u, T1c, Tl, T1i, TL, TO, T1v, T1h;
|
||||
/* Inputs are loaded in groups of three -- indices {0,4,8}, {6,10,2},
 * {3,7,11} and {9,1,5} -- once from ri and once from ii. Each group
 * yields its total, an FNMS(KP500000000, ...) term combining the first
 * element with the sum of the other two, and the difference of the
 * other two. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 4)];
|
||||
T3 = ri[WS(is, 8)];
|
||||
T4 = T2 + T3;
|
||||
T5 = T1 + T4;
|
||||
TR = FNMS(KP500000000, T4, T1);
|
||||
TA = T3 - T2;
|
||||
}
|
||||
{
|
||||
E To, Tp, Tq, Tr;
|
||||
To = ii[0];
|
||||
Tp = ii[WS(is, 4)];
|
||||
Tq = ii[WS(is, 8)];
|
||||
Tr = Tp + Tq;
|
||||
Ts = To + Tr;
|
||||
TS = Tp - Tq;
|
||||
Tz = FNMS(KP500000000, Tr, To);
|
||||
}
|
||||
{
|
||||
E T6, T7, T8, T9;
|
||||
T6 = ri[WS(is, 6)];
|
||||
T7 = ri[WS(is, 10)];
|
||||
T8 = ri[WS(is, 2)];
|
||||
T9 = T7 + T8;
|
||||
Ta = T6 + T9;
|
||||
TU = FNMS(KP500000000, T9, T6);
|
||||
TD = T8 - T7;
|
||||
}
|
||||
{
|
||||
E Tt, Tu, Tv, Tw;
|
||||
Tt = ii[WS(is, 6)];
|
||||
Tu = ii[WS(is, 10)];
|
||||
Tv = ii[WS(is, 2)];
|
||||
Tw = Tu + Tv;
|
||||
Tx = Tt + Tw;
|
||||
TV = Tu - Tv;
|
||||
TC = FNMS(KP500000000, Tw, Tt);
|
||||
}
|
||||
{
|
||||
E Tc, Td, Te, Tf;
|
||||
Tc = ri[WS(is, 3)];
|
||||
Td = ri[WS(is, 7)];
|
||||
Te = ri[WS(is, 11)];
|
||||
Tf = Td + Te;
|
||||
Tg = Tc + Tf;
|
||||
T1d = Te - Td;
|
||||
TG = FNMS(KP500000000, Tf, Tc);
|
||||
}
|
||||
{
|
||||
E T1a, TH, TI, T1b;
|
||||
T1a = ii[WS(is, 3)];
|
||||
TH = ii[WS(is, 7)];
|
||||
TI = ii[WS(is, 11)];
|
||||
T1b = TH + TI;
|
||||
TJ = TH - TI;
|
||||
T1u = T1a + T1b;
|
||||
T1c = FNMS(KP500000000, T1b, T1a);
|
||||
}
|
||||
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = ri[WS(is, 9)];
|
||||
Ti = ri[WS(is, 1)];
|
||||
Tj = ri[WS(is, 5)];
|
||||
Tk = Ti + Tj;
|
||||
Tl = Th + Tk;
|
||||
T1i = Tj - Ti;
|
||||
TL = FNMS(KP500000000, Tk, Th);
|
||||
}
|
||||
{
|
||||
E T1f, TM, TN, T1g;
|
||||
T1f = ii[WS(is, 9)];
|
||||
TM = ii[WS(is, 1)];
|
||||
TN = ii[WS(is, 5)];
|
||||
T1g = TM + TN;
|
||||
TO = TM - TN;
|
||||
T1v = T1f + T1g;
|
||||
T1h = FNMS(KP500000000, T1g, T1f);
|
||||
}
|
||||
/* Outputs 0, 3, 6 and 9 are built from the group totals only. */
{
|
||||
E Tb, Tm, T1t, T1w;
|
||||
Tb = T5 + Ta;
|
||||
Tm = Tg + Tl;
|
||||
ro[WS(os, 6)] = Tb - Tm;
|
||||
ro[0] = Tb + Tm;
|
||||
{
|
||||
E T1x, T1y, Tn, Ty;
|
||||
T1x = Ts + Tx;
|
||||
T1y = T1u + T1v;
|
||||
io[WS(os, 6)] = T1x - T1y;
|
||||
io[0] = T1x + T1y;
|
||||
Tn = Tg - Tl;
|
||||
Ty = Ts - Tx;
|
||||
io[WS(os, 3)] = Tn + Ty;
|
||||
io[WS(os, 9)] = Ty - Tn;
|
||||
}
|
||||
T1t = T5 - Ta;
|
||||
T1w = T1u - T1v;
|
||||
ro[WS(os, 3)] = T1t - T1w;
|
||||
ro[WS(os, 9)] = T1t + T1w;
|
||||
/* Outputs 1, 4, 7 and 10: built from the FMA(KP866025403, ...) terms. */
{
|
||||
E T11, T1l, T1k, T1m, T14, T18, T17, T19;
|
||||
{
|
||||
E TZ, T10, T1e, T1j;
|
||||
TZ = FMA(KP866025403, TA, Tz);
|
||||
T10 = FMA(KP866025403, TD, TC);
|
||||
T11 = TZ - T10;
|
||||
T1l = TZ + T10;
|
||||
T1e = FMA(KP866025403, T1d, T1c);
|
||||
T1j = FMA(KP866025403, T1i, T1h);
|
||||
T1k = T1e - T1j;
|
||||
T1m = T1e + T1j;
|
||||
}
|
||||
{
|
||||
E T12, T13, T15, T16;
|
||||
T12 = FMA(KP866025403, TJ, TG);
|
||||
T13 = FMA(KP866025403, TO, TL);
|
||||
T14 = T12 - T13;
|
||||
T18 = T12 + T13;
|
||||
T15 = FMA(KP866025403, TS, TR);
|
||||
T16 = FMA(KP866025403, TV, TU);
|
||||
T17 = T15 + T16;
|
||||
T19 = T15 - T16;
|
||||
}
|
||||
io[WS(os, 1)] = T11 - T14;
|
||||
ro[WS(os, 1)] = T19 + T1k;
|
||||
io[WS(os, 7)] = T11 + T14;
|
||||
ro[WS(os, 7)] = T19 - T1k;
|
||||
ro[WS(os, 10)] = T17 - T18;
|
||||
io[WS(os, 10)] = T1l - T1m;
|
||||
ro[WS(os, 4)] = T17 + T18;
|
||||
io[WS(os, 4)] = T1l + T1m;
|
||||
}
|
||||
/* Outputs 2, 5, 8 and 11: same pattern with FNMS(KP866025403, ...) terms. */
{
|
||||
E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
|
||||
{
|
||||
E TB, TE, T1o, T1p;
|
||||
TB = FNMS(KP866025403, TA, Tz);
|
||||
TE = FNMS(KP866025403, TD, TC);
|
||||
TF = TB - TE;
|
||||
T1r = TB + TE;
|
||||
T1o = FNMS(KP866025403, T1d, T1c);
|
||||
T1p = FNMS(KP866025403, T1i, T1h);
|
||||
T1q = T1o - T1p;
|
||||
T1s = T1o + T1p;
|
||||
}
|
||||
{
|
||||
E TK, TP, TT, TW;
|
||||
TK = FNMS(KP866025403, TJ, TG);
|
||||
TP = FNMS(KP866025403, TO, TL);
|
||||
TQ = TK - TP;
|
||||
TY = TK + TP;
|
||||
TT = FNMS(KP866025403, TS, TR);
|
||||
TW = FNMS(KP866025403, TV, TU);
|
||||
TX = TT + TW;
|
||||
T1n = TT - TW;
|
||||
}
|
||||
io[WS(os, 5)] = TF - TQ;
|
||||
ro[WS(os, 5)] = T1n + T1q;
|
||||
io[WS(os, 11)] = TF + TQ;
|
||||
ro[WS(os, 11)] = T1n - T1q;
|
||||
ro[WS(os, 2)] = TX - TY;
|
||||
io[WS(os, 2)] = T1r - T1s;
|
||||
ro[WS(os, 8)] = TX + TY;
|
||||
io[WS(os, 8)] = T1r + T1s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA variant of n1_12: first field 12 (presumably the
 * transform size), the codelet name, then {72, 0, 24, 0}, whose first three
 * entries equal the "72 additions, 0 multiplications, 24 fused multiply/add"
 * reported in the generator summary comment of this branch. The fourth entry,
 * &GENUS and the trailing zeros are not explained by anything visible here. */
static const kdft_desc desc = { 12, "n1_12", { 72, 0, 24, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration hook (FMA branch): passes the n1_12 kernel and its descriptor
 * to the planner p through X(kdft_register). */
void X(codelet_n1_12) (planner *p) { X(kdft_register) (p, n1_12, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 16 FP multiplications,
|
||||
* (or, 88 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 43 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_12, non-FMA variant (the #else branch, used when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined): hard-coded
 * size-12 transform.
 *
 * ri, ii   - input arrays, read at [0] and at [WS(is, k)] for k = 1..11
 *            (presumably real and imaginary parts -- confirm in the headers)
 * ro, io   - output arrays, written at [0] and at [WS(os, k)] for k = 1..11
 * is, os   - input / output strides handed to WS()
 * v        - number of transforms; the loop body runs once per transform
 * ivs, ovs - added to the input / output pointers after every transform
 *
 * Machine-generated code: regenerate with genfft rather than editing by hand.
 */
static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; all four pointers advance by ivs / ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
|
||||
E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1a, TG;
|
||||
E TJ, T1u, T1d, Tl, T1f, TL, TO, T1v, T1i;
|
||||
/* Inputs are loaded in groups of three -- indices {0,4,8}, {6,10,2},
 * {3,7,11} and {9,1,5} -- once from ri and once from ii. Each group
 * yields its total, an FNMS(KP500000000, ...) term combining the first
 * element with the sum of the other two, and the difference of the
 * other two already scaled by KP866025403. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 4)];
|
||||
T3 = ri[WS(is, 8)];
|
||||
T4 = T2 + T3;
|
||||
T5 = T1 + T4;
|
||||
TR = FNMS(KP500000000, T4, T1);
|
||||
TA = KP866025403 * (T3 - T2);
|
||||
}
|
||||
{
|
||||
E To, Tp, Tq, Tr;
|
||||
To = ii[0];
|
||||
Tp = ii[WS(is, 4)];
|
||||
Tq = ii[WS(is, 8)];
|
||||
Tr = Tp + Tq;
|
||||
Ts = To + Tr;
|
||||
TS = KP866025403 * (Tp - Tq);
|
||||
Tz = FNMS(KP500000000, Tr, To);
|
||||
}
|
||||
{
|
||||
E T6, T7, T8, T9;
|
||||
T6 = ri[WS(is, 6)];
|
||||
T7 = ri[WS(is, 10)];
|
||||
T8 = ri[WS(is, 2)];
|
||||
T9 = T7 + T8;
|
||||
Ta = T6 + T9;
|
||||
TU = FNMS(KP500000000, T9, T6);
|
||||
TD = KP866025403 * (T8 - T7);
|
||||
}
|
||||
{
|
||||
E Tt, Tu, Tv, Tw;
|
||||
Tt = ii[WS(is, 6)];
|
||||
Tu = ii[WS(is, 10)];
|
||||
Tv = ii[WS(is, 2)];
|
||||
Tw = Tu + Tv;
|
||||
Tx = Tt + Tw;
|
||||
TV = KP866025403 * (Tu - Tv);
|
||||
TC = FNMS(KP500000000, Tw, Tt);
|
||||
}
|
||||
{
|
||||
E Tc, Td, Te, Tf;
|
||||
Tc = ri[WS(is, 3)];
|
||||
Td = ri[WS(is, 7)];
|
||||
Te = ri[WS(is, 11)];
|
||||
Tf = Td + Te;
|
||||
Tg = Tc + Tf;
|
||||
T1a = KP866025403 * (Te - Td);
|
||||
TG = FNMS(KP500000000, Tf, Tc);
|
||||
}
|
||||
{
|
||||
E T1b, TH, TI, T1c;
|
||||
T1b = ii[WS(is, 3)];
|
||||
TH = ii[WS(is, 7)];
|
||||
TI = ii[WS(is, 11)];
|
||||
T1c = TH + TI;
|
||||
TJ = KP866025403 * (TH - TI);
|
||||
T1u = T1b + T1c;
|
||||
T1d = FNMS(KP500000000, T1c, T1b);
|
||||
}
|
||||
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = ri[WS(is, 9)];
|
||||
Ti = ri[WS(is, 1)];
|
||||
Tj = ri[WS(is, 5)];
|
||||
Tk = Ti + Tj;
|
||||
Tl = Th + Tk;
|
||||
T1f = KP866025403 * (Tj - Ti);
|
||||
TL = FNMS(KP500000000, Tk, Th);
|
||||
}
|
||||
{
|
||||
E T1g, TM, TN, T1h;
|
||||
T1g = ii[WS(is, 9)];
|
||||
TM = ii[WS(is, 1)];
|
||||
TN = ii[WS(is, 5)];
|
||||
T1h = TM + TN;
|
||||
TO = KP866025403 * (TM - TN);
|
||||
T1v = T1g + T1h;
|
||||
T1i = FNMS(KP500000000, T1h, T1g);
|
||||
}
|
||||
/* Outputs 0, 3, 6 and 9 are built from the group totals only. */
{
|
||||
E Tb, Tm, T1t, T1w;
|
||||
Tb = T5 + Ta;
|
||||
Tm = Tg + Tl;
|
||||
ro[WS(os, 6)] = Tb - Tm;
|
||||
ro[0] = Tb + Tm;
|
||||
{
|
||||
E T1x, T1y, Tn, Ty;
|
||||
T1x = Ts + Tx;
|
||||
T1y = T1u + T1v;
|
||||
io[WS(os, 6)] = T1x - T1y;
|
||||
io[0] = T1x + T1y;
|
||||
Tn = Tg - Tl;
|
||||
Ty = Ts - Tx;
|
||||
io[WS(os, 3)] = Tn + Ty;
|
||||
io[WS(os, 9)] = Ty - Tn;
|
||||
}
|
||||
T1t = T5 - Ta;
|
||||
T1w = T1u - T1v;
|
||||
ro[WS(os, 3)] = T1t - T1w;
|
||||
ro[WS(os, 9)] = T1t + T1w;
|
||||
/* Outputs 1, 4, 7 and 10: each term adds the scaled difference to the
 * FNMS(KP500000000, ...) value of the same group. */
{
|
||||
E T11, T1l, T1k, T1m, T14, T18, T17, T19;
|
||||
{
|
||||
E TZ, T10, T1e, T1j;
|
||||
TZ = TA + Tz;
|
||||
T10 = TD + TC;
|
||||
T11 = TZ - T10;
|
||||
T1l = TZ + T10;
|
||||
T1e = T1a + T1d;
|
||||
T1j = T1f + T1i;
|
||||
T1k = T1e - T1j;
|
||||
T1m = T1e + T1j;
|
||||
}
|
||||
{
|
||||
E T12, T13, T15, T16;
|
||||
T12 = TG + TJ;
|
||||
T13 = TL + TO;
|
||||
T14 = T12 - T13;
|
||||
T18 = T12 + T13;
|
||||
T15 = TR + TS;
|
||||
T16 = TU + TV;
|
||||
T17 = T15 + T16;
|
||||
T19 = T15 - T16;
|
||||
}
|
||||
io[WS(os, 1)] = T11 - T14;
|
||||
ro[WS(os, 1)] = T19 + T1k;
|
||||
io[WS(os, 7)] = T11 + T14;
|
||||
ro[WS(os, 7)] = T19 - T1k;
|
||||
ro[WS(os, 10)] = T17 - T18;
|
||||
io[WS(os, 10)] = T1l - T1m;
|
||||
ro[WS(os, 4)] = T17 + T18;
|
||||
io[WS(os, 4)] = T1l + T1m;
|
||||
}
|
||||
/* Outputs 2, 5, 8 and 11: same pattern, but the scaled difference is
 * subtracted instead of added. */
{
|
||||
E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
|
||||
{
|
||||
E TB, TE, T1o, T1p;
|
||||
TB = Tz - TA;
|
||||
TE = TC - TD;
|
||||
TF = TB - TE;
|
||||
T1r = TB + TE;
|
||||
T1o = T1d - T1a;
|
||||
T1p = T1i - T1f;
|
||||
T1q = T1o - T1p;
|
||||
T1s = T1o + T1p;
|
||||
}
|
||||
{
|
||||
E TK, TP, TT, TW;
|
||||
TK = TG - TJ;
|
||||
TP = TL - TO;
|
||||
TQ = TK - TP;
|
||||
TY = TK + TP;
|
||||
TT = TR - TS;
|
||||
TW = TU - TV;
|
||||
TX = TT + TW;
|
||||
T1n = TT - TW;
|
||||
}
|
||||
io[WS(os, 5)] = TF - TQ;
|
||||
ro[WS(os, 5)] = T1n + T1q;
|
||||
io[WS(os, 11)] = TF + TQ;
|
||||
ro[WS(os, 11)] = T1n - T1q;
|
||||
ro[WS(os, 2)] = TX - TY;
|
||||
io[WS(os, 2)] = T1r - T1s;
|
||||
ro[WS(os, 8)] = TX + TY;
|
||||
io[WS(os, 8)] = T1r + T1s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA variant of n1_12: first field 12 (presumably the
 * transform size), the codelet name, then {88, 8, 8, 0}, whose first three
 * entries equal the "88 additions, 8 multiplications, 8 fused multiply/add"
 * reported in the generator summary comment of this branch. The fourth entry,
 * &GENUS and the trailing zeros are not explained by anything visible here. */
static const kdft_desc desc = { 12, "n1_12", { 88, 8, 8, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration hook (non-FMA branch): passes the n1_12 kernel and its
 * descriptor to the planner p through X(kdft_register). */
void X(codelet_n1_12) (planner *p) { X(kdft_register) (p, n1_12, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
681
fftw-3.3.10/dft/scalar/codelets/n1_13.c
Normal file
681
fftw-3.3.10/dft/scalar/codelets/n1_13.c
Normal file
@@ -0,0 +1,681 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 176 FP additions, 114 FP multiplications,
|
||||
* (or, 62 additions, 0 multiplications, 114 fused multiply/add),
|
||||
* 76 stack variables, 25 constants, and 52 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_13, FMA branch: straight-line kernel emitted by genfft's gen_notw for
 * size 13 (see the "Generated by" comment above).
 *
 * Each pass of the loop reads ri[0], ii[0] and ri[WS(is, k)], ii[WS(is, k)]
 * for k = 1..12, and writes ro[0], io[0] and ro[WS(os, k)], io[WS(os, k)]
 * for k = 1..12.
 *
 * ri, ii   - input arrays (presumably the real and imaginary parts --
 *            confirm against dft/codelet-dft.h)
 * ro, io   - output arrays, paired the same way
 * is, os   - strides handed to WS() for input and output indexing
 * v        - number of loop passes
 * ivs, ovs - amounts added to the input / output pointers after each pass
 */
static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants referenced by the FMA/FNMS/FMS expressions below. */
DK(KP875502302, +0.875502302409147941146295545768755143177842006);
|
||||
DK(KP520028571, +0.520028571888864619117130500499232802493238139);
|
||||
DK(KP968287244, +0.968287244361984016049539446938120421179794516);
|
||||
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
|
||||
DK(KP600477271, +0.600477271932665282925769253334763009352012849);
|
||||
DK(KP957805992, +0.957805992594665126462521754605754580515587217);
|
||||
DK(KP516520780, +0.516520780623489722840901288569017135705033622);
|
||||
DK(KP581704778, +0.581704778510515730456870384989698884939833902);
|
||||
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
|
||||
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
|
||||
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
|
||||
DK(KP301479260, +0.301479260047709873958013540496673347309208464);
|
||||
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
|
||||
DK(KP859542535, +0.859542535098774820163672132761689612766401925);
|
||||
DK(KP514918778, +0.514918778086315755491789696138117261566051239);
|
||||
DK(KP522026385, +0.522026385161275033714027226654165028300441940);
|
||||
DK(KP853480001, +0.853480001859823990758994934970528322872359049);
|
||||
DK(KP612264650, +0.612264650376756543746494474777125408779395514);
|
||||
DK(KP038632954, +0.038632954644348171955506895830342264440241080);
|
||||
DK(KP302775637, +0.302775637731994646559610633735247973125648287);
|
||||
DK(KP769338817, +0.769338817572980603471413688209101117038278899);
|
||||
DK(KP686558370, +0.686558370781754340655719594850823015421401653);
|
||||
DK(KP226109445, +0.226109445035782405468510155372505010481906348);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* v passes; after each pass ri/ii advance by ivs and ro/io advance by ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
|
||||
E T1, T1P, T2n, T2o, To, TH, T2h, T2k, TB, TE, Tw, TF, T2c, T2j, T1j;
|
||||
E T1m, T12, T1f, T21, T24, T1U, T27, T1d, T1g, T1Y, T25;
|
||||
T1 = ri[0];
|
||||
T1P = ii[0];
|
||||
/* Load ri[1..12] (indexed through is) and form the sums/differences that both output groups consume. */
{
|
||||
E Tf, T2d, Tb, Ty, Tq, T6, Tx, Tr, Ti, Tt, Tl, Tu, Tm, T2e, Td;
|
||||
E Te, Tc, Tn;
|
||||
Td = ri[WS(is, 8)];
|
||||
Te = ri[WS(is, 5)];
|
||||
Tf = Td + Te;
|
||||
T2d = Td - Te;
|
||||
{
|
||||
E T7, T8, T9, Ta;
|
||||
T7 = ri[WS(is, 12)];
|
||||
T8 = ri[WS(is, 10)];
|
||||
T9 = ri[WS(is, 4)];
|
||||
Ta = T8 + T9;
|
||||
Tb = T7 + Ta;
|
||||
Ty = FMS(KP500000000, Ta, T7);
|
||||
Tq = T8 - T9;
|
||||
}
|
||||
{
|
||||
E T2, T3, T4, T5;
|
||||
T2 = ri[WS(is, 1)];
|
||||
T3 = ri[WS(is, 3)];
|
||||
T4 = ri[WS(is, 9)];
|
||||
T5 = T3 + T4;
|
||||
T6 = T2 + T5;
|
||||
Tx = FNMS(KP500000000, T5, T2);
|
||||
Tr = T4 - T3;
|
||||
}
|
||||
{
|
||||
E Tg, Th, Tj, Tk;
|
||||
Tg = ri[WS(is, 11)];
|
||||
Th = ri[WS(is, 6)];
|
||||
Ti = Tg + Th;
|
||||
Tt = Tg - Th;
|
||||
Tj = ri[WS(is, 7)];
|
||||
Tk = ri[WS(is, 2)];
|
||||
Tl = Tj + Tk;
|
||||
Tu = Tj - Tk;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T2e = Tt + Tu;
|
||||
T2n = T6 - Tb;
|
||||
T2o = T2d + T2e;
|
||||
Tc = T6 + Tb;
|
||||
Tn = Tf + Tm;
|
||||
To = Tc + Tn;
|
||||
TH = Tc - Tn;
|
||||
{
|
||||
E T2f, T2g, Tz, TA;
|
||||
T2f = FNMS(KP500000000, T2e, T2d);
|
||||
T2g = Tr + Tq;
|
||||
T2h = FMA(KP866025403, T2g, T2f);
|
||||
T2k = FNMS(KP866025403, T2g, T2f);
|
||||
Tz = Tx - Ty;
|
||||
TA = FNMS(KP500000000, Tm, Tf);
|
||||
TB = Tz + TA;
|
||||
TE = Tz - TA;
|
||||
}
|
||||
{
|
||||
E Ts, Tv, T2a, T2b;
|
||||
Ts = Tq - Tr;
|
||||
Tv = Tt - Tu;
|
||||
Tw = Ts + Tv;
|
||||
TF = Ts - Tv;
|
||||
T2a = Tx + Ty;
|
||||
T2b = Ti - Tl;
|
||||
T2c = FMA(KP866025403, T2b, T2a);
|
||||
T2j = FNMS(KP866025403, T2b, T2a);
|
||||
}
|
||||
}
|
||||
/* Same treatment for ii[1..12]. */
{
|
||||
E TM, T1R, T10, T1l, T18, TX, T1k, T15, TP, T1a, TS, T1b, TT, T1S, TK;
|
||||
E TL, TU, T11;
|
||||
TK = ii[WS(is, 8)];
|
||||
TL = ii[WS(is, 5)];
|
||||
TM = TK - TL;
|
||||
T1R = TK + TL;
|
||||
{
|
||||
E T16, TY, TZ, T17;
|
||||
T16 = ii[WS(is, 12)];
|
||||
TY = ii[WS(is, 10)];
|
||||
TZ = ii[WS(is, 4)];
|
||||
T17 = TY + TZ;
|
||||
T10 = TY - TZ;
|
||||
T1l = T16 + T17;
|
||||
T18 = FMS(KP500000000, T17, T16);
|
||||
}
|
||||
{
|
||||
E T13, TV, TW, T14;
|
||||
T13 = ii[WS(is, 1)];
|
||||
TV = ii[WS(is, 9)];
|
||||
TW = ii[WS(is, 3)];
|
||||
T14 = TW + TV;
|
||||
TX = TV - TW;
|
||||
T1k = T13 + T14;
|
||||
T15 = FNMS(KP500000000, T14, T13);
|
||||
}
|
||||
{
|
||||
E TN, TO, TQ, TR;
|
||||
TN = ii[WS(is, 11)];
|
||||
TO = ii[WS(is, 6)];
|
||||
TP = TN - TO;
|
||||
T1a = TN + TO;
|
||||
TQ = ii[WS(is, 7)];
|
||||
TR = ii[WS(is, 2)];
|
||||
TS = TQ - TR;
|
||||
T1b = TQ + TR;
|
||||
}
|
||||
TT = TP + TS;
|
||||
T1S = T1a + T1b;
|
||||
T1j = TM + TT;
|
||||
T1m = T1k - T1l;
|
||||
TU = FNMS(KP500000000, TT, TM);
|
||||
T11 = TX + T10;
|
||||
T12 = FMA(KP866025403, T11, TU);
|
||||
T1f = FNMS(KP866025403, T11, TU);
|
||||
{
|
||||
E T1Z, T20, T1Q, T1T;
|
||||
T1Z = T15 - T18;
|
||||
T20 = FNMS(KP500000000, T1S, T1R);
|
||||
T21 = T1Z + T20;
|
||||
T24 = T1Z - T20;
|
||||
T1Q = T1k + T1l;
|
||||
T1T = T1R + T1S;
|
||||
T1U = T1Q + T1T;
|
||||
T27 = T1Q - T1T;
|
||||
}
|
||||
{
|
||||
E T19, T1c, T1W, T1X;
|
||||
T19 = T15 + T18;
|
||||
T1c = T1a - T1b;
|
||||
T1d = FMA(KP866025403, T1c, T19);
|
||||
T1g = FNMS(KP866025403, T1c, T19);
|
||||
T1W = T10 - TX;
|
||||
T1X = TP - TS;
|
||||
T1Y = T1W + T1X;
|
||||
T25 = T1W - T1X;
|
||||
}
|
||||
}
|
||||
/* Output 0 of each array is the plain sum of that array's 13 inputs. */
ro[0] = T1 + To;
|
||||
io[0] = T1P + T1U;
|
||||
/* Compute and store ro[1..12] (indexed through os). */
{
|
||||
E T1z, T1J, T1G, T1H, T1w, T1I, T1n, T1i, T1s, T1E, TD, T1D, TI, T1r, T1e;
|
||||
E T1h;
|
||||
{
|
||||
E T1x, T1y, T1u, T1v;
|
||||
T1x = FNMS(KP226109445, Tw, TB);
|
||||
T1y = FMA(KP686558370, TE, TF);
|
||||
T1z = FNMS(KP769338817, T1y, T1x);
|
||||
T1J = FMA(KP769338817, T1y, T1x);
|
||||
T1G = FMA(KP302775637, T1j, T1m);
|
||||
T1u = FNMS(KP038632954, T12, T1d);
|
||||
T1v = FNMS(KP612264650, T1f, T1g);
|
||||
T1H = FNMS(KP853480001, T1v, T1u);
|
||||
T1w = FMA(KP853480001, T1v, T1u);
|
||||
T1I = FNMS(KP522026385, T1H, T1G);
|
||||
}
|
||||
T1n = FNMS(KP302775637, T1m, T1j);
|
||||
T1e = FMA(KP038632954, T1d, T12);
|
||||
T1h = FMA(KP612264650, T1g, T1f);
|
||||
T1i = FNMS(KP853480001, T1h, T1e);
|
||||
T1s = FNMS(KP522026385, T1i, T1n);
|
||||
T1E = FMA(KP853480001, T1h, T1e);
|
||||
{
|
||||
E TG, T1q, Tp, TC, T1p;
|
||||
TG = FNMS(KP514918778, TF, TE);
|
||||
T1q = FNMS(KP859542535, TG, TH);
|
||||
Tp = FNMS(KP083333333, To, T1);
|
||||
TC = FMA(KP301479260, TB, Tw);
|
||||
T1p = FNMS(KP251768516, TC, Tp);
|
||||
TD = FMA(KP503537032, TC, Tp);
|
||||
T1D = FNMS(KP300462606, T1q, T1p);
|
||||
TI = FMA(KP581704778, TH, TG);
|
||||
T1r = FMA(KP300462606, T1q, T1p);
|
||||
}
|
||||
{
|
||||
E TJ, T1o, T1L, T1M;
|
||||
TJ = FMA(KP516520780, TI, TD);
|
||||
T1o = FMA(KP957805992, T1n, T1i);
|
||||
ro[WS(os, 1)] = FNMS(KP600477271, T1o, TJ);
|
||||
ro[WS(os, 12)] = FMA(KP600477271, T1o, TJ);
|
||||
{
|
||||
E T1t, T1A, T1N, T1O;
|
||||
T1t = FNMS(KP575140729, T1s, T1r);
|
||||
T1A = FMA(KP968287244, T1z, T1w);
|
||||
ro[WS(os, 9)] = FNMS(KP520028571, T1A, T1t);
|
||||
ro[WS(os, 3)] = FMA(KP520028571, T1A, T1t);
|
||||
T1N = FNMS(KP516520780, TI, TD);
|
||||
T1O = FMA(KP957805992, T1G, T1H);
|
||||
ro[WS(os, 8)] = FNMS(KP600477271, T1O, T1N);
|
||||
ro[WS(os, 5)] = FMA(KP600477271, T1O, T1N);
|
||||
}
|
||||
T1L = FNMS(KP520028571, T1E, T1D);
|
||||
T1M = FNMS(KP875502302, T1J, T1I);
|
||||
ro[WS(os, 11)] = FNMS(KP575140729, T1M, T1L);
|
||||
ro[WS(os, 6)] = FMA(KP575140729, T1M, T1L);
|
||||
{
|
||||
E T1F, T1K, T1B, T1C;
|
||||
T1F = FMA(KP520028571, T1E, T1D);
|
||||
T1K = FMA(KP875502302, T1J, T1I);
|
||||
ro[WS(os, 7)] = FNMS(KP575140729, T1K, T1F);
|
||||
ro[WS(os, 2)] = FMA(KP575140729, T1K, T1F);
|
||||
T1B = FMA(KP575140729, T1s, T1r);
|
||||
T1C = FNMS(KP968287244, T1z, T1w);
|
||||
ro[WS(os, 10)] = FNMS(KP520028571, T1C, T1B);
|
||||
ro[WS(os, 4)] = FMA(KP520028571, T1C, T1B);
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Compute and store io[1..12] (indexed through os). */
{
|
||||
E T2F, T2N, T2v, T2u, T2A, T2K, T2p, T2m, T2C, T2M, T23, T2J, T28, T2z, T2i;
|
||||
E T2l;
|
||||
{
|
||||
E T2D, T2E, T2s, T2t;
|
||||
T2D = FNMS(KP226109445, T1Y, T21);
|
||||
T2E = FMA(KP686558370, T24, T25);
|
||||
T2F = FNMS(KP769338817, T2E, T2D);
|
||||
T2N = FMA(KP769338817, T2E, T2D);
|
||||
T2v = FNMS(KP302775637, T2n, T2o);
|
||||
T2s = FMA(KP038632954, T2c, T2h);
|
||||
T2t = FMA(KP612264650, T2j, T2k);
|
||||
T2u = FNMS(KP853480001, T2t, T2s);
|
||||
T2A = FNMS(KP522026385, T2u, T2v);
|
||||
T2K = FMA(KP853480001, T2t, T2s);
|
||||
}
|
||||
T2p = FMA(KP302775637, T2o, T2n);
|
||||
T2i = FNMS(KP038632954, T2h, T2c);
|
||||
T2l = FNMS(KP612264650, T2k, T2j);
|
||||
T2m = FNMS(KP853480001, T2l, T2i);
|
||||
T2C = FMA(KP853480001, T2l, T2i);
|
||||
T2M = FNMS(KP522026385, T2m, T2p);
|
||||
{
|
||||
E T26, T2y, T1V, T22, T2x;
|
||||
T26 = FNMS(KP514918778, T25, T24);
|
||||
T2y = FNMS(KP859542535, T26, T27);
|
||||
T1V = FNMS(KP083333333, T1U, T1P);
|
||||
T22 = FMA(KP301479260, T21, T1Y);
|
||||
T2x = FNMS(KP251768516, T22, T1V);
|
||||
T23 = FMA(KP503537032, T22, T1V);
|
||||
T2J = FNMS(KP300462606, T2y, T2x);
|
||||
T28 = FMA(KP581704778, T27, T26);
|
||||
T2z = FMA(KP300462606, T2y, T2x);
|
||||
}
|
||||
{
|
||||
E T29, T2q, T2L, T2O;
|
||||
T29 = FNMS(KP516520780, T28, T23);
|
||||
T2q = FMA(KP957805992, T2p, T2m);
|
||||
io[WS(os, 5)] = FNMS(KP600477271, T2q, T29);
|
||||
io[WS(os, 8)] = FMA(KP600477271, T2q, T29);
|
||||
{
|
||||
E T2r, T2w, T2P, T2Q;
|
||||
T2r = FMA(KP516520780, T28, T23);
|
||||
T2w = FMA(KP957805992, T2v, T2u);
|
||||
io[WS(os, 1)] = FMA(KP600477271, T2w, T2r);
|
||||
io[WS(os, 12)] = FNMS(KP600477271, T2w, T2r);
|
||||
T2P = FMA(KP520028571, T2K, T2J);
|
||||
T2Q = FMA(KP875502302, T2N, T2M);
|
||||
io[WS(os, 6)] = FNMS(KP575140729, T2Q, T2P);
|
||||
io[WS(os, 11)] = FMA(KP575140729, T2Q, T2P);
|
||||
}
|
||||
T2L = FNMS(KP520028571, T2K, T2J);
|
||||
T2O = FNMS(KP875502302, T2N, T2M);
|
||||
io[WS(os, 2)] = FNMS(KP575140729, T2O, T2L);
|
||||
io[WS(os, 7)] = FMA(KP575140729, T2O, T2L);
|
||||
{
|
||||
E T2H, T2I, T2B, T2G;
|
||||
T2H = FNMS(KP575140729, T2A, T2z);
|
||||
T2I = FMA(KP968287244, T2F, T2C);
|
||||
io[WS(os, 4)] = FNMS(KP520028571, T2I, T2H);
|
||||
io[WS(os, 10)] = FMA(KP520028571, T2I, T2H);
|
||||
T2B = FMA(KP575140729, T2A, T2z);
|
||||
T2G = FNMS(KP968287244, T2F, T2C);
|
||||
io[WS(os, 3)] = FNMS(KP520028571, T2G, T2B);
|
||||
io[WS(os, 9)] = FMA(KP520028571, T2G, T2B);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor that codelet_n1_13 below passes to X(kdft_register).
 * The triple { 62, 0, 114 } repeats the "62 additions, 0 multiplications,
 * 114 fused multiply/add" figures from the generator comment of this branch.
 * NOTE(review): the leading 13 is presumably the transform size, and the
 * meaning of the remaining zero fields is not visible here -- confirm
 * against the kdft_desc declaration in dft/codelet-dft.h.
 */
static const kdft_desc desc = { 13, "n1_13", { 62, 0, 114, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Hand the n1_13 kernel (FMA branch) and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_13) (planner *p) { X(kdft_register) (p, n1_13, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 176 FP additions, 68 FP multiplications,
|
||||
* (or, 138 additions, 30 multiplications, 38 fused multiply/add),
|
||||
* 71 stack variables, 20 constants, and 52 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_13, non-FMA branch (the #else side of the ARCH_PREFERS_FMA test):
 * straight-line kernel emitted by genfft's gen_notw for size 13 without
 * the -fma option (see the "Generated by" comment above).
 *
 * Each pass of the loop reads ri[0], ii[0] and ri[WS(is, k)], ii[WS(is, k)]
 * for k = 1..12, and writes ro[0], io[0] and ro[WS(os, k)], io[WS(os, k)]
 * for k = 1..12.
 *
 * ri, ii   - input arrays (presumably the real and imaginary parts --
 *            confirm against dft/codelet-dft.h)
 * ro, io   - output arrays, paired the same way
 * is, os   - strides handed to WS() for input and output indexing
 * v        - number of loop passes
 * ivs, ovs - amounts added to the input / output pointers after each pass
 */
static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants referenced by the arithmetic below. */
DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
|
||||
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
|
||||
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
|
||||
DK(KP075902986, +0.075902986037193865983102897245103540356428373);
|
||||
DK(KP132983124, +0.132983124607418643793760531921092974399165133);
|
||||
DK(KP258260390, +0.258260390311744861420450644284508567852516811);
|
||||
DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
|
||||
DK(KP300238635, +0.300238635966332641462884626667381504676006424);
|
||||
DK(KP011599105, +0.011599105605768290721655456654083252189827041);
|
||||
DK(KP156891391, +0.156891391051584611046832726756003269660212636);
|
||||
DK(KP256247671, +0.256247671582936600958684654061725059144125175);
|
||||
DK(KP174138601, +0.174138601152135905005660794929264742616964676);
|
||||
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
|
||||
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
|
||||
DK(KP113854479, +0.113854479055790798974654345867655310534642560);
|
||||
DK(KP265966249, +0.265966249214837287587521063842185948798330267);
|
||||
DK(KP387390585, +0.387390585467617292130675966426762851778775217);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* v passes; after each pass ri/ii advance by ivs and ro/io advance by ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
|
||||
E T1, T1q, Tt, Tu, To, T22, T20, T24, TF, TH, TA, TI, T1X, T25, T2a;
|
||||
E T2d, T18, T1n, T2k, T2n, T1l, T1r, T1f, T1o, T2h, T2m;
|
||||
T1 = ri[0];
|
||||
T1q = ii[0];
|
||||
/* Load ri[1..12] (indexed through is) and form the sums/differences that both output groups consume. */
{
|
||||
E Tf, Tp, Tb, TC, Tx, T6, TB, Tw, Ti, Tq, Tl, Tr, Tm, Ts, Td;
|
||||
E Te, Tc, Tn;
|
||||
Td = ri[WS(is, 8)];
|
||||
Te = ri[WS(is, 5)];
|
||||
Tf = Td + Te;
|
||||
Tp = Td - Te;
|
||||
{
|
||||
E T7, T8, T9, Ta;
|
||||
T7 = ri[WS(is, 12)];
|
||||
T8 = ri[WS(is, 10)];
|
||||
T9 = ri[WS(is, 4)];
|
||||
Ta = T8 + T9;
|
||||
Tb = T7 + Ta;
|
||||
TC = T8 - T9;
|
||||
Tx = FNMS(KP500000000, Ta, T7);
|
||||
}
|
||||
{
|
||||
E T2, T3, T4, T5;
|
||||
T2 = ri[WS(is, 1)];
|
||||
T3 = ri[WS(is, 3)];
|
||||
T4 = ri[WS(is, 9)];
|
||||
T5 = T3 + T4;
|
||||
T6 = T2 + T5;
|
||||
TB = T3 - T4;
|
||||
Tw = FNMS(KP500000000, T5, T2);
|
||||
}
|
||||
{
|
||||
E Tg, Th, Tj, Tk;
|
||||
Tg = ri[WS(is, 11)];
|
||||
Th = ri[WS(is, 6)];
|
||||
Ti = Tg + Th;
|
||||
Tq = Tg - Th;
|
||||
Tj = ri[WS(is, 7)];
|
||||
Tk = ri[WS(is, 2)];
|
||||
Tl = Tj + Tk;
|
||||
Tr = Tj - Tk;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
Ts = Tq + Tr;
|
||||
Tt = Tp + Ts;
|
||||
Tu = T6 - Tb;
|
||||
Tc = T6 + Tb;
|
||||
Tn = Tf + Tm;
|
||||
To = Tc + Tn;
|
||||
T22 = KP300462606 * (Tc - Tn);
|
||||
{
|
||||
E T1Y, T1Z, TD, TE;
|
||||
T1Y = TB + TC;
|
||||
T1Z = Tq - Tr;
|
||||
T20 = T1Y - T1Z;
|
||||
T24 = T1Y + T1Z;
|
||||
TD = KP866025403 * (TB - TC);
|
||||
TE = FNMS(KP500000000, Ts, Tp);
|
||||
TF = TD - TE;
|
||||
TH = TD + TE;
|
||||
}
|
||||
{
|
||||
E Ty, Tz, T1V, T1W;
|
||||
Ty = Tw - Tx;
|
||||
Tz = KP866025403 * (Ti - Tl);
|
||||
TA = Ty + Tz;
|
||||
TI = Ty - Tz;
|
||||
T1V = Tw + Tx;
|
||||
T1W = FNMS(KP500000000, Tm, Tf);
|
||||
T1X = T1V - T1W;
|
||||
T25 = T1V + T1W;
|
||||
}
|
||||
}
|
||||
/* Same treatment for ii[1..12]. */
{
|
||||
E TZ, T2b, TV, T1i, T1a, TQ, T1h, T19, T12, T1d, T15, T1c, T16, T2c, TX;
|
||||
E TY, TW, T17;
|
||||
TX = ii[WS(is, 8)];
|
||||
TY = ii[WS(is, 5)];
|
||||
TZ = TX + TY;
|
||||
T2b = TX - TY;
|
||||
{
|
||||
E TR, TS, TT, TU;
|
||||
TR = ii[WS(is, 12)];
|
||||
TS = ii[WS(is, 10)];
|
||||
TT = ii[WS(is, 4)];
|
||||
TU = TS + TT;
|
||||
TV = FNMS(KP500000000, TU, TR);
|
||||
T1i = TR + TU;
|
||||
T1a = TS - TT;
|
||||
}
|
||||
{
|
||||
E TM, TN, TO, TP;
|
||||
TM = ii[WS(is, 1)];
|
||||
TN = ii[WS(is, 3)];
|
||||
TO = ii[WS(is, 9)];
|
||||
TP = TN + TO;
|
||||
TQ = FNMS(KP500000000, TP, TM);
|
||||
T1h = TM + TP;
|
||||
T19 = TN - TO;
|
||||
}
|
||||
{
|
||||
E T10, T11, T13, T14;
|
||||
T10 = ii[WS(is, 11)];
|
||||
T11 = ii[WS(is, 6)];
|
||||
T12 = T10 + T11;
|
||||
T1d = T10 - T11;
|
||||
T13 = ii[WS(is, 7)];
|
||||
T14 = ii[WS(is, 2)];
|
||||
T15 = T13 + T14;
|
||||
T1c = T13 - T14;
|
||||
}
|
||||
T16 = T12 + T15;
|
||||
T2c = T1d + T1c;
|
||||
T2a = T1h - T1i;
|
||||
T2d = T2b + T2c;
|
||||
TW = TQ + TV;
|
||||
T17 = FNMS(KP500000000, T16, TZ);
|
||||
T18 = TW - T17;
|
||||
T1n = TW + T17;
|
||||
{
|
||||
E T2i, T2j, T1j, T1k;
|
||||
T2i = TQ - TV;
|
||||
T2j = KP866025403 * (T15 - T12);
|
||||
T2k = T2i + T2j;
|
||||
T2n = T2i - T2j;
|
||||
T1j = T1h + T1i;
|
||||
T1k = TZ + T16;
|
||||
T1l = KP300462606 * (T1j - T1k);
|
||||
T1r = T1j + T1k;
|
||||
}
|
||||
{
|
||||
E T1b, T1e, T2f, T2g;
|
||||
T1b = T19 + T1a;
|
||||
T1e = T1c - T1d;
|
||||
T1f = T1b + T1e;
|
||||
T1o = T1e - T1b;
|
||||
T2f = FNMS(KP500000000, T2c, T2b);
|
||||
T2g = KP866025403 * (T1a - T19);
|
||||
T2h = T2f - T2g;
|
||||
T2m = T2g + T2f;
|
||||
}
|
||||
}
|
||||
/* Output 0 of each array is the plain sum of that array's 13 inputs. */
ro[0] = T1 + To;
|
||||
io[0] = T1q + T1r;
|
||||
/* Compute and store io[1..12] (indexed through os). */
{
|
||||
E T1D, T1N, T1y, T1x, T1E, T1O, Tv, TK, T1J, T1Q, T1m, T1R, T1t, T1I, TG;
|
||||
E TJ;
|
||||
{
|
||||
E T1B, T1C, T1v, T1w;
|
||||
T1B = FMA(KP387390585, T1f, KP265966249 * T18);
|
||||
T1C = FMA(KP113854479, T1o, KP503537032 * T1n);
|
||||
T1D = T1B + T1C;
|
||||
T1N = T1C - T1B;
|
||||
T1y = FMA(KP575140729, Tu, KP174138601 * Tt);
|
||||
T1v = FNMS(KP156891391, TH, KP256247671 * TI);
|
||||
T1w = FMA(KP011599105, TF, KP300238635 * TA);
|
||||
T1x = T1v - T1w;
|
||||
T1E = T1y + T1x;
|
||||
T1O = KP1_732050807 * (T1v + T1w);
|
||||
}
|
||||
Tv = FNMS(KP174138601, Tu, KP575140729 * Tt);
|
||||
TG = FNMS(KP300238635, TF, KP011599105 * TA);
|
||||
TJ = FMA(KP256247671, TH, KP156891391 * TI);
|
||||
TK = TG - TJ;
|
||||
T1J = KP1_732050807 * (TJ + TG);
|
||||
T1Q = Tv - TK;
|
||||
{
|
||||
E T1g, T1H, T1p, T1s, T1G;
|
||||
T1g = FNMS(KP132983124, T1f, KP258260390 * T18);
|
||||
T1H = T1l - T1g;
|
||||
T1p = FNMS(KP251768516, T1o, KP075902986 * T1n);
|
||||
T1s = FNMS(KP083333333, T1r, T1q);
|
||||
T1G = T1s - T1p;
|
||||
T1m = FMA(KP2_000000000, T1g, T1l);
|
||||
T1R = T1H + T1G;
|
||||
T1t = FMA(KP2_000000000, T1p, T1s);
|
||||
T1I = T1G - T1H;
|
||||
}
|
||||
{
|
||||
E TL, T1u, T1P, T1S;
|
||||
TL = FMA(KP2_000000000, TK, Tv);
|
||||
T1u = T1m + T1t;
|
||||
io[WS(os, 1)] = TL + T1u;
|
||||
io[WS(os, 12)] = T1u - TL;
|
||||
{
|
||||
E T1z, T1A, T1T, T1U;
|
||||
T1z = FMS(KP2_000000000, T1x, T1y);
|
||||
T1A = T1t - T1m;
|
||||
io[WS(os, 5)] = T1z + T1A;
|
||||
io[WS(os, 8)] = T1A - T1z;
|
||||
T1T = T1R - T1Q;
|
||||
T1U = T1O + T1N;
|
||||
io[WS(os, 4)] = T1T - T1U;
|
||||
io[WS(os, 10)] = T1U + T1T;
|
||||
}
|
||||
T1P = T1N - T1O;
|
||||
T1S = T1Q + T1R;
|
||||
io[WS(os, 3)] = T1P + T1S;
|
||||
io[WS(os, 9)] = T1S - T1P;
|
||||
{
|
||||
E T1L, T1M, T1F, T1K;
|
||||
T1L = T1J + T1I;
|
||||
T1M = T1E + T1D;
|
||||
io[WS(os, 6)] = T1L - T1M;
|
||||
io[WS(os, 11)] = T1M + T1L;
|
||||
T1F = T1D - T1E;
|
||||
T1K = T1I - T1J;
|
||||
io[WS(os, 2)] = T1F + T1K;
|
||||
io[WS(os, 7)] = T1K - T1F;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Compute and store ro[1..12] (indexed through os). */
{
|
||||
E T2y, T2I, T2J, T2K, T2B, T2L, T2e, T2p, T2u, T2G, T23, T2F, T28, T2t, T2l;
|
||||
E T2o;
|
||||
{
|
||||
E T2w, T2x, T2z, T2A;
|
||||
T2w = FMA(KP387390585, T20, KP265966249 * T1X);
|
||||
T2x = FNMS(KP503537032, T25, KP113854479 * T24);
|
||||
T2y = T2w + T2x;
|
||||
T2I = T2w - T2x;
|
||||
T2J = FMA(KP575140729, T2a, KP174138601 * T2d);
|
||||
T2z = FNMS(KP300238635, T2n, KP011599105 * T2m);
|
||||
T2A = FNMS(KP156891391, T2h, KP256247671 * T2k);
|
||||
T2K = T2z + T2A;
|
||||
T2B = KP1_732050807 * (T2z - T2A);
|
||||
T2L = T2J + T2K;
|
||||
}
|
||||
T2e = FNMS(KP575140729, T2d, KP174138601 * T2a);
|
||||
T2l = FMA(KP256247671, T2h, KP156891391 * T2k);
|
||||
T2o = FMA(KP300238635, T2m, KP011599105 * T2n);
|
||||
T2p = T2l - T2o;
|
||||
T2u = T2e - T2p;
|
||||
T2G = KP1_732050807 * (T2o + T2l);
|
||||
{
|
||||
E T21, T2r, T26, T27, T2s;
|
||||
T21 = FNMS(KP132983124, T20, KP258260390 * T1X);
|
||||
T2r = T22 - T21;
|
||||
T26 = FMA(KP251768516, T24, KP075902986 * T25);
|
||||
T27 = FNMS(KP083333333, To, T1);
|
||||
T2s = T27 - T26;
|
||||
T23 = FMA(KP2_000000000, T21, T22);
|
||||
T2F = T2s - T2r;
|
||||
T28 = FMA(KP2_000000000, T26, T27);
|
||||
T2t = T2r + T2s;
|
||||
}
|
||||
{
|
||||
E T29, T2q, T2N, T2O;
|
||||
T29 = T23 + T28;
|
||||
T2q = FMA(KP2_000000000, T2p, T2e);
|
||||
ro[WS(os, 12)] = T29 - T2q;
|
||||
ro[WS(os, 1)] = T29 + T2q;
|
||||
{
|
||||
E T2v, T2C, T2P, T2Q;
|
||||
T2v = T2t - T2u;
|
||||
T2C = T2y - T2B;
|
||||
ro[WS(os, 10)] = T2v - T2C;
|
||||
ro[WS(os, 4)] = T2v + T2C;
|
||||
T2P = T28 - T23;
|
||||
T2Q = FMS(KP2_000000000, T2K, T2J);
|
||||
ro[WS(os, 5)] = T2P - T2Q;
|
||||
ro[WS(os, 8)] = T2P + T2Q;
|
||||
}
|
||||
T2N = T2F - T2G;
|
||||
T2O = T2L - T2I;
|
||||
ro[WS(os, 11)] = T2N - T2O;
|
||||
ro[WS(os, 6)] = T2N + T2O;
|
||||
{
|
||||
E T2H, T2M, T2D, T2E;
|
||||
T2H = T2F + T2G;
|
||||
T2M = T2I + T2L;
|
||||
ro[WS(os, 7)] = T2H - T2M;
|
||||
ro[WS(os, 2)] = T2H + T2M;
|
||||
T2D = T2t + T2u;
|
||||
T2E = T2y + T2B;
|
||||
ro[WS(os, 3)] = T2D - T2E;
|
||||
ro[WS(os, 9)] = T2D + T2E;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor that codelet_n1_13 below passes to X(kdft_register).
 * The triple { 138, 30, 38 } repeats the "138 additions, 30 multiplications,
 * 38 fused multiply/add" figures from the generator comment of this branch.
 * NOTE(review): the leading 13 is presumably the transform size, and the
 * meaning of the remaining zero fields is not visible here -- confirm
 * against the kdft_desc declaration in dft/codelet-dft.h.
 */
static const kdft_desc desc = { 13, "n1_13", { 138, 30, 38, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Hand the n1_13 kernel (non-FMA branch) and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_13) (planner *p) { X(kdft_register) (p, n1_13, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
513
fftw-3.3.10/dft/scalar/codelets/n1_14.c
Normal file
513
fftw-3.3.10/dft/scalar/codelets/n1_14.c
Normal file
@@ -0,0 +1,513 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 148 FP additions, 84 FP multiplications,
|
||||
* (or, 64 additions, 0 multiplications, 84 fused multiply/add),
|
||||
* 67 stack variables, 6 constants, and 56 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_14, FMA branch: straight-line kernel emitted by genfft's gen_notw for
 * size 14 (see the "Generated by" comment above).
 *
 * Each pass of the loop reads ri[0], ii[0] and ri[WS(is, k)], ii[WS(is, k)]
 * for k = 1..13, and writes ro[0], io[0] and ro[WS(os, k)], io[WS(os, k)]
 * for k = 1..13.
 *
 * ri, ii   - input arrays (presumably the real and imaginary parts --
 *            confirm against dft/codelet-dft.h)
 * ro, io   - output arrays, paired the same way
 * is, os   - strides handed to WS() for input and output indexing
 * v        - number of loop passes
 * ivs, ovs - amounts added to the input / output pointers after each pass
 */
static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants referenced by the FMA/FNMS expressions below. */
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
|
||||
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
|
||||
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
|
||||
{
|
||||
INT i;
|
||||
/* v passes; after each pass ri/ii advance by ivs and ro/io advance by ovs. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
|
||||
E T3, Tp, T1b, T1x, T1i, T1L, T1M, T1j, T1k, T1K, Ta, To, Th, Tz, T14;
|
||||
E TZ, Ts, Ty, Tv, T1Z, T2c, T27, TI, T23, T24, TP, TW, T22, T1c, T1e;
|
||||
E T1d, T1f, T1s, T1n, T1A, T1G, T1D, T1H, T1U, T1P;
|
||||
/* Inputs 0 and 7 of each array: difference and sum. */
{
|
||||
E T1, T2, T19, T1a;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 7)];
|
||||
T3 = T1 - T2;
|
||||
Tp = T1 + T2;
|
||||
T19 = ii[0];
|
||||
T1a = ii[WS(is, 7)];
|
||||
T1b = T19 - T1a;
|
||||
T1x = T19 + T1a;
|
||||
}
|
||||
/* Remaining ri inputs, taken in pairs whose indices differ by 7: differences and sums, then their combinations. */
{
|
||||
E T6, Tq, T9, Tr, Tn, Tx, Tk, Tw, Tg, Tu, Td, Tt;
|
||||
{
|
||||
E T4, T5, Ti, Tj;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 9)];
|
||||
T6 = T4 - T5;
|
||||
Tq = T4 + T5;
|
||||
{
|
||||
E T7, T8, Tl, Tm;
|
||||
T7 = ri[WS(is, 12)];
|
||||
T8 = ri[WS(is, 5)];
|
||||
T9 = T7 - T8;
|
||||
Tr = T7 + T8;
|
||||
Tl = ri[WS(is, 8)];
|
||||
Tm = ri[WS(is, 1)];
|
||||
Tn = Tl - Tm;
|
||||
Tx = Tl + Tm;
|
||||
}
|
||||
Ti = ri[WS(is, 6)];
|
||||
Tj = ri[WS(is, 13)];
|
||||
Tk = Ti - Tj;
|
||||
Tw = Ti + Tj;
|
||||
{
|
||||
E Te, Tf, Tb, Tc;
|
||||
Te = ri[WS(is, 10)];
|
||||
Tf = ri[WS(is, 3)];
|
||||
Tg = Te - Tf;
|
||||
Tu = Te + Tf;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 11)];
|
||||
Td = Tb - Tc;
|
||||
Tt = Tb + Tc;
|
||||
}
|
||||
}
|
||||
T1i = Tn - Tk;
|
||||
T1L = Tt - Tu;
|
||||
T1M = Tr - Tq;
|
||||
T1j = Tg - Td;
|
||||
T1k = T9 - T6;
|
||||
T1K = Tw - Tx;
|
||||
Ta = T6 + T9;
|
||||
To = Tk + Tn;
|
||||
Th = Td + Tg;
|
||||
Tz = FNMS(KP356895867, Th, Ta);
|
||||
T14 = FNMS(KP356895867, To, Th);
|
||||
TZ = FNMS(KP356895867, Ta, To);
|
||||
Ts = Tq + Tr;
|
||||
Ty = Tw + Tx;
|
||||
Tv = Tt + Tu;
|
||||
T1Z = FNMS(KP356895867, Ts, Ty);
|
||||
T2c = FNMS(KP356895867, Ty, Tv);
|
||||
T27 = FNMS(KP356895867, Tv, Ts);
|
||||
}
|
||||
/* Same treatment for the remaining ii inputs. */
{
|
||||
E TE, T1B, TH, T1C, TV, T1F, TS, T1E, TO, T1z, TL, T1y;
|
||||
{
|
||||
E TC, TD, TQ, TR;
|
||||
TC = ii[WS(is, 4)];
|
||||
TD = ii[WS(is, 11)];
|
||||
TE = TC - TD;
|
||||
T1B = TC + TD;
|
||||
{
|
||||
E TF, TG, TT, TU;
|
||||
TF = ii[WS(is, 10)];
|
||||
TG = ii[WS(is, 3)];
|
||||
TH = TF - TG;
|
||||
T1C = TF + TG;
|
||||
TT = ii[WS(is, 8)];
|
||||
TU = ii[WS(is, 1)];
|
||||
TV = TT - TU;
|
||||
T1F = TT + TU;
|
||||
}
|
||||
TQ = ii[WS(is, 6)];
|
||||
TR = ii[WS(is, 13)];
|
||||
TS = TQ - TR;
|
||||
T1E = TQ + TR;
|
||||
{
|
||||
E TM, TN, TJ, TK;
|
||||
TM = ii[WS(is, 12)];
|
||||
TN = ii[WS(is, 5)];
|
||||
TO = TM - TN;
|
||||
T1z = TM + TN;
|
||||
TJ = ii[WS(is, 2)];
|
||||
TK = ii[WS(is, 9)];
|
||||
TL = TJ - TK;
|
||||
T1y = TJ + TK;
|
||||
}
|
||||
}
|
||||
TI = TE - TH;
|
||||
T23 = T1F - T1E;
|
||||
T24 = T1C - T1B;
|
||||
TP = TL - TO;
|
||||
TW = TS - TV;
|
||||
T22 = T1y - T1z;
|
||||
T1c = TL + TO;
|
||||
T1e = TS + TV;
|
||||
T1d = TE + TH;
|
||||
T1f = FNMS(KP356895867, T1e, T1d);
|
||||
T1s = FNMS(KP356895867, T1d, T1c);
|
||||
T1n = FNMS(KP356895867, T1c, T1e);
|
||||
T1A = T1y + T1z;
|
||||
T1G = T1E + T1F;
|
||||
T1D = T1B + T1C;
|
||||
T1H = FNMS(KP356895867, T1G, T1D);
|
||||
T1U = FNMS(KP356895867, T1D, T1A);
|
||||
T1P = FNMS(KP356895867, T1A, T1G);
|
||||
}
|
||||
/* Output 7 is the sum of the even-index inputs minus the sum of the odd-index inputs; output 0 is the sum of all 14. */
ro[WS(os, 7)] = T3 + Ta + Th + To;
|
||||
io[WS(os, 7)] = T1b + T1c + T1d + T1e;
|
||||
ro[0] = Tp + Ts + Tv + Ty;
|
||||
io[0] = T1x + T1A + T1D + T1G;
|
||||
/* Each remaining scope computes and stores one pair of outputs whose indices sum to 14. */
{
|
||||
E TB, TY, TA, TX;
|
||||
TA = FNMS(KP692021471, Tz, To);
|
||||
TB = FNMS(KP900968867, TA, T3);
|
||||
TX = FMA(KP554958132, TW, TP);
|
||||
TY = FMA(KP801937735, TX, TI);
|
||||
ro[WS(os, 13)] = FNMS(KP974927912, TY, TB);
|
||||
ro[WS(os, 1)] = FMA(KP974927912, TY, TB);
|
||||
}
|
||||
{
|
||||
E T1u, T1w, T1t, T1v;
|
||||
T1t = FNMS(KP692021471, T1s, T1e);
|
||||
T1u = FNMS(KP900968867, T1t, T1b);
|
||||
T1v = FMA(KP554958132, T1i, T1k);
|
||||
T1w = FMA(KP801937735, T1v, T1j);
|
||||
io[WS(os, 1)] = FMA(KP974927912, T1w, T1u);
|
||||
io[WS(os, 13)] = FNMS(KP974927912, T1w, T1u);
|
||||
}
|
||||
{
|
||||
E T11, T13, T10, T12;
|
||||
T10 = FNMS(KP692021471, TZ, Th);
|
||||
T11 = FNMS(KP900968867, T10, T3);
|
||||
T12 = FMA(KP554958132, TI, TW);
|
||||
T13 = FNMS(KP801937735, T12, TP);
|
||||
ro[WS(os, 5)] = FNMS(KP974927912, T13, T11);
|
||||
ro[WS(os, 9)] = FMA(KP974927912, T13, T11);
|
||||
}
|
||||
{
|
||||
E T1p, T1r, T1o, T1q;
|
||||
T1o = FNMS(KP692021471, T1n, T1d);
|
||||
T1p = FNMS(KP900968867, T1o, T1b);
|
||||
T1q = FMA(KP554958132, T1j, T1i);
|
||||
T1r = FNMS(KP801937735, T1q, T1k);
|
||||
io[WS(os, 5)] = FNMS(KP974927912, T1r, T1p);
|
||||
io[WS(os, 9)] = FMA(KP974927912, T1r, T1p);
|
||||
}
|
||||
{
|
||||
E T16, T18, T15, T17;
|
||||
T15 = FNMS(KP692021471, T14, Ta);
|
||||
T16 = FNMS(KP900968867, T15, T3);
|
||||
T17 = FNMS(KP554958132, TP, TI);
|
||||
T18 = FNMS(KP801937735, T17, TW);
|
||||
ro[WS(os, 11)] = FNMS(KP974927912, T18, T16);
|
||||
ro[WS(os, 3)] = FMA(KP974927912, T18, T16);
|
||||
}
|
||||
{
|
||||
E T1h, T1m, T1g, T1l;
|
||||
T1g = FNMS(KP692021471, T1f, T1c);
|
||||
T1h = FNMS(KP900968867, T1g, T1b);
|
||||
T1l = FNMS(KP554958132, T1k, T1j);
|
||||
T1m = FNMS(KP801937735, T1l, T1i);
|
||||
io[WS(os, 3)] = FMA(KP974927912, T1m, T1h);
|
||||
io[WS(os, 11)] = FNMS(KP974927912, T1m, T1h);
|
||||
}
|
||||
{
|
||||
E T1J, T1O, T1I, T1N;
|
||||
T1I = FNMS(KP692021471, T1H, T1A);
|
||||
T1J = FNMS(KP900968867, T1I, T1x);
|
||||
T1N = FMA(KP554958132, T1M, T1L);
|
||||
T1O = FNMS(KP801937735, T1N, T1K);
|
||||
io[WS(os, 4)] = FMA(KP974927912, T1O, T1J);
|
||||
io[WS(os, 10)] = FNMS(KP974927912, T1O, T1J);
|
||||
}
|
||||
{
|
||||
E T2e, T2g, T2d, T2f;
|
||||
T2d = FNMS(KP692021471, T2c, Ts);
|
||||
T2e = FNMS(KP900968867, T2d, Tp);
|
||||
T2f = FMA(KP554958132, T22, T24);
|
||||
T2g = FNMS(KP801937735, T2f, T23);
|
||||
ro[WS(os, 10)] = FNMS(KP974927912, T2g, T2e);
|
||||
ro[WS(os, 4)] = FMA(KP974927912, T2g, T2e);
|
||||
}
|
||||
{
|
||||
E T1R, T1T, T1Q, T1S;
|
||||
T1Q = FNMS(KP692021471, T1P, T1D);
|
||||
T1R = FNMS(KP900968867, T1Q, T1x);
|
||||
T1S = FMA(KP554958132, T1L, T1K);
|
||||
T1T = FMA(KP801937735, T1S, T1M);
|
||||
io[WS(os, 2)] = FMA(KP974927912, T1T, T1R);
|
||||
io[WS(os, 12)] = FNMS(KP974927912, T1T, T1R);
|
||||
}
|
||||
{
|
||||
E T21, T26, T20, T25;
|
||||
T20 = FNMS(KP692021471, T1Z, Tv);
|
||||
T21 = FNMS(KP900968867, T20, Tp);
|
||||
T25 = FMA(KP554958132, T24, T23);
|
||||
T26 = FMA(KP801937735, T25, T22);
|
||||
ro[WS(os, 12)] = FNMS(KP974927912, T26, T21);
|
||||
ro[WS(os, 2)] = FMA(KP974927912, T26, T21);
|
||||
}
|
||||
{
|
||||
E T1W, T1Y, T1V, T1X;
|
||||
T1V = FNMS(KP692021471, T1U, T1G);
|
||||
T1W = FNMS(KP900968867, T1V, T1x);
|
||||
T1X = FNMS(KP554958132, T1K, T1M);
|
||||
T1Y = FNMS(KP801937735, T1X, T1L);
|
||||
io[WS(os, 6)] = FMA(KP974927912, T1Y, T1W);
|
||||
io[WS(os, 8)] = FNMS(KP974927912, T1Y, T1W);
|
||||
}
|
||||
{
|
||||
E T29, T2b, T28, T2a;
|
||||
T28 = FNMS(KP692021471, T27, Ty);
|
||||
T29 = FNMS(KP900968867, T28, Tp);
|
||||
T2a = FNMS(KP554958132, T23, T22);
|
||||
T2b = FNMS(KP801937735, T2a, T24);
|
||||
ro[WS(os, 8)] = FNMS(KP974927912, T2b, T29);
|
||||
ro[WS(os, 6)] = FMA(KP974927912, T2b, T29);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Descriptor that codelet_n1_14 below passes to X(kdft_register).
 * The triple { 64, 0, 84 } repeats the "64 additions, 0 multiplications,
 * 84 fused multiply/add" figures from the generator comment of this branch.
 * NOTE(review): the leading 14 is presumably the transform size, and the
 * meaning of the remaining zero fields is not visible here -- confirm
 * against the kdft_desc declaration in dft/codelet-dft.h.
 */
static const kdft_desc desc = { 14, "n1_14", { 64, 0, 84, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Hand the n1_14 kernel (FMA branch) and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_14) (planner *p) { X(kdft_register) (p, n1_14, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 148 FP additions, 72 FP multiplications,
|
||||
* (or, 100 additions, 24 multiplications, 48 fused multiply/add),
|
||||
* 43 stack variables, 6 constants, and 56 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_14 (build without FMA preference): hard-coded size-14 transform codelet.
 * Each pass reads 14 elements from ri and from ii (element stride `is`) and
 * writes 14 elements to ro and to io (element stride `os`).  The loop runs `v`
 * passes, advancing ri/ii by `ivs` and ro/io by `ovs` after each one.
 * NOTE(review): ri/ii and ro/io look like split real/imaginary arrays -- confirm
 * against dft/codelet-dft.h.
 */
static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Six numeric constants used by the output combinations below (DK is defined elsewhere). */
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
|
||||
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
|
||||
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; pointers step by ivs/ovs in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
|
||||
E T3, Tp, T16, T1f, Ta, T1q, Ts, T10, TG, T1z, T19, T1i, Th, T1s, Tv;
|
||||
E T12, TU, T1B, T17, T1o, To, T1r, Ty, T11, TN, T1A, T18, T1l;
|
||||
{
|
||||
/* Inputs 0 and 7: difference (T3, T16) and sum (Tp, T1f) for ri and ii. */
E T1, T2, T14, T15;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 7)];
|
||||
T3 = T1 - T2;
|
||||
Tp = T1 + T2;
|
||||
T14 = ii[0];
|
||||
T15 = ii[WS(is, 7)];
|
||||
T16 = T14 - T15;
|
||||
T1f = T14 + T15;
|
||||
}
|
||||
{
|
||||
/* ri inputs 2/9 and 12/5: pairwise differences and sums, then combined. */
E T6, Tq, T9, Tr;
|
||||
{
|
||||
E T4, T5, T7, T8;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 9)];
|
||||
T6 = T4 - T5;
|
||||
Tq = T4 + T5;
|
||||
T7 = ri[WS(is, 12)];
|
||||
T8 = ri[WS(is, 5)];
|
||||
T9 = T7 - T8;
|
||||
Tr = T7 + T8;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
T1q = Tr - Tq;
|
||||
Ts = Tq + Tr;
|
||||
T10 = T9 - T6;
|
||||
}
|
||||
{
|
||||
/* ii inputs 2/9 and 12/5. */
E TC, T1g, TF, T1h;
|
||||
{
|
||||
E TA, TB, TD, TE;
|
||||
TA = ii[WS(is, 2)];
|
||||
TB = ii[WS(is, 9)];
|
||||
TC = TA - TB;
|
||||
T1g = TA + TB;
|
||||
TD = ii[WS(is, 12)];
|
||||
TE = ii[WS(is, 5)];
|
||||
TF = TD - TE;
|
||||
T1h = TD + TE;
|
||||
}
|
||||
TG = TC - TF;
|
||||
T1z = T1g - T1h;
|
||||
T19 = TC + TF;
|
||||
T1i = T1g + T1h;
|
||||
}
|
||||
{
|
||||
/* ri inputs 4/11 and 10/3. */
E Td, Tt, Tg, Tu;
|
||||
{
|
||||
E Tb, Tc, Te, Tf;
|
||||
Tb = ri[WS(is, 4)];
|
||||
Tc = ri[WS(is, 11)];
|
||||
Td = Tb - Tc;
|
||||
Tt = Tb + Tc;
|
||||
Te = ri[WS(is, 10)];
|
||||
Tf = ri[WS(is, 3)];
|
||||
Tg = Te - Tf;
|
||||
Tu = Te + Tf;
|
||||
}
|
||||
Th = Td + Tg;
|
||||
T1s = Tt - Tu;
|
||||
Tv = Tt + Tu;
|
||||
T12 = Tg - Td;
|
||||
}
|
||||
{
|
||||
/* ii inputs 4/11 and 10/3. */
E TQ, T1m, TT, T1n;
|
||||
{
|
||||
E TO, TP, TR, TS;
|
||||
TO = ii[WS(is, 4)];
|
||||
TP = ii[WS(is, 11)];
|
||||
TQ = TO - TP;
|
||||
T1m = TO + TP;
|
||||
TR = ii[WS(is, 10)];
|
||||
TS = ii[WS(is, 3)];
|
||||
TT = TR - TS;
|
||||
T1n = TR + TS;
|
||||
}
|
||||
TU = TQ - TT;
|
||||
T1B = T1n - T1m;
|
||||
T17 = TQ + TT;
|
||||
T1o = T1m + T1n;
|
||||
}
|
||||
{
|
||||
/* ri inputs 6/13 and 8/1. */
E Tk, Tw, Tn, Tx;
|
||||
{
|
||||
E Ti, Tj, Tl, Tm;
|
||||
Ti = ri[WS(is, 6)];
|
||||
Tj = ri[WS(is, 13)];
|
||||
Tk = Ti - Tj;
|
||||
Tw = Ti + Tj;
|
||||
Tl = ri[WS(is, 8)];
|
||||
Tm = ri[WS(is, 1)];
|
||||
Tn = Tl - Tm;
|
||||
Tx = Tl + Tm;
|
||||
}
|
||||
To = Tk + Tn;
|
||||
T1r = Tw - Tx;
|
||||
Ty = Tw + Tx;
|
||||
T11 = Tn - Tk;
|
||||
}
|
||||
{
|
||||
/* ii inputs 6/13 and 8/1. */
E TJ, T1j, TM, T1k;
|
||||
{
|
||||
E TH, TI, TK, TL;
|
||||
TH = ii[WS(is, 6)];
|
||||
TI = ii[WS(is, 13)];
|
||||
TJ = TH - TI;
|
||||
T1j = TH + TI;
|
||||
TK = ii[WS(is, 8)];
|
||||
TL = ii[WS(is, 1)];
|
||||
TM = TK - TL;
|
||||
T1k = TK + TL;
|
||||
}
|
||||
TN = TJ - TM;
|
||||
T1A = T1k - T1j;
|
||||
T18 = TJ + TM;
|
||||
T1l = T1j + T1k;
|
||||
}
|
||||
/* Output 7 is the plain sum of the difference terms; output 0 the plain sum of the sum terms. */
ro[WS(os, 7)] = T3 + Ta + Th + To;
|
||||
io[WS(os, 7)] = T16 + T19 + T17 + T18;
|
||||
ro[0] = Tp + Ts + Tv + Ty;
|
||||
io[0] = T1f + T1i + T1o + T1l;
|
||||
{
|
||||
/* Outputs 5 and 9. */
E TV, Tz, T1e, T1d;
|
||||
TV = FNMS(KP781831482, TN, KP974927912 * TG) - (KP433883739 * TU);
|
||||
Tz = FMA(KP623489801, To, T3) + FNMA(KP900968867, Th, KP222520933 * Ta);
|
||||
ro[WS(os, 5)] = Tz - TV;
|
||||
ro[WS(os, 9)] = Tz + TV;
|
||||
T1e = FNMS(KP781831482, T11, KP974927912 * T10) - (KP433883739 * T12);
|
||||
T1d = FMA(KP623489801, T18, T16) + FNMA(KP900968867, T17, KP222520933 * T19);
|
||||
io[WS(os, 5)] = T1d - T1e;
|
||||
io[WS(os, 9)] = T1e + T1d;
|
||||
}
|
||||
{
|
||||
/* Outputs 13 and 1. */
E TX, TW, T1b, T1c;
|
||||
TX = FMA(KP781831482, TG, KP974927912 * TU) + (KP433883739 * TN);
|
||||
TW = FMA(KP623489801, Ta, T3) + FNMA(KP900968867, To, KP222520933 * Th);
|
||||
ro[WS(os, 13)] = TW - TX;
|
||||
ro[WS(os, 1)] = TW + TX;
|
||||
T1b = FMA(KP781831482, T10, KP974927912 * T12) + (KP433883739 * T11);
|
||||
T1c = FMA(KP623489801, T19, T16) + FNMA(KP900968867, T18, KP222520933 * T17);
|
||||
io[WS(os, 1)] = T1b + T1c;
|
||||
io[WS(os, 13)] = T1c - T1b;
|
||||
}
|
||||
{
|
||||
/* Outputs 11 and 3. */
E TZ, TY, T13, T1a;
|
||||
TZ = FMA(KP433883739, TG, KP974927912 * TN) - (KP781831482 * TU);
|
||||
TY = FMA(KP623489801, Th, T3) + FNMA(KP222520933, To, KP900968867 * Ta);
|
||||
ro[WS(os, 11)] = TY - TZ;
|
||||
ro[WS(os, 3)] = TY + TZ;
|
||||
T13 = FMA(KP433883739, T10, KP974927912 * T11) - (KP781831482 * T12);
|
||||
T1a = FMA(KP623489801, T17, T16) + FNMA(KP222520933, T18, KP900968867 * T19);
|
||||
io[WS(os, 3)] = T13 + T1a;
|
||||
io[WS(os, 11)] = T1a - T13;
|
||||
}
|
||||
{
|
||||
/* Outputs 6 and 8. */
E T1t, T1p, T1C, T1y;
|
||||
T1t = FNMS(KP433883739, T1r, KP781831482 * T1q) - (KP974927912 * T1s);
|
||||
T1p = FMA(KP623489801, T1i, T1f) + FNMA(KP900968867, T1l, KP222520933 * T1o);
|
||||
io[WS(os, 6)] = T1p - T1t;
|
||||
io[WS(os, 8)] = T1t + T1p;
|
||||
T1C = FNMS(KP433883739, T1A, KP781831482 * T1z) - (KP974927912 * T1B);
|
||||
T1y = FMA(KP623489801, Ts, Tp) + FNMA(KP900968867, Ty, KP222520933 * Tv);
|
||||
ro[WS(os, 6)] = T1y - T1C;
|
||||
ro[WS(os, 8)] = T1y + T1C;
|
||||
}
|
||||
{
|
||||
/* Outputs 4 and 10. */
E T1v, T1u, T1E, T1D;
|
||||
T1v = FMA(KP433883739, T1q, KP781831482 * T1s) - (KP974927912 * T1r);
|
||||
T1u = FMA(KP623489801, T1o, T1f) + FNMA(KP222520933, T1l, KP900968867 * T1i);
|
||||
io[WS(os, 4)] = T1u - T1v;
|
||||
io[WS(os, 10)] = T1v + T1u;
|
||||
T1E = FMA(KP433883739, T1z, KP781831482 * T1B) - (KP974927912 * T1A);
|
||||
T1D = FMA(KP623489801, Tv, Tp) + FNMA(KP222520933, Ty, KP900968867 * Ts);
|
||||
ro[WS(os, 4)] = T1D - T1E;
|
||||
ro[WS(os, 10)] = T1D + T1E;
|
||||
}
|
||||
{
|
||||
/* Outputs 2 and 12. */
E T1w, T1x, T1G, T1F;
|
||||
T1w = FMA(KP974927912, T1q, KP433883739 * T1s) + (KP781831482 * T1r);
|
||||
T1x = FMA(KP623489801, T1l, T1f) + FNMA(KP900968867, T1o, KP222520933 * T1i);
|
||||
io[WS(os, 2)] = T1w + T1x;
|
||||
io[WS(os, 12)] = T1x - T1w;
|
||||
T1G = FMA(KP974927912, T1z, KP433883739 * T1B) + (KP781831482 * T1A);
|
||||
T1F = FMA(KP623489801, Ty, Tp) + FNMA(KP900968867, Tv, KP222520933 * Ts);
|
||||
ro[WS(os, 12)] = T1F - T1G;
|
||||
ro[WS(os, 2)] = T1F + T1G;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA n1_14: first field 14, name "n1_14", and the tuple { 100, 24, 48, 0 },
 * whose first three entries equal the additions / multiplications / fused multiply-adds
 * counted in this variant's generated header comment. */
static const kdft_desc desc = { 14, "n1_14", { 100, 24, 48, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration hook for the non-FMA branch: passes n1_14 and its descriptor to
 * X(kdft_register) for planner p.  X(...) is a macro defined elsewhere. */
void X(codelet_n1_14) (planner *p) { X(kdft_register) (p, n1_14, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
554
fftw-3.3.10/dft/scalar/codelets/n1_15.c
Normal file
554
fftw-3.3.10/dft/scalar/codelets/n1_15.c
Normal file
@@ -0,0 +1,554 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 156 FP additions, 84 FP multiplications,
|
||||
* (or, 72 additions, 0 multiplications, 84 fused multiply/add),
|
||||
* 69 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_15 (FMA-preferring build): hard-coded size-15 transform codelet.
 * Each pass reads 15 elements from ri and from ii (element stride `is`) and
 * writes 15 elements to ro and to io (element stride `os`).  The loop runs `v`
 * passes, advancing ri/ii by `ivs` and ro/io by `ovs` after each one.
 * All multiplications are expressed through the FMA/FNMS macros (defined elsewhere).
 * NOTE(review): ri/ii and ro/io look like split real/imaginary arrays -- confirm
 * against dft/codelet-dft.h.
 */
static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Six numeric constants used below (DK is defined elsewhere). */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; pointers step by ivs/ovs in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
|
||||
E T5, T2l, Tx, TV, T1z, T1X, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
|
||||
E T1O, T1P, T1Z, T1l, T1q, T1B, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
|
||||
E T2f, T2g, T2m, T1R, T1S, T1Y, T1a, T1f, T1A, TW, TX, TY;
|
||||
{
|
||||
/* Inputs 0, 5 and 10 combined into three intermediate values each for ri and ii. */
E T1, T1v, T4, T1y, Tw, T1w, Tt, T1x;
|
||||
T1 = ri[0];
|
||||
T1v = ii[0];
|
||||
{
|
||||
E T2, T3, Tu, Tv;
|
||||
T2 = ri[WS(is, 5)];
|
||||
T3 = ri[WS(is, 10)];
|
||||
T4 = T2 + T3;
|
||||
T1y = T3 - T2;
|
||||
Tu = ii[WS(is, 5)];
|
||||
Tv = ii[WS(is, 10)];
|
||||
Tw = Tu - Tv;
|
||||
T1w = Tu + Tv;
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
T2l = T1v + T1w;
|
||||
Tt = FNMS(KP500000000, T4, T1);
|
||||
Tx = FNMS(KP866025403, Tw, Tt);
|
||||
TV = FMA(KP866025403, Tw, Tt);
|
||||
T1x = FNMS(KP500000000, T1w, T1v);
|
||||
T1z = FMA(KP866025403, T1y, T1x);
|
||||
T1X = FNMS(KP866025403, T1y, T1x);
|
||||
}
|
||||
{
|
||||
/* Input triples (6, 11, 1) and (9, 14, 4), then sums across the two triples. */
E Th, Tk, TJ, T1k, T1h, T1i, TM, T1j, Tm, Tp, TO, T1p, T1m, T1n, TR;
|
||||
E T1o;
|
||||
{
|
||||
E Ti, Tj, TK, TL;
|
||||
Th = ri[WS(is, 6)];
|
||||
Ti = ri[WS(is, 11)];
|
||||
Tj = ri[WS(is, 1)];
|
||||
Tk = Ti + Tj;
|
||||
TJ = FNMS(KP500000000, Tk, Th);
|
||||
T1k = Tj - Ti;
|
||||
T1h = ii[WS(is, 6)];
|
||||
TK = ii[WS(is, 11)];
|
||||
TL = ii[WS(is, 1)];
|
||||
T1i = TK + TL;
|
||||
TM = TK - TL;
|
||||
T1j = FNMS(KP500000000, T1i, T1h);
|
||||
}
|
||||
{
|
||||
E Tn, To, TP, TQ;
|
||||
Tm = ri[WS(is, 9)];
|
||||
Tn = ri[WS(is, 14)];
|
||||
To = ri[WS(is, 4)];
|
||||
Tp = Tn + To;
|
||||
TO = FNMS(KP500000000, Tp, Tm);
|
||||
T1p = To - Tn;
|
||||
T1m = ii[WS(is, 9)];
|
||||
TP = ii[WS(is, 14)];
|
||||
TQ = ii[WS(is, 4)];
|
||||
T1n = TP + TQ;
|
||||
TR = TP - TQ;
|
||||
T1o = FNMS(KP500000000, T1n, T1m);
|
||||
}
|
||||
Tl = Th + Tk;
|
||||
Tq = Tm + Tp;
|
||||
Tr = Tl + Tq;
|
||||
TN = FNMS(KP866025403, TM, TJ);
|
||||
TS = FNMS(KP866025403, TR, TO);
|
||||
TT = TN + TS;
|
||||
T2c = T1h + T1i;
|
||||
T2d = T1m + T1n;
|
||||
T2n = T2c + T2d;
|
||||
T1O = FNMS(KP866025403, T1k, T1j);
|
||||
T1P = FNMS(KP866025403, T1p, T1o);
|
||||
T1Z = T1O + T1P;
|
||||
T1l = FMA(KP866025403, T1k, T1j);
|
||||
T1q = FMA(KP866025403, T1p, T1o);
|
||||
T1B = T1l + T1q;
|
||||
TZ = FMA(KP866025403, TM, TJ);
|
||||
T10 = FMA(KP866025403, TR, TO);
|
||||
T11 = TZ + T10;
|
||||
}
|
||||
{
|
||||
/* Input triples (3, 8, 13) and (12, 2, 7), then sums across the two triples. */
E T6, T9, Ty, T19, T16, T17, TB, T18, Tb, Te, TD, T1e, T1b, T1c, TG;
|
||||
E T1d;
|
||||
{
|
||||
E T7, T8, Tz, TA;
|
||||
T6 = ri[WS(is, 3)];
|
||||
T7 = ri[WS(is, 8)];
|
||||
T8 = ri[WS(is, 13)];
|
||||
T9 = T7 + T8;
|
||||
Ty = FNMS(KP500000000, T9, T6);
|
||||
T19 = T8 - T7;
|
||||
T16 = ii[WS(is, 3)];
|
||||
Tz = ii[WS(is, 8)];
|
||||
TA = ii[WS(is, 13)];
|
||||
T17 = Tz + TA;
|
||||
TB = Tz - TA;
|
||||
T18 = FNMS(KP500000000, T17, T16);
|
||||
}
|
||||
{
|
||||
E Tc, Td, TE, TF;
|
||||
Tb = ri[WS(is, 12)];
|
||||
Tc = ri[WS(is, 2)];
|
||||
Td = ri[WS(is, 7)];
|
||||
Te = Tc + Td;
|
||||
TD = FNMS(KP500000000, Te, Tb);
|
||||
T1e = Td - Tc;
|
||||
T1b = ii[WS(is, 12)];
|
||||
TE = ii[WS(is, 2)];
|
||||
TF = ii[WS(is, 7)];
|
||||
T1c = TE + TF;
|
||||
TG = TE - TF;
|
||||
T1d = FNMS(KP500000000, T1c, T1b);
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
Tf = Tb + Te;
|
||||
Tg = Ta + Tf;
|
||||
TC = FNMS(KP866025403, TB, Ty);
|
||||
TH = FNMS(KP866025403, TG, TD);
|
||||
TI = TC + TH;
|
||||
T2f = T16 + T17;
|
||||
T2g = T1b + T1c;
|
||||
T2m = T2f + T2g;
|
||||
T1R = FNMS(KP866025403, T19, T18);
|
||||
T1S = FNMS(KP866025403, T1e, T1d);
|
||||
T1Y = T1R + T1S;
|
||||
T1a = FMA(KP866025403, T19, T18);
|
||||
T1f = FMA(KP866025403, T1e, T1d);
|
||||
T1A = T1a + T1f;
|
||||
TW = FMA(KP866025403, TB, Ty);
|
||||
TX = FMA(KP866025403, TG, TD);
|
||||
TY = TW + TX;
|
||||
}
|
||||
{
|
||||
/* ro outputs 0, 9, 6, 12, 3. */
E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
|
||||
T2a = Tg - Tr;
|
||||
Ts = Tg + Tr;
|
||||
T29 = FNMS(KP250000000, Ts, T5);
|
||||
T2e = T2c - T2d;
|
||||
T2h = T2f - T2g;
|
||||
T2i = FNMS(KP618033988, T2h, T2e);
|
||||
T2k = FMA(KP618033988, T2e, T2h);
|
||||
ro[0] = T5 + Ts;
|
||||
T2j = FMA(KP559016994, T2a, T29);
|
||||
ro[WS(os, 9)] = FNMS(KP951056516, T2k, T2j);
|
||||
ro[WS(os, 6)] = FMA(KP951056516, T2k, T2j);
|
||||
T2b = FNMS(KP559016994, T2a, T29);
|
||||
ro[WS(os, 12)] = FNMS(KP951056516, T2i, T2b);
|
||||
ro[WS(os, 3)] = FMA(KP951056516, T2i, T2b);
|
||||
}
|
||||
{
|
||||
/* io outputs 0, 6, 9, 3, 12. */
E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
|
||||
T2q = T2m - T2n;
|
||||
T2o = T2m + T2n;
|
||||
T2p = FNMS(KP250000000, T2o, T2l);
|
||||
T2s = Tl - Tq;
|
||||
T2t = Ta - Tf;
|
||||
T2u = FNMS(KP618033988, T2t, T2s);
|
||||
T2w = FMA(KP618033988, T2s, T2t);
|
||||
io[0] = T2l + T2o;
|
||||
T2v = FMA(KP559016994, T2q, T2p);
|
||||
io[WS(os, 6)] = FNMS(KP951056516, T2w, T2v);
|
||||
io[WS(os, 9)] = FMA(KP951056516, T2w, T2v);
|
||||
T2r = FNMS(KP559016994, T2q, T2p);
|
||||
io[WS(os, 3)] = FNMS(KP951056516, T2u, T2r);
|
||||
io[WS(os, 12)] = FMA(KP951056516, T2u, T2r);
|
||||
}
|
||||
{
|
||||
/* ro outputs 5, 14, 11, 2, 8. */
E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
|
||||
T1M = TI - TT;
|
||||
TU = TI + TT;
|
||||
T1L = FNMS(KP250000000, TU, Tx);
|
||||
T1Q = T1O - T1P;
|
||||
T1T = T1R - T1S;
|
||||
T1U = FNMS(KP618033988, T1T, T1Q);
|
||||
T1W = FMA(KP618033988, T1Q, T1T);
|
||||
ro[WS(os, 5)] = Tx + TU;
|
||||
T1V = FMA(KP559016994, T1M, T1L);
|
||||
ro[WS(os, 14)] = FNMS(KP951056516, T1W, T1V);
|
||||
ro[WS(os, 11)] = FMA(KP951056516, T1W, T1V);
|
||||
T1N = FNMS(KP559016994, T1M, T1L);
|
||||
ro[WS(os, 2)] = FNMS(KP951056516, T1U, T1N);
|
||||
ro[WS(os, 8)] = FMA(KP951056516, T1U, T1N);
|
||||
}
|
||||
{
|
||||
/* io outputs 5, 11, 14, 2, 8. */
E T22, T20, T21, T26, T28, T24, T25, T27, T23;
|
||||
T22 = T1Y - T1Z;
|
||||
T20 = T1Y + T1Z;
|
||||
T21 = FNMS(KP250000000, T20, T1X);
|
||||
T24 = TN - TS;
|
||||
T25 = TC - TH;
|
||||
T26 = FNMS(KP618033988, T25, T24);
|
||||
T28 = FMA(KP618033988, T24, T25);
|
||||
io[WS(os, 5)] = T1X + T20;
|
||||
T27 = FMA(KP559016994, T22, T21);
|
||||
io[WS(os, 11)] = FNMS(KP951056516, T28, T27);
|
||||
io[WS(os, 14)] = FMA(KP951056516, T28, T27);
|
||||
T23 = FNMS(KP559016994, T22, T21);
|
||||
io[WS(os, 2)] = FMA(KP951056516, T26, T23);
|
||||
io[WS(os, 8)] = FNMS(KP951056516, T26, T23);
|
||||
}
|
||||
{
|
||||
/* io outputs 10, 7, 13, 1, 4. */
E T1E, T1C, T1D, T1I, T1K, T1G, T1H, T1J, T1F;
|
||||
T1E = T1A - T1B;
|
||||
T1C = T1A + T1B;
|
||||
T1D = FNMS(KP250000000, T1C, T1z);
|
||||
T1G = TW - TX;
|
||||
T1H = TZ - T10;
|
||||
T1I = FMA(KP618033988, T1H, T1G);
|
||||
T1K = FNMS(KP618033988, T1G, T1H);
|
||||
io[WS(os, 10)] = T1z + T1C;
|
||||
T1J = FNMS(KP559016994, T1E, T1D);
|
||||
io[WS(os, 7)] = FMA(KP951056516, T1K, T1J);
|
||||
io[WS(os, 13)] = FNMS(KP951056516, T1K, T1J);
|
||||
T1F = FMA(KP559016994, T1E, T1D);
|
||||
io[WS(os, 1)] = FNMS(KP951056516, T1I, T1F);
|
||||
io[WS(os, 4)] = FMA(KP951056516, T1I, T1F);
|
||||
}
|
||||
{
|
||||
/* ro outputs 10, 7, 13, 4, 1. */
E T14, T12, T13, T1s, T1u, T1g, T1r, T1t, T15;
|
||||
T14 = TY - T11;
|
||||
T12 = TY + T11;
|
||||
T13 = FNMS(KP250000000, T12, TV);
|
||||
T1g = T1a - T1f;
|
||||
T1r = T1l - T1q;
|
||||
T1s = FMA(KP618033988, T1r, T1g);
|
||||
T1u = FNMS(KP618033988, T1g, T1r);
|
||||
ro[WS(os, 10)] = TV + T12;
|
||||
T1t = FNMS(KP559016994, T14, T13);
|
||||
ro[WS(os, 7)] = FNMS(KP951056516, T1u, T1t);
|
||||
ro[WS(os, 13)] = FMA(KP951056516, T1u, T1t);
|
||||
T15 = FMA(KP559016994, T14, T13);
|
||||
ro[WS(os, 4)] = FNMS(KP951056516, T1s, T15);
|
||||
ro[WS(os, 1)] = FMA(KP951056516, T1s, T15);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA n1_15: first field 15, name "n1_15", and the tuple { 72, 0, 84, 0 },
 * whose first three entries equal the additions / multiplications / fused multiply-adds
 * counted in this variant's generated header comment. */
static const kdft_desc desc = { 15, "n1_15", { 72, 0, 84, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration hook for the FMA branch: passes n1_15 and its descriptor to
 * X(kdft_register) for planner p.  X(...) is a macro defined elsewhere. */
void X(codelet_n1_15) (planner *p) { X(kdft_register) (p, n1_15, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 156 FP additions, 56 FP multiplications,
|
||||
* (or, 128 additions, 28 multiplications, 28 fused multiply/add),
|
||||
* 69 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_15 (build without FMA preference): hard-coded size-15 transform codelet.
 * Each pass reads 15 elements from ri and from ii (element stride `is`) and
 * writes 15 elements to ro and to io (element stride `os`).  The loop runs `v`
 * passes, advancing ri/ii by `ivs` and ro/io by `ovs` after each one.
 * Unlike the FMA variant, several scalings are plain multiplications applied
 * when an intermediate is first formed (e.g. KP866025403 * (T3 - T2)).
 * NOTE(review): ri/ii and ro/io look like split real/imaginary arrays -- confirm
 * against dft/codelet-dft.h.
 */
static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Six numeric constants used below (DK is defined elsewhere). */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; pointers step by ivs/ovs in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
|
||||
E T5, T2l, Tx, TV, T1C, T20, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
|
||||
E T1O, T1P, T22, T1l, T1q, T1w, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
|
||||
E T2f, T2g, T2m, T1R, T1S, T21, T1a, T1f, T1v, TW, TX, TY;
|
||||
{
|
||||
/* Inputs 0, 5 and 10 combined into three intermediate values each for ri and ii. */
E T1, T1z, T4, T1y, Tw, T1A, Tt, T1B;
|
||||
T1 = ri[0];
|
||||
T1z = ii[0];
|
||||
{
|
||||
E T2, T3, Tu, Tv;
|
||||
T2 = ri[WS(is, 5)];
|
||||
T3 = ri[WS(is, 10)];
|
||||
T4 = T2 + T3;
|
||||
T1y = KP866025403 * (T3 - T2);
|
||||
Tu = ii[WS(is, 5)];
|
||||
Tv = ii[WS(is, 10)];
|
||||
Tw = KP866025403 * (Tu - Tv);
|
||||
T1A = Tu + Tv;
|
||||
}
|
||||
T5 = T1 + T4;
|
||||
T2l = T1z + T1A;
|
||||
Tt = FNMS(KP500000000, T4, T1);
|
||||
Tx = Tt - Tw;
|
||||
TV = Tt + Tw;
|
||||
T1B = FNMS(KP500000000, T1A, T1z);
|
||||
T1C = T1y + T1B;
|
||||
T20 = T1B - T1y;
|
||||
}
|
||||
{
|
||||
/* Input triples (6, 11, 1) and (9, 14, 4), then sums across the two triples. */
E Th, Tk, TJ, T1h, T1i, T1j, TM, T1k, Tm, Tp, TO, T1m, T1n, T1o, TR;
|
||||
E T1p;
|
||||
{
|
||||
E Ti, Tj, TK, TL;
|
||||
Th = ri[WS(is, 6)];
|
||||
Ti = ri[WS(is, 11)];
|
||||
Tj = ri[WS(is, 1)];
|
||||
Tk = Ti + Tj;
|
||||
TJ = FNMS(KP500000000, Tk, Th);
|
||||
T1h = KP866025403 * (Tj - Ti);
|
||||
T1i = ii[WS(is, 6)];
|
||||
TK = ii[WS(is, 11)];
|
||||
TL = ii[WS(is, 1)];
|
||||
T1j = TK + TL;
|
||||
TM = KP866025403 * (TK - TL);
|
||||
T1k = FNMS(KP500000000, T1j, T1i);
|
||||
}
|
||||
{
|
||||
E Tn, To, TP, TQ;
|
||||
Tm = ri[WS(is, 9)];
|
||||
Tn = ri[WS(is, 14)];
|
||||
To = ri[WS(is, 4)];
|
||||
Tp = Tn + To;
|
||||
TO = FNMS(KP500000000, Tp, Tm);
|
||||
T1m = KP866025403 * (To - Tn);
|
||||
T1n = ii[WS(is, 9)];
|
||||
TP = ii[WS(is, 14)];
|
||||
TQ = ii[WS(is, 4)];
|
||||
T1o = TP + TQ;
|
||||
TR = KP866025403 * (TP - TQ);
|
||||
T1p = FNMS(KP500000000, T1o, T1n);
|
||||
}
|
||||
Tl = Th + Tk;
|
||||
Tq = Tm + Tp;
|
||||
Tr = Tl + Tq;
|
||||
TN = TJ - TM;
|
||||
TS = TO - TR;
|
||||
TT = TN + TS;
|
||||
T2c = T1i + T1j;
|
||||
T2d = T1n + T1o;
|
||||
T2n = T2c + T2d;
|
||||
T1O = T1k - T1h;
|
||||
T1P = T1p - T1m;
|
||||
T22 = T1O + T1P;
|
||||
T1l = T1h + T1k;
|
||||
T1q = T1m + T1p;
|
||||
T1w = T1l + T1q;
|
||||
TZ = TJ + TM;
|
||||
T10 = TO + TR;
|
||||
T11 = TZ + T10;
|
||||
}
|
||||
{
|
||||
/* Input triples (3, 8, 13) and (12, 2, 7), then sums across the two triples. */
E T6, T9, Ty, T16, T17, T18, TB, T19, Tb, Te, TD, T1b, T1c, T1d, TG;
|
||||
E T1e;
|
||||
{
|
||||
E T7, T8, Tz, TA;
|
||||
T6 = ri[WS(is, 3)];
|
||||
T7 = ri[WS(is, 8)];
|
||||
T8 = ri[WS(is, 13)];
|
||||
T9 = T7 + T8;
|
||||
Ty = FNMS(KP500000000, T9, T6);
|
||||
T16 = KP866025403 * (T8 - T7);
|
||||
T17 = ii[WS(is, 3)];
|
||||
Tz = ii[WS(is, 8)];
|
||||
TA = ii[WS(is, 13)];
|
||||
T18 = Tz + TA;
|
||||
TB = KP866025403 * (Tz - TA);
|
||||
T19 = FNMS(KP500000000, T18, T17);
|
||||
}
|
||||
{
|
||||
E Tc, Td, TE, TF;
|
||||
Tb = ri[WS(is, 12)];
|
||||
Tc = ri[WS(is, 2)];
|
||||
Td = ri[WS(is, 7)];
|
||||
Te = Tc + Td;
|
||||
TD = FNMS(KP500000000, Te, Tb);
|
||||
T1b = KP866025403 * (Td - Tc);
|
||||
T1c = ii[WS(is, 12)];
|
||||
TE = ii[WS(is, 2)];
|
||||
TF = ii[WS(is, 7)];
|
||||
T1d = TE + TF;
|
||||
TG = KP866025403 * (TE - TF);
|
||||
T1e = FNMS(KP500000000, T1d, T1c);
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
Tf = Tb + Te;
|
||||
Tg = Ta + Tf;
|
||||
TC = Ty - TB;
|
||||
TH = TD - TG;
|
||||
TI = TC + TH;
|
||||
T2f = T17 + T18;
|
||||
T2g = T1c + T1d;
|
||||
T2m = T2f + T2g;
|
||||
T1R = T19 - T16;
|
||||
T1S = T1e - T1b;
|
||||
T21 = T1R + T1S;
|
||||
T1a = T16 + T19;
|
||||
T1f = T1b + T1e;
|
||||
T1v = T1a + T1f;
|
||||
TW = Ty + TB;
|
||||
TX = TD + TG;
|
||||
TY = TW + TX;
|
||||
}
|
||||
{
|
||||
/* ro outputs 0, 9, 6, 12, 3. */
E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
|
||||
T2a = KP559016994 * (Tg - Tr);
|
||||
Ts = Tg + Tr;
|
||||
T29 = FNMS(KP250000000, Ts, T5);
|
||||
T2e = T2c - T2d;
|
||||
T2h = T2f - T2g;
|
||||
T2i = FNMS(KP587785252, T2h, KP951056516 * T2e);
|
||||
T2k = FMA(KP951056516, T2h, KP587785252 * T2e);
|
||||
ro[0] = T5 + Ts;
|
||||
T2j = T2a + T29;
|
||||
ro[WS(os, 9)] = T2j - T2k;
|
||||
ro[WS(os, 6)] = T2j + T2k;
|
||||
T2b = T29 - T2a;
|
||||
ro[WS(os, 12)] = T2b - T2i;
|
||||
ro[WS(os, 3)] = T2b + T2i;
|
||||
}
|
||||
{
|
||||
/* io outputs 0, 6, 9, 3, 12. */
E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
|
||||
T2q = KP559016994 * (T2m - T2n);
|
||||
T2o = T2m + T2n;
|
||||
T2p = FNMS(KP250000000, T2o, T2l);
|
||||
T2s = Tl - Tq;
|
||||
T2t = Ta - Tf;
|
||||
T2u = FNMS(KP587785252, T2t, KP951056516 * T2s);
|
||||
T2w = FMA(KP951056516, T2t, KP587785252 * T2s);
|
||||
io[0] = T2l + T2o;
|
||||
T2v = T2q + T2p;
|
||||
io[WS(os, 6)] = T2v - T2w;
|
||||
io[WS(os, 9)] = T2w + T2v;
|
||||
T2r = T2p - T2q;
|
||||
io[WS(os, 3)] = T2r - T2u;
|
||||
io[WS(os, 12)] = T2u + T2r;
|
||||
}
|
||||
{
|
||||
/* ro outputs 5, 14, 11, 2, 8. */
E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
|
||||
T1M = KP559016994 * (TI - TT);
|
||||
TU = TI + TT;
|
||||
T1L = FNMS(KP250000000, TU, Tx);
|
||||
T1Q = T1O - T1P;
|
||||
T1T = T1R - T1S;
|
||||
T1U = FNMS(KP587785252, T1T, KP951056516 * T1Q);
|
||||
T1W = FMA(KP951056516, T1T, KP587785252 * T1Q);
|
||||
ro[WS(os, 5)] = Tx + TU;
|
||||
T1V = T1M + T1L;
|
||||
ro[WS(os, 14)] = T1V - T1W;
|
||||
ro[WS(os, 11)] = T1V + T1W;
|
||||
T1N = T1L - T1M;
|
||||
ro[WS(os, 2)] = T1N - T1U;
|
||||
ro[WS(os, 8)] = T1N + T1U;
|
||||
}
|
||||
{
|
||||
/* io outputs 5, 11, 14, 2, 8. */
E T25, T23, T24, T1Z, T28, T1X, T1Y, T27, T26;
|
||||
T25 = KP559016994 * (T21 - T22);
|
||||
T23 = T21 + T22;
|
||||
T24 = FNMS(KP250000000, T23, T20);
|
||||
T1X = TN - TS;
|
||||
T1Y = TC - TH;
|
||||
T1Z = FNMS(KP587785252, T1Y, KP951056516 * T1X);
|
||||
T28 = FMA(KP951056516, T1Y, KP587785252 * T1X);
|
||||
io[WS(os, 5)] = T20 + T23;
|
||||
T27 = T25 + T24;
|
||||
io[WS(os, 11)] = T27 - T28;
|
||||
io[WS(os, 14)] = T28 + T27;
|
||||
T26 = T24 - T25;
|
||||
io[WS(os, 2)] = T1Z + T26;
|
||||
io[WS(os, 8)] = T26 - T1Z;
|
||||
}
|
||||
{
|
||||
/* io outputs 10, 7, 13, 1, 4. */
E T1x, T1D, T1E, T1I, T1J, T1G, T1H, T1K, T1F;
|
||||
T1x = KP559016994 * (T1v - T1w);
|
||||
T1D = T1v + T1w;
|
||||
T1E = FNMS(KP250000000, T1D, T1C);
|
||||
T1G = TW - TX;
|
||||
T1H = TZ - T10;
|
||||
T1I = FMA(KP951056516, T1G, KP587785252 * T1H);
|
||||
T1J = FNMS(KP587785252, T1G, KP951056516 * T1H);
|
||||
io[WS(os, 10)] = T1C + T1D;
|
||||
T1K = T1E - T1x;
|
||||
io[WS(os, 7)] = T1J + T1K;
|
||||
io[WS(os, 13)] = T1K - T1J;
|
||||
T1F = T1x + T1E;
|
||||
io[WS(os, 1)] = T1F - T1I;
|
||||
io[WS(os, 4)] = T1I + T1F;
|
||||
}
|
||||
{
|
||||
/* ro outputs 10, 7, 13, 4, 1. */
E T13, T12, T14, T1s, T1u, T1g, T1r, T1t, T15;
|
||||
T13 = KP559016994 * (TY - T11);
|
||||
T12 = TY + T11;
|
||||
T14 = FNMS(KP250000000, T12, TV);
|
||||
T1g = T1a - T1f;
|
||||
T1r = T1l - T1q;
|
||||
T1s = FMA(KP951056516, T1g, KP587785252 * T1r);
|
||||
T1u = FNMS(KP587785252, T1g, KP951056516 * T1r);
|
||||
ro[WS(os, 10)] = TV + T12;
|
||||
T1t = T14 - T13;
|
||||
ro[WS(os, 7)] = T1t - T1u;
|
||||
ro[WS(os, 13)] = T1t + T1u;
|
||||
T15 = T13 + T14;
|
||||
ro[WS(os, 4)] = T15 - T1s;
|
||||
ro[WS(os, 1)] = T15 + T1s;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA n1_15: first field 15, name "n1_15", and the tuple { 128, 28, 28, 0 },
 * whose first three entries equal the additions / multiplications / fused multiply-adds
 * counted in this variant's generated header comment. */
static const kdft_desc desc = { 15, "n1_15", { 128, 28, 28, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Registration hook for the non-FMA branch: passes n1_15 and its descriptor to
 * X(kdft_register) for planner p.  X(...) is a macro defined elsewhere. */
void X(codelet_n1_15) (planner *p) { X(kdft_register) (p, n1_15, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
560
fftw-3.3.10/dft/scalar/codelets/n1_16.c
Normal file
560
fftw-3.3.10/dft/scalar/codelets/n1_16.c
Normal file
@@ -0,0 +1,560 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:25 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 144 FP additions, 40 FP multiplications,
|
||||
* (or, 104 additions, 0 multiplications, 40 fused multiply/add),
|
||||
* 50 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_16 (FMA-preferring build): hard-coded size-16 transform codelet.
 * Each pass reads 16 elements from ri and from ii (element stride `is`) and
 * writes 16 elements to ro and to io (element stride `os`).  The loop runs `v`
 * passes, advancing ri/ii by `ivs` and ro/io by `ovs` after each one.
 * NOTE(review): ri/ii and ro/io look like split real/imaginary arrays -- confirm
 * against dft/codelet-dft.h.
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Three numeric constants used below (DK is defined elsewhere). */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; pointers step by ivs/ovs in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
|
||||
E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
|
||||
E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
|
||||
E T1U, T1A;
|
||||
{
|
||||
/* Inputs 0/8 and 4/12: pairwise sums and differences, then combined. */
E T3, TL, Ty, T1k, T6, T1j, TB, TM;
|
||||
{
|
||||
E T1, T2, Tw, Tx;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 8)];
|
||||
T3 = T1 + T2;
|
||||
TL = T1 - T2;
|
||||
Tw = ii[0];
|
||||
Tx = ii[WS(is, 8)];
|
||||
Ty = Tw + Tx;
|
||||
T1k = Tw - Tx;
|
||||
}
|
||||
{
|
||||
E T4, T5, Tz, TA;
|
||||
T4 = ri[WS(is, 4)];
|
||||
T5 = ri[WS(is, 12)];
|
||||
T6 = T4 + T5;
|
||||
T1j = T4 - T5;
|
||||
Tz = ii[WS(is, 4)];
|
||||
TA = ii[WS(is, 12)];
|
||||
TB = Tz + TA;
|
||||
TM = Tz - TA;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1R = T3 - T6;
|
||||
T25 = Ty - TB;
|
||||
TC = Ty + TB;
|
||||
TN = TL - TM;
|
||||
T1x = TL + TM;
|
||||
T1H = T1k - T1j;
|
||||
T1l = T1j + T1k;
|
||||
}
|
||||
{
|
||||
/* Inputs 15/7 and 3/11. */
E Tp, T1c, T1a, T20, Ts, T17, T1f, T21;
|
||||
{
|
||||
E Tn, To, T18, T19;
|
||||
Tn = ri[WS(is, 15)];
|
||||
To = ri[WS(is, 7)];
|
||||
Tp = Tn + To;
|
||||
T1c = Tn - To;
|
||||
T18 = ii[WS(is, 15)];
|
||||
T19 = ii[WS(is, 7)];
|
||||
T1a = T18 - T19;
|
||||
T20 = T18 + T19;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T1d, T1e;
|
||||
Tq = ri[WS(is, 3)];
|
||||
Tr = ri[WS(is, 11)];
|
||||
Ts = Tq + Tr;
|
||||
T17 = Tq - Tr;
|
||||
T1d = ii[WS(is, 3)];
|
||||
T1e = ii[WS(is, 11)];
|
||||
T1f = T1d - T1e;
|
||||
T21 = T1d + T1e;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T22 = T20 - T21;
|
||||
T2h = T20 + T21;
|
||||
T1b = T17 + T1a;
|
||||
T1g = T1c - T1f;
|
||||
T1E = T1a - T17;
|
||||
T1Z = Tp - Ts;
|
||||
T1D = T1c + T1f;
|
||||
}
|
||||
{
|
||||
/* Inputs 2/10 and 14/6. */
E Ta, TP, TF, TO, Td, TR, TI, TS;
|
||||
{
|
||||
E T8, T9, TD, TE;
|
||||
T8 = ri[WS(is, 2)];
|
||||
T9 = ri[WS(is, 10)];
|
||||
Ta = T8 + T9;
|
||||
TP = T8 - T9;
|
||||
TD = ii[WS(is, 2)];
|
||||
TE = ii[WS(is, 10)];
|
||||
TF = TD + TE;
|
||||
TO = TD - TE;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TG, TH;
|
||||
Tb = ri[WS(is, 14)];
|
||||
Tc = ri[WS(is, 6)];
|
||||
Td = Tb + Tc;
|
||||
TR = Tb - Tc;
|
||||
TG = ii[WS(is, 14)];
|
||||
TH = ii[WS(is, 6)];
|
||||
TI = TG + TH;
|
||||
TS = TG - TH;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T1S = TF - TI;
|
||||
T26 = Td - Ta;
|
||||
TJ = TF + TI;
|
||||
TQ = TO - TP;
|
||||
T1m = TR - TS;
|
||||
T1n = TP + TO;
|
||||
TT = TR + TS;
|
||||
}
|
||||
{
|
||||
/* Inputs 1/9 and 5/13. */
E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
|
||||
{
|
||||
E Tg, Th, TX, TY;
|
||||
Tg = ri[WS(is, 1)];
|
||||
Th = ri[WS(is, 9)];
|
||||
Ti = Tg + Th;
|
||||
T11 = Tg - Th;
|
||||
TX = ii[WS(is, 1)];
|
||||
TY = ii[WS(is, 9)];
|
||||
TZ = TX - TY;
|
||||
T1V = TX + TY;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, T12, T13;
|
||||
Tj = ri[WS(is, 5)];
|
||||
Tk = ri[WS(is, 13)];
|
||||
Tl = Tj + Tk;
|
||||
TW = Tj - Tk;
|
||||
T12 = ii[WS(is, 5)];
|
||||
T13 = ii[WS(is, 13)];
|
||||
T14 = T12 - T13;
|
||||
T1W = T12 + T13;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T1X = T1V - T1W;
|
||||
T2g = T1V + T1W;
|
||||
T10 = TW + TZ;
|
||||
T15 = T11 - T14;
|
||||
T1B = TZ - TW;
|
||||
T1U = Ti - Tl;
|
||||
T1A = T11 + T14;
|
||||
}
|
||||
{
|
||||
/* Outputs 8 and 0: only additions and subtractions of the sum terms. */
E Tf, Tu, T2j, T2k;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
ro[WS(os, 8)] = Tf - Tu;
|
||||
ro[0] = Tf + Tu;
|
||||
T2j = TC + TJ;
|
||||
T2k = T2g + T2h;
|
||||
io[WS(os, 8)] = T2j - T2k;
|
||||
io[0] = T2j + T2k;
|
||||
}
|
||||
{
|
||||
/* Outputs 4 and 12: also multiplication-free. */
E Tv, TK, T2f, T2i;
|
||||
Tv = Tt - Tm;
|
||||
TK = TC - TJ;
|
||||
io[WS(os, 4)] = Tv + TK;
|
||||
io[WS(os, 12)] = TK - Tv;
|
||||
T2f = T7 - Te;
|
||||
T2i = T2g - T2h;
|
||||
ro[WS(os, 12)] = T2f - T2i;
|
||||
ro[WS(os, 4)] = T2f + T2i;
|
||||
}
|
||||
{
|
||||
/* ro outputs 10 and 2, io outputs 6 and 14 (scaled by KP707106781). */
E T1T, T27, T24, T28, T1Y, T23;
|
||||
T1T = T1R + T1S;
|
||||
T27 = T25 - T26;
|
||||
T1Y = T1U + T1X;
|
||||
T23 = T1Z - T22;
|
||||
T24 = T1Y + T23;
|
||||
T28 = T23 - T1Y;
|
||||
ro[WS(os, 10)] = FNMS(KP707106781, T24, T1T);
|
||||
io[WS(os, 6)] = FMA(KP707106781, T28, T27);
|
||||
ro[WS(os, 2)] = FMA(KP707106781, T24, T1T);
|
||||
io[WS(os, 14)] = FNMS(KP707106781, T28, T27);
|
||||
}
|
||||
{
|
||||
/* ro outputs 14 and 6, io outputs 2 and 10 (scaled by KP707106781). */
E T29, T2d, T2c, T2e, T2a, T2b;
|
||||
T29 = T1R - T1S;
|
||||
T2d = T26 + T25;
|
||||
T2a = T1X - T1U;
|
||||
T2b = T1Z + T22;
|
||||
T2c = T2a - T2b;
|
||||
T2e = T2a + T2b;
|
||||
ro[WS(os, 14)] = FNMS(KP707106781, T2c, T29);
|
||||
io[WS(os, 2)] = FMA(KP707106781, T2e, T2d);
|
||||
ro[WS(os, 6)] = FMA(KP707106781, T2c, T29);
|
||||
io[WS(os, 10)] = FNMS(KP707106781, T2e, T2d);
|
||||
}
|
||||
{
|
||||
/* Outputs 11, 3, 7 and 15 (both ro and io). */
E TV, T1v, T1p, T1r, T1i, T1q, T1u, T1w, TU, T1o;
|
||||
TU = TQ - TT;
|
||||
TV = FMA(KP707106781, TU, TN);
|
||||
T1v = FNMS(KP707106781, TU, TN);
|
||||
T1o = T1m - T1n;
|
||||
T1p = FNMS(KP707106781, T1o, T1l);
|
||||
T1r = FMA(KP707106781, T1o, T1l);
|
||||
{
|
||||
E T16, T1h, T1s, T1t;
|
||||
T16 = FMA(KP414213562, T15, T10);
|
||||
T1h = FNMS(KP414213562, T1g, T1b);
|
||||
T1i = T16 - T1h;
|
||||
T1q = T16 + T1h;
|
||||
T1s = FMA(KP414213562, T1b, T1g);
|
||||
T1t = FNMS(KP414213562, T10, T15);
|
||||
T1u = T1s - T1t;
|
||||
T1w = T1t + T1s;
|
||||
}
|
||||
ro[WS(os, 11)] = FNMS(KP923879532, T1i, TV);
|
||||
io[WS(os, 11)] = FNMS(KP923879532, T1u, T1r);
|
||||
ro[WS(os, 3)] = FMA(KP923879532, T1i, TV);
|
||||
io[WS(os, 3)] = FMA(KP923879532, T1u, T1r);
|
||||
io[WS(os, 7)] = FNMS(KP923879532, T1q, T1p);
|
||||
ro[WS(os, 7)] = FNMS(KP923879532, T1w, T1v);
|
||||
io[WS(os, 15)] = FMA(KP923879532, T1q, T1p);
|
||||
ro[WS(os, 15)] = FMA(KP923879532, T1w, T1v);
|
||||
}
|
||||
{
|
||||
/* Outputs 9, 1, 13 and 5 (both ro and io). */
E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
|
||||
T1y = T1n + T1m;
|
||||
T1z = FMA(KP707106781, T1y, T1x);
|
||||
T1L = FNMS(KP707106781, T1y, T1x);
|
||||
T1I = TQ + TT;
|
||||
T1J = FNMS(KP707106781, T1I, T1H);
|
||||
T1P = FMA(KP707106781, T1I, T1H);
|
||||
{
|
||||
E T1C, T1F, T1M, T1N;
|
||||
T1C = FMA(KP414213562, T1B, T1A);
|
||||
T1F = FNMS(KP414213562, T1E, T1D);
|
||||
T1G = T1C + T1F;
|
||||
T1K = T1F - T1C;
|
||||
T1M = FNMS(KP414213562, T1A, T1B);
|
||||
T1N = FMA(KP414213562, T1D, T1E);
|
||||
T1O = T1M - T1N;
|
||||
T1Q = T1M + T1N;
|
||||
}
|
||||
ro[WS(os, 9)] = FNMS(KP923879532, T1G, T1z);
|
||||
io[WS(os, 9)] = FNMS(KP923879532, T1Q, T1P);
|
||||
ro[WS(os, 1)] = FMA(KP923879532, T1G, T1z);
|
||||
io[WS(os, 1)] = FMA(KP923879532, T1Q, T1P);
|
||||
io[WS(os, 13)] = FNMS(KP923879532, T1K, T1J);
|
||||
ro[WS(os, 13)] = FNMS(KP923879532, T1O, T1L);
|
||||
io[WS(os, 5)] = FMA(KP923879532, T1K, T1J);
|
||||
ro[WS(os, 5)] = FMA(KP923879532, T1O, T1L);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 16, "n1_16", { 104, 0, 40, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_16) (planner *p) { X(kdft_register) (p, n1_16, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 144 FP additions, 24 FP multiplications,
|
||||
* (or, 136 additions, 16 multiplications, 8 fused multiply/add),
|
||||
* 50 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_16 (non-FMA variant): hard-coded transform of size 16, produced by
 * gen_notw with "-n 16" (see the generator line above).
 *
 * ri, ii  real and imaginary inputs; element k is read at offset WS(is, k)
 * ro, io  real and imaginary outputs; element k is written at WS(os, k)
 * is, os  input and output strides, only ever used through WS()
 * v       number of transforms to perform
 * ivs     added to ri and ii after each transform
 * ovs     added to ro and io after each transform
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* DK is a project macro defined elsewhere; presumably it declares the named constant. */
/* 0.38268... is sin(pi/8) */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
/* 0.92387... is cos(pi/8) */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
/* 0.70710... is sqrt(2)/2 */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform: i counts down from v while all four pointers
   step to the next transform.  MAKE_VOLATILE_STRIDE is a project macro
   applied to is and os on every step; its effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
|
||||
E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
|
||||
E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
|
||||
E T1U, T1A;
|
||||
{
|
||||
E T3, TL, Ty, T1k, T6, T1j, TB, TM;
|
||||
{
|
||||
E T1, T2, Tw, Tx;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 8)];
|
||||
T3 = T1 + T2;
|
||||
TL = T1 - T2;
|
||||
Tw = ii[0];
|
||||
Tx = ii[WS(is, 8)];
|
||||
Ty = Tw + Tx;
|
||||
T1k = Tw - Tx;
|
||||
}
|
||||
{
|
||||
E T4, T5, Tz, TA;
|
||||
T4 = ri[WS(is, 4)];
|
||||
T5 = ri[WS(is, 12)];
|
||||
T6 = T4 + T5;
|
||||
T1j = T4 - T5;
|
||||
Tz = ii[WS(is, 4)];
|
||||
TA = ii[WS(is, 12)];
|
||||
TB = Tz + TA;
|
||||
TM = Tz - TA;
|
||||
}
|
||||
T7 = T3 + T6;
|
||||
T1R = T3 - T6;
|
||||
T25 = Ty - TB;
|
||||
TC = Ty + TB;
|
||||
TN = TL - TM;
|
||||
T1x = TL + TM;
|
||||
T1H = T1k - T1j;
|
||||
T1l = T1j + T1k;
|
||||
}
|
||||
{
|
||||
E Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
|
||||
{
|
||||
E Tn, To, T1d, T1e;
|
||||
Tn = ri[WS(is, 15)];
|
||||
To = ri[WS(is, 7)];
|
||||
Tp = Tn + To;
|
||||
T17 = Tn - To;
|
||||
T1d = ii[WS(is, 15)];
|
||||
T1e = ii[WS(is, 7)];
|
||||
T1f = T1d - T1e;
|
||||
T20 = T1d + T1e;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T18, T19;
|
||||
Tq = ri[WS(is, 3)];
|
||||
Tr = ri[WS(is, 11)];
|
||||
Ts = Tq + Tr;
|
||||
T1c = Tq - Tr;
|
||||
T18 = ii[WS(is, 3)];
|
||||
T19 = ii[WS(is, 11)];
|
||||
T1a = T18 - T19;
|
||||
T21 = T18 + T19;
|
||||
}
|
||||
Tt = Tp + Ts;
|
||||
T22 = T20 - T21;
|
||||
T2h = T20 + T21;
|
||||
T1b = T17 - T1a;
|
||||
T1g = T1c + T1f;
|
||||
T1E = T1f - T1c;
|
||||
T1Z = Tp - Ts;
|
||||
T1D = T17 + T1a;
|
||||
}
|
||||
{
|
||||
E Ta, TP, TF, TO, Td, TR, TI, TS;
|
||||
{
|
||||
E T8, T9, TD, TE;
|
||||
T8 = ri[WS(is, 2)];
|
||||
T9 = ri[WS(is, 10)];
|
||||
Ta = T8 + T9;
|
||||
TP = T8 - T9;
|
||||
TD = ii[WS(is, 2)];
|
||||
TE = ii[WS(is, 10)];
|
||||
TF = TD + TE;
|
||||
TO = TD - TE;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, TG, TH;
|
||||
Tb = ri[WS(is, 14)];
|
||||
Tc = ri[WS(is, 6)];
|
||||
Td = Tb + Tc;
|
||||
TR = Tb - Tc;
|
||||
TG = ii[WS(is, 14)];
|
||||
TH = ii[WS(is, 6)];
|
||||
TI = TG + TH;
|
||||
TS = TG - TH;
|
||||
}
|
||||
Te = Ta + Td;
|
||||
T1S = TF - TI;
|
||||
T26 = Td - Ta;
|
||||
TJ = TF + TI;
|
||||
TQ = TO - TP;
|
||||
T1m = TR - TS;
|
||||
T1n = TP + TO;
|
||||
TT = TR + TS;
|
||||
}
|
||||
{
|
||||
E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
|
||||
{
|
||||
E Tg, Th, TX, TY;
|
||||
Tg = ri[WS(is, 1)];
|
||||
Th = ri[WS(is, 9)];
|
||||
Ti = Tg + Th;
|
||||
T11 = Tg - Th;
|
||||
TX = ii[WS(is, 1)];
|
||||
TY = ii[WS(is, 9)];
|
||||
TZ = TX - TY;
|
||||
T1V = TX + TY;
|
||||
}
|
||||
{
|
||||
E Tj, Tk, T12, T13;
|
||||
Tj = ri[WS(is, 5)];
|
||||
Tk = ri[WS(is, 13)];
|
||||
Tl = Tj + Tk;
|
||||
TW = Tj - Tk;
|
||||
T12 = ii[WS(is, 5)];
|
||||
T13 = ii[WS(is, 13)];
|
||||
T14 = T12 - T13;
|
||||
T1W = T12 + T13;
|
||||
}
|
||||
Tm = Ti + Tl;
|
||||
T1X = T1V - T1W;
|
||||
T2g = T1V + T1W;
|
||||
T10 = TW + TZ;
|
||||
T15 = T11 - T14;
|
||||
T1B = T11 + T14;
|
||||
T1U = Ti - Tl;
|
||||
T1A = TZ - TW;
|
||||
}
|
||||
{
|
||||
E Tf, Tu, T2j, T2k;
|
||||
Tf = T7 + Te;
|
||||
Tu = Tm + Tt;
|
||||
ro[WS(os, 8)] = Tf - Tu;
|
||||
ro[0] = Tf + Tu;
|
||||
T2j = TC + TJ;
|
||||
T2k = T2g + T2h;
|
||||
io[WS(os, 8)] = T2j - T2k;
|
||||
io[0] = T2j + T2k;
|
||||
}
|
||||
{
|
||||
E Tv, TK, T2f, T2i;
|
||||
Tv = Tt - Tm;
|
||||
TK = TC - TJ;
|
||||
io[WS(os, 4)] = Tv + TK;
|
||||
io[WS(os, 12)] = TK - Tv;
|
||||
T2f = T7 - Te;
|
||||
T2i = T2g - T2h;
|
||||
ro[WS(os, 12)] = T2f - T2i;
|
||||
ro[WS(os, 4)] = T2f + T2i;
|
||||
}
|
||||
{
|
||||
E T1T, T27, T24, T28, T1Y, T23;
|
||||
T1T = T1R + T1S;
|
||||
T27 = T25 - T26;
|
||||
T1Y = T1U + T1X;
|
||||
T23 = T1Z - T22;
|
||||
T24 = KP707106781 * (T1Y + T23);
|
||||
T28 = KP707106781 * (T23 - T1Y);
|
||||
ro[WS(os, 10)] = T1T - T24;
|
||||
io[WS(os, 6)] = T27 + T28;
|
||||
ro[WS(os, 2)] = T1T + T24;
|
||||
io[WS(os, 14)] = T27 - T28;
|
||||
}
|
||||
{
|
||||
E T29, T2d, T2c, T2e, T2a, T2b;
|
||||
T29 = T1R - T1S;
|
||||
T2d = T26 + T25;
|
||||
T2a = T1X - T1U;
|
||||
T2b = T1Z + T22;
|
||||
T2c = KP707106781 * (T2a - T2b);
|
||||
T2e = KP707106781 * (T2a + T2b);
|
||||
ro[WS(os, 14)] = T29 - T2c;
|
||||
io[WS(os, 2)] = T2d + T2e;
|
||||
ro[WS(os, 6)] = T29 + T2c;
|
||||
io[WS(os, 10)] = T2d - T2e;
|
||||
}
|
||||
{
|
||||
E TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
|
||||
TU = KP707106781 * (TQ - TT);
|
||||
TV = TN + TU;
|
||||
T1r = TN - TU;
|
||||
T1o = KP707106781 * (T1m - T1n);
|
||||
T1p = T1l - T1o;
|
||||
T1v = T1l + T1o;
|
||||
{
|
||||
E T16, T1h, T1s, T1t;
|
||||
T16 = FMA(KP923879532, T10, KP382683432 * T15);
|
||||
T1h = FNMS(KP923879532, T1g, KP382683432 * T1b);
|
||||
T1i = T16 + T1h;
|
||||
T1q = T1h - T16;
|
||||
T1s = FNMS(KP923879532, T15, KP382683432 * T10);
|
||||
T1t = FMA(KP382683432, T1g, KP923879532 * T1b);
|
||||
T1u = T1s - T1t;
|
||||
T1w = T1s + T1t;
|
||||
}
|
||||
ro[WS(os, 11)] = TV - T1i;
|
||||
io[WS(os, 11)] = T1v - T1w;
|
||||
ro[WS(os, 3)] = TV + T1i;
|
||||
io[WS(os, 3)] = T1v + T1w;
|
||||
io[WS(os, 15)] = T1p - T1q;
|
||||
ro[WS(os, 15)] = T1r - T1u;
|
||||
io[WS(os, 7)] = T1p + T1q;
|
||||
ro[WS(os, 7)] = T1r + T1u;
|
||||
}
|
||||
{
|
||||
E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
|
||||
T1y = KP707106781 * (T1n + T1m);
|
||||
T1z = T1x + T1y;
|
||||
T1L = T1x - T1y;
|
||||
T1I = KP707106781 * (TQ + TT);
|
||||
T1J = T1H - T1I;
|
||||
T1P = T1H + T1I;
|
||||
{
|
||||
E T1C, T1F, T1M, T1N;
|
||||
T1C = FMA(KP382683432, T1A, KP923879532 * T1B);
|
||||
T1F = FNMS(KP382683432, T1E, KP923879532 * T1D);
|
||||
T1G = T1C + T1F;
|
||||
T1K = T1F - T1C;
|
||||
T1M = FNMS(KP382683432, T1B, KP923879532 * T1A);
|
||||
T1N = FMA(KP923879532, T1E, KP382683432 * T1D);
|
||||
T1O = T1M - T1N;
|
||||
T1Q = T1M + T1N;
|
||||
}
|
||||
ro[WS(os, 9)] = T1z - T1G;
|
||||
io[WS(os, 9)] = T1P - T1Q;
|
||||
ro[WS(os, 1)] = T1z + T1G;
|
||||
io[WS(os, 1)] = T1P + T1Q;
|
||||
io[WS(os, 13)] = T1J - T1K;
|
||||
ro[WS(os, 13)] = T1L - T1O;
|
||||
io[WS(os, 5)] = T1J + T1K;
|
||||
ro[WS(os, 5)] = T1L + T1O;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 16, "n1_16", { 136, 16, 8, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_16) (planner *p) { X(kdft_register) (p, n1_16, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
94
fftw-3.3.10/dft/scalar/codelets/n1_2.c
Normal file
94
fftw-3.3.10/dft/scalar/codelets/n1_2.c
Normal file
@@ -0,0 +1,94 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 4 FP additions, 0 FP multiplications,
|
||||
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 5 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_2 (FMA branch): hard-coded transform of size 2 on split real/imaginary
 * arrays.  For each of the v transforms, output 0 is the sum and output 1 the
 * difference of the two inputs, computed separately for the real and the
 * imaginary parts.
 *
 * ri, ii  inputs, read at offsets 0 and WS(is, 1)
 * ro, io  outputs, written at offsets 0 and WS(os, 1)
 * ivs     added to ri and ii after each transform
 * ovs     added to ro and io after each transform
 */
static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0;
          --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
          MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
          E re0, re1, im0, im1;

          /* Real parts: both loads happen before either store. */
          re0 = ri[0];
          re1 = ri[WS(is, 1)];
          ro[WS(os, 1)] = re0 - re1;
          ro[0] = re0 + re1;

          /* Imaginary parts, handled the same way after the real stores. */
          im0 = ii[0];
          im1 = ii[WS(is, 1)];
          io[WS(os, 1)] = im0 - im1;
          io[0] = im0 + im1;
     }
}
|
||||
|
||||
static const kdft_desc desc = { 2, "n1_2", { 4, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_2) (planner *p) { X(kdft_register) (p, n1_2, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 4 FP additions, 0 FP multiplications,
|
||||
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 5 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_2 (non-FMA branch): hard-coded transform of size 2 on split
 * real/imaginary arrays.  For each of the v transforms, output 0 is the sum
 * and output 1 the difference of the two inputs, computed separately for the
 * real and the imaginary parts.
 *
 * ri, ii  inputs, read at offsets 0 and WS(is, 1)
 * ro, io  outputs, written at offsets 0 and WS(os, 1)
 * ivs     added to ri and ii after each transform
 * ovs     added to ro and io after each transform
 */
static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0;
          --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs,
          MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
          E re0, re1, im0, im1;

          /* Real parts: both loads happen before either store. */
          re0 = ri[0];
          re1 = ri[WS(is, 1)];
          ro[WS(os, 1)] = re0 - re1;
          ro[0] = re0 + re1;

          /* Imaginary parts, handled the same way after the real stores. */
          im0 = ii[0];
          im1 = ii[WS(is, 1)];
          io[WS(os, 1)] = im0 - im1;
          io[0] = im0 + im1;
     }
}
|
||||
|
||||
static const kdft_desc desc = { 2, "n1_2", { 4, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_2) (planner *p) { X(kdft_register) (p, n1_2, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
718
fftw-3.3.10/dft/scalar/codelets/n1_20.c
Normal file
718
fftw-3.3.10/dft/scalar/codelets/n1_20.c
Normal file
@@ -0,0 +1,718 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 208 FP additions, 72 FP multiplications,
|
||||
* (or, 136 additions, 0 multiplications, 72 fused multiply/add),
|
||||
* 81 stack variables, 4 constants, and 80 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_20 (FMA variant): hard-coded transform of size 20, produced by
 * gen_notw with "-fma -n 20" (see the generator line above).
 *
 * ri, ii  real and imaginary inputs; element k is read at offset WS(is, k)
 * ro, io  real and imaginary outputs; element k is written at WS(os, k)
 * is, os  input and output strides, only ever used through WS()
 * v       number of transforms to perform
 * ivs     added to ri and ii after each transform
 * ovs     added to ro and io after each transform
 */
static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* DK is a project macro defined elsewhere; presumably it declares the named constant. */
/* 0.95105... is sin(2*pi/5) */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
/* 0.55901... is sqrt(5)/4 */
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
/* 0.61803... is (sqrt(5) - 1)/2 */
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
/* exactly 1/4 */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform: i counts down from v while all four pointers
   step to the next transform.  MAKE_VOLATILE_STRIDE is a project macro
   applied to is and os on every step; its effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
|
||||
E T7, T2N, T3b, TD, TP, T1R, T2f, T1d, Tt, TA, TB, T2w, T2z, T2P, T35;
|
||||
E T36, T3d, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1T, T29, T2a, T2h, T1h;
|
||||
E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2O, T32, T33, T3c, TE, TF, TG, TU;
|
||||
E TZ, T10, T1D, T1I, T1S, T26, T27, T2g, T1e, T1f, T1g;
|
||||
{
|
||||
E T3, T1N, TN, T2L, T6, TO, T1Q, T2M;
|
||||
{
|
||||
E T1, T2, TL, TM;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 10)];
|
||||
T3 = T1 + T2;
|
||||
T1N = T1 - T2;
|
||||
TL = ii[0];
|
||||
TM = ii[WS(is, 10)];
|
||||
TN = TL - TM;
|
||||
T2L = TL + TM;
|
||||
}
|
||||
{
|
||||
E T4, T5, T1O, T1P;
|
||||
T4 = ri[WS(is, 5)];
|
||||
T5 = ri[WS(is, 15)];
|
||||
T6 = T4 + T5;
|
||||
TO = T4 - T5;
|
||||
T1O = ii[WS(is, 5)];
|
||||
T1P = ii[WS(is, 15)];
|
||||
T1Q = T1O - T1P;
|
||||
T2M = T1O + T1P;
|
||||
}
|
||||
T7 = T3 - T6;
|
||||
T2N = T2L - T2M;
|
||||
T3b = T2L + T2M;
|
||||
TD = T3 + T6;
|
||||
TP = TN - TO;
|
||||
T1R = T1N - T1Q;
|
||||
T2f = T1N + T1Q;
|
||||
T1d = TO + TN;
|
||||
}
|
||||
{
|
||||
E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
|
||||
E T2y;
|
||||
{
|
||||
E Tn, To, T11, T12;
|
||||
Tn = ri[WS(is, 8)];
|
||||
To = ri[WS(is, 18)];
|
||||
Tp = Tn + To;
|
||||
T1o = Tn - To;
|
||||
T11 = ii[WS(is, 8)];
|
||||
T12 = ii[WS(is, 18)];
|
||||
T13 = T11 - T12;
|
||||
T2u = T11 + T12;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T1p, T1q;
|
||||
Tq = ri[WS(is, 13)];
|
||||
Tr = ri[WS(is, 3)];
|
||||
Ts = Tq + Tr;
|
||||
T14 = Tq - Tr;
|
||||
T1p = ii[WS(is, 13)];
|
||||
T1q = ii[WS(is, 3)];
|
||||
T1r = T1p - T1q;
|
||||
T2v = T1p + T1q;
|
||||
}
|
||||
{
|
||||
E Tu, Tv, T16, T17;
|
||||
Tu = ri[WS(is, 12)];
|
||||
Tv = ri[WS(is, 2)];
|
||||
Tw = Tu + Tv;
|
||||
T1t = Tu - Tv;
|
||||
T16 = ii[WS(is, 12)];
|
||||
T17 = ii[WS(is, 2)];
|
||||
T18 = T16 - T17;
|
||||
T2x = T16 + T17;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, T1u, T1v;
|
||||
Tx = ri[WS(is, 17)];
|
||||
Ty = ri[WS(is, 7)];
|
||||
Tz = Tx + Ty;
|
||||
T19 = Tx - Ty;
|
||||
T1u = ii[WS(is, 17)];
|
||||
T1v = ii[WS(is, 7)];
|
||||
T1w = T1u - T1v;
|
||||
T2y = T1u + T1v;
|
||||
}
|
||||
Tt = Tp - Ts;
|
||||
TA = Tw - Tz;
|
||||
TB = Tt + TA;
|
||||
T2w = T2u - T2v;
|
||||
T2z = T2x - T2y;
|
||||
T2P = T2w + T2z;
|
||||
T35 = T2u + T2v;
|
||||
T36 = T2x + T2y;
|
||||
T3d = T35 + T36;
|
||||
TH = Tp + Ts;
|
||||
TI = Tw + Tz;
|
||||
TJ = TH + TI;
|
||||
T15 = T13 - T14;
|
||||
T1a = T18 - T19;
|
||||
T1b = T15 + T1a;
|
||||
T1s = T1o - T1r;
|
||||
T1x = T1t - T1w;
|
||||
T1T = T1s + T1x;
|
||||
T29 = T1o + T1r;
|
||||
T2a = T1t + T1w;
|
||||
T2h = T29 + T2a;
|
||||
T1h = T14 + T13;
|
||||
T1i = T19 + T18;
|
||||
T1j = T1h + T1i;
|
||||
}
|
||||
{
|
||||
E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
|
||||
E T2F;
|
||||
{
|
||||
E T8, T9, TQ, TR;
|
||||
T8 = ri[WS(is, 4)];
|
||||
T9 = ri[WS(is, 14)];
|
||||
Ta = T8 + T9;
|
||||
T1z = T8 - T9;
|
||||
TQ = ii[WS(is, 4)];
|
||||
TR = ii[WS(is, 14)];
|
||||
TS = TQ - TR;
|
||||
T2B = TQ + TR;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, T1A, T1B;
|
||||
Tb = ri[WS(is, 9)];
|
||||
Tc = ri[WS(is, 19)];
|
||||
Td = Tb + Tc;
|
||||
TT = Tb - Tc;
|
||||
T1A = ii[WS(is, 9)];
|
||||
T1B = ii[WS(is, 19)];
|
||||
T1C = T1A - T1B;
|
||||
T2C = T1A + T1B;
|
||||
}
|
||||
{
|
||||
E Tf, Tg, TV, TW;
|
||||
Tf = ri[WS(is, 16)];
|
||||
Tg = ri[WS(is, 6)];
|
||||
Th = Tf + Tg;
|
||||
T1E = Tf - Tg;
|
||||
TV = ii[WS(is, 16)];
|
||||
TW = ii[WS(is, 6)];
|
||||
TX = TV - TW;
|
||||
T2E = TV + TW;
|
||||
}
|
||||
{
|
||||
E Ti, Tj, T1F, T1G;
|
||||
Ti = ri[WS(is, 1)];
|
||||
Tj = ri[WS(is, 11)];
|
||||
Tk = Ti + Tj;
|
||||
TY = Ti - Tj;
|
||||
T1F = ii[WS(is, 1)];
|
||||
T1G = ii[WS(is, 11)];
|
||||
T1H = T1F - T1G;
|
||||
T2F = T1F + T1G;
|
||||
}
|
||||
Te = Ta - Td;
|
||||
Tl = Th - Tk;
|
||||
Tm = Te + Tl;
|
||||
T2D = T2B - T2C;
|
||||
T2G = T2E - T2F;
|
||||
T2O = T2D + T2G;
|
||||
T32 = T2B + T2C;
|
||||
T33 = T2E + T2F;
|
||||
T3c = T32 + T33;
|
||||
TE = Ta + Td;
|
||||
TF = Th + Tk;
|
||||
TG = TE + TF;
|
||||
TU = TS - TT;
|
||||
TZ = TX - TY;
|
||||
T10 = TU + TZ;
|
||||
T1D = T1z - T1C;
|
||||
T1I = T1E - T1H;
|
||||
T1S = T1D + T1I;
|
||||
T26 = T1z + T1C;
|
||||
T27 = T1E + T1H;
|
||||
T2g = T26 + T27;
|
||||
T1e = TT + TS;
|
||||
T1f = TY + TX;
|
||||
T1g = T1e + T1f;
|
||||
}
|
||||
{
|
||||
E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
|
||||
T2s = Tm - TB;
|
||||
TC = Tm + TB;
|
||||
T2r = FNMS(KP250000000, TC, T7);
|
||||
T2A = T2w - T2z;
|
||||
T2H = T2D - T2G;
|
||||
T2I = FNMS(KP618033988, T2H, T2A);
|
||||
T2K = FMA(KP618033988, T2A, T2H);
|
||||
ro[WS(os, 10)] = T7 + TC;
|
||||
T2J = FMA(KP559016994, T2s, T2r);
|
||||
ro[WS(os, 14)] = FNMS(KP951056516, T2K, T2J);
|
||||
ro[WS(os, 6)] = FMA(KP951056516, T2K, T2J);
|
||||
T2t = FNMS(KP559016994, T2s, T2r);
|
||||
ro[WS(os, 2)] = FNMS(KP951056516, T2I, T2t);
|
||||
ro[WS(os, 18)] = FMA(KP951056516, T2I, T2t);
|
||||
}
|
||||
{
|
||||
E T2S, T2Q, T2R, T2W, T2Y, T2U, T2V, T2X, T2T;
|
||||
T2S = T2O - T2P;
|
||||
T2Q = T2O + T2P;
|
||||
T2R = FNMS(KP250000000, T2Q, T2N);
|
||||
T2U = Tt - TA;
|
||||
T2V = Te - Tl;
|
||||
T2W = FNMS(KP618033988, T2V, T2U);
|
||||
T2Y = FMA(KP618033988, T2U, T2V);
|
||||
io[WS(os, 10)] = T2N + T2Q;
|
||||
T2X = FMA(KP559016994, T2S, T2R);
|
||||
io[WS(os, 6)] = FNMS(KP951056516, T2Y, T2X);
|
||||
io[WS(os, 14)] = FMA(KP951056516, T2Y, T2X);
|
||||
T2T = FNMS(KP559016994, T2S, T2R);
|
||||
io[WS(os, 2)] = FMA(KP951056516, T2W, T2T);
|
||||
io[WS(os, 18)] = FNMS(KP951056516, T2W, T2T);
|
||||
}
|
||||
{
|
||||
E T30, TK, T2Z, T38, T3a, T34, T37, T39, T31;
|
||||
T30 = TG - TJ;
|
||||
TK = TG + TJ;
|
||||
T2Z = FNMS(KP250000000, TK, TD);
|
||||
T34 = T32 - T33;
|
||||
T37 = T35 - T36;
|
||||
T38 = FMA(KP618033988, T37, T34);
|
||||
T3a = FNMS(KP618033988, T34, T37);
|
||||
ro[0] = TD + TK;
|
||||
T39 = FNMS(KP559016994, T30, T2Z);
|
||||
ro[WS(os, 12)] = FNMS(KP951056516, T3a, T39);
|
||||
ro[WS(os, 8)] = FMA(KP951056516, T3a, T39);
|
||||
T31 = FMA(KP559016994, T30, T2Z);
|
||||
ro[WS(os, 4)] = FNMS(KP951056516, T38, T31);
|
||||
ro[WS(os, 16)] = FMA(KP951056516, T38, T31);
|
||||
}
|
||||
{
|
||||
E T3g, T3e, T3f, T3k, T3m, T3i, T3j, T3l, T3h;
|
||||
T3g = T3c - T3d;
|
||||
T3e = T3c + T3d;
|
||||
T3f = FNMS(KP250000000, T3e, T3b);
|
||||
T3i = TE - TF;
|
||||
T3j = TH - TI;
|
||||
T3k = FMA(KP618033988, T3j, T3i);
|
||||
T3m = FNMS(KP618033988, T3i, T3j);
|
||||
io[0] = T3b + T3e;
|
||||
T3l = FNMS(KP559016994, T3g, T3f);
|
||||
io[WS(os, 8)] = FNMS(KP951056516, T3m, T3l);
|
||||
io[WS(os, 12)] = FMA(KP951056516, T3m, T3l);
|
||||
T3h = FMA(KP559016994, T3g, T3f);
|
||||
io[WS(os, 4)] = FMA(KP951056516, T3k, T3h);
|
||||
io[WS(os, 16)] = FNMS(KP951056516, T3k, T3h);
|
||||
}
|
||||
{
|
||||
E T24, T1c, T23, T2c, T2e, T28, T2b, T2d, T25;
|
||||
T24 = T10 - T1b;
|
||||
T1c = T10 + T1b;
|
||||
T23 = FNMS(KP250000000, T1c, TP);
|
||||
T28 = T26 - T27;
|
||||
T2b = T29 - T2a;
|
||||
T2c = FMA(KP618033988, T2b, T28);
|
||||
T2e = FNMS(KP618033988, T28, T2b);
|
||||
io[WS(os, 5)] = TP + T1c;
|
||||
T2d = FNMS(KP559016994, T24, T23);
|
||||
io[WS(os, 13)] = FNMS(KP951056516, T2e, T2d);
|
||||
io[WS(os, 17)] = FMA(KP951056516, T2e, T2d);
|
||||
T25 = FMA(KP559016994, T24, T23);
|
||||
io[WS(os, 1)] = FNMS(KP951056516, T2c, T25);
|
||||
io[WS(os, 9)] = FMA(KP951056516, T2c, T25);
|
||||
}
|
||||
{
|
||||
E T2k, T2i, T2j, T2o, T2q, T2m, T2n, T2p, T2l;
|
||||
T2k = T2g - T2h;
|
||||
T2i = T2g + T2h;
|
||||
T2j = FNMS(KP250000000, T2i, T2f);
|
||||
T2m = TU - TZ;
|
||||
T2n = T15 - T1a;
|
||||
T2o = FMA(KP618033988, T2n, T2m);
|
||||
T2q = FNMS(KP618033988, T2m, T2n);
|
||||
ro[WS(os, 5)] = T2f + T2i;
|
||||
T2p = FNMS(KP559016994, T2k, T2j);
|
||||
ro[WS(os, 13)] = FMA(KP951056516, T2q, T2p);
|
||||
ro[WS(os, 17)] = FNMS(KP951056516, T2q, T2p);
|
||||
T2l = FMA(KP559016994, T2k, T2j);
|
||||
ro[WS(os, 1)] = FMA(KP951056516, T2o, T2l);
|
||||
ro[WS(os, 9)] = FNMS(KP951056516, T2o, T2l);
|
||||
}
|
||||
{
|
||||
E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
|
||||
T1m = T1g - T1j;
|
||||
T1k = T1g + T1j;
|
||||
T1l = FNMS(KP250000000, T1k, T1d);
|
||||
T1y = T1s - T1x;
|
||||
T1J = T1D - T1I;
|
||||
T1K = FNMS(KP618033988, T1J, T1y);
|
||||
T1M = FMA(KP618033988, T1y, T1J);
|
||||
io[WS(os, 15)] = T1d + T1k;
|
||||
T1L = FMA(KP559016994, T1m, T1l);
|
||||
io[WS(os, 11)] = FNMS(KP951056516, T1M, T1L);
|
||||
io[WS(os, 19)] = FMA(KP951056516, T1M, T1L);
|
||||
T1n = FNMS(KP559016994, T1m, T1l);
|
||||
io[WS(os, 3)] = FNMS(KP951056516, T1K, T1n);
|
||||
io[WS(os, 7)] = FMA(KP951056516, T1K, T1n);
|
||||
}
|
||||
{
|
||||
E T1W, T1U, T1V, T20, T22, T1Y, T1Z, T21, T1X;
|
||||
T1W = T1S - T1T;
|
||||
T1U = T1S + T1T;
|
||||
T1V = FNMS(KP250000000, T1U, T1R);
|
||||
T1Y = T1h - T1i;
|
||||
T1Z = T1e - T1f;
|
||||
T20 = FNMS(KP618033988, T1Z, T1Y);
|
||||
T22 = FMA(KP618033988, T1Y, T1Z);
|
||||
ro[WS(os, 15)] = T1R + T1U;
|
||||
T21 = FMA(KP559016994, T1W, T1V);
|
||||
ro[WS(os, 11)] = FMA(KP951056516, T22, T21);
|
||||
ro[WS(os, 19)] = FNMS(KP951056516, T22, T21);
|
||||
T1X = FNMS(KP559016994, T1W, T1V);
|
||||
ro[WS(os, 3)] = FMA(KP951056516, T20, T1X);
|
||||
ro[WS(os, 7)] = FNMS(KP951056516, T20, T1X);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 20, "n1_20", { 136, 0, 72, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_20) (planner *p) { X(kdft_register) (p, n1_20, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 208 FP additions, 48 FP multiplications,
|
||||
* (or, 184 additions, 24 multiplications, 24 fused multiply/add),
|
||||
* 81 stack variables, 4 constants, and 80 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_20 (non-FMA variant): hard-coded transform of size 20, produced by
 * gen_notw with "-n 20" (see the generator line above).
 *
 * ri, ii  real and imaginary inputs; element k is read at offset WS(is, k)
 * ro, io  real and imaginary outputs; element k is written at WS(os, k)
 * is, os  input and output strides, only ever used through WS()
 * v       number of transforms to perform
 * ivs     added to ri and ii after each transform
 * ovs     added to ro and io after each transform
 */
static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* DK is a project macro defined elsewhere; presumably it declares the named constant. */
/* 0.58778... is sin(pi/5) */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
/* 0.95105... is sin(2*pi/5) */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
/* exactly 1/4 */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
/* 0.55901... is sqrt(5)/4 */
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform: i counts down from v while all four pointers
   step to the next transform.  MAKE_VOLATILE_STRIDE is a project macro
   applied to is and os on every step; its effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
|
||||
E T7, T2Q, T3h, TD, TP, T1U, T2l, T1d, Tt, TA, TB, T2w, T2z, T2S, T35;
|
||||
E T36, T3f, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1W, T29, T2a, T2j, T1h;
|
||||
E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2R, T32, T33, T3e, TE, TF, TG, TU;
|
||||
E TZ, T10, T1D, T1I, T1V, T26, T27, T2i, T1e, T1f, T1g;
|
||||
{
|
||||
E T3, T1Q, TN, T2O, T6, TO, T1T, T2P;
|
||||
{
|
||||
E T1, T2, TL, TM;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 10)];
|
||||
T3 = T1 + T2;
|
||||
T1Q = T1 - T2;
|
||||
TL = ii[0];
|
||||
TM = ii[WS(is, 10)];
|
||||
TN = TL - TM;
|
||||
T2O = TL + TM;
|
||||
}
|
||||
{
|
||||
E T4, T5, T1R, T1S;
|
||||
T4 = ri[WS(is, 5)];
|
||||
T5 = ri[WS(is, 15)];
|
||||
T6 = T4 + T5;
|
||||
TO = T4 - T5;
|
||||
T1R = ii[WS(is, 5)];
|
||||
T1S = ii[WS(is, 15)];
|
||||
T1T = T1R - T1S;
|
||||
T2P = T1R + T1S;
|
||||
}
|
||||
T7 = T3 - T6;
|
||||
T2Q = T2O - T2P;
|
||||
T3h = T2O + T2P;
|
||||
TD = T3 + T6;
|
||||
TP = TN - TO;
|
||||
T1U = T1Q - T1T;
|
||||
T2l = T1Q + T1T;
|
||||
T1d = TO + TN;
|
||||
}
|
||||
{
|
||||
E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
|
||||
E T2y;
|
||||
{
|
||||
E Tn, To, T11, T12;
|
||||
Tn = ri[WS(is, 8)];
|
||||
To = ri[WS(is, 18)];
|
||||
Tp = Tn + To;
|
||||
T1o = Tn - To;
|
||||
T11 = ii[WS(is, 8)];
|
||||
T12 = ii[WS(is, 18)];
|
||||
T13 = T11 - T12;
|
||||
T2u = T11 + T12;
|
||||
}
|
||||
{
|
||||
E Tq, Tr, T1p, T1q;
|
||||
Tq = ri[WS(is, 13)];
|
||||
Tr = ri[WS(is, 3)];
|
||||
Ts = Tq + Tr;
|
||||
T14 = Tq - Tr;
|
||||
T1p = ii[WS(is, 13)];
|
||||
T1q = ii[WS(is, 3)];
|
||||
T1r = T1p - T1q;
|
||||
T2v = T1p + T1q;
|
||||
}
|
||||
{
|
||||
E Tu, Tv, T16, T17;
|
||||
Tu = ri[WS(is, 12)];
|
||||
Tv = ri[WS(is, 2)];
|
||||
Tw = Tu + Tv;
|
||||
T1t = Tu - Tv;
|
||||
T16 = ii[WS(is, 12)];
|
||||
T17 = ii[WS(is, 2)];
|
||||
T18 = T16 - T17;
|
||||
T2x = T16 + T17;
|
||||
}
|
||||
{
|
||||
E Tx, Ty, T1u, T1v;
|
||||
Tx = ri[WS(is, 17)];
|
||||
Ty = ri[WS(is, 7)];
|
||||
Tz = Tx + Ty;
|
||||
T19 = Tx - Ty;
|
||||
T1u = ii[WS(is, 17)];
|
||||
T1v = ii[WS(is, 7)];
|
||||
T1w = T1u - T1v;
|
||||
T2y = T1u + T1v;
|
||||
}
|
||||
Tt = Tp - Ts;
|
||||
TA = Tw - Tz;
|
||||
TB = Tt + TA;
|
||||
T2w = T2u - T2v;
|
||||
T2z = T2x - T2y;
|
||||
T2S = T2w + T2z;
|
||||
T35 = T2u + T2v;
|
||||
T36 = T2x + T2y;
|
||||
T3f = T35 + T36;
|
||||
TH = Tp + Ts;
|
||||
TI = Tw + Tz;
|
||||
TJ = TH + TI;
|
||||
T15 = T13 - T14;
|
||||
T1a = T18 - T19;
|
||||
T1b = T15 + T1a;
|
||||
T1s = T1o - T1r;
|
||||
T1x = T1t - T1w;
|
||||
T1W = T1s + T1x;
|
||||
T29 = T1o + T1r;
|
||||
T2a = T1t + T1w;
|
||||
T2j = T29 + T2a;
|
||||
T1h = T14 + T13;
|
||||
T1i = T19 + T18;
|
||||
T1j = T1h + T1i;
|
||||
}
|
||||
{
|
||||
E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
|
||||
E T2F;
|
||||
{
|
||||
E T8, T9, TQ, TR;
|
||||
T8 = ri[WS(is, 4)];
|
||||
T9 = ri[WS(is, 14)];
|
||||
Ta = T8 + T9;
|
||||
T1z = T8 - T9;
|
||||
TQ = ii[WS(is, 4)];
|
||||
TR = ii[WS(is, 14)];
|
||||
TS = TQ - TR;
|
||||
T2B = TQ + TR;
|
||||
}
|
||||
{
|
||||
E Tb, Tc, T1A, T1B;
|
||||
Tb = ri[WS(is, 9)];
|
||||
Tc = ri[WS(is, 19)];
|
||||
Td = Tb + Tc;
|
||||
TT = Tb - Tc;
|
||||
T1A = ii[WS(is, 9)];
|
||||
T1B = ii[WS(is, 19)];
|
||||
T1C = T1A - T1B;
|
||||
T2C = T1A + T1B;
|
||||
}
|
||||
{
|
||||
E Tf, Tg, TV, TW;
|
||||
Tf = ri[WS(is, 16)];
|
||||
Tg = ri[WS(is, 6)];
|
||||
Th = Tf + Tg;
|
||||
T1E = Tf - Tg;
|
||||
TV = ii[WS(is, 16)];
|
||||
TW = ii[WS(is, 6)];
|
||||
TX = TV - TW;
|
||||
T2E = TV + TW;
|
||||
}
|
||||
{
|
||||
E Ti, Tj, T1F, T1G;
|
||||
Ti = ri[WS(is, 1)];
|
||||
Tj = ri[WS(is, 11)];
|
||||
Tk = Ti + Tj;
|
||||
TY = Ti - Tj;
|
||||
T1F = ii[WS(is, 1)];
|
||||
T1G = ii[WS(is, 11)];
|
||||
T1H = T1F - T1G;
|
||||
T2F = T1F + T1G;
|
||||
}
|
||||
Te = Ta - Td;
|
||||
Tl = Th - Tk;
|
||||
Tm = Te + Tl;
|
||||
T2D = T2B - T2C;
|
||||
T2G = T2E - T2F;
|
||||
T2R = T2D + T2G;
|
||||
T32 = T2B + T2C;
|
||||
T33 = T2E + T2F;
|
||||
T3e = T32 + T33;
|
||||
TE = Ta + Td;
|
||||
TF = Th + Tk;
|
||||
TG = TE + TF;
|
||||
TU = TS - TT;
|
||||
TZ = TX - TY;
|
||||
T10 = TU + TZ;
|
||||
T1D = T1z - T1C;
|
||||
T1I = T1E - T1H;
|
||||
T1V = T1D + T1I;
|
||||
T26 = T1z + T1C;
|
||||
T27 = T1E + T1H;
|
||||
T2i = T26 + T27;
|
||||
T1e = TT + TS;
|
||||
T1f = TY + TX;
|
||||
T1g = T1e + T1f;
|
||||
}
|
||||
{
|
||||
E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
|
||||
T2s = KP559016994 * (Tm - TB);
|
||||
TC = Tm + TB;
|
||||
T2r = FNMS(KP250000000, TC, T7);
|
||||
T2A = T2w - T2z;
|
||||
T2H = T2D - T2G;
|
||||
T2I = FNMS(KP587785252, T2H, KP951056516 * T2A);
|
||||
T2K = FMA(KP951056516, T2H, KP587785252 * T2A);
|
||||
ro[WS(os, 10)] = T7 + TC;
|
||||
T2J = T2s + T2r;
|
||||
ro[WS(os, 14)] = T2J - T2K;
|
||||
ro[WS(os, 6)] = T2J + T2K;
|
||||
T2t = T2r - T2s;
|
||||
ro[WS(os, 2)] = T2t - T2I;
|
||||
ro[WS(os, 18)] = T2t + T2I;
|
||||
}
|
||||
{
|
||||
E T2V, T2T, T2U, T2N, T2Y, T2L, T2M, T2X, T2W;
|
||||
T2V = KP559016994 * (T2R - T2S);
|
||||
T2T = T2R + T2S;
|
||||
T2U = FNMS(KP250000000, T2T, T2Q);
|
||||
T2L = Tt - TA;
|
||||
T2M = Te - Tl;
|
||||
T2N = FNMS(KP587785252, T2M, KP951056516 * T2L);
|
||||
T2Y = FMA(KP951056516, T2M, KP587785252 * T2L);
|
||||
io[WS(os, 10)] = T2Q + T2T;
|
||||
T2X = T2V + T2U;
|
||||
io[WS(os, 6)] = T2X - T2Y;
|
||||
io[WS(os, 14)] = T2Y + T2X;
|
||||
T2W = T2U - T2V;
|
||||
io[WS(os, 2)] = T2N + T2W;
|
||||
io[WS(os, 18)] = T2W - T2N;
|
||||
}
|
||||
{
|
||||
E T2Z, TK, T30, T38, T3a, T34, T37, T39, T31;
|
||||
T2Z = KP559016994 * (TG - TJ);
|
||||
TK = TG + TJ;
|
||||
T30 = FNMS(KP250000000, TK, TD);
|
||||
T34 = T32 - T33;
|
||||
T37 = T35 - T36;
|
||||
T38 = FMA(KP951056516, T34, KP587785252 * T37);
|
||||
T3a = FNMS(KP587785252, T34, KP951056516 * T37);
|
||||
ro[0] = TD + TK;
|
||||
T39 = T30 - T2Z;
|
||||
ro[WS(os, 12)] = T39 - T3a;
|
||||
ro[WS(os, 8)] = T39 + T3a;
|
||||
T31 = T2Z + T30;
|
||||
ro[WS(os, 4)] = T31 - T38;
|
||||
ro[WS(os, 16)] = T31 + T38;
|
||||
}
|
||||
{
|
||||
E T3g, T3i, T3j, T3d, T3m, T3b, T3c, T3l, T3k;
|
||||
T3g = KP559016994 * (T3e - T3f);
|
||||
T3i = T3e + T3f;
|
||||
T3j = FNMS(KP250000000, T3i, T3h);
|
||||
T3b = TE - TF;
|
||||
T3c = TH - TI;
|
||||
T3d = FMA(KP951056516, T3b, KP587785252 * T3c);
|
||||
T3m = FNMS(KP587785252, T3b, KP951056516 * T3c);
|
||||
io[0] = T3h + T3i;
|
||||
T3l = T3j - T3g;
|
||||
io[WS(os, 8)] = T3l - T3m;
|
||||
io[WS(os, 12)] = T3m + T3l;
|
||||
T3k = T3g + T3j;
|
||||
io[WS(os, 4)] = T3d + T3k;
|
||||
io[WS(os, 16)] = T3k - T3d;
|
||||
}
|
||||
{
|
||||
E T23, T1c, T24, T2c, T2e, T28, T2b, T2d, T25;
|
||||
T23 = KP559016994 * (T10 - T1b);
|
||||
T1c = T10 + T1b;
|
||||
T24 = FNMS(KP250000000, T1c, TP);
|
||||
T28 = T26 - T27;
|
||||
T2b = T29 - T2a;
|
||||
T2c = FMA(KP951056516, T28, KP587785252 * T2b);
|
||||
T2e = FNMS(KP587785252, T28, KP951056516 * T2b);
|
||||
io[WS(os, 5)] = TP + T1c;
|
||||
T2d = T24 - T23;
|
||||
io[WS(os, 13)] = T2d - T2e;
|
||||
io[WS(os, 17)] = T2d + T2e;
|
||||
T25 = T23 + T24;
|
||||
io[WS(os, 1)] = T25 - T2c;
|
||||
io[WS(os, 9)] = T25 + T2c;
|
||||
}
|
||||
{
|
||||
E T2k, T2m, T2n, T2h, T2p, T2f, T2g, T2q, T2o;
|
||||
T2k = KP559016994 * (T2i - T2j);
|
||||
T2m = T2i + T2j;
|
||||
T2n = FNMS(KP250000000, T2m, T2l);
|
||||
T2f = TU - TZ;
|
||||
T2g = T15 - T1a;
|
||||
T2h = FMA(KP951056516, T2f, KP587785252 * T2g);
|
||||
T2p = FNMS(KP587785252, T2f, KP951056516 * T2g);
|
||||
ro[WS(os, 5)] = T2l + T2m;
|
||||
T2q = T2n - T2k;
|
||||
ro[WS(os, 13)] = T2p + T2q;
|
||||
ro[WS(os, 17)] = T2q - T2p;
|
||||
T2o = T2k + T2n;
|
||||
ro[WS(os, 1)] = T2h + T2o;
|
||||
ro[WS(os, 9)] = T2o - T2h;
|
||||
}
|
||||
{
|
||||
E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
|
||||
T1m = KP559016994 * (T1g - T1j);
|
||||
T1k = T1g + T1j;
|
||||
T1l = FNMS(KP250000000, T1k, T1d);
|
||||
T1y = T1s - T1x;
|
||||
T1J = T1D - T1I;
|
||||
T1K = FNMS(KP587785252, T1J, KP951056516 * T1y);
|
||||
T1M = FMA(KP951056516, T1J, KP587785252 * T1y);
|
||||
io[WS(os, 15)] = T1d + T1k;
|
||||
T1L = T1m + T1l;
|
||||
io[WS(os, 11)] = T1L - T1M;
|
||||
io[WS(os, 19)] = T1L + T1M;
|
||||
T1n = T1l - T1m;
|
||||
io[WS(os, 3)] = T1n - T1K;
|
||||
io[WS(os, 7)] = T1n + T1K;
|
||||
}
|
||||
{
|
||||
E T1Z, T1X, T1Y, T1P, T21, T1N, T1O, T22, T20;
|
||||
T1Z = KP559016994 * (T1V - T1W);
|
||||
T1X = T1V + T1W;
|
||||
T1Y = FNMS(KP250000000, T1X, T1U);
|
||||
T1N = T1h - T1i;
|
||||
T1O = T1e - T1f;
|
||||
T1P = FNMS(KP587785252, T1O, KP951056516 * T1N);
|
||||
T21 = FMA(KP951056516, T1O, KP587785252 * T1N);
|
||||
ro[WS(os, 15)] = T1U + T1X;
|
||||
T22 = T1Z + T1Y;
|
||||
ro[WS(os, 11)] = T21 + T22;
|
||||
ro[WS(os, 19)] = T22 - T21;
|
||||
T20 = T1Y - T1Z;
|
||||
ro[WS(os, 3)] = T1P + T20;
|
||||
ro[WS(os, 7)] = T20 - T1P;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const kdft_desc desc = { 20, "n1_20", { 184, 24, 24, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_20) (planner *p) { X(kdft_register) (p, n1_20, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
1223
fftw-3.3.10/dft/scalar/codelets/n1_25.c
Normal file
1223
fftw-3.3.10/dft/scalar/codelets/n1_25.c
Normal file
File diff suppressed because it is too large
Load Diff
124
fftw-3.3.10/dft/scalar/codelets/n1_3.c
Normal file
124
fftw-3.3.10/dft/scalar/codelets/n1_3.c
Normal file
@@ -0,0 +1,124 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 6 FP multiplications,
|
||||
* (or, 6 additions, 0 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-3 transform (variant built when FMA is preferred).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(12, is),
                    MAKE_VOLATILE_STRIDE(12, os)) {
               /* Loads. */
               E re0 = ri[0];
               E im0 = ii[0];
               E re1 = ri[WS(is, 1)];
               E re2 = ri[WS(is, 2)];
               E im1 = ii[WS(is, 1)];
               E im2 = ii[WS(is, 2)];

               /* Sum and difference of samples 1 and 2. */
               E re_sum = re1 + re2;
               E re_dif = re2 - re1;
               E im_dif = im1 - im2;
               E im_sum = im1 + im2;

               /* Shared term of outputs 1 and 2. */
               E re_mid = FNMS(KP500000000, re_sum, re0);
               E im_mid = FNMS(KP500000000, im_sum, im0);

               /* Stores (same order as the generated original). */
               ro[0] = re0 + re_sum;
               io[0] = im0 + im_sum;
               ro[WS(os, 2)] = FNMS(KP866025403, im_dif, re_mid);
               ro[WS(os, 1)] = FMA(KP866025403, im_dif, re_mid);
               io[WS(os, 1)] = FMA(KP866025403, re_dif, im_mid);
               io[WS(os, 2)] = FNMS(KP866025403, re_dif, im_mid);
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 3, "n1_3", { 6, 0, 6, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_3) (planner *p) { X(kdft_register) (p, n1_3, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 4 FP multiplications,
|
||||
* (or, 10 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-3 transform (variant built when FMA is not preferred).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(12, is),
                    MAKE_VOLATILE_STRIDE(12, os)) {
               /* Loads. */
               E re0 = ri[0];
               E im0 = ii[0];
               E re1 = ri[WS(is, 1)];
               E re2 = ri[WS(is, 2)];
               E im1 = ii[WS(is, 1)];
               E im2 = ii[WS(is, 2)];

               /* Sums, and pre-scaled differences, of samples 1 and 2. */
               E re_sum = re1 + re2;
               E re_rot = KP866025403 * (re2 - re1);
               E im_rot = KP866025403 * (im1 - im2);
               E im_sum = im1 + im2;

               /* Shared term of outputs 1 and 2. */
               E re_mid = FNMS(KP500000000, re_sum, re0);
               E im_mid = FNMS(KP500000000, im_sum, im0);

               /* Stores (same order as the generated original). */
               ro[0] = re0 + re_sum;
               io[0] = im0 + im_sum;
               ro[WS(os, 2)] = re_mid - im_rot;
               ro[WS(os, 1)] = re_mid + im_rot;
               io[WS(os, 1)] = re_rot + im_mid;
               io[WS(os, 2)] = im_mid - re_rot;
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 3, "n1_3", { 10, 2, 2, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_3) (planner *p) { X(kdft_register) (p, n1_3, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
1318
fftw-3.3.10/dft/scalar/codelets/n1_32.c
Normal file
1318
fftw-3.3.10/dft/scalar/codelets/n1_32.c
Normal file
File diff suppressed because it is too large
Load Diff
138
fftw-3.3.10/dft/scalar/codelets/n1_4.c
Normal file
138
fftw-3.3.10/dft/scalar/codelets/n1_4.c
Normal file
@@ -0,0 +1,138 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 0 FP multiplications,
|
||||
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-4 transform (FMA-preferring branch; it uses only
 * additions and subtractions).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(16, is),
                    MAKE_VOLATILE_STRIDE(16, os)) {
               /* Samples 0 and 2. */
               E re0 = ri[0];
               E re2 = ri[WS(is, 2)];
               E even_re_sum = re0 + re2;
               E even_re_dif = re0 - re2;
               E im0 = ii[0];
               E im2 = ii[WS(is, 2)];
               E even_im_dif = im0 - im2;
               E even_im_sum = im0 + im2;

               /* Samples 1 and 3. */
               E re1 = ri[WS(is, 1)];
               E re3 = ri[WS(is, 3)];
               E odd_re_sum = re1 + re3;
               E odd_re_dif = re1 - re3;
               E im1 = ii[WS(is, 1)];
               E im3 = ii[WS(is, 3)];
               E odd_im_dif = im1 - im3;
               E odd_im_sum = im1 + im3;

               /* Stores (same order as the generated original). */
               ro[WS(os, 2)] = even_re_sum - odd_re_sum;
               io[WS(os, 2)] = even_im_sum - odd_im_sum;
               ro[0] = even_re_sum + odd_re_sum;
               io[0] = even_im_sum + odd_im_sum;
               io[WS(os, 1)] = even_im_dif - odd_re_dif;
               ro[WS(os, 1)] = even_re_dif + odd_im_dif;
               io[WS(os, 3)] = odd_re_dif + even_im_dif;
               ro[WS(os, 3)] = even_re_dif - odd_im_dif;
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 4, "n1_4", { 16, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_4) (planner *p) { X(kdft_register) (p, n1_4, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 0 FP multiplications,
|
||||
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-4 transform (non-FMA branch; it uses only additions
 * and subtractions).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(16, is),
                    MAKE_VOLATILE_STRIDE(16, os)) {
               /* Samples 0 and 2. */
               E re0 = ri[0];
               E re2 = ri[WS(is, 2)];
               E even_re_sum = re0 + re2;
               E even_re_dif = re0 - re2;
               E im0 = ii[0];
               E im2 = ii[WS(is, 2)];
               E even_im_dif = im0 - im2;
               E even_im_sum = im0 + im2;

               /* Samples 1 and 3. */
               E re1 = ri[WS(is, 1)];
               E re3 = ri[WS(is, 3)];
               E odd_re_sum = re1 + re3;
               E odd_re_dif = re1 - re3;
               E im1 = ii[WS(is, 1)];
               E im3 = ii[WS(is, 3)];
               E odd_im_dif = im1 - im3;
               E odd_im_sum = im1 + im3;

               /* Stores (same order as the generated original). */
               ro[WS(os, 2)] = even_re_sum - odd_re_sum;
               io[WS(os, 2)] = even_im_sum - odd_im_sum;
               ro[0] = even_re_sum + odd_re_sum;
               io[0] = even_im_sum + odd_im_sum;
               io[WS(os, 1)] = even_im_dif - odd_re_dif;
               ro[WS(os, 1)] = even_re_dif + odd_im_dif;
               io[WS(os, 3)] = odd_re_dif + even_im_dif;
               ro[WS(os, 3)] = even_re_dif - odd_im_dif;
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 4, "n1_4", { 16, 0, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_4) (planner *p) { X(kdft_register) (p, n1_4, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
194
fftw-3.3.10/dft/scalar/codelets/n1_5.c
Normal file
194
fftw-3.3.10/dft/scalar/codelets/n1_5.c
Normal file
@@ -0,0 +1,194 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 32 FP additions, 18 FP multiplications,
|
||||
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
|
||||
* 21 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-5 transform (variant built when FMA is preferred).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(20, is),
                    MAKE_VOLATILE_STRIDE(20, os)) {
               E re0 = ri[0];
               E im0 = ii[0];

               /* Real parts: pair samples (1,4) and (2,3). */
               E re1 = ri[WS(is, 1)];
               E re4 = ri[WS(is, 4)];
               E re14_sum = re1 + re4;
               E re2 = ri[WS(is, 2)];
               E re3 = ri[WS(is, 3)];
               E re23_sum = re2 + re3;
               E re_total = re14_sum + re23_sum;
               E re23_dif = re2 - re3;
               E re_sum_dif = re14_sum - re23_sum;
               E re14_dif = re1 - re4;

               /* Imaginary parts: same pairing. */
               E im1 = ii[WS(is, 1)];
               E im4 = ii[WS(is, 4)];
               E im14_sum = im1 + im4;
               E im2 = ii[WS(is, 2)];
               E im3 = ii[WS(is, 3)];
               E im23_sum = im2 + im3;
               E im14_dif = im1 - im4;
               E im_sum_dif = im14_sum - im23_sum;
               E im23_dif = im2 - im3;
               E im_total = im14_sum + im23_sum;

               /* Terms feeding the real outputs 1..4. */
               E re_rot_a = FMA(KP618033988, im23_dif, im14_dif);
               E re_rot_b = FNMS(KP618033988, im14_dif, im23_dif);
               E re_base = FNMS(KP250000000, re_total, re0);
               E re_hi = FMA(KP559016994, re_sum_dif, re_base);
               E re_lo = FNMS(KP559016994, re_sum_dif, re_base);

               /* Terms feeding the imaginary outputs 1..4. */
               E im_rot_a = FMA(KP618033988, re23_dif, re14_dif);
               E im_rot_b = FNMS(KP618033988, re14_dif, re23_dif);
               E im_base = FNMS(KP250000000, im_total, im0);
               E im_hi = FMA(KP559016994, im_sum_dif, im_base);
               E im_lo = FNMS(KP559016994, im_sum_dif, im_base);

               /* Stores (same order as the generated original). */
               ro[0] = re0 + re_total;
               io[0] = im0 + im_total;
               ro[WS(os, 4)] = FNMS(KP951056516, re_rot_a, re_hi);
               ro[WS(os, 3)] = FMA(KP951056516, re_rot_b, re_lo);
               ro[WS(os, 1)] = FMA(KP951056516, re_rot_a, re_hi);
               ro[WS(os, 2)] = FNMS(KP951056516, re_rot_b, re_lo);
               io[WS(os, 1)] = FNMS(KP951056516, im_rot_a, im_hi);
               io[WS(os, 3)] = FNMS(KP951056516, im_rot_b, im_lo);
               io[WS(os, 4)] = FMA(KP951056516, im_rot_a, im_hi);
               io[WS(os, 2)] = FMA(KP951056516, im_rot_b, im_lo);
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 5, "n1_5", { 14, 0, 18, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_5) (planner *p) { X(kdft_register) (p, n1_5, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 32 FP additions, 12 FP multiplications,
|
||||
* (or, 26 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 21 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-5 transform (variant built when FMA is not preferred).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(20, is),
                    MAKE_VOLATILE_STRIDE(20, os)) {
               E re0 = ri[0];
               E im0 = ii[0];

               /* Real parts: pair samples (1,4) and (2,3). */
               E re1 = ri[WS(is, 1)];
               E re4 = ri[WS(is, 4)];
               E re14_sum = re1 + re4;
               E re2 = ri[WS(is, 2)];
               E re3 = ri[WS(is, 3)];
               E re23_sum = re2 + re3;
               E re_total = re14_sum + re23_sum;
               E re23_dif = re2 - re3;
               E re_scaled = KP559016994 * (re14_sum - re23_sum);
               E re14_dif = re1 - re4;

               /* Imaginary parts: same pairing. */
               E im1 = ii[WS(is, 1)];
               E im4 = ii[WS(is, 4)];
               E im14_sum = im1 + im4;
               E im2 = ii[WS(is, 2)];
               E im3 = ii[WS(is, 3)];
               E im23_sum = im2 + im3;
               E im14_dif = im1 - im4;
               E im_total = im14_sum + im23_sum;
               E im23_dif = im2 - im3;
               E im_scaled = KP559016994 * (im14_sum - im23_sum);

               /* Terms feeding the real outputs 1..4. */
               E re_rot_a = FMA(KP951056516, im14_dif, KP587785252 * im23_dif);
               E re_rot_b = FNMS(KP587785252, im14_dif, KP951056516 * im23_dif);
               E re_base = FNMS(KP250000000, re_total, re0);
               E re_hi = re_scaled + re_base;
               E re_lo = re_base - re_scaled;

               /* Terms feeding the imaginary outputs 1..4. */
               E im_rot_a = FMA(KP951056516, re14_dif, KP587785252 * re23_dif);
               E im_rot_b = FNMS(KP587785252, re14_dif, KP951056516 * re23_dif);
               E im_base = FNMS(KP250000000, im_total, im0);
               E im_hi = im_scaled + im_base;
               E im_lo = im_base - im_scaled;

               /* Stores (same order as the generated original). */
               ro[0] = re0 + re_total;
               io[0] = im0 + im_total;
               ro[WS(os, 4)] = re_hi - re_rot_a;
               ro[WS(os, 3)] = re_lo + re_rot_b;
               ro[WS(os, 1)] = re_hi + re_rot_a;
               ro[WS(os, 2)] = re_lo - re_rot_b;
               io[WS(os, 1)] = im_hi - im_rot_a;
               io[WS(os, 3)] = im_lo - im_rot_b;
               io[WS(os, 4)] = im_rot_a + im_hi;
               io[WS(os, 2)] = im_rot_b + im_lo;
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 5, "n1_5", { 26, 6, 6, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_5) (planner *p) { X(kdft_register) (p, n1_5, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
210
fftw-3.3.10/dft/scalar/codelets/n1_6.c
Normal file
210
fftw-3.3.10/dft/scalar/codelets/n1_6.c
Normal file
@@ -0,0 +1,210 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 36 FP additions, 12 FP multiplications,
|
||||
* (or, 24 additions, 0 multiplications, 12 fused multiply/add),
|
||||
* 23 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-6 transform (variant built when FMA is preferred).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * Samples are combined in the pairs (0,3), (2,5) and (4,1).  The pair
 * differences feed the odd-indexed outputs and the pair sums feed the
 * even-indexed outputs.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(24, is),
                    MAKE_VOLATILE_STRIDE(24, os)) {
               /* Pair (0,3). */
               E re0 = ri[0];
               E re3 = ri[WS(is, 3)];
               E re03_dif = re0 - re3;
               E re03_sum = re0 + re3;
               E im0 = ii[0];
               E im3 = ii[WS(is, 3)];
               E im03_dif = im0 - im3;
               E im03_sum = im0 + im3;

               /* Real parts of pairs (2,5) and (4,1). */
               E re2 = ri[WS(is, 2)];
               E re5 = ri[WS(is, 5)];
               E re25_dif = re2 - re5;
               E re25_sum = re2 + re5;
               E re4 = ri[WS(is, 4)];
               E re1 = ri[WS(is, 1)];
               E re41_dif = re4 - re1;
               E re41_sum = re4 + re1;
               E re_dif_sum = re25_dif + re41_dif;
               E re_sum_sum = re25_sum + re41_sum;

               /* Imaginary parts of pairs (2,5) and (4,1). */
               E im2 = ii[WS(is, 2)];
               E im5 = ii[WS(is, 5)];
               E im25_dif = im2 - im5;
               E im25_sum = im2 + im5;
               E im4 = ii[WS(is, 4)];
               E im1 = ii[WS(is, 1)];
               E im41_dif = im4 - im1;
               E im41_sum = im4 + im1;
               E im_dif_sum = im25_dif + im41_dif;
               E im_sum_sum = im25_sum + im41_sum;

               /* Terms for outputs 1 and 5. */
               E odd_re_base = FNMS(KP500000000, re_dif_sum, re03_dif);
               E odd_re_rot = im25_dif - im41_dif;
               E odd_im_base = FNMS(KP500000000, im_dif_sum, im03_dif);
               E odd_im_rot = re41_dif - re25_dif;

               /* Terms for outputs 2 and 4. */
               E even_re_base = FNMS(KP500000000, re_sum_sum, re03_sum);
               E even_re_rot = im25_sum - im41_sum;
               E even_im_base = FNMS(KP500000000, im_sum_sum, im03_sum);
               E even_im_rot = re41_sum - re25_sum;

               /* Stores (same order as the generated original). */
               ro[WS(os, 3)] = re03_dif + re_dif_sum;
               io[WS(os, 3)] = im03_dif + im_dif_sum;
               ro[0] = re03_sum + re_sum_sum;
               io[0] = im03_sum + im_sum_sum;
               ro[WS(os, 5)] = FNMS(KP866025403, odd_re_rot, odd_re_base);
               ro[WS(os, 1)] = FMA(KP866025403, odd_re_rot, odd_re_base);
               io[WS(os, 1)] = FMA(KP866025403, odd_im_rot, odd_im_base);
               io[WS(os, 5)] = FNMS(KP866025403, odd_im_rot, odd_im_base);
               ro[WS(os, 2)] = FNMS(KP866025403, even_re_rot, even_re_base);
               ro[WS(os, 4)] = FMA(KP866025403, even_re_rot, even_re_base);
               io[WS(os, 2)] = FNMS(KP866025403, even_im_rot, even_im_base);
               io[WS(os, 4)] = FMA(KP866025403, even_im_rot, even_im_base);
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 6, "n1_6", { 24, 0, 12, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_6) (planner *p) { X(kdft_register) (p, n1_6, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 36 FP additions, 8 FP multiplications,
|
||||
* (or, 32 additions, 4 multiplications, 4 fused multiply/add),
|
||||
* 23 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-6 transform (variant built when FMA is not preferred).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * Samples are combined in the pairs (0,3), (2,5) and (4,1).  The pair
 * differences feed the odd-indexed outputs and the pair sums feed the
 * even-indexed outputs.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(24, is),
                    MAKE_VOLATILE_STRIDE(24, os)) {
               /* Pair (0,3). */
               E re0 = ri[0];
               E re3 = ri[WS(is, 3)];
               E re03_dif = re0 - re3;
               E re03_sum = re0 + re3;
               E im0 = ii[0];
               E im3 = ii[WS(is, 3)];
               E im03_dif = im0 - im3;
               E im03_sum = im0 + im3;

               /* Real parts of pairs (2,5) and (4,1). */
               E re2 = ri[WS(is, 2)];
               E re5 = ri[WS(is, 5)];
               E re25_dif = re2 - re5;
               E re25_sum = re2 + re5;
               E re4 = ri[WS(is, 4)];
               E re1 = ri[WS(is, 1)];
               E re41_dif = re4 - re1;
               E re41_sum = re4 + re1;
               E re_dif_sum = re25_dif + re41_dif;
               E re_sum_sum = re25_sum + re41_sum;

               /* Imaginary parts of pairs (2,5) and (4,1). */
               E im2 = ii[WS(is, 2)];
               E im5 = ii[WS(is, 5)];
               E im25_dif = im2 - im5;
               E im25_sum = im2 + im5;
               E im4 = ii[WS(is, 4)];
               E im1 = ii[WS(is, 1)];
               E im41_dif = im4 - im1;
               E im41_sum = im4 + im1;
               E im_dif_sum = im25_dif + im41_dif;
               E im_sum_sum = im25_sum + im41_sum;

               /* Terms for outputs 1 and 5. */
               E odd_re_base = FNMS(KP500000000, re_dif_sum, re03_dif);
               E odd_re_rot = KP866025403 * (im25_dif - im41_dif);
               E odd_im_rot = KP866025403 * (re41_dif - re25_dif);
               E odd_im_base = FNMS(KP500000000, im_dif_sum, im03_dif);

               /* Terms for outputs 2 and 4. */
               E even_re_base = FNMS(KP500000000, re_sum_sum, re03_sum);
               E even_re_rot = KP866025403 * (im25_sum - im41_sum);
               E even_im_base = FNMS(KP500000000, im_sum_sum, im03_sum);
               E even_im_rot = KP866025403 * (re41_sum - re25_sum);

               /* Stores (same order as the generated original). */
               ro[WS(os, 3)] = re03_dif + re_dif_sum;
               io[WS(os, 3)] = im03_dif + im_dif_sum;
               ro[0] = re03_sum + re_sum_sum;
               io[0] = im03_sum + im_sum_sum;
               ro[WS(os, 5)] = odd_re_base - odd_re_rot;
               ro[WS(os, 1)] = odd_re_base + odd_re_rot;
               io[WS(os, 1)] = odd_im_rot + odd_im_base;
               io[WS(os, 5)] = odd_im_base - odd_im_rot;
               ro[WS(os, 2)] = even_re_base - even_re_rot;
               ro[WS(os, 4)] = even_re_base + even_re_rot;
               io[WS(os, 2)] = even_im_base - even_im_rot;
               io[WS(os, 4)] = even_im_rot + even_im_base;
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 6, "n1_6", { 32, 4, 4, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_6) (planner *p) { X(kdft_register) (p, n1_6, &desc);
|
||||
}
|
||||
|
||||
#endif
|
||||
3086
fftw-3.3.10/dft/scalar/codelets/n1_64.c
Normal file
3086
fftw-3.3.10/dft/scalar/codelets/n1_64.c
Normal file
File diff suppressed because it is too large
Load Diff
249
fftw-3.3.10/dft/scalar/codelets/n1_7.c
Normal file
249
fftw-3.3.10/dft/scalar/codelets/n1_7.c
Normal file
@@ -0,0 +1,249 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 60 FP additions, 42 FP multiplications,
|
||||
* (or, 18 additions, 0 multiplications, 42 fused multiply/add),
|
||||
* 41 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-7 transform (variant built when FMA is preferred).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * Samples are combined in the pairs (1,6), (2,5) and (3,4).  For each
 * output pair (1,6), (2,5), (3,4) a "p" term is built from the pair sums
 * and a "q" term from the pair differences; the two outputs of a pair
 * are then formed with FMA and FNMS from the same p and q.
 *
 * Every intermediate is kept in its own variable of type E, mirroring the
 * generated code, so no value is folded into a larger expression.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP692021471, +0.692021471630095869627814897002069140197260599);
     DK(KP801937735, +0.801937735804838252472204639014890102331838324);
     DK(KP554958132, +0.554958132087371191422194871006410481067288862);
     DK(KP356895867, +0.356895867892209443894399510021300583399127187);
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(28, is),
                    MAKE_VOLATILE_STRIDE(28, os)) {
               E re0 = ri[0];
               E im0 = ii[0];

               /* Real parts: pair sums (rs*) and differences (rd*). */
               E re1 = ri[WS(is, 1)];
               E re6 = ri[WS(is, 6)];
               E rs16 = re1 + re6;
               E rd61 = re6 - re1;
               E re3 = ri[WS(is, 3)];
               E re4 = ri[WS(is, 4)];
               E rs34 = re3 + re4;
               E rd43 = re4 - re3;
               E re2 = ri[WS(is, 2)];
               E re5 = ri[WS(is, 5)];
               E rs25 = re2 + re5;
               E rd52 = re5 - re2;

               /* First-stage combinations of the real-part pairs. */
               E re_p1a = FNMS(KP356895867, rs25, rs16);
               E re_p2a = FNMS(KP356895867, rs16, rs34);
               E im_q1a = FMA(KP554958132, rd43, rd61);
               E im_q2a = FMA(KP554958132, rd52, rd43);
               E im_q3a = FNMS(KP554958132, rd61, rd52);
               E re_p3a = FNMS(KP356895867, rs34, rs25);

               /* Imaginary parts: pair differences (id*) and sums (is*). */
               E im2 = ii[WS(is, 2)];
               E im5 = ii[WS(is, 5)];
               E id25 = im2 - im5;
               E is25 = im2 + im5;
               E im3 = ii[WS(is, 3)];
               E im4 = ii[WS(is, 4)];
               E id34 = im3 - im4;
               E is34 = im3 + im4;
               E im1 = ii[WS(is, 1)];
               E im6 = ii[WS(is, 6)];
               E id16 = im1 - im6;
               E is16 = im1 + im6;

               /* First-stage combinations of the imaginary-part pairs. */
               E re_q1a = FMA(KP554958132, id34, id16);
               E re_q2a = FMA(KP554958132, id25, id34);
               E im_p1a = FNMS(KP356895867, is25, is16);
               E im_p2a = FNMS(KP356895867, is16, is34);
               E im_p3a = FNMS(KP356895867, is34, is25);
               E re_q3a = FNMS(KP554958132, id16, id25);

               /* Outputs 1 and 6. */
               E re_q1 = FMA(KP801937735, re_q1a, id25);
               E re_p1b = FNMS(KP692021471, re_p1a, rs34);
               E re_p1 = FNMS(KP900968867, re_p1b, re0);
               E im_q1 = FMA(KP801937735, im_q1a, rd52);
               E im_p1b = FNMS(KP692021471, im_p1a, is34);
               E im_p1 = FNMS(KP900968867, im_p1b, im0);

               /* Outputs 2 and 5. */
               E re_q2 = FNMS(KP801937735, re_q2a, id16);
               E re_p2b = FNMS(KP692021471, re_p2a, rs25);
               E re_p2 = FNMS(KP900968867, re_p2b, re0);
               E im_q2 = FNMS(KP801937735, im_q2a, rd61);
               E im_p2b = FNMS(KP692021471, im_p2a, is25);
               E im_p2 = FNMS(KP900968867, im_p2b, im0);

               /* Outputs 3 and 4. */
               E re_q3 = FNMS(KP801937735, re_q3a, id34);
               E re_p3b = FNMS(KP692021471, re_p3a, rs16);
               E re_p3 = FNMS(KP900968867, re_p3b, re0);
               E im_q3 = FNMS(KP801937735, im_q3a, rd43);
               E im_p3b = FNMS(KP692021471, im_p3a, is16);
               E im_p3 = FNMS(KP900968867, im_p3b, im0);

               /* Stores (same order as the generated original). */
               ro[0] = re0 + rs16 + rs25 + rs34;
               io[0] = im0 + is16 + is25 + is34;
               ro[WS(os, 6)] = FNMS(KP974927912, re_q1, re_p1);
               ro[WS(os, 1)] = FMA(KP974927912, re_q1, re_p1);
               io[WS(os, 1)] = FMA(KP974927912, im_q1, im_p1);
               io[WS(os, 6)] = FNMS(KP974927912, im_q1, im_p1);
               ro[WS(os, 5)] = FNMS(KP974927912, re_q2, re_p2);
               ro[WS(os, 2)] = FMA(KP974927912, re_q2, re_p2);
               io[WS(os, 2)] = FMA(KP974927912, im_q2, im_p2);
               io[WS(os, 5)] = FNMS(KP974927912, im_q2, im_p2);
               ro[WS(os, 4)] = FNMS(KP974927912, re_q3, re_p3);
               ro[WS(os, 3)] = FMA(KP974927912, re_q3, re_p3);
               io[WS(os, 3)] = FMA(KP974927912, im_q3, im_p3);
               io[WS(os, 4)] = FNMS(KP974927912, im_q3, im_p3);
          }
     }
}
|
||||
|
||||
static const kdft_desc desc = { 7, "n1_7", { 18, 0, 42, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
void X(codelet_n1_7) (planner *p) { X(kdft_register) (p, n1_7, &desc);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 60 FP additions, 36 FP multiplications,
|
||||
* (or, 36 additions, 12 multiplications, 24 fused multiply/add),
|
||||
* 25 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * Hard-coded size-7 transform (variant built when FMA is not preferred).
 *
 * ri/ii: real and imaginary input arrays, element k read at WS(is, k).
 * ro/io: real and imaginary output arrays, element k written at WS(os, k).
 * v:     number of transforms to perform; after each one the input
 *        pointers advance by ivs and the output pointers by ovs.
 *
 * Samples are combined in the pairs (1,6), (2,5) and (3,4).  For each
 * output pair a "p" term is built from the pair sums and a "q" term from
 * the pair differences; the two outputs of a pair are p + q and p - q.
 *
 * All inputs are read before any output is written; keep that ordering
 * (NOTE(review): presumably callers may pass overlapping input/output
 * arrays -- confirm).
 */
static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     {
          INT i;
          for (i = v; i > 0; --i,
                    ri += ivs, ii += ivs, ro += ovs, io += ovs,
                    MAKE_VOLATILE_STRIDE(28, is),
                    MAKE_VOLATILE_STRIDE(28, os)) {
               E re0 = ri[0];
               E im0 = ii[0];

               /* Pair (1,6). */
               E re1 = ri[WS(is, 1)];
               E re6 = ri[WS(is, 6)];
               E rs16 = re1 + re6;
               E rd61 = re6 - re1;
               E im1 = ii[WS(is, 1)];
               E im6 = ii[WS(is, 6)];
               E id16 = im1 - im6;
               E is16 = im1 + im6;

               /* Pair (2,5). */
               E re2 = ri[WS(is, 2)];
               E re5 = ri[WS(is, 5)];
               E rs25 = re2 + re5;
               E rd52 = re5 - re2;
               E im2 = ii[WS(is, 2)];
               E im5 = ii[WS(is, 5)];
               E id25 = im2 - im5;
               E is25 = im2 + im5;

               /* Pair (3,4). */
               E re3 = ri[WS(is, 3)];
               E re4 = ri[WS(is, 4)];
               E rs34 = re3 + re4;
               E rd43 = re4 - re3;
               E im3 = ii[WS(is, 3)];
               E im4 = ii[WS(is, 4)];
               E id34 = im3 - im4;
               E is34 = im3 + im4;

               /* Outputs 2 and 5. */
               E re_q2 = FNMS(KP781831482, id34, KP974927912 * id16) - (KP433883739 * id25);
               E re_p2 = FMA(KP623489801, rs34, re0) + FNMA(KP900968867, rs25, KP222520933 * rs16);
               E im_q2 = FNMS(KP781831482, rd43, KP974927912 * rd61) - (KP433883739 * rd52);
               E im_p2 = FMA(KP623489801, is34, im0) + FNMA(KP900968867, is25, KP222520933 * is16);

               /* Outputs 1 and 6. */
               E re_q1 = FMA(KP781831482, id16, KP974927912 * id25) + (KP433883739 * id34);
               E re_p1 = FMA(KP623489801, rs16, re0) + FNMA(KP900968867, rs34, KP222520933 * rs25);
               E im_q1 = FMA(KP781831482, rd61, KP974927912 * rd52) + (KP433883739 * rd43);
               E im_p1 = FMA(KP623489801, is16, im0) + FNMA(KP900968867, is34, KP222520933 * is25);

               /* Outputs 3 and 4. */
               E re_q3 = FMA(KP433883739, id16, KP974927912 * id34) - (KP781831482 * id25);
               E re_p3 = FMA(KP623489801, rs25, re0) + FNMA(KP222520933, rs34, KP900968867 * rs16);
               E im_q3 = FMA(KP433883739, rd61, KP974927912 * rd43) - (KP781831482 * rd52);
               E im_p3 = FMA(KP623489801, is25, im0) + FNMA(KP222520933, is34, KP900968867 * is16);

               /* Stores (same order as the generated original). */
               ro[0] = re0 + rs16 + rs25 + rs34;
               io[0] = im0 + is16 + is25 + is34;
               ro[WS(os, 5)] = re_p2 - re_q2;
               ro[WS(os, 2)] = re_p2 + re_q2;
               io[WS(os, 2)] = im_q2 + im_p2;
               io[WS(os, 5)] = im_p2 - im_q2;
               ro[WS(os, 6)] = re_p1 - re_q1;
               ro[WS(os, 1)] = re_p1 + re_q1;
               io[WS(os, 1)] = im_q1 + im_p1;
               io[WS(os, 6)] = im_p1 - im_q1;
               ro[WS(os, 4)] = re_p3 - re_q3;
               ro[WS(os, 3)] = re_p3 + re_q3;
               io[WS(os, 3)] = im_q3 + im_p3;
               io[WS(os, 4)] = im_p3 - im_q3;
          }
     }
}
|
||||
|
||||
/* Descriptor handed to the planner for the n1_7 kernel: transform size 7, the
 * codelet name "n1_7", a 4-number tuple { 36, 12, 24, 0 } and &GENUS.
 * NOTE(review): the tuple presumably holds operation counts (additions,
 * multiplications, fused multiply/adds, other) and the trailing zeros are
 * unused stride fields -- confirm against the kdft_desc declaration. */
static const kdft_desc desc = { 7, "n1_7", { 36, 12, 24, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Public entry point for this codelet: hands the n1_7 kernel and its
 * descriptor to planner p through X(kdft_register). */
void X(codelet_n1_7)(planner *p)
{
     X(kdft_register)(p, n1_7, &desc);
}
|
||||
|
||||
#endif
|
||||
266
fftw-3.3.10/dft/scalar/codelets/n1_8.c
Normal file
266
fftw-3.3.10/dft/scalar/codelets/n1_8.c
Normal file
@@ -0,0 +1,266 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 52 FP additions, 8 FP multiplications,
|
||||
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
|
||||
* 28 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_8, FMA variant: hard-coded size-8 FFT kernel (an "n1" codelet, i.e. a
 * base case of the FFT recursion per the codelet list in Makefile.am).
 *
 *   ri, ii   real / imaginary input arrays, read at offsets WS(is, k), k = 0..7
 *   ro, io   real / imaginary output arrays, written at offsets WS(os, k), k = 0..7
 *   is, os   input / output strides passed to WS()
 *   v        number of transforms to perform (loop count)
 *   ivs, ovs pointer increments applied to the inputs / outputs after each transform
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this file;
 * presumably a * b + c and c - a * b -- confirm in the codelet headers.
 */
static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* The only constant of this kernel; numerically sqrt(1/2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; every pointer is advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
|
||||
E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
|
||||
E TG;
|
||||
/* Stage 1: load all 16 inputs and form sums/differences of the elements
 * that lie 4 apart (0/4, 2/6, 7/3, 1/5), for real and imaginary parts. */
{
|
||||
E T1, T2, Tj, Tk;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 4)];
|
||||
T3 = T1 + T2;
|
||||
Tn = T1 - T2;
|
||||
{
|
||||
E Tg, Th, T4, T5;
|
||||
Tg = ii[0];
|
||||
Th = ii[WS(is, 4)];
|
||||
Ti = Tg + Th;
|
||||
TC = Tg - Th;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 6)];
|
||||
T6 = T4 + T5;
|
||||
TB = T4 - T5;
|
||||
}
|
||||
Tj = ii[WS(is, 2)];
|
||||
Tk = ii[WS(is, 6)];
|
||||
Tl = Tj + Tk;
|
||||
To = Tj - Tk;
|
||||
{
|
||||
E Tb, Tc, Tv, Tw, Tx, Ty;
|
||||
Tb = ri[WS(is, 7)];
|
||||
Tc = ri[WS(is, 3)];
|
||||
Tv = Tb - Tc;
|
||||
Tw = ii[WS(is, 7)];
|
||||
Tx = ii[WS(is, 3)];
|
||||
Ty = Tw - Tx;
|
||||
Td = Tb + Tc;
|
||||
TN = Tw + Tx;
|
||||
Tz = Tv - Ty;
|
||||
TH = Tv + Ty;
|
||||
}
|
||||
{
|
||||
E T8, T9, Tq, Tr, Ts, Tt;
|
||||
T8 = ri[WS(is, 1)];
|
||||
T9 = ri[WS(is, 5)];
|
||||
Tq = T8 - T9;
|
||||
Tr = ii[WS(is, 1)];
|
||||
Ts = ii[WS(is, 5)];
|
||||
Tt = Tr - Ts;
|
||||
Ta = T8 + T9;
|
||||
TM = Tr + Ts;
|
||||
Tu = Tq + Tt;
|
||||
TG = Tt - Tq;
|
||||
}
|
||||
}
|
||||
/* Outputs 0 and 4: built from the pairwise sums only. */
{
|
||||
E T7, Te, TP, TQ;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
ro[WS(os, 4)] = T7 - Te;
|
||||
ro[0] = T7 + Te;
|
||||
TP = Ti + Tl;
|
||||
TQ = TM + TN;
|
||||
io[WS(os, 4)] = TP - TQ;
|
||||
io[0] = TP + TQ;
|
||||
}
|
||||
/* Outputs 2 and 6: additions/subtractions only, no constant involved. */
{
|
||||
E Tf, Tm, TL, TO;
|
||||
Tf = Td - Ta;
|
||||
Tm = Ti - Tl;
|
||||
io[WS(os, 2)] = Tf + Tm;
|
||||
io[WS(os, 6)] = Tm - Tf;
|
||||
TL = T3 - T6;
|
||||
TO = TM - TN;
|
||||
ro[WS(os, 6)] = TL - TO;
|
||||
ro[WS(os, 2)] = TL + TO;
|
||||
}
|
||||
/* Outputs 1 and 5: the odd-index combinations enter scaled by KP707106781. */
{
|
||||
E Tp, TA, TJ, TK;
|
||||
Tp = Tn + To;
|
||||
TA = Tu + Tz;
|
||||
ro[WS(os, 5)] = FNMS(KP707106781, TA, Tp);
|
||||
ro[WS(os, 1)] = FMA(KP707106781, TA, Tp);
|
||||
TJ = TC - TB;
|
||||
TK = TG + TH;
|
||||
io[WS(os, 5)] = FNMS(KP707106781, TK, TJ);
|
||||
io[WS(os, 1)] = FMA(KP707106781, TK, TJ);
|
||||
}
|
||||
/* Outputs 3 and 7: same pattern with the other sign combinations. */
{
|
||||
E TD, TE, TF, TI;
|
||||
TD = TB + TC;
|
||||
TE = Tz - Tu;
|
||||
io[WS(os, 7)] = FNMS(KP707106781, TE, TD);
|
||||
io[WS(os, 3)] = FMA(KP707106781, TE, TD);
|
||||
TF = Tn - To;
|
||||
TI = TG - TH;
|
||||
ro[WS(os, 7)] = FNMS(KP707106781, TI, TF);
|
||||
ro[WS(os, 3)] = FMA(KP707106781, TI, TF);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA variant of n1_8: transform size 8, codelet name
 * "n1_8", and the tuple { 44, 0, 8, 0 } whose first three entries match the
 * "44 additions, 0 multiplications, 8 fused multiply/add" tally in the
 * generator comment above the kernel.
 * NOTE(review): meaning of the fourth tuple entry and of the trailing zeros
 * is not visible here -- confirm against the kdft_desc declaration. */
static const kdft_desc desc = { 8, "n1_8", { 44, 0, 8, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Public entry point for this codelet (FMA build): hands the n1_8 kernel and
 * its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_8)(planner *p)
{
     X(kdft_register)(p, n1_8, &desc);
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 52 FP additions, 4 FP multiplications,
|
||||
* (or, 52 additions, 4 multiplications, 0 fused multiply/add),
|
||||
* 28 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_8, non-FMA variant: hard-coded size-8 FFT kernel (an "n1" codelet, i.e.
 * a base case of the FFT recursion per the codelet list in Makefile.am).
 *
 *   ri, ii   real / imaginary input arrays, read at offsets WS(is, k), k = 0..7
 *   ro, io   real / imaginary output arrays, written at offsets WS(os, k), k = 0..7
 *   is, os   input / output strides passed to WS()
 *   v        number of transforms to perform (loop count)
 *   ivs, ovs pointer increments applied to the inputs / outputs after each transform
 *
 * The only multiplications are the four explicit products with KP707106781.
 */
static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* The only constant of this kernel; numerically sqrt(1/2). */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; every pointer is advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(32, is), MAKE_VOLATILE_STRIDE(32, os)) {
|
||||
E T3, Tn, Ti, TC, T6, TB, Tl, To, Td, TN, Tz, TH, Ta, TM, Tu;
|
||||
E TG;
|
||||
/* Stage 1: load all 16 inputs and form sums/differences of the elements
 * that lie 4 apart (0/4, 2/6, 7/3, 1/5), for real and imaginary parts. */
{
|
||||
E T1, T2, Tj, Tk;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 4)];
|
||||
T3 = T1 + T2;
|
||||
Tn = T1 - T2;
|
||||
{
|
||||
E Tg, Th, T4, T5;
|
||||
Tg = ii[0];
|
||||
Th = ii[WS(is, 4)];
|
||||
Ti = Tg + Th;
|
||||
TC = Tg - Th;
|
||||
T4 = ri[WS(is, 2)];
|
||||
T5 = ri[WS(is, 6)];
|
||||
T6 = T4 + T5;
|
||||
TB = T4 - T5;
|
||||
}
|
||||
Tj = ii[WS(is, 2)];
|
||||
Tk = ii[WS(is, 6)];
|
||||
Tl = Tj + Tk;
|
||||
To = Tj - Tk;
|
||||
{
|
||||
E Tb, Tc, Tv, Tw, Tx, Ty;
|
||||
Tb = ri[WS(is, 7)];
|
||||
Tc = ri[WS(is, 3)];
|
||||
Tv = Tb - Tc;
|
||||
Tw = ii[WS(is, 7)];
|
||||
Tx = ii[WS(is, 3)];
|
||||
Ty = Tw - Tx;
|
||||
Td = Tb + Tc;
|
||||
TN = Tw + Tx;
|
||||
Tz = Tv - Ty;
|
||||
TH = Tv + Ty;
|
||||
}
|
||||
{
|
||||
E T8, T9, Tq, Tr, Ts, Tt;
|
||||
T8 = ri[WS(is, 1)];
|
||||
T9 = ri[WS(is, 5)];
|
||||
Tq = T8 - T9;
|
||||
Tr = ii[WS(is, 1)];
|
||||
Ts = ii[WS(is, 5)];
|
||||
Tt = Tr - Ts;
|
||||
Ta = T8 + T9;
|
||||
TM = Tr + Ts;
|
||||
Tu = Tq + Tt;
|
||||
TG = Tt - Tq;
|
||||
}
|
||||
}
|
||||
/* Outputs 0 and 4: built from the pairwise sums only. */
{
|
||||
E T7, Te, TP, TQ;
|
||||
T7 = T3 + T6;
|
||||
Te = Ta + Td;
|
||||
ro[WS(os, 4)] = T7 - Te;
|
||||
ro[0] = T7 + Te;
|
||||
TP = Ti + Tl;
|
||||
TQ = TM + TN;
|
||||
io[WS(os, 4)] = TP - TQ;
|
||||
io[0] = TP + TQ;
|
||||
}
|
||||
/* Outputs 2 and 6: additions/subtractions only, no constant involved. */
{
|
||||
E Tf, Tm, TL, TO;
|
||||
Tf = Td - Ta;
|
||||
Tm = Ti - Tl;
|
||||
io[WS(os, 2)] = Tf + Tm;
|
||||
io[WS(os, 6)] = Tm - Tf;
|
||||
TL = T3 - T6;
|
||||
TO = TM - TN;
|
||||
ro[WS(os, 6)] = TL - TO;
|
||||
ro[WS(os, 2)] = TL + TO;
|
||||
}
|
||||
/* Outputs 1 and 5: the scaled term is computed once and then added/subtracted. */
{
|
||||
E Tp, TA, TJ, TK;
|
||||
Tp = Tn + To;
|
||||
TA = KP707106781 * (Tu + Tz);
|
||||
ro[WS(os, 5)] = Tp - TA;
|
||||
ro[WS(os, 1)] = Tp + TA;
|
||||
TJ = TC - TB;
|
||||
TK = KP707106781 * (TG + TH);
|
||||
io[WS(os, 5)] = TJ - TK;
|
||||
io[WS(os, 1)] = TJ + TK;
|
||||
}
|
||||
/* Outputs 3 and 7: same pattern with the other sign combinations. */
{
|
||||
E TD, TE, TF, TI;
|
||||
TD = TB + TC;
|
||||
TE = KP707106781 * (Tz - Tu);
|
||||
io[WS(os, 7)] = TD - TE;
|
||||
io[WS(os, 3)] = TD + TE;
|
||||
TF = Tn - To;
|
||||
TI = KP707106781 * (TG - TH);
|
||||
ro[WS(os, 7)] = TF - TI;
|
||||
ro[WS(os, 3)] = TF + TI;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA variant of n1_8: transform size 8, codelet name
 * "n1_8", and the tuple { 52, 4, 0, 0 } whose first three entries match the
 * "52 additions, 4 multiplications, 0 fused multiply/add" tally in the
 * generator comment above the kernel.
 * NOTE(review): meaning of the fourth tuple entry and of the trailing zeros
 * is not visible here -- confirm against the kdft_desc declaration. */
static const kdft_desc desc = { 8, "n1_8", { 52, 4, 0, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Public entry point for this codelet (non-FMA build): hands the n1_8 kernel
 * and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_8)(planner *p)
{
     X(kdft_register)(p, n1_8, &desc);
}
|
||||
|
||||
#endif
|
||||
360
fftw-3.3.10/dft/scalar/codelets/n1_9.c
Normal file
360
fftw-3.3.10/dft/scalar/codelets/n1_9.c
Normal file
@@ -0,0 +1,360 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 80 FP additions, 56 FP multiplications,
|
||||
* (or, 24 additions, 0 multiplications, 56 fused multiply/add),
|
||||
* 41 stack variables, 10 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_9, FMA variant: hard-coded size-9 FFT kernel (an "n1" codelet, i.e. a
 * base case of the FFT recursion per the codelet list in Makefile.am).
 *
 *   ri, ii   real / imaginary input arrays, read at offsets WS(is, k), k = 0..8
 *   ro, io   real / imaginary output arrays, written at offsets WS(os, k), k = 0..8
 *   is, os   input / output strides passed to WS()
 *   v        number of transforms to perform (loop count)
 *   ivs, ovs pointer increments applied to the inputs / outputs after each transform
 *
 * Inputs are consumed in three groups of indices that lie 3 apart
 * ({0,3,6}, {1,4,7}, {2,5,8}); outputs are produced in the same three groups.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this file;
 * presumably a * b + c and c - a * b -- confirm in the codelet headers.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants used by the kernel, declared through DK(). */
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
|
||||
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
|
||||
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
|
||||
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
|
||||
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
|
||||
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; every pointer is advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
|
||||
E T5, TL, Tm, Tl, T1f, TM, Ta, T1c, TF, TW, TI, TX, Tf, T1d, Ts;
|
||||
E TZ, Tx, T10;
|
||||
/* Real parts of inputs 0, 3, 6. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 3)];
|
||||
T3 = ri[WS(is, 6)];
|
||||
T4 = T2 + T3;
|
||||
T5 = T1 + T4;
|
||||
TL = FNMS(KP500000000, T4, T1);
|
||||
Tm = T3 - T2;
|
||||
}
|
||||
/* Imaginary parts of inputs 0, 3, 6. */
{
|
||||
E Th, Ti, Tj, Tk;
|
||||
Th = ii[0];
|
||||
Ti = ii[WS(is, 3)];
|
||||
Tj = ii[WS(is, 6)];
|
||||
Tk = Ti + Tj;
|
||||
Tl = FNMS(KP500000000, Tk, Th);
|
||||
T1f = Th + Tk;
|
||||
TM = Ti - Tj;
|
||||
}
|
||||
/* Inputs 1, 4, 7 (real and imaginary). */
{
|
||||
E T6, Tz, T9, TE, TC, TH, TD, TG;
|
||||
T6 = ri[WS(is, 1)];
|
||||
Tz = ii[WS(is, 1)];
|
||||
{
|
||||
E T7, T8, TA, TB;
|
||||
T7 = ri[WS(is, 4)];
|
||||
T8 = ri[WS(is, 7)];
|
||||
T9 = T7 + T8;
|
||||
TE = T7 - T8;
|
||||
TA = ii[WS(is, 4)];
|
||||
TB = ii[WS(is, 7)];
|
||||
TC = TA + TB;
|
||||
TH = TB - TA;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
T1c = Tz + TC;
|
||||
TD = FNMS(KP500000000, TC, Tz);
|
||||
TF = FNMS(KP866025403, TE, TD);
|
||||
TW = FMA(KP866025403, TE, TD);
|
||||
TG = FNMS(KP500000000, T9, T6);
|
||||
TI = FNMS(KP866025403, TH, TG);
|
||||
TX = FMA(KP866025403, TH, TG);
|
||||
}
|
||||
/* Inputs 2, 5, 8 (real and imaginary). */
{
|
||||
E Tb, Tt, Te, Tw, Tr, Tu, To, Tv;
|
||||
Tb = ri[WS(is, 2)];
|
||||
Tt = ii[WS(is, 2)];
|
||||
{
|
||||
E Tc, Td, Tp, Tq;
|
||||
Tc = ri[WS(is, 5)];
|
||||
Td = ri[WS(is, 8)];
|
||||
Te = Tc + Td;
|
||||
Tw = Td - Tc;
|
||||
Tp = ii[WS(is, 5)];
|
||||
Tq = ii[WS(is, 8)];
|
||||
Tr = Tp - Tq;
|
||||
Tu = Tp + Tq;
|
||||
}
|
||||
Tf = Tb + Te;
|
||||
T1d = Tt + Tu;
|
||||
To = FNMS(KP500000000, Te, Tb);
|
||||
Ts = FMA(KP866025403, Tr, To);
|
||||
TZ = FNMS(KP866025403, Tr, To);
|
||||
Tv = FNMS(KP500000000, Tu, Tt);
|
||||
Tx = FMA(KP866025403, Tw, Tv);
|
||||
T10 = FNMS(KP866025403, Tw, Tv);
|
||||
}
|
||||
/* Outputs 0, 3, 6: combine the three group sums. */
{
|
||||
E T1e, Tg, T1b, T1i, T1g, T1h;
|
||||
T1e = T1c - T1d;
|
||||
Tg = Ta + Tf;
|
||||
T1b = FNMS(KP500000000, Tg, T5);
|
||||
ro[0] = T5 + Tg;
|
||||
ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b);
|
||||
ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b);
|
||||
T1i = Tf - Ta;
|
||||
T1g = T1c + T1d;
|
||||
T1h = FNMS(KP500000000, T1g, T1f);
|
||||
io[WS(os, 3)] = FMA(KP866025403, T1i, T1h);
|
||||
io[0] = T1f + T1g;
|
||||
io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h);
|
||||
}
|
||||
/* Outputs 1, 4, 7. */
{
|
||||
E Tn, TN, TK, TS, TQ, TU, TR, TT;
|
||||
Tn = FMA(KP866025403, Tm, Tl);
|
||||
TN = FMA(KP866025403, TM, TL);
|
||||
{
|
||||
E Ty, TJ, TO, TP;
|
||||
Ty = FNMS(KP176326980, Tx, Ts);
|
||||
TJ = FNMS(KP839099631, TI, TF);
|
||||
TK = FNMS(KP777861913, TJ, Ty);
|
||||
TS = FMA(KP777861913, TJ, Ty);
|
||||
TO = FMA(KP176326980, Ts, Tx);
|
||||
TP = FMA(KP839099631, TF, TI);
|
||||
TQ = FMA(KP777861913, TP, TO);
|
||||
TU = FNMS(KP777861913, TP, TO);
|
||||
}
|
||||
io[WS(os, 1)] = FNMS(KP984807753, TK, Tn);
|
||||
ro[WS(os, 1)] = FMA(KP984807753, TQ, TN);
|
||||
TR = FNMS(KP492403876, TQ, TN);
|
||||
ro[WS(os, 4)] = FMA(KP852868531, TS, TR);
|
||||
ro[WS(os, 7)] = FNMS(KP852868531, TS, TR);
|
||||
TT = FMA(KP492403876, TK, Tn);
|
||||
io[WS(os, 7)] = FNMS(KP852868531, TU, TT);
|
||||
io[WS(os, 4)] = FMA(KP852868531, TU, TT);
|
||||
}
|
||||
/* Outputs 2, 5, 8. */
{
|
||||
E TV, T17, T12, T1a, T16, T18, T13, T19;
|
||||
TV = FNMS(KP866025403, TM, TL);
|
||||
T17 = FNMS(KP866025403, Tm, Tl);
|
||||
{
|
||||
E TY, T11, T14, T15;
|
||||
TY = FMA(KP176326980, TX, TW);
|
||||
T11 = FNMS(KP363970234, T10, TZ);
|
||||
T12 = FNMS(KP954188894, T11, TY);
|
||||
T1a = FMA(KP954188894, T11, TY);
|
||||
T14 = FNMS(KP176326980, TW, TX);
|
||||
T15 = FMA(KP363970234, TZ, T10);
|
||||
T16 = FNMS(KP954188894, T15, T14);
|
||||
T18 = FMA(KP954188894, T15, T14);
|
||||
}
|
||||
ro[WS(os, 2)] = FMA(KP984807753, T12, TV);
|
||||
io[WS(os, 2)] = FNMS(KP984807753, T18, T17);
|
||||
T13 = FNMS(KP492403876, T12, TV);
|
||||
ro[WS(os, 5)] = FNMS(KP852868531, T16, T13);
|
||||
ro[WS(os, 8)] = FMA(KP852868531, T16, T13);
|
||||
T19 = FMA(KP492403876, T18, T17);
|
||||
io[WS(os, 5)] = FNMS(KP852868531, T1a, T19);
|
||||
io[WS(os, 8)] = FMA(KP852868531, T1a, T19);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the FMA variant of n1_9: transform size 9, codelet name
 * "n1_9", and the tuple { 24, 0, 56, 0 } whose first three entries match the
 * "24 additions, 0 multiplications, 56 fused multiply/add" tally in the
 * generator comment above the kernel.
 * NOTE(review): meaning of the fourth tuple entry and of the trailing zeros
 * is not visible here -- confirm against the kdft_desc declaration. */
static const kdft_desc desc = { 9, "n1_9", { 24, 0, 56, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Public entry point for this codelet (FMA build): hands the n1_9 kernel and
 * its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_9)(planner *p)
{
     X(kdft_register)(p, n1_9, &desc);
}
|
||||
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
|
||||
|
||||
/*
|
||||
* This function contains 80 FP additions, 40 FP multiplications,
|
||||
* (or, 60 additions, 20 multiplications, 20 fused multiply/add),
|
||||
* 39 stack variables, 8 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/n.h"
|
||||
|
||||
/*
 * n1_9, non-FMA variant: hard-coded size-9 FFT kernel (an "n1" codelet, i.e.
 * a base case of the FFT recursion per the codelet list in Makefile.am).
 *
 *   ri, ii   real / imaginary input arrays, read at offsets WS(is, k), k = 0..8
 *   ro, io   real / imaginary output arrays, written at offsets WS(os, k), k = 0..8
 *   is, os   input / output strides passed to WS()
 *   v        number of transforms to perform (loop count)
 *   ivs, ovs pointer increments applied to the inputs / outputs after each transform
 *
 * Inputs are consumed in three groups of indices that lie 3 apart
 * ({0,3,6}, {1,4,7}, {2,5,8}); outputs are produced in the same three groups.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this file;
 * presumably a * b + c and c - a * b -- confirm in the codelet headers.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
|
||||
{
|
||||
/* Numeric constants used by the kernel, declared through DK(). */
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
|
||||
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
|
||||
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
|
||||
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT i;
|
||||
/* One iteration per transform; every pointer is advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
|
||||
E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB;
|
||||
E T10, TG, TZ;
|
||||
/* Real parts of inputs 0, 3, 6. */
{
|
||||
E T1, T2, T3, T4;
|
||||
T1 = ri[0];
|
||||
T2 = ri[WS(is, 3)];
|
||||
T3 = ri[WS(is, 6)];
|
||||
T4 = T2 + T3;
|
||||
T5 = T1 + T4;
|
||||
TO = KP866025403 * (T3 - T2);
|
||||
Th = FNMS(KP500000000, T4, T1);
|
||||
}
|
||||
/* Imaginary parts of inputs 0, 3, 6. */
{
|
||||
E TP, Ti, Tj, TQ;
|
||||
TP = ii[0];
|
||||
Ti = ii[WS(is, 3)];
|
||||
Tj = ii[WS(is, 6)];
|
||||
TQ = Ti + Tj;
|
||||
Tk = KP866025403 * (Ti - Tj);
|
||||
T1g = TP + TQ;
|
||||
TR = FNMS(KP500000000, TQ, TP);
|
||||
}
|
||||
/* Inputs 1, 4, 7 (real and imaginary). */
{
|
||||
E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu;
|
||||
T6 = ri[WS(is, 1)];
|
||||
Ts = ii[WS(is, 1)];
|
||||
{
|
||||
E T7, T8, Tn, To;
|
||||
T7 = ri[WS(is, 4)];
|
||||
T8 = ri[WS(is, 7)];
|
||||
T9 = T7 + T8;
|
||||
Tr = KP866025403 * (T8 - T7);
|
||||
Tn = ii[WS(is, 4)];
|
||||
To = ii[WS(is, 7)];
|
||||
Tp = KP866025403 * (Tn - To);
|
||||
Tt = Tn + To;
|
||||
}
|
||||
Ta = T6 + T9;
|
||||
T1c = Ts + Tt;
|
||||
Tm = FNMS(KP500000000, T9, T6);
|
||||
Tq = Tm + Tp;
|
||||
TW = Tm - Tp;
|
||||
Tu = FNMS(KP500000000, Tt, Ts);
|
||||
Tv = Tr + Tu;
|
||||
TX = Tu - Tr;
|
||||
}
|
||||
/* Inputs 2, 5, 8 (real and imaginary). */
{
|
||||
E Tb, TD, Te, TC, TA, TE, Tx, TF;
|
||||
Tb = ri[WS(is, 2)];
|
||||
TD = ii[WS(is, 2)];
|
||||
{
|
||||
E Tc, Td, Ty, Tz;
|
||||
Tc = ri[WS(is, 5)];
|
||||
Td = ri[WS(is, 8)];
|
||||
Te = Tc + Td;
|
||||
TC = KP866025403 * (Td - Tc);
|
||||
Ty = ii[WS(is, 5)];
|
||||
Tz = ii[WS(is, 8)];
|
||||
TA = KP866025403 * (Ty - Tz);
|
||||
TE = Ty + Tz;
|
||||
}
|
||||
Tf = Tb + Te;
|
||||
T1d = TD + TE;
|
||||
Tx = FNMS(KP500000000, Te, Tb);
|
||||
TB = Tx + TA;
|
||||
T10 = Tx - TA;
|
||||
TF = FNMS(KP500000000, TE, TD);
|
||||
TG = TC + TF;
|
||||
TZ = TF - TC;
|
||||
}
|
||||
/* Outputs 0, 3, 6: combine the three group sums. */
{
|
||||
E T1e, Tg, T1b, T1f, T1h, T1i;
|
||||
T1e = KP866025403 * (T1c - T1d);
|
||||
Tg = Ta + Tf;
|
||||
T1b = FNMS(KP500000000, Tg, T5);
|
||||
ro[0] = T5 + Tg;
|
||||
ro[WS(os, 3)] = T1b + T1e;
|
||||
ro[WS(os, 6)] = T1b - T1e;
|
||||
T1f = KP866025403 * (Tf - Ta);
|
||||
T1h = T1c + T1d;
|
||||
T1i = FNMS(KP500000000, T1h, T1g);
|
||||
io[WS(os, 3)] = T1f + T1i;
|
||||
io[0] = T1g + T1h;
|
||||
io[WS(os, 6)] = T1i - T1f;
|
||||
}
|
||||
/* Outputs 1, 4, 7. */
{
|
||||
E Tl, TS, TI, TN, TM, TT, TJ, TU;
|
||||
Tl = Th + Tk;
|
||||
TS = TO + TR;
|
||||
{
|
||||
E Tw, TH, TK, TL;
|
||||
Tw = FMA(KP766044443, Tq, KP642787609 * Tv);
|
||||
TH = FMA(KP173648177, TB, KP984807753 * TG);
|
||||
TI = Tw + TH;
|
||||
TN = KP866025403 * (TH - Tw);
|
||||
TK = FNMS(KP642787609, Tq, KP766044443 * Tv);
|
||||
TL = FNMS(KP984807753, TB, KP173648177 * TG);
|
||||
TM = KP866025403 * (TK - TL);
|
||||
TT = TK + TL;
|
||||
}
|
||||
ro[WS(os, 1)] = Tl + TI;
|
||||
io[WS(os, 1)] = TS + TT;
|
||||
TJ = FNMS(KP500000000, TI, Tl);
|
||||
ro[WS(os, 7)] = TJ - TM;
|
||||
ro[WS(os, 4)] = TJ + TM;
|
||||
TU = FNMS(KP500000000, TT, TS);
|
||||
io[WS(os, 4)] = TN + TU;
|
||||
io[WS(os, 7)] = TU - TN;
|
||||
}
|
||||
/* Outputs 2, 5, 8. */
{
|
||||
E TV, T14, T12, T13, T17, T1a, T18, T19;
|
||||
TV = Th - Tk;
|
||||
T14 = TR - TO;
|
||||
{
|
||||
E TY, T11, T15, T16;
|
||||
TY = FMA(KP173648177, TW, KP984807753 * TX);
|
||||
T11 = FNMS(KP939692620, T10, KP342020143 * TZ);
|
||||
T12 = TY + T11;
|
||||
T13 = KP866025403 * (T11 - TY);
|
||||
T15 = FNMS(KP984807753, TW, KP173648177 * TX);
|
||||
T16 = FMA(KP342020143, T10, KP939692620 * TZ);
|
||||
T17 = T15 - T16;
|
||||
T1a = KP866025403 * (T15 + T16);
|
||||
}
|
||||
ro[WS(os, 2)] = TV + T12;
|
||||
io[WS(os, 2)] = T14 + T17;
|
||||
T18 = FNMS(KP500000000, T17, T14);
|
||||
io[WS(os, 5)] = T13 + T18;
|
||||
io[WS(os, 8)] = T18 - T13;
|
||||
T19 = FNMS(KP500000000, T12, TV);
|
||||
ro[WS(os, 8)] = T19 - T1a;
|
||||
ro[WS(os, 5)] = T19 + T1a;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Descriptor for the non-FMA variant of n1_9: transform size 9, codelet name
 * "n1_9", and the tuple { 60, 20, 20, 0 } whose first three entries match the
 * "60 additions, 20 multiplications, 20 fused multiply/add" tally in the
 * generator comment above the kernel.
 * NOTE(review): meaning of the fourth tuple entry and of the trailing zeros
 * is not visible here -- confirm against the kdft_desc declaration. */
static const kdft_desc desc = { 9, "n1_9", { 60, 20, 20, 0 }, &GENUS, 0, 0, 0, 0 };
|
||||
|
||||
/* Public entry point for this codelet (non-FMA build): hands the n1_9 kernel
 * and its descriptor to planner p through X(kdft_register). */
void X(codelet_n1_9)(planner *p)
{
     X(kdft_register)(p, n1_9, &desc);
}
|
||||
|
||||
#endif
|
||||
149
fftw-3.3.10/dft/scalar/codelets/q1_2.c
Normal file
149
fftw-3.3.10/dft/scalar/codelets/q1_2.c
Normal file
@@ -0,0 +1,149 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 8 FP multiplications,
|
||||
* (or, 8 additions, 4 multiplications, 4 fused multiply/add),
|
||||
* 17 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_2, FMA variant: in-place kernel operating on a 2 x 2 block of complex
 * values addressed by the two strides rs and vs (generated by gen_twidsq
 * with -dif, per the generator comment above).
 *
 *   rio, iio  real / imaginary arrays, both read and overwritten
 *   W         twiddle table; W[0] and W[1] are used, 2 entries per iteration
 *   rs, vs    strides passed to WS() to address the 2 x 2 block
 *   mb, me    half-open iteration range [mb, me)
 *   ms        pointer increment applied to rio / iio after each iteration
 *
 * For each row j of the block (offset WS(vs, j)) the sum of its two elements
 * is stored at offset WS(rs, j), and its difference, combined with W[0] and
 * W[1], is stored at offset WS(vs, 1) + WS(rs, j) -- i.e. results land in
 * transposed positions relative to where the inputs were read.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this file;
 * presumably a * b + c and c - a * b -- confirm in the codelet headers.
 */
static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced to the entries for iteration mb (2 per iteration). */
for (m = mb, W = W + (mb * 2); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
|
||||
E T1, T2, T4, T7, T8, T9, Tb, Tc, Te, Th, Ti, Tj;
|
||||
/* Read everything before any store: the block is updated in place. */
T1 = rio[0];
|
||||
T2 = rio[WS(rs, 1)];
|
||||
T4 = T1 - T2;
|
||||
T7 = iio[0];
|
||||
T8 = iio[WS(rs, 1)];
|
||||
T9 = T7 - T8;
|
||||
Tb = rio[WS(vs, 1)];
|
||||
Tc = rio[WS(vs, 1) + WS(rs, 1)];
|
||||
Te = Tb - Tc;
|
||||
Th = iio[WS(vs, 1)];
|
||||
Ti = iio[WS(vs, 1) + WS(rs, 1)];
|
||||
Tj = Th - Ti;
|
||||
/* Sums need no twiddle. */
rio[0] = T1 + T2;
|
||||
iio[0] = T7 + T8;
|
||||
rio[WS(rs, 1)] = Tb + Tc;
|
||||
iio[WS(rs, 1)] = Th + Ti;
|
||||
/* Difference of the second row, combined with W[0] / W[1]. */
{
|
||||
E Tf, Tk, Td, Tg;
|
||||
Td = W[0];
|
||||
Tf = Td * Te;
|
||||
Tk = Td * Tj;
|
||||
Tg = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tg, Tj, Tf);
|
||||
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tg, Te, Tk);
|
||||
}
|
||||
/* Difference of the first row, combined with the same W[0] / W[1]. */
{
|
||||
E T5, Ta, T3, T6;
|
||||
T3 = W[0];
|
||||
T5 = T3 * T4;
|
||||
Ta = T3 * T9;
|
||||
T6 = W[1];
|
||||
rio[WS(vs, 1)] = FMA(T6, T9, T5);
|
||||
iio[WS(vs, 1)] = FNMS(T6, T4, Ta);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by the q1_2 descriptor below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file; the
 * { TW_FULL, 0, 2 } entry presumably requests the full twiddle set for
 * radix 2 and { TW_NEXT, 1, 0 } terminates the list -- confirm in the
 * twiddle headers. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA variant of q1_2: radix 2, codelet name "q1_2", its
 * twiddle instruction table, &GENUS, and the tuple { 8, 4, 4, 0 } whose first
 * three entries match the "8 additions, 4 multiplications, 4 fused
 * multiply/add" tally in the generator comment above the kernel.
 * NOTE(review): meaning of the fourth tuple entry and of the trailing zeros
 * is not visible here -- confirm against the ct_desc declaration. */
static const ct_desc desc = { 2, "q1_2", twinstr, &GENUS, { 8, 4, 4, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Public entry point for this codelet (FMA build): hands the q1_2 kernel and
 * its descriptor to planner p through X(kdft_difsq_register). */
void X(codelet_q1_2)(planner *p)
{
     X(kdft_difsq_register)(p, q1_2, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 12 FP additions, 8 FP multiplications,
|
||||
* (or, 8 additions, 4 multiplications, 4 fused multiply/add),
|
||||
* 17 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_2, non-FMA variant: in-place kernel operating on a 2 x 2 block of
 * complex values addressed by the two strides rs and vs (generated by
 * gen_twidsq with -dif, per the generator comment above).
 *
 *   rio, iio  real / imaginary arrays, both read and overwritten
 *   W         twiddle table; W[0] and W[1] are used, 2 entries per iteration
 *   rs, vs    strides passed to WS() to address the 2 x 2 block
 *   mb, me    half-open iteration range [mb, me)
 *   ms        pointer increment applied to rio / iio after each iteration
 *
 * For each row j of the block (offset WS(vs, j)) the sum of its two elements
 * is stored at offset WS(rs, j), and its difference, combined with W[0] and
 * W[1], is stored at offset WS(vs, 1) + WS(rs, j) -- i.e. results land in
 * transposed positions relative to where the inputs were read.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this file;
 * presumably a * b + c and c - a * b -- confirm in the codelet headers.
 */
static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
|
||||
{
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced to the entries for iteration mb (2 per iteration). */
for (m = mb, W = W + (mb * 2); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
|
||||
E T1, T2, T4, T6, T7, T8, T9, Ta, Tc, Te, Tf, Tg;
|
||||
/* Read everything before any store: the block is updated in place. */
T1 = rio[0];
|
||||
T2 = rio[WS(rs, 1)];
|
||||
T4 = T1 - T2;
|
||||
T6 = iio[0];
|
||||
T7 = iio[WS(rs, 1)];
|
||||
T8 = T6 - T7;
|
||||
T9 = rio[WS(vs, 1)];
|
||||
Ta = rio[WS(vs, 1) + WS(rs, 1)];
|
||||
Tc = T9 - Ta;
|
||||
Te = iio[WS(vs, 1)];
|
||||
Tf = iio[WS(vs, 1) + WS(rs, 1)];
|
||||
Tg = Te - Tf;
|
||||
/* Sums need no twiddle. */
rio[0] = T1 + T2;
|
||||
iio[0] = T6 + T7;
|
||||
rio[WS(rs, 1)] = T9 + Ta;
|
||||
iio[WS(rs, 1)] = Te + Tf;
|
||||
/* Differences of both rows, each combined with W[0] / W[1]; the twiddle
 * values are loaded twice (Tb/Td and T3/T5) from the same two slots. */
{
|
||||
E Tb, Td, T3, T5;
|
||||
Tb = W[0];
|
||||
Td = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tb, Tc, Td * Tg);
|
||||
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Td, Tc, Tb * Tg);
|
||||
T3 = W[0];
|
||||
T5 = W[1];
|
||||
rio[WS(vs, 1)] = FMA(T3, T4, T5 * T8);
|
||||
iio[WS(vs, 1)] = FNMS(T5, T4, T3 * T8);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by the q1_2 descriptor below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file; the
 * { TW_FULL, 0, 2 } entry presumably requests the full twiddle set for
 * radix 2 and { TW_NEXT, 1, 0 } terminates the list -- confirm in the
 * twiddle headers. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA variant of q1_2: radix 2, codelet name "q1_2",
 * its twiddle instruction table, &GENUS, and the tuple { 8, 4, 4, 0 } whose
 * first three entries match the "8 additions, 4 multiplications, 4 fused
 * multiply/add" tally in the generator comment above the kernel.
 * NOTE(review): meaning of the fourth tuple entry and of the trailing zeros
 * is not visible here -- confirm against the ct_desc declaration. */
static const ct_desc desc = { 2, "q1_2", twinstr, &GENUS, { 8, 4, 4, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Public entry point for this codelet (non-FMA build): hands the q1_2 kernel
 * and its descriptor to planner p through X(kdft_difsq_register). */
void X(codelet_q1_2)(planner *p)
{
     X(kdft_difsq_register)(p, q1_2, &desc);
}
|
||||
#endif
|
||||
316
fftw-3.3.10/dft/scalar/codelets/q1_3.c
Normal file
316
fftw-3.3.10/dft/scalar/codelets/q1_3.c
Normal file
@@ -0,0 +1,316 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 48 FP additions, 42 FP multiplications,
|
||||
* (or, 18 additions, 12 multiplications, 30 fused multiply/add),
|
||||
* 35 stack variables, 2 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_3, FMA variant: in-place kernel operating on a 3 x 3 block of complex
 * values addressed by the two strides rs and vs (generated by gen_twidsq
 * with -dif, per the generator comment above).
 *
 *   rio, iio  real / imaginary arrays, both read and overwritten
 *   W         twiddle table; W[0]..W[3] are used, 4 entries per iteration
 *   rs, vs    strides passed to WS() to address the 3 x 3 block
 *   mb, me    half-open iteration range [mb, me)
 *   ms        pointer increment applied to rio / iio after each iteration
 *
 * For each row j of the block (offset WS(vs, j)) the plain sum of its three
 * elements is stored at offset WS(rs, j); the two remaining results of that
 * row are combined with (W[0], W[1]) and (W[2], W[3]) and stored at offsets
 * WS(vs, 1) + WS(rs, j) and WS(vs, 2) + WS(rs, j) respectively -- i.e.
 * results land in transposed positions relative to where inputs were read.
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are defined outside this file;
 * presumably a * b + c and c - a * b -- confirm in the codelet headers.
 */
static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants used by the kernel (numerically sqrt(3)/2 and 1/2). */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first advanced to the entries for iteration mb (4 per iteration). */
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
|
||||
E T1, T4, T6, Tg, Td, Te, T9, Tf, Tp, Ts, Tu, TE, TB, TC, Tx;
|
||||
E TD, TZ, T10, TV, T11, TN, TQ, TS, T12;
|
||||
/* Load phase: read all 9 complex inputs and form, per row, the sum and
 * difference of elements 1 and 2 plus (element 0 - half the sum). All
 * reads happen before any store because the block is updated in place. */
{
|
||||
E T2, T3, Tv, Tw;
|
||||
T1 = rio[0];
|
||||
T2 = rio[WS(rs, 1)];
|
||||
T3 = rio[WS(rs, 2)];
|
||||
T4 = T2 + T3;
|
||||
T6 = FNMS(KP500000000, T4, T1);
|
||||
Tg = T3 - T2;
|
||||
{
|
||||
E T7, T8, Tq, Tr;
|
||||
Td = iio[0];
|
||||
T7 = iio[WS(rs, 1)];
|
||||
T8 = iio[WS(rs, 2)];
|
||||
Te = T7 + T8;
|
||||
T9 = T7 - T8;
|
||||
Tf = FNMS(KP500000000, Te, Td);
|
||||
Tp = rio[WS(vs, 1)];
|
||||
Tq = rio[WS(vs, 1) + WS(rs, 1)];
|
||||
Tr = rio[WS(vs, 1) + WS(rs, 2)];
|
||||
Ts = Tq + Tr;
|
||||
Tu = FNMS(KP500000000, Ts, Tp);
|
||||
TE = Tr - Tq;
|
||||
}
|
||||
TB = iio[WS(vs, 1)];
|
||||
Tv = iio[WS(vs, 1) + WS(rs, 1)];
|
||||
Tw = iio[WS(vs, 1) + WS(rs, 2)];
|
||||
TC = Tv + Tw;
|
||||
Tx = Tv - Tw;
|
||||
TD = FNMS(KP500000000, TC, TB);
|
||||
{
|
||||
E TT, TU, TO, TP;
|
||||
TZ = iio[WS(vs, 2)];
|
||||
TT = iio[WS(vs, 2) + WS(rs, 1)];
|
||||
TU = iio[WS(vs, 2) + WS(rs, 2)];
|
||||
T10 = TT + TU;
|
||||
TV = TT - TU;
|
||||
T11 = FNMS(KP500000000, T10, TZ);
|
||||
TN = rio[WS(vs, 2)];
|
||||
TO = rio[WS(vs, 2) + WS(rs, 1)];
|
||||
TP = rio[WS(vs, 2) + WS(rs, 2)];
|
||||
TQ = TO + TP;
|
||||
TS = FNMS(KP500000000, TQ, TN);
|
||||
T12 = TP - TO;
|
||||
}
|
||||
}
|
||||
/* Row sums need no twiddle; row j's sum goes to offset WS(rs, j). */
rio[0] = T1 + T4;
|
||||
iio[0] = Td + Te;
|
||||
rio[WS(rs, 1)] = Tp + Ts;
|
||||
iio[WS(rs, 1)] = TB + TC;
|
||||
iio[WS(rs, 2)] = TZ + T10;
|
||||
rio[WS(rs, 2)] = TN + TQ;
|
||||
/* Row 0, "+" combination, twiddled by (W[0], W[1]). */
{
|
||||
E Ta, Th, Tb, Ti, T5, Tc;
|
||||
Ta = FMA(KP866025403, T9, T6);
|
||||
Th = FMA(KP866025403, Tg, Tf);
|
||||
T5 = W[0];
|
||||
Tb = T5 * Ta;
|
||||
Ti = T5 * Th;
|
||||
Tc = W[1];
|
||||
rio[WS(vs, 1)] = FMA(Tc, Th, Tb);
|
||||
iio[WS(vs, 1)] = FNMS(Tc, Ta, Ti);
|
||||
}
|
||||
/* Row 2, "-" combination, twiddled by (W[2], W[3]). */
{
|
||||
E T16, T19, T17, T1a, T15, T18;
|
||||
T16 = FNMS(KP866025403, TV, TS);
|
||||
T19 = FNMS(KP866025403, T12, T11);
|
||||
T15 = W[2];
|
||||
T17 = T15 * T16;
|
||||
T1a = T15 * T19;
|
||||
T18 = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T18, T19, T17);
|
||||
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T18, T16, T1a);
|
||||
}
|
||||
/* Row 1, "-" combination, twiddled by (W[2], W[3]). */
{
|
||||
E TI, TL, TJ, TM, TH, TK;
|
||||
TI = FNMS(KP866025403, Tx, Tu);
|
||||
TL = FNMS(KP866025403, TE, TD);
|
||||
TH = W[2];
|
||||
TJ = TH * TI;
|
||||
TM = TH * TL;
|
||||
TK = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 1)] = FMA(TK, TL, TJ);
|
||||
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TK, TI, TM);
|
||||
}
|
||||
/* Row 1, "+" combination, twiddled by (W[0], W[1]). */
{
|
||||
E Ty, TF, Tz, TG, Tt, TA;
|
||||
Ty = FMA(KP866025403, Tx, Tu);
|
||||
TF = FMA(KP866025403, TE, TD);
|
||||
Tt = W[0];
|
||||
Tz = Tt * Ty;
|
||||
TG = Tt * TF;
|
||||
TA = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TA, TF, Tz);
|
||||
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TA, Ty, TG);
|
||||
}
|
||||
/* Row 2, "+" combination, twiddled by (W[0], W[1]). */
{
|
||||
E TW, T13, TX, T14, TR, TY;
|
||||
TW = FMA(KP866025403, TV, TS);
|
||||
T13 = FMA(KP866025403, T12, T11);
|
||||
TR = W[0];
|
||||
TX = TR * TW;
|
||||
T14 = TR * T13;
|
||||
TY = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 2)] = FMA(TY, T13, TX);
|
||||
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TY, TW, T14);
|
||||
}
|
||||
/* Row 0, "-" combination, twiddled by (W[2], W[3]). */
{
|
||||
E Tk, Tn, Tl, To, Tj, Tm;
|
||||
Tk = FNMS(KP866025403, T9, T6);
|
||||
Tn = FNMS(KP866025403, Tg, Tf);
|
||||
Tj = W[2];
|
||||
Tl = Tj * Tk;
|
||||
To = Tj * Tn;
|
||||
Tm = W[3];
|
||||
rio[WS(vs, 2)] = FMA(Tm, Tn, Tl);
|
||||
iio[WS(vs, 2)] = FNMS(Tm, Tk, To);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle instruction table referenced by the q1_3 descriptor below.
 * NOTE(review): TW_FULL and TW_NEXT are defined outside this file; the
 * { TW_FULL, 0, 3 } entry presumably requests the full twiddle set for
 * radix 3 and { TW_NEXT, 1, 0 } terminates the list -- confirm in the
 * twiddle headers. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA variant of q1_3: radix 3, codelet name "q1_3", its
 * twiddle instruction table, &GENUS, and the tuple { 18, 12, 30, 0 } whose
 * first three entries match the "18 additions, 12 multiplications, 30 fused
 * multiply/add" tally in the generator comment above the kernel.
 * NOTE(review): meaning of the fourth tuple entry and of the trailing zeros
 * is not visible here -- confirm against the ct_desc declaration. */
static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, { 18, 12, 30, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Public entry point for this codelet (FMA build): hands the q1_3 kernel and
 * its descriptor to planner p through X(kdft_difsq_register). */
void X(codelet_q1_3)(planner *p)
{
     X(kdft_difsq_register)(p, q1_3, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 48 FP additions, 36 FP multiplications,
|
||||
* (or, 30 additions, 18 multiplications, 18 fused multiply/add),
|
||||
* 35 stack variables, 2 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_3 (non-FMA variant): in-place 3x3 kernel over the split arrays rio/iio.
 *
 * For each m in [mb, me) the code
 *   1. loads the nine complex values at offsets WS(vs, i) + WS(rs, j),
 *      i, j in 0..2, and forms the size-3 butterfly terms of every
 *      vs-line i;
 *   2. stores output 0 of line i at offset WS(rs, i);
 *   3. stores output k (k = 1, 2) of line i at WS(vs, k) + WS(rs, i) after
 *      combining it with the pair (W[2k-2], W[2k-1]).
 * So the result is written transposed with respect to the input layout.
 * All loads happen before the first store; keep that order, the arrays are
 * overwritten in place.
 *
 * rio, iio : real / imaginary data, read and overwritten; advanced by ms
 *            per iteration.
 * W        : twiddle values, 4 per iteration (pre-advanced by mb * 4).
 * rs, vs   : strides, used only through WS().
 * mb, me   : half-open iteration range.
 *
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b — confirm against the project headers.
 */
static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;
          for (m = mb, W += mb * 4; m < me;
               ++m, rio += ms, iio += ms, W += 4,
               MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
               E T1, T4, T6, Tc, Td, Te, T9, Tf, Tl, To, Tq, Tw, Tx, Ty, Tt;
               E Tz, TR, TS, TN, TT, TF, TI, TK, TQ;

               /* Phase 1: load everything and build the butterfly terms. */
               {
                    E T2, T3, Tr, Ts;
                    T1 = rio[0];
                    T2 = rio[WS(rs, 1)];
                    T3 = rio[WS(rs, 2)];
                    T4 = T2 + T3;
                    T6 = FNMS(KP500000000, T4, T1);
                    Tc = KP866025403 * (T3 - T2);
                    {
                         E T7, T8, Tm, Tn;
                         Td = iio[0];
                         T7 = iio[WS(rs, 1)];
                         T8 = iio[WS(rs, 2)];
                         Te = T7 + T8;
                         T9 = KP866025403 * (T7 - T8);
                         Tf = FNMS(KP500000000, Te, Td);
                         Tl = rio[WS(vs, 1)];
                         Tm = rio[WS(vs, 1) + WS(rs, 1)];
                         Tn = rio[WS(vs, 1) + WS(rs, 2)];
                         To = Tm + Tn;
                         Tq = FNMS(KP500000000, To, Tl);
                         Tw = KP866025403 * (Tn - Tm);
                    }
                    Tx = iio[WS(vs, 1)];
                    Tr = iio[WS(vs, 1) + WS(rs, 1)];
                    Ts = iio[WS(vs, 1) + WS(rs, 2)];
                    Ty = Tr + Ts;
                    Tt = KP866025403 * (Tr - Ts);
                    Tz = FNMS(KP500000000, Ty, Tx);
                    {
                         E TL, TM, TG, TH;
                         TR = iio[WS(vs, 2)];
                         TL = iio[WS(vs, 2) + WS(rs, 1)];
                         TM = iio[WS(vs, 2) + WS(rs, 2)];
                         TS = TL + TM;
                         TN = KP866025403 * (TL - TM);
                         TT = FNMS(KP500000000, TS, TR);
                         TF = rio[WS(vs, 2)];
                         TG = rio[WS(vs, 2) + WS(rs, 1)];
                         TH = rio[WS(vs, 2) + WS(rs, 2)];
                         TI = TG + TH;
                         TK = FNMS(KP500000000, TI, TF);
                         TQ = KP866025403 * (TH - TG);
                    }
               }

               /* Phase 2: output 0 of each line (plain sums, no twiddle). */
               rio[0] = T1 + T4;
               iio[0] = Td + Te;
               rio[WS(rs, 1)] = Tl + To;
               iio[WS(rs, 1)] = Tx + Ty;
               iio[WS(rs, 2)] = TR + TS;
               rio[WS(rs, 2)] = TF + TI;

               /* Phase 3: outputs 1 and 2 of each line, combined with W. */
               {
                    E Ta, Tg, T5, Tb;
                    Ta = T6 + T9;
                    Tg = Tc + Tf;
                    T5 = W[0];
                    Tb = W[1];
                    rio[WS(vs, 1)] = FMA(T5, Ta, Tb * Tg);
                    iio[WS(vs, 1)] = FNMS(Tb, Ta, T5 * Tg);
               }
               {
                    E TW, TY, TV, TX;
                    TW = TK - TN;
                    TY = TT - TQ;
                    TV = W[2];
                    TX = W[3];
                    rio[WS(vs, 2) + WS(rs, 2)] = FMA(TV, TW, TX * TY);
                    iio[WS(vs, 2) + WS(rs, 2)] = FNMS(TX, TW, TV * TY);
               }
               {
                    E TC, TE, TB, TD;
                    TC = Tq - Tt;
                    TE = Tz - Tw;
                    TB = W[2];
                    TD = W[3];
                    rio[WS(vs, 2) + WS(rs, 1)] = FMA(TB, TC, TD * TE);
                    iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TD, TC, TB * TE);
               }
               {
                    E Tu, TA, Tp, Tv;
                    Tu = Tq + Tt;
                    TA = Tw + Tz;
                    Tp = W[0];
                    Tv = W[1];
                    rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tp, Tu, Tv * TA);
                    iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tv, Tu, Tp * TA);
               }
               {
                    E TO, TU, TJ, TP;
                    TO = TK + TN;
                    TU = TQ + TT;
                    TJ = W[0];
                    TP = W[1];
                    rio[WS(vs, 1) + WS(rs, 2)] = FMA(TJ, TO, TP * TU);
                    iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TP, TO, TJ * TU);
               }
               {
                    E Ti, Tk, Th, Tj;
                    Ti = T6 - T9;
                    Tk = Tf - Tc;
                    Th = W[2];
                    Tj = W[3];
                    rio[WS(vs, 2)] = FMA(Th, Ti, Tj * Tk);
                    iio[WS(vs, 2)] = FNMS(Tj, Ti, Th * Tk);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to the planner together with the q1_3 kernel: size 3,
 * name "q1_3", the twiddle table, the genus, and an operation-count tuple.
 * The tuple { 30, 18, 18, 0 } repeats the generator summary for this variant
 * (30 additions, 18 multiplications, 18 fused multiply/add).
 * NOTE(review): the trailing zeros presumably leave stride fields
 * unconstrained — confirm against the ct_desc declaration. */
static const ct_desc desc = {
     3, "q1_3", twinstr, &GENUS,
     { 30, 18, 18, 0 },
     0, 0, 0
};
|
||||
|
||||
/* Planner entry point for this codelet: hands the q1_3 kernel and its
 * descriptor to X(kdft_difsq_register) for planner `p`. */
void X(codelet_q1_3)(planner *p)
{
     X(kdft_difsq_register)(p, q1_3, &desc);
}
|
||||
#endif
|
||||
524
fftw-3.3.10/dft/scalar/codelets/q1_4.c
Normal file
524
fftw-3.3.10/dft/scalar/codelets/q1_4.c
Normal file
@@ -0,0 +1,524 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 88 FP additions, 48 FP multiplications,
|
||||
* (or, 64 additions, 24 multiplications, 24 fused multiply/add),
|
||||
* 51 stack variables, 0 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_4 (FMA variant): in-place 4x4 kernel over the split arrays rio/iio.
 *
 * For each m in [mb, me) the code
 *   1. loads the sixteen complex values at offsets WS(vs, i) + WS(rs, j),
 *      i, j in 0..3, and forms the size-4 butterfly sums/differences of
 *      every vs-line i;
 *   2. stores output 0 of line i at offset WS(rs, i);
 *   3. stores output k (k = 1..3) of line i at WS(vs, k) + WS(rs, i) after
 *      combining it with the pair (W[2k-2], W[2k-1]).
 * So the result is written transposed with respect to the input layout.
 * All loads happen before the first store; keep that order, the arrays are
 * overwritten in place.
 *
 * rio, iio : real / imaginary data, read and overwritten; advanced by ms
 *            per iteration.
 * W        : twiddle values, 6 per iteration (pre-advanced by mb * 6).
 * rs, vs   : strides, used only through WS().
 * mb, me   : half-open iteration range.
 *
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b — confirm against the project headers.
 */
static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     {
          INT m;
          for (m = mb, W += mb * 6; m < me;
               ++m, rio += ms, iio += ms, W += 6,
               MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
               E T3, Tv, Tw, T6, Tc, Tf, Tx, Ts, Tm, Ti, T1H, T29, T2a, T1K, T1Q;
               E T1T, T2b, T26, T20, T1W, TB, T13, T14, TE, TK, TN, T15, T10, TU, TQ;
               E T19, T1B, T1C, T1c, T1i, T1l, T1D, T1y, T1s, T1o;

               /* Phase 1: load line vs=0. */
               {
                    E T1, T2, Tb, Tg, Th, T8;
                    {
                         E T9, Ta, T4, T5;
                         T1 = rio[0];
                         T2 = rio[WS(rs, 2)];
                         T3 = T1 + T2;
                         T9 = iio[0];
                         Ta = iio[WS(rs, 2)];
                         Tb = T9 - Ta;
                         Tv = T9 + Ta;
                         Tg = iio[WS(rs, 1)];
                         Th = iio[WS(rs, 3)];
                         Tw = Tg + Th;
                         T4 = rio[WS(rs, 1)];
                         T5 = rio[WS(rs, 3)];
                         T6 = T4 + T5;
                         T8 = T4 - T5;
                    }
                    Tc = T8 + Tb;
                    Tf = T1 - T2;
                    Tx = Tv - Tw;
                    Ts = T3 - T6;
                    Tm = Tb - T8;
                    Ti = Tg - Th;
               }
               /* Load line vs=3. */
               {
                    E T1F, T1G, T1P, T1U, T1V, T1M;
                    {
                         E T1N, T1O, T1I, T1J;
                         T1F = rio[WS(vs, 3)];
                         T1G = rio[WS(vs, 3) + WS(rs, 2)];
                         T1H = T1F + T1G;
                         T1N = iio[WS(vs, 3)];
                         T1O = iio[WS(vs, 3) + WS(rs, 2)];
                         T1P = T1N - T1O;
                         T29 = T1N + T1O;
                         T1U = iio[WS(vs, 3) + WS(rs, 1)];
                         T1V = iio[WS(vs, 3) + WS(rs, 3)];
                         T2a = T1U + T1V;
                         T1I = rio[WS(vs, 3) + WS(rs, 1)];
                         T1J = rio[WS(vs, 3) + WS(rs, 3)];
                         T1K = T1I + T1J;
                         T1M = T1I - T1J;
                    }
                    T1Q = T1M + T1P;
                    T1T = T1F - T1G;
                    T2b = T29 - T2a;
                    T26 = T1H - T1K;
                    T20 = T1P - T1M;
                    T1W = T1U - T1V;
               }
               /* Load line vs=1. */
               {
                    E Tz, TA, TJ, TO, TP, TG;
                    {
                         E TH, TI, TC, TD;
                         Tz = rio[WS(vs, 1)];
                         TA = rio[WS(vs, 1) + WS(rs, 2)];
                         TB = Tz + TA;
                         TH = iio[WS(vs, 1)];
                         TI = iio[WS(vs, 1) + WS(rs, 2)];
                         TJ = TH - TI;
                         T13 = TH + TI;
                         TO = iio[WS(vs, 1) + WS(rs, 1)];
                         TP = iio[WS(vs, 1) + WS(rs, 3)];
                         T14 = TO + TP;
                         TC = rio[WS(vs, 1) + WS(rs, 1)];
                         TD = rio[WS(vs, 1) + WS(rs, 3)];
                         TE = TC + TD;
                         TG = TC - TD;
                    }
                    TK = TG + TJ;
                    TN = Tz - TA;
                    T15 = T13 - T14;
                    T10 = TB - TE;
                    TU = TJ - TG;
                    TQ = TO - TP;
               }
               /* Load line vs=2. */
               {
                    E T17, T18, T1h, T1m, T1n, T1e;
                    {
                         E T1f, T1g, T1a, T1b;
                         T17 = rio[WS(vs, 2)];
                         T18 = rio[WS(vs, 2) + WS(rs, 2)];
                         T19 = T17 + T18;
                         T1f = iio[WS(vs, 2)];
                         T1g = iio[WS(vs, 2) + WS(rs, 2)];
                         T1h = T1f - T1g;
                         T1B = T1f + T1g;
                         T1m = iio[WS(vs, 2) + WS(rs, 1)];
                         T1n = iio[WS(vs, 2) + WS(rs, 3)];
                         T1C = T1m + T1n;
                         T1a = rio[WS(vs, 2) + WS(rs, 1)];
                         T1b = rio[WS(vs, 2) + WS(rs, 3)];
                         T1c = T1a + T1b;
                         T1e = T1a - T1b;
                    }
                    T1i = T1e + T1h;
                    T1l = T17 - T18;
                    T1D = T1B - T1C;
                    T1y = T19 - T1c;
                    T1s = T1h - T1e;
                    T1o = T1m - T1n;
               }

               /* Phase 2: output 0 of each line (plain sums, no twiddle). */
               rio[0] = T3 + T6;
               iio[0] = Tv + Tw;
               rio[WS(rs, 1)] = TB + TE;
               iio[WS(rs, 1)] = T13 + T14;
               rio[WS(rs, 2)] = T19 + T1c;
               iio[WS(rs, 2)] = T1B + T1C;
               iio[WS(rs, 3)] = T29 + T2a;
               rio[WS(rs, 3)] = T1H + T1K;

               /* Phase 3: outputs 1..3 of each line, combined with W. */
               {
                    E Tt, Ty, Tr, Tu;
                    Tr = W[2];
                    Tt = Tr * Ts;
                    Ty = Tr * Tx;
                    Tu = W[3];
                    rio[WS(vs, 2)] = FMA(Tu, Tx, Tt);
                    iio[WS(vs, 2)] = FNMS(Tu, Ts, Ty);
               }
               {
                    E T27, T2c, T25, T28;
                    T25 = W[2];
                    T27 = T25 * T26;
                    T2c = T25 * T2b;
                    T28 = W[3];
                    rio[WS(vs, 2) + WS(rs, 3)] = FMA(T28, T2b, T27);
                    iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T28, T26, T2c);
               }
               {
                    E T11, T16, TZ, T12;
                    TZ = W[2];
                    T11 = TZ * T10;
                    T16 = TZ * T15;
                    T12 = W[3];
                    rio[WS(vs, 2) + WS(rs, 1)] = FMA(T12, T15, T11);
                    iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T12, T10, T16);
               }
               {
                    E T1z, T1E, T1x, T1A;
                    T1x = W[2];
                    T1z = T1x * T1y;
                    T1E = T1x * T1D;
                    T1A = W[3];
                    rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1A, T1D, T1z);
                    iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1A, T1y, T1E);
               }
               {
                    E Tj, Te, Tk, T7, Td;
                    Tj = Tf - Ti;
                    Te = W[5];
                    Tk = Te * Tc;
                    T7 = W[4];
                    Td = T7 * Tc;
                    iio[WS(vs, 3)] = FNMS(Te, Tj, Td);
                    rio[WS(vs, 3)] = FMA(T7, Tj, Tk);
               }
               {
                    E T1p, T1k, T1q, T1d, T1j;
                    T1p = T1l - T1o;
                    T1k = W[5];
                    T1q = T1k * T1i;
                    T1d = W[4];
                    T1j = T1d * T1i;
                    iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T1k, T1p, T1j);
                    rio[WS(vs, 3) + WS(rs, 2)] = FMA(T1d, T1p, T1q);
               }
               {
                    E T23, T22, T24, T1Z, T21;
                    T23 = T1T + T1W;
                    T22 = W[1];
                    T24 = T22 * T20;
                    T1Z = W[0];
                    T21 = T1Z * T20;
                    iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T22, T23, T21);
                    rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1Z, T23, T24);
               }
               {
                    E TX, TW, TY, TT, TV;
                    TX = TN + TQ;
                    TW = W[1];
                    TY = TW * TU;
                    TT = W[0];
                    TV = TT * TU;
                    iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TW, TX, TV);
                    rio[WS(vs, 1) + WS(rs, 1)] = FMA(TT, TX, TY);
               }
               {
                    E TR, TM, TS, TF, TL;
                    TR = TN - TQ;
                    TM = W[5];
                    TS = TM * TK;
                    TF = W[4];
                    TL = TF * TK;
                    iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TM, TR, TL);
                    rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TR, TS);
               }
               {
                    E Tp, To, Tq, Tl, Tn;
                    Tp = Tf + Ti;
                    To = W[1];
                    Tq = To * Tm;
                    Tl = W[0];
                    Tn = Tl * Tm;
                    iio[WS(vs, 1)] = FNMS(To, Tp, Tn);
                    rio[WS(vs, 1)] = FMA(Tl, Tp, Tq);
               }
               {
                    E T1v, T1u, T1w, T1r, T1t;
                    T1v = T1l + T1o;
                    T1u = W[1];
                    T1w = T1u * T1s;
                    T1r = W[0];
                    T1t = T1r * T1s;
                    iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1u, T1v, T1t);
                    rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1r, T1v, T1w);
               }
               {
                    E T1X, T1S, T1Y, T1L, T1R;
                    T1X = T1T - T1W;
                    T1S = W[5];
                    T1Y = T1S * T1Q;
                    T1L = W[4];
                    T1R = T1L * T1Q;
                    iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1S, T1X, T1R);
                    rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1L, T1X, T1Y);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to the planner together with the q1_4 kernel: size 4,
 * name "q1_4", the twiddle table, the genus, and an operation-count tuple.
 * The tuple { 64, 24, 24, 0 } repeats the generator summary for this variant
 * (64 additions, 24 multiplications, 24 fused multiply/add).
 * NOTE(review): the trailing zeros presumably leave stride fields
 * unconstrained — confirm against the ct_desc declaration. */
static const ct_desc desc = {
     4, "q1_4", twinstr, &GENUS,
     { 64, 24, 24, 0 },
     0, 0, 0
};
|
||||
|
||||
/* Planner entry point for this codelet: hands the q1_4 kernel and its
 * descriptor to X(kdft_difsq_register) for planner `p`. */
void X(codelet_q1_4)(planner *p)
{
     X(kdft_difsq_register)(p, q1_4, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 88 FP additions, 48 FP multiplications,
|
||||
* (or, 64 additions, 24 multiplications, 24 fused multiply/add),
|
||||
* 37 stack variables, 0 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_4 (non-FMA variant): in-place 4x4 kernel over the split arrays rio/iio.
 *
 * For each m in [mb, me) the code
 *   1. loads the sixteen complex values at offsets WS(vs, i) + WS(rs, j),
 *      i, j in 0..3, keeping the sum and difference of the (j=0, j=2) and
 *      (j=1, j=3) pairs of every vs-line i;
 *   2. stores output 0 of line i at offset WS(rs, i);
 *   3. stores output k (k = 1..3) of line i at WS(vs, k) + WS(rs, i) after
 *      combining it with the pair (W[2k-2], W[2k-1]).
 * So the result is written transposed with respect to the input layout.
 * All loads happen before the first store; keep that order, the arrays are
 * overwritten in place.
 *
 * rio, iio : real / imaginary data, read and overwritten; advanced by ms
 *            per iteration.
 * W        : twiddle values, 6 per iteration (pre-advanced by mb * 6).
 * rs, vs   : strides, used only through WS().
 * mb, me   : half-open iteration range.
 *
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b — confirm against the project headers.
 */
static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     {
          INT m;
          for (m = mb, W += mb * 6; m < me;
               ++m, rio += ms, iio += ms, W += 6,
               MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
               E T3, Te, Tb, Tq, T6, T8, Th, Tr, Tv, TG, TD, TS, Ty, TA, TJ;
               E TT, TX, T18, T15, T1k, T10, T12, T1b, T1l, T1p, T1A, T1x, T1M, T1s, T1u;
               E T1D, T1N;

               /* Phase 1: loads, two blocks per vs-line (even j, then odd j). */
               {
                    E T1, T2, T9, Ta;
                    T1 = rio[0];
                    T2 = rio[WS(rs, 2)];
                    T3 = T1 + T2;
                    Te = T1 - T2;
                    T9 = iio[0];
                    Ta = iio[WS(rs, 2)];
                    Tb = T9 - Ta;
                    Tq = T9 + Ta;
               }
               {
                    E T4, T5, Tf, Tg;
                    T4 = rio[WS(rs, 1)];
                    T5 = rio[WS(rs, 3)];
                    T6 = T4 + T5;
                    T8 = T4 - T5;
                    Tf = iio[WS(rs, 1)];
                    Tg = iio[WS(rs, 3)];
                    Th = Tf - Tg;
                    Tr = Tf + Tg;
               }
               {
                    E Tt, Tu, TB, TC;
                    Tt = rio[WS(vs, 1)];
                    Tu = rio[WS(vs, 1) + WS(rs, 2)];
                    Tv = Tt + Tu;
                    TG = Tt - Tu;
                    TB = iio[WS(vs, 1)];
                    TC = iio[WS(vs, 1) + WS(rs, 2)];
                    TD = TB - TC;
                    TS = TB + TC;
               }
               {
                    E Tw, Tx, TH, TI;
                    Tw = rio[WS(vs, 1) + WS(rs, 1)];
                    Tx = rio[WS(vs, 1) + WS(rs, 3)];
                    Ty = Tw + Tx;
                    TA = Tw - Tx;
                    TH = iio[WS(vs, 1) + WS(rs, 1)];
                    TI = iio[WS(vs, 1) + WS(rs, 3)];
                    TJ = TH - TI;
                    TT = TH + TI;
               }
               {
                    E TV, TW, T13, T14;
                    TV = rio[WS(vs, 2)];
                    TW = rio[WS(vs, 2) + WS(rs, 2)];
                    TX = TV + TW;
                    T18 = TV - TW;
                    T13 = iio[WS(vs, 2)];
                    T14 = iio[WS(vs, 2) + WS(rs, 2)];
                    T15 = T13 - T14;
                    T1k = T13 + T14;
               }
               {
                    E TY, TZ, T19, T1a;
                    TY = rio[WS(vs, 2) + WS(rs, 1)];
                    TZ = rio[WS(vs, 2) + WS(rs, 3)];
                    T10 = TY + TZ;
                    T12 = TY - TZ;
                    T19 = iio[WS(vs, 2) + WS(rs, 1)];
                    T1a = iio[WS(vs, 2) + WS(rs, 3)];
                    T1b = T19 - T1a;
                    T1l = T19 + T1a;
               }
               {
                    E T1n, T1o, T1v, T1w;
                    T1n = rio[WS(vs, 3)];
                    T1o = rio[WS(vs, 3) + WS(rs, 2)];
                    T1p = T1n + T1o;
                    T1A = T1n - T1o;
                    T1v = iio[WS(vs, 3)];
                    T1w = iio[WS(vs, 3) + WS(rs, 2)];
                    T1x = T1v - T1w;
                    T1M = T1v + T1w;
               }
               {
                    E T1q, T1r, T1B, T1C;
                    T1q = rio[WS(vs, 3) + WS(rs, 1)];
                    T1r = rio[WS(vs, 3) + WS(rs, 3)];
                    T1s = T1q + T1r;
                    T1u = T1q - T1r;
                    T1B = iio[WS(vs, 3) + WS(rs, 1)];
                    T1C = iio[WS(vs, 3) + WS(rs, 3)];
                    T1D = T1B - T1C;
                    T1N = T1B + T1C;
               }

               /* Phase 2: output 0 of each line (plain sums, no twiddle). */
               rio[0] = T3 + T6;
               iio[0] = Tq + Tr;
               rio[WS(rs, 1)] = Tv + Ty;
               iio[WS(rs, 1)] = TS + TT;
               rio[WS(rs, 2)] = TX + T10;
               iio[WS(rs, 2)] = T1k + T1l;
               iio[WS(rs, 3)] = T1M + T1N;
               rio[WS(rs, 3)] = T1p + T1s;

               /* Phase 3: outputs 1..3 of each line, combined with W. */
               {
                    E Tc, Ti, T7, Td;
                    Tc = T8 + Tb;
                    Ti = Te - Th;
                    T7 = W[4];
                    Td = W[5];
                    iio[WS(vs, 3)] = FNMS(Td, Ti, T7 * Tc);
                    rio[WS(vs, 3)] = FMA(Td, Tc, T7 * Ti);
               }
               {
                    E T1K, T1O, T1J, T1L;
                    T1K = T1p - T1s;
                    T1O = T1M - T1N;
                    T1J = W[2];
                    T1L = W[3];
                    rio[WS(vs, 2) + WS(rs, 3)] = FMA(T1J, T1K, T1L * T1O);
                    iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T1L, T1K, T1J * T1O);
               }
               {
                    E Tk, Tm, Tj, Tl;
                    Tk = Tb - T8;
                    Tm = Te + Th;
                    Tj = W[0];
                    Tl = W[1];
                    iio[WS(vs, 1)] = FNMS(Tl, Tm, Tj * Tk);
                    rio[WS(vs, 1)] = FMA(Tl, Tk, Tj * Tm);
               }
               {
                    E To, Ts, Tn, Tp;
                    To = T3 - T6;
                    Ts = Tq - Tr;
                    Tn = W[2];
                    Tp = W[3];
                    rio[WS(vs, 2)] = FMA(Tn, To, Tp * Ts);
                    iio[WS(vs, 2)] = FNMS(Tp, To, Tn * Ts);
               }
               {
                    E T16, T1c, T11, T17;
                    T16 = T12 + T15;
                    T1c = T18 - T1b;
                    T11 = W[4];
                    T17 = W[5];
                    iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T17, T1c, T11 * T16);
                    rio[WS(vs, 3) + WS(rs, 2)] = FMA(T17, T16, T11 * T1c);
               }
               {
                    E T1G, T1I, T1F, T1H;
                    T1G = T1x - T1u;
                    T1I = T1A + T1D;
                    T1F = W[0];
                    T1H = W[1];
                    iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T1H, T1I, T1F * T1G);
                    rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1H, T1G, T1F * T1I);
               }
               {
                    E TQ, TU, TP, TR;
                    TQ = Tv - Ty;
                    TU = TS - TT;
                    TP = W[2];
                    TR = W[3];
                    rio[WS(vs, 2) + WS(rs, 1)] = FMA(TP, TQ, TR * TU);
                    iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TR, TQ, TP * TU);
               }
               {
                    E T1e, T1g, T1d, T1f;
                    T1e = T15 - T12;
                    T1g = T18 + T1b;
                    T1d = W[0];
                    T1f = W[1];
                    iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
                    rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
               }
               {
                    E T1i, T1m, T1h, T1j;
                    T1i = TX - T10;
                    T1m = T1k - T1l;
                    T1h = W[2];
                    T1j = W[3];
                    rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1h, T1i, T1j * T1m);
                    iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1j, T1i, T1h * T1m);
               }
               {
                    E T1y, T1E, T1t, T1z;
                    T1y = T1u + T1x;
                    T1E = T1A - T1D;
                    T1t = W[4];
                    T1z = W[5];
                    iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1z, T1E, T1t * T1y);
                    rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1z, T1y, T1t * T1E);
               }
               {
                    E TM, TO, TL, TN;
                    TM = TD - TA;
                    TO = TG + TJ;
                    TL = W[0];
                    TN = W[1];
                    iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TN, TO, TL * TM);
                    rio[WS(vs, 1) + WS(rs, 1)] = FMA(TN, TM, TL * TO);
               }
               {
                    E TE, TK, Tz, TF;
                    TE = TA + TD;
                    TK = TG - TJ;
                    Tz = W[4];
                    TF = W[5];
                    iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TF, TK, Tz * TE);
                    rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TE, Tz * TK);
               }
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor passed to the planner together with the q1_4 kernel: size 4,
 * name "q1_4", the twiddle table, the genus, and an operation-count tuple.
 * The tuple { 64, 24, 24, 0 } repeats the generator summary for this variant
 * (64 additions, 24 multiplications, 24 fused multiply/add).
 * NOTE(review): the trailing zeros presumably leave stride fields
 * unconstrained — confirm against the ct_desc declaration. */
static const ct_desc desc = {
     4, "q1_4", twinstr, &GENUS,
     { 64, 24, 24, 0 },
     0, 0, 0
};
|
||||
|
||||
/* Planner entry point for this codelet: hands the q1_4 kernel and its
 * descriptor to X(kdft_difsq_register) for planner `p`. */
void X(codelet_q1_4)(planner *p)
{
     X(kdft_difsq_register)(p, q1_4, &desc);
}
|
||||
#endif
|
||||
992
fftw-3.3.10/dft/scalar/codelets/q1_5.c
Normal file
992
fftw-3.3.10/dft/scalar/codelets/q1_5.c
Normal file
@@ -0,0 +1,992 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 200 FP additions, 170 FP multiplications,
|
||||
* (or, 70 additions, 40 multiplications, 130 fused multiply/add),
|
||||
* 75 stack variables, 4 constants, and 100 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_5 (FMA variant): in-place 5x5 kernel over the split arrays rio/iio.
 *
 * For each m in [mb, me) the code
 *   1. loads the 25 complex values at offsets WS(vs, i) + WS(rs, j),
 *      i, j in 0..4, and forms the size-5 butterfly terms of every
 *      vs-line i (real and imaginary parts in separate blocks);
 *   2. stores output 0 of line i at offset WS(rs, i);
 *   3. stores output k (k = 1..4) of line i at WS(vs, k) + WS(rs, i) after
 *      combining it with the pair (W[2k-2], W[2k-1]).
 * So the result is written transposed with respect to the input layout.
 * All loads happen before the first store; keep that order, the arrays are
 * overwritten in place.
 *
 * rio, iio : real / imaginary data, read and overwritten; advanced by ms
 *            per iteration.
 * W        : twiddle values, 8 per iteration (pre-advanced by mb * 8).
 * rs, vs   : strides, used only through WS().
 * mb, me   : half-open iteration range.
 *
 * NOTE(review): FMA(a, b, c) / FNMS(a, b, c) are presumably a*b + c and
 * c - a*b — confirm against the project headers.
 */
static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     {
          INT m;
          for (m = mb, W += mb * 8; m < me;
               ++m, rio += ms, iio += ms, W += 8,
               MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
               E T1, Tb, TM, Tw, T8, Ta, Tn, Tj, TH, Ts, Tq, Tr, TV, T15, T1G;
               E T1q, T12, T14, T1h, T1d, T1B, T1m, T1k, T1l, T1P, T1Z, T2A, T2k, T1W, T1Y;
               E T2b, T27, T2v, T2g, T2e, T2f, T3Z, T3V, T4j, T44, T42, T43, T3D, T3N, T4o;
               E T48, T3K, T3M, T2J, T2T, T3u, T3e, T2Q, T2S, T35, T31, T3p, T3a, T38, T39;

               /* Phase 1: line vs=0, real part. */
               {
                    E T7, Tv, T4, Tu;
                    T1 = rio[0];
                    {
                         E T5, T6, T2, T3;
                         T5 = rio[WS(rs, 2)];
                         T6 = rio[WS(rs, 3)];
                         T7 = T5 + T6;
                         Tv = T5 - T6;
                         T2 = rio[WS(rs, 1)];
                         T3 = rio[WS(rs, 4)];
                         T4 = T2 + T3;
                         Tu = T2 - T3;
                    }
                    Tb = T4 - T7;
                    TM = FNMS(KP618033988, Tu, Tv);
                    Tw = FMA(KP618033988, Tv, Tu);
                    T8 = T4 + T7;
                    Ta = FNMS(KP250000000, T8, T1);
               }
               /* Line vs=0, imaginary part. */
               {
                    E Ti, Tp, Tf, To;
                    Tn = iio[0];
                    {
                         E Tg, Th, Td, Te;
                         Tg = iio[WS(rs, 2)];
                         Th = iio[WS(rs, 3)];
                         Ti = Tg - Th;
                         Tp = Tg + Th;
                         Td = iio[WS(rs, 1)];
                         Te = iio[WS(rs, 4)];
                         Tf = Td - Te;
                         To = Td + Te;
                    }
                    Tj = FMA(KP618033988, Ti, Tf);
                    TH = FNMS(KP618033988, Tf, Ti);
                    Ts = To - Tp;
                    Tq = To + Tp;
                    Tr = FNMS(KP250000000, Tq, Tn);
               }
               /* Line vs=1, real part. */
               {
                    E T11, T1p, TY, T1o;
                    TV = rio[WS(vs, 1)];
                    {
                         E TZ, T10, TW, TX;
                         TZ = rio[WS(vs, 1) + WS(rs, 2)];
                         T10 = rio[WS(vs, 1) + WS(rs, 3)];
                         T11 = TZ + T10;
                         T1p = TZ - T10;
                         TW = rio[WS(vs, 1) + WS(rs, 1)];
                         TX = rio[WS(vs, 1) + WS(rs, 4)];
                         TY = TW + TX;
                         T1o = TW - TX;
                    }
                    T15 = TY - T11;
                    T1G = FNMS(KP618033988, T1o, T1p);
                    T1q = FMA(KP618033988, T1p, T1o);
                    T12 = TY + T11;
                    T14 = FNMS(KP250000000, T12, TV);
               }
               /* Line vs=1, imaginary part. */
               {
                    E T1c, T1j, T19, T1i;
                    T1h = iio[WS(vs, 1)];
                    {
                         E T1a, T1b, T17, T18;
                         T1a = iio[WS(vs, 1) + WS(rs, 2)];
                         T1b = iio[WS(vs, 1) + WS(rs, 3)];
                         T1c = T1a - T1b;
                         T1j = T1a + T1b;
                         T17 = iio[WS(vs, 1) + WS(rs, 1)];
                         T18 = iio[WS(vs, 1) + WS(rs, 4)];
                         T19 = T17 - T18;
                         T1i = T17 + T18;
                    }
                    T1d = FMA(KP618033988, T1c, T19);
                    T1B = FNMS(KP618033988, T19, T1c);
                    T1m = T1i - T1j;
                    T1k = T1i + T1j;
                    T1l = FNMS(KP250000000, T1k, T1h);
               }
               /* Line vs=2, real part. */
               {
                    E T1V, T2j, T1S, T2i;
                    T1P = rio[WS(vs, 2)];
                    {
                         E T1T, T1U, T1Q, T1R;
                         T1T = rio[WS(vs, 2) + WS(rs, 2)];
                         T1U = rio[WS(vs, 2) + WS(rs, 3)];
                         T1V = T1T + T1U;
                         T2j = T1T - T1U;
                         T1Q = rio[WS(vs, 2) + WS(rs, 1)];
                         T1R = rio[WS(vs, 2) + WS(rs, 4)];
                         T1S = T1Q + T1R;
                         T2i = T1Q - T1R;
                    }
                    T1Z = T1S - T1V;
                    T2A = FNMS(KP618033988, T2i, T2j);
                    T2k = FMA(KP618033988, T2j, T2i);
                    T1W = T1S + T1V;
                    T1Y = FNMS(KP250000000, T1W, T1P);
               }
               /* Line vs=2, imaginary part. */
               {
                    E T26, T2d, T23, T2c;
                    T2b = iio[WS(vs, 2)];
                    {
                         E T24, T25, T21, T22;
                         T24 = iio[WS(vs, 2) + WS(rs, 2)];
                         T25 = iio[WS(vs, 2) + WS(rs, 3)];
                         T26 = T24 - T25;
                         T2d = T24 + T25;
                         T21 = iio[WS(vs, 2) + WS(rs, 1)];
                         T22 = iio[WS(vs, 2) + WS(rs, 4)];
                         T23 = T21 - T22;
                         T2c = T21 + T22;
                    }
                    T27 = FMA(KP618033988, T26, T23);
                    T2v = FNMS(KP618033988, T23, T26);
                    T2g = T2c - T2d;
                    T2e = T2c + T2d;
                    T2f = FNMS(KP250000000, T2e, T2b);
               }
               /* Line vs=4, imaginary part. */
               {
                    E T3U, T41, T3R, T40;
                    T3Z = iio[WS(vs, 4)];
                    {
                         E T3S, T3T, T3P, T3Q;
                         T3S = iio[WS(vs, 4) + WS(rs, 2)];
                         T3T = iio[WS(vs, 4) + WS(rs, 3)];
                         T3U = T3S - T3T;
                         T41 = T3S + T3T;
                         T3P = iio[WS(vs, 4) + WS(rs, 1)];
                         T3Q = iio[WS(vs, 4) + WS(rs, 4)];
                         T3R = T3P - T3Q;
                         T40 = T3P + T3Q;
                    }
                    T3V = FMA(KP618033988, T3U, T3R);
                    T4j = FNMS(KP618033988, T3R, T3U);
                    T44 = T40 - T41;
                    T42 = T40 + T41;
                    T43 = FNMS(KP250000000, T42, T3Z);
               }
               /* Line vs=4, real part. */
               {
                    E T3J, T47, T3G, T46;
                    T3D = rio[WS(vs, 4)];
                    {
                         E T3H, T3I, T3E, T3F;
                         T3H = rio[WS(vs, 4) + WS(rs, 2)];
                         T3I = rio[WS(vs, 4) + WS(rs, 3)];
                         T3J = T3H + T3I;
                         T47 = T3H - T3I;
                         T3E = rio[WS(vs, 4) + WS(rs, 1)];
                         T3F = rio[WS(vs, 4) + WS(rs, 4)];
                         T3G = T3E + T3F;
                         T46 = T3E - T3F;
                    }
                    T3N = T3G - T3J;
                    T4o = FNMS(KP618033988, T46, T47);
                    T48 = FMA(KP618033988, T47, T46);
                    T3K = T3G + T3J;
                    T3M = FNMS(KP250000000, T3K, T3D);
               }
               /* Line vs=3, real part. */
               {
                    E T2P, T3d, T2M, T3c;
                    T2J = rio[WS(vs, 3)];
                    {
                         E T2N, T2O, T2K, T2L;
                         T2N = rio[WS(vs, 3) + WS(rs, 2)];
                         T2O = rio[WS(vs, 3) + WS(rs, 3)];
                         T2P = T2N + T2O;
                         T3d = T2N - T2O;
                         T2K = rio[WS(vs, 3) + WS(rs, 1)];
                         T2L = rio[WS(vs, 3) + WS(rs, 4)];
                         T2M = T2K + T2L;
                         T3c = T2K - T2L;
                    }
                    T2T = T2M - T2P;
                    T3u = FNMS(KP618033988, T3c, T3d);
                    T3e = FMA(KP618033988, T3d, T3c);
                    T2Q = T2M + T2P;
                    T2S = FNMS(KP250000000, T2Q, T2J);
               }
               /* Line vs=3, imaginary part. */
               {
                    E T30, T37, T2X, T36;
                    T35 = iio[WS(vs, 3)];
                    {
                         E T2Y, T2Z, T2V, T2W;
                         T2Y = iio[WS(vs, 3) + WS(rs, 2)];
                         T2Z = iio[WS(vs, 3) + WS(rs, 3)];
                         T30 = T2Y - T2Z;
                         T37 = T2Y + T2Z;
                         T2V = iio[WS(vs, 3) + WS(rs, 1)];
                         T2W = iio[WS(vs, 3) + WS(rs, 4)];
                         T2X = T2V - T2W;
                         T36 = T2V + T2W;
                    }
                    T31 = FMA(KP618033988, T30, T2X);
                    T3p = FNMS(KP618033988, T2X, T30);
                    T3a = T36 - T37;
                    T38 = T36 + T37;
                    T39 = FNMS(KP250000000, T38, T35);
               }

               /* Phase 2: output 0 of each line (plain sums, no twiddle). */
               rio[0] = T1 + T8;
               iio[0] = Tn + Tq;
               rio[WS(rs, 1)] = TV + T12;
               iio[WS(rs, 1)] = T1h + T1k;
               rio[WS(rs, 2)] = T1P + T1W;
               iio[WS(rs, 2)] = T2b + T2e;
               iio[WS(rs, 4)] = T3Z + T42;
               rio[WS(rs, 4)] = T3D + T3K;
               rio[WS(rs, 3)] = T2J + T2Q;
               iio[WS(rs, 3)] = T35 + T38;

               /* Phase 3: outputs 1..4 of each line, combined with W.
                * Each outer block finishes one line's output pair (1, 4)
                * or (2, 3). */
               {
                    E Tk, TA, Tx, TD, Tc, Tt;
                    Tc = FMA(KP559016994, Tb, Ta);
                    Tk = FMA(KP951056516, Tj, Tc);
                    TA = FNMS(KP951056516, Tj, Tc);
                    Tt = FMA(KP559016994, Ts, Tr);
                    Tx = FNMS(KP951056516, Tw, Tt);
                    TD = FMA(KP951056516, Tw, Tt);
                    {
                         E Tl, Ty, T9, Tm;
                         T9 = W[0];
                         Tl = T9 * Tk;
                         Ty = T9 * Tx;
                         Tm = W[1];
                         rio[WS(vs, 1)] = FMA(Tm, Tx, Tl);
                         iio[WS(vs, 1)] = FNMS(Tm, Tk, Ty);
                    }
                    {
                         E TB, TE, Tz, TC;
                         Tz = W[6];
                         TB = Tz * TA;
                         TE = Tz * TD;
                         TC = W[7];
                         rio[WS(vs, 4)] = FMA(TC, TD, TB);
                         iio[WS(vs, 4)] = FNMS(TC, TA, TE);
                    }
               }
               {
                    E TI, TQ, TN, TT, TG, TL;
                    TG = FNMS(KP559016994, Tb, Ta);
                    TI = FNMS(KP951056516, TH, TG);
                    TQ = FMA(KP951056516, TH, TG);
                    TL = FNMS(KP559016994, Ts, Tr);
                    TN = FMA(KP951056516, TM, TL);
                    TT = FNMS(KP951056516, TM, TL);
                    {
                         E TJ, TO, TF, TK;
                         TF = W[2];
                         TJ = TF * TI;
                         TO = TF * TN;
                         TK = W[3];
                         rio[WS(vs, 2)] = FMA(TK, TN, TJ);
                         iio[WS(vs, 2)] = FNMS(TK, TI, TO);
                    }
                    {
                         E TR, TU, TP, TS;
                         TP = W[4];
                         TR = TP * TQ;
                         TU = TP * TT;
                         TS = W[5];
                         rio[WS(vs, 3)] = FMA(TS, TT, TR);
                         iio[WS(vs, 3)] = FNMS(TS, TQ, TU);
                    }
               }
               {
                    E T2w, T2E, T2B, T2H, T2u, T2z;
                    T2u = FNMS(KP559016994, T1Z, T1Y);
                    T2w = FNMS(KP951056516, T2v, T2u);
                    T2E = FMA(KP951056516, T2v, T2u);
                    T2z = FNMS(KP559016994, T2g, T2f);
                    T2B = FMA(KP951056516, T2A, T2z);
                    T2H = FNMS(KP951056516, T2A, T2z);
                    {
                         E T2x, T2C, T2t, T2y;
                         T2t = W[2];
                         T2x = T2t * T2w;
                         T2C = T2t * T2B;
                         T2y = W[3];
                         rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2y, T2B, T2x);
                         iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2y, T2w, T2C);
                    }
                    {
                         E T2F, T2I, T2D, T2G;
                         T2D = W[4];
                         T2F = T2D * T2E;
                         T2I = T2D * T2H;
                         T2G = W[5];
                         rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2G, T2H, T2F);
                         iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2G, T2E, T2I);
                    }
               }
               {
                    E T4k, T4s, T4p, T4v, T4i, T4n;
                    T4i = FNMS(KP559016994, T3N, T3M);
                    T4k = FNMS(KP951056516, T4j, T4i);
                    T4s = FMA(KP951056516, T4j, T4i);
                    T4n = FNMS(KP559016994, T44, T43);
                    T4p = FMA(KP951056516, T4o, T4n);
                    T4v = FNMS(KP951056516, T4o, T4n);
                    {
                         E T4l, T4q, T4h, T4m;
                         T4h = W[2];
                         T4l = T4h * T4k;
                         T4q = T4h * T4p;
                         T4m = W[3];
                         rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4m, T4p, T4l);
                         iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4m, T4k, T4q);
                    }
                    {
                         E T4t, T4w, T4r, T4u;
                         T4r = W[4];
                         T4t = T4r * T4s;
                         T4w = T4r * T4v;
                         T4u = W[5];
                         rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4u, T4v, T4t);
                         iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4u, T4s, T4w);
                    }
               }
               {
                    E T28, T2o, T2l, T2r, T20, T2h;
                    T20 = FMA(KP559016994, T1Z, T1Y);
                    T28 = FMA(KP951056516, T27, T20);
                    T2o = FNMS(KP951056516, T27, T20);
                    T2h = FMA(KP559016994, T2g, T2f);
                    T2l = FNMS(KP951056516, T2k, T2h);
                    T2r = FMA(KP951056516, T2k, T2h);
                    {
                         E T29, T2m, T1X, T2a;
                         T1X = W[0];
                         T29 = T1X * T28;
                         T2m = T1X * T2l;
                         T2a = W[1];
                         rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2a, T2l, T29);
                         iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2a, T28, T2m);
                    }
                    {
                         E T2p, T2s, T2n, T2q;
                         T2n = W[6];
                         T2p = T2n * T2o;
                         T2s = T2n * T2r;
                         T2q = W[7];
                         rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2q, T2r, T2p);
                         iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2q, T2o, T2s);
                    }
               }
               {
                    E T32, T3i, T3f, T3l, T2U, T3b;
                    T2U = FMA(KP559016994, T2T, T2S);
                    T32 = FMA(KP951056516, T31, T2U);
                    T3i = FNMS(KP951056516, T31, T2U);
                    T3b = FMA(KP559016994, T3a, T39);
                    T3f = FNMS(KP951056516, T3e, T3b);
                    T3l = FMA(KP951056516, T3e, T3b);
                    {
                         E T33, T3g, T2R, T34;
                         T2R = W[0];
                         T33 = T2R * T32;
                         T3g = T2R * T3f;
                         T34 = W[1];
                         rio[WS(vs, 1) + WS(rs, 3)] = FMA(T34, T3f, T33);
                         iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T34, T32, T3g);
                    }
                    {
                         E T3j, T3m, T3h, T3k;
                         T3h = W[6];
                         T3j = T3h * T3i;
                         T3m = T3h * T3l;
                         T3k = W[7];
                         rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3k, T3l, T3j);
                         iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3k, T3i, T3m);
                    }
               }
               {
                    E T3q, T3y, T3v, T3B, T3o, T3t;
                    T3o = FNMS(KP559016994, T2T, T2S);
                    T3q = FNMS(KP951056516, T3p, T3o);
                    T3y = FMA(KP951056516, T3p, T3o);
                    T3t = FNMS(KP559016994, T3a, T39);
                    T3v = FMA(KP951056516, T3u, T3t);
                    T3B = FNMS(KP951056516, T3u, T3t);
                    {
                         E T3r, T3w, T3n, T3s;
                         T3n = W[2];
                         T3r = T3n * T3q;
                         T3w = T3n * T3v;
                         T3s = W[3];
                         rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3s, T3v, T3r);
                         iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3s, T3q, T3w);
                    }
                    {
                         E T3z, T3C, T3x, T3A;
                         T3x = W[4];
                         T3z = T3x * T3y;
                         T3C = T3x * T3B;
                         T3A = W[5];
                         rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3A, T3B, T3z);
                         iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3A, T3y, T3C);
                    }
               }
               {
                    E T3W, T4c, T49, T4f, T3O, T45;
                    T3O = FMA(KP559016994, T3N, T3M);
                    T3W = FMA(KP951056516, T3V, T3O);
                    T4c = FNMS(KP951056516, T3V, T3O);
                    T45 = FMA(KP559016994, T44, T43);
                    T49 = FNMS(KP951056516, T48, T45);
                    T4f = FMA(KP951056516, T48, T45);
                    {
                         E T3X, T4a, T3L, T3Y;
                         T3L = W[0];
                         T3X = T3L * T3W;
                         T4a = T3L * T49;
                         T3Y = W[1];
                         rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3Y, T49, T3X);
                         iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3Y, T3W, T4a);
                    }
                    {
                         E T4d, T4g, T4b, T4e;
                         T4b = W[6];
                         T4d = T4b * T4c;
                         T4g = T4b * T4f;
                         T4e = W[7];
                         rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4e, T4f, T4d);
                         iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4e, T4c, T4g);
                    }
               }
               {
                    E T1C, T1K, T1H, T1N, T1A, T1F;
                    T1A = FNMS(KP559016994, T15, T14);
                    T1C = FNMS(KP951056516, T1B, T1A);
                    T1K = FMA(KP951056516, T1B, T1A);
                    T1F = FNMS(KP559016994, T1m, T1l);
                    T1H = FMA(KP951056516, T1G, T1F);
                    T1N = FNMS(KP951056516, T1G, T1F);
                    {
                         E T1D, T1I, T1z, T1E;
                         T1z = W[2];
                         T1D = T1z * T1C;
                         T1I = T1z * T1H;
                         T1E = W[3];
                         rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1E, T1H, T1D);
                         iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1E, T1C, T1I);
                    }
                    {
                         E T1L, T1O, T1J, T1M;
                         T1J = W[4];
                         T1L = T1J * T1K;
                         T1O = T1J * T1N;
                         T1M = W[5];
                         rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
                         iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
                    }
               }
               {
                    E T1e, T1u, T1r, T1x, T16, T1n;
                    T16 = FMA(KP559016994, T15, T14);
                    T1e = FMA(KP951056516, T1d, T16);
                    T1u = FNMS(KP951056516, T1d, T16);
                    T1n = FMA(KP559016994, T1m, T1l);
                    T1r = FNMS(KP951056516, T1q, T1n);
                    T1x = FMA(KP951056516, T1q, T1n);
                    {
                         E T1f, T1s, T13, T1g;
                         T13 = W[0];
                         T1f = T13 * T1e;
                         T1s = T13 * T1r;
                         T1g = W[1];
                         rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1g, T1r, T1f);
                         iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1g, T1e, T1s);
                    }
                    {
                         E T1v, T1y, T1t, T1w;
                         T1t = W[6];
                         T1v = T1t * T1u;
                         T1y = T1t * T1x;
                         T1w = W[7];
                         rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1w, T1x, T1v);
                         iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1w, T1u, T1y);
                    }
               }
          }
     }
}
|
||||
|
||||
/* Twiddle-instruction table for q1_5; it is passed to the planner through desc. */
/* NOTE(review): the meaning of TW_FULL / TW_NEXT and of their numeric fields is defined in project headers not visible here. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor registered for q1_5: carries the name string, the twinstr table and &GENUS. */
/* NOTE(review): 5 is presumably the radix and { 70, 40, 130, 0 } presumably the add / mul / fused-multiply-add / other op counts of this variant -- confirm against the ct_desc declaration. */
static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, { 70, 40, 130, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: hands the q1_5 kernel and its descriptor to planner p via X(kdft_difsq_register). */
void X(codelet_q1_5) (planner *p) {
|
||||
X(kdft_difsq_register) (p, q1_5, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
|
||||
|
||||
/*
|
||||
* This function contains 200 FP additions, 140 FP multiplications,
|
||||
* (or, 130 additions, 70 multiplications, 70 fused multiply/add),
|
||||
* 75 stack variables, 4 constants, and 100 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/q.h"
|
||||
|
||||
/*
 * q1_5, variant in the #else branch (generator run without -fma, see the
 * generator comment above).
 *
 * For every m in [mb, me) the kernel works in place on a 5 x 5 grid of
 * complex values addressed as rio/iio[WS(vs, v) + WS(rs, r)], v, r = 0..4:
 *   - all 50 reals are first loaded into locals (stores below hit the same
 *     locations, so nothing is written before every load is done);
 *   - for each input group v, the plain sum of its five elements is stored
 *     at index WS(rs, v);
 *   - the four remaining results k = 1..4 of group v are combined with the
 *     twiddle pair (W[2k-2], W[2k-1]) and stored at WS(vs, k) + WS(rs, v),
 *     i.e. with the two indices swapped relative to the input.
 * After each iteration rio and iio advance by ms and W by 8 entries.
 *
 * NOTE(review): DK, E, R, INT, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from project headers; the comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm.
 */
static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
/* W starts at entry mb * 8 and moves 8 entries per iteration (W[0]..W[7] are read below). */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
|
||||
E T1, Ta, TG, Tv, T8, Tb, Tp, Tj, TD, To, Tq, Tr, TN, TW, T1s;
|
||||
E T1h, TU, TX, T1b, T15, T1p, T1a, T1c, T1d, T1z, T1I, T2e, T23, T1G, T1J;
|
||||
E T1X, T1R, T2b, T1W, T1Y, T1Z, T3v, T3p, T3J, T3u, T3w, T3x, T37, T3g, T3M;
|
||||
E T3B, T3e, T3h, T2l, T2u, T30, T2P, T2s, T2v, T2J, T2D, T2X, T2I, T2K, T2L;
|
||||
/* Input group 0, real parts: sums and differences of the pairs (1,4) and (2,3). */
{
|
||||
E T7, Tu, T4, Tt;
|
||||
T1 = rio[0];
|
||||
{
|
||||
E T5, T6, T2, T3;
|
||||
T5 = rio[WS(rs, 2)];
|
||||
T6 = rio[WS(rs, 3)];
|
||||
T7 = T5 + T6;
|
||||
Tu = T5 - T6;
|
||||
T2 = rio[WS(rs, 1)];
|
||||
T3 = rio[WS(rs, 4)];
|
||||
T4 = T2 + T3;
|
||||
Tt = T2 - T3;
|
||||
}
|
||||
Ta = KP559016994 * (T4 - T7);
|
||||
TG = FNMS(KP587785252, Tt, KP951056516 * Tu);
|
||||
Tv = FMA(KP951056516, Tt, KP587785252 * Tu);
|
||||
T8 = T4 + T7;
|
||||
Tb = FNMS(KP250000000, T8, T1);
|
||||
}
|
||||
/* Input group 0, imaginary parts. */
{
|
||||
E Ti, Tn, Tf, Tm;
|
||||
Tp = iio[0];
|
||||
{
|
||||
E Tg, Th, Td, Te;
|
||||
Tg = iio[WS(rs, 2)];
|
||||
Th = iio[WS(rs, 3)];
|
||||
Ti = Tg - Th;
|
||||
Tn = Tg + Th;
|
||||
Td = iio[WS(rs, 1)];
|
||||
Te = iio[WS(rs, 4)];
|
||||
Tf = Td - Te;
|
||||
Tm = Td + Te;
|
||||
}
|
||||
Tj = FMA(KP951056516, Tf, KP587785252 * Ti);
|
||||
TD = FNMS(KP587785252, Tf, KP951056516 * Ti);
|
||||
To = KP559016994 * (Tm - Tn);
|
||||
Tq = Tm + Tn;
|
||||
Tr = FNMS(KP250000000, Tq, Tp);
|
||||
}
|
||||
/* Input group 1 (offset WS(vs, 1)), real parts. */
{
|
||||
E TT, T1g, TQ, T1f;
|
||||
TN = rio[WS(vs, 1)];
|
||||
{
|
||||
E TR, TS, TO, TP;
|
||||
TR = rio[WS(vs, 1) + WS(rs, 2)];
|
||||
TS = rio[WS(vs, 1) + WS(rs, 3)];
|
||||
TT = TR + TS;
|
||||
T1g = TR - TS;
|
||||
TO = rio[WS(vs, 1) + WS(rs, 1)];
|
||||
TP = rio[WS(vs, 1) + WS(rs, 4)];
|
||||
TQ = TO + TP;
|
||||
T1f = TO - TP;
|
||||
}
|
||||
TW = KP559016994 * (TQ - TT);
|
||||
T1s = FNMS(KP587785252, T1f, KP951056516 * T1g);
|
||||
T1h = FMA(KP951056516, T1f, KP587785252 * T1g);
|
||||
TU = TQ + TT;
|
||||
TX = FNMS(KP250000000, TU, TN);
|
||||
}
|
||||
/* Input group 1, imaginary parts. */
{
|
||||
E T14, T19, T11, T18;
|
||||
T1b = iio[WS(vs, 1)];
|
||||
{
|
||||
E T12, T13, TZ, T10;
|
||||
T12 = iio[WS(vs, 1) + WS(rs, 2)];
|
||||
T13 = iio[WS(vs, 1) + WS(rs, 3)];
|
||||
T14 = T12 - T13;
|
||||
T19 = T12 + T13;
|
||||
TZ = iio[WS(vs, 1) + WS(rs, 1)];
|
||||
T10 = iio[WS(vs, 1) + WS(rs, 4)];
|
||||
T11 = TZ - T10;
|
||||
T18 = TZ + T10;
|
||||
}
|
||||
T15 = FMA(KP951056516, T11, KP587785252 * T14);
|
||||
T1p = FNMS(KP587785252, T11, KP951056516 * T14);
|
||||
T1a = KP559016994 * (T18 - T19);
|
||||
T1c = T18 + T19;
|
||||
T1d = FNMS(KP250000000, T1c, T1b);
|
||||
}
|
||||
/* Input group 2 (offset WS(vs, 2)), real parts. */
{
|
||||
E T1F, T22, T1C, T21;
|
||||
T1z = rio[WS(vs, 2)];
|
||||
{
|
||||
E T1D, T1E, T1A, T1B;
|
||||
T1D = rio[WS(vs, 2) + WS(rs, 2)];
|
||||
T1E = rio[WS(vs, 2) + WS(rs, 3)];
|
||||
T1F = T1D + T1E;
|
||||
T22 = T1D - T1E;
|
||||
T1A = rio[WS(vs, 2) + WS(rs, 1)];
|
||||
T1B = rio[WS(vs, 2) + WS(rs, 4)];
|
||||
T1C = T1A + T1B;
|
||||
T21 = T1A - T1B;
|
||||
}
|
||||
T1I = KP559016994 * (T1C - T1F);
|
||||
T2e = FNMS(KP587785252, T21, KP951056516 * T22);
|
||||
T23 = FMA(KP951056516, T21, KP587785252 * T22);
|
||||
T1G = T1C + T1F;
|
||||
T1J = FNMS(KP250000000, T1G, T1z);
|
||||
}
|
||||
/* Input group 2, imaginary parts. */
{
|
||||
E T1Q, T1V, T1N, T1U;
|
||||
T1X = iio[WS(vs, 2)];
|
||||
{
|
||||
E T1O, T1P, T1L, T1M;
|
||||
T1O = iio[WS(vs, 2) + WS(rs, 2)];
|
||||
T1P = iio[WS(vs, 2) + WS(rs, 3)];
|
||||
T1Q = T1O - T1P;
|
||||
T1V = T1O + T1P;
|
||||
T1L = iio[WS(vs, 2) + WS(rs, 1)];
|
||||
T1M = iio[WS(vs, 2) + WS(rs, 4)];
|
||||
T1N = T1L - T1M;
|
||||
T1U = T1L + T1M;
|
||||
}
|
||||
T1R = FMA(KP951056516, T1N, KP587785252 * T1Q);
|
||||
T2b = FNMS(KP587785252, T1N, KP951056516 * T1Q);
|
||||
T1W = KP559016994 * (T1U - T1V);
|
||||
T1Y = T1U + T1V;
|
||||
T1Z = FNMS(KP250000000, T1Y, T1X);
|
||||
}
|
||||
/* Input group 4 (offset WS(vs, 4)), imaginary parts. */
{
|
||||
E T3o, T3t, T3l, T3s;
|
||||
T3v = iio[WS(vs, 4)];
|
||||
{
|
||||
E T3m, T3n, T3j, T3k;
|
||||
T3m = iio[WS(vs, 4) + WS(rs, 2)];
|
||||
T3n = iio[WS(vs, 4) + WS(rs, 3)];
|
||||
T3o = T3m - T3n;
|
||||
T3t = T3m + T3n;
|
||||
T3j = iio[WS(vs, 4) + WS(rs, 1)];
|
||||
T3k = iio[WS(vs, 4) + WS(rs, 4)];
|
||||
T3l = T3j - T3k;
|
||||
T3s = T3j + T3k;
|
||||
}
|
||||
T3p = FMA(KP951056516, T3l, KP587785252 * T3o);
|
||||
T3J = FNMS(KP587785252, T3l, KP951056516 * T3o);
|
||||
T3u = KP559016994 * (T3s - T3t);
|
||||
T3w = T3s + T3t;
|
||||
T3x = FNMS(KP250000000, T3w, T3v);
|
||||
}
|
||||
/* Input group 4, real parts. */
{
|
||||
E T3d, T3A, T3a, T3z;
|
||||
T37 = rio[WS(vs, 4)];
|
||||
{
|
||||
E T3b, T3c, T38, T39;
|
||||
T3b = rio[WS(vs, 4) + WS(rs, 2)];
|
||||
T3c = rio[WS(vs, 4) + WS(rs, 3)];
|
||||
T3d = T3b + T3c;
|
||||
T3A = T3b - T3c;
|
||||
T38 = rio[WS(vs, 4) + WS(rs, 1)];
|
||||
T39 = rio[WS(vs, 4) + WS(rs, 4)];
|
||||
T3a = T38 + T39;
|
||||
T3z = T38 - T39;
|
||||
}
|
||||
T3g = KP559016994 * (T3a - T3d);
|
||||
T3M = FNMS(KP587785252, T3z, KP951056516 * T3A);
|
||||
T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
|
||||
T3e = T3a + T3d;
|
||||
T3h = FNMS(KP250000000, T3e, T37);
|
||||
}
|
||||
/* Input group 3 (offset WS(vs, 3)), real parts. */
{
|
||||
E T2r, T2O, T2o, T2N;
|
||||
T2l = rio[WS(vs, 3)];
|
||||
{
|
||||
E T2p, T2q, T2m, T2n;
|
||||
T2p = rio[WS(vs, 3) + WS(rs, 2)];
|
||||
T2q = rio[WS(vs, 3) + WS(rs, 3)];
|
||||
T2r = T2p + T2q;
|
||||
T2O = T2p - T2q;
|
||||
T2m = rio[WS(vs, 3) + WS(rs, 1)];
|
||||
T2n = rio[WS(vs, 3) + WS(rs, 4)];
|
||||
T2o = T2m + T2n;
|
||||
T2N = T2m - T2n;
|
||||
}
|
||||
T2u = KP559016994 * (T2o - T2r);
|
||||
T30 = FNMS(KP587785252, T2N, KP951056516 * T2O);
|
||||
T2P = FMA(KP951056516, T2N, KP587785252 * T2O);
|
||||
T2s = T2o + T2r;
|
||||
T2v = FNMS(KP250000000, T2s, T2l);
|
||||
}
|
||||
/* Input group 3, imaginary parts. */
{
|
||||
E T2C, T2H, T2z, T2G;
|
||||
T2J = iio[WS(vs, 3)];
|
||||
{
|
||||
E T2A, T2B, T2x, T2y;
|
||||
T2A = iio[WS(vs, 3) + WS(rs, 2)];
|
||||
T2B = iio[WS(vs, 3) + WS(rs, 3)];
|
||||
T2C = T2A - T2B;
|
||||
T2H = T2A + T2B;
|
||||
T2x = iio[WS(vs, 3) + WS(rs, 1)];
|
||||
T2y = iio[WS(vs, 3) + WS(rs, 4)];
|
||||
T2z = T2x - T2y;
|
||||
T2G = T2x + T2y;
|
||||
}
|
||||
T2D = FMA(KP951056516, T2z, KP587785252 * T2C);
|
||||
T2X = FNMS(KP587785252, T2z, KP951056516 * T2C);
|
||||
T2I = KP559016994 * (T2G - T2H);
|
||||
T2K = T2G + T2H;
|
||||
T2L = FNMS(KP250000000, T2K, T2J);
|
||||
}
|
||||
/* All inputs are in locals now. Store each group's plain sum: group v goes to index WS(rs, v), no twiddle applied. */
rio[0] = T1 + T8;
|
||||
iio[0] = Tp + Tq;
|
||||
rio[WS(rs, 1)] = TN + TU;
|
||||
iio[WS(rs, 1)] = T1b + T1c;
|
||||
rio[WS(rs, 2)] = T1z + T1G;
|
||||
iio[WS(rs, 2)] = T1X + T1Y;
|
||||
iio[WS(rs, 4)] = T3v + T3w;
|
||||
rio[WS(rs, 4)] = T37 + T3e;
|
||||
rio[WS(rs, 3)] = T2l + T2s;
|
||||
iio[WS(rs, 3)] = T2J + T2K;
|
||||
/* Group 0: results paired with W[0..1] and W[6..7], stored at WS(vs, 1) and WS(vs, 4). */
{
|
||||
E Tk, Ty, Tw, TA, Tc, Ts;
|
||||
Tc = Ta + Tb;
|
||||
Tk = Tc + Tj;
|
||||
Ty = Tc - Tj;
|
||||
Ts = To + Tr;
|
||||
Tw = Ts - Tv;
|
||||
TA = Tv + Ts;
|
||||
{
|
||||
E T9, Tl, Tx, Tz;
|
||||
T9 = W[0];
|
||||
Tl = W[1];
|
||||
rio[WS(vs, 1)] = FMA(T9, Tk, Tl * Tw);
|
||||
iio[WS(vs, 1)] = FNMS(Tl, Tk, T9 * Tw);
|
||||
Tx = W[6];
|
||||
Tz = W[7];
|
||||
rio[WS(vs, 4)] = FMA(Tx, Ty, Tz * TA);
|
||||
iio[WS(vs, 4)] = FNMS(Tz, Ty, Tx * TA);
|
||||
}
|
||||
}
|
||||
/* Group 0: results paired with W[2..3] and W[4..5], stored at WS(vs, 2) and WS(vs, 3). */
{
|
||||
E TE, TK, TI, TM, TC, TH;
|
||||
TC = Tb - Ta;
|
||||
TE = TC - TD;
|
||||
TK = TC + TD;
|
||||
TH = Tr - To;
|
||||
TI = TG + TH;
|
||||
TM = TH - TG;
|
||||
{
|
||||
E TB, TF, TJ, TL;
|
||||
TB = W[2];
|
||||
TF = W[3];
|
||||
rio[WS(vs, 2)] = FMA(TB, TE, TF * TI);
|
||||
iio[WS(vs, 2)] = FNMS(TF, TE, TB * TI);
|
||||
TJ = W[4];
|
||||
TL = W[5];
|
||||
rio[WS(vs, 3)] = FMA(TJ, TK, TL * TM);
|
||||
iio[WS(vs, 3)] = FNMS(TL, TK, TJ * TM);
|
||||
}
|
||||
}
|
||||
/* Group 2: W[2..3] and W[4..5] results, stored at WS(vs, 2) + WS(rs, 2) and WS(vs, 3) + WS(rs, 2). */
{
|
||||
E T2c, T2i, T2g, T2k, T2a, T2f;
|
||||
T2a = T1J - T1I;
|
||||
T2c = T2a - T2b;
|
||||
T2i = T2a + T2b;
|
||||
T2f = T1Z - T1W;
|
||||
T2g = T2e + T2f;
|
||||
T2k = T2f - T2e;
|
||||
{
|
||||
E T29, T2d, T2h, T2j;
|
||||
T29 = W[2];
|
||||
T2d = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T29, T2c, T2d * T2g);
|
||||
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2d, T2c, T29 * T2g);
|
||||
T2h = W[4];
|
||||
T2j = W[5];
|
||||
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2h, T2i, T2j * T2k);
|
||||
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2j, T2i, T2h * T2k);
|
||||
}
|
||||
}
|
||||
/* Group 4: W[2..3] and W[4..5] results, stored at WS(vs, 2) + WS(rs, 4) and WS(vs, 3) + WS(rs, 4). */
{
|
||||
E T3K, T3Q, T3O, T3S, T3I, T3N;
|
||||
T3I = T3h - T3g;
|
||||
T3K = T3I - T3J;
|
||||
T3Q = T3I + T3J;
|
||||
T3N = T3x - T3u;
|
||||
T3O = T3M + T3N;
|
||||
T3S = T3N - T3M;
|
||||
{
|
||||
E T3H, T3L, T3P, T3R;
|
||||
T3H = W[2];
|
||||
T3L = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 4)] = FMA(T3H, T3K, T3L * T3O);
|
||||
iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T3L, T3K, T3H * T3O);
|
||||
T3P = W[4];
|
||||
T3R = W[5];
|
||||
rio[WS(vs, 3) + WS(rs, 4)] = FMA(T3P, T3Q, T3R * T3S);
|
||||
iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T3R, T3Q, T3P * T3S);
|
||||
}
|
||||
}
|
||||
/* Group 2: W[0..1] and W[6..7] results, stored at WS(vs, 1) + WS(rs, 2) and WS(vs, 4) + WS(rs, 2). */
{
|
||||
E T1S, T26, T24, T28, T1K, T20;
|
||||
T1K = T1I + T1J;
|
||||
T1S = T1K + T1R;
|
||||
T26 = T1K - T1R;
|
||||
T20 = T1W + T1Z;
|
||||
T24 = T20 - T23;
|
||||
T28 = T23 + T20;
|
||||
{
|
||||
E T1H, T1T, T25, T27;
|
||||
T1H = W[0];
|
||||
T1T = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1H, T1S, T1T * T24);
|
||||
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1T, T1S, T1H * T24);
|
||||
T25 = W[6];
|
||||
T27 = W[7];
|
||||
rio[WS(vs, 4) + WS(rs, 2)] = FMA(T25, T26, T27 * T28);
|
||||
iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T27, T26, T25 * T28);
|
||||
}
|
||||
}
|
||||
/* Group 3: W[0..1] and W[6..7] results, stored at WS(vs, 1) + WS(rs, 3) and WS(vs, 4) + WS(rs, 3). */
{
|
||||
E T2E, T2S, T2Q, T2U, T2w, T2M;
|
||||
T2w = T2u + T2v;
|
||||
T2E = T2w + T2D;
|
||||
T2S = T2w - T2D;
|
||||
T2M = T2I + T2L;
|
||||
T2Q = T2M - T2P;
|
||||
T2U = T2P + T2M;
|
||||
{
|
||||
E T2t, T2F, T2R, T2T;
|
||||
T2t = W[0];
|
||||
T2F = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2t, T2E, T2F * T2Q);
|
||||
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T2F, T2E, T2t * T2Q);
|
||||
T2R = W[6];
|
||||
T2T = W[7];
|
||||
rio[WS(vs, 4) + WS(rs, 3)] = FMA(T2R, T2S, T2T * T2U);
|
||||
iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T2T, T2S, T2R * T2U);
|
||||
}
|
||||
}
|
||||
/* Group 3: W[2..3] and W[4..5] results, stored at WS(vs, 2) + WS(rs, 3) and WS(vs, 3) + WS(rs, 3). */
{
|
||||
E T2Y, T34, T32, T36, T2W, T31;
|
||||
T2W = T2v - T2u;
|
||||
T2Y = T2W - T2X;
|
||||
T34 = T2W + T2X;
|
||||
T31 = T2L - T2I;
|
||||
T32 = T30 + T31;
|
||||
T36 = T31 - T30;
|
||||
{
|
||||
E T2V, T2Z, T33, T35;
|
||||
T2V = W[2];
|
||||
T2Z = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T2V, T2Y, T2Z * T32);
|
||||
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T2Z, T2Y, T2V * T32);
|
||||
T33 = W[4];
|
||||
T35 = W[5];
|
||||
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T33, T34, T35 * T36);
|
||||
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T35, T34, T33 * T36);
|
||||
}
|
||||
}
|
||||
/* Group 4: W[0..1] and W[6..7] results, stored at WS(vs, 1) + WS(rs, 4) and WS(vs, 4) + WS(rs, 4). */
{
|
||||
E T3q, T3E, T3C, T3G, T3i, T3y;
|
||||
T3i = T3g + T3h;
|
||||
T3q = T3i + T3p;
|
||||
T3E = T3i - T3p;
|
||||
T3y = T3u + T3x;
|
||||
T3C = T3y - T3B;
|
||||
T3G = T3B + T3y;
|
||||
{
|
||||
E T3f, T3r, T3D, T3F;
|
||||
T3f = W[0];
|
||||
T3r = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3f, T3q, T3r * T3C);
|
||||
iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3r, T3q, T3f * T3C);
|
||||
T3D = W[6];
|
||||
T3F = W[7];
|
||||
rio[WS(vs, 4) + WS(rs, 4)] = FMA(T3D, T3E, T3F * T3G);
|
||||
iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T3F, T3E, T3D * T3G);
|
||||
}
|
||||
}
|
||||
/* Group 1: W[2..3] and W[4..5] results, stored at WS(vs, 2) + WS(rs, 1) and WS(vs, 3) + WS(rs, 1). */
{
|
||||
E T1q, T1w, T1u, T1y, T1o, T1t;
|
||||
T1o = TX - TW;
|
||||
T1q = T1o - T1p;
|
||||
T1w = T1o + T1p;
|
||||
T1t = T1d - T1a;
|
||||
T1u = T1s + T1t;
|
||||
T1y = T1t - T1s;
|
||||
{
|
||||
E T1n, T1r, T1v, T1x;
|
||||
T1n = W[2];
|
||||
T1r = W[3];
|
||||
rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1n, T1q, T1r * T1u);
|
||||
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1r, T1q, T1n * T1u);
|
||||
T1v = W[4];
|
||||
T1x = W[5];
|
||||
rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
|
||||
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
|
||||
}
|
||||
}
|
||||
/* Group 1: W[0..1] and W[6..7] results, stored at WS(vs, 1) + WS(rs, 1) and WS(vs, 4) + WS(rs, 1). */
{
|
||||
E T16, T1k, T1i, T1m, TY, T1e;
|
||||
TY = TW + TX;
|
||||
T16 = TY + T15;
|
||||
T1k = TY - T15;
|
||||
T1e = T1a + T1d;
|
||||
T1i = T1e - T1h;
|
||||
T1m = T1h + T1e;
|
||||
{
|
||||
E TV, T17, T1j, T1l;
|
||||
TV = W[0];
|
||||
T17 = W[1];
|
||||
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TV, T16, T17 * T1i);
|
||||
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T17, T16, TV * T1i);
|
||||
T1j = W[6];
|
||||
T1l = W[7];
|
||||
rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1j, T1k, T1l * T1m);
|
||||
iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1l, T1k, T1j * T1m);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table for the non-FMA q1_5; it is passed to the planner through desc. */
/* NOTE(review): the meaning of TW_FULL / TW_NEXT and of their numeric fields is defined in project headers not visible here; the kernel itself reads W[0]..W[7] per iteration. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor registered for the non-FMA q1_5: name string, twinstr table, &GENUS, and the figures { 130, 70, 70, 0 }, */
/* whose first three values equal the additions / multiplications / fused multiply-adds quoted in the generator comment of this variant. */
/* NOTE(review): 5 is presumably the radix; the trailing zeros are not explained in this file -- confirm against the ct_desc declaration. */
static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, { 130, 70, 70, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: hands the non-FMA q1_5 kernel and its descriptor to planner p via X(kdft_difsq_register). */
void X(codelet_q1_5) (planner *p) {
|
||||
X(kdft_difsq_register) (p, q1_5, &desc);
|
||||
}
|
||||
#endif
|
||||
1320
fftw-3.3.10/dft/scalar/codelets/q1_6.c
Normal file
1320
fftw-3.3.10/dft/scalar/codelets/q1_6.c
Normal file
File diff suppressed because it is too large
Load Diff
2355
fftw-3.3.10/dft/scalar/codelets/q1_8.c
Normal file
2355
fftw-3.3.10/dft/scalar/codelets/q1_8.c
Normal file
File diff suppressed because it is too large
Load Diff
489
fftw-3.3.10/dft/scalar/codelets/t1_10.c
Normal file
489
fftw-3.3.10/dft/scalar/codelets/t1_10.c
Normal file
@@ -0,0 +1,489 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 72 FP multiplications,
|
||||
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
|
||||
* 47 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_10, variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA
 * is defined (generator run with -fma).
 *
 * For every m in [mb, me) the kernel works in place on ten complex values
 * ri/ii[WS(rs, k)], k = 0..9:
 *   - element 0 is used as loaded; each element k = 1..9 is first combined
 *     with the twiddle pair (W[2k-2], W[2k-1]);
 *   - the ten results of the size-10 butterfly are written back to
 *     ri/ii[0] and ri/ii[WS(rs, 1..9)].
 * All loads happen before the first store. After each iteration ri and ii
 * advance by ms and W by 18 entries.
 *
 * NOTE(review): DK, E, R, INT, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from project headers; the comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm.
 */
static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W starts at entry mb * 18 and moves 18 entries per iteration (W[0]..W[17] are read below). */
for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T8, T23, T12, T1U, TM, TZ, T10, T1F, T1G, T1P, T16, T17, T18, T1s, T1x;
|
||||
E T25, Tl, Ty, Tz, T1I, T1J, T1O, T13, T14, T15, T1h, T1m, T24;
|
||||
/* Elements 0 and 5: element 5 is combined with W[8], W[9]; difference and sum with element 0. */
{
|
||||
E T1, T1T, T3, T6, T4, T1R, T2, T7, T1S, T5;
|
||||
T1 = ri[0];
|
||||
T1T = ii[0];
|
||||
T3 = ri[WS(rs, 5)];
|
||||
T6 = ii[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = T2 * T3;
|
||||
T1R = T2 * T6;
|
||||
T5 = W[9];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1S = FNMS(T5, T3, T1R);
|
||||
T8 = T1 - T7;
|
||||
T23 = T1T - T1S;
|
||||
T12 = T1 + T7;
|
||||
T1U = T1S + T1T;
|
||||
}
|
||||
/* Elements 4, 1, 9, 6 with twiddle pairs W[6..7], W[0..1], W[16..17], W[10..11]; pairwise sums and differences. */
{
|
||||
E TF, T1p, TY, T1w, TL, T1r, TS, T1u;
|
||||
{
|
||||
E TB, TE, TC, T1o, TA, TD;
|
||||
TB = ri[WS(rs, 4)];
|
||||
TE = ii[WS(rs, 4)];
|
||||
TA = W[6];
|
||||
TC = TA * TB;
|
||||
T1o = TA * TE;
|
||||
TD = W[7];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T1p = FNMS(TD, TB, T1o);
|
||||
}
|
||||
{
|
||||
E TU, TX, TV, T1v, TT, TW;
|
||||
TU = ri[WS(rs, 1)];
|
||||
TX = ii[WS(rs, 1)];
|
||||
TT = W[0];
|
||||
TV = TT * TU;
|
||||
T1v = TT * TX;
|
||||
TW = W[1];
|
||||
TY = FMA(TW, TX, TV);
|
||||
T1w = FNMS(TW, TU, T1v);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T1q, TG, TJ;
|
||||
TH = ri[WS(rs, 9)];
|
||||
TK = ii[WS(rs, 9)];
|
||||
TG = W[16];
|
||||
TI = TG * TH;
|
||||
T1q = TG * TK;
|
||||
TJ = W[17];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T1r = FNMS(TJ, TH, T1q);
|
||||
}
|
||||
{
|
||||
E TO, TR, TP, T1t, TN, TQ;
|
||||
TO = ri[WS(rs, 6)];
|
||||
TR = ii[WS(rs, 6)];
|
||||
TN = W[10];
|
||||
TP = TN * TO;
|
||||
T1t = TN * TR;
|
||||
TQ = W[11];
|
||||
TS = FMA(TQ, TR, TP);
|
||||
T1u = FNMS(TQ, TO, T1t);
|
||||
}
|
||||
TM = TF - TL;
|
||||
TZ = TS - TY;
|
||||
T10 = TM + TZ;
|
||||
T1F = T1p + T1r;
|
||||
T1G = T1u + T1w;
|
||||
T1P = T1F + T1G;
|
||||
T16 = TF + TL;
|
||||
T17 = TS + TY;
|
||||
T18 = T16 + T17;
|
||||
T1s = T1p - T1r;
|
||||
T1x = T1u - T1w;
|
||||
T25 = T1s + T1x;
|
||||
}
|
||||
/* Elements 2, 3, 7, 8 with twiddle pairs W[2..3], W[4..5], W[12..13], W[14..15]; pairwise sums and differences. */
{
|
||||
E Te, T1e, Tx, T1l, Tk, T1g, Tr, T1j;
|
||||
{
|
||||
E Ta, Td, Tb, T1d, T9, Tc;
|
||||
Ta = ri[WS(rs, 2)];
|
||||
Td = ii[WS(rs, 2)];
|
||||
T9 = W[2];
|
||||
Tb = T9 * Ta;
|
||||
T1d = T9 * Td;
|
||||
Tc = W[3];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
T1e = FNMS(Tc, Ta, T1d);
|
||||
}
|
||||
{
|
||||
E Tt, Tw, Tu, T1k, Ts, Tv;
|
||||
Tt = ri[WS(rs, 3)];
|
||||
Tw = ii[WS(rs, 3)];
|
||||
Ts = W[4];
|
||||
Tu = Ts * Tt;
|
||||
T1k = Ts * Tw;
|
||||
Tv = W[5];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1l = FNMS(Tv, Tt, T1k);
|
||||
}
|
||||
{
|
||||
E Tg, Tj, Th, T1f, Tf, Ti;
|
||||
Tg = ri[WS(rs, 7)];
|
||||
Tj = ii[WS(rs, 7)];
|
||||
Tf = W[12];
|
||||
Th = Tf * Tg;
|
||||
T1f = Tf * Tj;
|
||||
Ti = W[13];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
T1g = FNMS(Ti, Tg, T1f);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1i, Tm, Tp;
|
||||
Tn = ri[WS(rs, 8)];
|
||||
Tq = ii[WS(rs, 8)];
|
||||
Tm = W[14];
|
||||
To = Tm * Tn;
|
||||
T1i = Tm * Tq;
|
||||
Tp = W[15];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1j = FNMS(Tp, Tn, T1i);
|
||||
}
|
||||
Tl = Te - Tk;
|
||||
Ty = Tr - Tx;
|
||||
Tz = Tl + Ty;
|
||||
T1I = T1e + T1g;
|
||||
T1J = T1j + T1l;
|
||||
T1O = T1I + T1J;
|
||||
T13 = Te + Tk;
|
||||
T14 = Tr + Tx;
|
||||
T15 = T13 + T14;
|
||||
T1h = T1e - T1g;
|
||||
T1m = T1j - T1l;
|
||||
T24 = T1h + T1m;
|
||||
}
|
||||
/* Real parts of the odd-indexed outputs: ri[WS(rs, 5)], then 7, 3, 9, 1. */
{
|
||||
E T1b, T11, T1a, T1z, T1B, T1n, T1y, T1A, T1c;
|
||||
T1b = Tz - T10;
|
||||
T11 = Tz + T10;
|
||||
T1a = FNMS(KP250000000, T11, T8);
|
||||
T1n = T1h - T1m;
|
||||
T1y = T1s - T1x;
|
||||
T1z = FMA(KP618033988, T1y, T1n);
|
||||
T1B = FNMS(KP618033988, T1n, T1y);
|
||||
ri[WS(rs, 5)] = T8 + T11;
|
||||
T1A = FNMS(KP559016994, T1b, T1a);
|
||||
ri[WS(rs, 7)] = FNMS(KP951056516, T1B, T1A);
|
||||
ri[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
|
||||
T1c = FMA(KP559016994, T1b, T1a);
|
||||
ri[WS(rs, 9)] = FNMS(KP951056516, T1z, T1c);
|
||||
ri[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
|
||||
}
|
||||
/* Imaginary parts of the odd-indexed outputs: ii[WS(rs, 5)], then 3, 7, 1, 9. */
{
|
||||
E T28, T26, T27, T2c, T2e, T2a, T2b, T2d, T29;
|
||||
T28 = T24 - T25;
|
||||
T26 = T24 + T25;
|
||||
T27 = FNMS(KP250000000, T26, T23);
|
||||
T2a = Tl - Ty;
|
||||
T2b = TM - TZ;
|
||||
T2c = FMA(KP618033988, T2b, T2a);
|
||||
T2e = FNMS(KP618033988, T2a, T2b);
|
||||
ii[WS(rs, 5)] = T26 + T23;
|
||||
T2d = FNMS(KP559016994, T28, T27);
|
||||
ii[WS(rs, 3)] = FNMS(KP951056516, T2e, T2d);
|
||||
ii[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
|
||||
T29 = FMA(KP559016994, T28, T27);
|
||||
ii[WS(rs, 1)] = FNMS(KP951056516, T2c, T29);
|
||||
ii[WS(rs, 9)] = FMA(KP951056516, T2c, T29);
|
||||
}
|
||||
/* Real parts of the even-indexed outputs: ri[0], then 4, 6, 2, 8. */
{
|
||||
E T1D, T19, T1C, T1L, T1N, T1H, T1K, T1M, T1E;
|
||||
T1D = T15 - T18;
|
||||
T19 = T15 + T18;
|
||||
T1C = FNMS(KP250000000, T19, T12);
|
||||
T1H = T1F - T1G;
|
||||
T1K = T1I - T1J;
|
||||
T1L = FNMS(KP618033988, T1K, T1H);
|
||||
T1N = FMA(KP618033988, T1H, T1K);
|
||||
ri[0] = T12 + T19;
|
||||
T1M = FMA(KP559016994, T1D, T1C);
|
||||
ri[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
|
||||
ri[WS(rs, 6)] = FMA(KP951056516, T1N, T1M);
|
||||
T1E = FNMS(KP559016994, T1D, T1C);
|
||||
ri[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
|
||||
ri[WS(rs, 8)] = FMA(KP951056516, T1L, T1E);
|
||||
}
|
||||
/* Imaginary parts of the even-indexed outputs: ii[0], then 4, 6, 2, 8. */
{
|
||||
E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X;
|
||||
T1W = T1O - T1P;
|
||||
T1Q = T1O + T1P;
|
||||
T1V = FNMS(KP250000000, T1Q, T1U);
|
||||
T1Y = T16 - T17;
|
||||
T1Z = T13 - T14;
|
||||
T20 = FNMS(KP618033988, T1Z, T1Y);
|
||||
T22 = FMA(KP618033988, T1Y, T1Z);
|
||||
ii[0] = T1Q + T1U;
|
||||
T21 = FMA(KP559016994, T1W, T1V);
|
||||
ii[WS(rs, 4)] = FMA(KP951056516, T22, T21);
|
||||
ii[WS(rs, 6)] = FNMS(KP951056516, T22, T21);
|
||||
T1X = FNMS(KP559016994, T1W, T1V);
|
||||
ii[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
|
||||
ii[WS(rs, 8)] = FNMS(KP951056516, T20, T1X);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table for the FMA t1_10; it is passed to the planner through desc. */
/* NOTE(review): the meaning of TW_FULL / TW_NEXT and of their numeric fields is defined in project headers not visible here; the kernel itself reads W[0]..W[17] per iteration. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor registered for the FMA t1_10: name string, twinstr table, &GENUS, and the figures { 48, 18, 54, 0 }, */
/* whose first three values equal the additions / multiplications / fused multiply-adds quoted in the generator comment of this variant. */
/* NOTE(review): 10 is presumably the radix; the trailing zeros are not explained in this file -- confirm against the ct_desc declaration. */
static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 48, 18, 54, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: hands the FMA t1_10 kernel and its descriptor to planner p via X(kdft_dit_register). */
void X(codelet_t1_10) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_10, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 102 FP additions, 60 FP multiplications,
|
||||
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 45 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_10, variant in the #else branch (generator run without -fma).
 *
 * For every m in [mb, me) the kernel works in place on ten complex values
 * ri/ii[WS(rs, k)], k = 0..9:
 *   - element 0 is used as loaded; each element k = 1..9 is first combined
 *     with the twiddle pair (W[2k-2], W[2k-1]);
 *   - the ten results of the size-10 butterfly are written back to
 *     ri/ii[0] and ri/ii[WS(rs, 1..9)].
 * All loads happen before the first store. After each iteration ri and ii
 * advance by ms and W by 18 entries.
 *
 * NOTE(review): DK, E, R, INT, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come
 * from project headers; the comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm.
 */
static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
/* W starts at entry mb * 18 and moves 18 entries per iteration (W[0]..W[17] are read below). */
for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g;
|
||||
E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L;
|
||||
/* Elements 0 and 5: element 5 is combined with W[8], W[9]; difference and sum with element 0. */
{
|
||||
E T1, T1B, T6, T1A;
|
||||
T1 = ri[0];
|
||||
T1B = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 5)];
|
||||
T5 = ii[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = W[9];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T1A = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 - T6;
|
||||
T1O = T1B - T1A;
|
||||
TT = T1 + T6;
|
||||
T1C = T1A + T1B;
|
||||
}
|
||||
/* Elements 4, 1, 9, 6 with twiddle pairs W[6..7], W[0..1], W[16..17], W[10..11]; pairwise sums and differences. */
{
|
||||
E Tz, T1b, TP, T1f, TE, T1c, TK, T1e;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = ri[WS(rs, 4)];
|
||||
Ty = ii[WS(rs, 4)];
|
||||
Tv = W[6];
|
||||
Tx = W[7];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T1b = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TM, TO, TL, TN;
|
||||
TM = ri[WS(rs, 1)];
|
||||
TO = ii[WS(rs, 1)];
|
||||
TL = W[0];
|
||||
TN = W[1];
|
||||
TP = FMA(TL, TM, TN * TO);
|
||||
T1f = FNMS(TN, TM, TL * TO);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = ri[WS(rs, 9)];
|
||||
TD = ii[WS(rs, 9)];
|
||||
TA = W[16];
|
||||
TC = W[17];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T1c = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
{
|
||||
E TH, TJ, TG, TI;
|
||||
TH = ri[WS(rs, 6)];
|
||||
TJ = ii[WS(rs, 6)];
|
||||
TG = W[10];
|
||||
TI = W[11];
|
||||
TK = FMA(TG, TH, TI * TJ);
|
||||
T1e = FNMS(TI, TH, TG * TJ);
|
||||
}
|
||||
TF = Tz - TE;
|
||||
TQ = TK - TP;
|
||||
TR = TF + TQ;
|
||||
T1o = T1b + T1c;
|
||||
T1p = T1e + T1f;
|
||||
T1y = T1o + T1p;
|
||||
TX = Tz + TE;
|
||||
TY = TK + TP;
|
||||
TZ = TX + TY;
|
||||
T1d = T1b - T1c;
|
||||
T1g = T1e - T1f;
|
||||
T1M = T1d + T1g;
|
||||
}
|
||||
/* Elements 2, 3, 7, 8 with twiddle pairs W[2..3], W[4..5], W[12..13], W[14..15]; pairwise sums and differences. */
{
|
||||
E Tc, T14, Ts, T18, Th, T15, Tn, T17;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = ri[WS(rs, 2)];
|
||||
Tb = ii[WS(rs, 2)];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
T14 = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 3)];
|
||||
Tr = ii[WS(rs, 3)];
|
||||
To = W[4];
|
||||
Tq = W[5];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T18 = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = ri[WS(rs, 7)];
|
||||
Tg = ii[WS(rs, 7)];
|
||||
Td = W[12];
|
||||
Tf = W[13];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
T15 = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = ri[WS(rs, 8)];
|
||||
Tm = ii[WS(rs, 8)];
|
||||
Tj = W[14];
|
||||
Tl = W[15];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T17 = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
Ti = Tc - Th;
|
||||
Tt = Tn - Ts;
|
||||
Tu = Ti + Tt;
|
||||
T1r = T14 + T15;
|
||||
T1s = T17 + T18;
|
||||
T1x = T1r + T1s;
|
||||
TU = Tc + Th;
|
||||
TV = Tn + Ts;
|
||||
TW = TU + TV;
|
||||
T16 = T14 - T15;
|
||||
T19 = T17 - T18;
|
||||
T1L = T16 + T19;
|
||||
}
|
||||
/* Real parts of the odd-indexed outputs: ri[WS(rs, 5)], then 7, 3, 9, 1. */
{
|
||||
E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
|
||||
T11 = KP559016994 * (Tu - TR);
|
||||
TS = Tu + TR;
|
||||
T12 = FNMS(KP250000000, TS, T7);
|
||||
T1a = T16 - T19;
|
||||
T1h = T1d - T1g;
|
||||
T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
|
||||
T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
|
||||
ri[WS(rs, 5)] = T7 + TS;
|
||||
T1j = T12 - T11;
|
||||
ri[WS(rs, 7)] = T1j - T1k;
|
||||
ri[WS(rs, 3)] = T1j + T1k;
|
||||
T13 = T11 + T12;
|
||||
ri[WS(rs, 9)] = T13 - T1i;
|
||||
ri[WS(rs, 1)] = T13 + T1i;
|
||||
}
|
||||
/* Imaginary parts of the odd-indexed outputs: ii[WS(rs, 5)], then 3, 7, 1, 9. */
{
|
||||
E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R;
|
||||
T1N = KP559016994 * (T1L - T1M);
|
||||
T1P = T1L + T1M;
|
||||
T1Q = FNMS(KP250000000, T1P, T1O);
|
||||
T1S = Ti - Tt;
|
||||
T1T = TF - TQ;
|
||||
T1U = FMA(KP951056516, T1S, KP587785252 * T1T);
|
||||
T1W = FNMS(KP587785252, T1S, KP951056516 * T1T);
|
||||
ii[WS(rs, 5)] = T1P + T1O;
|
||||
T1V = T1Q - T1N;
|
||||
ii[WS(rs, 3)] = T1V - T1W;
|
||||
ii[WS(rs, 7)] = T1W + T1V;
|
||||
T1R = T1N + T1Q;
|
||||
ii[WS(rs, 1)] = T1R - T1U;
|
||||
ii[WS(rs, 9)] = T1U + T1R;
|
||||
}
|
||||
/* Real parts of the even-indexed outputs: ri[0], then 4, 6, 2, 8. */
{
|
||||
E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
|
||||
T1m = KP559016994 * (TW - TZ);
|
||||
T10 = TW + TZ;
|
||||
T1l = FNMS(KP250000000, T10, TT);
|
||||
T1q = T1o - T1p;
|
||||
T1t = T1r - T1s;
|
||||
T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
|
||||
T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
|
||||
ri[0] = TT + T10;
|
||||
T1v = T1m + T1l;
|
||||
ri[WS(rs, 4)] = T1v - T1w;
|
||||
ri[WS(rs, 6)] = T1v + T1w;
|
||||
T1n = T1l - T1m;
|
||||
ri[WS(rs, 2)] = T1n - T1u;
|
||||
ri[WS(rs, 8)] = T1n + T1u;
|
||||
}
|
||||
/* Imaginary parts of the even-indexed outputs: ii[0], then 4, 6, 2, 8. */
{
|
||||
E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
|
||||
T1H = KP559016994 * (T1x - T1y);
|
||||
T1z = T1x + T1y;
|
||||
T1G = FNMS(KP250000000, T1z, T1C);
|
||||
T1D = TX - TY;
|
||||
T1E = TU - TV;
|
||||
T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
|
||||
T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
|
||||
ii[0] = T1z + T1C;
|
||||
T1K = T1H + T1G;
|
||||
ii[WS(rs, 4)] = T1J + T1K;
|
||||
ii[WS(rs, 6)] = T1K - T1J;
|
||||
T1I = T1G - T1H;
|
||||
ii[WS(rs, 2)] = T1F + T1I;
|
||||
ii[WS(rs, 8)] = T1I - T1F;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table for the non-FMA t1_10; it is passed to the planner through desc. */
/* NOTE(review): the meaning of TW_FULL / TW_NEXT and of their numeric fields is defined in project headers not visible here; the kernel itself reads W[0]..W[17] per iteration. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 10 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Codelet descriptor registered for the non-FMA t1_10: name string, twinstr table, &GENUS, and the figures { 72, 30, 30, 0 }, */
/* whose first three values equal the additions / multiplications / fused multiply-adds quoted in the generator comment of this variant. */
/* NOTE(review): 10 is presumably the radix; the trailing zeros are not explained in this file -- confirm against the ct_desc declaration. */
static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 72, 30, 30, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: hands the non-FMA t1_10 kernel and its descriptor to planner p via X(kdft_dit_register). */
void X(codelet_t1_10) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_10, &desc);
|
||||
}
|
||||
#endif
|
||||
581
fftw-3.3.10/dft/scalar/codelets/t1_12.c
Normal file
581
fftw-3.3.10/dft/scalar/codelets/t1_12.c
Normal file
@@ -0,0 +1,581 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 68 FP multiplications,
|
||||
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_12, FMA variant: size-12 twiddle kernel, computed in place.
 *
 * For every m in [mb, me) the kernel reads the 12 complex values
 * ri[WS(rs, k)], ii[WS(rs, k)] (k = 0..11), combines input k >= 1 with the
 * twiddle pair W[2*(k-1)], W[2*(k-1)+1], and writes 12 results back to the
 * same ri/ii locations.  Between iterations ri and ii advance by ms and W
 * advances by 22 entries (the 11 twiddle pairs consumed per iteration); W is
 * first offset by mb * 22.
 *
 * Parameters:
 *   ri, ii - real and imaginary data arrays (read and overwritten)
 *   W      - twiddle table, 22 values per iteration
 *   rs     - stride passed to WS() to address element k
 *   mb, me - first and one-past-last iteration index
 *   ms     - element offset added to ri/ii per iteration
 *
 * NOTE(review): comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; those macros are defined outside this file --
 * confirm.  Under that assumption each twiddle step yields
 * (Wr*re + Wi*im, Wr*im - Wi*re).
 */
static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* 0.866025403... is numerically sqrt(3)/2; 0.5 is its companion constant. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the comma expressions step the data and twiddle pointers. */
for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2r, T1s, T2f, T1d, T21, T1H;
|
||||
E T1Z, Te, T2o, T1l, T2h, TT, T1V, T1A, T1T;
|
||||
/* Input 0 is used as-is (no twiddle). */
T1 = ri[0];
|
||||
T2i = ii[0];
|
||||
/* Input 6 with twiddle pair W[10], W[11]. */
{
|
||||
E Th, Tk, Ti, T2d, Tg, Tj;
|
||||
Th = ri[WS(rs, 6)];
|
||||
Tk = ii[WS(rs, 6)];
|
||||
Tg = W[10];
|
||||
Ti = Tg * Th;
|
||||
T2d = Tg * Tk;
|
||||
Tj = W[11];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T2e = FNMS(Tj, Th, T2d);
|
||||
}
|
||||
/* Input 9 with twiddle pair W[16], W[17]. */
{
|
||||
E TW, TZ, TX, T1X, TV, TY;
|
||||
TW = ri[WS(rs, 9)];
|
||||
TZ = ii[WS(rs, 9)];
|
||||
TV = W[16];
|
||||
TX = TV * TW;
|
||||
T1X = TV * TZ;
|
||||
TY = W[17];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T1Y = FNMS(TY, TW, T1X);
|
||||
}
|
||||
/* Input 3 with twiddle pair W[4], W[5]. */
{
|
||||
E TC, TF, TD, T1R, TB, TE;
|
||||
TC = ri[WS(rs, 3)];
|
||||
TF = ii[WS(rs, 3)];
|
||||
TB = W[4];
|
||||
TD = TB * TC;
|
||||
T1R = TB * TF;
|
||||
TE = W[5];
|
||||
TG = FMA(TE, TF, TD);
|
||||
T1S = FNMS(TE, TC, T1R);
|
||||
}
|
||||
/* Inputs 10 and 2 (twiddles W[18..19], W[2..3]); keeps their sums and differences. */
{
|
||||
E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
|
||||
Tn = ri[WS(rs, 10)];
|
||||
Tq = ii[WS(rs, 10)];
|
||||
Tm = W[18];
|
||||
To = Tm * Tn;
|
||||
T1o = Tm * Tq;
|
||||
Tt = ri[WS(rs, 2)];
|
||||
Tw = ii[WS(rs, 2)];
|
||||
Ts = W[2];
|
||||
Tu = Ts * Tt;
|
||||
T1q = Ts * Tw;
|
||||
{
|
||||
E Tr, T1p, Tx, T1r, Tp, Tv;
|
||||
Tp = W[19];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1p = FNMS(Tp, Tn, T1o);
|
||||
Tv = W[3];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1r = FNMS(Tv, Tt, T1q);
|
||||
Ty = Tr + Tx;
|
||||
T2r = Tx - Tr;
|
||||
T1s = T1p - T1r;
|
||||
T2f = T1p + T1r;
|
||||
}
|
||||
}
|
||||
/* Inputs 1 and 5 (twiddles W[0..1], W[8..9]); keeps their sums and differences. */
{
|
||||
E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
|
||||
T12 = ri[WS(rs, 1)];
|
||||
T15 = ii[WS(rs, 1)];
|
||||
T11 = W[0];
|
||||
T13 = T11 * T12;
|
||||
T1D = T11 * T15;
|
||||
T18 = ri[WS(rs, 5)];
|
||||
T1b = ii[WS(rs, 5)];
|
||||
T17 = W[8];
|
||||
T19 = T17 * T18;
|
||||
T1F = T17 * T1b;
|
||||
{
|
||||
E T16, T1E, T1c, T1G, T14, T1a;
|
||||
T14 = W[1];
|
||||
T16 = FMA(T14, T15, T13);
|
||||
T1E = FNMS(T14, T12, T1D);
|
||||
T1a = W[9];
|
||||
T1c = FMA(T1a, T1b, T19);
|
||||
T1G = FNMS(T1a, T18, T1F);
|
||||
T1d = T16 + T1c;
|
||||
T21 = T1c - T16;
|
||||
T1H = T1E - T1G;
|
||||
T1Z = T1E + T1G;
|
||||
}
|
||||
}
|
||||
/* Inputs 4 and 8 (twiddles W[6..7], W[14..15]); keeps their sums and differences. */
{
|
||||
E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
|
||||
T3 = ri[WS(rs, 4)];
|
||||
T6 = ii[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = T2 * T3;
|
||||
T1h = T2 * T6;
|
||||
T9 = ri[WS(rs, 8)];
|
||||
Tc = ii[WS(rs, 8)];
|
||||
T8 = W[14];
|
||||
Ta = T8 * T9;
|
||||
T1j = T8 * Tc;
|
||||
{
|
||||
E T7, T1i, Td, T1k, T5, Tb;
|
||||
T5 = W[7];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1i = FNMS(T5, T3, T1h);
|
||||
Tb = W[15];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
T1k = FNMS(Tb, T9, T1j);
|
||||
Te = T7 + Td;
|
||||
T2o = Td - T7;
|
||||
T1l = T1i - T1k;
|
||||
T2h = T1i + T1k;
|
||||
}
|
||||
}
|
||||
/* Inputs 7 and 11 (twiddles W[12..13], W[20..21]); keeps their sums and differences. */
{
|
||||
E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
|
||||
TI = ri[WS(rs, 7)];
|
||||
TL = ii[WS(rs, 7)];
|
||||
TH = W[12];
|
||||
TJ = TH * TI;
|
||||
T1w = TH * TL;
|
||||
TO = ri[WS(rs, 11)];
|
||||
TR = ii[WS(rs, 11)];
|
||||
TN = W[20];
|
||||
TP = TN * TO;
|
||||
T1y = TN * TR;
|
||||
{
|
||||
E TM, T1x, TS, T1z, TK, TQ;
|
||||
TK = W[13];
|
||||
TM = FMA(TK, TL, TJ);
|
||||
T1x = FNMS(TK, TI, T1w);
|
||||
TQ = W[21];
|
||||
TS = FMA(TQ, TR, TP);
|
||||
T1z = FNMS(TQ, TO, T1y);
|
||||
TT = TM + TS;
|
||||
T1V = TS - TM;
|
||||
T1A = T1x - T1z;
|
||||
T1T = T1x + T1z;
|
||||
}
|
||||
}
|
||||
/* Outputs 0, 3, 6, 9: built only from the plain sums of the four input groups. */
{
|
||||
E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
|
||||
{
|
||||
E Tf, Tz, T2g, T2j;
|
||||
Tf = T1 + Te;
|
||||
Tz = Tl + Ty;
|
||||
TA = Tf + Tz;
|
||||
T28 = Tf - Tz;
|
||||
T2g = T2e + T2f;
|
||||
T2j = T2h + T2i;
|
||||
T2k = T2g + T2j;
|
||||
T2m = T2j - T2g;
|
||||
}
|
||||
{
|
||||
E TU, T1e, T29, T2a;
|
||||
TU = TG + TT;
|
||||
T1e = T10 + T1d;
|
||||
T1f = TU + T1e;
|
||||
T2l = TU - T1e;
|
||||
T29 = T1S + T1T;
|
||||
T2a = T1Y + T1Z;
|
||||
T2b = T29 - T2a;
|
||||
T2c = T29 + T2a;
|
||||
}
|
||||
ri[WS(rs, 6)] = TA - T1f;
|
||||
ii[WS(rs, 6)] = T2k - T2c;
|
||||
ri[0] = TA + T1f;
|
||||
ii[0] = T2c + T2k;
|
||||
ri[WS(rs, 3)] = T28 - T2b;
|
||||
ii[WS(rs, 3)] = T2l + T2m;
|
||||
ri[WS(rs, 9)] = T28 + T2b;
|
||||
ii[WS(rs, 9)] = T2m - T2l;
|
||||
}
|
||||
/* Remaining outputs 1, 2, 4, 5, 7, 8, 10, 11: use the 0.5 / 0.866 combinations. */
{
|
||||
E T1m, T1K, T2p, T2y, T2s, T2x, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
|
||||
E T1O;
|
||||
{
|
||||
E T1g, T2n, T2q, T1n;
|
||||
T1g = FNMS(KP500000000, Te, T1);
|
||||
T1m = FNMS(KP866025403, T1l, T1g);
|
||||
T1K = FMA(KP866025403, T1l, T1g);
|
||||
T2n = FNMS(KP500000000, T2h, T2i);
|
||||
T2p = FMA(KP866025403, T2o, T2n);
|
||||
T2y = FNMS(KP866025403, T2o, T2n);
|
||||
T2q = FNMS(KP500000000, T2f, T2e);
|
||||
T2s = FMA(KP866025403, T2r, T2q);
|
||||
T2x = FNMS(KP866025403, T2r, T2q);
|
||||
T1n = FNMS(KP500000000, Ty, Tl);
|
||||
T1t = FNMS(KP866025403, T1s, T1n);
|
||||
T1L = FMA(KP866025403, T1s, T1n);
|
||||
}
|
||||
{
|
||||
E T1v, T1U, T20, T1C;
|
||||
T1v = FNMS(KP500000000, TT, TG);
|
||||
T1B = FNMS(KP866025403, T1A, T1v);
|
||||
T1N = FMA(KP866025403, T1A, T1v);
|
||||
T1U = FNMS(KP500000000, T1T, T1S);
|
||||
T1W = FMA(KP866025403, T1V, T1U);
|
||||
T25 = FNMS(KP866025403, T1V, T1U);
|
||||
T20 = FNMS(KP500000000, T1Z, T1Y);
|
||||
T22 = FMA(KP866025403, T21, T20);
|
||||
T26 = FNMS(KP866025403, T21, T20);
|
||||
T1C = FNMS(KP500000000, T1d, T10);
|
||||
T1I = FNMS(KP866025403, T1H, T1C);
|
||||
T1O = FMA(KP866025403, T1H, T1C);
|
||||
}
|
||||
/* Outputs 2 and 8. */
{
|
||||
E T1u, T1J, T2z, T2A;
|
||||
T1u = T1m + T1t;
|
||||
T1J = T1B + T1I;
|
||||
ri[WS(rs, 2)] = T1u - T1J;
|
||||
ri[WS(rs, 8)] = T1u + T1J;
|
||||
T2z = T2x + T2y;
|
||||
T2A = T25 + T26;
|
||||
ii[WS(rs, 2)] = T2z - T2A;
|
||||
ii[WS(rs, 8)] = T2A + T2z;
|
||||
}
|
||||
/* Outputs 10 and 4. */
{
|
||||
E T1M, T1P, T2v, T2w;
|
||||
T1M = T1K + T1L;
|
||||
T1P = T1N + T1O;
|
||||
ri[WS(rs, 10)] = T1M - T1P;
|
||||
ri[WS(rs, 4)] = T1M + T1P;
|
||||
T2v = T1W + T22;
|
||||
T2w = T2s + T2p;
|
||||
ii[WS(rs, 4)] = T2v + T2w;
|
||||
ii[WS(rs, 10)] = T2w - T2v;
|
||||
}
|
||||
/* Outputs 7 and 1. */
{
|
||||
E T1Q, T23, T2t, T2u;
|
||||
T1Q = T1K - T1L;
|
||||
T23 = T1W - T22;
|
||||
ri[WS(rs, 7)] = T1Q - T23;
|
||||
ri[WS(rs, 1)] = T1Q + T23;
|
||||
T2t = T2p - T2s;
|
||||
T2u = T1N - T1O;
|
||||
ii[WS(rs, 1)] = T2t - T2u;
|
||||
ii[WS(rs, 7)] = T2u + T2t;
|
||||
}
|
||||
/* Outputs 11 and 5. */
{
|
||||
E T24, T27, T2B, T2C;
|
||||
T24 = T1m - T1t;
|
||||
T27 = T25 - T26;
|
||||
ri[WS(rs, 11)] = T24 - T27;
|
||||
ri[WS(rs, 5)] = T24 + T27;
|
||||
T2B = T2y - T2x;
|
||||
T2C = T1B - T1I;
|
||||
ii[WS(rs, 5)] = T2B - T2C;
|
||||
ii[WS(rs, 11)] = T2C + T2B;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by desc: a TW_FULL entry with
 * arguments 0 and 12, followed by a TW_NEXT entry.
 * NOTE(review): TW_FULL presumably requests the complete twiddle set for
 * radix 12 and TW_NEXT closes the list -- confirm against the tw_instr
 * definition, which is outside this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of t1_12: size 12, name string, twinstr and
 * &GENUS.  { 72, 22, 46, 0 } matches the counts stated in this variant's
 * header comment (72 additions, 22 multiplications, 46 fused multiply/add). */
static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, { 72, 22, 46, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registers the FMA build of t1_12 and its descriptor with planner p by
 * calling X(kdft_dit_register). */
void X(codelet_t1_12) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_12, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 118 FP additions, 60 FP multiplications,
|
||||
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
|
||||
* 47 stack variables, 2 constants, and 48 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_12, non-FMA variant: size-12 twiddle kernel, computed in place.
 *
 * Same interface and data movement as the FMA build: for every m in
 * [mb, me) it reads ri/ii[WS(rs, k)] for k = 0..11, combines input k >= 1
 * with the twiddle pair W[2*(k-1)], W[2*(k-1)+1], and stores 12 results back
 * into ri/ii.  ri and ii advance by ms and W by 22 per iteration; W starts
 * offset by mb * 22.
 *
 * Here the scaling by 0.866... is done with explicit multiplications on
 * differences (e.g. KP866025403 * (T16 - T17)) instead of fused operations.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; the macros are defined outside this file --
 * confirm.
 */
static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
/* 0.866025403... is numerically sqrt(3)/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the comma expressions step the data and twiddle pointers. */
for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
|
||||
E T1, T1W, T18, T21, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
|
||||
E T1G, Ti, T1S, T1d, T24, Tt, T1a, T1T, T25, TA, T1z, T1j, T1y, TL, T1g;
|
||||
E T1A, T1B;
|
||||
/* Group of inputs 0, 4, 8 (input 0 has no twiddle). */
{
|
||||
E T6, T16, Tb, T17;
|
||||
T1 = ri[0];
|
||||
T1W = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 4)];
|
||||
T5 = ii[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = W[7];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T16 = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = ri[WS(rs, 8)];
|
||||
Ta = ii[WS(rs, 8)];
|
||||
T7 = W[14];
|
||||
T9 = W[15];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
T17 = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
T18 = KP866025403 * (T16 - T17);
|
||||
T21 = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
T15 = FNMS(KP500000000, Tc, T1);
|
||||
T1V = T16 + T17;
|
||||
T22 = FNMS(KP500000000, T1V, T1W);
|
||||
}
|
||||
/* Group of inputs 9, 5, 1 (twiddles W[16..17], W[8..9], W[0..1]). */
{
|
||||
E T11, T1n, TW, T1m;
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = ri[WS(rs, 9)];
|
||||
TQ = ii[WS(rs, 9)];
|
||||
TN = W[16];
|
||||
TP = W[17];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T1E = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E TY, T10, TX, TZ;
|
||||
TY = ri[WS(rs, 5)];
|
||||
T10 = ii[WS(rs, 5)];
|
||||
TX = W[8];
|
||||
TZ = W[9];
|
||||
T11 = FMA(TX, TY, TZ * T10);
|
||||
T1n = FNMS(TZ, TY, TX * T10);
|
||||
}
|
||||
{
|
||||
E TT, TV, TS, TU;
|
||||
TT = ri[WS(rs, 1)];
|
||||
TV = ii[WS(rs, 1)];
|
||||
TS = W[0];
|
||||
TU = W[1];
|
||||
TW = FMA(TS, TT, TU * TV);
|
||||
T1m = FNMS(TU, TT, TS * TV);
|
||||
}
|
||||
T1o = KP866025403 * (T1m - T1n);
|
||||
T1D = KP866025403 * (T11 - TW);
|
||||
T12 = TW + T11;
|
||||
T1l = FNMS(KP500000000, T12, TR);
|
||||
T1F = T1m + T1n;
|
||||
T1G = FNMS(KP500000000, T1F, T1E);
|
||||
}
|
||||
/* Group of inputs 6, 2, 10 (twiddles W[10..11], W[2..3], W[18..19]). */
{
|
||||
E Ts, T1c, Tn, T1b;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = ri[WS(rs, 6)];
|
||||
Th = ii[WS(rs, 6)];
|
||||
Te = W[10];
|
||||
Tg = W[11];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
T1S = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 2)];
|
||||
Tr = ii[WS(rs, 2)];
|
||||
To = W[2];
|
||||
Tq = W[3];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T1c = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = ri[WS(rs, 10)];
|
||||
Tm = ii[WS(rs, 10)];
|
||||
Tj = W[18];
|
||||
Tl = W[19];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T1b = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
T1d = KP866025403 * (T1b - T1c);
|
||||
T24 = KP866025403 * (Ts - Tn);
|
||||
Tt = Tn + Ts;
|
||||
T1a = FNMS(KP500000000, Tt, Ti);
|
||||
T1T = T1b + T1c;
|
||||
T25 = FNMS(KP500000000, T1T, T1S);
|
||||
}
|
||||
/* Group of inputs 3, 11, 7 (twiddles W[4..5], W[20..21], W[12..13]). */
{
|
||||
E TK, T1i, TF, T1h;
|
||||
{
|
||||
E Tx, Tz, Tw, Ty;
|
||||
Tx = ri[WS(rs, 3)];
|
||||
Tz = ii[WS(rs, 3)];
|
||||
Tw = W[4];
|
||||
Ty = W[5];
|
||||
TA = FMA(Tw, Tx, Ty * Tz);
|
||||
T1z = FNMS(Ty, Tx, Tw * Tz);
|
||||
}
|
||||
{
|
||||
E TH, TJ, TG, TI;
|
||||
TH = ri[WS(rs, 11)];
|
||||
TJ = ii[WS(rs, 11)];
|
||||
TG = W[20];
|
||||
TI = W[21];
|
||||
TK = FMA(TG, TH, TI * TJ);
|
||||
T1i = FNMS(TI, TH, TG * TJ);
|
||||
}
|
||||
{
|
||||
E TC, TE, TB, TD;
|
||||
TC = ri[WS(rs, 7)];
|
||||
TE = ii[WS(rs, 7)];
|
||||
TB = W[12];
|
||||
TD = W[13];
|
||||
TF = FMA(TB, TC, TD * TE);
|
||||
T1h = FNMS(TD, TC, TB * TE);
|
||||
}
|
||||
T1j = KP866025403 * (T1h - T1i);
|
||||
T1y = KP866025403 * (TK - TF);
|
||||
TL = TF + TK;
|
||||
T1g = FNMS(KP500000000, TL, TA);
|
||||
T1A = T1h + T1i;
|
||||
T1B = FNMS(KP500000000, T1A, T1z);
|
||||
}
|
||||
/* Outputs 0, 3, 6, 9: built only from the plain sums of the four groups. */
{
|
||||
E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
|
||||
{
|
||||
E Td, Tu, T1U, T1X;
|
||||
Td = T1 + Tc;
|
||||
Tu = Ti + Tt;
|
||||
Tv = Td + Tu;
|
||||
T1N = Td - Tu;
|
||||
T1U = T1S + T1T;
|
||||
T1X = T1V + T1W;
|
||||
T1Y = T1U + T1X;
|
||||
T20 = T1X - T1U;
|
||||
}
|
||||
{
|
||||
E TM, T13, T1O, T1P;
|
||||
TM = TA + TL;
|
||||
T13 = TR + T12;
|
||||
T14 = TM + T13;
|
||||
T1Z = TM - T13;
|
||||
T1O = T1z + T1A;
|
||||
T1P = T1E + T1F;
|
||||
T1Q = T1O - T1P;
|
||||
T1R = T1O + T1P;
|
||||
}
|
||||
ri[WS(rs, 6)] = Tv - T14;
|
||||
ii[WS(rs, 6)] = T1Y - T1R;
|
||||
ri[0] = Tv + T14;
|
||||
ii[0] = T1R + T1Y;
|
||||
ri[WS(rs, 3)] = T1N - T1Q;
|
||||
ii[WS(rs, 3)] = T1Z + T20;
|
||||
ri[WS(rs, 9)] = T1N + T1Q;
|
||||
ii[WS(rs, 9)] = T20 - T1Z;
|
||||
}
|
||||
/* Outputs 10, 4, 7, 1: use the "plus 0.866 term" combinations. */
{
|
||||
E T1t, T1x, T27, T2a, T1w, T28, T1I, T29;
|
||||
{
|
||||
E T1r, T1s, T23, T26;
|
||||
T1r = T15 + T18;
|
||||
T1s = T1a + T1d;
|
||||
T1t = T1r + T1s;
|
||||
T1x = T1r - T1s;
|
||||
T23 = T21 + T22;
|
||||
T26 = T24 + T25;
|
||||
T27 = T23 - T26;
|
||||
T2a = T26 + T23;
|
||||
}
|
||||
{
|
||||
E T1u, T1v, T1C, T1H;
|
||||
T1u = T1g + T1j;
|
||||
T1v = T1l + T1o;
|
||||
T1w = T1u + T1v;
|
||||
T28 = T1u - T1v;
|
||||
T1C = T1y + T1B;
|
||||
T1H = T1D + T1G;
|
||||
T1I = T1C - T1H;
|
||||
T29 = T1C + T1H;
|
||||
}
|
||||
ri[WS(rs, 10)] = T1t - T1w;
|
||||
ii[WS(rs, 10)] = T2a - T29;
|
||||
ri[WS(rs, 4)] = T1t + T1w;
|
||||
ii[WS(rs, 4)] = T29 + T2a;
|
||||
ri[WS(rs, 7)] = T1x - T1I;
|
||||
ii[WS(rs, 7)] = T28 + T27;
|
||||
ri[WS(rs, 1)] = T1x + T1I;
|
||||
ii[WS(rs, 1)] = T27 - T28;
|
||||
}
|
||||
/* Outputs 2, 8, 11, 5: use the "minus 0.866 term" combinations. */
{
|
||||
E T1f, T1J, T2d, T2f, T1q, T2g, T1M, T2e;
|
||||
{
|
||||
E T19, T1e, T2b, T2c;
|
||||
T19 = T15 - T18;
|
||||
T1e = T1a - T1d;
|
||||
T1f = T19 + T1e;
|
||||
T1J = T19 - T1e;
|
||||
T2b = T25 - T24;
|
||||
T2c = T22 - T21;
|
||||
T2d = T2b + T2c;
|
||||
T2f = T2c - T2b;
|
||||
}
|
||||
{
|
||||
E T1k, T1p, T1K, T1L;
|
||||
T1k = T1g - T1j;
|
||||
T1p = T1l - T1o;
|
||||
T1q = T1k + T1p;
|
||||
T2g = T1k - T1p;
|
||||
T1K = T1B - T1y;
|
||||
T1L = T1G - T1D;
|
||||
T1M = T1K - T1L;
|
||||
T2e = T1K + T1L;
|
||||
}
|
||||
ri[WS(rs, 2)] = T1f - T1q;
|
||||
ii[WS(rs, 2)] = T2d - T2e;
|
||||
ri[WS(rs, 8)] = T1f + T1q;
|
||||
ii[WS(rs, 8)] = T2e + T2d;
|
||||
ri[WS(rs, 11)] = T1J - T1M;
|
||||
ii[WS(rs, 11)] = T2g + T2f;
|
||||
ri[WS(rs, 5)] = T1J + T1M;
|
||||
ii[WS(rs, 5)] = T2f - T2g;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by desc: a TW_FULL entry with
 * arguments 0 and 12, followed by a TW_NEXT entry.
 * NOTE(review): TW_FULL presumably requests the complete twiddle set for
 * radix 12 and TW_NEXT closes the list -- confirm against the tw_instr
 * definition, which is outside this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 12 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA build of t1_12: size 12, name string, twinstr
 * and &GENUS.  { 88, 30, 30, 0 } matches the counts stated in this variant's
 * header comment (88 additions, 30 multiplications, 30 fused multiply/add). */
static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, { 88, 30, 30, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registers the non-FMA build of t1_12 and its descriptor with planner p by
 * calling X(kdft_dit_register). */
void X(codelet_t1_12) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_12, &desc);
|
||||
}
|
||||
#endif
|
||||
816
fftw-3.3.10/dft/scalar/codelets/t1_15.c
Normal file
816
fftw-3.3.10/dft/scalar/codelets/t1_15.c
Normal file
@@ -0,0 +1,816 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 184 FP additions, 140 FP multiplications,
|
||||
* (or, 72 additions, 28 multiplications, 112 fused multiply/add),
|
||||
* 51 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_15, FMA variant: size-15 twiddle kernel, computed in place.
 *
 * For every m in [mb, me) the kernel reads the 15 complex values
 * ri[WS(rs, k)], ii[WS(rs, k)] (k = 0..14), combines input k >= 1 with the
 * twiddle pair W[2*(k-1)], W[2*(k-1)+1], and writes 15 results back to the
 * same ri/ii locations.  Between iterations ri and ii advance by ms and W
 * advances by 28 entries (the 14 twiddle pairs consumed per iteration); W is
 * first offset by mb * 28.
 *
 * Parameters:
 *   ri, ii - real and imaginary data arrays (read and overwritten)
 *   W      - twiddle table, 28 values per iteration
 *   rs     - stride passed to WS() to address element k
 *   mb, me - first and one-past-last iteration index
 *   ms     - element offset added to ri/ii per iteration
 *
 * The inputs are first reduced in five groups of three (using the 0.5 and
 * 0.866... constants); the group results are then combined five at a time
 * (using the 0.25, 0.559..., 0.618... and 0.951... constants).
 *
 * NOTE(review): comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; the macros are defined outside this file --
 * confirm.
 */
static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: 0.951... = sin(2*pi/5), 0.559... = sqrt(5)/4,
 * 0.618... = (sqrt(5)-1)/2, 0.866... = sqrt(3)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* One pass per m; the comma expressions step the data and twiddle pointers. */
for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
|
||||
E T1, T3j, T1G, T3u, Te, T1B, T3i, T3t, T1y, T2i, T2a, T2M, T37, T2V, Tz;
|
||||
E T2e, T1O, T2t, T39, T2X, TT, T2f, T1V, T2z, T3a, T2Y, T1e, T2h, T23, T2G;
|
||||
E T36, T2U;
|
||||
/* Group of inputs 0, 5, 10 (input 0 has no twiddle; W[8..9], W[18..19]). */
{
|
||||
E T7, T1D, Td, T1F;
|
||||
T1 = ri[0];
|
||||
T3j = ii[0];
|
||||
{
|
||||
E T3, T6, T4, T1C, T2, T5;
|
||||
T3 = ri[WS(rs, 5)];
|
||||
T6 = ii[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = T2 * T3;
|
||||
T1C = T2 * T6;
|
||||
T5 = W[9];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1D = FNMS(T5, T3, T1C);
|
||||
}
|
||||
{
|
||||
E T9, Tc, Ta, T1E, T8, Tb;
|
||||
T9 = ri[WS(rs, 10)];
|
||||
Tc = ii[WS(rs, 10)];
|
||||
T8 = W[18];
|
||||
Ta = T8 * T9;
|
||||
T1E = T8 * Tc;
|
||||
Tb = W[19];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
T1F = FNMS(Tb, T9, T1E);
|
||||
}
|
||||
T1G = T1D - T1F;
|
||||
T3u = Td - T7;
|
||||
Te = T7 + Td;
|
||||
T1B = FNMS(KP500000000, Te, T1);
|
||||
T3i = T1D + T1F;
|
||||
T3t = FNMS(KP500000000, T3i, T3j);
|
||||
}
|
||||
/* Group of inputs 9, 4, 14 (twiddles W[16..17], W[6..7], W[26..27]). */
{
|
||||
E T1k, T2I, T1w, T28, T1q, T26;
|
||||
{
|
||||
E T1g, T1j, T1h, T2H, T1f, T1i;
|
||||
T1g = ri[WS(rs, 9)];
|
||||
T1j = ii[WS(rs, 9)];
|
||||
T1f = W[16];
|
||||
T1h = T1f * T1g;
|
||||
T2H = T1f * T1j;
|
||||
T1i = W[17];
|
||||
T1k = FMA(T1i, T1j, T1h);
|
||||
T2I = FNMS(T1i, T1g, T2H);
|
||||
}
|
||||
{
|
||||
E T1s, T1v, T1t, T27, T1r, T1u;
|
||||
T1s = ri[WS(rs, 4)];
|
||||
T1v = ii[WS(rs, 4)];
|
||||
T1r = W[6];
|
||||
T1t = T1r * T1s;
|
||||
T27 = T1r * T1v;
|
||||
T1u = W[7];
|
||||
T1w = FMA(T1u, T1v, T1t);
|
||||
T28 = FNMS(T1u, T1s, T27);
|
||||
}
|
||||
{
|
||||
E T1m, T1p, T1n, T25, T1l, T1o;
|
||||
T1m = ri[WS(rs, 14)];
|
||||
T1p = ii[WS(rs, 14)];
|
||||
T1l = W[26];
|
||||
T1n = T1l * T1m;
|
||||
T25 = T1l * T1p;
|
||||
T1o = W[27];
|
||||
T1q = FMA(T1o, T1p, T1n);
|
||||
T26 = FNMS(T1o, T1m, T25);
|
||||
}
|
||||
{
|
||||
E T29, T1x, T24, T2L, T2J, T2K;
|
||||
T29 = T26 - T28;
|
||||
T1x = T1q + T1w;
|
||||
T24 = FNMS(KP500000000, T1x, T1k);
|
||||
T1y = T1k + T1x;
|
||||
T2i = FMA(KP866025403, T29, T24);
|
||||
T2a = FNMS(KP866025403, T29, T24);
|
||||
T2L = T1w - T1q;
|
||||
T2J = T26 + T28;
|
||||
T2K = FNMS(KP500000000, T2J, T2I);
|
||||
T2M = FMA(KP866025403, T2L, T2K);
|
||||
T37 = T2I + T2J;
|
||||
T2V = FNMS(KP866025403, T2L, T2K);
|
||||
}
|
||||
}
|
||||
/* Group of inputs 3, 13, 8 (twiddles W[4..5], W[24..25], W[14..15]). */
{
|
||||
E Tl, T2p, Tx, T1M, Tr, T1K;
|
||||
{
|
||||
E Th, Tk, Ti, T2o, Tg, Tj;
|
||||
Th = ri[WS(rs, 3)];
|
||||
Tk = ii[WS(rs, 3)];
|
||||
Tg = W[4];
|
||||
Ti = Tg * Th;
|
||||
T2o = Tg * Tk;
|
||||
Tj = W[5];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T2p = FNMS(Tj, Th, T2o);
|
||||
}
|
||||
{
|
||||
E Tt, Tw, Tu, T1L, Ts, Tv;
|
||||
Tt = ri[WS(rs, 13)];
|
||||
Tw = ii[WS(rs, 13)];
|
||||
Ts = W[24];
|
||||
Tu = Ts * Tt;
|
||||
T1L = Ts * Tw;
|
||||
Tv = W[25];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1M = FNMS(Tv, Tt, T1L);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1J, Tm, Tp;
|
||||
Tn = ri[WS(rs, 8)];
|
||||
Tq = ii[WS(rs, 8)];
|
||||
Tm = W[14];
|
||||
To = Tm * Tn;
|
||||
T1J = Tm * Tq;
|
||||
Tp = W[15];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1K = FNMS(Tp, Tn, T1J);
|
||||
}
|
||||
{
|
||||
E T1N, Ty, T1I, T2s, T2q, T2r;
|
||||
T1N = T1K - T1M;
|
||||
Ty = Tr + Tx;
|
||||
T1I = FNMS(KP500000000, Ty, Tl);
|
||||
Tz = Tl + Ty;
|
||||
T2e = FMA(KP866025403, T1N, T1I);
|
||||
T1O = FNMS(KP866025403, T1N, T1I);
|
||||
T2s = Tx - Tr;
|
||||
T2q = T1K + T1M;
|
||||
T2r = FNMS(KP500000000, T2q, T2p);
|
||||
T2t = FMA(KP866025403, T2s, T2r);
|
||||
T39 = T2p + T2q;
|
||||
T2X = FNMS(KP866025403, T2s, T2r);
|
||||
}
|
||||
}
|
||||
/* Group of inputs 12, 7, 2 (twiddles W[22..23], W[12..13], W[2..3]). */
{
|
||||
E TF, T2v, TR, T1T, TL, T1R;
|
||||
{
|
||||
E TB, TE, TC, T2u, TA, TD;
|
||||
TB = ri[WS(rs, 12)];
|
||||
TE = ii[WS(rs, 12)];
|
||||
TA = W[22];
|
||||
TC = TA * TB;
|
||||
T2u = TA * TE;
|
||||
TD = W[23];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T2v = FNMS(TD, TB, T2u);
|
||||
}
|
||||
{
|
||||
E TN, TQ, TO, T1S, TM, TP;
|
||||
TN = ri[WS(rs, 7)];
|
||||
TQ = ii[WS(rs, 7)];
|
||||
TM = W[12];
|
||||
TO = TM * TN;
|
||||
T1S = TM * TQ;
|
||||
TP = W[13];
|
||||
TR = FMA(TP, TQ, TO);
|
||||
T1T = FNMS(TP, TN, T1S);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T1Q, TG, TJ;
|
||||
TH = ri[WS(rs, 2)];
|
||||
TK = ii[WS(rs, 2)];
|
||||
TG = W[2];
|
||||
TI = TG * TH;
|
||||
T1Q = TG * TK;
|
||||
TJ = W[3];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T1R = FNMS(TJ, TH, T1Q);
|
||||
}
|
||||
{
|
||||
E T1U, TS, T1P, T2y, T2w, T2x;
|
||||
T1U = T1R - T1T;
|
||||
TS = TL + TR;
|
||||
T1P = FNMS(KP500000000, TS, TF);
|
||||
TT = TF + TS;
|
||||
T2f = FMA(KP866025403, T1U, T1P);
|
||||
T1V = FNMS(KP866025403, T1U, T1P);
|
||||
T2y = TR - TL;
|
||||
T2w = T1R + T1T;
|
||||
T2x = FNMS(KP500000000, T2w, T2v);
|
||||
T2z = FMA(KP866025403, T2y, T2x);
|
||||
T3a = T2v + T2w;
|
||||
T2Y = FNMS(KP866025403, T2y, T2x);
|
||||
}
|
||||
}
|
||||
/* Group of inputs 6, 1, 11 (twiddles W[10..11], W[0..1], W[20..21]). */
{
|
||||
E T10, T2C, T1c, T21, T16, T1Z;
|
||||
{
|
||||
E TW, TZ, TX, T2B, TV, TY;
|
||||
TW = ri[WS(rs, 6)];
|
||||
TZ = ii[WS(rs, 6)];
|
||||
TV = W[10];
|
||||
TX = TV * TW;
|
||||
T2B = TV * TZ;
|
||||
TY = W[11];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T2C = FNMS(TY, TW, T2B);
|
||||
}
|
||||
{
|
||||
E T18, T1b, T19, T20, T17, T1a;
|
||||
T18 = ri[WS(rs, 1)];
|
||||
T1b = ii[WS(rs, 1)];
|
||||
T17 = W[0];
|
||||
T19 = T17 * T18;
|
||||
T20 = T17 * T1b;
|
||||
T1a = W[1];
|
||||
T1c = FMA(T1a, T1b, T19);
|
||||
T21 = FNMS(T1a, T18, T20);
|
||||
}
|
||||
{
|
||||
E T12, T15, T13, T1Y, T11, T14;
|
||||
T12 = ri[WS(rs, 11)];
|
||||
T15 = ii[WS(rs, 11)];
|
||||
T11 = W[20];
|
||||
T13 = T11 * T12;
|
||||
T1Y = T11 * T15;
|
||||
T14 = W[21];
|
||||
T16 = FMA(T14, T15, T13);
|
||||
T1Z = FNMS(T14, T12, T1Y);
|
||||
}
|
||||
{
|
||||
E T22, T1d, T1X, T2F, T2D, T2E;
|
||||
T22 = T1Z - T21;
|
||||
T1d = T16 + T1c;
|
||||
T1X = FNMS(KP500000000, T1d, T10);
|
||||
T1e = T10 + T1d;
|
||||
T2h = FMA(KP866025403, T22, T1X);
|
||||
T23 = FNMS(KP866025403, T22, T1X);
|
||||
T2F = T1c - T16;
|
||||
T2D = T1Z + T21;
|
||||
T2E = FNMS(KP500000000, T2D, T2C);
|
||||
T2G = FMA(KP866025403, T2F, T2E);
|
||||
T36 = T2C + T2D;
|
||||
T2U = FNMS(KP866025403, T2F, T2E);
|
||||
}
|
||||
}
|
||||
/* Real parts of outputs 0, 3, 6, 9, 12 (from the plain group sums). */
{
|
||||
E T3c, T3e, Tf, T1A, T33, T34, T3d, T35;
|
||||
{
|
||||
E T38, T3b, TU, T1z;
|
||||
T38 = T36 - T37;
|
||||
T3b = T39 - T3a;
|
||||
T3c = FNMS(KP618033988, T3b, T38);
|
||||
T3e = FMA(KP618033988, T38, T3b);
|
||||
Tf = T1 + Te;
|
||||
TU = Tz + TT;
|
||||
T1z = T1e + T1y;
|
||||
T1A = TU + T1z;
|
||||
T33 = FNMS(KP250000000, T1A, Tf);
|
||||
T34 = TU - T1z;
|
||||
}
|
||||
ri[0] = Tf + T1A;
|
||||
T3d = FMA(KP559016994, T34, T33);
|
||||
ri[WS(rs, 9)] = FNMS(KP951056516, T3e, T3d);
|
||||
ri[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
|
||||
T35 = FNMS(KP559016994, T34, T33);
|
||||
ri[WS(rs, 12)] = FNMS(KP951056516, T3c, T35);
|
||||
ri[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
|
||||
}
|
||||
/* Imaginary parts of outputs 0, 3, 6, 9, 12. */
{
|
||||
E T3q, T3s, T3k, T3h, T3l, T3m, T3r, T3n;
|
||||
{
|
||||
E T3o, T3p, T3f, T3g;
|
||||
T3o = T1e - T1y;
|
||||
T3p = Tz - TT;
|
||||
T3q = FNMS(KP618033988, T3p, T3o);
|
||||
T3s = FMA(KP618033988, T3o, T3p);
|
||||
T3k = T3i + T3j;
|
||||
T3f = T39 + T3a;
|
||||
T3g = T36 + T37;
|
||||
T3h = T3f + T3g;
|
||||
T3l = FNMS(KP250000000, T3h, T3k);
|
||||
T3m = T3f - T3g;
|
||||
}
|
||||
ii[0] = T3h + T3k;
|
||||
T3r = FMA(KP559016994, T3m, T3l);
|
||||
ii[WS(rs, 6)] = FNMS(KP951056516, T3s, T3r);
|
||||
ii[WS(rs, 9)] = FMA(KP951056516, T3s, T3r);
|
||||
T3n = FNMS(KP559016994, T3m, T3l);
|
||||
ii[WS(rs, 3)] = FNMS(KP951056516, T3q, T3n);
|
||||
ii[WS(rs, 12)] = FMA(KP951056516, T3q, T3n);
|
||||
}
|
||||
/* Real parts of outputs 5, 14, 11, 2, 8. */
{
|
||||
E T30, T32, T1H, T2c, T2R, T2S, T31, T2T;
|
||||
{
|
||||
E T2W, T2Z, T1W, T2b;
|
||||
T2W = T2U - T2V;
|
||||
T2Z = T2X - T2Y;
|
||||
T30 = FNMS(KP618033988, T2Z, T2W);
|
||||
T32 = FMA(KP618033988, T2W, T2Z);
|
||||
T1H = FNMS(KP866025403, T1G, T1B);
|
||||
T1W = T1O + T1V;
|
||||
T2b = T23 + T2a;
|
||||
T2c = T1W + T2b;
|
||||
T2R = FNMS(KP250000000, T2c, T1H);
|
||||
T2S = T1W - T2b;
|
||||
}
|
||||
ri[WS(rs, 5)] = T1H + T2c;
|
||||
T31 = FMA(KP559016994, T2S, T2R);
|
||||
ri[WS(rs, 14)] = FNMS(KP951056516, T32, T31);
|
||||
ri[WS(rs, 11)] = FMA(KP951056516, T32, T31);
|
||||
T2T = FNMS(KP559016994, T2S, T2R);
|
||||
ri[WS(rs, 2)] = FNMS(KP951056516, T30, T2T);
|
||||
ri[WS(rs, 8)] = FMA(KP951056516, T30, T2T);
|
||||
}
|
||||
/* Imaginary parts of outputs 5, 11, 14, 2, 8. */
{
|
||||
E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N;
|
||||
{
|
||||
E T3O, T3P, T3I, T3J;
|
||||
T3O = T23 - T2a;
|
||||
T3P = T1O - T1V;
|
||||
T3Q = FNMS(KP618033988, T3P, T3O);
|
||||
T3S = FMA(KP618033988, T3O, T3P);
|
||||
T3H = FNMS(KP866025403, T3u, T3t);
|
||||
T3I = T2X + T2Y;
|
||||
T3J = T2U + T2V;
|
||||
T3K = T3I + T3J;
|
||||
T3L = FNMS(KP250000000, T3K, T3H);
|
||||
T3M = T3I - T3J;
|
||||
}
|
||||
ii[WS(rs, 5)] = T3K + T3H;
|
||||
T3R = FMA(KP559016994, T3M, T3L);
|
||||
ii[WS(rs, 11)] = FNMS(KP951056516, T3S, T3R);
|
||||
ii[WS(rs, 14)] = FMA(KP951056516, T3S, T3R);
|
||||
T3N = FNMS(KP559016994, T3M, T3L);
|
||||
ii[WS(rs, 2)] = FMA(KP951056516, T3Q, T3N);
|
||||
ii[WS(rs, 8)] = FNMS(KP951056516, T3Q, T3N);
|
||||
}
|
||||
/* Imaginary parts of outputs 10, 7, 13, 1, 4. */
{
|
||||
E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B;
|
||||
{
|
||||
E T3C, T3D, T3w, T3x;
|
||||
T3C = T2e - T2f;
|
||||
T3D = T2h - T2i;
|
||||
T3E = FMA(KP618033988, T3D, T3C);
|
||||
T3G = FNMS(KP618033988, T3C, T3D);
|
||||
T3v = FMA(KP866025403, T3u, T3t);
|
||||
T3w = T2t + T2z;
|
||||
T3x = T2G + T2M;
|
||||
T3y = T3w + T3x;
|
||||
T3z = FNMS(KP250000000, T3y, T3v);
|
||||
T3A = T3w - T3x;
|
||||
}
|
||||
ii[WS(rs, 10)] = T3y + T3v;
|
||||
T3F = FNMS(KP559016994, T3A, T3z);
|
||||
ii[WS(rs, 7)] = FMA(KP951056516, T3G, T3F);
|
||||
ii[WS(rs, 13)] = FNMS(KP951056516, T3G, T3F);
|
||||
T3B = FMA(KP559016994, T3A, T3z);
|
||||
ii[WS(rs, 1)] = FNMS(KP951056516, T3E, T3B);
|
||||
ii[WS(rs, 4)] = FMA(KP951056516, T3E, T3B);
|
||||
}
|
||||
/* Real parts of outputs 10, 7, 13, 4, 1. */
{
|
||||
E T2O, T2Q, T2d, T2k, T2l, T2m, T2P, T2n;
|
||||
{
|
||||
E T2A, T2N, T2g, T2j;
|
||||
T2A = T2t - T2z;
|
||||
T2N = T2G - T2M;
|
||||
T2O = FMA(KP618033988, T2N, T2A);
|
||||
T2Q = FNMS(KP618033988, T2A, T2N);
|
||||
T2d = FMA(KP866025403, T1G, T1B);
|
||||
T2g = T2e + T2f;
|
||||
T2j = T2h + T2i;
|
||||
T2k = T2g + T2j;
|
||||
T2l = FNMS(KP250000000, T2k, T2d);
|
||||
T2m = T2g - T2j;
|
||||
}
|
||||
ri[WS(rs, 10)] = T2d + T2k;
|
||||
T2P = FNMS(KP559016994, T2m, T2l);
|
||||
ri[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
|
||||
ri[WS(rs, 13)] = FMA(KP951056516, T2Q, T2P);
|
||||
T2n = FMA(KP559016994, T2m, T2l);
|
||||
ri[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
|
||||
ri[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction table referenced by desc: a TW_FULL entry with
 * arguments 0 and 15, followed by a TW_NEXT entry.
 * NOTE(review): TW_FULL presumably requests the complete twiddle set for
 * radix 15 and TW_NEXT closes the list -- confirm against the tw_instr
 * definition, which is outside this file. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of t1_15: size 15, name string, twinstr and
 * &GENUS.  { 72, 28, 112, 0 } matches the counts stated in this variant's
 * header comment (72 additions, 28 multiplications, 112 fused multiply/add). */
static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 72, 28, 112, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registers the FMA build of t1_15 and its descriptor with planner p by
 * calling X(kdft_dit_register). */
void X(codelet_t1_15) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_15, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 184 FP additions, 112 FP multiplications,
|
||||
* (or, 128 additions, 56 multiplications, 56 fused multiply/add),
|
||||
* 65 stack variables, 6 constants, and 60 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_15 (non-FMA variant): size-15 twiddle codelet, computed in place.
 *
 * For each m in [mb, me) the loop reads the 15 complex inputs
 * ri[WS(rs, k)], ii[WS(rs, k)] (k = 0..14), combines inputs 1..14 with the
 * twiddle pairs W[2k-2], W[2k-1], and writes the 15 results back to the same
 * ri/ii locations.  Between iterations ri and ii advance by ms and W by 28
 * (= 2 * 14 twiddle values).
 *
 * NOTE(review): R, E, stride, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this chunk; FMA(a, b, c) / FNMS(a, b, c) are presumably
 * a*b + c and c - a*b -- confirm in the FFTW headers.
 */
static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: sin(pi/5), sin(2*pi/5), 1/4, sqrt(5)/4, 1/2, sqrt(3)/2. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by mb * 28 so that iteration m uses its own 28 twiddle values. */
for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
|
||||
E T1q, T34, Td, T1n, T2S, T35, T13, T1k, T1l, T2E, T2F, T2O, T1H, T1T, T2k;
|
||||
E T2t, T2f, T2s, T1M, T1U, Tu, TL, TM, T2H, T2I, T2N, T1w, T1Q, T29, T2w;
|
||||
E T24, T2v, T1B, T1R;
|
||||
/* Inputs 0, 5 and 10: input 0 is used as-is, inputs 5 and 10 are combined with W[8..9] and W[18..19]. */
{
|
||||
E T1, T2R, T6, T1o, Tb, T1p, Tc, T2Q;
|
||||
T1 = ri[0];
|
||||
T2R = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 5)];
|
||||
T5 = ii[WS(rs, 5)];
|
||||
T2 = W[8];
|
||||
T4 = W[9];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T1o = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = ri[WS(rs, 10)];
|
||||
Ta = ii[WS(rs, 10)];
|
||||
T7 = W[18];
|
||||
T9 = W[19];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
T1p = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
T1q = KP866025403 * (T1o - T1p);
|
||||
T34 = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
Td = T1 + Tc;
|
||||
T1n = FNMS(KP500000000, Tc, T1);
|
||||
T2Q = T1o + T1p;
|
||||
T2S = T2Q + T2R;
|
||||
T35 = FNMS(KP500000000, T2Q, T2R);
|
||||
}
|
||||
/* Inputs 6, 9, 11, 1, 14 and 4, each combined with its twiddle pair, then summed/differenced. */
{
|
||||
E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
|
||||
E T2i;
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = ri[WS(rs, 6)];
|
||||
TQ = ii[WS(rs, 6)];
|
||||
TN = W[10];
|
||||
TP = W[11];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T2c = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E T15, T17, T14, T16;
|
||||
T15 = ri[WS(rs, 9)];
|
||||
T17 = ii[WS(rs, 9)];
|
||||
T14 = W[16];
|
||||
T16 = W[17];
|
||||
T18 = FMA(T14, T15, T16 * T17);
|
||||
T2h = FNMS(T16, T15, T14 * T17);
|
||||
}
|
||||
{
|
||||
E TT, TV, TS, TU;
|
||||
TT = ri[WS(rs, 11)];
|
||||
TV = ii[WS(rs, 11)];
|
||||
TS = W[20];
|
||||
TU = W[21];
|
||||
TW = FMA(TS, TT, TU * TV);
|
||||
T1E = FNMS(TU, TT, TS * TV);
|
||||
}
|
||||
{
|
||||
E TY, T10, TX, TZ;
|
||||
TY = ri[WS(rs, 1)];
|
||||
T10 = ii[WS(rs, 1)];
|
||||
TX = W[0];
|
||||
TZ = W[1];
|
||||
T11 = FMA(TX, TY, TZ * T10);
|
||||
T1F = FNMS(TZ, TY, TX * T10);
|
||||
}
|
||||
T12 = TW + T11;
|
||||
T2d = T1E + T1F;
|
||||
{
|
||||
E T1a, T1c, T19, T1b;
|
||||
T1a = ri[WS(rs, 14)];
|
||||
T1c = ii[WS(rs, 14)];
|
||||
T19 = W[26];
|
||||
T1b = W[27];
|
||||
T1d = FMA(T19, T1a, T1b * T1c);
|
||||
T1J = FNMS(T1b, T1a, T19 * T1c);
|
||||
}
|
||||
{
|
||||
E T1f, T1h, T1e, T1g;
|
||||
T1f = ri[WS(rs, 4)];
|
||||
T1h = ii[WS(rs, 4)];
|
||||
T1e = W[6];
|
||||
T1g = W[7];
|
||||
T1i = FMA(T1e, T1f, T1g * T1h);
|
||||
T1K = FNMS(T1g, T1f, T1e * T1h);
|
||||
}
|
||||
T1j = T1d + T1i;
|
||||
T2i = T1J + T1K;
|
||||
{
|
||||
E T1D, T1G, T2g, T2j;
|
||||
T13 = TR + T12;
|
||||
T1k = T18 + T1j;
|
||||
T1l = T13 + T1k;
|
||||
T2E = T2c + T2d;
|
||||
T2F = T2h + T2i;
|
||||
T2O = T2E + T2F;
|
||||
T1D = FNMS(KP500000000, T12, TR);
|
||||
T1G = KP866025403 * (T1E - T1F);
|
||||
T1H = T1D - T1G;
|
||||
T1T = T1D + T1G;
|
||||
T2g = KP866025403 * (T1i - T1d);
|
||||
T2j = FNMS(KP500000000, T2i, T2h);
|
||||
T2k = T2g + T2j;
|
||||
T2t = T2j - T2g;
|
||||
{
|
||||
E T2b, T2e, T1I, T1L;
|
||||
T2b = KP866025403 * (T11 - TW);
|
||||
T2e = FNMS(KP500000000, T2d, T2c);
|
||||
T2f = T2b + T2e;
|
||||
T2s = T2e - T2b;
|
||||
T1I = FNMS(KP500000000, T1j, T18);
|
||||
T1L = KP866025403 * (T1J - T1K);
|
||||
T1M = T1I - T1L;
|
||||
T1U = T1I + T1L;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Inputs 3, 12, 8, 13, 2 and 7, each combined with its twiddle pair, then summed/differenced. */
{
|
||||
E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
|
||||
E T27;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = ri[WS(rs, 3)];
|
||||
Th = ii[WS(rs, 3)];
|
||||
Te = W[4];
|
||||
Tg = W[5];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
T21 = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = ri[WS(rs, 12)];
|
||||
Ty = ii[WS(rs, 12)];
|
||||
Tv = W[22];
|
||||
Tx = W[23];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T26 = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = ri[WS(rs, 8)];
|
||||
Tm = ii[WS(rs, 8)];
|
||||
Tj = W[14];
|
||||
Tl = W[15];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
T1t = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 13)];
|
||||
Tr = ii[WS(rs, 13)];
|
||||
To = W[24];
|
||||
Tq = W[25];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
T1u = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
Tt = Tn + Ts;
|
||||
T22 = T1t + T1u;
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = ri[WS(rs, 2)];
|
||||
TD = ii[WS(rs, 2)];
|
||||
TA = W[2];
|
||||
TC = W[3];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T1y = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
{
|
||||
E TG, TI, TF, TH;
|
||||
TG = ri[WS(rs, 7)];
|
||||
TI = ii[WS(rs, 7)];
|
||||
TF = W[12];
|
||||
TH = W[13];
|
||||
TJ = FMA(TF, TG, TH * TI);
|
||||
T1z = FNMS(TH, TG, TF * TI);
|
||||
}
|
||||
TK = TE + TJ;
|
||||
T27 = T1y + T1z;
|
||||
{
|
||||
E T1s, T1v, T25, T28;
|
||||
Tu = Ti + Tt;
|
||||
TL = Tz + TK;
|
||||
TM = Tu + TL;
|
||||
T2H = T21 + T22;
|
||||
T2I = T26 + T27;
|
||||
T2N = T2H + T2I;
|
||||
T1s = FNMS(KP500000000, Tt, Ti);
|
||||
T1v = KP866025403 * (T1t - T1u);
|
||||
T1w = T1s - T1v;
|
||||
T1Q = T1s + T1v;
|
||||
T25 = KP866025403 * (TJ - TE);
|
||||
T28 = FNMS(KP500000000, T27, T26);
|
||||
T29 = T25 + T28;
|
||||
T2w = T28 - T25;
|
||||
{
|
||||
E T20, T23, T1x, T1A;
|
||||
T20 = KP866025403 * (Ts - Tn);
|
||||
T23 = FNMS(KP500000000, T22, T21);
|
||||
T24 = T20 + T23;
|
||||
T2v = T23 - T20;
|
||||
T1x = FNMS(KP500000000, TK, Tz);
|
||||
T1A = KP866025403 * (T1y - T1z);
|
||||
T1B = T1x - T1A;
|
||||
T1R = T1x + T1A;
|
||||
}
|
||||
}
|
||||
}
|
||||
/* Real outputs 0, 9, 6, 12 and 3. */
{
|
||||
E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
|
||||
T2C = KP559016994 * (TM - T1l);
|
||||
T1m = TM + T1l;
|
||||
T2B = FNMS(KP250000000, T1m, Td);
|
||||
T2G = T2E - T2F;
|
||||
T2J = T2H - T2I;
|
||||
T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
|
||||
T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
|
||||
ri[0] = Td + T1m;
|
||||
T2L = T2C + T2B;
|
||||
ri[WS(rs, 9)] = T2L - T2M;
|
||||
ri[WS(rs, 6)] = T2L + T2M;
|
||||
T2D = T2B - T2C;
|
||||
ri[WS(rs, 12)] = T2D - T2K;
|
||||
ri[WS(rs, 3)] = T2D + T2K;
|
||||
}
|
||||
/* Imaginary outputs 0, 6, 9, 3 and 12. */
{
|
||||
E T2U, T2P, T2T, T2Y, T30, T2W, T2X, T2Z, T2V;
|
||||
T2U = KP559016994 * (T2N - T2O);
|
||||
T2P = T2N + T2O;
|
||||
T2T = FNMS(KP250000000, T2P, T2S);
|
||||
T2W = T13 - T1k;
|
||||
T2X = Tu - TL;
|
||||
T2Y = FNMS(KP587785252, T2X, KP951056516 * T2W);
|
||||
T30 = FMA(KP951056516, T2X, KP587785252 * T2W);
|
||||
ii[0] = T2P + T2S;
|
||||
T2Z = T2U + T2T;
|
||||
ii[WS(rs, 6)] = T2Z - T30;
|
||||
ii[WS(rs, 9)] = T30 + T2Z;
|
||||
T2V = T2T - T2U;
|
||||
ii[WS(rs, 3)] = T2V - T2Y;
|
||||
ii[WS(rs, 12)] = T2Y + T2V;
|
||||
}
|
||||
/* Real outputs 5, 14, 11, 2 and 8. */
{
|
||||
E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
|
||||
{
|
||||
E T2u, T2x, T1C, T1N;
|
||||
T2u = T2s - T2t;
|
||||
T2x = T2v - T2w;
|
||||
T2y = FNMS(KP587785252, T2x, KP951056516 * T2u);
|
||||
T2A = FMA(KP951056516, T2x, KP587785252 * T2u);
|
||||
T1r = T1n - T1q;
|
||||
T1C = T1w + T1B;
|
||||
T1N = T1H + T1M;
|
||||
T1O = T1C + T1N;
|
||||
T2p = FNMS(KP250000000, T1O, T1r);
|
||||
T2q = KP559016994 * (T1C - T1N);
|
||||
}
|
||||
ri[WS(rs, 5)] = T1r + T1O;
|
||||
T2z = T2q + T2p;
|
||||
ri[WS(rs, 14)] = T2z - T2A;
|
||||
ri[WS(rs, 11)] = T2z + T2A;
|
||||
T2r = T2p - T2q;
|
||||
ri[WS(rs, 2)] = T2r - T2y;
|
||||
ri[WS(rs, 8)] = T2r + T2y;
|
||||
}
|
||||
/* Imaginary outputs 5, 11, 14, 2 and 8. */
{
|
||||
E T3h, T3q, T3i, T3l, T3m, T3n, T3p, T3o;
|
||||
{
|
||||
E T3f, T3g, T3j, T3k;
|
||||
T3f = T1H - T1M;
|
||||
T3g = T1w - T1B;
|
||||
T3h = FNMS(KP587785252, T3g, KP951056516 * T3f);
|
||||
T3q = FMA(KP951056516, T3g, KP587785252 * T3f);
|
||||
T3i = T35 - T34;
|
||||
T3j = T2v + T2w;
|
||||
T3k = T2s + T2t;
|
||||
T3l = T3j + T3k;
|
||||
T3m = FNMS(KP250000000, T3l, T3i);
|
||||
T3n = KP559016994 * (T3j - T3k);
|
||||
}
|
||||
ii[WS(rs, 5)] = T3l + T3i;
|
||||
T3p = T3n + T3m;
|
||||
ii[WS(rs, 11)] = T3p - T3q;
|
||||
ii[WS(rs, 14)] = T3q + T3p;
|
||||
T3o = T3m - T3n;
|
||||
ii[WS(rs, 2)] = T3h + T3o;
|
||||
ii[WS(rs, 8)] = T3o - T3h;
|
||||
}
|
||||
/* Imaginary outputs 10, 7, 13, 1 and 4. */
{
|
||||
E T3c, T3d, T36, T37, T33, T38, T3e, T39;
|
||||
{
|
||||
E T3a, T3b, T31, T32;
|
||||
T3a = T1Q - T1R;
|
||||
T3b = T1T - T1U;
|
||||
T3c = FMA(KP951056516, T3a, KP587785252 * T3b);
|
||||
T3d = FNMS(KP587785252, T3a, KP951056516 * T3b);
|
||||
T36 = T34 + T35;
|
||||
T31 = T24 + T29;
|
||||
T32 = T2f + T2k;
|
||||
T37 = T31 + T32;
|
||||
T33 = KP559016994 * (T31 - T32);
|
||||
T38 = FNMS(KP250000000, T37, T36);
|
||||
}
|
||||
ii[WS(rs, 10)] = T37 + T36;
|
||||
T3e = T38 - T33;
|
||||
ii[WS(rs, 7)] = T3d + T3e;
|
||||
ii[WS(rs, 13)] = T3e - T3d;
|
||||
T39 = T33 + T38;
|
||||
ii[WS(rs, 1)] = T39 - T3c;
|
||||
ii[WS(rs, 4)] = T3c + T39;
|
||||
}
|
||||
/* Real outputs 10, 7, 13, 4 and 1. */
{
|
||||
E T2m, T2o, T1P, T1W, T1X, T1Y, T2n, T1Z;
|
||||
{
|
||||
E T2a, T2l, T1S, T1V;
|
||||
T2a = T24 - T29;
|
||||
T2l = T2f - T2k;
|
||||
T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
|
||||
T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
|
||||
T1P = T1n + T1q;
|
||||
T1S = T1Q + T1R;
|
||||
T1V = T1T + T1U;
|
||||
T1W = T1S + T1V;
|
||||
T1X = KP559016994 * (T1S - T1V);
|
||||
T1Y = FNMS(KP250000000, T1W, T1P);
|
||||
}
|
||||
ri[WS(rs, 10)] = T1P + T1W;
|
||||
T2n = T1Y - T1X;
|
||||
ri[WS(rs, 7)] = T2n - T2o;
|
||||
ri[WS(rs, 13)] = T2n + T2o;
|
||||
T1Z = T1X + T1Y;
|
||||
ri[WS(rs, 4)] = T1Z - T2m;
|
||||
ri[WS(rs, 1)] = T1Z + T2m;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction list referenced by the t1_15 descriptor: a TW_FULL
   entry with arguments (0, 15) followed by a TW_NEXT entry with (1, 0).
   NOTE(review): the tw_instr field meanings are defined outside this chunk;
   presumably TW_FULL requests the full twiddle set for radix 15 and TW_NEXT
   terminates the list -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA build of t1_15: radix 15, name "t1_15", the
   twinstr list and &GENUS.  { 128, 56, 56, 0 } matches the generator's stated
   counts for this variant (128 additions, 56 multiplications, 56 fused
   multiply/add); the meaning of the three trailing zeros is not visible here. */
static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 128, 56, 56, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: hands the t1_15 kernel and its descriptor to
   planner p through kdft_dit_register.  X() is a name-wrapping macro defined
   outside this chunk. */
void X(codelet_t1_15) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_15, &desc);
|
||||
}
|
||||
#endif
|
||||
796
fftw-3.3.10/dft/scalar/codelets/t1_16.c
Normal file
796
fftw-3.3.10/dft/scalar/codelets/t1_16.c
Normal file
@@ -0,0 +1,796 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 100 FP multiplications,
|
||||
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
|
||||
* 60 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_16 (FMA variant): size-16 twiddle codelet, computed in place.
 *
 * For each m in [mb, me) the loop reads the 16 complex inputs
 * ri[WS(rs, k)], ii[WS(rs, k)] (k = 0..15), combines inputs 1..15 with the
 * twiddle pairs W[2k-2], W[2k-1], and writes the 16 results back to the same
 * ri/ii locations.  Between iterations ri and ii advance by ms and W by 30
 * (= 2 * 15 twiddle values).
 *
 * NOTE(review): R, E, stride, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this chunk; FMA(a, b, c) / FNMS(a, b, c) are presumably
 * a*b + c and c - a*b -- confirm in the FFTW headers.
 */
static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: cos(pi/8), tan(pi/8) = sqrt(2) - 1, sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by mb * 30 so that iteration m uses its own 30 twiddle values. */
for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T8, T3z, T1I, T3o, T1s, T35, T2o, T2r, T1F, T36, T2p, T2w, Tl, T3A, T1N;
|
||||
E T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W;
|
||||
E T1W, T21;
|
||||
/* Inputs 0 and 8: input 0 is used as-is, input 8 is combined with W[14..15]; then sum and difference. */
{
|
||||
E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
|
||||
T1 = ri[0];
|
||||
T3n = ii[0];
|
||||
T3 = ri[WS(rs, 8)];
|
||||
T6 = ii[WS(rs, 8)];
|
||||
T2 = W[14];
|
||||
T4 = T2 * T3;
|
||||
T3l = T2 * T6;
|
||||
T5 = W[15];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T3m = FNMS(T5, T3, T3l);
|
||||
T8 = T1 + T7;
|
||||
T3z = T3n - T3m;
|
||||
T1I = T1 - T7;
|
||||
T3o = T3m + T3n;
|
||||
}
|
||||
/* Inputs 15 and 7 (W[28..29], W[12..13]): sums and differences. */
{
|
||||
E T1h, T1k, T1i, T2k, T1n, T1q, T1o, T2m, T1g, T1m;
|
||||
T1h = ri[WS(rs, 15)];
|
||||
T1k = ii[WS(rs, 15)];
|
||||
T1g = W[28];
|
||||
T1i = T1g * T1h;
|
||||
T2k = T1g * T1k;
|
||||
T1n = ri[WS(rs, 7)];
|
||||
T1q = ii[WS(rs, 7)];
|
||||
T1m = W[12];
|
||||
T1o = T1m * T1n;
|
||||
T2m = T1m * T1q;
|
||||
{
|
||||
E T1l, T2l, T1r, T2n, T1j, T1p;
|
||||
T1j = W[29];
|
||||
T1l = FMA(T1j, T1k, T1i);
|
||||
T2l = FNMS(T1j, T1h, T2k);
|
||||
T1p = W[13];
|
||||
T1r = FMA(T1p, T1q, T1o);
|
||||
T2n = FNMS(T1p, T1n, T2m);
|
||||
T1s = T1l + T1r;
|
||||
T35 = T2l + T2n;
|
||||
T2o = T2l - T2n;
|
||||
T2r = T1l - T1r;
|
||||
}
|
||||
}
|
||||
/* Inputs 3 and 11 (W[4..5], W[20..21]): sums and differences. */
{
|
||||
E T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z;
|
||||
T1u = ri[WS(rs, 3)];
|
||||
T1x = ii[WS(rs, 3)];
|
||||
T1t = W[4];
|
||||
T1v = T1t * T1u;
|
||||
T2s = T1t * T1x;
|
||||
T1A = ri[WS(rs, 11)];
|
||||
T1D = ii[WS(rs, 11)];
|
||||
T1z = W[20];
|
||||
T1B = T1z * T1A;
|
||||
T2u = T1z * T1D;
|
||||
{
|
||||
E T1y, T2t, T1E, T2v, T1w, T1C;
|
||||
T1w = W[5];
|
||||
T1y = FMA(T1w, T1x, T1v);
|
||||
T2t = FNMS(T1w, T1u, T2s);
|
||||
T1C = W[21];
|
||||
T1E = FMA(T1C, T1D, T1B);
|
||||
T2v = FNMS(T1C, T1A, T2u);
|
||||
T1F = T1y + T1E;
|
||||
T36 = T2t + T2v;
|
||||
T2p = T1y - T1E;
|
||||
T2w = T2t - T2v;
|
||||
}
|
||||
}
|
||||
/* Inputs 4 and 12 (W[6..7], W[22..23]): sums and differences. */
{
|
||||
E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
|
||||
Ta = ri[WS(rs, 4)];
|
||||
Td = ii[WS(rs, 4)];
|
||||
T9 = W[6];
|
||||
Tb = T9 * Ta;
|
||||
T1J = T9 * Td;
|
||||
Tg = ri[WS(rs, 12)];
|
||||
Tj = ii[WS(rs, 12)];
|
||||
Tf = W[22];
|
||||
Th = Tf * Tg;
|
||||
T1L = Tf * Tj;
|
||||
{
|
||||
E Te, T1K, Tk, T1M, Tc, Ti;
|
||||
Tc = W[7];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
T1K = FNMS(Tc, Ta, T1J);
|
||||
Ti = W[23];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
T1M = FNMS(Ti, Tg, T1L);
|
||||
Tl = Te + Tk;
|
||||
T3A = Te - Tk;
|
||||
T1N = T1K - T1M;
|
||||
T3k = T1K + T1M;
|
||||
}
|
||||
}
|
||||
/* Inputs 2 and 10 (W[2..3], W[18..19]): sums and differences. */
{
|
||||
E To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt;
|
||||
To = ri[WS(rs, 2)];
|
||||
Tr = ii[WS(rs, 2)];
|
||||
Tn = W[2];
|
||||
Tp = Tn * To;
|
||||
T1P = Tn * Tr;
|
||||
Tu = ri[WS(rs, 10)];
|
||||
Tx = ii[WS(rs, 10)];
|
||||
Tt = W[18];
|
||||
Tv = Tt * Tu;
|
||||
T1R = Tt * Tx;
|
||||
{
|
||||
E Ts, T1Q, Ty, T1S, Tq, Tw;
|
||||
Tq = W[3];
|
||||
Ts = FMA(Tq, Tr, Tp);
|
||||
T1Q = FNMS(Tq, To, T1P);
|
||||
Tw = W[19];
|
||||
Ty = FMA(Tw, Tx, Tv);
|
||||
T1S = FNMS(Tw, Tu, T1R);
|
||||
Tz = Ts + Ty;
|
||||
T2V = T1Q + T1S;
|
||||
T1T = T1Q - T1S;
|
||||
T1U = Ts - Ty;
|
||||
}
|
||||
}
|
||||
/* Inputs 1 and 9 (W[0..1], W[16..17]): sums and differences. */
{
|
||||
E TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV;
|
||||
TQ = ri[WS(rs, 1)];
|
||||
TT = ii[WS(rs, 1)];
|
||||
TP = W[0];
|
||||
TR = TP * TQ;
|
||||
T25 = TP * TT;
|
||||
TW = ri[WS(rs, 9)];
|
||||
TZ = ii[WS(rs, 9)];
|
||||
TV = W[16];
|
||||
TX = TV * TW;
|
||||
T27 = TV * TZ;
|
||||
{
|
||||
E TU, T26, T10, T28, TS, TY;
|
||||
TS = W[1];
|
||||
TU = FMA(TS, TT, TR);
|
||||
T26 = FNMS(TS, TQ, T25);
|
||||
TY = W[17];
|
||||
T10 = FMA(TY, TZ, TX);
|
||||
T28 = FNMS(TY, TW, T27);
|
||||
T11 = TU + T10;
|
||||
T30 = T26 + T28;
|
||||
T29 = T26 - T28;
|
||||
T2c = TU - T10;
|
||||
}
|
||||
}
|
||||
/* Inputs 5 and 13 (W[8..9], W[24..25]): sums and differences. */
{
|
||||
E T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18;
|
||||
T13 = ri[WS(rs, 5)];
|
||||
T16 = ii[WS(rs, 5)];
|
||||
T12 = W[8];
|
||||
T14 = T12 * T13;
|
||||
T2d = T12 * T16;
|
||||
T19 = ri[WS(rs, 13)];
|
||||
T1c = ii[WS(rs, 13)];
|
||||
T18 = W[24];
|
||||
T1a = T18 * T19;
|
||||
T2f = T18 * T1c;
|
||||
{
|
||||
E T17, T2e, T1d, T2g, T15, T1b;
|
||||
T15 = W[9];
|
||||
T17 = FMA(T15, T16, T14);
|
||||
T2e = FNMS(T15, T13, T2d);
|
||||
T1b = W[25];
|
||||
T1d = FMA(T1b, T1c, T1a);
|
||||
T2g = FNMS(T1b, T19, T2f);
|
||||
T1e = T17 + T1d;
|
||||
T31 = T2e + T2g;
|
||||
T2a = T17 - T1d;
|
||||
T2h = T2e - T2g;
|
||||
}
|
||||
}
|
||||
/* Inputs 14 and 6 (W[26..27], W[10..11]): sums and differences. */
{
|
||||
E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
|
||||
TB = ri[WS(rs, 14)];
|
||||
TE = ii[WS(rs, 14)];
|
||||
TA = W[26];
|
||||
TC = TA * TB;
|
||||
T1X = TA * TE;
|
||||
TH = ri[WS(rs, 6)];
|
||||
TK = ii[WS(rs, 6)];
|
||||
TG = W[10];
|
||||
TI = TG * TH;
|
||||
T1Z = TG * TK;
|
||||
{
|
||||
E TF, T1Y, TL, T20, TD, TJ;
|
||||
TD = W[27];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T1Y = FNMS(TD, TB, T1X);
|
||||
TJ = W[11];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T20 = FNMS(TJ, TH, T1Z);
|
||||
TM = TF + TL;
|
||||
T2W = T1Y + T20;
|
||||
T1W = TF - TL;
|
||||
T21 = T1Y - T20;
|
||||
}
|
||||
}
|
||||
/* Outputs 0, 4, 8 and 12 (real and imaginary). */
{
|
||||
E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
|
||||
{
|
||||
E Tm, TN, T3j, T3p;
|
||||
Tm = T8 + Tl;
|
||||
TN = Tz + TM;
|
||||
TO = Tm + TN;
|
||||
T3e = Tm - TN;
|
||||
T3j = T2V + T2W;
|
||||
T3p = T3k + T3o;
|
||||
T3q = T3j + T3p;
|
||||
T3s = T3p - T3j;
|
||||
}
|
||||
{
|
||||
E T1f, T1G, T3f, T3g;
|
||||
T1f = T11 + T1e;
|
||||
T1G = T1s + T1F;
|
||||
T1H = T1f + T1G;
|
||||
T3r = T1G - T1f;
|
||||
T3f = T30 + T31;
|
||||
T3g = T35 + T36;
|
||||
T3h = T3f - T3g;
|
||||
T3i = T3f + T3g;
|
||||
}
|
||||
ri[WS(rs, 8)] = TO - T1H;
|
||||
ii[WS(rs, 8)] = T3q - T3i;
|
||||
ri[0] = TO + T1H;
|
||||
ii[0] = T3i + T3q;
|
||||
ri[WS(rs, 12)] = T3e - T3h;
|
||||
ii[WS(rs, 12)] = T3s - T3r;
|
||||
ri[WS(rs, 4)] = T3e + T3h;
|
||||
ii[WS(rs, 4)] = T3r + T3s;
|
||||
}
|
||||
/* Outputs 2, 6, 10 and 14 (real and imaginary). */
{
|
||||
E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
|
||||
{
|
||||
E T2U, T2X, T3t, T3u;
|
||||
T2U = T8 - Tl;
|
||||
T2X = T2V - T2W;
|
||||
T2Y = T2U + T2X;
|
||||
T3a = T2U - T2X;
|
||||
T3t = TM - Tz;
|
||||
T3u = T3o - T3k;
|
||||
T3v = T3t + T3u;
|
||||
T3x = T3u - T3t;
|
||||
}
|
||||
{
|
||||
E T2Z, T32, T34, T37;
|
||||
T2Z = T11 - T1e;
|
||||
T32 = T30 - T31;
|
||||
T33 = T2Z + T32;
|
||||
T3b = T32 - T2Z;
|
||||
T34 = T1s - T1F;
|
||||
T37 = T35 - T36;
|
||||
T38 = T34 - T37;
|
||||
T3c = T34 + T37;
|
||||
}
|
||||
{
|
||||
E T39, T3w, T3d, T3y;
|
||||
T39 = T33 + T38;
|
||||
ri[WS(rs, 10)] = FNMS(KP707106781, T39, T2Y);
|
||||
ri[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
|
||||
T3w = T3b + T3c;
|
||||
ii[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
|
||||
ii[WS(rs, 10)] = FNMS(KP707106781, T3w, T3v);
|
||||
T3d = T3b - T3c;
|
||||
ri[WS(rs, 14)] = FNMS(KP707106781, T3d, T3a);
|
||||
ri[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
|
||||
T3y = T38 - T33;
|
||||
ii[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
|
||||
ii[WS(rs, 14)] = FNMS(KP707106781, T3y, T3x);
|
||||
}
|
||||
}
|
||||
/* Odd-indexed outputs 1, 3, 5, ..., 15 (real and imaginary). */
{
|
||||
E T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y;
|
||||
E T2C;
|
||||
{
|
||||
E T1V, T22, T2b, T2i;
|
||||
T1O = T1I - T1N;
|
||||
T3B = T3z - T3A;
|
||||
T3H = T3A + T3z;
|
||||
T2E = T1I + T1N;
|
||||
T1V = T1T - T1U;
|
||||
T22 = T1W + T21;
|
||||
T23 = T1V - T22;
|
||||
T3C = T1V + T22;
|
||||
{
|
||||
E T2M, T2N, T2F, T2G;
|
||||
T2M = T2r + T2w;
|
||||
T2N = T2o - T2p;
|
||||
T2O = FNMS(KP414213562, T2N, T2M);
|
||||
T2S = FMA(KP414213562, T2M, T2N);
|
||||
T2F = T1U + T1T;
|
||||
T2G = T1W - T21;
|
||||
T2H = T2F + T2G;
|
||||
T3I = T2G - T2F;
|
||||
}
|
||||
T2b = T29 + T2a;
|
||||
T2i = T2c - T2h;
|
||||
T2j = FMA(KP414213562, T2i, T2b);
|
||||
T2B = FNMS(KP414213562, T2b, T2i);
|
||||
{
|
||||
E T2J, T2K, T2q, T2x;
|
||||
T2J = T2c + T2h;
|
||||
T2K = T29 - T2a;
|
||||
T2L = FMA(KP414213562, T2K, T2J);
|
||||
T2R = FNMS(KP414213562, T2J, T2K);
|
||||
T2q = T2o + T2p;
|
||||
T2x = T2r - T2w;
|
||||
T2y = FNMS(KP414213562, T2x, T2q);
|
||||
T2C = FMA(KP414213562, T2q, T2x);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T24, T2z, T3J, T3K;
|
||||
T24 = FMA(KP707106781, T23, T1O);
|
||||
T2z = T2j - T2y;
|
||||
ri[WS(rs, 11)] = FNMS(KP923879532, T2z, T24);
|
||||
ri[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
|
||||
T3J = FMA(KP707106781, T3I, T3H);
|
||||
T3K = T2C - T2B;
|
||||
ii[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
|
||||
ii[WS(rs, 11)] = FNMS(KP923879532, T3K, T3J);
|
||||
}
|
||||
{
|
||||
E T2A, T2D, T3L, T3M;
|
||||
T2A = FNMS(KP707106781, T23, T1O);
|
||||
T2D = T2B + T2C;
|
||||
ri[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
|
||||
ri[WS(rs, 15)] = FMA(KP923879532, T2D, T2A);
|
||||
T3L = FNMS(KP707106781, T3I, T3H);
|
||||
T3M = T2j + T2y;
|
||||
ii[WS(rs, 7)] = FNMS(KP923879532, T3M, T3L);
|
||||
ii[WS(rs, 15)] = FMA(KP923879532, T3M, T3L);
|
||||
}
|
||||
{
|
||||
E T2I, T2P, T3D, T3E;
|
||||
T2I = FMA(KP707106781, T2H, T2E);
|
||||
T2P = T2L + T2O;
|
||||
ri[WS(rs, 9)] = FNMS(KP923879532, T2P, T2I);
|
||||
ri[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
|
||||
T3D = FMA(KP707106781, T3C, T3B);
|
||||
T3E = T2R + T2S;
|
||||
ii[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
|
||||
ii[WS(rs, 9)] = FNMS(KP923879532, T3E, T3D);
|
||||
}
|
||||
{
|
||||
E T2Q, T2T, T3F, T3G;
|
||||
T2Q = FNMS(KP707106781, T2H, T2E);
|
||||
T2T = T2R - T2S;
|
||||
ri[WS(rs, 13)] = FNMS(KP923879532, T2T, T2Q);
|
||||
ri[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
|
||||
T3F = FNMS(KP707106781, T3C, T3B);
|
||||
T3G = T2O - T2L;
|
||||
ii[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
|
||||
ii[WS(rs, 13)] = FNMS(KP923879532, T3G, T3F);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction list referenced by the t1_16 descriptor: a TW_FULL
   entry with arguments (0, 16) followed by a TW_NEXT entry with (1, 0).
   NOTE(review): the tw_instr field meanings are defined outside this chunk;
   presumably TW_FULL requests the full twiddle set for radix 16 and TW_NEXT
   terminates the list -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the FMA build of t1_16: radix 16, name "t1_16", the twinstr
   list and &GENUS.  { 104, 30, 70, 0 } matches the generator's stated counts
   for this variant (104 additions, 30 multiplications, 70 fused multiply/add);
   the meaning of the three trailing zeros is not visible here. */
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 104, 30, 70, 0 }, 0, 0, 0 };
|
||||
|
||||
/* Registration entry point: hands the t1_16 kernel and its descriptor to
   planner p through kdft_dit_register.  X() is a name-wrapping macro defined
   outside this chunk. */
void X(codelet_t1_16) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_16, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 174 FP additions, 84 FP multiplications,
|
||||
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
|
||||
* 52 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_16 (non-FMA variant): size-16 twiddle codelet, computed in place.
 *
 * For each m in [mb, me) the loop reads the 16 complex inputs
 * ri[WS(rs, k)], ii[WS(rs, k)] (k = 0..15), combines inputs 1..15 with the
 * twiddle pairs W[2k-2], W[2k-1], and writes the 16 results back to the same
 * ri/ii locations.  Between iterations ri and ii advance by ms and W by 30
 * (= 2 * 15 twiddle values).
 *
 * NOTE(review): R, E, stride, INT, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE
 * are defined outside this chunk; FMA(a, b, c) / FNMS(a, b, c) are presumably
 * a*b + c and c - a*b -- confirm in the FFTW headers.
 */
static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numeric constants: sin(pi/8), cos(pi/8), sqrt(2)/2. */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
|
||||
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first offset by mb * 30 so that iteration m uses its own 30 twiddle values. */
for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
|
||||
E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
|
||||
E T2y, T2z, T1O, T2g, T1T, T2h;
|
||||
/* Inputs 0 and 8: input 0 is used as-is, input 8 is combined with W[14..15]; then sum and difference. */
{
|
||||
E T1, T2T, T6, T2S;
|
||||
T1 = ri[0];
|
||||
T2T = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 8)];
|
||||
T5 = ii[WS(rs, 8)];
|
||||
T2 = W[14];
|
||||
T4 = W[15];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T2S = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 + T6;
|
||||
T37 = T2T - T2S;
|
||||
T1t = T1 - T6;
|
||||
T2U = T2S + T2T;
|
||||
}
|
||||
/* Inputs 4 and 12 (W[6..7], W[22..23]): sums and differences. */
{
|
||||
E Tc, T1u, Th, T1v;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = ri[WS(rs, 4)];
|
||||
Tb = ii[WS(rs, 4)];
|
||||
T8 = W[6];
|
||||
Ta = W[7];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
T1u = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = ri[WS(rs, 12)];
|
||||
Tg = ii[WS(rs, 12)];
|
||||
Td = W[22];
|
||||
Tf = W[23];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
T1v = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
Ti = Tc + Th;
|
||||
T38 = Tc - Th;
|
||||
T1w = T1u - T1v;
|
||||
T2R = T1u + T1v;
|
||||
}
|
||||
/* Inputs 2 and 10 (W[2..3], W[18..19]): sums and differences. */
{
|
||||
E To, T1y, Tt, T1z, T1A, T1B;
|
||||
{
|
||||
E Tl, Tn, Tk, Tm;
|
||||
Tl = ri[WS(rs, 2)];
|
||||
Tn = ii[WS(rs, 2)];
|
||||
Tk = W[2];
|
||||
Tm = W[3];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
T1y = FNMS(Tm, Tl, Tk * Tn);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tp, Tr;
|
||||
Tq = ri[WS(rs, 10)];
|
||||
Ts = ii[WS(rs, 10)];
|
||||
Tp = W[18];
|
||||
Tr = W[19];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
T1z = FNMS(Tr, Tq, Tp * Ts);
|
||||
}
|
||||
Tu = To + Tt;
|
||||
T2s = T1y + T1z;
|
||||
T1A = T1y - T1z;
|
||||
T1B = To - Tt;
|
||||
T1C = T1A - T1B;
|
||||
T2c = T1B + T1A;
|
||||
}
|
||||
/* Inputs 14 and 6 (W[26..27], W[10..11]): sums and differences. */
{
|
||||
E Tz, T1E, TE, T1F, T1D, T1G;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = ri[WS(rs, 14)];
|
||||
Ty = ii[WS(rs, 14)];
|
||||
Tv = W[26];
|
||||
Tx = W[27];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T1E = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = ri[WS(rs, 6)];
|
||||
TD = ii[WS(rs, 6)];
|
||||
TA = W[10];
|
||||
TC = W[11];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T1F = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
TF = Tz + TE;
|
||||
T2t = T1E + T1F;
|
||||
T1D = Tz - TE;
|
||||
T1G = T1E - T1F;
|
||||
T1H = T1D + T1G;
|
||||
T2d = T1D - T1G;
|
||||
}
|
||||
/* Inputs 15, 11, 7 and 3, each combined with its twiddle pair, then summed/differenced. */
{
|
||||
E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
|
||||
{
|
||||
E T16, T18, T15, T17;
|
||||
T16 = ri[WS(rs, 15)];
|
||||
T18 = ii[WS(rs, 15)];
|
||||
T15 = W[28];
|
||||
T17 = W[29];
|
||||
T19 = FMA(T15, T16, T17 * T18);
|
||||
T20 = FNMS(T17, T16, T15 * T18);
|
||||
}
|
||||
{
|
||||
E T1m, T1o, T1l, T1n;
|
||||
T1m = ri[WS(rs, 11)];
|
||||
T1o = ii[WS(rs, 11)];
|
||||
T1l = W[20];
|
||||
T1n = W[21];
|
||||
T1p = FMA(T1l, T1m, T1n * T1o);
|
||||
T1X = FNMS(T1n, T1m, T1l * T1o);
|
||||
}
|
||||
{
|
||||
E T1b, T1d, T1a, T1c;
|
||||
T1b = ri[WS(rs, 7)];
|
||||
T1d = ii[WS(rs, 7)];
|
||||
T1a = W[12];
|
||||
T1c = W[13];
|
||||
T1e = FMA(T1a, T1b, T1c * T1d);
|
||||
T21 = FNMS(T1c, T1b, T1a * T1d);
|
||||
}
|
||||
{
|
||||
E T1h, T1j, T1g, T1i;
|
||||
T1h = ri[WS(rs, 3)];
|
||||
T1j = ii[WS(rs, 3)];
|
||||
T1g = W[4];
|
||||
T1i = W[5];
|
||||
T1k = FMA(T1g, T1h, T1i * T1j);
|
||||
T1W = FNMS(T1i, T1h, T1g * T1j);
|
||||
}
|
||||
T1f = T19 + T1e;
|
||||
T1q = T1k + T1p;
|
||||
T2B = T1f - T1q;
|
||||
T2C = T20 + T21;
|
||||
T2D = T1W + T1X;
|
||||
T2E = T2C - T2D;
|
||||
{
|
||||
E T1V, T1Y, T22, T23;
|
||||
T1V = T19 - T1e;
|
||||
T1Y = T1W - T1X;
|
||||
T1Z = T1V - T1Y;
|
||||
T2j = T1V + T1Y;
|
||||
T22 = T20 - T21;
|
||||
T23 = T1k - T1p;
|
||||
T24 = T22 + T23;
|
||||
T2k = T22 - T23;
|
||||
}
|
||||
}
|
||||
/* Inputs 1, 13, 9 and 5, each combined with its twiddle pair, then summed/differenced. */
{
|
||||
E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
|
||||
{
|
||||
E TJ, TL, TI, TK;
|
||||
TJ = ri[WS(rs, 1)];
|
||||
TL = ii[WS(rs, 1)];
|
||||
TI = W[0];
|
||||
TK = W[1];
|
||||
TM = FMA(TI, TJ, TK * TL);
|
||||
T1K = FNMS(TK, TJ, TI * TL);
|
||||
}
|
||||
{
|
||||
E TZ, T11, TY, T10;
|
||||
TZ = ri[WS(rs, 13)];
|
||||
T11 = ii[WS(rs, 13)];
|
||||
TY = W[24];
|
||||
T10 = W[25];
|
||||
T12 = FMA(TY, TZ, T10 * T11);
|
||||
T1R = FNMS(T10, TZ, TY * T11);
|
||||
}
|
||||
{
|
||||
E TO, TQ, TN, TP;
|
||||
TO = ri[WS(rs, 9)];
|
||||
TQ = ii[WS(rs, 9)];
|
||||
TN = W[16];
|
||||
TP = W[17];
|
||||
TR = FMA(TN, TO, TP * TQ);
|
||||
T1L = FNMS(TP, TO, TN * TQ);
|
||||
}
|
||||
{
|
||||
E TU, TW, TT, TV;
|
||||
TU = ri[WS(rs, 5)];
|
||||
TW = ii[WS(rs, 5)];
|
||||
TT = W[8];
|
||||
TV = W[9];
|
||||
TX = FMA(TT, TU, TV * TW);
|
||||
T1Q = FNMS(TV, TU, TT * TW);
|
||||
}
|
||||
TS = TM + TR;
|
||||
T13 = TX + T12;
|
||||
T2w = TS - T13;
|
||||
T2x = T1K + T1L;
|
||||
T2y = T1Q + T1R;
|
||||
T2z = T2x - T2y;
|
||||
{
|
||||
E T1M, T1N, T1P, T1S;
|
||||
T1M = T1K - T1L;
|
||||
T1N = TX - T12;
|
||||
T1O = T1M + T1N;
|
||||
T2g = T1M - T1N;
|
||||
T1P = TM - TR;
|
||||
T1S = T1Q - T1R;
|
||||
T1T = T1P - T1S;
|
||||
T2h = T1P + T1S;
|
||||
}
|
||||
}
|
||||
/* Outputs 3, 7, 11 and 15 (real and imaginary). */
{
|
||||
E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
|
||||
{
|
||||
E T1x, T1I, T3e, T3f;
|
||||
T1x = T1t - T1w;
|
||||
T1I = KP707106781 * (T1C - T1H);
|
||||
T1J = T1x + T1I;
|
||||
T27 = T1x - T1I;
|
||||
T3e = KP707106781 * (T2d - T2c);
|
||||
T3f = T38 + T37;
|
||||
T3g = T3e + T3f;
|
||||
T3i = T3f - T3e;
|
||||
}
|
||||
{
|
||||
E T1U, T25, T28, T29;
|
||||
T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
|
||||
T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
|
||||
T26 = T1U + T25;
|
||||
T3h = T25 - T1U;
|
||||
T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
|
||||
T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
|
||||
T2a = T28 - T29;
|
||||
T3d = T28 + T29;
|
||||
}
|
||||
ri[WS(rs, 11)] = T1J - T26;
|
||||
ii[WS(rs, 11)] = T3g - T3d;
|
||||
ri[WS(rs, 3)] = T1J + T26;
|
||||
ii[WS(rs, 3)] = T3d + T3g;
|
||||
ri[WS(rs, 15)] = T27 - T2a;
|
||||
ii[WS(rs, 15)] = T3i - T3h;
|
||||
ri[WS(rs, 7)] = T27 + T2a;
|
||||
ii[WS(rs, 7)] = T3h + T3i;
|
||||
}
|
||||
/* Outputs 2, 6, 10 and 14 (real and imaginary). */
{
|
||||
E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
|
||||
{
|
||||
E T2r, T2u, T30, T31;
|
||||
T2r = T7 - Ti;
|
||||
T2u = T2s - T2t;
|
||||
T2v = T2r + T2u;
|
||||
T2H = T2r - T2u;
|
||||
T30 = TF - Tu;
|
||||
T31 = T2U - T2R;
|
||||
T32 = T30 + T31;
|
||||
T34 = T31 - T30;
|
||||
}
|
||||
{
|
||||
E T2A, T2F, T2I, T2J;
|
||||
T2A = T2w + T2z;
|
||||
T2F = T2B - T2E;
|
||||
T2G = KP707106781 * (T2A + T2F);
|
||||
T33 = KP707106781 * (T2F - T2A);
|
||||
T2I = T2z - T2w;
|
||||
T2J = T2B + T2E;
|
||||
T2K = KP707106781 * (T2I - T2J);
|
||||
T2Z = KP707106781 * (T2I + T2J);
|
||||
}
|
||||
ri[WS(rs, 10)] = T2v - T2G;
|
||||
ii[WS(rs, 10)] = T32 - T2Z;
|
||||
ri[WS(rs, 2)] = T2v + T2G;
|
||||
ii[WS(rs, 2)] = T2Z + T32;
|
||||
ri[WS(rs, 14)] = T2H - T2K;
|
||||
ii[WS(rs, 14)] = T34 - T33;
|
||||
ri[WS(rs, 6)] = T2H + T2K;
|
||||
ii[WS(rs, 6)] = T33 + T34;
|
||||
}
|
||||
/* Outputs 1, 5, 9 and 13 (real and imaginary). */
{
|
||||
E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
|
||||
{
|
||||
E T2b, T2e, T36, T39;
|
||||
T2b = T1t + T1w;
|
||||
T2e = KP707106781 * (T2c + T2d);
|
||||
T2f = T2b + T2e;
|
||||
T2n = T2b - T2e;
|
||||
T36 = KP707106781 * (T1C + T1H);
|
||||
T39 = T37 - T38;
|
||||
T3a = T36 + T39;
|
||||
T3c = T39 - T36;
|
||||
}
|
||||
{
|
||||
E T2i, T2l, T2o, T2p;
|
||||
T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
|
||||
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
|
||||
T2m = T2i + T2l;
|
||||
T3b = T2l - T2i;
|
||||
T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
|
||||
T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
|
||||
T2q = T2o - T2p;
|
||||
T35 = T2o + T2p;
|
||||
}
|
||||
ri[WS(rs, 9)] = T2f - T2m;
|
||||
ii[WS(rs, 9)] = T3a - T35;
|
||||
ri[WS(rs, 1)] = T2f + T2m;
|
||||
ii[WS(rs, 1)] = T35 + T3a;
|
||||
ri[WS(rs, 13)] = T2n - T2q;
|
||||
ii[WS(rs, 13)] = T3c - T3b;
|
||||
ri[WS(rs, 5)] = T2n + T2q;
|
||||
ii[WS(rs, 5)] = T3b + T3c;
|
||||
}
|
||||
/* Outputs 0, 4, 8 and 12 (real and imaginary). */
{
|
||||
E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
|
||||
{
|
||||
E Tj, TG, T2Q, T2V;
|
||||
Tj = T7 + Ti;
|
||||
TG = Tu + TF;
|
||||
TH = Tj + TG;
|
||||
T2L = Tj - TG;
|
||||
T2Q = T2s + T2t;
|
||||
T2V = T2R + T2U;
|
||||
T2W = T2Q + T2V;
|
||||
T2Y = T2V - T2Q;
|
||||
}
|
||||
{
|
||||
E T14, T1r, T2M, T2N;
|
||||
T14 = TS + T13;
|
||||
T1r = T1f + T1q;
|
||||
T1s = T14 + T1r;
|
||||
T2X = T1r - T14;
|
||||
T2M = T2x + T2y;
|
||||
T2N = T2C + T2D;
|
||||
T2O = T2M - T2N;
|
||||
T2P = T2M + T2N;
|
||||
}
|
||||
ri[WS(rs, 8)] = TH - T1s;
|
||||
ii[WS(rs, 8)] = T2W - T2P;
|
||||
ri[0] = TH + T1s;
|
||||
ii[0] = T2P + T2W;
|
||||
ri[WS(rs, 12)] = T2L - T2O;
|
||||
ii[WS(rs, 12)] = T2Y - T2X;
|
||||
ri[WS(rs, 4)] = T2L + T2O;
|
||||
ii[WS(rs, 4)] = T2X + T2Y;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Twiddle-instruction list referenced by the t1_16 descriptor: a TW_FULL
   entry with arguments (0, 16) followed by a TW_NEXT entry with (1, 0).
   NOTE(review): the tw_instr field meanings are defined outside this chunk;
   presumably TW_FULL requests the full twiddle set for radix 16 and TW_NEXT
   terminates the list -- confirm against the tw_instr declaration. */
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 16 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/* Descriptor for the non-FMA build of t1_16: radix 16, name "t1_16", the
   twinstr list and &GENUS.  { 136, 46, 38, 0 } matches the generator's stated
   counts for this variant (136 additions, 46 multiplications, 38 fused
   multiply/add); the meaning of the three trailing zeros is not visible here. */
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 136, 46, 38, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_16) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_16, &desc);
|
||||
}
|
||||
#endif
|
||||
117
fftw-3.3.10/dft/scalar/codelets/t1_2.c
Normal file
117
fftw-3.3.10/dft/scalar/codelets/t1_2.c
Normal file
@@ -0,0 +1,117 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 11 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_2 (FMA-scheduled variant): in-place size-2 twiddle butterfly.
 *
 * For every m in [mb, me) the two complex inputs (ri[0], ii[0]) and
 * (ri[WS(rs, 1)], ii[WS(rs, 1)]) are read, the second one is combined with
 * the twiddle pair (W[0], W[1]), and both inputs are overwritten with the
 * sum and the difference.  W is first advanced by 2 * mb; after each
 * iteration ri and ii advance by ms and W by 2.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 2;
     for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
          /* All loads and arithmetic come first; stores follow. */
          const E x0r = ri[0];
          const E x0i = ii[0];
          const E x1r = ri[WS(rs, 1)];
          const E x1i = ii[WS(rs, 1)];
          const E wr = W[0];
          const E wi = W[1];
          const E wr_x1r = wr * x1r;
          const E wr_x1i = wr * x1i;
          /* Twiddled second input. */
          const E y1r = FMA(wi, x1i, wr_x1r);
          const E y1i = FNMS(wi, x1r, wr_x1i);

          ri[WS(rs, 1)] = x0r - y1r;
          ii[WS(rs, 1)] = x0i - y1i;
          ri[0] = x0r + y1r;
          ii[0] = y1i + x0i;
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, { 4, 2, 2, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_2) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_2, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 6 FP additions, 4 FP multiplications,
|
||||
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
|
||||
* 9 stack variables, 0 constants, and 8 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_2 (non-FMA variant): in-place size-2 twiddle butterfly.
 *
 * For every m in [mb, me) the two complex inputs (ri[0], ii[0]) and
 * (ri[WS(rs, 1)], ii[WS(rs, 1)]) are read, the second one is combined with
 * the twiddle pair (W[0], W[1]), and both inputs are overwritten with the
 * sum and the difference.  W is first advanced by 2 * mb; after each
 * iteration ri and ii advance by ms and W by 2.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 2;
     for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
          /* All loads and arithmetic come first; stores follow. */
          const E x0r = ri[0];
          const E x0i = ii[0];
          const E x1r = ri[WS(rs, 1)];
          const E x1i = ii[WS(rs, 1)];
          const E wr = W[0];
          const E wi = W[1];
          /* Twiddled second input. */
          const E y1r = FMA(wr, x1r, wi * x1i);
          const E y1i = FNMS(wi, x1r, wr * x1i);

          ri[WS(rs, 1)] = x0r - y1r;
          ii[WS(rs, 1)] = x0i - y1i;
          ri[0] = x0r + y1r;
          ii[0] = y1i + x0i;
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 2 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, { 4, 2, 2, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_2) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_2, &desc);
|
||||
}
|
||||
#endif
|
||||
1050
fftw-3.3.10/dft/scalar/codelets/t1_20.c
Normal file
1050
fftw-3.3.10/dft/scalar/codelets/t1_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1572
fftw-3.3.10/dft/scalar/codelets/t1_25.c
Normal file
1572
fftw-3.3.10/dft/scalar/codelets/t1_25.c
Normal file
File diff suppressed because it is too large
Load Diff
166
fftw-3.3.10/dft/scalar/codelets/t1_3.c
Normal file
166
fftw-3.3.10/dft/scalar/codelets/t1_3.c
Normal file
@@ -0,0 +1,166 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 14 FP multiplications,
|
||||
* (or, 6 additions, 4 multiplications, 10 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_3 (FMA-scheduled variant): in-place size-3 twiddle butterfly.
 *
 * For every m in [mb, me) three complex values are read from ri/ii at
 * offsets 0, WS(rs, 1) and WS(rs, 2).  Inputs 1 and 2 are combined with the
 * twiddle pairs (W[0], W[1]) and (W[2], W[3]); the three results are then
 * written back over the inputs.  W is first advanced by 4 * mb; after each
 * iteration ri and ii advance by ms and W by 4.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          /* Inner block retained from the original layout after the DK lines. */
          INT m;

          W += mb * 4;
          for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
               /* Untwiddled input 0. */
               const E x0r = ri[0];
               const E x0i = ii[0];

               /* Input 1 combined with (W[0], W[1]). */
               const E x1r = ri[WS(rs, 1)];
               const E x1i = ii[WS(rs, 1)];
               const E w1r = W[0];
               const E w1i = W[1];
               const E p1r = w1r * x1r;
               const E p1i = w1r * x1i;
               const E y1r = FMA(w1i, x1i, p1r);
               const E y1i = FNMS(w1i, x1r, p1i);

               /* Input 2 combined with (W[2], W[3]). */
               const E x2r = ri[WS(rs, 2)];
               const E x2i = ii[WS(rs, 2)];
               const E w2r = W[2];
               const E w2i = W[3];
               const E p2r = w2r * x2r;
               const E p2i = w2r * x2i;
               const E y2r = FMA(w2i, x2i, p2r);
               const E y2i = FNMS(w2i, x2r, p2i);

               /* Sums and differences of the twiddled pair. */
               const E sum_r = y1r + y2r;
               const E sum_i = y1i + y2i;
               const E dif_r = y2r - y1r;
               const E dif_i = y1i - y2i;
               const E mid_r = FNMS(KP500000000, sum_r, x0r);
               const E mid_i = FNMS(KP500000000, sum_i, x0i);

               ri[0] = x0r + sum_r;
               ri[WS(rs, 1)] = FMA(KP866025403, dif_i, mid_r);
               ri[WS(rs, 2)] = FNMS(KP866025403, dif_i, mid_r);
               ii[0] = sum_i + x0i;
               ii[WS(rs, 2)] = FNMS(KP866025403, dif_r, mid_i);
               ii[WS(rs, 1)] = FMA(KP866025403, dif_r, mid_i);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, { 6, 4, 10, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_3) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_3, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 16 FP additions, 12 FP multiplications,
|
||||
* (or, 10 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 2 constants, and 12 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_3 (non-FMA variant): in-place size-3 twiddle butterfly.
 *
 * For every m in [mb, me) three complex values are read from ri/ii at
 * offsets 0, WS(rs, 1) and WS(rs, 2).  Inputs 1 and 2 are combined with the
 * twiddle pairs (W[0], W[1]) and (W[2], W[3]); the three results are then
 * written back over the inputs.  W is first advanced by 4 * mb; after each
 * iteration ri and ii advance by ms and W by 4.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          /* Inner block retained from the original layout after the DK lines. */
          INT m;

          W += mb * 4;
          for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
               /* Untwiddled input 0. */
               const E x0r = ri[0];
               const E x0i = ii[0];

               /* Input 1 combined with (W[0], W[1]). */
               const E x1r = ri[WS(rs, 1)];
               const E x1i = ii[WS(rs, 1)];
               const E w1r = W[0];
               const E w1i = W[1];
               const E y1r = FMA(w1r, x1r, w1i * x1i);
               const E y1i = FNMS(w1i, x1r, w1r * x1i);

               /* Input 2 combined with (W[2], W[3]). */
               const E x2r = ri[WS(rs, 2)];
               const E x2i = ii[WS(rs, 2)];
               const E w2r = W[2];
               const E w2i = W[3];
               const E y2r = FMA(w2r, x2r, w2i * x2i);
               const E y2i = FNMS(w2i, x2r, w2r * x2i);

               const E sum_r = y1r + y2r;
               const E sum_i = y1i + y2i;
               const E mid_r = FNMS(KP500000000, sum_r, x0r);
               const E rot_r = KP866025403 * (y1i - y2i);
               const E rot_i = KP866025403 * (y2r - y1r);
               const E mid_i = FNMS(KP500000000, sum_i, x0i);

               ri[0] = x0r + sum_r;
               ii[0] = sum_i + x0i;
               ri[WS(rs, 2)] = mid_r - rot_r;
               ri[WS(rs, 1)] = mid_r + rot_r;
               ii[WS(rs, 1)] = rot_i + mid_i;
               ii[WS(rs, 2)] = mid_i - rot_i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, { 10, 6, 6, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_3) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_3, &desc);
|
||||
}
|
||||
#endif
|
||||
1809
fftw-3.3.10/dft/scalar/codelets/t1_32.c
Normal file
1809
fftw-3.3.10/dft/scalar/codelets/t1_32.c
Normal file
File diff suppressed because it is too large
Load Diff
196
fftw-3.3.10/dft/scalar/codelets/t1_4.c
Normal file
196
fftw-3.3.10/dft/scalar/codelets/t1_4.c
Normal file
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 15 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_4 (FMA-scheduled variant): in-place size-4 twiddle butterfly.
 *
 * For every m in [mb, me) four complex values are read from ri/ii at
 * offsets 0 and WS(rs, 1..3).  Input k (k = 1..3) is combined with the
 * twiddle pair (W[2k-2], W[2k-1]); the four results are written back over
 * the inputs using additions and subtractions only.  W is first advanced by
 * 6 * mb; after each iteration ri and ii advance by ms and W by 6.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 6;
     for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Untwiddled input 0. */
          const E x0r = ri[0];
          const E x0i = ii[0];

          /* Input 2 combined with (W[2], W[3]). */
          const E x2r = ri[WS(rs, 2)];
          const E x2i = ii[WS(rs, 2)];
          const E w2r = W[2];
          const E w2i = W[3];
          const E p2r = w2r * x2r;
          const E p2i = w2r * x2i;
          const E y2r = FMA(w2i, x2i, p2r);
          const E y2i = FNMS(w2i, x2r, p2i);

          /* Input 1 combined with (W[0], W[1]). */
          const E x1r = ri[WS(rs, 1)];
          const E x1i = ii[WS(rs, 1)];
          const E w1r = W[0];
          const E w1i = W[1];
          const E p1r = w1r * x1r;
          const E p1i = w1r * x1i;
          const E y1r = FMA(w1i, x1i, p1r);
          const E y1i = FNMS(w1i, x1r, p1i);

          /* Input 3 combined with (W[4], W[5]). */
          const E x3r = ri[WS(rs, 3)];
          const E x3i = ii[WS(rs, 3)];
          const E w3r = W[4];
          const E w3i = W[5];
          const E p3r = w3r * x3r;
          const E p3i = w3r * x3i;
          const E y3r = FMA(w3i, x3i, p3r);
          const E y3i = FNMS(w3i, x3r, p3i);

          /* Pairwise sums feed outputs 0 and 2. */
          const E even_sum_r = x0r + y2r;
          const E odd_sum_r = y1r + y3r;
          const E odd_sum_i = y1i + y3i;
          const E even_sum_i = y2i + x0i;

          /* Pairwise differences feed outputs 1 and 3. */
          const E even_dif_r = x0r - y2r;
          const E odd_dif_i = y1i - y3i;
          const E even_dif_i = x0i - y2i;
          const E odd_dif_r = y1r - y3r;

          ri[WS(rs, 2)] = even_sum_r - odd_sum_r;
          ri[0] = even_sum_r + odd_sum_r;
          ii[0] = odd_sum_i + even_sum_i;
          ii[WS(rs, 2)] = even_sum_i - odd_sum_i;

          ri[WS(rs, 3)] = even_dif_r - odd_dif_i;
          ri[WS(rs, 1)] = even_dif_r + odd_dif_i;
          ii[WS(rs, 1)] = even_dif_i - odd_dif_r;
          ii[WS(rs, 3)] = odd_dif_r + even_dif_i;
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, { 16, 6, 6, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_4) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_4, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 22 FP additions, 12 FP multiplications,
|
||||
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
|
||||
* 13 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_4 (non-FMA variant): in-place size-4 twiddle butterfly.
 *
 * For every m in [mb, me) four complex values are read from ri/ii at
 * offsets 0 and WS(rs, 1..3).  Input k (k = 1..3) is combined with the
 * twiddle pair (W[2k-2], W[2k-1]); the four results are written back over
 * the inputs using additions and subtractions only.  W is first advanced by
 * 6 * mb; after each iteration ri and ii advance by ms and W by 6.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 6;
     for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Untwiddled input 0. */
          const E x0r = ri[0];
          const E x0i = ii[0];

          /* Input 2 combined with (W[2], W[3]). */
          const E x2r = ri[WS(rs, 2)];
          const E x2i = ii[WS(rs, 2)];
          const E w2r = W[2];
          const E w2i = W[3];
          const E y2r = FMA(w2r, x2r, w2i * x2i);
          const E y2i = FNMS(w2i, x2r, w2r * x2i);

          /* Input 1 combined with (W[0], W[1]). */
          const E x1r = ri[WS(rs, 1)];
          const E x1i = ii[WS(rs, 1)];
          const E w1r = W[0];
          const E w1i = W[1];
          const E y1r = FMA(w1r, x1r, w1i * x1i);
          const E y1i = FNMS(w1i, x1r, w1r * x1i);

          /* Input 3 combined with (W[4], W[5]). */
          const E x3r = ri[WS(rs, 3)];
          const E x3i = ii[WS(rs, 3)];
          const E w3r = W[4];
          const E w3i = W[5];
          const E y3r = FMA(w3r, x3r, w3i * x3i);
          const E y3i = FNMS(w3i, x3r, w3r * x3i);

          /* Pairwise sums feed outputs 0 and 2. */
          const E even_sum_r = x0r + y2r;
          const E odd_sum_r = y1r + y3r;
          const E odd_sum_i = y1i + y3i;
          const E even_sum_i = y2i + x0i;

          /* Pairwise differences feed outputs 1 and 3. */
          const E even_dif_r = x0r - y2r;
          const E odd_dif_i = y1i - y3i;
          const E even_dif_i = x0i - y2i;
          const E odd_dif_r = y1r - y3r;

          ri[WS(rs, 2)] = even_sum_r - odd_sum_r;
          ri[0] = even_sum_r + odd_sum_r;
          ii[0] = odd_sum_i + even_sum_i;
          ii[WS(rs, 2)] = even_sum_i - odd_sum_i;

          ri[WS(rs, 3)] = even_dif_r - odd_dif_i;
          ri[WS(rs, 1)] = even_dif_r + odd_dif_i;
          ii[WS(rs, 1)] = even_dif_i - odd_dif_r;
          ii[WS(rs, 3)] = odd_dif_r + even_dif_i;
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 4 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, { 16, 6, 6, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_4) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_4, &desc);
|
||||
}
|
||||
#endif
|
||||
253
fftw-3.3.10/dft/scalar/codelets/t1_5.c
Normal file
253
fftw-3.3.10/dft/scalar/codelets/t1_5.c
Normal file
@@ -0,0 +1,253 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 40 FP additions, 34 FP multiplications,
|
||||
* (or, 14 additions, 8 multiplications, 26 fused multiply/add),
|
||||
* 31 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_5 (FMA-scheduled variant): in-place size-5 twiddle butterfly.
 *
 * For every m in [mb, me) five complex values are read from ri/ii at
 * offsets 0 and WS(rs, 1..4).  Input k (k = 1..4) is combined with the
 * twiddle pair (W[2k-2], W[2k-1]); the five results are written back over
 * the inputs.  W is first advanced by 8 * mb; after each iteration ri and
 * ii advance by ms and W by 8.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     {
          /* Inner block retained from the original layout after the DK lines. */
          INT m;

          W += mb * 8;
          for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 8, MAKE_VOLATILE_STRIDE(10, rs)) {
               /* Untwiddled input 0. */
               const E x0r = ri[0];
               const E x0i = ii[0];

               /* Input 1 combined with (W[0], W[1]). */
               const E x1r = ri[WS(rs, 1)];
               const E x1i = ii[WS(rs, 1)];
               const E w1r = W[0];
               const E w1i = W[1];
               const E p1r = w1r * x1r;
               const E p1i = w1r * x1i;
               const E y1r = FMA(w1i, x1i, p1r);
               const E y1i = FNMS(w1i, x1r, p1i);

               /* Input 4 combined with (W[6], W[7]). */
               const E x4r = ri[WS(rs, 4)];
               const E x4i = ii[WS(rs, 4)];
               const E w4r = W[6];
               const E w4i = W[7];
               const E p4r = w4r * x4r;
               const E p4i = w4r * x4i;
               const E y4r = FMA(w4i, x4i, p4r);
               const E y4i = FNMS(w4i, x4r, p4i);

               /* Input 2 combined with (W[2], W[3]). */
               const E x2r = ri[WS(rs, 2)];
               const E x2i = ii[WS(rs, 2)];
               const E w2r = W[2];
               const E w2i = W[3];
               const E p2r = w2r * x2r;
               const E p2i = w2r * x2i;
               const E y2r = FMA(w2i, x2i, p2r);
               const E y2i = FNMS(w2i, x2r, p2i);

               /* Input 3 combined with (W[4], W[5]). */
               const E x3r = ri[WS(rs, 3)];
               const E x3i = ii[WS(rs, 3)];
               const E w3r = W[4];
               const E w3i = W[5];
               const E p3r = w3r * x3r;
               const E p3i = w3r * x3i;
               const E y3r = FMA(w3i, x3i, p3r);
               const E y3i = FNMS(w3i, x3r, p3i);

               /* Sums of the (1,4) and (2,3) pairs. */
               const E s14r = y1r + y4r;
               const E s14i = y1i + y4i;
               const E s23r = y2r + y3r;
               const E s23i = y2i + y3i;

               /* Terms feeding the real outputs. */
               const E dsum_r = s14r - s23r;
               const E tot_r = s14r + s23r;
               const E base_r = FNMS(KP250000000, tot_r, x0r);
               const E d14i = y1i - y4i;
               const E d23i = y2i - y3i;
               const E rot_a_r = FMA(KP618033988, d23i, d14i);
               const E rot_b_r = FNMS(KP618033988, d14i, d23i);
               const E lo_r = FNMS(KP559016994, dsum_r, base_r);
               const E hi_r = FMA(KP559016994, dsum_r, base_r);

               /* Terms feeding the imaginary outputs. */
               const E dsum_i = s14i - s23i;
               const E tot_i = s14i + s23i;
               const E base_i = FNMS(KP250000000, tot_i, x0i);
               const E d14r = y1r - y4r;
               const E d23r = y2r - y3r;
               const E rot_a_i = FMA(KP618033988, d23r, d14r);
               const E rot_b_i = FNMS(KP618033988, d14r, d23r);
               const E lo_i = FNMS(KP559016994, dsum_i, base_i);
               const E hi_i = FMA(KP559016994, dsum_i, base_i);

               ri[0] = x0r + tot_r;
               ri[WS(rs, 2)] = FNMS(KP951056516, rot_b_r, lo_r);
               ri[WS(rs, 3)] = FMA(KP951056516, rot_b_r, lo_r);
               ri[WS(rs, 4)] = FNMS(KP951056516, rot_a_r, hi_r);
               ri[WS(rs, 1)] = FMA(KP951056516, rot_a_r, hi_r);

               ii[0] = tot_i + x0i;
               ii[WS(rs, 2)] = FMA(KP951056516, rot_b_i, lo_i);
               ii[WS(rs, 3)] = FNMS(KP951056516, rot_b_i, lo_i);
               ii[WS(rs, 1)] = FNMS(KP951056516, rot_a_i, hi_i);
               ii[WS(rs, 4)] = FMA(KP951056516, rot_a_i, hi_i);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, { 14, 8, 26, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_5) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_5, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 40 FP additions, 28 FP multiplications,
|
||||
* (or, 26 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 29 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_5 (non-FMA variant): in-place size-5 twiddle butterfly.
 *
 * For every m in [mb, me) five complex values are read from ri/ii at
 * offsets 0 and WS(rs, 1..4).  Input k (k = 1..4) is combined with the
 * twiddle pair (W[2k-2], W[2k-1]); the five results are written back over
 * the inputs.  W is first advanced by 8 * mb; after each iteration ri and
 * ii advance by ms and W by 8.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     {
          /* Inner block retained from the original layout after the DK lines. */
          INT m;

          W += mb * 8;
          for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 8, MAKE_VOLATILE_STRIDE(10, rs)) {
               /* Untwiddled input 0. */
               const E x0r = ri[0];
               const E x0i = ii[0];

               /* Input 1 combined with (W[0], W[1]). */
               const E x1r = ri[WS(rs, 1)];
               const E x1i = ii[WS(rs, 1)];
               const E w1r = W[0];
               const E w1i = W[1];
               const E y1r = FMA(w1r, x1r, w1i * x1i);
               const E y1i = FNMS(w1i, x1r, w1r * x1i);

               /* Input 3 combined with (W[4], W[5]). */
               const E x3r = ri[WS(rs, 3)];
               const E x3i = ii[WS(rs, 3)];
               const E w3r = W[4];
               const E w3i = W[5];
               const E y3r = FMA(w3r, x3r, w3i * x3i);
               const E y3i = FNMS(w3i, x3r, w3r * x3i);

               /* Input 4 combined with (W[6], W[7]). */
               const E x4r = ri[WS(rs, 4)];
               const E x4i = ii[WS(rs, 4)];
               const E w4r = W[6];
               const E w4i = W[7];
               const E y4r = FMA(w4r, x4r, w4i * x4i);
               const E y4i = FNMS(w4i, x4r, w4r * x4i);

               /* Input 2 combined with (W[2], W[3]). */
               const E x2r = ri[WS(rs, 2)];
               const E x2i = ii[WS(rs, 2)];
               const E w2r = W[2];
               const E w2i = W[3];
               const E y2r = FMA(w2r, x2r, w2i * x2i);
               const E y2i = FNMS(w2i, x2r, w2r * x2i);

               /* Differences and sums of the (1,4) and (2,3) pairs. */
               const E d14i = y1i - y4i;
               const E d23i = y2i - y3i;
               const E d23r = y2r - y3r;
               const E d14r = y1r - y4r;
               const E s14i = y1i + y4i;
               const E s23i = y2i + y3i;
               const E tot_i = s14i + s23i;
               const E s14r = y1r + y4r;
               const E s23r = y2r + y3r;
               const E tot_r = s14r + s23r;

               /* Terms feeding the real outputs 1..4. */
               const E rot_a_r = FMA(KP951056516, d14i, KP587785252 * d23i);
               const E rot_b_r = FNMS(KP587785252, d14i, KP951056516 * d23i);
               const E scaled_r = KP559016994 * (s14r - s23r);
               const E base_r = FNMS(KP250000000, tot_r, x0r);
               const E hi_r = scaled_r + base_r;
               const E lo_r = base_r - scaled_r;

               /* Terms feeding the imaginary outputs 1..4. */
               const E rot_a_i = FMA(KP951056516, d14r, KP587785252 * d23r);
               const E rot_b_i = FNMS(KP587785252, d14r, KP951056516 * d23r);
               const E scaled_i = KP559016994 * (s14i - s23i);
               const E base_i = FNMS(KP250000000, tot_i, x0i);
               const E hi_i = scaled_i + base_i;
               const E lo_i = base_i - scaled_i;

               ri[0] = x0r + tot_r;
               ii[0] = tot_i + x0i;

               ri[WS(rs, 4)] = hi_r - rot_a_r;
               ri[WS(rs, 3)] = lo_r + rot_b_r;
               ri[WS(rs, 1)] = hi_r + rot_a_r;
               ri[WS(rs, 2)] = lo_r - rot_b_r;

               ii[WS(rs, 1)] = hi_i - rot_a_i;
               ii[WS(rs, 3)] = lo_i - rot_b_i;
               ii[WS(rs, 4)] = rot_a_i + hi_i;
               ii[WS(rs, 2)] = rot_b_i + lo_i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 5 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, { 26, 14, 14, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_5) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_5, &desc);
|
||||
}
|
||||
#endif
|
||||
295
fftw-3.3.10/dft/scalar/codelets/t1_6.c
Normal file
295
fftw-3.3.10/dft/scalar/codelets/t1_6.c
Normal file
@@ -0,0 +1,295 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 32 FP multiplications,
|
||||
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
|
||||
* 31 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_6 (FMA-scheduled variant): in-place size-6 twiddle butterfly.
 *
 * For every m in [mb, me) six complex values are read from ri/ii at
 * offsets 0 and WS(rs, 1..5).  Input k (k = 1..5) is combined with the
 * twiddle pair (W[2k-2], W[2k-1]); the six results are written back over
 * the inputs.  W is first advanced by 10 * mb; after each iteration ri and
 * ii advance by ms and W by 10.
 *
 * In the local names, "odd" terms feed outputs 1, 3, 5 and "even" terms
 * feed outputs 0, 2, 4.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          /* Inner block retained from the original layout after the DK lines. */
          INT m;

          W += mb * 10;
          for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 10, MAKE_VOLATILE_STRIDE(12, rs)) {
               /* Untwiddled input 0. */
               const E x0r = ri[0];
               const E x0i = ii[0];

               /* Input 3 combined with (W[4], W[5]). */
               const E x3r = ri[WS(rs, 3)];
               const E x3i = ii[WS(rs, 3)];
               const E w3r = W[4];
               const E w3i = W[5];
               const E p3r = w3r * x3r;
               const E p3i = w3r * x3i;
               const E y3r = FMA(w3i, x3i, p3r);
               const E y3i = FNMS(w3i, x3r, p3i);

               /* Input 2 combined with (W[2], W[3]). */
               const E x2r = ri[WS(rs, 2)];
               const E x2i = ii[WS(rs, 2)];
               const E w2r = W[2];
               const E w2i = W[3];
               const E p2r = w2r * x2r;
               const E p2i = w2r * x2i;
               const E y2r = FMA(w2i, x2i, p2r);
               const E y2i = FNMS(w2i, x2r, p2i);

               /* Input 5 combined with (W[8], W[9]). */
               const E x5r = ri[WS(rs, 5)];
               const E x5i = ii[WS(rs, 5)];
               const E w5r = W[8];
               const E w5i = W[9];
               const E p5r = w5r * x5r;
               const E p5i = w5r * x5i;
               const E y5r = FMA(w5i, x5i, p5r);
               const E y5i = FNMS(w5i, x5r, p5i);

               /* Input 4 combined with (W[6], W[7]). */
               const E x4r = ri[WS(rs, 4)];
               const E x4i = ii[WS(rs, 4)];
               const E w4r = W[6];
               const E w4i = W[7];
               const E p4r = w4r * x4r;
               const E p4i = w4r * x4i;
               const E y4r = FMA(w4i, x4i, p4r);
               const E y4i = FNMS(w4i, x4r, p4i);

               /* Input 1 combined with (W[0], W[1]). */
               const E x1r = ri[WS(rs, 1)];
               const E x1i = ii[WS(rs, 1)];
               const E w1r = W[0];
               const E w1i = W[1];
               const E p1r = w1r * x1r;
               const E p1i = w1r * x1i;
               const E y1r = FMA(w1i, x1i, p1r);
               const E y1i = FNMS(w1i, x1r, p1i);

               /* Sums and differences of the (2,5) and (4,1) pairs. */
               const E d25r = y2r - y5r;
               const E s25i = y2i + y5i;
               const E s25r = y2r + y5r;
               const E d25i = y2i - y5i;
               const E d41r = y4r - y1r;
               const E s41i = y4i + y1i;
               const E s41r = y4r + y1r;
               const E d41i = y4i - y1i;

               /* Real parts of outputs 1, 3, 5. */
               const E odd_rot_r = d25i - d41i;
               const E odd_dc_r = x0r - y3r;
               const E odd_sum_r = d25r + d41r;
               const E odd_mid_r = FNMS(KP500000000, odd_sum_r, odd_dc_r);

               /* Imaginary parts of outputs 1, 3, 5. */
               const E odd_rot_i = d41r - d25r;
               const E odd_dc_i = x0i - y3i;
               const E odd_sum_i = d25i + d41i;
               const E odd_mid_i = FNMS(KP500000000, odd_sum_i, odd_dc_i);

               /* Real parts of outputs 0, 2, 4. */
               const E even_rot_r = s25i - s41i;
               const E even_dc_r = x0r + y3r;
               const E even_sum_r = s25r + s41r;
               const E even_mid_r = FNMS(KP500000000, even_sum_r, even_dc_r);

               /* Imaginary parts of outputs 0, 2, 4. */
               const E even_rot_i = s41r - s25r;
               const E even_sum_i = s25i + s41i;
               const E even_dc_i = y3i + x0i;
               const E even_mid_i = FNMS(KP500000000, even_sum_i, even_dc_i);

               ri[WS(rs, 3)] = odd_dc_r + odd_sum_r;
               ri[WS(rs, 1)] = FMA(KP866025403, odd_rot_r, odd_mid_r);
               ri[WS(rs, 5)] = FNMS(KP866025403, odd_rot_r, odd_mid_r);

               ii[WS(rs, 1)] = FMA(KP866025403, odd_rot_i, odd_mid_i);
               ii[WS(rs, 3)] = odd_sum_i + odd_dc_i;
               ii[WS(rs, 5)] = FNMS(KP866025403, odd_rot_i, odd_mid_i);

               ri[0] = even_dc_r + even_sum_r;
               ri[WS(rs, 4)] = FMA(KP866025403, even_rot_r, even_mid_r);
               ri[WS(rs, 2)] = FNMS(KP866025403, even_rot_r, even_mid_r);

               ii[0] = even_sum_i + even_dc_i;
               ii[WS(rs, 4)] = FMA(KP866025403, even_rot_i, even_mid_i);
               ii[WS(rs, 2)] = FNMS(KP866025403, even_rot_i, even_mid_i);
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, { 24, 10, 22, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_6) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_6, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 46 FP additions, 28 FP multiplications,
|
||||
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
|
||||
* 23 stack variables, 2 constants, and 24 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_6 (non-FMA variant): in-place size-6 twiddle butterfly.
 *
 * For every m in [mb, me) six complex values are read from ri/ii at
 * offsets 0 and WS(rs, 1..5).  Input k (k = 1..5) is combined with the
 * twiddle pair (W[2k-2], W[2k-1]); the six results are written back over
 * the inputs.  W is first advanced by 10 * mb; after each iteration ri and
 * ii advance by ms and W by 10.
 *
 * In the local names, "odd" terms feed outputs 1, 3, 5 and "even" terms
 * feed outputs 0, 2, 4.
 *
 * NOTE(review): comments assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm against the codelet headers.
 */
static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          /* Inner block retained from the original layout after the DK lines. */
          INT m;

          W += mb * 10;
          for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 10, MAKE_VOLATILE_STRIDE(12, rs)) {
               /* Untwiddled input 0. */
               const E x0r = ri[0];
               const E x0i = ii[0];

               /* Input 3 combined with (W[4], W[5]). */
               const E x3r = ri[WS(rs, 3)];
               const E x3i = ii[WS(rs, 3)];
               const E w3r = W[4];
               const E w3i = W[5];
               const E y3r = FMA(w3r, x3r, w3i * x3i);
               const E y3i = FNMS(w3i, x3r, w3r * x3i);

               const E odd_dc_r = x0r - y3r;
               const E odd_dc_i = x0i - y3i;
               const E even_dc_r = x0r + y3r;
               const E even_dc_i = y3i + x0i;

               /* Input 4 combined with (W[6], W[7]). */
               const E x4r = ri[WS(rs, 4)];
               const E x4i = ii[WS(rs, 4)];
               const E w4r = W[6];
               const E w4i = W[7];
               const E y4r = FMA(w4r, x4r, w4i * x4i);
               const E y4i = FNMS(w4i, x4r, w4r * x4i);

               /* Input 1 combined with (W[0], W[1]). */
               const E x1r = ri[WS(rs, 1)];
               const E x1i = ii[WS(rs, 1)];
               const E w1r = W[0];
               const E w1i = W[1];
               const E y1r = FMA(w1r, x1r, w1i * x1i);
               const E y1i = FNMS(w1i, x1r, w1r * x1i);

               const E d41r = y4r - y1r;
               const E s41i = y4i + y1i;
               const E s41r = y4r + y1r;
               const E d41i = y4i - y1i;

               /* Input 2 combined with (W[2], W[3]). */
               const E x2r = ri[WS(rs, 2)];
               const E x2i = ii[WS(rs, 2)];
               const E w2r = W[2];
               const E w2i = W[3];
               const E y2r = FMA(w2r, x2r, w2i * x2i);
               const E y2i = FNMS(w2i, x2r, w2r * x2i);

               /* Input 5 combined with (W[8], W[9]). */
               const E x5r = ri[WS(rs, 5)];
               const E x5i = ii[WS(rs, 5)];
               const E w5r = W[8];
               const E w5i = W[9];
               const E y5r = FMA(w5r, x5r, w5i * x5i);
               const E y5i = FNMS(w5i, x5r, w5r * x5i);

               const E d25r = y2r - y5r;
               const E s25i = y2i + y5i;
               const E s25r = y2r + y5r;
               const E d25i = y2i - y5i;

               /* Terms for outputs 1, 3, 5. */
               const E odd_rot_r = KP866025403 * (d25i - d41i);
               const E odd_sum_r = d25r + d41r;
               const E odd_mid_r = FNMS(KP500000000, odd_sum_r, odd_dc_r);
               const E odd_rot_i = KP866025403 * (d41r - d25r);
               const E odd_sum_i = d25i + d41i;
               const E odd_mid_i = FNMS(KP500000000, odd_sum_i, odd_dc_i);

               /* Terms for outputs 0, 2, 4. */
               const E even_rot_r = KP866025403 * (s25i - s41i);
               const E even_sum_r = s25r + s41r;
               const E even_mid_r = FNMS(KP500000000, even_sum_r, even_dc_r);
               const E even_rot_i = KP866025403 * (s41r - s25r);
               const E even_sum_i = s25i + s41i;
               const E even_mid_i = FNMS(KP500000000, even_sum_i, even_dc_i);

               ri[WS(rs, 3)] = odd_dc_r + odd_sum_r;
               ri[WS(rs, 1)] = odd_mid_r + odd_rot_r;
               ri[WS(rs, 5)] = odd_mid_r - odd_rot_r;
               ii[WS(rs, 1)] = odd_rot_i + odd_mid_i;
               ii[WS(rs, 3)] = odd_sum_i + odd_dc_i;
               ii[WS(rs, 5)] = odd_mid_i - odd_rot_i;

               ri[0] = even_dc_r + even_sum_r;
               ri[WS(rs, 4)] = even_mid_r + even_rot_r;
               ri[WS(rs, 2)] = even_mid_r - even_rot_r;
               ii[0] = even_sum_i + even_dc_i;
               ii[WS(rs, 4)] = even_rot_i + even_mid_i;
               ii[WS(rs, 2)] = even_mid_i - even_rot_i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 6 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_dit_register) for the t1_6 codelet.
 * NOTE(review): field meanings come from the ct_desc declaration in the
 * codelet headers (not visible here); the inner tuple looks like the
 * operation counts of the kernel -- confirm.
 */
static const ct_desc desc = {
     6,
     "t1_6",
     twinstr,
     &GENUS,
     { 32, 14, 14, 0 },
     0,
     0,
     0
};
|
||||
|
||||
/* Registers the t1_6 kernel together with its descriptor on planner p. */
void X(codelet_t1_6)(planner *p)
{
     X(kdft_dit_register)(p, t1_6, &desc);
}
|
||||
#endif
|
||||
4105
fftw-3.3.10/dft/scalar/codelets/t1_64.c
Normal file
4105
fftw-3.3.10/dft/scalar/codelets/t1_64.c
Normal file
File diff suppressed because it is too large
Load Diff
354
fftw-3.3.10/dft/scalar/codelets/t1_7.c
Normal file
354
fftw-3.3.10/dft/scalar/codelets/t1_7.c
Normal file
@@ -0,0 +1,354 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 66 FP multiplications,
|
||||
* (or, 18 additions, 12 multiplications, 54 fused multiply/add),
|
||||
* 37 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_7 (FMA variant): size-7 "twiddle" DFT kernel, i.e. one radix-7
 * decimation-in-time step (see the generator command line above: -n 7).
 *
 * For every m in [mb, me) the loop
 *   - reads the seven complex inputs ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..6;
 *   - combines inputs 1..6 with the coefficient pairs W[2k-2], W[2k-1];
 *   - writes the seven results back to the same locations (in place).
 * Per iteration ri and ii advance by ms and W by 12 entries; W is first
 * offset by mb * 12.
 *
 * NOTE(review): FMA, FNMS, DK, E, WS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; presumably FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm before relying on this description.
 * The statement order is generator output; do not reorder by hand.
 */
static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
|
||||
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
|
||||
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
|
||||
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
|
||||
E T1, T1c, Te, T1h, TR, T19, Tr, T1g, TM, T1a, TE, T1i, TW, T1b;
|
||||
/* Input 0 is used as-is (no W coefficient is applied to it). */
T1 = ri[0];
|
||||
T1c = ii[0];
|
||||
/* Inputs 1 and 6: apply W[0..1] and W[10..11], then form sums and differences. */
{
|
||||
E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8;
|
||||
T3 = ri[WS(rs, 1)];
|
||||
T6 = ii[WS(rs, 1)];
|
||||
T2 = W[0];
|
||||
T4 = T2 * T3;
|
||||
TN = T2 * T6;
|
||||
T9 = ri[WS(rs, 6)];
|
||||
Tc = ii[WS(rs, 6)];
|
||||
T8 = W[10];
|
||||
Ta = T8 * T9;
|
||||
TP = T8 * Tc;
|
||||
{
|
||||
E T7, TO, Td, TQ, T5, Tb;
|
||||
T5 = W[1];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TO = FNMS(T5, T3, TN);
|
||||
Tb = W[11];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
TQ = FNMS(Tb, T9, TP);
|
||||
Te = T7 + Td;
|
||||
T1h = Td - T7;
|
||||
TR = TO - TQ;
|
||||
T19 = TO + TQ;
|
||||
}
|
||||
}
|
||||
/* Inputs 2 and 5: apply W[2..3] and W[8..9], then form sums and differences. */
{
|
||||
E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl;
|
||||
Tg = ri[WS(rs, 2)];
|
||||
Tj = ii[WS(rs, 2)];
|
||||
Tf = W[2];
|
||||
Th = Tf * Tg;
|
||||
TI = Tf * Tj;
|
||||
Tm = ri[WS(rs, 5)];
|
||||
Tp = ii[WS(rs, 5)];
|
||||
Tl = W[8];
|
||||
Tn = Tl * Tm;
|
||||
TK = Tl * Tp;
|
||||
{
|
||||
E Tk, TJ, Tq, TL, Ti, To;
|
||||
Ti = W[3];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TJ = FNMS(Ti, Tg, TI);
|
||||
To = W[9];
|
||||
Tq = FMA(To, Tp, Tn);
|
||||
TL = FNMS(To, Tm, TK);
|
||||
Tr = Tk + Tq;
|
||||
T1g = Tq - Tk;
|
||||
TM = TJ - TL;
|
||||
T1a = TJ + TL;
|
||||
}
|
||||
}
|
||||
/* Inputs 3 and 4: apply W[4..5] and W[6..7], then form sums and differences. */
{
|
||||
E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty;
|
||||
Tt = ri[WS(rs, 3)];
|
||||
Tw = ii[WS(rs, 3)];
|
||||
Ts = W[4];
|
||||
Tu = Ts * Tt;
|
||||
TS = Ts * Tw;
|
||||
Tz = ri[WS(rs, 4)];
|
||||
TC = ii[WS(rs, 4)];
|
||||
Ty = W[6];
|
||||
TA = Ty * Tz;
|
||||
TU = Ty * TC;
|
||||
{
|
||||
E Tx, TT, TD, TV, Tv, TB;
|
||||
Tv = W[5];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
TT = FNMS(Tv, Tt, TS);
|
||||
TB = W[7];
|
||||
TD = FMA(TB, TC, TA);
|
||||
TV = FNMS(TB, Tz, TU);
|
||||
TE = Tx + TD;
|
||||
T1i = TD - Tx;
|
||||
TW = TT - TV;
|
||||
T1b = TT + TV;
|
||||
}
|
||||
}
|
||||
/* Output 0: plain sum of input 0 and the three pair sums. */
ri[0] = T1 + Te + Tr + TE;
|
||||
ii[0] = T19 + T1a + T1b + T1c;
|
||||
/* Real parts of outputs 6 and 1. */
{
|
||||
E TG, TY, TF, TX, TH;
|
||||
TF = FNMS(KP356895867, Tr, Te);
|
||||
TG = FNMS(KP692021471, TF, TE);
|
||||
TX = FMA(KP554958132, TW, TR);
|
||||
TY = FMA(KP801937735, TX, TM);
|
||||
TH = FNMS(KP900968867, TG, T1);
|
||||
ri[WS(rs, 6)] = FNMS(KP974927912, TY, TH);
|
||||
ri[WS(rs, 1)] = FMA(KP974927912, TY, TH);
|
||||
}
|
||||
/* Imaginary parts of outputs 1 and 6. */
{
|
||||
E T1e, T1k, T1d, T1j, T1f;
|
||||
T1d = FNMS(KP356895867, T1a, T19);
|
||||
T1e = FNMS(KP692021471, T1d, T1b);
|
||||
T1j = FMA(KP554958132, T1i, T1h);
|
||||
T1k = FMA(KP801937735, T1j, T1g);
|
||||
T1f = FNMS(KP900968867, T1e, T1c);
|
||||
ii[WS(rs, 1)] = FMA(KP974927912, T1k, T1f);
|
||||
ii[WS(rs, 6)] = FNMS(KP974927912, T1k, T1f);
|
||||
}
|
||||
/* Real parts of outputs 5 and 2. */
{
|
||||
E T10, T13, TZ, T12, T11;
|
||||
TZ = FNMS(KP356895867, Te, TE);
|
||||
T10 = FNMS(KP692021471, TZ, Tr);
|
||||
T12 = FMA(KP554958132, TM, TW);
|
||||
T13 = FNMS(KP801937735, T12, TR);
|
||||
T11 = FNMS(KP900968867, T10, T1);
|
||||
ri[WS(rs, 5)] = FNMS(KP974927912, T13, T11);
|
||||
ri[WS(rs, 2)] = FMA(KP974927912, T13, T11);
|
||||
}
|
||||
/* Imaginary parts of outputs 2 and 5. */
{
|
||||
E T1m, T1p, T1l, T1o, T1n;
|
||||
T1l = FNMS(KP356895867, T19, T1b);
|
||||
T1m = FNMS(KP692021471, T1l, T1a);
|
||||
T1o = FMA(KP554958132, T1g, T1i);
|
||||
T1p = FNMS(KP801937735, T1o, T1h);
|
||||
T1n = FNMS(KP900968867, T1m, T1c);
|
||||
ii[WS(rs, 2)] = FMA(KP974927912, T1p, T1n);
|
||||
ii[WS(rs, 5)] = FNMS(KP974927912, T1p, T1n);
|
||||
}
|
||||
/* Real parts of outputs 4 and 3. */
{
|
||||
E T15, T18, T14, T17, T16;
|
||||
T14 = FNMS(KP356895867, TE, Tr);
|
||||
T15 = FNMS(KP692021471, T14, Te);
|
||||
T17 = FNMS(KP554958132, TR, TM);
|
||||
T18 = FNMS(KP801937735, T17, TW);
|
||||
T16 = FNMS(KP900968867, T15, T1);
|
||||
ri[WS(rs, 4)] = FNMS(KP974927912, T18, T16);
|
||||
ri[WS(rs, 3)] = FMA(KP974927912, T18, T16);
|
||||
}
|
||||
/* Imaginary parts of outputs 3 and 4. */
{
|
||||
E T1r, T1u, T1q, T1t, T1s;
|
||||
T1q = FNMS(KP356895867, T1b, T1a);
|
||||
T1r = FNMS(KP692021471, T1q, T19);
|
||||
T1t = FNMS(KP554958132, T1h, T1g);
|
||||
T1u = FNMS(KP801937735, T1t, T1i);
|
||||
T1s = FNMS(KP900968867, T1r, T1c);
|
||||
ii[WS(rs, 3)] = FMA(KP974927912, T1u, T1s);
|
||||
ii[WS(rs, 4)] = FNMS(KP974927912, T1u, T1s);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_dit_register) for the FMA build of t1_7.
 * The inner tuple repeats the counts quoted in the generator comment above
 * (18 additions, 12 multiplications, 54 fused multiply/add).
 * NOTE(review): remaining field meanings come from the ct_desc declaration,
 * which is not visible here.
 */
static const ct_desc desc = {
     7,
     "t1_7",
     twinstr,
     &GENUS,
     { 18, 12, 54, 0 },
     0,
     0,
     0
};
|
||||
|
||||
/* Registers the t1_7 kernel together with its descriptor on planner p. */
void X(codelet_t1_7)(planner *p)
{
     X(kdft_dit_register)(p, t1_7, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 72 FP additions, 60 FP multiplications,
|
||||
* (or, 36 additions, 24 multiplications, 36 fused multiply/add),
|
||||
* 29 stack variables, 6 constants, and 28 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_7 (non-FMA variant): size-7 "twiddle" DFT kernel, i.e. one radix-7
 * decimation-in-time step (see the generator command line above: -n 7).
 *
 * For every m in [mb, me) the loop
 *   - reads the seven complex inputs ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..6;
 *   - combines inputs 1..6 with the coefficient pairs W[2k-2], W[2k-1];
 *   - writes the seven results back to the same locations (in place).
 * Per iteration ri and ii advance by ms and W by 12 entries; W is first
 * offset by mb * 12.
 *
 * NOTE(review): FMA, FNMS, FNMA, DK, E, WS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; presumably FMA(a, b, c) = a * b + c,
 * FNMS(a, b, c) = c - a * b and FNMA(a, b, c) = -(a * b) - c -- confirm.
 * The statement order is generator output; do not reorder by hand.
 */
static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
|
||||
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
|
||||
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
|
||||
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
|
||||
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
|
||||
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
|
||||
E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ;
|
||||
/* Input 0 is used as-is (no W coefficient is applied to it). */
T1 = ri[0];
|
||||
TR = ii[0];
|
||||
/* Inputs 1 and 6: apply W[0..1] and W[10..11], then form sums and differences. */
{
|
||||
E T6, TA, Tb, TB;
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 1)];
|
||||
T5 = ii[WS(rs, 1)];
|
||||
T2 = W[0];
|
||||
T4 = W[1];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
TA = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = ri[WS(rs, 6)];
|
||||
Ta = ii[WS(rs, 6)];
|
||||
T7 = W[10];
|
||||
T9 = W[11];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
TB = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
Tc = T6 + Tb;
|
||||
TS = Tb - T6;
|
||||
TC = TA - TB;
|
||||
TO = TA + TB;
|
||||
}
|
||||
/* Inputs 2 and 5: apply W[2..3] and W[8..9], then form sums and differences. */
{
|
||||
E Th, TG, Tm, TH;
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = ri[WS(rs, 2)];
|
||||
Tg = ii[WS(rs, 2)];
|
||||
Td = W[2];
|
||||
Tf = W[3];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
TG = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
{
|
||||
E Tj, Tl, Ti, Tk;
|
||||
Tj = ri[WS(rs, 5)];
|
||||
Tl = ii[WS(rs, 5)];
|
||||
Ti = W[8];
|
||||
Tk = W[9];
|
||||
Tm = FMA(Ti, Tj, Tk * Tl);
|
||||
TH = FNMS(Tk, Tj, Ti * Tl);
|
||||
}
|
||||
Tn = Th + Tm;
|
||||
TT = Tm - Th;
|
||||
TI = TG - TH;
|
||||
TP = TG + TH;
|
||||
}
|
||||
/* Inputs 3 and 4: apply W[4..5] and W[6..7], then form sums and differences. */
{
|
||||
E Ts, TD, Tx, TE;
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 3)];
|
||||
Tr = ii[WS(rs, 3)];
|
||||
To = W[4];
|
||||
Tq = W[5];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
TD = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
{
|
||||
E Tu, Tw, Tt, Tv;
|
||||
Tu = ri[WS(rs, 4)];
|
||||
Tw = ii[WS(rs, 4)];
|
||||
Tt = W[6];
|
||||
Tv = W[7];
|
||||
Tx = FMA(Tt, Tu, Tv * Tw);
|
||||
TE = FNMS(Tv, Tu, Tt * Tw);
|
||||
}
|
||||
Ty = Ts + Tx;
|
||||
TU = Tx - Ts;
|
||||
TF = TD - TE;
|
||||
TQ = TD + TE;
|
||||
}
|
||||
/* Output 0: plain sum of input 0 and the three pair sums. */
ri[0] = T1 + Tc + Tn + Ty;
|
||||
ii[0] = TO + TP + TQ + TR;
|
||||
/* Outputs 2 and 5 (real and imaginary parts). */
{
|
||||
E TJ, Tz, TX, TY;
|
||||
TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
|
||||
Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
|
||||
ri[WS(rs, 5)] = Tz - TJ;
|
||||
ri[WS(rs, 2)] = Tz + TJ;
|
||||
TX = FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT);
|
||||
TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO);
|
||||
ii[WS(rs, 2)] = TX + TY;
|
||||
ii[WS(rs, 5)] = TY - TX;
|
||||
}
|
||||
/* Outputs 1 and 6 (real and imaginary parts). */
{
|
||||
E TL, TK, TV, TW;
|
||||
TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
|
||||
TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
|
||||
ri[WS(rs, 6)] = TK - TL;
|
||||
ri[WS(rs, 1)] = TK + TL;
|
||||
TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU);
|
||||
TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP);
|
||||
ii[WS(rs, 1)] = TV + TW;
|
||||
ii[WS(rs, 6)] = TW - TV;
|
||||
}
|
||||
/* Outputs 3 and 4 (real and imaginary parts). */
{
|
||||
E TN, TM, TZ, T10;
|
||||
TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
|
||||
TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
|
||||
ri[WS(rs, 4)] = TM - TN;
|
||||
ri[WS(rs, 3)] = TM + TN;
|
||||
TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT);
|
||||
T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO);
|
||||
ii[WS(rs, 3)] = TZ + T10;
|
||||
ii[WS(rs, 4)] = T10 - TZ;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_dit_register) for the non-FMA build of t1_7.
 * The inner tuple repeats the counts quoted in the generator comment above
 * (36 additions, 24 multiplications, 36 fused multiply/add).
 * NOTE(review): remaining field meanings come from the ct_desc declaration,
 * which is not visible here.
 */
static const ct_desc desc = {
     7,
     "t1_7",
     twinstr,
     &GENUS,
     { 36, 24, 36, 0 },
     0,
     0,
     0
};
|
||||
|
||||
/* Registers the t1_7 kernel together with its descriptor on planner p. */
void X(codelet_t1_7)(planner *p)
{
     X(kdft_dit_register)(p, t1_7, &desc);
}
|
||||
#endif
|
||||
376
fftw-3.3.10/dft/scalar/codelets/t1_8.c
Normal file
376
fftw-3.3.10/dft/scalar/codelets/t1_8.c
Normal file
@@ -0,0 +1,376 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 36 FP multiplications,
|
||||
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
|
||||
* 34 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_8 (FMA variant): size-8 "twiddle" DFT kernel, i.e. one radix-8
 * decimation-in-time step (see the generator command line above: -n 8).
 *
 * For every m in [mb, me) the loop
 *   - reads the eight complex inputs ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..7;
 *   - combines inputs 1..7 with the coefficient pairs W[2k-2], W[2k-1];
 *   - writes the eight results back to the same locations (in place).
 * Per iteration ri and ii advance by ms and W by 14 entries; W is first
 * offset by mb * 14.
 *
 * NOTE(review): FMA, FNMS, DK, E, WS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; presumably FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm before relying on this description.
 * The statement order is generator output; do not reorder by hand.
 */
static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
|
||||
E TX, Ty, TZ, TV, T10;
|
||||
/* Input 0 is used as-is (no W coefficient is applied to it). */
T1 = ri[0];
|
||||
T1m = ii[0];
|
||||
/* Input 4 combined with W[6..7]. */
{
|
||||
E T3, T6, T4, T1k, T2, T5;
|
||||
T3 = ri[WS(rs, 4)];
|
||||
T6 = ii[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = T2 * T3;
|
||||
T1k = T2 * T6;
|
||||
T5 = W[7];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
T1l = FNMS(T5, T3, T1k);
|
||||
}
|
||||
/* Input 6 combined with W[10..11]. */
{
|
||||
E Tg, Tj, Th, TR, Tf, Ti;
|
||||
Tg = ri[WS(rs, 6)];
|
||||
Tj = ii[WS(rs, 6)];
|
||||
Tf = W[10];
|
||||
Th = Tf * Tg;
|
||||
TR = Tf * Tj;
|
||||
Ti = W[11];
|
||||
Tk = FMA(Ti, Tj, Th);
|
||||
TS = FNMS(Ti, Tg, TR);
|
||||
}
|
||||
/* Input 2 combined with W[2..3]. */
{
|
||||
E Ta, Td, Tb, TP, T9, Tc;
|
||||
Ta = ri[WS(rs, 2)];
|
||||
Td = ii[WS(rs, 2)];
|
||||
T9 = W[2];
|
||||
Tb = T9 * Ta;
|
||||
TP = T9 * Td;
|
||||
Tc = W[3];
|
||||
Te = FMA(Tc, Td, Tb);
|
||||
TQ = FNMS(Tc, Ta, TP);
|
||||
}
|
||||
/* Inputs 7 and 3 combined with W[12..13] and W[4..5], plus their differences. */
{
|
||||
E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
|
||||
TB = ri[WS(rs, 7)];
|
||||
TE = ii[WS(rs, 7)];
|
||||
TA = W[12];
|
||||
TC = TA * TB;
|
||||
T13 = TA * TE;
|
||||
TH = ri[WS(rs, 3)];
|
||||
TK = ii[WS(rs, 3)];
|
||||
TG = W[4];
|
||||
TI = TG * TH;
|
||||
T15 = TG * TK;
|
||||
TD = W[13];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T14 = FNMS(TD, TB, T13);
|
||||
TJ = W[5];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T16 = FNMS(TJ, TH, T15);
|
||||
T12 = TF - TL;
|
||||
T17 = T14 - T16;
|
||||
}
|
||||
/* Inputs 1 and 5 combined with W[0..1] and W[8..9], plus their differences. */
{
|
||||
E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
|
||||
To = ri[WS(rs, 1)];
|
||||
Tr = ii[WS(rs, 1)];
|
||||
Tn = W[0];
|
||||
Tp = Tn * To;
|
||||
TW = Tn * Tr;
|
||||
Tu = ri[WS(rs, 5)];
|
||||
Tx = ii[WS(rs, 5)];
|
||||
Tt = W[8];
|
||||
Tv = Tt * Tu;
|
||||
TY = Tt * Tx;
|
||||
Tq = W[1];
|
||||
Ts = FMA(Tq, Tr, Tp);
|
||||
TX = FNMS(Tq, To, TW);
|
||||
Tw = W[9];
|
||||
Ty = FMA(Tw, Tx, Tv);
|
||||
TZ = FNMS(Tw, Tu, TY);
|
||||
TV = Ts - Ty;
|
||||
T10 = TX - TZ;
|
||||
}
|
||||
/* Odd-indexed outputs 1, 3, 5, 7 (the only ones scaled by KP707106781). */
{
|
||||
E TU, T1a, T1t, T1v, T19, T1w, T1d, T1u;
|
||||
{
|
||||
E TO, TT, T1r, T1s;
|
||||
TO = T1 - T7;
|
||||
TT = TQ - TS;
|
||||
TU = TO + TT;
|
||||
T1a = TO - TT;
|
||||
T1r = T1m - T1l;
|
||||
T1s = Te - Tk;
|
||||
T1t = T1r - T1s;
|
||||
T1v = T1s + T1r;
|
||||
}
|
||||
{
|
||||
E T11, T18, T1b, T1c;
|
||||
T11 = TV + T10;
|
||||
T18 = T12 - T17;
|
||||
T19 = T11 + T18;
|
||||
T1w = T18 - T11;
|
||||
T1b = T10 - TV;
|
||||
T1c = T12 + T17;
|
||||
T1d = T1b - T1c;
|
||||
T1u = T1b + T1c;
|
||||
}
|
||||
ri[WS(rs, 5)] = FNMS(KP707106781, T19, TU);
|
||||
ii[WS(rs, 5)] = FNMS(KP707106781, T1u, T1t);
|
||||
ri[WS(rs, 1)] = FMA(KP707106781, T19, TU);
|
||||
ii[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
|
||||
ri[WS(rs, 7)] = FNMS(KP707106781, T1d, T1a);
|
||||
ii[WS(rs, 7)] = FNMS(KP707106781, T1w, T1v);
|
||||
ri[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
|
||||
ii[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
|
||||
}
|
||||
/* Even-indexed outputs 0, 2, 4, 6 (additions and subtractions only). */
{
|
||||
E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
|
||||
{
|
||||
E T8, Tl, T1j, T1n;
|
||||
T8 = T1 + T7;
|
||||
Tl = Te + Tk;
|
||||
Tm = T8 + Tl;
|
||||
T1e = T8 - Tl;
|
||||
T1j = TQ + TS;
|
||||
T1n = T1l + T1m;
|
||||
T1o = T1j + T1n;
|
||||
T1q = T1n - T1j;
|
||||
}
|
||||
{
|
||||
E Tz, TM, T1f, T1g;
|
||||
Tz = Ts + Ty;
|
||||
TM = TF + TL;
|
||||
TN = Tz + TM;
|
||||
T1p = TM - Tz;
|
||||
T1f = TX + TZ;
|
||||
T1g = T14 + T16;
|
||||
T1h = T1f - T1g;
|
||||
T1i = T1f + T1g;
|
||||
}
|
||||
ri[WS(rs, 4)] = Tm - TN;
|
||||
ii[WS(rs, 4)] = T1o - T1i;
|
||||
ri[0] = Tm + TN;
|
||||
ii[0] = T1i + T1o;
|
||||
ri[WS(rs, 6)] = T1e - T1h;
|
||||
ii[WS(rs, 6)] = T1q - T1p;
|
||||
ri[WS(rs, 2)] = T1e + T1h;
|
||||
ii[WS(rs, 2)] = T1p + T1q;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_dit_register) for the FMA build of t1_8.
 * The inner tuple repeats the counts quoted in the generator comment above
 * (44 additions, 14 multiplications, 22 fused multiply/add).
 * NOTE(review): remaining field meanings come from the ct_desc declaration,
 * which is not visible here.
 */
static const ct_desc desc = {
     8,
     "t1_8",
     twinstr,
     &GENUS,
     { 44, 14, 22, 0 },
     0,
     0,
     0
};
|
||||
|
||||
/* Registers the t1_8 kernel together with its descriptor on planner p. */
void X(codelet_t1_8)(planner *p)
{
     X(kdft_dit_register)(p, t1_8, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 66 FP additions, 32 FP multiplications,
|
||||
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 28 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_8 (non-FMA variant): size-8 "twiddle" DFT kernel, i.e. one radix-8
 * decimation-in-time step (see the generator command line above: -n 8).
 *
 * For every m in [mb, me) the loop
 *   - reads the eight complex inputs ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..7;
 *   - combines inputs 1..7 with the coefficient pairs W[2k-2], W[2k-1];
 *   - writes the eight results back to the same locations (in place).
 * Per iteration ri and ii advance by ms and W by 14 entries; W is first
 * offset by mb * 14.
 *
 * NOTE(review): FMA, FNMS, DK, E, WS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; presumably FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm before relying on this description.
 * The statement order is generator output; do not reorder by hand.
 */
static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
|
||||
E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
|
||||
E TP;
|
||||
/* Inputs 0 and 4 (input 4 combined with W[6..7]): sums and differences. */
{
|
||||
E T1, T18, T6, T17;
|
||||
T1 = ri[0];
|
||||
T18 = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 4)];
|
||||
T5 = ii[WS(rs, 4)];
|
||||
T2 = W[6];
|
||||
T4 = W[7];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
T17 = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
T7 = T1 + T6;
|
||||
T1e = T18 - T17;
|
||||
TH = T1 - T6;
|
||||
T19 = T17 + T18;
|
||||
}
|
||||
/* Inputs 7 and 3 combined with W[12..13] and W[4..5]: sums and differences. */
{
|
||||
E Tz, TS, TE, TT;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = ri[WS(rs, 7)];
|
||||
Ty = ii[WS(rs, 7)];
|
||||
Tv = W[12];
|
||||
Tx = W[13];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
TS = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = ri[WS(rs, 3)];
|
||||
TD = ii[WS(rs, 3)];
|
||||
TA = W[4];
|
||||
TC = W[5];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
TT = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
TF = Tz + TE;
|
||||
T13 = TS + TT;
|
||||
TR = Tz - TE;
|
||||
TU = TS - TT;
|
||||
}
|
||||
/* Inputs 2 and 6 combined with W[2..3] and W[10..11]: sums and differences. */
{
|
||||
E Tc, TI, Th, TJ;
|
||||
{
|
||||
E T9, Tb, T8, Ta;
|
||||
T9 = ri[WS(rs, 2)];
|
||||
Tb = ii[WS(rs, 2)];
|
||||
T8 = W[2];
|
||||
Ta = W[3];
|
||||
Tc = FMA(T8, T9, Ta * Tb);
|
||||
TI = FNMS(Ta, T9, T8 * Tb);
|
||||
}
|
||||
{
|
||||
E Te, Tg, Td, Tf;
|
||||
Te = ri[WS(rs, 6)];
|
||||
Tg = ii[WS(rs, 6)];
|
||||
Td = W[10];
|
||||
Tf = W[11];
|
||||
Th = FMA(Td, Te, Tf * Tg);
|
||||
TJ = FNMS(Tf, Te, Td * Tg);
|
||||
}
|
||||
Ti = Tc + Th;
|
||||
T1f = Tc - Th;
|
||||
TK = TI - TJ;
|
||||
T16 = TI + TJ;
|
||||
}
|
||||
/* Inputs 1 and 5 combined with W[0..1] and W[8..9]: sums and differences. */
{
|
||||
E To, TN, Tt, TO;
|
||||
{
|
||||
E Tl, Tn, Tk, Tm;
|
||||
Tl = ri[WS(rs, 1)];
|
||||
Tn = ii[WS(rs, 1)];
|
||||
Tk = W[0];
|
||||
Tm = W[1];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
TN = FNMS(Tm, Tl, Tk * Tn);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tp, Tr;
|
||||
Tq = ri[WS(rs, 5)];
|
||||
Ts = ii[WS(rs, 5)];
|
||||
Tp = W[8];
|
||||
Tr = W[9];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
TO = FNMS(Tr, Tq, Tp * Ts);
|
||||
}
|
||||
Tu = To + Tt;
|
||||
T12 = TN + TO;
|
||||
TM = To - Tt;
|
||||
TP = TN - TO;
|
||||
}
|
||||
/* Output stage: even-indexed outputs first, then the odd-indexed ones scaled by KP707106781. */
{
|
||||
E Tj, TG, T1b, T1c;
|
||||
Tj = T7 + Ti;
|
||||
TG = Tu + TF;
|
||||
ri[WS(rs, 4)] = Tj - TG;
|
||||
ri[0] = Tj + TG;
|
||||
{
|
||||
E T15, T1a, T11, T14;
|
||||
T15 = T12 + T13;
|
||||
T1a = T16 + T19;
|
||||
ii[0] = T15 + T1a;
|
||||
ii[WS(rs, 4)] = T1a - T15;
|
||||
T11 = T7 - Ti;
|
||||
T14 = T12 - T13;
|
||||
ri[WS(rs, 6)] = T11 - T14;
|
||||
ri[WS(rs, 2)] = T11 + T14;
|
||||
}
|
||||
T1b = TF - Tu;
|
||||
T1c = T19 - T16;
|
||||
ii[WS(rs, 2)] = T1b + T1c;
|
||||
ii[WS(rs, 6)] = T1c - T1b;
|
||||
{
|
||||
E TX, T1g, T10, T1d, TY, TZ;
|
||||
TX = TH - TK;
|
||||
T1g = T1e - T1f;
|
||||
TY = TP - TM;
|
||||
TZ = TR + TU;
|
||||
T10 = KP707106781 * (TY - TZ);
|
||||
T1d = KP707106781 * (TY + TZ);
|
||||
ri[WS(rs, 7)] = TX - T10;
|
||||
ii[WS(rs, 5)] = T1g - T1d;
|
||||
ri[WS(rs, 3)] = TX + T10;
|
||||
ii[WS(rs, 1)] = T1d + T1g;
|
||||
}
|
||||
{
|
||||
E TL, T1i, TW, T1h, TQ, TV;
|
||||
TL = TH + TK;
|
||||
T1i = T1f + T1e;
|
||||
TQ = TM + TP;
|
||||
TV = TR - TU;
|
||||
TW = KP707106781 * (TQ + TV);
|
||||
T1h = KP707106781 * (TV - TQ);
|
||||
ri[WS(rs, 5)] = TL - TW;
|
||||
ii[WS(rs, 7)] = T1i - T1h;
|
||||
ri[WS(rs, 1)] = TL + TW;
|
||||
ii[WS(rs, 3)] = T1h + T1i;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 8 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_dit_register) for the non-FMA build of t1_8.
 * The inner tuple repeats the counts quoted in the generator comment above
 * (52 additions, 18 multiplications, 14 fused multiply/add).
 * NOTE(review): remaining field meanings come from the ct_desc declaration,
 * which is not visible here.
 */
static const ct_desc desc = {
     8,
     "t1_8",
     twinstr,
     &GENUS,
     { 52, 18, 14, 0 },
     0,
     0,
     0
};
|
||||
|
||||
/* Registers the t1_8 kernel together with its descriptor on planner p. */
void X(codelet_t1_8)(planner *p)
{
     X(kdft_dit_register)(p, t1_8, &desc);
}
|
||||
#endif
|
||||
487
fftw-3.3.10/dft/scalar/codelets/t1_9.c
Normal file
487
fftw-3.3.10/dft/scalar/codelets/t1_9.c
Normal file
@@ -0,0 +1,487 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 88 FP multiplications,
|
||||
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
|
||||
* 55 stack variables, 10 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_9 (FMA variant): size-9 "twiddle" DFT kernel, i.e. one radix-9
 * decimation-in-time step (see the generator command line above: -n 9).
 *
 * For every m in [mb, me) the loop
 *   - reads the nine complex inputs ri[WS(rs, k)] / ii[WS(rs, k)], k = 0..8;
 *   - combines inputs 1..8 with the coefficient pairs W[2k-2], W[2k-1];
 *   - writes the nine results back to the same locations (in place).
 * Per iteration ri and ii advance by ms and W by 16 entries; W is first
 * offset by mb * 16.
 *
 * NOTE(review): FMA, FNMS, DK, E, WS and MAKE_VOLATILE_STRIDE come from
 * headers not visible here; presumably FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b -- confirm before relying on this description.
 * The statement order is generator output; do not reorder by hand.
 */
static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
|
||||
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
|
||||
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
|
||||
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
|
||||
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
|
||||
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
|
||||
E T1, T1R, Te, T1W, T10, T1Q, T1l, T1r, Ty, T1p, Tl, T1o, T1g, T1q, T1a;
|
||||
E T1d, TS, T18, TF, T13, T19, T1c;
|
||||
/* Input 0 is used as-is (no W coefficient is applied to it). */
T1 = ri[0];
|
||||
T1R = ii[0];
|
||||
/* Inputs 3 and 6 combined with W[4..5] and W[10..11]: sums and differences. */
{
|
||||
E T3, T6, T4, TW, T9, Tc, Ta, TY, T2, T8;
|
||||
T3 = ri[WS(rs, 3)];
|
||||
T6 = ii[WS(rs, 3)];
|
||||
T2 = W[4];
|
||||
T4 = T2 * T3;
|
||||
TW = T2 * T6;
|
||||
T9 = ri[WS(rs, 6)];
|
||||
Tc = ii[WS(rs, 6)];
|
||||
T8 = W[10];
|
||||
Ta = T8 * T9;
|
||||
TY = T8 * Tc;
|
||||
{
|
||||
E T7, TX, Td, TZ, T5, Tb;
|
||||
T5 = W[5];
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TX = FNMS(T5, T3, TW);
|
||||
Tb = W[11];
|
||||
Td = FMA(Tb, Tc, Ta);
|
||||
TZ = FNMS(Tb, T9, TY);
|
||||
Te = T7 + Td;
|
||||
T1W = Td - T7;
|
||||
T10 = TX - TZ;
|
||||
T1Q = TX + TZ;
|
||||
}
|
||||
}
|
||||
/* Inputs 1, 7 and 4 combined with W[0..1], W[12..13] and W[6..7]. */
{
|
||||
E Th, Tk, Ti, T1n, Tx, T1i, Tr, T1k, Tg, Tj;
|
||||
Th = ri[WS(rs, 1)];
|
||||
Tk = ii[WS(rs, 1)];
|
||||
Tg = W[0];
|
||||
Ti = Tg * Th;
|
||||
T1n = Tg * Tk;
|
||||
{
|
||||
E Tt, Tw, Tu, T1h, Ts, Tv;
|
||||
Tt = ri[WS(rs, 7)];
|
||||
Tw = ii[WS(rs, 7)];
|
||||
Ts = W[12];
|
||||
Tu = Ts * Tt;
|
||||
T1h = Ts * Tw;
|
||||
Tv = W[13];
|
||||
Tx = FMA(Tv, Tw, Tu);
|
||||
T1i = FNMS(Tv, Tt, T1h);
|
||||
}
|
||||
{
|
||||
E Tn, Tq, To, T1j, Tm, Tp;
|
||||
Tn = ri[WS(rs, 4)];
|
||||
Tq = ii[WS(rs, 4)];
|
||||
Tm = W[6];
|
||||
To = Tm * Tn;
|
||||
T1j = Tm * Tq;
|
||||
Tp = W[7];
|
||||
Tr = FMA(Tp, Tq, To);
|
||||
T1k = FNMS(Tp, Tn, T1j);
|
||||
}
|
||||
T1l = T1i - T1k;
|
||||
T1r = Tr - Tx;
|
||||
Ty = Tr + Tx;
|
||||
T1p = T1k + T1i;
|
||||
Tj = W[1];
|
||||
Tl = FMA(Tj, Tk, Ti);
|
||||
T1o = FNMS(Tj, Th, T1n);
|
||||
T1g = FNMS(KP500000000, Ty, Tl);
|
||||
T1q = FNMS(KP500000000, T1p, T1o);
|
||||
}
|
||||
/* Inputs 2, 8 and 5 combined with W[2..3], W[14..15] and W[8..9]. */
{
|
||||
E TB, TE, TC, T12, TR, T17, TL, T15, TA, TD;
|
||||
TB = ri[WS(rs, 2)];
|
||||
TE = ii[WS(rs, 2)];
|
||||
TA = W[2];
|
||||
TC = TA * TB;
|
||||
T12 = TA * TE;
|
||||
{
|
||||
E TN, TQ, TO, T16, TM, TP;
|
||||
TN = ri[WS(rs, 8)];
|
||||
TQ = ii[WS(rs, 8)];
|
||||
TM = W[14];
|
||||
TO = TM * TN;
|
||||
T16 = TM * TQ;
|
||||
TP = W[15];
|
||||
TR = FMA(TP, TQ, TO);
|
||||
T17 = FNMS(TP, TN, T16);
|
||||
}
|
||||
{
|
||||
E TH, TK, TI, T14, TG, TJ;
|
||||
TH = ri[WS(rs, 5)];
|
||||
TK = ii[WS(rs, 5)];
|
||||
TG = W[8];
|
||||
TI = TG * TH;
|
||||
T14 = TG * TK;
|
||||
TJ = W[9];
|
||||
TL = FMA(TJ, TK, TI);
|
||||
T15 = FNMS(TJ, TH, T14);
|
||||
}
|
||||
T1a = TR - TL;
|
||||
T1d = T15 - T17;
|
||||
TS = TL + TR;
|
||||
T18 = T15 + T17;
|
||||
TD = W[3];
|
||||
TF = FMA(TD, TE, TC);
|
||||
T13 = FNMS(TD, TB, T12);
|
||||
T19 = FNMS(KP500000000, T18, T13);
|
||||
T1c = FNMS(KP500000000, TS, TF);
|
||||
}
|
||||
/* Outputs 0, 3 and 6. */
{
|
||||
E Tf, T1S, TU, T1U, T1O, T1P, T1L, T1T;
|
||||
Tf = T1 + Te;
|
||||
T1S = T1Q + T1R;
|
||||
{
|
||||
E Tz, TT, T1M, T1N;
|
||||
Tz = Tl + Ty;
|
||||
TT = TF + TS;
|
||||
TU = Tz + TT;
|
||||
T1U = TT - Tz;
|
||||
T1M = T1o + T1p;
|
||||
T1N = T13 + T18;
|
||||
T1O = T1M - T1N;
|
||||
T1P = T1M + T1N;
|
||||
}
|
||||
ri[0] = Tf + TU;
|
||||
ii[0] = T1P + T1S;
|
||||
T1L = FNMS(KP500000000, TU, Tf);
|
||||
ri[WS(rs, 6)] = FNMS(KP866025403, T1O, T1L);
|
||||
ri[WS(rs, 3)] = FMA(KP866025403, T1O, T1L);
|
||||
T1T = FNMS(KP500000000, T1P, T1S);
|
||||
ii[WS(rs, 3)] = FMA(KP866025403, T1U, T1T);
|
||||
ii[WS(rs, 6)] = FNMS(KP866025403, T1U, T1T);
|
||||
}
|
||||
/* Outputs 1, 2, 4, 5, 7 and 8. */
{
|
||||
E T11, T1z, T1X, T21, T1f, T1w, T1t, T1x, T1u, T1Y, T1C, T1I, T1F, T1J, T1G;
|
||||
E T22, TV, T1V;
|
||||
TV = FNMS(KP500000000, Te, T1);
|
||||
T11 = FMA(KP866025403, T10, TV);
|
||||
T1z = FNMS(KP866025403, T10, TV);
|
||||
T1V = FNMS(KP500000000, T1Q, T1R);
|
||||
T1X = FMA(KP866025403, T1W, T1V);
|
||||
T21 = FNMS(KP866025403, T1W, T1V);
|
||||
{
|
||||
E T1b, T1e, T1m, T1s;
|
||||
T1b = FMA(KP866025403, T1a, T19);
|
||||
T1e = FMA(KP866025403, T1d, T1c);
|
||||
T1f = FMA(KP176326980, T1e, T1b);
|
||||
T1w = FNMS(KP176326980, T1b, T1e);
|
||||
T1m = FNMS(KP866025403, T1l, T1g);
|
||||
T1s = FNMS(KP866025403, T1r, T1q);
|
||||
T1t = FMA(KP839099631, T1s, T1m);
|
||||
T1x = FNMS(KP839099631, T1m, T1s);
|
||||
}
|
||||
T1u = FMA(KP777861913, T1t, T1f);
|
||||
T1Y = FNMS(KP777861913, T1x, T1w);
|
||||
{
|
||||
E T1A, T1B, T1D, T1E;
|
||||
T1A = FMA(KP866025403, T1r, T1q);
|
||||
T1B = FMA(KP866025403, T1l, T1g);
|
||||
T1C = FMA(KP176326980, T1B, T1A);
|
||||
T1I = FNMS(KP176326980, T1A, T1B);
|
||||
T1D = FNMS(KP866025403, T1d, T1c);
|
||||
T1E = FNMS(KP866025403, T1a, T19);
|
||||
T1F = FNMS(KP363970234, T1E, T1D);
|
||||
T1J = FMA(KP363970234, T1D, T1E);
|
||||
}
|
||||
T1G = FNMS(KP954188894, T1F, T1C);
|
||||
T22 = FMA(KP954188894, T1J, T1I);
|
||||
ri[WS(rs, 1)] = FMA(KP984807753, T1u, T11);
|
||||
ii[WS(rs, 1)] = FNMS(KP984807753, T1Y, T1X);
|
||||
ri[WS(rs, 2)] = FMA(KP984807753, T1G, T1z);
|
||||
ii[WS(rs, 2)] = FNMS(KP984807753, T22, T21);
|
||||
{
|
||||
E T1v, T1y, T1Z, T20;
|
||||
T1v = FNMS(KP492403876, T1u, T11);
|
||||
T1y = FMA(KP777861913, T1x, T1w);
|
||||
ri[WS(rs, 4)] = FMA(KP852868531, T1y, T1v);
|
||||
ri[WS(rs, 7)] = FNMS(KP852868531, T1y, T1v);
|
||||
T1Z = FMA(KP492403876, T1Y, T1X);
|
||||
T20 = FNMS(KP777861913, T1t, T1f);
|
||||
ii[WS(rs, 4)] = FMA(KP852868531, T20, T1Z);
|
||||
ii[WS(rs, 7)] = FNMS(KP852868531, T20, T1Z);
|
||||
}
|
||||
{
|
||||
E T1H, T1K, T23, T24;
|
||||
T1H = FNMS(KP492403876, T1G, T1z);
|
||||
T1K = FNMS(KP954188894, T1J, T1I);
|
||||
ri[WS(rs, 5)] = FNMS(KP852868531, T1K, T1H);
|
||||
ri[WS(rs, 8)] = FMA(KP852868531, T1K, T1H);
|
||||
T23 = FMA(KP492403876, T22, T21);
|
||||
T24 = FMA(KP954188894, T1F, T1C);
|
||||
ii[WS(rs, 5)] = FNMS(KP852868531, T24, T23);
|
||||
ii[WS(rs, 8)] = FMA(KP852868531, T24, T23);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to X(kdft_dit_register) for the FMA build of t1_9.
 * The inner tuple repeats the counts quoted in the generator comment above
 * (24 additions, 16 multiplications, 72 fused multiply/add).
 * NOTE(review): remaining field meanings come from the ct_desc declaration,
 * which is not visible here.
 */
static const ct_desc desc = {
     9,
     "t1_9",
     twinstr,
     &GENUS,
     { 24, 16, 72, 0 },
     0,
     0,
     0
};
|
||||
|
||||
/* Registers the t1_9 kernel together with its descriptor on planner p. */
void X(codelet_t1_9)(planner *p)
{
     X(kdft_dit_register)(p, t1_9, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 96 FP additions, 72 FP multiplications,
|
||||
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
|
||||
* 41 stack variables, 8 constants, and 36 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t1_9: size-9 twiddle codelet, variant generated without the -fma option
 * (see the "Generated by" comment above).
 *
 *   ri, ii  real and imaginary data, transformed in place; element k is
 *           addressed as ri[WS(rs, k)] / ii[WS(rs, k)] for k = 0..8
 *   W       twiddle table; every iteration reads W[0..15] and then
 *           advances W by 16
 *   rs      stride passed to WS() to locate element k
 *   mb, me  the loop runs for m = mb, mb+1, ..., me-1
 *   ms      increment applied to ri and ii after each iteration
 *
 * In every iteration all nine inputs are loaded before any output is stored.
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; both macros are defined outside this block.
 */
static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: 0.9396.. = cos(pi/9), 0.3420.. = sin(pi/9),
   0.9848.. = sin(4pi/9), 0.1736.. = cos(4pi/9), 0.6427.. = sin(2pi/9),
   0.7660.. = cos(2pi/9), 0.8660.. = sqrt(3)/2. */
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
|
||||
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
|
||||
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
|
||||
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
|
||||
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
|
||||
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
|
||||
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
|
||||
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
|
||||
{
|
||||
INT m;
|
||||
/* W is first moved to the twiddles of iteration mb (16 values each). */
for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
|
||||
E T1, T1B, TQ, T1G, Tc, TN, T1A, T1H, TL, T1x, T17, T1o, T1c, T1n, Tu;
|
||||
E T1w, TW, T1k, T11, T1l;
|
||||
/* Elements 0, 3, 6.  Element 0 is used as is; elements 3 and 6 are
   combined with (W[4], W[5]) and (W[10], W[11]) as
   re = Wr*xr + Wi*xi, im = Wr*xi - Wi*xr. */
{
|
||||
E T6, TO, Tb, TP;
|
||||
T1 = ri[0];
|
||||
T1B = ii[0];
|
||||
{
|
||||
E T3, T5, T2, T4;
|
||||
T3 = ri[WS(rs, 3)];
|
||||
T5 = ii[WS(rs, 3)];
|
||||
T2 = W[4];
|
||||
T4 = W[5];
|
||||
T6 = FMA(T2, T3, T4 * T5);
|
||||
TO = FNMS(T4, T3, T2 * T5);
|
||||
}
|
||||
{
|
||||
E T8, Ta, T7, T9;
|
||||
T8 = ri[WS(rs, 6)];
|
||||
Ta = ii[WS(rs, 6)];
|
||||
T7 = W[10];
|
||||
T9 = W[11];
|
||||
Tb = FMA(T7, T8, T9 * Ta);
|
||||
TP = FNMS(T9, T8, T7 * Ta);
|
||||
}
|
||||
TQ = KP866025403 * (TO - TP);
|
||||
T1G = KP866025403 * (Tb - T6);
|
||||
Tc = T6 + Tb;
|
||||
TN = FNMS(KP500000000, Tc, T1);
|
||||
T1A = TO + TP;
|
||||
T1H = FNMS(KP500000000, T1A, T1B);
|
||||
}
|
||||
/* Elements 2, 5, 8 with twiddles W[2..3], W[8..9], W[14..15]. */
{
|
||||
E Tz, T19, TE, T14, TJ, T15, TK, T1a;
|
||||
{
|
||||
E Tw, Ty, Tv, Tx;
|
||||
Tw = ri[WS(rs, 2)];
|
||||
Ty = ii[WS(rs, 2)];
|
||||
Tv = W[2];
|
||||
Tx = W[3];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T19 = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
{
|
||||
E TB, TD, TA, TC;
|
||||
TB = ri[WS(rs, 5)];
|
||||
TD = ii[WS(rs, 5)];
|
||||
TA = W[8];
|
||||
TC = W[9];
|
||||
TE = FMA(TA, TB, TC * TD);
|
||||
T14 = FNMS(TC, TB, TA * TD);
|
||||
}
|
||||
{
|
||||
E TG, TI, TF, TH;
|
||||
TG = ri[WS(rs, 8)];
|
||||
TI = ii[WS(rs, 8)];
|
||||
TF = W[14];
|
||||
TH = W[15];
|
||||
TJ = FMA(TF, TG, TH * TI);
|
||||
T15 = FNMS(TH, TG, TF * TI);
|
||||
}
|
||||
TK = TE + TJ;
|
||||
T1a = T14 + T15;
|
||||
TL = Tz + TK;
|
||||
T1x = T19 + T1a;
|
||||
{
|
||||
E T13, T16, T18, T1b;
|
||||
T13 = FNMS(KP500000000, TK, Tz);
|
||||
T16 = KP866025403 * (T14 - T15);
|
||||
T17 = T13 + T16;
|
||||
T1o = T13 - T16;
|
||||
T18 = KP866025403 * (TJ - TE);
|
||||
T1b = FNMS(KP500000000, T1a, T19);
|
||||
T1c = T18 + T1b;
|
||||
T1n = T1b - T18;
|
||||
}
|
||||
}
|
||||
/* Elements 1, 4, 7 with twiddles W[0..1], W[6..7], W[12..13]. */
{
|
||||
E Ti, TY, Tn, TT, Ts, TU, Tt, TZ;
|
||||
{
|
||||
E Tf, Th, Te, Tg;
|
||||
Tf = ri[WS(rs, 1)];
|
||||
Th = ii[WS(rs, 1)];
|
||||
Te = W[0];
|
||||
Tg = W[1];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
TY = FNMS(Tg, Tf, Te * Th);
|
||||
}
|
||||
{
|
||||
E Tk, Tm, Tj, Tl;
|
||||
Tk = ri[WS(rs, 4)];
|
||||
Tm = ii[WS(rs, 4)];
|
||||
Tj = W[6];
|
||||
Tl = W[7];
|
||||
Tn = FMA(Tj, Tk, Tl * Tm);
|
||||
TT = FNMS(Tl, Tk, Tj * Tm);
|
||||
}
|
||||
{
|
||||
E Tp, Tr, To, Tq;
|
||||
Tp = ri[WS(rs, 7)];
|
||||
Tr = ii[WS(rs, 7)];
|
||||
To = W[12];
|
||||
Tq = W[13];
|
||||
Ts = FMA(To, Tp, Tq * Tr);
|
||||
TU = FNMS(Tq, Tp, To * Tr);
|
||||
}
|
||||
Tt = Tn + Ts;
|
||||
TZ = TT + TU;
|
||||
Tu = Ti + Tt;
|
||||
T1w = TY + TZ;
|
||||
{
|
||||
E TS, TV, TX, T10;
|
||||
TS = FNMS(KP500000000, Tt, Ti);
|
||||
TV = KP866025403 * (TT - TU);
|
||||
TW = TS + TV;
|
||||
T1k = TS - TV;
|
||||
TX = KP866025403 * (Ts - Tn);
|
||||
T10 = FNMS(KP500000000, TZ, TY);
|
||||
T11 = TX + T10;
|
||||
T1l = T10 - TX;
|
||||
}
|
||||
}
|
||||
/* Real outputs 0, 3, 6. */
{
|
||||
E T1y, Td, TM, T1v;
|
||||
T1y = KP866025403 * (T1w - T1x);
|
||||
Td = T1 + Tc;
|
||||
TM = Tu + TL;
|
||||
T1v = FNMS(KP500000000, TM, Td);
|
||||
ri[0] = Td + TM;
|
||||
ri[WS(rs, 3)] = T1v + T1y;
|
||||
ri[WS(rs, 6)] = T1v - T1y;
|
||||
}
|
||||
/* Imaginary outputs 0, 3, 6. */
{
|
||||
E T1D, T1z, T1C, T1E;
|
||||
T1D = KP866025403 * (TL - Tu);
|
||||
T1z = T1w + T1x;
|
||||
T1C = T1A + T1B;
|
||||
T1E = FNMS(KP500000000, T1z, T1C);
|
||||
ii[0] = T1z + T1C;
|
||||
ii[WS(rs, 6)] = T1E - T1D;
|
||||
ii[WS(rs, 3)] = T1D + T1E;
|
||||
}
|
||||
/* Outputs 1, 4, 7 (real and imaginary). */
{
|
||||
E TR, T1I, T1e, T1J, T1i, T1F, T1f, T1K;
|
||||
TR = TN + TQ;
|
||||
T1I = T1G + T1H;
|
||||
{
|
||||
E T12, T1d, T1g, T1h;
|
||||
T12 = FMA(KP766044443, TW, KP642787609 * T11);
|
||||
T1d = FMA(KP173648177, T17, KP984807753 * T1c);
|
||||
T1e = T12 + T1d;
|
||||
T1J = KP866025403 * (T1d - T12);
|
||||
T1g = FNMS(KP642787609, TW, KP766044443 * T11);
|
||||
T1h = FNMS(KP984807753, T17, KP173648177 * T1c);
|
||||
T1i = KP866025403 * (T1g - T1h);
|
||||
T1F = T1g + T1h;
|
||||
}
|
||||
ri[WS(rs, 1)] = TR + T1e;
|
||||
ii[WS(rs, 1)] = T1F + T1I;
|
||||
T1f = FNMS(KP500000000, T1e, TR);
|
||||
ri[WS(rs, 7)] = T1f - T1i;
|
||||
ri[WS(rs, 4)] = T1f + T1i;
|
||||
T1K = FNMS(KP500000000, T1F, T1I);
|
||||
ii[WS(rs, 4)] = T1J + T1K;
|
||||
ii[WS(rs, 7)] = T1K - T1J;
|
||||
}
|
||||
/* Outputs 2, 5, 8 (real and imaginary). */
{
|
||||
E T1j, T1M, T1q, T1N, T1u, T1L, T1r, T1O;
|
||||
T1j = TN - TQ;
|
||||
T1M = T1H - T1G;
|
||||
{
|
||||
E T1m, T1p, T1s, T1t;
|
||||
T1m = FMA(KP173648177, T1k, KP984807753 * T1l);
|
||||
T1p = FNMS(KP939692620, T1o, KP342020143 * T1n);
|
||||
T1q = T1m + T1p;
|
||||
T1N = KP866025403 * (T1p - T1m);
|
||||
T1s = FNMS(KP984807753, T1k, KP173648177 * T1l);
|
||||
T1t = FMA(KP342020143, T1o, KP939692620 * T1n);
|
||||
T1u = KP866025403 * (T1s + T1t);
|
||||
T1L = T1s - T1t;
|
||||
}
|
||||
ri[WS(rs, 2)] = T1j + T1q;
|
||||
ii[WS(rs, 2)] = T1L + T1M;
|
||||
T1r = FNMS(KP500000000, T1q, T1j);
|
||||
ri[WS(rs, 8)] = T1r - T1u;
|
||||
ri[WS(rs, 5)] = T1r + T1u;
|
||||
T1O = FNMS(KP500000000, T1L, T1M);
|
||||
ii[WS(rs, 5)] = T1N + T1O;
|
||||
ii[WS(rs, 8)] = T1O - T1N;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_FULL, 0, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, { 60, 36, 36, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t1_9) (planner *p) {
|
||||
X(kdft_dit_register) (p, t1_9, &desc);
|
||||
}
|
||||
#endif
|
||||
509
fftw-3.3.10/dft/scalar/codelets/t2_10.c
Normal file
509
fftw-3.3.10/dft/scalar/codelets/t2_10.c
Normal file
@@ -0,0 +1,509 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:37 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 114 FP additions, 94 FP multiplications,
|
||||
* (or, 48 additions, 28 multiplications, 66 fused multiply/add),
|
||||
* 63 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_10: size-10 twiddle codelet, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 *   ri, ii  real and imaginary data, transformed in place; element k is
 *           addressed as ri[WS(rs, k)] / ii[WS(rs, k)] for k = 0..9
 *   W       twiddle table; every iteration reads only W[0..5] and then
 *           advances W by 6
 *   rs      stride passed to WS() to locate element k
 *   mb, me  the loop runs for m = mb, mb+1, ..., me-1
 *   ms      increment applied to ri and ii after each iteration
 *
 * Only three twiddle pairs are loaded; the six other pairs are formed from
 * products of those three.  All ten inputs are loaded before any output is
 * stored.
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; both macros are defined outside this block.
 */
static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: 0.9510.. = sin(2pi/5), 0.5590.. = sqrt(5)/4,
   0.6180.. = (sqrt(5)-1)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
{
|
||||
INT m;
|
||||
/* W is first moved to the twiddles of iteration mb (6 values each). */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T2, T3, T8, Tc, T5, T6, Tl, T7, TB, TF, T12, TY, To, Ts, Tw;
|
||||
E Tb, Td, Th;
|
||||
/* Twiddle set-up.  Loaded pairs: (T2,T5) = W[0..1], (T3,T6) = W[2..3],
   (T8,Tc) = W[4..5]; they are applied to elements 1, 3 and 9.
   Derived pairs and the element each is applied to further down:
   (Tl,To) -> 2, (T7,Tb) -> 4, (Td,Th) -> 5, (TY,T12) -> 6,
   (Ts,Tw) -> 7, (TB,TF) -> 8. */
{
|
||||
E TA, TX, TE, T11, Ta, T4;
|
||||
T2 = W[0];
|
||||
T3 = W[2];
|
||||
T4 = T2 * T3;
|
||||
T8 = W[4];
|
||||
TA = T2 * T8;
|
||||
TX = T3 * T8;
|
||||
Tc = W[5];
|
||||
TE = T2 * Tc;
|
||||
T11 = T3 * Tc;
|
||||
T5 = W[1];
|
||||
T6 = W[3];
|
||||
Ta = T2 * T6;
|
||||
Tl = FMA(T5, T6, T4);
|
||||
T7 = FNMS(T5, T6, T4);
|
||||
TB = FMA(T5, Tc, TA);
|
||||
TF = FNMS(T5, T8, TE);
|
||||
T12 = FNMS(T6, T8, T11);
|
||||
TY = FMA(T6, Tc, TX);
|
||||
{
|
||||
E Tr, Tv, T9, Tg;
|
||||
Tr = Tl * T8;
|
||||
Tv = Tl * Tc;
|
||||
To = FNMS(T5, T3, Ta);
|
||||
Ts = FMA(To, Tc, Tr);
|
||||
Tw = FNMS(To, T8, Tv);
|
||||
T9 = T7 * T8;
|
||||
Tg = T7 * Tc;
|
||||
Tb = FMA(T5, T3, Ta);
|
||||
Td = FMA(Tb, Tc, T9);
|
||||
Th = FNMS(Tb, T8, Tg);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Tk, T1c, T24, T2d, TW, T19, T1a, T1P, T1Q, T1Z, T1g, T1h, T1i, T1C, T1H;
|
||||
E T2f, Tz, TM, TN, T1S, T1T, T1Y, T1d, T1e, T1f, T1r, T1w, T2e;
|
||||
/* Elements 0 and 5; element 5 is combined with (Td,Th) as
   re = Td*xr + Th*xi, im = Td*xi - Th*xr. */
{
|
||||
E T1, T23, Te, Tf, Ti, T21, Tj, T22;
|
||||
T1 = ri[0];
|
||||
T23 = ii[0];
|
||||
Te = ri[WS(rs, 5)];
|
||||
Tf = Td * Te;
|
||||
Ti = ii[WS(rs, 5)];
|
||||
T21 = Td * Ti;
|
||||
Tj = FMA(Th, Ti, Tf);
|
||||
Tk = T1 - Tj;
|
||||
T1c = T1 + Tj;
|
||||
T22 = FNMS(Th, Te, T21);
|
||||
T24 = T22 + T23;
|
||||
T2d = T23 - T22;
|
||||
}
|
||||
/* Elements 4, 1, 9, 6 and their pairwise sums and differences. */
{
|
||||
E TR, T1z, T18, T1G, TV, T1B, T14, T1E;
|
||||
{
|
||||
E TO, TP, TQ, T1y;
|
||||
TO = ri[WS(rs, 4)];
|
||||
TP = T7 * TO;
|
||||
TQ = ii[WS(rs, 4)];
|
||||
T1y = T7 * TQ;
|
||||
TR = FMA(Tb, TQ, TP);
|
||||
T1z = FNMS(Tb, TO, T1y);
|
||||
}
|
||||
{
|
||||
E T15, T16, T17, T1F;
|
||||
T15 = ri[WS(rs, 1)];
|
||||
T16 = T2 * T15;
|
||||
T17 = ii[WS(rs, 1)];
|
||||
T1F = T2 * T17;
|
||||
T18 = FMA(T5, T17, T16);
|
||||
T1G = FNMS(T5, T15, T1F);
|
||||
}
|
||||
{
|
||||
E TS, TT, TU, T1A;
|
||||
TS = ri[WS(rs, 9)];
|
||||
TT = T8 * TS;
|
||||
TU = ii[WS(rs, 9)];
|
||||
T1A = T8 * TU;
|
||||
TV = FMA(Tc, TU, TT);
|
||||
T1B = FNMS(Tc, TS, T1A);
|
||||
}
|
||||
{
|
||||
E TZ, T10, T13, T1D;
|
||||
TZ = ri[WS(rs, 6)];
|
||||
T10 = TY * TZ;
|
||||
T13 = ii[WS(rs, 6)];
|
||||
T1D = TY * T13;
|
||||
T14 = FMA(T12, T13, T10);
|
||||
T1E = FNMS(T12, TZ, T1D);
|
||||
}
|
||||
TW = TR - TV;
|
||||
T19 = T14 - T18;
|
||||
T1a = TW + T19;
|
||||
T1P = T1z + T1B;
|
||||
T1Q = T1E + T1G;
|
||||
T1Z = T1P + T1Q;
|
||||
T1g = TR + TV;
|
||||
T1h = T14 + T18;
|
||||
T1i = T1g + T1h;
|
||||
T1C = T1z - T1B;
|
||||
T1H = T1E - T1G;
|
||||
T2f = T1C + T1H;
|
||||
}
|
||||
/* Elements 2, 3, 7, 8 and their pairwise sums and differences. */
{
|
||||
E Tq, T1o, TL, T1v, Ty, T1q, TH, T1t;
|
||||
{
|
||||
E Tm, Tn, Tp, T1n;
|
||||
Tm = ri[WS(rs, 2)];
|
||||
Tn = Tl * Tm;
|
||||
Tp = ii[WS(rs, 2)];
|
||||
T1n = Tl * Tp;
|
||||
Tq = FMA(To, Tp, Tn);
|
||||
T1o = FNMS(To, Tm, T1n);
|
||||
}
|
||||
{
|
||||
E TI, TJ, TK, T1u;
|
||||
TI = ri[WS(rs, 3)];
|
||||
TJ = T3 * TI;
|
||||
TK = ii[WS(rs, 3)];
|
||||
T1u = T3 * TK;
|
||||
TL = FMA(T6, TK, TJ);
|
||||
T1v = FNMS(T6, TI, T1u);
|
||||
}
|
||||
{
|
||||
E Tt, Tu, Tx, T1p;
|
||||
Tt = ri[WS(rs, 7)];
|
||||
Tu = Ts * Tt;
|
||||
Tx = ii[WS(rs, 7)];
|
||||
T1p = Ts * Tx;
|
||||
Ty = FMA(Tw, Tx, Tu);
|
||||
T1q = FNMS(Tw, Tt, T1p);
|
||||
}
|
||||
{
|
||||
E TC, TD, TG, T1s;
|
||||
TC = ri[WS(rs, 8)];
|
||||
TD = TB * TC;
|
||||
TG = ii[WS(rs, 8)];
|
||||
T1s = TB * TG;
|
||||
TH = FMA(TF, TG, TD);
|
||||
T1t = FNMS(TF, TC, T1s);
|
||||
}
|
||||
Tz = Tq - Ty;
|
||||
TM = TH - TL;
|
||||
TN = Tz + TM;
|
||||
T1S = T1o + T1q;
|
||||
T1T = T1t + T1v;
|
||||
T1Y = T1S + T1T;
|
||||
T1d = Tq + Ty;
|
||||
T1e = TH + TL;
|
||||
T1f = T1d + T1e;
|
||||
T1r = T1o - T1q;
|
||||
T1w = T1t - T1v;
|
||||
T2e = T1r + T1w;
|
||||
}
|
||||
/* Real outputs 1, 3, 5, 7, 9. */
{
|
||||
E T1l, T1b, T1k, T1J, T1L, T1x, T1I, T1K, T1m;
|
||||
T1l = TN - T1a;
|
||||
T1b = TN + T1a;
|
||||
T1k = FNMS(KP250000000, T1b, Tk);
|
||||
T1x = T1r - T1w;
|
||||
T1I = T1C - T1H;
|
||||
T1J = FMA(KP618033988, T1I, T1x);
|
||||
T1L = FNMS(KP618033988, T1x, T1I);
|
||||
ri[WS(rs, 5)] = Tk + T1b;
|
||||
T1K = FNMS(KP559016994, T1l, T1k);
|
||||
ri[WS(rs, 7)] = FNMS(KP951056516, T1L, T1K);
|
||||
ri[WS(rs, 3)] = FMA(KP951056516, T1L, T1K);
|
||||
T1m = FMA(KP559016994, T1l, T1k);
|
||||
ri[WS(rs, 9)] = FNMS(KP951056516, T1J, T1m);
|
||||
ri[WS(rs, 1)] = FMA(KP951056516, T1J, T1m);
|
||||
}
|
||||
/* Imaginary outputs 1, 3, 5, 7, 9. */
{
|
||||
E T2i, T2g, T2h, T2m, T2o, T2k, T2l, T2n, T2j;
|
||||
T2i = T2e - T2f;
|
||||
T2g = T2e + T2f;
|
||||
T2h = FNMS(KP250000000, T2g, T2d);
|
||||
T2k = Tz - TM;
|
||||
T2l = TW - T19;
|
||||
T2m = FMA(KP618033988, T2l, T2k);
|
||||
T2o = FNMS(KP618033988, T2k, T2l);
|
||||
ii[WS(rs, 5)] = T2g + T2d;
|
||||
T2n = FNMS(KP559016994, T2i, T2h);
|
||||
ii[WS(rs, 3)] = FNMS(KP951056516, T2o, T2n);
|
||||
ii[WS(rs, 7)] = FMA(KP951056516, T2o, T2n);
|
||||
T2j = FMA(KP559016994, T2i, T2h);
|
||||
ii[WS(rs, 1)] = FNMS(KP951056516, T2m, T2j);
|
||||
ii[WS(rs, 9)] = FMA(KP951056516, T2m, T2j);
|
||||
}
|
||||
/* Real outputs 0, 2, 4, 6, 8. */
{
|
||||
E T1N, T1j, T1M, T1V, T1X, T1R, T1U, T1W, T1O;
|
||||
T1N = T1f - T1i;
|
||||
T1j = T1f + T1i;
|
||||
T1M = FNMS(KP250000000, T1j, T1c);
|
||||
T1R = T1P - T1Q;
|
||||
T1U = T1S - T1T;
|
||||
T1V = FNMS(KP618033988, T1U, T1R);
|
||||
T1X = FMA(KP618033988, T1R, T1U);
|
||||
ri[0] = T1c + T1j;
|
||||
T1W = FMA(KP559016994, T1N, T1M);
|
||||
ri[WS(rs, 4)] = FNMS(KP951056516, T1X, T1W);
|
||||
ri[WS(rs, 6)] = FMA(KP951056516, T1X, T1W);
|
||||
T1O = FNMS(KP559016994, T1N, T1M);
|
||||
ri[WS(rs, 2)] = FNMS(KP951056516, T1V, T1O);
|
||||
ri[WS(rs, 8)] = FMA(KP951056516, T1V, T1O);
|
||||
}
|
||||
/* Imaginary outputs 0, 2, 4, 6, 8. */
{
|
||||
E T26, T20, T25, T2a, T2c, T28, T29, T2b, T27;
|
||||
T26 = T1Y - T1Z;
|
||||
T20 = T1Y + T1Z;
|
||||
T25 = FNMS(KP250000000, T20, T24);
|
||||
T28 = T1g - T1h;
|
||||
T29 = T1d - T1e;
|
||||
T2a = FNMS(KP618033988, T29, T28);
|
||||
T2c = FMA(KP618033988, T28, T29);
|
||||
ii[0] = T20 + T24;
|
||||
T2b = FMA(KP559016994, T26, T25);
|
||||
ii[WS(rs, 4)] = FMA(KP951056516, T2c, T2b);
|
||||
ii[WS(rs, 6)] = FNMS(KP951056516, T2c, T2b);
|
||||
T27 = FNMS(KP559016994, T26, T25);
|
||||
ii[WS(rs, 2)] = FMA(KP951056516, T2a, T27);
|
||||
ii[WS(rs, 8)] = FNMS(KP951056516, T2a, T27);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, { 48, 28, 66, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_10) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_10, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 114 FP additions, 80 FP multiplications,
|
||||
* (or, 76 additions, 42 multiplications, 38 fused multiply/add),
|
||||
* 63 stack variables, 4 constants, and 40 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_10: size-10 twiddle codelet, variant compiled when neither
 * ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is defined.
 *
 *   ri, ii  real and imaginary data, transformed in place; element k is
 *           addressed as ri[WS(rs, k)] / ii[WS(rs, k)] for k = 0..9
 *   W       twiddle table; every iteration reads only W[0..5] and then
 *           advances W by 6
 *   rs      stride passed to WS() to locate element k
 *   mb, me  the loop runs for m = mb, mb+1, ..., me-1
 *   ms      increment applied to ri and ii after each iteration
 *
 * Only three twiddle pairs are loaded; the six other pairs are formed from
 * products of those three.  All ten inputs are loaded before any output is
 * stored.
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; both macros are defined outside this block.
 */
static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: 0.5877.. = sin(pi/5), 0.9510.. = sin(2pi/5),
   0.5590.. = sqrt(5)/4. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
|
||||
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
|
||||
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
|
||||
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
|
||||
{
|
||||
INT m;
|
||||
/* W is first moved to the twiddles of iteration mb (6 values each). */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
|
||||
E T2, T5, T3, T6, T8, Tm, Tc, Tk, T9, Td, Te, TM, TO, Tg, Tp;
|
||||
E Tv, Tx, Tr;
|
||||
/* Twiddle set-up.  Loaded pairs: (T2,T5) = W[0..1], (T3,T6) = W[2..3],
   (T9,Td) = W[4..5]; they are applied to elements 1, 3 and 9.
   Derived pairs and the element each is applied to further down:
   (Tk,Tm) -> 2, (T8,Tc) -> 4, (Te,Tg) -> 5, (TM,TO) -> 6,
   (Tp,Tr) -> 7, (Tv,Tx) -> 8. */
{
|
||||
E T4, Tb, T7, Ta;
|
||||
T2 = W[0];
|
||||
T5 = W[1];
|
||||
T3 = W[2];
|
||||
T6 = W[3];
|
||||
T4 = T2 * T3;
|
||||
Tb = T5 * T3;
|
||||
T7 = T5 * T6;
|
||||
Ta = T2 * T6;
|
||||
T8 = T4 - T7;
|
||||
Tm = Ta - Tb;
|
||||
Tc = Ta + Tb;
|
||||
Tk = T4 + T7;
|
||||
T9 = W[4];
|
||||
Td = W[5];
|
||||
Te = FMA(T8, T9, Tc * Td);
|
||||
TM = FMA(T3, T9, T6 * Td);
|
||||
TO = FNMS(T6, T9, T3 * Td);
|
||||
Tg = FNMS(Tc, T9, T8 * Td);
|
||||
Tp = FMA(Tk, T9, Tm * Td);
|
||||
Tv = FMA(T2, T9, T5 * Td);
|
||||
Tx = FNMS(T5, T9, T2 * Td);
|
||||
Tr = FNMS(Tm, T9, Tk * Td);
|
||||
}
|
||||
{
|
||||
E Tj, T1S, TX, T1G, TL, TU, TV, T1s, T1t, T1C, T11, T12, T13, T1h, T1k;
|
||||
E T1Q, Tu, TD, TE, T1v, T1w, T1B, TY, TZ, T10, T1a, T1d, T1P;
|
||||
/* Elements 0 and 5; element 5 is combined with (Te,Tg) as
   re = Te*xr + Tg*xi, im = Te*xi - Tg*xr. */
{
|
||||
E T1, T1F, Ti, T1E, Tf, Th;
|
||||
T1 = ri[0];
|
||||
T1F = ii[0];
|
||||
Tf = ri[WS(rs, 5)];
|
||||
Th = ii[WS(rs, 5)];
|
||||
Ti = FMA(Te, Tf, Tg * Th);
|
||||
T1E = FNMS(Tg, Tf, Te * Th);
|
||||
Tj = T1 - Ti;
|
||||
T1S = T1F - T1E;
|
||||
TX = T1 + Ti;
|
||||
T1G = T1E + T1F;
|
||||
}
|
||||
/* Elements 4, 1, 9, 6 and their pairwise sums and differences. */
{
|
||||
E TH, T1f, TT, T1j, TK, T1g, TQ, T1i;
|
||||
{
|
||||
E TF, TG, TR, TS;
|
||||
TF = ri[WS(rs, 4)];
|
||||
TG = ii[WS(rs, 4)];
|
||||
TH = FMA(T8, TF, Tc * TG);
|
||||
T1f = FNMS(Tc, TF, T8 * TG);
|
||||
TR = ri[WS(rs, 1)];
|
||||
TS = ii[WS(rs, 1)];
|
||||
TT = FMA(T2, TR, T5 * TS);
|
||||
T1j = FNMS(T5, TR, T2 * TS);
|
||||
}
|
||||
{
|
||||
E TI, TJ, TN, TP;
|
||||
TI = ri[WS(rs, 9)];
|
||||
TJ = ii[WS(rs, 9)];
|
||||
TK = FMA(T9, TI, Td * TJ);
|
||||
T1g = FNMS(Td, TI, T9 * TJ);
|
||||
TN = ri[WS(rs, 6)];
|
||||
TP = ii[WS(rs, 6)];
|
||||
TQ = FMA(TM, TN, TO * TP);
|
||||
T1i = FNMS(TO, TN, TM * TP);
|
||||
}
|
||||
TL = TH - TK;
|
||||
TU = TQ - TT;
|
||||
TV = TL + TU;
|
||||
T1s = T1f + T1g;
|
||||
T1t = T1i + T1j;
|
||||
T1C = T1s + T1t;
|
||||
T11 = TH + TK;
|
||||
T12 = TQ + TT;
|
||||
T13 = T11 + T12;
|
||||
T1h = T1f - T1g;
|
||||
T1k = T1i - T1j;
|
||||
T1Q = T1h + T1k;
|
||||
}
|
||||
/* Elements 2, 3, 7, 8 and their pairwise sums and differences. */
{
|
||||
E To, T18, TC, T1c, Tt, T19, Tz, T1b;
|
||||
{
|
||||
E Tl, Tn, TA, TB;
|
||||
Tl = ri[WS(rs, 2)];
|
||||
Tn = ii[WS(rs, 2)];
|
||||
To = FMA(Tk, Tl, Tm * Tn);
|
||||
T18 = FNMS(Tm, Tl, Tk * Tn);
|
||||
TA = ri[WS(rs, 3)];
|
||||
TB = ii[WS(rs, 3)];
|
||||
TC = FMA(T3, TA, T6 * TB);
|
||||
T1c = FNMS(T6, TA, T3 * TB);
|
||||
}
|
||||
{
|
||||
E Tq, Ts, Tw, Ty;
|
||||
Tq = ri[WS(rs, 7)];
|
||||
Ts = ii[WS(rs, 7)];
|
||||
Tt = FMA(Tp, Tq, Tr * Ts);
|
||||
T19 = FNMS(Tr, Tq, Tp * Ts);
|
||||
Tw = ri[WS(rs, 8)];
|
||||
Ty = ii[WS(rs, 8)];
|
||||
Tz = FMA(Tv, Tw, Tx * Ty);
|
||||
T1b = FNMS(Tx, Tw, Tv * Ty);
|
||||
}
|
||||
Tu = To - Tt;
|
||||
TD = Tz - TC;
|
||||
TE = Tu + TD;
|
||||
T1v = T18 + T19;
|
||||
T1w = T1b + T1c;
|
||||
T1B = T1v + T1w;
|
||||
TY = To + Tt;
|
||||
TZ = Tz + TC;
|
||||
T10 = TY + TZ;
|
||||
T1a = T18 - T19;
|
||||
T1d = T1b - T1c;
|
||||
T1P = T1a + T1d;
|
||||
}
|
||||
/* Real outputs 1, 3, 5, 7, 9. */
{
|
||||
E T15, TW, T16, T1m, T1o, T1e, T1l, T1n, T17;
|
||||
T15 = KP559016994 * (TE - TV);
|
||||
TW = TE + TV;
|
||||
T16 = FNMS(KP250000000, TW, Tj);
|
||||
T1e = T1a - T1d;
|
||||
T1l = T1h - T1k;
|
||||
T1m = FMA(KP951056516, T1e, KP587785252 * T1l);
|
||||
T1o = FNMS(KP587785252, T1e, KP951056516 * T1l);
|
||||
ri[WS(rs, 5)] = Tj + TW;
|
||||
T1n = T16 - T15;
|
||||
ri[WS(rs, 7)] = T1n - T1o;
|
||||
ri[WS(rs, 3)] = T1n + T1o;
|
||||
T17 = T15 + T16;
|
||||
ri[WS(rs, 9)] = T17 - T1m;
|
||||
ri[WS(rs, 1)] = T17 + T1m;
|
||||
}
|
||||
/* Imaginary outputs 1, 3, 5, 7, 9. */
{
|
||||
E T1R, T1T, T1U, T1Y, T20, T1W, T1X, T1Z, T1V;
|
||||
T1R = KP559016994 * (T1P - T1Q);
|
||||
T1T = T1P + T1Q;
|
||||
T1U = FNMS(KP250000000, T1T, T1S);
|
||||
T1W = Tu - TD;
|
||||
T1X = TL - TU;
|
||||
T1Y = FMA(KP951056516, T1W, KP587785252 * T1X);
|
||||
T20 = FNMS(KP587785252, T1W, KP951056516 * T1X);
|
||||
ii[WS(rs, 5)] = T1T + T1S;
|
||||
T1Z = T1U - T1R;
|
||||
ii[WS(rs, 3)] = T1Z - T20;
|
||||
ii[WS(rs, 7)] = T20 + T1Z;
|
||||
T1V = T1R + T1U;
|
||||
ii[WS(rs, 1)] = T1V - T1Y;
|
||||
ii[WS(rs, 9)] = T1Y + T1V;
|
||||
}
|
||||
/* Real outputs 0, 2, 4, 6, 8. */
{
|
||||
E T1q, T14, T1p, T1y, T1A, T1u, T1x, T1z, T1r;
|
||||
T1q = KP559016994 * (T10 - T13);
|
||||
T14 = T10 + T13;
|
||||
T1p = FNMS(KP250000000, T14, TX);
|
||||
T1u = T1s - T1t;
|
||||
T1x = T1v - T1w;
|
||||
T1y = FNMS(KP587785252, T1x, KP951056516 * T1u);
|
||||
T1A = FMA(KP951056516, T1x, KP587785252 * T1u);
|
||||
ri[0] = TX + T14;
|
||||
T1z = T1q + T1p;
|
||||
ri[WS(rs, 4)] = T1z - T1A;
|
||||
ri[WS(rs, 6)] = T1z + T1A;
|
||||
T1r = T1p - T1q;
|
||||
ri[WS(rs, 2)] = T1r - T1y;
|
||||
ri[WS(rs, 8)] = T1r + T1y;
|
||||
}
|
||||
/* Imaginary outputs 0, 2, 4, 6, 8. */
{
|
||||
E T1L, T1D, T1K, T1J, T1N, T1H, T1I, T1O, T1M;
|
||||
T1L = KP559016994 * (T1B - T1C);
|
||||
T1D = T1B + T1C;
|
||||
T1K = FNMS(KP250000000, T1D, T1G);
|
||||
T1H = T11 - T12;
|
||||
T1I = TY - TZ;
|
||||
T1J = FNMS(KP587785252, T1I, KP951056516 * T1H);
|
||||
T1N = FMA(KP951056516, T1I, KP587785252 * T1H);
|
||||
ii[0] = T1D + T1G;
|
||||
T1O = T1L + T1K;
|
||||
ii[WS(rs, 4)] = T1N + T1O;
|
||||
ii[WS(rs, 6)] = T1O - T1N;
|
||||
T1M = T1K - T1L;
|
||||
ii[WS(rs, 2)] = T1J + T1M;
|
||||
ii[WS(rs, 8)] = T1M - T1J;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 9 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, { 76, 42, 38, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_10) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_10, &desc);
|
||||
}
|
||||
#endif
|
||||
836
fftw-3.3.10/dft/scalar/codelets/t2_16.c
Normal file
836
fftw-3.3.10/dft/scalar/codelets/t2_16.c
Normal file
@@ -0,0 +1,836 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 134 FP multiplications,
|
||||
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
|
||||
* 90 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_16: size-16 twiddle codelet, variant compiled when ARCH_PREFERS_FMA or
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 *   ri, ii  real and imaginary data, transformed in place; element k is
 *           addressed as ri[WS(rs, k)] / ii[WS(rs, k)] for k = 0..15
 *   W       twiddle table; every iteration reads only W[0..7] and then
 *           advances W by 8
 *   rs      stride passed to WS() to locate element k
 *   mb, me  the loop runs for m = mb, mb+1, ..., me-1
 *   ms      increment applied to ri and ii after each iteration
 *
 * Only four twiddle pairs are loaded; the eleven other pairs are formed
 * from products of those four.  All sixteen inputs are loaded before any
 * output is stored.
 * NOTE(review): the comments below assume FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b; both macros are defined outside this block.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
|
||||
{
|
||||
/* Numerically: 0.9238.. = cos(pi/8), 0.4142.. = sqrt(2) - 1,
   0.7071.. = sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
|
||||
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
|
||||
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
|
||||
{
|
||||
INT m;
|
||||
/* W is first moved to the twiddles of iteration mb (8 values each). */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
|
||||
E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
|
||||
E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
|
||||
/* Twiddle set-up.  Loaded pairs: (T2,T5) = W[0..1], (Tf,Th) = W[2..3],
   (T3,T6) = W[4..5], (TM,TO) = W[6..7]; they are applied to elements
   1, 3, 9 and 15.  Derived pairs and the element each is applied to
   further down: (Tz,TC) -> 2, (Ti,Tm) -> 4, (T1f,T1j) -> 5,
   (TW,TZ) -> 6, (T1z,T1D) -> 7, (T7,Tb) -> 8, (TF,TI) -> 10,
   (T1L,T1O) -> 11, (Tq,Tu) -> 12, (T1m,T1p) -> 13, (TP,TT) -> 14. */
{
|
||||
E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
|
||||
T2 = W[0];
|
||||
Tf = W[2];
|
||||
Tg = T2 * Tf;
|
||||
TM = W[6];
|
||||
TN = T2 * TM;
|
||||
TO = W[7];
|
||||
TS = T2 * TO;
|
||||
T3 = W[4];
|
||||
T4 = T2 * T3;
|
||||
Tp = Tf * T3;
|
||||
T6 = W[5];
|
||||
Ta = T2 * T6;
|
||||
Tt = Tf * T6;
|
||||
T5 = W[1];
|
||||
Th = W[3];
|
||||
Tl = T2 * Th;
|
||||
Tz = FMA(T5, Th, Tg);
|
||||
Ti = FNMS(T5, Th, Tg);
|
||||
T7 = FMA(T5, T6, T4);
|
||||
TZ = FNMS(Th, T3, Tt);
|
||||
TT = FNMS(T5, TM, TS);
|
||||
Tq = FNMS(Th, T6, Tp);
|
||||
TW = FMA(Th, T6, Tp);
|
||||
Tb = FNMS(T5, T3, Ta);
|
||||
Tu = FMA(Th, T3, Tt);
|
||||
TP = FMA(T5, TO, TN);
|
||||
TI = FMA(T5, T3, Ta);
|
||||
TF = FNMS(T5, T6, T4);
|
||||
{
|
||||
E T1y, T1C, T1e, T1i;
|
||||
T1y = Tz * T3;
|
||||
T1C = Tz * T6;
|
||||
TC = FNMS(T5, Tf, Tl);
|
||||
T1z = FMA(TC, T6, T1y);
|
||||
T1O = FMA(TC, T3, T1C);
|
||||
T1D = FNMS(TC, T3, T1C);
|
||||
T1L = FNMS(TC, T6, T1y);
|
||||
T1e = Ti * T3;
|
||||
T1i = Ti * T6;
|
||||
Tm = FMA(T5, Tf, Tl);
|
||||
T1f = FMA(Tm, T6, T1e);
|
||||
T1p = FMA(Tm, T3, T1i);
|
||||
T1j = FNMS(Tm, T3, T1i);
|
||||
T1m = FNMS(Tm, T6, T1e);
|
||||
}
|
||||
}
|
||||
{
|
||||
E Te, T1U, T3A, T3L, T1G, T2D, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M, T1Z;
|
||||
E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
|
||||
E T2d, T38;
|
||||
/* Elements 0 and 8; element 8 is combined with (T7,Tb) as
   re = T7*xr + Tb*xi, im = T7*xi - Tb*xr. */
{
|
||||
E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
|
||||
T1 = ri[0];
|
||||
T3z = ii[0];
|
||||
T8 = ri[WS(rs, 8)];
|
||||
T9 = T7 * T8;
|
||||
Tc = ii[WS(rs, 8)];
|
||||
T3x = T7 * Tc;
|
||||
Td = FMA(Tb, Tc, T9);
|
||||
Te = T1 + Td;
|
||||
T1U = T1 - Td;
|
||||
T3y = FNMS(Tb, T8, T3x);
|
||||
T3A = T3y + T3z;
|
||||
T3L = T3z - T3y;
|
||||
}
|
||||
/* Elements 15 and 7. */
{
|
||||
E T1u, T1v, T1w, T2w, T1A, T1B, T1E, T2y;
|
||||
T1u = ri[WS(rs, 15)];
|
||||
T1v = TM * T1u;
|
||||
T1w = ii[WS(rs, 15)];
|
||||
T2w = TM * T1w;
|
||||
T1A = ri[WS(rs, 7)];
|
||||
T1B = T1z * T1A;
|
||||
T1E = ii[WS(rs, 7)];
|
||||
T2y = T1z * T1E;
|
||||
{
|
||||
E T1x, T1F, T2x, T2z;
|
||||
T1x = FMA(TO, T1w, T1v);
|
||||
T1F = FMA(T1D, T1E, T1B);
|
||||
T1G = T1x + T1F;
|
||||
T2D = T1x - T1F;
|
||||
T2x = FNMS(TO, T1u, T2w);
|
||||
T2z = FNMS(T1D, T1A, T2y);
|
||||
T2A = T2x - T2z;
|
||||
T3h = T2x + T2z;
|
||||
}
|
||||
}
|
||||
/* Elements 3 and 11. */
{
|
||||
E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
|
||||
T1H = ri[WS(rs, 3)];
|
||||
T1I = Tf * T1H;
|
||||
T1J = ii[WS(rs, 3)];
|
||||
T2E = Tf * T1J;
|
||||
T1M = ri[WS(rs, 11)];
|
||||
T1N = T1L * T1M;
|
||||
T1P = ii[WS(rs, 11)];
|
||||
T2G = T1L * T1P;
|
||||
{
|
||||
E T1K, T1Q, T2F, T2H;
|
||||
T1K = FMA(Th, T1J, T1I);
|
||||
T1Q = FMA(T1O, T1P, T1N);
|
||||
T1R = T1K + T1Q;
|
||||
T2B = T1K - T1Q;
|
||||
T2F = FNMS(Th, T1H, T2E);
|
||||
T2H = FNMS(T1O, T1M, T2G);
|
||||
T2I = T2F - T2H;
|
||||
T3i = T2F + T2H;
|
||||
}
|
||||
}
|
||||
/* Elements 4 and 12. */
{
|
||||
E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
|
||||
Tj = ri[WS(rs, 4)];
|
||||
Tk = Ti * Tj;
|
||||
Tn = ii[WS(rs, 4)];
|
||||
T1V = Ti * Tn;
|
||||
Tr = ri[WS(rs, 12)];
|
||||
Ts = Tq * Tr;
|
||||
Tv = ii[WS(rs, 12)];
|
||||
T1X = Tq * Tv;
|
||||
{
|
||||
E To, Tw, T1W, T1Y;
|
||||
To = FMA(Tm, Tn, Tk);
|
||||
Tw = FMA(Tu, Tv, Ts);
|
||||
Tx = To + Tw;
|
||||
T3M = To - Tw;
|
||||
T1W = FNMS(Tm, Tj, T1V);
|
||||
T1Y = FNMS(Tu, Tr, T1X);
|
||||
T1Z = T1W - T1Y;
|
||||
T3w = T1W + T1Y;
|
||||
}
|
||||
}
|
||||
/* Elements 2 and 10. */
{
|
||||
E TA, TB, TD, T21, TG, TH, TJ, T23;
|
||||
TA = ri[WS(rs, 2)];
|
||||
TB = Tz * TA;
|
||||
TD = ii[WS(rs, 2)];
|
||||
T21 = Tz * TD;
|
||||
TG = ri[WS(rs, 10)];
|
||||
TH = TF * TG;
|
||||
TJ = ii[WS(rs, 10)];
|
||||
T23 = TF * TJ;
|
||||
{
|
||||
E TE, TK, T22, T24;
|
||||
TE = FMA(TC, TD, TB);
|
||||
TK = FMA(TI, TJ, TH);
|
||||
TL = TE + TK;
|
||||
T26 = TE - TK;
|
||||
T22 = FNMS(TC, TA, T21);
|
||||
T24 = FNMS(TI, TG, T23);
|
||||
T25 = T22 - T24;
|
||||
T37 = T22 + T24;
|
||||
}
|
||||
}
|
||||
/* Elements 1 and 9. */
{
|
||||
E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
|
||||
T15 = ri[WS(rs, 1)];
|
||||
T16 = T2 * T15;
|
||||
T17 = ii[WS(rs, 1)];
|
||||
T2h = T2 * T17;
|
||||
T19 = ri[WS(rs, 9)];
|
||||
T1a = T3 * T19;
|
||||
T1b = ii[WS(rs, 9)];
|
||||
T2j = T3 * T1b;
|
||||
{
|
||||
E T18, T1c, T2i, T2k;
|
||||
T18 = FMA(T5, T17, T16);
|
||||
T1c = FMA(T6, T1b, T1a);
|
||||
T1d = T18 + T1c;
|
||||
T2o = T18 - T1c;
|
||||
T2i = FNMS(T5, T15, T2h);
|
||||
T2k = FNMS(T6, T19, T2j);
|
||||
T2l = T2i - T2k;
|
||||
T3c = T2i + T2k;
|
||||
}
|
||||
}
|
||||
/* Elements 5 and 13. */
{
|
||||
E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
|
||||
T1g = ri[WS(rs, 5)];
|
||||
T1h = T1f * T1g;
|
||||
T1k = ii[WS(rs, 5)];
|
||||
T2p = T1f * T1k;
|
||||
T1n = ri[WS(rs, 13)];
|
||||
T1o = T1m * T1n;
|
||||
T1q = ii[WS(rs, 13)];
|
||||
T2r = T1m * T1q;
|
||||
{
|
||||
E T1l, T1r, T2q, T2s;
|
||||
T1l = FMA(T1j, T1k, T1h);
|
||||
T1r = FMA(T1p, T1q, T1o);
|
||||
T1s = T1l + T1r;
|
||||
T2m = T1l - T1r;
|
||||
T2q = FNMS(T1j, T1g, T2p);
|
||||
T2s = FNMS(T1p, T1n, T2r);
|
||||
T2t = T2q - T2s;
|
||||
T3d = T2q + T2s;
|
||||
}
|
||||
}
|
||||
/* Elements 14 and 6. */
{
|
||||
E TQ, TR, TU, T29, TX, TY, T10, T2b;
|
||||
TQ = ri[WS(rs, 14)];
|
||||
TR = TP * TQ;
|
||||
TU = ii[WS(rs, 14)];
|
||||
T29 = TP * TU;
|
||||
TX = ri[WS(rs, 6)];
|
||||
TY = TW * TX;
|
||||
T10 = ii[WS(rs, 6)];
|
||||
T2b = TW * T10;
|
||||
{
|
||||
E TV, T11, T2a, T2c;
|
||||
TV = FMA(TT, TU, TR);
|
||||
T11 = FMA(TZ, T10, TY);
|
||||
T12 = TV + T11;
|
||||
T28 = TV - T11;
|
||||
T2a = FNMS(TT, TQ, T29);
|
||||
T2c = FNMS(TZ, TX, T2b);
|
||||
T2d = T2a - T2c;
|
||||
T38 = T2a + T2c;
|
||||
}
|
||||
}
|
||||
/* Outputs 0, 4, 8, 12 (real and imaginary). */
{
|
||||
E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
|
||||
{
|
||||
E Ty, T13, T3v, T3B;
|
||||
Ty = Te + Tx;
|
||||
T13 = TL + T12;
|
||||
T14 = Ty + T13;
|
||||
T3q = Ty - T13;
|
||||
T3v = T37 + T38;
|
||||
T3B = T3w + T3A;
|
||||
T3C = T3v + T3B;
|
||||
T3E = T3B - T3v;
|
||||
}
|
||||
{
|
||||
E T1t, T1S, T3r, T3s;
|
||||
T1t = T1d + T1s;
|
||||
T1S = T1G + T1R;
|
||||
T1T = T1t + T1S;
|
||||
T3D = T1S - T1t;
|
||||
T3r = T3c + T3d;
|
||||
T3s = T3h + T3i;
|
||||
T3t = T3r - T3s;
|
||||
T3u = T3r + T3s;
|
||||
}
|
||||
ri[WS(rs, 8)] = T14 - T1T;
|
||||
ii[WS(rs, 8)] = T3C - T3u;
|
||||
ri[0] = T14 + T1T;
|
||||
ii[0] = T3u + T3C;
|
||||
ri[WS(rs, 12)] = T3q - T3t;
|
||||
ii[WS(rs, 12)] = T3E - T3D;
|
||||
ri[WS(rs, 4)] = T3q + T3t;
|
||||
ii[WS(rs, 4)] = T3D + T3E;
|
||||
}
|
||||
/* Outputs 2, 6, 10, 14 (real and imaginary). */
{
|
||||
E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
|
||||
{
|
||||
E T36, T39, T3F, T3G;
|
||||
T36 = Te - Tx;
|
||||
T39 = T37 - T38;
|
||||
T3a = T36 + T39;
|
||||
T3m = T36 - T39;
|
||||
T3F = T12 - TL;
|
||||
T3G = T3A - T3w;
|
||||
T3H = T3F + T3G;
|
||||
T3J = T3G - T3F;
|
||||
}
|
||||
{
|
||||
E T3b, T3e, T3g, T3j;
|
||||
T3b = T1d - T1s;
|
||||
T3e = T3c - T3d;
|
||||
T3f = T3b + T3e;
|
||||
T3n = T3e - T3b;
|
||||
T3g = T1G - T1R;
|
||||
T3j = T3h - T3i;
|
||||
T3k = T3g - T3j;
|
||||
T3o = T3g + T3j;
|
||||
}
|
||||
{
|
||||
E T3l, T3I, T3p, T3K;
|
||||
T3l = T3f + T3k;
|
||||
ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a);
|
||||
ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
|
||||
T3I = T3n + T3o;
|
||||
ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
|
||||
ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H);
|
||||
T3p = T3n - T3o;
|
||||
ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m);
|
||||
ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
|
||||
T3K = T3k - T3f;
|
||||
ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
|
||||
ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J);
|
||||
}
|
||||
}
|
||||
/* Odd-indexed outputs 1, 3, 5, ..., 15 (real and imaginary). */
{
|
||||
E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
|
||||
E T2O;
|
||||
{
|
||||
E T27, T2e, T2n, T2u;
|
||||
T20 = T1U - T1Z;
|
||||
T3N = T3L - T3M;
|
||||
T3T = T3M + T3L;
|
||||
T2Q = T1U + T1Z;
|
||||
T27 = T25 - T26;
|
||||
T2e = T28 + T2d;
|
||||
T2f = T27 - T2e;
|
||||
T3O = T27 + T2e;
|
||||
{
|
||||
E T2Y, T2Z, T2R, T2S;
|
||||
T2Y = T2D + T2I;
|
||||
T2Z = T2A - T2B;
|
||||
T30 = FNMS(KP414213562, T2Z, T2Y);
|
||||
T34 = FMA(KP414213562, T2Y, T2Z);
|
||||
T2R = T26 + T25;
|
||||
T2S = T28 - T2d;
|
||||
T2T = T2R + T2S;
|
||||
T3U = T2S - T2R;
|
||||
}
|
||||
T2n = T2l + T2m;
|
||||
T2u = T2o - T2t;
|
||||
T2v = FMA(KP414213562, T2u, T2n);
|
||||
T2N = FNMS(KP414213562, T2n, T2u);
|
||||
{
|
||||
E T2V, T2W, T2C, T2J;
|
||||
T2V = T2o + T2t;
|
||||
T2W = T2l - T2m;
|
||||
T2X = FMA(KP414213562, T2W, T2V);
|
||||
T33 = FNMS(KP414213562, T2V, T2W);
|
||||
T2C = T2A + T2B;
|
||||
T2J = T2D - T2I;
|
||||
T2K = FNMS(KP414213562, T2J, T2C);
|
||||
T2O = FMA(KP414213562, T2C, T2J);
|
||||
}
|
||||
}
|
||||
{
|
||||
E T2g, T2L, T3V, T3W;
|
||||
T2g = FMA(KP707106781, T2f, T20);
|
||||
T2L = T2v - T2K;
|
||||
ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g);
|
||||
ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
|
||||
T3V = FMA(KP707106781, T3U, T3T);
|
||||
T3W = T2O - T2N;
|
||||
ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
|
||||
ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V);
|
||||
}
|
||||
{
|
||||
E T2M, T2P, T3X, T3Y;
|
||||
T2M = FNMS(KP707106781, T2f, T20);
|
||||
T2P = T2N + T2O;
|
||||
ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
|
||||
ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M);
|
||||
T3X = FNMS(KP707106781, T3U, T3T);
|
||||
T3Y = T2v + T2K;
|
||||
ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X);
|
||||
ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X);
|
||||
}
|
||||
{
|
||||
E T2U, T31, T3P, T3Q;
|
||||
T2U = FMA(KP707106781, T2T, T2Q);
|
||||
T31 = T2X + T30;
|
||||
ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U);
|
||||
ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
|
||||
T3P = FMA(KP707106781, T3O, T3N);
|
||||
T3Q = T33 + T34;
|
||||
ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
|
||||
ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P);
|
||||
}
|
||||
{
|
||||
E T32, T35, T3R, T3S;
|
||||
T32 = FNMS(KP707106781, T2T, T2Q);
|
||||
T35 = T33 - T34;
|
||||
ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32);
|
||||
ri[WS(rs, 5)] = FMA(KP923879532, T35, T32);
|
||||
T3R = FNMS(KP707106781, T3O, T3N);
|
||||
T3S = T30 - T2X;
|
||||
ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
|
||||
ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 9 },
|
||||
{ TW_CEXP, 0, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 104, 42, 92, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_16) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_16, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 196 FP additions, 108 FP multiplications,
|
||||
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
|
||||
* 82 stack variables, 3 constants, and 64 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_16: size-16 twiddle codelet, plain multiply/add variant (the #else
 * branch, used when neither FMA-preference macro is defined).
 *
 * For every m in [mb, me) the loop body
 *   1. reads eight reals W[0..7] and builds the remaining twiddle factors
 *      from products of them,
 *   2. loads the 16 complex inputs ri/ii[WS(rs, k)], k = 0..15, scaling
 *      every input except k = 0 by its twiddle factor,
 *   3. combines them with add/subtract butterflies and the three constants
 *      declared below, and
 *   4. stores the 16 results back to the same ri/ii locations.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays, read and overwritten; both are
 *           advanced by ms after each iteration.
 *   W       twiddle table; offset by mb * 8 before the first iteration and
 *           advanced by 8 per iteration.
 *   rs      stride argument forwarded to WS(rs, k) to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * NOTE(review): FMA, FNMS, DK, WS, E and MAKE_VOLATILE_STRIDE are macros or
 * types defined outside this file.  The comments below assume
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b -- confirm in the
 * FFTW headers.  Statement order is kept exactly as generated.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    /* 0.3826... = sin(pi/8), 0.9238... = cos(pi/8), 0.7071... = 1/sqrt(2) */
    DK(KP382683432, +0.382683432365089771728459984030398866761344562);
    DK(KP923879532, +0.923879532511286756128183189396788286822416626);
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;
        for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
            E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
            E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
            /*
             * Twiddle set-up.  (T2, T5), (Tg, Ti), (T3, T6) and (TN, TO)
             * come straight from W and are applied to inputs 1, 3, 9 and 15;
             * every other pair is a sum/difference of their products.
             */
            {
                E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
                {
                    E Th, Tn, Tj, Tm;
                    T2 = W[0];
                    T5 = W[1];
                    Tg = W[2];
                    Ti = W[3];
                    Th = T2 * Tg;
                    Tn = T5 * Tg;
                    Tj = T5 * Ti;
                    Tm = T2 * Ti;
                    Tk = Th - Tj;
                    To = Tm + Tn;
                    TE = Tm - Tn;
                    TC = Th + Tj;
                    T6 = W[5];
                    T7 = T5 * T6;
                    Tv = Tg * T6;
                    Ta = T2 * T6;
                    Ts = Ti * T6;
                    T3 = W[4];
                    T4 = T2 * T3;
                    Tw = Ti * T3;
                    Tb = T5 * T3;
                    Tr = Tg * T3;
                }
                T8 = T4 + T7;
                TW = Tv - Tw;
                TJ = Ta + Tb;
                Tt = Tr - Ts;
                TU = Tr + Ts;
                Tc = Ta - Tb;
                Tx = Tv + Tw;
                TH = T4 - T7;
                TN = W[6];
                TO = W[7];
                TP = FMA(T2, TN, T5 * TO);
                TR = FNMS(T5, TN, T2 * TO);
                {
                    E T1d, T1e, T19, T1a;
                    T1d = Tk * T6;
                    T1e = To * T3;
                    T1f = T1d - T1e;
                    T1k = T1d + T1e;
                    T19 = Tk * T3;
                    T1a = To * T6;
                    T1b = T19 + T1a;
                    T1i = T19 - T1a;
                }
                {
                    E T1w, T1x, T1s, T1t;
                    T1w = TC * T6;
                    T1x = TE * T3;
                    T1y = T1w - T1x;
                    T1H = T1w + T1x;
                    T1s = TC * T3;
                    T1t = TE * T6;
                    T1u = T1s + T1t;
                    T1F = T1s - T1t;
                }
            }
            {
                E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
                E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
                E T2S, T2T, T28, T2A, T2d, T2B;
                /* Inputs 0 (unscaled) and 8 (scaled by (T8, Tc)). */
                {
                    E T1, T3d, Te, T3c, T9, Td;
                    T1 = ri[0];
                    T3d = ii[0];
                    T9 = ri[WS(rs, 8)];
                    Td = ii[WS(rs, 8)];
                    Te = FMA(T8, T9, Tc * Td);
                    T3c = FNMS(Tc, T9, T8 * Td);
                    Tf = T1 + Te;
                    T3r = T3d - T3c;
                    T1N = T1 - Te;
                    T3e = T3c + T3d;
                }
                /* Inputs 4 and 12. */
                {
                    E Tq, T1O, Tz, T1P;
                    {
                        E Tl, Tp, Tu, Ty;
                        Tl = ri[WS(rs, 4)];
                        Tp = ii[WS(rs, 4)];
                        Tq = FMA(Tk, Tl, To * Tp);
                        T1O = FNMS(To, Tl, Tk * Tp);
                        Tu = ri[WS(rs, 12)];
                        Ty = ii[WS(rs, 12)];
                        Tz = FMA(Tt, Tu, Tx * Ty);
                        T1P = FNMS(Tx, Tu, Tt * Ty);
                    }
                    TA = Tq + Tz;
                    T3s = Tq - Tz;
                    T1Q = T1O - T1P;
                    T3b = T1O + T1P;
                }
                /* Inputs 2 and 10. */
                {
                    E TG, T1S, TL, T1T, T1U, T1V;
                    {
                        E TD, TF, TI, TK;
                        TD = ri[WS(rs, 2)];
                        TF = ii[WS(rs, 2)];
                        TG = FMA(TC, TD, TE * TF);
                        T1S = FNMS(TE, TD, TC * TF);
                        TI = ri[WS(rs, 10)];
                        TK = ii[WS(rs, 10)];
                        TL = FMA(TH, TI, TJ * TK);
                        T1T = FNMS(TJ, TI, TH * TK);
                    }
                    TM = TG + TL;
                    T2M = T1S + T1T;
                    T1U = T1S - T1T;
                    T1V = TG - TL;
                    T1W = T1U - T1V;
                    T2w = T1V + T1U;
                }
                /* Inputs 14 and 6. */
                {
                    E TT, T1Y, TY, T1Z, T1X, T20;
                    {
                        E TQ, TS, TV, TX;
                        TQ = ri[WS(rs, 14)];
                        TS = ii[WS(rs, 14)];
                        TT = FMA(TP, TQ, TR * TS);
                        T1Y = FNMS(TR, TQ, TP * TS);
                        TV = ri[WS(rs, 6)];
                        TX = ii[WS(rs, 6)];
                        TY = FMA(TU, TV, TW * TX);
                        T1Z = FNMS(TW, TV, TU * TX);
                    }
                    TZ = TT + TY;
                    T2N = T1Y + T1Z;
                    T1X = TT - TY;
                    T20 = T1Y - T1Z;
                    T21 = T1X + T20;
                    T2x = T1X - T20;
                }
                /* Inputs 15, 11, 7 and 3. */
                {
                    E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
                    {
                        E T1p, T1q, T1G, T1I;
                        T1p = ri[WS(rs, 15)];
                        T1q = ii[WS(rs, 15)];
                        T1r = FMA(TN, T1p, TO * T1q);
                        T2k = FNMS(TO, T1p, TN * T1q);
                        T1G = ri[WS(rs, 11)];
                        T1I = ii[WS(rs, 11)];
                        T1J = FMA(T1F, T1G, T1H * T1I);
                        T2h = FNMS(T1H, T1G, T1F * T1I);
                    }
                    {
                        E T1v, T1z, T1C, T1D;
                        T1v = ri[WS(rs, 7)];
                        T1z = ii[WS(rs, 7)];
                        T1A = FMA(T1u, T1v, T1y * T1z);
                        T2l = FNMS(T1y, T1v, T1u * T1z);
                        T1C = ri[WS(rs, 3)];
                        T1D = ii[WS(rs, 3)];
                        T1E = FMA(Tg, T1C, Ti * T1D);
                        T2g = FNMS(Ti, T1C, Tg * T1D);
                    }
                    T1B = T1r + T1A;
                    T1K = T1E + T1J;
                    T2V = T1B - T1K;
                    T2W = T2k + T2l;
                    T2X = T2g + T2h;
                    T2Y = T2W - T2X;
                    {
                        E T2f, T2i, T2m, T2n;
                        T2f = T1r - T1A;
                        T2i = T2g - T2h;
                        T2j = T2f - T2i;
                        T2D = T2f + T2i;
                        T2m = T2k - T2l;
                        T2n = T1E - T1J;
                        T2o = T2m + T2n;
                        T2E = T2m - T2n;
                    }
                }
                /* Inputs 1, 13, 9 and 5. */
                {
                    E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
                    {
                        E T12, T13, T1j, T1l;
                        T12 = ri[WS(rs, 1)];
                        T13 = ii[WS(rs, 1)];
                        T14 = FMA(T2, T12, T5 * T13);
                        T24 = FNMS(T5, T12, T2 * T13);
                        T1j = ri[WS(rs, 13)];
                        T1l = ii[WS(rs, 13)];
                        T1m = FMA(T1i, T1j, T1k * T1l);
                        T2b = FNMS(T1k, T1j, T1i * T1l);
                    }
                    {
                        E T15, T16, T1c, T1g;
                        T15 = ri[WS(rs, 9)];
                        T16 = ii[WS(rs, 9)];
                        T17 = FMA(T3, T15, T6 * T16);
                        T25 = FNMS(T6, T15, T3 * T16);
                        T1c = ri[WS(rs, 5)];
                        T1g = ii[WS(rs, 5)];
                        T1h = FMA(T1b, T1c, T1f * T1g);
                        T2a = FNMS(T1f, T1c, T1b * T1g);
                    }
                    T18 = T14 + T17;
                    T1n = T1h + T1m;
                    T2Q = T18 - T1n;
                    T2R = T24 + T25;
                    T2S = T2a + T2b;
                    T2T = T2R - T2S;
                    {
                        E T26, T27, T29, T2c;
                        T26 = T24 - T25;
                        T27 = T1h - T1m;
                        T28 = T26 + T27;
                        T2A = T26 - T27;
                        T29 = T14 - T17;
                        T2c = T2a - T2b;
                        T2d = T29 - T2c;
                        T2B = T29 + T2c;
                    }
                }
                /* Outputs 3, 7, 11 and 15. */
                {
                    E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
                    {
                        E T1R, T22, T3y, T3z;
                        T1R = T1N - T1Q;
                        T22 = KP707106781 * (T1W - T21);
                        T23 = T1R + T22;
                        T2r = T1R - T22;
                        T3y = KP707106781 * (T2x - T2w);
                        T3z = T3s + T3r;
                        T3A = T3y + T3z;
                        T3C = T3z - T3y;
                    }
                    {
                        E T2e, T2p, T2s, T2t;
                        T2e = FMA(KP923879532, T28, KP382683432 * T2d);
                        T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
                        T2q = T2e + T2p;
                        T3B = T2p - T2e;
                        T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
                        T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
                        T2u = T2s - T2t;
                        T3x = T2s + T2t;
                    }
                    ri[WS(rs, 11)] = T23 - T2q;
                    ii[WS(rs, 11)] = T3A - T3x;
                    ri[WS(rs, 3)] = T23 + T2q;
                    ii[WS(rs, 3)] = T3x + T3A;
                    ri[WS(rs, 15)] = T2r - T2u;
                    ii[WS(rs, 15)] = T3C - T3B;
                    ri[WS(rs, 7)] = T2r + T2u;
                    ii[WS(rs, 7)] = T3B + T3C;
                }
                /* Outputs 2, 6, 10 and 14. */
                {
                    E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
                    {
                        E T2L, T2O, T3k, T3l;
                        T2L = Tf - TA;
                        T2O = T2M - T2N;
                        T2P = T2L + T2O;
                        T31 = T2L - T2O;
                        T3k = TZ - TM;
                        T3l = T3e - T3b;
                        T3m = T3k + T3l;
                        T3o = T3l - T3k;
                    }
                    {
                        E T2U, T2Z, T32, T33;
                        T2U = T2Q + T2T;
                        T2Z = T2V - T2Y;
                        T30 = KP707106781 * (T2U + T2Z);
                        T3n = KP707106781 * (T2Z - T2U);
                        T32 = T2T - T2Q;
                        T33 = T2V + T2Y;
                        T34 = KP707106781 * (T32 - T33);
                        T3j = KP707106781 * (T32 + T33);
                    }
                    ri[WS(rs, 10)] = T2P - T30;
                    ii[WS(rs, 10)] = T3m - T3j;
                    ri[WS(rs, 2)] = T2P + T30;
                    ii[WS(rs, 2)] = T3j + T3m;
                    ri[WS(rs, 14)] = T31 - T34;
                    ii[WS(rs, 14)] = T3o - T3n;
                    ri[WS(rs, 6)] = T31 + T34;
                    ii[WS(rs, 6)] = T3n + T3o;
                }
                /* Outputs 1, 5, 9 and 13. */
                {
                    E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
                    {
                        E T2v, T2y, T3q, T3t;
                        T2v = T1N + T1Q;
                        T2y = KP707106781 * (T2w + T2x);
                        T2z = T2v + T2y;
                        T2H = T2v - T2y;
                        T3q = KP707106781 * (T1W + T21);
                        T3t = T3r - T3s;
                        T3u = T3q + T3t;
                        T3w = T3t - T3q;
                    }
                    {
                        E T2C, T2F, T2I, T2J;
                        T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
                        T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
                        T2G = T2C + T2F;
                        T3v = T2F - T2C;
                        T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
                        T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
                        T2K = T2I - T2J;
                        T3p = T2I + T2J;
                    }
                    ri[WS(rs, 9)] = T2z - T2G;
                    ii[WS(rs, 9)] = T3u - T3p;
                    ri[WS(rs, 1)] = T2z + T2G;
                    ii[WS(rs, 1)] = T3p + T3u;
                    ri[WS(rs, 13)] = T2H - T2K;
                    ii[WS(rs, 13)] = T3w - T3v;
                    ri[WS(rs, 5)] = T2H + T2K;
                    ii[WS(rs, 5)] = T3v + T3w;
                }
                /* Outputs 0, 4, 8 and 12. */
                {
                    E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
                    {
                        E TB, T10, T3a, T3f;
                        TB = Tf + TA;
                        T10 = TM + TZ;
                        T11 = TB + T10;
                        T35 = TB - T10;
                        T3a = T2M + T2N;
                        T3f = T3b + T3e;
                        T3g = T3a + T3f;
                        T3i = T3f - T3a;
                    }
                    {
                        E T1o, T1L, T36, T37;
                        T1o = T18 + T1n;
                        T1L = T1B + T1K;
                        T1M = T1o + T1L;
                        T3h = T1L - T1o;
                        T36 = T2R + T2S;
                        T37 = T2W + T2X;
                        T38 = T36 - T37;
                        T39 = T36 + T37;
                    }
                    ri[WS(rs, 8)] = T11 - T1M;
                    ii[WS(rs, 8)] = T3g - T39;
                    ri[0] = T11 + T1M;
                    ii[0] = T39 + T3g;
                    ri[WS(rs, 12)] = T35 - T38;
                    ii[WS(rs, 12)] = T3i - T3h;
                    ri[WS(rs, 4)] = T35 + T38;
                    ii[WS(rs, 4)] = T3h + T3i;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 9 },
|
||||
{ TW_CEXP, 0, 15 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 156, 68, 40, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_16) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_16, &desc);
|
||||
}
|
||||
#endif
|
||||
1097
fftw-3.3.10/dft/scalar/codelets/t2_20.c
Normal file
1097
fftw-3.3.10/dft/scalar/codelets/t2_20.c
Normal file
File diff suppressed because it is too large
Load Diff
1620
fftw-3.3.10/dft/scalar/codelets/t2_25.c
Normal file
1620
fftw-3.3.10/dft/scalar/codelets/t2_25.c
Normal file
File diff suppressed because it is too large
Load Diff
1893
fftw-3.3.10/dft/scalar/codelets/t2_32.c
Normal file
1893
fftw-3.3.10/dft/scalar/codelets/t2_32.c
Normal file
File diff suppressed because it is too large
Load Diff
200
fftw-3.3.10/dft/scalar/codelets/t2_4.c
Normal file
200
fftw-3.3.10/dft/scalar/codelets/t2_4.c
Normal file
@@ -0,0 +1,200 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_4: size-4 twiddle codelet, FMA-oriented variant (compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the loop body reads four reals W[0..3], derives
 * one extra twiddle pair (T7, Tb) from them, scales inputs 1, 2 and 3 of
 * ri/ii[WS(rs, k)] (input 0 is used unscaled), and writes the four
 * butterfly results back to the same locations.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays, read and overwritten; both are
 *           advanced by ms after each iteration.
 *   W       twiddle table; offset by mb * 4 before the first iteration and
 *           advanced by 4 per iteration.
 *   rs      stride argument forwarded to WS(rs, k) to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * NOTE(review): FMA, FNMS, WS, E and MAKE_VOLATILE_STRIDE are defined
 * outside this file.  The comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm in the FFTW headers.  Statement order
 * is kept exactly as generated.
 */
static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    {
        INT m;
        for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
            E T2, T6, T3, T5, T7, Tb, T4, Ta;
            /* (T2, T5) = W[0..1] and (T3, T6) = W[2..3]; (T7, Tb) is derived. */
            T2 = W[0];
            T6 = W[3];
            T3 = W[2];
            T4 = T2 * T3;
            Ta = T2 * T6;
            T5 = W[1];
            T7 = FMA(T5, T6, T4);
            Tb = FNMS(T5, T3, Ta);
            {
                E T1, Tx, Td, Tw, Ti, Tq, Tm, Ts;
                T1 = ri[0];
                Tx = ii[0];
                /* Input 2, scaled by the derived pair (T7, Tb). */
                {
                    E T8, T9, Tc, Tv;
                    T8 = ri[WS(rs, 2)];
                    T9 = T7 * T8;
                    Tc = ii[WS(rs, 2)];
                    Tv = T7 * Tc;
                    Td = FMA(Tb, Tc, T9);
                    Tw = FNMS(Tb, T8, Tv);
                }
                /* Input 1, scaled by (T2, T5). */
                {
                    E Tf, Tg, Th, Tp;
                    Tf = ri[WS(rs, 1)];
                    Tg = T2 * Tf;
                    Th = ii[WS(rs, 1)];
                    Tp = T2 * Th;
                    Ti = FMA(T5, Th, Tg);
                    Tq = FNMS(T5, Tf, Tp);
                }
                /* Input 3, scaled by (T3, T6). */
                {
                    E Tj, Tk, Tl, Tr;
                    Tj = ri[WS(rs, 3)];
                    Tk = T3 * Tj;
                    Tl = ii[WS(rs, 3)];
                    Tr = T3 * Tl;
                    Tm = FMA(T6, Tl, Tk);
                    Ts = FNMS(T6, Tj, Tr);
                }
                /* Outputs 0 and 2. */
                {
                    E Te, Tn, Tu, Ty;
                    Te = T1 + Td;
                    Tn = Ti + Tm;
                    ri[WS(rs, 2)] = Te - Tn;
                    ri[0] = Te + Tn;
                    Tu = Tq + Ts;
                    Ty = Tw + Tx;
                    ii[0] = Tu + Ty;
                    ii[WS(rs, 2)] = Ty - Tu;
                }
                /* Outputs 1 and 3. */
                {
                    E To, Tt, Tz, TA;
                    To = T1 - Td;
                    Tt = Tq - Ts;
                    ri[WS(rs, 3)] = To - Tt;
                    ri[WS(rs, 1)] = To + Tt;
                    Tz = Tx - Tw;
                    TA = Ti - Tm;
                    ii[WS(rs, 1)] = Tz - TA;
                    ii[WS(rs, 3)] = TA + Tz;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_4) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_4, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 24 FP additions, 16 FP multiplications,
|
||||
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
|
||||
* 21 stack variables, 0 constants, and 16 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_4: size-4 twiddle codelet, plain multiply/add variant (the #else
 * branch, used when neither FMA-preference macro is defined).
 *
 * For every m in [mb, me) the loop body reads four reals W[0..3], derives
 * one extra twiddle pair (T6, T8) from them, scales inputs 1, 2 and 3 of
 * ri/ii[WS(rs, k)] (input 0 is used unscaled), and writes the four
 * butterfly results back to the same locations.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays, read and overwritten; both are
 *           advanced by ms after each iteration.
 *   W       twiddle table; offset by mb * 4 before the first iteration and
 *           advanced by 4 per iteration.
 *   rs      stride argument forwarded to WS(rs, k) to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * NOTE(review): FMA, FNMS, WS, E and MAKE_VOLATILE_STRIDE are defined
 * outside this file.  The comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm in the FFTW headers.  Statement order
 * is kept exactly as generated.
 */
static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    {
        INT m;
        for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(8, rs)) {
            E T2, T4, T3, T5, T6, T8;
            /* (T2, T4) = W[0..1] and (T3, T5) = W[2..3]; (T6, T8) is derived. */
            T2 = W[0];
            T4 = W[1];
            T3 = W[2];
            T5 = W[3];
            T6 = FMA(T2, T3, T4 * T5);
            T8 = FNMS(T4, T3, T2 * T5);
            {
                E T1, Tp, Ta, To, Te, Tk, Th, Tl, T7, T9;
                T1 = ri[0];
                Tp = ii[0];
                /* Input 2, scaled by the derived pair (T6, T8). */
                T7 = ri[WS(rs, 2)];
                T9 = ii[WS(rs, 2)];
                Ta = FMA(T6, T7, T8 * T9);
                To = FNMS(T8, T7, T6 * T9);
                /* Inputs 1 and 3, scaled by (T2, T4) and (T3, T5). */
                {
                    E Tc, Td, Tf, Tg;
                    Tc = ri[WS(rs, 1)];
                    Td = ii[WS(rs, 1)];
                    Te = FMA(T2, Tc, T4 * Td);
                    Tk = FNMS(T4, Tc, T2 * Td);
                    Tf = ri[WS(rs, 3)];
                    Tg = ii[WS(rs, 3)];
                    Th = FMA(T3, Tf, T5 * Tg);
                    Tl = FNMS(T5, Tf, T3 * Tg);
                }
                /* Outputs 0 and 2. */
                {
                    E Tb, Ti, Tn, Tq;
                    Tb = T1 + Ta;
                    Ti = Te + Th;
                    ri[WS(rs, 2)] = Tb - Ti;
                    ri[0] = Tb + Ti;
                    Tn = Tk + Tl;
                    Tq = To + Tp;
                    ii[0] = Tn + Tq;
                    ii[WS(rs, 2)] = Tq - Tn;
                }
                /* Outputs 1 and 3. */
                {
                    E Tj, Tm, Tr, Ts;
                    Tj = T1 - Ta;
                    Tm = Tk - Tl;
                    ri[WS(rs, 3)] = Tj - Tm;
                    ri[WS(rs, 1)] = Tj + Tm;
                    Tr = Tp - To;
                    Ts = Te - Th;
                    ii[WS(rs, 1)] = Tr - Ts;
                    ii[WS(rs, 3)] = Ts + Tr;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_4) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_4, &desc);
|
||||
}
|
||||
#endif
|
||||
264
fftw-3.3.10/dft/scalar/codelets/t2_5.c
Normal file
264
fftw-3.3.10/dft/scalar/codelets/t2_5.c
Normal file
@@ -0,0 +1,264 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:37 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 44 FP additions, 40 FP multiplications,
|
||||
* (or, 14 additions, 10 multiplications, 30 fused multiply/add),
|
||||
* 38 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_5: size-5 twiddle codelet, FMA-oriented variant (compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the loop body reads four reals W[0..3], derives
 * two extra twiddle pairs from them, scales inputs 1..4 of
 * ri/ii[WS(rs, k)] (input 0 is used unscaled), and writes the five results
 * back to the same locations.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays, read and overwritten; both are
 *           advanced by ms after each iteration.
 *   W       twiddle table; offset by mb * 4 before the first iteration and
 *           advanced by 4 per iteration.
 *   rs      stride argument forwarded to WS(rs, k) to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * NOTE(review): FMA, FNMS, DK, WS, E and MAKE_VOLATILE_STRIDE are defined
 * outside this file.  The comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm in the FFTW headers.  Statement order
 * is kept exactly as generated.
 */
static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    /* 0.9510... = sin(2*pi/5), 0.5590... = sqrt(5)/4,
       0.6180... = (sqrt(5) - 1)/2, 0.25 = 1/4 */
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP618033988, +0.618033988749894848204586834365638117720309180);
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    {
        INT m;
        for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
            E T2, Ta, T8, T5, Tb, Tm, Tf, Tj, T9, Te;
            /*
             * (T2, T5) = W[0..1] and (T8, Ta) = W[2..3] are used for inputs
             * 1 and 3; (Tb, Tf) and (Tj, Tm) are derived for inputs 4 and 2.
             */
            T2 = W[0];
            Ta = W[3];
            T8 = W[2];
            T9 = T2 * T8;
            Te = T2 * Ta;
            T5 = W[1];
            Tb = FNMS(T5, Ta, T9);
            Tm = FNMS(T5, T8, Te);
            Tf = FMA(T5, T8, Te);
            Tj = FMA(T5, Ta, T9);
            {
                E T1, TO, T7, Th, Ti, Tz, TB, TL, To, Ts, Tt, TE, TG, TM;
                T1 = ri[0];
                TO = ii[0];
                /* Inputs 1 and 4. */
                {
                    E T3, T4, T6, Ty, Tc, Td, Tg, TA;
                    T3 = ri[WS(rs, 1)];
                    T4 = T2 * T3;
                    T6 = ii[WS(rs, 1)];
                    Ty = T2 * T6;
                    Tc = ri[WS(rs, 4)];
                    Td = Tb * Tc;
                    Tg = ii[WS(rs, 4)];
                    TA = Tb * Tg;
                    T7 = FMA(T5, T6, T4);
                    Th = FMA(Tf, Tg, Td);
                    Ti = T7 + Th;
                    Tz = FNMS(T5, T3, Ty);
                    TB = FNMS(Tf, Tc, TA);
                    TL = Tz + TB;
                }
                /* Inputs 2 and 3. */
                {
                    E Tk, Tl, Tn, TD, Tp, Tq, Tr, TF;
                    Tk = ri[WS(rs, 2)];
                    Tl = Tj * Tk;
                    Tn = ii[WS(rs, 2)];
                    TD = Tj * Tn;
                    Tp = ri[WS(rs, 3)];
                    Tq = T8 * Tp;
                    Tr = ii[WS(rs, 3)];
                    TF = T8 * Tr;
                    To = FMA(Tm, Tn, Tl);
                    Ts = FMA(Ta, Tr, Tq);
                    Tt = To + Ts;
                    TE = FNMS(Tm, Tk, TD);
                    TG = FNMS(Ta, Tp, TF);
                    TM = TE + TG;
                }
                /* Real parts of all five outputs. */
                {
                    E Tw, Tu, Tv, TI, TK, TC, TH, TJ, Tx;
                    Tw = Ti - Tt;
                    Tu = Ti + Tt;
                    Tv = FNMS(KP250000000, Tu, T1);
                    TC = Tz - TB;
                    TH = TE - TG;
                    TI = FMA(KP618033988, TH, TC);
                    TK = FNMS(KP618033988, TC, TH);
                    ri[0] = T1 + Tu;
                    TJ = FNMS(KP559016994, Tw, Tv);
                    ri[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
                    ri[WS(rs, 3)] = FMA(KP951056516, TK, TJ);
                    Tx = FMA(KP559016994, Tw, Tv);
                    ri[WS(rs, 4)] = FNMS(KP951056516, TI, Tx);
                    ri[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
                }
                /* Imaginary parts of all five outputs. */
                {
                    E TQ, TN, TP, TU, TW, TS, TT, TV, TR;
                    TQ = TL - TM;
                    TN = TL + TM;
                    TP = FNMS(KP250000000, TN, TO);
                    TS = T7 - Th;
                    TT = To - Ts;
                    TU = FMA(KP618033988, TT, TS);
                    TW = FNMS(KP618033988, TS, TT);
                    ii[0] = TN + TO;
                    TV = FNMS(KP559016994, TQ, TP);
                    ii[WS(rs, 2)] = FMA(KP951056516, TW, TV);
                    ii[WS(rs, 3)] = FNMS(KP951056516, TW, TV);
                    TR = FMA(KP559016994, TQ, TP);
                    ii[WS(rs, 1)] = FNMS(KP951056516, TU, TR);
                    ii[WS(rs, 4)] = FMA(KP951056516, TU, TR);
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 14, 10, 30, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_5) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_5, &desc);
|
||||
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 44 FP additions, 32 FP multiplications,
|
||||
* (or, 30 additions, 18 multiplications, 14 fused multiply/add),
|
||||
* 37 stack variables, 4 constants, and 20 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_5: size-5 twiddle codelet, plain multiply/add variant (the #else
 * branch, used when neither FMA-preference macro is defined).
 *
 * For every m in [mb, me) the loop body reads four reals W[0..3], derives
 * two extra twiddle pairs from them, scales inputs 1..4 of
 * ri/ii[WS(rs, k)] (input 0 is used unscaled), and writes the five results
 * back to the same locations.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays, read and overwritten; both are
 *           advanced by ms after each iteration.
 *   W       twiddle table; offset by mb * 4 before the first iteration and
 *           advanced by 4 per iteration.
 *   rs      stride argument forwarded to WS(rs, k) to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * NOTE(review): FMA, FNMS, DK, WS, E and MAKE_VOLATILE_STRIDE are defined
 * outside this file.  The comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm in the FFTW headers.  Statement order
 * is kept exactly as generated.
 */
static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    /* 0.25 = 1/4, 0.5590... = sqrt(5)/4,
       0.5877... = sin(pi/5), 0.9510... = sin(2*pi/5) */
    DK(KP250000000, +0.250000000000000000000000000000000000000000000);
    DK(KP559016994, +0.559016994374947424102293417182819058860154590);
    DK(KP587785252, +0.587785252292473129168705954639072768597652438);
    DK(KP951056516, +0.951056516295153572116439333379382143405698634);
    {
        INT m;
        for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
            E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
            /*
             * (T2, T4) = W[0..1] and (T7, T9) = W[2..3] are used for inputs
             * 1 and 3; (Tb, Tf) and (Tj, Tl) are derived for inputs 4 and 2.
             */
            {
                E T8, Te, Ta, Td;
                T2 = W[0];
                T4 = W[1];
                T7 = W[2];
                T9 = W[3];
                T8 = T2 * T7;
                Te = T4 * T7;
                Ta = T4 * T9;
                Td = T2 * T9;
                Tb = T8 - Ta;
                Tl = Td - Te;
                Tf = Td + Te;
                Tj = T8 + Ta;
            }
            {
                E T1, TI, Ty, TB, TN, TM, TF, TG, TH, Ti, Tr, Ts;
                T1 = ri[0];
                TI = ii[0];
                /* Load and scale inputs 1..4, then form their sums and differences. */
                {
                    E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
                    {
                        E T3, T5, To, Tp;
                        T3 = ri[WS(rs, 1)];
                        T5 = ii[WS(rs, 1)];
                        T6 = FMA(T2, T3, T4 * T5);
                        Tw = FNMS(T4, T3, T2 * T5);
                        To = ri[WS(rs, 3)];
                        Tp = ii[WS(rs, 3)];
                        Tq = FMA(T7, To, T9 * Tp);
                        TA = FNMS(T9, To, T7 * Tp);
                    }
                    {
                        E Tc, Tg, Tk, Tm;
                        Tc = ri[WS(rs, 4)];
                        Tg = ii[WS(rs, 4)];
                        Th = FMA(Tb, Tc, Tf * Tg);
                        Tx = FNMS(Tf, Tc, Tb * Tg);
                        Tk = ri[WS(rs, 2)];
                        Tm = ii[WS(rs, 2)];
                        Tn = FMA(Tj, Tk, Tl * Tm);
                        Tz = FNMS(Tl, Tk, Tj * Tm);
                    }
                    Ty = Tw - Tx;
                    TB = Tz - TA;
                    TN = Tn - Tq;
                    TM = T6 - Th;
                    TF = Tw + Tx;
                    TG = Tz + TA;
                    TH = TF + TG;
                    Ti = T6 + Th;
                    Tr = Tn + Tq;
                    Ts = Ti + Tr;
                }
                /* Output 0. */
                ri[0] = T1 + Ts;
                ii[0] = TH + TI;
                /* Real parts of outputs 1..4. */
                {
                    E TC, TE, Tv, TD, Tt, Tu;
                    TC = FMA(KP951056516, Ty, KP587785252 * TB);
                    TE = FNMS(KP587785252, Ty, KP951056516 * TB);
                    Tt = KP559016994 * (Ti - Tr);
                    Tu = FNMS(KP250000000, Ts, T1);
                    Tv = Tt + Tu;
                    TD = Tu - Tt;
                    ri[WS(rs, 4)] = Tv - TC;
                    ri[WS(rs, 3)] = TD + TE;
                    ri[WS(rs, 1)] = Tv + TC;
                    ri[WS(rs, 2)] = TD - TE;
                }
                /* Imaginary parts of outputs 1..4. */
                {
                    E TO, TP, TL, TQ, TJ, TK;
                    TO = FMA(KP951056516, TM, KP587785252 * TN);
                    TP = FNMS(KP587785252, TM, KP951056516 * TN);
                    TJ = KP559016994 * (TF - TG);
                    TK = FNMS(KP250000000, TH, TI);
                    TL = TJ + TK;
                    TQ = TK - TJ;
                    ii[WS(rs, 1)] = TL - TO;
                    ii[WS(rs, 3)] = TQ - TP;
                    ii[WS(rs, 4)] = TO + TL;
                    ii[WS(rs, 2)] = TP + TQ;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 30, 18, 14, 0 }, 0, 0, 0 };
|
||||
|
||||
void X(codelet_t2_5) (planner *p) {
|
||||
X(kdft_dit_register) (p, t2_5, &desc);
|
||||
}
|
||||
#endif
|
||||
4243
fftw-3.3.10/dft/scalar/codelets/t2_64.c
Normal file
4243
fftw-3.3.10/dft/scalar/codelets/t2_64.c
Normal file
File diff suppressed because it is too large
Load Diff
390
fftw-3.3.10/dft/scalar/codelets/t2_8.c
Normal file
390
fftw-3.3.10/dft/scalar/codelets/t2_8.c
Normal file
@@ -0,0 +1,390 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-14 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*
|
||||
*/
|
||||
|
||||
/* This file was automatically generated --- DO NOT EDIT */
|
||||
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
|
||||
|
||||
#include "dft/codelet-dft.h"
|
||||
|
||||
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 50 FP multiplications,
|
||||
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
|
||||
* 48 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_8: size-8 twiddle codelet, FMA-oriented variant (compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * For every m in [mb, me) the loop body reads six reals W[0..5], derives
 * four extra twiddle pairs from them, scales inputs 1..7 of
 * ri/ii[WS(rs, k)] (input 0 is used unscaled), and writes the eight
 * results back to the same locations.
 *
 * Parameters:
 *   ri, ii  real and imaginary arrays, read and overwritten; both are
 *           advanced by ms after each iteration.
 *   W       twiddle table; offset by mb * 6 before the first iteration and
 *           advanced by 6 per iteration.
 *   rs      stride argument forwarded to WS(rs, k) to address element k.
 *   mb, me  half-open iteration range.
 *   ms      per-iteration increment applied to ri and ii.
 *
 * NOTE(review): FMA, FNMS, DK, WS, E and MAKE_VOLATILE_STRIDE are defined
 * outside this file.  The comments assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b -- confirm in the FFTW headers.  Statement order
 * is kept exactly as generated.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    /* 0.7071... = 1/sqrt(2) */
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    {
        INT m;
        for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
            E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
            /*
             * Twiddle set-up.  (T2, T5) = W[0..1], (T3, T6) = W[2..3] and
             * (Tl, Tn) = W[4..5] are used for inputs 1, 3 and 7; the pairs
             * (Tf, Ti), (T7, Tb), (TC, TG) and (To, Ts) are derived for
             * inputs 2, 4, 5 and 6.
             */
            {
                E T4, Tm, Tr, Ta, TB, TF;
                T2 = W[0];
                T3 = W[2];
                T4 = T2 * T3;
                Tl = W[4];
                Tm = T2 * Tl;
                Tn = W[5];
                Tr = T2 * Tn;
                T5 = W[1];
                T6 = W[3];
                Ta = T2 * T6;
                Tf = FMA(T5, T6, T4);
                T7 = FNMS(T5, T6, T4);
                Ts = FNMS(T5, Tl, Tr);
                Tb = FMA(T5, T3, Ta);
                To = FMA(T5, Tn, Tm);
                TB = Tf * Tl;
                TF = Tf * Tn;
                Ti = FNMS(T5, T3, Ta);
                TC = FMA(Ti, Tn, TB);
                TG = FNMS(Ti, Tl, TF);
            }
            {
                E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
                E TI, T11, T13, T15, T16;
                T1 = ri[0];
                T1s = ii[0];
                /* Input 4. */
                {
                    E T8, T9, Tc, T1q;
                    T8 = ri[WS(rs, 4)];
                    T9 = T7 * T8;
                    Tc = ii[WS(rs, 4)];
                    T1q = T7 * Tc;
                    Td = FMA(Tb, Tc, T9);
                    T1r = FNMS(Tb, T8, T1q);
                }
                /* Input 6. */
                {
                    E Tp, Tq, Tt, TX;
                    Tp = ri[WS(rs, 6)];
                    Tq = To * Tp;
                    Tt = ii[WS(rs, 6)];
                    TX = To * Tt;
                    Tu = FMA(Ts, Tt, Tq);
                    TY = FNMS(Ts, Tp, TX);
                }
                /* Input 2. */
                {
                    E Tg, Th, Tj, TV;
                    Tg = ri[WS(rs, 2)];
                    Th = Tf * Tg;
                    Tj = ii[WS(rs, 2)];
                    TV = Tf * Tj;
                    Tk = FMA(Ti, Tj, Th);
                    TW = FNMS(Ti, Tg, TV);
                }
                /* Inputs 7 and 3. */
                {
                    E TK, TL, TM, T19, TO, TP, TQ, T1b;
                    TK = ri[WS(rs, 7)];
                    TL = Tl * TK;
                    TM = ii[WS(rs, 7)];
                    T19 = Tl * TM;
                    TO = ri[WS(rs, 3)];
                    TP = T3 * TO;
                    TQ = ii[WS(rs, 3)];
                    T1b = T3 * TQ;
                    TN = FMA(Tn, TM, TL);
                    TR = FMA(T6, TQ, TP);
                    T18 = TN - TR;
                    T1a = FNMS(Tn, TK, T19);
                    T1c = FNMS(T6, TO, T1b);
                    T1d = T1a - T1c;
                }
                /* Inputs 1 and 5. */
                {
                    E Tx, Ty, Tz, T12, TD, TE, TH, T14;
                    Tx = ri[WS(rs, 1)];
                    Ty = T2 * Tx;
                    Tz = ii[WS(rs, 1)];
                    T12 = T2 * Tz;
                    TD = ri[WS(rs, 5)];
                    TE = TC * TD;
                    TH = ii[WS(rs, 5)];
                    T14 = TC * TH;
                    TA = FMA(T5, Tz, Ty);
                    TI = FMA(TG, TH, TE);
                    T11 = TA - TI;
                    T13 = FNMS(T5, Tx, T12);
                    T15 = FNMS(TG, TD, T14);
                    T16 = T13 - T15;
                }
                /* Outputs 1, 3, 5 and 7. */
                {
                    E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
                    {
                        E TU, TZ, T1x, T1y;
                        TU = T1 - Td;
                        TZ = TW - TY;
                        T10 = TU + TZ;
                        T1g = TU - TZ;
                        T1x = T1s - T1r;
                        T1y = Tk - Tu;
                        T1z = T1x - T1y;
                        T1B = T1y + T1x;
                    }
                    {
                        E T17, T1e, T1h, T1i;
                        T17 = T11 + T16;
                        T1e = T18 - T1d;
                        T1f = T17 + T1e;
                        T1C = T1e - T17;
                        T1h = T16 - T11;
                        T1i = T18 + T1d;
                        T1j = T1h - T1i;
                        T1A = T1h + T1i;
                    }
                    ri[WS(rs, 5)] = FNMS(KP707106781, T1f, T10);
                    ii[WS(rs, 5)] = FNMS(KP707106781, T1A, T1z);
                    ri[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
                    ii[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
                    ri[WS(rs, 7)] = FNMS(KP707106781, T1j, T1g);
                    ii[WS(rs, 7)] = FNMS(KP707106781, T1C, T1B);
                    ri[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
                    ii[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
                }
                /* Outputs 0, 2, 4 and 6. */
                {
                    E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
                    {
                        E Te, Tv, T1p, T1t;
                        Te = T1 + Td;
                        Tv = Tk + Tu;
                        Tw = Te + Tv;
                        T1k = Te - Tv;
                        T1p = TW + TY;
                        T1t = T1r + T1s;
                        T1u = T1p + T1t;
                        T1w = T1t - T1p;
                    }
                    {
                        E TJ, TS, T1l, T1m;
                        TJ = TA + TI;
                        TS = TN + TR;
                        TT = TJ + TS;
                        T1v = TS - TJ;
                        T1l = T13 + T15;
                        T1m = T1a + T1c;
                        T1n = T1l - T1m;
                        T1o = T1l + T1m;
                    }
                    ri[WS(rs, 4)] = Tw - TT;
                    ii[WS(rs, 4)] = T1u - T1o;
                    ri[0] = Tw + TT;
                    ii[0] = T1o + T1u;
                    ri[WS(rs, 6)] = T1k - T1n;
                    ii[WS(rs, 6)] = T1w - T1v;
                    ri[WS(rs, 2)] = T1k + T1n;
                    ii[WS(rs, 2)] = T1v + T1w;
                }
            }
        }
    }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner together with the t2_8 kernel of this
 * preprocessor branch.
 * NOTE(review): field meanings come from ct_desc, declared elsewhere.  The
 * leading 8 presumably is the radix and the four-number group presumably
 * holds operation counts (in the #else branch the corresponding group
 * matches the generator's add / mul / fused-multiply-add tally) -- confirm.
 */
static const ct_desc desc = {
     8,
     "t2_8",
     twinstr,
     &GENUS,
     { 44, 20, 30, 0 },
     0,
     0,
     0
};
|
||||
|
||||
/*
 * Registration entry point for this codelet: passes the planner `p`, the
 * t2_8 kernel and its descriptor `desc` to X(kdft_dit_register).
 * NOTE(review): X() is a macro defined elsewhere (presumably a symbol-name
 * wrapper) -- confirm.
 */
void X(codelet_t2_8)(planner *p)
{
     X(kdft_dit_register)(p, t2_8, &desc);
}
|
||||
#else
|
||||
|
||||
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
|
||||
|
||||
/*
|
||||
* This function contains 74 FP additions, 44 FP multiplications,
|
||||
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
|
||||
* 42 stack variables, 1 constants, and 32 memory accesses
|
||||
*/
|
||||
#include "dft/scalar/t.h"
|
||||
|
||||
/*
 * t2_8 -- size-8 twiddle codelet (variant used in the #else branch).
 *
 * For every m in [mb, me) the routine
 *   1. reads six reals W[0..5] and treats them as three (re, im) factors
 *      that are applied directly to points 1, 3 and 7; the factors for
 *      points 2, 4, 5 and 6 are derived from those three on the fly,
 *   2. loads the eight points (ri[WS(rs, k)], ii[WS(rs, k)]), k = 0..7,
 *      and multiplies points 1..7 by their factor,
 *   3. combines the eight values with sums/differences and one scaling by
 *      KP707106781, and stores the results back into the same sixteen
 *      locations (the transform is in place).
 *
 * Parameters:
 *   ri, ii  arrays holding the two components of each point (the
 *           arithmetic treats ri as the real and ii as the imaginary
 *           part); used as passed for m = mb, advanced by ms per iteration
 *   W       factor table; advanced by mb * 6 before the first iteration
 *           and by 6 after every iteration
 *   rs      stride between the eight points, applied through WS(rs, k)
 *   mb, me  half-open iteration range
 *   ms      per-iteration increment of ri and ii
 *
 * NOTE(review): the formulas in the comments below assume the usual
 * definitions FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b; both
 * macros come from the included header -- confirm.
 *
 * All loads are performed before the first store, and the stores are
 * issued in a fixed order; keep both properties when editing.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     {
          INT m;

          W = W + (mb * 6);
          for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 6, MAKE_VOLATILE_STRIDE(16, rs)) {
               /* Stored factors; wN is the factor applied to point N. */
               const E w1r = W[0], w1i = W[1];
               const E w3r = W[2], w3i = W[3];
               const E w7r = W[4], w7i = W[5];

               /* Partial products of w1 and w3 shared by w2 and w4. */
               const E p_rr = w1r * w3r;
               const E p_ir = w1i * w3r;
               const E p_ii = w1i * w3i;
               const E p_ri = w1r * w3i;

               /*
                * Derived factors, as complex products:
                *   w4 = w1 * w3          w2 = conj(w1) * w3
                *   w6 = conj(w1) * w7    w5 = conj(w2) * w7
                */
               const E w4r = p_rr - p_ii;
               const E w4i = p_ri + p_ir;
               const E w2r = p_rr + p_ii;
               const E w2i = p_ri - p_ir;
               const E w6r = FMA(w1r, w7r, w1i * w7i);
               const E w5i = FNMS(w2i, w7r, w2r * w7i);
               const E w6i = FNMS(w1i, w7r, w1r * w7i);
               const E w5r = FMA(w2r, w7r, w2i * w7i);

               /* Raw loads of all eight points (point 0 needs no factor). */
               const E x0r = ri[0], x0i = ii[0];
               const E in4r = ri[WS(rs, 4)], in4i = ii[WS(rs, 4)];
               const E in7r = ri[WS(rs, 7)], in7i = ii[WS(rs, 7)];
               const E in3r = ri[WS(rs, 3)], in3i = ii[WS(rs, 3)];
               const E in2r = ri[WS(rs, 2)], in2i = ii[WS(rs, 2)];
               const E in6r = ri[WS(rs, 6)], in6i = ii[WS(rs, 6)];
               const E in1r = ri[WS(rs, 1)], in1i = ii[WS(rs, 1)];
               const E in5r = ri[WS(rs, 5)], in5i = ii[WS(rs, 5)];

               /*
                * Factor application: xN = (wr*inr + wi*ini, wr*ini - wi*inr).
                */
               const E x4r = FMA(w4r, in4r, w4i * in4i);
               const E x4i = FNMS(w4i, in4r, w4r * in4i);
               const E x7r = FMA(w7r, in7r, w7i * in7i);
               const E x7i = FNMS(w7i, in7r, w7r * in7i);
               const E x3r = FMA(w3r, in3r, w3i * in3i);
               const E x3i = FNMS(w3i, in3r, w3r * in3i);
               const E x2r = FMA(w2r, in2r, w2i * in2i);
               const E x2i = FNMS(w2i, in2r, w2r * in2i);
               const E x6r = FMA(w6r, in6r, w6i * in6i);
               const E x6i = FNMS(w6i, in6r, w6r * in6i);
               const E x1r = FMA(w1r, in1r, w1i * in1i);
               const E x1i = FNMS(w1i, in1r, w1r * in1i);
               const E x5r = FMA(w5r, in5r, w5i * in5i);
               const E x5i = FNMS(w5i, in5r, w5r * in5i);

               /* First stage: sums and differences of the pairs
                  (0,4), (7,3), (2,6) and (1,5). */
               const E sum04r = x0r + x4r;
               const E dif04i = x0i - x4i;
               const E dif04r = x0r - x4r;
               const E sum04i = x4i + x0i;

               const E sum73r = x7r + x3r;
               const E sum73i = x7i + x3i;
               const E dif73r = x7r - x3r;
               const E dif73i = x7i - x3i;

               const E sum26r = x2r + x6r;
               const E dif26r = x2r - x6r;
               const E dif26i = x2i - x6i;
               const E sum26i = x2i + x6i;

               const E sum15r = x1r + x5r;
               const E sum15i = x1i + x5i;
               const E dif15r = x1r - x5r;
               const E dif15i = x1i - x5i;

               /* Terms feeding outputs 0, 4, 2 and 6: "even" combines
                  points 0,4,2,6 and "odd" combines points 1,5,7,3. */
               const E even_r = sum04r + sum26r;
               const E odd_r = sum15r + sum73r;
               const E odd_i = sum15i + sum73i;
               const E even_i = sum26i + sum04i;
               const E even_dr = sum04r - sum26r;
               const E odd_di = sum15i - sum73i;
               const E odd_dr = sum73r - sum15r;
               const E even_di = sum04i - sum26i;

               /* Terms feeding ri[7], ri[3], ii[5] and ii[1]. */
               const E a_r = dif04r - dif26i;
               const E a_i = dif04i - dif26r;
               const E a_u = dif15i - dif15r;
               const E a_v = dif73r + dif73i;
               const E a_rot_r = KP707106781 * (a_u - a_v);
               const E a_rot_i = KP707106781 * (a_u + a_v);

               /* Terms feeding ri[5], ri[1], ii[7] and ii[3]. */
               const E b_r = dif04r + dif26i;
               const E b_i = dif26r + dif04i;
               const E b_u = dif15r + dif15i;
               const E b_v = dif73r - dif73i;
               const E b_rot_r = KP707106781 * (b_u + b_v);
               const E b_rot_i = KP707106781 * (b_v - b_u);

               /* Stores, in the generator's original order. */
               ri[WS(rs, 4)] = even_r - odd_r;
               ri[0] = even_r + odd_r;
               ii[0] = odd_i + even_i;
               ii[WS(rs, 4)] = even_i - odd_i;
               ri[WS(rs, 6)] = even_dr - odd_di;
               ri[WS(rs, 2)] = even_dr + odd_di;
               ii[WS(rs, 2)] = odd_dr + even_di;
               ii[WS(rs, 6)] = even_di - odd_dr;
               ri[WS(rs, 7)] = a_r - a_rot_r;
               ii[WS(rs, 5)] = a_i - a_rot_i;
               ri[WS(rs, 3)] = a_r + a_rot_r;
               ii[WS(rs, 1)] = a_rot_i + a_i;
               ri[WS(rs, 5)] = b_r - b_rot_r;
               ii[WS(rs, 7)] = b_i - b_rot_i;
               ri[WS(rs, 1)] = b_r + b_rot_r;
               ii[WS(rs, 3)] = b_rot_i + b_i;
          }
     }
}
|
||||
|
||||
static const tw_instr twinstr[] = {
|
||||
{ TW_CEXP, 0, 1 },
|
||||
{ TW_CEXP, 0, 3 },
|
||||
{ TW_CEXP, 0, 7 },
|
||||
{ TW_NEXT, 1, 0 }
|
||||
};
|
||||
|
||||
/*
 * Descriptor handed to the planner together with the t2_8 kernel of this
 * branch.  The four-number group repeats the tally printed in this
 * branch's generator header: 56 additions, 26 multiplications and 18 fused
 * multiply/adds (last entry 0).
 * NOTE(review): the remaining field meanings come from ct_desc, declared
 * elsewhere; the leading 8 presumably is the radix -- confirm.
 */
static const ct_desc desc = {
     8,
     "t2_8",
     twinstr,
     &GENUS,
     { 56, 26, 18, 0 },
     0,
     0,
     0
};
|
||||
|
||||
/*
 * Registration entry point for this codelet: passes the planner `p`, the
 * t2_8 kernel and its descriptor `desc` to X(kdft_dit_register).
 * NOTE(review): X() is a macro defined elsewhere (presumably a symbol-name
 * wrapper) -- confirm.
 */
void X(codelet_t2_8)(planner *p)
{
     X(kdft_dit_register)(p, t2_8, &desc);
}
|
||||
#endif
|
||||
Reference in New Issue
Block a user