This commit is contained in:
2025-07-12 12:17:44 +03:00
parent c759f60ff7
commit 792e1b937a
3507 changed files with 492613 additions and 0 deletions

View File

@@ -0,0 +1,96 @@
# This Makefile.am specifies a set of codelets, efficient transforms
# of small sizes, that are used as building blocks (kernels) by FFTW
# to build up large transforms, as well as the options for generating
# and compiling them.
# You can customize FFTW for special needs, e.g. to handle certain
# sizes more efficiently, by adding new codelets to the lists of those
# included by default. If you change the list of codelets, any new
# ones you added will be automatically generated when you run the
# bootstrap script (see "Generating your own code" in the FFTW
# manual).
###########################################################################
# Add the top of the source tree to the preprocessor include path.
AM_CPPFLAGS = -I $(top_srcdir)
# All codelets are collected into one libtool library; the noinst_ prefix
# tells automake not to install it.
noinst_LTLIBRARIES = libdft_scalar_codelets.la
###########################################################################
# n1_<n>: hard-coded FFT of size <n>.  These are the base cases of the FFT
# recursion.  The order of the list is kept as-is on purpose.
# Not built by default: n1_30.c n1_40.c n1_50.c
N1 = \
  n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c n1_9.c \
  n1_10.c n1_11.c n1_12.c n1_13.c n1_14.c n1_15.c n1_16.c \
  n1_32.c n1_64.c \
  n1_20.c n1_25.c
###########################################################################
# t1_<r>: "twiddle" FFT of size <r>; it implements one radix-r DIT step.
# Not built by default: t1_30.c t1_40.c t1_50.c
T1 = \
  t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c t1_7.c t1_8.c t1_9.c \
  t1_10.c t1_12.c t1_15.c t1_16.c \
  t1_32.c t1_64.c \
  t1_20.c t1_25.c
# t2_<r>: another twiddle FFT.  Rather than reading a complete lookup table
# of trig. functions, it partially generates the trig. values on the fly,
# which is faster for large sizes.
T2 = \
  t2_4.c t2_8.c t2_16.c t2_32.c t2_64.c \
  t2_5.c t2_10.c t2_20.c t2_25.c
###########################################################################
# The F (DIF) codelets serve a kind of in-place transform algorithm.  On the
# machines we have access to, the planner seems to never (or hardly ever)
# pick them: it prefers the Q codelets and the use of buffers for
# sub-transforms.  Both lists are therefore left empty, at least for now.
#
# f1_<r>: "twiddle" FFT of size <r>, implementing a radix-r DIF step.
# Disabled: f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c
#           f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
F1 =
# f2_<r>: like f1, but partially generates its trig. table on the fly.
# Disabled: f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
F2 =
###########################################################################
# q1_<r>: <r> twiddle FFTs of size <r> (DIF step) whose output is transposed.
# Used for in-place transposes in sizes that are divisible by <r>^2.  The
# size of these codelets grows as ~ <r>^2, so <r> bigger than 8 or so is
# probably not a good idea.
Q1 = \
  q1_2.c q1_4.c q1_8.c \
  q1_3.c q1_5.c q1_6.c
###########################################################################
# Every codelet source, concatenated in list order.
ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
# Generated files that must exist before the library is compiled.  CODLIST is
# not set in this file; presumably it comes from the included
# support/Makefile.codelets -- confirm there.
BUILT_SOURCES= $(ALL_CODELETS) $(CODLIST)
# The library is built from exactly the generated sources.
libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
# SOLVTAB_NAME and XRENAME are not referenced in this file; presumably they
# parameterize the rules in the included Makefile.codelets -- confirm there.
SOLVTAB_NAME = X(solvtab_dft_standard)
XRENAME=X
# special rules for regenerating codelets.
include $(top_srcdir)/support/Makefile.codelets
if MAINTAINER_MODE
# Generator options for each codelet family.  Every family starts from
# $(DFT_FLAGS_COMMON); T2, F2 and Q2 add -twiddle-log3 -precompute-twiddles,
# and Q1 adds -reload-twiddle.
FLAGS_N1=$(DFT_FLAGS_COMMON)
FLAGS_T1=$(DFT_FLAGS_COMMON)
FLAGS_T2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_F1=$(DFT_FLAGS_COMMON)
FLAGS_F2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
FLAGS_Q1=$(DFT_FLAGS_COMMON) -reload-twiddle
FLAGS_Q2=$(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles

# Pattern rules that regenerate <family>_<n>.c; the stem ($*) is the
# transform size.  Each recipe prints the DFT prelude, runs the generator
# through $(TWOVERS), then pipes everything through $(ADD_DATE) and $(INDENT)
# into the target.  n1 uses $(GEN_NOTW); t1, t2, f1 and f2 use
# $(GEN_TWIDDLE); q1 and q2 use $(GEN_TWIDSQ).  The f and q families
# additionally pass -dif.  Recipe lines must start with a TAB character.
n1_%.c: $(CODELET_DEPS) $(GEN_NOTW)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "dft/scalar/n.h") | $(ADD_DATE) | $(INDENT) >$@

t1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@

t2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@

f1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@

f2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@

q1_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@

q2_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
	($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
endif # MAINTAINER_MODE

View File

@@ -0,0 +1,994 @@
# Makefile.in generated by automake 1.16.3 from Makefile.am.
# @configure_input@
# Copyright (C) 1994-2020 Free Software Foundation, Inc.
# This Makefile.in is free software; the Free Software Foundation
# gives unlimited permission to copy and/or distribute it,
# with or without modifications, as long as this notice is preserved.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY, to the extent permitted by law; without
# even the implied warranty of MERCHANTABILITY or FITNESS FOR A
# PARTICULAR PURPOSE.
@SET_MAKE@
# This Makefile.am specifies a set of codelets, efficient transforms
# of small sizes, that are used as building blocks (kernels) by FFTW
# to build up large transforms, as well as the options for generating
# and compiling them.
# You can customize FFTW for special needs, e.g. to handle certain
# sizes more efficiently, by adding new codelets to the lists of those
# included by default. If you change the list of codelets, any new
# ones you added will be automatically generated when you run the
# bootstrap script (see "Generating your own code" in the FFTW
# manual).
# -*- makefile -*-
# This file contains special make rules to generate codelets.
# Most of this file requires GNU make.
VPATH = @srcdir@
# Shell fragment used as a GNU make test.  It fails when $(MAKELEVEL) is
# empty; otherwise it succeeds if $(MAKE_HOST) is non-empty, or if both
# $(MAKE_VERSION) and $(CURDIR) are non-empty, and fails in every other case.
am__is_gnu_make = { \
if test -z '$(MAKELEVEL)'; then \
false; \
elif test -n '$(MAKE_HOST)'; then \
true; \
elif test -n '$(MAKE_VERSION)' && test -n '$(CURDIR)'; then \
true; \
else \
false; \
fi; \
}
# Shell fragment that succeeds when make was invoked with the single-letter
# option held in the shell variable target_option.  Anything other than one
# character there is reported as an internal error (exit 1).  The flags come
# from $$MFLAGS when am__is_gnu_make succeeds, otherwise from $$MAKEFLAGS
# with backslash-escaped whitespace removed.  Words containing '=' or
# starting with '--' are ignored, -I/-O/-l and whatever follows them in the
# same word are stripped, and the word after a bare -I, -O, -l, -d, -E, -D,
# -m, -J or -T is skipped as that option's argument.
am__make_running_with_option = \
case $${target_option-} in \
?) ;; \
*) echo "am__make_running_with_option: internal error: invalid" \
"target option '$${target_option-}' specified" >&2; \
exit 1;; \
esac; \
has_opt=no; \
sane_makeflags=$$MAKEFLAGS; \
if $(am__is_gnu_make); then \
sane_makeflags=$$MFLAGS; \
else \
case $$MAKEFLAGS in \
*\\[\ \ ]*) \
bs=\\; \
sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
| sed "s/$$bs$$bs[$$bs $$bs ]*//g"`;; \
esac; \
fi; \
skip_next=no; \
strip_trailopt () \
{ \
flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
}; \
for flg in $$sane_makeflags; do \
test $$skip_next = yes && { skip_next=no; continue; }; \
case $$flg in \
*=*|--*) continue;; \
-*I) strip_trailopt 'I'; skip_next=yes;; \
-*I?*) strip_trailopt 'I';; \
-*O) strip_trailopt 'O'; skip_next=yes;; \
-*O?*) strip_trailopt 'O';; \
-*l) strip_trailopt 'l'; skip_next=yes;; \
-*l?*) strip_trailopt 'l';; \
-[dEDm]) skip_next=yes;; \
-[JT]) skip_next=yes;; \
esac; \
case $$flg in \
*$$target_option*) has_opt=yes; break;; \
esac; \
done; \
test $$has_opt = yes
# The same check specialized to the 'n' and 'k' options respectively.
am__make_dryrun = (target_option=n; $(am__make_running_with_option))
am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
pkgdatadir = $(datadir)/@PACKAGE@
pkgincludedir = $(includedir)/@PACKAGE@
pkglibdir = $(libdir)/@PACKAGE@
pkglibexecdir = $(libexecdir)/@PACKAGE@
am__cd = CDPATH="$${ZSH_VERSION+.}$(PATH_SEPARATOR)" && cd
install_sh_DATA = $(install_sh) -c -m 644
install_sh_PROGRAM = $(install_sh) -c
install_sh_SCRIPT = $(install_sh) -c
INSTALL_HEADER = $(INSTALL_DATA)
transform = $(program_transform_name)
NORMAL_INSTALL = :
PRE_INSTALL = :
POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
build_triplet = @build@
host_triplet = @host@
subdir = dft/scalar/codelets
ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
am__aclocal_m4_deps = $(top_srcdir)/m4/acx_mpi.m4 \
$(top_srcdir)/m4/acx_pthread.m4 \
$(top_srcdir)/m4/ax_cc_maxopt.m4 \
$(top_srcdir)/m4/ax_check_compiler_flags.m4 \
$(top_srcdir)/m4/ax_compiler_vendor.m4 \
$(top_srcdir)/m4/ax_gcc_aligns_stack.m4 \
$(top_srcdir)/m4/ax_gcc_version.m4 \
$(top_srcdir)/m4/ax_openmp.m4 $(top_srcdir)/m4/libtool.m4 \
$(top_srcdir)/m4/ltoptions.m4 $(top_srcdir)/m4/ltsugar.m4 \
$(top_srcdir)/m4/ltversion.m4 $(top_srcdir)/m4/lt~obsolete.m4 \
$(top_srcdir)/configure.ac
am__configure_deps = $(am__aclocal_m4_deps) $(CONFIGURE_DEPENDENCIES) \
$(ACLOCAL_M4)
DIST_COMMON = $(srcdir)/Makefile.am $(am__DIST_COMMON)
mkinstalldirs = $(install_sh) -d
CONFIG_HEADER = $(top_builddir)/config.h
CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
LTLIBRARIES = $(noinst_LTLIBRARIES)
libdft_scalar_codelets_la_LIBADD =
# Libtool objects (.lo) for the codelet lists: am__objects_1, _2, _3 and _5
# hold the objects for the n1_*, t1_*, t2_* and q1_* sources.  am__objects_4
# is empty and is listed twice in am__objects_6, apparently standing in for
# the two empty lists F1 and F2.  am__objects_7 adds the codelet list object.
am__objects_1 = n1_2.lo n1_3.lo n1_4.lo n1_5.lo n1_6.lo n1_7.lo \
n1_8.lo n1_9.lo n1_10.lo n1_11.lo n1_12.lo n1_13.lo n1_14.lo \
n1_15.lo n1_16.lo n1_32.lo n1_64.lo n1_20.lo n1_25.lo
am__objects_2 = t1_2.lo t1_3.lo t1_4.lo t1_5.lo t1_6.lo t1_7.lo \
t1_8.lo t1_9.lo t1_10.lo t1_12.lo t1_15.lo t1_16.lo t1_32.lo \
t1_64.lo t1_20.lo t1_25.lo
am__objects_3 = t2_4.lo t2_8.lo t2_16.lo t2_32.lo t2_64.lo t2_5.lo \
t2_10.lo t2_20.lo t2_25.lo
am__objects_4 =
am__objects_5 = q1_2.lo q1_4.lo q1_8.lo q1_3.lo q1_5.lo q1_6.lo
am__objects_6 = $(am__objects_1) $(am__objects_2) $(am__objects_3) \
$(am__objects_4) $(am__objects_4) $(am__objects_5)
am__objects_7 = codlist.lo
am__objects_8 = $(am__objects_6) $(am__objects_7)
# Full object list of the library, as used by its link rule.
am_libdft_scalar_codelets_la_OBJECTS = $(am__objects_8)
libdft_scalar_codelets_la_OBJECTS = \
$(am_libdft_scalar_codelets_la_OBJECTS)
# Verbosity switches.  Each AM_V_x expands to am__v_x_<level>, where <level>
# is substituted from @AM_V@; an empty level falls through to the variant
# selected by @AM_DEFAULT_V@.  The _0 variants are the quiet forms (--silent
# for libtool, a short "GEN" echo, a leading '@' to hide the command, 'false'
# for AM_V_P); the _1 variants are empty (':' for AM_V_P).
AM_V_lt = $(am__v_lt_@AM_V@)
am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
am__v_lt_0 = --silent
am__v_lt_1 =
AM_V_P = $(am__v_P_@AM_V@)
am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
am__v_P_0 = false
am__v_P_1 = :
AM_V_GEN = $(am__v_GEN_@AM_V@)
am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
am__v_GEN_0 = @echo " GEN " $@;
am__v_GEN_1 =
AM_V_at = $(am__v_at_@AM_V@)
am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
am__v_at_0 = @
am__v_at_1 =
DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
depcomp = $(SHELL) $(top_srcdir)/depcomp
am__maybe_remake_depfiles = depfiles
am__depfiles_remade = ./$(DEPDIR)/codlist.Plo ./$(DEPDIR)/n1_10.Plo \
./$(DEPDIR)/n1_11.Plo ./$(DEPDIR)/n1_12.Plo \
./$(DEPDIR)/n1_13.Plo ./$(DEPDIR)/n1_14.Plo \
./$(DEPDIR)/n1_15.Plo ./$(DEPDIR)/n1_16.Plo \
./$(DEPDIR)/n1_2.Plo ./$(DEPDIR)/n1_20.Plo \
./$(DEPDIR)/n1_25.Plo ./$(DEPDIR)/n1_3.Plo \
./$(DEPDIR)/n1_32.Plo ./$(DEPDIR)/n1_4.Plo \
./$(DEPDIR)/n1_5.Plo ./$(DEPDIR)/n1_6.Plo \
./$(DEPDIR)/n1_64.Plo ./$(DEPDIR)/n1_7.Plo \
./$(DEPDIR)/n1_8.Plo ./$(DEPDIR)/n1_9.Plo ./$(DEPDIR)/q1_2.Plo \
./$(DEPDIR)/q1_3.Plo ./$(DEPDIR)/q1_4.Plo ./$(DEPDIR)/q1_5.Plo \
./$(DEPDIR)/q1_6.Plo ./$(DEPDIR)/q1_8.Plo \
./$(DEPDIR)/t1_10.Plo ./$(DEPDIR)/t1_12.Plo \
./$(DEPDIR)/t1_15.Plo ./$(DEPDIR)/t1_16.Plo \
./$(DEPDIR)/t1_2.Plo ./$(DEPDIR)/t1_20.Plo \
./$(DEPDIR)/t1_25.Plo ./$(DEPDIR)/t1_3.Plo \
./$(DEPDIR)/t1_32.Plo ./$(DEPDIR)/t1_4.Plo \
./$(DEPDIR)/t1_5.Plo ./$(DEPDIR)/t1_6.Plo \
./$(DEPDIR)/t1_64.Plo ./$(DEPDIR)/t1_7.Plo \
./$(DEPDIR)/t1_8.Plo ./$(DEPDIR)/t1_9.Plo \
./$(DEPDIR)/t2_10.Plo ./$(DEPDIR)/t2_16.Plo \
./$(DEPDIR)/t2_20.Plo ./$(DEPDIR)/t2_25.Plo \
./$(DEPDIR)/t2_32.Plo ./$(DEPDIR)/t2_4.Plo \
./$(DEPDIR)/t2_5.Plo ./$(DEPDIR)/t2_64.Plo \
./$(DEPDIR)/t2_8.Plo
am__mv = mv -f
COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
$(AM_CFLAGS) $(CFLAGS)
AM_V_CC = $(am__v_CC_@AM_V@)
am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
am__v_CC_0 = @echo " CC " $@;
am__v_CC_1 =
CCLD = $(CC)
LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
$(AM_LDFLAGS) $(LDFLAGS) -o $@
AM_V_CCLD = $(am__v_CCLD_@AM_V@)
am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
am__v_CCLD_0 = @echo " CCLD " $@;
am__v_CCLD_1 =
SOURCES = $(libdft_scalar_codelets_la_SOURCES)
DIST_SOURCES = $(libdft_scalar_codelets_la_SOURCES)
am__can_run_installinfo = \
case $$AM_UPDATE_INFO_DIR in \
n|no|NO) false;; \
*) (install-info --version) >/dev/null 2>&1;; \
esac
am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
# Read a list of newline-separated strings from the standard input,
# and print each of them once, without duplicates. Input order is
# *not* preserved.
# Implemented as an $(AWK) program that stores every input line as an array
# key and prints the keys at the end; nothing is printed for empty input.
am__uniquify_input = $(AWK) '\
BEGIN { nonempty = 0; } \
{ items[$$0] = 1; nonempty = 1; } \
END { if (nonempty) { for (i in items) print i; }; } \
'
# Make sure the list of sources is unique. This is necessary because,
# e.g., the same source file might be shared among _SOURCES variables
# for different programs/libraries.
# Sets the shell variables 'list' (all tagged files) and 'unique'.  A file
# that exists relative to the current directory is listed as-is; any other
# file is prefixed with $(srcdir)/.
am__define_uniq_tagged_files = \
list='$(am__tagged_files)'; \
unique=`for i in $$list; do \
if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
done | $(am__uniquify_input)`
ETAGS = etags
CTAGS = ctags
am__DIST_COMMON = $(srcdir)/Makefile.in $(top_srcdir)/depcomp \
$(top_srcdir)/support/Makefile.codelets
DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
ACLOCAL = @ACLOCAL@
ALLOCA = @ALLOCA@
ALTIVEC_CFLAGS = @ALTIVEC_CFLAGS@
AMTAR = @AMTAR@
AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
AR = @AR@
AS = @AS@
AUTOCONF = @AUTOCONF@
AUTOHEADER = @AUTOHEADER@
AUTOMAKE = @AUTOMAKE@
AVX2_CFLAGS = @AVX2_CFLAGS@
AVX512_CFLAGS = @AVX512_CFLAGS@
AVX_128_FMA_CFLAGS = @AVX_128_FMA_CFLAGS@
AVX_CFLAGS = @AVX_CFLAGS@
AWK = @AWK@
CC = @CC@
CCDEPMODE = @CCDEPMODE@
CFLAGS = @CFLAGS@
CHECK_PL_OPTS = @CHECK_PL_OPTS@
CPP = @CPP@
CPPFLAGS = @CPPFLAGS@
CYGPATH_W = @CYGPATH_W@
C_FFTW_R2R_KIND = @C_FFTW_R2R_KIND@
C_MPI_FINT = @C_MPI_FINT@
DEFS = @DEFS@
DEPDIR = @DEPDIR@
DLLTOOL = @DLLTOOL@
DSYMUTIL = @DSYMUTIL@
DUMPBIN = @DUMPBIN@
ECHO_C = @ECHO_C@
ECHO_N = @ECHO_N@
ECHO_T = @ECHO_T@
EGREP = @EGREP@
EXEEXT = @EXEEXT@
F77 = @F77@
FFLAGS = @FFLAGS@
FGREP = @FGREP@
FLIBS = @FLIBS@
GREP = @GREP@
INDENT = @INDENT@
INSTALL = @INSTALL@
INSTALL_DATA = @INSTALL_DATA@
INSTALL_PROGRAM = @INSTALL_PROGRAM@
INSTALL_SCRIPT = @INSTALL_SCRIPT@
INSTALL_STRIP_PROGRAM = @INSTALL_STRIP_PROGRAM@
KCVI_CFLAGS = @KCVI_CFLAGS@
LD = @LD@
LDFLAGS = @LDFLAGS@
LIBOBJS = @LIBOBJS@
LIBQUADMATH = @LIBQUADMATH@
LIBS = @LIBS@
LIBTOOL = @LIBTOOL@
LIPO = @LIPO@
LN_S = @LN_S@
LTLIBOBJS = @LTLIBOBJS@
LT_SYS_LIBRARY_PATH = @LT_SYS_LIBRARY_PATH@
MAINT = @MAINT@
MAKEINFO = @MAKEINFO@
MANIFEST_TOOL = @MANIFEST_TOOL@
MKDIR_P = @MKDIR_P@
MPICC = @MPICC@
MPILIBS = @MPILIBS@
MPIRUN = @MPIRUN@
NEON_CFLAGS = @NEON_CFLAGS@
NM = @NM@
NMEDIT = @NMEDIT@
OBJDUMP = @OBJDUMP@
OBJEXT = @OBJEXT@
OCAMLBUILD = @OCAMLBUILD@
OPENMP_CFLAGS = @OPENMP_CFLAGS@
OTOOL = @OTOOL@
OTOOL64 = @OTOOL64@
PACKAGE = @PACKAGE@
PACKAGE_BUGREPORT = @PACKAGE_BUGREPORT@
PACKAGE_NAME = @PACKAGE_NAME@
PACKAGE_STRING = @PACKAGE_STRING@
PACKAGE_TARNAME = @PACKAGE_TARNAME@
PACKAGE_URL = @PACKAGE_URL@
PACKAGE_VERSION = @PACKAGE_VERSION@
PATH_SEPARATOR = @PATH_SEPARATOR@
POW_LIB = @POW_LIB@
PRECISION = @PRECISION@
PREC_SUFFIX = @PREC_SUFFIX@
PTHREAD_CC = @PTHREAD_CC@
PTHREAD_CFLAGS = @PTHREAD_CFLAGS@
PTHREAD_LIBS = @PTHREAD_LIBS@
RANLIB = @RANLIB@
SED = @SED@
SET_MAKE = @SET_MAKE@
SHARED_VERSION_INFO = @SHARED_VERSION_INFO@
SHELL = @SHELL@
SSE2_CFLAGS = @SSE2_CFLAGS@
STACK_ALIGN_CFLAGS = @STACK_ALIGN_CFLAGS@
STRIP = @STRIP@
THREADLIBS = @THREADLIBS@
VERSION = @VERSION@
VSX_CFLAGS = @VSX_CFLAGS@
abs_builddir = @abs_builddir@
abs_srcdir = @abs_srcdir@
abs_top_builddir = @abs_top_builddir@
abs_top_srcdir = @abs_top_srcdir@
ac_ct_AR = @ac_ct_AR@
ac_ct_CC = @ac_ct_CC@
ac_ct_DUMPBIN = @ac_ct_DUMPBIN@
ac_ct_F77 = @ac_ct_F77@
acx_pthread_config = @acx_pthread_config@
am__include = @am__include@
am__leading_dot = @am__leading_dot@
am__quote = @am__quote@
am__tar = @am__tar@
am__untar = @am__untar@
bindir = @bindir@
build = @build@
build_alias = @build_alias@
build_cpu = @build_cpu@
build_os = @build_os@
build_vendor = @build_vendor@
builddir = @builddir@
datadir = @datadir@
datarootdir = @datarootdir@
docdir = @docdir@
dvidir = @dvidir@
exec_prefix = @exec_prefix@
host = @host@
host_alias = @host_alias@
host_cpu = @host_cpu@
host_os = @host_os@
host_vendor = @host_vendor@
htmldir = @htmldir@
includedir = @includedir@
infodir = @infodir@
install_sh = @install_sh@
libdir = @libdir@
libexecdir = @libexecdir@
localedir = @localedir@
localstatedir = @localstatedir@
mandir = @mandir@
mkdir_p = @mkdir_p@
oldincludedir = @oldincludedir@
pdfdir = @pdfdir@
prefix = @prefix@
program_transform_name = @program_transform_name@
psdir = @psdir@
runstatedir = @runstatedir@
sbindir = @sbindir@
sharedstatedir = @sharedstatedir@
srcdir = @srcdir@
sysconfdir = @sysconfdir@
target_alias = @target_alias@
top_build_prefix = @top_build_prefix@
top_builddir = @top_builddir@
top_srcdir = @top_srcdir@
###########################################################################
AM_CPPFLAGS = -I $(top_srcdir)
noinst_LTLIBRARIES = libdft_scalar_codelets.la
###########################################################################
# n1_<n> is a hard-coded FFT of size <n> (base cases of FFT recursion)
N1 = n1_2.c n1_3.c n1_4.c n1_5.c n1_6.c n1_7.c n1_8.c n1_9.c n1_10.c \
n1_11.c n1_12.c n1_13.c n1_14.c n1_15.c n1_16.c n1_32.c n1_64.c \
n1_20.c n1_25.c # n1_30.c n1_40.c n1_50.c
###########################################################################
# t1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIT step
T1 = t1_2.c t1_3.c t1_4.c t1_5.c t1_6.c t1_7.c t1_8.c t1_9.c \
t1_10.c t1_12.c t1_15.c t1_16.c t1_32.c t1_64.c \
t1_20.c t1_25.c # t1_30.c t1_40.c t1_50.c
# t2_<r> is also a twiddle FFT, but instead of using a complete lookup table
# of trig. functions, it partially generates the trig. values on the fly
# (this is faster for large sizes).
T2 = t2_4.c t2_8.c t2_16.c t2_32.c t2_64.c \
t2_5.c t2_10.c t2_20.c t2_25.c
###########################################################################
# The F (DIF) codelets are used for a kind of in-place transform algorithm,
# but the planner seems to never (or hardly ever) use them on the machines
# we have access to, preferring the Q codelets and the use of buffers
# for sub-transforms. So, we comment them out, at least for now.
# f1_<r> is a "twiddle" FFT of size <r>, implementing a radix-r DIF step
F1 = # f1_2.c f1_3.c f1_4.c f1_5.c f1_6.c f1_7.c f1_8.c f1_9.c f1_10.c f1_12.c f1_15.c f1_16.c f1_32.c f1_64.c
# like f1, but partially generates its trig. table on the fly
F2 = # f2_4.c f2_8.c f2_16.c f2_32.c f2_64.c
###########################################################################
# q1_<r> is <r> twiddle FFTs of size <r> (DIF step), where the output is
# transposed. This is used for in-place transposes in sizes that are
# divisible by <r>^2. These codelets have size ~ <r>^2, so you should
# probably not use <r> bigger than 8 or so.
Q1 = q1_2.c q1_4.c q1_8.c q1_3.c q1_5.c q1_6.c
###########################################################################
ALL_CODELETS = $(N1) $(T1) $(T2) $(F1) $(F2) $(Q1)
BUILT_SOURCES = $(ALL_CODELETS) $(CODLIST)
libdft_scalar_codelets_la_SOURCES = $(BUILT_SOURCES)
SOLVTAB_NAME = X(solvtab_dft_standard)
XRENAME = X
# Name of the codelet-list source; it is compiled into the library as
# codlist.lo alongside the codelets.
CODLIST = codlist.c
CODELET_NAME = codelet_
#INDENT = indent -kr -cs -i5 -l800 -fca -nfc1 -sc -sob -cli4 -TR -Tplanner -TV
# Everything below is only defined in maintainer mode.
# TWOVERS is a shell script wrapper from support/; the GEN_* variables name
# the generator programs under $(top_builddir)/genfft.
@MAINTAINER_MODE_TRUE@TWOVERS = sh ${top_srcdir}/support/twovers.sh
@MAINTAINER_MODE_TRUE@GENFFTDIR = ${top_builddir}/genfft
@MAINTAINER_MODE_TRUE@GEN_NOTW = ${GENFFTDIR}/gen_notw.native
@MAINTAINER_MODE_TRUE@GEN_NOTW_C = ${GENFFTDIR}/gen_notw_c.native
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE = ${GENFFTDIR}/gen_twiddle.native
@MAINTAINER_MODE_TRUE@GEN_TWIDDLE_C = ${GENFFTDIR}/gen_twiddle_c.native
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ = ${GENFFTDIR}/gen_twidsq.native
@MAINTAINER_MODE_TRUE@GEN_TWIDSQ_C = ${GENFFTDIR}/gen_twidsq_c.native
@MAINTAINER_MODE_TRUE@GEN_R2CF = ${GENFFTDIR}/gen_r2cf.native
@MAINTAINER_MODE_TRUE@GEN_R2CB = ${GENFFTDIR}/gen_r2cb.native
@MAINTAINER_MODE_TRUE@GEN_HC2HC = ${GENFFTDIR}/gen_hc2hc.native
@MAINTAINER_MODE_TRUE@GEN_HC2C = ${GENFFTDIR}/gen_hc2c.native
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT = ${GENFFTDIR}/gen_hc2cdft.native
@MAINTAINER_MODE_TRUE@GEN_HC2CDFT_C = ${GENFFTDIR}/gen_hc2cdft_c.native
@MAINTAINER_MODE_TRUE@GEN_R2R = ${GENFFTDIR}/gen_r2r.native
# Prelude files placed at the top of generated DFT and RDFT codelets.
@MAINTAINER_MODE_TRUE@PRELUDE_DFT = ${top_srcdir}/support/codelet_prelude.dft
@MAINTAINER_MODE_TRUE@PRELUDE_RDFT = ${top_srcdir}/support/codelet_prelude.rdft
# Replaces the @DATE@ placeholder with the output of `date`.
@MAINTAINER_MODE_TRUE@ADD_DATE = sed -e s/@DATE@/"`date`"/
@MAINTAINER_MODE_TRUE@COPYRIGHT = ${top_srcdir}/COPYRIGHT
# NOTE(review): $(PRELUDE) is not defined anywhere in the visible source (only
# PRELUDE_DFT and PRELUDE_RDFT are), so this looks like it expands to just
# $(COPYRIGHT) and the codelets do not depend on the prelude files -- confirm
# against support/Makefile.codelets, which is where any fix belongs.
@MAINTAINER_MODE_TRUE@CODELET_DEPS = $(COPYRIGHT) $(PRELUDE)
# Commands that print the copyright notice followed by the matching prelude.
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_DFT = cat $(COPYRIGHT) $(PRELUDE_DFT)
@MAINTAINER_MODE_TRUE@PRELUDE_COMMANDS_RDFT = cat $(COPYRIGHT) $(PRELUDE_RDFT)
# Generator options shared by all codelets; the DFT and RDFT variants both
# add -pipeline-latency 4.
@MAINTAINER_MODE_TRUE@FLAGS_COMMON = -compact -variables 4
@MAINTAINER_MODE_TRUE@DFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
@MAINTAINER_MODE_TRUE@RDFT_FLAGS_COMMON = $(FLAGS_COMMON) -pipeline-latency 4
# special rules for regenerating codelets.
# Per-family generator options, all based on $(DFT_FLAGS_COMMON).
@MAINTAINER_MODE_TRUE@FLAGS_N1 = $(DFT_FLAGS_COMMON)
@MAINTAINER_MODE_TRUE@FLAGS_T1 = $(DFT_FLAGS_COMMON)
@MAINTAINER_MODE_TRUE@FLAGS_T2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
@MAINTAINER_MODE_TRUE@FLAGS_F1 = $(DFT_FLAGS_COMMON)
@MAINTAINER_MODE_TRUE@FLAGS_F2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
@MAINTAINER_MODE_TRUE@FLAGS_Q1 = $(DFT_FLAGS_COMMON) -reload-twiddle
@MAINTAINER_MODE_TRUE@FLAGS_Q2 = $(DFT_FLAGS_COMMON) -twiddle-log3 -precompute-twiddles
# Default target: make sure every generated source exists, then re-invoke
# make for the real work in all-am.  Recipe lines must start with a TAB.
all: $(BUILT_SOURCES)
	$(MAKE) $(AM_MAKEFLAGS) all-am

# Reset the suffix list to just the ones used by the compile rules here.
.SUFFIXES:
.SUFFIXES: .c .lo .o .obj

# Regenerate Makefile.in.  If any changed prerequisite is one of the configure
# dependencies, delegate to the top-level am--refresh target instead;
# otherwise rerun automake for this directory only.
$(srcdir)/Makefile.in: @MAINTAINER_MODE_TRUE@ $(srcdir)/Makefile.am $(top_srcdir)/support/Makefile.codelets $(am__configure_deps)
	@for dep in $?; do \
	  case '$(am__configure_deps)' in \
	    *$$dep*) \
	      ( cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh ) \
	        && { if test -f $@; then exit 0; else break; fi; }; \
	      exit 1;; \
	  esac; \
	done; \
	echo ' cd $(top_srcdir) && $(AUTOMAKE) --gnu dft/scalar/codelets/Makefile'; \
	$(am__cd) $(top_srcdir) && \
	  $(AUTOMAKE) --gnu dft/scalar/codelets/Makefile

# Regenerate this Makefile: a changed config.status triggers a top-level
# am--refresh, anything else just reruns config.status for this file.
Makefile: $(srcdir)/Makefile.in $(top_builddir)/config.status
	@case '$?' in \
	  *config.status*) \
	    cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh;; \
	  *) \
	    echo ' cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles)'; \
	    cd $(top_builddir) && $(SHELL) ./config.status $(subdir)/$@ $(am__maybe_remake_depfiles);; \
	esac;

# Empty rule for the included fragment.
$(top_srcdir)/support/Makefile.codelets $(am__empty):

# The configure machinery itself is refreshed from the top-level directory.
$(top_builddir)/config.status: $(top_srcdir)/configure $(CONFIG_STATUS_DEPENDENCIES)
	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh

$(top_srcdir)/configure: @MAINTAINER_MODE_TRUE@ $(am__configure_deps)
	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh

$(ACLOCAL_M4): @MAINTAINER_MODE_TRUE@ $(am__aclocal_m4_deps)
	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh

$(am__aclocal_m4_deps):
# Remove the convenience library, plus an so_locations file in each directory
# that holds one of the listed libraries.  Recipe lines must start with a TAB.
clean-noinstLTLIBRARIES:
	-test -z "$(noinst_LTLIBRARIES)" || rm -f $(noinst_LTLIBRARIES)
	@list='$(noinst_LTLIBRARIES)'; \
	locs=`for p in $$list; do echo $$p; done | \
	      sed 's|^[^/]*$$|.|; s|/[^/]*$$||; s|$$|/so_locations|' | \
	      sort -u`; \
	test -z "$$locs" || { \
	  echo rm -f $${locs}; \
	  rm -f $${locs}; \
	}

# Link all codelet objects into the libtool convenience library.
libdft_scalar_codelets.la: $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_DEPENDENCIES) $(EXTRA_libdft_scalar_codelets_la_DEPENDENCIES)
	$(AM_V_CCLD)$(LINK) $(libdft_scalar_codelets_la_OBJECTS) $(libdft_scalar_codelets_la_LIBADD) $(LIBS)

# Remove plain object files.
mostlyclean-compile:
	-rm -f *.$(OBJEXT)

distclean-compile:
	-rm -f *.tab.c
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/codlist.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_10.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_11.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_12.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_13.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_14.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_15.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_16.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_20.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_25.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_3.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_32.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_4.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_5.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_6.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_64.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_7.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_8.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/n1_9.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_3.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_4.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_5.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_6.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/q1_8.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_10.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_12.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_15.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_16.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_2.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_20.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_25.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_3.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_32.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_4.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_5.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_6.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_64.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_7.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_8.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t1_9.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_10.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_16.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_20.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_25.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_32.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_4.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_5.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_64.Plo@am__quote@ # am--include-marker
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/t2_8.Plo@am__quote@ # am--include-marker
# Create a placeholder for every dependency file so that the include
# directives for them succeed.  The file is written to a temporary name and
# then moved into place.  Recipe lines must start with a TAB.
$(am__depfiles_remade):
	@$(MKDIR_P) $(@D)
	@echo '# dummy' >$@-t && $(am__mv) $@-t $@

am--depfiles: $(am__depfiles_remade)

# Suffix rules for compiling C.  The am__fastdepCC_TRUE lines write a .Tpo
# file with -MD -MP -MF while compiling and then rename it; the
# AMDEP_TRUE/am__fastdepCC_FALSE lines go through $(depcomp) instead.
.c.o:
@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<

# Same as .c.o, but the source path is passed through $(CYGPATH_W).
.c.obj:
@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`

# Libtool objects: compiled with $(LTCOMPILE), dependencies stored as .Plo.
.c.lo:
@am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<

mostlyclean-libtool:
	-rm -f *.lo

clean-libtool:
	-rm -rf .libs _libs
# Source-index targets.  Each one builds the de-duplicated file list with
# $(am__define_uniq_tagged_files) and hands it to the indexing tool.
# Recipe lines must start with a TAB.
ID: $(am__tagged_files)
	$(am__define_uniq_tagged_files); mkid -fID $$unique
tags: tags-am
TAGS: tags

# Run $(ETAGS) unless there is nothing at all to index.
tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
	set x; \
	here=`pwd`; \
	$(am__define_uniq_tagged_files); \
	shift; \
	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
	  test -n "$$unique" || unique=$$empty_fix; \
	  if test $$# -gt 0; then \
	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
	      "$$@" $$unique; \
	  else \
	    $(ETAGS) $(ETAGSFLAGS) $(AM_ETAGSFLAGS) $(ETAGS_ARGS) \
	      $$unique; \
	  fi; \
	fi
ctags: ctags-am

CTAGS: ctags
# Run $(CTAGS) unless both CTAGS_ARGS and the file list are empty.
ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
	$(am__define_uniq_tagged_files); \
	test -z "$(CTAGS_ARGS)$$unique" \
	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
	     $$unique

# Run gtags from the top source directory, passing the absolute path of the
# top build directory as its argument.
GTAGS:
	here=`$(am__cd) $(top_builddir) && pwd` \
	  && $(am__cd) $(top_srcdir) \
	  && gtags -i $(GTAGS_ARGS) "$$here"
cscopelist: cscopelist-am

# Append this directory's tagged files to the top-level cscope.files.  Files
# found in the build directory are listed under $(subdir); the others are
# listed under the source directory.
cscopelist-am: $(am__tagged_files)
	list='$(am__tagged_files)'; \
	case "$(srcdir)" in \
	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
	  *) sdir=$(subdir)/$(srcdir) ;; \
	esac; \
	for i in $$list; do \
	  if test -f "$$i"; then \
	    echo "$(subdir)/$$i"; \
	  else \
	    echo "$$sdir/$$i"; \
	  fi; \
	done >> $(top_builddir)/cscope.files

distclean-tags:
	-rm -f TAGS ID GTAGS GRTAGS GSYMS GPATH tags
# Distribution: the generated sources must exist before files are copied.
# Recipe lines must start with a TAB.
distdir: $(BUILT_SOURCES)
	$(MAKE) $(AM_MAKEFLAGS) distdir-am

# Copy every file in $(DISTFILES) into $(distdir).  The $(srcdir)/ prefix is
# stripped from each name and a $(top_srcdir)/ prefix is rewritten to
# $(top_builddir)/; needed subdirectories are created first.  Each entry is
# taken from the build directory when it exists there, else from $(srcdir).
# Directories are copied recursively, regular files only if not yet present.
distdir-am: $(DISTFILES)
	@srcdirstrip=`echo "$(srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
	topsrcdirstrip=`echo "$(top_srcdir)" | sed 's/[].[^$$\\*]/\\\\&/g'`; \
	list='$(DISTFILES)'; \
	  dist_files=`for file in $$list; do echo $$file; done | \
	  sed -e "s|^$$srcdirstrip/||;t" \
	      -e "s|^$$topsrcdirstrip/|$(top_builddir)/|;t"`; \
	case $$dist_files in \
	  */*) $(MKDIR_P) `echo "$$dist_files" | \
			   sed '/\//!d;s|^|$(distdir)/|;s,/[^/]*$$,,' | \
			   sort -u` ;; \
	esac; \
	for file in $$dist_files; do \
	  if test -f $$file || test -d $$file; then d=.; else d=$(srcdir); fi; \
	  if test -d $$d/$$file; then \
	    dir=`echo "/$$file" | sed -e 's,/[^/]*$$,,'`; \
	    if test -d "$(distdir)/$$file"; then \
	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
	    fi; \
	    if test -d $(srcdir)/$$file && test $$d != $(srcdir); then \
	      cp -fpR $(srcdir)/$$file "$(distdir)$$dir" || exit 1; \
	      find "$(distdir)/$$file" -type d ! -perm -700 -exec chmod u+rwx {} \;; \
	    fi; \
	    cp -fpR $$d/$$file "$(distdir)$$dir" || exit 1; \
	  else \
	    test -f "$(distdir)/$$file" \
	    || cp -p $$d/$$file "$(distdir)/$$file" \
	    || exit 1; \
	  fi; \
	done
# Build/check/install entry points. check, install and install-exec first
# bring $(BUILT_SOURCES) up to date and then re-invoke make on the matching
# -am target; all-am builds the Makefile and $(LTLIBRARIES).
# install-strip re-runs "install" with INSTALL_PROGRAM and install_sh_PROGRAM
# overridden by $(INSTALL_STRIP_PROGRAM) and INSTALL_STRIP_FLAG=-s; when
# $(STRIP) is non-empty it is additionally handed over as STRIPPROG through
# INSTALL_PROGRAM_ENV.
check-am: all-am
check: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) check-am
all-am: Makefile $(LTLIBRARIES)
installdirs:
install: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) install-am
install-exec: $(BUILT_SOURCES)
$(MAKE) $(AM_MAKEFLAGS) install-exec-am
install-data: install-data-am
uninstall: uninstall-am
install-am: all-am
@$(MAKE) $(AM_MAKEFLAGS) install-exec-am install-data-am
installcheck: installcheck-am
install-strip:
if test -z '$(STRIP)'; then \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
install; \
else \
$(MAKE) $(AM_MAKEFLAGS) INSTALL_PROGRAM="$(INSTALL_STRIP_PROGRAM)" \
install_sh_PROGRAM="$(INSTALL_STRIP_PROGRAM)" INSTALL_STRIP_FLAG=-s \
"INSTALL_PROGRAM_ENV=STRIPPROG='$(STRIP)'" install; \
fi
# Generic cleaning hooks. mostlyclean-generic and clean-generic have nothing
# to do here. distclean-generic removes $(CONFIG_CLEAN_FILES) and, unless the
# build runs in the source directory, $(CONFIG_CLEAN_VPATH_FILES).
# maintainer-clean-generic prints a warning and removes $(BUILT_SOURCES).
mostlyclean-generic:
clean-generic:
distclean-generic:
-test -z "$(CONFIG_CLEAN_FILES)" || rm -f $(CONFIG_CLEAN_FILES)
-test . = "$(srcdir)" || test -z "$(CONFIG_CLEAN_VPATH_FILES)" || rm -f $(CONFIG_CLEAN_VPATH_FILES)
maintainer-clean-generic:
@echo "This command is intended for maintainers to use"
@echo "it deletes files that may require special tools to rebuild."
-test -z "$(BUILT_SOURCES)" || rm -f $(BUILT_SOURCES)
# clean removes build products via clean-am. distclean additionally runs
# distclean-am and then deletes the per-source dependency files
# ($(DEPDIR)/*.Plo, one for codlist and one per codelet) and the generated
# Makefile itself.
clean: clean-am
clean-am: clean-generic clean-libtool clean-noinstLTLIBRARIES \
mostlyclean-am
distclean: distclean-am
-rm -f ./$(DEPDIR)/codlist.Plo
-rm -f ./$(DEPDIR)/n1_10.Plo
-rm -f ./$(DEPDIR)/n1_11.Plo
-rm -f ./$(DEPDIR)/n1_12.Plo
-rm -f ./$(DEPDIR)/n1_13.Plo
-rm -f ./$(DEPDIR)/n1_14.Plo
-rm -f ./$(DEPDIR)/n1_15.Plo
-rm -f ./$(DEPDIR)/n1_16.Plo
-rm -f ./$(DEPDIR)/n1_2.Plo
-rm -f ./$(DEPDIR)/n1_20.Plo
-rm -f ./$(DEPDIR)/n1_25.Plo
-rm -f ./$(DEPDIR)/n1_3.Plo
-rm -f ./$(DEPDIR)/n1_32.Plo
-rm -f ./$(DEPDIR)/n1_4.Plo
-rm -f ./$(DEPDIR)/n1_5.Plo
-rm -f ./$(DEPDIR)/n1_6.Plo
-rm -f ./$(DEPDIR)/n1_64.Plo
-rm -f ./$(DEPDIR)/n1_7.Plo
-rm -f ./$(DEPDIR)/n1_8.Plo
-rm -f ./$(DEPDIR)/n1_9.Plo
-rm -f ./$(DEPDIR)/q1_2.Plo
-rm -f ./$(DEPDIR)/q1_3.Plo
-rm -f ./$(DEPDIR)/q1_4.Plo
-rm -f ./$(DEPDIR)/q1_5.Plo
-rm -f ./$(DEPDIR)/q1_6.Plo
-rm -f ./$(DEPDIR)/q1_8.Plo
-rm -f ./$(DEPDIR)/t1_10.Plo
-rm -f ./$(DEPDIR)/t1_12.Plo
-rm -f ./$(DEPDIR)/t1_15.Plo
-rm -f ./$(DEPDIR)/t1_16.Plo
-rm -f ./$(DEPDIR)/t1_2.Plo
-rm -f ./$(DEPDIR)/t1_20.Plo
-rm -f ./$(DEPDIR)/t1_25.Plo
-rm -f ./$(DEPDIR)/t1_3.Plo
-rm -f ./$(DEPDIR)/t1_32.Plo
-rm -f ./$(DEPDIR)/t1_4.Plo
-rm -f ./$(DEPDIR)/t1_5.Plo
-rm -f ./$(DEPDIR)/t1_6.Plo
-rm -f ./$(DEPDIR)/t1_64.Plo
-rm -f ./$(DEPDIR)/t1_7.Plo
-rm -f ./$(DEPDIR)/t1_8.Plo
-rm -f ./$(DEPDIR)/t1_9.Plo
-rm -f ./$(DEPDIR)/t2_10.Plo
-rm -f ./$(DEPDIR)/t2_16.Plo
-rm -f ./$(DEPDIR)/t2_20.Plo
-rm -f ./$(DEPDIR)/t2_25.Plo
-rm -f ./$(DEPDIR)/t2_32.Plo
-rm -f ./$(DEPDIR)/t2_4.Plo
-rm -f ./$(DEPDIR)/t2_5.Plo
-rm -f ./$(DEPDIR)/t2_64.Plo
-rm -f ./$(DEPDIR)/t2_8.Plo
-rm -f Makefile
distclean-am: clean-am distclean-compile distclean-generic \
distclean-tags
# Documentation (dvi/html/info/pdf/ps) and install hooks. None of the -am
# targets below has a recipe, so in this directory they are all no-ops; the
# library built here is listed under noinst_LTLIBRARIES and is not installed.
dvi: dvi-am
dvi-am:
html: html-am
html-am:
info: info-am
info-am:
install-data-am:
install-dvi: install-dvi-am
install-dvi-am:
install-exec-am:
install-html: install-html-am
install-html-am:
install-info: install-info-am
install-info-am:
install-man:
install-pdf: install-pdf-am
install-pdf-am:
install-ps: install-ps-am
install-ps-am:
installcheck-am:
# maintainer-clean runs maintainer-clean-am (which in turn pulls in
# distclean-am, maintainer-clean-generic and this directory's
# maintainer-clean-local hook) and then deletes the same dependency files and
# Makefile that distclean removes.
# mostlyclean delegates to the compile/generic/libtool mostlyclean hooks;
# pdf-am, ps-am and uninstall-am have no recipe here.
maintainer-clean: maintainer-clean-am
-rm -f ./$(DEPDIR)/codlist.Plo
-rm -f ./$(DEPDIR)/n1_10.Plo
-rm -f ./$(DEPDIR)/n1_11.Plo
-rm -f ./$(DEPDIR)/n1_12.Plo
-rm -f ./$(DEPDIR)/n1_13.Plo
-rm -f ./$(DEPDIR)/n1_14.Plo
-rm -f ./$(DEPDIR)/n1_15.Plo
-rm -f ./$(DEPDIR)/n1_16.Plo
-rm -f ./$(DEPDIR)/n1_2.Plo
-rm -f ./$(DEPDIR)/n1_20.Plo
-rm -f ./$(DEPDIR)/n1_25.Plo
-rm -f ./$(DEPDIR)/n1_3.Plo
-rm -f ./$(DEPDIR)/n1_32.Plo
-rm -f ./$(DEPDIR)/n1_4.Plo
-rm -f ./$(DEPDIR)/n1_5.Plo
-rm -f ./$(DEPDIR)/n1_6.Plo
-rm -f ./$(DEPDIR)/n1_64.Plo
-rm -f ./$(DEPDIR)/n1_7.Plo
-rm -f ./$(DEPDIR)/n1_8.Plo
-rm -f ./$(DEPDIR)/n1_9.Plo
-rm -f ./$(DEPDIR)/q1_2.Plo
-rm -f ./$(DEPDIR)/q1_3.Plo
-rm -f ./$(DEPDIR)/q1_4.Plo
-rm -f ./$(DEPDIR)/q1_5.Plo
-rm -f ./$(DEPDIR)/q1_6.Plo
-rm -f ./$(DEPDIR)/q1_8.Plo
-rm -f ./$(DEPDIR)/t1_10.Plo
-rm -f ./$(DEPDIR)/t1_12.Plo
-rm -f ./$(DEPDIR)/t1_15.Plo
-rm -f ./$(DEPDIR)/t1_16.Plo
-rm -f ./$(DEPDIR)/t1_2.Plo
-rm -f ./$(DEPDIR)/t1_20.Plo
-rm -f ./$(DEPDIR)/t1_25.Plo
-rm -f ./$(DEPDIR)/t1_3.Plo
-rm -f ./$(DEPDIR)/t1_32.Plo
-rm -f ./$(DEPDIR)/t1_4.Plo
-rm -f ./$(DEPDIR)/t1_5.Plo
-rm -f ./$(DEPDIR)/t1_6.Plo
-rm -f ./$(DEPDIR)/t1_64.Plo
-rm -f ./$(DEPDIR)/t1_7.Plo
-rm -f ./$(DEPDIR)/t1_8.Plo
-rm -f ./$(DEPDIR)/t1_9.Plo
-rm -f ./$(DEPDIR)/t2_10.Plo
-rm -f ./$(DEPDIR)/t2_16.Plo
-rm -f ./$(DEPDIR)/t2_20.Plo
-rm -f ./$(DEPDIR)/t2_25.Plo
-rm -f ./$(DEPDIR)/t2_32.Plo
-rm -f ./$(DEPDIR)/t2_4.Plo
-rm -f ./$(DEPDIR)/t2_5.Plo
-rm -f ./$(DEPDIR)/t2_64.Plo
-rm -f ./$(DEPDIR)/t2_8.Plo
-rm -f Makefile
maintainer-clean-am: distclean-am maintainer-clean-generic \
maintainer-clean-local
mostlyclean: mostlyclean-am
mostlyclean-am: mostlyclean-compile mostlyclean-generic \
mostlyclean-libtool
pdf: pdf-am
pdf-am:
ps: ps-am
ps-am:
uninstall-am:
# Special targets: .MAKE lists the targets whose recipes re-invoke $(MAKE),
# .PHONY lists the targets that do not correspond to files, and .PRECIOUS
# keeps make from deleting the Makefile if its regeneration is interrupted.
.MAKE: all check install install-am install-exec install-strip
.PHONY: CTAGS GTAGS TAGS all all-am am--depfiles check check-am clean \
clean-generic clean-libtool clean-noinstLTLIBRARIES \
cscopelist-am ctags ctags-am distclean distclean-compile \
distclean-generic distclean-libtool distclean-tags distdir dvi \
dvi-am html html-am info info-am install install-am \
install-data install-data-am install-dvi install-dvi-am \
install-exec install-exec-am install-html install-html-am \
install-info install-info-am install-man install-pdf \
install-pdf-am install-ps install-ps-am install-strip \
installcheck installcheck-am installdirs maintainer-clean \
maintainer-clean-generic maintainer-clean-local mostlyclean \
mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
pdf pdf-am ps ps-am tags tags-am uninstall uninstall-am
.PRECIOUS: Makefile
# Local hook run as a prerequisite of maintainer-clean-am: removes the
# generated codelet list $(CODLIST).
# Only delete codlist.c in maintainer-mode, since it is included in the dist.
# FIXME: is there a way to delete in 'make clean' only when builddir != srcdir?
maintainer-clean-local:
rm -f $(CODLIST)
# Rule to build codlist (maintainer mode only; regenerated whenever the
# Makefile changes). The recipe writes, in order:
#   - the kernel/ifftw.h include and the $(INCLUDE_SIMD_HEADER) line;
#   - one "extern void ...(planner *);" declaration per $(ALL_CODELETS) entry,
#     named from the file's basename with its .c/.S suffix removed;
#   - the $(SOLVTAB_NAME) table with one SOLVTAB(...) entry per codelet in the
#     same order, closed by SOLVTAB_END.
# The trailing NIL sentinel in both loops is skipped by the inner test;
# presumably it keeps the "for" word list non-empty when $(ALL_CODELETS) is
# empty.
@MAINTAINER_MODE_TRUE@$(CODLIST): Makefile
@MAINTAINER_MODE_TRUE@ ( \
@MAINTAINER_MODE_TRUE@ echo "#include \"kernel/ifftw.h\""; \
@MAINTAINER_MODE_TRUE@ echo $(INCLUDE_SIMD_HEADER); \
@MAINTAINER_MODE_TRUE@ echo; \
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
@MAINTAINER_MODE_TRUE@ echo "extern void $(XRENAME)($(CODELET_NAME)$$j)(planner *);"; \
@MAINTAINER_MODE_TRUE@ fi \
@MAINTAINER_MODE_TRUE@ done; \
@MAINTAINER_MODE_TRUE@ echo; \
@MAINTAINER_MODE_TRUE@ echo; \
@MAINTAINER_MODE_TRUE@ echo "extern const solvtab $(SOLVTAB_NAME);"; \
@MAINTAINER_MODE_TRUE@ echo "const solvtab $(SOLVTAB_NAME) = {"; \
@MAINTAINER_MODE_TRUE@ for i in $(ALL_CODELETS) NIL; do \
@MAINTAINER_MODE_TRUE@ if test "$$i" != NIL; then \
@MAINTAINER_MODE_TRUE@ j=`basename $$i | sed -e 's/[.][cS]$$//g'`; \
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB($(XRENAME)($(CODELET_NAME)$$j)),"; \
@MAINTAINER_MODE_TRUE@ fi \
@MAINTAINER_MODE_TRUE@ done; \
@MAINTAINER_MODE_TRUE@ echo " SOLVTAB_END"; \
@MAINTAINER_MODE_TRUE@ echo "};"; \
@MAINTAINER_MODE_TRUE@ ) >$@
# cancel the hideous builtin rules that cause an infinite loop
# (a pattern rule without a recipe cancels make's built-in rule for producing
# a target from a same-named .o, .s, .c or .S file)
@MAINTAINER_MODE_TRUE@%: %.o
@MAINTAINER_MODE_TRUE@%: %.s
@MAINTAINER_MODE_TRUE@%: %.c
@MAINTAINER_MODE_TRUE@%: %.S
# Maintainer-mode pattern rules that regenerate the codelet sources. In each
# rule the pattern stem ($*) is passed as the transform size (-n) and used to
# form the codelet name (-name <prefix>_$*); the generator output is piped
# through $(ADD_DATE) and $(INDENT) into the target file.
#   n1_%         $(GEN_NOTW)    with $(FLAGS_N1), includes dft/scalar/n.h
#   t1_%, t2_%   $(GEN_TWIDDLE) with $(FLAGS_T1)/$(FLAGS_T2), dft/scalar/t.h
#   f1_%, f2_%   $(GEN_TWIDDLE) with -dif and $(FLAGS_F1)/$(FLAGS_F2), f.h
#   q1_%, q2_%   $(GEN_TWIDSQ)  with -dif and $(FLAGS_Q1)/$(FLAGS_Q2), q.h
@MAINTAINER_MODE_TRUE@n1_%.c: $(CODELET_DEPS) $(GEN_NOTW)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_NOTW) $(FLAGS_N1) -n $* -name n1_$* -include "dft/scalar/n.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@t1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T1) -n $* -name t1_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@t2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_T2) -n $* -name t2_$* -include "dft/scalar/t.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@f1_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F1) -dif -n $* -name f1_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@f2_%.c: $(CODELET_DEPS) $(GEN_TWIDDLE)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDDLE) $(FLAGS_F2) -dif -n $* -name f2_$* -include "dft/scalar/f.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@q1_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q1) -dif -n $* -name q1_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
@MAINTAINER_MODE_TRUE@q2_%.c: $(CODELET_DEPS) $(GEN_TWIDSQ)
@MAINTAINER_MODE_TRUE@ ($(PRELUDE_COMMANDS_DFT); $(TWOVERS) $(GEN_TWIDSQ) $(FLAGS_Q2) -dif -n $* -name q2_$* -include "dft/scalar/q.h") | $(ADD_DATE) | $(INDENT) >$@
# Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded.
.NOEXPORT:

View File

@@ -0,0 +1,109 @@
#include "kernel/ifftw.h"
/*
 * Entry points of the scalar DFT codelets, one per generated source file
 * (families n1_*, t1_*, t2_* and q1_*). Each function takes the planner as
 * its only argument. The list is generated; its order is mirrored by the
 * solver table that follows.
 */
extern void X(codelet_n1_2)(planner *);
extern void X(codelet_n1_3)(planner *);
extern void X(codelet_n1_4)(planner *);
extern void X(codelet_n1_5)(planner *);
extern void X(codelet_n1_6)(planner *);
extern void X(codelet_n1_7)(planner *);
extern void X(codelet_n1_8)(planner *);
extern void X(codelet_n1_9)(planner *);
extern void X(codelet_n1_10)(planner *);
extern void X(codelet_n1_11)(planner *);
extern void X(codelet_n1_12)(planner *);
extern void X(codelet_n1_13)(planner *);
extern void X(codelet_n1_14)(planner *);
extern void X(codelet_n1_15)(planner *);
extern void X(codelet_n1_16)(planner *);
extern void X(codelet_n1_32)(planner *);
extern void X(codelet_n1_64)(planner *);
extern void X(codelet_n1_20)(planner *);
extern void X(codelet_n1_25)(planner *);
extern void X(codelet_t1_2)(planner *);
extern void X(codelet_t1_3)(planner *);
extern void X(codelet_t1_4)(planner *);
extern void X(codelet_t1_5)(planner *);
extern void X(codelet_t1_6)(planner *);
extern void X(codelet_t1_7)(planner *);
extern void X(codelet_t1_8)(planner *);
extern void X(codelet_t1_9)(planner *);
extern void X(codelet_t1_10)(planner *);
extern void X(codelet_t1_12)(planner *);
extern void X(codelet_t1_15)(planner *);
extern void X(codelet_t1_16)(planner *);
extern void X(codelet_t1_32)(planner *);
extern void X(codelet_t1_64)(planner *);
extern void X(codelet_t1_20)(planner *);
extern void X(codelet_t1_25)(planner *);
extern void X(codelet_t2_4)(planner *);
extern void X(codelet_t2_8)(planner *);
extern void X(codelet_t2_16)(planner *);
extern void X(codelet_t2_32)(planner *);
extern void X(codelet_t2_64)(planner *);
extern void X(codelet_t2_5)(planner *);
extern void X(codelet_t2_10)(planner *);
extern void X(codelet_t2_20)(planner *);
extern void X(codelet_t2_25)(planner *);
extern void X(codelet_q1_2)(planner *);
extern void X(codelet_q1_4)(planner *);
extern void X(codelet_q1_8)(planner *);
extern void X(codelet_q1_3)(planner *);
extern void X(codelet_q1_5)(planner *);
extern void X(codelet_q1_6)(planner *);
/*
 * Table of the standard scalar DFT codelets: one SOLVTAB() entry per codelet
 * entry point, in the same order as their declarations, terminated by
 * SOLVTAB_END. The file is generated from the codelet lists in the
 * accompanying Makefile, so add or remove codelets there rather than here.
 */
extern const solvtab X(solvtab_dft_standard);
const solvtab X(solvtab_dft_standard) = {
SOLVTAB(X(codelet_n1_2)),
SOLVTAB(X(codelet_n1_3)),
SOLVTAB(X(codelet_n1_4)),
SOLVTAB(X(codelet_n1_5)),
SOLVTAB(X(codelet_n1_6)),
SOLVTAB(X(codelet_n1_7)),
SOLVTAB(X(codelet_n1_8)),
SOLVTAB(X(codelet_n1_9)),
SOLVTAB(X(codelet_n1_10)),
SOLVTAB(X(codelet_n1_11)),
SOLVTAB(X(codelet_n1_12)),
SOLVTAB(X(codelet_n1_13)),
SOLVTAB(X(codelet_n1_14)),
SOLVTAB(X(codelet_n1_15)),
SOLVTAB(X(codelet_n1_16)),
SOLVTAB(X(codelet_n1_32)),
SOLVTAB(X(codelet_n1_64)),
SOLVTAB(X(codelet_n1_20)),
SOLVTAB(X(codelet_n1_25)),
SOLVTAB(X(codelet_t1_2)),
SOLVTAB(X(codelet_t1_3)),
SOLVTAB(X(codelet_t1_4)),
SOLVTAB(X(codelet_t1_5)),
SOLVTAB(X(codelet_t1_6)),
SOLVTAB(X(codelet_t1_7)),
SOLVTAB(X(codelet_t1_8)),
SOLVTAB(X(codelet_t1_9)),
SOLVTAB(X(codelet_t1_10)),
SOLVTAB(X(codelet_t1_12)),
SOLVTAB(X(codelet_t1_15)),
SOLVTAB(X(codelet_t1_16)),
SOLVTAB(X(codelet_t1_32)),
SOLVTAB(X(codelet_t1_64)),
SOLVTAB(X(codelet_t1_20)),
SOLVTAB(X(codelet_t1_25)),
SOLVTAB(X(codelet_t2_4)),
SOLVTAB(X(codelet_t2_8)),
SOLVTAB(X(codelet_t2_16)),
SOLVTAB(X(codelet_t2_32)),
SOLVTAB(X(codelet_t2_64)),
SOLVTAB(X(codelet_t2_5)),
SOLVTAB(X(codelet_t2_10)),
SOLVTAB(X(codelet_t2_20)),
SOLVTAB(X(codelet_t2_25)),
SOLVTAB(X(codelet_q1_2)),
SOLVTAB(X(codelet_q1_4)),
SOLVTAB(X(codelet_q1_8)),
SOLVTAB(X(codelet_q1_3)),
SOLVTAB(X(codelet_q1_5)),
SOLVTAB(X(codelet_q1_6)),
SOLVTAB_END
};

View File

@@ -0,0 +1,362 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
/*
* This function contains 84 FP additions, 36 FP multiplications,
* (or, 48 additions, 0 multiplications, 36 fused multiply/add),
* 41 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_10: hard-coded size-10 complex DFT (no-twiddle base case); this is the
 * variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * ri, ii  input arrays (real and imaginary parts, going by the names);
 *         element k is read at index WS(is, k)
 * ro, io  output arrays; element k is written at index WS(os, k)
 * v       number of transforms to perform; after each one the input
 *         pointers advance by ivs and the output pointers by ovs
 */
static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numerically: 0.951... ~ sin(2*pi/5), 0.559... ~ sqrt(5)/4,
 * 0.618... ~ (sqrt(5)-1)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT i;
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
E T3, Tj, TN, T1b, TU, TV, T1j, T1i, Tm, Tp, Tq, Ta, Th, Ti, TA;
E TH, T17, T14, T1c, T1d, T1e, TO, TP, TQ;
/* Inputs 0 and 5: differences (T3, TN) and sums (Tj, T1b). */
{
E T1, T2, TL, TM;
T1 = ri[0];
T2 = ri[WS(is, 5)];
T3 = T1 - T2;
Tj = T1 + T2;
TL = ii[0];
TM = ii[WS(is, 5)];
TN = TL - TM;
T1b = TL + TM;
}
/* Real parts of the pairs (2,7), (6,1), (8,3), (4,9): pairwise sums and
 * differences, then combined across pairs. */
{
E T6, Tk, Tg, To, T9, Tl, Td, Tn;
{
E T4, T5, Te, Tf;
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 7)];
T6 = T4 - T5;
Tk = T4 + T5;
Te = ri[WS(is, 6)];
Tf = ri[WS(is, 1)];
Tg = Te - Tf;
To = Te + Tf;
}
{
E T7, T8, Tb, Tc;
T7 = ri[WS(is, 8)];
T8 = ri[WS(is, 3)];
T9 = T7 - T8;
Tl = T7 + T8;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 9)];
Td = Tb - Tc;
Tn = Tb + Tc;
}
TU = T6 - T9;
TV = Td - Tg;
T1j = Tk - Tl;
T1i = Tn - To;
Tm = Tk + Tl;
Tp = Tn + To;
Tq = Tm + Tp;
Ta = T6 + T9;
Th = Td + Tg;
Ti = Ta + Th;
}
/* Same pairing for the imaginary parts. */
{
E Tw, T15, TG, T13, Tz, T16, TD, T12;
{
E Tu, Tv, TE, TF;
Tu = ii[WS(is, 2)];
Tv = ii[WS(is, 7)];
Tw = Tu - Tv;
T15 = Tu + Tv;
TE = ii[WS(is, 6)];
TF = ii[WS(is, 1)];
TG = TE - TF;
T13 = TE + TF;
}
{
E Tx, Ty, TB, TC;
Tx = ii[WS(is, 8)];
Ty = ii[WS(is, 3)];
Tz = Tx - Ty;
T16 = Tx + Ty;
TB = ii[WS(is, 4)];
TC = ii[WS(is, 9)];
TD = TB - TC;
T12 = TB + TC;
}
TA = Tw - Tz;
TH = TD - TG;
T17 = T15 - T16;
T14 = T12 - T13;
T1c = T15 + T16;
T1d = T12 + T13;
T1e = T1c + T1d;
TO = Tw + Tz;
TP = TD + TG;
TQ = TO + TP;
}
/* Outputs 0 and 5 need no multiplications: output 0 is the sum of all
 * pair sums, output 5 the sum of all pair differences. */
ro[WS(os, 5)] = T3 + Ti;
io[WS(os, 5)] = TN + TQ;
ro[0] = Tj + Tq;
io[0] = T1b + T1e;
/* Real outputs 1, 3, 7, 9. */
{
E TI, TK, Tt, TJ, Tr, Ts;
TI = FMA(KP618033988, TH, TA);
TK = FNMS(KP618033988, TA, TH);
Tr = FNMS(KP250000000, Ti, T3);
Ts = Ta - Th;
Tt = FMA(KP559016994, Ts, Tr);
TJ = FNMS(KP559016994, Ts, Tr);
ro[WS(os, 9)] = FNMS(KP951056516, TI, Tt);
ro[WS(os, 3)] = FMA(KP951056516, TK, TJ);
ro[WS(os, 1)] = FMA(KP951056516, TI, Tt);
ro[WS(os, 7)] = FNMS(KP951056516, TK, TJ);
}
/* Imaginary outputs 1, 3, 7, 9. */
{
E TW, TY, TT, TX, TR, TS;
TW = FMA(KP618033988, TV, TU);
TY = FNMS(KP618033988, TU, TV);
TR = FNMS(KP250000000, TQ, TN);
TS = TO - TP;
TT = FMA(KP559016994, TS, TR);
TX = FNMS(KP559016994, TS, TR);
io[WS(os, 1)] = FNMS(KP951056516, TW, TT);
io[WS(os, 7)] = FMA(KP951056516, TY, TX);
io[WS(os, 9)] = FMA(KP951056516, TW, TT);
io[WS(os, 3)] = FNMS(KP951056516, TY, TX);
}
/* Real outputs 2, 4, 6, 8. */
{
E T18, T1a, T11, T19, TZ, T10;
T18 = FNMS(KP618033988, T17, T14);
T1a = FMA(KP618033988, T14, T17);
TZ = FNMS(KP250000000, Tq, Tj);
T10 = Tm - Tp;
T11 = FNMS(KP559016994, T10, TZ);
T19 = FMA(KP559016994, T10, TZ);
ro[WS(os, 2)] = FNMS(KP951056516, T18, T11);
ro[WS(os, 6)] = FMA(KP951056516, T1a, T19);
ro[WS(os, 8)] = FMA(KP951056516, T18, T11);
ro[WS(os, 4)] = FNMS(KP951056516, T1a, T19);
}
/* Imaginary outputs 2, 4, 6, 8. */
{
E T1k, T1m, T1h, T1l, T1f, T1g;
T1k = FNMS(KP618033988, T1j, T1i);
T1m = FMA(KP618033988, T1i, T1j);
T1f = FNMS(KP250000000, T1e, T1b);
T1g = T1c - T1d;
T1h = FNMS(KP559016994, T1g, T1f);
T1l = FMA(KP559016994, T1g, T1f);
io[WS(os, 2)] = FMA(KP951056516, T1k, T1h);
io[WS(os, 6)] = FNMS(KP951056516, T1m, T1l);
io[WS(os, 8)] = FNMS(KP951056516, T1k, T1h);
io[WS(os, 4)] = FMA(KP951056516, T1m, T1l);
}
}
}
}
/*
 * Descriptor for the FMA variant of the size-10 codelet. The counts
 * { 48, 0, 36, 0 } mirror the generator summary for this variant
 * (48 additions, 0 multiplications, 36 fused multiply/adds).
 */
static const kdft_desc desc = {
     10,
     "n1_10",
     { 48, 0, 36, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers n1_10 together with its descriptor on planner p. */
void X(codelet_n1_10)(planner *p)
{
     X(kdft_register)(p, n1_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 10 -name n1_10 -include dft/scalar/n.h */
/*
* This function contains 84 FP additions, 24 FP multiplications,
* (or, 72 additions, 12 multiplications, 12 fused multiply/add),
* 41 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_10: hard-coded size-10 complex DFT (no-twiddle base case); this is the
 * variant compiled when neither ARCH_PREFERS_FMA nor
 * ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * ri, ii  input arrays (real and imaginary parts, going by the names);
 *         element k is read at index WS(is, k)
 * ro, io  output arrays; element k is written at index WS(os, k)
 * v       number of transforms to perform; after each one the input
 *         pointers advance by ivs and the output pointers by ovs
 */
static void n1_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numerically: 0.559... ~ sqrt(5)/4, 0.587... ~ sin(pi/5),
 * 0.951... ~ sin(2*pi/5). */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT i;
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(40, is), MAKE_VOLATILE_STRIDE(40, os)) {
E T3, Tj, TQ, T1e, TU, TV, T1c, T1b, Tm, Tp, Tq, Ta, Th, Ti, TA;
E TH, T17, T14, T1f, T1g, T1h, TL, TM, TR;
/* Inputs 0 and 5: differences (T3, TQ) and sums (Tj, T1e). */
{
E T1, T2, TO, TP;
T1 = ri[0];
T2 = ri[WS(is, 5)];
T3 = T1 - T2;
Tj = T1 + T2;
TO = ii[0];
TP = ii[WS(is, 5)];
TQ = TO - TP;
T1e = TO + TP;
}
/* Real parts of the pairs (2,7), (6,1), (8,3), (4,9): pairwise sums and
 * differences, then combined across pairs. */
{
E T6, Tk, Tg, To, T9, Tl, Td, Tn;
{
E T4, T5, Te, Tf;
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 7)];
T6 = T4 - T5;
Tk = T4 + T5;
Te = ri[WS(is, 6)];
Tf = ri[WS(is, 1)];
Tg = Te - Tf;
To = Te + Tf;
}
{
E T7, T8, Tb, Tc;
T7 = ri[WS(is, 8)];
T8 = ri[WS(is, 3)];
T9 = T7 - T8;
Tl = T7 + T8;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 9)];
Td = Tb - Tc;
Tn = Tb + Tc;
}
TU = T6 - T9;
TV = Td - Tg;
T1c = Tk - Tl;
T1b = Tn - To;
Tm = Tk + Tl;
Tp = Tn + To;
Tq = Tm + Tp;
Ta = T6 + T9;
Th = Td + Tg;
Ti = Ta + Th;
}
/* Same pairing for the imaginary parts. */
{
E Tw, T15, TG, T13, Tz, T16, TD, T12;
{
E Tu, Tv, TE, TF;
Tu = ii[WS(is, 2)];
Tv = ii[WS(is, 7)];
Tw = Tu - Tv;
T15 = Tu + Tv;
TE = ii[WS(is, 6)];
TF = ii[WS(is, 1)];
TG = TE - TF;
T13 = TE + TF;
}
{
E Tx, Ty, TB, TC;
Tx = ii[WS(is, 8)];
Ty = ii[WS(is, 3)];
Tz = Tx - Ty;
T16 = Tx + Ty;
TB = ii[WS(is, 4)];
TC = ii[WS(is, 9)];
TD = TB - TC;
T12 = TB + TC;
}
TA = Tw - Tz;
TH = TD - TG;
T17 = T15 - T16;
T14 = T12 - T13;
T1f = T15 + T16;
T1g = T12 + T13;
T1h = T1f + T1g;
TL = Tw + Tz;
TM = TD + TG;
TR = TL + TM;
}
/* Outputs 0 and 5 need no multiplications: output 0 is the sum of all
 * pair sums, output 5 the sum of all pair differences. */
ro[WS(os, 5)] = T3 + Ti;
io[WS(os, 5)] = TQ + TR;
ro[0] = Tj + Tq;
io[0] = T1e + T1h;
/* Real outputs 1, 3, 7, 9. */
{
E TI, TK, Tt, TJ, Tr, Ts;
TI = FMA(KP951056516, TA, KP587785252 * TH);
TK = FNMS(KP587785252, TA, KP951056516 * TH);
Tr = KP559016994 * (Ta - Th);
Ts = FNMS(KP250000000, Ti, T3);
Tt = Tr + Ts;
TJ = Ts - Tr;
ro[WS(os, 9)] = Tt - TI;
ro[WS(os, 3)] = TJ + TK;
ro[WS(os, 1)] = Tt + TI;
ro[WS(os, 7)] = TJ - TK;
}
/* Imaginary outputs 1, 3, 7, 9. */
{
E TW, TY, TT, TX, TN, TS;
TW = FMA(KP951056516, TU, KP587785252 * TV);
TY = FNMS(KP587785252, TU, KP951056516 * TV);
TN = KP559016994 * (TL - TM);
TS = FNMS(KP250000000, TR, TQ);
TT = TN + TS;
TX = TS - TN;
io[WS(os, 1)] = TT - TW;
io[WS(os, 7)] = TY + TX;
io[WS(os, 9)] = TW + TT;
io[WS(os, 3)] = TX - TY;
}
/* Real outputs 2, 4, 6, 8. */
{
E T18, T1a, T11, T19, TZ, T10;
T18 = FNMS(KP587785252, T17, KP951056516 * T14);
T1a = FMA(KP951056516, T17, KP587785252 * T14);
TZ = FNMS(KP250000000, Tq, Tj);
T10 = KP559016994 * (Tm - Tp);
T11 = TZ - T10;
T19 = T10 + TZ;
ro[WS(os, 2)] = T11 - T18;
ro[WS(os, 6)] = T19 + T1a;
ro[WS(os, 8)] = T11 + T18;
ro[WS(os, 4)] = T19 - T1a;
}
/* Imaginary outputs 2, 4, 6, 8. */
{
E T1d, T1l, T1k, T1m, T1i, T1j;
T1d = FNMS(KP587785252, T1c, KP951056516 * T1b);
T1l = FMA(KP951056516, T1c, KP587785252 * T1b);
T1i = FNMS(KP250000000, T1h, T1e);
T1j = KP559016994 * (T1f - T1g);
T1k = T1i - T1j;
T1m = T1j + T1i;
io[WS(os, 2)] = T1d + T1k;
io[WS(os, 6)] = T1m - T1l;
io[WS(os, 8)] = T1k - T1d;
io[WS(os, 4)] = T1l + T1m;
}
}
}
}
/*
 * Descriptor for the non-FMA variant of the size-10 codelet. The counts
 * { 72, 12, 12, 0 } mirror the generator summary for this variant
 * (72 additions, 12 multiplications, 12 fused multiply/adds).
 */
static const kdft_desc desc = {
     10,
     "n1_10",
     { 72, 12, 12, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers n1_10 together with its descriptor on planner p. */
void X(codelet_n1_10)(planner *p)
{
     X(kdft_register)(p, n1_10, &desc);
}
#endif

View File

@@ -0,0 +1,426 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include dft/scalar/n.h */
/*
* This function contains 140 FP additions, 110 FP multiplications,
* (or, 30 additions, 0 multiplications, 110 fused multiply/add),
* 62 stack variables, 10 constants, and 44 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_11: hard-coded size-11 complex DFT (no-twiddle base case); this is the
 * variant compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * ri, ii  input arrays (real and imaginary parts, going by the names);
 *         element k is read at index WS(is, k)
 * ro, io  output arrays; element k is written at index WS(os, k)
 * v       number of transforms to perform; after each one the input
 *         pointers advance by ivs and the output pointers by ovs
 */
static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* NOTE(review): 0.989... ~ sin(5*pi/11) and 0.959... ~ cos(pi/11)
 * numerically; the remaining constants look like generator-derived factors
 * for the nested multiply-add chains below -- derivation not visible here. */
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
DK(KP918985947, +0.918985947228994779780736114132655398124909697);
DK(KP830830026, +0.830830026003772851058548298459246407048009821);
DK(KP876768831, +0.876768831002589333891339807079336796764054852);
DK(KP778434453, +0.778434453334651800608337670740821884709317477);
DK(KP715370323, +0.715370323453429719112414662767260662417897278);
DK(KP521108558, +0.521108558113202722944698153526659300680427422);
DK(KP634356270, +0.634356270682424498893150776899916060542806975);
DK(KP342584725, +0.342584725681637509502641509861112333758894680);
{
INT i;
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
E T1, T1f, T4, T1u, Tg, T1q, T7, T1t, Ta, T1s, Td, T1r, Ti, TP, T26;
E TG, T1X, T1O, T1w, TY, T1F, T17, To, T1i, TA, T1k, Tr, T1h, Tu, T1j;
E Tx, T1g, TC, TU, T21, TL, T1S, T1J, T1m, T13, T1A, T1c;
T1 = ri[0];
T1f = ii[0];
{
E T5, T6, Tp, Tq;
/* Real parts of the mirrored pairs (k, 11-k): sums in T4, Tg, T7, Ta, Td
 * and differences in T1u, T1q, T1t, T1s, T1r. */
{
E T2, T3, Te, Tf;
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 10)];
T4 = T2 + T3;
T1u = T3 - T2;
Te = ri[WS(is, 5)];
Tf = ri[WS(is, 6)];
Tg = Te + Tf;
T1q = Tf - Te;
}
T5 = ri[WS(is, 2)];
T6 = ri[WS(is, 9)];
T7 = T5 + T6;
T1t = T6 - T5;
{
E T8, T9, Tb, Tc;
T8 = ri[WS(is, 3)];
T9 = ri[WS(is, 8)];
Ta = T8 + T9;
T1s = T9 - T8;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 7)];
Td = Tb + Tc;
T1r = Tc - Tb;
}
/* First two levels of the multiply-add chains built from the real-part
 * sums and differences; they are finished in the output blocks below. */
{
E Th, TO, T25, TF, T1W;
Th = FNMS(KP342584725, Ta, T7);
Ti = FNMS(KP634356270, Th, Td);
TO = FNMS(KP342584725, T4, Ta);
TP = FNMS(KP634356270, TO, Tg);
T25 = FMA(KP521108558, T1q, T1u);
T26 = FMA(KP715370323, T25, T1r);
TF = FNMS(KP342584725, Td, T4);
TG = FNMS(KP634356270, TF, T7);
T1W = FMA(KP521108558, T1s, T1q);
T1X = FNMS(KP715370323, T1W, T1t);
}
{
E T1N, T1v, TX, T1E, T16;
T1N = FNMS(KP521108558, T1t, T1r);
T1O = FMA(KP715370323, T1N, T1q);
T1v = FNMS(KP521108558, T1u, T1t);
T1w = FNMS(KP715370323, T1v, T1s);
TX = FNMS(KP342584725, T7, Tg);
TY = FNMS(KP634356270, TX, T4);
T1E = FMA(KP521108558, T1r, T1s);
T1F = FMA(KP715370323, T1E, T1u);
T16 = FNMS(KP342584725, Tg, Td);
T17 = FNMS(KP634356270, T16, Ta);
}
/* Imaginary parts of the same pairs: differences in To, TA, Tr, Tu, Tx and
 * sums in T1i, T1k, T1h, T1j, T1g. */
{
E Tm, Tn, Ty, Tz;
Tm = ii[WS(is, 3)];
Tn = ii[WS(is, 8)];
To = Tm - Tn;
T1i = Tm + Tn;
Ty = ii[WS(is, 5)];
Tz = ii[WS(is, 6)];
TA = Ty - Tz;
T1k = Ty + Tz;
}
Tp = ii[WS(is, 2)];
Tq = ii[WS(is, 9)];
Tr = Tp - Tq;
T1h = Tp + Tq;
{
E Ts, Tt, Tv, Tw;
Ts = ii[WS(is, 4)];
Tt = ii[WS(is, 7)];
Tu = Ts - Tt;
T1j = Ts + Tt;
Tv = ii[WS(is, 1)];
Tw = ii[WS(is, 10)];
Tx = Tv - Tw;
T1g = Tv + Tw;
}
/* First two levels of the chains built from the imaginary-part sums and
 * differences. */
{
E TB, TT, T20, TK, T1R;
TB = FMA(KP521108558, TA, Tx);
TC = FMA(KP715370323, TB, Tu);
TT = FNMS(KP521108558, Tr, Tu);
TU = FMA(KP715370323, TT, TA);
T20 = FNMS(KP342584725, T1i, T1h);
T21 = FNMS(KP634356270, T20, T1j);
TK = FMA(KP521108558, To, TA);
TL = FNMS(KP715370323, TK, Tr);
T1R = FNMS(KP342584725, T1j, T1g);
T1S = FNMS(KP634356270, T1R, T1h);
}
{
E T1I, T1l, T12, T1z, T1b;
T1I = FNMS(KP342584725, T1g, T1i);
T1J = FNMS(KP634356270, T1I, T1k);
T1l = FNMS(KP342584725, T1k, T1j);
T1m = FNMS(KP634356270, T1l, T1i);
T12 = FMA(KP521108558, Tu, To);
T13 = FMA(KP715370323, T12, Tx);
T1z = FNMS(KP342584725, T1h, T1k);
T1A = FNMS(KP634356270, T1z, T1g);
T1b = FNMS(KP521108558, Tx, Tr);
T1c = FNMS(KP715370323, T1b, To);
}
}
/* Output 0 is the plain sum of all eleven inputs. */
ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
io[0] = T1f + T1g + T1h + T1i + T1j + T1k;
/* Each block below finishes one pair of chains and writes the mirrored
 * outputs k and 11-k of one array. Real outputs 1 and 10: */
{
E Tk, TE, Tj, TD, Tl;
Tj = FNMS(KP778434453, Ti, T4);
Tk = FNMS(KP876768831, Tj, Tg);
TD = FMA(KP830830026, TC, Tr);
TE = FMA(KP918985947, TD, To);
Tl = FNMS(KP959492973, Tk, T1);
ro[WS(os, 10)] = FNMS(KP989821441, TE, Tl);
ro[WS(os, 1)] = FMA(KP989821441, TE, Tl);
}
/* Imaginary outputs 1 and 10. */
{
E T23, T28, T22, T27, T24;
T22 = FNMS(KP778434453, T21, T1g);
T23 = FNMS(KP876768831, T22, T1k);
T27 = FMA(KP830830026, T26, T1t);
T28 = FMA(KP918985947, T27, T1s);
T24 = FNMS(KP959492973, T23, T1f);
io[WS(os, 1)] = FMA(KP989821441, T28, T24);
io[WS(os, 10)] = FNMS(KP989821441, T28, T24);
}
/* Imaginary outputs 2 and 9. */
{
E T1U, T1Z, T1T, T1Y, T1V;
T1T = FNMS(KP778434453, T1S, T1k);
T1U = FNMS(KP876768831, T1T, T1i);
T1Y = FMA(KP830830026, T1X, T1u);
T1Z = FNMS(KP918985947, T1Y, T1r);
T1V = FNMS(KP959492973, T1U, T1f);
io[WS(os, 2)] = FNMS(KP989821441, T1Z, T1V);
io[WS(os, 9)] = FMA(KP989821441, T1Z, T1V);
}
/* Real outputs 2 and 9. */
{
E TI, TN, TH, TM, TJ;
TH = FNMS(KP778434453, TG, Tg);
TI = FNMS(KP876768831, TH, Ta);
TM = FMA(KP830830026, TL, Tx);
TN = FNMS(KP918985947, TM, Tu);
TJ = FNMS(KP959492973, TI, T1);
ro[WS(os, 2)] = FNMS(KP989821441, TN, TJ);
ro[WS(os, 9)] = FMA(KP989821441, TN, TJ);
}
/* Real outputs 3 and 8. */
{
E TR, TW, TQ, TV, TS;
TQ = FNMS(KP778434453, TP, Td);
TR = FNMS(KP876768831, TQ, T7);
TV = FNMS(KP830830026, TU, To);
TW = FNMS(KP918985947, TV, Tx);
TS = FNMS(KP959492973, TR, T1);
ro[WS(os, 8)] = FNMS(KP989821441, TW, TS);
ro[WS(os, 3)] = FMA(KP989821441, TW, TS);
}
/* Imaginary outputs 3 and 8. */
{
E T1L, T1Q, T1K, T1P, T1M;
T1K = FNMS(KP778434453, T1J, T1j);
T1L = FNMS(KP876768831, T1K, T1h);
T1P = FNMS(KP830830026, T1O, T1s);
T1Q = FNMS(KP918985947, T1P, T1u);
T1M = FNMS(KP959492973, T1L, T1f);
io[WS(os, 3)] = FMA(KP989821441, T1Q, T1M);
io[WS(os, 8)] = FNMS(KP989821441, T1Q, T1M);
}
/* Real outputs 4 and 7. */
{
E T10, T15, TZ, T14, T11;
TZ = FNMS(KP778434453, TY, Ta);
T10 = FNMS(KP876768831, TZ, Td);
T14 = FNMS(KP830830026, T13, TA);
T15 = FMA(KP918985947, T14, Tr);
T11 = FNMS(KP959492973, T10, T1);
ro[WS(os, 4)] = FNMS(KP989821441, T15, T11);
ro[WS(os, 7)] = FMA(KP989821441, T15, T11);
}
/* Imaginary outputs 4 and 7. */
{
E T1C, T1H, T1B, T1G, T1D;
T1B = FNMS(KP778434453, T1A, T1i);
T1C = FNMS(KP876768831, T1B, T1j);
T1G = FNMS(KP830830026, T1F, T1q);
T1H = FMA(KP918985947, T1G, T1t);
T1D = FNMS(KP959492973, T1C, T1f);
io[WS(os, 4)] = FNMS(KP989821441, T1H, T1D);
io[WS(os, 7)] = FMA(KP989821441, T1H, T1D);
}
/* Imaginary outputs 5 and 6. */
{
E T1o, T1y, T1n, T1x, T1p;
T1n = FNMS(KP778434453, T1m, T1h);
T1o = FNMS(KP876768831, T1n, T1g);
T1x = FNMS(KP830830026, T1w, T1r);
T1y = FNMS(KP918985947, T1x, T1q);
T1p = FNMS(KP959492973, T1o, T1f);
io[WS(os, 5)] = FMA(KP989821441, T1y, T1p);
io[WS(os, 6)] = FNMS(KP989821441, T1y, T1p);
}
/* Real outputs 5 and 6. */
{
E T19, T1e, T18, T1d, T1a;
T18 = FNMS(KP778434453, T17, T7);
T19 = FNMS(KP876768831, T18, T4);
T1d = FNMS(KP830830026, T1c, Tu);
T1e = FNMS(KP918985947, T1d, TA);
T1a = FNMS(KP959492973, T19, T1);
ro[WS(os, 6)] = FNMS(KP989821441, T1e, T1a);
ro[WS(os, 5)] = FMA(KP989821441, T1e, T1a);
}
}
}
}
/*
 * Descriptor for the FMA variant of the size-11 codelet. The counts
 * { 30, 0, 110, 0 } mirror the generator summary for this variant
 * (30 additions, 0 multiplications, 110 fused multiply/adds).
 */
static const kdft_desc desc = {
     11,
     "n1_11",
     { 30, 0, 110, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers n1_11 together with its descriptor on planner p. */
void X(codelet_n1_11)(planner *p)
{
     X(kdft_register)(p, n1_11, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 11 -name n1_11 -include dft/scalar/n.h */
/*
* This function contains 140 FP additions, 100 FP multiplications,
* (or, 60 additions, 20 multiplications, 80 fused multiply/add),
* 41 stack variables, 10 constants, and 44 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_11: hard-coded size-11 complex DFT kernel (variant compiled when the
 * FMA-preferring macros are not defined).
 *
 * ri, ii   real and imaginary input arrays; element k of one transform is
 *          read at ri[WS(is, k)] / ii[WS(is, k)]
 * ro, io   real and imaginary output arrays, written at WS(os, k)
 * is, os   input and output strides handed to WS()
 * v        number of transforms to perform
 * ivs, ovs added to the input / output pointers after each transform
 *
 * The ten constants are the magnitudes of cos(2*pi*j/11) and sin(2*pi*j/11)
 * for j = 1..5:
 *   cos: KP841253532 (j=1), KP415415013 (j=2), KP142314838 (j=3),
 *        KP654860733 (j=4), KP959492973 (j=5)
 *   sin: KP540640817 (j=1), KP909631995 (j=2), KP989821441 (j=3),
 *        KP755749574 (j=4), KP281732556 (j=5)
 * E, DK, WS, FMA, FNMS, FNMA and MAKE_VOLATILE_STRIDE come from the included
 * headers and are not defined in this file.
 */
static void n1_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP654860733, +0.654860733945285064056925072466293553183791199);
DK(KP142314838, +0.142314838273285140443792668616369668791051361);
DK(KP959492973, +0.959492973614497389890368057066327699062454848);
DK(KP415415013, +0.415415013001886425529274149229623203524004910);
DK(KP841253532, +0.841253532831181168861811648919367717513292498);
DK(KP989821441, +0.989821441880932732376092037776718787376519372);
DK(KP909631995, +0.909631995354518371411715383079028460060241051);
DK(KP281732556, +0.281732556841429697711417915346616899035777899);
DK(KP540640817, +0.540640817455597582107635954318691695431770608);
DK(KP755749574, +0.755749574354258283774035843972344420179717445);
{
INT i;
/* One iteration per transform; all four data pointers are advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(44, is), MAKE_VOLATILE_STRIDE(44, os)) {
E T1, TM, T4, TG, Tk, TR, Tw, TN, T7, TK, Ta, TH, Tn, TQ, Td;
E TJ, Tq, TO, Tt, TP, Tg, TI;
/* Load element 0 and form the sum and difference of each input pair (k, 11 - k), k = 1..5. */
{
E T2, T3, Ti, Tj;
T1 = ri[0];
TM = ii[0];
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 10)];
T4 = T2 + T3;
TG = T3 - T2;
Ti = ii[WS(is, 1)];
Tj = ii[WS(is, 10)];
Tk = Ti - Tj;
TR = Ti + Tj;
{
E Tu, Tv, T5, T6;
Tu = ii[WS(is, 2)];
Tv = ii[WS(is, 9)];
Tw = Tu - Tv;
TN = Tu + Tv;
T5 = ri[WS(is, 2)];
T6 = ri[WS(is, 9)];
T7 = T5 + T6;
TK = T6 - T5;
}
}
{
E T8, T9, To, Tp;
T8 = ri[WS(is, 3)];
T9 = ri[WS(is, 8)];
Ta = T8 + T9;
TH = T9 - T8;
{
E Tl, Tm, Tb, Tc;
Tl = ii[WS(is, 3)];
Tm = ii[WS(is, 8)];
Tn = Tl - Tm;
TQ = Tl + Tm;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 7)];
Td = Tb + Tc;
TJ = Tc - Tb;
}
To = ii[WS(is, 4)];
Tp = ii[WS(is, 7)];
Tq = To - Tp;
TO = To + Tp;
{
E Tr, Ts, Te, Tf;
Tr = ii[WS(is, 5)];
Ts = ii[WS(is, 6)];
Tt = Tr - Ts;
TP = Tr + Ts;
Te = ri[WS(is, 5)];
Tf = ri[WS(is, 6)];
Tg = Te + Tf;
TI = Tf - Te;
}
}
/* Output 0 is the plain sum; every other output pair (k, 11 - k) is written as
 * (cosine-weighted pair sums) +/- (sine-weighted pair differences). */
{
E Tx, Th, TZ, T10;
ro[0] = T1 + T4 + T7 + Ta + Td + Tg;
io[0] = TM + TR + TN + TQ + TO + TP;
/* outputs 4 and 7 */
Tx = FMA(KP755749574, Tk, KP540640817 * Tn) + FNMS(KP909631995, Tt, KP281732556 * Tq) - (KP989821441 * Tw);
Th = FMA(KP841253532, Ta, T1) + FNMS(KP959492973, Td, KP415415013 * Tg) + FNMA(KP142314838, T7, KP654860733 * T4);
ro[WS(os, 7)] = Th - Tx;
ro[WS(os, 4)] = Th + Tx;
TZ = FMA(KP755749574, TG, KP540640817 * TH) + FNMS(KP909631995, TI, KP281732556 * TJ) - (KP989821441 * TK);
T10 = FMA(KP841253532, TQ, TM) + FNMS(KP959492973, TO, KP415415013 * TP) + FNMA(KP142314838, TN, KP654860733 * TR);
io[WS(os, 4)] = TZ + T10;
io[WS(os, 7)] = T10 - TZ;
/* outputs 2 and 9 */
{
E TX, TY, Tz, Ty;
TX = FMA(KP909631995, TG, KP755749574 * TK) + FNMA(KP540640817, TI, KP989821441 * TJ) - (KP281732556 * TH);
TY = FMA(KP415415013, TR, TM) + FNMS(KP142314838, TO, KP841253532 * TP) + FNMA(KP959492973, TQ, KP654860733 * TN);
io[WS(os, 2)] = TX + TY;
io[WS(os, 9)] = TY - TX;
Tz = FMA(KP909631995, Tk, KP755749574 * Tw) + FNMA(KP540640817, Tt, KP989821441 * Tq) - (KP281732556 * Tn);
Ty = FMA(KP415415013, T4, T1) + FNMS(KP142314838, Td, KP841253532 * Tg) + FNMA(KP959492973, Ta, KP654860733 * T7);
ro[WS(os, 9)] = Ty - Tz;
ro[WS(os, 2)] = Ty + Tz;
}
}
{
E TB, TA, TT, TU;
/* outputs 1 and 10 */
TB = FMA(KP540640817, Tk, KP909631995 * Tw) + FMA(KP989821441, Tn, KP755749574 * Tq) + (KP281732556 * Tt);
TA = FMA(KP841253532, T4, T1) + FNMS(KP959492973, Tg, KP415415013 * T7) + FNMA(KP654860733, Td, KP142314838 * Ta);
ro[WS(os, 10)] = TA - TB;
ro[WS(os, 1)] = TA + TB;
{
E TV, TW, TD, TC;
TV = FMA(KP540640817, TG, KP909631995 * TK) + FMA(KP989821441, TH, KP755749574 * TJ) + (KP281732556 * TI);
TW = FMA(KP841253532, TR, TM) + FNMS(KP959492973, TP, KP415415013 * TN) + FNMA(KP654860733, TO, KP142314838 * TQ);
io[WS(os, 1)] = TV + TW;
io[WS(os, 10)] = TW - TV;
/* outputs 3 and 8 */
TD = FMA(KP989821441, Tk, KP540640817 * Tq) + FNMS(KP909631995, Tn, KP755749574 * Tt) - (KP281732556 * Tw);
TC = FMA(KP415415013, Ta, T1) + FNMS(KP654860733, Tg, KP841253532 * Td) + FNMA(KP959492973, T7, KP142314838 * T4);
ro[WS(os, 8)] = TC - TD;
ro[WS(os, 3)] = TC + TD;
}
TT = FMA(KP989821441, TG, KP540640817 * TJ) + FNMS(KP909631995, TH, KP755749574 * TI) - (KP281732556 * TK);
TU = FMA(KP415415013, TQ, TM) + FNMS(KP654860733, TP, KP841253532 * TO) + FNMA(KP959492973, TN, KP142314838 * TR);
io[WS(os, 3)] = TT + TU;
io[WS(os, 8)] = TU - TT;
/* outputs 5 and 6 */
{
E TL, TS, TF, TE;
TL = FMA(KP281732556, TG, KP755749574 * TH) + FNMS(KP909631995, TJ, KP989821441 * TI) - (KP540640817 * TK);
TS = FMA(KP841253532, TN, TM) + FNMS(KP142314838, TP, KP415415013 * TO) + FNMA(KP654860733, TQ, KP959492973 * TR);
io[WS(os, 5)] = TL + TS;
io[WS(os, 6)] = TS - TL;
TF = FMA(KP281732556, Tk, KP755749574 * Tn) + FNMS(KP909631995, Tq, KP989821441 * Tt) - (KP540640817 * Tw);
TE = FMA(KP841253532, T7, T1) + FNMS(KP142314838, Tg, KP415415013 * Td) + FNMA(KP654860733, Ta, KP959492973 * T4);
ro[WS(os, 6)] = TE - TF;
ro[WS(os, 5)] = TE + TF;
}
}
}
}
}
/* Descriptor for the non-FMA size-11 kernel.  The first three numbers in the
 * inner braces are the addition, multiplication and fused multiply/add
 * counts quoted in the generator's header comment for this variant. */
static const kdft_desc desc = {
     11, "n1_11", { 60, 20, 80, 0 }, &GENUS, 0, 0, 0, 0
};

/* Registers the n1_11 kernel and its descriptor with planner `p`. */
void X(codelet_n1_11) (planner *p)
{
     X(kdft_register) (p, n1_11, &desc);
}
#endif

View File

@@ -0,0 +1,420 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
/*
* This function contains 96 FP additions, 24 FP multiplications,
* (or, 72 additions, 0 multiplications, 24 fused multiply/add),
* 43 stack variables, 2 constants, and 48 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_12: hard-coded size-12 complex DFT kernel (variant compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * ri, ii   real and imaginary input arrays; element k of one transform is
 *          read at ri[WS(is, k)] / ii[WS(is, k)]
 * ro, io   real and imaginary output arrays, written at WS(os, k)
 * is, os   input and output strides handed to WS()
 * v        number of transforms to perform
 * ivs, ovs added to the input / output pointers after each transform
 *
 * The inputs are consumed in four index triples, (0,4,8), (6,10,2),
 * (3,7,11) and (9,1,5).  KP866025403 is sqrt(3)/2 and KP500000000 is 1/2.
 * E, DK, WS, FMA, FNMS and MAKE_VOLATILE_STRIDE come from the included
 * headers and are not defined in this file.
 */
static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One iteration per transform; all four data pointers are advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1d, TG;
E TJ, T1u, T1c, Tl, T1i, TL, TO, T1v, T1h;
/* For each triple: the full sum, the first element combined with the other
 * two through KP500000000, and the difference of the other two (left
 * unscaled here; KP866025403 is applied later inside FMA/FNMS). */
{
E T1, T2, T3, T4;
T1 = ri[0];
T2 = ri[WS(is, 4)];
T3 = ri[WS(is, 8)];
T4 = T2 + T3;
T5 = T1 + T4;
TR = FNMS(KP500000000, T4, T1);
TA = T3 - T2;
}
{
E To, Tp, Tq, Tr;
To = ii[0];
Tp = ii[WS(is, 4)];
Tq = ii[WS(is, 8)];
Tr = Tp + Tq;
Ts = To + Tr;
TS = Tp - Tq;
Tz = FNMS(KP500000000, Tr, To);
}
{
E T6, T7, T8, T9;
T6 = ri[WS(is, 6)];
T7 = ri[WS(is, 10)];
T8 = ri[WS(is, 2)];
T9 = T7 + T8;
Ta = T6 + T9;
TU = FNMS(KP500000000, T9, T6);
TD = T8 - T7;
}
{
E Tt, Tu, Tv, Tw;
Tt = ii[WS(is, 6)];
Tu = ii[WS(is, 10)];
Tv = ii[WS(is, 2)];
Tw = Tu + Tv;
Tx = Tt + Tw;
TV = Tu - Tv;
TC = FNMS(KP500000000, Tw, Tt);
}
{
E Tc, Td, Te, Tf;
Tc = ri[WS(is, 3)];
Td = ri[WS(is, 7)];
Te = ri[WS(is, 11)];
Tf = Td + Te;
Tg = Tc + Tf;
T1d = Te - Td;
TG = FNMS(KP500000000, Tf, Tc);
}
{
E T1a, TH, TI, T1b;
T1a = ii[WS(is, 3)];
TH = ii[WS(is, 7)];
TI = ii[WS(is, 11)];
T1b = TH + TI;
TJ = TH - TI;
T1u = T1a + T1b;
T1c = FNMS(KP500000000, T1b, T1a);
}
{
E Th, Ti, Tj, Tk;
Th = ri[WS(is, 9)];
Ti = ri[WS(is, 1)];
Tj = ri[WS(is, 5)];
Tk = Ti + Tj;
Tl = Th + Tk;
T1i = Tj - Ti;
TL = FNMS(KP500000000, Tk, Th);
}
{
E T1f, TM, TN, T1g;
T1f = ii[WS(is, 9)];
TM = ii[WS(is, 1)];
TN = ii[WS(is, 5)];
T1g = TM + TN;
TO = TM - TN;
T1v = T1f + T1g;
T1h = FNMS(KP500000000, T1g, T1f);
}
{
E Tb, Tm, T1t, T1w;
/* Outputs 0, 3, 6 and 9 need only the four triple sums. */
Tb = T5 + Ta;
Tm = Tg + Tl;
ro[WS(os, 6)] = Tb - Tm;
ro[0] = Tb + Tm;
{
E T1x, T1y, Tn, Ty;
T1x = Ts + Tx;
T1y = T1u + T1v;
io[WS(os, 6)] = T1x - T1y;
io[0] = T1x + T1y;
Tn = Tg - Tl;
Ty = Ts - Tx;
io[WS(os, 3)] = Tn + Ty;
io[WS(os, 9)] = Ty - Tn;
}
T1t = T5 - Ta;
T1w = T1u - T1v;
ro[WS(os, 3)] = T1t - T1w;
ro[WS(os, 9)] = T1t + T1w;
/* Outputs 1, 4, 7 and 10: the FMA(KP866025403, ...) combinations. */
{
E T11, T1l, T1k, T1m, T14, T18, T17, T19;
{
E TZ, T10, T1e, T1j;
TZ = FMA(KP866025403, TA, Tz);
T10 = FMA(KP866025403, TD, TC);
T11 = TZ - T10;
T1l = TZ + T10;
T1e = FMA(KP866025403, T1d, T1c);
T1j = FMA(KP866025403, T1i, T1h);
T1k = T1e - T1j;
T1m = T1e + T1j;
}
{
E T12, T13, T15, T16;
T12 = FMA(KP866025403, TJ, TG);
T13 = FMA(KP866025403, TO, TL);
T14 = T12 - T13;
T18 = T12 + T13;
T15 = FMA(KP866025403, TS, TR);
T16 = FMA(KP866025403, TV, TU);
T17 = T15 + T16;
T19 = T15 - T16;
}
io[WS(os, 1)] = T11 - T14;
ro[WS(os, 1)] = T19 + T1k;
io[WS(os, 7)] = T11 + T14;
ro[WS(os, 7)] = T19 - T1k;
ro[WS(os, 10)] = T17 - T18;
io[WS(os, 10)] = T1l - T1m;
ro[WS(os, 4)] = T17 + T18;
io[WS(os, 4)] = T1l + T1m;
}
/* Outputs 2, 5, 8 and 11: the matching FNMS(KP866025403, ...) combinations. */
{
E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
{
E TB, TE, T1o, T1p;
TB = FNMS(KP866025403, TA, Tz);
TE = FNMS(KP866025403, TD, TC);
TF = TB - TE;
T1r = TB + TE;
T1o = FNMS(KP866025403, T1d, T1c);
T1p = FNMS(KP866025403, T1i, T1h);
T1q = T1o - T1p;
T1s = T1o + T1p;
}
{
E TK, TP, TT, TW;
TK = FNMS(KP866025403, TJ, TG);
TP = FNMS(KP866025403, TO, TL);
TQ = TK - TP;
TY = TK + TP;
TT = FNMS(KP866025403, TS, TR);
TW = FNMS(KP866025403, TV, TU);
TX = TT + TW;
T1n = TT - TW;
}
io[WS(os, 5)] = TF - TQ;
ro[WS(os, 5)] = T1n + T1q;
io[WS(os, 11)] = TF + TQ;
ro[WS(os, 11)] = T1n - T1q;
ro[WS(os, 2)] = TX - TY;
io[WS(os, 2)] = T1r - T1s;
ro[WS(os, 8)] = TX + TY;
io[WS(os, 8)] = T1r + T1s;
}
}
}
}
}
/* Descriptor for the FMA size-12 kernel.  The first three numbers in the
 * inner braces are the addition, multiplication and fused multiply/add
 * counts quoted in the generator's header comment for this variant. */
static const kdft_desc desc = {
     12, "n1_12", { 72, 0, 24, 0 }, &GENUS, 0, 0, 0, 0
};

/* Registers the n1_12 kernel and its descriptor with planner `p`. */
void X(codelet_n1_12) (planner *p)
{
     X(kdft_register) (p, n1_12, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 12 -name n1_12 -include dft/scalar/n.h */
/*
* This function contains 96 FP additions, 16 FP multiplications,
* (or, 88 additions, 8 multiplications, 8 fused multiply/add),
* 43 stack variables, 2 constants, and 48 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_12: hard-coded size-12 complex DFT kernel (variant compiled when the
 * FMA-preferring macros are not defined).
 *
 * ri, ii   real and imaginary input arrays; element k of one transform is
 *          read at ri[WS(is, k)] / ii[WS(is, k)]
 * ro, io   real and imaginary output arrays, written at WS(os, k)
 * is, os   input and output strides handed to WS()
 * v        number of transforms to perform
 * ivs, ovs added to the input / output pointers after each transform
 *
 * The inputs are consumed in four index triples, (0,4,8), (6,10,2),
 * (3,7,11) and (9,1,5).  KP866025403 is sqrt(3)/2 and KP500000000 is 1/2.
 * Unlike the FMA variant, the pair differences are multiplied by
 * KP866025403 as soon as they are formed, so the later stages are plain
 * additions and subtractions.
 * E, DK, WS, FNMS and MAKE_VOLATILE_STRIDE come from the included headers
 * and are not defined in this file.
 */
static void n1_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One iteration per transform; all four data pointers are advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(48, is), MAKE_VOLATILE_STRIDE(48, os)) {
E T5, TR, TA, Ts, TS, Tz, Ta, TU, TD, Tx, TV, TC, Tg, T1a, TG;
E TJ, T1u, T1d, Tl, T1f, TL, TO, T1v, T1i;
/* For each triple: the full sum, the first element combined with the other
 * two through KP500000000, and KP866025403 times the difference of the
 * other two. */
{
E T1, T2, T3, T4;
T1 = ri[0];
T2 = ri[WS(is, 4)];
T3 = ri[WS(is, 8)];
T4 = T2 + T3;
T5 = T1 + T4;
TR = FNMS(KP500000000, T4, T1);
TA = KP866025403 * (T3 - T2);
}
{
E To, Tp, Tq, Tr;
To = ii[0];
Tp = ii[WS(is, 4)];
Tq = ii[WS(is, 8)];
Tr = Tp + Tq;
Ts = To + Tr;
TS = KP866025403 * (Tp - Tq);
Tz = FNMS(KP500000000, Tr, To);
}
{
E T6, T7, T8, T9;
T6 = ri[WS(is, 6)];
T7 = ri[WS(is, 10)];
T8 = ri[WS(is, 2)];
T9 = T7 + T8;
Ta = T6 + T9;
TU = FNMS(KP500000000, T9, T6);
TD = KP866025403 * (T8 - T7);
}
{
E Tt, Tu, Tv, Tw;
Tt = ii[WS(is, 6)];
Tu = ii[WS(is, 10)];
Tv = ii[WS(is, 2)];
Tw = Tu + Tv;
Tx = Tt + Tw;
TV = KP866025403 * (Tu - Tv);
TC = FNMS(KP500000000, Tw, Tt);
}
{
E Tc, Td, Te, Tf;
Tc = ri[WS(is, 3)];
Td = ri[WS(is, 7)];
Te = ri[WS(is, 11)];
Tf = Td + Te;
Tg = Tc + Tf;
T1a = KP866025403 * (Te - Td);
TG = FNMS(KP500000000, Tf, Tc);
}
{
E T1b, TH, TI, T1c;
T1b = ii[WS(is, 3)];
TH = ii[WS(is, 7)];
TI = ii[WS(is, 11)];
T1c = TH + TI;
TJ = KP866025403 * (TH - TI);
T1u = T1b + T1c;
T1d = FNMS(KP500000000, T1c, T1b);
}
{
E Th, Ti, Tj, Tk;
Th = ri[WS(is, 9)];
Ti = ri[WS(is, 1)];
Tj = ri[WS(is, 5)];
Tk = Ti + Tj;
Tl = Th + Tk;
T1f = KP866025403 * (Tj - Ti);
TL = FNMS(KP500000000, Tk, Th);
}
{
E T1g, TM, TN, T1h;
T1g = ii[WS(is, 9)];
TM = ii[WS(is, 1)];
TN = ii[WS(is, 5)];
T1h = TM + TN;
TO = KP866025403 * (TM - TN);
T1v = T1g + T1h;
T1i = FNMS(KP500000000, T1h, T1g);
}
{
E Tb, Tm, T1t, T1w;
/* Outputs 0, 3, 6 and 9 need only the four triple sums. */
Tb = T5 + Ta;
Tm = Tg + Tl;
ro[WS(os, 6)] = Tb - Tm;
ro[0] = Tb + Tm;
{
E T1x, T1y, Tn, Ty;
T1x = Ts + Tx;
T1y = T1u + T1v;
io[WS(os, 6)] = T1x - T1y;
io[0] = T1x + T1y;
Tn = Tg - Tl;
Ty = Ts - Tx;
io[WS(os, 3)] = Tn + Ty;
io[WS(os, 9)] = Ty - Tn;
}
T1t = T5 - Ta;
T1w = T1u - T1v;
ro[WS(os, 3)] = T1t - T1w;
ro[WS(os, 9)] = T1t + T1w;
/* Outputs 1, 4, 7 and 10: scaled differences added to the half-weighted terms. */
{
E T11, T1l, T1k, T1m, T14, T18, T17, T19;
{
E TZ, T10, T1e, T1j;
TZ = TA + Tz;
T10 = TD + TC;
T11 = TZ - T10;
T1l = TZ + T10;
T1e = T1a + T1d;
T1j = T1f + T1i;
T1k = T1e - T1j;
T1m = T1e + T1j;
}
{
E T12, T13, T15, T16;
T12 = TG + TJ;
T13 = TL + TO;
T14 = T12 - T13;
T18 = T12 + T13;
T15 = TR + TS;
T16 = TU + TV;
T17 = T15 + T16;
T19 = T15 - T16;
}
io[WS(os, 1)] = T11 - T14;
ro[WS(os, 1)] = T19 + T1k;
io[WS(os, 7)] = T11 + T14;
ro[WS(os, 7)] = T19 - T1k;
ro[WS(os, 10)] = T17 - T18;
io[WS(os, 10)] = T1l - T1m;
ro[WS(os, 4)] = T17 + T18;
io[WS(os, 4)] = T1l + T1m;
}
/* Outputs 2, 5, 8 and 11: scaled differences subtracted instead. */
{
E TF, T1r, T1q, T1s, TQ, TY, TX, T1n;
{
E TB, TE, T1o, T1p;
TB = Tz - TA;
TE = TC - TD;
TF = TB - TE;
T1r = TB + TE;
T1o = T1d - T1a;
T1p = T1i - T1f;
T1q = T1o - T1p;
T1s = T1o + T1p;
}
{
E TK, TP, TT, TW;
TK = TG - TJ;
TP = TL - TO;
TQ = TK - TP;
TY = TK + TP;
TT = TR - TS;
TW = TU - TV;
TX = TT + TW;
T1n = TT - TW;
}
io[WS(os, 5)] = TF - TQ;
ro[WS(os, 5)] = T1n + T1q;
io[WS(os, 11)] = TF + TQ;
ro[WS(os, 11)] = T1n - T1q;
ro[WS(os, 2)] = TX - TY;
io[WS(os, 2)] = T1r - T1s;
ro[WS(os, 8)] = TX + TY;
io[WS(os, 8)] = T1r + T1s;
}
}
}
}
}
/* Descriptor for the non-FMA size-12 kernel.  The first three numbers in the
 * inner braces are the addition, multiplication and fused multiply/add
 * counts quoted in the generator's header comment for this variant. */
static const kdft_desc desc = {
     12, "n1_12", { 88, 8, 8, 0 }, &GENUS, 0, 0, 0, 0
};

/* Registers the n1_12 kernel and its descriptor with planner `p`. */
void X(codelet_n1_12) (planner *p)
{
     X(kdft_register) (p, n1_12, &desc);
}
#endif

View File

@@ -0,0 +1,681 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include dft/scalar/n.h */
/*
* This function contains 176 FP additions, 114 FP multiplications,
* (or, 62 additions, 0 multiplications, 114 fused multiply/add),
* 76 stack variables, 25 constants, and 52 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_13: hard-coded size-13 complex DFT kernel (variant compiled when
 * ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined).
 *
 * ri, ii   real and imaginary input arrays; element k of one transform is
 *          read at ri[WS(is, k)] / ii[WS(is, k)]
 * ro, io   real and imaginary output arrays, written at WS(os, k)
 * is, os   input and output strides handed to WS()
 * v        number of transforms to perform
 * ivs, ovs added to the input / output pointers after each transform
 *
 * Element 0 is kept apart; the other twelve inputs are read in the index
 * groups (8,5), (12,10,4), (1,3,9), (11,6) and (7,2).
 * E, DK, WS, FMA, FMS, FNMS and MAKE_VOLATILE_STRIDE come from the included
 * headers and are not defined in this file.
 */
static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP875502302, +0.875502302409147941146295545768755143177842006);
DK(KP520028571, +0.520028571888864619117130500499232802493238139);
DK(KP968287244, +0.968287244361984016049539446938120421179794516);
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
DK(KP600477271, +0.600477271932665282925769253334763009352012849);
DK(KP957805992, +0.957805992594665126462521754605754580515587217);
DK(KP516520780, +0.516520780623489722840901288569017135705033622);
DK(KP581704778, +0.581704778510515730456870384989698884939833902);
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
DK(KP301479260, +0.301479260047709873958013540496673347309208464);
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
DK(KP859542535, +0.859542535098774820163672132761689612766401925);
DK(KP514918778, +0.514918778086315755491789696138117261566051239);
DK(KP522026385, +0.522026385161275033714027226654165028300441940);
DK(KP853480001, +0.853480001859823990758994934970528322872359049);
DK(KP612264650, +0.612264650376756543746494474777125408779395514);
DK(KP038632954, +0.038632954644348171955506895830342264440241080);
DK(KP302775637, +0.302775637731994646559610633735247973125648287);
DK(KP769338817, +0.769338817572980603471413688209101117038278899);
DK(KP686558370, +0.686558370781754340655719594850823015421401653);
DK(KP226109445, +0.226109445035782405468510155372505010481906348);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One iteration per transform; all four data pointers are advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
E T1, T1P, T2n, T2o, To, TH, T2h, T2k, TB, TE, Tw, TF, T2c, T2j, T1j;
E T1m, T12, T1f, T21, T24, T1U, T27, T1d, T1g, T1Y, T25;
T1 = ri[0];
T1P = ii[0];
/* Real inputs 1..12: group sums and differences, then their combinations. */
{
E Tf, T2d, Tb, Ty, Tq, T6, Tx, Tr, Ti, Tt, Tl, Tu, Tm, T2e, Td;
E Te, Tc, Tn;
Td = ri[WS(is, 8)];
Te = ri[WS(is, 5)];
Tf = Td + Te;
T2d = Td - Te;
{
E T7, T8, T9, Ta;
T7 = ri[WS(is, 12)];
T8 = ri[WS(is, 10)];
T9 = ri[WS(is, 4)];
Ta = T8 + T9;
Tb = T7 + Ta;
/* FMS here, where the (1,3,9) group below uses FNMS for the same shape. */
Ty = FMS(KP500000000, Ta, T7);
Tq = T8 - T9;
}
{
E T2, T3, T4, T5;
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 3)];
T4 = ri[WS(is, 9)];
T5 = T3 + T4;
T6 = T2 + T5;
Tx = FNMS(KP500000000, T5, T2);
Tr = T4 - T3;
}
{
E Tg, Th, Tj, Tk;
Tg = ri[WS(is, 11)];
Th = ri[WS(is, 6)];
Ti = Tg + Th;
Tt = Tg - Th;
Tj = ri[WS(is, 7)];
Tk = ri[WS(is, 2)];
Tl = Tj + Tk;
Tu = Tj - Tk;
}
Tm = Ti + Tl;
T2e = Tt + Tu;
T2n = T6 - Tb;
T2o = T2d + T2e;
Tc = T6 + Tb;
Tn = Tf + Tm;
/* To is the sum of real inputs 1..12. */
To = Tc + Tn;
TH = Tc - Tn;
{
E T2f, T2g, Tz, TA;
T2f = FNMS(KP500000000, T2e, T2d);
T2g = Tr + Tq;
T2h = FMA(KP866025403, T2g, T2f);
T2k = FNMS(KP866025403, T2g, T2f);
Tz = Tx - Ty;
TA = FNMS(KP500000000, Tm, Tf);
TB = Tz + TA;
TE = Tz - TA;
}
{
E Ts, Tv, T2a, T2b;
Ts = Tq - Tr;
Tv = Tt - Tu;
Tw = Ts + Tv;
TF = Ts - Tv;
T2a = Tx + Ty;
T2b = Ti - Tl;
T2c = FMA(KP866025403, T2b, T2a);
T2j = FNMS(KP866025403, T2b, T2a);
}
}
/* Imaginary inputs 1..12: the same grouping applied to ii. */
{
E TM, T1R, T10, T1l, T18, TX, T1k, T15, TP, T1a, TS, T1b, TT, T1S, TK;
E TL, TU, T11;
TK = ii[WS(is, 8)];
TL = ii[WS(is, 5)];
TM = TK - TL;
T1R = TK + TL;
{
E T16, TY, TZ, T17;
T16 = ii[WS(is, 12)];
TY = ii[WS(is, 10)];
TZ = ii[WS(is, 4)];
T17 = TY + TZ;
T10 = TY - TZ;
T1l = T16 + T17;
T18 = FMS(KP500000000, T17, T16);
}
{
E T13, TV, TW, T14;
T13 = ii[WS(is, 1)];
TV = ii[WS(is, 9)];
TW = ii[WS(is, 3)];
T14 = TW + TV;
TX = TV - TW;
T1k = T13 + T14;
T15 = FNMS(KP500000000, T14, T13);
}
{
E TN, TO, TQ, TR;
TN = ii[WS(is, 11)];
TO = ii[WS(is, 6)];
TP = TN - TO;
T1a = TN + TO;
TQ = ii[WS(is, 7)];
TR = ii[WS(is, 2)];
TS = TQ - TR;
T1b = TQ + TR;
}
TT = TP + TS;
T1S = T1a + T1b;
T1j = TM + TT;
T1m = T1k - T1l;
TU = FNMS(KP500000000, TT, TM);
T11 = TX + T10;
T12 = FMA(KP866025403, T11, TU);
T1f = FNMS(KP866025403, T11, TU);
{
E T1Z, T20, T1Q, T1T;
T1Z = T15 - T18;
T20 = FNMS(KP500000000, T1S, T1R);
T21 = T1Z + T20;
T24 = T1Z - T20;
T1Q = T1k + T1l;
T1T = T1R + T1S;
/* T1U is the sum of imaginary inputs 1..12. */
T1U = T1Q + T1T;
T27 = T1Q - T1T;
}
{
E T19, T1c, T1W, T1X;
T19 = T15 + T18;
T1c = T1a - T1b;
T1d = FMA(KP866025403, T1c, T19);
T1g = FNMS(KP866025403, T1c, T19);
T1W = T10 - TX;
T1X = TP - TS;
T1Y = T1W + T1X;
T25 = T1W - T1X;
}
}
/* Output 0 is the sum of all thirteen inputs. */
ro[0] = T1 + To;
io[0] = T1P + T1U;
/* Real outputs 1..12. */
{
E T1z, T1J, T1G, T1H, T1w, T1I, T1n, T1i, T1s, T1E, TD, T1D, TI, T1r, T1e;
E T1h;
{
E T1x, T1y, T1u, T1v;
T1x = FNMS(KP226109445, Tw, TB);
T1y = FMA(KP686558370, TE, TF);
T1z = FNMS(KP769338817, T1y, T1x);
T1J = FMA(KP769338817, T1y, T1x);
T1G = FMA(KP302775637, T1j, T1m);
T1u = FNMS(KP038632954, T12, T1d);
T1v = FNMS(KP612264650, T1f, T1g);
T1H = FNMS(KP853480001, T1v, T1u);
T1w = FMA(KP853480001, T1v, T1u);
T1I = FNMS(KP522026385, T1H, T1G);
}
T1n = FNMS(KP302775637, T1m, T1j);
T1e = FMA(KP038632954, T1d, T12);
T1h = FMA(KP612264650, T1g, T1f);
T1i = FNMS(KP853480001, T1h, T1e);
T1s = FNMS(KP522026385, T1i, T1n);
T1E = FMA(KP853480001, T1h, T1e);
{
E TG, T1q, Tp, TC, T1p;
TG = FNMS(KP514918778, TF, TE);
T1q = FNMS(KP859542535, TG, TH);
Tp = FNMS(KP083333333, To, T1);
TC = FMA(KP301479260, TB, Tw);
T1p = FNMS(KP251768516, TC, Tp);
TD = FMA(KP503537032, TC, Tp);
T1D = FNMS(KP300462606, T1q, T1p);
TI = FMA(KP581704778, TH, TG);
T1r = FMA(KP300462606, T1q, T1p);
}
{
E TJ, T1o, T1L, T1M;
TJ = FMA(KP516520780, TI, TD);
T1o = FMA(KP957805992, T1n, T1i);
ro[WS(os, 1)] = FNMS(KP600477271, T1o, TJ);
ro[WS(os, 12)] = FMA(KP600477271, T1o, TJ);
{
E T1t, T1A, T1N, T1O;
T1t = FNMS(KP575140729, T1s, T1r);
T1A = FMA(KP968287244, T1z, T1w);
ro[WS(os, 9)] = FNMS(KP520028571, T1A, T1t);
ro[WS(os, 3)] = FMA(KP520028571, T1A, T1t);
T1N = FNMS(KP516520780, TI, TD);
T1O = FMA(KP957805992, T1G, T1H);
ro[WS(os, 8)] = FNMS(KP600477271, T1O, T1N);
ro[WS(os, 5)] = FMA(KP600477271, T1O, T1N);
}
T1L = FNMS(KP520028571, T1E, T1D);
T1M = FNMS(KP875502302, T1J, T1I);
ro[WS(os, 11)] = FNMS(KP575140729, T1M, T1L);
ro[WS(os, 6)] = FMA(KP575140729, T1M, T1L);
{
E T1F, T1K, T1B, T1C;
T1F = FMA(KP520028571, T1E, T1D);
T1K = FMA(KP875502302, T1J, T1I);
ro[WS(os, 7)] = FNMS(KP575140729, T1K, T1F);
ro[WS(os, 2)] = FMA(KP575140729, T1K, T1F);
T1B = FMA(KP575140729, T1s, T1r);
T1C = FNMS(KP968287244, T1z, T1w);
ro[WS(os, 10)] = FNMS(KP520028571, T1C, T1B);
ro[WS(os, 4)] = FMA(KP520028571, T1C, T1B);
}
}
}
/* Imaginary outputs 1..12: the same expressions with the real- and
 * imaginary-derived temporaries exchanged, and each value written to index
 * 13 - k where the real section writes k. */
{
E T2F, T2N, T2v, T2u, T2A, T2K, T2p, T2m, T2C, T2M, T23, T2J, T28, T2z, T2i;
E T2l;
{
E T2D, T2E, T2s, T2t;
T2D = FNMS(KP226109445, T1Y, T21);
T2E = FMA(KP686558370, T24, T25);
T2F = FNMS(KP769338817, T2E, T2D);
T2N = FMA(KP769338817, T2E, T2D);
T2v = FNMS(KP302775637, T2n, T2o);
T2s = FMA(KP038632954, T2c, T2h);
T2t = FMA(KP612264650, T2j, T2k);
T2u = FNMS(KP853480001, T2t, T2s);
T2A = FNMS(KP522026385, T2u, T2v);
T2K = FMA(KP853480001, T2t, T2s);
}
T2p = FMA(KP302775637, T2o, T2n);
T2i = FNMS(KP038632954, T2h, T2c);
T2l = FNMS(KP612264650, T2k, T2j);
T2m = FNMS(KP853480001, T2l, T2i);
T2C = FMA(KP853480001, T2l, T2i);
T2M = FNMS(KP522026385, T2m, T2p);
{
E T26, T2y, T1V, T22, T2x;
T26 = FNMS(KP514918778, T25, T24);
T2y = FNMS(KP859542535, T26, T27);
T1V = FNMS(KP083333333, T1U, T1P);
T22 = FMA(KP301479260, T21, T1Y);
T2x = FNMS(KP251768516, T22, T1V);
T23 = FMA(KP503537032, T22, T1V);
T2J = FNMS(KP300462606, T2y, T2x);
T28 = FMA(KP581704778, T27, T26);
T2z = FMA(KP300462606, T2y, T2x);
}
{
E T29, T2q, T2L, T2O;
T29 = FNMS(KP516520780, T28, T23);
T2q = FMA(KP957805992, T2p, T2m);
io[WS(os, 5)] = FNMS(KP600477271, T2q, T29);
io[WS(os, 8)] = FMA(KP600477271, T2q, T29);
{
E T2r, T2w, T2P, T2Q;
T2r = FMA(KP516520780, T28, T23);
T2w = FMA(KP957805992, T2v, T2u);
io[WS(os, 1)] = FMA(KP600477271, T2w, T2r);
io[WS(os, 12)] = FNMS(KP600477271, T2w, T2r);
T2P = FMA(KP520028571, T2K, T2J);
T2Q = FMA(KP875502302, T2N, T2M);
io[WS(os, 6)] = FNMS(KP575140729, T2Q, T2P);
io[WS(os, 11)] = FMA(KP575140729, T2Q, T2P);
}
T2L = FNMS(KP520028571, T2K, T2J);
T2O = FNMS(KP875502302, T2N, T2M);
io[WS(os, 2)] = FNMS(KP575140729, T2O, T2L);
io[WS(os, 7)] = FMA(KP575140729, T2O, T2L);
{
E T2H, T2I, T2B, T2G;
T2H = FNMS(KP575140729, T2A, T2z);
T2I = FMA(KP968287244, T2F, T2C);
io[WS(os, 4)] = FNMS(KP520028571, T2I, T2H);
io[WS(os, 10)] = FMA(KP520028571, T2I, T2H);
T2B = FMA(KP575140729, T2A, T2z);
T2G = FNMS(KP968287244, T2F, T2C);
io[WS(os, 3)] = FNMS(KP520028571, T2G, T2B);
io[WS(os, 9)] = FMA(KP520028571, T2G, T2B);
}
}
}
}
}
}
/* Descriptor for the FMA size-13 kernel.  The first three numbers in the
 * inner braces are the addition, multiplication and fused multiply/add
 * counts quoted in the generator's header comment for this variant. */
static const kdft_desc desc = {
     13, "n1_13", { 62, 0, 114, 0 }, &GENUS, 0, 0, 0, 0
};

/* Registers the n1_13 kernel and its descriptor with planner `p`. */
void X(codelet_n1_13) (planner *p)
{
     X(kdft_register) (p, n1_13, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 13 -name n1_13 -include dft/scalar/n.h */
/*
* This function contains 176 FP additions, 68 FP multiplications,
* (or, 138 additions, 30 multiplications, 38 fused multiply/add),
* 71 stack variables, 20 constants, and 52 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_13: hard-coded size-13 complex DFT kernel (variant compiled when the
 * FMA-preferring macros are not defined).
 *
 * ri, ii   real and imaginary input arrays; element k of one transform is
 *          read at ri[WS(is, k)] / ii[WS(is, k)]
 * ro, io   real and imaginary output arrays, written at WS(os, k)
 * is, os   input and output strides handed to WS()
 * v        number of transforms to perform
 * ivs, ovs added to the input / output pointers after each transform
 *
 * Element 0 is kept apart; the other twelve inputs are read in the index
 * groups (8,5), (12,10,4), (1,3,9), (11,6) and (7,2).
 * E, DK, WS, FMA, FMS, FNMS and MAKE_VOLATILE_STRIDE come from the included
 * headers and are not defined in this file.
 */
static void n1_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
DK(KP083333333, +0.083333333333333333333333333333333333333333333);
DK(KP251768516, +0.251768516431883313623436926934233488546674281);
DK(KP075902986, +0.075902986037193865983102897245103540356428373);
DK(KP132983124, +0.132983124607418643793760531921092974399165133);
DK(KP258260390, +0.258260390311744861420450644284508567852516811);
DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
DK(KP300238635, +0.300238635966332641462884626667381504676006424);
DK(KP011599105, +0.011599105605768290721655456654083252189827041);
DK(KP156891391, +0.156891391051584611046832726756003269660212636);
DK(KP256247671, +0.256247671582936600958684654061725059144125175);
DK(KP174138601, +0.174138601152135905005660794929264742616964676);
DK(KP575140729, +0.575140729474003121368385547455453388461001608);
DK(KP503537032, +0.503537032863766627246873853868466977093348562);
DK(KP113854479, +0.113854479055790798974654345867655310534642560);
DK(KP265966249, +0.265966249214837287587521063842185948798330267);
DK(KP387390585, +0.387390585467617292130675966426762851778775217);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP300462606, +0.300462606288665774426601772289207995520941381);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One iteration per transform; all four data pointers are advanced in the loop header. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(52, is), MAKE_VOLATILE_STRIDE(52, os)) {
E T1, T1q, Tt, Tu, To, T22, T20, T24, TF, TH, TA, TI, T1X, T25, T2a;
E T2d, T18, T1n, T2k, T2n, T1l, T1r, T1f, T1o, T2h, T2m;
T1 = ri[0];
T1q = ii[0];
/* Real inputs 1..12: group sums and differences, then their combinations. */
{
E Tf, Tp, Tb, TC, Tx, T6, TB, Tw, Ti, Tq, Tl, Tr, Tm, Ts, Td;
E Te, Tc, Tn;
Td = ri[WS(is, 8)];
Te = ri[WS(is, 5)];
Tf = Td + Te;
Tp = Td - Te;
{
E T7, T8, T9, Ta;
T7 = ri[WS(is, 12)];
T8 = ri[WS(is, 10)];
T9 = ri[WS(is, 4)];
Ta = T8 + T9;
Tb = T7 + Ta;
TC = T8 - T9;
Tx = FNMS(KP500000000, Ta, T7);
}
{
E T2, T3, T4, T5;
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 3)];
T4 = ri[WS(is, 9)];
T5 = T3 + T4;
T6 = T2 + T5;
TB = T3 - T4;
Tw = FNMS(KP500000000, T5, T2);
}
{
E Tg, Th, Tj, Tk;
Tg = ri[WS(is, 11)];
Th = ri[WS(is, 6)];
Ti = Tg + Th;
Tq = Tg - Th;
Tj = ri[WS(is, 7)];
Tk = ri[WS(is, 2)];
Tl = Tj + Tk;
Tr = Tj - Tk;
}
Tm = Ti + Tl;
Ts = Tq + Tr;
Tt = Tp + Ts;
Tu = T6 - Tb;
Tc = T6 + Tb;
Tn = Tf + Tm;
/* To is the sum of real inputs 1..12. */
To = Tc + Tn;
T22 = KP300462606 * (Tc - Tn);
{
E T1Y, T1Z, TD, TE;
T1Y = TB + TC;
T1Z = Tq - Tr;
T20 = T1Y - T1Z;
T24 = T1Y + T1Z;
TD = KP866025403 * (TB - TC);
TE = FNMS(KP500000000, Ts, Tp);
TF = TD - TE;
TH = TD + TE;
}
{
E Ty, Tz, T1V, T1W;
Ty = Tw - Tx;
Tz = KP866025403 * (Ti - Tl);
TA = Ty + Tz;
TI = Ty - Tz;
T1V = Tw + Tx;
T1W = FNMS(KP500000000, Tm, Tf);
T1X = T1V - T1W;
T25 = T1V + T1W;
}
}
/* Imaginary inputs 1..12: the same index groups applied to ii. */
{
E TZ, T2b, TV, T1i, T1a, TQ, T1h, T19, T12, T1d, T15, T1c, T16, T2c, TX;
E TY, TW, T17;
TX = ii[WS(is, 8)];
TY = ii[WS(is, 5)];
TZ = TX + TY;
T2b = TX - TY;
{
E TR, TS, TT, TU;
TR = ii[WS(is, 12)];
TS = ii[WS(is, 10)];
TT = ii[WS(is, 4)];
TU = TS + TT;
TV = FNMS(KP500000000, TU, TR);
T1i = TR + TU;
T1a = TS - TT;
}
{
E TM, TN, TO, TP;
TM = ii[WS(is, 1)];
TN = ii[WS(is, 3)];
TO = ii[WS(is, 9)];
TP = TN + TO;
TQ = FNMS(KP500000000, TP, TM);
T1h = TM + TP;
T19 = TN - TO;
}
{
E T10, T11, T13, T14;
T10 = ii[WS(is, 11)];
T11 = ii[WS(is, 6)];
T12 = T10 + T11;
T1d = T10 - T11;
T13 = ii[WS(is, 7)];
T14 = ii[WS(is, 2)];
T15 = T13 + T14;
T1c = T13 - T14;
}
T16 = T12 + T15;
T2c = T1d + T1c;
T2a = T1h - T1i;
T2d = T2b + T2c;
TW = TQ + TV;
T17 = FNMS(KP500000000, T16, TZ);
T18 = TW - T17;
T1n = TW + T17;
{
E T2i, T2j, T1j, T1k;
T2i = TQ - TV;
T2j = KP866025403 * (T15 - T12);
T2k = T2i + T2j;
T2n = T2i - T2j;
T1j = T1h + T1i;
T1k = TZ + T16;
T1l = KP300462606 * (T1j - T1k);
/* T1r is the sum of imaginary inputs 1..12. */
T1r = T1j + T1k;
}
{
E T1b, T1e, T2f, T2g;
T1b = T19 + T1a;
T1e = T1c - T1d;
T1f = T1b + T1e;
T1o = T1e - T1b;
T2f = FNMS(KP500000000, T2c, T2b);
T2g = KP866025403 * (T1a - T19);
T2h = T2f - T2g;
T2m = T2g + T2f;
}
}
/* Output 0 is the sum of all thirteen inputs. */
ro[0] = T1 + To;
io[0] = T1q + T1r;
/* Imaginary outputs 1..12 (this variant writes io before ro). */
{
E T1D, T1N, T1y, T1x, T1E, T1O, Tv, TK, T1J, T1Q, T1m, T1R, T1t, T1I, TG;
E TJ;
{
E T1B, T1C, T1v, T1w;
T1B = FMA(KP387390585, T1f, KP265966249 * T18);
T1C = FMA(KP113854479, T1o, KP503537032 * T1n);
T1D = T1B + T1C;
T1N = T1C - T1B;
T1y = FMA(KP575140729, Tu, KP174138601 * Tt);
T1v = FNMS(KP156891391, TH, KP256247671 * TI);
T1w = FMA(KP011599105, TF, KP300238635 * TA);
T1x = T1v - T1w;
T1E = T1y + T1x;
T1O = KP1_732050807 * (T1v + T1w);
}
Tv = FNMS(KP174138601, Tu, KP575140729 * Tt);
TG = FNMS(KP300238635, TF, KP011599105 * TA);
TJ = FMA(KP256247671, TH, KP156891391 * TI);
TK = TG - TJ;
T1J = KP1_732050807 * (TJ + TG);
T1Q = Tv - TK;
{
E T1g, T1H, T1p, T1s, T1G;
T1g = FNMS(KP132983124, T1f, KP258260390 * T18);
T1H = T1l - T1g;
T1p = FNMS(KP251768516, T1o, KP075902986 * T1n);
T1s = FNMS(KP083333333, T1r, T1q);
T1G = T1s - T1p;
T1m = FMA(KP2_000000000, T1g, T1l);
T1R = T1H + T1G;
T1t = FMA(KP2_000000000, T1p, T1s);
T1I = T1G - T1H;
}
{
E TL, T1u, T1P, T1S;
TL = FMA(KP2_000000000, TK, Tv);
T1u = T1m + T1t;
io[WS(os, 1)] = TL + T1u;
io[WS(os, 12)] = T1u - TL;
{
E T1z, T1A, T1T, T1U;
T1z = FMS(KP2_000000000, T1x, T1y);
T1A = T1t - T1m;
io[WS(os, 5)] = T1z + T1A;
io[WS(os, 8)] = T1A - T1z;
T1T = T1R - T1Q;
T1U = T1O + T1N;
io[WS(os, 4)] = T1T - T1U;
io[WS(os, 10)] = T1U + T1T;
}
T1P = T1N - T1O;
T1S = T1Q + T1R;
io[WS(os, 3)] = T1P + T1S;
io[WS(os, 9)] = T1S - T1P;
{
E T1L, T1M, T1F, T1K;
T1L = T1J + T1I;
T1M = T1E + T1D;
io[WS(os, 6)] = T1L - T1M;
io[WS(os, 11)] = T1M + T1L;
T1F = T1D - T1E;
T1K = T1I - T1J;
io[WS(os, 2)] = T1F + T1K;
io[WS(os, 7)] = T1K - T1F;
}
}
}
/* Real outputs 1..12, built from the same constants with the real- and
 * imaginary-derived temporaries in exchanged roles. */
{
E T2y, T2I, T2J, T2K, T2B, T2L, T2e, T2p, T2u, T2G, T23, T2F, T28, T2t, T2l;
E T2o;
{
E T2w, T2x, T2z, T2A;
T2w = FMA(KP387390585, T20, KP265966249 * T1X);
T2x = FNMS(KP503537032, T25, KP113854479 * T24);
T2y = T2w + T2x;
T2I = T2w - T2x;
T2J = FMA(KP575140729, T2a, KP174138601 * T2d);
T2z = FNMS(KP300238635, T2n, KP011599105 * T2m);
T2A = FNMS(KP156891391, T2h, KP256247671 * T2k);
T2K = T2z + T2A;
T2B = KP1_732050807 * (T2z - T2A);
T2L = T2J + T2K;
}
T2e = FNMS(KP575140729, T2d, KP174138601 * T2a);
T2l = FMA(KP256247671, T2h, KP156891391 * T2k);
T2o = FMA(KP300238635, T2m, KP011599105 * T2n);
T2p = T2l - T2o;
T2u = T2e - T2p;
T2G = KP1_732050807 * (T2o + T2l);
{
E T21, T2r, T26, T27, T2s;
T21 = FNMS(KP132983124, T20, KP258260390 * T1X);
T2r = T22 - T21;
T26 = FMA(KP251768516, T24, KP075902986 * T25);
T27 = FNMS(KP083333333, To, T1);
T2s = T27 - T26;
T23 = FMA(KP2_000000000, T21, T22);
T2F = T2s - T2r;
T28 = FMA(KP2_000000000, T26, T27);
T2t = T2r + T2s;
}
{
E T29, T2q, T2N, T2O;
T29 = T23 + T28;
T2q = FMA(KP2_000000000, T2p, T2e);
ro[WS(os, 12)] = T29 - T2q;
ro[WS(os, 1)] = T29 + T2q;
{
E T2v, T2C, T2P, T2Q;
T2v = T2t - T2u;
T2C = T2y - T2B;
ro[WS(os, 10)] = T2v - T2C;
ro[WS(os, 4)] = T2v + T2C;
T2P = T28 - T23;
T2Q = FMS(KP2_000000000, T2K, T2J);
ro[WS(os, 5)] = T2P - T2Q;
ro[WS(os, 8)] = T2P + T2Q;
}
T2N = T2F - T2G;
T2O = T2L - T2I;
ro[WS(os, 11)] = T2N - T2O;
ro[WS(os, 6)] = T2N + T2O;
{
E T2H, T2M, T2D, T2E;
T2H = T2F + T2G;
T2M = T2I + T2L;
ro[WS(os, 7)] = T2H - T2M;
ro[WS(os, 2)] = T2H + T2M;
T2D = T2t + T2u;
T2E = T2y + T2B;
ro[WS(os, 3)] = T2D - T2E;
ro[WS(os, 9)] = T2D + T2E;
}
}
}
}
}
}
/* Descriptor for the non-FMA size-13 kernel.  The first three numbers in the
 * inner braces are the addition, multiplication and fused multiply/add
 * counts quoted in the generator's header comment for this variant. */
static const kdft_desc desc = {
     13, "n1_13", { 138, 30, 38, 0 }, &GENUS, 0, 0, 0, 0
};

/* Registers the n1_13 kernel and its descriptor with planner `p`. */
void X(codelet_n1_13) (planner *p)
{
     X(kdft_register) (p, n1_13, &desc);
}
#endif

View File

@@ -0,0 +1,513 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include dft/scalar/n.h */
/*
* This function contains 148 FP additions, 84 FP multiplications,
* (or, 64 additions, 0 multiplications, 84 fused multiply/add),
* 67 stack variables, 6 constants, and 56 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_14 (FMA variant): hard-coded size-14 complex DFT kernel produced by
 * genfft's gen_notw generator with -fma.
 *
 * ri, ii   real and imaginary input arrays, read at WS(is, 0) .. WS(is, 13)
 * ro, io   real and imaginary output arrays, written at WS(os, 0) .. WS(os, 13)
 * is, os   input and output strides, only ever used through WS()
 * v        number of transforms to compute
 * ivs, ovs amount added to the input / output pointers after each transform
 *
 * FMA and FNMS are macros from the included codelet headers; presumably
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm there.
 */
static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Multiplier constants; DK binds each name to the literal next to it. */
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
{
INT i;
/* One pass per transform: i counts down from v while the four data
 * pointers step by ivs / ovs.  MAKE_VOLATILE_STRIDE is an opaque macro
 * from the included headers; its effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
E T3, Tp, T1b, T1x, T1i, T1L, T1M, T1j, T1k, T1K, Ta, To, Th, Tz, T14;
E TZ, Ts, Ty, Tv, T1Z, T2c, T27, TI, T23, T24, TP, TW, T22, T1c, T1e;
E T1d, T1f, T1s, T1n, T1A, T1G, T1D, T1H, T1U, T1P;
/* Inputs 0 and 7: T3 / T1b are the real / imaginary differences,
 * Tp / T1x the corresponding sums. */
{
E T1, T2, T19, T1a;
T1 = ri[0];
T2 = ri[WS(is, 7)];
T3 = T1 - T2;
Tp = T1 + T2;
T19 = ii[0];
T1a = ii[WS(is, 7)];
T1b = T19 - T1a;
T1x = T19 + T1a;
}
/* Real parts of the other twelve inputs, taken in pairs whose indices
 * differ by 7: (2,9), (12,5), (8,1), (6,13), (10,3), (4,11).  Each pair
 * yields one difference and one sum. */
{
E T6, Tq, T9, Tr, Tn, Tx, Tk, Tw, Tg, Tu, Td, Tt;
{
E T4, T5, Ti, Tj;
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 9)];
T6 = T4 - T5;
Tq = T4 + T5;
{
E T7, T8, Tl, Tm;
T7 = ri[WS(is, 12)];
T8 = ri[WS(is, 5)];
T9 = T7 - T8;
Tr = T7 + T8;
Tl = ri[WS(is, 8)];
Tm = ri[WS(is, 1)];
Tn = Tl - Tm;
Tx = Tl + Tm;
}
Ti = ri[WS(is, 6)];
Tj = ri[WS(is, 13)];
Tk = Ti - Tj;
Tw = Ti + Tj;
{
E Te, Tf, Tb, Tc;
Te = ri[WS(is, 10)];
Tf = ri[WS(is, 3)];
Tg = Te - Tf;
Tu = Te + Tf;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 11)];
Td = Tb - Tc;
Tt = Tb + Tc;
}
}
/* T1i, T1j, T1k (from the pair differences) are consumed by the
 * odd-indexed imaginary outputs; T1K, T1L, T1M (from the pair sums) by
 * the even-indexed imaginary outputs. */
T1i = Tn - Tk;
T1L = Tt - Tu;
T1M = Tr - Tq;
T1j = Tg - Td;
T1k = T9 - T6;
T1K = Tw - Tx;
/* Ta, To, Th feed the odd-indexed real outputs; Tz, T14, TZ are the
 * innermost step of their FMA chains. */
Ta = T6 + T9;
To = Tk + Tn;
Th = Td + Tg;
Tz = FNMS(KP356895867, Th, Ta);
T14 = FNMS(KP356895867, To, Th);
TZ = FNMS(KP356895867, Ta, To);
/* Ts, Ty, Tv feed the even-indexed real outputs; T1Z, T2c, T27 are the
 * innermost step of their FMA chains. */
Ts = Tq + Tr;
Ty = Tw + Tx;
Tv = Tt + Tu;
T1Z = FNMS(KP356895867, Ts, Ty);
T2c = FNMS(KP356895867, Ty, Tv);
T27 = FNMS(KP356895867, Tv, Ts);
}
/* Imaginary parts of the same twelve inputs, paired the same way. */
{
E TE, T1B, TH, T1C, TV, T1F, TS, T1E, TO, T1z, TL, T1y;
{
E TC, TD, TQ, TR;
TC = ii[WS(is, 4)];
TD = ii[WS(is, 11)];
TE = TC - TD;
T1B = TC + TD;
{
E TF, TG, TT, TU;
TF = ii[WS(is, 10)];
TG = ii[WS(is, 3)];
TH = TF - TG;
T1C = TF + TG;
TT = ii[WS(is, 8)];
TU = ii[WS(is, 1)];
TV = TT - TU;
T1F = TT + TU;
}
TQ = ii[WS(is, 6)];
TR = ii[WS(is, 13)];
TS = TQ - TR;
T1E = TQ + TR;
{
E TM, TN, TJ, TK;
TM = ii[WS(is, 12)];
TN = ii[WS(is, 5)];
TO = TM - TN;
T1z = TM + TN;
TJ = ii[WS(is, 2)];
TK = ii[WS(is, 9)];
TL = TJ - TK;
T1y = TJ + TK;
}
}
/* TI, TP, TW are consumed by the odd-indexed real outputs; T22, T23,
 * T24 by the even-indexed real outputs. */
TI = TE - TH;
T23 = T1F - T1E;
T24 = T1C - T1B;
TP = TL - TO;
TW = TS - TV;
T22 = T1y - T1z;
/* T1c, T1e, T1d feed the odd-indexed imaginary outputs. */
T1c = TL + TO;
T1e = TS + TV;
T1d = TE + TH;
T1f = FNMS(KP356895867, T1e, T1d);
T1s = FNMS(KP356895867, T1d, T1c);
T1n = FNMS(KP356895867, T1c, T1e);
/* T1A, T1G, T1D feed the even-indexed imaginary outputs. */
T1A = T1y + T1z;
T1G = T1E + T1F;
T1D = T1B + T1C;
T1H = FNMS(KP356895867, T1G, T1D);
T1U = FNMS(KP356895867, T1D, T1A);
T1P = FNMS(KP356895867, T1A, T1G);
}
/* Outputs 7 and 0 need no multiplications: plain sums of the difference
 * terms and of the sum terms respectively. */
ro[WS(os, 7)] = T3 + Ta + Th + To;
io[WS(os, 7)] = T1b + T1c + T1d + T1e;
ro[0] = Tp + Ts + Tv + Ty;
io[0] = T1x + T1A + T1D + T1G;
/* Real outputs 13 and 1. */
{
E TB, TY, TA, TX;
TA = FNMS(KP692021471, Tz, To);
TB = FNMS(KP900968867, TA, T3);
TX = FMA(KP554958132, TW, TP);
TY = FMA(KP801937735, TX, TI);
ro[WS(os, 13)] = FNMS(KP974927912, TY, TB);
ro[WS(os, 1)] = FMA(KP974927912, TY, TB);
}
/* Imaginary outputs 1 and 13. */
{
E T1u, T1w, T1t, T1v;
T1t = FNMS(KP692021471, T1s, T1e);
T1u = FNMS(KP900968867, T1t, T1b);
T1v = FMA(KP554958132, T1i, T1k);
T1w = FMA(KP801937735, T1v, T1j);
io[WS(os, 1)] = FMA(KP974927912, T1w, T1u);
io[WS(os, 13)] = FNMS(KP974927912, T1w, T1u);
}
/* Real outputs 5 and 9. */
{
E T11, T13, T10, T12;
T10 = FNMS(KP692021471, TZ, Th);
T11 = FNMS(KP900968867, T10, T3);
T12 = FMA(KP554958132, TI, TW);
T13 = FNMS(KP801937735, T12, TP);
ro[WS(os, 5)] = FNMS(KP974927912, T13, T11);
ro[WS(os, 9)] = FMA(KP974927912, T13, T11);
}
/* Imaginary outputs 5 and 9. */
{
E T1p, T1r, T1o, T1q;
T1o = FNMS(KP692021471, T1n, T1d);
T1p = FNMS(KP900968867, T1o, T1b);
T1q = FMA(KP554958132, T1j, T1i);
T1r = FNMS(KP801937735, T1q, T1k);
io[WS(os, 5)] = FNMS(KP974927912, T1r, T1p);
io[WS(os, 9)] = FMA(KP974927912, T1r, T1p);
}
/* Real outputs 11 and 3. */
{
E T16, T18, T15, T17;
T15 = FNMS(KP692021471, T14, Ta);
T16 = FNMS(KP900968867, T15, T3);
T17 = FNMS(KP554958132, TP, TI);
T18 = FNMS(KP801937735, T17, TW);
ro[WS(os, 11)] = FNMS(KP974927912, T18, T16);
ro[WS(os, 3)] = FMA(KP974927912, T18, T16);
}
/* Imaginary outputs 3 and 11. */
{
E T1h, T1m, T1g, T1l;
T1g = FNMS(KP692021471, T1f, T1c);
T1h = FNMS(KP900968867, T1g, T1b);
T1l = FNMS(KP554958132, T1k, T1j);
T1m = FNMS(KP801937735, T1l, T1i);
io[WS(os, 3)] = FMA(KP974927912, T1m, T1h);
io[WS(os, 11)] = FNMS(KP974927912, T1m, T1h);
}
/* Imaginary outputs 4 and 10. */
{
E T1J, T1O, T1I, T1N;
T1I = FNMS(KP692021471, T1H, T1A);
T1J = FNMS(KP900968867, T1I, T1x);
T1N = FMA(KP554958132, T1M, T1L);
T1O = FNMS(KP801937735, T1N, T1K);
io[WS(os, 4)] = FMA(KP974927912, T1O, T1J);
io[WS(os, 10)] = FNMS(KP974927912, T1O, T1J);
}
/* Real outputs 10 and 4. */
{
E T2e, T2g, T2d, T2f;
T2d = FNMS(KP692021471, T2c, Ts);
T2e = FNMS(KP900968867, T2d, Tp);
T2f = FMA(KP554958132, T22, T24);
T2g = FNMS(KP801937735, T2f, T23);
ro[WS(os, 10)] = FNMS(KP974927912, T2g, T2e);
ro[WS(os, 4)] = FMA(KP974927912, T2g, T2e);
}
/* Imaginary outputs 2 and 12. */
{
E T1R, T1T, T1Q, T1S;
T1Q = FNMS(KP692021471, T1P, T1D);
T1R = FNMS(KP900968867, T1Q, T1x);
T1S = FMA(KP554958132, T1L, T1K);
T1T = FMA(KP801937735, T1S, T1M);
io[WS(os, 2)] = FMA(KP974927912, T1T, T1R);
io[WS(os, 12)] = FNMS(KP974927912, T1T, T1R);
}
/* Real outputs 12 and 2. */
{
E T21, T26, T20, T25;
T20 = FNMS(KP692021471, T1Z, Tv);
T21 = FNMS(KP900968867, T20, Tp);
T25 = FMA(KP554958132, T24, T23);
T26 = FMA(KP801937735, T25, T22);
ro[WS(os, 12)] = FNMS(KP974927912, T26, T21);
ro[WS(os, 2)] = FMA(KP974927912, T26, T21);
}
/* Imaginary outputs 6 and 8. */
{
E T1W, T1Y, T1V, T1X;
T1V = FNMS(KP692021471, T1U, T1G);
T1W = FNMS(KP900968867, T1V, T1x);
T1X = FNMS(KP554958132, T1K, T1M);
T1Y = FNMS(KP801937735, T1X, T1L);
io[WS(os, 6)] = FMA(KP974927912, T1Y, T1W);
io[WS(os, 8)] = FNMS(KP974927912, T1Y, T1W);
}
/* Real outputs 8 and 6. */
{
E T29, T2b, T28, T2a;
T28 = FNMS(KP692021471, T27, Ty);
T29 = FNMS(KP900968867, T28, Tp);
T2a = FNMS(KP554958132, T23, T22);
T2b = FNMS(KP801937735, T2a, T24);
ro[WS(os, 8)] = FNMS(KP974927912, T2b, T29);
ro[WS(os, 6)] = FMA(KP974927912, T2b, T29);
}
}
}
}
/*
 * Descriptor handed to the planner for the FMA build of this codelet:
 * transform size 14, the codelet name, the count table { 64, 0, 84, 0 }
 * (matching the 64 additions, 0 multiplications and 84 fused multiply/adds
 * quoted in the generated header comment), the GENUS object, and four
 * trailing zero fields.
 */
static const kdft_desc desc = {
     14,
     "n1_14",
     { 64, 0, 84, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_14 kernel and its descriptor with planner p. */
void X(codelet_n1_14) (planner *p)
{
     X(kdft_register) (p, n1_14, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 14 -name n1_14 -include dft/scalar/n.h */
/*
* This function contains 148 FP additions, 72 FP multiplications,
* (or, 100 additions, 24 multiplications, 48 fused multiply/add),
* 43 stack variables, 6 constants, and 56 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_14 (non-FMA variant): hard-coded size-14 complex DFT kernel produced
 * by genfft's gen_notw generator.
 *
 * ri, ii   real and imaginary input arrays, read at WS(is, 0) .. WS(is, 13)
 * ro, io   real and imaginary output arrays, written at WS(os, 0) .. WS(os, 13)
 * is, os   input and output strides, only ever used through WS()
 * v        number of transforms to compute
 * ivs, ovs amount added to the input / output pointers after each transform
 *
 * FMA, FNMS and FNMA are macros from the included codelet headers; their
 * exact definitions are not visible here (presumably a*b+c, c-a*b and
 * -(a*b)-c -- confirm there).
 */
static void n1_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Multiplier constants; DK binds each name to the literal next to it.
 * Numerically: 0.2225.. ~ cos(3*pi/7), 0.9009.. ~ cos(pi/7),
 * 0.6234.. ~ cos(2*pi/7), 0.4338.. ~ sin(pi/7), 0.7818.. ~ sin(2*pi/7),
 * 0.9749.. ~ sin(3*pi/7). */
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
{
INT i;
/* One pass per transform: i counts down from v while the four data
 * pointers step by ivs / ovs.  MAKE_VOLATILE_STRIDE is an opaque macro
 * from the included headers; its effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(56, is), MAKE_VOLATILE_STRIDE(56, os)) {
E T3, Tp, T16, T1f, Ta, T1q, Ts, T10, TG, T1z, T19, T1i, Th, T1s, Tv;
E T12, TU, T1B, T17, T1o, To, T1r, Ty, T11, TN, T1A, T18, T1l;
/* Inputs 0 and 7: T3 / T16 are the real / imaginary differences,
 * Tp / T1f the corresponding sums. */
{
E T1, T2, T14, T15;
T1 = ri[0];
T2 = ri[WS(is, 7)];
T3 = T1 - T2;
Tp = T1 + T2;
T14 = ii[0];
T15 = ii[WS(is, 7)];
T16 = T14 - T15;
T1f = T14 + T15;
}
/* Real parts of inputs (2,9) and (12,5): pair differences and sums,
 * then their four combinations. */
{
E T6, Tq, T9, Tr;
{
E T4, T5, T7, T8;
T4 = ri[WS(is, 2)];
T5 = ri[WS(is, 9)];
T6 = T4 - T5;
Tq = T4 + T5;
T7 = ri[WS(is, 12)];
T8 = ri[WS(is, 5)];
T9 = T7 - T8;
Tr = T7 + T8;
}
Ta = T6 + T9;
T1q = Tr - Tq;
Ts = Tq + Tr;
T10 = T9 - T6;
}
/* Imaginary parts of inputs (2,9) and (12,5). */
{
E TC, T1g, TF, T1h;
{
E TA, TB, TD, TE;
TA = ii[WS(is, 2)];
TB = ii[WS(is, 9)];
TC = TA - TB;
T1g = TA + TB;
TD = ii[WS(is, 12)];
TE = ii[WS(is, 5)];
TF = TD - TE;
T1h = TD + TE;
}
TG = TC - TF;
T1z = T1g - T1h;
T19 = TC + TF;
T1i = T1g + T1h;
}
/* Real parts of inputs (4,11) and (10,3). */
{
E Td, Tt, Tg, Tu;
{
E Tb, Tc, Te, Tf;
Tb = ri[WS(is, 4)];
Tc = ri[WS(is, 11)];
Td = Tb - Tc;
Tt = Tb + Tc;
Te = ri[WS(is, 10)];
Tf = ri[WS(is, 3)];
Tg = Te - Tf;
Tu = Te + Tf;
}
Th = Td + Tg;
T1s = Tt - Tu;
Tv = Tt + Tu;
T12 = Tg - Td;
}
/* Imaginary parts of inputs (4,11) and (10,3). */
{
E TQ, T1m, TT, T1n;
{
E TO, TP, TR, TS;
TO = ii[WS(is, 4)];
TP = ii[WS(is, 11)];
TQ = TO - TP;
T1m = TO + TP;
TR = ii[WS(is, 10)];
TS = ii[WS(is, 3)];
TT = TR - TS;
T1n = TR + TS;
}
TU = TQ - TT;
T1B = T1n - T1m;
T17 = TQ + TT;
T1o = T1m + T1n;
}
/* Real parts of inputs (6,13) and (8,1). */
{
E Tk, Tw, Tn, Tx;
{
E Ti, Tj, Tl, Tm;
Ti = ri[WS(is, 6)];
Tj = ri[WS(is, 13)];
Tk = Ti - Tj;
Tw = Ti + Tj;
Tl = ri[WS(is, 8)];
Tm = ri[WS(is, 1)];
Tn = Tl - Tm;
Tx = Tl + Tm;
}
To = Tk + Tn;
T1r = Tw - Tx;
Ty = Tw + Tx;
T11 = Tn - Tk;
}
/* Imaginary parts of inputs (6,13) and (8,1). */
{
E TJ, T1j, TM, T1k;
{
E TH, TI, TK, TL;
TH = ii[WS(is, 6)];
TI = ii[WS(is, 13)];
TJ = TH - TI;
T1j = TH + TI;
TK = ii[WS(is, 8)];
TL = ii[WS(is, 1)];
TM = TK - TL;
T1k = TK + TL;
}
TN = TJ - TM;
T1A = T1k - T1j;
T18 = TJ + TM;
T1l = T1j + T1k;
}
/* Outputs 7 and 0 need no multiplications: plain sums of the difference
 * terms and of the sum terms respectively. */
ro[WS(os, 7)] = T3 + Ta + Th + To;
io[WS(os, 7)] = T16 + T19 + T17 + T18;
ro[0] = Tp + Ts + Tv + Ty;
io[0] = T1f + T1i + T1o + T1l;
/* Outputs 5 and 9 (real, then imaginary). */
{
E TV, Tz, T1e, T1d;
TV = FNMS(KP781831482, TN, KP974927912 * TG) - (KP433883739 * TU);
Tz = FMA(KP623489801, To, T3) + FNMA(KP900968867, Th, KP222520933 * Ta);
ro[WS(os, 5)] = Tz - TV;
ro[WS(os, 9)] = Tz + TV;
T1e = FNMS(KP781831482, T11, KP974927912 * T10) - (KP433883739 * T12);
T1d = FMA(KP623489801, T18, T16) + FNMA(KP900968867, T17, KP222520933 * T19);
io[WS(os, 5)] = T1d - T1e;
io[WS(os, 9)] = T1e + T1d;
}
/* Outputs 13 and 1 (real, then imaginary). */
{
E TX, TW, T1b, T1c;
TX = FMA(KP781831482, TG, KP974927912 * TU) + (KP433883739 * TN);
TW = FMA(KP623489801, Ta, T3) + FNMA(KP900968867, To, KP222520933 * Th);
ro[WS(os, 13)] = TW - TX;
ro[WS(os, 1)] = TW + TX;
T1b = FMA(KP781831482, T10, KP974927912 * T12) + (KP433883739 * T11);
T1c = FMA(KP623489801, T19, T16) + FNMA(KP900968867, T18, KP222520933 * T17);
io[WS(os, 1)] = T1b + T1c;
io[WS(os, 13)] = T1c - T1b;
}
/* Outputs 11 and 3 (real, then imaginary). */
{
E TZ, TY, T13, T1a;
TZ = FMA(KP433883739, TG, KP974927912 * TN) - (KP781831482 * TU);
TY = FMA(KP623489801, Th, T3) + FNMA(KP222520933, To, KP900968867 * Ta);
ro[WS(os, 11)] = TY - TZ;
ro[WS(os, 3)] = TY + TZ;
T13 = FMA(KP433883739, T10, KP974927912 * T11) - (KP781831482 * T12);
T1a = FMA(KP623489801, T17, T16) + FNMA(KP222520933, T18, KP900968867 * T19);
io[WS(os, 3)] = T13 + T1a;
io[WS(os, 11)] = T1a - T13;
}
/* Outputs 6 and 8 (imaginary, then real). */
{
E T1t, T1p, T1C, T1y;
T1t = FNMS(KP433883739, T1r, KP781831482 * T1q) - (KP974927912 * T1s);
T1p = FMA(KP623489801, T1i, T1f) + FNMA(KP900968867, T1l, KP222520933 * T1o);
io[WS(os, 6)] = T1p - T1t;
io[WS(os, 8)] = T1t + T1p;
T1C = FNMS(KP433883739, T1A, KP781831482 * T1z) - (KP974927912 * T1B);
T1y = FMA(KP623489801, Ts, Tp) + FNMA(KP900968867, Ty, KP222520933 * Tv);
ro[WS(os, 6)] = T1y - T1C;
ro[WS(os, 8)] = T1y + T1C;
}
/* Outputs 4 and 10 (imaginary, then real). */
{
E T1v, T1u, T1E, T1D;
T1v = FMA(KP433883739, T1q, KP781831482 * T1s) - (KP974927912 * T1r);
T1u = FMA(KP623489801, T1o, T1f) + FNMA(KP222520933, T1l, KP900968867 * T1i);
io[WS(os, 4)] = T1u - T1v;
io[WS(os, 10)] = T1v + T1u;
T1E = FMA(KP433883739, T1z, KP781831482 * T1B) - (KP974927912 * T1A);
T1D = FMA(KP623489801, Tv, Tp) + FNMA(KP222520933, Ty, KP900968867 * Ts);
ro[WS(os, 4)] = T1D - T1E;
ro[WS(os, 10)] = T1D + T1E;
}
/* Outputs 2 and 12 (imaginary, then real). */
{
E T1w, T1x, T1G, T1F;
T1w = FMA(KP974927912, T1q, KP433883739 * T1s) + (KP781831482 * T1r);
T1x = FMA(KP623489801, T1l, T1f) + FNMA(KP900968867, T1o, KP222520933 * T1i);
io[WS(os, 2)] = T1w + T1x;
io[WS(os, 12)] = T1x - T1w;
T1G = FMA(KP974927912, T1z, KP433883739 * T1B) + (KP781831482 * T1A);
T1F = FMA(KP623489801, Ty, Tp) + FNMA(KP900968867, Tv, KP222520933 * Ts);
ro[WS(os, 12)] = T1F - T1G;
ro[WS(os, 2)] = T1F + T1G;
}
}
}
}
/*
 * Descriptor handed to the planner for the non-FMA build of this codelet:
 * transform size 14, the codelet name, the count table { 100, 24, 48, 0 }
 * (matching the 100 additions, 24 multiplications and 48 fused
 * multiply/adds quoted in the generated header comment), the GENUS object,
 * and four trailing zero fields.
 */
static const kdft_desc desc = {
     14,
     "n1_14",
     { 100, 24, 48, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_14 kernel and its descriptor with planner p. */
void X(codelet_n1_14) (planner *p)
{
     X(kdft_register) (p, n1_14, &desc);
}
#endif

View File

@@ -0,0 +1,554 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include dft/scalar/n.h */
/*
* This function contains 156 FP additions, 84 FP multiplications,
* (or, 72 additions, 0 multiplications, 84 fused multiply/add),
* 69 stack variables, 6 constants, and 60 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_15 (FMA variant): hard-coded size-15 complex DFT kernel produced by
 * genfft's gen_notw generator with -fma.
 *
 * ri, ii   real and imaginary input arrays, read at WS(is, 0) .. WS(is, 14)
 * ro, io   real and imaginary output arrays, written at WS(os, 0) .. WS(os, 14)
 * is, os   input and output strides, only ever used through WS()
 * v        number of transforms to compute
 * ivs, ovs amount added to the input / output pointers after each transform
 *
 * The inputs are consumed as five triples whose indices differ by 5, and
 * the outputs are produced in three groups of five.
 *
 * FMA and FNMS are macros from the included codelet headers; presumably
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm there.
 */
static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Multiplier constants; DK binds each name to the literal next to it.
 * Numerically: 0.9510.. ~ sin(2*pi/5), 0.5590.. ~ sqrt(5)/4,
 * 0.6180.. ~ (sqrt(5)-1)/2, 0.8660.. ~ sqrt(3)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One pass per transform: i counts down from v while the four data
 * pointers step by ivs / ovs.  MAKE_VOLATILE_STRIDE is an opaque macro
 * from the included headers; its effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
E T5, T2l, Tx, TV, T1z, T1X, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
E T1O, T1P, T1Z, T1l, T1q, T1B, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
E T2f, T2g, T2m, T1R, T1S, T1Y, T1a, T1f, T1A, TW, TX, TY;
/* Inputs 0, 5, 10: T5 / T2l are the real / imaginary sums of the three;
 * Tx, TV (real) and T1z, T1X (imaginary) are the two remaining
 * combinations, each built with KP500000000 and KP866025403. */
{
E T1, T1v, T4, T1y, Tw, T1w, Tt, T1x;
T1 = ri[0];
T1v = ii[0];
{
E T2, T3, Tu, Tv;
T2 = ri[WS(is, 5)];
T3 = ri[WS(is, 10)];
T4 = T2 + T3;
T1y = T3 - T2;
Tu = ii[WS(is, 5)];
Tv = ii[WS(is, 10)];
Tw = Tu - Tv;
T1w = Tu + Tv;
}
T5 = T1 + T4;
T2l = T1v + T1w;
Tt = FNMS(KP500000000, T4, T1);
Tx = FNMS(KP866025403, Tw, Tt);
TV = FMA(KP866025403, Tw, Tt);
T1x = FNMS(KP500000000, T1w, T1v);
T1z = FMA(KP866025403, T1y, T1x);
T1X = FNMS(KP866025403, T1y, T1x);
}
/* Input triples (6, 11, 1) and (9, 14, 4), combined the same way and
 * then added pairwise (Tr, TT, T2n, T1Z, T1B, T11). */
{
E Th, Tk, TJ, T1k, T1h, T1i, TM, T1j, Tm, Tp, TO, T1p, T1m, T1n, TR;
E T1o;
{
E Ti, Tj, TK, TL;
Th = ri[WS(is, 6)];
Ti = ri[WS(is, 11)];
Tj = ri[WS(is, 1)];
Tk = Ti + Tj;
TJ = FNMS(KP500000000, Tk, Th);
T1k = Tj - Ti;
T1h = ii[WS(is, 6)];
TK = ii[WS(is, 11)];
TL = ii[WS(is, 1)];
T1i = TK + TL;
TM = TK - TL;
T1j = FNMS(KP500000000, T1i, T1h);
}
{
E Tn, To, TP, TQ;
Tm = ri[WS(is, 9)];
Tn = ri[WS(is, 14)];
To = ri[WS(is, 4)];
Tp = Tn + To;
TO = FNMS(KP500000000, Tp, Tm);
T1p = To - Tn;
T1m = ii[WS(is, 9)];
TP = ii[WS(is, 14)];
TQ = ii[WS(is, 4)];
T1n = TP + TQ;
TR = TP - TQ;
T1o = FNMS(KP500000000, T1n, T1m);
}
Tl = Th + Tk;
Tq = Tm + Tp;
Tr = Tl + Tq;
TN = FNMS(KP866025403, TM, TJ);
TS = FNMS(KP866025403, TR, TO);
TT = TN + TS;
T2c = T1h + T1i;
T2d = T1m + T1n;
T2n = T2c + T2d;
T1O = FNMS(KP866025403, T1k, T1j);
T1P = FNMS(KP866025403, T1p, T1o);
T1Z = T1O + T1P;
T1l = FMA(KP866025403, T1k, T1j);
T1q = FMA(KP866025403, T1p, T1o);
T1B = T1l + T1q;
TZ = FMA(KP866025403, TM, TJ);
T10 = FMA(KP866025403, TR, TO);
T11 = TZ + T10;
}
/* Input triples (3, 8, 13) and (12, 2, 7), combined the same way and
 * then added pairwise (Tg, TI, T2m, T1Y, T1A, TY). */
{
E T6, T9, Ty, T19, T16, T17, TB, T18, Tb, Te, TD, T1e, T1b, T1c, TG;
E T1d;
{
E T7, T8, Tz, TA;
T6 = ri[WS(is, 3)];
T7 = ri[WS(is, 8)];
T8 = ri[WS(is, 13)];
T9 = T7 + T8;
Ty = FNMS(KP500000000, T9, T6);
T19 = T8 - T7;
T16 = ii[WS(is, 3)];
Tz = ii[WS(is, 8)];
TA = ii[WS(is, 13)];
T17 = Tz + TA;
TB = Tz - TA;
T18 = FNMS(KP500000000, T17, T16);
}
{
E Tc, Td, TE, TF;
Tb = ri[WS(is, 12)];
Tc = ri[WS(is, 2)];
Td = ri[WS(is, 7)];
Te = Tc + Td;
TD = FNMS(KP500000000, Te, Tb);
T1e = Td - Tc;
T1b = ii[WS(is, 12)];
TE = ii[WS(is, 2)];
TF = ii[WS(is, 7)];
T1c = TE + TF;
TG = TE - TF;
T1d = FNMS(KP500000000, T1c, T1b);
}
Ta = T6 + T9;
Tf = Tb + Te;
Tg = Ta + Tf;
TC = FNMS(KP866025403, TB, Ty);
TH = FNMS(KP866025403, TG, TD);
TI = TC + TH;
T2f = T16 + T17;
T2g = T1b + T1c;
T2m = T2f + T2g;
T1R = FNMS(KP866025403, T19, T18);
T1S = FNMS(KP866025403, T1e, T1d);
T1Y = T1R + T1S;
T1a = FMA(KP866025403, T19, T18);
T1f = FMA(KP866025403, T1e, T1d);
T1A = T1a + T1f;
TW = FMA(KP866025403, TB, Ty);
TX = FMA(KP866025403, TG, TD);
TY = TW + TX;
}
/* Real outputs 0, 9, 6, 12, 3, from the plain triple sums. */
{
E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
T2a = Tg - Tr;
Ts = Tg + Tr;
T29 = FNMS(KP250000000, Ts, T5);
T2e = T2c - T2d;
T2h = T2f - T2g;
T2i = FNMS(KP618033988, T2h, T2e);
T2k = FMA(KP618033988, T2e, T2h);
ro[0] = T5 + Ts;
T2j = FMA(KP559016994, T2a, T29);
ro[WS(os, 9)] = FNMS(KP951056516, T2k, T2j);
ro[WS(os, 6)] = FMA(KP951056516, T2k, T2j);
T2b = FNMS(KP559016994, T2a, T29);
ro[WS(os, 12)] = FNMS(KP951056516, T2i, T2b);
ro[WS(os, 3)] = FMA(KP951056516, T2i, T2b);
}
/* Imaginary outputs 0, 6, 9, 3, 12. */
{
E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
T2q = T2m - T2n;
T2o = T2m + T2n;
T2p = FNMS(KP250000000, T2o, T2l);
T2s = Tl - Tq;
T2t = Ta - Tf;
T2u = FNMS(KP618033988, T2t, T2s);
T2w = FMA(KP618033988, T2s, T2t);
io[0] = T2l + T2o;
T2v = FMA(KP559016994, T2q, T2p);
io[WS(os, 6)] = FNMS(KP951056516, T2w, T2v);
io[WS(os, 9)] = FMA(KP951056516, T2w, T2v);
T2r = FNMS(KP559016994, T2q, T2p);
io[WS(os, 3)] = FNMS(KP951056516, T2u, T2r);
io[WS(os, 12)] = FMA(KP951056516, T2u, T2r);
}
/* Real outputs 5, 14, 11, 2, 8. */
{
E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
T1M = TI - TT;
TU = TI + TT;
T1L = FNMS(KP250000000, TU, Tx);
T1Q = T1O - T1P;
T1T = T1R - T1S;
T1U = FNMS(KP618033988, T1T, T1Q);
T1W = FMA(KP618033988, T1Q, T1T);
ro[WS(os, 5)] = Tx + TU;
T1V = FMA(KP559016994, T1M, T1L);
ro[WS(os, 14)] = FNMS(KP951056516, T1W, T1V);
ro[WS(os, 11)] = FMA(KP951056516, T1W, T1V);
T1N = FNMS(KP559016994, T1M, T1L);
ro[WS(os, 2)] = FNMS(KP951056516, T1U, T1N);
ro[WS(os, 8)] = FMA(KP951056516, T1U, T1N);
}
/* Imaginary outputs 5, 11, 14, 2, 8. */
{
E T22, T20, T21, T26, T28, T24, T25, T27, T23;
T22 = T1Y - T1Z;
T20 = T1Y + T1Z;
T21 = FNMS(KP250000000, T20, T1X);
T24 = TN - TS;
T25 = TC - TH;
T26 = FNMS(KP618033988, T25, T24);
T28 = FMA(KP618033988, T24, T25);
io[WS(os, 5)] = T1X + T20;
T27 = FMA(KP559016994, T22, T21);
io[WS(os, 11)] = FNMS(KP951056516, T28, T27);
io[WS(os, 14)] = FMA(KP951056516, T28, T27);
T23 = FNMS(KP559016994, T22, T21);
io[WS(os, 2)] = FMA(KP951056516, T26, T23);
io[WS(os, 8)] = FNMS(KP951056516, T26, T23);
}
/* Imaginary outputs 10, 7, 13, 1, 4. */
{
E T1E, T1C, T1D, T1I, T1K, T1G, T1H, T1J, T1F;
T1E = T1A - T1B;
T1C = T1A + T1B;
T1D = FNMS(KP250000000, T1C, T1z);
T1G = TW - TX;
T1H = TZ - T10;
T1I = FMA(KP618033988, T1H, T1G);
T1K = FNMS(KP618033988, T1G, T1H);
io[WS(os, 10)] = T1z + T1C;
T1J = FNMS(KP559016994, T1E, T1D);
io[WS(os, 7)] = FMA(KP951056516, T1K, T1J);
io[WS(os, 13)] = FNMS(KP951056516, T1K, T1J);
T1F = FMA(KP559016994, T1E, T1D);
io[WS(os, 1)] = FNMS(KP951056516, T1I, T1F);
io[WS(os, 4)] = FMA(KP951056516, T1I, T1F);
}
/* Real outputs 10, 7, 13, 4, 1. */
{
E T14, T12, T13, T1s, T1u, T1g, T1r, T1t, T15;
T14 = TY - T11;
T12 = TY + T11;
T13 = FNMS(KP250000000, T12, TV);
T1g = T1a - T1f;
T1r = T1l - T1q;
T1s = FMA(KP618033988, T1r, T1g);
T1u = FNMS(KP618033988, T1g, T1r);
ro[WS(os, 10)] = TV + T12;
T1t = FNMS(KP559016994, T14, T13);
ro[WS(os, 7)] = FNMS(KP951056516, T1u, T1t);
ro[WS(os, 13)] = FMA(KP951056516, T1u, T1t);
T15 = FMA(KP559016994, T14, T13);
ro[WS(os, 4)] = FNMS(KP951056516, T1s, T15);
ro[WS(os, 1)] = FMA(KP951056516, T1s, T15);
}
}
}
}
/*
 * Descriptor handed to the planner for the FMA build of this codelet:
 * transform size 15, the codelet name, the count table { 72, 0, 84, 0 }
 * (matching the 72 additions, 0 multiplications and 84 fused multiply/adds
 * quoted in the generated header comment), the GENUS object, and four
 * trailing zero fields.
 */
static const kdft_desc desc = {
     15,
     "n1_15",
     { 72, 0, 84, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_15 kernel and its descriptor with planner p. */
void X(codelet_n1_15) (planner *p)
{
     X(kdft_register) (p, n1_15, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 15 -name n1_15 -include dft/scalar/n.h */
/*
* This function contains 156 FP additions, 56 FP multiplications,
* (or, 128 additions, 28 multiplications, 28 fused multiply/add),
* 69 stack variables, 6 constants, and 60 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_15 (non-FMA variant): hard-coded size-15 complex DFT kernel produced
 * by genfft's gen_notw generator.
 *
 * ri, ii   real and imaginary input arrays, read at WS(is, 0) .. WS(is, 14)
 * ro, io   real and imaginary output arrays, written at WS(os, 0) .. WS(os, 14)
 * is, os   input and output strides, only ever used through WS()
 * v        number of transforms to compute
 * ivs, ovs amount added to the input / output pointers after each transform
 *
 * The inputs are consumed as five triples whose indices differ by 5, and
 * the outputs are produced in three groups of five.
 *
 * FMA and FNMS are macros from the included codelet headers; presumably
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm there.
 */
static void n1_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Multiplier constants; DK binds each name to the literal next to it.
 * Numerically: 0.5877.. ~ sin(pi/5), 0.9510.. ~ sin(2*pi/5),
 * 0.5590.. ~ sqrt(5)/4, 0.8660.. ~ sqrt(3)/2. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT i;
/* One pass per transform: i counts down from v while the four data
 * pointers step by ivs / ovs.  MAKE_VOLATILE_STRIDE is an opaque macro
 * from the included headers; its effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(60, is), MAKE_VOLATILE_STRIDE(60, os)) {
E T5, T2l, Tx, TV, T1C, T20, Tl, Tq, Tr, TN, TS, TT, T2c, T2d, T2n;
E T1O, T1P, T22, T1l, T1q, T1w, TZ, T10, T11, Ta, Tf, Tg, TC, TH, TI;
E T2f, T2g, T2m, T1R, T1S, T21, T1a, T1f, T1v, TW, TX, TY;
/* Inputs 0, 5, 10: T5 / T2l are the real / imaginary sums of the three;
 * Tx, TV (real) and T1C, T20 (imaginary) are the two remaining
 * combinations.  The KP866025403 scaling is applied directly to the
 * differences here. */
{
E T1, T1z, T4, T1y, Tw, T1A, Tt, T1B;
T1 = ri[0];
T1z = ii[0];
{
E T2, T3, Tu, Tv;
T2 = ri[WS(is, 5)];
T3 = ri[WS(is, 10)];
T4 = T2 + T3;
T1y = KP866025403 * (T3 - T2);
Tu = ii[WS(is, 5)];
Tv = ii[WS(is, 10)];
Tw = KP866025403 * (Tu - Tv);
T1A = Tu + Tv;
}
T5 = T1 + T4;
T2l = T1z + T1A;
Tt = FNMS(KP500000000, T4, T1);
Tx = Tt - Tw;
TV = Tt + Tw;
T1B = FNMS(KP500000000, T1A, T1z);
T1C = T1y + T1B;
T20 = T1B - T1y;
}
/* Input triples (6, 11, 1) and (9, 14, 4), combined the same way and
 * then added pairwise (Tr, TT, T2n, T22, T1w, T11). */
{
E Th, Tk, TJ, T1h, T1i, T1j, TM, T1k, Tm, Tp, TO, T1m, T1n, T1o, TR;
E T1p;
{
E Ti, Tj, TK, TL;
Th = ri[WS(is, 6)];
Ti = ri[WS(is, 11)];
Tj = ri[WS(is, 1)];
Tk = Ti + Tj;
TJ = FNMS(KP500000000, Tk, Th);
T1h = KP866025403 * (Tj - Ti);
T1i = ii[WS(is, 6)];
TK = ii[WS(is, 11)];
TL = ii[WS(is, 1)];
T1j = TK + TL;
TM = KP866025403 * (TK - TL);
T1k = FNMS(KP500000000, T1j, T1i);
}
{
E Tn, To, TP, TQ;
Tm = ri[WS(is, 9)];
Tn = ri[WS(is, 14)];
To = ri[WS(is, 4)];
Tp = Tn + To;
TO = FNMS(KP500000000, Tp, Tm);
T1m = KP866025403 * (To - Tn);
T1n = ii[WS(is, 9)];
TP = ii[WS(is, 14)];
TQ = ii[WS(is, 4)];
T1o = TP + TQ;
TR = KP866025403 * (TP - TQ);
T1p = FNMS(KP500000000, T1o, T1n);
}
Tl = Th + Tk;
Tq = Tm + Tp;
Tr = Tl + Tq;
TN = TJ - TM;
TS = TO - TR;
TT = TN + TS;
T2c = T1i + T1j;
T2d = T1n + T1o;
T2n = T2c + T2d;
T1O = T1k - T1h;
T1P = T1p - T1m;
T22 = T1O + T1P;
T1l = T1h + T1k;
T1q = T1m + T1p;
T1w = T1l + T1q;
TZ = TJ + TM;
T10 = TO + TR;
T11 = TZ + T10;
}
/* Input triples (3, 8, 13) and (12, 2, 7), combined the same way and
 * then added pairwise (Tg, TI, T2m, T21, T1v, TY). */
{
E T6, T9, Ty, T16, T17, T18, TB, T19, Tb, Te, TD, T1b, T1c, T1d, TG;
E T1e;
{
E T7, T8, Tz, TA;
T6 = ri[WS(is, 3)];
T7 = ri[WS(is, 8)];
T8 = ri[WS(is, 13)];
T9 = T7 + T8;
Ty = FNMS(KP500000000, T9, T6);
T16 = KP866025403 * (T8 - T7);
T17 = ii[WS(is, 3)];
Tz = ii[WS(is, 8)];
TA = ii[WS(is, 13)];
T18 = Tz + TA;
TB = KP866025403 * (Tz - TA);
T19 = FNMS(KP500000000, T18, T17);
}
{
E Tc, Td, TE, TF;
Tb = ri[WS(is, 12)];
Tc = ri[WS(is, 2)];
Td = ri[WS(is, 7)];
Te = Tc + Td;
TD = FNMS(KP500000000, Te, Tb);
T1b = KP866025403 * (Td - Tc);
T1c = ii[WS(is, 12)];
TE = ii[WS(is, 2)];
TF = ii[WS(is, 7)];
T1d = TE + TF;
TG = KP866025403 * (TE - TF);
T1e = FNMS(KP500000000, T1d, T1c);
}
Ta = T6 + T9;
Tf = Tb + Te;
Tg = Ta + Tf;
TC = Ty - TB;
TH = TD - TG;
TI = TC + TH;
T2f = T17 + T18;
T2g = T1c + T1d;
T2m = T2f + T2g;
T1R = T19 - T16;
T1S = T1e - T1b;
T21 = T1R + T1S;
T1a = T16 + T19;
T1f = T1b + T1e;
T1v = T1a + T1f;
TW = Ty + TB;
TX = TD + TG;
TY = TW + TX;
}
/* Real outputs 0, 9, 6, 12, 3, from the plain triple sums. */
{
E T2a, Ts, T29, T2i, T2k, T2e, T2h, T2j, T2b;
T2a = KP559016994 * (Tg - Tr);
Ts = Tg + Tr;
T29 = FNMS(KP250000000, Ts, T5);
T2e = T2c - T2d;
T2h = T2f - T2g;
T2i = FNMS(KP587785252, T2h, KP951056516 * T2e);
T2k = FMA(KP951056516, T2h, KP587785252 * T2e);
ro[0] = T5 + Ts;
T2j = T2a + T29;
ro[WS(os, 9)] = T2j - T2k;
ro[WS(os, 6)] = T2j + T2k;
T2b = T29 - T2a;
ro[WS(os, 12)] = T2b - T2i;
ro[WS(os, 3)] = T2b + T2i;
}
/* Imaginary outputs 0, 6, 9, 3, 12. */
{
E T2q, T2o, T2p, T2u, T2w, T2s, T2t, T2v, T2r;
T2q = KP559016994 * (T2m - T2n);
T2o = T2m + T2n;
T2p = FNMS(KP250000000, T2o, T2l);
T2s = Tl - Tq;
T2t = Ta - Tf;
T2u = FNMS(KP587785252, T2t, KP951056516 * T2s);
T2w = FMA(KP951056516, T2t, KP587785252 * T2s);
io[0] = T2l + T2o;
T2v = T2q + T2p;
io[WS(os, 6)] = T2v - T2w;
io[WS(os, 9)] = T2w + T2v;
T2r = T2p - T2q;
io[WS(os, 3)] = T2r - T2u;
io[WS(os, 12)] = T2u + T2r;
}
/* Real outputs 5, 14, 11, 2, 8. */
{
E T1M, TU, T1L, T1U, T1W, T1Q, T1T, T1V, T1N;
T1M = KP559016994 * (TI - TT);
TU = TI + TT;
T1L = FNMS(KP250000000, TU, Tx);
T1Q = T1O - T1P;
T1T = T1R - T1S;
T1U = FNMS(KP587785252, T1T, KP951056516 * T1Q);
T1W = FMA(KP951056516, T1T, KP587785252 * T1Q);
ro[WS(os, 5)] = Tx + TU;
T1V = T1M + T1L;
ro[WS(os, 14)] = T1V - T1W;
ro[WS(os, 11)] = T1V + T1W;
T1N = T1L - T1M;
ro[WS(os, 2)] = T1N - T1U;
ro[WS(os, 8)] = T1N + T1U;
}
/* Imaginary outputs 5, 11, 14, 2, 8. */
{
E T25, T23, T24, T1Z, T28, T1X, T1Y, T27, T26;
T25 = KP559016994 * (T21 - T22);
T23 = T21 + T22;
T24 = FNMS(KP250000000, T23, T20);
T1X = TN - TS;
T1Y = TC - TH;
T1Z = FNMS(KP587785252, T1Y, KP951056516 * T1X);
T28 = FMA(KP951056516, T1Y, KP587785252 * T1X);
io[WS(os, 5)] = T20 + T23;
T27 = T25 + T24;
io[WS(os, 11)] = T27 - T28;
io[WS(os, 14)] = T28 + T27;
T26 = T24 - T25;
io[WS(os, 2)] = T1Z + T26;
io[WS(os, 8)] = T26 - T1Z;
}
/* Imaginary outputs 10, 7, 13, 1, 4. */
{
E T1x, T1D, T1E, T1I, T1J, T1G, T1H, T1K, T1F;
T1x = KP559016994 * (T1v - T1w);
T1D = T1v + T1w;
T1E = FNMS(KP250000000, T1D, T1C);
T1G = TW - TX;
T1H = TZ - T10;
T1I = FMA(KP951056516, T1G, KP587785252 * T1H);
T1J = FNMS(KP587785252, T1G, KP951056516 * T1H);
io[WS(os, 10)] = T1C + T1D;
T1K = T1E - T1x;
io[WS(os, 7)] = T1J + T1K;
io[WS(os, 13)] = T1K - T1J;
T1F = T1x + T1E;
io[WS(os, 1)] = T1F - T1I;
io[WS(os, 4)] = T1I + T1F;
}
/* Real outputs 10, 7, 13, 4, 1. */
{
E T13, T12, T14, T1s, T1u, T1g, T1r, T1t, T15;
T13 = KP559016994 * (TY - T11);
T12 = TY + T11;
T14 = FNMS(KP250000000, T12, TV);
T1g = T1a - T1f;
T1r = T1l - T1q;
T1s = FMA(KP951056516, T1g, KP587785252 * T1r);
T1u = FNMS(KP587785252, T1g, KP951056516 * T1r);
ro[WS(os, 10)] = TV + T12;
T1t = T14 - T13;
ro[WS(os, 7)] = T1t - T1u;
ro[WS(os, 13)] = T1t + T1u;
T15 = T13 + T14;
ro[WS(os, 4)] = T15 - T1s;
ro[WS(os, 1)] = T15 + T1s;
}
}
}
}
/*
 * Descriptor handed to the planner for the non-FMA build of this codelet:
 * transform size 15, the codelet name, the count table { 128, 28, 28, 0 }
 * (matching the 128 additions, 28 multiplications and 28 fused
 * multiply/adds quoted in the generated header comment), the GENUS object,
 * and four trailing zero fields.
 */
static const kdft_desc desc = {
     15,
     "n1_15",
     { 128, 28, 28, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Registers the n1_15 kernel and its descriptor with planner p. */
void X(codelet_n1_15) (planner *p)
{
     X(kdft_register) (p, n1_15, &desc);
}
#endif

View File

@@ -0,0 +1,560 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:25 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include dft/scalar/n.h */
/*
* This function contains 144 FP additions, 40 FP multiplications,
* (or, 104 additions, 0 multiplications, 40 fused multiply/add),
* 50 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_16 (FMA variant): hard-coded size-16 complex DFT kernel produced by
 * genfft's gen_notw generator with -fma.
 *
 * ri, ii   real and imaginary input arrays, read at WS(is, 0) .. WS(is, 15)
 * ro, io   real and imaginary output arrays, written at WS(os, 0) .. WS(os, 15)
 * is, os   input and output strides, only ever used through WS()
 * v        number of transforms to compute
 * ivs, ovs amount added to the input / output pointers after each transform
 *
 * FMA and FNMS are macros from the included codelet headers; presumably
 * FMA(a, b, c) = a * b + c and FNMS(a, b, c) = c - a * b -- confirm there.
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Multiplier constants; DK binds each name to the literal next to it.
 * Numerically: 0.9238.. ~ cos(pi/8), 0.4142.. ~ sqrt(2)-1,
 * 0.7071.. ~ sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* One pass per transform: i counts down from v while the four data
 * pointers step by ivs / ovs.  MAKE_VOLATILE_STRIDE is an opaque macro
 * from the included headers; its effect is not visible here. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
E T1U, T1A;
/* Inputs 0, 8, 4, 12: sums and differences of the (0,8) and (4,12)
 * pairs, then sums and differences of those. */
{
E T3, TL, Ty, T1k, T6, T1j, TB, TM;
{
E T1, T2, Tw, Tx;
T1 = ri[0];
T2 = ri[WS(is, 8)];
T3 = T1 + T2;
TL = T1 - T2;
Tw = ii[0];
Tx = ii[WS(is, 8)];
Ty = Tw + Tx;
T1k = Tw - Tx;
}
{
E T4, T5, Tz, TA;
T4 = ri[WS(is, 4)];
T5 = ri[WS(is, 12)];
T6 = T4 + T5;
T1j = T4 - T5;
Tz = ii[WS(is, 4)];
TA = ii[WS(is, 12)];
TB = Tz + TA;
TM = Tz - TA;
}
T7 = T3 + T6;
T1R = T3 - T6;
T25 = Ty - TB;
TC = Ty + TB;
TN = TL - TM;
T1x = TL + TM;
T1H = T1k - T1j;
T1l = T1j + T1k;
}
/* Inputs 15, 7, 3, 11, handled the same way. */
{
E Tp, T1c, T1a, T20, Ts, T17, T1f, T21;
{
E Tn, To, T18, T19;
Tn = ri[WS(is, 15)];
To = ri[WS(is, 7)];
Tp = Tn + To;
T1c = Tn - To;
T18 = ii[WS(is, 15)];
T19 = ii[WS(is, 7)];
T1a = T18 - T19;
T20 = T18 + T19;
}
{
E Tq, Tr, T1d, T1e;
Tq = ri[WS(is, 3)];
Tr = ri[WS(is, 11)];
Ts = Tq + Tr;
T17 = Tq - Tr;
T1d = ii[WS(is, 3)];
T1e = ii[WS(is, 11)];
T1f = T1d - T1e;
T21 = T1d + T1e;
}
Tt = Tp + Ts;
T22 = T20 - T21;
T2h = T20 + T21;
T1b = T17 + T1a;
T1g = T1c - T1f;
T1E = T1a - T17;
T1Z = Tp - Ts;
T1D = T1c + T1f;
}
/* Inputs 2, 10, 14, 6. */
{
E Ta, TP, TF, TO, Td, TR, TI, TS;
{
E T8, T9, TD, TE;
T8 = ri[WS(is, 2)];
T9 = ri[WS(is, 10)];
Ta = T8 + T9;
TP = T8 - T9;
TD = ii[WS(is, 2)];
TE = ii[WS(is, 10)];
TF = TD + TE;
TO = TD - TE;
}
{
E Tb, Tc, TG, TH;
Tb = ri[WS(is, 14)];
Tc = ri[WS(is, 6)];
Td = Tb + Tc;
TR = Tb - Tc;
TG = ii[WS(is, 14)];
TH = ii[WS(is, 6)];
TI = TG + TH;
TS = TG - TH;
}
Te = Ta + Td;
T1S = TF - TI;
T26 = Td - Ta;
TJ = TF + TI;
TQ = TO - TP;
T1m = TR - TS;
T1n = TP + TO;
TT = TR + TS;
}
/* Inputs 1, 9, 5, 13. */
{
E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
{
E Tg, Th, TX, TY;
Tg = ri[WS(is, 1)];
Th = ri[WS(is, 9)];
Ti = Tg + Th;
T11 = Tg - Th;
TX = ii[WS(is, 1)];
TY = ii[WS(is, 9)];
TZ = TX - TY;
T1V = TX + TY;
}
{
E Tj, Tk, T12, T13;
Tj = ri[WS(is, 5)];
Tk = ri[WS(is, 13)];
Tl = Tj + Tk;
TW = Tj - Tk;
T12 = ii[WS(is, 5)];
T13 = ii[WS(is, 13)];
T14 = T12 - T13;
T1W = T12 + T13;
}
Tm = Ti + Tl;
T1X = T1V - T1W;
T2g = T1V + T1W;
T10 = TW + TZ;
T15 = T11 - T14;
T1B = TZ - TW;
T1U = Ti - Tl;
T1A = T11 + T14;
}
/* Outputs 8 and 0: additions and subtractions only. */
{
E Tf, Tu, T2j, T2k;
Tf = T7 + Te;
Tu = Tm + Tt;
ro[WS(os, 8)] = Tf - Tu;
ro[0] = Tf + Tu;
T2j = TC + TJ;
T2k = T2g + T2h;
io[WS(os, 8)] = T2j - T2k;
io[0] = T2j + T2k;
}
/* Outputs 4 and 12: additions and subtractions only. */
{
E Tv, TK, T2f, T2i;
Tv = Tt - Tm;
TK = TC - TJ;
io[WS(os, 4)] = Tv + TK;
io[WS(os, 12)] = TK - Tv;
T2f = T7 - Te;
T2i = T2g - T2h;
ro[WS(os, 12)] = T2f - T2i;
ro[WS(os, 4)] = T2f + T2i;
}
/* Real outputs 10 and 2, imaginary outputs 6 and 14: one KP707106781
 * multiply-add each. */
{
E T1T, T27, T24, T28, T1Y, T23;
T1T = T1R + T1S;
T27 = T25 - T26;
T1Y = T1U + T1X;
T23 = T1Z - T22;
T24 = T1Y + T23;
T28 = T23 - T1Y;
ro[WS(os, 10)] = FNMS(KP707106781, T24, T1T);
io[WS(os, 6)] = FMA(KP707106781, T28, T27);
ro[WS(os, 2)] = FMA(KP707106781, T24, T1T);
io[WS(os, 14)] = FNMS(KP707106781, T28, T27);
}
/* Real outputs 14 and 6, imaginary outputs 2 and 10. */
{
E T29, T2d, T2c, T2e, T2a, T2b;
T29 = T1R - T1S;
T2d = T26 + T25;
T2a = T1X - T1U;
T2b = T1Z + T22;
T2c = T2a - T2b;
T2e = T2a + T2b;
ro[WS(os, 14)] = FNMS(KP707106781, T2c, T29);
io[WS(os, 2)] = FMA(KP707106781, T2e, T2d);
ro[WS(os, 6)] = FMA(KP707106781, T2c, T29);
io[WS(os, 10)] = FNMS(KP707106781, T2e, T2d);
}
/* Outputs 11, 3, 7, 15 (real and imaginary): KP707106781 and KP414213562
 * stages followed by a final KP923879532 multiply-add. */
{
E TV, T1v, T1p, T1r, T1i, T1q, T1u, T1w, TU, T1o;
TU = TQ - TT;
TV = FMA(KP707106781, TU, TN);
T1v = FNMS(KP707106781, TU, TN);
T1o = T1m - T1n;
T1p = FNMS(KP707106781, T1o, T1l);
T1r = FMA(KP707106781, T1o, T1l);
{
E T16, T1h, T1s, T1t;
T16 = FMA(KP414213562, T15, T10);
T1h = FNMS(KP414213562, T1g, T1b);
T1i = T16 - T1h;
T1q = T16 + T1h;
T1s = FMA(KP414213562, T1b, T1g);
T1t = FNMS(KP414213562, T10, T15);
T1u = T1s - T1t;
T1w = T1t + T1s;
}
ro[WS(os, 11)] = FNMS(KP923879532, T1i, TV);
io[WS(os, 11)] = FNMS(KP923879532, T1u, T1r);
ro[WS(os, 3)] = FMA(KP923879532, T1i, TV);
io[WS(os, 3)] = FMA(KP923879532, T1u, T1r);
io[WS(os, 7)] = FNMS(KP923879532, T1q, T1p);
ro[WS(os, 7)] = FNMS(KP923879532, T1w, T1v);
io[WS(os, 15)] = FMA(KP923879532, T1q, T1p);
ro[WS(os, 15)] = FMA(KP923879532, T1w, T1v);
}
/* Outputs 9, 1, 13, 5 (real and imaginary), built the same way. */
{
E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
T1y = T1n + T1m;
T1z = FMA(KP707106781, T1y, T1x);
T1L = FNMS(KP707106781, T1y, T1x);
T1I = TQ + TT;
T1J = FNMS(KP707106781, T1I, T1H);
T1P = FMA(KP707106781, T1I, T1H);
{
E T1C, T1F, T1M, T1N;
T1C = FMA(KP414213562, T1B, T1A);
T1F = FNMS(KP414213562, T1E, T1D);
T1G = T1C + T1F;
T1K = T1F - T1C;
T1M = FNMS(KP414213562, T1A, T1B);
T1N = FMA(KP414213562, T1D, T1E);
T1O = T1M - T1N;
T1Q = T1M + T1N;
}
ro[WS(os, 9)] = FNMS(KP923879532, T1G, T1z);
io[WS(os, 9)] = FNMS(KP923879532, T1Q, T1P);
ro[WS(os, 1)] = FMA(KP923879532, T1G, T1z);
io[WS(os, 1)] = FMA(KP923879532, T1Q, T1P);
io[WS(os, 13)] = FNMS(KP923879532, T1K, T1J);
ro[WS(os, 13)] = FNMS(KP923879532, T1O, T1L);
io[WS(os, 5)] = FMA(KP923879532, T1K, T1J);
ro[WS(os, 5)] = FMA(KP923879532, T1O, T1L);
}
}
}
}
/*
 * Descriptor passed to the planner together with the n1_16 kernel.
 * NOTE(review): the leading 16 presumably is the transform size and the
 * nested initializer presumably holds operation counts -- confirm against
 * the kdft_desc declaration.
 */
static const kdft_desc desc = {
     16,
     "n1_16",
     { 104, 0, 40, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_16 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_16) (planner *p)
{
     X(kdft_register) (p, n1_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 16 -name n1_16 -include dft/scalar/n.h */
/*
* This function contains 144 FP additions, 24 FP multiplications,
* (or, 136 additions, 16 multiplications, 8 fused multiply/add),
* 50 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * Size-16 complex DFT kernel (the variant generated without -fma).
 *
 * ri/ii: real and imaginary input arrays; element k is read at WS(is, k).
 * ro/io: real and imaginary output arrays; element k is written at WS(os, k).
 * v:     number of transforms performed; after each one the input pointers
 *        advance by ivs and the output pointers by ovs.
 *
 * Each iteration loads all 32 input values into locals before the first
 * store to ro/io.
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Constants: numerically sin(pi/8), cos(pi/8) and sqrt(1/2). */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT i;
/* One pass per transform; the increment clause also steps all four data pointers. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) {
E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z;
E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B;
E T1U, T1A;
/* Inputs 0, 8, 4, 12: pairwise sums/differences, then combined once more. */
{
E T3, TL, Ty, T1k, T6, T1j, TB, TM;
{
E T1, T2, Tw, Tx;
T1 = ri[0];
T2 = ri[WS(is, 8)];
T3 = T1 + T2;
TL = T1 - T2;
Tw = ii[0];
Tx = ii[WS(is, 8)];
Ty = Tw + Tx;
T1k = Tw - Tx;
}
{
E T4, T5, Tz, TA;
T4 = ri[WS(is, 4)];
T5 = ri[WS(is, 12)];
T6 = T4 + T5;
T1j = T4 - T5;
Tz = ii[WS(is, 4)];
TA = ii[WS(is, 12)];
TB = Tz + TA;
TM = Tz - TA;
}
T7 = T3 + T6;
T1R = T3 - T6;
T25 = Ty - TB;
TC = Ty + TB;
TN = TL - TM;
T1x = TL + TM;
T1H = T1k - T1j;
T1l = T1j + T1k;
}
/* Inputs 15, 7, 3, 11. */
{
E Tp, T17, T1f, T20, Ts, T1c, T1a, T21;
{
E Tn, To, T1d, T1e;
Tn = ri[WS(is, 15)];
To = ri[WS(is, 7)];
Tp = Tn + To;
T17 = Tn - To;
T1d = ii[WS(is, 15)];
T1e = ii[WS(is, 7)];
T1f = T1d - T1e;
T20 = T1d + T1e;
}
{
E Tq, Tr, T18, T19;
Tq = ri[WS(is, 3)];
Tr = ri[WS(is, 11)];
Ts = Tq + Tr;
T1c = Tq - Tr;
T18 = ii[WS(is, 3)];
T19 = ii[WS(is, 11)];
T1a = T18 - T19;
T21 = T18 + T19;
}
Tt = Tp + Ts;
T22 = T20 - T21;
T2h = T20 + T21;
T1b = T17 - T1a;
T1g = T1c + T1f;
T1E = T1f - T1c;
T1Z = Tp - Ts;
T1D = T17 + T1a;
}
/* Inputs 2, 10, 14, 6. */
{
E Ta, TP, TF, TO, Td, TR, TI, TS;
{
E T8, T9, TD, TE;
T8 = ri[WS(is, 2)];
T9 = ri[WS(is, 10)];
Ta = T8 + T9;
TP = T8 - T9;
TD = ii[WS(is, 2)];
TE = ii[WS(is, 10)];
TF = TD + TE;
TO = TD - TE;
}
{
E Tb, Tc, TG, TH;
Tb = ri[WS(is, 14)];
Tc = ri[WS(is, 6)];
Td = Tb + Tc;
TR = Tb - Tc;
TG = ii[WS(is, 14)];
TH = ii[WS(is, 6)];
TI = TG + TH;
TS = TG - TH;
}
Te = Ta + Td;
T1S = TF - TI;
T26 = Td - Ta;
TJ = TF + TI;
TQ = TO - TP;
T1m = TR - TS;
T1n = TP + TO;
TT = TR + TS;
}
/* Inputs 1, 9, 5, 13. */
{
E Ti, T11, TZ, T1V, Tl, TW, T14, T1W;
{
E Tg, Th, TX, TY;
Tg = ri[WS(is, 1)];
Th = ri[WS(is, 9)];
Ti = Tg + Th;
T11 = Tg - Th;
TX = ii[WS(is, 1)];
TY = ii[WS(is, 9)];
TZ = TX - TY;
T1V = TX + TY;
}
{
E Tj, Tk, T12, T13;
Tj = ri[WS(is, 5)];
Tk = ri[WS(is, 13)];
Tl = Tj + Tk;
TW = Tj - Tk;
T12 = ii[WS(is, 5)];
T13 = ii[WS(is, 13)];
T14 = T12 - T13;
T1W = T12 + T13;
}
Tm = Ti + Tl;
T1X = T1V - T1W;
T2g = T1V + T1W;
T10 = TW + TZ;
T15 = T11 - T14;
T1B = T11 + T14;
T1U = Ti - Tl;
T1A = TZ - TW;
}
/* Outputs 0 and 8: additions and subtractions only. */
{
E Tf, Tu, T2j, T2k;
Tf = T7 + Te;
Tu = Tm + Tt;
ro[WS(os, 8)] = Tf - Tu;
ro[0] = Tf + Tu;
T2j = TC + TJ;
T2k = T2g + T2h;
io[WS(os, 8)] = T2j - T2k;
io[0] = T2j + T2k;
}
/* Outputs 4 and 12: additions and subtractions only. */
{
E Tv, TK, T2f, T2i;
Tv = Tt - Tm;
TK = TC - TJ;
io[WS(os, 4)] = Tv + TK;
io[WS(os, 12)] = TK - Tv;
T2f = T7 - Te;
T2i = T2g - T2h;
ro[WS(os, 12)] = T2f - T2i;
ro[WS(os, 4)] = T2f + T2i;
}
/* Real outputs 10, 2 and imaginary outputs 6, 14: scaled by KP707106781. */
{
E T1T, T27, T24, T28, T1Y, T23;
T1T = T1R + T1S;
T27 = T25 - T26;
T1Y = T1U + T1X;
T23 = T1Z - T22;
T24 = KP707106781 * (T1Y + T23);
T28 = KP707106781 * (T23 - T1Y);
ro[WS(os, 10)] = T1T - T24;
io[WS(os, 6)] = T27 + T28;
ro[WS(os, 2)] = T1T + T24;
io[WS(os, 14)] = T27 - T28;
}
/* Real outputs 14, 6 and imaginary outputs 2, 10: scaled by KP707106781. */
{
E T29, T2d, T2c, T2e, T2a, T2b;
T29 = T1R - T1S;
T2d = T26 + T25;
T2a = T1X - T1U;
T2b = T1Z + T22;
T2c = KP707106781 * (T2a - T2b);
T2e = KP707106781 * (T2a + T2b);
ro[WS(os, 14)] = T29 - T2c;
io[WS(os, 2)] = T2d + T2e;
ro[WS(os, 6)] = T29 + T2c;
io[WS(os, 10)] = T2d - T2e;
}
/* Outputs 3, 7, 11, 15: use all three constants. */
{
E TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o;
TU = KP707106781 * (TQ - TT);
TV = TN + TU;
T1r = TN - TU;
T1o = KP707106781 * (T1m - T1n);
T1p = T1l - T1o;
T1v = T1l + T1o;
{
E T16, T1h, T1s, T1t;
T16 = FMA(KP923879532, T10, KP382683432 * T15);
T1h = FNMS(KP923879532, T1g, KP382683432 * T1b);
T1i = T16 + T1h;
T1q = T1h - T16;
T1s = FNMS(KP923879532, T15, KP382683432 * T10);
T1t = FMA(KP382683432, T1g, KP923879532 * T1b);
T1u = T1s - T1t;
T1w = T1s + T1t;
}
ro[WS(os, 11)] = TV - T1i;
io[WS(os, 11)] = T1v - T1w;
ro[WS(os, 3)] = TV + T1i;
io[WS(os, 3)] = T1v + T1w;
io[WS(os, 15)] = T1p - T1q;
ro[WS(os, 15)] = T1r - T1u;
io[WS(os, 7)] = T1p + T1q;
ro[WS(os, 7)] = T1r + T1u;
}
/* Outputs 1, 5, 9, 13: use all three constants. */
{
E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I;
T1y = KP707106781 * (T1n + T1m);
T1z = T1x + T1y;
T1L = T1x - T1y;
T1I = KP707106781 * (TQ + TT);
T1J = T1H - T1I;
T1P = T1H + T1I;
{
E T1C, T1F, T1M, T1N;
T1C = FMA(KP382683432, T1A, KP923879532 * T1B);
T1F = FNMS(KP382683432, T1E, KP923879532 * T1D);
T1G = T1C + T1F;
T1K = T1F - T1C;
T1M = FNMS(KP382683432, T1B, KP923879532 * T1A);
T1N = FMA(KP923879532, T1E, KP382683432 * T1D);
T1O = T1M - T1N;
T1Q = T1M + T1N;
}
ro[WS(os, 9)] = T1z - T1G;
io[WS(os, 9)] = T1P - T1Q;
ro[WS(os, 1)] = T1z + T1G;
io[WS(os, 1)] = T1P + T1Q;
io[WS(os, 13)] = T1J - T1K;
ro[WS(os, 13)] = T1L - T1O;
io[WS(os, 5)] = T1J + T1K;
ro[WS(os, 5)] = T1L + T1O;
}
}
}
}
/*
 * Descriptor passed to the planner together with the n1_16 kernel.  The
 * first three numbers of the nested initializer equal the add / multiply /
 * fused multiply-add counts quoted in the generator's summary comment for
 * this variant.
 */
static const kdft_desc desc = {
     16,
     "n1_16",
     { 136, 16, 8, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_16 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_16) (planner *p)
{
     X(kdft_register) (p, n1_16, &desc);
}
#endif

View File

@@ -0,0 +1,94 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include dft/scalar/n.h */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 5 stack variables, 0 constants, and 8 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * Size-2 complex DFT kernel.
 *
 * For each of the v transforms, reads two complex inputs (real parts via
 * ri, imaginary parts via ii; element 1 at WS(is, 1)) and writes their sum
 * to output 0 and their difference to output 1 (element 1 at WS(os, 1)).
 * Input pointers then advance by ivs and output pointers by ovs.
 */
static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0; --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs, MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
          E re0, re1, im0, im1;

          /* Real parts are loaded and stored before the imaginary parts are touched. */
          re0 = ri[0];
          re1 = ri[WS(is, 1)];
          ro[WS(os, 1)] = re0 - re1;
          ro[0] = re0 + re1;

          im0 = ii[0];
          im1 = ii[WS(is, 1)];
          io[WS(os, 1)] = im0 - im1;
          io[0] = im0 + im1;
     }
}
/*
 * Descriptor passed to the planner together with the n1_2 kernel.  The
 * first three numbers of the nested initializer equal the add / multiply /
 * fused multiply-add counts quoted in the generator's summary comment for
 * this variant.
 */
static const kdft_desc desc = {
     2,
     "n1_2",
     { 4, 0, 0, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_2 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_2) (planner *p)
{
     X(kdft_register) (p, n1_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 2 -name n1_2 -include dft/scalar/n.h */
/*
* This function contains 4 FP additions, 0 FP multiplications,
* (or, 4 additions, 0 multiplications, 0 fused multiply/add),
* 5 stack variables, 0 constants, and 8 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * Size-2 complex DFT kernel (variant generated without -fma; the arithmetic
 * is additions and subtractions only).
 *
 * For each of the v transforms, reads two complex inputs (real parts via
 * ri, imaginary parts via ii; element 1 at WS(is, 1)) and writes their sum
 * to output 0 and their difference to output 1 (element 1 at WS(os, 1)).
 * Input pointers then advance by ivs and output pointers by ovs.
 */
static void n1_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0; --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs, MAKE_VOLATILE_STRIDE(8, is), MAKE_VOLATILE_STRIDE(8, os)) {
          E re0, re1, im0, im1;

          /* Real parts are loaded and stored before the imaginary parts are touched. */
          re0 = ri[0];
          re1 = ri[WS(is, 1)];
          ro[WS(os, 1)] = re0 - re1;
          ro[0] = re0 + re1;

          im0 = ii[0];
          im1 = ii[WS(is, 1)];
          io[WS(os, 1)] = im0 - im1;
          io[0] = im0 + im1;
     }
}
/*
 * Descriptor passed to the planner together with the n1_2 kernel.  The
 * first three numbers of the nested initializer equal the add / multiply /
 * fused multiply-add counts quoted in the generator's summary comment for
 * this variant.
 */
static const kdft_desc desc = {
     2,
     "n1_2",
     { 4, 0, 0, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_2 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_2) (planner *p)
{
     X(kdft_register) (p, n1_2, &desc);
}
#endif

View File

@@ -0,0 +1,718 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include dft/scalar/n.h */
/*
* This function contains 208 FP additions, 72 FP multiplications,
* (or, 136 additions, 0 multiplications, 72 fused multiply/add),
* 81 stack variables, 4 constants, and 80 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * Size-20 complex DFT kernel (the variant generated with -fma).
 *
 * ri/ii: real and imaginary input arrays; element k is read at WS(is, k).
 * ro/io: real and imaginary output arrays; element k is written at WS(os, k).
 * v:     number of transforms performed; after each one the input pointers
 *        advance by ivs and the output pointers by ovs.
 *
 * Each iteration loads all 40 input values into locals before the first
 * store to ro/io.
 */
static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Constants: numerically sin(2*pi/5), sqrt(5)/4, (sqrt(5)-1)/2 and 1/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT i;
/* One pass per transform; the increment clause also steps all four data pointers. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
E T7, T2N, T3b, TD, TP, T1R, T2f, T1d, Tt, TA, TB, T2w, T2z, T2P, T35;
E T36, T3d, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1T, T29, T2a, T2h, T1h;
E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2O, T32, T33, T3c, TE, TF, TG, TU;
E TZ, T10, T1D, T1I, T1S, T26, T27, T2g, T1e, T1f, T1g;
/* Inputs 0, 10, 5, 15: pairwise sums/differences, then combined once more. */
{
E T3, T1N, TN, T2L, T6, TO, T1Q, T2M;
{
E T1, T2, TL, TM;
T1 = ri[0];
T2 = ri[WS(is, 10)];
T3 = T1 + T2;
T1N = T1 - T2;
TL = ii[0];
TM = ii[WS(is, 10)];
TN = TL - TM;
T2L = TL + TM;
}
{
E T4, T5, T1O, T1P;
T4 = ri[WS(is, 5)];
T5 = ri[WS(is, 15)];
T6 = T4 + T5;
TO = T4 - T5;
T1O = ii[WS(is, 5)];
T1P = ii[WS(is, 15)];
T1Q = T1O - T1P;
T2M = T1O + T1P;
}
T7 = T3 - T6;
T2N = T2L - T2M;
T3b = T2L + T2M;
TD = T3 + T6;
TP = TN - TO;
T1R = T1N - T1Q;
T2f = T1N + T1Q;
T1d = TO + TN;
}
/* Inputs 8, 18, 13, 3 and 12, 2, 17, 7. */
{
E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
E T2y;
{
E Tn, To, T11, T12;
Tn = ri[WS(is, 8)];
To = ri[WS(is, 18)];
Tp = Tn + To;
T1o = Tn - To;
T11 = ii[WS(is, 8)];
T12 = ii[WS(is, 18)];
T13 = T11 - T12;
T2u = T11 + T12;
}
{
E Tq, Tr, T1p, T1q;
Tq = ri[WS(is, 13)];
Tr = ri[WS(is, 3)];
Ts = Tq + Tr;
T14 = Tq - Tr;
T1p = ii[WS(is, 13)];
T1q = ii[WS(is, 3)];
T1r = T1p - T1q;
T2v = T1p + T1q;
}
{
E Tu, Tv, T16, T17;
Tu = ri[WS(is, 12)];
Tv = ri[WS(is, 2)];
Tw = Tu + Tv;
T1t = Tu - Tv;
T16 = ii[WS(is, 12)];
T17 = ii[WS(is, 2)];
T18 = T16 - T17;
T2x = T16 + T17;
}
{
E Tx, Ty, T1u, T1v;
Tx = ri[WS(is, 17)];
Ty = ri[WS(is, 7)];
Tz = Tx + Ty;
T19 = Tx - Ty;
T1u = ii[WS(is, 17)];
T1v = ii[WS(is, 7)];
T1w = T1u - T1v;
T2y = T1u + T1v;
}
Tt = Tp - Ts;
TA = Tw - Tz;
TB = Tt + TA;
T2w = T2u - T2v;
T2z = T2x - T2y;
T2P = T2w + T2z;
T35 = T2u + T2v;
T36 = T2x + T2y;
T3d = T35 + T36;
TH = Tp + Ts;
TI = Tw + Tz;
TJ = TH + TI;
T15 = T13 - T14;
T1a = T18 - T19;
T1b = T15 + T1a;
T1s = T1o - T1r;
T1x = T1t - T1w;
T1T = T1s + T1x;
T29 = T1o + T1r;
T2a = T1t + T1w;
T2h = T29 + T2a;
T1h = T14 + T13;
T1i = T19 + T18;
T1j = T1h + T1i;
}
/* Inputs 4, 14, 9, 19 and 16, 6, 1, 11. */
{
E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
E T2F;
{
E T8, T9, TQ, TR;
T8 = ri[WS(is, 4)];
T9 = ri[WS(is, 14)];
Ta = T8 + T9;
T1z = T8 - T9;
TQ = ii[WS(is, 4)];
TR = ii[WS(is, 14)];
TS = TQ - TR;
T2B = TQ + TR;
}
{
E Tb, Tc, T1A, T1B;
Tb = ri[WS(is, 9)];
Tc = ri[WS(is, 19)];
Td = Tb + Tc;
TT = Tb - Tc;
T1A = ii[WS(is, 9)];
T1B = ii[WS(is, 19)];
T1C = T1A - T1B;
T2C = T1A + T1B;
}
{
E Tf, Tg, TV, TW;
Tf = ri[WS(is, 16)];
Tg = ri[WS(is, 6)];
Th = Tf + Tg;
T1E = Tf - Tg;
TV = ii[WS(is, 16)];
TW = ii[WS(is, 6)];
TX = TV - TW;
T2E = TV + TW;
}
{
E Ti, Tj, T1F, T1G;
Ti = ri[WS(is, 1)];
Tj = ri[WS(is, 11)];
Tk = Ti + Tj;
TY = Ti - Tj;
T1F = ii[WS(is, 1)];
T1G = ii[WS(is, 11)];
T1H = T1F - T1G;
T2F = T1F + T1G;
}
Te = Ta - Td;
Tl = Th - Tk;
Tm = Te + Tl;
T2D = T2B - T2C;
T2G = T2E - T2F;
T2O = T2D + T2G;
T32 = T2B + T2C;
T33 = T2E + T2F;
T3c = T32 + T33;
TE = Ta + Td;
TF = Th + Tk;
TG = TE + TF;
TU = TS - TT;
TZ = TX - TY;
T10 = TU + TZ;
T1D = T1z - T1C;
T1I = T1E - T1H;
T1S = T1D + T1I;
T26 = T1z + T1C;
T27 = T1E + T1H;
T2g = T26 + T27;
T1e = TT + TS;
T1f = TY + TX;
T1g = T1e + T1f;
}
/* Real outputs 10, 14, 6, 2, 18. */
{
E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
T2s = Tm - TB;
TC = Tm + TB;
T2r = FNMS(KP250000000, TC, T7);
T2A = T2w - T2z;
T2H = T2D - T2G;
T2I = FNMS(KP618033988, T2H, T2A);
T2K = FMA(KP618033988, T2A, T2H);
ro[WS(os, 10)] = T7 + TC;
T2J = FMA(KP559016994, T2s, T2r);
ro[WS(os, 14)] = FNMS(KP951056516, T2K, T2J);
ro[WS(os, 6)] = FMA(KP951056516, T2K, T2J);
T2t = FNMS(KP559016994, T2s, T2r);
ro[WS(os, 2)] = FNMS(KP951056516, T2I, T2t);
ro[WS(os, 18)] = FMA(KP951056516, T2I, T2t);
}
/* Imaginary outputs 10, 6, 14, 2, 18. */
{
E T2S, T2Q, T2R, T2W, T2Y, T2U, T2V, T2X, T2T;
T2S = T2O - T2P;
T2Q = T2O + T2P;
T2R = FNMS(KP250000000, T2Q, T2N);
T2U = Tt - TA;
T2V = Te - Tl;
T2W = FNMS(KP618033988, T2V, T2U);
T2Y = FMA(KP618033988, T2U, T2V);
io[WS(os, 10)] = T2N + T2Q;
T2X = FMA(KP559016994, T2S, T2R);
io[WS(os, 6)] = FNMS(KP951056516, T2Y, T2X);
io[WS(os, 14)] = FMA(KP951056516, T2Y, T2X);
T2T = FNMS(KP559016994, T2S, T2R);
io[WS(os, 2)] = FMA(KP951056516, T2W, T2T);
io[WS(os, 18)] = FNMS(KP951056516, T2W, T2T);
}
/* Real outputs 0, 12, 8, 4, 16. */
{
E T30, TK, T2Z, T38, T3a, T34, T37, T39, T31;
T30 = TG - TJ;
TK = TG + TJ;
T2Z = FNMS(KP250000000, TK, TD);
T34 = T32 - T33;
T37 = T35 - T36;
T38 = FMA(KP618033988, T37, T34);
T3a = FNMS(KP618033988, T34, T37);
ro[0] = TD + TK;
T39 = FNMS(KP559016994, T30, T2Z);
ro[WS(os, 12)] = FNMS(KP951056516, T3a, T39);
ro[WS(os, 8)] = FMA(KP951056516, T3a, T39);
T31 = FMA(KP559016994, T30, T2Z);
ro[WS(os, 4)] = FNMS(KP951056516, T38, T31);
ro[WS(os, 16)] = FMA(KP951056516, T38, T31);
}
/* Imaginary outputs 0, 8, 12, 4, 16. */
{
E T3g, T3e, T3f, T3k, T3m, T3i, T3j, T3l, T3h;
T3g = T3c - T3d;
T3e = T3c + T3d;
T3f = FNMS(KP250000000, T3e, T3b);
T3i = TE - TF;
T3j = TH - TI;
T3k = FMA(KP618033988, T3j, T3i);
T3m = FNMS(KP618033988, T3i, T3j);
io[0] = T3b + T3e;
T3l = FNMS(KP559016994, T3g, T3f);
io[WS(os, 8)] = FNMS(KP951056516, T3m, T3l);
io[WS(os, 12)] = FMA(KP951056516, T3m, T3l);
T3h = FMA(KP559016994, T3g, T3f);
io[WS(os, 4)] = FMA(KP951056516, T3k, T3h);
io[WS(os, 16)] = FNMS(KP951056516, T3k, T3h);
}
/* Imaginary outputs 5, 13, 17, 1, 9. */
{
E T24, T1c, T23, T2c, T2e, T28, T2b, T2d, T25;
T24 = T10 - T1b;
T1c = T10 + T1b;
T23 = FNMS(KP250000000, T1c, TP);
T28 = T26 - T27;
T2b = T29 - T2a;
T2c = FMA(KP618033988, T2b, T28);
T2e = FNMS(KP618033988, T28, T2b);
io[WS(os, 5)] = TP + T1c;
T2d = FNMS(KP559016994, T24, T23);
io[WS(os, 13)] = FNMS(KP951056516, T2e, T2d);
io[WS(os, 17)] = FMA(KP951056516, T2e, T2d);
T25 = FMA(KP559016994, T24, T23);
io[WS(os, 1)] = FNMS(KP951056516, T2c, T25);
io[WS(os, 9)] = FMA(KP951056516, T2c, T25);
}
/* Real outputs 5, 13, 17, 1, 9. */
{
E T2k, T2i, T2j, T2o, T2q, T2m, T2n, T2p, T2l;
T2k = T2g - T2h;
T2i = T2g + T2h;
T2j = FNMS(KP250000000, T2i, T2f);
T2m = TU - TZ;
T2n = T15 - T1a;
T2o = FMA(KP618033988, T2n, T2m);
T2q = FNMS(KP618033988, T2m, T2n);
ro[WS(os, 5)] = T2f + T2i;
T2p = FNMS(KP559016994, T2k, T2j);
ro[WS(os, 13)] = FMA(KP951056516, T2q, T2p);
ro[WS(os, 17)] = FNMS(KP951056516, T2q, T2p);
T2l = FMA(KP559016994, T2k, T2j);
ro[WS(os, 1)] = FMA(KP951056516, T2o, T2l);
ro[WS(os, 9)] = FNMS(KP951056516, T2o, T2l);
}
/* Imaginary outputs 15, 11, 19, 3, 7. */
{
E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
T1m = T1g - T1j;
T1k = T1g + T1j;
T1l = FNMS(KP250000000, T1k, T1d);
T1y = T1s - T1x;
T1J = T1D - T1I;
T1K = FNMS(KP618033988, T1J, T1y);
T1M = FMA(KP618033988, T1y, T1J);
io[WS(os, 15)] = T1d + T1k;
T1L = FMA(KP559016994, T1m, T1l);
io[WS(os, 11)] = FNMS(KP951056516, T1M, T1L);
io[WS(os, 19)] = FMA(KP951056516, T1M, T1L);
T1n = FNMS(KP559016994, T1m, T1l);
io[WS(os, 3)] = FNMS(KP951056516, T1K, T1n);
io[WS(os, 7)] = FMA(KP951056516, T1K, T1n);
}
/* Real outputs 15, 11, 19, 3, 7. */
{
E T1W, T1U, T1V, T20, T22, T1Y, T1Z, T21, T1X;
T1W = T1S - T1T;
T1U = T1S + T1T;
T1V = FNMS(KP250000000, T1U, T1R);
T1Y = T1h - T1i;
T1Z = T1e - T1f;
T20 = FNMS(KP618033988, T1Z, T1Y);
T22 = FMA(KP618033988, T1Y, T1Z);
ro[WS(os, 15)] = T1R + T1U;
T21 = FMA(KP559016994, T1W, T1V);
ro[WS(os, 11)] = FMA(KP951056516, T22, T21);
ro[WS(os, 19)] = FNMS(KP951056516, T22, T21);
T1X = FNMS(KP559016994, T1W, T1V);
ro[WS(os, 3)] = FMA(KP951056516, T20, T1X);
ro[WS(os, 7)] = FNMS(KP951056516, T20, T1X);
}
}
}
}
/*
 * Descriptor passed to the planner together with the n1_20 kernel.  The
 * first three numbers of the nested initializer equal the add / multiply /
 * fused multiply-add counts quoted in the generator's summary comment for
 * this variant.
 */
static const kdft_desc desc = {
     20,
     "n1_20",
     { 136, 0, 72, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_20 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_20) (planner *p)
{
     X(kdft_register) (p, n1_20, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 20 -name n1_20 -include dft/scalar/n.h */
/*
* This function contains 208 FP additions, 48 FP multiplications,
* (or, 184 additions, 24 multiplications, 24 fused multiply/add),
* 81 stack variables, 4 constants, and 80 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * Size-20 complex DFT kernel (the variant generated without -fma).
 *
 * ri/ii: real and imaginary input arrays; element k is read at WS(is, k).
 * ro/io: real and imaginary output arrays; element k is written at WS(os, k).
 * v:     number of transforms performed; after each one the input pointers
 *        advance by ivs and the output pointers by ovs.
 *
 * Each iteration loads all 40 input values into locals before the first
 * store to ro/io.
 */
static void n1_20(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Constants: numerically sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT i;
/* One pass per transform; the increment clause also steps all four data pointers. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(80, is), MAKE_VOLATILE_STRIDE(80, os)) {
E T7, T2Q, T3h, TD, TP, T1U, T2l, T1d, Tt, TA, TB, T2w, T2z, T2S, T35;
E T36, T3f, TH, TI, TJ, T15, T1a, T1b, T1s, T1x, T1W, T29, T2a, T2j, T1h;
E T1i, T1j, Te, Tl, Tm, T2D, T2G, T2R, T32, T33, T3e, TE, TF, TG, TU;
E TZ, T10, T1D, T1I, T1V, T26, T27, T2i, T1e, T1f, T1g;
/* Inputs 0, 10, 5, 15: pairwise sums/differences, then combined once more. */
{
E T3, T1Q, TN, T2O, T6, TO, T1T, T2P;
{
E T1, T2, TL, TM;
T1 = ri[0];
T2 = ri[WS(is, 10)];
T3 = T1 + T2;
T1Q = T1 - T2;
TL = ii[0];
TM = ii[WS(is, 10)];
TN = TL - TM;
T2O = TL + TM;
}
{
E T4, T5, T1R, T1S;
T4 = ri[WS(is, 5)];
T5 = ri[WS(is, 15)];
T6 = T4 + T5;
TO = T4 - T5;
T1R = ii[WS(is, 5)];
T1S = ii[WS(is, 15)];
T1T = T1R - T1S;
T2P = T1R + T1S;
}
T7 = T3 - T6;
T2Q = T2O - T2P;
T3h = T2O + T2P;
TD = T3 + T6;
TP = TN - TO;
T1U = T1Q - T1T;
T2l = T1Q + T1T;
T1d = TO + TN;
}
/* Inputs 8, 18, 13, 3 and 12, 2, 17, 7. */
{
E Tp, T1o, T13, T2u, Ts, T14, T1r, T2v, Tw, T1t, T18, T2x, Tz, T19, T1w;
E T2y;
{
E Tn, To, T11, T12;
Tn = ri[WS(is, 8)];
To = ri[WS(is, 18)];
Tp = Tn + To;
T1o = Tn - To;
T11 = ii[WS(is, 8)];
T12 = ii[WS(is, 18)];
T13 = T11 - T12;
T2u = T11 + T12;
}
{
E Tq, Tr, T1p, T1q;
Tq = ri[WS(is, 13)];
Tr = ri[WS(is, 3)];
Ts = Tq + Tr;
T14 = Tq - Tr;
T1p = ii[WS(is, 13)];
T1q = ii[WS(is, 3)];
T1r = T1p - T1q;
T2v = T1p + T1q;
}
{
E Tu, Tv, T16, T17;
Tu = ri[WS(is, 12)];
Tv = ri[WS(is, 2)];
Tw = Tu + Tv;
T1t = Tu - Tv;
T16 = ii[WS(is, 12)];
T17 = ii[WS(is, 2)];
T18 = T16 - T17;
T2x = T16 + T17;
}
{
E Tx, Ty, T1u, T1v;
Tx = ri[WS(is, 17)];
Ty = ri[WS(is, 7)];
Tz = Tx + Ty;
T19 = Tx - Ty;
T1u = ii[WS(is, 17)];
T1v = ii[WS(is, 7)];
T1w = T1u - T1v;
T2y = T1u + T1v;
}
Tt = Tp - Ts;
TA = Tw - Tz;
TB = Tt + TA;
T2w = T2u - T2v;
T2z = T2x - T2y;
T2S = T2w + T2z;
T35 = T2u + T2v;
T36 = T2x + T2y;
T3f = T35 + T36;
TH = Tp + Ts;
TI = Tw + Tz;
TJ = TH + TI;
T15 = T13 - T14;
T1a = T18 - T19;
T1b = T15 + T1a;
T1s = T1o - T1r;
T1x = T1t - T1w;
T1W = T1s + T1x;
T29 = T1o + T1r;
T2a = T1t + T1w;
T2j = T29 + T2a;
T1h = T14 + T13;
T1i = T19 + T18;
T1j = T1h + T1i;
}
/* Inputs 4, 14, 9, 19 and 16, 6, 1, 11. */
{
E Ta, T1z, TS, T2B, Td, TT, T1C, T2C, Th, T1E, TX, T2E, Tk, TY, T1H;
E T2F;
{
E T8, T9, TQ, TR;
T8 = ri[WS(is, 4)];
T9 = ri[WS(is, 14)];
Ta = T8 + T9;
T1z = T8 - T9;
TQ = ii[WS(is, 4)];
TR = ii[WS(is, 14)];
TS = TQ - TR;
T2B = TQ + TR;
}
{
E Tb, Tc, T1A, T1B;
Tb = ri[WS(is, 9)];
Tc = ri[WS(is, 19)];
Td = Tb + Tc;
TT = Tb - Tc;
T1A = ii[WS(is, 9)];
T1B = ii[WS(is, 19)];
T1C = T1A - T1B;
T2C = T1A + T1B;
}
{
E Tf, Tg, TV, TW;
Tf = ri[WS(is, 16)];
Tg = ri[WS(is, 6)];
Th = Tf + Tg;
T1E = Tf - Tg;
TV = ii[WS(is, 16)];
TW = ii[WS(is, 6)];
TX = TV - TW;
T2E = TV + TW;
}
{
E Ti, Tj, T1F, T1G;
Ti = ri[WS(is, 1)];
Tj = ri[WS(is, 11)];
Tk = Ti + Tj;
TY = Ti - Tj;
T1F = ii[WS(is, 1)];
T1G = ii[WS(is, 11)];
T1H = T1F - T1G;
T2F = T1F + T1G;
}
Te = Ta - Td;
Tl = Th - Tk;
Tm = Te + Tl;
T2D = T2B - T2C;
T2G = T2E - T2F;
T2R = T2D + T2G;
T32 = T2B + T2C;
T33 = T2E + T2F;
T3e = T32 + T33;
TE = Ta + Td;
TF = Th + Tk;
TG = TE + TF;
TU = TS - TT;
TZ = TX - TY;
T10 = TU + TZ;
T1D = T1z - T1C;
T1I = T1E - T1H;
T1V = T1D + T1I;
T26 = T1z + T1C;
T27 = T1E + T1H;
T2i = T26 + T27;
T1e = TT + TS;
T1f = TY + TX;
T1g = T1e + T1f;
}
/* Real outputs 10, 14, 6, 2, 18. */
{
E T2s, TC, T2r, T2I, T2K, T2A, T2H, T2J, T2t;
T2s = KP559016994 * (Tm - TB);
TC = Tm + TB;
T2r = FNMS(KP250000000, TC, T7);
T2A = T2w - T2z;
T2H = T2D - T2G;
T2I = FNMS(KP587785252, T2H, KP951056516 * T2A);
T2K = FMA(KP951056516, T2H, KP587785252 * T2A);
ro[WS(os, 10)] = T7 + TC;
T2J = T2s + T2r;
ro[WS(os, 14)] = T2J - T2K;
ro[WS(os, 6)] = T2J + T2K;
T2t = T2r - T2s;
ro[WS(os, 2)] = T2t - T2I;
ro[WS(os, 18)] = T2t + T2I;
}
/* Imaginary outputs 10, 6, 14, 2, 18. */
{
E T2V, T2T, T2U, T2N, T2Y, T2L, T2M, T2X, T2W;
T2V = KP559016994 * (T2R - T2S);
T2T = T2R + T2S;
T2U = FNMS(KP250000000, T2T, T2Q);
T2L = Tt - TA;
T2M = Te - Tl;
T2N = FNMS(KP587785252, T2M, KP951056516 * T2L);
T2Y = FMA(KP951056516, T2M, KP587785252 * T2L);
io[WS(os, 10)] = T2Q + T2T;
T2X = T2V + T2U;
io[WS(os, 6)] = T2X - T2Y;
io[WS(os, 14)] = T2Y + T2X;
T2W = T2U - T2V;
io[WS(os, 2)] = T2N + T2W;
io[WS(os, 18)] = T2W - T2N;
}
/* Real outputs 0, 12, 8, 4, 16. */
{
E T2Z, TK, T30, T38, T3a, T34, T37, T39, T31;
T2Z = KP559016994 * (TG - TJ);
TK = TG + TJ;
T30 = FNMS(KP250000000, TK, TD);
T34 = T32 - T33;
T37 = T35 - T36;
T38 = FMA(KP951056516, T34, KP587785252 * T37);
T3a = FNMS(KP587785252, T34, KP951056516 * T37);
ro[0] = TD + TK;
T39 = T30 - T2Z;
ro[WS(os, 12)] = T39 - T3a;
ro[WS(os, 8)] = T39 + T3a;
T31 = T2Z + T30;
ro[WS(os, 4)] = T31 - T38;
ro[WS(os, 16)] = T31 + T38;
}
/* Imaginary outputs 0, 8, 12, 4, 16. */
{
E T3g, T3i, T3j, T3d, T3m, T3b, T3c, T3l, T3k;
T3g = KP559016994 * (T3e - T3f);
T3i = T3e + T3f;
T3j = FNMS(KP250000000, T3i, T3h);
T3b = TE - TF;
T3c = TH - TI;
T3d = FMA(KP951056516, T3b, KP587785252 * T3c);
T3m = FNMS(KP587785252, T3b, KP951056516 * T3c);
io[0] = T3h + T3i;
T3l = T3j - T3g;
io[WS(os, 8)] = T3l - T3m;
io[WS(os, 12)] = T3m + T3l;
T3k = T3g + T3j;
io[WS(os, 4)] = T3d + T3k;
io[WS(os, 16)] = T3k - T3d;
}
/* Imaginary outputs 5, 13, 17, 1, 9. */
{
E T23, T1c, T24, T2c, T2e, T28, T2b, T2d, T25;
T23 = KP559016994 * (T10 - T1b);
T1c = T10 + T1b;
T24 = FNMS(KP250000000, T1c, TP);
T28 = T26 - T27;
T2b = T29 - T2a;
T2c = FMA(KP951056516, T28, KP587785252 * T2b);
T2e = FNMS(KP587785252, T28, KP951056516 * T2b);
io[WS(os, 5)] = TP + T1c;
T2d = T24 - T23;
io[WS(os, 13)] = T2d - T2e;
io[WS(os, 17)] = T2d + T2e;
T25 = T23 + T24;
io[WS(os, 1)] = T25 - T2c;
io[WS(os, 9)] = T25 + T2c;
}
/* Real outputs 5, 13, 17, 1, 9. */
{
E T2k, T2m, T2n, T2h, T2p, T2f, T2g, T2q, T2o;
T2k = KP559016994 * (T2i - T2j);
T2m = T2i + T2j;
T2n = FNMS(KP250000000, T2m, T2l);
T2f = TU - TZ;
T2g = T15 - T1a;
T2h = FMA(KP951056516, T2f, KP587785252 * T2g);
T2p = FNMS(KP587785252, T2f, KP951056516 * T2g);
ro[WS(os, 5)] = T2l + T2m;
T2q = T2n - T2k;
ro[WS(os, 13)] = T2p + T2q;
ro[WS(os, 17)] = T2q - T2p;
T2o = T2k + T2n;
ro[WS(os, 1)] = T2h + T2o;
ro[WS(os, 9)] = T2o - T2h;
}
/* Imaginary outputs 15, 11, 19, 3, 7. */
{
E T1m, T1k, T1l, T1K, T1M, T1y, T1J, T1L, T1n;
T1m = KP559016994 * (T1g - T1j);
T1k = T1g + T1j;
T1l = FNMS(KP250000000, T1k, T1d);
T1y = T1s - T1x;
T1J = T1D - T1I;
T1K = FNMS(KP587785252, T1J, KP951056516 * T1y);
T1M = FMA(KP951056516, T1J, KP587785252 * T1y);
io[WS(os, 15)] = T1d + T1k;
T1L = T1m + T1l;
io[WS(os, 11)] = T1L - T1M;
io[WS(os, 19)] = T1L + T1M;
T1n = T1l - T1m;
io[WS(os, 3)] = T1n - T1K;
io[WS(os, 7)] = T1n + T1K;
}
/* Real outputs 15, 11, 19, 3, 7. */
{
E T1Z, T1X, T1Y, T1P, T21, T1N, T1O, T22, T20;
T1Z = KP559016994 * (T1V - T1W);
T1X = T1V + T1W;
T1Y = FNMS(KP250000000, T1X, T1U);
T1N = T1h - T1i;
T1O = T1e - T1f;
T1P = FNMS(KP587785252, T1O, KP951056516 * T1N);
T21 = FMA(KP951056516, T1O, KP587785252 * T1N);
ro[WS(os, 15)] = T1U + T1X;
T22 = T1Z + T1Y;
ro[WS(os, 11)] = T21 + T22;
ro[WS(os, 19)] = T22 - T21;
T20 = T1Y - T1Z;
ro[WS(os, 3)] = T1P + T20;
ro[WS(os, 7)] = T20 - T1P;
}
}
}
}
/*
 * Descriptor passed to the planner together with the n1_20 kernel.  The
 * first three numbers of the nested initializer equal the add / multiply /
 * fused multiply-add counts quoted in the generator's summary comment for
 * this variant.
 */
static const kdft_desc desc = {
     20,
     "n1_20",
     { 184, 24, 24, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_20 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_20) (planner *p)
{
     X(kdft_register) (p, n1_20, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,124 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include dft/scalar/n.h */
/*
* This function contains 12 FP additions, 6 FP multiplications,
* (or, 6 additions, 0 multiplications, 6 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * Size-3 complex DFT kernel (the variant generated with -fma).
 *
 * For each of the v transforms, reads three complex inputs (real parts via
 * ri, imaginary parts via ii; element k at WS(is, k)) and writes three
 * complex outputs via ro/io at WS(os, k).  Input pointers then advance by
 * ivs and output pointers by ovs.
 */
static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     /* Constants: numerically sqrt(3)/2 and 1/2. */
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT remaining;

          for (remaining = v; remaining > 0; --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs, MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
               E re0, re1, re2, im0, im1, im2;
               E re_sum, re_diff, im_sum, im_diff;
               E re_base, im_base;

               /* All six loads come before the first store. */
               re0 = ri[0];
               im0 = ii[0];
               re1 = ri[WS(is, 1)];
               re2 = ri[WS(is, 2)];
               im1 = ii[WS(is, 1)];
               im2 = ii[WS(is, 2)];

               re_sum = re1 + re2;
               re_diff = re2 - re1;
               im_diff = im1 - im2;
               im_sum = im1 + im2;

               /* Output 0 is the plain sum of the three inputs. */
               ro[0] = re0 + re_sum;
               io[0] = im0 + im_sum;

               /* Outputs 1 and 2 share a base term and differ in the sign of the scaled difference. */
               re_base = FNMS(KP500000000, re_sum, re0);
               ro[WS(os, 2)] = FNMS(KP866025403, im_diff, re_base);
               ro[WS(os, 1)] = FMA(KP866025403, im_diff, re_base);

               im_base = FNMS(KP500000000, im_sum, im0);
               io[WS(os, 1)] = FMA(KP866025403, re_diff, im_base);
               io[WS(os, 2)] = FNMS(KP866025403, re_diff, im_base);
          }
     }
}
/*
 * Descriptor passed to the planner together with the n1_3 kernel.  The
 * first three numbers of the nested initializer equal the add / multiply /
 * fused multiply-add counts quoted in the generator's summary comment for
 * this variant.
 */
static const kdft_desc desc = {
     3,
     "n1_3",
     { 6, 0, 6, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_3 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_3) (planner *p)
{
     X(kdft_register) (p, n1_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 3 -name n1_3 -include dft/scalar/n.h */
/*
* This function contains 12 FP additions, 4 FP multiplications,
* (or, 10 additions, 2 multiplications, 2 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * Size-3 complex DFT kernel (the variant generated without -fma).
 *
 * For each of the v transforms, reads three complex inputs (real parts via
 * ri, imaginary parts via ii; element k at WS(is, k)) and writes three
 * complex outputs via ro/io at WS(os, k).  Input pointers then advance by
 * ivs and output pointers by ovs.
 */
static void n1_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     /* Constants: numerically 1/2 and sqrt(3)/2. */
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT remaining;

          for (remaining = v; remaining > 0; --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs, MAKE_VOLATILE_STRIDE(12, is), MAKE_VOLATILE_STRIDE(12, os)) {
               E re0, re1, re2, im0, im1, im2;
               E re_sum, im_sum;
               E scaled_re_diff, scaled_im_diff;
               E re_base, im_base;

               /* All six loads come before the first store. */
               re0 = ri[0];
               im0 = ii[0];
               re1 = ri[WS(is, 1)];
               re2 = ri[WS(is, 2)];
               im1 = ii[WS(is, 1)];
               im2 = ii[WS(is, 2)];

               re_sum = re1 + re2;
               scaled_re_diff = KP866025403 * (re2 - re1);
               scaled_im_diff = KP866025403 * (im1 - im2);
               im_sum = im1 + im2;

               /* Output 0 is the plain sum of the three inputs. */
               ro[0] = re0 + re_sum;
               io[0] = im0 + im_sum;

               /* Outputs 1 and 2 share a base term and differ in the sign of the scaled difference. */
               re_base = FNMS(KP500000000, re_sum, re0);
               ro[WS(os, 2)] = re_base - scaled_im_diff;
               ro[WS(os, 1)] = re_base + scaled_im_diff;

               im_base = FNMS(KP500000000, im_sum, im0);
               io[WS(os, 1)] = scaled_re_diff + im_base;
               io[WS(os, 2)] = im_base - scaled_re_diff;
          }
     }
}
/*
 * Descriptor passed to the planner together with the n1_3 kernel.  The
 * first three numbers of the nested initializer equal the add / multiply /
 * fused multiply-add counts quoted in the generator's summary comment for
 * this variant.
 */
static const kdft_desc desc = {
     3,
     "n1_3",
     { 10, 2, 2, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_3 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_3) (planner *p)
{
     X(kdft_register) (p, n1_3, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,138 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include dft/scalar/n.h */
/*
* This function contains 16 FP additions, 0 FP multiplications,
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * Size-4 complex DFT kernel.
 *
 * For each of the v transforms, reads four complex inputs (real parts via
 * ri, imaginary parts via ii; element k at WS(is, k)) and writes four
 * complex outputs via ro/io at WS(os, k), using additions and subtractions
 * only.  Input pointers then advance by ivs and output pointers by ovs.
 */
static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0; --remaining, ri += ivs, ii += ivs, ro += ovs, io += ovs, MAKE_VOLATILE_STRIDE(16, is), MAKE_VOLATILE_STRIDE(16, os)) {
          E re0, re1, re2, re3;
          E im0, im1, im2, im3;
          E re_even_sum, re_even_diff, im_even_sum, im_even_diff;
          E re_odd_sum, re_odd_diff, im_odd_sum, im_odd_diff;

          /* All eight loads come before the first store. */
          re0 = ri[0];
          re2 = ri[WS(is, 2)];
          im0 = ii[0];
          im2 = ii[WS(is, 2)];
          re1 = ri[WS(is, 1)];
          re3 = ri[WS(is, 3)];
          im1 = ii[WS(is, 1)];
          im3 = ii[WS(is, 3)];

          /* Inputs 0 and 2. */
          re_even_sum = re0 + re2;
          re_even_diff = re0 - re2;
          im_even_diff = im0 - im2;
          im_even_sum = im0 + im2;

          /* Inputs 1 and 3. */
          re_odd_sum = re1 + re3;
          re_odd_diff = re1 - re3;
          im_odd_diff = im1 - im3;
          im_odd_sum = im1 + im3;

          /* Outputs 2 and 0 combine the two sums. */
          ro[WS(os, 2)] = re_even_sum - re_odd_sum;
          io[WS(os, 2)] = im_even_sum - im_odd_sum;
          ro[0] = re_even_sum + re_odd_sum;
          io[0] = im_even_sum + im_odd_sum;

          /* Outputs 1 and 3 mix real differences with imaginary ones. */
          io[WS(os, 1)] = im_even_diff - re_odd_diff;
          ro[WS(os, 1)] = re_even_diff + im_odd_diff;
          io[WS(os, 3)] = re_odd_diff + im_even_diff;
          ro[WS(os, 3)] = re_even_diff - im_odd_diff;
     }
}
/*
 * Descriptor passed to the planner together with the n1_4 kernel.  The
 * first three numbers of the nested initializer equal the add / multiply /
 * fused multiply-add counts quoted in the generator's summary comment for
 * this variant.
 */
static const kdft_desc desc = {
     4,
     "n1_4",
     { 16, 0, 0, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands the n1_4 kernel and its descriptor to planner p via kdft_register. */
void X(codelet_n1_4) (planner *p)
{
     X(kdft_register) (p, n1_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 4 -name n1_4 -include dft/scalar/n.h */
/*
* This function contains 16 FP additions, 0 FP multiplications,
* (or, 16 additions, 0 multiplications, 0 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_4 (non-FMA build branch): hard-coded size-4 complex DFT on split
 * real/imaginary arrays. The size-4 kernel needs no multiplications, so it
 * consists of additions and subtractions only.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * Every input is loaded into a temporary before the first output is stored.
 */
static void n1_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     INT remaining;

     for (remaining = v; remaining > 0; --remaining) {
          /* Inputs 0 and 2: sum and difference of each component. */
          const E re0 = ri[0];
          const E re2 = ri[WS(is, 2)];
          const E re_sum02 = re0 + re2;
          const E re_dif02 = re0 - re2;
          const E im0 = ii[0];
          const E im2 = ii[WS(is, 2)];
          const E im_dif02 = im0 - im2;
          const E im_sum02 = im0 + im2;

          /* Inputs 1 and 3: sum and difference of each component. */
          const E re1 = ri[WS(is, 1)];
          const E re3 = ri[WS(is, 3)];
          const E re_sum13 = re1 + re3;
          const E re_dif13 = re1 - re3;
          const E im1 = ii[WS(is, 1)];
          const E im3 = ii[WS(is, 3)];
          const E im_dif13 = im1 - im3;
          const E im_sum13 = im1 + im3;

          /* Outputs 2 and 0 combine the two sums. */
          ro[WS(os, 2)] = re_sum02 - re_sum13;
          io[WS(os, 2)] = im_sum02 - im_sum13;
          ro[0] = re_sum02 + re_sum13;
          io[0] = im_sum02 + im_sum13;

          /* Outputs 1 and 3 cross the real and imaginary differences. */
          io[WS(os, 1)] = im_dif02 - re_dif13;
          ro[WS(os, 1)] = re_dif02 + im_dif13;
          io[WS(os, 3)] = re_dif13 + im_dif02;
          ro[WS(os, 3)] = re_dif02 - im_dif13;

          /* Step to the next transform in the batch. */
          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
          MAKE_VOLATILE_STRIDE(16, is);
          MAKE_VOLATILE_STRIDE(16, os);
     }
}
/*
 * Descriptor for the non-FMA n1_4 codelet: transform size 4, name "n1_4", and
 * the operation counts { 16, 0, 0, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     4,
     "n1_4",
     { 16, 0, 0, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_4 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_4)(planner *p)
{
     X(kdft_register)(p, n1_4, &desc);
}
#endif

View File

@@ -0,0 +1,194 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include dft/scalar/n.h */
/*
* This function contains 32 FP additions, 18 FP multiplications,
* (or, 14 additions, 0 multiplications, 18 fused multiply/add),
* 21 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_5 (FMA build branch): hard-coded size-5 complex DFT on split
 * real/imaginary arrays, written with FMA()/FNMS() throughout.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * Every input is loaded into a temporary before the first output is stored.
 */
static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP618033988, +0.618033988749894848204586834365638117720309180);
     INT remaining;

     for (remaining = v; remaining > 0; --remaining) {
          const E re0 = ri[0];
          const E im0 = ii[0];

          /* Real parts of inputs 1..4, paired as (1,4) and (2,3). */
          const E re1 = ri[WS(is, 1)];
          const E re4 = ri[WS(is, 4)];
          const E re_sum14 = re1 + re4;
          const E re2 = ri[WS(is, 2)];
          const E re3 = ri[WS(is, 3)];
          const E re_sum23 = re2 + re3;
          const E re_total = re_sum14 + re_sum23;
          const E re_dif23 = re2 - re3;
          const E re_spread = re_sum14 - re_sum23;
          const E re_dif14 = re1 - re4;

          /* Imaginary parts of inputs 1..4, paired the same way. */
          const E im1 = ii[WS(is, 1)];
          const E im4 = ii[WS(is, 4)];
          const E im_sum14 = im1 + im4;
          const E im2 = ii[WS(is, 2)];
          const E im3 = ii[WS(is, 3)];
          const E im_sum23 = im2 + im3;
          const E im_dif14 = im1 - im4;
          const E im_spread = im_sum14 - im_sum23;
          const E im_dif23 = im2 - im3;
          const E im_total = im_sum14 + im_sum23;

          /* Output 0 is the plain sum of all five inputs. */
          ro[0] = re0 + re_total;
          io[0] = im0 + im_total;

          /* Real outputs 1..4: pair sums of the real parts, pair differences
             of the imaginary parts. */
          {
               const E cross_a = FMA(KP618033988, im_dif23, im_dif14);
               const E cross_b = FNMS(KP618033988, im_dif14, im_dif23);
               const E base = FNMS(KP250000000, re_total, re0);
               const E base_hi = FMA(KP559016994, re_spread, base);
               const E base_lo = FNMS(KP559016994, re_spread, base);

               ro[WS(os, 4)] = FNMS(KP951056516, cross_a, base_hi);
               ro[WS(os, 3)] = FMA(KP951056516, cross_b, base_lo);
               ro[WS(os, 1)] = FMA(KP951056516, cross_a, base_hi);
               ro[WS(os, 2)] = FNMS(KP951056516, cross_b, base_lo);
          }

          /* Imaginary outputs 1..4: pair sums of the imaginary parts, pair
             differences of the real parts. */
          {
               const E cross_a = FMA(KP618033988, re_dif23, re_dif14);
               const E cross_b = FNMS(KP618033988, re_dif14, re_dif23);
               const E base = FNMS(KP250000000, im_total, im0);
               const E base_hi = FMA(KP559016994, im_spread, base);
               const E base_lo = FNMS(KP559016994, im_spread, base);

               io[WS(os, 1)] = FNMS(KP951056516, cross_a, base_hi);
               io[WS(os, 3)] = FNMS(KP951056516, cross_b, base_lo);
               io[WS(os, 4)] = FMA(KP951056516, cross_a, base_hi);
               io[WS(os, 2)] = FMA(KP951056516, cross_b, base_lo);
          }

          /* Step to the next transform in the batch. */
          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
          MAKE_VOLATILE_STRIDE(20, is);
          MAKE_VOLATILE_STRIDE(20, os);
     }
}
/*
 * Descriptor for the FMA n1_5 codelet: transform size 5, name "n1_5", and the
 * operation counts { 14, 0, 18, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     5,
     "n1_5",
     { 14, 0, 18, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_5 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_5)(planner *p)
{
     X(kdft_register)(p, n1_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 5 -name n1_5 -include dft/scalar/n.h */
/*
* This function contains 32 FP additions, 12 FP multiplications,
* (or, 26 additions, 6 multiplications, 6 fused multiply/add),
* 21 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_5 (non-FMA build branch): hard-coded size-5 complex DFT on split
 * real/imaginary arrays.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * Every input is loaded into a temporary before the first output is stored.
 */
static void n1_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DK(KP559016994, +0.559016994374947424102293417182819058860154590);
     INT remaining;

     for (remaining = v; remaining > 0; --remaining) {
          const E re0 = ri[0];
          const E im0 = ii[0];

          /* Real parts of inputs 1..4, paired as (1,4) and (2,3). */
          const E re1 = ri[WS(is, 1)];
          const E re4 = ri[WS(is, 4)];
          const E re_sum14 = re1 + re4;
          const E re2 = ri[WS(is, 2)];
          const E re3 = ri[WS(is, 3)];
          const E re_sum23 = re2 + re3;
          const E re_total = re_sum14 + re_sum23;
          const E re_dif23 = re2 - re3;
          const E re_spread = KP559016994 * (re_sum14 - re_sum23);
          const E re_dif14 = re1 - re4;

          /* Imaginary parts of inputs 1..4, paired the same way. */
          const E im1 = ii[WS(is, 1)];
          const E im4 = ii[WS(is, 4)];
          const E im_sum14 = im1 + im4;
          const E im2 = ii[WS(is, 2)];
          const E im3 = ii[WS(is, 3)];
          const E im_sum23 = im2 + im3;
          const E im_dif14 = im1 - im4;
          const E im_total = im_sum14 + im_sum23;
          const E im_dif23 = im2 - im3;
          const E im_spread = KP559016994 * (im_sum14 - im_sum23);

          /* Output 0 is the plain sum of all five inputs. */
          ro[0] = re0 + re_total;
          io[0] = im0 + im_total;

          /* Real outputs 1..4: pair sums of the real parts, pair differences
             of the imaginary parts. */
          {
               const E cross_a = FMA(KP951056516, im_dif14, KP587785252 * im_dif23);
               const E cross_b = FNMS(KP587785252, im_dif14, KP951056516 * im_dif23);
               const E base = FNMS(KP250000000, re_total, re0);
               const E base_hi = re_spread + base;
               const E base_lo = base - re_spread;

               ro[WS(os, 4)] = base_hi - cross_a;
               ro[WS(os, 3)] = base_lo + cross_b;
               ro[WS(os, 1)] = base_hi + cross_a;
               ro[WS(os, 2)] = base_lo - cross_b;
          }

          /* Imaginary outputs 1..4: pair sums of the imaginary parts, pair
             differences of the real parts. */
          {
               const E cross_a = FMA(KP951056516, re_dif14, KP587785252 * re_dif23);
               const E cross_b = FNMS(KP587785252, re_dif14, KP951056516 * re_dif23);
               const E base = FNMS(KP250000000, im_total, im0);
               const E base_hi = im_spread + base;
               const E base_lo = base - im_spread;

               io[WS(os, 1)] = base_hi - cross_a;
               io[WS(os, 3)] = base_lo - cross_b;
               io[WS(os, 4)] = cross_a + base_hi;
               io[WS(os, 2)] = cross_b + base_lo;
          }

          /* Step to the next transform in the batch. */
          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
          MAKE_VOLATILE_STRIDE(20, is);
          MAKE_VOLATILE_STRIDE(20, os);
     }
}
/*
 * Descriptor for the non-FMA n1_5 codelet: transform size 5, name "n1_5", and
 * the operation counts { 26, 6, 6, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     5,
     "n1_5",
     { 26, 6, 6, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_5 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_5)(planner *p)
{
     X(kdft_register)(p, n1_5, &desc);
}
#endif

View File

@@ -0,0 +1,210 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include dft/scalar/n.h */
/*
* This function contains 36 FP additions, 12 FP multiplications,
* (or, 24 additions, 0 multiplications, 12 fused multiply/add),
* 23 stack variables, 2 constants, and 24 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_6 (FMA build branch): hard-coded size-6 complex DFT on split
 * real/imaginary arrays, written with FMA()/FNMS().
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * Every input is loaded into a temporary before the first output is stored.
 */
static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     INT remaining;

     for (remaining = v; remaining > 0; --remaining) {
          /* Inputs 0 and 3. */
          const E re0 = ri[0];
          const E re3 = ri[WS(is, 3)];
          const E re_dif03 = re0 - re3;
          const E re_sum03 = re0 + re3;
          const E im0 = ii[0];
          const E im3 = ii[WS(is, 3)];
          const E im_dif03 = im0 - im3;
          const E im_sum03 = im0 + im3;

          /* Real parts of the pairs (2,5) and (4,1). */
          const E re2 = ri[WS(is, 2)];
          const E re5 = ri[WS(is, 5)];
          const E re_dif25 = re2 - re5;
          const E re_sum25 = re2 + re5;
          const E re4 = ri[WS(is, 4)];
          const E re1 = ri[WS(is, 1)];
          const E re_dif41 = re4 - re1;
          const E re_sum41 = re4 + re1;
          const E re_difs = re_dif25 + re_dif41;
          const E re_sums = re_sum25 + re_sum41;

          /* Imaginary parts of the pairs (2,5) and (4,1). */
          const E im2 = ii[WS(is, 2)];
          const E im5 = ii[WS(is, 5)];
          const E im_dif25 = im2 - im5;
          const E im_sum25 = im2 + im5;
          const E im4 = ii[WS(is, 4)];
          const E im1 = ii[WS(is, 1)];
          const E im_dif41 = im4 - im1;
          const E im_sum41 = im4 + im1;
          const E im_difs = im_dif25 + im_dif41;
          const E im_sums = im_sum25 + im_sum41;

          /* Outputs 3 and 0 need no multiplications. */
          ro[WS(os, 3)] = re_dif03 + re_difs;
          io[WS(os, 3)] = im_dif03 + im_difs;
          ro[0] = re_sum03 + re_sums;
          io[0] = im_sum03 + im_sums;

          /* Real outputs 5 and 1. */
          {
               const E base = FNMS(KP500000000, re_difs, re_dif03);
               const E cross = im_dif25 - im_dif41;

               ro[WS(os, 5)] = FNMS(KP866025403, cross, base);
               ro[WS(os, 1)] = FMA(KP866025403, cross, base);
          }
          /* Imaginary outputs 1 and 5. */
          {
               const E base = FNMS(KP500000000, im_difs, im_dif03);
               const E cross = re_dif41 - re_dif25;

               io[WS(os, 1)] = FMA(KP866025403, cross, base);
               io[WS(os, 5)] = FNMS(KP866025403, cross, base);
          }
          /* Real outputs 2 and 4. */
          {
               const E base = FNMS(KP500000000, re_sums, re_sum03);
               const E cross = im_sum25 - im_sum41;

               ro[WS(os, 2)] = FNMS(KP866025403, cross, base);
               ro[WS(os, 4)] = FMA(KP866025403, cross, base);
          }
          /* Imaginary outputs 2 and 4. */
          {
               const E base = FNMS(KP500000000, im_sums, im_sum03);
               const E cross = re_sum41 - re_sum25;

               io[WS(os, 2)] = FNMS(KP866025403, cross, base);
               io[WS(os, 4)] = FMA(KP866025403, cross, base);
          }

          /* Step to the next transform in the batch. */
          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
          MAKE_VOLATILE_STRIDE(24, is);
          MAKE_VOLATILE_STRIDE(24, os);
     }
}
/*
 * Descriptor for the FMA n1_6 codelet: transform size 6, name "n1_6", and the
 * operation counts { 24, 0, 12, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     6,
     "n1_6",
     { 24, 0, 12, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_6 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_6)(planner *p)
{
     X(kdft_register)(p, n1_6, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 6 -name n1_6 -include dft/scalar/n.h */
/*
* This function contains 36 FP additions, 8 FP multiplications,
* (or, 32 additions, 4 multiplications, 4 fused multiply/add),
* 23 stack variables, 2 constants, and 24 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_6 (non-FMA build branch): hard-coded size-6 complex DFT on split
 * real/imaginary arrays.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * Every input is loaded into a temporary before the first output is stored.
 */
static void n1_6(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     INT remaining;

     for (remaining = v; remaining > 0; --remaining) {
          /* Inputs 0 and 3. */
          const E re0 = ri[0];
          const E re3 = ri[WS(is, 3)];
          const E re_dif03 = re0 - re3;
          const E re_sum03 = re0 + re3;
          const E im0 = ii[0];
          const E im3 = ii[WS(is, 3)];
          const E im_dif03 = im0 - im3;
          const E im_sum03 = im0 + im3;

          /* Real parts of the pairs (2,5) and (4,1). */
          const E re2 = ri[WS(is, 2)];
          const E re5 = ri[WS(is, 5)];
          const E re_dif25 = re2 - re5;
          const E re_sum25 = re2 + re5;
          const E re4 = ri[WS(is, 4)];
          const E re1 = ri[WS(is, 1)];
          const E re_dif41 = re4 - re1;
          const E re_sum41 = re4 + re1;
          const E re_difs = re_dif25 + re_dif41;
          const E re_sums = re_sum25 + re_sum41;

          /* Imaginary parts of the pairs (2,5) and (4,1). */
          const E im2 = ii[WS(is, 2)];
          const E im5 = ii[WS(is, 5)];
          const E im_dif25 = im2 - im5;
          const E im_sum25 = im2 + im5;
          const E im4 = ii[WS(is, 4)];
          const E im1 = ii[WS(is, 1)];
          const E im_dif41 = im4 - im1;
          const E im_sum41 = im4 + im1;
          const E im_difs = im_dif25 + im_dif41;
          const E im_sums = im_sum25 + im_sum41;

          /* Outputs 3 and 0 need no multiplications. */
          ro[WS(os, 3)] = re_dif03 + re_difs;
          io[WS(os, 3)] = im_dif03 + im_difs;
          ro[0] = re_sum03 + re_sums;
          io[0] = im_sum03 + im_sums;

          /* Real outputs 5 and 1. */
          {
               const E base = FNMS(KP500000000, re_difs, re_dif03);
               const E cross = KP866025403 * (im_dif25 - im_dif41);

               ro[WS(os, 5)] = base - cross;
               ro[WS(os, 1)] = base + cross;
          }
          /* Imaginary outputs 1 and 5. */
          {
               const E cross = KP866025403 * (re_dif41 - re_dif25);
               const E base = FNMS(KP500000000, im_difs, im_dif03);

               io[WS(os, 1)] = cross + base;
               io[WS(os, 5)] = base - cross;
          }
          /* Real outputs 2 and 4. */
          {
               const E base = FNMS(KP500000000, re_sums, re_sum03);
               const E cross = KP866025403 * (im_sum25 - im_sum41);

               ro[WS(os, 2)] = base - cross;
               ro[WS(os, 4)] = base + cross;
          }
          /* Imaginary outputs 2 and 4. */
          {
               const E base = FNMS(KP500000000, im_sums, im_sum03);
               const E cross = KP866025403 * (re_sum41 - re_sum25);

               io[WS(os, 2)] = base - cross;
               io[WS(os, 4)] = cross + base;
          }

          /* Step to the next transform in the batch. */
          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
          MAKE_VOLATILE_STRIDE(24, is);
          MAKE_VOLATILE_STRIDE(24, os);
     }
}
/*
 * Descriptor for the non-FMA n1_6 codelet: transform size 6, name "n1_6", and
 * the operation counts { 32, 4, 4, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     6,
     "n1_6",
     { 32, 4, 4, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_6 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_6)(planner *p)
{
     X(kdft_register)(p, n1_6, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,249 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include dft/scalar/n.h */
/*
* This function contains 60 FP additions, 42 FP multiplications,
* (or, 18 additions, 0 multiplications, 42 fused multiply/add),
* 41 stack variables, 6 constants, and 28 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_7 (FMA build branch): hard-coded size-7 complex DFT on split
 * real/imaginary arrays, expressed as chains of FMA()/FNMS() operations.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * All inputs are loaded into temporaries before the first output is stored.
 * The operand order of the FMA chains is part of the generated result and
 * should not be rearranged by hand.
 */
static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numeric constants declared through the project's DK() macro. */
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
{
INT i;
/* One transform per iteration; the pointers advance by ivs / ovs each time. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(28, is), MAKE_VOLATILE_STRIDE(28, os)) {
E T1, Tz, T4, TI, Ta, TG, T7, TH, Tb, Tp, TT, TO, TJ, Tu, Tg;
E TB, Tm, TC, Tj, TA, Tn, Ts, TQ, TL, TD, Tx;
/* Input 0 is kept as is. */
T1 = ri[0];
Tz = ii[0];
{
E T2, T3, Te, Tf;
/* Real parts of inputs 1 and 6: sum (T4) and difference (TI). */
T2 = ri[WS(is, 1)];
T3 = ri[WS(is, 6)];
T4 = T2 + T3;
TI = T3 - T2;
{
E T8, T9, T5, T6;
/* Real parts of the pairs (3,4) and (2,5): sums Ta, T7 and differences TG, TH. */
T8 = ri[WS(is, 3)];
T9 = ri[WS(is, 4)];
Ta = T8 + T9;
TG = T9 - T8;
T5 = ri[WS(is, 2)];
T6 = ri[WS(is, 5)];
T7 = T5 + T6;
TH = T6 - T5;
}
/* First FMA stage over the real pair sums and differences. */
Tb = FNMS(KP356895867, T7, T4);
Tp = FNMS(KP356895867, T4, Ta);
TT = FMA(KP554958132, TG, TI);
TO = FMA(KP554958132, TH, TG);
TJ = FNMS(KP554958132, TI, TH);
Tu = FNMS(KP356895867, Ta, T7);
/* Imaginary parts of inputs 2 and 5: difference (Tg) and sum (TB). */
Te = ii[WS(is, 2)];
Tf = ii[WS(is, 5)];
Tg = Te - Tf;
TB = Te + Tf;
{
E Tk, Tl, Th, Ti;
/* Imaginary parts of the pairs (3,4) and (1,6): differences Tm, Tj and sums TC, TA. */
Tk = ii[WS(is, 3)];
Tl = ii[WS(is, 4)];
Tm = Tk - Tl;
TC = Tk + Tl;
Th = ii[WS(is, 1)];
Ti = ii[WS(is, 6)];
Tj = Th - Ti;
TA = Th + Ti;
}
/* First FMA stage over the imaginary pair sums and differences. */
Tn = FMA(KP554958132, Tm, Tj);
Ts = FMA(KP554958132, Tg, Tm);
TQ = FNMS(KP356895867, TB, TA);
TL = FNMS(KP356895867, TA, TC);
TD = FNMS(KP356895867, TC, TB);
Tx = FNMS(KP554958132, Tj, Tg);
}
/* Output 0 is the plain sum of all seven inputs. */
ro[0] = T1 + T4 + T7 + Ta;
io[0] = Tz + TA + TB + TC;
{
/* Outputs 1 and 6. */
E To, Td, Tc, TU, TS, TR;
To = FMA(KP801937735, Tn, Tg);
Tc = FNMS(KP692021471, Tb, Ta);
Td = FNMS(KP900968867, Tc, T1);
ro[WS(os, 6)] = FNMS(KP974927912, To, Td);
ro[WS(os, 1)] = FMA(KP974927912, To, Td);
TU = FMA(KP801937735, TT, TH);
TR = FNMS(KP692021471, TQ, TC);
TS = FNMS(KP900968867, TR, Tz);
io[WS(os, 1)] = FMA(KP974927912, TU, TS);
io[WS(os, 6)] = FNMS(KP974927912, TU, TS);
}
{
/* Outputs 2 and 5. */
E Tt, Tr, Tq, TP, TN, TM;
Tt = FNMS(KP801937735, Ts, Tj);
Tq = FNMS(KP692021471, Tp, T7);
Tr = FNMS(KP900968867, Tq, T1);
ro[WS(os, 5)] = FNMS(KP974927912, Tt, Tr);
ro[WS(os, 2)] = FMA(KP974927912, Tt, Tr);
TP = FNMS(KP801937735, TO, TI);
TM = FNMS(KP692021471, TL, TB);
TN = FNMS(KP900968867, TM, Tz);
io[WS(os, 2)] = FMA(KP974927912, TP, TN);
io[WS(os, 5)] = FNMS(KP974927912, TP, TN);
}
{
/* Outputs 3 and 4. */
E Ty, Tw, Tv, TK, TF, TE;
Ty = FNMS(KP801937735, Tx, Tm);
Tv = FNMS(KP692021471, Tu, T4);
Tw = FNMS(KP900968867, Tv, T1);
ro[WS(os, 4)] = FNMS(KP974927912, Ty, Tw);
ro[WS(os, 3)] = FMA(KP974927912, Ty, Tw);
TK = FNMS(KP801937735, TJ, TG);
TE = FNMS(KP692021471, TD, TA);
TF = FNMS(KP900968867, TE, Tz);
io[WS(os, 3)] = FMA(KP974927912, TK, TF);
io[WS(os, 4)] = FNMS(KP974927912, TK, TF);
}
}
}
}
/*
 * Descriptor for the FMA n1_7 codelet: transform size 7, name "n1_7", and the
 * operation counts { 18, 0, 42, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     7,
     "n1_7",
     { 18, 0, 42, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_7 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_7)(planner *p)
{
     X(kdft_register)(p, n1_7, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 7 -name n1_7 -include dft/scalar/n.h */
/*
* This function contains 60 FP additions, 36 FP multiplications,
* (or, 36 additions, 12 multiplications, 24 fused multiply/add),
* 25 stack variables, 6 constants, and 28 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_7 (non-FMA build branch): hard-coded size-7 complex DFT on split
 * real/imaginary arrays.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * Inputs are paired as (1,6), (2,5) and (3,4). Each output pair is built from
 * a combination of the pair sums ("sym") and a combination of the pair
 * differences ("asym"). Every input is loaded before the first output is
 * stored.
 */
static void n1_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP222520933, +0.222520933956314404288902564496794759466355569);
     DK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DK(KP623489801, +0.623489801858733530525004884004239810632274731);
     DK(KP433883739, +0.433883739117558120475768332848358754609990728);
     DK(KP781831482, +0.781831482468029808708444526674057750232334519);
     DK(KP974927912, +0.974927912181823607018131682993931217232785801);
     INT remaining;

     for (remaining = v; remaining > 0; --remaining) {
          const E re0 = ri[0];
          const E im0 = ii[0];

          /* Pair (1,6). */
          const E re1 = ri[WS(is, 1)];
          const E re6 = ri[WS(is, 6)];
          const E re_sum16 = re1 + re6;
          const E re_dif61 = re6 - re1;
          const E im1 = ii[WS(is, 1)];
          const E im6 = ii[WS(is, 6)];
          const E im_dif16 = im1 - im6;
          const E im_sum16 = im1 + im6;

          /* Pair (2,5). */
          const E re2 = ri[WS(is, 2)];
          const E re5 = ri[WS(is, 5)];
          const E re_sum25 = re2 + re5;
          const E re_dif52 = re5 - re2;
          const E im2 = ii[WS(is, 2)];
          const E im5 = ii[WS(is, 5)];
          const E im_dif25 = im2 - im5;
          const E im_sum25 = im2 + im5;

          /* Pair (3,4). */
          const E re3 = ri[WS(is, 3)];
          const E re4 = ri[WS(is, 4)];
          const E re_sum34 = re3 + re4;
          const E re_dif43 = re4 - re3;
          const E im3 = ii[WS(is, 3)];
          const E im4 = ii[WS(is, 4)];
          const E im_dif34 = im3 - im4;
          const E im_sum34 = im3 + im4;

          /* Output 0 is the plain sum of all seven inputs. */
          ro[0] = re0 + re_sum16 + re_sum25 + re_sum34;
          io[0] = im0 + im_sum16 + im_sum25 + im_sum34;

          /* Outputs 2 and 5. */
          {
               const E asym_re = FNMS(KP781831482, im_dif34, KP974927912 * im_dif16) - (KP433883739 * im_dif25);
               const E sym_re = FMA(KP623489801, re_sum34, re0) + FNMA(KP900968867, re_sum25, KP222520933 * re_sum16);
               const E asym_im = FNMS(KP781831482, re_dif43, KP974927912 * re_dif61) - (KP433883739 * re_dif52);
               const E sym_im = FMA(KP623489801, im_sum34, im0) + FNMA(KP900968867, im_sum25, KP222520933 * im_sum16);

               ro[WS(os, 5)] = sym_re - asym_re;
               ro[WS(os, 2)] = sym_re + asym_re;
               io[WS(os, 2)] = asym_im + sym_im;
               io[WS(os, 5)] = sym_im - asym_im;
          }

          /* Outputs 1 and 6. */
          {
               const E asym_re = FMA(KP781831482, im_dif16, KP974927912 * im_dif25) + (KP433883739 * im_dif34);
               const E sym_re = FMA(KP623489801, re_sum16, re0) + FNMA(KP900968867, re_sum34, KP222520933 * re_sum25);
               const E asym_im = FMA(KP781831482, re_dif61, KP974927912 * re_dif52) + (KP433883739 * re_dif43);
               const E sym_im = FMA(KP623489801, im_sum16, im0) + FNMA(KP900968867, im_sum34, KP222520933 * im_sum25);

               ro[WS(os, 6)] = sym_re - asym_re;
               ro[WS(os, 1)] = sym_re + asym_re;
               io[WS(os, 1)] = asym_im + sym_im;
               io[WS(os, 6)] = sym_im - asym_im;
          }

          /* Outputs 3 and 4. */
          {
               const E asym_re = FMA(KP433883739, im_dif16, KP974927912 * im_dif34) - (KP781831482 * im_dif25);
               const E sym_re = FMA(KP623489801, re_sum25, re0) + FNMA(KP222520933, re_sum34, KP900968867 * re_sum16);
               const E asym_im = FMA(KP433883739, re_dif61, KP974927912 * re_dif43) - (KP781831482 * re_dif52);
               const E sym_im = FMA(KP623489801, im_sum25, im0) + FNMA(KP222520933, im_sum34, KP900968867 * im_sum16);

               ro[WS(os, 4)] = sym_re - asym_re;
               ro[WS(os, 3)] = sym_re + asym_re;
               io[WS(os, 3)] = asym_im + sym_im;
               io[WS(os, 4)] = sym_im - asym_im;
          }

          /* Step to the next transform in the batch. */
          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
          MAKE_VOLATILE_STRIDE(28, is);
          MAKE_VOLATILE_STRIDE(28, os);
     }
}
/*
 * Descriptor for the non-FMA n1_7 codelet: transform size 7, name "n1_7", and
 * the operation counts { 36, 12, 24, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     7,
     "n1_7",
     { 36, 12, 24, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_7 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_7)(planner *p)
{
     X(kdft_register)(p, n1_7, &desc);
}
#endif

View File

@@ -0,0 +1,266 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
/*
* This function contains 52 FP additions, 8 FP multiplications,
* (or, 44 additions, 0 multiplications, 8 fused multiply/add),
* 28 stack variables, 1 constants, and 32 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_8 (FMA build branch): hard-coded size-8 complex DFT on split
 * real/imaginary arrays. The only multiplier is KP707106781, applied through
 * FMA()/FNMS() on the odd-numbered outputs.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * Every input is loaded into a temporary before the first output is stored.
 */
static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     INT remaining;

     for (remaining = v; remaining > 0; --remaining) {
          /* Pair (0,4). */
          const E re0 = ri[0];
          const E re4 = ri[WS(is, 4)];
          const E re_sum04 = re0 + re4;
          const E re_dif04 = re0 - re4;
          const E im0 = ii[0];
          const E im4 = ii[WS(is, 4)];
          const E im_sum04 = im0 + im4;
          const E im_dif04 = im0 - im4;

          /* Pair (2,6). */
          const E re2 = ri[WS(is, 2)];
          const E re6 = ri[WS(is, 6)];
          const E re_sum26 = re2 + re6;
          const E re_dif26 = re2 - re6;
          const E im2 = ii[WS(is, 2)];
          const E im6 = ii[WS(is, 6)];
          const E im_sum26 = im2 + im6;
          const E im_dif26 = im2 - im6;

          /* Pair (7,3). */
          const E re7 = ri[WS(is, 7)];
          const E re3 = ri[WS(is, 3)];
          const E re_dif73 = re7 - re3;
          const E im7 = ii[WS(is, 7)];
          const E im3 = ii[WS(is, 3)];
          const E im_dif73 = im7 - im3;
          const E re_sum73 = re7 + re3;
          const E im_sum73 = im7 + im3;
          const E mix73_minus = re_dif73 - im_dif73;
          const E mix73_plus = re_dif73 + im_dif73;

          /* Pair (1,5). */
          const E re1 = ri[WS(is, 1)];
          const E re5 = ri[WS(is, 5)];
          const E re_dif15 = re1 - re5;
          const E im1 = ii[WS(is, 1)];
          const E im5 = ii[WS(is, 5)];
          const E im_dif15 = im1 - im5;
          const E re_sum15 = re1 + re5;
          const E im_sum15 = im1 + im5;
          const E mix15_plus = re_dif15 + im_dif15;
          const E mix15_minus = im_dif15 - re_dif15;

          /* Outputs 4 and 0. */
          {
               const E re_even = re_sum04 + re_sum26;
               const E re_odd = re_sum15 + re_sum73;
               const E im_even = im_sum04 + im_sum26;
               const E im_odd = im_sum15 + im_sum73;

               ro[WS(os, 4)] = re_even - re_odd;
               ro[0] = re_even + re_odd;
               io[WS(os, 4)] = im_even - im_odd;
               io[0] = im_even + im_odd;
          }

          /* Outputs 2 and 6. */
          {
               const E re_odd_dif = re_sum73 - re_sum15;
               const E im_even_dif = im_sum04 - im_sum26;
               const E re_even_dif = re_sum04 - re_sum26;
               const E im_odd_dif = im_sum15 - im_sum73;

               io[WS(os, 2)] = re_odd_dif + im_even_dif;
               io[WS(os, 6)] = im_even_dif - re_odd_dif;
               ro[WS(os, 6)] = re_even_dif - im_odd_dif;
               ro[WS(os, 2)] = re_even_dif + im_odd_dif;
          }

          /* Outputs 5 and 1. */
          {
               const E re_base = re_dif04 + im_dif26;
               const E re_cross = mix15_plus + mix73_minus;
               const E im_base = im_dif04 - re_dif26;
               const E im_cross = mix15_minus + mix73_plus;

               ro[WS(os, 5)] = FNMS(KP707106781, re_cross, re_base);
               ro[WS(os, 1)] = FMA(KP707106781, re_cross, re_base);
               io[WS(os, 5)] = FNMS(KP707106781, im_cross, im_base);
               io[WS(os, 1)] = FMA(KP707106781, im_cross, im_base);
          }

          /* Outputs 7 and 3. */
          {
               const E im_base = re_dif26 + im_dif04;
               const E im_cross = mix73_minus - mix15_plus;
               const E re_base = re_dif04 - im_dif26;
               const E re_cross = mix15_minus - mix73_plus;

               io[WS(os, 7)] = FNMS(KP707106781, im_cross, im_base);
               io[WS(os, 3)] = FMA(KP707106781, im_cross, im_base);
               ro[WS(os, 7)] = FNMS(KP707106781, re_cross, re_base);
               ro[WS(os, 3)] = FMA(KP707106781, re_cross, re_base);
          }

          /* Step to the next transform in the batch. */
          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
          MAKE_VOLATILE_STRIDE(32, is);
          MAKE_VOLATILE_STRIDE(32, os);
     }
}
/*
 * Descriptor for the FMA n1_8 codelet: transform size 8, name "n1_8", and the
 * operation counts { 44, 0, 8, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     8,
     "n1_8",
     { 44, 0, 8, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_8 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_8)(planner *p)
{
     X(kdft_register)(p, n1_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 8 -name n1_8 -include dft/scalar/n.h */
/*
* This function contains 52 FP additions, 4 FP multiplications,
* (or, 52 additions, 4 multiplications, 0 fused multiply/add),
* 28 stack variables, 1 constants, and 32 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_8 (non-FMA build branch): hard-coded size-8 complex DFT on split
 * real/imaginary arrays. The only multiplier is KP707106781, applied to four
 * intermediate sums before the odd-numbered outputs are formed.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * Every input is loaded into a temporary before the first output is stored.
 */
static void n1_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);
     INT remaining;

     for (remaining = v; remaining > 0; --remaining) {
          /* Pair (0,4). */
          const E re0 = ri[0];
          const E re4 = ri[WS(is, 4)];
          const E re_sum04 = re0 + re4;
          const E re_dif04 = re0 - re4;
          const E im0 = ii[0];
          const E im4 = ii[WS(is, 4)];
          const E im_sum04 = im0 + im4;
          const E im_dif04 = im0 - im4;

          /* Pair (2,6). */
          const E re2 = ri[WS(is, 2)];
          const E re6 = ri[WS(is, 6)];
          const E re_sum26 = re2 + re6;
          const E re_dif26 = re2 - re6;
          const E im2 = ii[WS(is, 2)];
          const E im6 = ii[WS(is, 6)];
          const E im_sum26 = im2 + im6;
          const E im_dif26 = im2 - im6;

          /* Pair (7,3). */
          const E re7 = ri[WS(is, 7)];
          const E re3 = ri[WS(is, 3)];
          const E re_dif73 = re7 - re3;
          const E im7 = ii[WS(is, 7)];
          const E im3 = ii[WS(is, 3)];
          const E im_dif73 = im7 - im3;
          const E re_sum73 = re7 + re3;
          const E im_sum73 = im7 + im3;
          const E mix73_minus = re_dif73 - im_dif73;
          const E mix73_plus = re_dif73 + im_dif73;

          /* Pair (1,5). */
          const E re1 = ri[WS(is, 1)];
          const E re5 = ri[WS(is, 5)];
          const E re_dif15 = re1 - re5;
          const E im1 = ii[WS(is, 1)];
          const E im5 = ii[WS(is, 5)];
          const E im_dif15 = im1 - im5;
          const E re_sum15 = re1 + re5;
          const E im_sum15 = im1 + im5;
          const E mix15_plus = re_dif15 + im_dif15;
          const E mix15_minus = im_dif15 - re_dif15;

          /* Outputs 4 and 0. */
          {
               const E re_even = re_sum04 + re_sum26;
               const E re_odd = re_sum15 + re_sum73;
               const E im_even = im_sum04 + im_sum26;
               const E im_odd = im_sum15 + im_sum73;

               ro[WS(os, 4)] = re_even - re_odd;
               ro[0] = re_even + re_odd;
               io[WS(os, 4)] = im_even - im_odd;
               io[0] = im_even + im_odd;
          }

          /* Outputs 2 and 6. */
          {
               const E re_odd_dif = re_sum73 - re_sum15;
               const E im_even_dif = im_sum04 - im_sum26;
               const E re_even_dif = re_sum04 - re_sum26;
               const E im_odd_dif = im_sum15 - im_sum73;

               io[WS(os, 2)] = re_odd_dif + im_even_dif;
               io[WS(os, 6)] = im_even_dif - re_odd_dif;
               ro[WS(os, 6)] = re_even_dif - im_odd_dif;
               ro[WS(os, 2)] = re_even_dif + im_odd_dif;
          }

          /* Outputs 5 and 1. */
          {
               const E re_base = re_dif04 + im_dif26;
               const E re_cross = KP707106781 * (mix15_plus + mix73_minus);
               const E im_base = im_dif04 - re_dif26;
               const E im_cross = KP707106781 * (mix15_minus + mix73_plus);

               ro[WS(os, 5)] = re_base - re_cross;
               ro[WS(os, 1)] = re_base + re_cross;
               io[WS(os, 5)] = im_base - im_cross;
               io[WS(os, 1)] = im_base + im_cross;
          }

          /* Outputs 7 and 3. */
          {
               const E im_base = re_dif26 + im_dif04;
               const E im_cross = KP707106781 * (mix73_minus - mix15_plus);
               const E re_base = re_dif04 - im_dif26;
               const E re_cross = KP707106781 * (mix15_minus - mix73_plus);

               io[WS(os, 7)] = im_base - im_cross;
               io[WS(os, 3)] = im_base + im_cross;
               ro[WS(os, 7)] = re_base - re_cross;
               ro[WS(os, 3)] = re_base + re_cross;
          }

          /* Step to the next transform in the batch. */
          ri += ivs;
          ii += ivs;
          ro += ovs;
          io += ovs;
          MAKE_VOLATILE_STRIDE(32, is);
          MAKE_VOLATILE_STRIDE(32, os);
     }
}
/*
 * Descriptor for the non-FMA n1_8 codelet: transform size 8, name "n1_8", and
 * the operation counts { 52, 4, 0, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     8,
     "n1_8",
     { 52, 4, 0, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_8 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_8)(planner *p)
{
     X(kdft_register)(p, n1_8, &desc);
}
#endif

View File

@@ -0,0 +1,360 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:24 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_notw.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
/*
* This function contains 80 FP additions, 56 FP multiplications,
* (or, 24 additions, 0 multiplications, 56 fused multiply/add),
* 41 stack variables, 10 constants, and 36 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_9 (FMA build branch): hard-coded size-9 complex DFT on split
 * real/imaginary arrays, expressed as chains of FMA()/FNMS() operations.
 *
 * ri, ii   - real / imaginary inputs; element k is read at [WS(is, k)]
 * ro, io   - real / imaginary outputs; element k is written at [WS(os, k)]
 * is, os   - input / output strides, used only through WS() and
 *            MAKE_VOLATILE_STRIDE()
 * v        - number of transforms to perform (nothing is done when v <= 0)
 * ivs, ovs - amounts added to the input / output pointers after each transform
 *
 * The inputs are consumed in three groups, (0,3,6), (1,4,7) and (2,5,8), and
 * all of them are loaded before the first output is stored. The operand order
 * of the FMA chains is part of the generated result and should not be
 * rearranged by hand.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Numeric constants declared through the project's DK() macro. */
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT i;
/* One transform per iteration; the pointers advance by ivs / ovs each time. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
E T5, TL, Tm, Tl, T1f, TM, Ta, T1c, TF, TW, TI, TX, Tf, T1d, Ts;
E TZ, Tx, T10;
{
/* Real parts of inputs 0, 3, 6: full sum T5, half-weighted remainder TL, difference Tm. */
E T1, T2, T3, T4;
T1 = ri[0];
T2 = ri[WS(is, 3)];
T3 = ri[WS(is, 6)];
T4 = T2 + T3;
T5 = T1 + T4;
TL = FNMS(KP500000000, T4, T1);
Tm = T3 - T2;
}
{
/* Imaginary parts of inputs 0, 3, 6: full sum T1f, half-weighted remainder Tl, difference TM. */
E Th, Ti, Tj, Tk;
Th = ii[0];
Ti = ii[WS(is, 3)];
Tj = ii[WS(is, 6)];
Tk = Ti + Tj;
Tl = FNMS(KP500000000, Tk, Th);
T1f = Th + Tk;
TM = Ti - Tj;
}
{
/* Inputs 1, 4, 7: full sums Ta (real) and T1c (imaginary), plus the four
   KP866025403-combined terms TF, TW, TI, TX. */
E T6, Tz, T9, TE, TC, TH, TD, TG;
T6 = ri[WS(is, 1)];
Tz = ii[WS(is, 1)];
{
E T7, T8, TA, TB;
T7 = ri[WS(is, 4)];
T8 = ri[WS(is, 7)];
T9 = T7 + T8;
TE = T7 - T8;
TA = ii[WS(is, 4)];
TB = ii[WS(is, 7)];
TC = TA + TB;
TH = TB - TA;
}
Ta = T6 + T9;
T1c = Tz + TC;
TD = FNMS(KP500000000, TC, Tz);
TF = FNMS(KP866025403, TE, TD);
TW = FMA(KP866025403, TE, TD);
TG = FNMS(KP500000000, T9, T6);
TI = FNMS(KP866025403, TH, TG);
TX = FMA(KP866025403, TH, TG);
}
{
/* Inputs 2, 5, 8: full sums Tf (real) and T1d (imaginary), plus the four
   KP866025403-combined terms Ts, TZ, Tx, T10. */
E Tb, Tt, Te, Tw, Tr, Tu, To, Tv;
Tb = ri[WS(is, 2)];
Tt = ii[WS(is, 2)];
{
E Tc, Td, Tp, Tq;
Tc = ri[WS(is, 5)];
Td = ri[WS(is, 8)];
Te = Tc + Td;
Tw = Td - Tc;
Tp = ii[WS(is, 5)];
Tq = ii[WS(is, 8)];
Tr = Tp - Tq;
Tu = Tp + Tq;
}
Tf = Tb + Te;
T1d = Tt + Tu;
To = FNMS(KP500000000, Te, Tb);
Ts = FMA(KP866025403, Tr, To);
TZ = FNMS(KP866025403, Tr, To);
Tv = FNMS(KP500000000, Tu, Tt);
Tx = FMA(KP866025403, Tw, Tv);
T10 = FNMS(KP866025403, Tw, Tv);
}
{
/* Outputs 0, 3 and 6, formed from the three group sums. */
E T1e, Tg, T1b, T1i, T1g, T1h;
T1e = T1c - T1d;
Tg = Ta + Tf;
T1b = FNMS(KP500000000, Tg, T5);
ro[0] = T5 + Tg;
ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b);
ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b);
T1i = Tf - Ta;
T1g = T1c + T1d;
T1h = FNMS(KP500000000, T1g, T1f);
io[WS(os, 3)] = FMA(KP866025403, T1i, T1h);
io[0] = T1f + T1g;
io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h);
}
{
/* Outputs 1, 4 and 7. */
E Tn, TN, TK, TS, TQ, TU, TR, TT;
Tn = FMA(KP866025403, Tm, Tl);
TN = FMA(KP866025403, TM, TL);
{
E Ty, TJ, TO, TP;
Ty = FNMS(KP176326980, Tx, Ts);
TJ = FNMS(KP839099631, TI, TF);
TK = FNMS(KP777861913, TJ, Ty);
TS = FMA(KP777861913, TJ, Ty);
TO = FMA(KP176326980, Ts, Tx);
TP = FMA(KP839099631, TF, TI);
TQ = FMA(KP777861913, TP, TO);
TU = FNMS(KP777861913, TP, TO);
}
io[WS(os, 1)] = FNMS(KP984807753, TK, Tn);
ro[WS(os, 1)] = FMA(KP984807753, TQ, TN);
TR = FNMS(KP492403876, TQ, TN);
ro[WS(os, 4)] = FMA(KP852868531, TS, TR);
ro[WS(os, 7)] = FNMS(KP852868531, TS, TR);
TT = FMA(KP492403876, TK, Tn);
io[WS(os, 7)] = FNMS(KP852868531, TU, TT);
io[WS(os, 4)] = FMA(KP852868531, TU, TT);
}
{
/* Outputs 2, 5 and 8. */
E TV, T17, T12, T1a, T16, T18, T13, T19;
TV = FNMS(KP866025403, TM, TL);
T17 = FNMS(KP866025403, Tm, Tl);
{
E TY, T11, T14, T15;
TY = FMA(KP176326980, TX, TW);
T11 = FNMS(KP363970234, T10, TZ);
T12 = FNMS(KP954188894, T11, TY);
T1a = FMA(KP954188894, T11, TY);
T14 = FNMS(KP176326980, TW, TX);
T15 = FMA(KP363970234, TZ, T10);
T16 = FNMS(KP954188894, T15, T14);
T18 = FMA(KP954188894, T15, T14);
}
ro[WS(os, 2)] = FMA(KP984807753, T12, TV);
io[WS(os, 2)] = FNMS(KP984807753, T18, T17);
T13 = FNMS(KP492403876, T12, TV);
ro[WS(os, 5)] = FNMS(KP852868531, T16, T13);
ro[WS(os, 8)] = FMA(KP852868531, T16, T13);
T19 = FMA(KP492403876, T18, T17);
io[WS(os, 5)] = FNMS(KP852868531, T1a, T19);
io[WS(os, 8)] = FMA(KP852868531, T1a, T19);
}
}
}
}
/*
 * Descriptor for the FMA n1_9 codelet: transform size 9, name "n1_9", and the
 * operation counts { 24, 0, 56, 0 }, whose first three entries match the
 * additions / multiplications / fused multiply-add totals quoted in the
 * generated header comment. The meaning of &GENUS and of the four trailing
 * zeros is fixed by kdft_desc, which is declared in the project headers.
 */
static const kdft_desc desc = {
     9,
     "n1_9",
     { 24, 0, 56, 0 },
     &GENUS,
     0, 0, 0, 0
};

/* Hands n1_9 and its descriptor to kdft_register for planner p. */
void X(codelet_n1_9)(planner *p)
{
     X(kdft_register)(p, n1_9, &desc);
}
#else
/* Generated by: ../../../genfft/gen_notw.native -compact -variables 4 -pipeline-latency 4 -n 9 -name n1_9 -include dft/scalar/n.h */
/*
* This function contains 80 FP additions, 40 FP multiplications,
* (or, 60 additions, 20 multiplications, 20 fused multiply/add),
* 39 stack variables, 8 constants, and 36 memory accesses
*/
#include "dft/scalar/n.h"
/*
 * n1_9 (variant generated without -fma): hard-coded transform of size 9.
 *
 * ri, ii  - input arrays, only read, at offsets WS(is, 0) .. WS(is, 8)
 * ro, io  - output arrays, written at offsets WS(os, 0) .. WS(os, 8)
 * is, os  - input and output strides used through WS()
 * v       - number of passes; after each pass ri/ii advance by ivs and
 *           ro/io advance by ovs
 *
 * Each pass first combines the inputs in the groups (0,3,6), (1,4,7) and
 * (2,5,8), then writes the outputs in the groups (0,3,6), (1,4,7) and
 * (2,5,8).
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs)
{
/* Constants referenced below; each name spells the leading digits of its value. */
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT i;
/* Counts i down from v to 1, stepping all four pointers once per pass. */
for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(36, is), MAKE_VOLATILE_STRIDE(36, os)) {
E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB;
E T10, TG, TZ;
/* ri at inputs 0, 3, 6: total (T5), scaled difference (TO), and T1 - T4/2 (Th). */
{
E T1, T2, T3, T4;
T1 = ri[0];
T2 = ri[WS(is, 3)];
T3 = ri[WS(is, 6)];
T4 = T2 + T3;
T5 = T1 + T4;
TO = KP866025403 * (T3 - T2);
Th = FNMS(KP500000000, T4, T1);
}
/* ii at inputs 0, 3, 6: same three combinations (T1g, Tk, TR). */
{
E TP, Ti, Tj, TQ;
TP = ii[0];
Ti = ii[WS(is, 3)];
Tj = ii[WS(is, 6)];
TQ = Ti + Tj;
Tk = KP866025403 * (Ti - Tj);
T1g = TP + TQ;
TR = FNMS(KP500000000, TQ, TP);
}
/* Inputs 1, 4, 7 of ri and ii: totals Ta / T1c plus the pairs (Tq, Tv) and (TW, TX). */
{
E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu;
T6 = ri[WS(is, 1)];
Ts = ii[WS(is, 1)];
{
E T7, T8, Tn, To;
T7 = ri[WS(is, 4)];
T8 = ri[WS(is, 7)];
T9 = T7 + T8;
Tr = KP866025403 * (T8 - T7);
Tn = ii[WS(is, 4)];
To = ii[WS(is, 7)];
Tp = KP866025403 * (Tn - To);
Tt = Tn + To;
}
Ta = T6 + T9;
T1c = Ts + Tt;
Tm = FNMS(KP500000000, T9, T6);
Tq = Tm + Tp;
TW = Tm - Tp;
Tu = FNMS(KP500000000, Tt, Ts);
Tv = Tr + Tu;
TX = Tu - Tr;
}
/* Inputs 2, 5, 8 of ri and ii: totals Tf / T1d plus the pairs (TB, TG) and (T10, TZ). */
{
E Tb, TD, Te, TC, TA, TE, Tx, TF;
Tb = ri[WS(is, 2)];
TD = ii[WS(is, 2)];
{
E Tc, Td, Ty, Tz;
Tc = ri[WS(is, 5)];
Td = ri[WS(is, 8)];
Te = Tc + Td;
TC = KP866025403 * (Td - Tc);
Ty = ii[WS(is, 5)];
Tz = ii[WS(is, 8)];
TA = KP866025403 * (Ty - Tz);
TE = Ty + Tz;
}
Tf = Tb + Te;
T1d = TD + TE;
Tx = FNMS(KP500000000, Te, Tb);
TB = Tx + TA;
T10 = Tx - TA;
TF = FNMS(KP500000000, TE, TD);
TG = TC + TF;
TZ = TF - TC;
}
/* Outputs 0, 3, 6: built only from the three group totals. */
{
E T1e, Tg, T1b, T1f, T1h, T1i;
T1e = KP866025403 * (T1c - T1d);
Tg = Ta + Tf;
T1b = FNMS(KP500000000, Tg, T5);
ro[0] = T5 + Tg;
ro[WS(os, 3)] = T1b + T1e;
ro[WS(os, 6)] = T1b - T1e;
T1f = KP866025403 * (Tf - Ta);
T1h = T1c + T1d;
T1i = FNMS(KP500000000, T1h, T1g);
io[WS(os, 3)] = T1f + T1i;
io[0] = T1g + T1h;
io[WS(os, 6)] = T1i - T1f;
}
/* Outputs 1, 4, 7: (Tq, Tv) and (TB, TG) are rotated by constant pairs, then combined with Tl / TS. */
{
E Tl, TS, TI, TN, TM, TT, TJ, TU;
Tl = Th + Tk;
TS = TO + TR;
{
E Tw, TH, TK, TL;
Tw = FMA(KP766044443, Tq, KP642787609 * Tv);
TH = FMA(KP173648177, TB, KP984807753 * TG);
TI = Tw + TH;
TN = KP866025403 * (TH - Tw);
TK = FNMS(KP642787609, Tq, KP766044443 * Tv);
TL = FNMS(KP984807753, TB, KP173648177 * TG);
TM = KP866025403 * (TK - TL);
TT = TK + TL;
}
ro[WS(os, 1)] = Tl + TI;
io[WS(os, 1)] = TS + TT;
TJ = FNMS(KP500000000, TI, Tl);
ro[WS(os, 7)] = TJ - TM;
ro[WS(os, 4)] = TJ + TM;
TU = FNMS(KP500000000, TT, TS);
io[WS(os, 4)] = TN + TU;
io[WS(os, 7)] = TU - TN;
}
/* Outputs 2, 5, 8: same pattern using (TW, TX) and (T10, TZ) with TV / T14. */
{
E TV, T14, T12, T13, T17, T1a, T18, T19;
TV = Th - Tk;
T14 = TR - TO;
{
E TY, T11, T15, T16;
TY = FMA(KP173648177, TW, KP984807753 * TX);
T11 = FNMS(KP939692620, T10, KP342020143 * TZ);
T12 = TY + T11;
T13 = KP866025403 * (T11 - TY);
T15 = FNMS(KP984807753, TW, KP173648177 * TX);
T16 = FMA(KP342020143, T10, KP939692620 * TZ);
T17 = T15 - T16;
T1a = KP866025403 * (T15 + T16);
}
ro[WS(os, 2)] = TV + T12;
io[WS(os, 2)] = T14 + T17;
T18 = FNMS(KP500000000, T17, T14);
io[WS(os, 5)] = T13 + T18;
io[WS(os, 8)] = T18 - T13;
T19 = FNMS(KP500000000, T12, TV);
ro[WS(os, 8)] = T19 - T1a;
ro[WS(os, 5)] = T19 + T1a;
}
}
}
}
/* Descriptor for the non-FMA n1_9: size 9, name "n1_9"; { 60, 20, 20, 0 }
 * repeats the additions / multiplications / fused multiply-add counts quoted
 * in the generator's summary comment for this variant. */
static const kdft_desc desc = { 9, "n1_9", { 60, 20, 20, 0 }, &GENUS, 0, 0, 0, 0 };
/* Registration hook: passes the n1_9 kernel and its descriptor to the planner. */
void X(codelet_n1_9) (planner *p) { X(kdft_register) (p, n1_9, &desc);
}
#endif

View File

@@ -0,0 +1,149 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include dft/scalar/q.h */
/*
* This function contains 12 FP additions, 8 FP multiplications,
* (or, 8 additions, 4 multiplications, 4 fused multiply/add),
* 17 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_2 (-fma variant): in-place 2x2 kernel on rio/iio.
 *
 * Element (a, b) lives at offset WS(vs, a) + WS(rs, b).  For each source
 * row a the sum of its two entries is stored at WS(rs, a); the difference
 * is combined with the pair (W[0], W[1]) and stored at WS(vs, 1) + WS(rs, a).
 * Every rio/iio load is performed before the first store.
 *
 * The loop covers m = mb .. me - 1; rio/iio move by ms and W by 2 per pass,
 * with W offset by mb * 2 before the first pass.
 */
static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 2;
     for (m = mb; m < me; ++m, rio += ms, iio += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
          /* source row 0 */
          E re00 = rio[0];
          E re01 = rio[WS(rs, 1)];
          E dre0 = re00 - re01;
          E im00 = iio[0];
          E im01 = iio[WS(rs, 1)];
          E dim0 = im00 - im01;
          /* source row 1 */
          E re10 = rio[WS(vs, 1)];
          E re11 = rio[WS(vs, 1) + WS(rs, 1)];
          E dre1 = re10 - re11;
          E im10 = iio[WS(vs, 1)];
          E im11 = iio[WS(vs, 1) + WS(rs, 1)];
          E dim1 = im10 - im11;

          /* sums: row a lands at column a of row 0 */
          rio[0] = re00 + re01;
          iio[0] = im00 + im01;
          rio[WS(rs, 1)] = re10 + re11;
          iio[WS(rs, 1)] = im10 + im11;

          /* difference of source row 1, combined with (W[0], W[1]) */
          {
               E w0 = W[0];
               E w0_dre = w0 * dre1;
               E w0_dim = w0 * dim1;
               E w1 = W[1];
               rio[WS(vs, 1) + WS(rs, 1)] = FMA(w1, dim1, w0_dre);
               iio[WS(vs, 1) + WS(rs, 1)] = FNMS(w1, dre1, w0_dim);
          }
          /* difference of source row 0; W is read again here, after the stores above */
          {
               E w0 = W[0];
               E w0_dre = w0 * dre0;
               E w0_dim = w0 * dim0;
               E w1 = W[1];
               rio[WS(vs, 1)] = FMA(w1, dim0, w0_dre);
               iio[WS(vs, 1)] = FNMS(w1, dre0, w0_dim);
          }
     }
}
/* Twiddle instruction list referenced by desc below.
 * NOTE(review): presumably describes the W layout the kernel indexes
 * (W[0], W[1] per pass) -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 2 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor: size 2, name "q1_2"; { 8, 4, 4, 0 } repeats the additions /
 * multiplications / fused multiply-add counts from the generator's summary
 * comment for this variant. */
static const ct_desc desc = { 2, "q1_2", twinstr, &GENUS, { 8, 4, 4, 0 }, 0, 0, 0 };
/* Registration hook: passes the q1_2 kernel and its descriptor to the planner. */
void X(codelet_q1_2) (planner *p) {
X(kdft_difsq_register) (p, q1_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 2 -name q1_2 -include dft/scalar/q.h */
/*
* This function contains 12 FP additions, 8 FP multiplications,
* (or, 8 additions, 4 multiplications, 4 fused multiply/add),
* 17 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_2 (variant generated without -fma): in-place 2x2 kernel on rio/iio.
 *
 * Element (a, b) lives at offset WS(vs, a) + WS(rs, b).  For each source
 * row a the sum of its two entries is stored at WS(rs, a); the difference
 * is combined with the pair (W[0], W[1]) and stored at WS(vs, 1) + WS(rs, a).
 * Every rio/iio load is performed before the first store.
 *
 * The loop covers m = mb .. me - 1; rio/iio move by ms and W by 2 per pass,
 * with W offset by mb * 2 before the first pass.
 */
static void q1_2(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 2;
     for (m = mb; m < me; ++m, rio += ms, iio += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
          /* source row 0 */
          E re00 = rio[0];
          E re01 = rio[WS(rs, 1)];
          E dre0 = re00 - re01;
          E im00 = iio[0];
          E im01 = iio[WS(rs, 1)];
          E dim0 = im00 - im01;
          /* source row 1 */
          E re10 = rio[WS(vs, 1)];
          E re11 = rio[WS(vs, 1) + WS(rs, 1)];
          E dre1 = re10 - re11;
          E im10 = iio[WS(vs, 1)];
          E im11 = iio[WS(vs, 1) + WS(rs, 1)];
          E dim1 = im10 - im11;

          /* sums: row a lands at column a of row 0 */
          rio[0] = re00 + re01;
          iio[0] = im00 + im01;
          rio[WS(rs, 1)] = re10 + re11;
          iio[WS(rs, 1)] = im10 + im11;

          /* difference of source row 1, combined with (W[0], W[1]) */
          {
               E w0 = W[0];
               E w1 = W[1];
               rio[WS(vs, 1) + WS(rs, 1)] = FMA(w0, dre1, w1 * dim1);
               iio[WS(vs, 1) + WS(rs, 1)] = FNMS(w1, dre1, w0 * dim1);
          }
          /* difference of source row 0; W is read again here, after the stores above */
          {
               E w0 = W[0];
               E w1 = W[1];
               rio[WS(vs, 1)] = FMA(w0, dre0, w1 * dim0);
               iio[WS(vs, 1)] = FNMS(w1, dre0, w0 * dim0);
          }
     }
}
/* Twiddle instruction list referenced by desc below.
 * NOTE(review): presumably describes the W layout the kernel indexes
 * (W[0], W[1] per pass) -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 2 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor: size 2, name "q1_2"; { 8, 4, 4, 0 } repeats the additions /
 * multiplications / fused multiply-add counts from the generator's summary
 * comment for this variant. */
static const ct_desc desc = { 2, "q1_2", twinstr, &GENUS, { 8, 4, 4, 0 }, 0, 0, 0 };
/* Registration hook: passes the q1_2 kernel and its descriptor to the planner. */
void X(codelet_q1_2) (planner *p) {
X(kdft_difsq_register) (p, q1_2, &desc);
}
#endif

View File

@@ -0,0 +1,316 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
/*
* This function contains 48 FP additions, 42 FP multiplications,
* (or, 18 additions, 12 multiplications, 30 fused multiply/add),
* 35 stack variables, 2 constants, and 36 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_3 (-fma variant): in-place 3x3 kernel on rio/iio.
 *
 * Element (a, b) is addressed as WS(vs, a) + WS(rs, b).  For every source
 * row a the three entries b = 0..2 are read and combined; the plain sum is
 * written to WS(rs, a), and the two remaining combinations are multiplied
 * by the W pairs (W[0], W[1]) and (W[2], W[3]) and written to
 * WS(vs, 1) + WS(rs, a) and WS(vs, 2) + WS(rs, a).
 * All rio/iio reads happen before the first write.
 *
 * The loop runs m = mb .. me - 1; rio/iio advance by ms and W by 4 per pass
 * (W is offset by mb * 4 before the first pass).
 */
static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T1, T4, T6, Tg, Td, Te, T9, Tf, Tp, Ts, Tu, TE, TB, TC, Tx;
E TD, TZ, T10, TV, T11, TN, TQ, TS, T12;
/* Load phase: per source row keep the sum of entries 1 and 2, their
 * difference, and (entry 0 - half that sum). */
{
E T2, T3, Tv, Tw;
/* source row 0, rio */
T1 = rio[0];
T2 = rio[WS(rs, 1)];
T3 = rio[WS(rs, 2)];
T4 = T2 + T3;
T6 = FNMS(KP500000000, T4, T1);
Tg = T3 - T2;
{
E T7, T8, Tq, Tr;
/* source row 0, iio */
Td = iio[0];
T7 = iio[WS(rs, 1)];
T8 = iio[WS(rs, 2)];
Te = T7 + T8;
T9 = T7 - T8;
Tf = FNMS(KP500000000, Te, Td);
/* source row 1, rio */
Tp = rio[WS(vs, 1)];
Tq = rio[WS(vs, 1) + WS(rs, 1)];
Tr = rio[WS(vs, 1) + WS(rs, 2)];
Ts = Tq + Tr;
Tu = FNMS(KP500000000, Ts, Tp);
TE = Tr - Tq;
}
/* source row 1, iio */
TB = iio[WS(vs, 1)];
Tv = iio[WS(vs, 1) + WS(rs, 1)];
Tw = iio[WS(vs, 1) + WS(rs, 2)];
TC = Tv + Tw;
Tx = Tv - Tw;
TD = FNMS(KP500000000, TC, TB);
{
E TT, TU, TO, TP;
/* source row 2, iio then rio */
TZ = iio[WS(vs, 2)];
TT = iio[WS(vs, 2) + WS(rs, 1)];
TU = iio[WS(vs, 2) + WS(rs, 2)];
T10 = TT + TU;
TV = TT - TU;
T11 = FNMS(KP500000000, T10, TZ);
TN = rio[WS(vs, 2)];
TO = rio[WS(vs, 2) + WS(rs, 1)];
TP = rio[WS(vs, 2) + WS(rs, 2)];
TQ = TO + TP;
TS = FNMS(KP500000000, TQ, TN);
T12 = TP - TO;
}
}
/* Plain sums: the sum of source row a is stored at WS(rs, a). */
rio[0] = T1 + T4;
iio[0] = Td + Te;
rio[WS(rs, 1)] = Tp + Ts;
iio[WS(rs, 1)] = TB + TC;
iio[WS(rs, 2)] = TZ + T10;
rio[WS(rs, 2)] = TN + TQ;
/* Source row 0 with (W[0], W[1]) -> WS(vs, 1). */
{
E Ta, Th, Tb, Ti, T5, Tc;
Ta = FMA(KP866025403, T9, T6);
Th = FMA(KP866025403, Tg, Tf);
T5 = W[0];
Tb = T5 * Ta;
Ti = T5 * Th;
Tc = W[1];
rio[WS(vs, 1)] = FMA(Tc, Th, Tb);
iio[WS(vs, 1)] = FNMS(Tc, Ta, Ti);
}
/* Source row 2 with (W[2], W[3]) -> WS(vs, 2) + WS(rs, 2). */
{
E T16, T19, T17, T1a, T15, T18;
T16 = FNMS(KP866025403, TV, TS);
T19 = FNMS(KP866025403, T12, T11);
T15 = W[2];
T17 = T15 * T16;
T1a = T15 * T19;
T18 = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T18, T19, T17);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T18, T16, T1a);
}
/* Source row 1 with (W[2], W[3]) -> WS(vs, 2) + WS(rs, 1). */
{
E TI, TL, TJ, TM, TH, TK;
TI = FNMS(KP866025403, Tx, Tu);
TL = FNMS(KP866025403, TE, TD);
TH = W[2];
TJ = TH * TI;
TM = TH * TL;
TK = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(TK, TL, TJ);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TK, TI, TM);
}
/* Source row 1 with (W[0], W[1]) -> WS(vs, 1) + WS(rs, 1). */
{
E Ty, TF, Tz, TG, Tt, TA;
Ty = FMA(KP866025403, Tx, Tu);
TF = FMA(KP866025403, TE, TD);
Tt = W[0];
Tz = Tt * Ty;
TG = Tt * TF;
TA = W[1];
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TA, TF, Tz);
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TA, Ty, TG);
}
/* Source row 2 with (W[0], W[1]) -> WS(vs, 1) + WS(rs, 2). */
{
E TW, T13, TX, T14, TR, TY;
TW = FMA(KP866025403, TV, TS);
T13 = FMA(KP866025403, T12, T11);
TR = W[0];
TX = TR * TW;
T14 = TR * T13;
TY = W[1];
rio[WS(vs, 1) + WS(rs, 2)] = FMA(TY, T13, TX);
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TY, TW, T14);
}
/* Source row 0 with (W[2], W[3]) -> WS(vs, 2). */
{
E Tk, Tn, Tl, To, Tj, Tm;
Tk = FNMS(KP866025403, T9, T6);
Tn = FNMS(KP866025403, Tg, Tf);
Tj = W[2];
Tl = Tj * Tk;
To = Tj * Tn;
Tm = W[3];
rio[WS(vs, 2)] = FMA(Tm, Tn, Tl);
iio[WS(vs, 2)] = FNMS(Tm, Tk, To);
}
}
}
}
/* Twiddle instruction list referenced by desc below.
 * NOTE(review): presumably describes the W layout the kernel indexes
 * (W[0] .. W[3] per pass) -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor: size 3, name "q1_3"; { 18, 12, 30, 0 } repeats the additions /
 * multiplications / fused multiply-add counts from the generator's summary
 * comment for this variant. */
static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, { 18, 12, 30, 0 }, 0, 0, 0 };
/* Registration hook: passes the q1_3 kernel and its descriptor to the planner. */
void X(codelet_q1_3) (planner *p) {
X(kdft_difsq_register) (p, q1_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 3 -name q1_3 -include dft/scalar/q.h */
/*
* This function contains 48 FP additions, 36 FP multiplications,
* (or, 30 additions, 18 multiplications, 18 fused multiply/add),
* 35 stack variables, 2 constants, and 36 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_3 (variant generated without -fma): in-place 3x3 kernel on rio/iio.
 *
 * Element (a, b) is addressed as WS(vs, a) + WS(rs, b).  For every source
 * row a the three entries b = 0..2 are read and combined; the plain sum is
 * written to WS(rs, a), and the two remaining combinations are multiplied
 * by the W pairs (W[0], W[1]) and (W[2], W[3]) and written to
 * WS(vs, 1) + WS(rs, a) and WS(vs, 2) + WS(rs, a).
 * All rio/iio reads happen before the first write.
 *
 * The loop runs m = mb .. me - 1; rio/iio advance by ms and W by 4 per pass
 * (W is offset by mb * 4 before the first pass).
 */
static void q1_3(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 4, MAKE_VOLATILE_STRIDE(6, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T1, T4, T6, Tc, Td, Te, T9, Tf, Tl, To, Tq, Tw, Tx, Ty, Tt;
E Tz, TR, TS, TN, TT, TF, TI, TK, TQ;
/* Load phase: per source row keep the sum of entries 1 and 2, their
 * difference already scaled by KP866025403, and (entry 0 - half that sum). */
{
E T2, T3, Tr, Ts;
/* source row 0, rio */
T1 = rio[0];
T2 = rio[WS(rs, 1)];
T3 = rio[WS(rs, 2)];
T4 = T2 + T3;
T6 = FNMS(KP500000000, T4, T1);
Tc = KP866025403 * (T3 - T2);
{
E T7, T8, Tm, Tn;
/* source row 0, iio */
Td = iio[0];
T7 = iio[WS(rs, 1)];
T8 = iio[WS(rs, 2)];
Te = T7 + T8;
T9 = KP866025403 * (T7 - T8);
Tf = FNMS(KP500000000, Te, Td);
/* source row 1, rio */
Tl = rio[WS(vs, 1)];
Tm = rio[WS(vs, 1) + WS(rs, 1)];
Tn = rio[WS(vs, 1) + WS(rs, 2)];
To = Tm + Tn;
Tq = FNMS(KP500000000, To, Tl);
Tw = KP866025403 * (Tn - Tm);
}
/* source row 1, iio */
Tx = iio[WS(vs, 1)];
Tr = iio[WS(vs, 1) + WS(rs, 1)];
Ts = iio[WS(vs, 1) + WS(rs, 2)];
Ty = Tr + Ts;
Tt = KP866025403 * (Tr - Ts);
Tz = FNMS(KP500000000, Ty, Tx);
{
E TL, TM, TG, TH;
/* source row 2, iio then rio */
TR = iio[WS(vs, 2)];
TL = iio[WS(vs, 2) + WS(rs, 1)];
TM = iio[WS(vs, 2) + WS(rs, 2)];
TS = TL + TM;
TN = KP866025403 * (TL - TM);
TT = FNMS(KP500000000, TS, TR);
TF = rio[WS(vs, 2)];
TG = rio[WS(vs, 2) + WS(rs, 1)];
TH = rio[WS(vs, 2) + WS(rs, 2)];
TI = TG + TH;
TK = FNMS(KP500000000, TI, TF);
TQ = KP866025403 * (TH - TG);
}
}
/* Plain sums: the sum of source row a is stored at WS(rs, a). */
rio[0] = T1 + T4;
iio[0] = Td + Te;
rio[WS(rs, 1)] = Tl + To;
iio[WS(rs, 1)] = Tx + Ty;
iio[WS(rs, 2)] = TR + TS;
rio[WS(rs, 2)] = TF + TI;
/* Source row 0 with (W[0], W[1]) -> WS(vs, 1). */
{
E Ta, Tg, T5, Tb;
Ta = T6 + T9;
Tg = Tc + Tf;
T5 = W[0];
Tb = W[1];
rio[WS(vs, 1)] = FMA(T5, Ta, Tb * Tg);
iio[WS(vs, 1)] = FNMS(Tb, Ta, T5 * Tg);
}
/* Source row 2 with (W[2], W[3]) -> WS(vs, 2) + WS(rs, 2). */
{
E TW, TY, TV, TX;
TW = TK - TN;
TY = TT - TQ;
TV = W[2];
TX = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(TV, TW, TX * TY);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(TX, TW, TV * TY);
}
/* Source row 1 with (W[2], W[3]) -> WS(vs, 2) + WS(rs, 1). */
{
E TC, TE, TB, TD;
TC = Tq - Tt;
TE = Tz - Tw;
TB = W[2];
TD = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(TB, TC, TD * TE);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TD, TC, TB * TE);
}
/* Source row 1 with (W[0], W[1]) -> WS(vs, 1) + WS(rs, 1). */
{
E Tu, TA, Tp, Tv;
Tu = Tq + Tt;
TA = Tw + Tz;
Tp = W[0];
Tv = W[1];
rio[WS(vs, 1) + WS(rs, 1)] = FMA(Tp, Tu, Tv * TA);
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(Tv, Tu, Tp * TA);
}
/* Source row 2 with (W[0], W[1]) -> WS(vs, 1) + WS(rs, 2). */
{
E TO, TU, TJ, TP;
TO = TK + TN;
TU = TQ + TT;
TJ = W[0];
TP = W[1];
rio[WS(vs, 1) + WS(rs, 2)] = FMA(TJ, TO, TP * TU);
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(TP, TO, TJ * TU);
}
/* Source row 0 with (W[2], W[3]) -> WS(vs, 2). */
{
E Ti, Tk, Th, Tj;
Ti = T6 - T9;
Tk = Tf - Tc;
Th = W[2];
Tj = W[3];
rio[WS(vs, 2)] = FMA(Th, Ti, Tj * Tk);
iio[WS(vs, 2)] = FNMS(Tj, Ti, Th * Tk);
}
}
}
}
/* Twiddle instruction list referenced by desc below.
 * NOTE(review): presumably describes the W layout the kernel indexes
 * (W[0] .. W[3] per pass) -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor: size 3, name "q1_3"; { 30, 18, 18, 0 } repeats the additions /
 * multiplications / fused multiply-add counts from the generator's summary
 * comment for this variant. */
static const ct_desc desc = { 3, "q1_3", twinstr, &GENUS, { 30, 18, 18, 0 }, 0, 0, 0 };
/* Registration hook: passes the q1_3 kernel and its descriptor to the planner. */
void X(codelet_q1_3) (planner *p) {
X(kdft_difsq_register) (p, q1_3, &desc);
}
#endif

View File

@@ -0,0 +1,524 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
/*
* This function contains 88 FP additions, 48 FP multiplications,
* (or, 64 additions, 24 multiplications, 24 fused multiply/add),
* 51 stack variables, 0 constants, and 64 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_4 (-fma variant): in-place 4x4 kernel on rio/iio.
 *
 * Element (a, b) is addressed as WS(vs, a) + WS(rs, b).  For every source
 * row a the four entries b = 0..3 are read and combined; the plain sum is
 * written to WS(rs, a), while the three other combinations are multiplied
 * by the W pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]) and written to
 * WS(vs, 1) + WS(rs, a), WS(vs, 2) + WS(rs, a) and WS(vs, 3) + WS(rs, a).
 * All rio/iio reads happen before the first write.
 *
 * The loop runs m = mb .. me - 1; rio/iio advance by ms and W by 6 per pass
 * (W is offset by mb * 6 before the first pass).
 */
static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
{
INT m;
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T3, Tv, Tw, T6, Tc, Tf, Tx, Ts, Tm, Ti, T1H, T29, T2a, T1K, T1Q;
E T1T, T2b, T26, T20, T1W, TB, T13, T14, TE, TK, TN, T15, T10, TU, TQ;
E T19, T1B, T1C, T1c, T1i, T1l, T1D, T1y, T1s, T1o;
/* Source row 0: pairwise sums and differences of entries (0, 2) and (1, 3). */
{
E T1, T2, Tb, Tg, Th, T8;
{
E T9, Ta, T4, T5;
T1 = rio[0];
T2 = rio[WS(rs, 2)];
T3 = T1 + T2;
T9 = iio[0];
Ta = iio[WS(rs, 2)];
Tb = T9 - Ta;
Tv = T9 + Ta;
Tg = iio[WS(rs, 1)];
Th = iio[WS(rs, 3)];
Tw = Tg + Th;
T4 = rio[WS(rs, 1)];
T5 = rio[WS(rs, 3)];
T6 = T4 + T5;
T8 = T4 - T5;
}
Tc = T8 + Tb;
Tf = T1 - T2;
Tx = Tv - Tw;
Ts = T3 - T6;
Tm = Tb - T8;
Ti = Tg - Th;
}
/* Source row 3: same combinations. */
{
E T1F, T1G, T1P, T1U, T1V, T1M;
{
E T1N, T1O, T1I, T1J;
T1F = rio[WS(vs, 3)];
T1G = rio[WS(vs, 3) + WS(rs, 2)];
T1H = T1F + T1G;
T1N = iio[WS(vs, 3)];
T1O = iio[WS(vs, 3) + WS(rs, 2)];
T1P = T1N - T1O;
T29 = T1N + T1O;
T1U = iio[WS(vs, 3) + WS(rs, 1)];
T1V = iio[WS(vs, 3) + WS(rs, 3)];
T2a = T1U + T1V;
T1I = rio[WS(vs, 3) + WS(rs, 1)];
T1J = rio[WS(vs, 3) + WS(rs, 3)];
T1K = T1I + T1J;
T1M = T1I - T1J;
}
T1Q = T1M + T1P;
T1T = T1F - T1G;
T2b = T29 - T2a;
T26 = T1H - T1K;
T20 = T1P - T1M;
T1W = T1U - T1V;
}
/* Source row 1: same combinations. */
{
E Tz, TA, TJ, TO, TP, TG;
{
E TH, TI, TC, TD;
Tz = rio[WS(vs, 1)];
TA = rio[WS(vs, 1) + WS(rs, 2)];
TB = Tz + TA;
TH = iio[WS(vs, 1)];
TI = iio[WS(vs, 1) + WS(rs, 2)];
TJ = TH - TI;
T13 = TH + TI;
TO = iio[WS(vs, 1) + WS(rs, 1)];
TP = iio[WS(vs, 1) + WS(rs, 3)];
T14 = TO + TP;
TC = rio[WS(vs, 1) + WS(rs, 1)];
TD = rio[WS(vs, 1) + WS(rs, 3)];
TE = TC + TD;
TG = TC - TD;
}
TK = TG + TJ;
TN = Tz - TA;
T15 = T13 - T14;
T10 = TB - TE;
TU = TJ - TG;
TQ = TO - TP;
}
/* Source row 2: same combinations. */
{
E T17, T18, T1h, T1m, T1n, T1e;
{
E T1f, T1g, T1a, T1b;
T17 = rio[WS(vs, 2)];
T18 = rio[WS(vs, 2) + WS(rs, 2)];
T19 = T17 + T18;
T1f = iio[WS(vs, 2)];
T1g = iio[WS(vs, 2) + WS(rs, 2)];
T1h = T1f - T1g;
T1B = T1f + T1g;
T1m = iio[WS(vs, 2) + WS(rs, 1)];
T1n = iio[WS(vs, 2) + WS(rs, 3)];
T1C = T1m + T1n;
T1a = rio[WS(vs, 2) + WS(rs, 1)];
T1b = rio[WS(vs, 2) + WS(rs, 3)];
T1c = T1a + T1b;
T1e = T1a - T1b;
}
T1i = T1e + T1h;
T1l = T17 - T18;
T1D = T1B - T1C;
T1y = T19 - T1c;
T1s = T1h - T1e;
T1o = T1m - T1n;
}
/* Plain sums: the sum of source row a is stored at WS(rs, a). */
rio[0] = T3 + T6;
iio[0] = Tv + Tw;
rio[WS(rs, 1)] = TB + TE;
iio[WS(rs, 1)] = T13 + T14;
rio[WS(rs, 2)] = T19 + T1c;
iio[WS(rs, 2)] = T1B + T1C;
iio[WS(rs, 3)] = T29 + T2a;
rio[WS(rs, 3)] = T1H + T1K;
/* (W[2], W[3]) group, source row 0 -> WS(vs, 2). */
{
E Tt, Ty, Tr, Tu;
Tr = W[2];
Tt = Tr * Ts;
Ty = Tr * Tx;
Tu = W[3];
rio[WS(vs, 2)] = FMA(Tu, Tx, Tt);
iio[WS(vs, 2)] = FNMS(Tu, Ts, Ty);
}
/* (W[2], W[3]) group, source row 3 -> WS(vs, 2) + WS(rs, 3). */
{
E T27, T2c, T25, T28;
T25 = W[2];
T27 = T25 * T26;
T2c = T25 * T2b;
T28 = W[3];
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T28, T2b, T27);
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T28, T26, T2c);
}
/* (W[2], W[3]) group, source row 1 -> WS(vs, 2) + WS(rs, 1). */
{
E T11, T16, TZ, T12;
TZ = W[2];
T11 = TZ * T10;
T16 = TZ * T15;
T12 = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(T12, T15, T11);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T12, T10, T16);
}
/* (W[2], W[3]) group, source row 2 -> WS(vs, 2) + WS(rs, 2). */
{
E T1z, T1E, T1x, T1A;
T1x = W[2];
T1z = T1x * T1y;
T1E = T1x * T1D;
T1A = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1A, T1D, T1z);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1A, T1y, T1E);
}
/* (W[4], W[5]) group, source row 0 -> WS(vs, 3). */
{
E Tj, Te, Tk, T7, Td;
Tj = Tf - Ti;
Te = W[5];
Tk = Te * Tc;
T7 = W[4];
Td = T7 * Tc;
iio[WS(vs, 3)] = FNMS(Te, Tj, Td);
rio[WS(vs, 3)] = FMA(T7, Tj, Tk);
}
/* (W[4], W[5]) group, source row 2 -> WS(vs, 3) + WS(rs, 2). */
{
E T1p, T1k, T1q, T1d, T1j;
T1p = T1l - T1o;
T1k = W[5];
T1q = T1k * T1i;
T1d = W[4];
T1j = T1d * T1i;
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T1k, T1p, T1j);
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T1d, T1p, T1q);
}
/* (W[0], W[1]) group, source row 3 -> WS(vs, 1) + WS(rs, 3). */
{
E T23, T22, T24, T1Z, T21;
T23 = T1T + T1W;
T22 = W[1];
T24 = T22 * T20;
T1Z = W[0];
T21 = T1Z * T20;
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T22, T23, T21);
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1Z, T23, T24);
}
/* (W[0], W[1]) group, source row 1 -> WS(vs, 1) + WS(rs, 1). */
{
E TX, TW, TY, TT, TV;
TX = TN + TQ;
TW = W[1];
TY = TW * TU;
TT = W[0];
TV = TT * TU;
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TW, TX, TV);
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TT, TX, TY);
}
/* (W[4], W[5]) group, source row 1 -> WS(vs, 3) + WS(rs, 1). */
{
E TR, TM, TS, TF, TL;
TR = TN - TQ;
TM = W[5];
TS = TM * TK;
TF = W[4];
TL = TF * TK;
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TM, TR, TL);
rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TR, TS);
}
/* (W[0], W[1]) group, source row 0 -> WS(vs, 1). */
{
E Tp, To, Tq, Tl, Tn;
Tp = Tf + Ti;
To = W[1];
Tq = To * Tm;
Tl = W[0];
Tn = Tl * Tm;
iio[WS(vs, 1)] = FNMS(To, Tp, Tn);
rio[WS(vs, 1)] = FMA(Tl, Tp, Tq);
}
/* (W[0], W[1]) group, source row 2 -> WS(vs, 1) + WS(rs, 2). */
{
E T1v, T1u, T1w, T1r, T1t;
T1v = T1l + T1o;
T1u = W[1];
T1w = T1u * T1s;
T1r = W[0];
T1t = T1r * T1s;
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1u, T1v, T1t);
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1r, T1v, T1w);
}
/* (W[4], W[5]) group, source row 3 -> WS(vs, 3) + WS(rs, 3). */
{
E T1X, T1S, T1Y, T1L, T1R;
T1X = T1T - T1W;
T1S = W[5];
T1Y = T1S * T1Q;
T1L = W[4];
T1R = T1L * T1Q;
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1S, T1X, T1R);
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1L, T1X, T1Y);
}
}
}
}
/* Twiddle instruction list referenced by desc below.
 * NOTE(review): presumably describes the W layout the kernel indexes
 * (W[0] .. W[5] per pass) -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 4 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor: size 4, name "q1_4"; { 64, 24, 24, 0 } repeats the additions /
 * multiplications / fused multiply-add counts from the generator's summary
 * comment for this variant. */
static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, { 64, 24, 24, 0 }, 0, 0, 0 };
/* Registration hook: passes the q1_4 kernel and its descriptor to the planner. */
void X(codelet_q1_4) (planner *p) {
X(kdft_difsq_register) (p, q1_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 4 -name q1_4 -include dft/scalar/q.h */
/*
* This function contains 88 FP additions, 48 FP multiplications,
* (or, 64 additions, 24 multiplications, 24 fused multiply/add),
* 37 stack variables, 0 constants, and 64 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_4 (variant generated without -fma): in-place 4x4 kernel on rio/iio.
 *
 * Element (a, b) is addressed as WS(vs, a) + WS(rs, b).  For every source
 * row a the four entries b = 0..3 are read and combined; the plain sum is
 * written to WS(rs, a), while the three other combinations are multiplied
 * by the W pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]) and written to
 * WS(vs, 1) + WS(rs, a), WS(vs, 2) + WS(rs, a) and WS(vs, 3) + WS(rs, a).
 * All rio/iio reads happen before the first write.
 *
 * The loop runs m = mb .. me - 1; rio/iio advance by ms and W by 6 per pass
 * (W is offset by mb * 6 before the first pass).
 */
static void q1_4(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
{
INT m;
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T3, Te, Tb, Tq, T6, T8, Th, Tr, Tv, TG, TD, TS, Ty, TA, TJ;
E TT, TX, T18, T15, T1k, T10, T12, T1b, T1l, T1p, T1A, T1x, T1M, T1s, T1u;
E T1D, T1N;
/* Source row 0, entries 0 and 2: sums and differences. */
{
E T1, T2, T9, Ta;
T1 = rio[0];
T2 = rio[WS(rs, 2)];
T3 = T1 + T2;
Te = T1 - T2;
T9 = iio[0];
Ta = iio[WS(rs, 2)];
Tb = T9 - Ta;
Tq = T9 + Ta;
}
/* Source row 0, entries 1 and 3. */
{
E T4, T5, Tf, Tg;
T4 = rio[WS(rs, 1)];
T5 = rio[WS(rs, 3)];
T6 = T4 + T5;
T8 = T4 - T5;
Tf = iio[WS(rs, 1)];
Tg = iio[WS(rs, 3)];
Th = Tf - Tg;
Tr = Tf + Tg;
}
/* Source row 1, entries 0 and 2. */
{
E Tt, Tu, TB, TC;
Tt = rio[WS(vs, 1)];
Tu = rio[WS(vs, 1) + WS(rs, 2)];
Tv = Tt + Tu;
TG = Tt - Tu;
TB = iio[WS(vs, 1)];
TC = iio[WS(vs, 1) + WS(rs, 2)];
TD = TB - TC;
TS = TB + TC;
}
/* Source row 1, entries 1 and 3. */
{
E Tw, Tx, TH, TI;
Tw = rio[WS(vs, 1) + WS(rs, 1)];
Tx = rio[WS(vs, 1) + WS(rs, 3)];
Ty = Tw + Tx;
TA = Tw - Tx;
TH = iio[WS(vs, 1) + WS(rs, 1)];
TI = iio[WS(vs, 1) + WS(rs, 3)];
TJ = TH - TI;
TT = TH + TI;
}
/* Source row 2, entries 0 and 2. */
{
E TV, TW, T13, T14;
TV = rio[WS(vs, 2)];
TW = rio[WS(vs, 2) + WS(rs, 2)];
TX = TV + TW;
T18 = TV - TW;
T13 = iio[WS(vs, 2)];
T14 = iio[WS(vs, 2) + WS(rs, 2)];
T15 = T13 - T14;
T1k = T13 + T14;
}
/* Source row 2, entries 1 and 3. */
{
E TY, TZ, T19, T1a;
TY = rio[WS(vs, 2) + WS(rs, 1)];
TZ = rio[WS(vs, 2) + WS(rs, 3)];
T10 = TY + TZ;
T12 = TY - TZ;
T19 = iio[WS(vs, 2) + WS(rs, 1)];
T1a = iio[WS(vs, 2) + WS(rs, 3)];
T1b = T19 - T1a;
T1l = T19 + T1a;
}
/* Source row 3, entries 0 and 2. */
{
E T1n, T1o, T1v, T1w;
T1n = rio[WS(vs, 3)];
T1o = rio[WS(vs, 3) + WS(rs, 2)];
T1p = T1n + T1o;
T1A = T1n - T1o;
T1v = iio[WS(vs, 3)];
T1w = iio[WS(vs, 3) + WS(rs, 2)];
T1x = T1v - T1w;
T1M = T1v + T1w;
}
/* Source row 3, entries 1 and 3. */
{
E T1q, T1r, T1B, T1C;
T1q = rio[WS(vs, 3) + WS(rs, 1)];
T1r = rio[WS(vs, 3) + WS(rs, 3)];
T1s = T1q + T1r;
T1u = T1q - T1r;
T1B = iio[WS(vs, 3) + WS(rs, 1)];
T1C = iio[WS(vs, 3) + WS(rs, 3)];
T1D = T1B - T1C;
T1N = T1B + T1C;
}
/* Plain sums: the sum of source row a is stored at WS(rs, a). */
rio[0] = T3 + T6;
iio[0] = Tq + Tr;
rio[WS(rs, 1)] = Tv + Ty;
iio[WS(rs, 1)] = TS + TT;
rio[WS(rs, 2)] = TX + T10;
iio[WS(rs, 2)] = T1k + T1l;
iio[WS(rs, 3)] = T1M + T1N;
rio[WS(rs, 3)] = T1p + T1s;
/* (W[4], W[5]) group, source row 0 -> WS(vs, 3). */
{
E Tc, Ti, T7, Td;
Tc = T8 + Tb;
Ti = Te - Th;
T7 = W[4];
Td = W[5];
iio[WS(vs, 3)] = FNMS(Td, Ti, T7 * Tc);
rio[WS(vs, 3)] = FMA(Td, Tc, T7 * Ti);
}
/* (W[2], W[3]) group, source row 3 -> WS(vs, 2) + WS(rs, 3). */
{
E T1K, T1O, T1J, T1L;
T1K = T1p - T1s;
T1O = T1M - T1N;
T1J = W[2];
T1L = W[3];
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T1J, T1K, T1L * T1O);
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T1L, T1K, T1J * T1O);
}
/* (W[0], W[1]) group, source row 0 -> WS(vs, 1). */
{
E Tk, Tm, Tj, Tl;
Tk = Tb - T8;
Tm = Te + Th;
Tj = W[0];
Tl = W[1];
iio[WS(vs, 1)] = FNMS(Tl, Tm, Tj * Tk);
rio[WS(vs, 1)] = FMA(Tl, Tk, Tj * Tm);
}
/* (W[2], W[3]) group, source row 0 -> WS(vs, 2). */
{
E To, Ts, Tn, Tp;
To = T3 - T6;
Ts = Tq - Tr;
Tn = W[2];
Tp = W[3];
rio[WS(vs, 2)] = FMA(Tn, To, Tp * Ts);
iio[WS(vs, 2)] = FNMS(Tp, To, Tn * Ts);
}
/* (W[4], W[5]) group, source row 2 -> WS(vs, 3) + WS(rs, 2). */
{
E T16, T1c, T11, T17;
T16 = T12 + T15;
T1c = T18 - T1b;
T11 = W[4];
T17 = W[5];
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T17, T1c, T11 * T16);
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T17, T16, T11 * T1c);
}
/* (W[0], W[1]) group, source row 3 -> WS(vs, 1) + WS(rs, 3). */
{
E T1G, T1I, T1F, T1H;
T1G = T1x - T1u;
T1I = T1A + T1D;
T1F = W[0];
T1H = W[1];
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T1H, T1I, T1F * T1G);
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T1H, T1G, T1F * T1I);
}
/* (W[2], W[3]) group, source row 1 -> WS(vs, 2) + WS(rs, 1). */
{
E TQ, TU, TP, TR;
TQ = Tv - Ty;
TU = TS - TT;
TP = W[2];
TR = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(TP, TQ, TR * TU);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(TR, TQ, TP * TU);
}
/* (W[0], W[1]) group, source row 2 -> WS(vs, 1) + WS(rs, 2). */
{
E T1e, T1g, T1d, T1f;
T1e = T15 - T12;
T1g = T18 + T1b;
T1d = W[0];
T1f = W[1];
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1f, T1g, T1d * T1e);
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1f, T1e, T1d * T1g);
}
/* (W[2], W[3]) group, source row 2 -> WS(vs, 2) + WS(rs, 2). */
{
E T1i, T1m, T1h, T1j;
T1i = TX - T10;
T1m = T1k - T1l;
T1h = W[2];
T1j = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T1h, T1i, T1j * T1m);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T1j, T1i, T1h * T1m);
}
/* (W[4], W[5]) group, source row 3 -> WS(vs, 3) + WS(rs, 3). */
{
E T1y, T1E, T1t, T1z;
T1y = T1u + T1x;
T1E = T1A - T1D;
T1t = W[4];
T1z = W[5];
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T1z, T1E, T1t * T1y);
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T1z, T1y, T1t * T1E);
}
/* (W[0], W[1]) group, source row 1 -> WS(vs, 1) + WS(rs, 1). */
{
E TM, TO, TL, TN;
TM = TD - TA;
TO = TG + TJ;
TL = W[0];
TN = W[1];
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(TN, TO, TL * TM);
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TN, TM, TL * TO);
}
/* (W[4], W[5]) group, source row 1 -> WS(vs, 3) + WS(rs, 1). */
{
E TE, TK, Tz, TF;
TE = TA + TD;
TK = TG - TJ;
Tz = W[4];
TF = W[5];
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(TF, TK, Tz * TE);
rio[WS(vs, 3) + WS(rs, 1)] = FMA(TF, TE, Tz * TK);
}
}
}
}
/* Twiddle instruction list referenced by desc below.
 * NOTE(review): presumably describes the W layout the kernel indexes
 * (W[0] .. W[5] per pass) -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 4 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor: size 4, name "q1_4"; { 64, 24, 24, 0 } repeats the additions /
 * multiplications / fused multiply-add counts from the generator's summary
 * comment for this variant. */
static const ct_desc desc = { 4, "q1_4", twinstr, &GENUS, { 64, 24, 24, 0 }, 0, 0, 0 };
/* Registration hook: passes the q1_4 kernel and its descriptor to the planner. */
void X(codelet_q1_4) (planner *p) {
X(kdft_difsq_register) (p, q1_4, &desc);
}
#endif

View File

@@ -0,0 +1,992 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:41 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twidsq.native -fma -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
/*
* This function contains 200 FP additions, 170 FP multiplications,
* (or, 70 additions, 40 multiplications, 130 fused multiply/add),
* 75 stack variables, 4 constants, and 100 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_5 -- size-5 "twiddle square" kernel (generator flags: -dif -n 5), variant
 * compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Parameters:
 *   rio, iio  real and imaginary data arrays, updated in place
 *   W         twiddle table; 8 values (4 pairs W[2k-2], W[2k-1], k = 1..4)
 *             are consumed per loop iteration
 *   rs, vs    the two strides used to address the 5 x 5 tile through
 *             WS(vs, j) + WS(rs, k)
 *   mb, me    half-open iteration range [mb, me)
 *   ms        element step applied to rio and iio after each iteration
 *
 * Per iteration the body
 *   1. loads the 25 complex values at [WS(vs, j) + WS(rs, k)], j, k = 0..4;
 *   2. for each j combines the five k-entries with a 5-point butterfly;
 *   3. stores the plain sum of row j at [WS(rs, j)] and the remaining
 *      outputs k = 1..4 of row j, multiplied by twiddle pair k, at
 *      [WS(vs, k) + WS(rs, j)] -- i.e. the tile is written back transposed.
 * The twiddle product is re' = W[2k-2]*re + W[2k-1]*im and
 * im' = W[2k-2]*im - W[2k-1]*re, reading FMA(a, b, c) as a*b + c and
 * FNMS(a, b, c) as c - a*b (both macros are defined outside this chunk --
 * confirm).
 *
 * R, E, stride, INT, DK, WS and MAKE_VOLATILE_STRIDE come from the included
 * headers and are not visible here.
 */
static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
/* Numerically: sin(2*pi/5), sqrt(5)/4, 1/4 and (sqrt(5) - 1)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
{
INT m;
/* W is first advanced past the 8 twiddle values of each skipped iteration. */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T1, Tb, TM, Tw, T8, Ta, Tn, Tj, TH, Ts, Tq, Tr, TV, T15, T1G;
E T1q, T12, T14, T1h, T1d, T1B, T1m, T1k, T1l, T1P, T1Z, T2A, T2k, T1W, T1Y;
E T2b, T27, T2v, T2g, T2e, T2f, T3Z, T3V, T4j, T44, T42, T43, T3D, T3N, T4o;
E T48, T3K, T3M, T2J, T2T, T3u, T3e, T2Q, T2S, T35, T31, T3p, T3a, T38, T39;
/* Row vs = 0, real parts: sums/differences of entries (1,4) and (2,3). */
{
E T7, Tv, T4, Tu;
T1 = rio[0];
{
E T5, T6, T2, T3;
T5 = rio[WS(rs, 2)];
T6 = rio[WS(rs, 3)];
T7 = T5 + T6;
Tv = T5 - T6;
T2 = rio[WS(rs, 1)];
T3 = rio[WS(rs, 4)];
T4 = T2 + T3;
Tu = T2 - T3;
}
Tb = T4 - T7;
TM = FNMS(KP618033988, Tu, Tv);
Tw = FMA(KP618033988, Tv, Tu);
T8 = T4 + T7;
Ta = FNMS(KP250000000, T8, T1);
}
/* Row vs = 0, imaginary parts. */
{
E Ti, Tp, Tf, To;
Tn = iio[0];
{
E Tg, Th, Td, Te;
Tg = iio[WS(rs, 2)];
Th = iio[WS(rs, 3)];
Ti = Tg - Th;
Tp = Tg + Th;
Td = iio[WS(rs, 1)];
Te = iio[WS(rs, 4)];
Tf = Td - Te;
To = Td + Te;
}
Tj = FMA(KP618033988, Ti, Tf);
TH = FNMS(KP618033988, Tf, Ti);
Ts = To - Tp;
Tq = To + Tp;
Tr = FNMS(KP250000000, Tq, Tn);
}
/* Row vs = 1, real parts. */
{
E T11, T1p, TY, T1o;
TV = rio[WS(vs, 1)];
{
E TZ, T10, TW, TX;
TZ = rio[WS(vs, 1) + WS(rs, 2)];
T10 = rio[WS(vs, 1) + WS(rs, 3)];
T11 = TZ + T10;
T1p = TZ - T10;
TW = rio[WS(vs, 1) + WS(rs, 1)];
TX = rio[WS(vs, 1) + WS(rs, 4)];
TY = TW + TX;
T1o = TW - TX;
}
T15 = TY - T11;
T1G = FNMS(KP618033988, T1o, T1p);
T1q = FMA(KP618033988, T1p, T1o);
T12 = TY + T11;
T14 = FNMS(KP250000000, T12, TV);
}
/* Row vs = 1, imaginary parts. */
{
E T1c, T1j, T19, T1i;
T1h = iio[WS(vs, 1)];
{
E T1a, T1b, T17, T18;
T1a = iio[WS(vs, 1) + WS(rs, 2)];
T1b = iio[WS(vs, 1) + WS(rs, 3)];
T1c = T1a - T1b;
T1j = T1a + T1b;
T17 = iio[WS(vs, 1) + WS(rs, 1)];
T18 = iio[WS(vs, 1) + WS(rs, 4)];
T19 = T17 - T18;
T1i = T17 + T18;
}
T1d = FMA(KP618033988, T1c, T19);
T1B = FNMS(KP618033988, T19, T1c);
T1m = T1i - T1j;
T1k = T1i + T1j;
T1l = FNMS(KP250000000, T1k, T1h);
}
/* Row vs = 2, real parts. */
{
E T1V, T2j, T1S, T2i;
T1P = rio[WS(vs, 2)];
{
E T1T, T1U, T1Q, T1R;
T1T = rio[WS(vs, 2) + WS(rs, 2)];
T1U = rio[WS(vs, 2) + WS(rs, 3)];
T1V = T1T + T1U;
T2j = T1T - T1U;
T1Q = rio[WS(vs, 2) + WS(rs, 1)];
T1R = rio[WS(vs, 2) + WS(rs, 4)];
T1S = T1Q + T1R;
T2i = T1Q - T1R;
}
T1Z = T1S - T1V;
T2A = FNMS(KP618033988, T2i, T2j);
T2k = FMA(KP618033988, T2j, T2i);
T1W = T1S + T1V;
T1Y = FNMS(KP250000000, T1W, T1P);
}
/* Row vs = 2, imaginary parts. */
{
E T26, T2d, T23, T2c;
T2b = iio[WS(vs, 2)];
{
E T24, T25, T21, T22;
T24 = iio[WS(vs, 2) + WS(rs, 2)];
T25 = iio[WS(vs, 2) + WS(rs, 3)];
T26 = T24 - T25;
T2d = T24 + T25;
T21 = iio[WS(vs, 2) + WS(rs, 1)];
T22 = iio[WS(vs, 2) + WS(rs, 4)];
T23 = T21 - T22;
T2c = T21 + T22;
}
T27 = FMA(KP618033988, T26, T23);
T2v = FNMS(KP618033988, T23, T26);
T2g = T2c - T2d;
T2e = T2c + T2d;
T2f = FNMS(KP250000000, T2e, T2b);
}
/* Row vs = 4, imaginary parts. */
{
E T3U, T41, T3R, T40;
T3Z = iio[WS(vs, 4)];
{
E T3S, T3T, T3P, T3Q;
T3S = iio[WS(vs, 4) + WS(rs, 2)];
T3T = iio[WS(vs, 4) + WS(rs, 3)];
T3U = T3S - T3T;
T41 = T3S + T3T;
T3P = iio[WS(vs, 4) + WS(rs, 1)];
T3Q = iio[WS(vs, 4) + WS(rs, 4)];
T3R = T3P - T3Q;
T40 = T3P + T3Q;
}
T3V = FMA(KP618033988, T3U, T3R);
T4j = FNMS(KP618033988, T3R, T3U);
T44 = T40 - T41;
T42 = T40 + T41;
T43 = FNMS(KP250000000, T42, T3Z);
}
/* Row vs = 4, real parts. */
{
E T3J, T47, T3G, T46;
T3D = rio[WS(vs, 4)];
{
E T3H, T3I, T3E, T3F;
T3H = rio[WS(vs, 4) + WS(rs, 2)];
T3I = rio[WS(vs, 4) + WS(rs, 3)];
T3J = T3H + T3I;
T47 = T3H - T3I;
T3E = rio[WS(vs, 4) + WS(rs, 1)];
T3F = rio[WS(vs, 4) + WS(rs, 4)];
T3G = T3E + T3F;
T46 = T3E - T3F;
}
T3N = T3G - T3J;
T4o = FNMS(KP618033988, T46, T47);
T48 = FMA(KP618033988, T47, T46);
T3K = T3G + T3J;
T3M = FNMS(KP250000000, T3K, T3D);
}
/* Row vs = 3, real parts. */
{
E T2P, T3d, T2M, T3c;
T2J = rio[WS(vs, 3)];
{
E T2N, T2O, T2K, T2L;
T2N = rio[WS(vs, 3) + WS(rs, 2)];
T2O = rio[WS(vs, 3) + WS(rs, 3)];
T2P = T2N + T2O;
T3d = T2N - T2O;
T2K = rio[WS(vs, 3) + WS(rs, 1)];
T2L = rio[WS(vs, 3) + WS(rs, 4)];
T2M = T2K + T2L;
T3c = T2K - T2L;
}
T2T = T2M - T2P;
T3u = FNMS(KP618033988, T3c, T3d);
T3e = FMA(KP618033988, T3d, T3c);
T2Q = T2M + T2P;
T2S = FNMS(KP250000000, T2Q, T2J);
}
/* Row vs = 3, imaginary parts. */
{
E T30, T37, T2X, T36;
T35 = iio[WS(vs, 3)];
{
E T2Y, T2Z, T2V, T2W;
T2Y = iio[WS(vs, 3) + WS(rs, 2)];
T2Z = iio[WS(vs, 3) + WS(rs, 3)];
T30 = T2Y - T2Z;
T37 = T2Y + T2Z;
T2V = iio[WS(vs, 3) + WS(rs, 1)];
T2W = iio[WS(vs, 3) + WS(rs, 4)];
T2X = T2V - T2W;
T36 = T2V + T2W;
}
T31 = FMA(KP618033988, T30, T2X);
T3p = FNMS(KP618033988, T2X, T30);
T3a = T36 - T37;
T38 = T36 + T37;
T39 = FNMS(KP250000000, T38, T35);
}
/* All 25 inputs are now held in locals, so the in-place stores below are safe. */
/* Untwiddled sums: the sum of row vs = j is stored at rs-position j. */
rio[0] = T1 + T8;
iio[0] = Tn + Tq;
rio[WS(rs, 1)] = TV + T12;
iio[WS(rs, 1)] = T1h + T1k;
rio[WS(rs, 2)] = T1P + T1W;
iio[WS(rs, 2)] = T2b + T2e;
iio[WS(rs, 4)] = T3Z + T42;
rio[WS(rs, 4)] = T3D + T3K;
rio[WS(rs, 3)] = T2J + T2Q;
iio[WS(rs, 3)] = T35 + T38;
/* Row 0: outputs stored at vs = 1 (W[0], W[1]) and vs = 4 (W[6], W[7]), rs-position 0. */
{
E Tk, TA, Tx, TD, Tc, Tt;
Tc = FMA(KP559016994, Tb, Ta);
Tk = FMA(KP951056516, Tj, Tc);
TA = FNMS(KP951056516, Tj, Tc);
Tt = FMA(KP559016994, Ts, Tr);
Tx = FNMS(KP951056516, Tw, Tt);
TD = FMA(KP951056516, Tw, Tt);
{
E Tl, Ty, T9, Tm;
T9 = W[0];
Tl = T9 * Tk;
Ty = T9 * Tx;
Tm = W[1];
rio[WS(vs, 1)] = FMA(Tm, Tx, Tl);
iio[WS(vs, 1)] = FNMS(Tm, Tk, Ty);
}
{
E TB, TE, Tz, TC;
Tz = W[6];
TB = Tz * TA;
TE = Tz * TD;
TC = W[7];
rio[WS(vs, 4)] = FMA(TC, TD, TB);
iio[WS(vs, 4)] = FNMS(TC, TA, TE);
}
}
/* Row 0: outputs stored at vs = 2 (W[2], W[3]) and vs = 3 (W[4], W[5]), rs-position 0. */
{
E TI, TQ, TN, TT, TG, TL;
TG = FNMS(KP559016994, Tb, Ta);
TI = FNMS(KP951056516, TH, TG);
TQ = FMA(KP951056516, TH, TG);
TL = FNMS(KP559016994, Ts, Tr);
TN = FMA(KP951056516, TM, TL);
TT = FNMS(KP951056516, TM, TL);
{
E TJ, TO, TF, TK;
TF = W[2];
TJ = TF * TI;
TO = TF * TN;
TK = W[3];
rio[WS(vs, 2)] = FMA(TK, TN, TJ);
iio[WS(vs, 2)] = FNMS(TK, TI, TO);
}
{
E TR, TU, TP, TS;
TP = W[4];
TR = TP * TQ;
TU = TP * TT;
TS = W[5];
rio[WS(vs, 3)] = FMA(TS, TT, TR);
iio[WS(vs, 3)] = FNMS(TS, TQ, TU);
}
}
/* Row 2: outputs stored at vs = 2 and vs = 3, rs-position 2. */
{
E T2w, T2E, T2B, T2H, T2u, T2z;
T2u = FNMS(KP559016994, T1Z, T1Y);
T2w = FNMS(KP951056516, T2v, T2u);
T2E = FMA(KP951056516, T2v, T2u);
T2z = FNMS(KP559016994, T2g, T2f);
T2B = FMA(KP951056516, T2A, T2z);
T2H = FNMS(KP951056516, T2A, T2z);
{
E T2x, T2C, T2t, T2y;
T2t = W[2];
T2x = T2t * T2w;
T2C = T2t * T2B;
T2y = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T2y, T2B, T2x);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2y, T2w, T2C);
}
{
E T2F, T2I, T2D, T2G;
T2D = W[4];
T2F = T2D * T2E;
T2I = T2D * T2H;
T2G = W[5];
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2G, T2H, T2F);
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2G, T2E, T2I);
}
}
/* Row 4: outputs stored at vs = 2 and vs = 3, rs-position 4. */
{
E T4k, T4s, T4p, T4v, T4i, T4n;
T4i = FNMS(KP559016994, T3N, T3M);
T4k = FNMS(KP951056516, T4j, T4i);
T4s = FMA(KP951056516, T4j, T4i);
T4n = FNMS(KP559016994, T44, T43);
T4p = FMA(KP951056516, T4o, T4n);
T4v = FNMS(KP951056516, T4o, T4n);
{
E T4l, T4q, T4h, T4m;
T4h = W[2];
T4l = T4h * T4k;
T4q = T4h * T4p;
T4m = W[3];
rio[WS(vs, 2) + WS(rs, 4)] = FMA(T4m, T4p, T4l);
iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T4m, T4k, T4q);
}
{
E T4t, T4w, T4r, T4u;
T4r = W[4];
T4t = T4r * T4s;
T4w = T4r * T4v;
T4u = W[5];
rio[WS(vs, 3) + WS(rs, 4)] = FMA(T4u, T4v, T4t);
iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T4u, T4s, T4w);
}
}
/* Row 2: outputs stored at vs = 1 and vs = 4, rs-position 2. */
{
E T28, T2o, T2l, T2r, T20, T2h;
T20 = FMA(KP559016994, T1Z, T1Y);
T28 = FMA(KP951056516, T27, T20);
T2o = FNMS(KP951056516, T27, T20);
T2h = FMA(KP559016994, T2g, T2f);
T2l = FNMS(KP951056516, T2k, T2h);
T2r = FMA(KP951056516, T2k, T2h);
{
E T29, T2m, T1X, T2a;
T1X = W[0];
T29 = T1X * T28;
T2m = T1X * T2l;
T2a = W[1];
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T2a, T2l, T29);
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T2a, T28, T2m);
}
{
E T2p, T2s, T2n, T2q;
T2n = W[6];
T2p = T2n * T2o;
T2s = T2n * T2r;
T2q = W[7];
rio[WS(vs, 4) + WS(rs, 2)] = FMA(T2q, T2r, T2p);
iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T2q, T2o, T2s);
}
}
/* Row 3: outputs stored at vs = 1 and vs = 4, rs-position 3. */
{
E T32, T3i, T3f, T3l, T2U, T3b;
T2U = FMA(KP559016994, T2T, T2S);
T32 = FMA(KP951056516, T31, T2U);
T3i = FNMS(KP951056516, T31, T2U);
T3b = FMA(KP559016994, T3a, T39);
T3f = FNMS(KP951056516, T3e, T3b);
T3l = FMA(KP951056516, T3e, T3b);
{
E T33, T3g, T2R, T34;
T2R = W[0];
T33 = T2R * T32;
T3g = T2R * T3f;
T34 = W[1];
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T34, T3f, T33);
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T34, T32, T3g);
}
{
E T3j, T3m, T3h, T3k;
T3h = W[6];
T3j = T3h * T3i;
T3m = T3h * T3l;
T3k = W[7];
rio[WS(vs, 4) + WS(rs, 3)] = FMA(T3k, T3l, T3j);
iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T3k, T3i, T3m);
}
}
/* Row 3: outputs stored at vs = 2 and vs = 3, rs-position 3. */
{
E T3q, T3y, T3v, T3B, T3o, T3t;
T3o = FNMS(KP559016994, T2T, T2S);
T3q = FNMS(KP951056516, T3p, T3o);
T3y = FMA(KP951056516, T3p, T3o);
T3t = FNMS(KP559016994, T3a, T39);
T3v = FMA(KP951056516, T3u, T3t);
T3B = FNMS(KP951056516, T3u, T3t);
{
E T3r, T3w, T3n, T3s;
T3n = W[2];
T3r = T3n * T3q;
T3w = T3n * T3v;
T3s = W[3];
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T3s, T3v, T3r);
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T3s, T3q, T3w);
}
{
E T3z, T3C, T3x, T3A;
T3x = W[4];
T3z = T3x * T3y;
T3C = T3x * T3B;
T3A = W[5];
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T3A, T3B, T3z);
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T3A, T3y, T3C);
}
}
/* Row 4: outputs stored at vs = 1 and vs = 4, rs-position 4. */
{
E T3W, T4c, T49, T4f, T3O, T45;
T3O = FMA(KP559016994, T3N, T3M);
T3W = FMA(KP951056516, T3V, T3O);
T4c = FNMS(KP951056516, T3V, T3O);
T45 = FMA(KP559016994, T44, T43);
T49 = FNMS(KP951056516, T48, T45);
T4f = FMA(KP951056516, T48, T45);
{
E T3X, T4a, T3L, T3Y;
T3L = W[0];
T3X = T3L * T3W;
T4a = T3L * T49;
T3Y = W[1];
rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3Y, T49, T3X);
iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3Y, T3W, T4a);
}
{
E T4d, T4g, T4b, T4e;
T4b = W[6];
T4d = T4b * T4c;
T4g = T4b * T4f;
T4e = W[7];
rio[WS(vs, 4) + WS(rs, 4)] = FMA(T4e, T4f, T4d);
iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T4e, T4c, T4g);
}
}
/* Row 1: outputs stored at vs = 2 and vs = 3, rs-position 1. */
{
E T1C, T1K, T1H, T1N, T1A, T1F;
T1A = FNMS(KP559016994, T15, T14);
T1C = FNMS(KP951056516, T1B, T1A);
T1K = FMA(KP951056516, T1B, T1A);
T1F = FNMS(KP559016994, T1m, T1l);
T1H = FMA(KP951056516, T1G, T1F);
T1N = FNMS(KP951056516, T1G, T1F);
{
E T1D, T1I, T1z, T1E;
T1z = W[2];
T1D = T1z * T1C;
T1I = T1z * T1H;
T1E = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1E, T1H, T1D);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1E, T1C, T1I);
}
{
E T1L, T1O, T1J, T1M;
T1J = W[4];
T1L = T1J * T1K;
T1O = T1J * T1N;
T1M = W[5];
rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1M, T1N, T1L);
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1M, T1K, T1O);
}
}
/* Row 1: outputs stored at vs = 1 and vs = 4, rs-position 1. */
{
E T1e, T1u, T1r, T1x, T16, T1n;
T16 = FMA(KP559016994, T15, T14);
T1e = FMA(KP951056516, T1d, T16);
T1u = FNMS(KP951056516, T1d, T16);
T1n = FMA(KP559016994, T1m, T1l);
T1r = FNMS(KP951056516, T1q, T1n);
T1x = FMA(KP951056516, T1q, T1n);
{
E T1f, T1s, T13, T1g;
T13 = W[0];
T1f = T13 * T1e;
T1s = T13 * T1r;
T1g = W[1];
rio[WS(vs, 1) + WS(rs, 1)] = FMA(T1g, T1r, T1f);
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T1g, T1e, T1s);
}
{
E T1v, T1y, T1t, T1w;
T1t = W[6];
T1v = T1t * T1u;
T1y = T1t * T1x;
T1w = W[7];
rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1w, T1x, T1v);
iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1w, T1u, T1y);
}
}
}
}
}
/*
 * Twiddle-instruction table for the q1_5 codelet; it is referenced by the
 * descriptor `desc` that follows.  The tw_instr type and the TW_FULL /
 * TW_NEXT opcodes are declared outside this chunk.
 * NOTE(review): the kernel consumes 8 twiddle values (4 pairs) per iteration,
 * which is consistent with TW_FULL of size 5 meaning "all 5 - 1 twiddle
 * factors" and TW_NEXT ending the list -- confirm against the tw_instr
 * definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 5 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA build of q1_5: size 5, the name string "q1_5", the
 * twiddle-instruction table above and a pointer to GENUS (defined outside
 * this chunk).  The inner { 70, 40, 130, 0 } group repeats the operation
 * counts quoted in the generator comment for this variant (70 additions,
 * 40 multiplications, 130 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros is not visible here -- confirm against the ct_desc definition.
 */
static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, { 70, 40, 130, 0 }, 0, 0, 0 };
/*
 * Registration hook for the q1_5 codelet (FMA build): passes the kernel
 * function q1_5 and its descriptor to the planner p through
 * X(kdft_difsq_register).  X(...) and kdft_difsq_register are declared
 * outside this chunk (presumably X() applies the library's symbol prefix --
 * confirm).
 */
void X(codelet_q1_5) (planner *p) {
X(kdft_difsq_register) (p, q1_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twidsq.native -compact -variables 4 -pipeline-latency 4 -reload-twiddle -dif -n 5 -name q1_5 -include dft/scalar/q.h */
/*
* This function contains 200 FP additions, 140 FP multiplications,
* (or, 130 additions, 70 multiplications, 70 fused multiply/add),
* 75 stack variables, 4 constants, and 100 memory accesses
*/
#include "dft/scalar/q.h"
/*
 * q1_5 -- size-5 "twiddle square" kernel (generator flags: -dif -n 5), variant
 * compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * Parameters:
 *   rio, iio  real and imaginary data arrays, updated in place
 *   W         twiddle table; 8 values (4 pairs W[2k-2], W[2k-1], k = 1..4)
 *             are consumed per loop iteration
 *   rs, vs    the two strides used to address the 5 x 5 tile through
 *             WS(vs, j) + WS(rs, k)
 *   mb, me    half-open iteration range [mb, me)
 *   ms        element step applied to rio and iio after each iteration
 *
 * Per iteration the body
 *   1. loads the 25 complex values at [WS(vs, j) + WS(rs, k)], j, k = 0..4;
 *   2. for each j combines the five k-entries with a 5-point butterfly;
 *   3. stores the plain sum of row j at [WS(rs, j)] and the remaining
 *      outputs k = 1..4 of row j, multiplied by twiddle pair k, at
 *      [WS(vs, k) + WS(rs, j)] -- i.e. the tile is written back transposed.
 * The twiddle product is re' = W[2k-2]*re + W[2k-1]*im and
 * im' = W[2k-2]*im - W[2k-1]*re, reading FMA(a, b, c) as a*b + c and
 * FNMS(a, b, c) as c - a*b (both macros are defined outside this chunk --
 * confirm).
 *
 * R, E, stride, INT, DK, WS and MAKE_VOLATILE_STRIDE come from the included
 * headers and are not visible here.
 */
static void q1_5(R *rio, R *iio, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
/* Numerically: 1/4, sin(pi/5), sin(2*pi/5) and sqrt(5)/4. */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
/* W is first advanced past the 8 twiddle values of each skipped iteration. */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs), MAKE_VOLATILE_STRIDE(0, vs)) {
E T1, Ta, TG, Tv, T8, Tb, Tp, Tj, TD, To, Tq, Tr, TN, TW, T1s;
E T1h, TU, TX, T1b, T15, T1p, T1a, T1c, T1d, T1z, T1I, T2e, T23, T1G, T1J;
E T1X, T1R, T2b, T1W, T1Y, T1Z, T3v, T3p, T3J, T3u, T3w, T3x, T37, T3g, T3M;
E T3B, T3e, T3h, T2l, T2u, T30, T2P, T2s, T2v, T2J, T2D, T2X, T2I, T2K, T2L;
/* Row vs = 0, real parts: sums/differences of entries (1,4) and (2,3). */
{
E T7, Tu, T4, Tt;
T1 = rio[0];
{
E T5, T6, T2, T3;
T5 = rio[WS(rs, 2)];
T6 = rio[WS(rs, 3)];
T7 = T5 + T6;
Tu = T5 - T6;
T2 = rio[WS(rs, 1)];
T3 = rio[WS(rs, 4)];
T4 = T2 + T3;
Tt = T2 - T3;
}
Ta = KP559016994 * (T4 - T7);
TG = FNMS(KP587785252, Tt, KP951056516 * Tu);
Tv = FMA(KP951056516, Tt, KP587785252 * Tu);
T8 = T4 + T7;
Tb = FNMS(KP250000000, T8, T1);
}
/* Row vs = 0, imaginary parts. */
{
E Ti, Tn, Tf, Tm;
Tp = iio[0];
{
E Tg, Th, Td, Te;
Tg = iio[WS(rs, 2)];
Th = iio[WS(rs, 3)];
Ti = Tg - Th;
Tn = Tg + Th;
Td = iio[WS(rs, 1)];
Te = iio[WS(rs, 4)];
Tf = Td - Te;
Tm = Td + Te;
}
Tj = FMA(KP951056516, Tf, KP587785252 * Ti);
TD = FNMS(KP587785252, Tf, KP951056516 * Ti);
To = KP559016994 * (Tm - Tn);
Tq = Tm + Tn;
Tr = FNMS(KP250000000, Tq, Tp);
}
/* Row vs = 1, real parts. */
{
E TT, T1g, TQ, T1f;
TN = rio[WS(vs, 1)];
{
E TR, TS, TO, TP;
TR = rio[WS(vs, 1) + WS(rs, 2)];
TS = rio[WS(vs, 1) + WS(rs, 3)];
TT = TR + TS;
T1g = TR - TS;
TO = rio[WS(vs, 1) + WS(rs, 1)];
TP = rio[WS(vs, 1) + WS(rs, 4)];
TQ = TO + TP;
T1f = TO - TP;
}
TW = KP559016994 * (TQ - TT);
T1s = FNMS(KP587785252, T1f, KP951056516 * T1g);
T1h = FMA(KP951056516, T1f, KP587785252 * T1g);
TU = TQ + TT;
TX = FNMS(KP250000000, TU, TN);
}
/* Row vs = 1, imaginary parts. */
{
E T14, T19, T11, T18;
T1b = iio[WS(vs, 1)];
{
E T12, T13, TZ, T10;
T12 = iio[WS(vs, 1) + WS(rs, 2)];
T13 = iio[WS(vs, 1) + WS(rs, 3)];
T14 = T12 - T13;
T19 = T12 + T13;
TZ = iio[WS(vs, 1) + WS(rs, 1)];
T10 = iio[WS(vs, 1) + WS(rs, 4)];
T11 = TZ - T10;
T18 = TZ + T10;
}
T15 = FMA(KP951056516, T11, KP587785252 * T14);
T1p = FNMS(KP587785252, T11, KP951056516 * T14);
T1a = KP559016994 * (T18 - T19);
T1c = T18 + T19;
T1d = FNMS(KP250000000, T1c, T1b);
}
/* Row vs = 2, real parts. */
{
E T1F, T22, T1C, T21;
T1z = rio[WS(vs, 2)];
{
E T1D, T1E, T1A, T1B;
T1D = rio[WS(vs, 2) + WS(rs, 2)];
T1E = rio[WS(vs, 2) + WS(rs, 3)];
T1F = T1D + T1E;
T22 = T1D - T1E;
T1A = rio[WS(vs, 2) + WS(rs, 1)];
T1B = rio[WS(vs, 2) + WS(rs, 4)];
T1C = T1A + T1B;
T21 = T1A - T1B;
}
T1I = KP559016994 * (T1C - T1F);
T2e = FNMS(KP587785252, T21, KP951056516 * T22);
T23 = FMA(KP951056516, T21, KP587785252 * T22);
T1G = T1C + T1F;
T1J = FNMS(KP250000000, T1G, T1z);
}
/* Row vs = 2, imaginary parts. */
{
E T1Q, T1V, T1N, T1U;
T1X = iio[WS(vs, 2)];
{
E T1O, T1P, T1L, T1M;
T1O = iio[WS(vs, 2) + WS(rs, 2)];
T1P = iio[WS(vs, 2) + WS(rs, 3)];
T1Q = T1O - T1P;
T1V = T1O + T1P;
T1L = iio[WS(vs, 2) + WS(rs, 1)];
T1M = iio[WS(vs, 2) + WS(rs, 4)];
T1N = T1L - T1M;
T1U = T1L + T1M;
}
T1R = FMA(KP951056516, T1N, KP587785252 * T1Q);
T2b = FNMS(KP587785252, T1N, KP951056516 * T1Q);
T1W = KP559016994 * (T1U - T1V);
T1Y = T1U + T1V;
T1Z = FNMS(KP250000000, T1Y, T1X);
}
/* Row vs = 4, imaginary parts. */
{
E T3o, T3t, T3l, T3s;
T3v = iio[WS(vs, 4)];
{
E T3m, T3n, T3j, T3k;
T3m = iio[WS(vs, 4) + WS(rs, 2)];
T3n = iio[WS(vs, 4) + WS(rs, 3)];
T3o = T3m - T3n;
T3t = T3m + T3n;
T3j = iio[WS(vs, 4) + WS(rs, 1)];
T3k = iio[WS(vs, 4) + WS(rs, 4)];
T3l = T3j - T3k;
T3s = T3j + T3k;
}
T3p = FMA(KP951056516, T3l, KP587785252 * T3o);
T3J = FNMS(KP587785252, T3l, KP951056516 * T3o);
T3u = KP559016994 * (T3s - T3t);
T3w = T3s + T3t;
T3x = FNMS(KP250000000, T3w, T3v);
}
/* Row vs = 4, real parts. */
{
E T3d, T3A, T3a, T3z;
T37 = rio[WS(vs, 4)];
{
E T3b, T3c, T38, T39;
T3b = rio[WS(vs, 4) + WS(rs, 2)];
T3c = rio[WS(vs, 4) + WS(rs, 3)];
T3d = T3b + T3c;
T3A = T3b - T3c;
T38 = rio[WS(vs, 4) + WS(rs, 1)];
T39 = rio[WS(vs, 4) + WS(rs, 4)];
T3a = T38 + T39;
T3z = T38 - T39;
}
T3g = KP559016994 * (T3a - T3d);
T3M = FNMS(KP587785252, T3z, KP951056516 * T3A);
T3B = FMA(KP951056516, T3z, KP587785252 * T3A);
T3e = T3a + T3d;
T3h = FNMS(KP250000000, T3e, T37);
}
/* Row vs = 3, real parts. */
{
E T2r, T2O, T2o, T2N;
T2l = rio[WS(vs, 3)];
{
E T2p, T2q, T2m, T2n;
T2p = rio[WS(vs, 3) + WS(rs, 2)];
T2q = rio[WS(vs, 3) + WS(rs, 3)];
T2r = T2p + T2q;
T2O = T2p - T2q;
T2m = rio[WS(vs, 3) + WS(rs, 1)];
T2n = rio[WS(vs, 3) + WS(rs, 4)];
T2o = T2m + T2n;
T2N = T2m - T2n;
}
T2u = KP559016994 * (T2o - T2r);
T30 = FNMS(KP587785252, T2N, KP951056516 * T2O);
T2P = FMA(KP951056516, T2N, KP587785252 * T2O);
T2s = T2o + T2r;
T2v = FNMS(KP250000000, T2s, T2l);
}
/* Row vs = 3, imaginary parts. */
{
E T2C, T2H, T2z, T2G;
T2J = iio[WS(vs, 3)];
{
E T2A, T2B, T2x, T2y;
T2A = iio[WS(vs, 3) + WS(rs, 2)];
T2B = iio[WS(vs, 3) + WS(rs, 3)];
T2C = T2A - T2B;
T2H = T2A + T2B;
T2x = iio[WS(vs, 3) + WS(rs, 1)];
T2y = iio[WS(vs, 3) + WS(rs, 4)];
T2z = T2x - T2y;
T2G = T2x + T2y;
}
T2D = FMA(KP951056516, T2z, KP587785252 * T2C);
T2X = FNMS(KP587785252, T2z, KP951056516 * T2C);
T2I = KP559016994 * (T2G - T2H);
T2K = T2G + T2H;
T2L = FNMS(KP250000000, T2K, T2J);
}
/* All 25 inputs are now held in locals, so the in-place stores below are safe. */
/* Untwiddled sums: the sum of row vs = j is stored at rs-position j. */
rio[0] = T1 + T8;
iio[0] = Tp + Tq;
rio[WS(rs, 1)] = TN + TU;
iio[WS(rs, 1)] = T1b + T1c;
rio[WS(rs, 2)] = T1z + T1G;
iio[WS(rs, 2)] = T1X + T1Y;
iio[WS(rs, 4)] = T3v + T3w;
rio[WS(rs, 4)] = T37 + T3e;
rio[WS(rs, 3)] = T2l + T2s;
iio[WS(rs, 3)] = T2J + T2K;
/* Row 0: outputs stored at vs = 1 (W[0], W[1]) and vs = 4 (W[6], W[7]), rs-position 0. */
{
E Tk, Ty, Tw, TA, Tc, Ts;
Tc = Ta + Tb;
Tk = Tc + Tj;
Ty = Tc - Tj;
Ts = To + Tr;
Tw = Ts - Tv;
TA = Tv + Ts;
{
E T9, Tl, Tx, Tz;
T9 = W[0];
Tl = W[1];
rio[WS(vs, 1)] = FMA(T9, Tk, Tl * Tw);
iio[WS(vs, 1)] = FNMS(Tl, Tk, T9 * Tw);
Tx = W[6];
Tz = W[7];
rio[WS(vs, 4)] = FMA(Tx, Ty, Tz * TA);
iio[WS(vs, 4)] = FNMS(Tz, Ty, Tx * TA);
}
}
/* Row 0: outputs stored at vs = 2 (W[2], W[3]) and vs = 3 (W[4], W[5]), rs-position 0. */
{
E TE, TK, TI, TM, TC, TH;
TC = Tb - Ta;
TE = TC - TD;
TK = TC + TD;
TH = Tr - To;
TI = TG + TH;
TM = TH - TG;
{
E TB, TF, TJ, TL;
TB = W[2];
TF = W[3];
rio[WS(vs, 2)] = FMA(TB, TE, TF * TI);
iio[WS(vs, 2)] = FNMS(TF, TE, TB * TI);
TJ = W[4];
TL = W[5];
rio[WS(vs, 3)] = FMA(TJ, TK, TL * TM);
iio[WS(vs, 3)] = FNMS(TL, TK, TJ * TM);
}
}
/* Row 2: outputs stored at vs = 2 and vs = 3, rs-position 2. */
{
E T2c, T2i, T2g, T2k, T2a, T2f;
T2a = T1J - T1I;
T2c = T2a - T2b;
T2i = T2a + T2b;
T2f = T1Z - T1W;
T2g = T2e + T2f;
T2k = T2f - T2e;
{
E T29, T2d, T2h, T2j;
T29 = W[2];
T2d = W[3];
rio[WS(vs, 2) + WS(rs, 2)] = FMA(T29, T2c, T2d * T2g);
iio[WS(vs, 2) + WS(rs, 2)] = FNMS(T2d, T2c, T29 * T2g);
T2h = W[4];
T2j = W[5];
rio[WS(vs, 3) + WS(rs, 2)] = FMA(T2h, T2i, T2j * T2k);
iio[WS(vs, 3) + WS(rs, 2)] = FNMS(T2j, T2i, T2h * T2k);
}
}
/* Row 4: outputs stored at vs = 2 and vs = 3, rs-position 4. */
{
E T3K, T3Q, T3O, T3S, T3I, T3N;
T3I = T3h - T3g;
T3K = T3I - T3J;
T3Q = T3I + T3J;
T3N = T3x - T3u;
T3O = T3M + T3N;
T3S = T3N - T3M;
{
E T3H, T3L, T3P, T3R;
T3H = W[2];
T3L = W[3];
rio[WS(vs, 2) + WS(rs, 4)] = FMA(T3H, T3K, T3L * T3O);
iio[WS(vs, 2) + WS(rs, 4)] = FNMS(T3L, T3K, T3H * T3O);
T3P = W[4];
T3R = W[5];
rio[WS(vs, 3) + WS(rs, 4)] = FMA(T3P, T3Q, T3R * T3S);
iio[WS(vs, 3) + WS(rs, 4)] = FNMS(T3R, T3Q, T3P * T3S);
}
}
/* Row 2: outputs stored at vs = 1 and vs = 4, rs-position 2. */
{
E T1S, T26, T24, T28, T1K, T20;
T1K = T1I + T1J;
T1S = T1K + T1R;
T26 = T1K - T1R;
T20 = T1W + T1Z;
T24 = T20 - T23;
T28 = T23 + T20;
{
E T1H, T1T, T25, T27;
T1H = W[0];
T1T = W[1];
rio[WS(vs, 1) + WS(rs, 2)] = FMA(T1H, T1S, T1T * T24);
iio[WS(vs, 1) + WS(rs, 2)] = FNMS(T1T, T1S, T1H * T24);
T25 = W[6];
T27 = W[7];
rio[WS(vs, 4) + WS(rs, 2)] = FMA(T25, T26, T27 * T28);
iio[WS(vs, 4) + WS(rs, 2)] = FNMS(T27, T26, T25 * T28);
}
}
/* Row 3: outputs stored at vs = 1 and vs = 4, rs-position 3. */
{
E T2E, T2S, T2Q, T2U, T2w, T2M;
T2w = T2u + T2v;
T2E = T2w + T2D;
T2S = T2w - T2D;
T2M = T2I + T2L;
T2Q = T2M - T2P;
T2U = T2P + T2M;
{
E T2t, T2F, T2R, T2T;
T2t = W[0];
T2F = W[1];
rio[WS(vs, 1) + WS(rs, 3)] = FMA(T2t, T2E, T2F * T2Q);
iio[WS(vs, 1) + WS(rs, 3)] = FNMS(T2F, T2E, T2t * T2Q);
T2R = W[6];
T2T = W[7];
rio[WS(vs, 4) + WS(rs, 3)] = FMA(T2R, T2S, T2T * T2U);
iio[WS(vs, 4) + WS(rs, 3)] = FNMS(T2T, T2S, T2R * T2U);
}
}
/* Row 3: outputs stored at vs = 2 and vs = 3, rs-position 3. */
{
E T2Y, T34, T32, T36, T2W, T31;
T2W = T2v - T2u;
T2Y = T2W - T2X;
T34 = T2W + T2X;
T31 = T2L - T2I;
T32 = T30 + T31;
T36 = T31 - T30;
{
E T2V, T2Z, T33, T35;
T2V = W[2];
T2Z = W[3];
rio[WS(vs, 2) + WS(rs, 3)] = FMA(T2V, T2Y, T2Z * T32);
iio[WS(vs, 2) + WS(rs, 3)] = FNMS(T2Z, T2Y, T2V * T32);
T33 = W[4];
T35 = W[5];
rio[WS(vs, 3) + WS(rs, 3)] = FMA(T33, T34, T35 * T36);
iio[WS(vs, 3) + WS(rs, 3)] = FNMS(T35, T34, T33 * T36);
}
}
/* Row 4: outputs stored at vs = 1 and vs = 4, rs-position 4. */
{
E T3q, T3E, T3C, T3G, T3i, T3y;
T3i = T3g + T3h;
T3q = T3i + T3p;
T3E = T3i - T3p;
T3y = T3u + T3x;
T3C = T3y - T3B;
T3G = T3B + T3y;
{
E T3f, T3r, T3D, T3F;
T3f = W[0];
T3r = W[1];
rio[WS(vs, 1) + WS(rs, 4)] = FMA(T3f, T3q, T3r * T3C);
iio[WS(vs, 1) + WS(rs, 4)] = FNMS(T3r, T3q, T3f * T3C);
T3D = W[6];
T3F = W[7];
rio[WS(vs, 4) + WS(rs, 4)] = FMA(T3D, T3E, T3F * T3G);
iio[WS(vs, 4) + WS(rs, 4)] = FNMS(T3F, T3E, T3D * T3G);
}
}
/* Row 1: outputs stored at vs = 2 and vs = 3, rs-position 1. */
{
E T1q, T1w, T1u, T1y, T1o, T1t;
T1o = TX - TW;
T1q = T1o - T1p;
T1w = T1o + T1p;
T1t = T1d - T1a;
T1u = T1s + T1t;
T1y = T1t - T1s;
{
E T1n, T1r, T1v, T1x;
T1n = W[2];
T1r = W[3];
rio[WS(vs, 2) + WS(rs, 1)] = FMA(T1n, T1q, T1r * T1u);
iio[WS(vs, 2) + WS(rs, 1)] = FNMS(T1r, T1q, T1n * T1u);
T1v = W[4];
T1x = W[5];
rio[WS(vs, 3) + WS(rs, 1)] = FMA(T1v, T1w, T1x * T1y);
iio[WS(vs, 3) + WS(rs, 1)] = FNMS(T1x, T1w, T1v * T1y);
}
}
/* Row 1: outputs stored at vs = 1 and vs = 4, rs-position 1. */
{
E T16, T1k, T1i, T1m, TY, T1e;
TY = TW + TX;
T16 = TY + T15;
T1k = TY - T15;
T1e = T1a + T1d;
T1i = T1e - T1h;
T1m = T1h + T1e;
{
E TV, T17, T1j, T1l;
TV = W[0];
T17 = W[1];
rio[WS(vs, 1) + WS(rs, 1)] = FMA(TV, T16, T17 * T1i);
iio[WS(vs, 1) + WS(rs, 1)] = FNMS(T17, T16, TV * T1i);
T1j = W[6];
T1l = W[7];
rio[WS(vs, 4) + WS(rs, 1)] = FMA(T1j, T1k, T1l * T1m);
iio[WS(vs, 4) + WS(rs, 1)] = FNMS(T1l, T1k, T1j * T1m);
}
}
}
}
}
/*
 * Twiddle-instruction table for the q1_5 codelet (non-FMA build); it is
 * referenced by the descriptor `desc` that follows.  The tw_instr type and
 * the TW_FULL / TW_NEXT opcodes are declared outside this chunk.
 * NOTE(review): the kernel consumes 8 twiddle values (4 pairs) per iteration,
 * which is consistent with TW_FULL of size 5 meaning "all 5 - 1 twiddle
 * factors" and TW_NEXT ending the list -- confirm against the tw_instr
 * definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 5 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the non-FMA build of q1_5: size 5, the name string "q1_5",
 * the twiddle-instruction table above and a pointer to GENUS (defined
 * outside this chunk).  The inner { 130, 70, 70, 0 } group repeats the
 * operation counts quoted in the generator comment for this variant
 * (130 additions, 70 multiplications, 70 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros is not visible here -- confirm against the ct_desc definition.
 */
static const ct_desc desc = { 5, "q1_5", twinstr, &GENUS, { 130, 70, 70, 0 }, 0, 0, 0 };
/*
 * Registration hook for the q1_5 codelet (non-FMA build): passes the kernel
 * function q1_5 and its descriptor to the planner p through
 * X(kdft_difsq_register).  X(...) and kdft_difsq_register are declared
 * outside this chunk (presumably X() applies the library's symbol prefix --
 * confirm).
 */
void X(codelet_q1_5) (planner *p) {
X(kdft_difsq_register) (p, q1_5, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,489 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */
/*
* This function contains 102 FP additions, 72 FP multiplications,
* (or, 48 additions, 18 multiplications, 54 fused multiply/add),
* 47 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_10 -- size-10 twiddle kernel (generator flags: -n 10, no -dif), variant
 * compiled when ARCH_PREFERS_FMA or ISA_EXTENSION_PREFERS_FMA is defined.
 *
 * Parameters:
 *   ri, ii   real and imaginary data arrays, updated in place
 *   W        twiddle table; 18 values (9 pairs W[2k-2], W[2k-1], k = 1..9)
 *            are consumed per loop iteration
 *   rs       stride between the 10 elements, applied through WS(rs, k)
 *   mb, me   half-open iteration range [mb, me)
 *   ms       element step applied to ri and ii after each iteration
 *
 * Per iteration, every input k = 1..9 is first multiplied by its twiddle
 * pair: re' = W[2k-2]*re + W[2k-1]*im, im' = W[2k-2]*im - W[2k-1]*re
 * (reading FMA(a, b, c) as a*b + c and FNMS(a, b, c) as c - a*b; both macros
 * are defined outside this chunk -- confirm).  Input 0 is used as is.  The
 * ten products are then combined and written back to positions 0..9 of the
 * same arrays: differences of the pairs (k, k+5) feed the odd positions,
 * sums feed the even positions.
 *
 * R, E, stride, INT, DK, WS and MAKE_VOLATILE_STRIDE come from the included
 * headers and are not visible here.
 */
static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: sin(2*pi/5), sqrt(5)/4, (sqrt(5) - 1)/2 and 1/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
/* W is first advanced past the 18 twiddle values of each skipped iteration. */
for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
E T8, T23, T12, T1U, TM, TZ, T10, T1F, T1G, T1P, T16, T17, T18, T1s, T1x;
E T25, Tl, Ty, Tz, T1I, T1J, T1O, T13, T14, T15, T1h, T1m, T24;
/* Inputs 0 and 5: twiddle input 5 with W[8], W[9]; T8/T23 = difference, T12/T1U = sum. */
{
E T1, T1T, T3, T6, T4, T1R, T2, T7, T1S, T5;
T1 = ri[0];
T1T = ii[0];
T3 = ri[WS(rs, 5)];
T6 = ii[WS(rs, 5)];
T2 = W[8];
T4 = T2 * T3;
T1R = T2 * T6;
T5 = W[9];
T7 = FMA(T5, T6, T4);
T1S = FNMS(T5, T3, T1R);
T8 = T1 - T7;
T23 = T1T - T1S;
T12 = T1 + T7;
T1U = T1S + T1T;
}
/* Inputs 4, 1, 9 and 6: twiddle each, then pair (4, 9) and (6, 1). */
{
E TF, T1p, TY, T1w, TL, T1r, TS, T1u;
{
E TB, TE, TC, T1o, TA, TD;
TB = ri[WS(rs, 4)];
TE = ii[WS(rs, 4)];
TA = W[6];
TC = TA * TB;
T1o = TA * TE;
TD = W[7];
TF = FMA(TD, TE, TC);
T1p = FNMS(TD, TB, T1o);
}
{
E TU, TX, TV, T1v, TT, TW;
TU = ri[WS(rs, 1)];
TX = ii[WS(rs, 1)];
TT = W[0];
TV = TT * TU;
T1v = TT * TX;
TW = W[1];
TY = FMA(TW, TX, TV);
T1w = FNMS(TW, TU, T1v);
}
{
E TH, TK, TI, T1q, TG, TJ;
TH = ri[WS(rs, 9)];
TK = ii[WS(rs, 9)];
TG = W[16];
TI = TG * TH;
T1q = TG * TK;
TJ = W[17];
TL = FMA(TJ, TK, TI);
T1r = FNMS(TJ, TH, T1q);
}
{
E TO, TR, TP, T1t, TN, TQ;
TO = ri[WS(rs, 6)];
TR = ii[WS(rs, 6)];
TN = W[10];
TP = TN * TO;
T1t = TN * TR;
TQ = W[11];
TS = FMA(TQ, TR, TP);
T1u = FNMS(TQ, TO, T1t);
}
TM = TF - TL;
TZ = TS - TY;
T10 = TM + TZ;
T1F = T1p + T1r;
T1G = T1u + T1w;
T1P = T1F + T1G;
T16 = TF + TL;
T17 = TS + TY;
T18 = T16 + T17;
T1s = T1p - T1r;
T1x = T1u - T1w;
T25 = T1s + T1x;
}
/* Inputs 2, 3, 7 and 8: twiddle each, then pair (2, 7) and (8, 3). */
{
E Te, T1e, Tx, T1l, Tk, T1g, Tr, T1j;
{
E Ta, Td, Tb, T1d, T9, Tc;
Ta = ri[WS(rs, 2)];
Td = ii[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
T1d = T9 * Td;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
T1e = FNMS(Tc, Ta, T1d);
}
{
E Tt, Tw, Tu, T1k, Ts, Tv;
Tt = ri[WS(rs, 3)];
Tw = ii[WS(rs, 3)];
Ts = W[4];
Tu = Ts * Tt;
T1k = Ts * Tw;
Tv = W[5];
Tx = FMA(Tv, Tw, Tu);
T1l = FNMS(Tv, Tt, T1k);
}
{
E Tg, Tj, Th, T1f, Tf, Ti;
Tg = ri[WS(rs, 7)];
Tj = ii[WS(rs, 7)];
Tf = W[12];
Th = Tf * Tg;
T1f = Tf * Tj;
Ti = W[13];
Tk = FMA(Ti, Tj, Th);
T1g = FNMS(Ti, Tg, T1f);
}
{
E Tn, Tq, To, T1i, Tm, Tp;
Tn = ri[WS(rs, 8)];
Tq = ii[WS(rs, 8)];
Tm = W[14];
To = Tm * Tn;
T1i = Tm * Tq;
Tp = W[15];
Tr = FMA(Tp, Tq, To);
T1j = FNMS(Tp, Tn, T1i);
}
Tl = Te - Tk;
Ty = Tr - Tx;
Tz = Tl + Ty;
T1I = T1e + T1g;
T1J = T1j + T1l;
T1O = T1I + T1J;
T13 = Te + Tk;
T14 = Tr + Tx;
T15 = T13 + T14;
T1h = T1e - T1g;
T1m = T1j - T1l;
T24 = T1h + T1m;
}
/* Real parts of positions 5, 7, 3, 9 and 1 (built from the pair differences). */
{
E T1b, T11, T1a, T1z, T1B, T1n, T1y, T1A, T1c;
T1b = Tz - T10;
T11 = Tz + T10;
T1a = FNMS(KP250000000, T11, T8);
T1n = T1h - T1m;
T1y = T1s - T1x;
T1z = FMA(KP618033988, T1y, T1n);
T1B = FNMS(KP618033988, T1n, T1y);
ri[WS(rs, 5)] = T8 + T11;
T1A = FNMS(KP559016994, T1b, T1a);
ri[WS(rs, 7)] = FNMS(KP951056516, T1B, T1A);
ri[WS(rs, 3)] = FMA(KP951056516, T1B, T1A);
T1c = FMA(KP559016994, T1b, T1a);
ri[WS(rs, 9)] = FNMS(KP951056516, T1z, T1c);
ri[WS(rs, 1)] = FMA(KP951056516, T1z, T1c);
}
/* Imaginary parts of positions 5, 3, 7, 1 and 9. */
{
E T28, T26, T27, T2c, T2e, T2a, T2b, T2d, T29;
T28 = T24 - T25;
T26 = T24 + T25;
T27 = FNMS(KP250000000, T26, T23);
T2a = Tl - Ty;
T2b = TM - TZ;
T2c = FMA(KP618033988, T2b, T2a);
T2e = FNMS(KP618033988, T2a, T2b);
ii[WS(rs, 5)] = T26 + T23;
T2d = FNMS(KP559016994, T28, T27);
ii[WS(rs, 3)] = FNMS(KP951056516, T2e, T2d);
ii[WS(rs, 7)] = FMA(KP951056516, T2e, T2d);
T29 = FMA(KP559016994, T28, T27);
ii[WS(rs, 1)] = FNMS(KP951056516, T2c, T29);
ii[WS(rs, 9)] = FMA(KP951056516, T2c, T29);
}
/* Real parts of positions 0, 4, 6, 2 and 8 (built from the pair sums). */
{
E T1D, T19, T1C, T1L, T1N, T1H, T1K, T1M, T1E;
T1D = T15 - T18;
T19 = T15 + T18;
T1C = FNMS(KP250000000, T19, T12);
T1H = T1F - T1G;
T1K = T1I - T1J;
T1L = FNMS(KP618033988, T1K, T1H);
T1N = FMA(KP618033988, T1H, T1K);
ri[0] = T12 + T19;
T1M = FMA(KP559016994, T1D, T1C);
ri[WS(rs, 4)] = FNMS(KP951056516, T1N, T1M);
ri[WS(rs, 6)] = FMA(KP951056516, T1N, T1M);
T1E = FNMS(KP559016994, T1D, T1C);
ri[WS(rs, 2)] = FNMS(KP951056516, T1L, T1E);
ri[WS(rs, 8)] = FMA(KP951056516, T1L, T1E);
}
/* Imaginary parts of positions 0, 4, 6, 2 and 8. */
{
E T1W, T1Q, T1V, T20, T22, T1Y, T1Z, T21, T1X;
T1W = T1O - T1P;
T1Q = T1O + T1P;
T1V = FNMS(KP250000000, T1Q, T1U);
T1Y = T16 - T17;
T1Z = T13 - T14;
T20 = FNMS(KP618033988, T1Z, T1Y);
T22 = FMA(KP618033988, T1Y, T1Z);
ii[0] = T1Q + T1U;
T21 = FMA(KP559016994, T1W, T1V);
ii[WS(rs, 4)] = FMA(KP951056516, T22, T21);
ii[WS(rs, 6)] = FNMS(KP951056516, T22, T21);
T1X = FNMS(KP559016994, T1W, T1V);
ii[WS(rs, 2)] = FMA(KP951056516, T20, T1X);
ii[WS(rs, 8)] = FNMS(KP951056516, T20, T1X);
}
}
}
}
/*
 * Twiddle-instruction table for the t1_10 codelet; it is referenced by the
 * descriptor `desc` that follows.  The tw_instr type and the TW_FULL /
 * TW_NEXT opcodes are declared outside this chunk.
 * NOTE(review): the kernel consumes 18 twiddle values (9 pairs) per
 * iteration, which is consistent with TW_FULL of size 10 meaning "all 10 - 1
 * twiddle factors" and TW_NEXT ending the list -- confirm against the
 * tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 10 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor for the FMA build of t1_10: size 10, the name string "t1_10",
 * the twiddle-instruction table above and a pointer to GENUS (defined
 * outside this chunk).  The inner { 48, 18, 54, 0 } group repeats the
 * operation counts quoted in the generator comment for this variant
 * (48 additions, 18 multiplications, 54 fused multiply/adds).
 * NOTE(review): the meaning of the fourth count and of the three trailing
 * zeros is not visible here -- confirm against the ct_desc definition.
 */
static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 48, 18, 54, 0 }, 0, 0, 0 };
/*
 * Registration hook for the t1_10 codelet (FMA build): passes the kernel
 * function t1_10 and its descriptor to the planner p through
 * X(kdft_dit_register).  X(...) and kdft_dit_register are declared outside
 * this chunk (presumably X() applies the library's symbol prefix -- confirm).
 */
void X(codelet_t1_10) (planner *p) {
X(kdft_dit_register) (p, t1_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 10 -name t1_10 -include dft/scalar/t.h */
/*
* This function contains 102 FP additions, 60 FP multiplications,
* (or, 72 additions, 30 multiplications, 30 fused multiply/add),
* 45 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_10 -- size-10 twiddle kernel (generator flags: -n 10, no -dif), variant
 * compiled when neither ARCH_PREFERS_FMA nor ISA_EXTENSION_PREFERS_FMA is
 * defined.
 *
 * Parameters:
 *   ri, ii   real and imaginary data arrays, updated in place
 *   W        twiddle table; 18 values (9 pairs W[2k-2], W[2k-1], k = 1..9)
 *            are consumed per loop iteration
 *   rs       stride between the 10 elements, applied through WS(rs, k)
 *   mb, me   half-open iteration range [mb, me)
 *   ms       element step applied to ri and ii after each iteration
 *
 * Per iteration, every input k = 1..9 is first multiplied by its twiddle
 * pair: re' = W[2k-2]*re + W[2k-1]*im, im' = W[2k-2]*im - W[2k-1]*re
 * (reading FMA(a, b, c) as a*b + c and FNMS(a, b, c) as c - a*b; both macros
 * are defined outside this chunk -- confirm).  Input 0 is used as is.  The
 * ten products are then combined and written back to positions 0..9 of the
 * same arrays: differences of the pairs (k, k+5) feed the odd positions,
 * sums feed the even positions.
 *
 * R, E, stride, INT, DK, WS and MAKE_VOLATILE_STRIDE come from the included
 * headers and are not visible here.
 */
static void t1_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: sin(pi/5), sin(2*pi/5), 1/4 and sqrt(5)/4. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
/* W is first advanced past the 18 twiddle values of each skipped iteration. */
for (m = mb, W = W + (mb * 18); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 18, MAKE_VOLATILE_STRIDE(20, rs)) {
E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g;
E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L;
/* Inputs 0 and 5: twiddle input 5 with W[8], W[9]; T7/T1O = difference, TT/T1C = sum. */
{
E T1, T1B, T6, T1A;
T1 = ri[0];
T1B = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 5)];
T5 = ii[WS(rs, 5)];
T2 = W[8];
T4 = W[9];
T6 = FMA(T2, T3, T4 * T5);
T1A = FNMS(T4, T3, T2 * T5);
}
T7 = T1 - T6;
T1O = T1B - T1A;
TT = T1 + T6;
T1C = T1A + T1B;
}
/* Inputs 4, 1, 9 and 6: twiddle each, then pair (4, 9) and (6, 1). */
{
E Tz, T1b, TP, T1f, TE, T1c, TK, T1e;
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 4)];
Ty = ii[WS(rs, 4)];
Tv = W[6];
Tx = W[7];
Tz = FMA(Tv, Tw, Tx * Ty);
T1b = FNMS(Tx, Tw, Tv * Ty);
}
{
E TM, TO, TL, TN;
TM = ri[WS(rs, 1)];
TO = ii[WS(rs, 1)];
TL = W[0];
TN = W[1];
TP = FMA(TL, TM, TN * TO);
T1f = FNMS(TN, TM, TL * TO);
}
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 9)];
TD = ii[WS(rs, 9)];
TA = W[16];
TC = W[17];
TE = FMA(TA, TB, TC * TD);
T1c = FNMS(TC, TB, TA * TD);
}
{
E TH, TJ, TG, TI;
TH = ri[WS(rs, 6)];
TJ = ii[WS(rs, 6)];
TG = W[10];
TI = W[11];
TK = FMA(TG, TH, TI * TJ);
T1e = FNMS(TI, TH, TG * TJ);
}
TF = Tz - TE;
TQ = TK - TP;
TR = TF + TQ;
T1o = T1b + T1c;
T1p = T1e + T1f;
T1y = T1o + T1p;
TX = Tz + TE;
TY = TK + TP;
TZ = TX + TY;
T1d = T1b - T1c;
T1g = T1e - T1f;
T1M = T1d + T1g;
}
/* Inputs 2, 3, 7 and 8: twiddle each, then pair (2, 7) and (8, 3). */
{
E Tc, T14, Ts, T18, Th, T15, Tn, T17;
{
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 2)];
Tb = ii[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
T14 = FNMS(Ta, T9, T8 * Tb);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 3)];
Tr = ii[WS(rs, 3)];
To = W[4];
Tq = W[5];
Ts = FMA(To, Tp, Tq * Tr);
T18 = FNMS(Tq, Tp, To * Tr);
}
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 7)];
Tg = ii[WS(rs, 7)];
Td = W[12];
Tf = W[13];
Th = FMA(Td, Te, Tf * Tg);
T15 = FNMS(Tf, Te, Td * Tg);
}
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 8)];
Tm = ii[WS(rs, 8)];
Tj = W[14];
Tl = W[15];
Tn = FMA(Tj, Tk, Tl * Tm);
T17 = FNMS(Tl, Tk, Tj * Tm);
}
Ti = Tc - Th;
Tt = Tn - Ts;
Tu = Ti + Tt;
T1r = T14 + T15;
T1s = T17 + T18;
T1x = T1r + T1s;
TU = Tc + Th;
TV = Tn + Ts;
TW = TU + TV;
T16 = T14 - T15;
T19 = T17 - T18;
T1L = T16 + T19;
}
/* Real parts of positions 5, 7, 3, 9 and 1 (built from the pair differences). */
{
E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13;
T11 = KP559016994 * (Tu - TR);
TS = Tu + TR;
T12 = FNMS(KP250000000, TS, T7);
T1a = T16 - T19;
T1h = T1d - T1g;
T1i = FMA(KP951056516, T1a, KP587785252 * T1h);
T1k = FNMS(KP587785252, T1a, KP951056516 * T1h);
ri[WS(rs, 5)] = T7 + TS;
T1j = T12 - T11;
ri[WS(rs, 7)] = T1j - T1k;
ri[WS(rs, 3)] = T1j + T1k;
T13 = T11 + T12;
ri[WS(rs, 9)] = T13 - T1i;
ri[WS(rs, 1)] = T13 + T1i;
}
/* Imaginary parts of positions 5, 3, 7, 1 and 9. */
{
E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R;
T1N = KP559016994 * (T1L - T1M);
T1P = T1L + T1M;
T1Q = FNMS(KP250000000, T1P, T1O);
T1S = Ti - Tt;
T1T = TF - TQ;
T1U = FMA(KP951056516, T1S, KP587785252 * T1T);
T1W = FNMS(KP587785252, T1S, KP951056516 * T1T);
ii[WS(rs, 5)] = T1P + T1O;
T1V = T1Q - T1N;
ii[WS(rs, 3)] = T1V - T1W;
ii[WS(rs, 7)] = T1W + T1V;
T1R = T1N + T1Q;
ii[WS(rs, 1)] = T1R - T1U;
ii[WS(rs, 9)] = T1U + T1R;
}
/* Real parts of positions 0, 4, 6, 2 and 8 (built from the pair sums). */
{
E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n;
T1m = KP559016994 * (TW - TZ);
T10 = TW + TZ;
T1l = FNMS(KP250000000, T10, TT);
T1q = T1o - T1p;
T1t = T1r - T1s;
T1u = FNMS(KP587785252, T1t, KP951056516 * T1q);
T1w = FMA(KP951056516, T1t, KP587785252 * T1q);
ri[0] = TT + T10;
T1v = T1m + T1l;
ri[WS(rs, 4)] = T1v - T1w;
ri[WS(rs, 6)] = T1v + T1w;
T1n = T1l - T1m;
ri[WS(rs, 2)] = T1n - T1u;
ri[WS(rs, 8)] = T1n + T1u;
}
/* Imaginary parts of positions 0, 4, 6, 2 and 8. */
{
E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I;
T1H = KP559016994 * (T1x - T1y);
T1z = T1x + T1y;
T1G = FNMS(KP250000000, T1z, T1C);
T1D = TX - TY;
T1E = TU - TV;
T1F = FNMS(KP587785252, T1E, KP951056516 * T1D);
T1J = FMA(KP951056516, T1E, KP587785252 * T1D);
ii[0] = T1z + T1C;
T1K = T1H + T1G;
ii[WS(rs, 4)] = T1J + T1K;
ii[WS(rs, 6)] = T1K - T1J;
T1I = T1G - T1H;
ii[WS(rs, 2)] = T1F + T1I;
ii[WS(rs, 8)] = T1I - T1F;
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.  The meaning of
 * TW_FULL / TW_NEXT and of their operands is defined outside this file.
 * NOTE(review): presumably "a full twiddle set for size 10, then step to the
 * next element" -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 10 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor passed to the planner together with t1_10: it carries the
 * number 10, the codelet name, the twiddle table above, the genus and a
 * four-entry count tuple.  NOTE(review): 10 is presumably the radix and
 * { 72, 30, 30, 0 } presumably the add / mul / fma / other operation counts;
 * the ct_desc layout is not visible here -- confirm. */
static const ct_desc desc = { 10, "t1_10", twinstr, &GENUS, { 72, 30, 30, 0 }, 0, 0, 0 };
/* Registration entry point: hands the t1_10 kernel and its descriptor to the
 * planner `p` through kdft_dit_register. */
void X(codelet_t1_10) (planner *p) {
X(kdft_dit_register) (p, t1_10, &desc);
}
#endif

View File

@@ -0,0 +1,581 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */
/*
* This function contains 118 FP additions, 68 FP multiplications,
* (or, 72 additions, 22 multiplications, 46 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_12 (variant compiled when FMA is preferred): size-12 twiddle codelet,
 * registered further down through kdft_dit_register.
 *
 * ri, ii  - real / imaginary data; the 12 points at WS(rs, 0) .. WS(rs, 11)
 *           are read and then overwritten in place
 * W       - twiddle table; 22 values (11 pairs) are consumed per iteration
 * rs      - stride argument handed to WS() to address the 12 points
 * mb, me  - iterations m = mb .. me-1 are executed
 * ms      - amount added to ri and ii between iterations
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* KP866025403 ~ sqrt(3)/2 and KP500000000 = 1/2, used in the three-point
 * combinations near the end of the loop body. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* Skip to the twiddles of iteration mb, then handle one group of 12 points
 * per pass, advancing the data pointers by ms and W by 22. */
for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
E T1, T2i, Tl, T2e, T10, T1Y, TG, T1S, Ty, T2r, T1s, T2f, T1d, T21, T1H;
E T1Z, Te, T2o, T1l, T2h, TT, T1V, T1A, T1T;
/* Point 0 is taken as-is; no twiddle is applied to it. */
T1 = ri[0];
T2i = ii[0];
{
/* Point 6 combined with the twiddle pair W[10], W[11]. */
E Th, Tk, Ti, T2d, Tg, Tj;
Th = ri[WS(rs, 6)];
Tk = ii[WS(rs, 6)];
Tg = W[10];
Ti = Tg * Th;
T2d = Tg * Tk;
Tj = W[11];
Tl = FMA(Tj, Tk, Ti);
T2e = FNMS(Tj, Th, T2d);
}
{
/* Point 9 combined with the twiddle pair W[16], W[17]. */
E TW, TZ, TX, T1X, TV, TY;
TW = ri[WS(rs, 9)];
TZ = ii[WS(rs, 9)];
TV = W[16];
TX = TV * TW;
T1X = TV * TZ;
TY = W[17];
T10 = FMA(TY, TZ, TX);
T1Y = FNMS(TY, TW, T1X);
}
{
/* Point 3 combined with the twiddle pair W[4], W[5]. */
E TC, TF, TD, T1R, TB, TE;
TC = ri[WS(rs, 3)];
TF = ii[WS(rs, 3)];
TB = W[4];
TD = TB * TC;
T1R = TB * TF;
TE = W[5];
TG = FMA(TE, TF, TD);
T1S = FNMS(TE, TC, T1R);
}
{
/* Points 10 (W[18], W[19]) and 2 (W[2], W[3]); only their sums and
 * differences are kept. */
E Tn, Tq, To, T1o, Tt, Tw, Tu, T1q, Tm, Ts;
Tn = ri[WS(rs, 10)];
Tq = ii[WS(rs, 10)];
Tm = W[18];
To = Tm * Tn;
T1o = Tm * Tq;
Tt = ri[WS(rs, 2)];
Tw = ii[WS(rs, 2)];
Ts = W[2];
Tu = Ts * Tt;
T1q = Ts * Tw;
{
E Tr, T1p, Tx, T1r, Tp, Tv;
Tp = W[19];
Tr = FMA(Tp, Tq, To);
T1p = FNMS(Tp, Tn, T1o);
Tv = W[3];
Tx = FMA(Tv, Tw, Tu);
T1r = FNMS(Tv, Tt, T1q);
Ty = Tr + Tx;
T2r = Tx - Tr;
T1s = T1p - T1r;
T2f = T1p + T1r;
}
}
{
/* Points 1 (W[0], W[1]) and 5 (W[8], W[9]); sums and differences kept. */
E T12, T15, T13, T1D, T18, T1b, T19, T1F, T11, T17;
T12 = ri[WS(rs, 1)];
T15 = ii[WS(rs, 1)];
T11 = W[0];
T13 = T11 * T12;
T1D = T11 * T15;
T18 = ri[WS(rs, 5)];
T1b = ii[WS(rs, 5)];
T17 = W[8];
T19 = T17 * T18;
T1F = T17 * T1b;
{
E T16, T1E, T1c, T1G, T14, T1a;
T14 = W[1];
T16 = FMA(T14, T15, T13);
T1E = FNMS(T14, T12, T1D);
T1a = W[9];
T1c = FMA(T1a, T1b, T19);
T1G = FNMS(T1a, T18, T1F);
T1d = T16 + T1c;
T21 = T1c - T16;
T1H = T1E - T1G;
T1Z = T1E + T1G;
}
}
{
/* Points 4 (W[6], W[7]) and 8 (W[14], W[15]); sums and differences kept. */
E T3, T6, T4, T1h, T9, Tc, Ta, T1j, T2, T8;
T3 = ri[WS(rs, 4)];
T6 = ii[WS(rs, 4)];
T2 = W[6];
T4 = T2 * T3;
T1h = T2 * T6;
T9 = ri[WS(rs, 8)];
Tc = ii[WS(rs, 8)];
T8 = W[14];
Ta = T8 * T9;
T1j = T8 * Tc;
{
E T7, T1i, Td, T1k, T5, Tb;
T5 = W[7];
T7 = FMA(T5, T6, T4);
T1i = FNMS(T5, T3, T1h);
Tb = W[15];
Td = FMA(Tb, Tc, Ta);
T1k = FNMS(Tb, T9, T1j);
Te = T7 + Td;
T2o = Td - T7;
T1l = T1i - T1k;
T2h = T1i + T1k;
}
}
{
/* Points 7 (W[12], W[13]) and 11 (W[20], W[21]); sums and differences
 * kept. */
E TI, TL, TJ, T1w, TO, TR, TP, T1y, TH, TN;
TI = ri[WS(rs, 7)];
TL = ii[WS(rs, 7)];
TH = W[12];
TJ = TH * TI;
T1w = TH * TL;
TO = ri[WS(rs, 11)];
TR = ii[WS(rs, 11)];
TN = W[20];
TP = TN * TO;
T1y = TN * TR;
{
E TM, T1x, TS, T1z, TK, TQ;
TK = W[13];
TM = FMA(TK, TL, TJ);
T1x = FNMS(TK, TI, T1w);
TQ = W[21];
TS = FMA(TQ, TR, TP);
T1z = FNMS(TQ, TO, T1y);
TT = TM + TS;
T1V = TS - TM;
T1A = T1x - T1z;
T1T = T1x + T1z;
}
}
{
/* Outputs 0, 3, 6 and 9: formed purely from sums and differences of the
 * accumulated values (no constant multiplies). */
E TA, T28, T2k, T2m, T1f, T2l, T2b, T2c;
{
E Tf, Tz, T2g, T2j;
Tf = T1 + Te;
Tz = Tl + Ty;
TA = Tf + Tz;
T28 = Tf - Tz;
T2g = T2e + T2f;
T2j = T2h + T2i;
T2k = T2g + T2j;
T2m = T2j - T2g;
}
{
E TU, T1e, T29, T2a;
TU = TG + TT;
T1e = T10 + T1d;
T1f = TU + T1e;
T2l = TU - T1e;
T29 = T1S + T1T;
T2a = T1Y + T1Z;
T2b = T29 - T2a;
T2c = T29 + T2a;
}
ri[WS(rs, 6)] = TA - T1f;
ii[WS(rs, 6)] = T2k - T2c;
ri[0] = TA + T1f;
ii[0] = T2c + T2k;
ri[WS(rs, 3)] = T28 - T2b;
ii[WS(rs, 3)] = T2l + T2m;
ri[WS(rs, 9)] = T28 + T2b;
ii[WS(rs, 9)] = T2m - T2l;
}
{
/* Remaining outputs (1, 2, 4, 5, 7, 8, 10, 11): each intermediate is
 * "base - 1/2 * sum" plus or minus KP866025403 times a difference. */
E T1m, T1K, T2p, T2y, T2s, T2x, T1t, T1L, T1B, T1N, T1W, T25, T22, T26, T1I;
E T1O;
{
E T1g, T2n, T2q, T1n;
T1g = FNMS(KP500000000, Te, T1);
T1m = FNMS(KP866025403, T1l, T1g);
T1K = FMA(KP866025403, T1l, T1g);
T2n = FNMS(KP500000000, T2h, T2i);
T2p = FMA(KP866025403, T2o, T2n);
T2y = FNMS(KP866025403, T2o, T2n);
T2q = FNMS(KP500000000, T2f, T2e);
T2s = FMA(KP866025403, T2r, T2q);
T2x = FNMS(KP866025403, T2r, T2q);
T1n = FNMS(KP500000000, Ty, Tl);
T1t = FNMS(KP866025403, T1s, T1n);
T1L = FMA(KP866025403, T1s, T1n);
}
{
E T1v, T1U, T20, T1C;
T1v = FNMS(KP500000000, TT, TG);
T1B = FNMS(KP866025403, T1A, T1v);
T1N = FMA(KP866025403, T1A, T1v);
T1U = FNMS(KP500000000, T1T, T1S);
T1W = FMA(KP866025403, T1V, T1U);
T25 = FNMS(KP866025403, T1V, T1U);
T20 = FNMS(KP500000000, T1Z, T1Y);
T22 = FMA(KP866025403, T21, T20);
T26 = FNMS(KP866025403, T21, T20);
T1C = FNMS(KP500000000, T1d, T10);
T1I = FNMS(KP866025403, T1H, T1C);
T1O = FMA(KP866025403, T1H, T1C);
}
{
/* Outputs 2 and 8. */
E T1u, T1J, T2z, T2A;
T1u = T1m + T1t;
T1J = T1B + T1I;
ri[WS(rs, 2)] = T1u - T1J;
ri[WS(rs, 8)] = T1u + T1J;
T2z = T2x + T2y;
T2A = T25 + T26;
ii[WS(rs, 2)] = T2z - T2A;
ii[WS(rs, 8)] = T2A + T2z;
}
{
/* Outputs 10 and 4. */
E T1M, T1P, T2v, T2w;
T1M = T1K + T1L;
T1P = T1N + T1O;
ri[WS(rs, 10)] = T1M - T1P;
ri[WS(rs, 4)] = T1M + T1P;
T2v = T1W + T22;
T2w = T2s + T2p;
ii[WS(rs, 4)] = T2v + T2w;
ii[WS(rs, 10)] = T2w - T2v;
}
{
/* Outputs 7 and 1. */
E T1Q, T23, T2t, T2u;
T1Q = T1K - T1L;
T23 = T1W - T22;
ri[WS(rs, 7)] = T1Q - T23;
ri[WS(rs, 1)] = T1Q + T23;
T2t = T2p - T2s;
T2u = T1N - T1O;
ii[WS(rs, 1)] = T2t - T2u;
ii[WS(rs, 7)] = T2u + T2t;
}
{
/* Outputs 11 and 5. */
E T24, T27, T2B, T2C;
T24 = T1m - T1t;
T27 = T25 - T26;
ri[WS(rs, 11)] = T24 - T27;
ri[WS(rs, 5)] = T24 + T27;
T2B = T2y - T2x;
T2C = T1B - T1I;
ii[WS(rs, 5)] = T2B - T2C;
ii[WS(rs, 11)] = T2C + T2B;
}
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.  The meaning of
 * TW_FULL / TW_NEXT and of their operands is defined outside this file.
 * NOTE(review): presumably "a full twiddle set for size 12, then step to the
 * next element" -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 12 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor passed to the planner together with t1_12.  The tuple
 * { 72, 22, 46, 0 } repeats the "72 additions, 22 multiplications, 46 fused
 * multiply/add" figures from the generated comment above this variant.
 * NOTE(review): the leading 12 is presumably the radix; the ct_desc layout
 * is not visible here -- confirm. */
static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, { 72, 22, 46, 0 }, 0, 0, 0 };
/* Registration entry point: hands the t1_12 kernel and its descriptor to the
 * planner `p` through kdft_dit_register. */
void X(codelet_t1_12) (planner *p) {
X(kdft_dit_register) (p, t1_12, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 12 -name t1_12 -include dft/scalar/t.h */
/*
* This function contains 118 FP additions, 60 FP multiplications,
* (or, 88 additions, 30 multiplications, 30 fused multiply/add),
* 47 stack variables, 2 constants, and 48 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_12 (variant compiled when FMA is not preferred): size-12 twiddle
 * codelet, registered further down through kdft_dit_register.
 *
 * ri, ii  - real / imaginary data; the 12 points at WS(rs, 0) .. WS(rs, 11)
 *           are read and then overwritten in place
 * W       - twiddle table; 22 values (11 pairs) are consumed per iteration
 * rs      - stride argument handed to WS() to address the 12 points
 * mb, me  - iterations m = mb .. me-1 are executed
 * ms      - amount added to ri and ii between iterations
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void t1_12(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* KP500000000 = 1/2 and KP866025403 ~ sqrt(3)/2, used in the three-point
 * combinations of each input group. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* Skip to the twiddles of iteration mb, then handle one group of 12 points
 * per pass, advancing the data pointers by ms and W by 22. */
for (m = mb, W = W + (mb * 22); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 22, MAKE_VOLATILE_STRIDE(24, rs)) {
E T1, T1W, T18, T21, Tc, T15, T1V, T22, TR, T1E, T1o, T1D, T12, T1l, T1F;
E T1G, Ti, T1S, T1d, T24, Tt, T1a, T1T, T25, TA, T1z, T1j, T1y, TL, T1g;
E T1A, T1B;
{
/* Group of points 0, 4, 8: point 0 is untwiddled, point 4 uses W[6], W[7]
 * and point 8 uses W[14], W[15]. */
E T6, T16, Tb, T17;
T1 = ri[0];
T1W = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 4)];
T5 = ii[WS(rs, 4)];
T2 = W[6];
T4 = W[7];
T6 = FMA(T2, T3, T4 * T5);
T16 = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 8)];
Ta = ii[WS(rs, 8)];
T7 = W[14];
T9 = W[15];
Tb = FMA(T7, T8, T9 * Ta);
T17 = FNMS(T9, T8, T7 * Ta);
}
/* Scaled differences, sums, and "first point - 1/2 * sum" terms. */
T18 = KP866025403 * (T16 - T17);
T21 = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
T15 = FNMS(KP500000000, Tc, T1);
T1V = T16 + T17;
T22 = FNMS(KP500000000, T1V, T1W);
}
{
/* Group of points 9 (W[16], W[17]), 5 (W[8], W[9]) and 1 (W[0], W[1]). */
E T11, T1n, TW, T1m;
{
E TO, TQ, TN, TP;
TO = ri[WS(rs, 9)];
TQ = ii[WS(rs, 9)];
TN = W[16];
TP = W[17];
TR = FMA(TN, TO, TP * TQ);
T1E = FNMS(TP, TO, TN * TQ);
}
{
E TY, T10, TX, TZ;
TY = ri[WS(rs, 5)];
T10 = ii[WS(rs, 5)];
TX = W[8];
TZ = W[9];
T11 = FMA(TX, TY, TZ * T10);
T1n = FNMS(TZ, TY, TX * T10);
}
{
E TT, TV, TS, TU;
TT = ri[WS(rs, 1)];
TV = ii[WS(rs, 1)];
TS = W[0];
TU = W[1];
TW = FMA(TS, TT, TU * TV);
T1m = FNMS(TU, TT, TS * TV);
}
T1o = KP866025403 * (T1m - T1n);
T1D = KP866025403 * (T11 - TW);
T12 = TW + T11;
T1l = FNMS(KP500000000, T12, TR);
T1F = T1m + T1n;
T1G = FNMS(KP500000000, T1F, T1E);
}
{
/* Group of points 6 (W[10], W[11]), 2 (W[2], W[3]) and 10 (W[18], W[19]). */
E Ts, T1c, Tn, T1b;
{
E Tf, Th, Te, Tg;
Tf = ri[WS(rs, 6)];
Th = ii[WS(rs, 6)];
Te = W[10];
Tg = W[11];
Ti = FMA(Te, Tf, Tg * Th);
T1S = FNMS(Tg, Tf, Te * Th);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 2)];
Tr = ii[WS(rs, 2)];
To = W[2];
Tq = W[3];
Ts = FMA(To, Tp, Tq * Tr);
T1c = FNMS(Tq, Tp, To * Tr);
}
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 10)];
Tm = ii[WS(rs, 10)];
Tj = W[18];
Tl = W[19];
Tn = FMA(Tj, Tk, Tl * Tm);
T1b = FNMS(Tl, Tk, Tj * Tm);
}
T1d = KP866025403 * (T1b - T1c);
T24 = KP866025403 * (Ts - Tn);
Tt = Tn + Ts;
T1a = FNMS(KP500000000, Tt, Ti);
T1T = T1b + T1c;
T25 = FNMS(KP500000000, T1T, T1S);
}
{
/* Group of points 3 (W[4], W[5]), 11 (W[20], W[21]) and 7 (W[12], W[13]). */
E TK, T1i, TF, T1h;
{
E Tx, Tz, Tw, Ty;
Tx = ri[WS(rs, 3)];
Tz = ii[WS(rs, 3)];
Tw = W[4];
Ty = W[5];
TA = FMA(Tw, Tx, Ty * Tz);
T1z = FNMS(Ty, Tx, Tw * Tz);
}
{
E TH, TJ, TG, TI;
TH = ri[WS(rs, 11)];
TJ = ii[WS(rs, 11)];
TG = W[20];
TI = W[21];
TK = FMA(TG, TH, TI * TJ);
T1i = FNMS(TI, TH, TG * TJ);
}
{
E TC, TE, TB, TD;
TC = ri[WS(rs, 7)];
TE = ii[WS(rs, 7)];
TB = W[12];
TD = W[13];
TF = FMA(TB, TC, TD * TE);
T1h = FNMS(TD, TC, TB * TE);
}
T1j = KP866025403 * (T1h - T1i);
T1y = KP866025403 * (TK - TF);
TL = TF + TK;
T1g = FNMS(KP500000000, TL, TA);
T1A = T1h + T1i;
T1B = FNMS(KP500000000, T1A, T1z);
}
{
/* Outputs 0, 3, 6 and 9: sums and differences of the four group totals. */
E Tv, T1N, T1Y, T20, T14, T1Z, T1Q, T1R;
{
E Td, Tu, T1U, T1X;
Td = T1 + Tc;
Tu = Ti + Tt;
Tv = Td + Tu;
T1N = Td - Tu;
T1U = T1S + T1T;
T1X = T1V + T1W;
T1Y = T1U + T1X;
T20 = T1X - T1U;
}
{
E TM, T13, T1O, T1P;
TM = TA + TL;
T13 = TR + T12;
T14 = TM + T13;
T1Z = TM - T13;
T1O = T1z + T1A;
T1P = T1E + T1F;
T1Q = T1O - T1P;
T1R = T1O + T1P;
}
ri[WS(rs, 6)] = Tv - T14;
ii[WS(rs, 6)] = T1Y - T1R;
ri[0] = Tv + T14;
ii[0] = T1R + T1Y;
ri[WS(rs, 3)] = T1N - T1Q;
ii[WS(rs, 3)] = T1Z + T20;
ri[WS(rs, 9)] = T1N + T1Q;
ii[WS(rs, 9)] = T20 - T1Z;
}
{
/* Outputs 1, 4, 7 and 10: built from the "+ KP866025403 term" halves. */
E T1t, T1x, T27, T2a, T1w, T28, T1I, T29;
{
E T1r, T1s, T23, T26;
T1r = T15 + T18;
T1s = T1a + T1d;
T1t = T1r + T1s;
T1x = T1r - T1s;
T23 = T21 + T22;
T26 = T24 + T25;
T27 = T23 - T26;
T2a = T26 + T23;
}
{
E T1u, T1v, T1C, T1H;
T1u = T1g + T1j;
T1v = T1l + T1o;
T1w = T1u + T1v;
T28 = T1u - T1v;
T1C = T1y + T1B;
T1H = T1D + T1G;
T1I = T1C - T1H;
T29 = T1C + T1H;
}
ri[WS(rs, 10)] = T1t - T1w;
ii[WS(rs, 10)] = T2a - T29;
ri[WS(rs, 4)] = T1t + T1w;
ii[WS(rs, 4)] = T29 + T2a;
ri[WS(rs, 7)] = T1x - T1I;
ii[WS(rs, 7)] = T28 + T27;
ri[WS(rs, 1)] = T1x + T1I;
ii[WS(rs, 1)] = T27 - T28;
}
{
/* Outputs 2, 5, 8 and 11: built from the "- KP866025403 term" halves. */
E T1f, T1J, T2d, T2f, T1q, T2g, T1M, T2e;
{
E T19, T1e, T2b, T2c;
T19 = T15 - T18;
T1e = T1a - T1d;
T1f = T19 + T1e;
T1J = T19 - T1e;
T2b = T25 - T24;
T2c = T22 - T21;
T2d = T2b + T2c;
T2f = T2c - T2b;
}
{
E T1k, T1p, T1K, T1L;
T1k = T1g - T1j;
T1p = T1l - T1o;
T1q = T1k + T1p;
T2g = T1k - T1p;
T1K = T1B - T1y;
T1L = T1G - T1D;
T1M = T1K - T1L;
T2e = T1K + T1L;
}
ri[WS(rs, 2)] = T1f - T1q;
ii[WS(rs, 2)] = T2d - T2e;
ri[WS(rs, 8)] = T1f + T1q;
ii[WS(rs, 8)] = T2e + T2d;
ri[WS(rs, 11)] = T1J - T1M;
ii[WS(rs, 11)] = T2g + T2f;
ri[WS(rs, 5)] = T1J + T1M;
ii[WS(rs, 5)] = T2f - T2g;
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.  The meaning of
 * TW_FULL / TW_NEXT and of their operands is defined outside this file.
 * NOTE(review): presumably "a full twiddle set for size 12, then step to the
 * next element" -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 12 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor passed to the planner together with t1_12.  The tuple
 * { 88, 30, 30, 0 } repeats the "88 additions, 30 multiplications, 30 fused
 * multiply/add" figures from the generated comment above this variant.
 * NOTE(review): the leading 12 is presumably the radix; the ct_desc layout
 * is not visible here -- confirm. */
static const ct_desc desc = { 12, "t1_12", twinstr, &GENUS, { 88, 30, 30, 0 }, 0, 0, 0 };
/* Registration entry point: hands the t1_12 kernel and its descriptor to the
 * planner `p` through kdft_dit_register. */
void X(codelet_t1_12) (planner *p) {
X(kdft_dit_register) (p, t1_12, &desc);
}
#endif

View File

@@ -0,0 +1,816 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
/*
* This function contains 184 FP additions, 140 FP multiplications,
* (or, 72 additions, 28 multiplications, 112 fused multiply/add),
* 51 stack variables, 6 constants, and 60 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_15 (variant compiled when FMA is preferred): size-15 twiddle codelet,
 * registered further down through kdft_dit_register.
 *
 * ri, ii  - real / imaginary data; the 15 points at WS(rs, 0) .. WS(rs, 14)
 *           are read and then overwritten in place
 * W       - twiddle table; 28 values (14 pairs) are consumed per iteration
 * rs      - stride argument handed to WS() to address the 15 points
 * mb, me  - iterations m = mb .. me-1 are executed
 * ms      - amount added to ri and ii between iterations
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: KP951056516 ~ sin(2*pi/5), KP559016994 ~ sqrt(5)/4,
 * KP618033988 ~ (sqrt(5)-1)/2, KP866025403 ~ sqrt(3)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
/* Skip to the twiddles of iteration mb, then handle one group of 15 points
 * per pass, advancing the data pointers by ms and W by 28. */
for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
E T1, T3j, T1G, T3u, Te, T1B, T3i, T3t, T1y, T2i, T2a, T2M, T37, T2V, Tz;
E T2e, T1O, T2t, T39, T2X, TT, T2f, T1V, T2z, T3a, T2Y, T1e, T2h, T23, T2G;
E T36, T2U;
{
/* Group of points 0, 5, 10: point 0 is untwiddled, point 5 uses
 * W[8], W[9] and point 10 uses W[18], W[19]. */
E T7, T1D, Td, T1F;
T1 = ri[0];
T3j = ii[0];
{
E T3, T6, T4, T1C, T2, T5;
T3 = ri[WS(rs, 5)];
T6 = ii[WS(rs, 5)];
T2 = W[8];
T4 = T2 * T3;
T1C = T2 * T6;
T5 = W[9];
T7 = FMA(T5, T6, T4);
T1D = FNMS(T5, T3, T1C);
}
{
E T9, Tc, Ta, T1E, T8, Tb;
T9 = ri[WS(rs, 10)];
Tc = ii[WS(rs, 10)];
T8 = W[18];
Ta = T8 * T9;
T1E = T8 * Tc;
Tb = W[19];
Td = FMA(Tb, Tc, Ta);
T1F = FNMS(Tb, T9, T1E);
}
T1G = T1D - T1F;
T3u = Td - T7;
Te = T7 + Td;
T1B = FNMS(KP500000000, Te, T1);
T3i = T1D + T1F;
T3t = FNMS(KP500000000, T3i, T3j);
}
{
/* Group of points 9 (W[16], W[17]), 4 (W[6], W[7]) and 14 (W[26], W[27]). */
E T1k, T2I, T1w, T28, T1q, T26;
{
E T1g, T1j, T1h, T2H, T1f, T1i;
T1g = ri[WS(rs, 9)];
T1j = ii[WS(rs, 9)];
T1f = W[16];
T1h = T1f * T1g;
T2H = T1f * T1j;
T1i = W[17];
T1k = FMA(T1i, T1j, T1h);
T2I = FNMS(T1i, T1g, T2H);
}
{
E T1s, T1v, T1t, T27, T1r, T1u;
T1s = ri[WS(rs, 4)];
T1v = ii[WS(rs, 4)];
T1r = W[6];
T1t = T1r * T1s;
T27 = T1r * T1v;
T1u = W[7];
T1w = FMA(T1u, T1v, T1t);
T28 = FNMS(T1u, T1s, T27);
}
{
E T1m, T1p, T1n, T25, T1l, T1o;
T1m = ri[WS(rs, 14)];
T1p = ii[WS(rs, 14)];
T1l = W[26];
T1n = T1l * T1m;
T25 = T1l * T1p;
T1o = W[27];
T1q = FMA(T1o, T1p, T1n);
T26 = FNMS(T1o, T1m, T25);
}
{
/* Three-point combination of the group: total, and "first - 1/2 * sum"
 * plus or minus KP866025403 times the difference. */
E T29, T1x, T24, T2L, T2J, T2K;
T29 = T26 - T28;
T1x = T1q + T1w;
T24 = FNMS(KP500000000, T1x, T1k);
T1y = T1k + T1x;
T2i = FMA(KP866025403, T29, T24);
T2a = FNMS(KP866025403, T29, T24);
T2L = T1w - T1q;
T2J = T26 + T28;
T2K = FNMS(KP500000000, T2J, T2I);
T2M = FMA(KP866025403, T2L, T2K);
T37 = T2I + T2J;
T2V = FNMS(KP866025403, T2L, T2K);
}
}
{
/* Group of points 3 (W[4], W[5]), 13 (W[24], W[25]) and 8 (W[14], W[15]). */
E Tl, T2p, Tx, T1M, Tr, T1K;
{
E Th, Tk, Ti, T2o, Tg, Tj;
Th = ri[WS(rs, 3)];
Tk = ii[WS(rs, 3)];
Tg = W[4];
Ti = Tg * Th;
T2o = Tg * Tk;
Tj = W[5];
Tl = FMA(Tj, Tk, Ti);
T2p = FNMS(Tj, Th, T2o);
}
{
E Tt, Tw, Tu, T1L, Ts, Tv;
Tt = ri[WS(rs, 13)];
Tw = ii[WS(rs, 13)];
Ts = W[24];
Tu = Ts * Tt;
T1L = Ts * Tw;
Tv = W[25];
Tx = FMA(Tv, Tw, Tu);
T1M = FNMS(Tv, Tt, T1L);
}
{
E Tn, Tq, To, T1J, Tm, Tp;
Tn = ri[WS(rs, 8)];
Tq = ii[WS(rs, 8)];
Tm = W[14];
To = Tm * Tn;
T1J = Tm * Tq;
Tp = W[15];
Tr = FMA(Tp, Tq, To);
T1K = FNMS(Tp, Tn, T1J);
}
{
E T1N, Ty, T1I, T2s, T2q, T2r;
T1N = T1K - T1M;
Ty = Tr + Tx;
T1I = FNMS(KP500000000, Ty, Tl);
Tz = Tl + Ty;
T2e = FMA(KP866025403, T1N, T1I);
T1O = FNMS(KP866025403, T1N, T1I);
T2s = Tx - Tr;
T2q = T1K + T1M;
T2r = FNMS(KP500000000, T2q, T2p);
T2t = FMA(KP866025403, T2s, T2r);
T39 = T2p + T2q;
T2X = FNMS(KP866025403, T2s, T2r);
}
}
{
/* Group of points 12 (W[22], W[23]), 7 (W[12], W[13]) and 2 (W[2], W[3]). */
E TF, T2v, TR, T1T, TL, T1R;
{
E TB, TE, TC, T2u, TA, TD;
TB = ri[WS(rs, 12)];
TE = ii[WS(rs, 12)];
TA = W[22];
TC = TA * TB;
T2u = TA * TE;
TD = W[23];
TF = FMA(TD, TE, TC);
T2v = FNMS(TD, TB, T2u);
}
{
E TN, TQ, TO, T1S, TM, TP;
TN = ri[WS(rs, 7)];
TQ = ii[WS(rs, 7)];
TM = W[12];
TO = TM * TN;
T1S = TM * TQ;
TP = W[13];
TR = FMA(TP, TQ, TO);
T1T = FNMS(TP, TN, T1S);
}
{
E TH, TK, TI, T1Q, TG, TJ;
TH = ri[WS(rs, 2)];
TK = ii[WS(rs, 2)];
TG = W[2];
TI = TG * TH;
T1Q = TG * TK;
TJ = W[3];
TL = FMA(TJ, TK, TI);
T1R = FNMS(TJ, TH, T1Q);
}
{
E T1U, TS, T1P, T2y, T2w, T2x;
T1U = T1R - T1T;
TS = TL + TR;
T1P = FNMS(KP500000000, TS, TF);
TT = TF + TS;
T2f = FMA(KP866025403, T1U, T1P);
T1V = FNMS(KP866025403, T1U, T1P);
T2y = TR - TL;
T2w = T1R + T1T;
T2x = FNMS(KP500000000, T2w, T2v);
T2z = FMA(KP866025403, T2y, T2x);
T3a = T2v + T2w;
T2Y = FNMS(KP866025403, T2y, T2x);
}
}
{
/* Group of points 6 (W[10], W[11]), 1 (W[0], W[1]) and 11 (W[20], W[21]). */
E T10, T2C, T1c, T21, T16, T1Z;
{
E TW, TZ, TX, T2B, TV, TY;
TW = ri[WS(rs, 6)];
TZ = ii[WS(rs, 6)];
TV = W[10];
TX = TV * TW;
T2B = TV * TZ;
TY = W[11];
T10 = FMA(TY, TZ, TX);
T2C = FNMS(TY, TW, T2B);
}
{
E T18, T1b, T19, T20, T17, T1a;
T18 = ri[WS(rs, 1)];
T1b = ii[WS(rs, 1)];
T17 = W[0];
T19 = T17 * T18;
T20 = T17 * T1b;
T1a = W[1];
T1c = FMA(T1a, T1b, T19);
T21 = FNMS(T1a, T18, T20);
}
{
E T12, T15, T13, T1Y, T11, T14;
T12 = ri[WS(rs, 11)];
T15 = ii[WS(rs, 11)];
T11 = W[20];
T13 = T11 * T12;
T1Y = T11 * T15;
T14 = W[21];
T16 = FMA(T14, T15, T13);
T1Z = FNMS(T14, T12, T1Y);
}
{
E T22, T1d, T1X, T2F, T2D, T2E;
T22 = T1Z - T21;
T1d = T16 + T1c;
T1X = FNMS(KP500000000, T1d, T10);
T1e = T10 + T1d;
T2h = FMA(KP866025403, T22, T1X);
T23 = FNMS(KP866025403, T22, T1X);
T2F = T1c - T16;
T2D = T1Z + T21;
T2E = FNMS(KP500000000, T2D, T2C);
T2G = FMA(KP866025403, T2F, T2E);
T36 = T2C + T2D;
T2U = FNMS(KP866025403, T2F, T2E);
}
}
{
/* Real parts of outputs 0, 3, 6, 9, 12, from the five group totals. */
E T3c, T3e, Tf, T1A, T33, T34, T3d, T35;
{
E T38, T3b, TU, T1z;
T38 = T36 - T37;
T3b = T39 - T3a;
T3c = FNMS(KP618033988, T3b, T38);
T3e = FMA(KP618033988, T38, T3b);
Tf = T1 + Te;
TU = Tz + TT;
T1z = T1e + T1y;
T1A = TU + T1z;
T33 = FNMS(KP250000000, T1A, Tf);
T34 = TU - T1z;
}
ri[0] = Tf + T1A;
T3d = FMA(KP559016994, T34, T33);
ri[WS(rs, 9)] = FNMS(KP951056516, T3e, T3d);
ri[WS(rs, 6)] = FMA(KP951056516, T3e, T3d);
T35 = FNMS(KP559016994, T34, T33);
ri[WS(rs, 12)] = FNMS(KP951056516, T3c, T35);
ri[WS(rs, 3)] = FMA(KP951056516, T3c, T35);
}
{
/* Imaginary parts of outputs 0, 3, 6, 9, 12. */
E T3q, T3s, T3k, T3h, T3l, T3m, T3r, T3n;
{
E T3o, T3p, T3f, T3g;
T3o = T1e - T1y;
T3p = Tz - TT;
T3q = FNMS(KP618033988, T3p, T3o);
T3s = FMA(KP618033988, T3o, T3p);
T3k = T3i + T3j;
T3f = T39 + T3a;
T3g = T36 + T37;
T3h = T3f + T3g;
T3l = FNMS(KP250000000, T3h, T3k);
T3m = T3f - T3g;
}
ii[0] = T3h + T3k;
T3r = FMA(KP559016994, T3m, T3l);
ii[WS(rs, 6)] = FNMS(KP951056516, T3s, T3r);
ii[WS(rs, 9)] = FMA(KP951056516, T3s, T3r);
T3n = FNMS(KP559016994, T3m, T3l);
ii[WS(rs, 3)] = FNMS(KP951056516, T3q, T3n);
ii[WS(rs, 12)] = FMA(KP951056516, T3q, T3n);
}
{
/* Real parts of outputs 2, 5, 8, 11, 14, from the "- KP866025403" halves. */
E T30, T32, T1H, T2c, T2R, T2S, T31, T2T;
{
E T2W, T2Z, T1W, T2b;
T2W = T2U - T2V;
T2Z = T2X - T2Y;
T30 = FNMS(KP618033988, T2Z, T2W);
T32 = FMA(KP618033988, T2W, T2Z);
T1H = FNMS(KP866025403, T1G, T1B);
T1W = T1O + T1V;
T2b = T23 + T2a;
T2c = T1W + T2b;
T2R = FNMS(KP250000000, T2c, T1H);
T2S = T1W - T2b;
}
ri[WS(rs, 5)] = T1H + T2c;
T31 = FMA(KP559016994, T2S, T2R);
ri[WS(rs, 14)] = FNMS(KP951056516, T32, T31);
ri[WS(rs, 11)] = FMA(KP951056516, T32, T31);
T2T = FNMS(KP559016994, T2S, T2R);
ri[WS(rs, 2)] = FNMS(KP951056516, T30, T2T);
ri[WS(rs, 8)] = FMA(KP951056516, T30, T2T);
}
{
/* Imaginary parts of outputs 2, 5, 8, 11, 14. */
E T3Q, T3S, T3H, T3K, T3L, T3M, T3R, T3N;
{
E T3O, T3P, T3I, T3J;
T3O = T23 - T2a;
T3P = T1O - T1V;
T3Q = FNMS(KP618033988, T3P, T3O);
T3S = FMA(KP618033988, T3O, T3P);
T3H = FNMS(KP866025403, T3u, T3t);
T3I = T2X + T2Y;
T3J = T2U + T2V;
T3K = T3I + T3J;
T3L = FNMS(KP250000000, T3K, T3H);
T3M = T3I - T3J;
}
ii[WS(rs, 5)] = T3K + T3H;
T3R = FMA(KP559016994, T3M, T3L);
ii[WS(rs, 11)] = FNMS(KP951056516, T3S, T3R);
ii[WS(rs, 14)] = FMA(KP951056516, T3S, T3R);
T3N = FNMS(KP559016994, T3M, T3L);
ii[WS(rs, 2)] = FMA(KP951056516, T3Q, T3N);
ii[WS(rs, 8)] = FNMS(KP951056516, T3Q, T3N);
}
{
/* Imaginary parts of outputs 1, 4, 7, 10, 13, from the "+ KP866025403"
 * halves. */
E T3E, T3G, T3v, T3y, T3z, T3A, T3F, T3B;
{
E T3C, T3D, T3w, T3x;
T3C = T2e - T2f;
T3D = T2h - T2i;
T3E = FMA(KP618033988, T3D, T3C);
T3G = FNMS(KP618033988, T3C, T3D);
T3v = FMA(KP866025403, T3u, T3t);
T3w = T2t + T2z;
T3x = T2G + T2M;
T3y = T3w + T3x;
T3z = FNMS(KP250000000, T3y, T3v);
T3A = T3w - T3x;
}
ii[WS(rs, 10)] = T3y + T3v;
T3F = FNMS(KP559016994, T3A, T3z);
ii[WS(rs, 7)] = FMA(KP951056516, T3G, T3F);
ii[WS(rs, 13)] = FNMS(KP951056516, T3G, T3F);
T3B = FMA(KP559016994, T3A, T3z);
ii[WS(rs, 1)] = FNMS(KP951056516, T3E, T3B);
ii[WS(rs, 4)] = FMA(KP951056516, T3E, T3B);
}
{
/* Real parts of outputs 1, 4, 7, 10, 13. */
E T2O, T2Q, T2d, T2k, T2l, T2m, T2P, T2n;
{
E T2A, T2N, T2g, T2j;
T2A = T2t - T2z;
T2N = T2G - T2M;
T2O = FMA(KP618033988, T2N, T2A);
T2Q = FNMS(KP618033988, T2A, T2N);
T2d = FMA(KP866025403, T1G, T1B);
T2g = T2e + T2f;
T2j = T2h + T2i;
T2k = T2g + T2j;
T2l = FNMS(KP250000000, T2k, T2d);
T2m = T2g - T2j;
}
ri[WS(rs, 10)] = T2d + T2k;
T2P = FNMS(KP559016994, T2m, T2l);
ri[WS(rs, 7)] = FNMS(KP951056516, T2Q, T2P);
ri[WS(rs, 13)] = FMA(KP951056516, T2Q, T2P);
T2n = FMA(KP559016994, T2m, T2l);
ri[WS(rs, 4)] = FNMS(KP951056516, T2O, T2n);
ri[WS(rs, 1)] = FMA(KP951056516, T2O, T2n);
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.  The meaning of
 * TW_FULL / TW_NEXT and of their operands is defined outside this file.
 * NOTE(review): presumably "a full twiddle set for size 15, then step to the
 * next element" -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 15 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor passed to the planner together with t1_15.  The tuple
 * { 72, 28, 112, 0 } repeats the "72 additions, 28 multiplications, 112 fused
 * multiply/add" figures from the generated comment above this variant.
 * NOTE(review): the leading 15 is presumably the radix; the ct_desc layout
 * is not visible here -- confirm. */
static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 72, 28, 112, 0 }, 0, 0, 0 };
/* Registration entry point: hands the t1_15 kernel and its descriptor to the
 * planner `p` through kdft_dit_register. */
void X(codelet_t1_15) (planner *p) {
X(kdft_dit_register) (p, t1_15, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 15 -name t1_15 -include dft/scalar/t.h */
/*
* This function contains 184 FP additions, 112 FP multiplications,
* (or, 128 additions, 56 multiplications, 56 fused multiply/add),
* 65 stack variables, 6 constants, and 60 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_15 (variant compiled when FMA is not preferred): size-15 twiddle
 * codelet, registered further down through kdft_dit_register.
 *
 * ri, ii  - real / imaginary data; the 15 points at WS(rs, 0) .. WS(rs, 14)
 *           are read and then overwritten in place
 * W       - twiddle table; 28 values (14 pairs) are consumed per iteration
 * rs      - stride argument handed to WS() to address the 15 points
 * mb, me  - iterations m = mb .. me-1 are executed
 * ms      - amount added to ri and ii between iterations
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this file --
 * confirm.
 */
static void t1_15(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numerically: KP587785252 ~ sin(pi/5), KP951056516 ~ sin(2*pi/5),
 * KP559016994 ~ sqrt(5)/4, KP866025403 ~ sqrt(3)/2. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* Skip to the twiddles of iteration mb, then handle one group of 15 points
 * per pass, advancing the data pointers by ms and W by 28. */
for (m = mb, W = W + (mb * 28); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 28, MAKE_VOLATILE_STRIDE(30, rs)) {
E T1q, T34, Td, T1n, T2S, T35, T13, T1k, T1l, T2E, T2F, T2O, T1H, T1T, T2k;
E T2t, T2f, T2s, T1M, T1U, Tu, TL, TM, T2H, T2I, T2N, T1w, T1Q, T29, T2w;
E T24, T2v, T1B, T1R;
{
/* Group of points 0, 5, 10: point 0 is untwiddled, point 5 uses
 * W[8], W[9] and point 10 uses W[18], W[19]. */
E T1, T2R, T6, T1o, Tb, T1p, Tc, T2Q;
T1 = ri[0];
T2R = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 5)];
T5 = ii[WS(rs, 5)];
T2 = W[8];
T4 = W[9];
T6 = FMA(T2, T3, T4 * T5);
T1o = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 10)];
Ta = ii[WS(rs, 10)];
T7 = W[18];
T9 = W[19];
Tb = FMA(T7, T8, T9 * Ta);
T1p = FNMS(T9, T8, T7 * Ta);
}
/* Scaled differences, group totals, and "first point - 1/2 * sum" terms. */
T1q = KP866025403 * (T1o - T1p);
T34 = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
Td = T1 + Tc;
T1n = FNMS(KP500000000, Tc, T1);
T2Q = T1o + T1p;
T2S = T2Q + T2R;
T35 = FNMS(KP500000000, T2Q, T2R);
}
{
/* Two groups: points 6, 11, 1 (W[10..11], W[20..21], W[0..1]) and
 * points 9, 14, 4 (W[16..17], W[26..27], W[6..7]). */
E TR, T2c, T18, T2h, TW, T1E, T11, T1F, T12, T2d, T1d, T1J, T1i, T1K, T1j;
E T2i;
{
E TO, TQ, TN, TP;
TO = ri[WS(rs, 6)];
TQ = ii[WS(rs, 6)];
TN = W[10];
TP = W[11];
TR = FMA(TN, TO, TP * TQ);
T2c = FNMS(TP, TO, TN * TQ);
}
{
E T15, T17, T14, T16;
T15 = ri[WS(rs, 9)];
T17 = ii[WS(rs, 9)];
T14 = W[16];
T16 = W[17];
T18 = FMA(T14, T15, T16 * T17);
T2h = FNMS(T16, T15, T14 * T17);
}
{
E TT, TV, TS, TU;
TT = ri[WS(rs, 11)];
TV = ii[WS(rs, 11)];
TS = W[20];
TU = W[21];
TW = FMA(TS, TT, TU * TV);
T1E = FNMS(TU, TT, TS * TV);
}
{
E TY, T10, TX, TZ;
TY = ri[WS(rs, 1)];
T10 = ii[WS(rs, 1)];
TX = W[0];
TZ = W[1];
T11 = FMA(TX, TY, TZ * T10);
T1F = FNMS(TZ, TY, TX * T10);
}
T12 = TW + T11;
T2d = T1E + T1F;
{
E T1a, T1c, T19, T1b;
T1a = ri[WS(rs, 14)];
T1c = ii[WS(rs, 14)];
T19 = W[26];
T1b = W[27];
T1d = FMA(T19, T1a, T1b * T1c);
T1J = FNMS(T1b, T1a, T19 * T1c);
}
{
E T1f, T1h, T1e, T1g;
T1f = ri[WS(rs, 4)];
T1h = ii[WS(rs, 4)];
T1e = W[6];
T1g = W[7];
T1i = FMA(T1e, T1f, T1g * T1h);
T1K = FNMS(T1g, T1f, T1e * T1h);
}
T1j = T1d + T1i;
T2i = T1J + T1K;
{
/* Three-point combinations for both groups, plus the sums of the two
 * group totals (T1l real, T2O imaginary). */
E T1D, T1G, T2g, T2j;
T13 = TR + T12;
T1k = T18 + T1j;
T1l = T13 + T1k;
T2E = T2c + T2d;
T2F = T2h + T2i;
T2O = T2E + T2F;
T1D = FNMS(KP500000000, T12, TR);
T1G = KP866025403 * (T1E - T1F);
T1H = T1D - T1G;
T1T = T1D + T1G;
T2g = KP866025403 * (T1i - T1d);
T2j = FNMS(KP500000000, T2i, T2h);
T2k = T2g + T2j;
T2t = T2j - T2g;
{
E T2b, T2e, T1I, T1L;
T2b = KP866025403 * (T11 - TW);
T2e = FNMS(KP500000000, T2d, T2c);
T2f = T2b + T2e;
T2s = T2e - T2b;
T1I = FNMS(KP500000000, T1j, T18);
T1L = KP866025403 * (T1J - T1K);
T1M = T1I - T1L;
T1U = T1I + T1L;
}
}
}
{
/* Two groups: points 3, 8, 13 (W[4..5], W[14..15], W[24..25]) and
 * points 12, 2, 7 (W[22..23], W[2..3], W[12..13]). */
E Ti, T21, Tz, T26, Tn, T1t, Ts, T1u, Tt, T22, TE, T1y, TJ, T1z, TK;
E T27;
{
E Tf, Th, Te, Tg;
Tf = ri[WS(rs, 3)];
Th = ii[WS(rs, 3)];
Te = W[4];
Tg = W[5];
Ti = FMA(Te, Tf, Tg * Th);
T21 = FNMS(Tg, Tf, Te * Th);
}
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 12)];
Ty = ii[WS(rs, 12)];
Tv = W[22];
Tx = W[23];
Tz = FMA(Tv, Tw, Tx * Ty);
T26 = FNMS(Tx, Tw, Tv * Ty);
}
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 8)];
Tm = ii[WS(rs, 8)];
Tj = W[14];
Tl = W[15];
Tn = FMA(Tj, Tk, Tl * Tm);
T1t = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 13)];
Tr = ii[WS(rs, 13)];
To = W[24];
Tq = W[25];
Ts = FMA(To, Tp, Tq * Tr);
T1u = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn + Ts;
T22 = T1t + T1u;
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 2)];
TD = ii[WS(rs, 2)];
TA = W[2];
TC = W[3];
TE = FMA(TA, TB, TC * TD);
T1y = FNMS(TC, TB, TA * TD);
}
{
E TG, TI, TF, TH;
TG = ri[WS(rs, 7)];
TI = ii[WS(rs, 7)];
TF = W[12];
TH = W[13];
TJ = FMA(TF, TG, TH * TI);
T1z = FNMS(TH, TG, TF * TI);
}
TK = TE + TJ;
T27 = T1y + T1z;
{
/* Three-point combinations for both groups, plus the sums of the two
 * group totals (TM real, T2N imaginary). */
E T1s, T1v, T25, T28;
Tu = Ti + Tt;
TL = Tz + TK;
TM = Tu + TL;
T2H = T21 + T22;
T2I = T26 + T27;
T2N = T2H + T2I;
T1s = FNMS(KP500000000, Tt, Ti);
T1v = KP866025403 * (T1t - T1u);
T1w = T1s - T1v;
T1Q = T1s + T1v;
T25 = KP866025403 * (TJ - TE);
T28 = FNMS(KP500000000, T27, T26);
T29 = T25 + T28;
T2w = T28 - T25;
{
E T20, T23, T1x, T1A;
T20 = KP866025403 * (Ts - Tn);
T23 = FNMS(KP500000000, T22, T21);
T24 = T20 + T23;
T2v = T23 - T20;
T1x = FNMS(KP500000000, TK, Tz);
T1A = KP866025403 * (T1y - T1z);
T1B = T1x - T1A;
T1R = T1x + T1A;
}
}
}
{
/* Real parts of outputs 0, 3, 6, 9, 12, from the five group totals. */
E T2C, T1m, T2B, T2K, T2M, T2G, T2J, T2L, T2D;
T2C = KP559016994 * (TM - T1l);
T1m = TM + T1l;
T2B = FNMS(KP250000000, T1m, Td);
T2G = T2E - T2F;
T2J = T2H - T2I;
T2K = FNMS(KP587785252, T2J, KP951056516 * T2G);
T2M = FMA(KP951056516, T2J, KP587785252 * T2G);
ri[0] = Td + T1m;
T2L = T2C + T2B;
ri[WS(rs, 9)] = T2L - T2M;
ri[WS(rs, 6)] = T2L + T2M;
T2D = T2B - T2C;
ri[WS(rs, 12)] = T2D - T2K;
ri[WS(rs, 3)] = T2D + T2K;
}
{
/* Imaginary parts of outputs 0, 3, 6, 9, 12. */
E T2U, T2P, T2T, T2Y, T30, T2W, T2X, T2Z, T2V;
T2U = KP559016994 * (T2N - T2O);
T2P = T2N + T2O;
T2T = FNMS(KP250000000, T2P, T2S);
T2W = T13 - T1k;
T2X = Tu - TL;
T2Y = FNMS(KP587785252, T2X, KP951056516 * T2W);
T30 = FMA(KP951056516, T2X, KP587785252 * T2W);
ii[0] = T2P + T2S;
T2Z = T2U + T2T;
ii[WS(rs, 6)] = T2Z - T30;
ii[WS(rs, 9)] = T30 + T2Z;
T2V = T2T - T2U;
ii[WS(rs, 3)] = T2V - T2Y;
ii[WS(rs, 12)] = T2Y + T2V;
}
{
/* Real parts of outputs 2, 5, 8, 11, 14, from the "- KP866025403" halves. */
E T2y, T2A, T1r, T1O, T2p, T2q, T2z, T2r;
{
E T2u, T2x, T1C, T1N;
T2u = T2s - T2t;
T2x = T2v - T2w;
T2y = FNMS(KP587785252, T2x, KP951056516 * T2u);
T2A = FMA(KP951056516, T2x, KP587785252 * T2u);
T1r = T1n - T1q;
T1C = T1w + T1B;
T1N = T1H + T1M;
T1O = T1C + T1N;
T2p = FNMS(KP250000000, T1O, T1r);
T2q = KP559016994 * (T1C - T1N);
}
ri[WS(rs, 5)] = T1r + T1O;
T2z = T2q + T2p;
ri[WS(rs, 14)] = T2z - T2A;
ri[WS(rs, 11)] = T2z + T2A;
T2r = T2p - T2q;
ri[WS(rs, 2)] = T2r - T2y;
ri[WS(rs, 8)] = T2r + T2y;
}
{
/* Imaginary parts of outputs 2, 5, 8, 11, 14. */
E T3h, T3q, T3i, T3l, T3m, T3n, T3p, T3o;
{
E T3f, T3g, T3j, T3k;
T3f = T1H - T1M;
T3g = T1w - T1B;
T3h = FNMS(KP587785252, T3g, KP951056516 * T3f);
T3q = FMA(KP951056516, T3g, KP587785252 * T3f);
T3i = T35 - T34;
T3j = T2v + T2w;
T3k = T2s + T2t;
T3l = T3j + T3k;
T3m = FNMS(KP250000000, T3l, T3i);
T3n = KP559016994 * (T3j - T3k);
}
ii[WS(rs, 5)] = T3l + T3i;
T3p = T3n + T3m;
ii[WS(rs, 11)] = T3p - T3q;
ii[WS(rs, 14)] = T3q + T3p;
T3o = T3m - T3n;
ii[WS(rs, 2)] = T3h + T3o;
ii[WS(rs, 8)] = T3o - T3h;
}
{
/* Imaginary parts of outputs 1, 4, 7, 10, 13, from the "+ KP866025403"
 * halves. */
E T3c, T3d, T36, T37, T33, T38, T3e, T39;
{
E T3a, T3b, T31, T32;
T3a = T1Q - T1R;
T3b = T1T - T1U;
T3c = FMA(KP951056516, T3a, KP587785252 * T3b);
T3d = FNMS(KP587785252, T3a, KP951056516 * T3b);
T36 = T34 + T35;
T31 = T24 + T29;
T32 = T2f + T2k;
T37 = T31 + T32;
T33 = KP559016994 * (T31 - T32);
T38 = FNMS(KP250000000, T37, T36);
}
ii[WS(rs, 10)] = T37 + T36;
T3e = T38 - T33;
ii[WS(rs, 7)] = T3d + T3e;
ii[WS(rs, 13)] = T3e - T3d;
T39 = T33 + T38;
ii[WS(rs, 1)] = T39 - T3c;
ii[WS(rs, 4)] = T3c + T39;
}
{
/* Real parts of outputs 1, 4, 7, 10, 13. */
E T2m, T2o, T1P, T1W, T1X, T1Y, T2n, T1Z;
{
E T2a, T2l, T1S, T1V;
T2a = T24 - T29;
T2l = T2f - T2k;
T2m = FMA(KP951056516, T2a, KP587785252 * T2l);
T2o = FNMS(KP587785252, T2a, KP951056516 * T2l);
T1P = T1n + T1q;
T1S = T1Q + T1R;
T1V = T1T + T1U;
T1W = T1S + T1V;
T1X = KP559016994 * (T1S - T1V);
T1Y = FNMS(KP250000000, T1W, T1P);
}
ri[WS(rs, 10)] = T1P + T1W;
T2n = T1Y - T1X;
ri[WS(rs, 7)] = T2n - T2o;
ri[WS(rs, 13)] = T2n + T2o;
T1Z = T1X + T1Y;
ri[WS(rs, 4)] = T1Z - T2m;
ri[WS(rs, 1)] = T1Z + T2m;
}
}
}
}
/* Twiddle-instruction table referenced by `desc` below.  The meaning of
 * TW_FULL / TW_NEXT and of their operands is defined outside this file.
 * NOTE(review): presumably "a full twiddle set for size 15, then step to the
 * next element" -- confirm against the tw_instr definition. */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 15 },
{ TW_NEXT, 1, 0 }
};
/* Descriptor passed to the planner together with t1_15.  The tuple
 * { 128, 56, 56, 0 } repeats the "128 additions, 56 multiplications, 56 fused
 * multiply/add" figures from the generated comment above this variant.
 * NOTE(review): the leading 15 is presumably the radix; the ct_desc layout
 * is not visible here -- confirm. */
static const ct_desc desc = { 15, "t1_15", twinstr, &GENUS, { 128, 56, 56, 0 }, 0, 0, 0 };
/* Registration entry point: hands the t1_15 kernel and its descriptor to the
 * planner `p` through kdft_dit_register. */
void X(codelet_t1_15) (planner *p) {
X(kdft_dit_register) (p, t1_15, &desc);
}
#endif

View File

@@ -0,0 +1,796 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:28 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
/*
* This function contains 174 FP additions, 100 FP multiplications,
* (or, 104 additions, 30 multiplications, 70 fused multiply/add),
* 60 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_16 (FMA-oriented variant): in-place size-16 twiddle step.
 *
 * For every m in [mb, me) the sixteen complex points
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..15, are loaded, point k (k >= 1)
 * is combined with the twiddle pair (W[2k-2], W[2k-1]), and sixteen results
 * are stored back to the same locations.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), each twiddle step computes
 * (re + i*im) * (W[2k-2] - i*W[2k-1]).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; 30 are used per iteration, starting at
 *           offset mb * 30
 *   rs      stride between the sixteen points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.9238... = cos(pi/8), 0.4142... = tan(pi/8), 0.7071... = 1/sqrt(2) */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Values that survive the load stage and feed the output stages. */
E T8, T3z, T1I, T3o, T1s, T35, T2o, T2r, T1F, T36, T2p, T2w, Tl, T3A, T1N;
E T3k, Tz, T2V, T1T, T1U, T11, T30, T29, T2c, T1e, T31, T2a, T2h, TM, T2W;
E T1W, T21;
/* Point 0 (no twiddle) and point 8 (W[14], W[15]): sums T8/T3o, differences T1I/T3z. */
{
E T1, T3n, T3, T6, T4, T3l, T2, T7, T3m, T5;
T1 = ri[0];
T3n = ii[0];
T3 = ri[WS(rs, 8)];
T6 = ii[WS(rs, 8)];
T2 = W[14];
T4 = T2 * T3;
T3l = T2 * T6;
T5 = W[15];
T7 = FMA(T5, T6, T4);
T3m = FNMS(T5, T3, T3l);
T8 = T1 + T7;
T3z = T3n - T3m;
T1I = T1 - T7;
T3o = T3m + T3n;
}
/* Points 15 (W[28], W[29]) and 7 (W[12], W[13]): sums T1s/T35, differences T2r/T2o. */
{
E T1h, T1k, T1i, T2k, T1n, T1q, T1o, T2m, T1g, T1m;
T1h = ri[WS(rs, 15)];
T1k = ii[WS(rs, 15)];
T1g = W[28];
T1i = T1g * T1h;
T2k = T1g * T1k;
T1n = ri[WS(rs, 7)];
T1q = ii[WS(rs, 7)];
T1m = W[12];
T1o = T1m * T1n;
T2m = T1m * T1q;
{
E T1l, T2l, T1r, T2n, T1j, T1p;
T1j = W[29];
T1l = FMA(T1j, T1k, T1i);
T2l = FNMS(T1j, T1h, T2k);
T1p = W[13];
T1r = FMA(T1p, T1q, T1o);
T2n = FNMS(T1p, T1n, T2m);
T1s = T1l + T1r;
T35 = T2l + T2n;
T2o = T2l - T2n;
T2r = T1l - T1r;
}
}
/* Points 3 (W[4], W[5]) and 11 (W[20], W[21]): sums T1F/T36, differences T2p/T2w. */
{
E T1u, T1x, T1v, T2s, T1A, T1D, T1B, T2u, T1t, T1z;
T1u = ri[WS(rs, 3)];
T1x = ii[WS(rs, 3)];
T1t = W[4];
T1v = T1t * T1u;
T2s = T1t * T1x;
T1A = ri[WS(rs, 11)];
T1D = ii[WS(rs, 11)];
T1z = W[20];
T1B = T1z * T1A;
T2u = T1z * T1D;
{
E T1y, T2t, T1E, T2v, T1w, T1C;
T1w = W[5];
T1y = FMA(T1w, T1x, T1v);
T2t = FNMS(T1w, T1u, T2s);
T1C = W[21];
T1E = FMA(T1C, T1D, T1B);
T2v = FNMS(T1C, T1A, T2u);
T1F = T1y + T1E;
T36 = T2t + T2v;
T2p = T1y - T1E;
T2w = T2t - T2v;
}
}
/* Points 4 (W[6], W[7]) and 12 (W[22], W[23]): sums Tl/T3k, differences T3A/T1N. */
{
E Ta, Td, Tb, T1J, Tg, Tj, Th, T1L, T9, Tf;
Ta = ri[WS(rs, 4)];
Td = ii[WS(rs, 4)];
T9 = W[6];
Tb = T9 * Ta;
T1J = T9 * Td;
Tg = ri[WS(rs, 12)];
Tj = ii[WS(rs, 12)];
Tf = W[22];
Th = Tf * Tg;
T1L = Tf * Tj;
{
E Te, T1K, Tk, T1M, Tc, Ti;
Tc = W[7];
Te = FMA(Tc, Td, Tb);
T1K = FNMS(Tc, Ta, T1J);
Ti = W[23];
Tk = FMA(Ti, Tj, Th);
T1M = FNMS(Ti, Tg, T1L);
Tl = Te + Tk;
T3A = Te - Tk;
T1N = T1K - T1M;
T3k = T1K + T1M;
}
}
/* Points 2 (W[2], W[3]) and 10 (W[18], W[19]): sums Tz/T2V, differences T1U/T1T. */
{
E To, Tr, Tp, T1P, Tu, Tx, Tv, T1R, Tn, Tt;
To = ri[WS(rs, 2)];
Tr = ii[WS(rs, 2)];
Tn = W[2];
Tp = Tn * To;
T1P = Tn * Tr;
Tu = ri[WS(rs, 10)];
Tx = ii[WS(rs, 10)];
Tt = W[18];
Tv = Tt * Tu;
T1R = Tt * Tx;
{
E Ts, T1Q, Ty, T1S, Tq, Tw;
Tq = W[3];
Ts = FMA(Tq, Tr, Tp);
T1Q = FNMS(Tq, To, T1P);
Tw = W[19];
Ty = FMA(Tw, Tx, Tv);
T1S = FNMS(Tw, Tu, T1R);
Tz = Ts + Ty;
T2V = T1Q + T1S;
T1T = T1Q - T1S;
T1U = Ts - Ty;
}
}
/* Points 1 (W[0], W[1]) and 9 (W[16], W[17]): sums T11/T30, differences T2c/T29. */
{
E TQ, TT, TR, T25, TW, TZ, TX, T27, TP, TV;
TQ = ri[WS(rs, 1)];
TT = ii[WS(rs, 1)];
TP = W[0];
TR = TP * TQ;
T25 = TP * TT;
TW = ri[WS(rs, 9)];
TZ = ii[WS(rs, 9)];
TV = W[16];
TX = TV * TW;
T27 = TV * TZ;
{
E TU, T26, T10, T28, TS, TY;
TS = W[1];
TU = FMA(TS, TT, TR);
T26 = FNMS(TS, TQ, T25);
TY = W[17];
T10 = FMA(TY, TZ, TX);
T28 = FNMS(TY, TW, T27);
T11 = TU + T10;
T30 = T26 + T28;
T29 = T26 - T28;
T2c = TU - T10;
}
}
/* Points 5 (W[8], W[9]) and 13 (W[24], W[25]): sums T1e/T31, differences T2a/T2h. */
{
E T13, T16, T14, T2d, T19, T1c, T1a, T2f, T12, T18;
T13 = ri[WS(rs, 5)];
T16 = ii[WS(rs, 5)];
T12 = W[8];
T14 = T12 * T13;
T2d = T12 * T16;
T19 = ri[WS(rs, 13)];
T1c = ii[WS(rs, 13)];
T18 = W[24];
T1a = T18 * T19;
T2f = T18 * T1c;
{
E T17, T2e, T1d, T2g, T15, T1b;
T15 = W[9];
T17 = FMA(T15, T16, T14);
T2e = FNMS(T15, T13, T2d);
T1b = W[25];
T1d = FMA(T1b, T1c, T1a);
T2g = FNMS(T1b, T19, T2f);
T1e = T17 + T1d;
T31 = T2e + T2g;
T2a = T17 - T1d;
T2h = T2e - T2g;
}
}
/* Points 14 (W[26], W[27]) and 6 (W[10], W[11]): sums TM/T2W, differences T1W/T21. */
{
E TB, TE, TC, T1X, TH, TK, TI, T1Z, TA, TG;
TB = ri[WS(rs, 14)];
TE = ii[WS(rs, 14)];
TA = W[26];
TC = TA * TB;
T1X = TA * TE;
TH = ri[WS(rs, 6)];
TK = ii[WS(rs, 6)];
TG = W[10];
TI = TG * TH;
T1Z = TG * TK;
{
E TF, T1Y, TL, T20, TD, TJ;
TD = W[27];
TF = FMA(TD, TE, TC);
T1Y = FNMS(TD, TB, T1X);
TJ = W[11];
TL = FMA(TJ, TK, TI);
T20 = FNMS(TJ, TH, T1Z);
TM = TF + TL;
T2W = T1Y + T20;
T1W = TF - TL;
T21 = T1Y - T20;
}
}
/* All loads are done; from here on only stores touch ri / ii. */
/* Outputs 0, 4, 8 and 12: built from the pairwise sums only, no constants needed. */
{
E TO, T3e, T3q, T3s, T1H, T3r, T3h, T3i;
{
E Tm, TN, T3j, T3p;
Tm = T8 + Tl;
TN = Tz + TM;
TO = Tm + TN;
T3e = Tm - TN;
T3j = T2V + T2W;
T3p = T3k + T3o;
T3q = T3j + T3p;
T3s = T3p - T3j;
}
{
E T1f, T1G, T3f, T3g;
T1f = T11 + T1e;
T1G = T1s + T1F;
T1H = T1f + T1G;
T3r = T1G - T1f;
T3f = T30 + T31;
T3g = T35 + T36;
T3h = T3f - T3g;
T3i = T3f + T3g;
}
ri[WS(rs, 8)] = TO - T1H;
ii[WS(rs, 8)] = T3q - T3i;
ri[0] = TO + T1H;
ii[0] = T3i + T3q;
ri[WS(rs, 12)] = T3e - T3h;
ii[WS(rs, 12)] = T3s - T3r;
ri[WS(rs, 4)] = T3e + T3h;
ii[WS(rs, 4)] = T3r + T3s;
}
/* Outputs 2, 6, 10 and 14: scaled by KP707106781. */
{
E T2Y, T3a, T3v, T3x, T33, T3b, T38, T3c;
{
E T2U, T2X, T3t, T3u;
T2U = T8 - Tl;
T2X = T2V - T2W;
T2Y = T2U + T2X;
T3a = T2U - T2X;
T3t = TM - Tz;
T3u = T3o - T3k;
T3v = T3t + T3u;
T3x = T3u - T3t;
}
{
E T2Z, T32, T34, T37;
T2Z = T11 - T1e;
T32 = T30 - T31;
T33 = T2Z + T32;
T3b = T32 - T2Z;
T34 = T1s - T1F;
T37 = T35 - T36;
T38 = T34 - T37;
T3c = T34 + T37;
}
{
E T39, T3w, T3d, T3y;
T39 = T33 + T38;
ri[WS(rs, 10)] = FNMS(KP707106781, T39, T2Y);
ri[WS(rs, 2)] = FMA(KP707106781, T39, T2Y);
T3w = T3b + T3c;
ii[WS(rs, 2)] = FMA(KP707106781, T3w, T3v);
ii[WS(rs, 10)] = FNMS(KP707106781, T3w, T3v);
T3d = T3b - T3c;
ri[WS(rs, 14)] = FNMS(KP707106781, T3d, T3a);
ri[WS(rs, 6)] = FMA(KP707106781, T3d, T3a);
T3y = T38 - T33;
ii[WS(rs, 6)] = FMA(KP707106781, T3y, T3x);
ii[WS(rs, 14)] = FNMS(KP707106781, T3y, T3x);
}
}
/* Odd outputs 1, 3, 5, ..., 15: built from the pairwise differences with all three constants. */
{
E T1O, T3B, T3H, T2E, T23, T3C, T2O, T2S, T2H, T3I, T2j, T2B, T2L, T2R, T2y;
E T2C;
{
E T1V, T22, T2b, T2i;
T1O = T1I - T1N;
T3B = T3z - T3A;
T3H = T3A + T3z;
T2E = T1I + T1N;
T1V = T1T - T1U;
T22 = T1W + T21;
T23 = T1V - T22;
T3C = T1V + T22;
{
E T2M, T2N, T2F, T2G;
T2M = T2r + T2w;
T2N = T2o - T2p;
T2O = FNMS(KP414213562, T2N, T2M);
T2S = FMA(KP414213562, T2M, T2N);
T2F = T1U + T1T;
T2G = T1W - T21;
T2H = T2F + T2G;
T3I = T2G - T2F;
}
T2b = T29 + T2a;
T2i = T2c - T2h;
T2j = FMA(KP414213562, T2i, T2b);
T2B = FNMS(KP414213562, T2b, T2i);
{
E T2J, T2K, T2q, T2x;
T2J = T2c + T2h;
T2K = T29 - T2a;
T2L = FMA(KP414213562, T2K, T2J);
T2R = FNMS(KP414213562, T2J, T2K);
T2q = T2o + T2p;
T2x = T2r - T2w;
T2y = FNMS(KP414213562, T2x, T2q);
T2C = FMA(KP414213562, T2q, T2x);
}
}
/* Outputs 3 and 11. */
{
E T24, T2z, T3J, T3K;
T24 = FMA(KP707106781, T23, T1O);
T2z = T2j - T2y;
ri[WS(rs, 11)] = FNMS(KP923879532, T2z, T24);
ri[WS(rs, 3)] = FMA(KP923879532, T2z, T24);
T3J = FMA(KP707106781, T3I, T3H);
T3K = T2C - T2B;
ii[WS(rs, 3)] = FMA(KP923879532, T3K, T3J);
ii[WS(rs, 11)] = FNMS(KP923879532, T3K, T3J);
}
/* Outputs 7 and 15. */
{
E T2A, T2D, T3L, T3M;
T2A = FNMS(KP707106781, T23, T1O);
T2D = T2B + T2C;
ri[WS(rs, 7)] = FNMS(KP923879532, T2D, T2A);
ri[WS(rs, 15)] = FMA(KP923879532, T2D, T2A);
T3L = FNMS(KP707106781, T3I, T3H);
T3M = T2j + T2y;
ii[WS(rs, 7)] = FNMS(KP923879532, T3M, T3L);
ii[WS(rs, 15)] = FMA(KP923879532, T3M, T3L);
}
/* Outputs 1 and 9. */
{
E T2I, T2P, T3D, T3E;
T2I = FMA(KP707106781, T2H, T2E);
T2P = T2L + T2O;
ri[WS(rs, 9)] = FNMS(KP923879532, T2P, T2I);
ri[WS(rs, 1)] = FMA(KP923879532, T2P, T2I);
T3D = FMA(KP707106781, T3C, T3B);
T3E = T2R + T2S;
ii[WS(rs, 1)] = FMA(KP923879532, T3E, T3D);
ii[WS(rs, 9)] = FNMS(KP923879532, T3E, T3D);
}
/* Outputs 5 and 13. */
{
E T2Q, T2T, T3F, T3G;
T2Q = FNMS(KP707106781, T2H, T2E);
T2T = T2R - T2S;
ri[WS(rs, 13)] = FNMS(KP923879532, T2T, T2Q);
ri[WS(rs, 5)] = FMA(KP923879532, T2T, T2Q);
T3F = FNMS(KP707106781, T3C, T3B);
T3G = T2O - T2L;
ii[WS(rs, 5)] = FMA(KP923879532, T3G, T3F);
ii[WS(rs, 13)] = FNMS(KP923879532, T3G, T3F);
}
}
}
}
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-16 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 16, name "t1_16", the twiddle description above
 * and &GENUS.  { 104, 30, 70, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 104, 30, 70, 0 }, 0, 0, 0 };
/* Registers the FMA variant of t1_16 together with its descriptor on planner p. */
void X(codelet_t1_16) (planner *p) {
X(kdft_dit_register) (p, t1_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 16 -name t1_16 -include dft/scalar/t.h */
/*
* This function contains 174 FP additions, 84 FP multiplications,
* (or, 136 additions, 46 multiplications, 38 fused multiply/add),
* 52 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_16 (non-FMA variant): in-place size-16 twiddle step.
 *
 * For every m in [mb, me) the sixteen complex points
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..15, are loaded, point k (k >= 1)
 * is combined with the twiddle pair (W[2k-2], W[2k-1]), and sixteen results
 * are stored back to the same locations.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), each twiddle step computes
 * (re + i*im) * (W[2k-2] - i*W[2k-1]).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; 30 are used per iteration, starting at
 *           offset mb * 30
 *   rs      stride between the sixteen points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.3826... = sin(pi/8), 0.9238... = cos(pi/8), 0.7071... = 1/sqrt(2) */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + (mb * 30); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 30, MAKE_VOLATILE_STRIDE(32, rs)) {
/* Values that survive the load stage and feed the output stages. */
E T7, T37, T1t, T2U, Ti, T38, T1w, T2R, Tu, T2s, T1C, T2c, TF, T2t, T1H;
E T2d, T1f, T1q, T2B, T2C, T2D, T2E, T1Z, T2j, T24, T2k, TS, T13, T2w, T2x;
E T2y, T2z, T1O, T2g, T1T, T2h;
/* Point 0 (no twiddle) and point 8 (W[14], W[15]): sums T7/T2U, differences T1t/T37. */
{
E T1, T2T, T6, T2S;
T1 = ri[0];
T2T = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 8)];
T5 = ii[WS(rs, 8)];
T2 = W[14];
T4 = W[15];
T6 = FMA(T2, T3, T4 * T5);
T2S = FNMS(T4, T3, T2 * T5);
}
T7 = T1 + T6;
T37 = T2T - T2S;
T1t = T1 - T6;
T2U = T2S + T2T;
}
/* Points 4 (W[6], W[7]) and 12 (W[22], W[23]): sums Ti/T2R, differences T38/T1w. */
{
E Tc, T1u, Th, T1v;
{
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 4)];
Tb = ii[WS(rs, 4)];
T8 = W[6];
Ta = W[7];
Tc = FMA(T8, T9, Ta * Tb);
T1u = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 12)];
Tg = ii[WS(rs, 12)];
Td = W[22];
Tf = W[23];
Th = FMA(Td, Te, Tf * Tg);
T1v = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc + Th;
T38 = Tc - Th;
T1w = T1u - T1v;
T2R = T1u + T1v;
}
/* Points 2 (W[2], W[3]) and 10 (W[18], W[19]): sums Tu/T2s, difference mixes T1C/T2c. */
{
E To, T1y, Tt, T1z, T1A, T1B;
{
E Tl, Tn, Tk, Tm;
Tl = ri[WS(rs, 2)];
Tn = ii[WS(rs, 2)];
Tk = W[2];
Tm = W[3];
To = FMA(Tk, Tl, Tm * Tn);
T1y = FNMS(Tm, Tl, Tk * Tn);
}
{
E Tq, Ts, Tp, Tr;
Tq = ri[WS(rs, 10)];
Ts = ii[WS(rs, 10)];
Tp = W[18];
Tr = W[19];
Tt = FMA(Tp, Tq, Tr * Ts);
T1z = FNMS(Tr, Tq, Tp * Ts);
}
Tu = To + Tt;
T2s = T1y + T1z;
T1A = T1y - T1z;
T1B = To - Tt;
T1C = T1A - T1B;
T2c = T1B + T1A;
}
/* Points 14 (W[26], W[27]) and 6 (W[10], W[11]): sums TF/T2t, difference mixes T1H/T2d. */
{
E Tz, T1E, TE, T1F, T1D, T1G;
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 14)];
Ty = ii[WS(rs, 14)];
Tv = W[26];
Tx = W[27];
Tz = FMA(Tv, Tw, Tx * Ty);
T1E = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 6)];
TD = ii[WS(rs, 6)];
TA = W[10];
TC = W[11];
TE = FMA(TA, TB, TC * TD);
T1F = FNMS(TC, TB, TA * TD);
}
TF = Tz + TE;
T2t = T1E + T1F;
T1D = Tz - TE;
T1G = T1E - T1F;
T1H = T1D + T1G;
T2d = T1D - T1G;
}
/* Points 15, 11, 7 and 3 (W[28..29], W[20..21], W[12..13], W[4..5]). */
{
E T19, T20, T1p, T1X, T1e, T21, T1k, T1W;
{
E T16, T18, T15, T17;
T16 = ri[WS(rs, 15)];
T18 = ii[WS(rs, 15)];
T15 = W[28];
T17 = W[29];
T19 = FMA(T15, T16, T17 * T18);
T20 = FNMS(T17, T16, T15 * T18);
}
{
E T1m, T1o, T1l, T1n;
T1m = ri[WS(rs, 11)];
T1o = ii[WS(rs, 11)];
T1l = W[20];
T1n = W[21];
T1p = FMA(T1l, T1m, T1n * T1o);
T1X = FNMS(T1n, T1m, T1l * T1o);
}
{
E T1b, T1d, T1a, T1c;
T1b = ri[WS(rs, 7)];
T1d = ii[WS(rs, 7)];
T1a = W[12];
T1c = W[13];
T1e = FMA(T1a, T1b, T1c * T1d);
T21 = FNMS(T1c, T1b, T1a * T1d);
}
{
E T1h, T1j, T1g, T1i;
T1h = ri[WS(rs, 3)];
T1j = ii[WS(rs, 3)];
T1g = W[4];
T1i = W[5];
T1k = FMA(T1g, T1h, T1i * T1j);
T1W = FNMS(T1i, T1h, T1g * T1j);
}
T1f = T19 + T1e;
T1q = T1k + T1p;
T2B = T1f - T1q;
T2C = T20 + T21;
T2D = T1W + T1X;
T2E = T2C - T2D;
{
E T1V, T1Y, T22, T23;
T1V = T19 - T1e;
T1Y = T1W - T1X;
T1Z = T1V - T1Y;
T2j = T1V + T1Y;
T22 = T20 - T21;
T23 = T1k - T1p;
T24 = T22 + T23;
T2k = T22 - T23;
}
}
/* Points 1, 13, 9 and 5 (W[0..1], W[24..25], W[16..17], W[8..9]). */
{
E TM, T1K, T12, T1R, TR, T1L, TX, T1Q;
{
E TJ, TL, TI, TK;
TJ = ri[WS(rs, 1)];
TL = ii[WS(rs, 1)];
TI = W[0];
TK = W[1];
TM = FMA(TI, TJ, TK * TL);
T1K = FNMS(TK, TJ, TI * TL);
}
{
E TZ, T11, TY, T10;
TZ = ri[WS(rs, 13)];
T11 = ii[WS(rs, 13)];
TY = W[24];
T10 = W[25];
T12 = FMA(TY, TZ, T10 * T11);
T1R = FNMS(T10, TZ, TY * T11);
}
{
E TO, TQ, TN, TP;
TO = ri[WS(rs, 9)];
TQ = ii[WS(rs, 9)];
TN = W[16];
TP = W[17];
TR = FMA(TN, TO, TP * TQ);
T1L = FNMS(TP, TO, TN * TQ);
}
{
E TU, TW, TT, TV;
TU = ri[WS(rs, 5)];
TW = ii[WS(rs, 5)];
TT = W[8];
TV = W[9];
TX = FMA(TT, TU, TV * TW);
T1Q = FNMS(TV, TU, TT * TW);
}
TS = TM + TR;
T13 = TX + T12;
T2w = TS - T13;
T2x = T1K + T1L;
T2y = T1Q + T1R;
T2z = T2x - T2y;
{
E T1M, T1N, T1P, T1S;
T1M = T1K - T1L;
T1N = TX - T12;
T1O = T1M + T1N;
T2g = T1M - T1N;
T1P = TM - TR;
T1S = T1Q - T1R;
T1T = T1P - T1S;
T2h = T1P + T1S;
}
}
/* All loads are done; from here on only stores touch ri / ii. */
/* Outputs 3, 7, 11 and 15. */
{
E T1J, T27, T3g, T3i, T26, T3h, T2a, T3d;
{
E T1x, T1I, T3e, T3f;
T1x = T1t - T1w;
T1I = KP707106781 * (T1C - T1H);
T1J = T1x + T1I;
T27 = T1x - T1I;
T3e = KP707106781 * (T2d - T2c);
T3f = T38 + T37;
T3g = T3e + T3f;
T3i = T3f - T3e;
}
{
E T1U, T25, T28, T29;
T1U = FMA(KP923879532, T1O, KP382683432 * T1T);
T25 = FNMS(KP923879532, T24, KP382683432 * T1Z);
T26 = T1U + T25;
T3h = T25 - T1U;
T28 = FNMS(KP923879532, T1T, KP382683432 * T1O);
T29 = FMA(KP382683432, T24, KP923879532 * T1Z);
T2a = T28 - T29;
T3d = T28 + T29;
}
ri[WS(rs, 11)] = T1J - T26;
ii[WS(rs, 11)] = T3g - T3d;
ri[WS(rs, 3)] = T1J + T26;
ii[WS(rs, 3)] = T3d + T3g;
ri[WS(rs, 15)] = T27 - T2a;
ii[WS(rs, 15)] = T3i - T3h;
ri[WS(rs, 7)] = T27 + T2a;
ii[WS(rs, 7)] = T3h + T3i;
}
/* Outputs 2, 6, 10 and 14. */
{
E T2v, T2H, T32, T34, T2G, T33, T2K, T2Z;
{
E T2r, T2u, T30, T31;
T2r = T7 - Ti;
T2u = T2s - T2t;
T2v = T2r + T2u;
T2H = T2r - T2u;
T30 = TF - Tu;
T31 = T2U - T2R;
T32 = T30 + T31;
T34 = T31 - T30;
}
{
E T2A, T2F, T2I, T2J;
T2A = T2w + T2z;
T2F = T2B - T2E;
T2G = KP707106781 * (T2A + T2F);
T33 = KP707106781 * (T2F - T2A);
T2I = T2z - T2w;
T2J = T2B + T2E;
T2K = KP707106781 * (T2I - T2J);
T2Z = KP707106781 * (T2I + T2J);
}
ri[WS(rs, 10)] = T2v - T2G;
ii[WS(rs, 10)] = T32 - T2Z;
ri[WS(rs, 2)] = T2v + T2G;
ii[WS(rs, 2)] = T2Z + T32;
ri[WS(rs, 14)] = T2H - T2K;
ii[WS(rs, 14)] = T34 - T33;
ri[WS(rs, 6)] = T2H + T2K;
ii[WS(rs, 6)] = T33 + T34;
}
/* Outputs 1, 5, 9 and 13. */
{
E T2f, T2n, T3a, T3c, T2m, T3b, T2q, T35;
{
E T2b, T2e, T36, T39;
T2b = T1t + T1w;
T2e = KP707106781 * (T2c + T2d);
T2f = T2b + T2e;
T2n = T2b - T2e;
T36 = KP707106781 * (T1C + T1H);
T39 = T37 - T38;
T3a = T36 + T39;
T3c = T39 - T36;
}
{
E T2i, T2l, T2o, T2p;
T2i = FMA(KP382683432, T2g, KP923879532 * T2h);
T2l = FNMS(KP382683432, T2k, KP923879532 * T2j);
T2m = T2i + T2l;
T3b = T2l - T2i;
T2o = FNMS(KP382683432, T2h, KP923879532 * T2g);
T2p = FMA(KP923879532, T2k, KP382683432 * T2j);
T2q = T2o - T2p;
T35 = T2o + T2p;
}
ri[WS(rs, 9)] = T2f - T2m;
ii[WS(rs, 9)] = T3a - T35;
ri[WS(rs, 1)] = T2f + T2m;
ii[WS(rs, 1)] = T35 + T3a;
ri[WS(rs, 13)] = T2n - T2q;
ii[WS(rs, 13)] = T3c - T3b;
ri[WS(rs, 5)] = T2n + T2q;
ii[WS(rs, 5)] = T3b + T3c;
}
/* Outputs 0, 4, 8 and 12: built from the pairwise sums only, no constants needed. */
{
E TH, T2L, T2W, T2Y, T1s, T2X, T2O, T2P;
{
E Tj, TG, T2Q, T2V;
Tj = T7 + Ti;
TG = Tu + TF;
TH = Tj + TG;
T2L = Tj - TG;
T2Q = T2s + T2t;
T2V = T2R + T2U;
T2W = T2Q + T2V;
T2Y = T2V - T2Q;
}
{
E T14, T1r, T2M, T2N;
T14 = TS + T13;
T1r = T1f + T1q;
T1s = T14 + T1r;
T2X = T1r - T14;
T2M = T2x + T2y;
T2N = T2C + T2D;
T2O = T2M - T2N;
T2P = T2M + T2N;
}
ri[WS(rs, 8)] = TH - T1s;
ii[WS(rs, 8)] = T2W - T2P;
ri[0] = TH + T1s;
ii[0] = T2P + T2W;
ri[WS(rs, 12)] = T2L - T2O;
ii[WS(rs, 12)] = T2Y - T2X;
ri[WS(rs, 4)] = T2L + T2O;
ii[WS(rs, 4)] = T2X + T2Y;
}
}
}
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-16 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 16 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 16, name "t1_16", the twiddle description above
 * and &GENUS.  { 136, 46, 38, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this non-FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 16, "t1_16", twinstr, &GENUS, { 136, 46, 38, 0 }, 0, 0, 0 };
/* Registers the non-FMA variant of t1_16 together with its descriptor on planner p. */
void X(codelet_t1_16) (planner *p) {
X(kdft_dit_register) (p, t1_16, &desc);
}
#endif

View File

@@ -0,0 +1,117 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include dft/scalar/t.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 11 stack variables, 0 constants, and 8 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_2 (FMA-oriented variant): in-place size-2 twiddle butterfly.
 *
 * For every m in [mb, me), point 1 is combined with the twiddle pair
 * (W[0], W[1]); the sum with point 0 is stored at index 0 and the
 * difference at index 1.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), the twiddle step computes
 * (re + i*im) * (W[0] - i*W[1]).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; two are used per iteration, starting at
 *           offset mb * 2
 *   rs      stride between the two points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += mb * 2;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
        /* Load both points first: the results overwrite the inputs. */
        E re0 = ri[0];
        E im0 = ii[0];
        E re1 = ri[WS(rs, 1)];
        E im1 = ii[WS(rs, 1)];

        /* The W[0] products are held in temporaries and then fed to FMA / FNMS. */
        E wr = W[0];
        E wr_re1 = wr * re1;
        E wr_im1 = wr * im1;
        E wi = W[1];
        E tw_re = FMA(wi, im1, wr_re1);
        E tw_im = FNMS(wi, re1, wr_im1);

        ri[WS(rs, 1)] = re0 - tw_re;
        ii[WS(rs, 1)] = im0 - tw_im;
        ri[0] = re0 + tw_re;
        ii[0] = tw_im + im0;
    }
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-2 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 2, name "t1_2", the twiddle description above
 * and &GENUS.  { 4, 2, 2, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, { 4, 2, 2, 0 }, 0, 0, 0 };
/* Registers the FMA variant of t1_2 together with its descriptor on planner p. */
void X(codelet_t1_2) (planner *p) {
X(kdft_dit_register) (p, t1_2, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 2 -name t1_2 -include dft/scalar/t.h */
/*
* This function contains 6 FP additions, 4 FP multiplications,
* (or, 4 additions, 2 multiplications, 2 fused multiply/add),
* 9 stack variables, 0 constants, and 8 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_2 (non-FMA variant): in-place size-2 twiddle butterfly.
 *
 * For every m in [mb, me), point 1 is combined with the twiddle pair
 * (W[0], W[1]); the sum with point 0 is stored at index 0 and the
 * difference at index 1.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), the twiddle step computes
 * (re + i*im) * (W[0] - i*W[1]).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; two are used per iteration, starting at
 *           offset mb * 2
 *   rs      stride between the two points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_2(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += mb * 2;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 2, MAKE_VOLATILE_STRIDE(4, rs)) {
        /* Load both points first: the results overwrite the inputs. */
        E re0 = ri[0];
        E im0 = ii[0];
        E re1 = ri[WS(rs, 1)];
        E im1 = ii[WS(rs, 1)];
        E wr = W[0];
        E wi = W[1];

        /* Point 1 combined with its twiddle pair. */
        E tw_re = FMA(wr, re1, wi * im1);
        E tw_im = FNMS(wi, re1, wr * im1);

        ri[WS(rs, 1)] = re0 - tw_re;
        ii[WS(rs, 1)] = im0 - tw_im;
        ri[0] = re0 + tw_re;
        ii[0] = tw_im + im0;
    }
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-2 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 2 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 2, name "t1_2", the twiddle description above
 * and &GENUS.  { 4, 2, 2, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this non-FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 2, "t1_2", twinstr, &GENUS, { 4, 2, 2, 0 }, 0, 0, 0 };
/* Registers the non-FMA variant of t1_2 together with its descriptor on planner p. */
void X(codelet_t1_2) (planner *p) {
X(kdft_dit_register) (p, t1_2, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,166 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include dft/scalar/t.h */
/*
* This function contains 16 FP additions, 14 FP multiplications,
* (or, 6 additions, 4 multiplications, 10 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_3 (FMA-oriented variant): in-place size-3 twiddle step.
 *
 * For every m in [mb, me), points 1 and 2 are combined with the twiddle
 * pairs (W[0], W[1]) and (W[2], W[3]); the three results are stored back
 * at indices 0, 1 and 2.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), each twiddle step computes
 * (re + i*im) * (Wre - i*Wim).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; four are used per iteration, starting at
 *           offset mb * 4
 *   rs      stride between the three points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    /* 0.8660... = sqrt(3)/2 */
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    {
        INT m;

        W += mb * 4;
        for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
            /* Point 0 is used as loaded. */
            E re0 = ri[0];
            E im0 = ii[0];

            /* Point 1 with (W[0], W[1]). */
            E x1_re = ri[WS(rs, 1)];
            E x1_im = ii[WS(rs, 1)];
            E w1_re = W[0];
            E p1_re = w1_re * x1_re;
            E p1_im = w1_re * x1_im;
            E w1_im = W[1];
            E a_re = FMA(w1_im, x1_im, p1_re);
            E a_im = FNMS(w1_im, x1_re, p1_im);

            /* Point 2 with (W[2], W[3]). */
            E x2_re = ri[WS(rs, 2)];
            E x2_im = ii[WS(rs, 2)];
            E w2_re = W[2];
            E p2_re = w2_re * x2_re;
            E p2_im = w2_re * x2_im;
            E w2_im = W[3];
            E b_re = FMA(w2_im, x2_im, p2_re);
            E b_im = FNMS(w2_im, x2_re, p2_im);

            /* Sums, differences and the "point 0 minus half the sum" terms. */
            E im_diff = a_im - b_im;
            E re_sum = a_re + b_re;
            E re_mid = FNMS(KP500000000, re_sum, re0);
            E re_diff = b_re - a_re;
            E im_sum = a_im + b_im;
            E im_mid = FNMS(KP500000000, im_sum, im0);

            /* Every load happened above, so storing in place is safe. */
            ri[0] = re0 + re_sum;
            ri[WS(rs, 1)] = FMA(KP866025403, im_diff, re_mid);
            ri[WS(rs, 2)] = FNMS(KP866025403, im_diff, re_mid);
            ii[0] = im_sum + im0;
            ii[WS(rs, 2)] = FNMS(KP866025403, re_diff, im_mid);
            ii[WS(rs, 1)] = FMA(KP866025403, re_diff, im_mid);
        }
    }
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-3 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 3, name "t1_3", the twiddle description above
 * and &GENUS.  { 6, 4, 10, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, { 6, 4, 10, 0 }, 0, 0, 0 };
/* Registers the FMA variant of t1_3 together with its descriptor on planner p. */
void X(codelet_t1_3) (planner *p) {
X(kdft_dit_register) (p, t1_3, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 3 -name t1_3 -include dft/scalar/t.h */
/*
* This function contains 16 FP additions, 12 FP multiplications,
* (or, 10 additions, 6 multiplications, 6 fused multiply/add),
* 15 stack variables, 2 constants, and 12 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_3 (non-FMA variant): in-place size-3 twiddle step.
 *
 * For every m in [mb, me), points 1 and 2 are combined with the twiddle
 * pairs (W[0], W[1]) and (W[2], W[3]); the three results are stored back
 * at indices 0, 1 and 2.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), each twiddle step computes
 * (re + i*im) * (Wre - i*Wim).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; four are used per iteration, starting at
 *           offset mb * 4
 *   rs      stride between the three points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    /* 0.8660... = sqrt(3)/2 */
    DK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DK(KP500000000, +0.500000000000000000000000000000000000000000000);
    {
        INT m;

        W += mb * 4;
        for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(6, rs)) {
            /* Point 0 is used as loaded. */
            E re0 = ri[0];
            E im0 = ii[0];

            /* Point 1 with (W[0], W[1]). */
            E x1_re = ri[WS(rs, 1)];
            E x1_im = ii[WS(rs, 1)];
            E w1_re = W[0];
            E w1_im = W[1];
            E a_re = FMA(w1_re, x1_re, w1_im * x1_im);
            E a_im = FNMS(w1_im, x1_re, w1_re * x1_im);

            /* Point 2 with (W[2], W[3]). */
            E x2_re = ri[WS(rs, 2)];
            E x2_im = ii[WS(rs, 2)];
            E w2_re = W[2];
            E w2_im = W[3];
            E b_re = FMA(w2_re, x2_re, w2_im * x2_im);
            E b_im = FNMS(w2_im, x2_re, w2_re * x2_im);

            /* Sums, scaled differences and the "point 0 minus half the sum" terms. */
            E re_sum = a_re + b_re;
            E im_sum = a_im + b_im;
            E re_mid = FNMS(KP500000000, re_sum, re0);
            E re_rot = KP866025403 * (a_im - b_im);
            E im_rot = KP866025403 * (b_re - a_re);
            E im_mid = FNMS(KP500000000, im_sum, im0);

            /* Every load happened above, so storing in place is safe. */
            ri[0] = re0 + re_sum;
            ii[0] = im_sum + im0;
            ri[WS(rs, 2)] = re_mid - re_rot;
            ri[WS(rs, 1)] = re_mid + re_rot;
            ii[WS(rs, 1)] = im_rot + im_mid;
            ii[WS(rs, 2)] = im_mid - im_rot;
        }
    }
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-3 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 3, name "t1_3", the twiddle description above
 * and &GENUS.  { 10, 6, 6, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this non-FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 3, "t1_3", twinstr, &GENUS, { 10, 6, 6, 0 }, 0, 0, 0 };
/* Registers the non-FMA variant of t1_3 together with its descriptor on planner p. */
void X(codelet_t1_3) (planner *p) {
X(kdft_dit_register) (p, t1_3, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,196 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include dft/scalar/t.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 15 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_4 (FMA-oriented variant): in-place size-4 twiddle step.
 *
 * For every m in [mb, me), points 1, 2 and 3 are combined with the twiddle
 * pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]); the four results are
 * then formed with additions and subtractions only and stored back at
 * indices 0..3.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), each twiddle step computes
 * (re + i*im) * (Wre - i*Wim).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; six are used per iteration, starting at
 *           offset mb * 6
 *   rs      stride between the four points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += mb * 6;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
        /* Point 0 is used as loaded. */
        E re0 = ri[0];
        E im0 = ii[0];

        /* Point 2 with (W[2], W[3]). */
        E x2_re = ri[WS(rs, 2)];
        E x2_im = ii[WS(rs, 2)];
        E w2_re = W[2];
        E p2_re = w2_re * x2_re;
        E p2_im = w2_re * x2_im;
        E w2_im = W[3];
        E c_re = FMA(w2_im, x2_im, p2_re);
        E c_im = FNMS(w2_im, x2_re, p2_im);

        /* Point 1 with (W[0], W[1]). */
        E x1_re = ri[WS(rs, 1)];
        E x1_im = ii[WS(rs, 1)];
        E w1_re = W[0];
        E p1_re = w1_re * x1_re;
        E p1_im = w1_re * x1_im;
        E w1_im = W[1];
        E b_re = FMA(w1_im, x1_im, p1_re);
        E b_im = FNMS(w1_im, x1_re, p1_im);

        /* Point 3 with (W[4], W[5]). */
        E x3_re = ri[WS(rs, 3)];
        E x3_im = ii[WS(rs, 3)];
        E w3_re = W[4];
        E p3_re = w3_re * x3_re;
        E p3_im = w3_re * x3_im;
        E w3_im = W[5];
        E d_re = FMA(w3_im, x3_im, p3_re);
        E d_im = FNMS(w3_im, x3_re, p3_im);

        /* Pairwise sums (feed outputs 0 and 2). */
        E s02_re = re0 + c_re;
        E s13_re = b_re + d_re;
        E s13_im = b_im + d_im;
        E s02_im = c_im + im0;

        /* Pairwise differences (feed outputs 1 and 3). */
        E d02_re = re0 - c_re;
        E d13_im = b_im - d_im;
        E d02_im = im0 - c_im;
        E d13_re = b_re - d_re;

        /* Every load happened above, so storing in place is safe. */
        ri[WS(rs, 2)] = s02_re - s13_re;
        ri[0] = s02_re + s13_re;
        ii[0] = s13_im + s02_im;
        ii[WS(rs, 2)] = s02_im - s13_im;
        ri[WS(rs, 3)] = d02_re - d13_im;
        ri[WS(rs, 1)] = d02_re + d13_im;
        ii[WS(rs, 1)] = d02_im - d13_re;
        ii[WS(rs, 3)] = d13_re + d02_im;
    }
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-4 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 4 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 4, name "t1_4", the twiddle description above
 * and &GENUS.  { 16, 6, 6, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, { 16, 6, 6, 0 }, 0, 0, 0 };
/* Registers the FMA variant of t1_4 together with its descriptor on planner p. */
void X(codelet_t1_4) (planner *p) {
X(kdft_dit_register) (p, t1_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 4 -name t1_4 -include dft/scalar/t.h */
/*
* This function contains 22 FP additions, 12 FP multiplications,
* (or, 16 additions, 6 multiplications, 6 fused multiply/add),
* 13 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_4 (non-FMA variant): in-place size-4 twiddle step.
 *
 * For every m in [mb, me), points 1, 2 and 3 are combined with the twiddle
 * pairs (W[0], W[1]), (W[2], W[3]) and (W[4], W[5]); the four results are
 * then formed with additions and subtractions only and stored back at
 * indices 0..3.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), each twiddle step computes
 * (re + i*im) * (Wre - i*Wim).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; six are used per iteration, starting at
 *           offset mb * 6
 *   rs      stride between the four points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    W += mb * 6;
    for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
        /* Point 0 is used as loaded. */
        E re0 = ri[0];
        E im0 = ii[0];

        /* Point 2 with (W[2], W[3]). */
        E x2_re = ri[WS(rs, 2)];
        E x2_im = ii[WS(rs, 2)];
        E w2_re = W[2];
        E w2_im = W[3];
        E c_re = FMA(w2_re, x2_re, w2_im * x2_im);
        E c_im = FNMS(w2_im, x2_re, w2_re * x2_im);

        /* Point 1 with (W[0], W[1]). */
        E x1_re = ri[WS(rs, 1)];
        E x1_im = ii[WS(rs, 1)];
        E w1_re = W[0];
        E w1_im = W[1];
        E b_re = FMA(w1_re, x1_re, w1_im * x1_im);
        E b_im = FNMS(w1_im, x1_re, w1_re * x1_im);

        /* Point 3 with (W[4], W[5]). */
        E x3_re = ri[WS(rs, 3)];
        E x3_im = ii[WS(rs, 3)];
        E w3_re = W[4];
        E w3_im = W[5];
        E d_re = FMA(w3_re, x3_re, w3_im * x3_im);
        E d_im = FNMS(w3_im, x3_re, w3_re * x3_im);

        /* Pairwise sums (feed outputs 0 and 2). */
        E s02_re = re0 + c_re;
        E s13_re = b_re + d_re;
        E s13_im = b_im + d_im;
        E s02_im = c_im + im0;

        /* Pairwise differences (feed outputs 1 and 3). */
        E d02_re = re0 - c_re;
        E d13_im = b_im - d_im;
        E d02_im = im0 - c_im;
        E d13_re = b_re - d_re;

        /* Every load happened above, so storing in place is safe. */
        ri[WS(rs, 2)] = s02_re - s13_re;
        ri[0] = s02_re + s13_re;
        ii[0] = s13_im + s02_im;
        ii[WS(rs, 2)] = s02_im - s13_im;
        ri[WS(rs, 3)] = d02_re - d13_im;
        ri[WS(rs, 1)] = d02_re + d13_im;
        ii[WS(rs, 1)] = d02_im - d13_re;
        ii[WS(rs, 3)] = d13_re + d02_im;
    }
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-4 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 4 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 4, name "t1_4", the twiddle description above
 * and &GENUS.  { 16, 6, 6, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this non-FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 4, "t1_4", twinstr, &GENUS, { 16, 6, 6, 0 }, 0, 0, 0 };
/* Registers the non-FMA variant of t1_4 together with its descriptor on planner p. */
void X(codelet_t1_4) (planner *p) {
X(kdft_dit_register) (p, t1_4, &desc);
}
#endif

View File

@@ -0,0 +1,253 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:26 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include dft/scalar/t.h */
/*
* This function contains 40 FP additions, 34 FP multiplications,
* (or, 14 additions, 8 multiplications, 26 fused multiply/add),
* 31 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_5 (FMA-oriented variant): in-place size-5 twiddle step.
 *
 * For every m in [mb, me) the five complex points
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..4, are loaded, point k (k >= 1)
 * is combined with the twiddle pair (W[2k-2], W[2k-1]), and five results
 * are stored back to the same locations.
 *
 * NOTE(review): assuming the usual definitions FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b (not visible here), each twiddle step computes
 * (re + i*im) * (W[2k-2] - i*W[2k-1]).
 *
 *   ri, ii  real / imaginary data; both advance by ms after each iteration
 *   W       twiddle values; eight are used per iteration, starting at
 *           offset mb * 8
 *   rs      stride between the five points, applied through WS()
 *   mb, me  half-open range of iterations
 *   ms      step between consecutive iterations
 */
static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.9510... = sin(2*pi/5), 0.5590... = sqrt(5)/4, 0.6180... = (sqrt(5)-1)/2 */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
/* Point 0 plus the twiddled points and their pairwise sums, kept for the output stages. */
E T1, TM, T7, Tx, Td, Tz, Te, TJ, Tk, TC, Tq, TE, Tr, TK;
T1 = ri[0];
TM = ii[0];
/* Points 1 (W[0], W[1]) and 4 (W[6], W[7]): T7/Tx and Td/Tz, sums Te/TJ. */
{
E T3, T6, T4, Tw, T9, Tc, Ta, Ty, T2, T8, T5, Tb;
T3 = ri[WS(rs, 1)];
T6 = ii[WS(rs, 1)];
T2 = W[0];
T4 = T2 * T3;
Tw = T2 * T6;
T9 = ri[WS(rs, 4)];
Tc = ii[WS(rs, 4)];
T8 = W[6];
Ta = T8 * T9;
Ty = T8 * Tc;
T5 = W[1];
T7 = FMA(T5, T6, T4);
Tx = FNMS(T5, T3, Tw);
Tb = W[7];
Td = FMA(Tb, Tc, Ta);
Tz = FNMS(Tb, T9, Ty);
Te = T7 + Td;
TJ = Tx + Tz;
}
/* Points 2 (W[2], W[3]) and 3 (W[4], W[5]): Tk/TC and Tq/TE, sums Tr/TK. */
{
E Tg, Tj, Th, TB, Tm, Tp, Tn, TD, Tf, Tl, Ti, To;
Tg = ri[WS(rs, 2)];
Tj = ii[WS(rs, 2)];
Tf = W[2];
Th = Tf * Tg;
TB = Tf * Tj;
Tm = ri[WS(rs, 3)];
Tp = ii[WS(rs, 3)];
Tl = W[4];
Tn = Tl * Tm;
TD = Tl * Tp;
Ti = W[3];
Tk = FMA(Ti, Tj, Th);
TC = FNMS(Ti, Tg, TB);
To = W[5];
Tq = FMA(To, Tp, Tn);
TE = FNMS(To, Tm, TD);
Tr = Tk + Tq;
TK = TC + TE;
}
/* All loads are done.  Real parts of outputs 0, 2, 3, 4 and 1. */
{
E Tu, Ts, Tt, TG, TI, TA, TF, TH, Tv;
Tu = Te - Tr;
Ts = Te + Tr;
Tt = FNMS(KP250000000, Ts, T1);
TA = Tx - Tz;
TF = TC - TE;
TG = FMA(KP618033988, TF, TA);
TI = FNMS(KP618033988, TA, TF);
ri[0] = T1 + Ts;
TH = FNMS(KP559016994, Tu, Tt);
ri[WS(rs, 2)] = FNMS(KP951056516, TI, TH);
ri[WS(rs, 3)] = FMA(KP951056516, TI, TH);
Tv = FMA(KP559016994, Tu, Tt);
ri[WS(rs, 4)] = FNMS(KP951056516, TG, Tv);
ri[WS(rs, 1)] = FMA(KP951056516, TG, Tv);
}
/* Imaginary parts of outputs 0, 2, 3, 1 and 4. */
{
E TO, TL, TN, TS, TU, TQ, TR, TT, TP;
TO = TJ - TK;
TL = TJ + TK;
TN = FNMS(KP250000000, TL, TM);
TQ = T7 - Td;
TR = Tk - Tq;
TS = FMA(KP618033988, TR, TQ);
TU = FNMS(KP618033988, TQ, TR);
ii[0] = TL + TM;
TT = FNMS(KP559016994, TO, TN);
ii[WS(rs, 2)] = FMA(KP951056516, TU, TT);
ii[WS(rs, 3)] = FNMS(KP951056516, TU, TT);
TP = FMA(KP559016994, TO, TN);
ii[WS(rs, 1)] = FNMS(KP951056516, TS, TP);
ii[WS(rs, 4)] = FMA(KP951056516, TS, TP);
}
}
}
}
/*
 * Twiddle-table description referenced by desc below.
 * NOTE(review): presumably TW_FULL asks for the complete twiddle set of a
 * size-5 step and TW_NEXT ends the list -- confirm against the tw_instr
 * definition, which is not visible here.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 5 },
{ TW_NEXT, 1, 0 }
};
/*
 * Codelet descriptor: size 5, name "t1_5", the twiddle description above
 * and &GENUS.  { 14, 8, 26, 0 } repeats the additions / multiplications /
 * fused multiply-add counts quoted in the generator comment of this FMA
 * variant; the meaning of the three trailing zeros is not visible here.
 */
static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, { 14, 8, 26, 0 }, 0, 0, 0 };
/* Registers the FMA variant of t1_5 together with its descriptor on planner p. */
void X(codelet_t1_5) (planner *p) {
X(kdft_dit_register) (p, t1_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 5 -name t1_5 -include dft/scalar/t.h */
/*
* This function contains 40 FP additions, 28 FP multiplications,
* (or, 26 additions, 14 multiplications, 14 fused multiply/add),
* 29 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_5: size-5 twiddle codelet, variant generated without -fma.
 *
 * For every m in [mb, me) the loop reads five complex values from the split
 * arrays ri (real parts) and ii (imaginary parts) at offsets 0 and
 * WS(rs, 1..4), scales inputs 1..4 by the twiddle pairs read from W,
 * combines them and writes the five results back to the same offsets.
 *
 * ri, ii  - data arrays; both are advanced by ms after each iteration
 * W       - twiddle table; 8 values (4 pairs) are consumed per iteration
 *           and the loop starts at W + mb * 8
 * rs      - stride given to WS() to address element k of one transform
 * mb, me  - half-open iteration range
 * ms      - step between consecutive transforms in ri / ii
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b. E, R, DK, WS and MAKE_VOLATILE_STRIDE are
 * defined outside this chunk -- confirm there.
 */
static void t1_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Named constants: 1/4, sqrt(5)/4, sin(pi/5), sin(2*pi/5). */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT m;
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(10, rs)) {
E T1, TE, Tu, Tx, TJ, TI, TB, TC, TD, Tc, Tn, To;
/* Input 0 is used as is (no twiddle). */
T1 = ri[0];
TE = ii[0];
{
E T6, Ts, Tm, Tw, Tb, Tt, Th, Tv;
{
/* Input 1 scaled by twiddle pair (W[0], W[1]): T6 real, Ts imaginary. */
E T3, T5, T2, T4;
T3 = ri[WS(rs, 1)];
T5 = ii[WS(rs, 1)];
T2 = W[0];
T4 = W[1];
T6 = FMA(T2, T3, T4 * T5);
Ts = FNMS(T4, T3, T2 * T5);
}
{
/* Input 3 scaled by twiddle pair (W[4], W[5]): Tm real, Tw imaginary. */
E Tj, Tl, Ti, Tk;
Tj = ri[WS(rs, 3)];
Tl = ii[WS(rs, 3)];
Ti = W[4];
Tk = W[5];
Tm = FMA(Ti, Tj, Tk * Tl);
Tw = FNMS(Tk, Tj, Ti * Tl);
}
{
/* Input 4 scaled by twiddle pair (W[6], W[7]): Tb real, Tt imaginary. */
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 4)];
Ta = ii[WS(rs, 4)];
T7 = W[6];
T9 = W[7];
Tb = FMA(T7, T8, T9 * Ta);
Tt = FNMS(T9, T8, T7 * Ta);
}
{
/* Input 2 scaled by twiddle pair (W[2], W[3]): Th real, Tv imaginary. */
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 2)];
Tg = ii[WS(rs, 2)];
Td = W[2];
Tf = W[3];
Th = FMA(Td, Te, Tf * Tg);
Tv = FNMS(Tf, Te, Td * Tg);
}
/* Differences and sums of the pairs (1, 4) and (2, 3). */
Tu = Ts - Tt;
Tx = Tv - Tw;
TJ = Th - Tm;
TI = T6 - Tb;
TB = Ts + Tt;
TC = Tv + Tw;
TD = TB + TC;
Tc = T6 + Tb;
Tn = Th + Tm;
To = Tc + Tn;
}
/* Output 0 is the plain sum of all five (twiddled) inputs. */
ri[0] = T1 + To;
ii[0] = TD + TE;
{
/* Real parts of outputs 1..4. */
E Ty, TA, Tr, Tz, Tp, Tq;
Ty = FMA(KP951056516, Tu, KP587785252 * Tx);
TA = FNMS(KP587785252, Tu, KP951056516 * Tx);
Tp = KP559016994 * (Tc - Tn);
Tq = FNMS(KP250000000, To, T1);
Tr = Tp + Tq;
Tz = Tq - Tp;
ri[WS(rs, 4)] = Tr - Ty;
ri[WS(rs, 3)] = Tz + TA;
ri[WS(rs, 1)] = Tr + Ty;
ri[WS(rs, 2)] = Tz - TA;
}
{
/* Imaginary parts of outputs 1..4. */
E TK, TL, TH, TM, TF, TG;
TK = FMA(KP951056516, TI, KP587785252 * TJ);
TL = FNMS(KP587785252, TI, KP951056516 * TJ);
TF = KP559016994 * (TB - TC);
TG = FNMS(KP250000000, TD, TE);
TH = TF + TG;
TM = TG - TF;
ii[WS(rs, 1)] = TH - TK;
ii[WS(rs, 3)] = TM - TL;
ii[WS(rs, 4)] = TK + TH;
ii[WS(rs, 2)] = TL + TM;
}
}
}
}
/*
 * Twiddle-instruction list handed to the planner together with the codelet.
 * NOTE(review): TW_FULL / TW_NEXT and tw_instr are defined outside this
 * chunk; presumably the first entry asks for the full twiddle set of a
 * size-5 transform and the second entry ends the list -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 5 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor of the non-FMA variant of t1_5: size 5, name "t1_5", the
 * twiddle list above and GENUS. The tuple { 26, 14, 14, 0 } repeats the
 * "26 additions, 14 multiplications, 14 fused multiply/add" figures from
 * the generator statistics comment of this variant.
 */
static const ct_desc desc = { 5, "t1_5", twinstr, &GENUS, { 26, 14, 14, 0 }, 0, 0, 0 };
/* Registers t1_5 and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t1_5) (planner *p) {
X(kdft_dit_register) (p, t1_5, &desc);
}
#endif

View File

@@ -0,0 +1,295 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include dft/scalar/t.h */
/*
* This function contains 46 FP additions, 32 FP multiplications,
* (or, 24 additions, 10 multiplications, 22 fused multiply/add),
* 31 stack variables, 2 constants, and 24 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_6: size-6 twiddle codelet, variant generated with -fma.
 *
 * For every m in [mb, me) the loop reads six complex values from the split
 * arrays ri (real parts) and ii (imaginary parts) at offsets 0 and
 * WS(rs, 1..5), scales inputs 1..5 by the twiddle pairs read from W,
 * combines them and writes the six results back to the same offsets.
 *
 * ri, ii  - data arrays; both are advanced by ms after each iteration
 * W       - twiddle table; 10 values (5 pairs) are consumed per iteration
 *           and the loop starts at W + mb * 10
 * rs      - stride given to WS() to address element k of one transform
 * mb, me  - half-open iteration range
 * ms      - step between consecutive transforms in ri / ii
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b. E, R, DK, WS and MAKE_VOLATILE_STRIDE are
 * defined outside this chunk -- confirm there.
 */
static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Named constants: sqrt(3)/2 and 1/2. */
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
E T1, TX, T7, TW, Tl, TR, TB, TJ, Ty, TS, TC, TO;
/* Input 0 is used as is (no twiddle). */
T1 = ri[0];
TX = ii[0];
{
/* Input 3 scaled by twiddle pair (W[4], W[5]): T7 real, TW imaginary. */
E T3, T6, T4, TV, T2, T5;
T3 = ri[WS(rs, 3)];
T6 = ii[WS(rs, 3)];
T2 = W[4];
T4 = T2 * T3;
TV = T2 * T6;
T5 = W[5];
T7 = FMA(T5, T6, T4);
TW = FNMS(T5, T3, TV);
}
{
/* Inputs 2 (W[2], W[3]) and 5 (W[8], W[9]), then their sum and difference. */
E Ta, Td, Tb, TF, Tg, Tj, Th, TH, T9, Tf;
Ta = ri[WS(rs, 2)];
Td = ii[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
TF = T9 * Td;
Tg = ri[WS(rs, 5)];
Tj = ii[WS(rs, 5)];
Tf = W[8];
Th = Tf * Tg;
TH = Tf * Tj;
{
E Te, TG, Tk, TI, Tc, Ti;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
TG = FNMS(Tc, Ta, TF);
Ti = W[9];
Tk = FMA(Ti, Tj, Th);
TI = FNMS(Ti, Tg, TH);
Tl = Te - Tk;
TR = TG + TI;
TB = Te + Tk;
TJ = TG - TI;
}
}
{
/* Inputs 4 (W[6], W[7]) and 1 (W[0], W[1]), then their sum and difference. */
E Tn, Tq, To, TK, Tt, Tw, Tu, TM, Tm, Ts;
Tn = ri[WS(rs, 4)];
Tq = ii[WS(rs, 4)];
Tm = W[6];
To = Tm * Tn;
TK = Tm * Tq;
Tt = ri[WS(rs, 1)];
Tw = ii[WS(rs, 1)];
Ts = W[0];
Tu = Ts * Tt;
TM = Ts * Tw;
{
E Tr, TL, Tx, TN, Tp, Tv;
Tp = W[7];
Tr = FMA(Tp, Tq, To);
TL = FNMS(Tp, Tn, TK);
Tv = W[1];
Tx = FMA(Tv, Tw, Tu);
TN = FNMS(Tv, Tt, TM);
Ty = Tr - Tx;
TS = TL + TN;
TC = Tr + Tx;
TO = TL - TN;
}
}
{
/* Real parts of the odd-numbered outputs 1, 3, 5. */
E TP, T8, Tz, TE;
TP = TJ - TO;
T8 = T1 - T7;
Tz = Tl + Ty;
TE = FNMS(KP500000000, Tz, T8);
ri[WS(rs, 3)] = T8 + Tz;
ri[WS(rs, 1)] = FMA(KP866025403, TP, TE);
ri[WS(rs, 5)] = FNMS(KP866025403, TP, TE);
}
{
/* Imaginary parts of the odd-numbered outputs 1, 3, 5. */
E T14, T11, T12, T13;
T14 = Ty - Tl;
T11 = TX - TW;
T12 = TJ + TO;
T13 = FNMS(KP500000000, T12, T11);
ii[WS(rs, 1)] = FMA(KP866025403, T14, T13);
ii[WS(rs, 3)] = T12 + T11;
ii[WS(rs, 5)] = FNMS(KP866025403, T14, T13);
}
{
/* Real parts of the even-numbered outputs 0, 2, 4. */
E TT, TA, TD, TQ;
TT = TR - TS;
TA = T1 + T7;
TD = TB + TC;
TQ = FNMS(KP500000000, TD, TA);
ri[0] = TA + TD;
ri[WS(rs, 4)] = FMA(KP866025403, TT, TQ);
ri[WS(rs, 2)] = FNMS(KP866025403, TT, TQ);
}
{
/* Imaginary parts of the even-numbered outputs 0, 2, 4. */
E T10, TU, TY, TZ;
T10 = TC - TB;
TU = TR + TS;
TY = TW + TX;
TZ = FNMS(KP500000000, TU, TY);
ii[0] = TU + TY;
ii[WS(rs, 4)] = FMA(KP866025403, T10, TZ);
ii[WS(rs, 2)] = FNMS(KP866025403, T10, TZ);
}
}
}
}
/*
 * Twiddle-instruction list handed to the planner together with the codelet.
 * NOTE(review): TW_FULL / TW_NEXT and tw_instr are defined outside this
 * chunk; presumably the first entry asks for the full twiddle set of a
 * size-6 transform and the second entry ends the list -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 6 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor of the FMA variant of t1_6: size 6, name "t1_6", the twiddle
 * list above and GENUS. The tuple { 24, 10, 22, 0 } repeats the
 * "24 additions, 10 multiplications, 22 fused multiply/add" figures from
 * the generator statistics comment of this variant.
 */
static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, { 24, 10, 22, 0 }, 0, 0, 0 };
/* Registers t1_6 and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t1_6) (planner *p) {
X(kdft_dit_register) (p, t1_6, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 6 -name t1_6 -include dft/scalar/t.h */
/*
* This function contains 46 FP additions, 28 FP multiplications,
* (or, 32 additions, 14 multiplications, 14 fused multiply/add),
* 23 stack variables, 2 constants, and 24 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_6: size-6 twiddle codelet, variant generated without -fma.
 *
 * For every m in [mb, me) the loop reads six complex values from the split
 * arrays ri (real parts) and ii (imaginary parts) at offsets 0 and
 * WS(rs, 1..5), scales inputs 1..5 by the twiddle pairs read from W,
 * combines them and writes the six results back to the same offsets.
 *
 * ri, ii  - data arrays; both are advanced by ms after each iteration
 * W       - twiddle table; 10 values (5 pairs) are consumed per iteration
 *           and the loop starts at W + mb * 10
 * rs      - stride given to WS() to address element k of one transform
 * mb, me  - half-open iteration range
 * ms      - step between consecutive transforms in ri / ii
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b. E, R, DK, WS and MAKE_VOLATILE_STRIDE are
 * defined outside this chunk -- confirm there.
 */
static void t1_6(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Named constants: 1/2 and sqrt(3)/2. */
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
for (m = mb, W = W + (mb * 10); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 10, MAKE_VOLATILE_STRIDE(12, rs)) {
E T7, TS, Tv, TO, Tt, TJ, Tx, TF, Ti, TI, Tw, TC;
{
/* Input 0 (untwiddled) combined with input 3 scaled by (W[4], W[5]). */
E T1, TN, T6, TM;
T1 = ri[0];
TN = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 3)];
T5 = ii[WS(rs, 3)];
T2 = W[4];
T4 = W[5];
T6 = FMA(T2, T3, T4 * T5);
TM = FNMS(T4, T3, T2 * T5);
}
T7 = T1 - T6;
TS = TN - TM;
Tv = T1 + T6;
TO = TM + TN;
}
{
/* Inputs 4 (W[6], W[7]) and 1 (W[0], W[1]), then their sum and difference. */
E Tn, TD, Ts, TE;
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 4)];
Tm = ii[WS(rs, 4)];
Tj = W[6];
Tl = W[7];
Tn = FMA(Tj, Tk, Tl * Tm);
TD = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 1)];
Tr = ii[WS(rs, 1)];
To = W[0];
Tq = W[1];
Ts = FMA(To, Tp, Tq * Tr);
TE = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn - Ts;
TJ = TD + TE;
Tx = Tn + Ts;
TF = TD - TE;
}
{
/* Inputs 2 (W[2], W[3]) and 5 (W[8], W[9]), then their sum and difference. */
E Tc, TA, Th, TB;
{
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 2)];
Tb = ii[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
TA = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 5)];
Tg = ii[WS(rs, 5)];
Td = W[8];
Tf = W[9];
Th = FMA(Td, Te, Tf * Tg);
TB = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc - Th;
TI = TA + TB;
Tw = Tc + Th;
TC = TA - TB;
}
{
/* Odd-numbered outputs 1, 3, 5: real parts first, then imaginary parts. */
E TG, Tu, Tz, TR, TT, TU;
TG = KP866025403 * (TC - TF);
Tu = Ti + Tt;
Tz = FNMS(KP500000000, Tu, T7);
ri[WS(rs, 3)] = T7 + Tu;
ri[WS(rs, 1)] = Tz + TG;
ri[WS(rs, 5)] = Tz - TG;
TR = KP866025403 * (Tt - Ti);
TT = TC + TF;
TU = FNMS(KP500000000, TT, TS);
ii[WS(rs, 1)] = TR + TU;
ii[WS(rs, 3)] = TT + TS;
ii[WS(rs, 5)] = TU - TR;
}
{
/* Even-numbered outputs 0, 2, 4: real parts first, then imaginary parts. */
E TK, Ty, TH, TQ, TL, TP;
TK = KP866025403 * (TI - TJ);
Ty = Tw + Tx;
TH = FNMS(KP500000000, Ty, Tv);
ri[0] = Tv + Ty;
ri[WS(rs, 4)] = TH + TK;
ri[WS(rs, 2)] = TH - TK;
TQ = KP866025403 * (Tx - Tw);
TL = TI + TJ;
TP = FNMS(KP500000000, TL, TO);
ii[0] = TL + TO;
ii[WS(rs, 4)] = TQ + TP;
ii[WS(rs, 2)] = TP - TQ;
}
}
}
}
/*
 * Twiddle-instruction list handed to the planner together with the codelet.
 * NOTE(review): TW_FULL / TW_NEXT and tw_instr are defined outside this
 * chunk; presumably the first entry asks for the full twiddle set of a
 * size-6 transform and the second entry ends the list -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 6 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor of the non-FMA variant of t1_6: size 6, name "t1_6", the
 * twiddle list above and GENUS. The tuple { 32, 14, 14, 0 } repeats the
 * "32 additions, 14 multiplications, 14 fused multiply/add" figures from
 * the generator statistics comment of this variant.
 */
static const ct_desc desc = { 6, "t1_6", twinstr, &GENUS, { 32, 14, 14, 0 }, 0, 0, 0 };
/* Registers t1_6 and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t1_6) (planner *p) {
X(kdft_dit_register) (p, t1_6, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,354 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
/*
* This function contains 72 FP additions, 66 FP multiplications,
* (or, 18 additions, 12 multiplications, 54 fused multiply/add),
* 37 stack variables, 6 constants, and 28 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_7: size-7 twiddle codelet, variant generated with -fma.
 *
 * For every m in [mb, me) the loop reads seven complex values from the
 * split arrays ri (real parts) and ii (imaginary parts) at offsets 0 and
 * WS(rs, 1..6), scales inputs 1..6 by the twiddle pairs read from W,
 * combines them and writes the seven results back to the same offsets.
 *
 * ri, ii  - data arrays; both are advanced by ms after each iteration
 * W       - twiddle table; 12 values (6 pairs) are consumed per iteration
 *           and the loop starts at W + mb * 12
 * rs      - stride given to WS() to address element k of one transform
 * mb, me  - half-open iteration range
 * ms      - step between consecutive transforms in ri / ii
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b. E, R, DK, WS and MAKE_VOLATILE_STRIDE are
 * defined outside this chunk -- confirm there.
 */
static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Generator-chosen coefficients used in the nested FMA chains below. */
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP801937735, +0.801937735804838252472204639014890102331838324);
DK(KP554958132, +0.554958132087371191422194871006410481067288862);
DK(KP692021471, +0.692021471630095869627814897002069140197260599);
DK(KP356895867, +0.356895867892209443894399510021300583399127187);
{
INT m;
for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
E T1, T1c, Te, T1h, TR, T19, Tr, T1g, TM, T1a, TE, T1i, TW, T1b;
/* Input 0 is used as is (no twiddle). */
T1 = ri[0];
T1c = ii[0];
{
/* Inputs 1 (W[0], W[1]) and 6 (W[10], W[11]), then their sums and differences. */
E T3, T6, T4, TN, T9, Tc, Ta, TP, T2, T8;
T3 = ri[WS(rs, 1)];
T6 = ii[WS(rs, 1)];
T2 = W[0];
T4 = T2 * T3;
TN = T2 * T6;
T9 = ri[WS(rs, 6)];
Tc = ii[WS(rs, 6)];
T8 = W[10];
Ta = T8 * T9;
TP = T8 * Tc;
{
E T7, TO, Td, TQ, T5, Tb;
T5 = W[1];
T7 = FMA(T5, T6, T4);
TO = FNMS(T5, T3, TN);
Tb = W[11];
Td = FMA(Tb, Tc, Ta);
TQ = FNMS(Tb, T9, TP);
Te = T7 + Td;
T1h = Td - T7;
TR = TO - TQ;
T19 = TO + TQ;
}
}
{
/* Inputs 2 (W[2], W[3]) and 5 (W[8], W[9]), then their sums and differences. */
E Tg, Tj, Th, TI, Tm, Tp, Tn, TK, Tf, Tl;
Tg = ri[WS(rs, 2)];
Tj = ii[WS(rs, 2)];
Tf = W[2];
Th = Tf * Tg;
TI = Tf * Tj;
Tm = ri[WS(rs, 5)];
Tp = ii[WS(rs, 5)];
Tl = W[8];
Tn = Tl * Tm;
TK = Tl * Tp;
{
E Tk, TJ, Tq, TL, Ti, To;
Ti = W[3];
Tk = FMA(Ti, Tj, Th);
TJ = FNMS(Ti, Tg, TI);
To = W[9];
Tq = FMA(To, Tp, Tn);
TL = FNMS(To, Tm, TK);
Tr = Tk + Tq;
T1g = Tq - Tk;
TM = TJ - TL;
T1a = TJ + TL;
}
}
{
/* Inputs 3 (W[4], W[5]) and 4 (W[6], W[7]), then their sums and differences. */
E Tt, Tw, Tu, TS, Tz, TC, TA, TU, Ts, Ty;
Tt = ri[WS(rs, 3)];
Tw = ii[WS(rs, 3)];
Ts = W[4];
Tu = Ts * Tt;
TS = Ts * Tw;
Tz = ri[WS(rs, 4)];
TC = ii[WS(rs, 4)];
Ty = W[6];
TA = Ty * Tz;
TU = Ty * TC;
{
E Tx, TT, TD, TV, Tv, TB;
Tv = W[5];
Tx = FMA(Tv, Tw, Tu);
TT = FNMS(Tv, Tt, TS);
TB = W[7];
TD = FMA(TB, TC, TA);
TV = FNMS(TB, Tz, TU);
TE = Tx + TD;
T1i = TD - Tx;
TW = TT - TV;
T1b = TT + TV;
}
}
/* Output 0 is the plain sum of all seven (twiddled) inputs. */
ri[0] = T1 + Te + Tr + TE;
ii[0] = T19 + T1a + T1b + T1c;
{
/* Real parts of outputs 1 and 6. */
E TG, TY, TF, TX, TH;
TF = FNMS(KP356895867, Tr, Te);
TG = FNMS(KP692021471, TF, TE);
TX = FMA(KP554958132, TW, TR);
TY = FMA(KP801937735, TX, TM);
TH = FNMS(KP900968867, TG, T1);
ri[WS(rs, 6)] = FNMS(KP974927912, TY, TH);
ri[WS(rs, 1)] = FMA(KP974927912, TY, TH);
}
{
/* Imaginary parts of outputs 1 and 6. */
E T1e, T1k, T1d, T1j, T1f;
T1d = FNMS(KP356895867, T1a, T19);
T1e = FNMS(KP692021471, T1d, T1b);
T1j = FMA(KP554958132, T1i, T1h);
T1k = FMA(KP801937735, T1j, T1g);
T1f = FNMS(KP900968867, T1e, T1c);
ii[WS(rs, 1)] = FMA(KP974927912, T1k, T1f);
ii[WS(rs, 6)] = FNMS(KP974927912, T1k, T1f);
}
{
/* Real parts of outputs 2 and 5. */
E T10, T13, TZ, T12, T11;
TZ = FNMS(KP356895867, Te, TE);
T10 = FNMS(KP692021471, TZ, Tr);
T12 = FMA(KP554958132, TM, TW);
T13 = FNMS(KP801937735, T12, TR);
T11 = FNMS(KP900968867, T10, T1);
ri[WS(rs, 5)] = FNMS(KP974927912, T13, T11);
ri[WS(rs, 2)] = FMA(KP974927912, T13, T11);
}
{
/* Imaginary parts of outputs 2 and 5. */
E T1m, T1p, T1l, T1o, T1n;
T1l = FNMS(KP356895867, T19, T1b);
T1m = FNMS(KP692021471, T1l, T1a);
T1o = FMA(KP554958132, T1g, T1i);
T1p = FNMS(KP801937735, T1o, T1h);
T1n = FNMS(KP900968867, T1m, T1c);
ii[WS(rs, 2)] = FMA(KP974927912, T1p, T1n);
ii[WS(rs, 5)] = FNMS(KP974927912, T1p, T1n);
}
{
/* Real parts of outputs 3 and 4. */
E T15, T18, T14, T17, T16;
T14 = FNMS(KP356895867, TE, Tr);
T15 = FNMS(KP692021471, T14, Te);
T17 = FNMS(KP554958132, TR, TM);
T18 = FNMS(KP801937735, T17, TW);
T16 = FNMS(KP900968867, T15, T1);
ri[WS(rs, 4)] = FNMS(KP974927912, T18, T16);
ri[WS(rs, 3)] = FMA(KP974927912, T18, T16);
}
{
/* Imaginary parts of outputs 3 and 4. */
E T1r, T1u, T1q, T1t, T1s;
T1q = FNMS(KP356895867, T1b, T1a);
T1r = FNMS(KP692021471, T1q, T19);
T1t = FNMS(KP554958132, T1h, T1g);
T1u = FNMS(KP801937735, T1t, T1i);
T1s = FNMS(KP900968867, T1r, T1c);
ii[WS(rs, 3)] = FMA(KP974927912, T1u, T1s);
ii[WS(rs, 4)] = FNMS(KP974927912, T1u, T1s);
}
}
}
}
/*
 * Twiddle-instruction list handed to the planner together with the codelet.
 * NOTE(review): TW_FULL / TW_NEXT and tw_instr are defined outside this
 * chunk; presumably the first entry asks for the full twiddle set of a
 * size-7 transform and the second entry ends the list -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor of the FMA variant of t1_7: size 7, name "t1_7", the twiddle
 * list above and GENUS. The tuple { 18, 12, 54, 0 } repeats the
 * "18 additions, 12 multiplications, 54 fused multiply/add" figures from
 * the generator statistics comment of this variant.
 */
static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, { 18, 12, 54, 0 }, 0, 0, 0 };
/* Registers t1_7 and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t1_7) (planner *p) {
X(kdft_dit_register) (p, t1_7, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 7 -name t1_7 -include dft/scalar/t.h */
/*
* This function contains 72 FP additions, 60 FP multiplications,
* (or, 36 additions, 24 multiplications, 36 fused multiply/add),
* 29 stack variables, 6 constants, and 28 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_7: size-7 twiddle codelet, variant generated without -fma.
 *
 * For every m in [mb, me) the loop reads seven complex values from the
 * split arrays ri (real parts) and ii (imaginary parts) at offsets 0 and
 * WS(rs, 1..6), scales inputs 1..6 by the twiddle pairs read from W,
 * combines them and writes the seven results back to the same offsets.
 *
 * ri, ii  - data arrays; both are advanced by ms after each iteration
 * W       - twiddle table; 12 values (6 pairs) are consumed per iteration
 *           and the loop starts at W + mb * 12
 * rs      - stride given to WS() to address element k of one transform
 * mb, me  - half-open iteration range
 * ms      - step between consecutive transforms in ri / ii
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b; FNMA presumably is -(a * b) - c. E, R, DK, WS
 * and MAKE_VOLATILE_STRIDE are defined outside this chunk -- confirm there.
 */
static void t1_7(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Named constants: absolute values of the sines and cosines of multiples of 2*pi/7. */
DK(KP222520933, +0.222520933956314404288902564496794759466355569);
DK(KP900968867, +0.900968867902419126236102319507445051165919162);
DK(KP623489801, +0.623489801858733530525004884004239810632274731);
DK(KP433883739, +0.433883739117558120475768332848358754609990728);
DK(KP781831482, +0.781831482468029808708444526674057750232334519);
DK(KP974927912, +0.974927912181823607018131682993931217232785801);
{
INT m;
for (m = mb, W = W + (mb * 12); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 12, MAKE_VOLATILE_STRIDE(14, rs)) {
E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ;
/* Input 0 is used as is (no twiddle). */
T1 = ri[0];
TR = ii[0];
{
/* Inputs 1 (W[0], W[1]) and 6 (W[10], W[11]), then their sums and differences. */
E T6, TA, Tb, TB;
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 1)];
T5 = ii[WS(rs, 1)];
T2 = W[0];
T4 = W[1];
T6 = FMA(T2, T3, T4 * T5);
TA = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 6)];
Ta = ii[WS(rs, 6)];
T7 = W[10];
T9 = W[11];
Tb = FMA(T7, T8, T9 * Ta);
TB = FNMS(T9, T8, T7 * Ta);
}
Tc = T6 + Tb;
TS = Tb - T6;
TC = TA - TB;
TO = TA + TB;
}
{
/* Inputs 2 (W[2], W[3]) and 5 (W[8], W[9]), then their sums and differences. */
E Th, TG, Tm, TH;
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 2)];
Tg = ii[WS(rs, 2)];
Td = W[2];
Tf = W[3];
Th = FMA(Td, Te, Tf * Tg);
TG = FNMS(Tf, Te, Td * Tg);
}
{
E Tj, Tl, Ti, Tk;
Tj = ri[WS(rs, 5)];
Tl = ii[WS(rs, 5)];
Ti = W[8];
Tk = W[9];
Tm = FMA(Ti, Tj, Tk * Tl);
TH = FNMS(Tk, Tj, Ti * Tl);
}
Tn = Th + Tm;
TT = Tm - Th;
TI = TG - TH;
TP = TG + TH;
}
{
/* Inputs 3 (W[4], W[5]) and 4 (W[6], W[7]), then their sums and differences. */
E Ts, TD, Tx, TE;
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 3)];
Tr = ii[WS(rs, 3)];
To = W[4];
Tq = W[5];
Ts = FMA(To, Tp, Tq * Tr);
TD = FNMS(Tq, Tp, To * Tr);
}
{
E Tu, Tw, Tt, Tv;
Tu = ri[WS(rs, 4)];
Tw = ii[WS(rs, 4)];
Tt = W[6];
Tv = W[7];
Tx = FMA(Tt, Tu, Tv * Tw);
TE = FNMS(Tv, Tu, Tt * Tw);
}
Ty = Ts + Tx;
TU = Tx - Ts;
TF = TD - TE;
TQ = TD + TE;
}
/* Output 0 is the plain sum of all seven (twiddled) inputs. */
ri[0] = T1 + Tc + Tn + Ty;
ii[0] = TO + TP + TQ + TR;
{
/* Outputs 2 and 5: real parts first, then imaginary parts. */
E TJ, Tz, TX, TY;
TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI);
Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc);
ri[WS(rs, 5)] = Tz - TJ;
ri[WS(rs, 2)] = Tz + TJ;
TX = FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT);
TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO);
ii[WS(rs, 2)] = TX + TY;
ii[WS(rs, 5)] = TY - TX;
}
{
/* Outputs 1 and 6: real parts first, then imaginary parts. */
E TL, TK, TV, TW;
TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF);
TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn);
ri[WS(rs, 6)] = TK - TL;
ri[WS(rs, 1)] = TK + TL;
TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU);
TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP);
ii[WS(rs, 1)] = TV + TW;
ii[WS(rs, 6)] = TW - TV;
}
{
/* Outputs 3 and 4: real parts first, then imaginary parts. */
E TN, TM, TZ, T10;
TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI);
TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc);
ri[WS(rs, 4)] = TM - TN;
ri[WS(rs, 3)] = TM + TN;
TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT);
T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO);
ii[WS(rs, 3)] = TZ + T10;
ii[WS(rs, 4)] = T10 - TZ;
}
}
}
}
/*
 * Twiddle-instruction list handed to the planner together with the codelet.
 * NOTE(review): TW_FULL / TW_NEXT and tw_instr are defined outside this
 * chunk; presumably the first entry asks for the full twiddle set of a
 * size-7 transform and the second entry ends the list -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor of the non-FMA variant of t1_7: size 7, name "t1_7", the
 * twiddle list above and GENUS. The tuple { 36, 24, 36, 0 } repeats the
 * "36 additions, 24 multiplications, 36 fused multiply/add" figures from
 * the generator statistics comment of this variant.
 */
static const ct_desc desc = { 7, "t1_7", twinstr, &GENUS, { 36, 24, 36, 0 }, 0, 0, 0 };
/* Registers t1_7 and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t1_7) (planner *p) {
X(kdft_dit_register) (p, t1_7, &desc);
}
#endif

View File

@@ -0,0 +1,376 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */
/*
* This function contains 66 FP additions, 36 FP multiplications,
* (or, 44 additions, 14 multiplications, 22 fused multiply/add),
* 34 stack variables, 1 constant, and 32 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_8: size-8 twiddle codelet, variant generated with -fma.
 *
 * For every m in [mb, me) the loop reads eight complex values from the
 * split arrays ri (real parts) and ii (imaginary parts) at offsets 0 and
 * WS(rs, 1..7), scales inputs 1..7 by the twiddle pairs read from W,
 * combines them and writes the eight results back to the same offsets.
 *
 * ri, ii  - data arrays; both are advanced by ms after each iteration
 * W       - twiddle table; 14 values (7 pairs) are consumed per iteration
 *           and the loop starts at W + mb * 14
 * rs      - stride given to WS() to address element k of one transform
 * mb, me  - half-open iteration range
 * ms      - step between consecutive transforms in ri / ii
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b. E, R, DK, WS and MAKE_VOLATILE_STRIDE are
 * defined outside this chunk -- confirm there.
 */
static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Named constant: sqrt(2)/2. */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
E T1, T1m, T7, T1l, Tk, TS, Te, TQ, TF, T14, TL, T16, T12, T17, Ts;
E TX, Ty, TZ, TV, T10;
/* Input 0 is used as is (no twiddle). */
T1 = ri[0];
T1m = ii[0];
{
/* Input 4 scaled by twiddle pair (W[6], W[7]): T7 real, T1l imaginary. */
E T3, T6, T4, T1k, T2, T5;
T3 = ri[WS(rs, 4)];
T6 = ii[WS(rs, 4)];
T2 = W[6];
T4 = T2 * T3;
T1k = T2 * T6;
T5 = W[7];
T7 = FMA(T5, T6, T4);
T1l = FNMS(T5, T3, T1k);
}
{
/* Input 6 scaled by twiddle pair (W[10], W[11]): Tk real, TS imaginary. */
E Tg, Tj, Th, TR, Tf, Ti;
Tg = ri[WS(rs, 6)];
Tj = ii[WS(rs, 6)];
Tf = W[10];
Th = Tf * Tg;
TR = Tf * Tj;
Ti = W[11];
Tk = FMA(Ti, Tj, Th);
TS = FNMS(Ti, Tg, TR);
}
{
/* Input 2 scaled by twiddle pair (W[2], W[3]): Te real, TQ imaginary. */
E Ta, Td, Tb, TP, T9, Tc;
Ta = ri[WS(rs, 2)];
Td = ii[WS(rs, 2)];
T9 = W[2];
Tb = T9 * Ta;
TP = T9 * Td;
Tc = W[3];
Te = FMA(Tc, Td, Tb);
TQ = FNMS(Tc, Ta, TP);
}
{
/* Inputs 7 (W[12], W[13]) and 3 (W[4], W[5]), then their differences. */
E TB, TE, TC, T13, TH, TK, TI, T15, TA, TG, TD, TJ;
TB = ri[WS(rs, 7)];
TE = ii[WS(rs, 7)];
TA = W[12];
TC = TA * TB;
T13 = TA * TE;
TH = ri[WS(rs, 3)];
TK = ii[WS(rs, 3)];
TG = W[4];
TI = TG * TH;
T15 = TG * TK;
TD = W[13];
TF = FMA(TD, TE, TC);
T14 = FNMS(TD, TB, T13);
TJ = W[5];
TL = FMA(TJ, TK, TI);
T16 = FNMS(TJ, TH, T15);
T12 = TF - TL;
T17 = T14 - T16;
}
{
/* Inputs 1 (W[0], W[1]) and 5 (W[8], W[9]), then their differences. */
E To, Tr, Tp, TW, Tu, Tx, Tv, TY, Tn, Tt, Tq, Tw;
To = ri[WS(rs, 1)];
Tr = ii[WS(rs, 1)];
Tn = W[0];
Tp = Tn * To;
TW = Tn * Tr;
Tu = ri[WS(rs, 5)];
Tx = ii[WS(rs, 5)];
Tt = W[8];
Tv = Tt * Tu;
TY = Tt * Tx;
Tq = W[1];
Ts = FMA(Tq, Tr, Tp);
TX = FNMS(Tq, To, TW);
Tw = W[9];
Ty = FMA(Tw, Tx, Tv);
TZ = FNMS(Tw, Tu, TY);
TV = Ts - Ty;
T10 = TX - TZ;
}
{
/* Odd-numbered outputs 1, 3, 5, 7, built from the pairwise differences. */
E TU, T1a, T1t, T1v, T19, T1w, T1d, T1u;
{
E TO, TT, T1r, T1s;
TO = T1 - T7;
TT = TQ - TS;
TU = TO + TT;
T1a = TO - TT;
T1r = T1m - T1l;
T1s = Te - Tk;
T1t = T1r - T1s;
T1v = T1s + T1r;
}
{
E T11, T18, T1b, T1c;
T11 = TV + T10;
T18 = T12 - T17;
T19 = T11 + T18;
T1w = T18 - T11;
T1b = T10 - TV;
T1c = T12 + T17;
T1d = T1b - T1c;
T1u = T1b + T1c;
}
ri[WS(rs, 5)] = FNMS(KP707106781, T19, TU);
ii[WS(rs, 5)] = FNMS(KP707106781, T1u, T1t);
ri[WS(rs, 1)] = FMA(KP707106781, T19, TU);
ii[WS(rs, 1)] = FMA(KP707106781, T1u, T1t);
ri[WS(rs, 7)] = FNMS(KP707106781, T1d, T1a);
ii[WS(rs, 7)] = FNMS(KP707106781, T1w, T1v);
ri[WS(rs, 3)] = FMA(KP707106781, T1d, T1a);
ii[WS(rs, 3)] = FMA(KP707106781, T1w, T1v);
}
{
/* Even-numbered outputs 0, 2, 4, 6, built from the pairwise sums. */
E Tm, T1e, T1o, T1q, TN, T1p, T1h, T1i;
{
E T8, Tl, T1j, T1n;
T8 = T1 + T7;
Tl = Te + Tk;
Tm = T8 + Tl;
T1e = T8 - Tl;
T1j = TQ + TS;
T1n = T1l + T1m;
T1o = T1j + T1n;
T1q = T1n - T1j;
}
{
E Tz, TM, T1f, T1g;
Tz = Ts + Ty;
TM = TF + TL;
TN = Tz + TM;
T1p = TM - Tz;
T1f = TX + TZ;
T1g = T14 + T16;
T1h = T1f - T1g;
T1i = T1f + T1g;
}
ri[WS(rs, 4)] = Tm - TN;
ii[WS(rs, 4)] = T1o - T1i;
ri[0] = Tm + TN;
ii[0] = T1i + T1o;
ri[WS(rs, 6)] = T1e - T1h;
ii[WS(rs, 6)] = T1q - T1p;
ri[WS(rs, 2)] = T1e + T1h;
ii[WS(rs, 2)] = T1p + T1q;
}
}
}
}
/*
 * Twiddle-instruction list handed to the planner together with the codelet.
 * NOTE(review): TW_FULL / TW_NEXT and tw_instr are defined outside this
 * chunk; presumably the first entry asks for the full twiddle set of a
 * size-8 transform and the second entry ends the list -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor of the FMA variant of t1_8: size 8, name "t1_8", the twiddle
 * list above and GENUS. The tuple { 44, 14, 22, 0 } repeats the
 * "44 additions, 14 multiplications, 22 fused multiply/add" figures from
 * the generator statistics comment of this variant.
 */
static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, { 44, 14, 22, 0 }, 0, 0, 0 };
/* Registers t1_8 and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t1_8) (planner *p) {
X(kdft_dit_register) (p, t1_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 8 -name t1_8 -include dft/scalar/t.h */
/*
* This function contains 66 FP additions, 32 FP multiplications,
* (or, 52 additions, 18 multiplications, 14 fused multiply/add),
* 28 stack variables, 1 constant, and 32 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_8: size-8 twiddle codelet, variant generated without -fma.
 *
 * For every m in [mb, me) the loop reads eight complex values from the
 * split arrays ri (real parts) and ii (imaginary parts) at offsets 0 and
 * WS(rs, 1..7), scales inputs 1..7 by the twiddle pairs read from W,
 * combines them and writes the eight results back to the same offsets.
 *
 * ri, ii  - data arrays; both are advanced by ms after each iteration
 * W       - twiddle table; 14 values (7 pairs) are consumed per iteration
 *           and the loop starts at W + mb * 14
 * rs      - stride given to WS() to address element k of one transform
 * mb, me  - half-open iteration range
 * ms      - step between consecutive transforms in ri / ii
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b. E, R, DK, WS and MAKE_VOLATILE_STRIDE are
 * defined outside this chunk -- confirm there.
 */
static void t1_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Named constant: sqrt(2)/2. */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
for (m = mb, W = W + (mb * 14); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 14, MAKE_VOLATILE_STRIDE(16, rs)) {
E T7, T1e, TH, T19, TF, T13, TR, TU, Ti, T1f, TK, T16, Tu, T12, TM;
E TP;
{
/* Input 0 (untwiddled) combined with input 4 scaled by (W[6], W[7]). */
E T1, T18, T6, T17;
T1 = ri[0];
T18 = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 4)];
T5 = ii[WS(rs, 4)];
T2 = W[6];
T4 = W[7];
T6 = FMA(T2, T3, T4 * T5);
T17 = FNMS(T4, T3, T2 * T5);
}
T7 = T1 + T6;
T1e = T18 - T17;
TH = T1 - T6;
T19 = T17 + T18;
}
{
/* Inputs 7 (W[12], W[13]) and 3 (W[4], W[5]), then their sums and differences. */
E Tz, TS, TE, TT;
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 7)];
Ty = ii[WS(rs, 7)];
Tv = W[12];
Tx = W[13];
Tz = FMA(Tv, Tw, Tx * Ty);
TS = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 3)];
TD = ii[WS(rs, 3)];
TA = W[4];
TC = W[5];
TE = FMA(TA, TB, TC * TD);
TT = FNMS(TC, TB, TA * TD);
}
TF = Tz + TE;
T13 = TS + TT;
TR = Tz - TE;
TU = TS - TT;
}
{
/* Inputs 2 (W[2], W[3]) and 6 (W[10], W[11]), then their sums and differences. */
E Tc, TI, Th, TJ;
{
E T9, Tb, T8, Ta;
T9 = ri[WS(rs, 2)];
Tb = ii[WS(rs, 2)];
T8 = W[2];
Ta = W[3];
Tc = FMA(T8, T9, Ta * Tb);
TI = FNMS(Ta, T9, T8 * Tb);
}
{
E Te, Tg, Td, Tf;
Te = ri[WS(rs, 6)];
Tg = ii[WS(rs, 6)];
Td = W[10];
Tf = W[11];
Th = FMA(Td, Te, Tf * Tg);
TJ = FNMS(Tf, Te, Td * Tg);
}
Ti = Tc + Th;
T1f = Tc - Th;
TK = TI - TJ;
T16 = TI + TJ;
}
{
/* Inputs 1 (W[0], W[1]) and 5 (W[8], W[9]), then their sums and differences. */
E To, TN, Tt, TO;
{
E Tl, Tn, Tk, Tm;
Tl = ri[WS(rs, 1)];
Tn = ii[WS(rs, 1)];
Tk = W[0];
Tm = W[1];
To = FMA(Tk, Tl, Tm * Tn);
TN = FNMS(Tm, Tl, Tk * Tn);
}
{
E Tq, Ts, Tp, Tr;
Tq = ri[WS(rs, 5)];
Ts = ii[WS(rs, 5)];
Tp = W[8];
Tr = W[9];
Tt = FMA(Tp, Tq, Tr * Ts);
TO = FNMS(Tr, Tq, Tp * Ts);
}
Tu = To + Tt;
T12 = TN + TO;
TM = To - Tt;
TP = TN - TO;
}
{
/* Even-numbered outputs 0, 2, 4, 6 come first, built from the pairwise sums. */
E Tj, TG, T1b, T1c;
Tj = T7 + Ti;
TG = Tu + TF;
ri[WS(rs, 4)] = Tj - TG;
ri[0] = Tj + TG;
{
E T15, T1a, T11, T14;
T15 = T12 + T13;
T1a = T16 + T19;
ii[0] = T15 + T1a;
ii[WS(rs, 4)] = T1a - T15;
T11 = T7 - Ti;
T14 = T12 - T13;
ri[WS(rs, 6)] = T11 - T14;
ri[WS(rs, 2)] = T11 + T14;
}
T1b = TF - Tu;
T1c = T19 - T16;
ii[WS(rs, 2)] = T1b + T1c;
ii[WS(rs, 6)] = T1c - T1b;
{
/* Real parts of outputs 3 and 7, imaginary parts of outputs 1 and 5. */
E TX, T1g, T10, T1d, TY, TZ;
TX = TH - TK;
T1g = T1e - T1f;
TY = TP - TM;
TZ = TR + TU;
T10 = KP707106781 * (TY - TZ);
T1d = KP707106781 * (TY + TZ);
ri[WS(rs, 7)] = TX - T10;
ii[WS(rs, 5)] = T1g - T1d;
ri[WS(rs, 3)] = TX + T10;
ii[WS(rs, 1)] = T1d + T1g;
}
{
/* Real parts of outputs 1 and 5, imaginary parts of outputs 3 and 7. */
E TL, T1i, TW, T1h, TQ, TV;
TL = TH + TK;
T1i = T1f + T1e;
TQ = TM + TP;
TV = TR - TU;
TW = KP707106781 * (TQ + TV);
T1h = KP707106781 * (TV - TQ);
ri[WS(rs, 5)] = TL - TW;
ii[WS(rs, 7)] = T1i - T1h;
ri[WS(rs, 1)] = TL + TW;
ii[WS(rs, 3)] = T1h + T1i;
}
}
}
}
}
/*
 * Twiddle-instruction list handed to the planner together with the codelet.
 * NOTE(review): TW_FULL / TW_NEXT and tw_instr are defined outside this
 * chunk; presumably the first entry asks for the full twiddle set of a
 * size-8 transform and the second entry ends the list -- confirm.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 8 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor of the non-FMA variant of t1_8: size 8, name "t1_8", the
 * twiddle list above and GENUS. The tuple { 52, 18, 14, 0 } repeats the
 * "52 additions, 18 multiplications, 14 fused multiply/add" figures from
 * the generator statistics comment of this variant.
 */
static const ct_desc desc = { 8, "t1_8", twinstr, &GENUS, { 52, 18, 14, 0 }, 0, 0, 0 };
/* Registers t1_8 and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t1_8) (planner *p) {
X(kdft_dit_register) (p, t1_8, &desc);
}
#endif

View File

@@ -0,0 +1,487 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:27 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */
/*
* This function contains 96 FP additions, 88 FP multiplications,
* (or, 24 additions, 16 multiplications, 72 fused multiply/add),
* 55 stack variables, 10 constants, and 36 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_9: size-9 twiddle codelet, variant generated with -fma.
 *
 * For every m in [mb, me) the loop reads nine complex values from the split
 * arrays ri (real parts) and ii (imaginary parts) at offsets 0 and
 * WS(rs, 1..8), scales inputs 1..8 by the twiddle pairs read from W,
 * combines them and writes the nine results back to the same offsets.
 *
 * ri, ii  - data arrays; both are advanced by ms after each iteration
 * W       - twiddle table; 16 values (8 pairs) are consumed per iteration
 *           and the loop starts at W + mb * 16
 * rs      - stride given to WS() to address element k of one transform
 * mb, me  - half-open iteration range
 * ms      - step between consecutive transforms in ri / ii
 *
 * NOTE(review): the comments below assume FMA(a, b, c) = a * b + c and
 * FNMS(a, b, c) = c - a * b. E, R, DK, WS and MAKE_VOLATILE_STRIDE are
 * defined outside this chunk -- confirm there.
 */
static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Generator-chosen coefficients; the last two are sqrt(3)/2 and 1/2. */
DK(KP852868531, +0.852868531952443209628250963940074071936020296);
DK(KP492403876, +0.492403876506104029683371512294761506835321626);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP954188894, +0.954188894138671133499268364187245676532219158);
DK(KP363970234, +0.363970234266202361351047882776834043890471784);
DK(KP777861913, +0.777861913430206160028177977318626690410586096);
DK(KP839099631, +0.839099631177280011763127298123181364687434283);
DK(KP176326980, +0.176326980708464973471090386868618986121633062);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
{
INT m;
for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
E T1, T1R, Te, T1W, T10, T1Q, T1l, T1r, Ty, T1p, Tl, T1o, T1g, T1q, T1a;
E T1d, TS, T18, TF, T13, T19, T1c;
/* Input 0 is used as is (no twiddle). */
T1 = ri[0];
T1R = ii[0];
{
/* Inputs 3 (W[4], W[5]) and 6 (W[10], W[11]), then their sums and differences. */
E T3, T6, T4, TW, T9, Tc, Ta, TY, T2, T8;
T3 = ri[WS(rs, 3)];
T6 = ii[WS(rs, 3)];
T2 = W[4];
T4 = T2 * T3;
TW = T2 * T6;
T9 = ri[WS(rs, 6)];
Tc = ii[WS(rs, 6)];
T8 = W[10];
Ta = T8 * T9;
TY = T8 * Tc;
{
E T7, TX, Td, TZ, T5, Tb;
T5 = W[5];
T7 = FMA(T5, T6, T4);
TX = FNMS(T5, T3, TW);
Tb = W[11];
Td = FMA(Tb, Tc, Ta);
TZ = FNMS(Tb, T9, TY);
Te = T7 + Td;
T1W = Td - T7;
T10 = TX - TZ;
T1Q = TX + TZ;
}
}
{
/* Inputs 1 (W[0], W[1]), 7 (W[12], W[13]) and 4 (W[6], W[7]) and their combinations. */
E Th, Tk, Ti, T1n, Tx, T1i, Tr, T1k, Tg, Tj;
Th = ri[WS(rs, 1)];
Tk = ii[WS(rs, 1)];
Tg = W[0];
Ti = Tg * Th;
T1n = Tg * Tk;
{
E Tt, Tw, Tu, T1h, Ts, Tv;
Tt = ri[WS(rs, 7)];
Tw = ii[WS(rs, 7)];
Ts = W[12];
Tu = Ts * Tt;
T1h = Ts * Tw;
Tv = W[13];
Tx = FMA(Tv, Tw, Tu);
T1i = FNMS(Tv, Tt, T1h);
}
{
E Tn, Tq, To, T1j, Tm, Tp;
Tn = ri[WS(rs, 4)];
Tq = ii[WS(rs, 4)];
Tm = W[6];
To = Tm * Tn;
T1j = Tm * Tq;
Tp = W[7];
Tr = FMA(Tp, Tq, To);
T1k = FNMS(Tp, Tn, T1j);
}
T1l = T1i - T1k;
T1r = Tr - Tx;
Ty = Tr + Tx;
T1p = T1k + T1i;
Tj = W[1];
Tl = FMA(Tj, Tk, Ti);
T1o = FNMS(Tj, Th, T1n);
T1g = FNMS(KP500000000, Ty, Tl);
T1q = FNMS(KP500000000, T1p, T1o);
}
{
/* Inputs 2 (W[2], W[3]), 8 (W[14], W[15]) and 5 (W[8], W[9]) and their combinations. */
E TB, TE, TC, T12, TR, T17, TL, T15, TA, TD;
TB = ri[WS(rs, 2)];
TE = ii[WS(rs, 2)];
TA = W[2];
TC = TA * TB;
T12 = TA * TE;
{
E TN, TQ, TO, T16, TM, TP;
TN = ri[WS(rs, 8)];
TQ = ii[WS(rs, 8)];
TM = W[14];
TO = TM * TN;
T16 = TM * TQ;
TP = W[15];
TR = FMA(TP, TQ, TO);
T17 = FNMS(TP, TN, T16);
}
{
E TH, TK, TI, T14, TG, TJ;
TH = ri[WS(rs, 5)];
TK = ii[WS(rs, 5)];
TG = W[8];
TI = TG * TH;
T14 = TG * TK;
TJ = W[9];
TL = FMA(TJ, TK, TI);
T15 = FNMS(TJ, TH, T14);
}
T1a = TR - TL;
T1d = T15 - T17;
TS = TL + TR;
T18 = T15 + T17;
TD = W[3];
TF = FMA(TD, TE, TC);
T13 = FNMS(TD, TB, T12);
T19 = FNMS(KP500000000, T18, T13);
T1c = FNMS(KP500000000, TS, TF);
}
{
/* Outputs 0, 3 and 6, built from the sums of the three input groups. */
E Tf, T1S, TU, T1U, T1O, T1P, T1L, T1T;
Tf = T1 + Te;
T1S = T1Q + T1R;
{
E Tz, TT, T1M, T1N;
Tz = Tl + Ty;
TT = TF + TS;
TU = Tz + TT;
T1U = TT - Tz;
T1M = T1o + T1p;
T1N = T13 + T18;
T1O = T1M - T1N;
T1P = T1M + T1N;
}
ri[0] = Tf + TU;
ii[0] = T1P + T1S;
T1L = FNMS(KP500000000, TU, Tf);
ri[WS(rs, 6)] = FNMS(KP866025403, T1O, T1L);
ri[WS(rs, 3)] = FMA(KP866025403, T1O, T1L);
T1T = FNMS(KP500000000, T1P, T1S);
ii[WS(rs, 3)] = FMA(KP866025403, T1U, T1T);
ii[WS(rs, 6)] = FNMS(KP866025403, T1U, T1T);
}
{
/* Remaining outputs 1, 2, 4, 5, 7 and 8. */
E T11, T1z, T1X, T21, T1f, T1w, T1t, T1x, T1u, T1Y, T1C, T1I, T1F, T1J, T1G;
E T22, TV, T1V;
TV = FNMS(KP500000000, Te, T1);
T11 = FMA(KP866025403, T10, TV);
T1z = FNMS(KP866025403, T10, TV);
T1V = FNMS(KP500000000, T1Q, T1R);
T1X = FMA(KP866025403, T1W, T1V);
T21 = FNMS(KP866025403, T1W, T1V);
{
E T1b, T1e, T1m, T1s;
T1b = FMA(KP866025403, T1a, T19);
T1e = FMA(KP866025403, T1d, T1c);
T1f = FMA(KP176326980, T1e, T1b);
T1w = FNMS(KP176326980, T1b, T1e);
T1m = FNMS(KP866025403, T1l, T1g);
T1s = FNMS(KP866025403, T1r, T1q);
T1t = FMA(KP839099631, T1s, T1m);
T1x = FNMS(KP839099631, T1m, T1s);
}
T1u = FMA(KP777861913, T1t, T1f);
T1Y = FNMS(KP777861913, T1x, T1w);
{
E T1A, T1B, T1D, T1E;
T1A = FMA(KP866025403, T1r, T1q);
T1B = FMA(KP866025403, T1l, T1g);
T1C = FMA(KP176326980, T1B, T1A);
T1I = FNMS(KP176326980, T1A, T1B);
T1D = FNMS(KP866025403, T1d, T1c);
T1E = FNMS(KP866025403, T1a, T19);
T1F = FNMS(KP363970234, T1E, T1D);
T1J = FMA(KP363970234, T1D, T1E);
}
T1G = FNMS(KP954188894, T1F, T1C);
T22 = FMA(KP954188894, T1J, T1I);
ri[WS(rs, 1)] = FMA(KP984807753, T1u, T11);
ii[WS(rs, 1)] = FNMS(KP984807753, T1Y, T1X);
ri[WS(rs, 2)] = FMA(KP984807753, T1G, T1z);
ii[WS(rs, 2)] = FNMS(KP984807753, T22, T21);
{
/* Outputs 4 and 7. */
E T1v, T1y, T1Z, T20;
T1v = FNMS(KP492403876, T1u, T11);
T1y = FMA(KP777861913, T1x, T1w);
ri[WS(rs, 4)] = FMA(KP852868531, T1y, T1v);
ri[WS(rs, 7)] = FNMS(KP852868531, T1y, T1v);
T1Z = FMA(KP492403876, T1Y, T1X);
T20 = FNMS(KP777861913, T1t, T1f);
ii[WS(rs, 4)] = FMA(KP852868531, T20, T1Z);
ii[WS(rs, 7)] = FNMS(KP852868531, T20, T1Z);
}
{
/* Outputs 5 and 8. */
E T1H, T1K, T23, T24;
T1H = FNMS(KP492403876, T1G, T1z);
T1K = FNMS(KP954188894, T1J, T1I);
ri[WS(rs, 5)] = FNMS(KP852868531, T1K, T1H);
ri[WS(rs, 8)] = FMA(KP852868531, T1K, T1H);
T23 = FMA(KP492403876, T22, T21);
T24 = FMA(KP954188894, T1F, T1C);
ii[WS(rs, 5)] = FNMS(KP852868531, T24, T23);
ii[WS(rs, 8)] = FMA(KP852868531, T24, T23);
}
}
}
}
}
/*
 * Twiddle-table request for the t1_9 codelet above.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * { TW_FULL, 0, 9 } asks for the complete twiddle set of a radix-9 step and
 * { TW_NEXT, 1, 0 } terminates the list -- confirm against the tw_instr
 * definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: radix 9, codelet name "t1_9", the twiddle
 * request above, the genus exported by the included header, and an operation
 * count block.  NOTE(review): the count block presumably reads
 * { additions, multiplications, fused multiply/adds, other } -- confirm
 * against the ct_desc definition.
 */
static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, { 24, 16, 72, 0 }, 0, 0, 0 };
/* Registers the t1_9 kernel and its descriptor with planner p. */
void X(codelet_t1_9) (planner *p) {
X(kdft_dit_register) (p, t1_9, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -n 9 -name t1_9 -include dft/scalar/t.h */
/*
* This function contains 96 FP additions, 72 FP multiplications,
* (or, 60 additions, 36 multiplications, 36 fused multiply/add),
* 41 stack variables, 8 constants, and 36 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t1_9, non-FMA variant: size-9 "twiddle" codelet (a radix-9 DIT step).
 *
 * For every m in [mb, me) the kernel reads the nine complex values
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..8, rotates inputs 1..8 by the
 * twiddle pair (W[2k-2], W[2k-1]), and overwrites the same nine locations
 * with the transformed values.
 *
 *   ri, ii  real / imaginary arrays, transformed in place
 *   W       twiddle table; 16 reals are consumed per iteration
 *   rs      stride between the nine points (applied through WS())
 *   mb, me  half-open range of iterations
 *   ms      distance added to ri and ii between iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros from the included
 * headers; the comments below assume a*b + c and c - a*b respectively --
 * confirm.  Under that reading each twiddle step multiplies the input by
 * the complex conjugate of (W[2k-2] + i*W[2k-1]).
 *
 * This is generated code: the statement order and the association of the
 * floating-point operations are deliberate and must not be changed by hand.
 */
static void t1_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: cos 20, sin 20, sin 80, cos 80, sin 40, cos 40 (degrees), 1/2, sqrt(3)/2. */
DK(KP939692620, +0.939692620785908384054109277324731469936208134);
DK(KP342020143, +0.342020143325668733044099614682259580763083368);
DK(KP984807753, +0.984807753012208059366743024589523013670643252);
DK(KP173648177, +0.173648177666930348851716626769314796000375677);
DK(KP642787609, +0.642787609686539326322643409907263432907559884);
DK(KP766044443, +0.766044443118978035202392650555416673935832457);
DK(KP500000000, +0.500000000000000000000000000000000000000000000);
DK(KP866025403, +0.866025403784438646763723170752936183471402627);
{
INT m;
/* W is first advanced to the twiddles of iteration mb, then by 16 reals per iteration. */
for (m = mb, W = W + (mb * 16); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 16, MAKE_VOLATILE_STRIDE(18, rs)) {
E T1, T1B, TQ, T1G, Tc, TN, T1A, T1H, TL, T1x, T17, T1o, T1c, T1n, Tu;
E T1w, TW, T1k, T11, T1l;
/* Inputs 0, 3, 6: twiddle inputs 3 and 6, then form the sums and the sqrt(3)/2-scaled differences. */
{
E T6, TO, Tb, TP;
T1 = ri[0];
T1B = ii[0];
{
E T3, T5, T2, T4;
T3 = ri[WS(rs, 3)];
T5 = ii[WS(rs, 3)];
T2 = W[4];
T4 = W[5];
T6 = FMA(T2, T3, T4 * T5);
TO = FNMS(T4, T3, T2 * T5);
}
{
E T8, Ta, T7, T9;
T8 = ri[WS(rs, 6)];
Ta = ii[WS(rs, 6)];
T7 = W[10];
T9 = W[11];
Tb = FMA(T7, T8, T9 * Ta);
TP = FNMS(T9, T8, T7 * Ta);
}
TQ = KP866025403 * (TO - TP);
T1G = KP866025403 * (Tb - T6);
Tc = T6 + Tb;
TN = FNMS(KP500000000, Tc, T1);
T1A = TO + TP;
T1H = FNMS(KP500000000, T1A, T1B);
}
/* Inputs 2, 5, 8: twiddle all three, then the same three-point combination. */
{
E Tz, T19, TE, T14, TJ, T15, TK, T1a;
{
E Tw, Ty, Tv, Tx;
Tw = ri[WS(rs, 2)];
Ty = ii[WS(rs, 2)];
Tv = W[2];
Tx = W[3];
Tz = FMA(Tv, Tw, Tx * Ty);
T19 = FNMS(Tx, Tw, Tv * Ty);
}
{
E TB, TD, TA, TC;
TB = ri[WS(rs, 5)];
TD = ii[WS(rs, 5)];
TA = W[8];
TC = W[9];
TE = FMA(TA, TB, TC * TD);
T14 = FNMS(TC, TB, TA * TD);
}
{
E TG, TI, TF, TH;
TG = ri[WS(rs, 8)];
TI = ii[WS(rs, 8)];
TF = W[14];
TH = W[15];
TJ = FMA(TF, TG, TH * TI);
T15 = FNMS(TH, TG, TF * TI);
}
TK = TE + TJ;
T1a = T14 + T15;
TL = Tz + TK;
T1x = T19 + T1a;
{
E T13, T16, T18, T1b;
T13 = FNMS(KP500000000, TK, Tz);
T16 = KP866025403 * (T14 - T15);
T17 = T13 + T16;
T1o = T13 - T16;
T18 = KP866025403 * (TJ - TE);
T1b = FNMS(KP500000000, T1a, T19);
T1c = T18 + T1b;
T1n = T1b - T18;
}
}
/* Inputs 1, 4, 7: twiddle all three, then the same three-point combination. */
{
E Ti, TY, Tn, TT, Ts, TU, Tt, TZ;
{
E Tf, Th, Te, Tg;
Tf = ri[WS(rs, 1)];
Th = ii[WS(rs, 1)];
Te = W[0];
Tg = W[1];
Ti = FMA(Te, Tf, Tg * Th);
TY = FNMS(Tg, Tf, Te * Th);
}
{
E Tk, Tm, Tj, Tl;
Tk = ri[WS(rs, 4)];
Tm = ii[WS(rs, 4)];
Tj = W[6];
Tl = W[7];
Tn = FMA(Tj, Tk, Tl * Tm);
TT = FNMS(Tl, Tk, Tj * Tm);
}
{
E Tp, Tr, To, Tq;
Tp = ri[WS(rs, 7)];
Tr = ii[WS(rs, 7)];
To = W[12];
Tq = W[13];
Ts = FMA(To, Tp, Tq * Tr);
TU = FNMS(Tq, Tp, To * Tr);
}
Tt = Tn + Ts;
TZ = TT + TU;
Tu = Ti + Tt;
T1w = TY + TZ;
{
E TS, TV, TX, T10;
TS = FNMS(KP500000000, Tt, Ti);
TV = KP866025403 * (TT - TU);
TW = TS + TV;
T1k = TS - TV;
TX = KP866025403 * (Ts - Tn);
T10 = FNMS(KP500000000, TZ, TY);
T11 = TX + T10;
T1l = T10 - TX;
}
}
/* Real parts of outputs 0, 3, 6, built from the three group sums. */
{
E T1y, Td, TM, T1v;
T1y = KP866025403 * (T1w - T1x);
Td = T1 + Tc;
TM = Tu + TL;
T1v = FNMS(KP500000000, TM, Td);
ri[0] = Td + TM;
ri[WS(rs, 3)] = T1v + T1y;
ri[WS(rs, 6)] = T1v - T1y;
}
/* Imaginary parts of outputs 0, 3, 6. */
{
E T1D, T1z, T1C, T1E;
T1D = KP866025403 * (TL - Tu);
T1z = T1w + T1x;
T1C = T1A + T1B;
T1E = FNMS(KP500000000, T1z, T1C);
ii[0] = T1z + T1C;
ii[WS(rs, 6)] = T1E - T1D;
ii[WS(rs, 3)] = T1D + T1E;
}
/* Outputs 1, 4, 7: partial results rotated by the 40- and 80-degree constants, then combined. */
{
E TR, T1I, T1e, T1J, T1i, T1F, T1f, T1K;
TR = TN + TQ;
T1I = T1G + T1H;
{
E T12, T1d, T1g, T1h;
T12 = FMA(KP766044443, TW, KP642787609 * T11);
T1d = FMA(KP173648177, T17, KP984807753 * T1c);
T1e = T12 + T1d;
T1J = KP866025403 * (T1d - T12);
T1g = FNMS(KP642787609, TW, KP766044443 * T11);
T1h = FNMS(KP984807753, T17, KP173648177 * T1c);
T1i = KP866025403 * (T1g - T1h);
T1F = T1g + T1h;
}
ri[WS(rs, 1)] = TR + T1e;
ii[WS(rs, 1)] = T1F + T1I;
T1f = FNMS(KP500000000, T1e, TR);
ri[WS(rs, 7)] = T1f - T1i;
ri[WS(rs, 4)] = T1f + T1i;
T1K = FNMS(KP500000000, T1F, T1I);
ii[WS(rs, 4)] = T1J + T1K;
ii[WS(rs, 7)] = T1K - T1J;
}
/* Outputs 2, 5, 8: partial results rotated by the 80- and 20-degree constants, then combined. */
{
E T1j, T1M, T1q, T1N, T1u, T1L, T1r, T1O;
T1j = TN - TQ;
T1M = T1H - T1G;
{
E T1m, T1p, T1s, T1t;
T1m = FMA(KP173648177, T1k, KP984807753 * T1l);
T1p = FNMS(KP939692620, T1o, KP342020143 * T1n);
T1q = T1m + T1p;
T1N = KP866025403 * (T1p - T1m);
T1s = FNMS(KP984807753, T1k, KP173648177 * T1l);
T1t = FMA(KP342020143, T1o, KP939692620 * T1n);
T1u = KP866025403 * (T1s + T1t);
T1L = T1s - T1t;
}
ri[WS(rs, 2)] = T1j + T1q;
ii[WS(rs, 2)] = T1L + T1M;
T1r = FNMS(KP500000000, T1q, T1j);
ri[WS(rs, 8)] = T1r - T1u;
ri[WS(rs, 5)] = T1r + T1u;
T1O = FNMS(KP500000000, T1L, T1M);
ii[WS(rs, 5)] = T1N + T1O;
ii[WS(rs, 8)] = T1O - T1N;
}
}
}
}
/*
 * Twiddle-table request for the non-FMA t1_9 codelet.
 * NOTE(review): TW_FULL / TW_NEXT are defined outside this file; presumably
 * { TW_FULL, 0, 9 } asks for the complete twiddle set of a radix-9 step
 * (t1_9 consumes 16 reals of W per iteration) and { TW_NEXT, 1, 0 }
 * terminates the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_FULL, 0, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: radix 9, codelet name "t1_9", the twiddle
 * request above, the genus exported by the included header, and the operation
 * counts { 60 additions, 36 multiplications, 36 fused multiply/adds, 0 },
 * which match the counts quoted in the generated header comment of t1_9.
 */
static const ct_desc desc = { 9, "t1_9", twinstr, &GENUS, { 60, 36, 36, 0 }, 0, 0, 0 };
/* Registers the t1_9 kernel and its descriptor with planner p. */
void X(codelet_t1_9) (planner *p) {
X(kdft_dit_register) (p, t1_9, &desc);
}
#endif

View File

@@ -0,0 +1,509 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:37 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */
/*
* This function contains 114 FP additions, 94 FP multiplications,
* (or, 48 additions, 28 multiplications, 66 fused multiply/add),
* 63 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_10, FMA variant: size-10 twiddle codelet that stores only three complex
 * twiddles per iteration (W[0..5]) and derives the other six it needs from
 * products of those three.
 *
 * For every m in [mb, me) the kernel reads the ten complex values
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..9, applies a twiddle to inputs
 * 1..9, and overwrites the same ten locations with the transformed values.
 *
 *   ri, ii  real / imaginary arrays, transformed in place
 *   W       twiddle table; 6 reals are consumed per iteration
 *   rs      stride between the ten points (applied through WS())
 *   mb, me  half-open range of iterations
 *   ms      distance added to ri and ii between iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros from the included
 * headers; the comments below assume a*b + c and c - a*b -- confirm.
 *
 * This is generated code: the statement order and the association of the
 * floating-point operations are deliberate and must not be changed by hand.
 */
static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: sin 72 degrees, sqrt(5)/4, (sqrt(5)-1)/2, 1/4. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
/* W is first advanced to the twiddles of iteration mb, then by 6 reals per iteration. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
E T2, T3, T8, Tc, T5, T6, Tl, T7, TB, TF, T12, TY, To, Ts, Tw;
E Tb, Td, Th;
/*
 * Twiddle set-up.  Loaded pairs: (T2, T5) = W[0..1] used for input 1,
 * (T3, T6) = W[2..3] used for input 3, (T8, Tc) = W[4..5] used for input 9.
 * Derived pairs: (Tl, To) for input 2, (T7, Tb) for input 4, (Td, Th) for
 * input 5, (TY, T12) for input 6, (Ts, Tw) for input 7, (TB, TF) for input 8.
 */
{
E TA, TX, TE, T11, Ta, T4;
T2 = W[0];
T3 = W[2];
T4 = T2 * T3;
T8 = W[4];
TA = T2 * T8;
TX = T3 * T8;
Tc = W[5];
TE = T2 * Tc;
T11 = T3 * Tc;
T5 = W[1];
T6 = W[3];
Ta = T2 * T6;
Tl = FMA(T5, T6, T4);
T7 = FNMS(T5, T6, T4);
TB = FMA(T5, Tc, TA);
TF = FNMS(T5, T8, TE);
T12 = FNMS(T6, T8, T11);
TY = FMA(T6, Tc, TX);
{
E Tr, Tv, T9, Tg;
Tr = Tl * T8;
Tv = Tl * Tc;
To = FNMS(T5, T3, Ta);
Ts = FMA(To, Tc, Tr);
Tw = FNMS(To, T8, Tv);
T9 = T7 * T8;
Tg = T7 * Tc;
Tb = FMA(T5, T3, Ta);
Td = FMA(Tb, Tc, T9);
Th = FNMS(Tb, T8, Tg);
}
}
{
E Tk, T1c, T24, T2d, TW, T19, T1a, T1P, T1Q, T1Z, T1g, T1h, T1i, T1C, T1H;
E T2f, Tz, TM, TN, T1S, T1T, T1Y, T1d, T1e, T1f, T1r, T1w, T2e;
/* Inputs 0 and 5: twiddle input 5, then form sum and difference with input 0. */
{
E T1, T23, Te, Tf, Ti, T21, Tj, T22;
T1 = ri[0];
T23 = ii[0];
Te = ri[WS(rs, 5)];
Tf = Td * Te;
Ti = ii[WS(rs, 5)];
T21 = Td * Ti;
Tj = FMA(Th, Ti, Tf);
Tk = T1 - Tj;
T1c = T1 + Tj;
T22 = FNMS(Th, Te, T21);
T24 = T22 + T23;
T2d = T23 - T22;
}
/* Inputs 4, 1, 9, 6: twiddle each, then pair (4, 9) and (6, 1) into sums and differences. */
{
E TR, T1z, T18, T1G, TV, T1B, T14, T1E;
{
E TO, TP, TQ, T1y;
TO = ri[WS(rs, 4)];
TP = T7 * TO;
TQ = ii[WS(rs, 4)];
T1y = T7 * TQ;
TR = FMA(Tb, TQ, TP);
T1z = FNMS(Tb, TO, T1y);
}
{
E T15, T16, T17, T1F;
T15 = ri[WS(rs, 1)];
T16 = T2 * T15;
T17 = ii[WS(rs, 1)];
T1F = T2 * T17;
T18 = FMA(T5, T17, T16);
T1G = FNMS(T5, T15, T1F);
}
{
E TS, TT, TU, T1A;
TS = ri[WS(rs, 9)];
TT = T8 * TS;
TU = ii[WS(rs, 9)];
T1A = T8 * TU;
TV = FMA(Tc, TU, TT);
T1B = FNMS(Tc, TS, T1A);
}
{
E TZ, T10, T13, T1D;
TZ = ri[WS(rs, 6)];
T10 = TY * TZ;
T13 = ii[WS(rs, 6)];
T1D = TY * T13;
T14 = FMA(T12, T13, T10);
T1E = FNMS(T12, TZ, T1D);
}
TW = TR - TV;
T19 = T14 - T18;
T1a = TW + T19;
T1P = T1z + T1B;
T1Q = T1E + T1G;
T1Z = T1P + T1Q;
T1g = TR + TV;
T1h = T14 + T18;
T1i = T1g + T1h;
T1C = T1z - T1B;
T1H = T1E - T1G;
T2f = T1C + T1H;
}
/* Inputs 2, 3, 7, 8: twiddle each, then pair (2, 7) and (8, 3) into sums and differences. */
{
E Tq, T1o, TL, T1v, Ty, T1q, TH, T1t;
{
E Tm, Tn, Tp, T1n;
Tm = ri[WS(rs, 2)];
Tn = Tl * Tm;
Tp = ii[WS(rs, 2)];
T1n = Tl * Tp;
Tq = FMA(To, Tp, Tn);
T1o = FNMS(To, Tm, T1n);
}
{
E TI, TJ, TK, T1u;
TI = ri[WS(rs, 3)];
TJ = T3 * TI;
TK = ii[WS(rs, 3)];
T1u = T3 * TK;
TL = FMA(T6, TK, TJ);
T1v = FNMS(T6, TI, T1u);
}
{
E Tt, Tu, Tx, T1p;
Tt = ri[WS(rs, 7)];
Tu = Ts * Tt;
Tx = ii[WS(rs, 7)];
T1p = Ts * Tx;
Ty = FMA(Tw, Tx, Tu);
T1q = FNMS(Tw, Tt, T1p);
}
{
E TC, TD, TG, T1s;
TC = ri[WS(rs, 8)];
TD = TB * TC;
TG = ii[WS(rs, 8)];
T1s = TB * TG;
TH = FMA(TF, TG, TD);
T1t = FNMS(TF, TC, T1s);
}
Tz = Tq - Ty;
TM = TH - TL;
TN = Tz + TM;
T1S = T1o + T1q;
T1T = T1t + T1v;
T1Y = T1S + T1T;
T1d = Tq + Ty;
T1e = TH + TL;
T1f = T1d + T1e;
T1r = T1o - T1q;
T1w = T1t - T1v;
T2e = T1r + T1w;
}
/* Real parts of the odd-indexed outputs 1, 3, 5, 7, 9. */
{
E T1l, T1b, T1k, T1J, T1L, T1x, T1I, T1K, T1m;
T1l = TN - T1a;
T1b = TN + T1a;
T1k = FNMS(KP250000000, T1b, Tk);
T1x = T1r - T1w;
T1I = T1C - T1H;
T1J = FMA(KP618033988, T1I, T1x);
T1L = FNMS(KP618033988, T1x, T1I);
ri[WS(rs, 5)] = Tk + T1b;
T1K = FNMS(KP559016994, T1l, T1k);
ri[WS(rs, 7)] = FNMS(KP951056516, T1L, T1K);
ri[WS(rs, 3)] = FMA(KP951056516, T1L, T1K);
T1m = FMA(KP559016994, T1l, T1k);
ri[WS(rs, 9)] = FNMS(KP951056516, T1J, T1m);
ri[WS(rs, 1)] = FMA(KP951056516, T1J, T1m);
}
/* Imaginary parts of the odd-indexed outputs 1, 3, 5, 7, 9. */
{
E T2i, T2g, T2h, T2m, T2o, T2k, T2l, T2n, T2j;
T2i = T2e - T2f;
T2g = T2e + T2f;
T2h = FNMS(KP250000000, T2g, T2d);
T2k = Tz - TM;
T2l = TW - T19;
T2m = FMA(KP618033988, T2l, T2k);
T2o = FNMS(KP618033988, T2k, T2l);
ii[WS(rs, 5)] = T2g + T2d;
T2n = FNMS(KP559016994, T2i, T2h);
ii[WS(rs, 3)] = FNMS(KP951056516, T2o, T2n);
ii[WS(rs, 7)] = FMA(KP951056516, T2o, T2n);
T2j = FMA(KP559016994, T2i, T2h);
ii[WS(rs, 1)] = FNMS(KP951056516, T2m, T2j);
ii[WS(rs, 9)] = FMA(KP951056516, T2m, T2j);
}
/* Real parts of the even-indexed outputs 0, 2, 4, 6, 8. */
{
E T1N, T1j, T1M, T1V, T1X, T1R, T1U, T1W, T1O;
T1N = T1f - T1i;
T1j = T1f + T1i;
T1M = FNMS(KP250000000, T1j, T1c);
T1R = T1P - T1Q;
T1U = T1S - T1T;
T1V = FNMS(KP618033988, T1U, T1R);
T1X = FMA(KP618033988, T1R, T1U);
ri[0] = T1c + T1j;
T1W = FMA(KP559016994, T1N, T1M);
ri[WS(rs, 4)] = FNMS(KP951056516, T1X, T1W);
ri[WS(rs, 6)] = FMA(KP951056516, T1X, T1W);
T1O = FNMS(KP559016994, T1N, T1M);
ri[WS(rs, 2)] = FNMS(KP951056516, T1V, T1O);
ri[WS(rs, 8)] = FMA(KP951056516, T1V, T1O);
}
/* Imaginary parts of the even-indexed outputs 0, 2, 4, 6, 8. */
{
E T26, T20, T25, T2a, T2c, T28, T29, T2b, T27;
T26 = T1Y - T1Z;
T20 = T1Y + T1Z;
T25 = FNMS(KP250000000, T20, T24);
T28 = T1g - T1h;
T29 = T1d - T1e;
T2a = FNMS(KP618033988, T29, T28);
T2c = FMA(KP618033988, T28, T29);
ii[0] = T20 + T24;
T2b = FMA(KP559016994, T26, T25);
ii[WS(rs, 4)] = FMA(KP951056516, T2c, T2b);
ii[WS(rs, 6)] = FNMS(KP951056516, T2c, T2b);
T27 = FNMS(KP559016994, T26, T25);
ii[WS(rs, 2)] = FMA(KP951056516, T2a, T27);
ii[WS(rs, 8)] = FNMS(KP951056516, T2a, T27);
}
}
}
}
}
/*
 * Twiddle-table request for the FMA t2_10 codelet: three TW_CEXP entries
 * (indices 1, 3 and 9), matching the 6 reals of W that t2_10 consumes per
 * iteration, followed by the TW_NEXT entry.
 * NOTE(review): TW_CEXP / TW_NEXT are defined outside this file; presumably
 * each TW_CEXP entry stores one complex exponential and TW_NEXT terminates
 * the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: radix 10, codelet name "t2_10", the
 * twiddle request above, the genus exported by the included header, and the
 * operation counts { 48 additions, 28 multiplications, 66 fused
 * multiply/adds, 0 }, which match the generated header comment of t2_10.
 */
static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, { 48, 28, 66, 0 }, 0, 0, 0 };
/* Registers the t2_10 kernel and its descriptor with planner p. */
void X(codelet_t2_10) (planner *p) {
X(kdft_dit_register) (p, t2_10, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 10 -name t2_10 -include dft/scalar/t.h */
/*
* This function contains 114 FP additions, 80 FP multiplications,
* (or, 76 additions, 42 multiplications, 38 fused multiply/add),
* 63 stack variables, 4 constants, and 40 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_10, non-FMA variant: size-10 twiddle codelet that stores only three
 * complex twiddles per iteration (W[0..5]) and derives the other six it
 * needs from products of those three.
 *
 * For every m in [mb, me) the kernel reads the ten complex values
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..9, applies a twiddle to inputs
 * 1..9, and overwrites the same ten locations with the transformed values.
 *
 *   ri, ii  real / imaginary arrays, transformed in place
 *   W       twiddle table; 6 reals are consumed per iteration
 *   rs      stride between the ten points (applied through WS())
 *   mb, me  half-open range of iterations
 *   ms      distance added to ri and ii between iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros from the included
 * headers; the comments below assume a*b + c and c - a*b -- confirm.
 *
 * This is generated code: the statement order and the association of the
 * floating-point operations are deliberate and must not be changed by hand.
 */
static void t2_10(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: sin 36 degrees, sin 72 degrees, 1/4, sqrt(5)/4. */
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
{
INT m;
/* W is first advanced to the twiddles of iteration mb, then by 6 reals per iteration. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(20, rs)) {
E T2, T5, T3, T6, T8, Tm, Tc, Tk, T9, Td, Te, TM, TO, Tg, Tp;
E Tv, Tx, Tr;
/*
 * Twiddle set-up.  Loaded pairs: (T2, T5) = W[0..1] used for input 1,
 * (T3, T6) = W[2..3] used for input 3, (T9, Td) = W[4..5] used for input 9.
 * Derived pairs: (Tk, Tm) for input 2, (T8, Tc) for input 4, (Te, Tg) for
 * input 5, (TM, TO) for input 6, (Tp, Tr) for input 7, (Tv, Tx) for input 8.
 */
{
E T4, Tb, T7, Ta;
T2 = W[0];
T5 = W[1];
T3 = W[2];
T6 = W[3];
T4 = T2 * T3;
Tb = T5 * T3;
T7 = T5 * T6;
Ta = T2 * T6;
T8 = T4 - T7;
Tm = Ta - Tb;
Tc = Ta + Tb;
Tk = T4 + T7;
T9 = W[4];
Td = W[5];
Te = FMA(T8, T9, Tc * Td);
TM = FMA(T3, T9, T6 * Td);
TO = FNMS(T6, T9, T3 * Td);
Tg = FNMS(Tc, T9, T8 * Td);
Tp = FMA(Tk, T9, Tm * Td);
Tv = FMA(T2, T9, T5 * Td);
Tx = FNMS(T5, T9, T2 * Td);
Tr = FNMS(Tm, T9, Tk * Td);
}
{
E Tj, T1S, TX, T1G, TL, TU, TV, T1s, T1t, T1C, T11, T12, T13, T1h, T1k;
E T1Q, Tu, TD, TE, T1v, T1w, T1B, TY, TZ, T10, T1a, T1d, T1P;
/* Inputs 0 and 5: twiddle input 5, then form sum and difference with input 0. */
{
E T1, T1F, Ti, T1E, Tf, Th;
T1 = ri[0];
T1F = ii[0];
Tf = ri[WS(rs, 5)];
Th = ii[WS(rs, 5)];
Ti = FMA(Te, Tf, Tg * Th);
T1E = FNMS(Tg, Tf, Te * Th);
Tj = T1 - Ti;
T1S = T1F - T1E;
TX = T1 + Ti;
T1G = T1E + T1F;
}
/* Inputs 4, 1, 9, 6: twiddle each, then pair (4, 9) and (6, 1) into sums and differences. */
{
E TH, T1f, TT, T1j, TK, T1g, TQ, T1i;
{
E TF, TG, TR, TS;
TF = ri[WS(rs, 4)];
TG = ii[WS(rs, 4)];
TH = FMA(T8, TF, Tc * TG);
T1f = FNMS(Tc, TF, T8 * TG);
TR = ri[WS(rs, 1)];
TS = ii[WS(rs, 1)];
TT = FMA(T2, TR, T5 * TS);
T1j = FNMS(T5, TR, T2 * TS);
}
{
E TI, TJ, TN, TP;
TI = ri[WS(rs, 9)];
TJ = ii[WS(rs, 9)];
TK = FMA(T9, TI, Td * TJ);
T1g = FNMS(Td, TI, T9 * TJ);
TN = ri[WS(rs, 6)];
TP = ii[WS(rs, 6)];
TQ = FMA(TM, TN, TO * TP);
T1i = FNMS(TO, TN, TM * TP);
}
TL = TH - TK;
TU = TQ - TT;
TV = TL + TU;
T1s = T1f + T1g;
T1t = T1i + T1j;
T1C = T1s + T1t;
T11 = TH + TK;
T12 = TQ + TT;
T13 = T11 + T12;
T1h = T1f - T1g;
T1k = T1i - T1j;
T1Q = T1h + T1k;
}
/* Inputs 2, 3, 7, 8: twiddle each, then pair (2, 7) and (8, 3) into sums and differences. */
{
E To, T18, TC, T1c, Tt, T19, Tz, T1b;
{
E Tl, Tn, TA, TB;
Tl = ri[WS(rs, 2)];
Tn = ii[WS(rs, 2)];
To = FMA(Tk, Tl, Tm * Tn);
T18 = FNMS(Tm, Tl, Tk * Tn);
TA = ri[WS(rs, 3)];
TB = ii[WS(rs, 3)];
TC = FMA(T3, TA, T6 * TB);
T1c = FNMS(T6, TA, T3 * TB);
}
{
E Tq, Ts, Tw, Ty;
Tq = ri[WS(rs, 7)];
Ts = ii[WS(rs, 7)];
Tt = FMA(Tp, Tq, Tr * Ts);
T19 = FNMS(Tr, Tq, Tp * Ts);
Tw = ri[WS(rs, 8)];
Ty = ii[WS(rs, 8)];
Tz = FMA(Tv, Tw, Tx * Ty);
T1b = FNMS(Tx, Tw, Tv * Ty);
}
Tu = To - Tt;
TD = Tz - TC;
TE = Tu + TD;
T1v = T18 + T19;
T1w = T1b + T1c;
T1B = T1v + T1w;
TY = To + Tt;
TZ = Tz + TC;
T10 = TY + TZ;
T1a = T18 - T19;
T1d = T1b - T1c;
T1P = T1a + T1d;
}
/* Real parts of the odd-indexed outputs 1, 3, 5, 7, 9. */
{
E T15, TW, T16, T1m, T1o, T1e, T1l, T1n, T17;
T15 = KP559016994 * (TE - TV);
TW = TE + TV;
T16 = FNMS(KP250000000, TW, Tj);
T1e = T1a - T1d;
T1l = T1h - T1k;
T1m = FMA(KP951056516, T1e, KP587785252 * T1l);
T1o = FNMS(KP587785252, T1e, KP951056516 * T1l);
ri[WS(rs, 5)] = Tj + TW;
T1n = T16 - T15;
ri[WS(rs, 7)] = T1n - T1o;
ri[WS(rs, 3)] = T1n + T1o;
T17 = T15 + T16;
ri[WS(rs, 9)] = T17 - T1m;
ri[WS(rs, 1)] = T17 + T1m;
}
/* Imaginary parts of the odd-indexed outputs 1, 3, 5, 7, 9. */
{
E T1R, T1T, T1U, T1Y, T20, T1W, T1X, T1Z, T1V;
T1R = KP559016994 * (T1P - T1Q);
T1T = T1P + T1Q;
T1U = FNMS(KP250000000, T1T, T1S);
T1W = Tu - TD;
T1X = TL - TU;
T1Y = FMA(KP951056516, T1W, KP587785252 * T1X);
T20 = FNMS(KP587785252, T1W, KP951056516 * T1X);
ii[WS(rs, 5)] = T1T + T1S;
T1Z = T1U - T1R;
ii[WS(rs, 3)] = T1Z - T20;
ii[WS(rs, 7)] = T20 + T1Z;
T1V = T1R + T1U;
ii[WS(rs, 1)] = T1V - T1Y;
ii[WS(rs, 9)] = T1Y + T1V;
}
/* Real parts of the even-indexed outputs 0, 2, 4, 6, 8. */
{
E T1q, T14, T1p, T1y, T1A, T1u, T1x, T1z, T1r;
T1q = KP559016994 * (T10 - T13);
T14 = T10 + T13;
T1p = FNMS(KP250000000, T14, TX);
T1u = T1s - T1t;
T1x = T1v - T1w;
T1y = FNMS(KP587785252, T1x, KP951056516 * T1u);
T1A = FMA(KP951056516, T1x, KP587785252 * T1u);
ri[0] = TX + T14;
T1z = T1q + T1p;
ri[WS(rs, 4)] = T1z - T1A;
ri[WS(rs, 6)] = T1z + T1A;
T1r = T1p - T1q;
ri[WS(rs, 2)] = T1r - T1y;
ri[WS(rs, 8)] = T1r + T1y;
}
/* Imaginary parts of the even-indexed outputs 0, 2, 4, 6, 8. */
{
E T1L, T1D, T1K, T1J, T1N, T1H, T1I, T1O, T1M;
T1L = KP559016994 * (T1B - T1C);
T1D = T1B + T1C;
T1K = FNMS(KP250000000, T1D, T1G);
T1H = T11 - T12;
T1I = TY - TZ;
T1J = FNMS(KP587785252, T1I, KP951056516 * T1H);
T1N = FMA(KP951056516, T1I, KP587785252 * T1H);
ii[0] = T1D + T1G;
T1O = T1L + T1K;
ii[WS(rs, 4)] = T1N + T1O;
ii[WS(rs, 6)] = T1O - T1N;
T1M = T1K - T1L;
ii[WS(rs, 2)] = T1J + T1M;
ii[WS(rs, 8)] = T1M - T1J;
}
}
}
}
}
/*
 * Twiddle-table request for the non-FMA t2_10 codelet: three TW_CEXP entries
 * (indices 1, 3 and 9), matching the 6 reals of W that t2_10 consumes per
 * iteration, followed by the TW_NEXT entry.
 * NOTE(review): TW_CEXP / TW_NEXT are defined outside this file; presumably
 * each TW_CEXP entry stores one complex exponential and TW_NEXT terminates
 * the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 9 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: radix 10, codelet name "t2_10", the
 * twiddle request above, the genus exported by the included header, and the
 * operation counts { 76 additions, 42 multiplications, 38 fused
 * multiply/adds, 0 }, which match the generated header comment of t2_10.
 */
static const ct_desc desc = { 10, "t2_10", twinstr, &GENUS, { 76, 42, 38, 0 }, 0, 0, 0 };
/* Registers the t2_10 kernel and its descriptor with planner p. */
void X(codelet_t2_10) (planner *p) {
X(kdft_dit_register) (p, t2_10, &desc);
}
#endif

View File

@@ -0,0 +1,836 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
/*
* This function contains 196 FP additions, 134 FP multiplications,
* (or, 104 additions, 42 multiplications, 92 fused multiply/add),
* 90 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_16, FMA variant: size-16 twiddle codelet that stores only four complex
 * twiddles per iteration (W[0..7]) and derives the other eleven it needs
 * from products of those four.
 *
 * For every m in [mb, me) the kernel reads the sixteen complex values
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..15, applies a twiddle to inputs
 * 1..15, and overwrites the same sixteen locations with the transformed
 * values.
 *
 *   ri, ii  real / imaginary arrays, transformed in place
 *   W       twiddle table; 8 reals are consumed per iteration
 *   rs      stride between the sixteen points (applied through WS())
 *   mb, me  half-open range of iterations
 *   ms      distance added to ri and ii between iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros from the included
 * headers; the comments below assume a*b + c and c - a*b -- confirm.
 *
 * This is generated code: the statement order and the association of the
 * floating-point operations are deliberate and must not be changed by hand.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: cos(pi/8), sqrt(2) - 1 (= tan(pi/8)), sqrt(2)/2. */
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP414213562, +0.414213562373095048801688724209698078569671875);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W is first advanced to the twiddles of iteration mb, then by 8 reals per iteration. */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
E T2, Tf, TM, TO, T3, T6, T5, Th, Tz, Ti, T7, TZ, TT, Tq, TW;
E Tb, Tu, TP, TI, TF, TC, T1z, T1O, T1D, T1L, Tm, T1f, T1p, T1j, T1m;
/*
 * Twiddle set-up.  Loaded pairs: (T2, T5) = W[0..1] used for input 1,
 * (Tf, Th) = W[2..3] used for input 3, (T3, T6) = W[4..5] used for input 9,
 * (TM, TO) = W[6..7] used for input 15.  Derived pairs: (Tz, TC) input 2,
 * (Ti, Tm) input 4, (T1f, T1j) input 5, (TW, TZ) input 6, (T1z, T1D)
 * input 7, (T7, Tb) input 8, (TF, TI) input 10, (T1L, T1O) input 11,
 * (Tq, Tu) input 12, (T1m, T1p) input 13, (TP, TT) input 14.
 */
{
E TN, TS, T4, Tp, Ta, Tt, Tl, Tg;
T2 = W[0];
Tf = W[2];
Tg = T2 * Tf;
TM = W[6];
TN = T2 * TM;
TO = W[7];
TS = T2 * TO;
T3 = W[4];
T4 = T2 * T3;
Tp = Tf * T3;
T6 = W[5];
Ta = T2 * T6;
Tt = Tf * T6;
T5 = W[1];
Th = W[3];
Tl = T2 * Th;
Tz = FMA(T5, Th, Tg);
Ti = FNMS(T5, Th, Tg);
T7 = FMA(T5, T6, T4);
TZ = FNMS(Th, T3, Tt);
TT = FNMS(T5, TM, TS);
Tq = FNMS(Th, T6, Tp);
TW = FMA(Th, T6, Tp);
Tb = FNMS(T5, T3, Ta);
Tu = FMA(Th, T3, Tt);
TP = FMA(T5, TO, TN);
TI = FMA(T5, T3, Ta);
TF = FNMS(T5, T6, T4);
{
E T1y, T1C, T1e, T1i;
T1y = Tz * T3;
T1C = Tz * T6;
TC = FNMS(T5, Tf, Tl);
T1z = FMA(TC, T6, T1y);
T1O = FMA(TC, T3, T1C);
T1D = FNMS(TC, T3, T1C);
T1L = FNMS(TC, T6, T1y);
T1e = Ti * T3;
T1i = Ti * T6;
Tm = FMA(T5, Tf, Tl);
T1f = FMA(Tm, T6, T1e);
T1p = FMA(Tm, T3, T1i);
T1j = FNMS(Tm, T3, T1i);
T1m = FNMS(Tm, T6, T1e);
}
}
{
E Te, T1U, T3A, T3L, T1G, T2D, T2A, T3h, T1R, T2B, T2I, T3i, Tx, T3M, T1Z;
E T3w, TL, T26, T25, T37, T1d, T2o, T2l, T3c, T1s, T2m, T2t, T3d, T12, T28;
E T2d, T38;
/* Inputs 0 and 8: twiddle input 8, then sum and difference. */
{
E T1, T3z, T8, T9, Tc, T3x, Td, T3y;
T1 = ri[0];
T3z = ii[0];
T8 = ri[WS(rs, 8)];
T9 = T7 * T8;
Tc = ii[WS(rs, 8)];
T3x = T7 * Tc;
Td = FMA(Tb, Tc, T9);
Te = T1 + Td;
T1U = T1 - Td;
T3y = FNMS(Tb, T8, T3x);
T3A = T3y + T3z;
T3L = T3z - T3y;
}
/* Inputs 15 and 7: twiddle both, then sum and difference. */
{
E T1u, T1v, T1w, T2w, T1A, T1B, T1E, T2y;
T1u = ri[WS(rs, 15)];
T1v = TM * T1u;
T1w = ii[WS(rs, 15)];
T2w = TM * T1w;
T1A = ri[WS(rs, 7)];
T1B = T1z * T1A;
T1E = ii[WS(rs, 7)];
T2y = T1z * T1E;
{
E T1x, T1F, T2x, T2z;
T1x = FMA(TO, T1w, T1v);
T1F = FMA(T1D, T1E, T1B);
T1G = T1x + T1F;
T2D = T1x - T1F;
T2x = FNMS(TO, T1u, T2w);
T2z = FNMS(T1D, T1A, T2y);
T2A = T2x - T2z;
T3h = T2x + T2z;
}
}
/* Inputs 3 and 11: twiddle both, then sum and difference. */
{
E T1H, T1I, T1J, T2E, T1M, T1N, T1P, T2G;
T1H = ri[WS(rs, 3)];
T1I = Tf * T1H;
T1J = ii[WS(rs, 3)];
T2E = Tf * T1J;
T1M = ri[WS(rs, 11)];
T1N = T1L * T1M;
T1P = ii[WS(rs, 11)];
T2G = T1L * T1P;
{
E T1K, T1Q, T2F, T2H;
T1K = FMA(Th, T1J, T1I);
T1Q = FMA(T1O, T1P, T1N);
T1R = T1K + T1Q;
T2B = T1K - T1Q;
T2F = FNMS(Th, T1H, T2E);
T2H = FNMS(T1O, T1M, T2G);
T2I = T2F - T2H;
T3i = T2F + T2H;
}
}
/* Inputs 4 and 12: twiddle both, then sum and difference. */
{
E Tj, Tk, Tn, T1V, Tr, Ts, Tv, T1X;
Tj = ri[WS(rs, 4)];
Tk = Ti * Tj;
Tn = ii[WS(rs, 4)];
T1V = Ti * Tn;
Tr = ri[WS(rs, 12)];
Ts = Tq * Tr;
Tv = ii[WS(rs, 12)];
T1X = Tq * Tv;
{
E To, Tw, T1W, T1Y;
To = FMA(Tm, Tn, Tk);
Tw = FMA(Tu, Tv, Ts);
Tx = To + Tw;
T3M = To - Tw;
T1W = FNMS(Tm, Tj, T1V);
T1Y = FNMS(Tu, Tr, T1X);
T1Z = T1W - T1Y;
T3w = T1W + T1Y;
}
}
/* Inputs 2 and 10: twiddle both, then sum and difference. */
{
E TA, TB, TD, T21, TG, TH, TJ, T23;
TA = ri[WS(rs, 2)];
TB = Tz * TA;
TD = ii[WS(rs, 2)];
T21 = Tz * TD;
TG = ri[WS(rs, 10)];
TH = TF * TG;
TJ = ii[WS(rs, 10)];
T23 = TF * TJ;
{
E TE, TK, T22, T24;
TE = FMA(TC, TD, TB);
TK = FMA(TI, TJ, TH);
TL = TE + TK;
T26 = TE - TK;
T22 = FNMS(TC, TA, T21);
T24 = FNMS(TI, TG, T23);
T25 = T22 - T24;
T37 = T22 + T24;
}
}
/* Inputs 1 and 9: twiddle both, then sum and difference. */
{
E T15, T16, T17, T2h, T19, T1a, T1b, T2j;
T15 = ri[WS(rs, 1)];
T16 = T2 * T15;
T17 = ii[WS(rs, 1)];
T2h = T2 * T17;
T19 = ri[WS(rs, 9)];
T1a = T3 * T19;
T1b = ii[WS(rs, 9)];
T2j = T3 * T1b;
{
E T18, T1c, T2i, T2k;
T18 = FMA(T5, T17, T16);
T1c = FMA(T6, T1b, T1a);
T1d = T18 + T1c;
T2o = T18 - T1c;
T2i = FNMS(T5, T15, T2h);
T2k = FNMS(T6, T19, T2j);
T2l = T2i - T2k;
T3c = T2i + T2k;
}
}
/* Inputs 5 and 13: twiddle both, then sum and difference. */
{
E T1g, T1h, T1k, T2p, T1n, T1o, T1q, T2r;
T1g = ri[WS(rs, 5)];
T1h = T1f * T1g;
T1k = ii[WS(rs, 5)];
T2p = T1f * T1k;
T1n = ri[WS(rs, 13)];
T1o = T1m * T1n;
T1q = ii[WS(rs, 13)];
T2r = T1m * T1q;
{
E T1l, T1r, T2q, T2s;
T1l = FMA(T1j, T1k, T1h);
T1r = FMA(T1p, T1q, T1o);
T1s = T1l + T1r;
T2m = T1l - T1r;
T2q = FNMS(T1j, T1g, T2p);
T2s = FNMS(T1p, T1n, T2r);
T2t = T2q - T2s;
T3d = T2q + T2s;
}
}
/* Inputs 14 and 6: twiddle both, then sum and difference. */
{
E TQ, TR, TU, T29, TX, TY, T10, T2b;
TQ = ri[WS(rs, 14)];
TR = TP * TQ;
TU = ii[WS(rs, 14)];
T29 = TP * TU;
TX = ri[WS(rs, 6)];
TY = TW * TX;
T10 = ii[WS(rs, 6)];
T2b = TW * T10;
{
E TV, T11, T2a, T2c;
TV = FMA(TT, TU, TR);
T11 = FMA(TZ, T10, TY);
T12 = TV + T11;
T28 = TV - T11;
T2a = FNMS(TT, TQ, T29);
T2c = FNMS(TZ, TX, T2b);
T2d = T2a - T2c;
T38 = T2a + T2c;
}
}
/* Outputs 0, 4, 8, 12: built from the pairwise sums only. */
{
E T14, T3q, T3C, T3E, T1T, T3D, T3t, T3u;
{
E Ty, T13, T3v, T3B;
Ty = Te + Tx;
T13 = TL + T12;
T14 = Ty + T13;
T3q = Ty - T13;
T3v = T37 + T38;
T3B = T3w + T3A;
T3C = T3v + T3B;
T3E = T3B - T3v;
}
{
E T1t, T1S, T3r, T3s;
T1t = T1d + T1s;
T1S = T1G + T1R;
T1T = T1t + T1S;
T3D = T1S - T1t;
T3r = T3c + T3d;
T3s = T3h + T3i;
T3t = T3r - T3s;
T3u = T3r + T3s;
}
ri[WS(rs, 8)] = T14 - T1T;
ii[WS(rs, 8)] = T3C - T3u;
ri[0] = T14 + T1T;
ii[0] = T3u + T3C;
ri[WS(rs, 12)] = T3q - T3t;
ii[WS(rs, 12)] = T3E - T3D;
ri[WS(rs, 4)] = T3q + T3t;
ii[WS(rs, 4)] = T3D + T3E;
}
/* Outputs 2, 6, 10, 14: use the sqrt(2)/2 constant. */
{
E T3a, T3m, T3H, T3J, T3f, T3n, T3k, T3o;
{
E T36, T39, T3F, T3G;
T36 = Te - Tx;
T39 = T37 - T38;
T3a = T36 + T39;
T3m = T36 - T39;
T3F = T12 - TL;
T3G = T3A - T3w;
T3H = T3F + T3G;
T3J = T3G - T3F;
}
{
E T3b, T3e, T3g, T3j;
T3b = T1d - T1s;
T3e = T3c - T3d;
T3f = T3b + T3e;
T3n = T3e - T3b;
T3g = T1G - T1R;
T3j = T3h - T3i;
T3k = T3g - T3j;
T3o = T3g + T3j;
}
{
E T3l, T3I, T3p, T3K;
T3l = T3f + T3k;
ri[WS(rs, 10)] = FNMS(KP707106781, T3l, T3a);
ri[WS(rs, 2)] = FMA(KP707106781, T3l, T3a);
T3I = T3n + T3o;
ii[WS(rs, 2)] = FMA(KP707106781, T3I, T3H);
ii[WS(rs, 10)] = FNMS(KP707106781, T3I, T3H);
T3p = T3n - T3o;
ri[WS(rs, 14)] = FNMS(KP707106781, T3p, T3m);
ri[WS(rs, 6)] = FMA(KP707106781, T3p, T3m);
T3K = T3k - T3f;
ii[WS(rs, 6)] = FMA(KP707106781, T3K, T3J);
ii[WS(rs, 14)] = FNMS(KP707106781, T3K, T3J);
}
}
/* Odd-indexed outputs 1, 3, 5, ..., 15: built from the pairwise differences. */
{
E T20, T3N, T3T, T2Q, T2f, T3O, T30, T34, T2T, T3U, T2v, T2N, T2X, T33, T2K;
E T2O;
{
E T27, T2e, T2n, T2u;
T20 = T1U - T1Z;
T3N = T3L - T3M;
T3T = T3M + T3L;
T2Q = T1U + T1Z;
T27 = T25 - T26;
T2e = T28 + T2d;
T2f = T27 - T2e;
T3O = T27 + T2e;
{
E T2Y, T2Z, T2R, T2S;
T2Y = T2D + T2I;
T2Z = T2A - T2B;
T30 = FNMS(KP414213562, T2Z, T2Y);
T34 = FMA(KP414213562, T2Y, T2Z);
T2R = T26 + T25;
T2S = T28 - T2d;
T2T = T2R + T2S;
T3U = T2S - T2R;
}
T2n = T2l + T2m;
T2u = T2o - T2t;
T2v = FMA(KP414213562, T2u, T2n);
T2N = FNMS(KP414213562, T2n, T2u);
{
E T2V, T2W, T2C, T2J;
T2V = T2o + T2t;
T2W = T2l - T2m;
T2X = FMA(KP414213562, T2W, T2V);
T33 = FNMS(KP414213562, T2V, T2W);
T2C = T2A + T2B;
T2J = T2D - T2I;
T2K = FNMS(KP414213562, T2J, T2C);
T2O = FMA(KP414213562, T2C, T2J);
}
}
/* Outputs 3 and 11. */
{
E T2g, T2L, T3V, T3W;
T2g = FMA(KP707106781, T2f, T20);
T2L = T2v - T2K;
ri[WS(rs, 11)] = FNMS(KP923879532, T2L, T2g);
ri[WS(rs, 3)] = FMA(KP923879532, T2L, T2g);
T3V = FMA(KP707106781, T3U, T3T);
T3W = T2O - T2N;
ii[WS(rs, 3)] = FMA(KP923879532, T3W, T3V);
ii[WS(rs, 11)] = FNMS(KP923879532, T3W, T3V);
}
/* Outputs 7 and 15. */
{
E T2M, T2P, T3X, T3Y;
T2M = FNMS(KP707106781, T2f, T20);
T2P = T2N + T2O;
ri[WS(rs, 7)] = FNMS(KP923879532, T2P, T2M);
ri[WS(rs, 15)] = FMA(KP923879532, T2P, T2M);
T3X = FNMS(KP707106781, T3U, T3T);
T3Y = T2v + T2K;
ii[WS(rs, 7)] = FNMS(KP923879532, T3Y, T3X);
ii[WS(rs, 15)] = FMA(KP923879532, T3Y, T3X);
}
/* Outputs 1 and 9. */
{
E T2U, T31, T3P, T3Q;
T2U = FMA(KP707106781, T2T, T2Q);
T31 = T2X + T30;
ri[WS(rs, 9)] = FNMS(KP923879532, T31, T2U);
ri[WS(rs, 1)] = FMA(KP923879532, T31, T2U);
T3P = FMA(KP707106781, T3O, T3N);
T3Q = T33 + T34;
ii[WS(rs, 1)] = FMA(KP923879532, T3Q, T3P);
ii[WS(rs, 9)] = FNMS(KP923879532, T3Q, T3P);
}
/* Outputs 5 and 13. */
{
E T32, T35, T3R, T3S;
T32 = FNMS(KP707106781, T2T, T2Q);
T35 = T33 - T34;
ri[WS(rs, 13)] = FNMS(KP923879532, T35, T32);
ri[WS(rs, 5)] = FMA(KP923879532, T35, T32);
T3R = FNMS(KP707106781, T3O, T3N);
T3S = T30 - T2X;
ii[WS(rs, 5)] = FMA(KP923879532, T3S, T3R);
ii[WS(rs, 13)] = FNMS(KP923879532, T3S, T3R);
}
}
}
}
}
}
/*
 * Twiddle-table request for the FMA t2_16 codelet: four TW_CEXP entries
 * (indices 1, 3, 9 and 15), matching the 8 reals of W that t2_16 consumes
 * per iteration, followed by the TW_NEXT entry.
 * NOTE(review): TW_CEXP / TW_NEXT are defined outside this file; presumably
 * each TW_CEXP entry stores one complex exponential and TW_NEXT terminates
 * the list -- confirm against the tw_instr definition.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 9 },
{ TW_CEXP, 0, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor handed to the planner: radix 16, codelet name "t2_16", the
 * twiddle request above, the genus exported by the included header, and the
 * operation counts { 104 additions, 42 multiplications, 92 fused
 * multiply/adds, 0 }, which match the generated header comment of t2_16.
 */
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 104, 42, 92, 0 }, 0, 0, 0 };
/* Registers the t2_16 kernel and its descriptor with planner p. */
void X(codelet_t2_16) (planner *p) {
X(kdft_dit_register) (p, t2_16, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 16 -name t2_16 -include dft/scalar/t.h */
/*
* This function contains 196 FP additions, 108 FP multiplications,
* (or, 156 additions, 68 multiplications, 40 fused multiply/add),
* 82 stack variables, 3 constants, and 64 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_16, non-FMA variant: size-16 twiddle codelet that stores only four
 * complex twiddles per iteration (W[0..7]) and derives the other eleven it
 * needs from products of those four.
 *
 * For every m in [mb, me) the kernel reads the sixteen complex values
 * ri[WS(rs, k)] + i*ii[WS(rs, k)], k = 0..15, applies a twiddle to inputs
 * 1..15, and overwrites the same sixteen locations with the transformed
 * values.
 *
 *   ri, ii  real / imaginary arrays, transformed in place
 *   W       twiddle table; 8 reals are consumed per iteration
 *   rs      stride between the sixteen points (applied through WS())
 *   mb, me  half-open range of iterations
 *   ms      distance added to ri and ii between iterations
 *
 * NOTE(review): FMA(a, b, c) and FNMS(a, b, c) are macros from the included
 * headers; the comments below assume a*b + c and c - a*b -- confirm.
 *
 * This is generated code: the statement order and the association of the
 * floating-point operations are deliberate and must not be changed by hand.
 */
static void t2_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: sin(pi/8), cos(pi/8), sqrt(2)/2. */
DK(KP382683432, +0.382683432365089771728459984030398866761344562);
DK(KP923879532, +0.923879532511286756128183189396788286822416626);
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W is first advanced to the twiddles of iteration mb, then by 8 reals per iteration. */
for (m = mb, W = W + (mb * 8); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 8, MAKE_VOLATILE_STRIDE(32, rs)) {
E T2, T5, Tg, Ti, Tk, To, TE, TC, T6, T3, T8, TW, TJ, Tt, TU;
E Tc, Tx, TH, TN, TO, TP, TR, T1f, T1k, T1b, T1i, T1y, T1H, T1u, T1F;
/*
 * Twiddle set-up.  Loaded pairs: (T2, T5) = W[0..1] used for input 1,
 * (Tg, Ti) = W[2..3] used for input 3, (T3, T6) = W[4..5] used for input 9,
 * (TN, TO) = W[6..7] used for input 15.  Derived pairs: (TC, TE) input 2,
 * (Tk, To) input 4, (T1b, T1f) input 5, (TU, TW) input 6, (T1u, T1y)
 * input 7, (T8, Tc) input 8, (TH, TJ) input 10, (T1F, T1H) input 11,
 * (Tt, Tx) input 12, (T1i, T1k) input 13, (TP, TR) input 14.
 */
{
E T7, Tv, Ta, Ts, T4, Tw, Tb, Tr;
{
E Th, Tn, Tj, Tm;
T2 = W[0];
T5 = W[1];
Tg = W[2];
Ti = W[3];
Th = T2 * Tg;
Tn = T5 * Tg;
Tj = T5 * Ti;
Tm = T2 * Ti;
Tk = Th - Tj;
To = Tm + Tn;
TE = Tm - Tn;
TC = Th + Tj;
T6 = W[5];
T7 = T5 * T6;
Tv = Tg * T6;
Ta = T2 * T6;
Ts = Ti * T6;
T3 = W[4];
T4 = T2 * T3;
Tw = Ti * T3;
Tb = T5 * T3;
Tr = Tg * T3;
}
T8 = T4 + T7;
TW = Tv - Tw;
TJ = Ta + Tb;
Tt = Tr - Ts;
TU = Tr + Ts;
Tc = Ta - Tb;
Tx = Tv + Tw;
TH = T4 - T7;
TN = W[6];
TO = W[7];
TP = FMA(T2, TN, T5 * TO);
TR = FNMS(T5, TN, T2 * TO);
{
E T1d, T1e, T19, T1a;
T1d = Tk * T6;
T1e = To * T3;
T1f = T1d - T1e;
T1k = T1d + T1e;
T19 = Tk * T3;
T1a = To * T6;
T1b = T19 + T1a;
T1i = T19 - T1a;
}
{
E T1w, T1x, T1s, T1t;
T1w = TC * T6;
T1x = TE * T3;
T1y = T1w - T1x;
T1H = T1w + T1x;
T1s = TC * T3;
T1t = TE * T6;
T1u = T1s + T1t;
T1F = T1s - T1t;
}
}
{
E Tf, T3r, T1N, T3e, TA, T3s, T1Q, T3b, TM, T2M, T1W, T2w, TZ, T2N, T21;
E T2x, T1B, T1K, T2V, T2W, T2X, T2Y, T2j, T2D, T2o, T2E, T18, T1n, T2Q, T2R;
E T2S, T2T, T28, T2A, T2d, T2B;
/* Inputs 0 and 8: twiddle input 8, then sum and difference. */
{
E T1, T3d, Te, T3c, T9, Td;
T1 = ri[0];
T3d = ii[0];
T9 = ri[WS(rs, 8)];
Td = ii[WS(rs, 8)];
Te = FMA(T8, T9, Tc * Td);
T3c = FNMS(Tc, T9, T8 * Td);
Tf = T1 + Te;
T3r = T3d - T3c;
T1N = T1 - Te;
T3e = T3c + T3d;
}
/* Inputs 4 and 12: twiddle both, then sum and difference. */
{
E Tq, T1O, Tz, T1P;
{
E Tl, Tp, Tu, Ty;
Tl = ri[WS(rs, 4)];
Tp = ii[WS(rs, 4)];
Tq = FMA(Tk, Tl, To * Tp);
T1O = FNMS(To, Tl, Tk * Tp);
Tu = ri[WS(rs, 12)];
Ty = ii[WS(rs, 12)];
Tz = FMA(Tt, Tu, Tx * Ty);
T1P = FNMS(Tx, Tu, Tt * Ty);
}
TA = Tq + Tz;
T3s = Tq - Tz;
T1Q = T1O - T1P;
T3b = T1O + T1P;
}
/* Inputs 2 and 10: twiddle both, then sums, differences and their cross terms. */
{
E TG, T1S, TL, T1T, T1U, T1V;
{
E TD, TF, TI, TK;
TD = ri[WS(rs, 2)];
TF = ii[WS(rs, 2)];
TG = FMA(TC, TD, TE * TF);
T1S = FNMS(TE, TD, TC * TF);
TI = ri[WS(rs, 10)];
TK = ii[WS(rs, 10)];
TL = FMA(TH, TI, TJ * TK);
T1T = FNMS(TJ, TI, TH * TK);
}
TM = TG + TL;
T2M = T1S + T1T;
T1U = T1S - T1T;
T1V = TG - TL;
T1W = T1U - T1V;
T2w = T1V + T1U;
}
/* Inputs 14 and 6: twiddle both, then sums, differences and their cross terms. */
{
E TT, T1Y, TY, T1Z, T1X, T20;
{
E TQ, TS, TV, TX;
TQ = ri[WS(rs, 14)];
TS = ii[WS(rs, 14)];
TT = FMA(TP, TQ, TR * TS);
T1Y = FNMS(TR, TQ, TP * TS);
TV = ri[WS(rs, 6)];
TX = ii[WS(rs, 6)];
TY = FMA(TU, TV, TW * TX);
T1Z = FNMS(TW, TV, TU * TX);
}
TZ = TT + TY;
T2N = T1Y + T1Z;
T1X = TT - TY;
T20 = T1Y - T1Z;
T21 = T1X + T20;
T2x = T1X - T20;
}
/* Inputs 15, 11, 7, 3: twiddle each, then pair (15, 7) and (3, 11). */
{
E T1r, T2k, T1J, T2h, T1A, T2l, T1E, T2g;
{
E T1p, T1q, T1G, T1I;
T1p = ri[WS(rs, 15)];
T1q = ii[WS(rs, 15)];
T1r = FMA(TN, T1p, TO * T1q);
T2k = FNMS(TO, T1p, TN * T1q);
T1G = ri[WS(rs, 11)];
T1I = ii[WS(rs, 11)];
T1J = FMA(T1F, T1G, T1H * T1I);
T2h = FNMS(T1H, T1G, T1F * T1I);
}
{
E T1v, T1z, T1C, T1D;
T1v = ri[WS(rs, 7)];
T1z = ii[WS(rs, 7)];
T1A = FMA(T1u, T1v, T1y * T1z);
T2l = FNMS(T1y, T1v, T1u * T1z);
T1C = ri[WS(rs, 3)];
T1D = ii[WS(rs, 3)];
T1E = FMA(Tg, T1C, Ti * T1D);
T2g = FNMS(Ti, T1C, Tg * T1D);
}
T1B = T1r + T1A;
T1K = T1E + T1J;
T2V = T1B - T1K;
T2W = T2k + T2l;
T2X = T2g + T2h;
T2Y = T2W - T2X;
{
E T2f, T2i, T2m, T2n;
T2f = T1r - T1A;
T2i = T2g - T2h;
T2j = T2f - T2i;
T2D = T2f + T2i;
T2m = T2k - T2l;
T2n = T1E - T1J;
T2o = T2m + T2n;
T2E = T2m - T2n;
}
}
/* Inputs 1, 13, 9, 5: twiddle each, then pair (1, 9) and (5, 13). */
{
E T14, T24, T1m, T2b, T17, T25, T1h, T2a;
{
E T12, T13, T1j, T1l;
T12 = ri[WS(rs, 1)];
T13 = ii[WS(rs, 1)];
T14 = FMA(T2, T12, T5 * T13);
T24 = FNMS(T5, T12, T2 * T13);
T1j = ri[WS(rs, 13)];
T1l = ii[WS(rs, 13)];
T1m = FMA(T1i, T1j, T1k * T1l);
T2b = FNMS(T1k, T1j, T1i * T1l);
}
{
E T15, T16, T1c, T1g;
T15 = ri[WS(rs, 9)];
T16 = ii[WS(rs, 9)];
T17 = FMA(T3, T15, T6 * T16);
T25 = FNMS(T6, T15, T3 * T16);
T1c = ri[WS(rs, 5)];
T1g = ii[WS(rs, 5)];
T1h = FMA(T1b, T1c, T1f * T1g);
T2a = FNMS(T1f, T1c, T1b * T1g);
}
T18 = T14 + T17;
T1n = T1h + T1m;
T2Q = T18 - T1n;
T2R = T24 + T25;
T2S = T2a + T2b;
T2T = T2R - T2S;
{
E T26, T27, T29, T2c;
T26 = T24 - T25;
T27 = T1h - T1m;
T28 = T26 + T27;
T2A = T26 - T27;
T29 = T14 - T17;
T2c = T2a - T2b;
T2d = T29 - T2c;
T2B = T29 + T2c;
}
}
/* Outputs 3, 7, 11, 15. */
{
E T23, T2r, T3A, T3C, T2q, T3B, T2u, T3x;
{
E T1R, T22, T3y, T3z;
T1R = T1N - T1Q;
T22 = KP707106781 * (T1W - T21);
T23 = T1R + T22;
T2r = T1R - T22;
T3y = KP707106781 * (T2x - T2w);
T3z = T3s + T3r;
T3A = T3y + T3z;
T3C = T3z - T3y;
}
{
E T2e, T2p, T2s, T2t;
T2e = FMA(KP923879532, T28, KP382683432 * T2d);
T2p = FNMS(KP923879532, T2o, KP382683432 * T2j);
T2q = T2e + T2p;
T3B = T2p - T2e;
T2s = FNMS(KP923879532, T2d, KP382683432 * T28);
T2t = FMA(KP382683432, T2o, KP923879532 * T2j);
T2u = T2s - T2t;
T3x = T2s + T2t;
}
ri[WS(rs, 11)] = T23 - T2q;
ii[WS(rs, 11)] = T3A - T3x;
ri[WS(rs, 3)] = T23 + T2q;
ii[WS(rs, 3)] = T3x + T3A;
ri[WS(rs, 15)] = T2r - T2u;
ii[WS(rs, 15)] = T3C - T3B;
ri[WS(rs, 7)] = T2r + T2u;
ii[WS(rs, 7)] = T3B + T3C;
}
/* Outputs 2, 6, 10, 14. */
{
E T2P, T31, T3m, T3o, T30, T3n, T34, T3j;
{
E T2L, T2O, T3k, T3l;
T2L = Tf - TA;
T2O = T2M - T2N;
T2P = T2L + T2O;
T31 = T2L - T2O;
T3k = TZ - TM;
T3l = T3e - T3b;
T3m = T3k + T3l;
T3o = T3l - T3k;
}
{
E T2U, T2Z, T32, T33;
T2U = T2Q + T2T;
T2Z = T2V - T2Y;
T30 = KP707106781 * (T2U + T2Z);
T3n = KP707106781 * (T2Z - T2U);
T32 = T2T - T2Q;
T33 = T2V + T2Y;
T34 = KP707106781 * (T32 - T33);
T3j = KP707106781 * (T32 + T33);
}
ri[WS(rs, 10)] = T2P - T30;
ii[WS(rs, 10)] = T3m - T3j;
ri[WS(rs, 2)] = T2P + T30;
ii[WS(rs, 2)] = T3j + T3m;
ri[WS(rs, 14)] = T31 - T34;
ii[WS(rs, 14)] = T3o - T3n;
ri[WS(rs, 6)] = T31 + T34;
ii[WS(rs, 6)] = T3n + T3o;
}
/* Outputs 1, 5, 9, 13. */
{
E T2z, T2H, T3u, T3w, T2G, T3v, T2K, T3p;
{
E T2v, T2y, T3q, T3t;
T2v = T1N + T1Q;
T2y = KP707106781 * (T2w + T2x);
T2z = T2v + T2y;
T2H = T2v - T2y;
T3q = KP707106781 * (T1W + T21);
T3t = T3r - T3s;
T3u = T3q + T3t;
T3w = T3t - T3q;
}
{
E T2C, T2F, T2I, T2J;
T2C = FMA(KP382683432, T2A, KP923879532 * T2B);
T2F = FNMS(KP382683432, T2E, KP923879532 * T2D);
T2G = T2C + T2F;
T3v = T2F - T2C;
T2I = FNMS(KP382683432, T2B, KP923879532 * T2A);
T2J = FMA(KP923879532, T2E, KP382683432 * T2D);
T2K = T2I - T2J;
T3p = T2I + T2J;
}
ri[WS(rs, 9)] = T2z - T2G;
ii[WS(rs, 9)] = T3u - T3p;
ri[WS(rs, 1)] = T2z + T2G;
ii[WS(rs, 1)] = T3p + T3u;
ri[WS(rs, 13)] = T2H - T2K;
ii[WS(rs, 13)] = T3w - T3v;
ri[WS(rs, 5)] = T2H + T2K;
ii[WS(rs, 5)] = T3v + T3w;
}
/* Outputs 0, 4, 8, 12: built from the pairwise sums only. */
{
E T11, T35, T3g, T3i, T1M, T3h, T38, T39;
{
E TB, T10, T3a, T3f;
TB = Tf + TA;
T10 = TM + TZ;
T11 = TB + T10;
T35 = TB - T10;
T3a = T2M + T2N;
T3f = T3b + T3e;
T3g = T3a + T3f;
T3i = T3f - T3a;
}
{
E T1o, T1L, T36, T37;
T1o = T18 + T1n;
T1L = T1B + T1K;
T1M = T1o + T1L;
T3h = T1L - T1o;
T36 = T2R + T2S;
T37 = T2W + T2X;
T38 = T36 - T37;
T39 = T36 + T37;
}
ri[WS(rs, 8)] = T11 - T1M;
ii[WS(rs, 8)] = T3g - T39;
ri[0] = T11 + T1M;
ii[0] = T39 + T3g;
ri[WS(rs, 12)] = T35 - T38;
ii[WS(rs, 12)] = T3i - T3h;
ri[WS(rs, 4)] = T35 + T38;
ii[WS(rs, 4)] = T3h + T3i;
}
}
}
}
}
/*
 * Twiddle program for t2_16: four TW_CEXP entries closed by one TW_NEXT entry.
 * NOTE(review): the last field of each TW_CEXP entry (1, 3, 9, 15) is
 * presumably the twiddle exponent stored in W for that slot -- confirm
 * against the tw_instr definition in the project headers.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 9 },
{ TW_CEXP, 0, 15 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor passed to the planner: 16 and "t2_16" identify the codelet,
 * followed by the twiddle program, the genus and a four-entry count table.
 * NOTE(review): { 156, 68, 40, 0 } presumably lists additions,
 * multiplications, fused multiply/adds and other operations -- confirm
 * against the ct_desc declaration.
 */
static const ct_desc desc = { 16, "t2_16", twinstr, &GENUS, { 156, 68, 40, 0 }, 0, 0, 0 };
/* Registers the t2_16 kernel and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t2_16) (planner *p) {
X(kdft_dit_register) (p, t2_16, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,200 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_4 (FMA-preferring build): size-4 twiddle codelet, updated in place.
 *
 * For every m in [mb, me) the four complex values found at
 * ri/ii[WS(rs, 0..3)] are processed: elements 1..3 are first scaled by
 * twiddle pairs taken from W, then a 4-point butterfly combines them with
 * element 0 and the results overwrite the inputs.  After each pass ri and
 * ii advance by ms and W by four entries.
 *
 * Only two twiddle pairs (W[0..1] and W[2..3]) are stored per pass; the
 * pair applied to element 2 is rebuilt from their products.
 */
static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 4;
     for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* stored twiddle pairs (ua, ub) and (va, vb) */
          E ua = W[0];
          E ub = W[1];
          E va = W[2];
          E vb = W[3];
          /* partial products kept separate so each FMA/FNMS takes a plain addend */
          E ua_va = ua * va;
          E ua_vb = ua * vb;
          /* derived pair (da, db), applied to element 2 */
          E da = FMA(ub, vb, ua_va);
          E db = FNMS(ub, va, ua_vb);
          /* element 0 is used as loaded */
          E r0 = ri[0];
          E i0 = ii[0];
          /* twiddled elements 1..3, filled in below */
          E r1, i1, r2, i2, r3, i3;

          {
               /* element 2 with the derived pair */
               E xr = ri[WS(rs, 2)];
               E sr = da * xr;
               E xi = ii[WS(rs, 2)];
               E si = da * xi;
               r2 = FMA(db, xi, sr);
               i2 = FNMS(db, xr, si);
          }
          {
               /* element 1 with the first stored pair */
               E xr = ri[WS(rs, 1)];
               E sr = ua * xr;
               E xi = ii[WS(rs, 1)];
               E si = ua * xi;
               r1 = FMA(ub, xi, sr);
               i1 = FNMS(ub, xr, si);
          }
          {
               /* element 3 with the second stored pair */
               E xr = ri[WS(rs, 3)];
               E sr = va * xr;
               E xi = ii[WS(rs, 3)];
               E si = va * xi;
               r3 = FMA(vb, xi, sr);
               i3 = FNMS(vb, xr, si);
          }
          {
               /* outputs 0 and 2: sums of the even and odd halves */
               E re02 = r0 + r2;
               E re13 = r1 + r3;
               E im13 = i1 + i3;
               E im02 = i2 + i0;
               ri[WS(rs, 2)] = re02 - re13;
               ri[0] = re02 + re13;
               ii[0] = im13 + im02;
               ii[WS(rs, 2)] = im02 - im13;
          }
          {
               /* outputs 1 and 3: differences, with real/imaginary parts crossed */
               E re_d02 = r0 - r2;
               E im_d13 = i1 - i3;
               E im_d02 = i0 - i2;
               E re_d13 = r1 - r3;
               ri[WS(rs, 3)] = re_d02 - im_d13;
               ri[WS(rs, 1)] = re_d02 + im_d13;
               ii[WS(rs, 1)] = im_d02 - re_d13;
               ii[WS(rs, 3)] = re_d13 + im_d02;
          }
     }
}
/*
 * Twiddle program for t2_4: two TW_CEXP entries closed by one TW_NEXT entry.
 * NOTE(review): the last field of each TW_CEXP entry (1, 3) is presumably
 * the twiddle exponent stored in W for that slot -- confirm against the
 * tw_instr definition in the project headers.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor passed to the planner: 4 and "t2_4" identify the codelet,
 * followed by the twiddle program, the genus and the count table.
 * { 16, 8, 8, 0 } repeats the 16 additions, 8 multiplications and
 * 8 fused multiply/adds quoted in the generator comment for this build.
 */
static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 };
/* Registers the t2_4 kernel and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t2_4) (planner *p) {
X(kdft_dit_register) (p, t2_4, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 4 -name t2_4 -include dft/scalar/t.h */
/*
* This function contains 24 FP additions, 16 FP multiplications,
* (or, 16 additions, 8 multiplications, 8 fused multiply/add),
* 21 stack variables, 0 constants, and 16 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_4 (build without FMA preference): size-4 twiddle codelet, updated in
 * place.
 *
 * For every m in [mb, me) the four complex values found at
 * ri/ii[WS(rs, 0..3)] are processed: elements 1..3 are first scaled by
 * twiddle pairs taken from W, then a 4-point butterfly combines them with
 * element 0 and the results overwrite the inputs.  After each pass ri and
 * ii advance by ms and W by four entries.
 *
 * Only two twiddle pairs (W[0..1] and W[2..3]) are stored per pass; the
 * pair applied to element 2 is rebuilt from their products.
 */
static void t2_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W += mb * 4;
     for (m = mb; m < me; ++m, ri += ms, ii += ms, W += 4, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* stored twiddle pairs (ua, ub) and (va, vb) */
          E ua = W[0];
          E ub = W[1];
          E va = W[2];
          E vb = W[3];
          /* derived pair (da, db), applied to element 2 */
          E da = FMA(ua, va, ub * vb);
          E db = FNMS(ub, va, ua * vb);
          /* element 0 is used as loaded */
          E r0 = ri[0];
          E i0 = ii[0];
          /* element 2, scaled by the derived pair */
          E x2r = ri[WS(rs, 2)];
          E x2i = ii[WS(rs, 2)];
          E r2 = FMA(da, x2r, db * x2i);
          E i2 = FNMS(db, x2r, da * x2i);
          /* twiddled elements 1 and 3, filled in below */
          E r1, i1, r3, i3;

          {
               E x1r = ri[WS(rs, 1)];
               E x1i = ii[WS(rs, 1)];
               r1 = FMA(ua, x1r, ub * x1i);
               i1 = FNMS(ub, x1r, ua * x1i);
          }
          {
               E x3r = ri[WS(rs, 3)];
               E x3i = ii[WS(rs, 3)];
               r3 = FMA(va, x3r, vb * x3i);
               i3 = FNMS(vb, x3r, va * x3i);
          }
          {
               /* outputs 0 and 2: sums of the even and odd halves */
               E re02 = r0 + r2;
               E re13 = r1 + r3;
               E im13 = i1 + i3;
               E im02 = i2 + i0;
               ri[WS(rs, 2)] = re02 - re13;
               ri[0] = re02 + re13;
               ii[0] = im13 + im02;
               ii[WS(rs, 2)] = im02 - im13;
          }
          {
               /* outputs 1 and 3: differences, with real/imaginary parts crossed */
               E re_d02 = r0 - r2;
               E im_d13 = i1 - i3;
               E im_d02 = i0 - i2;
               E re_d13 = r1 - r3;
               ri[WS(rs, 3)] = re_d02 - im_d13;
               ri[WS(rs, 1)] = re_d02 + im_d13;
               ii[WS(rs, 1)] = im_d02 - re_d13;
               ii[WS(rs, 3)] = re_d13 + im_d02;
          }
     }
}
/*
 * Twiddle program for t2_4: two TW_CEXP entries closed by one TW_NEXT entry.
 * NOTE(review): the last field of each TW_CEXP entry (1, 3) is presumably
 * the twiddle exponent stored in W for that slot -- confirm against the
 * tw_instr definition in the project headers.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor passed to the planner: 4 and "t2_4" identify the codelet,
 * followed by the twiddle program, the genus and the count table.
 * { 16, 8, 8, 0 } repeats the 16 additions, 8 multiplications and
 * 8 fused multiply/adds quoted in the generator comment for this build.
 */
static const ct_desc desc = { 4, "t2_4", twinstr, &GENUS, { 16, 8, 8, 0 }, 0, 0, 0 };
/* Registers the t2_4 kernel and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t2_4) (planner *p) {
X(kdft_dit_register) (p, t2_4, &desc);
}
#endif

View File

@@ -0,0 +1,264 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:37 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
/*
* This function contains 44 FP additions, 40 FP multiplications,
* (or, 14 additions, 10 multiplications, 30 fused multiply/add),
* 38 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_5 (FMA-preferring build): size-5 twiddle codelet, updated in place.
 *
 * ri, ii   real and imaginary data arrays; element k of a transform is
 *          addressed as ri/ii[WS(rs, k)], read first and overwritten last.
 * W        twiddle table; four values are consumed per iteration.
 * rs       stride handed to WS() to reach elements 1..4.
 * mb, me   half-open range [mb, me) of iterations to run.
 * ms       amount added to ri and ii after each iteration.
 *
 * Only two twiddle pairs are stored per iteration; the pairs applied to
 * elements 2 and 4 are rebuilt from their products.
 * NOTE(review): the comments below read FMA(a, b, c) as a*b + c and
 * FNMS(a, b, c) as c - a*b, as the project headers are expected to define
 * them -- confirm.
 */
static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: 0.951... = sin(2*pi/5), 0.559... = sqrt(5)/4, 0.618... = (sqrt(5)-1)/2. */
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP618033988, +0.618033988749894848204586834365638117720309180);
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
{
INT m;
/* W is first advanced to iteration mb, then steps by 4 entries per iteration. */
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
E T2, Ta, T8, T5, Tb, Tm, Tf, Tj, T9, Te;
/* Stored pairs: (T2, T5) = W[0..1] and (T8, Ta) = W[2..3]. */
T2 = W[0];
Ta = W[3];
T8 = W[2];
T9 = T2 * T8;
Te = T2 * Ta;
T5 = W[1];
/* Derived pairs: (Tb, Tf) is applied to element 4, (Tj, Tm) to element 2. */
Tb = FNMS(T5, Ta, T9);
Tm = FNMS(T5, T8, Te);
Tf = FMA(T5, T8, Te);
Tj = FMA(T5, Ta, T9);
{
E T1, TO, T7, Th, Ti, Tz, TB, TL, To, Ts, Tt, TE, TG, TM;
/* Element 0 is used as loaded, without a twiddle multiply. */
T1 = ri[0];
TO = ii[0];
{
/* Elements 1 and 4: twiddle multiply, then their real (Ti) and imaginary (TL) sums. */
E T3, T4, T6, Ty, Tc, Td, Tg, TA;
T3 = ri[WS(rs, 1)];
T4 = T2 * T3;
T6 = ii[WS(rs, 1)];
Ty = T2 * T6;
Tc = ri[WS(rs, 4)];
Td = Tb * Tc;
Tg = ii[WS(rs, 4)];
TA = Tb * Tg;
T7 = FMA(T5, T6, T4);
Th = FMA(Tf, Tg, Td);
Ti = T7 + Th;
Tz = FNMS(T5, T3, Ty);
TB = FNMS(Tf, Tc, TA);
TL = Tz + TB;
}
{
/* Elements 2 and 3: twiddle multiply, then their real (Tt) and imaginary (TM) sums. */
E Tk, Tl, Tn, TD, Tp, Tq, Tr, TF;
Tk = ri[WS(rs, 2)];
Tl = Tj * Tk;
Tn = ii[WS(rs, 2)];
TD = Tj * Tn;
Tp = ri[WS(rs, 3)];
Tq = T8 * Tp;
Tr = ii[WS(rs, 3)];
TF = T8 * Tr;
To = FMA(Tm, Tn, Tl);
Ts = FMA(Ta, Tr, Tq);
Tt = To + Ts;
TE = FNMS(Tm, Tk, TD);
TG = FNMS(Ta, Tp, TF);
TM = TE + TG;
}
{
/* Real parts of the five outputs; every input was loaded above, so the stores are safe. */
E Tw, Tu, Tv, TI, TK, TC, TH, TJ, Tx;
Tw = Ti - Tt;
Tu = Ti + Tt;
Tv = FNMS(KP250000000, Tu, T1);
TC = Tz - TB;
TH = TE - TG;
TI = FMA(KP618033988, TH, TC);
TK = FNMS(KP618033988, TC, TH);
ri[0] = T1 + Tu;
TJ = FNMS(KP559016994, Tw, Tv);
ri[WS(rs, 2)] = FNMS(KP951056516, TK, TJ);
ri[WS(rs, 3)] = FMA(KP951056516, TK, TJ);
Tx = FMA(KP559016994, Tw, Tv);
ri[WS(rs, 4)] = FNMS(KP951056516, TI, Tx);
ri[WS(rs, 1)] = FMA(KP951056516, TI, Tx);
}
{
/* Imaginary parts of the five outputs; same pattern with the rotation signs mirrored. */
E TQ, TN, TP, TU, TW, TS, TT, TV, TR;
TQ = TL - TM;
TN = TL + TM;
TP = FNMS(KP250000000, TN, TO);
TS = T7 - Th;
TT = To - Ts;
TU = FMA(KP618033988, TT, TS);
TW = FNMS(KP618033988, TS, TT);
ii[0] = TN + TO;
TV = FNMS(KP559016994, TQ, TP);
ii[WS(rs, 2)] = FMA(KP951056516, TW, TV);
ii[WS(rs, 3)] = FNMS(KP951056516, TW, TV);
TR = FMA(KP559016994, TQ, TP);
ii[WS(rs, 1)] = FNMS(KP951056516, TU, TR);
ii[WS(rs, 4)] = FMA(KP951056516, TU, TR);
}
}
}
}
}
/*
 * Twiddle program for t2_5: two TW_CEXP entries closed by one TW_NEXT entry.
 * NOTE(review): the last field of each TW_CEXP entry (1, 3) is presumably
 * the twiddle exponent stored in W for that slot -- confirm against the
 * tw_instr definition in the project headers.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor passed to the planner: 5 and "t2_5" identify the codelet,
 * followed by the twiddle program, the genus and the count table.
 * { 14, 10, 30, 0 } repeats the 14 additions, 10 multiplications and
 * 30 fused multiply/adds quoted in the generator comment for this build.
 */
static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 14, 10, 30, 0 }, 0, 0, 0 };
/* Registers the t2_5 kernel and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t2_5) (planner *p) {
X(kdft_dit_register) (p, t2_5, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 5 -name t2_5 -include dft/scalar/t.h */
/*
* This function contains 44 FP additions, 32 FP multiplications,
* (or, 30 additions, 18 multiplications, 14 fused multiply/add),
* 37 stack variables, 4 constants, and 20 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_5 (build without FMA preference): size-5 twiddle codelet, updated in
 * place.
 *
 * ri, ii   real and imaginary data arrays; element k of a transform is
 *          addressed as ri/ii[WS(rs, k)], read first and overwritten last.
 * W        twiddle table; four values are consumed per iteration.
 * rs       stride handed to WS() to reach elements 1..4.
 * mb, me   half-open range [mb, me) of iterations to run.
 * ms       amount added to ri and ii after each iteration.
 *
 * Only two twiddle pairs are stored per iteration; the pairs applied to
 * elements 2 and 4 are rebuilt from their products.
 * NOTE(review): the comments below read FMA(a, b, c) as a*b + c and
 * FNMS(a, b, c) as c - a*b, as the project headers are expected to define
 * them -- confirm.
 */
static void t2_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* Numeric constants: 0.559... = sqrt(5)/4, 0.587... = sin(pi/5), 0.951... = sin(2*pi/5). */
DK(KP250000000, +0.250000000000000000000000000000000000000000000);
DK(KP559016994, +0.559016994374947424102293417182819058860154590);
DK(KP587785252, +0.587785252292473129168705954639072768597652438);
DK(KP951056516, +0.951056516295153572116439333379382143405698634);
{
INT m;
/* W is first advanced to iteration mb, then steps by 4 entries per iteration. */
for (m = mb, W = W + (mb * 4); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 4, MAKE_VOLATILE_STRIDE(10, rs)) {
E T2, T4, T7, T9, Tb, Tl, Tf, Tj;
{
/* Stored pairs (T2, T4) = W[0..1] and (T7, T9) = W[2..3]; their four cross
 * products give the derived pairs (Tb, Tf) for element 4 and (Tj, Tl) for element 2. */
E T8, Te, Ta, Td;
T2 = W[0];
T4 = W[1];
T7 = W[2];
T9 = W[3];
T8 = T2 * T7;
Te = T4 * T7;
Ta = T4 * T9;
Td = T2 * T9;
Tb = T8 - Ta;
Tl = Td - Te;
Tf = Td + Te;
Tj = T8 + Ta;
}
{
E T1, TI, Ty, TB, TN, TM, TF, TG, TH, Ti, Tr, Ts;
/* Element 0 is used as loaded, without a twiddle multiply. */
T1 = ri[0];
TI = ii[0];
{
E T6, Tw, Tq, TA, Th, Tx, Tn, Tz;
{
/* Elements 1 and 3 use the stored pairs directly. */
E T3, T5, To, Tp;
T3 = ri[WS(rs, 1)];
T5 = ii[WS(rs, 1)];
T6 = FMA(T2, T3, T4 * T5);
Tw = FNMS(T4, T3, T2 * T5);
To = ri[WS(rs, 3)];
Tp = ii[WS(rs, 3)];
Tq = FMA(T7, To, T9 * Tp);
TA = FNMS(T9, To, T7 * Tp);
}
{
/* Elements 4 and 2 use the derived pairs. */
E Tc, Tg, Tk, Tm;
Tc = ri[WS(rs, 4)];
Tg = ii[WS(rs, 4)];
Th = FMA(Tb, Tc, Tf * Tg);
Tx = FNMS(Tf, Tc, Tb * Tg);
Tk = ri[WS(rs, 2)];
Tm = ii[WS(rs, 2)];
Tn = FMA(Tj, Tk, Tl * Tm);
Tz = FNMS(Tl, Tk, Tj * Tm);
}
/* Sums and differences of the pairs (1, 4) and (2, 3), shared by all outputs. */
Ty = Tw - Tx;
TB = Tz - TA;
TN = Tn - Tq;
TM = T6 - Th;
TF = Tw + Tx;
TG = Tz + TA;
TH = TF + TG;
Ti = T6 + Th;
Tr = Tn + Tq;
Ts = Ti + Tr;
}
/* Output 0 is the plain sum of all five twiddled inputs. */
ri[0] = T1 + Ts;
ii[0] = TH + TI;
{
/* Real parts of outputs 1..4. */
E TC, TE, Tv, TD, Tt, Tu;
TC = FMA(KP951056516, Ty, KP587785252 * TB);
TE = FNMS(KP587785252, Ty, KP951056516 * TB);
Tt = KP559016994 * (Ti - Tr);
Tu = FNMS(KP250000000, Ts, T1);
Tv = Tt + Tu;
TD = Tu - Tt;
ri[WS(rs, 4)] = Tv - TC;
ri[WS(rs, 3)] = TD + TE;
ri[WS(rs, 1)] = Tv + TC;
ri[WS(rs, 2)] = TD - TE;
}
{
/* Imaginary parts of outputs 1..4; same pattern with the rotation signs mirrored. */
E TO, TP, TL, TQ, TJ, TK;
TO = FMA(KP951056516, TM, KP587785252 * TN);
TP = FNMS(KP587785252, TM, KP951056516 * TN);
TJ = KP559016994 * (TF - TG);
TK = FNMS(KP250000000, TH, TI);
TL = TJ + TK;
TQ = TK - TJ;
ii[WS(rs, 1)] = TL - TO;
ii[WS(rs, 3)] = TQ - TP;
ii[WS(rs, 4)] = TO + TL;
ii[WS(rs, 2)] = TP + TQ;
}
}
}
}
}
/*
 * Twiddle program for t2_5: two TW_CEXP entries closed by one TW_NEXT entry.
 * NOTE(review): the last field of each TW_CEXP entry (1, 3) is presumably
 * the twiddle exponent stored in W for that slot -- confirm against the
 * tw_instr definition in the project headers.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor passed to the planner: 5 and "t2_5" identify the codelet,
 * followed by the twiddle program, the genus and the count table.
 * { 30, 18, 14, 0 } repeats the 30 additions, 18 multiplications and
 * 14 fused multiply/adds quoted in the generator comment for this build.
 */
static const ct_desc desc = { 5, "t2_5", twinstr, &GENUS, { 30, 18, 14, 0 }, 0, 0, 0 };
/* Registers the t2_5 kernel and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t2_5) (planner *p) {
X(kdft_dit_register) (p, t2_5, &desc);
}
#endif

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,390 @@
/*
* Copyright (c) 2003, 2007-14 Matteo Frigo
* Copyright (c) 2003, 2007-14 Massachusetts Institute of Technology
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
/* This file was automatically generated --- DO NOT EDIT */
/* Generated on Tue Sep 14 10:44:32 EDT 2021 */
#include "dft/codelet-dft.h"
#if defined(ARCH_PREFERS_FMA) || defined(ISA_EXTENSION_PREFERS_FMA)
/* Generated by: ../../../genfft/gen_twiddle.native -fma -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
/*
* This function contains 74 FP additions, 50 FP multiplications,
* (or, 44 additions, 20 multiplications, 30 fused multiply/add),
* 48 stack variables, 1 constants, and 32 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_8 (FMA-preferring build): size-8 twiddle codelet, updated in place.
 *
 * ri, ii   real and imaginary data arrays; element k of a transform is
 *          addressed as ri/ii[WS(rs, k)], read first and overwritten last.
 * W        twiddle table; six values are consumed per iteration.
 * rs       stride handed to WS() to reach elements 1..7.
 * mb, me   half-open range [mb, me) of iterations to run.
 * ms       amount added to ri and ii after each iteration.
 *
 * Only three twiddle pairs are stored per iteration; the pairs applied to
 * elements 2, 4, 5 and 6 are rebuilt from their products.
 * NOTE(review): the comments below read FMA(a, b, c) as a*b + c and
 * FNMS(a, b, c) as c - a*b, as the project headers are expected to define
 * them -- confirm.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.707... = sqrt(2)/2 */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W is first advanced to iteration mb, then steps by 6 entries per iteration. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E T2, T3, Tl, Tn, T5, T6, Tf, T7, Ts, Tb, To, Ti, TC, TG;
{
/* Stored pairs: (T2, T5) = W[0..1], (T3, T6) = W[2..3], (Tl, Tn) = W[4..5].
 * Derived pairs: (Tf, Ti) for element 2, (T7, Tb) for element 4,
 * (TC, TG) for element 5 and (To, Ts) for element 6. */
E T4, Tm, Tr, Ta, TB, TF;
T2 = W[0];
T3 = W[2];
T4 = T2 * T3;
Tl = W[4];
Tm = T2 * Tl;
Tn = W[5];
Tr = T2 * Tn;
T5 = W[1];
T6 = W[3];
Ta = T2 * T6;
Tf = FMA(T5, T6, T4);
T7 = FNMS(T5, T6, T4);
Ts = FNMS(T5, Tl, Tr);
Tb = FMA(T5, T3, Ta);
To = FMA(T5, Tn, Tm);
TB = Tf * Tl;
TF = Tf * Tn;
Ti = FNMS(T5, T3, Ta);
TC = FMA(Ti, Tn, TB);
TG = FNMS(Ti, Tl, TF);
}
{
E T1, T1s, Td, T1r, Tu, TY, Tk, TW, TN, TR, T18, T1a, T1c, T1d, TA;
E TI, T11, T13, T15, T16;
/* Element 0 is used as loaded, without a twiddle multiply. */
T1 = ri[0];
T1s = ii[0];
{
/* Element 4, scaled by the derived pair (T7, Tb). */
E T8, T9, Tc, T1q;
T8 = ri[WS(rs, 4)];
T9 = T7 * T8;
Tc = ii[WS(rs, 4)];
T1q = T7 * Tc;
Td = FMA(Tb, Tc, T9);
T1r = FNMS(Tb, T8, T1q);
}
{
/* Element 6, scaled by the derived pair (To, Ts). */
E Tp, Tq, Tt, TX;
Tp = ri[WS(rs, 6)];
Tq = To * Tp;
Tt = ii[WS(rs, 6)];
TX = To * Tt;
Tu = FMA(Ts, Tt, Tq);
TY = FNMS(Ts, Tp, TX);
}
{
/* Element 2, scaled by the derived pair (Tf, Ti). */
E Tg, Th, Tj, TV;
Tg = ri[WS(rs, 2)];
Th = Tf * Tg;
Tj = ii[WS(rs, 2)];
TV = Tf * Tj;
Tk = FMA(Ti, Tj, Th);
TW = FNMS(Ti, Tg, TV);
}
{
/* Elements 7 and 3 use stored pairs; T18 and T1d are their real and imaginary differences. */
E TK, TL, TM, T19, TO, TP, TQ, T1b;
TK = ri[WS(rs, 7)];
TL = Tl * TK;
TM = ii[WS(rs, 7)];
T19 = Tl * TM;
TO = ri[WS(rs, 3)];
TP = T3 * TO;
TQ = ii[WS(rs, 3)];
T1b = T3 * TQ;
TN = FMA(Tn, TM, TL);
TR = FMA(T6, TQ, TP);
T18 = TN - TR;
T1a = FNMS(Tn, TK, T19);
T1c = FNMS(T6, TO, T1b);
T1d = T1a - T1c;
}
{
/* Elements 1 (stored pair) and 5 (derived pair); T11 and T16 are their differences. */
E Tx, Ty, Tz, T12, TD, TE, TH, T14;
Tx = ri[WS(rs, 1)];
Ty = T2 * Tx;
Tz = ii[WS(rs, 1)];
T12 = T2 * Tz;
TD = ri[WS(rs, 5)];
TE = TC * TD;
TH = ii[WS(rs, 5)];
T14 = TC * TH;
TA = FMA(T5, Tz, Ty);
TI = FMA(TG, TH, TE);
T11 = TA - TI;
T13 = FNMS(T5, Tx, T12);
T15 = FNMS(TG, TD, T14);
T16 = T13 - T15;
}
{
/* Odd outputs 1, 3, 5, 7: built from the pairwise differences, scaled by sqrt(2)/2. */
E T10, T1g, T1z, T1B, T1f, T1C, T1j, T1A;
{
E TU, TZ, T1x, T1y;
TU = T1 - Td;
TZ = TW - TY;
T10 = TU + TZ;
T1g = TU - TZ;
T1x = T1s - T1r;
T1y = Tk - Tu;
T1z = T1x - T1y;
T1B = T1y + T1x;
}
{
E T17, T1e, T1h, T1i;
T17 = T11 + T16;
T1e = T18 - T1d;
T1f = T17 + T1e;
T1C = T1e - T17;
T1h = T16 - T11;
T1i = T18 + T1d;
T1j = T1h - T1i;
T1A = T1h + T1i;
}
ri[WS(rs, 5)] = FNMS(KP707106781, T1f, T10);
ii[WS(rs, 5)] = FNMS(KP707106781, T1A, T1z);
ri[WS(rs, 1)] = FMA(KP707106781, T1f, T10);
ii[WS(rs, 1)] = FMA(KP707106781, T1A, T1z);
ri[WS(rs, 7)] = FNMS(KP707106781, T1j, T1g);
ii[WS(rs, 7)] = FNMS(KP707106781, T1C, T1B);
ri[WS(rs, 3)] = FMA(KP707106781, T1j, T1g);
ii[WS(rs, 3)] = FMA(KP707106781, T1C, T1B);
}
{
/* Even outputs 0, 2, 4, 6: built from the pairwise sums, no constant multiplies needed. */
E Tw, T1k, T1u, T1w, TT, T1v, T1n, T1o;
{
E Te, Tv, T1p, T1t;
Te = T1 + Td;
Tv = Tk + Tu;
Tw = Te + Tv;
T1k = Te - Tv;
T1p = TW + TY;
T1t = T1r + T1s;
T1u = T1p + T1t;
T1w = T1t - T1p;
}
{
E TJ, TS, T1l, T1m;
TJ = TA + TI;
TS = TN + TR;
TT = TJ + TS;
T1v = TS - TJ;
T1l = T13 + T15;
T1m = T1a + T1c;
T1n = T1l - T1m;
T1o = T1l + T1m;
}
ri[WS(rs, 4)] = Tw - TT;
ii[WS(rs, 4)] = T1u - T1o;
ri[0] = Tw + TT;
ii[0] = T1o + T1u;
ri[WS(rs, 6)] = T1k - T1n;
ii[WS(rs, 6)] = T1w - T1v;
ri[WS(rs, 2)] = T1k + T1n;
ii[WS(rs, 2)] = T1v + T1w;
}
}
}
}
}
/*
 * Twiddle program for t2_8: three TW_CEXP entries closed by one TW_NEXT entry.
 * NOTE(review): the last field of each TW_CEXP entry (1, 3, 7) is presumably
 * the twiddle exponent stored in W for that slot -- confirm against the
 * tw_instr definition in the project headers.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor passed to the planner: 8 and "t2_8" identify the codelet,
 * followed by the twiddle program, the genus and the count table.
 * { 44, 20, 30, 0 } repeats the 44 additions, 20 multiplications and
 * 30 fused multiply/adds quoted in the generator comment for this build.
 */
static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, { 44, 20, 30, 0 }, 0, 0, 0 };
/* Registers the t2_8 kernel and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t2_8) (planner *p) {
X(kdft_dit_register) (p, t2_8, &desc);
}
#else
/* Generated by: ../../../genfft/gen_twiddle.native -compact -variables 4 -pipeline-latency 4 -twiddle-log3 -precompute-twiddles -n 8 -name t2_8 -include dft/scalar/t.h */
/*
* This function contains 74 FP additions, 44 FP multiplications,
* (or, 56 additions, 26 multiplications, 18 fused multiply/add),
* 42 stack variables, 1 constants, and 32 memory accesses
*/
#include "dft/scalar/t.h"
/*
 * t2_8 (build without FMA preference): size-8 twiddle codelet, updated in
 * place.
 *
 * ri, ii   real and imaginary data arrays; element k of a transform is
 *          addressed as ri/ii[WS(rs, k)], read first and overwritten last.
 * W        twiddle table; six values are consumed per iteration.
 * rs       stride handed to WS() to reach elements 1..7.
 * mb, me   half-open range [mb, me) of iterations to run.
 * ms       amount added to ri and ii after each iteration.
 *
 * Only three twiddle pairs are stored per iteration; the pairs applied to
 * elements 2, 4, 5 and 6 are rebuilt from their products.
 * NOTE(review): the comments below read FMA(a, b, c) as a*b + c and
 * FNMS(a, b, c) as c - a*b, as the project headers are expected to define
 * them -- confirm.
 */
static void t2_8(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
/* 0.707... = sqrt(2)/2 */
DK(KP707106781, +0.707106781186547524400844362104849039284835938);
{
INT m;
/* W is first advanced to iteration mb, then steps by 6 entries per iteration. */
for (m = mb, W = W + (mb * 6); m < me; m = m + 1, ri = ri + ms, ii = ii + ms, W = W + 6, MAKE_VOLATILE_STRIDE(16, rs)) {
E T2, T5, T3, T6, T8, Tc, Tg, Ti, Tl, Tm, Tn, Tz, Tp, Tx;
{
/* Stored pairs: (T2, T5) = W[0..1], (T3, T6) = W[2..3], (Tl, Tm) = W[4..5].
 * Derived pairs: (T8, Tc) for element 4, (Tg, Ti) for element 2,
 * (Tn, Tp) for element 6 and (Tx, Tz) for element 5. */
E T4, Tb, T7, Ta;
T2 = W[0];
T5 = W[1];
T3 = W[2];
T6 = W[3];
T4 = T2 * T3;
Tb = T5 * T3;
T7 = T5 * T6;
Ta = T2 * T6;
T8 = T4 - T7;
Tc = Ta + Tb;
Tg = T4 + T7;
Ti = Ta - Tb;
Tl = W[4];
Tm = W[5];
Tn = FMA(T2, Tl, T5 * Tm);
Tz = FNMS(Ti, Tl, Tg * Tm);
Tp = FNMS(T5, Tl, T2 * Tm);
Tx = FMA(Tg, Tl, Ti * Tm);
}
{
E Tf, T1i, TL, T1d, TJ, T17, TV, TY, Ts, T1j, TO, T1a, TC, T16, TQ;
E TT;
{
/* Elements 0 and 4: sums (Tf, T1d) and differences (TL, T1i). */
E T1, T1c, Te, T1b, T9, Td;
T1 = ri[0];
T1c = ii[0];
T9 = ri[WS(rs, 4)];
Td = ii[WS(rs, 4)];
Te = FMA(T8, T9, Tc * Td);
T1b = FNMS(Tc, T9, T8 * Td);
Tf = T1 + Te;
T1i = T1c - T1b;
TL = T1 - Te;
T1d = T1b + T1c;
}
{
/* Elements 7 and 3 (stored pairs): sums (TJ, T17) and differences (TV, TY). */
E TF, TW, TI, TX;
{
E TD, TE, TG, TH;
TD = ri[WS(rs, 7)];
TE = ii[WS(rs, 7)];
TF = FMA(Tl, TD, Tm * TE);
TW = FNMS(Tm, TD, Tl * TE);
TG = ri[WS(rs, 3)];
TH = ii[WS(rs, 3)];
TI = FMA(T3, TG, T6 * TH);
TX = FNMS(T6, TG, T3 * TH);
}
TJ = TF + TI;
T17 = TW + TX;
TV = TF - TI;
TY = TW - TX;
}
{
/* Elements 2 and 6 (derived pairs): sums (Ts, T1a) and differences (T1j, TO). */
E Tk, TM, Tr, TN;
{
E Th, Tj, To, Tq;
Th = ri[WS(rs, 2)];
Tj = ii[WS(rs, 2)];
Tk = FMA(Tg, Th, Ti * Tj);
TM = FNMS(Ti, Th, Tg * Tj);
To = ri[WS(rs, 6)];
Tq = ii[WS(rs, 6)];
Tr = FMA(Tn, To, Tp * Tq);
TN = FNMS(Tp, To, Tn * Tq);
}
Ts = Tk + Tr;
T1j = Tk - Tr;
TO = TM - TN;
T1a = TM + TN;
}
{
/* Elements 1 (stored pair) and 5 (derived pair): sums (TC, T16) and differences (TQ, TT). */
E Tw, TR, TB, TS;
{
E Tu, Tv, Ty, TA;
Tu = ri[WS(rs, 1)];
Tv = ii[WS(rs, 1)];
Tw = FMA(T2, Tu, T5 * Tv);
TR = FNMS(T5, Tu, T2 * Tv);
Ty = ri[WS(rs, 5)];
TA = ii[WS(rs, 5)];
TB = FMA(Tx, Ty, Tz * TA);
TS = FNMS(Tz, Ty, Tx * TA);
}
TC = Tw + TB;
T16 = TR + TS;
TQ = Tw - TB;
TT = TR - TS;
}
{
/* Every input has been loaded by now, so the in-place stores below are safe. */
E Tt, TK, T1f, T1g;
/* Even outputs 0, 2, 4, 6 come from the pairwise sums. */
Tt = Tf + Ts;
TK = TC + TJ;
ri[WS(rs, 4)] = Tt - TK;
ri[0] = Tt + TK;
{
E T19, T1e, T15, T18;
T19 = T16 + T17;
T1e = T1a + T1d;
ii[0] = T19 + T1e;
ii[WS(rs, 4)] = T1e - T19;
T15 = Tf - Ts;
T18 = T16 - T17;
ri[WS(rs, 6)] = T15 - T18;
ri[WS(rs, 2)] = T15 + T18;
}
T1f = TJ - TC;
T1g = T1d - T1a;
ii[WS(rs, 2)] = T1f + T1g;
ii[WS(rs, 6)] = T1g - T1f;
{
/* Odd outputs, first half: real parts of 3 and 7, imaginary parts of 1 and 5. */
E T11, T1k, T14, T1h, T12, T13;
T11 = TL - TO;
T1k = T1i - T1j;
T12 = TT - TQ;
T13 = TV + TY;
T14 = KP707106781 * (T12 - T13);
T1h = KP707106781 * (T12 + T13);
ri[WS(rs, 7)] = T11 - T14;
ii[WS(rs, 5)] = T1k - T1h;
ri[WS(rs, 3)] = T11 + T14;
ii[WS(rs, 1)] = T1h + T1k;
}
{
/* Odd outputs, second half: real parts of 1 and 5, imaginary parts of 3 and 7. */
E TP, T1m, T10, T1l, TU, TZ;
TP = TL + TO;
T1m = T1j + T1i;
TU = TQ + TT;
TZ = TV - TY;
T10 = KP707106781 * (TU + TZ);
T1l = KP707106781 * (TZ - TU);
ri[WS(rs, 5)] = TP - T10;
ii[WS(rs, 7)] = T1m - T1l;
ri[WS(rs, 1)] = TP + T10;
ii[WS(rs, 3)] = T1l + T1m;
}
}
}
}
}
}
/*
 * Twiddle program for t2_8: three TW_CEXP entries closed by one TW_NEXT entry.
 * NOTE(review): the last field of each TW_CEXP entry (1, 3, 7) is presumably
 * the twiddle exponent stored in W for that slot -- confirm against the
 * tw_instr definition in the project headers.
 */
static const tw_instr twinstr[] = {
{ TW_CEXP, 0, 1 },
{ TW_CEXP, 0, 3 },
{ TW_CEXP, 0, 7 },
{ TW_NEXT, 1, 0 }
};
/*
 * Descriptor passed to the planner: 8 and "t2_8" identify the codelet,
 * followed by the twiddle program, the genus and the count table.
 * { 56, 26, 18, 0 } repeats the 56 additions, 26 multiplications and
 * 18 fused multiply/adds quoted in the generator comment for this build.
 */
static const ct_desc desc = { 8, "t2_8", twinstr, &GENUS, { 56, 26, 18, 0 }, 0, 0, 0 };
/* Registers the t2_8 kernel and its descriptor with planner p via kdft_dit_register. */
void X(codelet_t2_8) (planner *p) {
X(kdft_dit_register) (p, t2_8, &desc);
}
#endif